[英]How can I crawl all the <td> contents?(python3.6)
當使用python3和BeautifulSoup從Web獲取指定的內容時,我無法在“ td”中獲取所有信息。
這是我的代碼
import requests
from bs4 import BeautifulSoup
def getHTMLText(url, timeout=10):
    """Download *url* and return the response body as text.

    The response encoding is switched to ``apparent_encoding`` (the
    encoding detected from the body) before the text is decoded.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded page text, or ``''`` when the request fails or the
        server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Keep the original best-effort contract: a failed request yields ''.
        # Only request-related errors are swallowed, so programming errors
        # and Ctrl-C are no longer hidden.
        return ''
def main():
    """Fetch one article page and print the <td> tags of each table row.

    Looks up the table with id ``jqe-table-0`` and prints, for every
    ``<tr>`` inside it, the list of ``<td>`` tags found in that row.
    """
    url = "http://baike.hrhrs.com/index.php?doc-view-3967.html"
    html = getHTMLText(url)
    soup = BeautifulSoup(html, 'html.parser')
    # print(soup.prettify())
    for tag in soup.find_all('h1', class_='title_thema'):
        # The title is looked up here but not used further in this snippet.
        name = tag.find('span', id='doctitle').get_text()
    # the first one
    table1 = soup.find('table', attrs={'id': 'jqe-table-0'})
    if table1 is None:
        # find() returns None when the table is absent (e.g. the download
        # failed and html is empty); there is nothing to print then.
        return
    for trr in table1.find_all('tr'):
        td = trr.find_all('td')
        print(td)
這是輸出
[<td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A1:</strong>A2</td>, <td class="jg" style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 10px; border-bottom: none;"></td>, <td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A3:</strong>A4</td>]
[<td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A5:</strong>A6</td>, <td class="jg" style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 10px; border-bottom: none;"></td>, <td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A7:</strong>A8</td>]
[<td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A9:</strong>A10</td>, <td class="jg" style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 10px; border-bottom: none;"></td>, <td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A11:</strong>A12</td>]
[<td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A13:</strong>A14</td>, <td class="jg" style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 10px; border-bottom: none;"></td>, <td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A15:</strong>A16</td>]
[<td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A17:</strong>A18</td>, <td class="jg" style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 10px; border-bottom: none;"></td>, <td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A19:</strong>A20</td>]
[<td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A21:</strong>A22</td>, <td class="jg" style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 10px; border-bottom: none;"></td>, <td style="padding: 2px 0px; vertical-align: top; overflow: hidden; line-height: 18px; background: rgb(252, 253, 255); width: 350px; border-bottom: 1px dashed rgb(221, 221, 221);"><strong style="display: block; line-height: 22px; vertical-align: baseline; zoom: 1; width: 80px; float: left; color: rgb(153, 153, 153);">A23:</strong>A24</td>]
我使用代碼:
print(td[0].text)
結果是:
A1:A2
A5:A6
A9:A10
A13:A14
A17:A18
A21:A22
我想獲取每個“td”中的所有內容,例如“A3:A4”以及後面的其他單元格。我該如何更改代碼才能獲取所有內容?期待您的答復!
使用我的原始代碼,只能得到如下結果:
中文名:柳公權
別名:誠懸
出生地:京兆華原(今陝西銅川市耀州區)
民族:漢族
出生年月:公元778年
職業:書法家
更改代碼:
import requests
from bs4 import BeautifulSoup
def getHTMLText(url, timeout=10):
    """Download *url* and return the response body as text.

    The response encoding is switched to ``apparent_encoding`` (the
    encoding detected from the body) before the text is decoded.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded page text, or ``''`` when the request fails or the
        server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Keep the original best-effort contract: a failed request yields ''.
        # Only request-related errors are swallowed, so programming errors
        # and Ctrl-C are no longer hidden.
        return ''
def main():
    """Crawl article pages 100-999 and print the text of every <td> cell.

    Each page is downloaded with ``getHTMLText`` and parsed; the text of
    every ``<td>`` inside every ``<tr>`` is printed, one cell per line.
    A failure on one page is reported and the crawl moves on to the next
    page. ``"successfully!"`` is printed once the loop has finished.
    """
    for count in range(100, 1000):
        url = "http://baike.hrhrs.com/index.php?doc-view-" + str(count) + ".html"
        try:
            html = getHTMLText(url)
            soup = BeautifulSoup(html, 'html.parser')
            # print(soup.prettify())
            for tag in soup.find_all('h1', class_='title_thema'):
                # The original stored this in ``n[0]``, but ``n`` was never
                # defined, so that line raised NameError and stopped the
                # whole crawl. The title is looked up but not used further.
                name = tag.find('span', id='doctitle').get_text()
            for trr in soup.find_all('tr'):
                for cell in trr.find_all('td'):
                    print(cell.text)
        except Exception as exc:
            # Per-page boundary: report the failing page and keep crawling.
            print("error", url, exc)
    print("successfully!")
main()
結果:
中文名:柳公權
中文名:柳公權
別名:誠懸
籍貫:唐朝京兆華原(今陝西耀縣)
出生地:京兆華原(今陝西銅川市耀州區)
性別:男
民族:漢族
國籍:中國
......
所有內容均可用。
使用您的代碼,在Jupyter Notebook中逐步運行,我得到了
中文名:柳公權
別名:誠懸
出生地:京兆華原(今陝西銅川市耀州區)
民族:漢族
出生年月:公元778年
職業:書法家
你是這個意思嗎?
這是我的更改:
import requests
from bs4 import BeautifulSoup
url = "http://baike.hrhrs.com/index.php?doc-view-3967.html"
def main():
    """Print the article title and the first cell of each table row.

    Downloads the page at the module-level ``url``, prints the text of
    the ``doctitle`` span inside each ``h1.title_thema`` heading, then
    prints the text of the first ``<td>`` of every ``<tr>`` in the table
    with id ``jqe-table-0``.
    """
    # ``url`` is only read, so no ``global`` declaration is needed.
    r = requests.get(url)
    # Decode using the encoding detected from the body, as getHTMLText does
    # in the question's code, instead of relying on the header default.
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'html.parser')
    for tag in soup.find_all('h1', class_='title_thema'):
        name = tag.find('span', id='doctitle').get_text()
        print(name)
    table1 = soup.find('table', attrs={'id': 'jqe-table-0'})
    if table1 is None:
        # find() returns None when the table is not on the page.
        return
    for trr in table1.find_all('tr'):
        td = trr.find_all('td')
        if td:
            # Rows without any <td> are skipped rather than raising
            # IndexError on td[0].
            print(td[0].text)
# Run main() only when the file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
嘗試這個..
import requests
from bs4 import BeautifulSoup
url = "http://baike.hrhrs.com/index.php?doc-view-3967.html"
res = requests.get(url)
# Pass the raw bytes so BeautifulSoup works out the encoding itself.
data = BeautifulSoup(res.content, 'html.parser')
# Print the text of every <td> on the page, in document order.
for x in data.find_all('td'):
    print(x.text)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.