![](/img/trans.png)
[英]Issue with table structure of webpage when scraping data using beautiful soup
[英]Issue for scraping website data for every webpage automatically and save it in csv by using beautiful Soup, pandas and request
代碼無法成功逐頁抓取網頁數據,且csv格式與網頁數據記錄不匹配。 我想代碼啟用自動運行所有網頁。 目前,它只能運行首頁數據。 它如何自己運行第二、第三頁? 其次,在csv格式中,'hospital_name'、'name'、'license_type'列在csv格式中都是空的。 它們都出現在 csv 格式的末尾
import requests
from bs4 import BeautifulSoup as soup
import pandas as pd

# Base roster URL; each page is addressed by appending '&page=N'.
# (The original code broke this literal across two physical lines, which is
# a syntax error, and built url_page_2 without ever using it.)
url = "https://www.abvma.ca/client/roster/clientRosterView.html?clientRosterId=168"


def get_data_from_url(url):
    """Scrape one roster page and return its records as a DataFrame.

    Returns a DataFrame with columns 'hospital_name', 'name' and
    'license_type' — one row per roster entry on the page.
    """
    data = requests.get(url)
    page_data = soup(data.text, 'html.parser')
    AB_data = page_data.find_all('div', {"class": "col-md-4 roster_tbl"})

    # Build plain dicts and construct the DataFrame once at the end.
    # The original built a DataFrame from raw Tag objects and then called
    # the deprecated df.append() per row, which left the named columns
    # empty and pushed them to the end of the CSV.
    rows = []
    for each in AB_data:
        hospital = each.find('a').text
        name = each.find('strong').text
        license_type = each.find('font').text
        rows.append({'hospital_name': hospital,
                     'name': name,
                     'license_type': license_type})
    return pd.DataFrame(rows)


# Fetch every page, not just the first: build each page URL explicitly and
# concatenate the per-page frames. 34 pages is the site's count at the time
# of writing — TODO confirm against the live site.
frames = [get_data_from_url(url + '&page=' + str(n)) for n in range(1, 35)]
df = pd.concat(frames, ignore_index=True)
pd.set_option('display.max_columns', None)
print(df)
df.to_csv('AB_Vets_2018.csv', index=False)
import csv
import requests
from bs4 import BeautifulSoup
# Column order for the output CSV; must match the dict keys produced
# by parse_page().
FIELDNAMES = (
'first_name',
'last_name',
'license_type',
'location',
'reg_num'
)
def get_page(page_num):
    """Fetch one roster page and return its HTML text.

    Raises requests.HTTPError for non-2xx responses.
    """
    response = requests.get(
        "https://www.abvma.ca/client/roster/clientRosterView.html",
        params={'clientRosterId': 168, 'page': page_num},
    )
    response.raise_for_status()
    return response.text
def parse_page(page_html):
    """Parse one roster page's HTML into a list of vet-record dicts.

    Each dict has the keys 'first_name', 'last_name', 'license_type',
    'location' and 'reg_num' (matching FIELDNAMES' order elsewhere in
    this script).
    """
    result = []
    soup = BeautifulSoup(page_html, 'lxml')
    for vet in soup.find_all('div', class_='col-md-4 roster_tbl'):
        # Cell text layout (positional): name, zero-or-more location
        # lines, one line we discard, the license type, then the
        # registration line whose last token is assumed numeric —
        # TODO confirm against live markup.
        name, *location, _unused, license_type, reg_num = vet.stripped_strings
        # Names come as "Last, First"; maxsplit keeps compound first names intact.
        last_name, first_name = name.split(', ', maxsplit=1)
        result.append({
            'first_name': first_name,
            'last_name': last_name,
            'license_type': license_type,
            'location': location[0] if location else '',
            'reg_num': int(reg_num.split()[-1]),
        })
    return result
if __name__ == '__main__':
    records = []
    # 34 pages is hard-coded — TODO: detect the real page count from the site.
    for page_num in range(1, 35):
        records.extend(parse_page(get_page(page_num)))
    # newline='' is required by the csv module (otherwise blank rows appear
    # on Windows); encoding is pinned so output is platform-independent.
    with open('output.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        writer.writeheader()
        writer.writerows(records)
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.