[英]BeautifulSoup - how do i scrape multiple links to then scrape contents of links
我正在嘗試在登錄頁面有各種鏈接(頂部的 5 個子類別)的地方進行抓取: https://mcavoyguns.co.uk/contents/en-uk/d410_New_Browning_over___under_shotguns.html
每個類別中都有一個產品列表https://mcavoyguns.co.uk/contents/en-uk/d411_Browning_B725_Shotguns.html
列出的每個產品都有一個鏈接以獲取更多詳細信息(作為單個頁面的產品直接鏈接) https://mcavoyguns.co.uk/contents/en-uk/p74600_Browning-B725-Sporter-over-_-under.html
到目前為止,我收集的內容將創建一個包含所有所需單個頁面鏈接的列表。 但是當我嘗試循環每個單獨的產品鏈接以獲取數據時,我似乎無法從這些鏈接中獲取 BeautifulSoup 到 map 的數據。 就好像它停留在上一頁(如果你願意的話)。
我缺少什么以允許第二次“反彈”到“product_link”地址(例如 https://mcavoyguns.co.uk/contents/en-uk/p74600_Browning-B725-Sporter-over-_-under.html )並允許我從那里抓取數據? 我原以為我可能需要添加一個 time.sleep(5) 計時器以允許所有人加載,但仍然一無所獲。
代碼:
from bs4 import BeautifulSoup
import math
import requests
import shutil
import csv
import pandas
import numpy as np
from pandas import DataFrame
import re
import os
import urllib.request as urllib2
import locale
import json
from selenium import webdriver
import lxml.html
import time
from selenium.webdriver.support.ui import Select
os.environ["PYTHONIOENCODING"] = "utf-8"
# Selenium loads the landing page so the JavaScript-built sub-menu exists
# before we parse it with BeautifulSoup.
browser = webdriver.Chrome(executable_path='C:/Users/admin/chromedriver.exe')
browser.get("https://mcavoyguns.co.uk/contents/en-uk/d410_New_Browning_over___under_shotguns.html")
time.sleep(2)  # crude wait for the page to finish rendering

all_Outlinks=[]  # category (sub-menu) page URLs
all_links=[]     # individual product page URLs

# Step 1: pull every category link out of the rendered sub-menu.
soup = BeautifulSoup(browser.page_source, features="lxml")
submenuFind = soup.find("div", "idx2Submenu")
submenuItems = submenuFind.find_all("li", "GC34 idx2Sub")
for submenuItem in submenuItems:
    for link in submenuItem.select('a[href]'):
        # hrefs are relative; prepend the site root to get absolute URLs.
        all_Outlinks.append("https://mcavoyguns.co.uk/contents/en-uk/" + link['href'])
#print(all_Outlinks)

# Step 2: each category page lists products inside "column full" divs;
# collect every product-detail link. These pages are static, so plain
# requests works here.
for a_link in all_Outlinks:
    res = requests.get(a_link)
    soup = BeautifulSoup(res.text, 'html.parser')
    pageLinkDivs = soup.find_all("div", "column full")
    for pageLinkDiv in pageLinkDivs:
        for pageLink in pageLinkDiv.select('a[href]'):
            all_links.append("https://mcavoyguns.co.uk/contents/en-uk/" + pageLink['href'])
#print(all_links)

# Step 3: try to scrape each product page.
for product_link in all_links:
    time.sleep(5)
    resSecond = requests.get(product_link)
    soup = BeautifulSoup(resSecond.text, 'html.parser')
    # NOTE(review): this prints None — the ProductChoiceName div appears to be
    # filled in by JavaScript on the live site, so the raw HTML fetched by
    # requests never contains it (this is the bug the question is about).
    # Presumably the data must come from Selenium or from the script tag —
    # confirm against the answers below.
    model = soup.find("div", "GC75 ProductChoiceName")
    print(model)
PS 為額外的進口道歉。 它們是從以前的腳本中復制和粘貼的,一旦確認不需要,它們就會被刪除。
使用瀏覽器時,該信息是從腳本標簽中動態提取的。 在使用請求時,這將不在您可能正在尋找的位置。 相反,從腳本標簽中提取該信息。
在這種情況下,我提取與腳本中給定 model 相關的所有信息並生成 dataframe。 我使用 ast 將腳本標簽內的字符串轉換為 python object。 我將產品 url 和產品標題添加到 dataframe。
每個 df 都被添加到一個列表中,該列表被轉換為最終的 dataframe。 因為我不知道最終需要什么 header 名稱,所以我留下了一些默認名稱。
對於給定產品沒有列出 model 選項的情況,我已添加處理。
from bs4 import BeautifulSoup
import math
import requests
import shutil
import csv
import pandas as pd
import numpy as np
import re
import os
import urllib.request as urllib2
import locale
import json
from selenium import webdriver
import lxml.html
import time
from selenium.webdriver.support.ui import Select
import ast
os.environ["PYTHONIOENCODING"] = "utf-8"
# Selenium renders the landing page (the sub-menu is built client-side);
# every page after that is static and can be fetched with plain HTTP.
browser = webdriver.Chrome(executable_path='C:/Users/admin/chromedriver.exe')
browser.get("https://mcavoyguns.co.uk/contents/en-uk/d410_New_Browning_over___under_shotguns.html")
time.sleep(2)

all_Outlinks=[]  # category pages linked from the sub-menu
all_links=[]     # individual product pages

soup = BeautifulSoup(browser.page_source, features="lxml")
submenuFind = soup.find("div", "idx2Submenu")
submenuItems = submenuFind.find_all("li", "GC34 idx2Sub")
for submenuItem in submenuItems:
    for link in submenuItem.select('a[href]'):
        all_Outlinks.append("https://mcavoyguns.co.uk/contents/en-uk/" + link['href'])
# FIX: release the browser once the rendered menu has been scraped (it was
# left open for the life of the script before).
browser.quit()
#print(all_Outlinks)

with requests.Session() as s:
    for a_link in all_Outlinks:
        # BUG FIX: was requests.get(a_link) — the Session object `s` was
        # created but never used here, losing connection re-use entirely.
        res = s.get(a_link)
        soup = BeautifulSoup(res.text, 'html.parser')
        pageLinkDivs = soup.find_all("div", "column full")
        for pageLinkDiv in pageLinkDivs:
            for pageLink in pageLinkDiv.select('a[href]'):
                all_links.append("https://mcavoyguns.co.uk/contents/en-uk/" + pageLink['href'])

    results = []
    for product_link in all_links:
        # print(product_link)
        resSecond = s.get(product_link)
        soup = BeautifulSoup(resSecond.text, 'html.parser')
        title = soup.select_one('.ProductTitle').text
        try:
            # The model table is embedded as a [[...], [...]] Python-literal
            # string in a script tag; ast.literal_eval parses it safely.
            df = pd.DataFrame(ast.literal_eval(re.search(r'(\[\[.*\]\])', soup.select_one('.ProductOptions script').string).groups(0)[0]))
            df.iloc[:, -1] = product_link  # last column repurposed for the source URL
        except Exception:  # narrowed from bare except: (was catching KeyboardInterrupt too)
            placeholder = ['No options listed'] * 8
            placeholder.append(product_link)
            df = pd.DataFrame([placeholder])
        df.insert(0, 'title', title)
        #print(df) # add headers you care about to df or do that at end on full list
        results.append(df)

final = pd.concat(results) # or add header here
print(final)
然后你可以看看加速/整理東西:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import os
import locale
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import ast
from multiprocessing import Pool, cpu_count
def get_models_df(product_link):
    """Fetch one product page and return a DataFrame of its model options.

    The options table is embedded in a <script> tag under .ProductOptions as a
    ``[[...], [...]]`` Python-literal string; it is extracted with a regex and
    parsed safely with ast.literal_eval. The product URL overwrites the last
    column and the page title is inserted as the first ('title') column.
    Pages without an options table yield a single placeholder row.
    """
    res = requests.get(product_link)
    soup = BeautifulSoup(res.text, 'lxml')
    title = soup.select_one('.ProductTitle').text
    try:
        df = pd.DataFrame(ast.literal_eval(re.search(r'(\[\[.*\]\])', soup.select_one('.ProductOptions script').string).groups(0)[0]))
        df.iloc[:, -1] = product_link
    except Exception:  # narrowed from bare except: (would have caught KeyboardInterrupt)
        placeholder = ['No options listed'] * 8
        placeholder.append(product_link)
        df = pd.DataFrame([placeholder])
    df.insert(0, 'title', title)
    return df
def get_all_pages(a_link):
    """Return the absolute product-page URLs listed on one category page."""
    category_soup = BeautifulSoup(requests.get(a_link).text, 'lxml')
    prefix = "https://mcavoyguns.co.uk/contents/en-uk/"
    links = []
    for anchor in category_soup.select('.center-content > a'):
        links.append(prefix + anchor['href'])
    return links
if __name__ == '__main__':
    os.environ["PYTHONIOENCODING"] = "utf-8"
    # Selenium is needed only for the JS-rendered landing page; the fan-out
    # over category and product pages uses plain requests in worker processes.
    browser = webdriver.Chrome(executable_path='C:/Users/admin/chromedriver.exe')
    browser.get("https://mcavoyguns.co.uk/contents/en-uk/d410_New_Browning_over___under_shotguns.html")
    # Explicit wait for the sub-menu links instead of a fixed sleep.
    all_outlinks = [i.get_attribute('href') for i in WebDriverWait(browser,10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".idx2Submenu a")))]
    browser.quit()
    # BUG FIX: Pool(cpu_count() - 1) raises ValueError on a single-core
    # machine (Pool(0)); clamp the worker count to at least one.
    with Pool(max(1, cpu_count() - 1)) as p:
        nested_links = p.map(get_all_pages, all_outlinks)
        flat_list = [link for links in nested_links for link in links]
        results = p.map(get_models_df, flat_list)
        final = pd.concat(results)
        #print(final)
        final.to_csv('guninfo.csv', encoding='utf-8-sig', index=False)
所以我說我會看看其他被要求的項目。事實證明它們確實只用 requests 就可以獲取。一些需要處理的事情:例如 nan 的處理。代碼如下:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import os
import locale
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import ast
from multiprocessing import Pool, cpu_count
import numpy as np
import unicodedata
def get_models_df(product_link):
    """Scrape one product page into a DataFrame, one row per model option.

    Beyond the options table (a ``[[...]]`` literal inside a script tag, parsed
    with ast.literal_eval), this adds: price and weight from the page's
    product:* meta properties, a fixed superset of description-table columns so
    every page's frame concat-aligns, the cleaned text description, and the
    source URL. Pages without an options table get a single placeholder row.
    """
    resSecond = requests.get(product_link)
    soup = BeautifulSoup(resSecond.text, 'lxml')
    title = soup.select_one('.ProductTitle').text
    try:
        df = pd.DataFrame(ast.literal_eval(re.search(r'(\[\[.*\]\])', soup.select_one('.ProductOptions script').string).groups(0)[0]))
    except Exception:  # narrowed from bare except: (no options table on page)
        placeholder = ['No options listed'] * 8
        df = pd.DataFrame([placeholder])
    df.insert(0, 'title', title)
    # Price/weight live in meta tags with product:* properties.
    df['price'] = ' '.join([soup.select_one("[property='product:price:amount']")['content'],
                            soup.select_one("[property='product:price:currency']")['content']])
    df['weight'] = ' '.join([soup.select_one("[property='product:weight:value']")['content'],
                             soup.select_one("[property='product:weight:units']")['content']])
    # Union of all description-table headers observed across products, so the
    # final concat produces a consistent set of columns.
    output_headers = ['Action frame', 'Barrel','Barrel finish','Barrel length',
                      'Barrel length (mm-inch)','Buttstock','Calibre','Chokes','Code',
                      'Drop at comb','Drop at heel','Forearm','Length','N/A','Notes',
                      'Options','Packaging','Sights','Stock style','Top rib','Weight','Wood','Wood grade'
                      ]
    df = pd.concat([df, pd.DataFrame(columns=output_headers)])
    try:
        description_table = pd.read_html(str(soup.select_one('.ProductDetailedDescription table, table')))[0].transpose()
        description_table.dropna(axis=0, how='all', inplace=True)
        headers = list(description_table.iloc[0, :])
        # nan headers are mapped onto the literal 'N/A' column.
        headers[:] = ['N/A' if pd.isnull(np.array([header], dtype=object)) else header for header in headers]
        for number, header in enumerate(headers):
            temp = header.lower()
            value = description_table.iloc[1, number]
            if temp == 'calibre':
                # Leading apostrophe stops spreadsheet apps mangling values
                # such as 12/76 into dates.
                df[header] = "'" + value
            elif temp == 'top rib' and 'mm' not in value:
                df[header] = value + 'mm'
            else:
                df[header] = value
    except Exception:  # narrowed from bare except: — page has no table
        pass
    description = soup.select_one('#ProductDetailsTab [title=More]')
    if description is None:
        desc = 'N/A'
    else:
        desc = '. '.join([i.text for i in soup.select('.ProductDescription li, .ProductDescription span') if i.text != ''])
        if desc == '':
            desc = soup.select_one('.ProductIntroduction').get_text()
    # NFKD normalisation flattens non-breaking spaces and similar artefacts.
    df['desc'] = unicodedata.normalize('NFKD', desc)
    df['product_link'] = product_link
    return df
def get_all_pages(a_link):
    """Collect absolute product-page URLs from a single category page."""
    response = requests.get(a_link)
    listing = BeautifulSoup(response.text, 'lxml')
    site_root = "https://mcavoyguns.co.uk/contents/en-uk/"
    collected = []
    for tag in listing.select('.center-content > a'):
        collected.append(site_root + tag['href'])
    return collected
if __name__ == '__main__':
    #os.environ["PYTHONIOENCODING"] = "utf-8"
    # Selenium only for the JS-rendered landing page; the rest is requests
    # fanned out across a process pool.
    browser = webdriver.Chrome()# executable_path='C:/Users/admin/chromedriver.exe')
    browser.get("https://mcavoyguns.co.uk/contents/en-uk/d410_New_Browning_over___under_shotguns.html")
    all_outlinks = [i.get_attribute('href') for i in WebDriverWait(browser,10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".idx2Submenu a")))]
    browser.quit()
    # BUG FIX: Pool(cpu_count() - 1) is Pool(0) on a single-core machine and
    # raises ValueError; guarantee at least one worker.
    with Pool(max(1, cpu_count() - 1)) as p:
        nested_links = p.map(get_all_pages, all_outlinks)
        flat_list = [link for links in nested_links for link in links]
        results = p.map(get_models_df, flat_list)
        final = pd.concat(results)
        #print(final)
        final.to_csv('guninfo.csv', encoding='utf-8-sig', index=False)
正如 QHarr 指出的那樣,Selenium 就是答案。 這給了我以不同的眼光看待它的方向,讓我找到了答案。
我將發布作為我的答案,但將 @QHarr 歸功於基於之前提供的工作和持續的幫助以幫助解決問題的工作。
from bs4 import BeautifulSoup
import math
import requests
import shutil
import csv
import pandas
import numpy as np
from pandas import DataFrame
import re
import os
import urllib.request as urllib2
import locale
import json
from selenium import webdriver
import lxml.html
import time
from selenium.webdriver.support.ui import Select
os.environ["PYTHONIOENCODING"] = "utf-8"
# Selenium renders every page here: both the landing page (for the sub-menu)
# and each product page (whose model <select> is populated by JavaScript and
# therefore invisible to plain requests).
browser = webdriver.Chrome(executable_path='C:/Users/andrew.glass/chromedriver.exe')
browser.get("https://mcavoyguns.co.uk/contents/en-uk/d410_New_Browning_over___under_shotguns.html")
time.sleep(2)

all_Outlinks=[]  # category (sub-menu) page URLs
all_links=[]     # individual product page URLs

soup = BeautifulSoup(browser.page_source, features="lxml")
submenuFind = soup.find("div", "idx2Submenu")
submenuItems = submenuFind.find_all("li", "GC34 idx2Sub")
for submenuItem in submenuItems:
    for link in submenuItem.select('a[href]'):
        all_Outlinks.append("https://mcavoyguns.co.uk/contents/en-uk/" + link['href'])
#print(all_Outlinks)

# Category pages are static, so requests is sufficient to gather product links.
for a_link in all_Outlinks:
    res = requests.get(a_link)
    soup = BeautifulSoup(res.text, 'html.parser')
    pageLinkDivs = soup.find_all("div", "column full")
    for pageLinkDiv in pageLinkDivs:
        for pageLink in pageLinkDiv.select('a[href]'):
            all_links.append("https://mcavoyguns.co.uk/contents/en-uk/" + pageLink['href'])
#print(all_links)

for product_link in all_links:
    browser.get(product_link)
    time.sleep(5)  # allow the model <select> to be populated by JS
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    # Model names come from the 'origvalue' attribute of each option in the
    # model dropdown; option[0] is the placeholder, hence the [1:] slice.
    # (An unused `model = soup.find("div", "GC65 ProductOptions")` lookup was
    # removed here.)
    modelFind = soup.find('select', attrs={'name': re.compile('model')})
    modelList = [x['origvalue'][:14] for x in modelFind.find_all('option')[1:]]
    print(modelList)

# BUG FIX: quit the driver when done — it was previously left running.
browser.quit()
Model 打印仍然有點亂,但是一旦收集了所有要求就可以清理它。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.