[英]Python BeautifulSoup selenium scraper
I'm using the following Python script for scraping info from Amazon pages.我正在使用以下 Python 脚本从亚马逊页面抓取信息。
At some point, it stopped returning page results.在某些时候,它停止返回页面结果。 The script is starting, browsing through the keywords/pages but I only get the headers as output:脚本正在启动,浏览关键字/页面,但我只得到标题作为输出:
Keyword Rank Title ASIN Score Reviews Prime Date关键字排名标题 ASIN 评分评论 Prime 日期
I suspect that the problem is in the following line, as this tag doesn't exist anymore and the results variable doesn't get any value:我怀疑问题出在以下行中,因为此标记不再存在,并且 results 变量没有任何值:
results = soup.findAll('div', attrs={'class': 's-item-container'})
This is the full code:这是完整的代码:
from bs4 import BeautifulSoup
import time
from selenium import webdriver
import re
import datetime
from collections import deque
import logging
import csv
class AmazonScaper(object):
    """Collect Amazon UK search-result data for a list of keywords.

    Keywords are taken one at a time from a queue, the matching search page
    is loaded in a Chrome browser driven by Selenium, the result entries are
    parsed with BeautifulSoup, and the collected rows are appended to a CSV
    file once every keyword has been processed.
    """

    def __init__(self, keywords, output_file='example.csv', sleep=2):
        """Start the browser and store the crawl settings.

        Args:
            keywords: Iterable of search keywords (spaces already replaced
                by ``+``, since they are inserted into the URL verbatim).
            output_file: Path of the CSV file the rows are appended to.
            sleep: Seconds to wait after each page load.
        """
        self.browser = webdriver.Chrome(executable_path='/Users/willcecil/Dropbox/Python/chromedriver') #Add path to your Chromedriver
        self.keyword_queue = deque(keywords)  # Keywords still to be searched
        self.output_file = output_file
        self.sleep = sleep
        self.results = []

    def get_page(self, keyword):
        """Load the search page for ``keyword`` and return its HTML.

        Returns ``None`` (after logging the exception) when the page cannot
        be loaded.
        """
        try:
            self.browser.get('https://www.amazon.co.uk/s/ref=nb_sb_noss_2?url=search-alias%3Daps&field-keywords={a}'.format(a=keyword))
            return self.browser.page_source
        except Exception as e:
            logging.exception(e)
            return None

    def get_soup(self, html):
        """Parse ``html`` with the lxml parser; return ``None`` for ``None``."""
        if html is None:
            return None
        return BeautifulSoup(html, 'lxml')

    def get_data(self, soup, keyword):
        """Extract one row per search result and append it to ``self.results``.

        Each row is stored as ``{keyword: [keyword, rank, title, ASIN, score,
        reviews, prime, date]}``. Fields that cannot be found are stored as
        the string ``"None"``. A result without an ``<h2>`` title is skipped
        (its rank position is still counted) instead of aborting the rest of
        the page.

        Returns:
            Always ``1``.
        """
        try:
            # NOTE(review): this class name is reported to no longer exist in
            # Amazon's markup; when nothing matches, no rows are collected.
            results = soup.findAll('div', attrs={'class': 's-item-container'})
            today = datetime.datetime.today().strftime("%B %d, %Y")
            for rank, item in enumerate(results, start=1):
                header = item.find('h2')
                if header is None:
                    # No title element: skip this entry but keep going.
                    continue
                title = header.text

                link = item.find('a', attrs={'class': 'a-link-normal a-text-normal'})
                try:
                    url = re.sub(r'/ref=.*', '', str(link['href']))
                except (TypeError, KeyError):
                    # No link element (None is not subscriptable) or no href.
                    url = "None"

                # Extract the ASIN from the URL - ASIN is the breaking point
                # to filter out if the position is sponsored
                ASIN = re.sub(r'.*amazon.co.uk.*/dp/', '', str(url))

                # Extract Score Data using ASIN number to find the span class
                score_tag = item.find('span', attrs={'name': ASIN})
                try:
                    score = re.sub(r' .*', '', str(score_tag.text.strip('\n')))
                except AttributeError:
                    score = "None"

                # Extract Number of Reviews in the same way
                reviews_tag = item.find('a', href=re.compile(r'.*#customerReviews'))
                try:
                    reviews = reviews_tag.text
                except AttributeError:
                    reviews = "None"

                # And again for Prime
                prime_tag = item.find('i', attrs={'aria-label': 'Prime'})
                try:
                    PRIME = prime_tag.text
                except AttributeError:
                    PRIME = "None"

                data = {keyword: [keyword, str(rank), title, ASIN, score, reviews, PRIME, today]}
                self.results.append(data)
        except Exception as e:
            # Best-effort boundary: a parsing failure must not stop the crawl.
            logging.exception(e)
        return 1

    def csv_output(self):
        """Append the collected rows to ``self.output_file`` as quoted CSV.

        The header row is written only when the file is empty, so repeated
        runs do not insert duplicate headers. Every field is quoted and
        embedded quotes are escaped by the ``csv`` module.
        """
        keys = ['Keyword', 'Rank', 'Title', 'ASIN', 'Score', 'Reviews', 'Prime', 'Date']
        print(self.results)
        # newline='' lets the csv module control line endings itself.
        with open(self.output_file, 'a', encoding='utf-8', newline='') as outputfile:
            writer = csv.writer(outputfile, quoting=csv.QUOTE_ALL)
            if outputfile.tell() == 0:  # Append mode starts at end of file
                writer.writerow(keys)
            for item in self.results:
                for value in item.values():
                    print(".".join(value))
                    writer.writerow(value)

    def run_crawler(self):
        """Process every queued keyword, close the browser, write the CSV."""
        try:
            while self.keyword_queue:  # If we have keywords to check
                keyword = self.keyword_queue.popleft()  # Take the next keyword from the left
                html = self.get_page(keyword)
                soup = self.get_soup(html)
                time.sleep(self.sleep)  # Wait for the specified time
                if soup is not None:  # If we have soup - parse and save data
                    self.get_data(soup, keyword)
        finally:
            # Always release the browser, even if the crawl fails part-way.
            self.browser.quit()
        self.csv_output()  # Save the object data to csv
if __name__ == "__main__":
    # Read one keyword per line from keywords.txt, replacing spaces with '+'
    # so the keyword can be placed directly into the search URL. The file is
    # closed as soon as it has been read, and blank lines are skipped so they
    # do not turn into empty searches.
    with open('keywords.txt') as keyword_file:
        keywords = [
            line.rstrip('\n').replace(' ', '+')
            for line in keyword_file
            if line.strip()
        ]
    ranker = AmazonScaper(keywords)  # Create the object
    ranker.run_crawler()  # Run the rank checker
The output should look like this (I have trimmed the Titles for clarity).输出应该是这样的(为了清晰起见,我已经修剪了标题)。
Keyword Rank Title ASIN Score Reviews Prime Date关键字排名标题 ASIN 评分评论 Prime 日期
Blue+Skateboard 3 Osprey Complete Beginn B00IL1JMF4 3.7 40 Prime February 21, 2019
Blue+Skateboard 4 ENKEEO Complete Mini C B078J9Y1DG 4.5 42 Prime February 21, 2019
Blue+Skateboard 5 skatro - Mini Cruiser B00K93PIXM 4.8 223 Prime February 21, 2019
Blue+Skateboard 7 Vinsani Retro Cruiser B00CSV72AK 4.4 8 Prime February 21, 2019
Blue+Skateboard 8 Ridge Retro Cruiser Bo B00CA33ISQ 4.1 207 Prime February 21, 2019
Blue+Skateboard 9 Xootz Kids Complete Be B01B2YNSJM 3.6 32 Prime February 21, 2019
Blue+Skateboard 10 Enuff Pyro II Skateboa B00MGRGX2Y 4.3 68 Prime February 21, 2019
Blue+Skateboard 3 Osprey Complete Beginn B00IL1JMF4 3.7 40 Prime 2019 年 2 月 21 日
Blue+Skateboard 4 ENKEEO Complete Mini C B078J9Y1DG 4.5 42 Prime 2019 年 2 月 21 日
Blue+Skateboard 5 skatro - Mini Cruiser B00K93PIXM 4.8 223 Prime 2019 年 2 月 21 日
Blue+Skateboard 7 Vinsani Retro Cruiser B00CSV72AK 4.4 8 Prime 2019 年 2 月 21 日
Blue+Skateboard 8 Ridge Retro Cruiser Bo B00CA33ISQ 4.1 207 Prime 2019 年 2 月 21 日
Blue+Skateboard 9 Xootz Kids Complete Be B01B2YNSJM 3.6 32 Prime 2019 年 2 月 21 日
Blue+Skateboard 10 Enuff Pyro II Skateboa B00MGRGX2Y 4.3 68 Prime 2019 年 2 月 21 日
The following shows some changes you could make.下面显示了您可以进行的一些更改。 I have changed to using css selectors at some points.我在某些时候改用了 css 选择器。
The main result set to loop over is retrieved by soup.select('.s-result-list [data-asin]').要循环的主要结果集由 soup.select('.s-result-list [data-asin]') 检索。 This selects elements that have a data-asin attribute and are descendants of an element with class name .s-result-list.这会选择带有 data-asin 属性、并且是类名为 .s-result-list 的元素的后代的元素。 This matches the 60 (current) items on page.这与页面上的 60 个(当前)项目匹配。
I swapped the PRIME selection to using an attribute = value selector我将 PRIME 选择交换为使用属性 = 值选择器
Headers are now h5, i.e. header = soup.select_one('h5').标题现在是 h5,即 header = soup.select_one('h5')。
soup.select_one('[aria-label="Amazon Prime"]')
Example code:示例代码:
import datetime
from bs4 import BeautifulSoup
import time
from selenium import webdriver
import re
# Example: load one Amazon UK search page and print a data row for every
# non-sponsored result found on it.
keyword = 'blue+skateboard'
url = 'https://www.amazon.co.uk/s/ref=nb_sb_noss_2?url=search-alias%3Daps&field-keywords={}'

driver = webdriver.Chrome()
try:
    driver.get(url.format(keyword))
    soup = BeautifulSoup(driver.page_source, 'lxml')
finally:
    # The page source is all that is needed; always release the browser.
    driver.quit()

# Every element carrying a data-asin attribute inside the result list.
results = soup.select('.s-result-list [data-asin]')
for a, b in enumerate(results):
    header = b.select_one('h5')
    if header is None:
        # No title element for this entry: nothing to report.
        continue
    result = a + 1
    title = header.text.strip()

    link = b.select_one('h5 > a')
    try:
        link_url = re.sub(r'/ref=.*', '', str(link['href']))
    except (TypeError, KeyError):
        # No link element (None is not subscriptable) or no href attribute.
        link_url = "None"

    if link_url == '/gp/slredirect/picassoRedirect.html':
        # This redirect URL carries no /dp/<ASIN> part, so the entry is
        # skipped rather than reported with a missing or stale ASIN.
        continue
    ASIN = re.sub(r'.*/dp/', '', str(link_url))

    try:
        score = b.select_one('.a-icon-alt').text
        score = score.strip('\n')
        score = re.sub(r' .*', '', str(score))
    except AttributeError:
        score = "None"

    try:
        # Attribute selectors need both brackets: [href*='...'].
        reviews = b.select_one("[href*='#customerReviews']").text.strip()
    except AttributeError:
        reviews = "None"

    try:
        PRIME = b.select_one('[aria-label="Amazon Prime"]')['aria-label']
    except TypeError:
        # select_one returned None: no Prime badge on this entry.
        PRIME = "None"

    data = {keyword: [keyword, str(result), title, ASIN, score, reviews, PRIME, datetime.datetime.today().strftime("%B %d, %Y")]}
    print(data)
Example output:示例输出:
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.