I am trying to scrape user beer-review data from beeradvocate.com to analyze user attitudes towards different beer types. But I only get results for the first few pages; the rest come back empty.
Situation:
My approach
def review_scrape(beer_link, number_of_ratings):
    """Scrape ratings and review text for one beer from beeradvocate.com.

    Parameters
    ----------
    beer_link : str
        Base URL of the beer page (assumed to already end with '/').
    number_of_ratings : int or str
        Total number of ratings to page through; the site shows 25 per page.

    Returns
    -------
    tuple[list, list]
        (rate, reviews) — one list entry appended per scraped page.
    """
    reviews = []
    rate = []
    # Log in ONCE before paging. The original re-created the session and
    # re-posted the login on every iteration, which restarts the session
    # and is the likely reason later pages came back empty.
    session = requests.session()
    payload = {'login': 'suzie102', 'password': ''}
    session.post("https://www.beeradvocate.com/community/login/login",
                 data=payload)
    for pages_i in range(0, int(number_of_ratings), 25):  # 25 results/page
        # No extra '/' here: beer_link already ends with one, and
        # 'link//?view=...' produced malformed URLs.
        url = beer_link + '?view=beer&sort=&start=%d' % pages_i
        page1 = session.get(url)
        time.sleep(3)  # be polite to the server between page fetches
        soup1 = lxml.html.fromstring(page1.text)
        # Rating values appear in every third "muted" span starting at
        # index 8 — presumably tied to the page layout; verify if it changes.
        rate_i = soup1.xpath('//span[@class = "muted"]/text()')[8::3]
        print(url)
        reviews_i = soup1.xpath('//div/text()')
        reviews.append(reviews_i)
        print(len(reviews))
        rate.append(rate_i)
    return rate, reviews
Results:
There is only one problem that I've seen.
url = beer_link+'/?view=beer&sort=&start=%d'%(pages_i)
The / is redundant; what you need is url = beer_link+'?view=beer&sort=&start=%d'%(pages_i)
That is why there is //?view in your printed links.
I can see that there are "next" anchor links leading to the next page. I would recommend a while loop or recursion to follow them.
Other than that, I can't see what is missing from your script. Everything else looks in order and it should work.
If you could give us more details, we might have more to work with.
Update: thanks to everyone's comments, I tried scraping with Selenium instead. It works now.
def webstite_scrape_p2(beer_link, number_of_ratings):
    """Scrape ratings, review text, and average user scores via Selenium.

    Parameters
    ----------
    beer_link : iterable of str
        Base URLs of the beer pages (each assumed to end with '/').
    number_of_ratings : iterable
        Per-beer rating counts, parallel to ``beer_link``; 25 shown per page.

    Returns
    -------
    tuple[list, list, list]
        (rate, reviews, avg_user) — one list entry appended per scraped page.
    """
    driver = webdriver.Chrome('/home/sam/Downloads/chromedriver')
    try:
        # Log in once; the review pages require an authenticated session.
        driver.get('https://www.beeradvocate.com/community/login/')
        loginelement = WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
            (By.XPATH, '//form[@class="xenForm formOverlay"]//dd//input[@name ="login"]')))
        loginelement.send_keys('suzie102')
        pwelement = WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
            (By.XPATH, '//form[@class="xenForm formOverlay"]//dl[@class ="ctrlUnit"]//dd//ul//li[@id = "ctrl_pageLogin_registered_Disabler"]//input[@name ="password"]')))
        pwelement.send_keys('')
        page_click = WebDriverWait(driver, 20).until(EC.element_to_be_clickable(
            (By.XPATH, '//form[@class="xenForm formOverlay"]//dl[@class ="ctrlUnit submitUnit"]//dd//input[@type ="submit"]')))
        page_click.click()

        rate = []
        reviews = []
        avg_user = []
        # BUG FIX: iterate the *parameter* number_of_ratings; the original
        # referenced an undefined global ``number_of_rev``.
        for link, count in zip(beer_link, number_of_ratings):
            for pages_i in tqdm(range(0, int(count), 25)):  # 25 results/page
                new_url = link + '?view=beer&sort=&start=%d' % pages_i
                print(new_url)
                # Fetch the page once (the original issued driver.get twice).
                driver.get(new_url)
                time.sleep(5)  # let the page render; also rate-limits politely
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                # Rating values are every third "muted" span starting at
                # index 8 — presumably tied to the page layout; verify if it changes.
                rate_i = [s.get_text() for s in soup.find_all('span', class_="muted")][8::3]
                rate.append(rate_i)
                reviews_i = [d.get_text() for d in soup.find_all('div')]
                reviews.append(reviews_i)
                avg_i = [s.get_text() for s in soup.find_all('span', class_="BAscore_norm")]
                avg_user.append(avg_i)
        return rate, reviews, avg_user
    finally:
        driver.quit()  # always release the browser, even when scraping fails
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.