Skips the first page. Scraping Python
The program does not collect data from the first page; it starts collecting from the second page. If I try to collect data from the first page separately, everything works, but when looping through the pages, the first page is skipped.
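Before looking at the loop, it is worth checking whether the site redirects a request with an explicit page=1 parameter; a redirected or rewritten first-page URL is one common reason a first page "disappears" only when paginating. A minimal check with requests, using the same listing URL as the code below:

import requests

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"
}
url = "https://www.olx.ua/d/uk/elektronika/noutbuki-i-aksesuary/noutbuki/?currency=UAH&page=1"

r = requests.get(url, headers=headers)
print(r.url)      # final URL after any redirects
print(r.history)  # non-empty list of intermediate responses if a redirect happened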
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"
}

def collect_products(url="https://www.olx.ua/d/uk/elektronika/noutbuki-i-aksesuary/noutbuki/?currency=UAH"):
    response = requests.get(url=url, headers=headers)
    data_list = []
    soup = BeautifulSoup(response.text, 'lxml')
    # The total page count is taken from the last pagination link
    page_count = int(soup.find('section', class_='css-j8u5qq').find_all('a', class_='css-1mi714g')[-1].text.strip())
    print(f'[INFO] Total pages: {page_count}')
    for page in range(1, page_count + 1):
        data = {}
        print(f'[INFO] Processing page {page}')
        # Note: page 1 is requested with an explicit &page=1 parameter
        url = f"https://www.olx.ua/d/uk/elektronika/noutbuki-i-aksesuary/noutbuki/?currency=UAH&page={page}"
        response = requests.get(url=url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        items = soup.find_all("div", {"data-cy": "l-card"})
        for item in items:
            olx = 'https://www.olx.ua'
            try:
                link = olx + item.find('a', class_='css-rc5s2u').get('href').strip()
            except Exception:
                link = 'err'
            try:
                title = item.find('h6', class_='css-1pvd0aj-Text eu5v0x0').text.strip()
            except Exception:
                title = 'err'
            try:
                fettle = item.find('div', class_='css-puf171').text.strip()
            except Exception:
                fettle = 'err'
            try:
                price = item.find('p', class_='css-1q7gvpp-Text eu5v0x0').text.strip()
            except Exception:
                price = 'err'
            try:
                # Open the ad page itself to read the full description
                response = requests.get(url=link, headers=headers)
                soup = BeautifulSoup(response.text, 'lxml')
                description = soup.find('div', class_='css-g5mtbi-Text').text.strip()
            except Exception:
                description = 'err'
            print(title)
            print(fettle)
            print(price)
            print(link)
            print(description)
    return data_list

if __name__ == '__main__':
    collect_products()
What are other options to solve the problem?
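One quick thing to try, assuming the explicit page=1 parameter is what the site mishandles (an assumption, not confirmed against OLX's behaviour), is to request the bare listing URL for the first page and append &page= only from page 2 onward:

import requests

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"
}
base = "https://www.olx.ua/d/uk/elektronika/noutbuki-i-aksesuary/noutbuki/?currency=UAH"
page_count = 3  # hypothetical value; the real code reads it from the pagination links

for page in range(1, page_count + 1):
    # Page 1 uses the bare URL; later pages get an explicit &page= parameter
    page_url = base if page == 1 else f"{base}&page={page}"
    response = requests.get(page_url, headers=headers)
    print(page, response.status_code, response.url)

A fuller alternative is to rewrite the pagination asynchronously, as in the answer below.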
import httpx
import trio
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0'
}

class Spider:
    def __init__(self, client) -> None:
        self.client = client
        # Cap the number of concurrent requests at 10
        self.limiter = trio.CapacityLimiter(10)

    async def get(self, page):
        params = {
            'currency': 'UAH',
            'page': page
        }
        # Retry until the request goes through and returns a 2xx response
        while True:
            try:
                r = await self.client.get('noutbuki', params=params)
                if r.is_success:
                    break
            except httpx.RequestError:
                continue
        return await self.get_soup(r.text)

    async def get_soup(self, content):
        return BeautifulSoup(content, 'lxml')

async def crawl(data, page, sender):
    async with data.limiter, sender:
        soup = await data.get(page)
        # Resolve relative ad links against the client's base URL
        goal = [urljoin(str(data.client.base_url), x['href'])
                for x in soup.select('a.css-rc5s2u, a.marginright5')]
        await sender.send(goal)

async def main():
    async with httpx.AsyncClient(timeout=5, headers=headers, follow_redirects=True,
                                 base_url='https://www.olx.ua/d/uk/elektronika/noutbuki-i-aksesuary/') as client, \
            trio.open_nursery() as nurse:
        sender, receiver = trio.open_memory_channel(0)
        nurse.start_soon(rec, receiver)
        data = Spider(client)
        # Each crawler gets its own clone of the send channel; the receiver
        # finishes once every clone has been closed
        async with sender:
            for page in range(1, 3):
                nurse.start_soon(crawl, data, page, sender.clone())

async def rec(receiver):
    async with receiver:
        allin = []
        async for val in receiver:
            allin.extend(val)
        df = pd.DataFrame(allin, columns=['URL'])
        print(df)

if __name__ == "__main__":
    trio.run(main)
Output:
URL
0 https://www.olx.ua/d/uk/obyavlenie/lenovo-thin...
1 https://www.olx.ua/d/uk/obyavlenie/kak-novyy-i...
2 https://www.olx.ua/d/uk/obyavlenie/dell-xps-13...
3 https://www.olx.ua/d/uk/obyavlenie/apple-macbo...
4 https://www.olx.ua/d/uk/obyavlenie/u-menya-est...
.. ...
91 https://www.olx.ua/d/uk/obyavlenie/noutbuk-ace...
92 https://www.olx.ua/d/uk/obyavlenie/noutbuk-na-...
93 https://www.olx.ua/d/uk/obyavlenie/noutbuk-fuj...
94 https://www.olx.ua/d/uk/obyavlenie/noutbuk-15-...
95 https://www.olx.ua/d/uk/obyavlenie/ultrabuk-hp...
[96 rows x 1 columns]
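The answer hardcodes range(1, 3), i.e. two pages. If the full page count is needed, one sketch (reusing the pagination selector from the question's code, a generated class name that may change whenever OLX updates its markup) is to fetch the first page and read the last pagination link before spawning the crawlers:

async def page_count(spider):
    # 'css-1mi714g' is the pagination-link class from the question's code;
    # it is an assumption here, since OLX class names are auto-generated.
    soup = await spider.get(1)
    links = soup.select('a.css-1mi714g')
    return int(links[-1].text.strip()) if links else 1

main() could then call pages = await page_count(data) once and loop over range(1, pages + 1) instead of the hardcoded range.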