Skips the first page. Scraping python

The program does not collect data from the first page; it starts collecting from the second page.

If I collect data from the first page separately, everything works. But when I loop through the pages, the first page is skipped.

import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"
}

def collect_products(url="https://www.olx.ua/d/uk/elektronika/noutbuki-i-aksesuary/noutbuki/?currency=UAH"):
    response = requests.get(url = url, headers = headers)
    data_list = []
    soup = BeautifulSoup(response.text, 'lxml')
    page_count = int(soup.find('section', class_ = 'css-j8u5qq').find_all('a', class_ = 'css-1mi714g')[-1].text.strip())
    print(f'[INFO] Total pages: { page_count }')
    for page in range(1, page_count + 1):
        data = {}
        print(f'[INFO] Processing {page} page')
        url = f"https://www.olx.ua/d/uk/elektronika/noutbuki-i-aksesuary/noutbuki/?currency=UAH"+f"&page={ page }"
        response = requests.get(url = url, headers = headers)
        soup = BeautifulSoup(response.text, 'lxml')
        items = soup.find_all("div", {"data-cy" : "l-card"})
        for item in items:
            olx = 'https://www.olx.ua'
            try:
                link = olx + item.find('a', class_ = 'css-rc5s2u').get('href').strip()
            except:
                link = 'err'
            try:
                title = item.find('h6', class_ = 'css-1pvd0aj-Text eu5v0x0').text.strip()
            except:
                title = 'err'
            try:
                fettle = item.find('div', class_ = 'css-puf171').text.strip()
            except:
                fettle = 'err'
            try:
                price = item.find('p', class_ = 'css-1q7gvpp-Text eu5v0x0').text.strip()
            except:
                price = 'err'

            try:    
                url = f"{link}"
                response = requests.get(url = url, headers = headers)
                soup = BeautifulSoup(response.text, 'lxml')
                description = soup.find('div' , class_ = 'css-g5mtbi-Text').text.strip()
            except:
                description = 'err'


            print(title)
            print(fettle)
            print(price)
            print(link)
            print(description)


    return data_list

if __name__ == '__main__':
    collect_products()

What are other options to solve the problem?
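Before rewriting everything, here is a minimal diagnostic sketch (an assumption only, not confirmed against OLX: the listing might treat "?page=1" differently from the bare URL, for example redirect it or reorder the results). It sends the page parameter only from the second page onward and prints the final URL plus the card count per page, so it becomes visible where the first page is lost.

import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"
}

BASE_URL = "https://www.olx.ua/d/uk/elektronika/noutbuki-i-aksesuary/noutbuki/"

def collect_page(page):
    # Assumption: send "page" only from page 2 onward, in case "?page=1"
    # is handled differently from the bare listing URL.
    params = {"currency": "UAH"}
    if page > 1:
        params["page"] = page
    response = requests.get(BASE_URL, params=params, headers=headers)
    soup = BeautifulSoup(response.text, "lxml")
    items = soup.find_all("div", {"data-cy": "l-card"})
    # response.url shows the final URL after any redirect.
    print(f"[INFO] {response.url} -> {response.status_code}, {len(items)} cards")
    return items

if __name__ == "__main__":
    for page in range(1, 4):
        collect_page(page)

Another option is to switch to an asynchronous client and fetch the listing pages concurrently, as in the httpx/trio version below.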

import httpx
import trio
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0'
}


class Spider:
    def __init__(self, client) -> None:
        self.client = client
        self.limiter = trio.CapacityLimiter(10)  # allow at most 10 concurrent page requests

    async def get(self, page):
        params = {
            'currency': 'UAH',
            'page': page
        }
        while True:  # retry until the request succeeds
            try:
                r = await self.client.get('noutbuki', params=params)
                if r.is_success:
                    break
            except httpx.RequestError:
                continue
        return await self.get_soup(r.text)

    async def get_soup(self, content):
        return BeautifulSoup(content, 'lxml')


async def crawl(data, page, sender):
    async with data.limiter, sender:
        soup = await data.get(page)
        goal = [urljoin(str(data.client.base_url), x['href'])
                for x in soup.select('a.css-rc5s2u, a.marginright5')]
        await sender.send(goal)


async def main():
    async with httpx.AsyncClient(timeout=5, headers=headers, follow_redirects=True, base_url='https://www.olx.ua/d/uk/elektronika/noutbuki-i-aksesuary/') as client, trio.open_nursery() as nurse:
        sender, receiver = trio.open_memory_channel(0)  # unbuffered channel between the crawlers and the collector
        nurse.start_soon(rec, receiver)
        data = Spider(client)
        async with sender:
            for page in range(1, 3):
                nurse.start_soon(crawl, data, page, sender.clone())


async def rec(receiver):
    async with receiver:
        allin = []
        async for val in receiver:
            allin.extend(val)
    df = pd.DataFrame(allin, columns=['URL'])
    print(df)


if __name__ == "__main__":
    trio.run(main)

Output:

                                                  URL
0   https://www.olx.ua/d/uk/obyavlenie/lenovo-thin...
1   https://www.olx.ua/d/uk/obyavlenie/kak-novyy-i...
2   https://www.olx.ua/d/uk/obyavlenie/dell-xps-13...
3   https://www.olx.ua/d/uk/obyavlenie/apple-macbo...
4   https://www.olx.ua/d/uk/obyavlenie/u-menya-est...
..                                                ...
91  https://www.olx.ua/d/uk/obyavlenie/noutbuk-ace...
92  https://www.olx.ua/d/uk/obyavlenie/noutbuk-na-...
93  https://www.olx.ua/d/uk/obyavlenie/noutbuk-fuj...
94  https://www.olx.ua/d/uk/obyavlenie/noutbuk-15-...
95  https://www.olx.ua/d/uk/obyavlenie/ultrabuk-hp...

[96 rows x 1 columns]
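
The snippet above only collects the ad URLs. If the card fields from the question (title, price, condition) are needed as well, crawl() and rec() can be replaced along the lines of the sketch below; the CSS class names are copied from the question's script and are an assumption, since OLX changes them regularly.

# Sketch: drop-in replacements for crawl() and rec() from the snippet above.
async def crawl(data, page, sender):
    async with data.limiter, sender:
        soup = await data.get(page)
        rows = []
        for card in soup.select('div[data-cy="l-card"]'):
            link = card.select_one('a.css-rc5s2u, a.marginright5')
            title = card.select_one('h6.css-1pvd0aj-Text')
            price = card.select_one('p.css-1q7gvpp-Text')
            fettle = card.select_one('div.css-puf171')  # item condition
            rows.append({
                'url': urljoin(str(data.client.base_url), link['href']) if link else None,
                'title': title.get_text(strip=True) if title else None,
                'price': price.get_text(strip=True) if price else None,
                'condition': fettle.get_text(strip=True) if fettle else None,
            })
        await sender.send(rows)


async def rec(receiver):
    async with receiver:
        allin = []
        async for val in receiver:
            allin.extend(val)
    # Each row is a dict now, so no fixed column list is needed.
    print(pd.DataFrame(allin))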
