
Scraping in Python - Trying to get URL for next page

I have been trying to scrape this Olx listings page with Python, but I cannot get the link to the next page because h.xpath returns an empty array. I think the XPath expression may be wrong. Can anyone help?

XPath expression:

    next_page2 = h.xpath('//Div[@class="sc-hmzhuo inUUEM sc-jTzLTM iwtnNi"]/a/@href')
    print(next_page2)

Full code (.py):

#https://medium.com/@henriquecoura_87435/webscraping-com-python-extraindo-dados-de-um-ecommerce-89c16b622f69

import lxml.html as parser
import csv
import requests
import time
import os.path

MAX_PAGES = 3 #float('inf')
MAX_ADS =  100 #float('inf')
DELAY_INTERVAL = 5 #seconds

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'}
start_urls = [
#    "https://pe.olx.com.br/grande-recife/recife/pina/imoveis/aluguel/apartamentos?bas=2&pe=2500&ps=1000&q=suite&ros=2", #2500-1000, 2/4, suite, Pina
#    "https://pe.olx.com.br/grande-recife/recife/espinheiro/imoveis/aluguel/apartamentos?bas=2&pe=2500&ps=1000&q=suite&ros=2", #2500-1000, 2/4, suite, Espinheiro
#    "https://pe.olx.com.br/grande-recife/recife/imoveis/aluguel/apartamentos?bas=2&pe=2500&ps=1000&q=suite&ros=2&sd=3743&sd=3771&sd=3770&sd=3751", #2500-1000, 2/4, suite, Boa Vista, Torreão, Derby, Santo Amaro
#    "https://pe.olx.com.br/grande-recife/recife/gracas/imoveis/aluguel/apartamentos?bas=2&pe=2500&ps=1000&q=suite&ros=2", #2500-1000, 2/4, suite, Graças
    "https://ba.olx.com.br/grande-salvador/salvador/imbui/imoveis/venda/apartamentos?pe=280000&ps=200000&ros=2" #2500-1000, 2/4, suite, Boa Viagem
]

csv_dic = {'index': '', 'title': '', 'price': '', 'description': '',  'link': '', 'CEP': '', 'Município': '', 'Bairro': '', 'Logradouro': ''}

'''
# Crawl the first page
r = requests.get(start_url, headers=headers)
h = parser.fromstring(r.text)
links = h.xpath('//li[@class="sc-1fcmfeb-2 ggOGTJ"]/a/@href') # returns the links for all ads
print("Page 1: " + start_url)
print("Links for all ads on the page 1")
print("Links: " + links)
print("How many links: " + len(links))
print("")

# On the start page the next-page link is at index 0, since there is no previous page
next_page = h.xpath('//li[@class="sc-1m4ygug-0 ilgdSS"]/a/@href')[0] # returns the link for next page
'''
ads = 0
j = 1
for start_url in start_urls:
    i = 1
    links = []
    print("Crawling page by page: " + str(j))

    next_page = start_url
    while next_page:

        r = requests.get(next_page, headers=headers)
        h = parser.fromstring(r.text)

        next_page2 = h.xpath('//Div[@class="sc-hmzhuo inUUEM sc-jTzLTM iwtnNi"]/a/@href')
        print(next_page2)

        '''
        catch = h.xpath('//li[@class="sc-1fcmfeb-2 juiJqh"]/a/@href')
        print("Page: " + str(i) + ": " + next_page)
        print("Links for all ads on the page " + str(i))
        print(catch)
        print("How many ads: " + str(len(catch)))
        links += catch
        print("How many ads in total: " + str(len(links)))
        print("")
        
        try:
#            next_page = h.xpath('//li[@class="sc-1m4ygug-0 ilgdSS"]/a/@href')[0]
            next_page = h.xpath('//Div[@class="sc-hmzhuo kJjuHR sc-jTzLTM iwtnNi"]/a/@href')[0]
            print("Next page: " + next_page)
            print("")
        # An IndexError means we are on the last page:
        # there is no next_page anymore,
        # so break out of the loop
        except IndexError as e:
            next_page = None
            print("except 1")
        if i >= MAX_PAGES:
            next_page = None
            print("except 2")
        i += 1
        time.sleep(DELAY_INTERVAL)  # Delays for 5 seconds
        '''
'''
    i=1
    print("Scraping ad/link by ad/link")
    print("How many ads to scrape: " + str(len(links)))
    for link in links:
        ads += 1
        towrite = []
        towrite_h = []
        csv_dic.clear()
        print("Ad/link spraping " + str(i) + ": " + link)

        r = requests.get(link, headers=headers)
        h = parser.fromstring(r.text)

        csv_dic["index"] = ads

        title = h.xpath('//h1[@class="sc-45jt43-0 eCghYu sc-ifAKCX cmFKIN"]/text()')[0]
        csv_dic["title"] = title
        print("title: " + title)

        price = h.xpath('//h2[@class="sc-1wimjbb-0 JzEH sc-ifAKCX cmFKIN"]/text()')[0]
        csv_dic["price"] = price
        print("price: " + price)

        description = h.xpath('//span[@class="sc-1sj3nln-1 eOSweo sc-ifAKCX cmFKIN"]/text()')[0].replace('\n','')
        csv_dic["description"] = description
        print("description: " + description)

        csv_dic["link"] = link
        print("link: " + link)

        loc = h.xpath('//div[@class="sc-bwzfXH h3us20-0 cBfPri"]//dd[@class="sc-1f2ug0x-1 ljYeKO sc-ifAKCX kaNiaQ"]/text()')
        loc.insert(0, "")
        loc.insert(0, "")
        loc.insert(5, "")
        print(loc)
        loc_keys = h.xpath('//div[@class="sc-bwzfXH h3us20-0 cBfPri"]//dt[@class="sc-1f2ug0x-0 cLGFbW sc-ifAKCX cmFKIN"]/text()')
        print(loc_keys)
        v = 0
        for key in loc_keys:
            csv_dic[key] = loc[v]
            v += 1
        print("loc: ")
        print(loc)

        towrite.extend(list(csv_dic.values()))
        towrite_h.extend(list(csv_dic.keys()))

        if os.path.exists('Olx.csv'):
            with open('Olx.csv', 'a', newline='') as file:
                writer = csv.writer(file)
                writer.writerow(towrite)
        else:
            with open('Olx.csv', 'w', newline='') as file:
                writer = csv.writer(file)
                writer.writerow(towrite_h)
                writer.writerow(towrite)

        if i >= MAX_ADS:
            break
        i += 1
        time.sleep(DELAY_INTERVAL)  # Delays for 5 seconds
    j += 1
    print("")
'''
print("Total of ads visited: " + str(ads))

Even with the tag fixed, instead of relying on those generated class names you can match on other attributes, for example:

next_link = h.cssselect('a[data-lurker-detail="next_page"]')

if next_link:
    print(next_link[0].attrib["href"])
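
The same attribute can be queried with XPath and dropped straight into the paging loop in place of the class-based lookup. A sketch under that assumption (only the data-lurker-detail attribute from the snippet above is relied on; the rest mirrors the question's loop):

# attribute-based XPath equivalent of the cssselect query above
next_links = h.xpath('//a[@data-lurker-detail="next_page"]/@href')

# inside the while loop this replaces the class-based expression;
# None on the last page ends the loop
next_page = next_links[0] if next_links else None

Matching on a data attribute is also more robust here: the sc-* class names appear to come from a CSS-in-JS build step and tend to change whenever the site is redeployed, while an attribute like data-lurker-detail is part of the page's tracking markup and is less volatile.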
