I have been tried to scrape this Olx ad webpage with Python, but I cant make it to find the link for next page, because h.xpath
returns empty array. I think the XPath expressions is probably wrong. Could someone help?
XPath expression:
next_page2 = h.xpath('//Div[@class="sc-hmzhuo inUUEM sc-jTzLTM iwtnNi"]/a/@href')
print(next_page2)
Full code (.py):
#https://medium.com/@henriquecoura_87435/webscraping-com-python-extraindo-dados-de-um-ecommerce-89c16b622f69
import lxml.html as parser
import csv
import requests
import time
import os.path
MAX_PAGES = 3 #float('inf')
MAX_ADS = 100 #float('inf')
DELAY_INTERVAL = 5 #seconds
headers = {'User-Agent': '"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36"'}
start_urls = [
# "https://pe.olx.com.br/grande-recife/recife/pina/imoveis/aluguel/apartamentos?bas=2&pe=2500&ps=1000&q=suite&ros=2", #2500-1000, 2/4, suite, Pina
# "https://pe.olx.com.br/grande-recife/recife/espinheiro/imoveis/aluguel/apartamentos?bas=2&pe=2500&ps=1000&q=suite&ros=2", #2500-1000, 2/4, suite, Espinheiro
# "https://pe.olx.com.br/grande-recife/recife/imoveis/aluguel/apartamentos?bas=2&pe=2500&ps=1000&q=suite&ros=2&sd=3743&sd=3771&sd=3770&sd=3751", #2500-1000, 2/4, suite, Boa Vista, Torreão, Derby, Santo Amaro
# "https://pe.olx.com.br/grande-recife/recife/gracas/imoveis/aluguel/apartamentos?bas=2&pe=2500&ps=1000&q=suite&ros=2", #2500-1000, 2/4, suite, Graças
"https://ba.olx.com.br/grande-salvador/salvador/imbui/imoveis/venda/apartamentos?pe=280000&ps=200000&ros=2" #2500-1000, 2/4, suite, Boa Viagem
]
csv_dic = {'index': '', 'title': '', 'price': '', 'description': '', 'link': '', 'CEP': '', 'Município': '', 'Bairro': '', 'Logradouro': ''}
'''
# Crawlear a primeira página
r = requests.get(start_url, headers=headers)
h = parser.fromstring(r.text)
links += h.xpath('//li[@class="sc-1fcmfeb-2 ggOGTJ"]/a/@href') # returns the links for all ads
print("Page 1: " + start_url)
print("Links for all ads on the page 1")
print("Links: " + links)
print("How many links: " + len(links))
print("")
# A próxima página na página inicial está no index 0, nao tem página anterior
next_page = h.xpath('//li[@class="sc-1m4ygug-0 ilgdSS"]/a/@href')[0] # returns the link for next page
'''
ads = 0
j = 1
for start_url in start_urls:
i = 1
links = []
print("Crawling page by page: " + str(j))
next_page = start_url
while next_page:
r = requests.get(next_page, headers=headers)
h = parser.fromstring(r.text)
next_page2 = h.xpath('//Div[@class="sc-hmzhuo inUUEM sc-jTzLTM iwtnNi"]/a/@href')
print(next_page2)
'''
catch = h.xpath('//li[@class="sc-1fcmfeb-2 juiJqh"]/a/@href')
print("Page: " + str(i) + ": " + next_page)
print("Links for all ads on the page " + str(i))
print(catch)
print("How many ads: " + str(len(catch)))
links += catch
print("How many ads in total: " + str(len(links)))
print("")
try:
# next_page = h.xpath('//li[@class="sc-1m4ygug-0 ilgdSS"]/a/@href')[0]
next_page = h.xpath('//Div[@class="sc-hmzhuo kJjuHR sc-jTzLTM iwtnNi"]/a/@href')[0]
print("Next page: " + next_page)
print("")
# Se tivermos um IndexError é porque estamos na última página
# não existe mais uma next_page
# Interrompa o loop
except IndexError as e:
next_page = None
print("except 1")
if i >= MAX_PAGES:
next_page = None
print("except 2")
i += 1
time.sleep(DELAY_INTERVAL) # Delays for 5 seconds
'''
'''
i=1
print("Scraping ad/link by ad/link")
print("How many ads to scrape: " + str(len(links)))
for link in links:
ads += 1
towrite = []
towrite_h = []
csv_dic.clear()
print("Ad/link spraping " + str(i) + ": " + link)
r = requests.get(link, headers=headers)
h = parser.fromstring(r.text)
csv_dic["index"] = ads
title = h.xpath('//h1[@class="sc-45jt43-0 eCghYu sc-ifAKCX cmFKIN"]/text()')[0]
csv_dic["title"] = title
print("title: " + title)
price = h.xpath('//h2[@class="sc-1wimjbb-0 JzEH sc-ifAKCX cmFKIN"]/text()')[0]
csv_dic["price"] = price
print("price: " + price)
description = h.xpath('//span[@class="sc-1sj3nln-1 eOSweo sc-ifAKCX cmFKIN"]/text()')[0].replace('\n','')
csv_dic["description"] = description
print("description: " + description)
csv_dic["link"] = link
print("link: " + link)
loc = h.xpath('//div[@class="sc-bwzfXH h3us20-0 cBfPri"]//dd[@class="sc-1f2ug0x-1 ljYeKO sc-ifAKCX kaNiaQ"]/text()')
loc.insert(0, "")
loc.insert(0, "")
loc.insert(5, "")
print(loc)
loc_keys = h.xpath('//div[@class="sc-bwzfXH h3us20-0 cBfPri"]//dt[@class="sc-1f2ug0x-0 cLGFbW sc-ifAKCX cmFKIN"]/text()')
print(loc_keys)
v = 0
for key in loc_keys:
csv_dic[key] = loc[v]
v += 1
print("loc: ")
print(loc)
towrite.extend(list(csv_dic.values()))
towrite_h.extend(list(csv_dic.keys()))
if os.path.exists('Olx.csv'):
with open('Olx.csv', 'a', newline='') as file:
writer = csv.writer(file)
writer.writerow(towrite)
else:
with open('Olx.csv', 'w', newline='') as file:
writer = csv.writer(file)
writer.writerow(towrite_h)
writer.writerow(towrite)
if i >= MAX_ADS:
break
i += 1
time.sleep(DELAY_INTERVAL) # Delays for 5 seconds
j += 1
print("")
'''
print("Total of ads visited: " + str(ads))
You can look not only at the classes, you can use and other attributes, eg:
next_link = h.cssselect('a[data-lurker-detail="next_page"]')
if next_link:
print(next_link[0].attrib["href"])
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.