[英]How to scrape a webpage that uses javascript?
我正在使用 requests 和 BeautifulSoup 从房地产网站上抓取数据。 它有几个编号的“页面”,显示了几十个公寓。 我编写了一个循环运行所有这些页面并从公寓收集数据,但不幸的是,它们使用 javascript,因此,代码仅返回第一页的公寓。 我也尝试过使用 selenium,但遇到了同样的问题。
非常感谢您的任何建议!
这是代码:
# Scrape paginated apartment listings from vivareal.com.br with
# requests + BeautifulSoup. One accumulator list per feature extracted.
lista_preco = []       # sale price (R$)
lista_endereco = []    # address / zone
lista_tamanho = []     # size (m2)
lista_quartos = []     # bedroom count
lista_banheiros = []   # bathroom count
lista_vagas = []       # parking spaces
lista_condominio = []  # monthly condo fee (R$)
lista_amenidades = []  # amenity keywords
lista_fotos = []       # photo URLs (extraction currently disabled below)
lista_sites = []       # listing URLs

n_pages = 0
for page in range(1, 15):
    n_pages += 1
    page_url = ('https://www.vivareal.com.br/venda/bahia/salvador/'
                'apartamento_residencial/' + '?pagina=' + str(page))
    response = requests.get(page_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    house_containers = soup.find_all('div', {'class': 'js-card-selector'})
    if not house_containers:
        # NOTE(review): pages past the first are rendered by JavaScript,
        # so the raw HTML may contain no cards here — see the JSON API
        # approach for a reliable alternative.
        break
    for container in house_containers:
        # Price: the text before 'C' (the "Condomínio" label) is the sale
        # price; strip the currency symbol and thousands separators.
        price = container.find_all('section', class_='property-card__values')[0].text
        try:
            price = int(price[:price.find('C')].replace('R$', '').replace('.', '').strip())
        except ValueError:
            price = 0  # price missing or not numeric ("Sob consulta")
        lista_preco.append(price)

        # Zone / address
        location = container.find_all('span', class_='property-card__address')[0].text
        lista_endereco.append(location.strip())

        # Size: may be a range like "50 - 70 m²"; keep the lower bound.
        size = container.find_all('span', class_='property-card__detail-value js-property-card-value property-card__detail-area js-property-card-detail-area')[0].text
        if '-' not in size:
            size = int(size[:size.find('m')].replace(',', '').strip())
        else:
            size = int(size[:size.find('-')].replace(',', '').strip())
        lista_tamanho.append(size)

        # Rooms: text before 'Q' ("Quartos"); ranges keep the lower bound.
        quartos = container.find_all('li', class_='property-card__detail-item property-card__detail-room js-property-detail-rooms')[0].text
        quartos = quartos[:quartos.find('Q')].strip()
        if '-' in quartos:
            quartos = quartos[:quartos.find('-')].strip()
        lista_quartos.append(int(quartos))

        # Bathrooms: text before 'B' ("Banheiros").
        banheiros = container.find_all('li', class_='property-card__detail-item property-card__detail-bathroom js-property-detail-bathroom')[0].text
        banheiros = banheiros[:banheiros.find('B')].strip()
        if '-' in banheiros:
            banheiros = banheiros[:banheiros.find('-')].strip()
        lista_banheiros.append(int(banheiros))

        # Garage: text before 'V' ("Vagas"); "--" means no spaces listed.
        vagas = container.find_all('li', class_='property-card__detail-item property-card__detail-garage js-property-detail-garages')[0].text
        vagas = vagas[:vagas.find('V')].strip()
        if '--' in vagas:
            vagas = '0'
        lista_vagas.append(int(vagas))

        # Condo fee: the last "R$" amount in the same values section.
        condominio = container.find_all('section', class_='property-card__values')[0].text
        try:
            condominio = int(condominio[condominio.rfind('R$'):].replace('R$', '').replace('.', '').strip())
        except ValueError:
            condominio = 0  # fee not listed for this card
        lista_condominio.append(condominio)

        # Amenities: cards without an amenities list raise IndexError.
        try:
            amenidades = container.find_all('ul', class_='property-card__amenities')[0].text
            amenidades = amenidades.split()
        except IndexError:
            amenidades = 'Zero'
        lista_amenidades.append(amenidades)

        # Listing URL
        link = 'https://www.vivareal.com.br/' + container.find_all('a')[0].get('href')[1:-1]
        lista_sites.append(link)

        # Image (disabled): 2x-size thumbnail URL extraction.
        #p = str(container.find_all('img')[0])
        #imgurl = p[p.find('https'):p.rfind('data-src')]
        #imgurl.replace('"', '').strip()
        #lista_fotos.append(imgurl)
    time.sleep(randint(1, 2))  # be polite between page requests

print('You scraped {} pages containing {} properties.'.format(n_pages, len(lista_preco)))
你确实有选择。 无需使用 Selenium,因为您可以通过 api 访问数据。
该网站有一个限制,仅允许您对最多 10,000 个列表进行分页。 返回的数据远远多于您想要的数据,因此您可以查看 json 响应,看看是否还有其他要添加的内容:
代码:
import pandas as pd
import requests
import math
import time
import random
# Viva Real's listings-search API endpoint — the same XHR endpoint the
# site's JavaScript front-end calls, so no browser automation is needed.
url = 'https://glue-api.vivareal.com/v2/listings'

# A browser-like user-agent plus the x-domain header are sent with every
# request; the API identifies the originating site via x-domain.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36',
           'x-domain': 'www.vivareal.com.br'}

# Query parameters: residential apartments for sale in Salvador, Bahia.
# 'size'/'from' control pagination (page size and result offset) and are
# rewritten while paging; 'priceMin'/'priceMax' are added later to slice
# the search into price bands under the API's 10,000-result cap.
payload = {
    'addressCity': 'Salvador',
    'addressLocationId': 'BR>Bahia>NULL>Salvador',
    'addressNeighborhood': '',
    'addressState': 'Bahia',
    'addressCountry': 'Brasil',
    'addressStreet': '',
    'addressZone': '',
    'addressPointLat': '-12.977738',
    'addressPointLon': '-38.501636',
    'business': 'SALE',
    'facets': 'amenities',
    'unitTypes': 'APARTMENT',
    'unitSubTypes': 'UnitSubType_NONE,DUPLEX,LOFT,STUDIO,TRIPLEX',
    'unitTypesV3': 'APARTMENT',
    'usageTypes': 'RESIDENTIAL',
    'listingType': 'USED',
    'parentId': 'null',
    'categoryPage': 'RESULT',
    'size': '350',
    'from': '0',
    'q': '',
    'developmentsSize': '5',
    '__vt': '',
    'levels': 'CITY,UNIT_TYPE',
    'ref': '/venda/bahia/salvador/apartamento_residencial/',
    'pointRadius': ''}
def get_num_of_listings(priceMin, priceMax, payload, url, previous_priceMax, jsonData, previous_jsonData):
    """Recursively widen the price band [priceMin, priceMax] in R$25,000
    steps until the API reports 10,000+ listings (its pagination cap) or
    the band becomes effectively unbounded.

    Returns a 6-tuple (listings_count, priceMin, priceMax,
    previous_priceMax, jsonData, previous_jsonData), where
    previous_jsonData is the API response for the widest band that
    stayed under the cap — the one the caller should paginate.

    NOTE(review): mutates the shared `payload` dict and reads the
    module-level `headers`; each call performs one HTTP request.
    """
    randInt = random.uniform(5.1, 7.9)  # used by the commented-out throttle
    payload.update({'from':'0'})  # always probe from the first result
    #time.sleep(randInt)
    if priceMax > 2500000:
        # Above R$2.5M, open the band to an effectively unbounded ceiling.
        priceMax = 100000000
    payload.update({'priceMin':'%s' %priceMin,'priceMax':'%s' %priceMax})
    jsonData = requests.get(url, headers=headers, params=payload).json()
    listings_count = jsonData['search']['totalCount']
    if listings_count < 10000:
        if priceMax < 100000000:
            # Still under the cap: remember this band, then try 25k wider.
            print ('Price range %s - %s returns %s listings.' %(priceMin, priceMax, listings_count))
            previous_jsonData = jsonData
            previous_priceMax = priceMax
            priceMax += 25000
            listings_count, priceMin, priceMax, previous_priceMax, jsonData, previous_jsonData = get_num_of_listings(priceMin, priceMax, payload, url, previous_priceMax, jsonData, previous_jsonData)
        else:
            # Unbounded band is under the cap: this is the final band.
            # Push priceMin past the ceiling so the caller can terminate.
            previous_jsonData = jsonData
            previous_priceMax = 100000000
            priceMin = previous_priceMax + 1
            priceMax = priceMin + 250000 - 1
    # When listings_count >= 10000 the arguments are returned unchanged;
    # the caller then paginates previous_jsonData (last under-cap band).
    # NOTE(review): if the very first probed band already exceeds the cap,
    # previous_jsonData stays None — the caller must handle/avoid that.
    return listings_count, priceMin, priceMax, previous_priceMax, jsonData, previous_jsonData
# Walk the whole market in price bands, paginating each band (max 10,000
# results per band) and collecting one row per unique listing.
rows = []
priceMin = 1
priceMax = 250000     # initial R$ price band; widened by the probe below
finished = False
aquired = set()       # listing ids already saved (bands can overlap)

while not finished:
    randInt = random.uniform(5.1, 7.9)  # used by the commented-out throttle
    # Grow [priceMin, priceMax] until it would exceed the API's
    # 10,000-listing pagination cap; previous_jsonData holds the first
    # page (350 results) of the widest under-cap band.
    listings_count, priceMin, priceMax, previous_priceMax, jsonData, previous_jsonData = get_num_of_listings(priceMin, priceMax, payload, url, None, None, None)
    # NOTE(review): if even the first band exceeds the cap,
    # previous_jsonData is None and the next line fails.
    total_pages = math.ceil(previous_jsonData['search']['totalCount'] / 350)
    for page in range(1, total_pages + 1):
        if page == 1:
            idx = 0
            jsonData = previous_jsonData  # already fetched by the probe
        else:
            # Offset of this page. BUG FIX: the original used 350 * page,
            # which made page 2 start at offset 700 and skipped results
            # 350-699; page N starts at 350 * (N - 1).
            idx = 350 * (page - 1)
            payload.update({'from': '%s' % idx})
            if idx == 9800:
                payload.update({'size': 200})  # last slice before the cap
            else:
                payload.update({'size': 350})
            if idx > 9800:
                continue  # the API rejects offsets past 10,000
            #time.sleep(randInt)
            jsonData = requests.get(url, headers=headers, params=payload).json()
        listings = jsonData['search']['result']['listings']
        for listing in listings:
            listingId = listing['listing']['id']
            if listingId in aquired:
                continue  # duplicate across overlapping price bands
            zone = listing['listing']['address']['zone']
            size = listing['listing']['usableAreas'][0]
            bedrooms = listing['listing']['bedrooms'][0]
            bathrooms = listing['listing']['bathrooms'][0]
            if listing['listing']['parkingSpaces'] != []:
                parking = listing['listing']['parkingSpaces'][0]
            else:
                parking = None
            price = listing['listing']['pricingInfos'][0]['price']
            # monthlyCondoFee is absent for some listings.
            condoFee = listing['listing']['pricingInfos'][0].get('monthlyCondoFee')
            amenities = listing['listing']['amenities']
            listingUrl = 'https://www.vivareal.com.br' + listing['link']['href']
            row = {
                'Id': listingId,
                'Zone': zone,
                'Size': size,
                'Bedrooms': bedrooms,
                'Bathrooms': bathrooms,
                'Garage': parking,
                'Price': price,
                'Condominio': condoFee,
                'Amenidades': amenities,
                'url': listingUrl}
            aquired.add(listingId)
            rows.append(row)
        print('Page %s of %s' % (page, total_pages))
    # BUG FIX: advance to the band just above the one scraped; without
    # this the next probe re-queries an over-cap band whose fallback
    # (previous_jsonData) is None.
    priceMin = previous_priceMax + 1
    priceMax = priceMin + 250000 - 1
    if priceMax > 100000000:
        print('Done')
        finished = True

df = pd.DataFrame(rows)
Output:
IPdb [3]: print(df)
Id ... url
0 2511396476 ... https://www.vivareal.com.br/imovel/apartamento...
1 2494354474 ... https://www.vivareal.com.br/imovel/apartamento...
2 2504461896 ... https://www.vivareal.com.br/imovel/apartamento...
3 2508574459 ... https://www.vivareal.com.br/imovel/apartamento...
4 2511489082 ... https://www.vivareal.com.br/imovel/apartamento...
... ... ...
26244 94618731 ... https://www.vivareal.com.br/imovel/apartamento...
26245 93437597 ... https://www.vivareal.com.br/imovel/apartamento...
26246 79341843 ... https://www.vivareal.com.br/imovel/apartamento...
26247 2455978575 ... https://www.vivareal.com.br/imovel/apartamento...
26248 2509913182 ... https://www.vivareal.com.br/imovel/apartamento...
[26249 rows x 10 columns]
不幸的是,我认为您别无选择。 原因是:在新的前端技术中,HTML 是异步渲染的,需要一个能够运行 JavaScript 并加载页面的“真实”环境。 例如,对于 Ajax,您将需要一个真正的浏览器(Chrome、Firefox)才能使其工作。 所以,我的建议是您继续深入研究 Selenium,并模拟点击事件来逐页翻页(依次点击 1..2..3 这样的页码直到最后一页),等数据加载完成后,再读取 HTML 并提取您需要的数据。 问候。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.