[英]Beautiful Soup can't access all images in a webpage
我正在嘗試下載 Trendyol 上某個產品的所有圖像。當我使用 find_all 方法時,Soup 找不到 div“gallery-container”,返回的結果是空的。之后,我嘗試直接查找頁面上的所有圖像,但無法訪問全部圖像,只能得到低分辨率的版本。然而,當我在 Chrome 的檢查工具中查看時,可以看到高分辨率的圖像。我怎樣才能獲取這些圖像?
代碼 - GetProductInfo.py
import os
from os.path import basename

import requests
from bs4 import BeautifulSoup
from requests_html import HTMLSession
class GetProductInfo:
    """Fetch a product page, render its JavaScript, and save its images.

    The page is rendered with requests-html so that dynamically injected
    <img> tags become visible to BeautifulSoup.
    """

    def __init__(self, link):
        """Download and render *link*, keeping the parsed soup on self.

        The original code declared ``global session`` / ``global html_text``
        at class level, which had no effect: both names were plain locals
        inside ``__init__``.  They are ordinary locals here by design.
        """
        session = HTMLSession()
        response = session.get(link)
        # Execute the page's JavaScript so lazily loaded images appear.
        response.html.render()
        self.soup = BeautifulSoup(response.html.html, 'lxml')

    def saveImages(self, folder="images"):
        """Save every <img> with an absolute ``src`` URL into *folder*.

        Bug fix: the original opened ``images/<name>`` without ensuring the
        directory existed, raising FileNotFoundError on a fresh checkout.
        """
        os.makedirs(folder, exist_ok=True)
        for img in self.soup.select("img[src^=http]"):
            url = img["src"]
            with open(os.path.join(folder, basename(url)), "wb") as f:
                f.write(requests.get(url).content)
代碼 - test.py
from GetProductInfo import *
product = GetProductInfo("https://www.trendyol.com/aqua-di-polo-1987/unisex-kol-saati-apl12c195h02-p-3929108")
product.saveImages()
您可以簡單地將托管圖像鏈接的 JavaScript 對象正則表達式,然后使用 json 進行解析。 根本不需要渲染頁面。 這些鏈接適用於該圖庫中可查看的高分辨率圖像。
# Extract the product-detail JSON that Trendyol embeds for its JS app;
# no browser rendering is needed to reach the high-resolution images.
import requests
import json
import re

r = requests.get('https://www.trendyol.com/aqua-di-polo-1987/unisex-kol-saati-apl12c195h02-p-3929108')
match = re.search(r'PRODUCT_DETAIL_APP_INITIAL_STATE__=(.*?);', r.text)
if match is None:
    # Bug fix: the original called .group(1) unconditionally and would
    # raise a confusing AttributeError if the page layout ever changed.
    raise RuntimeError('product-detail state JSON not found in the page')
data = json.loads(match.group(1))
# Image paths in the JSON are site-relative; prefix the origin.
images = ['https://www.trendyol.com' + img for img in data['product']['images']]
print(images)
嘿,我正在測試 github copilot 如何解決 stackoverflow 問題。
# Problem: Beautiful Soup can't access all images in a webpage
# Solution:
# 1. Get all images from the webpage
# 2. Save all images to a folder
import requests
from bs4 import BeautifulSoup
import os
# Get all images from a webpage
def get_all_images(url):
    """Download *url* and return every <img> tag found in its static HTML.

    NOTE(review): this only sees tags present in the raw HTML response;
    images injected later by JavaScript will not appear.

    Parameters:
        url: page to fetch.
    Returns:
        list of bs4 Tag objects, one per <img> element.
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    # findAll already returns a list; no identity comprehension needed.
    images = soup.findAll('img')
    print(f"{len(images)} images found.")
    return images
# Save all images to a folder
def save_all_images(images, folder):
    """Download each image's ``src`` URL and write it into *folder*.

    Files are named by their index; the extension is taken from the last
    dot-separated segment of the URL (a query string would leak into the
    extension — acceptable for this simple scraper).

    Parameters:
        images: iterable of bs4 Tag objects (or mappings) with a 'src' key.
        folder: target directory, created if it does not exist.
    """
    # exist_ok avoids the check-then-create race of the original
    # os.path.exists()/os.makedirs() pair.
    os.makedirs(folder, exist_ok=True)
    print("Saving images to " + folder)
    for i, img in enumerate(images):
        try:
            src = img['src']
            img_data = requests.get(src).content
            ext = src.split('.')[-1]
            with open(os.path.join(folder, f"{i}.{ext}"), 'wb') as handler:
                handler.write(img_data)
        except Exception as e:
            # Deliberate best-effort: skip any image whose key is missing
            # or whose download/write fails, and report why.
            print(e)
    print("All images saved.")
# Main
def main():
url = "https://www.trendyol.com/aqua-di-polo-1987/unisex-kol-saati-apl12c195h02-p-3929108"
images = get_all_images(url)
save_all_images(images, 'images')
main()
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.