简体   繁体   中英

Beautiful Soup can't access all images in a webpage

I am trying to download all the images of a product on Trendyol. When I use the find_all method, the soup does not see the div "gallery-container"; the result is empty. After that, I tried to find all the images on the page, but I can't access all of them: I only get low-resolution images. However, when I inspect the page in Chrome, I can see the high-resolution images. How can I get those images?

Screenshots

Code - GetProductInfo.py

from bs4 import BeautifulSoup
from requests_html import HTMLSession
from os.path  import basename
import requests

class GetProductInfo:
    """Load a product page with JavaScript rendered and save its images."""

    def __init__(self, link):
        """Download ``link``, render it, and parse the resulting HTML.

        Args:
            link: URL of the product page to load.
        """
        session = HTMLSession()
        try:
            response = session.get(link)
            # Render so that markup inserted by the page's scripts is part
            # of the HTML handed to the parser.
            response.html.render()
            self.soup = BeautifulSoup(response.html.html, 'lxml')
        finally:
            # The session is only needed while loading; release it (and any
            # browser it started) once the HTML has been captured.
            session.close()

    def saveImages(self):
        """Save every ``<img>`` whose ``src`` starts with ``http`` under ``images/``.

        Each file is named after the last path component of its URL. Images
        whose URL path has no final component are skipped.
        """
        import os
        from urllib.parse import urlparse

        # open() does not create missing directories.
        os.makedirs("images", exist_ok=True)
        for tag in self.soup.select("img[src^=http]"):
            url = tag["src"]
            # Use only the URL path so a query string never leaks into the
            # file name.
            filename = os.path.basename(urlparse(url).path)
            if not filename:
                continue
            content = requests.get(url).content
            with open(os.path.join("images", filename), "wb") as f:
                f.write(content)

Code - test.py

# Import only the class this script uses rather than every public name.
from GetProductInfo import GetProductInfo

product = GetProductInfo("https://www.trendyol.com/aqua-di-polo-1987/unisex-kol-saati-apl12c195h02-p-3929108")
product.saveImages()

You could simply regex out the JavaScript object hosting the image links, then parse with json. No need to render the page at all. The links are for the high resolution images viewable in that gallery.

import requests, json, re

r = requests.get('https://www.trendyol.com/aqua-di-polo-1987/unisex-kol-saati-apl12c195h02-p-3929108')
# The product data is embedded in the page as a JavaScript assignment. Find
# where the assigned value starts instead of guessing where it ends.
marker = re.search(r'PRODUCT_DETAIL_APP_INITIAL_STATE__\s*=\s*', r.text)
if marker is None:
    raise ValueError('PRODUCT_DETAIL_APP_INITIAL_STATE__ not found in the page')
# raw_decode parses exactly one JSON document and ignores the text after it,
# so a semicolon inside a string value cannot truncate the payload.
data, _ = json.JSONDecoder().raw_decode(r.text[marker.end():])
images = ['https://www.trendyol.com' + img for img in data['product']['images']]
print(images)

Hey, I'm testing how GitHub Copilot solves Stack Overflow questions.

# Problem: Beautiful Soup can't access all images in a webpage
# Solution:
# 1. Get all images from the webpage
# 2. Save all images to a folder

import requests
from bs4 import BeautifulSoup
import os


# Get all images from a webpage
def get_all_images(url):
    """Fetch ``url`` and return every ``<img>`` tag in the returned HTML.

    Only the markup delivered by the HTTP response is parsed; nothing is
    rendered or executed.

    Args:
        url: Address of the page to download.

    Returns:
        A list of the ``<img>`` tags BeautifulSoup found. The count is also
        printed.
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")

    # find_all is the current name; findAll is a deprecated alias.
    images = list(soup.find_all('img'))

    print(str(len(images)) + " images found.")
    return images


# Save all images to a folder
def save_all_images(images, folder):
    """Download each image tag's ``src`` and write it into ``folder``.

    Files are named by their position in ``images`` followed by the
    extension taken from the URL path. Tags without a ``src`` and downloads
    that fail are reported and skipped, so one bad image does not stop the
    rest.

    Args:
        images: Iterable of mapping-like tags that expose a ``src`` entry.
        folder: Directory to write into; created if it does not exist.
    """
    from urllib.parse import urlparse

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder, exist_ok=True)
    print("Saving images to " + folder)

    for i, img in enumerate(images):
        src = img.get('src')
        if not src:
            print("Skipping image " + str(i) + ": no src attribute")
            continue
        try:
            # Take the extension from the URL path only, so query strings
            # and path separators never end up in the file name.
            extension = os.path.splitext(urlparse(src).path)[1]
            img_data = requests.get(src).content
            with open(os.path.join(folder, str(i) + extension), 'wb') as handler:
                handler.write(img_data)
        except (requests.RequestException, OSError, ValueError) as e:
            # Best effort: report the failure and continue with the next image.
            print(e)
    print("All images saved.")


# Main
def main():
    """Download every image found on the example product page into ``images``."""
    url = "https://www.trendyol.com/aqua-di-polo-1987/unisex-kol-saati-apl12c195h02-p-3929108"
    images = get_all_images(url)
    save_all_images(images, 'images')


# Run only when executed as a script, so importing this module does not
# start network downloads.
if __name__ == "__main__":
    main()

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM