简体   繁体   中英

How can I scrape data from the JavaScript-rendered content of a website?

Specifically, I am trying to fetch the Product Description content from the Nykaa website.

Shop more Nykaa Cosmetics products here.You can browse through the complete world of Nykaa Cosmetics Foundation . Alternatively, you can also find many more products from the Nykaa SkinShield Anti-Pollution Matte Foundation range.

It means the content is loading dynamically with the help of Javascript.

And this is what I have tried.

 It would be a big help.

You can do something like this

from msilib.schema import Error
from tkinter import ON
from turtle import goto
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from random import randint
import pandas as pd
import requests
import csv

# NOTE(review): passing the driver path positionally is deprecated in
# Selenium 4 — prefer webdriver.Chrome(service=Service(executable_path)).
browser = webdriver.Chrome(
    r'C:\Users\paart\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe')


browser.maximize_window()  # For maximizing window
browser.implicitly_wait(20)  # implicit wait of 20 seconds applies to every element lookup

browser.get(
    "https://www.nykaa.com/nykaa-skinshield-matte-foundation/p/460512?productId=460512&pps=1&skuId=460502")


# Click the "load more" button so the full product description is rendered.
# find_element_by_xpath() was removed in Selenium 4; use find_element(By.XPATH, ...).
loadMore = browser.find_element(
    By.XPATH, "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")
loadMore.click()

# The description lives in a single container with id="content-details".
# Fetch the container once and read each <p> child relative to it instead of
# looping over a one-element list and re-querying the whole document.
content = browser.find_element(By.ID, 'content-details')
para_details = content.find_element(By.XPATH, './p[1]').text
expiry = content.find_element(By.XPATH, './p[2]').text
country = content.find_element(By.XPATH, './p[3]').text
importer = content.find_element(By.XPATH, './p[4]').text
address = content.find_element(By.XPATH, './p[5]').text
# Bug fix: the original fetched `expiry` but never printed it.
print(para_details, expiry, country, importer, address)

For desc_data you were looking for a class name with that string; there isn't one on the page — it is actually an id attribute with that string.

Inside the for loop you had inserted a bunch of XPaths into find_elements_by_xpath(), which takes only one XPath expression per call.

try

from msilib.schema import Error
from tkinter import ON
from turtle import goto
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from random import randint
import pandas as pd
import requests
import csv

# NOTE(review): passing the driver path positionally is deprecated in
# Selenium 4 — prefer webdriver.Chrome(service=Service(executable_path)).
browser = webdriver.Chrome(
    r'C:\Users\paart\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe')

browser.maximize_window()  # For maximizing window
browser.implicitly_wait(20)  # implicit wait of 20 seconds applies to every element lookup

browser.get(
    "https://www.nykaa.com/nykaa-skinshield-matte-foundation/p/460512?productId=460512&pps=1&skuId=460502")

# "load more" button, located by its class rather than a brittle absolute XPath.
# find_element_by_xpath() was removed in Selenium 4; use find_element(By.XPATH, ...).
loadMore = browser.find_element(By.XPATH, '//div [@class="css-mqbsar"]')
loadMore.click()

# Every <p> element under the id="content-details" container.
# find_elements() returns a (possibly empty) list, so the loop below is safe
# even when nothing matches.
desc_data = browser.find_elements(By.XPATH, '//div[@id="content-details"]/p')

# The previous code located a single element by class name, which is not iterable;
# this XPath locates each <p> under the id="content-details" attribute instead.

for desc in desc_data:
    para_detail = desc.text
    print(para_detail)

# To pick out specific paragraphs instead, index the list:
#  para_detail = desc_data[0].text
#  expiry_date = desc_data[1].text

# if you you want to specify try this
#  para_detail = desc_data[0].text
#  expiry_ date = desc_data[1].text


Also, don't just copy the XPath from the Chrome dev tools — it's not reliable for dynamic content.

You are getting this error because that element was not loaded yet when you executed the click function. I use these two functions to locate elements:

def find_until_located(eltype, name, timeout=60):
    """Block until the element located by (eltype, name) is present in the DOM.

    eltype: a Selenium ``By`` strategy (e.g. ``By.XPATH``); name: the locator
    string. timeout: seconds to wait before ``TimeoutException`` is raised
    (default 60, matching the original hard-coded value).
    NOTE(review): relies on a module-level ``driver`` — confirm the caller
    defines it before these helpers are used.
    """
    element = WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((eltype, name)))
    return element


def find_until_clicklable(eltype, name, timeout=60):
    """Block until the element located by (eltype, name) is clickable.

    Same contract as ``find_until_located``, but waits for clickability
    (visible and enabled), which is what a subsequent ``click()`` needs.
    """
    element = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((eltype, name)))
    return element

Final answer to this question.

from msilib.schema import Error
from tkinter import ON
from turtle import goto
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from random import randint
import pandas as pd
import requests
import csv

# NOTE(review): passing the driver path positionally is deprecated in
# Selenium 4 — prefer webdriver.Chrome(service=Service(executable_path)).
browser = webdriver.Chrome(
    r'C:\Users\paart\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe')


browser.maximize_window()  # For maximizing window
browser.implicitly_wait(20)  # implicit wait of 20 seconds applies to every element lookup

browser.get(
    "https://www.nykaa.com/kay-beauty-hydrating-foundation/p/1229442?productId=1229442&pps=3&skuId=772975")

# Zoom out and back in so lazily rendered sections of the page are drawn.
browser.execute_script("document.body.style.zoom='50%'")
browser.execute_script("document.body.style.zoom='100%'")


# Click the "load more" button so the full product description appears.
loadMore = browser.find_element(By.XPATH,
                                "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")
loadMore.click()

# id="content-details" identifies a single container. The original iterated a
# one-element find_elements() list while ignoring the loop variable and
# re-queried the same document-level XPaths each pass; fetch the container once
# and read the <p> children relative to it instead.
content = browser.find_element(By.ID, 'content-details')
para_details = content.find_element(By.XPATH, './p[1]').text
expiry = content.find_element(By.XPATH, './p[2]').text
country = content.find_element(By.XPATH, './p[3]').text
importer = content.find_element(By.XPATH, './p[4]').text
address = content.find_element(By.XPATH, './p[5]').text

print(f"{para_details} \n")
print(f"{expiry} \n")
print(f"{country} \n")
print(f"{importer} \n")
print(f"{address} \n")

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM