[英]How can I scrape data from the JavaScript-rendered content of a website?
Actually, I am trying to fetch the content from the Product Description from the Nykaa Website.<\/strong><\/em>实际上,我正在尝试从 Nykaa 网站的产品描述中获取内容。<\/strong><\/em>
URL:- https:\/\/www.nykaa.com\/nykaa-skinshield-matte-foundation\/p\/460512?productId=460512&pps=1&skuId=460502<\/a>网址:- https:\/\/www.nykaa.com\/nykaa-skinshield-matte-foundation\/p\/460512?productId=460512&pps=1&skuId=460502<\/a>
The Text<\/strong> which, I want to extract<\/strong> is :我要提取<\/strong>的文本<\/strong>是:
Expiry Date: 15 February 2024到期日:2024 年 2 月 15 日
Country of Origin: India原产国:印度
<\/blockquote>from msilib.schema import Error from tkinter import ON from turtle import goto import time from bs4 import BeautifulSoup from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC import numpy as np from random import randint import pandas as pd import requests import csv browser = webdriver.Chrome( r'C:\\Users\\paart\\.wdm\\drivers\\chromedriver\\win32\\97.0.4692.71\\chromedriver.exe') browser.maximize_window() # For maximizing window browser.implicitly_wait(20) # gives an implicit wait for 20 seconds browser.get( "https:\/\/www.nykaa.com\/nykaa-skinshield-matte-foundation\/p\/460512?productId=460512&pps=1&skuId=460502") # Creates "load more" button object. browser.implicitly_wait(20) loadMore = browser.find_element_by_xpath(xpath="\/html\/body\/div[1]\/div\/div[3]\/div[1]\/div[2]\/div\/div\/div[2]") loadMore.click() browser.implicitly_wait(20) desc_data = browser.find_elements_by_class_name('content-details') for desc in desc_data: para_details = browser.find_element_by_xpath( '.\/\/*[@id="content-details"]\/p[1]').text extra_details = browser.find_elements_by_xpath( '.\/\/*[@id="content-details"]\/p[2]', '.\/\/*[@id="content-details"]\/p[3]', '.\/\/*[@id="content-details"]\/p[4]', '.\/\/*[@id="content-details"]\/p[5]').text print(para_details, extra_details)<\/code><\/pre>
And this, is the output which is displaying.这是正在显示的输出。
PS E:\\Web Scraping - Nykaa> python -u "e:\\Web Scraping - Nykaa\\scrape_nykaa_final.py" e:\\Web Scraping - Nykaa\\scrape_nykaa_final.py:16: DeprecationWarning: executable_path has been deprecated, please pass in a Service object browser = webdriver.Chrome( DevTools listening on ws:\/\/127.0.0.1:1033\/devtools\/browser\/097c0e11-6f2c-4742-a2b5-cd05bee72661 e:\\Web Scraping - Nykaa\\scrape_nykaa_final.py:28: DeprecationWarning: find_element_by_* commands are deprecated. Please use find_element() instead loadMore = browser.find_element_by_xpath( [9312:4972:0206\/110327.883:ERROR:ssl_client_socket_impl.cc(996)] handshake failed; returned -1, SSL error code 1, net_error -101 [9312:4972:0206\/110328.019:ERROR:ssl_client_socket_impl.cc(996)] handshake failed; returned -1, SSL error code 1, net_error -101 Traceback (most recent call last): File "e:\\Web Scraping - Nykaa\\scrape_nykaa_final.py", line 28, in <module> loadMore = browser.find_element_by_xpath( File "C:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py", line 520, in find_element_by_xpath return self.find_element(by=By.XPATH, value=xpath) File "C:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py", line 1244, in find_element return self.execute(Command.FIND_ELEMENT, { File "C:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py", line 424, in execute self.error_handler.check_response(response) File "C:\\Python310\\lib\\site-packages\\selenium\\webdriver\\remote\\errorhandler.py", line 247, in check_response raise exception_class(message, screen, stacktrace) selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"\/html\/body\/div[1]\/div\/div[3]\/div[1]\/div[2]\/div\/div\/div[2]"} (Session info: chrome=97.0.4692.99) Stacktrace: Backtrace: Ordinal0 [0x00FDFDC3+2555331] Ordinal0 [0x00F777F1+2127857] Ordinal0 [0x00E72E08+1060360] Ordinal0 [0x00E9E49E+1238174] Ordinal0 
[0x00E9E69B+1238683] Ordinal0 [0x00EC9252+1413714] Ordinal0 [0x00EB7B54+1342292] Ordinal0 [0x00EC75FA+1406458] Ordinal0 [0x00EB7976+1341814] Ordinal0 [0x00E936B6+1193654] Ordinal0 [0x00E94546+1197382] GetHandleVerifier [0x01179622+1619522] GetHandleVerifier [0x0122882C+2336844] GetHandleVerifier [0x010723E1+541697] GetHandleVerifier [0x01071443+537699] Ordinal0 [0x00F7D18E+2150798] Ordinal0 [0x00F81518+2168088] Ordinal0 [0x00F81660+2168416] Ordinal0 [0x00F8B330+2208560] BaseThreadInitThunk [0x76C9FA29+25] RtlGetAppContainerNamedObjectPath [0x77337A9E+286] RtlGetAppContainerNamedObjectPath [0x77337A6E+238]<\/code><\/pre>
Please, anyone help me getting this issue resolved, or any another specific piece of the code to write, which I am missing to fetch the text content from Product description<\/strong> .请任何人帮助我解决这个问题,或者任何其他要编写的特定代码,我缺少从 Product description 获取文本内容<\/strong>。 It would be a big help.这将是一个很大的帮助。
Thanks 🙏🏻.谢谢🙏🏻。
"
You can do something like this你可以做这样的事情
from msilib.schema import Error
from tkinter import ON
from turtle import goto
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from random import randint
import pandas as pd
import requests
import csv
# Launch Chrome through a Service object — passing the driver path positionally
# (executable_path) is deprecated in Selenium 4 and removed later.
from selenium.webdriver.chrome.service import Service

browser = webdriver.Chrome(
    service=Service(r'C:\Users\paart\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe'))
browser.maximize_window()  # For maximizing window
browser.implicitly_wait(20)  # gives an implicit wait for 20 seconds
browser.get(
    "https://www.nykaa.com/nykaa-skinshield-matte-foundation/p/460512?productId=460512&pps=1&skuId=460502")

# Creates "load more" button object and waits until it is actually clickable —
# clicking before the dynamic content finishes loading raised NoSuchElementException.
# NOTE(review): an absolute XPath copied from dev tools is brittle for dynamic
# pages; prefer an id/class-based locator if one is available.
loadMore = WebDriverWait(browser, 20).until(
    EC.element_to_be_clickable(
        (By.XPATH, "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")))
loadMore.click()

# 'content-details' is an id attribute on the page, not a class name,
# so it must be located by id. find_elements_by_* is deprecated/removed;
# use the find_elements(By.…) form instead.
desc_data = browser.find_elements(By.ID, 'content-details')
for desc in desc_data:
    # Search relative to the current container element instead of re-querying
    # the whole document for every field.
    para_details = desc.find_element(By.XPATH, './p[1]').text
    expiry = desc.find_element(By.XPATH, './p[2]').text
    country = desc.find_element(By.XPATH, './p[3]').text
    importer = desc.find_element(By.XPATH, './p[4]').text
    address = desc.find_element(By.XPATH, './p[5]').text
    # Bug fix: the original fetched 'expiry' but left it out of the print call.
    print(para_details, expiry, country, importer, address)
For desc_data you were looking for a class name with that string, but there isn't one on the page — it is actually an id attribute with that string. 对于 desc_data 您正在寻找带有该字符串的类名,当页面上没有类名时,它实际上是带有该字符串的 id 标记。
Inside the for loop you had passed a bunch of XPaths into find_elements_by_xpath(), which takes only one XPath expression per call. 在 for 循环中,您在 find_elements_by_xpath() 中插入了一堆 xpath,它只需要一个 xpath 到一个元素。
try尝试
from msilib.schema import Error
from tkinter import ON
from turtle import goto
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from random import randint
import pandas as pd
import requests
import csv
# Launch Chrome through a Service object — passing the driver path positionally
# (executable_path) is deprecated in Selenium 4 and removed later.
from selenium.webdriver.chrome.service import Service

browser = webdriver.Chrome(
    service=Service(r'C:\Users\paart\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe'))
browser.maximize_window()  # For maximizing window
browser.implicitly_wait(20)  # gives an implicit wait for 20 seconds
browser.get(
    "https://www.nykaa.com/nykaa-skinshield-matte-foundation/p/460512?productId=460512&pps=1&skuId=460502")

# Creates "load more" button object. find_element_by_xpath is deprecated/removed;
# use the find_element(By.…) form instead.
browser.implicitly_wait(20)
loadMore = browser.find_element(By.XPATH, '//div [@class="css-mqbsar"]')
loadMore.click()
browser.implicitly_wait(20)

# 'content-details' is an id on a single element, so iterating the element
# itself is pointless — locate every <p> child under it instead.
desc_data = browser.find_elements(By.XPATH, '//div[@id="content-details"]/p')
for desc in desc_data:
    para_detail = desc.text
    print(para_detail)

# To pick out a specific paragraph instead of printing them all:
# para_detail = desc_data[0].text
# expiry_date = desc_data[1].text
And don't just copy the XPath from the Chrome dev tools — it's not reliable for dynamic content. 并且不要只是从 chrome 开发工具中复制 XPath,它对于动态内容是不可靠的。
You are getting this error because that element has not finished loading when you execute the click function. 您收到此错误是因为在执行 click 功能时该元素未正确加载。 I use these two functions to locate elements: 我使用这两个函数来定位元素:
def find_until_located(eltype, name, timeout=60):
    """Block until the element located by (eltype, name) is present in the DOM.

    Parameters:
        eltype: a selenium By strategy (e.g. By.XPATH, By.ID).
        name: the locator value for that strategy.
        timeout: maximum seconds to wait (default 60, as before).

    Returns the located WebElement; raises selenium's TimeoutException if the
    element does not appear within `timeout` seconds.

    NOTE(review): this relies on a module-level `driver`, but the surrounding
    scripts name their WebDriver `browser` — confirm which global is intended.
    """
    element = WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((eltype, name)))
    return element
def find_until_clicklable(eltype, name, timeout=60):
    """Block until the element located by (eltype, name) is clickable.

    Parameters:
        eltype: a selenium By strategy (e.g. By.XPATH, By.ID).
        name: the locator value for that strategy.
        timeout: maximum seconds to wait (default 60, as before).

    Returns the located WebElement; raises selenium's TimeoutException if the
    element does not become clickable within `timeout` seconds.

    NOTE(review): this relies on a module-level `driver`, but the surrounding
    scripts name their WebDriver `browser` — confirm which global is intended.
    (Public name kept as-is, typo included, so existing callers still work.)
    """
    element = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((eltype, name)))
    return element
Final Answer - To this Question.<\/strong><\/em>最终答案 - 这个问题。<\/strong><\/em>
from msilib.schema import Error
from tkinter import ON
from turtle import goto
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import numpy as np
from random import randint
import pandas as pd
import requests
import csv
# Launch Chrome through a Service object — passing the driver path positionally
# (executable_path) is deprecated in Selenium 4 and removed later.
from selenium.webdriver.chrome.service import Service

browser = webdriver.Chrome(
    service=Service(r'C:\Users\paart\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe'))
browser.maximize_window()  # For maximizing window
browser.implicitly_wait(20)  # gives an implicit wait for 20 seconds
browser.get(
    "https://www.nykaa.com/kay-beauty-hydrating-foundation/p/1229442?productId=1229442&pps=3&skuId=772975")

# Zooming out and back in nudges the page's lazy-loaded description
# section to render before we try to interact with it.
browser.execute_script("document.body.style.zoom='50%'")
browser.execute_script("document.body.style.zoom='100%'")

# Creates "load more" button object, waiting until it is actually clickable —
# the original NoSuchElementException came from clicking before the dynamic
# content had loaded. NOTE(review): absolute XPaths from dev tools are brittle.
loadMore = WebDriverWait(browser, 20).until(
    EC.element_to_be_clickable(
        (By.XPATH, "/html/body/div[1]/div/div[3]/div[1]/div[2]/div/div/div[2]")))
loadMore.click()

# 'content-details' is an id, so this list holds at most one element.
desc_data = browser.find_elements(By.ID, 'content-details')
for desc in desc_data:
    # Search relative to the current container instead of re-querying the
    # entire document for every paragraph on every iteration.
    para_details = desc.find_element(By.XPATH, './p[1]').text
    expiry = desc.find_element(By.XPATH, './p[2]').text
    country = desc.find_element(By.XPATH, './p[3]').text
    importer = desc.find_element(By.XPATH, './p[4]').text
    address = desc.find_element(By.XPATH, './p[5]').text
    print(f"{para_details} \n")
    print(f"{expiry} \n")
    print(f"{country} \n")
    print(f"{importer} \n")
    print(f"{address} \n")
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.