[英]How to change path to webdriver_manager to custom path in the cloud function environment
I'm trying to build a headless web scraper on Cloud Functions. I use Selenium to automate the browser through the driver downloaded by webdriver_manager.我正在尝试在 Cloud Functions 上构建一个无头网页爬虫。我使用 Selenium,通过 webdriver_manager 下载的驱动程序来自动化浏览器。
Can you please tell me how to change the wdm.cachePath according to virtual environment?你能告诉我如何根据虚拟环境更改 wdm.cachePath 吗? Below is my code and the error I'm getting.下面是我的代码和我得到的错误。
import os
import logging
# selenium 4
# webdriver_manager-related settings, exported in one step ahead of the
# selenium / webdriver_manager imports that follow.
# NOTE(review): GH_TOKEN looks like a placeholder value; a real token should
# not be hard-coded in source -- confirm.
os.environ.update({
    'GH_TOKEN': "gkjkjhjkhjhkjhuihjhgjhg",
    'WDM_LOG': str(logging.NOTSET),
    'WDM_LOCAL': '1',
    'WDM_SSL_VERIFY': '0',
})
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
def hello_world(request):
    """Start headless Chrome through webdriver_manager and visit a fixed page.

    Args:
        request (flask.Request): HTTP request object.

    Returns:
        'Success!' when the request's JSON body contains a 'url' key,
        otherwise 'Not run'.
    """
    # Headless Chrome configuration.
    chrome_options = Options()
    print("options")
    chrome_options.headless = True

    # NOTE(review): this relative, Windows-style cache path is the setting the
    # question is about; presumably it is what the makedirs calls in the
    # reported traceback are trying to create -- confirm.
    manager = ChromeDriverManager(
        "2.26", cache_valid_range=1, path=r".\\temp\\Drivers"
    )
    chrome_service = ChromeService(manager.install())
    driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
    print("driver was initiated")

    payload = request.get_json()
    if not payload or 'url' not in payload:
        return 'Not run'

    # The submitted URL is read but the page visited below is fixed.
    url = payload['url']
    driver.get('https://www.geeksforgeeks.org')
    print(driver.title)
    driver.close()
    return 'Success!'
Error logs -错误日志 -
Traceback (most recent call last):
  File "/layers/google.python.pip/pip/lib/python3.10/site-packages/flask/app.py", line 2525, in wsgi_app
    response = self.full_dispatch_request()
  File "/layers/google.python.pip/pip/lib/python3.10/site-packages/flask/app.py", line 1822, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/layers/google.python.pip/pip/lib/python3.10/site-packages/flask/app.py", line 1820, in full_dispatch_request
    rv = self.dispatch_request()
  File "/layers/google.python.pip/pip/lib/python3.10/site-packages/flask/app.py", line 1796, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
  File "/layers/google.python.pip/pip/lib/python3.10/site-packages/functions_framework/__init__.py", line 98, in view_func
    return function(request._get_current_object())
  File "/workspace/main.py", line 28, in hello_world
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager("2.26", cache_valid_range=1, path = r".\\temp\\Drivers").install()
  File "/layers/google.python.pip/pip/lib/python3.10/site-packages/webdriver_manager/chrome.py", line 39, in install
    driver_path = self._get_driver_path(self.driver)
  File "/layers/google.python.pip/pip/lib/python3.10/site-packages/webdriver_manager/core/manager.py", line 31, in _get_driver_path
    binary_path = self.driver_cache.save_file_to_cache(driver, file)
  File "/layers/google.python.pip/pip/lib/python3.10/site-packages/webdriver_manager/core/driver_cache.py", line 45, in save_file_to_cache
    archive = save_file(file, path)
  File "/layers/google.python.pip/pip/lib/python3.10/site-packages/webdriver_manager/core/utils.py", line 38, in save_file
    os.makedirs(directory, exist_ok=True)
  File "/layers/google.python.runtime/python/lib/python3.10/os.py", line 215, in makedirs
    makedirs(head, exist_ok=exist_ok)
  File "/layers/google.python.runtime/python/lib/python3.10/os.py", line 215, in makedirs
    makedirs(head, exist_ok=exist_ok)
  File "/layers/google.python.runtime/python/lib/python3.10/os.py", line 215, in makedirs
    makedirs(head, exist_ok=exist_ok)
I think the error occurs because webdriver_manager tries to save the driver to its cache at some static path, even though I already changed the path setting using:我认为出现这个错误是因为 webdriver_manager 试图把驱动程序保存到某个固定路径下的缓存中,尽管我已经通过以下方式更改了路径设置:
path = r".\\temp\\Drivers"
How to do it correctly?如何正确地做到这一点?
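So I figured this out. The working version below no longer passes a pinned driver version or a custom `path` to ChromeDriverManager, and it no longer sets `WDM_LOCAL`.所以我想通了。下面可以正常运行的版本不再向 ChromeDriverManager 传入固定的驱动版本和自定义 `path`,也不再设置 `WDM_LOCAL`。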
import os
import logging
# selenium 4
# Export the webdriver_manager log level (logging.NOTSET, as a string) ahead
# of the webdriver_manager import further down.
os.environ.update(WDM_LOG=str(logging.NOTSET))
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver import Chrome
def hello_world(request):
    """Scrape review texts from the requested page with headless Chrome.

    The page is opened, its review pane is scrolled repeatedly so more
    reviews get loaded, and the text of every review element found in the
    resulting page source is printed.

    Args:
        request (flask.Request): HTTP request object. Its JSON body is
            expected to carry the page to scrape under the ``url`` key.

    Returns:
        str: ``'Success!'`` once the reviews have been collected and printed,
        or ``'Not run'`` when the JSON body is missing or has no ``url`` key.

    Raises:
        Exception: errors raised while driving the browser or while parsing
            the review count propagate to the caller; the browser is shut
            down first.
    """
    # Validate the request first so Chrome is never launched for a request
    # that is going to be ignored anyway.
    request_json = request.get_json()
    if not request_json or 'url' not in request_json:
        return 'Not run'
    url = request_json['url']

    # Element locations on the target page; the review counter lives inside
    # the scrollable review pane.
    scroll_pane_xpath = (
        '//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]'
    )
    review_count_xpath = scroll_pane_xpath + '/div[2]/div/div[2]/div[2]'
    max_scrolls = 500

    opts = Options()
    opts.add_experimental_option("detach", True)
    # Run without a UI. Passed as a plain Chrome argument because the
    # ``headless`` attribute setter is not available in every Selenium 4
    # release.
    opts.add_argument("--headless")

    driver = webdriver.Chrome(
        service=ChromeService(
            ChromeDriverManager(cache_valid_range=1).install()
        ),
        options=opts,
    )
    print("driver was initiated")

    try:
        driver.get(url)
        # Fixed pause to give the page content time to load.
        time.sleep(3)

        scrollable_div = driver.find_element(By.XPATH, scroll_pane_xpath)

        # The counter text starts with the number of reviews, possibly with
        # thousands separators (e.g. "1,234 ...").
        count_text = driver.find_element(By.XPATH, review_count_xpath).text
        total_reviews = int(count_text.split(' ')[0].replace(',', ''))
        time.sleep(3)
        print(total_reviews)

        # Scroll the review pane to its bottom repeatedly so further reviews
        # are loaded; the number of scrolls is capped.
        # NOTE(review): at 1.5 s per scroll the cap allows ~12.5 minutes of
        # scrolling -- check this against the function's timeout.
        for _ in range(min(total_reviews, max_scrolls)):
            driver.execute_script(
                'arguments[0].scrollTop = arguments[0].scrollHeight',
                scrollable_div,
            )
            time.sleep(1.5)

        response = BeautifulSoup(driver.page_source, 'html.parser')
        restaurant__reviews = [
            review.text
            for review in response.find_all('span', class_='wiI7pd')
        ]
        print(restaurant__reviews)
    finally:
        # quit() rather than close(): end the whole WebDriver session, on
        # failure as well as on success, so no browser is left running.
        driver.quit()

    return 'Success!'
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.