I am crawling images with Python's Selenium WebDriver (Chrome).
Can I use multiple drivers and have each driver crawl a share of the images?
I want to do something like the following, using several drivers at once:
def crawl(searchText):
    """Collect JPEG image URLs from a Google Images search for *searchText*.

    Opens a Chrome window on the image-search results page, clicks the
    first 20 result thumbnails one at a time and, after each click, scans
    the soup returned by ``create_soup()`` for ``<img>`` tags whose ``src``
    starts with ``http`` and ends with ``jpg``.

    Args:
        searchText: Query text; it is substituted into the search URL as-is.

    Returns:
        list: Matching ``src`` values in the order they were found. The
        page is rescanned after every click, so a URL can appear more
        than once.

    Raises:
        Whatever the WebDriver raises (for example when fewer than 20
        thumbnails exist). The browser is shut down in that case too.
    """
    driver = webdriver.Chrome('C:\\Users\\HYOWON\\Desktop\\Desktop\\Graduation\\Code\\Crawling\\chromedriver.exe')
    try:
        searchUrl = "https://www.google.com/search?q={}&site=webhp&tbm=isch".format(searchText)
        driver.get(searchUrl)
        imgs_urls = []  # collected image URLs
        for j in range(20):
            element = driver.find_element_by_css_selector("div[data-ri = '" + str(j) + "'] img")
            element.click()
            sleep(1)  # pause so the clicked image has time to load
            # NOTE(review): create_soup is defined elsewhere and is called
            # without the driver here - confirm how it reaches this browser.
            soup = create_soup()
            for img in soup.find_all('img'):
                # <img> tags without a src attribute are skipped explicitly
                # instead of being hidden behind a bare except.
                src = img.get('src')
                if src and src.startswith('http') and src.endswith('jpg'):
                    imgs_urls.append(src)
        return imgs_urls
    finally:
        # quit() ends the whole session even when a step above failed, so
        # no browser or chromedriver process is left running.
        driver.quit()
def _jpg_urls(soup):
    """Return the ``src`` of each ``<img>`` in *soup* that starts with http and ends with jpg."""
    found = []
    for img in soup.find_all('img'):
        src = img.get('src')  # None when the tag has no src attribute
        if src and src.startswith('http') and src.endswith('jpg'):
            found.append(src)
    return found


def crawl():
    """Collect JPEG image URLs using three already-opened browsers.

    Relies on the module-level drivers ``driver1``, ``driver2`` and
    ``driver3``. In each of 50 rounds every driver clicks a different
    result thumbnail (indices ``cnt``, ``cnt + 1``, ``cnt + 2``), then each
    driver's page is parsed with ``create_soup(driver)`` and scanned for
    matching ``<img>`` sources.

    Returns:
        list: Matching ``src`` values in the order found; duplicates are
        possible because every page is rescanned each round.
    """
    from time import sleep  # local import keeps this block self-contained

    drivers = (driver1, driver2, driver3)
    imgs_urls = []
    cnt = 0  # index of the first thumbnail handled in the current round
    for _ in range(50):
        # Give each browser its own thumbnail; the counter advances by the
        # number of drivers, so no index is skipped or clicked three times.
        for offset, driver in enumerate(drivers):
            selector = "div[data-ri = '" + str(cnt + offset) + "'] img"
            driver.find_element_by_css_selector(selector).click()

        # WebDriverWait(driver, 1) without .until() never pauses, so wait
        # explicitly. One shared pause covers all three browsers.
        sleep(1)

        for driver in drivers:
            imgs_urls.extend(_jpg_urls(create_soup(driver)))
        cnt += len(drivers)
    return imgs_urls
def download_img(url, filename):
    """Fetch *url* and store it as ``C:/Python/<filename>.jpg``.

    *filename* is passed through ``str()``, so non-string names such as
    integers are accepted.
    """
    target_path = ''.join(('C:/Python/', str(filename), ".jpg"))
    urllib.request.urlretrieve(url, target_path)
# Save every collected URL under its own sequential name (0.jpg, 1.jpg, ...)
# so that later downloads do not overwrite earlier ones.
for index, url in enumerate(crawl()):
    download_img(url, index)
Indeed you can! I've been thinking about using a multi-driver solution for a current project I'm working on as well.
In this example I'm just declaring the driver objects separately, though personally I would put them into some kind of list so they are easier to reference and you can iterate over them. Naturally this would make the structure of your code a bit different, though you shouldn't run into too many issues here.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Start pages: initialPage() loads baseURL_1 in the first browser and
# baseURL_2 in the second.
baseURL_1 = "http://www.stackoverflow.com/"
baseURL_2 = "http://www.google.com/"
def main():
    """Start both browsers, then send each one to its start page."""
    init()
    initialPage()
def init():
    """Create the two Chrome drivers and publish them as globals.

    Both browsers share one options object whose ``prefs`` set
    ``credentials_enable_service`` and ``profile.password_manager_enabled``
    to ``False``. The drivers are stored in the module-level names
    ``drv1`` and ``drv2``.
    """
    global drv1, drv2

    driver_path = "C:\\path_to_chrome\\chromedriver.exe"
    prefs = {
        'credentials_enable_service': False,
        'profile': {
            'password_manager_enabled': False,
        },
    }
    chrome_opts = webdriver.ChromeOptions()
    chrome_opts.add_experimental_option('prefs', prefs)

    drv1 = webdriver.Chrome(driver_path, chrome_options=chrome_opts)
    drv2 = webdriver.Chrome(driver_path, chrome_options=chrome_opts)
def initialPage():
    """Open baseURL_1 in browser 1 and baseURL_2 in browser 2, in that order."""
    for slot, start_url in ((1, baseURL_1), (2, baseURL_2)):
        navigate(start_url, slot)
def navigate(URL, d):
    """Load *URL* in the browser selected by *d*.

    Args:
        URL: Address to open.
        d: ``1`` selects ``drv1`` and ``2`` selects ``drv2``; any other
            value does nothing.
    """
    if d == 1:
        drv1.get(URL)
    elif d == 2:
        drv2.get(URL)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
The technical posts on this site are licensed under CC BY-SA 4.0. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.