简体   繁体   中英

Python selenium web driver multiprocessing

I am crawling images with Python's selenium web driver(Chrome)

Can I use multiple drivers and have each driver crawl images?

I want to do the following using multiprocessing

source code

def crawl(searchText):
    """Search Google Images for *searchText*, click through the first 20
    thumbnails, and collect the .jpg image URLs found in the preview pane.

    Returns a list of URL strings. The driver is always closed, even if a
    lookup or click raises mid-loop (the original leaked the browser there).
    """
    driver = webdriver.Chrome('C:\\Users\\HYOWON\\Desktop\\Desktop\\Graduation\\Code\\Crawling\\chromedriver.exe')

    searchUrl = "https://www.google.com/search?q={}&site=webhp&tbm=isch".format(searchText)
    driver.get(searchUrl)

    imgs_urls = []  # collected image URLs
    try:
        for j in range(20):
            # data-ri is the 0-based rank Google assigns each thumbnail.
            # The original used `cnt + j` with cnt fixed at 0, so it is just j.
            element = driver.find_element_by_css_selector(
                "div[data-ri = '" + str(j) + "'] img")
            element.click()
            sleep(1)  # give the preview pane time to load

            soup = create_soup()

            for img in soup.find_all('img'):
                # Only keep direct http(s)-hosted .jpg sources; <img> tags
                # without a src attribute are skipped instead of being
                # swallowed by a bare except.
                src = img.get('src')
                if src and src.startswith('http') and src.endswith('jpg'):
                    imgs_urls.append(src)
    finally:
        driver.close()
    return imgs_urls

Modification code

 def crawl():
    imgs_urls = []
    for j in range(50):
      element1 = driver1.find_element_by_css_selector("div[data-ri = '" + str(cnt) + "'] img")
      element2 = driver2.find_element_by_css_selector("div[data-ri = '" + str(cnt) + "'] img")
      element3 = driver3.find_element_by_css_selector("div[data-ri = '" + str(cnt) + "'] img")

      element1.click()
      WebDriverWait(driver1, 1)
      soup1 = create_soup(driver1)

      for img in soup1.find_all('img'):
          try:
              if img['src'].startswith('http') and img['src'].endswith('jpg'):  # http로 시작 jpg로 끝나는것만
                imgs_urls.append(img['src'])
          except:  # 예외 pass
              pass

      element2.click()
      WebDriverWait(driver2, 1)
      soup2 = create_soup(driver2)

      for img in soup2.find_all('img'):
          try:
              if img['src'].startswith('http') and img['src'].endswith('jpg'):
                imgs_urls.append(img['src'])
          except:  # 예외 pass
              pass

      element3.click()
      WebDriverWait(driver3, 1)
      soup3 = create_soup(driver3)


      for img in soup3.find_all('img'):
          try:
              if img['src'].startswith('http') and img['src'].endswith('jpg'):
                imgs_urls.append(img['src'])
          except:  # 예외 pass
              pass

      cnt += 3

  return (imgs_urls)

def download_img(url, filename):
    """Fetch *url* and store it as C:/Python/<filename>.jpg."""
    target = 'C:/Python/' + (str(filename) + ".jpg")
    urllib.request.urlretrieve(url, target)

# Download every crawled image, numbering the files 0.jpg, 1.jpg, ...
# (the original passed an undefined name `filename` here — NameError).
for i, url in enumerate(crawl()):
  download_img(url, i)

Indeed you can! I've been thinking about using a multi-driver solution for a current project I'm working on as well.

In this example I'm just declaring the driver objects separately, though personally I would want to put them into some kind of array to reference them easier so you can iterate through them. Naturally this would make the structure of your code a bit different, though you shouldn't run into too many issues here.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Start pages loaded into the two drivers by initialPage().
baseURL_1 = "http://www.stackoverflow.com/"
baseURL_2 = "http://www.google.com/"

def main():
    """Entry point: start both drivers, then load each start page."""
    init()
    initialPage()

def init():
    """Create the two global Chrome drivers with the password manager
    and credentials service disabled via Chrome preferences."""
    global drv1
    global drv2

    chromedrvPath = "C:\\path_to_chrome\\chromedriver.exe"
    # Use the Options class already imported at the top of the file
    # (the original built webdriver.ChromeOptions() and ignored the import;
    # the two are the same class, so behaviour is unchanged).
    opt = Options()
    opt.add_experimental_option('prefs', {
        'credentials_enable_service': False,
        'profile': {
            'password_manager_enabled': False
        }
    })
    # `chrome_options=` has been deprecated since Selenium 3.8 in favour of
    # the `options=` keyword, which behaves identically.
    drv1 = webdriver.Chrome(chromedrvPath, options=opt)
    drv2 = webdriver.Chrome(chromedrvPath, options=opt)

def initialPage():
    """Point each driver at its start URL, in driver order."""
    for url, driver_no in ((baseURL_1, 1), (baseURL_2, 2)):
        navigate(url, driver_no)

def navigate(URL, d):
    """Load *URL* in the driver selected by *d* (1 -> drv1, 2 -> drv2).

    Any other value of d is silently ignored, matching the original
    behaviour. The two conditions are mutually exclusive, so `elif`
    replaces the second independent `if`.
    """
    if d == 1:
        drv1.get(URL)
    elif d == 2:
        drv2.get(URL)

# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM