使用Python中的Selenium刮取Google圖像

Question

現在，我一直在嘗試使用以下代碼來刮取谷歌圖像：

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys 
import os
import time
import requests
import re
import urllib2
import re
from threading import Thread
import json
#Assuming I have a folder named Pictures1, the images are downloaded there. 
def threaded_func(url, i):
    """Download one image and save it into the ``Pictures1`` folder.

    The file is written as ``Pictures1/<image_type>_<i>``, where
    ``image_type`` is the module-level label set by the calling script.

    Args:
        url: Address of the image to fetch.
        i: Sequence number of the image, used as the filename suffix.

    Raises:
        urllib2.URLError: If the image cannot be fetched.
        IOError: If the target file cannot be opened, for example when the
            ``Pictures1`` folder does not exist.
    """
    raw_img = urllib2.urlopen(url).read()
    # Name the file after the index handed to this call instead of reading
    # the module-level ``total`` counter, so the name is tied to the image
    # this call was asked to download.  The unused file count that used to
    # sit here was dropped; under Python 2 its list comprehension also
    # overwrote ``i``.
    target = "Pictures1/" + image_type + "_" + str(i)
    # ``with`` closes the file even if the write fails.
    with open(target, 'wb') as f:
        f.write(raw_img)
# Open Google Images in Firefox.
driver = webdriver.Firefox()
driver.get("https://images.google.com/")
# Locate the search box through an absolute XPath, then type the query
# "parrot" and press Return to submit it.
elem = driver.find_element_by_xpath('/html/body/div/div[3]/div[3]/form/div[2]/div[2]/div[1]/div[1]/div[3]/div/div/div[2]/div/input[1]')
elem.clear()
elem.send_keys("parrot")
elem.send_keys(Keys.RETURN)
# Filename prefix; threaded_func reads this module-level name.
image_type = "parrot_defG"
images=[]
# Index of the current image; passed to threaded_func and also read by it
# as a module-level name.
total=0
# Fixed 10 second pause -- presumably to let the results page finish loading.
time.sleep(10)
# Each element with class "rg_meta" is expected to hold a JSON object whose
# "ou" entry is the source URL of one image.
for a in driver.find_elements_by_class_name('rg_meta'):
     # NOTE(review): this is the step the question reports failing with
     # "No JSON object could be decoded"; a.text apparently does not yield
     # the JSON here (the asker's later fix reads innerHTML instead).
     link =json.loads(a.text)["ou"]
     thread = Thread(target = threaded_func, args = (link,total))
     thread.start()
     # Joining right after start means each download finishes before the
     # next one begins.
     thread.join()
     total+=1

我嘗試使用Selenium打開谷歌的圖像結果頁面，然後注意到每個結果都有一個類為'rg_meta'的div，其內容是一段JSON代碼。

我試圖使用.text訪問它。 JSON的'ou'鍵保存著我想下載的圖像的來源地址。 我試圖獲取所有類為'rg_meta'的div並下載其中的圖像。 但是它顯示錯誤“No JSON object could be decoded”，我不知道該怎麼做。

編輯：這就是我所說的：

    <div class="rg_meta">{"cl":3,"id":"FqCGaup9noXlMM:","isu":"kids.britannica.com","itg":false,"ity":"jpg","oh":600,"ou":"http://media.web.britannica.com/eb-media/89/89689-004-4C85E0F0.jpg","ow":380,"pt":"grain weevil -- Kids Encyclopedia | Children\u0026#39;s Homework Help ...","rid":"EusB0pk_sLg7vM","ru":"http://kids.britannica.com/comptons/art-143712/grain-or-granary-weevil","s":"grain weevil","sc":1,"st":"Kids Britannica","th":282,"tu":"https://encrypted-tbn2.gstatic.com/images?q\u003dtbn:ANd9GcQPbgXbRVzOicvPfBRtAkLOpJwy_wDQEC6a2q0BuTsUx-s0-h4b","tw":179}</div>

檢查JSON的“ou”索引。 請幫我提取它。

請原諒我的無知。

這是我通過進行以下更新來解決它的方法：

    # Visit every rg_meta div, read its JSON payload from innerHTML, and
    # download the image named by the "ou" entry, one image at a time.
    for meta_div in driver.find_elements_by_xpath('//div[@class="rg_meta"]'):
        payload = json.loads(meta_div.get_attribute('innerHTML'))
        link = payload["ou"]
        print(link)
        worker = Thread(target=threaded_func, args=(link, total))
        worker.start()
        # Wait for this download before moving on to the next div.
        worker.join()
        total += 1

Answer 1

將：

driver.find_elements_by_class_name('rg_meta') 替換為 driver.find_element_by_xpath('//div[@class="rg_meta"]/text()')

並將 a.text 替換為 a，

即可解決您的問題。

結果代碼：

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys 
import os
import time
import requests
import re
import urllib2
import re
from threading import Thread
import json
#Assuming I have a folder named Pictures1, the images are downloaded there. 
def threaded_func(url, i):
    """Download one image and save it into the ``Pictures1`` folder.

    The file is written as ``Pictures1/<image_type>_<i>``, where
    ``image_type`` is the module-level label set by the calling script.

    Args:
        url: Address of the image to fetch.
        i: Sequence number of the image, used as the filename suffix.

    Raises:
        urllib2.URLError: If the image cannot be fetched.
        IOError: If the target file cannot be opened, for example when the
            ``Pictures1`` folder does not exist.
    """
    raw_img = urllib2.urlopen(url).read()
    # Name the file after the index handed to this call instead of reading
    # the module-level ``total`` counter, so the name is tied to the image
    # this call was asked to download.  The unused file count that used to
    # sit here was dropped; under Python 2 its list comprehension also
    # overwrote ``i``.
    target = "Pictures1/" + image_type + "_" + str(i)
    # ``with`` closes the file even if the write fails.
    with open(target, 'wb') as f:
        f.write(raw_img)
# Open Google Images in Firefox.
driver = webdriver.Firefox()
driver.get("https://images.google.com/")
# Locate the search box through an absolute XPath, then type the query
# "parrot" and press Return to submit it.
elem = driver.find_element_by_xpath('/html/body/div/div[3]/div[3]/form/div[2]/div[2]/div[1]/div[1]/div[3]/div/div/div[2]/div/input[1]')
elem.clear()
elem.send_keys("parrot")
elem.send_keys(Keys.RETURN)
# Filename prefix; threaded_func reads this module-level name.
image_type = "parrot_defG"
images=[]
# Index of the current image, handed to threaded_func for the filename.
total=0
# Fixed 10 second pause -- presumably to let the results page finish loading.
time.sleep(10)
# Selenium locators must resolve to elements, so an XPath ending in text()
# is rejected, and the singular find_element_* call returns a single element
# rather than something to loop over.  Select the rg_meta divs themselves
# and read each one's JSON payload from its innerHTML instead.
for a in driver.find_elements_by_xpath('//div[@class="rg_meta"]'):
    # "ou" holds the source URL of the image.
    link = json.loads(a.get_attribute('innerHTML'))["ou"]
    thread = Thread(target = threaded_func, args = (link,total))
    thread.start()
    # Wait for this download to finish before starting the next one, so
    # images are saved one at a time, in order.
    thread.join()
    total+=1

打印鏈接結果如下：

http://media.web.britannica.com/eb-media/89/89689-004-4C85E0F0.jpg

使用Python中的Selenium刮取Google圖像

問題描述

1 個解決方案

解決方案1
0 2017-03-19 23:23:10

使用Python中的Selenium刮取Google圖像

問題描述

1 個解決方案

解決方案1 0 2017-03-19 23:23:10

解決方案1
0 2017-03-19 23:23:10