
Python 2 BeautifulSoup returns blank output

This is the code I am using to download images from a Google Images results page. It takes a long time to evaluate and download the images, so I thought of using the BeautifulSoup library for faster evaluation and download. Here is the original code:

import time       
import sys    
import os
import urllib2



# main search terms; one folder is created per term
search_keyword = ['Australia']

# qualifiers appended to each search term
keywords = [' high resolution']


def download_page(url):
    import urllib2
    try:
        headers = {}
        # spoof a desktop browser User-Agent so Google serves the full results page
        headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
        req = urllib2.Request(url, headers = headers)
        response = urllib2.urlopen(req)
        page = response.read()
        return page
    except:
        return "Page Not found"



def _images_get_next_item(s):
    # each result is wrapped in an 'rg_di' div whose 'rg_meta' JSON blob
    # holds the original image URL in its "ou" field (followed by "ow")
    start_line = s.find('rg_di')
    if start_line == -1:    # no further results on this page
        end_quote = 0
        link = "no_links"
        return link, end_quote
    else:
        start_line = s.find('"class="rg_meta"')
        start_content = s.find('"ou"',start_line+1)
        end_content = s.find(',"ow"',start_content+1)
        content_raw = str(s[start_content+6:end_content-1])
        return content_raw, end_content



def _images_get_all_items(page):
    # repeatedly extract the next image URL, discarding the scanned prefix
    items = []
    while True:
        item, end_content = _images_get_next_item(page)
        if item == "no_links":
            break
        else:
            items.append(item)
            time.sleep(0.1)
            page = page[end_content:]
    return items



t0 = time.time()   # start the timer for the whole run


i= 0
while i<len(search_keyword):
    items = []
    iteration = "Item no.: " + str(i+1) + " -->" + " Item name = " + str(search_keyword[i])
    print (iteration)
    print ("Evaluating...")
    search_keywords = search_keyword[i]
    search = search_keywords.replace(' ','%20')


    try:
        os.makedirs(search_keywords)
    except OSError, e:
        if e.errno != 17:   # errno 17: the directory already exists
            raise

    j = 0
    while j<len(keywords):
        pure_keyword = keywords[j].replace(' ','%20')
        url = 'https://www.google.com/search?q=' + search + pure_keyword + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg'
        raw_html =  (download_page(url))
        time.sleep(0.1)
        items = items + (_images_get_all_items(raw_html))
        j = j + 1

    print ("Total Image Links = "+str(len(items)))
    print ("\n")



    info = open('output.txt', 'a')   # log the harvested links per keyword
    info.write(str(i) + ': ' + str(search_keyword[i]) + ": " + str(items) + "\n\n\n")
    info.close()

    t1 = time.time()    
    total_time = t1-t0   
    print("Total time taken: "+str(total_time)+" Seconds")
    print ("Starting Download...")




    k = 0
    errorCount = 0
    from urllib2 import Request, urlopen
    from urllib2 import URLError, HTTPError

    while k < len(items):
        try:
            req = Request(items[k], headers={"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"})
            response = urlopen(req, None, 15)
            output_file = open(search_keywords+"/"+str(k+1)+".jpg", 'wb')

            data = response.read()
            output_file.write(data)
            output_file.close()
            response.close()

            print("completed ====> "+str(k+1))
            k = k+1

        # HTTPError and URLError must be caught before IOError: in Python 2
        # both subclass IOError, so an IOError clause listed first would
        # swallow them
        except HTTPError as e:
            errorCount += 1
            print("HTTPError "+str(k))
            k = k+1

        except URLError as e:
            errorCount += 1
            print("URLError "+str(k))
            k = k+1

        except IOError:
            errorCount += 1
            print("IOError on image "+str(k+1))
            k = k+1

    i = i+1

print("\n")
print("Everything downloaded!")
print("\n"+str(errorCount)+" ----> total Errors")

I thought editing the function below would make the code work with the BeautifulSoup library, so that my task would complete faster:

def download_page(url):
    import urllib2
    try:
        headers = {}
        headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
        req = urllib2.Request(url, headers = headers)
        #response = urllib2.urlopen(req)
        #page = response.read()
        return BeautifulSoup(urlopen(Request(req)), 'html.parser')
    except:
        return "Page Not found"

But the above code returns blank output. Could you please let me know what I need to change so that the code works properly with BeautifulSoup?

You can't just pass Google a browser User-Agent header like that. The search engine is a lot more complex than simply substituting some keywords into a GET URL. (As an aside, the immediate reason your edited function yields nothing is that BeautifulSoup, urlopen and Request are never imported in its scope, and the bare except silently swallows the resulting NameError, so the function just returns "Page Not found".)

HTML is a markup language only useful for one-way rendering of human-readable information. For your application, you need machine-readable data rather than trying to decipher human-readable text. Google already has a very comprehensive API, https://developers.google.com/custom-search/, which is easy to use and a much better way of achieving this than BeautifulSoup.
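For example, the Custom Search JSON API returns image results as structured JSON, so there is nothing to scrape out of HTML. Below is a minimal sketch in Python 2 to match your code; it assumes you have already created an API key and a custom search engine ID in the Google developer console (YOUR_API_KEY and YOUR_CSE_ID are placeholders):

import json
import urllib
import urllib2

def google_image_search(query, api_key, cse_id, count=10):
    # build the Custom Search JSON API request; searchType=image restricts
    # results to images, and num is capped at 10 results per request
    params = urllib.urlencode({
        'key': api_key,
        'cx': cse_id,
        'q': query,
        'searchType': 'image',
        'num': count,
    })
    response = urllib2.urlopen('https://www.googleapis.com/customsearch/v1?' + params, None, 15)
    results = json.load(response)
    # each result item carries the direct image URL in its 'link' field
    return [item['link'] for item in results.get('items', [])]

for link in google_image_search('Australia high resolution', 'YOUR_API_KEY', 'YOUR_CSE_ID'):
    print(link)

The links this returns can be fed straight into your existing download loop; to fetch more than 10 results, page through them with the API's start parameter.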
