简体   繁体   中英

Python scrape google search results

I am trying to scrape all the data of the Google search results — title, URL and description. However, I can't grab the description of the search results; it returns an empty string.

# check Chrome version: Menue (the three dots - upper right corner -> Help -> About Google Chrome)
# download ChromeDriver according to the Chrome version (example version 79)
# download from https://sites.google.com/a/chromium.org/chromedriver/downloads
# place the chromedriver.exe file in the current working directory
# pip install selenium

from selenium import webdriver
from bs4 import BeautifulSoup
import time
from bs4.element import Tag
import pandas as pd
import random


# Read the keyword list; one Google query is issued per keyword.
keywords = pd.read_csv('keywords.csv', header=0, index_col=None)

rows = []  # collected result records; turned into a DataFrame once at the end
for i in keywords['keyword']:

    # Scraper that gives back: titles, links, descriptions
    driver = webdriver.Chrome()
    try:
        google_url = "https://www.google.com/search?gl=US&q=" + i + "&num=" + str(10)
        driver.get(google_url)
        # Randomized delay to reduce the chance of being rate-limited/blocked.
        time.sleep(random.randrange(15, 50))
        page_source = driver.page_source
    finally:
        # Always release the browser, even if the request fails mid-loop.
        driver.quit()

    soup = BeautifulSoup(page_source, 'lxml')
    result_div = soup.find_all('div', attrs={'class': 'g'})

    for result in result_div:
        # Checks if each element is present, else, raise exception
        try:
            link = result.find('a', href=True)

            title = result.find('h3')
            if isinstance(title, Tag):
                title = title.get_text()

            # The old 'span.st' class no longer exists in Google's markup
            # (which is why descriptions came back empty); the snippet text
            # now lives in '.aCOpRe' spans (excluding the '.f' date span).
            description = ' '.join(
                span.get_text() for span in result.select('.aCOpRe span:not(.f)')
            )

            # Only keep results where every field was actually found.
            # (Comparing Tags/None against '' never filtered anything and
            # allowed link['href'] to raise on a missing anchor.)
            if link is not None and title and description:
                rows.append({'keyword': i,
                             'title': title,
                             'url': link['href'],
                             'description': description})
        # Next loop if one element is not present
        except Exception as e:
            print(e)
            continue

# DataFrame.append was removed in pandas 2.0; build the frame in one shot.
df = pd.DataFrame(rows, columns=['keyword', 'title', 'url', 'description'])

df.to_csv(r'final_dataset.csv', index=False)


Does anyone have an idea how to grab the description in the Google search results?

Get the description node with the following code.

description = r.select('.aCOpRe span:not(.f)')

Also, you can use requests instead of selenium. The full example is in an online IDE.

from requests import Session
from bs4 import BeautifulSoup
from bs4.element import Tag
import pandas as pd

# Read the keyword list; one Google query is issued per keyword.
keywords = pd.read_csv('keywords.csv', header=0, index_col=None)

# A plain-browser User-Agent so Google serves the normal HTML layout.
# Hoisted out of the loop: it never changes between requests.
headers = {
    "User-Agent":
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 Edg/80.0.361.62"
}

rows = []  # collected result records; turned into a DataFrame once at the end

for keyword in keywords['keyword']:
    # Scraper that gives back: titles, links, descriptions

    params = {"q": keyword, 'gl': 'US', 'num': 10}

    with Session() as session:
        # Named 'response' so it is not shadowed by the result loop below
        # (the original reused 'r' for both, clobbering the response object).
        response = session.get(
            "https://google.com/search", params=params, headers=headers)

    soup = BeautifulSoup(response.content, 'lxml')

    for result in soup.find_all('div', attrs={'class': 'g'}):
        # Checks if each element is present, else, raise exception
        try:
            link = result.find('a', href=True)

            title = result.find('h3')
            if isinstance(title, Tag):
                title = title.get_text()

            # select() returns a list of Tags (a ResultSet), never a single
            # Tag, so the original isinstance(description, Tag) check never
            # fired and raw Tag lists were appended. Join the span texts here.
            description = ' '.join(
                span.get_text() for span in result.select('.aCOpRe span:not(.f)')
            )

            # Only keep results where every field was actually found.
            if link is not None and title and description:
                rows.append({
                    'keyword': keyword,
                    'title': title,
                    'url': link['href'],
                    'description': description
                })
        # Next loop if one element is not present
        except Exception as e:
            print(e)
            continue

# DataFrame.append was removed in pandas 2.0; build the frame in one shot.
df = pd.DataFrame(rows, columns=['keyword', 'title', 'url', 'description'])

df.to_csv(r'final_dataset.csv', index=False)

Alternatively, you can extract data from Google Search via SerpApi .


Disclaimer: I work at SerpApi.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM