使用 BeautifulSoup 抓取 Google 搜索

Question

I want to scrape multiple pages of Google search results. 我想抓取 Google 搜索结果的多个页面。 So far I have only managed to scrape the first page; how can I do the same for multiple pages? 到目前为止，我只能抓取第一页，怎样才能抓取多个页面？

from bs4 import BeautifulSoup
import requests
import urllib.request
import re
from collections import Counter

def search(query):
    """Print word frequencies from the first Google results page for *query*.

    Fetches the results page, collects the text of the snippet
    (``span.st``) and title (``h3.r``) elements, removes every character
    that is not an ASCII letter or a space, and prints each remaining word
    with its count, most frequent first (ties in alphabetical order).

    Args:
        query: Raw search terms. They are URL-encoded before being sent,
            so spaces and reserved characters such as ``&`` or ``#`` are
            safe to pass.

    Returns:
        None. The result is written to standard output.
    """
    url = "http://www.google.com/search"

    # Let requests build the query string: concatenating the raw query into
    # the URL breaks as soon as it contains "&", "#", "+" and similar.
    response = requests.get(url, params={"q": query})
    soup = BeautifulSoup(response.text, "html.parser")

    snippets = [node.text for node in soup.find_all("span", {"class": "st"})]
    titles = [node.text for node in soup.find_all("h3", attrs={"class": "r"})]

    # Keep only ASCII letters and spaces so punctuation and digits do not
    # end up glued to words in the count.
    cleaned = [re.sub("[^A-Za-z ]", "", fragment) for fragment in snippets + titles]
    counts = Counter(" ".join(cleaned).split())

    # Highest count first; equal counts are ordered alphabetically.
    ranked = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
    for word, count in ranked:
        print(word, " : ", count)


# Example run: print the word counts for the query "girl".
search("girl")

Answer 1

# URL for results page `page`: `start` is set to (page - 1) * 10, i.e. 0 for
# page 1, 10 for page 2, and so on (presumably a result offset with 10 results
# per page — confirm). `query` and `page` must already be defined.
url = "http://www.google.com/search?q=" + query + "&start=" + str((page - 1) * 10)

Answer 2

As the comment above says, you need the next page's URL, and the code should go inside a loop. 正如上面的评论所说，您需要下一页的 URL，并将代码放入循环中。

# Paginated sketch of search(): handle one results page per loop iteration and
# keep following the next-page link until there is none.
def search(query):
    url = "https://www.google.com/search?hl=en&q=" + query
    while url:
        text = []
        # The two "...." lines are elisions, not valid Python. Presumably they
        # stand for the fetch/parse/count code, which has to define the
        # `soup` and `keyword_Count` names used below — confirm.
        ....
        ....
        for x,y in keyword_Count.items():
            print(x ," : ",y)

        # Look up the next-page link: the <a> element whose id is "pnnext".
        url = soup.find('a', id='pnnext')
        if url:
            # NOTE(review): if the href already starts with "/", this produces
            # a double slash after the host — confirm, or join with urljoin.
            url = 'https://www.google.com/' + url['href']
        else:
            # No next-page link found: report it and leave the loop.
            print('no next page, loop ended')
            break

To make soup.find('a', id='pnnext') work, you may need to set a User-Agent header on the request. 要使 soup.find('a', id='pnnext') 正常工作，您可能需要为请求设置 User-Agent 请求头。

Answer 3

The code below does actual pagination by following the "Next" button link. 下面的代码通过“下一页”按钮链接进行实际分页。

from bs4 import BeautifulSoup
import requests, urllib.parse
import lxml

def print_extracted_data_from_url(url):
    """Fetch one Google results page, print its results, return the "Next" link.

    Prints the current page number (when the page indicator is present),
    the URL, and for every result container its title, snippet and link.
    A piece that is missing from a result is printed as an empty line
    instead of raising, so one irregular result does not abort the crawl.

    Args:
        url: URL of the results page to fetch.

    Returns:
        The ``<a id="pnnext">`` tag of the "Next" link, or ``None`` when
        the page contains no such element.
    """
    headers = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
    }
    html = requests.get(url, headers=headers).text

    soup = BeautifulSoup(html, 'lxml')

    # select_one() returns None when nothing matches; only report the page
    # number when the indicator element is actually there.
    page_node = soup.select_one(".YyVfkd")
    if page_node is not None:
        print(f'Current page: {int(page_node.text)}')
    print(f'Current URL: {url}')
    print()

    for container in soup.find_all('div', class_='tF2Cxc'):
        # find() and attribute access also return None on a miss, so each
        # part is looked up first and dereferenced only if it exists.
        title_node = container.find('h3', class_='LC20lb DKV0Md')
        summary_node = container.find('div', class_='IsZvec')
        link_node = container.a

        print(title_node.text if title_node is not None else '')
        print(summary_node.text if summary_node is not None else '')
        print(link_node.get('href', '') if link_node is not None else '')
        print()

    return soup.select_one('a#pnnext')


def scrape():
    """Crawl Google results for "coca cola", following "Next" links to the end.

    Every page is fetched and printed by print_extracted_data_from_url,
    whose return value is the next-page anchor, or None once there is no
    further page.
    """
    site_root = 'https://www.google.com'
    node = print_extracted_data_from_url(
        'https://www.google.com/search?hl=en-US&q=coca cola')

    while node is not None:
        # Resolve the anchor's href against the site root to get a full URL.
        target = urllib.parse.urljoin(site_root, node['href'])
        node = print_extracted_data_from_url(target)

# Run the crawl when the script is executed.
scrape()

Part of the output:部分输出：

Results via beautifulsoup

Current page: 1
Current URL: https://www.google.com/search?hl=en-US&q=coca cola

The Coca-Cola Company: Refresh the World. Make a Difference
We are here to refresh the world and make a difference. Learn more about the Coca-Cola Company, our brands, and how we strive to do business the right way.‎Careers · ‎Contact Us · ‎Jobs at Coca-Cola · ‎Our Company
https://www.coca-colacompany.com/home

Coca-Cola
2021 The Coca-Cola Company, all rights reserved. COCA-COLA®, "TASTE THE FEELING", and the Contour Bottle are trademarks of The Coca-Cola Company.
https://www.coca-cola.com/

Alternatively, you can do this with the Google Search Engine Results API from SerpApi. 或者，您可以使用 SerpApi 提供的 Google Search Engine Results API 来实现。 It's a paid API with a free trial of 5,000 searches. 这是一个付费 API，可免费试用 5,000 次搜索。

Code to integrate:集成代码：

import os
from serpapi import GoogleSearch

def scrape():
    """Print title and link of every organic "coca cola" result, page by page.

    Queries the SerpApi Google engine with the key taken from the API_KEY
    environment variable. After each page is printed, the next one is
    requested for as long as the response's pagination data has a 'next'
    entry.
    """
    search = GoogleSearch({
        "engine": "google",
        "q": "coca cola",
        "api_key": os.getenv("API_KEY"),
    })

    while True:
        results = search.get_dict()

        print(f"Current page: {results['serpapi_pagination']['current']}")
        for result in results["organic_results"]:
            print(f"Title: {result['title']}\nLink: {result['link']}\n")

        if 'next' not in results['serpapi_pagination']:
            break

        # Move the result offset forward: current page number times ten.
        search.params_dict["start"] = results['serpapi_pagination']['current'] * 10

# Run the SerpApi-based crawl when the script is executed.
scrape()

Part of the output:部分输出：

Results from SerpApi

Current page: 1
Current URL: https://www.google.com/search?hl=en-US&q=coca cola

The Coca-Cola Company: Refresh the World. Make a Difference
We are here to refresh the world and make a difference. Learn more about the Coca-Cola Company, our brands, and how we strive to do business the right way.‎Careers · ‎Contact Us · ‎Jobs at Coca-Cola · ‎Our Company
https://www.coca-colacompany.com/home

Coca-Cola
2021 The Coca-Cola Company, all rights reserved. COCA-COLA®, "TASTE THE FEELING", and the Contour Bottle are trademarks of The Coca-Cola Company.
https://www.coca-cola.com/

Disclaimer: I work for SerpApi. 免责声明：我在 SerpApi 工作。

使用 BeautifulSoup 抓取 Google 搜索

问题描述

3 个解决方案

解决方案1
2 2018-11-16 10:02:22

解决方案2
0 2018-11-16 01:07:38

解决方案3
0 2021-05-26 05:57:23

使用 BeautifulSoup 抓取 Google 搜索

问题描述

3 个解决方案

解决方案1 2 2018-11-16 10:02:22

解决方案2 0 2018-11-16 01:07:38

解决方案3 0 2021-05-26 05:57:23

解决方案1
2 2018-11-16 10:02:22

解决方案2
0 2018-11-16 01:07:38

解决方案3
0 2021-05-26 05:57:23