How to scrape multiple urls in one Python script with Beautiful Soup
I have the following Python script that works well for what I need and gives me the output I want. However, I have another url (https://www.website2/page-) that I'd like to add to the script. Currently I manually swap the urls and run them as separate scripts, but I'd like to do it in one go. Is this possible?

PS: the required script for each site is identical other than the url property. TIA!
import itertools
import random
import time
import typing
import signal

import requests
from bs4 import BeautifulSoup

from model import Model, Data

RUNNING = True

def sigint_handler(*args: typing.Any) -> None:
    global RUNNING
    print("Signal received, exiting gracefully ...")
    RUNNING = False

def scrape(url: str, model: Model, session: requests.Session, headers: typing.Dict[str, str]) -> None:
    for page in itertools.count(1):
        if not RUNNING:
            break
        req = session.get(f"{url}{page}", headers=headers)
        soup = BeautifulSoup(req.content, 'html.parser')
        for li in soup.find_all('li', class_="container"):
            title = li.find('h2').text
            price = li.find('p', class_="price-text").text
            print(f"Title: {title}, Price: {price}")
            model.insert_or_update(Data(address=title, price=price))
        time.sleep(random.randint(1, 5))

def run() -> None:
    url = "https://www.website1/page-"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
    }
    model, session = Model(), requests.Session()
    scrape(url, model, session, headers)

if __name__ == '__main__':
    signal.signal(signal.SIGINT, sigint_handler)
    run()
If you want it to parse another site, you can keep both base urls in one comma-separated string and split it before scraping, so you don't have to rewrite the script for each site:
import itertools
import random
import time
import typing
import signal

import requests
from bs4 import BeautifulSoup

from model import Model, Data

RUNNING = True

def sigint_handler(*args: typing.Any) -> None:
    global RUNNING
    print("Signal received, exiting gracefully ...")
    RUNNING = False

def scrape(url: str, model: Model, session: requests.Session, headers: typing.Dict[str, str]) -> None:
    for page in itertools.count(1):
        if not RUNNING:
            break
        req = session.get(f"{url}{page}", headers=headers)
        soup = BeautifulSoup(req.content, 'html.parser')
        for li in soup.find_all('li', class_="container"):
            title = li.find('h2').text
            price = li.find('p', class_="price-text").text
            print(f"Title: {title}, Price: {price}")
            model.insert_or_update(Data(address=title, price=price))
        time.sleep(random.randint(1, 5))

def run() -> None:
    # Both base urls live in one comma-separated string; split it into
    # individual urls before scraping each one.
    urls = "https://www.website1/page-,https://www.website2/page-"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
    }
    model, session = Model(), requests.Session()
    for url in urls.split(","):
        scrape(url.strip(), model, session, headers)

if __name__ == '__main__':
    signal.signal(signal.SIGINT, sigint_handler)
    run()
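For reference, turning a comma-separated string of base urls into a clean list is a one-liner with str.split plus str.strip (the website names here are placeholders taken from the question):

```python
# Split a comma-separated string of base urls into a list,
# stripping any stray whitespace around each entry.
url = "https://www.website1/page-, https://www.website2/page-"
urls = [u.strip() for u in url.split(",")]
print(urls)  # ['https://www.website1/page-', 'https://www.website2/page-']
```

Each entry can then be passed to scrape() exactly like the single url was before.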
I think you just need an array and a for loop:
def run() -> None:
    urls = ["https://www.website1/page-", "https://www.website2/page-"]
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
    }
    model, session = Model(), requests.Session()
    for url in urls:
        scrape(url, model, session, headers)
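One related point: itertools.count(1) in scrape() only stops on Ctrl-C, so each site is paged forever. A common refinement is to break out when a page returns no listings. Here is a minimal, self-contained sketch of that idea; fetch_page and the FAKE_PAGES data are stand-ins for the real session.get + BeautifulSoup parsing, not part of the original script:

```python
import itertools

# Stand-in page data: each inner list plays the role of what
# soup.find_all('li', class_="container") would return for that page.
# An empty list marks the page after the last real one.
FAKE_PAGES = {
    "https://www.website1/page-": [["item1", "item2"], ["item3"], []],
    "https://www.website2/page-": [["item4"], []],
}

def fetch_page(base_url, page):
    """Stand-in for session.get(f"{base_url}{page}") plus parsing."""
    pages = FAKE_PAGES[base_url]
    return pages[page - 1] if page <= len(pages) else []

def scrape_all(base_urls):
    results = []
    for base_url in base_urls:
        for page in itertools.count(1):
            items = fetch_page(base_url, page)
            if not items:  # page with no listings: assume pagination is done
                break
            results.extend(items)
    return results

print(scrape_all(list(FAKE_PAGES)))  # ['item1', 'item2', 'item3', 'item4']
```

In the real script the same check would be `if not soup.find_all('li', class_="container"): break` just before the inner for loop, so the scraper moves on to the next site once a site runs out of pages.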