简体   繁体   中英

How to scrape multiple urls in one Python script with Beautiful soup

I have the following Python script that works well for what I need and gives me the output I want. However, I have another url ( https://www.website2/page- ) that I'd like to add to the script. Currently I manually swap the urls and run them as separate scripts but I'd like to do it in one go, is this possible?

Ps - required script for each site is identical other than the url property. TIA!

import itertools
import random
import time
import typing
import signal

import requests
from bs4 import BeautifulSoup

from model import Model, Data

RUNNING = True


def sigint_handler(*args: typing.Any) -> None:
    global RUNNING
    print("Signal received, exiting gracefully ...")
    RUNNING = False


def scrape(url: str, model: Model, session: requests.Session, headers: typing.Dict[str, str]) -> None:
    for page in itertools.count(1):
        if not RUNNING:
            break
        req = session.get(f"{url}{page}", headers=headers)
        soup = BeautifulSoup(req.content, 'html.parser')

        for li in soup.find_all('li', class_="container"):
            title = li.find('h2').text
            price = li.find('p', class_="price-text").text
            print(f"Title: {title}, Price: {price}")
            model.insert_or_update(Data(address=title, price=price))

        time.sleep(random.randint(1, 5))


def run() -> None:
    url = "https://www.website1/page-"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
    }
    model, session = Model(), requests.Session()
    scrape(url, model, session, headers)


if __name__ == '__main__':
    signal.signal(signal.SIGINT, sigint_handler)
    run()

if you want it to parse another page add a comma

## Added Comma meaning 2 urls can be parsed for efficiency as you wouldn't want to rewrite your code again
import itertools
import random
import time
import typing
import signal
import requests
from bs4 import BeautifulSoup
from model import Model, Data
RUNNING = True
def sigint_handler(*args: typing.Any) -> None:
    global RUNNING
    print("Signal received, exiting gracefully ...")
    RUNNING = False


def scrape(url: str, model: Model, session: requests.Session, headers: typing.Dict[str, str]) -> None:
    for page in itertools.count(1):
        if not RUNNING:
            break
        req = session.get(f"{url}{page}", headers=headers)
        soup = BeautifulSoup(req.content, 'html.parser')

        for li in soup.find_all('li', class_="container"):
            title = li.find('h2').text
            price = li.find('p', class_="price-text").text
            print(f"Title: {title}, Price: {price}")
            model.insert_or_update(Data(address=title, price=price))

        time.sleep(random.randint(1, 5))


def run() -> None:
    url = "https://www.website1/page- , https://www.website1/page- "


    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
    }
    model, session = Model(), requests.Session()
    scrape(url, model, session, headers)


if __name__ == '__main__':
    signal.signal(signal.SIGINT, sigint_handler)
    run()

I think you just need an array and for loop:

def run() -> None:
    urls = ["https://www.website1/page-", "https://www.website2/page-"]
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) 
        AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
    }
    model, session = Model(), requests.Session()
    for url in urls:
        scrape(url, model, session, headers)

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM