简体   繁体   中英

How to paginate through all the pages of a website using requests and BeautifulSoup in Python

I am trying to scrape the names and prices of the shower curtains listed on this site. The site has more than 200 pages, but this code only works for the first 100 pages; after that it scrapes the same 100 pages again.

import requests
from bs4 import BeautifulSoup
import re
import csv

# Template for a product image URL; "{}" is replaced with an image id
# extracted from the listing page's HTML.
site = "https://ih1.redbubble.net/image.{}/ur,shower_curtain_closed,square,600x600.1.jpg"

# Column headers of curtains.csv.
firstrow = ['No.', 'Name', 'Price', 'Image Url']

# Rows are appended to curtains.csv as UTF-8, so the header uses the same
# encoding. The file is opened in append mode to keep rows from earlier runs;
# the header is only written when the file is still empty, so repeated runs
# do not scatter duplicate header lines through the data.
with open('curtains.csv', 'a', newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    # In append mode the stream starts at end-of-file, so position 0 means
    # the file is new or empty.
    if csvFile.tell() == 0:
        writer.writerow(firstrow)


def main(url, first_page=1, last_page=204, timeout=30):
    """Scrape product names, prices and image URLs into ``curtains.csv``.

    For every page number from ``first_page`` to ``last_page`` (inclusive)
    the listing page is downloaded, the product ``<img>`` tags and the image
    ids found in the raw HTML are paired up positionally, and one CSV row
    ``[running number, name, price, image url]`` is appended per pair.

    Args:
        url: Listing URL containing a ``{}`` placeholder for the page number.
        first_page: First page number to request.
        last_page: Last page number to request (inclusive).
        timeout: Seconds to wait for each HTTP response before giving up.

    Raises:
        requests.exceptions.RequestException: If a request fails at the
            network level or times out.
    """
    # Compiled once instead of on every page.
    image_id_pattern = re.compile(r'\.(\d+\.\d{4})')
    count = 0

    # One session and one open file for the whole run: connections are
    # reused and the CSV is not reopened for every single row.
    with requests.Session() as session, \
            open('curtains.csv', 'a', newline='', encoding='utf-8') as csvFile:
        writer = csv.writer(csvFile)

        for page in range(first_page, last_page + 1):
            final_url = url.format(page)
            print('\n', '*'*10, 'Scraping Page # {}'.format(page), '*'*10)
            print('Link # {}'.format(final_url))

            r = session.get(final_url, timeout=timeout)
            if not r.ok:
                # An error page contains no products; report it and move on
                # instead of parsing it silently.
                print('Skipping page # {} (HTTP {})'.format(page, r.status_code))
                continue

            soup = BeautifulSoup(r.content, 'html.parser')
            target = soup.select("img[class*=styles__rounded--1lyoH]")
            # dict.fromkeys de-duplicates the ids while keeping first-seen order.
            goal = list(dict.fromkeys(image_id_pattern.findall(r.text)))

            # NOTE(review): names and image ids are matched purely by position;
            # confirm the two sequences really line up on the live page.
            for tar, go in zip(target, goal):
                count += 1

                name = tar.get('alt', '')
                # The price is read from the 4th <span> after the image;
                # limit=4 stops the search there instead of walking the rest
                # of the document for every product.
                spans = tar.find_all_next('span', limit=4)
                price = spans[3].text if len(spans) > 3 else ''
                img = site.format(go)

                print('*'*20, count, '*'*20)
                print('Name: {}'.format(name))
                print('Price: {}'.format(price))
                print('Image Url: {}'.format(img))

                writer.writerow([count, name, price, img])


# Start the HTML scrape; main() fills the "{}" placeholder with each page number.
main(url="https://www.redbubble.com/shop/shower-curtains?page={}")
import requests
import csv


# JSON body of the GraphQL "withSearchResults" request that main() posts.
# variables.queryParams.page selects the result page; main() sets it for
# every request.
data = {
    "operationName": "withSearchResults",
    # The query is a single string. Adjacent string literals are concatenated
    # by the parser, so it is written as one literal per GraphQL definition
    # (the operation followed by its six fragments).
    "query": (
        "query withSearchResults($query: String!, $queryParams: QueryParams, $locale: String!, $country: String!, $currency: String!, $previewTypeIds: [String!], $experience: String) {\n  searchResults(query: $query, queryParams: $queryParams, locale: $locale, country: $country, currency: $currency, previewTypeIds: $previewTypeIds, experience: $experience) {\n    ...Results\n    ...TrendingResults\n    ...Metadata\n    ...Filters\n    ...Pagination\n    ...LandingPage\n    __typename\n  }\n}\n\n"
        "fragment Results on SearchResults {\n  results {\n    inventoryItem(locale: $locale, country: $country, currency: $currency, previewTypeIds: $previewTypeIds) {\n      id\n      description\n      productTypeId\n      productPageUrl\n      blankItemId\n      price {\n        id\n        amount\n        currency\n        __typename\n      }\n      previewSet {\n        id\n        previews {\n          previewTypeId\n          url\n          __typename\n        }\n        __typename\n      }\n      gaCode\n      gaCategory\n      attributes {\n        name\n        value\n        attributes {\n          name\n          value\n          __typename\n        }\n        __typename\n      }\n      volumeDiscount {\n        id\n        thresholds {\n          percentOff\n          quantity\n          __typename\n        }\n        __typename\n      }\n      experiencesProductCard {\n        name\n        value\n        __typename\n      }\n      __typename\n    }\n    work(locale: $locale) {\n      id\n      title\n      artistName\n      isMatureContent\n      tags\n      __typename\n    }\n    defaultPreviewTypeId\n    groupId\n    rank\n    __typename\n  }\n  __typename\n}\n\n"
        "fragment TrendingResults on SearchResults {\n  trendingResults {\n    inventoryItem(locale: $locale, country: $country, currency: $currency, previewTypeIds: $previewTypeIds) {\n      id\n      description\n      productPageUrl\n      productTypeId\n      price {\n        id\n        amount\n        currency\n        __typename\n      }\n      previewSet {\n        id\n        previews {\n          previewTypeId\n          url\n          __typename\n        }\n        __typename\n      }\n      volumeDiscount {\n        id\n        thresholds {\n          percentOff\n          quantity\n          __typename\n        }\n        __typename\n      }\n      gaCode\n      gaCategory\n      attributes {\n        name\n        value\n        attributes {\n          name\n          value\n          __typename\n        }\n        __typename\n      }\n      experiencesProductCard {\n        name\n        value\n        __typename\n      }\n      __typename\n    }\n    work(locale: $locale) {\n      id\n      title\n      artistName\n      isMatureContent\n      tags\n      __typename\n    }\n    defaultPreviewTypeId\n    rank\n    __typename\n  }\n  __typename\n}\n\n"
        "fragment Metadata on SearchResults {\n  metadata {\n    title\n    searchContext {\n      category\n      __typename\n    }\n    resultCount\n    topic\n    searchBar {\n      iaCode\n      pillLabel\n      keywords\n      __typename\n    }\n    __typename\n  }\n  __typename\n}\n\n"
        "fragment Filters on SearchResults {\n  filters {\n    resetUrl\n    staticFilters {\n      type\n      label\n      options {\n        name\n        label\n        applied\n        url\n        options {\n          name\n          label\n          applied\n          url\n          __typename\n        }\n        __typename\n      }\n      __typename\n    }\n    filters {\n      type\n      label\n      experiences {\n        name\n        value\n        __typename\n      }\n      options {\n        name\n        label\n        applied\n        disabled\n        url\n        hexColor\n        imageUrl\n        __typename\n      }\n      __typename\n    }\n    appliedCount\n    appliedPath\n    resets {\n      label\n      url\n      __typename\n    }\n    __typename\n  }\n  __typename\n}\n\n"
        "fragment Pagination on SearchResults {\n  pagination {\n    currentPage\n    perPage\n    showPreviousPageLink\n    showNextPageLink\n    paginationLinks {\n      namedLinks {\n        previousPage {\n          rel\n          url\n          __typename\n        }\n        nextPage {\n          rel\n          url\n          __typename\n        }\n        __typename\n      }\n      __typename\n    }\n    fromNumber\n    toNumber\n    total\n    __typename\n  }\n  __typename\n}\n\n"
        "fragment LandingPage on SearchResults {\n  metadata {\n    formattedQuery\n    landingPage {\n      hero {\n        pitch\n        title\n        image\n        color\n        __typename\n      }\n      bubbles {\n        title\n        items {\n          title\n          image\n          realisticImage\n          url\n          isExternal\n          __typename\n        }\n        hasImages\n        __typename\n      }\n      seoMetadata {\n        pageDescription\n        robots\n        canonicalURL\n        searchTitle\n        seoImage\n        alternatePageVersions {\n          href\n          locale\n          __typename\n        }\n        relatedTagLinks {\n          title\n          href\n          text\n          __typename\n        }\n        __typename\n      }\n      footer {\n        text\n        breadcrumbs {\n          name\n          url\n          __typename\n        }\n        __typename\n      }\n      __typename\n    }\n    relatedTopics {\n      title\n      url\n      __typename\n    }\n    relatedProducts {\n      id\n      url\n      productTitle\n      fullTitle\n      __typename\n    }\n    searchPageType\n    resultCount\n    searchUUID\n    __typename\n  }\n  __typename\n}\n"
    ),
    "variables": {"country": "EG", "currency": "USD", "experience": "srp", "locale": "en",
                  "previewTypeIds": ["product_close", "alternate_product_close", "artwork"],
                  "query": "shower-curtains", "queryParams": {"page": 0}
                  }
}


def main(url, first_page=1, last_page=10, timeout=30):
    """Fetch search-result pages via GraphQL and write them to ``data.csv``.

    For every page number from ``first_page`` to ``last_page`` (inclusive)
    the module-level ``data`` payload is posted to ``url`` with
    ``variables.queryParams.page`` set to that page, and one CSV row
    ``[title, price amount, first preview url]`` is written per result.
    ``data.csv`` is overwritten on each call.

    Args:
        url: GraphQL endpoint to post the payload to.
        first_page: First page number to request.
        last_page: Last page number to request (inclusive).
        timeout: Seconds to wait for each HTTP response before giving up.

    Raises:
        requests.exceptions.RequestException: If a request fails, times out,
            or returns an HTTP error status.
    """
    with requests.Session() as req:
        with open("data.csv", 'w', newline="", encoding="UTF-8") as f:
            writer = csv.writer(f)
            writer.writerow(["Name", "Price", "IMG"])
            for page in range(first_page, last_page + 1):
                print(f"Extracting Page# {page}")

                # Build a per-request payload instead of mutating the shared
                # module-level template.
                query_params = {**data['variables']['queryParams'], 'page': page}
                variables = {**data['variables'], 'queryParams': query_params}
                payload = {**data, 'variables': variables}

                response = req.post(url, json=payload, timeout=timeout)
                # Fail with a clear HTTP error rather than a JSON/KeyError
                # further down when the server rejects the request.
                response.raise_for_status()

                results = response.json()['data']['searchResults']['results']
                for result in results:
                    inventory = result['inventoryItem']
                    previews = inventory['previewSet']['previews']
                    writer.writerow([
                        result['work']['title'],
                        inventory['price']['amount'],
                        # Leave the image column empty when no preview exists.
                        previews[0]['url'] if previews else '',
                    ])


# Start the GraphQL scrape; main() posts the request payload to this URL.
main(url="https://www.redbubble.com/boom/graphql")

Output: view-online

Sample:

在此处输入图像描述

The technical posts on this site are published under the CC BY-SA 4.0 license. If you reprint them, please cite the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM