如何使用 Python 中的 requests 和 BeautifulSoup 对网站的所有页面进行分页抓取

Question

So I was trying to scrape the names and prices of the shower curtains from this site. 所以我试图从这个网站上抓取浴帘的名称和价格。 The site has more than 200 pages, but this code works only for the first 100 pages and then it scrapes the same 100 pages again. 该站点有超过 200 个页面，但此代码仅适用于前 100 个页面，之后它会重复抓取相同的 100 个页面。

import requests
from bs4 import BeautifulSoup
import re
import csv

# URL template for a product preview image; the ``{}`` placeholder is filled
# with the numeric image id matched in the listing page's HTML.
site = "https://ih1.redbubble.net/image.{}/ur,shower_curtain_closed,square,600x600.1.jpg"

# Column headers of the output CSV.
firstrow = ['No.', 'Name', 'Price', 'Image Url']

# Append the header row. The encoding is stated explicitly so it matches the
# UTF-8 encoding used when the data rows are appended; the ``with`` block
# closes the file on exit, so no explicit close() call is needed.
with open('curtains.csv', 'a', newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(firstrow)


def main(url, first_page=1, last_page=204):
    """Scrape product names, prices and image URLs into ``curtains.csv``.

    For every page number from ``first_page`` to ``last_page`` (inclusive) the
    page at ``url.format(page)`` is downloaded, each matched product is
    printed, and one row ``[count, name, price, image_url]`` is appended to
    the CSV file.

    NOTE(review): the description accompanying this script reports that the
    site repeats earlier results for page numbers above 100 -- confirm before
    relying on the upper part of the range.

    Args:
        url: Listing URL template with one ``{}`` placeholder for the page
            number.
        first_page: First page number to request.
        last_page: Last page number to request (inclusive). The defaults
            reproduce the original fixed ``range(1, 205)``.
    """
    # Compiled once instead of being looked up again for every page.
    image_id = re.compile(r'\.(\d+\.\d{4})')
    count = 0

    # Open the output once for the whole run instead of re-opening it for
    # every row; the ``with`` block closes it, also when an error occurs.
    with open('curtains.csv', 'a', newline='', encoding='utf-8') as csvFile:
        writer = csv.writer(csvFile)
        for page in range(first_page, last_page + 1):
            final_url = url.format(page)
            print('\n', '*' * 10, 'Scraping Page # {}'.format(page), '*' * 10)
            print('Link # {}'.format(final_url))

            # A timeout keeps one stalled connection from hanging the run.
            r = requests.get(final_url, timeout=30)
            soup = BeautifulSoup(r.content, 'html.parser')
            target = soup.select("img[class*=styles__rounded--1lyoH]")
            # dict.fromkeys drops duplicate ids while keeping first-seen order.
            goal = list(dict.fromkeys(image_id.findall(r.text)))

            # zip stops at the shorter sequence, so surplus tags or ids on a
            # page are ignored.
            for tar, go in zip(target, goal):
                count += 1

                name = tar['alt']
                # The price is read from the fourth <span> following the image.
                price = tar.find_all_next('span')[3].text
                img = site.format(go)

                print('*' * 20, count, '*' * 20)
                print('Name: {}'.format(name))
                print('Price: {}'.format(price))
                print('Image Url: {}'.format(img))

                writer.writerow([count, name, price, img])


# Run the scraper only when this file is executed as a script, so that merely
# importing the module does not start network requests.
if __name__ == "__main__":
    main("https://www.redbubble.com/shop/shower-curtains?page={}")

Answer 1

import requests
import csv


# GraphQL request payload that is sent as the JSON body of the POST request.
# The "query" text is written as adjacent string literals (one per GraphQL
# definition) that Python concatenates into a single string; a single literal
# must not be broken across physical lines.
data = {
    "operationName": "withSearchResults",
    "query": (
        # Operation: pulls in every fragment defined below.
        "query withSearchResults($query: String!, $queryParams: QueryParams, $locale: String!, $country: String!, $currency: String!, $previewTypeIds: [String!], $experience: String) {\n  searchResults(query: $query, queryParams: $queryParams, locale: $locale, country: $country, currency: $currency, previewTypeIds: $previewTypeIds, experience: $experience) {\n    ...Results\n    ...TrendingResults\n    ...Metadata\n    ...Filters\n    ...Pagination\n    ...LandingPage\n    __typename\n  }\n}\n\n"
        # Fragment Results: the per-item fields (work title, price, previews).
        "fragment Results on SearchResults {\n  results {\n    inventoryItem(locale: $locale, country: $country, currency: $currency, previewTypeIds: $previewTypeIds) {\n      id\n      description\n      productTypeId\n      productPageUrl\n      blankItemId\n      price {\n        id\n        amount\n        currency\n        __typename\n      }\n      previewSet {\n        id\n        previews {\n          previewTypeId\n          url\n          __typename\n        }\n        __typename\n      }\n      gaCode\n      gaCategory\n      attributes {\n        name\n        value\n        attributes {\n          name\n          value\n          __typename\n        }\n        __typename\n      }\n      volumeDiscount {\n        id\n        thresholds {\n          percentOff\n          quantity\n          __typename\n        }\n        __typename\n      }\n      experiencesProductCard {\n        name\n        value\n        __typename\n      }\n      __typename\n    }\n    work(locale: $locale) {\n      id\n      title\n      artistName\n      isMatureContent\n      tags\n      __typename\n    }\n    defaultPreviewTypeId\n    groupId\n    rank\n    __typename\n  }\n  __typename\n}\n\n"
        # Fragment TrendingResults.
        "fragment TrendingResults on SearchResults {\n  trendingResults {\n    inventoryItem(locale: $locale, country: $country, currency: $currency, previewTypeIds: $previewTypeIds) {\n      id\n      description\n      productPageUrl\n      productTypeId\n      price {\n        id\n        amount\n        currency\n        __typename\n      }\n      previewSet {\n        id\n        previews {\n          previewTypeId\n          url\n          __typename\n        }\n        __typename\n      }\n      volumeDiscount {\n        id\n        thresholds {\n          percentOff\n          quantity\n          __typename\n        }\n        __typename\n      }\n      gaCode\n      gaCategory\n      attributes {\n        name\n        value\n        attributes {\n          name\n          value\n          __typename\n        }\n        __typename\n      }\n      experiencesProductCard {\n        name\n        value\n        __typename\n      }\n      __typename\n    }\n    work(locale: $locale) {\n      id\n      title\n      artistName\n      isMatureContent\n      tags\n      __typename\n    }\n    defaultPreviewTypeId\n    rank\n    __typename\n  }\n  __typename\n}\n\n"
        # Fragment Metadata.
        "fragment Metadata on SearchResults {\n  metadata {\n    title\n    searchContext {\n      category\n      __typename\n    }\n    resultCount\n    topic\n    searchBar {\n      iaCode\n      pillLabel\n      keywords\n      __typename\n    }\n    __typename\n  }\n  __typename\n}\n\n"
        # Fragment Filters.
        "fragment Filters on SearchResults {\n  filters {\n    resetUrl\n    staticFilters {\n      type\n      label\n      options {\n        name\n        label\n        applied\n        url\n        options {\n          name\n          label\n          applied\n          url\n          __typename\n        }\n        __typename\n      }\n      __typename\n    }\n    filters {\n      type\n      label\n      experiences {\n        name\n        value\n        __typename\n      }\n      options {\n        name\n        label\n        applied\n        disabled\n        url\n        hexColor\n        imageUrl\n        __typename\n      }\n      __typename\n    }\n    appliedCount\n    appliedPath\n    resets {\n      label\n      url\n      __typename\n    }\n    __typename\n  }\n  __typename\n}\n\n"
        # Fragment Pagination.
        "fragment Pagination on SearchResults {\n  pagination {\n    currentPage\n    perPage\n    showPreviousPageLink\n    showNextPageLink\n    paginationLinks {\n      namedLinks {\n        previousPage {\n          rel\n          url\n          __typename\n        }\n        nextPage {\n          rel\n          url\n          __typename\n        }\n        __typename\n      }\n      __typename\n    }\n    fromNumber\n    toNumber\n    total\n    __typename\n  }\n  __typename\n}\n\n"
        # Fragment LandingPage.
        "fragment LandingPage on SearchResults {\n  metadata {\n    formattedQuery\n    landingPage {\n      hero {\n        pitch\n        title\n        image\n        color\n        __typename\n      }\n      bubbles {\n        title\n        items {\n          title\n          image\n          realisticImage\n          url\n          isExternal\n          __typename\n        }\n        hasImages\n        __typename\n      }\n      seoMetadata {\n        pageDescription\n        robots\n        canonicalURL\n        searchTitle\n        seoImage\n        alternatePageVersions {\n          href\n          locale\n          __typename\n        }\n        relatedTagLinks {\n          title\n          href\n          text\n          __typename\n        }\n        __typename\n      }\n      footer {\n        text\n        breadcrumbs {\n          name\n          url\n          __typename\n        }\n        __typename\n      }\n      __typename\n    }\n    relatedTopics {\n      title\n      url\n      __typename\n    }\n    relatedProducts {\n      id\n      url\n      productTitle\n      fullTitle\n      __typename\n    }\n    searchPageType\n    resultCount\n    searchUUID\n    __typename\n  }\n  __typename\n}\n"
    ),
    # Values bound to the query's $variables; the "page" entry under
    # "queryParams" is the one that changes from request to request.
    "variables": {"country": "EG", "currency": "USD", "experience": "srp", "locale": "en",
                  "previewTypeIds": ["product_close", "alternate_product_close", "artwork"],
                  "query": "shower-curtains", "queryParams": {"page": 0}
                  }
}


def main(url, pages=range(1, 11)):
    """Fetch search-result pages from the endpoint and save them to ``data.csv``.

    For each page number the module-level ``data`` payload, with its
    ``queryParams.page`` set to that number, is POSTed as JSON to ``url``.
    One CSV row (work title, price amount, first preview URL) is written per
    entry in the response's ``data.searchResults.results`` list.

    Args:
        url: Endpoint that receives the JSON payload via POST.
        pages: Page numbers to request, in order. The default reproduces the
            original fixed pages 1 through 10.
    """
    import copy  # local import: only this function needs it

    # Work on a private copy so the module-level ``data`` template is not
    # modified as a side effect of running the scraper.
    payload = copy.deepcopy(data)

    with requests.Session() as req, \
            open("data.csv", 'w', newline="", encoding="UTF-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Name", "Price", "IMG"])
        for page in pages:
            print(f"Extracting Page# {page}")
            payload['variables']['queryParams']['page'] = page
            response = req.post(url, json=payload).json()
            # A distinct loop name: the original reused ``item`` here, which
            # shadowed the page counter of the outer loop.
            for result in response['data']['searchResults']['results']:
                inventory = result['inventoryItem']
                writer.writerow([
                    result['work']['title'],
                    inventory['price']['amount'],
                    inventory['previewSet']['previews'][0]['url'],
                ])


# Run the download only when this file is executed as a script, so that merely
# importing the module does not start network requests.
if __name__ == "__main__":
    main("https://www.redbubble.com/boom/graphql")

Output: view-online Output: 在线查看

Sample:样本：

如何使用请求和 python 中的 beautifulsoup 对网站的所有页面进行分页

问题描述

1 个解决方案

解决方案1
0 2020-04-27 21:52:44

如何使用请求和 python 中的 beautifulsoup 对网站的所有页面进行分页

问题描述

1 个解决方案

解决方案1 0 2020-04-27 21:52:44

解决方案1
0 2020-04-27 21:52:44