[英]How to paginate through all the pages of a website using requests and BeautifulSoup in Python
So I was trying to scrape the names and prices of the shower curtains from this site. 所以我试图从这个网站上搜集浴帘的名称和价格。 The site has more than 200 pages, but this code works only for the first 100 pages and then it scrapes the same 100 pages again.
该站点有超过 200 个页面,但此代码仅适用于前 100 个页面,然后它会再次重复抓取相同的 100 个页面。
import requests
from bs4 import BeautifulSoup
import re
import csv
# Template for a product image URL; the placeholder is filled with an image id
# via site.format(...).
site = "https://ih1.redbubble.net/image.{}/ur,shower_curtain_closed,square,600x600.1.jpg"

# Column headers of curtains.csv.
firstrow = ['No.', 'Name', 'Price', 'Image Url']

# Open in append mode so existing rows are kept, and use the same UTF-8
# encoding as the code that appends the data rows. The `with` block closes the
# file on exit, so no explicit close() is needed.
with open('curtains.csv', 'a', newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    # In append mode the position starts at the end of the file, so 0 means
    # the file is new or empty; skipping the header otherwise avoids a
    # duplicated header row on every re-run of the script.
    if csvFile.tell() == 0:
        writer.writerow(firstrow)
def main(url, last_page=204):
    """Scrape listing pages and append one CSV row per product to curtains.csv.

    For every page number from 1 to ``last_page`` the page is downloaded, the
    product ``<img>`` elements are selected by CSS class, and the image ids are
    pulled out of the raw page text with a regular expression. Images and ids
    are paired in order of appearance; ``zip`` stops at the shorter of the two
    lists. Each product is printed and written as
    ``[running number, name, price, image url]``.

    Args:
        url: URL template with one ``{}`` placeholder for the page number.
        last_page: Number of the last page to request (inclusive). The default
            of 204 matches the previously hard-coded ``range(1, 205)``.
    """
    # Compiled once instead of on every page; the single capture group is the
    # "<digits>.<4 digits>" image id that follows a dot in the page source.
    image_id_pattern = re.compile(r'\.(\d+\.\d{4})')
    count = 0
    # Open the output file once for the whole run rather than once per row.
    with open('curtains.csv', 'a', newline='', encoding='utf-8') as csvFile:
        writer = csv.writer(csvFile)
        for page in range(1, last_page + 1):
            final_url = url.format(page)
            print('\n', '*'*10, 'Scraping Page # {}'.format(page), '*'*10)
            print('Link # {}'.format(final_url))
            r = requests.get(final_url)
            soup = BeautifulSoup(r.content, 'html.parser')
            target = soup.select("img[class*=styles__rounded--1lyoH]")
            # dict.fromkeys removes duplicate ids while keeping first-seen order.
            goal = list(dict.fromkeys(image_id_pattern.findall(r.text)))
            for tar, go in zip(target, goal):
                count += 1
                name = tar['alt']
                # The price is read from the fourth <span> following the image.
                price = tar.find_all_next('span')[3].text
                img = site.format(go)
                print('*'*20, count, '*'*20)
                print('Name: {}'.format(name))
                print('Price: {}'.format(price))
                print('Image Url: {}'.format(img))
                writer.writerow([count, name, price, img])
            # Push each finished page to disk so an interrupted run keeps it.
            csvFile.flush()


if __name__ == "__main__":
    main("https://www.redbubble.com/shop/shower-curtains?page={}")
import requests
import csv
# JSON body for the "withSearchResults" GraphQL operation. The query text is
# assembled from adjacent string literals (one group per fragment) so that no
# literal spans more than one physical line; the resulting string is the same
# query text. "queryParams.page" holds the page number to request.
data = {
    "operationName": "withSearchResults",
    "query": (
        # Operation: pulls in every fragment defined below.
        "query withSearchResults($query: String!, $queryParams: QueryParams, $locale: String!, "
        "$country: String!, $currency: String!, $previewTypeIds: [String!], $experience: String) {\n"
        " searchResults(query: $query, queryParams: $queryParams, locale: $locale, country: $country, "
        "currency: $currency, previewTypeIds: $previewTypeIds, experience: $experience) {\n"
        " ...Results\n ...TrendingResults\n ...Metadata\n ...Filters\n ...Pagination\n ...LandingPage\n"
        " __typename\n }\n}\n\n"
        # Fragment: Results
        "fragment Results on SearchResults {\n results {\n"
        " inventoryItem(locale: $locale, country: $country, currency: $currency, previewTypeIds: $previewTypeIds) {\n"
        " id\n description\n productTypeId\n productPageUrl\n blankItemId\n"
        " price {\n id\n amount\n currency\n __typename\n }\n"
        " previewSet {\n id\n previews {\n previewTypeId\n url\n __typename\n }\n __typename\n }\n"
        " gaCode\n gaCategory\n"
        " attributes {\n name\n value\n attributes {\n name\n value\n __typename\n }\n __typename\n }\n"
        " volumeDiscount {\n id\n thresholds {\n percentOff\n quantity\n __typename\n }\n __typename\n }\n"
        " experiencesProductCard {\n name\n value\n __typename\n }\n"
        " __typename\n }\n"
        " work(locale: $locale) {\n id\n title\n artistName\n isMatureContent\n tags\n __typename\n }\n"
        " defaultPreviewTypeId\n groupId\n rank\n __typename\n }\n __typename\n}\n\n"
        # Fragment: TrendingResults
        "fragment TrendingResults on SearchResults {\n trendingResults {\n"
        " inventoryItem(locale: $locale, country: $country, currency: $currency, previewTypeIds: $previewTypeIds) {\n"
        " id\n description\n productPageUrl\n productTypeId\n"
        " price {\n id\n amount\n currency\n __typename\n }\n"
        " previewSet {\n id\n previews {\n previewTypeId\n url\n __typename\n }\n __typename\n }\n"
        " volumeDiscount {\n id\n thresholds {\n percentOff\n quantity\n __typename\n }\n __typename\n }\n"
        " gaCode\n gaCategory\n"
        " attributes {\n name\n value\n attributes {\n name\n value\n __typename\n }\n __typename\n }\n"
        " experiencesProductCard {\n name\n value\n __typename\n }\n"
        " __typename\n }\n"
        " work(locale: $locale) {\n id\n title\n artistName\n isMatureContent\n tags\n __typename\n }\n"
        " defaultPreviewTypeId\n rank\n __typename\n }\n __typename\n}\n\n"
        # Fragment: Metadata
        "fragment Metadata on SearchResults {\n metadata {\n title\n"
        " searchContext {\n category\n __typename\n }\n"
        " resultCount\n topic\n"
        " searchBar {\n iaCode\n pillLabel\n keywords\n __typename\n }\n"
        " __typename\n }\n __typename\n}\n\n"
        # Fragment: Filters
        "fragment Filters on SearchResults {\n filters {\n resetUrl\n"
        " staticFilters {\n type\n label\n"
        " options {\n name\n label\n applied\n url\n"
        " options {\n name\n label\n applied\n url\n __typename\n }\n"
        " __typename\n }\n __typename\n }\n"
        " filters {\n type\n label\n"
        " experiences {\n name\n value\n __typename\n }\n"
        " options {\n name\n label\n applied\n disabled\n url\n hexColor\n imageUrl\n __typename\n }\n"
        " __typename\n }\n"
        " appliedCount\n appliedPath\n"
        " resets {\n label\n url\n __typename\n }\n"
        " __typename\n }\n __typename\n}\n\n"
        # Fragment: Pagination
        "fragment Pagination on SearchResults {\n pagination {\n"
        " currentPage\n perPage\n showPreviousPageLink\n showNextPageLink\n"
        " paginationLinks {\n namedLinks {\n"
        " previousPage {\n rel\n url\n __typename\n }\n"
        " nextPage {\n rel\n url\n __typename\n }\n"
        " __typename\n }\n __typename\n }\n"
        " fromNumber\n toNumber\n total\n __typename\n }\n __typename\n}\n\n"
        # Fragment: LandingPage
        "fragment LandingPage on SearchResults {\n metadata {\n formattedQuery\n landingPage {\n"
        " hero {\n pitch\n title\n image\n color\n __typename\n }\n"
        " bubbles {\n title\n"
        " items {\n title\n image\n realisticImage\n url\n isExternal\n __typename\n }\n"
        " hasImages\n __typename\n }\n"
        " seoMetadata {\n pageDescription\n robots\n canonicalURL\n searchTitle\n seoImage\n"
        " alternatePageVersions {\n href\n locale\n __typename\n }\n"
        " relatedTagLinks {\n title\n href\n text\n __typename\n }\n"
        " __typename\n }\n"
        " footer {\n text\n breadcrumbs {\n name\n url\n __typename\n }\n __typename\n }\n"
        " __typename\n }\n"
        " relatedTopics {\n title\n url\n __typename\n }\n"
        " relatedProducts {\n id\n url\n productTitle\n fullTitle\n __typename\n }\n"
        " searchPageType\n resultCount\n searchUUID\n __typename\n }\n __typename\n}\n"
    ),
    "variables": {
        "country": "EG",
        "currency": "USD",
        "experience": "srp",
        "locale": "en",
        "previewTypeIds": ["product_close", "alternate_product_close", "artwork"],
        "query": "shower-curtains",
        "queryParams": {"page": 0},
    },
}
def main(url, pages=10):
    """Fetch search results through the GraphQL endpoint and save them to data.csv.

    Posts the module-level ``data`` payload once per page (pages 1 to ``pages``)
    over a single ``requests.Session`` and writes the title, price amount and
    first preview URL of every returned result. ``data.csv`` is overwritten on
    each call and starts with a ``Name, Price, IMG`` header row.

    Args:
        url: GraphQL endpoint that receives the POST requests.
        pages: Number of result pages to request. The default of 10 matches
            the previously hard-coded ``range(1, 11)``.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
    """
    variables = data['variables']
    with requests.Session() as req:
        with open("data.csv", 'w', newline="", encoding="UTF-8") as f:
            writer = csv.writer(f)
            writer.writerow(["Name", "Price", "IMG"])
            for page in range(1, pages + 1):
                print(f"Extracting Page# {page}")
                # Build a per-request payload instead of mutating the shared
                # module-level template on every iteration.
                payload = {
                    **data,
                    'variables': {
                        **variables,
                        'queryParams': {**variables['queryParams'], 'page': page},
                    },
                }
                response = req.post(url, json=payload)
                # Fail with a clear HTTP error rather than an obscure JSON or
                # KeyError further down when the request was rejected.
                response.raise_for_status()
                results = response.json()['data']['searchResults']['results']
                # A distinct loop name: the page counter is no longer shadowed.
                for result in results:
                    inventory = result['inventoryItem']
                    writer.writerow([
                        result['work']['title'],
                        inventory['price']['amount'],
                        inventory['previewSet']['previews'][0]['url'],
                    ])


if __name__ == "__main__":
    main("https://www.redbubble.com/boom/graphql")
Output: view-online Output: 在线查看
Sample:样本:
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.