[英]How to improve web scraping code speed using multithreading in Python
下面是我的代码,它逐行写入数据(大约有 900 页,每页 10 行,每行 5 个数据)。有什么方法可以让它更快?目前将数据导出到 CSV 需要 80 分钟。有没有办法对页面进行并行请求,从而提高这段代码的效率?
import requests
from urllib3.exceptions import InsecureRequestWarning
import csv
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs
def scrap_bid_data():
    """Scrape the bid listing pages and write one CSV row per bid.

    Requests ``bidlists`` pages 1 through 909 one after another, looks up the
    ``pagi_content`` container in each page and writes the bid number, items,
    quantity, department and end date of every parsed entry through the
    module-level CSV writer ``f``.  Scraping stops at the first page that has
    no ``pagi_content`` container or whose container is empty.

    Raises:
        requests.exceptions.RequestException: If a page cannot be downloaded
            (including when the server does not answer within the timeout).
        IndexError, ValueError: If an entry does not have the expected line
            layout or its quantity is not an integer.
    """
    for page_no in range(1, 910):
        print('Hold on creating URL to fetch data...')
        url = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + str(page_no)
        print('URL created: ' + url)
        # Without a timeout a single stalled connection would block forever.
        scraped_data = requests.get(url, verify=False, timeout=30)
        soup_data = bs(scraped_data.text, 'lxml')
        extracted_data = soup_data.find('div', {'id': 'pagi_content'})
        # find() returns None when the container is missing; treat that the
        # same as an empty container instead of failing on len(None).
        if extracted_data is None or len(extracted_data) == 0:
            break
        # Only the odd-indexed children are parsed; presumably the even ones
        # are whitespace between entries -- TODO confirm against the page.
        for entry in extracted_data.contents[1::2]:
            bid_data = entry.text.strip().split('\n')
            bidno = bid_data[0].split(":")[-1]
            items = bid_data[5].split(":")[-1]
            qnty = int(bid_data[6].split(':')[1].strip())
            dept = (bid_data[10] + bid_data[12].strip()).split(":")[-1]
            edate = bid_data[17].split("End Date:")[-1]
            f.writerow([bidno, items, qnty, dept, edate])


# The context manager guarantees GEM.csv is flushed and closed even when the
# scrape aborts with an exception.
with open('GEM.csv', 'w', newline='') as out_file:
    f = csv.writer(out_file)
    f.writerow(['Bidnumber', 'Items', 'Quantitiy', 'Department', 'Enddate'])
    scrap_bid_data()
我对您的代码进行了一些重组,以确保您的 CSV 文件能被正确关闭。我还收到了以下错误消息:
ConnectionError: HTTPSConnectionPool(host='bidplus.gem.gov.in', port=443): Max retries exceeded with url: /bidlists?bidlists&page_no=1 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000012EB0DF1E80>:建立新连接失败:[WinError 10060] 连接尝试失败,因为连接方在一段时间后没有正确响应,或建立连接失败,因为连接的主机没有响应',))
您应该尝试调整 NUMBER_THREADS 的值:
import requests
from urllib3.exceptions import InsecureRequestWarning
import csv
import concurrent.futures
import functools
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs
def download_page(session, page_no, timeout=30):
    """Download one bid listing page and return its body as text.

    Args:
        session: Object whose ``get(url, verify=..., timeout=...)`` method
            performs the request (a ``requests.Session`` in this script).
        page_no: Page number appended to the ``page_no`` query parameter.
        timeout: Seconds passed to ``session.get`` as ``timeout`` so that a
            stalled connection cannot block the calling thread forever.

    Returns:
        The ``text`` attribute of the response.
    """
    url = 'https://bidplus.gem.gov.in/bidlists?bidlists&page_no=' + str(page_no)
    print('URL created: ' + url)
    # Certificate verification stays disabled, as in the rest of the script.
    resp = session.get(url, verify=False, timeout=timeout)
    return resp.text
def scrap_bid_data(number_threads=30):
    """Download the bid listing pages concurrently and write them to GEM.csv.

    Pages 1 through 909 are fetched by a thread pool via ``download_page``
    sharing one ``requests.Session``.  Each page is then parsed in page order
    and one CSV row (bid number, items, quantity, department, end date) is
    written per parsed entry.  Processing stops at the first page that has no
    ``pagi_content`` container or whose container is empty; that page's
    number and raw text are printed for diagnosis.

    Args:
        number_threads: Number of concurrent download requests (the
            ``max_workers`` of the thread pool).  Worth experimenting with.

    Raises:
        IndexError, ValueError: If an entry does not have the expected line
            layout or its quantity is not an integer.
        Exception: Whatever ``download_page`` raised for a page is re-raised
            when that page's result is consumed.
    """
    with open('GEM.csv', 'w', newline='') as out_file:
        f = csv.writer(out_file)
        f.writerow(['Bidnumber', 'Items', 'Quantitiy', 'Department', 'Enddate'])
        with requests.Session() as session:
            page_downloader = functools.partial(download_page, session)
            with concurrent.futures.ThreadPoolExecutor(max_workers=number_threads) as executor:
                # map() yields results in the order of the submitted page
                # numbers, so enumerate() reproduces the page number.
                pages = executor.map(page_downloader, range(1, 910))
                for page_no, page in enumerate(pages, start=1):
                    soup_data = bs(page, 'lxml')
                    extracted_data = soup_data.find('div', {'id': 'pagi_content'})
                    if extracted_data is None or len(extracted_data) == 0:
                        print('No data at page number', page_no)
                        print(page)
                        # NOTE: leaving the executor's ``with`` block still
                        # waits for the downloads that were already submitted.
                        break
                    # Only the odd-indexed children are parsed; presumably the
                    # even ones are whitespace between entries -- TODO confirm.
                    for entry in extracted_data.contents[1::2]:
                        bid_data = entry.text.strip().split('\n')
                        bidno = bid_data[0].split(":")[-1]
                        items = bid_data[5].split(":")[-1]
                        qnty = int(bid_data[6].split(':')[1].strip())
                        dept = (bid_data[10] + bid_data[12].strip()).split(":")[-1]
                        edate = bid_data[17].split("End Date:")[-1]
                        f.writerow([bidno, items, qnty, dept, edate])


scrap_bid_data()
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.