[英]Python break loop into several sections
我想从 7000 个网址抓取数据，并将抓取到的信息保存为 CSV。我不想一次性遍历全部 7000 个 URL，而是想把输出拆分开——怎样才能做到每个 CSV 只包含 1000 个 URL 的数据？
下面是我当前代码的示例。为便于演示，我把总数从 7000 缩小为 10 个 URL，并把每个 CSV 的 URL 数从 1000 改为 2。
# Example list of URLs to scrape (scaled down from 7000 to 10 for the demo).
urls = ['www.1.com', 'www.2.com', 'www.3.com', 'www.4.com', 'www.5.com',
        'www.6.com', 'www.7.com', 'www.8.com', 'www.9.com', 'www.10.com']

# Accumulators for the rows of the *current* chunk only.
ranks = []
names = []
prices = []
count = 0         # total URLs scraped so far, used only for progress output
rows_count = 0    # index of the first URL of the current chunk
total_index = 10  # scaled down from 7000 for this example
i = 1

while i < total_index:
    # Scrape one chunk of 2 URLs (would be 1000 in the real run).
    for url in urls[rows_count:rows_count + 2]:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        count += 1
        print('Loop', count, f'started for {url}')
        rank = []
        name = []
        price = []
        # loop for watchlist
        # NOTE(review): soup.find(...) returns None when the element is
        # missing, which would raise TypeError here — assumes every page
        # contains these elements.
        for item in soup.find('div', class_='sc-16r8icm-0 bILTHz'):
            rank.append(item.text)
        ranks.append(rank)
        # loop for ticker name
        for ticker in soup.find('h2', class_='sc-1q9q90x-0 jCInrl h1'):
            name.append(ticker.text)
        names.append(name)
        # loop for price
        for price_tag in soup.find('div', class_='sc-16r8icm-0 kjciSH priceTitle'):
            price.append(price_tag.text)
        prices.append(price)
        # Random pause between requests to avoid hammering the server.
        sleep_interval = randint(1, 2)
        print('Sleep interval ', sleep_interval)
        time.sleep(sleep_interval)
    rows_count += 2
    # Write this chunk out as its own CSV.
    df = pd.DataFrame(ranks)
    df2 = pd.DataFrame(names)
    df3 = pd.DataFrame(prices)
    final_table = pd.concat([df, df2, df3], axis=1)
    final_table.columns = ['rank', 'type', 'watchlist', 'name', 'symbol', 'price', 'changes']
    final_table.to_csv(os.path.join(path, fr'summary_{rows_count}.csv'))
    # Bug fix: clear the accumulators so each CSV contains only its own
    # chunk's rows instead of every row scraped so far.
    ranks = []
    names = []
    prices = []
    i += 2
希望能得到针对这个问题的进一步帮助。
或者有没有其他方法可以做到。
据我了解,您将通过抓取每个 URL 获得一行数据。 分块抓取并写入 CSV 的通用解决方案如下所示:
def scrape_in_chunks(urls, scraping_function, chunk_size, filename_template):
    """Apply a scraping function to a list of URLs and save the results as a
    series of CSVs, with data from one URL on each row and ``chunk_size``
    URLs in each CSV file.

    Args:
        urls: list of URL strings to scrape.
        scraping_function: callable taking a URL and returning a dict mapping
            column name -> value for that URL's row.
        chunk_size: number of URLs written to each output CSV.
        filename_template: ``str.format`` template containing ``{start}`` and
            ``{end}`` placeholders for the chunk's first and last URL index.
    """
    for i in range(0, len(urls), chunk_size):
        # Bug fix: the original called an undefined name `scrape` instead of
        # the `scraping_function` parameter.
        df = pd.DataFrame([scraping_function(url) for url in urls[i:i + chunk_size]])
        df.to_csv(filename_template.format(start=i, end=i + chunk_size - 1))
def my_scraper(url):
    """Scrape rank, name and price details from one URL.

    Returns a dict keyed by the output CSV's column names, suitable for
    passing to ``scrape_in_chunks`` as the per-URL scraping function.
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    # Bug fix: the original printed the undefined global `count`
    # (it only exists in the question's script, not here).
    print(f'Scraping started for {url}')
    keys = ['rank', 'type', 'watchlist', 'name', 'symbol', 'price', 'changes']
    # NOTE(review): soup.find(...) returns None when an element is missing,
    # which would make these iterations raise TypeError — assumes every page
    # contains all three elements.
    data = ([item.text for item in soup.find('div', class_='sc-16r8icm-0 bILTHz')] +
            [item.text for item in soup.find('h2', class_='sc-1q9q90x-0 jCInrl h1')] +
            [item.text for item in soup.find('div', class_='sc-16r8icm-0 kjciSH priceTitle')])
    # A dict is simpler here than a DataFrame or Series per URL.
    return dict(zip(keys, data))


scrape_in_chunks(urls, my_scraper, 1000, os.path.join(path, "summary {start}-{end}.csv"))
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.