[英]How do i make python find differing elements in a list and their number of occurrences within it
[英]How do I make a pair with a list of different number of elements?
我想配对 a + d、c + e 和 b + x,但我不知道该怎么做。
我以为我可以使用 zip_longest 来完成,但它没有用。
combine_list = zip_longest(list1, list2)
我正在抓取 BBS 并跨多个页面获取信息。
主题:A店
线程 URL: https://thread1, thread2
第1页评论:哇哇哇哇
第2页评论:呜呜呜
帖子标题:B店
线程 URL:https://thread1
第1页评论:哇哇哇哇
帖子标题:店铺C
线程 URL: https://thread1, thread2
第1页评论:哇哇哇哇
第2页评论:呜呜呜
list1 = [['ShopA', 'thread_url', 'Page1 评论'], ['ShopB', 'thread_url', 'Page1 评论'], ['ShopC', 'thread_url', 'Page1 评论']]
list2 = [['ShopA', 'thread_url', 'Page2 评论'], ['ShopC', 'thread_url', 'Page1 评论']]
我有这样的情况,想把A店的Page1和Page2的评论合并成一个评论。 B店要获取B店的评论,C店要合并Page1和Page2的评论。
做这个的最好方式是什么?
等待帮助。
代码
import requests
from bs4 import BeautifulSoup
import pymongo
import re
import time
import itertools
def browse_header():
    """Return the HTTP request headers sent with every scraping request."""
    user_agent = (
        "Mozilla/5.0 (X11; Mac OS X x86_64; rv:57.0) "
        "Gecko/20100101 Firefox/57.0"
    )
    return {"User-Agent": user_agent}
def domain():
    """Return the base URL of the target BBS site (no trailing slash)."""
    return 'https://bakusai.com'
def bbs_url_out():
    """Read thread URLs from ./thread_url.csv (one per line) and return them stripped."""
    with open('./thread_url.csv', mode='r', encoding='utf-8') as f:
        return [line.strip() for line in f]
def thread_requests_parse():
    """Probe every URL from bbs_url_out() and return the reachable ones.

    Sleeps one second between requests to keep the load on the server low.
    The response body is discarded here -- only reachability matters; URLs
    whose request raises are skipped best-effort (logged, not fatal).

    Returns:
        list[str]: the subset of input URLs whose GET did not raise.
    """
    headers = browse_header()
    reachable = []
    for thread_url in bbs_url_out():
        time.sleep(1)  # polite crawl delay
        try:
            # Fix: original bound the response to an unused local `r` and
            # had a redundant `pass` after the print in the handler.
            requests.get(thread_url, headers=headers)
        except Exception as ex:
            # Best-effort: a dead URL should not abort the whole crawl.
            print('Except:', ex)
        else:
            reachable.append(thread_url)
    return reachable
def thread_article_parse(domain):
    """Scrape page 1 of every reachable thread, then each thread's "next"
    page, and pair the two result lists positionally.

    Parameters:
        domain: base URL of the site (e.g. 'https://bakusai.com'), used to
            resolve the relative next-page links.

    Returns:
        list of 2-tuples from itertools.zip_longest -- element i pairs the
        page-1 info of thread i with the i-th next-page info; each info is
        [title, url, list-of-formatted-post-strings]. Shorter list is
        padded with None.

    NOTE(review): the two inner functions are nested here because they read
    the free variables `headers` and `domain`; the original indentation was
    lost in transit, so confirm this layout against the author's file.
    """
    thread_url_lists = thread_requests_parse()
    headers = browse_header()
    domain = domain  # no-op self-assignment, kept as-is
    shop_info_list_1 = []  # accumulates [title, url, texts] per thread (page 1)
    for thread_url in thread_url_lists:
        try:
            r = requests.get(thread_url, headers=headers)
            soup = BeautifulSoup(r.text, 'html.parser')
            # one 'article' div per post; 'title_thr' holds the thread title
            htmls = soup.find_all('div', {'class': 'article'})
            title1 = soup.find('div', {'id': 'title_thr'}).text
            shop_texts_1 = []
            for html in htmls:
                post = html.get_text()
                time_pat = r'\d\d:\d\d'  # HH:MM timestamp splits header from body
                posts = re.split(time_pat, post)
                time_post = re.search(time_pat, post)
                try:
                    # Rebuild the post as HTML: header + timestamp + enlarged body.
                    seikei1_1 = str(posts[0]) + time_post.group() + '\n' + '<br><br><span style="font-size: 200%;"><b>' + str(posts[1]) + '\n' + '</span></b><br>[匿名さん]<br><br>'
                    # Strip the "latest posts" marker and the anonymous-poster tag.
                    seikei1_2 = re.sub('最新レス', '', seikei1_1)
                    seikei1_3 = seikei1_2.replace('[匿名さん]', '')
                    # print(seikei1_3)
                    shop_texts_1.append(seikei1_3)
                except Exception as ex:
                    # Posts without a timestamp have no posts[1] -> IndexError /
                    # AttributeError; they are skipped.
                    print('Except:', ex)
                    pass
            thread_info_list = list([title1, thread_url, shop_texts_1])
            shop_info_list_1.append(thread_info_list)
            print(shop_info_list_1)
        except Exception as ex:
            print('Except:', ex)
            pass

    def next_page_url_parse():
        """Collect each thread's "next page" URL; threads without one are skipped."""
        thread_url_lists = thread_requests_parse()
        next_url_lists = []
        for thread_url in thread_url_lists:
            try:
                r = requests.get(thread_url, headers=headers)
                soup = BeautifulSoup(r.text, 'html.parser')
                nexts = soup.find('div', {'class': 'paging'}).find('span', {'class': 'paging_nextlink'}).find('a')
                b = nexts.get('href')  # relative href, resolved against `domain`
                next_url = domain + b
                print('[thread_parse] : next_url', f'{next_url}')
                next_url_lists.append(next_url)
            except Exception as ex:
                # Single-page threads have no paging_nextlink -> AttributeError.
                print('Except:', ex)
                pass
        return next_url_lists

    def next_page_thread_parse():
        """Scrape the second page of every thread that has one.

        Same post parsing/formatting as the page-1 loop above.
        Returns [[title, next_url, texts], ...].
        """
        next_url_lists = next_page_url_parse()
        shop_info_list_next = []
        for next_url in next_url_lists:
            r2 = requests.get(next_url, headers=headers)
            soup2 = BeautifulSoup(r2.text, 'html.parser')
            html2s = soup2.find_all('div', {'class': 'article'})
            title2 = soup2.find('div', {'id': 'title_thr'}).text
            shop_texts2 = []
            for html2 in html2s:
                post2 = html2.get_text()
                time_pat = r'\d\d:\d\d'
                posts2 = re.split(time_pat, post2)
                time_post = re.search(time_pat, post2)
                try:
                    seikei2_1 = str(posts2[0]) + time_post.group() + '\n' + '<br><br><span style="font-size: 200%;"><b>' + str(posts2[1]) + '\n' + '</span></b><br>[匿名さん]<br><br>'
                    seikei2_2 = re.sub('最新レス', '', seikei2_1)
                    seikei2_3 = seikei2_2.replace('[匿名さん]', '')
                    # print('[thread_parse]', seikei2_3)
                    shop_texts2.append(seikei2_3)
                except Exception as ex:
                    print('Except:', ex)
                    pass
            thread_info_list_next = list([title2, next_url, shop_texts2])
            shop_info_list_next.append(thread_info_list_next)
            print(shop_info_list_next)
            # write_text2 = '\n'.join(texts2)
            # print(write_text2)
            # with open('./thread.txt', mode='a', encoding='utf-8') as f:
            # f.write('\n\n' + write_text2 + '\n\n')
        return shop_info_list_next

    shop_info_list_next = next_page_thread_parse()
    # NOTE(review): zip_longest pairs the two lists by POSITION, not by shop
    # name -- a thread without a second page shifts every later pairing.
    # This is the misalignment the question is about.
    shop_info_list_matome = list(itertools.zip_longest(shop_info_list_1, shop_info_list_next))
    # thread_info_list = list([title1, thread_url, shop_texts_1, shop_texts2])
    # print(thread_info_list)
    return shop_info_list_matome
def text_mix(shop_info_list_matome):
    """Debug-print every element of each paired shop-info entry and its type."""
    for pair in shop_info_list_matome:
        for item in pair:
            print(item, type(item))
if __name__ == '__main__':
    # NOTE(review): the results of browse_header() and bbs_url_out() are
    # discarded here; each scraping function calls them again internally.
    browse_header()
    domain = domain()  # rebinds the name 'domain' from the function to the URL string
    bbs_url_out()
    shop_info_list_matome = thread_article_parse(domain)
    text_mix(shop_info_list_matome)
我想你想要做的是建立一个以商店名称(每个子列表的第一个元素)为键的字典,这样你就可以组合来自给定商店不同页面的所有信息。 下面是一些示例代码:
page1 = [['ShopA', 'thread_url', 'Page1 comment'], ['ShopB', 'thread_url', 'Page1 comment'], ['ShopC', 'thread_url', 'Page1 comment']]
page2 = [['ShopA', 'thread_url', 'Page2 comment'], ['ShopC', 'thread_url', 'Page1 comment']]
# Group by shop name (first element) so pages of the same shop merge together.
shops = {}
for page in (page1, page2):
    for shop, thread_url, comment in page:
        if shop not in shops:
            shops[shop] = []
        shops[shop] += [thread_url, comment]
combined = [[k] + v for k, v in shops.items()]
print("\n".join(map(str, combined)))
# ['ShopA', 'thread_url', 'Page1 comment', 'thread_url', 'Page2 comment']
# ['ShopB', 'thread_url', 'Page1 comment']
# ['ShopC', 'thread_url', 'Page1 comment', 'thread_url', 'Page1 comment']
您可能需要根据您的目的修改此代码,因为您没有给出一个示例来说明您希望如何组合列表,但希望这足以为您指明正确的方向。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.