
How do I pair up lists with different numbers of elements?

I want to pair a + d, c + e, and b + x, but I don't know how to do it.

I thought I could do it with zip_longest, but it didn't work.

combine_list = zip_longest(list1, list2)

I'm scraping a BBS and collecting information across multiple pages.


Thread title: Shop A

Thread URL: https://thread1, thread2

Page 1 comment: wow wow wow

Page 2 comment: woo woo woo


Thread title: Shop B

Thread URL: https://thread1

Page 1 comment: wow wow wow


Thread title: Shop C

Thread URL: https://thread1, thread2

Page 1 comment: wow wow wow

Page 2 comment: woo woo woo


list1 = [['ShopA', 'thread_url', 'Page1 comment'], ['ShopB', 'thread_url', 'Page1 comment'], ['ShopC', 'thread_url', 'Page1 comment']]

list2 = [['ShopA', 'thread_url', 'Page2 comment'], ['ShopC', 'thread_url', 'Page1 comment']]
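
Here is what zip_longest actually does with these lists: it pairs purely by position, so once a shop is missing from list2 the rows shift and ShopB ends up matched with ShopC:

from itertools import zip_longest

# zip_longest matches index 0 with index 0, index 1 with index 1, ...
for a, b in zip_longest(list1, list2):
    print(a[0], '<->', b[0] if b else None)

# ShopA <-> ShopA
# ShopB <-> ShopC
# ShopC <-> None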

Given this, I want to merge Shop A's Page 1 and Page 2 comments into a single comment, keep Shop B's single comment as it is, and likewise merge Shop C's Page 1 and Page 2 comments.

What is the best way to do this?

Any help would be appreciated.

Code

import requests
from bs4 import BeautifulSoup
import pymongo
import re
import time
import itertools



def browse_header():
    # Request headers with a browser User-Agent so the BBS serves normal pages
    headers = {"User-Agent": "Mozilla/5.0 (X11; Mac OS X x86_64; rv:57.0) Gecko/20100101 Firefox/57.0"}

    return headers


def domain():
    # Base domain, used to build absolute "next page" URLs
    domain = 'https://bakusai.com'

    return domain


def bbs_url_out():
    # Read the seed thread URLs (one per line) from the CSV file
    thread_list = []

    with open('./thread_url.csv', mode='r', encoding='utf-8') as f:
        for urls in f:
            thread_list.append(urls.strip())

    return thread_list


def thread_requests_parse():
    # Keep only the thread URLs that can actually be fetched
    headers = browse_header()
    url_list = bbs_url_out()

    thread_url_lists = []

    for thread_url in url_list:
        time.sleep(1)
        try:
            r = requests.get(thread_url, headers=headers)
            r.raise_for_status()  # drop URLs that return an HTTP error status
            thread_url_lists.append(thread_url)

        except Exception as ex:
            print('Except:', ex)

    return thread_url_lists


def thread_article_parse(domain):

    thread_url_lists = thread_requests_parse()
    headers = browse_header()
    shop_info_list_1 = []  # one [title, url, texts] entry per thread's first page


    for thread_url in thread_url_lists:
        try:
            r = requests.get(thread_url, headers=headers)
            soup = BeautifulSoup(r.text, 'html.parser')
            htmls = soup.find_all('div', {'class': 'article'})
            title1 = soup.find('div', {'id': 'title_thr'}).text


            shop_texts_1 = []
            for html in htmls:
                post = html.get_text()
                time_pat = r'\d\d:\d\d'           # each post carries an HH:MM timestamp
                posts = re.split(time_pat, post)  # [header, body] around the timestamp
                time_post = re.search(time_pat, post)
                try:
                    seikei1_1 = str(posts[0]) + time_post.group() + '\n' + '<br><br><span style="font-size: 200%;"><b>' + str(posts[1]) + '\n' + '</span></b><br>[匿名さん]<br><br>'
                    seikei1_2 = re.sub('最新レス', '', seikei1_1)
                    seikei1_3 = seikei1_2.replace('[匿名さん]', '')
                    # print(seikei1_3)
                    shop_texts_1.append(seikei1_3)

                except Exception as ex:
                    print('Except:', ex)

            thread_info_list = [title1, thread_url, shop_texts_1]
            shop_info_list_1.append(thread_info_list)
            print(shop_info_list_1)

        except Exception as ex:
            print('Except:', ex)

    def next_page_url_parse():
        # Collect the "next page" link from each thread's first page
        thread_url_lists = thread_requests_parse()
        next_url_lists = []

        for thread_url in thread_url_lists:
            try:
                r = requests.get(thread_url, headers=headers)
                soup = BeautifulSoup(r.text, 'html.parser')

                nexts = soup.find('div', {'class': 'paging'}).find('span', {'class': 'paging_nextlink'}).find('a')
                b = nexts.get('href')
                next_url = domain + b
                print('[thread_parse] : next_url', f'{next_url}')
                next_url_lists.append(next_url)

            except Exception as ex:
                print('Except:', ex)

        return next_url_lists


    def next_page_thread_parse():
        # Scrape the posts from each "next page" URL
        next_url_lists = next_page_url_parse()
        shop_info_list_next = []

        for next_url in next_url_lists:
            r2 = requests.get(next_url, headers=headers)
            soup2 = BeautifulSoup(r2.text, 'html.parser')
            html2s = soup2.find_all('div', {'class': 'article'})
            title2 = soup2.find('div', {'id': 'title_thr'}).text


            shop_texts2 = []
            for html2 in html2s:
                post2 = html2.get_text()
                time_pat = r'\d\d:\d\d'
                posts2 = re.split(time_pat, post2)
                time_post = re.search(time_pat, post2)

                try:
                    seikei2_1 = str(posts2[0]) + time_post.group() + '\n' + '<br><br><span style="font-size: 200%;"><b>' + str(posts2[1]) + '\n' + '</span></b><br>[匿名さん]<br><br>'
                    seikei2_2 = re.sub('最新レス', '', seikei2_1)
                    seikei2_3 = seikei2_2.replace('[匿名さん]', '')
                    # print('[thread_parse]', seikei2_3)
                    shop_texts2.append(seikei2_3)
                except Exception as ex:
                    print('Except:', ex)

            thread_info_list_next = [title2, next_url, shop_texts2]
            shop_info_list_next.append(thread_info_list_next)
            print(shop_info_list_next)

            # write_text2 = '\n'.join(texts2)
            # print(write_text2)
        # with open('./thread.txt', mode='a', encoding='utf-8') as f:
        #     f.write('\n\n' + write_text2 + '\n\n')

        return shop_info_list_next

    shop_info_list_next = next_page_thread_parse()


    # zip_longest pairs by position here, which is where the mismatch happens
    shop_info_list_matome = list(itertools.zip_longest(shop_info_list_1, shop_info_list_next))


    # thread_info_list = list([title1, thread_url, shop_texts_1, shop_texts2])
    # print(thread_info_list)
    return shop_info_list_matome

def text_mix(shop_info_list_matome):
    # Debug helper: print how the page-1 and page-2 results were paired
    for shop_info_list in shop_info_list_matome:
        for texts in shop_info_list:
            print(texts, type(texts))


if __name__ == '__main__':
    domain = domain()
    shop_info_list_matome = thread_article_parse(domain)
    text_mix(shop_info_list_matome)

I think what you want to do is build a dictionary keyed by shop name (the first element of each sub-list), so that you can combine all the information from a given shop's different pages. Here is some sample code:

page1 = [['ShopA', 'thread_url', 'Page1 comment'], ['ShopB', 'thread_url', 'Page1 comment'], ['ShopC', 'thread_url', 'Page1 comment']]
page2 = [['ShopA', 'thread_url', 'Page2 comment'], ['ShopC', 'thread_url', 'Page1 comment']]

shops = {}
for page in (page1, page2):
    for shop, thread_url, comment in page:
        shops.setdefault(shop, []).extend([thread_url, comment])

combined = [[k] + v for k, v in shops.items()]
print("\n".join(map(str, combined)))
# ['ShopA', 'thread_url', 'Page1 comment', 'thread_url', 'Page2 comment']
# ['ShopB', 'thread_url', 'Page1 comment']
# ['ShopC', 'thread_url', 'Page1 comment', 'thread_url', 'Page1 comment']

You may need to adapt this code for your purposes, since you didn't give an example of exactly how you want the lists combined, but hopefully it's enough to point you in the right direction.
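
For instance, if the goal is one merged comment string per shop, the same dictionary idea could replace the zip_longest call at the end of thread_article_parse. A rough sketch, assuming the [title, thread_url, texts] sub-lists that function builds (merge_pages is just a made-up helper name):

def merge_pages(page_lists):
    # page_lists would be e.g. [shop_info_list_1, shop_info_list_next];
    # each item holds [title, thread_url, list_of_post_strings] sub-lists
    shops = {}
    for page in page_lists:
        for title, thread_url, texts in page:
            entry = shops.setdefault(title, {'urls': [], 'texts': []})
            entry['urls'].append(thread_url)
            entry['texts'].extend(texts)
    # Join every page's posts into a single comment string per shop
    return {title: '\n'.join(e['texts']) for title, e in shops.items()}

# e.g. merged = merge_pages([shop_info_list_1, shop_info_list_next])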
