[英]Efficient and fast way to counting word frequency and sorting the list in python
我已经分析了文本数据,现在我想从分析结果中计算出满足特定条件(日期、类别等)的关键字。 分析结果每个都超过5万个,我有1500个条件。 是否有一种高效/快速的方法来提取满足条件的关键字?
下面是我写的代码,它非常耗时,所以我需要一些有效的方法。
from collections import defaultdict
from typing import DefaultDict
# function for counting keywords
def count_words(top_rel: DefaultDict, top_pos: DefaultDict, top_neg: DefaultDict, data: pd.DataFrame):
    """Accumulate word frequencies from *data* into the three counters.

    Columns "1"/"2"/"3" hold positive/negative/related word lists. Cells are
    either real lists or their string repr (as read back from a TSV); in the
    latter case each cell is parsed with ``ast.literal_eval``.

    Returns the counters in the order ``(top_rel, top_pos, top_neg)``.
    """
    # Decide once, from the first row, whether cells need parsing.
    cells_are_strings = isinstance(data.loc[:, "3"].values[0], str)
    for _, row in data.loc[:, "0":"3"].iterrows():
        if cells_are_strings:
            pos_words = ast.literal_eval(row["1"])
            neg_words = ast.literal_eval(row["2"])
            rel_words = ast.literal_eval(row["3"])
        else:
            pos_words, neg_words, rel_words = row["1"], row["2"], row["3"]
        for word in pos_words:
            top_pos[word] += 1
        for word in neg_words:
            top_neg[word] += 1
        for word in rel_words:
            top_rel[word] += 1
    return top_rel, top_pos, top_neg
# Create conditions: every combination of category id, region code, start
# date, target and ISO week number becomes one row of `top_word_id`.
cat_ids = [subcats['id'] for subcats in cp.cat_config['cat'].values()] # cat ids in the category table
# NOTE(review): assumes `data._dates` has a single unique value per file and
# that it parses as %Y%m%d — confirm against the upstream loader.
index = pd.MultiIndex.from_product([cat_ids, data.code.unique(), [start_date.strftime("%Y%m%d")],
data.target.unique(), [datetime.datetime.strptime(str(data._dates.unique()[0]),
"%Y%m%d").date().isocalendar()[1]]], names=["category_code", "region_code", "start_date",
"target", "year_week"]) # Cartesian product
top_word_id = pd.DataFrame(index=index).reset_index()
# Create defaultdict for each condition: three independent counters per row
# (one fresh defaultdict per row — do NOT share a single dict across rows).
top_word_id.loc[:, 'weekly_associated_top_word'] = [defaultdict(int) for _ in range(top_word_id.shape[0])]
top_word_id.loc[:, 'weekly_positive_top_word'] = [defaultdict(int) for _ in range(top_word_id.shape[0])]
top_word_id.loc[:, 'weekly_negative_top_word'] = [defaultdict(int) for _ in range(top_word_id.shape[0])]
# for specific periods,
# For each queued day, read that day's analysis results and fold the word
# counts of every matching document into each condition's counter dicts.
while dates_queue:
    date = dates_queue.popleft()
    date_str = date.strftime("%Y%m%d.tsv")
    data = pd.read_csv(PurePath("../results", date_str), sep='\t', engine='python', encoding='utf-8')
    for i, item in top_word_id.iterrows():  # for each condition
        # Condition components for this row.
        cat_id = item.loc["category_code"]
        target = item.loc['target']
        code = item.loc['region_code']
        # Documents flagged for this category (one indicator column per id).
        category_data = data[data.loc[:, cat_id] == 1]
        if category_data.shape[0] == 0:
            continue
        temp = category_data[(category_data.loc[:, 'target'] == target) & (category_data.loc[:, 'code'] == code)]
        if temp.shape[0] == 0:
            continue
        # BUGFIXES vs. the original:
        #  * count over the matched subset `temp`, not the whole day's `data`;
        #  * address the counter columns by name — iloc positions 6/7/8 were
        #    off by one (the frame has columns 0-7 only);
        #  * unpack in the (rel, pos, neg) order count_words returns, and use
        #    those names when storing (`rel`/`pos`/`neg` were undefined).
        top_rel, top_pos, top_neg = count_words(
            item.loc["weekly_associated_top_word"],
            item.loc["weekly_positive_top_word"],
            item.loc["weekly_negative_top_word"],
            temp)
        top_word_id.at[i, "weekly_associated_top_word"] = top_rel
        top_word_id.at[i, "weekly_positive_top_word"] = top_pos
        top_word_id.at[i, "weekly_negative_top_word"] = top_neg
我真的很想给你看一个样本,但是它太大了,而且是韩语,我认为你看不到。 相反,我说明了逻辑的伪代码。
数据 (pd.DataFrame):输入是一天内的文档集合。它有名为 target、category 和 code 的列。此外,数据还包含名为 0, 1, 2, 3 的列,每个元素都是一个单词列表。(例如 data.loc[0, "0"] = ['a', 'b', 'c'],data.loc[0, "1"] = ['hello', 'world', '.'])
top_word_id (pd.DataFrame):该 DataFrame 的每一行表示一个条件,即 target、category 和 code 的一种组合,用于匹配 data 的行。
对于 top_word_id 的第 j 行,假设满足条件的数据有 2 行,分别为 data 的第 i1 行和第 i2 行,那么我想聚合这两行的词频。聚合的结果必须保留,因为我还想把今天和明天的文档的词频继续累加。
在简单的用例中,给定一个可迭代对象,您可以使用 collections.Counter
对象, https : //docs.python.org/3/library/collections.html#collections.Counter例如
>>> from collections import Counter
>>> mylist = [1,2,3,3,2,1]
>>> Counter(mylist)
Counter({1: 2, 2: 2, 3: 2})
给定一个字符串:
>>> text = "This is a sentence with repeated words words words in the sentence"
>>> tokenized_text = text.split()
>>> Counter(tokenized_text)
Counter({'This': 1,
'is': 1,
'a': 1,
'sentence': 2,
'with': 1,
'repeated': 1,
'words': 3,
'in': 1,
'the': 1})
要更新计数器:
>>> counter = Counter()
>>> counter.update(tokenized_text_1) # assuming tokenized text is an iterable of strings.
>>> counter.update(tokenized_text_2)
我通过利用collections.Counter
、 Cython 和multiprocessing.Pool
实现了高效和快速的逻辑。 我用Counter
替换了计数部分,并利用 Cython 和multiprocessing.Pool
来提高效率。
下面是整个代码:
from collections import defaultdict, Counter
from typing import DefaultDict
def count_words(top_pos: DefaultDict, top_neg: DefaultDict, top_rel: DefaultDict, data: pd.DataFrame):
    """Add the word frequencies found in *data* to the three running counters.

    Columns "1"/"2"/"3" hold positive/negative/related word lists; cells are
    either real lists or their string repr (parsed with ``ast.literal_eval``).

    Returns ``(top_pos, top_neg, top_rel)``.
    Raises ValueError when the cells are neither str nor list.

    Bugfixes vs. the original: the list branch printed data_pos/data_neg/
    data_rel before defining them (NameError) and iterated ``Series.items()``
    tuples as if they were Counters (AttributeError); debug prints removed.
    """
    sample = data.loc[:, "3"].values[0]
    if isinstance(sample, str):
        prepare = ast.literal_eval
    elif isinstance(sample, list):
        prepare = None  # cells are already lists
    else:
        raise ValueError("The type must be either list or str")
    # Count each cell with Counter (C-speed), then merge into the caller's
    # dict so counts accumulate across successive calls/days.
    for counter, col in ((top_pos, "1"), (top_neg, "2"), (top_rel, "3")):
        for cell in data.loc[:, col]:
            words = prepare(cell) if prepare is not None else cell
            for word, freq in Counter(words).items():
                counter[word] += freq
    return top_pos, top_neg, top_rel
def test(data, top_word_id):
    """For every condition row of *top_word_id*, find the rows of *data* that
    match it (category indicator column == 1, same target, same region code)
    and fold their word frequencies into the row's three counter dicts.

    Returns the updated *top_word_id*.

    Bugfix vs. the original: count over the matched subset ``temp`` — the
    original computed ``temp`` but then passed the whole ``data`` frame to
    count_words, so every condition counted every document of the day.
    """
    for i, item in top_word_id.iterrows():
        cat_id = item.loc["category_code"]  # renamed from `id` (shadowed builtin)
        target = item.loc['target']
        code = item.loc['region_code']
        # Documents flagged for this category (one indicator column per id).
        category_data = data[data.loc[:, cat_id] == 1]
        if category_data.shape[0] == 0:
            continue
        temp = category_data[(category_data.loc[:, 'target'] == target) & (category_data.loc[:, 'code'] == code)]
        if temp.shape[0] == 0:
            continue
        top_pos, top_neg, top_rel = count_words(
            top_word_id.loc[i, "weekly_positive_top_word"],
            top_word_id.loc[i, "weekly_negative_top_word"],
            top_word_id.loc[i, "weekly_associated_top_word"],
            temp)
        top_word_id.at[i, "weekly_associated_top_word"] = top_rel
        top_word_id.at[i, "weekly_positive_top_word"] = top_pos
        top_word_id.at[i, "weekly_negative_top_word"] = top_neg
    return top_word_id
from multiprocessing import Pool, cpu_count
from contextlib import contextmanager
import numpy as np
@contextmanager
def poolcontext(*args, **kwargs):
    """Context manager yielding a multiprocessing.Pool, terminated on exit.

    Bugfix: the pool is created *before* the try block; previously, if
    ``Pool(...)`` itself raised, the finally clause referenced the unbound
    name ``pool`` and masked the real error with a NameError.
    """
    pool = Pool(*args, **kwargs)
    try:
        yield pool
    finally:
        pool.terminate()
def parallelize_aggregation(data, top_word_id, func):
    """Split the condition table across all CPU cores and apply *func* to each
    (data, chunk) pair in a worker pool; return the list of per-chunk results.

    Bugfix: the pool now calls the *func* argument — the original ignored it
    and hardcoded the module-level ``test`` function.
    """
    num_cores = cpu_count()
    df_split = np.array_split(top_word_id, num_cores, axis=0)
    with poolcontext(processes=num_cores) as pool:
        # Each worker receives the full day's data plus one slice of conditions.
        results = pool.starmap(func, zip([data for _ in range(num_cores)], df_split))
    return results
parallelize_aggregation(data, top_word_id, aggregate.test)
下表说明了代码的时间:
代码 | 时间 |
---|---|
Cython(问题中的代码) | 4749s |
Cython + Counter | 3066s |
Cython + Counter + multiprocessing.Pool | 10s |
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.