简体   繁体   中英

Python - Finding word frequencies and string frequency with possible misspellings and saving as txt file or CSV

What I'm trying to do is scrape pretty messy text files for specific words that sometimes contain misspellings or stray characters that don't belong. So far I have been able to count single words with exact spellings across multiple files in a directory, which is close, but not exactly what I'm looking for. Finally, I want to save the list of word and phrase counts to a text file rather than just print it as a summary, which is what my code does now.

If it's not possible to find close matches, that's okay, but that would be ideal.

Thanks for your help.

import os
from collections import Counter
import glob

def word_frequency(fileobj, words):
    """Build a Counter of specified words in fileobj.

    Every entry of ``words`` is present in the result, with a count of 0
    when it never occurs, so summaries for different files list the same
    keys.

    Matching is exact and per whitespace-delimited token (``str.split``),
    so an entry that itself contains a space can never match and stays 0.

    Args:
        fileobj: An iterable of text lines, e.g. an open text file.
        words: Container of the words to count; it is iterated once to seed
            the counter and then used for membership tests.

    Returns:
        A ``collections.Counter`` mapping each entry of ``words`` to the
        number of tokens in ``fileobj`` equal to it.
    """
    # Seed every requested word with 0 so absent words are still reported.
    ct = Counter({w: 0 for w in words})
    file_words = (word for line in fileobj for word in line.split())
    # Add the matches onto the seeded counter instead of building a new one,
    # which would drop the zero entries.
    ct.update(word for word in file_words if word in words)
    return ct


def count_words_in_dir(dirpath, words, action=None):
    """For each .txt file in a dir, count the specified words.

    Args:
        dirpath: Directory whose ``*.txt`` files are scanned (the pattern is
            not recursive).
        words: The words to count; passed unchanged to ``word_frequency``.
        action: Optional callable invoked as ``action(filepath, ct)`` once
            per file, where ``ct`` is whatever ``word_frequency`` returned
            for that file. When falsy, the counts are computed and dropped.
    """
    # The pattern must be built from the ``dirpath`` argument; there is no
    # ``path`` name in scope here.
    for filepath in glob.iglob(os.path.join(dirpath, '*.txt')):
        with open(filepath) as f:
            ct = word_frequency(f, words)
            if action:
                action(filepath, ct)


def print_summary(filename, ct):
    """Print the counts for one file.

    The output is the file name, then the sorted words joined by ``', '``,
    then their counts in the same order, followed by blank lines.

    Args:
        filename: Label printed on the first line (the path of the file the
            counts belong to).
        ct: Mapping of word (str) to count.
    """
    words = sorted(ct)
    counts = [str(ct[k]) for k in words]
    # Use the ``filename`` parameter; no ``filepath`` name is defined in
    # this function.
    print('{0}\n{1}\n{2}\n\n'.format(
        filename,
        ', '.join(words),
        ', '.join(counts)))


# Terms to look for in every .txt file of the current directory; a summary
# is printed for each file.
words = {
    'JUSTICE',
    "policy payment",
    "payment",
    "annuity",
    "CYNTHEA",
}
count_words_in_dir('./', words, action=print_summary)
import sys
import os
from collections import Counter
import glob
# def count_words_in_dir(dirpath, words, action=None):
#     """For each .txt file in a dir, count the specified words"""
#     for filepath in glob.iglob(os.path.join(path, '*.txt')):
#         with open(filepath) as f:
#             data = f.read()
#             for key,val in words.items():
#                 print("key is " + key + "\n")
#                 ct = data.count(key)
#                 words[key] = ct
#             if action:
#                 action(filepath, ct)
# Remember the real stdout so it can be put back once the run is finished.
stdoutOrigin=sys.stdout 
# From here on, everything print() writes goes to log.txt ("w" truncates
# any existing file) instead of the console.
sys.stdout = open("log.txt", "w")
              
def count_words_in_dir(dirpath, words, action=None):
    """Count occurrences of each key of ``words`` in every .txt file of a dir.

    Counting uses ``str.count`` on the whole file text, so keys are matched
    as substrings (non-overlapping): multi-word phrases work, and a key also
    matches inside longer words.

    Args:
        dirpath: Directory whose ``*.txt`` files are scanned (the pattern is
            not recursive).
        words: Dict whose keys are the strings to search for. Its values are
            overwritten in place with the counts for the file currently
            being processed, so after the call it holds the counts of the
            last file visited.
        action: Optional callable invoked as ``action(filepath, words)``
            after each file has been counted.
    """
    # Build the pattern from the ``dirpath`` argument, not from a literal
    # directory name.
    for filepath in glob.iglob(os.path.join(dirpath, '*.txt')):
        with open(filepath) as f:
            data = f.read()
        # Only existing keys are reassigned, so the dict does not change
        # size while it is being iterated.
        for key in words:
            words[key] = data.count(key)
        if action:
            action(filepath, words)


def print_summary(filepath, words):
    """Print ``filepath``, then one ``key:<TAB>count`` line per entry.

    Entries of the ``words`` mapping are listed in sorted key order.
    """
    print(filepath)
    for term in sorted(words):
        line = '{0}:\t{1}'.format(term, words[term])
        print(line)




# Run the count with stdout redirected, and always undo the redirection:
# without try/finally, any exception (for example a missing command-line
# argument) would leave the log file open and sys.stdout pointing at it.
try:
    # First command-line argument; it is passed on as the directory to scan.
    filepath = sys.argv[1]
    # Presumably placeholders to be replaced with the real search terms;
    # duplicate entries collapse to a single key in dict.fromkeys.
    keys = ["keyword",
    "keyword"]
    words = dict.fromkeys(keys,0)

    count_words_in_dir(filepath, words, action=print_summary)
finally:
    # Flush and close the log file, then restore the saved stdout.
    sys.stdout.close()
    sys.stdout=stdoutOrigin

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address. For any questions, please contact: yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM