I need to scan through a text file and, for each "document" (a region delimited by
<P ID=xxx></P>
tags), accomplish several word-frequency tasks. An example of the file looks like this:
<P ID=1>
Lorem ipsum dolor sit amet
</P>
<P ID=2>
consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
</P>
<P ID=3>
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.
</P>
<P ID=4>
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</P>
<P ID=5>
654654
</P>
Currently, I can accomplish the first two requirements, and part of the third. My code is currently able to produce how many "documents" (defined by the existence of the tags) a word appears in, and for each document, which words appear and how many times.
My program is below:
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
import csv
import operator
import re
import pandas
import collections
from collections import defaultdict, Counter
import sys
def remove_nums(arr):
    """Return a copy of *arr* with every ASCII digit removed from each string."""
    digit_pattern = re.compile('[0-9]')
    return [digit_pattern.sub('', text) for text in arr]
# Main Program
def main():
    """Report word frequencies for a file of <P ID=n>...</P> delimited documents.

    For each paragraph (1-indexed) prints a Counter of its cleaned-token
    counts, then prints a Counter of document frequencies — the number of
    paragraphs each token appears in at least once.
    """
    myfile = get_input("path")
    stop_words = list(stopwords.words('english'))

    # Each <P ID=n>...</P> span is one "document"/paragraph.
    paras = RegexpTokenizer(r'<P ID=\d+>(.*?)</P>')
    words = RegexpTokenizer(r'\w+')

    # Run the tokenize/clean pipeline exactly once per paragraph and keep
    # the results (the original code repeated the whole pipeline in two
    # identical loops over the file).
    cleaned_paragraphs = []
    for para in paras.tokenize(myfile):
        lower = [word.lower() for word in words.tokenize(para)]
        no_integers = remove_nums(lower)
        dirty_tokens = [data for data in no_integers if data not in stop_words]
        # Drop tokens that became empty/whitespace after digit removal.
        tokens = [data for data in dirty_tokens if data.strip()]
        cleaned_paragraphs.append(tokens)

    # Document frequency: count each token at most once per paragraph.
    document_frequency = collections.Counter()
    for tokens in cleaned_paragraphs:
        document_frequency.update(set(tokens))

    # Per-paragraph term frequencies, labelled with a 1-based paragraph id.
    for id_num, tokens in enumerate(cleaned_paragraphs, start=1):
        currFrequencies = collections.Counter(tokens)
        print(id_num, " - ", currFrequencies)

    print()
    print(document_frequency)
Which produces:
1 - Counter({'lorem': 1, 'ipsum': 1, 'dolor': 1, 'sit': 1, 'amet': 1})
2 - Counter({'ut': 3, 'consectetur': 1, 'adipiscing': 1, 'elit': 1, 'sed': 1, 'eiusmod': 1, 'tempor': 1, 'incididunt': 1, 'labore': 1, 'et': 1, 'dolore': 1, 'magna': 1, 'aliqua': 1, 'enim': 1, 'ad': 1, 'minim': 1, 'veniam': 1, 'quis': 1, 'nostrud': 1, 'exercitation': 1, 'ullamco':
1, 'laboris': 1, 'nisi': 1, 'aliquip': 1, 'ex': 1, 'ea': 1, 'commodo': 1, 'consequat': 1})
3 - Counter({'duis': 1, 'aute': 1, 'irure': 1, 'dolor': 1, 'reprehenderit': 1, 'voluptate': 1, 'velit': 1, 'esse': 1, 'cillum': 1, 'dolore': 1, 'eu': 1, 'fugiat': 1, 'nulla': 1, 'pariatur': 1})
4 - Counter({'excepteur': 1, 'sint': 1, 'occaecat': 1, 'cupidatat': 1, 'non': 1, 'proident': 1, 'sunt': 1, 'culpa': 1, 'qui': 1, 'officia':
1, 'deserunt': 1, 'mollit': 1, 'anim': 1, 'id': 1, 'est': 1, 'laborum': 1})
5 - Counter()
Counter({'dolor': 2, 'dolore': 2, 'lorem': 1, 'ipsum': 1, 'amet': 1, 'sit': 1, 'sed': 1, 'incididunt': 1, 'tempor': 1, 'labore': 1, 'laboris': 1, 'aliqua': 1, 'veniam': 1, 'quis': 1, 'consequat': 1, 'adipiscing': 1, 'ea': 1, 'nostrud': 1, 'consectetur': 1, 'aliquip': 1, 'ad': 1, 'enim': 1, 'elit': 1, 'eiusmod': 1, 'commodo': 1, 'ut': 1, 'ex': 1, 'ullamco': 1, 'nisi': 1, 'minim': 1, 'exercitation': 1, 'et': 1, 'magna': 1, 'fugiat': 1, 'eu': 1, 'duis': 1, 'reprehenderit': 1, 'voluptate': 1, 'pariatur': 1, 'irure': 1, 'esse': 1, 'nulla': 1, 'aute': 1, 'velit': 1, 'cillum': 1, 'mollit': 1, 'excepteur': 1, 'non': 1, 'cupidatat': 1, 'laborum': 1, 'anim': 1, 'est': 1, 'officia': 1, 'deserunt': 1, 'sunt': 1,
'occaecat': 1, 'culpa': 1, 'id': 1, 'proident': 1, 'qui': 1, 'sint': 1})
However, I need a data structure in the shape of a dictionary that has:
word: document_frequency, (paragraph, count of appearances), (paragraph, count of appearances), (paragraph, count of appearances), etc.
For example:
{'dolor': [2, (1, 1), (2, 0), (3, 1), (4, 0), (5, 0)]}
How can I combine my collection frequencies to achieve the desired result above?
Something like the following pseudocode
words = set(all the words from all the counters)
d = {}
for word in words:
para_counts = [0] # This will end up as [2, (1, 1), (2, 0) ...]
for i, para in enumerate(paras, 1): # 1-indexed
para_count = para[word]
para_counts[0] += para_count # Update the all-paras word total
para_counts.append((i, para_count)) # Add the this-para (1, 1) tuple
d[word] = para_counts
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.