I'm trying to extract all the human names from a file path. My approach is to split the file path into individual words, apply NLTK's part-of-speech tagger to identify proper nouns, and then use the ne_chunk function to identify persons.
import nltk
import re
def extract_entities(y):
    """Collect the PERSON entities NLTK finds in a backslash-separated path.

    Each path segment is split into word tokens, POS-tagged, and handed to
    nltk.ne_chunk. Every chunk labelled "PERSON" is joined into one string
    and capitalized.

    Args:
        y: File path whose segments are separated by backslashes.

    Returns:
        A set of the capitalized person strings, excluding 'Schedules' and
        'Old'.
    """
    people = []
    # Work through the path one segment at a time.
    for segment in y.split("\\"):
        # Tokens are runs of word characters, so tokens containing digits
        # are kept as well.
        words = re.findall(r"[\w]+", segment)
        tagged = nltk.pos_tag(words)
        for node in nltk.ne_chunk(tagged):
            # Only labelled chunk nodes can be person entities.
            if not hasattr(node, 'label'):
                continue
            if node.label() != "PERSON":
                continue
            joined = ' '.join(leaf[0] for leaf in node.leaves())
            people.append(joined.capitalize())
    # Drop known false positives; building a set also removes duplicates.
    unwanted = ['Schedules', 'Old']
    return {person for person in people if person not in unwanted}
# Raw string: the backslashes are path separators, not escape sequences.
text = r"O:\Country\Province\District\city\Cricket, Jimmy (Y1617F)\Old Schedules\Cricket, Jimmy (78655) Golick doo wop 7 Sept 2016.xlsx"
print(extract_entities(text))
The problem is that the result is 'Jimmy y1617f', but I want it to be 'Jimmy'.
I think nltk.ne_chunk groups words in a way that makes sense for ordinary text but not for file paths. To solve the problem, I tried to define my own equivalent of nltk.ne_chunk as follows:
import nltk
import re
from nltk import RegexpParser
def extract_entities(y):
    """Attempt to find person names with a custom RegexpParser chunker.

    NOTE: this version does not run to completion. It calls the parser
    object directly, which raises
    ``TypeError: 'RegexpParser' object is not callable``.

    Args:
        y: File path whose segments are separated by backslashes.

    Returns:
        A set of capitalized strings from chunks labelled "PERSON",
        excluding 'Schedules' and 'Old' (never reached because of the
        TypeError above).
    """
    found = []
    grammar = r"<NP:{<NNP>+}"
    np_chunker = RegexpParser(grammar)
    print(np_chunker)
    for segment in y.split("\\"):
        tagged = nltk.pos_tag(re.findall(r"[\w]+", segment))
        # The parser instance is invoked like a function here; this is the
        # call that raises the TypeError.
        for node in np_chunker(tagged):
            if not hasattr(node, 'label'):
                continue
            if node.label() != "PERSON":
                continue
            joined = ' '.join(leaf[0] for leaf in node.leaves())
            found.append(joined.capitalize())
    unwanted = ['Schedules', 'Old']
    return {item for item in found if item not in unwanted}
This raised an error:
TypeError: 'RegexpParser' object is not callable
Full traceback:
chunk.RegexpParser with 1 stages:
RegexpChunkParser with 1 rules:
<ChunkRule: '<NNP>'>
Traceback (most recent call last):
File "<ipython-input-282-cb323eff63b4>", line 1, in <module>
runfile('C:/Users//.spyder-py3/ExtractingNames.py', wdir='C:/Users//.spyder-py3')
File "C:\spydercustomize.py", line 827, in runfile
execfile(filename, namespace)
File "C:\spydercustomize.py", line 110, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users//.spyder-py3/ExtractingNames.py", line 32, in <module>
print(extract_entities(text))
File "C:/Users//.spyder-py3/ExtractingNames.py", line 23, in extract_entities
for chunk in chunker(nltk.pos_tag(re.findall(r"[\w]+", y))) :
TypeError: 'RegexpParser' object is not callable
# Chunk grammar: label two adjacent proper nouns (NNP NNP) as "P".
patterns = r"P:{<NNP>{2}}"
chunker = nltk.RegexpParser(patterns)

# Candidate name words: whole words with no digits, excluding the exact
# words "Schedules" and "Old". The \b inside the lookahead keeps words that
# merely start with those prefixes (e.g. "Oldham") from being discarded.
_NAME_WORD = re.compile(r"\b(?!(?:Schedules|Old)\b)[^\d\W]+\b")


def extract_entities(y):
    """Extract pairs of adjacent proper nouns from a backslash-separated path.

    Each path segment is reduced to its digit-free words (minus "Schedules"
    and "Old"), POS-tagged with NLTK, and chunked with the module-level
    ``chunker``. Every chunk labelled "P" is joined into one string and
    capitalized.

    Args:
        y: File path whose segments are separated by backslashes.

    Returns:
        A set of the capitalized strings built from the "P" chunks.
    """
    people = set()
    for segment in y.split("\\"):
        words = _NAME_WORD.findall(segment)
        if not words:
            # Nothing to tag in this segment; avoid running the tagger and
            # chunker on empty input.
            continue
        for node in chunker.parse(nltk.pos_tag(words)):
            # Only labelled chunk nodes are candidates.
            if hasattr(node, 'label') and node.label() == "P":
                people.add(' '.join(leaf[0] for leaf in node.leaves()).capitalize())
    return people
# Raw string: the backslashes are path separators, not escape sequences.
text = r"O:\Country\Province\District\city\Cricket, Jimmy (Y1617F)\Old Schedules\Cricket, Jimmy (78655) Golick doo wop 7 Sept 2016.xlsx"
print(extract_entities(text))
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.