简体   繁体   中英

Using multiple time Matcher in Spacy

I would like to use Matcher from Spacy on a list of span (sents)

class Chunker:
    def __init__(self, nlp, matcher):
        self.nlp = nlp
        self.matcher = matcher
        self.matcher.add("NP", NP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VP", VP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VVP", VVP_pattern, on_match=self.on_match_callback, greedy="LONGEST")

    def on_match_callback(self, matcher, doc, i, matches):
        match_id, start, end = matches[i]
        string_id = self.nlp.vocab.strings[match_id]
        span = doc[start:end]
        print("(", span, ")")
        self.phrase[string_id].append(span)

    def chunk(self, text):
        self.phrases = []
        doc = self.nlp(text)
        sents = list(doc.sents)
        for sent in sents:
            self.phrase = {
                "NP": [],
                "VP": [],
                "VVP": []
            }
            self.phrases.append(self.phrase)
            print("[", sent, "]")
            self.matcher(sent)

            for phrase in self.phrase.values():
                phrase.sort(key=lambda x: x.start)

        return self.phrases
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
chunker = Chunker(nlp, matcher)

phrases = chunker.chunk("Pytables is built on top of the HDF5 library, using the Python language and the NumPy package.\nI love pdf, it is wonderfull.")
print(phrases)

but it seems confused and give me this response

[ Pytables is built on top of the HDF5 library, using the Python language and the NumPy package.
 ]
( the HDF5 library )
( the Python language )
( the NumPy package )
( Pytables )
( top )
( is built on )
( using )
[ I love pdf, it is wonderfull. ]
( is )
( of )
( built )
[{'NP': [Pytables, top, the HDF5 library, the Python language, the NumPy package], 'VP': [is built on, using], 'VVP': []}, {'NP': [built], 'VP': [is, of], 'VVP': []}]

The first element is good but not the second {'NP': [built], 'VP': [is, of], 'VVP': []} Is there a problem if we use the matcher several times with different text?

Instead of using multiple sentence, I check the sentence ID on the callback function, It work but looks a bit gross

class Chunker:
    def __init__(self, nlp, matcher):
        self.nlp = nlp
        self.matcher = matcher
        self.matcher.add("NP", NP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VP", VP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VVP", VVP_pattern, on_match=self.on_match_callback, greedy="LONGEST")

    def on_match_callback(self, matcher, doc, i, matches):
        match_id, start, end = matches[i]
        string_id = self.nlp.vocab.strings[match_id]
        span = doc[start:end]
        sents = list(doc.sents)
        sent_id = sents.index(span.sent)
        print("(", span, ")")
        print("Sentence number: ", sent_id)

        self.phrases[sent_id][string_id].append(span)

    def chunk(self, text):
        self.phrases = []
        doc = self.nlp(text)
        self.phrases = [{"NP": [], "VP": [], "VVP": []} for _ in doc.sents]
        self.matcher(doc)

        for phrases in self.phrases:
            for phrase in phrases.values():
                phrase.sort(key=lambda x: x.start)

        return self.phrases

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM