繁体   English   中英

在 Spacy 中使用多个时间匹配器

[英]Using multiple time Matcher in Spacy

我想在跨度(发送)列表中使用 Spacy 的 Matcher

class Chunker:
    def __init__(self, nlp, matcher):
        self.nlp = nlp
        self.matcher = matcher
        self.matcher.add("NP", NP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VP", VP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VVP", VVP_pattern, on_match=self.on_match_callback, greedy="LONGEST")

    def on_match_callback(self, matcher, doc, i, matches):
        match_id, start, end = matches[i]
        string_id = self.nlp.vocab.strings[match_id]
        span = doc[start:end]
        print("(", span, ")")
        self.phrase[string_id].append(span)

    def chunk(self, text):
        self.phrases = []
        doc = self.nlp(text)
        sents = list(doc.sents)
        for sent in sents:
            self.phrase = {
                "NP": [],
                "VP": [],
                "VVP": []
            }
            self.phrases.append(self.phrase)
            print("[", sent, "]")
            self.matcher(sent)

            for phrase in self.phrase.values():
                phrase.sort(key=lambda x: x.start)

        return self.phrases
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
chunker = Chunker(nlp, matcher)

phrases = chunker.chunk("Pytables is built on top of the HDF5 library, using the Python language and the NumPy package.\nI love pdf, it is wonderfull.")
print(phrases)

但它似乎很困惑并给我这个回应

[ Pytables is built on top of the HDF5 library, using the Python language and the NumPy package.
 ]
( the HDF5 library )
( the Python language )
( the NumPy package )
( Pytables )
( top )
( is built on )
( using )
[ I love pdf, it is wonderfull. ]
( is )
( of )
( built )
[{'NP': [Pytables, top, the HDF5 library, the Python language, the NumPy package], 'VP': [is built on, using], 'VVP': []}, {'NP': [built], 'VP': [is, of], 'VVP': []}]

第一个元素很好,但第二个元素不好{'NP': [built], 'VP': [is, of], 'VVP': []}如果我们多次使用不同的文本匹配器会有问题吗?

我没有使用多个句子,而是检查回调 function 上的句子 ID,它有效但看起来有点恶心

class Chunker:
    def __init__(self, nlp, matcher):
        self.nlp = nlp
        self.matcher = matcher
        self.matcher.add("NP", NP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VP", VP_pattern, on_match=self.on_match_callback, greedy="LONGEST")
        self.matcher.add("VVP", VVP_pattern, on_match=self.on_match_callback, greedy="LONGEST")

    def on_match_callback(self, matcher, doc, i, matches):
        match_id, start, end = matches[i]
        string_id = self.nlp.vocab.strings[match_id]
        span = doc[start:end]
        sents = list(doc.sents)
        sent_id = sents.index(span.sent)
        print("(", span, ")")
        print("Sentence number: ", sent_id)

        self.phrases[sent_id][string_id].append(span)

    def chunk(self, text):
        self.phrases = []
        doc = self.nlp(text)
        self.phrases = [{"NP": [], "VP": [], "VVP": []} for _ in doc.sents]
        self.matcher(doc)

        for phrases in self.phrases:
            for phrase in phrases.values():
                phrase.sort(key=lambda x: x.start)

        return self.phrases

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM