简体   繁体   English

使用斯坦福 CoreNLP 进行 CorefResolution

[英]Using Stanford CoreNLP for CorefResolution

I am trying to use Stanford CoreNLP to perform Coref resolution.我正在尝试使用斯坦福 CoreNLP 来执行 Coref 解析。 The version I use is stanford-corenlp-full-2015-12-09.我使用的版本是 stanford-corenlp-full-2015-12-09。 Basically, I have written some classes:基本上，我写了一些类：

import edu.stanford.nlp.dcoref.CorefChain;
import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;


public class CorefResolution {
    public static String corefResolute(String text, List<String> tokenToReplace) {
        Properties props = new Properties();
        props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        Annotation doc = new Annotation(text);
        pipeline.annotate(doc);

        Map<Integer, CorefChain> corefs = doc.get(CorefCoreAnnotations.CorefChainAnnotation.class);
        System.out.println(corefs);
        List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
        List<String> resolved = new ArrayList<String>();

        for (CoreMap sentence : sentences) {
            List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);

            for (CoreLabel token : tokens) {

                Integer corefClustId = token.get(CorefCoreAnnotations.CorefClusterIdAnnotation.class);
                token.get(Coref)

                if (corefClustId == null) {
                    System.out.println("NULL NULL NULL\n");
                    resolved.add(token.word());
                    continue;
                }
                else {
                    System.out.println("Exist Exist Exist\n");
                }

                System.out.println("coreClustId is "+corefClustId.toString()+"\n");
                CorefChain chain = corefs.get(corefClustId);

                if (chain == null || chain.getMentionsInTextualOrder().size() == 1) {
                    resolved.add(token.word());
                } else {
                    int sentINdx = chain.getRepresentativeMention().sentNum - 1;
                    CoreMap corefSentence = sentences.get(sentINdx);
                    List<CoreLabel> corefSentenceTokens = corefSentence.get(CoreAnnotations.TokensAnnotation.class);

                    CorefChain.CorefMention reprMent = chain.getRepresentativeMention();

                    if (tokenToReplace.contains(token.word())) {
                        for (int i = reprMent.startIndex; i < reprMent.endIndex; i++) {
                            CoreLabel matchedLabel = corefSentenceTokens.get(i - 1);
                            resolved.add(matchedLabel.word());
                        }
                    } else {
                        resolved.add(token.word());
                    }
                }
            }
        }

        Detokenizer detokenizer = new Detokenizer();
        String resolvedStr = detokenizer.detokenize(resolved);

        return resolvedStr;
    }
}

Another class另一个类

import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;


public class Detokenizer {

    /**
     * Joins a list of tokens back into a sentence string, inserting spaces
     * between tokens except around punctuation that conventionally attaches
     * to its neighbor (e.g. no space before "," or after "(").
     *
     * @param tokens the token sequence; never modified by this method
     * @return the detokenized sentence (empty string for an empty list)
     */
    public String detokenize(List<String> tokens) {
        // Punctuation that should NOT have a space before / after it.
        List<String> noSpaceBefore = new LinkedList<String>(Arrays.asList(",", ".", ";", ":", ")", "}", "]", "'", "'s", "n't"));
        List<String> noSpaceAfter = new LinkedList<String>(Arrays.asList("(", "[", "{", "\"", ""));

        StringBuilder sentence = new StringBuilder();

        // BUG FIX: the original did tokens.add(0, ""), mutating the caller's list.
        // That throws UnsupportedOperationException for fixed-size lists such as
        // Arrays.asList(...) and surprises callers otherwise. Work on a copy.
        // The "" sentinel is in noSpaceAfter, so the first real token gets no leading space.
        List<String> work = new ArrayList<String>(tokens.size() + 1);
        work.add("");
        work.addAll(tokens);

        for (int i = 1; i < work.size(); i++) {
            if (noSpaceBefore.contains(work.get(i))
                    || noSpaceAfter.contains(work.get(i - 1))) {
                sentence.append(work.get(i));
            } else {
                sentence.append(" " + work.get(i));
            }

            // Assumption that opening double quotes are always followed by matching
            // closing double quotes: after each '"', swap it between the two sets so
            // an opening quote hugs the next token and a closing quote hugs the previous.
            if ("\"".equals(work.get(i - 1))) {
                if (noSpaceAfter.contains("\"")) {
                    noSpaceAfter.remove("\"");
                    noSpaceBefore.add("\"");
                } else {
                    noSpaceAfter.add("\"");
                    noSpaceBefore.remove("\"");
                }
            }
        }
        return sentence.toString();
    }
}

Another class file另一个类文件

import java.io.*;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.List;


public class PlainTextCorefResolver {

    /**
     * Reads {@code inputFile} line by line, runs coreference resolution on each
     * line, and writes the resolved lines to {@code outputFile} (both UTF-8).
     *
     * NOTE(review): resolving line-by-line means coref chains can only be found
     * WITHIN a single line; antecedents on earlier lines are never seen.
     */
    public static void resolveFile(File inputFile, File outputFile) {
        // Pronouns (exact spellings) eligible for replacement — loop-invariant,
        // so build it once instead of once per line as the original did.
        List<String> tokenToReplace = Arrays.asList("He", "he", "She", "she", "It", "it", "They", "they");

        // try-with-resources guarantees both streams are closed even when
        // resolution throws; the original leaked them on any exception.
        // (The original's exists() checks ran AFTER opening the streams, so a
        // missing input had already thrown — they were dead code and are removed.)
        try (BufferedReader reader = new BufferedReader(
                     new InputStreamReader(new FileInputStream(inputFile), Charset.forName("UTF-8")));
             PrintWriter writer = new PrintWriter(outputFile, "UTF-8")) {

            String line;
            while ((line = reader.readLine()) != null) {  // readLine returns null at EOF
                writer.println(CorefResolution.corefResolute(line, tokenToReplace));
            }

        } catch (Exception e) {
            System.err.println("Failed to open/resolve input file [" + inputFile.getAbsoluteFile() + "] in loader");
            e.printStackTrace();
        }
    }


    public static void main(String[] args) {
        String inputFileName = "path/file.txt";
        String outputFileName = "path/file.resolved.txt";
        File inputFile = new File(inputFileName);
        File outputFile = new File(outputFileName);
        resolveFile(inputFile, outputFile);
    }

}

However, it doesn't give any useful result.但是,它没有给出任何有用的结果。 The corefClusterId is always null, thus I always get a bunch of "NULL NULL NULL" outputs. corefClusterId 始终为空,因此我总是得到一堆“NULL NULL NULL”输出。

How can I correctly perform coreference resolution to replace such as "He/he/She/she/It/it/The stadium/..." with its most typical mention (person or organization's name)?我如何正确执行共指解析以用最典型的提及(人或组织的名称)替换诸如“他/他/她/她/它/它/体育场/......”?

For example, given: "Estadio El Madrigal is a stadium in Spain, used since 1923. It is currently mostly used for football matches."例如,给出:“Estadio El Madrigal 是西班牙的一个体育场,自 1923 年开始使用。目前主要用于足球比赛。” I want to get "Estadio El Madrigal is a stadium in Spain, used since 1923. Estadio El Madrigal is currently mostly used for football matches."我想得到“Estadio El Madrigal 是西班牙的一个体育场,自 1923 年开始使用。Estadio El Madrigal 目前主要用于足球比赛。”

I don't think our coref system is attaching "Estadio El Madrigal" to "It" in your example.我不认为我们的 coref 系统在您的示例中将“Estadio El Madrigal”附加到“It”。

Here is some example code for accessing the CorefChains and mentions in general.这是一些用于访问 CorefChains 和一般提及的示例代码。

import edu.stanford.nlp.hcoref.CorefCoreAnnotations;
import edu.stanford.nlp.hcoref.data.CorefChain;
import edu.stanford.nlp.hcoref.data.Mention;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

import java.util.*;

public class CorefExample {

    /** Annotates a sample sentence, then prints its coref chains and mentions. */
    public static void main(String[] args) throws Exception {

        Properties props = new Properties();
        props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse,mention,coref");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        Annotation document = new Annotation("John Kerry is the secretary of state.  He ran for president in 2004.");
        pipeline.annotate(document);

        // Walk every chain and dump each of its mentions in textual order.
        System.out.println("---");
        System.out.println("coref chains");
        Map<Integer, CorefChain> chains = document.get(CorefCoreAnnotations.CorefChainAnnotation.class);
        for (CorefChain chain : chains.values()) {
            System.out.println("\t" + chain);
            System.out.println(chain.getMentionMap());
            for (CorefChain.CorefMention mention : chain.getMentionsInTextualOrder()) {
                System.out.println("---");
                System.out.println("full text: " + mention.mentionSpan);
                System.out.println("position: " + mention.position);
                System.out.println("start index of first word: " + mention.startIndex);
            }
        }

        // Independently list the raw mentions detected per sentence.
        List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class);
        for (CoreMap sentence : sentences) {
            System.out.println("---");
            System.out.println("mentions");
            for (Mention mention : sentence.get(CorefCoreAnnotations.CorefMentionsAnnotation.class)) {
                System.out.println("\t" + mention);
            }
        }
    }
}

====================== ======================
Update更新
@StanfordNLPHelper, there is the error I get when using "coref" rather than "dcoref": @StanfordNLPHelper,使用“coref”而不是“dcoref”时出现错误:

INFO: Read 25 rules
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator parse
[main] INFO edu.stanford.nlp.parser.common.ParserGrammar - Loading parser from serialized file edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz ... 
done [0.3 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator mention
Using mention detector type: rule
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator coref
Exception in thread "main" java.lang.OutOfMemoryError: GC overhead limit exceeded
    at java.util.Arrays.copyOfRange(Arrays.java:3664)
    at java.lang.String.<init>(String.java:207)
    at java.lang.StringBuilder.toString(StringBuilder.java:407)
    at java.io.ObjectInputStream$BlockDataInputStream.readUTFBody(ObjectInputStream.java:3079)
    at java.io.ObjectInputStream$BlockDataInputStream.readUTF(ObjectInputStream.java:2874)
    at java.io.ObjectInputStream.readString(ObjectInputStream.java:1639)
    at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1342)
    at java.io.ObjectInputStream.readObject(ObjectInputStream.java:371)
    at java.util.HashMap.readObject(HashMap.java:1394)
    at sun.reflect.GeneratedMethodAccessor2.invoke(Unknown Source)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:497)
    at java.io.ObjectStreamClass.invokeReadObject(ObjectStreamClass.java:1017)
    at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1900)
    at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1801)
    at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1351)
    at java.io.ObjectInputStream.defaultReadFields(ObjectInputStream.java:2000)
    at java.io.ObjectInputStream.readSerialData(ObjectInputStream.java:1924)
    at java.io.ObjectInputStream.readOrdinaryObject(ObjectInputStream.java:1801)
    at java.io.ObjectInputStream.readObject0(ObjectInputStream.java:1351)
    at java.io.ObjectInputStream.readObject(ObjectInputStream.java:371)
    at edu.stanford.nlp.io.IOUtils.readObjectFromURLOrClasspathOrFileSystem(IOUtils.java:324)
    at edu.stanford.nlp.scoref.SimpleLinearClassifier.<init>(SimpleLinearClassifier.java:30)
    at edu.stanford.nlp.scoref.PairwiseModel.<init>(PairwiseModel.java:75)
    at edu.stanford.nlp.scoref.PairwiseModel$Builder.build(PairwiseModel.java:57)
    at edu.stanford.nlp.scoref.ClusteringCorefSystem.<init>(ClusteringCorefSystem.java:31)
    at edu.stanford.nlp.scoref.StatisticalCorefSystem.fromProps(StatisticalCorefSystem.java:48)
    at edu.stanford.nlp.pipeline.CorefAnnotator.<init>(CorefAnnotator.java:66)
    at edu.stanford.nlp.pipeline.AnnotatorImplementations.coref(AnnotatorImplementations.java:220)
    at edu.stanford.nlp.pipeline.AnnotatorFactories$13.create(AnnotatorFactories.java:515)
    at edu.stanford.nlp.pipeline.AnnotatorPool.get(AnnotatorPool.java:85)
    at edu.stanford.nlp.pipeline.StanfordCoreNLP.construct(StanfordCoreNLP.java:375)

Process finished with exit code 1

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM