繁体   English   中英

XML在SAX或StAX中逐节解析

[英]XML parse section by section in SAX or StAX

我的XML文件的简短版本如下所示:

<?xml version="1.0" encoding="UTF-8"?>
<MzIdentML id="MS-GF+">
    <SequenceCollection xmlns="http://psidev.info/psi/pi/mzIdentML/1.1">
        <DBSequence length="146" id="DBSeq143">
            <cvParam cvRef="PSI-MS" accession="MS:1001088"></cvParam>
        </DBSequence>
        <Peptide id="Pep7">
            <PeptideSequence>MFLSFPTTK</PeptideSequence>
            <Modification location="1" monoisotopicMassDelta="15.994915">
                <cvParam cvRef="UNIMOD" accession="UNIMOD:35" name="Oxidation"></cvParam>
            </Modification>
        </Peptide>
        <PeptideEvidence dBSequence_ref="DBSeq143" id="PepEv_160_1_18"></PeptideEvidence>
        <PeptideEvidence dBSequence_ref="DBSeq143" id="PepEv_275_8_133"></PeptideEvidence>
    </SequenceCollection>
</MzIdentML>

我想分别获取DBSequence,Peptide和PeptideEvidence详细信息,但是要获取父级和子级的属性(或嵌套的子级..如果有),换句话说,我希望在以下各节中将所有属性作为键值对:

----------------------------------------------------------------------
<DBSequence length="146" id="DBSeq143">
    <cvParam cvRef="PSI-MS" accession="MS:1001088"></cvParam>
</DBSequence>
----------------------------------------------------------------------
<Peptide id="Pep7">
    <PeptideSequence>MFLSFPTTK</PeptideSequence>
    <Modification location="1" monoisotopicMassDelta="15.994915">
        <cvParam cvRef="UNIMOD" accession="UNIMOD:35" name="Oxidation"></cvParam>
    </Modification>
</Peptide>
----------------------------------------------------------------------
<PeptideEvidence dBSequence_ref="DBSeq143" id="PepEv_160_1_18"></PeptideEvidence>
<PeptideEvidence dBSequence_ref="DBSeq143" id="PepEv_275_8_133"></PeptideEvidence>
----------------------------------------------------------------------

例如,如果我们考虑<DBSequence>部分:

<DBSequence length="146" id="DBSeq143">
    <cvParam cvRef="PSI-MS" accession="MS:1001088"></cvParam>
</DBSequence>

应该输出为:

DBSequence=>length=146;id=DBSeq143;cvRef=PSI-MS;accession=MS:1001088;

这是我在SAX中编写的代码:

package lucene.parse;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * SAX handler that prints one line per {@code DBSequence} and per
 * {@code PeptideEvidence} element, listing the element's attributes as
 * {@code name=value;} pairs.
 *
 * <p>For {@code DBSequence}, the attributes of any {@code cvParam} element
 * encountered before the closing {@code DBSequence} tag are appended to the
 * same line. {@code cvParam} elements seen elsewhere are ignored.
 *
 * <p>Parsing is performed by the constructor; output goes to {@code System.out}.
 */
public class MzIdentMLSAXParser extends DefaultHandler {

    /** True while the parser is between the start and end tag of a DBSequence element. */
    private boolean isDBsequence = false;

    /** Attribute pairs collected for the DBSequence element currently (or last) parsed. */
    String DBSequenceSection;

    /** Attribute pairs collected for the PeptideEvidence element currently (or last) parsed. */
    String PeptideEvidenceDocument;

    /**
     * Demo entry point: parses the file at a placeholder path.
     *
     * @param argv unused
     */
    public static void main(String[] argv) throws SAXException, ParserConfigurationException, IOException {
        // The constructor performs the parse; the instance itself is not needed afterwards.
        new MzIdentMLSAXParser("file_path_here/sample.xml");
    }

    /**
     * Opens the given file and parses it with this object as the SAX handler.
     *
     * @param dataDir path of the XML file to parse (despite the name, a file, not a directory)
     * @throws FileNotFoundException if the file cannot be opened
     * @throws SAXException if the parser reports an error
     * @throws ParserConfigurationException if a SAX parser cannot be created
     * @throws IOException if reading or closing the file fails
     */
    public MzIdentMLSAXParser(String dataDir) throws FileNotFoundException, SAXException, ParserConfigurationException, IOException {
        // try-with-resources guarantees the stream is closed even when parsing throws.
        try (FileInputStream fis = new FileInputStream(dataDir)) {
            SAXParserFactory spf = SAXParserFactory.newInstance();
            SAXParser parser = spf.newSAXParser();
            parser.parse(fis, this);
        }
    }

    /**
     * Starts or extends the attribute line for the elements of interest.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {

        if (qName.equals("DBSequence")) {
            // Each new DBSequence starts a fresh section.
            DBSequenceSection = formatAttributes(atts);
            isDBsequence = true;
        } else if ((qName.equals("cvParam")) && (isDBsequence)) {
            // Only cvParam elements inside a DBSequence are collected;
            // cvParam can also appear under other elements.
            DBSequenceSection += formatAttributes(atts);
        } else if (qName.equals("PeptideEvidence")) {
            // Each new PeptideEvidence starts a fresh document.
            PeptideEvidenceDocument = formatAttributes(atts);
        }
    }

    /**
     * Prints the collected line when a DBSequence or PeptideEvidence element closes.
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if (qName.equals("DBSequence")) {
            System.out.println(qName + "=>" + DBSequenceSection);
            isDBsequence = false;
        } else if (qName.equals("PeptideEvidence")) {
            System.out.println(qName + "=>" + PeptideEvidenceDocument);
        }
    }

    /** Renders every attribute as {@code qName=value;} in the order the parser reports them. */
    private static String formatAttributes(Attributes atts) {
        StringBuilder pairs = new StringBuilder();
        for (int i = 0; i < atts.getLength(); i++) {
            pairs.append(atts.getQName(i)).append('=').append(atts.getValue(i)).append(';');
        }
        return pairs.toString();
    }
}

有没有简单的方法可以做到这一点? 因为我有很多这样的带有嵌套节点的标签。 这里的挑战是<cvParam>不仅出现在<DBSequence>标记中,而且出现在其他标记(例如<Modification>等)中。我也尝试使用StAX。 但是做不到。

这是使用StAX的工作示例。 StAX在解析已知的XML结构时表现出色,但是也可以用于动态解析。

此代码依赖于对XML结构的了解:例如,知道我们需要的元素是DBSequence、Peptide和PeptideEvidence,并且知道PeptideSequence具有文本内容,而其他元素没有。

这些方法使用递归来遵循XML的结构。

/**
 * Demo entry point: builds the sample MzIdentML document in memory, positions a
 * StAX reader on the root element and hands it to {@code search}.
 */
public static void main(String[] args) throws Exception {
    // Joining with "\n" yields the same text as the sample document: every line
    // is newline-terminated except the last one.
    String xml = String.join("\n",
            "<?xml version=\"1.0\" encoding=\"UTF-8\"?>",
            "<MzIdentML id=\"MS-GF+\">",
            "    <SequenceCollection xmlns=\"http://psidev.info/psi/pi/mzIdentML/1.1\">",
            "        <DBSequence length=\"146\" id=\"DBSeq143\">",
            "            <cvParam cvRef=\"PSI-MS\" accession=\"MS:1001088\"></cvParam>",
            "        </DBSequence>",
            "        <Peptide id=\"Pep7\">",
            "            <PeptideSequence>MFLSFPTTK</PeptideSequence>",
            "            <Modification location=\"1\" monoisotopicMassDelta=\"15.994915\">",
            "                <cvParam cvRef=\"UNIMOD\" accession=\"UNIMOD:35\" name=\"Oxidation\"></cvParam>",
            "            </Modification>",
            "        </Peptide>",
            "        <PeptideEvidence dBSequence_ref=\"DBSeq143\" id=\"PepEv_160_1_18\"></PeptideEvidence>",
            "        <PeptideEvidence dBSequence_ref=\"DBSeq143\" id=\"PepEv_275_8_133\"></PeptideEvidence>",
            "    </SequenceCollection>",
            "</MzIdentML>");

    XMLInputFactory factory = XMLInputFactory.newFactory();
    XMLStreamReader reader = factory.createXMLStreamReader(new StringReader(xml));
    try {
        // Advance to the root START_ELEMENT, which is the state search() expects.
        reader.nextTag();
        search(reader);
    } finally {
        reader.close();
    }
}
/**
 * Walks the children of the current element. For each DBSequence, Peptide or
 * PeptideEvidence child it gathers properties via {@code collectProps} and prints
 * them; every other child is searched recursively.
 *
 * <p>The reader must be on a START_ELEMENT on entry and is left on the matching
 * END_ELEMENT on return.
 */
private static void search(XMLStreamReader reader) throws XMLStreamException {
    assert reader.getEventType() == XMLStreamConstants.START_ELEMENT;
    for (int event = reader.nextTag();
            event == XMLStreamConstants.START_ELEMENT;
            event = reader.nextTag()) {
        String elementName = reader.getLocalName();
        boolean isSection = elementName.equals("DBSequence")
                || elementName.equals("Peptide")
                || elementName.equals("PeptideEvidence");
        if (!isSection) {
            // Not a section of interest: descend and keep looking.
            search(reader);
            continue;
        }
        Map<String, String> collected = new LinkedHashMap<>();
        collectProps(reader, collected);
        System.out.println(elementName + ": " + collected);
    }
}
/**
 * Adds the attributes of the current element, and recursively those of all its
 * descendants, to {@code props}. For a PeptideSequence element the element text
 * is stored under the element name instead of descending further.
 *
 * <p>Later entries overwrite earlier ones that share the same key.
 *
 * <p>The reader must be on a START_ELEMENT on entry and is left on the matching
 * END_ELEMENT on return.
 */
private static void collectProps(XMLStreamReader reader, Map<String, String> props) throws XMLStreamException {
    assert reader.getEventType() == XMLStreamConstants.START_ELEMENT;

    final int attributeCount = reader.getAttributeCount();
    int index = 0;
    while (index < attributeCount) {
        props.put(reader.getAttributeLocalName(index), reader.getAttributeValue(index));
        index++;
    }

    String elementName = reader.getLocalName();
    if (elementName.equals("PeptideSequence")) {
        // Text-only element: record its content and stop here.
        props.put(elementName, reader.getElementText());
        return;
    }

    // Any other element is expected to contain only child elements.
    int event = reader.nextTag();
    while (event == XMLStreamConstants.START_ELEMENT) {
        collectProps(reader, props);
        event = reader.nextTag();
    }
}

输出

DBSequence: {length=146, id=DBSeq143, cvRef=PSI-MS, accession=MS:1001088}
Peptide: {id=Pep7, PeptideSequence=MFLSFPTTK, location=1, monoisotopicMassDelta=15.994915, cvRef=UNIMOD, accession=UNIMOD:35, name=Oxidation}
PeptideEvidence: {dBSequence_ref=DBSeq143, id=PepEv_160_1_18}
PeptideEvidence: {dBSequence_ref=DBSeq143, id=PepEv_275_8_133}

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM