简体   繁体   中英

XML parse section by section in SAX or StAX

A shortened version of my XML file looks like this:

<?xml version="1.0" encoding="UTF-8"?>
<MzIdentML id="MS-GF+">
    <SequenceCollection xmlns="http://psidev.info/psi/pi/mzIdentML/1.1">
        <DBSequence length="146" id="DBSeq143">
            <cvParam cvRef="PSI-MS" accession="MS:1001088"></cvParam>
        </DBSequence>
        <Peptide id="Pep7">
            <PeptideSequence>MFLSFPTTK</PeptideSequence>
            <Modification location="1" monoisotopicMassDelta="15.994915">
                <cvParam cvRef="UNIMOD" accession="UNIMOD:35" name="Oxidation"></cvParam>
            </Modification>
        </Peptide>
        <PeptideEvidence dBSequence_ref="DBSeq143" id="PepEv_160_1_18"></PeptideEvidence>
        <PeptideEvidence dBSequence_ref="DBSeq143" id="PepEv_275_8_133"></PeptideEvidence>
    </SequenceCollection>
</MzIdentML>

I want to get the DBSequence, Peptide and PeptideEvidence details separately, including the attributes of each parent element and of its children (and nested children, if there are any). In other words, I want all the attributes as key-value pairs for each section, as illustrated below:

----------------------------------------------------------------------
<DBSequence length="146" id="DBSeq143">
    <cvParam cvRef="PSI-MS" accession="MS:1001088"></cvParam>
</DBSequence>
----------------------------------------------------------------------
<Peptide id="Pep7">
    <PeptideSequence>MFLSFPTTK</PeptideSequence>
    <Modification location="1" monoisotopicMassDelta="15.994915">
        <cvParam cvRef="UNIMOD" accession="UNIMOD:35" name="Oxidation"></cvParam>
    </Modification>
</Peptide>
----------------------------------------------------------------------
<PeptideEvidence dBSequence_ref="DBSeq143" id="PepEv_160_1_18"></PeptideEvidence>
<PeptideEvidence dBSequence_ref="DBSeq143" id="PepEv_275_8_133"></PeptideEvidence>
----------------------------------------------------------------------

For example, if we consider <DBSequence> section:

    <DBSequence length="146" id="DBSeq143">
    <cvParam cvRef="PSI-MS" accession="MS:1001088"></cvParam>
</DBSequence>

should be output as:

DBSequence=>length=146;id=DBSeq143;cvRef=PSI-MS;accession=MS:1001088;

This is the code I wrote in SAX:

package lucene.parse;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * SAX handler that prints the attributes of {@code DBSequence} and
 * {@code PeptideEvidence} elements as {@code name=value;} lists.
 *
 * <p>For a {@code DBSequence} element the attributes of any {@code cvParam}
 * element encountered before the closing {@code DBSequence} tag are appended
 * to the same line. {@code cvParam} elements seen outside a
 * {@code DBSequence} are ignored. Elements are matched on their qualified
 * name ({@code qName}).
 *
 * <p>Parsing is triggered by the constructor; one line per matched element is
 * written to {@code System.out} when its end tag is reached.
 */
public class MzIdentMLSAXParser extends DefaultHandler {

    /** True while the parser is between a DBSequence start tag and its end tag. */
    private boolean isDBsequence = false;

    /** Attribute list collected for the current DBSequence (its own plus nested cvParam). */
    String DBSequenceSection;
    /** Attribute list of the most recently opened PeptideEvidence element. */
    String PeptideEvidenceDocument;

    /**
     * Parses the hard-coded sample path and prints the matched sections.
     *
     * @param argv ignored
     * @throws SAXException if the document is not well-formed
     * @throws ParserConfigurationException if no SAX parser can be created
     * @throws IOException if the file cannot be opened or read
     */
    public static void main(String[] argv) throws SAXException, ParserConfigurationException, IOException {
        // The constructor performs the parse; the instance itself is not needed afterwards.
        new MzIdentMLSAXParser("file_path_here/sample.xml");
    }

    /**
     * Opens the given file and parses it with this object as the SAX handler.
     *
     * @param dataDir path of the XML file to parse (despite the name, a file, not a directory)
     * @throws FileNotFoundException if the file cannot be opened
     * @throws SAXException if the document is not well-formed
     * @throws ParserConfigurationException if no SAX parser can be created
     * @throws IOException if reading or closing the file fails
     */
    public MzIdentMLSAXParser(String dataDir) throws FileNotFoundException, SAXException, ParserConfigurationException, IOException {
        // try-with-resources guarantees the stream is closed even when parsing fails.
        try (FileInputStream fis = new FileInputStream(dataDir)) {
            SAXParserFactory spf = SAXParserFactory.newInstance();
            SAXParser parser = spf.newSAXParser();
            parser.parse(fis, this);
        }
    }

    /**
     * Records the attributes of DBSequence, of cvParam inside a DBSequence,
     * and of PeptideEvidence elements.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
        if (qName.equals("DBSequence")) {
            // Each new DBSequence starts a fresh section.
            DBSequenceSection = formatAttributes(atts);
            isDBsequence = true;
        } else if (qName.equals("cvParam") && isDBsequence) {
            // Only cvParam elements that belong to a DBSequence are collected;
            // cvParam can also appear under other elements.
            DBSequenceSection += formatAttributes(atts);
        } else if (qName.equals("PeptideEvidence")) {
            // Each new PeptideEvidence starts a fresh document.
            PeptideEvidenceDocument = formatAttributes(atts);
        }
    }

    /**
     * Prints the collected attribute list when a DBSequence or
     * PeptideEvidence element closes.
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        if (qName.equals("DBSequence")) {
            System.out.println(qName + "=>" + DBSequenceSection);
            isDBsequence = false;
        } else if (qName.equals("PeptideEvidence")) {
            System.out.println(qName + "=>" + PeptideEvidenceDocument);
        }
    }

    /**
     * Renders every attribute as {@code qName=value;} in the order reported
     * by the parser.
     *
     * @param atts attributes of the current element
     * @return the concatenated list, or an empty string when there are no attributes
     */
    private static String formatAttributes(Attributes atts) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < atts.getLength(); i++) {
            sb.append(atts.getQName(i)).append('=').append(atts.getValue(i)).append(';');
        }
        return sb.toString();
    }
}

Is there an easier way of doing this? I have lots of tags like this with nested nodes. The challenge here is that <cvParam> appears not only in the <DBSequence> tag, but also in other tags such as <Modification>. I tried StAX too, but couldn't make it work.

Here is a working example of using StAX. StAX excels when parsing known XML structures, but can be used for dynamic parsing too.

This code relies on prior knowledge of the document structure, e.g. knowing that we want the content of DBSequence, Peptide, and PeptideEvidence, and that PeptideSequence has text content while the others don't.

The methods use recursion to follow the structure of the XML.

/**
 * Demo entry point: builds the sample document in memory, opens a StAX
 * reader over it, advances to the root start tag and hands the reader to
 * {@code search}. The reader is closed whether or not parsing succeeds.
 *
 * @param args ignored
 * @throws Exception if the reader cannot be created or parsing fails
 */
public static void main(String[] args) throws Exception {
    final StringBuilder doc = new StringBuilder();
    doc.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
    doc.append("<MzIdentML id=\"MS-GF+\">\n");
    doc.append("    <SequenceCollection xmlns=\"http://psidev.info/psi/pi/mzIdentML/1.1\">\n");
    doc.append("        <DBSequence length=\"146\" id=\"DBSeq143\">\n");
    doc.append("            <cvParam cvRef=\"PSI-MS\" accession=\"MS:1001088\"></cvParam>\n");
    doc.append("        </DBSequence>\n");
    doc.append("        <Peptide id=\"Pep7\">\n");
    doc.append("            <PeptideSequence>MFLSFPTTK</PeptideSequence>\n");
    doc.append("            <Modification location=\"1\" monoisotopicMassDelta=\"15.994915\">\n");
    doc.append("                <cvParam cvRef=\"UNIMOD\" accession=\"UNIMOD:35\" name=\"Oxidation\"></cvParam>\n");
    doc.append("            </Modification>\n");
    doc.append("        </Peptide>\n");
    doc.append("        <PeptideEvidence dBSequence_ref=\"DBSeq143\" id=\"PepEv_160_1_18\"></PeptideEvidence>\n");
    doc.append("        <PeptideEvidence dBSequence_ref=\"DBSeq143\" id=\"PepEv_275_8_133\"></PeptideEvidence>\n");
    doc.append("    </SequenceCollection>\n");
    doc.append("</MzIdentML>");

    final XMLInputFactory factory = XMLInputFactory.newFactory();
    final StringReader source = new StringReader(doc.toString());
    final XMLStreamReader reader = factory.createXMLStreamReader(source);
    try {
        // Move onto the root element's start tag, as search() expects.
        reader.nextTag();
        search(reader);
    } finally {
        reader.close();
    }
}
/**
 * Walks the children of the current element looking for the sections of
 * interest ({@code DBSequence}, {@code Peptide}, {@code PeptideEvidence}).
 * Each one found is flattened into an insertion-ordered map by
 * {@code collectProps} and printed as {@code name: map}; any other child
 * element is searched recursively.
 *
 * <p>The reader must be on a START_ELEMENT upon entry, and will be on the
 * matching END_ELEMENT on return.
 *
 * @param reader StAX reader positioned on a start tag
 * @throws XMLStreamException if the underlying reader reports a parse error
 */
private static void search(XMLStreamReader reader) throws XMLStreamException {
    assert reader.getEventType() == XMLStreamConstants.START_ELEMENT;
    for (int event = reader.nextTag();
            event == XMLStreamConstants.START_ELEMENT;
            event = reader.nextTag()) {
        final String tag = reader.getLocalName();
        final boolean isSection = "DBSequence".equals(tag)
                || "Peptide".equals(tag)
                || "PeptideEvidence".equals(tag);
        if (!isSection) {
            // Not a section we report on: descend and keep looking.
            search(reader);
            continue;
        }
        final Map<String, String> collected = new LinkedHashMap<>();
        collectProps(reader, collected);
        System.out.println(tag + ": " + collected);
    }
}
/**
 * Copies the attributes of the current element into {@code props}, then
 * either stores the element's text (for {@code PeptideSequence}, keyed by the
 * element name) or recurses into each child element so that their attributes
 * land in the same map. A later entry with the same key replaces an earlier one.
 *
 * <p>The reader must be on a START_ELEMENT upon entry, and will be on the
 * matching END_ELEMENT on return.
 *
 * @param reader StAX reader positioned on a start tag
 * @param props  map receiving attribute-name/value pairs
 * @throws XMLStreamException if the underlying reader reports a parse error
 */
private static void collectProps(XMLStreamReader reader, Map<String, String> props) throws XMLStreamException {
    assert reader.getEventType() == XMLStreamConstants.START_ELEMENT;

    final int attrCount = reader.getAttributeCount();
    int idx = 0;
    while (idx < attrCount) {
        props.put(reader.getAttributeLocalName(idx), reader.getAttributeValue(idx));
        idx++;
    }

    final String element = reader.getLocalName();
    if ("PeptideSequence".equals(element)) {
        // Text-only element: getElementText() consumes through its end tag.
        props.put(element, reader.getElementText());
        return;
    }

    // Every other element is treated as containing only child elements.
    while (reader.nextTag() == XMLStreamConstants.START_ELEMENT) {
        collectProps(reader, props);
    }
}

OUTPUT

DBSequence: {length=146, id=DBSeq143, cvRef=PSI-MS, accession=MS:1001088}
Peptide: {id=Pep7, PeptideSequence=MFLSFPTTK, location=1, monoisotopicMassDelta=15.994915, cvRef=UNIMOD, accession=UNIMOD:35, name=Oxidation}
PeptideEvidence: {dBSequence_ref=DBSeq143, id=PepEv_160_1_18}
PeptideEvidence: {dBSequence_ref=DBSeq143, id=PepEv_275_8_133}

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM