简体   繁体   中英

How can I split large xml into small chunks using VTDGenHuge?

I want to split a large XML file into small chunks. I am using VTDGen to split the XML file into small chunks, and it works fine for file sizes < 2 GB. However, VTD-XML parses the XML in memory, and I don't want to load the whole XML file into memory, so I am trying to use memory mapping via VTDGenHuge.

The code works fine with VTDGen, but when I use VTDGenHuge it does not work.

        // Fragment that splits the <Employee> elements of one large XML file into
        // several smaller output files, each wrapped by `prefix` and `suffix`.
        // NOTE(review): `prefix` contains only the XML declaration and a newline,
        // while `suffix` closes </Employees>; an opening <Employees> tag appears
        // to be missing from the prefix - confirm against the intended output.
        String prefix = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"+"\n";
    String suffix = "\n</Employees>\n";
    try {

        VTDGenHuge vg = new VTDGenHuge();
        // Parse the input file, requesting the MEM_MAPPED mode of VTDGenHuge.
        // NOTE(review): the boolean argument is presumably the namespace-awareness
        // flag - verify against the VTD-XML extended API.
        if (vg.parseFile("C:\\Users\\abc\\Desktop\\latestxml\\Input_1.xml", true,VTDGenHuge.MEM_MAPPED)) {
            // Number of Employee fragments written per output file.
            // NOTE(review): no initializer value is present, so this line does
            // not compile as shown.
            int splitBy = ;
           System.out.println("Started time"+ new Date());
            VTDNavHuge vn = vg.getNav();               
            AutoPilotHuge ap = new AutoPilotHuge(vn);
            ap.selectXPath("/Employees/Employee");
            // Collects one fragment descriptor per matched Employee element.
            FastLongBuffer flb = new FastLongBuffer(4);
            int i;
            // The author reports that getBytes() returns null here when the
            // document was parsed with VTDGenHuge, which makes the later
            // fos.write(xml, ...) call fail.
            byte[] xml = vn.getXML().getBytes();          
            // Walk every XPath match and remember where its fragment lives.
            // NOTE(review): with the Huge API getElementFragment() may not return
            // a single packed long as in standard VTD-XML - TODO confirm.
            while ((i = ap.evalXPath()) != -1) {  
                flb.append(vn.getElementFragment());
            }
            int size = flb.size();
            if (size != 0) {
                File fo = null;
                FileOutputStream fos = null;
                for (int k = 0; k < size; k++) {
                   // Every splitBy-th fragment: finish the currently open file.
                   if (k % splitBy == 0) {
                        if (fo != null) {
                            fos.write(suffix.getBytes());
                            fos.close();
                            fo = null;
                        }
                    }
                    // No file open: start a new one named after the index of
                    // its first fragment, and write the prefix.
                    if (fo == null) {
                        fo = new File("C:\\Users\\abc\\Desktop\\Test\\xml\\"+"out" + k + ".xml");
                        fos = new FileOutputStream(fo);
                        fos.write(prefix.getBytes());
                    }
                    fos.write("\n".getBytes());                       
                    // Copy the fragment bytes: lower 32 bits are used as the
                    // offset into `xml`, upper 32 bits as the length.
                    fos.write(xml, flb.lower32At(k), flb.upper32At(k));
                }
                // Close the last (possibly partial) output file.
                if (fo != null) {                       
                    fos.write(suffix.getBytes());                  
                    fos.close();
                    fo = null;
                }
            }

        }
    } catch (Exception e) {
        // Any failure is only printed; the output stream is not closed on this
        // path.
        e.printStackTrace();
    }

I get a null value at "byte[] xml = vn.getXML().getBytes();". When I print vn.getXML() with System.out.println, I get an object, but calling getBytes() on it returns null, and I don't know why. However, byteAt(x), where x is any long value, does return a value.

My xml file is:

<?xml version="1.0" encoding="UTF-8"?>
<Employees>
<Employee id="1">
    <age>29</age>
    <name>Pankaj</name>
    <gender>Male</gender>
    <role>Java Developer</role>
</Employee>
<Employee id="2">
    <age>35</age>
    <name>Lisa</name>
    <gender>Female</gender>
    <role>CEO</role>
</Employee>
<Employee id="3">
    <age>40</age>
    <name>Tom</name>
    <gender>Male</gender>
    <role>Manager</role>
</Employee>
    <Employee id="1">
    <age>29</age>
    <name>Pankaj</name>
    <gender>Male</gender>
    <role>Java Developer</role>
</Employee>
<Employee id="2">
    <age>35</age>
    <name>Lisa</name>
    <gender>Female</gender>
    <role>CEO</role>
</Employee>
<Employee id="3">
    <age>40</age>
    <name>Tom</name>
    <gender>Male</gender>
    <role>Manager</role>
</Employee>
</Employees>

I want output like this:

<?xml version="1.0" encoding="UTF-8"?>
 <Employees>
<Employee id="1">
    <age>29</age>
    <name>Pankaj</name>
    <gender>Male</gender>
    <role>Java Developer</role>
</Employee>
<Employee id="2">
    <age>35</age>
    <name>Lisa</name>
    <gender>Female</gender>
    <role>CEO</role>
</Employee>
<Employee id="3">
    <age>40</age>
    <name>Tom</name>
    <gender>Male</gender>
    <role>Manager</role>    
</Employee>
</Employees>

<?xml version="1.0" encoding="UTF-8"?>
<Employees> 
    <Employee id="1">
    <age>29</age>
    <name>Pankaj</name>
    <gender>Male</gender>
    <role>Java Developer</role>
</Employee>
<Employee id="2">
    <age>35</age>
    <name>Lisa</name>
    <gender>Female</gender>
    <role>CEO</role>
</Employee>
<Employee id="3">
    <age>40</age>
    <name>Tom</name>
    <gender>Male</gender>
    <role>Manager</role>
</Employee>
</Employees>

I think that vn.getXML() in extended VTD-XML returns an IByteBuffer interface object, which is different from standard VTD-XML. You can call the interface method called writeOutputToFile() and pass it the offset and value parameters. Sorry that the documentation for this part is lacking, but that is the basic lowdown.

Try this

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Xml;
using System.Xml.Linq;

namespace ConsoleApplication1
{
    /// <summary>
    /// Splits a large <c>Employees</c> XML file into smaller files that each hold
    /// at most <see cref="OUTPUT_ELEMENTS"/> <c>Employee</c> elements. The input is
    /// streamed with an <see cref="XmlReader"/>, so only the chunk currently being
    /// built is held in memory.
    /// </summary>
    class Program
    {
        const string FILENAME = @"c:\temp\test.xml";
        const int OUTPUT_ELEMENTS = 3;

        // Output file name pattern; {0} is the 1-based chunk number.
        const string OUTPUT_FORMAT = @"c:\temp\test{0}.xml";

        /// <summary>
        /// Reads <see cref="FILENAME"/> and writes the numbered chunk files
        /// (test1.xml, test2.xml, ...), including a final partial chunk if the
        /// number of employees is not a multiple of <see cref="OUTPUT_ELEMENTS"/>.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            XmlReaderSettings settings = new XmlReaderSettings() { IgnoreWhitespace = true };

            // Dispose the reader so the input file handle is released even if
            // parsing or saving throws.
            using (XmlReader reader = XmlReader.Create(FILENAME, settings))
            {
                int chunkNumber = 0;   // number of chunk files written so far
                int pending = 0;       // employees in the chunk being built
                XDocument doc = null;

                while (!reader.EOF)
                {
                    // Skip everything that is not the start of an Employee
                    // element (root tag, comments, unrelated siblings, end tags)
                    // instead of aborting the whole split on it.
                    if (reader.NodeType != XmlNodeType.Element || reader.Name != "Employee")
                    {
                        reader.Read();
                        continue;
                    }

                    if (doc == null)
                    {
                        doc = CreateEmptyDocument();
                    }

                    // XNode.ReadFrom consumes the whole element and leaves the
                    // reader on the node that follows its end tag, so no extra
                    // Read() is needed here.
                    doc.Root.Add(XNode.ReadFrom(reader));
                    pending++;

                    if (pending == OUTPUT_ELEMENTS)
                    {
                        chunkNumber++;
                        SaveChunk(doc, chunkNumber);
                        doc = null;
                        pending = 0;
                    }
                }

                // Leftover employees form one more chunk. It gets the NEXT
                // number so it cannot overwrite the last full chunk.
                if (doc != null)
                {
                    SaveChunk(doc, chunkNumber + 1);
                }
            }
        }

        /// <summary>Creates an output document containing only an empty Employees root.</summary>
        private static XDocument CreateEmptyDocument()
        {
            return new XDocument(
                new XDeclaration("1.0", "UTF-8", null),
                new XElement("Employees"));
        }

        /// <summary>Saves one chunk document under its numbered output file name.</summary>
        private static void SaveChunk(XDocument doc, int chunkNumber)
        {
            doc.Save(string.Format(OUTPUT_FORMAT, chunkNumber));
        }
    }
}
​

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM