簡體   English   中英

使用 XmlReader 和 XmlWriter 合並 XML 文件

[英]Merge XML files using XmlReader and XmlWriter

我正在嘗試使用 XmlReader 和 XmlWriter 將多個 XML 文件合並為一個文件,但我的最終文件只包含最后一個文件中的數據。

我使用 XmlReader 和 XmlWriter,因為要合並的 XML 文件很大。

我在下面的代碼中做錯了什么?

/// <summary>
/// Merges every XML file in <c>C:\Temp\</c> (except the output itself) into
/// <c>C:\Temp\_all.xml</c> using streaming readers/writers so large inputs are
/// never loaded into memory as a whole.
/// </summary>
/// <remarks>
/// The root <c>CYPHS:CYPHS</c> element, the <c>CYP000</c> container and the
/// <c>C000*</c> header elements are taken from the first file only. Every other
/// element found below <c>CYP000</c> is copied from every file.
/// </remarks>
class Program
    {
        /// <summary>
        /// Program entry point: performs the merge, prints "Done!" and waits for Enter.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string folder = @"C:\Temp\";
            string output = folder + "_all.xml";
            Encoding readEncoding = System.Text.Encoding.Default;

            XmlWriterSettings writerSettings = new XmlWriterSettings();
            writerSettings.Encoding = Encoding.UTF8;
            writerSettings.ConformanceLevel = ConformanceLevel.Fragment;

            // Take the list of inputs before the output file is created.
            FileInfo[] inputFiles = new DirectoryInfo(folder).GetFiles("*.xml").Where(f => f.Name != "_all.xml").ToArray();

            // XmlWriter does not close the underlying TextWriter by default, so the
            // StreamWriter needs its own using block to release the file handle.
            using (StreamWriter textWriter = new StreamWriter(output, false))
            using (XmlWriter writer = XmlWriter.Create(textWriter, writerSettings))
            {
                bool firstFile = true;
                int openElements = 0; // start elements written by us that still need an end element

                foreach (FileInfo file in inputFiles)
                {
                    using (StreamReader textReader = new StreamReader(file.FullName, readEncoding))
                    using (XmlReader reader = XmlReader.Create(textReader))
                    {
                        // WriteNode and Skip leave the reader on the node that follows the
                        // element they consumed. Calling Read() again at that point would
                        // skip that node, so remember that the reader is already positioned.
                        bool alreadyAdvanced = false;

                        while (alreadyAdvanced || reader.Read())
                        {
                            alreadyAdvanced = false;

                            if (reader.NodeType != XmlNodeType.Element)
                            {
                                continue;
                            }

                            if (reader.Name == "CYPHS:CYPHS")
                            {
                                // Root element: only emitted once, from the first file.
                                if (firstFile)
                                {
                                    writer.WriteStartElement(reader.Prefix, reader.LocalName, reader.NamespaceURI);
                                    writer.WriteAttributes(reader, true);
                                    openElements++;
                                }
                            }
                            else if (reader.Name == "CYP000")
                            {
                                // Container element: only emitted once, from the first file.
                                if (firstFile)
                                {
                                    writer.WriteStartElement(reader.Name);
                                    openElements++;
                                }
                            }
                            else if (reader.Name.StartsWith("C000", StringComparison.Ordinal))
                            {
                                // Header elements: copied from the first file, skipped in the others.
                                if (firstFile)
                                {
                                    writer.WriteNode(reader, false);
                                }
                                else
                                {
                                    reader.Skip();
                                }

                                alreadyAdvanced = true;
                            }
                            else
                            {
                                // Data elements are copied from EVERY file. The original code
                                // required !firstFile here, which dropped the first file's data.
                                writer.WriteNode(reader, false);
                                alreadyAdvanced = true;
                            }
                        }
                    }

                    firstFile = false;
                }

                // Close only what was actually opened, so an empty folder does not throw.
                while (openElements-- > 0)
                {
                    writer.WriteEndElement();
                }
            }

            Console.WriteLine("Done!");
            Console.ReadLine();
        }
    }

文件 1

<CYPHS:CYPHS xsi:schemaLocation="http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5 CYPHSCYPHS_XMLSchema-v1-5.xsd"
xmlns:CYPHS="http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
  <CYP000>
    <C000010>File 1</C000010>
    <CYP001>
      <C001901>File 1</C001901>
      <CYP101>
        <C101902>File 1</C101902>
        <CYP102>
          <C102902>File 1</C102902>
        </CYP102>
      </CYP101>
      <CYP002>
        <C002901>File 1</C002901>
      </CYP002>
    </CYP001>
  </CYP000>
</CYPHS:CYPHS>

文件 2

<CYPHS:CYPHS xsi:schemaLocation="http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5 CYPHSCYPHS_XMLSchema-v1-5.xsd"
xmlns:CYPHS="http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
  <CYP000>
    <C000010>File 2</C000010>
    <CYP001>
      <C001901>File 2</C001901>
      <CYP101>
        <C101902>File 2</C101902>
        <CYP102>
          <C102902>File 2</C102902>
        </CYP102>
      </CYP101>
      <CYP002>
        <C002901>File 2</C002901>
      </CYP002>
    </CYP001>
  </CYP000>
</CYPHS:CYPHS>

應該像這樣合並到文件中:

<CYPHS:CYPHS xsi:schemaLocation="http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5 CYPHSCYPHS_XMLSchema-v1-5.xsd"
xmlns:CYPHS="http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
  <CYP000>
    <C000010>File 1</C000010>
    <CYP001>
      <C001901>File 1</C001901>
      <CYP101>
        <C101902>File 1</C101902>
        <CYP102>
          <C102902>File 1</C102902>
        </CYP102>
      </CYP101>
      <CYP002>
        <C002901>File 1</C002901>
      </CYP002>
    </CYP001>
    <CYP001>
      <C001901>File 2</C001901>
      <CYP101>
        <C101902>File 2</C101902>
        <CYP102>
          <C102902>File 2</C102902>
        </CYP102>
      </CYP101>
      <CYP002>
        <C002901>File 2</C002901>
      </CYP002>
    </CYP001>
  </CYP000>
</CYPHS:CYPHS>

像這樣

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Xml;
using System.Xml.Linq;


namespace ConsoleApplication53
{
    /// <summary>
    /// Demonstrates merging two in-memory XML documents with LINQ to XML.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Builds the two sample documents and merges the second into the first.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string file1 =
                "<CYPHS:CYPHS xsi:schemaLocation=\"http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5 CYPHSCYPHS_XMLSchema-v1-5.xsd\"" +
                    " xmlns:CYPHS=\"http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5\"" +
                    " xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\">" +
                      "<CYP000>" +
                        "<C000010>File 1</C000010>" +
                        "<CYP001>" +
                          "<C001901>File 1</C001901>" +
                          "<CYP101>" +
                            "<C101902>File 1</C101902>" +
                            "<CYP102>" +
                              "<C102902>File 1</C102902>" +
                            "</CYP102>" +
                          "</CYP101>" +
                          "<CYP002>" +
                            "<C002901>File 1</C002901>" +
                          "</CYP002>" +
                        "</CYP001>" +
                      "</CYP000>" +
                    "</CYPHS:CYPHS>";

            string file2 =
                "<CYPHS:CYPHS xsi:schemaLocation=\"http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5 CYPHSCYPHS_XMLSchema-v1-5.xsd\"" +
                " xmlns:CYPHS=\"http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5\"" +
                " xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\">" +
                  "<CYP000>" +
                    "<C000010>File 2</C000010>" +
                    "<CYP001>" +
                      "<C001901>File 2</C001901>" +
                      "<CYP101>" +
                        "<C101902>File 2</C101902>" +
                        "<CYP102>" +
                          "<C102902>File 2</C102902>" +
                        "</CYP102>" +
                      "</CYP101>" +
                      "<CYP002>" +
                        "<C002901>File 2</C002901>" +
                      "</CYP002>" +
                    "</CYP001>" +
                  "</CYP000>" +
                "</CYPHS:CYPHS>";

            XDocument merged = Merge(file1, file2);
        }

        /// <summary>
        /// Parses both documents and appends the data elements of the second
        /// document's <c>CYP000</c> element to the first document's <c>CYP000</c>.
        /// </summary>
        /// <param name="firstXml">XML text of the document that receives the elements.</param>
        /// <param name="secondXml">XML text of the document whose elements are appended.</param>
        /// <returns>The first document, now containing the appended elements.</returns>
        /// <exception cref="InvalidOperationException">
        /// Either document has no <c>CYP000</c> element.
        /// </exception>
        internal static XDocument Merge(string firstXml, string secondXml)
        {
            XDocument target = XDocument.Parse(firstXml);
            XDocument source = XDocument.Parse(secondXml);

            XElement targetContainer = target.Descendants("CYP000").FirstOrDefault();
            XElement sourceContainer = source.Descendants("CYP000").FirstOrDefault();
            if (targetContainer == null || sourceContainer == null)
            {
                throw new InvalidOperationException("Both documents must contain a CYP000 element.");
            }

            // Only the DIRECT children are appended. Descendants() would also add every
            // nested element a second time as a flat child. The C000* header elements
            // stay with the first document only.
            targetContainer.Add(
                sourceContainer.Elements()
                               .Where(e => !e.Name.LocalName.StartsWith("C000", StringComparison.Ordinal)));

            return target;
        }

    }
}

我不完全確定您哪里出錯了,但在合並 XML 文件時,檢查 XmlReader 的 Depth、LocalName 和 NamespaceURI 屬性似乎最直接。我強烈建議不要對命名空間前綴進行硬編碼,因為前綴可以替換為任何其他前綴,而不會改變 XML 文件的語義。

需要注意的一件事:XmlWriter.WriteNode(XmlReader, bool) 會將讀取器前進到下一個節點的開頭,因此如果您隨后調用 Read(),而文件中的元素之間沒有空白,就會跳過下一個元素。考慮到這一點,直接使用 XmlReader 時,最好分別用帶縮進和不帶縮進的文件進行測試。

因此:

/// <summary>
/// Streams several XML documents into a single output, keeping one copy of the
/// root <c>CYPHS</c> element and the <c>CYP000</c> container and copying the
/// depth-2 elements of every input below them.
/// </summary>
public class XmlConcatenate
{
    /// <summary>
    /// Merges every <c>*.xml</c> file in <c>C:\Temp\</c> whose name is not
    /// <c>_all.xml</c> into <c>C:\Temp\_all.xml</c>.
    /// </summary>
    public static void ConcatenateAllFiles()
    {
        string folder = "C:\\Temp\\";
        string output = folder + "_all.xml";
        Encoding readEncoding = System.Text.Encoding.Default; // WHY NOT Encoding.UTF8 !?

        // The Select calls are deferred: each StreamReader is only created when
        // Concatenate enumerates it, and Concatenate disposes it after use.
        var files = new DirectoryInfo(folder).GetFiles("*.xml").Where(f => f.Name != "_all.xml").Select(f => f.FullName).Select(n => (TextReader)new StreamReader(n, readEncoding));

        using (var textWriter = new StreamWriter(output, false))
        {
            Concatenate(files, textWriter);
        }
    }

    /// <summary>
    /// Reads each input in order and writes the merged XML to <paramref name="output"/>.
    /// </summary>
    /// <param name="inputs">Readers over the source documents; each one is disposed after it has been read.</param>
    /// <param name="output">Text writer that receives the merged XML.</param>
    public static void Concatenate(IEnumerable<TextReader> inputs, TextWriter output)
    {
        var writerSettings = new XmlWriterSettings() { Encoding = Encoding.UTF8, ConformanceLevel = ConformanceLevel.Fragment };
        // Whitespace nodes seen since the last element; replayed before the next element that is written.
        var whiteSpace = new StringBuilder();
        // Number of non-newline whitespace characters that preceded the CYP000 element.
        int indent = 0;

        using (var writer = XmlWriter.Create(output, writerSettings))
        {
            // Number of start elements (root, then CYP000) written so far that are still open.
            var writeDepth = 0;
            // True until the first C000-prefixed depth-2 element has been copied.
            var first = true;

            foreach (var input in inputs)
            {
                using (input)
                using (var reader = XmlReader.Create(input))
                {
                    // Set after WriteNode, which already leaves the reader on the following node,
                    // so the loop must not call Read() again before processing that node.
                    bool alreadyRead = false;
                    while (!reader.EOF && (alreadyRead || reader.Read()))
                    {
                        alreadyRead = false;
                        switch (reader.NodeType)
                        {
                            case XmlNodeType.Element:
                                {
                                    if (reader.Depth == 0 && reader.LocalName == "CYPHS" && reader.NamespaceURI == "http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5")
                                    {
                                        // Root element: written only while nothing has been opened yet.
                                        if (writeDepth == 0)
                                        {
                                            writer.WriteWhitespace(whiteSpace.ToString());
                                            writer.WriteStartElement(reader.Prefix, reader.LocalName, reader.NamespaceURI);
                                            writer.WriteAttributes(reader, true);
                                            writeDepth++;
                                        }
                                    }
                                    else if (reader.Depth == 1 && reader.LocalName == "CYP000" && reader.NamespaceURI == "")
                                    {
                                        // Container element: written only directly after the root was opened.
                                        if (writeDepth == 1)
                                        {
                                            indent = whiteSpace.ToString().Replace("\n", "").Replace("\r", "").Length;

                                            writer.WriteWhitespace(whiteSpace.ToString());
                                            writer.WriteStartElement(reader.LocalName, reader.NamespaceURI);
                                            writeDepth++;
                                        }
                                    }
                                    else if (reader.Depth == 2)
                                    {
                                        if (reader.LocalName.StartsWith("C000") && reader.NamespaceURI == "")
                                        {
                                            // C000-prefixed element: only the very first one encountered is copied.
                                            if (first)
                                            {
                                                first = false;
                                                writer.WriteWhitespace(whiteSpace.ToString());
                                                writer.WriteNode(reader, false);
                                                alreadyRead = true;
                                            }
                                        }
                                        else
                                        {
                                            // Any other depth-2 element is copied in full from every input.
                                            writer.WriteWhitespace(whiteSpace.ToString());
                                            writer.WriteNode(reader, false);
                                            alreadyRead = true;
                                        }
                                    }
                                    whiteSpace.Length = 0; // Clear accumulated whitespace.
                                }
                                break;
                            case XmlNodeType.Whitespace:
                                {
                                    // Buffer the whitespace; it is emitted only if the next element is written.
                                    whiteSpace.Append(reader.Value);
                                }
                                break;
                            default:
                                break;
                        }
                    }
                }
            }
            // Close the elements that were opened, innermost first. writeDepth has already been
            // decremented when the body runs, so the indentation shrinks with each level.
            while (writeDepth-- > 0)
            {
                if (indent > 0)
                    writer.WriteWhitespace("\n" + new string(' ', indent * writeDepth));
                writer.WriteEndElement();
            }
        }
    }
}

合並間距有點麻煩,如果您不關心保留間距,則可以大大簡化代碼。

可運行的在線示例(fiddle)

您可能不想使用System.Text.Encoding.Default來讀取 XML 文件。 從文檔:

由於所有默認編碼都會丟失數據,因此您可以改用 UTF8。 UTF-8 在 U+00 到 U+7F 范圍內通常是相同的,但可以編碼其他字符而不會丟失。

不同的解決方案可能是使用自定義XmlReader實現在讀取文件時連接文件。 然后使用這個自定義閱讀器和一個XmlWriter來創建合並的文件。

自定義 XmlReader 為每個文件保留一個內部 XmlReader。文檔的開頭和結尾部分僅從第一個文件中讀取;從其他文件中只讀取相關的(待追加的)元素。

  1. 為第一個文件創建一個 XmlReader
  2. 讀到應該追加元素的位置
  3. 對於每個后續文件
    1. 創建一個新的 XmlReader
    2. 跳到第一個相關元素
    3. 讀取相關元素
    4. 釋放(Dispose)該讀取器
  4. 讀取第一個文件的其余部分(繼續使用步驟 1 中的讀取器)
  5. 釋放(Dispose)該讀取器

示例實現

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Xml;

/// <summary>
/// Concatenates several XML documents that share the same envelope into one
/// document by presenting them to an <see cref="XmlWriter"/> as a single
/// virtual <see cref="XmlReader"/>.
/// </summary>
public static class XmlConcatenator
{
    // first: pause reading at the end of this element, will resume after subsequent streams are read
    // subsequent: stop reading at the end of this element
    private const string StopAtEndOf = "CYP000";

    // first: (ignores this)
    // subsequent: skip ahead to the first instance of this element
    private const string ResumeAtFirst = "CYP001";

    private static readonly XmlReaderSettings XmlReaderSettings = new XmlReaderSettings() { DtdProcessing = DtdProcessing.Ignore };
    private static readonly XmlWriterSettings XmlWriterSettings = new XmlWriterSettings() { Encoding = Encoding.UTF8, Indent = true };

    /// <summary>
    /// Writes the merged document to <paramref name="outStream"/>.
    /// </summary>
    /// <param name="outStream">Destination stream; it is not closed by this method.</param>
    /// <param name="fileStreams">
    /// At least two input streams. The first one supplies the envelope; from the
    /// others only the content from the first <c>CYP001</c> element up to the end
    /// of <c>CYP000</c> is taken.
    /// </param>
    /// <exception cref="InvalidOperationException">
    /// <paramref name="fileStreams"/> is null or contains fewer than two streams.
    /// </exception>
    public static void Concat(Stream outStream, Stream[] fileStreams)
    {
        using var reader = XmlConcatReader.Create(fileStreams);
        using var writer = XmlWriter.Create(outStream, XmlWriterSettings);
        writer.WriteNode(reader, true);
    }

    /// <summary>
    /// An <see cref="XmlReader"/> that delegates to one inner reader per input
    /// stream and switches between them at the <c>CYP000</c> end tag.
    /// </summary>
    private class XmlConcatReader : XmlReader
    {
        private readonly XmlReader _firstReader;
        private readonly IEnumerator<Stream> _streams;
        private XmlReader _currentReader;

        private XmlConcatReader(Stream first, IEnumerable<Stream> streams)
        {
            _firstReader = XmlReader.Create(first, XmlReaderSettings);
            _streams = streams.GetEnumerator();
            _currentReader = _firstReader;
        }

        /// <summary>
        /// Creates the concatenating reader over <paramref name="inputStreams"/>.
        /// </summary>
        /// <exception cref="InvalidOperationException">Fewer than two streams were supplied.</exception>
        public static XmlReader Create(Stream[] inputStreams)
        {
            if (!(inputStreams?.Length > 1))
            {
                throw new InvalidOperationException($"{nameof(inputStreams)} must contain at least two streams");
            }

            return new XmlConcatReader(inputStreams[0], inputStreams.Skip(1));
        }

        /// <summary>
        /// Advances to the next node. When the current inner reader reaches the
        /// end tag of <c>CYP000</c>, reading continues at the first <c>CYP001</c>
        /// element of the next stream that has one. Once no streams are left, the
        /// first reader (still positioned on its own end tag) is resumed.
        /// </summary>
        public override bool Read()
        {
            var hasNode = _currentReader.Read();
            var atStopTag = _currentReader.NodeType == XmlNodeType.EndElement && _currentReader.LocalName == StopAtEndOf;
            if (!atStopTag)
            {
                return hasNode;
            }

            // note: _firstReader is disposed at the end. See: Dispose(bool)
            if (!ReferenceEquals(_currentReader, _firstReader))
            {
                _currentReader.Dispose();
                _currentReader = _firstReader;
            }

            // Keep trying streams until one contains the resume element. Previously a
            // stream without it left the reader at EOF, which ended the whole merge
            // and silently dropped every remaining stream.
            while (_streams.MoveNext())
            {
                var candidate = XmlReader.Create(_streams.Current, XmlReaderSettings);
                while (candidate.Read())
                {
                    if (candidate.NodeType == XmlNodeType.Element && candidate.LocalName == ResumeAtFirst)
                    {
                        _currentReader = candidate;
                        return true;
                    }
                }

                // Nothing to append from this stream.
                candidate.Dispose();
            }

            // All subsequent streams consumed: expose the first reader's pending end tag.
            _currentReader = _firstReader;
            return true;
        }

        protected override void Dispose(bool disposing)
        {
            if (disposing)
            {
                // A subsequent reader may still be active if reading was aborted midway.
                if (!ReferenceEquals(_currentReader, _firstReader))
                {
                    _currentReader?.Dispose();
                }

                _firstReader?.Dispose();
                _streams?.Dispose();
            }

            base.Dispose(disposing);
        }

        // Everything below simply forwards to whichever inner reader is currently active.
        public override XmlNodeType NodeType => _currentReader.NodeType;
        public override string LocalName => _currentReader.LocalName;
        public override string NamespaceURI => _currentReader.NamespaceURI;
        public override string Prefix => _currentReader.Prefix;
        public override string Value => _currentReader.Value;
        public override int Depth => _currentReader.Depth;
        public override string BaseURI => _currentReader.BaseURI;
        public override bool IsEmptyElement => _currentReader.IsEmptyElement;
        public override int AttributeCount => _currentReader.AttributeCount;
        public override bool EOF => _currentReader.EOF;
        public override ReadState ReadState => _currentReader.ReadState;
        public override XmlNameTable NameTable => _currentReader.NameTable;
        public override string GetAttribute(string name) => _currentReader.GetAttribute(name);
        public override string GetAttribute(string name, string namespaceURI) => _currentReader.GetAttribute(name, namespaceURI);
        public override string GetAttribute(int i) => _currentReader.GetAttribute(i);
        public override string LookupNamespace(string prefix) => _currentReader.LookupNamespace(prefix);
        public override bool MoveToAttribute(string name) => _currentReader.MoveToAttribute(name);
        public override bool MoveToAttribute(string name, string ns) => _currentReader.MoveToAttribute(name, ns);
        public override bool MoveToElement() => _currentReader.MoveToElement();
        public override bool MoveToFirstAttribute() => _currentReader.MoveToFirstAttribute();
        public override bool MoveToNextAttribute() => _currentReader.MoveToNextAttribute();
        public override bool ReadAttributeValue() => _currentReader.ReadAttributeValue();
        public override void ResolveEntity() => _currentReader.ResolveEntity();
    }
}

使用示例

using System.IO;
using System.Linq;

/// <summary>
/// Usage sample: merges <c>in1.xml</c> and <c>in2.xml</c> into <c>output.xml</c>.
/// </summary>
internal static class Program
{
    private static void Main()
    {
        var input = new[] { "in1.xml", "in2.xml" };
        var output = "output.xml";

        var inputStreams = new Stream[input.Length];
        try
        {
            // Inputs are only read, so open them read-only (this also allows shared reads).
            for (var i = 0; i < input.Length; i++)
            {
                inputStreams[i] = File.OpenRead(input[i]);
            }

            using var outputStream = File.Create(output);

            XmlConcatenator.Concat(outputStream, inputStreams);
        }
        finally
        {
            // Runs even when opening a later file or the merge itself throws.
            // Entries that were never opened are still null.
            foreach (var stream in inputStreams)
            {
                stream?.Dispose();
            }
        }
    }
}

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM