[英]XMLReader to XML file using XMLWriter WriteNode - Very Slow for Large XML
[英]Merge XML files using XmlReader and XmlWriter
我正在嘗試使用 XmlReader 和 XmlWriter 將多個 XML 文件合並為一個文件,但我的最終文件只包含最後一個文件中的數據。
我之所以使用 XmlReader 和 XmlWriter,是因為要合並的 XML 文件很大。
我在下面的代碼中做錯了什麼?
/// <summary>
/// Console program from the question: streams every "*.xml" file found in the
/// folder (except the output file itself) through an XmlReader and writes the
/// selected elements into a single "_all.xml" file with an XmlWriter.
/// </summary>
class Program
{
/// <summary>
/// Entry point. Copies the root element, the CYP000 start tag and the C000*
/// elements from the first file only, and copies all other elements from the
/// later files.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
string folder = @"C:\Temp\";
string output = folder + "_all.xml";
Encoding readEncoding = System.Text.Encoding.Default;
XmlWriterSettings writerSettings = new XmlWriterSettings();
writerSettings.Encoding = Encoding.UTF8;
writerSettings.ConformanceLevel = ConformanceLevel.Fragment;
// NOTE(review): neither the writer nor the readers below are wrapped in
// "using"; they are released only by the explicit Close() calls, which are
// skipped if an exception is thrown while reading or writing.
XmlWriter writer = XmlWriter.Create(new StreamWriter(output, false), writerSettings);
// True while the first input file is being processed.
bool firstFile = true;
// The output file lives in the same folder, so it is filtered out by name.
foreach (FileInfo file in new DirectoryInfo(folder).GetFiles("*.xml").Where(f => f.Name != "_all.xml"))
{
XmlReader reader = XmlReader.Create(new StreamReader(file.FullName, readEncoding));
while(reader.Read())
{
switch (reader.NodeType)
{
case XmlNodeType.Element:
// NOTE(review): the root is matched on its qualified name, so the
// "CYPHS" prefix is hard-coded; a file using another prefix for the
// same namespace would not match.
if (firstFile && reader.Name == "CYPHS:CYPHS")
{
// Open the root element and copy its attributes; it is closed by the
// WriteEndElement calls after the loop.
writer.WriteStartElement(reader.Prefix, reader.LocalName, reader.NamespaceURI);
writer.WriteAttributes(reader, true);
}
else if (firstFile && reader.Name == "CYP000")
// Open the container element; it is closed after all files are processed.
writer.WriteStartElement(reader.Name);
else if (firstFile && reader.Name.StartsWith("C000"))
// NOTE(review): WriteNode leaves the reader positioned on the node that
// follows the copied element, and the loop condition then calls Read()
// again, so that following node is never examined. When no whitespace
// separates elements, that node is the next element - confirm against
// the input files.
writer.WriteNode(reader, false);
// For every later file: copy any element that is not the root, the
// CYP000 container or a C000* element (same WriteNode/Read caveat).
else if (!firstFile && reader.Name != "CYPHS:CYPHS" && reader.Name != "CYP000" && !reader.Name.StartsWith("C000"))
writer.WriteNode(reader, false);
break;
default:
break;
}
}
firstFile = false;
reader.Close();
}
// Close the two elements opened while reading the first file
// (CYP000, then the root).
writer.WriteEndElement();
writer.WriteEndElement();
writer.Close();
Console.WriteLine("Done!");
Console.ReadLine();
}
}
文件 1
<CYPHS:CYPHS xsi:schemaLocation="http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5 CYPHSCYPHS_XMLSchema-v1-5.xsd"
xmlns:CYPHS="http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<CYP000>
<C000010>File 1</C000010>
<CYP001>
<C001901>File 1</C001901>
<CYP101>
<C101902>File 1</C101902>
<CYP102>
<C102902>File 1</C102902>
</CYP102>
</CYP101>
<CYP002>
<C002901>File 1</C002901>
</CYP002>
</CYP001>
</CYP000>
</CYPHS:CYPHS>
文件 2
<CYPHS:CYPHS xsi:schemaLocation="http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5 CYPHSCYPHS_XMLSchema-v1-5.xsd"
xmlns:CYPHS="http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<CYP000>
<C000010>File 2</C000010>
<CYP001>
<C001901>File 2</C001901>
<CYP101>
<C101902>File 2</C101902>
<CYP102>
<C102902>File 2</C102902>
</CYP102>
</CYP101>
<CYP002>
<C002901>File 2</C002901>
</CYP002>
</CYP001>
</CYP000>
</CYPHS:CYPHS>
應該像這樣合並到文件中:
<CYPHS:CYPHS xsi:schemaLocation="http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5 CYPHSCYPHS_XMLSchema-v1-5.xsd"
xmlns:CYPHS="http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<CYP000>
<C000010>File 1</C000010>
<CYP001>
<C001901>File 1</C001901>
<CYP101>
<C101902>File 1</C101902>
<CYP102>
<C102902>File 1</C102902>
</CYP102>
</CYP101>
<CYP002>
<C002901>File 1</C002901>
</CYP002>
</CYP001>
<CYP001>
<C001901>File 2</C001901>
<CYP101>
<C101902>File 2</C101902>
<CYP102>
<C102902>File 2</C102902>
</CYP102>
</CYP101>
<CYP002>
<C002901>File 2</C002901>
</CYP002>
</CYP001>
</CYP000>
</CYPHS:CYPHS>
像這樣
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Xml;
using System.Xml.Linq;
namespace ConsoleApplication53
{
    /// <summary>
    /// In-memory (LINQ to XML) demonstration of the merge: the record elements
    /// of the second document are appended to the CYP000 element of the first.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Parses the two sample documents, appends the records of the second
        /// one to the first and prints the merged document.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        /// <exception cref="InvalidOperationException">
        /// Either document has no CYP000 element.
        /// </exception>
        static void Main(string[] args)
        {
            string file1 =
                "<CYPHS:CYPHS xsi:schemaLocation=\"http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5 CYPHSCYPHS_XMLSchema-v1-5.xsd\"" +
                " xmlns:CYPHS=\"http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5\"" +
                " xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\">" +
                "<CYP000>" +
                "<C000010>File 1</C000010>" +
                "<CYP001>" +
                "<C001901>File 1</C001901>" +
                "<CYP101>" +
                "<C101902>File 1</C101902>" +
                "<CYP102>" +
                "<C102902>File 1</C102902>" +
                "</CYP102>" +
                "</CYP101>" +
                "<CYP002>" +
                "<C002901>File 1</C002901>" +
                "</CYP002>" +
                "</CYP001>" +
                "</CYP000>" +
                "</CYPHS:CYPHS>";
            XDocument doc1 = XDocument.Parse(file1);
            XElement doc1_CYP000 = doc1.Descendants("CYP000").FirstOrDefault();

            string file2 =
                "<CYPHS:CYPHS xsi:schemaLocation=\"http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5 CYPHSCYPHS_XMLSchema-v1-5.xsd\"" +
                " xmlns:CYPHS=\"http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5\"" +
                " xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\">" +
                "<CYP000>" +
                "<C000010>File 2</C000010>" +
                "<CYP001>" +
                "<C001901>File 2</C001901>" +
                "<CYP101>" +
                "<C101902>File 2</C101902>" +
                "<CYP102>" +
                "<C102902>File 2</C102902>" +
                "</CYP102>" +
                "</CYP101>" +
                "<CYP002>" +
                "<C002901>File 2</C002901>" +
                "</CYP002>" +
                "</CYP001>" +
                "</CYP000>" +
                "</CYPHS:CYPHS>";
            XDocument doc2 = XDocument.Parse(file2);
            XElement doc2_CYP000 = doc2.Descendants("CYP000").FirstOrDefault();

            // FirstOrDefault returns null when the element is missing; fail with a
            // clear message instead of a NullReferenceException further down.
            if (doc1_CYP000 == null || doc2_CYP000 == null)
            {
                throw new InvalidOperationException("Both documents must contain a CYP000 element.");
            }

            // Append only the direct children of the second CYP000, and skip its
            // C000* header elements so the header of the first file is kept once.
            // Descendants() would add every nested node again as a flat sibling,
            // duplicating the content of each record.
            doc1_CYP000.Add(
                doc2_CYP000.Elements()
                    .Where(e => !e.Name.LocalName.StartsWith("C000", StringComparison.Ordinal)));

            // Show the merged document so the result can be checked.
            Console.WriteLine(doc1);
        }
    }
}
我不完全確定您哪裡出錯了,但在合並 XML 文件時,檢查 XmlReader 的 Depth、LocalName 和 NamespaceURI 屬性似乎最直接。我強烈建議不要對命名空間前綴進行硬編碼,因為前綴可以替換為任何其他前綴,而不會更改 XML 文件的語義。
需要注意的一件事: XmlWriter.WriteNode(XmlReader, bool) 會將讀取器前進到下一個節點的開頭,因此如果您隨後調用 Read() 並且文件中沒有空白字符,您將跳過下一個元素。考慮到這一點,直接使用 XmlReader 時,最好在有間距和沒有間距的情況下都進行測試。
因此:
/// <summary>
/// Streams several CYPHS XML documents into one: the root element, the CYP000
/// container and the C000* header elements come from the first document only;
/// every other depth-2 element is copied from all documents.
/// </summary>
public class XmlConcatenate
{
    // Expected local name and namespace of the depth-0 root element.
    private const string RootLocalName = "CYPHS";
    private const string RootNamespaceUri = "http://www.datadictionary.nhs.uk/messages/CYPHS-v1-5";

    // Local name of the depth-1 container that holds the records.
    private const string ContainerLocalName = "CYP000";

    // Depth-2 elements whose local name starts with this are header elements.
    private const string HeaderNamePrefix = "C000";

    /// <summary>
    /// Merges every "*.xml" file in C:\Temp\ (except the output file) into
    /// C:\Temp\_all.xml.
    /// </summary>
    public static void ConcatenateAllFiles()
    {
        string folder = "C:\\Temp\\";
        string output = folder + "_all.xml";
        Encoding readEncoding = System.Text.Encoding.Default; // WHY NOT Encoding.UTF8 !?

        // The readers are created lazily, one per file, as Concatenate enumerates
        // the sequence; Concatenate disposes each one when it is finished with it.
        var files = new DirectoryInfo(folder).GetFiles("*.xml").Where(f => f.Name != "_all.xml").Select(f => f.FullName).Select(n => (TextReader)new StreamReader(n, readEncoding));
        using (var textWriter = new StreamWriter(output, false))
        {
            Concatenate(files, textWriter);
        }
    }

    /// <summary>
    /// Reads each input in order and writes the merged document to
    /// <paramref name="output"/>. Each input reader is disposed after it has
    /// been read; <paramref name="output"/> is left open for the caller.
    /// </summary>
    /// <param name="inputs">The documents to merge, in output order.</param>
    /// <param name="output">Receives the merged XML.</param>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="inputs"/> or <paramref name="output"/> is null.
    /// </exception>
    public static void Concatenate(IEnumerable<TextReader> inputs, TextWriter output)
    {
        // Fail before anything is written rather than part-way through.
        if (inputs == null)
        {
            throw new System.ArgumentNullException(nameof(inputs));
        }
        if (output == null)
        {
            throw new System.ArgumentNullException(nameof(output));
        }

        var writerSettings = new XmlWriterSettings() { Encoding = Encoding.UTF8, ConformanceLevel = ConformanceLevel.Fragment };

        // Whitespace seen since the last element; replayed in front of the next
        // element that is actually written so the original indentation survives.
        var whiteSpace = new StringBuilder();
        int indent = 0;
        using (var writer = XmlWriter.Create(output, writerSettings))
        {
            // Number of wrapper elements (root, container) currently open in the output.
            var writeDepth = 0;
            // True until the header elements of the first document have been copied.
            var first = true;
            foreach (var input in inputs)
            {
                using (input)
                using (var reader = XmlReader.Create(input))
                {
                    // WriteNode leaves the reader on the node after the copied
                    // element, so the next iteration must not call Read() again.
                    bool alreadyRead = false;
                    while (!reader.EOF && (alreadyRead || reader.Read()))
                    {
                        alreadyRead = false;
                        switch (reader.NodeType)
                        {
                            case XmlNodeType.Element:
                                {
                                    if (reader.Depth == 0 && reader.LocalName == RootLocalName && reader.NamespaceURI == RootNamespaceUri)
                                    {
                                        // Only the first document opens the root.
                                        if (writeDepth == 0)
                                        {
                                            writer.WriteWhitespace(whiteSpace.ToString());
                                            writer.WriteStartElement(reader.Prefix, reader.LocalName, reader.NamespaceURI);
                                            writer.WriteAttributes(reader, true);
                                            writeDepth++;
                                        }
                                    }
                                    else if (reader.Depth == 1 && reader.LocalName == ContainerLocalName && reader.NamespaceURI == "")
                                    {
                                        // Only the first document opens the container.
                                        if (writeDepth == 1)
                                        {
                                            // Remember the container's indentation for the closing tags.
                                            indent = whiteSpace.ToString().Replace("\n", "").Replace("\r", "").Length;
                                            writer.WriteWhitespace(whiteSpace.ToString());
                                            writer.WriteStartElement(reader.LocalName, reader.NamespaceURI);
                                            writeDepth++;
                                        }
                                    }
                                    else if (reader.Depth == 2)
                                    {
                                        // Ordinal: element names are identifiers, not linguistic text.
                                        if (reader.LocalName.StartsWith(HeaderNamePrefix, System.StringComparison.Ordinal) && reader.NamespaceURI == "")
                                        {
                                            // Header elements are copied from the first document only.
                                            if (first)
                                            {
                                                first = false;
                                                writer.WriteWhitespace(whiteSpace.ToString());
                                                writer.WriteNode(reader, false);
                                                alreadyRead = true;
                                            }
                                        }
                                        else
                                        {
                                            // Record elements are copied from every document.
                                            writer.WriteWhitespace(whiteSpace.ToString());
                                            writer.WriteNode(reader, false);
                                            alreadyRead = true;
                                        }
                                    }
                                    whiteSpace.Length = 0; // Clear accumulated whitespace.
                                }
                                break;
                            case XmlNodeType.Whitespace:
                                {
                                    whiteSpace.Append(reader.Value);
                                }
                                break;
                            default:
                                break;
                        }
                    }
                }
            }

            // Close whatever wrapper elements were opened, innermost first,
            // indenting each closing tag to match its opening tag.
            while (writeDepth-- > 0)
            {
                if (indent > 0)
                    writer.WriteWhitespace("\n" + new string(' ', indent * writeDepth));
                writer.WriteEndElement();
            }
        }
    }
}
合並間距有點麻煩,如果您不關心保留間距,則可以大大簡化代碼。
可運行的在線示例(fiddle)。
您可能不想使用 System.Text.Encoding.Default 來讀取 XML 文件。文檔中提到:
由於所有默認編碼都會丟失數據,因此您可以改用 UTF8。UTF-8 在 U+00 到 U+7F 範圍內通常是相同的,但可以編碼其他字符而不會丟失。
另一種解決方案是使用自定義的 XmlReader 實現,在讀取時將各個文件連接起來,然後使用這個自定義讀取器和一個 XmlWriter 來創建合並後的文件。
自定義 XmlReader 為每個文件保留一個內部 XmlReader。開頭和結尾部分(intro/end)僅從第一個文件中讀取;對於其他文件,只讀取相關的(待追加)元素。
XmlReader
XmlReader
示例實現
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Xml;
/// <summary>
/// Merges several XML documents by presenting them to an <see cref="XmlWriter"/>
/// as one continuous <see cref="XmlReader"/>: the first document is read up to
/// the end of its <c>CYP000</c> element, the <c>CYP001</c> records of every
/// other document are spliced in, and then the rest of the first document follows.
/// </summary>
public static class XmlConcatenator
{
    // first: pause reading at the end of this element, will resume after subsequent streams are read
    // subsequent: stop reading at the end of this element
    private const string StopAtEndOf = "CYP000";
    // first: (ignores this)
    // subsequent: skip ahead to the first instance of this element
    private const string ResumeAtFirst = "CYP001";
    private static readonly XmlReaderSettings XmlReaderSettings = new XmlReaderSettings() { DtdProcessing = DtdProcessing.Ignore };
    private static readonly XmlWriterSettings XmlWriterSettings = new XmlWriterSettings() { Encoding = Encoding.UTF8, Indent = true };

    /// <summary>
    /// Writes the merged document to <paramref name="outStream"/>.
    /// </summary>
    /// <param name="outStream">Destination stream for the merged XML.</param>
    /// <param name="fileStreams">
    /// At least two input documents; the first supplies the surrounding structure.
    /// </param>
    /// <exception cref="InvalidOperationException">
    /// <paramref name="fileStreams"/> is null or has fewer than two streams.
    /// </exception>
    public static void Concat(Stream outStream, Stream[] fileStreams)
    {
        using var reader = XmlConcatReader.Create(fileStreams);
        using var writer = XmlWriter.Create(outStream, XmlWriterSettings);
        writer.WriteNode(reader, true);
    }

    /// <summary>
    /// Reader that delegates every member to the reader of whichever document is
    /// currently being consumed, switching documents inside <see cref="Read"/>.
    /// </summary>
    private class XmlConcatReader : XmlReader
    {
        private readonly XmlReader _firstReader;
        private readonly IEnumerator<Stream> _streams;
        private XmlReader _currentReader;

        private XmlConcatReader(Stream first, IEnumerable<Stream> streams)
        {
            _firstReader = XmlReader.Create(first, XmlReaderSettings);
            _streams = streams.GetEnumerator();
            _currentReader = _firstReader;
        }

        /// <summary>Creates the concatenating reader over the given streams.</summary>
        /// <exception cref="InvalidOperationException">
        /// Fewer than two streams were supplied.
        /// </exception>
        public static XmlReader Create(Stream[] inputStreams)
        {
            if (!(inputStreams?.Length > 1))
            {
                throw new InvalidOperationException($"{nameof(inputStreams)} must contain at least two streams");
            }
            return new XmlConcatReader(inputStreams[0], inputStreams.Skip(1));
        }

        public override bool Read()
        {
            var hasNode = _currentReader.Read();
            if (_currentReader.NodeType != XmlNodeType.EndElement || _currentReader.LocalName != StopAtEndOf)
            {
                return hasNode;
            }

            // The current document has no more records. A subsequent reader is
            // finished for good; the first reader stays paused on its end tag.
            ReleaseSubsequentReader();

            // Move on to the next document that actually contains a record. A
            // document without any is skipped instead of ending the merge.
            while (_streams.MoveNext())
            {
                var candidate = XmlReader.Create(_streams.Current, XmlReaderSettings);
                _currentReader = candidate;
                while (candidate.Read())
                {
                    if (candidate.NodeType == XmlNodeType.Element && candidate.LocalName == ResumeAtFirst)
                    {
                        return true;
                    }
                }
                ReleaseSubsequentReader();
            }

            // No documents left: _currentReader is the first reader again, still
            // positioned on the end tag it paused at, which is the next node to emit.
            return true;
        }

        // Disposes the current reader unless it is the first one, and falls back
        // to the first reader so that _currentReader never refers to a disposed reader.
        private void ReleaseSubsequentReader()
        {
            // note: _firstReader is disposed at the end. See: Dispose(bool)
            if (!ReferenceEquals(_currentReader, _firstReader))
            {
                _currentReader.Dispose();
                _currentReader = _firstReader;
            }
        }

        protected override void Dispose(bool disposing)
        {
            if (disposing)
            {
                // A subsequent reader may still be open if reading stopped early.
                if (!ReferenceEquals(_currentReader, _firstReader))
                {
                    _currentReader?.Dispose();
                }
                _firstReader?.Dispose();
                _streams?.Dispose();
            }
            base.Dispose(disposing);
        }

        // Everything below forwards to the reader of the current document.
        public override XmlNodeType NodeType => _currentReader.NodeType;
        public override string LocalName => _currentReader.LocalName;
        public override string NamespaceURI => _currentReader.NamespaceURI;
        public override string Prefix => _currentReader.Prefix;
        public override string Value => _currentReader.Value;
        public override int Depth => _currentReader.Depth;
        public override string BaseURI => _currentReader.BaseURI;
        public override bool IsEmptyElement => _currentReader.IsEmptyElement;
        public override int AttributeCount => _currentReader.AttributeCount;
        public override bool EOF => _currentReader.EOF;
        public override ReadState ReadState => _currentReader.ReadState;
        public override XmlNameTable NameTable => _currentReader.NameTable;
        public override string GetAttribute(string name) => _currentReader.GetAttribute(name);
        public override string GetAttribute(string name, string namespaceURI) => _currentReader.GetAttribute(name, namespaceURI);
        public override string GetAttribute(int i) => _currentReader.GetAttribute(i);
        public override string LookupNamespace(string prefix) => _currentReader.LookupNamespace(prefix);
        public override bool MoveToAttribute(string name) => _currentReader.MoveToAttribute(name);
        public override bool MoveToAttribute(string name, string ns) => _currentReader.MoveToAttribute(name, ns);
        public override bool MoveToElement() => _currentReader.MoveToElement();
        public override bool MoveToFirstAttribute() => _currentReader.MoveToFirstAttribute();
        public override bool MoveToNextAttribute() => _currentReader.MoveToNextAttribute();
        public override bool ReadAttributeValue() => _currentReader.ReadAttributeValue();
        public override void ResolveEntity() => _currentReader.ResolveEntity();
    }
}
使用示例
using System.IO;
using System.Linq;
/// <summary>
/// Usage example: merges in1.xml and in2.xml into output.xml.
/// </summary>
internal static class Program
{
    private static void Main()
    {
        var input = new[] { "in1.xml", "in2.xml" };
        var output = "output.xml";

        // Slots stay null until the matching file has been opened, so the
        // finally block can release exactly the streams that exist.
        var inputStreams = new Stream[input.Length];
        try
        {
            for (var i = 0; i < input.Length; i++)
            {
                // The inputs are only read, so open them read-only.
                inputStreams[i] = File.OpenRead(input[i]);
            }

            using var outputStream = File.Create(output);
            XmlConcatenator.Concat(outputStream, inputStreams);
        }
        finally
        {
            // Runs even when opening a later file or the merge itself throws.
            foreach (var stream in inputStreams)
            {
                stream?.Dispose();
            }
        }
    }
}
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.