[英]Fastest way to read large binary file into array of int in c#
我遇到这样一种情况:需要读取体积很大的二进制文件。这些文件包含数百万个由纯整数(long)构成的键值对。我可以用 BinaryReader 的 ReadInt64() 一次读取一个值,但这样很耗时,最终甚至比把同样的数据作为纯文本文件读取还要慢。有谁知道更快地读取这种整数键值对二进制文件的方法吗?我用缓冲区实现了一个更快的版本,但逐个读取每个 long 仍然很费力。一定有更好的方法。感谢任何帮助!谢谢
如果您只想在 Windows 上运行应用程序,您可以加快速度。
注意:以下代码仅在 T 是基元值类型(例如 int、long、double)时才有效:
/// <summary>
/// Reads array data from a file stream as quickly as possible by letting the
/// operating system write directly into the pinned result array, without making
/// any additional copies of the data.
/// </summary>
/// <remarks>
/// The element size is taken from <see cref="Marshal.SizeOf(Type)"/>, so
/// <typeparamref name="T"/> should be a blittable type whose marshaled size equals
/// its managed size (for example <c>int</c>, <c>long</c> or <c>double</c>).
/// The read starts at the current position of <paramref name="fs"/>, and the stream
/// position is advanced by the number of bytes actually read.
/// NOTE(review): relies on a <c>ReadFile</c> P/Invoke declared outside this block
/// (presumably kernel32) whose last parameter accepts <see cref="IntPtr.Zero"/>.
/// </remarks>
/// <typeparam name="T">The type of the array elements.</typeparam>
/// <param name="fs">The file stream from which to read.</param>
/// <param name="count">The number of elements to read. Must not be negative.</param>
/// <returns>
/// The array of elements that was read. This may be less than the number that was
/// requested if the end of the file was reached. It may even be empty.
/// NOTE: There may still be data left in the file, even if not all the requested
/// elements were returned - this happens if the number of bytes remaining in the
/// file is less than the size of the array elements.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="fs"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is negative.</exception>
/// <exception cref="IOException">Thrown on error. See inner exception for <see cref="Win32Exception"/></exception>
[System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Reliability", "CA2004:RemoveCallsToGCKeepAlive")]
public static T[] FastRead<T>(FileStream fs, int count) where T: struct
{
    if (fs == null)
    {
        throw new ArgumentNullException("fs");
    }
    if (count < 0)
    {
        throw new ArgumentOutOfRangeException("count", "The number of elements to read must not be negative.");
    }

    int sizeOfT = Marshal.SizeOf(typeof(T));
    long startPosition = fs.Position;

    // The position may have been moved past the end of the file; never go negative.
    long bytesRemaining = Math.Max(0L, fs.Length - startPosition);

    // Widen BEFORE multiplying: count * sizeOfT in 32-bit arithmetic silently overflows.
    long wantedBytes = (long)count * sizeOfT;
    long bytesAvailable = Math.Min(bytesRemaining, wantedBytes);
    long availableValues = bytesAvailable / sizeOfT;
    long bytesToRead = availableValues * sizeOfT;

    if ((bytesRemaining < wantedBytes) && ((bytesRemaining - bytesToRead) > 0))
    {
        Debug.WriteLine("Requested data exceeds available data and partial data remains in the file.");
    }

    T[] result = new T[availableValues];
    if (bytesToRead == 0)
    {
        // Nothing to transfer, so there is no need to pin the array or call into the OS.
        return result;
    }

    long totalRead = 0;
    GCHandle gcHandle = GCHandle.Alloc(result, GCHandleType.Pinned);
    try
    {
        var fileHandle = fs.SafeFileHandle;
        long bufferAddress = gcHandle.AddrOfPinnedObject().ToInt64();

        // ReadFile takes a 32-bit byte count and may legally return fewer bytes than
        // requested, so keep reading until everything arrived or the file ended.
        while (totalRead < bytesToRead)
        {
            uint chunk = (uint)Math.Min(bytesToRead - totalRead, uint.MaxValue);
            uint bytesRead;
            if (!ReadFile(
                fileHandle,
                new IntPtr(bufferAddress + totalRead),
                chunk,
                out bytesRead,
                IntPtr.Zero))
            {
                throw new IOException("Unable to read file.", new Win32Exception(Marshal.GetLastWin32Error()));
            }
            if (bytesRead == 0)
            {
                // End of file reached early (e.g. the file shrank after Length was read).
                break;
            }
            totalRead += bytesRead;
        }
    }
    finally
    {
        gcHandle.Free();
    }

    // Keep the managed stream position in step with what was consumed through the raw
    // handle; not every runtime re-synchronises FileStream.Position with the OS pointer.
    fs.Position = startPosition + totalRead;

    long valuesRead = totalRead / sizeOfT;
    if (valuesRead < result.LongLength)
    {
        // Only return the elements that were completely read.
        Array.Resize(ref result, (int)valuesRead);
    }

    GC.KeepAlive(fs);
    return result;
}
测试了 Matthew Watson 的代码(应该被确认为答案)
确实很快。 我为 ReadFile (Kernel32) 添加了必需的 DllImport,并通过引入一个空的 NativeOverlapped ref 来替换 IntPtr.Zero 来避免使用 unsafe(注意:对同步文件句柄传入非空的 OVERLAPPED 结构时,读取会从该结构中指定的偏移量开始,空结构即偏移量 0,而不是从文件流的当前位置开始)
被测文件为 5.9 MB 的 html,在 3-5ms 内从 SSD 读取为 byte[] 块
/// <summary>
/// Small console harness that times <c>TestIt.FastRead</c> on a local HTML file and
/// writes the first and last bytes of the result to the debug output.
/// </summary>
class Program
{
    /// <summary>
    /// Reads up to 10,000,000 bytes from the sample file, reports the array length and
    /// the elapsed milliseconds, then dumps a preview of the start and end of the data.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        // Number of bytes shown from each end of the file.
        const int PreviewLength = 200;

        Stopwatch sw = new Stopwatch();
        byte[] b;

        // The timer starts before the file is opened so the measurement still covers
        // open + read; the stream is disposed deterministically afterwards.
        sw.Start();
        using (FileStream fs = new FileStream("Tracking Covid-19 cases in the US.htm", FileMode.Open, FileAccess.Read))
        {
            b = TestIt.FastRead<byte>(fs, 10000000);
            sw.Stop();
        }

        Debug.WriteLine("b.Length=" + b.Length +" sw="+sw.ElapsedMilliseconds);

        // Clamp both previews so files shorter than PreviewLength do not index out of range.
        int headLength = Math.Min(PreviewLength, b.Length);
        for (int i = 0; i < headLength; i++)
        {
            Debug.Write((char)b[i]);
        }
        Debug.WriteLine("==");
        Debug.WriteLine("==");
        int tailStart = Math.Max(0, b.Length - PreviewLength);
        for (int i = tailStart; i < b.Length; i++)
        {
            Debug.Write((char)b[i]); // END </html>
        }
        Console.ReadKey();
    }
}
/// <summary>
/// Helper that reads blittable array data straight from a file into a pinned managed
/// array through the Win32 <c>ReadFile</c> function.
/// </summary>
public static class TestIt
{
    // Win32 ERROR_HANDLE_EOF: reported by ReadFile when a read that supplies an
    // OVERLAPPED structure hits the end of the file.
    private const int ErrorHandleEof = 38;

    // https://stackoverflow.com/questions/66789631/fastest-way-to-read-large-binary-file-into-array-of-int-in-c-sharp/67332253#67332253
    [DllImport("kernel32.dll", SetLastError = true)]
    static extern bool ReadFile(Microsoft.Win32.SafeHandles.SafeFileHandle hFile, [Out] IntPtr lpBuffer, uint nNumberOfBytesToRead,
        out uint lpNumberOfBytesRead, [In] ref System.Threading.NativeOverlapped lpOverlapped);

    /// <summary>
    /// Reads up to <paramref name="count"/> elements from the current position of
    /// <paramref name="fs"/> directly into a new array, without intermediate copies.
    /// </summary>
    /// <remarks>
    /// The element size is taken from <see cref="Marshal.SizeOf(Type)"/>, so
    /// <typeparamref name="T"/> should be a blittable type whose marshaled size equals
    /// its managed size (for example <c>byte</c>, <c>int</c>, <c>long</c>, <c>double</c>).
    /// The stream position is advanced by the number of bytes actually read.
    /// NOTE(review): written for file handles opened for synchronous I/O; behavior with
    /// FileOptions.Asynchronous streams has not been considered here.
    /// </remarks>
    /// <typeparam name="T">The type of the array elements.</typeparam>
    /// <param name="fs">The file stream from which to read.</param>
    /// <param name="count">The number of elements to read. Must not be negative.</param>
    /// <returns>
    /// The elements that were read. The array is shorter than <paramref name="count"/>
    /// (possibly empty) when the file ends first; trailing bytes that do not make up a
    /// whole element are left unread.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="fs"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is negative.</exception>
    /// <exception cref="IOException">The native read failed; the inner exception is a <see cref="Win32Exception"/>.</exception>
    [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Reliability", "CA2004:RemoveCallsToGCKeepAlive")]
    public static T[] FastRead<T>(FileStream fs, int count) where T : struct
    {
        if (fs == null)
        {
            throw new ArgumentNullException("fs");
        }
        if (count < 0)
        {
            throw new ArgumentOutOfRangeException("count", "The number of elements to read must not be negative.");
        }

        int sizeOfT = Marshal.SizeOf(typeof(T));
        long startPosition = fs.Position;

        // The position may have been moved past the end of the file; never go negative.
        long bytesRemaining = Math.Max(0L, fs.Length - startPosition);

        // Widen BEFORE multiplying: count * sizeOfT in 32-bit arithmetic silently overflows.
        long wantedBytes = (long)count * sizeOfT;
        long bytesAvailable = Math.Min(bytesRemaining, wantedBytes);
        long availableValues = bytesAvailable / sizeOfT;
        long bytesToRead = availableValues * sizeOfT;

        if ((bytesRemaining < wantedBytes) && ((bytesRemaining - bytesToRead) > 0))
        {
            Debug.WriteLine("Requested data exceeds available data and partial data remains in the file.");
        }

        T[] result = new T[availableValues];
        if (bytesToRead == 0)
        {
            // Nothing to transfer, so there is no need to pin the array or call into the OS.
            return result;
        }

        long totalRead = 0;
        GCHandle gcHandle = GCHandle.Alloc(result, GCHandleType.Pinned);
        try
        {
            var fileHandle = fs.SafeFileHandle;
            long bufferAddress = gcHandle.AddrOfPinnedObject().ToInt64();
            var overlapped = new System.Threading.NativeOverlapped(); // need this with above pInvoke

            // ReadFile takes a 32-bit byte count and may legally return fewer bytes than
            // requested, so keep reading until everything arrived or the file ended.
            while (totalRead < bytesToRead)
            {
                // When an OVERLAPPED structure is supplied the read starts at the offset
                // stored in it, NOT at the handle's file pointer. Leaving it zeroed would
                // always read from the beginning of the file, so set it explicitly.
                long offset = startPosition + totalRead;
                overlapped.OffsetLow = unchecked((int)offset);
                overlapped.OffsetHigh = (int)(offset >> 32);

                uint chunk = (uint)Math.Min(bytesToRead - totalRead, uint.MaxValue);
                uint bytesRead;
                if (!ReadFile(
                    fileHandle,
                    new IntPtr(bufferAddress + totalRead),
                    chunk,
                    out bytesRead, ref overlapped))
                {
                    // Must be captured immediately after the P/Invoke call.
                    int error = Marshal.GetLastWin32Error();
                    if (error == ErrorHandleEof)
                    {
                        // End of file reached early (e.g. the file shrank after Length was read).
                        break;
                    }
                    throw new IOException("Unable to read file.", new Win32Exception(error));
                }
                if (bytesRead == 0)
                {
                    break;
                }
                totalRead += bytesRead;
            }
        }
        finally
        {
            gcHandle.Free();
        }

        // Keep the managed stream position in step with what was consumed through the raw
        // handle; not every runtime re-synchronises FileStream.Position with the OS pointer.
        fs.Position = startPosition + totalRead;

        long valuesRead = totalRead / sizeOfT;
        if (valuesRead < result.LongLength)
        {
            // Only return the elements that were completely read.
            Array.Resize(ref result, (int)valuesRead);
        }

        GC.KeepAlive(fs);
        return result;
    }
}
}
}
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.