[英]Fastest way to read large binary file into array of int in c#
我遇到這樣一種情況:需要讀取大量二進制文件。 這些文件包含數百萬個純整數(long)鍵值對。 我可以使用 BinaryReader 的 ReadInt64() 一次讀取一個值,但這很耗時,最終甚至比把相同的數據作為純文本文件讀取還要慢。 有誰知道更好的方法,可以快速讀取由整數鍵值對組成的二進制文件? 我用緩沖區做出了一個更快的版本,但逐個讀取每個 long 仍然很費力。 一定有更好的方法。 任何幫助我都很感激! 謝謝
如果您只想在 Windows 上運行應用程序,您可以加快速度。
注意:以下代碼僅在 T 是原始值類型時才有效:
/// <summary>
/// Reads array data from a file stream as quickly as possible by letting the native
/// <c>ReadFile</c> call write straight into the pinned result array, so no intermediate
/// managed buffer or per-element conversion is involved.
/// </summary>
/// <remarks>
/// Relies on a <c>ReadFile</c> P/Invoke declared elsewhere whose last parameter accepts
/// <see cref="IntPtr.Zero"/>. Because no OVERLAPPED structure is passed, the read starts
/// at the operating-system file pointer of the handle.
/// NOTE(review): runtimes whose <see cref="FileStream"/> does not keep that pointer in sync
/// with <see cref="FileStream.Position"/> (.NET 6 and later) may read from a different
/// offset than <c>fs.Position</c> - confirm against the target runtime.
/// </remarks>
/// <typeparam name="T">
/// The type of the array elements. Intended for primitive value types; the array must be
/// pinnable, otherwise <see cref="GCHandle.Alloc(object, GCHandleType)"/> throws.
/// </typeparam>
/// <param name="fs">The file stream from which to read, starting at its current position.</param>
/// <param name="count">The number of elements to read. Must not be negative.</param>
/// <returns>
/// The array of elements that was read. This may be less than the number that was
/// requested if the end of the file was reached. It may even be empty.
/// NOTE: There may still be data left in the file, even if not all the requested
/// elements were returned - this happens if the number of bytes remaining in the
/// file is less than the size of the array elements.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="fs"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is negative.</exception>
/// <exception cref="IOException">Thrown on error. See inner exception for <see cref="Win32Exception"/></exception>
[System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Reliability", "CA2004:RemoveCallsToGCKeepAlive")]
public static T[] FastRead<T>(FileStream fs, int count) where T: struct
{
    if (fs == null)
    {
        throw new ArgumentNullException(nameof(fs));
    }

    if (count < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(count), "The number of elements to read cannot be negative.");
    }

    // Marshal.SizeOf reports the *marshaled* size, which differs from the in-memory element
    // size for bool (4 vs 1) and char (1 vs 2). Using it for those types would overrun or
    // under-fill the pinned array, so primitive arrays are measured with Buffer.ByteLength.
    int sizeOfT = typeof(T).IsPrimitive
        ? Buffer.ByteLength(new T[1])
        : Marshal.SizeOf(typeof(T));

    long startPosition = fs.Position;

    // Clamp to zero: the position may legitimately be beyond the end of the file.
    long bytesRemaining = Math.Max(0L, fs.Length - startPosition);

    // Widen BEFORE multiplying; an int * int product silently wraps for large counts.
    long wantedBytes = (long)count * sizeOfT;
    long availableValues = Math.Min(bytesRemaining, wantedBytes) / sizeOfT;
    long bytesToRead = availableValues * sizeOfT;

    if ((bytesRemaining < wantedBytes) && ((bytesRemaining - bytesToRead) > 0))
    {
        Debug.WriteLine("Requested data exceeds available data and partial data remains in the file.");
    }

    T[] result = new T[availableValues];
    if (availableValues == 0)
    {
        // Nothing to read; skip pinning and the native call entirely.
        return result;
    }

    long totalBytesRead = 0;
    var handle = fs.SafeFileHandle;
    GCHandle gcHandle = GCHandle.Alloc(result, GCHandleType.Pinned);
    try
    {
        long baseAddress = gcHandle.AddrOfPinnedObject().ToInt64();

        // ReadFile takes a 32-bit byte count and may return fewer bytes than requested,
        // so read in chunks until everything has arrived or the file ends early.
        while (totalBytesRead < bytesToRead)
        {
            uint chunkSize = (uint)Math.Min(bytesToRead - totalBytesRead, (long)int.MaxValue);
            uint bytesRead;
            if (!ReadFile(
                handle,
                new IntPtr(baseAddress + totalBytesRead),
                chunkSize,
                out bytesRead,
                IntPtr.Zero))
            {
                throw new IOException("Unable to read file.", new Win32Exception(Marshal.GetLastWin32Error()));
            }

            if (bytesRead == 0)
            {
                // End of file reached sooner than fs.Length promised (e.g. file truncated).
                break;
            }

            totalBytesRead += bytesRead;
        }
    }
    finally
    {
        gcHandle.Free();
    }

    if (totalBytesRead < bytesToRead)
    {
        // Return only the elements that were completely read instead of zero padding.
        Array.Resize(ref result, (int)(totalBytesRead / sizeOfT));
    }

    // Make the managed stream position reflect the bytes consumed by the native read.
    fs.Position = startPosition + totalBytesRead;

    GC.KeepAlive(fs);
    return result;
}
測試了 Matthew Watson 的代碼(應該被采納為答案)
確實很快。 我添加了 ReadFile (kernel32) 所必需的 DllImport,並引入一個空的 NativeOverlapped 以 ref 方式傳入來代替 IntPtr.Zero,從而避免使用 unsafe
被測文件為 5.9 MB 的 html,在 3-5 ms 內從 SSD 讀取為 byte[] 塊
/// <summary>
/// Small console harness that times <c>TestIt.FastRead</c> on a sample HTML file and
/// writes the first and last bytes of the result to the debug output.
/// </summary>
class Program
{
    /// <summary>
    /// Reads up to 10,000,000 bytes from the sample file, reports the elapsed time,
    /// then dumps a preview of the start and end of the data.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        const int PreviewLength = 200;

        // The stopwatch starts before the file is opened, so opening is part of the timing.
        Stopwatch sw = Stopwatch.StartNew();
        byte[] b;

        // Dispose the stream deterministically so the file handle is not leaked,
        // and request read access only so read-only files can be opened.
        using (var fs = new FileStream("Tracking Covid-19 cases in the US.htm", FileMode.Open, FileAccess.Read))
        {
            b = TestIt.FastRead<byte>(fs, 10000000);
            sw.Stop();
        }

        Debug.WriteLine("b.Length=" + b.Length +" sw="+sw.ElapsedMilliseconds);

        // Files shorter than the preview length must not cause out-of-range indexing.
        int preview = Math.Min(PreviewLength, b.Length);

        for (int i = 0; i < preview; i++)
        {
            Debug.Write((char)b[i]);
        }

        Debug.WriteLine("==");
        Debug.WriteLine("==");

        // Tail of the file; for an HTML document this should end with </html>.
        for (int i = b.Length - preview; i < b.Length; i++)
        {
            Debug.Write((char)b[i]);
        }

        Console.ReadKey();
    }
}
/// <summary>
/// Windows-only helper that reads a file directly into a pinned managed array through
/// the kernel32 <c>ReadFile</c> function.
/// </summary>
public static class TestIt
{
    /// <summary>Win32 error code returned when a read with an OVERLAPPED offset hits end of file.</summary>
    private const int ErrorHandleEof = 38;

    // https://stackoverflow.com/questions/66789631/fastest-way-to-read-large-binary-file-into-array-of-int-in-c-sharp/67332253#67332253
    [DllImport("kernel32.dll", SetLastError = true)]
    [return: MarshalAs(UnmanagedType.Bool)]
    static extern bool ReadFile(Microsoft.Win32.SafeHandles.SafeFileHandle hFile, [Out] IntPtr lpBuffer, uint nNumberOfBytesToRead,
        out uint lpNumberOfBytesRead, [In] ref System.Threading.NativeOverlapped lpOverlapped);

    /// <summary>
    /// Reads up to <paramref name="count"/> elements from <paramref name="fs"/>, starting at
    /// the stream's current position, by letting <c>ReadFile</c> write straight into the
    /// pinned result array.
    /// </summary>
    /// <remarks>
    /// NOTE(review): assumes <paramref name="fs"/> was opened for synchronous I/O; a handle
    /// opened for overlapped I/O can complete asynchronously, which this method does not
    /// wait for - confirm against callers.
    /// </remarks>
    /// <typeparam name="T">
    /// The type of the array elements. Intended for primitive value types; the array must be
    /// pinnable, otherwise <see cref="GCHandle.Alloc(object, GCHandleType)"/> throws.
    /// </typeparam>
    /// <param name="fs">The file stream from which to read.</param>
    /// <param name="count">The number of elements to read. Must not be negative.</param>
    /// <returns>
    /// The elements that were read. Fewer than <paramref name="count"/> elements (possibly
    /// none) are returned when the file ends first; trailing bytes that do not form a whole
    /// element are left unread.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="fs"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="count"/> is negative.</exception>
    /// <exception cref="IOException">The native read failed; the inner exception is a <see cref="Win32Exception"/>.</exception>
    [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Reliability", "CA2004:RemoveCallsToGCKeepAlive")]
    public static T[] FastRead<T>(FileStream fs, int count) where T : struct
    {
        if (fs == null)
        {
            throw new ArgumentNullException(nameof(fs));
        }

        if (count < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(count), "The number of elements to read cannot be negative.");
        }

        // Marshal.SizeOf reports the *marshaled* size, which differs from the in-memory element
        // size for bool (4 vs 1) and char (1 vs 2). Using it for those types would overrun or
        // under-fill the pinned array, so primitive arrays are measured with Buffer.ByteLength.
        int sizeOfT = typeof(T).IsPrimitive
            ? Buffer.ByteLength(new T[1])
            : Marshal.SizeOf(typeof(T));

        long startPosition = fs.Position;

        // Clamp to zero: the position may legitimately be beyond the end of the file.
        long bytesRemaining = Math.Max(0L, fs.Length - startPosition);

        // Widen BEFORE multiplying; an int * int product silently wraps for large counts.
        long wantedBytes = (long)count * sizeOfT;
        long availableValues = Math.Min(bytesRemaining, wantedBytes) / sizeOfT;
        long bytesToRead = availableValues * sizeOfT;

        if ((bytesRemaining < wantedBytes) && ((bytesRemaining - bytesToRead) > 0))
        {
            Debug.WriteLine("Requested data exceeds available data and partial data remains in the file.");
        }

        T[] result = new T[availableValues];
        if (availableValues == 0)
        {
            // Nothing to read; skip pinning and the native call entirely.
            return result;
        }

        long totalBytesRead = 0;
        var handle = fs.SafeFileHandle;
        GCHandle gcHandle = GCHandle.Alloc(result, GCHandleType.Pinned);
        try
        {
            long baseAddress = gcHandle.AddrOfPinnedObject().ToInt64();

            // ReadFile takes a 32-bit byte count and may return fewer bytes than requested,
            // so read in chunks until everything has arrived or the file ends early.
            while (totalBytesRead < bytesToRead)
            {
                // When an OVERLAPPED structure is supplied, ReadFile starts at the offset it
                // carries rather than at the current file pointer. A zeroed structure would
                // therefore always read from the beginning of the file, so the stream
                // position has to be passed explicitly.
                long fileOffset = startPosition + totalBytesRead;
                var overlapped = new System.Threading.NativeOverlapped
                {
                    OffsetLow = unchecked((int)fileOffset),
                    OffsetHigh = (int)(fileOffset >> 32)
                };

                uint chunkSize = (uint)Math.Min(bytesToRead - totalBytesRead, (long)int.MaxValue);
                uint bytesRead;
                if (!ReadFile(
                    handle,
                    new IntPtr(baseAddress + totalBytesRead),
                    chunkSize,
                    out bytesRead,
                    ref overlapped))
                {
                    // Capture the error code immediately, before any other call can overwrite it.
                    int error = Marshal.GetLastWin32Error();
                    if (error == ErrorHandleEof)
                    {
                        // With an OVERLAPPED offset, end of file is reported as a failure code.
                        break;
                    }

                    throw new IOException("Unable to read file.", new Win32Exception(error));
                }

                if (bytesRead == 0)
                {
                    // End of file reached sooner than fs.Length promised (e.g. file truncated).
                    break;
                }

                totalBytesRead += bytesRead;
            }
        }
        finally
        {
            gcHandle.Free();
        }

        if (totalBytesRead < bytesToRead)
        {
            // Return only the elements that were completely read instead of zero padding.
            Array.Resize(ref result, (int)(totalBytesRead / sizeOfT));
        }

        // Make the managed stream position reflect the bytes consumed by the native read.
        fs.Position = startPosition + totalBytesRead;

        GC.KeepAlive(fs);
        return result;
    }
}
}
}
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.