简体   繁体   English

比较 C 中的两个二进制文件

[英]Compare two binary files in C

I am writing a program to compare two binary files and plot the first difference.我正在编写一个程序来比较两个二进制文件并绘制第一个差异。 I want to read 16 bytes of data from each file continuously and compare them.我想从每个文件中连续读取 16 个字节的数据并进行比较。 For that I am storing 16 bytes from both file into char *buffer1, buffer2 .为此,我将两个文件中的 16 个字节存储到char *buffer1, buffer2 When I print the output I am getting that buffer1 has both the data of file1 and file2 .当我打印输出时,我得到buffer1file1file2的数据。

The code is as follows:代码如下:

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

void printConversion(char *buf1, char *buf2) {
    size_t len = strlen(buf1);
    char *binary = malloc(len * 8 + 1); 
    binary[0] = '\0';
    for (size_t i = 0; i < len; ++i) {
        char ch = buf1[i];
        for (int j = 7; j >= 0; --j) {
            if (ch & (1 << j)) {
                strcat(binary,"1");
            } else {
                strcat(binary,"0");
            }
        }
    }

    printf("File1: %s\t", binary);
    free(binary);
    printf("File2:");
    for (int i = 0; i < sizeof(buf2); i++) {
        printf("%x", buf2[i] - '0');
    }
}

void fileRead(FILE *fp, char *buf, int count) {
    fseek(fp, count, SEEK_SET);    
    fread(buf, 1, 16, fp);
}

int fileSize(FILE *fp) {
    fseek(fp, 0, SEEK_END);
    int size = ftell(fp) + 1;
    return size;
}

int main(int argc, char *argv[]) {
    printf("***Binary File Comparator***\n ");
    int count = 0;
    int index = 0;
    char buffer1[16];
    char buffer2[16];
    char buffer3[16];
    char buffer4[16];

    // Invalid Number of Arguments
    if (argc < 3 || argc > 3) {
        printf("Invalid Number of Arguments\n");
    }

    FILE *fp1, *fp2;
    fp1 = fopen(argv[1], "rb");
    int size = fileSize(fp1);
    int size1 = size;
    fclose(fp1);

    while (size > 1) {
        fp1 = fopen(argv[1], "rb");
        fileRead(fp1, buffer1, count);
        fclose(fp1);

        fp2 = fopen(argv[2], "rb");
        fileRead(fp2, buffer2, count);
        if (size1 < count) {
            int lastSize = count - size1;
            count = count + lastSize;
            fclose(fp2);
        } else {
            count = count+16;
            fclose(fp2);
        }

        **printf("buffer1:%s\tbuffer2:%s\n", buffer1, buffer2)**;
        size = size - 16;

        int result = strcmp(buffer1, buffer2);
        if (result != 0) {
            for (int i = 0; i < sizeof(buffer1); i++) {
                if (buffer1[i] != buffer2[i]) {
                    int count1 = (count - 16) + i;
                    index++;
                    if (index == 1) {
                        printf("Byte_Offset:%x\n", count1);
                        fp1 = fopen(argv[1], "rb");
                        fileRead(fp1, buffer3, count1);
                        fclose(fp1);
                        fp2 = fopen(argv[2], "rb");
                        fileRead(fp2, buffer4, count1);
                        fclose(fp2);
                        printConversion(buffer3, buffer4);
                        break;
                    }
                } else {
                    continue;
                }
            } 
        }
    }
}

I have tried to highlight the printf part that is printing my buffer1 and buffer2我试图突出显示正在打印我的buffer1buffer2的 printf 部分

The output is as follows:输出如下:

 buffer1:83867715933586928386771593358692   buffer2:8386771593358692
buffer1:49216227905963264921622790596326    buffer2:4921622790596326
buffer1:40267236116867294026723611686729    buffer2:4026723611686729
buffer1:82306223673529228230622367352922    buffer2:8230622367352922
buffer1:25869679356114222586967935611422    buffer2:2586967935611422

Can anybody help what I am doing wrong.任何人都可以帮助我做错了什么。 Please point me the error and what optimization changes could be done in code.请指出错误以及可以在代码中进行哪些优化更改。 I am at learning stage your feedback will be very helpful.我正处于学习阶段,您的反馈将非常有帮助。

You are complicating the task by reading 16 bytes at a time.一次读取 16 个字节会使任务复杂化。 If the goal is to indicate the first difference, just read one byte at a time from both files with getc() this way:如果目标是指示第一个差异,只需使用getc()这种方式一次从两个文件中读取一个字节:

int compare_files(FILE *fp1, FILE *fp2) {
    unsigned long pos;
    int c1, c2;
    for (pos = 0;; pos++) {
        c1 = getc(fp1);
        c2 = getc(fp2);
        if (c1 != c2 || c1 == EOF)
            break;
    }
    if (c1 == c2) {
        printf("files are identical and have %lu bytes\n", pos);
        return 0;  // files are identical
    } else
    if (c1 == EOF) {
        printf("file1 is included in file2, the first %lu bytes are identical\n", pos);
        return 1;
    } else
    if (c2 == EOF) {
        printf("file2 is included in file1, the first %lu bytes are identical\n", pos);
        return 2;
    } else {
        printf("file1 and file2 differ at position %lu: 0x%02X <> 0x%02X\n", pos, c1, c2);
        return 3;
    }
}

In terms of efficiency, reading one byte at a time does not pose a problem if the streams are buffered.在效率方面,如果流被缓冲,一次读取一个字节不会造成问题。 For large files, you can get better performance by memory mapping the file contents if available on the target system and for the given input streams.对于大文件,如果目标系统和给定的输入流可用,您可以通过内存映射文件内容来获得更好的性能。

Not an actual answer, but a word on optimisation.不是一个实际的答案,而是一个关于优化的词。 You can increase the speed of the program if you have a bigger buffer.如果您有更大的缓冲区,您可以提高程序的速度。 Basically the larger the buffer the faster the program runs HOWEVER the speed you gain from just making it larger will increase logarithmically.基本上,缓冲区越大,程序运行得越快,但是您通过增大缓冲区获得的速度将以对数方式增加。

Here is a picture of a graph that will help you understand.这是一张图表,可以帮助您理解。 Also, what i mentioned applies to any simmilar situation.此外,我提到的适用于任何类似的情况。 This includes: Copying files, filling the sound buffer etc. Loading the entire file in your RAM first and operationg on it will usually be faster than loading parts of it.这包括:复制文件、填充声音缓冲区等。首先将整个文件加载到 RAM 中并对其进行操作通常比加载部分文件更快。 Ofc this is not possible with larger files but still this is what you should aim for if you want speed. Ofc 这对于较大的文件是不可能的,但如果你想要速度,这仍然是你应该追求的目标。

PS: I'm writting here because i don't have rep to comment. PS:我写在这里是因为我没有代表发表评论。

EDIT: I came up with solution but since you did not state what you need to do with your buffer3 and buffer4 i packed it up inside a function.编辑:我想出了解决方案,但由于您没有说明需要对buffer3buffer4做什么,我将其打包在一个函数中。

If you are sure that you are only going to use 16 bytes as a buffer size, remove the nBufferSize parameter and replace the buffer dynamic allocation with a static one.如果您确定只使用 16 字节作为缓冲区大小,请删除nBufferSize参数并将缓冲区动态分配替换为静态分配。

If after the execution you need the buffers, add them as parameters and keep the nBufferSize param.如果执行后您需要缓冲区,请将它们添加为参数并保留nBufferSize参数。 Keep in mind that if you intend to use them outside the function, you should also allocate them outside the function, so things don't get messy.请记住,如果您打算在函数外使用它们,您也应该在函数外分配它们,以免事情变得混乱。

/** Returns 0 if files are identical, 1 if they are different and -1 if there 
is an error. */
int FileCmp(char* szFile1, char* szFile2, int nBufferSize)
{
    FILE *f1, *f2;
    f1 = fopen(szFile1, "rb");
    f2 = fopen(szFile2, "rb");

    // Some error checking?
    if (f1 == NULL || f2 == NULL)
        return -1;

    // You can check here for file sizes before you start comparing them.
    //  ...

    // Start the comparrison.

    /// Replace this part with static allocation. --------
    char* lpBuffer1 = malloc(sizeof(char)*nBufferSize);
    if (lpBuffer1 == NULL) // close the files and return error.
    {
        fclose(f1);
        fclose(f2);
        return -1;
    }
    char* lpBuffer2 = malloc(sizeof(char)*nBufferSize);
    if (lpBuffer2 == NULL) // close the files, free buffer1 and return error.
    {
        free(lpBuffer1);
        fclose(f1);
        fclose(f2);
        return -1;
    }
    /// --------------------------------------------------

    while(1)
    {
        unsigned int uRead1 = fread(lpBuffer1, sizeof(char), nBufferSize, f1);
        unsigned int uRead2 = fread(lpBuffer2, sizeof(char), nBufferSize, f2);

        if (uRead1 != uRead2)
            goto lFilesAreDifferent;

        for(unsigned int i = 0; i < uRead1; i++)
            if (lpBuffer1[i] != lpBuffer2[i])
                goto lFilesAreDifferent;

        if ((feof(f1) != 0) && (feof(f2) != 0))
            break; // both files have nothing more to read and are identical.

        goto lSkip;

        lFilesAreDifferent:
            free(lpBuffer1);
            free(lpBuffer2);
            fclose(f1);
            fclose(f2);
            return 1;

        lSkip:;
    }

    // The files are the same. Close them, free the buffers and return 0.
    free(lpBuffer1);
    free(lpBuffer2);
    fclose(f1);
    fclose(f2);
    return 0;
}

A simple Demo:一个简单的演示:

#define BUFFER_SIZE 16
int main(int nArgs, char** szArgs)
{
    if (nArgs != 3)
    {
        printf("Invalid number of arguments.");
        return 0;
    }

    int nResult = FileCmp(szArgs[1], szArgs[2], BUFFER_SIZE);
    switch (nResult)
    {
        case 0: printf("Files [%s] and [%s] are identical.", szArgs[1], szArgs[2]); break;
        case 1: printf("Files [%s] and [%s] are different.", szArgs[1], szArgs[2]); break;
        case -1: printf("Error."); break;
    }

    return 0;
}

EDIT II: Personally, i have never used the C standard FILE library (it was either C++ fstream or pure win32 fileapi) so don't take my word here for granted but fread is the fastest function i could find (faster than fgets or fgetc ).编辑二:就我个人而言,我从未使用过 C 标准 FILE 库(它是 C++ fstream 或纯 win32 fileapi)所以不要把我的话当成理所当然,但fread是我能找到的最快的函数(比fgetsfgetc快)。 If you want even faster than this you should get into OS dependant functions (like ReadFile() for Windows).如果你想要比这更快,你应该进入依赖于操作系统的函数(比如 Windows 的ReadFile() )。

chqrlie's solution using getc is absolutely the right way to do this. chqrlie 使用getc的解决方案绝对是做到这一点的正确方法。 I wanted to address some points brought up in comments, and find it's best to do that with code.我想解决评论中提出的一些问题,并发现最好用代码来解决。 In one comment, I recommend pseudo code which could be confusing (namely, you can't write fwrite(file1...) || fwrite(file2 ...) because of the short circuit. But you can implement the idea of that with:在一个评论中,我推荐伪代码,它可能会令人困惑(即,由于短路,您不能编写fwrite(file1...) || fwrite(file2 ...) 。但是您可以实现该想法和:

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/*
 * Compare two files, 16 bytes at a time. (Purely to demonstrate memcmp.
 * Clearly, this should be implemented with getc.)
 */

FILE * xfopen(const char *, const char *);
size_t xfread(void *, FILE *, const char *);

int
main(int argc, char **argv)
{
        FILE *fp[2];
        size_t n[2];
        char buf[2][16];
        unsigned count = 0;

        if(argc != 3) { return EXIT_FAILURE; }
        fp[0] = xfopen(argv[1], "r");
        fp[1] = xfopen(argv[2], "r");
        do {
                n[0] = xfread(buf[0], fp[0], argv[1]);
                n[1] = xfread(buf[1], fp[1], argv[2]);
                if( n[0] != n[1] || (n[0] && memcmp(buf[0], buf[1], n[0]))) {
                        fprintf(stderr, "files differ in block %u\n", count);
                        return 1;
                }
                count += 1;
        } while(n[0]);
        puts("files are identical");
        return 0;
}

size_t
xfread(void *b, FILE *fp, const char *name)
{
        size_t n = fread(b, 1, 16, fp);
        if(n == 0 && ferror(fp)) {
                fprintf(stderr, "Error reading %s\n", name);
                exit(EXIT_FAILURE);
        }
        return n;
}

FILE *
xfopen(const char *path, const char *mode)
{
        FILE *fp = strcmp(path, "-") ? fopen(path, mode) : stdin;
        if( fp == NULL ) {
                perror(path);
                exit(EXIT_FAILURE);
        }
        return fp;
}

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM