简体   繁体   中英

reading unknown length file in buffer chunks in c

I'm trying to read a binary file of unknown length into buffer chunks without using functions such as lseek() or fseek().

  1. I use a struct Buffer that holds 1024 bytes at a time. When reading a file larger than 1012 bytes, it allocates several buffers. However, the last chunk will contain at most 1024 bytes, so I am trying to count the length of the last chunk in order to read it up to EOF, but I am confused about how to implement this.

Thanks in advance.

#include <stdio.h>
#include <stdlib.h>

/* One fixed-size chunk of file data; chunks are chained through `next`. */
typedef struct Buffer Buffer;

struct Buffer {
  unsigned char data[1012]; /* raw bytes held by this chunk */
  Buffer *next;             /* following chunk in the list; the original note
                               assumes this takes "12 bytes", but the size of a
                               pointer (plus padding) depends on the platform */
};

/*
 * Copies the first `length` bytes of one Buffer's data array into another.
 *
 * dest   - Buffer to fill (despite the void* type it must point to a Buffer)
 * src    - Buffer to copy from (same requirement)
 * length - number of data bytes to copy; clamped to the capacity of data[]
 *
 * Only data[] is copied; the `next` pointers are left untouched.
 */
void mymemcpy(void *dest, void *src, size_t length){
  Buffer *buffer_toFill = dest;
  const Buffer *buffer_toAdd = src;

  /* Never run past the end of the fixed-size data array. */
  if(length > sizeof buffer_toFill->data){
    length = sizeof buffer_toFill->data;
  }

  /* size_t index: no signed/unsigned comparison against `length`. */
  for(size_t i = 0; i < length; i++){
    buffer_toFill->data[i] = buffer_toAdd->data[i];
  }
}

/*
 * Allocates a new Buffer, copies `size` bytes of read_buffer's data into it
 * and links it in front of `head`.
 *
 * head        - current first node of the list, or NULL for an empty list
 * read_buffer - Buffer whose data is copied; it is not modified or freed here
 * size        - number of bytes to copy; negative values are treated as 0
 *
 * Returns the new first node. If the allocation fails, the list is left as it
 * was and `head` is returned unchanged.
 */
Buffer* add_buffer_front(Buffer *head, Buffer *read_buffer, int size){
  Buffer *new_buffer = malloc(sizeof *new_buffer);
  if(new_buffer == NULL){
    return head;
  }

  /* A negative int would turn into a huge size_t when passed on. */
  size_t count = size > 0 ? (size_t)size : 0;
  mymemcpy(new_buffer, read_buffer, count);

  /* Always set the link: with an empty list this stores the terminating NULL,
     which the original left uninitialised. */
  new_buffer->next = head;
  return new_buffer;
}

/*
 * Prints the contents of every Buffer in the list as two-digit uppercase hex
 * and frees each node after printing it.
 *
 * head   - first node of the list; the whole list is released, so the caller
 *          must not use it afterwards
 * length - number of bytes to print from each node; clamped to the capacity
 *          of data[] so the loop never reads past the array
 */
void display_List(Buffer *head, size_t length){
  Buffer *current = head;
  while(current != NULL){
    size_t count = length < sizeof current->data ? length : sizeof current->data;
    for(size_t i = 0; i < count; i++){
      printf("%02X",(unsigned)current->data[i]);
    }
    /* Fetch the link before freeing: reading current->next after free() is
       a use-after-free. */
    Buffer *next = current->next;
    free(current);
    current = next;
  }
}

/*
 * Entry point: opens the file named by argv[1] in binary mode, reads it in
 * chunks, prepends each chunk to a linked list via add_buffer_front() and
 * finally passes the list to display_List(), which prints and frees it.
 *
 * NOTE(review): this block is unfinished as posted -- `length` is used in the
 * loop but never declared; how to compute it is the open question.
 */
int main(int argc, char **argv){
  FILE *fd;
  Buffer *head_buffer = NULL;
  int file_length = 0;
  // Holds fread's return value; the loop below runs until it becomes 0.
  int eof_int = 1;
  if(argc != 2){
    printf("Usage: readFile <filename>\n");
    return 1; 
  }

  // NOTE(review): the result of fopen is not checked for NULL.
  fd = fopen(argv[1], "rb");

  while(eof_int != 0){ 
    // NOTE(review): this staging buffer is never freed; add_buffer_front
    // copies it into a node it allocates itself.
    Buffer *new_buffer = malloc(sizeof(Buffer));
    // Requests ONE item of sizeof(Buffer)-12 bytes, so the return value is an
    // item count (1 or 0), not the number of bytes read.
    // NOTE(review): the "-12" assumes the size of the next pointer plus padding.
    eof_int = fread(new_buffer, sizeof(Buffer)-12, 1, fd);
    if(eof_int == 0){ 
      // No complete item was read: either a short final chunk or nothing at all.
      //size_t length
      // TODO: `length` (bytes actually read into the last chunk) still has to
      // be declared and computed here.
      head_buffer = add_buffer_front(head_buffer, new_buffer, length);
      file_length += length;
    }else{
      // A full chunk was read: copy the whole data array.
      head_buffer = add_buffer_front(head_buffer, new_buffer, (sizeof(new_buffer->data)));
      file_length += (sizeof(new_buffer->data));
    }
  }
  // file_length is the running total for the whole file; display_List applies
  // the value it is given to every node.
  display_List(head_buffer, file_length);
  fclose(fd);
  return 0;
}

You have several problems.

(1) fread returns the number of items read; it does not return a separate EOF indication. When it returns fewer items than requested, call feof(stream) (or ferror(stream)) to find out whether you've reached the end of the file or hit an error.

(2) You are assuming that your next pointer is 12 bytes. This is a very dangerous assumption. Prefer to read the 1012 bytes directly into the data array you've allocated in the struct. In all likelihood you are currently printing stuff that wasn't read in, but is just uninitialized memory.

(3) Use the return value from fread to decide how much memory to copy.

Please see the comments in the code below - also consider changing the 1012 to use a #define.

#include <stdio.h>
#include <stdlib.h>

/* Capacity, in bytes, of the data array in every Buffer. */
enum { BUFFER_DATA_SIZE = 1012 };

/* One fixed-size chunk of file data in a singly linked list. */
typedef struct Buffer{
  unsigned char data[BUFFER_DATA_SIZE]; // raw bytes of this chunk
  struct Buffer *next; // following chunk; do not assume a particular pointer size
}Buffer;

// Create a structure to store stuff about a file

/* Bookkeeping for a file held in memory as a chain of Buffer chunks. */
typedef struct {
   Buffer *head, *tail; /* first and last chunk of the chain */
   size_t length;       /* byte total accumulated by add_buffer() */
} MyFile;

/*
void mymemcpy(void *dest, void *src, size_t length){
  Buffer *buffer_toFill = (Buffer *)dest;
  Buffer *buffer_toAdd = (Buffer *)src;
  int a = 0; 
  for(int i = 0; i < length; i++){
    buffer_toFill->data[i] = buffer_toAdd->data[i];
  }
}

Buffer* add_buffer_front(Buffer *head, Buffer *read_buffer, int size){
  Buffer *new_buffer = malloc(sizeof(Buffer));
  mymemcpy(new_buffer, read_buffer, size);
  if(head != NULL){
    new_buffer->next = head;
  }
  return new_buffer;
}

*/

// Lets make this easier - The buffer has already been "malloced" once - why do it again

// And why are you reversing the file

// Perhaps 

/*
 * Appends a chunk to the end of a file's buffer list.
 *
 * to_be_added  - chunk to link in; it becomes the new last node
 * file         - list bookkeeping to update (head, tail and length)
 * extra_length - number of bytes the chunk contributes to file->length
 */
void add_buffer(Buffer *to_be_added, MyFile *file, size_t extra_length) {
   to_be_added->next = NULL;  // The new chunk is always the last one

   if (file->tail) { // List already has at least one chunk: hang it off the end
     file->tail->next = to_be_added;
   } else { // First buffer!
     file->head = to_be_added;
   }
   // Advance the tail on every append; otherwise the next call would overwrite
   // the same next pointer and drop the chunks in between.
   file->tail = to_be_added;

   file->length += extra_length;
}

/*
void display_List(Buffer *head, size_t length){
  Buffer *current = head;
  while(current != NULL){
    for(int i = 0; i < length; i++){
      printf("%02X",(unsigned)current->data[i]); //this shows different value compare with  xxd <filename>
      //printf("%c", current->data[i]);  
    }
    Buffer *prev = current;
    free(prev);
    current = current->next;
  }
}

*/

// Instead pass in the new structure

/*
 * Prints the stored file contents as two-digit uppercase hex.
 *
 * file - list to print; nothing is printed when it is NULL or empty
 *
 * Each chunk is assumed to be completely filled except possibly the last one,
 * so file->length alone determines how many bytes of each chunk are printed.
 */
void display_list(MyFile *file) {
   if (file == NULL) {
      return;
   }

   size_t contents_left = file->length;
   for (Buffer *current = file->head;
        current != NULL && contents_left > 0;
        current = current->next) {
      // A chunk holds at most sizeof data bytes; the final one may hold fewer.
      size_t chunk_length = contents_left > sizeof current->data
                               ? sizeof current->data
                               : contents_left;
      for (size_t i = 0; i < chunk_length; i++) {
         printf("%02X", (unsigned)current->data[i]);
      }
      // Account for what was printed so the last chunk stops at the real end.
      contents_left -= chunk_length;
   }
}


}
int main(int argc, char **argv){
  FILE *fd;
  MyFile read_file;
  read_file.head = NULL;
  read_file.tail = NULL;
  read_file.length = 0;

  Buffer *head_buffer = NULL;
  int file_length = 0;
  int eof_int = 1;
  if(argc != 2){
    printf("Usage: readFile <filename>\n");
    return 1; 
  }

  fd = fopen(argv[1], "rb");

  // Check fd
  if (fd == NULL) {
    // error stuff
    return EXIT_FAILURE; // Look up the include for this
 }
  while(eof_int != 0){ 
    Buffer *new_buffer = malloc(sizeof(Buffer));
    eof_int = fread(new_buffer->data, 1012, 1, fd); // Do not make assumptions on the size of a pointer and store it in the correct location
    if(eof_int == 0) { // Read nothing
       free(new_buffer); // We was too optimistic! Did Not need this in the end 
       break;
    } else {
      add_buffer(&read_file, new_buffer, eof_int);
    }
  }
  display_List(&read_file);
  fclose(fd);
  return 0;
}

The trick you're looking for is that fread returns the number of items read. You're reading 1 buffer-sized item, so it can only tell you that it read 0 or 1 buffers. Instead, flip it and reverse it: read a buffer's worth of 1-byte items.

// Item size 1, count = capacity of data[]: the return value is the byte count.
// Reading into buffer->data and using sizeof avoids guessing the pointer size.
size_t bytes_read = fread(buffer->data, 1, sizeof buffer->data, fd);

Now you can know how many bytes were read into your buffer. We can add a size field to the Buffer so each buffer can remember how many bytes it read and only print that many.

/* Number of bytes requested from the file for each chunk. */
size_t const BUFFER_SIZE = 1024;

/* One chunk of file data in a singly linked list. */
typedef struct Buffer Buffer;

struct Buffer {
    /* Chunk bytes. A pointer rather than a fixed array, so the capacity is
       not baked into the struct (the reason is explained further below). */
    unsigned char *data;
    size_t size;   /* number of valid bytes in data */
    Buffer *next;  /* following chunk, or NULL */
};

/*
 * Prints the buffer's bytes to stdout, each as two uppercase hex digits
 * followed by a space. Only the first buffer->size bytes are printed, so a
 * buffer with size 0 prints nothing and its data pointer is not read.
 */
void Buffer_print( Buffer *buffer ) {
    size_t index = 0;
    while( index != buffer->size ) {
        printf("%02hhX ", buffer->data[index]);
        index++;
    }
}

/*
 * Allocates an empty Buffer: size 0, no data, no successor.
 *
 * Returns the new buffer, or NULL when the allocation fails. The caller owns
 * the result and is responsible for freeing it.
 */
Buffer *Buffer_new(void) {
    Buffer *buffer = malloc(sizeof *buffer);
    if( buffer == NULL ) {
        return NULL;
    }

    /* Initialise every field so no garbage is ever read. */
    buffer->size = 0;
    buffer->data = NULL;
    buffer->next = NULL;

    return buffer;
}

Note that I'm careful to initialize all the fields of the buffer, else we risk getting garbage.

Now we've changed our buffer, so our assumption about its size and position is broken. That's ok, we should be reading straight into buffer->data anyway.

/*
 * Reads up to buffer_size bytes from fp into a freshly allocated data array.
 *
 * buffer      - buffer to fill; its data pointer is overwritten (not freed)
 *               and its size is set to the number of bytes read
 * buffer_size - capacity to allocate and maximum number of bytes to read
 * fp          - stream to read from
 *
 * Returns the number of bytes read. A return of 0 means nothing was stored:
 * end of file, a read error, or a failed allocation. In that case
 * buffer->data is NULL and buffer->size is 0.
 */
size_t Buffer_read( Buffer *buffer, size_t buffer_size, FILE* fp ) {
    buffer->size = 0;
    buffer->data = malloc(buffer_size);
    if( buffer->data == NULL ) {
        return 0;
    }

    /* Item size 1 makes the return value a byte count. */
    size_t bytes_read = fread(buffer->data, 1, buffer_size, fp);
    if( bytes_read == 0 ) {
        /* Nothing to keep: release the storage so an empty buffer stays small. */
        free(buffer->data);
        buffer->data = NULL;
    }

    buffer->size = bytes_read;
    return bytes_read;
}

Now that the buffer knows how much data it has read, we can allocate the data at any size we like. There's no need to hard-code that into the struct. This makes the code more flexible and efficient. It lets us cheaply allocate empty buffers, and that will make things much simpler.

We can also get away with using malloc and leaving buffer->data uninitialized (that is, containing garbage). If fread only partially fills buffer->data, the rest will remain garbage. That's ok: knowing the size of the data we've read means we stop printing before we hit the garbage.


Now we can construct our loop. When it's read 0 bytes we know it's done reading.

// Keep reading chunks; the loop ends when Buffer_read reports 0 bytes.
while( Buffer_read( buffer, BUFFER_SIZE, fp ) > 0 ) {
    ... now what ...
}
fclose(fp);

The way a linked list works, when you add to the list you add to tail->next and make that the new tail. This is often called "pushing".

/*
 * Links new_tail after tail and returns new_tail, so the caller can write
 * `tail = Buffer_push(tail, ...)` to keep track of the end of the list.
 */
Buffer *Buffer_push( Buffer *tail, Buffer *new_tail ) {
    return tail->next = new_tail;
}

Buffer *head = Buffer_new();
Buffer *tail = head;
while( Buffer_read( tail, BUFFER_SIZE, fp ) > 0 ) {
    tail = Buffer_push( tail, Buffer_new() );
}
fclose(fp);

Note that we start with an empty head which is also the tail. Starting with both of these allocated makes the loop much simpler: there's no need to check if( head ) or if( tail ). It does mean that we always have an empty buffer on the end of our list. That's ok. Since we're no longer using a fixed-size buffer->data, empty buffers are now tiny and cheap.


The final step is to print everything. We can already print a single buffer, so we just need to walk the linked list and print each buffer.

/*
 * Walks the list starting at head and prints every buffer with
 * Buffer_print(), in list order. A NULL head prints nothing.
 */
void Buffer_print_all( Buffer *head ) {
    Buffer *node = head;
    while( node != NULL ) {
        Buffer_print(node);
        node = node->next;
    }
}

// Print every buffer in the list, starting from head.
Buffer_print_all(head);

That final, empty buffer hanging off the end is fine. Its size is 0, so Buffer_print never reads its buffer->data.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM