![](/img/trans.png)
[英]Read characters from file to end of line without using fread, fopen, fscanf, fgets
[英]Windows 10: fopen, fread, fgets and mixed EOL characters in large text file
我正在为计算材料科学家开发一个程序:
Atomes 可以导入包含原子坐标的大型文本文件,为此我使用fread
读取整个文件,然后使用 OpenMP 在 CPU 内核上拆分文本缓冲区。
在 Linux 和 Windows 上都运行得非常好,直到有人遇到了一个我没有预料到的问题:文件中混合且不规则地出现 EOL 符号(\n 和 \r)。
我找到了一种在 Windows 上解决该问题的方法,我想知道我采用的修正是否恰当,非常感谢您对我所做的事情提出建议/评论。
在该解决方案之前,我尝试向fopen
function 添加选项,例如-t
或-b
但没有效果。
最后我注意到,如果我更改编译选项并使用 fgets 函数从文件中读取数据,就没有问题;只是在这种情况下,对于大文件,数据的处理更复杂,到目前为止我还无法使用 OpenMP 将其并行化,并且需要更多时间。
这是我为读取文件而编写的代码:
// Reads the coordinates file `filename` into memory as a set of lines and
// passes the resulting line count to read_xyz_file().
//
// Two build variants are selected by the preprocessor:
//  - OPENMP defined: the whole file is read with one fread() into a single
//    buffer and coord_line[] is filled with pointers into that buffer.
//  - otherwise: the file is read line by line with fgets() into a linked list.
//
// Returns 1 if stat() fails (OPENMP build only) or if fopen() fails;
// otherwise returns whatever read_xyz_file() returns.
int open_coord_file (gchar * filename)
{
int res;
#ifdef OPENMP
// In that case I read the file in a single buffer, then work on that buffer
struct stat status;
res = stat (filename, & status);
if (res == -1)
{
// Basic function to store information on the reading process
add_reader_info ("Error - cannot get file statistics !");
return 1;
}
// NOTE(review): st_size is stored in an int here; a file larger than INT_MAX
// bytes would not be representable - confirm the expected file sizes.
int fsize = status.st_size;
#endif
// NOTE(review): mode "r" is text mode; on Windows the C runtime translates
// line endings in text mode, so the bytes delivered by fread() below may not
// match st_size - confirm (binary mode would be "rb").
coordf = fopen (filename, "r");
if (! coordf)
{
add_reader_info ("Error - cannot open coordinates file !");
return 1;
}
int i, j, k;
#ifdef OPENMP
// Zero-filled buffer of exactly fsize bytes (no extra terminating byte)
gchar * coord_content = g_malloc0(fsize*sizeof*coord_content);
// Using fread to read the entire file
// (the return value of fread is not checked)
fread (coord_content, fsize, 1, coordf);
fclose (coordf);
int linecount = 0;
// Evaluating the number of lines in the file:
for (j=0; j<fsize; j++) if (coord_content[j] == '\n') linecount ++;
#ifdef G_OS_WIN32
// What happens on Windows is that some '\r' symbols were found,
// and not on all lines, so I decided to check for '\r' symbols:
int neolr = 0;
for (j=0; j<fsize; j++) if (coord_content[j] == '\r') neolr ++;
// And modify the number of lines accordingly
linecount -= neolr;
#endif
// One pointer per line; the first line starts at the beginning of the buffer.
// NOTE(review): if linecount is 0 the store to coord_line[0] has no room - confirm.
coord_line = g_malloc0 (linecount*sizeof*coord_line);
coord_line[0] = & coord_content[0];
i = 1;
int nfsize = fsize;
#ifdef G_OS_WIN32
// Now deleting the corresponding EOL symbols in the text buffer.
// This is only required for Windows, and I am not sure that it is
// the proper way to do things; any thoughts on the matter would be appreciated.
for (j=0; j<fsize; j++)
{
if (coord_content[j] == '\n')
{
// Turn the end of line into a string terminator
coord_content[j] = '\0';
}
else if (coord_content[j] == '\r')
{
// Remove the '\r' by shifting the rest of the buffer one byte to the left.
// NOTE(review): the byte shifted into position j is not examined again,
// because the outer loop increments j right after this branch.
for (k=j; k<fsize-1; k++)
{
coord_content[k] = coord_content[k+1];
}
nfsize --;
}
}
#endif
// And referencing properly the lines to work on the buffer:
// every '\0' found marks the end of a line, the next byte starts the next line.
// NOTE(review): the only code in this function that writes '\0' over '\n'
// is inside the G_OS_WIN32 section above.
for (j=0; j<nfsize; j++)
{
if (coord_content[j] == '\0')
{
if (i < linecount)
{
coord_line[i] = & coord_content[j+1];
i++;
}
}
}
#else
// On the other hand, if OpenMP is turned off, then I use the fgets function
// to read the data from the text file; then there is no problem whatsoever
// with the EOL symbols and everything works smoothly.
// The fopen options being the same I am somewhat confused by this result.
gchar * buf = g_malloc0(LINE_SIZE*sizeof*buf);
// Node of the list holding one line of the file
struct line_node
{
gchar * line;//[LINE_SIZE];
struct line_node * next;
struct line_node * prev;
};
struct line_node * head = NULL;
struct line_node * tail = NULL;
// i counts the lines read
i = 0;
while (fgets(buf, LINE_SIZE, coordf))
{
if (head == NULL)
{
head = g_malloc0 (sizeof*head);
// NOTE(review): this allocation is overwritten by the next statement
// and is never freed in this function.
tail = g_malloc0 (sizeof*tail);
tail = head;
}
else
{
// Append a new node after the current tail (prev is not set here)
tail -> next = g_malloc0 (sizeof*tail -> next);
tail = tail -> next;
}
// Store a private copy of the line in the node
tail -> line = g_strdup_printf ("%s", buf);
// presumably removes the newline kept by fgets; substitute_string is
// defined elsewhere - verify
tail -> line = substitute_string (tail -> line, "\n", "\0");
i ++;
}
g_free (buf);
fclose (coordf);
#endif
// And then later in the code I process the data,
// providing i, the number of lines, as an input value.
return read_xyz_file (i);
任何建议将不胜感激。
[编辑]
我找到了一种方法来解决我的 fread 问题:使用带有 fgets 的临时缓冲区,这样我就可以再次轻松地使用 OpenMP 并行处理我的数据:
coord_line = g_malloc0 (i*sizeof*coord_line);
tail = head;
j = 0;
while (tail)
{
coord_line[j] = & tail -> line[0];
j ++;
tail = tail -> next;
}
现在一切都很好,尽管我仍然不知道为什么我遇到fread
问题
[/编辑]
SO 不是免费的代码编写服务。 然而,你已经表现出真正的努力,试图为自己解决这个问题。 逐点更正需要很长时间,所以这里有一个“代码转储”,应该很容易理解,并且(我希望)能做到你努力实现的目标。
这是用“普通的、朴素的”C 写的,没有使用您代码中出现的“gXXXX”函数。它会打开并加载整个(假定为)文本文件,如果存在 CR 则将其去除,然后把缓冲区按行切分,并把指向每一行的指针存入一个不断增长的指针数组。(空行也将被分配一个指针)一些 printf 行会报告处理过程的一些统计信息。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Load file contents to alloc'd memory, return pointer to buffer (to be free'd!)
// Load the whole file `fname` into a freshly allocated buffer.
//
// The file is opened in binary mode ("rb") so that no end-of-line translation
// takes place and the number of bytes read matches the size reported by
// ftell(). One extra byte is allocated and set to '\0' so the buffer can be
// used with the str* functions.
//
// Returns a pointer to the buffer; the caller must free() it.
// On any failure (open, seek/tell, allocation, short read) a message is
// printed to stderr and the program exits with status 1.
char *loadFile( const char *fname ) {
    FILE *fp = fopen( fname, "rb" );
    if( fp == NULL ) {
        fprintf( stderr, "Cannot open '%s'\n", fname );
        exit(1);
    }

    // ftell() returns a long and -1L on failure: check it BEFORE converting
    // to size_t, otherwise -1 becomes SIZE_MAX and "size + 1" wraps to 0.
    long end = -1L;
    if( fseek( fp, 0, SEEK_END ) != 0
     || ( end = ftell( fp ) ) < 0
     || fseek( fp, 0, SEEK_SET ) != 0 ) {
        fprintf( stderr, "Cannot determine size of '%s'\n", fname );
        fclose( fp );
        exit(1);
    }
    size_t size = (size_t)end;

    char *buf = malloc( size + 1 );
    if( buf == NULL ) {
        fprintf( stderr, "Malloc() failed\n" );
        fclose( fp );
        exit(1);
    }

    if( fread( buf, sizeof *buf, size, fp ) != size ) {
        fprintf( stderr, "Read incomplete\n" );
        free( buf );
        fclose( fp );
        exit(1);
    }
    fclose( fp );

    buf[size] = '\0'; // terminate the extra byte allocated (allows str functions to work)
    return buf;
}
// Demo driver: loads a text file into memory, removes every '\r', splits the
// buffer into lines in place and stores a pointer to each line in a growing
// array. Statistics and the first 5 lines are printed to stderr.
// Empty lines are kept and get their own (empty string) entry.
int main() {
    char *fname = "FOO.BAR"; // To be defined...
    char *fCont = loadFile( fname ), *d, *s;

    // crush out '\r', if any: s visits every byte, d only moves past bytes
    // that are kept, so each '\r' is overwritten by the byte that follows it
    for( d = fCont, s = fCont; (*d = *s) != '\0'; s++ )
        d += *d != '\r';
    // pointer differences are ptrdiff_t, hence %td
    fprintf( stderr, "Orig %td. Without CR %td\n", s - fCont, d - fCont );

    char **arr = NULL;
    size_t lcnt = 0; // lines stored so far
    size_t cap = 0;  // slots allocated in arr

    // Split on '\n' by hand: strtok() treats a run of delimiters as a single
    // one and would silently drop empty lines, shifting every later index.
    for( char *t = fCont; *t != '\0'; ) {
        if( lcnt == cap ) {
            // grow geometrically instead of calling realloc() once per line
            size_t ncap = cap ? cap * 2 : 1024;
            char **tmp = realloc( arr, ncap * sizeof *tmp );
            if( tmp == NULL ) {
                fprintf( stderr, "realloc() failed\n" );
                exit(1);
            }
            arr = tmp;
            cap = ncap;
        }
        arr[lcnt++] = t;

        char *nl = strchr( t, '\n' );
        if( nl == NULL )
            break;      // last line has no trailing newline
        *nl = '\0';     // terminate this line in place
        t = nl + 1;     // a final "\n" at end of file does not add an empty line
    }
    fprintf( stderr, "%zu lines loaded\n", lcnt );

    // "demo" the first 5 lines
    for( size_t i = 0; i < 5 && i < lcnt; i++ )
        fprintf( stderr, "%zu - '%s'\n", i + 1, arr[i] );

    /* process from arr[0] to arr[lcnt-1] */

    free( arr );
    free( fCont );
    return 0;
}
希望这能有所帮助。现在球在你这边了……
这可以通过 fopen 函数的 b 选项来解决:
// "rb" opens the file for reading in binary mode (matters on Windows)
coordf = fopen (filename, "rb");
之后fread
行为正常。
请注意,在我的第一次尝试中,我可能使用了以下错误的参数顺序:
// NOTE(review): "br" is not a valid mode string - the mode has to begin
// with 'r', 'w' or 'a', with 'b' coming after it
coordf = fopen (filename, "br");
那是行不通的。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.