[英]Concatenating multiple files python
我正在尝试使用 Python 将多个文件合并为一个文件。我尝试了几种方法,但它们都会导致最终文件丢失某些行。文件的大小差异可能很大,所以我更倾向于使用不会把整个文件加载到内存中的方法。
我对这方面的了解有限,但我读到这可能是由写入缓冲造成的:数据不会立即写入文件,而是先暂存在内存中,之后才真正写入文件。
我尝试了多种方法来解决这个问题:使用 shutil.copyfileobj;使用 Python 传统的逐行读/写;在文件末尾添加标记并检查两个文件的尾部;在 file.flush 之后调用 os.fsync;最后还加上了几秒钟的 time.sleep。这些方法全都失败了。有人能推荐一种可靠的合并文件的方法吗?有些方法在我的本地 PC 上似乎可以正常工作,但在另一个系统(HPC)上运行时就会出错,所以这个问题有点难以复现。
这些是我迄今为止尝试过的所有方法:
#support functions
def tail(file_path):
    """Return the last line of a text file, or None if the file is empty.

    The file is streamed one line at a time, so the whole file is never held
    in memory. The returned line keeps its trailing newline when it has one.

    Args:
        file_path: Path of the text file to inspect.

    Returns:
        The last line as a string, or None when the file has no lines.
    """
    last_line = None
    with open(file_path) as file:
        # Iterating the handle yields one line at a time until end of file.
        for line in file:
            last_line = line
    return last_line
def wait_for_flush(output_file, tail_in, max_polls=100):
    """Block until the last line of ``output_file`` equals ``tail_in``.

    Polling happens in stages that share a single attempt counter: first
    wait (5 s per attempt) until the file exists, then wait (2 s per attempt)
    until it has a last line and that last line equals ``tail_in``.

    Args:
        output_file: Path of the file that is being written.
        tail_in: Line expected at the end of ``output_file``; compared with
            the value returned by ``tail(output_file)``.
        max_polls: Total number of attempts allowed across all stages.

    Raises:
        BrokenConcatenation: When more than ``max_polls`` attempts were made
            without the expected last line showing up.
    """
    attempts = 0
    while not file_exists(output_file):
        sleep(5)
        attempts += 1
        if attempts > max_polls:
            raise BrokenConcatenation(output_file)
    tail_out = tail(output_file)
    while tail_out != tail_in:
        # The file exists but has no last line yet: keep polling until some
        # content shows up before comparing again.
        while not tail_out:
            sleep(2)
            tail_out = tail(output_file)
            attempts += 1
            if attempts > max_polls:
                raise BrokenConcatenation(output_file)
        tail_out = tail(output_file)
        attempts += 1
        sleep(2)
        if attempts > max_polls:
            raise BrokenConcatenation(output_file)
def merge_two_files(file1, file2):
    """Append the contents of ``file2`` to the end of ``file1``.

    ``file2`` is read in text mode one line at a time, so it is never loaded
    into memory as a whole. ``file1`` is created when it does not exist;
    ``file2`` is not modified.

    Args:
        file1: Path of the file that receives the data (opened for append).
        file2: Path of the file whose lines are appended.
    """
    # Append-only mode: this function never reads from file1.
    with open(file1, 'a') as f1:
        with open(file2) as f2:
            for line in f2:
                f1.write(line)
        # Forcing disk write: flush Python's buffer first, then ask the OS to
        # write its own cache for this descriptor to disk.
        f1.flush()
        os.fsync(f1.fileno())
#main functions
def concat_files(output_file, list_file_paths, stdout_file=None, add_tag=False):
    """Merge the listed files, in list order, into ``output_file``.

    The inputs are consumed: every file after the first is appended to the
    first one and then deleted, and the first file is finally moved to
    ``output_file``.

    Args:
        output_file: Destination path of the merged file.
        list_file_paths: Iterable of input paths in the desired output order.
        stdout_file: File object that receives the progress message; ``None``
            prints to standard output.
        add_tag: Not used by this implementation.
    """
    print('Concatenating files into ', output_file, flush=True, file=stdout_file)
    list_files = list(list_file_paths)
    if not list_files:
        # Nothing to merge: still leave an (empty) output file behind
        # instead of failing on an empty list.
        with open(output_file, 'w'):
            pass
        return
    # Always append into the first file. Re-queuing the merged file behind
    # the remaining inputs would put later files ahead of earlier ones.
    target, *others = list_files
    for other in others:
        merge_two_files(target, other)
        sleep(1)
        os.remove(other)
    move_file(target, output_file)
def concat_files(output_file, list_file_paths, stdout_file=None, add_tag=False,
                 settle_seconds=2):
    """Concatenate files byte-for-byte into ``output_file``.

    The output is opened unbuffered in binary mode and each input is copied
    with ``shutil.copyfileobj``, which works in chunks rather than loading a
    whole file. After every input the output is synced to disk and the
    function pauses for ``settle_seconds``.

    Args:
        output_file: Destination path; created or truncated.
        list_file_paths: Iterable of input paths, copied in iteration order.
        stdout_file: File object that receives the progress message; ``None``
            prints to standard output.
        add_tag: Not used by this implementation.
        settle_seconds: Pause after each synced input, in seconds. A falsy
            value disables the pause.
    """
    print('Concatenating files into ', output_file, flush=True, file=stdout_file)
    with open(output_file, 'wb', buffering=0) as wfd:
        for f in list_file_paths:
            with open(f, 'rb') as fd:
                shutil.copyfileobj(fd, wfd)
            # Forcing disk write.
            wfd.flush()
            os.fsync(wfd.fileno())
            if settle_seconds:
                sleep(settle_seconds)
def concat_files(output_file, list_file_paths, stdout_file=None, add_tag=False):
    """Concatenate text files into ``output_file`` and verify each append.

    Every input is copied line by line. After each input the output is
    synced to disk and ``wait_for_flush`` is asked to confirm that the output
    now ends with the expected line: the tag line when ``add_tag`` is true,
    otherwise the last line copied from that input.

    Args:
        output_file: Destination path; created or truncated.
        list_file_paths: Iterable of input paths, copied in iteration order.
        stdout_file: File object that receives the progress message; ``None``
            prints to standard output.
        add_tag: When true, write a ``#<input path>`` line after each input.
    """
    print('Concatenating files into ', output_file, flush=True, file=stdout_file)
    with open(output_file, 'w+') as wfd:
        for f in list_file_paths:
            # Remember the last copied line while streaming, so the input
            # does not have to be read a second time to find it.
            tail_in = None
            with open(f) as fd:
                for line in fd:
                    wfd.write(line)
                    tail_in = line
            if add_tag:
                tail_in = '#' + f + '\n'
                wfd.write(tail_in)
            # forcing disk write
            wfd.flush()
            os.fsync(wfd.fileno())
            # An empty, untagged input adds nothing to the output, so there
            # is no new last line to wait for.
            if tail_in is not None:
                wait_for_flush(output_file, tail_in)
# The output is opened once, before the loop: opening it in 'wb' mode
# truncates the file, so re-opening it per input would keep only the last one.
def concat_files(output_file, list_file_paths, stdout_file=None):
    """Concatenate files byte-for-byte into ``output_file``.

    Args:
        output_file: Destination path; created or truncated once.
        list_file_paths: Iterable of input paths, copied in iteration order.
        stdout_file: File object that receives the progress message; ``None``
            prints to standard output.
    """
    print('Concatenating files into ', output_file, flush=True, file=stdout_file)
    with open(output_file, 'wb') as wfd:
        for f in list_file_paths:
            with open(f, 'rb') as fd:
                shutil.copyfileobj(fd, wfd)
            # Forcing disk write after each input.
            wfd.flush()
            os.fsync(wfd.fileno())
def concat_files(output_file, list_file_paths, stdout_file=None):
    """Concatenate text files line by line into ``output_file``.

    Args:
        output_file: Destination path; created or truncated.
        list_file_paths: Iterable of input paths, copied in iteration order.
        stdout_file: File object that receives the progress message; ``None``
            prints to standard output.
    """
    print('Concatenating files into ', output_file, flush=True, file=stdout_file)
    # Write-only mode: the output is never read back here.
    with open(output_file, 'w') as outfile:
        for f in list_file_paths:
            with open(f) as infile:
                # writelines consumes the handle as an iterator of lines, so
                # the input is streamed rather than loaded at once.
                outfile.writelines(infile)
            # Forcing disk write after each input.
            outfile.flush()
            os.fsync(outfile.fileno())
def concat_files(output_file, list_file_paths, stdout_file=None):
    """Concatenate files byte-for-byte into ``output_file``.

    Inputs are copied with ``shutil.copyfileobj``, which transfers data in
    chunks instead of loading a whole file.

    Args:
        output_file: Destination path; created or truncated.
        list_file_paths: Iterable of input paths, copied in iteration order.
        stdout_file: File object that receives the progress message; ``None``
            prints to standard output.
    """
    print('Concatenating files into ', output_file, flush=True, file=stdout_file)
    with open(output_file, 'wb') as wfd:
        for f in list_file_paths:
            with open(f, 'rb') as fd:
                shutil.copyfileobj(fd, wfd)
            # Forcing disk write: empty Python's buffer, then sync the
            # descriptor so the OS writes its cache out as well.
            wfd.flush()
            os.fsync(wfd.fileno())
如果您不想将大文件读入内存,我认为下面的代码应该可行:
def concat_files(output_file, list_file_paths):
    """Concatenate text files into ``output_file``, marking each file's end.

    Inputs are copied one line at a time. After each input a line of the
    form ``eof - <input path>`` is written so the end of every source file
    can be located in the result.

    Args:
        output_file: Destination path; created or truncated.
        list_file_paths: Iterable of input paths, copied in iteration order.
    """
    print('Concatenating files into', output_file)
    with open(output_file, 'w') as wfd:
        for f in list_file_paths:
            print(f, '...')
            last_line = ''
            with open(f) as fd:
                for line in fd:
                    wfd.write(line)
                    last_line = line
            # Keep the marker on a line of its own even when the input does
            # not end with a newline.
            if last_line and not last_line.endswith('\n'):
                wfd.write('\n')
            wfd.write(f'eof - {f}\n')  # mod to indicate end of this file
    print('Done.')
这会将 output_file 创建为一个新文件,并依次读取 list_file_paths 中的每个文件,一次一行地写入新文件。
更新:请参见代码中标有 mod 的那一行,它用于标记每个文件的结尾。
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.