[英]Concatenating multiple files python
我正在嘗試使用 python 將多個文件合並到一個文件中,我嘗試了幾種方法,但它們都導致最終文件在某些行上丟失。 文件的大小可以變化很大,所以我更喜歡使用不會將整個文件加載到 memory 的東西。
我對此的了解有點有限,但我讀到這可能是由於寫入緩沖,也就是文件沒有立即寫入,而是暫時保存在 memory 中,然后寫入文件。
我嘗試了多種方法來解決這個問題:使用shutil.copyfileobj,經典python的讀/寫,在文件末尾添加標簽,檢查兩個文件的尾部,使用file.flush后跟os.fsync,最后,添加幾秒鍾的 time.sleep。 一切都失敗了,任何人都可以就合並文件的可靠方法提出建議嗎? 有些方法在我的本地 PC 上似乎可以正常工作,但是在另一個系統 (HPC) 上嘗試時會發生錯誤,所以這有點難以復制。
這些是我迄今為止嘗試過的所有方法:
#support functions
def tail(file_path):
last_line = None
with open(file_path) as file:
line=file.readline()
while line:
last_line=str(line)
line=file.readline()
return last_line
def wait_for_flush(output_file,tail_in):
c = 0
while not file_exists(output_file):
sleep(5)
c += 1
if c > 100: raise BrokenConcatenation(output_file)
tail_out = tail(output_file)
while tail_out != tail_in:
while not tail_out:
sleep(2)
tail_out = tail(output_file)
c += 1
if c > 100: raise BrokenConcatenation(output_file)
tail_out = tail(output_file)
c += 1
sleep(2)
if c > 100: raise BrokenConcatenation(output_file)
def merge_two_files(file1,file2):
with open(file1, 'a+') as f1:
with open(file2) as f2:
line=f2.readline()
while line:
f1.write(line)
line=f2.readline()
#forcing disk write
f1.flush()
os.fsync(f1)
#main functions
def concat_files(output_file,list_file_paths,stdout_file=None,add_tag=False):
print('Concatenating files into ',output_file,flush=True,file=stdout_file)
print(output_file)
list_files=list(list_file_paths)
while len(list_files)>1:
file1=list_files.pop(0)
file2=list_files.pop(0)
merge_two_files(file1,file2)
sleep(1)
os.remove(file2)
list_files.append(file1)
final_file=list_files.pop()
move_file(final_file,output_file)
def concat_files(output_file,list_file_paths,stdout_file=None,add_tag=False):
print('Concatenating files into ',output_file,flush=True,file=stdout_file)
with open(output_file, 'wb',buffering=0) as wfd:
for f in list_file_paths:
with open(f,'rb') as fd:
shutil.copyfileobj(fd, wfd)
#forcing disk write
wfd.flush()
os.fsync(wfd)
sleep(2)
def concat_files(output_file,list_file_paths,stdout_file=None,add_tag=False):
print('Concatenating files into ',output_file,flush=True,file=stdout_file)
with open(output_file, 'w+') as wfd:
for f in list_file_paths:
with open(f) as fd:
line = fd.readline()
while line:
wfd.write(line)
line = fd.readline()
if add_tag:
tail_in='#'+f+'\n'
wfd.write(tail_in)
else: tail_in=tail(f)
# forcing disk write
wfd.flush()
os.fsync(wfd)
wait_for_flush(output_file,tail_in)
#resets file whenever we open file, doesnt work
def concat_files(output_file,list_file_paths,stdout_file=None):
print('Concatenating files into ',output_file,flush=True,file=stdout_file)
for f in list_file_paths:
with open(output_file, 'wb') as wfd:
with open(f,'rb') as fd:
shutil.copyfileobj(fd, wfd)
#forcing disk write
wfd.flush()
os.fsync(wfd)
def concat_files(output_file,list_file_paths,stdout_file=None):
print('Concatenating files into ',output_file,flush=True,file=stdout_file)
with open(output_file, 'w+') as outfile:
for f in list_file_paths:
with open(f) as infile:
line=infile.readline()
while line:
outfile.write(line)
line=infile.readline()
#forcing disk write
outfile.flush()
os.fsync(outfile)
def concat_files(output_file,list_file_paths,stdout_file=None):
print('Concatenating files into ',output_file,flush=True,file=stdout_file)
with open(output_file, 'wb') as wfd:
for f in list_file_paths:
with open(f,'rb') as fd:
shutil.copyfileobj(fd, wfd)
#forcing disk write
wfd.flush()
os.fsync(wfd)
如果您不想將大文件讀入 memory,我會說這應該可以工作:
def concat_files(output_file, list_file_paths):
print('Concatenating files into', output_file)
with open(output_file, 'w') as wfd:
for f in list_file_paths:
print(f, '...')
with open(f) as fd:
for line in fd:
wfd.write(line)
wfd.write(f'eof - {f}\n') # mod to indicate end of this file
print('Done.')
這應該將output_file
創建為一個新文件,並從list_file_paths
讀取每個文件,一次一行,寫入新文件。
更新:查看mod 以指示此文件的結尾
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.