連接多個文件 python

Question

我正在嘗試使用 Python 將多個文件合並到一個文件中。我嘗試了幾種方法，但它們都會導致最終文件丟失某些行。文件的大小差異可能很大，所以我更希望使用不會將整個文件加載到內存（memory）中的方法。

我對此了解有限，但我讀到這可能是由寫入緩沖造成的：數據不會立即寫入文件，而是先暫存在內存（memory）中，之後才真正寫入文件。

我嘗試了多種方法來解決這個問題：使用 shutil.copyfileobj；使用 Python 傳統的逐行讀/寫；在文件末尾添加標簽；比較兩個文件的最後一行；在 file.flush 之後調用 os.fsync；最後還加上了幾秒鐘的 time.sleep。這些方法全都失敗了，有人能就可靠的文件合並方法提出建議嗎？有些方法在我的本地 PC 上似乎可以正常工作，但在另一個系統（HPC）上運行時就會出錯，所以這個問題有點難以復現。

這些是我迄今為止嘗試過的所有方法：



#support functions
def tail(file_path):
    """Return the last line of the text file at ``file_path``.

    The file is opened in text mode with the default settings and read one
    line at a time.  The line is returned exactly as read, so it keeps its
    trailing newline when the file has one.

    Returns:
        The last line as a ``str``, or ``None`` when the file is empty.
    """
    final_line = None
    with open(file_path) as handle:
        # Every iteration overwrites the previous value, so whatever is
        # left after the loop is the last line of the file.
        for current in handle:
            final_line = current
    return final_line


def wait_for_flush(output_file, tail_in):
    """Poll ``output_file`` until its last line equals ``tail_in``.

    The function first waits, checking every 5 seconds, until
    ``file_exists(output_file)`` is true.  It then re-reads the last line of
    the file with ``tail`` every 2 seconds until that line equals
    ``tail_in``.

    A single poll counter is shared by all waiting loops.

    Raises:
        BrokenConcatenation: once the shared counter exceeds 100 polls; it
            is constructed with ``output_file``.
    """
    polls = 0

    # Phase 1: wait for the output file to show up at all.
    while not file_exists(output_file):
        sleep(5)
        polls += 1
        if polls > 100:
            raise BrokenConcatenation(output_file)

    # Phase 2: wait for the last line of the output to match.
    seen_tail = tail(output_file)
    while seen_tail != tail_in:
        # Nothing readable yet (``tail`` gave a falsy result): keep
        # re-reading before comparing again.
        while not seen_tail:
            sleep(2)
            seen_tail = tail(output_file)
            polls += 1
            if polls > 100:
                raise BrokenConcatenation(output_file)
        seen_tail = tail(output_file)
        polls += 1
        sleep(2)
        if polls > 100:
            raise BrokenConcatenation(output_file)

def merge_two_files(file1, file2):
    """Append the contents of ``file2`` to the end of ``file1``.

    Both files are handled in text mode and ``file2`` is copied one line at
    a time.  ``file1`` is opened in ``'a+'`` mode, so it is created when it
    does not exist yet.  ``file2`` is only read.

    After copying, the written data is flushed and ``os.fsync`` is called on
    ``file1`` before it is closed.
    """
    with open(file1, 'a+') as target:
        with open(file2) as source:
            for line in source:
                target.write(line)
        # Force the write to disk: empty Python's buffer first, then ask the
        # operating system to commit its own cache for this descriptor.
        target.flush()
        os.fsync(target.fileno())

#main functions

def concat_files(output_file,list_file_paths,stdout_file=None,add_tag=False):
    """Concatenate files in order by appending each one onto the first.

    This variant is destructive: the first input file grows in place, every
    other input file is deleted once it has been appended, and the first
    file is finally handed to ``move_file`` to become ``output_file``.

    The inputs are appended strictly in the order given.  (Merging in
    rotating pairs, as an earlier version did, scrambles the order whenever
    the number of files is not a power of two: ``[a, b, c]`` ended up as
    ``c, a, b``.)

    Args:
        output_file: Path the merged file is moved to.
        list_file_paths: Iterable of input paths, in the desired order.
        stdout_file: File object for the progress message; ``None`` means
            standard output.
        add_tag: Accepted for signature compatibility; not used here.
    """
    print('Concatenating files into ',output_file,flush=True,file=stdout_file)
    list_files = list(list_file_paths)
    if not list_files:
        # Nothing to merge: leave an empty output file behind instead of
        # failing with an IndexError on an empty list.
        with open(output_file, 'w'):
            pass
        return
    merged_file, *remaining = list_files
    for next_file in remaining:
        merge_two_files(merged_file, next_file)
        # Settling delay kept from the original workaround before the
        # source file is deleted.
        sleep(1)
        os.remove(next_file)
    move_file(merged_file, output_file)


def concat_files(output_file,list_file_paths,stdout_file=None,add_tag=False):
    """Concatenate files byte-for-byte into ``output_file``.

    Each input is streamed in chunks with ``shutil.copyfileobj``, so no file
    is loaded into memory as a whole.  ``output_file`` is created or
    truncated; an empty input list produces an empty output file.

    The output is opened with the default *buffered* binary writer on
    purpose.  With ``buffering=0`` the file is a raw stream whose ``write``
    may store only part of the data it is given, and ``copyfileobj`` does
    not check the return value, so bytes could be dropped silently.  The
    buffered writer keeps writing until everything has been accepted.

    Args:
        output_file: Path of the merged file.
        list_file_paths: Iterable of input paths, in the desired order.
        stdout_file: File object for the progress message; ``None`` means
            standard output.
        add_tag: Accepted for signature compatibility; not used here.
    """
    print('Concatenating files into ',output_file,flush=True,file=stdout_file)
    with open(output_file, 'wb') as wfd:
        for f in list_file_paths:
            with open(f,'rb') as fd:
                shutil.copyfileobj(fd, wfd)
            # Force the write to disk.  fsync only returns after the OS has
            # committed the data, so no additional sleep is needed here.
            wfd.flush()
            os.fsync(wfd.fileno())

def _extend_expected_tail(current_tail, written):
    """Return the output's last line after ``written`` has been appended.

    ``current_tail`` is the last line of the output so far (``None`` when
    nothing has been written).  If it is not newline-terminated, the new
    text continues that same line; otherwise the new text starts a line.
    """
    if current_tail is None or current_tail.endswith('\n'):
        return written
    return current_tail + written


def concat_files(output_file,list_file_paths,stdout_file=None,add_tag=False):
    """Concatenate text files line by line and verify each step on disk.

    After each input has been copied (and flushed / fsynced),
    ``wait_for_flush`` is asked to confirm that the last line of
    ``output_file`` is the expected one.

    The expected line is the last line of the *output* as written so far,
    tracked while copying.  It is not simply the last line of the input:
    the two differ when an input is empty, or when newly written text
    continues an earlier line that had no trailing newline.  Comparing
    against the input's last line in those cases can never succeed, even
    though the data was written correctly.  Tracking it here also avoids
    re-reading every input just to find its last line.

    Args:
        output_file: Path of the merged file (created or truncated).
        list_file_paths: Iterable of input paths, in the desired order.
        stdout_file: File object for the progress message; ``None`` means
            standard output.
        add_tag: When true, the text ``'#' + path + '\\n'`` is written after
            the content of each input.
    """
    print('Concatenating files into ',output_file,flush=True,file=stdout_file)
    expected_tail = None
    with open(output_file, 'w+') as wfd:
        for f in list_file_paths:
            with open(f) as fd:
                for line in fd:
                    wfd.write(line)
                    expected_tail = _extend_expected_tail(expected_tail, line)
            if add_tag:
                tag = '#'+f+'\n'
                wfd.write(tag)
                expected_tail = _extend_expected_tail(expected_tail, tag)
            # forcing disk write
            wfd.flush()
            os.fsync(wfd.fileno())
            wait_for_flush(output_file, expected_tail)


# Re-opens the output once per input file.  Opening it in 'wb' mode every
# time truncated it on each iteration, so only the last input survived; the
# output is now truncated once and then opened in append mode.
def concat_files(output_file,list_file_paths,stdout_file=None):
    """Concatenate files byte-for-byte, re-opening the output per input.

    ``output_file`` is created or emptied once up front, then opened in
    append mode for each input so that closing it after every file does not
    discard what earlier inputs wrote.  Inputs are streamed in chunks with
    ``shutil.copyfileobj``.

    Args:
        output_file: Path of the merged file.
        list_file_paths: Iterable of input paths, in the desired order.
        stdout_file: File object for the progress message; ``None`` means
            standard output.
    """
    print('Concatenating files into ',output_file,flush=True,file=stdout_file)
    # Truncate exactly once so stale content from a previous run is removed.
    with open(output_file, 'wb'):
        pass
    for f in list_file_paths:
        with open(output_file, 'ab') as wfd:
            with open(f,'rb') as fd:
                shutil.copyfileobj(fd, wfd)
            # forcing disk write
            wfd.flush()
            os.fsync(wfd.fileno())


def concat_files(output_file, list_file_paths, stdout_file=None):
    """Concatenate text files into ``output_file``, one line at a time.

    ``output_file`` is created or truncated.  Every input is opened in text
    mode and copied line by line; after each input the output is flushed
    and ``os.fsync`` is called on it.

    Args:
        output_file: Path of the merged file.
        list_file_paths: Iterable of input paths, in the desired order.
        stdout_file: File object for the progress message; ``None`` means
            standard output.
    """
    print('Concatenating files into ',output_file,flush=True,file=stdout_file)
    with open(output_file, 'w+') as merged:
        for source_path in list_file_paths:
            with open(source_path) as source:
                # A text file iterates line by line, so this writes each
                # line in turn without reading the whole file at once.
                merged.writelines(source)
            # Force the write to disk before moving on to the next input.
            merged.flush()
            os.fsync(merged.fileno())


def concat_files(output_file, list_file_paths, stdout_file=None):
    """Concatenate files byte-for-byte into ``output_file``.

    ``output_file`` is created or truncated and kept open while each input
    is streamed into it with ``shutil.copyfileobj``.  After every input the
    output is flushed and ``os.fsync`` is called on it.

    Args:
        output_file: Path of the merged file.
        list_file_paths: Iterable of input paths, in the desired order.
        stdout_file: File object for the progress message; ``None`` means
            standard output.
    """
    print('Concatenating files into ',output_file,flush=True,file=stdout_file)
    with open(output_file, 'wb') as merged:
        for source_path in list_file_paths:
            with open(source_path, 'rb') as source:
                shutil.copyfileobj(source, merged)
            # Force the write to disk before moving on to the next input.
            merged.flush()
            os.fsync(merged.fileno())

Answer 1

如果您不想將大文件整個讀入內存（memory），我認為下面的寫法應該可行：

def concat_files(output_file, list_file_paths):
    """Concatenate text files into ``output_file`` with an end marker each.

    ``output_file`` is created or truncated.  Every input is copied line by
    line, followed by a marker line ``eof - <path>`` that shows where that
    input ended.  Progress is printed to standard output: a header, one
    line per input, and ``Done.`` at the end.

    Args:
        output_file: Path of the merged file.
        list_file_paths: Iterable of input paths, in the desired order.
    """
    print('Concatenating files into', output_file)
    with open(output_file, 'w') as merged:
        for f in list_file_paths:
            print(f, '...')
            with open(f) as source:
                # A text file iterates line by line, so the input is never
                # held in memory as a whole.
                merged.writelines(source)
                # Marker showing where this input file ends.
                merged.write(f'eof - {f}\n')
    print('Done.')

這應該會將 output_file 創建為一個新文件，然後依次讀取 list_file_paths 中的每個文件，逐行寫入這個新文件。

更新：請參見代碼中標有 “mod” 注釋的那一行，它用來標示每個輸入文件的結尾。

連接多個文件 python

問題描述

1 個解決方案

解決方案1
0 2021-01-12 09:03:00

連接多個文件 python

問題描述

1 個解決方案

解決方案1 0 2021-01-12 09:03:00

解決方案1
0 2021-01-12 09:03:00