memory error when splitting big file into smaller files in python
I have already read several posts, including this one, but none of them helped.
Here is the Python code I am currently using to split the file.
My input file is 15 GB and I am splitting it into 128 MB pieces. My computer has 8 GB of RAM.
import sys

def read_line(f_object, terminal_byte):
    line = ''.join(iter(lambda: f_object.read(1), terminal_byte))
    line += "\x01"
    return line

def read_lines(f_object, terminal_byte):
    tmp = read_line(f_object, terminal_byte)
    while tmp:
        yield tmp
        tmp = read_line(f_object, terminal_byte)

def make_chunks(f_object, terminal_byte, max_size):
    current_chunk = []
    current_chunk_size = 0
    for line in read_lines(f_object, terminal_byte):
        current_chunk.append(line)
        current_chunk_size += len(line)
        if current_chunk_size > max_size:
            yield "".join(current_chunk)
            current_chunk = []
            current_chunk_size = 0
    if current_chunk:
        yield ''.join(current_chunk)

inputfile = sys.argv[1]

with open(inputfile, "rb") as f_in:
    for i, chunk in enumerate(make_chunks(f_in, bytes(chr(1)), 1024*1000*128)):
        with open("out%d.txt" % i, "wb") as f_out:
            f_out.write(chunk)
When I execute the script, I get the following error:
Traceback (most recent call last):
  File "splitter.py", line 30, in <module>
    for i,chunk in enumerate(make_chunks(f_in, bytes(chr(1)),1024*1000*128)):
  File "splitter.py", line 17, in make_chunks
    for line in read_lines(f_object,terminal_byte):
  File "splitter.py", line 12, in read_lines
    tmp = read_line(f_object,terminal_byte)
  File "splitter.py", line 4, in read_line
    line = ''.join(iter(lambda:f_object.read(1),terminal_byte))
MemoryError
Question: splitting a big file into smaller files

Instead of searching for every \x01, do this only with the last chunk. Either reset the file pointer to offset+1 of the last found \x01 and continue, or write up to offset into the current chunk file and the remaining part of the chunk into the next chunk file.
Note: Your chunk_size should be io.DEFAULT_BUFFER_SIZE or a multiple of it. You gain no speedup by raising chunk_size to a high value. Read this related SO Q&A: Default buffer size for a file
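As a quick illustration (not part of the original answer), this is how one might inspect the platform's buffer size and derive a read size from it; the factor 16 is an arbitrary choice:

import io

# io.DEFAULT_BUFFER_SIZE is platform dependent, commonly 8192 bytes
print(io.DEFAULT_BUFFER_SIZE)

# Read in multiples of the default buffer size, e.g. 16 * 8192 = 128 KiB
chunk_size = 16 * io.DEFAULT_BUFFER_SIZE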
My example shows the use of resetting the file pointer, for instance:
import io

large_data = b"""Lorem ipsum\x01dolor sit\x01sadipscing elitr, sed\x01labore et\x01dolores et ea rebum.\x01magna aliquyam erat,\x01"""

def split(chunk_size, split_size):
    with io.BytesIO(large_data) as fh_in:
        _size = 0
        # Used to verify chunked writes
        result_data = io.BytesIO()

        while True:
            chunk = fh_in.read(chunk_size)
            print('read({})'.format(bytearray(chunk)))
            if not chunk: break

            _size += chunk_size
            if _size >= split_size:
                _size = 0
                # Split on last 0x01
                l = len(chunk)
                print('\tsplit_on_last_\\x01({})\t{}'.format(l, bytearray(chunk)))

                # Reverse iterate
                for p in range(l-1, -1, -1):
                    c = chunk[p:p+1]
                    if ord(c) == ord('\x01'):
                        offset = l-(p+1)

                        # Condition if \x01 is the last byte in chunk
                        if offset == 0:
                            print('\toffset={} write({})\t\t{}'.format(offset, l - offset, bytearray(chunk)))
                            result_data.write(chunk)
                        else:
                            # Reset file pointer
                            fh_in.seek(fh_in.tell()-offset)
                            print('\toffset={} write({})\t\t{}'.format(offset, l-offset, bytearray(chunk[:-offset])))
                            result_data.write(chunk[:-offset])
                        break
            else:
                print('\twrite({}) {}'.format(chunk_size, bytearray(chunk)))
                result_data.write(chunk)

        print('INPUT :{}\nOUTPUT:{}'.format(large_data, result_data.getvalue()))

if __name__ == '__main__':
    split(chunk_size=30, split_size=60)
Output:
read(bytearray(b'Lorem ipsum\x01dolor sit\x01sadipsci'))
    write(30) bytearray(b'Lorem ipsum\x01dolor sit\x01sadipsci')
read(bytearray(b'ng elitr, sed\x01labore et\x01dolore'))
    split_on_last_\x01(30)    bytearray(b'ng elitr, sed\x01labore et\x01dolore')
    offset=6 write(24)        bytearray(b'ng elitr, sed\x01labore et\x01')
read(bytearray(b'dolores et ea rebum.\x01magna ali'))
    write(30) bytearray(b'dolores et ea rebum.\x01magna ali')
read(bytearray(b'quyam erat,\x01'))
    split_on_last_\x01(12)    bytearray(b'quyam erat,\x01')
    offset=0 write(12)        bytearray(b'quyam erat,\x01')
read(bytearray(b''))
INPUT :b'Lorem ipsum\x01dolor sit\x01sadipscing elitr, sed\x01labore et\x01dolores et ea rebum.\x01magna aliquyam erat,\x01'
OUTPUT:b'Lorem ipsum\x01dolor sit\x01sadipscing elitr, sed\x01labore et\x01dolores et ea rebum.\x01magna aliquyam erat,\x01'
Tested with Python: 3.4.2
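Applied to the question's actual use case (a 15 GB file split into out%d.txt pieces), the same seek-back idea might look like the following sketch. This is only an assumption-laden adaptation, not part of the original answer: split_file is a hypothetical helper, and if a chunk at a piece boundary happens to contain no \x01, the sketch simply ends the piece there.

import io

def split_file(inputfile, split_size, chunk_size=io.DEFAULT_BUFFER_SIZE):
    # Hypothetical helper: split inputfile into pieces of about split_size
    # bytes, cutting only at \x01 record separators.
    part = 0
    written = 0
    fh_out = None
    with open(inputfile, "rb") as fh_in:
        while True:
            chunk = fh_in.read(chunk_size)
            if not chunk:
                break
            if fh_out is None:
                fh_out = open("out%d.txt" % part, "wb")
            if written + len(chunk) >= split_size:
                # This read crosses the piece boundary: cut at the last \x01
                p = chunk.rfind(b"\x01")
                if p != -1:
                    # Reset the file pointer so the bytes after the last \x01
                    # are re-read as the beginning of the next piece
                    fh_in.seek(fh_in.tell() - (len(chunk) - (p + 1)))
                    chunk = chunk[:p + 1]
                # Simplification: if no \x01 was found, the piece ends here anyway
                fh_out.write(chunk)
                fh_out.close()
                fh_out = None
                written = 0
                part += 1
            else:
                fh_out.write(chunk)
                written += len(chunk)
        if fh_out is not None:
            fh_out.close()

if __name__ == '__main__':
    import sys
    split_file(sys.argv[1], split_size=1024 * 1000 * 128)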