Python: compress and save/load large data from/into memory
I have a huge dictionary with numpy arrays as values that consumes almost all of my RAM. It is impossible to pickle or compress it in its entirety. I have looked at a few solutions that read/write in chunks using zlib, but they work with files, StringIO and the like, whereas I want to read from and write into RAM.
Here is the closest example to what I want, but it only has the saving part. After saving this way, how can I read the object back, given that the chunks were written one after another and the compressed chunks of course have different lengths?
import zlib

class ZlibWrapper():
    # chunksize is used to save memory, otherwise huge object will be copied
    def __init__(self, filename, chunksize=268435456): # 256 MB
        self.filename = filename
        self.chunksize = chunksize

    def save(self, data):
        """Saves a compressed object to disk
        """
        mdata = memoryview(data)
        with open(self.filename, 'wb') as f:
            for i in range(0, len(mdata), self.chunksize):
                mychunk = zlib.compress(bytes(mdata[i:i+self.chunksize]))
                f.write(mychunk)

    def load(self):
        # ???
        return data
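One way to make load() work, sketched here as my own assumption rather than anything from the original post, is to have save() prefix every compressed chunk with its byte length, so the reader knows exactly how many bytes to hand to zlib.decompress():

import struct
import zlib

def save_chunks(filename, data, chunksize=268435456):
    """Compress data chunk by chunk, prefixing each compressed chunk with its length."""
    mdata = memoryview(data)
    with open(filename, 'wb') as f:
        for i in range(0, len(mdata), chunksize):
            chunk = zlib.compress(bytes(mdata[i:i + chunksize]))
            f.write(struct.pack('<Q', len(chunk)))  # 8-byte length header
            f.write(chunk)

def load_chunks(filename):
    """Read the length headers back and decompress one chunk at a time."""
    parts = []
    with open(filename, 'rb') as f:
        while True:
            header = f.read(8)
            if not header:
                break
            (length,) = struct.unpack('<Q', header)
            parts.append(zlib.decompress(f.read(length)))
    return b''.join(parts)

Alternatively, since each zlib.compress() call produces a complete zlib stream, zlib.decompressobj() can locate the stream boundaries itself via its unused_data attribute.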
Unfortunately, the uncompressed objects are too large to be sent over the network, and compressing them externally would add further complexity.
Unfortunately, pickle starts consuming RAM and the system hangs.
Following the discussion with Charles Duffy, here is my attempt at serialization (it does not work at the moment - it does not even compress the strings):
import zlib
import json
import numpy as np

mydict = {"a":np.array([1,2,3]),"b":np.array([4,5,6]),"c":np.array([0,0,0])}

#------------

# write to compressed stream ---------------------
def string_stream_serialization(dic):
    for key, val in dic.items():
        #key_encoded = key.encode("utf-8") # is not json serializable
        yield json.dumps([key,val.tolist()])

output = ""
compressor = zlib.compressobj()
decompressor = zlib.decompressobj()

stream = string_stream_serialization(mydict)

with open("outfile.compressed", "wb") as f:
    for s in stream:
        if not s:
            f.write(compressor.flush())
            break
        f.write(compressor.compress(s.encode('utf-8'))) # .encode('utf-8') converts to bytes

# read from compressed stream: --------------------
def read_in_chunks(file_object, chunk_size=1024): # I set another chunk size intentionally
    """Lazy function (generator) to read a file piece by piece.
    Default chunk size: 1k."""
    while True:
        data = file_object.read(chunk_size)
        if not data:
            break
        yield data

reconstructed = {}

with open("outfile.compressed", "rb") as f:
    for s in read_in_chunks(f):
        data = decompressor.decompress(decompressor.unconsumed_tail + s)
        while data:
            arr = json.loads(data.decode("utf-8"))
            reconstructed[arr[0]] = np.array(arr[1])
            data = decompressor.decompress(decompressor.unconsumed_tail)

print(reconstructed)
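For what it is worth, here is my own guess at how this streaming attempt could be made to work (not part of the original question): compressor.flush() above is never reached, because the generator never yields an empty string, and the reader has no way to tell where one JSON record ends and the next begins. Flushing after the loop and terminating each record with a newline addresses both issues:

import json
import zlib
import numpy as np

mydict = {"a": np.array([1, 2, 3]), "b": np.array([4, 5, 6]), "c": np.array([0, 0, 0])}

# write: one newline-terminated JSON record per key, all through one compressor
compressor = zlib.compressobj()
with open("outfile.compressed", "wb") as f:
    for key, val in mydict.items():
        record = json.dumps([key, val.tolist()]) + "\n"
        f.write(compressor.compress(record.encode("utf-8")))
    f.write(compressor.flush())  # emit whatever zlib is still buffering

# read: decompress fixed-size chunks and split the output on newlines
reconstructed = {}
decompressor = zlib.decompressobj()
buffer = b""
with open("outfile.compressed", "rb") as f:
    while True:
        chunk = f.read(1024)
        if not chunk:
            break
        buffer += decompressor.decompress(chunk)
        *records, buffer = buffer.split(b"\n")  # keep any trailing partial record
        for record in records:
            key, values = json.loads(record.decode("utf-8"))
            reconstructed[key] = np.array(values)

print(reconstructed)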
Your first focus should be on having a sane way to serialize and deserialize your data. The question itself and the comments give us some constraints on that data: the keys are strings, the values are numpy arrays, and the whole structure is too large to hold a second copy of in RAM.
This suggests a fairly simple implementation:
import io
import struct
import numpy

def serialize(f, content):
    for k,v in content.items():
        # write length of key, followed by key as string
        k_bstr = k.encode('utf-8')
        f.write(struct.pack('L', len(k_bstr)))
        f.write(k_bstr)
        # write length of value, followed by value in numpy.save format
        memfile = io.BytesIO()
        numpy.save(memfile, v)
        f.write(struct.pack('L', memfile.tell()))
        f.write(memfile.getvalue())

def deserialize(f):
    retval = {}
    while True:
        content = f.read(struct.calcsize('L'))
        if not content: break
        k_len = struct.unpack('L', content)[0]
        k_bstr = f.read(k_len)
        k = k_bstr.decode('utf-8')
        v_len = struct.unpack('L', f.read(struct.calcsize('L')))[0]
        v_bytes = io.BytesIO(f.read(v_len))
        v = numpy.load(v_bytes)
        retval[k] = v
    return retval
As a simple test:
test_file = io.BytesIO()
serialize(test_file, {
    "First Key": numpy.array([123,234,345]),
    "Second Key": numpy.array([321,432,543]),
})

test_file.seek(0)
print(deserialize(test_file))
...so, there we have it. Now, how do we add compression? Easily:
import gzip

with gzip.open('filename.gz', 'wb') as gzip_file:
    serialize(gzip_file, your_data)
...or, on the decompression side:
with gzip.open('filename.gz', 'rb') as gzip_file:
    your_data = deserialize(gzip_file)
The reason gzip works here is that the gzip library already streams data out as it is requested, rather than compressing or decompressing everything at once. There is no need to do the windowing and chunking yourself - just leave it to the lower layer.
To write the dictionary to disk, the zipfile module is a good fit.
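A minimal sketch of that idea, assuming one archive entry per dictionary key (the helper names here are mine, not from the answer):

import io
import zipfile
import numpy as np

def save_dict_zip(path, dic):
    """Store each value as a compressed '<key>.npy' entry inside one zip archive."""
    with zipfile.ZipFile(path, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
        for key, arr in dic.items():
            buf = io.BytesIO()
            np.save(buf, arr)
            zf.writestr(key + '.npy', buf.getvalue())

def load_dict_zip(path):
    """Rebuild the {key: array} dictionary from the archive, one entry at a time."""
    out = {}
    with zipfile.ZipFile(path, 'r') as zf:
        for name in zf.namelist():
            out[name[:-len('.npy')]] = np.load(io.BytesIO(zf.read(name)))
    return out

numpy.savez_compressed() does essentially the same thing in one call; looping over the zipfile entries yourself just gives you more control over how the data is produced and consumed.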