I'm learning how to raw deflate (no header or trailer information) & inflate data in C++, so I decided to try the zlib
and Crypto++
libraries. I've found that, when deflating the same file, Crypto++
sometimes adds 4 extra bytes (depending on the method used).
For example, for a file containing the following sequence, whitespaces included: 1 2 3 4 5 6
, deflating with zlib
produces a file of size 14 bytes.
This holds true for Crypto++ deflate_method1
, but for Crypto++ deflate_method2
, the file size is 18 bytes.
Also, when trying to inflate a file that was deflated using Crypto++ deflate_method2
with Crypto++ inflate_method1
, an exception is raised:
terminate called after throwing an instance of 'CryptoPP::Inflator::UnexpectedEndErr'
what(): Inflator: unexpected end of compressed block
Aborted (core dumped)
To compare, I did another test deflating/inflating with Python:
At this point, I would like to understand two things:
Why is there a discrepancy in the size of the deflated files?
Why Python is able to inflate any of the files but Crypto++ is being picky?
Input & output files as base64:
MSAyIDMgNCA1IDYK
M1QwUjBWMFEwVTDjAgA=
M1QwUjBWMFEwVTDjAgA=
M1QwUjBWMFEwVTDjAgA=
MlQwUjBWMFEwVTDjAgAAAP//
Zlib:
#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <iterator>
#include <fstream>
#include <iostream>
#include <sstream>
#include <vector>
#include "zlib.h"
constexpr uint32_t BUFFER_READ_SIZE = 128;
constexpr uint32_t BUFFER_WRITE_SIZE = 128;
bool mydeflate(std::vector<unsigned char> & input)
{
const std::string inputStream{ input.begin(), input.end() };
uint64_t inputSize = input.size();
// Create a string stream where output will be created.
std::stringstream outputStringStream(std::ios::in | std::ios::out | std::ios::binary);
// Initialize zlib structures.
std::vector<char *> readBuffer(BUFFER_READ_SIZE);
std::vector<char *> writeBuffer(BUFFER_WRITE_SIZE);
z_stream zipStream;
zipStream.avail_in = 0;
zipStream.avail_out = BUFFER_WRITE_SIZE;
zipStream.next_out = reinterpret_cast<Bytef *>(writeBuffer.data());
zipStream.total_in = 0;
zipStream.total_out = 0;
zipStream.data_type = Z_BINARY;
zipStream.zalloc = nullptr;
zipStream.zfree = nullptr;
zipStream.opaque = nullptr;
// Window bits is passed < 0 to tell that there is no zlib header.
if (deflateInit2_(&zipStream, Z_DEFAULT_COMPRESSION, Z_DEFLATED, -MAX_WBITS, 8, Z_DEFAULT_STRATEGY, ZLIB_VERSION, sizeof(zipStream)) != Z_OK)
{
return false;
}
// Deflate the input stream
uint32_t readSize = 0;
uint64_t dataPendingToCompress = inputSize;
uint64_t dataPendingToWrite = 0;
bool isEndOfInput = false;
while (dataPendingToCompress > 0)
{
if (dataPendingToCompress > BUFFER_READ_SIZE)
{
readSize = BUFFER_READ_SIZE;
}
else
{
readSize = dataPendingToCompress;
isEndOfInput = true;
}
// Copy the piece of input stream to the read buffer.
std::memcpy(readBuffer.data(), &inputStream[inputSize - dataPendingToCompress], readSize);
dataPendingToCompress -= readSize;
zipStream.next_in = reinterpret_cast<Bytef *>(readBuffer.data());
zipStream.avail_in = readSize;
// While there is input data to compress.
while (zipStream.avail_in > 0)
{
// Output buffer is full.
if (zipStream.avail_out == 0)
{
outputStringStream.write(reinterpret_cast<const char*>(writeBuffer.data()), dataPendingToWrite);
zipStream.total_in = 0;
zipStream.avail_out = BUFFER_WRITE_SIZE;
zipStream.next_out = reinterpret_cast<Bytef *>(writeBuffer.data());
dataPendingToWrite = 0;
}
uint64_t totalOutBefore = zipStream.total_out;
int zlibError = deflate(&zipStream, isEndOfInput ? Z_FINISH : Z_NO_FLUSH);
if ((zlibError != Z_OK) && (zlibError != Z_STREAM_END))
{
deflateEnd(&zipStream);
return false;
}
dataPendingToWrite += static_cast<uint64_t>(zipStream.total_out - totalOutBefore);
}
}
// Flush last compressed data.
while (dataPendingToWrite > 0)
{
if (dataPendingToWrite > BUFFER_WRITE_SIZE)
{
outputStringStream.write(reinterpret_cast<const char*>(writeBuffer.data()), BUFFER_WRITE_SIZE);
}
else
{
outputStringStream.write(reinterpret_cast<const char*>(writeBuffer.data()), dataPendingToWrite);
}
zipStream.total_in = 0;
zipStream.avail_out = BUFFER_WRITE_SIZE;
zipStream.next_out = reinterpret_cast<Bytef *>(writeBuffer.data());
uint64_t totalOutBefore = zipStream.total_out;
int zlibError = deflate(&zipStream, Z_FINISH);
if ((zlibError != Z_OK) && (zlibError != Z_STREAM_END))
{
deflateEnd(&zipStream);
return false;
}
dataPendingToWrite = static_cast<uint64_t>(zipStream.total_out - totalOutBefore);
}
deflateEnd(&zipStream);
const std::string & outputString = outputStringStream.str();
std::vector<unsigned char> deflated{outputString.begin(), outputString.end()};
std::cout << "Output String size: " << outputString.size() << std::endl;
input.swap(deflated);
return true;
}
int main(int argc, char * argv[])
{
std::ifstream input_file{"/tmp/test.txt"};
std::vector<unsigned char> data((std::istreambuf_iterator<char>(input_file)), std::istreambuf_iterator<char>());
std::cout << "Deflated: " << mydeflate(data) << '\n';
std::ofstream output_file{"/tmp/deflated.txt"};
output_file.write(reinterpret_cast<char *>(data.data()), data.size());
return 0;
}
Crypto++:
#include "cryptopp/files.h"
#include "cryptopp/zdeflate.h"
#include "cryptopp/zinflate.h"
void deflate_method1(const std::string & input_file_path, const std::string & output_file_path)
{
CryptoPP::Deflator deflator(new CryptoPP::FileSink(output_file_path.c_str(), true), CryptoPP::Deflator::DEFAULT_DEFLATE_LEVEL, CryptoPP::Deflator::MAX_LOG2_WINDOW_SIZE);
CryptoPP::FileSource fs(input_file_path.c_str(), true);
fs.TransferAllTo(deflator);
}
void inflate_method1(const std::string & input_file_path, const std::string & output_file_path)
{
CryptoPP::FileSource fs(input_file_path.c_str(), true);
CryptoPP::Inflator inflator(new CryptoPP::FileSink(output_file_path.c_str(), true));
fs.TransferAllTo(inflator);
}
void deflate_method2(const std::string& input_file_path, const std::string& output_file_path)
{
CryptoPP::Deflator deflator(new CryptoPP::FileSink(output_file_path.c_str(), true), CryptoPP::Deflator::DEFAULT_DEFLATE_LEVEL, 15);
std::ifstream file_in;
file_in.open(input_file_path, std::ios::binary);
std::string buffer;
size_t num_read = 0;
const size_t buffer_size(1024 * 1024);
buffer.resize(buffer_size);
file_in.read(const_cast<char*>(buffer.data()), buffer_size);
num_read = file_in.gcount();
while (num_read) {
deflator.ChannelPut(CryptoPP::DEFAULT_CHANNEL, reinterpret_cast<unsigned char*>(const_cast<char *>(buffer.data())), num_read);
file_in.read(const_cast<char*>(buffer.data()), buffer_size);
num_read = file_in.gcount();
}
file_in.close();
deflator.Flush(true);
}
void inflate_method2(const std::string& input_file_path, const std::string& output_file_path)
{
CryptoPP::Inflator inflator(new CryptoPP::FileSink(output_file_path.c_str(), true));
std::ifstream file_in;
file_in.open(input_file_path, std::ios::binary);
std::string buffer;
size_t num_read = 0;
const size_t buffer_size(1024 * 1024);
buffer.resize(buffer_size);
file_in.read(const_cast<char*>(buffer.data()), buffer_size);
num_read = file_in.gcount();
while (num_read) {
inflator.ChannelPut(CryptoPP::DEFAULT_CHANNEL, reinterpret_cast<unsigned char*>(const_cast<char *>(buffer.data())), num_read);
file_in.read(const_cast<char*>(buffer.data()), buffer_size);
num_read = file_in.gcount();
}
file_in.close();
inflator.Flush(true);
}
int main(int argc, char * argv[])
{
deflate_method1("/tmp/test.txt", "/tmp/deflated_method1.bin");
inflate_method1("/tmp/deflated_method1.bin", "/tmp/inflated_method1.txt");
deflate_method2("/tmp/test.txt", "/tmp/deflated_method2.bin");
inflate_method2("/tmp/deflated_method2.bin", "/tmp/inflated_method2.txt");
// This throws: Inflator: unexpected end of compressed block
inflate_method1("/tmp/deflated_method2.bin", "/tmp/inflated_with_method1_file_deflated_with_method2.txt");
return 0;
}
Python:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import zlib
def CHUNKSIZE():
return 128
def deflate(file_path, compression_level, method, wbits):
plain_data = None
deflated_data = bytearray()
deflator = zlib.compressobj(compression_level, method, wbits)
with open(file_path, 'rb') as input_file:
while True:
plain_data = input_file.read(CHUNKSIZE())
if not plain_data:
break
deflated_data += deflator.compress(plain_data)
deflated_data += deflator.flush()
return deflated_data
def inflate(file_path, wbits):
inflated_data = bytearray()
inflator = zlib.decompressobj(wbits)
with open(file_path, 'rb') as deflated_file:
buffer = deflated_file.read(CHUNKSIZE())
while buffer:
inflated_data += inflator.decompress(buffer)
buffer = deflated_file.read(CHUNKSIZE())
inflated_data += inflator.flush()
return inflated_data
def write_file(file_path, data):
with open(file_path, 'wb') as output_file:
output_file.write(data)
if __name__ == "__main__":
deflated_data = deflate("/tmp/test.txt", zlib.Z_DEFAULT_COMPRESSION, zlib.DEFLATED, -zlib.MAX_WBITS)
write_file("/tmp/deflated_python.bin", deflated_data)
The first three are working correctly, generating a valid deflate compressed stream with a single, last deflate block.
You "Crypto++ method2" is generating two deflate blocks, where the second one is an empty stored block that is not marked as the last block. This is not a valid deflate stream since it does not terminate. You are not correctly finishing the compression.
Your deflator.Flush(true)
is flushing the first block and emitting that empty stored block, without ending the deflate stream.
I'm not seeing much in the way of documentation, or really any at all, but looking at the source code, I would try deflator.EndBlock(true)
instead.
Update:
Per the comment below, EndBlock
is not public. Instead MessageEnd
is what is needed to terminate the deflate stream.
The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.