From d8ad0705e681955701d58e04a6478acf523b7899 Mon Sep 17 00:00:00 2001
From: Aaron Lav
Date: Wed, 26 Feb 2020 12:45:46 -0600
Subject: [PATCH] Fix decompression for chunked transfers (issue #96)

---
 asks/http_utils.py       | 62 +++++++++++++++++++++++++++++++---------
 tests/test_http_utils.py | 48 +++++++++++++++++++++++++++++++
 2 files changed, 96 insertions(+), 14 deletions(-)
 create mode 100644 tests/test_http_utils.py

diff --git a/asks/http_utils.py b/asks/http_utils.py
index c32575c..71fc820 100644
--- a/asks/http_utils.py
+++ b/asks/http_utils.py
@@ -3,19 +3,15 @@
 """
 
 
-__all__ = ['decompress', 'parse_content_encoding']
+__all__ = ['decompress', 'decompress_one', 'parse_content_encoding']
 
 
-from gzip import decompress as gdecompress
-from zlib import decompress as zdecompress
+import codecs
+from zlib import decompressobj, MAX_WBITS
 
 from .utils import processor
 
 
-_compression_mapping = {
-    'gzip': gdecompress,
-    'deflate': zdecompress
-}
 
 
 def parse_content_encoding(content_encoding: str) -> [str]:
@@ -26,11 +22,49 @@ def parse_content_encoding(content_encoding: str) -> [str]:
 @processor
 def decompress(compressions, encoding=None):
     data = b''
+    # https://tools.ietf.org/html/rfc7231
+    # "If one or more encodings have been applied to a representation, the
+    # sender that applied the encodings MUST generate a Content-Encoding
+    # header field that lists the content codings in the order in which
+    # they were applied."
+    # Thus, reversed(compressions).
+    # Content-coding names are case-insensitive, and codings we cannot
+    # decode (e.g. 'identity') are passed through untouched, as before.
+    codings = [coding.lower() for coding in reversed(compressions)]
+    decompressors = [decompress_one(coding) for coding in codings
+                     if coding in DECOMPRESS_WBITS]
+    if encoding:
+        decompressors.append(make_decoder_shim(encoding))
     while True:
-        if encoding:
-            data = yield data.decode(encoding, errors='replace')
-        else:
-            data = yield data
-        for compression in compressions:
-            if compression in _compression_mapping:
-                data = _compression_mapping[compression](data)
+        data = yield data
+        for decompressor in decompressors:
+            data = decompressor.send(data)
+
+# https://tools.ietf.org/html/rfc7230#section-4.2.1 - #section-4.2.3
+
+DECOMPRESS_WBITS = {
+    'deflate': MAX_WBITS,
+    'gzip': MAX_WBITS + 16,
+    'x-gzip': MAX_WBITS + 16,
+}
+
+@processor
+def decompress_one(compression):
+    """Coroutine: send() compressed chunks, receive decompressed bytes."""
+    data = b''
+    decompressor = decompressobj(
+        wbits=DECOMPRESS_WBITS[compression])
+    # The loop only ends when the generator is closed, so nothing placed
+    # after it could ever run; each chunk is emitted as soon as it arrives.
+    while True:
+        data = yield data
+        data = decompressor.decompress(data)
+
+@processor
+def make_decoder_shim(encoding):
+    """Coroutine: send() byte chunks, receive text decoded as `encoding`."""
+    data = b''
+    decoder = codecs.getincrementaldecoder(encoding)(errors='replace')
+    while True:
+        data = yield data
+        data = decoder.decode(data)
diff --git a/tests/test_http_utils.py b/tests/test_http_utils.py
new file mode 100644
index 0000000..a5b0281
--- /dev/null
+++ b/tests/test_http_utils.py
@@ -0,0 +1,48 @@
+import zlib
+import gzip
+
+import pytest
+
+from asks import http_utils
+
+INPUT_DATA = b'abcdefghijklmnopqrstuvwxyz'
+UNICODE_INPUT_DATA = '\U0001f408\U0001F431' * 5
+
+@pytest.mark.parametrize('compressor,name',
+                         [(zlib.compress, 'deflate'),
+                          (gzip.compress, 'gzip')])
+def test_decompress_one(compressor, name):
+    data = compressor(INPUT_DATA)
+    decompressor = http_utils.decompress_one(name)
+    result = b''
+    for i in range(len(data)):
+        b = data[i:i+1]
+        result += decompressor.send(b)
+    assert result == INPUT_DATA
+
+def test_decompress():
+    # we don't expect to see multiple compression types in the wild
+    # but test anyway
+    data = zlib.compress(gzip.compress(INPUT_DATA))
+    decompressor = http_utils.decompress(['gzip', 'deflate'])
+    result = b''
+    for i in range(len(data)):
+        b = data[i:i+1]
+        result += decompressor.send(b)
+    assert result == INPUT_DATA
+
+def test_decompress_unsupported_and_mixed_case():
+    # unknown codings are skipped; names are matched case-insensitively
+    data = gzip.compress(INPUT_DATA)
+    decompressor = http_utils.decompress(['identity', 'GZIP'])
+    assert decompressor.send(data) == INPUT_DATA
+
+def test_decompress_decoding():
+    data = zlib.compress(UNICODE_INPUT_DATA.encode('utf-8'))
+    decompressor = http_utils.decompress(['deflate'], encoding='utf-8')
+    result = ''
+    for i in range(len(data)):
+        b = data[i:i+1]
+        res = decompressor.send(b)
+        result += res
+    assert result == UNICODE_INPUT_DATA