From d8ad0705e681955701d58e04a6478acf523b7899 Mon Sep 17 00:00:00 2001
From: Aaron Lav <asl2@pobox.com>
Date: Wed, 26 Feb 2020 12:45:46 -0600
Subject: [PATCH] Fix decompression for chunked transfers (issue #96)

---
 asks/http_utils.py       | 57 ++++++++++++++++++++++++++++++----------
 tests/test_http_utils.py | 42 +++++++++++++++++++++++++++++
 2 files changed, 85 insertions(+), 14 deletions(-)
 create mode 100644 tests/test_http_utils.py

diff --git a/asks/http_utils.py b/asks/http_utils.py
index c32575c..71fc820 100644
--- a/asks/http_utils.py
+++ b/asks/http_utils.py
@@ -3,19 +3,15 @@
 """
 
 
-__all__ = ['decompress', 'parse_content_encoding']
+__all__ = ['decompress', 'decompress_one', 'parse_content_encoding']
 
 
-from gzip import decompress as gdecompress
-from zlib import decompress as zdecompress
+import codecs
+from zlib import decompressobj, MAX_WBITS
 
 from .utils import processor
 
 
-_compression_mapping = {
-    'gzip': gdecompress,
-    'deflate': zdecompress
-}
 
 
 def parse_content_encoding(content_encoding: str) -> [str]:
@@ -26,11 +22,44 @@ def parse_content_encoding(content_encoding: str) -> [str]:
 @processor
 def decompress(compressions, encoding=None):
     data = b''
+    # https://tools.ietf.org/html/rfc7231
+    # "If one or more encodings have been applied to a representation, the
+    # sender that applied the encodings MUST generate a Content-Encoding
+    # header field that lists the content codings in the order in which
+    # they were applied."
+    # Thus, reversed(compressions); codings we cannot decode are skipped.
+    decompressors = [decompress_one(c) for c in reversed(compressions)
+                     if c in DECOMPRESS_WBITS]
+    if encoding:
+        decompressors.append(make_decoder_shim(encoding))
     while True:
-        if encoding:
-            data = yield data.decode(encoding, errors='replace')
-        else:
-            data = yield data
-        for compression in compressions:
-            if compression in _compression_mapping:
-                data = _compression_mapping[compression](data)
+        data = yield data
+        for decompressor in decompressors:
+            data = decompressor.send(data)
+
+# https://tools.ietf.org/html/rfc7230#section-4.2.1 - #section-4.2.3
+
+DECOMPRESS_WBITS = {
+    'deflate': MAX_WBITS,
+    'gzip': MAX_WBITS + 16,
+    'x-gzip': MAX_WBITS + 16,
+}
+
+@processor
+def decompress_one(compression):
+    """Generator: send() compressed chunks, get decompressed bytes back."""
+    decompressor = decompressobj(wbits=DECOMPRESS_WBITS[compression])
+    data = b''
+    while True:
+        # Loops forever, so decompressor.flush() is never reached or called.
+        data = yield data
+        data = decompressor.decompress(data)
+
+@processor
+def make_decoder_shim(encoding):
+    """Generator: decode sent bytes to text; decoder is never finalized."""
+    decoder = codecs.getincrementaldecoder(encoding)(errors='replace')
+    data = b''
+    while True:
+        data = yield data
+        data = decoder.decode(data)
diff --git a/tests/test_http_utils.py b/tests/test_http_utils.py
new file mode 100644
index 0000000..a5b0281
--- /dev/null
+++ b/tests/test_http_utils.py
@@ -0,0 +1,42 @@
+import zlib
+import gzip
+
+import pytest
+
+from asks import http_utils
+
+INPUT_DATA = b'abcdefghijklmnopqrstuvwxyz'
+UNICODE_INPUT_DATA = '\U0001f408\U0001F431' * 5
+
+@pytest.mark.parametrize('compressor,name',
+                         [(zlib.compress, 'deflate'),
+                          (gzip.compress, 'gzip')])
+def test_decompress_one_zlib(compressor, name):
+    """Each supported coding round-trips when fed one byte at a time."""
+    # compressor and name both come from the parametrization above.
+    data = compressor(INPUT_DATA)
+    decompressor = http_utils.decompress_one(name)
+    chunks = [decompressor.send(data[i:i + 1]) for i in range(len(data))]
+    result = b''.join(chunks)
+    assert result == INPUT_DATA
+
+def test_decompress():
+    """Stacked codings, fed one byte at a time, round-trip to the input."""
+    # Multiple codings are unlikely in the wild, but exercise them anyway:
+    # gzip is applied first, then deflate, hence ['gzip', 'deflate'].
+    payload = zlib.compress(gzip.compress(INPUT_DATA))
+    stream = http_utils.decompress(['gzip', 'deflate'])
+    pieces = [stream.send(payload[pos:pos + 1])
+              for pos in range(len(payload))]
+    result = b''.join(pieces)
+    assert result == INPUT_DATA
+
+def test_decompress_decoding():
+    """Byte-at-a-time deflate input comes back as the original text."""
+    # With encoding='utf-8' the pieces sent back are str, not bytes.
+    payload = zlib.compress(UNICODE_INPUT_DATA.encode('utf-8'))
+    stream = http_utils.decompress(['deflate'], encoding='utf-8')
+    pieces = [stream.send(payload[pos:pos + 1])
+              for pos in range(len(payload))]
+    result = ''.join(pieces)
+    assert result == UNICODE_INPUT_DATA