From 4990b782da7e6a43fe22140f48d5f3d041d135c7 Mon Sep 17 00:00:00 2001
From: Lucian Plesea <LPlesea@esri.com>
Date: Tue, 13 Aug 2024 16:06:56 -0700
Subject: [PATCH 01/13] Faster mode

- FTL mode without the top bit step function (20% faster)
- Use inline constants instead of double barrel decoding (3% faster)
---
 QB3lib/QB3.h         |  7 +++-
 QB3lib/QB3decode.cpp |  2 +-
 QB3lib/QB3decode.h   | 98 +++++++++++++++++++++++++++++++++++++++++++-
 QB3lib/QB3encode.cpp |  4 +-
 QB3lib/QB3encode.h   | 23 ++++++-----
 test_qb3.cpp         | 16 +++++---
 6 files changed, 130 insertions(+), 20 deletions(-)
diff --git a/QB3lib/QB3.h b/QB3lib/QB3.h
index 359644d..5d623d0 100644
--- a/QB3lib/QB3.h
+++ b/QB3lib/QB3.h
@@ -53,7 +53,12 @@ enum qb3_mode {
     QB3M_RLE_H = 6, // QB3 Hilbert + RLE
     QB3M_CF_RLE_H = 7, // QB3 Hilbert + CF + RLE
 
-    QB3M_STORED = 255, // Raw bypass
+    // Faster and only slightly worse than base in many cases
+    // Hilbert curve but no bit-step, no CF, no RLE
+    QB3M_FTL = 8,
+    QB3M_END, // Marks the end of the settable modes
+
+    QB3M_STORED = 255, // Raw bypass, can't be requested
     QB3M_INVALID = -1 // Invalid mode
 }; // Best compression, one of the above
 
diff --git a/QB3lib/QB3decode.cpp b/QB3lib/QB3decode.cpp
index fac5a2f..2dfe423 100644
--- a/QB3lib/QB3decode.cpp
+++ b/QB3lib/QB3decode.cpp
@@ -149,7 +149,7 @@ decsp qb3_read_start(void* source, size_t source_size, size_t *image_size) {
     val >>= 8; // 40 bits left
     // Also check that the next 2 bytes are a signature
     if (p->nbands > QB3_MAXBANDS 
-        || (p->mode > qb3_mode::QB3M_BEST && p->mode != qb3_mode::QB3M_STORED)
+        || (p->mode >= qb3_mode::QB3M_END && p->mode != qb3_mode::QB3M_STORED)
         || 0 != (val & 0x8080) 
         || p->type > qb3_dtype::QB3_I64) {
         delete p;
diff --git a/QB3lib/QB3decode.h b/QB3lib/QB3decode.h
index c0d99bd..9df3b09 100644
--- a/QB3lib/QB3decode.h
+++ b/QB3lib/QB3decode.h
@@ -140,7 +140,7 @@ static std::pair<size_t, uint64_t> qb3dsztbl(uint64_t val, size_t rung) {
 // For rung 0, it works with 17bits or more
 // For rung 1, it works with 47bits or more
 // returns false on failure
-template<typename T>
+template<bool applystep = true, typename T>
 static bool gdecode(iBits& s, size_t rung, T* group, uint64_t acc, size_t abits) {
     assert(((rung > 1) && (abits <= 8))
         || ((rung == 1) && (abits <= 17)) // B2 + 1
@@ -266,6 +266,7 @@ static bool gdecode(iBits& s, size_t rung, T* group, uint64_t acc, size_t abits)
             }
         }
     }
+    if (applystep) // template parameter to avoid dead code
     if (0 == (group[B2 - 1] >> rung)) {
         auto stepp = step(group, rung);
         if (stepp < B2)
@@ -280,10 +281,105 @@ template<typename T> static T magsabs(T v) { return (v >> 1) + (v & 1); }
 // Multiply v(in magsign) by m(normal, positive)
 template<typename T> static T magsmul(T v, T m) { return magsabs(v) * (m << 1) - (v & 1); }
 
+// Streamlined decoding for FTL mode (no bit-step, no CF, no RLE), returns true if an error was detected
+template<typename T>
+static bool decodeFTL(uint8_t* src, size_t len, T* image, const decs& info)
+{
+    auto xsize(info.xsize), ysize(info.ysize), bands(info.nbands), stride(info.stride);
+    auto cband = info.cband;
+    static_assert(std::is_integral<T>() && std::is_unsigned<T>(), "Only unsigned integer types allowed");
+    constexpr size_t UBITS(sizeof(T) == 1 ? 3 : sizeof(T) == 2 ? 4 : sizeof(T) == 4 ? 5 : 6);
+    constexpr auto NORM_MASK((1ull << UBITS) - 1); // UBITS set
+    constexpr auto LONG_MASK(NORM_MASK * 2 + 1); // UBITS + 1 set
+    T prev[QB3_MAXBANDS] = {}, group[B2] = {};
+    size_t runbits[QB3_MAXBANDS] = {};
+    const uint16_t* dsw = sizeof(T) == 1 ? dsw3 : sizeof(T) == 2 ? dsw4 : sizeof(T) == 4 ? dsw5 : dsw6;
+    stride = stride ? stride : xsize * bands;
+    // Set up block offsets based on traversal order, defaults to HILBERT
+    uint64_t order(info.order);
+    order = order ? order : HILBERT;
+    size_t offset[B2] = {};
+    for (size_t i = 0; i < B2; i++) {
+        size_t n = (order >> ((B2 - 1 - i) << 2));
+        offset[i] = ((n >> 2) & 0b11) * stride + (n & 0b11) * bands;
+    }
+    iBits s(src, len);
+    bool failed(false);
+    for (size_t y = 0; y < ysize; y += B) {
+        // If the last row is partial, roll it up
+        if (y + B > ysize)
+            y = ysize - B;
+        for (size_t x = 0; x < xsize; x += B) {
+            // If the last column is partial, move it left
+            if (x + B > xsize)
+                x = xsize - B;
+            for (int c = 0; c < bands; c++) {
+                failed = s.empty();
+                uint64_t cs(0), abits(1), acc(s.peek());
+                if (acc & 1) { // Rung change
+                    cs = dsw[(acc >> 1) & LONG_MASK];
+                    abits = cs >> 12;
+                    failed |= (0 == (cs & TBLMASK)); // no signals
+                }
+                acc >>= abits;
+                // abits is never > 8, so it's safe to call gdecode
+                auto rung = (runbits[c] + cs) & NORM_MASK;
+                runbits[c] = rung;
+                if (rung > 1) { // longer codes
+                    failed |= !gdecode<false>(s, rung, group, acc, abits);
+                } else if (rung == 0) { // single bits, direct decoding
+                    if (0 != (acc & 1)) {
+                        abits += B2;
+                        for (size_t i = 0; i < B2; i++) {
+                            acc >>= 1;
+                            group[i] = static_cast<T>(1 & acc);
+                        }
+                    }
+                    else
+                        for (size_t i = 0; i < B2; i++)
+                            group[i] = static_cast<T>(0);
+                    s.advance(abits + 1);
+                }
+                else { // rung == 1
+                    for (size_t i = 0; i < B2; i++) {
+                        auto size = (0x31213121u >> ((acc & 7) << 2)) & 0xf;
+                        group[i] = T((0x30102010u >> ((acc & 7) << 2)) & 0xf);
+                        abits += size;
+                        acc >>= size;
+                    }
+                    s.advance(abits);
+                }
+                // Undo delta encoding for this block
+                auto prv = prev[c];
+                T* const blockp = image + y * stride + x * bands + c;
+                for (int i = 0; i < B2; i++)
+                    blockp[offset[i]] = prv += smag(group[i]);
+                prev[c] = prv;
+                if (failed) break;
+            } // Per band per block
+            if (failed) break;
+        } // per block
+        if (failed) break;
+        // For performance apply band delta per block strip, in linear order
+        for (size_t j = 0; j < B; j++) {
+            for (int c = 0; c < bands; c++) if (c != cband[c]) {
+                auto dimg = image + stride * (y + j) + c;
+                auto simg = image + stride * (y + j) + cband[c];
+                for (int i = 0; i < xsize; i++, dimg += bands, simg += bands)
+                    *dimg += *simg;
+            }
+        }
+    } // per strip
+    // It might not catch all errors
+    return failed || s.avail() > 7;
+}
+
 // reports most but not all errors, for example if the input stream is too short for the last block
 template<typename T>
 static bool decode(uint8_t *src, size_t len, T* image, const decs &info)
 {
+    if (info.mode == QB3M_FTL)
+        return decodeFTL(src, len, image, info);
     auto xsize(info.xsize), ysize(info.ysize), bands(info.nbands), stride(info.stride);
     auto cband = info.cband;
     static_assert(std::is_integral<T>() && std::is_unsigned<T>(), "Only unsigned integer types allowed");
diff --git a/QB3lib/QB3encode.cpp b/QB3lib/QB3encode.cpp
index 0e0d766..af7f8fe 100644
--- a/QB3lib/QB3encode.cpp
+++ b/QB3lib/QB3encode.cpp
@@ -135,7 +135,7 @@ size_t qb3_max_encoded_size(const encsp p) {
 }
 
 qb3_mode qb3_set_encoder_mode(encsp p, qb3_mode mode) {
-    if (mode <= qb3_mode::QB3M_BEST)
+    if (mode < qb3_mode::QB3M_END)
         p->mode = mode;
     // Default curve is HILBERT, change it if needed
     switch (p->mode) {
@@ -421,7 +421,7 @@ static size_t raw_size(encsp const &p) {
 int qb3_get_encoder_state(encsp p) { return p->error; }
 
 static bool is_fast(qb3_mode mode) {
-    return (QB3M_BASE_H == mode) || (QB3M_BASE_Z == mode);
+    return (QB3M_BASE_H == mode) || (QB3M_BASE_Z == mode) || (QB3M_FTL == mode);
 }
 
 // ONLY QB3M_BASE and QB3M_CF are supported here
diff --git a/QB3lib/QB3encode.h b/QB3lib/QB3encode.h
index e9b6ab3..d954fd9 100644
--- a/QB3lib/QB3encode.h
+++ b/QB3lib/QB3encode.h
@@ -147,7 +147,7 @@ static std::pair<size_t, uint64_t> qb3csztbl(uint64_t val, size_t rung) {
 // bitsused is used to choose the rung for encoding
 // If abits > 0, the accumulator is also pushed into the stream
 template <typename T>
-static void groupencode(T group[B2], T bitsused, oBits& s, uint64_t acc, size_t abits)
+static void groupencode(T group[B2], T bitsused, oBits& s, uint64_t acc, size_t abits, bool skipstep = false)
 {
     assert(abits <= 64);
     const size_t rung = topbit(bitsused | 1);
@@ -159,12 +159,15 @@ static void groupencode(T group[B2], T bitsused, oBits& s, uint64_t acc, size_t
         s.push(acc, abits);
         return;
     }
-    // Flip the last set rung bit if the rung bit sequence is a step down
-    // At least one rung bit has to be set, so it can't return 0
-    auto stepp = step(group, rung);
-    assert(stepp > 0); // At least one rung bit should be set
-    if (stepp <= B2)
-        group[stepp - 1] ^= static_cast<T>(1ull << rung);
+    size_t stepp(B2 + 1);
+    if (!skipstep) {
+        // Flip the last set rung bit if the rung bit sequence is a step down
+        // At least one rung bit has to be set, so it can't return 0
+        stepp = step(group, rung);
+        assert(stepp > 0); // At least one rung bit should be set
+        if (stepp <= B2)
+            group[stepp - 1] ^= static_cast<T>(1ull << rung);
+    }
     if (abits > 8) { // Just in case, a rung switch is 8 bits at most
         s.push(acc, abits);
         acc = abits = 0;
@@ -240,10 +243,10 @@ static void groupencode(T group[B2], T bitsused, oBits& s, uint64_t acc, size_t
 
 // Base QB3 group encode with code switch, returns encoded size
 template <typename T>
-static void groupencode(T group[B2], T bitsused, size_t oldrung, oBits& s) {
+static void groupencode(T group[B2], T bitsused, size_t oldrung, oBits& s, bool skipstep = false) {
     constexpr size_t UBITS = sizeof(T) == 1 ? 3 : sizeof(T) == 2 ? 4 : sizeof(T) == 4 ? 5 : 6;
     uint64_t acc = CSW[UBITS][(topbit(bitsused | 1) - oldrung) & ((1ull << UBITS) - 1)];
-    groupencode(group, bitsused, s, acc & TBLMASK, static_cast<size_t>(acc >> 12));
+    groupencode(group, bitsused, s, acc & TBLMASK, static_cast<size_t>(acc >> 12), skipstep);
 }
 
 // Group encode with cf
@@ -398,7 +401,7 @@ static int encode_fast(const T* image, oBits& s, encs &info)
                     }
                 }
                 prev[c] = prv;
-                groupencode(group, bitsused, runbits[c], s);
+                groupencode(group, bitsused, runbits[c], s, info.mode == QB3M_FTL);
                 runbits[c] = topbit(bitsused | 1);
             }
         }
diff --git a/test_qb3.cpp b/test_qb3.cpp
index 0cc6805..ed6b25b 100644
--- a/test_qb3.cpp
+++ b/test_qb3.cpp
@@ -61,7 +61,7 @@ vector<outT> toplus(vector<inT>& v, outT m) {
 template<typename T>
 void check(vector<uint8_t> &image, const Raster &raster,
     uint64_t m, int main_band = 0,
-    bool fast = 0, uint64_t q = 1, bool away = false)
+    bool fast = 0, uint64_t q = 1, bool away = false, bool ftl = false)
 {
     size_t xsize = raster.size.x;
     size_t ysize = raster.size.y;
@@ -93,7 +93,7 @@ void check(vector<uint8_t> &image, const Raster &raster,
     // This is sufficient to trigger the quanta encoding
     if (q > 1)
         qb3_set_encoder_quanta(qenc, q, away);
-    qb3_set_encoder_mode(qenc, fast ? qb3_mode::QB3M_BASE : qb3_mode::QB3M_BEST);
+    qb3_set_encoder_mode(qenc, ftl? qb3_mode::QB3M_FTL : fast ? qb3_mode::QB3M_BASE : qb3_mode::QB3M_BEST);
 
     t1 = high_resolution_clock::now();
     auto outsize = qb3_encode(qenc, static_cast<void *>(img.data()), outvec.data());
@@ -137,9 +137,7 @@ void check(vector<uint8_t> &image, const Raster &raster,
 
     time_span = duration_cast<duration<double>>(t2 - t1).count();
     cout << sizeof(T) * image.size() /time_span / 1024 / 1024 << '\t'
-        << time_span << '\t' << sizeof(T) << '\t' << m << '\t';
-    if (fast)
-        cout << "Fast";
+        << time_span << '\t' << sizeof(T) << '\t' << m << '\t' << (ftl ? "FTL" : fast ? "Fast" : "");
 
     if (q > 1) {
         auto hq = T(q / 2); // precision
@@ -530,21 +528,29 @@ int main(int argc, char **argv)
             cout << endl;
             check<uint64_t>(image, raster, 1, 1, true);
             cout << endl;
+            check<uint64_t>(image, raster, 1, 1, true, 1, 0, true);
+            cout << endl;
 
             check<uint32_t>(image, raster, 1, 1);
             cout << endl;
             check<uint32_t>(image, raster, 1, 1, true);
             cout << endl;
+            check<uint32_t>(image, raster, 1, 1, true, 1, 0, true);
+            cout << endl;
 
             check<uint16_t>(image, raster, 1, 1);
             cout << endl;
             check<uint16_t>(image, raster, 1, 1, true);
             cout << endl;
+            check<uint16_t>(image, raster, 1, 1, true, 1, 0, true);
+            cout << endl;
 
             check<uint8_t>(image, raster, 1, 1);
             cout << endl;
             check<uint8_t>(image, raster, 1, 1, true);
             cout << endl;
+            check<uint8_t>(image, raster, 1, 1, true, 1, 0, true);
+            cout << endl;
         }
         else if (raster.dt == ICDT_Int16 || raster.dt == ICDT_UInt16) {
             std::vector<uint16_t> image(params.get_buffer_size() / 2);

From 70f98ad329ab0c85fef0db9210d9b1430fce0daf Mon Sep 17 00:00:00 2001
From: Lucian Plesea <LPlesea@esri.com>
Date: Tue, 13 Aug 2024 16:24:39 -0700
Subject: [PATCH 02/13] Use inline constants when decoding index values

Doesn't make much difference since index encoding is seldom used
---
 QB3lib/QB3decode.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/QB3lib/QB3decode.h b/QB3lib/QB3decode.h
index 9df3b09..7133408 100644
--- a/QB3lib/QB3decode.h
+++ b/QB3lib/QB3decode.h
@@ -485,15 +485,15 @@ static bool decode(uint8_t *src, size_t len, T* image, const decs &info)
                         acc >>= (cs >> 12) - 1; // No flag
                         abits += (cs >> 12) - 1;
                         failed |= rung == 63; // TODO: Deal with 64bit overflow
-                        // 16 index values in group, max is 7, use rung 2
+                        // 16 index values in group, max group size is 7, use rung 2, accumulator is sufficient
                         T maxval(0);
                         for (int i = 0; i < B2; i++) {
-                            auto v = DRG[2][acc & 0xf];
-                            group[i] = static_cast<uint8_t>(v);
+                            unsigned int size = (0x4232423242324232ull >> ((acc & 0xf) << 2)) & 0xf;
+                            group[i] = T((0x7130612051304120ull >> ((acc & 0xf) << 2)) & 0xf);
+                            acc >>= size;
+                            abits += size;
                             if (maxval < group[i])
                                 maxval = group[i];
-                            acc >>= v >> 12;
-                            abits += v >> 12;
                         }
                         s.advance(abits);
                         T idxarray[B2 / 2] = {};

From 86e85c596f2f3f8ad0dd9f5d84d3b0b7a922ca35 Mon Sep 17 00:00:00 2001
From: Lucian Plesea <LPlesea@esri.com>
Date: Wed, 14 Aug 2024 14:06:09 -0700
Subject: [PATCH 03/13] Explicit use of 64bit integers when needed

---
 QB3lib/QB3.h         | 8 +++++---
 QB3lib/QB3common.h   | 4 ++--
 QB3lib/QB3decode.cpp | 4 ++--
 QB3lib/QB3decode.h   | 4 ++++
 4 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/QB3lib/QB3.h b/QB3lib/QB3.h
index 5d623d0..057c36a 100644
--- a/QB3lib/QB3.h
+++ b/QB3lib/QB3.h
@@ -18,6 +18,8 @@ Contributors:  Lucian Plesea
 #pragma once
 // For size_t
 #include <stddef.h>
+// For uint64_t
+#include <stdint.h>
 
 // CMake will generate LIBQB3_EXPORT linkage as needed
 #include <libqb3_export.h>
@@ -92,7 +94,7 @@ LIBQB3_EXPORT bool qb3_set_encoder_coreband(encsp p, size_t bands, size_t *cband
 
 // Sets quantization parameters, returns true on success
 // away = true -> round away from zero
-LIBQB3_EXPORT bool qb3_set_encoder_quanta(encsp p, size_t q, bool away);
+LIBQB3_EXPORT bool qb3_set_encoder_quanta(encsp p, uint64_t q, bool away);
 
 // Upper bound of encoded size, without taking the header into consideration
 LIBQB3_EXPORT size_t qb3_max_encoded_size(const encsp p);
@@ -141,10 +143,10 @@ LIBQB3_EXPORT void qb3_set_decoder_stride(decsp p, size_t stride);
 LIBQB3_EXPORT qb3_mode qb3_get_mode(const decsp p);
 
 // Returns the number of quantization bits used, returns 0 if failed
-LIBQB3_EXPORT size_t qb3_get_quanta(const decsp p);
+LIBQB3_EXPORT uint64_t qb3_get_quanta(const decsp p);
 
 // Return the scanning curve used, returns 0 if failed
-LIBQB3_EXPORT size_t qb3_get_order(const decsp p);
+LIBQB3_EXPORT uint64_t qb3_get_order(const decsp p);
 
 // Sets the cband array and returns true if successful
 LIBQB3_EXPORT bool qb3_get_coreband(const decsp p, size_t *cband);
diff --git a/QB3lib/QB3common.h b/QB3lib/QB3common.h
index cafedd1..c6da41c 100644
--- a/QB3lib/QB3common.h
+++ b/QB3lib/QB3common.h
@@ -86,7 +86,7 @@ struct encs {
     size_t nbands;
     // micro block scanning order
     uint64_t order;
-    size_t quanta;
+    uint64_t quanta;
 
     // Persistent state by band
     band_state band[QB3_MAXBANDS];
@@ -109,7 +109,7 @@ struct decs {
     size_t stride;
     // micro block scanning order
     uint64_t order;
-    size_t quanta;
+    uint64_t quanta;
     int error;
     int stage;
 
diff --git a/QB3lib/QB3decode.cpp b/QB3lib/QB3decode.cpp
index 2dfe423..4609d08 100644
--- a/QB3lib/QB3decode.cpp
+++ b/QB3lib/QB3decode.cpp
@@ -48,13 +48,13 @@ qb3_mode qb3_get_mode(const decsp p) {
     return p->mode;
 }
 
-size_t qb3_get_quanta(const decsp p) {
+uint64_t qb3_get_quanta(const decsp p) {
     if (p->stage != 2)
         return 0; // Error
     return p->quanta;
 }
 
-size_t qb3_get_order(const decsp p) {
+uint64_t qb3_get_order(const decsp p) {
     if (p->stage != 2)
         return 0; // Error
     return p->order ? p->order : ZCURVE;
diff --git a/QB3lib/QB3decode.h b/QB3lib/QB3decode.h
index 7133408..cb9b17c 100644
--- a/QB3lib/QB3decode.h
+++ b/QB3lib/QB3decode.h
@@ -174,6 +174,10 @@ static bool gdecode(iBits& s, size_t rung, T* group, uint64_t acc, size_t abits)
         }
         else if (2 == rung) { // max symbol len is 4, there are at least 14 in the accumulator
             // Use inline constants as nibble tables
+            // Faster than a double value table decode, but only in this specific code organization
+            // Cleaning it up, for example doing a peek at the start then looping 16 times, makes it slower
+            // The masks and inline constants could be smaller for size, but that eliminates the
+            // common expression, making it slower
             unsigned int size;
             for (size_t i = 0; i < 14; i++) {
                 size = (0x4232423242324232ull >> ((acc & 0xf) << 2)) & 0xf;

From fdd89eb6402f89b3e36665c08ce5239b502e7898 Mon Sep 17 00:00:00 2001
From: Lucian Plesea <LPlesea@esri.com>
Date: Wed, 14 Aug 2024 14:46:43 -0700
Subject: [PATCH 04/13] Force use of export header collocated with the QB3 one

---
 QB3lib/QB3.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/QB3lib/QB3.h b/QB3lib/QB3.h
index 057c36a..11b9b3b 100644
--- a/QB3lib/QB3.h
+++ b/QB3lib/QB3.h
@@ -22,7 +22,7 @@ Contributors:  Lucian Plesea
 #include <stdint.h>
 
 // CMake will generate LIBQB3_EXPORT linkage as needed
-#include <libqb3_export.h>
+#include "libqb3_export.h"
 
 // Keep this close to plain C so it can have a C API
 #define QB3_MAXBANDS 16

From 68ed92330cf8c26798d430b86426489d4fc12562 Mon Sep 17 00:00:00 2001
From: Lucian Plesea <LPlesea@esri.com>
Date: Wed, 14 Aug 2024 14:49:22 -0700
Subject: [PATCH 05/13] Add wasm test program

---
 attic/world.cpp | 186 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 186 insertions(+)
 create mode 100644 attic/world.cpp

diff --git a/attic/world.cpp b/attic/world.cpp
new file mode 100644
index 0000000..821228f
--- /dev/null
+++ b/attic/world.cpp
@@ -0,0 +1,186 @@
+/*
+  Prototype use of QB3 library in wasm, with emscripten
+  This is a simple test program that reads a PPM image, encodes it and then decodes it
+  The image is then displayed in a window
+
+  Once libQB3.a is built using emscripten and a test ppm image is available,
+  use the following command to compile this program into a runnable html file:
+
+  emcc -O2 -o world.html world.cpp libQB3.a --preload-file Image.ppm -sUSE_SDL=2
+  
+  Current performance is 300MB/sec encode and 270MB/sec decode, RGB 8 bit data
+*/
+
+#include "../include/QB3.h"
+#include <cstdio>
+#include <cstdlib>
+#include <emscripten.h>
+
+#include <SDL.h>
+
+int read_ppm_header(FILE *f, int &x, int &y, int &zmax) { // Binary PPM (P6), returns 0 if OK
+	char line[1024]; // Expects magic, size and max value on separate lines, no comment lines
+	char *v = f ? fgets(line, sizeof(line), f) : nullptr;
+	if (!v || v[0] != 'P' || v[1] != '6') return 1; // Error, can't read or not P6
+	v = fgets(line, sizeof(line), f);
+	if (!v || 2 != sscanf(v, "%d %d", &x, &y)) return 1; // Error, no image size
+	v = fgets(line, sizeof(line), f);
+	if (!v || 1 != sscanf(v, "%d", &zmax)) return 1; // Error, no max value
+	return 0;
+}
+
+// Display in a 1024 * 1024 window, image x and y both have to be at least 1024
+void display_data(int x, char * data) {
+	if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_EVENTS) < 0)
+		printf("Can't initialize SDL\n");
+	// if (IMG_Init(IMG_INIT_PNG) == 0)
+	// 	printf("Error SDL2_Image Initialization\n");
+
+	SDL_Window *window = SDL_CreateWindow("Display",
+		SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, 1024, 1024, SDL_WINDOW_OPENGL
+		);
+	if (!window) printf("Create window failed\n");
+
+	SDL_Renderer *renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED);
+
+	SDL_Texture *texture = SDL_CreateTexture(renderer,
+		SDL_PIXELFORMAT_RGB24, SDL_TEXTUREACCESS_STATIC, 1024, 1024);
+	if (!texture)
+		printf("Can't create texture\n");
+
+	if (SDL_UpdateTexture(texture, NULL, data, x * 3))
+		printf("Can't update texture\n");
+	SDL_RenderClear(renderer);
+	// This scales it to full size, which is a mistake
+	SDL_RenderCopy(renderer, texture, NULL, NULL);
+	SDL_RenderPresent(renderer);
+
+	SDL_DestroyTexture(texture);
+	SDL_DestroyRenderer(renderer);
+	// Should clean up the window too
+}
+
+// Single image decode and display
+int main2(int argc, char **argv) {
+	FILE *f = fopen("Image.qb3", "rb");
+	if (!f) {
+		printf("Didn't work\n");
+		return 1;
+	}
+
+	int x, y, bands;
+	fseek(f, 0, SEEK_END);
+	auto fsize = ftell(f);
+	rewind(f);
+	auto buffer = malloc(fsize);
+	fread(buffer, fsize, 1, f);
+	size_t image_size[3]; // x, y and bands
+	auto qdec = qb3_read_start(buffer, size_t(fsize), image_size);
+	x = image_size[0];
+	y = image_size[1];
+	bands = image_size[2];
+	printf("Image is %dx%d@%d\n", x, y, bands);
+	qb3_read_info(qdec);
+	auto tp = qb3_get_type(qdec);
+	if (tp != QB3_U8) {
+		free(buffer);
+		qb3_destroy_decoder(qdec);
+
+		printf("Not byte data\n");
+		return 1;
+	}
+
+	auto raw_size = qb3_decoded_size(qdec);
+	char *raw_buffer = (char *)malloc(raw_size);
+	// Final decode
+	qb3_read_data(qdec, raw_buffer);
+	qb3_destroy_decoder(qdec);
+	display_data(x, raw_buffer);
+	return 0;
+}
+
+int main(int argc, char **argv) {
+	FILE *f=fopen("Image.ppm", "rb");
+	
+	int x, y, max_val;
+	read_ppm_header(f, x, y, max_val);
+	printf("%d %d %d\n", x, y, max_val);
+	int raw_size = x * y * 3;
+	printf("Raw size %d\n", raw_size);
+	char *data = static_cast<char *>(malloc(x * y * 3));
+	fread(data, 3 * x, y, f);
+	qb3_dtype tp = qb3_dtype::QB3_U8;
+	auto qenc = qb3_create_encoder(x, y, 3, tp);
+	auto maxsz = qb3_max_encoded_size(qenc);
+	char *outbuff = static_cast<char *>(malloc(maxsz));
+	qb3_destroy_encoder(qenc);
+	int loops = 50;
+	auto C = 1e-3 * raw_size * loops;
+	// Twice, first is warmup then actual
+	for (int j = 0; j < 2; j++)
+	{
+		double encode_time(0), decode_time(0);
+		for (int i = 0; i < loops; i++)
+		{
+			// printf("Loop %d %f\n", i, encode_time);
+			qenc = qb3_create_encoder(x, y, 3, tp);
+			auto start = emscripten_get_now();
+			qb3_set_encoder_mode(qenc, QB3M_FTL);
+			auto outsize = qb3_encode(qenc, data, outbuff);
+			auto stop = emscripten_get_now();
+			encode_time += stop - start;
+			// printf("Compressed to %lu\n", outsize);
+			// printf("Encoding took %f\n", stop - start);
+			// Let's try decompression too
+			qb3_destroy_encoder(qenc);
+
+			//
+			size_t image_size[3];
+			auto qdec = qb3_read_start(outbuff, outsize, image_size);
+			x = image_size[0];
+			y = image_size[1];
+			max_val = image_size[2];
+			// printf("Got size %u %u %u\n", x, y, max_val);
+			qb3_read_info(qdec);
+			start = emscripten_get_now();
+			if (raw_size != qb3_decoded_size(qdec))
+				printf("Size error on decode\n");
+			if (!qb3_read_data(qdec, data))
+				printf("decode error\n");
+			stop = emscripten_get_now();
+			// printf("Decoding took %f ms\n", stop - start);
+			qb3_destroy_decoder(qdec);
+			decode_time += stop - start;
+		}
+		printf("Time (ms)      Encode: %f, Decode: %f\n", encode_time / loops, decode_time / loops);
+		printf("Speed (MB/sec) Encode: %f, Decode: %f\n", C / encode_time, C / decode_time);
+	}
+
+	free(outbuff);
+	if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_EVENTS) < 0)
+		printf("Can't initialize SDL\n");
+	// if (IMG_Init(IMG_INIT_PNG) == 0)
+	// 	printf("Error SDL2_Image Initialization\n");
+
+	SDL_Window *window = SDL_CreateWindow("First program",
+		SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, x, y, SDL_WINDOW_OPENGL
+		);
+	if (!window) printf("Create window failed\n");
+
+	SDL_Renderer *renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED);
+
+	SDL_Texture *texture = SDL_CreateTexture(renderer,
+		SDL_PIXELFORMAT_RGB24, SDL_TEXTUREACCESS_STATIC, 1024, 1024);
+	if (!texture)
+		printf("Can't create texture\n");
+
+	if (SDL_UpdateTexture(texture, NULL, data, x * 3))
+		printf("Can't update texture\n");
+	SDL_RenderClear(renderer);
+	SDL_RenderCopy(renderer, texture, NULL, NULL);
+	SDL_RenderPresent(renderer);
+
+	free(data);
+	
+	return 0;
+}

From 40e321c3c7b27a527645b159d12af4e8b301b559 Mon Sep 17 00:00:00 2001
From: Lucian Plesea <LPlesea@esri.com>
Date: Wed, 14 Aug 2024 15:53:48 -0700
Subject: [PATCH 06/13] Add fast-er mode to cqb3

---
 cqb3.cpp | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/cqb3.cpp b/cqb3.cpp
index 9c15ab0..e7fab85 100644
--- a/cqb3.cpp
+++ b/cqb3.cpp
@@ -41,7 +41,8 @@ struct options {
         trim(false),
         rle(false), // non-default RLE (off for best, on for fast)
         legacy(false), // legacy mode
-        verbose(false), 
+        verbose(false),
+        ftl(false),
         decode(false)
     {};
 
@@ -56,6 +57,7 @@ struct options {
     bool rle; // Skip RLE
     bool legacy; // Legacy mode
     bool verbose;
+    bool ftl; // Fastest compression
     bool decode;
 };
 
@@ -68,6 +70,7 @@ int Usage(const options &opt) {
         << "\n"
         << "Compression only options:\n"
         << "\t-b : best compression\n"
+        << "\t-f : fastest compression\n"
         << "\t-l : legacy mode (deprecated)\n"
         << "\t-q <n> : quanta\n"
         << "\t-r : reverse RLE behavior, off for best, on for fast\n"
@@ -110,6 +113,9 @@ bool parse_args(int argc, char** argv, options& opt) {
             case 'd':
                 opt.decode = true;
                 break;
+            case 'f':
+                opt.ftl = true;
+                break;
             case 't':
                 opt.trim = true;
                 break;
@@ -161,6 +167,14 @@ bool parse_args(int argc, char** argv, options& opt) {
         return false;
     }
 
+    // FTL mode excludes best, rle and legacy, turn them off
+    // In theory, legacy mode would work with ftl, but it is not supported
+    if (opt.ftl) {
+        opt.best = false;
+        opt.rle = false;
+        opt.legacy = false;
+    }
+
     // If output file name is not provided, extract from input file name
     if (opt.out_fname.empty()) {
         string fname(opt.in_fname);
@@ -203,6 +217,7 @@ const char *mode_string(qb3_mode m) {
     case QB3M_CF: return "Legacy CF";
     case QB3M_RLE: return "Legacy Base + RLE";
     case QB3M_CF_RLE: return "Legacy CF + RLE";
+    case QB3M_FTL: return "Fast";
     case QB3M_STORED: return "Stored";
     default:
         return "Unknown mode";
@@ -421,6 +436,9 @@ int encode(Raster &raster, std::vector<std::uint8_t> &image, std::vector<std::ui
             }
         }
 
+        if (opts.ftl)
+            mode = QB3M_FTL;
+
         if (mode != qb3_set_encoder_mode(qenc, mode)) {
             cerr << "Invalid mode\n";
             throw 1;

From 54de2247be757643800054f5713b204c7f9ab432 Mon Sep 17 00:00:00 2001
From: Lucian Plesea <LPlesea@esri.com>
Date: Wed, 14 Aug 2024 18:03:41 -0700
Subject: [PATCH 07/13] Add option to quantize away from zero

---
 cqb3.cpp | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/cqb3.cpp b/cqb3.cpp
index e7fab85..ed48537 100644
--- a/cqb3.cpp
+++ b/cqb3.cpp
@@ -58,6 +58,7 @@ struct options {
     bool legacy; // Legacy mode
     bool verbose;
     bool ftl; // Fastest compression
+    bool away; // quantize away from zero
     bool decode;
 };
 
@@ -124,8 +125,20 @@ bool parse_args(int argc, char** argv, options& opt) {
                 break;
             case 'q':
                 opt.quanta = 2; // Default
-                if ((i < argc) && isdigit(argv[i + 1][0]))
-                    opt.quanta = strtoull(argv[++i], nullptr, 10);
+                opt.away = false; // Stays false unless the value is prefixed by +
+                if (i + 1 < argc) { // Only look at the next argument if there is one
+                    // If the first char is +, we quantize away from zero
+                    auto c = argv[i + 1][0]; // First char
+                    if (isdigit(static_cast<unsigned char>(c))) {
+                        opt.quanta = strtoull(argv[i + 1], nullptr, 10);
+                        i++;
+                    }
+                    else if (c == '+') {
+                        opt.away = true;
+                        opt.quanta = strtoull(argv[i + 1] + 1, nullptr, 10);
+                        i++;
+                    }
+                }
                 break;
             case 'm':
                 // The next parameter is a comma separated band list if it starts with a digit
@@ -444,12 +457,12 @@ int encode(Raster &raster, std::vector<std::uint8_t> &image, std::vector<std::ui
             throw 1;
         }
         if (opts.quanta > 1) {
-            if (!qb3_set_encoder_quanta(qenc, opts.quanta, true)) {
+            if (!qb3_set_encoder_quanta(qenc, opts.quanta, opts.away)) {
                 cerr << "Invalid quanta\n";
                 throw 1;
             }
             else if (opts.verbose) {
-                cout << "Lossy compression, quantized by " << opts.quanta << endl;
+                cout << "Lossy compression, quantized by " << (opts.away ? "+" : "") << opts.quanta << endl;
             }
         }
         t1 = high_resolution_clock::now();

From 9e0667d9fcbcb8773a1a2773b75b9d9c6d6badd0 Mon Sep 17 00:00:00 2001
From: Lucian Plesea <LPlesea@esri.com>
Date: Wed, 14 Aug 2024 18:03:57 -0700
Subject: [PATCH 08/13] updated cqb3 documentation

---
 cqb3.md | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/cqb3.md b/cqb3.md
index 5cd2a99..d061ba2 100644
--- a/cqb3.md
+++ b/cqb3.md
@@ -28,6 +28,10 @@ Decompress. Reads a QB3 formatted file and writes a PNG.
 -b
 Best. Turns on the **best** QB3 compression mode, which is slower but can produce better compression, especially for larger integer types.
 
+-f
+Fast. Turns on the **fast** QB3 mode, which is faster than the default by about 10% while losing less than 0.5% of the compression. It is not
+compatible with the RLE mode.
+
 -m <a,b,c,...>
 band Mapping control. For images with more than one channel, QB3 can apply a band decorrelation filter which improves the compression. It does this
 by subtracting one band from another. On decompression the effect of the filter is removed and the output image is identical to the input.
@@ -43,6 +47,17 @@ while band 1 (green) is to be subtracted from the 0 (red) and 2 (blue) bands. If
 input, the unspecified band mappings are left unmodified (core). Following the same logic, the -m option with no parameters is equivalent to the
 identity mapping, -m 0,1,2,... For RGBI (infrared) imagery, the 1,1,1,1 might be better than the default, which leaves the last band as is.
 The QB3 compressor will adjust the band input mapping if the values are not valid, a warning will be printed by cqb3 when this happens.
+A special case is "-m x", which tries all possible band mappings and selects the one that gives the best compression. This is only valid for 3 or 4
+band images. Note that if this option is provided, it will take about 9 times longer to finish the compression.
+
+-r
+RLE. A run length encoding is applied after the QB3 compression. This can improve the compression ratio, especially for images with large areas of
+constant values. The RLE encoding is lossless, the original image is restored on decompression. The RLE encoding is not compatible with the fast mode.
+
+-l
+Legacy microblock scan order. Uses the Morton (Z) scan order for the 4x4 pixel blocks in the QB3 compression. This is the original scan order used
+in the QB3 compressor. The default scan order is the Hilbert scan order, which results in better compression for most images. Use of this option is
+not recommended.
 
 -t
 Trim. QB3 compression operates on 4x4 pixel blocks. When the input image size is not a multiple of 4x4, libQB3 will internally encode a few lines
@@ -50,3 +65,11 @@ and columns more than once. This may result in an output size that is slightly l
 the input image will be trimmed to a multiple of 4x4 pixels before compression to QB3. The output QB3 raster size will reflect this trimmed size.
 1, 2 or three lines and/or columns will be trimmed, in the last, then first, then last again order, as necessary to make the respective dimension 
 a multiple of 4.
+
+-q <val>
+Lossy encoding, by quantizing (dividing) input values by a small integer before doing the actual QB3 encoding. On decoding, the decoded values are
+multiplied by the same value, restoring the normal range. The division is done with rounding, toward zero by default, it can be changed to rounding 
+away from zero by adding a + sign before the value. If this option is not followed by a value, the default value of 2 is used. The value must be
+within the range valid for the input data type.
+
+

From 3842fdb81b0f55b1c342d5c5b418fdb8f4ae3012 Mon Sep 17 00:00:00 2001
From: Lucian Plesea <LPlesea@esri.com>
Date: Thu, 15 Aug 2024 12:51:33 -0700
Subject: [PATCH 09/13] Pre-shift accumulator to make decoding even faster.

---
 QB3lib/QB3decode.h | 99 ++++++++++++++++++++++++++--------------------
 1 file changed, 57 insertions(+), 42 deletions(-)

diff --git a/QB3lib/QB3decode.h b/QB3lib/QB3decode.h
index cb9b17c..a9a66b1 100644
--- a/QB3lib/QB3decode.h
+++ b/QB3lib/QB3decode.h
@@ -164,9 +164,11 @@ static bool gdecode(iBits& s, size_t rung, T* group, uint64_t acc, size_t abits)
         if (1 == rung) {
             // Use inline constants as nibble tables
             // The lower two bits of the accumulator determine the size
+            // Preshift accumulator
+            acc <<= 2;
             for (size_t i=0; i < B2; i++) {
-                auto size = (0x31213121u >> ((acc & 7) << 2)) & 0xf;
-                group[i] = T((0x30102010u >> ((acc & 7) << 2)) & 0xf);
+                auto size = (0x31213121u >> (acc & 0b11100)) & 0xf;
+                group[i] = T((0x30102010u >> (acc & 0b11100)) & 0xf);
                 abits += size;
                 acc >>= size;
             }
@@ -178,24 +180,26 @@ static bool gdecode(iBits& s, size_t rung, T* group, uint64_t acc, size_t abits)
             // Cleaning it up, for example doing a peek at the start then looping 16 times, makes it slower
             // The masks and inline constants could be smaller for size, but that eliminates the
             // common expression, making it slower
+            // pre-shift accumulator, top 2 bits are not needed
+            acc <<= 2;
             unsigned int size;
             for (size_t i = 0; i < 14; i++) {
-                size = (0x4232423242324232ull >> ((acc & 0xf) << 2)) & 0xf;
-                group[i] = T((0x7130612051304120ull >> ((acc & 0xf) << 2)) & 0xf);
+                size = (0x4232423242324232ull >> (acc & 0b111100)) & 0xf;
+                group[i] = T((0x7130612051304120ull >> (acc & 0b111100)) & 0xf);
                 abits += size;
                 acc >>= size;
             }
-            if (abits > 56) { // Rare
-                s.advance(abits);
+            if (abits > 56) { // Rare, max is 60, there are still 2 safe bits
+                s.advance(abits - 2);
                 acc = s.peek();
-                abits = 0;
+                abits = 2;
             }
-            size = (0x4232423242324232ull >> ((acc & 0xf) << 2)) & 0xf;
-            group[14] = T((0x7130612051304120ull >> ((acc & 0xf) << 2)) & 0xf);
+            size = (0x4232423242324232ull >> (acc & 0b111100)) & 0xf;
+            group[14] = T((0x7130612051304120ull >> (acc & 0b111100)) & 0xf);
             acc >>= size;
             abits += size;
-            size = (0x4232423242324232ull >> ((acc & 0xf) << 2)) & 0xf;
-            group[15] = T((0x7130612051304120ull >> ((acc & 0xf) << 2)) & 0xf);
+            size = (0x4232423242324232ull >> (acc & 0b111100)) & 0xf;
+            group[15] = T((0x7130612051304120ull >> (acc & 0b111100)) & 0xf);
             s.advance(abits + size);
         }
         else if (6 > rung) { // Table decode at 3,4 and 5, half of the values per accumulator
@@ -270,8 +274,8 @@ static bool gdecode(iBits& s, size_t rung, T* group, uint64_t acc, size_t abits)
             }
         }
     }
-    if (applystep) // template parameter to avoid dead code
-    if (0 == (group[B2 - 1] >> rung)) {
+    // template parameter to avoid a test when not needed
+    if (applystep && (0 == (group[B2 - 1] >> rung))) {
         auto stepp = step(group, rung);
         if (stepp < B2)
             group[stepp] ^= static_cast<T>(1ull << rung);
@@ -318,6 +322,8 @@ static bool decodeFTL(uint8_t* src, size_t len, T* image, const decs& info)
             if (x + B > xsize)
                 x = xsize - B;
             for (int c = 0; c < bands; c++) {
+                auto prv = prev[c];
+                T* const blockp = image + y * stride + x * bands + c;
                 failed = s.empty();
                 uint64_t cs(0), abits(1), acc(s.peek());
                 if (acc & 1) { // Rung change
@@ -329,33 +335,41 @@ static bool decodeFTL(uint8_t* src, size_t len, T* image, const decs& info)
                 // abits is never > 8, so it's safe to call gdecode
                 auto rung = (runbits[c] + cs) & NORM_MASK;
                 runbits[c] = rung;
-                if (rung > 1) { // longer codes
-                    failed |= !gdecode<false>(s, rung, group, acc, abits);
-                } else if (rung == 0) { // single bits, direct decoding
-                    if (0 != (acc & 1)) {
-                        abits += B2;
-                        for (size_t i = 0; i < B2; i++) {
-                            acc >>= 1;
-                            group[i] = static_cast<T>(1 & acc);
+                if (rung < 2) { // decode inlined
+                    if (rung == 0) { // single bits or all zeros
+                        abits++;
+                        if (0 != (acc & 1)) {
+                            abits += B2;
+                            for (int i = 0; i < B2; i++) {
+                                acc >>= 1;
+                                blockp[offset[i]] = prv -= (1 & acc);
+                            }
+                            prev[c] = prv;
+                        }
+                        else {
+                            for (int i = 0; i < B2; i++)
+                                blockp[offset[i]] = prv;
                         }
                     }
-                    else
-                        for (size_t i = 0; i < B2; i++)
-                            group[i] = static_cast<T>(0);
-                    s.advance(abits + 1);
-                }
-                else { // rung == 1
-                    for (size_t i = 0; i < B2; i++) {
-                        auto size = (0x31213121u >> ((acc & 7) << 2)) & 0xf;
-                        group[i] = T((0x30102010u >> ((acc & 7) << 2)) & 0xf);
-                        abits += size;
-                        acc >>= size;
+                    else { // rung == 1
+                        // Use inline constants as nibble tables
+                        // The lower two bits of the accumulator determine the size
+                        // Shift the accumulator to the left to place the selector in the right place
+                        acc <<= 2;
+                        for (size_t i = 0; i < B2; i++) {
+                            auto size = (0x31213121u >> (acc & 0b11100)) & 0xf;
+                            blockp[offset[i]] = prv += smag(T((0x30102010u >> (acc & 0b11100)) & 0xf));
+                            abits += size;
+                            acc >>= size;
+                        }
+                        prev[c] = prv;
                     }
                     s.advance(abits);
+                    continue;
                 }
+                // longer codes
+                failed |= !gdecode<false>(s, rung, group, acc, abits);
                 // Undo delta encoding for this block
-                auto prv = prev[c];
-                T* const blockp = image + y * stride + x * bands + c;
                 for (int i = 0; i < B2; i++)
                     blockp[offset[i]] = prv += smag(group[i]);
                 prev[c] = prv;
@@ -422,9 +436,8 @@ static bool decode(uint8_t *src, size_t len, T* image, const decs &info)
                 acc >>= abits;
                 if (0 == cs || 0 != (cs & TBLMASK)) { // Normal decoding, not a signal
                     // abits is never > 8, so it's safe to call gdecode
-                    auto rung = (runbits[c] + cs) & NORM_MASK;
+                    auto rung = runbits[c] = (runbits[c] + cs) & NORM_MASK;
                     failed |= !gdecode(s, rung, group, acc, abits);
-                    runbits[c] = rung;
                 }
                 else { // extra encoding
                     cs = dsw[acc & LONG_MASK]; // rung, no flag
@@ -490,23 +503,25 @@ static bool decode(uint8_t *src, size_t len, T* image, const decs &info)
                         abits += (cs >> 12) - 1;
                         failed |= rung == 63; // TODO: Deal with 64bit overflow
                         // 16 index values in group, max group size is 7, use rung 2, accumulator is sufficient
-                        T maxval(0);
+                        T maxidx(0);
+                        acc <<= 2; // preshift accumulator
                         for (int i = 0; i < B2; i++) {
-                            unsigned int size = (0x4232423242324232ull >> ((acc & 0xf) << 2)) & 0xf;
-                            group[i] = T((0x7130612051304120ull >> ((acc & 0xf) << 2)) & 0xf);
+                            unsigned int size = (0x4232423242324232ull >> (acc & 0b111100)) & 0xf;
+                            group[i] = T((0x7130612051304120ull >> (acc & 0b111100)) & 0xf);
                             acc >>= size;
                             abits += size;
-                            if (maxval < group[i])
-                                maxval = group[i];
+                            if (maxidx < group[i])
+                                maxidx = group[i];
                         }
                         s.advance(abits);
                         T idxarray[B2 / 2] = {};
-                        for (size_t i = 0; i <= maxval; i++) {
+                        for (size_t i = 0; i <= maxidx; i++) {
                             acc = s.peek();
                             auto v = qb3dsztbl(acc, rung);
                             s.advance(v.first);
                             idxarray[i] = T(v.second);
                         }
+                        // Apply idxarray to group
                         for (int i = 0; i < B2; i++)
                             group[i] = idxarray[group[i]];
                     }

From e75c91042405240cf37342ad846589f71bddea55 Mon Sep 17 00:00:00 2001
From: Lucian Plesea <LPlesea@esri.com>
Date: Wed, 4 Sep 2024 10:46:12 -0700
Subject: [PATCH 10/13] Add define for FTL mode

---
 QB3lib/QB3.h       | 3 +++
 QB3lib/QB3decode.h | 6 +++---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/QB3lib/QB3.h b/QB3lib/QB3.h
index 11b9b3b..3bdc8a5 100644
--- a/QB3lib/QB3.h
+++ b/QB3lib/QB3.h
@@ -36,6 +36,9 @@ typedef struct decs * decsp; // decoder
 // Types
 enum qb3_dtype { QB3_U8 = 0, QB3_I8, QB3_U16, QB3_I16, QB3_U32, QB3_I32, QB3_U64, QB3_I64 };
 
+// To check if the library has QB3M_FTL
+#define QB3_HAS_FTL 1
+
 // Encode mode, default is fastest, best is best compression
 enum qb3_mode {
     // Aliases, values might change
diff --git a/QB3lib/QB3decode.h b/QB3lib/QB3decode.h
index a9a66b1..7d74a17 100644
--- a/QB3lib/QB3decode.h
+++ b/QB3lib/QB3decode.h
@@ -182,7 +182,7 @@ static bool gdecode(iBits& s, size_t rung, T* group, uint64_t acc, size_t abits)
             // common expression, making it slower
             // pre-shift accumulator, top 2 bits are not needed
             acc <<= 2;
-            unsigned int size;
+            uint32_t size;
             for (size_t i = 0; i < 14; i++) {
                 size = (0x4232423242324232ull >> (acc & 0b111100)) & 0xf;
                 group[i] = T((0x7130612051304120ull >> (acc & 0b111100)) & 0xf);
@@ -502,11 +502,11 @@ static bool decode(uint8_t *src, size_t len, T* image, const decs &info)
                         acc >>= (cs >> 12) - 1; // No flag
                         abits += (cs >> 12) - 1;
                         failed |= rung == 63; // TODO: Deal with 64bit overflow
-                        // 16 index values in group, max group size is 7, use rung 2, accumulator is sufficient
+                        // 16 index values in group, max group size is 7, use rung 2
                         T maxidx(0);
                         acc <<= 2; // preshift accumulator
                         for (int i = 0; i < B2; i++) {
-                            unsigned int size = (0x4232423242324232ull >> (acc & 0b111100)) & 0xf;
+                            unint32_t size = (0x4232423242324232ull >> (acc & 0b111100)) & 0xf;
                             group[i] = T((0x7130612051304120ull >> (acc & 0b111100)) & 0xf);
                             acc >>= size;
                             abits += size;

From 6497080190073675966eb48f1ecb674eb9baddbb Mon Sep 17 00:00:00 2001
From: Lucian Plesea <LPlesea@esri.com>
Date: Wed, 4 Sep 2024 11:21:28 -0700
Subject: [PATCH 11/13] Update README.md

---
 README.md | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 31a52f4..e252b17 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,16 @@
 # QB3: Fast and Efficient Image/Raster Compression
 
+- Compression and decompression speed is 500MB/sec for byte data and close to 4GB/sec for 64 bit data
 - Better compression than PNG in most cases
-- Compression and decompression speed is 400MB/sec for byte data and above 3GB/sec for 64bit int
-- Integer values from 8 to 64bit per value, signed and unsigned
-- No significant memory footprint
-- Very low complexity
 - No external dependencies
+- Any integer type, from 8 to 64bit per value, signed and unsigned
+- No significant memory footprint during encoding or decoding
+- Very low complexity
 
 # Library
 The library, located in [QB3lib](QB3lib) provides the core QB3 
 algorithm implementation with a C API.
-It is implemented in C++, can be built on most platforms using cmake.
+Implemented in C++, can be built on most platforms using cmake.
 It requires a little endian, two's complement architecture with 8, 16, 32 
 and 64 bit integers, which includes the common AMD64 and ARM64 platforms.
 Only 64bit builds should be used since this implementation uses 64 bit integers heavily.
@@ -49,7 +49,12 @@ metadata to allow decoding.
 
 # Change Log
 
-## Version 1.1.0:
+## Version 1.2.0
+- Speed improvements
+- Add QB3M_FTL mode, fastest mode by about 20% with a tiny compression loss
+    - Test availability by testing that QB3_HAS_FTL is defined
+
+## Version 1.1.0
 - Better scan ordering, second order Hilbert curve is the default
     - 5% better compression with no speed penalty
     - Legacy scan order (Morton) is optional
@@ -62,7 +67,8 @@ metadata to allow decoding.
     - Default build target is the library, eliminating external dependencies
     - Conversion utility is optional
 
-## Version 1.0.0: Initial release
+## Version 1.0.0
+- Initial release
 - C API
 - All integer types
 

From b05c4f149595213fb8a859dc3e8d061cd251c9a9 Mon Sep 17 00:00:00 2001
From: Lucian Plesea <LPlesea@esri.com>
Date: Wed, 4 Sep 2024 11:29:43 -0700
Subject: [PATCH 12/13] typo

---
 QB3lib/QB3decode.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/QB3lib/QB3decode.h b/QB3lib/QB3decode.h
index 7d74a17..b78718f 100644
--- a/QB3lib/QB3decode.h
+++ b/QB3lib/QB3decode.h
@@ -506,7 +506,7 @@ static bool decode(uint8_t *src, size_t len, T* image, const decs &info)
                         T maxidx(0);
                         acc <<= 2; // preshift accumulator
                         for (int i = 0; i < B2; i++) {
-                            unint32_t size = (0x4232423242324232ull >> (acc & 0b111100)) & 0xf;
+                            uint32_t size = (0x4232423242324232ull >> (acc & 0b111100)) & 0xf;
                             group[i] = T((0x7130612051304120ull >> (acc & 0b111100)) & 0xf);
                             acc >>= size;
                             abits += size;

From 9edfd8fc4d414719c44cb3316761430081358851 Mon Sep 17 00:00:00 2001
From: Lucian Plesea <LPlesea@esri.com>
Date: Wed, 4 Sep 2024 11:41:47 -0700
Subject: [PATCH 13/13] Update README.md

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index e252b17..9d4ea2b 100644
--- a/README.md
+++ b/README.md
@@ -50,8 +50,10 @@ metadata to allow decoding.
 # Change Log
 
 ## Version 1.2.0
-- Speed improvements
-- Add QB3M_FTL mode, fastest mode by about 20% with a tiny compression loss
+- Speed optimizations, both compression and decompression
+    - More than 400MB/sec for byte data using the default mode
+- New QB3M_FTL mode, 25% faster than QB3M_DEFAULT with a tiny compression loss
+    - 500MB/sec for byte data
     - Test availability by testing that QB3_HAS_FTL is defined
 
 ## Version 1.1.0