diff --git a/QB3lib/QB3.h b/QB3lib/QB3.h
index 359644d..3bdc8a5 100644
--- a/QB3lib/QB3.h
+++ b/QB3lib/QB3.h
@@ -18,9 +18,11 @@ Contributors:  Lucian Plesea
 #pragma once
 // For size_t
 #include <stddef.h>
+// For uint64_t
+#include <stdint.h>
 
 // CMake will generate LIBQB3_EXPORT linkage as needed
-#include <libqb3_export.h>
+#include "libqb3_export.h"
 
 // Keep this close to plain C so it can have a C API
 #define QB3_MAXBANDS 16
@@ -34,6 +36,9 @@ typedef struct decs * decsp; // decoder
 // Types
 enum qb3_dtype { QB3_U8 = 0, QB3_I8, QB3_U16, QB3_I16, QB3_U32, QB3_I32, QB3_U64, QB3_I64 };
 
+// To check if the library has QB3M_FTL
+#define QB3_HAS_FTL 1
+
 // Encode mode, default is fastest, best is best compression
 enum qb3_mode {
     // Aliases, values might change
@@ -53,7 +58,12 @@ enum qb3_mode {
     QB3M_RLE_H = 6, // QB3 Hilbert + RLE
     QB3M_CF_RLE_H = 7, // QB3 Hilbert + CF + RLE
 
-    QB3M_STORED = 255, // Raw bypass
+    // Faster and only slightly worse than base in many cases
+    // Hilbert curve but no bit-step, no CF, no RLE
+    QB3M_FTL = 8,
+    QB3M_END, // Marks the end of the settable modes
+
+    QB3M_STORED = 255, // Raw bypass, can't be requested
     QB3M_INVALID = -1 // Invalid mode
 }; // Best compression, one of the above
 
@@ -87,7 +97,7 @@ LIBQB3_EXPORT bool qb3_set_encoder_coreband(encsp p, size_t bands, size_t *cband
 
 // Sets quantization parameters, returns true on success
 // away = true -> round away from zero
-LIBQB3_EXPORT bool qb3_set_encoder_quanta(encsp p, size_t q, bool away);
+LIBQB3_EXPORT bool qb3_set_encoder_quanta(encsp p, uint64_t q, bool away);
 
 // Upper bound of encoded size, without taking the header into consideration
 LIBQB3_EXPORT size_t qb3_max_encoded_size(const encsp p);
@@ -136,10 +146,10 @@ LIBQB3_EXPORT void qb3_set_decoder_stride(decsp p, size_t stride);
 LIBQB3_EXPORT qb3_mode qb3_get_mode(const decsp p);
 
 // Returns the number of quantization bits used, returns 0 if failed
-LIBQB3_EXPORT size_t qb3_get_quanta(const decsp p);
+LIBQB3_EXPORT uint64_t qb3_get_quanta(const decsp p);
 
 // Return the scanning curve used, returns 0 if failed
-LIBQB3_EXPORT size_t qb3_get_order(const decsp p);
+LIBQB3_EXPORT uint64_t qb3_get_order(const decsp p);
 
 // Sets the cband array and returns true if successful
 LIBQB3_EXPORT bool qb3_get_coreband(const decsp p, size_t *cband);
diff --git a/QB3lib/QB3common.h b/QB3lib/QB3common.h
index cafedd1..c6da41c 100644
--- a/QB3lib/QB3common.h
+++ b/QB3lib/QB3common.h
@@ -86,7 +86,7 @@ struct encs {
     size_t nbands;
     // micro block scanning order
     uint64_t order;
-    size_t quanta;
+    uint64_t quanta;
 
     // Persistent state by band
     band_state band[QB3_MAXBANDS];
@@ -109,7 +109,7 @@ struct decs {
     size_t stride;
     // micro block scanning order
     uint64_t order;
-    size_t quanta;
+    uint64_t quanta;
     int error;
     int stage;
 
diff --git a/QB3lib/QB3decode.cpp b/QB3lib/QB3decode.cpp
index fac5a2f..4609d08 100644
--- a/QB3lib/QB3decode.cpp
+++ b/QB3lib/QB3decode.cpp
@@ -48,13 +48,13 @@ qb3_mode qb3_get_mode(const decsp p) {
     return p->mode;
 }
 
-size_t qb3_get_quanta(const decsp p) {
+uint64_t qb3_get_quanta(const decsp p) {
     if (p->stage != 2)
         return 0; // Error
     return p->quanta;
 }
 
-size_t qb3_get_order(const decsp p) {
+uint64_t qb3_get_order(const decsp p) {
     if (p->stage != 2)
         return 0; // Error
     return p->order ? p->order : ZCURVE;
@@ -149,7 +149,7 @@ decsp qb3_read_start(void* source, size_t source_size, size_t *image_size) {
     val >>= 8; // 40 bits left
     // Also check that the next 2 bytes are a signature
     if (p->nbands > QB3_MAXBANDS 
-        || (p->mode > qb3_mode::QB3M_BEST && p->mode != qb3_mode::QB3M_STORED)
+        || (p->mode >= qb3_mode::QB3M_END && p->mode != qb3_mode::QB3M_STORED)
         || 0 != (val & 0x8080) 
         || p->type > qb3_dtype::QB3_I64) {
         delete p;
diff --git a/QB3lib/QB3decode.h b/QB3lib/QB3decode.h
index c0d99bd..b78718f 100644
--- a/QB3lib/QB3decode.h
+++ b/QB3lib/QB3decode.h
@@ -140,7 +140,7 @@ static std::pair<size_t, uint64_t> qb3dsztbl(uint64_t val, size_t rung) {
 // For rung 0, it works with 17bits or more
 // For rung 1, it works with 47bits or more
 // returns false on failure
-template<typename T>
+template<bool applystep = true, typename T>
 static bool gdecode(iBits& s, size_t rung, T* group, uint64_t acc, size_t abits) {
     assert(((rung > 1) && (abits <= 8))
         || ((rung == 1) && (abits <= 17)) // B2 + 1
@@ -164,9 +164,11 @@ static bool gdecode(iBits& s, size_t rung, T* group, uint64_t acc, size_t abits)
         if (1 == rung) {
             // Use inline constants as nibble tables
             // The lower two bits of the accumulator determine the size
+            // Preshift accumulator
+            acc <<= 2;
             for (size_t i=0; i < B2; i++) {
-                auto size = (0x31213121u >> ((acc & 7) << 2)) & 0xf;
-                group[i] = T((0x30102010u >> ((acc & 7) << 2)) & 0xf);
+                auto size = (0x31213121u >> (acc & 0b11100)) & 0xf;
+                group[i] = T((0x30102010u >> (acc & 0b11100)) & 0xf);
                 abits += size;
                 acc >>= size;
             }
@@ -174,24 +176,30 @@ static bool gdecode(iBits& s, size_t rung, T* group, uint64_t acc, size_t abits)
         }
         else if (2 == rung) { // max symbol len is 4, there are at least 14 in the accumulator
             // Use inline constants as nibble tables
-            unsigned int size;
+            // Faster than a double value table decode, but only in this specific code organization
+            // Cleaning it up, for example doing a peek at the start then looping 16 times, makes it slower
+            // The masks and inline constants could be smaller for size, but that eliminates the
+            // common expression, making it slower
+            // pre-shift accumulator, top 2 bits are not needed
+            acc <<= 2;
+            uint32_t size;
             for (size_t i = 0; i < 14; i++) {
-                size = (0x4232423242324232ull >> ((acc & 0xf) << 2)) & 0xf;
-                group[i] = T((0x7130612051304120ull >> ((acc & 0xf) << 2)) & 0xf);
+                size = (0x4232423242324232ull >> (acc & 0b111100)) & 0xf;
+                group[i] = T((0x7130612051304120ull >> (acc & 0b111100)) & 0xf);
                 abits += size;
                 acc >>= size;
             }
-            if (abits > 56) { // Rare
-                s.advance(abits);
+            if (abits > 56) { // Rare, max is 60, there are still 2 safe bits
+                s.advance(abits - 2);
                 acc = s.peek();
-                abits = 0;
+                abits = 2;
             }
-            size = (0x4232423242324232ull >> ((acc & 0xf) << 2)) & 0xf;
-            group[14] = T((0x7130612051304120ull >> ((acc & 0xf) << 2)) & 0xf);
+            size = (0x4232423242324232ull >> (acc & 0b111100)) & 0xf;
+            group[14] = T((0x7130612051304120ull >> (acc & 0b111100)) & 0xf);
             acc >>= size;
             abits += size;
-            size = (0x4232423242324232ull >> ((acc & 0xf) << 2)) & 0xf;
-            group[15] = T((0x7130612051304120ull >> ((acc & 0xf) << 2)) & 0xf);
+            size = (0x4232423242324232ull >> (acc & 0b111100)) & 0xf;
+            group[15] = T((0x7130612051304120ull >> (acc & 0b111100)) & 0xf);
             s.advance(abits + size);
         }
         else if (6 > rung) { // Table decode at 3,4 and 5, half of the values per accumulator
@@ -266,7 +274,8 @@ static bool gdecode(iBits& s, size_t rung, T* group, uint64_t acc, size_t abits)
             }
         }
     }
-    if (0 == (group[B2 - 1] >> rung)) {
+    // template parameter to avoid a test when not needed
+    if (applystep && (0 == (group[B2 - 1] >> rung))) {
         auto stepp = step(group, rung);
         if (stepp < B2)
             group[stepp] ^= static_cast<T>(1ull << rung);
@@ -280,10 +289,115 @@ template<typename T> static T magsabs(T v) { return (v >> 1) + (v & 1); }
 // Multiply v(in magsign) by m(normal, positive)
 template<typename T> static T magsmul(T v, T m) { return magsabs(v) * (m << 1) - (v & 1); }
 
+// Streamlined decoding for FTL mode. src/len is the encoded stream, image receives the decoded values. Returns true if an error was detected
+template<typename T>
+static bool decodeFTL(uint8_t* src, size_t len, T* image, const decs& info)
+{
+    auto xsize(info.xsize), ysize(info.ysize), bands(info.nbands), stride(info.stride);
+    auto cband = info.cband;
+    static_assert(std::is_integral<T>() && std::is_unsigned<T>(), "Only unsigned integer types allowed");
+    constexpr size_t UBITS(sizeof(T) == 1 ? 3 : sizeof(T) == 2 ? 4 : sizeof(T) == 4 ? 5 : 6);
+    constexpr auto NORM_MASK((1ull << UBITS) - 1); // UBITS set
+    constexpr auto LONG_MASK(NORM_MASK * 2 + 1); // UBITS + 1 set
+    T prev[QB3_MAXBANDS] = {}, group[B2] = {}; // prev is the running value per band, group is the gdecode output
+    size_t runbits[QB3_MAXBANDS] = {}; // current rung per band, carried from block to block
+    const uint16_t* dsw = sizeof(T) == 1 ? dsw3 : sizeof(T) == 2 ? dsw4 : sizeof(T) == 4 ? dsw5 : dsw6;
+    stride = stride ? stride : xsize * bands; // a zero stride means packed rows
+    // Set up block offsets based on traversal order, defaults to HILBERT
+    uint64_t order(info.order);
+    order = order ? order : HILBERT;
+    size_t offset[B2] = {};
+    for (size_t i = 0; i < B2; i++) {
+        size_t n = (order >> ((B2 - 1 - i) << 2));
+        offset[i] = ((n >> 2) & 0b11) * stride + (n & 0b11) * bands;
+    }
+    iBits s(src, len);
+    bool failed(false);
+    for (size_t y = 0; y < ysize; y += B) {
+        // If the last row is partial, roll it up
+        if (y + B > ysize)
+            y = ysize - B;
+        for (size_t x = 0; x < xsize; x += B) {
+            // If the last column is partial, move it left
+            if (x + B > xsize)
+                x = xsize - B;
+            for (int c = 0; c < bands; c++) {
+                auto prv = prev[c];
+                T* const blockp = image + y * stride + x * bands + c;
+                failed |= s.empty(); // accumulate, an assignment here would drop an error flagged on a previous band that left via the short code path
+                uint64_t cs(0), abits(1), acc(s.peek());
+                if (acc & 1) { // Rung change
+                    cs = dsw[(acc >> 1) & LONG_MASK];
+                    abits = cs >> 12;
+                    failed |= (0 == (cs & TBLMASK)); // no signals
+                }
+                acc >>= abits;
+                // abits is never > 8, so it's safe to call gdecode
+                auto rung = (runbits[c] + cs) & NORM_MASK;
+                runbits[c] = rung;
+                if (rung < 2) { // decode inlined
+                    if (rung == 0) { // single bits or all zeros
+                        abits++;
+                        if (0 != (acc & 1)) {
+                            abits += B2;
+                            for (int i = 0; i < B2; i++) {
+                                acc >>= 1;
+                                blockp[offset[i]] = prv -= (1 & acc);
+                            }
+                            prev[c] = prv;
+                        }
+                        else {
+                            for (int i = 0; i < B2; i++)
+                                blockp[offset[i]] = prv;
+                        }
+                    }
+                    else { // rung == 1
+                        // Use inline constants as nibble tables
+                        // The lower two bits of the accumulator determine the size
+                        // Shift the accumulator to the left to place the selector in the right place
+                        acc <<= 2;
+                        for (size_t i = 0; i < B2; i++) {
+                            auto size = (0x31213121u >> (acc & 0b11100)) & 0xf;
+                            blockp[offset[i]] = prv += smag(T((0x30102010u >> (acc & 0b11100)) & 0xf));
+                            abits += size;
+                            acc >>= size;
+                        }
+                        prev[c] = prv;
+                    }
+                    s.advance(abits);
+                    continue; // failed is not tested on this path, it is checked once the band loop ends
+                }
+                // longer codes
+                failed |= !gdecode<false>(s, rung, group, acc, abits);
+                // Undo delta encoding for this block
+                for (int i = 0; i < B2; i++)
+                    blockp[offset[i]] = prv += smag(group[i]);
+                prev[c] = prv;
+                if (failed) break;
+            } // Per band per block
+            if (failed) break;
+        } // per block
+        if (failed) break;
+        // For performance apply band delta per block strip, in linear order
+        for (size_t j = 0; j < B; j++) {
+            for (int c = 0; c < bands; c++) if (c != cband[c]) {
+                auto dimg = image + stride * (y + j) + c;
+                auto simg = image + stride * (y + j) + cband[c];
+                for (int i = 0; i < xsize; i++, dimg += bands, simg += bands)
+                    *dimg += *simg;
+            }
+        }
+    } // per strip
+    // It might not catch all errors
+    return failed || s.avail() > 7;
+}
+
 // reports most but not all errors, for example if the input stream is too short for the last block
 template<typename T>
 static bool decode(uint8_t *src, size_t len, T* image, const decs &info)
 {
+    if (info.mode == QB3M_FTL)
+        return decodeFTL(src, len, image, info);
     auto xsize(info.xsize), ysize(info.ysize), bands(info.nbands), stride(info.stride);
     auto cband = info.cband;
     static_assert(std::is_integral<T>() && std::is_unsigned<T>(), "Only unsigned integer types allowed");
@@ -322,9 +436,8 @@ static bool decode(uint8_t *src, size_t len, T* image, const decs &info)
                 acc >>= abits;
                 if (0 == cs || 0 != (cs & TBLMASK)) { // Normal decoding, not a signal
                     // abits is never > 8, so it's safe to call gdecode
-                    auto rung = (runbits[c] + cs) & NORM_MASK;
+                    auto rung = runbits[c] = (runbits[c] + cs) & NORM_MASK;
                     failed |= !gdecode(s, rung, group, acc, abits);
-                    runbits[c] = rung;
                 }
                 else { // extra encoding
                     cs = dsw[acc & LONG_MASK]; // rung, no flag
@@ -389,24 +502,26 @@ static bool decode(uint8_t *src, size_t len, T* image, const decs &info)
                         acc >>= (cs >> 12) - 1; // No flag
                         abits += (cs >> 12) - 1;
                         failed |= rung == 63; // TODO: Deal with 64bit overflow
-                        // 16 index values in group, max is 7, use rung 2
-                        T maxval(0);
+                        // 16 index values in group, max group size is 7, use rung 2
+                        T maxidx(0);
+                        acc <<= 2; // preshift accumulator
                         for (int i = 0; i < B2; i++) {
-                            auto v = DRG[2][acc & 0xf];
-                            group[i] = static_cast<uint8_t>(v);
-                            if (maxval < group[i])
-                                maxval = group[i];
-                            acc >>= v >> 12;
-                            abits += v >> 12;
+                            uint32_t size = (0x4232423242324232ull >> (acc & 0b111100)) & 0xf;
+                            group[i] = T((0x7130612051304120ull >> (acc & 0b111100)) & 0xf);
+                            acc >>= size;
+                            abits += size;
+                            if (maxidx < group[i])
+                                maxidx = group[i];
                         }
                         s.advance(abits);
                         T idxarray[B2 / 2] = {};
-                        for (size_t i = 0; i <= maxval; i++) {
+                        for (size_t i = 0; i <= maxidx; i++) {
                             acc = s.peek();
                             auto v = qb3dsztbl(acc, rung);
                             s.advance(v.first);
                             idxarray[i] = T(v.second);
                         }
+                        // Apply idxarray to group
                         for (int i = 0; i < B2; i++)
                             group[i] = idxarray[group[i]];
                     }
diff --git a/QB3lib/QB3encode.cpp b/QB3lib/QB3encode.cpp
index 0e0d766..af7f8fe 100644
--- a/QB3lib/QB3encode.cpp
+++ b/QB3lib/QB3encode.cpp
@@ -135,7 +135,7 @@ size_t qb3_max_encoded_size(const encsp p) {
 }
 
 qb3_mode qb3_set_encoder_mode(encsp p, qb3_mode mode) {
-    if (mode <= qb3_mode::QB3M_BEST)
+    if (mode < qb3_mode::QB3M_END)
         p->mode = mode;
     // Default curve is HILBERT, change it if needed
     switch (p->mode) {
@@ -421,7 +421,7 @@ static size_t raw_size(encsp const &p) {
 int qb3_get_encoder_state(encsp p) { return p->error; }
 
 static bool is_fast(qb3_mode mode) {
-    return (QB3M_BASE_H == mode) || (QB3M_BASE_Z == mode);
+    return (QB3M_BASE_H == mode) || (QB3M_BASE_Z == mode) || (QB3M_FTL == mode);
 }
 
 // ONLY QB3M_BASE and QB3M_CF are supported here
diff --git a/QB3lib/QB3encode.h b/QB3lib/QB3encode.h
index e9b6ab3..d954fd9 100644
--- a/QB3lib/QB3encode.h
+++ b/QB3lib/QB3encode.h
@@ -147,7 +147,7 @@ static std::pair<size_t, uint64_t> qb3csztbl(uint64_t val, size_t rung) {
 // bitsused is used to choose the rung for encoding
 // If abits > 0, the accumulator is also pushed into the stream
 template <typename T>
-static void groupencode(T group[B2], T bitsused, oBits& s, uint64_t acc, size_t abits)
+static void groupencode(T group[B2], T bitsused, oBits& s, uint64_t acc, size_t abits, bool skipstep = false)
 {
     assert(abits <= 64);
     const size_t rung = topbit(bitsused | 1);
@@ -159,12 +159,15 @@ static void groupencode(T group[B2], T bitsused, oBits& s, uint64_t acc, size_t
         s.push(acc, abits);
         return;
     }
-    // Flip the last set rung bit if the rung bit sequence is a step down
-    // At least one rung bit has to be set, so it can't return 0
-    auto stepp = step(group, rung);
-    assert(stepp > 0); // At least one rung bit should be set
-    if (stepp <= B2)
-        group[stepp - 1] ^= static_cast<T>(1ull << rung);
+    size_t stepp(B2 + 1);
+    if (!skipstep) {
+        // Flip the last set rung bit if the rung bit sequence is a step down
+        // At least one rung bit has to be set, so it can't return 0
+        stepp = step(group, rung);
+        assert(stepp > 0); // At least one rung bit should be set
+        if (stepp <= B2)
+            group[stepp - 1] ^= static_cast<T>(1ull << rung);
+    }
     if (abits > 8) { // Just in case, a rung switch is 8 bits at most
         s.push(acc, abits);
         acc = abits = 0;
@@ -240,10 +243,10 @@ static void groupencode(T group[B2], T bitsused, oBits& s, uint64_t acc, size_t
 
 // Base QB3 group encode with code switch, returns encoded size
 template <typename T>
-static void groupencode(T group[B2], T bitsused, size_t oldrung, oBits& s) {
+static void groupencode(T group[B2], T bitsused, size_t oldrung, oBits& s, bool skipstep = false) {
     constexpr size_t UBITS = sizeof(T) == 1 ? 3 : sizeof(T) == 2 ? 4 : sizeof(T) == 4 ? 5 : 6;
     uint64_t acc = CSW[UBITS][(topbit(bitsused | 1) - oldrung) & ((1ull << UBITS) - 1)];
-    groupencode(group, bitsused, s, acc & TBLMASK, static_cast<size_t>(acc >> 12));
+    groupencode(group, bitsused, s, acc & TBLMASK, static_cast<size_t>(acc >> 12), skipstep);
 }
 
 // Group encode with cf
@@ -398,7 +401,7 @@ static int encode_fast(const T* image, oBits& s, encs &info)
                     }
                 }
                 prev[c] = prv;
-                groupencode(group, bitsused, runbits[c], s);
+                groupencode(group, bitsused, runbits[c], s, info.mode == QB3M_FTL);
                 runbits[c] = topbit(bitsused | 1);
             }
         }
diff --git a/README.md b/README.md
index 31a52f4..9d4ea2b 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,16 @@
 # QB3: Fast and Efficient Image/Raster Compression
 
+- Compression and decompression speed is 500MB/sec for byte data and close to 4GB/sec for 64 bit data
 - Better compression than PNG in most cases
-- Compression and decompression speed is 400MB/sec for byte data and above 3GB/sec for 64bit int
-- Integer values from 8 to 64bit per value, signed and unsigned
-- No significant memory footprint
-- Very low complexity
 - No external dependencies
+- Any integer type, from 8 to 64bit per value, signed and unsigned
+- No significant memory footprint during encoding or decoding
+- Very low complexity
 
 # Library
 The library, located in [QB3lib](QB3lib) provides the core QB3 
 algorithm implementation with a C API.
-It is implemented in C++, can be built on most platforms using cmake.
+Implemented in C++, it can be built on most platforms using cmake.
 It requires a little endian, two's complement architecture with 8, 16, 32 
 and 64 bit integers, which includes the common AMD64 and ARM64 platforms.
 Only 64bit builds should be used since this implementation uses 64 bit integers heavily.
@@ -49,7 +49,14 @@ metadata to allow decoding.
 
 # Change Log
 
-## Version 1.1.0:
+## Version 1.2.0
+- Speed optimizations, both compression and decompression
+    - More than 400MB/sec for byte data using the default mode
+- New QB3M_FTL mode, 25% faster than QB3M_DEFAULT with a tiny compression loss
+    - 500MB/sec for byte data
+    - Test availability by testing that QB3_HAS_FTL is defined
+
+## Version 1.1.0
 - Better scan ordering, second order Hilbert curve is the default
     - 5% better compression with no speed penalty
     - Legacy scan order (Morton) is optional
@@ -62,7 +69,8 @@ metadata to allow decoding.
     - Default build target is the library, eliminating external dependencies
     - Conversion utility is optional
 
-## Version 1.0.0: Initial release
+## Version 1.0.0
+- Initial release
 - C API
 - All integer types
 
diff --git a/attic/world.cpp b/attic/world.cpp
new file mode 100644
index 0000000..821228f
--- /dev/null
+++ b/attic/world.cpp
@@ -0,0 +1,186 @@
+/*
+  Prototype use of QB3 library in wasm, with emscripten
+  This is a simple test program that reads a PPM image, encodes it and then decodes it
+  The image is then displayed in a window
+
+  Once libQB3.a is built using emscripten and a test ppm image is available,
+  use the following command to compile this program into a runnable html file:
+
+  emcc -O2 -o world.html world.cpp libQB3.a --preload-file Image.ppm -sUSE_SDL=2
+  
+  Current performance is 300MB/sec encode and 270MB/sec decode, RGB 8 bit data
+*/
+
+#include "../include/QB3.h"
+#include <cstdio>
+#include <cstdlib>
+#include <emscripten.h>
+
+#include <SDL.h>
+
+int read_ppm_header(FILE *f, int &x, int &y, int &zmax) { // Parses a three line binary PPM (P6) header, returns 0 on success and 1 on error
+	char line[1024];
+	char *v = f ? fgets(line, sizeof(line), f) : nullptr; // Magic number line, a null stream is reported as an error
+	if (!v || v[0] != 'P' || v[1] != '6') return 1; // Error, nothing read or not a P6 file
+	v = fgets(line, sizeof(line), f);
+	if (!v || 2 != sscanf(v, "%d %d", &x, &y)) return 1; // Second line has to hold width and height
+	v = fgets(line, sizeof(line), f);
+	if (!v || 1 != sscanf(v, "%d", &zmax)) return 1; // Third line has to hold the maximum sample value
+	return 0;
+}
+
+// Display in a 1024 * 1024 window, image y has to be at least 1024. x is the image width in pixels, data is packed RGB with 3 bytes per pixel
+void display_data(int x, char * data) {
+	if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_EVENTS) < 0)
+		printf("Can't initialize SDL\n"); // Failures are only reported, execution continues
+	// if (IMG_Init(IMG_INIT_PNG) == 0)
+	// 	printf("Error SDL2_Image Initialization\n");
+
+	SDL_Window *window = SDL_CreateWindow("Display",
+		SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, 1024, 1024, SDL_WINDOW_OPENGL
+		);
+	if (!window) printf("Create window failed\n");
+
+	SDL_Renderer *renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED); // NOTE(review): result is not checked
+
+	SDL_Texture *texture = SDL_CreateTexture(renderer,
+		SDL_PIXELFORMAT_RGB24, SDL_TEXTUREACCESS_STATIC, 1024, 1024);
+	if (!texture)
+		printf("Can't create texture\n");
+
+	if (SDL_UpdateTexture(texture, NULL, data, x * 3)) // Pitch is one full image row, so only the first 1024 pixels of each row are used
+		printf("Can't update texture\n");
+	SDL_RenderClear(renderer);
+	// This scales it to full size, which is a mistake
+	SDL_RenderCopy(renderer, texture, NULL, NULL);
+	SDL_RenderPresent(renderer);
+
+	SDL_DestroyTexture(texture);
+	SDL_DestroyRenderer(renderer);
+	// Should clean up the window too
+}
+
+// Single image decode and display. Reads Image.qb3, returns 1 if the file can't be opened or does not hold byte data, 0 otherwise
+int main2(int argc, char **argv) {
+	FILE *f = fopen("Image.qb3", "rb");
+	if (!f) {
+		printf("Didn't work\n");
+		return 1;
+	}
+
+	int x, y, bands;
+	fseek(f, 0, SEEK_END);
+	auto fsize = ftell(f); // File size, taken from the end of file position
+	rewind(f);
+	auto buffer = malloc(fsize); // NOTE(review): malloc and fread results are not checked, f is never closed
+	fread(buffer, fsize, 1, f);
+	size_t image_size[3]; // x, y and bands
+	auto qdec = qb3_read_start(buffer, size_t(fsize), image_size); // NOTE(review): qdec is used below without a validity check
+	x = image_size[0];
+	y = image_size[1];
+	bands = image_size[2];
+	printf("Image is %dx%d@%d\n", x, y, bands);
+	qb3_read_info(qdec);
+	auto tp = qb3_get_type(qdec);
+	if (tp != QB3_U8) { // Only byte data can be displayed
+		free(buffer);
+		qb3_destroy_decoder(qdec);
+
+		printf("Not byte data\n");
+		return 1;
+	}
+
+	auto raw_size = qb3_decoded_size(qdec);
+	char *raw_buffer = (char *)malloc(raw_size);
+	// Final decode
+	qb3_read_data(qdec, raw_buffer); // NOTE(review): the decode result is ignored
+	qb3_destroy_decoder(qdec);
+	display_data(x, raw_buffer); // NOTE(review): buffer and raw_buffer are not freed on this path
+	return 0;
+}
+
+int main(int argc, char **argv) { // Times QB3M_FTL encode and decode of Image.ppm, prints the results, then displays the image
+	FILE *f=fopen("Image.ppm", "rb"); // NOTE(review): the fopen result is not checked
+	
+	int x, y, max_val;
+	read_ppm_header(f, x, y, max_val); // NOTE(review): the return value is ignored
+	printf("%d %d %d\n", x, y, max_val);
+	int raw_size = x * y * 3; // Three bytes per pixel
+	printf("Raw size %d\n", raw_size);
+	char *data = static_cast<char *>(malloc(x * y * 3));
+	fread(data, 3 * x, y, f); // Pixel data, read as y items of one row each
+	qb3_dtype tp = qb3_dtype::QB3_U8;
+	auto qenc = qb3_create_encoder(x, y, 3, tp); // Temporary encoder, only used to size the output buffer
+	auto maxsz = qb3_max_encoded_size(qenc);
+	char *outbuff = static_cast<char *>(malloc(maxsz));
+	qb3_destroy_encoder(qenc);
+	int loops = 50;
+	auto C = 1e-3 * raw_size * loops; // Total bytes processed, scaled so that C divided by milliseconds is MB/sec
+	// Twice, first is warmup then actual
+	for (int j = 0; j < 2; j++)
+	{
+		double encode_time(0), decode_time(0);
+		for (int i = 0; i < loops; i++)
+		{
+			// printf("Loop %d %f\n", i, encode_time);
+			qenc = qb3_create_encoder(x, y, 3, tp);
+			auto start = emscripten_get_now(); // Encoder creation is outside the timed section, setting the mode is inside
+			qb3_set_encoder_mode(qenc, QB3M_FTL);
+			auto outsize = qb3_encode(qenc, data, outbuff);
+			auto stop = emscripten_get_now();
+			encode_time += stop - start;
+			// printf("Compressed to %lu\n", outsize);
+			// printf("Encoding took %f\n", stop - start);
+			// Let's try decompression too
+			qb3_destroy_encoder(qenc);
+
+			//
+			size_t image_size[3];
+			auto qdec = qb3_read_start(outbuff, outsize, image_size); // NOTE(review): qdec is used without a validity check
+			x = image_size[0];
+			y = image_size[1];
+			max_val = image_size[2]; // max_val is reused here as scratch for the third image_size entry
+			// printf("Got size %u %u %u\n", x, y, max_val);
+			qb3_read_info(qdec);
+			start = emscripten_get_now(); // Header parsing is outside the timed section
+			if (raw_size != qb3_decoded_size(qdec))
+				printf("Size error on decode\n");
+			if (!qb3_read_data(qdec, data)) // Decodes over the input image buffer
+				printf("decode error\n");
+			stop = emscripten_get_now();
+			// printf("Decoding took %f ms\n", stop - start);
+			qb3_destroy_decoder(qdec);
+			decode_time += stop - start;
+		}
+		printf("Time (ms)      Encode: %f, Decode: %f\n", encode_time / loops, decode_time / loops);
+		printf("Speed (MB/sec) Encode: %f, Decode: %f\n", C / encode_time, C / decode_time);
+	}
+
+	free(outbuff);
+	if (SDL_Init(SDL_INIT_VIDEO | SDL_INIT_EVENTS) < 0)
+		printf("Can't initialize SDL\n"); // Failures are only reported, execution continues
+	// if (IMG_Init(IMG_INIT_PNG) == 0)
+	// 	printf("Error SDL2_Image Initialization\n");
+
+	SDL_Window *window = SDL_CreateWindow("First program",
+		SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, x, y, SDL_WINDOW_OPENGL
+		);
+	if (!window) printf("Create window failed\n");
+
+	SDL_Renderer *renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED); // NOTE(review): result is not checked
+
+	SDL_Texture *texture = SDL_CreateTexture(renderer,
+		SDL_PIXELFORMAT_RGB24, SDL_TEXTUREACCESS_STATIC, 1024, 1024); // NOTE(review): texture is 1024 x 1024 while the window is x by y
+	if (!texture)
+		printf("Can't create texture\n");
+
+	if (SDL_UpdateTexture(texture, NULL, data, x * 3)) // Pitch is one full image row
+		printf("Can't update texture\n");
+	SDL_RenderClear(renderer);
+	SDL_RenderCopy(renderer, texture, NULL, NULL);
+	SDL_RenderPresent(renderer);
+
+	free(data); // NOTE(review): f and the SDL objects are not released
+	
+	return 0;
+}
diff --git a/cqb3.cpp b/cqb3.cpp
index 9c15ab0..ed48537 100644
--- a/cqb3.cpp
+++ b/cqb3.cpp
@@ -41,7 +41,8 @@ struct options {
         trim(false),
         rle(false), // non-default RLE (off for best, on for fast)
         legacy(false), // legacy mode
-        verbose(false), 
+        verbose(false),
+        ftl(false),
         decode(false)
     {};
 
@@ -56,6 +57,8 @@ struct options {
     bool rle; // Skip RLE
     bool legacy; // Legacy mode
     bool verbose;
+    bool ftl; // Fastest compression
+    bool away; // quantize away from zero
     bool decode;
 };
 
@@ -68,6 +71,7 @@ int Usage(const options &opt) {
         << "\n"
         << "Compression only options:\n"
         << "\t-b : best compression\n"
+        << "\t-f : fastest compression\n"
         << "\t-l : legacy mode (deprecated)\n"
         << "\t-q <n> : quanta\n"
         << "\t-r : reverse RLE behavior, off for best, on for fast\n"
@@ -110,6 +114,9 @@ bool parse_args(int argc, char** argv, options& opt) {
             case 'd':
                 opt.decode = true;
                 break;
+            case 'f':
+                opt.ftl = true;
+                break;
             case 't':
                 opt.trim = true;
                 break;
@@ -118,8 +125,20 @@ bool parse_args(int argc, char** argv, options& opt) {
                 break;
             case 'q':
                 opt.quanta = 2; // Default
-                if ((i < argc) && isdigit(argv[i + 1][0]))
-                    opt.quanta = strtoull(argv[++i], nullptr, 10);
+                if (i < argc) { // Will fail anyhow, missing file name
+                    // If the first char is +, we quantize away from zero
+                    auto c = argv[i + 1][0];// First char
+                    if (isdigit(c)) {
+                        opt.away = false;
+                        opt.quanta = strtoull(argv[i + 1], nullptr, 10);
+                        i++;
+                    }
+                    if (c == '+') {
+                        opt.away = true;
+                        opt.quanta = strtoull(argv[i + 1] + 1, nullptr, 10);
+                        i++;
+                    }
+                }
                 break;
             case 'm':
                 // The next parameter is a comma separated band list if it starts with a digit
@@ -161,6 +180,14 @@ bool parse_args(int argc, char** argv, options& opt) {
         return false;
     }
 
+    // The best, rle and legacy options are ignored when ftl is requested, turn them off
+    // In theory, legacy mode would work with ftl, but it is not supported
+    if (opt.ftl) {
+        opt.best = false;
+        opt.rle = false;
+        opt.legacy = false;
+    }
+
     // If output file name is not provided, extract from input file name
     if (opt.out_fname.empty()) {
         string fname(opt.in_fname);
@@ -203,6 +230,7 @@ const char *mode_string(qb3_mode m) {
     case QB3M_CF: return "Legacy CF";
     case QB3M_RLE: return "Legacy Base + RLE";
     case QB3M_CF_RLE: return "Legacy CF + RLE";
+    case QB3M_FTL: return "Fast";
     case QB3M_STORED: return "Stored";
     default:
         return "Unknown mode";
@@ -421,17 +449,20 @@ int encode(Raster &raster, std::vector<std::uint8_t> &image, std::vector<std::ui
             }
         }
 
+        if (opts.ftl)
+            mode = QB3M_FTL;
+
         if (mode != qb3_set_encoder_mode(qenc, mode)) {
             cerr << "Invalid mode\n";
             throw 1;
         }
         if (opts.quanta > 1) {
-            if (!qb3_set_encoder_quanta(qenc, opts.quanta, true)) {
+            if (!qb3_set_encoder_quanta(qenc, opts.quanta, opts.away)) {
                 cerr << "Invalid quanta\n";
                 throw 1;
             }
             else if (opts.verbose) {
-                cout << "Lossy compression, quantized by " << opts.quanta << endl;
+                cout << "Lossy compression, quantized by " << (opts.away ? "+" : "") << opts.quanta << endl;
             }
         }
         t1 = high_resolution_clock::now();
diff --git a/cqb3.md b/cqb3.md
index 5cd2a99..d061ba2 100644
--- a/cqb3.md
+++ b/cqb3.md
@@ -28,6 +28,10 @@ Decompress. Reads a QB3 formatted file and writes a PNG.
 -b
 Best. Turns on the **best** QB3 compression mode, which is slower but can produce better compression, especially for larger integer types.
 
+-f
+Fast. Turns on the **fast** QB3 mode, which is faster than the default by about 10% while losing less than 0.5% of the compression. It is not
+compatible with the rle mode.
+
 -m <a,b,c,...>
 band Mapping control. For images with more than one channel, QB3 can apply a band decorrelation filter which improves the compression. It does this
 by subtracting one band from another. On decompression the effect of the filter is removed and the output image is identical to the input.
@@ -43,6 +47,17 @@ while band 1 (green) is to be subtracted from the 0 (red) and 2 (blue) bands. If
 input, the unspecified band mappings are left unmodified (core). Following the same logic, the -m option with no parameters is equivalent to the
 identity mapping, -m 0,1,2,... For RGBI (infrared) imagery, the 1,1,1,1 might be better than the default, which leaves the last band as is.
 The QB3 compressor will adjust the band input mapping if the values are not valid, a warning will be printed by cqb3 when this happens.
+A special case is "-m x", which tries all possible band mappings and selects the one that gives the best compression. This is only valid for 3 or 4
+band images. Note that if this option is provided, it will take about 9 times longer to finish the compression.
+
+-r
+RLE. A run length encoding is applied after the QB3 compression. This can improve the compression ratio, especially for images with large areas of
+constant values. The RLE encoding is lossless; the original image is restored on decompression. The RLE encoding is not compatible with the fast mode.
+
+-l
+Legacy microblock scan order. Uses the Morton (Z) scan order for the 4x4 pixel blocks in the QB3 compression. This is the original scan order used
+in the QB3 compressor. The default scan order is the Hilbert scan order, which results in better compression for most images. Use of this option is
+not recommended.
 
 -t
 Trim. QB3 compression operates on 4x4 pixel blocks. When the input image size is not a multiple of 4x4, libQB3 will internally encode a few lines
@@ -50,3 +65,11 @@ and columns more than once. This may result in an output size that is slightly l
 the input image will be trimmed to a multiple of 4x4 pixels before compression to QB3. The output QB3 raster size will reflect this trimmed size.
 1, 2 or three lines and/or columns will be trimmed, in the last, then first, then last again order, as necessary to make the respective dimension 
 a multiple of 4.
+
+-q <val>
+Lossy encoding, by quantizing (dividing) input values by a small integer before doing the actual QB3 encoding. On decoding, the decoded values are
+multiplied by the same value, restoring the normal range. The division is done with rounding, toward zero by default; it can be changed to rounding
+away from zero by adding a + sign before the value. If this option is not followed by a value, the default value of 2 is used. The value must be
+within the range valid for the input data type.
+
+
diff --git a/test_qb3.cpp b/test_qb3.cpp
index 0cc6805..ed6b25b 100644
--- a/test_qb3.cpp
+++ b/test_qb3.cpp
@@ -61,7 +61,7 @@ vector<outT> toplus(vector<inT>& v, outT m) {
 template<typename T>
 void check(vector<uint8_t> &image, const Raster &raster,
     uint64_t m, int main_band = 0,
-    bool fast = 0, uint64_t q = 1, bool away = false)
+    bool fast = 0, uint64_t q = 1, bool away = false, bool ftl = false)
 {
     size_t xsize = raster.size.x;
     size_t ysize = raster.size.y;
@@ -93,7 +93,7 @@ void check(vector<uint8_t> &image, const Raster &raster,
     // This is sufficient to trigger the quanta encoding
     if (q > 1)
         qb3_set_encoder_quanta(qenc, q, away);
-    qb3_set_encoder_mode(qenc, fast ? qb3_mode::QB3M_BASE : qb3_mode::QB3M_BEST);
+    qb3_set_encoder_mode(qenc, ftl? qb3_mode::QB3M_FTL : fast ? qb3_mode::QB3M_BASE : qb3_mode::QB3M_BEST);
 
     t1 = high_resolution_clock::now();
     auto outsize = qb3_encode(qenc, static_cast<void *>(img.data()), outvec.data());
@@ -137,9 +137,7 @@ void check(vector<uint8_t> &image, const Raster &raster,
 
     time_span = duration_cast<duration<double>>(t2 - t1).count();
     cout << sizeof(T) * image.size() /time_span / 1024 / 1024 << '\t'
-        << time_span << '\t' << sizeof(T) << '\t' << m << '\t';
-    if (fast)
-        cout << "Fast";
+        << time_span << '\t' << sizeof(T) << '\t' << m << '\t' << (ftl ? "FTL" : fast ? "Fast" : "");
 
     if (q > 1) {
         auto hq = T(q / 2); // precision
@@ -530,21 +528,29 @@ int main(int argc, char **argv)
             cout << endl;
             check<uint64_t>(image, raster, 1, 1, true);
             cout << endl;
+            check<uint64_t>(image, raster, 1, 1, true, 1, 0, true);
+            cout << endl;
 
             check<uint32_t>(image, raster, 1, 1);
             cout << endl;
             check<uint32_t>(image, raster, 1, 1, true);
             cout << endl;
+            check<uint32_t>(image, raster, 1, 1, true, 1, 0, true);
+            cout << endl;
 
             check<uint16_t>(image, raster, 1, 1);
             cout << endl;
             check<uint16_t>(image, raster, 1, 1, true);
             cout << endl;
+            check<uint16_t>(image, raster, 1, 1, true, 1, 0, true);
+            cout << endl;
 
             check<uint8_t>(image, raster, 1, 1);
             cout << endl;
             check<uint8_t>(image, raster, 1, 1, true);
             cout << endl;
+            check<uint8_t>(image, raster, 1, 1, true, 1, 0, true);
+            cout << endl;
         }
         else if (raster.dt == ICDT_Int16 || raster.dt == ICDT_UInt16) {
             std::vector<uint16_t> image(params.get_buffer_size() / 2);