Skip to content

Commit

Permalink
Merge pull request #20 from lucianpls/FTL
Browse files Browse the repository at this point in the history
FTL mode
  • Loading branch information
lucianpls authored Sep 4, 2024
2 parents e05fec6 + 9edfd8f commit 19c8ff1
Show file tree
Hide file tree
Showing 11 changed files with 446 additions and 64 deletions.
20 changes: 15 additions & 5 deletions QB3lib/QB3.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,11 @@ Contributors: Lucian Plesea
#pragma once
// For size_t
#include <stddef.h>
// For uint64_t
#include <stdint.h>

// CMake will generate LIBQB3_EXPORT linkage as needed
#include <libqb3_export.h>
#include "libqb3_export.h"

// Keep this close to plain C so it can have a C API
#define QB3_MAXBANDS 16
Expand All @@ -34,6 +36,9 @@ typedef struct decs * decsp; // decoder
// Types
enum qb3_dtype { QB3_U8 = 0, QB3_I8, QB3_U16, QB3_I16, QB3_U32, QB3_I32, QB3_U64, QB3_I64 };

// To check if the library has QB3M_FTL
#define QB3_HAS_FTL 1

// Encode mode, default is fastest, best is best compression
enum qb3_mode {
// Aliases, values might change
Expand All @@ -53,7 +58,12 @@ enum qb3_mode {
QB3M_RLE_H = 6, // QB3 Hilbert + RLE
QB3M_CF_RLE_H = 7, // QB3 Hilbert + CF + RLE

QB3M_STORED = 255, // Raw bypass
// Faster and only slightly worse than base in many cases
// Hilbert curve but no bit-step, no CF, no RLE
QB3M_FTL = 8,
QB3M_END, // Marks the end of the settable modes

QB3M_STORED = 255, // Raw bypass, can't be requested
QB3M_INVALID = -1 // Invalid mode
}; // Best compression, one of the above

Expand Down Expand Up @@ -87,7 +97,7 @@ LIBQB3_EXPORT bool qb3_set_encoder_coreband(encsp p, size_t bands, size_t *cband

// Sets quantization parameters, returns true on success
// away = true -> round away from zero
LIBQB3_EXPORT bool qb3_set_encoder_quanta(encsp p, size_t q, bool away);
LIBQB3_EXPORT bool qb3_set_encoder_quanta(encsp p, uint64_t q, bool away);

// Upper bound of encoded size, without taking the header into consideration
LIBQB3_EXPORT size_t qb3_max_encoded_size(const encsp p);
Expand Down Expand Up @@ -136,10 +146,10 @@ LIBQB3_EXPORT void qb3_set_decoder_stride(decsp p, size_t stride);
LIBQB3_EXPORT qb3_mode qb3_get_mode(const decsp p);

// Returns the number of quantization bits used, returns 0 if failed
LIBQB3_EXPORT size_t qb3_get_quanta(const decsp p);
LIBQB3_EXPORT uint64_t qb3_get_quanta(const decsp p);

// Return the scanning curve used, returns 0 if failed
LIBQB3_EXPORT size_t qb3_get_order(const decsp p);
LIBQB3_EXPORT uint64_t qb3_get_order(const decsp p);

// Sets the cband array and returns true if successful
LIBQB3_EXPORT bool qb3_get_coreband(const decsp p, size_t *cband);
Expand Down
4 changes: 2 additions & 2 deletions QB3lib/QB3common.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ struct encs {
size_t nbands;
// micro block scanning order
uint64_t order;
size_t quanta;
uint64_t quanta;

// Persistent state by band
band_state band[QB3_MAXBANDS];
Expand All @@ -109,7 +109,7 @@ struct decs {
size_t stride;
// micro block scanning order
uint64_t order;
size_t quanta;
uint64_t quanta;
int error;
int stage;

Expand Down
6 changes: 3 additions & 3 deletions QB3lib/QB3decode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,13 @@ qb3_mode qb3_get_mode(const decsp p) {
return p->mode;
}

size_t qb3_get_quanta(const decsp p) {
uint64_t qb3_get_quanta(const decsp p) {
if (p->stage != 2)
return 0; // Error
return p->quanta;
}

size_t qb3_get_order(const decsp p) {
uint64_t qb3_get_order(const decsp p) {
if (p->stage != 2)
return 0; // Error
return p->order ? p->order : ZCURVE;
Expand Down Expand Up @@ -149,7 +149,7 @@ decsp qb3_read_start(void* source, size_t source_size, size_t *image_size) {
val >>= 8; // 40 bits left
// Also check that the next 2 bytes are a signature
if (p->nbands > QB3_MAXBANDS
|| (p->mode > qb3_mode::QB3M_BEST && p->mode != qb3_mode::QB3M_STORED)
|| (p->mode >= qb3_mode::QB3M_END && p->mode != qb3_mode::QB3M_STORED)
|| 0 != (val & 0x8080)
|| p->type > qb3_dtype::QB3_I64) {
delete p;
Expand Down
165 changes: 140 additions & 25 deletions QB3lib/QB3decode.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ static std::pair<size_t, uint64_t> qb3dsztbl(uint64_t val, size_t rung) {
// For rung 0, it works with 17bits or more
// For rung 1, it works with 47bits or more
// returns false on failure
template<typename T>
template<bool applystep = true, typename T>
static bool gdecode(iBits& s, size_t rung, T* group, uint64_t acc, size_t abits) {
assert(((rung > 1) && (abits <= 8))
|| ((rung == 1) && (abits <= 17)) // B2 + 1
Expand All @@ -164,34 +164,42 @@ static bool gdecode(iBits& s, size_t rung, T* group, uint64_t acc, size_t abits)
if (1 == rung) {
// Use inline constants as nibble tables
// The lower two bits of the accumulator determine the size
// Preshift accumulator
acc <<= 2;
for (size_t i=0; i < B2; i++) {
auto size = (0x31213121u >> ((acc & 7) << 2)) & 0xf;
group[i] = T((0x30102010u >> ((acc & 7) << 2)) & 0xf);
auto size = (0x31213121u >> (acc & 0b11100)) & 0xf;
group[i] = T((0x30102010u >> (acc & 0b11100)) & 0xf);
abits += size;
acc >>= size;
}
s.advance(abits);
}
else if (2 == rung) { // max symbol len is 4, there are at least 14 in the accumulator
// Use inline constants as nibble tables
unsigned int size;
// Faster than a double value table decode, but only in this specific code organization
// Cleaning it up, for example doing a peek at the start then looping 16 times, makes it slower
// The masks and inline constants could be smaller for size, but that eliminates the
// common expression, making it slower
// pre-shift accumulator, top 2 bits are not needed
acc <<= 2;
uint32_t size;
for (size_t i = 0; i < 14; i++) {
size = (0x4232423242324232ull >> ((acc & 0xf) << 2)) & 0xf;
group[i] = T((0x7130612051304120ull >> ((acc & 0xf) << 2)) & 0xf);
size = (0x4232423242324232ull >> (acc & 0b111100)) & 0xf;
group[i] = T((0x7130612051304120ull >> (acc & 0b111100)) & 0xf);
abits += size;
acc >>= size;
}
if (abits > 56) { // Rare
s.advance(abits);
if (abits > 56) { // Rare, max is 60, there are still 2 safe bits
s.advance(abits - 2);
acc = s.peek();
abits = 0;
abits = 2;
}
size = (0x4232423242324232ull >> ((acc & 0xf) << 2)) & 0xf;
group[14] = T((0x7130612051304120ull >> ((acc & 0xf) << 2)) & 0xf);
size = (0x4232423242324232ull >> (acc & 0b111100)) & 0xf;
group[14] = T((0x7130612051304120ull >> (acc & 0b111100)) & 0xf);
acc >>= size;
abits += size;
size = (0x4232423242324232ull >> ((acc & 0xf) << 2)) & 0xf;
group[15] = T((0x7130612051304120ull >> ((acc & 0xf) << 2)) & 0xf);
size = (0x4232423242324232ull >> (acc & 0b111100)) & 0xf;
group[15] = T((0x7130612051304120ull >> (acc & 0b111100)) & 0xf);
s.advance(abits + size);
}
else if (6 > rung) { // Table decode at 3,4 and 5, half of the values per accumulator
Expand Down Expand Up @@ -266,7 +274,8 @@ static bool gdecode(iBits& s, size_t rung, T* group, uint64_t acc, size_t abits)
}
}
}
if (0 == (group[B2 - 1] >> rung)) {
// template parameter to avoid a test when not needed
if (applystep && (0 == (group[B2 - 1] >> rung))) {
auto stepp = step(group, rung);
if (stepp < B2)
group[stepp] ^= static_cast<T>(1ull << rung);
Expand All @@ -280,10 +289,115 @@ template<typename T> static T magsabs(T v) { return (v >> 1) + (v & 1); }
// Scale a magsign encoded value by a plain, positive multiplier
// v is in magsign form, m is a normal positive integer, the result is in magsign form
// The absolute value of v is multiplied by m, then the low bit of v is carried over
template<typename T> static T magsmul(T v, T m) {
    const T magnitude = magsabs(v);
    const T lowbit = v & 1;
    return magnitude * (m << 1) - lowbit;
}

// Streamlined decoding for FTL mode
// Every block of every band is read as an optional rung change followed by B2 delta values,
// gdecode is called with the bit-step disabled and no signal codes are accepted.
//  src, len - the encoded input stream
//  image    - output raster, written in place, band interleaved
//  info     - decoder settings: raster size, band count, line stride, core bands and scan order
// Returns true if an error was detected: a decoding failure or more than 7 unread bits
// left in the input. It might not catch all errors
template<typename T>
static bool decodeFTL(uint8_t* src, size_t len, T* image, const decs& info)
{
    auto xsize(info.xsize), ysize(info.ysize), bands(info.nbands), stride(info.stride);
    auto cband = info.cband;
    static_assert(std::is_integral<T>() && std::is_unsigned<T>(), "Only unsigned integer types allowed");
    constexpr size_t UBITS(sizeof(T) == 1 ? 3 : sizeof(T) == 2 ? 4 : sizeof(T) == 4 ? 5 : 6);
    constexpr auto NORM_MASK((1ull << UBITS) - 1); // UBITS set
    constexpr auto LONG_MASK(NORM_MASK * 2 + 1); // UBITS + 1 set
    // Running value per band, and the staging buffer for one block
    T prev[QB3_MAXBANDS] = {}, group[B2] = {};
    // Current rung per band
    size_t runbits[QB3_MAXBANDS] = {};
    // Rung switch decoding table, picked by the size of the data type
    const uint16_t* dsw = sizeof(T) == 1 ? dsw3 : sizeof(T) == 2 ? dsw4 : sizeof(T) == 4 ? dsw5 : dsw6;
    // A zero stride means packed lines
    stride = stride ? stride : xsize * bands;
    // Set up block offsets based on traversal order, defaults to HILBERT
    uint64_t order(info.order);
    order = order ? order : HILBERT;
    size_t offset[B2] = {};
    for (size_t i = 0; i < B2; i++) {
        // One nibble per position, first position in the top nibble
        // Upper two bits select the line, lower two bits select the column
        size_t n = (order >> ((B2 - 1 - i) << 2));
        offset[i] = ((n >> 2) & 0b11) * stride + (n & 0b11) * bands;
    }
    iBits s(src, len);
    bool failed(false);
    for (size_t y = 0; y < ysize; y += B) {
        // If the last row is partial, roll it up
        if (y + B > ysize)
            y = ysize - B;
        for (size_t x = 0; x < xsize; x += B) {
            // If the last column is partial, move it left
            if (x + B > xsize)
                x = xsize - B;
            for (size_t c = 0; c < bands; c++) {
                auto prv = prev[c];
                T* const blockp = image + y * stride + x * bands + c;
                // Accumulate, a failure already recorded should never be cleared
                failed |= s.empty();
                uint64_t cs(0), abits(1), acc(s.peek());
                if (acc & 1) { // Rung change
                    cs = dsw[(acc >> 1) & LONG_MASK];
                    abits = cs >> 12;
                    failed |= (0 == (cs & TBLMASK)); // no signals
                }
                acc >>= abits;
                // abits is never > 8, so it's safe to call gdecode
                auto rung = (runbits[c] + cs) & NORM_MASK;
                runbits[c] = rung;
                if (rung < 2) { // decode inlined
                    if (rung == 0) { // single bits or all zeros
                        abits++;
                        if (0 != (acc & 1)) {
                            abits += B2;
                            for (size_t i = 0; i < B2; i++) {
                                acc >>= 1;
                                blockp[offset[i]] = prv -= (1 & acc);
                            }
                            prev[c] = prv;
                        }
                        else {
                            // All deltas are zero, repeat the running value
                            for (size_t i = 0; i < B2; i++)
                                blockp[offset[i]] = prv;
                        }
                    }
                    else { // rung == 1
                        // Use inline constants as nibble tables
                        // The lower two bits of the accumulator determine the size
                        // Shift the accumulator to the left to place the selector in the right place
                        acc <<= 2;
                        for (size_t i = 0; i < B2; i++) {
                            auto size = (0x31213121u >> (acc & 0b11100)) & 0xf;
                            blockp[offset[i]] = prv += smag(T((0x30102010u >> (acc & 0b11100)) & 0xf));
                            abits += size;
                            acc >>= size;
                        }
                        prev[c] = prv;
                    }
                    s.advance(abits);
                    // The continue below skips the check at the end of the loop body,
                    // test here so that a failure on this band is not lost
                    if (failed) break;
                    continue;
                }
                // longer codes
                failed |= !gdecode<false>(s, rung, group, acc, abits);
                // Undo delta encoding for this block
                for (size_t i = 0; i < B2; i++)
                    blockp[offset[i]] = prv += smag(group[i]);
                prev[c] = prv;
                if (failed) break;
            } // Per band per block
            if (failed) break;
        } // per block
        if (failed) break;
        // For performance apply band delta per block strip, in linear order
        for (size_t j = 0; j < B; j++) {
            // Only bands that have a core band other than themselves are adjusted
            for (size_t c = 0; c < bands; c++) if (c != cband[c]) {
                auto dimg = image + stride * (y + j) + c;
                auto simg = image + stride * (y + j) + cband[c];
                for (size_t i = 0; i < xsize; i++, dimg += bands, simg += bands)
                    *dimg += *simg;
            }
        }
    } // per strip
    // It might not catch all errors
    return failed || s.avail() > 7;
}

// reports most but not all errors, for example if the input stream is too short for the last block
template<typename T>
static bool decode(uint8_t *src, size_t len, T* image, const decs &info)
{
if (info.mode == QB3M_FTL)
return decodeFTL(src, len, image, info);
auto xsize(info.xsize), ysize(info.ysize), bands(info.nbands), stride(info.stride);
auto cband = info.cband;
static_assert(std::is_integral<T>() && std::is_unsigned<T>(), "Only unsigned integer types allowed");
Expand Down Expand Up @@ -322,9 +436,8 @@ static bool decode(uint8_t *src, size_t len, T* image, const decs &info)
acc >>= abits;
if (0 == cs || 0 != (cs & TBLMASK)) { // Normal decoding, not a signal
// abits is never > 8, so it's safe to call gdecode
auto rung = (runbits[c] + cs) & NORM_MASK;
auto rung = runbits[c] = (runbits[c] + cs) & NORM_MASK;
failed |= !gdecode(s, rung, group, acc, abits);
runbits[c] = rung;
}
else { // extra encoding
cs = dsw[acc & LONG_MASK]; // rung, no flag
Expand Down Expand Up @@ -389,24 +502,26 @@ static bool decode(uint8_t *src, size_t len, T* image, const decs &info)
acc >>= (cs >> 12) - 1; // No flag
abits += (cs >> 12) - 1;
failed |= rung == 63; // TODO: Deal with 64bit overflow
// 16 index values in group, max is 7, use rung 2
T maxval(0);
// 16 index values in group, max group size is 7, use rung 2
T maxidx(0);
acc <<= 2; // preshift accumulator
for (int i = 0; i < B2; i++) {
auto v = DRG[2][acc & 0xf];
group[i] = static_cast<uint8_t>(v);
if (maxval < group[i])
maxval = group[i];
acc >>= v >> 12;
abits += v >> 12;
uint32_t size = (0x4232423242324232ull >> (acc & 0b111100)) & 0xf;
group[i] = T((0x7130612051304120ull >> (acc & 0b111100)) & 0xf);
acc >>= size;
abits += size;
if (maxidx < group[i])
maxidx = group[i];
}
s.advance(abits);
T idxarray[B2 / 2] = {};
for (size_t i = 0; i <= maxval; i++) {
for (size_t i = 0; i <= maxidx; i++) {
acc = s.peek();
auto v = qb3dsztbl(acc, rung);
s.advance(v.first);
idxarray[i] = T(v.second);
}
// Apply idxarray to group
for (int i = 0; i < B2; i++)
group[i] = idxarray[group[i]];
}
Expand Down
4 changes: 2 additions & 2 deletions QB3lib/QB3encode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ size_t qb3_max_encoded_size(const encsp p) {
}

qb3_mode qb3_set_encoder_mode(encsp p, qb3_mode mode) {
if (mode <= qb3_mode::QB3M_BEST)
if (mode < qb3_mode::QB3M_END)
p->mode = mode;
// Default curve is HILBERT, change it if needed
switch (p->mode) {
Expand Down Expand Up @@ -421,7 +421,7 @@ static size_t raw_size(encsp const &p) {
int qb3_get_encoder_state(encsp p) { return p->error; }

static bool is_fast(qb3_mode mode) {
return (QB3M_BASE_H == mode) || (QB3M_BASE_Z == mode);
return (QB3M_BASE_H == mode) || (QB3M_BASE_Z == mode) || (QB3M_FTL == mode);
}

// ONLY QB3M_BASE and QB3M_CF are supported here
Expand Down
Loading

0 comments on commit 19c8ff1

Please sign in to comment.