From af37fd8b30e37ccbffdd82e6f48559e2fb7ce7dd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 8 Mar 2024 12:40:02 +0200 Subject: [PATCH 01/39] server : fix EOS token detection with disabled cache (#5938) --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index f255ad76455bf..1434095fc4e09 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1123,7 +1123,7 @@ struct server_context { }); } - if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model)) { + if (result.tok == llama_token_eos(model)) { slot.stopped_eos = true; slot.has_next_token = false; From e457fb3540e0aaec47cfde0abf784c213f9216ee Mon Sep 17 00:00:00 2001 From: Don Mahurin Date: Fri, 8 Mar 2024 02:41:50 -0800 Subject: [PATCH 02/39] llama : assume tied weights if lm_head/output weights is missing (#5824) This is to support model configurations with "tie_word_embeddings" set to true. Co-authored-by: Don Mahurin <2797413+dmahurin@users.noreply.github.com> --- llama.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 4225f955590dd..458382b21057c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3888,7 +3888,13 @@ static bool llm_load_tensors( { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); if (model.arch != LLM_ARCH_MINICPM){ - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + // if output is NULL, init from the input tok embed + if (model.output == NULL) { + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + ml.n_created--; // artificial tensor + ml.size_data += ggml_nbytes(model.output); + } } } From 76e868821a94072fbc87cb1fcca291694319eae8 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Fri, 8 Mar 2024 12:25:04 +0100 Subject: [PATCH 03/39] server: metrics: add llamacpp:prompt_seconds_total and llamacpp:tokens_predicted_seconds_total, reset bucket only on /metrics. Fix values cast to int. Add Process-Start-Time-Unix header. (#5937) Closes #5850 --- examples/server/server.cpp | 47 ++++++++++++++----- examples/server/tests/features/server.feature | 1 + examples/server/tests/features/steps/steps.py | 11 ++++- 3 files changed, 46 insertions(+), 13 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1434095fc4e09..109ff717577ec 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -335,8 +335,12 @@ struct server_slot { }; struct server_metrics { + const int64_t t_start = ggml_time_us(); + uint64_t n_prompt_tokens_processed_total = 0; + uint64_t t_prompt_processing_total = 0; uint64_t n_tokens_predicted_total = 0; + uint64_t t_tokens_generation_total = 0; uint64_t n_prompt_tokens_processed = 0; uint64_t t_prompt_processing = 0; @@ -348,12 +352,14 @@ struct server_metrics { n_prompt_tokens_processed_total += slot.n_prompt_tokens_processed; n_prompt_tokens_processed += slot.n_prompt_tokens_processed; t_prompt_processing += slot.t_prompt_processing; + t_prompt_processing_total += slot.t_prompt_processing; } void on_prediction(const server_slot &slot) { - n_tokens_predicted_total += slot.n_decoded; - n_tokens_predicted += slot.n_decoded; - t_tokens_generation += slot.t_token_generation; + n_tokens_predicted_total += slot.n_decoded; + n_tokens_predicted += slot.n_decoded; + t_tokens_generation += slot.t_token_generation; + t_tokens_generation_total += slot.t_token_generation; } void reset_bucket() { @@ -1502,9 +1508,12 @@ struct server_context { { "idle", n_idle_slots }, { "processing", n_processing_slots }, { "deferred", queue_tasks.queue_tasks_deferred.size() }, + { "t_start", metrics.t_start}, { "n_prompt_tokens_processed_total", metrics.n_prompt_tokens_processed_total}, + { "t_tokens_generation_total", metrics.t_tokens_generation_total}, { "n_tokens_predicted_total", metrics.n_tokens_predicted_total}, + { "t_prompt_processing_total", metrics.t_prompt_processing_total}, { "n_prompt_tokens_processed", metrics.n_prompt_tokens_processed}, { "t_prompt_processing", metrics.t_prompt_processing}, @@ -1517,7 +1526,9 @@ struct server_context { { "slots", slots_data }, }; - metrics.reset_bucket(); + if (json_value(task.data, "reset_bucket", false)) { + metrics.reset_bucket(); + } queue_results.send(res); } break; } @@ -2709,6 +2720,7 @@ int main(int argc, char ** argv) { task.id_multi = -1; task.id_target = -1; task.type = SERVER_TASK_TYPE_METRICS; + task.data.push_back({{"reset_bucket", true}}); ctx_server.queue_results.add_waiting_task_id(task.id); ctx_server.queue_tasks.post(task); @@ -2732,20 +2744,28 @@ int main(int argc, char ** argv) { {"counter", {{ {"name", "prompt_tokens_total"}, {"help", "Number of prompt tokens processed."}, - {"value", data["n_prompt_tokens_processed_total"]} + {"value", (uint64_t) data["n_prompt_tokens_processed_total"]} + }, { + {"name", "prompt_seconds_total"}, + {"help", "Prompt process time"}, + {"value", (uint64_t) data["t_prompt_processing_total"] / 1.e3} }, { {"name", "tokens_predicted_total"}, {"help", "Number of generation tokens processed."}, - {"value", data["n_tokens_predicted_total"]} + {"value", (uint64_t) data["n_tokens_predicted_total"]} + }, { + {"name", "tokens_predicted_seconds_total"}, + {"help", "Predict process time"}, + {"value", (uint64_t) data["t_tokens_generation_total"] / 1.e3} }}}, {"gauge", {{ {"name", "prompt_tokens_seconds"}, {"help", "Average prompt throughput in tokens/s."}, - {"value", n_prompt_tokens_processed ? 1e3 / t_prompt_processing * n_prompt_tokens_processed : 0} + {"value", n_prompt_tokens_processed ? 1.e3 / t_prompt_processing * n_prompt_tokens_processed : 0.} },{ {"name", "predicted_tokens_seconds"}, {"help", "Average generation throughput in tokens/s."}, - {"value", n_tokens_predicted ? 1e3 / t_tokens_generation * n_tokens_predicted : 0} + {"value", n_tokens_predicted ? 1.e3 / t_tokens_generation * n_tokens_predicted : 0.} },{ {"name", "kv_cache_usage_ratio"}, {"help", "KV-cache usage. 1 means 100 percent usage."}, @@ -2753,15 +2773,15 @@ int main(int argc, char ** argv) { },{ {"name", "kv_cache_tokens"}, {"help", "KV-cache tokens."}, - {"value", data["kv_cache_tokens_count"]} + {"value", (uint64_t) data["kv_cache_tokens_count"]} },{ {"name", "requests_processing"}, {"help", "Number of request processing."}, - {"value", data["processing"]} + {"value", (uint64_t) data["processing"]} },{ {"name", "requests_deferred"}, {"help", "Number of request deferred."}, - {"value", data["deferred"]} + {"value", (uint64_t) data["deferred"]} }}} }; @@ -2775,13 +2795,16 @@ int main(int argc, char ** argv) { const std::string name = metric_def["name"]; const std::string help = metric_def["help"]; - auto value = json_value(metric_def, "value", 0); + auto value = json_value(metric_def, "value", 0.); prometheus << "# HELP llamacpp:" << name << " " << help << "\n" << "# TYPE llamacpp:" << name << " " << type << "\n" << "llamacpp:" << name << " " << value << "\n"; } } + const int64_t t_start = data["t_start"]; + res.set_header("Process-Start-Time-Unix", std::to_string(t_start)); + res.set_content(prometheus.str(), "text/plain; version=0.0.4"); res.status = 200; // HTTP OK }); diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index f3b758c798c8d..878ac1363c419 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -29,6 +29,7 @@ Feature: llama.cpp server And a completion request with no api error Then tokens are predicted matching And prometheus metrics are exposed + And metric llamacpp:tokens_predicted is Examples: Prompts | prompt | n_predict | re_content | n_predicted | diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index a0b2ffdfef9f0..d7f0058360c23 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -586,14 +586,24 @@ async def step_prometheus_metrics_exported(context): metric_exported = False if context.debug: print(f"/metrics answer:\n{metrics_raw}\n") + context.metrics = {} for metric in parser.text_string_to_metric_families(metrics_raw): match metric.name: case "llamacpp:kv_cache_usage_ratio": assert len(metric.samples) > 0 metric_exported = True + context.metrics[metric.name] = metric + assert int(metrics_response.headers["Process-Start-Time-Unix"]) > 0, "no header process start time" assert metric_exported, "No metrics exported" +@step(u'metric {metric_name} is {metric_value:d}') +def step_assert_metric_value(context, metric_name, metric_value): + if metric_name not in context.metrics: + assert False, f"no metric {metric_name} in {context.metrics.keys()}" + assert context.metrics[metric_name].samples[0].value == metric_value, f"metric: {context.metrics[metric_name]}" + + @step(u'available models') def step_available_models(context): # openai client always expects an api_key @@ -879,7 +889,6 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re f' {n_predicted} <> {expected_predicted_n}') - async def gather_tasks_results(context): n_tasks = len(context.concurrent_tasks) if context.debug: From 515f7d0d4fce41c752fc253acf30707c3be2531e Mon Sep 17 00:00:00 2001 From: compilade <113953597+compilade@users.noreply.github.com> Date: Fri, 8 Mar 2024 10:53:37 -0500 Subject: [PATCH 04/39] llama : fix quantization of shared token_embd (#5944) --- llama.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index 458382b21057c..4a20b79289fa5 100644 --- a/llama.cpp +++ b/llama.cpp @@ -10973,6 +10973,9 @@ struct quantize_state_internal { bool has_imatrix = false; + // used to figure out if a model shares tok_embd with the output weight + bool has_output = false; + quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params) : model(model) , params(params) @@ -11070,8 +11073,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings // with the quantization of the output tensor - if (name == tn(LLM_TENSOR_OUTPUT, "weight") || - (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) { + if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) { int nx = tensor->ne[0]; if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { new_type = GGML_TYPE_Q8_0; @@ -11460,6 +11462,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s else if (name.find("ffn_up") != std::string::npos) { ++qs.n_ffn_up; } + else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) { + qs.has_output = true; + } } if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) { LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n", From c2101a2e909ac7c08976d414e64e96c90ee5fa9e Mon Sep 17 00:00:00 2001 From: compilade <113953597+compilade@users.noreply.github.com> Date: Fri, 8 Mar 2024 17:31:00 -0500 Subject: [PATCH 05/39] llama : support Mamba Selective State Space Models (#5328) * mamba : begin working on support for Mamba SSM * mamba : begin figuring out how to (ab)use the kv cache for Mamba * mamba : recurrent inference almost works, but incoherent * mamba : recurrent inference WORKS!!! * convert : optionally use d_conv and d_state from config.json for Mamba * mamba : refactor recurrent conv, resulting in 20% perf increase It's still slower than I'd like, but I did not really optimize `ggml_exp` yet. I also refactored `ggml_exp` to work with tensors with more than 2 dimensions. * ggml : parallelize ggml_exp This results in 8% faster token generation for Mamba-130M. * mamba : simplify the conv step with a self-overlapping view Turns out the conv_state can be made smaller by one column. Note that this breaks existing GGUFs of Mamba, because the key_value_length field is tied to the conv_state size. Convolution with a self-overlapping view is cool! And it's much simpler than what I initially thought would be necessary to make the convolution step work with more than 1 token at a time. Next step is to make the SSM step work on batches of tokens too, and thus I need to figure out a way to make a parallel selective scan which will keep the ssm_state small and won't make it bigger by a factor of (n_layer * batch_size). * llama : fix Mamba KV self size wrongly displaying as f16 instead of f32 Relatedly, I also tried to see if other types than f32 worked for the states, but they don't, because of the operators used. It's probably better anyway to keep lots of precision there, since the states are small anyway. * mamba : fix self-overlapping view depth stride * mamba : handle batches of more than 1 token This means running Mamba no longer crashes when using the default settings! And probably also slightly faster prompt processing. Both batched and non-batched processing yield the same output. Previously, the state was not cleared when starting a sequence. Next step is to make the KV cache API work as expected for Mamba models. * ggml: add ggml_ssm_scan to help with parallel selective scan If the selective scan was implemented without a custom operator, there would be waaay too many nodes in the graph. For example, for Mamba-130M, with a batch size of 512 (the default), a naive selective scan could add at least 24*512=12288 nodes, which is more than LLAMA_MAX_NODES (8192), and that's only for the smallest Mamba model. So it's much cleaner with a custom operator. Not sure about the name, though. * ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation This will help with performance on CPU if ggml_vec_mul_f32 and ggml_vec_add_f32 are ever optimized with SIMD. * mamba : very basic quantization support Mostly works, but there is currently no difference between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same). Most of the SSM-specific weights can be kept in f32 without affecting the size that much, since they are relatively small. (the linear projection weights are responsible for most of Mamba's size) Too much quantization seems to make the state degrade quite fast, and the model begins to output gibberish. It seems to affect bigger models to a lesser extent than small models, but I'm not sure by how much. Experimentation will be needed to figure out which weights are more important for the _M (and _L?) variants of k-quants for Mamba. * convert : fix wrong name for layer norm weight of offical Mamba models I was using Q-bert/Mamba-* models before, which have a slighlty different naming scheme for the weights. (they start with "model.layers" instead of "backbone.layers") * mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator This increases performance on CPU by around 30% for prompt processing, and by around 20% for text generation. However, it also makes the ggml_exp and ggml_soft_plus operators unused. Whether or not they should be kept will be decided later. * convert : for Mamba, also consider the "MambaLMHeadModel" arch name It's the name of the class of the official implementation, though they don't use it (yet) in the "architectures" field of config.json * mamba : fix vocab size problems with official models The perplexity was waaaay to high for models with a non-round vocab size. Not sure why, but it needed to be fixed in the metadata. Note that this breaks existing GGUF-converted Mamba models, but **only if** the vocab size was not already rounded. * ggml : remove ggml_exp and ggml_soft_plus They did not exist anyway outside of this branch, and since ggml_ssm_scan fused operations together, they are unused. It's always possible to bring them back if needed. * mamba : remove some useless comments No code change. * convert : fix flake8 linter errors * mamba : apply suggestions from code review * mamba : remove unecessary branch for row-wise ssm_state and C multiplication It was previously done to avoid permuting when only one token is processed at a time (like when generating text), but permuting is cheap, and dynamically changing the compute graph is not future-proof. * ggml : in ggml_ssm_scan, use more appropriate asserts * ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32 * mamba : multiple sequences, but one at a time This is a step towards making this Mamba implementation usable with the server example (the way the system prompt is kept when clearing the client slots will need to be changed before this can work, though). The KV cache size for this kind of model is tied to the maximum number of sequences kept at any single time. For now, this number is obtained from n_parallel (plus one, to have an extra sequence to dedicate to the system prompt), but there might be a better way to do this which won't also make the main example use 2 cells even if only 1 is really used. (for this specific case, --parallel 0 helps) Simultaneous sequence processing will probably require changes to ggml_ssm_scan, and possibly a new operator for the conv step. * mamba : support llama_kv_cache_seq_cp This (mis)uses the logic around K shifts, because tokens in a state can't be shifted anyway, and because inp_K_shift has the right shape and type. Using ggml_get_rows is a nice way to do copies, but copy chains can't work. Fortunately, copy chains don't really seem to be used in the examples. Each KV cell is dedicated to the sequence ID corresponding to its own index. * mamba : use a state mask It's cleaner than the previous heuristic of checking for the pos of the first token in the batch. inp_KQ_mask could not be re-used for this, because it has the wrong shape and because it seems more suited to the next step of simultaneous sequence processing (helping with the problem of remembering which token belongs to which sequence(s)/state(s)). * llama : replace the usage of n_ctx with kv_self.size in many places * mamba : use n_tokens directly instead of n_tok * mamba : in comments, properly refer to KV cells instead of slots * mamba : reduce memory usage of ggml_ssm_scan From 290.37 MiB to 140.68 MiB of CPU compute buffer size with Mamba 3B with a batch size of 512. The result tensor of ggml_ssm_scan was previously a big part of the CPU compute buffer size. To make it smaller, it does not contain the intermediate ssm states anymore. Both y and the last ssm state are combined in the result tensor, because it seems only a single tensor can be returned by an operator with the way the graph is built. * mamba : simultaneous sequence processing A batch can now contain tokens from multiple sequences. This is necessary for at least the parallel example, the server example, and the HellaSwag test in the perplexity example. However, for this to be useful, uses of llama_kv_cache_seq_rm/cp will need to be changed to work on whole sequences. * ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba This operator makes it possible to use and update the correct states for each token of the batch in the same way as ggml_ssm_scan. Other solutions which use existing operators would need loops which would add too many nodes to the graph (at least the ones I thought of). Using this operator further reduces the size of the CPU compute buffer from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512. And (at least on CPU), it's a bit faster than before. Note that "ggml_ssm_conv" is probably not the most appropriate name, and it could be changed if a better one is found. * llama : add inp_s_seq as a new input tensor The most convenient implementation to select the correct state (for Mamba) for each token is to directly get the correct index from a tensor. This is why inp_s_seq is storing int32_t and not floats. The other, less convenient way to select the correct state would be to have inp_KQ_mask contain 1.0f for each state used by a token and 0.0f otherwise. This complicates quickly fetching the first used state of a token, and is also less efficient because a whole row of the mask would always need to be read for each token. Using indexes makes it easy to stop searching when there are no more sequences for a token, and the first sequence assigned is always very quickly available (it's the first element of each row). * mamba : support llama_kv_cache_seq_cp copy chains * mamba : support shifting and dividing the kv cache pos * mamba : make the server and parallel examples work with whole sequences A seq_id is dedicated to the system prompt in both cases. * llama : make llama_kv_cache_seq_rm return whether it succeeded or not * mamba : dedicate an input tensor for state copy indices This is cleaner and makes it easier to adapt when/if token positions (and by extension, inp_K_shift) are no longer integers. * mamba : adapt perplexity, batched, and batched-bench examples * perplexity : limit the max number of sequences This adapts to what the loaded model can provide. * llama : add llama_n_max_seq to get the upper limit for seq_ids Used by the perplexity example. * batched : pass n_parallel to the model's context params This should have been there already, but it wasn't. * batched-bench : reserve sequences to support Mamba * batched-bench : fix tokens being put in wrong sequences Generation quality isn't what's measured in there anyway, but at least using the correct sequences avoids using non-consecutive token positions. * mamba : stop abusing attention metadata This breaks existing converted-to-GGUF Mamba models, but will allow supporting mixed architectures like MambaFormer without needing to break Mamba models. This will also allow changing the size of Mamba's states without having to reconvert models in the future. (e.g. using something else than d_conv - 1 columns for the conv_states will not require breaking existing converted Mamba models again) * gguf-py : add new KV metadata key-value pairs for Mamba * llama : add new metadata key-value pairs for Mamba * llama : guard against divisions by zero when n_head is 0 * mamba : rename "unlimited" KV cache property to "recurrent" * mamba : more correctly update the "used" field of the KV cache * ggml : in ggml_ssm_scan, use a threshold for soft_plus This is how the official Mamba implementation does it, and it's also what torch.nn.Softplus does. * convert : for Mamba, fallback to internal NeoX tokenizer The resulting models are exactly the same as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there. * mamba : support state saving and restoring * ggml : implicitly pass src tensors through dst for Mamba-related ops * mamba : clarify some comments * server : fix cache_tokens not getting correctly resized Otherwise, when the "we have to evaluate at least 1 token" special case was triggered, an extra token was kept in cache_tokens even if it was removed from the KV cache. For Mamba, this caused useless prompt reprocessing when the previous request triggered the above case. * convert-hf : support new metadata keys for Mamba For the models available at https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406 * mamba : rename metadata to be more similar to transformers library This breaks existing converted-to-GGUF models, but the metadata names are more "standard". * mamba : support mamba-*-hf models These models share their token_embd.weight with their output.weight * mamba : add missing spaces This is purely a formatting change. * convert-hf : omit output.weight when identical with token_embd.weight Only for Mamba for now, but it might be relevant for other models eventually. Most Mamba models actually share these two tensors, albeit implicitly. * readme : add Mamba to supported models, and add recent API changes * mamba : move state_seq and state_mask views outside layer loop A few tensors were also missing `struct` in front of `ggml_tensor`. --- README.md | 2 + common/common.cpp | 1 + convert-hf-to-gguf.py | 118 ++++ examples/batched-bench/batched-bench.cpp | 13 +- examples/batched/batched.cpp | 3 +- examples/parallel/parallel.cpp | 20 +- examples/perplexity/perplexity.cpp | 9 +- examples/server/server.cpp | 51 +- ggml.c | 379 +++++++++++- ggml.h | 19 + gguf-py/gguf/constants.py | 41 ++ gguf-py/gguf/gguf_writer.py | 12 + gguf-py/gguf/tensor_mapping.py | 46 +- llama.cpp | 698 +++++++++++++++++++++-- llama.h | 4 +- 15 files changed, 1342 insertions(+), 74 deletions(-) diff --git a/README.md b/README.md index f754022de894d..d7dba73e62267 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ### Recent API changes +- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_max_seq()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328 - [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796 - [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849 @@ -110,6 +111,7 @@ Typically finetunes of the base models below are supported as well. - [x] [InternLM2](https://huggingface.co/models?search=internlm2) - [x] [CodeShell](https://github.com/WisdomShell/codeshell) - [x] [Gemma](https://ai.google.dev/gemma) +- [x] [Mamba](https://github.com/state-spaces/mamba) **Multimodal models:** diff --git a/common/common.cpp b/common/common.cpp index c244db6443eaa..d7f650ef486e9 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1288,6 +1288,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.n_ctx = params.n_ctx; cparams.n_batch = params.n_batch; + cparams.n_parallel = params.n_parallel; cparams.n_threads = params.n_threads; cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; cparams.seed = params.seed; diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index f6369af38081d..5eee320163d29 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1847,6 +1847,124 @@ class StarCoder2Model(Model): model_arch = gguf.MODEL_ARCH.STARCODER2 +@Model.register("MambaForCausalLM", "MambaLMHeadModel") +class MambaModel(Model): + model_arch = gguf.MODEL_ARCH.MAMBA + + def set_vocab(self): + vocab_size = self.hparams["vocab_size"] + # Round vocab size to next multiple of 8 + pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8) + # pad using ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + vocab_size = -(vocab_size // -pad_vocab) * pad_vocab + self.hparams["vocab_size"] = vocab_size + + if (self.dir_model / "tokenizer.json").is_file(): + self._set_vocab_gpt2() + else: + # Use the GPT-NeoX tokenizer when no tokenizer files are present + tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf" + print(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") + neox_reader = gguf.GGUFReader(tokenizer_path, "r") + + field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL) + self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1])) + field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST) + self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) + field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) + self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) + field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES) + self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) + field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID) + self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0]) + field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID) + self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0]) + field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID) + self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0]) + + def set_gguf_parameters(self): + d_model = self.find_hparam(["hidden_size", "d_model"]) + d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 + d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model + d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 + # ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 + dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) + rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 + + # Fail early for models which don't have a block expansion factor of 2 + assert d_inner == 2 * d_model + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default + self.gguf_writer.add_embedding_length(d_model) + self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading + self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading + self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_ssm_conv_kernel(d_conv) + self.gguf_writer.add_ssm_inner_size(d_inner) + self.gguf_writer.add_ssm_state_size(d_state) + self.gguf_writer.add_ssm_time_step_rank(dt_rank) + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) + self.gguf_writer.add_file_type(self.ftype) + + def write_tensors(self): + block_count = self.hparams["n_layer"] + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + tok_embd = None + tok_embd_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.TOKEN_EMBD] + ".weight" + output_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.OUTPUT] + ".weight" + + for name, data_torch in self.get_tensors(): + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + if name.endswith(".A_log"): + print("A_log --> A ==> " + new_name) + data_torch = -torch.exp(data_torch) + + # assuming token_embd.weight is seen before output.weight + if tok_embd is not None and new_name == output_name: + if torch.equal(tok_embd, data_torch): + print(f"{output_name} is equivalent to {tok_embd_name}, omitting") + continue + if new_name == tok_embd_name: + tok_embd = data_torch + + data = data_torch.squeeze().numpy() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert big float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and new_name.removesuffix(".weight").endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + + ###### CONVERSION LOGIC ###### diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 19aff18aefde7..dff6c68ec2e69 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -105,6 +105,9 @@ int main(int argc, char ** argv) { ctx_params.n_threads = params.n_threads; ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + // ensure enough sequences are available + ctx_params.n_parallel = *std::max_element(n_pl.begin(), n_pl.end()); + llama_context * ctx = llama_new_context_with_model(model, ctx_params); if (ctx == NULL) { @@ -174,10 +177,10 @@ int main(int argc, char ** argv) { llama_batch_clear(batch); - const int n_tokens = is_pp_shared ? pp : pl*pp; - - for (int i = 0; i < n_tokens; ++i) { - llama_batch_add(batch, 0, i, { 0 }, false); + for (int i = 0; i < pp; ++i) { + for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) { + llama_batch_add(batch, 0, i, { j }, false); + } } batch.logits[batch.n_tokens - 1] = true; @@ -192,7 +195,7 @@ int main(int argc, char ** argv) { if (is_pp_shared) { for (int32_t i = 1; i < pl; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, 0, pp); + llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); } } diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 9be7eb56bcd8a..dde4d5a068e24 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -80,6 +80,7 @@ int main(int argc, char ** argv) { ctx_params.seed = 1234; ctx_params.n_ctx = n_kv_req; ctx_params.n_batch = std::max(n_len, n_parallel); + ctx_params.n_parallel = n_parallel; ctx_params.n_threads = params.n_threads; ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; @@ -132,7 +133,7 @@ int main(int argc, char ** argv) { // assign the system KV cache to all parallel sequences // this way, the parallel sequences will "reuse" the prompt tokens without having to copy them for (int32_t i = 1; i < n_parallel; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens); + llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); } if (n_parallel > 1) { diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 7d11fcd593080..a2ef0fb039c3f 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -107,6 +107,9 @@ int main(int argc, char ** argv) { // number of simultaneous "clients" to simulate const int32_t n_clients = params.n_parallel; + // dedicate one sequence to the system prompt + params.n_parallel += 1; + // requests to simulate const int32_t n_seq = params.n_sequences; @@ -196,8 +199,8 @@ int main(int argc, char ** argv) { } // assign the system KV cache to all parallel sequences - for (int32_t i = 1; i < n_clients; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, 0, n_tokens_system); + for (int32_t i = 1; i <= n_clients; ++i) { + llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); } LOG_TEE("\n"); @@ -221,15 +224,17 @@ int main(int argc, char ** argv) { client.i_batch = batch.n_tokens; - llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id }, true); + llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true); client.n_decoded += 1; } if (batch.n_tokens == 0) { // all sequences have ended - clear the entire KV cache - for (int i = 0; i < n_clients; ++i) { - llama_kv_cache_seq_rm(ctx, i, n_tokens_system, -1); + for (int i = 1; i <= n_clients; ++i) { + llama_kv_cache_seq_rm(ctx, i, -1, -1); + // but keep the system prompt + llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); } LOG_TEE("%s: clearing the KV cache\n", __func__); @@ -255,7 +260,7 @@ int main(int argc, char ** argv) { tokens_prompt = ::llama_tokenize(ctx, client.prompt, false); for (size_t i = 0; i < tokens_prompt.size(); ++i) { - llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id }, false); + llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false); } // extract the logits only for the last token @@ -366,7 +371,8 @@ int main(int argc, char ** argv) { } // delete only the generated part of the sequence, i.e. keep the system prompt in the cache - llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, -1); + llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1); + llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1); const auto t_main_end = ggml_time_us(); diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 9ec989389cfad..52789ee631234 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -809,7 +809,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { const int n_batch = params.n_batch; const int max_tasks_per_batch = 32; - const int max_seq = 4*max_tasks_per_batch; + const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_max_seq(ctx)); llama_batch batch = llama_batch_init(n_ctx, 0, max_seq); @@ -1086,7 +1086,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { const int n_batch = params.n_batch; const int max_tasks_per_batch = 128; - const int max_seq = 2*max_tasks_per_batch; + const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_max_seq(ctx)); llama_batch batch = llama_batch_init(n_ctx, 0, max_seq); @@ -1438,7 +1438,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params const int n_batch = params.n_batch; const int max_tasks_per_batch = 32; - const int max_seq = 4*max_tasks_per_batch; + const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_max_seq(ctx)); llama_batch batch = llama_batch_init(n_ctx, 0, max_seq); @@ -1815,6 +1815,9 @@ int main(int argc, char ** argv) { llama_model * model; llama_context * ctx; + // ensure there's at least enough seq_ids for HellaSwag + params.n_parallel = std::max(4, params.n_parallel); + // load the model and apply lora adapter, if any std::tie(model, ctx) = llama_init_from_gpt_params(params); if (model == NULL) { diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 109ff717577ec..59a59d56b6023 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -659,7 +659,11 @@ struct server_context { bool load_model(const gpt_params & params_) { params = params_; + // dedicate one sequence to the system prompt + params.n_parallel += 1; + std::tie(model, ctx) = llama_init_from_gpt_params(params); + params.n_parallel -= 1; // but be sneaky about it if (model == nullptr) { LOG_ERROR("unable to load model", {{"model", params.model}}); return false; @@ -1018,8 +1022,8 @@ struct server_context { } // assign the system KV cache to all parallel sequences - for (int32_t i = 1; i < params.n_parallel; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size()); + for (int32_t i = 1; i <= params.n_parallel; ++i) { + llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); } } @@ -1306,7 +1310,7 @@ struct server_context { const int n_embd = llama_n_embd(model); for (int i = 0; i < batch.n_tokens; ++i) { - if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { + if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) { continue; } @@ -1633,8 +1637,8 @@ struct server_context { {"n_cache_tokens", slot.cache_tokens.size()} }); - llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); + llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); if (slot.params.cache_prompt) { for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { @@ -1666,7 +1670,7 @@ struct server_context { // TODO: we always have to take into account the "system_tokens" // this is not great and needs to be improved somehow - llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true); + llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true); slot.n_past += 1; @@ -1804,9 +1808,6 @@ struct server_context { // reuse any previously computed tokens that are common with the new prompt slot.n_past = common_part(slot.cache_tokens, prompt_tokens); - // remove the non-common part from the cache - slot.cache_tokens.resize(slot.n_past); - // push the prompt into the sampling context (do not apply grammar) for (int i = 0; i < slot.n_past; ++i) { llama_sampling_accept(slot.ctx_sampling, ctx, slot.cache_tokens[i], false); @@ -1837,8 +1838,28 @@ struct server_context { } } - const int p0 = (int) system_tokens.size() + slot.n_past; - llama_kv_cache_seq_rm(ctx, slot.id, p0, -1); + // keep only the common part + int p0 = (int) system_tokens.size() + slot.n_past; + if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) { + // could not partially delete (likely using a non-Transformer model) + llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1); + + p0 = (int) system_tokens.size(); + if (p0 != 0) { + // copy over the system prompt when there is one + llama_kv_cache_seq_cp(ctx, 0, slot.id + 1, -1, -1); + } + + // there is no common part left (except for the system prompt) + slot.n_past = 0; + slot.n_past_se = 0; + slot.ga_i = 0; + // TODO: is the system prompt ever in the sampling context? + llama_sampling_reset(slot.ctx_sampling); + } + + // remove the non-common part from the cache + slot.cache_tokens.resize(slot.n_past); LOG_INFO("kv cache rm [p0, end)", { { "id_slot", slot.id }, @@ -1863,7 +1884,7 @@ struct server_context { } } - llama_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id }, false); + llama_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false); if (slot.params.cache_prompt) { slot.cache_tokens.push_back(prompt_tokens[slot.n_past]); @@ -1937,9 +1958,9 @@ struct server_context { LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); - llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd); - llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n); - llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd); + llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd); + llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n); + llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd); slot.n_past_se -= bd; diff --git a/ggml.c b/ggml.c index 92b17ee6ed7bd..6eff98ab63f0c 100644 --- a/ggml.c +++ b/ggml.c @@ -1841,6 +1841,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "FLASH_ATTN", "FLASH_FF", "FLASH_ATTN_BACK", + "SSM_CONV", + "SSM_SCAN", "WIN_PART", "WIN_UNPART", "GET_REL_POS", @@ -1863,7 +1865,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74"); +static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1929,6 +1931,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "flash_attn(x)", "flash_ff(x)", "flash_attn_back(x)", + "ssm_conv(x)", + "ssm_scan(x)", "win_part(x)", "win_unpart(x)", "get_rel_pos(x)", @@ -1951,7 +1955,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74"); +static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -6154,6 +6158,108 @@ struct ggml_tensor * ggml_flash_attn_back( return result; } +// ggml_ssm_conv + +struct ggml_tensor * ggml_ssm_conv( + struct ggml_context * ctx, + struct ggml_tensor * s, + struct ggml_tensor * x, + struct ggml_tensor * c, + struct ggml_tensor * sq) { + GGML_ASSERT(ggml_is_3d(s)); + GGML_ASSERT(ggml_is_matrix(x)); + GGML_ASSERT(ggml_is_matrix(c)); + GGML_ASSERT(ggml_is_matrix(sq)); + GGML_ASSERT(sq->type == GGML_TYPE_I32); + + const int64_t d_conv = c->ne[0]; + const int64_t d_inner = c->ne[1]; + const int64_t n_tokens = x->ne[1]; + const int64_t n_kv = s->ne[2]; + + GGML_ASSERT( s->ne[0] == d_conv - 1); + GGML_ASSERT( s->ne[1] == d_inner); + GGML_ASSERT( x->ne[0] == d_inner); + GGML_ASSERT(sq->ne[0] == n_kv); + GGML_ASSERT(sq->ne[1] == n_tokens); + + bool is_node = false; + + if (s->grad || x->grad || c->grad || sq->grad) { + GGML_ASSERT(false); // TODO: implement + is_node = true; + } + + // 2-in-1 concatenated x and conv_states, {d_inner, n_tokens} with {d_conv, d_inner, n_kv} + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, (d_inner*n_tokens) + (d_conv*d_inner*n_kv)); + + result->op = GGML_OP_SSM_CONV; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = s; + result->src[1] = x; + result->src[2] = c; + result->src[3] = sq; + + return result; +} + +// ggml_ssm_scan + +struct ggml_tensor * ggml_ssm_scan( + struct ggml_context * ctx, + struct ggml_tensor * s, + struct ggml_tensor * x, + struct ggml_tensor * dt, + struct ggml_tensor * A, + struct ggml_tensor * B, + struct ggml_tensor * C, + struct ggml_tensor * sq) { + GGML_ASSERT(ggml_is_contiguous(s)); + GGML_ASSERT(ggml_is_contiguous(x)); + GGML_ASSERT(ggml_is_contiguous(dt)); + GGML_ASSERT(ggml_is_contiguous(A)); + GGML_ASSERT(sq->type == GGML_TYPE_I32); + GGML_ASSERT(B->nb[0] == ggml_type_size(B->type)); + GGML_ASSERT(C->nb[0] == ggml_type_size(C->type)); + GGML_ASSERT(ggml_are_same_shape(x, dt)); + + { + const int64_t d_state = s->ne[0]; + const int64_t d_inner = s->ne[1]; + const int64_t n_tokens = x->ne[1]; + + GGML_ASSERT(x->ne[0] == d_inner); + GGML_ASSERT(A->ne[0] == d_state); + GGML_ASSERT(A->ne[1] == d_inner); + GGML_ASSERT(B->ne[0] == d_state); + GGML_ASSERT(B->ne[1] == n_tokens); + GGML_ASSERT(C->ne[0] == d_state); + GGML_ASSERT(C->ne[1] == n_tokens); + } + + bool is_node = false; + + if (s->grad || x->grad || dt->grad || A->grad || B->grad || C->grad || sq->grad) { + GGML_ASSERT(false); // TODO: implement + is_node = true; + } + + // 2-in-1 concatenated y and ssm_states, {d_inner, n_tokens} with {d_state, d_inner, n_kv} + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(x) + ggml_nelements(s)); + + result->op = GGML_OP_SSM_SCAN; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = s; + result->src[1] = x; + result->src[2] = dt; + result->src[3] = A; + result->src[4] = B; + result->src[5] = C; + result->src[6] = sq; + + return result; +} + // ggml_win_part struct ggml_tensor * ggml_win_part( @@ -14771,6 +14877,257 @@ static void ggml_compute_forward_flash_attn_back( } } +// ggml_compute_forward_ssm_conv + +static void ggml_compute_forward_ssm_conv_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + + const struct ggml_tensor * src0 = dst->src[0]; // conv_state + const struct ggml_tensor * src1 = dst->src[1]; // x + const struct ggml_tensor * src2 = dst->src[2]; // conv1d.weight + const struct ggml_tensor * src3 = dst->src[3]; // state_seq + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src2->ne[0]; // d_conv + const int nr = src0->ne[1]; // d_inner + const int n_t = src1->ne[1]; // n_tokens + const int n_kv = src0->ne[2]; // max number of sequences in the batch + + GGML_ASSERT((nr*n_t) + (nc*nr*n_kv) == ggml_nelements(dst)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_ASSERT(src1->nb[0] == sizeof(float)); + GGML_ASSERT(src2->nb[0] == sizeof(float)); + GGML_ASSERT(src3->nb[0] == sizeof(int32_t)); + GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float)); + // for use with the destination state offset between sequences + GGML_ASSERT(src2->nb[2] == src2->ne[1]*src2->ne[0]*sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + const int ir = ir1 - ir0; + + if (n_kv > 1) { + // multiple sequences means it's hard to know when it's the first time a state is read, + // so copy them all over to the destination, just to be sure. + for (int i3 = 0; i3 < n_kv; ++i3) { + float * s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); + float * s = (float *) ((char *) dst->data + ir0*(src2->nb[1]) + i3*(src2->nb[2]) + nr*n_t*sizeof(float)); + // can't use memcpy because of d_conv vs d_conv - 1 + for (int i1 = 0; i1 < ir; ++i1) { + for (int i0 = 0; i0 < nc - 1; ++i0) { + // copy s0 to last (d_conv - 1) columns of s + s[1 + i0 + i1*nc] = s0[i0 + i1*(nc - 1)]; + } + } + } + } + + for (int i2 = 0; i2 < n_t; ++i2) { + int32_t * sq = (int32_t *) ((char *) src3->data + i2*(src3->nb[1])); // {n_kv, n_tokens} + float * x = (float *) ((char *) dst->data + ir0*sizeof(float) + i2*(nr*sizeof(float))); // {d_inner, n_tokens} + float * s = (float *) ((char *) dst->data + ir0*(src2->nb[1]) + sq[0]*(src2->nb[2]) + nr*n_t*sizeof(float)); // {d_conv, d_inner, n_kv} + float * s0; // {d_conv - 1, d_inner, n_kv} + float * x0 = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens} + float * c = (float *) ((char *) src2->data + ir0*(src2->nb[1])); // {d_conv, d_inner} + int ne0s0; + + GGML_ASSERT(0 <= sq[0] && sq[0] < n_kv); + + // avoid needing to copy the state for the first token + if (i2 == 0) { + s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2])); // {d_conv - 1, d_inner, n_kv} + ne0s0 = src0->ne[0]; + } else { + // the source is the last (d_conv - 1) columns of the destination + s0 = s + 1; + ne0s0 = nc; + } + + // d_inner + for (int i1 = 0; i1 < ir; ++i1) { + // shift state left + for (int i0 = 0; i0 < nc - 1; ++i0) { + s[i0 + i1*nc] = s0[i0 + i1*ne0s0]; + } + // insert x on the last column + s[(nc - 1) + i1*nc] = x0[i1]; + } + + // handle copies when there are multiple output states + for (int i3 = 1; i3 < n_kv; ++i3) { + int32_t seq = sq[i3]; + if (0 <= seq && seq < n_kv) { + float * s1 = s + (seq - sq[0])*nc*nr; + memcpy(s1, s, nc*ir*sizeof(float)); + } else { + // stop at negative or too big seq_ids + break; + } + } + + // it seems a little faster when this is separate from the state shift + for (int i1 = 0; i1 < ir; ++i1) { + // rowwise dot product + float sumf = 0.0f; + for (int i0 = 0; i0 < nc; ++i0) { + int i = i0 + i1*nc; + sumf += s[i] * c[i]; + } + x[i1] = sumf; + } + } +} + +static void ggml_compute_forward_ssm_conv( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + switch (dst->src[0]->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_ssm_conv_f32(params, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_ssm_scan + +static void ggml_compute_forward_ssm_scan_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + + const struct ggml_tensor * src0 = dst->src[0]; // s + const struct ggml_tensor * src1 = dst->src[1]; // x + const struct ggml_tensor * src2 = dst->src[2]; // dt + const struct ggml_tensor * src3 = dst->src[3]; // A + const struct ggml_tensor * src4 = dst->src[4]; // B + const struct ggml_tensor * src5 = dst->src[5]; // C + const struct ggml_tensor * src6 = dst->src[6]; // sq + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t nc = src0->ne[0]; // d_state + const int64_t nr = src0->ne[1]; // d_inner + const int64_t n_t = src1->ne[1]; // number of tokens in the batch + const int64_t n_kv = src0->ne[2]; // max number of sequences in the batch + + GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_ASSERT(src1->nb[0] == sizeof(float)); + GGML_ASSERT(src2->nb[0] == sizeof(float)); + GGML_ASSERT(src3->nb[0] == sizeof(float)); + GGML_ASSERT(src4->nb[0] == sizeof(float)); + GGML_ASSERT(src5->nb[0] == sizeof(float)); + // required for the dot product between s and C, and when copying the states + GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float)); + // required for per-sequence offsets for states + GGML_ASSERT(src0->nb[2] == src0->ne[0]*src0->ne[1]*sizeof(float)); + // required to get correct offset for state destination (i.e. src1->nb[2]) + GGML_ASSERT(src1->nb[2] == src1->ne[0]*src1->ne[1]*sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + const int ir = ir1 - ir0; + + if (n_kv > 1) { + // it's hard to know if the source states have already been copied + // when there are multiple, so copy them already. + for (int i3 = 0; i3 < n_kv; ++i3) { + float * s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); + float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[2]); + memcpy(s, s0, nc*ir*sizeof(float)); + } + } + + for (int i2 = 0; i2 < n_t; ++i2) { + int32_t * sq = (int32_t *) ((char *) src6->data + i2*(src6->nb[1])); // {n_kv, n_tokens} + float * y = (float *) ((char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens} + float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2]) + src1->nb[2]); // {d_state, d_inner, n_kv} + float * s0; + float * x = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens} + float * dt = (float *) ((char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1])); // {d_inner, n_tokens} + float * A = (float *) ((char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner} + float * B = (float *) ((char *) src4->data + i2*(src4->nb[1])); // {d_state, n_tokens} + float * C = (float *) ((char *) src5->data + i2*(src5->nb[1])); // {d_state, n_tokens} + + GGML_ASSERT(0 <= sq[0] && sq[0] < n_kv); + + // avoid needing to copy the state for the first token + if (i2 == 0) { + s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2])); // {d_state, d_inner, n_kv} + } else { + // otherwise the source is the same as the destination + s0 = s; + } + + // d_inner + for (int i1 = 0; i1 < ir; ++i1) { + // ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78 + float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1]; + float x_dt = x[i1] * dt_soft_plus; + float sumf = 0.0f; + // d_state + for (int i0 = 0; i0 < nc; ++i0) { + int i = i0 + i1*nc; + // state = prev_state * dA + dB * x + float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt); + // y = rowwise_dotprod(state, C) + sumf += state * C[i0]; + s[i] = state; + } + y[i1] = sumf; + } + + // handle copies when there are multiple output states + for (int i3 = 1; i3 < n_kv; ++i3) { + int32_t seq = sq[i3]; + if (0 <= seq && seq < n_kv) { + float * s1 = s + (seq - sq[0])*nc*nr; + memcpy(s1, s, nc*ir*sizeof(float)); + } else { + // stop at negative or too big seq_ids + break; + } + } + } +} + +static void ggml_compute_forward_ssm_scan( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + switch (dst->src[0]->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_ssm_scan_f32(params, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_win_part static void ggml_compute_forward_win_part_f32( @@ -15830,6 +16187,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm bool masked = t != 0; ggml_compute_forward_flash_attn_back(params, masked, tensor); } break; + case GGML_OP_SSM_CONV: + { + ggml_compute_forward_ssm_conv(params, tensor); + } break; + case GGML_OP_SSM_SCAN: + { + ggml_compute_forward_ssm_scan(params, tensor); + } break; case GGML_OP_WIN_PART: { ggml_compute_forward_win_part(params, tensor); @@ -16884,6 +17249,11 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // not supported } break; + case GGML_OP_SSM_CONV: + case GGML_OP_SSM_SCAN: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_WIN_PART: case GGML_OP_WIN_UNPART: case GGML_OP_UNARY: @@ -17590,6 +17960,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { { n_tasks = n_threads; } break; + case GGML_OP_SSM_CONV: + case GGML_OP_SSM_SCAN: + { + n_tasks = n_threads; + } break; case GGML_OP_WIN_PART: case GGML_OP_WIN_UNPART: case GGML_OP_GET_REL_POS: diff --git a/ggml.h b/ggml.h index 0ea4f8847795c..a13b0cec4edcf 100644 --- a/ggml.h +++ b/ggml.h @@ -472,6 +472,8 @@ extern "C" { GGML_OP_FLASH_ATTN, GGML_OP_FLASH_FF, GGML_OP_FLASH_ATTN_BACK, + GGML_OP_SSM_CONV, + GGML_OP_SSM_SCAN, GGML_OP_WIN_PART, GGML_OP_WIN_UNPART, GGML_OP_GET_REL_POS, @@ -1728,6 +1730,23 @@ extern "C" { struct ggml_tensor * c0, struct ggml_tensor * c1); + GGML_API struct ggml_tensor * ggml_ssm_conv( + struct ggml_context * ctx, + struct ggml_tensor * s, + struct ggml_tensor * x, + struct ggml_tensor * c, + struct ggml_tensor * sq); + + GGML_API struct ggml_tensor * ggml_ssm_scan( + struct ggml_context * ctx, + struct ggml_tensor * s, + struct ggml_tensor * x, + struct ggml_tensor * dt, + struct ggml_tensor * A, + struct ggml_tensor * B, + struct ggml_tensor * C, + struct ggml_tensor * sq); + // partition into non-overlapping windows with padding if needed // example: // a: 768 64 64 1 diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index a62139811ef36..b23badb1019c1 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -61,6 +61,12 @@ class Rope: SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" + class SSM: + CONV_KERNEL = "{arch}.ssm.conv_kernel" + INNER_SIZE = "{arch}.ssm.inner_size" + STATE_SIZE = "{arch}.ssm.state_size" + TIME_STEP_RANK = "{arch}.ssm.time_step_rank" + class Tokenizer: MODEL = "tokenizer.ggml.model" LIST = "tokenizer.ggml.tokens" @@ -113,6 +119,7 @@ class MODEL_ARCH(IntEnum): MINICPM = auto() GEMMA = auto() STARCODER2 = auto() + MAMBA = auto() class MODEL_TENSOR(IntEnum): @@ -144,6 +151,13 @@ class MODEL_TENSOR(IntEnum): ATTN_Q_NORM = auto() ATTN_K_NORM = auto() LAYER_OUT_NORM = auto() + SSM_IN = auto() + SSM_CONV1D = auto() + SSM_X = auto() + SSM_DT = auto() + SSM_A = auto() + SSM_D = auto() + SSM_OUT = auto() MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { @@ -171,6 +185,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.MINICPM: "minicpm", MODEL_ARCH.GEMMA: "gemma", MODEL_ARCH.STARCODER2: "starcoder2", + MODEL_ARCH.MAMBA: "mamba", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -202,6 +217,13 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}", MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}", MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", + MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", + MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", + MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", + MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt", + MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", + MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", + MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -543,6 +565,19 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.MAMBA: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.SSM_IN, + MODEL_TENSOR.SSM_CONV1D, + MODEL_TENSOR.SSM_X, + MODEL_TENSOR.SSM_DT, + MODEL_TENSOR.SSM_A, + MODEL_TENSOR.SSM_D, + MODEL_TENSOR.SSM_OUT, + ], # TODO } @@ -734,6 +769,12 @@ def get_type(val: Any) -> GGUFValueType: KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED +# SSM +KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL +KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE +KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE +KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK + # tokenization KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 8011608323c45..e49c5db6866a2 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -382,6 +382,18 @@ def add_rope_scaling_orig_ctx_len(self, value: int) -> None: def add_rope_scaling_finetuned(self, value: bool) -> None: self.add_bool(Keys.Rope.SCALING_FINETUNED.format(arch=self.arch), value) + def add_ssm_conv_kernel(self, value: int) -> None: + self.add_uint32(Keys.SSM.CONV_KERNEL.format(arch=self.arch), value) + + def add_ssm_inner_size(self, value: int) -> None: + self.add_uint32(Keys.SSM.INNER_SIZE.format(arch=self.arch), value) + + def add_ssm_state_size(self, value: int) -> None: + self.add_uint32(Keys.SSM.STATE_SIZE.format(arch=self.arch), value) + + def add_ssm_time_step_rank(self, value: int) -> None: + self.add_uint32(Keys.SSM.TIME_STEP_RANK.format(arch=self.arch), value) + def add_tokenizer_model(self, model: str) -> None: self.add_string(Keys.Tokenizer.MODEL, model) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index db2ec9704a441..ed89955d8970f 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -20,6 +20,9 @@ class TensorNameMap: "wte", # gpt2 "transformer.embd.wte", # phi2 "model.tok_embeddings", # internlm2 + "model.embedding", # mamba-qbert + "backbone.embedding", # mamba + "backbone.embeddings", # mamba-hf ), # Token type embeddings @@ -44,7 +47,7 @@ class TensorNameMap: # Output MODEL_TENSOR.OUTPUT: ( "embed_out", # gptneox - "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen + "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba "output", # llama-pth bloom internlm2 "word_embeddings_for_head", # persimmon "lm_head.linear", # phi2 @@ -61,6 +64,8 @@ class TensorNameMap: "language_model.encoder.final_layernorm", # persimmon "model.final_layernorm", # persimmon "lm_head.ln", # phi2 + "model.norm_f", # mamba-qbert + "backbone.norm_f", # mamba ), # Rope frequencies @@ -86,6 +91,8 @@ class TensorNameMap: "transformer.h.{bid}.ln", # phi2 "model.layers.layers.{bid}.norm", # plamo "model.layers.{bid}.attention_norm", # internlm2 + "model.layers.{bid}.norm", # mamba-qbert + "backbone.layers.{bid}.norm", # mamba ), # Attention norm 2 @@ -282,7 +289,42 @@ class TensorNameMap: MODEL_TENSOR.LAYER_OUT_NORM: ( "encoder.layer.{bid}.output.LayerNorm", # bert "encoder.layers.{bid}.norm2", # nomic-bert - ) + ), + + MODEL_TENSOR.SSM_IN: ( + "model.layers.{bid}.in_proj", + "backbone.layers.{bid}.mixer.in_proj", + ), + + MODEL_TENSOR.SSM_CONV1D: ( + "model.layers.{bid}.conv1d", + "backbone.layers.{bid}.mixer.conv1d", + ), + + MODEL_TENSOR.SSM_X: ( + "model.layers.{bid}.x_proj", + "backbone.layers.{bid}.mixer.x_proj", + ), + + MODEL_TENSOR.SSM_DT: ( + "model.layers.{bid}.dt_proj", + "backbone.layers.{bid}.mixer.dt_proj", + ), + + MODEL_TENSOR.SSM_A: ( + "model.layers.{bid}.A_log", + "backbone.layers.{bid}.mixer.A_log", + ), + + MODEL_TENSOR.SSM_D: ( + "model.layers.{bid}.D", + "backbone.layers.{bid}.mixer.D", + ), + + MODEL_TENSOR.SSM_OUT: ( + "model.layers.{bid}.out_proj", + "backbone.layers.{bid}.mixer.out_proj", + ), } mapping: dict[str, tuple[MODEL_TENSOR, str]] diff --git a/llama.cpp b/llama.cpp index 4a20b79289fa5..8c147a42b1e53 100644 --- a/llama.cpp +++ b/llama.cpp @@ -213,6 +213,7 @@ enum llm_arch { LLM_ARCH_MINICPM, LLM_ARCH_GEMMA, LLM_ARCH_STARCODER2, + LLM_ARCH_MAMBA, LLM_ARCH_UNKNOWN, }; @@ -241,6 +242,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_MINICPM, "minicpm" }, { LLM_ARCH_GEMMA, "gemma" }, { LLM_ARCH_STARCODER2, "starcoder2" }, + { LLM_ARCH_MAMBA, "mamba" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -284,6 +286,11 @@ enum llm_kv { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, LLM_KV_ROPE_SCALING_FINETUNED, + LLM_KV_SSM_INNER_SIZE, + LLM_KV_SSM_CONV_KERNEL, + LLM_KV_SSM_STATE_SIZE, + LLM_KV_SSM_TIME_STEP_RANK, + LLM_KV_TOKENIZER_MODEL, LLM_KV_TOKENIZER_LIST, LLM_KV_TOKENIZER_TOKEN_TYPE, @@ -342,6 +349,11 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, + { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" }, + { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" }, + { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" }, + { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" }, + { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" }, @@ -399,6 +411,13 @@ enum llm_tensor { LLM_TENSOR_ATTN_Q_NORM, LLM_TENSOR_ATTN_K_NORM, LLM_TENSOR_LAYER_OUT_NORM, + LLM_TENSOR_SSM_IN, + LLM_TENSOR_SSM_CONV1D, + LLM_TENSOR_SSM_X, + LLM_TENSOR_SSM_DT, + LLM_TENSOR_SSM_A, + LLM_TENSOR_SSM_D, + LLM_TENSOR_SSM_OUT, }; static const std::map> LLM_TENSOR_NAMES = { @@ -801,6 +820,22 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_MAMBA, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" }, + { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" }, + { LLM_TENSOR_SSM_X, "blk.%d.ssm_x" }, + { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" }, + { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" }, + { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" }, + { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -1613,6 +1648,12 @@ struct llama_hparams { float rope_freq_scale_train; uint32_t n_yarn_orig_ctx; + // for State Space Models + uint32_t ssm_d_conv = 0; + uint32_t ssm_d_inner = 0; + uint32_t ssm_d_state = 0; + uint32_t ssm_dt_rank = 0; + float f_clamp_kqv = 0.0f; float f_max_alibi_bias = 0.0f; @@ -1641,6 +1682,11 @@ struct llama_hparams { if (this->rope_finetuned != other.rope_finetuned) return true; if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true; + if (this->ssm_d_conv != other.ssm_d_conv) return true; + if (this->ssm_d_inner != other.ssm_d_inner) return true; + if (this->ssm_d_state != other.ssm_d_state) return true; + if (this->ssm_dt_rank != other.ssm_dt_rank) return true; + const float EPSILON = 1e-9f; if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; @@ -1652,6 +1698,9 @@ struct llama_hparams { } uint32_t n_gqa() const { + if (n_head_kv == 0) { + return 0; + } return n_head/n_head_kv; } @@ -1662,6 +1711,18 @@ struct llama_hparams { uint32_t n_embd_v_gqa() const { // dimension of value embeddings across all k-v heads return n_embd_head_v * n_head_kv; } + + uint32_t n_embd_k_s() const { // dimension of the rolling state embeddings + // corresponds to Mamba's conv_states size + // TODO: maybe support other convolution strides than 1 + // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed + return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner; + } + + uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings + // corresponds to Mamba's ssm_states size + return ssm_d_state * ssm_d_inner; + } }; struct llama_cparams { @@ -1739,11 +1800,27 @@ struct llama_layer { struct ggml_tensor * ffn_down_b; // b2 struct ggml_tensor * ffn_up_b; // b3 struct ggml_tensor * ffn_act; + + // mamba proj + struct ggml_tensor * ssm_in; + struct ggml_tensor * ssm_x; + struct ggml_tensor * ssm_dt; + struct ggml_tensor * ssm_out; + + // mamba + struct ggml_tensor * ssm_conv1d; + struct ggml_tensor * ssm_a; + struct ggml_tensor * ssm_d; + + // mamba bias + struct ggml_tensor * ssm_conv1d_b; + struct ggml_tensor * ssm_dt_b; }; struct llama_kv_cell { llama_pos pos = -1; llama_pos delta = 0; + int32_t src = 0; // used by recurrent state models to copy states std::set seq_id; @@ -1764,6 +1841,9 @@ struct llama_kv_cell { struct llama_kv_cache { bool has_shift = false; bool do_defrag = false; + bool do_copy = false; + // with recurrent state models, a cell can hold the state for more than one past token + bool recurrent = false; // Note: The value of head isn't only used to optimize searching // for a free KV slot. llama_decode_internal also uses it, so it @@ -2003,11 +2083,14 @@ struct llama_context { struct ggml_tensor * inp_tokens; // I32 [n_batch] struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] struct ggml_tensor * inp_pos; // I32 [n_batch] - struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch] - struct ggml_tensor * inp_KQ_pos; // F32 [n_ctx] - struct ggml_tensor * inp_K_shift; // I32 [n_ctx] + struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] + struct ggml_tensor * inp_KQ_pos; // F32 [kv_size] + struct ggml_tensor * inp_K_shift; // I32 [kv_size] struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] struct ggml_tensor * inp_cls; // I32 [n_batch] + struct ggml_tensor * inp_s_copy; // I32 [kv_size] + struct ggml_tensor * inp_s_mask; // F32 [kv_size] + struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch] #ifdef GGML_USE_MPI ggml_mpi_context * ctx_mpi = NULL; @@ -2023,25 +2106,42 @@ static bool llama_kv_cache_init( const llama_model & model, ggml_type type_k, ggml_type type_v, - uint32_t n_ctx, + uint32_t kv_size, bool offload) { const struct llama_hparams & hparams = model.hparams; - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s(); const int64_t n_layer = hparams.n_layer; cache.has_shift = false; + // TODO: find a nicer way to add other recurrent model architectures + cache.recurrent = model.arch == LLM_ARCH_MAMBA; + + // TODO: support mixed reccurent Transformer architectues + // NOTE: (!a || b) is a logical implication (a -> b) + GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s()); + GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s()); + GGML_ASSERT( cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_gqa()); + GGML_ASSERT( cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_gqa()); + cache.head = 0; - cache.size = n_ctx; + cache.size = kv_size; cache.used = 0; cache.type_k = type_k; cache.type_v = type_v; cache.cells.clear(); - cache.cells.resize(n_ctx); + cache.cells.resize(kv_size); + + if (cache.recurrent) { + // init state copy sources + for (uint32_t i = 0; i < cache.size; ++i) { + cache.cells[i].src = i; + } + } #ifdef GGML_USE_CLBLAST offload = false; @@ -2080,8 +2180,8 @@ static bool llama_kv_cache_init( for (int i = 0; i < (int) n_layer; i++) { struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front(); - ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*n_ctx); - ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*n_ctx); + ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); + ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); @@ -2115,6 +2215,54 @@ static bool llama_kv_cache_find_slot( const uint32_t n_ctx = cache.size; const uint32_t n_tokens = batch.n_tokens; + if (cache.recurrent) { + // For recurrent state architectures (like Mamba), + // each KV cache cell can store the state for a whole sequence. + + llama_seq_id min = cache.size - 1; + llama_seq_id max = 0; + + for (uint32_t i = 0; i < n_tokens; ++i) { + for (int32_t j = 0; j < batch.n_seq_id[i]; ++j) { + llama_seq_id seq_id = batch.seq_id[i][j]; + // make sure it's a valid seq_id + if ((uint32_t) seq_id < cache.size) { + if (seq_id > max) { + max = seq_id; + } + if (seq_id < min) { + min = seq_id; + } + // Assuming the tokens are in-order + if (batch.pos[i] != cache.cells[seq_id].pos + 1) { + // What should happen when the pos backtracks or skips a value? + // Clearing the state mid-batch would require special-casing which isn't done. + LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d\n", + __func__, batch.pos[i], cache.cells[seq_id].pos, seq_id); + } + if (cache.cells[seq_id].pos < 0 && 0 <= batch.pos[i]) { + cache.used += 1; + } + cache.cells[seq_id].pos = batch.pos[i]; + // NOTE: seq_ids are not inserted here; they are handled when the input tensors are set + } else { + // too big seq_id + // TODO: would it be possible to resize the KV cache size instead? + LLAMA_LOG_ERROR("%s: seq_id=%d >= kv_size=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.size); + return false; + } + } + } + + // allow getting the range of used cells, from head to head + n + cache.head = min; + cache.n = max - min + 1; + + // sanity check + return max >= min; + } + // otherwise, one cell per token. + if (n_tokens > n_ctx) { LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx); return false; @@ -2184,7 +2332,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) { cache.used = 0; } -static void llama_kv_cache_seq_rm( +static bool llama_kv_cache_seq_rm( struct llama_kv_cache & cache, llama_seq_id seq_id, llama_pos p0, @@ -2194,6 +2342,25 @@ static void llama_kv_cache_seq_rm( if (p0 < 0) p0 = 0; if (p1 < 0) p1 = std::numeric_limits::max(); + // models like Mamba can't have a state partially erased + if (cache.recurrent) { + if (seq_id >= (int64_t) cache.size) { + // could be fatal + return false; + } + if (0 <= seq_id) { + // partial intersection is invalid + if ((0 < p0 && p0 <= cache.cells[seq_id].pos) || (0 < p1 && p1 <= cache.cells[seq_id].pos)) { + return false; + } + } else { + // seq_id is negative, then the range should include everything or nothing + if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits::max())) { + return false; + } + } + } + for (uint32_t i = 0; i < cache.size; ++i) { if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { if (seq_id < 0) { @@ -2215,6 +2382,8 @@ static void llama_kv_cache_seq_rm( // If we freed up a slot, set head to it so searching can start there. if (new_head != cache.size && new_head < cache.head) cache.head = new_head; + + return true; } static void llama_kv_cache_seq_cp( @@ -2226,6 +2395,29 @@ static void llama_kv_cache_seq_cp( if (p0 < 0) p0 = 0; if (p1 < 0) p1 = std::numeric_limits::max(); + if (cache.recurrent) { + if ((uint32_t) seq_id_dst < cache.size && (uint32_t) seq_id_src < cache.size) { + seq_id_src = cache.cells[seq_id_src].src; + GGML_ASSERT((uint32_t) seq_id_src < cache.size); + // intent to "copy from" + // supports copy chains thanks to taking the source of the source + cache.cells[seq_id_dst].src = seq_id_src; + + // preserve the "keep or clear" status of the copied sequence + if (cache.cells[seq_id_src].has_seq_id(seq_id_src)) { + cache.cells[seq_id_dst].seq_id.insert(seq_id_dst); + } else { + cache.cells[seq_id_dst].seq_id.erase(seq_id_dst); + } + + cache.do_copy = true; + + cache.cells[seq_id_dst].pos = cache.cells[seq_id_src].pos; + } + return; + } + // otherwise, this is the KV cache of a Transformer-like model + cache.head = 0; for (uint32_t i = 0; i < cache.size; ++i) { @@ -2265,6 +2457,17 @@ static void llama_kv_cache_seq_add( if (p0 < 0) p0 = 0; if (p1 < 0) p1 = std::numeric_limits::max(); + if (cache.recurrent) { + // for Mamba-like models, only the pos needs to be shifted + if (0 <= seq_id && seq_id < (int64_t) cache.size) { + llama_kv_cell & cell = cache.cells[seq_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos += delta; + } + } + return; + } + for (uint32_t i = 0; i < cache.size; ++i) { if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { cache.has_shift = true; @@ -2298,6 +2501,17 @@ static void llama_kv_cache_seq_div( if (p0 < 0) p0 = 0; if (p1 < 0) p1 = std::numeric_limits::max(); + if (cache.recurrent) { + // for Mamba-like models, only the pos needs to be changed + if (0 <= seq_id && seq_id < (int64_t) cache.size) { + llama_kv_cell & cell = cache.cells[seq_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos /= d; + } + } + return; + } + for (uint32_t i = 0; i < cache.size; ++i) { if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { cache.has_shift = true; @@ -3117,7 +3331,7 @@ static void llm_load_hparams( // sanity check for n_rot (optional) { - hparams.n_rot = hparams.n_embd / hparams.n_head; + hparams.n_rot = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head; ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); @@ -3130,10 +3344,10 @@ static void llm_load_hparams( // gpt-j n_rot = rotary_dim } - hparams.n_embd_head_k = hparams.n_embd / hparams.n_head; + hparams.n_embd_head_k = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head; ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false); - hparams.n_embd_head_v = hparams.n_embd / hparams.n_head; + hparams.n_embd_head_v = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head; ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false); // arch-specific KVs @@ -3383,6 +3597,36 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_MAMBA: + { + ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); + ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); + ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); + ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); + + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + switch (hparams.n_layer) { + case 24: + switch (hparams.n_embd) { + case 768: model.type = e_model::MODEL_SMALL; break; + default: model.type = e_model::MODEL_UNKNOWN; + } break; + case 48: + switch (hparams.n_embd) { + case 1024: model.type = e_model::MODEL_MEDIUM; break; + case 1536: model.type = e_model::MODEL_LARGE; break; + case 2048: model.type = e_model::MODEL_XL; break; + default: model.type = e_model::MODEL_UNKNOWN; + } break; + case 64: + switch (hparams.n_embd) { + case 2560: model.type = e_model::MODEL_3B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -3702,6 +3946,10 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx); LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown"); + LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv); + LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner); + LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state); + LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank); LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); if (ml.n_elements >= 1e12) { @@ -4609,6 +4857,57 @@ static bool llm_load_tensors( layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff}); } } break; + case LLM_ARCH_MAMBA: + { + const int64_t d_conv = hparams.ssm_d_conv; + const int64_t d_inner = hparams.ssm_d_inner; + const int64_t d_state = hparams.ssm_d_state; + const int64_t dt_rank = hparams.ssm_dt_rank; + // only an expansion factor of 2 is supported for now + GGML_ASSERT(2 * n_embd == d_inner); + + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + // if output is NULL, init from the input tok embed, duplicated to allow offloading + if (model.output == NULL) { + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + ml.n_created--; // artificial tensor + ml.size_data += ggml_nbytes(model.output); + } + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + // norm + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.ssm_in = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}); + + layer.ssm_conv1d = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}); + layer.ssm_conv1d_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}); + + layer.ssm_x = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}); + + layer.ssm_dt = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}); + layer.ssm_dt_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}); + + // no "weight" suffix for these + layer.ssm_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}); + layer.ssm_d = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_D, i), {d_inner}); + + // out_proj + layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}); + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -4834,6 +5133,8 @@ static void llm_build_kv_store( const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(kv.size == n_ctx); + // compute the transposed [n_tokens, n_embd] V matrix struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens)); //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed @@ -5043,6 +5344,8 @@ static struct ggml_tensor * llm_build_kqv( cb(kq, "kq_soft_max_ext", il); } + GGML_ASSERT(kv.size == n_ctx); + // split cached v into n_head heads struct ggml_tensor * v = ggml_view_3d(ctx, kv.v_l[il], @@ -5190,8 +5493,8 @@ struct llm_build_context { norm_eps (hparams.f_norm_eps), norm_rms_eps (hparams.f_norm_rms_eps), n_tokens (batch.n_tokens), - n_kv (worst_case ? n_ctx : kv_self.n), - kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), + n_kv (worst_case ? kv_self.size : kv_self.n), + kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head), n_orig_ctx (cparams.n_yarn_orig_ctx), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), @@ -5220,6 +5523,8 @@ struct llm_build_context { struct ggml_cgraph * build_k_shift() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + GGML_ASSERT(kv_self.size == n_ctx); + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = // we rotate only the first n_rot dimensions @@ -5238,6 +5543,27 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_s_copy() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + GGML_ASSERT(kv_self.recurrent); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * conv_states = ggml_reshape_2d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s(), kv_self.size); + struct ggml_tensor * ssm_states = ggml_reshape_2d(ctx0, kv_self.v_l[il], hparams.n_embd_v_s(), kv_self.size); + + conv_states = ggml_get_rows(ctx0, conv_states, lctx.inp_s_copy); + ssm_states = ggml_get_rows(ctx0, ssm_states, lctx.inp_s_copy); + + // TODO: name the intermediate tensors with cb() + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_states, kv_self.k_l[il])); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, ssm_states, kv_self.v_l[il])); + } + + return gf; + } + struct ggml_cgraph * build_defrag(const std::vector & ids) { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -7835,6 +8161,145 @@ struct llm_build_context { return gf; } + + struct ggml_cgraph * build_mamba() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t d_model = n_embd; + const int64_t d_conv = hparams.ssm_d_conv; + const int64_t d_inner = hparams.ssm_d_inner; + GGML_ASSERT(2 * d_model == d_inner); + const int64_t d_state = hparams.ssm_d_state; + const int64_t dt_rank = hparams.ssm_dt_rank; + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + // {n_embd, n_tokens} + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); + cb(inpL, "inp_embd", -1); + + struct ggml_tensor * state_mask = ggml_view_2d(ctx0, lctx.inp_s_mask, 1, n_kv, lctx.inp_s_mask->nb[0], 0); + struct ggml_tensor * state_seq = ggml_view_2d(ctx0, lctx.inp_s_seq, n_kv, n_tokens, n_kv*ggml_element_size(lctx.inp_s_seq), 0); + + for (int il = 0; il < n_layer; ++il) { + // (ab)using the KV cache to store the states + struct ggml_tensor * conv_states = ggml_reshape_2d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s(), kv_self.size); + struct ggml_tensor * ssm_states = ggml_reshape_2d(ctx0, kv_self.v_l[il], hparams.n_embd_v_s(), kv_self.size); + + // clear states of sequences which are starting at the beginning of this batch + { + conv_states = ggml_mul(ctx0, + ggml_view_2d(ctx0, conv_states, conv_states->ne[0], n_kv, conv_states->nb[1], kv_head*conv_states->nb[1]), + state_mask); + ssm_states = ggml_mul(ctx0, + ggml_view_2d(ctx0, ssm_states, ssm_states->ne[0], n_kv, ssm_states->nb[1], kv_head*ssm_states->nb[1]), + state_mask); + } + + conv_states = ggml_reshape_3d(ctx0, conv_states, d_conv - 1, d_inner, n_kv); + ssm_states = ggml_reshape_3d(ctx0, ssm_states, d_state, d_inner, n_kv); + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // {n_embd, 2*d_inner} * {n_embd, n_tokens} => {2*d_inner, n_tokens} + struct ggml_tensor * xz = ggml_mul_mat(ctx0, model.layers[il].ssm_in, cur); + // split the above in two + // => {d_inner, n_tokens} + struct ggml_tensor * x = ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], 0); + struct ggml_tensor * z = ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], ggml_element_size(xz)*d_inner); + + // conv + { + // Custom operator which is needed only to ease simultaneous sequence processing. + // For a single sequence, the equivalent is to concatenate the columns of conv_states and x, + // then make a self-overlapping view of that over d_conv columns at each stride in the 3rd dimension, + // then element-wise multiply that with the conv1d weigth, + // then sum the elements of each row, + // (the last two steps are a dot product over rows (also doable with mul_mat)) + // then permute away the ne[0] dimension, + // and then you're left with the resulting x tensor. + // The new conv_states is the last (d_conv - 1) columns + // of the last 3rd dimensional "layer" of the self-overlapping view. + // For simultaneous sequences, it's more complicated. + struct ggml_tensor * x_conv = ggml_ssm_conv(ctx0, conv_states, x, model.layers[il].ssm_conv1d, state_seq); + + // store last (d_conv - 1) columns of the conv_state part of x_conv back into the KV cache + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, + ggml_view_2d(ctx0, x_conv, d_conv - 1, d_inner*n_kv, d_conv*ggml_element_size(x_conv), (1+d_inner*n_tokens)*ggml_element_size(x_conv)), + ggml_view_1d(ctx0, kv_self.k_l[il], (d_conv - 1)*(d_inner)*(n_kv), kv_self.head*(d_conv - 1)*(d_inner)*ggml_element_size(x_conv)))); + + // extract x from x_conv + x = ggml_view_2d(ctx0, x_conv, d_inner, n_tokens, d_inner*ggml_element_size(x_conv), 0); + + // bias + x = ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b); + + x = ggml_silu(ctx0, x); + } + + // ssm + { + // {d_inner, dt_rank + 2*d_state} * {d_inner, n_tokens} => {dt_rank + 2*d_state, n_tokens} + struct ggml_tensor * x_db = ggml_mul_mat(ctx0, model.layers[il].ssm_x, x); + // split + struct ggml_tensor * dt = ggml_view_2d(ctx0, x_db, dt_rank, n_tokens, x_db->nb[1], 0); + struct ggml_tensor * B = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*dt_rank); + struct ggml_tensor * C = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*(dt_rank+d_state)); + + // {dt_rank, d_inner} * {dt_rank, n_tokens} => {d_inner, n_tokens} + dt = ggml_mul_mat(ctx0, model.layers[il].ssm_dt, dt); + dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b); + + // Custom operator to optimize the parallel associative scan + // as described in the Annex D of the Mamba paper. + // => {d_inner, n_tokens} and {d_state, d_inner, n_kv} combined, + // because only a single tensor can be returned. + struct ggml_tensor * y_ssm_states = ggml_ssm_scan(ctx0, ssm_states, x, dt, model.layers[il].ssm_a, B, C, state_seq); + + // store last states (the second part of y_ssm_states) + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, + ggml_view_1d(ctx0, y_ssm_states, d_state*d_inner*n_kv, d_inner*n_tokens*ggml_element_size(y_ssm_states)), + ggml_view_1d(ctx0, kv_self.v_l[il], d_state*d_inner*n_kv, kv_self.head*d_state*d_inner*ggml_element_size(ssm_states)))); + + struct ggml_tensor * y = ggml_view_2d(ctx0, y_ssm_states, d_inner, n_tokens, d_inner*ggml_element_size(y_ssm_states), 0); + + // {d_inner, n_tokens} * {d_inner} => {d_inner, n_tokens} + y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d)); + y = ggml_mul(ctx0, y, ggml_silu(ctx0, z)); + + // {d_inner, n_embd} * {d_inner, n_tokens} => {n_embd, n_tokens} + cur = ggml_mul_mat(ctx0, model.layers[il].ssm_out, y); + } + + // residual + cur = ggml_add(ctx0, cur, inpL); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + // final rmsnorm + cur = llm_build_norm(ctx0, inpL, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } }; static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { @@ -7871,6 +8336,23 @@ static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) { return result; } +static struct ggml_cgraph * llama_build_graph_s_copy(llama_context & lctx) { + llama_batch dummy; + dummy.n_tokens = 0; + + llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { }; + + struct llm_build_context llm(lctx, dummy, cb, false); + + llm.init(); + + struct ggml_cgraph * result = llm.build_s_copy(); + + llm.free(); + + return result; +} + static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_batch & batch, @@ -7985,6 +8467,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_starcoder2(); } break; + case LLM_ARCH_MAMBA: + { + result = llm.build_mamba(); + } break; default: GGML_ASSERT(false); } @@ -7995,19 +8481,29 @@ static struct ggml_cgraph * llama_build_graph( } static void llama_set_k_shift(llama_context & lctx) { - const auto & cparams = lctx.cparams; - - const int64_t n_ctx = cparams.n_ctx; + const int64_t kv_size = lctx.kv_self.size; assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer)); int32_t * data = (int32_t *) lctx.inp_K_shift->data; - for (int i = 0; i < n_ctx; ++i) { + for (int i = 0; i < kv_size; ++i) { data[i] = lctx.kv_self.cells[i].delta; } } +static void llama_set_s_copy(llama_context & lctx) { + const int64_t kv_size = lctx.kv_self.size; + + assert(ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer)); + + int32_t * data = (int32_t *) lctx.inp_s_copy->data; + + for (int i = 0; i < kv_size; ++i) { + data[i] = lctx.kv_self.cells[i].src; + } +} + static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { // // set input data @@ -8044,6 +8540,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { float * data = (float *) lctx.inp_KQ_mask->data; + // For causal attention, use only the previous KV cells + // of the correct sequence for each token of the batch. + // It's assumed that if a token in the batch has multiple sequences, they are equivalent. for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { const llama_pos pos = batch.pos[j]; @@ -8149,6 +8648,53 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } } + + if (kv_self.recurrent) { + const int64_t n_kv = kv_self.n; + + { + GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_mask->buffer)); + float * data = (float *) lctx.inp_s_mask->data; + + // states which are not affected by the current batch are left untouched + for (int i = 0; i < n_kv; ++i) { + llama_seq_id seq_id = i + lctx.kv_self.head; + llama_kv_cell & kv_cell = lctx.kv_self.cells[seq_id]; + bool has_self_seq = kv_cell.has_seq_id(seq_id); + + data[i] = (float) has_self_seq; + + // ensure current sequences will be kept + if (!has_self_seq && kv_cell.pos >= 0) { + kv_cell.seq_id.insert(seq_id); + } + } + } + // For Mamba (and other recurrent architectures), + // update the correct state(s)/sequence(s) for each token of the batch. + // Like with the KQ_mask, if a token in the batch has multiple sequences, + // they are assumed to be equivalent (not here, but in ggml_ssm_scan and ggml_ssm_conv). + { + const int64_t n_tokens = batch.n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_seq->buffer)); + int32_t * data = (int32_t *) lctx.inp_s_seq->data; + + for (int j = 0; j < n_tokens; ++j) { + const int32_t n_seq = batch.n_seq_id[j]; + GGML_ASSERT(0 < n_seq); // a token should be part of at least 1 sequence + + for (int i = 0; i < n_kv; ++i) { + if (i < n_seq) { + // for this type of model, the head is the minimum seq_id of the batch + data[j*n_kv + i] = batch.seq_id[j][i] - kv_self.head; + } else { + data[j*n_kv + i] = -1; + } + } + } + } + } } static void llama_graph_compute( @@ -8271,11 +8817,13 @@ static int llama_decode_internal( return 1; } - // a heuristic, to avoid attending the full cache if it is not yet utilized - // after enough generations, the benefit from this heuristic disappears - // if we start defragmenting the cache, the benefit from this will be more important - kv_self.n = std::min(cparams.n_ctx, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32))); - //kv_self.n = llama_kv_cache_cell_max(kv_self); + if (!kv_self.recurrent) { + // a heuristic, to avoid attending the full cache if it is not yet utilized + // after enough generations, the benefit from this heuristic disappears + // if we start defragmenting the cache, the benefit from this will be more important + kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32))); + //kv_self.n = llama_kv_cache_cell_max(kv_self); + } } //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); @@ -8701,6 +9249,26 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { } } + if (lctx.kv_self.recurrent && lctx.kv_self.do_copy) { + llama_set_s_copy(lctx); + + { + ggml_cgraph * gf = llama_build_graph_s_copy(lctx); + + llama_graph_compute(lctx, gf, lctx.cparams.n_threads); + } + + { + auto & kv_self = lctx.kv_self; + + kv_self.do_copy = false; + + for (uint32_t i = 0; i < kv_self.size; ++i) { + kv_self.cells[i].src = i; + } + } + } + // defragment the KV cache if needed if (lctx.kv_self.do_defrag) { llama_kv_cache_defrag_internal(lctx); @@ -11535,6 +12103,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight"); quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); + // do not quantize Mamba's small yet 2D weights + // NOTE: can't use LLM_TN here because the layer number is not known + quantize &= name.find("ssm_conv1d.weight") == std::string::npos; + quantize &= name.find("ssm_x.weight") == std::string::npos; + quantize &= name.find("ssm_dt.weight") == std::string::npos; + enum ggml_type new_type; void * new_data; size_t new_size; @@ -11985,6 +12559,7 @@ struct llama_context_params llama_context_default_params() { /*.seed =*/ LLAMA_DEFAULT_SEED, /*.n_ctx =*/ 512, /*.n_batch =*/ 512, + /*.n_parallel =*/ 1, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, @@ -12146,6 +12721,7 @@ struct llama_context * llama_new_context_with_model( auto & cparams = ctx->cparams; cparams.n_batch = params.n_batch; + // TODO: maybe add n_parallel here too cparams.n_threads = params.n_threads; cparams.n_threads_batch = params.n_threads_batch; cparams.yarn_ext_factor = params.yarn_ext_factor; @@ -12203,8 +12779,18 @@ struct llama_context * llama_new_context_with_model( ctx->rng = std::mt19937(params.seed); ctx->logits_all = params.logits_all; - const ggml_type type_k = params.type_k; - const ggml_type type_v = params.type_v; + uint32_t kv_size = cparams.n_ctx; + ggml_type type_k = params.type_k; + ggml_type type_v = params.type_v; + + // Mamba only needs a constant number of KV cache cells per sequence + if (model->arch == LLM_ARCH_MAMBA) { + // Mamba needs at least as many KV cells as there are sequences kept at any time + kv_size = std::max((uint32_t) 1, params.n_parallel); + // it's probably best to keep as much precision as possible for the states + type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states + type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states + } GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); @@ -12304,7 +12890,7 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(ctx->backend_cpu); - if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, cparams.n_ctx, cparams.offload_kqv)) { + if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, kv_size, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; @@ -12338,7 +12924,7 @@ struct llama_context * llama_new_context_with_model( // graph inputs { ggml_init_params init_params = { - /* .mem_size */ ggml_tensor_overhead()*8, + /* .mem_size */ ggml_tensor_overhead()*(8 + 3*(ctx->kv_self.recurrent)), /* .mem_buffer */ nullptr, /* .no_alloc */ true, }; @@ -12347,11 +12933,16 @@ struct llama_context * llama_new_context_with_model( ctx->inp_tokens = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch); ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); - ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch); - ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx); - ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx); + ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, kv_size, cparams.n_batch); + ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, kv_size); + ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, kv_size); ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch); ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); + if (ctx->kv_self.recurrent) { + ctx->inp_s_copy = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, kv_size); + ctx->inp_s_mask = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, kv_size); + ctx->inp_s_seq = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_I32, kv_size, cparams.n_batch); + } ggml_set_name(ctx->inp_tokens, "inp_tokens"); ggml_set_name(ctx->inp_embd, "inp_embd"); @@ -12361,6 +12952,11 @@ struct llama_context * llama_new_context_with_model( ggml_set_name(ctx->inp_K_shift, "inp_K_shift"); ggml_set_name(ctx->inp_mean, "inp_mean"); ggml_set_name(ctx->inp_cls, "inp_cls"); + if (ctx->kv_self.recurrent) { + ggml_set_name(ctx->inp_s_copy, "inp_s_copy"); + ggml_set_name(ctx->inp_s_mask, "inp_s_mask"); + ggml_set_name(ctx->inp_s_seq, "inp_s_seq"); + } ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true)); LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__, @@ -12447,6 +13043,10 @@ uint32_t llama_n_batch(const struct llama_context * ctx) { return ctx->cparams.n_batch; } +uint32_t llama_n_max_seq(const struct llama_context * ctx) { + return ctx->kv_self.size; +} + enum llama_vocab_type llama_vocab_type(const struct llama_model * model) { return model->vocab.type; } @@ -12460,6 +13060,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_MPT: case LLM_ARCH_REFACT: case LLM_ARCH_BLOOM: + case LLM_ARCH_MAMBA: return LLAMA_ROPE_TYPE_NONE; // use what we call a normal RoPE, operating on pairs of consecutive head values @@ -12713,8 +13314,8 @@ void llama_kv_cache_clear(struct llama_context * ctx) { llama_kv_cache_clear(ctx->kv_self); } -void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) { - llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1); +bool llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + return llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1); } void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { @@ -12891,8 +13492,8 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat const auto & hparams = ctx->model.hparams; const uint32_t n_layer = hparams.n_layer; - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s(); const size_t kv_buf_size = kv_self.total_size(); const uint32_t kv_head = llama_kv_cache_cell_max(kv_self); @@ -12913,6 +13514,17 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size()); data_ctx->write(tmp_buf.data(), tmp_buf.size()); + if (kv_self.recurrent) { + // v is contiguous for recurrent models + // TODO: use other tensors for state models than k and v + const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head); + + tmp_buf.resize(v_size); + ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), 0, tmp_buf.size()); + data_ctx->write(tmp_buf.data(), tmp_buf.size()); + continue; + } + // v is not contiguous, copy row by row const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size); @@ -13005,8 +13617,8 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { const auto & hparams = ctx->model.hparams; const uint32_t n_layer = hparams.n_layer; - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s(); size_t kv_buf_size; uint32_t kv_head; @@ -13027,6 +13639,16 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size); inp += k_size; + if (kv_self.recurrent) { + // v is contiguous for recurrent models + // TODO: use other tensors for state models than k and v + const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head); + + ggml_backend_tensor_set(kv_self.v_l[il], inp, 0, v_size); + inp += v_size; + continue; + } + // v is not contiguous, copy row by row const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size); diff --git a/llama.h b/llama.h index 3dc162b078d30..7a107c7f335d5 100644 --- a/llama.h +++ b/llama.h @@ -235,6 +235,7 @@ extern "C" { uint32_t seed; // RNG seed, -1 for random uint32_t n_ctx; // text context, 0 = from model uint32_t n_batch; // prompt processing maximum batch size + uint32_t n_parallel; // number of parallel sequences (i.e. distinct states for recurrent models) uint32_t n_threads; // number of threads to use for generation uint32_t n_threads_batch; // number of threads to use for batch processing @@ -376,6 +377,7 @@ extern "C" { LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); + LLAMA_API uint32_t llama_n_max_seq (const struct llama_context * ctx); LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model); LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model); @@ -502,7 +504,7 @@ extern "C" { // seq_id < 0 : match any sequence // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API void llama_kv_cache_seq_rm( + LLAMA_API bool llama_kv_cache_seq_rm( struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, From fd72d2d2a5e79d61ccef6af3d15f16e5e5cbc352 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 9 Mar 2024 10:30:04 +0100 Subject: [PATCH 06/39] server: tests: add truncated prompt tests, better kv cache size (#5933) * server: tests: add truncated prompt tests, better size * server, tests : update regex --------- Co-authored-by: Georgi Gerganov --- examples/server/server.cpp | 23 +++++++++-- .../server/tests/features/parallel.feature | 5 ++- examples/server/tests/features/server.feature | 41 ++++++++++++++----- examples/server/tests/features/steps/steps.py | 35 +++++++++++++--- 4 files changed, 81 insertions(+), 23 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 59a59d56b6023..6f44499843a63 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1128,6 +1128,7 @@ struct server_context { LOG_VERBOSE("stopped by limit", { {"id_slot", slot.id}, + {"id_task", slot.id_task}, {"n_decoded", slot.n_decoded}, {"n_predict", slot.params.n_predict}, }); @@ -1141,6 +1142,8 @@ struct server_context { } LOG_VERBOSE("next token", { + {"id_slot", slot.id}, + {"id_task", slot.id_task}, {"token", result.tok}, {"token_text", tokens_to_output_formatted_string(ctx, result.tok)}, {"has_next_token", slot.has_next_token}, @@ -1750,6 +1753,15 @@ struct server_context { slot.n_past = 0; slot.n_prompt_tokens = prompt_tokens.size(); + LOG_VERBOSE("prompt tokenized", { + {"id_slot", slot.id}, + {"id_task", slot.id_task}, + {"n_ctx", slot.n_ctx}, + {"n_keep", slot.params.n_keep}, + {"n_prompt_tokens", slot.n_prompt_tokens}, + {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())}, + }); + if (slot.embedding) { // this prompt is too large to process - discard it if (slot.n_prompt_tokens > n_batch) { @@ -1788,10 +1800,13 @@ struct server_context { slot.n_prompt_tokens = prompt_tokens.size(); LOG_VERBOSE("input truncated", { - {"n_ctx", slot.n_ctx}, - {"n_keep", slot.params.n_keep}, - {"n_left", n_left}, - {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())}, + {"id_slot", slot.id}, + {"id_task", slot.id_task}, + {"n_ctx", slot.n_ctx}, + {"n_keep", slot.params.n_keep}, + {"n_left", n_left}, + {"n_prompt_tokens", slot.n_prompt_tokens}, + {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())}, }); GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx); diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature index 066698c8e3c1e..a66fed626619d 100644 --- a/examples/server/tests/features/parallel.feature +++ b/examples/server/tests/features/parallel.feature @@ -6,8 +6,8 @@ Feature: Parallel Given a server listening on localhost:8080 And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models And 42 as server seed - And 512 as batch size - And 64 KV cache size + And 128 as batch size + And 256 KV cache size And 2 slots And continuous batching Then the server is starting @@ -76,6 +76,7 @@ Feature: Parallel | disabled | 128 | | enabled | 64 | + Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969 Given a prompt: """ diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index 878ac1363c419..aa132fa3472ef 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -10,11 +10,10 @@ Feature: llama.cpp server # KV Cache corresponds to the total amount of tokens # that can be stored across all independent sequences: #4130 # see --ctx-size and #5568 - And 32 KV cache size - And 512 as batch size - And 1 slots - And embeddings extraction - And 32 server max tokens to predict + And 256 KV cache size + And 32 as batch size + And 2 slots + And 64 server max tokens to predict And prometheus compatible metrics exposed Then the server is starting Then the server is healthy @@ -23,18 +22,35 @@ Feature: llama.cpp server Then the server is ready And all slots are idle + Scenario Outline: Completion Given a prompt And max tokens to predict And a completion request with no api error Then tokens are predicted matching + And the completion is truncated + And prompt tokens are processed And prometheus metrics are exposed And metric llamacpp:tokens_predicted is Examples: Prompts - | prompt | n_predict | re_content | n_predicted | - | I believe the meaning of life is | 8 | (read\|going)+ | 8 | - | Write a joke about AI | 64 | (park\|friends\|scared\|always)+ | 32 | + | prompt | n_predict | re_content | n_prompt | n_predicted | truncated | + | I believe the meaning of life is | 8 | (read\|going)+ | 18 | 8 | not | + | Write a joke about AI from a very long prompt which will not be truncated | 256 | (princesses\|everyone\|kids)+ | 46 | 64 | not | + + Scenario: Completion prompt truncated + Given a prompt: + """ + Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. + Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. + Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. + Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + """ + And a completion request with no api error + Then 64 tokens are predicted matching fun|Annaks|popcorns + And the completion is truncated + And 109 prompt tokens are processed + Scenario Outline: OAI Compatibility Given a model @@ -44,11 +60,14 @@ Feature: llama.cpp server And streaming is Given an OAI compatible chat completions request with no api error Then tokens are predicted matching + And prompt tokens are processed + And the completion is truncated Examples: Prompts - | model | system_prompt | user_prompt | max_tokens | re_content | n_predicted | enable_streaming | - | llama-2 | Book | What is the best book | 8 | (Mom\|what)+ | 8 | disabled | - | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64 | (thanks\|happy\|bird)+ | 32 | enabled | + | model | system_prompt | user_prompt | max_tokens | re_content | n_prompt | n_predicted | enable_streaming | truncated | + | llama-2 | Book | What is the best book | 8 | (Here\|what)+ | 77 | 8 | disabled | not | + | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128 | (thanks\|happy\|bird)+ | -1 | 64 | enabled | | + Scenario: Tokenize / Detokenize When tokenizing: diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index d7f0058360c23..0076f805be4d3 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -196,12 +196,30 @@ async def step_request_completion(context, api_error): @step(u'{predicted_n:d} tokens are predicted matching {re_content}') def step_n_tokens_predicted_with_content(context, predicted_n, re_content): - assert_n_tokens_predicted(context.tasks_result.pop(), predicted_n, re_content) + context.completion = context.tasks_result.pop() + assert_n_tokens_predicted(context.completion, predicted_n, re_content) @step(u'{predicted_n:d} tokens are predicted') def step_n_tokens_predicted(context, predicted_n): - assert_n_tokens_predicted(context.tasks_result.pop(), predicted_n) + context.completion = context.tasks_result.pop() + assert_n_tokens_predicted(context.completion, predicted_n) + + +@step(u'the completion is truncated') +def step_assert_completion_truncated(context): + step_assert_completion_truncated(context, '') + + +@step(u'the completion is {truncated} truncated') +def step_assert_completion_truncated(context, truncated): + truncated = truncated != "not" + assert context.completion['truncated'] == truncated, f'{context.completion}' + + +@step(u'{n_prompt:d} prompt tokens are processed') +def step_impl(context, n_prompt): + assert n_prompt < 0 or n_prompt == context.completion['timings']['prompt_n'], f"n_prompt={context.completion['timings']['prompt_n']}" @step(u'a user prompt {user_prompt}') @@ -722,7 +740,8 @@ async def oai_chat_completions(user_prompt, completion_response = { 'content': '', 'timings': { - 'predicted_n': 0 + 'predicted_n': 0, + 'prompt_n': 0 } } if async_client: @@ -763,7 +782,8 @@ async def oai_chat_completions(user_prompt, completion_response = { 'content': chat_completion_raw['choices'][0]['message'], 'timings': { - 'predicted_n': chat_completion_raw['usage']['completion_tokens'] + 'predicted_n': chat_completion_raw['usage']['completion_tokens'], + 'prompt_n': chat_completion_raw['usage']['prompt_tokens'] } } else: @@ -792,13 +812,16 @@ async def oai_chat_completions(user_prompt, if 'content' in delta: completion_response['content'] += delta['content'] completion_response['timings']['predicted_n'] += 1 + completion_response['truncated'] = chunk.choices[0].finish_reason != 'stop' else: assert len(chat_completion.choices) == 1 completion_response = { 'content': chat_completion.choices[0].message.content, 'timings': { - 'predicted_n': chat_completion.usage.completion_tokens - } + 'predicted_n': chat_completion.usage.completion_tokens, + 'prompt_n': chat_completion.usage.prompt_tokens + }, + 'truncated': chat_completion.choices[0].finish_reason != 'stop' } if debug: print("OAI response formatted to llama.cpp:", completion_response) From e1fa9569ba8ce276bc7801a3cebdcf8b1aa116ea Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Sat, 9 Mar 2024 02:57:09 -0700 Subject: [PATCH 07/39] server : add SSL support (#5926) * add cmake build toggle to enable ssl support in server Signed-off-by: Gabe Goodhart * add flags for ssl key/cert files and use SSLServer if set All SSL setup is hidden behind CPPHTTPLIB_OPENSSL_SUPPORT in the same way that the base httlib hides the SSL support Signed-off-by: Gabe Goodhart * Update readme for SSL support in server Signed-off-by: Gabe Goodhart * Add LLAMA_SERVER_SSL variable setup to top-level Makefile Signed-off-by: Gabe Goodhart --------- Signed-off-by: Gabe Goodhart --- Makefile | 4 ++ examples/server/CMakeLists.txt | 6 ++ examples/server/README.md | 26 ++++++++ examples/server/server.cpp | 108 ++++++++++++++++++++++----------- 4 files changed, 110 insertions(+), 34 deletions(-) diff --git a/Makefile b/Makefile index efce10bb8bd7e..aea9692227298 100644 --- a/Makefile +++ b/Makefile @@ -201,6 +201,10 @@ ifdef LLAMA_SERVER_VERBOSE MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE) endif +ifdef LLAMA_SERVER_SSL + MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT + MK_LDFLAGS += -lssl -lcrypto +endif ifdef LLAMA_CODE_COVERAGE MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase '' diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index c21eba634310b..f94de1e99b7e9 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -1,5 +1,6 @@ set(TARGET server) option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) +option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h) install(TARGETS ${TARGET} RUNTIME) @@ -7,6 +8,11 @@ target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$ ) target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT}) +if (LLAMA_SERVER_SSL) + find_package(OpenSSL REQUIRED) + target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto) + target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT) +endif() if (WIN32) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) endif() diff --git a/examples/server/README.md b/examples/server/README.md index 591f748f84cd7..bf8c450b60223 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -59,6 +59,10 @@ see https://github.com/ggerganov/llama.cpp/issues/1437 - `--log-disable`: Output logs to stdout only, default: enabled. - `--log-format FORMAT`: Define the log output to FORMAT: json or text (default: json) +**If compiled with `LLAMA_SERVER_SSL=ON`** +- `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key +- `--ssl-cert-file FNAME`: path to file a PEM-encoded SSL certificate + ## Build server is build alongside everything else from the root of the project @@ -75,6 +79,28 @@ server is build alongside everything else from the root of the project cmake --build . --config Release ``` +## Build with SSL + +server can also be built with SSL support using OpenSSL 3 + +- Using `make`: + + ```bash + # NOTE: For non-system openssl, use the following: + # CXXFLAGS="-I /path/to/openssl/include" + # LDFLAGS="-L /path/to/openssl/lib" + make LLAMA_SERVER_SSL=true server + ``` + +- Using `CMake`: + + ```bash + mkdir build + cd build + cmake .. -DLLAMA_SERVER_SSL=ON + make server + ``` + ## Quick Start To get started right away, run the following command, making sure to use the correct path for the model you have: diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 6f44499843a63..c3b87c846a6e2 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -27,6 +27,7 @@ #include #include #include +#include using json = nlohmann::json; @@ -118,6 +119,11 @@ struct server_params { std::vector api_keys; +#ifdef CPPHTTPLIB_OPENSSL_SUPPORT + std::string ssl_key_file = ""; + std::string ssl_cert_file = ""; +#endif + bool slots_endpoint = true; bool metrics_endpoint = false; }; @@ -2142,6 +2148,10 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str()); printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n"); printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n"); +#ifdef CPPHTTPLIB_OPENSSL_SUPPORT + printf(" --ssl-key-file FNAME path to file a PEM-encoded SSL private key\n"); + printf(" --ssl-cert-file FNAME path to file a PEM-encoded SSL certificate\n"); +#endif printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); printf(" --embeddings enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled"); printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel); @@ -2220,7 +2230,24 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, } } key_file.close(); - } else if (arg == "--timeout" || arg == "-to") { + + } +#ifdef CPPHTTPLIB_OPENSSL_SUPPORT + else if (arg == "--ssl-key-file") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.ssl_key_file = argv[i]; + } else if (arg == "--ssl-cert-file") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.ssl_cert_file = argv[i]; + } +#endif + else if (arg == "--timeout" || arg == "-to") { if (++i >= argc) { invalid_param = true; break; @@ -2658,21 +2685,34 @@ int main(int argc, char ** argv) { {"system_info", llama_print_system_info()}, }); - httplib::Server svr; + std::unique_ptr svr; +#ifdef CPPHTTPLIB_OPENSSL_SUPPORT + if (sparams.ssl_key_file != "" && sparams.ssl_cert_file != "") { + LOG_INFO("Running with SSL", {{"key", sparams.ssl_key_file}, {"cert", sparams.ssl_cert_file}}); + svr.reset( + new httplib::SSLServer(sparams.ssl_cert_file.c_str(), sparams.ssl_key_file.c_str()) + ); + } else { + LOG_INFO("Running without SSL", {}); + svr.reset(new httplib::Server()); + } +#else + svr.reset(new httplib::Server()); +#endif std::atomic state{SERVER_STATE_LOADING_MODEL}; - svr.set_default_headers({{"Server", "llama.cpp"}}); + svr->set_default_headers({{"Server", "llama.cpp"}}); // CORS preflight - svr.Options(R"(.*)", [](const httplib::Request & req, httplib::Response & res) { + svr->Options(R"(.*)", [](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); res.set_header("Access-Control-Allow-Credentials", "true"); res.set_header("Access-Control-Allow-Methods", "POST"); res.set_header("Access-Control-Allow-Headers", "*"); }); - svr.Get("/health", [&](const httplib::Request & req, httplib::Response & res) { + svr->Get("/health", [&](const httplib::Request & req, httplib::Response & res) { server_state current_state = state.load(); switch (current_state) { case SERVER_STATE_READY: @@ -2728,7 +2768,7 @@ int main(int argc, char ** argv) { }); if (sparams.slots_endpoint) { - svr.Get("/slots", [&](const httplib::Request &, httplib::Response & res) { + svr->Get("/slots", [&](const httplib::Request &, httplib::Response & res) { // request slots data using task queue server_task task; task.id = ctx_server.queue_tasks.get_new_id(); @@ -2749,7 +2789,7 @@ int main(int argc, char ** argv) { } if (sparams.metrics_endpoint) { - svr.Get("/metrics", [&](const httplib::Request &, httplib::Response & res) { + svr->Get("/metrics", [&](const httplib::Request &, httplib::Response & res) { // request slots data using task queue server_task task; task.id = ctx_server.queue_tasks.get_new_id(); @@ -2846,9 +2886,9 @@ int main(int argc, char ** argv) { }); } - svr.set_logger(log_server_request); + svr->set_logger(log_server_request); - svr.set_exception_handler([](const httplib::Request &, httplib::Response & res, std::exception_ptr ep) { + svr->set_exception_handler([](const httplib::Request &, httplib::Response & res, std::exception_ptr ep) { const char fmt[] = "500 Internal Server Error\n%s"; char buf[BUFSIZ]; @@ -2864,7 +2904,7 @@ int main(int argc, char ** argv) { res.status = 500; }); - svr.set_error_handler([](const httplib::Request &, httplib::Response & res) { + svr->set_error_handler([](const httplib::Request &, httplib::Response & res) { if (res.status == 401) { res.set_content("Unauthorized", "text/plain; charset=utf-8"); } @@ -2877,16 +2917,16 @@ int main(int argc, char ** argv) { }); // set timeouts and change hostname and port - svr.set_read_timeout (sparams.read_timeout); - svr.set_write_timeout(sparams.write_timeout); + svr->set_read_timeout (sparams.read_timeout); + svr->set_write_timeout(sparams.write_timeout); - if (!svr.bind_to_port(sparams.hostname, sparams.port)) { + if (!svr->bind_to_port(sparams.hostname, sparams.port)) { fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port); return 1; } // Set the base directory for serving static files - svr.set_base_dir(sparams.public_path); + svr->set_base_dir(sparams.public_path); std::unordered_map log_data; @@ -2947,30 +2987,30 @@ int main(int argc, char ** argv) { }; // this is only called if no index.html is found in the public --path - svr.Get("/", [](const httplib::Request &, httplib::Response & res) { + svr->Get("/", [](const httplib::Request &, httplib::Response & res) { res.set_content(reinterpret_cast(&index_html), index_html_len, "text/html; charset=utf-8"); return false; }); // this is only called if no index.js is found in the public --path - svr.Get("/index.js", [](const httplib::Request &, httplib::Response & res) { + svr->Get("/index.js", [](const httplib::Request &, httplib::Response & res) { res.set_content(reinterpret_cast(&index_js), index_js_len, "text/javascript; charset=utf-8"); return false; }); // this is only called if no index.html is found in the public --path - svr.Get("/completion.js", [](const httplib::Request &, httplib::Response & res) { + svr->Get("/completion.js", [](const httplib::Request &, httplib::Response & res) { res.set_content(reinterpret_cast(&completion_js), completion_js_len, "application/javascript; charset=utf-8"); return false; }); // this is only called if no index.html is found in the public --path - svr.Get("/json-schema-to-grammar.mjs", [](const httplib::Request &, httplib::Response & res) { + svr->Get("/json-schema-to-grammar.mjs", [](const httplib::Request &, httplib::Response & res) { res.set_content(reinterpret_cast(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript; charset=utf-8"); return false; }); - svr.Get("/props", [&ctx_server](const httplib::Request & req, httplib::Response & res) { + svr->Get("/props", [&ctx_server](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json data = { { "user_name", ctx_server.name_user.c_str() }, @@ -3062,11 +3102,11 @@ int main(int argc, char ** argv) { } }; - svr.Post("/completion", completions); // legacy - svr.Post("/completions", completions); - svr.Post("/v1/completions", completions); + svr->Post("/completion", completions); // legacy + svr->Post("/completions", completions); + svr->Post("/v1/completions", completions); - svr.Get("/v1/models", [¶ms, &model_meta](const httplib::Request & req, httplib::Response & res) { + svr->Get("/v1/models", [¶ms, &model_meta](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json models = { @@ -3161,10 +3201,10 @@ int main(int argc, char ** argv) { } }; - svr.Post("/chat/completions", chat_completions); - svr.Post("/v1/chat/completions", chat_completions); + svr->Post("/chat/completions", chat_completions); + svr->Post("/v1/chat/completions", chat_completions); - svr.Post("/infill", [&ctx_server, &validate_api_key](const httplib::Request & req, httplib::Response & res) { + svr->Post("/infill", [&ctx_server, &validate_api_key](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); if (!validate_api_key(req, res)) { return; @@ -3228,11 +3268,11 @@ int main(int argc, char ** argv) { } }); - svr.Options(R"(/.*)", [](const httplib::Request &, httplib::Response & res) { + svr->Options(R"(/.*)", [](const httplib::Request &, httplib::Response & res) { return res.set_content("", "application/json; charset=utf-8"); }); - svr.Post("/tokenize", [&ctx_server](const httplib::Request & req, httplib::Response & res) { + svr->Post("/tokenize", [&ctx_server](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); const json body = json::parse(req.body); @@ -3244,7 +3284,7 @@ int main(int argc, char ** argv) { return res.set_content(data.dump(), "application/json; charset=utf-8"); }); - svr.Post("/detokenize", [&ctx_server](const httplib::Request & req, httplib::Response & res) { + svr->Post("/detokenize", [&ctx_server](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); const json body = json::parse(req.body); @@ -3258,7 +3298,7 @@ int main(int argc, char ** argv) { return res.set_content(data.dump(), "application/json; charset=utf-8"); }); - svr.Post("/embedding", [¶ms, &ctx_server](const httplib::Request & req, httplib::Response & res) { + svr->Post("/embedding", [¶ms, &ctx_server](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); if (!params.embedding) { res.status = 501; @@ -3289,7 +3329,7 @@ int main(int argc, char ** argv) { return res.set_content(result.data.dump(), "application/json; charset=utf-8"); }); - svr.Post("/v1/embeddings", [¶ms, &ctx_server](const httplib::Request & req, httplib::Response & res) { + svr->Post("/v1/embeddings", [¶ms, &ctx_server](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); if (!params.embedding) { res.status = 501; @@ -3360,13 +3400,13 @@ int main(int argc, char ** argv) { sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1); } log_data["n_threads_http"] = std::to_string(sparams.n_threads_http); - svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); }; + svr->new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); }; LOG_INFO("HTTP server listening", log_data); // run the HTTP server in a thread - see comment below std::thread t([&]() { - if (!svr.listen_after_bind()) { + if (!svr->listen_after_bind()) { state.store(SERVER_STATE_ERROR); return 1; } @@ -3407,7 +3447,7 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.start_loop(); - svr.stop(); + svr->stop(); t.join(); llama_backend_free(); From 950ba1ab84db199f0bbdecdb2bb911f35261b321 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 9 Mar 2024 11:27:53 +0100 Subject: [PATCH 08/39] Server: reorganize some http logic (#5939) * refactor static file handler * use set_pre_routing_handler for validate_api_key * merge embedding handlers * correct http verb for endpoints * fix embedding response * fix test case CORS Options * fix code style --- examples/server/README.md | 4 +- examples/server/server.cpp | 637 +++++++++--------- .../server/tests/features/security.feature | 5 +- examples/server/tests/features/steps/steps.py | 3 +- 4 files changed, 335 insertions(+), 314 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index bf8c450b60223..3abb1abe3b92b 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -42,7 +42,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437 - `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`. - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`. - `--port`: Set the port to listen. Default: `8080`. -- `--path`: path from which to serve static files (default examples/server/public) +- `--path`: path from which to serve static files (default: disabled) - `--api-key`: Set an api key for request authorization. By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys. - `--api-key-file`: path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`'s. - `--embedding`: Enable embedding extraction, Default: disabled. @@ -558,7 +558,7 @@ The HTTP server supports OAI-like API ### Extending or building alternative Web Front End -The default location for the static files is `examples/server/public`. You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method. +You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method. Read the documentation in `/completion.js` to see convenient ways to access llama. diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c3b87c846a6e2..6e0f8328cdf5a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -113,7 +113,7 @@ struct server_params { int32_t n_threads_http = -1; std::string hostname = "127.0.0.1"; - std::string public_path = "examples/server/public"; + std::string public_path = ""; std::string chat_template = ""; std::string system_prompt = ""; @@ -2145,7 +2145,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); printf(" --port PORT port to listen (default (default: %d)\n", sparams.port); - printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str()); + printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n"); printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n"); printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n"); #ifdef CPPHTTPLIB_OPENSSL_SUPPORT @@ -2211,7 +2211,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, invalid_param = true; break; } - sparams.api_keys.emplace_back(argv[i]); + sparams.api_keys.push_back(argv[i]); } else if (arg == "--api-key-file") { if (++i >= argc) { invalid_param = true; @@ -2712,180 +2712,6 @@ int main(int argc, char ** argv) { res.set_header("Access-Control-Allow-Headers", "*"); }); - svr->Get("/health", [&](const httplib::Request & req, httplib::Response & res) { - server_state current_state = state.load(); - switch (current_state) { - case SERVER_STATE_READY: - { - // request slots data using task queue - server_task task; - task.id = ctx_server.queue_tasks.get_new_id(); - task.type = SERVER_TASK_TYPE_METRICS; - task.id_target = -1; - - ctx_server.queue_results.add_waiting_task_id(task.id); - ctx_server.queue_tasks.post(task); - - // get the result - server_task_result result = ctx_server.queue_results.recv(task.id); - ctx_server.queue_results.remove_waiting_task_id(task.id); - - const int n_idle_slots = result.data["idle"]; - const int n_processing_slots = result.data["processing"]; - - json health = { - {"status", "ok"}, - {"slots_idle", n_idle_slots}, - {"slots_processing", n_processing_slots} - }; - - res.status = 200; // HTTP OK - if (sparams.slots_endpoint && req.has_param("include_slots")) { - health["slots"] = result.data["slots"]; - } - - if (n_idle_slots == 0) { - health["status"] = "no slot available"; - if (req.has_param("fail_on_no_slot")) { - res.status = 503; // HTTP Service Unavailable - } - } - - res.set_content(health.dump(), "application/json"); - break; - } - case SERVER_STATE_LOADING_MODEL: - { - res.set_content(R"({"status": "loading model"})", "application/json"); - res.status = 503; // HTTP Service Unavailable - } break; - case SERVER_STATE_ERROR: - { - res.set_content(R"({"status": "error", "error": "Model failed to load"})", "application/json"); - res.status = 500; // HTTP Internal Server Error - } break; - } - }); - - if (sparams.slots_endpoint) { - svr->Get("/slots", [&](const httplib::Request &, httplib::Response & res) { - // request slots data using task queue - server_task task; - task.id = ctx_server.queue_tasks.get_new_id(); - task.id_multi = -1; - task.id_target = -1; - task.type = SERVER_TASK_TYPE_METRICS; - - ctx_server.queue_results.add_waiting_task_id(task.id); - ctx_server.queue_tasks.post(task); - - // get the result - server_task_result result = ctx_server.queue_results.recv(task.id); - ctx_server.queue_results.remove_waiting_task_id(task.id); - - res.set_content(result.data["slots"].dump(), "application/json"); - res.status = 200; // HTTP OK - }); - } - - if (sparams.metrics_endpoint) { - svr->Get("/metrics", [&](const httplib::Request &, httplib::Response & res) { - // request slots data using task queue - server_task task; - task.id = ctx_server.queue_tasks.get_new_id(); - task.id_multi = -1; - task.id_target = -1; - task.type = SERVER_TASK_TYPE_METRICS; - task.data.push_back({{"reset_bucket", true}}); - - ctx_server.queue_results.add_waiting_task_id(task.id); - ctx_server.queue_tasks.post(task); - - // get the result - server_task_result result = ctx_server.queue_results.recv(task.id); - ctx_server.queue_results.remove_waiting_task_id(task.id); - - json data = result.data; - - const uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"]; - const uint64_t t_prompt_processing = data["t_prompt_processing"]; - - const uint64_t n_tokens_predicted = data["n_tokens_predicted"]; - const uint64_t t_tokens_generation = data["t_tokens_generation"]; - - const int32_t kv_cache_used_cells = data["kv_cache_used_cells"]; - - // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names - json all_metrics_def = json { - {"counter", {{ - {"name", "prompt_tokens_total"}, - {"help", "Number of prompt tokens processed."}, - {"value", (uint64_t) data["n_prompt_tokens_processed_total"]} - }, { - {"name", "prompt_seconds_total"}, - {"help", "Prompt process time"}, - {"value", (uint64_t) data["t_prompt_processing_total"] / 1.e3} - }, { - {"name", "tokens_predicted_total"}, - {"help", "Number of generation tokens processed."}, - {"value", (uint64_t) data["n_tokens_predicted_total"]} - }, { - {"name", "tokens_predicted_seconds_total"}, - {"help", "Predict process time"}, - {"value", (uint64_t) data["t_tokens_generation_total"] / 1.e3} - }}}, - {"gauge", {{ - {"name", "prompt_tokens_seconds"}, - {"help", "Average prompt throughput in tokens/s."}, - {"value", n_prompt_tokens_processed ? 1.e3 / t_prompt_processing * n_prompt_tokens_processed : 0.} - },{ - {"name", "predicted_tokens_seconds"}, - {"help", "Average generation throughput in tokens/s."}, - {"value", n_tokens_predicted ? 1.e3 / t_tokens_generation * n_tokens_predicted : 0.} - },{ - {"name", "kv_cache_usage_ratio"}, - {"help", "KV-cache usage. 1 means 100 percent usage."}, - {"value", 1. * kv_cache_used_cells / params.n_ctx} - },{ - {"name", "kv_cache_tokens"}, - {"help", "KV-cache tokens."}, - {"value", (uint64_t) data["kv_cache_tokens_count"]} - },{ - {"name", "requests_processing"}, - {"help", "Number of request processing."}, - {"value", (uint64_t) data["processing"]} - },{ - {"name", "requests_deferred"}, - {"help", "Number of request deferred."}, - {"value", (uint64_t) data["deferred"]} - }}} - }; - - std::stringstream prometheus; - - for (const auto & el : all_metrics_def.items()) { - const auto & type = el.key(); - const auto & metrics_def = el.value(); - - for (const auto & metric_def : metrics_def) { - const std::string name = metric_def["name"]; - const std::string help = metric_def["help"]; - - auto value = json_value(metric_def, "value", 0.); - prometheus << "# HELP llamacpp:" << name << " " << help << "\n" - << "# TYPE llamacpp:" << name << " " << type << "\n" - << "llamacpp:" << name << " " << value << "\n"; - } - } - - const int64_t t_start = data["t_start"]; - res.set_header("Process-Start-Time-Unix", std::to_string(t_start)); - - res.set_content(prometheus.str(), "text/plain; version=0.0.4"); - res.status = 200; // HTTP OK - }); - } - svr->set_logger(log_server_request); svr->set_exception_handler([](const httplib::Request &, httplib::Response & res, std::exception_ptr ep) { @@ -2925,16 +2751,14 @@ int main(int argc, char ** argv) { return 1; } - // Set the base directory for serving static files - svr->set_base_dir(sparams.public_path); - std::unordered_map log_data; log_data["hostname"] = sparams.hostname; log_data["port"] = std::to_string(sparams.port); if (sparams.api_keys.size() == 1) { - log_data["api_key"] = "api_key: ****" + sparams.api_keys[0].substr(sparams.api_keys[0].length() - 4); + auto key = sparams.api_keys[0]; + log_data["api_key"] = "api_key: ****" + key.substr(std::max((int)(key.length() - 4), 0)); } else if (sparams.api_keys.size() > 1) { log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded"; } @@ -2959,13 +2783,37 @@ int main(int argc, char ** argv) { } } - // Middleware for API key validation - auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool { + // + // Middlewares + // + + auto middleware_validate_api_key = [&sparams](const httplib::Request & req, httplib::Response & res) { + // TODO: should we apply API key to all endpoints, including "/health" and "/models"? + static const std::set protected_endpoints = { + "/props", + "/completion", + "/completions", + "/v1/completions", + "/chat/completions", + "/v1/chat/completions", + "/infill", + "/tokenize", + "/detokenize", + "/embedding", + "/embeddings", + "/v1/embeddings", + }; + // If API key is not set, skip validation if (sparams.api_keys.empty()) { return true; } + // If path is not in protected_endpoints list, skip validation + if (protected_endpoints.find(req.path) == protected_endpoints.end()) { + return true; + } + // Check for API key in the header auto auth_header = req.get_header_value("Authorization"); @@ -2978,6 +2826,8 @@ int main(int argc, char ** argv) { } // API key is invalid or not provided + // TODO: make another middleware for CORS related logic + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); res.set_content("Unauthorized: Invalid API Key", "text/plain; charset=utf-8"); res.status = 401; // Unauthorized @@ -2986,31 +2836,201 @@ int main(int argc, char ** argv) { return false; }; - // this is only called if no index.html is found in the public --path - svr->Get("/", [](const httplib::Request &, httplib::Response & res) { - res.set_content(reinterpret_cast(&index_html), index_html_len, "text/html; charset=utf-8"); - return false; + // register server middlewares + svr->set_pre_routing_handler([&middleware_validate_api_key](const httplib::Request & req, httplib::Response & res) { + if (!middleware_validate_api_key(req, res)) { + return httplib::Server::HandlerResponse::Handled; + } + return httplib::Server::HandlerResponse::Unhandled; }); - // this is only called if no index.js is found in the public --path - svr->Get("/index.js", [](const httplib::Request &, httplib::Response & res) { - res.set_content(reinterpret_cast(&index_js), index_js_len, "text/javascript; charset=utf-8"); - return false; - }); + // + // Route handlers (or controllers) + // - // this is only called if no index.html is found in the public --path - svr->Get("/completion.js", [](const httplib::Request &, httplib::Response & res) { - res.set_content(reinterpret_cast(&completion_js), completion_js_len, "application/javascript; charset=utf-8"); - return false; - }); + const auto handle_health = [&](const httplib::Request & req, httplib::Response & res) { + server_state current_state = state.load(); + switch (current_state) { + case SERVER_STATE_READY: + { + // request slots data using task queue + server_task task; + task.id = ctx_server.queue_tasks.get_new_id(); + task.type = SERVER_TASK_TYPE_METRICS; + task.id_target = -1; - // this is only called if no index.html is found in the public --path - svr->Get("/json-schema-to-grammar.mjs", [](const httplib::Request &, httplib::Response & res) { - res.set_content(reinterpret_cast(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript; charset=utf-8"); - return false; - }); + ctx_server.queue_results.add_waiting_task_id(task.id); + ctx_server.queue_tasks.post(task); + + // get the result + server_task_result result = ctx_server.queue_results.recv(task.id); + ctx_server.queue_results.remove_waiting_task_id(task.id); + + const int n_idle_slots = result.data["idle"]; + const int n_processing_slots = result.data["processing"]; + + json health = { + {"status", "ok"}, + {"slots_idle", n_idle_slots}, + {"slots_processing", n_processing_slots} + }; + + res.status = 200; // HTTP OK + if (sparams.slots_endpoint && req.has_param("include_slots")) { + health["slots"] = result.data["slots"]; + } - svr->Get("/props", [&ctx_server](const httplib::Request & req, httplib::Response & res) { + if (n_idle_slots == 0) { + health["status"] = "no slot available"; + if (req.has_param("fail_on_no_slot")) { + res.status = 503; // HTTP Service Unavailable + } + } + + res.set_content(health.dump(), "application/json"); + break; + } + case SERVER_STATE_LOADING_MODEL: + { + res.set_content(R"({"status": "loading model"})", "application/json"); + res.status = 503; // HTTP Service Unavailable + } break; + case SERVER_STATE_ERROR: + { + res.set_content(R"({"status": "error", "error": "Model failed to load"})", "application/json"); + res.status = 500; // HTTP Internal Server Error + } break; + } + }; + + const auto handle_slots = [&](const httplib::Request &, httplib::Response & res) { + if (!sparams.slots_endpoint) { + res.status = 501; + res.set_content("This server does not support slots endpoint.", "text/plain; charset=utf-8"); + return; + } + + // request slots data using task queue + server_task task; + task.id = ctx_server.queue_tasks.get_new_id(); + task.id_multi = -1; + task.id_target = -1; + task.type = SERVER_TASK_TYPE_METRICS; + + ctx_server.queue_results.add_waiting_task_id(task.id); + ctx_server.queue_tasks.post(task); + + // get the result + server_task_result result = ctx_server.queue_results.recv(task.id); + ctx_server.queue_results.remove_waiting_task_id(task.id); + + res.set_content(result.data["slots"].dump(), "application/json"); + res.status = 200; // HTTP OK + }; + + const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) { + if (!sparams.metrics_endpoint) { + res.status = 501; + res.set_content("This server does not support metrics endpoint.", "text/plain; charset=utf-8"); + return; + } + + // request slots data using task queue + server_task task; + task.id = ctx_server.queue_tasks.get_new_id(); + task.id_multi = -1; + task.id_target = -1; + task.type = SERVER_TASK_TYPE_METRICS; + task.data.push_back({{"reset_bucket", true}}); + + ctx_server.queue_results.add_waiting_task_id(task.id); + ctx_server.queue_tasks.post(task); + + // get the result + server_task_result result = ctx_server.queue_results.recv(task.id); + ctx_server.queue_results.remove_waiting_task_id(task.id); + + json data = result.data; + + const uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"]; + const uint64_t t_prompt_processing = data["t_prompt_processing"]; + + const uint64_t n_tokens_predicted = data["n_tokens_predicted"]; + const uint64_t t_tokens_generation = data["t_tokens_generation"]; + + const int32_t kv_cache_used_cells = data["kv_cache_used_cells"]; + + // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names + json all_metrics_def = json { + {"counter", {{ + {"name", "prompt_tokens_total"}, + {"help", "Number of prompt tokens processed."}, + {"value", (uint64_t) data["n_prompt_tokens_processed_total"]} + }, { + {"name", "prompt_seconds_total"}, + {"help", "Prompt process time"}, + {"value", (uint64_t) data["t_prompt_processing_total"] / 1.e3} + }, { + {"name", "tokens_predicted_total"}, + {"help", "Number of generation tokens processed."}, + {"value", (uint64_t) data["n_tokens_predicted_total"]} + }, { + {"name", "tokens_predicted_seconds_total"}, + {"help", "Predict process time"}, + {"value", (uint64_t) data["t_tokens_generation_total"] / 1.e3} + }}}, + {"gauge", {{ + {"name", "prompt_tokens_seconds"}, + {"help", "Average prompt throughput in tokens/s."}, + {"value", n_prompt_tokens_processed ? 1.e3 / t_prompt_processing * n_prompt_tokens_processed : 0.} + },{ + {"name", "predicted_tokens_seconds"}, + {"help", "Average generation throughput in tokens/s."}, + {"value", n_tokens_predicted ? 1.e3 / t_tokens_generation * n_tokens_predicted : 0.} + },{ + {"name", "kv_cache_usage_ratio"}, + {"help", "KV-cache usage. 1 means 100 percent usage."}, + {"value", 1. * kv_cache_used_cells / params.n_ctx} + },{ + {"name", "kv_cache_tokens"}, + {"help", "KV-cache tokens."}, + {"value", (uint64_t) data["kv_cache_tokens_count"]} + },{ + {"name", "requests_processing"}, + {"help", "Number of request processing."}, + {"value", (uint64_t) data["processing"]} + },{ + {"name", "requests_deferred"}, + {"help", "Number of request deferred."}, + {"value", (uint64_t) data["deferred"]} + }}} + }; + + std::stringstream prometheus; + + for (const auto & el : all_metrics_def.items()) { + const auto & type = el.key(); + const auto & metrics_def = el.value(); + + for (const auto & metric_def : metrics_def) { + const std::string name = metric_def["name"]; + const std::string help = metric_def["help"]; + + auto value = json_value(metric_def, "value", 0.); + prometheus << "# HELP llamacpp:" << name << " " << help << "\n" + << "# TYPE llamacpp:" << name << " " << type << "\n" + << "llamacpp:" << name << " " << value << "\n"; + } + } + + const int64_t t_start = data["t_start"]; + res.set_header("Process-Start-Time-Unix", std::to_string(t_start)); + + res.set_content(prometheus.str(), "text/plain; version=0.0.4"); + res.status = 200; // HTTP OK + }; + + const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json data = { { "user_name", ctx_server.name_user.c_str() }, @@ -3020,13 +3040,10 @@ int main(int argc, char ** argv) { }; res.set_content(data.dump(), "application/json; charset=utf-8"); - }); + }; - const auto completions = [&ctx_server, &validate_api_key](const httplib::Request & req, httplib::Response & res) { + const auto handle_completions = [&ctx_server](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - if (!validate_api_key(req, res)) { - return; - } json data = json::parse(req.body); @@ -3102,11 +3119,7 @@ int main(int argc, char ** argv) { } }; - svr->Post("/completion", completions); // legacy - svr->Post("/completions", completions); - svr->Post("/v1/completions", completions); - - svr->Get("/v1/models", [¶ms, &model_meta](const httplib::Request & req, httplib::Response & res) { + const auto handle_models = [¶ms, &model_meta](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json models = { @@ -3123,14 +3136,10 @@ int main(int argc, char ** argv) { }; res.set_content(models.dump(), "application/json; charset=utf-8"); - }); + }; - const auto chat_completions = [&ctx_server, &validate_api_key, &sparams](const httplib::Request & req, httplib::Response & res) { + const auto handle_chat_completions = [&ctx_server, &sparams](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - if (!validate_api_key(req, res)) { - return; - } - json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), sparams.chat_template); const int id_task = ctx_server.queue_tasks.get_new_id(); @@ -3201,14 +3210,8 @@ int main(int argc, char ** argv) { } }; - svr->Post("/chat/completions", chat_completions); - svr->Post("/v1/chat/completions", chat_completions); - - svr->Post("/infill", [&ctx_server, &validate_api_key](const httplib::Request & req, httplib::Response & res) { + const auto handle_infill = [&ctx_server](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - if (!validate_api_key(req, res)) { - return; - } json data = json::parse(req.body); @@ -3266,13 +3269,9 @@ int main(int argc, char ** argv) { res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); } - }); - - svr->Options(R"(/.*)", [](const httplib::Request &, httplib::Response & res) { - return res.set_content("", "application/json; charset=utf-8"); - }); + }; - svr->Post("/tokenize", [&ctx_server](const httplib::Request & req, httplib::Response & res) { + const auto handle_tokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); const json body = json::parse(req.body); @@ -3282,9 +3281,9 @@ int main(int argc, char ** argv) { } const json data = format_tokenizer_response(tokens); return res.set_content(data.dump(), "application/json; charset=utf-8"); - }); + }; - svr->Post("/detokenize", [&ctx_server](const httplib::Request & req, httplib::Response & res) { + const auto handle_detokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); const json body = json::parse(req.body); @@ -3296,9 +3295,9 @@ int main(int argc, char ** argv) { const json data = format_detokenized_response(content); return res.set_content(data.dump(), "application/json; charset=utf-8"); - }); + }; - svr->Post("/embedding", [¶ms, &ctx_server](const httplib::Request & req, httplib::Response & res) { + const auto handle_embeddings = [¶ms, &ctx_server](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); if (!params.embedding) { res.status = 501; @@ -3307,94 +3306,114 @@ int main(int argc, char ** argv) { } const json body = json::parse(req.body); + bool is_openai = false; - json prompt; - if (body.count("content") != 0) { - prompt = body["content"]; + // an input prompt can string or a list of tokens (integer) + std::vector prompts; + if (body.count("input") != 0) { + is_openai = true; + if (body["input"].is_array()) { + // support multiple prompts + for (const json & elem : body["input"]) { + prompts.push_back(elem); + } + } else { + // single input prompt + prompts.push_back(body["input"]); + } + } else if (body.count("content") != 0) { + // only support single prompt here + std::string content = body["content"]; + prompts.push_back(content); } else { - prompt = ""; - } - - // create and queue the task - const int id_task = ctx_server.queue_tasks.get_new_id(); - - ctx_server.queue_results.add_waiting_task_id(id_task); - ctx_server.request_completion(id_task, -1, { {"prompt", prompt}, { "n_predict", 0} }, false, true); - - // get the result - server_task_result result = ctx_server.queue_results.recv(id_task); - ctx_server.queue_results.remove_waiting_task_id(id_task); - - // send the result - return res.set_content(result.data.dump(), "application/json; charset=utf-8"); - }); - - svr->Post("/v1/embeddings", [¶ms, &ctx_server](const httplib::Request & req, httplib::Response & res) { - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - if (!params.embedding) { - res.status = 501; - res.set_content("This server does not support embeddings. Start it with `--embeddings`", "text/plain; charset=utf-8"); - return; + // TODO @ngxson : should return an error here + prompts.push_back(""); } - const json body = json::parse(req.body); - - json prompt; - if (body.count("input") != 0) { - prompt = body["input"]; - if (prompt.is_array()) { - json data = json::array(); - - int i = 0; - for (const json & elem : prompt) { - const int id_task = ctx_server.queue_tasks.get_new_id(); - - ctx_server.queue_results.add_waiting_task_id(id_task); - ctx_server.request_completion(id_task, -1, { {"prompt", elem}, { "n_predict", 0} }, false, true); - - // get the result - server_task_result result = ctx_server.queue_results.recv(id_task); - ctx_server.queue_results.remove_waiting_task_id(id_task); - - json embedding = json{ - {"embedding", json_value(result.data, "embedding", json::array())}, - {"index", i++}, - {"object", "embedding"} - }; + // process all prompts + json responses = json::array(); + for (auto & prompt : prompts) { + // TODO @ngxson : maybe support multitask for this endpoint? + // create and queue the task + const int id_task = ctx_server.queue_tasks.get_new_id(); - data.push_back(embedding); - } - - json result = format_embeddings_response_oaicompat(body, data); + ctx_server.queue_results.add_waiting_task_id(id_task); + ctx_server.request_completion(id_task, -1, { {"prompt", prompt}, { "n_predict", 0}}, false, true); - return res.set_content(result.dump(), "application/json; charset=utf-8"); + // get the result + server_task_result result = ctx_server.queue_results.recv(id_task); + ctx_server.queue_results.remove_waiting_task_id(id_task); + responses.push_back(result.data); + } + + // write JSON response + json root; + if (is_openai) { + json res_oai = json::array(); + int i = 0; + for (auto & elem : responses) { + res_oai.push_back(json{ + {"embedding", json_value(elem, "embedding", json::array())}, + {"index", i++}, + {"object", "embedding"} + }); } + root = format_embeddings_response_oaicompat(body, res_oai); } else { - prompt = ""; + root = responses[0]; } + return res.set_content(root.dump(), "application/json; charset=utf-8"); + }; - // create and queue the task - const int id_task = ctx_server.queue_tasks.get_new_id(); - - ctx_server.queue_results.add_waiting_task_id(id_task); - ctx_server.request_completion(id_task, -1, { {"prompt", prompt}, { "n_predict", 0}}, false, true); + // + // Router + // - // get the result - server_task_result result = ctx_server.queue_results.recv(id_task); - ctx_server.queue_results.remove_waiting_task_id(id_task); - - json data = json::array({json{ - {"embedding", json_value(result.data, "embedding", json::array())}, - {"index", 0}, - {"object", "embedding"} - }} - ); + // register static assets routes + if (!sparams.public_path.empty()) { + // Set the base directory for serving static files + svr->set_base_dir(sparams.public_path); + } - json root = format_embeddings_response_oaicompat(body, data); + // using embedded static files + auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) { + return [content, len, mime_type](const httplib::Request &, httplib::Response & res) { + res.set_content(reinterpret_cast(content), len, mime_type); + return false; + }; + }; - return res.set_content(root.dump(), "application/json; charset=utf-8"); + svr->Options(R"(/.*)", [](const httplib::Request &, httplib::Response & res) { + // TODO @ngxson : I have no idea what it is... maybe this is redundant? + return res.set_content("", "application/json; charset=utf-8"); }); + svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8")); + svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8")); + svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8")); + svr->Get("/json-schema-to-grammar.mjs", handle_static_file( + json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8")); + + // register API routes + svr->Get ("/health", handle_health); + svr->Get ("/slots", handle_slots); + svr->Get ("/metrics", handle_metrics); + svr->Get ("/props", handle_props); + svr->Get ("/v1/models", handle_models); + svr->Post("/completion", handle_completions); // legacy + svr->Post("/completions", handle_completions); + svr->Post("/v1/completions", handle_completions); + svr->Post("/chat/completions", handle_chat_completions); + svr->Post("/v1/chat/completions", handle_chat_completions); + svr->Post("/infill", handle_infill); + svr->Post("/embedding", handle_embeddings); // legacy + svr->Post("/embeddings", handle_embeddings); + svr->Post("/v1/embeddings", handle_embeddings); + svr->Post("/tokenize", handle_tokenize); + svr->Post("/detokenize", handle_detokenize); + // + // Start the server + // if (sparams.n_threads_http < 1) { // +2 threads for monitoring endpoints sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1); diff --git a/examples/server/tests/features/security.feature b/examples/server/tests/features/security.feature index 42a6709a53380..1d6aa40ea6985 100644 --- a/examples/server/tests/features/security.feature +++ b/examples/server/tests/features/security.feature @@ -39,8 +39,9 @@ Feature: Security Scenario Outline: CORS Options - When an OPTIONS request is sent from - Then CORS header is set to + Given a user api key llama.cpp + When an OPTIONS request is sent from + Then CORS header is set to Examples: Headers | origin | cors_header | cors_header_value | diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 0076f805be4d3..14204850960c9 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -582,8 +582,9 @@ async def step_detokenize(context): @async_run_until_complete async def step_options_request(context, origin): async with aiohttp.ClientSession() as session: + headers = {'Authorization': f'Bearer {context.user_api_key}', 'Origin': origin} async with session.options(f'{context.base_url}/v1/chat/completions', - headers={"Origin": origin}) as response: + headers=headers) as response: assert response.status == 200 context.options_response = response From 9674aaf35cb81478eb38c3f3ebde713ec72fbb79 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Mar 2024 12:34:18 +0200 Subject: [PATCH 09/39] server : simplify logic for empty prompts (#5953) --- examples/server/server.cpp | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 6e0f8328cdf5a..aedf0afc603c3 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1704,19 +1704,6 @@ struct server_context { // next, batch any pending prompts without exceeding n_batch if (params.cont_batching || batch.n_tokens == 0) { for (auto & slot : slots) { - const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get().empty()); - - // empty prompt passed -> release the slot and send empty response - // note: infill mode allows empty prompt - if (slot.state == SLOT_STATE_IDLE && slot.command == SLOT_COMMAND_LOAD_PROMPT && !has_prompt && !slot.infill) { - slot.state = SLOT_STATE_PROCESSING; - slot.command = SLOT_COMMAND_NONE; - slot.release(); - slot.print_timings(); - send_final_response(slot); - continue; - } - // this slot still has a prompt to be processed if (slot.state == SLOT_STATE_IDLE && slot.command == SLOT_COMMAND_LOAD_PROMPT) { auto & prompt_tokens = slot.prompt_tokens; @@ -1768,6 +1755,21 @@ struct server_context { {"prompt_tokens", tokens_to_str(ctx, prompt_tokens.cbegin(), prompt_tokens.cend())}, }); + // empty prompt passed -> release the slot and send empty response + if (prompt_tokens.empty()) { + LOG_INFO("empty prompt - releasing slot", { + {"id_slot", slot.id}, + {"id_task", slot.id_task} + }); + + slot.state = SLOT_STATE_PROCESSING; + slot.command = SLOT_COMMAND_NONE; + slot.release(); + slot.print_timings(); + send_final_response(slot); + continue; + } + if (slot.embedding) { // this prompt is too large to process - discard it if (slot.n_prompt_tokens > n_batch) { From 8a3012a4ad08112bb3dc3f1399afec4e93780c44 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Mar 2024 12:47:57 +0200 Subject: [PATCH 10/39] ggml : add ggml-common.h to deduplicate shared code (#5940) * ggml : add ggml-common.h to shared code ggml-ci * scripts : update sync scripts * sycl : reuse quantum tables ggml-ci * ggml : minor * ggml : minor * sycl : try to fix build --- CMakeLists.txt | 3 +- Makefile | 4 +- ggml-common.h | 779 ++++++++++++++++++++++++++++++++++++++++ ggml-cuda.cu | 743 +------------------------------------- ggml-metal.metal | 707 +----------------------------------- ggml-quants.c | 708 +----------------------------------- ggml-quants.h | 4 +- ggml-sycl.cpp | 384 +------------------- scripts/sync-ggml-am.sh | 2 + scripts/sync-ggml.sh | 1 + 10 files changed, 799 insertions(+), 2536 deletions(-) create mode 100644 ggml-common.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 48880f7204bf5..9309ca6bbe4d7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -199,7 +199,8 @@ if (LLAMA_METAL) # get full path to the file #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/") - # copy ggml-metal.metal to bin directory + # copy ggml-common.h and ggml-metal.metal to bin directory + configure_file(ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY) configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY) if (LLAMA_METAL_EMBED_LIBRARY) diff --git a/Makefile b/Makefile index aea9692227298..d809b8b3bbfc1 100644 --- a/Makefile +++ b/Makefile @@ -453,7 +453,7 @@ endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE ifdef LLAMA_CUDA_CCBIN MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN) endif -ggml-cuda.o: ggml-cuda.cu ggml-cuda.h +ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-common.h ifdef JETSON_EOL_MODULE_DETECT $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ else @@ -630,7 +630,7 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h $(CC) $(CFLAGS) -c $< -o $@ -ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h +ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h $(CC) $(CFLAGS) -c $< -o $@ OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o diff --git a/ggml-common.h b/ggml-common.h new file mode 100644 index 0000000000000..4b6d248b6db90 --- /dev/null +++ b/ggml-common.h @@ -0,0 +1,779 @@ +#pragma once + +#if defined(GGML_COMMON_IMPL_C) +#include + +#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = { +#define GGML_TABLE_END() }; + +#define GGML_COMMON_IMPL +#elif defined(GGML_COMMON_IMPL_METAL) +#include + +#define GGML_TABLE_BEGIN(type, name, size) static const constant type name[size] = { +#define GGML_TABLE_END() }; + +#define GGML_COMMON_IMPL +#elif defined(GGML_COMMON_IMPL_CUDA) +#include + +#define GGML_TABLE_BEGIN(type, name, size) static const __device__ __constant__ type name[size] = { +#define GGML_TABLE_END() }; + +#define GGML_COMMON_IMPL +#elif defined(GGML_COMMON_IMPL_SYCL) +#include + +#define GGML_TABLE_BEGIN(type, name, size) static dpct::global_memory name(sycl::range<1>(size), { +#define GGML_TABLE_END() }); + +#define GGML_COMMON_IMPL +#endif + +#if defined(GGML_COMMON_IMPL) + +GGML_TABLE_BEGIN(uint8_t, kmask_iq2xs, 8) + 1, 2, 4, 8, 16, 32, 64, 128 +GGML_TABLE_END() + +GGML_TABLE_BEGIN(uint8_t, ksigns_iq2xs, 128) + 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15, + 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159, + 160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175, + 48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63, + 192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207, + 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95, + 96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111, + 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255, +GGML_TABLE_END() + +//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics +GGML_TABLE_BEGIN(uint64_t, ksigns64, 128) + 0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff, + 0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff, + 0xff000000ff000000, 0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff, + 0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00, 0x00000000ffffffff, + 0xff0000ff00000000, 0x000000ff000000ff, 0x000000ff0000ff00, 0xff0000ff0000ffff, + 0x000000ff00ff0000, 0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff, + 0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0x000000ffff00ffff, + 0xff0000ffffff0000, 0x000000ffffff00ff, 0x000000ffffffff00, 0xff0000ffffffffff, + 0xff00ff0000000000, 0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff, + 0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0x0000ff0000ffffff, + 0x0000ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 0x0000ff00ff00ffff, + 0xff00ff00ffff0000, 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff, + 0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00, 0x0000ffff0000ffff, + 0xff00ffff00ff0000, 0x0000ffff00ff00ff, 0x0000ffff00ffff00, 0xff00ffff00ffffff, + 0xff00ffffff000000, 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff, + 0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00, 0x0000ffffffffffff, + 0xffff000000000000, 0x00ff0000000000ff, 0x00ff00000000ff00, 0xffff00000000ffff, + 0x00ff000000ff0000, 0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff, + 0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0x00ff0000ff00ffff, + 0xffff0000ffff0000, 0x00ff0000ffff00ff, 0x00ff0000ffffff00, 0xffff0000ffffffff, + 0x00ff00ff00000000, 0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff, + 0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 0xffff00ff00ffffff, + 0xffff00ffff000000, 0x00ff00ffff0000ff, 0x00ff00ffff00ff00, 0xffff00ffff00ffff, + 0x00ff00ffffff0000, 0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff, + 0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00, 0x00ffff000000ffff, + 0xffffff0000ff0000, 0x00ffff0000ff00ff, 0x00ffff0000ffff00, 0xffffff0000ffffff, + 0xffffff00ff000000, 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff, + 0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00, 0x00ffff00ffffffff, + 0xffffffff00000000, 0x00ffffff000000ff, 0x00ffffff0000ff00, 0xffffffff0000ffff, + 0x00ffffff00ff0000, 0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff, + 0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff, + 0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff, +GGML_TABLE_END() +//#endif + + +GGML_TABLE_BEGIN(uint64_t, iq2xxs_grid, 256) + 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, + 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808, + 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819, + 0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819, + 0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b, + 0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808, + 0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08, + 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b, + 0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819, + 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08, + 0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, + 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08, + 0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808, + 0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808, + 0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919, + 0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819, + 0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08, + 0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908, + 0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819, + 0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808, + 0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808, + 0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908, + 0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808, + 0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08, + 0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819, + 0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819, + 0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819, + 0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908, + 0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19, + 0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819, + 0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b, + 0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808, + 0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908, + 0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08, + 0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08, + 0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908, + 0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819, + 0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808, + 0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808, + 0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19, + 0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819, + 0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, + 0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b, + 0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08, + 0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808, + 0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908, + 0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b, + 0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819, + 0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08, + 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08, + 0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808, + 0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b, + 0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b, + 0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908, + 0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819, + 0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808, + 0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908, + 0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b, + 0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808, + 0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b, + 0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b, + 0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808, + 0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19, + 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908, +GGML_TABLE_END() + +GGML_TABLE_BEGIN(uint64_t, iq2xs_grid, 512) + 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, + 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b, + 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919, + 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b, + 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919, + 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808, + 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819, + 0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819, + 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, + 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b, + 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b, + 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908, + 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908, + 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919, + 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808, + 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919, + 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908, + 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, + 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, + 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08, + 0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808, + 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808, + 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819, + 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908, + 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819, + 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808, + 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b, + 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819, + 0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819, + 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808, + 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908, + 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19, + 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b, + 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b, + 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919, + 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808, + 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819, + 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819, + 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b, + 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908, + 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808, + 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819, + 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808, + 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, + 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808, + 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808, + 0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908, + 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908, + 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808, + 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b, + 0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819, + 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, + 0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908, + 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808, + 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908, + 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919, + 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08, + 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19, + 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b, + 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b, + 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808, + 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08, + 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b, + 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908, + 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b, + 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908, + 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, + 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808, + 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808, + 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08, + 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819, + 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919, + 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808, + 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808, + 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819, + 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819, + 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908, + 0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908, + 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b, + 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908, + 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908, + 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908, + 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808, + 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, + 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819, + 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819, + 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808, + 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b, + 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819, + 0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819, + 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08, + 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808, + 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19, + 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919, + 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, + 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19, + 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b, + 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808, + 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b, + 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b, + 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, + 0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b, + 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808, + 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819, + 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808, + 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808, + 0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08, + 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b, + 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19, + 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08, + 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919, + 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08, + 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08, + 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908, + 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908, + 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b, + 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908, + 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808, + 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b, + 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808, + 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808, + 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19, + 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08, + 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808, + 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b, + 0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808, + 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b, + 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b, +GGML_TABLE_END() + +GGML_TABLE_BEGIN(uint64_t, iq2s_grid, 1024) + 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, + 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b, + 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919, + 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b, + 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919, + 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b, + 0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919, + 0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808, + 0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908, + 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b, + 0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908, + 0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08, + 0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19, + 0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819, + 0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919, + 0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b, + 0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, + 0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908, + 0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919, + 0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908, + 0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b, + 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919, + 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b, + 0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, + 0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908, + 0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b, + 0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b, + 0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08, + 0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, + 0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819, + 0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808, + 0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908, + 0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b, + 0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908, + 0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08, + 0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819, + 0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808, + 0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08, + 0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819, + 0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b, + 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908, + 0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919, + 0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b, + 0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919, + 0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808, + 0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819, + 0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919, + 0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919, + 0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808, + 0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819, + 0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b, + 0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908, + 0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, + 0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, + 0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919, + 0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b, + 0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919, + 0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b, + 0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819, + 0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919, + 0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908, + 0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b, + 0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908, + 0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b, + 0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908, + 0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08, + 0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908, + 0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819, + 0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819, + 0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808, + 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08, + 0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19, + 0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819, + 0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808, + 0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819, + 0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919, + 0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808, + 0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19, + 0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08, + 0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b, + 0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908, + 0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808, + 0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819, + 0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908, + 0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819, + 0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808, + 0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808, + 0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819, + 0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908, + 0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08, + 0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819, + 0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b, + 0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b, + 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08, + 0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19, + 0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819, + 0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919, + 0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908, + 0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808, + 0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808, + 0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908, + 0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808, + 0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08, + 0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08, + 0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908, + 0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919, + 0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808, + 0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819, + 0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908, + 0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08, + 0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819, + 0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808, + 0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808, + 0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819, + 0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808, + 0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908, + 0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b, + 0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, + 0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, + 0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b, + 0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808, + 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b, + 0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19, + 0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819, + 0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08, + 0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b, + 0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908, + 0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b, + 0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b, + 0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919, + 0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808, + 0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819, + 0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908, + 0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08, + 0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08, + 0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819, + 0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919, + 0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908, + 0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b, + 0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908, + 0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b, + 0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908, + 0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08, + 0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819, + 0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808, + 0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819, + 0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919, + 0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808, + 0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808, + 0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08, + 0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819, + 0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919, + 0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808, + 0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819, + 0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919, + 0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808, + 0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b, + 0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908, + 0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808, + 0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908, + 0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b, + 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908, + 0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b, + 0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908, + 0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b, + 0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908, + 0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08, + 0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908, + 0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b, + 0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908, + 0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08, + 0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819, + 0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919, + 0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808, + 0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19, + 0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b, + 0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919, + 0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808, + 0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819, + 0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908, + 0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919, + 0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808, + 0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808, + 0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b, + 0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919, + 0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808, + 0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b, + 0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808, + 0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919, + 0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b, + 0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08, + 0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919, + 0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808, + 0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b, + 0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908, + 0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808, + 0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808, + 0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808, + 0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908, + 0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808, + 0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808, + 0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b, + 0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908, + 0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808, + 0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808, + 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819, + 0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919, + 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b, + 0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808, + 0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819, + 0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b, + 0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908, + 0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08, + 0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908, + 0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919, + 0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819, + 0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908, + 0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b, + 0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808, + 0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819, + 0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908, + 0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919, + 0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808, + 0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808, + 0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808, + 0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919, + 0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908, + 0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908, + 0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08, + 0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819, + 0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b, + 0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808, + 0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819, + 0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908, + 0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819, + 0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808, + 0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808, + 0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b, + 0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908, + 0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808, + 0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908, + 0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819, + 0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819, + 0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808, + 0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b, + 0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b, + 0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819, + 0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b, + 0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b, + 0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b, + 0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819, + 0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19, + 0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819, + 0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908, + 0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808, + 0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b, +GGML_TABLE_END() + +GGML_TABLE_BEGIN(uint32_t, iq3xxs_grid, 256) + 0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414, + 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14, + 0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404, + 0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e, + 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c, + 0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c, + 0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34, + 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c, + 0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c, + 0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04, + 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c, + 0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414, + 0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434, + 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c, + 0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e, + 0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24, + 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24, + 0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c, + 0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c, + 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14, + 0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414, + 0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e, + 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404, + 0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c, + 0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c, + 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14, + 0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c, + 0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c, + 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14, + 0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14, + 0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c, + 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, +GGML_TABLE_END() + +GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512) + 0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305, + 0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905, + 0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09, + 0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b, + 0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b, + 0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d, + 0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03, + 0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505, + 0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03, + 0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901, + 0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d, + 0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303, + 0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501, + 0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105, + 0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505, + 0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101, + 0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707, + 0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b, + 0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01, + 0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f, + 0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305, + 0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103, + 0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509, + 0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503, + 0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b, + 0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f, + 0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f, + 0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f, + 0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109, + 0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f, + 0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509, + 0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501, + 0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303, + 0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f, + 0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907, + 0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703, + 0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03, + 0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01, + 0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01, + 0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903, + 0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505, + 0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b, + 0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107, + 0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509, + 0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303, + 0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103, + 0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05, + 0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b, + 0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f, + 0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701, + 0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909, + 0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305, + 0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d, + 0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b, + 0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d, + 0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307, + 0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09, + 0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309, + 0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709, + 0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f, + 0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303, + 0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503, + 0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b, + 0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101, +GGML_TABLE_END() + +#define NGRID_IQ2XXS 512 +GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ2XXS) + 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000, + 0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01, + 0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100, + 0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00, + 0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101, + 0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100, + 0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00, + 0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff, + 0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000, + 0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000, + 0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001, + 0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff, + 0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01, + 0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001, + 0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00, + 0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001, + 0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100, + 0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000, + 0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000, + 0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000, + 0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff, + 0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff, + 0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01, + 0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100, + 0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff, + 0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000, + 0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101, + 0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff, + 0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff, + 0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001, + 0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01, + 0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101, + 0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100, + 0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00, + 0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001, + 0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff, + 0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000, + 0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000, + 0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100, + 0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100, + 0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01, + 0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff, + 0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101, + 0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000, + 0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff, + 0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000, + 0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff, + 0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00, + 0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101, + 0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000, + 0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000, + 0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000, + 0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100, + 0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000, + 0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001, + 0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff, + 0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000, + 0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000, + 0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000, + 0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000, + 0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff, + 0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000, + 0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001, + 0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01, + 0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100, + 0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000, + 0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00, + 0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100, + 0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000, + 0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001, + 0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00, + 0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff, + 0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100, + 0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff, + 0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000, + 0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff, + 0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff, + 0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00, + 0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001, + 0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001, + 0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01, + 0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000, + 0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101, + 0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00, + 0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100, + 0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101, + 0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101, + 0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000, + 0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff, + 0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff, + 0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101, + 0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff, + 0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101, + 0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001, + 0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff, + 0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff, + 0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01, + 0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff, + 0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100, + 0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001, + 0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00, + 0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff, + 0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff, + 0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000, + 0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000, + 0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101, + 0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001, + 0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000, + 0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101, + 0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000, + 0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001, + 0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000, + 0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100, + 0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000, + 0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000, + 0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100, + 0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff, + 0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff, + 0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00, + 0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101, + 0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000, + 0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00, + 0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000, + 0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff, + 0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101, + 0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff, + 0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00, + 0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff, +GGML_TABLE_END() + +#endif // GGML_COMMON_IMPL diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 72bcec8cdb17a..c207ff87a281a 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2,6 +2,9 @@ #include "ggml.h" #include "ggml-backend-impl.h" +#define GGML_COMMON_IMPL_CUDA +#include "ggml-common.h" + #include #include #include @@ -1569,746 +1572,6 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t #endif } -static const __device__ uint64_t iq2xxs_grid[256] = { - 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, - 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808, - 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819, - 0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819, - 0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b, - 0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808, - 0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08, - 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b, - 0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819, - 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08, - 0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, - 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08, - 0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808, - 0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808, - 0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919, - 0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819, - 0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08, - 0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908, - 0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819, - 0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808, - 0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808, - 0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908, - 0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808, - 0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08, - 0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819, - 0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819, - 0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819, - 0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908, - 0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19, - 0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819, - 0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b, - 0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808, - 0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908, - 0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08, - 0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08, - 0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908, - 0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819, - 0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808, - 0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808, - 0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19, - 0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819, - 0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, - 0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b, - 0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08, - 0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808, - 0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908, - 0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b, - 0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819, - 0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08, - 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08, - 0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808, - 0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b, - 0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b, - 0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908, - 0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819, - 0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808, - 0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908, - 0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b, - 0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808, - 0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b, - 0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b, - 0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808, - 0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19, - 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908, -}; - -static const __device__ uint64_t iq2xs_grid[512] = { - 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, - 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b, - 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919, - 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b, - 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919, - 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808, - 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819, - 0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819, - 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, - 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b, - 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b, - 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908, - 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908, - 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919, - 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808, - 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919, - 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908, - 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, - 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, - 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08, - 0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808, - 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808, - 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819, - 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908, - 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819, - 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808, - 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b, - 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819, - 0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819, - 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808, - 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908, - 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19, - 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b, - 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b, - 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919, - 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808, - 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819, - 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819, - 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b, - 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908, - 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808, - 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819, - 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808, - 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, - 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808, - 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808, - 0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908, - 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908, - 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808, - 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b, - 0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819, - 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, - 0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908, - 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808, - 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908, - 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919, - 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08, - 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19, - 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b, - 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b, - 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808, - 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08, - 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b, - 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908, - 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b, - 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908, - 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, - 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808, - 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808, - 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08, - 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819, - 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919, - 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808, - 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808, - 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819, - 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819, - 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908, - 0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908, - 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b, - 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908, - 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908, - 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908, - 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808, - 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, - 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819, - 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819, - 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808, - 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b, - 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819, - 0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819, - 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08, - 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808, - 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19, - 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919, - 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, - 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19, - 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b, - 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808, - 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b, - 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b, - 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, - 0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b, - 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808, - 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819, - 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808, - 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808, - 0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08, - 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b, - 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19, - 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08, - 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919, - 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08, - 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08, - 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908, - 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908, - 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b, - 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908, - 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808, - 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b, - 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808, - 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808, - 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19, - 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08, - 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808, - 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b, - 0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808, - 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b, - 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b, -}; - -static const __device__ uint64_t iq2s_grid[1024] = { - 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, - 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b, - 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919, - 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b, - 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919, - 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b, - 0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919, - 0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808, - 0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908, - 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b, - 0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908, - 0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08, - 0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19, - 0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819, - 0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919, - 0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b, - 0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, - 0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908, - 0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919, - 0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908, - 0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b, - 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919, - 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b, - 0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, - 0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908, - 0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b, - 0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b, - 0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08, - 0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, - 0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819, - 0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808, - 0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908, - 0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b, - 0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908, - 0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08, - 0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819, - 0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808, - 0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08, - 0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819, - 0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b, - 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908, - 0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919, - 0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b, - 0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919, - 0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808, - 0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819, - 0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919, - 0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919, - 0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808, - 0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819, - 0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b, - 0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908, - 0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, - 0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, - 0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919, - 0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b, - 0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919, - 0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b, - 0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819, - 0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919, - 0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908, - 0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b, - 0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908, - 0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b, - 0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908, - 0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08, - 0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908, - 0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819, - 0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819, - 0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808, - 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08, - 0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19, - 0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819, - 0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808, - 0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819, - 0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919, - 0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808, - 0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19, - 0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08, - 0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b, - 0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908, - 0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808, - 0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819, - 0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908, - 0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819, - 0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808, - 0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808, - 0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819, - 0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908, - 0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08, - 0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819, - 0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b, - 0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b, - 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08, - 0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19, - 0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819, - 0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919, - 0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908, - 0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808, - 0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808, - 0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908, - 0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808, - 0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08, - 0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08, - 0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908, - 0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919, - 0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808, - 0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819, - 0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908, - 0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08, - 0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819, - 0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808, - 0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808, - 0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819, - 0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808, - 0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908, - 0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b, - 0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, - 0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, - 0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b, - 0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808, - 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b, - 0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19, - 0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819, - 0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08, - 0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b, - 0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908, - 0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b, - 0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b, - 0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919, - 0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808, - 0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819, - 0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908, - 0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08, - 0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08, - 0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819, - 0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919, - 0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908, - 0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b, - 0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908, - 0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b, - 0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908, - 0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08, - 0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819, - 0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808, - 0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819, - 0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919, - 0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808, - 0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808, - 0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08, - 0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819, - 0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919, - 0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808, - 0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819, - 0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919, - 0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808, - 0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b, - 0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908, - 0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808, - 0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908, - 0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b, - 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908, - 0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b, - 0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908, - 0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b, - 0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908, - 0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08, - 0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908, - 0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b, - 0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908, - 0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08, - 0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819, - 0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919, - 0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808, - 0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19, - 0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b, - 0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919, - 0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808, - 0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819, - 0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908, - 0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919, - 0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808, - 0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808, - 0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b, - 0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919, - 0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808, - 0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b, - 0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808, - 0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919, - 0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b, - 0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08, - 0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919, - 0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808, - 0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b, - 0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908, - 0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808, - 0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808, - 0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808, - 0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908, - 0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808, - 0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808, - 0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b, - 0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908, - 0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808, - 0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808, - 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819, - 0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919, - 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b, - 0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808, - 0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819, - 0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b, - 0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908, - 0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08, - 0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908, - 0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919, - 0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819, - 0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908, - 0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b, - 0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808, - 0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819, - 0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908, - 0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919, - 0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808, - 0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808, - 0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808, - 0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919, - 0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908, - 0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908, - 0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08, - 0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819, - 0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b, - 0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808, - 0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819, - 0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908, - 0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819, - 0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808, - 0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808, - 0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b, - 0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908, - 0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808, - 0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908, - 0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819, - 0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819, - 0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808, - 0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b, - 0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b, - 0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819, - 0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b, - 0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b, - 0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b, - 0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819, - 0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19, - 0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819, - 0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908, - 0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808, - 0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b, -}; - -static const __device__ uint32_t iq3xxs_grid[256] = { - 0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414, - 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14, - 0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404, - 0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e, - 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c, - 0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c, - 0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34, - 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c, - 0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c, - 0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04, - 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c, - 0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414, - 0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434, - 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c, - 0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e, - 0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24, - 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24, - 0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c, - 0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c, - 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14, - 0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414, - 0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e, - 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404, - 0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c, - 0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c, - 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14, - 0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c, - 0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c, - 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14, - 0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14, - 0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c, - 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, -}; - -static const __device__ uint32_t iq3s_grid[512] = { - 0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305, - 0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905, - 0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09, - 0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b, - 0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b, - 0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d, - 0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03, - 0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505, - 0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03, - 0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901, - 0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d, - 0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303, - 0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501, - 0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105, - 0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505, - 0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101, - 0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707, - 0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b, - 0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01, - 0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f, - 0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305, - 0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103, - 0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509, - 0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503, - 0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b, - 0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f, - 0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f, - 0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f, - 0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109, - 0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f, - 0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509, - 0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501, - 0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303, - 0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f, - 0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907, - 0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703, - 0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03, - 0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01, - 0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01, - 0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903, - 0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505, - 0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b, - 0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107, - 0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509, - 0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303, - 0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103, - 0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05, - 0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b, - 0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f, - 0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701, - 0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909, - 0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305, - 0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d, - 0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b, - 0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d, - 0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307, - 0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09, - 0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309, - 0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709, - 0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f, - 0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303, - 0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503, - 0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b, - 0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101, -}; - -static const __device__ uint64_t iq1s_grid[512] = { - 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000, - 0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01, - 0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100, - 0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00, - 0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101, - 0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100, - 0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00, - 0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff, - 0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000, - 0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000, - 0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001, - 0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff, - 0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01, - 0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001, - 0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00, - 0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001, - 0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100, - 0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000, - 0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000, - 0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000, - 0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff, - 0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff, - 0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01, - 0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100, - 0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff, - 0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000, - 0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101, - 0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff, - 0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff, - 0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001, - 0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01, - 0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101, - 0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100, - 0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00, - 0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001, - 0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff, - 0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000, - 0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000, - 0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100, - 0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100, - 0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01, - 0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff, - 0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101, - 0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000, - 0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff, - 0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000, - 0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff, - 0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00, - 0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101, - 0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000, - 0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000, - 0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000, - 0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100, - 0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000, - 0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001, - 0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff, - 0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000, - 0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000, - 0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000, - 0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000, - 0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff, - 0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000, - 0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001, - 0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01, - 0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100, - 0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000, - 0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00, - 0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100, - 0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000, - 0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001, - 0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00, - 0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff, - 0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100, - 0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff, - 0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000, - 0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff, - 0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff, - 0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00, - 0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001, - 0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001, - 0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01, - 0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000, - 0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101, - 0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00, - 0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100, - 0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101, - 0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101, - 0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000, - 0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff, - 0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff, - 0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101, - 0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff, - 0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101, - 0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001, - 0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff, - 0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff, - 0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01, - 0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff, - 0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100, - 0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001, - 0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00, - 0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff, - 0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff, - 0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000, - 0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000, - 0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101, - 0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001, - 0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000, - 0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101, - 0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000, - 0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001, - 0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000, - 0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100, - 0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000, - 0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000, - 0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100, - 0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff, - 0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff, - 0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00, - 0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101, - 0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000, - 0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00, - 0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000, - 0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff, - 0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101, - 0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff, - 0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00, - 0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff, -}; - -static const __device__ uint8_t ksigns_iq2xs[128] = { - 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15, - 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159, - 160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175, - 48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63, - 192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207, - 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95, - 96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111, - 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255, -}; - -//#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics -static const __device__ uint64_t ksigns64[128] = { - 0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, 0x000000000000ffff, - 0xff00000000ff0000, 0x0000000000ff00ff, 0x0000000000ffff00, 0xff00000000ffffff, - 0xff000000ff000000, 0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff, - 0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00, 0x00000000ffffffff, - 0xff0000ff00000000, 0x000000ff000000ff, 0x000000ff0000ff00, 0xff0000ff0000ffff, - 0x000000ff00ff0000, 0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff, - 0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0x000000ffff00ffff, - 0xff0000ffffff0000, 0x000000ffffff00ff, 0x000000ffffffff00, 0xff0000ffffffffff, - 0xff00ff0000000000, 0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff, - 0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0x0000ff0000ffffff, - 0x0000ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, 0x0000ff00ff00ffff, - 0xff00ff00ffff0000, 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff, - 0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00, 0x0000ffff0000ffff, - 0xff00ffff00ff0000, 0x0000ffff00ff00ff, 0x0000ffff00ffff00, 0xff00ffff00ffffff, - 0xff00ffffff000000, 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff, - 0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00, 0x0000ffffffffffff, - 0xffff000000000000, 0x00ff0000000000ff, 0x00ff00000000ff00, 0xffff00000000ffff, - 0x00ff000000ff0000, 0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff, - 0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0x00ff0000ff00ffff, - 0xffff0000ffff0000, 0x00ff0000ffff00ff, 0x00ff0000ffffff00, 0xffff0000ffffffff, - 0x00ff00ff00000000, 0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff, - 0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, 0xffff00ff00ffffff, - 0xffff00ffff000000, 0x00ff00ffff0000ff, 0x00ff00ffff00ff00, 0xffff00ffff00ffff, - 0x00ff00ffffff0000, 0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff, - 0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00, 0x00ffff000000ffff, - 0xffffff0000ff0000, 0x00ffff0000ff00ff, 0x00ffff0000ffff00, 0xffffff0000ffffff, - 0xffffff00ff000000, 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff, - 0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00, 0x00ffff00ffffffff, - 0xffffffff00000000, 0x00ffffff000000ff, 0x00ffffff0000ff00, 0xffffffff0000ffff, - 0x00ffffff00ff0000, 0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff, - 0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, 0x00ffffffff00ffff, - 0xffffffffffff0000, 0x00ffffffffff00ff, 0x00ffffffffffff00, 0xffffffffffffffff, -}; -//#endif - -static const __device__ uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128}; - inline bool ggml_cuda_supports_mmq(enum ggml_type type) { switch (type) { case GGML_TYPE_Q4_0: diff --git a/ggml-metal.metal b/ggml-metal.metal index a65d126412bfb..6ebbbd195e7ce 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -1,5 +1,8 @@ #include +#define GGML_COMMON_IMPL_METAL +#include "ggml-common.h" + using namespace metal; #define MAX(x, y) ((x) > (y) ? (x) : (y)) @@ -3638,710 +3641,6 @@ kernel void kernel_mul_mv_q6_K_f32( // ======================= "True" 2-bit -constexpr constant static uint64_t iq2xxs_grid[256] = { - 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, - 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808, - 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819, - 0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819, - 0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b, - 0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808, - 0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08, - 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b, - 0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819, - 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08, - 0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, - 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08, - 0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808, - 0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808, - 0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919, - 0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819, - 0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08, - 0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908, - 0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819, - 0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808, - 0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808, - 0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908, - 0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808, - 0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08, - 0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819, - 0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819, - 0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819, - 0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908, - 0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19, - 0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819, - 0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b, - 0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808, - 0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908, - 0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08, - 0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08, - 0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908, - 0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819, - 0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808, - 0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808, - 0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19, - 0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819, - 0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, - 0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b, - 0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08, - 0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808, - 0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908, - 0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b, - 0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819, - 0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08, - 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08, - 0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808, - 0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b, - 0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b, - 0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908, - 0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819, - 0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808, - 0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908, - 0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b, - 0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808, - 0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b, - 0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b, - 0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808, - 0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19, - 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908, -}; - -constexpr constant static uint64_t iq2xs_grid[512] = { - 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, - 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b, - 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919, - 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b, - 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919, - 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808, - 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819, - 0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819, - 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, - 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b, - 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b, - 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908, - 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908, - 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919, - 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808, - 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919, - 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908, - 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, - 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, - 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08, - 0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808, - 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808, - 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819, - 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908, - 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819, - 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808, - 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b, - 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819, - 0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819, - 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808, - 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908, - 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19, - 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b, - 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b, - 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919, - 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808, - 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819, - 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819, - 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b, - 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908, - 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808, - 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819, - 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808, - 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, - 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808, - 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808, - 0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908, - 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908, - 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808, - 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b, - 0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819, - 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, - 0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908, - 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808, - 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908, - 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919, - 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08, - 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19, - 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b, - 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b, - 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808, - 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08, - 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b, - 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908, - 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b, - 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908, - 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, - 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808, - 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808, - 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08, - 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819, - 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919, - 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808, - 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808, - 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819, - 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819, - 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908, - 0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908, - 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b, - 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908, - 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908, - 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908, - 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808, - 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, - 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819, - 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819, - 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808, - 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b, - 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819, - 0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819, - 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08, - 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808, - 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19, - 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919, - 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, - 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19, - 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b, - 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808, - 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b, - 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b, - 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, - 0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b, - 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808, - 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819, - 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808, - 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808, - 0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08, - 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b, - 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19, - 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08, - 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919, - 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08, - 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08, - 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908, - 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908, - 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b, - 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908, - 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808, - 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b, - 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808, - 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808, - 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19, - 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08, - 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808, - 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b, - 0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808, - 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b, - 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b, -}; - -constexpr constant static uint64_t iq2s_grid[1024] = { - 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, - 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b, - 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919, - 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b, - 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919, - 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b, - 0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919, - 0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808, - 0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908, - 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b, - 0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908, - 0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08, - 0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19, - 0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819, - 0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919, - 0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b, - 0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, - 0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908, - 0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919, - 0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908, - 0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b, - 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919, - 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b, - 0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, - 0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908, - 0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b, - 0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b, - 0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08, - 0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, - 0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819, - 0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808, - 0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908, - 0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b, - 0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908, - 0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08, - 0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819, - 0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808, - 0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08, - 0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819, - 0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b, - 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908, - 0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919, - 0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b, - 0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919, - 0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808, - 0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819, - 0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919, - 0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919, - 0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808, - 0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819, - 0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b, - 0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908, - 0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, - 0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, - 0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919, - 0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b, - 0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919, - 0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b, - 0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819, - 0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919, - 0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908, - 0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b, - 0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908, - 0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b, - 0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908, - 0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08, - 0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908, - 0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819, - 0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819, - 0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808, - 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08, - 0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19, - 0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819, - 0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808, - 0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819, - 0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919, - 0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808, - 0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19, - 0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08, - 0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b, - 0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908, - 0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808, - 0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819, - 0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908, - 0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819, - 0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808, - 0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808, - 0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819, - 0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908, - 0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08, - 0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819, - 0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b, - 0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b, - 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08, - 0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19, - 0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819, - 0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919, - 0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908, - 0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808, - 0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808, - 0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908, - 0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808, - 0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08, - 0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08, - 0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908, - 0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919, - 0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808, - 0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819, - 0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908, - 0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08, - 0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819, - 0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808, - 0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808, - 0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819, - 0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808, - 0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908, - 0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b, - 0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, - 0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, - 0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b, - 0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808, - 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b, - 0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19, - 0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819, - 0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08, - 0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b, - 0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908, - 0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b, - 0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b, - 0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919, - 0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808, - 0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819, - 0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908, - 0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08, - 0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08, - 0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819, - 0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919, - 0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908, - 0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b, - 0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908, - 0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b, - 0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908, - 0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08, - 0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819, - 0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808, - 0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819, - 0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919, - 0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808, - 0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808, - 0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08, - 0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819, - 0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919, - 0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808, - 0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819, - 0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919, - 0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808, - 0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b, - 0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908, - 0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808, - 0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908, - 0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b, - 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908, - 0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b, - 0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908, - 0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b, - 0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908, - 0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08, - 0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908, - 0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b, - 0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908, - 0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08, - 0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819, - 0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919, - 0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808, - 0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19, - 0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b, - 0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919, - 0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808, - 0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819, - 0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908, - 0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919, - 0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808, - 0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808, - 0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b, - 0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919, - 0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808, - 0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b, - 0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808, - 0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919, - 0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b, - 0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08, - 0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919, - 0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808, - 0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b, - 0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908, - 0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808, - 0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808, - 0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808, - 0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908, - 0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808, - 0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808, - 0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b, - 0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908, - 0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808, - 0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808, - 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819, - 0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919, - 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b, - 0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808, - 0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819, - 0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b, - 0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908, - 0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08, - 0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908, - 0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919, - 0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819, - 0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908, - 0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b, - 0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808, - 0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819, - 0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908, - 0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919, - 0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808, - 0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808, - 0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808, - 0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919, - 0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908, - 0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908, - 0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08, - 0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819, - 0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b, - 0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808, - 0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819, - 0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908, - 0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819, - 0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808, - 0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808, - 0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b, - 0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908, - 0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808, - 0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908, - 0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819, - 0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819, - 0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808, - 0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b, - 0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b, - 0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819, - 0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b, - 0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b, - 0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b, - 0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819, - 0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19, - 0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819, - 0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908, - 0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808, - 0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b, -}; - -constexpr constant static uint32_t iq3xxs_grid[256] = { - 0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414, - 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14, - 0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404, - 0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e, - 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c, - 0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c, - 0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34, - 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c, - 0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c, - 0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04, - 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c, - 0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414, - 0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434, - 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c, - 0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e, - 0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24, - 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24, - 0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c, - 0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c, - 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14, - 0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414, - 0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e, - 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404, - 0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c, - 0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c, - 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14, - 0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c, - 0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c, - 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14, - 0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14, - 0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c, - 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, -}; - -constexpr constant static uint32_t iq3s_grid[512] = { - 0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305, - 0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905, - 0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09, - 0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b, - 0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b, - 0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d, - 0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03, - 0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505, - 0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03, - 0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901, - 0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d, - 0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303, - 0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501, - 0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105, - 0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505, - 0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101, - 0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707, - 0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b, - 0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01, - 0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f, - 0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305, - 0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103, - 0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509, - 0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503, - 0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b, - 0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f, - 0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f, - 0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f, - 0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109, - 0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f, - 0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509, - 0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501, - 0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303, - 0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f, - 0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907, - 0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703, - 0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03, - 0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01, - 0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01, - 0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903, - 0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505, - 0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b, - 0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107, - 0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509, - 0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303, - 0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103, - 0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05, - 0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b, - 0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f, - 0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701, - 0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909, - 0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305, - 0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d, - 0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b, - 0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d, - 0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307, - 0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09, - 0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309, - 0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709, - 0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f, - 0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303, - 0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503, - 0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b, - 0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101, -}; - -#define NGRID_IQ1S 512 -constexpr constant static uint64_t iq1s_grid[NGRID_IQ1S] = { - 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000, - 0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01, - 0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100, - 0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00, - 0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101, - 0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100, - 0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00, - 0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff, - 0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000, - 0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000, - 0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001, - 0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff, - 0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01, - 0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001, - 0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00, - 0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001, - 0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100, - 0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000, - 0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000, - 0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000, - 0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff, - 0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff, - 0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01, - 0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100, - 0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff, - 0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000, - 0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101, - 0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff, - 0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff, - 0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001, - 0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01, - 0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101, - 0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100, - 0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00, - 0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001, - 0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff, - 0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000, - 0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000, - 0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100, - 0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100, - 0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01, - 0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff, - 0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101, - 0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000, - 0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff, - 0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000, - 0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff, - 0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00, - 0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101, - 0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000, - 0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000, - 0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000, - 0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100, - 0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000, - 0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001, - 0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff, - 0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000, - 0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000, - 0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000, - 0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000, - 0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff, - 0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000, - 0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001, - 0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01, - 0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100, - 0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000, - 0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00, - 0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100, - 0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000, - 0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001, - 0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00, - 0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff, - 0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100, - 0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff, - 0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000, - 0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff, - 0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff, - 0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00, - 0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001, - 0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001, - 0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01, - 0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000, - 0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101, - 0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00, - 0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100, - 0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101, - 0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101, - 0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000, - 0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff, - 0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff, - 0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101, - 0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff, - 0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101, - 0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001, - 0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff, - 0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff, - 0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01, - 0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff, - 0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100, - 0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001, - 0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00, - 0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff, - 0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff, - 0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000, - 0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000, - 0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101, - 0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001, - 0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000, - 0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101, - 0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000, - 0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001, - 0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000, - 0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100, - 0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000, - 0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000, - 0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100, - 0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff, - 0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff, - 0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00, - 0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101, - 0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000, - 0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00, - 0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000, - 0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff, - 0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101, - 0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff, - 0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00, - 0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff, -}; - -constexpr constant static uint8_t ksigns_iq2xs[128] = { - 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15, - 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159, - 160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175, - 48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63, - 192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207, - 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95, - 96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111, - 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255, -}; - -constexpr constant static uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128}; - void kernel_mul_mv_iq2_xxs_f32_impl( device const void * src0, device const float * src1, diff --git a/ggml-quants.c b/ggml-quants.c index 9dcb76def46db..5bb46def3d2b8 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -1,6 +1,9 @@ #include "ggml-quants.h" #include "ggml-impl.h" +#define GGML_COMMON_IMPL_C +#include "ggml-common.h" + #include #include #include @@ -3327,711 +3330,6 @@ size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int // ====================== "True" 2-bit (de)-quantization -static const uint64_t iq2xxs_grid[256] = { - 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, - 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808, - 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819, - 0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, 0x08080808192b0819, - 0x08080808192b1908, 0x080808082b080808, 0x080808082b08082b, 0x080808082b082b2b, - 0x080808082b2b082b, 0x0808081908080819, 0x0808081908081908, 0x0808081908190808, - 0x0808081908191919, 0x0808081919080808, 0x080808192b081908, 0x080808192b192b08, - 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b082b082b, 0x0808082b2b08082b, - 0x0808190808080819, 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819, - 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, 0x0808190819082b08, - 0x08081908192b0808, 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, - 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08, - 0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, 0x080819192b080808, - 0x080819192b190819, 0x0808192b08082b19, 0x0808192b08190808, 0x0808192b19080808, - 0x0808192b2b081908, 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919, - 0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, 0x08082b0819080819, - 0x08082b0819081908, 0x08082b0819190808, 0x08082b081919082b, 0x08082b082b082b08, - 0x08082b1908081908, 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908, - 0x0819080808080819, 0x0819080808081908, 0x0819080808190808, 0x08190808082b0819, - 0x0819080819080808, 0x08190808192b0808, 0x081908082b081908, 0x081908082b190808, - 0x081908082b191919, 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808, - 0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, 0x0819082b082b1908, - 0x0819082b19081919, 0x0819190808080808, 0x0819190808082b08, 0x08191908082b0808, - 0x08191908082b1919, 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08, - 0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, 0x08192b0808080819, - 0x08192b0808081908, 0x08192b0808190808, 0x08192b0819080808, 0x08192b082b080819, - 0x08192b1908080808, 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819, - 0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, 0x082b080819081908, - 0x082b0808192b0819, 0x082b08082b080808, 0x082b08082b08082b, 0x082b0819082b2b19, - 0x082b081919082b08, 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819, - 0x082b190808081908, 0x082b190808190808, 0x082b190819080808, 0x082b19081919192b, - 0x082b191908080808, 0x082b191919080819, 0x082b1919192b1908, 0x082b192b2b190808, - 0x082b2b0808082b08, 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908, - 0x1908080808080819, 0x1908080808081908, 0x1908080808190808, 0x1908080808192b08, - 0x19080808082b0819, 0x19080808082b1908, 0x1908080819080808, 0x1908080819082b08, - 0x190808081919192b, 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908, - 0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, 0x19080819192b0819, - 0x190808192b080808, 0x190808192b081919, 0x1908082b08080819, 0x1908082b08190808, - 0x1908082b19082b08, 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808, - 0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, 0x190819082b192b19, - 0x190819190819082b, 0x19081919082b1908, 0x1908192b08080808, 0x19082b0808080819, - 0x19082b0808081908, 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, - 0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, 0x19082b192b08082b, - 0x19082b2b19081919, 0x19082b2b2b190808, 0x1919080808080808, 0x1919080808082b08, - 0x1919080808190819, 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808, - 0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, 0x191908192b2b1908, - 0x1919082b2b190819, 0x191919082b190808, 0x191919082b19082b, 0x1919191908082b2b, - 0x1919192b08080819, 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819, - 0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, 0x19192b2b08082b08, - 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, 0x192b0808192b2b08, - 0x192b081908080808, 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808, - 0x192b190808080808, 0x192b190808081919, 0x192b191908190808, 0x192b19190819082b, - 0x192b19192b081908, 0x192b2b081908082b, 0x2b08080808080808, 0x2b0808080808082b, - 0x2b08080808082b2b, 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908, - 0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, 0x2b08190808080819, - 0x2b08190808081908, 0x2b08190808190808, 0x2b08190808191919, 0x2b08190819080808, - 0x2b081908192b0808, 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908, - 0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, 0x2b082b080808082b, - 0x2b082b1908081908, 0x2b082b2b08190819, 0x2b19080808081908, 0x2b19080808190808, - 0x2b190808082b1908, 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b, - 0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, 0x2b191908082b082b, - 0x2b19190819081908, 0x2b19191919190819, 0x2b192b082b080819, 0x2b192b19082b0808, - 0x2b2b08080808082b, 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19, - 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908, -}; - -static const uint64_t iq2xs_grid[512] = { - 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, - 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b, - 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919, - 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b, - 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919, - 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808, - 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819, - 0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819, - 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, - 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b, - 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b, - 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908, - 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908, - 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919, - 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808, - 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919, - 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908, - 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, - 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, - 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08, - 0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808, - 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808, - 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819, - 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908, - 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819, - 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808, - 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b, - 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819, - 0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819, - 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808, - 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908, - 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19, - 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b, - 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b, - 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919, - 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808, - 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819, - 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819, - 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b, - 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908, - 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808, - 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819, - 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808, - 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, - 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808, - 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808, - 0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908, - 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908, - 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808, - 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b, - 0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819, - 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, - 0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908, - 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808, - 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908, - 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919, - 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08, - 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19, - 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b, - 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b, - 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808, - 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08, - 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b, - 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908, - 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b, - 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908, - 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, - 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808, - 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808, - 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08, - 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819, - 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919, - 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808, - 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808, - 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819, - 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819, - 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908, - 0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908, - 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b, - 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908, - 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908, - 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908, - 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808, - 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, - 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819, - 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819, - 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808, - 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b, - 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819, - 0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819, - 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08, - 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808, - 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19, - 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919, - 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, - 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19, - 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b, - 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808, - 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b, - 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b, - 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, - 0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b, - 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808, - 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819, - 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808, - 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808, - 0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08, - 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b, - 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19, - 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08, - 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919, - 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08, - 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08, - 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908, - 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908, - 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b, - 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908, - 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808, - 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b, - 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808, - 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808, - 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19, - 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08, - 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808, - 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b, - 0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808, - 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b, - 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b, -}; - -static const uint64_t iq2s_grid[1024] = { - 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, - 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b, - 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919, - 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b, - 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919, - 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x08080808192b192b, - 0x08080808192b2b19, 0x080808082b080808, 0x080808082b08082b, 0x080808082b081919, - 0x080808082b082b08, 0x080808082b190819, 0x080808082b191908, 0x080808082b2b0808, - 0x080808082b2b1919, 0x080808082b2b2b2b, 0x0808081908080819, 0x0808081908081908, - 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, 0x080808190819082b, - 0x0808081908191919, 0x0808081908192b08, 0x08080819082b0819, 0x08080819082b1908, - 0x0808081919080808, 0x080808191908082b, 0x0808081919081919, 0x0808081919082b08, - 0x0808081919190819, 0x0808081919191908, 0x080808191919192b, 0x0808081919192b19, - 0x08080819192b0808, 0x08080819192b1919, 0x08080819192b2b08, 0x080808192b080819, - 0x080808192b081908, 0x080808192b190808, 0x080808192b19082b, 0x080808192b191919, - 0x080808192b2b0819, 0x080808192b2b1908, 0x0808082b08080808, 0x0808082b0808082b, - 0x0808082b08081919, 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, - 0x0808082b082b0808, 0x0808082b082b2b2b, 0x0808082b19080819, 0x0808082b19081908, - 0x0808082b1908192b, 0x0808082b19082b19, 0x0808082b19190808, 0x0808082b19191919, - 0x0808082b2b080808, 0x0808082b2b081919, 0x0808082b2b082b2b, 0x0808082b2b191908, - 0x0808082b2b2b082b, 0x0808190808080819, 0x0808190808081908, 0x080819080808192b, - 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, 0x0808190808191919, - 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, 0x08081908082b192b, - 0x08081908082b2b19, 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, - 0x0808190819082b08, 0x0808190819082b2b, 0x0808190819190819, 0x0808190819191908, - 0x080819081919192b, 0x0808190819192b19, 0x08081908192b0808, 0x08081908192b082b, - 0x08081908192b1919, 0x080819082b080819, 0x080819082b081908, 0x080819082b08192b, - 0x080819082b082b19, 0x080819082b190808, 0x080819082b191919, 0x080819082b192b08, - 0x080819082b2b0819, 0x080819082b2b1908, 0x0808191908080808, 0x080819190808082b, - 0x0808191908081919, 0x0808191908082b08, 0x0808191908082b2b, 0x0808191908190819, - 0x0808191908191908, 0x080819190819192b, 0x0808191908192b19, 0x08081919082b0808, - 0x08081919082b1919, 0x08081919082b2b08, 0x0808191919080819, 0x0808191919081908, - 0x080819191908192b, 0x0808191919082b19, 0x0808191919190808, 0x080819191919082b, - 0x0808191919191919, 0x0808191919192b08, 0x08081919192b0819, 0x08081919192b1908, - 0x080819192b080808, 0x080819192b08082b, 0x080819192b081919, 0x080819192b082b08, - 0x080819192b190819, 0x080819192b191908, 0x080819192b2b0808, 0x0808192b08080819, - 0x0808192b08081908, 0x0808192b0808192b, 0x0808192b08082b19, 0x0808192b08190808, - 0x0808192b08191919, 0x0808192b19080808, 0x0808192b19081919, 0x0808192b19082b08, - 0x0808192b19190819, 0x0808192b19191908, 0x0808192b192b0808, 0x0808192b2b080819, - 0x0808192b2b081908, 0x0808192b2b190808, 0x08082b0808080808, 0x08082b080808082b, - 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808190819, 0x08082b0808191908, - 0x08082b080819192b, 0x08082b0808192b19, 0x08082b08082b0808, 0x08082b08082b1919, - 0x08082b08082b2b2b, 0x08082b0819080819, 0x08082b0819081908, 0x08082b081908192b, - 0x08082b0819082b19, 0x08082b0819190808, 0x08082b081919082b, 0x08082b0819191919, - 0x08082b0819192b08, 0x08082b08192b0819, 0x08082b08192b1908, 0x08082b082b080808, - 0x08082b082b081919, 0x08082b082b191908, 0x08082b082b2b2b2b, 0x08082b1908080819, - 0x08082b1908081908, 0x08082b1908190808, 0x08082b190819082b, 0x08082b1908191919, - 0x08082b1908192b08, 0x08082b19082b0819, 0x08082b1919080808, 0x08082b1919081919, - 0x08082b1919082b08, 0x08082b1919190819, 0x08082b1919191908, 0x08082b19192b0808, - 0x08082b192b080819, 0x08082b192b190808, 0x08082b2b08080808, 0x08082b2b08190819, - 0x08082b2b08191908, 0x08082b2b082b082b, 0x08082b2b082b2b08, 0x08082b2b082b2b2b, - 0x08082b2b19190808, 0x08082b2b2b192b19, 0x0819080808080819, 0x0819080808081908, - 0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, - 0x0819080808191919, 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, - 0x08190808082b192b, 0x0819080819080808, 0x081908081908082b, 0x0819080819081919, - 0x0819080819082b08, 0x0819080819190819, 0x0819080819191908, 0x081908081919192b, - 0x0819080819192b19, 0x08190808192b0808, 0x08190808192b082b, 0x08190808192b1919, - 0x08190808192b2b08, 0x081908082b080819, 0x081908082b081908, 0x081908082b08192b, - 0x081908082b190808, 0x081908082b191919, 0x081908082b192b08, 0x081908082b2b0819, - 0x081908082b2b1908, 0x0819081908080808, 0x081908190808082b, 0x0819081908081919, - 0x0819081908082b08, 0x0819081908082b2b, 0x0819081908190819, 0x0819081908191908, - 0x081908190819192b, 0x0819081908192b19, 0x08190819082b0808, 0x08190819082b082b, - 0x08190819082b1919, 0x08190819082b2b08, 0x0819081919080819, 0x0819081919081908, - 0x081908191908192b, 0x0819081919082b19, 0x0819081919190808, 0x081908191919082b, - 0x0819081919191919, 0x0819081919192b08, 0x08190819192b0819, 0x08190819192b1908, - 0x081908192b080808, 0x081908192b08082b, 0x081908192b081919, 0x081908192b082b08, - 0x081908192b190819, 0x081908192b191908, 0x0819082b08080819, 0x0819082b08081908, - 0x0819082b08082b19, 0x0819082b08190808, 0x0819082b08191919, 0x0819082b082b0819, - 0x0819082b082b1908, 0x0819082b19080808, 0x0819082b19081919, 0x0819082b19190819, - 0x0819082b19191908, 0x0819082b2b080819, 0x0819082b2b081908, 0x0819082b2b190808, - 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08, - 0x0819190808190819, 0x0819190808191908, 0x081919080819192b, 0x0819190808192b19, - 0x08191908082b0808, 0x08191908082b1919, 0x08191908082b2b08, 0x0819190819080819, - 0x0819190819081908, 0x081919081908192b, 0x0819190819082b19, 0x0819190819190808, - 0x081919081919082b, 0x0819190819191919, 0x0819190819192b08, 0x08191908192b0819, - 0x08191908192b1908, 0x081919082b080808, 0x081919082b08082b, 0x081919082b081919, - 0x081919082b082b08, 0x081919082b190819, 0x081919082b191908, 0x081919082b2b0808, - 0x0819191908080819, 0x0819191908081908, 0x081919190808192b, 0x0819191908082b19, - 0x0819191908190808, 0x081919190819082b, 0x0819191908191919, 0x0819191908192b08, - 0x08191919082b0819, 0x08191919082b1908, 0x0819191919080808, 0x081919191908082b, - 0x0819191919081919, 0x0819191919082b08, 0x0819191919190819, 0x0819191919191908, - 0x08191919192b0808, 0x081919192b080819, 0x081919192b081908, 0x081919192b190808, - 0x0819192b08080808, 0x0819192b08081919, 0x0819192b08082b08, 0x0819192b08190819, - 0x0819192b08191908, 0x0819192b082b0808, 0x0819192b19080819, 0x0819192b19081908, - 0x0819192b19190808, 0x0819192b2b080808, 0x0819192b2b2b2b2b, 0x08192b0808080819, - 0x08192b0808081908, 0x08192b080808192b, 0x08192b0808082b19, 0x08192b0808190808, - 0x08192b0808191919, 0x08192b0808192b08, 0x08192b08082b0819, 0x08192b0819080808, - 0x08192b081908082b, 0x08192b0819081919, 0x08192b0819082b08, 0x08192b0819190819, - 0x08192b0819191908, 0x08192b08192b0808, 0x08192b082b080819, 0x08192b082b081908, - 0x08192b1908080808, 0x08192b190808082b, 0x08192b1908081919, 0x08192b1908082b08, - 0x08192b1908190819, 0x08192b1908191908, 0x08192b19082b0808, 0x08192b1919080819, - 0x08192b1919081908, 0x08192b1919190808, 0x08192b19192b2b19, 0x08192b192b2b082b, - 0x08192b2b08081908, 0x08192b2b08190808, 0x08192b2b19080808, 0x08192b2b1919192b, - 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, 0x082b080808082b08, - 0x082b080808190819, 0x082b080808191908, 0x082b08080819192b, 0x082b080808192b19, - 0x082b0808082b0808, 0x082b0808082b1919, 0x082b0808082b2b2b, 0x082b080819080819, - 0x082b080819081908, 0x082b080819190808, 0x082b08081919082b, 0x082b080819191919, - 0x082b0808192b1908, 0x082b08082b080808, 0x082b08082b082b2b, 0x082b08082b191908, - 0x082b08082b2b2b2b, 0x082b081908080819, 0x082b081908081908, 0x082b081908190808, - 0x082b08190819082b, 0x082b081908191919, 0x082b0819082b0819, 0x082b081919080808, - 0x082b08191908082b, 0x082b081919081919, 0x082b081919190819, 0x082b081919191908, - 0x082b0819192b0808, 0x082b08192b080819, 0x082b08192b081908, 0x082b08192b190808, - 0x082b082b08080808, 0x082b082b08082b2b, 0x082b082b082b082b, 0x082b082b082b2b08, - 0x082b082b082b2b2b, 0x082b082b19081908, 0x082b082b19190808, 0x082b082b2b082b08, - 0x082b082b2b082b2b, 0x082b082b2b2b2b08, 0x082b190808080819, 0x082b190808081908, - 0x082b19080808192b, 0x082b190808082b19, 0x082b190808190808, 0x082b190808191919, - 0x082b190808192b08, 0x082b1908082b0819, 0x082b1908082b1908, 0x082b190819080808, - 0x082b19081908082b, 0x082b190819081919, 0x082b190819082b08, 0x082b190819190819, - 0x082b190819191908, 0x082b1908192b0808, 0x082b19082b080819, 0x082b19082b081908, - 0x082b19082b190808, 0x082b191908080808, 0x082b191908081919, 0x082b191908082b08, - 0x082b191908190819, 0x082b191908191908, 0x082b1919082b0808, 0x082b191919080819, - 0x082b191919081908, 0x082b191919190808, 0x082b1919192b192b, 0x082b19192b080808, - 0x082b192b08080819, 0x082b192b08081908, 0x082b192b08190808, 0x082b192b19080808, - 0x082b192b19192b19, 0x082b2b0808080808, 0x082b2b0808081919, 0x082b2b0808190819, - 0x082b2b0808191908, 0x082b2b0819080819, 0x082b2b0819081908, 0x082b2b0819190808, - 0x082b2b082b082b2b, 0x082b2b082b2b2b2b, 0x082b2b1908080819, 0x082b2b1908081908, - 0x082b2b1908190808, 0x082b2b192b191919, 0x082b2b2b08082b2b, 0x082b2b2b082b082b, - 0x082b2b2b192b1908, 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, - 0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, - 0x190808080819082b, 0x1908080808191919, 0x1908080808192b08, 0x1908080808192b2b, - 0x19080808082b0819, 0x19080808082b1908, 0x19080808082b192b, 0x1908080819080808, - 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, 0x1908080819082b2b, - 0x1908080819190819, 0x1908080819191908, 0x190808081919192b, 0x1908080819192b19, - 0x19080808192b0808, 0x19080808192b082b, 0x19080808192b1919, 0x190808082b080819, - 0x190808082b081908, 0x190808082b190808, 0x190808082b191919, 0x190808082b192b08, - 0x190808082b2b0819, 0x190808082b2b1908, 0x1908081908080808, 0x190808190808082b, - 0x1908081908081919, 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908, - 0x190808190819192b, 0x1908081908192b19, 0x19080819082b0808, 0x19080819082b082b, - 0x19080819082b1919, 0x1908081919080819, 0x1908081919081908, 0x190808191908192b, - 0x1908081919082b19, 0x1908081919190808, 0x190808191919082b, 0x1908081919191919, - 0x1908081919192b08, 0x19080819192b0819, 0x19080819192b1908, 0x190808192b080808, - 0x190808192b08082b, 0x190808192b081919, 0x190808192b082b08, 0x190808192b190819, - 0x190808192b191908, 0x190808192b2b0808, 0x1908082b08080819, 0x1908082b08081908, - 0x1908082b08190808, 0x1908082b0819082b, 0x1908082b08191919, 0x1908082b08192b08, - 0x1908082b082b1908, 0x1908082b19080808, 0x1908082b19081919, 0x1908082b19082b08, - 0x1908082b19190819, 0x1908082b19191908, 0x1908082b192b0808, 0x1908082b2b080819, - 0x1908082b2b081908, 0x1908190808080808, 0x190819080808082b, 0x1908190808081919, - 0x1908190808082b08, 0x1908190808082b2b, 0x1908190808190819, 0x1908190808191908, - 0x190819080819192b, 0x1908190808192b19, 0x19081908082b0808, 0x19081908082b082b, - 0x19081908082b1919, 0x19081908082b2b08, 0x1908190819080819, 0x1908190819081908, - 0x190819081908192b, 0x1908190819082b19, 0x1908190819190808, 0x190819081919082b, - 0x1908190819191919, 0x1908190819192b08, 0x19081908192b0819, 0x19081908192b1908, - 0x190819082b080808, 0x190819082b08082b, 0x190819082b081919, 0x190819082b082b08, - 0x190819082b190819, 0x190819082b191908, 0x190819082b2b0808, 0x1908191908080819, - 0x1908191908081908, 0x190819190808192b, 0x1908191908082b19, 0x1908191908190808, - 0x190819190819082b, 0x1908191908191919, 0x1908191908192b08, 0x19081919082b0819, - 0x19081919082b1908, 0x1908191919080808, 0x190819191908082b, 0x1908191919081919, - 0x1908191919082b08, 0x1908191919190819, 0x1908191919191908, 0x19081919192b0808, - 0x19081919192b2b2b, 0x190819192b080819, 0x190819192b081908, 0x190819192b190808, - 0x1908192b08080808, 0x1908192b0808082b, 0x1908192b08081919, 0x1908192b08082b08, - 0x1908192b08190819, 0x1908192b08191908, 0x1908192b082b0808, 0x1908192b19080819, - 0x1908192b19081908, 0x1908192b19190808, 0x1908192b2b080808, 0x1908192b2b2b1919, - 0x19082b0808080819, 0x19082b0808081908, 0x19082b0808082b19, 0x19082b0808190808, - 0x19082b080819082b, 0x19082b0808191919, 0x19082b0808192b08, 0x19082b08082b0819, - 0x19082b08082b1908, 0x19082b0819080808, 0x19082b081908082b, 0x19082b0819081919, - 0x19082b0819082b08, 0x19082b0819190819, 0x19082b0819191908, 0x19082b08192b0808, - 0x19082b082b081908, 0x19082b082b190808, 0x19082b1908080808, 0x19082b190808082b, - 0x19082b1908081919, 0x19082b1908082b08, 0x19082b1908190819, 0x19082b1908191908, - 0x19082b19082b0808, 0x19082b1919080819, 0x19082b1919081908, 0x19082b1919190808, - 0x19082b192b080808, 0x19082b192b19192b, 0x19082b2b08080819, 0x19082b2b08081908, - 0x19082b2b08190808, 0x19082b2b19080808, 0x1919080808080808, 0x191908080808082b, - 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, 0x1919080808191908, - 0x191908080819192b, 0x1919080808192b19, 0x19190808082b0808, 0x19190808082b082b, - 0x19190808082b1919, 0x19190808082b2b08, 0x1919080819080819, 0x1919080819081908, - 0x191908081908192b, 0x1919080819082b19, 0x1919080819190808, 0x191908081919082b, - 0x1919080819191919, 0x1919080819192b08, 0x19190808192b0819, 0x19190808192b1908, - 0x191908082b080808, 0x191908082b08082b, 0x191908082b081919, 0x191908082b082b08, - 0x191908082b190819, 0x191908082b191908, 0x1919081908080819, 0x1919081908081908, - 0x191908190808192b, 0x1919081908082b19, 0x1919081908190808, 0x191908190819082b, - 0x1919081908191919, 0x1919081908192b08, 0x19190819082b0819, 0x19190819082b1908, - 0x1919081919080808, 0x191908191908082b, 0x1919081919081919, 0x1919081919082b08, - 0x1919081919190819, 0x1919081919191908, 0x19190819192b0808, 0x191908192b080819, - 0x191908192b081908, 0x191908192b190808, 0x1919082b08080808, 0x1919082b08081919, - 0x1919082b08082b08, 0x1919082b08190819, 0x1919082b08191908, 0x1919082b082b0808, - 0x1919082b19080819, 0x1919082b19081908, 0x1919082b19190808, 0x1919082b192b2b19, - 0x1919082b2b080808, 0x1919190808080819, 0x1919190808081908, 0x191919080808192b, - 0x1919190808082b19, 0x1919190808190808, 0x191919080819082b, 0x1919190808191919, - 0x1919190808192b08, 0x19191908082b0819, 0x19191908082b1908, 0x1919190819080808, - 0x191919081908082b, 0x1919190819081919, 0x1919190819082b08, 0x1919190819190819, - 0x1919190819191908, 0x19191908192b0808, 0x191919082b080819, 0x191919082b081908, - 0x191919082b190808, 0x1919191908080808, 0x191919190808082b, 0x1919191908081919, - 0x1919191908082b08, 0x1919191908190819, 0x1919191908191908, 0x19191919082b0808, - 0x1919191919080819, 0x1919191919081908, 0x1919191919190808, 0x191919192b080808, - 0x1919192b08080819, 0x1919192b08081908, 0x1919192b08190808, 0x1919192b082b192b, - 0x1919192b19080808, 0x19192b0808080808, 0x19192b080808082b, 0x19192b0808081919, - 0x19192b0808082b08, 0x19192b0808190819, 0x19192b0808191908, 0x19192b08082b0808, - 0x19192b0819080819, 0x19192b0819081908, 0x19192b0819190808, 0x19192b0819192b2b, - 0x19192b082b080808, 0x19192b1908080819, 0x19192b1908081908, 0x19192b1908190808, - 0x19192b1919080808, 0x19192b2b08080808, 0x19192b2b08192b19, 0x19192b2b2b081919, - 0x19192b2b2b2b2b08, 0x192b080808080819, 0x192b080808081908, 0x192b08080808192b, - 0x192b080808190808, 0x192b08080819082b, 0x192b080808191919, 0x192b080808192b08, - 0x192b0808082b0819, 0x192b0808082b1908, 0x192b080819080808, 0x192b080819081919, - 0x192b080819082b08, 0x192b080819190819, 0x192b080819191908, 0x192b0808192b0808, - 0x192b08082b081908, 0x192b08082b190808, 0x192b081908080808, 0x192b08190808082b, - 0x192b081908081919, 0x192b081908082b08, 0x192b081908190819, 0x192b081908191908, - 0x192b0819082b0808, 0x192b081919080819, 0x192b081919081908, 0x192b081919190808, - 0x192b08192b080808, 0x192b08192b192b19, 0x192b082b08081908, 0x192b082b08190808, - 0x192b082b19080808, 0x192b082b1919192b, 0x192b082b2b2b0819, 0x192b190808080808, - 0x192b190808081919, 0x192b190808082b08, 0x192b190808190819, 0x192b190808191908, - 0x192b1908082b0808, 0x192b190819080819, 0x192b190819081908, 0x192b190819190808, - 0x192b19082b080808, 0x192b191908080819, 0x192b191908081908, 0x192b191908190808, - 0x192b191919080808, 0x192b191919082b2b, 0x192b1919192b2b08, 0x192b19192b19082b, - 0x192b192b08080808, 0x192b192b2b191908, 0x192b2b0808080819, 0x192b2b0808081908, - 0x192b2b0808190808, 0x192b2b08192b1919, 0x192b2b082b192b08, 0x192b2b1908080808, - 0x192b2b19082b2b2b, 0x192b2b2b1908082b, 0x192b2b2b2b2b0819, 0x2b08080808080808, - 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819, - 0x2b08080808191908, 0x2b08080808192b19, 0x2b080808082b0808, 0x2b080808082b1919, - 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808081919082b, - 0x2b08080819191919, 0x2b08080819192b08, 0x2b080808192b0819, 0x2b0808082b080808, - 0x2b0808082b081919, 0x2b0808082b190819, 0x2b0808082b191908, 0x2b08081908080819, - 0x2b08081908081908, 0x2b08081908082b19, 0x2b08081908190808, 0x2b0808190819082b, - 0x2b08081908191919, 0x2b08081908192b08, 0x2b080819082b0819, 0x2b080819082b1908, - 0x2b08081919080808, 0x2b0808191908082b, 0x2b08081919081919, 0x2b08081919082b08, - 0x2b08081919190819, 0x2b08081919191908, 0x2b0808192b080819, 0x2b0808192b081908, - 0x2b0808192b190808, 0x2b0808192b2b2b19, 0x2b08082b08080808, 0x2b08082b08081919, - 0x2b08082b08082b2b, 0x2b08082b08190819, 0x2b08082b08191908, 0x2b08082b19080819, - 0x2b08082b19081908, 0x2b08082b19190808, 0x2b08190808080819, 0x2b08190808081908, - 0x2b0819080808192b, 0x2b08190808082b19, 0x2b08190808190808, 0x2b0819080819082b, - 0x2b08190808191919, 0x2b08190808192b08, 0x2b081908082b0819, 0x2b08190819080808, - 0x2b0819081908082b, 0x2b08190819081919, 0x2b08190819082b08, 0x2b08190819190819, - 0x2b08190819191908, 0x2b081908192b0808, 0x2b0819082b080819, 0x2b0819082b081908, - 0x2b0819082b190808, 0x2b08191908080808, 0x2b0819190808082b, 0x2b08191908081919, - 0x2b08191908082b08, 0x2b08191908190819, 0x2b08191908191908, 0x2b081919082b0808, - 0x2b08191919080819, 0x2b08191919081908, 0x2b08191919190808, 0x2b0819192b080808, - 0x2b0819192b082b2b, 0x2b08192b08080819, 0x2b08192b08081908, 0x2b08192b08190808, - 0x2b08192b082b2b19, 0x2b08192b19080808, 0x2b082b0808080808, 0x2b082b0808081919, - 0x2b082b0808190819, 0x2b082b0808191908, 0x2b082b0819080819, 0x2b082b0819081908, - 0x2b082b0819190808, 0x2b082b082b2b082b, 0x2b082b1908080819, 0x2b082b1908081908, - 0x2b082b1919080808, 0x2b082b19192b1919, 0x2b082b2b082b082b, 0x2b082b2b19192b08, - 0x2b082b2b19192b2b, 0x2b082b2b2b08082b, 0x2b082b2b2b2b082b, 0x2b19080808080819, - 0x2b19080808081908, 0x2b19080808082b19, 0x2b19080808190808, 0x2b1908080819082b, - 0x2b19080808191919, 0x2b19080808192b08, 0x2b190808082b1908, 0x2b19080819080808, - 0x2b1908081908082b, 0x2b19080819081919, 0x2b19080819082b08, 0x2b19080819190819, - 0x2b19080819191908, 0x2b190808192b0808, 0x2b1908082b080819, 0x2b1908082b081908, - 0x2b1908082b190808, 0x2b19081908080808, 0x2b19081908081919, 0x2b19081908190819, - 0x2b19081908191908, 0x2b19081919080819, 0x2b19081919081908, 0x2b19081919190808, - 0x2b19081919192b2b, 0x2b19082b08080819, 0x2b19082b08081908, 0x2b19082b08190808, - 0x2b19082b19080808, 0x2b19082b2b2b192b, 0x2b19190808080808, 0x2b1919080808082b, - 0x2b19190808081919, 0x2b19190808082b08, 0x2b19190808190819, 0x2b19190808191908, - 0x2b191908082b0808, 0x2b19190819080819, 0x2b19190819081908, 0x2b19190819190808, - 0x2b1919082b080808, 0x2b1919082b19192b, 0x2b19191908080819, 0x2b19191908081908, - 0x2b19191908190808, 0x2b19191919080808, 0x2b1919192b192b08, 0x2b1919192b2b0819, - 0x2b19192b08080808, 0x2b19192b1908192b, 0x2b19192b192b1908, 0x2b192b0808080819, - 0x2b192b0808081908, 0x2b192b0808190808, 0x2b192b08082b192b, 0x2b192b0819080808, - 0x2b192b082b2b2b19, 0x2b192b1908080808, 0x2b192b1919082b19, 0x2b192b191919082b, - 0x2b192b2b2b190808, 0x2b2b080808080808, 0x2b2b080808081919, 0x2b2b080808082b2b, - 0x2b2b080808191908, 0x2b2b0808082b082b, 0x2b2b0808082b2b2b, 0x2b2b080819080819, - 0x2b2b080819081908, 0x2b2b080819190808, 0x2b2b08082b2b082b, 0x2b2b08082b2b2b2b, - 0x2b2b081919080808, 0x2b2b0819192b1919, 0x2b2b082b0808082b, 0x2b2b082b08082b2b, - 0x2b2b082b082b082b, 0x2b2b082b082b2b08, 0x2b2b082b082b2b2b, 0x2b2b082b2b08082b, - 0x2b2b082b2b082b08, 0x2b2b082b2b082b2b, 0x2b2b082b2b2b2b08, 0x2b2b190808080819, - 0x2b2b190808081908, 0x2b2b190808190808, 0x2b2b190819080808, 0x2b2b19082b082b19, - 0x2b2b19082b2b1908, 0x2b2b191908080808, 0x2b2b191908192b19, 0x2b2b192b19190819, - 0x2b2b2b0808082b2b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b082b, 0x2b2b2b1919191908, - 0x2b2b2b192b08192b, 0x2b2b2b2b08082b08, 0x2b2b2b2b08082b2b, 0x2b2b2b2b082b0808, - 0x2b2b2b2b082b082b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b2b082b08, 0x2b2b2b2b2b2b2b2b, -}; - -static const uint32_t iq3xxs_grid[256] = { - 0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414, - 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14, - 0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404, - 0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e, - 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c, - 0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c, - 0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34, - 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c, - 0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c, - 0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04, - 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c, - 0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414, - 0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434, - 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c, - 0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e, - 0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24, - 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24, - 0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c, - 0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c, - 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14, - 0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414, - 0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e, - 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404, - 0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c, - 0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c, - 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14, - 0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c, - 0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c, - 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14, - 0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14, - 0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c, - 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, -}; - -static const uint32_t iq3s_grid[512] = { - 0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305, - 0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905, - 0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09, - 0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b, - 0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b, - 0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d, - 0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03, - 0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505, - 0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03, - 0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901, - 0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d, - 0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303, - 0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501, - 0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105, - 0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505, - 0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101, - 0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707, - 0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b, - 0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01, - 0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f, - 0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305, - 0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103, - 0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509, - 0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503, - 0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b, - 0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f, - 0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f, - 0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f, - 0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109, - 0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f, - 0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509, - 0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501, - 0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303, - 0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f, - 0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907, - 0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703, - 0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03, - 0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01, - 0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01, - 0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903, - 0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505, - 0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b, - 0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107, - 0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509, - 0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303, - 0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103, - 0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05, - 0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b, - 0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f, - 0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701, - 0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909, - 0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305, - 0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d, - 0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b, - 0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d, - 0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307, - 0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09, - 0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309, - 0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709, - 0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f, - 0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303, - 0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503, - 0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b, - 0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101, -}; - -#define NGRID_IQ2XXS 512 -static const uint64_t iq1s_grid[NGRID_IQ2XXS] = { - 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000, - 0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01, - 0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100, - 0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00, - 0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101, - 0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100, - 0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00, - 0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff, - 0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000, - 0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000, - 0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001, - 0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff, - 0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01, - 0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001, - 0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00, - 0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001, - 0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100, - 0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000, - 0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000, - 0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000, - 0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff, - 0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff, - 0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01, - 0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100, - 0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff, - 0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000, - 0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101, - 0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff, - 0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff, - 0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001, - 0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01, - 0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101, - 0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100, - 0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00, - 0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001, - 0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff, - 0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000, - 0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000, - 0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100, - 0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100, - 0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01, - 0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff, - 0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101, - 0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000, - 0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff, - 0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000, - 0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff, - 0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00, - 0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101, - 0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000, - 0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000, - 0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000, - 0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100, - 0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000, - 0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001, - 0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff, - 0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000, - 0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000, - 0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000, - 0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000, - 0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff, - 0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000, - 0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001, - 0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01, - 0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100, - 0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000, - 0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00, - 0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100, - 0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000, - 0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001, - 0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00, - 0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff, - 0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100, - 0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff, - 0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000, - 0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff, - 0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff, - 0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00, - 0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001, - 0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001, - 0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01, - 0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000, - 0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101, - 0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00, - 0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100, - 0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101, - 0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101, - 0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000, - 0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff, - 0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff, - 0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101, - 0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff, - 0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101, - 0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001, - 0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff, - 0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff, - 0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01, - 0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff, - 0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100, - 0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001, - 0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00, - 0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff, - 0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff, - 0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000, - 0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000, - 0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101, - 0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001, - 0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000, - 0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101, - 0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000, - 0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001, - 0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000, - 0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100, - 0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000, - 0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000, - 0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100, - 0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff, - 0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff, - 0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00, - 0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101, - 0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000, - 0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00, - 0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000, - 0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff, - 0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101, - 0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff, - 0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00, - 0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff, - -}; - -static const uint8_t ksigns_iq2xs[128] = { - 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15, - 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159, - 160, 33, 34, 163, 36, 165, 166, 39, 40, 169, 170, 43, 172, 45, 46, 175, - 48, 177, 178, 51, 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63, - 192, 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, 78, 207, - 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, 219, 92, 221, 222, 95, - 96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111, - 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255, -}; - -static const uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128}; - void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; diff --git a/ggml-quants.h b/ggml-quants.h index 316e356876037..d2d25eb4441d2 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -1,9 +1,9 @@ #pragma once -#include "ggml-impl.h" - // GGML internal header +#include "ggml-impl.h" + #include #include diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index ddd951dd684a7..6fafa5282c8f9 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -3144,6 +3144,8 @@ namespace dpct } // COPY from DPCT head files +#define GGML_COMMON_IMPL_SYCL +#include "ggml-common.h" static int g_ggml_sycl_debug=0; #define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) printf(__VA_ARGS__);}while(0) @@ -4745,388 +4747,6 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri #endif } -static dpct::global_memory - iq2xxs_grid(sycl::range<1>(256), - { - 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, - 0x0808080808082b08, 0x0808080808082b2b, 0x0808080808190819, - 0x0808080808191908, 0x08080808082b0808, 0x08080808082b082b, - 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819, - 0x0808080819081908, 0x0808080819190808, 0x0808080819192b08, - 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808, - 0x080808082b08082b, 0x080808082b082b2b, 0x080808082b2b082b, - 0x0808081908080819, 0x0808081908081908, 0x0808081908190808, - 0x0808081908191919, 0x0808081919080808, 0x080808192b081908, - 0x080808192b192b08, 0x0808082b08080808, 0x0808082b0808082b, - 0x0808082b082b082b, 0x0808082b2b08082b, 0x0808190808080819, - 0x0808190808081908, 0x0808190808190808, 0x08081908082b0819, - 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, - 0x0808190819082b08, 0x08081908192b0808, 0x080819082b080819, - 0x080819082b081908, 0x080819082b190808, 0x080819082b2b1908, - 0x0808191908080808, 0x080819190808082b, 0x0808191908082b08, - 0x08081919082b0808, 0x080819191908192b, 0x08081919192b2b19, - 0x080819192b080808, 0x080819192b190819, 0x0808192b08082b19, - 0x0808192b08190808, 0x0808192b19080808, 0x0808192b2b081908, - 0x0808192b2b2b1908, 0x08082b0808080808, 0x08082b0808081919, - 0x08082b0808082b08, 0x08082b0808191908, 0x08082b08082b2b08, - 0x08082b0819080819, 0x08082b0819081908, 0x08082b0819190808, - 0x08082b081919082b, 0x08082b082b082b08, 0x08082b1908081908, - 0x08082b1919080808, 0x08082b2b0808082b, 0x08082b2b08191908, - 0x0819080808080819, 0x0819080808081908, 0x0819080808190808, - 0x08190808082b0819, 0x0819080819080808, 0x08190808192b0808, - 0x081908082b081908, 0x081908082b190808, 0x081908082b191919, - 0x0819081908080808, 0x0819081908082b08, 0x08190819082b0808, - 0x0819081919190808, 0x0819081919192b2b, 0x081908192b080808, - 0x0819082b082b1908, 0x0819082b19081919, 0x0819190808080808, - 0x0819190808082b08, 0x08191908082b0808, 0x08191908082b1919, - 0x0819190819082b19, 0x081919082b080808, 0x0819191908192b08, - 0x08191919192b082b, 0x0819192b08080808, 0x0819192b0819192b, - 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808, - 0x08192b0819080808, 0x08192b082b080819, 0x08192b1908080808, - 0x08192b1908081919, 0x08192b192b2b0808, 0x08192b2b19190819, - 0x082b080808080808, 0x082b08080808082b, 0x082b080808082b2b, - 0x082b080819081908, 0x082b0808192b0819, 0x082b08082b080808, - 0x082b08082b08082b, 0x082b0819082b2b19, 0x082b081919082b08, - 0x082b082b08080808, 0x082b082b0808082b, 0x082b190808080819, - 0x082b190808081908, 0x082b190808190808, 0x082b190819080808, - 0x082b19081919192b, 0x082b191908080808, 0x082b191919080819, - 0x082b1919192b1908, 0x082b192b2b190808, 0x082b2b0808082b08, - 0x082b2b08082b0808, 0x082b2b082b191908, 0x082b2b2b19081908, - 0x1908080808080819, 0x1908080808081908, 0x1908080808190808, - 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908, - 0x1908080819080808, 0x1908080819082b08, 0x190808081919192b, - 0x19080808192b0808, 0x190808082b080819, 0x190808082b081908, - 0x190808082b190808, 0x1908081908080808, 0x19080819082b0808, - 0x19080819192b0819, 0x190808192b080808, 0x190808192b081919, - 0x1908082b08080819, 0x1908082b08190808, 0x1908082b19082b08, - 0x1908082b1919192b, 0x1908082b192b2b08, 0x1908190808080808, - 0x1908190808082b08, 0x19081908082b0808, 0x190819082b080808, - 0x190819082b192b19, 0x190819190819082b, 0x19081919082b1908, - 0x1908192b08080808, 0x19082b0808080819, 0x19082b0808081908, - 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, - 0x19082b1908080808, 0x19082b1919192b08, 0x19082b19192b0819, - 0x19082b192b08082b, 0x19082b2b19081919, 0x19082b2b2b190808, - 0x1919080808080808, 0x1919080808082b08, 0x1919080808190819, - 0x1919080808192b19, 0x19190808082b0808, 0x191908082b080808, - 0x191908082b082b08, 0x1919081908081908, 0x191908191908082b, - 0x191908192b2b1908, 0x1919082b2b190819, 0x191919082b190808, - 0x191919082b19082b, 0x1919191908082b2b, 0x1919192b08080819, - 0x1919192b19191908, 0x19192b0808080808, 0x19192b0808190819, - 0x19192b0808192b19, 0x19192b08192b1908, 0x19192b1919080808, - 0x19192b2b08082b08, 0x192b080808081908, 0x192b080808190808, - 0x192b080819080808, 0x192b0808192b2b08, 0x192b081908080808, - 0x192b081919191919, 0x192b082b08192b08, 0x192b082b192b0808, - 0x192b190808080808, 0x192b190808081919, 0x192b191908190808, - 0x192b19190819082b, 0x192b19192b081908, 0x192b2b081908082b, - 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808082b2b, - 0x2b08080819080819, 0x2b0808082b08082b, 0x2b08081908081908, - 0x2b08081908192b08, 0x2b08081919080808, 0x2b08082b08190819, - 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, - 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, - 0x2b08191908080808, 0x2b0819191908192b, 0x2b0819192b191908, - 0x2b08192b08082b19, 0x2b08192b19080808, 0x2b08192b192b0808, - 0x2b082b080808082b, 0x2b082b1908081908, 0x2b082b2b08190819, - 0x2b19080808081908, 0x2b19080808190808, 0x2b190808082b1908, - 0x2b19080819080808, 0x2b1908082b2b0819, 0x2b1908190819192b, - 0x2b1908192b080808, 0x2b19082b19081919, 0x2b19190808080808, - 0x2b191908082b082b, 0x2b19190819081908, 0x2b19191919190819, - 0x2b192b082b080819, 0x2b192b19082b0808, 0x2b2b08080808082b, - 0x2b2b080819190808, 0x2b2b08082b081919, 0x2b2b081908082b19, - 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, - 0x2b2b2b1908081908, - }); - -static dpct::global_memory - iq2xs_grid(sycl::range<1>(512), - { - 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, - 0x0808080808082b08, 0x0808080808082b2b, 0x0808080808190819, - 0x0808080808191908, 0x080808080819192b, 0x0808080808192b19, - 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919, - 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, - 0x080808081908192b, 0x0808080819082b19, 0x0808080819190808, - 0x080808081919082b, 0x0808080819191919, 0x0808080819192b08, - 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808, - 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, - 0x080808082b190819, 0x080808082b191908, 0x080808082b192b19, - 0x080808082b2b0808, 0x0808081908080819, 0x0808081908081908, - 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, - 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, - 0x0808081908192b2b, 0x08080819082b0819, 0x08080819082b1908, - 0x0808081919080808, 0x080808191908082b, 0x0808081919081919, - 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908, - 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, - 0x080808192b081908, 0x080808192b190808, 0x0808082b08080808, - 0x0808082b0808082b, 0x0808082b08081919, 0x0808082b08082b08, - 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808, - 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, - 0x0808082b19191919, 0x0808082b2b080808, 0x0808082b2b082b2b, - 0x0808190808080819, 0x0808190808081908, 0x080819080808192b, - 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, - 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, - 0x08081908082b1908, 0x0808190819080808, 0x080819081908082b, - 0x0808190819081919, 0x0808190819082b08, 0x0808190819190819, - 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808, - 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, - 0x0808191908080808, 0x080819190808082b, 0x0808191908081919, - 0x0808191908082b08, 0x0808191908190819, 0x0808191908191908, - 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908, - 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, - 0x0808192b08080819, 0x0808192b08081908, 0x0808192b08190808, - 0x0808192b082b192b, 0x0808192b19080808, 0x0808192b1908082b, - 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b, - 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, - 0x08082b0808190819, 0x08082b0808191908, 0x08082b08082b0808, - 0x08082b08082b1919, 0x08082b0819080819, 0x08082b0819081908, - 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808, - 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, - 0x08082b1908081908, 0x08082b1908190808, 0x08082b1919080808, - 0x08082b192b080819, 0x08082b192b082b19, 0x08082b2b08080808, - 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b, - 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, - 0x081908080808192b, 0x0819080808082b19, 0x0819080808190808, - 0x081908080819082b, 0x0819080808191919, 0x0819080808192b08, - 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808, - 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, - 0x0819080819190819, 0x0819080819191908, 0x08190808192b0808, - 0x08190808192b2b2b, 0x081908082b080819, 0x081908082b081908, - 0x081908082b190808, 0x0819081908080808, 0x081908190808082b, - 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, - 0x0819081908191908, 0x08190819082b0808, 0x0819081919080819, - 0x0819081919081908, 0x0819081919190808, 0x081908192b080808, - 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819, - 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, - 0x0819082b19080808, 0x0819082b192b0808, 0x0819190808080808, - 0x081919080808082b, 0x0819190808081919, 0x0819190808082b08, - 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808, - 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, - 0x0819190819190808, 0x08191908192b1908, 0x081919082b080808, - 0x0819191908080819, 0x0819191908081908, 0x0819191908190808, - 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908, - 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, - 0x08192b0808190808, 0x08192b080819082b, 0x08192b0819080808, - 0x08192b0819191908, 0x08192b082b08192b, 0x08192b1908080808, - 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819, - 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, - 0x082b080808081919, 0x082b080808082b08, 0x082b080808082b2b, - 0x082b080808190819, 0x082b080808191908, 0x082b0808082b0808, - 0x082b080819080819, 0x082b080819081908, 0x082b080819190808, - 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, - 0x082b081908081908, 0x082b081908190808, 0x082b081919080808, - 0x082b081919082b08, 0x082b0819192b1919, 0x082b082b08080808, - 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08, - 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, - 0x082b1908082b2b19, 0x082b190819080808, 0x082b191908080808, - 0x082b191919080819, 0x082b19191919082b, 0x082b19192b192b19, - 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b, - 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, - 0x082b2b08082b0808, 0x082b2b0819191919, 0x082b2b082b082b08, - 0x082b2b082b2b082b, 0x082b2b19192b2b08, 0x082b2b192b190808, - 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b, - 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, - 0x1908080808081908, 0x190808080808192b, 0x1908080808082b19, - 0x1908080808190808, 0x190808080819082b, 0x1908080808191919, - 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908, - 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, - 0x1908080819082b08, 0x1908080819082b2b, 0x1908080819190819, - 0x1908080819191908, 0x19080808192b0808, 0x19080808192b1919, - 0x190808082b080819, 0x190808082b081908, 0x190808082b190808, - 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, - 0x1908081908082b08, 0x1908081908190819, 0x1908081908191908, - 0x19080819082b0808, 0x1908081919080819, 0x1908081919081908, - 0x1908081919190808, 0x190808192b080808, 0x190808192b081919, - 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, - 0x1908082b08190808, 0x1908082b0819082b, 0x1908082b082b2b19, - 0x1908082b19080808, 0x1908190808080808, 0x190819080808082b, - 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819, - 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, - 0x1908190819080819, 0x1908190819081908, 0x1908190819190808, - 0x190819082b080808, 0x190819082b191908, 0x1908191908080819, - 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908, - 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, - 0x1908192b08082b2b, 0x1908192b19081908, 0x1908192b19190808, - 0x19082b0808080819, 0x19082b0808081908, 0x19082b0808190808, - 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908, - 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, - 0x19082b1919081908, 0x19082b1919190808, 0x19082b19192b2b19, - 0x19082b2b08081908, 0x1919080808080808, 0x191908080808082b, - 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, - 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, - 0x1919080819080819, 0x1919080819081908, 0x1919080819190808, - 0x191908082b080808, 0x1919081908080819, 0x1919081908081908, - 0x1919081908190808, 0x1919081908191919, 0x1919081919080808, - 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, - 0x1919082b2b2b2b2b, 0x1919190808080819, 0x1919190808081908, - 0x1919190808190808, 0x19191908082b0819, 0x1919190819080808, - 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819, - 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, - 0x191919192b082b08, 0x1919192b082b0819, 0x1919192b192b2b08, - 0x1919192b2b2b0819, 0x19192b0808080808, 0x19192b0808191908, - 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19, - 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, - 0x19192b2b2b081919, 0x192b080808080819, 0x192b080808081908, - 0x192b080808190808, 0x192b080819080808, 0x192b080819191908, - 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19, - 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, - 0x192b082b2b19082b, 0x192b190808080808, 0x192b19080819192b, - 0x192b191908190808, 0x192b191919080808, 0x192b191919081919, - 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b, - 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, - 0x192b2b2b192b082b, 0x2b08080808080808, 0x2b0808080808082b, - 0x2b08080808081919, 0x2b08080808082b08, 0x2b08080808190819, - 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b, - 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, - 0x2b0808082b080808, 0x2b0808082b08082b, 0x2b0808082b2b2b08, - 0x2b0808082b2b2b2b, 0x2b08081908080819, 0x2b08081908081908, - 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808, - 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, - 0x2b08082b082b0808, 0x2b08082b2b080808, 0x2b08082b2b08082b, - 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08, 0x2b08190808080819, - 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b, - 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, - 0x2b0819082b082b19, 0x2b08191908080808, 0x2b08191919081908, - 0x2b0819192b2b1919, 0x2b08192b08192b08, 0x2b08192b192b2b2b, - 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919, - 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, - 0x2b082b082b2b2b08, 0x2b082b190808192b, 0x2b082b2b082b082b, - 0x2b082b2b2b080808, 0x2b082b2b2b082b08, 0x2b082b2b2b19192b, - 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908, - 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, - 0x2b1908082b081908, 0x2b19081908080808, 0x2b190819082b082b, - 0x2b190819192b1908, 0x2b19082b1919192b, 0x2b19082b2b082b19, - 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908, - 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, - 0x2b1919192b190808, 0x2b1919192b19082b, 0x2b19192b19080819, - 0x2b192b0819190819, 0x2b192b082b2b192b, 0x2b192b1919082b19, - 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808, - 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, - 0x2b2b0808082b0808, 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, - 0x2b2b081919190819, 0x2b2b081919192b19, 0x2b2b08192b2b192b, - 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08, - 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, - 0x2b2b190819080808, 0x2b2b19082b191919, 0x2b2b192b192b1919, - 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b, 0x2b2b2b08082b0808, - 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808, - 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, - 0x2b2b2b192b08192b, 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, - 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b, - }); - -static dpct::global_memory iq3xxs_grid( - sycl::range<1>(256), - { - 0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, - 0x04041404, 0x04041414, 0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, - 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14, 0x040c140c, 0x040c142c, - 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404, - 0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, - 0x04141c1c, 0x04141c3e, 0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, - 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c, 0x041c3e04, 0x04240c1c, - 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c, - 0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, - 0x043e0c24, 0x043e0c34, 0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, - 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c, 0x0c041c04, 0x0c041c14, - 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c, - 0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, - 0x0c14140c, 0x0c141c04, 0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, - 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c, 0x0c24042c, 0x0c242c04, - 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414, - 0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, - 0x14041414, 0x14041434, 0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, - 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c, 0x140c1c04, 0x140c341c, - 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e, - 0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, - 0x141c0c04, 0x141c0c24, 0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, - 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24, 0x143e040c, 0x143e041c, - 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c, - 0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, - 0x1c0c1404, 0x1c0c1c0c, 0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, - 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14, 0x1c1c0c0c, 0x1c1c1c1c, - 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414, - 0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, - 0x24040424, 0x24040c3e, 0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, - 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404, 0x24143404, 0x24143434, - 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c, - 0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, - 0x2c040c14, 0x2c04240c, 0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, - 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14, 0x2c1c0414, 0x2c1c2c1c, - 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c, - 0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, - 0x34043424, 0x340c140c, 0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, - 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14, 0x34341c1c, 0x343e041c, - 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14, - 0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, - 0x3e1c0404, 0x3e1c0c2c, 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, - 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, - }); - -static dpct::global_memory ksigns_iq2xs( - sycl::range<1>(128), - { - 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, - 141, 142, 15, 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, - 154, 27, 156, 29, 30, 159, 160, 33, 34, 163, 36, 165, 166, - 39, 40, 169, 170, 43, 172, 45, 46, 175, 48, 177, 178, 51, - 180, 53, 54, 183, 184, 57, 58, 187, 60, 189, 190, 63, 192, - 65, 66, 195, 68, 197, 198, 71, 72, 201, 202, 75, 204, 77, - 78, 207, 80, 209, 210, 83, 212, 85, 86, 215, 216, 89, 90, - 219, 92, 221, 222, 95, 96, 225, 226, 99, 228, 101, 102, 231, - 232, 105, 106, 235, 108, 237, 238, 111, 240, 113, 114, 243, 116, - 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255, - }); - -static dpct::global_memory - ksigns64(sycl::range<1>(128), - { - 0x0000000000000000, 0xff000000000000ff, 0xff0000000000ff00, - 0x000000000000ffff, 0xff00000000ff0000, 0x0000000000ff00ff, - 0x0000000000ffff00, 0xff00000000ffffff, 0xff000000ff000000, - 0x00000000ff0000ff, 0x00000000ff00ff00, 0xff000000ff00ffff, - 0x00000000ffff0000, 0xff000000ffff00ff, 0xff000000ffffff00, - 0x00000000ffffffff, 0xff0000ff00000000, 0x000000ff000000ff, - 0x000000ff0000ff00, 0xff0000ff0000ffff, 0x000000ff00ff0000, - 0xff0000ff00ff00ff, 0xff0000ff00ffff00, 0x000000ff00ffffff, - 0x000000ffff000000, 0xff0000ffff0000ff, 0xff0000ffff00ff00, - 0x000000ffff00ffff, 0xff0000ffffff0000, 0x000000ffffff00ff, - 0x000000ffffffff00, 0xff0000ffffffffff, 0xff00ff0000000000, - 0x0000ff00000000ff, 0x0000ff000000ff00, 0xff00ff000000ffff, - 0x0000ff0000ff0000, 0xff00ff0000ff00ff, 0xff00ff0000ffff00, - 0x0000ff0000ffffff, 0x0000ff00ff000000, 0xff00ff00ff0000ff, - 0xff00ff00ff00ff00, 0x0000ff00ff00ffff, 0xff00ff00ffff0000, - 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0xff00ff00ffffffff, - 0x0000ffff00000000, 0xff00ffff000000ff, 0xff00ffff0000ff00, - 0x0000ffff0000ffff, 0xff00ffff00ff0000, 0x0000ffff00ff00ff, - 0x0000ffff00ffff00, 0xff00ffff00ffffff, 0xff00ffffff000000, - 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0xff00ffffff00ffff, - 0x0000ffffffff0000, 0xff00ffffffff00ff, 0xff00ffffffffff00, - 0x0000ffffffffffff, 0xffff000000000000, 0x00ff0000000000ff, - 0x00ff00000000ff00, 0xffff00000000ffff, 0x00ff000000ff0000, - 0xffff000000ff00ff, 0xffff000000ffff00, 0x00ff000000ffffff, - 0x00ff0000ff000000, 0xffff0000ff0000ff, 0xffff0000ff00ff00, - 0x00ff0000ff00ffff, 0xffff0000ffff0000, 0x00ff0000ffff00ff, - 0x00ff0000ffffff00, 0xffff0000ffffffff, 0x00ff00ff00000000, - 0xffff00ff000000ff, 0xffff00ff0000ff00, 0x00ff00ff0000ffff, - 0xffff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, - 0xffff00ff00ffffff, 0xffff00ffff000000, 0x00ff00ffff0000ff, - 0x00ff00ffff00ff00, 0xffff00ffff00ffff, 0x00ff00ffffff0000, - 0xffff00ffffff00ff, 0xffff00ffffffff00, 0x00ff00ffffffffff, - 0x00ffff0000000000, 0xffffff00000000ff, 0xffffff000000ff00, - 0x00ffff000000ffff, 0xffffff0000ff0000, 0x00ffff0000ff00ff, - 0x00ffff0000ffff00, 0xffffff0000ffffff, 0xffffff00ff000000, - 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0xffffff00ff00ffff, - 0x00ffff00ffff0000, 0xffffff00ffff00ff, 0xffffff00ffffff00, - 0x00ffff00ffffffff, 0xffffffff00000000, 0x00ffffff000000ff, - 0x00ffffff0000ff00, 0xffffffff0000ffff, 0x00ffffff00ff0000, - 0xffffffff00ff00ff, 0xffffffff00ffff00, 0x00ffffff00ffffff, - 0x00ffffffff000000, 0xffffffffff0000ff, 0xffffffffff00ff00, - 0x00ffffffff00ffff, 0xffffffffffff0000, 0x00ffffffffff00ff, - 0x00ffffffffffff00, 0xffffffffffffffff, - }); -//#endif - -static dpct::global_memory - kmask_iq2xs(sycl::range<1>(8), {1, 2, 4, 8, 16, 32, 64, 128}); - template static void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy, const sycl::nd_item<3> &item_ct1, diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index 2c391e6415ca1..83e6359b1dcef 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -94,6 +94,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # src/ggml-alloc.c -> ggml-alloc.c # src/ggml-backend-impl.h -> ggml-backend-impl.h # src/ggml-backend.c -> ggml-backend.c + # src/ggml-common.h -> ggml-common.h # src/ggml-cuda.cu -> ggml-cuda.cu # src/ggml-cuda.h -> ggml-cuda.h # src/ggml-impl.h -> ggml-impl.h @@ -126,6 +127,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then -e 's/src\/ggml-alloc\.c/ggml-alloc.c/g' \ -e 's/src\/ggml-backend-impl\.h/ggml-backend-impl.h/g' \ -e 's/src\/ggml-backend\.c/ggml-backend.c/g' \ + -e 's/src\/ggml-common\.h/ggml-common.h/g' \ -e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \ -e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \ -e 's/src\/ggml-impl\.h/ggml-impl.h/g' \ diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index feb34bbc8bb8d..f1e6f0e57ce0d 100755 --- a/scripts/sync-ggml.sh +++ b/scripts/sync-ggml.sh @@ -4,6 +4,7 @@ cp -rpv ../ggml/src/ggml.c ./ggml.c cp -rpv ../ggml/src/ggml-alloc.c ./ggml-alloc.c cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml-backend-impl.h cp -rpv ../ggml/src/ggml-backend.c ./ggml-backend.c +cp -rpv ../ggml/src/ggml-common.h ./ggml-common.h cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h cp -rpv ../ggml/src/ggml-impl.h ./ggml-impl.h From 0db32beaf09d90b8959d3d0cc493ed1e45685353 Mon Sep 17 00:00:00 2001 From: Alexey Parfenov Date: Sat, 9 Mar 2024 11:16:53 +0000 Subject: [PATCH 11/39] server : fix passing prompt as tokens (#5955) * server: fix passing prompt as tokens * Update examples/server/server.cpp --------- Co-authored-by: Georgi Gerganov --- examples/server/server.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index aedf0afc603c3..8cff514f2a98a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -852,7 +852,16 @@ struct server_context { // infill slot.params.input_prefix = json_value(data, "input_prefix", default_params.input_prefix); slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix); - slot.prompt = json_value(data, "prompt", std::string("")); + + // get prompt + { + const auto & prompt = data.find("prompt"); + if (prompt == data.end()) { + slot.prompt = ""; + } else { + slot.prompt = *prompt; + } + } // penalize user-provided tokens { From 2c4f566c88322ebf2f9bd11b01b5ebdaa0130b89 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Mar 2024 14:17:11 +0200 Subject: [PATCH 12/39] tests : gitignore ggml-common.h --- tests/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/.gitignore b/tests/.gitignore index 9427cf13d036f..620a48ee4449b 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -1,3 +1,4 @@ * !*.* *.o +ggml-common.h From fb215c3832236fec7380c4fb618bd7154cb196ef Mon Sep 17 00:00:00 2001 From: SeungWon Jeong <65549245+redlion0929@users.noreply.github.com> Date: Sat, 9 Mar 2024 21:27:58 +0900 Subject: [PATCH 13/39] server : normalize embeddings (#5956) * output normalize embedding in '/v1/embeddings' * common : reuse llama_embd_normalize * common : better normalize impl --------- Co-authored-by: Georgi Gerganov --- common/common.cpp | 15 +++++++++++++++ common/common.h | 7 +++++++ examples/embedding/embedding.cpp | 14 +------------- examples/server/server.cpp | 8 +++++++- 4 files changed, 30 insertions(+), 14 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index d7f650ef486e9..16ef4d7f74dd9 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1852,3 +1852,18 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) { printf("\n=== Done dumping\n"); } + +void llama_embd_normalize(const float * inp, float * out, int n) { + double sum = 0.0; + for (int i = 0; i < n; i++) { + sum += inp[i] * inp[i]; + } + sum = sqrt(sum); + + const float norm = sum > 0.0 ? 1.0f / sum : 0.0f; + + for (int i = 0; i < n; i++) { + out[i] = inp[i] * norm; + } +} + diff --git a/common/common.h b/common/common.h index 977ce419ff93b..f8d82b8713c87 100644 --- a/common/common.h +++ b/common/common.h @@ -260,3 +260,10 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80); // Dump the KV cache view showing individual sequences in each cell (long output). void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40); + +// +// Embedding utils +// + +void llama_embd_normalize(const float * inp, float * out, int n); + diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index ff5883da6ba27..a553ae1c3f35d 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -23,17 +23,6 @@ static void batch_add_seq(llama_batch & batch, const std::vector & toke } } -static void normalize(const float * vec, float * out, int n) { - float norm = 0; - for (int i = 0; i < n; i++) { - norm += vec[i] * vec[i]; - } - norm = sqrt(norm); - for (int i = 0; i < n; i++) { - out[i] = vec[i] / norm; - } -} - static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) { // clear previous kv_cache values (irrelevant for embeddings) llama_kv_cache_clear(ctx); @@ -44,7 +33,6 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu fprintf(stderr, "%s : failed to decode\n", __func__); } - // normalize on copy for (int i = 0; i < batch.n_tokens; i++) { if (!batch.logits[i]) { continue; @@ -61,7 +49,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu } float * out = output + batch.seq_id[i][0] * n_embd; - normalize(embd, out, n_embd); + llama_embd_normalize(embd, out, n_embd); } } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 8cff514f2a98a..796f3499c9877 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1327,6 +1327,8 @@ struct server_context { const int n_embd = llama_n_embd(model); + std::vector embd_res(n_embd, 0.0f); + for (int i = 0; i < batch.n_tokens; ++i) { if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) { continue; @@ -1350,8 +1352,10 @@ struct server_context { continue; } + llama_embd_normalize(embd, embd_res.data(), n_embd); + res.data = json { - {"embedding", std::vector(embd, embd + n_embd)}, + {"embedding", embd_res}, }; } @@ -3354,6 +3358,8 @@ int main(int argc, char ** argv) { // get the result server_task_result result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); + + // append to the responses responses.push_back(result.data); } From 97c09585d65a95864773b4d25d66d0f708baf38d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Mar 2024 15:47:47 +0200 Subject: [PATCH 14/39] server : clarify some items in the readme (#5957) * server : clarify some items in the readme * server : fix typo --- examples/server/README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 3abb1abe3b92b..23606b32a2c81 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -195,7 +195,11 @@ node index.js *Options:* - `prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. If the prompt is a string or an array with the first element given as a string, a `bos` token is inserted in the front like `main` does. + `prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, if `cache_prompt` is `true`, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. A `BOS` token is inserted at the start, if all of the following conditions are true: + + - The prompt is a string or an array with the first element given as a string + - The model's `tokenizer.ggml.add_bos_token` metadata is `true` + - The system prompt is empty `temperature`: Adjust the randomness of the generated text (default: 0.8). @@ -308,7 +312,7 @@ Notice that each `probs` is an array of length `n_probs`. `content`: Set the text to tokenize. - Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`. + Note that a special `BOS` token is never inserted. - **POST** `/detokenize`: Convert tokens to text. From 5b09797321430f08caf0473143a962916ab2ea89 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Mar 2024 15:53:59 +0200 Subject: [PATCH 15/39] ggml : remove old quantization functions (#5942) * ggml : remove old quantization functions ggml-ci * ggml : simplify ggml_quantize_chunk ggml-ci * ggml : restrict correctness ggml-ci * ggml : remove hist data from the quantization API ggml-ci * tests : remove hist usage in test-backend-ops ggml-ci * vulkan : remove hist and fix typo --- examples/benchmark/benchmark-matmult.cpp | 6 +- examples/llava/clip.cpp | 55 +--- ggml-quants.c | 143 +++------- ggml-quants.h | 42 +-- ggml-vulkan.cpp | 40 +-- ggml.c | 339 +++-------------------- ggml.h | 23 +- llama.cpp | 48 +--- tests/test-backend-ops.cpp | 3 +- 9 files changed, 131 insertions(+), 568 deletions(-) diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index e89f3de2fd397..47cb16c69d536 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -189,12 +189,10 @@ int main(int argc, char ** argv) { int32_t nelements = sizex*sizey; - std::vector hist_cur(1 << 4, 0); - // Set up a the benchmark matrices // printf("Creating new tensor q11 & Running quantize\n"); struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); - ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], hist_cur.data(), nullptr); + ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], nullptr); // Set up a the compute graph // printf("Creating new tensor q31\n"); @@ -207,7 +205,7 @@ int main(int argc, char ** argv) { // Set up a second graph computation to make sure we override the CPU cache lines // printf("Creating new tensor q12 & Running quantize\n"); struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey); - ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], hist_cur.data(), nullptr); + ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], nullptr); // printf("Creating new tensor q32\n"); struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2); diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index ef9e4ba7a6b5a..6653b815d93a1 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1862,7 +1862,6 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i std::vector work(512); std::vector conv_buf(512); - std::vector hist_all(1 << 4, 0); size_t total_size_org = 0; size_t total_size_new = 0; @@ -1917,48 +1916,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i } new_data = work.data(); - std::vector hist_cur(1 << 4, 0); - - switch (new_type) { - case GGML_TYPE_Q4_0: { - new_size = ggml_quantize_q4_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q4_1: { - new_size = ggml_quantize_q4_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q5_0: { - new_size = ggml_quantize_q5_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q5_1: { - new_size = ggml_quantize_q5_1(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q8_0: { - new_size = ggml_quantize_q8_0(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q2_K: { - new_size = ggml_quantize_q2_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q3_K: { - new_size = ggml_quantize_q3_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q4_K: { - new_size = ggml_quantize_q4_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q5_K: { - new_size = ggml_quantize_q5_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q6_K: { - new_size = ggml_quantize_q6_K(f32_data, new_data, n_elms, cur->ne[0], hist_cur.data()); - } break; - default: { - fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, new_type); - return false; - } - } - - for (size_t j = 0; j < hist_cur.size(); ++j) { - hist_all[j] += hist_cur[j]; - } + new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr); } else { new_type = cur->type; new_data = cur->data; @@ -1993,17 +1951,6 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i { printf("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); - - int64_t sum_all = 0; - for (size_t i = 0; i < hist_all.size(); ++i) { - sum_all += hist_all[i]; - } - - printf("%s: hist: ", __func__); - for (size_t i = 0; i < hist_all.size(); ++i) { - printf("%5.3f ", hist_all[i] / (float)sum_all); - } - printf("\n"); } return true; diff --git a/ggml-quants.c b/ggml-quants.c index 5bb46def3d2b8..4ee4e0604b02c 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -1704,16 +1704,6 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) { quantize_row_q2_K_reference(x, vy, k); } -size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { - (void)hist; // TODO: collect histograms - - for (int j = 0; j < n; j += k) { - block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K; - quantize_row_q2_K_reference(src + j, y, k); - } - return (n/QK_K*sizeof(block_q2_K)); -} - static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights, uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, float rmin, float rdelta, int nstep, bool use_mad) { @@ -1966,8 +1956,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri } } -size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { - (void)hist; +size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row); if (!quant_weights) { quantize_row_q2_K_reference(src, dst, nrow*n_per_row); @@ -2186,16 +2175,6 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) { quantize_row_q3_K_reference(x, vy, k); } -size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { - (void)hist; // TODO: collect histograms - - for (int j = 0; j < n; j += k) { - block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K; - quantize_row_q3_K_reference(src + j, y, k); - } - return (n/QK_K*sizeof(block_q3_K)); -} - static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) { #if QK_K != 256 (void)quant_weights; @@ -2285,8 +2264,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri #endif } -size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { - (void)hist; +size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row); if (!quant_weights) { quantize_row_q3_K_reference(src, dst, nrow*n_per_row); @@ -2456,17 +2434,6 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) { quantize_row_q4_K_reference(x, y, k); } -size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { - assert(k % QK_K == 0); - (void)hist; // TODO: collect histograms - - for (int j = 0; j < n; j += k) { - block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K; - quantize_row_q4_K_reference(src + j, y, k); - } - return (n/QK_K*sizeof(block_q4_K)); -} - static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) { #if QK_K != 256 (void)quant_weights; @@ -2545,8 +2512,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri #endif } -size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { - (void)hist; +size_t quantize_q4_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row); if (!quant_weights) { quantize_row_q4_K_reference(src, dst, nrow*n_per_row); @@ -2757,17 +2723,6 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) { quantize_row_q5_K_reference(x, y, k); } -size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { - assert(k % QK_K == 0); - (void)hist; // TODO: collect histograms - - for (int j = 0; j < n; j += k) { - block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K; - quantize_row_q5_K_reference(src + j, y, k); - } - return (n/QK_K*sizeof(block_q5_K)); -} - static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) { #if QK_K != 256 (void)quant_weights; @@ -2866,8 +2821,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri #endif } -size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { - (void)hist; +size_t quantize_q5_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row); if (!quant_weights) { quantize_row_q5_K_reference(src, dst, nrow*n_per_row); @@ -3020,17 +2974,6 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) { quantize_row_q6_K_reference(x, y, k); } -size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK_K == 0); - (void)hist; // TODO: collect histograms - - for (int j = 0; j < n; j += k) { - block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K; - quantize_row_q6_K_reference(src + j, y, k); - } - return (n/QK_K*sizeof(block_q6_K)); -} - static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) { #if QK_K != 256 (void)quant_weights; @@ -3120,8 +3063,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri #endif } -size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { - (void)hist; +size_t quantize_q6_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row); if (!quant_weights) { quantize_row_q6_K_reference(src, dst, nrow*n_per_row); @@ -3165,9 +3107,10 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri } } -size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { +size_t quantize_q4_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { if (!quant_weights) { - return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist); + quantize_row_q4_0_reference(src, dst, nrow*n_per_row); + return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row); } size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row); char * qrow = (char *)dst; @@ -3209,9 +3152,10 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri } } -size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { +size_t quantize_q4_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { if (!quant_weights) { - return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist); + quantize_row_q4_1_reference(src, dst, nrow*n_per_row); + return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row); } size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row); char * qrow = (char *)dst; @@ -3262,9 +3206,10 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri } } -size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { +size_t quantize_q5_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { if (!quant_weights) { - return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist); + quantize_row_q5_0_reference(src, dst, nrow*n_per_row); + return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row); } size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row); char * qrow = (char *)dst; @@ -3314,9 +3259,10 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri } } -size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { +size_t quantize_q5_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { if (!quant_weights) { - return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist); + quantize_row_q5_1_reference(src, dst, nrow*n_per_row); + return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row); } size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row); char * qrow = (char *)dst; @@ -3328,6 +3274,13 @@ size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int return nrow * row_size; } +size_t quantize_q8_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { + (void)quant_weights; // not used + const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row); + quantize_row_q8_0_reference(src, dst, nrow*n_per_row); + return nrow * row_size; +} + // ====================== "True" 2-bit (de)-quantization void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) { @@ -9373,7 +9326,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void #endif } -void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -9620,7 +9573,7 @@ static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { } #endif -void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { +void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); UNUSED(nrc); @@ -10220,7 +10173,7 @@ void iq2xs_init_impl(enum ggml_type type) { int * kmap_q2xs; uint16_t * kneighbors_q2xs; - printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size); + //printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size); uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t)); for (int k = 0; k < grid_size; ++k) { int8_t * pos = (int8_t *)(the_grid + k); @@ -10275,7 +10228,7 @@ void iq2xs_init_impl(enum ggml_type type) { } num_neighbors += n; } - printf("%s: %d neighbours in total\n", __func__, num_neighbors); + //printf("%s: %d neighbours in total\n", __func__, num_neighbors); kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t)); iq2_data[gindex].neighbours = kneighbors_q2xs; int counter = 0; @@ -10698,8 +10651,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v } } -size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { - (void)hist; +size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); int nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -10711,8 +10663,7 @@ size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, return nrow * nblock * sizeof(block_iq2_xxs); } -size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { - (void)hist; +size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); int nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -10816,7 +10767,7 @@ void iq3xs_init_impl(int grid_size) { int * kmap_q3xs; uint16_t * kneighbors_q3xs; - printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size); + //printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size); uint32_t * the_grid = (uint32_t *)malloc(grid_size*sizeof(uint32_t)); for (int k = 0; k < grid_size; ++k) { int8_t * pos = (int8_t *)(the_grid + k); @@ -10871,7 +10822,7 @@ void iq3xs_init_impl(int grid_size) { } num_neighbors += n; } - printf("%s: %d neighbours in total\n", __func__, num_neighbors); + //printf("%s: %d neighbours in total\n", __func__, num_neighbors); kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t)); iq3_data[gindex].neighbours = kneighbors_q3xs; int counter = 0; @@ -11154,8 +11105,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v } } -size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { - (void)hist; +size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); int nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -11361,8 +11311,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo } #define IQ3S_BLOCK_SIZE 32 -size_t quantize_iq3_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { - (void)hist; +size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); int nblock = n_per_row/QK_K; float scales[QK_K/IQ3S_BLOCK_SIZE]; @@ -11392,7 +11341,7 @@ void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) { void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) { assert(k % QK_K == 0); - quantize_iq3_s(x, y, 1, k, NULL, NULL); + quantize_iq3_s(x, y, 1, k, NULL); } @@ -11587,8 +11536,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy } } -size_t quantize_iq1_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { - (void)hist; +size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); int nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -11613,7 +11561,7 @@ static inline int best_index_int8(int n, const int8_t * val, float x) { return x - val[mu-1] < val[mu] - x ? mu-1 : mu; } -static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * GGML_RESTRICT x, +static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * restrict x, ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l, float * scales, float * weight, uint8_t * L, const int8_t * values, @@ -11721,8 +11669,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block } } -size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { - (void)hist; +size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK4_NL == 0); int nblock = n_per_row/QK4_NL; char * qrow = (char *)dst; @@ -11752,14 +11699,13 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) { void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) { assert(k % QK4_NL == 0); - quantize_iq4_nl(x, y, 1, k, NULL, NULL); + quantize_iq4_nl(x, y, 1, k, NULL); } -size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { +size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { #if QK_K == 64 - return quantize_iq4_nl(src, dst, nrow, n_per_row, hist, quant_weights); + return quantize_iq4_nl(src, dst, nrow, n_per_row, quant_weights); #else - (void)hist; GGML_ASSERT(n_per_row%QK_K == 0); int nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -11788,7 +11734,7 @@ void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) { void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) { assert(k % QK_K == 0); - quantize_iq4_xs(x, y, 1, k, NULL, NULL); + quantize_iq4_xs(x, y, 1, k, NULL); } // =============================== 2.5625 bpw @@ -11961,8 +11907,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy } } -size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { - (void)hist; +size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); int nblock = n_per_row/QK_K; char * qrow = (char *)dst; @@ -11976,7 +11921,7 @@ size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, in void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) { assert(k % QK_K == 0); - quantize_iq2_s(x, y, 1, k, NULL, NULL); + quantize_iq2_s(x, y, 1, k, NULL); } void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) { diff --git a/ggml-quants.h b/ggml-quants.h index d2d25eb4441d2..47dd52856422a 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -261,6 +261,7 @@ void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGM void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k); void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k); void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k); + void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k); void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k); void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int k); @@ -280,6 +281,7 @@ void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); + void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); @@ -300,6 +302,7 @@ void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRI void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); + void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); @@ -321,6 +324,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); + void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); @@ -330,26 +334,26 @@ void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); -// // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization") -// -size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_iq2_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_iq4_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_iq3_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_q5_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_q6_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_q4_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); -size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); + +size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); void iq2xs_init_impl(enum ggml_type type); void iq2xs_free_impl(enum ggml_type type); diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 5a1b3f4776181..d41aa7d22f096 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -4102,45 +4102,7 @@ static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool } static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) { - std::vector hist_cur(1 << 4, 0); - - switch(quant) { - case GGML_TYPE_F32: - memcpy(to, from, sizeof(float) * ne); - break; - case GGML_TYPE_Q4_0: - ggml_quantize_q4_0(from, to, ne, ne, hist_cur.data()); - break; - case GGML_TYPE_Q4_1: - ggml_quantize_q4_1(from, to, ne, ne, hist_cur.data()); - break; - case GGML_TYPE_Q5_0: - ggml_quantize_q5_0(from, to, ne, ne, hist_cur.data()); - break; - case GGML_TYPE_Q5_1: - ggml_quantize_q5_1(from, to, ne, ne, hist_cur.data()); - break; - case GGML_TYPE_Q8_0: - ggml_quantize_q8_0(from, to, ne, ne, hist_cur.data()); - break; - case GGML_TYPE_Q2_K: - ggml_quantize_q2_K(from, to, ne, ne, hist_cur.data()); - break; - case GGML_TYPE_Q3_K: - ggml_quantize_q3_K(from, to, ne, ne, hist_cur.data()); - break; - case GGML_TYPE_Q4_K: - ggml_quantize_q4_K(from, to, ne, ne, hist_cur.data()); - break; - case GGML_TYPE_Q5_K: - ggml_quantize_q5_K(from, to, ne, ne, hist_cur.data()); - break; - case GGML_TYPE_Q6_K: - ggml_quantize_q6_K(from, to, ne, ne, hist_cur.data()); - break; - default: - GGML_ASSERT(false); - } + ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr); } static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) { diff --git a/ggml.c b/ggml.c index 6eff98ab63f0c..80efa6f2ac900 100644 --- a/ggml.c +++ b/ggml.c @@ -20159,133 +20159,6 @@ void ggml_quantize_free(void) { ggml_critical_section_end(); } -size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK4_0 == 0); - const int nb = k / QK4_0; - - for (int b = 0; b < n; b += k) { - block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0; - - quantize_row_q4_0_reference(src + b, y, k); - - for (int i = 0; i < nb; i++) { - for (int j = 0; j < QK4_0; j += 2) { - const uint8_t vi0 = y[i].qs[j/2] & 0x0F; - const uint8_t vi1 = y[i].qs[j/2] >> 4; - - hist[vi0]++; - hist[vi1]++; - } - } - } - - return (n/QK4_0*sizeof(block_q4_0)); -} - -size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK4_1 == 0); - const int nb = k / QK4_1; - - for (int b = 0; b < n; b += k) { - block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1; - - quantize_row_q4_1_reference(src + b, y, k); - - for (int i = 0; i < nb; i++) { - for (int j = 0; j < QK4_1; j += 2) { - const uint8_t vi0 = y[i].qs[j/2] & 0x0F; - const uint8_t vi1 = y[i].qs[j/2] >> 4; - - hist[vi0]++; - hist[vi1]++; - } - } - } - - return (n/QK4_1*sizeof(block_q4_1)); -} - -size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK5_0 == 0); - const int nb = k / QK5_0; - - for (int b = 0; b < n; b += k) { - block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0; - - quantize_row_q5_0_reference(src + b, y, k); - - for (int i = 0; i < nb; i++) { - uint32_t qh; - memcpy(&qh, &y[i].qh, sizeof(qh)); - - for (int j = 0; j < QK5_0; j += 2) { - const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4; - const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12)); - - // cast to 16 bins - const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2; - const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2; - - hist[vi0]++; - hist[vi1]++; - } - } - } - - return (n/QK5_0*sizeof(block_q5_0)); -} - -size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK5_1 == 0); - const int nb = k / QK5_1; - - for (int b = 0; b < n; b += k) { - block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1; - - quantize_row_q5_1_reference(src + b, y, k); - - for (int i = 0; i < nb; i++) { - uint32_t qh; - memcpy(&qh, &y[i].qh, sizeof(qh)); - - for (int j = 0; j < QK5_1; j += 2) { - const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4; - const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12)); - - // cast to 16 bins - const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2; - const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2; - - hist[vi0]++; - hist[vi1]++; - } - } - } - - return (n/QK5_1*sizeof(block_q5_1)); -} - -size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK8_0 == 0); - const int nb = k / QK8_0; - - for (int b = 0; b < n; b += k) { - block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0; - - quantize_row_q8_0_reference(src + b, y, k); - - for (int i = 0; i < nb; i++) { - for (int j = 0; j < QK8_0; ++j) { - const int8_t vi = y[i].qs[j]; - - hist[vi/16 + 8]++; - } - } - } - - return (n/QK8_0*sizeof(block_q8_0)); -} - bool ggml_quantize_requires_imatrix(enum ggml_type type) { return type == GGML_TYPE_IQ2_XXS || @@ -20293,177 +20166,52 @@ bool ggml_quantize_requires_imatrix(enum ggml_type type) { type == GGML_TYPE_IQ1_S; } -size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, - int nrows, int n_per_row, int64_t * hist, const float * imatrix) { +size_t ggml_quantize_chunk( + enum ggml_type type, + const float * src, + void * dst, + int start, + int nrows, + int n_per_row, + const float * imatrix) { + const int n = nrows * n_per_row; + + if (ggml_quantize_requires_imatrix(type)) { + GGML_ASSERT(imatrix != NULL); + } + + GGML_ASSERT(start % type_traits[type].blck_size == 0); + GGML_ASSERT(start % n_per_row == 0); + ggml_quantize_init(type); // this is noop if already initialized + + const size_t start_row = start / n_per_row; + const size_t row_size = ggml_row_size(type, n_per_row); + size_t result = 0; - int n = nrows * n_per_row; + switch (type) { - case GGML_TYPE_Q4_0: - { - GGML_ASSERT(start % QK4_0 == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_q4_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_Q4_1: - { - GGML_ASSERT(start % QK4_1 == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_q4_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_Q5_0: - { - GGML_ASSERT(start % QK5_0 == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_Q5_1: - { - GGML_ASSERT(start % QK5_1 == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_Q8_0: - { - GGML_ASSERT(start % QK8_0 == 0); - block_q8_0 * block = (block_q8_0*)dst + start / QK8_0; - result = ggml_quantize_q8_0(src + start, block, n, n, hist); - } break; - case GGML_TYPE_Q2_K: - { - GGML_ASSERT(start % QK_K == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_Q3_K: - { - GGML_ASSERT(start % QK_K == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_q3_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_Q4_K: - { - GGML_ASSERT(start % QK_K == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_q4_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_Q5_K: - { - GGML_ASSERT(start % QK_K == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_q5_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_Q6_K: - { - GGML_ASSERT(start % QK_K == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_q6_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_IQ2_XXS: - { - GGML_ASSERT(start % QK_K == 0); - GGML_ASSERT(start % n_per_row == 0); - GGML_ASSERT(imatrix); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_IQ2_XS: - { - GGML_ASSERT(start % QK_K == 0); - GGML_ASSERT(start % n_per_row == 0); - GGML_ASSERT(imatrix); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_IQ3_XXS: - { - GGML_ASSERT(start % QK_K == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_IQ3_S: - { - GGML_ASSERT(start % QK_K == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_iq3_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_IQ2_S: - { - GGML_ASSERT(start % QK_K == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_iq2_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_IQ1_S: - { - GGML_ASSERT(start % QK_K == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_iq1_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; - case GGML_TYPE_IQ4_NL: + case GGML_TYPE_Q4_0: result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q4_1: result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q5_K: result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ3_S: result = quantize_iq3_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; #if QK_K == 64 - case GGML_TYPE_IQ4_XS: -#endif - { - GGML_ASSERT(start % QK4_NL == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; -#if QK_K != 64 - case GGML_TYPE_IQ4_XS: - { - GGML_ASSERT(start % QK_K == 0); - GGML_ASSERT(start % n_per_row == 0); - size_t start_row = start / n_per_row; - size_t row_size = ggml_row_size(type, n_per_row); - result = quantize_iq4_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); - GGML_ASSERT(result == row_size * nrows); - } break; + case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; +#else + case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; #endif case GGML_TYPE_F16: { @@ -20480,6 +20228,9 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i default: assert(false); } + + GGML_ASSERT(result == nrows * row_size); + return result; } diff --git a/ggml.h b/ggml.h index a13b0cec4edcf..1171088a96526 100644 --- a/ggml.h +++ b/ggml.h @@ -2194,25 +2194,18 @@ extern "C" { GGML_API void ggml_quantize_init(enum ggml_type type); GGML_API void ggml_quantize_free(void); - // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk - GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); - GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); - GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist); - GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist); - GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist); - - GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist); - GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist); - GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist); - GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist); - GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist); - // some quantization type cannot be used without an importance matrix GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type); // calls ggml_quantize_init internally (i.e. can allocate memory) - GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, - int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix); + GGML_API size_t ggml_quantize_chunk( + enum ggml_type type, + const float * src, + void * dst, + int start, + int nrows, + int n_per_row, + const float * imatrix); // // gguf diff --git a/llama.cpp b/llama.cpp index 8c147a42b1e53..c58a029f74faf 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11890,17 +11890,16 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty return new_type; } -static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, int64_t * hist_cur, const float * imatrix, std::vector & workers, const int nthread) { +static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector & workers, const int nthread) { std::mutex mutex; int counter = 0; size_t new_size = 0; if (nthread < 2) { // single-thread - return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur, imatrix); + return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix); } - auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size, + auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix]() { - std::array local_hist = {}; const int nrows_per_chunk = chunk_size / n_per_row; size_t local_size = 0; while (true) { @@ -11908,17 +11907,13 @@ static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const flo int first_row = counter; counter += nrows_per_chunk; if (first_row >= nrows) { if (local_size > 0) { - for (int j=0; j hist_all(1 << 4, 0); std::vector workers; workers.reserve(nthread); @@ -12175,7 +12169,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s work.resize(nelements * 4); // upper bound on size } new_data = work.data(); - std::array hist_cur = {}; const int n_per_row = tensor->ne[0]; const int nrows = nelements / n_per_row; @@ -12185,22 +12178,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s const int nchunk = (nelements + chunk_size - 1)/chunk_size; const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1; - new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, hist_cur.data(), imatrix, workers, nthread_use); - - LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); - int64_t tot_count = 0; - for (size_t i = 0; i < hist_cur.size(); i++) { - hist_all[i] += hist_cur[i]; - tot_count += hist_cur[i]; - } + new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix, workers, nthread_use); - if (tot_count > 0) { - LLAMA_LOG_INFO(" | hist: "); - for (size_t i = 0; i < hist_cur.size(); i++) { - LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements)); - } - } - LLAMA_LOG_INFO("\n"); + LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); } total_size_org += ggml_nbytes(tensor); total_size_new += new_size; @@ -12229,22 +12209,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); - // print histogram for all tensors - { - int64_t sum_all = 0; - for (size_t i = 0; i < hist_all.size(); i++) { - sum_all += hist_all[i]; - } - - if (sum_all > 0) { - LLAMA_LOG_INFO("%s: hist: ", __func__); - for (size_t i = 0; i < hist_all.size(); i++) { - LLAMA_LOG_INFO("%5.3f ", hist_all[i] / float(sum_all)); - } - LLAMA_LOG_INFO("\n"); - } - } - if (qs.n_fallback > 0) { LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n", __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 8a6999f21f1f6..fc5edcc4bf6fc 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -53,7 +53,6 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16) { GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0); std::vector dataq(ggml_row_size(tensor->type, size)); - int64_t hist[16]; std::vector imatrix(tensor->ne[0], 1.0f); // dummy importance matrix const float * im = imatrix.data(); if (!ggml_quantize_requires_imatrix(tensor->type)) { @@ -63,7 +62,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m im = nullptr; } } - ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], hist, im); + ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im); ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) { // This is going to create some weird integers though. From 58308a0ecce7cc261b802f4803c38d420063db21 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Mar 2024 17:34:15 +0200 Subject: [PATCH 16/39] server : fix metrics init (#5964) --- examples/server/server.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 796f3499c9877..2374b7e4ab232 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -341,7 +341,7 @@ struct server_slot { }; struct server_metrics { - const int64_t t_start = ggml_time_us(); + int64_t t_start = 0; uint64_t n_prompt_tokens_processed_total = 0; uint64_t t_prompt_processing_total = 0; @@ -354,14 +354,18 @@ struct server_metrics { uint64_t n_tokens_predicted = 0; uint64_t t_tokens_generation = 0; - void on_prompt_eval(const server_slot &slot) { + void init() { + t_start = ggml_time_us(); + } + + void on_prompt_eval(const server_slot & slot) { n_prompt_tokens_processed_total += slot.n_prompt_tokens_processed; n_prompt_tokens_processed += slot.n_prompt_tokens_processed; t_prompt_processing += slot.t_prompt_processing; t_prompt_processing_total += slot.t_prompt_processing; } - void on_prediction(const server_slot &slot) { + void on_prediction(const server_slot & slot) { n_tokens_predicted_total += slot.n_decoded; n_tokens_predicted += slot.n_decoded; t_tokens_generation += slot.t_token_generation; @@ -690,10 +694,11 @@ struct server_context { return res > 0; } - void initialize() { + void init() { const int32_t n_ctx_slot = n_ctx / params.n_parallel; LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}}); + for (int i = 0; i < params.n_parallel; i++) { server_slot slot; @@ -735,6 +740,8 @@ struct server_context { default_generation_settings_for_props["seed"] = -1; batch = llama_batch_init(n_ctx, 0, params.n_parallel); + + metrics.init(); } std::vector tokenize(const json & json_prompt, bool add_bos) const { @@ -2783,7 +2790,7 @@ int main(int argc, char ** argv) { state.store(SERVER_STATE_ERROR); return 1; } else { - ctx_server.initialize(); + ctx_server.init(); state.store(SERVER_STATE_READY); } From 8380ecfb219c2e73cc706fcc83935b7c806cc7c3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Mar 2024 17:36:20 +0200 Subject: [PATCH 17/39] ggml : fix unnecessary f32 -> f16 -> f32 casts (mmla) (#5951) --- ggml-quants.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index 4ee4e0604b02c..807c5e391069c 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -4059,10 +4059,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r const int8x16_t y1_h = vld1q_s8(b_y1->qs + 16); // mmla into int32x4_t - float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x0->d)*GGML_FP16_TO_FP32(b_y1->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y0->d), - GGML_FP16_TO_FP32(b_x1->d)*GGML_FP16_TO_FP32(b_y1->d)}; + float32x4_t scale = {GGML_FP16_TO_FP32(b_x0->d)*b_y0->d, + GGML_FP16_TO_FP32(b_x0->d)*b_y1->d, + GGML_FP16_TO_FP32(b_x1->d)*b_y0->d, + GGML_FP16_TO_FP32(b_x1->d)*b_y1->d}; int8x16_t l0 = vreinterpretq_s8_s64(vzip1q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); int8x16_t l1 = vreinterpretq_s8_s64(vzip2q_s64(vreinterpretq_s64_s8(x0_l), vreinterpretq_s64_s8(x1_l))); From 098dbaab449f5309a54871ba7e5acef72ae696de Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Mar 2024 18:14:13 +0200 Subject: [PATCH 18/39] readme : update hot topics --- README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index d7dba73e62267..e3ec0817ae0e5 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,11 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++ +> [!IMPORTANT] +> **Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962** +> +> Vote for which quantization type provides better responses, all other parameters being the same. + ### Recent API changes - [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_max_seq()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328 @@ -16,11 +21,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ### Hot topics -- The `api_like_OAI.py` script has been removed - use `server` instead ([#5766](https://github.com/ggerganov/llama.cpp/issues/5766#issuecomment-1969037761)) -- Support for chat templates: [Wiki (contributions welcome)](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) -- Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631 -- Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590 -- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216 +- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328 ---- From d894f352bf433157232dc8dc54eacd50014e898e Mon Sep 17 00:00:00 2001 From: slaren Date: Sat, 9 Mar 2024 19:55:54 +0100 Subject: [PATCH 19/39] perplexity : support using multiple sequences to allow larger batch sizes (#5946) * perplexity : support using multiple sequences to allow larger batch sizes ggml-ci * set cparams.n_parallel to the number of sequences * print tested n_ctx, add assert --- examples/perplexity/perplexity.cpp | 139 +++++++++++++++++++---------- llama.cpp | 22 +++-- 2 files changed, 108 insertions(+), 53 deletions(-) diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 52789ee631234..293eb52c33653 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -442,7 +442,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & return {tokens, std::exp(nll / count), logit_history, prob_history}; } -static results_perplexity perplexity(llama_context * ctx, const gpt_params & params) { +static results_perplexity perplexity(llama_context * ctx, const gpt_params & params, const int32_t n_ctx) { if (params.ppl_stride > 0) { return perplexity_v2(ctx, params); } @@ -453,7 +453,6 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par // BOS tokens will be added for each chunk before eval const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); - const int n_ctx = llama_n_ctx(ctx); std::ofstream logits_stream; if (!params.logits_file.empty()) { @@ -499,13 +498,19 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par double nll2 = 0.0; const int num_batches = (n_ctx + n_batch - 1) / n_batch; + const int n_seq = std::max(1, n_batch / n_ctx); + + GGML_ASSERT(n_batch < n_ctx || n_batch % n_ctx == 0); + GGML_ASSERT(params.n_ctx == n_seq * n_ctx); + + llama_batch batch = llama_batch_init(std::min(n_batch, n_ctx*n_seq), 0, 1); std::vector logits; if (num_batches > 1) { logits.reserve((size_t)n_ctx * n_vocab); } - fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); + fprintf(stderr, "%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq); std::vector workers(std::thread::hardware_concurrency() - 1); @@ -518,10 +523,26 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par log_probs.resize(n_ctx * nv); } - for (int i = 0; i < n_chunk; ++i) { + // We get the logits for all the tokens in the context window (params.n_ctx) + // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity, + // calculate the perplexity over the last half of the window (so the model always has + // some context to predict the token). + // + // We rely on the fact that attention in the forward pass only looks at previous + // tokens here, so the logits returned for each token are an accurate representation + // of what the model would have predicted at that point. + // + // Example, we have a context window of 512, we will compute perplexity for each of the + // last 256 tokens. Then, we split the input up into context window size chunks to + // process the entire prompt. + const int first = n_ctx/2; + + for (int i = 0; i < n_chunk; i += n_seq) { const int start = i * n_ctx; const int end = start + n_ctx; + const int n_seq_batch = std::min(n_seq, n_chunk - i); + const auto t_start = std::chrono::high_resolution_clock::now(); // clear the KV cache @@ -531,22 +552,37 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par const int batch_start = start + j * n_batch; const int batch_size = std::min(end - batch_start, n_batch); - // save original token and restore it after eval - const auto token_org = tokens[batch_start]; + batch.n_tokens = 0; + for (int seq = 0; seq < n_seq_batch; seq++) { + int seq_start = batch_start + seq*n_ctx; - // add BOS token for the first batch of each chunk - if (add_bos && j == 0) { - tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); + // save original token and restore it after eval + const auto token_org = tokens[seq_start]; + + // add BOS token for the first batch of each chunk + if (add_bos && j == 0) { + tokens[seq_start] = llama_token_bos(llama_get_model(ctx)); + } + + for (int k = 0; k < batch_size; ++k) { + const int idx = seq*n_ctx + k; + batch.token[idx] = tokens[seq_start + k]; + batch.pos[idx] = j*n_batch + k; + batch.n_seq_id[idx] = 1; + batch.seq_id[idx][0] = seq; + batch.logits[idx] = batch.pos[idx] >= first ? 1 : 0; + } + batch.n_tokens += batch_size; + + // restore the original token in case it was set to BOS + tokens[seq_start] = token_org; } - if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { + if (llama_decode(ctx, batch)) { fprintf(stderr, "%s : failed to eval\n", __func__); return {tokens, -1, logit_history, prob_history}; } - // restore the original token in case it was set to BOS - tokens[batch_start] = token_org; - if (num_batches > 1) { const auto * batch_logits = llama_get_logits(ctx); logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); @@ -558,7 +594,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par if (i == 0) { const float t_total = std::chrono::duration(t_end - t_start).count(); fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); - int total_seconds = (int)(t_total * n_chunk); + int total_seconds = (int)(t_total*n_chunk/n_seq); if (total_seconds >= 60*60) { fprintf(stderr, "%d hours ", total_seconds / (60*60)); total_seconds = total_seconds % (60*60); @@ -566,37 +602,31 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); } - // We get the logits for all the tokens in the context window (params.n_ctx) - // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity, - // calculate the perplexity over the last half of the window (so the model always has - // some context to predict the token). - // - // We rely on the fact that attention in the forward pass only looks at previous - // tokens here, so the logits returned for each token are an accurate representation - // of what the model would have predicted at that point. - // - // Example, we have a context window of 512, we will compute perplexity for each of the - // last 256 tokens. Then, we split the input up into context window size chunks to - // process the entire prompt. - const int first = n_ctx/2; - const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); - if (!params.logits_file.empty()) { - process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, - workers, log_probs, nll, nll2); - } else { - process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, - workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); - } - count += n_ctx - first - 1; - - // perplexity is e^(average negative log-likelihood) - if (params.ppl_output_type == 0) { - printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); - } else { - double av = nll/count; - double av2 = nll2/count - av*av; - if (av2 > 0) av2 = sqrt(av2/(count-1)); - printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2); + for (int seq = 0; seq < n_seq_batch; seq++) { + const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx); + llama_token * tokens_data = tokens.data() + start + seq*n_ctx + first; + if (!params.logits_file.empty()) { + process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, + tokens_data, n_ctx - 1 - first, + workers, log_probs, nll, nll2); + } else { + process_logits(n_vocab, all_logits + first*n_vocab, + tokens_data, n_ctx - 1 - first, + workers, nll, nll2, + logit_history.data() + start + seq*n_ctx + first, + prob_history.data() + start + seq*n_ctx + first); + } + count += n_ctx - first - 1; + + // perplexity is e^(average negative log-likelihood) + if (params.ppl_output_type == 0) { + printf("[%d]%.4lf,", i + seq + 1, std::exp(nll / count)); + } else { + double av = nll/count; + double av2 = nll2/count - av*av; + if (av2 > 0) av2 = sqrt(av2/(count-1)); + printf("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2); + } } fflush(stdout); @@ -615,6 +645,8 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par printf("Unexpected negative standard deviation of log(prob)\n"); } + llama_batch_free(batch); + return {tokens, ppl, logit_history, prob_history}; } @@ -1782,13 +1814,24 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { int main(int argc, char ** argv) { gpt_params params; - params.n_batch = 512; if (!gpt_params_parse(argc, argv, params)) { return 1; } params.logits_all = true; - params.n_batch = std::min(params.n_batch, params.n_ctx); + + const int32_t n_ctx = params.n_ctx; + + const bool ppl = !params.hellaswag && !params.winogrande && !params.multiple_choice && !params.kl_divergence; + if (ppl) { + int n_seq = std::max(1, params.n_batch / n_ctx); + int32_t n_kv = n_seq * n_ctx; + params.n_parallel = n_seq; + params.n_ctx = n_kv; + params.n_batch = std::min(params.n_batch, n_kv); + } else { + params.n_batch = std::min(params.n_batch, params.n_ctx); + } if (params.ppl_stride > 0) { fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n", @@ -1847,7 +1890,7 @@ int main(int argc, char ** argv) { } else if (params.kl_divergence) { kl_divergence(ctx, params); } else { - results = perplexity(ctx, params); + results = perplexity(ctx, params, n_ctx); } llama_print_timings(ctx); diff --git a/llama.cpp b/llama.cpp index c58a029f74faf..b19616e8f9a5f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8925,17 +8925,29 @@ static int llama_decode_internal( if (batch.logits) { logits_out.resize(n_vocab * n_tokens); + int32_t i_first = -1; for (uint32_t i = 0; i < n_tokens; i++) { - if (batch.logits[i] == 0) { - continue; + if (batch.logits[i] && i_first == -1) { + i_first = (int32_t) i; + } + if (batch.logits[i] == 0 || i == n_tokens - 1) { + if (i_first != -1) { + int i_last = batch.logits[i] == 0 ? i : i + 1; + // extract logits for the range [i_first, i_last) + // group the requests to minimize the number of calls to the backend + ggml_backend_tensor_get_async(backend_res, res, + logits_out.data() + (n_vocab*i_first), + (n_vocab*i_first)*sizeof(float), + (i_last - i_first)*n_vocab*sizeof(float)); + i_first = -1; + } } - ggml_backend_tensor_get_async(backend_res, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float)); #ifndef NDEBUG - logits_valid[i] = true; + logits_valid[i] = batch.logits[i] != 0; #endif } } else if (lctx.logits_all) { - logits_out.resize(n_vocab * n_tokens); + logits_out.resize(n_vocab*n_tokens); ggml_backend_tensor_get_async(backend_res, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float)); #ifndef NDEBUG std::fill(logits_valid.begin(), logits_valid.end(), true); From 77d1ac7e00bf049b9f2bba1b5a310a78318c49c4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 9 Mar 2024 22:04:00 +0200 Subject: [PATCH 20/39] server : print chat template info --- examples/server/server.cpp | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 2374b7e4ab232..b14cca61b1530 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2197,7 +2197,8 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n"); printf(" --chat-template JINJA_TEMPLATE\n"); printf(" set custom jinja chat template (default: template taken from model's metadata)\n"); - printf(" Note: only commonly used templates are accepted, since we don't have jinja parser\n"); + printf(" only commonly used templates are accepted:\n"); + printf(" https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template\n"); printf("\n"); } @@ -2798,13 +2799,30 @@ int main(int argc, char ** argv) { const auto model_meta = ctx_server.model_meta(); - if (sparams.chat_template.empty()) { // custom chat template is not supplied + // if a custom chat template is not supplied, we will use the one that comes with the model (if any) + if (sparams.chat_template.empty()) { if (!ctx_server.validate_model_chat_template()) { LOG_ERROR("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {}); sparams.chat_template = "chatml"; } } + // print sample chat example to make it clear which template is used + { + json chat; + chat.push_back({{"role", "system"}, {"content", "You are a helpful assistant"}}); + chat.push_back({{"role", "user"}, {"content", "Hello"}}); + chat.push_back({{"role", "assistant"}, {"content", "Hi there"}}); + chat.push_back({{"role", "user"}, {"content", "How are you?"}}); + + const std::string chat_example = format_chat(ctx_server.model, sparams.chat_template, chat); + + LOG_INFO("chat template", { + {"chat_example", chat_example}, + {"built_in", sparams.chat_template.empty()}, + }); + } + // // Middlewares // From 621e86b331f8b0e71f79fd82a4ae1cd54c3e4396 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 9 Mar 2024 23:41:49 +0100 Subject: [PATCH 21/39] server: benchmark: chat/completions scenario and other llm servers comparison (#5941) * server: bench: Init a bench scenario with K6 See #5827 * server: bench: EOL EOF * server: bench: PR feedback and improved k6 script configuration * server: bench: remove llamacpp_completions_tokens_seconds as it include prompt processing time and it's misleading server: bench: add max_tokens from SERVER_BENCH_MAX_TOKENS server: bench: increase truncated rate to 80% before failing * server: bench: fix doc * server: bench: change gauge custom metrics to trend * server: bench: change gauge custom metrics to trend server: bench: add trend custom metrics for total tokens per second average * server: bench: doc add an option to debug http request * server: bench: filter dataset too short and too long sequences * server: bench: allow to filter out conversation in the dataset based on env variable * server: bench: fix assistant message sent instead of user message * server: bench: fix assistant message sent instead of user message * server : add defrag thold parameter * server: bench: select prompts based on the current iteration id not randomly to make the bench more reproducible --------- Co-authored-by: Georgi Gerganov --- examples/server/bench/README.md | 88 +++++++++++++++++++++++ examples/server/bench/script.js | 120 ++++++++++++++++++++++++++++++++ examples/server/server.cpp | 8 +++ 3 files changed, 216 insertions(+) create mode 100644 examples/server/bench/README.md create mode 100644 examples/server/bench/script.js diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md new file mode 100644 index 0000000000000..a53ad64d7359b --- /dev/null +++ b/examples/server/bench/README.md @@ -0,0 +1,88 @@ +### Server benchmark tools + +Benchmark is using [k6](https://k6.io/). + +##### Install k6 + +Follow instruction from: https://k6.io/docs/get-started/installation/ + +Example for ubuntu: +```shell +snap install k6 +``` + +#### Download a dataset + +This dataset was originally proposed in [vLLM benchmarks](https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md). + +```shell +wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json +``` + +#### Download a model +Example for PHI-2 + +```shell +../../../scripts/hf.sh --repo ggml-org/models --file phi-2/ggml-model-q4_0.gguf +``` + +#### Start the server +The server must answer OAI Chat completion requests on `http://localhost:8080/v1` or according to the environment variable `SERVER_BENCH_URL`. + +Example: +```shell +server --host localhost --port 8080 \ + --model ggml-model-q4_0.gguf \ + --cont-batching \ + --metrics \ + --parallel 8 \ + --batch-size 512 \ + --ctx-size 4096 \ + --log-format text \ + -ngl 33 +``` + +#### Run the benchmark + +For 500 chat completions request with 8 concurrent users during maximum 10 minutes, run: +```shell +k6 run script.js --duration 10m --iterations 500 --vus 8 +``` + +The benchmark values can be overridden with: +- `SERVER_BENCH_URL` server url prefix for chat completions, default `http://localhost:8080/v1` +- `SERVER_BENCH_N_PROMPTS` total prompts to randomly select in the benchmark, default `480` +- `SERVER_BENCH_MODEL_ALIAS` model alias to pass in the completion request, default `my-model` +- `SERVER_BENCH_MAX_TOKENS` max tokens to predict, default: `512` +- `SERVER_BENCH_DATASET` path to the benchmark dataset file +- `SERVER_BENCH_MAX_PROMPT_TOKENS` maximum prompt tokens to filter out in the dataset: default `1024` +- `SERVER_BENCH_MAX_CONTEXT` maximum context size of the completions request to filter out in the dataset: prompt + predicted tokens, default `2048` + +Note: the local tokenizer is just a string space split, real number of tokens will differ. + +Or with [k6 options](https://k6.io/docs/using-k6/k6-options/reference/): + +```shell +SERVER_BENCH_N_PROMPTS=500 k6 run script.js --duration 10m --iterations 500 --vus 8 +``` + +To [debug http request](https://k6.io/docs/using-k6/http-debugging/) use `--http-debug="full"`. + +#### Metrics + +Following metrics are available computed from the OAI chat completions response `usage`: +- `llamacpp_tokens_second` Trend of `usage.total_tokens / request duration` +- `llamacpp_prompt_tokens` Trend of `usage.prompt_tokens` +- `llamacpp_prompt_tokens_total_counter` Counter of `usage.prompt_tokens` +- `llamacpp_completion_tokens` Trend of `usage.completion_tokens` +- `llamacpp_completion_tokens_total_counter` Counter of `usage.completion_tokens` +- `llamacpp_completions_truncated_rate` Rate of completions truncated, i.e. if `finish_reason === 'length'` +- `llamacpp_completions_stop_rate` Rate of completions stopped by the model, i.e. if `finish_reason === 'stop'` + +The script will fail if too many completions are truncated, see `llamacpp_completions_truncated_rate`. + +K6 metrics might be compared against [server metrics](../README.md), with: + +```shell +curl http://localhost:8080/metrics +``` diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js new file mode 100644 index 0000000000000..a4f5ac5ab22ad --- /dev/null +++ b/examples/server/bench/script.js @@ -0,0 +1,120 @@ +import http from 'k6/http' +import {check, sleep} from 'k6' +import {SharedArray} from 'k6/data' +import {Counter, Rate, Trend} from 'k6/metrics' +import exec from 'k6/execution'; + +// Server chat completions prefix +const server_url = __ENV.SERVER_BENCH_URL ? __ENV.SERVER_BENCH_URL : 'http://localhost:8080/v1' + +// Number of total prompts in the dataset - default 10m / 10 seconds/request * number of users +const n_prompt = __ENV.SERVER_BENCH_N_PROMPTS ? parseInt(__ENV.SERVER_BENCH_N_PROMPTS) : 600 / 10 * 8 + +// Model name to request +const model = __ENV.SERVER_BENCH_MODEL_ALIAS ? __ENV.SERVER_BENCH_MODEL_ALIAS : 'my-model' + +// Dataset path +const dataset_path = __ENV.SERVER_BENCH_DATASET ? __ENV.SERVER_BENCH_DATASET : './ShareGPT_V3_unfiltered_cleaned_split.json' + +// Max tokens to predict +const max_tokens = __ENV.SERVER_BENCH_MAX_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_TOKENS) : 512 + +// Max prompt tokens +const n_prompt_tokens = __ENV.SERVER_BENCH_MAX_PROMPT_TOKENS ? parseInt(__ENV.SERVER_BENCH_MAX_PROMPT_TOKENS) : 1024 + +// Max slot context +const n_ctx_slot = __ENV.SERVER_BENCH_MAX_CONTEXT ? parseInt(__ENV.SERVER_BENCH_MAX_CONTEXT) : 2048 + +export function setup() { + console.info(`Benchmark config: server_url=${server_url} n_prompt=${n_prompt} model=${model} dataset_path=${dataset_path} max_tokens=${max_tokens}`) +} + +const data = new SharedArray('conversations', function () { + const tokenizer = (message) => message.split(/[\s,'".?]/) + + return JSON.parse(open(dataset_path)) + // Filter out the conversations with less than 2 turns. + .filter(data => data["conversations"].length >= 2) + .filter(data => data["conversations"][0]["from"] === "human") + .map(data => { + return { + prompt: data["conversations"][0]["value"], + n_prompt_tokens: tokenizer(data["conversations"][0]["value"]).length, + n_completion_tokens: tokenizer(data["conversations"][1]["value"]).length, + } + }) + // Filter out too short sequences + .filter(conv => conv.n_prompt_tokens >= 4 && conv.n_completion_tokens >= 4) + // Filter out too long sequences. + .filter(conv => conv.n_prompt_tokens <= n_prompt_tokens && conv.n_prompt_tokens + conv.n_completion_tokens <= n_ctx_slot) + // Keep only first n prompts + .slice(0, n_prompt) +}) + +const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens') +const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens') +const llamacpp_tokens_second = new Trend('llamacpp_tokens_second') + +const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter') +const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter') + +const llamacpp_completions_truncated_rate = new Rate('llamacpp_completions_truncated_rate') +const llamacpp_completions_stop_rate = new Rate('llamacpp_completions_stop_rate') + +export const options = { + thresholds: { + llamacpp_completions_truncated_rate: [ + // more than 80% of truncated input will abort the test + {threshold: 'rate < 0.8', abortOnFail: true, delayAbortEval: '1m'}, + ], + }, + duration: '10m', + vus: 8, +} + +export default function () { + const conversation = data[exec.scenario.iterationInInstance % data.length] + const payload = { + "messages": [ + { + "role": "system", + "content": "You are ChatGPT, an AI assistant.", + }, + { + "role": "user", + "content": conversation.prompt, + } + ], + "model": model, + "stream": false, + "max_tokens": max_tokens + } + + const body = JSON.stringify(payload) + + let res = http.post(`${server_url}/chat/completions`, body, { + headers: {'Content-Type': 'application/json'}, + timeout: '300s' + }) + + check(res, {'success completion': (r) => r.status === 200}) + + if (res.status === 200) { + const completions = res.json() + + llamacpp_prompt_tokens.add(completions.usage.prompt_tokens) + llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens) + + llamacpp_completion_tokens.add(completions.usage.completion_tokens) + llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens) + + llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length') + llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop') + + llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3) + } else { + console.error(`response: ${res.body} request=${payload}`) + } + + sleep(0.3) +} diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b14cca61b1530..c7d3ed01b6347 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2133,6 +2133,8 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); printf(" --pooling {none,mean,cls} pooling type for embeddings, use model default if unspecified\n"); + printf(" -dt N, --defrag-thold N\n"); + printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); @@ -2355,6 +2357,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } else { invalid_param = true; break; } + } else if (arg == "--defrag-thold" || arg == "-dt") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.defrag_thold = std::stof(argv[i]); } else if (arg == "--threads" || arg == "-t") { if (++i >= argc) { From c78541479cf835dd9eb568ccd9a2083198a7203d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 10 Mar 2024 16:43:08 +0200 Subject: [PATCH 22/39] nix: update flake.lock (#5969) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flake lock file updates: • Updated input 'nixpkgs': 'github:NixOS/nixpkgs/1536926ef5621b09bba54035ae2bb6d806d72ac8' (2024-02-29) → 'github:NixOS/nixpkgs/9df3e30ce24fd28c7b3e2de0d986769db5d6225d' (2024-03-06) Co-authored-by: github-actions[bot] --- flake.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flake.lock b/flake.lock index b1b0916566453..f9865d5e464cd 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1709237383, - "narHash": "sha256-cy6ArO4k5qTx+l5o+0mL9f5fa86tYUX3ozE1S+Txlds=", + "lastModified": 1709703039, + "narHash": "sha256-6hqgQ8OK6gsMu1VtcGKBxKQInRLHtzulDo9Z5jxHEFY=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "1536926ef5621b09bba54035ae2bb6d806d72ac8", + "rev": "9df3e30ce24fd28c7b3e2de0d986769db5d6225d", "type": "github" }, "original": { From 2960eae847f8dbde23be6d170a61bcf44ebf32de Mon Sep 17 00:00:00 2001 From: Clint Herron Date: Sun, 10 Mar 2024 11:17:43 -0400 Subject: [PATCH 23/39] grammar : verify parsed state (#5950) --- common/grammar-parser.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp index bf89a96f3617f..2a1301569793a 100644 --- a/common/grammar-parser.cpp +++ b/common/grammar-parser.cpp @@ -278,6 +278,22 @@ namespace grammar_parser { while (*pos) { pos = parse_rule(state, pos); } + // Validate the state to ensure that all rules are defined + for (const auto & rule : state.rules) { + for (const auto & elem : rule) { + if (elem.type == LLAMA_GRETYPE_RULE_REF) { + // Ensure that the rule at that location exists + if (elem.value >= state.rules.size() || state.rules[elem.value].empty()) { + // Get the name of the rule that is missing + for (const auto & kv : state.symbol_ids) { + if (kv.second == elem.value) { + throw std::runtime_error("Undefined rule identifier '" + kv.first + "'"); + } + } + } + } + } + } return state; } catch (const std::exception & err) { fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what()); From bcebd7dbf62fd7b293d5ed089023e4e733269c71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?DAN=E2=84=A2?= Date: Sun, 10 Mar 2024 11:56:30 -0400 Subject: [PATCH 24/39] llama : add support for GritLM (#5959) * add gritlm example * gritlm results match * tabs to spaces * comment out debug printing * rebase to new embed * gritlm embeddings are back babeee * add to gitignore * allow to toggle embedding mode * Clean-up GritLM sample code. * Fix types. * Flush stdout and output ending newline if streaming. * mostly style fixes; correct KQ_mask comment * add causal_attn flag to llama_cparams * gritml : minor * llama : minor --------- Co-authored-by: Douglas Hanley Co-authored-by: Georgi Gerganov --- .gitignore | 1 + Makefile | 6 +- examples/CMakeLists.txt | 1 + examples/gritlm/CMakeLists.txt | 5 + examples/gritlm/gritlm.cpp | 229 +++++++++++++++++++++++++++++++++ llama.cpp | 25 +++- llama.h | 4 + 7 files changed, 267 insertions(+), 4 deletions(-) create mode 100644 examples/gritlm/CMakeLists.txt create mode 100644 examples/gritlm/gritlm.cpp diff --git a/.gitignore b/.gitignore index 62b6b8b1ab250..d28f4d1b801ea 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,7 @@ models-mnt /embedding /gguf /gguf-llama-simple +/gritlm /imatrix /infill /libllama.so diff --git a/Makefile b/Makefile index d809b8b3bbfc1..6492da8a02fb7 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ BUILD_TARGETS = \ main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \ - speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o + speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o # Binaries only useful for tests TEST_TARGETS = \ @@ -724,6 +724,10 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 653abc73ac98f..e762cf8b9238b 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -20,6 +20,7 @@ else() add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(embedding) add_subdirectory(finetune) + add_subdirectory(gritlm) add_subdirectory(infill) add_subdirectory(llama-bench) add_subdirectory(llava) diff --git a/examples/gritlm/CMakeLists.txt b/examples/gritlm/CMakeLists.txt new file mode 100644 index 0000000000000..ac4a5ae7937ea --- /dev/null +++ b/examples/gritlm/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET gritlm) +add_executable(${TARGET} gritlm.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp new file mode 100644 index 0000000000000..3d4b085d69b6f --- /dev/null +++ b/examples/gritlm/gritlm.cpp @@ -0,0 +1,229 @@ +#include "common.h" +#include "llama.h" + +#include +#include + +// #define GRIT_DEBUG + +static float dot_product(const std::vector & v1, const std::vector & v2) { + float dot = 0.0f; + for (uint64_t i = 0; i < v1.size(); ++i) { + dot += v1[i] * v2[i]; + } + return dot; +} + +static float norm(const std::vector & v) { + return std::sqrt(dot_product(v, v)); +} + +static float cosine_similarity(const std::vector & v1, const std::vector & v2) { + return dot_product(v1, v2) / (norm(v1) * norm(v2)); +} + +static std::vector> encode(llama_context * ctx, const std::vector & sentences, const std::string & instruction) { + std::vector> result; + + const llama_model * mdl = llama_get_model(ctx); + + llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1); + + for (uint64_t i = 0; i < sentences.size(); i++) { + llama_batch_clear(batch); + + const std::string input_string = instruction + sentences[i]; + + std::vector inputs = llama_tokenize(mdl, input_string, true, false); + + const int32_t n_toks = inputs.size(); + + // GritLM seems to have EOS = "" + // https://github.com/ContextualAI/gritlm/blob/92025b16534712b31b3c4aaaf069350e222bd5f8/gritlm/gritlm.py#L18 + // inputs.push_back(llama_token_eos(mdl)); + + // we want to ignore instruction tokens for mean pooling + const int32_t n_inst = llama_tokenize(mdl, instruction, true, false).size(); + +#ifdef GRIT_DEBUG + // debug tokens - should be matching as referenced in the GritLM sample + std::for_each(inputs.begin(), inputs.end(), [&ctx](llama_token t) { + std::printf("[%u:%s]", t, llama_token_to_piece(ctx, t).c_str()); + }); + std::printf("\n"); +#endif + + // add input to batch (this increments n_tokens) + for (int32_t j = 0; j < n_toks; j++) { + llama_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst); + } + + // clear previous kv_cache values (irrelevant for embeddings) + llama_kv_cache_clear(ctx); + llama_set_causal_attn(ctx, false); + + // run model + llama_decode(ctx, batch); + + // get embedding dimensions + uint64_t n_embd = llama_n_embd(mdl); + + // allocate embedding output + std::vector emb_unorm(n_embd, 0.0f); + + // sum up all token embeddings + for (int32_t k = n_inst; k < n_toks; k++) { + float * emb = llama_get_embeddings_ith(ctx, k); + for (uint64_t j = 0; j < n_embd; j++) { + emb_unorm[j] += emb[j]; + } + } + + // divide by number of tokens (mean pooling) + { + const uint64_t n_sent = n_toks - n_inst; + + for (uint64_t j = 0; j < n_embd; j++) { + emb_unorm[j] /= n_sent; + } + } + + std::vector emb_norm(emb_unorm.size()); + llama_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd); + result.push_back(emb_norm); + +#ifdef GRIT_DEBUG + // print out emb_norm + std::printf("embedding %ld: ", i); + for (uint64_t j = 0; j < n_embd; j++) { + std::printf("%.5f ", emb_norm[j]); + } + std::printf("\n\n"); +#endif + } + + llama_batch_free(batch); + + return result; +} + +static std::string generate(llama_context * ctx, const std::string & prompt, bool stream) { + std::string result; + + const llama_model * mdl = llama_get_model(ctx); + llama_token eos_token = llama_token_eos(mdl); + + llama_kv_cache_clear(ctx); + llama_set_causal_attn(ctx, true); + llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1); + + std::vector inputs = llama_tokenize(mdl, prompt, false, true); + int32_t i_current_token = 0; + + while (true) { + llama_batch_clear(bat); + auto n_inputs = (int32_t)inputs.size(); + for (int32_t i = 0; i < n_inputs; i++) { + llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1); + } + inputs.clear(); + + llama_decode(ctx, bat); + auto logits = llama_get_logits_ith(ctx, bat.n_tokens - 1); + + auto candidates = std::vector(llama_n_vocab(mdl)); + auto n_candidates = (int32_t)candidates.size(); + for (int32_t token = 0; token < n_candidates; token++) { + candidates[token] = llama_token_data{ token, logits[token], 0.0f }; + } + auto candidates_p = llama_token_data_array{ candidates.data(), candidates.size(), false }; + + llama_token token = llama_sample_token_greedy(ctx, &candidates_p); + if (token == eos_token) { + break; + } + + std::string piece = llama_token_to_piece(ctx, token); + if (stream) { + std::printf("%s", piece.c_str()); + std::fflush(stdout); + } + + inputs.push_back(token); + + result += piece; + } + + if (stream) { + std::printf("\n"); + } + + llama_batch_free(bat); + + return result; +} + +static std::string gritlm_instruction(const std::string & instruction) { + return !instruction.empty() ? "<|user|>\n" + instruction + "\n<|embed|>\n" : "<|embed|>\n"; +} + +int main(int argc, char * argv[]) { + gpt_params params; + if (!gpt_params_parse(argc, argv, params)) { + return 1; + } + + llama_model_params mparams = llama_model_params_from_gpt_params(params); + llama_context_params cparams = llama_context_params_from_gpt_params(params); + + llama_backend_init(); + + llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams); + + // create new context - set to embedding mode + cparams.embeddings = true; + llama_context * ctx = llama_new_context_with_model(mdl, cparams); + + // ### Embedding/Representation ### + // samples taken from: https://github.com/ContextualAI/gritlm#basic + { + const std::string instruction = "Given a scientific paper title, retrieve the paper's abstract"; + + const std::vector queries = { + "Bitcoin: A Peer-to-Peer Electronic Cash System", + "Generative Representational Instruction Tuning", + }; + + const std::vector documents = { + "A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution. Digital signatures provide part of the solution, but the main benefits are lost if a trusted third party is still required to prevent double-spending. We propose a solution to the double-spending problem using a peer-to-peer network. The network timestamps transactions by hashing them into an ongoing chain of hash-based proof-of-work, forming a record that cannot be changed without redoing the proof-of-work. The longest chain not only serves as proof of the sequence of events witnessed, but proof that it came from the largest pool of CPU power. As long as a majority of CPU power is controlled by nodes that are not cooperating to attack the network, they'll generate the longest chain and outpace attackers. The network itself requires minimal structure. Messages are broadcast on a best effort basis, and nodes can leave and rejoin the network at will, accepting the longest proof-of-work chain as proof of what happened while they were gone.", + "All text-based language problems can be reduced to either generation or embedding. Current models only perform well at one or the other. We introduce generative representational instruction tuning (GRIT) whereby a large language model is trained to handle both generative and embedding tasks by distinguishing between them through instructions. Compared to other open models, our resulting GritLM 7B sets a new state of the art on the Massive Text Embedding Benchmark (MTEB) and outperforms all models up to its size on a range of generative tasks. By scaling up further, GritLM 8X7B outperforms all open generative language models that we tried while still being among the best embedding models. Notably, we find that GRIT matches training on only generative or embedding data, thus we can unify both at no performance loss. Among other benefits, the unification via GRIT speeds up Retrieval-Augmented Generation (RAG) by > 60% for long documents, by no longer requiring separate retrieval and generation models. Models, code, etc. are freely available at https://github.com/ContextualAI/gritlm.", + }; + + // No need to add instruction for retrieval documents + const std::vector> d_rep = encode(ctx, documents, gritlm_instruction("")); + const std::vector> q_rep = encode(ctx, queries, gritlm_instruction(instruction)); + + const float cosine_sim_q0_d0 = cosine_similarity(q_rep[0], d_rep[0]); + const float cosine_sim_q0_d1 = cosine_similarity(q_rep[0], d_rep[1]); + const float cosine_sim_q1_d0 = cosine_similarity(q_rep[1], d_rep[0]); + const float cosine_sim_q1_d1 = cosine_similarity(q_rep[1], d_rep[1]); + + std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0); + std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1); + std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[0].c_str(), cosine_sim_q1_d0); + std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[1].c_str(), documents[1].c_str(), cosine_sim_q1_d1); + } + + // ### Generation ### + // GritLM models are not finetuned with system prompts, as you can just include system-like instructions together with your user instruction + { + const std::string prompt = "<|user|>\nPlease write me a poem about my recent hike of Mt. Fuji at midnight in the style of Shakespeare.\n<|assistant|>\n"; + std::string response = generate(ctx, prompt, true); + } + + llama_free(ctx); + llama_free_model(mdl); + llama_backend_free(); + + return 0; +} diff --git a/llama.cpp b/llama.cpp index b19616e8f9a5f..249442166d24e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1744,6 +1744,7 @@ struct llama_cparams { float defrag_thold; bool embeddings; + bool causal_attn; bool offload_kqv; enum llama_pooling_type pooling_type; @@ -3939,6 +3940,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert); LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used); + LLAMA_LOG_INFO("%s: causal attm = %d\n", __func__, hparams.causal_attn); LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type); LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type); LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type); @@ -8532,7 +8534,13 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos)); } - if (hparams.causal_attn) { + GGML_ASSERT( + (hparams.causal_attn || !cparams.causal_attn) && + "non-causal attention with generative models is not supported" + ); + + // NOTE: hparams.causal_attn indicates the model is capable of generation and uses the kv cache. + if (cparams.causal_attn) { const int64_t n_kv = kv_self.n; const int64_t n_tokens = batch.n_tokens; @@ -8560,8 +8568,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } } else { - // non-causal attention attends only the tokens within the batch (i.e. the KV cache is not used) + // when using kv cache, the mask needs to match the kv cache size const int64_t n_tokens = batch.n_tokens; + const int64_t n_stride = hparams.causal_attn ? kv_self.n : n_tokens; assert(ggml_backend_buffer_is_host(lctx.inp_KQ_mask->buffer)); @@ -8580,7 +8589,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - data[h*(n_tokens*n_tokens) + j*n_tokens + i] = f; + data[h*(n_tokens*n_tokens) + j*n_stride + i] = f; + } + + for (int i = n_tokens; i < n_stride; ++i) { + data[h*(n_tokens*n_tokens) + j*n_stride + i] = -INFINITY; } } } @@ -12733,6 +12746,8 @@ struct llama_context * llama_new_context_with_model( cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f; } + cparams.causal_attn = hparams.causal_attn; + if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) { cparams.pooling_type = LLAMA_POOLING_TYPE_NONE; @@ -13767,6 +13782,10 @@ void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback) ctx->abort_callback_data = abort_callback_data; } +void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) { + ctx->cparams.causal_attn = causal_attn; +} + struct llama_batch llama_batch_get_one( llama_token * tokens, int32_t n_tokens, diff --git a/llama.h b/llama.h index 7a107c7f335d5..c8e05aadd9913 100644 --- a/llama.h +++ b/llama.h @@ -643,6 +643,10 @@ extern "C" { // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens) LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch); + // Set whether to use causal attention or not + // If set to true, the model will only attend to the past tokens + LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn); + // Set abort callback LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data); From fa8a809a9119411bc9c3f00026550d0343f4d9b7 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sun, 10 Mar 2024 18:17:47 +0100 Subject: [PATCH 25/39] server: ci: windows build and tests (#5968) * server: ci: windows build and tests * server: ci: remove tmp push branch * server: ci: EOF EOL * Use builti Co-authored-by: Jared Van Bortel * server: tests: server graceful shutdown, then kill, then hard kill * server: tests: remove python2 unicode string * server: tests: remove wrong comment on server starting, close_fds is always true * server: tests: server kill, if pid exists * server: tests: remove dependency to killall * server: tests: ci windows: pid exists better handling --------- Co-authored-by: Jared Van Bortel --- .github/workflows/server.yml | 46 ++++- examples/server/tests/features/environment.py | 66 +++++-- examples/server/tests/features/server.feature | 2 +- examples/server/tests/features/steps/steps.py | 176 ++++++++++-------- 4 files changed, 191 insertions(+), 99 deletions(-) diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index f9aeefaa847ed..e385e03f3a245 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -47,6 +47,8 @@ jobs: - name: Clone id: checkout uses: actions/checkout@v3 + with: + fetch-depth: 0 - name: Dependencies id: depends @@ -58,7 +60,6 @@ jobs: cmake \ python3-pip \ wget \ - psmisc \ language-pack-en - name: Build @@ -90,3 +91,46 @@ jobs: run: | cd examples/server/tests PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow + + + server-windows: + runs-on: windows-latest + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Build + id: cmake_build + run: | + mkdir build + cd build + cmake .. -DLLAMA_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release ; + cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server + + - name: Python setup + id: setup_python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Tests dependencies + id: test_dependencies + run: | + pip install -r examples/server/tests/requirements.txt + + - name: Tests + id: server_integration_tests + run: | + cd examples/server/tests + behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp + + - name: Slow tests + id: server_integration_tests_slow + if: ${{ github.event.schedule != '' || github.event.inputs.slow_tests == 'true' }} + run: | + cd examples/server/tests + behave.exe --stop --no-skipped --no-capture --tags slow diff --git a/examples/server/tests/features/environment.py b/examples/server/tests/features/environment.py index 9fd330db6ddc9..8ad987e1bb618 100644 --- a/examples/server/tests/features/environment.py +++ b/examples/server/tests/features/environment.py @@ -1,9 +1,10 @@ +import errno import os import socket import subprocess import time from contextlib import closing -from signal import SIGKILL +import signal def before_scenario(context, scenario): @@ -29,44 +30,71 @@ def after_scenario(context, scenario): for line in f: print(line) if not is_server_listening(context.server_fqdn, context.server_port): - print("\x1b[33;101mERROR: Server stopped listening\x1b[0m") + print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n") if not pid_exists(context.server_process.pid): assert False, f"Server not running pid={context.server_process.pid} ..." - print(f"stopping server pid={context.server_process.pid} ...") - context.server_process.kill() + server_graceful_shutdown(context) + # Wait few for socket to free up time.sleep(0.05) attempts = 0 - while is_server_listening(context.server_fqdn, context.server_port): - print(f"stopping server pid={context.server_process.pid} ...") - os.kill(context.server_process.pid, SIGKILL) + while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port): + server_kill(context) time.sleep(0.1) attempts += 1 if attempts > 5: - print(f"Server dangling exits, killing all {context.server_path} ...") - process = subprocess.run(['killall', '-9', context.server_path], - stderr=subprocess.PIPE, - universal_newlines=True) - print(process) + server_kill_hard(context) + + +def server_graceful_shutdown(context): + print(f"shutting down server pid={context.server_process.pid} ...\n") + if os.name == 'nt': + os.kill(context.server_process.pid, signal.CTRL_C_EVENT) + else: + os.kill(context.server_process.pid, signal.SIGINT) + + +def server_kill(context): + print(f"killing server pid={context.server_process.pid} ...\n") + context.server_process.kill() + + +def server_kill_hard(context): + pid = context.server_process.pid + path = context.server_path + + print(f"Server dangling exits, hard killing force {pid}={path}...\n") + if os.name == 'nt': + process = subprocess.check_output(['taskkill', '/F', '/pid', str(pid)]).decode() + print(process) + else: + os.kill(-pid, signal.SIGKILL) def is_server_listening(server_fqdn, server_port): with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: result = sock.connect_ex((server_fqdn, server_port)) - return result == 0 + _is_server_listening = result == 0 + if _is_server_listening: + print(f"server is listening on {server_fqdn}:{server_port}...\n") + return _is_server_listening def pid_exists(pid): """Check whether pid exists in the current process table.""" - import errno if pid < 0: return False - try: - os.kill(pid, 0) - except OSError as e: - return e.errno == errno.EPERM + if os.name == 'nt': + output = subprocess.check_output(['TASKLIST', '/FI', f'pid eq {pid}']).decode() + print(output) + return "No tasks are running" not in output else: - return True + try: + os.kill(pid, 0) + except OSError as e: + return e.errno == errno.EPERM + else: + return True diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index aa132fa3472ef..5014f326dc050 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -47,7 +47,7 @@ Feature: llama.cpp server Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. """ And a completion request with no api error - Then 64 tokens are predicted matching fun|Annaks|popcorns + Then 64 tokens are predicted matching fun|Annaks|popcorns|pictry And the completion is truncated And 109 prompt tokens are processed diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 14204850960c9..4e81a255a08d9 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -18,7 +18,7 @@ from prometheus_client import parser -@step(u"a server listening on {server_fqdn}:{server_port}") +@step("a server listening on {server_fqdn}:{server_port}") def step_server_config(context, server_fqdn, server_port): context.server_fqdn = server_fqdn context.server_port = int(server_port) @@ -57,24 +57,24 @@ def step_server_config(context, server_fqdn, server_port): context.prompts = [] -@step(u'a model file {hf_file} from HF repo {hf_repo}') +@step('a model file {hf_file} from HF repo {hf_repo}') def step_download_hf_model(context, hf_file, hf_repo): context.model_file = hf_hub_download(repo_id=hf_repo, filename=hf_file) if context.debug: print(f"model file: {context.model_file}\n") -@step(u'a model alias {model_alias}') +@step('a model alias {model_alias}') def step_model_alias(context, model_alias): context.model_alias = model_alias -@step(u'{seed:d} as server seed') +@step('{seed:d} as server seed') def step_seed(context, seed): context.server_seed = seed -@step(u'{ngl:d} GPU offloaded layers') +@step('{ngl:d} GPU offloaded layers') def step_n_gpu_layer(context, ngl): if 'N_GPU_LAYERS' in os.environ: new_ngl = int(os.environ['N_GPU_LAYERS']) @@ -84,37 +84,37 @@ def step_n_gpu_layer(context, ngl): context.n_gpu_layer = ngl -@step(u'{n_ctx:d} KV cache size') +@step('{n_ctx:d} KV cache size') def step_n_ctx(context, n_ctx): context.n_ctx = n_ctx -@step(u'{n_slots:d} slots') +@step('{n_slots:d} slots') def step_n_slots(context, n_slots): context.n_slots = n_slots -@step(u'{n_predict:d} server max tokens to predict') +@step('{n_predict:d} server max tokens to predict') def step_server_n_predict(context, n_predict): context.n_server_predict = n_predict -@step(u'continuous batching') +@step('continuous batching') def step_server_continuous_batching(context): context.server_continuous_batching = True -@step(u'embeddings extraction') +@step('embeddings extraction') def step_server_embeddings(context): context.server_embeddings = True -@step(u'prometheus compatible metrics exposed') +@step('prometheus compatible metrics exposed') def step_server_metrics(context): context.server_metrics = True -@step(u"the server is starting") +@step("the server is starting") def step_start_server(context): start_server_background(context) attempts = 0 @@ -131,7 +131,7 @@ def step_start_server(context): time.sleep(0.1) -@step(u"the server is {expecting_status}") +@step("the server is {expecting_status}") @async_run_until_complete async def step_wait_for_the_server_to_be_started(context, expecting_status): match expecting_status: @@ -160,7 +160,7 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status): assert False, "unknown status" -@step(u'all slots are {expected_slot_status_string}') +@step('all slots are {expected_slot_status_string}') @async_run_until_complete async def step_all_slots_status(context, expected_slot_status_string): match expected_slot_status_string: @@ -176,7 +176,7 @@ async def step_all_slots_status(context, expected_slot_status_string): await request_slots_status(context, expected_slots) -@step(u'a completion request with {api_error} api error') +@step('a completion request with {api_error} api error') @async_run_until_complete async def step_request_completion(context, api_error): expect_api_error = api_error == 'raised' @@ -194,133 +194,133 @@ async def step_request_completion(context, api_error): assert completion == 401, f"completion must be an 401 status code: {completion}" -@step(u'{predicted_n:d} tokens are predicted matching {re_content}') +@step('{predicted_n:d} tokens are predicted matching {re_content}') def step_n_tokens_predicted_with_content(context, predicted_n, re_content): context.completion = context.tasks_result.pop() assert_n_tokens_predicted(context.completion, predicted_n, re_content) -@step(u'{predicted_n:d} tokens are predicted') +@step('{predicted_n:d} tokens are predicted') def step_n_tokens_predicted(context, predicted_n): context.completion = context.tasks_result.pop() assert_n_tokens_predicted(context.completion, predicted_n) -@step(u'the completion is truncated') +@step('the completion is truncated') def step_assert_completion_truncated(context): step_assert_completion_truncated(context, '') -@step(u'the completion is {truncated} truncated') +@step('the completion is {truncated} truncated') def step_assert_completion_truncated(context, truncated): truncated = truncated != "not" assert context.completion['truncated'] == truncated, f'{context.completion}' -@step(u'{n_prompt:d} prompt tokens are processed') +@step('{n_prompt:d} prompt tokens are processed') def step_impl(context, n_prompt): assert n_prompt < 0 or n_prompt == context.completion['timings']['prompt_n'], f"n_prompt={context.completion['timings']['prompt_n']}" -@step(u'a user prompt {user_prompt}') +@step('a user prompt {user_prompt}') def step_user_prompt(context, user_prompt): context.prompts.append(user_prompt) context.n_prompts = len(context.prompts) -@step(u'a system prompt {system_prompt}') +@step('a system prompt {system_prompt}') def step_system_prompt(context, system_prompt): context.system_prompt = system_prompt -@step(u'a model {model}') +@step('a model {model}') def step_model(context, model): context.model = model -@step(u'{max_tokens:d} max tokens to predict') +@step('{max_tokens:d} max tokens to predict') def step_max_tokens(context, max_tokens): context.n_predict = max_tokens -@step(u'streaming is {enable_streaming}') +@step('streaming is {enable_streaming}') def step_streaming(context, enable_streaming): context.enable_streaming = enable_streaming == 'enabled' -@step(u'a user api key {user_api_key}') +@step('a user api key {user_api_key}') def step_user_api_key(context, user_api_key): context.user_api_key = user_api_key -@step(u'no user api key') +@step('no user api key') def step_no_user_api_key(context): context.user_api_key = None -@step(u'a user api key ') +@step('a user api key ') def step_no_user_api_key_space(context): context.user_api_key = None -@step(u'a server api key {server_api_key}') +@step('a server api key {server_api_key}') def step_server_api_key(context, server_api_key): context.server_api_key = server_api_key -@step(u'{n_junk:d} as number of junk') +@step('{n_junk:d} as number of junk') def step_n_junk(context, n_junk): context.n_junk = n_junk -@step(u'{n_batch:d} as batch size') +@step('{n_batch:d} as batch size') def step_n_batch(context, n_batch): context.n_batch = n_batch -@step(u'{seed:d} as seed') +@step('{seed:d} as seed') def step_seed(context, seed): context.seed = seed -@step(u'a prefix prompt') +@step('a prefix prompt') def step_prompt_prefix(context): - context.prompt_prefix = context.text + context.prompt_prefix = context_text(context) -@step(u'a junk suffix prompt') +@step('a junk suffix prompt') def step_prompt_junk_suffix(context): - context.prompt_junk_suffix = context.text + context.prompt_junk_suffix = context_text(context) -@step(u'a suffix prompt') +@step('a suffix prompt') def step_prompt_suffix(context): - context.prompt_suffix = context.text + context.prompt_suffix = context_text(context) -@step(u'{n_ga:d} group attention factor' - u' to extend context size through self-extend') +@step('{n_ga:d} group attention factor' + ' to extend context size through self-extend') def step_impl(context, n_ga): context.n_ga = n_ga -@step(u'{n_ga_w:d} group attention width to extend context size through self-extend') +@step('{n_ga_w:d} group attention width to extend context size through self-extend') def step_impl(context, n_ga_w): context.n_ga_w = n_ga_w -@step(u'a passkey prompt template') +@step('a passkey prompt template') def step_prompt_passkey(context): - context.prompt_passkey = context.text + context.prompt_passkey = context_text(context) -@step(u'{n_prompts:d} fixed prompts') +@step('{n_prompts:d} fixed prompts') def step_fixed_prompts(context, n_prompts): context.prompts.extend([str(0)*(context.n_batch if context.n_batch is not None else 512) for i in range(n_prompts)]) context.n_prompts = n_prompts -@step(u'a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk') +@step('a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk') def step_prompt_passkey(context, passkey, i_pos): prompt = "" for i in range(context.n_junk): @@ -334,7 +334,7 @@ def step_prompt_passkey(context, passkey, i_pos): context.n_prompts = len(context.prompts) -@step(u'an OAI compatible chat completions request with {api_error} api error') +@step('an OAI compatible chat completions request with {api_error} api error') @async_run_until_complete async def step_oai_chat_completions(context, api_error): if context.debug: @@ -369,19 +369,19 @@ async def step_oai_chat_completions(context, api_error): print(f"Completion response: {completion}") -@step(u'a prompt') +@step('a prompt') def step_a_prompt(context): - context.prompts.append(context.text) + context.prompts.append(context_text(context)) context.n_prompts = len(context.prompts) -@step(u'a prompt {prompt}') +@step('a prompt {prompt}') def step_a_prompt_prompt(context, prompt): context.prompts.append(prompt) context.n_prompts = len(context.prompts) -@step(u'concurrent completion requests') +@step('concurrent completion requests') @async_run_until_complete() async def step_concurrent_completion_requests(context): await concurrent_requests(context, @@ -397,7 +397,7 @@ async def step_concurrent_completion_requests(context): 'user_api_key') else None) -@step(u'concurrent OAI completions requests') +@step('concurrent OAI completions requests') @async_run_until_complete async def step_oai_chat_completions(context): await concurrent_requests(context, oai_chat_completions, @@ -417,7 +417,7 @@ async def step_oai_chat_completions(context): if hasattr(context, 'user_api_key') else None) -@step(u'concurrent OAI completions requests no v1') +@step('concurrent OAI completions requests no v1') @async_run_until_complete async def step_oai_chat_completions(context): await concurrent_requests(context, oai_chat_completions, @@ -440,13 +440,13 @@ async def step_oai_chat_completions(context): if hasattr(context, 'user_api_key') else None) -@step(u'all prompts are predicted') +@step('all prompts are predicted') @async_run_until_complete async def step_all_prompts_are_predicted(context): await all_prompts_are_predicted(context) -@step(u'all prompts are predicted with {n_expected_predicted:d} tokens') +@step('all prompts are predicted with {n_expected_predicted:d} tokens') @async_run_until_complete async def step_all_prompts_are_predicted_with_n_tokens(context, n_expected_predicted): await all_prompts_are_predicted(context, n_expected_predicted) @@ -460,14 +460,14 @@ async def all_prompts_are_predicted(context, expected_predicted_n=None): assert len(context.concurrent_tasks) == 0, f"{len(context.concurrent_tasks)} pending requests" -@step(u'embeddings are computed for') +@step('embeddings are computed for') @async_run_until_complete async def step_compute_embedding(context): context.n_prompts = 1 - context.embeddings = await request_embedding(context.text, base_url=context.base_url) + context.embeddings = await request_embedding(context_text(context), base_url=context.base_url) -@step(u'all embeddings are the same') +@step('all embeddings are the same') @async_run_until_complete async def step_all_embeddings_are_the_same(context): n_embedding_requests = await gather_tasks_results(context) @@ -491,7 +491,8 @@ async def step_all_embeddings_are_the_same(context): print(f"{msg}\n") assert np.isclose(similarity, 1.0, rtol=1e-05, atol=1e-08, equal_nan=False), msg -@step(u'embeddings are generated') + +@step('embeddings are generated') def step_assert_embeddings(context): assert context.n_prompts == len(context.embeddings), (f"unexpected response:\n" f"context.n_prompts={context.n_prompts}\n" @@ -500,17 +501,17 @@ def step_assert_embeddings(context): assert_embeddings(embedding) -@step(u'an OAI compatible embeddings computation request for') +@step('an OAI compatible embeddings computation request for') @async_run_until_complete async def step_oai_compute_embeddings(context): context.n_prompts = 1 - context.embeddings = await request_oai_embeddings(context.text, + context.embeddings = await request_oai_embeddings(context_text(context), base_url=context.base_url, user_api_key=context.user_api_key, model=context.model) -@step(u'an OAI compatible embeddings computation request for multiple inputs') +@step('an OAI compatible embeddings computation request for multiple inputs') @async_run_until_complete async def step_oai_compute_embeddings_multiple_inputs(context): context.embeddings = await request_oai_embeddings(context.prompts, @@ -520,7 +521,7 @@ async def step_oai_compute_embeddings_multiple_inputs(context): context.prompts.clear() -@step(u'concurrent embedding requests') +@step('concurrent embedding requests') @async_run_until_complete() async def step_concurrent_embedding_requests(context): await concurrent_requests(context, @@ -529,7 +530,7 @@ async def step_concurrent_embedding_requests(context): base_url=context.base_url) -@step(u'concurrent OAI embedding requests') +@step('concurrent OAI embedding requests') @async_run_until_complete() async def step_concurrent_oai_embedding_requests(context): await concurrent_requests(context, @@ -540,7 +541,7 @@ async def step_concurrent_oai_embedding_requests(context): model=context.model) -@step(u'all embeddings are generated') +@step('all embeddings are generated') @async_run_until_complete() async def all_embeddings_are_generated(context): n_embedding_requests = await gather_tasks_results(context) @@ -549,10 +550,10 @@ async def all_embeddings_are_generated(context): assert_embeddings(context.tasks_result.pop().pop()) -@step(u'tokenizing') +@step('tokenizing') @async_run_until_complete async def step_tokenize(context): - context.tokenized_text = context.text + context.tokenized_text = context_text(context) async with aiohttp.ClientSession() as session: async with session.post(f'{context.base_url}/tokenize', json={ @@ -563,7 +564,7 @@ async def step_tokenize(context): context.tokens = tokenize_json['tokens'] -@step(u'tokens can be detokenize') +@step('tokens can be detokenize') @async_run_until_complete async def step_detokenize(context): assert len(context.tokens) > 0 @@ -578,7 +579,7 @@ async def step_detokenize(context): assert context.tokenized_text == detokenize_json['content'].strip() -@step(u'an OPTIONS request is sent from {origin}') +@step('an OPTIONS request is sent from {origin}') @async_run_until_complete async def step_options_request(context, origin): async with aiohttp.ClientSession() as session: @@ -589,12 +590,12 @@ async def step_options_request(context, origin): context.options_response = response -@step(u'CORS header {cors_header} is set to {cors_header_value}') +@step('CORS header {cors_header} is set to {cors_header_value}') def step_check_options_header_value(context, cors_header, cors_header_value): assert context.options_response.headers[cors_header] == cors_header_value -@step(u'prometheus metrics are exposed') +@step('prometheus metrics are exposed') @async_run_until_complete async def step_prometheus_metrics_exported(context): async with aiohttp.ClientSession() as session: @@ -616,14 +617,14 @@ async def step_prometheus_metrics_exported(context): assert metric_exported, "No metrics exported" -@step(u'metric {metric_name} is {metric_value:d}') +@step('metric {metric_name} is {metric_value:d}') def step_assert_metric_value(context, metric_name, metric_value): if metric_name not in context.metrics: assert False, f"no metric {metric_name} in {context.metrics.keys()}" assert context.metrics[metric_name].samples[0].value == metric_value, f"metric: {context.metrics[metric_name]}" -@step(u'available models') +@step('available models') def step_available_models(context): # openai client always expects an api_key openai.api_key = context.user_api_key if context.user_api_key is not None else 'nope' @@ -631,14 +632,14 @@ def step_available_models(context): context.models = openai.Model.list().data -@step(u'{n_model:d} models are supported') +@step('{n_model:d} models are supported') def step_supported_models(context, n_model): if context.debug: print("server models available:", context.models) assert len(context.models) == n_model -@step(u'model {i_model:d} is {param} {preposition} {param_value}') +@step('model {i_model:d} is {param} {preposition} {param_value}') def step_supported_models(context, i_model, param, preposition, param_value): assert i_model < len(context.models) model = context.models[i_model] @@ -1007,12 +1008,22 @@ async def completions_seed(context): else context.server_seed if hasattr(context, 'server_seed') else None +def context_text(context): + return context.text.replace('\r', '') + + def start_server_background(context): - context.server_path = '../../../build/bin/server' + if os.name == 'nt': + context.server_path = '../../../build/bin/Release/server.exe' + else: + context.server_path = '../../../build/bin/server' if 'LLAMA_SERVER_BIN_PATH' in os.environ: context.server_path = os.environ['LLAMA_SERVER_BIN_PATH'] + server_listen_addr = context.server_fqdn + if os.name == 'nt': + server_listen_addr = '0.0.0.0' server_args = [ - '--host', context.server_fqdn, + '--host', server_listen_addr, '--port', context.server_port, '--model', context.model_file ] @@ -1045,7 +1056,16 @@ def start_server_background(context): if 'SERVER_LOG_FORMAT_JSON' not in os.environ: server_args.extend(['--log-format', "text"]) print(f"starting server with: {context.server_path} {server_args}\n") + flags = 0 + if 'nt' == os.name: + flags |= subprocess.DETACHED_PROCESS + flags |= subprocess.CREATE_NEW_PROCESS_GROUP + flags |= subprocess.CREATE_NO_WINDOW + + pkwargs = { + 'creationflags': flags, + } context.server_process = subprocess.Popen( [str(arg) for arg in [context.server_path, *server_args]], - close_fds=True) - print(f"server pid={context.server_process.pid}") + **pkwargs) + print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}") From bf47a5eefc669fdba71af096942af999bc1167d4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 10 Mar 2024 20:09:24 +0200 Subject: [PATCH 26/39] ggml : remove __constant__ specifier for CUDA tables (#5940) --- ggml-common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-common.h b/ggml-common.h index 4b6d248b6db90..84981212cfc46 100644 --- a/ggml-common.h +++ b/ggml-common.h @@ -17,7 +17,7 @@ #elif defined(GGML_COMMON_IMPL_CUDA) #include -#define GGML_TABLE_BEGIN(type, name, size) static const __device__ __constant__ type name[size] = { +#define GGML_TABLE_BEGIN(type, name, size) static const __device__ type name[size] = { #define GGML_TABLE_END() }; #define GGML_COMMON_IMPL From df4dc3e7cb43162862f339525638ba4febcb6158 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 8 Mar 2024 23:45:07 +0200 Subject: [PATCH 27/39] ggml : try fix 32-bit arm compat (whisper/1938) * ggml : try fix 32-bit arm compat * ggml : fix cont --- ggml-quants.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/ggml-quants.c b/ggml-quants.c index 807c5e391069c..42d8a5d805144 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -9009,8 +9009,8 @@ void ggml_vec_dot_iq2_s_q8_K(int n, float * restrict s, size_t bs, const void * static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,}; - const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1); - const uint8x16_t mask2 = vld1q_u8(k_mask2); + const ggml_uint8x16x2_t mask1 = ggml_vld1q_u8_x2(k_mask1); + const uint8x16_t mask2 = vld1q_u8(k_mask2); const uint8x16_t m1 = vdupq_n_u8(1); const int32x4_t vzero = vdupq_n_s32(0); @@ -9354,11 +9354,12 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1}; - const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1); - const uint8x16_t mask2 = vld1q_u8(k_mask2); - const int16x8_t hshift = vld1q_s16(k_shift); - const uint16x8_t m256 = vdupq_n_u16(256); - const uint8x16_t m1 = vdupq_n_u8(1); + const ggml_uint8x16x2_t mask1 = ggml_vld1q_u8_x2(k_mask1); + const uint8x16_t mask2 = vld1q_u8(k_mask2); + + const int16x8_t hshift = vld1q_s16(k_shift); + const uint16x8_t m256 = vdupq_n_u16(256); + const uint8x16_t m1 = vdupq_n_u8(1); uint8x16x2_t vs; ggml_int8x16x4_t q3s; From b838b53ad6de2e53f23ddf8f3ad5e6891cc3dd05 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 10 Mar 2024 20:10:46 +0200 Subject: [PATCH 28/39] sync : ggml --- scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index afe68077d0ec9..7c30162e2c570 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -8695910a39102609073d0e099aa7c97d6bcb3bf9 +43a6d4af1971ee2912ff7bc2404011ff327b6a60 From d9f65c97c3dc3aa6fa27470b8c6e69b437ec1a27 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 10 Mar 2024 20:58:26 +0200 Subject: [PATCH 29/39] readme : update hot topics --- README.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index e3ec0817ae0e5..98fdc6808ffe3 100644 --- a/README.md +++ b/README.md @@ -8,11 +8,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++ -> [!IMPORTANT] -> **Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962** -> -> Vote for which quantization type provides better responses, all other parameters being the same. - ### Recent API changes - [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_max_seq()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328 @@ -21,6 +16,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ### Hot topics +- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981 +- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962 - Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328 ---- From 7ab7b733bb48250b2df26c12b00256ef42c76932 Mon Sep 17 00:00:00 2001 From: Dean Date: Mon, 11 Mar 2024 04:03:17 +0800 Subject: [PATCH 30/39] android : fix utf8 decoding error (#5935) * examples: fix utf8 decoding error some models have a tokenizer that decodes an id into an incomplete utf8 sequence, need to validate and wait for next token one example would be: https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-GGUF/resolve/main/qwen1_5-1_8b-chat-q4_0.gguf and and an example of the token is 18137 * android : minor --------- Co-authored-by: zhangfuwen Co-authored-by: Georgi Gerganov --- .../app/src/main/cpp/llama-android.cpp | 53 ++++++++++++++++++- .../src/main/java/com/example/llama/Llm.kt | 4 +- 2 files changed, 53 insertions(+), 4 deletions(-) diff --git a/examples/llama.android/app/src/main/cpp/llama-android.cpp b/examples/llama.android/app/src/main/cpp/llama-android.cpp index 2beb1e0d5321d..ce8ab3b709407 100644 --- a/examples/llama.android/app/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/app/src/main/cpp/llama-android.cpp @@ -33,6 +33,45 @@ jclass la_int_var; jmethodID la_int_var_value; jmethodID la_int_var_inc; +std::string cached_token_chars; + +bool is_valid_utf8(const char * string) { + if (!string) { + return true; + } + + const unsigned char * bytes = (const unsigned char *)string; + int num; + + while (*bytes != 0x00) { + if ((*bytes & 0x80) == 0x00) { + // U+0000 to U+007F + num = 1; + } else if ((*bytes & 0xE0) == 0xC0) { + // U+0080 to U+07FF + num = 2; + } else if ((*bytes & 0xF0) == 0xE0) { + // U+0800 to U+FFFF + num = 3; + } else if ((*bytes & 0xF8) == 0xF0) { + // U+10000 to U+10FFFF + num = 4; + } else { + return false; + } + + bytes += 1; + for (int i = 1; i < num; ++i) { + if ((*bytes & 0xC0) != 0x80) { + return false; + } + bytes += 1; + } + } + + return true; +} + static void log_callback(ggml_log_level level, const char * fmt, void * data) { if (level == GGML_LOG_LEVEL_ERROR) __android_log_print(ANDROID_LOG_ERROR, TAG, fmt, data); else if (level == GGML_LOG_LEVEL_INFO) __android_log_print(ANDROID_LOG_INFO, TAG, fmt, data); @@ -295,6 +334,8 @@ Java_com_example_llama_Llm_completion_1init( jint n_len ) { + cached_token_chars.clear(); + const auto text = env->GetStringUTFChars(jtext, 0); const auto context = reinterpret_cast(context_pointer); const auto batch = reinterpret_cast(batch_pointer); @@ -372,8 +413,16 @@ Java_com_example_llama_Llm_completion_1loop( } auto new_token_chars = llama_token_to_piece(context, new_token_id); - LOGi("new_token_chars: `%s`", new_token_chars.c_str()); - auto new_token = env->NewStringUTF(new_token_chars.c_str()); + cached_token_chars += new_token_chars; + + jstring new_token = nullptr; + if (is_valid_utf8(cached_token_chars.c_str())) { + new_token = env->NewStringUTF(cached_token_chars.c_str()); + LOGi("cached: %s, new_token_chars: `%s`, id: %d", cached_token_chars.c_str(), new_token_chars.c_str(), new_token_id); + cached_token_chars.clear(); + } else { + new_token = env->NewStringUTF(""); + } llama_batch_clear(*batch); llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true); diff --git a/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt b/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt index 5f32703724a49..d86afee379083 100644 --- a/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt +++ b/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt @@ -71,7 +71,7 @@ class Llm { batch: Long, nLen: Int, ncur: IntVar - ): String + ): String? private external fun kv_cache_clear(context: Long) @@ -115,7 +115,7 @@ class Llm { val ncur = IntVar(completion_init(state.context, state.batch, message, nlen)) while (ncur.value <= nlen) { val str = completion_loop(state.context, state.batch, nlen, ncur) - if (str.isEmpty()) { + if (str == null) { break } emit(str) From bb6d00bbf98476215596b9df3870b5504ef5a29a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 10 Mar 2024 23:12:48 +0200 Subject: [PATCH 31/39] metal : move mm_id indices to shared mem (#5982) --- ggml-metal.m | 6 +++--- ggml-metal.metal | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 00df2283821b2..3cf80de7bf2e0 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1642,8 +1642,8 @@ static enum ggml_status ggml_metal_graph_compute( // TODO: make this more general GGML_ASSERT(n_as <= 8); - // max size of the src1ids array in the kernel stack - GGML_ASSERT(ne11 <= 512); + // max size of the src1ids array in the kernel shared buffer + GGML_ASSERT(ne11 <= 4096); const int64_t ne20 = src2 ? src2->ne[0] : 0; const int64_t ne21 = src2 ? src2->ne[1] : 0; @@ -1741,7 +1741,7 @@ static enum ggml_status ggml_metal_graph_compute( [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j]; } - [encoder setThreadgroupMemoryLength:8192 atIndex:0]; + [encoder setThreadgroupMemoryLength:GGML_PAD(8192 + 2*ne11, 16) atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; } else { diff --git a/ggml-metal.metal b/ggml-metal.metal index 6ebbbd195e7ce..50185ae4dea09 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -5386,7 +5386,7 @@ template Date: Mon, 11 Mar 2024 01:13:57 +0000 Subject: [PATCH 32/39] [SYCL] Add support for SYCL Nvidia target (#5738) * Add support for nvidia target in CMake * Update sycl read-me for Nvidia target * Fix errors --- CMakeLists.txt | 14 +++++++++++++- README-sycl.md | 26 ++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9309ca6bbe4d7..00a26391a6626 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,6 +116,7 @@ option(LLAMA_MPI "llama: use MPI" option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) option(LLAMA_SYCL "llama: use SYCL" OFF) option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF) +set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device") option(LLAMA_CPU_HBM "llama: use memkind for CPU HBM" OFF) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) @@ -534,6 +535,10 @@ if (LLAMA_HIPBLAS) endif() if (LLAMA_SYCL) + if (NOT LLAMA_SYCL_TARGET MATCHES "^(INTEL|NVIDIA)$") + message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA") + endif() + if ( NOT DEFINED ENV{ONEAPI_ROOT}) message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh") endif() @@ -555,6 +560,9 @@ if (LLAMA_SYCL) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib") + if (LLAMA_SYCL_TARGET STREQUAL "NVIDIA") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") + endif() set(GGML_HEADERS_SYCL ggml-sycl.h) set(GGML_SOURCES_SYCL ggml-sycl.cpp) @@ -562,7 +570,11 @@ if (LLAMA_SYCL) if (WIN32) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib) else() - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) + if (LLAMA_SYCL_TARGET STREQUAL "INTEL") + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) + elseif (LLAMA_SYCL_TARGET STREQUAL "NVIDIA") + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl pthread m dl onemkl) + endif() endif() endif() diff --git a/README-sycl.md b/README-sycl.md index 85eb16f2be340..9359a94901677 100644 --- a/README-sycl.md +++ b/README-sycl.md @@ -73,6 +73,29 @@ For iGPU, please make sure the shared memory from host memory is enough. For lla For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, recommend the device memory is 4GB+. +## Nvidia GPU + +### Verified + +|Intel GPU| Status | Verified Model| +|-|-|-| +|Ampere Series| Support| A100| + +### oneMKL + +The current oneMKL release does not contain the oneMKL cuBlas backend. +As a result for Nvidia GPU's oneMKL must be built from source. + +``` +git clone https://github.com/oneapi-src/oneMKL +cd oneMKL +mkdir build +cd build +cmake -G Ninja .. -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON +ninja +// Add paths as necessary +``` + ## Docker Note: @@ -186,6 +209,9 @@ source /opt/intel/oneapi/setvars.sh # Or, for FP32: cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx +# For Nvidia GPUs +cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx + # Build example/main only #cmake --build . --config Release --target main From ef3ced26a3817d92890b97b83acaeb018ade02d0 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Mon, 11 Mar 2024 10:27:56 +0530 Subject: [PATCH 33/39] [SYCL] Add q3_s and q1_s (#5886) * Add q3_s and q1_s * fix compilation * fix build * fix build * fix build * enable ops * rm macro * increase grid space --- ggml-sycl.cpp | 378 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 375 insertions(+), 3 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 6fafa5282c8f9..6d56845821f69 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -3494,6 +3494,31 @@ typedef struct dpct_type_block_iq3_xxs { } block_iq3_xxs; static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding"); +#define QR3_XS 8 +#define QI3_XS (QK_K / (4*QR3_XS)) +#if QK_K == 64 +#define IQ3S_N_SCALE 2 +#else +#define IQ3S_N_SCALE QK_K/64 +#endif +typedef struct { + sycl::half d; + uint8_t qs[QK_K/4]; + uint8_t qh[QK_K/32]; + uint8_t signs[QK_K/8]; + uint8_t scales[IQ3S_N_SCALE]; +} block_iq3_s; +static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding"); + +#define QR1_S 8 +#define QI1_S (QK_K / (4*QR1_S)) +typedef struct { + sycl::half d; + uint8_t qs[QK_K/8]; + uint8_t scales[QK_K/16]; +} block_iq1_s; +static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding"); + #define WARP_SIZE 32 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses @@ -4833,6 +4858,62 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res } +template +static void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint32_t *iq3s_grid, + const uint8_t *ksigns_iq2xs, + const uint8_t *kmask_iq2xs) { + + const int i = item_ct1.get_group(2); + const block_iq3_s * x = (const block_iq3_s *) vx; + + const int tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint8_t * qs = x[i].qs + 8*ib; + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + qs[2*il+0]); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + qs[2*il+1]); + const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)); + const uint8_t signs = x[i].signs[4*ib + il]; + for (int j = 0; j < 4; ++j) { + y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); + y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); + } +#else + assert(false); +#endif + +} + +template +static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint64_t *iq1s_grid, + const uint8_t *ksigns_iq2xs, + const uint8_t *kmask_iq2xs) { + + const int i = item_ct1.get_group(2); + const block_iq1_s * x = (const block_iq1_s *) vx; + + const int tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const int i8 = 4*ib+il; + uint8_t h = x[i].scales[i8/2] >> 4*(i8%2); + const int8_t * grid = (const int8_t *)(iq1s_grid + (x[i].qs[i8] | ((h & 8) << 5))); + const float d = (float)x[i].d * (2*(h & 7) + 1); + for (int j = 0; j < 8; ++j) y[j] = d * grid[j]; +#else + assert(false); +#endif + +} + /* DPCT1110:4: The total declared local variable size in device function dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register @@ -7679,6 +7760,76 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq, #endif } +static __dpct_inline__ float +vec_dot_iq3_s_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const uint32_t *iq3s_grid, const uint64_t *ksigns64) { +#if DPCT_COMPATIBILITY_TEMP >= \ + MIN_CC_DP4A // lowest compute capability for integer intrinsics +#if QK_K == 256 + const block_iq3_s * bq2 = (const block_iq3_s *) vbq; + + const int ib32 = iqs; + const uint8_t * qs = bq2->qs + 8*ib32; + const int8_t * q8 = bq8_1[ib32].qs; + int sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)); + const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)); + uint32_t signs0 = dpct::vectorized_binary( + ((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>()); + uint32_t signs1 = dpct::vectorized_binary( + ((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>()); + const int grid_l = dpct::vectorized_binary( + grid1[0] ^ signs0, signs0, std::minus<>()); + const int grid_h = dpct::vectorized_binary( + grid2[0] ^ signs1, signs1, std::minus<>()); + sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi); + sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi); + q8 += 8; + } + const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * bq8_1[ib32].ds[0]; + return d * sumi; +#else + assert(false); + return 0.f; +#endif +#else + assert(false); + return 0.f; +#endif +} + +static __dpct_inline__ float +vec_dot_iq1_s_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const uint64_t *iq1s_grid, const uint64_t *ksigns64) { +#if QK_K == 256 + const block_iq1_s * bq1 = (const block_iq1_s *) vbq; + + const int ib32 = iqs; + int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0; + const uint8_t h1 = bq1->scales[2*ib32+0]; + const uint8_t h2 = bq1->scales[2*ib32+1]; + const int * q8 = (const int *)bq8_1[ib32].qs; + const int * grid1 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5))); + const int * grid2 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1))); + const int * grid3 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5))); + const int * grid4 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1))); + for (int j = 0; j < 2; ++j) { + sumi1 = dpct::dp4a(q8[j+0], grid1[j], sumi1); + sumi2 = dpct::dp4a(q8[j+2], grid2[j], sumi2); + sumi3 = dpct::dp4a(q8[j+4], grid3[j], sumi3); + sumi4 = dpct::dp4a(q8[j+6], grid4[j], sumi4); + } + const float d = (float)bq1->d * bq8_1[ib32].ds[0]; + return d * (sumi1 * (2*(h1 & 7) + 1) + sumi2 * (2*((h1 >> 4) & 7) + 1) + + sumi3 * (2*(h2 & 7) + 1) + sumi4 * (2*((h2 >> 4) & 7) + 1)); +#else + assert(false); + return 0.f; +#endif +} template +static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, + const sycl::nd_item<3> &item_ct1, + const uint32_t *iq3s_grid_ptr, const uint64_t *ksigns64_ptr ) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid_ptr, ksigns64_ptr); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +template +static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, + const sycl::nd_item<3> &item_ct1, + const uint64_t *iq1s_grid_ptr, const uint64_t *ksigns64_ptr ) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_ptr, ksigns64_ptr); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + template static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows, const sycl::nd_item<3> &item_ct1) { @@ -10129,6 +10372,64 @@ static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k, } } +template +static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + iq3s_grid.init(*stream); + ksigns_iq2xs.init(*stream); + kmask_iq2xs.init(*stream); + + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr(); + auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr(); + auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr(); + + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq3_s( + vx, y, item_ct1, iq3s_grid_ptr_ct1, + ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1); + }); + }); + } +} + +template +static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + iq1s_grid.init(*stream); + ksigns_iq2xs.init(*stream); + kmask_iq2xs.init(*stream); + + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + auto iq1s_grid_ptr_ct1 = iq1s_grid.get_ptr(); + auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr(); + auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr(); + + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq1_s( + vx, y, item_ct1, iq1s_grid_ptr_ct1, + ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1); + }); + }); + } +} + template static void convert_unary_sycl(const void *__restrict__ vx, dst_t *__restrict__ y, const int k, @@ -10179,6 +10480,10 @@ static to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) try { return dequantize_row_iq2_xs_sycl; case GGML_TYPE_IQ3_XXS: return dequantize_row_iq3_xxs_sycl; + case GGML_TYPE_IQ3_S: + return dequantize_row_iq3_s_sycl; + case GGML_TYPE_IQ1_S: + return dequantize_row_iq1_s_sycl; case GGML_TYPE_F32: return convert_unary_sycl; default: @@ -10219,6 +10524,10 @@ static to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) { return dequantize_row_iq2_xs_sycl; case GGML_TYPE_IQ3_XXS: return dequantize_row_iq3_xxs_sycl; + case GGML_TYPE_IQ3_S: + return dequantize_row_iq3_s_sycl; + case GGML_TYPE_IQ1_S: + return dequantize_row_iq1_s_sycl; case GGML_TYPE_F16: return convert_unary_sycl; default: @@ -10808,6 +11117,61 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy, } } +static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + iq3s_grid.init(*stream); + ksigns64.init(*stream); + + stream->submit([&](sycl::handler &cgh) { + auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr(); + auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q_iq3_s_q8_1( + vx, vy, dst, ncols, nrows, item_ct1, + iq3s_grid_ptr_ct1, ksigns64_ptr_ct1); + }); + }); + } +} + +static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + iq1s_grid.init(*stream); + ksigns64.init(*stream); + + stream->submit([&](sycl::handler &cgh) { + auto iq1s_grid_ptr_ct1 = iq1s_grid.get_ptr(); + auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q_iq1_s_q8_1( + vx, vy, dst, ncols, nrows, item_ct1, + iq1s_grid_ptr_ct1, ksigns64_ptr_ct1); + }); + }); + } +} static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols_x, @@ -13556,8 +13920,11 @@ static int64_t get_row_rounding(ggml_type type, const std::array= VER_GEN9 ? 128 : 64; + case GGML_TYPE_IQ3_S: + return max_compute_capability >= VER_GEN9 ? 128 : 64; case GGML_TYPE_Q6_K: return 64; default: @@ -13618,6 +13985,12 @@ inline void ggml_sycl_op_mul_mat_vec_q( case GGML_TYPE_IQ3_XXS: mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); break; + case GGML_TYPE_IQ3_S: + mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ1_S: + mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; default: GGML_ASSERT(false); break; @@ -16963,9 +17336,8 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons return false; } ggml_type a_type = a->type; - if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS || - a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S || - a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) { + if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ2_S || + a_type == GGML_TYPE_IQ4_XS) { return false; } return true; From be858f620508385ad12d0e5e862010e666ca729c Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Mon, 11 Mar 2024 07:51:49 +0100 Subject: [PATCH 34/39] Better 1.5 bit quantization (#5971) * Trying blocvks of 16 for IQ1_S - seems slightly better * iq1s_blocks16: Adjust scale fudge factor to 1.125 * iq1s_blocks16: going to blocks of 32 with 2048 lattice points, so same bpw. This is even better than blocks of 16. Should I try blocks of 64? But to keep the same bpw, when I go to 4096 lattice points, I need to remove blocks alltogether and just have superblocks of 256 weights. * iq1s_blocks16: Use 2* as sigma2 in weight adjustment * iq1s_blocks16: scalar and AVX2 dot products * iq1s_blocks16: CUDA dot product * iq1s_blocks16: Metal works, Neon does not Metal works but TG is dog slow (35 t/s). PP is OKish (493 t/s). Not seeing the bug in the Neon implementation for now. * iq1s_blocks16: fixed Neon * iq1s_blocks16: very slightly faster TG on Metal Still pathetic at 37 t/s * iq1s_blocks16: speedup Metal by packing codebook into uint32_t's * Formatting * iq1s_blocks16: uint32_t codebook is also better in CUDA TG-128 is now 204 t/s up from 194 t/s. PP-512 is 5890 t/s, so significantly better than other quants * iq1s_blocks16: slightly faster Neon dot product * iq1s_blocks16: faster AVX2 dot product * iq1s_blocks16: adjust to ggml-common.h --------- Co-authored-by: Iwan Kawrakow --- ggml-common.h | 903 ++++++++++++++++++++++++++++++++++++++++------- ggml-cuda.cu | 62 ++-- ggml-metal.metal | 66 ++-- ggml-quants.c | 512 ++++++++++++++++----------- ggml-quants.h | 4 +- 5 files changed, 1153 insertions(+), 394 deletions(-) diff --git a/ggml-common.h b/ggml-common.h index 84981212cfc46..5dd918081c8dd 100644 --- a/ggml-common.h +++ b/ggml-common.h @@ -644,136 +644,781 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512) 0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101, GGML_TABLE_END() -#define NGRID_IQ2XXS 512 -GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ2XXS) - 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000, - 0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01, - 0xffffff00ff000000, 0xffffff000000ff00, 0xffffff00000000ff, 0xffffff0000000100, - 0xffffff0000010000, 0xffffff0001000000, 0xffffff01ffff00ff, 0xffffff01ff01ff00, - 0xffffff01ff010100, 0xffffff0100000001, 0xffffff0101ffff00, 0xffffff0101ff0101, - 0xffffff0101010100, 0xffff00ffff00ff01, 0xffff00ffff0000ff, 0xffff00ff00ff0100, - 0xffff00ff0100ff00, 0xffff00ff010001ff, 0xffff0000ff0101ff, 0xffff000000ffff00, - 0xffff000000000000, 0xffff00000001ff01, 0xffff000001000101, 0xffff0000010100ff, - 0xffff0001ffff0100, 0xffff00010000ff00, 0xffff000100010101, 0xffff000101000000, - 0xffff01ffffff0000, 0xffff01ffff01ffff, 0xffff01ffff010100, 0xffff01ff00000000, - 0xffff01ff01ffffff, 0xffff01ff01ff0001, 0xffff01ff0101ffff, 0xffff01ff01010001, - 0xffff0100ffffff01, 0xffff01000000ffff, 0xffff010000000100, 0xffff010001ff01ff, - 0xffff010001000000, 0xffff0101ff000000, 0xffff0101000101ff, 0xffff010101ffff01, - 0xffff01010101ff00, 0xff00ffffff000000, 0xff00ffff00ffff00, 0xff00ffff00000001, - 0xff00ffff000001ff, 0xff00ffff01010000, 0xff00ff00ffff0000, 0xff00ff00ff00ff00, - 0xff00ff00ff0000ff, 0xff00ff00ff000100, 0xff00ff00ff010001, 0xff00ff0000ff0001, - 0xff00ff000000ffff, 0xff00ff0000000000, 0xff00ff000001ff00, 0xff00ff0000010100, - 0xff00ff0001ff0000, 0xff00ff000100ff00, 0xff00ff0001000100, 0xff00ff01ff000000, - 0xff00ff0100ff0000, 0xff00ff01000001ff, 0xff00ff0101010001, 0xff0000ff00000000, - 0xff0000ff0001ff00, 0xff0000ff00010100, 0xff000000ffff0101, 0xff000000ff000000, - 0xff000000ff01ff00, 0xff00000000ff0000, 0xff0000000000ff00, 0xff000000000000ff, - 0xff00000000000000, 0xff00000000000001, 0xff00000000000100, 0xff0000000001ffff, - 0xff00000000010000, 0xff00000001000000, 0xff00000001010100, 0xff000001ff00ff01, - 0xff000001ff0100ff, 0xff00000100000000, 0xff0000010001ff00, 0xff00000101ff0100, - 0xff0000010100ff00, 0xff0001ff00ff00ff, 0xff0001ff00000101, 0xff0001ff000100ff, - 0xff0001ff01000000, 0xff000100ff0001ff, 0xff0001000000ff01, 0xff00010000000000, - 0xff00010000010001, 0xff00010000010100, 0xff00010001ffff00, 0xff00010001ff0101, - 0xff00010001010000, 0xff000101ffffffff, 0xff000101ff000101, 0xff00010101ff00ff, - 0xff00010101000001, 0xff000101010100ff, 0xff01ffffff000101, 0xff01ffffff01ffff, - 0xff01ffffff01ff01, 0xff01ffffff0101ff, 0xff01ffff00000000, 0xff01ffff01ff0001, - 0xff01ffff0101ff01, 0xff01ff00ff000000, 0xff01ff0000ff0100, 0xff01ff000000ff01, - 0xff01ff0000010000, 0xff01ff00010000ff, 0xff01ff01ff01ff00, 0xff01ff0100000101, - 0xff0100ffffff0000, 0xff0100ffff010000, 0xff0100ff01ff00ff, 0xff0100ff01000100, - 0xff0100ff010100ff, 0xff010000ffffff01, 0xff01000000000000, 0xff0100000101ff00, - 0xff010001ffff00ff, 0xff010001ff000100, 0xff01000100ffff00, 0xff01000100010001, - 0xff01000101ff0001, 0xff010001010001ff, 0xff0101ffffffffff, 0xff0101ffff01ffff, - 0xff0101ffff010101, 0xff0101ff0000ff00, 0xff0101ff01010001, 0xff010100ff000000, - 0xff010100ff01ff01, 0xff01010000ff0001, 0xff01010000000100, 0xff01010001000000, - 0xff0101010100ffff, 0x00ffffff0000ff01, 0x00ffffff000000ff, 0x00ffffff00000100, - 0x00ffffff00010000, 0x00ffff00ffff0001, 0x00ffff00ff0000ff, 0x00ffff00ff000100, - 0x00ffff0000000000, 0x00ffff0001000100, 0x00ffff0001010001, 0x00ffff01ff00ff01, - 0x00ffff0100ff0100, 0x00ffff010000ff00, 0x00ffff01000100ff, 0x00ffff0101ff00ff, - 0x00ffff010101ff00, 0x00ff00ffffffffff, 0x00ff00ffffff01ff, 0x00ff00ffff000101, - 0x00ff00ff00000000, 0x00ff00ff000101ff, 0x00ff00ff01010101, 0x00ff0000ff000000, - 0x00ff0000ff01ffff, 0x00ff000000ff0000, 0x00ff00000000ff00, 0x00ff0000000000ff, - 0x00ff000000000000, 0x00ff000000000001, 0x00ff000000000100, 0x00ff000000010000, - 0x00ff000001ffff01, 0x00ff000001000000, 0x00ff0001ff000101, 0x00ff000100ffffff, - 0x00ff000100000000, 0x00ff0001010001ff, 0x00ff01ffff000000, 0x00ff01ff0001ff00, - 0x00ff01ff01ff0100, 0x00ff0100ff01ff01, 0x00ff010000ff00ff, 0x00ff010000ff0101, - 0x00ff010000000000, 0x00ff010000010101, 0x00ff01000100ff00, 0x00ff010001010000, - 0x00ff0101ffffff00, 0x00ff01010000ff01, 0x00ff010100000100, 0x00ff010101ff0000, - 0x0000ffffffff0100, 0x0000ffffff00ff00, 0x0000ffffff0000ff, 0x0000ffffff010000, - 0x0000ffff00000000, 0x0000ffff00010101, 0x0000ffff01ffff01, 0x0000ffff01000100, - 0x0000ff00ff000000, 0x0000ff00ff01ff00, 0x0000ff00ff0101ff, 0x0000ff0000ff0000, - 0x0000ff000000ff00, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001, - 0x0000ff0000000100, 0x0000ff0000010000, 0x0000ff0001ffffff, 0x0000ff0001ff01ff, - 0x0000ff0001000000, 0x0000ff000101ffff, 0x0000ff01ffff0101, 0x0000ff01ff010000, - 0x0000ff0100000000, 0x0000ff0101000101, 0x000000ffffff0001, 0x000000ffff000000, - 0x000000ff00ff0000, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000, - 0x000000ff00000001, 0x000000ff00000100, 0x000000ff00010000, 0x000000ff01000000, - 0x000000ff0101ff00, 0x00000000ffff0000, 0x00000000ff00ff00, 0x00000000ff0000ff, - 0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff010000, - 0x0000000000ffff00, 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001, +#define NGRID_IQ1S 2048 +#if defined(GGML_COMMON_IMPL_C) +GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S) + 0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff, + 0xffffffffffff0101, 0xffffffffff00ff00, 0xffffffffff000000, 0xffffffffff01ffff, + 0xffffffffff01ff01, 0xffffffffff0101ff, 0xffffffffff010101, 0xffffffff00ff0000, + 0xffffffff0000ff00, 0xffffffff000000ff, 0xffffffff00000001, 0xffffffff00010000, + 0xffffffff01ffffff, 0xffffffff01ffff01, 0xffffffff01ff01ff, 0xffffffff01ff0101, + 0xffffffff01000000, 0xffffffff0101ffff, 0xffffffff0101ff01, 0xffffffff010101ff, + 0xffffffff01010101, 0xffffff00ffff00ff, 0xffffff00ffff0000, 0xffffff00ff00ff00, + 0xffffff00ff0000ff, 0xffffff00ff000001, 0xffffff00ff000100, 0xffffff00ff000101, + 0xffffff00ff010000, 0xffffff0000ffff00, 0xffffff0000ff0001, 0xffffff0000ff0100, + 0xffffff000000ff01, 0xffffff0000000000, 0xffffff0000000101, 0xffffff000001ff00, + 0xffffff00000100ff, 0xffffff0000010001, 0xffffff00000101ff, 0xffffff0001ff0000, + 0xffffff000100ff00, 0xffffff00010000ff, 0xffffff0001000001, 0xffffff0001010000, + 0xffffff01ffffffff, 0xffffff01ffffff01, 0xffffff01ffff01ff, 0xffffff01ffff0101, + 0xffffff01ff000000, 0xffffff01ff01ffff, 0xffffff01ff01ff01, 0xffffff01ff0101ff, + 0xffffff01ff010101, 0xffffff0100ff0000, 0xffffff010000ff00, 0xffffff0100000100, + 0xffffff01000100ff, 0xffffff0100010100, 0xffffff0101ffffff, 0xffffff0101ffff01, + 0xffffff0101ff01ff, 0xffffff0101ff0101, 0xffffff010100ff00, 0xffffff0101000000, + 0xffffff0101000100, 0xffffff010101ffff, 0xffffff010101ff01, 0xffffff01010101ff, + 0xffffff0101010101, 0xffff00ffff00ff00, 0xffff00ffff0000ff, 0xffff00ffff000001, + 0xffff00ffff010000, 0xffff00ff00ffff00, 0xffff00ff00ff0100, 0xffff00ff00000000, + 0xffff00ff00000101, 0xffff00ff000100ff, 0xffff00ff00010000, 0xffff00ff0100ff00, + 0xffff00ff01000100, 0xffff00ff01010000, 0xffff0000ffffff00, 0xffff0000ffff00ff, + 0xffff0000ffff0000, 0xffff0000ffff0001, 0xffff0000ff000000, 0xffff0000ff0001ff, + 0xffff0000ff000101, 0xffff0000ff010100, 0xffff000000ffffff, 0xffff000000ff0000, + 0xffff000000ff0101, 0xffff00000000ffff, 0xffff00000000ff00, 0xffff0000000000ff, + 0xffff000000000000, 0xffff000000000001, 0xffff000000000100, 0xffff00000001ffff, + 0xffff00000001ff01, 0xffff000000010000, 0xffff0000000101ff, 0xffff000000010101, + 0xffff000001ffff00, 0xffff00000100ff00, 0xffff000001000000, 0xffff0000010001ff, + 0xffff000001000101, 0xffff00000101ff00, 0xffff0000010100ff, 0xffff000001010000, + 0xffff000001010001, 0xffff000001010100, 0xffff0001ff0000ff, 0xffff0001ff000100, + 0xffff000100ffff00, 0xffff000100ff00ff, 0xffff00010000ffff, 0xffff00010000ff01, + 0xffff000100000000, 0xffff0001000001ff, 0xffff00010001ffff, 0xffff00010001ff00, + 0xffff000100010001, 0xffff000100010100, 0xffff000101ff0000, 0xffff00010100ff00, + 0xffff0001010000ff, 0xffff000101000100, 0xffff01ffffffffff, 0xffff01ffffffff01, + 0xffff01ffffff01ff, 0xffff01ffffff0101, 0xffff01ffff000000, 0xffff01ffff01ffff, + 0xffff01ffff01ff01, 0xffff01ffff0101ff, 0xffff01ffff010101, 0xffff01ff00ff0000, + 0xffff01ff0000ff00, 0xffff01ff00000001, 0xffff01ff00010000, 0xffff01ff01ffffff, + 0xffff01ff01ffff01, 0xffff01ff01ff01ff, 0xffff01ff01ff0101, 0xffff01ff01000000, + 0xffff01ff0101ffff, 0xffff01ff0101ff01, 0xffff01ff010101ff, 0xffff01ff01010101, + 0xffff0100ffff0000, 0xffff0100ff00ff00, 0xffff0100ff0000ff, 0xffff0100ff000100, + 0xffff0100ff0100ff, 0xffff0100ff010000, 0xffff010000ffff00, 0xffff01000000ffff, + 0xffff01000000ff00, 0xffff010000000000, 0xffff01000001ff00, 0xffff0100000100ff, + 0xffff010000010100, 0xffff01000100ff00, 0xffff0100010000ff, 0xffff010001000001, + 0xffff010001000100, 0xffff010001010000, 0xffff0101ffffffff, 0xffff0101ffffff01, + 0xffff0101ffff01ff, 0xffff0101ffff0101, 0xffff0101ff000000, 0xffff0101ff01ffff, + 0xffff0101ff01ff01, 0xffff0101ff0101ff, 0xffff0101ff010101, 0xffff010100ff0000, + 0xffff01010000ff00, 0xffff010100000100, 0xffff01010001ff00, 0xffff010100010000, + 0xffff010101ffffff, 0xffff010101ffff01, 0xffff010101ff0000, 0xffff010101ff01ff, + 0xffff010101ff0101, 0xffff010101000000, 0xffff01010101ffff, 0xffff01010101ff01, + 0xffff0101010101ff, 0xffff010101010101, 0xff00ffffff00ffff, 0xff00ffffff00ff00, + 0xff00ffffff0000ff, 0xff00ffffff000100, 0xff00ffffff0100ff, 0xff00ffffff010000, + 0xff00ffff00ffff00, 0xff00ffff00ff00ff, 0xff00ffff0000ffff, 0xff00ffff00000000, + 0xff00ffff000001ff, 0xff00ffff0001ff00, 0xff00ffff000100ff, 0xff00ffff00010000, + 0xff00ffff00010100, 0xff00ffff0100ff00, 0xff00ffff010000ff, 0xff00ffff01000001, + 0xff00ffff0101ff00, 0xff00ffff01010000, 0xff00ff00ffffff00, 0xff00ff00ffff00ff, + 0xff00ff00ffff0001, 0xff00ff00ffff0100, 0xff00ff00ff00ffff, 0xff00ff00ff00ff01, + 0xff00ff00ff000000, 0xff00ff00ff0001ff, 0xff00ff00ff01ff00, 0xff00ff00ff0100ff, + 0xff00ff00ff010100, 0xff00ff0000ff0000, 0xff00ff0000ff0101, 0xff00ff000000ffff, + 0xff00ff000000ff00, 0xff00ff000000ff01, 0xff00ff00000000ff, 0xff00ff0000000000, + 0xff00ff0000000001, 0xff00ff0000000100, 0xff00ff000001ffff, 0xff00ff0000010000, + 0xff00ff0001ff00ff, 0xff00ff000100ff01, 0xff00ff0001000000, 0xff00ff000101ff00, + 0xff00ff00010100ff, 0xff00ff01ff00ff00, 0xff00ff01ff0000ff, 0xff00ff01ff000001, + 0xff00ff01ff010000, 0xff00ff0100ffffff, 0xff00ff0100ff0001, 0xff00ff0100ff0100, + 0xff00ff010000ff01, 0xff00ff0100000000, 0xff00ff01000001ff, 0xff00ff0100000101, + 0xff00ff01000100ff, 0xff00ff0100010001, 0xff00ff0101ff0000, 0xff00ff010100ff00, + 0xff00ff01010000ff, 0xff00ff0101000001, 0xff00ff0101010000, 0xff0000ffffffff00, + 0xff0000ffffff0001, 0xff0000ffffff0100, 0xff0000ffff0000ff, 0xff0000ffff000000, + 0xff0000ffff0001ff, 0xff0000ffff000100, 0xff0000ffff01ff00, 0xff0000ffff010001, + 0xff0000ff00ffff00, 0xff0000ff00ff0000, 0xff0000ff00ff0001, 0xff0000ff00ff01ff, + 0xff0000ff00ff0101, 0xff0000ff0000ff00, 0xff0000ff000000ff, 0xff0000ff00000000, + 0xff0000ff00000001, 0xff0000ff00000100, 0xff0000ff0001ff01, 0xff0000ff00010000, + 0xff0000ff000101ff, 0xff0000ff01ff00ff, 0xff0000ff01ff0100, 0xff0000ff0100ffff, + 0xff0000ff010000ff, 0xff0000ff01000000, 0xff0000ff010001ff, 0xff0000ff01000100, + 0xff0000ff01000101, 0xff0000ff0101ff00, 0xff0000ff010100ff, 0xff0000ff01010000, + 0xff0000ff01010100, 0xff000000ffffff01, 0xff000000ffff0000, 0xff000000ffff0101, + 0xff000000ff00ff00, 0xff000000ff0000ff, 0xff000000ff000000, 0xff000000ff000001, + 0xff000000ff000100, 0xff000000ff01ffff, 0xff000000ff01ff01, 0xff000000ff010000, + 0xff000000ff0101ff, 0xff000000ff010101, 0xff00000000ffff00, 0xff00000000ff00ff, + 0xff00000000ff0000, 0xff00000000ff0001, 0xff0000000000ff00, 0xff0000000000ff01, + 0xff000000000000ff, 0xff00000000000000, 0xff00000000000001, 0xff00000000000100, + 0xff00000000000101, 0xff0000000001ff00, 0xff000000000100ff, 0xff00000000010000, + 0xff00000000010001, 0xff00000000010100, 0xff00000001ffffff, 0xff00000001ffff01, + 0xff00000001ff00ff, 0xff00000001ff0000, 0xff00000001ff01ff, 0xff00000001ff0101, + 0xff0000000100ffff, 0xff0000000100ff00, 0xff000000010000ff, 0xff00000001000000, + 0xff00000001000001, 0xff00000001000100, 0xff00000001000101, 0xff0000000101ffff, + 0xff0000000101ff01, 0xff00000001010000, 0xff000001ffffff00, 0xff000001ffff00ff, + 0xff000001ffff0000, 0xff000001ffff0001, 0xff000001ff000000, 0xff000001ff000001, + 0xff000001ff0001ff, 0xff000001ff000101, 0xff000001ff01ff00, 0xff000001ff010001, + 0xff00000100ffffff, 0xff00000100ffff01, 0xff00000100ff00ff, 0xff00000100ff0000, + 0xff00000100ff01ff, 0xff00000100ff0101, 0xff0000010000ff00, 0xff00000100000000, + 0xff00000100000001, 0xff000001000001ff, 0xff00000100000100, 0xff0000010001ff00, + 0xff000001000100ff, 0xff00000100010000, 0xff000001000101ff, 0xff00000100010100, + 0xff00000100010101, 0xff00000101ff0001, 0xff00000101ff0101, 0xff0000010100ff01, + 0xff00000101000000, 0xff000001010100ff, 0xff00000101010100, 0xff0001ffff00ff00, + 0xff0001ffff000001, 0xff0001ffff010000, 0xff0001ff00ffff00, 0xff0001ff00ff00ff, + 0xff0001ff00ff0001, 0xff0001ff00ff0100, 0xff0001ff0000ffff, 0xff0001ff00000000, + 0xff0001ff000001ff, 0xff0001ff00000101, 0xff0001ff0001ffff, 0xff0001ff0001ff00, + 0xff0001ff000100ff, 0xff0001ff00010001, 0xff0001ff00010100, 0xff0001ff01ff0000, + 0xff0001ff0100ff00, 0xff0001ff010000ff, 0xff0001ff01010000, 0xff000100ff00ffff, + 0xff000100ff00ff01, 0xff000100ff000000, 0xff000100ff000101, 0xff000100ff01ff00, + 0xff000100ff010000, 0xff00010000ffff01, 0xff00010000ff00ff, 0xff00010000ff0000, + 0xff00010000ff01ff, 0xff0001000000ff00, 0xff000100000000ff, 0xff00010000000000, + 0xff00010000000001, 0xff00010000000100, 0xff00010000000101, 0xff0001000001ffff, + 0xff00010000010000, 0xff00010000010101, 0xff00010001ff0100, 0xff0001000100ff00, + 0xff0001000100ff01, 0xff00010001000000, 0xff000100010001ff, 0xff0001000101ff00, + 0xff00010001010001, 0xff00010001010100, 0xff000101ffff0100, 0xff000101ff000001, + 0xff000101ff0100ff, 0xff000101ff010001, 0xff00010100ff00ff, 0xff00010100ff0001, + 0xff00010100ff0100, 0xff0001010000ffff, 0xff0001010000ff01, 0xff00010100000000, + 0xff000101000001ff, 0xff0001010001ff00, 0xff00010100010001, 0xff00010100010100, + 0xff00010101ff0000, 0xff0001010100ff00, 0xff00010101000001, 0xff00010101000101, + 0xff01ffffffffffff, 0xff01ffffffffff01, 0xff01ffffffff01ff, 0xff01ffffffff0101, + 0xff01ffffff000000, 0xff01ffffff01ffff, 0xff01ffffff01ff01, 0xff01ffffff010000, + 0xff01ffffff0101ff, 0xff01ffffff010101, 0xff01ffff00ff0000, 0xff01ffff0000ff00, + 0xff01ffff00000100, 0xff01ffff0001ff00, 0xff01ffff00010000, 0xff01ffff01ffffff, + 0xff01ffff01ffff01, 0xff01ffff01ff01ff, 0xff01ffff01ff0101, 0xff01ffff01000000, + 0xff01ffff0101ffff, 0xff01ffff0101ff01, 0xff01ffff01010000, 0xff01ffff010101ff, + 0xff01ffff01010101, 0xff01ff00ffff0000, 0xff01ff00ff00ff00, 0xff01ff00ff0000ff, + 0xff01ff00ff000100, 0xff01ff00ff010000, 0xff01ff0000ffff01, 0xff01ff0000ff00ff, + 0xff01ff0000ff0100, 0xff01ff0000000000, 0xff01ff00000001ff, 0xff01ff0000000101, + 0xff01ff000001ff00, 0xff01ff00000100ff, 0xff01ff0000010000, 0xff01ff0000010001, + 0xff01ff0001ff0000, 0xff01ff000100ffff, 0xff01ff0001000001, 0xff01ff0001000100, + 0xff01ff0001010000, 0xff01ff01ffffff00, 0xff01ff01ffff01ff, 0xff01ff01ffff0101, + 0xff01ff01ff00ff00, 0xff01ff01ff000000, 0xff01ff01ff01ffff, 0xff01ff01ff01ff01, + 0xff01ff01ff0101ff, 0xff01ff01ff010101, 0xff01ff0100ff0000, 0xff01ff010000ff00, + 0xff01ff0100000001, 0xff01ff0100000100, 0xff01ff0100010000, 0xff01ff0101ffff00, + 0xff01ff0101ff01ff, 0xff01ff0101ff0101, 0xff01ff010100ff00, 0xff01ff0101000000, + 0xff01ff010101ffff, 0xff01ff010101ff01, 0xff01ff01010101ff, 0xff01ff0101010101, + 0xff0100ffffff0000, 0xff0100ffff0000ff, 0xff0100ffff000001, 0xff0100ffff000100, + 0xff0100ffff010000, 0xff0100ff00ff00ff, 0xff0100ff00ff0000, 0xff0100ff00ff0001, + 0xff0100ff00ff0100, 0xff0100ff0000ff01, 0xff0100ff00000000, 0xff0100ff000001ff, + 0xff0100ff00000101, 0xff0100ff00010001, 0xff0100ff01ff0000, 0xff0100ff0100ff00, + 0xff0100ff010000ff, 0xff0100ff01000100, 0xff0100ff0101ff00, 0xff0100ff01010000, + 0xff010000ffff0100, 0xff010000ff000000, 0xff010000ff01ff00, 0xff010000ff010100, + 0xff01000000ffffff, 0xff01000000ff0000, 0xff01000000ff01ff, 0xff0100000000ff00, + 0xff010000000000ff, 0xff01000000000000, 0xff01000000000100, 0xff0100000001ff01, + 0xff01000000010000, 0xff010000000101ff, 0xff01000001ff0100, 0xff0100000100ffff, + 0xff010000010000ff, 0xff01000001000000, 0xff010000010001ff, 0xff01000001000101, + 0xff0100000101ff00, 0xff010000010100ff, 0xff01000001010001, 0xff01000001010100, + 0xff010001ffff0000, 0xff010001ff00ffff, 0xff010001ff00ff01, 0xff010001ff000100, + 0xff010001ff010000, 0xff01000100ffff00, 0xff01000100ff0100, 0xff01000100000000, + 0xff0100010001ffff, 0xff0100010001ff00, 0xff01000100010100, 0xff01000101ff00ff, + 0xff01000101ff0001, 0xff0100010100ffff, 0xff01000101000101, 0xff0101ffffffffff, + 0xff0101ffffffff01, 0xff0101ffffff01ff, 0xff0101ffffff0101, 0xff0101ffff000000, + 0xff0101ffff01ffff, 0xff0101ffff01ff01, 0xff0101ffff0101ff, 0xff0101ffff010101, + 0xff0101ff00ff0000, 0xff0101ff0000ff00, 0xff0101ff000000ff, 0xff0101ff00010000, + 0xff0101ff01ffffff, 0xff0101ff01ffff01, 0xff0101ff01ff01ff, 0xff0101ff01ff0101, + 0xff0101ff0101ffff, 0xff0101ff0101ff01, 0xff0101ff010101ff, 0xff0101ff01010101, + 0xff010100ffff0100, 0xff010100ff00ff00, 0xff010100ff0000ff, 0xff010100ff000100, + 0xff010100ff010000, 0xff01010000ff0001, 0xff01010000ff0100, 0xff0101000000ff01, + 0xff01010000000000, 0xff0101000001ff00, 0xff010100000100ff, 0xff01010000010001, + 0xff01010000010100, 0xff01010001ff0000, 0xff0101000100ffff, 0xff01010001000001, + 0xff01010001000100, 0xff010100010100ff, 0xff01010001010000, 0xff010101ffffffff, + 0xff010101ffffff01, 0xff010101ffff01ff, 0xff010101ffff0101, 0xff010101ff01ffff, + 0xff010101ff01ff01, 0xff010101ff0101ff, 0xff010101ff010101, 0xff01010100ff0000, + 0xff0101010000ff00, 0xff01010100000001, 0xff01010100000100, 0xff01010100010000, + 0xff01010101ffffff, 0xff01010101ffff01, 0xff01010101ff01ff, 0xff01010101ff0101, + 0xff01010101000000, 0xff0101010101ffff, 0xff0101010101ff01, 0xff010101010101ff, + 0xff01010101010101, 0x00ffffffffff0000, 0x00ffffffff00ff00, 0x00ffffffff000001, + 0x00ffffffff010000, 0x00ffffff00ff0100, 0x00ffffff0000ff01, 0x00ffffff00000000, + 0x00ffffff000001ff, 0x00ffffff00000101, 0x00ffffff0001ff00, 0x00ffffff000100ff, + 0x00ffffff00010001, 0x00ffffff010000ff, 0x00ffffff01000100, 0x00ffffff0101ff00, + 0x00ffffff01010001, 0x00ffff00ffffffff, 0x00ffff00ffffff00, 0x00ffff00ffff00ff, + 0x00ffff00ffff0001, 0x00ffff00ffff0100, 0x00ffff00ff00ff01, 0x00ffff00ff000000, + 0x00ffff00ff000001, 0x00ffff00ff0001ff, 0x00ffff00ff000101, 0x00ffff00ff01ff00, + 0x00ffff00ff010001, 0x00ffff00ff010100, 0x00ffff0000ff0000, 0x00ffff0000ff01ff, + 0x00ffff0000ff0101, 0x00ffff000000ff00, 0x00ffff00000000ff, 0x00ffff0000000000, + 0x00ffff0000000001, 0x00ffff0000000100, 0x00ffff0000000101, 0x00ffff0000010000, + 0x00ffff00000101ff, 0x00ffff0000010101, 0x00ffff0001ffff00, 0x00ffff0001ff00ff, + 0x00ffff0001ff0001, 0x00ffff000100ffff, 0x00ffff000100ff01, 0x00ffff0001000000, + 0x00ffff000101ffff, 0x00ffff000101ff00, 0x00ffff000101ff01, 0x00ffff01ffff0000, + 0x00ffff01ff00ff00, 0x00ffff01ff0000ff, 0x00ffff01ff000001, 0x00ffff01ff010000, + 0x00ffff0100ffff00, 0x00ffff010000ff01, 0x00ffff0100000000, 0x00ffff0100000101, + 0x00ffff01000100ff, 0x00ffff0100010100, 0x00ffff0101ff0100, 0x00ffff01010000ff, + 0x00ffff0101010000, 0x00ff00ffffffff00, 0x00ff00ffff000000, 0x00ff00ffff000100, + 0x00ff00ffff010100, 0x00ff00ff00ff0000, 0x00ff00ff00ff01ff, 0x00ff00ff00ff0101, + 0x00ff00ff0000ff00, 0x00ff00ff000000ff, 0x00ff00ff00000000, 0x00ff00ff00000001, + 0x00ff00ff0001ff00, 0x00ff00ff0001ff01, 0x00ff00ff00010000, 0x00ff00ff000101ff, + 0x00ff00ff00010101, 0x00ff00ff01ffff00, 0x00ff00ff01ff0001, 0x00ff00ff01ff0100, + 0x00ff00ff0100ffff, 0x00ff00ff0100ff01, 0x00ff00ff01000000, 0x00ff00ff0101ffff, + 0x00ff00ff0101ff00, 0x00ff00ff01010100, 0x00ff0000ffffff00, 0x00ff0000ffffff01, + 0x00ff0000ffff0000, 0x00ff0000ffff0101, 0x00ff0000ff00ff00, 0x00ff0000ff0000ff, + 0x00ff0000ff000000, 0x00ff0000ff000001, 0x00ff0000ff000100, 0x00ff0000ff01ffff, + 0x00ff0000ff010000, 0x00ff0000ff010101, 0x00ff000000ffff00, 0x00ff000000ff00ff, + 0x00ff000000ff0000, 0x00ff000000ff0001, 0x00ff000000ff0100, 0x00ff00000000ffff, + 0x00ff00000000ff00, 0x00ff0000000000ff, 0x00ff000000000000, 0x00ff000000000001, + 0x00ff0000000001ff, 0x00ff000000000100, 0x00ff00000001ff00, 0x00ff0000000100ff, + 0x00ff000000010000, 0x00ff000000010001, 0x00ff000000010100, 0x00ff000001ffff01, + 0x00ff000001ff00ff, 0x00ff000001ff0000, 0x00ff000001ff01ff, 0x00ff00000100ff00, + 0x00ff0000010000ff, 0x00ff000001000000, 0x00ff000001000001, 0x00ff000001000100, + 0x00ff000001000101, 0x00ff000001010000, 0x00ff0000010101ff, 0x00ff000001010101, + 0x00ff0001ffffff00, 0x00ff0001ffff0000, 0x00ff0001ffff0100, 0x00ff0001ff0000ff, + 0x00ff0001ff000000, 0x00ff0001ff0001ff, 0x00ff0001ff000101, 0x00ff0001ff01ff00, + 0x00ff0001ff0100ff, 0x00ff0001ff010100, 0x00ff000100ffffff, 0x00ff000100ffff01, + 0x00ff000100ff0000, 0x00ff000100ff01ff, 0x00ff00010000ffff, 0x00ff00010000ff00, + 0x00ff00010000ff01, 0x00ff000100000000, 0x00ff000100000001, 0x00ff000100000100, + 0x00ff00010001ff01, 0x00ff000100010000, 0x00ff0001000101ff, 0x00ff000101ffff00, + 0x00ff000101ff0000, 0x00ff000101ff0101, 0x00ff0001010000ff, 0x00ff000101000000, + 0x00ff00010101ff00, 0x00ff0001010100ff, 0x00ff000101010001, 0x00ff01ffffff0000, + 0x00ff01ffff00ff00, 0x00ff01ffff000000, 0x00ff01ffff000101, 0x00ff01ffff010000, + 0x00ff01ff00ffff01, 0x00ff01ff00ff0100, 0x00ff01ff0000ffff, 0x00ff01ff00000000, + 0x00ff01ff000001ff, 0x00ff01ff0001ff00, 0x00ff01ff000100ff, 0x00ff01ff00010001, + 0x00ff01ff00010100, 0x00ff01ff01ff0000, 0x00ff01ff0100ff00, 0x00ff01ff010000ff, + 0x00ff01ff01000001, 0x00ff01ff01000100, 0x00ff01ff01010000, 0x00ff0100ffffff00, + 0x00ff0100ffff0000, 0x00ff0100ffff0001, 0x00ff0100ffff0101, 0x00ff0100ff00ffff, + 0x00ff0100ff0000ff, 0x00ff0100ff000000, 0x00ff0100ff0001ff, 0x00ff0100ff01ff00, + 0x00ff0100ff0100ff, 0x00ff0100ff010001, 0x00ff010000ffffff, 0x00ff010000ff0000, + 0x00ff010000ff0101, 0x00ff01000000ff00, 0x00ff01000000ff01, 0x00ff0100000000ff, + 0x00ff010000000000, 0x00ff010000000001, 0x00ff010000000100, 0x00ff01000001ffff, + 0x00ff01000001ff01, 0x00ff010000010000, 0x00ff010000010001, 0x00ff010000010101, + 0x00ff010001ff0001, 0x00ff010001ff0100, 0x00ff01000100ff01, 0x00ff010001000000, + 0x00ff010001000001, 0x00ff0100010001ff, 0x00ff01000101ff00, 0x00ff0100010100ff, + 0x00ff010001010001, 0x00ff010001010100, 0x00ff0101ff000001, 0x00ff010100ff00ff, + 0x00ff010100ff0001, 0x00ff010100ff0100, 0x00ff010100000000, 0x00ff0101000001ff, + 0x00ff010100000101, 0x00ff0101000100ff, 0x00ff010100010100, 0x00ff0101010000ff, + 0x00ff010101010000, 0x0000ffffffffff00, 0x0000ffffffff00ff, 0x0000ffffffff0000, + 0x0000ffffffff0001, 0x0000ffffffff0100, 0x0000ffffff00ff01, 0x0000ffffff000000, + 0x0000ffffff000101, 0x0000ffffff01ff00, 0x0000ffffff0100ff, 0x0000ffffff010100, + 0x0000ffff00ffffff, 0x0000ffff00ff0000, 0x0000ffff00ff01ff, 0x0000ffff0000ff00, + 0x0000ffff000000ff, 0x0000ffff00000000, 0x0000ffff00000001, 0x0000ffff00000100, + 0x0000ffff00010000, 0x0000ffff000101ff, 0x0000ffff01ff0001, 0x0000ffff01ff0100, + 0x0000ffff01000000, 0x0000ffff010001ff, 0x0000ffff0101ffff, 0x0000ffff0101ff00, + 0x0000ffff01010001, 0x0000ffff01010100, 0x0000ff00ffff0000, 0x0000ff00ffff01ff, + 0x0000ff00ffff0100, 0x0000ff00ffff0101, 0x0000ff00ff00ff00, 0x0000ff00ff0000ff, + 0x0000ff00ff000000, 0x0000ff00ff000001, 0x0000ff00ff0001ff, 0x0000ff00ff000100, + 0x0000ff00ff01ffff, 0x0000ff00ff010000, 0x0000ff00ff010001, 0x0000ff00ff0101ff, + 0x0000ff00ff010101, 0x0000ff0000ffff00, 0x0000ff0000ff00ff, 0x0000ff0000ff0000, + 0x0000ff0000ff0001, 0x0000ff0000ff0100, 0x0000ff000000ffff, 0x0000ff000000ff00, + 0x0000ff000000ff01, 0x0000ff00000000ff, 0x0000ff0000000000, 0x0000ff0000000001, + 0x0000ff00000001ff, 0x0000ff0000000100, 0x0000ff0000000101, 0x0000ff000001ff00, + 0x0000ff00000100ff, 0x0000ff0000010000, 0x0000ff0000010001, 0x0000ff0000010100, + 0x0000ff0001ffff01, 0x0000ff0001ff0000, 0x0000ff000100ff00, 0x0000ff00010000ff, + 0x0000ff0001000000, 0x0000ff0001000001, 0x0000ff0001000100, 0x0000ff000101ffff, + 0x0000ff0001010000, 0x0000ff0001010101, 0x0000ff01ffffff00, 0x0000ff01ffff0001, + 0x0000ff01ff00ff01, 0x0000ff01ff000000, 0x0000ff01ff000101, 0x0000ff01ff01ff00, + 0x0000ff01ff0100ff, 0x0000ff0100ffff01, 0x0000ff0100ff0000, 0x0000ff0100ff0101, + 0x0000ff010000ff00, 0x0000ff01000000ff, 0x0000ff0100000000, 0x0000ff0100000001, + 0x0000ff0100000100, 0x0000ff010001ff01, 0x0000ff0100010000, 0x0000ff0101ff0000, + 0x0000ff010100ffff, 0x0000ff010100ff01, 0x0000ff0101000000, 0x0000ff0101000100, + 0x0000ff0101000101, 0x0000ff01010100ff, 0x000000ffffff00ff, 0x000000ffffff0000, + 0x000000ffff00ff00, 0x000000ffff0000ff, 0x000000ffff000000, 0x000000ffff000001, + 0x000000ffff0001ff, 0x000000ffff000100, 0x000000ffff01ff00, 0x000000ffff010000, + 0x000000ffff0101ff, 0x000000ffff010101, 0x000000ff00ffff00, 0x000000ff00ff00ff, + 0x000000ff00ff0000, 0x000000ff00ff0001, 0x000000ff00ff0100, 0x000000ff00ff0101, + 0x000000ff0000ffff, 0x000000ff0000ff00, 0x000000ff000000ff, 0x000000ff00000000, + 0x000000ff00000001, 0x000000ff000001ff, 0x000000ff00000100, 0x000000ff00000101, + 0x000000ff0001ff00, 0x000000ff0001ff01, 0x000000ff000100ff, 0x000000ff00010000, + 0x000000ff00010001, 0x000000ff00010100, 0x000000ff01ffffff, 0x000000ff01ff01ff, + 0x000000ff01ff0101, 0x000000ff0100ff00, 0x000000ff010000ff, 0x000000ff01000000, + 0x000000ff01000001, 0x000000ff01000100, 0x000000ff0101ff00, 0x000000ff010100ff, + 0x000000ff01010000, 0x000000ff01010101, 0x00000000ffffff00, 0x00000000ffffff01, + 0x00000000ffff00ff, 0x00000000ffff0000, 0x00000000ffff0001, 0x00000000ffff0100, + 0x00000000ff00ffff, 0x00000000ff00ff00, 0x00000000ff00ff01, 0x00000000ff0000ff, + 0x00000000ff000000, 0x00000000ff000001, 0x00000000ff000100, 0x00000000ff000101, + 0x00000000ff01ff00, 0x00000000ff0100ff, 0x00000000ff010000, 0x00000000ff010001, + 0x00000000ff010100, 0x0000000000ffffff, 0x0000000000ffff00, 0x0000000000ffff01, + 0x0000000000ff00ff, 0x0000000000ff0000, 0x0000000000ff0001, 0x0000000000ff01ff, 0x0000000000ff0100, 0x000000000000ffff, 0x000000000000ff00, 0x000000000000ff01, - 0x00000000000000ff, 0x0000000000000001, 0x00000000000001ff, 0x0000000000000100, - 0x0000000000000101, 0x000000000001ff00, 0x00000000000100ff, 0x0000000000010000, - 0x0000000000010001, 0x0000000000010100, 0x0000000001ff0000, 0x000000000100ff00, - 0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, 0x0000000001000100, - 0x0000000001010000, 0x00000001ffff01ff, 0x00000001ff000000, 0x0000000100ff0000, - 0x000000010000ff00, 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001, - 0x0000000100000100, 0x0000000100010000, 0x0000000101000000, 0x000001ffff00ff00, - 0x000001ffff010001, 0x000001ffff0101ff, 0x000001ff00ffff01, 0x000001ff0000ffff, - 0x000001ff00000000, 0x000001ff010000ff, 0x000001ff01010100, 0x00000100ffff0100, - 0x00000100ff000000, 0x0000010000ff0000, 0x000001000000ff00, 0x00000100000000ff, - 0x0000010000000000, 0x0000010000000001, 0x0000010000000100, 0x0000010000010000, - 0x0000010001000000, 0x000001000101ff01, 0x00000101ffff0001, 0x00000101ff01ffff, - 0x0000010100000000, 0x0000010101010100, 0x0001ffffff000000, 0x0001ffff00ffffff, - 0x0001ffff00000100, 0x0001ffff0001ff00, 0x0001ffff01000000, 0x0001ff00ffffff00, - 0x0001ff00ffff01ff, 0x0001ff00ff010000, 0x0001ff0000000000, 0x0001ff0000010001, - 0x0001ff0001ff0000, 0x0001ff0001010100, 0x0001ff01ff0000ff, 0x0001ff01ff000001, - 0x0001ff0100ffffff, 0x0001ff010001ffff, 0x0001ff01000101ff, 0x0001ff010100ff01, - 0x000100ffff00ffff, 0x000100ffff00ff01, 0x000100ffff000100, 0x000100ff00000000, - 0x000100ff000101ff, 0x000100ff01ff0101, 0x000100ff0100ffff, 0x000100ff01010101, - 0x00010000ff000000, 0x00010000ff010100, 0x0001000000ff0000, 0x000100000000ff00, - 0x00010000000000ff, 0x0001000000000000, 0x0001000000000001, 0x0001000000000100, - 0x0001000000010000, 0x0001000001ffff01, 0x0001000001000000, 0x0001000100ff0101, - 0x0001000100000000, 0x00010001010100ff, 0x000101ffffff01ff, 0x000101ffffff0101, - 0x000101ff00010000, 0x000101ff01ff0000, 0x000101ff0100ff01, 0x00010100ffff0000, - 0x0001010000000000, 0x000101000001ffff, 0x0001010000010101, 0x00010100010001ff, - 0x00010101ff00ff00, 0x00010101ff010001, 0x0001010100ffffff, 0x0001010100ff01ff, - 0x00010101000101ff, 0x0001010101ff0000, 0x000101010100ff01, 0x0001010101000101, - 0x01ffffffffff0101, 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff, - 0x01ffffffff010101, 0x01ffffff00000000, 0x01ffffff01ff01ff, 0x01ffffff01000101, - 0x01ffffff0101ff01, 0x01ffffff010100ff, 0x01ffff000000ff00, 0x01ffff0000000001, - 0x01ffff00000001ff, 0x01ffff0000010000, 0x01ffff0001ff0000, 0x01ffff01ffffffff, - 0x01ffff01ffff01ff, 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff0101ff, - 0x01ffff010100ffff, 0x01ff00ffffff0000, 0x01ff00ffff010000, 0x01ff00ff00ffff01, - 0x01ff0000ff0000ff, 0x01ff000000000000, 0x01ff00000001ff01, 0x01ff000001ffffff, - 0x01ff000001010100, 0x01ff0001ffffff01, 0x01ff0001ff010001, 0x01ff000101ff0100, - 0x01ff000101000001, 0x01ff0001010100ff, 0x01ff01ffff00ffff, 0x01ff01ff00010001, - 0x01ff01ff01000000, 0x01ff01ff010101ff, 0x01ff0100ff000001, 0x01ff010000ffff00, - 0x01ff010000000100, 0x01ff010001ff01ff, 0x01ff01000101ffff, 0x01ff0101ffff00ff, - 0x01ff0101ffff0101, 0x01ff0101ff0101ff, 0x01ff010100010000, 0x0100ffff00ff00ff, - 0x0100ffff00ff0001, 0x0100ffff00000100, 0x0100ffff0100ff00, 0x0100ff00ffff0000, - 0x0100ff00ff00ffff, 0x0100ff00ff00ff01, 0x0100ff00ff000100, 0x0100ff00ff010000, - 0x0100ff0000000000, 0x0100ff00000100ff, 0x0100ff0001ff0101, 0x0100ff0001010101, - 0x0100ff0100ff00ff, 0x0100ff0100ff0001, 0x0100ff0100000100, 0x0100ff0100010001, - 0x0100ff0101000000, 0x010000ffff00ff00, 0x010000ff0000ffff, 0x010000ff00000000, - 0x010000ff010001ff, 0x010000ff01010001, 0x01000000ffffff00, 0x01000000ffff0101, - 0x01000000ff000000, 0x01000000ff0100ff, 0x01000000ff010101, 0x0100000000ff0000, - 0x010000000000ff00, 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001, - 0x0100000000000100, 0x0100000000010000, 0x0100000001000000, 0x0100000100000000, - 0x01000001000101ff, 0x0100000101ffff01, 0x010001ffff000101, 0x010001ff00ff0100, - 0x010001ff0000ff00, 0x010001ff000100ff, 0x010001ff01ffffff, 0x01000100ffff0000, - 0x01000100ff0001ff, 0x0100010000000000, 0x010001000001ff00, 0x0100010001ff0000, - 0x01000100010000ff, 0x0100010001000101, 0x01000101ff00ff01, 0x0100010100ff0100, - 0x010001010000ffff, 0x0100010101010001, 0x0101ffffffff0101, 0x0101ffffff0001ff, - 0x0101ffffff01ffff, 0x0101ffffff010101, 0x0101ffff00000000, 0x0101ffff0101ffff, - 0x0101ffff010101ff, 0x0101ff00ff000000, 0x0101ff0000ff0100, 0x0101ff000000ff00, - 0x0101ff0000010000, 0x0101ff00010000ff, 0x0101ff0001000001, 0x0101ff01ff010101, - 0x0101ff0100000000, 0x0101ff010101ff00, 0x010100ffffff0000, 0x010100ffff010000, - 0x010100ff00ff01ff, 0x010100ff000000ff, 0x010100ff00000101, 0x010100ff01ffff00, - 0x01010000ffffff01, 0x01010000ff000100, 0x01010000ff01ff01, 0x0101000000000000, - 0x01010000000100ff, 0x010100000101ff01, 0x01010001ffff0000, 0x01010001ff00ffff, - 0x01010001ff010000, 0x0101000101ffffff, 0x0101000101ff01ff, 0x0101000101010101, - 0x010101ffff01ffff, 0x010101ff00000000, 0x010101ff0001ff01, 0x010101ff0101ffff, - 0x010101ff010101ff, 0x01010100ffffffff, 0x01010100ff000001, 0x010101000000ff00, - 0x0101010001010000, 0x0101010100ff0001, 0x010101010001ff01, 0x010101010101ffff, + 0x00000000000000ff, 0x0000000000000000, 0x0000000000000001, 0x00000000000001ff, + 0x0000000000000100, 0x0000000000000101, 0x000000000001ffff, 0x000000000001ff00, + 0x00000000000100ff, 0x0000000000010000, 0x0000000000010001, 0x00000000000101ff, + 0x0000000000010100, 0x0000000000010101, 0x0000000001ffff00, 0x0000000001ff00ff, + 0x0000000001ff0000, 0x0000000001ff0100, 0x0000000001ff0101, 0x000000000100ffff, + 0x000000000100ff00, 0x00000000010000ff, 0x0000000001000000, 0x0000000001000001, + 0x00000000010001ff, 0x0000000001000100, 0x000000000101ff00, 0x00000000010100ff, + 0x0000000001010000, 0x0000000001010001, 0x0000000001010100, 0x00000001ffffffff, + 0x00000001ffffff00, 0x00000001ffffff01, 0x00000001ffff00ff, 0x00000001ffff0001, + 0x00000001ffff01ff, 0x00000001ffff0100, 0x00000001ff00ff00, 0x00000001ff0000ff, + 0x00000001ff000000, 0x00000001ff0001ff, 0x00000001ff000100, 0x00000001ff01ffff, + 0x00000001ff01ff00, 0x00000001ff01ff01, 0x00000001ff0100ff, 0x00000001ff010000, + 0x00000001ff010001, 0x00000001ff0101ff, 0x00000001ff010100, 0x0000000100ffff00, + 0x0000000100ff0000, 0x0000000100ff0001, 0x0000000100ff01ff, 0x0000000100ff0100, + 0x0000000100ff0101, 0x000000010000ffff, 0x000000010000ff00, 0x000000010000ff01, + 0x00000001000000ff, 0x0000000100000000, 0x0000000100000001, 0x00000001000001ff, + 0x0000000100000100, 0x0000000100000101, 0x000000010001ff00, 0x00000001000100ff, + 0x0000000100010000, 0x0000000100010100, 0x0000000101ffff01, 0x0000000101ff0000, + 0x0000000101ff0001, 0x0000000101ff01ff, 0x0000000101ff0100, 0x0000000101ff0101, + 0x000000010100ff00, 0x0000000101000000, 0x0000000101000101, 0x000000010101ff01, + 0x0000000101010000, 0x0000000101010001, 0x00000001010101ff, 0x0000000101010100, + 0x000001ffffff00ff, 0x000001ffffff0000, 0x000001ffffff0001, 0x000001ffffff0100, + 0x000001ffff00ffff, 0x000001ffff000000, 0x000001ffff0001ff, 0x000001ffff01ff00, + 0x000001ffff010101, 0x000001ff00ff0000, 0x000001ff00ff01ff, 0x000001ff00ff0101, + 0x000001ff0000ff00, 0x000001ff000000ff, 0x000001ff00000000, 0x000001ff00000001, + 0x000001ff000001ff, 0x000001ff00000100, 0x000001ff0001ffff, 0x000001ff0001ff01, + 0x000001ff000100ff, 0x000001ff00010000, 0x000001ff01ffff01, 0x000001ff01ff0100, + 0x000001ff0100ffff, 0x000001ff0100ff01, 0x000001ff01000000, 0x000001ff010001ff, + 0x000001ff0101ff00, 0x000001ff01010100, 0x00000100ffffff00, 0x00000100ffffff01, + 0x00000100ffff0000, 0x00000100ffff0101, 0x00000100ff00ff00, 0x00000100ff0000ff, + 0x00000100ff000000, 0x00000100ff000001, 0x00000100ff000100, 0x00000100ff010000, + 0x0000010000ffff00, 0x0000010000ff00ff, 0x0000010000ff0000, 0x0000010000ff0001, + 0x0000010000ff0100, 0x000001000000ffff, 0x000001000000ff00, 0x000001000000ff01, + 0x00000100000000ff, 0x0000010000000000, 0x0000010000000001, 0x00000100000001ff, + 0x0000010000000100, 0x0000010000000101, 0x000001000001ff00, 0x00000100000100ff, + 0x0000010000010000, 0x0000010000010001, 0x0000010000010100, 0x0000010001ffff00, + 0x0000010001ff0000, 0x0000010001ff0100, 0x000001000100ff00, 0x00000100010000ff, + 0x0000010001000000, 0x0000010001000001, 0x00000100010001ff, 0x0000010001000100, + 0x0000010001010000, 0x00000101ffff00ff, 0x00000101ffff01ff, 0x00000101ff000000, + 0x00000101ff000101, 0x00000101ff01ffff, 0x00000101ff010000, 0x00000101ff010001, + 0x00000101ff010100, 0x0000010100ff0000, 0x0000010100ff01ff, 0x0000010100ff0100, + 0x000001010000ff00, 0x0000010100000000, 0x0000010100000001, 0x00000101000001ff, + 0x0000010100000100, 0x000001010001ff01, 0x0000010100010000, 0x00000101000101ff, + 0x0000010100010101, 0x0000010101ffff00, 0x0000010101ff0101, 0x000001010100ff01, + 0x0000010101000000, 0x0000010101000001, 0x00000101010001ff, 0x0000010101000101, + 0x000001010101ff00, 0x0001ffffffff0000, 0x0001ffffff0000ff, 0x0001ffffff000001, + 0x0001ffffff000100, 0x0001ffffff010000, 0x0001ffff00ff00ff, 0x0001ffff0000ffff, + 0x0001ffff00000000, 0x0001ffff00000001, 0x0001ffff000001ff, 0x0001ffff00000101, + 0x0001ffff0001ff00, 0x0001ffff000100ff, 0x0001ffff00010001, 0x0001ffff00010100, + 0x0001ffff01ffff00, 0x0001ffff01000001, 0x0001ffff01010000, 0x0001ff00ffffff00, + 0x0001ff00ffff00ff, 0x0001ff00ffff0001, 0x0001ff00ffff0100, 0x0001ff00ff00ff01, + 0x0001ff00ff000000, 0x0001ff00ff01ff00, 0x0001ff00ff01ff01, 0x0001ff00ff010001, + 0x0001ff00ff010100, 0x0001ff0000ff0000, 0x0001ff0000ff0100, 0x0001ff000000ff00, + 0x0001ff0000000000, 0x0001ff0000000001, 0x0001ff0000000100, 0x0001ff0000010000, + 0x0001ff0000010001, 0x0001ff0000010101, 0x0001ff0001ff00ff, 0x0001ff0001ff0101, + 0x0001ff000100ff01, 0x0001ff0001000000, 0x0001ff000101ff00, 0x0001ff0001010001, + 0x0001ff0001010100, 0x0001ff01ff00ff00, 0x0001ff01ff000001, 0x0001ff01ff000100, + 0x0001ff0100ffffff, 0x0001ff0100ffff00, 0x0001ff0100ff0001, 0x0001ff0100000000, + 0x0001ff0100000001, 0x0001ff01000001ff, 0x0001ff010001ffff, 0x0001ff0101ff0000, + 0x0001ff010100ff00, 0x0001ff0101000001, 0x0001ff0101010000, 0x000100ffff00ff00, + 0x000100ffff00ff01, 0x000100ffff000000, 0x000100ffff000001, 0x000100ffff000101, + 0x000100ffff01ff00, 0x000100ffff010001, 0x000100ffff010100, 0x000100ff00ffffff, + 0x000100ff00ffff01, 0x000100ff00ff0000, 0x000100ff00ff01ff, 0x000100ff00ff0101, + 0x000100ff0000ff00, 0x000100ff000000ff, 0x000100ff00000000, 0x000100ff00000001, + 0x000100ff00000100, 0x000100ff00000101, 0x000100ff0001ffff, 0x000100ff0001ff01, + 0x000100ff00010000, 0x000100ff01ff00ff, 0x000100ff01ff0000, 0x000100ff01ff0100, + 0x000100ff0100ffff, 0x000100ff0100ff01, 0x000100ff010000ff, 0x000100ff01000000, + 0x000100ff01000001, 0x000100ff010001ff, 0x000100ff01000101, 0x000100ff0101ff00, + 0x000100ff010100ff, 0x000100ff01010100, 0x00010000ffff0000, 0x00010000ffff01ff, + 0x00010000ffff0101, 0x00010000ff00ff00, 0x00010000ff000000, 0x00010000ff000001, + 0x00010000ff000100, 0x0001000000ff00ff, 0x0001000000ff0000, 0x0001000000ff0001, + 0x0001000000ff0100, 0x000100000000ffff, 0x000100000000ff00, 0x00010000000000ff, + 0x0001000000000000, 0x0001000000000001, 0x0001000000000100, 0x000100000001ff00, + 0x00010000000100ff, 0x0001000000010000, 0x0001000000010001, 0x0001000000010100, + 0x0001000001ff0001, 0x0001000001ff0100, 0x0001000001ff0101, 0x000100000100ff00, + 0x0001000001000000, 0x0001000001000001, 0x0001000001000100, 0x0001000001000101, + 0x000100000101ff01, 0x0001000001010000, 0x0001000001010001, 0x00010000010101ff, + 0x00010001ffffff01, 0x00010001ffff0100, 0x00010001ff000000, 0x00010001ff01ffff, + 0x00010001ff010001, 0x00010001ff0101ff, 0x00010001ff010100, 0x0001000100ffffff, + 0x0001000100ff0000, 0x0001000100ff01ff, 0x0001000100ff0101, 0x000100010000ff00, + 0x00010001000000ff, 0x0001000100000000, 0x0001000100000001, 0x00010001000001ff, + 0x0001000100000101, 0x000100010001ffff, 0x0001000100010000, 0x00010001000101ff, + 0x0001000101ffffff, 0x0001000101ffff01, 0x0001000101ff0000, 0x0001000101ff0101, + 0x00010001010000ff, 0x0001000101000001, 0x00010001010001ff, 0x0001000101000100, + 0x000100010101ffff, 0x00010001010100ff, 0x0001000101010001, 0x0001000101010101, + 0x000101ffff000001, 0x000101ffff000100, 0x000101ffff010000, 0x000101ff00ffff00, + 0x000101ff0000ff01, 0x000101ff00000000, 0x000101ff00000101, 0x000101ff0001ff00, + 0x000101ff00010100, 0x000101ff01ff0000, 0x000101ff0100ff00, 0x000101ff010001ff, + 0x000101ff01010001, 0x00010100ffffff00, 0x00010100ffff00ff, 0x00010100ff00ffff, + 0x00010100ff000000, 0x00010100ff01ff00, 0x00010100ff0100ff, 0x00010100ff010001, + 0x00010100ff010100, 0x0001010000ffffff, 0x0001010000ffff00, 0x0001010000ff0000, + 0x0001010000ff0001, 0x0001010000ff01ff, 0x000101000000ff00, 0x00010100000000ff, + 0x0001010000000000, 0x0001010000000001, 0x0001010000000100, 0x000101000001ffff, + 0x0001010000010000, 0x0001010000010101, 0x0001010001ffff01, 0x0001010001ff00ff, + 0x0001010001ff0101, 0x0001010001000000, 0x000101000101ff00, 0x00010100010100ff, + 0x0001010001010000, 0x0001010001010100, 0x00010101ff00ff00, 0x00010101ff000001, + 0x00010101ff0001ff, 0x0001010100ffff00, 0x0001010100ff00ff, 0x0001010100ff0100, + 0x000101010000ffff, 0x0001010100000000, 0x00010101000001ff, 0x0001010100000101, + 0x00010101000100ff, 0x0001010100010000, 0x0001010100010100, 0x0001010101ff0001, + 0x00010101010000ff, 0x00010101010001ff, 0x0001010101000101, 0x0001010101010001, + 0x01ffffffffffffff, 0x01ffffffffffff01, 0x01ffffffffff01ff, 0x01ffffffffff0101, + 0x01ffffffff01ffff, 0x01ffffffff01ff01, 0x01ffffffff0101ff, 0x01ffffffff010101, + 0x01ffffff00ff0000, 0x01ffffff0000ffff, 0x01ffffff0000ff00, 0x01ffffff000000ff, + 0x01ffffff00000001, 0x01ffffff00000100, 0x01ffffff00010000, 0x01ffffff01ffffff, + 0x01ffffff01ffff01, 0x01ffffff01ff01ff, 0x01ffffff01ff0101, 0x01ffffff01000000, + 0x01ffffff0101ffff, 0x01ffffff0101ff01, 0x01ffffff010101ff, 0x01ffffff01010101, + 0x01ffff00ffff0000, 0x01ffff00ff00ff00, 0x01ffff00ff0000ff, 0x01ffff00ff000001, + 0x01ffff00ff000100, 0x01ffff00ff010000, 0x01ffff0000ffff00, 0x01ffff0000ff00ff, + 0x01ffff0000ff0100, 0x01ffff000000ffff, 0x01ffff000000ff01, 0x01ffff0000000000, + 0x01ffff0000000001, 0x01ffff00000001ff, 0x01ffff0000000100, 0x01ffff00000100ff, + 0x01ffff0000010001, 0x01ffff0000010100, 0x01ffff0001ff0000, 0x01ffff0001ff0100, + 0x01ffff00010000ff, 0x01ffff0001000001, 0x01ffff0001000100, 0x01ffff0001010000, + 0x01ffff01ffffffff, 0x01ffff01ffffff01, 0x01ffff01ffff01ff, 0x01ffff01ffff0101, + 0x01ffff01ff000000, 0x01ffff01ff01ffff, 0x01ffff01ff01ff01, 0x01ffff01ff0101ff, + 0x01ffff01ff010101, 0x01ffff010000ff00, 0x01ffff01000000ff, 0x01ffff0100000100, + 0x01ffff0100010000, 0x01ffff0101ffffff, 0x01ffff0101ffff01, 0x01ffff0101ff01ff, + 0x01ffff0101ff0101, 0x01ffff0101000000, 0x01ffff010101ffff, 0x01ffff010101ff01, + 0x01ffff01010101ff, 0x01ffff0101010101, 0x01ff00ffff0000ff, 0x01ff00ffff000100, + 0x01ff00ff00ffff00, 0x01ff00ff00ff00ff, 0x01ff00ff0000ff00, 0x01ff00ff00000000, + 0x01ff00ff00000101, 0x01ff00ff0001ff00, 0x01ff00ff000100ff, 0x01ff00ff00010100, + 0x01ff00ff010000ff, 0x01ff00ff01000100, 0x01ff0000ffffff00, 0x01ff0000ffff0100, + 0x01ff0000ff00ff01, 0x01ff0000ff000000, 0x01ff0000ff000101, 0x01ff0000ff010001, + 0x01ff0000ff010100, 0x01ff000000ffffff, 0x01ff000000ffff00, 0x01ff000000ff0000, + 0x01ff000000ff01ff, 0x01ff00000000ff00, 0x01ff0000000000ff, 0x01ff000000000000, + 0x01ff000000000001, 0x01ff000000000100, 0x01ff000000000101, 0x01ff000000010000, + 0x01ff000000010001, 0x01ff0000000101ff, 0x01ff000000010101, 0x01ff000001ffff00, + 0x01ff000001ff00ff, 0x01ff000001ff0001, 0x01ff000001ff0100, 0x01ff00000100ffff, + 0x01ff00000100ff01, 0x01ff000001000000, 0x01ff0000010001ff, 0x01ff000001010001, + 0x01ff0001ff00ff00, 0x01ff0001ff000001, 0x01ff0001ff000100, 0x01ff0001ff010000, + 0x01ff000100ffff00, 0x01ff000100ff00ff, 0x01ff000100ff0100, 0x01ff000100ff0101, + 0x01ff00010000ffff, 0x01ff000100000000, 0x01ff000100000100, 0x01ff000100000101, + 0x01ff00010001ff00, 0x01ff000100010001, 0x01ff000100010101, 0x01ff000101ff0000, + 0x01ff00010100ff00, 0x01ff000101000101, 0x01ff0001010100ff, 0x01ff01ffffffffff, + 0x01ff01ffffffff01, 0x01ff01ffffff01ff, 0x01ff01ffffff0101, 0x01ff01ffff000000, + 0x01ff01ffff01ffff, 0x01ff01ffff01ff01, 0x01ff01ffff0101ff, 0x01ff01ffff010101, + 0x01ff01ff00ffff00, 0x01ff01ff00ff0000, 0x01ff01ff0000ff00, 0x01ff01ff000000ff, + 0x01ff01ff00000100, 0x01ff01ff00010000, 0x01ff01ff00010100, 0x01ff01ff01ffffff, + 0x01ff01ff01ffff01, 0x01ff01ff01ff01ff, 0x01ff01ff01ff0101, 0x01ff01ff01000000, + 0x01ff01ff0101ffff, 0x01ff01ff0101ff01, 0x01ff01ff010101ff, 0x01ff01ff01010101, + 0x01ff0100ffff0000, 0x01ff0100ffff0001, 0x01ff0100ff00ff00, 0x01ff0100ff0000ff, + 0x01ff0100ff000001, 0x01ff0100ff010000, 0x01ff010000ffff00, 0x01ff010000ff00ff, + 0x01ff010000ff0001, 0x01ff010000ff0100, 0x01ff01000000ffff, 0x01ff01000000ff01, + 0x01ff010000000000, 0x01ff010000000101, 0x01ff01000001ff00, 0x01ff0100000100ff, + 0x01ff010001ff0000, 0x01ff010001000001, 0x01ff010001000100, 0x01ff010001010000, + 0x01ff0101ffffffff, 0x01ff0101ffffff01, 0x01ff0101ffff01ff, 0x01ff0101ffff0101, + 0x01ff0101ff000000, 0x01ff0101ff01ffff, 0x01ff0101ff01ff01, 0x01ff0101ff0101ff, + 0x01ff0101ff010101, 0x01ff010100ff0000, 0x01ff01010000ff00, 0x01ff0101000000ff, + 0x01ff010100000001, 0x01ff010101ffffff, 0x01ff010101ffff01, 0x01ff010101ff01ff, + 0x01ff010101ff0101, 0x01ff010101000000, 0x01ff01010101ffff, 0x01ff01010101ff01, + 0x01ff0101010101ff, 0x01ff010101010101, 0x0100ffffffff0000, 0x0100ffffff00ff00, + 0x0100ffffff000001, 0x0100ffffff0001ff, 0x0100ffffff000100, 0x0100ffffff010000, + 0x0100ffff00ffff00, 0x0100ffff00ff0001, 0x0100ffff00ff0100, 0x0100ffff00000000, + 0x0100ffff000001ff, 0x0100ffff00000101, 0x0100ffff00010100, 0x0100ffff00010101, + 0x0100ffff01ff0000, 0x0100ffff0100ff00, 0x0100ffff010000ff, 0x0100ffff01000001, + 0x0100ffff01000100, 0x0100ffff01010000, 0x0100ff00ffffff00, 0x0100ff00ffff00ff, + 0x0100ff00ffff0001, 0x0100ff00ffff0100, 0x0100ff00ff00ffff, 0x0100ff00ff000000, + 0x0100ff00ff0001ff, 0x0100ff00ff000101, 0x0100ff00ff01ff00, 0x0100ff00ff0100ff, + 0x0100ff00ff010001, 0x0100ff00ff010100, 0x0100ff0000ffffff, 0x0100ff0000ff0000, + 0x0100ff000000ffff, 0x0100ff000000ff00, 0x0100ff00000000ff, 0x0100ff0000000000, + 0x0100ff0000000001, 0x0100ff0000000100, 0x0100ff000001ff01, 0x0100ff0000010000, + 0x0100ff0001ff00ff, 0x0100ff0001ff0001, 0x0100ff000100ff01, 0x0100ff0001000000, + 0x0100ff00010001ff, 0x0100ff000101ff00, 0x0100ff00010100ff, 0x0100ff0001010001, + 0x0100ff0001010100, 0x0100ff01ffff0000, 0x0100ff01ff00ff00, 0x0100ff01ff0000ff, + 0x0100ff01ff000100, 0x0100ff01ff010000, 0x0100ff0100ff00ff, 0x0100ff0100ff0001, + 0x0100ff0100ff0100, 0x0100ff010000ffff, 0x0100ff010000ff01, 0x0100ff0100000000, + 0x0100ff01000001ff, 0x0100ff0100010001, 0x0100ff0100010100, 0x0100ff0101ff0000, + 0x0100ff01010000ff, 0x0100ff0101000001, 0x0100ff0101010100, 0x010000ffffffff00, + 0x010000ffffff00ff, 0x010000ffffff0001, 0x010000ffff00ffff, 0x010000ffff000000, + 0x010000ffff0001ff, 0x010000ffff010001, 0x010000ff00ffffff, 0x010000ff00ff0101, + 0x010000ff0000ff00, 0x010000ff000000ff, 0x010000ff00000000, 0x010000ff00000001, + 0x010000ff000001ff, 0x010000ff00000100, 0x010000ff0001ffff, 0x010000ff0001ff00, + 0x010000ff0001ff01, 0x010000ff00010000, 0x010000ff01ff00ff, 0x010000ff01ff0001, + 0x010000ff0100ff01, 0x010000ff010000ff, 0x010000ff01000000, 0x010000ff010001ff, + 0x010000ff0101ff00, 0x010000ff01010100, 0x01000000ffffffff, 0x01000000ffff0000, + 0x01000000ffff01ff, 0x01000000ffff0101, 0x01000000ff00ffff, 0x01000000ff00ff00, + 0x01000000ff0000ff, 0x01000000ff000000, 0x01000000ff000001, 0x01000000ff000100, + 0x01000000ff01ff00, 0x01000000ff010000, 0x01000000ff010100, 0x01000000ff010101, + 0x0100000000ffff00, 0x0100000000ff00ff, 0x0100000000ff0000, 0x0100000000ff0001, + 0x0100000000ff0100, 0x010000000000ffff, 0x010000000000ff00, 0x010000000000ff01, + 0x01000000000000ff, 0x0100000000000000, 0x0100000000000001, 0x01000000000001ff, + 0x0100000000000100, 0x0100000000000101, 0x010000000001ff00, 0x01000000000100ff, + 0x0100000000010000, 0x0100000000010001, 0x0100000000010100, 0x0100000001ffff00, + 0x0100000001ff0000, 0x0100000001ff01ff, 0x010000000100ff00, 0x010000000100ff01, + 0x01000000010000ff, 0x0100000001000000, 0x0100000001000001, 0x0100000001000100, + 0x0100000001000101, 0x010000000101ffff, 0x010000000101ff01, 0x0100000001010000, + 0x01000000010101ff, 0x0100000001010101, 0x01000001ffffff00, 0x01000001ffff00ff, + 0x01000001ff00ffff, 0x01000001ff000000, 0x01000001ff000100, 0x01000001ff01ffff, + 0x01000001ff010001, 0x01000001ff010100, 0x0100000100ff0000, 0x0100000100ff01ff, + 0x0100000100ff0100, 0x010000010000ff00, 0x010000010000ff01, 0x0100000100000000, + 0x0100000100000001, 0x0100000100000100, 0x0100000100010000, 0x01000001000101ff, + 0x0100000101ffff01, 0x0100000101ff00ff, 0x0100000101ff0100, 0x0100000101ff0101, + 0x010000010100ff01, 0x01000001010000ff, 0x0100000101000000, 0x01000001010100ff, + 0x0100000101010001, 0x0100000101010100, 0x010001ffffff0000, 0x010001ffff000001, + 0x010001ffff000100, 0x010001ffff010000, 0x010001ff00ffff00, 0x010001ff00ff0001, + 0x010001ff0000ffff, 0x010001ff0000ff01, 0x010001ff00000000, 0x010001ff00000001, + 0x010001ff00000101, 0x010001ff000100ff, 0x010001ff00010000, 0x010001ff01ff0000, + 0x010001ff0100ff00, 0x010001ff01000001, 0x010001ff01000100, 0x010001ff01010000, + 0x01000100ffff00ff, 0x01000100ffff0001, 0x01000100ffff0100, 0x01000100ff00ffff, + 0x01000100ff00ff01, 0x01000100ff000000, 0x01000100ff0001ff, 0x01000100ff000101, + 0x01000100ff01ffff, 0x01000100ff01ff00, 0x01000100ff0100ff, 0x01000100ff010001, + 0x0100010000ffffff, 0x0100010000ffff01, 0x0100010000ff0000, 0x0100010000ff01ff, + 0x0100010000ff0101, 0x010001000000ff00, 0x01000100000000ff, 0x0100010000000000, + 0x0100010000000001, 0x0100010000000100, 0x010001000001ff01, 0x0100010000010000, + 0x0100010000010001, 0x0100010000010101, 0x0100010001ffff00, 0x0100010001ff00ff, + 0x010001000100ffff, 0x010001000100ff01, 0x0100010001000000, 0x0100010001000101, + 0x010001000101ff00, 0x0100010001010001, 0x01000101ffff0000, 0x01000101ff000000, + 0x01000101ff010000, 0x0100010100ff00ff, 0x0100010100ff0001, 0x0100010100ff0100, + 0x010001010000ffff, 0x0100010100000000, 0x01000101000001ff, 0x010001010001ff00, + 0x0100010101ff0000, 0x010001010100ff00, 0x01000101010000ff, 0x0100010101000000, + 0x0100010101000001, 0x0101ffffffffffff, 0x0101ffffffffff01, 0x0101ffffffff01ff, + 0x0101ffffffff0101, 0x0101ffffff000000, 0x0101ffffff01ffff, 0x0101ffffff01ff01, + 0x0101ffffff0101ff, 0x0101ffffff010101, 0x0101ffff00ff0000, 0x0101ffff0000ff00, + 0x0101ffff000000ff, 0x0101ffff00000001, 0x0101ffff00000100, 0x0101ffff01ffffff, + 0x0101ffff01ffff01, 0x0101ffff01ff01ff, 0x0101ffff01ff0101, 0x0101ffff01000000, + 0x0101ffff0101ffff, 0x0101ffff0101ff01, 0x0101ffff010101ff, 0x0101ffff01010101, + 0x0101ff00ffff0000, 0x0101ff00ffff0100, 0x0101ff00ff00ff00, 0x0101ff00ff0000ff, + 0x0101ff00ff000001, 0x0101ff00ff000100, 0x0101ff00ff000101, 0x0101ff0000ff0001, + 0x0101ff0000ff0100, 0x0101ff000000ff00, 0x0101ff0000000000, 0x0101ff00000001ff, + 0x0101ff0000000101, 0x0101ff000001ff00, 0x0101ff00000100ff, 0x0101ff0001ff0000, + 0x0101ff000100ffff, 0x0101ff000100ff01, 0x0101ff0001000001, 0x0101ff0001000100, + 0x0101ff01ffffff01, 0x0101ff01ffff01ff, 0x0101ff01ffff0101, 0x0101ff01ff00ffff, + 0x0101ff01ff000100, 0x0101ff01ff01ff01, 0x0101ff01ff0101ff, 0x0101ff01ff010101, + 0x0101ff0100ff0000, 0x0101ff010000ff00, 0x0101ff0100000001, 0x0101ff0100000100, + 0x0101ff0100010000, 0x0101ff0101ffffff, 0x0101ff0101ffff01, 0x0101ff0101ff01ff, + 0x0101ff0101ff0101, 0x0101ff0101000000, 0x0101ff010101ffff, 0x0101ff010101ff01, + 0x0101ff01010101ff, 0x0101ff0101010101, 0x010100ffff000100, 0x010100ffff010000, + 0x010100ff00ffff00, 0x010100ff00ff00ff, 0x010100ff0000ffff, 0x010100ff000000ff, + 0x010100ff00000000, 0x010100ff000001ff, 0x010100ff00000101, 0x010100ff0001ff00, + 0x010100ff00010000, 0x010100ff00010001, 0x010100ff000101ff, 0x010100ff00010100, + 0x010100ff01ff0000, 0x01010000ffff0001, 0x01010000ffff0100, 0x01010000ff00ffff, + 0x01010000ff00ff01, 0x01010000ff000000, 0x01010000ff0001ff, 0x01010000ff010001, + 0x01010000ff010100, 0x0101000000ffff01, 0x0101000000ff0000, 0x010100000000ff00, + 0x01010000000000ff, 0x0101000000000000, 0x0101000000000001, 0x0101000000000100, + 0x0101000000010000, 0x0101000000010101, 0x0101000001ffff00, 0x0101000001ff00ff, + 0x0101000001ff0000, 0x0101000001ff0001, 0x0101000001ff0100, 0x010100000100ff01, + 0x0101000001000000, 0x01010000010001ff, 0x01010001ffff0000, 0x01010001ff00ff00, + 0x01010001ff000001, 0x01010001ff000101, 0x01010001ff01ff00, 0x01010001ff010000, + 0x0101000100ff00ff, 0x0101000100ff0001, 0x0101000100ff0101, 0x010100010000ff01, + 0x0101000100000000, 0x0101000100000001, 0x01010001000001ff, 0x010100010001ffff, + 0x010100010001ff01, 0x0101000101ff0001, 0x010100010100ffff, 0x0101000101000000, + 0x0101000101000001, 0x0101000101000100, 0x010100010101ff00, 0x01010001010100ff, + 0x0101000101010001, 0x010101ffffffffff, 0x010101ffffffff01, 0x010101ffffff01ff, + 0x010101ffffff0101, 0x010101ffff01ffff, 0x010101ffff01ff01, 0x010101ffff0101ff, + 0x010101ffff010101, 0x010101ff0000ff00, 0x010101ff000000ff, 0x010101ff00000001, + 0x010101ff00000100, 0x010101ff01ffffff, 0x010101ff01ffff01, 0x010101ff01ff01ff, + 0x010101ff01ff0101, 0x010101ff01000000, 0x010101ff0101ffff, 0x010101ff0101ff01, + 0x010101ff010101ff, 0x010101ff01010101, 0x01010100ffff0000, 0x01010100ff0000ff, + 0x01010100ff000100, 0x01010100ff01ff00, 0x01010100ff010000, 0x0101010000ffff00, + 0x010101000000ffff, 0x0101010000000000, 0x0101010000000101, 0x010101000001ff00, + 0x0101010000010001, 0x0101010000010100, 0x010101000100ffff, 0x0101010001000001, + 0x01010101ffffffff, 0x01010101ffffff01, 0x01010101ffff01ff, 0x01010101ffff0101, + 0x01010101ff01ffff, 0x01010101ff01ff01, 0x01010101ff0101ff, 0x01010101ff010101, + 0x010101010000ff00, 0x01010101000000ff, 0x0101010100000001, 0x0101010101ffffff, + 0x0101010101ffff01, 0x0101010101ff01ff, 0x0101010101ff0101, 0x0101010101000000, + 0x010101010101ffff, 0x010101010101ff01, 0x01010101010101ff, 0x0101010101010101, +GGML_TABLE_END() +#else +GGML_TABLE_BEGIN(uint32_t, iq1s_grid_gpu, NGRID_IQ1S) + 0x00000000, 0x00000002, 0x00000101, 0x00000200, 0x00000202, 0x00010001, 0x00010101, 0x00020000, + 0x00020002, 0x00020200, 0x00020202, 0x01000101, 0x01010001, 0x01010100, 0x01010102, 0x01020101, + 0x02000000, 0x02000002, 0x02000200, 0x02000202, 0x02010101, 0x02020000, 0x02020002, 0x02020200, + 0x02020202, 0x00000110, 0x00000111, 0x00010011, 0x00010110, 0x00010112, 0x00010211, 0x00010212, + 0x00020111, 0x01000011, 0x01000112, 0x01000211, 0x01010012, 0x01010111, 0x01010212, 0x01020011, + 0x01020110, 0x01020112, 0x01020210, 0x02000111, 0x02010011, 0x02010110, 0x02010112, 0x02020111, + 0x00000020, 0x00000022, 0x00000220, 0x00000222, 0x00010121, 0x00020020, 0x00020022, 0x00020220, + 0x00020222, 0x01000121, 0x01010021, 0x01010221, 0x01020120, 0x01020221, 0x02000020, 0x02000022, + 0x02000220, 0x02000222, 0x02010021, 0x02010121, 0x02010221, 0x02020020, 0x02020022, 0x02020220, + 0x02020222, 0x00011001, 0x00011100, 0x00011102, 0x00021101, 0x01001001, 0x01001201, 0x01011101, + 0x01011202, 0x01021100, 0x01021101, 0x02011001, 0x02011201, 0x02021101, 0x00001011, 0x00001110, + 0x00001111, 0x00001112, 0x00011111, 0x00011210, 0x00011212, 0x00021211, 0x01001010, 0x01001111, + 0x01001212, 0x01011010, 0x01011011, 0x01011110, 0x01011111, 0x01011112, 0x01011211, 0x01021010, + 0x01021012, 0x01021111, 0x01021210, 0x01021212, 0x02001011, 0x02011011, 0x02011111, 0x02011210, + 0x02011212, 0x02021011, 0x02021110, 0x02021111, 0x02021112, 0x02021211, 0x00011120, 0x00011221, + 0x01001021, 0x01001120, 0x01011020, 0x01011022, 0x01011121, 0x01011220, 0x01021020, 0x01021021, + 0x01021122, 0x01021221, 0x02001121, 0x02011021, 0x02011120, 0x02011221, 0x00002000, 0x00002002, + 0x00002200, 0x00002202, 0x00012101, 0x00022000, 0x00022002, 0x00022200, 0x00022202, 0x01002101, + 0x01012001, 0x01012102, 0x01022101, 0x02002000, 0x02002002, 0x02002200, 0x02002202, 0x02012101, + 0x02022000, 0x02022002, 0x02022200, 0x02022202, 0x00002111, 0x00012011, 0x00012110, 0x00012211, + 0x00022110, 0x00022111, 0x01002011, 0x01012010, 0x01012011, 0x01012111, 0x01022011, 0x01022110, + 0x01022211, 0x02012011, 0x02012110, 0x02012112, 0x02012211, 0x02022111, 0x00002020, 0x00002022, + 0x00002220, 0x00002222, 0x00012121, 0x00022020, 0x00022022, 0x00022220, 0x00022222, 0x01002121, + 0x01012021, 0x01012221, 0x01022021, 0x01022121, 0x02002020, 0x02002022, 0x02002121, 0x02002220, + 0x02002222, 0x02012121, 0x02022020, 0x02022022, 0x02022220, 0x02022222, 0x00110000, 0x00110001, + 0x00110100, 0x00110201, 0x00120100, 0x00120101, 0x01100001, 0x01100100, 0x01110000, 0x01110101, + 0x01110200, 0x01120001, 0x01120100, 0x01120101, 0x01120201, 0x02110001, 0x02110100, 0x02110102, + 0x02120001, 0x02120101, 0x00100011, 0x00100110, 0x00100112, 0x00100211, 0x00110010, 0x00110012, + 0x00110111, 0x00110210, 0x00120011, 0x00120110, 0x00120211, 0x01100111, 0x01100212, 0x01110010, + 0x01110011, 0x01110012, 0x01110110, 0x01110111, 0x01110112, 0x01110211, 0x01120010, 0x01120111, + 0x02100110, 0x02110012, 0x02110111, 0x02120011, 0x02120110, 0x00110021, 0x00110120, 0x00110122, + 0x00120121, 0x01100020, 0x01100122, 0x01100221, 0x01110022, 0x01110121, 0x01110220, 0x01110222, + 0x01120120, 0x01120122, 0x02100121, 0x02110021, 0x02110120, 0x02110122, 0x02120121, 0x00101001, + 0x00101102, 0x00101201, 0x00111100, 0x00111101, 0x00111200, 0x00111201, 0x00121001, 0x00121102, + 0x01101001, 0x01101101, 0x01101102, 0x01101200, 0x01101202, 0x01111001, 0x01111100, 0x01111101, + 0x01111102, 0x01111201, 0x01121002, 0x01121101, 0x01121200, 0x02101100, 0x02101201, 0x02111000, + 0x02111100, 0x02111101, 0x02111200, 0x02111201, 0x02111202, 0x02121001, 0x02121100, 0x02121101, + 0x02121201, 0x00101012, 0x00101111, 0x00101212, 0x00111011, 0x00111110, 0x00111111, 0x00111112, + 0x00111211, 0x00121010, 0x00121012, 0x00121111, 0x00121210, 0x00121212, 0x01101011, 0x01101110, + 0x01101111, 0x01101112, 0x01111011, 0x01111012, 0x01111110, 0x01111111, 0x01111112, 0x01111211, + 0x01111212, 0x01121011, 0x01121110, 0x01121111, 0x01121112, 0x01121211, 0x02101010, 0x02101012, + 0x02101110, 0x02101111, 0x02101210, 0x02101212, 0x02111010, 0x02111011, 0x02111110, 0x02111111, + 0x02111112, 0x02111211, 0x02111212, 0x02121010, 0x02121012, 0x02121111, 0x00101021, 0x00101120, + 0x00101121, 0x00101122, 0x00111121, 0x00111122, 0x00111220, 0x00111222, 0x00121021, 0x00121122, + 0x01101020, 0x01101022, 0x01101120, 0x01101121, 0x01101220, 0x01101222, 0x01111021, 0x01111121, + 0x01111122, 0x01111220, 0x01111221, 0x01121021, 0x01121120, 0x01121121, 0x01121220, 0x01121221, + 0x01121222, 0x02101122, 0x02101222, 0x02111022, 0x02111121, 0x02121120, 0x02121221, 0x00112001, + 0x00112102, 0x00122101, 0x01102001, 0x01102100, 0x01102102, 0x01102201, 0x01112000, 0x01112101, + 0x01112200, 0x01112202, 0x01122000, 0x01122001, 0x01122100, 0x01122102, 0x01122201, 0x02102101, + 0x02112001, 0x02112100, 0x02122101, 0x00112010, 0x00112012, 0x00112111, 0x00112212, 0x00122011, + 0x00122111, 0x01102012, 0x01102110, 0x01102111, 0x01102210, 0x01112011, 0x01112110, 0x01112111, + 0x01112112, 0x01112211, 0x01112212, 0x01122010, 0x01122111, 0x01122212, 0x02102211, 0x02112011, + 0x02112012, 0x02112111, 0x02112210, 0x02122011, 0x02122112, 0x02122211, 0x00102221, 0x00112122, + 0x00122120, 0x00122122, 0x01102120, 0x01102122, 0x01102221, 0x01112020, 0x01112022, 0x01112121, + 0x01112220, 0x01122021, 0x01122122, 0x01122221, 0x02102121, 0x02112021, 0x02112122, 0x02112222, + 0x00200000, 0x00200002, 0x00200200, 0x00200202, 0x00210101, 0x00220000, 0x00220002, 0x00220101, + 0x00220200, 0x00220202, 0x01200101, 0x01210001, 0x01210201, 0x01220001, 0x01220101, 0x02200000, + 0x02200002, 0x02200200, 0x02200202, 0x02210101, 0x02220000, 0x02220002, 0x02220101, 0x02220200, + 0x02220202, 0x00200111, 0x00210011, 0x00210110, 0x00210211, 0x00220111, 0x01200012, 0x01200110, + 0x01200211, 0x01210111, 0x01210210, 0x01210212, 0x01220011, 0x01220110, 0x01220111, 0x01220112, + 0x02200111, 0x02210010, 0x02210112, 0x02210211, 0x02220111, 0x00200021, 0x00200220, 0x00200222, + 0x00210021, 0x00210121, 0x00220020, 0x00220022, 0x00220220, 0x00220222, 0x01200121, 0x01210021, + 0x01210122, 0x01210221, 0x01220121, 0x02200021, 0x02200220, 0x02200222, 0x02210021, 0x02210121, + 0x02220020, 0x02220022, 0x02220220, 0x02220222, 0x00201101, 0x00211100, 0x00211102, 0x00211201, + 0x00221101, 0x01201100, 0x01201101, 0x01201102, 0x01201201, 0x01211002, 0x01211101, 0x01211200, + 0x01211202, 0x01221102, 0x02201101, 0x02211001, 0x02211100, 0x02211201, 0x02221001, 0x02221101, + 0x00201211, 0x00211111, 0x00221011, 0x00221211, 0x01201010, 0x01201111, 0x01201210, 0x01211011, + 0x01211110, 0x01211111, 0x01211211, 0x01221012, 0x01221111, 0x01221210, 0x02201211, 0x02211010, + 0x02211110, 0x02211111, 0x02211210, 0x02211212, 0x02221011, 0x02221110, 0x02221112, 0x02221211, + 0x00201121, 0x00211020, 0x00211022, 0x00211221, 0x00221121, 0x01201021, 0x01201221, 0x01211121, + 0x01221020, 0x01221021, 0x01221221, 0x02201120, 0x02201122, 0x02211020, 0x02211222, 0x00202000, + 0x00202002, 0x00202200, 0x00202202, 0x00212101, 0x00222000, 0x00222002, 0x00222200, 0x00222202, + 0x01202101, 0x01212001, 0x01212100, 0x01222101, 0x02202000, 0x02202002, 0x02202200, 0x02202202, + 0x02222000, 0x02222002, 0x02222200, 0x02222202, 0x00202211, 0x00212011, 0x00212110, 0x00212211, + 0x00222111, 0x01202112, 0x01202211, 0x01212012, 0x01212111, 0x01222011, 0x01222110, 0x01222112, + 0x01222211, 0x02202111, 0x02212010, 0x02212112, 0x02212211, 0x02222110, 0x02222111, 0x00202020, + 0x00202022, 0x00202220, 0x00202222, 0x00222020, 0x00222022, 0x00222220, 0x00222222, 0x01202121, + 0x01212021, 0x01212122, 0x01212221, 0x01222121, 0x02202020, 0x02202022, 0x02202220, 0x02202222, + 0x02212121, 0x02222020, 0x02222022, 0x02222220, 0x02222222, 0x10000101, 0x10010001, 0x10010102, + 0x10020101, 0x11000201, 0x11010002, 0x11010101, 0x11010200, 0x11010202, 0x11020001, 0x11020100, + 0x11020102, 0x12010100, 0x12010201, 0x12020001, 0x12020102, 0x10000010, 0x10000011, 0x10000110, + 0x10000112, 0x10000211, 0x10010012, 0x10010111, 0x10010112, 0x10010210, 0x10010212, 0x10020011, + 0x10020112, 0x10020211, 0x11000111, 0x11000210, 0x11000212, 0x11010011, 0x11010110, 0x11010111, + 0x11010112, 0x11010211, 0x11010212, 0x11020111, 0x11020210, 0x11020212, 0x12000011, 0x12000110, + 0x12000112, 0x12010010, 0x12010012, 0x12010111, 0x12020010, 0x12020011, 0x12020012, 0x10000121, + 0x10010021, 0x10010120, 0x10010122, 0x10020121, 0x11000021, 0x11010022, 0x11010121, 0x11010222, + 0x11020120, 0x11020221, 0x12000221, 0x12010120, 0x12020121, 0x10001001, 0x10011101, 0x10011201, + 0x10021201, 0x11001101, 0x11001200, 0x11001202, 0x11011001, 0x11011100, 0x11011101, 0x11011102, + 0x11021001, 0x11021002, 0x11021101, 0x11021200, 0x11021202, 0x12001001, 0x12001102, 0x12001201, + 0x12011000, 0x12011002, 0x12011101, 0x12021000, 0x12021001, 0x12021201, 0x10001011, 0x10001012, + 0x10001111, 0x10001212, 0x10011011, 0x10011110, 0x10011111, 0x10011112, 0x10011211, 0x10021010, + 0x10021111, 0x10021212, 0x11001011, 0x11001110, 0x11001111, 0x11001112, 0x11001211, 0x11011010, + 0x11011011, 0x11011110, 0x11011111, 0x11011112, 0x11011210, 0x11011211, 0x11021011, 0x11021110, + 0x11021111, 0x11021112, 0x11021211, 0x12001012, 0x12001110, 0x12001111, 0x12001210, 0x12011011, + 0x12011110, 0x12011111, 0x12011112, 0x12011211, 0x12011212, 0x12021111, 0x12021210, 0x12021212, + 0x10001021, 0x10001121, 0x10001221, 0x10011120, 0x10011121, 0x10011220, 0x10011222, 0x10021021, + 0x10021120, 0x10021221, 0x11001020, 0x11001022, 0x11001121, 0x11001220, 0x11011020, 0x11011021, + 0x11011022, 0x11011121, 0x11011122, 0x11011221, 0x11021022, 0x11021121, 0x11021220, 0x12001021, + 0x12001121, 0x12001222, 0x12011120, 0x12011121, 0x12021021, 0x12021120, 0x12021122, 0x10002101, + 0x10012001, 0x10012101, 0x10012202, 0x10022101, 0x11002002, 0x11002201, 0x11012000, 0x11012101, + 0x11012200, 0x11022001, 0x11022100, 0x11022102, 0x11022201, 0x12002101, 0x12012001, 0x12012100, + 0x12012102, 0x12012201, 0x12022101, 0x10002011, 0x10002111, 0x10002112, 0x10002212, 0x10012010, + 0x10012110, 0x10012111, 0x10012210, 0x10022011, 0x10022110, 0x10022112, 0x11002010, 0x11002111, + 0x11002212, 0x11012011, 0x11012012, 0x11012110, 0x11012111, 0x11012112, 0x11012211, 0x11022010, + 0x11022012, 0x11022111, 0x11022112, 0x11022212, 0x12002112, 0x12002211, 0x12012012, 0x12012111, + 0x12012112, 0x12012210, 0x12022011, 0x12022110, 0x12022112, 0x12022211, 0x10012122, 0x11002120, + 0x11002122, 0x11002221, 0x11012121, 0x11012220, 0x11012222, 0x11022120, 0x11022221, 0x12012120, + 0x12022121, 0x10100001, 0x10100100, 0x10100101, 0x10100102, 0x10100201, 0x10110002, 0x10110101, + 0x10110202, 0x10120001, 0x10120100, 0x10120201, 0x11100000, 0x11100101, 0x11100200, 0x11110001, + 0x11110100, 0x11110101, 0x11110102, 0x11110201, 0x11120101, 0x11120200, 0x12100102, 0x12100201, + 0x12110101, 0x12110200, 0x12120000, 0x12120001, 0x12120102, 0x12120201, 0x10100111, 0x10100210, + 0x10100211, 0x10100212, 0x10110011, 0x10110110, 0x10110111, 0x10110112, 0x10110210, 0x10110211, + 0x10120010, 0x10120111, 0x10120112, 0x10120210, 0x10120212, 0x11100011, 0x11100110, 0x11100111, + 0x11100112, 0x11100211, 0x11110010, 0x11110011, 0x11110012, 0x11110110, 0x11110111, 0x11110112, + 0x11110210, 0x11110211, 0x11110212, 0x11120011, 0x11120110, 0x11120111, 0x11120112, 0x11120211, + 0x12100012, 0x12100111, 0x12110011, 0x12110110, 0x12110111, 0x12110112, 0x12110211, 0x12120010, + 0x12120111, 0x12120212, 0x10100021, 0x10100122, 0x10110022, 0x10110121, 0x10110222, 0x10120021, + 0x10120120, 0x11100022, 0x11100121, 0x11100222, 0x11110021, 0x11110120, 0x11110121, 0x11110122, + 0x11110221, 0x11120022, 0x11120121, 0x12100121, 0x12110020, 0x12110022, 0x12110121, 0x12110221, + 0x12110222, 0x12120120, 0x10101100, 0x10101101, 0x10111001, 0x10111100, 0x10111101, 0x10111102, + 0x10111200, 0x10111201, 0x10121001, 0x10121101, 0x10121200, 0x10121202, 0x11101001, 0x11101100, + 0x11101101, 0x11101102, 0x11101201, 0x11101202, 0x11111000, 0x11111001, 0x11111100, 0x11111101, + 0x11111102, 0x11111200, 0x11111201, 0x11111202, 0x11121001, 0x11121002, 0x11121100, 0x11121101, + 0x11121102, 0x11121201, 0x12101000, 0x12101200, 0x12101202, 0x12111001, 0x12111100, 0x12111101, + 0x12111102, 0x12111201, 0x12121001, 0x12121100, 0x12121101, 0x12121202, 0x10101011, 0x10101012, + 0x10101110, 0x10101111, 0x10101112, 0x10101211, 0x10111010, 0x10111011, 0x10111012, 0x10111110, + 0x10111111, 0x10111112, 0x10111211, 0x10111212, 0x10121011, 0x10121110, 0x10121111, 0x10121112, + 0x10121211, 0x11101010, 0x11101011, 0x11101012, 0x11101110, 0x11101111, 0x11101112, 0x11101210, + 0x11101211, 0x11111010, 0x11111011, 0x11111012, 0x11111110, 0x11111111, 0x11111112, 0x11111210, + 0x11111211, 0x11111212, 0x11121010, 0x11121011, 0x11121110, 0x11121111, 0x11121112, 0x11121210, + 0x11121211, 0x11121212, 0x12101011, 0x12101110, 0x12101111, 0x12101211, 0x12101212, 0x12111010, + 0x12111011, 0x12111110, 0x12111111, 0x12111112, 0x12111210, 0x12111211, 0x12121011, 0x12121110, + 0x12121111, 0x12121112, 0x12121211, 0x10101020, 0x10101021, 0x10101022, 0x10101120, 0x10101122, + 0x10101220, 0x10101221, 0x10111021, 0x10111120, 0x10111121, 0x10111220, 0x10111221, 0x10121020, + 0x10121021, 0x10121022, 0x10121120, 0x10121121, 0x10121122, 0x10121220, 0x10121221, 0x11101021, + 0x11101121, 0x11101122, 0x11101220, 0x11101221, 0x11101222, 0x11111020, 0x11111021, 0x11111022, + 0x11111120, 0x11111121, 0x11111122, 0x11111220, 0x11111221, 0x11111222, 0x11121021, 0x11121120, + 0x11121121, 0x11121221, 0x12101022, 0x12101121, 0x12101122, 0x12101220, 0x12101221, 0x12101222, + 0x12111021, 0x12111121, 0x12111222, 0x12121022, 0x12121121, 0x12121122, 0x12121220, 0x12121221, + 0x10102100, 0x10102101, 0x10102102, 0x10102201, 0x10112000, 0x10112101, 0x10112200, 0x10122001, + 0x10122202, 0x11102101, 0x11102200, 0x11102202, 0x11112001, 0x11112100, 0x11112101, 0x11112102, + 0x11112200, 0x11112201, 0x11122000, 0x11122002, 0x11122100, 0x11122101, 0x12102002, 0x12102201, + 0x12112000, 0x12112002, 0x12112101, 0x12112200, 0x12122001, 0x12122201, 0x10102011, 0x10102012, + 0x10102111, 0x10102212, 0x10112011, 0x10112110, 0x10112111, 0x10112112, 0x10112211, 0x10122111, + 0x11102011, 0x11102110, 0x11102111, 0x11102112, 0x11102211, 0x11112010, 0x11112011, 0x11112012, + 0x11112110, 0x11112111, 0x11112112, 0x11112210, 0x11112211, 0x11112212, 0x11122011, 0x11122110, + 0x11122111, 0x11122112, 0x11122211, 0x12102011, 0x12102111, 0x12102211, 0x12112011, 0x12112110, + 0x12112111, 0x12112112, 0x12112210, 0x12112211, 0x12122111, 0x10102120, 0x10102220, 0x10112121, + 0x10112222, 0x10122020, 0x10122121, 0x10122122, 0x10122221, 0x11102121, 0x11102220, 0x11102221, + 0x11112021, 0x11112121, 0x11112122, 0x11112220, 0x11112221, 0x11122022, 0x11122121, 0x11122220, + 0x11122222, 0x12102021, 0x12102222, 0x12112022, 0x12112121, 0x12112122, 0x12112220, 0x12112222, + 0x12122021, 0x10200101, 0x10210100, 0x10210102, 0x10210201, 0x10220101, 0x11200100, 0x11210000, + 0x11210101, 0x11210102, 0x11210200, 0x11210202, 0x11220001, 0x11220100, 0x11220102, 0x11220201, + 0x12200001, 0x12210102, 0x12220101, 0x10200011, 0x10200110, 0x10200112, 0x10200211, 0x10210012, + 0x10210111, 0x10220011, 0x10220012, 0x10220112, 0x10220211, 0x11200111, 0x11200211, 0x11210011, + 0x11210111, 0x11210112, 0x11210211, 0x11220111, 0x11220112, 0x11220212, 0x12200110, 0x12200212, + 0x12210012, 0x12210111, 0x12220011, 0x12220112, 0x12220211, 0x10210021, 0x10210122, 0x10210221, + 0x11200020, 0x11200021, 0x11200122, 0x11210121, 0x11210122, 0x11210220, 0x11220020, 0x12200121, + 0x12210021, 0x12210122, 0x12220121, 0x10211001, 0x10211002, 0x10211101, 0x10211102, 0x10211202, + 0x10221001, 0x10221102, 0x10221201, 0x11201000, 0x11201002, 0x11201101, 0x11201200, 0x11201202, + 0x11211001, 0x11211100, 0x11211101, 0x11211102, 0x11211201, 0x11211202, 0x11221000, 0x11221002, + 0x11221101, 0x12201100, 0x12201101, 0x12201201, 0x12211000, 0x12211002, 0x12211100, 0x12211101, + 0x12211102, 0x12211200, 0x12211202, 0x12221001, 0x12221100, 0x12221201, 0x10201111, 0x10201210, + 0x10201212, 0x10211011, 0x10211111, 0x10211112, 0x10211211, 0x11201110, 0x11201111, 0x11201112, + 0x11201211, 0x11211010, 0x11211011, 0x11211110, 0x11211111, 0x11211112, 0x11211211, 0x11221011, + 0x11221110, 0x11221111, 0x11221112, 0x11221211, 0x12201112, 0x12201211, 0x12201212, 0x12211011, + 0x12211111, 0x12211112, 0x12211211, 0x12211212, 0x12221012, 0x12221111, 0x12221112, 0x12221210, + 0x10201022, 0x10201221, 0x10211121, 0x10221020, 0x10221122, 0x10221220, 0x10221221, 0x11201020, + 0x11201121, 0x11201220, 0x11201222, 0x11211021, 0x11211120, 0x11211121, 0x11211122, 0x11211220, + 0x11211222, 0x11221020, 0x11221121, 0x11221220, 0x12201020, 0x12201022, 0x12201121, 0x12201222, + 0x12211120, 0x12211122, 0x12211220, 0x12211221, 0x12221020, 0x12221120, 0x12221122, 0x12221222, + 0x10212102, 0x10212201, 0x10222101, 0x11202001, 0x11212002, 0x11212101, 0x11212202, 0x11222001, + 0x11222201, 0x12202101, 0x12212001, 0x12212200, 0x12222102, 0x10202011, 0x10202110, 0x10212010, + 0x10212111, 0x10222011, 0x10222110, 0x10222112, 0x10222211, 0x11202010, 0x11202011, 0x11202111, + 0x11202112, 0x11202210, 0x11212011, 0x11212110, 0x11212111, 0x11212112, 0x11212211, 0x11222010, + 0x11222111, 0x11222212, 0x12202012, 0x12202110, 0x12202212, 0x12212111, 0x12222011, 0x12222110, + 0x12222111, 0x12222211, 0x10212021, 0x10212122, 0x10212220, 0x11202021, 0x11202120, 0x11202221, + 0x11212020, 0x11212121, 0x11212220, 0x11212222, 0x11222120, 0x11222121, 0x11222221, 0x12202122, + 0x12212120, 0x12212220, 0x12212222, 0x12222122, 0x20000000, 0x20000002, 0x20000200, 0x20000202, + 0x20020000, 0x20020002, 0x20020200, 0x20020202, 0x21000101, 0x21010000, 0x21010001, 0x21010100, + 0x21010102, 0x21010201, 0x21020101, 0x22000000, 0x22000002, 0x22000200, 0x22000202, 0x22010101, + 0x22020000, 0x22020002, 0x22020200, 0x22020202, 0x20000111, 0x20010011, 0x20010110, 0x20010112, + 0x20010211, 0x20020111, 0x21000011, 0x21000110, 0x21000211, 0x21010010, 0x21010012, 0x21010111, + 0x21010112, 0x21010210, 0x21010211, 0x21020110, 0x21020112, 0x21020211, 0x22000111, 0x22000211, + 0x22010110, 0x22010112, 0x22010211, 0x22020111, 0x20000020, 0x20000022, 0x20000220, 0x20000222, + 0x20010121, 0x20020020, 0x20020022, 0x20020220, 0x20020222, 0x21010021, 0x21010120, 0x21010221, + 0x21020121, 0x22000020, 0x22000022, 0x22000220, 0x22000222, 0x22010121, 0x22020020, 0x22020022, + 0x22020220, 0x22020222, 0x20011100, 0x20011201, 0x21001001, 0x21001100, 0x21011001, 0x21011101, + 0x21011202, 0x21021001, 0x21021100, 0x21021201, 0x22011100, 0x22011201, 0x20001011, 0x20001211, + 0x20011012, 0x20011111, 0x20011212, 0x20021112, 0x20021211, 0x21001010, 0x21001011, 0x21001111, + 0x21001210, 0x21011011, 0x21011110, 0x21011111, 0x21011112, 0x21011211, 0x21011212, 0x21021111, + 0x21021112, 0x21021210, 0x21021212, 0x22001011, 0x22001110, 0x22001112, 0x22001211, 0x22011010, + 0x22011012, 0x22011111, 0x22011210, 0x22021112, 0x20011021, 0x20011122, 0x20011221, 0x20021121, + 0x21001021, 0x21001120, 0x21001221, 0x21001222, 0x21011020, 0x21011121, 0x21011221, 0x21011222, + 0x21021021, 0x21021122, 0x21021222, 0x22001121, 0x22011021, 0x22011222, 0x22021120, 0x20002000, + 0x20002002, 0x20002200, 0x20002202, 0x20012101, 0x20022000, 0x20022002, 0x20022200, 0x20022202, + 0x21002001, 0x21002101, 0x21012001, 0x21012100, 0x21012201, 0x21022101, 0x21022201, 0x22002000, + 0x22002002, 0x22002200, 0x22002202, 0x22012101, 0x22022000, 0x22022002, 0x22022200, 0x22022202, + 0x20002111, 0x20002112, 0x20012011, 0x20012110, 0x20012112, 0x20022111, 0x21002011, 0x21002110, + 0x21002112, 0x21002211, 0x21012010, 0x21012012, 0x21012111, 0x21012212, 0x21022011, 0x21022110, + 0x22002111, 0x22012112, 0x22012211, 0x22022111, 0x20002020, 0x20002022, 0x20002220, 0x20002222, + 0x20012121, 0x20022020, 0x20022022, 0x20022220, 0x20022222, 0x21002121, 0x21012021, 0x21012120, + 0x21012122, 0x22002020, 0x22002022, 0x22002220, 0x22002222, 0x22012121, 0x22022020, 0x22022022, + 0x22022220, 0x22022222, 0x20100101, 0x20110001, 0x20110102, 0x20110200, 0x20110201, 0x20120101, + 0x21100001, 0x21100102, 0x21100201, 0x21110101, 0x21110200, 0x21110202, 0x21120201, 0x21120202, + 0x22100101, 0x22110001, 0x22110100, 0x22110102, 0x22110201, 0x22120101, 0x20100011, 0x20100110, + 0x20100112, 0x20100211, 0x20110010, 0x20110111, 0x20110210, 0x20110212, 0x20120011, 0x20120110, + 0x20120112, 0x20120211, 0x21100010, 0x21100111, 0x21110010, 0x21110011, 0x21110110, 0x21110111, + 0x21110112, 0x21110211, 0x21120012, 0x21120111, 0x22100110, 0x22100112, 0x22110012, 0x22110111, + 0x22110210, 0x22120011, 0x22120110, 0x22120112, 0x22120211, 0x20100121, 0x20110021, 0x20110120, + 0x20110221, 0x20120121, 0x21100120, 0x21100122, 0x21100221, 0x21110020, 0x21110022, 0x21110121, + 0x21110220, 0x21120122, 0x21120221, 0x22100121, 0x22110120, 0x22110122, 0x22120221, 0x20101001, + 0x20101100, 0x20101102, 0x20111000, 0x20111101, 0x20111200, 0x20121102, 0x21101000, 0x21101202, + 0x21111001, 0x21111100, 0x21111101, 0x21111102, 0x21111200, 0x21111201, 0x21121000, 0x21121001, + 0x21121002, 0x21121101, 0x22101100, 0x22101102, 0x22111002, 0x22111100, 0x22111101, 0x22111200, + 0x22121001, 0x22121201, 0x20101010, 0x20101111, 0x20101210, 0x20101212, 0x20111010, 0x20111011, + 0x20111110, 0x20111111, 0x20111112, 0x20111211, 0x20121011, 0x20121111, 0x20121211, 0x20121212, + 0x21101011, 0x21101110, 0x21101111, 0x21101112, 0x21101211, 0x21111010, 0x21111011, 0x21111012, + 0x21111110, 0x21111111, 0x21111112, 0x21111210, 0x21111211, 0x21111212, 0x21121011, 0x21121110, + 0x21121111, 0x21121112, 0x21121211, 0x22101011, 0x22101111, 0x22101210, 0x22111011, 0x22111012, + 0x22111110, 0x22111111, 0x22111112, 0x22111211, 0x22111212, 0x22121010, 0x22121012, 0x22121111, + 0x22121210, 0x22121212, 0x20101021, 0x20101120, 0x20111020, 0x20111121, 0x20111221, 0x20121020, + 0x20121122, 0x20121221, 0x21101121, 0x21101220, 0x21101221, 0x21111021, 0x21111022, 0x21111121, + 0x21111122, 0x21111221, 0x21121121, 0x21121220, 0x22101022, 0x22101120, 0x22101221, 0x22101222, + 0x22111022, 0x22111120, 0x22111121, 0x22121120, 0x22121122, 0x22121221, 0x20102101, 0x20112102, + 0x20112201, 0x20122101, 0x21102001, 0x21102102, 0x21112000, 0x21112002, 0x21112101, 0x21112102, + 0x21112202, 0x21122100, 0x21122101, 0x22102101, 0x22112001, 0x22112102, 0x22112201, 0x22122101, + 0x20102110, 0x20102112, 0x20102211, 0x20112010, 0x20112012, 0x20112111, 0x20112210, 0x20112212, + 0x20122010, 0x20122011, 0x20122110, 0x20122112, 0x21102010, 0x21102012, 0x21102111, 0x21102210, + 0x21102212, 0x21112011, 0x21112110, 0x21112111, 0x21112112, 0x21112211, 0x21122012, 0x21122111, + 0x21122112, 0x21122212, 0x22102011, 0x22102110, 0x22112010, 0x22112012, 0x22112111, 0x22112212, + 0x22122011, 0x22122112, 0x20102121, 0x20112121, 0x20122121, 0x21102120, 0x21102122, 0x21102221, + 0x21112020, 0x21112121, 0x21112220, 0x21122021, 0x22102121, 0x22112021, 0x22112120, 0x22112121, + 0x22112122, 0x20200000, 0x20200002, 0x20200200, 0x20200202, 0x20210101, 0x20220000, 0x20220002, + 0x20220200, 0x20220202, 0x21200101, 0x21210001, 0x21210100, 0x21210102, 0x21210201, 0x22200000, + 0x22200002, 0x22200200, 0x22200202, 0x22210101, 0x22220000, 0x22220002, 0x22220200, 0x22220202, + 0x20200111, 0x20200211, 0x20210011, 0x20210110, 0x20210112, 0x20210211, 0x20210212, 0x21200112, + 0x21200211, 0x21210011, 0x21210111, 0x21210210, 0x21210212, 0x21220011, 0x21220110, 0x22200111, + 0x22210010, 0x22210012, 0x22210112, 0x22210211, 0x20200022, 0x20200220, 0x20200222, 0x20210020, + 0x20210221, 0x20220022, 0x20220220, 0x20220222, 0x21200121, 0x21210021, 0x21210122, 0x21210221, + 0x21220121, 0x22200020, 0x22200022, 0x22200220, 0x22200222, 0x22210121, 0x22220020, 0x22220022, + 0x22220220, 0x22220222, 0x20211201, 0x20221101, 0x21201001, 0x21201100, 0x21211000, 0x21211100, + 0x21211101, 0x21211200, 0x21211202, 0x21221001, 0x21221101, 0x21221102, 0x21221200, 0x21221201, + 0x22201101, 0x20201112, 0x20201211, 0x20211010, 0x20211012, 0x20211111, 0x20211210, 0x20221112, + 0x20221211, 0x21201012, 0x21201111, 0x21211011, 0x21211110, 0x21211111, 0x21211112, 0x21211211, + 0x21221111, 0x21221212, 0x22201011, 0x22201110, 0x22201111, 0x22201112, 0x22201211, 0x22211012, + 0x22211111, 0x22211210, 0x20201121, 0x20211021, 0x20211122, 0x20211222, 0x20221021, 0x20221121, + 0x21201120, 0x21201122, 0x21201222, 0x21211022, 0x21211121, 0x21211122, 0x21211220, 0x21221020, + 0x21221022, 0x22201122, 0x22211020, 0x22211121, 0x22211122, 0x22211221, 0x22221021, 0x22221120, + 0x22221122, 0x20202000, 0x20202002, 0x20202200, 0x20202202, 0x20222000, 0x20222002, 0x20222200, + 0x20222202, 0x21212001, 0x21212100, 0x21212102, 0x21212201, 0x22202000, 0x22202002, 0x22202200, + 0x22202202, 0x22212101, 0x22222000, 0x22222002, 0x22222200, 0x22222202, 0x20202111, 0x20212110, + 0x20212211, 0x20222011, 0x20222111, 0x21202011, 0x21212010, 0x21212111, 0x21212212, 0x21222011, + 0x21222112, 0x21222211, 0x22212010, 0x22212112, 0x20202020, 0x20202022, 0x20202220, 0x20202222, + 0x20222020, 0x20222022, 0x20222220, 0x20222222, 0x21212021, 0x21212120, 0x21212122, 0x22202020, + 0x22202022, 0x22202220, 0x22202222, 0x22212121, 0x22222020, 0x22222022, 0x22222220, 0x22222222, GGML_TABLE_END() +#endif #endif // GGML_COMMON_IMPL diff --git a/ggml-cuda.cu b/ggml-cuda.cu index c207ff87a281a..d2945d3c2048d 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -565,8 +565,8 @@ static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N #define QI1_S (QK_K / (4*QR1_S)) typedef struct { half d; - uint8_t qs[QK_K/8]; - uint8_t scales[QK_K/16]; + uint8_t qs[QK_K/8]; + uint16_t qh[QK_K/32]; } block_iq1_s; static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding"); @@ -1722,11 +1722,22 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_ const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const int i8 = 4*ib+il; - uint8_t h = x[i].scales[i8/2] >> 4*(i8%2); - const int8_t * grid = (const int8_t *)(iq1s_grid + (x[i].qs[i8] | ((h & 8) << 5))); - const float d = (float)x[i].d * (2*(h & 7) + 1); - for (int j = 0; j < 8; ++j) y[j] = d * grid[j]; + const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 0xf) + 1); +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int grid32[2]; const int8_t * q = (const int8_t *)grid32; + grid32[0] = *((const int *)(iq1s_grid_gpu + (x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)))); + grid32[1] = __vsub4((grid32[0] >> 4) & 0x0f0f0f0f, 0x01010101); + grid32[0] = __vsub4(grid32[0] & 0x0f0f0f0f, 0x01010101); + for (int j = 0; j < 8; ++j) { + y[j] = d * q[j]; + } +#else + const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8))); + for (int j = 0; j < 4; ++j) { + y[j+0] = d * ((grid[j] & 0xf) - 1); + y[j+4] = d * ((grid[j] >> 4) - 1); + } +#endif #else assert(false); #endif @@ -4538,44 +4549,33 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1( #endif } - static __device__ __forceinline__ float vec_dot_iq1_s_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { #if QK_K == 256 const block_iq1_s * bq1 = (const block_iq1_s *) vbq; const int ib32 = iqs; - int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0; - const uint8_t h1 = bq1->scales[2*ib32+0]; - const uint8_t h2 = bq1->scales[2*ib32+1]; + int sumi = 0; #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics const int * q8 = (const int *)bq8_1[ib32].qs; - const int * grid1 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5))); - const int * grid2 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1))); - const int * grid3 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5))); - const int * grid4 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1))); - for (int j = 0; j < 2; ++j) { - sumi1 = __dp4a(q8[j+0], grid1[j], sumi1); - sumi2 = __dp4a(q8[j+2], grid2[j], sumi2); - sumi3 = __dp4a(q8[j+4], grid3[j], sumi3); - sumi4 = __dp4a(q8[j+6], grid4[j], sumi4); + for (int l = 0; l < 4; ++l) { + const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8))); + int grid0 = __vsub4(grid[0] & 0x0f0f0f0f, 0x01010101); + int grid1 = __vsub4((grid[0] >> 4) & 0x0f0f0f0f, 0x01010101); + sumi = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi)); } #else const int8_t * q8 = bq8_1[ib32].qs; - const int8_t * grid1 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5))); - const int8_t * grid2 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1))); - const int8_t * grid3 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5))); - const int8_t * grid4 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1))); - for (int j = 0; j < 8; ++j) { - sumi1 += q8[j+ 0] * grid1[j]; - sumi2 += q8[j+ 8] * grid2[j]; - sumi3 += q8[j+16] * grid3[j]; - sumi4 += q8[j+24] * grid4[j]; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8))); + for (int j = 0; j < 4; ++j) { + sumi += q8[j] * ((grid[j] & 0xf) - 1) + q8[j+4] * ((grid[j] >> 4) - 1); + } + q8 += 8; } #endif const float d = (float)bq1->d * __low2float(bq8_1[ib32].ds); - return d * (sumi1 * (2*(h1 & 7) + 1) + sumi2 * (2*((h1 >> 4) & 7) + 1) + - sumi3 * (2*(h2 & 7) + 1) + sumi4 * (2*((h2 >> 4) & 7) + 1)); + return d * sumi * (2*(bq1->qh[ib32] >> 12) + 1); #else assert(false); return 0.f; diff --git a/ggml-metal.metal b/ggml-metal.metal index 50185ae4dea09..912822ee64bc3 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -2595,8 +2595,8 @@ typedef struct { typedef struct { half d; - uint8_t qs[QK_K/8]; - uint8_t scales[QK_K/16]; + uint8_t qs[QK_K/8]; + uint16_t qh[QK_K/32]; } block_iq1_s; // Non-linear quants @@ -4338,48 +4338,53 @@ void kernel_mul_mv_iq1_s_f32_impl( device const block_iq1_s * x = (device const block_iq1_s *) src0 + ib_row + offset0; device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; - float yl[16]; + float yl[32]; float sumf[N_DST]={0.f}, all_sum; const int nb32 = nb * (QK_K / 32); - const int ix = tiisg/2; - const int il = tiisg%2; + const int ix = tiisg; - device const float * y4 = y + 32 * ix + 16 * il; + device const float * y4 = y + 32 * ix; - for (int ib32 = ix; ib32 < nb32; ib32 += 16) { + for (int ib32 = ix; ib32 < nb32; ib32 += 32) { - for (int i = 0; i < 16; ++i) { + float sumy = 0; + for (int i = 0; i < 32; ++i) { yl[i] = y4[i]; + sumy += yl[i]; } const int ibl = ib32 / (QK_K / 32); const int ib = ib32 % (QK_K / 32); device const block_iq1_s * xr = x + ibl; - device const uint8_t * qs = xr->qs + 4 * ib + 2 * il; - device const uint8_t * sc = xr->scales + 2 * ib + il; - device const half * dh = &xr->d; + device const uint8_t * qs = xr->qs + 4 * ib; + device const uint16_t * qh = xr->qh + ib; + device const half * dh = &xr->d; for (int row = 0; row < N_DST; row++) { - constant int8_t * grid1 = (constant int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5))); - constant int8_t * grid2 = (constant int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1))); + constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700))); + constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 5) & 0x700))); + constant uint8_t * grid3 = (constant uint8_t *)(iq1s_grid_gpu + (qs[2] | ((qh[0] << 2) & 0x700))); + constant uint8_t * grid4 = (constant uint8_t *)(iq1s_grid_gpu + (qs[3] | ((qh[0] >> 1) & 0x700))); - float2 sum = {0}; - for (int j = 0; j < 8; ++j) { - sum[0] += yl[j+ 0] * grid1[j]; - sum[1] += yl[j+ 8] * grid2[j]; + float sum = 0; + for (int j = 0; j < 4; ++j) { + sum += yl[j+ 0] * (grid1[j] & 0xf) + yl[j+ 4] * (grid1[j] >> 4) + + yl[j+ 8] * (grid2[j] & 0xf) + yl[j+12] * (grid2[j] >> 4) + + yl[j+16] * (grid3[j] & 0xf) + yl[j+20] * (grid3[j] >> 4) + + yl[j+24] * (grid4[j] & 0xf) + yl[j+28] * (grid4[j] >> 4); } - sumf[row] += (float)dh[0] * (sum[0] * (2*(sc[0] & 7) + 1) + sum[1] * (2*((sc[0] >> 4) & 7) + 1)); + sumf[row] += (float)dh[0] * (sum - sumy) * (2*(qh[0] >> 12) + 1); dh += nb*sizeof(block_iq1_s)/2; qs += nb*sizeof(block_iq1_s); - sc += nb*sizeof(block_iq1_s); + qh += nb*sizeof(block_iq1_s)/2; } - y4 += 16 * 32; + y4 += 32 * 32; } for (int row = 0; row < N_DST; ++row) { @@ -5066,16 +5071,19 @@ void dequantize_iq2_s(device const block_iq2_s * xb, short il, thread type4x4 & template void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) { // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const int ib32 = il/2; + il = il%2; const float d = xb->d; - device const uint8_t * qs = xb->qs + 2*il; - device const uint8_t * sc = xb->scales + il; - const float dl1 = d * (2*(sc[0] & 7) + 1); - const float dl2 = d * (2*((sc[0] >> 4) & 7) + 1); - constant int8_t * grid1 = (constant int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5))); - constant int8_t * grid2 = (constant int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1))); - for (int i = 0; i < 8; ++i) { - reg[i/4+0][i%4] = dl1 * grid1[i]; - reg[i/4+2][i%4] = dl2 * grid2[i]; + device const uint8_t * qs = xb->qs + 4*ib32 + 2*il; + device const uint16_t * qh = xb->qh; + const float dl = d * (2*(qh[ib32] >> 12) + 1); + constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | (((qh[ib32] >> (6*il+0)) & 7) << 8))); + constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | (((qh[ib32] >> (6*il+3)) & 7) << 8))); + for (int i = 0; i < 4; ++i) { + reg[0][i] = dl * (grid1[i] & 0xf) - dl; + reg[1][i] = dl * (grid1[i] >> 4) - dl; + reg[2][i] = dl * (grid2[i] & 0xf) - dl; + reg[3][i] = dl * (grid2[i] >> 4) - dl; } } diff --git a/ggml-quants.c b/ggml-quants.c index 42d8a5d805144..f9a3d9fd229e1 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -3449,39 +3449,22 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in assert(k % QK_K == 0); const int nb = k / QK_K; - float db[4]; - uint16_t idx[4]; - //const int8_t * grid[4]; - for (int i = 0; i < nb; i++) { const float d = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * sc = x[i].scales; - const uint8_t * qs = x[i].qs; + const uint8_t * qs = x[i].qs; + const uint16_t * qh = x[i].qh; - for (int i8 = 0; i8 < QK_K/8; i8 += 4) { - idx[0] = qs[0] | ((sc[0] & 0x08) << 5); - idx[1] = qs[1] | ((sc[0] & 0x80) << 1); - idx[2] = qs[2] | ((sc[1] & 0x08) << 5); - idx[3] = qs[3] | ((sc[1] & 0x80) << 1); - //grid[0] = (const int8_t *)(iq1s_grid + (qs[0] | ((sc[0] & 0x08) << 5))); - //grid[1] = (const int8_t *)(iq1s_grid + (qs[1] | ((sc[0] & 0x80) << 1))); - //grid[2] = (const int8_t *)(iq1s_grid + (qs[2] | ((sc[1] & 0x08) << 5))); - //grid[3] = (const int8_t *)(iq1s_grid + (qs[3] | ((sc[1] & 0x80) << 1))); - db[0] = d * (2*(sc[0] & 7) + 1); - db[1] = d * (2*((sc[0] >> 4) & 7) + 1); - db[2] = d * (2*(sc[1] & 7) + 1); - db[3] = d * (2*((sc[1] >> 4) & 7) + 1); + for (int ib = 0; ib < QK_K/32; ++ib) { + const float dl = d * (2*(qh[ib] >> 12) + 1); for (int l = 0; l < 4; ++l) { - const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]); + const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8))); for (int j = 0; j < 8; ++j) { - //y[j] = db[l] * grid[l][j]; - y[j] = db[l] * grid[j]; + y[j] = dl * grid[j]; } y += 8; } qs += 4; - sc += 2; } } } @@ -9587,113 +9570,72 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void const int nb = n / QK_K; - // TODO: implement for QK_K = 64 -#if defined __ARM_NEON && QK_K == 256 - - const uint8x16_t m8 = vdupq_n_u8(0x08); - const uint8x16_t m7 = vdupq_n_u8(0x07); - const uint8x16_t m1 = vdupq_n_u8(0x01); - const int32x4_t vzero = vdupq_n_s32(0); +#if defined __ARM_NEON - uint16_t gindex[8]; - uint16x8x2_t vindex; - int8x16x4_t q1b; + ggml_int8x16x4_t q1b; ggml_int8x16x4_t q8b; - uint16x8x4_t scales; - int32x4x2_t sumi; - int32x4x2_t dotq; float sumf = 0; for (int i = 0; i < nb; ++i) { - const int8_t * q8 = y[i].qs; - const uint8_t * qs = x[i].qs; - const uint8_t * sc = x[i].scales; + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint16_t * qh = x[i].qh; - sumi.val[0] = sumi.val[1] = vzero; + int sumi1 = 0, sumi2 = 0; - for (int i128 = 0; i128 < QK_K/128; ++i128) { - const uint8x16_t ql = vld1q_u8(qs); qs += 16; - const uint8x8_t tm1 = vld1_u8 (sc); sc += 8; - const uint8x8_t tm2 = vshr_n_u8(tm1, 4); - const uint8x16_t qh = vcombine_u8(vzip1_u8(tm1, tm2), vzip2_u8(tm1, tm2)); - const uint8x16_t hbit = vandq_u8(qh, m8); - vindex.val[0] = vorrq_u16(vmovl_u8(vget_low_u8 (ql)), vshlq_n_u16(vmovl_u8(vget_low_u8 (hbit)), 5)); - vindex.val[1] = vorrq_u16(vmovl_u8(vget_high_u8(ql)), vshlq_n_u16(vmovl_u8(vget_high_u8(hbit)), 5)); - const uint8x16_t scales8 = vorrq_u8(vshlq_n_u8(vandq_u8(qh, m7), 1), m1); - scales.val[0] = vmovl_u8(vget_low_u8 (scales8)); - scales.val[1] = vmovl_u8(vget_high_u8 (scales8)); + for (int ib = 0; ib < QK_K/32; ib += 2) { - for (int l = 0; l < 2; ++l) { - vst1q_u16(gindex+0, vindex.val[l]); - q1b.val[0] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[0])), vld1_s8((const void *)(iq1s_grid+gindex[1]))); - q1b.val[1] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[2])), vld1_s8((const void *)(iq1s_grid+gindex[3]))); - q1b.val[2] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[4])), vld1_s8((const void *)(iq1s_grid+gindex[5]))); - q1b.val[3] = vcombine_s8(vld1_s8((const void *)(iq1s_grid+gindex[6])), vld1_s8((const void *)(iq1s_grid+gindex[7]))); - q8b = ggml_vld1q_s8_x4(q8); q8 += 64; + q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[ib+0] << 8) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[ib+0] << 5) & 0x700))))); + q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[ib+0] << 2) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[ib+0] >> 1) & 0x700))))); + q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[ib+1] << 8) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[ib+1] << 5) & 0x700))))); + q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[ib+1] << 2) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[ib+1] >> 1) & 0x700))))); + qs += 8; + + q8b = ggml_vld1q_s8_x4(q8); q8 += 64; - dotq.val[0] = vpaddq_s32(ggml_vdotq_s32(vzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(vzero, q1b.val[1], q8b.val[1])); - dotq.val[1] = vpaddq_s32(ggml_vdotq_s32(vzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(vzero, q1b.val[3], q8b.val[3])); + const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[0], q8b.val[0]), q1b.val[1], q8b.val[1]); + const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q1b.val[2], q8b.val[2]), q1b.val[3], q8b.val[3]); + + sumi1 += vaddvq_s32(p1) * (2*(qh[ib+0] >> 12) + 1); + sumi2 += vaddvq_s32(p2) * (2*(qh[ib+1] >> 12) + 1); - sumi.val[0] = vmlaq_s32(sumi.val[0], dotq.val[0], vreinterpretq_s32_u32(vmovl_u16(vget_low_u16 (scales.val[l])))); - sumi.val[1] = vmlaq_s32(sumi.val[1], dotq.val[1], vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales.val[l])))); - } } - sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * vaddvq_s32(vaddq_s32(sumi.val[0], sumi.val[1])); + sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (sumi1 + sumi2); } *s = sumf; - // TODO: implement for QK_K = 64 -#elif defined __AVX2__ && QK_K == 256 - - const __m128i m8 = _mm_set1_epi8(0x08); - const __m128i m7 = _mm_set1_epi8(0x07); - const __m128i m1 = _mm_set1_epi8(0x01); - const __m128i shuffle_h = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0); - const __m128i shuffle_s[4] = { - _mm_set_epi32(0x03030303, 0x02020202, 0x01010101, 0x00000000), - _mm_set_epi32(0x07070707, 0x06060606, 0x05050505, 0x04040404), - _mm_set_epi32(0x0b0b0b0b, 0x0a0a0a0a, 0x09090909, 0x08080808), - _mm_set_epi32(0x0f0f0f0f, 0x0e0e0e0e, 0x0d0d0d0d, 0x0c0c0c0c) - }; - - uint64_t aux64; - - typedef union m256i_uint16 { - __m256i reg; - uint16_t s[16]; - } m256i_uint16_t; - - m256i_uint16_t v_gindex; +#elif defined __AVX2__ __m256 accum = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { - const int8_t * q8 = y[i].qs; - const uint8_t * qs = x[i].qs; - const uint8_t * sc = x[i].scales; + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint16_t * qh = x[i].qh; __m256i sumi = _mm256_setzero_si256(); - for (int i128 = 0; i128 < QK_K/128; ++i128) { - const __m128i ql = _mm_loadu_si128((const __m128i*)qs); qs += 16; - memcpy(&aux64, sc, 8); sc += 8; - const __m128i qh = _mm_shuffle_epi8(_mm_set_epi64x(aux64 >> 4, aux64), shuffle_h); - const __m256i hbit = _mm256_cvtepu8_epi16(_mm_and_si128(qh, m8)); - v_gindex.reg = _mm256_or_si256(_mm256_cvtepu8_epi16(ql), _mm256_slli_epi16(hbit, 5)); - const __m128i scales = _mm_or_si128(_mm_slli_epi16(_mm_and_si128(qh, m7), 1), m1); + for (int ib = 0; ib < QK_K/32; ib += 2) { + const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)], + iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]); + const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)], + iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]); + qs += 8; + const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - for (int i32 = 0; i32 < 4; ++i32) { - const __m256i q8b = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; - const __m256i q1b = _mm256_set_epi64x(iq1s_grid[v_gindex.s[4*i32+3]], iq1s_grid[v_gindex.s[4*i32+2]], - iq1s_grid[v_gindex.s[4*i32+1]], iq1s_grid[v_gindex.s[4*i32+0]]); - const __m256i dot = mul_add_epi8(q1b, q8b); - const __m256i s16 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, shuffle_s[i32])); - const __m256i p = _mm256_madd_epi16(s16, dot); - sumi = _mm256_add_epi32(sumi, p); - } + const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1); + const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2); + const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*(qh[ib+0] >> 12) + 1)); + const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*(qh[ib+1] >> 12) + 1)); + sumi = _mm256_add_epi32(sumi, _mm256_add_epi32(p1, p2)); } accum = _mm256_fmadd_ps(_mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)), _mm256_cvtepi32_ps(sumi), accum); @@ -9704,35 +9646,26 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void #else - int db[4]; - uint16_t idx[4]; - float sumf = 0; - for (int i = 0; i < nb; ++i) { + for (int i = 0; i < nb; i++) { - const int8_t * q8 = y[i].qs; - const uint8_t * qs = x[i].qs; - const uint8_t * sc = x[i].scales; + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint16_t * qh = x[i].qh; int sumi = 0; - for (int i32 = 0; i32 < QK_K/32; ++i32) { - idx[0] = qs[0] | ((sc[0] & 0x08) << 5); - idx[1] = qs[1] | ((sc[0] & 0x80) << 1); - idx[2] = qs[2] | ((sc[1] & 0x08) << 5); - idx[3] = qs[3] | ((sc[1] & 0x80) << 1); - db[0] = (2*(sc[0] & 7) + 1); - db[1] = (2*((sc[0] >> 4) & 7) + 1); - db[2] = (2*(sc[1] & 7) + 1); - db[3] = (2*((sc[1] >> 4) & 7) + 1); + for (int ib = 0; ib < QK_K/32; ++ib) { + const int ls = 2*(qh[ib] >> 12) + 1; + int lsum = 0; for (int l = 0; l < 4; ++l) { - const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]); - int suml = 0; - for (int j = 0; j < 8; ++j) suml += q8[j] * grid[j]; - sumi += db[l] * suml; + const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((qh[ib] >> 3*l) & 7) << 8))); + for (int j = 0; j < 8; ++j) { + lsum += q8[j] * grid[j]; + } q8 += 8; } + sumi += ls * lsum; qs += 4; - sc += 2; } sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * sumi; @@ -9996,7 +9929,7 @@ static inline int iq2_grid_size(enum ggml_type type) { GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S); return type == GGML_TYPE_IQ2_XXS ? 256 : type == GGML_TYPE_IQ2_XS ? 512 : - type == GGML_TYPE_IQ1_S ? 512 : 1024; + type == GGML_TYPE_IQ1_S ? NGRID_IQ1S : 1024; } static int iq2_compare_func(const void * left, const void * right) { @@ -10063,39 +9996,135 @@ void iq2xs_init_impl(enum ggml_type type) { 40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048, 42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690, }; - static const uint16_t kgrid_1bit_512[512] = { - 10, 33, 41, 85, 132, 134, 160, 162, 277, 337, 340, 345, 357, 405, 516, 545, - 553, 598, 641, 650, 681, 1042, 1044, 1097, 1169, 1176, 1320, 1345, 1365, 1378, 1434, 1444, - 1545, 1617, 1642, 1685, 2053, 2080, 2089, 2133, 2176, 2182, 2208, 2214, 2306, 2384, 2393, 2440, - 2453, 2581, 2664, 2690, 2721, 4117, 4161, 4182, 4184, 4261, 4357, 4369, 4372, 4377, 4390, 4422, - 4432, 4437, 4449, 4457, 4485, 4497, 4505, 4629, 4677, 4696, 4774, 5205, 5217, 5225, 5386, 5397, - 5409, 5445, 5457, 5460, 5461, 5462, 5465, 5472, 5477, 5525, 5545, 5650, 5668, 5717, 5729, 5769, - 5777, 6212, 6234, 6244, 6293, 6424, 6482, 6485, 6502, 6505, 6529, 6538, 6565, 6656, 6682, 6788, - 6806, 6820, 8218, 8224, 8226, 8232, 8277, 8326, 8354, 8469, 8521, 8530, 8549, 8596, 8737, 8794, - 9221, 9253, 9348, 9369, 9380, 9474, 9557, 9633, 9732, 9753, 9793, 9830, 9862, 9880, 10240, 10272, - 10282, 10321, 10406, 10517, 10530, 10566, 10585, 10645, 10896, 16466, 16468, 16473, 16485, 16646, 16660, 16665, - 16725, 16793, 16806, 16914, 16969, 16977, 16996, 17028, 17057, 17408, 17416, 17434, 17493, 17512, 17578, 17685, - 17696, 17733, 17745, 17748, 17749, 17750, 17753, 17765, 17794, 17813, 17946, 17984, 18005, 18072, 18453, 18529, - 18569, 18722, 18756, 18762, 18773, 18794, 18833, 18853, 18945, 19026, 19033, 19077, 20489, 20497, 20500, 20517, - 20565, 20586, 20610, 20633, 20757, 20769, 20776, 20805, 20817, 20820, 20821, 20822, 20825, 20837, 20864, 20872, - 20885, 20896, 21002, 21029, 21077, 21146, 21510, 21525, 21573, 21585, 21588, 21589, 21590, 21593, 21605, 21653, - 21665, 21765, 21777, 21780, 21781, 21782, 21785, 21797, 21825, 21828, 21829, 21830, 21833, 21840, 21841, 21842, - 21844, 21846, 21848, 21849, 21850, 21857, 21860, 21861, 21862, 21865, 21893, 21905, 21908, 21909, 21910, 21913, - 21925, 22024, 22037, 22085, 22097, 22100, 22101, 22102, 22105, 22117, 22165, 22545, 22566, 22568, 22594, 22608, - 22613, 22676, 22697, 22793, 22805, 22853, 22865, 22868, 22869, 22870, 22873, 22885, 22933, 22946, 23046, 23072, - 23125, 23209, 24597, 24640, 24665, 24673, 24725, 24833, 24840, 24869, 24917, 24934, 24965, 25001, 25108, 25110, - 25152, 25184, 25192, 25234, 25616, 25618, 25625, 25685, 25704, 25738, 25744, 25770, 25877, 25897, 25925, 25937, - 25940, 25941, 25942, 25945, 25957, 25986, 26005, 26186, 26197, 26276, 26632, 26634, 26725, 26757, 26770, 26885, - 26965, 26976, 26986, 27032, 27153, 27174, 27200, 27208, 27240, 27269, 27282, 27290, 32778, 32800, 32802, 32808, - 32810, 32853, 32904, 32922, 32930, 32932, 33105, 33110, 33112, 33125, 33157, 33280, 33288, 33301, 33312, 33320, - 33424, 33797, 33829, 33858, 34068, 34133, 34146, 34176, 34217, 34306, 34342, 34441, 34454, 34468, 34832, 34918, - 34965, 34984, 35094, 35137, 35161, 35208, 35232, 35332, 35338, 35368, 35429, 36932, 36934, 36953, 37009, 37125, - 37136, 37138, 37145, 37157, 37205, 37220, 37258, 37290, 37444, 37446, 37465, 37478, 37525, 37905, 37968, 37973, - 38040, 38054, 38145, 38154, 38165, 38180, 38186, 38213, 38225, 38228, 38229, 38230, 38233, 38245, 38293, 38485, - 38504, 38530, 38938, 38985, 38993, 39012, 39040, 39173, 39192, 39253, 39265, 39301, 39316, 39322, 39442, 39497, - 39504, 39590, 40970, 40984, 40992, 41002, 41045, 41120, 41128, 41237, 41289, 41297, 41317, 41364, 41366, 41514, - 41557, 41633, 41989, 42021, 42056, 42068, 42074, 42113, 42242, 42265, 42274, 42325, 42340, 42402, 42501, 42512, - 42533, 42624, 42632, 42666, 43040, 43093, 43106, 43168, 43176, 43264, 43286, 43345, 43429, 43590, 43618, 43680, + static const uint16_t kgrid_1bit_2048[NGRID_IQ1S] = { + 0, 2, 5, 8, 10, 17, 21, 32, 34, 40, 42, 69, 81, 84, 86, 101, + 128, 130, 136, 138, 149, 160, 162, 168, 170, 260, 261, 273, 276, 278, 281, 282, + 293, 321, 326, 329, 338, 341, 346, 353, 356, 358, 360, 389, 401, 404, 406, 421, + 512, 514, 520, 522, 533, 544, 546, 552, 554, 581, 593, 601, 612, 617, 640, 642, + 648, 650, 657, 661, 665, 672, 674, 680, 682, 1041, 1044, 1046, 1061, 1089, 1097, 1109, + 1114, 1124, 1125, 1169, 1177, 1189, 1281, 1284, 1285, 1286, 1301, 1304, 1306, 1321, 1344, 1349, + 1354, 1360, 1361, 1364, 1365, 1366, 1369, 1376, 1378, 1381, 1384, 1386, 1409, 1425, 1429, 1432, + 1434, 1441, 1444, 1445, 1446, 1449, 1556, 1561, 1601, 1604, 1616, 1618, 1621, 1624, 1632, 1633, + 1638, 1641, 1669, 1681, 1684, 1689, 2048, 2050, 2056, 2058, 2069, 2080, 2082, 2088, 2090, 2117, + 2129, 2134, 2149, 2176, 2178, 2184, 2186, 2197, 2208, 2210, 2216, 2218, 2309, 2321, 2324, 2329, + 2340, 2341, 2369, 2384, 2385, 2389, 2401, 2404, 2409, 2449, 2452, 2454, 2457, 2469, 2560, 2562, + 2568, 2570, 2581, 2592, 2594, 2600, 2602, 2629, 2641, 2649, 2657, 2661, 2688, 2690, 2693, 2696, + 2698, 2709, 2720, 2722, 2728, 2730, 4112, 4113, 4116, 4121, 4132, 4133, 4161, 4164, 4176, 4181, + 4184, 4193, 4196, 4197, 4201, 4241, 4244, 4246, 4257, 4261, 4353, 4356, 4358, 4361, 4368, 4370, + 4373, 4376, 4385, 4388, 4393, 4421, 4426, 4432, 4433, 4434, 4436, 4437, 4438, 4441, 4448, 4453, + 4484, 4498, 4501, 4513, 4516, 4625, 4628, 4630, 4645, 4672, 4678, 4681, 4690, 4693, 4696, 4698, + 4708, 4710, 4741, 4753, 4756, 4758, 4773, 5121, 5126, 5129, 5140, 5141, 5144, 5145, 5153, 5158, + 5185, 5189, 5190, 5192, 5194, 5201, 5204, 5205, 5206, 5209, 5218, 5221, 5224, 5252, 5257, 5264, + 5268, 5269, 5272, 5273, 5274, 5281, 5284, 5285, 5289, 5378, 5381, 5386, 5393, 5396, 5397, 5398, + 5401, 5408, 5410, 5413, 5416, 5418, 5441, 5444, 5445, 5446, 5457, 5458, 5460, 5461, 5462, 5465, + 5466, 5473, 5476, 5477, 5478, 5481, 5504, 5506, 5508, 5509, 5512, 5514, 5520, 5521, 5524, 5525, + 5526, 5529, 5530, 5536, 5538, 5541, 5633, 5636, 5637, 5638, 5653, 5654, 5656, 5658, 5665, 5670, + 5696, 5698, 5700, 5701, 5704, 5706, 5713, 5717, 5718, 5720, 5721, 5729, 5732, 5733, 5736, 5737, + 5738, 5766, 5770, 5778, 5781, 5796, 5801, 6161, 6166, 6181, 6209, 6212, 6214, 6217, 6224, 6229, + 6232, 6234, 6240, 6241, 6244, 6246, 6249, 6277, 6289, 6292, 6309, 6416, 6418, 6421, 6426, 6433, + 6437, 6466, 6468, 6469, 6472, 6481, 6484, 6485, 6486, 6489, 6490, 6496, 6501, 6506, 6537, 6545, + 6546, 6549, 6552, 6561, 6566, 6569, 6665, 6678, 6692, 6694, 6724, 6726, 6729, 6736, 6738, 6741, + 6744, 6753, 6758, 6761, 6789, 6801, 6806, 6810, 8192, 8194, 8200, 8202, 8213, 8224, 8226, 8229, + 8232, 8234, 8261, 8273, 8281, 8289, 8293, 8320, 8322, 8328, 8330, 8341, 8352, 8354, 8357, 8360, + 8362, 8453, 8465, 8468, 8473, 8485, 8514, 8516, 8521, 8533, 8536, 8538, 8545, 8548, 8549, 8550, + 8581, 8592, 8598, 8601, 8613, 8705, 8712, 8714, 8721, 8725, 8736, 8738, 8744, 8746, 8773, 8785, + 8790, 8793, 8805, 8833, 8840, 8842, 8849, 8853, 8864, 8866, 8872, 8874, 9221, 9236, 9238, 9241, + 9253, 9284, 9285, 9286, 9289, 9298, 9301, 9304, 9306, 9318, 9349, 9361, 9364, 9369, 9377, 9381, + 9481, 9493, 9505, 9513, 9536, 9541, 9544, 9553, 9556, 9557, 9561, 9570, 9573, 9576, 9609, 9616, + 9620, 9621, 9624, 9626, 9633, 9636, 9638, 9641, 9733, 9744, 9746, 9753, 9765, 9793, 9801, 9813, + 9824, 9825, 9833, 9860, 9862, 9872, 9882, 10240, 10242, 10248, 10250, 10261, 10272, 10274, 10280, 10282, + 10309, 10321, 10324, 10341, 10368, 10370, 10376, 10378, 10400, 10402, 10408, 10410, 10505, 10513, 10516, 10521, + 10533, 10566, 10569, 10578, 10581, 10593, 10596, 10598, 10601, 10629, 10640, 10646, 10649, 10660, 10661, 10752, + 10754, 10760, 10762, 10784, 10786, 10792, 10794, 10821, 10833, 10838, 10841, 10853, 10880, 10882, 10888, 10890, + 10901, 10912, 10914, 10920, 10922, 16389, 16401, 16406, 16421, 16457, 16466, 16469, 16472, 16474, 16481, 16484, + 16486, 16532, 16537, 16545, 16550, 16640, 16641, 16644, 16646, 16649, 16658, 16661, 16662, 16664, 16666, 16673, + 16678, 16681, 16709, 16712, 16714, 16721, 16724, 16725, 16726, 16729, 16730, 16741, 16744, 16746, 16769, 16772, + 16774, 16784, 16786, 16789, 16800, 16801, 16802, 16901, 16913, 16916, 16918, 16933, 16961, 16978, 16981, 16986, + 16996, 17001, 17033, 17044, 17061, 17409, 17429, 17433, 17449, 17477, 17480, 17482, 17489, 17492, 17493, 17494, + 17505, 17506, 17509, 17512, 17514, 17537, 17542, 17545, 17552, 17554, 17557, 17568, 17569, 17577, 17665, 17666, + 17669, 17674, 17681, 17684, 17685, 17686, 17689, 17696, 17701, 17706, 17729, 17732, 17733, 17734, 17737, 17744, + 17745, 17748, 17749, 17750, 17752, 17753, 17761, 17764, 17765, 17766, 17769, 17794, 17796, 17797, 17800, 17809, + 17812, 17813, 17814, 17817, 17818, 17829, 17832, 17834, 17921, 17925, 17929, 17940, 17941, 17944, 17946, 17953, + 17956, 17961, 17984, 17986, 17989, 17992, 18000, 18001, 18002, 18005, 18006, 18009, 18018, 18021, 18024, 18049, + 18053, 18058, 18068, 18069, 18081, 18084, 18086, 18437, 18449, 18453, 18458, 18469, 18498, 18505, 18512, 18517, + 18520, 18529, 18532, 18534, 18537, 18565, 18577, 18580, 18582, 18585, 18597, 18689, 18693, 18694, 18698, 18704, + 18708, 18709, 18712, 18721, 18724, 18726, 18752, 18757, 18762, 18769, 18770, 18772, 18773, 18774, 18777, 18784, + 18786, 18789, 18790, 18794, 18822, 18825, 18834, 18837, 18838, 18840, 18849, 18852, 18854, 18857, 18966, 19012, + 19014, 19017, 19029, 19032, 19034, 19044, 19049, 19092, 19109, 20481, 20484, 20485, 20486, 20489, 20498, 20501, + 20506, 20513, 20516, 20521, 20544, 20549, 20552, 20561, 20564, 20565, 20566, 20569, 20581, 20584, 20614, 20617, + 20629, 20632, 20640, 20641, 20646, 20649, 20741, 20744, 20745, 20746, 20753, 20756, 20757, 20758, 20760, 20761, + 20768, 20773, 20774, 20776, 20778, 20801, 20804, 20805, 20806, 20809, 20816, 20817, 20818, 20820, 20821, 20822, + 20824, 20825, 20826, 20833, 20836, 20837, 20838, 20841, 20866, 20869, 20881, 20884, 20885, 20886, 20889, 20896, + 20901, 20906, 20993, 20998, 21010, 21013, 21018, 21025, 21028, 21058, 21061, 21066, 21073, 21076, 21077, 21078, + 21081, 21090, 21093, 21125, 21136, 21138, 21141, 21145, 21146, 21156, 21508, 21509, 21521, 21524, 21525, 21526, + 21528, 21529, 21537, 21541, 21544, 21546, 21569, 21572, 21573, 21574, 21577, 21578, 21584, 21585, 21588, 21589, + 21590, 21592, 21593, 21594, 21601, 21602, 21604, 21605, 21606, 21609, 21632, 21640, 21642, 21649, 21652, 21653, + 21654, 21657, 21665, 21668, 21669, 21674, 21761, 21762, 21764, 21765, 21766, 21769, 21776, 21777, 21778, 21780, + 21781, 21782, 21785, 21786, 21793, 21796, 21797, 21798, 21801, 21824, 21825, 21826, 21828, 21829, 21830, 21832, + 21833, 21840, 21841, 21842, 21844, 21845, 21846, 21848, 21849, 21850, 21856, 21857, 21860, 21861, 21862, 21864, + 21865, 21866, 21889, 21892, 21893, 21897, 21898, 21904, 21905, 21908, 21909, 21910, 21912, 21913, 21921, 21924, + 21925, 21926, 21929, 22016, 22017, 22018, 22020, 22022, 22024, 22025, 22033, 22036, 22037, 22040, 22041, 22048, + 22049, 22050, 22052, 22053, 22054, 22056, 22057, 22081, 22085, 22086, 22088, 22089, 22090, 22096, 22097, 22098, + 22100, 22101, 22102, 22104, 22105, 22106, 22113, 22116, 22117, 22121, 22146, 22149, 22150, 22152, 22153, 22154, + 22161, 22165, 22170, 22178, 22181, 22182, 22184, 22185, 22532, 22533, 22534, 22537, 22544, 22549, 22552, 22561, + 22570, 22597, 22600, 22602, 22609, 22612, 22613, 22614, 22616, 22617, 22624, 22626, 22628, 22629, 22658, 22665, + 22672, 22674, 22677, 22680, 22689, 22697, 22785, 22786, 22789, 22794, 22801, 22804, 22805, 22806, 22809, 22821, + 22849, 22852, 22853, 22854, 22857, 22864, 22865, 22866, 22868, 22869, 22870, 22872, 22873, 22874, 22881, 22884, + 22885, 22886, 22889, 22913, 22917, 22921, 22929, 22932, 22933, 22934, 22936, 22937, 22949, 23044, 23048, 23061, + 23066, 23072, 23077, 23078, 23081, 23109, 23112, 23113, 23121, 23125, 23126, 23128, 23129, 23138, 23141, 23144, + 23146, 23169, 23178, 23186, 23189, 23190, 23192, 23194, 23201, 24581, 24596, 24598, 24601, 24613, 24644, 24656, + 24661, 24662, 24664, 24666, 24673, 24676, 24678, 24681, 24705, 24726, 24741, 24833, 24836, 24838, 24841, 24850, + 24853, 24865, 24866, 24870, 24873, 24901, 24905, 24913, 24917, 24918, 24921, 24933, 24934, 24938, 24964, 24970, + 24978, 24981, 24993, 24998, 25001, 25105, 25110, 25113, 25152, 25153, 25158, 25173, 25174, 25176, 25184, 25221, + 25233, 25238, 25253, 25617, 25618, 25621, 25622, 25626, 25633, 25638, 25641, 25664, 25666, 25669, 25672, 25674, + 25681, 25684, 25685, 25686, 25689, 25690, 25696, 25698, 25701, 25732, 25733, 25737, 25744, 25746, 25748, 25749, + 25750, 25752, 25754, 25761, 25764, 25769, 25861, 25864, 25866, 25873, 25877, 25878, 25881, 25924, 25925, 25926, + 25929, 25936, 25937, 25940, 25941, 25942, 25945, 25953, 25956, 25957, 25958, 25961, 25990, 25993, 25994, 26001, + 26005, 26006, 26009, 26010, 26018, 26021, 26022, 26024, 26114, 26121, 26133, 26144, 26150, 26152, 26153, 26176, + 26181, 26184, 26186, 26193, 26196, 26197, 26198, 26200, 26202, 26208, 26213, 26216, 26240, 26242, 26245, 26250, + 26260, 26262, 26264, 26265, 26272, 26276, 26278, 26282, 26646, 26649, 26661, 26689, 26706, 26709, 26714, 26721, + 26729, 26757, 26769, 26776, 26790, 26881, 26884, 26896, 26901, 26913, 26916, 26918, 26921, 26944, 26945, 26949, + 26950, 26952, 26961, 26964, 26965, 26966, 26969, 26976, 26981, 26986, 27010, 27012, 27018, 27029, 27041, 27044, + 27045, 27049, 27153, 27158, 27160, 27201, 27204, 27209, 27216, 27221, 27224, 27226, 27236, 27237, 27241, 27270, + 27284, 27288, 27290, 27302, 32768, 32770, 32776, 32778, 32800, 32802, 32808, 32810, 32837, 32848, 32849, 32852, + 32854, 32857, 32869, 32896, 32898, 32904, 32906, 32917, 32928, 32930, 32936, 32938, 33029, 33041, 33044, 33046, + 33049, 33061, 33089, 33092, 33097, 33104, 33106, 33109, 33110, 33112, 33113, 33124, 33126, 33129, 33157, 33161, + 33172, 33174, 33177, 33189, 33280, 33282, 33288, 33290, 33301, 33312, 33314, 33320, 33322, 33361, 33364, 33369, + 33381, 33408, 33410, 33416, 33418, 33429, 33440, 33442, 33448, 33450, 33812, 33817, 33857, 33860, 33873, 33877, + 33882, 33889, 33892, 33897, 33940, 33945, 34049, 34057, 34066, 34069, 34074, 34086, 34089, 34112, 34113, 34117, + 34120, 34129, 34132, 34133, 34134, 34137, 34138, 34149, 34150, 34152, 34154, 34177, 34180, 34182, 34185, 34192, + 34194, 34197, 34200, 34214, 34321, 34326, 34329, 34341, 34369, 34372, 34377, 34378, 34384, 34389, 34393, 34394, + 34401, 34406, 34410, 34437, 34449, 34458, 34468, 34816, 34818, 34824, 34826, 34837, 34848, 34850, 34856, 34858, + 34881, 34885, 34897, 34900, 34905, 34917, 34921, 34944, 34946, 34952, 34954, 34965, 34976, 34978, 34984, 34986, + 35077, 35078, 35089, 35092, 35094, 35109, 35137, 35140, 35142, 35145, 35152, 35154, 35157, 35162, 35169, 35172, + 35205, 35222, 35225, 35237, 35328, 35330, 35336, 35338, 35349, 35360, 35362, 35368, 35370, 35397, 35409, 35412, + 35414, 35456, 35458, 35464, 35466, 35477, 35488, 35490, 35496, 35498, 36869, 36881, 36886, 36888, 36889, 36901, + 36929, 36934, 36937, 36949, 36952, 36954, 36969, 36970, 36997, 37009, 37012, 37014, 37017, 37029, 37121, 37124, + 37126, 37129, 37136, 37141, 37144, 37146, 37153, 37156, 37158, 37161, 37184, 37189, 37200, 37201, 37204, 37205, + 37206, 37209, 37218, 37221, 37252, 37254, 37266, 37269, 37272, 37281, 37284, 37286, 37289, 37381, 37393, 37396, + 37401, 37413, 37444, 37446, 37449, 37456, 37458, 37461, 37464, 37478, 37481, 37509, 37524, 37526, 37545, 37889, + 37892, 37894, 37904, 37909, 37912, 37926, 37952, 37962, 37969, 37972, 37973, 37974, 37976, 37977, 37984, 37985, + 37986, 37989, 38020, 38022, 38034, 38036, 38037, 38040, 38049, 38057, 38144, 38149, 38152, 38154, 38160, 38161, + 38164, 38165, 38166, 38169, 38177, 38181, 38185, 38186, 38209, 38212, 38213, 38214, 38217, 38224, 38225, 38226, + 38228, 38229, 38230, 38232, 38233, 38234, 38241, 38244, 38245, 38246, 38249, 38273, 38277, 38280, 38289, 38290, + 38292, 38293, 38294, 38297, 38298, 38304, 38306, 38309, 38312, 38314, 38401, 38404, 38416, 38421, 38425, 38432, + 38438, 38441, 38469, 38472, 38473, 38481, 38482, 38485, 38486, 38489, 38501, 38504, 38530, 38532, 38537, 38538, + 38546, 38548, 38549, 38564, 38566, 38569, 38917, 38934, 38937, 38949, 38977, 38982, 38992, 38994, 38997, 38998, + 39002, 39012, 39013, 39045, 39057, 39062, 39065, 39077, 39172, 39174, 39177, 39184, 39186, 39189, 39192, 39194, + 39200, 39201, 39204, 39206, 39232, 39234, 39237, 39240, 39242, 39249, 39252, 39253, 39254, 39257, 39266, 39269, + 39270, 39274, 39297, 39300, 39312, 39314, 39317, 39322, 39329, 39334, 39429, 39445, 39461, 39492, 39494, 39497, + 39504, 39509, 39512, 39521, 39557, 39569, 39572, 39573, 39574, 40960, 40962, 40968, 40970, 40981, 40992, 40994, + 41000, 41002, 41029, 41041, 41044, 41046, 41049, 41088, 41090, 41096, 41098, 41109, 41120, 41122, 41128, 41130, + 41221, 41225, 41233, 41236, 41238, 41241, 41242, 41286, 41289, 41297, 41301, 41304, 41306, 41313, 41316, 41349, + 41360, 41362, 41366, 41369, 41474, 41480, 41482, 41488, 41497, 41506, 41512, 41514, 41541, 41553, 41558, 41561, + 41573, 41600, 41602, 41608, 41610, 41621, 41632, 41634, 41640, 41642, 42009, 42021, 42049, 42052, 42064, 42068, + 42069, 42072, 42074, 42081, 42085, 42086, 42088, 42089, 42117, 42246, 42249, 42256, 42258, 42261, 42264, 42278, + 42281, 42306, 42309, 42321, 42324, 42325, 42326, 42329, 42341, 42346, 42369, 42372, 42373, 42374, 42377, 42386, + 42389, 42392, 42501, 42513, 42518, 42522, 42529, 42533, 42564, 42566, 42570, 42578, 42581, 42582, 42584, 42592, + 42594, 42630, 42640, 42645, 42646, 42649, 42657, 42660, 42662, 43008, 43010, 43016, 43018, 43040, 43042, 43048, + 43050, 43089, 43092, 43094, 43097, 43136, 43138, 43144, 43146, 43157, 43168, 43170, 43176, 43178, 43269, 43284, + 43289, 43297, 43301, 43329, 43344, 43349, 43354, 43361, 43366, 43369, 43408, 43414, 43520, 43522, 43528, 43530, + 43552, 43554, 43560, 43562, 43601, 43604, 43606, 43648, 43650, 43656, 43658, 43669, 43680, 43682, 43688, 43690, }; static const uint16_t kgrid_2bit_1024[1024] = { 0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70, @@ -10169,7 +10198,7 @@ void iq2xs_init_impl(enum ggml_type type) { const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2; const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 : type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 : - type == GGML_TYPE_IQ1_S ? kgrid_1bit_512 : kgrid_2bit_1024; + type == GGML_TYPE_IQ1_S ? kgrid_1bit_2048 : kgrid_2bit_1024; uint64_t * kgrid_q2xs; int * kmap_q2xs; uint16_t * kneighbors_q2xs; @@ -11408,12 +11437,70 @@ static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const u return grid_index; } +static int iq1_find_best_neighbour2(const uint16_t * restrict neighbours, const uint64_t * restrict grid, + const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L, int ngrid) { + int num_neighbors = neighbours[0]; + GGML_ASSERT(num_neighbors > 0); + float best_score = FLT_MAX; + int grid_index = -1; + for (int j = 1; j <= num_neighbors; ++j) { + const int8_t * pg = (const int8_t *)(grid + neighbours[j]); + float d2 = 0; + for (int i = 0; i < 8; ++i) { + float q = (pg[i] - 3)/2; + float w = weight[i]; + float diff = scale*q - xval[i]; + d2 += w*diff*diff; + } + if (d2 < best_score) { + best_score = d2; + grid_index = neighbours[j]; + } + } + if (grid_index < 0) { + for (int i = 0; i < ngrid; ++i) { + const int8_t * grid_i = (const int8_t *)(grid + i); + float d2 = 0; + for (int j = 0; j < 8; ++j) { + float w = weight[j]; + float q = (grid_i[j] - 3)/2; + float diff = scale*q - xval[i]; + d2 += w*diff*diff; + } + if (d2 < best_score) { + best_score = d2; + grid_index = i; + } + } + } + if (grid_index < 0) { + printf("Oops, did not find grid point\n"); + printf("Have %d neighbours\n", num_neighbors); + for (int j = 1; j <= num_neighbors; ++j) { + const int8_t * pg = (const int8_t *)(grid + neighbours[j]); + float sumqx = 0, sumq2 = 0; + for (int i = 0; i < 8; ++i) { + float q = (pg[i] - 3)/2; + float w = weight[i]; + sumqx += w*q*xval[i]; + sumq2 += w*q*q; + } + printf(" neighbour %d: sumqx = %g sumq2 = %g\n", j, (double)sumqx, (double)sumq2); + } + } + GGML_ASSERT(grid_index >= 0); + const int8_t * pg = (const int8_t *)(grid + grid_index); + for (int i = 0; i < 8; ++i) L[i] = (pg[i] - 1)/2; + return grid_index; +} + static int iq1_sort_helper(const void * left, const void * right) { const float * l = left; const float * r = right; return *l < *r ? -1 : *l > *r ? 1 : 0; } +#define IQ1S_BLOCK_SIZE 32 static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) { const int gindex = iq2_data_index(GGML_TYPE_IQ1_S); @@ -11432,37 +11519,37 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy block_iq1_s * y = vy; - float scales[QK_K/8]; - float weight[8]; - int8_t L[8]; - float sumx[9]; - float sumw[9]; - float pairs[16]; + float scales[QK_K/IQ1S_BLOCK_SIZE]; + float weight[IQ1S_BLOCK_SIZE]; + int8_t L[IQ1S_BLOCK_SIZE]; + float sumx[IQ1S_BLOCK_SIZE+1]; + float sumw[IQ1S_BLOCK_SIZE+1]; + float pairs[2*IQ1S_BLOCK_SIZE]; int * idx = (int *)(pairs + 1); - uint8_t hbit[QK_K/8]; + uint16_t index[IQ1S_BLOCK_SIZE/8]; for (int ibl = 0; ibl < nbl; ++ibl) { y[ibl].d = GGML_FP32_TO_FP16(0.f); memset(y[ibl].qs, 0, QK_K/8); - memset(y[ibl].scales, 0, QK_K/16); + memset(y[ibl].qh, 0, QK_K/16); float max_scale = 0; const float * xbl = x + QK_K*ibl; float sumx2 = 0; for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i]; - float sigma2 = sumx2/QK_K; + float sigma2 = 2*sumx2/QK_K; - for (int ib = 0; ib < QK_K/8; ++ib) { - const float * xb = xbl + 8*ib; - const float * qw = quant_weights + QK_K*ibl + 8*ib; - for (int i = 0; i < 8; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); + for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) { + const float * xb = xbl + IQ1S_BLOCK_SIZE*ib; + const float * qw = quant_weights + QK_K*ibl + IQ1S_BLOCK_SIZE*ib; + for (int i = 0; i < IQ1S_BLOCK_SIZE; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); float max = fabsf(xb[0]); - for (int i = 1; i < 8; ++i) max = MAX(max, fabsf(xb[i])); + for (int i = 1; i < IQ1S_BLOCK_SIZE; ++i) max = MAX(max, fabsf(xb[i])); if (!max) { scales[ib] = 0; - memset(L, 1, 8); + memset(L, 1, IQ1S_BLOCK_SIZE); continue; } // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem. @@ -11471,14 +11558,14 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale // for each possible and score for each split. - for (int j = 0; j < 8; ++j) { + for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) { pairs[2*j] = xb[j]; idx[2*j] = j; } - qsort(pairs, 8, 2*sizeof(float), iq1_sort_helper); + qsort(pairs, IQ1S_BLOCK_SIZE, 2*sizeof(float), iq1_sort_helper); { sumx[0] = sumw[0] = 0; - for (int j = 0; j < 8; ++j) { + for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) { int i = idx[2*j]; sumx[j+1] = sumx[j] + weight[i]*xb[i]; sumw[j+1] = sumw[j] + weight[i]; @@ -11486,10 +11573,10 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy } float best_score = 0, scale = max; int besti1 = 0, besti2 = 0; - for (int i1 = 0; i1 <= 8; ++i1) { - for (int i2 = i1; i2 <= 8; ++i2) { - float sumqx = -(sumx[i1] - sumx[0]) + (sumx[8] - sumx[i2]); - float sumq2 = (sumw[i1] - sumw[0]) + (sumw[8] - sumw[i2]); + for (int i1 = 0; i1 <= IQ1S_BLOCK_SIZE; ++i1) { + for (int i2 = i1; i2 <= IQ1S_BLOCK_SIZE; ++i2) { + float sumqx = -(sumx[i1] - sumx[0]) + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2]); + float sumq2 = (sumw[i1] - sumw[0]) + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2]); if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) { scale = sumqx/sumq2; best_score = scale*sumqx; besti1 = i1; besti2 = i2; @@ -11498,23 +11585,43 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy } for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0; for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1; - for (int j = besti2; j < 8; ++j) L[idx[2*j]] = 2; + for (int j = besti2; j < IQ1S_BLOCK_SIZE; ++j) L[idx[2*j]] = 2; if (scale < 0) { - for (int j = 0; j < 8; ++j) L[j] = 2 - L[j]; + for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) L[j] = 2 - L[j]; scale = -scale; } - // Now we check if the solution found above corresponds to a grid point and, if not, use a neighbouring - // grid point that minimizes SSD. - uint16_t u = 0; - for (int j = 0; j < 8; ++j) u |= (L[j] << 2*j); - int grid_index = kmap_q2xs[u]; - if (grid_index < 0) { - const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; - grid_index = iq1_find_best_neighbour(neighbours, kgrid_q2xs, xb, weight, &scale, L, NGRID_IQ2XXS); - GGML_ASSERT(grid_index >= 0); - } - y[ibl].qs[ib] = grid_index & 255; - hbit[ib] = grid_index >> 8; + bool all_on_grid = true; + for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) { + uint16_t u = 0; + for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j); + int grid_index = kmap_q2xs[u]; + if (grid_index < 0) { + all_on_grid = false; + const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; + grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, L + 8*k, NGRID_IQ1S); + GGML_ASSERT(grid_index >= 0); + } + index[k] = grid_index; + } + if (!all_on_grid) { + float sumqx = 0, sumq2 = 0; + for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) { + const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]); + for (int j = 0; j < 8; ++j) { + float w = weight[8*k + j]; + float q = (pg[j] - 3)/2; + sumqx += w*q*xb[8*k+j]; + sumq2 += w*q*q; + } + } + if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2; + } + uint16_t h = 0; + for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) { + y[ibl].qs[(IQ1S_BLOCK_SIZE/8)*ib + k] = index[k] & 255; + h |= (index[k] >> 8) << 3*k; + } + y[ibl].qh[ib] = h; GGML_ASSERT(scale >= 0); scales[ib] = scale; max_scale = MAX(max_scale, scale); @@ -11525,14 +11632,13 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy continue; } - float d = max_scale/15; - y[ibl].d = GGML_FP32_TO_FP16(d*1.085f); // 1.085f is another fudge factor. Don't ask me why it is needed. + float d = max_scale/31; + y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.085f is another fudge factor. Don't ask me why it is needed. float id = 1/d; - for (int ib = 0; ib < QK_K/8; ++ib) { + for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) { int l = nearest_int(0.5f*(id*scales[ib]-1)); - l = MAX(0, MIN(7, l)); - if (hbit[ib]) l |= 8; - y[ibl].scales[ib/2] |= (l << 4*(ib%2)); + l = MAX(0, MIN(15, l)); + y[ibl].qh[ib] |= (l << 12); } } } diff --git a/ggml-quants.h b/ggml-quants.h index 47dd52856422a..74aabf4156385 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -217,8 +217,8 @@ static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N typedef struct { ggml_fp16_t d; - uint8_t qs[QK_K/8]; - uint8_t scales[QK_K/16]; + uint8_t qs[QK_K/8]; + uint16_t qh[QK_K/32]; } block_iq1_s; static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding"); From ee35600b9061b1ea0c4ea87fce6844297632b2a8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 11 Mar 2024 09:56:47 +0200 Subject: [PATCH 35/39] llama : fix F16/F32 downcast + improve names (#5980) --- llama.cpp | 67 +++++++++++++++++++++++++++++-------------------------- llama.h | 2 +- 2 files changed, 36 insertions(+), 33 deletions(-) diff --git a/llama.cpp b/llama.cpp index 249442166d24e..110e509ccd5b6 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11636,7 +11636,7 @@ static void llama_tensor_dequantize_internal( workers.clear(); } -static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) { +static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) { const std::string name = ggml_get_name(tensor); // TODO: avoid hardcoded tensor names - use the TN_* constants @@ -11951,40 +11951,40 @@ static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const flo } static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { - ggml_type quantized_type; + ggml_type default_type; llama_ftype ftype = params->ftype; switch (params->ftype) { - case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break; - case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break; - case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break; - case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break; - case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break; - case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break; - case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break; + case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break; + case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break; + case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break; + case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break; + case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break; + case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break; + case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break; // K-quants case LLAMA_FTYPE_MOSTLY_Q2_K_S: - case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break; - case LLAMA_FTYPE_MOSTLY_IQ3_XS: quantized_type = GGML_TYPE_IQ3_S; break; + case LLAMA_FTYPE_MOSTLY_Q2_K: default_type = GGML_TYPE_Q2_K; break; + case LLAMA_FTYPE_MOSTLY_IQ3_XS: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_Q3_K_S: case LLAMA_FTYPE_MOSTLY_Q3_K_M: - case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break; + case LLAMA_FTYPE_MOSTLY_Q3_K_L: default_type = GGML_TYPE_Q3_K; break; case LLAMA_FTYPE_MOSTLY_Q4_K_S: - case LLAMA_FTYPE_MOSTLY_Q4_K_M: quantized_type = GGML_TYPE_Q4_K; break; + case LLAMA_FTYPE_MOSTLY_Q4_K_M: default_type = GGML_TYPE_Q4_K; break; case LLAMA_FTYPE_MOSTLY_Q5_K_S: - case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break; - case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break; - case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break; - case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break; - case LLAMA_FTYPE_MOSTLY_IQ2_S: quantized_type = GGML_TYPE_IQ2_XS; break; - case LLAMA_FTYPE_MOSTLY_IQ2_M: quantized_type = GGML_TYPE_IQ2_S; break; - case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break; - case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break; - case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break; - case LLAMA_FTYPE_MOSTLY_IQ4_XS: quantized_type = GGML_TYPE_IQ4_XS; break; - case LLAMA_FTYPE_MOSTLY_IQ3_S: quantized_type = GGML_TYPE_IQ3_S; break; - case LLAMA_FTYPE_MOSTLY_IQ3_M: quantized_type = GGML_TYPE_IQ3_S; break; + case LLAMA_FTYPE_MOSTLY_Q5_K_M: default_type = GGML_TYPE_Q5_K; break; + case LLAMA_FTYPE_MOSTLY_Q6_K: default_type = GGML_TYPE_Q6_K; break; + case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break; + case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = GGML_TYPE_IQ2_XS; break; + case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = GGML_TYPE_IQ2_XS; break; + case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break; + case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break; + case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break; + case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break; + case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; + case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; + case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break; default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } @@ -12125,23 +12125,26 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // do not quantize Mamba's small yet 2D weights // NOTE: can't use LLM_TN here because the layer number is not known quantize &= name.find("ssm_conv1d.weight") == std::string::npos; - quantize &= name.find("ssm_x.weight") == std::string::npos; - quantize &= name.find("ssm_dt.weight") == std::string::npos; + quantize &= name.find("ssm_x.weight") == std::string::npos; + quantize &= name.find("ssm_dt.weight") == std::string::npos; enum ggml_type new_type; void * new_data; size_t new_size; if (quantize) { - new_type = quantized_type; - if (!params->pure) { - new_type = get_k_quant_type(qs, new_type, tensor, ftype); + new_type = default_type; + + // get more optimal quantization type based on the tensor shape, layer, etc. + if (!params->pure && ggml_is_quantized(default_type)) { + new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); } // If we've decided to quantize to the same type the tensor is already // in then there's nothing to do. quantize = tensor->type != new_type; } + if (!quantize) { new_type = tensor->type; new_data = tensor->data; @@ -12187,7 +12190,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s f32_data = (float *) f32_conv_buf.data(); } - LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type)); + LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type)); fflush(stdout); if (work.size() < nelements * 4) { @@ -12235,7 +12238,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); if (qs.n_fallback > 0) { - LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n", + LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n", __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback); } } diff --git a/llama.h b/llama.h index c8e05aadd9913..ccf65ca4e87a5 100644 --- a/llama.h +++ b/llama.h @@ -278,7 +278,7 @@ extern "C" { bool allow_requantize; // allow quantizing non-f32/f16 tensors bool quantize_output_tensor; // quantize output.weight bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored - bool pure; // disable k-quant mixtures and quantize all tensors to the same type + bool pure; // quantize all tensors to the default type void * imatrix; // pointer to importance matrix data } llama_model_quantize_params; From ecab1c75de68de7c41c254e2ae170d3b07bee6d4 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Mon, 11 Mar 2024 10:00:08 +0200 Subject: [PATCH 36/39] cmake : fix subdir for `LLAMA_METAL_EMBED_LIBRARY` (#5985) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 00a26391a6626..60b592bf53de7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -208,7 +208,7 @@ if (LLAMA_METAL) enable_language(ASM) add_compile_definitions(GGML_METAL_EMBED_LIBRARY) - set(METALLIB_SOURCE "${CMAKE_SOURCE_DIR}/ggml-metal.metal") + set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal") file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated") set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s") From 332bdfd7980718abf664bfa5460f2288a3314984 Mon Sep 17 00:00:00 2001 From: Minsoo Cheong <54794500+mscheong01@users.noreply.github.com> Date: Mon, 11 Mar 2024 17:09:32 +0900 Subject: [PATCH 37/39] server : maintain chat completion id for streaming responses (#5988) * server: maintain chat completion id for streaming responses * Update examples/server/utils.hpp * Update examples/server/utils.hpp --------- Co-authored-by: Georgi Gerganov --- examples/server/server.cpp | 7 ++++--- examples/server/utils.hpp | 12 ++++++------ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c7d3ed01b6347..3951507aa5fbe 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3195,11 +3195,12 @@ int main(int argc, char ** argv) { ctx_server.queue_results.add_waiting_task_id(id_task); ctx_server.request_completion(id_task, -1, data, false, false); + const auto completion_id = gen_chatcmplid(); if (!json_value(data, "stream", false)) { server_task_result result = ctx_server.queue_results.recv(id_task); if (!result.error && result.stop) { - json result_oai = format_final_response_oaicompat(data, result.data); + json result_oai = format_final_response_oaicompat(data, result.data, completion_id); res.set_content(result_oai.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8"); } else { @@ -3208,11 +3209,11 @@ int main(int argc, char ** argv) { } ctx_server.queue_results.remove_waiting_task_id(id_task); } else { - const auto chunked_content_provider = [id_task, &ctx_server](size_t, httplib::DataSink & sink) { + const auto chunked_content_provider = [id_task, &ctx_server, completion_id](size_t, httplib::DataSink & sink) { while (true) { server_task_result result = ctx_server.queue_results.recv(id_task); if (!result.error) { - std::vector result_array = format_partial_response_oaicompat(result.data); + std::vector result_array = format_partial_response_oaicompat(result.data, completion_id); for (auto it = result_array.begin(); it != result_array.end(); ++it) { if (!it->empty()) { diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index df0a27782e646..f27af81e99ef5 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -378,7 +378,7 @@ static json oaicompat_completion_params_parse( return llama_params; } -static json format_final_response_oaicompat(const json & request, json result, bool streaming = false) { +static json format_final_response_oaicompat(const json & request, json result, const std::string & completion_id, bool streaming = false) { bool stopped_word = result.count("stopped_word") != 0; bool stopped_eos = json_value(result, "stopped_eos", false); int num_tokens_predicted = json_value(result, "tokens_predicted", 0); @@ -412,7 +412,7 @@ static json format_final_response_oaicompat(const json & request, json result, b {"prompt_tokens", num_prompt_tokens}, {"total_tokens", num_tokens_predicted + num_prompt_tokens} }}, - {"id", gen_chatcmplid()} + {"id", completion_id} }; if (server_verbose) { @@ -427,7 +427,7 @@ static json format_final_response_oaicompat(const json & request, json result, b } // return value is vector as there is one case where we might need to generate two responses -static std::vector format_partial_response_oaicompat(json result) { +static std::vector format_partial_response_oaicompat(json result, const std::string & completion_id) { if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { return std::vector({result}); } @@ -471,7 +471,7 @@ static std::vector format_partial_response_oaicompat(json result) { {"role", "assistant"} }}}})}, {"created", t}, - {"id", gen_chatcmplid()}, + {"id", completion_id}, {"model", modelname}, {"object", "chat.completion.chunk"}}; @@ -482,7 +482,7 @@ static std::vector format_partial_response_oaicompat(json result) { {"content", content}}} }})}, {"created", t}, - {"id", gen_chatcmplid()}, + {"id", completion_id}, {"model", modelname}, {"object", "chat.completion.chunk"}}; @@ -509,7 +509,7 @@ static std::vector format_partial_response_oaicompat(json result) { json ret = json { {"choices", choices}, {"created", t}, - {"id", gen_chatcmplid()}, + {"id", completion_id}, {"model", modelname}, {"object", "chat.completion.chunk"} }; From 3202361c5b1ba15e695b31209567ef42c22c5c32 Mon Sep 17 00:00:00 2001 From: Michael Podvitskiy Date: Mon, 11 Mar 2024 10:28:51 +0100 Subject: [PATCH 38/39] ggml, ci : Windows ARM runner and build fixes (#5979) * windows arm ci * fix `error C2078: too many initializers` with ggml_vld1q_u32 macro for MSVC ARM64 * fix `warning C4146: unary minus operator applied to unsigned type, result still unsigned` * fix `error C2065: '__fp16': undeclared identifier` --- .github/workflows/build.yml | 4 +++- ggml-impl.h | 8 ++++++-- ggml-quants.c | 16 ++++++++-------- ggml.c | 4 ++-- llama.cpp | 4 ++-- 5 files changed, 21 insertions(+), 15 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 9144f926628a5..d39cd6bc338e0 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -425,6 +425,8 @@ jobs: defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON' - build: 'vulkan' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON' + - build: 'arm64' + defines: '-A ARM64 -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' steps: - name: Clone @@ -520,7 +522,7 @@ jobs: - name: Test id: cmake_test # not all machines have native AVX-512 - if: ${{ matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} + if: ${{ matrix.build != 'arm64' && matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} run: | cd build ctest -L main -C Release --verbose --timeout 900 diff --git a/ggml-impl.h b/ggml-impl.h index c5637e4d45d8c..e68b728775c41 100644 --- a/ggml-impl.h +++ b/ggml-impl.h @@ -53,26 +53,30 @@ extern "C" { // #include +typedef __fp16 ggml_fp16_internal_t; + #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - __fp16 tmp; + ggml_fp16_internal_t tmp; memcpy(&tmp, &h, sizeof(ggml_fp16_t)); return (float)tmp; } static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { ggml_fp16_t res; - __fp16 tmp = f; + ggml_fp16_internal_t tmp = f; memcpy(&res, &tmp, sizeof(ggml_fp16_t)); return res; } #else +typedef uint16_t ggml_fp16_internal_t; + #ifdef __wasm_simd128__ #include #else diff --git a/ggml-quants.c b/ggml-quants.c index f9a3d9fd229e1..86b0764cbae18 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -9374,15 +9374,15 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * const uint8x16_t idx_l = vld1q_u8(qs); qs += 16; idx.vec_index = vorrq_u16(vmovl_u8(vget_low_u8 (idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+0]), hshift), m256)); - const uint32x4_t aux32x4_0 = {iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]], - iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]}; - const uint32x4_t aux32x4_1 = {iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]], - iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]}; + const uint32x4_t aux32x4_0 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]], + iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]); + const uint32x4_t aux32x4_1 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]], + iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]); idx.vec_index = vorrq_u16(vmovl_u8(vget_high_u8(idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+1]), hshift), m256)); - const uint32x4_t aux32x4_2 = {iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]], - iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]}; - const uint32x4_t aux32x4_3 = {iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]], - iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]}; + const uint32x4_t aux32x4_2 = ggml_vld1q_u32(iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]], + iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]); + const uint32x4_t aux32x4_3 = ggml_vld1q_u32(iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]], + iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]); vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16))); diff --git a/ggml.c b/ggml.c index 80efa6f2ac900..9a7bd1d8c527b 100644 --- a/ggml.c +++ b/ggml.c @@ -857,7 +857,7 @@ inline static float vaddvq_f32(float32x4_t v) { #define GGML_F16x8 float16x8_t #define GGML_F16x8_ZERO vdupq_n_f16(0.0f) #define GGML_F16x8_SET1(x) vdupq_n_f16(x) - #define GGML_F16x8_LOAD(x) vld1q_f16((const __fp16 *)(x)) + #define GGML_F16x8_LOAD(x) vld1q_f16((const ggml_fp16_internal_t *)(x)) #define GGML_F16x8_STORE vst1q_f16 #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) #define GGML_F16x8_ADD vaddq_f16 @@ -900,7 +900,7 @@ inline static float vaddvq_f32(float32x4_t v) { #define GGML_F32Cx4 float32x4_t #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x) - #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const __fp16 *)(x))) + #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x))) #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) #define GGML_F32Cx4_ADD vaddq_f32 diff --git a/llama.cpp b/llama.cpp index 110e509ccd5b6..f61ea791b5be2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -13980,7 +13980,7 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token } else if (llama_is_user_defined_token(model->vocab, token)) { std::string result = model->vocab.id_to_token[token].text; if (length < (int) result.length()) { - return -result.length(); + return -(int) result.length(); } memcpy(buf, result.c_str(), result.length()); return result.length(); @@ -14015,7 +14015,7 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token } else if (llama_is_user_defined_token(model->vocab, token)) { std::string result = model->vocab.id_to_token[token].text; if (length < (int) result.length()) { - return -result.length(); + return -(int) result.length(); } memcpy(buf, result.c_str(), result.length()); return result.length(); From caa106d4e05a0ab94225c220b81f9e2cd522339b Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 11 Mar 2024 10:56:41 +0100 Subject: [PATCH 39/39] Server: format error to json (#5961) * server: format error to json * server: do not crash on grammar error * fix api key test case * revert limit max n_predict * small fix * correct coding style * update completion.js * launch_slot_with_task * update docs * update_slots * update webui * update readme --- examples/server/README.md | 46 +- examples/server/completion.js.hpp | 469 +++++++++--------- examples/server/public/completion.js | 20 +- examples/server/server.cpp | 159 +++--- examples/server/tests/features/steps/steps.py | 2 +- examples/server/utils.hpp | 51 ++ 6 files changed, 430 insertions(+), 317 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 23606b32a2c81..3767390511e5f 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -556,9 +556,51 @@ Run with bash: bash chat.sh ``` -### API like OAI +### OAI-like API -The HTTP server supports OAI-like API +The HTTP server supports OAI-like API: https://github.com/openai/openai-openapi + +### API errors + +Server returns error in the same format as OAI: https://github.com/openai/openai-openapi + +Example of an error: + +```json +{ + "error": { + "code": 401, + "message": "Invalid API Key", + "type": "authentication_error" + } +} +``` + +Apart from error types supported by OAI, we also have custom types that are specific to functionalities of llama.cpp: + +**When /metrics or /slots endpoint is disabled** + +```json +{ + "error": { + "code": 501, + "message": "This server does not support metrics endpoint.", + "type": "not_supported_error" + } +} +``` + +**When the server receives invalid grammar via */completions endpoint** + +```json +{ + "error": { + "code": 400, + "message": "Failed to parse grammar", + "type": "invalid_request_error" + } +} +``` ### Extending or building alternative Web Front End diff --git a/examples/server/completion.js.hpp b/examples/server/completion.js.hpp index f5e696e17edfe..10734091eeaf6 100644 --- a/examples/server/completion.js.hpp +++ b/examples/server/completion.js.hpp @@ -231,255 +231,256 @@ unsigned char completion_js[] = { 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, - 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x3d, - 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, - 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, - 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6e, 0x74, - 0x65, 0x6e, 0x74, 0x2e, 0x69, 0x6e, 0x63, 0x6c, 0x75, 0x64, 0x65, 0x73, - 0x28, 0x27, 0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61, 0x76, 0x61, - 0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x29, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, + 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, + 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x3d, 0x20, 0x4a, 0x53, 0x4f, + 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x72, 0x65, 0x73, 0x75, + 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, + 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, + 0x65, 0x2e, 0x69, 0x6e, 0x63, 0x6c, 0x75, 0x64, 0x65, 0x73, 0x28, 0x27, + 0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61, 0x76, 0x61, 0x69, 0x6c, + 0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x54, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x61, 0x6e, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x74, 0x6f, 0x20, 0x62, 0x65, 0x20, 0x63, 0x61, 0x75, 0x67, 0x68, 0x74, 0x20, 0x62, 0x79, 0x20, 0x75, 0x70, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x65, 0x72, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, - 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x27, - 0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61, 0x76, 0x61, 0x69, 0x6c, - 0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, - 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, - 0x6c, 0x65, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c, - 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f, - 0x72, 0x3a, 0x20, 0x24, 0x7b, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, - 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, - 0x74, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, + 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72, 0x6f, 0x72, + 0x28, 0x27, 0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61, 0x76, 0x61, + 0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x65, 0x72, 0x72, + 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, + 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x5b, 0x24, 0x7b, 0x72, + 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, + 0x63, 0x6f, 0x64, 0x65, 0x7d, 0x20, 0x2d, 0x20, 0x24, 0x7b, 0x72, 0x65, + 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x74, + 0x79, 0x70, 0x65, 0x7d, 0x5d, 0x3a, 0x20, 0x24, 0x7b, 0x72, 0x65, 0x73, + 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x6d, 0x65, + 0x73, 0x73, 0x61, 0x67, 0x65, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x65, 0x29, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, + 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c, 0x61, 0x6d, + 0x61, 0x2e, 0x63, 0x70, 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, + 0x24, 0x7b, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, + 0x6f, 0x72, 0x7d, 0x60, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, - 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, - 0x6f, 0x72, 0x20, 0x3d, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61, - 0x72, 0x73, 0x65, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, - 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, - 0x6c, 0x65, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c, - 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f, - 0x72, 0x3a, 0x20, 0x24, 0x7b, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, - 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, - 0x74, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, - 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x2e, 0x6e, 0x61, 0x6d, 0x65, - 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x41, 0x62, 0x6f, 0x72, 0x74, 0x45, - 0x72, 0x72, 0x6f, 0x72, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x65, - 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x20, - 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22, 0x2c, 0x20, 0x65, 0x29, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x7d, - 0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, - 0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28, 0x29, 0x3b, - 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, - 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20, 0x6c, - 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x20, 0x61, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x20, 0x74, 0x61, - 0x72, 0x67, 0x65, 0x74, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x79, 0x6f, - 0x75, 0x20, 0x63, 0x61, 0x6e, 0x20, 0x73, 0x75, 0x62, 0x73, 0x63, 0x72, - 0x69, 0x62, 0x65, 0x20, 0x74, 0x6f, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, - 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f, - 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, - 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, - 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x7d, 0x20, 0x66, - 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, - 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f, 0x0a, - 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x63, 0x6f, 0x6e, 0x6e, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, - 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28, - 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x2e, 0x61, 0x64, 0x64, 0x45, 0x76, - 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72, 0x28, - 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20, 0x28, - 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, - 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, - 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63, - 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x2e, - 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70, - 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, - 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, - 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, - 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, - 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20, - 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, - 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, - 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28, 0x29, - 0x3b, 0x0a, 0x20, 0x20, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, - 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, - 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, - 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, - 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, - 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, - 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, - 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, - 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, - 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, - 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20, + 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x20, + 0x63, 0x61, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x2e, 0x6e, 0x61, + 0x6d, 0x65, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x41, 0x62, 0x6f, 0x72, + 0x74, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, + 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c, 0x61, 0x6d, + 0x61, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22, 0x2c, 0x20, + 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65, 0x3b, 0x0a, 0x20, + 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, + 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28, + 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x72, 0x65, + 0x74, 0x75, 0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, + 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, + 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, + 0x72, 0x6e, 0x20, 0x61, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x20, + 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, + 0x79, 0x6f, 0x75, 0x20, 0x63, 0x61, 0x6e, 0x20, 0x73, 0x75, 0x62, 0x73, + 0x63, 0x72, 0x69, 0x62, 0x65, 0x20, 0x74, 0x6f, 0x0a, 0x2f, 0x2f, 0x0a, + 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, + 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, + 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, + 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x7d, + 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70, + 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27, 0x0a, 0x2f, + 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, + 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, + 0x74, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x2e, 0x61, 0x64, 0x64, + 0x45, 0x76, 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, + 0x72, 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, + 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, + 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, + 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, + 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x65, 0x74, 0x61, 0x69, + 0x6c, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, + 0x2f, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, + 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, + 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, + 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, + 0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, + 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, + 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, + 0x20, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, + 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, + 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, + 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, + 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, + 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, + 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, + 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, + 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, + 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, + 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, + 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, + 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, + 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, + 0x6e, 0x74, 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, + 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, + 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x7d, + 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, + 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, + 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, + 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, + 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, + 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, + 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x67, + 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, + 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, + 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, + 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, + 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, + 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, + 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, + 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, - 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20, + 0x28, 0x22, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, - 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x7d, 0x29, 0x29, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, - 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, - 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, - 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, - 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, - 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, - 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x67, 0x65, 0x6e, - 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, - 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, - 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, - 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, - 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, - 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, - 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, - 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, - 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, - 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, - 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, - 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, - 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, - 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e, - 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, - 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, - 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, - 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x64, 0x6f, 0x6e, - 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, - 0x3a, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, - 0x7d, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x28, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, - 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x3b, - 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20, - 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x61, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x20, - 0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, - 0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6d, - 0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e, - 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20, 0x6e, - 0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x73, - 0x74, 0x72, 0x65, 0x61, 0x6d, 0x69, 0x6e, 0x67, 0x0a, 0x2f, 0x2f, 0x0a, - 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, - 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6c, - 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, - 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28, - 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d, 0x3e, - 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, - 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, - 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, - 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x72, 0x0a, 0x2f, - 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, - 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, - 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, 0x72, 0x6f, 0x6d, - 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, - 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, - 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, - 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, - 0x69, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, - 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, - 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, - 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x50, - 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, - 0x20, 0x28, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x2c, 0x20, 0x72, - 0x65, 0x6a, 0x65, 0x63, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, - 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, + 0x69, 0x6e, 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, + 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, + 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, + 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x64, + 0x6f, 0x6e, 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, + 0x69, 0x6c, 0x3a, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, + 0x74, 0x20, 0x7d, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, + 0x29, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, + 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, + 0x74, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, + 0x6c, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, + 0x75, 0x72, 0x6e, 0x20, 0x61, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x69, 0x73, + 0x65, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, + 0x76, 0x65, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, + 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x20, 0x74, 0x65, 0x78, + 0x74, 0x2e, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, 0x6f, 0x65, 0x73, + 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, + 0x20, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x69, 0x6e, 0x67, 0x0a, 0x2f, + 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, + 0x3a, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, + 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x2e, 0x74, 0x68, 0x65, + 0x6e, 0x28, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, + 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, + 0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, + 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, + 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x72, + 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, + 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x6c, 0x6c, 0x61, + 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, 0x72, + 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, + 0x69, 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, + 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, + 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, + 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, + 0x3d, 0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, + 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77, + 0x20, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x61, 0x73, 0x79, + 0x6e, 0x63, 0x20, 0x28, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x2c, + 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, + 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, + 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, + 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, + 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, + 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, + 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, + 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, + 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x6f, + 0x6c, 0x76, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, + 0x68, 0x20, 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, + 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, + 0x0a, 0x2f, 0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x28, 0x64, 0x65, 0x70, + 0x72, 0x65, 0x63, 0x61, 0x74, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x2a, 0x2f, + 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f, 0x6d, 0x70, 0x6c, + 0x65, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, + 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, + 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20, 0x63, 0x61, 0x6c, + 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, - 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, - 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, - 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, - 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, - 0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, - 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x20, - 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x65, - 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x2f, - 0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x28, 0x64, 0x65, 0x70, 0x72, 0x65, - 0x63, 0x61, 0x74, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x2a, 0x2f, 0x0a, 0x65, - 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, - 0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x70, - 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, - 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, - 0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, - 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x61, 0x72, 0x61, - 0x6d, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, - 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, - 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x7d, 0x29, 0x29, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61, - 0x63, 0x6b, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x47, 0x65, 0x74, - 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x20, 0x69, - 0x6e, 0x66, 0x6f, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x68, 0x65, - 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x20, 0x54, 0x68, 0x69, - 0x73, 0x20, 0x69, 0x73, 0x20, 0x75, 0x73, 0x65, 0x66, 0x75, 0x6c, 0x20, - 0x66, 0x6f, 0x72, 0x20, 0x67, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x20, - 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x20, - 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x73, - 0x6f, 0x20, 0x6f, 0x6e, 0x2e, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, - 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, 0x20, - 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x67, 0x65, 0x6e, - 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, - 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x20, - 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63, - 0x68, 0x28, 0x22, 0x2f, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x22, 0x29, 0x2e, - 0x74, 0x68, 0x65, 0x6e, 0x28, 0x72, 0x20, 0x3d, 0x3e, 0x20, 0x72, 0x2e, - 0x6a, 0x73, 0x6f, 0x6e, 0x28, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, - 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x70, - 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, - 0x5f, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, - 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x20, 0x20, - 0x7d, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67, + 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x61, + 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, + 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b, 0x20, 0x63, + 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x7d, 0x29, + 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x6c, 0x6c, + 0x62, 0x61, 0x63, 0x6b, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x3b, + 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x47, + 0x65, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64, 0x65, 0x6c, + 0x20, 0x69, 0x6e, 0x66, 0x6f, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, + 0x68, 0x65, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x20, 0x54, + 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x75, 0x73, 0x65, 0x66, 0x75, + 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x67, 0x65, 0x74, 0x74, 0x69, 0x6e, + 0x67, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, + 0x74, 0x20, 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x20, 0x61, 0x6e, 0x64, + 0x20, 0x73, 0x6f, 0x20, 0x6f, 0x6e, 0x2e, 0x0a, 0x65, 0x78, 0x70, 0x6f, + 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, + 0x6d, 0x61, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x20, + 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, + 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, - 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x7d, 0x0a + 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x70, + 0x73, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x66, 0x65, + 0x74, 0x63, 0x68, 0x28, 0x22, 0x2f, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x22, + 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28, 0x72, 0x20, 0x3d, 0x3e, 0x20, + 0x72, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x28, 0x29, 0x29, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, + 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, + 0x20, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x64, 0x65, 0x66, 0x61, 0x75, + 0x6c, 0x74, 0x5f, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, + 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, + 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, + 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, + 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x7d, 0x0a }; -unsigned int completion_js_len = 5782; +unsigned int completion_js_len = 5796; diff --git a/examples/server/public/completion.js b/examples/server/public/completion.js index ab38a7b409df1..835ce6e68422a 100644 --- a/examples/server/public/completion.js +++ b/examples/server/public/completion.js @@ -96,18 +96,18 @@ export async function* llama(prompt, params = {}, config = {}) { } } if (result.error) { - result.error = JSON.parse(result.error); - if (result.error.content.includes('slot unavailable')) { - // Throw an error to be caught by upstream callers - throw new Error('slot unavailable'); - } else { - console.error(`llama.cpp error: ${result.error.content}`); + try { + result.error = JSON.parse(result.error); + if (result.error.message.includes('slot unavailable')) { + // Throw an error to be caught by upstream callers + throw new Error('slot unavailable'); + } else { + console.error(`llama.cpp error [${result.error.code} - ${result.error.type}]: ${result.error.message}`); + } + } catch(e) { + console.error(`llama.cpp error ${result.error}`) } } - if (result.error) { - result.error = JSON.parse(result.error); - console.error(`llama.cpp error: ${result.error.content}`); - } } } } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3951507aa5fbe..b63a6f243b696 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -396,7 +396,7 @@ struct server_queue { // callback functions std::function callback_new_task; std::function callback_finish_multitask; - std::function callback_run_slots; + std::function callback_update_slots; // Add a new task to the end of the queue int post(server_task task) { @@ -435,8 +435,8 @@ struct server_queue { } // Register the function to be called when all slots data is ready to be processed - void on_run_slots(std::function callback) { - callback_run_slots = std::move(callback); + void on_update_slots(std::function callback) { + callback_update_slots = std::move(callback); } // Call when the state of one slot is changed @@ -461,7 +461,7 @@ struct server_queue { * - Wait until a new task arrives * - Process the task (i.e. maybe copy data into slot) * - Check if multitask is finished - * - Run all slots + * - Update all slots */ void start_loop() { running = true; @@ -499,9 +499,9 @@ struct server_queue { } // all tasks in the current loop is processed, slots data is now ready - LOG_VERBOSE("callback_run_slots", {}); + LOG_VERBOSE("callback_update_slots", {}); - callback_run_slots(); + callback_update_slots(); LOG_VERBOSE("wait for new task", {}); { @@ -805,9 +805,10 @@ struct server_context { return last_used; } - bool launch_slot_with_data(server_slot & slot, json data) const { + bool launch_slot_with_task(server_slot & slot, const server_task & task) { slot_params default_params; llama_sampling_params default_sparams; + auto & data = task.data; if (data.count("__oaicompat") != 0) { slot.oaicompat = true; @@ -864,10 +865,15 @@ struct server_context { { const auto & prompt = data.find("prompt"); if (prompt == data.end()) { - slot.prompt = ""; + send_error(task, "Either \"prompt\" or \"messages\" must be provided", ERROR_TYPE_INVALID_REQUEST); + return false; } else { slot.prompt = *prompt; } + if (slot.prompt.is_array() && slot.prompt.size() == 0) { + send_error(task, "\"prompt\" cannot be an empty array", ERROR_TYPE_INVALID_REQUEST); + return false; + } } // penalize user-provided tokens @@ -926,6 +932,7 @@ struct server_context { if (logit_bias != data.end() && logit_bias->is_array()) { const int n_vocab = llama_n_vocab(model); for (const auto & el : *logit_bias) { + // TODO: we may want to throw errors here, in case "el" is incorrect if (el.is_array() && el.size() == 2) { float bias; if (el[1].is_number()) { @@ -985,6 +992,11 @@ struct server_context { llama_sampling_free(slot.ctx_sampling); } slot.ctx_sampling = llama_sampling_init(slot.sparams); + if (slot.ctx_sampling == nullptr) { + // for now, the only error that may happen here is invalid grammar + send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST); + return false; + } llama_set_rng_seed(ctx, slot.params.seed); } @@ -1226,15 +1238,23 @@ struct server_context { }; } - void send_error(const server_task & task, const std::string & error) { - LOG_TEE("task %i - error: %s\n", task.id, error.c_str()); + void send_error(const server_task & task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { + send_error(task.id, task.id_multi, error, type); + } + + void send_error(const server_slot & slot, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { + send_error(slot.id_task, slot.id_multi, error, type); + } + + void send_error(const int id_task, const int id_multi, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { + LOG_TEE("task %i - error: %s\n", id_task, error.c_str()); server_task_result res; - res.id = task.id; - res.id_multi = task.id_multi; + res.id = id_task; + res.id_multi = id_multi; res.stop = false; res.error = true; - res.data = { { "content", error } }; + res.data = format_error_response(error, type); queue_results.send(res); } @@ -1468,9 +1488,8 @@ struct server_context { slot->infill = task.infill; slot->embedding = task.embedding; - if (!launch_slot_with_data(*slot, task.data)) { - // send error result - send_error(task, "internal_error"); + if (!launch_slot_with_task(*slot, task)) { + LOG_ERROR("error while launching slot", task.data); break; } } break; @@ -1587,7 +1606,7 @@ struct server_context { queue_results.send(result); } - bool update_slots() { + void update_slots() { if (system_need_update) { system_prompt_update(); } @@ -1630,7 +1649,7 @@ struct server_context { kv_cache_clear(); } - return true; + return; } } @@ -1975,8 +1994,7 @@ struct server_context { if (batch.n_tokens == 0) { LOG_VERBOSE("no tokens to decode", {}); - - return true; + return; } LOG_VERBOSE("decoding batch", { @@ -2033,7 +2051,13 @@ struct server_context { if (n_batch == 1 || ret < 0) { // if you get here, it means the KV cache is full - try increasing it via the context size LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", __func__, n_batch, ret); - return false; + for (auto & slot : slots) { + slot.state = SLOT_STATE_PROCESSING; + slot.command = SLOT_COMMAND_NONE; + slot.release(); + send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size."); + } + break; // break loop of n_batch } LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2); @@ -2042,12 +2066,12 @@ struct server_context { n_batch /= 2; i -= n_batch; - continue; + continue; // continue loop of n_batch } for (auto & slot : slots) { if (slot.state != SLOT_STATE_PROCESSING || slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) { - continue; + continue; // continue loop of slots } // prompt evaluated for embedding @@ -2055,7 +2079,7 @@ struct server_context { send_embedding(slot, batch_view); slot.release(); slot.i_batch = -1; - continue; + continue; // continue loop of slots } completion_token_output result; @@ -2097,9 +2121,7 @@ struct server_context { } } - LOG_VERBOSE("slots updated", {}); - - return true; + LOG_VERBOSE("run slots completed", {}); } json model_meta() const { @@ -2745,32 +2767,32 @@ int main(int argc, char ** argv) { svr->set_logger(log_server_request); - svr->set_exception_handler([](const httplib::Request &, httplib::Response & res, std::exception_ptr ep) { - const char fmt[] = "500 Internal Server Error\n%s"; + auto res_error = [](httplib::Response & res, json error_data) { + json final_response {{"error", error_data}}; + res.set_content(final_response.dump(), "application/json; charset=utf-8"); + res.status = json_value(error_data, "code", 500); + }; - char buf[BUFSIZ]; + svr->set_exception_handler([&res_error](const httplib::Request &, httplib::Response & res, std::exception_ptr ep) { + std::string message; try { std::rethrow_exception(std::move(ep)); - } catch (std::exception &e) { - snprintf(buf, sizeof(buf), fmt, e.what()); + } catch (std::exception & e) { + message = e.what(); } catch (...) { - snprintf(buf, sizeof(buf), fmt, "Unknown Exception"); + message = "Unknown Exception"; } - res.set_content(buf, "text/plain; charset=utf-8"); - res.status = 500; + json formatted_error = format_error_response(message, ERROR_TYPE_SERVER); + LOG_VERBOSE("Got exception", formatted_error); + res_error(res, formatted_error); }); - svr->set_error_handler([](const httplib::Request &, httplib::Response & res) { - if (res.status == 401) { - res.set_content("Unauthorized", "text/plain; charset=utf-8"); - } - if (res.status == 400) { - res.set_content("Invalid request", "text/plain; charset=utf-8"); - } + svr->set_error_handler([&res_error](const httplib::Request &, httplib::Response & res) { if (res.status == 404) { - res.set_content("File Not Found", "text/plain; charset=utf-8"); + res_error(res, format_error_response("File Not Found", ERROR_TYPE_NOT_FOUND)); } + // for other error codes, we skip processing here because it's already done by res_error() }); // set timeouts and change hostname and port @@ -2835,7 +2857,7 @@ int main(int argc, char ** argv) { // Middlewares // - auto middleware_validate_api_key = [&sparams](const httplib::Request & req, httplib::Response & res) { + auto middleware_validate_api_key = [&sparams, &res_error](const httplib::Request & req, httplib::Response & res) { // TODO: should we apply API key to all endpoints, including "/health" and "/models"? static const std::set protected_endpoints = { "/props", @@ -2876,8 +2898,7 @@ int main(int argc, char ** argv) { // API key is invalid or not provided // TODO: make another middleware for CORS related logic res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - res.set_content("Unauthorized: Invalid API Key", "text/plain; charset=utf-8"); - res.status = 401; // Unauthorized + res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION)); LOG_WARNING("Unauthorized: Invalid API Key", {}); @@ -2940,21 +2961,18 @@ int main(int argc, char ** argv) { } case SERVER_STATE_LOADING_MODEL: { - res.set_content(R"({"status": "loading model"})", "application/json"); - res.status = 503; // HTTP Service Unavailable + res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE)); } break; case SERVER_STATE_ERROR: { - res.set_content(R"({"status": "error", "error": "Model failed to load"})", "application/json"); - res.status = 500; // HTTP Internal Server Error + res_error(res, format_error_response("Model failed to load", ERROR_TYPE_SERVER)); } break; } }; const auto handle_slots = [&](const httplib::Request &, httplib::Response & res) { if (!sparams.slots_endpoint) { - res.status = 501; - res.set_content("This server does not support slots endpoint.", "text/plain; charset=utf-8"); + res_error(res, format_error_response("This server does not support slots endpoint.", ERROR_TYPE_NOT_SUPPORTED)); return; } @@ -2978,8 +2996,7 @@ int main(int argc, char ** argv) { const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) { if (!sparams.metrics_endpoint) { - res.status = 501; - res.set_content("This server does not support metrics endpoint.", "text/plain; charset=utf-8"); + res_error(res, format_error_response("This server does not support metrics endpoint.", ERROR_TYPE_NOT_SUPPORTED)); return; } @@ -3090,7 +3107,7 @@ int main(int argc, char ** argv) { res.set_content(data.dump(), "application/json; charset=utf-8"); }; - const auto handle_completions = [&ctx_server](const httplib::Request & req, httplib::Response & res) { + const auto handle_completions = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json data = json::parse(req.body); @@ -3105,8 +3122,7 @@ int main(int argc, char ** argv) { if (!result.error && result.stop) { res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8"); } else { - res.status = 500; - res.set_content(result.data["content"], "text/plain; charset=utf-8"); + res_error(res, result.data); } ctx_server.queue_results.remove_waiting_task_id(id_task); @@ -3186,7 +3202,7 @@ int main(int argc, char ** argv) { res.set_content(models.dump(), "application/json; charset=utf-8"); }; - const auto handle_chat_completions = [&ctx_server, &sparams](const httplib::Request & req, httplib::Response & res) { + const auto handle_chat_completions = [&ctx_server, &sparams, &res_error](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), sparams.chat_template); @@ -3204,8 +3220,7 @@ int main(int argc, char ** argv) { res.set_content(result_oai.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8"); } else { - res.status = 500; - res.set_content(result.data["content"], "text/plain; charset=utf-8"); + res_error(res, result.data); } ctx_server.queue_results.remove_waiting_task_id(id_task); } else { @@ -3259,7 +3274,7 @@ int main(int argc, char ** argv) { } }; - const auto handle_infill = [&ctx_server](const httplib::Request & req, httplib::Response & res) { + const auto handle_infill = [&ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json data = json::parse(req.body); @@ -3274,8 +3289,7 @@ int main(int argc, char ** argv) { if (!result.error && result.stop) { res.set_content(result.data.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8"); } else { - res.status = 404; - res.set_content(result.data["content"], "text/plain; charset=utf-8"); + res_error(res, result.data); } ctx_server.queue_results.remove_waiting_task_id(id_task); @@ -3346,7 +3360,7 @@ int main(int argc, char ** argv) { return res.set_content(data.dump(), "application/json; charset=utf-8"); }; - const auto handle_embeddings = [¶ms, &ctx_server](const httplib::Request & req, httplib::Response & res) { + const auto handle_embeddings = [¶ms, &ctx_server, &res_error](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); if (!params.embedding) { res.status = 501; @@ -3375,8 +3389,8 @@ int main(int argc, char ** argv) { std::string content = body["content"]; prompts.push_back(content); } else { - // TODO @ngxson : should return an error here - prompts.push_back(""); + res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST)); + return; } // process all prompts @@ -3392,9 +3406,14 @@ int main(int argc, char ** argv) { // get the result server_task_result result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); - - // append to the responses - responses.push_back(result.data); + if (!result.error) { + // append to the responses + responses.push_back(result.data); + } else { + // error received, ignore everything else + res_error(res, result.data); + return; + } } // write JSON response @@ -3488,7 +3507,7 @@ int main(int argc, char ** argv) { &server_context::process_single_task, &ctx_server, std::placeholders::_1)); ctx_server.queue_tasks.on_finish_multitask(std::bind( &server_context::on_finish_multitask, &ctx_server, std::placeholders::_1)); - ctx_server.queue_tasks.on_run_slots(std::bind( + ctx_server.queue_tasks.on_update_slots(std::bind( &server_context::update_slots, &ctx_server)); ctx_server.queue_results.on_multitask_update(std::bind( &server_queue::update_multitask, diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 4e81a255a08d9..98c2b61743cbf 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -801,7 +801,7 @@ async def oai_chat_completions(user_prompt, stream=enable_streaming, seed=seed ) - except openai.error.APIError as e: + except openai.error.AuthenticationError as e: if expect_api_error is not None and expect_api_error: return 401 else: diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index f27af81e99ef5..48aeef4ebea67 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -14,6 +14,17 @@ using json = nlohmann::json; +// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 +enum error_type { + ERROR_TYPE_INVALID_REQUEST, + ERROR_TYPE_AUTHENTICATION, + ERROR_TYPE_SERVER, + ERROR_TYPE_NOT_FOUND, + ERROR_TYPE_PERMISSION, + ERROR_TYPE_UNAVAILABLE, // custom error + ERROR_TYPE_NOT_SUPPORTED, // custom error +}; + extern bool server_verbose; extern bool server_log_json; @@ -542,3 +553,43 @@ static json format_detokenized_response(const std::string & content) { {"content", content} }; } + +static json format_error_response(const std::string & message, const enum error_type type) { + std::string type_str; + int code = 500; + switch (type) { + case ERROR_TYPE_INVALID_REQUEST: + type_str = "invalid_request_error"; + code = 400; + break; + case ERROR_TYPE_AUTHENTICATION: + type_str = "authentication_error"; + code = 401; + break; + case ERROR_TYPE_NOT_FOUND: + type_str = "not_found_error"; + code = 404; + break; + case ERROR_TYPE_SERVER: + type_str = "server_error"; + code = 500; + break; + case ERROR_TYPE_PERMISSION: + type_str = "permission_error"; + code = 403; + break; + case ERROR_TYPE_NOT_SUPPORTED: + type_str = "not_supported_error"; + code = 501; + break; + case ERROR_TYPE_UNAVAILABLE: + type_str = "unavailable_error"; + code = 503; + break; + } + return json { + {"code", code}, + {"message", message}, + {"type", type_str}, + }; +}