From 6f8e48923d95d19baf4dae39b0724e1fb76b0be6 Mon Sep 17 00:00:00 2001
From: Sangyub Lee <kykypyza3724@gmail.com>
Date: Thu, 10 Oct 2024 15:07:39 +0900
Subject: [PATCH] Add AIMer implementations (m4speed, m4stack) for all NIST
 security levels

---
 crypto_sign/aimer128f/m4speed/__asm_field.S | 544 +++++++++++++++
 crypto_sign/aimer128f/m4speed/aim2.c        | 459 +++++++++++++
 crypto_sign/aimer128f/m4speed/aim2.h        | 307 +++++++++
 crypto_sign/aimer128f/m4speed/api.h         |  44 ++
 crypto_sign/aimer128f/m4speed/field.c       | 406 ++++++++++++
 crypto_sign/aimer128f/m4speed/field.h       |  41 ++
 crypto_sign/aimer128f/m4speed/hash.c        |  41 ++
 crypto_sign/aimer128f/m4speed/hash.h        |  35 +
 crypto_sign/aimer128f/m4speed/params.h      |  28 +
 crypto_sign/aimer128f/m4speed/sign.c        | 547 +++++++++++++++
 crypto_sign/aimer128f/m4speed/sign.h        |  69 ++
 crypto_sign/aimer128f/m4speed/tree.c        |  94 +++
 crypto_sign/aimer128f/m4speed/tree.h        |  26 +
 crypto_sign/aimer128f/m4stack/__asm_field.S | 544 +++++++++++++++
 crypto_sign/aimer128f/m4stack/aim2.c        | 459 +++++++++++++
 crypto_sign/aimer128f/m4stack/aim2.h        | 307 +++++++++
 crypto_sign/aimer128f/m4stack/api.h         |  44 ++
 crypto_sign/aimer128f/m4stack/field.c       | 406 ++++++++++++
 crypto_sign/aimer128f/m4stack/field.h       |  41 ++
 crypto_sign/aimer128f/m4stack/hash.c        |  41 ++
 crypto_sign/aimer128f/m4stack/hash.h        |  35 +
 crypto_sign/aimer128f/m4stack/params.h      |  28 +
 crypto_sign/aimer128f/m4stack/sign.c        | 626 ++++++++++++++++++
 crypto_sign/aimer128f/m4stack/sign.h        |  73 ++
 crypto_sign/aimer128f/m4stack/tree.c        |  94 +++
 crypto_sign/aimer128f/m4stack/tree.h        |  28 +
 crypto_sign/aimer128s/m4speed/__asm_field.S | 544 +++++++++++++++
 crypto_sign/aimer128s/m4speed/aim2.c        | 459 +++++++++++++
 crypto_sign/aimer128s/m4speed/aim2.h        | 307 +++++++++
 crypto_sign/aimer128s/m4speed/api.h         |  44 ++
 crypto_sign/aimer128s/m4speed/field.c       | 406 ++++++++++++
 crypto_sign/aimer128s/m4speed/field.h       |  41 ++
 crypto_sign/aimer128s/m4speed/hash.c        |  71 ++
 crypto_sign/aimer128s/m4speed/hash.h        |  37 ++
 crypto_sign/aimer128s/m4speed/params.h      |  30 +
 crypto_sign/aimer128s/m4speed/sign.c        | 662 +++++++++++++++++++
 crypto_sign/aimer128s/m4speed/sign.h        |  82 +++
 crypto_sign/aimer128s/m4speed/tree.c        | 116 ++++
 crypto_sign/aimer128s/m4speed/tree.h        |  33 +
 crypto_sign/aimer128s/m4stack/__asm_field.S | 544 +++++++++++++++
 crypto_sign/aimer128s/m4stack/aim2.c        | 459 +++++++++++++
 crypto_sign/aimer128s/m4stack/aim2.h        | 307 +++++++++
 crypto_sign/aimer128s/m4stack/api.h         |  44 ++
 crypto_sign/aimer128s/m4stack/field.c       | 406 ++++++++++++
 crypto_sign/aimer128s/m4stack/field.h       |  41 ++
 crypto_sign/aimer128s/m4stack/hash.c        |  41 ++
 crypto_sign/aimer128s/m4stack/hash.h        |  35 +
 crypto_sign/aimer128s/m4stack/params.h      |  28 +
 crypto_sign/aimer128s/m4stack/sign.c        | 626 ++++++++++++++++++
 crypto_sign/aimer128s/m4stack/sign.h        |  73 ++
 crypto_sign/aimer128s/m4stack/tree.c        |  94 +++
 crypto_sign/aimer128s/m4stack/tree.h        |  28 +
 crypto_sign/aimer192f/m4speed/__asm_field.S | 617 +++++++++++++++++
 crypto_sign/aimer192f/m4speed/aim2.c        | 491 ++++++++++++++
 crypto_sign/aimer192f/m4speed/aim2.h        | 435 ++++++++++++
 crypto_sign/aimer192f/m4speed/api.h         |  44 ++
 crypto_sign/aimer192f/m4speed/field.c       | 482 ++++++++++++++
 crypto_sign/aimer192f/m4speed/field.h       |  41 ++
 crypto_sign/aimer192f/m4speed/hash.c        |  41 ++
 crypto_sign/aimer192f/m4speed/hash.h        |  35 +
 crypto_sign/aimer192f/m4speed/params.h      |  28 +
 crypto_sign/aimer192f/m4speed/sign.c        | 549 ++++++++++++++++
 crypto_sign/aimer192f/m4speed/sign.h        |  69 ++
 crypto_sign/aimer192f/m4speed/tree.c        |  94 +++
 crypto_sign/aimer192f/m4speed/tree.h        |  26 +
 crypto_sign/aimer192f/m4stack/__asm_field.S | 617 +++++++++++++++++
 crypto_sign/aimer192f/m4stack/aim2.c        | 491 ++++++++++++++
 crypto_sign/aimer192f/m4stack/aim2.h        | 435 ++++++++++++
 crypto_sign/aimer192f/m4stack/api.h         |  44 ++
 crypto_sign/aimer192f/m4stack/field.c       | 482 ++++++++++++++
 crypto_sign/aimer192f/m4stack/field.h       |  41 ++
 crypto_sign/aimer192f/m4stack/hash.c        |  41 ++
 crypto_sign/aimer192f/m4stack/hash.h        |  35 +
 crypto_sign/aimer192f/m4stack/params.h      |  28 +
 crypto_sign/aimer192f/m4stack/sign.c        | 628 ++++++++++++++++++
 crypto_sign/aimer192f/m4stack/sign.h        |  73 ++
 crypto_sign/aimer192f/m4stack/tree.c        |  94 +++
 crypto_sign/aimer192f/m4stack/tree.h        |  28 +
 crypto_sign/aimer192s/m4speed/__asm_field.S | 617 +++++++++++++++++
 crypto_sign/aimer192s/m4speed/aim2.c        | 491 ++++++++++++++
 crypto_sign/aimer192s/m4speed/aim2.h        | 435 ++++++++++++
 crypto_sign/aimer192s/m4speed/api.h         |  44 ++
 crypto_sign/aimer192s/m4speed/field.c       | 482 ++++++++++++++
 crypto_sign/aimer192s/m4speed/field.h       |  41 ++
 crypto_sign/aimer192s/m4speed/hash.c        |  71 ++
 crypto_sign/aimer192s/m4speed/hash.h        |  37 ++
 crypto_sign/aimer192s/m4speed/params.h      |  30 +
 crypto_sign/aimer192s/m4speed/sign.c        | 664 +++++++++++++++++++
 crypto_sign/aimer192s/m4speed/sign.h        |  82 +++
 crypto_sign/aimer192s/m4speed/tree.c        | 116 ++++
 crypto_sign/aimer192s/m4speed/tree.h        |  33 +
 crypto_sign/aimer192s/m4stack/__asm_field.S | 617 +++++++++++++++++
 crypto_sign/aimer192s/m4stack/aim2.c        | 491 ++++++++++++++
 crypto_sign/aimer192s/m4stack/aim2.h        | 435 ++++++++++++
 crypto_sign/aimer192s/m4stack/api.h         |  44 ++
 crypto_sign/aimer192s/m4stack/field.c       | 482 ++++++++++++++
 crypto_sign/aimer192s/m4stack/field.h       |  41 ++
 crypto_sign/aimer192s/m4stack/hash.c        |  41 ++
 crypto_sign/aimer192s/m4stack/hash.h        |  35 +
 crypto_sign/aimer192s/m4stack/params.h      |  28 +
 crypto_sign/aimer192s/m4stack/sign.c        | 628 ++++++++++++++++++
 crypto_sign/aimer192s/m4stack/sign.h        |  73 ++
 crypto_sign/aimer192s/m4stack/tree.c        |  94 +++
 crypto_sign/aimer192s/m4stack/tree.h        |  28 +
 crypto_sign/aimer256f/m4speed/__asm_field.S | 695 ++++++++++++++++++++
 crypto_sign/aimer256f/m4speed/aim2.c        | 598 +++++++++++++++++
 crypto_sign/aimer256f/m4speed/aim2.h        | 306 +++++++++
 crypto_sign/aimer256f/m4speed/api.h         |  44 ++
 crypto_sign/aimer256f/m4speed/field.c       | 608 +++++++++++++++++
 crypto_sign/aimer256f/m4speed/field.h       |  41 ++
 crypto_sign/aimer256f/m4speed/hash.c        |  41 ++
 crypto_sign/aimer256f/m4speed/hash.h        |  35 +
 crypto_sign/aimer256f/m4speed/params.h      |  28 +
 crypto_sign/aimer256f/m4speed/sign.c        | 573 ++++++++++++++++
 crypto_sign/aimer256f/m4speed/sign.h        |  69 ++
 crypto_sign/aimer256f/m4speed/tree.c        |  94 +++
 crypto_sign/aimer256f/m4speed/tree.h        |  26 +
 crypto_sign/aimer256f/m4stack/__asm_field.S | 695 ++++++++++++++++++++
 crypto_sign/aimer256f/m4stack/aim2.c        | 598 +++++++++++++++++
 crypto_sign/aimer256f/m4stack/aim2.h        | 306 +++++++++
 crypto_sign/aimer256f/m4stack/api.h         |  44 ++
 crypto_sign/aimer256f/m4stack/field.c       | 608 +++++++++++++++++
 crypto_sign/aimer256f/m4stack/field.h       |  41 ++
 crypto_sign/aimer256f/m4stack/hash.c        |  41 ++
 crypto_sign/aimer256f/m4stack/hash.h        |  35 +
 crypto_sign/aimer256f/m4stack/params.h      |  28 +
 crypto_sign/aimer256f/m4stack/sign.c        | 654 ++++++++++++++++++
 crypto_sign/aimer256f/m4stack/sign.h        |  73 ++
 crypto_sign/aimer256f/m4stack/tree.c        |  94 +++
 crypto_sign/aimer256f/m4stack/tree.h        |  28 +
 crypto_sign/aimer256s/m4speed/__asm_field.S | 695 ++++++++++++++++++++
 crypto_sign/aimer256s/m4speed/aim2.c        | 598 +++++++++++++++++
 crypto_sign/aimer256s/m4speed/aim2.h        | 306 +++++++++
 crypto_sign/aimer256s/m4speed/api.h         |  44 ++
 crypto_sign/aimer256s/m4speed/field.c       | 608 +++++++++++++++++
 crypto_sign/aimer256s/m4speed/field.h       |  41 ++
 crypto_sign/aimer256s/m4speed/hash.c        |  71 ++
 crypto_sign/aimer256s/m4speed/hash.h        |  37 ++
 crypto_sign/aimer256s/m4speed/params.h      |  30 +
 crypto_sign/aimer256s/m4speed/sign.c        | 690 +++++++++++++++++++
 crypto_sign/aimer256s/m4speed/sign.h        |  82 +++
 crypto_sign/aimer256s/m4speed/tree.c        | 116 ++++
 crypto_sign/aimer256s/m4speed/tree.h        |  33 +
 crypto_sign/aimer256s/m4stack/__asm_field.S | 695 ++++++++++++++++++++
 crypto_sign/aimer256s/m4stack/aim2.c        | 598 +++++++++++++++++
 crypto_sign/aimer256s/m4stack/aim2.h        | 306 +++++++++
 crypto_sign/aimer256s/m4stack/api.h         |  44 ++
 crypto_sign/aimer256s/m4stack/field.c       | 608 +++++++++++++++++
 crypto_sign/aimer256s/m4stack/field.h       |  41 ++
 crypto_sign/aimer256s/m4stack/hash.c        |  41 ++
 crypto_sign/aimer256s/m4stack/hash.h        |  35 +
 crypto_sign/aimer256s/m4stack/params.h      |  28 +
 crypto_sign/aimer256s/m4stack/sign.c        | 654 ++++++++++++++++++
 crypto_sign/aimer256s/m4stack/sign.h        |  73 ++
 crypto_sign/aimer256s/m4stack/tree.c        |  94 +++
 crypto_sign/aimer256s/m4stack/tree.h        |  28 +
 156 files changed, 36093 insertions(+)
 create mode 100644 crypto_sign/aimer128f/m4speed/__asm_field.S
 create mode 100644 crypto_sign/aimer128f/m4speed/aim2.c
 create mode 100644 crypto_sign/aimer128f/m4speed/aim2.h
 create mode 100644 crypto_sign/aimer128f/m4speed/api.h
 create mode 100644 crypto_sign/aimer128f/m4speed/field.c
 create mode 100644 crypto_sign/aimer128f/m4speed/field.h
 create mode 100644 crypto_sign/aimer128f/m4speed/hash.c
 create mode 100644 crypto_sign/aimer128f/m4speed/hash.h
 create mode 100644 crypto_sign/aimer128f/m4speed/params.h
 create mode 100644 crypto_sign/aimer128f/m4speed/sign.c
 create mode 100644 crypto_sign/aimer128f/m4speed/sign.h
 create mode 100644 crypto_sign/aimer128f/m4speed/tree.c
 create mode 100644 crypto_sign/aimer128f/m4speed/tree.h
 create mode 100644 crypto_sign/aimer128f/m4stack/__asm_field.S
 create mode 100644 crypto_sign/aimer128f/m4stack/aim2.c
 create mode 100644 crypto_sign/aimer128f/m4stack/aim2.h
 create mode 100644 crypto_sign/aimer128f/m4stack/api.h
 create mode 100644 crypto_sign/aimer128f/m4stack/field.c
 create mode 100644 crypto_sign/aimer128f/m4stack/field.h
 create mode 100644 crypto_sign/aimer128f/m4stack/hash.c
 create mode 100644 crypto_sign/aimer128f/m4stack/hash.h
 create mode 100644 crypto_sign/aimer128f/m4stack/params.h
 create mode 100644 crypto_sign/aimer128f/m4stack/sign.c
 create mode 100644 crypto_sign/aimer128f/m4stack/sign.h
 create mode 100644 crypto_sign/aimer128f/m4stack/tree.c
 create mode 100644 crypto_sign/aimer128f/m4stack/tree.h
 create mode 100644 crypto_sign/aimer128s/m4speed/__asm_field.S
 create mode 100644 crypto_sign/aimer128s/m4speed/aim2.c
 create mode 100644 crypto_sign/aimer128s/m4speed/aim2.h
 create mode 100644 crypto_sign/aimer128s/m4speed/api.h
 create mode 100644 crypto_sign/aimer128s/m4speed/field.c
 create mode 100644 crypto_sign/aimer128s/m4speed/field.h
 create mode 100644 crypto_sign/aimer128s/m4speed/hash.c
 create mode 100644 crypto_sign/aimer128s/m4speed/hash.h
 create mode 100644 crypto_sign/aimer128s/m4speed/params.h
 create mode 100644 crypto_sign/aimer128s/m4speed/sign.c
 create mode 100644 crypto_sign/aimer128s/m4speed/sign.h
 create mode 100644 crypto_sign/aimer128s/m4speed/tree.c
 create mode 100644 crypto_sign/aimer128s/m4speed/tree.h
 create mode 100644 crypto_sign/aimer128s/m4stack/__asm_field.S
 create mode 100644 crypto_sign/aimer128s/m4stack/aim2.c
 create mode 100644 crypto_sign/aimer128s/m4stack/aim2.h
 create mode 100644 crypto_sign/aimer128s/m4stack/api.h
 create mode 100644 crypto_sign/aimer128s/m4stack/field.c
 create mode 100644 crypto_sign/aimer128s/m4stack/field.h
 create mode 100644 crypto_sign/aimer128s/m4stack/hash.c
 create mode 100644 crypto_sign/aimer128s/m4stack/hash.h
 create mode 100644 crypto_sign/aimer128s/m4stack/params.h
 create mode 100644 crypto_sign/aimer128s/m4stack/sign.c
 create mode 100644 crypto_sign/aimer128s/m4stack/sign.h
 create mode 100644 crypto_sign/aimer128s/m4stack/tree.c
 create mode 100644 crypto_sign/aimer128s/m4stack/tree.h
 create mode 100644 crypto_sign/aimer192f/m4speed/__asm_field.S
 create mode 100644 crypto_sign/aimer192f/m4speed/aim2.c
 create mode 100644 crypto_sign/aimer192f/m4speed/aim2.h
 create mode 100644 crypto_sign/aimer192f/m4speed/api.h
 create mode 100644 crypto_sign/aimer192f/m4speed/field.c
 create mode 100644 crypto_sign/aimer192f/m4speed/field.h
 create mode 100644 crypto_sign/aimer192f/m4speed/hash.c
 create mode 100644 crypto_sign/aimer192f/m4speed/hash.h
 create mode 100644 crypto_sign/aimer192f/m4speed/params.h
 create mode 100644 crypto_sign/aimer192f/m4speed/sign.c
 create mode 100644 crypto_sign/aimer192f/m4speed/sign.h
 create mode 100644 crypto_sign/aimer192f/m4speed/tree.c
 create mode 100644 crypto_sign/aimer192f/m4speed/tree.h
 create mode 100644 crypto_sign/aimer192f/m4stack/__asm_field.S
 create mode 100644 crypto_sign/aimer192f/m4stack/aim2.c
 create mode 100644 crypto_sign/aimer192f/m4stack/aim2.h
 create mode 100644 crypto_sign/aimer192f/m4stack/api.h
 create mode 100644 crypto_sign/aimer192f/m4stack/field.c
 create mode 100644 crypto_sign/aimer192f/m4stack/field.h
 create mode 100644 crypto_sign/aimer192f/m4stack/hash.c
 create mode 100644 crypto_sign/aimer192f/m4stack/hash.h
 create mode 100644 crypto_sign/aimer192f/m4stack/params.h
 create mode 100644 crypto_sign/aimer192f/m4stack/sign.c
 create mode 100644 crypto_sign/aimer192f/m4stack/sign.h
 create mode 100644 crypto_sign/aimer192f/m4stack/tree.c
 create mode 100644 crypto_sign/aimer192f/m4stack/tree.h
 create mode 100644 crypto_sign/aimer192s/m4speed/__asm_field.S
 create mode 100644 crypto_sign/aimer192s/m4speed/aim2.c
 create mode 100644 crypto_sign/aimer192s/m4speed/aim2.h
 create mode 100644 crypto_sign/aimer192s/m4speed/api.h
 create mode 100644 crypto_sign/aimer192s/m4speed/field.c
 create mode 100644 crypto_sign/aimer192s/m4speed/field.h
 create mode 100644 crypto_sign/aimer192s/m4speed/hash.c
 create mode 100644 crypto_sign/aimer192s/m4speed/hash.h
 create mode 100644 crypto_sign/aimer192s/m4speed/params.h
 create mode 100644 crypto_sign/aimer192s/m4speed/sign.c
 create mode 100644 crypto_sign/aimer192s/m4speed/sign.h
 create mode 100644 crypto_sign/aimer192s/m4speed/tree.c
 create mode 100644 crypto_sign/aimer192s/m4speed/tree.h
 create mode 100644 crypto_sign/aimer192s/m4stack/__asm_field.S
 create mode 100644 crypto_sign/aimer192s/m4stack/aim2.c
 create mode 100644 crypto_sign/aimer192s/m4stack/aim2.h
 create mode 100644 crypto_sign/aimer192s/m4stack/api.h
 create mode 100644 crypto_sign/aimer192s/m4stack/field.c
 create mode 100644 crypto_sign/aimer192s/m4stack/field.h
 create mode 100644 crypto_sign/aimer192s/m4stack/hash.c
 create mode 100644 crypto_sign/aimer192s/m4stack/hash.h
 create mode 100644 crypto_sign/aimer192s/m4stack/params.h
 create mode 100644 crypto_sign/aimer192s/m4stack/sign.c
 create mode 100644 crypto_sign/aimer192s/m4stack/sign.h
 create mode 100644 crypto_sign/aimer192s/m4stack/tree.c
 create mode 100644 crypto_sign/aimer192s/m4stack/tree.h
 create mode 100644 crypto_sign/aimer256f/m4speed/__asm_field.S
 create mode 100644 crypto_sign/aimer256f/m4speed/aim2.c
 create mode 100644 crypto_sign/aimer256f/m4speed/aim2.h
 create mode 100644 crypto_sign/aimer256f/m4speed/api.h
 create mode 100644 crypto_sign/aimer256f/m4speed/field.c
 create mode 100644 crypto_sign/aimer256f/m4speed/field.h
 create mode 100644 crypto_sign/aimer256f/m4speed/hash.c
 create mode 100644 crypto_sign/aimer256f/m4speed/hash.h
 create mode 100644 crypto_sign/aimer256f/m4speed/params.h
 create mode 100644 crypto_sign/aimer256f/m4speed/sign.c
 create mode 100644 crypto_sign/aimer256f/m4speed/sign.h
 create mode 100644 crypto_sign/aimer256f/m4speed/tree.c
 create mode 100644 crypto_sign/aimer256f/m4speed/tree.h
 create mode 100644 crypto_sign/aimer256f/m4stack/__asm_field.S
 create mode 100644 crypto_sign/aimer256f/m4stack/aim2.c
 create mode 100644 crypto_sign/aimer256f/m4stack/aim2.h
 create mode 100644 crypto_sign/aimer256f/m4stack/api.h
 create mode 100644 crypto_sign/aimer256f/m4stack/field.c
 create mode 100644 crypto_sign/aimer256f/m4stack/field.h
 create mode 100644 crypto_sign/aimer256f/m4stack/hash.c
 create mode 100644 crypto_sign/aimer256f/m4stack/hash.h
 create mode 100644 crypto_sign/aimer256f/m4stack/params.h
 create mode 100644 crypto_sign/aimer256f/m4stack/sign.c
 create mode 100644 crypto_sign/aimer256f/m4stack/sign.h
 create mode 100644 crypto_sign/aimer256f/m4stack/tree.c
 create mode 100644 crypto_sign/aimer256f/m4stack/tree.h
 create mode 100644 crypto_sign/aimer256s/m4speed/__asm_field.S
 create mode 100644 crypto_sign/aimer256s/m4speed/aim2.c
 create mode 100644 crypto_sign/aimer256s/m4speed/aim2.h
 create mode 100644 crypto_sign/aimer256s/m4speed/api.h
 create mode 100644 crypto_sign/aimer256s/m4speed/field.c
 create mode 100644 crypto_sign/aimer256s/m4speed/field.h
 create mode 100644 crypto_sign/aimer256s/m4speed/hash.c
 create mode 100644 crypto_sign/aimer256s/m4speed/hash.h
 create mode 100644 crypto_sign/aimer256s/m4speed/params.h
 create mode 100644 crypto_sign/aimer256s/m4speed/sign.c
 create mode 100644 crypto_sign/aimer256s/m4speed/sign.h
 create mode 100644 crypto_sign/aimer256s/m4speed/tree.c
 create mode 100644 crypto_sign/aimer256s/m4speed/tree.h
 create mode 100644 crypto_sign/aimer256s/m4stack/__asm_field.S
 create mode 100644 crypto_sign/aimer256s/m4stack/aim2.c
 create mode 100644 crypto_sign/aimer256s/m4stack/aim2.h
 create mode 100644 crypto_sign/aimer256s/m4stack/api.h
 create mode 100644 crypto_sign/aimer256s/m4stack/field.c
 create mode 100644 crypto_sign/aimer256s/m4stack/field.h
 create mode 100644 crypto_sign/aimer256s/m4stack/hash.c
 create mode 100644 crypto_sign/aimer256s/m4stack/hash.h
 create mode 100644 crypto_sign/aimer256s/m4stack/params.h
 create mode 100644 crypto_sign/aimer256s/m4stack/sign.c
 create mode 100644 crypto_sign/aimer256s/m4stack/sign.h
 create mode 100644 crypto_sign/aimer256s/m4stack/tree.c
 create mode 100644 crypto_sign/aimer256s/m4stack/tree.h

diff --git a/crypto_sign/aimer128f/m4speed/__asm_field.S b/crypto_sign/aimer128f/m4speed/__asm_field.S
new file mode 100644
index 00000000..05656b37
--- /dev/null
+++ b/crypto_sign/aimer128f/m4speed/__asm_field.S
@@ -0,0 +1,544 @@
+#include "params.h"
+
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+.text
+
+.align  2
+.global AIMER_NAMESPACE(GF_to_bytes)    // three exported names share one body:
+.global AIMER_NAMESPACE(GF_from_bytes)  // each copies four 32-bit words (16 bytes)
+.global AIMER_NAMESPACE(GF_copy)        // from the address in R1 to the address in R0
+.type   AIMER_NAMESPACE(GF_to_bytes), %function
+.type   AIMER_NAMESPACE(GF_from_bytes), %function
+.type   AIMER_NAMESPACE(GF_copy), %function
+AIMER_NAMESPACE(GF_to_bytes):
+AIMER_NAMESPACE(GF_from_bytes):
+AIMER_NAMESPACE(GF_copy):
+  out_p       .req R0  // destination pointer
+  in_p        .req R1  // source pointer
+
+  .equ width, 4  // size in bytes of one 32-bit word
+
+  ldr.w R2, [in_p, #0 * width]  // words 0 and 1; R2 and R3 are the only scratch registers
+  ldr.w R3, [in_p, #1 * width]
+  str.w R2, [out_p, #0 * width]
+  str.w R3, [out_p, #1 * width]
+
+  ldr.w R2, [in_p, #2 * width]  // words 2 and 3
+  ldr.w R3, [in_p, #3 * width]
+  str.w R2, [out_p, #2 * width]
+  str.w R3, [out_p, #3 * width]
+
+  bx    lr  // leaf routine: nothing was pushed, return directly
+
+  .unreq in_p
+  .unreq out_p
+
+.align  2
+.global AIMER_NAMESPACE(GF_set0)  // writes zero to the four 32-bit words at the address in R0
+.type   AIMER_NAMESPACE(GF_set0), %function
+AIMER_NAMESPACE(GF_set0):
+  out_p       .req R0  // destination pointer
+
+  .equ width, 4  // size in bytes of one 32-bit word
+
+  mov.w R2, #0  // R2 is the only scratch register
+  str.w R2, [out_p, #0 * width]
+  str.w R2, [out_p, #1 * width]
+  str.w R2, [out_p, #2 * width]
+  str.w R2, [out_p, #3 * width]
+
+  bx    lr  // leaf routine: nothing was pushed, return directly
+
+  .unreq out_p
+
+.align  2
+.global AIMER_NAMESPACE(GF_add)  // out = in0 XOR in1 over four 32-bit words
+.type   AIMER_NAMESPACE(GF_add), %function
+AIMER_NAMESPACE(GF_add):
+  out_p       .req R0  // destination pointer
+  in0_p       .req R1  // first source pointer
+  in1_p       .req R2  // second source pointer
+
+  .equ width, 4  // size in bytes of one 32-bit word
+
+  ldr.w R3,  [in0_p, #0 * width]  // word 0; R3 and R12 are the only scratch registers
+  ldr.w R12, [in1_p, #0 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #0 * width]  // each word is stored only after both of its inputs were loaded
+
+  ldr.w R3,  [in0_p, #1 * width]  // word 1
+  ldr.w R12, [in1_p, #1 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #1 * width]
+
+  ldr.w R3,  [in0_p, #2 * width]  // word 2
+  ldr.w R12, [in1_p, #2 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #2 * width]
+
+  ldr.w R3,  [in0_p, #3 * width]  // word 3
+  ldr.w R12, [in1_p, #3 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #3 * width]
+
+  bx    lr  // leaf routine: nothing was pushed, return directly
+
+  .unreq out_p
+  .unreq in0_p
+  .unreq in1_p
+
+.macro or_shift_and in_a, con_a, off_a  // in_a = (in_a | (in_a << off_a)) & con_a, updated in place
+  orr.w \in_a, \in_a, \in_a, lsl #\off_a  // overlay a copy of the register shifted left by off_a
+  and.w \in_a, \in_a, \con_a              // keep only the bit positions selected by the constant con_a
+.endm
+
+.align  2
+.global AIMER_NAMESPACE(GF_sqr_s)  // squares the four-word binary polynomial at [R1], reduces it, stores four words at [R0]
+.type   AIMER_NAMESPACE(GF_sqr_s), %function
+AIMER_NAMESPACE(GF_sqr_s):
+  out_p       .req R0  // destination pointer
+  in_p        .req R1  // source pointer
+
+  in0         .req R2  // in0..in7 hold the eight 16-bit halves of the input, low half first;
+  in1         .req R3  // after spreading, the pairs (in1:in0), (in3:in2), (in5:in4), (in7:in6)
+  in2         .req R4  // are the 64-bit values called temp[0]..temp[3] in the comments below
+  in3         .req R5
+  in4         .req R6
+  in5         .req R7
+  in6         .req R8
+  in7         .req R9
+
+  .equ C0,    1431655765  // 0x55555555
+  .equ C1,    858993459   // 0x33333333
+  .equ C2,    252645135   // 0x0F0F0F0F
+  .equ C3,    16711935    // 0x00FF00FF
+  .equ C4,    4294967295  // 0xFFFFFFFF
+  .equ width, 4
+
+  push.w {R4-R10, lr}  // R4-R9 back in2..in7, R10 holds the C4 constant
+
+  ldr.w in0, [in_p, #0 * width]  // a[0]
+  ldr.w in2, [in_p, #1 * width]
+  ldr.w in4, [in_p, #2 * width]  // a[1]
+  ldr.w in6, [in_p, #3 * width]
+
+  lsr.w in1, in0, #16  // odd-numbered registers take the upper 16 bits of each word
+  lsr.w in3, in2, #16
+  lsr.w in5, in4, #16
+  lsr.w in7, in6, #16
+
+  mov.w R10, C4
+
+  and.w in0, in0, R10, lsr #16  // even-numbered registers keep the lower 16 bits (mask 0x0000FFFF)
+  and.w in2, in2, R10, lsr #16
+  and.w in4, in4, R10, lsr #16
+  and.w in6, in6, R10, lsr #16
+
+  or_shift_and in0, C3, 8  // four spreading rounds (shift 8, 4, 2, 1) move input bit i to bit 2*i,
+  or_shift_and in1, C3, 8  // which is the square of a polynomial with coefficients in GF(2)
+  or_shift_and in2, C3, 8
+  or_shift_and in3, C3, 8
+  or_shift_and in4, C3, 8
+  or_shift_and in5, C3, 8
+  or_shift_and in6, C3, 8
+  or_shift_and in7, C3, 8
+
+  or_shift_and in0, C2, 4
+  or_shift_and in1, C2, 4
+  or_shift_and in2, C2, 4
+  or_shift_and in3, C2, 4
+  or_shift_and in4, C2, 4
+  or_shift_and in5, C2, 4
+  or_shift_and in6, C2, 4
+  or_shift_and in7, C2, 4
+
+  or_shift_and in0, C1, 2
+  or_shift_and in1, C1, 2
+  or_shift_and in2, C1, 2
+  or_shift_and in3, C1, 2
+  or_shift_and in4, C1, 2
+  or_shift_and in5, C1, 2
+  or_shift_and in6, C1, 2
+  or_shift_and in7, C1, 2
+
+  or_shift_and in0, C0, 1
+  or_shift_and in1, C0, 1
+  or_shift_and in2, C0, 1
+  or_shift_and in3, C0, 1
+  or_shift_and in4, C0, 1
+  or_shift_and in5, C0, 1
+  or_shift_and in6, C0, 1
+  or_shift_and in7, C0, 1
+
+  // t = temp[2] ^ ((temp[3] >> 57) ^ (temp[3] >> 62) ^ (temp[3] >> 63));
+  eor.w in4, in4, in7, lsr #25  // a 64-bit shift right by 57 is a shift by 25 of the high word in7
+  eor.w in4, in4, in7, lsr #30  // from here on (in5:in4) holds t
+  eor.w in4, in4, in7, lsr #31
+
+  // c[1] = temp[1] ^ temp[3];
+  eor.w in2, in2, in6
+  eor.w in3, in3, in7
+
+  // c[1] ^= (temp[3] << 7) | (t >> 57);
+  // c[1] ^= (temp[3] << 2) | (t >> 62);
+  // c[1] ^= (temp[3] << 1) | (t >> 63);
+  eor.w in2, in2, in5, lsr #25  // low word of c[1]: bits carried in from the top of t
+  eor.w in2, in2, in5, lsr #30
+  eor.w in2, in2, in5, lsr #31
+
+  eor.w in2, in2, in6, lsl #7  // low word of c[1]: shifted low word of temp[3]
+  eor.w in2, in2, in6, lsl #2
+  eor.w in2, in2, in6, lsl #1
+
+  eor.w in3, in3, in6, lsr #25  // high word of c[1]: bits carried across from the low word of temp[3]
+  eor.w in3, in3, in6, lsr #30
+  eor.w in3, in3, in6, lsr #31
+
+  eor.w in3, in3, in7, lsl #7  // high word of c[1]: shifted high word of temp[3]
+  eor.w in3, in3, in7, lsl #2
+  eor.w in3, in3, in7, lsl #1
+
+  // c[0] = temp[0] ^ t;
+  eor.w in0, in0, in4
+  eor.w in1, in1, in5
+
+  // c[0] ^= (t << 7);
+  // c[0] ^= (t << 2);
+  // c[0] ^= (t << 1);
+  eor.w in0, in0, in4, lsl #7  // low word of c[0]
+  eor.w in0, in0, in4, lsl #2
+  eor.w in0, in0, in4, lsl #1
+
+  eor.w in1, in1, in4, lsr #25  // high word of c[0]: bits carried across from the low word of t
+  eor.w in1, in1, in4, lsr #30
+  eor.w in1, in1, in4, lsr #31
+
+  eor.w in1, in1, in5, lsl #7  // high word of c[0]: shifted high word of t
+  eor.w in1, in1, in5, lsl #2
+  eor.w in1, in1, in5, lsl #1
+
+  str.w in0, [out_p, #0 * width]  // c[0] goes to words 0-1, c[1] to words 2-3
+  str.w in1, [out_p, #1 * width]
+  str.w in2, [out_p, #2 * width]
+  str.w in3, [out_p, #3 * width]
+
+  pop.w {R4-R10, pc}  // restore the saved registers and return
+
+  // unlink register name (NOTE(review): only in_p and out_p are released; in0..in7 stay defined)
+  .unreq in_p
+  .unreq out_p
+
+.macro lut_access0_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset  // first lookup step: initialises in0_0..in0_2
+  and \sp1, \mask, \b0_1, lsr #\offset  // byte offset selected by a field of the high word b0_1
+  and \sp0, \mask, \b0_0, lsr #\offset  // byte offset selected by the same field of the low word b0_0
+  add \sp1, \sp1, sp                    // turn both offsets into addresses relative to the stack pointer
+  add \sp0, \sp0, sp
+
+  ldr \out1_0, [\sp1, #0]  // entry for b0_1: low word, folded into in0_1 below
+  ldr \in0_2, [\sp1, #4]   // entry for b0_1: high word becomes accumulator word 2
+
+  ldr \in0_0, [\sp0, #0]   // entry for b0_0 becomes accumulator words 0 and 1
+  ldr \in0_1, [\sp0, #4]
+  eor \in0_1, \in0_1, \out1_0  // out0_0, out0_1, out1_1 and in0_3 are not touched by this macro
+.endm
+
+.macro lut_access0_1_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset  // second lookup step: first write to in0_3
+  and \sp1, \mask, \b0_1, lsr #\offset  // byte offsets of the two entries, as in lut_access0_0
+  and \sp0, \mask, \b0_0, lsr #\offset
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+
+  lsr \in0_3, \in0_2, #28  // in0_3 had no value yet: it receives the 4 bits shifted out of in0_2
+  ldr \out1_0, [\sp1, #0]  // entry selected by b0_1
+  ldr \out1_1, [\sp1, #4]
+
+  lsl \in0_2, \in0_2, #4   // shift the accumulator (in0_2:in0_1:in0_0) left by 4 bits, top word first
+  ldr \out0_0, [\sp0, #0]  // entry selected by b0_0
+  ldr \out0_1, [\sp0, #4]
+
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  lsl \in0_1, \in0_1, #4
+  orr \in0_1, \in0_1, \in0_0, lsr #28
+  lsl \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0  // entry for b0_1 is added into words 1 and 2
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0  // entry for b0_0 is added into words 0 and 1
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro lut_access0_1 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset  // general lookup step on the full four-word accumulator
+  and \sp1, \mask, \b0_1, lsr #\offset  // byte offsets of the two entries, as in lut_access0_0
+  and \sp0, \mask, \b0_0, lsr #\offset
+  lsl \in0_3, \in0_3, #4                // start shifting (in0_3:in0_2:in0_1:in0_0) left by 4 bits
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+
+  orr \in0_3, \in0_3, \in0_2, lsr #28
+  ldr \out1_0, [\sp1, #0]  // entry selected by b0_1
+  ldr \out1_1, [\sp1, #4]
+
+  lsl \in0_2, \in0_2, #4
+  ldr \out0_0, [\sp0, #0]  // entry selected by b0_0
+  ldr \out0_1, [\sp0, #4]
+
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  lsl \in0_1, \in0_1, #4
+  orr \in0_1, \in0_1, \in0_0, lsr #28
+  lsl \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0  // entry for b0_1 is added into words 1 and 2
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0  // entry for b0_0 is added into words 0 and 1
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro lut_access1 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset  // lookup step whose index is formed with a left shift
+  lsl \in0_3, \in0_3, #4                // shift (in0_3:in0_2:in0_1:in0_0) left by 4 bits, interleaved with the address setup
+  and \sp1, \mask, \b0_1, lsl #\offset  // byte offset from b0_1, shifted left instead of right
+  orr \in0_3, \in0_3, \in0_2, lsr #28
+  and \sp0, \mask, \b0_0, lsl #\offset  // byte offset from b0_0
+
+  lsl \in0_2, \in0_2, #4
+  add \sp1, \sp1, sp                    // offsets become addresses relative to the stack pointer
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  add \sp0, \sp0, sp
+
+  lsl   \in0_1, \in0_1, #4
+  ldmia \sp1, {\out1_0-\out1_1}  // entry selected by b0_1, both words in one load
+  orr   \in0_1, \in0_1, \in0_0, lsr #28
+  ldmia \sp0, {\out0_0-\out0_1}  // entry selected by b0_0
+  lsl   \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0  // entry for b0_1 is added into words 1 and 2
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0  // entry for b0_0 is added into words 0 and 1
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, offset  // if bit offset of r0_ret is set, XOR (b0_1:b0_0) << (32 + offset) into the accumulator
+  and \mask, \one, \r0_ret, lsr #\offset  // isolate the tested bit (the caller passes the value 1 in one)
+  sub \mask, \zero, \mask                 // 0 minus the bit: all ones when set, zero otherwise (branch-free select)
+  and \mask0_1, \b0_0, \mask              // masked copy of the low word
+  and \mask0_2, \b0_1, \mask              // masked copy of the high word
+
+  lsl \mask0_0, \mask0_1, #\offset                // part that lands in accumulator word 1
+  lsr \mask0_1, \mask0_1, #32 - \offset           // part that lands in accumulator word 2 ...
+  orr \mask0_1, \mask0_1, \mask0_2, lsl #\offset  // ... joined with the low bits of the high word
+  lsr \mask0_2, \mask0_2, #32 - \offset           // part that lands in accumulator word 3
+
+  eor \in0_1, \in0_1, \mask0_0  // accumulator word 0 is never affected
+  eor \in0_2, \in0_2, \mask0_1
+  eor \in0_3, \in0_3, \mask0_2
+.endm
+
+.macro last_mask1 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, offset  // same as last_mask0 without the AND with one; the one argument is unused
+  sub \mask, \zero, \r0_ret, lsr #\offset  // with offset 31 the shifted value is already 0 or 1, so this is zero or all ones
+  and \mask0_1, \b0_0, \mask               // masked copy of the low word
+  and \mask0_2, \b0_1, \mask               // masked copy of the high word
+
+  lsl \mask0_0, \mask0_1, #\offset                // part that lands in accumulator word 1
+  lsr \mask0_1, \mask0_1, #32 - \offset           // part that lands in accumulator word 2 ...
+  orr \mask0_1, \mask0_1, \mask0_2, lsl #\offset  // ... joined with the low bits of the high word
+  lsr \mask0_2, \mask0_2, #32 - \offset           // part that lands in accumulator word 3
+
+  eor \in0_1, \in0_1, \mask0_0  // accumulator word 0 is never affected
+  eor \in0_2, \in0_2, \mask0_1
+  eor \in0_3, \in0_3, \mask0_2
+.endm
+
+.global AIMER_NAMESPACE(poly64_mul)  // carry-less 64x64 -> 128 bit multiply: operands at [R0] and [R1], high half stored at [R2], low half at [R3]
+.type   AIMER_NAMESPACE(poly64_mul), %function
+AIMER_NAMESPACE(poly64_mul):
+  t0_0    .req R0  // t0..t5: six 64-bit register pairs (low word, high word) used to build the table
+  t0_1    .req R1
+  t1_0    .req R2
+  t1_1    .req R3
+  t2_0    .req R4
+  t2_1    .req R5
+  t3_0    .req R6
+  t3_1    .req R7
+  t4_0    .req R8
+  t4_1    .req R9
+  t5_0    .req R10
+  t5_1    .req R11
+
+  r1_copy .req R12  // keeps the second operand pointer while R1 is reused as t0_1
+  t_base  .req R14  // table write cursor; lr is free because it was pushed
+
+  sp0     .req R12  // lookup phase: the same physical registers get new names
+  sp1     .req R14
+
+  b0_0    .req R0  // second operand, low word
+  b0_1    .req R1  // second operand, high word
+
+  in0_0   .req R2  // in0_0..in0_3: 128-bit accumulator, lowest word first
+  in0_1   .req R3
+  in0_2   .req R4
+  in0_3   .req R5
+
+  out0_0  .req R6  // out0: table entry selected by b0_0
+  out0_1  .req R7
+  out1_0  .req R8  // out1: table entry selected by b0_1
+  out1_1  .req R9
+
+  mask    .req R10
+
+  zero    .req R6  // correction phase: R6..R9, R11, R12 are renamed once more
+  one     .req R7
+
+  r0_ret  .req R8  // high word of the first operand, reloaded from the stack
+
+  mask0_0 .req R9
+  mask0_1 .req R11
+  mask0_2 .req R12
+
+  push  {R4-R11, lr}
+  push  {R2-R3}  // keep both output pointers; they end up at sp+132 and sp+136 after the allocations below
+
+  ldr   t1_0, [R0, #0]  // load the first operand
+  ldr   t1_1, [R0, #4]
+  push  {t1_1}  // keep its unmodified high word (at sp+128 later) for the final correction
+
+  sub   sp, #128  // allocating space in the stack
+                  // 8 bytes * 16 = 128 bytes
+  mov   t_base, sp
+  mov   r1_copy, R1
+
+  mov   t0_0, #0  // table entry 0 is zero
+  mov   t0_1, #0
+
+  and   t1_1, #0x1FFFFFFF  // clear the top three bits so the shifts by up to 3 below lose nothing; they are handled by last_mask0/last_mask1
+
+  lsl   t2_1, t1_1, #1  // t2 = t1 << 1 (entry 2)
+  orr   t2_1, t2_1, t1_0, lsr #31
+  lsl   t2_0, t1_0, #1
+
+  eor   t3_0, t1_0, t2_0  // t3 = t1 ^ t2 (entry 3)
+  eor   t3_1, t1_1, t2_1
+
+  lsl   t4_1, t2_1, #1  // t4 = t2 << 1 (entry 4)
+  orr   t4_1, t4_1, t2_0, lsr #31
+  lsl   t4_0, t2_0, #1
+
+  eor   t5_0, t1_0, t4_0  // t5 = t1 ^ t4 (entry 5)
+  eor   t5_1, t1_1, t4_1
+
+  stmia t_base!, {t0_0-t5_1}  // 4 bytes X 12 elements  = 48 bytes
+
+  eor   t0_0, t2_0, t4_0  // t0 = entry 2 ^ entry 4 (entry 6)
+  eor   t0_1, t2_1, t4_1
+
+  lsl   t2_1, t4_1, #1  // t2 = entry 4 << 1 (entry 8)
+  orr   t2_1, t2_1, t4_0, lsr #31
+  lsl   t2_0, t4_0, #1
+
+  eor   t5_0, t2_0, t3_0  // t5 = entry 8 ^ entry 3 (entry 11)
+  eor   t5_1, t2_1, t3_1
+
+  eor   t3_0, t2_0, t1_0  // t3 = entry 8 ^ entry 1 (entry 9)
+  eor   t3_1, t2_1, t1_1
+
+  eor   t4_0, t4_0, t0_0  // t4 = entry 4 ^ entry 6, which equals entry 2 ...
+  eor   t4_1, t4_1, t0_1
+
+  eor   t4_0, t4_0, t2_0  // ... then ^ entry 8 (entry 10)
+  eor   t4_1, t4_1, t2_1
+
+  eor   t1_0, t1_0, t0_0  // t1 = entry 1 ^ entry 6 (entry 7)
+  eor   t1_1, t1_1, t0_1
+
+  stmia t_base!, {t0_0-t5_1}  // 4 bytes X 12 elements  = 48 bytes
+
+  eor   t1_0, t5_0, t0_0  // t1 = entry 11 ^ entry 6 (entry 13)
+  eor   t1_1, t5_1, t0_1
+
+  eor   t2_0, t2_0, t0_0  // t2 = entry 8 ^ entry 6 (entry 14)
+  eor   t2_1, t2_1, t0_1
+
+  eor   t3_0, t3_0, t0_0  // t3 = entry 9 ^ entry 6 (entry 15)
+  eor   t3_1, t3_1, t0_1
+
+  eor   t0_0, t4_0, t0_0  // t0 = entry 10 ^ entry 6 (entry 12); done last because t0 is an input above
+  eor   t0_1, t4_1, t0_1
+
+  stmia t_base, {t0_0-t3_1}  // 4 bytes X 8 elements  = 32 bytes
+
+  ldmia r1_copy, {b0_0-b0_1}  // load the second operand; the table registers are free again
+  mov   mask, #0x00000078  // 4-bit index already scaled by 8, the size of one table entry
+
+  lut_access0_0   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 25
+  lut_access0_1_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 21
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 17
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 13
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 9
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 5
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 1
+  lut_access1     sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 3
+
+  mov   zero, #0
+  mov   one,  #1
+  ldr   r0_ret, [sp, #128]  // unmodified high word of the first operand, pushed before the table was allocated
+
+  last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 29
+  last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 30
+  last_mask1 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 31
+
+  ldr   R0, [sp, #132]  // saved R2: destination of the high 64 bits
+  ldr   R1, [sp, #136]  // saved R3: destination of the low 64 bits
+  add   sp, #140  // restoring stack
+
+  str   in0_0, [R1, #0]  // low 64 bits of the product
+  str   in0_1, [R1, #4]
+
+  str   in0_2, [R0, #0]  // high 64 bits of the product
+  str   in0_3, [R0, #4]
+
+  pop   {R4-R11, pc}
+
+  // unlink register name
+  .unreq t0_0
+  .unreq t0_1
+  .unreq t1_0
+  .unreq t1_1
+  .unreq t2_0
+  .unreq t2_1
+  .unreq t3_0
+  .unreq t3_1
+  .unreq t4_0
+  .unreq t4_1
+  .unreq t5_0
+  .unreq t5_1
+
+  .unreq r1_copy
+  .unreq t_base
+
+  .unreq sp0
+  .unreq sp1
+
+  .unreq b0_0
+  .unreq b0_1
+
+  .unreq in0_0
+  .unreq in0_1
+  .unreq in0_2
+  .unreq in0_3
+
+  .unreq out0_0
+  .unreq out0_1
+  .unreq out1_0
+  .unreq out1_1
+
+  .unreq mask
+
+  .unreq zero
+  .unreq one
+
+  .unreq r0_ret
+
+  .unreq mask0_0
+  .unreq mask0_1
+  .unreq mask0_2
diff --git a/crypto_sign/aimer128f/m4speed/aim2.c b/crypto_sign/aimer128f/m4speed/aim2.c
new file mode 100644
index 00000000..7cb00352
--- /dev/null
+++ b/crypto_sign/aimer128f/m4speed/aim2.c
@@ -0,0 +1,459 @@
+// SPDX-License-Identifier: MIT
+
+#include "aim2.h"
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+// Inverse Mersenne S-box with e1 = 49: out = in ^ d, where d is the inverse of
+// (2 ^ 49 - 1) mod (2 ^ 128 - 1), i.e. d = 0xb6b6d6d6dadb5b5b6b6b6d6dadadb5b5.
+// Windows, left to right: b6b6d6d 6 d a d b5 b5 b6 b6b6d6d a d a d b5 b5
+void GF_exp_invmer_e_1(GF out, const GF in)
+{
+  size_t i;
+  GF t1 = {0,}, t2 = {0,};  // t1: running power, t2: saved in ^ (0xb6b6d6d)
+  GF table_5 = {0,}, table_6 = {0,};
+  GF table_a = {0,}, table_b = {0,}, table_d = {0,};
+
+  // table_d = in ^ 2 (temporary use), t1 = in ^ 4
+  GF_sqr_s(table_d, in);
+  GF_sqr_s(t1, table_d);
+
+  // table_5 = in ^ 5
+  GF_mul_s(table_5, t1, in);
+  // table_6 = in ^ 6
+  GF_mul_s(table_6, table_5, in);
+  // table_a = in ^ 10 = (in ^ 5) ^ 2
+  GF_sqr_s(table_a, table_5);
+  // table_b = in ^ 11
+  GF_mul_s(table_b, table_a, in);
+  // table_d = in ^ 13 = (in ^ 11) * (in ^ 2)
+  GF_mul_s(table_d, table_b, table_d);
+
+  // t1 = in ^ (0xb0), then table_b = in ^ (0xb6), table_5 = in ^ (0xb5)
+  GF_sqr_s(t1, table_b);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(table_b, t1, table_6);  // in ^ 11 is not needed past this point
+  GF_mul_s(table_5, t1, table_5);  // in ^ 5 is not needed past this point
+
+  // t1 = in ^ (0xb6b6): 8 squarings in total (1 + 7), then one multiplication
+  GF_sqr_s(t1, table_b);
+  for (i = 1; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xb6b6 d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6b6d 6)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_6);
+
+  // t2 = in ^ (0xb6b6d6 d), kept for the 28-bit window further down
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t2, t1, table_d);
+
+  // t1 = in ^ (0xb6b6d6d 6)
+  GF_sqr_s(t1, t2);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_6);
+
+  // t1 = in ^ (0xb6b6d6d6 d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6b6d6d6d a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xb6b6d6d6da d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6b6d6d6dad b5)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5 b5)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5 b6)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5b6 b6b6d6d), reusing t2 as a 7-digit window
+  for (i = 0; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5b6b6b6d6d a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5b6b6b6d6da d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5b6b6b6d6dad a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5b6b6b6d6dada d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5b6b6b6d6dadad b5)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_5);
+
+  // out = in ^ (0xb6b6d6d6dadb5b5b6b6b6d6dadadb5 b5)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(out, t1, table_5);
+}
+
+// Inverse Mersenne S-box with e2 = 91: out = in ^ d, where d is the inverse of
+// (2 ^ 91 - 1) mod (2 ^ 128 - 1), i.e. d = 0xb6db5b6dadb6dadb6d6db6d6db6b6db5.
+// Windows, left to right: b6d b5 b6d a d b6d a d b6d 6 d b6d 6 d b6 b6d b5
+void GF_exp_invmer_e_2(GF out, const GF in)
+{
+  size_t i;
+  GF t1 = {0,}, t2 = {0,}, t3 = {0,};  // t2: in ^ (0xb6d), t3: in ^ (0xb6)
+  GF table_5 = {0,}, table_6 = {0,};
+  GF table_a = {0,}, table_b = {0,}, table_d = {0,};
+
+  // table_d = in ^ 2 (temporary use), t1 = in ^ 4
+  GF_sqr_s(table_d, in);
+  GF_sqr_s(t1, table_d);
+
+  // table_5 = in ^ 5
+  GF_mul_s(table_5, t1, in);
+  // table_6 = in ^ 6
+  GF_mul_s(table_6, table_5, in);
+  // table_a = in ^ 10 = (in ^ 5) ^ 2
+  GF_sqr_s(table_a, table_5);
+  // table_b = in ^ 11
+  GF_mul_s(table_b, table_a, in);
+  // table_d = in ^ 13 = (in ^ 11) * (in ^ 2)
+  GF_mul_s(table_d, table_b, table_d);
+
+  // t1 = in ^ (0xb0), then table_b = in ^ (0xb5) and t3 = in ^ (0xb6)
+  GF_sqr_s(t1, table_b);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(table_b, t1, table_5);  // in ^ 11 is not needed past this point
+  GF_mul_s(t3, t1, table_6);
+
+  // t2 = in ^ (0xb6 d), reused below as the 12-bit window b6d
+  GF_sqr_s(t1, t3);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t2, t1, table_d);
+
+  // t1 = in ^ (0xb6d b5): 8 squarings in total (1 + 7), then one multiplication
+  GF_sqr_s(t1, t2);
+  for (i = 1; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xb6db5 b6d)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xb6db5b6d a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xb6db5b6da d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6db5b6dad b6d)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xb6db5b6dadb6d a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xb6db5b6dadb6da d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6db5b6dadb6dad b6d)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d 6)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_6);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d6 d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d6d b6d)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d6db6d 6)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_6);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d6db6d6 d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d6db6d6d b6)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t3);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d6db6d6db6 b6d)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // out = in ^ (0xb6db5b6dadb6dadb6d6db6d6db6b6d b5)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(out, t1, table_b);
+}
+
+// Mersenne S-box with e_star = 3: out = in ^ (2 ^ 3 - 1) = in ^ 7
+void GF_exp_mer_e_star(GF out, const GF in)
+{
+  GF acc = {0,};
+
+  // acc = in ^ 2, then acc = in ^ 3
+  GF_sqr_s(acc, in);
+  GF_mul_s(acc, acc, in);
+
+  // acc = in ^ 6, then out = in ^ 7 (in is read before out is written)
+  GF_sqr_s(acc, acc);
+  GF_mul_s(out, acc, in);
+}
+
+void generate_matrices_L_and_U(
+        GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF vector_b,
+        const uint8_t iv[AIM2_IV_SIZE])
+{
+  // Absorbs iv into the hash, then squeezes one field element per matrix row.
+  // Row r keeps the squeezed bits at positions >= r in L and the bits at
+  // positions < r in U; bit r itself is forced to 1 in both matrices.
+  // The bytes squeezed after the last row become vector_b.
+  hash_instance xof;
+  uint8_t row_bytes[AIM2_NUM_BYTES_FIELD];
+  GF row_bits = {0,};
+
+  // absorb the iv
+  hash_init(&xof);
+  hash_update(&xof, iv, AIM2_IV_SIZE);
+  hash_final(&xof);
+
+  for (size_t box = 0; box < AIM2_NUM_INPUT_SBOX; box++)
+  {
+    for (size_t r = 0; r < AIM2_NUM_BITS_FIELD; r++)
+    {
+      const size_t diag = r / 64;
+      const uint64_t diag_bit = ((uint64_t)1) << (r % 64);
+      const uint64_t from_diag = ((uint64_t)-1) << (r % 64);
+      const uint64_t below_diag = ~from_diag;
+
+      // one fresh field element per row
+      hash_squeeze(&xof, row_bytes, AIM2_NUM_BYTES_FIELD);
+      GF_from_bytes(row_bits, row_bytes);
+
+      for (size_t w = 0; w < AIM2_NUM_WORDS_FIELD; w++)
+      {
+        // words before the diagonal word go to U, words after it go to L
+        matrix_L[box][r][w] = (w > diag) ? row_bits[w] : 0;
+        matrix_U[box][r][w] = (w < diag) ? row_bits[w] : 0;
+      }
+      // the diagonal word is split at the diagonal bit, which is set in both
+      matrix_L[box][r][diag] = (row_bits[diag] & from_diag) | diag_bit;
+      matrix_U[box][r][diag] = (row_bits[diag] & below_diag) | diag_bit;
+    }
+  }
+
+  // NOTE(review): vector_b receives the raw squeezed bytes (no GF_from_bytes),
+  // so its word values follow the host byte order - confirm this is intended.
+  hash_squeeze(&xof, (uint8_t *)vector_b, AIM2_NUM_BYTES_FIELD);
+  hash_ctx_release(&xof);
+}
+
+void generate_matrix_LU(GF matrix_A[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+                        GF vector_b,
+                        const uint8_t iv[AIM2_IV_SIZE])
+{
+  // Derives L, U and vector_b from iv, then fills every row of matrix_A with
+  // GF_transposed_matmul(row of U, L) for the same S-box input.
+  GF lower[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF upper[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  generate_matrices_L_and_U(lower, upper, vector_b, iv);
+  for (size_t box = 0; box < AIM2_NUM_INPUT_SBOX; box++)
+  {
+    const GF *lower_rows = (const GF *)lower[box];
+    for (size_t r = 0; r < AIM2_NUM_BITS_FIELD; r++)
+    {
+      GF_transposed_matmul(matrix_A[box][r], upper[box][r], lower_rows);
+    }
+  }
+}
+
+void aim2(uint8_t ct[AIM2_NUM_BYTES_FIELD],
+          const uint8_t pt[AIM2_NUM_BYTES_FIELD],
+          const uint8_t iv[AIM2_IV_SIZE])
+{
+  // Evaluates AIM2 on pt under iv: two inverse Mersenne S-box branches on
+  // pt + constant, the iv-derived affine layer, one Mersenne S-box and a
+  // final addition of pt (feed-forward). The result is serialized into ct.
+  GF lower[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF upper[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF offset = {0,};
+  GF branch[AIM2_NUM_INPUT_SBOX];
+  GF pt_elem = {0,}, ct_elem = {0,};
+
+  GF_from_bytes(pt_elem, pt);
+
+  // iv-derived affine layer: per-branch matrices L and U plus the vector b
+  generate_matrices_L_and_U(lower, upper, offset, iv);
+
+  // branch 0: constant addition, inverse Mersenne S-box (e1), then U and L
+  GF_add(branch[0], pt_elem, aim2_constants[0]);
+  GF_exp_invmer_e_1(branch[0], branch[0]);
+  GF_transposed_matmul(branch[0], branch[0], (const GF *)upper[0]);
+  GF_transposed_matmul(branch[0], branch[0], (const GF *)lower[0]);
+
+  // branch 1: constant addition, inverse Mersenne S-box (e2), then U and L
+  GF_add(branch[1], pt_elem, aim2_constants[1]);
+  GF_exp_invmer_e_2(branch[1], branch[1]);
+  GF_transposed_matmul(branch[1], branch[1], (const GF *)upper[1]);
+  GF_transposed_matmul(branch[1], branch[1], (const GF *)lower[1]);
+
+  // merge the two branches and add the vector b
+  GF_add(branch[0], branch[0], branch[1]);
+  GF_add(branch[0], branch[0], offset);
+
+  // Mersenne S-box, then feed-forward of the plaintext
+  GF_exp_mer_e_star(branch[0], branch[0]);
+  GF_add(ct_elem, branch[0], pt_elem);
+
+  // serialize the result
+  GF_to_bytes(ct, ct_elem);
+}
+
+void aim2_sbox_outputs(GF sbox_outputs[AIM2_NUM_INPUT_SBOX], const GF pt)
+{
+  // constant addition for every branch, then each inverse Mersenne S-box
+  for (size_t k = 0; k < AIM2_NUM_INPUT_SBOX; k++)
+  {
+    GF_add(sbox_outputs[k], pt, aim2_constants[k]);
+  }
+  GF_exp_invmer_e_1(sbox_outputs[0], sbox_outputs[0]);
+  GF_exp_invmer_e_2(sbox_outputs[1], sbox_outputs[1]);
+}
diff --git a/crypto_sign/aimer128f/m4speed/aim2.h b/crypto_sign/aimer128f/m4speed/aim2.h
new file mode 100644
index 00000000..5564fc71
--- /dev/null
+++ b/crypto_sign/aimer128f/m4speed/aim2.h
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef AIM2_H
+#define AIM2_H
+
+#include "field.h"
+#include "params.h"
+#include <stdint.h>
+
+static const GF aim2_constants[AIM2_NUM_INPUT_SBOX] = // aim2.c adds entry k to pt ahead of inverse S-box k
+{ // each entry is {low word, high word}; NOTE(review): values look like the hex digits of pi - confirm
+  {0x13198a2e03707344, 0x243f6a8885a308d3},
+  {0x082efa98ec4e6c89, 0xa4093822299f31d0},
+};
+
+static const GF aim2_e1_power_matrix[AIM2_NUM_BITS_FIELD] =
+{ // NOTE(review): not referenced in aim2.c; presumably row i is (x ^ i) ^ (2 ^ 49) for use with GF_transposed_matmul - confirm
+  {0x0000000000000001, 0x0000000000000000},
+  {0xb87c1159421de6c0, 0xfbcf8c1e442c8cf5},
+  {0x687634c0bd8f66a6, 0x4d328e5ae8b1bde5},
+  {0x742a6036d93c2057, 0x08974511b147a2fe},
+  {0xc8b21bf16608e4db, 0x4d758c29eeb484f7},
+  {0x0b5c6d5c43980a3c, 0x82739c986dfbdb20},
+  {0x0ace7f98da3711b9, 0x34f149a76cf782b0},
+  {0x321995ec53ea9914, 0xc2ff5007f8a98c83},
+  {0x939b53119c4b7496, 0x097da6d2e8f7686d},
+  {0x5fb6dd3ca90cff95, 0x10f77bb9e7748ed3},
+  {0x55194932141d0937, 0xc253f8ea7ac0779a},
+  {0xb2a4b4591251916b, 0xdfef8e3e1b142c07},
+  {0x14df24dfc33e1f4f, 0x931f7bdb443197a1},
+  {0xbd4cbe8b919dbb07, 0x24128da6bf057bc8},
+  {0x1be6a922a8d0d7d4, 0xb7330162b6115e90},
+  {0xb6d9e6635ec916aa, 0x930f20cea1c668e0},
+  {0xccbb31a458da0423, 0x60488351c7403436},
+  {0xef86b4dbc4263e4d, 0x9237f55823767eae},
+  {0xe2a0e301bed0748a, 0x967e64f599297f3c},
+  {0x2fde9314f05105e5, 0x58f5315e0e29e358},
+  {0xc9e5b15be18b7596, 0xa305f4f11aaa8ad2},
+  {0xa592cb3563071925, 0x31b050cca997ed24},
+  {0xa55f9e7374b10af1, 0x5904c31aaebea1ed},
+  {0xcf6921d88d12bbf2, 0xea5142776b77d368},
+  {0x28779ef24c9ddcb5, 0x448bfd74cc624506},
+  {0x0d2caf1924759d9a, 0xc66ef14828e98e80},
+  {0x312a49ac8d3790db, 0x5121956dac40960a},
+  {0x311230a0f0166f37, 0x41cdda4642d1e45a},
+  {0x152cd68f8d980779, 0x50accd8f44cc6a3b},
+  {0x0e6342e6e178a202, 0xaf2e59b6e13fec01},
+  {0xcdfaea274cfff823, 0x008f7a68483d8f8a},
+  {0x80183f4571309485, 0xdece92499f9521ba},
+  {0xaba321469362905f, 0x3c5814a4c792b3be},
+  {0x7e8680766e1d3ffc, 0x7585a167f0b843b8},
+  {0x4e81e572c5dbf79a, 0x114bd1d466ef8787},
+  {0x3a7e0a403a1da600, 0x014747267c0b38f9},
+  {0x23116c4dd539e293, 0x196284a6305e23d7},
+  {0xf0a02f00d5a45c0f, 0xae9980fc3aa3cd2e},
+  {0x7eae2c6dae8286e9, 0xd2be72a1da8addbc},
+  {0xbb8689cb630a9e23, 0x2d1eb9e86163e7f4},
+  {0xf0febfb8f6e46561, 0x8eda5ccd665a3ac2},
+  {0x370a6880719f8be9, 0x83fe14fe68c33df0},
+  {0xe9634dd58474116f, 0xdfb51a0ca76c9c82},
+  {0x9c40da32ca69fe52, 0xcecdf64c8559eef4},
+  {0xe29f358edce8d40e, 0x9256190cf3cfb1fa},
+  {0xb5431f672597e9cd, 0xc69025ae5a99210a},
+  {0x0f00e0c670d40d95, 0xdf81e3ce7617b0a1},
+  {0x699332d099ea38d7, 0xc24d5671c235f28e},
+  {0x89ea2f4529a74b45, 0x7c11f6654369b65d},
+  {0xeaa8470e44915e89, 0x049b62170967135b},
+  {0x39fb9877aadc951b, 0xba3743d76fda5083},
+  {0xe2da8722532e6fbb, 0xdef2a5ff6e028aba},
+  {0xb5e975340c6c76a1, 0xf28418e25fbc0144},
+  {0x035ab9363f6882be, 0xab56f227d4a26a26},
+  {0x273536b8b02dd5f1, 0x75af981a11d43e64},
+  {0x846e480a8bc44fa9, 0x507a048207335fa1},
+  {0x3808d8fa4fcba922, 0xf632f1c9c802ab76},
+  {0x34ecb7872eda1962, 0x2dcfedd3c12f73be},
+  {0xf884a540c1b411ea, 0xf77d23a1c6600553},
+  {0x0e106a0239843e3c, 0x7d5ef83763344eed},
+  {0x4192e743be4ae7d9, 0x5070be659c9249dd},
+  {0x6588c07b62dd03ba, 0x09d7b6469e953856},
+  {0x790b4af55db42c92, 0x5c859acd40414177},
+  {0xedda860c739ca8f7, 0xd728f7e92e3e7940},
+  {0xfbcf513b18b860f7, 0xf6fd92c58b52c44d},
+  {0x4f1571762119854e, 0x04286d00eb347197},
+  {0x3f777b9977ed2aa6, 0xf68288c09c8d73d4},
+  {0x538b16a3bd887a20, 0x86437c4cb491c94b},
+  {0x3656d64f9fdf8baf, 0x97db137363bf2a7e},
+  {0x0582fbdad31a1e6c, 0x213b4a759760ffe0},
+  {0xc7f42208feff0a47, 0x05cb6fb77aad0666},
+  {0x8f59c644fd5259d4, 0xd3740dabc91a5ecd},
+  {0xca19d9ef4ab67cee, 0xa2486f3cdc03c63f},
+  {0x8a1f14a7c3d2f88f, 0x71b6e4a0b3d4a2a0},
+  {0xe9ee9aa288652690, 0xa28d2266c47e02b2},
+  {0x759c7eee1a3eead3, 0x689aa81596670031},
+  {0x50a9a3f15e0032ae, 0x206b34f2ed6fc8ff},
+  {0x630774b85c40302e, 0xf7f5952347d531a6},
+  {0x78886ef4e794267b, 0x7072ec9b3a2ddd8f},
+  {0x754c7bf46deec1a2, 0xb360d5ec03ebf053},
+  {0x337080ceace4b67a, 0xbe8541809bccdc7d},
+  {0x8c243c5d486009a0, 0x87fc6f3fbe554f61},
+  {0x58e8f3ccf2596f26, 0xc7a500e89b1b40a6},
+  {0x516a6cbee9e76420, 0xe719cb9a5a49f8ed},
+  {0x96f150816f90c216, 0x484947f2b48d7882},
+  {0xdefb92978dfa0053, 0x58823337d6c0a641},
+  {0x98bbc22dd2d3262b, 0xdad5891c70205c95},
+  {0xbf1d06e5edc7d9ba, 0xea3e0a86c4241c1e},
+  {0x78e2cf480abc18ef, 0x1110bc39a35669cf},
+  {0xc188299c1375e7b2, 0x8eb4cf8cb0851480},
+  {0xd0ec275048c667d2, 0xff5c57071581e3b1},
+  {0x955c8d54a50fdd52, 0xcf79008ac79991d3},
+  {0xf46cdcd85b7289c9, 0x1c5fc0acfab2cbb2},
+  {0x676f48ac3ed3c825, 0x862183d1a9042f4d},
+  {0xf35fc7982c7daee6, 0xa655183af862baae},
+  {0x5335bbcaf8b9f37b, 0x963ed04a2a0b3eaa},
+  {0x76d009714121cb10, 0x82f1d3e8253374ee},
+  {0x50198339f3198270, 0xee023bd013e359f5},
+  {0x315d27ea94c7941a, 0x5c1520117e098dbf},
+  {0x96ccc513ba987df7, 0x7d84bbe2e504ff94},
+  {0x03464584b630d2b7, 0x7d9fc4a633f228f4},
+  {0x7e39cbb756cac943, 0x45a5498048f1a474},
+  {0x56a90669f7aa29c6, 0x4883787b94c90425},
+  {0x9a262b27cb8de6e9, 0x6495beb53f905401},
+  {0xdc5866e0159b2920, 0x6c2c9c31b3faab04},
+  {0x82f93c693fec7b5f, 0x1926807fb1c2bdd5},
+  {0x3a06ca560fda4251, 0xff56ec036c5f13d6},
+  {0xcf96fe4ae095a1c3, 0xaea98fd960fd6b9d},
+  {0xc2ae3b23e1b73447, 0xe7c1f21b63d4e19c},
+  {0x660f92196e62044c, 0xa61e4689ac8893c0},
+  {0x4aacc983cc5d9cfe, 0xb71adc881811c258},
+  {0xb01938e5f92ea2e0, 0x3d4b38fea83810f8},
+  {0x8195527abb10f039, 0x242e99777aeec42a},
+  {0x077a36f6536baf7f, 0x928620c22f148a6d},
+  {0xb4d16665e8f965a6, 0x300ecf50c00b75a7},
+  {0x53d4fbf144350d5c, 0x50967628985e6eaf},
+  {0xea67291009feb48e, 0x74a182255aa9ccae},
+  {0xe67c52e63c97fb3a, 0xbe1b4991d245fa61},
+  {0x6bd8d3685ed38551, 0xc26bdd871e8691e5},
+  {0x267c4e3df39e0a7e, 0x2408058c7b3e3c09},
+  {0x2bc55550057b4b4e, 0xe70baa2724b374d3},
+  {0x0e2984947284c4dd, 0x4f4e64ba26bfee68},
+  {0x78891ea4bacdb828, 0x357f7d8801646f08},
+  {0x220a9cb569d1ee6b, 0x8e6c9653552802fa},
+  {0x6159359f74dda4d7, 0xcbd0c89374b1cc2e},
+  {0x8dd5a4c4fe55c89f, 0xeeca37f94d3f69bd},
+  {0x22abf1f68e0f314a, 0x69b86caf61d48d15},
+  {0xab26c59f1090455d, 0x1a49957d9798f177},
+};
+
+static const GF aim2_e2_power_matrix[AIM2_NUM_BITS_FIELD] =
+{ // NOTE(review): not referenced in aim2.c; presumably row i is (x ^ i) ^ (2 ^ 91) for use with GF_transposed_matmul - confirm
+  {0x0000000000000001, 0x0000000000000000},
+  {0xf50e0632f2a35f5b, 0x386db41096f62a8a},
+  {0x1843656b2ea8f397, 0xefdb454053648225},
+  {0xfc670d9cf3feeb63, 0x7582326d84c7a1de},
+  {0xf1c52011971b40b3, 0x864204566cee644d},
+  {0x5d8e354c13ae648b, 0x192b28f22b444709},
+  {0x9d5cef9c88eb0d9f, 0xb686d60b99470446},
+  {0xc91fa3a9b726fd99, 0xcf7a6d254a105b09},
+  {0x048e86e374780c55, 0x9f65220d0c78fc67},
+  {0xafa9c90017000acc, 0x83a4540ded360993},
+  {0x3e563c2c6efb6102, 0xb7147f0d38fa394e},
+  {0x858e694ad98264cb, 0x184d72cdc205efdb},
+  {0x260f2eae08292a50, 0x101cdb156939622a},
+  {0x4a9a43781e99484d, 0x8b9b7c41b6c639f6},
+  {0x16c9831c810a7459, 0xcb60c983013050be},
+  {0x96d02af1b8d2cba8, 0xb37b4c2c6ea27c34},
+  {0x3caadfab02ea679d, 0x6c3124a15e087d32},
+  {0xf0892e59955b87ae, 0xaab1aa69ba6853d8},
+  {0x8420916c212205ac, 0x86ed9039af31291c},
+  {0x0610fd444421f178, 0xa6b004a839e31b64},
+  {0xaebf5d9bae4e4ac1, 0x54bf9e6ec57b2d65},
+  {0x28bce750ebcba70e, 0x4ce04f578ca77d4d},
+  {0xe35d48d89312441e, 0xe6d91969fd74895a},
+  {0xcca901ef7fabb1c5, 0x117d2c0c4032a05b},
+  {0x4d05be0c6a5a2edc, 0x8314aecc100fcba9},
+  {0x7c685f4133a51825, 0x9acd72f51105c28b},
+  {0x5011fb2faa2c215a, 0xf33e2515d2bd65e4},
+  {0xcec542879e66d1d0, 0xb35dca22a0c3ce97},
+  {0x40849b4ce23375b2, 0x92453c68d163c3cb},
+  {0x807af8ab827e3617, 0x9aa0b258c13e1db7},
+  {0x02cf8f1292f7c659, 0x188599535df660bb},
+  {0x675c7dfe865c4b21, 0x60e7e01162356b69},
+  {0xdca8758ed620dd7b, 0x40e2dfc1450698ca},
+  {0xd4785af596fd0c85, 0x194dcdf10572a8d6},
+  {0x39c75c8db5a743fd, 0xaaff1be5fb825c25},
+  {0x76f287eaaf80a26e, 0x6d5c3d924e633b50},
+  {0xf3289f813d56fa87, 0x8a5781160603ae34},
+  {0x023097d7bf57b560, 0x5c09da41ceda1dab},
+  {0xaa7caa9af1506059, 0xd65b5a005d02edd2},
+  {0x837d13e5bec17d5c, 0x96732ad7e569d594},
+  {0xfca9d80d257930ce, 0xbb07355f7df706c0},
+  {0x7e719f925352363e, 0x61f17c3d17da7386},
+  {0xbdd686a4862a5d5b, 0x5ddbe9580f36ecca},
+  {0xcd8440580a8cb347, 0x8b395b802547e6d8},
+  {0x4338e255f15fc0d9, 0xf2400716b60d1c2c},
+  {0xc0a4a5181cf7a401, 0x208e7b27a3d4e578},
+  {0x6557dd7a9909844a, 0xd7dd867435b17ded},
+  {0xe7214501f52038cd, 0xf73bfe485cf7fdd0},
+  {0x93443a46972cbc70, 0xd2ca8f42b2d199e0},
+  {0xbea25cda0a9de799, 0x51886f07950aef32},
+  {0x82824ccfb37df72d, 0x71a58d7df86233f6},
+  {0x0ab442c2423ac6e3, 0x5d989eeb2df819bb},
+  {0x717b766d60dda065, 0x3899b1af41b28b8b},
+  {0x2fffad98c8e94310, 0x9ff893980c381280},
+  {0x9d7da6a6ca8c0d82, 0x09c78e0f83da5e2a},
+  {0x26b7e85d55753566, 0x48b0fee439062128},
+  {0x63896bb7a7a3c638, 0x551438e5f3ff5db9},
+  {0x080d9af5ef2e5865, 0x048eccc1b914ae50},
+  {0xf081a5f8ab004099, 0x24ffc9670c5492ac},
+  {0x7e4178c2bf375b5a, 0xa641e4982d1c8638},
+  {0x9f1874733c37691a, 0xa6e59883261af497},
+  {0x90068f05a814992d, 0x8f340c2ecb9a2bd2},
+  {0x2e0a82ad5f144c70, 0x783eb790b951d2d9},
+  {0xbf58c52a82e24af6, 0x190f49c97cd133af},
+  {0x29e30e4d37b882a1, 0x217bea750913f0db},
+  {0xfe2287c403984038, 0x870bd9dd397e696e},
+  {0x49e9bc6efdb97d7d, 0xf75f4c5e88587e96},
+  {0xa6223b70299d2836, 0xf27661ea227ab61b},
+  {0x4d6b8601ceb750cf, 0xfab6503eb520e48d},
+  {0xcaf2dd4a73f67c6c, 0x93f3baaf44fed4e0},
+  {0x1ff32e99fc57e662, 0x502b8bb6f2031150},
+  {0x1d8b5656e3d694dc, 0xb31de0d77f80372b},
+  {0x0f3d13aca2eac302, 0xb6d1f98a81d2cd6d},
+  {0x840d8615c90887b8, 0x1d44fc5efe63c574},
+  {0xded005c9eb05ef63, 0xdeb4246e55c121bb},
+  {0x3409b8d1c43c2415, 0x700c0d1dc307fe8d},
+  {0x8b361337911e3002, 0x7920c1039098414c},
+  {0xa5dddabdd1beecf5, 0x146aaf12b0d6da5e},
+  {0xe1a91d6f2a874e47, 0x0d63fcc83ef069d1},
+  {0x0ffd9177c1f3ebb2, 0x9a0cadce706c0cc0},
+  {0xc60d34aa0f45f13d, 0x2d0b4ea8c2bfdc70},
+  {0x83e36503d6399610, 0x6014c0c7cba2d2f2},
+  {0x9cc705d2ecaeca0e, 0x79f83e8c83e7f333},
+  {0x58c7035772444ccc, 0x789c6687005b995e},
+  {0x6b3d950394c886a1, 0x9b4f4564cd5b92d0},
+  {0x872c7f29b6dae6ca, 0xd2a320a97a0d0be9},
+  {0x14bb3a90e34016b2, 0xb308fa5fc47ad142},
+  {0xc6b31a14ce574546, 0xd7f758f96323f56e},
+  {0x046d3862feb271a0, 0x391175405eef9c5e},
+  {0xf7654c3e98aff433, 0x92b8d607c0180e5e},
+  {0xdfe26a4ee0edcbcf, 0x4c21afb68c481788},
+  {0xb9175aa38699a7fa, 0xa26d3569fb705b0a},
+  {0xd2955bcc820c812c, 0x29d30f039b37f636},
+  {0x37d8c59743ebdc8d, 0x19289d7baab847bc},
+  {0x8b8a25c0075e7200, 0x75fcbc7110b551c9},
+  {0x8ab2318dd48eb686, 0xca8ee9edf4a5a1e0},
+  {0x182033f6233cad5f, 0x743083edee67622b},
+  {0xc82b0364e7db3d93, 0x3cec89a9bc59587d},
+  {0x4fb362a6d33cdc65, 0xb2f2a5ce567b5b8e},
+  {0x90df4043911d6152, 0xfe9e1ef68cc145b2},
+  {0x4fcf7b4fcca5200b, 0xaba094d2f96d9249},
+  {0x5ac887c31fc3fd76, 0x1845172174cf2944},
+  {0x25180f84f6702866, 0xde5223f17c83df5d},
+  {0x2863a5b3ae30cdea, 0x610fc2ae8f7cfc74},
+  {0x64a4086ca77af644, 0xafe073214eb0e372},
+  {0xbdc97dadac10ab50, 0x97cf31c3dfa3a7ad},
+  {0x79f2ee819538d167, 0x68555fb401eb2780},
+  {0x72e2b904d5c7a7ed, 0x482326aa3e165b1c},
+  {0x92f65484dcff7fd8, 0x603faf9bafb86f1e},
+  {0x210e7817fff07876, 0xabdf6d8a0dd6d8a5},
+  {0xab561f7f19942dec, 0x55f71e3e54c7b523},
+  {0x8e7140a742fb2245, 0x34a49c54b5ad70ec},
+  {0x6da544268e007b3f, 0xebf2cf33aeaee1c9},
+  {0x010679622fe3753e, 0x40228d2a0d402ed0},
+  {0x2e128b07e6e4e311, 0x0811ebd4d8dde5b5},
+  {0x126cb02cee9ad020, 0x398e5321decfb79c},
+  {0x6dfdff51553fb5ff, 0x415b4003d55c33ab},
+  {0xd3b7fedc1cd8ab6e, 0x49dc7b6033f0ae60},
+  {0x7062ab84db2bbaed, 0xc33060adb11136c6},
+  {0xae149ced6b9cc3d3, 0xef2f29a2ebe433ce},
+  {0x133ca1e237105dc6, 0x9712a59673f1d79c},
+  {0xfcf98569ab4ec844, 0x6a40dd9e8d49194e},
+  {0xd73a65ce7e33212a, 0xaa29936469e73794},
+  {0x961009e50707fe21, 0x657c63ec063d9f23},
+  {0x6b1af6be25650671, 0xce96b0cb11ce0372},
+  {0xc7312488beda3b54, 0x9ee42f2347f50335},
+  {0x829d638189fca23f, 0xe3123a63017f9509},
+  {0xbb40cef8e0e85cea, 0xd8b3a76799622f49},
+};
+
+#define GF_exp_invmer_e_1 AIMER_NAMESPACE(GF_exp_invmer_e_1)
+void GF_exp_invmer_e_1(GF out, const GF in);  // inverse Mersenne S-box, e1 = 49
+#define GF_exp_invmer_e_2 AIMER_NAMESPACE(GF_exp_invmer_e_2)
+void GF_exp_invmer_e_2(GF out, const GF in);  // inverse Mersenne S-box, e2 = 91
+#define GF_exp_mer_e_star AIMER_NAMESPACE(GF_exp_mer_e_star)
+void GF_exp_mer_e_star(GF out, const GF in);  // out = in ^ (2 ^ 3 - 1)
+// Squeezes matrix_L and matrix_U (one pair per S-box input) and vector_b from iv.
+#define generate_matrices_L_and_U AIMER_NAMESPACE(generate_matrices_L_and_U)
+void generate_matrices_L_and_U(
+        GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF vector_b,
+        const uint8_t iv[AIM2_IV_SIZE]);
+// Same derivation, but each U row is combined with L (GF_transposed_matmul) into matrix_A.
+#define generate_matrix_LU AIMER_NAMESPACE(generate_matrix_LU)
+void generate_matrix_LU(GF matrix_A[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+                        GF vector_b,
+                        const uint8_t iv[AIM2_IV_SIZE]);
+// Writes InvMer_e1(pt + constant 0) and InvMer_e2(pt + constant 1) into sbox_outputs.
+#define aim2_sbox_outputs AIMER_NAMESPACE(aim2_sbox_outputs)
+void aim2_sbox_outputs(GF sbox_outputs[AIM2_NUM_INPUT_SBOX], const GF pt);
+// Evaluates AIM2 on pt under iv and serializes the result into ct.
+#define aim2 AIMER_NAMESPACE(aim2)
+void aim2(uint8_t ct[AIM2_NUM_BYTES_FIELD],
+          const uint8_t pt[AIM2_NUM_BYTES_FIELD],
+          const uint8_t iv[AIM2_IV_SIZE]);
+
+#endif // AIM2_H
diff --git a/crypto_sign/aimer128f/m4speed/api.h b/crypto_sign/aimer128f/m4speed/api.h
new file mode 100644
index 00000000..50b6558d
--- /dev/null
+++ b/crypto_sign/aimer128f/m4speed/api.h
@@ -0,0 +1,44 @@
+/*
+NIST-developed software is provided by NIST as a public service. You may use, copy, and distribute copies of the software in any medium, provided that you keep intact this entire notice. You may improve, modify, and create derivative works of the software or any portion of the software, and you may copy and distribute such modifications or works. Modified works should carry a notice stating that you changed the software and should note the date and nature of any such change. Please explicitly acknowledge the National Institute of Standards and Technology as the source of the software.
+ 
+NIST-developed software is expressly provided "AS IS." NIST MAKES NO WARRANTY OF ANY KIND, EXPRESS, IMPLIED, IN FACT, OR ARISING BY OPERATION OF LAW, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND DATA ACCURACY. NIST NEITHER REPRESENTS NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST DOES NOT WARRANT OR MAKE ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE OR THE RESULTS THEREOF, INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY, RELIABILITY, OR USEFULNESS OF THE SOFTWARE.
+ 
+You are solely responsible for determining the appropriateness of using and distributing the software and you assume all risks associated with its use, including but not limited to the risks and costs of program errors, compliance with applicable laws, damage to or loss of data, programs or equipment, and the unavailability or interruption of operation. This software is not intended to be used in any situation where a failure could cause risk of injury or damage to property. The software developed by NIST employees is not subject to copyright protection within the United States.
+*/
+
+#ifndef API_H
+#define API_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define CRYPTO_PUBLICKEYBYTES 32
+#define CRYPTO_SECRETKEYBYTES 48
+#define CRYPTO_BYTES 5888
+#define CRYPTO_ALGNAME "aimer128f"
+// NOTE(review): sizes above are presumably byte counts (NIST API); entry points below are renamed via AIMER_NAMESPACE.
+#define crypto_sign_keypair AIMER_NAMESPACE(crypto_sign_keypair)
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+
+#define crypto_sign_signature AIMER_NAMESPACE(crypto_sign_signature)
+int crypto_sign_signature(uint8_t *sig, size_t *siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk);
+
+#define crypto_sign AIMER_NAMESPACE(crypto_sign)
+int crypto_sign(uint8_t *sm, size_t *smlen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk);
+
+#define crypto_sign_verify AIMER_NAMESPACE(crypto_sign_verify)
+int crypto_sign_verify(const uint8_t *sig, size_t siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *pk);
+
+#define crypto_sign_open AIMER_NAMESPACE(crypto_sign_open)
+int crypto_sign_open(uint8_t *m, size_t *mlen,
+        const uint8_t *sm, size_t smlen,
+        const uint8_t *pk);
+
+#endif
diff --git a/crypto_sign/aimer128f/m4speed/field.c b/crypto_sign/aimer128f/m4speed/field.c
new file mode 100644
index 00000000..1e12b447
--- /dev/null
+++ b/crypto_sign/aimer128f/m4speed/field.c
@@ -0,0 +1,406 @@
+// SPDX-License-Identifier: MIT
+
+#include "field.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define mask_64     0x00000000ffffffff  // low 32 bits of a 64-bit word
+#define mask_final  0x000000000000ffff  // the 16 packed bits left by inv_zero_padding
+// maskK / maskK_64: bits kept after each spreading step (32-bit / 64-bit form)
+#define mask0_64    0x000000ff000000ff
+#define mask0       0x000000ff
+// mask1: groups of 4 bits, one group per 16 bits
+#define mask1_64    0x000f000f000f000f
+#define mask1       0x000f000f
+// mask2: groups of 2 bits, one group per byte
+#define mask2_64    0x0303030303030303
+#define mask2       0x03030303
+// mask3: single bits, one per nibble
+#define mask3_64    0x1111111111111111
+#define mask3       0x11111111
+// Spreads the low 8 bits of x0: bit i moves to bit 4*i. One statement via do/while(0).
+#define zero_padding(x0, mask1, mask2, mask3) do { \
+        (x0) = ((x0) | ((x0) << 12)) & (mask1); \
+        (x0) = ((x0) | ((x0) << 6 )) & (mask2); \
+        (x0) = ((x0) | ((x0) << 3 )) & (mask3); } while (0)
+// Inverse: gathers bit 4*i (i = 0..15) of x0 into bit i. One statement via do/while(0).
+#define inv_zero_padding(x0, mask0, mask1, mask2, mask_final) do { \
+        (x0) = ((x0) | ((x0) >> 3 )) & (mask2); \
+        (x0) = ((x0) | ((x0) >> 6 )) & (mask1); \
+        (x0) = ((x0) | ((x0) >> 12)) & (mask0); \
+        (x0) = ((x0) | ((x0) >> 24)) & (mask_final); } while (0)
+
+void GF_mul(GF c, const GF a, const GF b)
+{
+  // c = a * b in GF(2^128): three poly64_mul products combined Karatsuba-style,
+  // then reduced by x^128 + x^7 + x^2 + x + 1 (the 7/2/1 shifts below).
+  uint64_t lo[2] = {0}, hi[2] = {0}, mid[2] = {0};
+  const uint64_t a_sum = a[0] ^ a[1];
+  const uint64_t b_sum = b[0] ^ b[1];
+
+  poly64_mul(&a[1], &b[1], &hi[1], &hi[0]);
+  poly64_mul(&a[0], &b[0], &lo[1], &lo[0]);
+  poly64_mul(&a_sum, &b_sum, &mid[1], &mid[0]);
+
+  // words of the 256-bit product, w0 least significant
+  const uint64_t w0 = lo[0];
+  const uint64_t w1 = lo[1] ^ mid[0] ^ lo[0] ^ hi[0];
+  const uint64_t w3 = hi[1];
+  const uint64_t w2 = mid[0] ^ mid[1] ^ lo[0] ^ w1 ^ w3;
+
+  // folding w3 shifts its top bits into the w2 position; merge them into w2 first
+  const uint64_t fold = w2 ^ (w3 >> 57) ^ (w3 >> 62) ^ (w3 >> 63);
+
+  const uint64_t out_hi = w1 ^ w3 ^ ((w3 << 7) | (fold >> 57))
+                          ^ ((w3 << 2) | (fold >> 62))
+                          ^ ((w3 << 1) | (fold >> 63));
+  const uint64_t out_lo = w0 ^ fold ^ (fold << 7) ^ (fold << 2) ^ (fold << 1);
+  c[1] = out_hi;
+  c[0] = out_lo;
+}
+
+void GF_mul_add(GF c, const GF a, const GF b)
+{
+  // c ^= a * b in GF(2^128): Karatsuba over three poly64_mul products and a
+  // reduction by x^128 + x^7 + x^2 + x + 1; the result is accumulated into c.
+  uint64_t lo[2] = {0}, hi[2] = {0}, mid[2] = {0};
+  const uint64_t a_sum = a[0] ^ a[1];
+  const uint64_t b_sum = b[0] ^ b[1];
+
+  poly64_mul(&a[1], &b[1], &hi[1], &hi[0]);
+  poly64_mul(&a[0], &b[0], &lo[1], &lo[0]);
+  poly64_mul(&a_sum, &b_sum, &mid[1], &mid[0]);
+
+  // Karatsuba recombination into product words w3:w2:w1:w0
+  const uint64_t w0 = lo[0];
+  const uint64_t w1 = lo[1] ^ mid[0] ^ lo[0] ^ hi[0];
+  const uint64_t w3 = hi[1];
+  const uint64_t w2 = mid[0] ^ mid[1] ^ lo[0] ^ w1 ^ w3;
+
+  // the top bits of w3 spill into the w2 position when folded; merge them first
+  const uint64_t fold = w2 ^ (w3 >> 57) ^ (w3 >> 62) ^ (w3 >> 63);
+
+  // accumulate the reduced high word, then the reduced low word
+  c[1] ^= w1 ^ w3 ^ ((w3 << 7) | (fold >> 57))
+          ^ ((w3 << 2) | (fold >> 62))
+          ^ ((w3 << 1) | (fold >> 63));
+
+  c[0] ^= w0 ^ fold ^ (fold << 7) ^ (fold << 2) ^ (fold << 1);
+}
+
+void GF_transposed_matmul(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD])
+{
+  const uint64_t *a_ptr = a;
+  const GF *b_ptr = b;
+  // c = XOR of the rows b[k] for every bit k that is set in a (bit 0 of a[0] selects b[0]).
+  uint64_t temp_c0 = 0;
+  uint64_t temp_c1 = 0;
+  uint64_t mask;
+  for (size_t i = AIM2_NUM_WORDS_FIELD; i; --i, ++a_ptr)
+  {
+    uint64_t index = *a_ptr; // current word of a; 8 bits are consumed per inner pass
+    for (size_t j = AIM2_NUM_BITS_WORD; j; j -= 8, index >>= 8, b_ptr += 8)
+    {
+      mask = -(index & 1); // all ones when the bit is set, zero otherwise (no branch)
+      temp_c0 ^= (b_ptr[0][0] & mask);
+      temp_c1 ^= (b_ptr[0][1] & mask);
+
+      mask = -((index >> 1) & 1);
+      temp_c0 ^= (b_ptr[1][0] & mask);
+      temp_c1 ^= (b_ptr[1][1] & mask);
+
+      mask = -((index >> 2) & 1);
+      temp_c0 ^= (b_ptr[2][0] & mask);
+      temp_c1 ^= (b_ptr[2][1] & mask);
+
+      mask = -((index >> 3) & 1);
+      temp_c0 ^= (b_ptr[3][0] & mask);
+      temp_c1 ^= (b_ptr[3][1] & mask);
+
+      mask = -((index >> 4) & 1);
+      temp_c0 ^= (b_ptr[4][0] & mask);
+      temp_c1 ^= (b_ptr[4][1] & mask);
+
+      mask = -((index >> 5) & 1);
+      temp_c0 ^= (b_ptr[5][0] & mask);
+      temp_c1 ^= (b_ptr[5][1] & mask);
+
+      mask = -((index >> 6) & 1);
+      temp_c0 ^= (b_ptr[6][0] & mask);
+      temp_c1 ^= (b_ptr[6][1] & mask);
+
+      mask = -((index >> 7) & 1);
+      temp_c0 ^= (b_ptr[7][0] & mask);
+      temp_c1 ^= (b_ptr[7][1] & mask);
+    }
+  }
+  c[0] = temp_c0; // c is written only here, after a and b have been fully read
+  c[1] = temp_c1;
+}
+
+void GF_transposed_matmul_add(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD])
+{
+  const uint64_t *a_ptr = a;
+  const GF *b_ptr = b;
+  // c ^= XOR of the rows b[k] for every bit k that is set in a (accumulating variant).
+  uint64_t temp_c0 = 0;
+  uint64_t temp_c1 = 0;
+  uint64_t mask;
+  for (size_t i = AIM2_NUM_WORDS_FIELD; i; --i, ++a_ptr)
+  {
+    uint64_t index = *a_ptr; // current word of a; 8 bits are consumed per inner pass
+    for (size_t j = AIM2_NUM_BITS_WORD; j; j -= 8, index >>= 8, b_ptr += 8)
+    {
+      mask = -(index & 1); // all ones when the bit is set, zero otherwise (no branch)
+      temp_c0 ^= (b_ptr[0][0] & mask);
+      temp_c1 ^= (b_ptr[0][1] & mask);
+
+      mask = -((index >> 1) & 1);
+      temp_c0 ^= (b_ptr[1][0] & mask);
+      temp_c1 ^= (b_ptr[1][1] & mask);
+
+      mask = -((index >> 2) & 1);
+      temp_c0 ^= (b_ptr[2][0] & mask);
+      temp_c1 ^= (b_ptr[2][1] & mask);
+
+      mask = -((index >> 3) & 1);
+      temp_c0 ^= (b_ptr[3][0] & mask);
+      temp_c1 ^= (b_ptr[3][1] & mask);
+
+      mask = -((index >> 4) & 1);
+      temp_c0 ^= (b_ptr[4][0] & mask);
+      temp_c1 ^= (b_ptr[4][1] & mask);
+
+      mask = -((index >> 5) & 1);
+      temp_c0 ^= (b_ptr[5][0] & mask);
+      temp_c1 ^= (b_ptr[5][1] & mask);
+
+      mask = -((index >> 6) & 1);
+      temp_c0 ^= (b_ptr[6][0] & mask);
+      temp_c1 ^= (b_ptr[6][1] & mask);
+
+      mask = -((index >> 7) & 1);
+      temp_c0 ^= (b_ptr[7][0] & mask);
+      temp_c1 ^= (b_ptr[7][1] & mask);
+    }
+  }
+  c[0] ^= temp_c0; // the previous value of c is kept and XORed with the new sum
+  c[1] ^= temp_c1;
+}
+
+static void poly64_mul_s(uint64_t *z1, uint64_t *z0, uint64_t x, uint64_t y)
+{
+  // (*z1:*z0) = 128-bit carry-less (GF(2)[x]) product of x and y, using integer multiplies only.
+  uint32_t x4 = x >> 32; // high half of x, split into x4..x7 below
+  // x_low: bytes x0..x3
+  uint32_t x0 = x & mask_64;
+  uint32_t x1 = (x0 >> 8) & mask0;
+  uint32_t x2 = (x0 >> 16) & mask0;
+  uint32_t x3 = (x0 >> 24) & mask0;
+  x0 &= mask0;
+
+  // x_high: bytes x4..x7
+  uint32_t x5 = (x4 >> 8) & mask0;
+  uint32_t x6 = (x4 >> 16) & mask0;
+  uint32_t x7 = (x4 >> 24) & mask0;
+  x4 &= mask0;
+
+  // y_low: bytes y0..y3 (y4 temporarily holds the whole high half)
+  uint32_t y4 = y >> 32;
+
+  uint32_t y0 = y & mask_64;
+  uint32_t y1 = (y0 >> 8) & mask0;
+  uint32_t y2 = (y0 >> 16) & mask0;
+  uint32_t y3 = (y0 >> 24) & mask0;
+  y0 &= mask0;
+
+  // y_high: bytes y4..y7
+  uint32_t y5 = (y4 >> 8) & mask0;
+  uint32_t y6 = (y4 >> 16) & mask0;
+  uint32_t y7 = (y4 >> 24) & mask0;
+  y4 &= mask0;
+
+  // x padding: move bit i of every byte to bit 4*i (one data bit per nibble)
+  zero_padding(x0, mask1, mask2, mask3);
+  zero_padding(x1, mask1, mask2, mask3);
+  zero_padding(x2, mask1, mask2, mask3);
+  zero_padding(x3, mask1, mask2, mask3);
+  zero_padding(x4, mask1, mask2, mask3);
+  zero_padding(x5, mask1, mask2, mask3);
+  zero_padding(x6, mask1, mask2, mask3);
+  zero_padding(x7, mask1, mask2, mask3);
+
+  // y padding: same spreading for the bytes of y
+  zero_padding(y0, mask1, mask2, mask3);
+  zero_padding(y1, mask1, mask2, mask3);
+  zero_padding(y2, mask1, mask2, mask3);
+  zero_padding(y3, mask1, mask2, mask3);
+  zero_padding(y4, mask1, mask2, mask3);
+  zero_padding(y5, mask1, mask2, mask3);
+  zero_padding(y6, mask1, mask2, mask3);
+  zero_padding(y7, mask1, mask2, mask3);
+
+  // x0-3 * y0-3: each nibble sums at most 8 one-bit terms, so "& mask3_64" keeps the XOR
+  uint64_t a0 = ((uint64_t)x0 * (uint64_t)y0) & mask3_64;
+  uint64_t a1 = ((((uint64_t)x0 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y0) & mask3_64));
+  a0 ^= (a1 << 32); // a byte offset of 1 equals 8 nibbles = 32 bits in padded form
+  a1 = a1 >> 32;
+  a1 ^= ((((uint64_t)x0 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x1 * (uint64_t)y1) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y0) & mask3_64));
+
+  uint64_t a2 = ((((uint64_t)x0 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y2) & mask3_64) ^
+                 (((uint64_t)x2 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y0) & mask3_64));
+  a1 ^= (a2 << 32);
+  a2 = a2 >> 32;
+  a2 ^= ((((uint64_t)x1 * (uint64_t)y3) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x3 * (uint64_t)y1) & mask3_64));
+
+  uint64_t a3 = ((((uint64_t)x2 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y2) & mask3_64));
+  a2 ^= (a3 << 32);
+  a3 = a3 >> 32;
+
+  a3 ^= ((uint64_t)x3 * (uint64_t)y3) & mask3_64;
+
+  // x4-7 * y4-7: product of the two high halves, same schedule as above
+  uint64_t b0 = ((uint64_t)x4 * (uint64_t)y4) & mask3_64;
+  uint64_t b1 = ((((uint64_t)x4 * (uint64_t)y5) & mask3_64) ^
+                 (((uint64_t)x5 * (uint64_t)y4) & mask3_64));
+  b0 ^= (b1 << 32);
+  b1 = b1 >> 32;
+  b1 ^= ((((uint64_t)x4 * (uint64_t)y6) & mask3_64) ^
+         (((uint64_t)x5 * (uint64_t)y5) & mask3_64) ^
+         (((uint64_t)x6 * (uint64_t)y4) & mask3_64));
+
+  uint64_t b2 = ((((uint64_t)x4 * (uint64_t)y7) & mask3_64) ^
+                 (((uint64_t)x5 * (uint64_t)y6) & mask3_64) ^
+                 (((uint64_t)x6 * (uint64_t)y5) & mask3_64) ^
+                 (((uint64_t)x7 * (uint64_t)y4) & mask3_64));
+  b1 ^= (b2 << 32);
+  b2 = b2 >> 32;
+  b2 ^= ((((uint64_t)x5 * (uint64_t)y7) & mask3_64) ^
+         (((uint64_t)x6 * (uint64_t)y6) & mask3_64) ^
+         (((uint64_t)x7 * (uint64_t)y5) & mask3_64));
+
+  uint64_t b3 = ((((uint64_t)x6 * (uint64_t)y7) & mask3_64) ^
+                 (((uint64_t)x7 * (uint64_t)y6) & mask3_64));
+  b2 ^= (b3 << 32);
+  b3 = b3 >> 32;
+
+  b3 ^= ((uint64_t)x7 * (uint64_t)y7) & mask3_64;
+
+  // middle part: (x_low ^ x_high) * (y_low ^ y_high), XORed directly on the padded bytes
+  x0 ^= x4;
+  x1 ^= x5;
+  x2 ^= x6;
+  x3 ^= x7;
+
+  y0 ^= y4;
+  y1 ^= y5;
+  y2 ^= y6;
+  y3 ^= y7;
+
+  uint64_t c0 = ((uint64_t)x0 * (uint64_t)y0) & mask3_64;
+  uint64_t c1 = ((((uint64_t)x0 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y0) & mask3_64));
+  c0 ^= (c1 << 32);
+  c1 = c1 >> 32;
+  c1 ^= ((((uint64_t)x0 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x1 * (uint64_t)y1) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y0) & mask3_64));
+
+  uint64_t c2 = ((((uint64_t)x0 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y2) & mask3_64) ^
+                 (((uint64_t)x2 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y0) & mask3_64));
+  c1 ^= (c2 << 32);
+  c2 = c2 >> 32;
+  c2 ^= ((((uint64_t)x1 * (uint64_t)y3) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x3 * (uint64_t)y1) & mask3_64));
+
+  uint64_t c3 = ((((uint64_t)x2 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y2) & mask3_64));
+  c2 ^= (c3 << 32);
+  c3 = c3 >> 32;
+
+  c3 ^= ((uint64_t)x3 * (uint64_t)y3) & mask3_64;
+  // Karatsuba: cross term = middle ^ low ^ high
+  c0 = c0 ^ a0 ^ b0;
+  c1 = c1 ^ a1 ^ b1;
+  c2 = c2 ^ a2 ^ b2;
+  c3 = c3 ^ a3 ^ b3;
+  // add the cross term at bit offset 32, i.e. two 16-bit result words up
+  a2 ^= c0;
+  a3 ^= c1;
+  b0 ^= c2;
+  b1 ^= c3;
+
+  // result inv_padding: squeeze each padded word back into 16 plain bits
+  inv_zero_padding(a0, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a1, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a2, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a3, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b0, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b1, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b2, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b3, mask0_64, mask1_64, mask2_64, mask_final);
+
+  *z0 = a0 | (a1 << 16) | (a2 << 32) | (a3 << 48); // low 64 bits of the product
+  *z1 = b0 | (b1 << 16) | (b2 << 32) | (b3 << 48); // high 64 bits of the product
+}
+
+void GF_mul_s(GF c, const GF a, const GF b)
+{
+  // c = a * b in GF(2^128), built on the C routine poly64_mul_s:
+  // Karatsuba over three 64x64 products, then reduction.
+  uint64_t lo[2], hi[2], mid[2];
+
+  poly64_mul_s(&hi[1], &hi[0], a[1], b[1]);
+  poly64_mul_s(&lo[1], &lo[0], a[0], b[0]);
+  poly64_mul_s(&mid[1], &mid[0], a[0] ^ a[1], b[0] ^ b[1]);
+
+  // 256-bit product as four words w3:w2:w1:w0
+  const uint64_t w0 = lo[0];
+  const uint64_t w1 = lo[1] ^ mid[0] ^ lo[0] ^ hi[0];
+  const uint64_t w3 = hi[1];
+  const uint64_t w2 = mid[0] ^ mid[1] ^ lo[0] ^ w1 ^ w3;
+
+  // reduce modulo x^128 + x^7 + x^2 + x + 1
+  const uint64_t fold = w2 ^ (w3 >> 57) ^ (w3 >> 62) ^ (w3 >> 63);
+
+  c[1] = w1 ^ w3 ^ ((w3 << 7) | (fold >> 57)) ^ ((w3 << 2) | (fold >> 62))
+         ^ ((w3 << 1) | (fold >> 63));
+  c[0] = w0 ^ fold;
+  c[0] ^= (fold << 7) ^ (fold << 2) ^ (fold << 1);
+}
+
+void GF_mul_add_s(GF c, const GF a, const GF b)
+{
+  // c ^= a * b in GF(2^128), built on the C routine poly64_mul_s:
+  // Karatsuba over three 64x64 products, reduction, then accumulation into c.
+  uint64_t lo[2], hi[2], mid[2];
+
+  poly64_mul_s(&hi[1], &hi[0], a[1], b[1]);
+  poly64_mul_s(&lo[1], &lo[0], a[0], b[0]);
+  poly64_mul_s(&mid[1], &mid[0], a[0] ^ a[1], b[0] ^ b[1]);
+
+  // 256-bit product as four words w3:w2:w1:w0
+  const uint64_t w0 = lo[0];
+  const uint64_t w1 = lo[1] ^ mid[0] ^ lo[0] ^ hi[0];
+  const uint64_t w3 = hi[1];
+  const uint64_t w2 = mid[0] ^ mid[1] ^ lo[0] ^ w1 ^ w3;
+
+  // reduce modulo x^128 + x^7 + x^2 + x + 1
+  const uint64_t fold = w2 ^ (w3 >> 57) ^ (w3 >> 62) ^ (w3 >> 63);
+
+  c[1] ^= w1 ^ w3 ^ ((w3 << 7) | (fold >> 57)) ^ ((w3 << 2) | (fold >> 62))
+          ^ ((w3 << 1) | (fold >> 63));
+  c[0] ^= w0 ^ fold;
+  c[0] ^= (fold << 7) ^ (fold << 2) ^ (fold << 1);
+}
diff --git a/crypto_sign/aimer128f/m4speed/field.h b/crypto_sign/aimer128f/m4speed/field.h
new file mode 100644
index 00000000..e8fd7996
--- /dev/null
+++ b/crypto_sign/aimer128f/m4speed/field.h
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef FIELD_H
+#define FIELD_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+typedef uint64_t GF[2]; // 128-bit field element; word 0 holds the low-order 64 bits
+// Every name below is renamed through AIMER_NAMESPACE. None of this first group is defined in field.c.
+#define poly64_mul AIMER_NAMESPACE(poly64_mul)
+void poly64_mul(const uint64_t *a, const uint64_t *b, uint64_t *c1, uint64_t *c0); // GF_mul uses c1 as high word, c0 as low word
+#define GF_set0 AIMER_NAMESPACE(GF_set0)
+void GF_set0(GF a);
+#define GF_copy AIMER_NAMESPACE(GF_copy)
+void GF_copy(GF out, const GF in);
+#define GF_to_bytes AIMER_NAMESPACE(GF_to_bytes)
+void GF_to_bytes(uint8_t *out, const GF in);
+#define GF_from_bytes AIMER_NAMESPACE(GF_from_bytes)
+void GF_from_bytes(GF out, const uint8_t *in);
+// Arithmetic: the *_add variants XOR their result into c instead of overwriting it.
+#define GF_add AIMER_NAMESPACE(GF_add)
+void GF_add(GF c, const GF a, const GF b);
+#define GF_mul AIMER_NAMESPACE(GF_mul)
+void GF_mul(GF c, const GF a, const GF b);
+#define GF_mul_add AIMER_NAMESPACE(GF_mul_add)
+void GF_mul_add(GF c, const GF a, const GF b);
+#define GF_transposed_matmul AIMER_NAMESPACE(GF_transposed_matmul)
+void GF_transposed_matmul(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]); // c = XOR of b[k] over the set bits k of a
+#define GF_transposed_matmul_add AIMER_NAMESPACE(GF_transposed_matmul_add)
+void GF_transposed_matmul_add(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]);
+// *_s variants: GF_mul_s and GF_mul_add_s are built on the C routine poly64_mul_s in field.c.
+#define GF_mul_s AIMER_NAMESPACE(GF_mul_s)
+void GF_mul_s(GF c, const GF a, const GF b);
+#define GF_mul_add_s AIMER_NAMESPACE(GF_mul_add_s)
+void GF_mul_add_s(GF c, const GF a, const GF b);
+#define GF_sqr_s AIMER_NAMESPACE(GF_sqr_s)
+void GF_sqr_s(GF c, const GF a); // not defined in field.c; presumably c = a * a - confirm
+
+#endif // FIELD_H
diff --git a/crypto_sign/aimer128f/m4speed/hash.c b/crypto_sign/aimer128f/m4speed/hash.c
new file mode 100644
index 00000000..71f3fb67
--- /dev/null
+++ b/crypto_sign/aimer128f/m4speed/hash.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+
+#include "hash.h"
+#include <stddef.h>
+#include <stdint.h>
+
+void hash_init(hash_instance *ctx) // start a new incremental SHAKE128 context with nothing absorbed
+{
+  shake128_inc_init(ctx);
+}
+
+void hash_init_prefix(hash_instance *ctx, uint8_t prefix) // new context whose first absorbed byte is prefix
+{
+  shake128_inc_init(ctx);
+  shake128_inc_absorb(ctx, &prefix, sizeof(prefix)); // exactly one byte
+}
+
+void hash_update(hash_instance *ctx, const uint8_t *data, size_t data_len) // absorb data_len bytes from data
+{
+  shake128_inc_absorb(ctx, data, data_len);
+}
+
+void hash_final(hash_instance *ctx) // end the absorb phase; callers squeeze afterwards
+{
+  shake128_inc_finalize(ctx);
+}
+
+void hash_squeeze(hash_instance *ctx, uint8_t *buffer, size_t buffer_len) // write buffer_len output bytes to buffer
+{
+  shake128_inc_squeeze(buffer, buffer_len, ctx); // note the argument order: the context comes last
+}
+
+void hash_ctx_clone(hash_instance *ctx_dest, const hash_instance *ctx_src) // copy the state of ctx_src into ctx_dest
+{
+  shake128_inc_ctx_clone(ctx_dest, ctx_src);
+}
+
+void hash_ctx_release(hash_instance *ctx) // release a context once it is no longer needed
+{
+  shake128_inc_ctx_release(ctx);
+}
diff --git a/crypto_sign/aimer128f/m4speed/hash.h b/crypto_sign/aimer128f/m4speed/hash.h
new file mode 100644
index 00000000..d6b05065
--- /dev/null
+++ b/crypto_sign/aimer128f/m4speed/hash.h
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef HASH_H
+#define HASH_H
+
+#include "fips202.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+static const uint8_t HASH_PREFIX_0 = 0; // message pre-hash mu (sign.c)
+static const uint8_t HASH_PREFIX_1 = 1; // commitment h_1 (sign.c)
+static const uint8_t HASH_PREFIX_2 = 2; // commitment h_2 (sign.c)
+static const uint8_t HASH_PREFIX_3 = 3; // salt and root seeds (sign.c)
+static const uint8_t HASH_PREFIX_4 = 4; // NOTE(review): no use visible in sign.c; presumably seed-tree expansion - confirm
+static const uint8_t HASH_PREFIX_5 = 5; // per-party commitment and tape expansion (sign.c)
+// All hashing in this implementation goes through incremental SHAKE128 (fips202.h).
+typedef shake128incctx hash_instance;
+// Thin wrappers around shake128_inc_*; each name is renamed through AIMER_NAMESPACE.
+#define hash_init AIMER_NAMESPACE(hash_init)
+void hash_init(hash_instance *ctx);
+#define hash_init_prefix AIMER_NAMESPACE(hash_init_prefix)
+void hash_init_prefix(hash_instance *ctx, uint8_t prefix); // init, then absorb the single prefix byte
+#define hash_update AIMER_NAMESPACE(hash_update)
+void hash_update(hash_instance *ctx, const uint8_t *data, size_t data_len);
+#define hash_final AIMER_NAMESPACE(hash_final)
+void hash_final(hash_instance *ctx);
+#define hash_squeeze AIMER_NAMESPACE(hash_squeeze)
+void hash_squeeze(hash_instance *ctx, uint8_t *buffer, size_t buffer_len); // sign.c calls this repeatedly on one context
+#define hash_ctx_clone AIMER_NAMESPACE(hash_ctx_clone)
+void hash_ctx_clone(hash_instance *ctx_dest, const hash_instance *ctx_src);
+#define hash_ctx_release AIMER_NAMESPACE(hash_ctx_release)
+void hash_ctx_release(hash_instance *ctx);
+
+#endif // HASH_H
diff --git a/crypto_sign/aimer128f/m4speed/params.h b/crypto_sign/aimer128f/m4speed/params.h
new file mode 100644
index 00000000..8753737e
--- /dev/null
+++ b/crypto_sign/aimer128f/m4speed/params.h
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#define AIMER_NAMESPACE(s)          samsungsds_aimer128f_m4speed_##s
+// ^ gives every exported symbol a prefix specific to this parameter set and variant
+#define SECURITY_BITS               128                  // security parameter
+#define SECURITY_BYTES              (SECURITY_BITS / 8)  // byte size of security parameter
+
+#define AIM2_NUM_BITS_FIELD         SECURITY_BITS        // number of bits in field element
+#define AIM2_NUM_BYTES_FIELD        SECURITY_BYTES       // number of bytes in field element
+#define AIM2_NUM_WORDS_FIELD        (SECURITY_BITS / 64) // number of 64-bit words in element
+#define AIM2_NUM_BITS_WORD          64                   // number of bits in word
+#define AIM2_IV_SIZE                SECURITY_BYTES       // byte size of AIM2 initial vector
+
+#define AIM2_NUM_INPUT_SBOX         2                    // number of AIM2 input S-boxes
+
+#define AIMER_SALT_SIZE             SECURITY_BYTES       // byte size of salt
+#define AIMER_SEED_SIZE             SECURITY_BYTES       // byte size of seed
+#define AIMER_COMMIT_SIZE           (SECURITY_BYTES * 2) // byte size of commitment
+
+#define AIMER_L                     AIM2_NUM_INPUT_SBOX  // sign.c checks AIMER_L + 1 products per party
+#define AIMER_T                     33                   // number of parallel repetitions (Tau)
+#define AIMER_N                     16                   // number of MPC parties (N)
+#define AIMER_LOGN                  4                    // log_2(N); sign.c masks challenge bytes with (1 << AIMER_LOGN) - 1
+
+#endif // PARAMS_H
diff --git a/crypto_sign/aimer128f/m4speed/sign.c b/crypto_sign/aimer128f/m4speed/sign.c
new file mode 100644
index 00000000..ce54ea41
--- /dev/null
+++ b/crypto_sign/aimer128f/m4speed/sign.c
@@ -0,0 +1,547 @@
+// SPDX-License-Identifier: MIT
+
+#include "api.h"
+#include "aim2.h"
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include "randombytes.h"
+#include "sign.h"
+#include "tree.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+void commit_and_expand_tape(tape_t *tape, uint8_t *commit,
+                            const hash_instance *ctx_precom,
+                            const uint8_t seed[AIMER_SEED_SIZE],
+                            size_t rep, size_t party)
+{
+  // Derives one party's commitment and random tape from its seed.
+  hash_instance xof;
+  uint8_t input[2 + AIMER_SEED_SIZE]; // rep byte || party byte || seed
+
+  input[0] = (uint8_t)rep;
+  input[1] = (uint8_t)party;
+  memcpy(&input[2], seed, AIMER_SEED_SIZE);
+  hash_ctx_clone(&xof, ctx_precom); // work on a copy; ctx_precom stays reusable
+  hash_update(&xof, input, sizeof(input));
+  hash_final(&xof);
+  hash_squeeze(&xof, commit, AIMER_COMMIT_SIZE);       // first: the commitment
+  hash_squeeze(&xof, (uint8_t *)tape, sizeof(tape_t)); // then: the raw tape bytes
+  hash_ctx_release(&xof);
+}
+
+void aim2_mpc(mult_chk_t *mult_chk,
+              const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+              const GF ct_GF)
+{
+  // One party's share computation: reads x_shares[0..AIMER_L], writes z_shares[0..AIMER_L],
+  // and accumulates into x_shares[AIMER_L], which the caller must have initialised.
+  // Input S-boxes: pt + c = t ^ {2 ^ e - 1} --> t ^ {2 ^ e} + t * c = t * pt --> z = x * pt
+  GF_mul(mult_chk->z_shares[0], mult_chk->x_shares[0], aim2_constants[0]);
+  GF_transposed_matmul_add(mult_chk->z_shares[0], mult_chk->x_shares[0],
+                           aim2_e1_power_matrix); // presumably the linear map t -> t ^ {2 ^ e1} - confirm
+  GF_transposed_matmul_add(mult_chk->x_shares[AIMER_L], mult_chk->x_shares[0],
+                           matrix_A[0]); // x[L] += x[0] applied to matrix_A[0]
+  // same three steps for the second input S-box
+  GF_mul(mult_chk->z_shares[1], mult_chk->x_shares[1], aim2_constants[1]);
+  GF_transposed_matmul_add(mult_chk->z_shares[1], mult_chk->x_shares[1],
+                           aim2_e2_power_matrix);
+  GF_transposed_matmul_add(mult_chk->x_shares[AIMER_L], mult_chk->x_shares[1],
+                           matrix_A[1]);
+
+  // x ^ {2 ^ e - 1} = pt + ct
+  // --> x ^ {2 ^ e} + x * ct = x * pt
+  // --> z = x * pt
+  GF_sqr_s(mult_chk->z_shares[AIMER_L], mult_chk->x_shares[AIMER_L]); // three GF_sqr_s calls:
+  GF_sqr_s(mult_chk->z_shares[AIMER_L], mult_chk->z_shares[AIMER_L]); // presumably z[L] = x[L] ^ 8
+  GF_sqr_s(mult_chk->z_shares[AIMER_L], mult_chk->z_shares[AIMER_L]); // (GF_sqr_s is defined elsewhere)
+  GF_mul_add(mult_chk->z_shares[AIMER_L], mult_chk->x_shares[AIMER_L], ct_GF); // z[L] += x[L] * ct
+}
+
+// Phase 1: derive salt and seed trees, commit to every party's seed and tape, and compute sign->h_1.
+void run_phase_1(signature_t *sign,
+                 uint8_t commits[AIMER_T][AIMER_N][AIMER_COMMIT_SIZE], // out: per-party commitments
+                 uint8_t nodes[AIMER_T][2 * AIMER_N - 1][AIMER_SEED_SIZE], // out: seed trees
+                 mult_chk_t mult_chk[AIMER_T][AIMER_N], // out: inputs of the multiplication check
+                 GF alpha_v_shares[AIMER_T][2][AIMER_N], // out: [rep][0] = a shares, [rep][1] = c shares
+                 const uint8_t *sk, const uint8_t *m, size_t mlen)
+{
+  GF pt_GF = {0,}, ct_GF = {0,};
+  GF_from_bytes(pt_GF, sk); // sk = pt || iv || ct, as laid out by crypto_sign_keypair
+  GF_from_bytes(ct_GF, sk + AIM2_NUM_BYTES_FIELD + AIM2_IV_SIZE);
+
+  // message pre-hashing: mu = H(prefix 0 || iv || ct || m)
+  hash_instance ctx;
+  hash_init_prefix(&ctx, HASH_PREFIX_0);
+  hash_update(&ctx, sk + AIM2_NUM_BYTES_FIELD,
+              AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx, m, mlen);
+  hash_final(&ctx);
+
+  uint8_t mu[AIMER_COMMIT_SIZE];
+  hash_squeeze(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+
+  // compute first L sboxes' outputs
+  GF sbox_outputs[AIMER_L];
+  aim2_sbox_outputs(sbox_outputs, pt_GF);
+
+  // derive the binary matrix and the vector from the initial vector
+  GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+  generate_matrix_LU(matrix_A, vector_b, sk + AIM2_NUM_BYTES_FIELD);
+
+  // generate per-signature randomness
+  uint8_t random[SECURITY_BYTES];
+  randombytes(random, SECURITY_BYTES);
+
+  // generate salt: first output of H(prefix 3 || pt || mu || random)
+  hash_init_prefix(&ctx, HASH_PREFIX_3);
+  hash_update(&ctx, sk, AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx, random, SECURITY_BYTES);
+  hash_final(&ctx);
+  hash_squeeze(&ctx, sign->salt, AIMER_SALT_SIZE);
+
+  // generate root seeds and expand seed trees
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    hash_squeeze(&ctx, nodes[rep][0], AIMER_SEED_SIZE); // continues the salt XOF output
+  }
+  expand_trees(nodes, sign->salt);
+  hash_ctx_release(&ctx);
+
+  // hash_instance for h_1
+  hash_init_prefix(&ctx, HASH_PREFIX_1);
+  hash_update(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx, sign->salt, AIMER_SALT_SIZE);
+  // shared prefix state (prefix 5 || salt), cloned once per party by commit_and_expand_tape
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    // initialize adjustment values
+    tape_t delta, tape;
+    memset(&delta, 0, sizeof(tape_t));
+
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      // generate execution views and commitments
+      commit_and_expand_tape(&tape, commits[rep][party], &ctx_precom,
+                             nodes[rep][party + AIMER_N - 1], rep, party); // the last AIMER_N nodes hold the party seeds
+      hash_update(&ctx, commits[rep][party], AIMER_COMMIT_SIZE);
+
+      // compute offsets: delta accumulates the sum of all parties' tape values
+      GF_add(delta.pt_share, delta.pt_share, tape.pt_share);
+      GF_add(delta.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+      GF_add(delta.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+      GF_add(delta.a_share, delta.a_share, tape.a_share);
+      GF_add(delta.c_share, delta.c_share, tape.c_share);
+      GF_set0(mult_chk[rep][party].x_shares[AIMER_L]); // replaced by vector_b for the last party below
+
+      // adjust the last share and prepare the proof and h_1
+      if (party == AIMER_N - 1)
+      {
+        GF_add(delta.pt_share, delta.pt_share, pt_GF);
+        GF_add(delta.t_shares[0], delta.t_shares[0], sbox_outputs[0]);
+        GF_add(delta.t_shares[1], delta.t_shares[1], sbox_outputs[1]);
+        GF_mul_add_s(delta.c_share, pt_GF, delta.a_share); // delta_c = sum(c) + pt * sum(a)
+
+        GF_to_bytes(sign->proofs[rep].delta_pt_bytes, delta.pt_share);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[0], delta.t_shares[0]);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[1], delta.t_shares[1]);
+        GF_to_bytes(sign->proofs[rep].delta_c_bytes, delta.c_share);
+        // the last party's tape absorbs the offsets
+        GF_add(tape.pt_share, delta.pt_share, tape.pt_share);
+        GF_add(tape.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+        GF_add(tape.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+        GF_add(tape.c_share, delta.c_share, tape.c_share);
+
+        GF_copy(mult_chk[rep][party].x_shares[AIMER_L], vector_b);
+      }
+
+      // run the MPC simulation and prepare the mult check inputs
+      GF_copy(mult_chk[rep][party].pt_share, tape.pt_share);
+      GF_copy(mult_chk[rep][party].x_shares[0], tape.t_shares[0]);
+      GF_copy(mult_chk[rep][party].x_shares[1], tape.t_shares[1]);
+      GF_copy(alpha_v_shares[rep][0][party], tape.a_share);
+      GF_copy(alpha_v_shares[rep][1][party], tape.c_share);
+
+      aim2_mpc(&mult_chk[rep][party],
+               (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A, ct_GF);
+    }
+
+    // NOTE: depend on the order of values in proof_t
+    hash_update(&ctx, sign->proofs[rep].delta_pt_bytes,
+                AIM2_NUM_BYTES_FIELD * (AIMER_L + 2)); // one call is meant to cover delta_pt, delta_ts[0..L-1] and delta_c
+  }
+  hash_ctx_release(&ctx_precom);
+
+  // commit to salt, (all commitments of parties' seeds,
+  // delta_pt, delta_t, delta_c) for all repetitions
+  hash_final(&ctx);
+  hash_squeeze(&ctx, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+}
+
+void run_phase_2_and_3(signature_t *sign,
+                       GF alpha_v_shares[AIMER_T][2][AIMER_N],
+                       const mult_chk_t mult_chk[AIMER_T][AIMER_N])
+{
+  // Phase 2 expands h_1 into AIMER_L + 1 challenges (eps) per repetition.
+  // Phase 3 turns each party's (a, c) tape shares stored in alpha_v_shares
+  // into its (alpha, v) shares of the multiplication check, in place, and
+  // hashes all of them into sign->h_2. Reads sign->h_1 and sign->salt.
+  hash_instance xof_eps;
+  hash_init(&xof_eps);
+  hash_update(&xof_eps, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&xof_eps);
+
+  hash_instance h2_ctx;
+  hash_init_prefix(&h2_ctx, HASH_PREFIX_2);
+  hash_update(&h2_ctx, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_update(&h2_ctx, sign->salt, AIMER_SALT_SIZE);
+
+  GF alpha = {0};
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    GF *alpha_shares = alpha_v_shares[rep][0];
+    GF *v_shares = alpha_v_shares[rep][1];
+    GF eps[AIMER_L + 1];
+
+    GF_set0(alpha);
+    hash_squeeze(&xof_eps, (uint8_t *)eps, sizeof(eps));
+
+    // alpha_share = a_share + sum_i x_share[i] * eps[i]
+    // v_share     = c_share + sum_i z_share[i] * eps[i]   (+ pt_share * alpha below)
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      const mult_chk_t *chk = &mult_chk[rep][party];
+
+      for (size_t i = 0; i < AIMER_L + 1; i++)
+      {
+        GF_mul_add(alpha_shares[party], chk->x_shares[i], eps[i]);
+        GF_mul_add(v_shares[party], chk->z_shares[i], eps[i]);
+      }
+      GF_add(alpha, alpha, alpha_shares[party]);
+    }
+
+    // alpha (the GF_add sum of all parties' alpha shares) is now opened
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      GF_mul_add(v_shares[party], mult_chk[rep][party].pt_share, alpha);
+    }
+    hash_update(&h2_ctx, (const uint8_t *)alpha_v_shares[rep],
+                AIM2_NUM_BYTES_FIELD * 2 * AIMER_N);
+  }
+
+  hash_final(&h2_ctx);
+  hash_squeeze(&h2_ctx, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&h2_ctx);
+  hash_ctx_release(&xof_eps);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk)
+{
+  // Key layout: pk = iv || ct and sk = pt || iv || ct. Returns -1 on a NULL argument.
+  if (pk == NULL || sk == NULL)
+  {
+    return -1;
+  }
+
+  uint8_t *const ct = &pk[AIM2_IV_SIZE];
+  randombytes(sk, AIM2_NUM_BYTES_FIELD); // pt: the secret AIM2 input
+  randombytes(pk, AIM2_IV_SIZE);         // iv: the public initial vector
+  aim2(ct, sk, pk);
+  memcpy(&sk[AIM2_NUM_BYTES_FIELD], pk, AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  return 0;
+}
+
+int crypto_sign_signature(uint8_t *sig, size_t *siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk)
+{
+  hash_instance ctx;
+  signature_t *sign = (signature_t *)sig; // NOTE(review): assumes signature_t is a byte-only struct of CRYPTO_BYTES bytes - confirm in sign.h
+  // Produces a detached signature directly inside sig; sets *siglen = CRYPTO_BYTES and always returns 0.
+  //////////////////////////////////////////////////////////////////////////
+  // Phase 1: Committing to the seeds and the execution views of parties. //
+  //////////////////////////////////////////////////////////////////////////
+  // NOTE(review): the four arrays below are automatic (stack) objects and are not cleared before returning.
+  // nodes for seed trees
+  uint8_t nodes[AIMER_T][2 * AIMER_N - 1][AIMER_SEED_SIZE]; // 33 * 31 * 16 = 16368 bytes
+
+  // commitments for seeds
+  uint8_t commits[AIMER_T][AIMER_N][AIMER_COMMIT_SIZE]; // 33 * 16 * 32 = 16896 bytes
+
+  // multiplication check inputs
+  mult_chk_t mult_chk[AIMER_T][AIMER_N]; // 528 entries
+
+  // multiplication check outputs
+  GF alpha_v_shares[AIMER_T][2][AIMER_N]; // 33 * 2 * 16 * 16 = 16896 bytes
+
+  // commitments for phase 1
+  run_phase_1(sign, commits, nodes, mult_chk, alpha_v_shares, sk, m, mlen);
+
+  /////////////////////////////////////////////////////////////////
+  // Phase 2, 3: Challenging and committing to the simulation of //
+  //             the multiplication checking protocol.           //
+  /////////////////////////////////////////////////////////////////
+
+  // compute the commitment of phase 3
+  run_phase_2_and_3(sign, alpha_v_shares,
+                    (const mult_chk_t (*)[AIMER_N])mult_chk);
+
+  //////////////////////////////////////////////////////
+  // Phase 4: Challenging views of the MPC protocols. //
+  //////////////////////////////////////////////////////
+  // indices = Expand(h_2): one challenge byte per repetition
+  hash_init(&ctx);
+  hash_update(&ctx, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_final(&ctx);
+
+  uint8_t indices[AIMER_T]; // AIMER_N <= 256
+  hash_squeeze(&ctx, indices, AIMER_T);
+  hash_ctx_release(&ctx);
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    indices[rep] &= (1 << AIMER_LOGN) - 1; // keep the low AIMER_LOGN bits: a party index in [0, AIMER_N)
+  }
+
+  //////////////////////////////////////////////////////
+  // Phase 5: Opening the views of the MPC protocols. //
+  //////////////////////////////////////////////////////
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    size_t i_bar = indices[rep]; // the party whose seed stays hidden in this repetition
+    reveal_all_but(sign->proofs[rep].reveal_path,
+                   (const uint8_t (*)[AIMER_SEED_SIZE])nodes[rep], i_bar);
+    memcpy(sign->proofs[rep].missing_commitment, commits[rep][i_bar],
+           AIMER_COMMIT_SIZE);
+    GF_to_bytes(sign->proofs[rep].missing_alpha_share_bytes,
+                alpha_v_shares[rep][0][i_bar]);
+  }
+  *siglen = CRYPTO_BYTES;
+
+  return 0;
+}
+
+// Signed-message API: sm = m || signature and *smlen = mlen + signature length.
+// Returns the status of crypto_sign_signature instead of discarding it.
+int crypto_sign(uint8_t *sm, size_t *smlen,
+        const uint8_t *m, size_t mlen, const uint8_t *sk)
+{
+  // Sign first (m is only read); the signature goes right after the message slot.
+  const int ret = crypto_sign_signature(sm + mlen, smlen, m, mlen, sk);
+  memmove(sm, m, mlen); // memmove stays well-defined when a caller signs in place (sm == m)
+  *smlen += mlen;
+  return ret;
+}
+
+int crypto_sign_verify(const uint8_t *sig, size_t siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *pk)
+{ // Returns 0 when sig verifies for message m under pk, and -1 otherwise.
+  if (siglen != CRYPTO_BYTES)
+  {
+    return -1;
+  }
+  // The signature bytes are read in place through the signature_t layout.
+  const signature_t *sign = (const signature_t *)sig;
+
+  GF ct_GF = {0,};
+  GF_from_bytes(ct_GF, pk + AIM2_IV_SIZE); // field element stored after the IV
+
+  // derive the binary matrix and the vector from the initial vector
+  GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+  generate_matrix_LU(matrix_A, vector_b, pk);
+
+  hash_instance ctx_e, ctx_h1, ctx_h2;
+
+  // indices = Expand(h_2): one byte per repetition, reduced below
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  uint8_t indices[AIMER_T]; // AIMER_N <= 256
+  hash_squeeze(&ctx_e, indices, AIMER_T);
+  hash_ctx_release(&ctx_e);
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    indices[rep] &= (1 << AIMER_LOGN) - 1; // keep the low AIMER_LOGN bits
+  }
+
+  // epsilons = Expand(h_1); ctx_e is squeezed once per repetition in the loop
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  // message pre-hashing: mu = H(prefix 0 || pk || m)
+  uint8_t mu[AIMER_COMMIT_SIZE];
+  hash_init_prefix(&ctx_h1, HASH_PREFIX_0);
+  hash_update(&ctx_h1, pk, AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx_h1, m, mlen);
+  hash_final(&ctx_h1);
+  hash_squeeze(&ctx_h1, mu, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h1);
+
+  // ready for computing h_1' and h_2' (both are finalized after the loop)
+  hash_init_prefix(&ctx_h1, HASH_PREFIX_1);
+  hash_update(&ctx_h1, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_h1, sign->salt, AIMER_SALT_SIZE);
+
+  hash_init_prefix(&ctx_h2, HASH_PREFIX_2);
+  hash_update(&ctx_h2, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_h2, sign->salt, AIMER_SALT_SIZE);
+  // salted context handed to every commit_and_expand_tape call below
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    size_t i_bar = indices[rep]; // party covered by the missing_* proof fields
+    uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE];
+
+    reconstruct_tree(nodes, sign->salt, sign->proofs[rep].reveal_path,
+                     rep, i_bar);
+
+    GF pt_shares[AIMER_N]; // entry i_bar is never written or read
+    GF alpha_v_shares[2][AIMER_N]; // [0] = alpha shares, [1] = v shares
+    GF_set0(alpha_v_shares[1][i_bar]); // accumulates the other v shares below
+
+    GF epsilons[AIMER_L + 1];
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+    // alpha collects the sum of all AIMER_N alpha shares of this repetition
+    GF alpha = {0,};
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      if (party == i_bar)
+      {
+        hash_update(&ctx_h1, sign->proofs[rep].missing_commitment,
+                    AIMER_COMMIT_SIZE);
+        GF_from_bytes(alpha_v_shares[0][i_bar],
+                      sign->proofs[rep].missing_alpha_share_bytes);
+        GF_add(alpha, alpha, alpha_v_shares[0][i_bar]);
+        continue;
+      }
+      // leaf seed of this party sits at nodes[AIMER_N + party - 2]
+      tape_t tape;
+      uint8_t commit[AIMER_COMMIT_SIZE];
+      commit_and_expand_tape(&tape, commit, &ctx_precom,
+                             nodes[AIMER_N + party - 2], rep, party);
+      hash_update(&ctx_h1, commit, AIMER_COMMIT_SIZE);
+
+      // adjust last shares: only party AIMER_N - 1 adds the proof's deltas
+      mult_chk_t mult_chk;
+      memset(&mult_chk, 0, sizeof(mult_chk_t));
+      if (party == AIMER_N - 1)
+      {
+        GF temp = {0,};
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_pt_bytes);
+        GF_add(tape.pt_share, tape.pt_share, temp);
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[0]);
+        GF_add(tape.t_shares[0], tape.t_shares[0], temp);
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[1]);
+        GF_add(tape.t_shares[1], tape.t_shares[1], temp);
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_c_bytes);
+        GF_add(tape.c_share, tape.c_share, temp);
+
+        GF_copy(mult_chk.x_shares[AIMER_L], vector_b);
+      }
+
+      // run the MPC simulation and prepare the mult check inputs
+      GF_copy(mult_chk.x_shares[0], tape.t_shares[0]);
+      GF_copy(mult_chk.x_shares[1], tape.t_shares[1]);
+      GF_copy(pt_shares[party], tape.pt_share);
+      GF_copy(alpha_v_shares[0][party], tape.a_share);
+      GF_copy(alpha_v_shares[1][party], tape.c_share);
+      aim2_mpc(&mult_chk, (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A, ct_GF);
+      // fold x_shares into the alpha share and z_shares into the v share
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[2], epsilons[2]);
+
+      GF_add(alpha, alpha, alpha_v_shares[0][party]);
+    }
+
+    // alpha is opened, so we can finish calculating v_share
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      if (party == i_bar)
+      {
+        continue;
+      }
+
+      GF_mul_add(alpha_v_shares[1][party], pt_shares[party], alpha);
+      GF_add(alpha_v_shares[1][i_bar], alpha_v_shares[1][i_bar],
+             alpha_v_shares[1][party]);
+    }
+
+    // v is opened: entry i_bar now holds the sum of the other v shares
+    hash_update(&ctx_h2, (const uint8_t *)alpha_v_shares,
+                sizeof(alpha_v_shares));
+    // absorbs delta_pt, delta_ts and delta_c as one (AIMER_L + 2)-field run
+    // NOTE: depend on the order of values in proof_t
+    hash_update(&ctx_h1, sign->proofs[rep].delta_pt_bytes,
+                AIM2_NUM_BYTES_FIELD * (AIMER_L + 2));
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+
+  uint8_t h_1_prime[AIMER_COMMIT_SIZE];
+  hash_final(&ctx_h1);
+  hash_squeeze(&ctx_h1, h_1_prime, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h1);
+
+  uint8_t h_2_prime[AIMER_COMMIT_SIZE];
+  hash_final(&ctx_h2);
+  hash_squeeze(&ctx_h2, h_2_prime, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h2);
+  // accept only if both recomputed challenges match the signature
+  if (memcmp(h_1_prime, sign->h_1, AIMER_COMMIT_SIZE) != 0 ||
+      memcmp(h_2_prime, sign->h_2, AIMER_COMMIT_SIZE) != 0)
+  {
+    return -1;
+  }
+
+  return 0;
+}
+
+int crypto_sign_open(uint8_t *m, size_t *mlen,
+        const uint8_t *sm, size_t smlen,
+        const uint8_t *pk)
+{
+  // Signed message layout: message || signature (CRYPTO_BYTES).
+  if (smlen < CRYPTO_BYTES)
+  {
+    return -1;
+  }
+  const size_t body_len = smlen - CRYPTO_BYTES;
+  const uint8_t *sig_start = sm + body_len;
+
+  // Nothing is written to m or mlen unless verification succeeds.
+  if (crypto_sign_verify(sig_start, CRYPTO_BYTES, sm, body_len, pk) != 0)
+  {
+    return -1;
+  }
+
+  // memmove tolerates an output buffer that overlaps sm.
+  memmove(m, sm, body_len);
+  *mlen = body_len;
+  return 0;
+}
diff --git a/crypto_sign/aimer128f/m4speed/sign.h b/crypto_sign/aimer128f/m4speed/sign.h
new file mode 100644
index 00000000..e64c4350
--- /dev/null
+++ b/crypto_sign/aimer128f/m4speed/sign.h
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef SIGN_H
+#define SIGN_H
+
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+typedef struct tape_t // per-party values filled in by commit_and_expand_tape
+{
+  GF pt_share;          // share of pt; the last party adds delta_pt
+  GF t_shares[AIMER_L]; // shares of t; the last party adds delta_ts
+  GF a_share;           // starting value of the party's alpha share
+  GF c_share;           // starting value of the party's v share (+ delta_c)
+} tape_t;
+
+typedef struct mult_chk_t // per-party working state passed to aim2_mpc
+{
+  GF pt_share; // zeroed and otherwise unused by crypto_sign_verify
+  GF x_shares[AIMER_L + 1]; // verify seeds [0], [1] from t_shares, [AIMER_L] from vector_b
+  GF z_shares[AIMER_L + 1]; // read back after aim2_mpc for the v share update
+} mult_chk_t;
+
+typedef struct proof_t // one repetition's proof; part of the signature bytes
+{
+  uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE]; // sibling seeds, leaf level first
+  uint8_t missing_commitment[AIMER_COMMIT_SIZE]; // commitment of the unopened party
+  uint8_t delta_pt_bytes[AIM2_NUM_BYTES_FIELD]; // keep delta_pt, delta_ts, delta_c adjacent:
+  uint8_t delta_ts_bytes[AIMER_L][AIM2_NUM_BYTES_FIELD]; // crypto_sign_verify hashes the three
+  uint8_t delta_c_bytes[AIM2_NUM_BYTES_FIELD]; // as one run starting at delta_pt_bytes
+  uint8_t missing_alpha_share_bytes[AIM2_NUM_BYTES_FIELD]; // alpha share of the unopened party
+} proof_t;
+
+typedef struct signature_t // byte layout: raw signature buffers are cast to this
+{
+  uint8_t salt[AIMER_SALT_SIZE];  // absorbed into the h_1, h_2, tree and commit hashes
+  uint8_t h_1[AIMER_COMMIT_SIZE]; // expanded into the epsilons
+  uint8_t h_2[AIMER_COMMIT_SIZE]; // expanded into the per-repetition indices
+  proof_t proofs[AIMER_T];        // one proof per repetition
+} signature_t;
+// Runs the AIM2 MPC simulation for one party, updating *mult_chk in place.
+#define aim2_mpc AIMER_NAMESPACE(aim2_mpc)
+void aim2_mpc(mult_chk_t *mult_chk,
+              const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+              const GF ct_GF);
+// Writes one party's commitment and tape, derived from seed for (rep, party).
+#define commit_and_expand_tape AIMER_NAMESPACE(commit_and_expand_tape)
+void commit_and_expand_tape(tape_t *tape, uint8_t *commit,
+                            const hash_instance *ctx_precom,
+                            const uint8_t seed[AIMER_SEED_SIZE],
+                            size_t rep, size_t party);
+// Signing phase 1 (see sign.c); presumably fills the non-const arrays - confirm.
+#define run_phase_1 AIMER_NAMESPACE(run_phase_1)
+void run_phase_1(signature_t *sign,
+                 uint8_t commits[AIMER_T][AIMER_N][AIMER_COMMIT_SIZE],
+                 uint8_t nodes[AIMER_T][2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                 mult_chk_t mult_chk[AIMER_T][AIMER_N],
+                 GF alpha_v_shares[AIMER_T][2][AIMER_N],
+                 const uint8_t *sk, const uint8_t *m, size_t mlen);
+// Signing phases 2 and 3 (see sign.c); mult_chk is input-only here (const).
+#define run_phase_2_and_3 AIMER_NAMESPACE(run_phase_2_and_3)
+void run_phase_2_and_3(signature_t *sign,
+                       GF alpha_v_shares[AIMER_T][2][AIMER_N],
+                       const mult_chk_t mult_chk[AIMER_T][AIMER_N]);
+
+#endif // SIGN_H
diff --git a/crypto_sign/aimer128f/m4speed/tree.c b/crypto_sign/aimer128f/m4speed/tree.c
new file mode 100644
index 00000000..84c23d7f
--- /dev/null
+++ b/crypto_sign/aimer128f/m4speed/tree.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: MIT
+
+#include "tree.h"
+#include "hash.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+//  Example of tree for [N = 8]
+//  x
+//  d = 0: 1
+//  d = 1: 2         3
+//  d = 2: 4   5     6     7
+//  d = 3: 8 9 10 11 12 13 14 15
+
+void expand_trees(uint8_t nodes[AIMER_T][2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                  const uint8_t salt[AIMER_SALT_SIZE])
+{
+  // Hash input per parent: repetition byte || node-index byte || parent seed.
+  uint8_t parent_in[AIMER_SEED_SIZE + 2];
+  // Absorb prefix and salt once; every node hash starts from a clone of it.
+  hash_instance node_ctx, salted;
+  hash_init_prefix(&salted, HASH_PREFIX_4);
+  hash_update(&salted, salt, AIMER_SALT_SIZE);
+  for (size_t tree = 0; tree < AIMER_T; tree++)
+  {
+    parent_in[0] = (uint8_t)tree;
+    for (size_t parent = 1; parent < AIMER_N; parent++)
+    {
+      parent_in[1] = (uint8_t)parent;
+      memcpy(parent_in + 2, nodes[tree][parent - 1], AIMER_SEED_SIZE);
+
+      hash_ctx_clone(&node_ctx, &salted);
+      hash_update(&node_ctx, parent_in, sizeof(parent_in));
+      hash_final(&node_ctx);
+      // A single squeeze fills both child slots, 2 * parent - 1 and 2 * parent.
+      hash_squeeze(&node_ctx, nodes[tree][2 * parent - 1], AIMER_SEED_SIZE << 1);
+      hash_ctx_release(&node_ctx);
+    }
+  }
+  hash_ctx_release(&salted);
+}
+
+void reveal_all_but(uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                    const uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                    size_t cover_index)
+{
+  // 1-based tree index of the covered leaf; tree index k is stored in nodes[k - 1].
+  size_t hidden = cover_index + AIMER_N;
+
+  for (size_t level = 0; level < AIMER_LOGN; level++, hidden >>= 1)
+  {
+    // Emit the sibling (hidden ^ 1) of the current path node; the loop
+    // increment then climbs to the parent.
+    memcpy(reveal_path[level], nodes[(hidden ^ 1) - 1], AIMER_SEED_SIZE);
+  }
+}
+
+void reconstruct_tree(uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE],
+                      const uint8_t salt[AIMER_SALT_SIZE],
+                      const uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                      size_t rep_index,
+                      size_t cover_index)
+{ // Rebuilds every seed derivable from reveal_path; hidden-path slots stay unwritten.
+  size_t index, depth, path;
+  uint8_t buffer[AIMER_SEED_SIZE + 2];
+  // Tree index i (root = 1, not stored) lives in nodes[i - 2].
+  hash_instance ctx, ctx_;
+  hash_init_prefix(&ctx_, HASH_PREFIX_4);
+  hash_update(&ctx_, salt, AIMER_SALT_SIZE);
+
+  for (depth = 1; depth < AIMER_LOGN; depth++)
+  {
+    path = ((cover_index + AIMER_N) >> (AIMER_LOGN - depth)) ^ 1;
+    memcpy(nodes[path - 2], reveal_path[AIMER_LOGN - depth], AIMER_SEED_SIZE);
+    // path is the revealed sibling at this depth; path ^ 1 is the hidden node.
+    for (index = (1U << depth); index < (2U << depth); index++)
+    {
+      if (index == (path ^ 1)) { continue; } // seed unknown: nothing to expand
+      buffer[0] = (uint8_t)(rep_index);
+      buffer[1] = (uint8_t)(index);
+      memcpy(buffer + 2, nodes[index - 2], AIMER_SEED_SIZE);
+      hash_ctx_clone(&ctx, &ctx_);
+      hash_update(&ctx, buffer, AIMER_SEED_SIZE + 2);
+      hash_final(&ctx);
+      hash_squeeze(&ctx, nodes[2 * index - 2], AIMER_SEED_SIZE << 1);
+      hash_ctx_release(&ctx);
+    }
+  }
+  hash_ctx_release(&ctx_);
+
+  path = ((cover_index + AIMER_N) >> (AIMER_LOGN - depth)) ^ 1;
+  memcpy(nodes[path - 2], reveal_path[AIMER_LOGN - depth], AIMER_SEED_SIZE);
+}
diff --git a/crypto_sign/aimer128f/m4speed/tree.h b/crypto_sign/aimer128f/m4speed/tree.h
new file mode 100644
index 00000000..b5a27867
--- /dev/null
+++ b/crypto_sign/aimer128f/m4speed/tree.h
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef TREE_H
+#define TREE_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+// Expands all AIMER_T seed trees in place; nodes[rep][0] must hold each root.
+#define expand_trees AIMER_NAMESPACE(expand_trees)
+void expand_trees(uint8_t nodes[AIMER_T][2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                  const uint8_t salt[AIMER_SALT_SIZE]);
+// Copies the AIMER_LOGN sibling seeds along leaf cover_index, leaf level first.
+#define reveal_all_but AIMER_NAMESPACE(reveal_all_but)
+void reveal_all_but(uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                    const uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                    size_t cover_index);
+// Rebuilds the seeds derivable from reveal_path; root omitted (2N - 2 slots).
+#define reconstruct_tree AIMER_NAMESPACE(reconstruct_tree)
+void reconstruct_tree(uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE],
+                      const uint8_t salt[AIMER_SALT_SIZE],
+                      const uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                      size_t rep_index,
+                      size_t cover_index);
+
+#endif // TREE_H
diff --git a/crypto_sign/aimer128f/m4stack/__asm_field.S b/crypto_sign/aimer128f/m4stack/__asm_field.S
new file mode 100644
index 00000000..05656b37
--- /dev/null
+++ b/crypto_sign/aimer128f/m4stack/__asm_field.S
@@ -0,0 +1,544 @@
+#include "params.h"
+
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+.text
+
+.align  2
+.global AIMER_NAMESPACE(GF_to_bytes)
+.global AIMER_NAMESPACE(GF_from_bytes)
+.global AIMER_NAMESPACE(GF_copy)
+.type   AIMER_NAMESPACE(GF_to_bytes), %function
+.type   AIMER_NAMESPACE(GF_from_bytes), %function
+.type   AIMER_NAMESPACE(GF_copy), %function
+AIMER_NAMESPACE(GF_to_bytes):
+AIMER_NAMESPACE(GF_from_bytes):
+AIMER_NAMESPACE(GF_copy):
+  out_p       .req R0
+  in_p        .req R1
+  // The three labels share this body: copy four 32-bit words from in_p to out_p.
+  .equ width, 4
+  // R2 and R3 are the only scratch registers; nothing is saved on the stack.
+  ldr.w R2, [in_p, #0 * width]
+  ldr.w R3, [in_p, #1 * width]
+  str.w R2, [out_p, #0 * width]
+  str.w R3, [out_p, #1 * width]
+
+  ldr.w R2, [in_p, #2 * width]
+  ldr.w R3, [in_p, #3 * width]
+  str.w R2, [out_p, #2 * width]
+  str.w R3, [out_p, #3 * width]
+
+  bx    lr
+
+  .unreq in_p
+  .unreq out_p
+
+.align  2
+.global AIMER_NAMESPACE(GF_set0)
+.type   AIMER_NAMESPACE(GF_set0), %function
+AIMER_NAMESPACE(GF_set0):
+  out_p       .req R0
+  // Stores zero to the four 32-bit words at out_p; R2 is the only scratch.
+  .equ width, 4
+
+  mov.w R2, #0
+  str.w R2, [out_p, #0 * width]
+  str.w R2, [out_p, #1 * width]
+  str.w R2, [out_p, #2 * width]
+  str.w R2, [out_p, #3 * width]
+
+  bx    lr
+
+  .unreq out_p
+
+.align  2
+.global AIMER_NAMESPACE(GF_add)
+.type   AIMER_NAMESPACE(GF_add), %function
+AIMER_NAMESPACE(GF_add):
+  out_p       .req R0
+  in0_p       .req R1
+  in1_p       .req R2
+  // out = in0 XOR in1 over four 32-bit words; scratch registers are R3 and R12.
+  .equ width, 4
+  // Each word is loaded, combined and stored before the next one is read.
+  ldr.w R3,  [in0_p, #0 * width]
+  ldr.w R12, [in1_p, #0 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #0 * width]
+
+  ldr.w R3,  [in0_p, #1 * width]
+  ldr.w R12, [in1_p, #1 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #1 * width]
+
+  ldr.w R3,  [in0_p, #2 * width]
+  ldr.w R12, [in1_p, #2 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #2 * width]
+
+  ldr.w R3,  [in0_p, #3 * width]
+  ldr.w R12, [in1_p, #3 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #3 * width]
+
+  bx    lr
+
+  .unreq out_p
+  .unreq in0_p
+  .unreq in1_p
+
+.macro or_shift_and in_a, con_a, off_a
+  orr.w \in_a, \in_a, \in_a, lsl #\off_a  // in_a |= in_a << off_a
+  and.w \in_a, \in_a, \con_a              // in_a &= con_a
+.endm
+
+.align  2
+.global AIMER_NAMESPACE(GF_sqr_s)
+.type   AIMER_NAMESPACE(GF_sqr_s), %function
+AIMER_NAMESPACE(GF_sqr_s):
+  out_p       .req R0
+  in_p        .req R1
+  // Reads four words at in_p, writes four words at out_p; R4-R10 are saved.
+  in0         .req R2
+  in1         .req R3
+  in2         .req R4
+  in3         .req R5
+  in4         .req R6
+  in5         .req R7
+  in6         .req R8
+  in7         .req R9
+  // C0-C3 are the masks for the four or_shift_and rounds; C4 is all ones.
+  .equ C0,    1431655765  // 0x55555555
+  .equ C1,    858993459   // 0x33333333
+  .equ C2,    252645135   // 0x0F0F0F0F
+  .equ C3,    16711935    // 0x00FF00FF
+  .equ C4,    4294967295  // 0xFFFFFFFF
+  .equ width, 4
+
+  push.w {R4-R10, lr}
+
+  ldr.w in0, [in_p, #0 * width]  // a[0]
+  ldr.w in2, [in_p, #1 * width]
+  ldr.w in4, [in_p, #2 * width]  // a[1]
+  ldr.w in6, [in_p, #3 * width]
+  // odd-numbered registers take the upper 16 bits of each input word
+  lsr.w in1, in0, #16
+  lsr.w in3, in2, #16
+  lsr.w in5, in4, #16
+  lsr.w in7, in6, #16
+
+  mov.w R10, C4
+  // even-numbered registers keep the lower 16 bits (mask = C4 >> 16)
+  and.w in0, in0, R10, lsr #16
+  and.w in2, in2, R10, lsr #16
+  and.w in4, in4, R10, lsr #16
+  and.w in6, in6, R10, lsr #16
+  // four rounds (shift 8, 4, 2, 1) move bit i of each 16-bit half to bit 2 * i
+  or_shift_and in0, C3, 8
+  or_shift_and in1, C3, 8
+  or_shift_and in2, C3, 8
+  or_shift_and in3, C3, 8
+  or_shift_and in4, C3, 8
+  or_shift_and in5, C3, 8
+  or_shift_and in6, C3, 8
+  or_shift_and in7, C3, 8
+
+  or_shift_and in0, C2, 4
+  or_shift_and in1, C2, 4
+  or_shift_and in2, C2, 4
+  or_shift_and in3, C2, 4
+  or_shift_and in4, C2, 4
+  or_shift_and in5, C2, 4
+  or_shift_and in6, C2, 4
+  or_shift_and in7, C2, 4
+
+  or_shift_and in0, C1, 2
+  or_shift_and in1, C1, 2
+  or_shift_and in2, C1, 2
+  or_shift_and in3, C1, 2
+  or_shift_and in4, C1, 2
+  or_shift_and in5, C1, 2
+  or_shift_and in6, C1, 2
+  or_shift_and in7, C1, 2
+
+  or_shift_and in0, C0, 1
+  or_shift_and in1, C0, 1
+  or_shift_and in2, C0, 1
+  or_shift_and in3, C0, 1
+  or_shift_and in4, C0, 1
+  or_shift_and in5, C0, 1
+  or_shift_and in6, C0, 1
+  or_shift_and in7, C0, 1
+  // in0..in7 now hold the 256-bit square; fold the upper half (shifts 0, 1, 2, 7)
+  // t = temp[2] ^ ((temp[3] >> 57) ^ (temp[3] >> 62) ^ (temp[3] >> 63));
+  eor.w in4, in4, in7, lsr #25
+  eor.w in4, in4, in7, lsr #30
+  eor.w in4, in4, in7, lsr #31
+
+  // c[1] = temp[1] ^ temp[3];
+  eor.w in2, in2, in6
+  eor.w in3, in3, in7
+
+  // c[1] ^= (temp[3] << 7) | (t >> 57);
+  // c[1] ^= (temp[3] << 2) | (t >> 62);
+  // c[1] ^= (temp[3] << 1) | (t >> 63);
+  eor.w in2, in2, in5, lsr #25
+  eor.w in2, in2, in5, lsr #30
+  eor.w in2, in2, in5, lsr #31
+
+  eor.w in2, in2, in6, lsl #7
+  eor.w in2, in2, in6, lsl #2
+  eor.w in2, in2, in6, lsl #1
+
+  eor.w in3, in3, in6, lsr #25
+  eor.w in3, in3, in6, lsr #30
+  eor.w in3, in3, in6, lsr #31
+
+  eor.w in3, in3, in7, lsl #7
+  eor.w in3, in3, in7, lsl #2
+  eor.w in3, in3, in7, lsl #1
+
+  // c[0] = temp[0] ^ t;
+  eor.w in0, in0, in4
+  eor.w in1, in1, in5
+
+  // c[0] ^= (t << 7);
+  // c[0] ^= (t << 2);
+  // c[0] ^= (t << 1);
+  eor.w in0, in0, in4, lsl #7
+  eor.w in0, in0, in4, lsl #2
+  eor.w in0, in0, in4, lsl #1
+
+  eor.w in1, in1, in4, lsr #25
+  eor.w in1, in1, in4, lsr #30
+  eor.w in1, in1, in4, lsr #31
+
+  eor.w in1, in1, in5, lsl #7
+  eor.w in1, in1, in5, lsl #2
+  eor.w in1, in1, in5, lsl #1
+  // the reduced result is in0..in3, least significant word first
+  str.w in0, [out_p, #0 * width]
+  str.w in1, [out_p, #1 * width]
+  str.w in2, [out_p, #2 * width]
+  str.w in3, [out_p, #3 * width]
+
+  pop.w {R4-R10, pc}
+
+  // unlink register name (NOTE(review): in0-in7 are not released here)
+  .unreq in_p
+  .unreq out_p
+
+.macro lut_access0_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  and \sp1, \mask, \b0_1, lsr #\offset
+  and \sp0, \mask, \b0_0, lsr #\offset
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+  // first window: load the accumulator, the b0_1 entry goes one word higher
+  ldr \out1_0, [\sp1, #0]
+  ldr \in0_2, [\sp1, #4]
+  // in0_0..in0_2 = entry(b0_0) ^ (entry(b0_1) << 32); in0_3 is not set yet
+  ldr \in0_0, [\sp0, #0]
+  ldr \in0_1, [\sp0, #4]
+  eor \in0_1, \in0_1, \out1_0
+.endm
+
+.macro lut_access0_1_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  and \sp1, \mask, \b0_1, lsr #\offset
+  and \sp0, \mask, \b0_0, lsr #\offset
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+  // in0_3 is created here from the bits shifted out of in0_2
+  lsr \in0_3, \in0_2, #28
+  ldr \out1_0, [\sp1, #0]
+  ldr \out1_1, [\sp1, #4]
+  // the accumulator in0_0..in0_2 moves left by 4 bits, interleaved with loads
+  lsl \in0_2, \in0_2, #4
+  ldr \out0_0, [\sp0, #0]
+  ldr \out0_1, [\sp0, #4]
+
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  lsl \in0_1, \in0_1, #4
+  orr \in0_1, \in0_1, \in0_0, lsr #28
+  lsl \in0_0, \in0_0, #4
+  // XOR in the b0_0 entry at words 0-1 and the b0_1 entry at words 1-2
+  eor \in0_1, \in0_1, \out1_0
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro lut_access0_1 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  and \sp1, \mask, \b0_1, lsr #\offset
+  and \sp0, \mask, \b0_0, lsr #\offset
+  lsl \in0_3, \in0_3, #4
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+  // the four-word accumulator moves left by 4 bits, interleaved with the loads
+  orr \in0_3, \in0_3, \in0_2, lsr #28
+  ldr \out1_0, [\sp1, #0]
+  ldr \out1_1, [\sp1, #4]
+
+  lsl \in0_2, \in0_2, #4
+  ldr \out0_0, [\sp0, #0]
+  ldr \out0_1, [\sp0, #4]
+
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  lsl \in0_1, \in0_1, #4
+  orr \in0_1, \in0_1, \in0_0, lsr #28
+  lsl \in0_0, \in0_0, #4
+  // XOR in the b0_0 entry at words 0-1 and the b0_1 entry at words 1-2
+  eor \in0_1, \in0_1, \out1_0
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro lut_access1 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  lsl \in0_3, \in0_3, #4
+  and \sp1, \mask, \b0_1, lsl #\offset
+  orr \in0_3, \in0_3, \in0_2, lsr #28
+  and \sp0, \mask, \b0_0, lsl #\offset
+  // same step as lut_access0_1, but the entry offset comes from a LEFT shift of b
+  lsl \in0_2, \in0_2, #4
+  add \sp1, \sp1, sp
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  add \sp0, \sp0, sp
+  // table entries are fetched with ldmia instead of two ldr
+  lsl   \in0_1, \in0_1, #4
+  ldmia \sp1, {\out1_0-\out1_1}
+  orr   \in0_1, \in0_1, \in0_0, lsr #28
+  ldmia \sp0, {\out0_0-\out0_1}
+  lsl   \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, offset
+  and \mask, \one, \r0_ret, lsr #\offset
+  sub \mask, \zero, \mask
+  and \mask0_1, \b0_0, \mask
+  and \mask0_2, \b0_1, \mask
+  // mask is all ones when bit offset of r0_ret is set, else zero; then shift b left
+  lsl \mask0_0, \mask0_1, #\offset
+  lsr \mask0_1, \mask0_1, #32 - \offset
+  orr \mask0_1, \mask0_1, \mask0_2, lsl #\offset
+  lsr \mask0_2, \mask0_2, #32 - \offset
+  // XOR the masked, shifted b into accumulator words 1-3
+  eor \in0_1, \in0_1, \mask0_0
+  eor \in0_2, \in0_2, \mask0_1
+  eor \in0_3, \in0_3, \mask0_2
+.endm
+
+.macro last_mask1 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, offset
+  sub \mask, \zero, \r0_ret, lsr #\offset
+  and \mask0_1, \b0_0, \mask
+  and \mask0_2, \b0_1, \mask
+  // mask = zero - (r0_ret >> offset), with no AND against one; used with offset 31
+  lsl \mask0_0, \mask0_1, #\offset
+  lsr \mask0_1, \mask0_1, #32 - \offset
+  orr \mask0_1, \mask0_1, \mask0_2, lsl #\offset
+  lsr \mask0_2, \mask0_2, #32 - \offset
+  // XOR the masked, shifted b into accumulator words 1-3
+  eor \in0_1, \in0_1, \mask0_0
+  eor \in0_2, \in0_2, \mask0_1
+  eor \in0_3, \in0_3, \mask0_2
+.endm
+
+.global AIMER_NAMESPACE(poly64_mul)
+.type   AIMER_NAMESPACE(poly64_mul), %function
+AIMER_NAMESPACE(poly64_mul):
+  t0_0    .req R0
+  t0_1    .req R1
+  t1_0    .req R2
+  t1_1    .req R3
+  t2_0    .req R4
+  t2_1    .req R5
+  t3_0    .req R6
+  t3_1    .req R7
+  t4_0    .req R8
+  t4_1    .req R9
+  t5_0    .req R10
+  t5_1    .req R11
+  // The aliases below reuse physical registers across the phases of the routine.
+  r1_copy .req R12
+  t_base  .req R14  // write cursor while the table is built
+
+  sp0     .req R12
+  sp1     .req R14
+
+  b0_0    .req R0
+  b0_1    .req R1
+  // in0_0..in0_3: four-word accumulator, in0_0 is the least significant word
+  in0_0   .req R2
+  in0_1   .req R3
+  in0_2   .req R4
+  in0_3   .req R5
+
+  out0_0  .req R6
+  out0_1  .req R7
+  out1_0  .req R8
+  out1_1  .req R9
+
+  mask    .req R10
+
+  zero    .req R6
+  one     .req R7
+
+  r0_ret  .req R8
+
+  mask0_0 .req R9
+  mask0_1 .req R11
+  mask0_2 .req R12
+  // On entry R0 and R1 point to the two 64-bit inputs; R2 and R3 are output pointers.
+  push  {R4-R11, lr}
+  push  {R2-R3}
+  // load the first operand; its upper word is kept on the stack for last_mask*
+  ldr   t1_0, [R0, #0]
+  ldr   t1_1, [R0, #4]
+  push  {t1_1}
+
+  sub   sp, #128  // allocating space in the stack
+                  // 8 bytes * 16 = 128 bytes
+  mov   t_base, sp
+  mov   r1_copy, R1
+
+  mov   t0_0, #0
+  mov   t0_1, #0
+  // the table is built with the top three bits of the first operand cleared
+  and   t1_1, #0x1FFFFFFF
+
+  lsl   t2_1, t1_1, #1
+  orr   t2_1, t2_1, t1_0, lsr #31
+  lsl   t2_0, t1_0, #1
+
+  eor   t3_0, t1_0, t2_0
+  eor   t3_1, t1_1, t2_1
+
+  lsl   t4_1, t2_1, #1
+  orr   t4_1, t4_1, t2_0, lsr #31
+  lsl   t4_0, t2_0, #1
+
+  eor   t5_0, t1_0, t4_0
+  eor   t5_1, t1_1, t4_1
+
+  stmia t_base!, {t0_0-t5_1}  // 4 bytes X 12 elements  = 48 bytes
+
+  eor   t0_0, t2_0, t4_0
+  eor   t0_1, t2_1, t4_1
+
+  lsl   t2_1, t4_1, #1
+  orr   t2_1, t2_1, t4_0, lsr #31
+  lsl   t2_0, t4_0, #1
+
+  eor   t5_0, t2_0, t3_0
+  eor   t5_1, t2_1, t3_1
+
+  eor   t3_0, t2_0, t1_0
+  eor   t3_1, t2_1, t1_1
+
+  eor   t4_0, t4_0, t0_0
+  eor   t4_1, t4_1, t0_1
+
+  eor   t4_0, t4_0, t2_0
+  eor   t4_1, t4_1, t2_1
+
+  eor   t1_0, t1_0, t0_0
+  eor   t1_1, t1_1, t0_1
+
+  stmia t_base!, {t0_0-t5_1}  // 4 bytes X 12 elements  = 48 bytes
+
+  eor   t1_0, t5_0, t0_0
+  eor   t1_1, t5_1, t0_1
+
+  eor   t2_0, t2_0, t0_0
+  eor   t2_1, t2_1, t0_1
+
+  eor   t3_0, t3_0, t0_0
+  eor   t3_1, t3_1, t0_1
+
+  eor   t0_0, t4_0, t0_0
+  eor   t0_1, t4_1, t0_1
+
+  stmia t_base, {t0_0-t3_1}  // 4 bytes X 8 elements  = 32 bytes
+  // second operand into b0_0/b0_1; mask 0x78 turns a 4-bit window into an 8-byte offset
+  ldmia r1_copy, {b0_0-b0_1}
+  mov   mask, #0x00000078
+  // windows are consumed from the top nibble of each word down to the lowest
+  lut_access0_0   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 25
+  lut_access0_1_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 21
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 17
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 13
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 9
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 5
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 1
+  lut_access1     sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 3
+  // account for bits 29-31 of the first operand's upper word (cleared for the table)
+  mov   zero, #0
+  mov   one,  #1
+  ldr   r0_ret, [sp, #128]
+
+  last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 29
+  last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 30
+  last_mask1 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 31
+  // reload the saved output pointers: R0 = entry R2, R1 = entry R3
+  ldr   R0, [sp, #132]
+  ldr   R1, [sp, #136]
+  add   sp, #140  // restoring stack
+  // lower two result words go to the entry-R3 pointer
+  str   in0_0, [R1, #0]
+  str   in0_1, [R1, #4]
+  // upper two result words go to the entry-R2 pointer
+  str   in0_2, [R0, #0]
+  str   in0_3, [R0, #4]
+
+  pop   {R4-R11, pc}
+
+  // unlink register name
+  .unreq t0_0
+  .unreq t0_1
+  .unreq t1_0
+  .unreq t1_1
+  .unreq t2_0
+  .unreq t2_1
+  .unreq t3_0
+  .unreq t3_1
+  .unreq t4_0
+  .unreq t4_1
+  .unreq t5_0
+  .unreq t5_1
+
+  .unreq r1_copy
+  .unreq t_base
+
+  .unreq sp0
+  .unreq sp1
+
+  .unreq b0_0
+  .unreq b0_1
+
+  .unreq in0_0
+  .unreq in0_1
+  .unreq in0_2
+  .unreq in0_3
+
+  .unreq out0_0
+  .unreq out0_1
+  .unreq out1_0
+  .unreq out1_1
+
+  .unreq mask
+
+  .unreq zero
+  .unreq one
+
+  .unreq r0_ret
+
+  .unreq mask0_0
+  .unreq mask0_1
+  .unreq mask0_2
diff --git a/crypto_sign/aimer128f/m4stack/aim2.c b/crypto_sign/aimer128f/m4stack/aim2.c
new file mode 100644
index 00000000..7cb00352
--- /dev/null
+++ b/crypto_sign/aimer128f/m4stack/aim2.c
@@ -0,0 +1,459 @@
+// SPDX-License-Identifier: MIT
+
+#include "aim2.h"
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+// inverse Mersenne S-box with e1 = 49: out = in ^ E, computed by a fixed chain
+// E = (2 ^ 49 - 1) ^ (-1) mod (2 ^ 128 - 1) = 0xb6b6d6d6dadb5b5b6b6b6d6dadadb5b5
+// hex chunks appended per step: b6b6d6d 6 d a d b5 b5 b6 b6b6d6d a d a d b5 b5
+void GF_exp_invmer_e_1(GF out, const GF in)
+{
+  size_t i;
+  GF t1 = {0,}, t2 = {0,};
+  GF table_5 = {0,}, table_6 = {0,};
+  GF table_a = {0,}, table_b = {0,}, table_d = {0,};
+
+  // table_d = in ^ 2 (temporary), t1 = in ^ 4
+  GF_sqr_s(table_d, in);
+  GF_sqr_s(t1, table_d);
+
+  // table_5 = in ^ 5
+  GF_mul_s(table_5, t1, in);
+  // table_6 = in ^ 6
+  GF_mul_s(table_6, table_5, in);
+  // table_a = in ^ 10 = (in ^ 5) ^ 2
+  GF_sqr_s(table_a, table_5);
+  // table_b = in ^ 11
+  GF_mul_s(table_b, table_a, in);
+  // table_d = in ^ 13 = (in ^ 11) * (in ^ 2)
+  GF_mul_s(table_d, table_b, table_d);
+
+  // table_b = in ^ (0xb6), table_5 = in ^ (0xb5); t1 = in ^ (0xb0) in between
+  GF_sqr_s(t1, table_b);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(table_b, t1, table_6);
+  GF_mul_s(table_5, t1, table_5);
+
+  // t1 = in ^ (0xb6b6)
+  GF_sqr_s(t1, table_b);
+  for (i = 1; i < 8; i++) // 7 more squarings, 8 in total
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xb6b6 d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6b6d 6)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_6);
+
+  // t2 = in ^ (0xb6b6d6 d), kept for reuse further down the chain
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t2, t1, table_d);
+
+  // t1 = in ^ (0xb6b6d6d 6)
+  GF_sqr_s(t1, t2);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_6);
+
+  // t1 = in ^ (0xb6b6d6d6 d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6b6d6d6d a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xb6b6d6d6da d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6b6d6d6dad b5)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5 b5)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5 b6)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5b6 b6b6d6d)
+  for (i = 0; i < 28; i++) // 28 squarings = 7 hex digits, then append t2
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5b6b6b6d6d a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5b6b6b6d6da d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5b6b6b6d6dad a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5b6b6b6d6dada d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5b6b6b6d6dadad b5)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_5);
+
+  // out = in ^ (0xb6b6d6d6dadb5b5b6b6b6d6dadadb5 b5)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(out, t1, table_5);
+}
+
+// inverse Mersenne S-box with e2 = 91: out = in ^ E, computed by a fixed chain
+// E = (2 ^ 91 - 1) ^ (-1) mod (2 ^ 128 - 1) = 0xb6db5b6dadb6dadb6d6db6d6db6b6db5
+// hex chunks appended per step: b6d b5 b6d a d b6d a d b6d 6 d b6d 6 d b6 b6d b5
+void GF_exp_invmer_e_2(GF out, const GF in)
+{
+  size_t i;
+  GF t1 = {0,}, t2 = {0,}, t3 = {0,};
+  GF table_5 = {0,}, table_6 = {0,};
+  GF table_a = {0,}, table_b = {0,}, table_d = {0,};
+
+  // table_d = in ^ 2 (temporary), t1 = in ^ 4
+  GF_sqr_s(table_d, in);
+  GF_sqr_s(t1, table_d);
+
+  // table_5 = in ^ 5
+  GF_mul_s(table_5, t1, in);
+  // table_6 = in ^ 6
+  GF_mul_s(table_6, table_5, in);
+  // table_a = in ^ 10 = (in ^ 5) ^ 2
+  GF_sqr_s(table_a, table_5);
+  // table_b = in ^ 11
+  GF_mul_s(table_b, table_a, in);
+  // table_d = in ^ 13 = (in ^ 11) * (in ^ 2)
+  GF_mul_s(table_d, table_b, table_d);
+
+  // t3 = in ^ (0xb6), table_b = in ^ (0xb5); t1 = in ^ (0xb0) in between
+  GF_sqr_s(t1, table_b);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(table_b, t1, table_5);
+  GF_mul_s(t3, t1, table_6);
+
+  // t2 = in ^ (0xb6 d), reused for every b6d chunk below
+  GF_sqr_s(t1, t3);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t2, t1, table_d);
+
+  // t1 = in ^ (0xb6d b5)
+  GF_sqr_s(t1, t2);
+  for (i = 1; i < 8; i++) // 7 more squarings, 8 in total
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xb6db5 b6d)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xb6db5b6d a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xb6db5b6da d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6db5b6dad b6d)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xb6db5b6dadb6d a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xb6db5b6dadb6da d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6db5b6dadb6dad b6d)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d 6)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_6);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d6 d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d6d b6d)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d6db6d 6)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_6);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d6db6d6 d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d6db6d6d b6)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t3);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d6db6d6db6 b6d)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // out = in ^ (0xb6db5b6dadb6dadb6d6db6d6db6b6d b5)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(out, t1, table_b);
+}
+
+// Mersenne S-box: out = in ^ (2 ^ e_star - 1) with e_star = 3, i.e. in ^ 7
+void GF_exp_mer_e_star(GF out, const GF in)
+{
+  GF acc = {0, 0};
+
+  // acc = in ^ (2 ^ 2 - 1): square, then multiply by in
+  GF_sqr_s(acc, in);
+  GF_mul_s(acc, acc, in);
+
+  // out = in ^ (2 ^ 3 - 1): square once more, then multiply by in again
+  GF_sqr_s(acc, acc);
+  GF_mul_s(out, acc, in);
+}
+
+void generate_matrices_L_and_U(
+        GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF vector_b,
+        const uint8_t iv[AIM2_IV_SIZE])
+{
+  // Derives one L/U row pair per hash squeeze from iv, and finally vector_b.
+  uint8_t row_bytes[AIM2_NUM_BYTES_FIELD];
+  hash_instance xof;
+  GF sample = {0, 0};
+
+  // initialize hash with the iv
+  hash_init(&xof);
+  hash_update(&xof, iv, AIM2_IV_SIZE);
+  hash_final(&xof);
+
+  for (size_t num = 0; num < AIM2_NUM_INPUT_SBOX; num++)
+  {
+    for (size_t row = 0; row < AIM2_NUM_BITS_FIELD; row++)
+    {
+      hash_squeeze(&xof, row_bytes, AIM2_NUM_BYTES_FIELD);
+      GF_from_bytes(sample, row_bytes);
+
+      const size_t diag = row / 64;
+      const uint64_t pivot = ((uint64_t)1) << (row % 64);
+      const uint64_t keep_hi = ((uint64_t)-1) << (row % 64);
+      for (size_t w = 0; w < diag; w++)
+      {
+        // words before the diagonal word: L is zero, U takes the sample
+        matrix_L[num][row][w] = 0;
+        matrix_U[num][row][w] = sample[w];
+      }
+      // diagonal word: sample split at the diagonal bit, which is forced to 1
+      matrix_L[num][row][diag] = (sample[diag] & keep_hi) | pivot;
+      matrix_U[num][row][diag] = (sample[diag] & ~keep_hi) | pivot;
+      for (size_t w = diag + 1; w < AIM2_NUM_WORDS_FIELD; w++)
+      {
+        // words after the diagonal word: L takes the sample, U is zero
+        matrix_L[num][row][w] = sample[w];
+        matrix_U[num][row][w] = 0;
+      }
+    }
+  }
+
+  // NOTE(review): squeezed straight into the words of vector_b, without the
+  // GF_from_bytes() step used for the rows above -- confirm this is intended.
+  hash_squeeze(&xof, (uint8_t *)vector_b, AIM2_NUM_BYTES_FIELD);
+  hash_ctx_release(&xof);
+}
+
+void generate_matrix_LU(GF matrix_A[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+                        GF vector_b,
+                        const uint8_t iv[AIM2_IV_SIZE])
+{
+  GF lower[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF upper[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+
+  // derive both factors (and vector_b) from iv, then combine them row by row
+  generate_matrices_L_and_U(lower, upper, vector_b, iv);
+  for (size_t num = 0; num < AIM2_NUM_INPUT_SBOX; num++)
+  {
+    const GF *lower_rows = (const GF *)lower[num];
+    for (size_t row = 0; row < AIM2_NUM_BITS_FIELD; row++)
+    {
+      GF_transposed_matmul(matrix_A[num][row], upper[num][row], lower_rows);
+    }
+  }
+}
+
+void aim2(uint8_t ct[AIM2_NUM_BYTES_FIELD],
+          const uint8_t pt[AIM2_NUM_BYTES_FIELD],
+          const uint8_t iv[AIM2_IV_SIZE])
+{
+  // Evaluates AIM2 on pt under iv: two S-box branches, the iv-derived affine
+  // layer, the Mersenne S-box and a feed-forward; the result is written to ct.
+  GF lower[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF upper[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF offset = {0, 0};
+
+  GF branch[AIM2_NUM_INPUT_SBOX];
+  GF input = {0, 0}, output = {0, 0};
+
+  // generate random matrix factors and the constant vector, load the input
+  generate_matrices_L_and_U(lower, upper, offset, iv);
+  GF_from_bytes(input, pt);
+
+  // branch 0: constant addition, inverse Mersenne S-box e_1, then U and L
+  GF_add(branch[0], input, aim2_constants[0]);
+  GF_exp_invmer_e_1(branch[0], branch[0]);
+  GF_transposed_matmul(branch[0], branch[0], (const GF *)upper[0]);
+  GF_transposed_matmul(branch[0], branch[0], (const GF *)lower[0]);
+
+  // branch 1: constant addition, inverse Mersenne S-box e_2, then U and L
+  GF_add(branch[1], input, aim2_constants[1]);
+  GF_exp_invmer_e_2(branch[1], branch[1]);
+  GF_transposed_matmul(branch[1], branch[1], (const GF *)upper[1]);
+  GF_transposed_matmul(branch[1], branch[1], (const GF *)lower[1]);
+
+  // affine layer, final part: merge both branches and add the constant vector
+  GF_add(branch[0], branch[0], branch[1]);
+  GF_add(branch[0], branch[0], offset);
+
+  // non-linear component: Mersenne S-box
+  GF_exp_mer_e_star(branch[0], branch[0]);
+
+  // linear component: feed-forward of the input
+  GF_add(output, branch[0], input);
+
+  GF_to_bytes(ct, output);
+}
+
+void aim2_sbox_outputs(GF sbox_outputs[AIM2_NUM_INPUT_SBOX], const GF pt)
+{
+  // linear component: constant addition, one constant per S-box (as in aim2())
+  GF_add(sbox_outputs[0], pt, aim2_constants[0]);
+  GF_add(sbox_outputs[1], pt, aim2_constants[1]);
+
+  // non-linear component: inverse Mersenne S-box, applied in place to each entry
+  GF_exp_invmer_e_1(sbox_outputs[0], sbox_outputs[0]);
+  GF_exp_invmer_e_2(sbox_outputs[1], sbox_outputs[1]);
+}
diff --git a/crypto_sign/aimer128f/m4stack/aim2.h b/crypto_sign/aimer128f/m4stack/aim2.h
new file mode 100644
index 00000000..5564fc71
--- /dev/null
+++ b/crypto_sign/aimer128f/m4stack/aim2.h
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef AIM2_H
+#define AIM2_H
+
+#include "field.h"
+#include "params.h"
+#include <stdint.h>
+
+static const GF aim2_constants[AIM2_NUM_INPUT_SBOX] = // added to pt ahead of S-box i
+{
+  {0x13198a2e03707344, 0x243f6a8885a308d3}, // [1]:[0] = hex digits 1-32 of pi's fraction
+  {0x082efa98ec4e6c89, 0xa4093822299f31d0}, // [1]:[0] = hex digits 33-64 of pi's fraction
+};
+
+static const GF aim2_e1_power_matrix[AIM2_NUM_BITS_FIELD] =
+{
+  {0x0000000000000001, 0x0000000000000000},
+  {0xb87c1159421de6c0, 0xfbcf8c1e442c8cf5},
+  {0x687634c0bd8f66a6, 0x4d328e5ae8b1bde5},
+  {0x742a6036d93c2057, 0x08974511b147a2fe},
+  {0xc8b21bf16608e4db, 0x4d758c29eeb484f7},
+  {0x0b5c6d5c43980a3c, 0x82739c986dfbdb20},
+  {0x0ace7f98da3711b9, 0x34f149a76cf782b0},
+  {0x321995ec53ea9914, 0xc2ff5007f8a98c83},
+  {0x939b53119c4b7496, 0x097da6d2e8f7686d},
+  {0x5fb6dd3ca90cff95, 0x10f77bb9e7748ed3},
+  {0x55194932141d0937, 0xc253f8ea7ac0779a},
+  {0xb2a4b4591251916b, 0xdfef8e3e1b142c07},
+  {0x14df24dfc33e1f4f, 0x931f7bdb443197a1},
+  {0xbd4cbe8b919dbb07, 0x24128da6bf057bc8},
+  {0x1be6a922a8d0d7d4, 0xb7330162b6115e90},
+  {0xb6d9e6635ec916aa, 0x930f20cea1c668e0},
+  {0xccbb31a458da0423, 0x60488351c7403436},
+  {0xef86b4dbc4263e4d, 0x9237f55823767eae},
+  {0xe2a0e301bed0748a, 0x967e64f599297f3c},
+  {0x2fde9314f05105e5, 0x58f5315e0e29e358},
+  {0xc9e5b15be18b7596, 0xa305f4f11aaa8ad2},
+  {0xa592cb3563071925, 0x31b050cca997ed24},
+  {0xa55f9e7374b10af1, 0x5904c31aaebea1ed},
+  {0xcf6921d88d12bbf2, 0xea5142776b77d368},
+  {0x28779ef24c9ddcb5, 0x448bfd74cc624506},
+  {0x0d2caf1924759d9a, 0xc66ef14828e98e80},
+  {0x312a49ac8d3790db, 0x5121956dac40960a},
+  {0x311230a0f0166f37, 0x41cdda4642d1e45a},
+  {0x152cd68f8d980779, 0x50accd8f44cc6a3b},
+  {0x0e6342e6e178a202, 0xaf2e59b6e13fec01},
+  {0xcdfaea274cfff823, 0x008f7a68483d8f8a},
+  {0x80183f4571309485, 0xdece92499f9521ba},
+  {0xaba321469362905f, 0x3c5814a4c792b3be},
+  {0x7e8680766e1d3ffc, 0x7585a167f0b843b8},
+  {0x4e81e572c5dbf79a, 0x114bd1d466ef8787},
+  {0x3a7e0a403a1da600, 0x014747267c0b38f9},
+  {0x23116c4dd539e293, 0x196284a6305e23d7},
+  {0xf0a02f00d5a45c0f, 0xae9980fc3aa3cd2e},
+  {0x7eae2c6dae8286e9, 0xd2be72a1da8addbc},
+  {0xbb8689cb630a9e23, 0x2d1eb9e86163e7f4},
+  {0xf0febfb8f6e46561, 0x8eda5ccd665a3ac2},
+  {0x370a6880719f8be9, 0x83fe14fe68c33df0},
+  {0xe9634dd58474116f, 0xdfb51a0ca76c9c82},
+  {0x9c40da32ca69fe52, 0xcecdf64c8559eef4},
+  {0xe29f358edce8d40e, 0x9256190cf3cfb1fa},
+  {0xb5431f672597e9cd, 0xc69025ae5a99210a},
+  {0x0f00e0c670d40d95, 0xdf81e3ce7617b0a1},
+  {0x699332d099ea38d7, 0xc24d5671c235f28e},
+  {0x89ea2f4529a74b45, 0x7c11f6654369b65d},
+  {0xeaa8470e44915e89, 0x049b62170967135b},
+  {0x39fb9877aadc951b, 0xba3743d76fda5083},
+  {0xe2da8722532e6fbb, 0xdef2a5ff6e028aba},
+  {0xb5e975340c6c76a1, 0xf28418e25fbc0144},
+  {0x035ab9363f6882be, 0xab56f227d4a26a26},
+  {0x273536b8b02dd5f1, 0x75af981a11d43e64},
+  {0x846e480a8bc44fa9, 0x507a048207335fa1},
+  {0x3808d8fa4fcba922, 0xf632f1c9c802ab76},
+  {0x34ecb7872eda1962, 0x2dcfedd3c12f73be},
+  {0xf884a540c1b411ea, 0xf77d23a1c6600553},
+  {0x0e106a0239843e3c, 0x7d5ef83763344eed},
+  {0x4192e743be4ae7d9, 0x5070be659c9249dd},
+  {0x6588c07b62dd03ba, 0x09d7b6469e953856},
+  {0x790b4af55db42c92, 0x5c859acd40414177},
+  {0xedda860c739ca8f7, 0xd728f7e92e3e7940},
+  {0xfbcf513b18b860f7, 0xf6fd92c58b52c44d},
+  {0x4f1571762119854e, 0x04286d00eb347197},
+  {0x3f777b9977ed2aa6, 0xf68288c09c8d73d4},
+  {0x538b16a3bd887a20, 0x86437c4cb491c94b},
+  {0x3656d64f9fdf8baf, 0x97db137363bf2a7e},
+  {0x0582fbdad31a1e6c, 0x213b4a759760ffe0},
+  {0xc7f42208feff0a47, 0x05cb6fb77aad0666},
+  {0x8f59c644fd5259d4, 0xd3740dabc91a5ecd},
+  {0xca19d9ef4ab67cee, 0xa2486f3cdc03c63f},
+  {0x8a1f14a7c3d2f88f, 0x71b6e4a0b3d4a2a0},
+  {0xe9ee9aa288652690, 0xa28d2266c47e02b2},
+  {0x759c7eee1a3eead3, 0x689aa81596670031},
+  {0x50a9a3f15e0032ae, 0x206b34f2ed6fc8ff},
+  {0x630774b85c40302e, 0xf7f5952347d531a6},
+  {0x78886ef4e794267b, 0x7072ec9b3a2ddd8f},
+  {0x754c7bf46deec1a2, 0xb360d5ec03ebf053},
+  {0x337080ceace4b67a, 0xbe8541809bccdc7d},
+  {0x8c243c5d486009a0, 0x87fc6f3fbe554f61},
+  {0x58e8f3ccf2596f26, 0xc7a500e89b1b40a6},
+  {0x516a6cbee9e76420, 0xe719cb9a5a49f8ed},
+  {0x96f150816f90c216, 0x484947f2b48d7882},
+  {0xdefb92978dfa0053, 0x58823337d6c0a641},
+  {0x98bbc22dd2d3262b, 0xdad5891c70205c95},
+  {0xbf1d06e5edc7d9ba, 0xea3e0a86c4241c1e},
+  {0x78e2cf480abc18ef, 0x1110bc39a35669cf},
+  {0xc188299c1375e7b2, 0x8eb4cf8cb0851480},
+  {0xd0ec275048c667d2, 0xff5c57071581e3b1},
+  {0x955c8d54a50fdd52, 0xcf79008ac79991d3},
+  {0xf46cdcd85b7289c9, 0x1c5fc0acfab2cbb2},
+  {0x676f48ac3ed3c825, 0x862183d1a9042f4d},
+  {0xf35fc7982c7daee6, 0xa655183af862baae},
+  {0x5335bbcaf8b9f37b, 0x963ed04a2a0b3eaa},
+  {0x76d009714121cb10, 0x82f1d3e8253374ee},
+  {0x50198339f3198270, 0xee023bd013e359f5},
+  {0x315d27ea94c7941a, 0x5c1520117e098dbf},
+  {0x96ccc513ba987df7, 0x7d84bbe2e504ff94},
+  {0x03464584b630d2b7, 0x7d9fc4a633f228f4},
+  {0x7e39cbb756cac943, 0x45a5498048f1a474},
+  {0x56a90669f7aa29c6, 0x4883787b94c90425},
+  {0x9a262b27cb8de6e9, 0x6495beb53f905401},
+  {0xdc5866e0159b2920, 0x6c2c9c31b3faab04},
+  {0x82f93c693fec7b5f, 0x1926807fb1c2bdd5},
+  {0x3a06ca560fda4251, 0xff56ec036c5f13d6},
+  {0xcf96fe4ae095a1c3, 0xaea98fd960fd6b9d},
+  {0xc2ae3b23e1b73447, 0xe7c1f21b63d4e19c},
+  {0x660f92196e62044c, 0xa61e4689ac8893c0},
+  {0x4aacc983cc5d9cfe, 0xb71adc881811c258},
+  {0xb01938e5f92ea2e0, 0x3d4b38fea83810f8},
+  {0x8195527abb10f039, 0x242e99777aeec42a},
+  {0x077a36f6536baf7f, 0x928620c22f148a6d},
+  {0xb4d16665e8f965a6, 0x300ecf50c00b75a7},
+  {0x53d4fbf144350d5c, 0x50967628985e6eaf},
+  {0xea67291009feb48e, 0x74a182255aa9ccae},
+  {0xe67c52e63c97fb3a, 0xbe1b4991d245fa61},
+  {0x6bd8d3685ed38551, 0xc26bdd871e8691e5},
+  {0x267c4e3df39e0a7e, 0x2408058c7b3e3c09},
+  {0x2bc55550057b4b4e, 0xe70baa2724b374d3},
+  {0x0e2984947284c4dd, 0x4f4e64ba26bfee68},
+  {0x78891ea4bacdb828, 0x357f7d8801646f08},
+  {0x220a9cb569d1ee6b, 0x8e6c9653552802fa},
+  {0x6159359f74dda4d7, 0xcbd0c89374b1cc2e},
+  {0x8dd5a4c4fe55c89f, 0xeeca37f94d3f69bd},
+  {0x22abf1f68e0f314a, 0x69b86caf61d48d15},
+  {0xab26c59f1090455d, 0x1a49957d9798f177},
+};
+
+static const GF aim2_e2_power_matrix[AIM2_NUM_BITS_FIELD] =
+{
+  {0x0000000000000001, 0x0000000000000000},
+  {0xf50e0632f2a35f5b, 0x386db41096f62a8a},
+  {0x1843656b2ea8f397, 0xefdb454053648225},
+  {0xfc670d9cf3feeb63, 0x7582326d84c7a1de},
+  {0xf1c52011971b40b3, 0x864204566cee644d},
+  {0x5d8e354c13ae648b, 0x192b28f22b444709},
+  {0x9d5cef9c88eb0d9f, 0xb686d60b99470446},
+  {0xc91fa3a9b726fd99, 0xcf7a6d254a105b09},
+  {0x048e86e374780c55, 0x9f65220d0c78fc67},
+  {0xafa9c90017000acc, 0x83a4540ded360993},
+  {0x3e563c2c6efb6102, 0xb7147f0d38fa394e},
+  {0x858e694ad98264cb, 0x184d72cdc205efdb},
+  {0x260f2eae08292a50, 0x101cdb156939622a},
+  {0x4a9a43781e99484d, 0x8b9b7c41b6c639f6},
+  {0x16c9831c810a7459, 0xcb60c983013050be},
+  {0x96d02af1b8d2cba8, 0xb37b4c2c6ea27c34},
+  {0x3caadfab02ea679d, 0x6c3124a15e087d32},
+  {0xf0892e59955b87ae, 0xaab1aa69ba6853d8},
+  {0x8420916c212205ac, 0x86ed9039af31291c},
+  {0x0610fd444421f178, 0xa6b004a839e31b64},
+  {0xaebf5d9bae4e4ac1, 0x54bf9e6ec57b2d65},
+  {0x28bce750ebcba70e, 0x4ce04f578ca77d4d},
+  {0xe35d48d89312441e, 0xe6d91969fd74895a},
+  {0xcca901ef7fabb1c5, 0x117d2c0c4032a05b},
+  {0x4d05be0c6a5a2edc, 0x8314aecc100fcba9},
+  {0x7c685f4133a51825, 0x9acd72f51105c28b},
+  {0x5011fb2faa2c215a, 0xf33e2515d2bd65e4},
+  {0xcec542879e66d1d0, 0xb35dca22a0c3ce97},
+  {0x40849b4ce23375b2, 0x92453c68d163c3cb},
+  {0x807af8ab827e3617, 0x9aa0b258c13e1db7},
+  {0x02cf8f1292f7c659, 0x188599535df660bb},
+  {0x675c7dfe865c4b21, 0x60e7e01162356b69},
+  {0xdca8758ed620dd7b, 0x40e2dfc1450698ca},
+  {0xd4785af596fd0c85, 0x194dcdf10572a8d6},
+  {0x39c75c8db5a743fd, 0xaaff1be5fb825c25},
+  {0x76f287eaaf80a26e, 0x6d5c3d924e633b50},
+  {0xf3289f813d56fa87, 0x8a5781160603ae34},
+  {0x023097d7bf57b560, 0x5c09da41ceda1dab},
+  {0xaa7caa9af1506059, 0xd65b5a005d02edd2},
+  {0x837d13e5bec17d5c, 0x96732ad7e569d594},
+  {0xfca9d80d257930ce, 0xbb07355f7df706c0},
+  {0x7e719f925352363e, 0x61f17c3d17da7386},
+  {0xbdd686a4862a5d5b, 0x5ddbe9580f36ecca},
+  {0xcd8440580a8cb347, 0x8b395b802547e6d8},
+  {0x4338e255f15fc0d9, 0xf2400716b60d1c2c},
+  {0xc0a4a5181cf7a401, 0x208e7b27a3d4e578},
+  {0x6557dd7a9909844a, 0xd7dd867435b17ded},
+  {0xe7214501f52038cd, 0xf73bfe485cf7fdd0},
+  {0x93443a46972cbc70, 0xd2ca8f42b2d199e0},
+  {0xbea25cda0a9de799, 0x51886f07950aef32},
+  {0x82824ccfb37df72d, 0x71a58d7df86233f6},
+  {0x0ab442c2423ac6e3, 0x5d989eeb2df819bb},
+  {0x717b766d60dda065, 0x3899b1af41b28b8b},
+  {0x2fffad98c8e94310, 0x9ff893980c381280},
+  {0x9d7da6a6ca8c0d82, 0x09c78e0f83da5e2a},
+  {0x26b7e85d55753566, 0x48b0fee439062128},
+  {0x63896bb7a7a3c638, 0x551438e5f3ff5db9},
+  {0x080d9af5ef2e5865, 0x048eccc1b914ae50},
+  {0xf081a5f8ab004099, 0x24ffc9670c5492ac},
+  {0x7e4178c2bf375b5a, 0xa641e4982d1c8638},
+  {0x9f1874733c37691a, 0xa6e59883261af497},
+  {0x90068f05a814992d, 0x8f340c2ecb9a2bd2},
+  {0x2e0a82ad5f144c70, 0x783eb790b951d2d9},
+  {0xbf58c52a82e24af6, 0x190f49c97cd133af},
+  {0x29e30e4d37b882a1, 0x217bea750913f0db},
+  {0xfe2287c403984038, 0x870bd9dd397e696e},
+  {0x49e9bc6efdb97d7d, 0xf75f4c5e88587e96},
+  {0xa6223b70299d2836, 0xf27661ea227ab61b},
+  {0x4d6b8601ceb750cf, 0xfab6503eb520e48d},
+  {0xcaf2dd4a73f67c6c, 0x93f3baaf44fed4e0},
+  {0x1ff32e99fc57e662, 0x502b8bb6f2031150},
+  {0x1d8b5656e3d694dc, 0xb31de0d77f80372b},
+  {0x0f3d13aca2eac302, 0xb6d1f98a81d2cd6d},
+  {0x840d8615c90887b8, 0x1d44fc5efe63c574},
+  {0xded005c9eb05ef63, 0xdeb4246e55c121bb},
+  {0x3409b8d1c43c2415, 0x700c0d1dc307fe8d},
+  {0x8b361337911e3002, 0x7920c1039098414c},
+  {0xa5dddabdd1beecf5, 0x146aaf12b0d6da5e},
+  {0xe1a91d6f2a874e47, 0x0d63fcc83ef069d1},
+  {0x0ffd9177c1f3ebb2, 0x9a0cadce706c0cc0},
+  {0xc60d34aa0f45f13d, 0x2d0b4ea8c2bfdc70},
+  {0x83e36503d6399610, 0x6014c0c7cba2d2f2},
+  {0x9cc705d2ecaeca0e, 0x79f83e8c83e7f333},
+  {0x58c7035772444ccc, 0x789c6687005b995e},
+  {0x6b3d950394c886a1, 0x9b4f4564cd5b92d0},
+  {0x872c7f29b6dae6ca, 0xd2a320a97a0d0be9},
+  {0x14bb3a90e34016b2, 0xb308fa5fc47ad142},
+  {0xc6b31a14ce574546, 0xd7f758f96323f56e},
+  {0x046d3862feb271a0, 0x391175405eef9c5e},
+  {0xf7654c3e98aff433, 0x92b8d607c0180e5e},
+  {0xdfe26a4ee0edcbcf, 0x4c21afb68c481788},
+  {0xb9175aa38699a7fa, 0xa26d3569fb705b0a},
+  {0xd2955bcc820c812c, 0x29d30f039b37f636},
+  {0x37d8c59743ebdc8d, 0x19289d7baab847bc},
+  {0x8b8a25c0075e7200, 0x75fcbc7110b551c9},
+  {0x8ab2318dd48eb686, 0xca8ee9edf4a5a1e0},
+  {0x182033f6233cad5f, 0x743083edee67622b},
+  {0xc82b0364e7db3d93, 0x3cec89a9bc59587d},
+  {0x4fb362a6d33cdc65, 0xb2f2a5ce567b5b8e},
+  {0x90df4043911d6152, 0xfe9e1ef68cc145b2},
+  {0x4fcf7b4fcca5200b, 0xaba094d2f96d9249},
+  {0x5ac887c31fc3fd76, 0x1845172174cf2944},
+  {0x25180f84f6702866, 0xde5223f17c83df5d},
+  {0x2863a5b3ae30cdea, 0x610fc2ae8f7cfc74},
+  {0x64a4086ca77af644, 0xafe073214eb0e372},
+  {0xbdc97dadac10ab50, 0x97cf31c3dfa3a7ad},
+  {0x79f2ee819538d167, 0x68555fb401eb2780},
+  {0x72e2b904d5c7a7ed, 0x482326aa3e165b1c},
+  {0x92f65484dcff7fd8, 0x603faf9bafb86f1e},
+  {0x210e7817fff07876, 0xabdf6d8a0dd6d8a5},
+  {0xab561f7f19942dec, 0x55f71e3e54c7b523},
+  {0x8e7140a742fb2245, 0x34a49c54b5ad70ec},
+  {0x6da544268e007b3f, 0xebf2cf33aeaee1c9},
+  {0x010679622fe3753e, 0x40228d2a0d402ed0},
+  {0x2e128b07e6e4e311, 0x0811ebd4d8dde5b5},
+  {0x126cb02cee9ad020, 0x398e5321decfb79c},
+  {0x6dfdff51553fb5ff, 0x415b4003d55c33ab},
+  {0xd3b7fedc1cd8ab6e, 0x49dc7b6033f0ae60},
+  {0x7062ab84db2bbaed, 0xc33060adb11136c6},
+  {0xae149ced6b9cc3d3, 0xef2f29a2ebe433ce},
+  {0x133ca1e237105dc6, 0x9712a59673f1d79c},
+  {0xfcf98569ab4ec844, 0x6a40dd9e8d49194e},
+  {0xd73a65ce7e33212a, 0xaa29936469e73794},
+  {0x961009e50707fe21, 0x657c63ec063d9f23},
+  {0x6b1af6be25650671, 0xce96b0cb11ce0372},
+  {0xc7312488beda3b54, 0x9ee42f2347f50335},
+  {0x829d638189fca23f, 0xe3123a63017f9509},
+  {0xbb40cef8e0e85cea, 0xd8b3a76799622f49},
+};
+
+#define GF_exp_invmer_e_1 AIMER_NAMESPACE(GF_exp_invmer_e_1)
+void GF_exp_invmer_e_1(GF out, const GF in);
+#define GF_exp_invmer_e_2 AIMER_NAMESPACE(GF_exp_invmer_e_2)
+void GF_exp_invmer_e_2(GF out, const GF in);
+#define GF_exp_mer_e_star AIMER_NAMESPACE(GF_exp_mer_e_star)
+void GF_exp_mer_e_star(GF out, const GF in);
+
+#define generate_matrices_L_and_U AIMER_NAMESPACE(generate_matrices_L_and_U)
+void generate_matrices_L_and_U(
+        GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF vector_b,
+        const uint8_t iv[AIM2_IV_SIZE]);
+
+#define generate_matrix_LU AIMER_NAMESPACE(generate_matrix_LU)
+void generate_matrix_LU(GF matrix_A[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+                        GF vector_b,
+                        const uint8_t iv[AIM2_IV_SIZE]);
+
+#define aim2_sbox_outputs AIMER_NAMESPACE(aim2_sbox_outputs)
+void aim2_sbox_outputs(GF sbox_outputs[AIM2_NUM_INPUT_SBOX], const GF pt);
+
+#define aim2 AIMER_NAMESPACE(aim2)
+void aim2(uint8_t ct[AIM2_NUM_BYTES_FIELD],
+          const uint8_t pt[AIM2_NUM_BYTES_FIELD],
+          const uint8_t iv[AIM2_IV_SIZE]);
+
+#endif // AIM2_H
diff --git a/crypto_sign/aimer128f/m4stack/api.h b/crypto_sign/aimer128f/m4stack/api.h
new file mode 100644
index 00000000..50b6558d
--- /dev/null
+++ b/crypto_sign/aimer128f/m4stack/api.h
@@ -0,0 +1,44 @@
+/*
+NIST-developed software is provided by NIST as a public service. You may use, copy, and distribute copies of the software in any medium, provided that you keep intact this entire notice. You may improve, modify, and create derivative works of the software or any portion of the software, and you may copy and distribute such modifications or works. Modified works should carry a notice stating that you changed the software and should note the date and nature of any such change. Please explicitly acknowledge the National Institute of Standards and Technology as the source of the software.
+ 
+NIST-developed software is expressly provided "AS IS." NIST MAKES NO WARRANTY OF ANY KIND, EXPRESS, IMPLIED, IN FACT, OR ARISING BY OPERATION OF LAW, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND DATA ACCURACY. NIST NEITHER REPRESENTS NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST DOES NOT WARRANT OR MAKE ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE OR THE RESULTS THEREOF, INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY, RELIABILITY, OR USEFULNESS OF THE SOFTWARE.
+ 
+You are solely responsible for determining the appropriateness of using and distributing the software and you assume all risks associated with its use, including but not limited to the risks and costs of program errors, compliance with applicable laws, damage to or loss of data, programs or equipment, and the unavailability or interruption of operation. This software is not intended to be used in any situation where a failure could cause risk of injury or damage to property. The software developed by NIST employees is not subject to copyright protection within the United States.
+*/
+
+#ifndef API_H
+#define API_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define CRYPTO_PUBLICKEYBYTES 32
+#define CRYPTO_SECRETKEYBYTES 48
+#define CRYPTO_BYTES 5888
+#define CRYPTO_ALGNAME "aimer128f"
+
+#define crypto_sign_keypair AIMER_NAMESPACE(crypto_sign_keypair)
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+
+#define crypto_sign_signature AIMER_NAMESPACE(crypto_sign_signature)
+int crypto_sign_signature(uint8_t *sig, size_t *siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk);
+
+#define crypto_sign AIMER_NAMESPACE(crypto_sign)
+int crypto_sign(uint8_t *sm, size_t *smlen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk);
+
+#define crypto_sign_verify AIMER_NAMESPACE(crypto_sign_verify)
+int crypto_sign_verify(const uint8_t *sig, size_t siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *pk);
+
+#define crypto_sign_open AIMER_NAMESPACE(crypto_sign_open)
+int crypto_sign_open(uint8_t *m, size_t *mlen,
+        const uint8_t *sm, size_t smlen,
+        const uint8_t *pk);
+
+#endif
diff --git a/crypto_sign/aimer128f/m4stack/field.c b/crypto_sign/aimer128f/m4stack/field.c
new file mode 100644
index 00000000..1e12b447
--- /dev/null
+++ b/crypto_sign/aimer128f/m4stack/field.c
@@ -0,0 +1,406 @@
+// SPDX-License-Identifier: MIT
+
+#include "field.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define mask_64     0x00000000ffffffff  // low 32 bits of a 64-bit word
+#define mask_final  0x000000000000ffff  // 16 packed bits, last inv_zero_padding step
+
+#define mask0_64    0x000000ff000000ff  // low byte of each 32-bit half
+#define mask0       0x000000ff          // one byte
+
+#define mask1_64    0x000f000f000f000f  // low nibble of each 16-bit group
+#define mask1       0x000f000f
+
+#define mask2_64    0x0303030303030303  // low two bits of each byte
+#define mask2       0x03030303
+
+#define mask3_64    0x1111111111111111  // low bit of each nibble (spread form)
+#define mask3       0x11111111
+
+#define zero_padding(x0, mask1, mask2, mask3) \
+        x0 = (x0 | (x0 << 12)) & mask1; \
+        x0 = (x0 | (x0 << 6 )) & mask2; \
+        x0 = (x0 | (x0 << 3 )) & mask3;
+
+#define inv_zero_padding(x0, mask0, mask1, mask2, mask_final) \
+        x0 = (x0 | (x0 >> 3 )) & mask2; \
+        x0 = (x0 | (x0 >> 6 )) & mask1; \
+        x0 = (x0 | (x0 >> 12)) & mask0; \
+        x0 = (x0 | (x0 >> 24)) & mask_final;
+
+void GF_mul(GF c, const GF a, const GF b)
+{
+  uint64_t t[2] = {0,};
+  uint64_t temp[4] = {0,};
+  uint64_t sub[2] = {0,};
+  // c = a * b: Karatsuba over the 64-bit words, then the upper half is folded down
+  sub[0] = a[0] ^ a[1]; // operands of the middle product
+  sub[1] = b[0] ^ b[1];
+  // NOTE(review): poly64_mul is defined elsewhere; assumed c1:c0 = carry-less a * b
+  poly64_mul(&a[1], &b[1], &temp[3], &temp[2]); // high words -> temp[3]:temp[2]
+  poly64_mul(&a[0], &b[0], &temp[1], &temp[0]); // low words  -> temp[1]:temp[0]
+  // middle term merged in place; temp[2] deliberately reads the updated temp[1]
+  poly64_mul(&sub[0], &sub[1], &t[1], &t[0]);
+  temp[1] ^= t[0] ^ temp[0] ^ temp[2];
+  temp[2] = t[0] ^ t[1] ^ temp[0] ^ temp[1] ^ temp[3];
+  // t[0] = word 2 plus the bits that temp[3] << 7, << 2, << 1 push past word 1
+  t[0] = temp[2] ^ ((temp[3] >> 57) ^ (temp[3] >> 62) ^ (temp[3] >> 63));
+  // fold temp[3] and t[0] down with shifts 0, 7, 2, 1 (x^128 = x^7 + x^2 + x + 1)
+  c[1] = temp[1] ^ temp[3];
+  c[1] ^= (temp[3] << 7) | (t[0] >> 57);
+  c[1] ^= (temp[3] << 2) | (t[0] >> 62);
+  c[1] ^= (temp[3] << 1) | (t[0] >> 63);
+
+  c[0] = temp[0] ^ t[0];
+  c[0] ^= (t[0] << 7);
+  c[0] ^= (t[0] << 2);
+  c[0] ^= (t[0] << 1);
+}
+
+void GF_mul_add(GF c, const GF a, const GF b)
+{
+  uint64_t t[2] = {0,};
+  uint64_t temp[4] = {0,};
+  uint64_t sub[2] = {0,};
+  // c ^= a * b: same product and fold as GF_mul, but XORed into the existing c
+  sub[0] = a[0] ^ a[1]; // operands of the middle product
+  sub[1] = b[0] ^ b[1];
+  // NOTE(review): poly64_mul is defined elsewhere; assumed c1:c0 = carry-less a * b
+  poly64_mul(&a[1], &b[1], &temp[3], &temp[2]); // high words -> temp[3]:temp[2]
+  poly64_mul(&a[0], &b[0], &temp[1], &temp[0]); // low words  -> temp[1]:temp[0]
+  // middle term merged in place; temp[2] deliberately reads the updated temp[1]
+  poly64_mul(&sub[0], &sub[1], &t[1], &t[0]);
+  temp[1] ^= t[0] ^ temp[0] ^ temp[2];
+  temp[2] = t[0] ^ t[1] ^ temp[0] ^ temp[1] ^ temp[3];
+  // t[0] = word 2 plus the bits that temp[3] << 7, << 2, << 1 push past word 1
+  t[0] = temp[2] ^ ((temp[3] >> 57) ^ (temp[3] >> 62) ^ (temp[3] >> 63));
+  // fold temp[3] and t[0] down with shifts 0, 7, 2, 1 (x^128 = x^7 + x^2 + x + 1)
+  c[1] ^= temp[1] ^ temp[3];
+  c[1] ^= (temp[3] << 7) | (t[0] >> 57);
+  c[1] ^= (temp[3] << 2) | (t[0] >> 62);
+  c[1] ^= (temp[3] << 1) | (t[0] >> 63);
+
+  c[0] ^= temp[0] ^ t[0];
+  c[0] ^= (t[0] << 7);
+  c[0] ^= (t[0] << 2);
+  c[0] ^= (t[0] << 1);
+}
+
+void GF_transposed_matmul(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD])
+{
+  const uint64_t *a_ptr = a;
+  const GF *b_ptr = b;
+  // c = XOR of the rows of b selected by the set bits of a, from bit 0 of a[0] up
+  uint64_t temp_c0 = 0;
+  uint64_t temp_c1 = 0;
+  uint64_t mask;
+  for (size_t i = AIM2_NUM_WORDS_FIELD; i; --i, ++a_ptr)
+  {
+    uint64_t index = *a_ptr; // current word of a, consumed 8 bits per inner pass
+    for (size_t j = AIM2_NUM_BITS_WORD; j; j -= 8, index >>= 8, b_ptr += 8)
+    {
+      mask = -(index & 1); // all-ones when the bit is set, else zero (no branch)
+      temp_c0 ^= (b_ptr[0][0] & mask);
+      temp_c1 ^= (b_ptr[0][1] & mask);
+
+      mask = -((index >> 1) & 1);
+      temp_c0 ^= (b_ptr[1][0] & mask);
+      temp_c1 ^= (b_ptr[1][1] & mask);
+
+      mask = -((index >> 2) & 1);
+      temp_c0 ^= (b_ptr[2][0] & mask);
+      temp_c1 ^= (b_ptr[2][1] & mask);
+
+      mask = -((index >> 3) & 1);
+      temp_c0 ^= (b_ptr[3][0] & mask);
+      temp_c1 ^= (b_ptr[3][1] & mask);
+
+      mask = -((index >> 4) & 1);
+      temp_c0 ^= (b_ptr[4][0] & mask);
+      temp_c1 ^= (b_ptr[4][1] & mask);
+
+      mask = -((index >> 5) & 1);
+      temp_c0 ^= (b_ptr[5][0] & mask);
+      temp_c1 ^= (b_ptr[5][1] & mask);
+
+      mask = -((index >> 6) & 1);
+      temp_c0 ^= (b_ptr[6][0] & mask);
+      temp_c1 ^= (b_ptr[6][1] & mask);
+
+      mask = -((index >> 7) & 1);
+      temp_c0 ^= (b_ptr[7][0] & mask);
+      temp_c1 ^= (b_ptr[7][1] & mask);
+    }
+  }
+  c[0] = temp_c0; // stored only after a has been fully read, so c may alias a
+  c[1] = temp_c1;
+}
+
+void GF_transposed_matmul_add(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD])
+{
+  const uint64_t *a_ptr = a;
+  const GF *b_ptr = b;
+  // c ^= XOR of the rows of b selected by the set bits of a, from bit 0 of a[0] up
+  uint64_t temp_c0 = 0;
+  uint64_t temp_c1 = 0;
+  uint64_t mask;
+  for (size_t i = AIM2_NUM_WORDS_FIELD; i; --i, ++a_ptr)
+  {
+    uint64_t index = *a_ptr; // current word of a, consumed 8 bits per inner pass
+    for (size_t j = AIM2_NUM_BITS_WORD; j; j -= 8, index >>= 8, b_ptr += 8)
+    {
+      mask = -(index & 1); // all-ones when the bit is set, else zero (no branch)
+      temp_c0 ^= (b_ptr[0][0] & mask);
+      temp_c1 ^= (b_ptr[0][1] & mask);
+
+      mask = -((index >> 1) & 1);
+      temp_c0 ^= (b_ptr[1][0] & mask);
+      temp_c1 ^= (b_ptr[1][1] & mask);
+
+      mask = -((index >> 2) & 1);
+      temp_c0 ^= (b_ptr[2][0] & mask);
+      temp_c1 ^= (b_ptr[2][1] & mask);
+
+      mask = -((index >> 3) & 1);
+      temp_c0 ^= (b_ptr[3][0] & mask);
+      temp_c1 ^= (b_ptr[3][1] & mask);
+
+      mask = -((index >> 4) & 1);
+      temp_c0 ^= (b_ptr[4][0] & mask);
+      temp_c1 ^= (b_ptr[4][1] & mask);
+
+      mask = -((index >> 5) & 1);
+      temp_c0 ^= (b_ptr[5][0] & mask);
+      temp_c1 ^= (b_ptr[5][1] & mask);
+
+      mask = -((index >> 6) & 1);
+      temp_c0 ^= (b_ptr[6][0] & mask);
+      temp_c1 ^= (b_ptr[6][1] & mask);
+
+      mask = -((index >> 7) & 1);
+      temp_c0 ^= (b_ptr[7][0] & mask);
+      temp_c1 ^= (b_ptr[7][1] & mask);
+    }
+  }
+  c[0] ^= temp_c0; // accumulated into the existing value of c
+  c[1] ^= temp_c1;
+}
+
+static void poly64_mul_s(uint64_t *z1, uint64_t *z0, uint64_t x, uint64_t y)
+{
+  // z1:z0 = carry-less (GF(2)[X]) product of x and y; first split x into bytes
+  uint32_t x4 = x >> 32;
+
+  uint32_t x0 = x & mask_64;
+  uint32_t x1 = (x0 >> 8) & mask0;
+  uint32_t x2 = (x0 >> 16) & mask0;
+  uint32_t x3 = (x0 >> 24) & mask0;
+  x0 &= mask0;
+
+  // x_high: bytes 5..7 first, x4 keeps the whole high half until masked last
+  uint32_t x5 = (x4 >> 8) & mask0;
+  uint32_t x6 = (x4 >> 16) & mask0;
+  uint32_t x7 = (x4 >> 24) & mask0;
+  x4 &= mask0;
+
+  // y_low: same byte split for y
+  uint32_t y4 = y >> 32;
+
+  uint32_t y0 = y & mask_64;
+  uint32_t y1 = (y0 >> 8) & mask0;
+  uint32_t y2 = (y0 >> 16) & mask0;
+  uint32_t y3 = (y0 >> 24) & mask0;
+  y0 &= mask0;
+
+  // y_high
+  uint32_t y5 = (y4 >> 8) & mask0;
+  uint32_t y6 = (y4 >> 16) & mask0;
+  uint32_t y7 = (y4 >> 24) & mask0;
+  y4 &= mask0;
+
+  // x padding: bit i of each byte moves to bit 4*i (three zero bits in between)
+  zero_padding(x0, mask1, mask2, mask3);
+  zero_padding(x1, mask1, mask2, mask3);
+  zero_padding(x2, mask1, mask2, mask3);
+  zero_padding(x3, mask1, mask2, mask3);
+  zero_padding(x4, mask1, mask2, mask3);
+  zero_padding(x5, mask1, mask2, mask3);
+  zero_padding(x6, mask1, mask2, mask3);
+  zero_padding(x7, mask1, mask2, mask3);
+
+  // y padding
+  zero_padding(y0, mask1, mask2, mask3);
+  zero_padding(y1, mask1, mask2, mask3);
+  zero_padding(y2, mask1, mask2, mask3);
+  zero_padding(y3, mask1, mask2, mask3);
+  zero_padding(y4, mask1, mask2, mask3);
+  zero_padding(y5, mask1, mask2, mask3);
+  zero_padding(y6, mask1, mask2, mask3);
+  zero_padding(y7, mask1, mask2, mask3);
+  // x0-3 * y0-3: an integer product of two spread bytes adds at most 8 ones per
+  // nibble, so nothing carries over; "& mask3_64" keeps the parity (the GF(2) sum)
+  uint64_t a0 = ((uint64_t)x0 * (uint64_t)y0) & mask3_64;
+  uint64_t a1 = ((((uint64_t)x0 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y0) & mask3_64));
+  a0 ^= (a1 << 32); // one byte step = 32 spread bits: the low part joins a0 ...
+  a1 = a1 >> 32;    // ... and the rest starts the next 16-coefficient word
+  a1 ^= ((((uint64_t)x0 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x1 * (uint64_t)y1) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y0) & mask3_64));
+
+  uint64_t a2 = ((((uint64_t)x0 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y2) & mask3_64) ^
+                 (((uint64_t)x2 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y0) & mask3_64));
+  a1 ^= (a2 << 32);
+  a2 = a2 >> 32;
+  a2 ^= ((((uint64_t)x1 * (uint64_t)y3) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x3 * (uint64_t)y1) & mask3_64));
+
+  uint64_t a3 = ((((uint64_t)x2 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y2) & mask3_64));
+  a2 ^= (a3 << 32);
+  a3 = a3 >> 32;
+
+  a3 ^= ((uint64_t)x3 * (uint64_t)y3) & mask3_64;
+
+  // x4-7 * y4-7: same schoolbook scheme for the high halves of x and y
+  uint64_t b0 = ((uint64_t)x4 * (uint64_t)y4) & mask3_64;
+  uint64_t b1 = ((((uint64_t)x4 * (uint64_t)y5) & mask3_64) ^
+                 (((uint64_t)x5 * (uint64_t)y4) & mask3_64));
+  b0 ^= (b1 << 32);
+  b1 = b1 >> 32;
+  b1 ^= ((((uint64_t)x4 * (uint64_t)y6) & mask3_64) ^
+         (((uint64_t)x5 * (uint64_t)y5) & mask3_64) ^
+         (((uint64_t)x6 * (uint64_t)y4) & mask3_64));
+
+  uint64_t b2 = ((((uint64_t)x4 * (uint64_t)y7) & mask3_64) ^
+                 (((uint64_t)x5 * (uint64_t)y6) & mask3_64) ^
+                 (((uint64_t)x6 * (uint64_t)y5) & mask3_64) ^
+                 (((uint64_t)x7 * (uint64_t)y4) & mask3_64));
+  b1 ^= (b2 << 32);
+  b2 = b2 >> 32;
+  b2 ^= ((((uint64_t)x5 * (uint64_t)y7) & mask3_64) ^
+         (((uint64_t)x6 * (uint64_t)y6) & mask3_64) ^
+         (((uint64_t)x7 * (uint64_t)y5) & mask3_64));
+
+  uint64_t b3 = ((((uint64_t)x6 * (uint64_t)y7) & mask3_64) ^
+                 (((uint64_t)x7 * (uint64_t)y6) & mask3_64));
+  b2 ^= (b3 << 32);
+  b3 = b3 >> 32;
+
+  b3 ^= ((uint64_t)x7 * (uint64_t)y7) & mask3_64;
+
+  // middle part: (x low ^ x high) * (y low ^ y high), XORed while still spread
+  x0 ^= x4;
+  x1 ^= x5;
+  x2 ^= x6;
+  x3 ^= x7;
+
+  y0 ^= y4;
+  y1 ^= y5;
+  y2 ^= y6;
+  y3 ^= y7;
+
+  uint64_t c0 = ((uint64_t)x0 * (uint64_t)y0) & mask3_64;
+  uint64_t c1 = ((((uint64_t)x0 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y0) & mask3_64));
+  c0 ^= (c1 << 32);
+  c1 = c1 >> 32;
+  c1 ^= ((((uint64_t)x0 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x1 * (uint64_t)y1) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y0) & mask3_64));
+
+  uint64_t c2 = ((((uint64_t)x0 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y2) & mask3_64) ^
+                 (((uint64_t)x2 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y0) & mask3_64));
+  c1 ^= (c2 << 32);
+  c2 = c2 >> 32;
+  c2 ^= ((((uint64_t)x1 * (uint64_t)y3) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x3 * (uint64_t)y1) & mask3_64));
+
+  uint64_t c3 = ((((uint64_t)x2 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y2) & mask3_64));
+  c2 ^= (c3 << 32);
+  c3 = c3 >> 32;
+
+  c3 ^= ((uint64_t)x3 * (uint64_t)y3) & mask3_64;
+
+  c0 = c0 ^ a0 ^ b0; // Karatsuba: middle term = c ^ low product ^ high product
+  c1 = c1 ^ a1 ^ b1;
+  c2 = c2 ^ a2 ^ b2;
+  c3 = c3 ^ a3 ^ b3;
+
+  a2 ^= c0; // add the middle term two 16-coefficient words up (32 bit positions)
+  a3 ^= c1;
+  b0 ^= c2;
+  b1 ^= c3;
+
+  // result inv_padding: pack each spread word back into 16 adjacent bits
+  inv_zero_padding(a0, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a1, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a2, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a3, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b0, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b1, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b2, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b3, mask0_64, mask1_64, mask2_64, mask_final);
+
+  *z0 = a0 | (a1 << 16) | (a2 << 32) | (a3 << 48); // low 64 bits of the product
+  *z1 = b0 | (b1 << 16) | (b2 << 32) | (b3 << 48); // high 64 bits of the product
+}
+
+void GF_mul_s(GF c, const GF a, const GF b)
+{
+  uint64_t t[2] = {0,};
+  uint64_t temp[4] = {0,};
+  // c = a * b: Karatsuba over the 64-bit words, then the upper half is folded down
+  poly64_mul_s(&temp[3], &temp[2], a[1], b[1]); // high words -> temp[3]:temp[2]
+  poly64_mul_s(&temp[1], &temp[0], a[0], b[0]); // low words  -> temp[1]:temp[0]
+  // middle term merged in place; temp[2] deliberately reads the updated temp[1]
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[1]), (b[0] ^ b[1]));
+  temp[1] ^= t[0] ^ temp[0] ^ temp[2];
+  temp[2] = t[0] ^ t[1] ^ temp[0] ^ temp[1] ^ temp[3];
+  // t[0] = word 2 plus the bits that temp[3] << 7, << 2, << 1 push past word 1
+  t[0] = temp[2] ^ ((temp[3] >> 57) ^ (temp[3] >> 62) ^ (temp[3] >> 63));
+  // fold temp[3] and t[0] down with shifts 0, 7, 2, 1 (x^128 = x^7 + x^2 + x + 1)
+  c[1] = temp[1] ^ temp[3];
+  c[1] ^= (temp[3] << 7) | (t[0] >> 57);
+  c[1] ^= (temp[3] << 2) | (t[0] >> 62);
+  c[1] ^= (temp[3] << 1) | (t[0] >> 63);
+
+  c[0] = temp[0] ^ t[0];
+  c[0] ^= (t[0] << 7);
+  c[0] ^= (t[0] << 2);
+  c[0] ^= (t[0] << 1);
+}
+
+void GF_mul_add_s(GF c, const GF a, const GF b)
+{
+  uint64_t t[2] = {0,};
+  uint64_t temp[4] = {0,};
+  // c ^= a * b: same product and fold as GF_mul_s, but XORed into the existing c
+  poly64_mul_s(&temp[3], &temp[2], a[1], b[1]); // high words -> temp[3]:temp[2]
+  poly64_mul_s(&temp[1], &temp[0], a[0], b[0]); // low words  -> temp[1]:temp[0]
+  // middle term merged in place; temp[2] deliberately reads the updated temp[1]
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[1]), (b[0] ^ b[1]));
+  temp[1] ^= t[0] ^ temp[0] ^ temp[2];
+  temp[2] = t[0] ^ t[1] ^ temp[0] ^ temp[1] ^ temp[3];
+  // t[0] = word 2 plus the bits that temp[3] << 7, << 2, << 1 push past word 1
+  t[0] = temp[2] ^ ((temp[3] >> 57) ^ (temp[3] >> 62) ^ (temp[3] >> 63));
+  // fold temp[3] and t[0] down with shifts 0, 7, 2, 1 (x^128 = x^7 + x^2 + x + 1)
+  c[1] ^= temp[1] ^ temp[3];
+  c[1] ^= (temp[3] << 7) | (t[0] >> 57);
+  c[1] ^= (temp[3] << 2) | (t[0] >> 62);
+  c[1] ^= (temp[3] << 1) | (t[0] >> 63);
+
+  c[0] ^= temp[0] ^ t[0];
+  c[0] ^= (t[0] << 7);
+  c[0] ^= (t[0] << 2);
+  c[0] ^= (t[0] << 1);
+}
diff --git a/crypto_sign/aimer128f/m4stack/field.h b/crypto_sign/aimer128f/m4stack/field.h
new file mode 100644
index 00000000..e8fd7996
--- /dev/null
+++ b/crypto_sign/aimer128f/m4stack/field.h
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef FIELD_H
+#define FIELD_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+typedef uint64_t GF[2]; // one field element: two 64-bit words
+
+#define poly64_mul AIMER_NAMESPACE(poly64_mul) // every symbol below is renamed through AIMER_NAMESPACE
+void poly64_mul(const uint64_t *a, const uint64_t *b, uint64_t *c1, uint64_t *c0);
+#define GF_set0 AIMER_NAMESPACE(GF_set0)
+void GF_set0(GF a);
+#define GF_copy AIMER_NAMESPACE(GF_copy)
+void GF_copy(GF out, const GF in);
+#define GF_to_bytes AIMER_NAMESPACE(GF_to_bytes)
+void GF_to_bytes(uint8_t *out, const GF in);
+#define GF_from_bytes AIMER_NAMESPACE(GF_from_bytes)
+void GF_from_bytes(GF out, const uint8_t *in);
+
+#define GF_add AIMER_NAMESPACE(GF_add)
+void GF_add(GF c, const GF a, const GF b);
+#define GF_mul AIMER_NAMESPACE(GF_mul)
+void GF_mul(GF c, const GF a, const GF b);
+#define GF_mul_add AIMER_NAMESPACE(GF_mul_add)
+void GF_mul_add(GF c, const GF a, const GF b);
+#define GF_transposed_matmul AIMER_NAMESPACE(GF_transposed_matmul)
+void GF_transposed_matmul(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]);
+#define GF_transposed_matmul_add AIMER_NAMESPACE(GF_transposed_matmul_add)
+void GF_transposed_matmul_add(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]);
+
+#define GF_mul_s AIMER_NAMESPACE(GF_mul_s)
+void GF_mul_s(GF c, const GF a, const GF b); // c = a * b, C implementation in field.c
+#define GF_mul_add_s AIMER_NAMESPACE(GF_mul_add_s)
+void GF_mul_add_s(GF c, const GF a, const GF b); // c ^= a * b, C implementation in field.c
+#define GF_sqr_s AIMER_NAMESPACE(GF_sqr_s)
+void GF_sqr_s(GF c, const GF a); // presumably c = a * a -- confirm against the definition
+
+#endif // FIELD_H
diff --git a/crypto_sign/aimer128f/m4stack/hash.c b/crypto_sign/aimer128f/m4stack/hash.c
new file mode 100644
index 00000000..71f3fb67
--- /dev/null
+++ b/crypto_sign/aimer128f/m4stack/hash.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+
+#include "hash.h"
+#include <stddef.h>
+#include <stdint.h>
+
+void hash_init(hash_instance *ctx)
+{
+  shake128_inc_init(ctx); // initialise ctx through the incremental SHAKE128 API
+}
+
+void hash_init_prefix(hash_instance *ctx, uint8_t prefix)
+{
+  shake128_inc_init(ctx);
+  shake128_inc_absorb(ctx, &prefix, sizeof(prefix)); // the single prefix byte is the first input absorbed
+}
+
+void hash_update(hash_instance *ctx, const uint8_t *data, size_t data_len)
+{
+  shake128_inc_absorb(ctx, data, data_len); // absorb data_len bytes from data into ctx
+}
+
+void hash_final(hash_instance *ctx)
+{
+  shake128_inc_finalize(ctx); // callers in this file always call this before hash_squeeze
+}
+
+void hash_squeeze(hash_instance *ctx, uint8_t *buffer, size_t buffer_len)
+{
+  shake128_inc_squeeze(buffer, buffer_len, ctx); // note: the wrapped call takes the context last
+}
+
+void hash_ctx_clone(hash_instance *ctx_dest, const hash_instance *ctx_src)
+{
+  shake128_inc_ctx_clone(ctx_dest, ctx_src); // copy the state of ctx_src into ctx_dest
+}
+
+void hash_ctx_release(hash_instance *ctx)
+{
+  shake128_inc_ctx_release(ctx); // pairs with hash_init / hash_init_prefix / hash_ctx_clone
+}
diff --git a/crypto_sign/aimer128f/m4stack/hash.h b/crypto_sign/aimer128f/m4stack/hash.h
new file mode 100644
index 00000000..d6b05065
--- /dev/null
+++ b/crypto_sign/aimer128f/m4stack/hash.h
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef HASH_H
+#define HASH_H
+
+#include "fips202.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+static const uint8_t HASH_PREFIX_0 = 0; // message pre-hash (mu) in sign.c
+static const uint8_t HASH_PREFIX_1 = 1; // h_1 in sign.c
+static const uint8_t HASH_PREFIX_2 = 2; // h_2 in sign.c
+static const uint8_t HASH_PREFIX_3 = 3; // salt and root seeds in crypto_sign_signature
+static const uint8_t HASH_PREFIX_4 = 4; // seed-tree expansion in tree.c
+static const uint8_t HASH_PREFIX_5 = 5; // per-party commitment and tape (ctx_precom) in sign.c
+
+typedef shake128incctx hash_instance; // this parameter set hashes with incremental SHAKE128
+
+#define hash_init AIMER_NAMESPACE(hash_init)
+void hash_init(hash_instance *ctx); // init without a prefix byte
+#define hash_init_prefix AIMER_NAMESPACE(hash_init_prefix)
+void hash_init_prefix(hash_instance *ctx, uint8_t prefix); // init, then absorb the one-byte prefix
+#define hash_update AIMER_NAMESPACE(hash_update)
+void hash_update(hash_instance *ctx, const uint8_t *data, size_t data_len); // absorb data
+#define hash_final AIMER_NAMESPACE(hash_final)
+void hash_final(hash_instance *ctx); // finalize; called before hash_squeeze
+#define hash_squeeze AIMER_NAMESPACE(hash_squeeze)
+void hash_squeeze(hash_instance *ctx, uint8_t *buffer, size_t buffer_len); // write buffer_len output bytes
+#define hash_ctx_clone AIMER_NAMESPACE(hash_ctx_clone)
+void hash_ctx_clone(hash_instance *ctx_dest, const hash_instance *ctx_src); // copy a state
+#define hash_ctx_release AIMER_NAMESPACE(hash_ctx_release)
+void hash_ctx_release(hash_instance *ctx); // release a state once it is no longer needed
+
+#endif // HASH_H
diff --git a/crypto_sign/aimer128f/m4stack/params.h b/crypto_sign/aimer128f/m4stack/params.h
new file mode 100644
index 00000000..689e8830
--- /dev/null
+++ b/crypto_sign/aimer128f/m4stack/params.h
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#define AIMER_NAMESPACE(s)          samsungsds_aimer128f_m4stack_##s // symbol prefix of this parameter set and implementation
+
+#define SECURITY_BITS               128                  // security parameter
+#define SECURITY_BYTES              (SECURITY_BITS / 8)  // byte size of security parameter
+
+#define AIM2_NUM_BITS_FIELD         SECURITY_BITS        // number of bits in field element
+#define AIM2_NUM_BYTES_FIELD        SECURITY_BYTES       // number of bytes in field element
+#define AIM2_NUM_WORDS_FIELD        (SECURITY_BITS / 64) // number of 64-bit words in element
+#define AIM2_NUM_BITS_WORD          64                   // number of bits in word
+#define AIM2_IV_SIZE                SECURITY_BYTES       // byte size of AIM2 initial vector
+
+#define AIM2_NUM_INPUT_SBOX         2                    // number of AIM2 input S-boxes
+
+#define AIMER_SALT_SIZE             SECURITY_BYTES       // byte size of salt
+#define AIMER_SEED_SIZE             SECURITY_BYTES       // byte size of seed
+#define AIMER_COMMIT_SIZE           (SECURITY_BYTES * 2) // byte size of commitment
+
+#define AIMER_L                     AIM2_NUM_INPUT_SBOX  // alias: number of input S-boxes (sizes t_shares / x_shares)
+#define AIMER_T                     33                   // number of parallel repetitions (Tau)
+#define AIMER_N                     16                   // number of MPC parties (N)
+#define AIMER_LOGN                  4                    // log_2(N)
+
+#endif // PARAMS_H
diff --git a/crypto_sign/aimer128f/m4stack/sign.c b/crypto_sign/aimer128f/m4stack/sign.c
new file mode 100644
index 00000000..deaea5b7
--- /dev/null
+++ b/crypto_sign/aimer128f/m4stack/sign.c
@@ -0,0 +1,626 @@
+// SPDX-License-Identifier: MIT
+
+#include "api.h"
+#include "aim2.h"
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include "randombytes.h"
+#include "sign.h"
+#include "tree.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+void commit_and_expand_tape(tape_t *tape, uint8_t *commit,
+                            const hash_instance *ctx_precom,
+                            const uint8_t seed[AIMER_SEED_SIZE],
+                            size_t rep, size_t party)
+{
+  uint8_t input[2 + AIMER_SEED_SIZE]; // rep byte || party byte || seed
+  hash_instance state;
+
+  hash_ctx_clone(&state, ctx_precom);
+  input[0] = (uint8_t)rep;
+  input[1] = (uint8_t)party;
+  memcpy(&input[2], seed, AIMER_SEED_SIZE);
+  hash_update(&state, input, sizeof(input));
+  hash_final(&state);
+  // the first squeezed bytes form the commitment, the following ones the tape
+  hash_squeeze(&state, commit, AIMER_COMMIT_SIZE);
+  hash_squeeze(&state, (uint8_t *)tape, sizeof(*tape));
+  hash_ctx_release(&state);
+}
+
+void aim2_mpc(mult_chk_t *mult_chk,
+              const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+              const GF ct_GF)
+{
+  // input S-boxes (x = t, c = aim2_constants[i]): pt + c = t ^ {2 ^ e - 1}
+  // --> t ^ {2 ^ e} + t * c = t * pt
+  // --> z = x * pt, so z is built below as x * c plus the power-matrix term
+  GF_mul(mult_chk->z_shares[0], mult_chk->x_shares[0], aim2_constants[0]);
+  GF_transposed_matmul_add(mult_chk->z_shares[0], mult_chk->x_shares[0],
+                           aim2_e1_power_matrix); // presumably the linear map x -> x ^ {2 ^ e1}
+  GF_transposed_matmul_add(mult_chk->x_shares[AIMER_L], mult_chk->x_shares[0],
+                           matrix_A[0]); // x_shares[AIMER_L] must be pre-set by the caller (zero or vector_b)
+
+  GF_mul(mult_chk->z_shares[1], mult_chk->x_shares[1], aim2_constants[1]);
+  GF_transposed_matmul_add(mult_chk->z_shares[1], mult_chk->x_shares[1],
+                           aim2_e2_power_matrix);
+  GF_transposed_matmul_add(mult_chk->x_shares[AIMER_L], mult_chk->x_shares[1],
+                           matrix_A[1]);
+
+  // output S-box: x ^ {2 ^ e - 1} = pt + ct
+  // --> x ^ {2 ^ e} + x * ct = x * pt
+  // --> z = x * pt; three GF_sqr_s calls in a row, i.e. e = 3 if GF_sqr_s squares
+  GF_sqr_s(mult_chk->z_shares[AIMER_L], mult_chk->x_shares[AIMER_L]);
+  GF_sqr_s(mult_chk->z_shares[AIMER_L], mult_chk->z_shares[AIMER_L]);
+  GF_sqr_s(mult_chk->z_shares[AIMER_L], mult_chk->z_shares[AIMER_L]);
+  GF_mul_add(mult_chk->z_shares[AIMER_L], mult_chk->x_shares[AIMER_L], ct_GF);
+}
+
+// Phase 1: commit to the parties' seeds and views, fill the delta fields of sign->proofs, derive sign->h_1
+void run_phase_1(signature_t *sign,
+                 const GF pt_GF, const GF sbox_outputs[AIMER_L],
+                 const uint8_t mu[AIMER_COMMIT_SIZE],
+                 const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE])
+{
+  uint8_t commits[AIMER_COMMIT_SIZE]; // scratch: holds one party's commitment at a time
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE]; // seed tree of the current repetition
+
+  hash_instance ctx;
+
+  // hash_instance for h_1
+  hash_init_prefix(&ctx, HASH_PREFIX_1);
+  hash_update(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx, sign->salt, AIMER_SALT_SIZE); // sign->salt must already be set by the caller
+
+  hash_instance ctx_precom; // prefix || salt state, cloned for every commitment
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    // compute parties' seeds using binary tree
+    expand_tree(nodes, sign->salt, rep, root_seeds[rep]);
+
+    // initialize adjustment values
+    tape_t delta;
+    memset(&delta, 0, sizeof(tape_t));
+
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      // generate execution views and commitments
+      tape_t tape;
+      commit_and_expand_tape(&tape, commits, &ctx_precom,
+                             nodes[AIMER_N + party - 1],  rep, party);
+      hash_update(&ctx, commits, AIMER_COMMIT_SIZE);
+
+      // compute offsets
+      GF_add(delta.pt_share, delta.pt_share, tape.pt_share);
+      GF_add(delta.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+      GF_add(delta.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+      GF_add(delta.a_share, delta.a_share, tape.a_share);
+      GF_add(delta.c_share, delta.c_share, tape.c_share);
+
+      // adjust the last share and prepare the proof and h_1
+      if (party == AIMER_N - 1)
+      {
+        GF_add(delta.pt_share, delta.pt_share, pt_GF);
+        GF_add(delta.t_shares[0], delta.t_shares[0], sbox_outputs[0]);
+        GF_add(delta.t_shares[1], delta.t_shares[1], sbox_outputs[1]);
+        GF_mul_add_s(delta.c_share, pt_GF, delta.a_share); // delta.a_share is the sum of all a shares here
+
+        GF_to_bytes(sign->proofs[rep].delta_pt_bytes, delta.pt_share);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[0], delta.t_shares[0]);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[1], delta.t_shares[1]);
+        GF_to_bytes(sign->proofs[rep].delta_c_bytes, delta.c_share);
+      }
+    }
+
+    // NOTE: depend on the order of values in proof_t
+    hash_update(&ctx, sign->proofs[rep].delta_pt_bytes,
+                AIM2_NUM_BYTES_FIELD * (AIMER_L + 2)); // one read spanning delta_pt, delta_ts[0..1] and delta_c
+  }
+  hash_ctx_release(&ctx_precom);
+
+  // commit to salt, (all commitments of parties' seeds,
+  // delta_pt, delta_t, delta_c) for all repetitions
+  hash_final(&ctx);
+  hash_squeeze(&ctx, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+}
+
+void run_phase_1_to_3(signature_t *sign, // reads salt, h_1 and the proof deltas; writes h_2
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE],
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b, const GF ct_GF)
+{
+  uint8_t commits[AIMER_COMMIT_SIZE]; // produced by commit_and_expand_tape but not hashed in this pass
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE];
+
+  GF pt_shares[AIMER_N];
+  GF alpha_v_shares[2][AIMER_N]; // row 0: alpha shares, row 1: v shares
+  GF epsilons[AIMER_L + 1];
+
+  // prepare h2
+  hash_instance ctx;
+  hash_init_prefix(&ctx, HASH_PREFIX_2);
+  hash_update(&ctx, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_update(&ctx, sign->salt, AIMER_SALT_SIZE);
+
+  // prepare epsilons
+  hash_instance ctx_e;
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e); // finalized once; each repetition squeezes its own epsilons from it
+
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  GF alpha = {0,};
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    // compute parties' seeds using binary tree
+    expand_tree(nodes, sign->salt, rep, root_seeds[rep]);
+
+    // initialize adjustment values
+    tape_t delta;
+    memset(&delta, 0, sizeof(tape_t));
+
+    // expand challenge
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF_set0(alpha);
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      // generate execution views and commitments
+      tape_t tape;
+      commit_and_expand_tape(&tape, commits, &ctx_precom,
+                             nodes[AIMER_N + party - 1], rep, party);
+
+      mult_chk_t mult_chk;
+      memset(&mult_chk, 0, sizeof(mult_chk)); // x_shares[AIMER_L] starts at zero for every party
+
+      // last party only: add the deltas stored by run_phase_1 to its tape and start from vector_b
+      if (party == AIMER_N - 1)
+      {
+        GF_from_bytes(delta.pt_share, sign->proofs[rep].delta_pt_bytes);
+        GF_from_bytes(delta.t_shares[0], sign->proofs[rep].delta_ts_bytes[0]);
+        GF_from_bytes(delta.t_shares[1], sign->proofs[rep].delta_ts_bytes[1]);
+        GF_from_bytes(delta.c_share, sign->proofs[rep].delta_c_bytes);
+
+        GF_add(tape.pt_share, delta.pt_share, tape.pt_share);
+        GF_add(tape.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+        GF_add(tape.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+        GF_add(tape.c_share, delta.c_share, tape.c_share);
+
+        GF_copy(mult_chk.x_shares[AIMER_L], vector_b);
+      }
+
+      // run the MPC simulation and prepare the mult check inputs
+      GF_copy(mult_chk.x_shares[0], tape.t_shares[0]);
+      GF_copy(mult_chk.x_shares[1], tape.t_shares[1]);
+      GF_copy(pt_shares[party], tape.pt_share);
+      GF_copy(alpha_v_shares[0][party], tape.a_share);
+      GF_copy(alpha_v_shares[1][party], tape.c_share);
+      aim2_mpc(&mult_chk, matrix_A, ct_GF);
+
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[2], epsilons[2]);
+      GF_add(alpha, alpha, alpha_v_shares[0][party]); // alpha accumulates every party's alpha share
+    }
+
+    // alpha is opened, so we can finish calculating v_share
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      GF_mul_add(alpha_v_shares[1][party], pt_shares[party], alpha);
+    }
+    hash_update(&ctx, (const uint8_t *)alpha_v_shares, sizeof(alpha_v_shares)); // raw in-memory bytes of the shares
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+
+  hash_final(&ctx);
+  hash_squeeze(&ctx, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+}
+
+void run_phase_1_to_5(signature_t *sign, // reads salt, h_1, h_2 and deltas; writes reveal_path, missing_commitment, missing_alpha_share_bytes
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE],
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b)
+{
+  // prepare challenge parties
+  hash_instance ctx;
+  hash_init(&ctx);
+  hash_update(&ctx, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_final(&ctx);
+
+  uint8_t indices[AIMER_T]; // AIMER_N <= 256
+  hash_squeeze(&ctx, indices, AIMER_T);
+  hash_ctx_release(&ctx);
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    indices[rep] &= (1 << AIMER_LOGN) - 1; // keep the low AIMER_LOGN bits: a party index in [0, 2^AIMER_LOGN)
+  }
+
+  uint8_t commits[AIMER_COMMIT_SIZE];
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE];
+
+  GF epsilons[AIMER_L + 1];
+
+  // prepare epsilons
+  hash_instance ctx_e;
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  mult_chk_t mult_chk; // only x_shares is used in this pass
+  memset(&mult_chk, 0, sizeof(mult_chk_t));
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    size_t i_bar = indices[rep]; // the party whose seed stays hidden in this repetition
+
+    // compute parties' seeds using binary tree
+    expand_tree(nodes, sign->salt, rep, root_seeds[rep]);
+    reveal_all_but(sign->proofs[rep].reveal_path,
+                   (const uint8_t (*)[AIMER_SEED_SIZE])nodes, i_bar);
+
+    // expand challenge
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF alpha_share = {0,};
+
+    tape_t tape;
+    commit_and_expand_tape(&tape, commits, &ctx_precom,
+                           nodes[AIMER_N + i_bar - 1], rep, i_bar);
+    memcpy(sign->proofs[rep].missing_commitment, commits,
+           AIMER_COMMIT_SIZE);
+
+    GF_set0(mult_chk.x_shares[AIMER_L]); // mult_chk is reused across repetitions, so reset the accumulator
+
+    // if the hidden party is the last one, its share also carries the stored deltas and vector_b
+    if (i_bar == AIMER_N - 1)
+    {
+      // initialize adjustment values
+      tape_t delta;
+      memset(&delta, 0, sizeof(tape_t));
+
+      GF_from_bytes(delta.pt_share, sign->proofs[rep].delta_pt_bytes);
+      GF_from_bytes(delta.t_shares[0], sign->proofs[rep].delta_ts_bytes[0]);
+      GF_from_bytes(delta.t_shares[1], sign->proofs[rep].delta_ts_bytes[1]);
+      GF_from_bytes(delta.c_share, sign->proofs[rep].delta_c_bytes);
+
+      GF_add(tape.pt_share, delta.pt_share, tape.pt_share);
+      GF_add(tape.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+      GF_add(tape.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+      GF_add(tape.c_share, delta.c_share, tape.c_share);
+
+      GF_copy(mult_chk.x_shares[AIMER_L], vector_b);
+    }
+
+    for (size_t ell = 0; ell < AIMER_L; ell++)
+    {
+      GF_copy(mult_chk.x_shares[ell], tape.t_shares[ell]);
+
+      // x_* = sum_i A[i] * t[i] + b
+      GF_transposed_matmul_add(mult_chk.x_shares[AIMER_L],
+                               tape.t_shares[ell], matrix_A[ell]);
+    }
+    GF_copy(alpha_share, tape.a_share);
+
+    GF_mul_add(alpha_share, mult_chk.x_shares[0], epsilons[0]);
+    GF_mul_add(alpha_share, mult_chk.x_shares[1], epsilons[1]);
+    GF_mul_add(alpha_share, mult_chk.x_shares[2], epsilons[2]);
+    GF_to_bytes(sign->proofs[rep].missing_alpha_share_bytes, alpha_share); // hidden party's alpha share goes into the proof
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk)
+{
+  if (pk == NULL || sk == NULL)
+  {
+    return -1;
+  }
+
+  uint8_t *const iv = pk;                // pk = iv || aim2 output
+  uint8_t *const ct = pk + AIM2_IV_SIZE; // sk = random input || copy of pk
+  randombytes(sk, AIM2_NUM_BYTES_FIELD);
+  randombytes(iv, AIM2_IV_SIZE);
+  aim2(ct, sk, iv);
+  memcpy(sk + AIM2_NUM_BYTES_FIELD, pk, AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  return 0;
+}
+
+int crypto_sign_signature(uint8_t *sig, size_t *siglen, // writes CRYPTO_BYTES to *siglen and always returns 0
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk)
+{
+  signature_t *sign = (signature_t *)sig; // the signature is assembled in place in the caller's buffer
+
+  GF pt_GF = {0,}, ct_GF = {0,};
+  GF_from_bytes(pt_GF, sk); // sk layout (see crypto_sign_keypair): secret input || iv || aim2 output
+  GF_from_bytes(ct_GF, sk + AIM2_NUM_BYTES_FIELD + AIM2_IV_SIZE);
+
+  // message pre-hashing
+  hash_instance ctx;
+  hash_init_prefix(&ctx, HASH_PREFIX_0);
+  hash_update(&ctx, sk + AIM2_NUM_BYTES_FIELD,
+              AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD); // the public-key copy stored inside sk
+  hash_update(&ctx, m, mlen);
+  hash_final(&ctx);
+
+  uint8_t mu[AIMER_COMMIT_SIZE];
+  hash_squeeze(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+
+  // compute first L sboxes' outputs
+  GF sbox_outputs[AIMER_L];
+  aim2_sbox_outputs(sbox_outputs, pt_GF);
+
+  // derive the binary matrix and the vector from the initial vector
+  GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+  generate_matrix_LU(matrix_A, vector_b, sk + AIM2_NUM_BYTES_FIELD);
+
+  // generate per-signature randomness
+  uint8_t random[SECURITY_BYTES];
+  randombytes(random, SECURITY_BYTES);
+
+  // generate salt and root seeds
+  hash_instance ctx_roots;
+  hash_init_prefix(&ctx_roots, HASH_PREFIX_3);
+  hash_update(&ctx_roots, sk, AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx_roots, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_roots, random, SECURITY_BYTES);
+  hash_final(&ctx_roots);
+  hash_squeeze(&ctx_roots, sign->salt, AIMER_SALT_SIZE); // salt is squeezed first, the root seeds follow
+
+  uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE];
+  hash_squeeze(&ctx_roots, (uint8_t *)root_seeds, AIMER_T * AIMER_SEED_SIZE);
+  hash_ctx_release(&ctx_roots);
+
+  run_phase_1(sign, pt_GF, (const GF *)sbox_outputs, mu,
+              (const uint8_t (*)[AIMER_SEED_SIZE])root_seeds); // fills the proof deltas and h_1
+
+  run_phase_1_to_3(sign, (const uint8_t (*)[AIMER_SEED_SIZE])root_seeds,
+                   (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A,
+                   vector_b, ct_GF); // re-expands the trees and tapes, fills h_2
+
+  run_phase_1_to_5(sign, (const uint8_t (*)[AIMER_SEED_SIZE])root_seeds,
+                   (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A,
+                   vector_b); // re-expands once more, fills the per-repetition openings
+
+  *siglen = CRYPTO_BYTES;
+
+  return 0;
+}
+
+int crypto_sign(uint8_t *sm, size_t *smlen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk)
+{
+  // Output layout is message || signature. Move the message into place first
+  // with memmove (sm and m may overlap), then sign the copy held in sm.
+  memmove(sm, m, mlen);
+  const int ret = crypto_sign_signature(sm + mlen, smlen, sm, mlen, sk);
+  *smlen += mlen; // crypto_sign_signature stored the bare signature length
+  return ret; // crypto_sign_signature currently always returns 0
+}
+
+int crypto_sign_verify(const uint8_t *sig, size_t siglen, // returns 0 if the signature is accepted, -1 otherwise
+        const uint8_t *m, size_t mlen,
+        const uint8_t *pk)
+{
+  if (siglen != CRYPTO_BYTES)
+  {
+    return -1;
+  }
+
+  const signature_t *sign = (const signature_t *)sig; // the signature bytes are read in place
+
+  GF ct_GF = {0,};
+  GF_from_bytes(ct_GF, pk + AIM2_IV_SIZE); // pk layout: iv || aim2 output
+
+  // derive the binary matrix and the vector from the initial vector
+  GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+  generate_matrix_LU(matrix_A, vector_b, pk);
+
+  hash_instance ctx_e, ctx_h1, ctx_h2;
+
+  // indices = Expand(h_2)
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  uint8_t indices[AIMER_T]; // AIMER_N <= 256
+  hash_squeeze(&ctx_e, indices, AIMER_T);
+  hash_ctx_release(&ctx_e);
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    indices[rep] &= (1 << AIMER_LOGN) - 1; // same masking as the signer uses in run_phase_1_to_5
+  }
+
+  // epsilons = Expand(h_1)
+  hash_init(&ctx_e); // ctx_e is reused after being released above
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  // message pre-hashing
+  uint8_t mu[AIMER_COMMIT_SIZE];
+  hash_init_prefix(&ctx_h1, HASH_PREFIX_0); // ctx_h1 is borrowed for mu, then re-initialised for h_1'
+  hash_update(&ctx_h1, pk, AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx_h1, m, mlen);
+  hash_final(&ctx_h1);
+  hash_squeeze(&ctx_h1, mu, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h1);
+
+  // ready for computing h_1' and h_2'
+  hash_init_prefix(&ctx_h1, HASH_PREFIX_1);
+  hash_update(&ctx_h1, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_h1, sign->salt, AIMER_SALT_SIZE);
+
+  hash_init_prefix(&ctx_h2, HASH_PREFIX_2);
+  hash_update(&ctx_h2, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_h2, sign->salt, AIMER_SALT_SIZE);
+
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    size_t i_bar = indices[rep]; // party whose seed was not revealed
+    uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE]; // no root row here: row k-2 holds tree node k
+
+    reconstruct_tree(nodes, sign->salt, sign->proofs[rep].reveal_path,
+                     rep, i_bar);
+
+    GF pt_shares[AIMER_N]; // entry i_bar is never written or read
+    GF alpha_v_shares[2][AIMER_N]; // row 0: alpha shares, row 1: v shares
+    GF_set0(alpha_v_shares[1][i_bar]); // will accumulate the sum of all other v shares
+
+    GF epsilons[AIMER_L + 1];
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF alpha = {0,};
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      if (party == i_bar)
+      {
+        hash_update(&ctx_h1, sign->proofs[rep].missing_commitment,
+                    AIMER_COMMIT_SIZE); // hidden party: commitment and alpha share come from the proof
+        GF_from_bytes(alpha_v_shares[0][i_bar],
+                      sign->proofs[rep].missing_alpha_share_bytes);
+        GF_add(alpha, alpha, alpha_v_shares[0][i_bar]);
+        continue;
+      }
+
+      tape_t tape;
+      uint8_t commit[AIMER_COMMIT_SIZE];
+      commit_and_expand_tape(&tape, commit, &ctx_precom,
+                             nodes[AIMER_N + party - 2], rep, party);
+      hash_update(&ctx_h1, commit, AIMER_COMMIT_SIZE);
+
+      // adjust last shares
+      mult_chk_t mult_chk;
+      memset(&mult_chk, 0, sizeof(mult_chk_t));
+      if (party == AIMER_N - 1)
+      {
+        GF temp = {0,};
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_pt_bytes);
+        GF_add(tape.pt_share, tape.pt_share, temp);
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[0]);
+        GF_add(tape.t_shares[0], tape.t_shares[0], temp);
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[1]);
+        GF_add(tape.t_shares[1], tape.t_shares[1], temp);
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_c_bytes);
+        GF_add(tape.c_share, tape.c_share, temp);
+
+        GF_copy(mult_chk.x_shares[AIMER_L], vector_b);
+      }
+
+      // run the MPC simulation and prepare the mult check inputs
+      GF_copy(mult_chk.x_shares[0], tape.t_shares[0]);
+      GF_copy(mult_chk.x_shares[1], tape.t_shares[1]);
+      GF_copy(pt_shares[party], tape.pt_share);
+      GF_copy(alpha_v_shares[0][party], tape.a_share);
+      GF_copy(alpha_v_shares[1][party], tape.c_share);
+      aim2_mpc(&mult_chk, (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A, ct_GF);
+
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[2], epsilons[2]);
+
+      GF_add(alpha, alpha, alpha_v_shares[0][party]);
+    }
+
+    // alpha is opened, so we can finish calculating v_share
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      if (party == i_bar)
+      {
+        continue;
+      }
+
+      GF_mul_add(alpha_v_shares[1][party], pt_shares[party], alpha);
+      GF_add(alpha_v_shares[1][i_bar], alpha_v_shares[1][i_bar],
+             alpha_v_shares[1][party]); // the hidden party's v share is set to the sum of the others
+    }
+
+    // v is opened
+    hash_update(&ctx_h2, (const uint8_t *)alpha_v_shares,
+                sizeof(alpha_v_shares));
+
+    // NOTE: depend on the order of values in proof_t
+    hash_update(&ctx_h1, sign->proofs[rep].delta_pt_bytes,
+                AIM2_NUM_BYTES_FIELD * (AIMER_L + 2)); // one read spanning delta_pt, delta_ts[0..1] and delta_c
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+
+  uint8_t h_1_prime[AIMER_COMMIT_SIZE];
+  hash_final(&ctx_h1);
+  hash_squeeze(&ctx_h1, h_1_prime, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h1);
+
+  uint8_t h_2_prime[AIMER_COMMIT_SIZE];
+  hash_final(&ctx_h2);
+  hash_squeeze(&ctx_h2, h_2_prime, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h2);
+
+  if (memcmp(h_1_prime, sign->h_1, AIMER_COMMIT_SIZE) != 0 || // both recomputed hashes must match the signature
+      memcmp(h_2_prime, sign->h_2, AIMER_COMMIT_SIZE) != 0)
+  {
+    return -1;
+  }
+
+  return 0;
+}
+
+int crypto_sign_open(uint8_t *m, size_t *mlen,
+        const uint8_t *sm, size_t smlen,
+        const uint8_t *pk)
+{
+  if (smlen < CRYPTO_BYTES)
+  {
+    return -1;
+  }
+
+  // sm holds message || signature, the signature being the last CRYPTO_BYTES
+  const size_t msg_len = smlen - CRYPTO_BYTES;
+  const uint8_t *const sig = sm + msg_len;
+
+  if (crypto_sign_verify(sig, CRYPTO_BYTES, sm, msg_len, pk) != 0)
+  {
+    return -1;
+  }
+
+  memmove(m, sm, msg_len);
+  *mlen = msg_len;
+
+  return 0;
+}
diff --git a/crypto_sign/aimer128f/m4stack/sign.h b/crypto_sign/aimer128f/m4stack/sign.h
new file mode 100644
index 00000000..2e2ab61d
--- /dev/null
+++ b/crypto_sign/aimer128f/m4stack/sign.h
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef SIGN_H
+#define SIGN_H
+
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+typedef struct tape_t // filled byte-for-byte from hash output in commit_and_expand_tape: field order matters
+{
+  GF pt_share;
+  GF t_shares[AIMER_L];
+  GF a_share;
+  GF c_share;
+} tape_t;
+
+typedef struct mult_chk_t // x / z values used by aim2_mpc and the phase functions in sign.c
+{
+  GF x_shares[AIMER_L + 1];
+  GF z_shares[AIMER_L + 1];
+} mult_chk_t;
+
+typedef struct proof_t // per-repetition part of the signature
+{
+  uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE];
+  uint8_t missing_commitment[AIMER_COMMIT_SIZE];
+  uint8_t delta_pt_bytes[AIM2_NUM_BYTES_FIELD]; // sign.c hashes delta_pt..delta_c as one contiguous run: keep order
+  uint8_t delta_ts_bytes[AIMER_L][AIM2_NUM_BYTES_FIELD];
+  uint8_t delta_c_bytes[AIM2_NUM_BYTES_FIELD];
+  uint8_t missing_alpha_share_bytes[AIM2_NUM_BYTES_FIELD];
+} proof_t;
+
+typedef struct signature_t // sign.c casts raw signature buffers to this type
+{
+  uint8_t salt[AIMER_SALT_SIZE];
+  uint8_t h_1[AIMER_COMMIT_SIZE];
+  uint8_t h_2[AIMER_COMMIT_SIZE];
+  proof_t proofs[AIMER_T];
+} signature_t;
+
+#define aim2_mpc AIMER_NAMESPACE(aim2_mpc)
+void aim2_mpc(mult_chk_t *mult_chk, // x_shares[0..AIMER_L-1] in; x_shares[AIMER_L] accumulated; z_shares out
+              const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+              const GF ct_GF);
+
+#define commit_and_expand_tape AIMER_NAMESPACE(commit_and_expand_tape)
+void commit_and_expand_tape(tape_t *tape, uint8_t *commit, // both are outputs
+                            const hash_instance *ctx_precom,
+                            const uint8_t seed[AIMER_SEED_SIZE],
+                            size_t rep, size_t party);
+
+#define run_phase_1 AIMER_NAMESPACE(run_phase_1)
+void run_phase_1(signature_t *sign, // reads sign->salt; writes the proof deltas and h_1
+                 const GF pt_GF, const GF sbox_outputs[AIMER_L],
+                 const uint8_t mu[AIMER_COMMIT_SIZE],
+                 const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE]);
+
+#define run_phase_1_to_3 AIMER_NAMESPACE(run_phase_1_to_3)
+void run_phase_1_to_3(signature_t *sign, // reads salt, h_1 and the deltas; writes h_2
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE],
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b, const GF ct_GF);
+
+#define run_phase_1_to_5 AIMER_NAMESPACE(run_phase_1_to_5)
+void run_phase_1_to_5(signature_t *sign, // reads salt, h_1, h_2 and the deltas; writes the openings
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE],
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b);
+
+#endif // SIGN_H
diff --git a/crypto_sign/aimer128f/m4stack/tree.c b/crypto_sign/aimer128f/m4stack/tree.c
new file mode 100644
index 00000000..0585e7fa
--- /dev/null
+++ b/crypto_sign/aimer128f/m4stack/tree.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: MIT
+
+#include "tree.h"
+#include "hash.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+//  Example of tree for [N = 8]
+//  x
+//  d = 0: 1
+//  d = 1: 2         3
+//  d = 2: 4   5     6     7
+//  d = 3: 8 9 10 11 12 13 14 15
+
+void expand_tree(uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                 const uint8_t salt[AIMER_SALT_SIZE],
+                 size_t rep_index,
+                 const uint8_t root_seed[AIMER_SEED_SIZE])
+{
+  uint8_t input[2 + AIMER_SEED_SIZE]; // rep byte || node index byte || parent seed
+  hash_instance base, work;
+
+  // absorb prefix and salt once; every node hash starts from a clone of base
+  hash_init_prefix(&base, HASH_PREFIX_4);
+  hash_update(&base, salt, AIMER_SALT_SIZE);
+
+  input[0] = (uint8_t)rep_index;
+  memcpy(nodes[0], root_seed, AIMER_SEED_SIZE);
+  for (size_t parent = 1; parent < AIMER_N; parent++)
+  {
+    input[1] = (uint8_t)parent;
+    memcpy(&input[2], nodes[parent - 1], AIMER_SEED_SIZE);
+    hash_ctx_clone(&work, &base);
+    hash_update(&work, input, sizeof(input));
+    hash_final(&work);
+    // a single squeeze fills both children: rows 2*parent-1 and 2*parent
+    hash_squeeze(&work, nodes[2 * parent - 1], 2 * AIMER_SEED_SIZE);
+    hash_ctx_release(&work);
+  }
+  hash_ctx_release(&base);
+}
+
+void reveal_all_but(uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                    const uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                    size_t cover_index)
+{
+  // 1-based tree index of the hidden leaf; row k-1 of nodes holds node k
+  size_t node = AIMER_N + cover_index;
+
+  // walk from the leaf towards the root, emitting the sibling at each level
+  for (size_t level = 0; level < AIMER_LOGN; level++, node >>= 1)
+  {
+    const size_t sibling = node ^ 1;
+    memcpy(reveal_path[level], nodes[sibling - 1], AIMER_SEED_SIZE);
+  }
+}
+
+void reconstruct_tree(uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE],
+                      const uint8_t salt[AIMER_SALT_SIZE],
+                      const uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                      size_t rep_index,
+                      size_t cover_index)
+{
+  const size_t leaf = cover_index + AIMER_N; // 1-based tree index of the hidden leaf
+  uint8_t input[2 + AIMER_SEED_SIZE];        // rep byte || node index byte || parent seed
+  hash_instance base, work;
+
+  hash_init_prefix(&base, HASH_PREFIX_4);
+  hash_update(&base, salt, AIMER_SALT_SIZE);
+  input[0] = (uint8_t)rep_index;
+  for (size_t depth = 1; depth < AIMER_LOGN; depth++)
+  {
+    // place the revealed sibling of the hidden path at this depth (row k-2 holds node k)
+    const size_t sib = (leaf >> (AIMER_LOGN - depth)) ^ 1;
+    memcpy(nodes[sib - 2], reveal_path[AIMER_LOGN - depth], AIMER_SEED_SIZE);
+
+    // expand every node of this depth into its two children
+    for (size_t k = (size_t)1 << depth; k < ((size_t)2 << depth); k++)
+    {
+      input[1] = (uint8_t)k;
+      memcpy(&input[2], nodes[k - 2], AIMER_SEED_SIZE);
+      hash_ctx_clone(&work, &base);
+      hash_update(&work, input, sizeof(input));
+      hash_final(&work);
+      hash_squeeze(&work, nodes[2 * k - 2], 2 * AIMER_SEED_SIZE);
+      hash_ctx_release(&work);
+    }
+  }
+  hash_ctx_release(&base);
+
+  // finally overwrite the hidden leaf's sibling with its revealed seed
+  memcpy(nodes[(leaf ^ 1) - 2], reveal_path[0], AIMER_SEED_SIZE);
+}
diff --git a/crypto_sign/aimer128f/m4stack/tree.h b/crypto_sign/aimer128f/m4stack/tree.h
new file mode 100644
index 00000000..f1354f19
--- /dev/null
+++ b/crypto_sign/aimer128f/m4stack/tree.h
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef TREE_H
+#define TREE_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define expand_tree AIMER_NAMESPACE(expand_tree)
+void expand_tree(uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE], // out: heap node i at nodes[i - 1] (layout read by reveal_all_but)
+                 const uint8_t salt[AIMER_SALT_SIZE],
+                 size_t rep_index, // presumably the repetition number, as in reconstruct_tree -- confirm in tree.c
+                 const uint8_t root_seed[AIMER_SEED_SIZE]); // presumably the seed of the root node -- confirm in tree.c
+
+#define reveal_all_but AIMER_NAMESPACE(reveal_all_but)
+void reveal_all_but(uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE], // out: sibling seeds, entry 0 = sibling of the leaf
+                    const uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE], // heap node i at nodes[i - 1]
+                    size_t cover_index); // leaf (0-based) whose path to the root stays unrevealed
+
+#define reconstruct_tree AIMER_NAMESPACE(reconstruct_tree)
+void reconstruct_tree(uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE], // out: heap node i (i >= 2) at nodes[i - 2]; no root slot
+                      const uint8_t salt[AIMER_SALT_SIZE],
+                      const uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE], // as produced by reveal_all_but
+                      size_t rep_index,    // only its low 8 bits are hashed
+                      size_t cover_index); // same leaf index that was passed to reveal_all_but
+
+#endif // TREE_H
diff --git a/crypto_sign/aimer128s/m4speed/__asm_field.S b/crypto_sign/aimer128s/m4speed/__asm_field.S
new file mode 100644
index 00000000..05656b37
--- /dev/null
+++ b/crypto_sign/aimer128s/m4speed/__asm_field.S
@@ -0,0 +1,544 @@
+#include "params.h"
+
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+.text
+
+.align  2
+.global AIMER_NAMESPACE(GF_to_bytes)
+.global AIMER_NAMESPACE(GF_from_bytes)
+.global AIMER_NAMESPACE(GF_copy)
+.type   AIMER_NAMESPACE(GF_to_bytes), %function
+.type   AIMER_NAMESPACE(GF_from_bytes), %function
+.type   AIMER_NAMESPACE(GF_copy), %function
+AIMER_NAMESPACE(GF_to_bytes):    // three entry points share one body:
+AIMER_NAMESPACE(GF_from_bytes):  // copy 16 bytes (four 32-bit words)
+AIMER_NAMESPACE(GF_copy):        // from [R1] to [R0] without any byte swapping
+  out_p       .req R0            // destination pointer
+  in_p        .req R1            // source pointer
+
+  .equ width, 4                  // bytes per word
+
+  ldr.w R2, [in_p, #0 * width]   // R2 and R3 are scratch; words 0 and 1
+  ldr.w R3, [in_p, #1 * width]
+  str.w R2, [out_p, #0 * width]
+  str.w R3, [out_p, #1 * width]
+
+  ldr.w R2, [in_p, #2 * width]   // words 2 and 3
+  ldr.w R3, [in_p, #3 * width]
+  str.w R2, [out_p, #2 * width]
+  str.w R3, [out_p, #3 * width]
+
+  bx    lr
+
+  .unreq in_p
+  .unreq out_p
+
+.align  2
+.global AIMER_NAMESPACE(GF_set0)
+.type   AIMER_NAMESPACE(GF_set0), %function
+AIMER_NAMESPACE(GF_set0):        // store four zero words (16 bytes) at [R0]
+  out_p       .req R0            // destination pointer
+
+  .equ width, 4                  // bytes per word
+
+  mov.w R2, #0                   // R2 is the only scratch register
+  str.w R2, [out_p, #0 * width]
+  str.w R2, [out_p, #1 * width]
+  str.w R2, [out_p, #2 * width]
+  str.w R2, [out_p, #3 * width]
+
+  bx    lr
+
+  .unreq out_p
+
+.align  2
+.global AIMER_NAMESPACE(GF_add)
+.type   AIMER_NAMESPACE(GF_add), %function
+AIMER_NAMESPACE(GF_add):         // [R0] = [R1] XOR [R2] over four 32-bit words
+  out_p       .req R0            // destination pointer
+  in0_p       .req R1            // first operand pointer
+  in1_p       .req R2            // second operand pointer
+
+  .equ width, 4                  // bytes per word
+
+  ldr.w R3,  [in0_p, #0 * width] // R3 and R12 are scratch
+  ldr.w R12, [in1_p, #0 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #0 * width] // each word is stored after both loads, so out may equal an input
+
+  ldr.w R3,  [in0_p, #1 * width]
+  ldr.w R12, [in1_p, #1 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #1 * width]
+
+  ldr.w R3,  [in0_p, #2 * width]
+  ldr.w R12, [in1_p, #2 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #2 * width]
+
+  ldr.w R3,  [in0_p, #3 * width]
+  ldr.w R12, [in1_p, #3 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #3 * width]
+
+  bx    lr
+
+  .unreq out_p
+  .unreq in0_p
+  .unreq in1_p
+
+.macro or_shift_and in_a, con_a, off_a   // in_a = (in_a | (in_a << off_a)) & con_a
+  orr.w \in_a, \in_a, \in_a, lsl #\off_a
+  and.w \in_a, \in_a, \con_a             // con_a is an immediate mask (C0..C3 in GF_sqr_s)
+.endm
+
+.align  2
+.global AIMER_NAMESPACE(GF_sqr_s)
+.type   AIMER_NAMESPACE(GF_sqr_s), %function
+AIMER_NAMESPACE(GF_sqr_s):       // [R0] = ([R1])^2, reduced with the shifts 7, 2, 1, 0 (x^128 + x^7 + x^2 + x + 1)
+  out_p       .req R0            // destination pointer (four 32-bit words)
+  in_p        .req R1            // source pointer (four 32-bit words)
+
+  in0         .req R2            // in0..in7: the eight 32-bit words of the 256-bit square,
+  in1         .req R3            // least significant first; the pair (in2k, in2k+1) is temp[k]
+  in2         .req R4            // in the C-style comments below
+  in3         .req R5
+  in4         .req R6
+  in5         .req R7
+  in6         .req R8
+  in7         .req R9
+
+  .equ C0,    1431655765  // 0x55555555
+  .equ C1,    858993459   // 0x33333333
+  .equ C2,    252645135   // 0x0F0F0F0F
+  .equ C3,    16711935    // 0x00FF00FF
+  .equ C4,    4294967295  // 0xFFFFFFFF
+  .equ width, 4
+
+  push.w {R4-R10, lr}            // R4-R9 back in2..in7, R10 holds the C4 constant
+
+  ldr.w in0, [in_p, #0 * width]  // a[0]
+  ldr.w in2, [in_p, #1 * width]
+  ldr.w in4, [in_p, #2 * width]  // a[1]
+  ldr.w in6, [in_p, #3 * width]
+
+  lsr.w in1, in0, #16            // odd registers take the upper 16 bits of each input word
+  lsr.w in3, in2, #16
+  lsr.w in5, in4, #16
+  lsr.w in7, in6, #16
+
+  mov.w R10, C4
+
+  and.w in0, in0, R10, lsr #16   // even registers keep the lower 16 bits (mask 0x0000FFFF)
+  and.w in2, in2, R10, lsr #16
+  and.w in4, in4, R10, lsr #16
+  and.w in6, in6, R10, lsr #16
+
+  or_shift_and in0, C3, 8        // four spreading rounds (8, 4, 2, 1): afterwards input bit i
+  or_shift_and in1, C3, 8        // of each 16-bit half sits at bit 2 * i, odd bits are zero
+  or_shift_and in2, C3, 8
+  or_shift_and in3, C3, 8
+  or_shift_and in4, C3, 8
+  or_shift_and in5, C3, 8
+  or_shift_and in6, C3, 8
+  or_shift_and in7, C3, 8
+
+  or_shift_and in0, C2, 4
+  or_shift_and in1, C2, 4
+  or_shift_and in2, C2, 4
+  or_shift_and in3, C2, 4
+  or_shift_and in4, C2, 4
+  or_shift_and in5, C2, 4
+  or_shift_and in6, C2, 4
+  or_shift_and in7, C2, 4
+
+  or_shift_and in0, C1, 2
+  or_shift_and in1, C1, 2
+  or_shift_and in2, C1, 2
+  or_shift_and in3, C1, 2
+  or_shift_and in4, C1, 2
+  or_shift_and in5, C1, 2
+  or_shift_and in6, C1, 2
+  or_shift_and in7, C1, 2
+
+  or_shift_and in0, C0, 1
+  or_shift_and in1, C0, 1
+  or_shift_and in2, C0, 1
+  or_shift_and in3, C0, 1
+  or_shift_and in4, C0, 1
+  or_shift_and in5, C0, 1
+  or_shift_and in6, C0, 1
+  or_shift_and in7, C0, 1
+
+  // t = temp[2] ^ ((temp[3] >> 57) ^ (temp[3] >> 62) ^ (temp[3] >> 63));
+  eor.w in4, in4, in7, lsr #25   // 64-bit shift by 57 = upper word shifted by 25; t lives in in4:in5
+  eor.w in4, in4, in7, lsr #30
+  eor.w in4, in4, in7, lsr #31
+
+  // c[1] = temp[1] ^ temp[3];
+  eor.w in2, in2, in6
+  eor.w in3, in3, in7
+
+  // c[1] ^= (temp[3] << 7) | (t >> 57);
+  // c[1] ^= (temp[3] << 2) | (t >> 62);
+  // c[1] ^= (temp[3] << 1) | (t >> 63);
+  eor.w in2, in2, in5, lsr #25   // bits of t that spill into the low word of c[1]
+  eor.w in2, in2, in5, lsr #30
+  eor.w in2, in2, in5, lsr #31
+
+  eor.w in2, in2, in6, lsl #7
+  eor.w in2, in2, in6, lsl #2
+  eor.w in2, in2, in6, lsl #1
+
+  eor.w in3, in3, in6, lsr #25   // carry from the low word of temp[3] into the high word of c[1]
+  eor.w in3, in3, in6, lsr #30
+  eor.w in3, in3, in6, lsr #31
+
+  eor.w in3, in3, in7, lsl #7
+  eor.w in3, in3, in7, lsl #2
+  eor.w in3, in3, in7, lsl #1
+
+  // c[0] = temp[0] ^ t;
+  eor.w in0, in0, in4
+  eor.w in1, in1, in5
+
+  // c[0] ^= (t << 7);
+  // c[0] ^= (t << 2);
+  // c[0] ^= (t << 1);
+  eor.w in0, in0, in4, lsl #7
+  eor.w in0, in0, in4, lsl #2
+  eor.w in0, in0, in4, lsl #1
+
+  eor.w in1, in1, in4, lsr #25   // carry from the low word of t into the high word of c[0]
+  eor.w in1, in1, in4, lsr #30
+  eor.w in1, in1, in4, lsr #31
+
+  eor.w in1, in1, in5, lsl #7
+  eor.w in1, in1, in5, lsl #2
+  eor.w in1, in1, in5, lsl #1
+
+  str.w in0, [out_p, #0 * width] // result = c[0] (in0, in1) followed by c[1] (in2, in3)
+  str.w in1, [out_p, #1 * width]
+  str.w in2, [out_p, #2 * width]
+  str.w in3, [out_p, #3 * width]
+
+  pop.w {R4-R10, pc}
+
+  // unlink register name (NOTE(review): the in0..in7 aliases are not released here)
+  .unreq in_p
+  .unreq out_p
+
+.macro lut_access0_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  and \sp1, \mask, \b0_1, lsr #\offset   // byte offset of the table entry selected by the upper word
+  and \sp0, \mask, \b0_0, lsr #\offset   // byte offset of the table entry selected by the lower word
+  add \sp1, \sp1, sp                     // the table starts at the current stack pointer
+  add \sp0, \sp0, sp
+  // first step: nothing to shift yet, the accumulator words are loaded directly
+  ldr \out1_0, [\sp1, #0]
+  ldr \in0_2, [\sp1, #4]
+  // in0_3 is not written here; out0_0, out0_1 and out1_1 are unused by this variant
+  ldr \in0_0, [\sp0, #0]
+  ldr \in0_1, [\sp0, #4]
+  eor \in0_1, \in0_1, \out1_0            // upper-word entry lands one 32-bit word higher
+.endm
+
+.macro lut_access0_1_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  and \sp1, \mask, \b0_1, lsr #\offset   // table offsets for the next pair of 4-bit digits
+  and \sp0, \mask, \b0_0, lsr #\offset
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+
+  lsr \in0_3, \in0_2, #28                // second step: in0_3 is created from the bits leaving in0_2
+  ldr \out1_0, [\sp1, #0]
+  ldr \out1_1, [\sp1, #4]
+
+  lsl \in0_2, \in0_2, #4                 // accumulator (in0_3:in0_2:in0_1:in0_0) <<= 4, interleaved with loads
+  ldr \out0_0, [\sp0, #0]
+  ldr \out0_1, [\sp0, #4]
+
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  lsl \in0_1, \in0_1, #4
+  orr \in0_1, \in0_1, \in0_0, lsr #28
+  lsl \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0            // upper-word entry is added at words 1..2
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0            // lower-word entry is added at words 0..1
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro lut_access0_1 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  and \sp1, \mask, \b0_1, lsr #\offset   // table offsets for the next pair of 4-bit digits
+  and \sp0, \mask, \b0_0, lsr #\offset
+  lsl \in0_3, \in0_3, #4                 // general step: full 128-bit accumulator <<= 4 ...
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+
+  orr \in0_3, \in0_3, \in0_2, lsr #28
+  ldr \out1_0, [\sp1, #0]
+  ldr \out1_1, [\sp1, #4]
+
+  lsl \in0_2, \in0_2, #4
+  ldr \out0_0, [\sp0, #0]
+  ldr \out0_1, [\sp0, #4]
+
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  lsl \in0_1, \in0_1, #4
+  orr \in0_1, \in0_1, \in0_0, lsr #28
+  lsl \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0            // ... then XOR in both selected table entries
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro lut_access1 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  lsl \in0_3, \in0_3, #4                 // same step as lut_access0_1, but the digit is
+  and \sp1, \mask, \b0_1, lsl #\offset   // positioned with a left shift (used for the lowest digit)
+  orr \in0_3, \in0_3, \in0_2, lsr #28
+  and \sp0, \mask, \b0_0, lsl #\offset
+
+  lsl \in0_2, \in0_2, #4
+  add \sp1, \sp1, sp
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  add \sp0, \sp0, sp
+
+  lsl   \in0_1, \in0_1, #4
+  ldmia \sp1, {\out1_0-\out1_1}          // both words of the entry in one load-multiple
+  orr   \in0_1, \in0_1, \in0_0, lsr #28
+  ldmia \sp0, {\out0_0-\out0_1}
+  lsl   \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, offset
+  and \mask, \one, \r0_ret, lsr #\offset // isolate bit number offset of r0_ret
+  sub \mask, \zero, \mask                // 0 - bit: all ones if the bit is set, else zero (no branch)
+  and \mask0_1, \b0_0, \mask             // conditionally select the 64-bit value b0_1:b0_0
+  and \mask0_2, \b0_1, \mask
+  // shift the selected value left by offset across three words
+  lsl \mask0_0, \mask0_1, #\offset
+  lsr \mask0_1, \mask0_1, #32 - \offset
+  orr \mask0_1, \mask0_1, \mask0_2, lsl #\offset
+  lsr \mask0_2, \mask0_2, #32 - \offset
+  // accumulate starting at word 1, i.e. a total shift of 32 + offset bits
+  eor \in0_1, \in0_1, \mask0_0
+  eor \in0_2, \in0_2, \mask0_1
+  eor \in0_3, \in0_3, \mask0_2
+.endm
+
+.macro last_mask1 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, offset
+  sub \mask, \zero, \r0_ret, lsr #\offset // invoked with offset 31: the shift leaves one bit, so no AND with one is done
+  and \mask0_1, \b0_0, \mask              // conditionally select the 64-bit value b0_1:b0_0
+  and \mask0_2, \b0_1, \mask
+  // shift the selected value left by offset across three words
+  lsl \mask0_0, \mask0_1, #\offset
+  lsr \mask0_1, \mask0_1, #32 - \offset
+  orr \mask0_1, \mask0_1, \mask0_2, lsl #\offset
+  lsr \mask0_2, \mask0_2, #32 - \offset
+  // accumulate starting at word 1, i.e. a total shift of 32 + offset bits
+  eor \in0_1, \in0_1, \mask0_0
+  eor \in0_2, \in0_2, \mask0_1
+  eor \in0_3, \in0_3, \mask0_2
+.endm
+
+.global AIMER_NAMESPACE(poly64_mul)      // NOTE(review): no .align 2 here, unlike the routines above
+.type   AIMER_NAMESPACE(poly64_mul), %function
+AIMER_NAMESPACE(poly64_mul):             // 64 x 64 -> 128 bit XOR/shift product: R0 -> a, R1 -> b, R2 -> high 64 bits, R3 -> low 64 bits
+  t0_0    .req R0                        // t0..t5: six 64-bit multiples of a while the table is built
+  t0_1    .req R1
+  t1_0    .req R2
+  t1_1    .req R3
+  t2_0    .req R4
+  t2_1    .req R5
+  t3_0    .req R6
+  t3_1    .req R7
+  t4_0    .req R8
+  t4_1    .req R9
+  t5_0    .req R10
+  t5_1    .req R11
+
+  r1_copy .req R12                       // saved pointer to b
+  t_base  .req R14                       // write cursor into the stack table
+
+  sp0     .req R12                       // table entry addresses during the lookups
+  sp1     .req R14
+
+  b0_0    .req R0                        // the two words of b
+  b0_1    .req R1
+
+  in0_0   .req R2                        // 128-bit accumulator, least significant word first
+  in0_1   .req R3
+  in0_2   .req R4
+  in0_3   .req R5
+
+  out0_0  .req R6                        // table entry selected by b0_0
+  out0_1  .req R7
+  out1_0  .req R8                        // table entry selected by b0_1
+  out1_1  .req R9
+
+  mask    .req R10
+
+  zero    .req R6                        // registers reused for the final fix-up
+  one     .req R7
+
+  r0_ret  .req R8                        // original upper word of a
+
+  mask0_0 .req R9
+  mask0_1 .req R11
+  mask0_2 .req R12
+
+  push  {R4-R11, lr}
+  push  {R2-R3}                          // output pointers, reloaded at [sp, #132] and [sp, #136]
+
+  ldr   t1_0, [R0, #0]
+  ldr   t1_1, [R0, #4]
+  push  {t1_1}                           // unmasked upper word of a, reloaded at [sp, #128]
+
+  sub   sp, #128  // allocating space in the stack
+                  // 8 bytes * 16 = 128 bytes
+  mov   t_base, sp
+  mov   r1_copy, R1
+
+  mov   t0_0, #0
+  mov   t0_1, #0
+
+  and   t1_1, #0x1FFFFFFF                // drop the top 3 bits of a; they are handled by last_mask0/1 below
+
+  lsl   t2_1, t1_1, #1                   // t2 = a << 1
+  orr   t2_1, t2_1, t1_0, lsr #31
+  lsl   t2_0, t1_0, #1
+
+  eor   t3_0, t1_0, t2_0                 // t3 = a ^ (a << 1)
+  eor   t3_1, t1_1, t2_1
+
+  lsl   t4_1, t2_1, #1                   // t4 = a << 2
+  orr   t4_1, t4_1, t2_0, lsr #31
+  lsl   t4_0, t2_0, #1
+
+  eor   t5_0, t1_0, t4_0                 // t5 = a ^ (a << 2)
+  eor   t5_1, t1_1, t4_1
+
+  stmia t_base!, {t0_0-t5_1}  // table entries 0..5 (4 bytes X 12 elements  = 48 bytes)
+
+  eor   t0_0, t2_0, t4_0                 // registers are recycled to form entries 6..11 in t0..t5
+  eor   t0_1, t2_1, t4_1
+
+  lsl   t2_1, t4_1, #1                   // t2 = a << 3
+  orr   t2_1, t2_1, t4_0, lsr #31
+  lsl   t2_0, t4_0, #1
+
+  eor   t5_0, t2_0, t3_0
+  eor   t5_1, t2_1, t3_1
+
+  eor   t3_0, t2_0, t1_0
+  eor   t3_1, t2_1, t1_1
+
+  eor   t4_0, t4_0, t0_0
+  eor   t4_1, t4_1, t0_1
+
+  eor   t4_0, t4_0, t2_0
+  eor   t4_1, t4_1, t2_1
+
+  eor   t1_0, t1_0, t0_0
+  eor   t1_1, t1_1, t0_1
+
+  stmia t_base!, {t0_0-t5_1}  // table entries 6..11 (4 bytes X 12 elements  = 48 bytes)
+
+  eor   t1_0, t5_0, t0_0                 // entries 12..15 are formed in t0..t3
+  eor   t1_1, t5_1, t0_1
+
+  eor   t2_0, t2_0, t0_0
+  eor   t2_1, t2_1, t0_1
+
+  eor   t3_0, t3_0, t0_0
+  eor   t3_1, t3_1, t0_1
+
+  eor   t0_0, t4_0, t0_0
+  eor   t0_1, t4_1, t0_1
+
+  stmia t_base, {t0_0-t3_1}  // table entries 12..15 (4 bytes X 8 elements  = 32 bytes)
+
+  ldmia r1_copy, {b0_0-b0_1}
+  mov   mask, #0x00000078                // 4-bit digit scaled by 8, the size of one table entry
+  // scan b four bits at a time from the top; both words of b are processed in the same step
+  lut_access0_0   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 25
+  lut_access0_1_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 21
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 17
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 13
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 9
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 5
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 1
+  lut_access1     sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 3
+
+  mov   zero, #0
+  mov   one,  #1
+  ldr   r0_ret, [sp, #128]               // upper word of a as loaded before masking
+  // add b << (32 + k) for each of the three masked-off bits k = 29, 30, 31 of that word
+  last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 29
+  last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 30
+  last_mask1 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 31
+
+  ldr   R0, [sp, #132]                   // saved R2: destination of the high 64 bits
+  ldr   R1, [sp, #136]                   // saved R3: destination of the low 64 bits
+  add   sp, #140  // restoring stack (NOTE(review): the 128-byte table is not wiped)
+
+  str   in0_0, [R1, #0]
+  str   in0_1, [R1, #4]
+
+  str   in0_2, [R0, #0]
+  str   in0_3, [R0, #4]
+
+  pop   {R4-R11, pc}
+
+  // unlink register name
+  .unreq t0_0
+  .unreq t0_1
+  .unreq t1_0
+  .unreq t1_1
+  .unreq t2_0
+  .unreq t2_1
+  .unreq t3_0
+  .unreq t3_1
+  .unreq t4_0
+  .unreq t4_1
+  .unreq t5_0
+  .unreq t5_1
+
+  .unreq r1_copy
+  .unreq t_base
+
+  .unreq sp0
+  .unreq sp1
+
+  .unreq b0_0
+  .unreq b0_1
+
+  .unreq in0_0
+  .unreq in0_1
+  .unreq in0_2
+  .unreq in0_3
+
+  .unreq out0_0
+  .unreq out0_1
+  .unreq out1_0
+  .unreq out1_1
+
+  .unreq mask
+
+  .unreq zero
+  .unreq one
+
+  .unreq r0_ret
+
+  .unreq mask0_0
+  .unreq mask0_1
+  .unreq mask0_2
diff --git a/crypto_sign/aimer128s/m4speed/aim2.c b/crypto_sign/aimer128s/m4speed/aim2.c
new file mode 100644
index 00000000..7cb00352
--- /dev/null
+++ b/crypto_sign/aimer128s/m4speed/aim2.c
@@ -0,0 +1,459 @@
+// SPDX-License-Identifier: MIT
+
+#include "aim2.h"
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+// inverse Mersenne S-box with e1 = 49: out = in ^ d via a fixed square-and-multiply chain, where d is
+// (2 ^ 49 - 1) ^ (-1) mod (2 ^ 128 - 1) = 0xb6b6d6d6dadb5b5b6b6b6d6dadadb5b5
+// exponent chunks, consumed most significant first: b6b6d6d 6 d a d b5 b5 b6 b6b6d6d a d a d b5 b5
+void GF_exp_invmer_e_1(GF out, const GF in)
+{
+  size_t i;
+  GF t1 = {0,}, t2 = {0,};
+  GF table_5 = {0,}, table_6 = {0,};
+  GF table_a = {0,}, table_b = {0,}, table_d = {0,};
+
+  // table_d = in ^ 2 (temporary use), t1 = in ^ 4
+  GF_sqr_s(table_d, in);
+  GF_sqr_s(t1, table_d);
+
+  // table_5 = in ^ 5
+  GF_mul_s(table_5, t1, in);
+  // table_6 = in ^ 6
+  GF_mul_s(table_6, table_5, in);
+  // table_a = in ^ 10 = (in ^ 5) ^ 2
+  GF_sqr_s(table_a, table_5);
+  // table_b = in ^ 11
+  GF_mul_s(table_b, table_a, in);
+  // table_d = in ^ 13 = (in ^ 11) * (in ^ 2)
+  GF_mul_s(table_d, table_b, table_d);
+
+  // from here on table_b = in ^ (0xb6) and table_5 = in ^ (0xb5); their old values are no longer needed
+  GF_sqr_s(t1, table_b);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(table_b, t1, table_6);
+  GF_mul_s(table_5, t1, table_5);
+
+  // t1 = in ^ (0xb6b6): 8 squarings in total (1 + 7), then one multiplication
+  GF_sqr_s(t1, table_b);
+  for (i = 1; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xb6b6 d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6b6d 6)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_6);
+
+  // t2 = in ^ (0xb6b6d6 d), kept because this 28-bit pattern occurs a second time below
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t2, t1, table_d);
+
+  // t1 = in ^ (0xb6b6d6d 6)
+  GF_sqr_s(t1, t2);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_6);
+
+  // t1 = in ^ (0xb6b6d6d6 d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6b6d6d6d a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xb6b6d6d6da d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6b6d6d6dad b5)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5 b5)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5 b6)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5b6 b6b6d6d): 28 squarings, then reuse t2
+  for (i = 0; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5b6b6b6d6d a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5b6b6b6d6da d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5b6b6b6d6dad a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5b6b6b6d6dada d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5b6b6b6d6dadad b5)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_5);
+
+  // out = in ^ (0xb6b6d6d6dadb5b5b6b6b6d6dadadb5 b5); out is written only here (aim2 passes out == in)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(out, t1, table_5);
+}
+
+// inverse Mersenne S-box with e2 = 91: out = in ^ d via a fixed square-and-multiply chain, where d is
+// (2 ^ 91 - 1) ^ (-1) mod (2 ^ 128 - 1) = 0xb6db5b6dadb6dadb6d6db6d6db6b6db5
+// exponent chunks, consumed most significant first: b6d b5 b6d a d b6d a d b6d 6 d b6d 6 d b6 b6d b5
+void GF_exp_invmer_e_2(GF out, const GF in)
+{
+  size_t i;
+  GF t1 = {0,}, t2 = {0,}, t3 = {0,};
+  GF table_5 = {0,}, table_6 = {0,};
+  GF table_a = {0,}, table_b = {0,}, table_d = {0,};
+
+  // table_d = in ^ 2 (temporary use), t1 = in ^ 4
+  GF_sqr_s(table_d, in);
+  GF_sqr_s(t1, table_d);
+
+  // table_5 = in ^ 5
+  GF_mul_s(table_5, t1, in);
+  // table_6 = in ^ 6
+  GF_mul_s(table_6, table_5, in);
+  // table_a = in ^ 10 = (in ^ 5) ^ 2
+  GF_sqr_s(table_a, table_5);
+  // table_b = in ^ 11
+  GF_mul_s(table_b, table_a, in);
+  // table_d = in ^ 13 = (in ^ 11) * (in ^ 2)
+  GF_mul_s(table_d, table_b, table_d);
+
+  // t3 = in ^ (0xb6), table_b = in ^ (0xb5) (table_b is repurposed from here on)
+  GF_sqr_s(t1, table_b);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(table_b, t1, table_5);
+  GF_mul_s(t3, t1, table_6);
+
+  // t2 = in ^ (0xb6 d), kept because the 12-bit pattern b6d recurs throughout the exponent
+  GF_sqr_s(t1, t3);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t2, t1, table_d);
+
+  // t1 = in ^ (0xb6d b5): 8 squarings in total (1 + 7), then one multiplication
+  GF_sqr_s(t1, t2);
+  for (i = 1; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xb6db5 b6d)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xb6db5b6d a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xb6db5b6da d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6db5b6dad b6d)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xb6db5b6dadb6d a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xb6db5b6dadb6da d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6db5b6dadb6dad b6d)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d 6)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_6);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d6 d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d6d b6d)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d6db6d 6)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_6);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d6db6d6 d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d6db6d6d b6)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t3);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d6db6d6db6 b6d)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // out = in ^ (0xb6db5b6dadb6dadb6d6db6d6db6b6d b5); out is written only here (aim2 passes out == in)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(out, t1, table_b);
+}
+
+// Mersenne exponentiation with e_star = 3: out = in ^ (2 ^ 3 - 1) = in ^ 7
+void GF_exp_mer_e_star(GF out, const GF in)
+{
+  GF t1 = {0,};
+
+  // t1 = a ^ (2 ^ 2 - 1), where a denotes the input `in`
+  GF_sqr_s(t1, in);
+  GF_mul_s(t1, t1, in);
+
+  // out = a ^ (2 ^ 3 - 1); out is written only by this last call (aim2 passes out == in)
+  GF_sqr_s(t1, t1);
+  GF_mul_s(out, t1, in);
+}
+
+void generate_matrices_L_and_U( // expands iv into one (L, U) bit-matrix pair per S-box input plus vector_b
+        GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD], // out: one GF (bit row) per row index
+        GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD], // out: same shape as matrix_L
+        GF vector_b,                                           // out: final squeeze of the hash stream
+        const uint8_t iv[AIM2_IV_SIZE])
+{
+  uint8_t buf[AIM2_NUM_BYTES_FIELD];
+  uint64_t ormask, lmask, umask;
+  hash_instance ctx;
+  GF temp = {0,};
+
+  // initialize hash: absorb iv only; everything below is squeezed from this one stream
+  hash_init(&ctx);
+  hash_update(&ctx, iv, AIM2_IV_SIZE);
+  hash_final(&ctx);
+
+  for (size_t num = 0; num < AIM2_NUM_INPUT_SBOX; num++)
+  {
+    for (size_t row = 0; row < AIM2_NUM_BITS_FIELD; row++)
+    {
+      hash_squeeze(&ctx, buf, AIM2_NUM_BYTES_FIELD); // one fresh field element per row, shared by L and U
+      GF_from_bytes(temp, buf);
+
+      ormask = ((uint64_t)1) << (row % 64);  // the diagonal bit, forced to 1 in both matrices
+      lmask = ((uint64_t)-1) << (row % 64);  // bits at and above the diagonal within the boundary word
+      umask = ~lmask;                        // bits strictly below the diagonal within that word
+
+      size_t inter = row / 64;               // index of the 64-bit word containing the diagonal bit
+      size_t col_word;
+      for (col_word = 0; col_word < inter; col_word++)
+      {
+        // L is zero, U is full
+        matrix_L[num][row][col_word] = 0;
+        matrix_U[num][row][col_word] = temp[col_word];
+      }
+      matrix_L[num][row][inter] = (temp[inter] & lmask) | ormask;
+      matrix_U[num][row][inter] = (temp[inter] & umask) | ormask;
+      for (col_word = inter + 1; col_word < AIM2_NUM_WORDS_FIELD; col_word++)
+      {
+        // L is full, U is zero
+        matrix_L[num][row][col_word] = temp[col_word];
+        matrix_U[num][row][col_word] = 0;
+      }
+    }
+  }
+  // NOTE(review): vector_b is filled as raw bytes, not via GF_from_bytes like the rows above
+  hash_squeeze(&ctx, (uint8_t *)vector_b, AIM2_NUM_BYTES_FIELD);
+  hash_ctx_release(&ctx);
+}
+
+void generate_matrix_LU(GF matrix_A[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+                        GF vector_b,
+                        const uint8_t iv[AIM2_IV_SIZE])
+{
+  GF lower[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF upper[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+
+  generate_matrices_L_and_U(lower, upper, vector_b, iv);
+  // row `row` of A[box] is GF_transposed_matmul of the same row of U[box] with all of L[box]
+  for (size_t box = 0; box < AIM2_NUM_INPUT_SBOX; box++)
+  {
+    const GF *lower_rows = (const GF *)lower[box];
+    for (size_t row = 0; row < AIM2_NUM_BITS_FIELD; row++)
+    {
+      GF_transposed_matmul(matrix_A[box][row], upper[box][row], lower_rows);
+    }
+  }
+}
+
+void aim2(uint8_t ct[AIM2_NUM_BYTES_FIELD],       // out: result bytes
+          const uint8_t pt[AIM2_NUM_BYTES_FIELD], // in: input block, also fed forward at the end
+          const uint8_t iv[AIM2_IV_SIZE])         // in: seeds the matrices L, U and vector_b
+{
+  GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+
+  GF state[AIM2_NUM_INPUT_SBOX];
+  GF pt_GF = {0,}, ct_GF = {0,};
+  GF_from_bytes(pt_GF, pt);
+
+  // generate the iv-dependent matrices L and U and the vector b
+  generate_matrices_L_and_U(matrix_L, matrix_U, vector_b, iv);
+
+  // linear component: constant addition
+  GF_add(state[0], pt_GF, aim2_constants[0]);
+  GF_add(state[1], pt_GF, aim2_constants[1]);
+
+  // non-linear component: inverse Mersenne S-box
+  GF_exp_invmer_e_1(state[0], state[0]);
+  GF_exp_invmer_e_2(state[1], state[1]);
+
+  // linear component: affine layer (each state goes through U first, then L, in place)
+  GF_transposed_matmul(state[0], state[0], (const GF *)matrix_U[0]);
+  GF_transposed_matmul(state[0], state[0], (const GF *)matrix_L[0]);
+
+  GF_transposed_matmul(state[1], state[1], (const GF *)matrix_U[1]);
+  GF_transposed_matmul(state[1], state[1], (const GF *)matrix_L[1]);
+
+  GF_add(state[0], state[0], state[1]); // combine both branches, then add vector_b
+  GF_add(state[0], state[0], vector_b);
+
+  // non-linear component: Mersenne S-box
+  GF_exp_mer_e_star(state[0], state[0]);
+
+  // linear component: feed-forward
+  GF_add(ct_GF, state[0], pt_GF);
+
+  GF_to_bytes(ct, ct_GF);
+}
+
+void aim2_sbox_outputs(GF sbox_outputs[AIM2_NUM_INPUT_SBOX], const GF pt) // same first two steps as aim2()
+{
+  // linear component: constant addition
+  GF_add(sbox_outputs[0], pt, aim2_constants[0]);
+  GF_add(sbox_outputs[1], pt, aim2_constants[1]);
+
+  // non-linear component: inverse Mersenne S-box (applied in place, one exponent per entry)
+  GF_exp_invmer_e_1(sbox_outputs[0], sbox_outputs[0]);
+  GF_exp_invmer_e_2(sbox_outputs[1], sbox_outputs[1]);
+}
diff --git a/crypto_sign/aimer128s/m4speed/aim2.h b/crypto_sign/aimer128s/m4speed/aim2.h
new file mode 100644
index 00000000..5564fc71
--- /dev/null
+++ b/crypto_sign/aimer128s/m4speed/aim2.h
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef AIM2_H
+#define AIM2_H
+
+#include "field.h"
+#include "params.h"
+#include <stdint.h>
+
+static const GF aim2_constants[AIM2_NUM_INPUT_SBOX] =  // added to pt per S-box branch
+{  // values match the leading hex digits of pi's fraction; word [1] is the high half
+  {0x13198a2e03707344, 0x243f6a8885a308d3},
+  {0x082efa98ec4e6c89, 0xa4093822299f31d0},
+};
+
+static const GF aim2_e1_power_matrix[AIM2_NUM_BITS_FIELD] =  // one GF row per field bit
+{  // NOTE(review): presumably the matrix behind GF_exp_invmer_e_1 - confirm in aim2.c
+  {0x0000000000000001, 0x0000000000000000},
+  {0xb87c1159421de6c0, 0xfbcf8c1e442c8cf5},
+  {0x687634c0bd8f66a6, 0x4d328e5ae8b1bde5},
+  {0x742a6036d93c2057, 0x08974511b147a2fe},
+  {0xc8b21bf16608e4db, 0x4d758c29eeb484f7},
+  {0x0b5c6d5c43980a3c, 0x82739c986dfbdb20},
+  {0x0ace7f98da3711b9, 0x34f149a76cf782b0},
+  {0x321995ec53ea9914, 0xc2ff5007f8a98c83},
+  {0x939b53119c4b7496, 0x097da6d2e8f7686d},
+  {0x5fb6dd3ca90cff95, 0x10f77bb9e7748ed3},
+  {0x55194932141d0937, 0xc253f8ea7ac0779a},
+  {0xb2a4b4591251916b, 0xdfef8e3e1b142c07},
+  {0x14df24dfc33e1f4f, 0x931f7bdb443197a1},
+  {0xbd4cbe8b919dbb07, 0x24128da6bf057bc8},
+  {0x1be6a922a8d0d7d4, 0xb7330162b6115e90},
+  {0xb6d9e6635ec916aa, 0x930f20cea1c668e0},
+  {0xccbb31a458da0423, 0x60488351c7403436},
+  {0xef86b4dbc4263e4d, 0x9237f55823767eae},
+  {0xe2a0e301bed0748a, 0x967e64f599297f3c},
+  {0x2fde9314f05105e5, 0x58f5315e0e29e358},
+  {0xc9e5b15be18b7596, 0xa305f4f11aaa8ad2},
+  {0xa592cb3563071925, 0x31b050cca997ed24},
+  {0xa55f9e7374b10af1, 0x5904c31aaebea1ed},
+  {0xcf6921d88d12bbf2, 0xea5142776b77d368},
+  {0x28779ef24c9ddcb5, 0x448bfd74cc624506},
+  {0x0d2caf1924759d9a, 0xc66ef14828e98e80},
+  {0x312a49ac8d3790db, 0x5121956dac40960a},
+  {0x311230a0f0166f37, 0x41cdda4642d1e45a},
+  {0x152cd68f8d980779, 0x50accd8f44cc6a3b},
+  {0x0e6342e6e178a202, 0xaf2e59b6e13fec01},
+  {0xcdfaea274cfff823, 0x008f7a68483d8f8a},
+  {0x80183f4571309485, 0xdece92499f9521ba},
+  {0xaba321469362905f, 0x3c5814a4c792b3be},
+  {0x7e8680766e1d3ffc, 0x7585a167f0b843b8},
+  {0x4e81e572c5dbf79a, 0x114bd1d466ef8787},
+  {0x3a7e0a403a1da600, 0x014747267c0b38f9},
+  {0x23116c4dd539e293, 0x196284a6305e23d7},
+  {0xf0a02f00d5a45c0f, 0xae9980fc3aa3cd2e},
+  {0x7eae2c6dae8286e9, 0xd2be72a1da8addbc},
+  {0xbb8689cb630a9e23, 0x2d1eb9e86163e7f4},
+  {0xf0febfb8f6e46561, 0x8eda5ccd665a3ac2},
+  {0x370a6880719f8be9, 0x83fe14fe68c33df0},
+  {0xe9634dd58474116f, 0xdfb51a0ca76c9c82},
+  {0x9c40da32ca69fe52, 0xcecdf64c8559eef4},
+  {0xe29f358edce8d40e, 0x9256190cf3cfb1fa},
+  {0xb5431f672597e9cd, 0xc69025ae5a99210a},
+  {0x0f00e0c670d40d95, 0xdf81e3ce7617b0a1},
+  {0x699332d099ea38d7, 0xc24d5671c235f28e},
+  {0x89ea2f4529a74b45, 0x7c11f6654369b65d},
+  {0xeaa8470e44915e89, 0x049b62170967135b},
+  {0x39fb9877aadc951b, 0xba3743d76fda5083},
+  {0xe2da8722532e6fbb, 0xdef2a5ff6e028aba},
+  {0xb5e975340c6c76a1, 0xf28418e25fbc0144},
+  {0x035ab9363f6882be, 0xab56f227d4a26a26},
+  {0x273536b8b02dd5f1, 0x75af981a11d43e64},
+  {0x846e480a8bc44fa9, 0x507a048207335fa1},
+  {0x3808d8fa4fcba922, 0xf632f1c9c802ab76},
+  {0x34ecb7872eda1962, 0x2dcfedd3c12f73be},
+  {0xf884a540c1b411ea, 0xf77d23a1c6600553},
+  {0x0e106a0239843e3c, 0x7d5ef83763344eed},
+  {0x4192e743be4ae7d9, 0x5070be659c9249dd},
+  {0x6588c07b62dd03ba, 0x09d7b6469e953856},
+  {0x790b4af55db42c92, 0x5c859acd40414177},
+  {0xedda860c739ca8f7, 0xd728f7e92e3e7940},
+  {0xfbcf513b18b860f7, 0xf6fd92c58b52c44d},
+  {0x4f1571762119854e, 0x04286d00eb347197},
+  {0x3f777b9977ed2aa6, 0xf68288c09c8d73d4},
+  {0x538b16a3bd887a20, 0x86437c4cb491c94b},
+  {0x3656d64f9fdf8baf, 0x97db137363bf2a7e},
+  {0x0582fbdad31a1e6c, 0x213b4a759760ffe0},
+  {0xc7f42208feff0a47, 0x05cb6fb77aad0666},
+  {0x8f59c644fd5259d4, 0xd3740dabc91a5ecd},
+  {0xca19d9ef4ab67cee, 0xa2486f3cdc03c63f},
+  {0x8a1f14a7c3d2f88f, 0x71b6e4a0b3d4a2a0},
+  {0xe9ee9aa288652690, 0xa28d2266c47e02b2},
+  {0x759c7eee1a3eead3, 0x689aa81596670031},
+  {0x50a9a3f15e0032ae, 0x206b34f2ed6fc8ff},
+  {0x630774b85c40302e, 0xf7f5952347d531a6},
+  {0x78886ef4e794267b, 0x7072ec9b3a2ddd8f},
+  {0x754c7bf46deec1a2, 0xb360d5ec03ebf053},
+  {0x337080ceace4b67a, 0xbe8541809bccdc7d},
+  {0x8c243c5d486009a0, 0x87fc6f3fbe554f61},
+  {0x58e8f3ccf2596f26, 0xc7a500e89b1b40a6},
+  {0x516a6cbee9e76420, 0xe719cb9a5a49f8ed},
+  {0x96f150816f90c216, 0x484947f2b48d7882},
+  {0xdefb92978dfa0053, 0x58823337d6c0a641},
+  {0x98bbc22dd2d3262b, 0xdad5891c70205c95},
+  {0xbf1d06e5edc7d9ba, 0xea3e0a86c4241c1e},
+  {0x78e2cf480abc18ef, 0x1110bc39a35669cf},
+  {0xc188299c1375e7b2, 0x8eb4cf8cb0851480},
+  {0xd0ec275048c667d2, 0xff5c57071581e3b1},
+  {0x955c8d54a50fdd52, 0xcf79008ac79991d3},
+  {0xf46cdcd85b7289c9, 0x1c5fc0acfab2cbb2},
+  {0x676f48ac3ed3c825, 0x862183d1a9042f4d},
+  {0xf35fc7982c7daee6, 0xa655183af862baae},
+  {0x5335bbcaf8b9f37b, 0x963ed04a2a0b3eaa},
+  {0x76d009714121cb10, 0x82f1d3e8253374ee},
+  {0x50198339f3198270, 0xee023bd013e359f5},
+  {0x315d27ea94c7941a, 0x5c1520117e098dbf},
+  {0x96ccc513ba987df7, 0x7d84bbe2e504ff94},
+  {0x03464584b630d2b7, 0x7d9fc4a633f228f4},
+  {0x7e39cbb756cac943, 0x45a5498048f1a474},
+  {0x56a90669f7aa29c6, 0x4883787b94c90425},
+  {0x9a262b27cb8de6e9, 0x6495beb53f905401},
+  {0xdc5866e0159b2920, 0x6c2c9c31b3faab04},
+  {0x82f93c693fec7b5f, 0x1926807fb1c2bdd5},
+  {0x3a06ca560fda4251, 0xff56ec036c5f13d6},
+  {0xcf96fe4ae095a1c3, 0xaea98fd960fd6b9d},
+  {0xc2ae3b23e1b73447, 0xe7c1f21b63d4e19c},
+  {0x660f92196e62044c, 0xa61e4689ac8893c0},
+  {0x4aacc983cc5d9cfe, 0xb71adc881811c258},
+  {0xb01938e5f92ea2e0, 0x3d4b38fea83810f8},
+  {0x8195527abb10f039, 0x242e99777aeec42a},
+  {0x077a36f6536baf7f, 0x928620c22f148a6d},
+  {0xb4d16665e8f965a6, 0x300ecf50c00b75a7},
+  {0x53d4fbf144350d5c, 0x50967628985e6eaf},
+  {0xea67291009feb48e, 0x74a182255aa9ccae},
+  {0xe67c52e63c97fb3a, 0xbe1b4991d245fa61},
+  {0x6bd8d3685ed38551, 0xc26bdd871e8691e5},
+  {0x267c4e3df39e0a7e, 0x2408058c7b3e3c09},
+  {0x2bc55550057b4b4e, 0xe70baa2724b374d3},
+  {0x0e2984947284c4dd, 0x4f4e64ba26bfee68},
+  {0x78891ea4bacdb828, 0x357f7d8801646f08},
+  {0x220a9cb569d1ee6b, 0x8e6c9653552802fa},
+  {0x6159359f74dda4d7, 0xcbd0c89374b1cc2e},
+  {0x8dd5a4c4fe55c89f, 0xeeca37f94d3f69bd},
+  {0x22abf1f68e0f314a, 0x69b86caf61d48d15},
+  {0xab26c59f1090455d, 0x1a49957d9798f177},
+};
+
+static const GF aim2_e2_power_matrix[AIM2_NUM_BITS_FIELD] =  // one GF row per field bit
+{  // NOTE(review): presumably the matrix behind GF_exp_invmer_e_2 - confirm in aim2.c
+  {0x0000000000000001, 0x0000000000000000},
+  {0xf50e0632f2a35f5b, 0x386db41096f62a8a},
+  {0x1843656b2ea8f397, 0xefdb454053648225},
+  {0xfc670d9cf3feeb63, 0x7582326d84c7a1de},
+  {0xf1c52011971b40b3, 0x864204566cee644d},
+  {0x5d8e354c13ae648b, 0x192b28f22b444709},
+  {0x9d5cef9c88eb0d9f, 0xb686d60b99470446},
+  {0xc91fa3a9b726fd99, 0xcf7a6d254a105b09},
+  {0x048e86e374780c55, 0x9f65220d0c78fc67},
+  {0xafa9c90017000acc, 0x83a4540ded360993},
+  {0x3e563c2c6efb6102, 0xb7147f0d38fa394e},
+  {0x858e694ad98264cb, 0x184d72cdc205efdb},
+  {0x260f2eae08292a50, 0x101cdb156939622a},
+  {0x4a9a43781e99484d, 0x8b9b7c41b6c639f6},
+  {0x16c9831c810a7459, 0xcb60c983013050be},
+  {0x96d02af1b8d2cba8, 0xb37b4c2c6ea27c34},
+  {0x3caadfab02ea679d, 0x6c3124a15e087d32},
+  {0xf0892e59955b87ae, 0xaab1aa69ba6853d8},
+  {0x8420916c212205ac, 0x86ed9039af31291c},
+  {0x0610fd444421f178, 0xa6b004a839e31b64},
+  {0xaebf5d9bae4e4ac1, 0x54bf9e6ec57b2d65},
+  {0x28bce750ebcba70e, 0x4ce04f578ca77d4d},
+  {0xe35d48d89312441e, 0xe6d91969fd74895a},
+  {0xcca901ef7fabb1c5, 0x117d2c0c4032a05b},
+  {0x4d05be0c6a5a2edc, 0x8314aecc100fcba9},
+  {0x7c685f4133a51825, 0x9acd72f51105c28b},
+  {0x5011fb2faa2c215a, 0xf33e2515d2bd65e4},
+  {0xcec542879e66d1d0, 0xb35dca22a0c3ce97},
+  {0x40849b4ce23375b2, 0x92453c68d163c3cb},
+  {0x807af8ab827e3617, 0x9aa0b258c13e1db7},
+  {0x02cf8f1292f7c659, 0x188599535df660bb},
+  {0x675c7dfe865c4b21, 0x60e7e01162356b69},
+  {0xdca8758ed620dd7b, 0x40e2dfc1450698ca},
+  {0xd4785af596fd0c85, 0x194dcdf10572a8d6},
+  {0x39c75c8db5a743fd, 0xaaff1be5fb825c25},
+  {0x76f287eaaf80a26e, 0x6d5c3d924e633b50},
+  {0xf3289f813d56fa87, 0x8a5781160603ae34},
+  {0x023097d7bf57b560, 0x5c09da41ceda1dab},
+  {0xaa7caa9af1506059, 0xd65b5a005d02edd2},
+  {0x837d13e5bec17d5c, 0x96732ad7e569d594},
+  {0xfca9d80d257930ce, 0xbb07355f7df706c0},
+  {0x7e719f925352363e, 0x61f17c3d17da7386},
+  {0xbdd686a4862a5d5b, 0x5ddbe9580f36ecca},
+  {0xcd8440580a8cb347, 0x8b395b802547e6d8},
+  {0x4338e255f15fc0d9, 0xf2400716b60d1c2c},
+  {0xc0a4a5181cf7a401, 0x208e7b27a3d4e578},
+  {0x6557dd7a9909844a, 0xd7dd867435b17ded},
+  {0xe7214501f52038cd, 0xf73bfe485cf7fdd0},
+  {0x93443a46972cbc70, 0xd2ca8f42b2d199e0},
+  {0xbea25cda0a9de799, 0x51886f07950aef32},
+  {0x82824ccfb37df72d, 0x71a58d7df86233f6},
+  {0x0ab442c2423ac6e3, 0x5d989eeb2df819bb},
+  {0x717b766d60dda065, 0x3899b1af41b28b8b},
+  {0x2fffad98c8e94310, 0x9ff893980c381280},
+  {0x9d7da6a6ca8c0d82, 0x09c78e0f83da5e2a},
+  {0x26b7e85d55753566, 0x48b0fee439062128},
+  {0x63896bb7a7a3c638, 0x551438e5f3ff5db9},
+  {0x080d9af5ef2e5865, 0x048eccc1b914ae50},
+  {0xf081a5f8ab004099, 0x24ffc9670c5492ac},
+  {0x7e4178c2bf375b5a, 0xa641e4982d1c8638},
+  {0x9f1874733c37691a, 0xa6e59883261af497},
+  {0x90068f05a814992d, 0x8f340c2ecb9a2bd2},
+  {0x2e0a82ad5f144c70, 0x783eb790b951d2d9},
+  {0xbf58c52a82e24af6, 0x190f49c97cd133af},
+  {0x29e30e4d37b882a1, 0x217bea750913f0db},
+  {0xfe2287c403984038, 0x870bd9dd397e696e},
+  {0x49e9bc6efdb97d7d, 0xf75f4c5e88587e96},
+  {0xa6223b70299d2836, 0xf27661ea227ab61b},
+  {0x4d6b8601ceb750cf, 0xfab6503eb520e48d},
+  {0xcaf2dd4a73f67c6c, 0x93f3baaf44fed4e0},
+  {0x1ff32e99fc57e662, 0x502b8bb6f2031150},
+  {0x1d8b5656e3d694dc, 0xb31de0d77f80372b},
+  {0x0f3d13aca2eac302, 0xb6d1f98a81d2cd6d},
+  {0x840d8615c90887b8, 0x1d44fc5efe63c574},
+  {0xded005c9eb05ef63, 0xdeb4246e55c121bb},
+  {0x3409b8d1c43c2415, 0x700c0d1dc307fe8d},
+  {0x8b361337911e3002, 0x7920c1039098414c},
+  {0xa5dddabdd1beecf5, 0x146aaf12b0d6da5e},
+  {0xe1a91d6f2a874e47, 0x0d63fcc83ef069d1},
+  {0x0ffd9177c1f3ebb2, 0x9a0cadce706c0cc0},
+  {0xc60d34aa0f45f13d, 0x2d0b4ea8c2bfdc70},
+  {0x83e36503d6399610, 0x6014c0c7cba2d2f2},
+  {0x9cc705d2ecaeca0e, 0x79f83e8c83e7f333},
+  {0x58c7035772444ccc, 0x789c6687005b995e},
+  {0x6b3d950394c886a1, 0x9b4f4564cd5b92d0},
+  {0x872c7f29b6dae6ca, 0xd2a320a97a0d0be9},
+  {0x14bb3a90e34016b2, 0xb308fa5fc47ad142},
+  {0xc6b31a14ce574546, 0xd7f758f96323f56e},
+  {0x046d3862feb271a0, 0x391175405eef9c5e},
+  {0xf7654c3e98aff433, 0x92b8d607c0180e5e},
+  {0xdfe26a4ee0edcbcf, 0x4c21afb68c481788},
+  {0xb9175aa38699a7fa, 0xa26d3569fb705b0a},
+  {0xd2955bcc820c812c, 0x29d30f039b37f636},
+  {0x37d8c59743ebdc8d, 0x19289d7baab847bc},
+  {0x8b8a25c0075e7200, 0x75fcbc7110b551c9},
+  {0x8ab2318dd48eb686, 0xca8ee9edf4a5a1e0},
+  {0x182033f6233cad5f, 0x743083edee67622b},
+  {0xc82b0364e7db3d93, 0x3cec89a9bc59587d},
+  {0x4fb362a6d33cdc65, 0xb2f2a5ce567b5b8e},
+  {0x90df4043911d6152, 0xfe9e1ef68cc145b2},
+  {0x4fcf7b4fcca5200b, 0xaba094d2f96d9249},
+  {0x5ac887c31fc3fd76, 0x1845172174cf2944},
+  {0x25180f84f6702866, 0xde5223f17c83df5d},
+  {0x2863a5b3ae30cdea, 0x610fc2ae8f7cfc74},
+  {0x64a4086ca77af644, 0xafe073214eb0e372},
+  {0xbdc97dadac10ab50, 0x97cf31c3dfa3a7ad},
+  {0x79f2ee819538d167, 0x68555fb401eb2780},
+  {0x72e2b904d5c7a7ed, 0x482326aa3e165b1c},
+  {0x92f65484dcff7fd8, 0x603faf9bafb86f1e},
+  {0x210e7817fff07876, 0xabdf6d8a0dd6d8a5},
+  {0xab561f7f19942dec, 0x55f71e3e54c7b523},
+  {0x8e7140a742fb2245, 0x34a49c54b5ad70ec},
+  {0x6da544268e007b3f, 0xebf2cf33aeaee1c9},
+  {0x010679622fe3753e, 0x40228d2a0d402ed0},
+  {0x2e128b07e6e4e311, 0x0811ebd4d8dde5b5},
+  {0x126cb02cee9ad020, 0x398e5321decfb79c},
+  {0x6dfdff51553fb5ff, 0x415b4003d55c33ab},
+  {0xd3b7fedc1cd8ab6e, 0x49dc7b6033f0ae60},
+  {0x7062ab84db2bbaed, 0xc33060adb11136c6},
+  {0xae149ced6b9cc3d3, 0xef2f29a2ebe433ce},
+  {0x133ca1e237105dc6, 0x9712a59673f1d79c},
+  {0xfcf98569ab4ec844, 0x6a40dd9e8d49194e},
+  {0xd73a65ce7e33212a, 0xaa29936469e73794},
+  {0x961009e50707fe21, 0x657c63ec063d9f23},
+  {0x6b1af6be25650671, 0xce96b0cb11ce0372},
+  {0xc7312488beda3b54, 0x9ee42f2347f50335},
+  {0x829d638189fca23f, 0xe3123a63017f9509},
+  {0xbb40cef8e0e85cea, 0xd8b3a76799622f49},
+};
+
+#define GF_exp_invmer_e_1 AIMER_NAMESPACE(GF_exp_invmer_e_1)
+void GF_exp_invmer_e_1(GF out, const GF in);  // S-box of branch 0 in aim2()
+#define GF_exp_invmer_e_2 AIMER_NAMESPACE(GF_exp_invmer_e_2)
+void GF_exp_invmer_e_2(GF out, const GF in);  // S-box of branch 1 in aim2()
+#define GF_exp_mer_e_star AIMER_NAMESPACE(GF_exp_mer_e_star)
+void GF_exp_mer_e_star(GF out, const GF in);  // final S-box in aim2()
+// Fills matrix_L, matrix_U and vector_b from iv; aim2() uses them as its affine layer.
+#define generate_matrices_L_and_U AIMER_NAMESPACE(generate_matrices_L_and_U)
+void generate_matrices_L_and_U(
+        GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF vector_b,
+        const uint8_t iv[AIM2_IV_SIZE]);
+// NOTE(review): presumably yields one combined matrix per branch (L times U) plus vector_b - confirm in aim2.c
+#define generate_matrix_LU AIMER_NAMESPACE(generate_matrix_LU)
+void generate_matrix_LU(GF matrix_A[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+                        GF vector_b,
+                        const uint8_t iv[AIM2_IV_SIZE]);
+// Writes S-box i applied to (pt + aim2_constants[i]) into sbox_outputs[i].
+#define aim2_sbox_outputs AIMER_NAMESPACE(aim2_sbox_outputs)
+void aim2_sbox_outputs(GF sbox_outputs[AIM2_NUM_INPUT_SBOX], const GF pt);
+// Evaluates AIM2: ct is computed from the byte strings pt and iv.
+#define aim2 AIMER_NAMESPACE(aim2)
+void aim2(uint8_t ct[AIM2_NUM_BYTES_FIELD],
+          const uint8_t pt[AIM2_NUM_BYTES_FIELD],
+          const uint8_t iv[AIM2_IV_SIZE]);
+
+#endif // AIM2_H
diff --git a/crypto_sign/aimer128s/m4speed/api.h b/crypto_sign/aimer128s/m4speed/api.h
new file mode 100644
index 00000000..170f69cc
--- /dev/null
+++ b/crypto_sign/aimer128s/m4speed/api.h
@@ -0,0 +1,44 @@
+/*
+NIST-developed software is provided by NIST as a public service. You may use, copy, and distribute copies of the software in any medium, provided that you keep intact this entire notice. You may improve, modify, and create derivative works of the software or any portion of the software, and you may copy and distribute such modifications or works. Modified works should carry a notice stating that you changed the software and should note the date and nature of any such change. Please explicitly acknowledge the National Institute of Standards and Technology as the source of the software.
+ 
+NIST-developed software is expressly provided "AS IS." NIST MAKES NO WARRANTY OF ANY KIND, EXPRESS, IMPLIED, IN FACT, OR ARISING BY OPERATION OF LAW, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND DATA ACCURACY. NIST NEITHER REPRESENTS NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST DOES NOT WARRANT OR MAKE ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE OR THE RESULTS THEREOF, INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY, RELIABILITY, OR USEFULNESS OF THE SOFTWARE.
+ 
+You are solely responsible for determining the appropriateness of using and distributing the software and you assume all risks associated with its use, including but not limited to the risks and costs of program errors, compliance with applicable laws, damage to or loss of data, programs or equipment, and the unavailability or interruption of operation. This software is not intended to be used in any situation where a failure could cause risk of injury or damage to property. The software developed by NIST employees is not subject to copyright protection within the United States.
+*/
+
+#ifndef API_H
+#define API_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define CRYPTO_PUBLICKEYBYTES 32  // public key size in bytes
+#define CRYPTO_SECRETKEYBYTES 48  // secret key size in bytes
+#define CRYPTO_BYTES 4160  // signature size in bytes
+#define CRYPTO_ALGNAME "aimer128s"
+// NOTE(review): return codes assumed to follow the NIST signature API (0 on success) - confirm in sign.c
+#define crypto_sign_keypair AIMER_NAMESPACE(crypto_sign_keypair)
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+// Detached signature over m (mlen bytes) under sk; sig and *siglen are outputs.
+#define crypto_sign_signature AIMER_NAMESPACE(crypto_sign_signature)
+int crypto_sign_signature(uint8_t *sig, size_t *siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk);
+// Signed-message form: sm and *smlen are outputs.
+#define crypto_sign AIMER_NAMESPACE(crypto_sign)
+int crypto_sign(uint8_t *sm, size_t *smlen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk);
+// Checks a detached signature sig (siglen bytes) over m under pk.
+#define crypto_sign_verify AIMER_NAMESPACE(crypto_sign_verify)
+int crypto_sign_verify(const uint8_t *sig, size_t siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *pk);
+// Opens a signed message sm (smlen bytes) under pk; m and *mlen are outputs.
+#define crypto_sign_open AIMER_NAMESPACE(crypto_sign_open)
+int crypto_sign_open(uint8_t *m, size_t *mlen,
+        const uint8_t *sm, size_t smlen,
+        const uint8_t *pk);
+
+#endif
diff --git a/crypto_sign/aimer128s/m4speed/field.c b/crypto_sign/aimer128s/m4speed/field.c
new file mode 100644
index 00000000..1e12b447
--- /dev/null
+++ b/crypto_sign/aimer128s/m4speed/field.c
@@ -0,0 +1,406 @@
+// SPDX-License-Identifier: MIT
+
+#include "field.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define mask_64     0x00000000ffffffff  // low 32 bits of a 64-bit word
+#define mask_final  0x000000000000ffff  // low 16 bits, last inv_zero_padding step
+// maskN keeps the bits that survive one spreading step; maskN_64 is the 64-bit form
+#define mask0_64    0x000000ff000000ff  // low 8 bits of each 32-bit lane
+#define mask0       0x000000ff
+
+#define mask1_64    0x000f000f000f000f  // low 4 bits of each 16-bit lane
+#define mask1       0x000f000f
+
+#define mask2_64    0x0303030303030303  // low 2 bits of each byte
+#define mask2       0x03030303
+
+#define mask3_64    0x1111111111111111  // low bit of each nibble
+#define mask3       0x11111111
+
+#define zero_padding(x0, mask1, mask2, mask3) \
+        x0 = (x0 | (x0 << 12)) & mask1; \
+        x0 = (x0 | (x0 << 6 )) & mask2; \
+        x0 = (x0 | (x0 << 3 )) & mask3;
+
+#define inv_zero_padding(x0, mask0, mask1, mask2, mask_final) \
+        x0 = (x0 | (x0 >> 3 )) & mask2; \
+        x0 = (x0 | (x0 >> 6 )) & mask1; \
+        x0 = (x0 | (x0 >> 12)) & mask0; \
+        x0 = (x0 | (x0 >> 24)) & mask_final;
+
+void GF_mul(GF c, const GF a, const GF b)
+{
+  // c = a * b in GF(2^128): three 64x64 products combined Karatsuba-style,
+  // then reduced modulo x^128 + x^7 + x^2 + x + 1.
+  uint64_t prod[4] = {0,};  // 256-bit product, prod[0] least significant
+  uint64_t mid[2] = {0,};   // (a[0] ^ a[1]) * (b[0] ^ b[1])
+  uint64_t sum[2] = {0,};   // half sums passed to the middle product
+
+  sum[0] = a[0] ^ a[1];
+  sum[1] = b[0] ^ b[1];
+
+  poly64_mul(&a[1], &b[1], &prod[3], &prod[2]);
+  poly64_mul(&a[0], &b[0], &prod[1], &prod[0]);
+  poly64_mul(&sum[0], &sum[1], &mid[1], &mid[0]);
+
+  // Karatsuba recombination of the two middle words
+  prod[1] ^= mid[0] ^ prod[0] ^ prod[2];
+  prod[2] = mid[0] ^ mid[1] ^ prod[0] ^ prod[1] ^ prod[3];
+
+  // reducing word 3 spills a few bits into word 2; add them before
+  // word 2 itself is folded into words 0 and 1
+  const uint64_t hi = prod[3];
+  const uint64_t lo = prod[2] ^ (hi >> 57) ^ (hi >> 62) ^ (hi >> 63);
+
+  c[1] = prod[1] ^ hi ^ ((hi << 7) | (lo >> 57)) ^ ((hi << 2) | (lo >> 62))
+       ^ ((hi << 1) | (lo >> 63));
+  c[0] = prod[0] ^ lo ^ (lo << 7) ^ (lo << 2) ^ (lo << 1);
+}
+
+void GF_mul_add(GF c, const GF a, const GF b)
+{
+  // c ^= a * b in GF(2^128): three 64x64 products combined Karatsuba-style,
+  // reduced modulo x^128 + x^7 + x^2 + x + 1 and XORed into c.
+  uint64_t prod[4] = {0,};  // 256-bit product, prod[0] least significant
+  uint64_t mid[2] = {0,};   // (a[0] ^ a[1]) * (b[0] ^ b[1])
+  uint64_t sum[2] = {0,};   // half sums passed to the middle product
+
+  sum[0] = a[0] ^ a[1];
+  sum[1] = b[0] ^ b[1];
+
+  poly64_mul(&a[1], &b[1], &prod[3], &prod[2]);
+  poly64_mul(&a[0], &b[0], &prod[1], &prod[0]);
+  poly64_mul(&sum[0], &sum[1], &mid[1], &mid[0]);
+
+  // Karatsuba recombination of the two middle words
+  prod[1] ^= mid[0] ^ prod[0] ^ prod[2];
+  prod[2] = mid[0] ^ mid[1] ^ prod[0] ^ prod[1] ^ prod[3];
+
+  // reducing word 3 spills a few bits into word 2; add them before
+  // word 2 itself is folded into words 0 and 1
+  const uint64_t hi = prod[3];
+  const uint64_t lo = prod[2] ^ (hi >> 57) ^ (hi >> 62) ^ (hi >> 63);
+
+  c[1] ^= prod[1] ^ hi ^ ((hi << 7) | (lo >> 57)) ^ ((hi << 2) | (lo >> 62))
+        ^ ((hi << 1) | (lo >> 63));
+  c[0] ^= prod[0] ^ lo ^ (lo << 7) ^ (lo << 2) ^ (lo << 1);
+}
+
+void GF_transposed_matmul(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD])
+{
+  // c = XOR of every row of b whose selector bit in a is set.  Bits are
+  // consumed from a[0] upward, least significant first, one row per bit.
+  // Rows are picked with all-ones/all-zero masks, not branches.
+  const uint64_t *word = a;
+  const GF *row = b;
+  uint64_t acc_lo = 0;
+  uint64_t acc_hi = 0;
+
+  for (size_t nw = AIM2_NUM_WORDS_FIELD; nw != 0; --nw)
+  {
+    uint64_t bits = *word++;
+    for (size_t nb = AIM2_NUM_BITS_WORD; nb != 0; nb -= 8)
+    {
+      uint64_t sel = -(bits & 1);
+      acc_lo ^= row[0][0] & sel;
+      acc_hi ^= row[0][1] & sel;
+      sel = -((bits >> 1) & 1);
+      acc_lo ^= row[1][0] & sel;
+      acc_hi ^= row[1][1] & sel;
+      sel = -((bits >> 2) & 1);
+      acc_lo ^= row[2][0] & sel;
+      acc_hi ^= row[2][1] & sel;
+      sel = -((bits >> 3) & 1);
+      acc_lo ^= row[3][0] & sel;
+      acc_hi ^= row[3][1] & sel;
+      sel = -((bits >> 4) & 1);
+      acc_lo ^= row[4][0] & sel;
+      acc_hi ^= row[4][1] & sel;
+      sel = -((bits >> 5) & 1);
+      acc_lo ^= row[5][0] & sel;
+      acc_hi ^= row[5][1] & sel;
+      sel = -((bits >> 6) & 1);
+      acc_lo ^= row[6][0] & sel;
+      acc_hi ^= row[6][1] & sel;
+      sel = -((bits >> 7) & 1);
+      acc_lo ^= row[7][0] & sel;
+      acc_hi ^= row[7][1] & sel;
+
+      // next byte of the current word, next eight rows of b
+      bits >>= 8;
+      row += 8;
+    }
+  }
+
+  c[0] = acc_lo;  // written last, so c may alias a
+  c[1] = acc_hi;
+}
+
+void GF_transposed_matmul_add(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD])
+{
+  // c ^= XOR of every row of b whose selector bit in a is set.  Bits are
+  // consumed from a[0] upward, least significant first, one row per bit.
+  // Rows are picked with all-ones/all-zero masks, not branches.
+  const uint64_t *word = a;
+  const GF *row = b;
+  uint64_t acc_lo = 0;
+  uint64_t acc_hi = 0;
+
+  for (size_t nw = AIM2_NUM_WORDS_FIELD; nw != 0; --nw)
+  {
+    uint64_t bits = *word++;
+    for (size_t nb = AIM2_NUM_BITS_WORD; nb != 0; nb -= 8)
+    {
+      uint64_t sel = -(bits & 1);
+      acc_lo ^= row[0][0] & sel;
+      acc_hi ^= row[0][1] & sel;
+      sel = -((bits >> 1) & 1);
+      acc_lo ^= row[1][0] & sel;
+      acc_hi ^= row[1][1] & sel;
+      sel = -((bits >> 2) & 1);
+      acc_lo ^= row[2][0] & sel;
+      acc_hi ^= row[2][1] & sel;
+      sel = -((bits >> 3) & 1);
+      acc_lo ^= row[3][0] & sel;
+      acc_hi ^= row[3][1] & sel;
+      sel = -((bits >> 4) & 1);
+      acc_lo ^= row[4][0] & sel;
+      acc_hi ^= row[4][1] & sel;
+      sel = -((bits >> 5) & 1);
+      acc_lo ^= row[5][0] & sel;
+      acc_hi ^= row[5][1] & sel;
+      sel = -((bits >> 6) & 1);
+      acc_lo ^= row[6][0] & sel;
+      acc_hi ^= row[6][1] & sel;
+      sel = -((bits >> 7) & 1);
+      acc_lo ^= row[7][0] & sel;
+      acc_hi ^= row[7][1] & sel;
+
+      // next byte of the current word, next eight rows of b
+      bits >>= 8;
+      row += 8;
+    }
+  }
+
+  c[0] ^= acc_lo;  // accumulate into the caller's value
+  c[1] ^= acc_hi;
+}
+
+static void poly64_mul_s(uint64_t *z1, uint64_t *z0, uint64_t x, uint64_t y)
+{
+  // Carry-less 64x64 -> 128-bit multiply of x and y: *z1 = high word, *z0 = low.
+  uint32_t x4 = x >> 32;  // high half of x, split into bytes further down
+  // x0..x3: bytes of the low half of x
+  uint32_t x0 = x & mask_64;
+  uint32_t x1 = (x0 >> 8) & mask0;
+  uint32_t x2 = (x0 >> 16) & mask0;
+  uint32_t x3 = (x0 >> 24) & mask0;
+  x0 &= mask0;
+
+  // x4..x7: bytes of the high half of x
+  uint32_t x5 = (x4 >> 8) & mask0;
+  uint32_t x6 = (x4 >> 16) & mask0;
+  uint32_t x7 = (x4 >> 24) & mask0;
+  x4 &= mask0;
+
+  // y is split the same way; y4 starts as the high half
+  uint32_t y4 = y >> 32;
+
+  uint32_t y0 = y & mask_64;
+  uint32_t y1 = (y0 >> 8) & mask0;
+  uint32_t y2 = (y0 >> 16) & mask0;
+  uint32_t y3 = (y0 >> 24) & mask0;
+  y0 &= mask0;
+
+  // y4..y7: bytes of the high half of y
+  uint32_t y5 = (y4 >> 8) & mask0;
+  uint32_t y6 = (y4 >> 16) & mask0;
+  uint32_t y7 = (y4 >> 24) & mask0;
+  y4 &= mask0;
+
+  // x padding: spread each byte so that bit i sits at bit 4*i
+  zero_padding(x0, mask1, mask2, mask3);
+  zero_padding(x1, mask1, mask2, mask3);
+  zero_padding(x2, mask1, mask2, mask3);
+  zero_padding(x3, mask1, mask2, mask3);
+  zero_padding(x4, mask1, mask2, mask3);
+  zero_padding(x5, mask1, mask2, mask3);
+  zero_padding(x6, mask1, mask2, mask3);
+  zero_padding(x7, mask1, mask2, mask3);
+
+  // y padding
+  zero_padding(y0, mask1, mask2, mask3);
+  zero_padding(y1, mask1, mask2, mask3);
+  zero_padding(y2, mask1, mask2, mask3);
+  zero_padding(y3, mask1, mask2, mask3);
+  zero_padding(y4, mask1, mask2, mask3);
+  zero_padding(y5, mask1, mask2, mask3);
+  zero_padding(y6, mask1, mask2, mask3);
+  zero_padding(y7, mask1, mask2, mask3);
+  // An integer product of two spread bytes sums at most 8 ones per 4-bit slot; & mask3_64 keeps each slot's parity.
+  //x0-3 * y0-3: byte-wise schoolbook product of the low halves, kept in spread form
+  uint64_t a0 = ((uint64_t)x0 * (uint64_t)y0) & mask3_64;
+  uint64_t a1 = ((((uint64_t)x0 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y0) & mask3_64));
+  a0 ^= (a1 << 32);
+  a1 = a1 >> 32;
+  a1 ^= ((((uint64_t)x0 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x1 * (uint64_t)y1) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y0) & mask3_64));
+  // odd byte offsets straddle two chunks: << 32 feeds the lower one, >> 32 the upper
+  uint64_t a2 = ((((uint64_t)x0 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y2) & mask3_64) ^
+                 (((uint64_t)x2 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y0) & mask3_64));
+  a1 ^= (a2 << 32);
+  a2 = a2 >> 32;
+  a2 ^= ((((uint64_t)x1 * (uint64_t)y3) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x3 * (uint64_t)y1) & mask3_64));
+
+  uint64_t a3 = ((((uint64_t)x2 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y2) & mask3_64));
+  a2 ^= (a3 << 32);
+  a3 = a3 >> 32;
+
+  a3 ^= ((uint64_t)x3 * (uint64_t)y3) & mask3_64;
+
+  //x4-7 * y4-7: same schoolbook product for the high halves
+  uint64_t b0 = ((uint64_t)x4 * (uint64_t)y4) & mask3_64;
+  uint64_t b1 = ((((uint64_t)x4 * (uint64_t)y5) & mask3_64) ^
+                 (((uint64_t)x5 * (uint64_t)y4) & mask3_64));
+  b0 ^= (b1 << 32);
+  b1 = b1 >> 32;
+  b1 ^= ((((uint64_t)x4 * (uint64_t)y6) & mask3_64) ^
+         (((uint64_t)x5 * (uint64_t)y5) & mask3_64) ^
+         (((uint64_t)x6 * (uint64_t)y4) & mask3_64));
+
+  uint64_t b2 = ((((uint64_t)x4 * (uint64_t)y7) & mask3_64) ^
+                 (((uint64_t)x5 * (uint64_t)y6) & mask3_64) ^
+                 (((uint64_t)x6 * (uint64_t)y5) & mask3_64) ^
+                 (((uint64_t)x7 * (uint64_t)y4) & mask3_64));
+  b1 ^= (b2 << 32);
+  b2 = b2 >> 32;
+  b2 ^= ((((uint64_t)x5 * (uint64_t)y7) & mask3_64) ^
+         (((uint64_t)x6 * (uint64_t)y6) & mask3_64) ^
+         (((uint64_t)x7 * (uint64_t)y5) & mask3_64));
+
+  uint64_t b3 = ((((uint64_t)x6 * (uint64_t)y7) & mask3_64) ^
+                 (((uint64_t)x7 * (uint64_t)y6) & mask3_64));
+  b2 ^= (b3 << 32);
+  b3 = b3 >> 32;
+
+  b3 ^= ((uint64_t)x7 * (uint64_t)y7) & mask3_64;
+
+  //middle part: (x_low ^ x_high) * (y_low ^ y_high) for the Karatsuba step
+  x0 ^= x4;
+  x1 ^= x5;
+  x2 ^= x6;
+  x3 ^= x7;
+
+  y0 ^= y4;
+  y1 ^= y5;
+  y2 ^= y6;
+  y3 ^= y7;
+
+  uint64_t c0 = ((uint64_t)x0 * (uint64_t)y0) & mask3_64;
+  uint64_t c1 = ((((uint64_t)x0 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y0) & mask3_64));
+  c0 ^= (c1 << 32);
+  c1 = c1 >> 32;
+  c1 ^= ((((uint64_t)x0 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x1 * (uint64_t)y1) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y0) & mask3_64));
+
+  uint64_t c2 = ((((uint64_t)x0 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y2) & mask3_64) ^
+                 (((uint64_t)x2 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y0) & mask3_64));
+  c1 ^= (c2 << 32);
+  c2 = c2 >> 32;
+  c2 ^= ((((uint64_t)x1 * (uint64_t)y3) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x3 * (uint64_t)y1) & mask3_64));
+
+  uint64_t c3 = ((((uint64_t)x2 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y2) & mask3_64));
+  c2 ^= (c3 << 32);
+  c3 = c3 >> 32;
+
+  c3 ^= ((uint64_t)x3 * (uint64_t)y3) & mask3_64;
+  // Karatsuba middle term: subtract (XOR) the low and high products
+  c0 = c0 ^ a0 ^ b0;
+  c1 = c1 ^ a1 ^ b1;
+  c2 = c2 ^ a2 ^ b2;
+  c3 = c3 ^ a3 ^ b3;
+  // add the middle term two 16-bit chunks (32 result bits) up
+  a2 ^= c0;
+  a3 ^= c1;
+  b0 ^= c2;
+  b1 ^= c3;
+
+  // result inv_padding: compact each spread chunk back to 16 contiguous bits
+  inv_zero_padding(a0, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a1, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a2, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a3, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b0, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b1, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b2, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b3, mask0_64, mask1_64, mask2_64, mask_final);
+  // a0..a3 form the low 64 result bits, b0..b3 the high 64
+  *z0 = a0 | (a1 << 16) | (a2 << 32) | (a3 << 48);
+  *z1 = b0 | (b1 << 16) | (b2 << 32) | (b3 << 48);
+}
+
+void GF_mul_s(GF c, const GF a, const GF b)
+{
+  // c = a * b in GF(2^128): same Karatsuba-and-reduce scheme as GF_mul,
+  // built on the C routine poly64_mul_s defined in this file.
+  uint64_t w0 = 0, w1 = 0, w2 = 0, w3 = 0;  // 256-bit product, w0 lowest
+  uint64_t m0 = 0, m1 = 0;                  // middle product
+
+  poly64_mul_s(&w3, &w2, a[1], b[1]);
+  poly64_mul_s(&w1, &w0, a[0], b[0]);
+  poly64_mul_s(&m1, &m0, a[0] ^ a[1], b[0] ^ b[1]);
+
+  // Karatsuba recombination of the two middle words
+  w1 ^= m0 ^ w0 ^ w2;
+  w2 = m0 ^ m1 ^ w0 ^ w1 ^ w3;
+
+  // reduce modulo x^128 + x^7 + x^2 + x + 1: add word 3's spill-over
+  // into word 2 first, then fold words 3 and 2 into words 1 and 0
+  w2 ^= (w3 >> 57) ^ (w3 >> 62) ^ (w3 >> 63);
+
+  c[1] = w1 ^ w3 ^ ((w3 << 7) | (w2 >> 57)) ^ ((w3 << 2) | (w2 >> 62))
+       ^ ((w3 << 1) | (w2 >> 63));
+
+  c[0] = w0 ^ w2 ^ (w2 << 7) ^ (w2 << 2) ^ (w2 << 1);
+}
+
+void GF_mul_add_s(GF c, const GF a, const GF b)
+{
+  // c ^= a * b in GF(2^128): same Karatsuba-and-reduce scheme as
+  // GF_mul_add, built on the C routine poly64_mul_s defined in this file.
+  uint64_t w0 = 0, w1 = 0, w2 = 0, w3 = 0;  // 256-bit product, w0 lowest
+  uint64_t m0 = 0, m1 = 0;                  // middle product
+
+  poly64_mul_s(&w3, &w2, a[1], b[1]);
+  poly64_mul_s(&w1, &w0, a[0], b[0]);
+  poly64_mul_s(&m1, &m0, a[0] ^ a[1], b[0] ^ b[1]);
+
+  // Karatsuba recombination of the two middle words
+  w1 ^= m0 ^ w0 ^ w2;
+  w2 = m0 ^ m1 ^ w0 ^ w1 ^ w3;
+
+  // reduce modulo x^128 + x^7 + x^2 + x + 1: add word 3's spill-over
+  // into word 2 first, then fold words 3 and 2 into words 1 and 0
+  w2 ^= (w3 >> 57) ^ (w3 >> 62) ^ (w3 >> 63);
+
+  c[1] ^= w1 ^ w3 ^ ((w3 << 7) | (w2 >> 57)) ^ ((w3 << 2) | (w2 >> 62))
+        ^ ((w3 << 1) | (w2 >> 63));
+
+  c[0] ^= w0 ^ w2 ^ (w2 << 7) ^ (w2 << 2) ^ (w2 << 1);
+}
diff --git a/crypto_sign/aimer128s/m4speed/field.h b/crypto_sign/aimer128s/m4speed/field.h
new file mode 100644
index 00000000..e8fd7996
--- /dev/null
+++ b/crypto_sign/aimer128s/m4speed/field.h
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef FIELD_H
+#define FIELD_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+typedef uint64_t GF[2];  // 128-bit field element; word [1] is the high half
+// Functions below without a definition in field.c are provided elsewhere (not shown here).
+#define poly64_mul AIMER_NAMESPACE(poly64_mul)
+void poly64_mul(const uint64_t *a, const uint64_t *b, uint64_t *c1, uint64_t *c0);
+#define GF_set0 AIMER_NAMESPACE(GF_set0)
+void GF_set0(GF a);
+#define GF_copy AIMER_NAMESPACE(GF_copy)
+void GF_copy(GF out, const GF in);
+#define GF_to_bytes AIMER_NAMESPACE(GF_to_bytes)
+void GF_to_bytes(uint8_t *out, const GF in);
+#define GF_from_bytes AIMER_NAMESPACE(GF_from_bytes)
+void GF_from_bytes(GF out, const uint8_t *in);
+// Arithmetic; the *_add forms XOR their result into c instead of overwriting it.
+#define GF_add AIMER_NAMESPACE(GF_add)
+void GF_add(GF c, const GF a, const GF b);
+#define GF_mul AIMER_NAMESPACE(GF_mul)
+void GF_mul(GF c, const GF a, const GF b);  // c = a * b, built on poly64_mul
+#define GF_mul_add AIMER_NAMESPACE(GF_mul_add)
+void GF_mul_add(GF c, const GF a, const GF b);  // c ^= a * b
+#define GF_transposed_matmul AIMER_NAMESPACE(GF_transposed_matmul)
+void GF_transposed_matmul(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]);
+#define GF_transposed_matmul_add AIMER_NAMESPACE(GF_transposed_matmul_add)
+void GF_transposed_matmul_add(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]);
+// *_s variants: GF_mul_s and GF_mul_add_s use the C routine poly64_mul_s in field.c.
+#define GF_mul_s AIMER_NAMESPACE(GF_mul_s)
+void GF_mul_s(GF c, const GF a, const GF b);  // c = a * b
+#define GF_mul_add_s AIMER_NAMESPACE(GF_mul_add_s)
+void GF_mul_add_s(GF c, const GF a, const GF b);  // c ^= a * b
+#define GF_sqr_s AIMER_NAMESPACE(GF_sqr_s)
+void GF_sqr_s(GF c, const GF a);  // NOTE(review): no definition in field.c - confirm where it lives
+
+#endif // FIELD_H
diff --git a/crypto_sign/aimer128s/m4speed/hash.c b/crypto_sign/aimer128s/m4speed/hash.c
new file mode 100644
index 00000000..443de84b
--- /dev/null
+++ b/crypto_sign/aimer128s/m4speed/hash.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: MIT
+
+#include "hash.h"
+#include "keccakf1600.h"
+#include <stddef.h>
+#include <stdint.h>
+
+static void shake128_inc_skip_squeeze(shake128incctx *state, size_t outlen)
+{
+  // Advances the squeeze position by outlen bytes without producing output.
+  // ctx[25] is used as the count of bytes still unread in the current block.
+  size_t skip = outlen;
+
+  if (state->ctx[25] < skip)
+  {
+    skip = (size_t)state->ctx[25];
+  }
+  outlen -= skip;
+  state->ctx[25] -= skip;
+
+  // Each further permutation makes SHAKE128_RATE fresh bytes available.
+  while (outlen > 0)
+  {
+    KeccakF1600_StatePermute(state->ctx);
+
+    skip = (outlen < SHAKE128_RATE) ? outlen : SHAKE128_RATE;
+    outlen -= skip;
+    state->ctx[25] = SHAKE128_RATE - skip;
+  }
+}
+
+void hash_init(hash_instance *ctx)  // initializes ctx for incremental SHAKE128
+{
+  shake128_inc_init(ctx);
+}
+
+void hash_init_prefix(hash_instance *ctx, uint8_t prefix)
+{  // initializes ctx, then absorbs the single byte prefix as the first input
+  shake128_inc_init(ctx);
+  shake128_inc_absorb(ctx, &prefix, sizeof(prefix));
+}
+
+void hash_update(hash_instance *ctx, const uint8_t *data, size_t data_len)
+{  // absorbs data_len bytes from data into ctx
+  shake128_inc_absorb(ctx, data, data_len);
+}
+
+void hash_final(hash_instance *ctx)  // ends absorbing; call before squeezing
+{
+  shake128_inc_finalize(ctx);
+}
+
+void hash_squeeze(hash_instance *ctx, uint8_t *buffer, size_t buffer_len)
+{  // writes the next buffer_len output bytes to buffer
+  shake128_inc_squeeze(buffer, buffer_len, ctx);
+}
+
+void hash_skip_squeeze(hash_instance *ctx, size_t buffer_len)
+{  // advances the output position by buffer_len bytes without writing them
+  shake128_inc_skip_squeeze(ctx, buffer_len);
+}
+
+void hash_ctx_clone(hash_instance *ctx_dest, const hash_instance *ctx_src)
+{  // copies the state of ctx_src into ctx_dest
+  shake128_inc_ctx_clone(ctx_dest, ctx_src);
+}
+
+void hash_ctx_release(hash_instance *ctx)  // hands ctx to shake128_inc_ctx_release
+{
+  shake128_inc_ctx_release(ctx);
+}
diff --git a/crypto_sign/aimer128s/m4speed/hash.h b/crypto_sign/aimer128s/m4speed/hash.h
new file mode 100644
index 00000000..dbac6717
--- /dev/null
+++ b/crypto_sign/aimer128s/m4speed/hash.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef HASH_H
+#define HASH_H
+
+#include "fips202.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+static const uint8_t HASH_PREFIX_0 = 0; // prefix byte for the message pre-hash (mu) in sign.c
+static const uint8_t HASH_PREFIX_1 = 1; // prefix byte for the h_1 computation
+static const uint8_t HASH_PREFIX_2 = 2; // prefix byte for the h_2 computation
+static const uint8_t HASH_PREFIX_3 = 3; // prefix byte for salt and root seed derivation
+static const uint8_t HASH_PREFIX_4 = 4; // prefix byte for seed tree expansion (tree.c)
+static const uint8_t HASH_PREFIX_5 = 5; // prefix byte for per-party commitment / tape expansion
+
+typedef shake128incctx hash_instance; // every hash_* function wraps the incremental SHAKE128 API
+
+#define hash_init AIMER_NAMESPACE(hash_init)
+void hash_init(hash_instance *ctx); // begin a hash with no prefix byte
+#define hash_init_prefix AIMER_NAMESPACE(hash_init_prefix)
+void hash_init_prefix(hash_instance *ctx, uint8_t prefix); // begin a hash and absorb `prefix` first
+#define hash_update AIMER_NAMESPACE(hash_update)
+void hash_update(hash_instance *ctx, const uint8_t *data, size_t data_len); // absorb data_len bytes
+#define hash_final AIMER_NAMESPACE(hash_final)
+void hash_final(hash_instance *ctx); // finalize absorption (called before squeezing in this patch)
+#define hash_squeeze AIMER_NAMESPACE(hash_squeeze)
+void hash_squeeze(hash_instance *ctx, uint8_t *buffer, size_t buffer_len); // emit next buffer_len bytes
+#define hash_skip_squeeze AIMER_NAMESPACE(hash_skip_squeeze)
+void hash_skip_squeeze(hash_instance *ctx, size_t buffer_len); // drop next buffer_len output bytes
+#define hash_ctx_clone AIMER_NAMESPACE(hash_ctx_clone)
+void hash_ctx_clone(hash_instance *ctx_dest, const hash_instance *ctx_src); // copy a hash state
+#define hash_ctx_release AIMER_NAMESPACE(hash_ctx_release)
+void hash_ctx_release(hash_instance *ctx); // release a state obtained from init or clone
+
+#endif // HASH_H
diff --git a/crypto_sign/aimer128s/m4speed/params.h b/crypto_sign/aimer128s/m4speed/params.h
new file mode 100644
index 00000000..91a31ba7
--- /dev/null
+++ b/crypto_sign/aimer128s/m4speed/params.h
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#define AIMER_NAMESPACE(s)          samsungsds_aimer128s_m4speed_##s // symbol prefix for this variant
+
+#define SECURITY_BITS               128                  // security parameter
+#define SECURITY_BYTES              (SECURITY_BITS / 8)  // byte size of security parameter
+
+#define AIM2_NUM_BITS_FIELD         SECURITY_BITS        // number of bits in field element
+#define AIM2_NUM_BYTES_FIELD        SECURITY_BYTES       // number of bytes in field element
+#define AIM2_NUM_WORDS_FIELD        (SECURITY_BITS / 64) // number of 64-bit words in element
+#define AIM2_NUM_BITS_WORD          64                   // number of bits in word
+#define AIM2_IV_SIZE                SECURITY_BYTES       // byte size of AIM2 initial vector
+
+#define AIM2_NUM_INPUT_SBOX         2                    // number of AIM2 input S-boxes
+
+#define AIMER_SALT_SIZE             SECURITY_BYTES       // byte size of salt
+#define AIMER_SEED_SIZE             SECURITY_BYTES       // byte size of seed
+#define AIMER_COMMIT_SIZE           (SECURITY_BYTES * 2) // byte size of commitment
+
+#define AIMER_L                     AIM2_NUM_INPUT_SBOX  // number of t shares / delta_ts entries per repetition
+#define AIMER_T                     17                   // number of parallel repetitions (Tau)
+#define AIMER_N                     256                  // number of MPC parties (N)
+#define AIMER_LOGN                  8                    // log_2(N)
+
+#define PRE_TREE_IDX                256 // pre_expand_trees covers nodes below this index; equals AIMER_N here
+
+#endif // PARAMS_H
diff --git a/crypto_sign/aimer128s/m4speed/sign.c b/crypto_sign/aimer128s/m4speed/sign.c
new file mode 100644
index 00000000..a4992979
--- /dev/null
+++ b/crypto_sign/aimer128s/m4speed/sign.c
@@ -0,0 +1,662 @@
+// SPDX-License-Identifier: MIT
+
+#include "api.h"
+#include "aim2.h"
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include "randombytes.h"
+#include "sign.h"
+#include "tree.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+void commit_and_expand_tape(tape_t *tape, uint8_t *commit,
+                            const hash_instance *ctx_precom,
+                            const uint8_t seed[AIMER_SEED_SIZE],
+                            size_t rep, size_t party)
+{
+  // Absorbs (rep byte || party byte || seed) on top of a copy of ctx_precom,
+  // then squeezes the commitment followed by the raw bytes of *tape.
+  uint8_t input[2 + AIMER_SEED_SIZE];
+  input[0] = (uint8_t)rep;
+  input[1] = (uint8_t)party;
+  memcpy(&input[2], seed, AIMER_SEED_SIZE);
+  hash_instance xof;
+  hash_ctx_clone(&xof, ctx_precom);
+  hash_update(&xof, input, sizeof(input));
+  hash_final(&xof);
+  hash_squeeze(&xof, commit, AIMER_COMMIT_SIZE);
+  hash_squeeze(&xof, (uint8_t *)tape, sizeof(*tape));
+  hash_ctx_release(&xof);
+}
+
+void commit_and_expand_tape_phase_3(tape_t *tape,
+                                    const hash_instance *ctx_precom,
+                                    const uint8_t seed[AIMER_SEED_SIZE],
+                                    size_t rep, size_t party)
+{
+  // Same stream as commit_and_expand_tape, but the commitment bytes are
+  // skipped instead of stored; only *tape is filled.
+  uint8_t input[2 + AIMER_SEED_SIZE];
+  input[0] = (uint8_t)rep;
+  input[1] = (uint8_t)party;
+  memcpy(&input[2], seed, AIMER_SEED_SIZE);
+  hash_instance xof;
+  hash_ctx_clone(&xof, ctx_precom);
+  hash_update(&xof, input, sizeof(input));
+  hash_final(&xof);
+  hash_skip_squeeze(&xof, AIMER_COMMIT_SIZE);
+  hash_squeeze(&xof, (uint8_t *)tape, sizeof(*tape));
+  hash_ctx_release(&xof);
+}
+
+void aim2_mpc(mult_chk_t *mult_chk,
+              const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+              const GF ct_GF)
+{
+  GF *const x = mult_chk->x_shares;
+  GF *const z = mult_chk->z_shares;
+
+  // Input S-boxes: pt + c = t ^ {2 ^ e - 1}
+  //   --> t ^ {2 ^ e} + t * c = t * pt, i.e. z = x * pt with x = t.
+  GF_mul(z[0], x[0], aim2_constants[0]);
+  GF_transposed_matmul_add(z[0], x[0], aim2_e1_power_matrix);
+  GF_transposed_matmul_add(x[AIMER_L], x[0], matrix_A[0]);
+
+  GF_mul(z[1], x[1], aim2_constants[1]);
+  GF_transposed_matmul_add(z[1], x[1], aim2_e2_power_matrix);
+  GF_transposed_matmul_add(x[AIMER_L], x[1], matrix_A[1]);
+
+  // Output S-box: x ^ {2 ^ e - 1} = pt + ct
+  //   --> x ^ {2 ^ e} + x * ct = x * pt, i.e. z = x * pt.
+  // GF_sqr_s is applied three times in total before x * ct is added.
+  GF_sqr_s(z[AIMER_L], x[AIMER_L]);
+  for (int round = 1; round < 3; round++)
+  {
+    GF_sqr_s(z[AIMER_L], z[AIMER_L]);
+  }
+  GF_mul_add(z[AIMER_L], x[AIMER_L], ct_GF);
+}
+
+// Phase 1: commits to every party's seed-derived tape, fills sign->proofs[].delta_* and derives sign->h_1.
+void run_phase_1(signature_t *sign, // in: salt; out: proofs[rep].delta_* and h_1
+                 const GF pt_GF, const GF sbox_outputs[AIMER_L], // secret values the shares must add up to
+                 const uint8_t mu[AIMER_COMMIT_SIZE], // pre-hashed message, absorbed into h_1
+                 const hash_instance *ctx_tree, // tree hashing state handed to expand_tree
+                 const uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE])
+{
+  uint8_t commits[AIMER_COMMIT_SIZE]; // scratch: commitment of the party currently processed
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE]; // seed tree of the current repetition
+
+  hash_instance ctx;
+
+  // ctx accumulates h_1: prefix 1, mu, salt, then per repetition all commitments and the deltas
+  hash_init_prefix(&ctx, HASH_PREFIX_1);
+  hash_update(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx, sign->salt, AIMER_SALT_SIZE);
+
+  hash_instance ctx_precom; // shared (prefix 5 || salt) state, cloned once per party
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    // load the pre-expanded tree nodes, then let expand_tree derive any remaining ones
+    memcpy(nodes, pre_nodes[rep][0], (2 * PRE_TREE_IDX - 1) * AIMER_SEED_SIZE);
+    expand_tree(nodes, ctx_tree, rep);
+
+    // delta accumulates the sum of all parties' tape values for this repetition
+    tape_t delta;
+    memset(&delta, 0, sizeof(tape_t));
+
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      // leaf seed of `party` sits at nodes[AIMER_N + party - 1]
+      tape_t tape;
+      commit_and_expand_tape(&tape, commits, &ctx_precom,
+                             nodes[AIMER_N + party - 1],  rep, party);
+      hash_update(&ctx, commits, AIMER_COMMIT_SIZE);
+
+      // compute offsets
+      GF_add(delta.pt_share, delta.pt_share, tape.pt_share);
+      GF_add(delta.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+      GF_add(delta.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+      GF_add(delta.a_share, delta.a_share, tape.a_share);
+      GF_add(delta.c_share, delta.c_share, tape.c_share);
+
+      // after the last party: add the real values so delta becomes the correction, then serialize it
+      if (party == AIMER_N - 1)
+      {
+        GF_add(delta.pt_share, delta.pt_share, pt_GF);
+        GF_add(delta.t_shares[0], delta.t_shares[0], sbox_outputs[0]);
+        GF_add(delta.t_shares[1], delta.t_shares[1], sbox_outputs[1]);
+        GF_mul_add_s(delta.c_share, pt_GF, delta.a_share); // presumably c += pt * a (defined in field.c)
+
+        GF_to_bytes(sign->proofs[rep].delta_pt_bytes, delta.pt_share);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[0], delta.t_shares[0]);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[1], delta.t_shares[1]);
+        GF_to_bytes(sign->proofs[rep].delta_c_bytes, delta.c_share);
+      }
+    }
+
+    // NOTE: one call absorbs delta_pt, delta_ts[0..L-1] and delta_c, relying on their order in proof_t
+    hash_update(&ctx, sign->proofs[rep].delta_pt_bytes,
+                AIM2_NUM_BYTES_FIELD * (AIMER_L + 2));
+  }
+  hash_ctx_release(&ctx_precom);
+
+  // commit to salt, (all commitments of parties' seeds,
+  // delta_pt, delta_t, delta_c) for all repetitions
+  hash_final(&ctx);
+  hash_squeeze(&ctx, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+}
+
+void run_phase_1_to_3(signature_t *sign, // in: salt, h_1, proofs[].delta_*; out: h_2
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b, const GF ct_GF,
+                      const hash_instance *ctx_tree,
+                      const uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE])
+{ // Re-derives every party's tape, runs the multiplication check per party and hashes the shares into h_2.
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE]; // seed tree of the current repetition
+
+  GF pt_shares[AIMER_N]; // kept until alpha is known, then multiplied into the v shares
+  GF alpha_v_shares[2][AIMER_N]; // [0][i]: alpha share of party i, [1][i]: v share of party i
+  GF epsilons[AIMER_L + 1]; // challenge values of the current repetition
+
+  // ctx accumulates h_2: prefix 2, h_1, salt, then alpha_v_shares of every repetition
+  hash_instance ctx;
+  hash_init_prefix(&ctx, HASH_PREFIX_2);
+  hash_update(&ctx, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_update(&ctx, sign->salt, AIMER_SALT_SIZE);
+
+  // ctx_e is an output stream keyed by h_1; each repetition squeezes its epsilons from it
+  hash_instance ctx_e;
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  hash_instance ctx_precom; // shared (prefix 5 || salt) state, cloned once per party
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  GF alpha = {0,};
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    // load the pre-expanded tree nodes, then let expand_tree derive any remaining ones
+    memcpy(nodes, pre_nodes[rep][0], (2 * PRE_TREE_IDX - 1) * AIMER_SEED_SIZE);
+    expand_tree(nodes, ctx_tree, rep);
+
+    // delta receives the offsets stored in the proof (used for the last party only)
+    tape_t delta;
+    memset(&delta, 0, sizeof(tape_t));
+
+    // expand challenge
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF_set0(alpha); // alpha is re-accumulated from the parties' shares in every repetition
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      // regenerate the tape only; the commitment part of the stream is skipped
+      tape_t tape;
+      commit_and_expand_tape_phase_3(&tape, &ctx_precom,
+                                     nodes[AIMER_N + party - 1], rep, party);
+
+      mult_chk_t mult_chk;
+      memset(&mult_chk, 0, sizeof(mult_chk));
+
+      // the last party additionally takes the stored delta offsets and the constant vector b
+      if (party == AIMER_N - 1)
+      {
+        GF_from_bytes(delta.pt_share, sign->proofs[rep].delta_pt_bytes);
+        GF_from_bytes(delta.t_shares[0], sign->proofs[rep].delta_ts_bytes[0]);
+        GF_from_bytes(delta.t_shares[1], sign->proofs[rep].delta_ts_bytes[1]);
+        GF_from_bytes(delta.c_share, sign->proofs[rep].delta_c_bytes);
+
+        GF_add(tape.pt_share, delta.pt_share, tape.pt_share);
+        GF_add(tape.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+        GF_add(tape.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+        GF_add(tape.c_share, delta.c_share, tape.c_share);
+
+        GF_copy(mult_chk.x_shares[AIMER_L], vector_b);
+      }
+
+      // run the MPC simulation and prepare the mult check inputs
+      GF_copy(mult_chk.x_shares[0], tape.t_shares[0]);
+      GF_copy(mult_chk.x_shares[1], tape.t_shares[1]);
+      GF_copy(pt_shares[party], tape.pt_share);
+      GF_copy(alpha_v_shares[0][party], tape.a_share);
+      GF_copy(alpha_v_shares[1][party], tape.c_share);
+      aim2_mpc(&mult_chk, matrix_A, ct_GF);
+
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[0], epsilons[0]); // alpha part uses x shares
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[0], epsilons[0]); // v part uses z shares
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[2], epsilons[2]);
+      GF_add(alpha, alpha, alpha_v_shares[0][party]);
+    }
+
+    // alpha is opened, so we can finish calculating v_share
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      GF_mul_add(alpha_v_shares[1][party], pt_shares[party], alpha);
+    }
+    hash_update(&ctx, (const uint8_t *)alpha_v_shares, sizeof(alpha_v_shares)); // raw in-memory bytes
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+
+  hash_final(&ctx);
+  hash_squeeze(&ctx, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+}
+
+void run_phase_1_to_5(signature_t *sign, // in: salt, h_1, h_2, delta_*; out: reveal_path, missing_*
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b,
+                      const hash_instance *ctx_tree,
+                      const uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE])
+{
+  // derive one hidden party index per repetition from h_2
+  hash_instance ctx;
+  hash_init(&ctx);
+  hash_update(&ctx, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_final(&ctx);
+
+  uint8_t indices[AIMER_T]; // AIMER_N <= 256
+  hash_squeeze(&ctx, indices, AIMER_T);
+  hash_ctx_release(&ctx);
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    indices[rep] &= (1 << AIMER_LOGN) - 1; // keep the low AIMER_LOGN bits of each byte
+  }
+
+  uint8_t commits[AIMER_COMMIT_SIZE]; // scratch: commitment of the hidden party
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE]; // seed tree of the current repetition
+
+  GF epsilons[AIMER_L + 1];
+
+  // ctx_e is an output stream keyed by h_1; each repetition squeezes its epsilons from it
+  hash_instance ctx_e;
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  hash_instance ctx_precom; // shared (prefix 5 || salt) state, cloned per commitment
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  mult_chk_t mult_chk; // only x_shares are used here; x_shares[AIMER_L] is reset per repetition
+  memset(&mult_chk, 0, sizeof(mult_chk_t));
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    size_t i_bar = indices[rep]; // party whose seed stays hidden in this repetition
+
+    // rebuild the seed tree and store the sibling path that reveals every leaf except i_bar
+    memcpy(nodes, pre_nodes[rep][0], (2 * PRE_TREE_IDX - 1) * AIMER_SEED_SIZE);
+    expand_tree(nodes, ctx_tree, rep);
+    reveal_all_but(sign->proofs[rep].reveal_path,
+                   (const uint8_t (*)[AIMER_SEED_SIZE])nodes, i_bar);
+
+    // expand challenge
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF alpha_share = {0,};
+
+    tape_t tape; // tape and commitment of the hidden party
+    commit_and_expand_tape(&tape, commits, &ctx_precom,
+                           nodes[AIMER_N + i_bar - 1], rep, i_bar);
+    memcpy(sign->proofs[rep].missing_commitment, commits,
+           AIMER_COMMIT_SIZE);
+
+    GF_set0(mult_chk.x_shares[AIMER_L]);
+
+    // if the hidden party is the last one, it also carries the delta offsets and vector b
+    if (i_bar == AIMER_N - 1)
+    {
+      // initialize adjustment values
+      tape_t delta;
+      memset(&delta, 0, sizeof(tape_t));
+
+      GF_from_bytes(delta.pt_share, sign->proofs[rep].delta_pt_bytes);
+      GF_from_bytes(delta.t_shares[0], sign->proofs[rep].delta_ts_bytes[0]);
+      GF_from_bytes(delta.t_shares[1], sign->proofs[rep].delta_ts_bytes[1]);
+      GF_from_bytes(delta.c_share, sign->proofs[rep].delta_c_bytes);
+
+      GF_add(tape.pt_share, delta.pt_share, tape.pt_share);
+      GF_add(tape.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+      GF_add(tape.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+      GF_add(tape.c_share, delta.c_share, tape.c_share);
+
+      GF_copy(mult_chk.x_shares[AIMER_L], vector_b);
+    }
+
+    for (size_t ell = 0; ell < AIMER_L; ell++)
+    {
+      GF_copy(mult_chk.x_shares[ell], tape.t_shares[ell]);
+
+      // x_* = sum_i A[i] * t[i] + b
+      GF_transposed_matmul_add(mult_chk.x_shares[AIMER_L],
+                               tape.t_shares[ell], matrix_A[ell]);
+    }
+    GF_copy(alpha_share, tape.a_share);
+
+    GF_mul_add(alpha_share, mult_chk.x_shares[0], epsilons[0]); // same alpha formula as in phase 3
+    GF_mul_add(alpha_share, mult_chk.x_shares[1], epsilons[1]);
+    GF_mul_add(alpha_share, mult_chk.x_shares[2], epsilons[2]);
+    GF_to_bytes(sign->proofs[rep].missing_alpha_share_bytes, alpha_share);
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk)
+{
+  // Layout produced: pk = iv || aim2 output, sk = random field bytes || pk.
+  if (pk == NULL || sk == NULL)
+  {
+    return -1;
+  }
+
+  uint8_t *const iv = pk;
+  randombytes(sk, AIM2_NUM_BYTES_FIELD);
+  randombytes(iv, AIM2_IV_SIZE);
+  aim2(iv + AIM2_IV_SIZE, sk, iv);
+  memcpy(sk + AIM2_NUM_BYTES_FIELD, pk, AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  return 0;
+}
+
+int crypto_sign_signature(uint8_t *sig, size_t *siglen, // out: CRYPTO_BYTES of signature and its length
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk) // read as: field bytes (pt) || iv || ct, as written by crypto_sign_keypair
+{
+  signature_t *sign = (signature_t *)sig; // the signature is assembled in place inside sig
+
+  GF pt_GF = {0,}, ct_GF = {0,};
+  GF_from_bytes(pt_GF, sk);
+  GF_from_bytes(ct_GF, sk + AIM2_NUM_BYTES_FIELD + AIM2_IV_SIZE);
+
+  // message pre-hashing: mu = H(prefix 0 || iv || ct || m)
+  hash_instance ctx;
+  hash_init_prefix(&ctx, HASH_PREFIX_0);
+  hash_update(&ctx, sk + AIM2_NUM_BYTES_FIELD,
+              AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx, m, mlen);
+  hash_final(&ctx);
+
+  uint8_t mu[AIMER_COMMIT_SIZE];
+  hash_squeeze(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+
+  // compute first L sboxes' outputs
+  GF sbox_outputs[AIMER_L];
+  aim2_sbox_outputs(sbox_outputs, pt_GF);
+
+  // derive the binary matrix and the vector from the initial vector
+  GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+  generate_matrix_LU(matrix_A, vector_b, sk + AIM2_NUM_BYTES_FIELD);
+
+  // generate per-signature randomness
+  uint8_t random[SECURITY_BYTES];
+  randombytes(random, SECURITY_BYTES);
+
+  // salt and root seeds come from one stream: H(prefix 3 || pt bytes || mu || random)
+  hash_instance ctx_roots;
+  hash_init_prefix(&ctx_roots, HASH_PREFIX_3);
+  hash_update(&ctx_roots, sk, AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx_roots, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_roots, random, SECURITY_BYTES);
+  hash_final(&ctx_roots);
+  hash_squeeze(&ctx_roots, sign->salt, AIMER_SALT_SIZE);
+
+  uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE];
+  hash_squeeze(&ctx_roots, (uint8_t *)root_seeds, AIMER_T * AIMER_SEED_SIZE);
+  hash_ctx_release(&ctx_roots);
+
+  uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE] = {0,}; // seed trees, built once
+
+  hash_instance ctx_tree; // (prefix 4 || salt) state shared by all tree expansions
+  hash_init_prefix(&ctx_tree, HASH_PREFIX_4);
+  hash_update(&ctx_tree, sign->salt, AIMER_SALT_SIZE);
+
+  pre_expand_trees(pre_nodes, &ctx_tree, root_seeds);
+
+  run_phase_1(sign, pt_GF, (const GF *)sbox_outputs, mu, // fills the delta_* fields and h_1
+              &ctx_tree,
+              (const uint8_t (*)[2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE])pre_nodes);
+  run_phase_1_to_3(sign, (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A, // fills h_2
+                   vector_b, ct_GF, &ctx_tree,
+                   (const uint8_t (*)[2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE])pre_nodes);
+  run_phase_1_to_5(sign, (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A, // fills reveal paths and missing_*
+                   vector_b, &ctx_tree,
+                   (const uint8_t (*)[2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE])pre_nodes);
+
+  hash_ctx_release(&ctx_tree);
+
+  *siglen = CRYPTO_BYTES;
+
+  return 0;
+}
+
+int crypto_sign(uint8_t *sm, size_t *smlen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk)
+{ // Writes sm = m || signature and sets *smlen; returns 0, or -1 if signing fails.
+  if (crypto_sign_signature(sm + mlen, smlen, m, mlen, sk) != 0)
+  {
+    return -1;
+  }
+  memmove(sm, m, mlen); // memmove, not memcpy: m may alias sm (in-place signing)
+  *smlen += mlen;
+  return 0;
+}
+
+int crypto_sign_verify(const uint8_t *sig, // returns 0 if h_1 and h_2 can be recomputed, -1 otherwise
+        size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk)
+{
+  if (siglen != CRYPTO_BYTES)
+  {
+    return -1;
+  }
+
+  const signature_t *sign = (const signature_t *)sig;
+
+  GF ct_GF = {0,}; // pk is read as iv || ct
+  GF_from_bytes(ct_GF, pk + AIM2_IV_SIZE);
+
+  // derive the binary matrix and the vector from the initial vector
+  GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+  generate_matrix_LU(matrix_A, vector_b, pk);
+
+  hash_instance ctx_e, ctx_h1, ctx_h2;
+
+  // indices = Expand(h_2)
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  uint8_t indices[AIMER_T]; // AIMER_N <= 256
+  hash_squeeze(&ctx_e, indices, AIMER_T);
+  hash_ctx_release(&ctx_e);
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    indices[rep] &= (1 << AIMER_LOGN) - 1; // keep the low AIMER_LOGN bits of each byte
+  }
+
+  // epsilons = Expand(h_1)
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  // message pre-hashing (ctx_h1 is used as scratch here and re-initialized below)
+  uint8_t mu[AIMER_COMMIT_SIZE];
+  hash_init_prefix(&ctx_h1, HASH_PREFIX_0);
+  hash_update(&ctx_h1, pk, AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx_h1, m, mlen);
+  hash_final(&ctx_h1);
+  hash_squeeze(&ctx_h1, mu, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h1);
+
+  // ready for computing h_1' and h_2'
+  hash_init_prefix(&ctx_h1, HASH_PREFIX_1);
+  hash_update(&ctx_h1, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_h1, sign->salt, AIMER_SALT_SIZE);
+
+  hash_init_prefix(&ctx_h2, HASH_PREFIX_2);
+  hash_update(&ctx_h2, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_h2, sign->salt, AIMER_SALT_SIZE);
+
+  hash_instance ctx_precom; // shared (prefix 5 || salt) state, cloned once per party
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    size_t i_bar = indices[rep]; // party whose seed was not revealed
+    uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE]; // tree without its root: node k at nodes[k - 2]
+
+    reconstruct_tree(nodes, sign->salt, sign->proofs[rep].reveal_path,
+                     rep, i_bar);
+
+    GF pt_shares[AIMER_N]; // entry i_bar is never written nor read
+    GF alpha_v_shares[2][AIMER_N]; // [0][i]: alpha share, [1][i]: v share of party i
+    GF_set0(alpha_v_shares[1][i_bar]); // becomes the sum of all other v shares below
+
+    GF epsilons[AIMER_L + 1];
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF alpha = {0,};
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      if (party == i_bar)
+      {
+        // hidden party: take its commitment and alpha share from the proof
+        hash_update(&ctx_h1, sign->proofs[rep].missing_commitment,
+                    AIMER_COMMIT_SIZE);
+        GF_from_bytes(alpha_v_shares[0][i_bar],
+                      sign->proofs[rep].missing_alpha_share_bytes);
+        GF_add(alpha, alpha, alpha_v_shares[0][i_bar]);
+        continue;
+      }
+
+      tape_t tape;
+      uint8_t commit[AIMER_COMMIT_SIZE];
+      commit_and_expand_tape(&tape, commit, &ctx_precom,
+                             nodes[AIMER_N + party - 2], rep, party);
+      hash_update(&ctx_h1, commit, AIMER_COMMIT_SIZE);
+
+      // adjust last shares
+      mult_chk_t mult_chk;
+      memset(&mult_chk, 0, sizeof(mult_chk_t));
+      if (party == AIMER_N - 1)
+      {
+        GF temp = {0,};
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_pt_bytes);
+        GF_add(tape.pt_share, tape.pt_share, temp);
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[0]);
+        GF_add(tape.t_shares[0], tape.t_shares[0], temp);
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[1]);
+        GF_add(tape.t_shares[1], tape.t_shares[1], temp);
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_c_bytes);
+        GF_add(tape.c_share, tape.c_share, temp);
+
+        GF_copy(mult_chk.x_shares[AIMER_L], vector_b);
+      }
+
+      // run the MPC simulation and prepare the mult check inputs
+      GF_copy(mult_chk.x_shares[0], tape.t_shares[0]);
+      GF_copy(mult_chk.x_shares[1], tape.t_shares[1]);
+      GF_copy(pt_shares[party], tape.pt_share);
+      GF_copy(alpha_v_shares[0][party], tape.a_share);
+      GF_copy(alpha_v_shares[1][party], tape.c_share);
+      aim2_mpc(&mult_chk, (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A, ct_GF);
+
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[0], epsilons[0]); // alpha part uses x shares
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[0], epsilons[0]); // v part uses z shares
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[2], epsilons[2]);
+
+      GF_add(alpha, alpha, alpha_v_shares[0][party]);
+    }
+
+    // alpha is opened, so we can finish calculating v_share
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      if (party == i_bar)
+      {
+        continue;
+      }
+
+      GF_mul_add(alpha_v_shares[1][party], pt_shares[party], alpha);
+      GF_add(alpha_v_shares[1][i_bar], alpha_v_shares[1][i_bar], // hidden v share = sum of the others
+             alpha_v_shares[1][party]);
+    }
+
+    // v is opened
+    hash_update(&ctx_h2, (const uint8_t *)alpha_v_shares,
+                sizeof(alpha_v_shares));
+
+    // NOTE: one call absorbs delta_pt, delta_ts[0..L-1] and delta_c, relying on their order in proof_t
+    hash_update(&ctx_h1, sign->proofs[rep].delta_pt_bytes,
+                AIM2_NUM_BYTES_FIELD * (AIMER_L + 2));
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+
+  uint8_t h_1_prime[AIMER_COMMIT_SIZE];
+  hash_final(&ctx_h1);
+  hash_squeeze(&ctx_h1, h_1_prime, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h1);
+
+  uint8_t h_2_prime[AIMER_COMMIT_SIZE];
+  hash_final(&ctx_h2);
+  hash_squeeze(&ctx_h2, h_2_prime, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h2);
+
+  if (memcmp(h_1_prime, sign->h_1, AIMER_COMMIT_SIZE) != 0 || // both recomputed hashes must match
+      memcmp(h_2_prime, sign->h_2, AIMER_COMMIT_SIZE) != 0)
+  {
+    return -1;
+  }
+
+  return 0;
+}
+
+int crypto_sign_open(uint8_t *m, size_t *mlen,
+        const uint8_t *sm, size_t smlen,
+        const uint8_t *pk)
+{
+  // Expected layout of sm: message bytes followed by a CRYPTO_BYTES signature.
+  // On success the message is copied to m and its length stored in *mlen;
+  // on any failure -1 is returned and m / *mlen are left untouched.
+  if (smlen < CRYPTO_BYTES)
+  {
+    return -1;
+  }
+
+  const size_t body_len = smlen - CRYPTO_BYTES;
+  const uint8_t *const sig_start = sm + body_len;
+
+  if (crypto_sign_verify(sig_start, CRYPTO_BYTES, sm, body_len, pk) != 0)
+  {
+    return -1;
+  }
+
+  // memmove tolerates m overlapping sm
+  memmove(m, sm, body_len);
+  *mlen = body_len;
+  return 0;
+}
diff --git a/crypto_sign/aimer128s/m4speed/sign.h b/crypto_sign/aimer128s/m4speed/sign.h
new file mode 100644
index 00000000..0c168ee0
--- /dev/null
+++ b/crypto_sign/aimer128s/m4speed/sign.h
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef SIGN_H
+#define SIGN_H
+
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+typedef struct tape_t // one party's random tape; filled directly from hash output in sign.c
+{
+  GF pt_share; // share of pt
+  GF t_shares[AIMER_L]; // shares matched against sbox_outputs[] via the delta offsets
+  GF a_share; // starting value of the party's alpha share
+  GF c_share; // starting value of the party's v share
+} tape_t;
+
+typedef struct mult_chk_t // per-party inputs of the multiplication check ("z = x * pt" in aim2_mpc)
+{
+  GF x_shares[AIMER_L + 1]; // [0..L-1]: t shares, [L]: linear-layer output share
+  GF z_shares[AIMER_L + 1]; // computed by aim2_mpc from x_shares
+} mult_chk_t;
+
+typedef struct proof_t // per-repetition part of the signature; all-byte fields, hashed in declared order
+{
+  uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE]; // sibling seeds written by reveal_all_but
+  uint8_t missing_commitment[AIMER_COMMIT_SIZE]; // commitment of the hidden party
+  uint8_t delta_pt_bytes[AIM2_NUM_BYTES_FIELD]; // delta_pt .. delta_c are absorbed as one contiguous run
+  uint8_t delta_ts_bytes[AIMER_L][AIM2_NUM_BYTES_FIELD];
+  uint8_t delta_c_bytes[AIM2_NUM_BYTES_FIELD];
+  uint8_t missing_alpha_share_bytes[AIM2_NUM_BYTES_FIELD]; // alpha share of the hidden party
+} proof_t;
+
+typedef struct signature_t // overlaid on the raw signature buffer in sign.c
+{
+  uint8_t salt[AIMER_SALT_SIZE]; // per-signature salt
+  uint8_t h_1[AIMER_COMMIT_SIZE]; // hash over commitments and deltas; source of the epsilons
+  uint8_t h_2[AIMER_COMMIT_SIZE]; // hash over alpha/v shares; source of the hidden party indices
+  proof_t proofs[AIMER_T]; // one proof per repetition
+} signature_t;
+
+#define aim2_mpc AIMER_NAMESPACE(aim2_mpc)
+void aim2_mpc(mult_chk_t *mult_chk, // in: x_shares; out: z_shares and the accumulated x_shares[AIMER_L]
+              const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+              const GF ct_GF);
+
+#define commit_and_expand_tape AIMER_NAMESPACE(commit_and_expand_tape)
+void commit_and_expand_tape(tape_t *tape, uint8_t *commit, // out: tape and AIMER_COMMIT_SIZE bytes
+                            const hash_instance *ctx_precom, // cloned, never modified
+                            const uint8_t seed[AIMER_SEED_SIZE],
+                            size_t rep, size_t party); // each truncated to one byte before hashing
+
+#define commit_and_expand_tape_phase_3 AIMER_NAMESPACE(commit_and_expand_tape_phase_3)
+void commit_and_expand_tape_phase_3(tape_t *tape, // as above, but the commitment bytes are skipped
+                                    const hash_instance *ctx_precom,
+                                    const uint8_t seed[AIMER_SEED_SIZE],
+                                    size_t rep, size_t party);
+
+#define run_phase_1 AIMER_NAMESPACE(run_phase_1)
+void run_phase_1(signature_t *sign, // writes proofs[].delta_* and h_1
+                 const GF pt_GF, const GF sbox_outputs[AIMER_L],
+                 const uint8_t mu[AIMER_COMMIT_SIZE],
+                 const hash_instance *ctx_tree,
+                 const uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE]);
+
+#define run_phase_1_to_3 AIMER_NAMESPACE(run_phase_1_to_3)
+void run_phase_1_to_3(signature_t *sign, // reads h_1 and delta_*, writes h_2
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b, const GF ct_GF,
+                      const hash_instance *ctx_tree,
+                      const uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE]);
+
+#define run_phase_1_to_5 AIMER_NAMESPACE(run_phase_1_to_5)
+void run_phase_1_to_5(signature_t *sign, // reads h_1 and h_2, writes reveal_path and missing_* fields
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b,
+                      const hash_instance *ctx_tree,
+                      const uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE]);
+
+#endif // SIGN_H
diff --git a/crypto_sign/aimer128s/m4speed/tree.c b/crypto_sign/aimer128s/m4speed/tree.c
new file mode 100644
index 00000000..3f4fa554
--- /dev/null
+++ b/crypto_sign/aimer128s/m4speed/tree.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: MIT
+
+#include "tree.h"
+#include "hash.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+//  Example of tree for [N = 8]
+//  x
+//  d = 0: 1
+//  d = 1: 2         3
+//  d = 2: 4   5     6     7
+//  d = 3: 8 9 10 11 12 13 14 15
+
+void pre_expand_trees(uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE],
+                      const hash_instance *ctx_tree,
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE])
+{
+  // Per repetition: root seed goes to slot 0, then nodes 1 .. PRE_TREE_IDX - 1 are expanded (node k is slot k - 1).
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    uint8_t (*tree)[AIMER_SEED_SIZE] = pre_nodes[rep];
+    memcpy(tree[0], root_seeds[rep], AIMER_SEED_SIZE);
+    for (size_t parent = 1; parent < PRE_TREE_IDX; parent++)
+    {
+      // hash input: rep byte || node byte || parent seed
+      uint8_t input[2 + AIMER_SEED_SIZE];
+      input[0] = (uint8_t)rep;
+      input[1] = (uint8_t)parent;
+      memcpy(&input[2], tree[parent - 1], AIMER_SEED_SIZE);
+      hash_instance xof;
+      hash_ctx_clone(&xof, ctx_tree);
+      hash_update(&xof, input, sizeof(input));
+      hash_final(&xof);
+      // one squeeze fills both children (slots 2 * parent - 1 and 2 * parent)
+      hash_squeeze(&xof, tree[2 * parent - 1], 2 * AIMER_SEED_SIZE);
+      hash_ctx_release(&xof);
+    }
+  }
+}
+
+void expand_tree(uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                 const hash_instance *ctx_tree,
+                 size_t rep_index)
+{
+  // Derives the children of nodes PRE_TREE_IDX .. AIMER_N - 1; earlier levels
+  // are expected to be present in `nodes` already (node k is at slot k - 1).
+  uint8_t input[2 + AIMER_SEED_SIZE];
+  input[0] = (uint8_t)rep_index;
+
+  for (size_t parent = PRE_TREE_IDX; parent < AIMER_N; parent++)
+  {
+    input[1] = (uint8_t)parent;
+    memcpy(&input[2], nodes[parent - 1], AIMER_SEED_SIZE);
+
+    hash_instance xof;
+    hash_ctx_clone(&xof, ctx_tree);
+    hash_update(&xof, input, sizeof(input));
+    hash_final(&xof);
+    hash_squeeze(&xof, nodes[2 * parent - 1], 2 * AIMER_SEED_SIZE);
+    hash_ctx_release(&xof);
+  }
+}
+
+void reveal_all_but(uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                    const uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                    size_t cover_index)
+{
+  // Climb from the hidden leaf towards the root; at each level store the
+  // sibling's seed (node k is kept at slot k - 1 and k ^ 1 is its sibling).
+  size_t node = AIMER_N + cover_index;
+
+  for (size_t level = 0; level < AIMER_LOGN; level++)
+  {
+    memcpy(reveal_path[level], nodes[(node ^ 1) - 1], AIMER_SEED_SIZE);
+    node >>= 1; // parent of the current node
+  }
+}
+
+void reconstruct_tree(uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE], // out: node k at nodes[k - 2], root not stored
+                      const uint8_t salt[AIMER_SALT_SIZE],
+                      const uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE], // as written by reveal_all_but
+                      size_t rep_index,
+                      size_t cover_index) // leaf whose seed is not revealed
+{ // Rebuilds the seed tree level by level from the revealed sibling seeds.
+  size_t index, depth, path;
+  uint8_t buffer[AIMER_SEED_SIZE + 2];
+
+  hash_instance ctx, ctx_; // ctx_ holds the (prefix 4 || salt) state, ctx is its per-node clone
+  hash_init_prefix(&ctx_, HASH_PREFIX_4);
+  hash_update(&ctx_, salt, AIMER_SALT_SIZE);
+
+  for (depth = 1; depth < AIMER_LOGN; depth++)
+  {
+    path = ((cover_index + AIMER_N) >> (AIMER_LOGN - depth)) ^ 1; // sibling of the hidden leaf's ancestor
+    memcpy(nodes[path - 2], reveal_path[AIMER_LOGN - depth], AIMER_SEED_SIZE);
+
+    for (index = (1U << depth); index < (2U << depth); index++) // every node of this depth
+    {
+      buffer[0] = (uint8_t)(rep_index);
+      buffer[1] = (uint8_t)(index);
+      memcpy(buffer + 2, nodes[index - 2], AIMER_SEED_SIZE); // the on-path node is expanded too, from whatever nodes[] holds
+
+      hash_ctx_clone(&ctx, &ctx_);
+      hash_update(&ctx, buffer, AIMER_SEED_SIZE + 2);
+      hash_final(&ctx);
+      hash_squeeze(&ctx, nodes[2 * index - 2], AIMER_SEED_SIZE << 1); // fills both children at once
+      hash_ctx_release(&ctx);
+    }
+  }
+  hash_ctx_release(&ctx_);
+
+  path = ((cover_index + AIMER_N) >> (AIMER_LOGN - depth)) ^ 1; // depth == AIMER_LOGN: sibling leaf
+  memcpy(nodes[path - 2], reveal_path[AIMER_LOGN - depth], AIMER_SEED_SIZE);
+}
diff --git a/crypto_sign/aimer128s/m4speed/tree.h b/crypto_sign/aimer128s/m4speed/tree.h
new file mode 100644
index 00000000..364c85f7
--- /dev/null
+++ b/crypto_sign/aimer128s/m4speed/tree.h
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef TREE_H
+#define TREE_H
+
+#include "params.h"
+#include "hash.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define pre_expand_trees AIMER_NAMESPACE(pre_expand_trees)
+void pre_expand_trees(uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE],
+                      const hash_instance *ctx_tree,
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE]);
+// NOTE(review): presumably fills the whole seed tree (root included) for repetition rep_index; confirm in tree.c.
+#define expand_tree AIMER_NAMESPACE(expand_tree)
+void expand_tree(uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                 const hash_instance *ctx_tree,
+                 size_t rep_index);
+// reveal_path[d] receives the sibling seed d levels above leaf cover_index; nodes[k - 1] is heap node k.
+#define reveal_all_but AIMER_NAMESPACE(reveal_all_but)
+void reveal_all_but(uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                    const uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                    size_t cover_index);
+// Rebuilds nodes[] without the root (nodes[k - 2] is heap node k) from salt, rep_index and reveal_path.
+#define reconstruct_tree AIMER_NAMESPACE(reconstruct_tree)
+void reconstruct_tree(uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE],
+                      const uint8_t salt[AIMER_SALT_SIZE],
+                      const uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                      size_t rep_index,
+                      size_t cover_index);
+
+#endif // TREE_H
diff --git a/crypto_sign/aimer128s/m4stack/__asm_field.S b/crypto_sign/aimer128s/m4stack/__asm_field.S
new file mode 100644
index 00000000..05656b37
--- /dev/null
+++ b/crypto_sign/aimer128s/m4stack/__asm_field.S
@@ -0,0 +1,544 @@
+#include "params.h"
+
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+.text
+
+.align  2
+.global AIMER_NAMESPACE(GF_to_bytes)
+.global AIMER_NAMESPACE(GF_from_bytes)
+.global AIMER_NAMESPACE(GF_copy)
+.type   AIMER_NAMESPACE(GF_to_bytes), %function
+.type   AIMER_NAMESPACE(GF_from_bytes), %function
+.type   AIMER_NAMESPACE(GF_copy), %function
+AIMER_NAMESPACE(GF_to_bytes):    // three entry points share one body:
+AIMER_NAMESPACE(GF_from_bytes):  // copy four 32-bit words (16 bytes)
+AIMER_NAMESPACE(GF_copy):        // from [R1] to [R0] without reordering
+  out_p       .req R0            // destination pointer
+  in_p        .req R1            // source pointer
+
+  .equ width, 4                  // bytes per 32-bit word
+
+  ldr.w R2, [in_p, #0 * width]   // words 0 and 1 (R2, R3 are scratch)
+  ldr.w R3, [in_p, #1 * width]
+  str.w R2, [out_p, #0 * width]
+  str.w R3, [out_p, #1 * width]
+
+  ldr.w R2, [in_p, #2 * width]   // words 2 and 3
+  ldr.w R3, [in_p, #3 * width]
+  str.w R2, [out_p, #2 * width]
+  str.w R3, [out_p, #3 * width]
+
+  bx    lr                       // leaf routine: nothing was pushed
+
+  .unreq in_p
+  .unreq out_p
+
+.align  2
+.global AIMER_NAMESPACE(GF_set0)
+.type   AIMER_NAMESPACE(GF_set0), %function
+AIMER_NAMESPACE(GF_set0):        // writes four zero words (16 bytes) to [R0]
+  out_p       .req R0            // destination pointer
+
+  .equ width, 4                  // bytes per 32-bit word
+
+  mov.w R2, #0                   // R2 is the only scratch register used
+  str.w R2, [out_p, #0 * width]
+  str.w R2, [out_p, #1 * width]
+  str.w R2, [out_p, #2 * width]
+  str.w R2, [out_p, #3 * width]
+
+  bx    lr
+
+  .unreq out_p
+
+.align  2
+.global AIMER_NAMESPACE(GF_add)
+.type   AIMER_NAMESPACE(GF_add), %function
+AIMER_NAMESPACE(GF_add):         // [R0] = [R1] XOR [R2] over four 32-bit words
+  out_p       .req R0            // destination pointer
+  in0_p       .req R1            // first operand pointer
+  in1_p       .req R2            // second operand pointer
+
+  .equ width, 4                  // bytes per 32-bit word
+
+  ldr.w R3,  [in0_p, #0 * width] // each word is loaded from both operands
+  ldr.w R12, [in1_p, #0 * width] // before the store, so out_p may equal
+  eor.w R3, R3, R12              // either input pointer
+  str.w R3,  [out_p, #0 * width]
+
+  ldr.w R3,  [in0_p, #1 * width]
+  ldr.w R12, [in1_p, #1 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #1 * width]
+
+  ldr.w R3,  [in0_p, #2 * width]
+  ldr.w R12, [in1_p, #2 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #2 * width]
+
+  ldr.w R3,  [in0_p, #3 * width]
+  ldr.w R12, [in1_p, #3 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #3 * width]
+
+  bx    lr                       // only R3 and R12 were used as scratch
+
+  .unreq out_p
+  .unreq in0_p
+  .unreq in1_p
+
+.macro or_shift_and in_a, con_a, off_a    // in_a = (in_a | (in_a << off_a)) & con_a
+  orr.w \in_a, \in_a, \in_a, lsl #\off_a  // merge a left-shifted copy of the register
+  and.w \in_a, \in_a, \con_a              // keep only the bits selected by the constant
+.endm
+
+.align  2
+.global AIMER_NAMESPACE(GF_sqr_s)
+.type   AIMER_NAMESPACE(GF_sqr_s), %function
+AIMER_NAMESPACE(GF_sqr_s):       // [R0] = square of the 128-bit value at [R1], reduced to 128 bits
+  out_p       .req R0            // destination pointer (four words)
+  in_p        .req R1            // source pointer (four words)
+
+  in0         .req R2            // in0..in7 end up holding the 256-bit square,
+  in1         .req R3            // lowest word in in0; in the comments below
+  in2         .req R4            // temp[k] is the 64-bit pair (in(2k), in(2k+1))
+  in3         .req R5
+  in4         .req R6
+  in5         .req R7
+  in6         .req R8
+  in7         .req R9
+
+  .equ C0,    1431655765  // 0x55555555
+  .equ C1,    858993459   // 0x33333333
+  .equ C2,    252645135   // 0x0F0F0F0F
+  .equ C3,    16711935    // 0x00FF00FF
+  .equ C4,    4294967295  // 0xFFFFFFFF
+  .equ width, 4
+
+  push.w {R4-R10, lr}
+
+  ldr.w in0, [in_p, #0 * width]  // a[0]
+  ldr.w in2, [in_p, #1 * width]
+  ldr.w in4, [in_p, #2 * width]  // a[1]
+  ldr.w in6, [in_p, #3 * width]
+
+  lsr.w in1, in0, #16            // odd registers take the upper halfword of each input word
+  lsr.w in3, in2, #16
+  lsr.w in5, in4, #16
+  lsr.w in7, in6, #16
+
+  mov.w R10, C4
+
+  and.w in0, in0, R10, lsr #16   // even registers keep the lower halfword (mask 0x0000FFFF)
+  and.w in2, in2, R10, lsr #16
+  and.w in4, in4, R10, lsr #16
+  and.w in6, in6, R10, lsr #16
+
+  or_shift_and in0, C3, 8        // four spreading rounds (8, 4, 2, 1) move bit i of each
+  or_shift_and in1, C3, 8        // halfword to bit 2i with zeros in between, which is
+  or_shift_and in2, C3, 8        // the square of a polynomial with coefficients in GF(2)
+  or_shift_and in3, C3, 8
+  or_shift_and in4, C3, 8
+  or_shift_and in5, C3, 8
+  or_shift_and in6, C3, 8
+  or_shift_and in7, C3, 8
+
+  or_shift_and in0, C2, 4
+  or_shift_and in1, C2, 4
+  or_shift_and in2, C2, 4
+  or_shift_and in3, C2, 4
+  or_shift_and in4, C2, 4
+  or_shift_and in5, C2, 4
+  or_shift_and in6, C2, 4
+  or_shift_and in7, C2, 4
+
+  or_shift_and in0, C1, 2
+  or_shift_and in1, C1, 2
+  or_shift_and in2, C1, 2
+  or_shift_and in3, C1, 2
+  or_shift_and in4, C1, 2
+  or_shift_and in5, C1, 2
+  or_shift_and in6, C1, 2
+  or_shift_and in7, C1, 2
+
+  or_shift_and in0, C0, 1
+  or_shift_and in1, C0, 1
+  or_shift_and in2, C0, 1
+  or_shift_and in3, C0, 1
+  or_shift_and in4, C0, 1
+  or_shift_and in5, C0, 1
+  or_shift_and in6, C0, 1
+  or_shift_and in7, C0, 1
+
+  // Reduction: the upper 128 bits are folded back with shifts 0, 1, 2 and 7.
+  // t = temp[2] ^ ((temp[3] >> 57) ^ (temp[3] >> 62) ^ (temp[3] >> 63));
+  eor.w in4, in4, in7, lsr #25   // 64-bit shift by 57 = upper word shifted by 25
+  eor.w in4, in4, in7, lsr #30
+  eor.w in4, in4, in7, lsr #31
+
+  // c[1] = temp[1] ^ temp[3];
+  eor.w in2, in2, in6
+  eor.w in3, in3, in7
+
+  // c[1] ^= (temp[3] << 7) | (t >> 57);
+  // c[1] ^= (temp[3] << 2) | (t >> 62);
+  // c[1] ^= (temp[3] << 1) | (t >> 63);
+  eor.w in2, in2, in5, lsr #25   // the (t >> ..) parts come from the upper word of t
+  eor.w in2, in2, in5, lsr #30
+  eor.w in2, in2, in5, lsr #31
+
+  eor.w in2, in2, in6, lsl #7    // lower word of temp[3] << {7, 2, 1}
+  eor.w in2, in2, in6, lsl #2
+  eor.w in2, in2, in6, lsl #1
+
+  eor.w in3, in3, in6, lsr #25   // bits carried from the lower into the upper word
+  eor.w in3, in3, in6, lsr #30
+  eor.w in3, in3, in6, lsr #31
+
+  eor.w in3, in3, in7, lsl #7    // upper word of temp[3] << {7, 2, 1}
+  eor.w in3, in3, in7, lsl #2
+  eor.w in3, in3, in7, lsl #1
+
+  // c[0] = temp[0] ^ t;
+  eor.w in0, in0, in4
+  eor.w in1, in1, in5
+
+  // c[0] ^= (t << 7);
+  // c[0] ^= (t << 2);
+  // c[0] ^= (t << 1);
+  eor.w in0, in0, in4, lsl #7
+  eor.w in0, in0, in4, lsl #2
+  eor.w in0, in0, in4, lsl #1
+
+  eor.w in1, in1, in4, lsr #25   // bits carried from the lower into the upper word of t
+  eor.w in1, in1, in4, lsr #30
+  eor.w in1, in1, in4, lsr #31
+
+  eor.w in1, in1, in5, lsl #7
+  eor.w in1, in1, in5, lsl #2
+  eor.w in1, in1, in5, lsl #1
+
+  str.w in0, [out_p, #0 * width] // c[0]
+  str.w in1, [out_p, #1 * width]
+  str.w in2, [out_p, #2 * width] // c[1]
+  str.w in3, [out_p, #3 * width]
+
+  pop.w {R4-R10, pc}             // restores the saved registers and returns
+
+  // unlink register name (in0..in7 are left defined; nothing later in this file reuses them)
+  .unreq in_p
+  .unreq out_p
+
+.macro lut_access0_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset  // first lookup: starts the accumulator in0_0..in0_2
+  and \sp1, \mask, \b0_1, lsr #\offset  // byte offset of the table entry chosen by b0_1
+  and \sp0, \mask, \b0_0, lsr #\offset  // same for b0_0
+  add \sp1, \sp1, sp                    // the table sits at the current stack pointer
+  add \sp0, \sp0, sp
+
+  ldr \out1_0, [\sp1, #0]               // b0_1 entry: low word, folded into in0_1 below
+  ldr \in0_2, [\sp1, #4]                // b0_1 entry: high word goes straight into in0_2
+
+  ldr \in0_0, [\sp0, #0]                // b0_0 entry becomes in0_0 and in0_1
+  ldr \in0_1, [\sp0, #4]
+  eor \in0_1, \in0_1, \out1_0           // b0_1 entry enters one word (32 bits) higher
+.endm
+
+.macro lut_access0_1_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset  // second lookup: first one that writes in0_3
+  and \sp1, \mask, \b0_1, lsr #\offset  // table byte offsets for this step
+  and \sp0, \mask, \b0_0, lsr #\offset
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+
+  lsr \in0_3, \in0_2, #28               // in0_3 is created from the bits shifted out of in0_2
+  ldr \out1_0, [\sp1, #0]
+  ldr \out1_1, [\sp1, #4]
+
+  lsl \in0_2, \in0_2, #4                // accumulator (in0_0..in0_3) <<= 4, top word first
+  ldr \out0_0, [\sp0, #0]
+  ldr \out0_1, [\sp0, #4]
+
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  lsl \in0_1, \in0_1, #4
+  orr \in0_1, \in0_1, \in0_0, lsr #28
+  lsl \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0           // b0_1 entry is added one word higher ...
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0           // ... than the b0_0 entry
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro lut_access0_1 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset  // general step: accumulator <<= 4, then add two table entries
+  and \sp1, \mask, \b0_1, lsr #\offset  // table byte offsets for this step
+  and \sp0, \mask, \b0_0, lsr #\offset
+  lsl \in0_3, \in0_3, #4                // unlike lut_access0_1_0, in0_3 already holds data
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+
+  orr \in0_3, \in0_3, \in0_2, lsr #28
+  ldr \out1_0, [\sp1, #0]
+  ldr \out1_1, [\sp1, #4]
+
+  lsl \in0_2, \in0_2, #4
+  ldr \out0_0, [\sp0, #0]
+  ldr \out0_1, [\sp0, #4]
+
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  lsl \in0_1, \in0_1, #4
+  orr \in0_1, \in0_1, \in0_0, lsr #28
+  lsl \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0           // b0_1 entry is added one word higher ...
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0           // ... than the b0_0 entry
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro lut_access1 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset  // like lut_access0_1, but the index is formed with a left shift
+  lsl \in0_3, \in0_3, #4
+  and \sp1, \mask, \b0_1, lsl #\offset  // lsl instead of lsr: used for the lowest index bits
+  orr \in0_3, \in0_3, \in0_2, lsr #28
+  and \sp0, \mask, \b0_0, lsl #\offset
+
+  lsl \in0_2, \in0_2, #4
+  add \sp1, \sp1, sp
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  add \sp0, \sp0, sp
+
+  lsl   \in0_1, \in0_1, #4
+  ldmia \sp1, {\out1_0-\out1_1}         // both words of an entry in one load-multiple
+  orr   \in0_1, \in0_1, \in0_0, lsr #28
+  ldmia \sp0, {\out0_0-\out0_1}
+  lsl   \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0           // b0_1 entry is added one word higher ...
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0           // ... than the b0_0 entry
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, offset  // if bit offset of r0_ret is set, XOR (b0_1:b0_0) << offset into in0_1..in0_3
+  and \mask, \one, \r0_ret, lsr #\offset   // isolate the tested bit (one must hold 1)
+  sub \mask, \zero, \mask                  // 0 - bit: all ones if set, else zero (zero must hold 0)
+  and \mask0_1, \b0_0, \mask               // masked copy of the 64-bit operand
+  and \mask0_2, \b0_1, \mask
+
+  lsl \mask0_0, \mask0_1, #\offset         // shift the masked pair left by offset into three words
+  lsr \mask0_1, \mask0_1, #32 - \offset
+  orr \mask0_1, \mask0_1, \mask0_2, lsl #\offset
+  lsr \mask0_2, \mask0_2, #32 - \offset
+
+  eor \in0_1, \in0_1, \mask0_0             // added starting at in0_1, i.e. one word up
+  eor \in0_2, \in0_2, \mask0_1
+  eor \in0_3, \in0_3, \mask0_2
+.endm
+
+.macro last_mask1 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, offset  // last_mask0 without the AND; the one argument is unused
+  sub \mask, \zero, \r0_ret, lsr #\offset  // equals the 0 / all-ones mask only when the shift leaves one bit (offset 31)
+  and \mask0_1, \b0_0, \mask               // masked copy of the 64-bit operand
+  and \mask0_2, \b0_1, \mask
+
+  lsl \mask0_0, \mask0_1, #\offset         // shift the masked pair left by offset into three words
+  lsr \mask0_1, \mask0_1, #32 - \offset
+  orr \mask0_1, \mask0_1, \mask0_2, lsl #\offset
+  lsr \mask0_2, \mask0_2, #32 - \offset
+
+  eor \in0_1, \in0_1, \mask0_0             // added starting at in0_1, i.e. one word up
+  eor \in0_2, \in0_2, \mask0_1
+  eor \in0_3, \in0_3, \mask0_2
+.endm
+
+.global AIMER_NAMESPACE(poly64_mul)
+.type   AIMER_NAMESPACE(poly64_mul), %function
+AIMER_NAMESPACE(poly64_mul):  // 64 x 64 -> 128 bit carry-less product: a at [R0], b at [R1], upper half to [R2], lower half to [R3]
+  t0_0    .req R0             // tK_0 / tK_1: low / high word of a table entry while the table is built
+  t0_1    .req R1
+  t1_0    .req R2
+  t1_1    .req R3
+  t2_0    .req R4
+  t2_1    .req R5
+  t3_0    .req R6
+  t3_1    .req R7
+  t4_0    .req R8
+  t4_1    .req R9
+  t5_0    .req R10
+  t5_1    .req R11
+
+  r1_copy .req R12            // keeps the b pointer while R1 is reused as t0_1
+  t_base  .req R14            // write cursor into the stack table
+
+  sp0     .req R12            // lookup phase: address of the entry selected by b0_0
+  sp1     .req R14            // lookup phase: address of the entry selected by b0_1
+
+  b0_0    .req R0             // low word of b
+  b0_1    .req R1             // high word of b
+
+  in0_0   .req R2             // 128-bit accumulator, in0_0 is the lowest word
+  in0_1   .req R3
+  in0_2   .req R4
+  in0_3   .req R5
+
+  out0_0  .req R6             // table entry loaded for b0_0
+  out0_1  .req R7
+  out1_0  .req R8             // table entry loaded for b0_1
+  out1_1  .req R9
+
+  mask    .req R10            // 0x78 during lookups, then the 0 / all-ones mask
+
+  zero    .req R6
+  one     .req R7
+
+  r0_ret  .req R8             // original high word of a, reloaded from the stack
+
+  mask0_0 .req R9
+  mask0_1 .req R11
+  mask0_2 .req R12
+
+  push  {R4-R11, lr}
+  push  {R2-R3}               // the two output pointers, read back at the end
+
+  ldr   t1_0, [R0, #0]        // a, low then high word
+  ldr   t1_1, [R0, #4]
+  push  {t1_1}                // unmasked high word of a, needed for its top three bits
+
+  sub   sp, #128  // allocating space in the stack
+                  // 8 bytes * 16 = 128 bytes
+  mov   t_base, sp
+  mov   r1_copy, R1
+
+  mov   t0_0, #0              // entry 0 of the table is zero
+  mov   t0_1, #0
+
+  and   t1_1, #0x1FFFFFFF     // entry 1: a with its top three bits cleared, so later shifts by up to 3 fit in 64 bits
+
+  lsl   t2_1, t1_1, #1        // entry 2 = entry 1 << 1
+  orr   t2_1, t2_1, t1_0, lsr #31
+  lsl   t2_0, t1_0, #1
+
+  eor   t3_0, t1_0, t2_0      // entry 3 = entry 1 ^ entry 2
+  eor   t3_1, t1_1, t2_1
+
+  lsl   t4_1, t2_1, #1        // entry 4 = entry 2 << 1
+  orr   t4_1, t4_1, t2_0, lsr #31
+  lsl   t4_0, t2_0, #1
+
+  eor   t5_0, t1_0, t4_0      // entry 5 = entry 1 ^ entry 4
+  eor   t5_1, t1_1, t4_1
+
+  stmia t_base!, {t0_0-t5_1}  // 4 bytes X 12 elements  = 48 bytes
+
+  eor   t0_0, t2_0, t4_0      // entry 6 = entry 2 ^ entry 4
+  eor   t0_1, t2_1, t4_1
+
+  lsl   t2_1, t4_1, #1        // t2 = entry 8 = entry 4 << 1
+  orr   t2_1, t2_1, t4_0, lsr #31
+  lsl   t2_0, t4_0, #1
+
+  eor   t5_0, t2_0, t3_0      // t5 = entry 11 = 8 ^ 3
+  eor   t5_1, t2_1, t3_1
+
+  eor   t3_0, t2_0, t1_0      // t3 = entry 9 = 8 ^ 1
+  eor   t3_1, t2_1, t1_1
+
+  eor   t4_0, t4_0, t0_0      // t4 = 4 ^ 6 = 2 ...
+  eor   t4_1, t4_1, t0_1
+
+  eor   t4_0, t4_0, t2_0      // ... then 2 ^ 8 = entry 10
+  eor   t4_1, t4_1, t2_1
+
+  eor   t1_0, t1_0, t0_0      // t1 = entry 7 = 1 ^ 6
+  eor   t1_1, t1_1, t0_1
+
+  stmia t_base!, {t0_0-t5_1}  // 4 bytes X 12 elements  = 48 bytes
+
+  eor   t1_0, t5_0, t0_0      // t1 = entry 13 = 11 ^ 6
+  eor   t1_1, t5_1, t0_1
+
+  eor   t2_0, t2_0, t0_0      // t2 = entry 14 = 8 ^ 6
+  eor   t2_1, t2_1, t0_1
+
+  eor   t3_0, t3_0, t0_0      // t3 = entry 15 = 9 ^ 6
+  eor   t3_1, t3_1, t0_1
+
+  eor   t0_0, t4_0, t0_0      // t0 = entry 12 = 10 ^ 6 (done last because t0 is an input above)
+  eor   t0_1, t4_1, t0_1
+
+  stmia t_base, {t0_0-t3_1}  // 4 bytes X 8 elements  = 32 bytes
+
+  ldmia r1_copy, {b0_0-b0_1}  // load b; R0 and R1 now hold data, not pointers
+  mov   mask, #0x00000078     // a 4-bit index already multiplied by the 8-byte entry size
+
+  lut_access0_0   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 25
+  lut_access0_1_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 21
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 17
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 13
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 9
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 5
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 1
+  lut_access1     sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 3
+
+  mov   zero, #0
+  mov   one,  #1
+  ldr   r0_ret, [sp, #128]    // the high word of a pushed before the table was allocated
+
+  last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 29
+  last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 30
+  last_mask1 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 31
+
+  ldr   R0, [sp, #132]        // caller R2
+  ldr   R1, [sp, #136]        // caller R3
+  add   sp, #140  // restoring stack
+
+  str   in0_0, [R1, #0]       // lower 64 bits of the product go to the caller R3 pointer
+  str   in0_1, [R1, #4]
+
+  str   in0_2, [R0, #0]       // upper 64 bits go to the caller R2 pointer
+  str   in0_3, [R0, #4]
+
+  pop   {R4-R11, pc}
+
+  // unlink register name
+  .unreq t0_0
+  .unreq t0_1
+  .unreq t1_0
+  .unreq t1_1
+  .unreq t2_0
+  .unreq t2_1
+  .unreq t3_0
+  .unreq t3_1
+  .unreq t4_0
+  .unreq t4_1
+  .unreq t5_0
+  .unreq t5_1
+
+  .unreq r1_copy
+  .unreq t_base
+
+  .unreq sp0
+  .unreq sp1
+
+  .unreq b0_0
+  .unreq b0_1
+
+  .unreq in0_0
+  .unreq in0_1
+  .unreq in0_2
+  .unreq in0_3
+
+  .unreq out0_0
+  .unreq out0_1
+  .unreq out1_0
+  .unreq out1_1
+
+  .unreq mask
+
+  .unreq zero
+  .unreq one
+
+  .unreq r0_ret
+
+  .unreq mask0_0
+  .unreq mask0_1
+  .unreq mask0_2
diff --git a/crypto_sign/aimer128s/m4stack/aim2.c b/crypto_sign/aimer128s/m4stack/aim2.c
new file mode 100644
index 00000000..7cb00352
--- /dev/null
+++ b/crypto_sign/aimer128s/m4stack/aim2.c
@@ -0,0 +1,459 @@
+// SPDX-License-Identifier: MIT
+
+#include "aim2.h"
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+// Inverse Mersenne S-box for e1 = 49: out = in ^ d, where
+// d = (2 ^ 49 - 1) ^ (-1) mod (2 ^ 128 - 1) = 0xb6b6d6d6dadb5b5b6b6b6d6dadadb5b5
+// is consumed in the chunks b6b6d6d 6 d a d b5 b5 b6 b6b6d6d a d a d b5 b5
+void GF_exp_invmer_e_1(GF out, const GF in)
+{
+  size_t i;
+  GF t1 = {0,}, t2 = {0,};
+  GF table_5 = {0,}, table_6 = {0,};
+  GF table_a = {0,}, table_b = {0,}, table_d = {0,};
+
+  // table_d = in ^ 2 (scratch use until in ^ 13 is formed), t1 = in ^ 4
+  GF_sqr_s(table_d, in);
+  GF_sqr_s(t1, table_d);
+
+  // table_5 = in ^ 5
+  GF_mul_s(table_5, t1, in);
+  // table_6 = in ^ 6
+  GF_mul_s(table_6, table_5, in);
+  // table_a = in ^ 10 = (in ^ 5) ^ 2
+  GF_sqr_s(table_a, table_5);
+  // table_b = in ^ 11
+  GF_mul_s(table_b, table_a, in);
+  // table_d = in ^ 13 = (in ^ 11) * (in ^ 2)
+  GF_mul_s(table_d, table_b, table_d);
+
+  // t1 = in ^ (0xb0); from here table_b = in ^ (0xb6), table_5 = in ^ (0xb5)
+  GF_sqr_s(t1, table_b);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(table_b, t1, table_6);
+  GF_mul_s(table_5, t1, table_5);
+
+  // t1 = in ^ (0xb6b6): eight squarings in total, then multiply by in ^ (0xb6)
+  GF_sqr_s(t1, table_b);
+  for (i = 1; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xb6b6 d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6b6d 6)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_6);
+
+  // t2 = in ^ (0xb6b6d6 d), kept because the chunk b6b6d6d occurs again below
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t2, t1, table_d);
+
+  // t1 = in ^ (0xb6b6d6d 6)
+  GF_sqr_s(t1, t2);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_6);
+
+  // t1 = in ^ (0xb6b6d6d6 d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6b6d6d6d a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xb6b6d6d6da d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6b6d6d6dad b5)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5 b5)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5 b6)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5b6 b6b6d6d): 28 squarings for 7 hex digits
+  for (i = 0; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5b6b6b6d6d a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5b6b6b6d6da d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5b6b6b6d6dad a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5b6b6b6d6dada d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6b6d6d6dadb5b5b6b6b6d6dadad b5)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_5);
+
+  // out = in ^ (0xb6b6d6d6dadb5b5b6b6b6d6dadadb5 b5); out is written only here
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(out, t1, table_5);
+}
+
+// Inverse Mersenne S-box for e2 = 91: out = in ^ d, where
+// d = (2 ^ 91 - 1) ^ (-1) mod (2 ^ 128 - 1) = 0xb6db5b6dadb6dadb6d6db6d6db6b6db5
+// is consumed in the chunks b6d b5 b6d a d b6d a d b6d 6 d b6d 6 d b6 b6d b5
+void GF_exp_invmer_e_2(GF out, const GF in)
+{
+  size_t i;
+  GF t1 = {0,}, t2 = {0,}, t3 = {0,};
+  GF table_5 = {0,}, table_6 = {0,};
+  GF table_a = {0,}, table_b = {0,}, table_d = {0,};
+
+  // table_d = in ^ 2 (scratch use until in ^ 13 is formed), t1 = in ^ 4
+  GF_sqr_s(table_d, in);
+  GF_sqr_s(t1, table_d);
+
+  // table_5 = in ^ 5
+  GF_mul_s(table_5, t1, in);
+  // table_6 = in ^ 6
+  GF_mul_s(table_6, table_5, in);
+  // table_a = in ^ 10 = (in ^ 5) ^ 2
+  GF_sqr_s(table_a, table_5);
+  // table_b = in ^ 11
+  GF_mul_s(table_b, table_a, in);
+  // table_d = in ^ 13 = (in ^ 11) * (in ^ 2)
+  GF_mul_s(table_d, table_b, table_d);
+
+  // t1 = in ^ (0xb0); table_b is reused for in ^ (0xb5), t3 = in ^ (0xb6)
+  GF_sqr_s(t1, table_b);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(table_b, t1, table_5);
+  GF_mul_s(t3, t1, table_6);
+
+  // t2 = in ^ (0xb6 d), kept because the chunk b6d is multiplied in six times below
+  GF_sqr_s(t1, t3);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t2, t1, table_d);
+
+  // t1 = in ^ (0xb6d b5): eight squarings in total, then multiply by in ^ (0xb5)
+  GF_sqr_s(t1, t2);
+  for (i = 1; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xb6db5 b6d): 12 squarings for 3 hex digits
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xb6db5b6d a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xb6db5b6da d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6db5b6dad b6d)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xb6db5b6dadb6d a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xb6db5b6dadb6da d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6db5b6dadb6dad b6d)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d 6)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_6);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d6 d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d6d b6d)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d6db6d 6)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_6);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d6db6d6 d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d6db6d6d b6)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t3);
+
+  // t1 = in ^ (0xb6db5b6dadb6dadb6d6db6d6db6 b6d)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // out = in ^ (0xb6db5b6dadb6dadb6d6db6d6db6b6d b5); out is written only here
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(out, t1, table_b);
+}
+
+// Mersenne S-box with e_star = 3: out = in ^ (2 ^ 3 - 1) = in ^ 7
+void GF_exp_mer_e_star(GF out, const GF in)
+{
+  GF acc = {0,};
+
+  // acc = in ^ 3: square once, then multiply by in
+  GF_sqr_s(acc, in);
+  GF_mul_s(acc, acc, in);
+
+  // out = in ^ 7: square in ^ 3 to in ^ 6, then one more factor of in
+  GF_sqr_s(acc, acc);
+  GF_mul_s(out, acc, in);
+}
+
+void generate_matrices_L_and_U(
+        GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF vector_b,
+        const uint8_t iv[AIM2_IV_SIZE])
+{
+  hash_instance xof;
+  uint8_t raw[AIM2_NUM_BYTES_FIELD];
+  GF sample = {0,};
+
+  // Seed the hash with the IV; everything below is squeezed from it.
+  hash_init(&xof);
+  hash_update(&xof, iv, AIM2_IV_SIZE);
+  hash_final(&xof);
+
+  for (size_t box = 0; box < AIM2_NUM_INPUT_SBOX; box++)
+  {
+    for (size_t r = 0; r < AIM2_NUM_BITS_FIELD; r++)
+    {
+      // One fresh field element per row; L and U split it at bit r.
+      hash_squeeze(&xof, raw, AIM2_NUM_BYTES_FIELD);
+      GF_from_bytes(sample, raw);
+      const size_t diag_word = r / 64;
+      const uint64_t diag_bit = ((uint64_t)1) << (r % 64);
+      const uint64_t from_diag = ((uint64_t)-1) << (r % 64);
+
+      for (size_t w = 0; w < AIM2_NUM_WORDS_FIELD; w++)
+      {
+        // Defaults describe the diagonal word: L keeps bits from the
+        // diagonal upwards, U the bits below it, both get the forced 1.
+        uint64_t keep_L = from_diag;
+        uint64_t keep_U = ~from_diag;
+        uint64_t forced = diag_bit;
+        if (w != diag_word)
+        {
+          // words before the diagonal go to U only, words after it to L only
+          keep_L = (w > diag_word) ? ~(uint64_t)0 : 0;
+          keep_U = ~keep_L;
+          forced = 0;
+        }
+        matrix_L[box][r][w] = (sample[w] & keep_L) | forced;
+        matrix_U[box][r][w] = (sample[w] & keep_U) | forced;
+      }
+    }
+  }
+  // vector_b takes the next squeezed bytes directly, without GF_from_bytes.
+  hash_squeeze(&xof, (uint8_t *)vector_b, AIM2_NUM_BYTES_FIELD);
+  hash_ctx_release(&xof);
+}
+
+void generate_matrix_LU(GF matrix_A[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+                        GF vector_b,
+                        const uint8_t iv[AIM2_IV_SIZE])
+{
+  GF lower[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF upper[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  size_t box, row;
+  // Sample both factors (and vector_b) from the IV, then combine them row by row.
+  generate_matrices_L_and_U(lower, upper, vector_b, iv);
+
+  for (box = 0; box < AIM2_NUM_INPUT_SBOX; box++)
+  {
+    for (row = 0; row < AIM2_NUM_BITS_FIELD; row++)
+    {
+      GF_transposed_matmul(matrix_A[box][row], upper[box][row], (const GF *)lower[box]);
+    }
+  }
+}
+
+void aim2(uint8_t ct[AIM2_NUM_BYTES_FIELD],
+          const uint8_t pt[AIM2_NUM_BYTES_FIELD],
+          const uint8_t iv[AIM2_IV_SIZE])
+{
+  GF lower[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF upper[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF offset = {0,};
+  GF branch[AIM2_NUM_INPUT_SBOX];
+  GF x = {0,};
+  GF y = {0,};
+
+  // Load the plaintext block as a field element.
+  GF_from_bytes(x, pt);
+
+  // Derive the per-IV matrices and the affine offset.
+  generate_matrices_L_and_U(lower, upper, offset, iv);
+
+  // Each branch starts as plaintext + its round constant ...
+  GF_add(branch[0], x, aim2_constants[0]);
+  GF_add(branch[1], x, aim2_constants[1]);
+
+  // ... and then goes through its own inverse Mersenne S-box.
+  GF_exp_invmer_e_1(branch[0], branch[0]);
+  GF_exp_invmer_e_2(branch[1], branch[1]);
+
+  // Affine layer: U then L on each branch, then the sum plus the offset.
+  for (size_t k = 0; k < 2; k++)
+  {
+    GF_transposed_matmul(branch[k], branch[k], (const GF *)upper[k]);
+    GF_transposed_matmul(branch[k], branch[k], (const GF *)lower[k]);
+  }
+  GF_add(branch[0], branch[0], branch[1]);
+  GF_add(branch[0], branch[0], offset);
+
+  // Final Mersenne S-box on the merged state.
+  GF_exp_mer_e_star(branch[0], branch[0]);
+
+  // Feed-forward: add the plaintext back, then serialize.
+  GF_add(y, branch[0], x);
+  GF_to_bytes(ct, y);
+}
+
+void aim2_sbox_outputs(GF sbox_outputs[AIM2_NUM_INPUT_SBOX], const GF pt)
+{
+  // Branch i starts as pt + aim2_constants[i]; only branches 0 and 1 are written.
+  GF_add(sbox_outputs[0], pt, aim2_constants[0]);
+  GF_add(sbox_outputs[1], pt, aim2_constants[1]);
+
+  // Each branch is then replaced in place by its inverse Mersenne S-box output.
+  GF_exp_invmer_e_1(sbox_outputs[0], sbox_outputs[0]);
+  GF_exp_invmer_e_2(sbox_outputs[1], sbox_outputs[1]);
+}
diff --git a/crypto_sign/aimer128s/m4stack/aim2.h b/crypto_sign/aimer128s/m4stack/aim2.h
new file mode 100644
index 00000000..5564fc71
--- /dev/null
+++ b/crypto_sign/aimer128s/m4stack/aim2.h
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef AIM2_H
+#define AIM2_H
+
+#include "field.h"
+#include "params.h"
+#include <stdint.h>
+// Constants added to the plaintext before each input S-box; the words are hex digits of the fractional part of pi.
+static const GF aim2_constants[AIM2_NUM_INPUT_SBOX] =
+{
+  {0x13198a2e03707344, 0x243f6a8885a308d3}, // pi digits 243f6a88 85a308d3 13198a2e 03707344 (word [1] first)
+  {0x082efa98ec4e6c89, 0xa4093822299f31d0}, // pi digits a4093822 299f31d0 082efa98 ec4e6c89 (word [1] first)
+};
+// Table of AIM2_NUM_BITS_FIELD field elements, one per row, each stored as two 64-bit words.
+static const GF aim2_e1_power_matrix[AIM2_NUM_BITS_FIELD] =
+{ // NOTE(review): presumably row i is the image of x^i under the GF(2)-linear power map tied to exponent e1 - confirm at the users
+  {0x0000000000000001, 0x0000000000000000}, // row 0: word [0] = 1, word [1] = 0
+  {0xb87c1159421de6c0, 0xfbcf8c1e442c8cf5},
+  {0x687634c0bd8f66a6, 0x4d328e5ae8b1bde5},
+  {0x742a6036d93c2057, 0x08974511b147a2fe},
+  {0xc8b21bf16608e4db, 0x4d758c29eeb484f7},
+  {0x0b5c6d5c43980a3c, 0x82739c986dfbdb20},
+  {0x0ace7f98da3711b9, 0x34f149a76cf782b0},
+  {0x321995ec53ea9914, 0xc2ff5007f8a98c83},
+  {0x939b53119c4b7496, 0x097da6d2e8f7686d},
+  {0x5fb6dd3ca90cff95, 0x10f77bb9e7748ed3},
+  {0x55194932141d0937, 0xc253f8ea7ac0779a},
+  {0xb2a4b4591251916b, 0xdfef8e3e1b142c07},
+  {0x14df24dfc33e1f4f, 0x931f7bdb443197a1},
+  {0xbd4cbe8b919dbb07, 0x24128da6bf057bc8},
+  {0x1be6a922a8d0d7d4, 0xb7330162b6115e90},
+  {0xb6d9e6635ec916aa, 0x930f20cea1c668e0},
+  {0xccbb31a458da0423, 0x60488351c7403436},
+  {0xef86b4dbc4263e4d, 0x9237f55823767eae},
+  {0xe2a0e301bed0748a, 0x967e64f599297f3c},
+  {0x2fde9314f05105e5, 0x58f5315e0e29e358},
+  {0xc9e5b15be18b7596, 0xa305f4f11aaa8ad2},
+  {0xa592cb3563071925, 0x31b050cca997ed24},
+  {0xa55f9e7374b10af1, 0x5904c31aaebea1ed},
+  {0xcf6921d88d12bbf2, 0xea5142776b77d368},
+  {0x28779ef24c9ddcb5, 0x448bfd74cc624506},
+  {0x0d2caf1924759d9a, 0xc66ef14828e98e80},
+  {0x312a49ac8d3790db, 0x5121956dac40960a},
+  {0x311230a0f0166f37, 0x41cdda4642d1e45a},
+  {0x152cd68f8d980779, 0x50accd8f44cc6a3b},
+  {0x0e6342e6e178a202, 0xaf2e59b6e13fec01},
+  {0xcdfaea274cfff823, 0x008f7a68483d8f8a},
+  {0x80183f4571309485, 0xdece92499f9521ba},
+  {0xaba321469362905f, 0x3c5814a4c792b3be},
+  {0x7e8680766e1d3ffc, 0x7585a167f0b843b8},
+  {0x4e81e572c5dbf79a, 0x114bd1d466ef8787},
+  {0x3a7e0a403a1da600, 0x014747267c0b38f9},
+  {0x23116c4dd539e293, 0x196284a6305e23d7},
+  {0xf0a02f00d5a45c0f, 0xae9980fc3aa3cd2e},
+  {0x7eae2c6dae8286e9, 0xd2be72a1da8addbc},
+  {0xbb8689cb630a9e23, 0x2d1eb9e86163e7f4},
+  {0xf0febfb8f6e46561, 0x8eda5ccd665a3ac2},
+  {0x370a6880719f8be9, 0x83fe14fe68c33df0},
+  {0xe9634dd58474116f, 0xdfb51a0ca76c9c82},
+  {0x9c40da32ca69fe52, 0xcecdf64c8559eef4},
+  {0xe29f358edce8d40e, 0x9256190cf3cfb1fa},
+  {0xb5431f672597e9cd, 0xc69025ae5a99210a},
+  {0x0f00e0c670d40d95, 0xdf81e3ce7617b0a1},
+  {0x699332d099ea38d7, 0xc24d5671c235f28e},
+  {0x89ea2f4529a74b45, 0x7c11f6654369b65d},
+  {0xeaa8470e44915e89, 0x049b62170967135b},
+  {0x39fb9877aadc951b, 0xba3743d76fda5083},
+  {0xe2da8722532e6fbb, 0xdef2a5ff6e028aba},
+  {0xb5e975340c6c76a1, 0xf28418e25fbc0144},
+  {0x035ab9363f6882be, 0xab56f227d4a26a26},
+  {0x273536b8b02dd5f1, 0x75af981a11d43e64},
+  {0x846e480a8bc44fa9, 0x507a048207335fa1},
+  {0x3808d8fa4fcba922, 0xf632f1c9c802ab76},
+  {0x34ecb7872eda1962, 0x2dcfedd3c12f73be},
+  {0xf884a540c1b411ea, 0xf77d23a1c6600553},
+  {0x0e106a0239843e3c, 0x7d5ef83763344eed},
+  {0x4192e743be4ae7d9, 0x5070be659c9249dd},
+  {0x6588c07b62dd03ba, 0x09d7b6469e953856},
+  {0x790b4af55db42c92, 0x5c859acd40414177},
+  {0xedda860c739ca8f7, 0xd728f7e92e3e7940},
+  {0xfbcf513b18b860f7, 0xf6fd92c58b52c44d},
+  {0x4f1571762119854e, 0x04286d00eb347197},
+  {0x3f777b9977ed2aa6, 0xf68288c09c8d73d4},
+  {0x538b16a3bd887a20, 0x86437c4cb491c94b},
+  {0x3656d64f9fdf8baf, 0x97db137363bf2a7e},
+  {0x0582fbdad31a1e6c, 0x213b4a759760ffe0},
+  {0xc7f42208feff0a47, 0x05cb6fb77aad0666},
+  {0x8f59c644fd5259d4, 0xd3740dabc91a5ecd},
+  {0xca19d9ef4ab67cee, 0xa2486f3cdc03c63f},
+  {0x8a1f14a7c3d2f88f, 0x71b6e4a0b3d4a2a0},
+  {0xe9ee9aa288652690, 0xa28d2266c47e02b2},
+  {0x759c7eee1a3eead3, 0x689aa81596670031},
+  {0x50a9a3f15e0032ae, 0x206b34f2ed6fc8ff},
+  {0x630774b85c40302e, 0xf7f5952347d531a6},
+  {0x78886ef4e794267b, 0x7072ec9b3a2ddd8f},
+  {0x754c7bf46deec1a2, 0xb360d5ec03ebf053},
+  {0x337080ceace4b67a, 0xbe8541809bccdc7d},
+  {0x8c243c5d486009a0, 0x87fc6f3fbe554f61},
+  {0x58e8f3ccf2596f26, 0xc7a500e89b1b40a6},
+  {0x516a6cbee9e76420, 0xe719cb9a5a49f8ed},
+  {0x96f150816f90c216, 0x484947f2b48d7882},
+  {0xdefb92978dfa0053, 0x58823337d6c0a641},
+  {0x98bbc22dd2d3262b, 0xdad5891c70205c95},
+  {0xbf1d06e5edc7d9ba, 0xea3e0a86c4241c1e},
+  {0x78e2cf480abc18ef, 0x1110bc39a35669cf},
+  {0xc188299c1375e7b2, 0x8eb4cf8cb0851480},
+  {0xd0ec275048c667d2, 0xff5c57071581e3b1},
+  {0x955c8d54a50fdd52, 0xcf79008ac79991d3},
+  {0xf46cdcd85b7289c9, 0x1c5fc0acfab2cbb2},
+  {0x676f48ac3ed3c825, 0x862183d1a9042f4d},
+  {0xf35fc7982c7daee6, 0xa655183af862baae},
+  {0x5335bbcaf8b9f37b, 0x963ed04a2a0b3eaa},
+  {0x76d009714121cb10, 0x82f1d3e8253374ee},
+  {0x50198339f3198270, 0xee023bd013e359f5},
+  {0x315d27ea94c7941a, 0x5c1520117e098dbf},
+  {0x96ccc513ba987df7, 0x7d84bbe2e504ff94},
+  {0x03464584b630d2b7, 0x7d9fc4a633f228f4},
+  {0x7e39cbb756cac943, 0x45a5498048f1a474},
+  {0x56a90669f7aa29c6, 0x4883787b94c90425},
+  {0x9a262b27cb8de6e9, 0x6495beb53f905401},
+  {0xdc5866e0159b2920, 0x6c2c9c31b3faab04},
+  {0x82f93c693fec7b5f, 0x1926807fb1c2bdd5},
+  {0x3a06ca560fda4251, 0xff56ec036c5f13d6},
+  {0xcf96fe4ae095a1c3, 0xaea98fd960fd6b9d},
+  {0xc2ae3b23e1b73447, 0xe7c1f21b63d4e19c},
+  {0x660f92196e62044c, 0xa61e4689ac8893c0},
+  {0x4aacc983cc5d9cfe, 0xb71adc881811c258},
+  {0xb01938e5f92ea2e0, 0x3d4b38fea83810f8},
+  {0x8195527abb10f039, 0x242e99777aeec42a},
+  {0x077a36f6536baf7f, 0x928620c22f148a6d},
+  {0xb4d16665e8f965a6, 0x300ecf50c00b75a7},
+  {0x53d4fbf144350d5c, 0x50967628985e6eaf},
+  {0xea67291009feb48e, 0x74a182255aa9ccae},
+  {0xe67c52e63c97fb3a, 0xbe1b4991d245fa61},
+  {0x6bd8d3685ed38551, 0xc26bdd871e8691e5},
+  {0x267c4e3df39e0a7e, 0x2408058c7b3e3c09},
+  {0x2bc55550057b4b4e, 0xe70baa2724b374d3},
+  {0x0e2984947284c4dd, 0x4f4e64ba26bfee68},
+  {0x78891ea4bacdb828, 0x357f7d8801646f08},
+  {0x220a9cb569d1ee6b, 0x8e6c9653552802fa},
+  {0x6159359f74dda4d7, 0xcbd0c89374b1cc2e},
+  {0x8dd5a4c4fe55c89f, 0xeeca37f94d3f69bd},
+  {0x22abf1f68e0f314a, 0x69b86caf61d48d15},
+  {0xab26c59f1090455d, 0x1a49957d9798f177},
+};
+// Table of AIM2_NUM_BITS_FIELD field elements, one per row, each stored as two 64-bit words.
+static const GF aim2_e2_power_matrix[AIM2_NUM_BITS_FIELD] =
+{ // NOTE(review): presumably row i is the image of x^i under the GF(2)-linear power map tied to exponent e2 - confirm at the users
+  {0x0000000000000001, 0x0000000000000000}, // row 0: word [0] = 1, word [1] = 0
+  {0xf50e0632f2a35f5b, 0x386db41096f62a8a},
+  {0x1843656b2ea8f397, 0xefdb454053648225},
+  {0xfc670d9cf3feeb63, 0x7582326d84c7a1de},
+  {0xf1c52011971b40b3, 0x864204566cee644d},
+  {0x5d8e354c13ae648b, 0x192b28f22b444709},
+  {0x9d5cef9c88eb0d9f, 0xb686d60b99470446},
+  {0xc91fa3a9b726fd99, 0xcf7a6d254a105b09},
+  {0x048e86e374780c55, 0x9f65220d0c78fc67},
+  {0xafa9c90017000acc, 0x83a4540ded360993},
+  {0x3e563c2c6efb6102, 0xb7147f0d38fa394e},
+  {0x858e694ad98264cb, 0x184d72cdc205efdb},
+  {0x260f2eae08292a50, 0x101cdb156939622a},
+  {0x4a9a43781e99484d, 0x8b9b7c41b6c639f6},
+  {0x16c9831c810a7459, 0xcb60c983013050be},
+  {0x96d02af1b8d2cba8, 0xb37b4c2c6ea27c34},
+  {0x3caadfab02ea679d, 0x6c3124a15e087d32},
+  {0xf0892e59955b87ae, 0xaab1aa69ba6853d8},
+  {0x8420916c212205ac, 0x86ed9039af31291c},
+  {0x0610fd444421f178, 0xa6b004a839e31b64},
+  {0xaebf5d9bae4e4ac1, 0x54bf9e6ec57b2d65},
+  {0x28bce750ebcba70e, 0x4ce04f578ca77d4d},
+  {0xe35d48d89312441e, 0xe6d91969fd74895a},
+  {0xcca901ef7fabb1c5, 0x117d2c0c4032a05b},
+  {0x4d05be0c6a5a2edc, 0x8314aecc100fcba9},
+  {0x7c685f4133a51825, 0x9acd72f51105c28b},
+  {0x5011fb2faa2c215a, 0xf33e2515d2bd65e4},
+  {0xcec542879e66d1d0, 0xb35dca22a0c3ce97},
+  {0x40849b4ce23375b2, 0x92453c68d163c3cb},
+  {0x807af8ab827e3617, 0x9aa0b258c13e1db7},
+  {0x02cf8f1292f7c659, 0x188599535df660bb},
+  {0x675c7dfe865c4b21, 0x60e7e01162356b69},
+  {0xdca8758ed620dd7b, 0x40e2dfc1450698ca},
+  {0xd4785af596fd0c85, 0x194dcdf10572a8d6},
+  {0x39c75c8db5a743fd, 0xaaff1be5fb825c25},
+  {0x76f287eaaf80a26e, 0x6d5c3d924e633b50},
+  {0xf3289f813d56fa87, 0x8a5781160603ae34},
+  {0x023097d7bf57b560, 0x5c09da41ceda1dab},
+  {0xaa7caa9af1506059, 0xd65b5a005d02edd2},
+  {0x837d13e5bec17d5c, 0x96732ad7e569d594},
+  {0xfca9d80d257930ce, 0xbb07355f7df706c0},
+  {0x7e719f925352363e, 0x61f17c3d17da7386},
+  {0xbdd686a4862a5d5b, 0x5ddbe9580f36ecca},
+  {0xcd8440580a8cb347, 0x8b395b802547e6d8},
+  {0x4338e255f15fc0d9, 0xf2400716b60d1c2c},
+  {0xc0a4a5181cf7a401, 0x208e7b27a3d4e578},
+  {0x6557dd7a9909844a, 0xd7dd867435b17ded},
+  {0xe7214501f52038cd, 0xf73bfe485cf7fdd0},
+  {0x93443a46972cbc70, 0xd2ca8f42b2d199e0},
+  {0xbea25cda0a9de799, 0x51886f07950aef32},
+  {0x82824ccfb37df72d, 0x71a58d7df86233f6},
+  {0x0ab442c2423ac6e3, 0x5d989eeb2df819bb},
+  {0x717b766d60dda065, 0x3899b1af41b28b8b},
+  {0x2fffad98c8e94310, 0x9ff893980c381280},
+  {0x9d7da6a6ca8c0d82, 0x09c78e0f83da5e2a},
+  {0x26b7e85d55753566, 0x48b0fee439062128},
+  {0x63896bb7a7a3c638, 0x551438e5f3ff5db9},
+  {0x080d9af5ef2e5865, 0x048eccc1b914ae50},
+  {0xf081a5f8ab004099, 0x24ffc9670c5492ac},
+  {0x7e4178c2bf375b5a, 0xa641e4982d1c8638},
+  {0x9f1874733c37691a, 0xa6e59883261af497},
+  {0x90068f05a814992d, 0x8f340c2ecb9a2bd2},
+  {0x2e0a82ad5f144c70, 0x783eb790b951d2d9},
+  {0xbf58c52a82e24af6, 0x190f49c97cd133af},
+  {0x29e30e4d37b882a1, 0x217bea750913f0db},
+  {0xfe2287c403984038, 0x870bd9dd397e696e},
+  {0x49e9bc6efdb97d7d, 0xf75f4c5e88587e96},
+  {0xa6223b70299d2836, 0xf27661ea227ab61b},
+  {0x4d6b8601ceb750cf, 0xfab6503eb520e48d},
+  {0xcaf2dd4a73f67c6c, 0x93f3baaf44fed4e0},
+  {0x1ff32e99fc57e662, 0x502b8bb6f2031150},
+  {0x1d8b5656e3d694dc, 0xb31de0d77f80372b},
+  {0x0f3d13aca2eac302, 0xb6d1f98a81d2cd6d},
+  {0x840d8615c90887b8, 0x1d44fc5efe63c574},
+  {0xded005c9eb05ef63, 0xdeb4246e55c121bb},
+  {0x3409b8d1c43c2415, 0x700c0d1dc307fe8d},
+  {0x8b361337911e3002, 0x7920c1039098414c},
+  {0xa5dddabdd1beecf5, 0x146aaf12b0d6da5e},
+  {0xe1a91d6f2a874e47, 0x0d63fcc83ef069d1},
+  {0x0ffd9177c1f3ebb2, 0x9a0cadce706c0cc0},
+  {0xc60d34aa0f45f13d, 0x2d0b4ea8c2bfdc70},
+  {0x83e36503d6399610, 0x6014c0c7cba2d2f2},
+  {0x9cc705d2ecaeca0e, 0x79f83e8c83e7f333},
+  {0x58c7035772444ccc, 0x789c6687005b995e},
+  {0x6b3d950394c886a1, 0x9b4f4564cd5b92d0},
+  {0x872c7f29b6dae6ca, 0xd2a320a97a0d0be9},
+  {0x14bb3a90e34016b2, 0xb308fa5fc47ad142},
+  {0xc6b31a14ce574546, 0xd7f758f96323f56e},
+  {0x046d3862feb271a0, 0x391175405eef9c5e},
+  {0xf7654c3e98aff433, 0x92b8d607c0180e5e},
+  {0xdfe26a4ee0edcbcf, 0x4c21afb68c481788},
+  {0xb9175aa38699a7fa, 0xa26d3569fb705b0a},
+  {0xd2955bcc820c812c, 0x29d30f039b37f636},
+  {0x37d8c59743ebdc8d, 0x19289d7baab847bc},
+  {0x8b8a25c0075e7200, 0x75fcbc7110b551c9},
+  {0x8ab2318dd48eb686, 0xca8ee9edf4a5a1e0},
+  {0x182033f6233cad5f, 0x743083edee67622b},
+  {0xc82b0364e7db3d93, 0x3cec89a9bc59587d},
+  {0x4fb362a6d33cdc65, 0xb2f2a5ce567b5b8e},
+  {0x90df4043911d6152, 0xfe9e1ef68cc145b2},
+  {0x4fcf7b4fcca5200b, 0xaba094d2f96d9249},
+  {0x5ac887c31fc3fd76, 0x1845172174cf2944},
+  {0x25180f84f6702866, 0xde5223f17c83df5d},
+  {0x2863a5b3ae30cdea, 0x610fc2ae8f7cfc74},
+  {0x64a4086ca77af644, 0xafe073214eb0e372},
+  {0xbdc97dadac10ab50, 0x97cf31c3dfa3a7ad},
+  {0x79f2ee819538d167, 0x68555fb401eb2780},
+  {0x72e2b904d5c7a7ed, 0x482326aa3e165b1c},
+  {0x92f65484dcff7fd8, 0x603faf9bafb86f1e},
+  {0x210e7817fff07876, 0xabdf6d8a0dd6d8a5},
+  {0xab561f7f19942dec, 0x55f71e3e54c7b523},
+  {0x8e7140a742fb2245, 0x34a49c54b5ad70ec},
+  {0x6da544268e007b3f, 0xebf2cf33aeaee1c9},
+  {0x010679622fe3753e, 0x40228d2a0d402ed0},
+  {0x2e128b07e6e4e311, 0x0811ebd4d8dde5b5},
+  {0x126cb02cee9ad020, 0x398e5321decfb79c},
+  {0x6dfdff51553fb5ff, 0x415b4003d55c33ab},
+  {0xd3b7fedc1cd8ab6e, 0x49dc7b6033f0ae60},
+  {0x7062ab84db2bbaed, 0xc33060adb11136c6},
+  {0xae149ced6b9cc3d3, 0xef2f29a2ebe433ce},
+  {0x133ca1e237105dc6, 0x9712a59673f1d79c},
+  {0xfcf98569ab4ec844, 0x6a40dd9e8d49194e},
+  {0xd73a65ce7e33212a, 0xaa29936469e73794},
+  {0x961009e50707fe21, 0x657c63ec063d9f23},
+  {0x6b1af6be25650671, 0xce96b0cb11ce0372},
+  {0xc7312488beda3b54, 0x9ee42f2347f50335},
+  {0x829d638189fca23f, 0xe3123a63017f9509},
+  {0xbb40cef8e0e85cea, 0xd8b3a76799622f49},
+};
+// S-box exponentiations used by aim2(); aim2() calls all three with out and in being the same array.
+#define GF_exp_invmer_e_1 AIMER_NAMESPACE(GF_exp_invmer_e_1)
+void GF_exp_invmer_e_1(GF out, const GF in); // inverse Mersenne S-box of the first branch
+#define GF_exp_invmer_e_2 AIMER_NAMESPACE(GF_exp_invmer_e_2)
+void GF_exp_invmer_e_2(GF out, const GF in); // inverse Mersenne S-box of the second branch
+#define GF_exp_mer_e_star AIMER_NAMESPACE(GF_exp_mer_e_star)
+void GF_exp_mer_e_star(GF out, const GF in); // Mersenne S-box applied after the affine layer
+// Fills matrix_L, matrix_U and vector_b from iv; all three are outputs (see the call in aim2()).
+#define generate_matrices_L_and_U AIMER_NAMESPACE(generate_matrices_L_and_U)
+void generate_matrices_L_and_U(
+        GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF vector_b,
+        const uint8_t iv[AIM2_IV_SIZE]);
+// NOTE(review): presumably yields one combined matrix per S-box (L and U merged) plus vector_b - confirm in aim2.c.
+#define generate_matrix_LU AIMER_NAMESPACE(generate_matrix_LU)
+void generate_matrix_LU(GF matrix_A[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+                        GF vector_b,
+                        const uint8_t iv[AIM2_IV_SIZE]);
+// Writes the AIM2_NUM_INPUT_SBOX input S-box outputs for field element pt into sbox_outputs.
+#define aim2_sbox_outputs AIMER_NAMESPACE(aim2_sbox_outputs)
+void aim2_sbox_outputs(GF sbox_outputs[AIM2_NUM_INPUT_SBOX], const GF pt);
+// Full AIM2 evaluation on byte strings: ct is the output, pt and iv are inputs.
+#define aim2 AIMER_NAMESPACE(aim2)
+void aim2(uint8_t ct[AIM2_NUM_BYTES_FIELD],
+          const uint8_t pt[AIM2_NUM_BYTES_FIELD],
+          const uint8_t iv[AIM2_IV_SIZE]);
+
+#endif // AIM2_H
diff --git a/crypto_sign/aimer128s/m4stack/api.h b/crypto_sign/aimer128s/m4stack/api.h
new file mode 100644
index 00000000..170f69cc
--- /dev/null
+++ b/crypto_sign/aimer128s/m4stack/api.h
@@ -0,0 +1,44 @@
+/*
+NIST-developed software is provided by NIST as a public service. You may use, copy, and distribute copies of the software in any medium, provided that you keep intact this entire notice. You may improve, modify, and create derivative works of the software or any portion of the software, and you may copy and distribute such modifications or works. Modified works should carry a notice stating that you changed the software and should note the date and nature of any such change. Please explicitly acknowledge the National Institute of Standards and Technology as the source of the software.
+ 
+NIST-developed software is expressly provided "AS IS." NIST MAKES NO WARRANTY OF ANY KIND, EXPRESS, IMPLIED, IN FACT, OR ARISING BY OPERATION OF LAW, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND DATA ACCURACY. NIST NEITHER REPRESENTS NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST DOES NOT WARRANT OR MAKE ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE OR THE RESULTS THEREOF, INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY, RELIABILITY, OR USEFULNESS OF THE SOFTWARE.
+ 
+You are solely responsible for determining the appropriateness of using and distributing the software and you assume all risks associated with its use, including but not limited to the risks and costs of program errors, compliance with applicable laws, damage to or loss of data, programs or equipment, and the unavailability or interruption of operation. This software is not intended to be used in any situation where a failure could cause risk of injury or damage to property. The software developed by NIST employees is not subject to copyright protection within the United States.
+*/
+
+#ifndef API_H
+#define API_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+// Fixed byte sizes and the algorithm name for this parameter set.
+#define CRYPTO_PUBLICKEYBYTES 32 // size of pk in bytes
+#define CRYPTO_SECRETKEYBYTES 48 // size of sk in bytes
+#define CRYPTO_BYTES 4160 // NOTE(review): presumably the maximum signature length in bytes - confirm in sign.c
+#define CRYPTO_ALGNAME "aimer128s"
+// Each name below is remapped by AIMER_NAMESPACE (params.h); return conventions are set in sign.c, not visible here.
+#define crypto_sign_keypair AIMER_NAMESPACE(crypto_sign_keypair)
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk); // pk and sk are both outputs
+// Detached signing: sig and *siglen are outputs; m (mlen bytes) and sk are inputs.
+#define crypto_sign_signature AIMER_NAMESPACE(crypto_sign_signature)
+int crypto_sign_signature(uint8_t *sig, size_t *siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk);
+// Signed-message form: sm and *smlen are outputs; m (mlen bytes) and sk are inputs.
+#define crypto_sign AIMER_NAMESPACE(crypto_sign)
+int crypto_sign(uint8_t *sm, size_t *smlen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk);
+// Detached verification: every pointer is an input; the verdict is the return value.
+#define crypto_sign_verify AIMER_NAMESPACE(crypto_sign_verify)
+int crypto_sign_verify(const uint8_t *sig, size_t siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *pk);
+// Signed-message opening: m and *mlen are outputs; sm (smlen bytes) and pk are inputs.
+#define crypto_sign_open AIMER_NAMESPACE(crypto_sign_open)
+int crypto_sign_open(uint8_t *m, size_t *mlen,
+        const uint8_t *sm, size_t smlen,
+        const uint8_t *pk);
+
+#endif
diff --git a/crypto_sign/aimer128s/m4stack/field.c b/crypto_sign/aimer128s/m4stack/field.c
new file mode 100644
index 00000000..1e12b447
--- /dev/null
+++ b/crypto_sign/aimer128s/m4stack/field.c
@@ -0,0 +1,406 @@
+// SPDX-License-Identifier: MIT
+
+#include "field.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define mask_64     0x00000000ffffffff // low 32 bits of a 64-bit word
+#define mask_final  0x000000000000ffff // low 16 bits; last mask of inv_zero_padding
+
+#define mask0_64    0x000000ff000000ff // low byte of each 32-bit half
+#define mask0       0x000000ff         // low byte of a 32-bit word
+
+#define mask1_64    0x000f000f000f000f // low nibble of each 16-bit group
+#define mask1       0x000f000f         // 32-bit counterpart of mask1_64
+
+#define mask2_64    0x0303030303030303 // low two bits of each byte
+#define mask2       0x03030303         // 32-bit counterpart of mask2_64
+
+#define mask3_64    0x1111111111111111 // lowest bit of each nibble
+#define mask3       0x11111111         // 32-bit counterpart of mask3_64
+// zero_padding: spread the 8 low bits of x0 so that bit i moves to bit 4*i (x0 must hold only 8 bits on entry).
+#define zero_padding(x0, mask1, mask2, mask3) \
+        x0 = (x0 | (x0 << 12)) & mask1; /* one nibble per 16 bits */ \
+        x0 = (x0 | (x0 << 6 )) & mask2; /* two bits per byte */ \
+        x0 = (x0 | (x0 << 3 )) & mask3; /* one bit per nibble */
+// inv_zero_padding: inverse of zero_padding on 64 bits; packs the bits at positions 4*i (i = 0..15) into bits 0..15.
+#define inv_zero_padding(x0, mask0, mask1, mask2, mask_final) \
+        x0 = (x0 | (x0 >> 3 )) & mask2; /* two bits per byte */ \
+        x0 = (x0 | (x0 >> 6 )) & mask1; /* one nibble per 16 bits */ \
+        x0 = (x0 | (x0 >> 12)) & mask0; /* one byte per 32 bits */ \
+        x0 = (x0 | (x0 >> 24)) & mask_final; /* 16 contiguous bits */
+// GF_mul: c = a * b in GF(2^128), reduced modulo x^128 + x^7 + x^2 + x + 1; a and b are fully read before c is written.
+void GF_mul(GF c, const GF a, const GF b)
+{
+  uint64_t t[2] = {0,};    // Karatsuba middle product; t[0] is later reused for the folded word 2
+  uint64_t temp[4] = {0,}; // unreduced 256-bit product, temp[0] is the least significant word
+  uint64_t sub[2] = {0,};  // a[0] ^ a[1] and b[0] ^ b[1]
+
+  sub[0] = a[0] ^ a[1];
+  sub[1] = b[0] ^ b[1];
+
+  poly64_mul(&a[1], &b[1], &temp[3], &temp[2]); // product of the upper words -> words 3 and 2
+  poly64_mul(&a[0], &b[0], &temp[1], &temp[0]); // product of the lower words -> words 1 and 0
+
+  poly64_mul(&sub[0], &sub[1], &t[1], &t[0]);
+  temp[1] ^= t[0] ^ temp[0] ^ temp[2];                 // word 1 of the full product
+  temp[2] = t[0] ^ t[1] ^ temp[0] ^ temp[1] ^ temp[3]; // word 2; relies on temp[1] already being updated
+  // fold word 3 into word 2: these are the bits the shifts by 7, 2 and 1 push out of word 3
+  t[0] = temp[2] ^ ((temp[3] >> 57) ^ (temp[3] >> 62) ^ (temp[3] >> 63));
+
+  c[1] = temp[1] ^ temp[3];              // x^128 = x^7 + x^2 + x + 1: the "+ 1" term of word 3
+  c[1] ^= (temp[3] << 7) | (t[0] >> 57); // x^7 term, plus the carry out of the folded word 2
+  c[1] ^= (temp[3] << 2) | (t[0] >> 62); // x^2 term
+  c[1] ^= (temp[3] << 1) | (t[0] >> 63); // x term
+
+  c[0] = temp[0] ^ t[0]; // same reduction applied to the folded word 2
+  c[0] ^= (t[0] << 7);
+  c[0] ^= (t[0] << 2);
+  c[0] ^= (t[0] << 1);
+}
+// GF_mul_add: c ^= a * b in GF(2^128); same product and reduction as GF_mul, accumulated into c instead of stored.
+void GF_mul_add(GF c, const GF a, const GF b)
+{
+  uint64_t t[2] = {0,};    // Karatsuba middle product; t[0] is later reused for the folded word 2
+  uint64_t temp[4] = {0,}; // unreduced 256-bit product, temp[0] is the least significant word
+  uint64_t sub[2] = {0,};  // a[0] ^ a[1] and b[0] ^ b[1]
+
+  sub[0] = a[0] ^ a[1];
+  sub[1] = b[0] ^ b[1];
+
+  poly64_mul(&a[1], &b[1], &temp[3], &temp[2]); // product of the upper words -> words 3 and 2
+  poly64_mul(&a[0], &b[0], &temp[1], &temp[0]); // product of the lower words -> words 1 and 0
+
+  poly64_mul(&sub[0], &sub[1], &t[1], &t[0]);
+  temp[1] ^= t[0] ^ temp[0] ^ temp[2];                 // word 1 of the full product
+  temp[2] = t[0] ^ t[1] ^ temp[0] ^ temp[1] ^ temp[3]; // word 2; relies on temp[1] already being updated
+  // fold word 3 into word 2 before the final reduction
+  t[0] = temp[2] ^ ((temp[3] >> 57) ^ (temp[3] >> 62) ^ (temp[3] >> 63));
+
+  c[1] ^= temp[1] ^ temp[3];             // accumulate the reduced upper word into c[1]
+  c[1] ^= (temp[3] << 7) | (t[0] >> 57);
+  c[1] ^= (temp[3] << 2) | (t[0] >> 62);
+  c[1] ^= (temp[3] << 1) | (t[0] >> 63);
+
+  c[0] ^= temp[0] ^ t[0];                // accumulate the reduced lower word into c[0]
+  c[0] ^= (t[0] << 7);
+  c[0] ^= (t[0] << 2);
+  c[0] ^= (t[0] << 1);
+}
+// GF_transposed_matmul: c = XOR of the rows b[k] selected by the set bits k of a; c is written last, so c may alias a.
+void GF_transposed_matmul(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD])
+{
+  const uint64_t *a_ptr = a;
+  const GF *b_ptr = b;
+
+  uint64_t temp_c0 = 0; // accumulator for word [0] of the result
+  uint64_t temp_c1 = 0; // accumulator for word [1] of the result
+  uint64_t mask;        // all ones when the current bit of a is set, otherwise zero
+  for (size_t i = AIM2_NUM_WORDS_FIELD; i; --i, ++a_ptr)
+  {
+    uint64_t index = *a_ptr;
+    for (size_t j = AIM2_NUM_BITS_WORD; j; j -= 8, index >>= 8, b_ptr += 8) // 8 bits of a and 8 rows of b per pass
+    {
+      mask = -(index & 1); // row selection by masking, no branch on the bits of a
+      temp_c0 ^= (b_ptr[0][0] & mask);
+      temp_c1 ^= (b_ptr[0][1] & mask);
+
+      mask = -((index >> 1) & 1);
+      temp_c0 ^= (b_ptr[1][0] & mask);
+      temp_c1 ^= (b_ptr[1][1] & mask);
+
+      mask = -((index >> 2) & 1);
+      temp_c0 ^= (b_ptr[2][0] & mask);
+      temp_c1 ^= (b_ptr[2][1] & mask);
+
+      mask = -((index >> 3) & 1);
+      temp_c0 ^= (b_ptr[3][0] & mask);
+      temp_c1 ^= (b_ptr[3][1] & mask);
+
+      mask = -((index >> 4) & 1);
+      temp_c0 ^= (b_ptr[4][0] & mask);
+      temp_c1 ^= (b_ptr[4][1] & mask);
+
+      mask = -((index >> 5) & 1);
+      temp_c0 ^= (b_ptr[5][0] & mask);
+      temp_c1 ^= (b_ptr[5][1] & mask);
+
+      mask = -((index >> 6) & 1);
+      temp_c0 ^= (b_ptr[6][0] & mask);
+      temp_c1 ^= (b_ptr[6][1] & mask);
+
+      mask = -((index >> 7) & 1);
+      temp_c0 ^= (b_ptr[7][0] & mask);
+      temp_c1 ^= (b_ptr[7][1] & mask);
+    }
+  }
+  c[0] = temp_c0; // stored only after every bit of a has been consumed
+  c[1] = temp_c1;
+}
+// GF_transposed_matmul_add: c ^= XOR of the rows b[k] selected by the set bits k of a (accumulating variant).
+void GF_transposed_matmul_add(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD])
+{
+  const uint64_t *a_ptr = a;
+  const GF *b_ptr = b;
+
+  uint64_t temp_c0 = 0; // accumulator for word [0] of the product
+  uint64_t temp_c1 = 0; // accumulator for word [1] of the product
+  uint64_t mask;        // all ones when the current bit of a is set, otherwise zero
+  for (size_t i = AIM2_NUM_WORDS_FIELD; i; --i, ++a_ptr)
+  {
+    uint64_t index = *a_ptr;
+    for (size_t j = AIM2_NUM_BITS_WORD; j; j -= 8, index >>= 8, b_ptr += 8) // 8 bits of a and 8 rows of b per pass
+    {
+      mask = -(index & 1); // row selection by masking, no branch on the bits of a
+      temp_c0 ^= (b_ptr[0][0] & mask);
+      temp_c1 ^= (b_ptr[0][1] & mask);
+
+      mask = -((index >> 1) & 1);
+      temp_c0 ^= (b_ptr[1][0] & mask);
+      temp_c1 ^= (b_ptr[1][1] & mask);
+
+      mask = -((index >> 2) & 1);
+      temp_c0 ^= (b_ptr[2][0] & mask);
+      temp_c1 ^= (b_ptr[2][1] & mask);
+
+      mask = -((index >> 3) & 1);
+      temp_c0 ^= (b_ptr[3][0] & mask);
+      temp_c1 ^= (b_ptr[3][1] & mask);
+
+      mask = -((index >> 4) & 1);
+      temp_c0 ^= (b_ptr[4][0] & mask);
+      temp_c1 ^= (b_ptr[4][1] & mask);
+
+      mask = -((index >> 5) & 1);
+      temp_c0 ^= (b_ptr[5][0] & mask);
+      temp_c1 ^= (b_ptr[5][1] & mask);
+
+      mask = -((index >> 6) & 1);
+      temp_c0 ^= (b_ptr[6][0] & mask);
+      temp_c1 ^= (b_ptr[6][1] & mask);
+
+      mask = -((index >> 7) & 1);
+      temp_c0 ^= (b_ptr[7][0] & mask);
+      temp_c1 ^= (b_ptr[7][1] & mask);
+    }
+  }
+  c[0] ^= temp_c0; // added to the previous content of c instead of replacing it
+  c[1] ^= temp_c1;
+}
+// poly64_mul_s: carry-less 64x64-bit product built from integer multiplies; *z1 gets the upper and *z0 the lower 64 bits.
+static void poly64_mul_s(uint64_t *z1, uint64_t *z0, uint64_t x, uint64_t y)
+{
+  // upper 32 bits of x; split into bytes x4..x7 further down
+  uint32_t x4 = x >> 32;
+
+  uint32_t x0 = x & mask_64;
+  uint32_t x1 = (x0 >> 8) & mask0;
+  uint32_t x2 = (x0 >> 16) & mask0;
+  uint32_t x3 = (x0 >> 24) & mask0;
+  x0 &= mask0;
+
+  // bytes of the upper half of x
+  uint32_t x5 = (x4 >> 8) & mask0;
+  uint32_t x6 = (x4 >> 16) & mask0;
+  uint32_t x7 = (x4 >> 24) & mask0;
+  x4 &= mask0;
+
+  // upper 32 bits of y; split into bytes y4..y7 further down
+  uint32_t y4 = y >> 32;
+
+  uint32_t y0 = y & mask_64;
+  uint32_t y1 = (y0 >> 8) & mask0;
+  uint32_t y2 = (y0 >> 16) & mask0;
+  uint32_t y3 = (y0 >> 24) & mask0;
+  y0 &= mask0;
+
+  // bytes of the upper half of y
+  uint32_t y5 = (y4 >> 8) & mask0;
+  uint32_t y6 = (y4 >> 16) & mask0;
+  uint32_t y7 = (y4 >> 24) & mask0;
+  y4 &= mask0;
+
+  // spread every byte of x to one bit per nibble
+  zero_padding(x0, mask1, mask2, mask3);
+  zero_padding(x1, mask1, mask2, mask3);
+  zero_padding(x2, mask1, mask2, mask3);
+  zero_padding(x3, mask1, mask2, mask3);
+  zero_padding(x4, mask1, mask2, mask3);
+  zero_padding(x5, mask1, mask2, mask3);
+  zero_padding(x6, mask1, mask2, mask3);
+  zero_padding(x7, mask1, mask2, mask3);
+
+  // spread every byte of y to one bit per nibble
+  zero_padding(y0, mask1, mask2, mask3);
+  zero_padding(y1, mask1, mask2, mask3);
+  zero_padding(y2, mask1, mask2, mask3);
+  zero_padding(y3, mask1, mask2, mask3);
+  zero_padding(y4, mask1, mask2, mask3);
+  zero_padding(y5, mask1, mask2, mask3);
+  zero_padding(y6, mask1, mask2, mask3);
+  zero_padding(y7, mask1, mask2, mask3);
+
+  // low half times low half; "& mask3_64" keeps the parity of each nibble sum, i.e. the carry-less coefficient
+  uint64_t a0 = ((uint64_t)x0 * (uint64_t)y0) & mask3_64;
+  uint64_t a1 = ((((uint64_t)x0 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y0) & mask3_64));
+  a0 ^= (a1 << 32); // a byte offset of 8 bits equals 32 bits in spread form
+  a1 = a1 >> 32;
+  a1 ^= ((((uint64_t)x0 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x1 * (uint64_t)y1) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y0) & mask3_64));
+
+  uint64_t a2 = ((((uint64_t)x0 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y2) & mask3_64) ^
+                 (((uint64_t)x2 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y0) & mask3_64));
+  a1 ^= (a2 << 32);
+  a2 = a2 >> 32;
+  a2 ^= ((((uint64_t)x1 * (uint64_t)y3) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x3 * (uint64_t)y1) & mask3_64));
+
+  uint64_t a3 = ((((uint64_t)x2 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y2) & mask3_64));
+  a2 ^= (a3 << 32);
+  a3 = a3 >> 32;
+
+  a3 ^= ((uint64_t)x3 * (uint64_t)y3) & mask3_64;
+
+  // high half times high half, same schoolbook layout as above
+  uint64_t b0 = ((uint64_t)x4 * (uint64_t)y4) & mask3_64;
+  uint64_t b1 = ((((uint64_t)x4 * (uint64_t)y5) & mask3_64) ^
+                 (((uint64_t)x5 * (uint64_t)y4) & mask3_64));
+  b0 ^= (b1 << 32);
+  b1 = b1 >> 32;
+  b1 ^= ((((uint64_t)x4 * (uint64_t)y6) & mask3_64) ^
+         (((uint64_t)x5 * (uint64_t)y5) & mask3_64) ^
+         (((uint64_t)x6 * (uint64_t)y4) & mask3_64));
+
+  uint64_t b2 = ((((uint64_t)x4 * (uint64_t)y7) & mask3_64) ^
+                 (((uint64_t)x5 * (uint64_t)y6) & mask3_64) ^
+                 (((uint64_t)x6 * (uint64_t)y5) & mask3_64) ^
+                 (((uint64_t)x7 * (uint64_t)y4) & mask3_64));
+  b1 ^= (b2 << 32);
+  b2 = b2 >> 32;
+  b2 ^= ((((uint64_t)x5 * (uint64_t)y7) & mask3_64) ^
+         (((uint64_t)x6 * (uint64_t)y6) & mask3_64) ^
+         (((uint64_t)x7 * (uint64_t)y5) & mask3_64));
+
+  uint64_t b3 = ((((uint64_t)x6 * (uint64_t)y7) & mask3_64) ^
+                 (((uint64_t)x7 * (uint64_t)y6) & mask3_64));
+  b2 ^= (b3 << 32);
+  b3 = b3 >> 32;
+
+  b3 ^= ((uint64_t)x7 * (uint64_t)y7) & mask3_64;
+
+  // Karatsuba middle term: (x_low ^ x_high) * (y_low ^ y_high), computed on the spread bytes
+  x0 ^= x4;
+  x1 ^= x5;
+  x2 ^= x6;
+  x3 ^= x7;
+
+  y0 ^= y4;
+  y1 ^= y5;
+  y2 ^= y6;
+  y3 ^= y7;
+
+  uint64_t c0 = ((uint64_t)x0 * (uint64_t)y0) & mask3_64;
+  uint64_t c1 = ((((uint64_t)x0 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y0) & mask3_64));
+  c0 ^= (c1 << 32);
+  c1 = c1 >> 32;
+  c1 ^= ((((uint64_t)x0 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x1 * (uint64_t)y1) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y0) & mask3_64));
+
+  uint64_t c2 = ((((uint64_t)x0 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y2) & mask3_64) ^
+                 (((uint64_t)x2 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y0) & mask3_64));
+  c1 ^= (c2 << 32);
+  c2 = c2 >> 32;
+  c2 ^= ((((uint64_t)x1 * (uint64_t)y3) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x3 * (uint64_t)y1) & mask3_64));
+
+  uint64_t c3 = ((((uint64_t)x2 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y2) & mask3_64));
+  c2 ^= (c3 << 32);
+  c3 = c3 >> 32;
+
+  c3 ^= ((uint64_t)x3 * (uint64_t)y3) & mask3_64;
+
+  c0 = c0 ^ a0 ^ b0; // middle = (sum product) ^ low product ^ high product
+  c1 = c1 ^ a1 ^ b1;
+  c2 = c2 ^ a2 ^ b2;
+  c3 = c3 ^ a3 ^ b3;
+
+  a2 ^= c0; // add the middle term 32 result bits up, i.e. two 16-bit groups higher
+  a3 ^= c1;
+  b0 ^= c2;
+  b1 ^= c3;
+
+  // compress each spread word back to 16 contiguous result bits
+  inv_zero_padding(a0, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a1, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a2, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a3, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b0, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b1, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b2, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b3, mask0_64, mask1_64, mask2_64, mask_final);
+
+  *z0 = a0 | (a1 << 16) | (a2 << 32) | (a3 << 48); // lower 64 bits of the product
+  *z1 = b0 | (b1 << 16) | (b2 << 32) | (b3 << 48); // upper 64 bits of the product
+}
+// GF_mul_s: c = a * b in GF(2^128), same reduction as GF_mul but with the three products done by poly64_mul_s.
+void GF_mul_s(GF c, const GF a, const GF b)
+{
+  uint64_t t[2] = {0,};    // Karatsuba middle product; t[0] is later reused for the folded word 2
+  uint64_t temp[4] = {0,}; // unreduced 256-bit product, temp[0] is the least significant word
+
+  poly64_mul_s(&temp[3], &temp[2], a[1], b[1]); // upper words -> words 3 and 2
+  poly64_mul_s(&temp[1], &temp[0], a[0], b[0]); // lower words -> words 1 and 0
+
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[1]), (b[0] ^ b[1]));
+  temp[1] ^= t[0] ^ temp[0] ^ temp[2];                 // word 1 of the full product
+  temp[2] = t[0] ^ t[1] ^ temp[0] ^ temp[1] ^ temp[3]; // word 2; relies on temp[1] already being updated
+  // fold word 3 into word 2 before the final reduction
+  t[0] = temp[2] ^ ((temp[3] >> 57) ^ (temp[3] >> 62) ^ (temp[3] >> 63));
+
+  c[1] = temp[1] ^ temp[3]; // c is first written here, after all reads of a and b
+  c[1] ^= (temp[3] << 7) | (t[0] >> 57);
+  c[1] ^= (temp[3] << 2) | (t[0] >> 62);
+  c[1] ^= (temp[3] << 1) | (t[0] >> 63);
+
+  c[0] = temp[0] ^ t[0];
+  c[0] ^= (t[0] << 7);
+  c[0] ^= (t[0] << 2);
+  c[0] ^= (t[0] << 1);
+}
+// GF_mul_add_s: c ^= a * b in GF(2^128); accumulating variant of GF_mul_s, also built on poly64_mul_s.
+void GF_mul_add_s(GF c, const GF a, const GF b)
+{
+  uint64_t t[2] = {0,};    // Karatsuba middle product; t[0] is later reused for the folded word 2
+  uint64_t temp[4] = {0,}; // unreduced 256-bit product, temp[0] is the least significant word
+
+  poly64_mul_s(&temp[3], &temp[2], a[1], b[1]); // upper words -> words 3 and 2
+  poly64_mul_s(&temp[1], &temp[0], a[0], b[0]); // lower words -> words 1 and 0
+
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[1]), (b[0] ^ b[1]));
+  temp[1] ^= t[0] ^ temp[0] ^ temp[2];                 // word 1 of the full product
+  temp[2] = t[0] ^ t[1] ^ temp[0] ^ temp[1] ^ temp[3]; // word 2; relies on temp[1] already being updated
+  // fold word 3 into word 2 before the final reduction
+  t[0] = temp[2] ^ ((temp[3] >> 57) ^ (temp[3] >> 62) ^ (temp[3] >> 63));
+
+  c[1] ^= temp[1] ^ temp[3]; // accumulate the reduced upper word into c[1]
+  c[1] ^= (temp[3] << 7) | (t[0] >> 57);
+  c[1] ^= (temp[3] << 2) | (t[0] >> 62);
+  c[1] ^= (temp[3] << 1) | (t[0] >> 63);
+
+  c[0] ^= temp[0] ^ t[0];    // accumulate the reduced lower word into c[0]
+  c[0] ^= (t[0] << 7);
+  c[0] ^= (t[0] << 2);
+  c[0] ^= (t[0] << 1);
+}
diff --git a/crypto_sign/aimer128s/m4stack/field.h b/crypto_sign/aimer128s/m4stack/field.h
new file mode 100644
index 00000000..e8fd7996
--- /dev/null
+++ b/crypto_sign/aimer128s/m4stack/field.h
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef FIELD_H
+#define FIELD_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+// Field element of GF(2^128) held in two 64-bit words; field.c treats word [0] as the lower half.
+typedef uint64_t GF[2];
+// poly64_mul through GF_add and GF_sqr_s are not defined in field.c (NOTE(review): presumably in __asm_field.S).
+#define poly64_mul AIMER_NAMESPACE(poly64_mul)
+void poly64_mul(const uint64_t *a, const uint64_t *b, uint64_t *c1, uint64_t *c0); // GF_mul uses c1 as the upper, c0 as the lower word
+#define GF_set0 AIMER_NAMESPACE(GF_set0)
+void GF_set0(GF a);
+#define GF_copy AIMER_NAMESPACE(GF_copy)
+void GF_copy(GF out, const GF in);
+#define GF_to_bytes AIMER_NAMESPACE(GF_to_bytes)
+void GF_to_bytes(uint8_t *out, const GF in); // serialize a field element to bytes
+#define GF_from_bytes AIMER_NAMESPACE(GF_from_bytes)
+void GF_from_bytes(GF out, const uint8_t *in); // parse a field element from bytes
+// Arithmetic: c is the output of every function below; the *_add variants XOR their result into c.
+#define GF_add AIMER_NAMESPACE(GF_add)
+void GF_add(GF c, const GF a, const GF b);
+#define GF_mul AIMER_NAMESPACE(GF_mul)
+void GF_mul(GF c, const GF a, const GF b); // c = a * b
+#define GF_mul_add AIMER_NAMESPACE(GF_mul_add)
+void GF_mul_add(GF c, const GF a, const GF b); // c ^= a * b
+#define GF_transposed_matmul AIMER_NAMESPACE(GF_transposed_matmul)
+void GF_transposed_matmul(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]); // c = XOR of rows of b picked by bits of a
+#define GF_transposed_matmul_add AIMER_NAMESPACE(GF_transposed_matmul_add)
+void GF_transposed_matmul_add(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]); // c ^= the same row sum
+// "_s" variants: GF_mul_s and GF_mul_add_s use the C routine poly64_mul_s of field.c instead of poly64_mul.
+#define GF_mul_s AIMER_NAMESPACE(GF_mul_s)
+void GF_mul_s(GF c, const GF a, const GF b);
+#define GF_mul_add_s AIMER_NAMESPACE(GF_mul_add_s)
+void GF_mul_add_s(GF c, const GF a, const GF b);
+#define GF_sqr_s AIMER_NAMESPACE(GF_sqr_s)
+void GF_sqr_s(GF c, const GF a);
+
+#endif // FIELD_H
diff --git a/crypto_sign/aimer128s/m4stack/hash.c b/crypto_sign/aimer128s/m4stack/hash.c
new file mode 100644
index 00000000..71f3fb67
--- /dev/null
+++ b/crypto_sign/aimer128s/m4stack/hash.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+
+#include "hash.h"
+#include <stddef.h>
+#include <stdint.h>
+// hash_init: start a new incremental SHAKE128 computation in ctx.
+void hash_init(hash_instance *ctx)
+{
+  shake128_inc_init(ctx);
+}
+// hash_init_prefix: start a new SHAKE128 computation and absorb the single byte prefix first.
+void hash_init_prefix(hash_instance *ctx, uint8_t prefix)
+{
+  shake128_inc_init(ctx);
+  shake128_inc_absorb(ctx, &prefix, sizeof(prefix));
+}
+// hash_update: absorb data_len bytes from data into ctx.
+void hash_update(hash_instance *ctx, const uint8_t *data, size_t data_len)
+{
+  shake128_inc_absorb(ctx, data, data_len);
+}
+// hash_final: finalize the absorb phase of ctx (forwards to shake128_inc_finalize).
+void hash_final(hash_instance *ctx)
+{
+  shake128_inc_finalize(ctx);
+}
+// hash_squeeze: write buffer_len output bytes from ctx into buffer.
+void hash_squeeze(hash_instance *ctx, uint8_t *buffer, size_t buffer_len)
+{
+  shake128_inc_squeeze(buffer, buffer_len, ctx); // the fips202 call takes ctx last, unlike this wrapper
+}
+// hash_ctx_clone: copy the hashing state of ctx_src into ctx_dest (forwards to shake128_inc_ctx_clone).
+void hash_ctx_clone(hash_instance *ctx_dest, const hash_instance *ctx_src)
+{
+  shake128_inc_ctx_clone(ctx_dest, ctx_src);
+}
+// hash_ctx_release: release ctx once it is no longer needed (forwards to shake128_inc_ctx_release).
+void hash_ctx_release(hash_instance *ctx)
+{
+  shake128_inc_ctx_release(ctx);
+}
diff --git a/crypto_sign/aimer128s/m4stack/hash.h b/crypto_sign/aimer128s/m4stack/hash.h
new file mode 100644
index 00000000..d6b05065
--- /dev/null
+++ b/crypto_sign/aimer128s/m4stack/hash.h
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef HASH_H
+#define HASH_H
+
+#include "fips202.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+// Prefix bytes handed to hash_init_prefix() to keep the different hash uses apart.
+static const uint8_t HASH_PREFIX_0 = 0; // message pre-hash (mu)
+static const uint8_t HASH_PREFIX_1 = 1; // first commitment hash (h_1)
+static const uint8_t HASH_PREFIX_2 = 2; // second commitment hash (h_2)
+static const uint8_t HASH_PREFIX_3 = 3; // salt and root seed derivation
+static const uint8_t HASH_PREFIX_4 = 4; // seed tree expansion
+static const uint8_t HASH_PREFIX_5 = 5; // per-party commitment and tape expansion
+// All hashing in this parameter set goes through the incremental SHAKE128 context.
+typedef shake128incctx hash_instance;
+// Wrappers around the incremental SHAKE128 calls of fips202.h; the definitions are in hash.c.
+#define hash_init AIMER_NAMESPACE(hash_init)
+void hash_init(hash_instance *ctx);
+#define hash_init_prefix AIMER_NAMESPACE(hash_init_prefix)
+void hash_init_prefix(hash_instance *ctx, uint8_t prefix); // prefix: one of HASH_PREFIX_*
+#define hash_update AIMER_NAMESPACE(hash_update)
+void hash_update(hash_instance *ctx, const uint8_t *data, size_t data_len);
+#define hash_final AIMER_NAMESPACE(hash_final)
+void hash_final(hash_instance *ctx);
+#define hash_squeeze AIMER_NAMESPACE(hash_squeeze)
+void hash_squeeze(hash_instance *ctx, uint8_t *buffer, size_t buffer_len); // ctx first here
+#define hash_ctx_clone AIMER_NAMESPACE(hash_ctx_clone)
+void hash_ctx_clone(hash_instance *ctx_dest, const hash_instance *ctx_src);
+#define hash_ctx_release AIMER_NAMESPACE(hash_ctx_release)
+void hash_ctx_release(hash_instance *ctx);
+
+#endif // HASH_H
diff --git a/crypto_sign/aimer128s/m4stack/params.h b/crypto_sign/aimer128s/m4stack/params.h
new file mode 100644
index 00000000..8f03a035
--- /dev/null
+++ b/crypto_sign/aimer128s/m4stack/params.h
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef PARAMS_H
+#define PARAMS_H
+// Prefix pasted onto symbol names through the #define aliases in the headers.
+#define AIMER_NAMESPACE(s)          samsungsds_aimer128s_m4stack_##s
+
+#define SECURITY_BITS               128                  // security parameter
+#define SECURITY_BYTES              (SECURITY_BITS / 8)  // byte size of security parameter
+
+#define AIM2_NUM_BITS_FIELD         SECURITY_BITS        // number of bits in field element
+#define AIM2_NUM_BYTES_FIELD        SECURITY_BYTES       // number of bytes in field element
+#define AIM2_NUM_WORDS_FIELD        (SECURITY_BITS / 64) // number of 64-bit words in element
+#define AIM2_NUM_BITS_WORD          64                   // number of bits in word
+#define AIM2_IV_SIZE                SECURITY_BYTES       // byte size of AIM2 initial vector
+
+#define AIM2_NUM_INPUT_SBOX         2                    // number of AIM2 input S-boxes
+
+#define AIMER_SALT_SIZE             SECURITY_BYTES       // byte size of salt
+#define AIMER_SEED_SIZE             SECURITY_BYTES       // byte size of seed
+#define AIMER_COMMIT_SIZE           (SECURITY_BYTES * 2) // byte size of commitment
+
+#define AIMER_L                     AIM2_NUM_INPUT_SBOX  // number of S-box shares per tape
+#define AIMER_T                     17                   // number of parallel repetitions (Tau)
+#define AIMER_N                     256                  // number of MPC parties (N); indices are hashed as one byte
+#define AIMER_LOGN                  8                    // log_2(N)
+
+#endif // PARAMS_H
diff --git a/crypto_sign/aimer128s/m4stack/sign.c b/crypto_sign/aimer128s/m4stack/sign.c
new file mode 100644
index 00000000..deaea5b7
--- /dev/null
+++ b/crypto_sign/aimer128s/m4stack/sign.c
@@ -0,0 +1,626 @@
+// SPDX-License-Identifier: MIT
+
+#include "api.h"
+#include "aim2.h"
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include "randombytes.h"
+#include "sign.h"
+#include "tree.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+// Derive one party's commitment and tape from its seed, continuing from the state in ctx_precom.
+void commit_and_expand_tape(tape_t *tape, uint8_t *commit,
+                            const hash_instance *ctx_precom,
+                            const uint8_t seed[AIMER_SEED_SIZE],
+                            size_t rep, size_t party)
+{
+  hash_instance ctx;
+  uint8_t buffer[AIMER_SEED_SIZE + 2];
+  // hashed input is rep (1 byte) | party (1 byte) | seed; both indices are truncated to 8 bits
+  buffer[0] = (uint8_t)(rep);
+  buffer[1] = (uint8_t)(party);
+  memcpy(buffer + 2, seed, AIMER_SEED_SIZE);
+  // work on a clone so the caller can reuse ctx_precom for the next party
+  hash_ctx_clone(&ctx, ctx_precom);
+  hash_update(&ctx, buffer, AIMER_SEED_SIZE + 2);
+  hash_final(&ctx);
+  hash_squeeze(&ctx, commit, AIMER_COMMIT_SIZE);
+  hash_squeeze(&ctx, (uint8_t *)tape, sizeof(tape_t)); // second squeeze fills the raw tape bytes
+  hash_ctx_release(&ctx);
+}
+// Compute z_shares[0..AIMER_L] and finish x_shares[AIMER_L] from the S-box shares x_shares[0..1].
+void aim2_mpc(mult_chk_t *mult_chk,
+              const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+              const GF ct_GF)
+{
+  // pt + c = t ^ {2 ^ e - 1}
+  // --> t ^ {2 ^ e} + t * c = t * pt
+  // --> z = x * pt
+  GF_mul(mult_chk->z_shares[0], mult_chk->x_shares[0], aim2_constants[0]);
+  GF_transposed_matmul_add(mult_chk->z_shares[0], mult_chk->x_shares[0],
+                           aim2_e1_power_matrix);
+  GF_transposed_matmul_add(mult_chk->x_shares[AIMER_L], mult_chk->x_shares[0],
+                           matrix_A[0]);
+  // same steps for the second S-box; x_shares[AIMER_L] accumulates on top of the caller's value
+  GF_mul(mult_chk->z_shares[1], mult_chk->x_shares[1], aim2_constants[1]);
+  GF_transposed_matmul_add(mult_chk->z_shares[1], mult_chk->x_shares[1],
+                           aim2_e2_power_matrix);
+  GF_transposed_matmul_add(mult_chk->x_shares[AIMER_L], mult_chk->x_shares[1],
+                           matrix_A[1]);
+  // last S-box: its input is x_shares[AIMER_L]; z starts as three successive GF_sqr_s of it
+  // x ^ {2 ^ e - 1} = pt + ct
+  // --> x ^ {2 ^ e} + x * ct = x * pt
+  // --> z = x * pt
+  GF_sqr_s(mult_chk->z_shares[AIMER_L], mult_chk->x_shares[AIMER_L]);
+  GF_sqr_s(mult_chk->z_shares[AIMER_L], mult_chk->z_shares[AIMER_L]);
+  GF_sqr_s(mult_chk->z_shares[AIMER_L], mult_chk->z_shares[AIMER_L]);
+  GF_mul_add(mult_chk->z_shares[AIMER_L], mult_chk->x_shares[AIMER_L], ct_GF);
+}
+// Pass 1 of signing: reads sign->salt, fills the delta_* fields of every proof, writes sign->h_1.
+// committing to the seeds and the execution views of the parties
+void run_phase_1(signature_t *sign,
+                 const GF pt_GF, const GF sbox_outputs[AIMER_L],
+                 const uint8_t mu[AIMER_COMMIT_SIZE],
+                 const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE])
+{
+  uint8_t commits[AIMER_COMMIT_SIZE];
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE];
+
+  hash_instance ctx;
+
+  // hash_instance for h_1
+  hash_init_prefix(&ctx, HASH_PREFIX_1);
+  hash_update(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx, sign->salt, AIMER_SALT_SIZE);
+  // prefix-5 state with the salt absorbed; cloned for every party's commitment
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    // compute parties' seeds using binary tree
+    expand_tree(nodes, sign->salt, rep, root_seeds[rep]);
+
+    // initialize adjustment values
+    tape_t delta;
+    memset(&delta, 0, sizeof(tape_t));
+
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      // generate execution views and commitments
+      tape_t tape;
+      commit_and_expand_tape(&tape, commits, &ctx_precom,
+                             nodes[AIMER_N + party - 1],  rep, party);
+      hash_update(&ctx, commits, AIMER_COMMIT_SIZE);
+
+      // compute offsets
+      GF_add(delta.pt_share, delta.pt_share, tape.pt_share);
+      GF_add(delta.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+      GF_add(delta.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+      GF_add(delta.a_share, delta.a_share, tape.a_share);
+      GF_add(delta.c_share, delta.c_share, tape.c_share);
+
+      // adjust the last share and prepare the proof and h_1
+      if (party == AIMER_N - 1)
+      {
+        GF_add(delta.pt_share, delta.pt_share, pt_GF);
+        GF_add(delta.t_shares[0], delta.t_shares[0], sbox_outputs[0]);
+        GF_add(delta.t_shares[1], delta.t_shares[1], sbox_outputs[1]);
+        GF_mul_add_s(delta.c_share, pt_GF, delta.a_share);
+        // only pt, t and c offsets go into the proof; the summed a_share is used for c alone
+        GF_to_bytes(sign->proofs[rep].delta_pt_bytes, delta.pt_share);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[0], delta.t_shares[0]);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[1], delta.t_shares[1]);
+        GF_to_bytes(sign->proofs[rep].delta_c_bytes, delta.c_share);
+      }
+    }
+
+    // NOTE: depend on the order of values in proof_t (delta_pt, delta_ts, delta_c are hashed as one run)
+    hash_update(&ctx, sign->proofs[rep].delta_pt_bytes,
+                AIM2_NUM_BYTES_FIELD * (AIMER_L + 2));
+  }
+  hash_ctx_release(&ctx_precom);
+
+  // commit to salt, (all commitments of parties' seeds,
+  // delta_pt, delta_t, delta_c) for all repetitions
+  hash_final(&ctx);
+  hash_squeeze(&ctx, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+}
+// Pass 2 of signing: re-derives all tapes, expands epsilons from sign->h_1 and writes sign->h_2.
+void run_phase_1_to_3(signature_t *sign,
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE],
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b, const GF ct_GF)
+{
+  uint8_t commits[AIMER_COMMIT_SIZE];
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE];
+
+  GF pt_shares[AIMER_N];
+  GF alpha_v_shares[2][AIMER_N];
+  GF epsilons[AIMER_L + 1];
+
+  // prepare h2
+  hash_instance ctx;
+  hash_init_prefix(&ctx, HASH_PREFIX_2);
+  hash_update(&ctx, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_update(&ctx, sign->salt, AIMER_SALT_SIZE);
+
+  // prepare epsilons
+  hash_instance ctx_e;
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  GF alpha = {0,};
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    // compute parties' seeds using binary tree
+    expand_tree(nodes, sign->salt, rep, root_seeds[rep]);
+
+    // initialize adjustment values
+    tape_t delta;
+    memset(&delta, 0, sizeof(tape_t));
+
+    // expand challenge
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF_set0(alpha);
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      // generate execution views and commitments
+      tape_t tape;
+      commit_and_expand_tape(&tape, commits, &ctx_precom,
+                             nodes[AIMER_N + party - 1], rep, party);
+
+      mult_chk_t mult_chk;
+      memset(&mult_chk, 0, sizeof(mult_chk));
+
+      // adjust the last party's shares with the offsets stored in the proof by run_phase_1
+      if (party == AIMER_N - 1)
+      {
+        GF_from_bytes(delta.pt_share, sign->proofs[rep].delta_pt_bytes);
+        GF_from_bytes(delta.t_shares[0], sign->proofs[rep].delta_ts_bytes[0]);
+        GF_from_bytes(delta.t_shares[1], sign->proofs[rep].delta_ts_bytes[1]);
+        GF_from_bytes(delta.c_share, sign->proofs[rep].delta_c_bytes);
+
+        GF_add(tape.pt_share, delta.pt_share, tape.pt_share);
+        GF_add(tape.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+        GF_add(tape.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+        GF_add(tape.c_share, delta.c_share, tape.c_share);
+        // only this party starts x_shares[AIMER_L] from vector_b; the others keep the zero from memset
+        GF_copy(mult_chk.x_shares[AIMER_L], vector_b);
+      }
+
+      // run the MPC simulation and prepare the mult check inputs
+      GF_copy(mult_chk.x_shares[0], tape.t_shares[0]);
+      GF_copy(mult_chk.x_shares[1], tape.t_shares[1]);
+      GF_copy(pt_shares[party], tape.pt_share);
+      GF_copy(alpha_v_shares[0][party], tape.a_share);
+      GF_copy(alpha_v_shares[1][party], tape.c_share);
+      aim2_mpc(&mult_chk, matrix_A, ct_GF);
+      // fold the (x, z) pairs with epsilons: row 0 collects the alpha share, row 1 the v share
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[2], epsilons[2]);
+      GF_add(alpha, alpha, alpha_v_shares[0][party]);
+    }
+
+    // alpha is opened, so we can finish calculating v_share
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      GF_mul_add(alpha_v_shares[1][party], pt_shares[party], alpha);
+    }
+    hash_update(&ctx, (const uint8_t *)alpha_v_shares, sizeof(alpha_v_shares));
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+
+  hash_final(&ctx);
+  hash_squeeze(&ctx, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+}
+// Pass 3 of signing: derives the hidden party of each rep from sign->h_2 and writes its opening data.
+void run_phase_1_to_5(signature_t *sign,
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE],
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b)
+{
+  // prepare challenge parties
+  hash_instance ctx;
+  hash_init(&ctx);
+  hash_update(&ctx, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_final(&ctx);
+
+  uint8_t indices[AIMER_T]; // AIMER_N <= 256
+  hash_squeeze(&ctx, indices, AIMER_T);
+  hash_ctx_release(&ctx);
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    indices[rep] &= (1 << AIMER_LOGN) - 1;
+  }
+
+  uint8_t commits[AIMER_COMMIT_SIZE];
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE];
+
+  GF epsilons[AIMER_L + 1];
+
+  // prepare epsilons
+  hash_instance ctx_e;
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  mult_chk_t mult_chk;
+  memset(&mult_chk, 0, sizeof(mult_chk_t));
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    size_t i_bar = indices[rep];
+
+    // compute parties' seeds using binary tree
+    expand_tree(nodes, sign->salt, rep, root_seeds[rep]);
+    reveal_all_but(sign->proofs[rep].reveal_path,
+                   (const uint8_t (*)[AIMER_SEED_SIZE])nodes, i_bar);
+
+    // expand challenge
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF alpha_share = {0,};
+    // only the hidden party i_bar is recomputed in this pass
+    tape_t tape;
+    commit_and_expand_tape(&tape, commits, &ctx_precom,
+                           nodes[AIMER_N + i_bar - 1], rep, i_bar);
+    memcpy(sign->proofs[rep].missing_commitment, commits,
+           AIMER_COMMIT_SIZE);
+
+    GF_set0(mult_chk.x_shares[AIMER_L]);
+
+    // if the hidden party is the last one, apply the offsets stored in the proof by run_phase_1
+    if (i_bar == AIMER_N - 1)
+    {
+      // initialize adjustment values
+      tape_t delta;
+      memset(&delta, 0, sizeof(tape_t));
+
+      GF_from_bytes(delta.pt_share, sign->proofs[rep].delta_pt_bytes);
+      GF_from_bytes(delta.t_shares[0], sign->proofs[rep].delta_ts_bytes[0]);
+      GF_from_bytes(delta.t_shares[1], sign->proofs[rep].delta_ts_bytes[1]);
+      GF_from_bytes(delta.c_share, sign->proofs[rep].delta_c_bytes);
+
+      GF_add(tape.pt_share, delta.pt_share, tape.pt_share);
+      GF_add(tape.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+      GF_add(tape.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+      GF_add(tape.c_share, delta.c_share, tape.c_share);
+
+      GF_copy(mult_chk.x_shares[AIMER_L], vector_b);
+    }
+
+    for (size_t ell = 0; ell < AIMER_L; ell++)
+    {
+      GF_copy(mult_chk.x_shares[ell], tape.t_shares[ell]);
+
+      // x_* = sum_i A[i] * t[i] + b
+      GF_transposed_matmul_add(mult_chk.x_shares[AIMER_L],
+                               tape.t_shares[ell], matrix_A[ell]);
+    }
+    GF_copy(alpha_share, tape.a_share);
+    // same epsilon folding as row 0 of alpha_v_shares in run_phase_1_to_3; z shares are not needed
+    GF_mul_add(alpha_share, mult_chk.x_shares[0], epsilons[0]);
+    GF_mul_add(alpha_share, mult_chk.x_shares[1], epsilons[1]);
+    GF_mul_add(alpha_share, mult_chk.x_shares[2], epsilons[2]);
+    GF_to_bytes(sign->proofs[rep].missing_alpha_share_bytes, alpha_share);
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+}
+////////////////////////////////////////////////////////////////////////////////
+// Key generation: pk = iv || ct, sk = pt || pk. Returns -1 if pk or sk is NULL, otherwise 0.
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk)
+{
+  if (pk == NULL || sk == NULL)
+  {
+    return -1;
+  }
+  uint8_t *const ct = pk + AIM2_IV_SIZE;               // aim2() output slot inside pk
+  uint8_t *const pk_in_sk = sk + AIM2_NUM_BYTES_FIELD; // copy of pk kept behind the secret
+
+  randombytes(sk, AIM2_NUM_BYTES_FIELD);
+  randombytes(pk, AIM2_IV_SIZE);
+  aim2(ct, sk, pk);
+  memcpy(pk_in_sk, pk, AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  return 0;
+}
+// Sign m with sk: fills sig (used as a signature_t) and sets *siglen = CRYPTO_BYTES. Always returns 0.
+int crypto_sign_signature(uint8_t *sig, size_t *siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk)
+{
+  signature_t *sign = (signature_t *)sig;
+  // sk layout (see crypto_sign_keypair): pt | iv | ct, so ct sits behind pt and the iv
+  GF pt_GF = {0,}, ct_GF = {0,};
+  GF_from_bytes(pt_GF, sk);
+  GF_from_bytes(ct_GF, sk + AIM2_NUM_BYTES_FIELD + AIM2_IV_SIZE);
+
+  // message pre-hashing
+  hash_instance ctx;
+  hash_init_prefix(&ctx, HASH_PREFIX_0);
+  hash_update(&ctx, sk + AIM2_NUM_BYTES_FIELD,
+              AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx, m, mlen);
+  hash_final(&ctx);
+
+  uint8_t mu[AIMER_COMMIT_SIZE];
+  hash_squeeze(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+
+  // compute first L sboxes' outputs
+  GF sbox_outputs[AIMER_L];
+  aim2_sbox_outputs(sbox_outputs, pt_GF);
+
+  // derive the binary matrix and the vector from the initial vector
+  GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+  generate_matrix_LU(matrix_A, vector_b, sk + AIM2_NUM_BYTES_FIELD);
+
+  // generate per-signature randomness
+  uint8_t random[SECURITY_BYTES];
+  randombytes(random, SECURITY_BYTES);
+
+  // generate salt and root seeds
+  hash_instance ctx_roots;
+  hash_init_prefix(&ctx_roots, HASH_PREFIX_3);
+  hash_update(&ctx_roots, sk, AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx_roots, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_roots, random, SECURITY_BYTES);
+  hash_final(&ctx_roots);
+  hash_squeeze(&ctx_roots, sign->salt, AIMER_SALT_SIZE);
+  // the root seeds are squeezed from the same context, after the salt
+  uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE];
+  hash_squeeze(&ctx_roots, (uint8_t *)root_seeds, AIMER_T * AIMER_SEED_SIZE);
+  hash_ctx_release(&ctx_roots);
+  // three passes over the same root seeds; each pass re-expands the trees and tapes it needs
+  run_phase_1(sign, pt_GF, (const GF *)sbox_outputs, mu,
+              (const uint8_t (*)[AIMER_SEED_SIZE])root_seeds);
+
+  run_phase_1_to_3(sign, (const uint8_t (*)[AIMER_SEED_SIZE])root_seeds,
+                   (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A,
+                   vector_b, ct_GF);
+
+  run_phase_1_to_5(sign, (const uint8_t (*)[AIMER_SEED_SIZE])root_seeds,
+                   (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A,
+                   vector_b);
+
+  *siglen = CRYPTO_BYTES;
+
+  return 0;
+}
+// Build sm = m || signature and set *smlen = mlen + signature length; returns the signing status.
+int crypto_sign(uint8_t *sm, size_t *smlen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk)
+{
+  // the signature is written at sm + mlen, i.e. behind the slot reserved for the message
+  const int ret = crypto_sign_signature(sm + mlen, smlen, m, mlen, sk);
+  // memmove, not memcpy: m may be sm itself (in-place signing) or overlap it
+  memmove(sm, m, mlen);
+  *smlen += mlen;
+  return ret;
+}
+// Verify sig over m under pk (= iv || ct): recompute h_1 and h_2; returns 0 if both match, else -1.
+int crypto_sign_verify(const uint8_t *sig, size_t siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *pk)
+{
+  if (siglen != CRYPTO_BYTES)
+  {
+    return -1;
+  }
+
+  const signature_t *sign = (const signature_t *)sig;
+
+  GF ct_GF = {0,};
+  GF_from_bytes(ct_GF, pk + AIM2_IV_SIZE);
+
+  // derive the binary matrix and the vector from the initial vector
+  GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+  generate_matrix_LU(matrix_A, vector_b, pk);
+
+  hash_instance ctx_e, ctx_h1, ctx_h2;
+
+  // indices = Expand(h_2)
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  uint8_t indices[AIMER_T]; // AIMER_N <= 256
+  hash_squeeze(&ctx_e, indices, AIMER_T);
+  hash_ctx_release(&ctx_e);
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    indices[rep] &= (1 << AIMER_LOGN) - 1;
+  }
+
+  // epsilons = Expand(h_1)
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  // message pre-hashing
+  uint8_t mu[AIMER_COMMIT_SIZE];
+  hash_init_prefix(&ctx_h1, HASH_PREFIX_0);
+  hash_update(&ctx_h1, pk, AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx_h1, m, mlen);
+  hash_final(&ctx_h1);
+  hash_squeeze(&ctx_h1, mu, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h1);
+
+  // ready for computing h_1' and h_2'
+  hash_init_prefix(&ctx_h1, HASH_PREFIX_1);
+  hash_update(&ctx_h1, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_h1, sign->salt, AIMER_SALT_SIZE);
+
+  hash_init_prefix(&ctx_h2, HASH_PREFIX_2);
+  hash_update(&ctx_h2, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_h2, sign->salt, AIMER_SALT_SIZE);
+
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    size_t i_bar = indices[rep];
+    uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE];
+
+    reconstruct_tree(nodes, sign->salt, sign->proofs[rep].reveal_path,
+                     rep, i_bar);
+    // slot [1][i_bar] starts at zero and later receives the sum of all other parties' v shares
+    GF pt_shares[AIMER_N];
+    GF alpha_v_shares[2][AIMER_N];
+    GF_set0(alpha_v_shares[1][i_bar]);
+
+    GF epsilons[AIMER_L + 1];
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF alpha = {0,};
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      if (party == i_bar)
+      {
+        hash_update(&ctx_h1, sign->proofs[rep].missing_commitment,
+                    AIMER_COMMIT_SIZE);
+        GF_from_bytes(alpha_v_shares[0][i_bar],
+                      sign->proofs[rep].missing_alpha_share_bytes);
+        GF_add(alpha, alpha, alpha_v_shares[0][i_bar]);
+        continue;
+      }
+      // seeds come from the rebuilt tree, whose array has no root entry (hence "- 2" below)
+      tape_t tape;
+      uint8_t commit[AIMER_COMMIT_SIZE];
+      commit_and_expand_tape(&tape, commit, &ctx_precom,
+                             nodes[AIMER_N + party - 2], rep, party);
+      hash_update(&ctx_h1, commit, AIMER_COMMIT_SIZE);
+
+      // adjust last shares
+      mult_chk_t mult_chk;
+      memset(&mult_chk, 0, sizeof(mult_chk_t));
+      if (party == AIMER_N - 1)
+      {
+        GF temp = {0,};
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_pt_bytes);
+        GF_add(tape.pt_share, tape.pt_share, temp);
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[0]);
+        GF_add(tape.t_shares[0], tape.t_shares[0], temp);
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[1]);
+        GF_add(tape.t_shares[1], tape.t_shares[1], temp);
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_c_bytes);
+        GF_add(tape.c_share, tape.c_share, temp);
+
+        GF_copy(mult_chk.x_shares[AIMER_L], vector_b);
+      }
+
+      // run the MPC simulation and prepare the mult check inputs
+      GF_copy(mult_chk.x_shares[0], tape.t_shares[0]);
+      GF_copy(mult_chk.x_shares[1], tape.t_shares[1]);
+      GF_copy(pt_shares[party], tape.pt_share);
+      GF_copy(alpha_v_shares[0][party], tape.a_share);
+      GF_copy(alpha_v_shares[1][party], tape.c_share);
+      aim2_mpc(&mult_chk, (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A, ct_GF);
+      // same epsilon folding as the signer's run_phase_1_to_3
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[2], epsilons[2]);
+
+      GF_add(alpha, alpha, alpha_v_shares[0][party]);
+    }
+
+    // alpha is opened, so we can finish calculating v_share
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      if (party == i_bar)
+      {
+        continue;
+      }
+      // finish this party's v share and accumulate it into the hidden party's slot
+      GF_mul_add(alpha_v_shares[1][party], pt_shares[party], alpha);
+      GF_add(alpha_v_shares[1][i_bar], alpha_v_shares[1][i_bar],
+             alpha_v_shares[1][party]);
+    }
+
+    // v is opened
+    hash_update(&ctx_h2, (const uint8_t *)alpha_v_shares,
+                sizeof(alpha_v_shares));
+
+    // NOTE: depend on the order of values in proof_t
+    hash_update(&ctx_h1, sign->proofs[rep].delta_pt_bytes,
+                AIM2_NUM_BYTES_FIELD * (AIMER_L + 2));
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+
+  uint8_t h_1_prime[AIMER_COMMIT_SIZE];
+  hash_final(&ctx_h1);
+  hash_squeeze(&ctx_h1, h_1_prime, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h1);
+
+  uint8_t h_2_prime[AIMER_COMMIT_SIZE];
+  hash_final(&ctx_h2);
+  hash_squeeze(&ctx_h2, h_2_prime, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h2);
+
+  if (memcmp(h_1_prime, sign->h_1, AIMER_COMMIT_SIZE) != 0 ||
+      memcmp(h_2_prime, sign->h_2, AIMER_COMMIT_SIZE) != 0)
+  {
+    return -1;
+  }
+
+  return 0;
+}
+// Open sm = message || signature: verify it and, on success, copy the message to m. 0 ok, -1 reject.
+int crypto_sign_open(uint8_t *m, size_t *mlen,
+        const uint8_t *sm, size_t smlen,
+        const uint8_t *pk)
+{
+  if (smlen < CRYPTO_BYTES)
+  {
+    return -1;
+  }
+
+  // the signature is the final CRYPTO_BYTES bytes; everything before it is the message
+  const size_t body_len = smlen - CRYPTO_BYTES;
+  const int rejected = crypto_sign_verify(sm + body_len, CRYPTO_BYTES,
+                                          sm, body_len, pk);
+  if (rejected != 0)
+  {
+    return -1;
+  }
+
+  // memmove copes with m overlapping sm
+  memmove(m, sm, body_len);
+  *mlen = body_len;
+  return 0;
+}
diff --git a/crypto_sign/aimer128s/m4stack/sign.h b/crypto_sign/aimer128s/m4stack/sign.h
new file mode 100644
index 00000000..2e2ab61d
--- /dev/null
+++ b/crypto_sign/aimer128s/m4stack/sign.h
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef SIGN_H
+#define SIGN_H
+
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+// Per-party tape; sign.c fills it directly from hash output, so the field order is significant.
+typedef struct tape_t
+{
+  GF pt_share;          // share of the secret AIM2 input (pt)
+  GF t_shares[AIMER_L]; // shares of the S-box outputs
+  GF a_share;           // copied into the alpha share before the epsilon folding
+  GF c_share;           // copied into the v share; last party's value is offset by delta_c
+} tape_t;
+// Inputs of the multiplication check for one party, produced by aim2_mpc().
+typedef struct mult_chk_t
+{
+  GF x_shares[AIMER_L + 1]; // [0..L-1]: S-box output shares, [L]: input share of the last S-box
+  GF z_shares[AIMER_L + 1]; // values paired with x_shares in the check (see aim2_mpc)
+} mult_chk_t;
+// Per-repetition part of the signature; delta_pt..delta_c must stay adjacent (hashed as one run).
+typedef struct proof_t
+{
+  uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE]; // sibling seeds, [0] = sibling of hidden leaf
+  uint8_t missing_commitment[AIMER_COMMIT_SIZE];    // commitment of the hidden party
+  uint8_t delta_pt_bytes[AIM2_NUM_BYTES_FIELD];
+  uint8_t delta_ts_bytes[AIMER_L][AIM2_NUM_BYTES_FIELD];
+  uint8_t delta_c_bytes[AIM2_NUM_BYTES_FIELD];
+  uint8_t missing_alpha_share_bytes[AIM2_NUM_BYTES_FIELD]; // alpha share of the hidden party
+} proof_t;
+// Signature layout; sign.c casts the caller's byte buffer to this struct (all members are bytes).
+typedef struct signature_t
+{
+  uint8_t salt[AIMER_SALT_SIZE];
+  uint8_t h_1[AIMER_COMMIT_SIZE]; // hash over the commitments and offsets; expands to epsilons
+  uint8_t h_2[AIMER_COMMIT_SIZE]; // hash over the alpha / v shares; expands to the hidden parties
+  proof_t proofs[AIMER_T];
+} signature_t;
+// Computes the z shares and the last x share of mult_chk from its S-box shares x_shares[0..1].
+#define aim2_mpc AIMER_NAMESPACE(aim2_mpc)
+void aim2_mpc(mult_chk_t *mult_chk,
+              const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+              const GF ct_GF);
+// Derives one party's commitment and tape from its seed; rep and party are hashed as one byte each.
+#define commit_and_expand_tape AIMER_NAMESPACE(commit_and_expand_tape)
+void commit_and_expand_tape(tape_t *tape, uint8_t *commit,
+                            const hash_instance *ctx_precom,
+                            const uint8_t seed[AIMER_SEED_SIZE],
+                            size_t rep, size_t party);
+// Signing pass 1: needs sign->salt; writes the delta_* fields of every proof and sign->h_1.
+#define run_phase_1 AIMER_NAMESPACE(run_phase_1)
+void run_phase_1(signature_t *sign,
+                 const GF pt_GF, const GF sbox_outputs[AIMER_L],
+                 const uint8_t mu[AIMER_COMMIT_SIZE],
+                 const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE]);
+// Signing pass 2: needs the output of run_phase_1; writes sign->h_2.
+#define run_phase_1_to_3 AIMER_NAMESPACE(run_phase_1_to_3)
+void run_phase_1_to_3(signature_t *sign,
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE],
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b, const GF ct_GF);
+// Signing pass 3: needs h_1 and h_2; writes reveal_path, missing_commitment and missing alpha share.
+#define run_phase_1_to_5 AIMER_NAMESPACE(run_phase_1_to_5)
+void run_phase_1_to_5(signature_t *sign,
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE],
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b);
+
+#endif // SIGN_H
diff --git a/crypto_sign/aimer128s/m4stack/tree.c b/crypto_sign/aimer128s/m4stack/tree.c
new file mode 100644
index 00000000..0585e7fa
--- /dev/null
+++ b/crypto_sign/aimer128s/m4stack/tree.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: MIT
+
+#include "tree.h"
+#include "hash.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+//  Example of tree for [N = 8]
+//  x
+//  d = 0: 1
+//  d = 1: 2         3
+//  d = 2: 4   5     6     7
+//  d = 3: 8 9 10 11 12 13 14 15
+// Expand root_seed into the full seed tree; the leaves end up in nodes[AIMER_N - 1 .. 2 * AIMER_N - 2].
+void expand_tree(uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                 const uint8_t salt[AIMER_SALT_SIZE],
+                 size_t rep_index,
+                 const uint8_t root_seed[AIMER_SEED_SIZE])
+{
+  size_t index;
+  uint8_t buffer[AIMER_SEED_SIZE + 2];
+
+  hash_instance ctx, ctx_;
+  hash_init_prefix(&ctx_, HASH_PREFIX_4);
+  hash_update(&ctx_, salt, AIMER_SALT_SIZE);
+  // tree node i (1-based, root = 1) is stored in nodes[i - 1]
+  memcpy(nodes[0], root_seed, AIMER_SEED_SIZE);
+  buffer[0] = (uint8_t)(rep_index);
+  for (index = 1; index < AIMER_N; index++)
+  {
+    buffer[1] = (uint8_t)(index);
+    memcpy(buffer + 2, nodes[index - 1], AIMER_SEED_SIZE);
+
+    hash_ctx_clone(&ctx, &ctx_);
+    hash_update(&ctx, buffer, AIMER_SEED_SIZE + 2);
+    hash_final(&ctx);
+    hash_squeeze(&ctx, nodes[2 * index - 1], AIMER_SEED_SIZE << 1); // both children in one squeeze
+    hash_ctx_release(&ctx);
+  }
+  hash_ctx_release(&ctx_);
+}
+// Copy the sibling seed at every level from leaf cover_index up to below the root into reveal_path.
+void reveal_all_but(uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                    const uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                    size_t cover_index)
+{
+  size_t node = cover_index + AIMER_N; // 1-based tree index of the hidden leaf
+  size_t level = 0;
+  while (level < AIMER_LOGN)
+  {
+    const size_t sibling = node ^ 1;
+    memcpy(reveal_path[level], nodes[sibling - 1], AIMER_SEED_SIZE);
+    node /= 2; // step up to the parent
+    level++;
+  }
+}
+// Rebuild the seeds derivable from reveal_path; tree node i is stored in nodes[i - 2] (no root slot).
+void reconstruct_tree(uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE],
+                      const uint8_t salt[AIMER_SALT_SIZE],
+                      const uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                      size_t rep_index,
+                      size_t cover_index)
+{
+  size_t index, depth, path;
+  uint8_t buffer[AIMER_SEED_SIZE + 2];
+
+  hash_instance ctx, ctx_;
+  hash_init_prefix(&ctx_, HASH_PREFIX_4);
+  hash_update(&ctx_, salt, AIMER_SALT_SIZE);
+  // per level: drop in the revealed sibling of the hidden path, then expand the whole level
+  for (depth = 1; depth < AIMER_LOGN; depth++)
+  {
+    path = ((cover_index + AIMER_N) >> (AIMER_LOGN - depth)) ^ 1;
+    memcpy(nodes[path - 2], reveal_path[AIMER_LOGN - depth], AIMER_SEED_SIZE);
+    // the hidden-path node is expanded too, from whatever its slot holds; only revealed subtrees are real
+    for (index = (1U << depth); index < (2U << depth); index++)
+    {
+      buffer[0] = (uint8_t)(rep_index);
+      buffer[1] = (uint8_t)(index);
+      memcpy(buffer + 2, nodes[index - 2], AIMER_SEED_SIZE);
+
+      hash_ctx_clone(&ctx, &ctx_);
+      hash_update(&ctx, buffer, AIMER_SEED_SIZE + 2);
+      hash_final(&ctx);
+      hash_squeeze(&ctx, nodes[2 * index - 2], AIMER_SEED_SIZE << 1);
+      hash_ctx_release(&ctx);
+    }
+  }
+  hash_ctx_release(&ctx_);
+  // depth == AIMER_LOGN here: place the revealed sibling leaf (reveal_path[0])
+  path = ((cover_index + AIMER_N) >> (AIMER_LOGN - depth)) ^ 1;
+  memcpy(nodes[path - 2], reveal_path[AIMER_LOGN - depth], AIMER_SEED_SIZE);
+}
diff --git a/crypto_sign/aimer128s/m4stack/tree.h b/crypto_sign/aimer128s/m4stack/tree.h
new file mode 100644
index 00000000..f1354f19
--- /dev/null
+++ b/crypto_sign/aimer128s/m4stack/tree.h
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef TREE_H
+#define TREE_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+// Expands root_seed into all 2 * AIMER_N - 1 tree seeds; nodes[0] is the root, leaves come last.
+#define expand_tree AIMER_NAMESPACE(expand_tree)
+void expand_tree(uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                 const uint8_t salt[AIMER_SALT_SIZE],
+                 size_t rep_index,
+                 const uint8_t root_seed[AIMER_SEED_SIZE]);
+// Copies the sibling seeds along the path of leaf cover_index into reveal_path ([0] = leaf level).
+#define reveal_all_but AIMER_NAMESPACE(reveal_all_but)
+void reveal_all_but(uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                    const uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                    size_t cover_index);
+// Rebuilds seeds from reveal_path; this nodes array has no root entry (one element shorter).
+#define reconstruct_tree AIMER_NAMESPACE(reconstruct_tree)
+void reconstruct_tree(uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE],
+                      const uint8_t salt[AIMER_SALT_SIZE],
+                      const uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                      size_t rep_index,
+                      size_t cover_index);
+
+#endif // TREE_H
diff --git a/crypto_sign/aimer192f/m4speed/__asm_field.S b/crypto_sign/aimer192f/m4speed/__asm_field.S
new file mode 100644
index 00000000..26575c28
--- /dev/null
+++ b/crypto_sign/aimer192f/m4speed/__asm_field.S
@@ -0,0 +1,617 @@
+#include "params.h"
+
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+.text
+
+.align  2
+.global AIMER_NAMESPACE(GF_to_bytes)
+.global AIMER_NAMESPACE(GF_from_bytes)
+.global AIMER_NAMESPACE(GF_copy)
+.type   AIMER_NAMESPACE(GF_to_bytes), %function
+.type   AIMER_NAMESPACE(GF_from_bytes), %function
+.type   AIMER_NAMESPACE(GF_copy), %function
+AIMER_NAMESPACE(GF_to_bytes):    // three names, one body: copy six
+AIMER_NAMESPACE(GF_from_bytes):  // 32-bit words (24 bytes) from [R1]
+AIMER_NAMESPACE(GF_copy):        // to [R0]; only R2 and R3 are clobbered
+  out_p       .req R0
+  in_p        .req R1
+
+  .equ width, 4
+  // NOTE(review): word accesses; byte buffers are presumably 4-byte aligned or rely on unaligned access support - confirm
+  ldr.w R2, [in_p, #0 * width]
+  ldr.w R3, [in_p, #1 * width]
+  str.w R2, [out_p, #0 * width]
+  str.w R3, [out_p, #1 * width]
+
+  ldr.w R2, [in_p, #2 * width]
+  ldr.w R3, [in_p, #3 * width]
+  str.w R2, [out_p, #2 * width]
+  str.w R3, [out_p, #3 * width]
+
+  ldr.w R2, [in_p, #4 * width]
+  ldr.w R3, [in_p, #5 * width]
+  str.w R2, [out_p, #4 * width]
+  str.w R3, [out_p, #5 * width]
+
+  bx    lr
+
+  .unreq in_p
+  .unreq out_p
+
+.align  2
+.global AIMER_NAMESPACE(GF_set0)
+.type   AIMER_NAMESPACE(GF_set0), %function
+AIMER_NAMESPACE(GF_set0):  // store zero to six 32-bit words (24 bytes) at [R0]
+  out_p       .req R0
+
+  .equ width, 4
+
+  mov.w R2, #0  // R2 is the only register clobbered
+  str.w R2, [out_p, #0 * width]
+  str.w R2, [out_p, #1 * width]
+  str.w R2, [out_p, #2 * width]
+  str.w R2, [out_p, #3 * width]
+  str.w R2, [out_p, #4 * width]
+  str.w R2, [out_p, #5 * width]
+
+  bx    lr
+
+  .unreq out_p
+
+.align  2
+.global AIMER_NAMESPACE(GF_add)
+.type   AIMER_NAMESPACE(GF_add), %function
+AIMER_NAMESPACE(GF_add):  // [R0] = [R1] XOR [R2], six 32-bit words each
+  out_p       .req R0
+  in0_p       .req R1
+  in1_p       .req R2
+
+  .equ width, 4
+  // each word is read from both inputs before it is stored, so out_p may equal in0_p or in1_p
+  ldr.w R3,  [in0_p, #0 * width]
+  ldr.w R12, [in1_p, #0 * width]  // R3 and R12 are the only scratch registers
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #0 * width]
+
+  ldr.w R3,  [in0_p, #1 * width]
+  ldr.w R12, [in1_p, #1 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #1 * width]
+
+  ldr.w R3,  [in0_p, #2 * width]
+  ldr.w R12, [in1_p, #2 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #2 * width]
+
+  ldr.w R3,  [in0_p, #3 * width]
+  ldr.w R12, [in1_p, #3 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #3 * width]
+
+  ldr.w R3,  [in0_p, #4 * width]
+  ldr.w R12, [in1_p, #4 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #4 * width]
+
+  ldr.w R3,  [in0_p, #5 * width]
+  ldr.w R12, [in1_p, #5 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #5 * width]
+
+  bx    lr
+
+  .unreq out_p
+  .unreq in0_p
+  .unreq in1_p
+
+.macro or_shift_and in_a, con_a, off_a  // in_a = (in_a | (in_a << off_a)) & con_a
+  orr.w \in_a, \in_a, \in_a, lsl #\off_a
+  and.w \in_a, \in_a, \con_a  // con_a is used directly as the immediate operand
+.endm
+
+.align  2
+.global AIMER_NAMESPACE(GF_sqr_s)
+.type   AIMER_NAMESPACE(GF_sqr_s), %function
+AIMER_NAMESPACE(GF_sqr_s):  // [R0] = square of the six-word value at [R1], folded back to six words
+  out_p       .req R0
+  in_p        .req R1
+
+  in0         .req R2
+  in1         .req R3
+  in2         .req R4
+  in3         .req R5
+  in4         .req R6
+  in5         .req R7
+  in6         .req R8
+  in7         .req R9
+
+  .equ C0,    1431655765  // 0x55555555
+  .equ C1,    858993459   // 0x33333333
+  .equ C2,    252645135   // 0x0F0F0F0F
+  .equ C3,    16711935    // 0x00FF00FF
+  .equ C4,    4294967295  // 0xFFFFFFFF
+  .equ width, 4
+
+  push.w {R4-R10, lr}
+
+  ldr.w in0, [in_p, #2 * width]  // a[1]
+  ldr.w in2, [in_p, #3 * width]  // a[1], upper 32 bits
+  ldr.w in4, [in_p, #4 * width]  // a[2]
+  ldr.w in6, [in_p, #5 * width]  // a[2], upper 32 bits
+
+  lsr.w in1, in0, #16  // odd-numbered registers take the upper 16 bits
+  lsr.w in3, in2, #16
+  lsr.w in5, in4, #16
+  lsr.w in7, in6, #16
+
+  mov.w R10, C4  // used as "R10, lsr #16" below, i.e. mask 0x0000FFFF
+
+  and.w in0, in0, R10, lsr #16  // even-numbered registers keep the lower 16 bits
+  and.w in2, in2, R10, lsr #16
+  and.w in4, in4, R10, lsr #16
+  and.w in6, in6, R10, lsr #16
+
+  or_shift_and in0, C3, 8  // four steps (8, 4, 2, 1) move bit i to bit 2i
+  or_shift_and in1, C3, 8
+  or_shift_and in2, C3, 8
+  or_shift_and in3, C3, 8
+  or_shift_and in4, C3, 8
+  or_shift_and in5, C3, 8
+  or_shift_and in6, C3, 8
+  or_shift_and in7, C3, 8
+
+  or_shift_and in0, C2, 4
+  or_shift_and in1, C2, 4
+  or_shift_and in2, C2, 4
+  or_shift_and in3, C2, 4
+  or_shift_and in4, C2, 4
+  or_shift_and in5, C2, 4
+  or_shift_and in6, C2, 4
+  or_shift_and in7, C2, 4
+
+  or_shift_and in0, C1, 2
+  or_shift_and in1, C1, 2
+  or_shift_and in2, C1, 2
+  or_shift_and in3, C1, 2
+  or_shift_and in4, C1, 2
+  or_shift_and in5, C1, 2
+  or_shift_and in6, C1, 2
+  or_shift_and in7, C1, 2
+
+  or_shift_and in0, C0, 1
+  or_shift_and in1, C0, 1
+  or_shift_and in2, C0, 1
+  or_shift_and in3, C0, 1
+  or_shift_and in4, C0, 1
+  or_shift_and in5, C0, 1
+  or_shift_and in6, C0, 1
+  or_shift_and in7, C0, 1
+  // (low:high) in0:in1 = temp[2], in2:in3 = temp[3], in4:in5 = temp[4], in6:in7 = temp[5]
+  // t = temp[3] ^ ((temp[5] >> 57) ^ (temp[5] >> 62) ^ (temp[5] >> 63));
+  eor.w in2, in2, in7, lsr #25
+  eor.w in2, in2, in7, lsr #30
+  eor.w in2, in2, in7, lsr #31
+
+  // c[2] = temp[2] ^ temp[5];
+  eor.w in0, in0, in6
+  eor.w in1, in1, in7
+
+  // c[2] ^= (temp[5] << 7) | (temp[4] >> 57);
+  // c[2] ^= (temp[5] << 2) | (temp[4] >> 62);
+  // c[2] ^= (temp[5] << 1) | (temp[4] >> 63);
+  eor.w in0, in0, in5, lsr #25
+  eor.w in0, in0, in5, lsr #30
+  eor.w in0, in0, in5, lsr #31
+
+  eor.w in0, in0, in6, lsl #7
+  eor.w in0, in0, in6, lsl #2
+  eor.w in0, in0, in6, lsl #1
+
+  eor.w in1, in1, in6, lsr #25
+  eor.w in1, in1, in6, lsr #30
+  eor.w in1, in1, in6, lsr #31
+
+  eor.w in1, in1, in7, lsl #7
+  eor.w in1, in1, in7, lsl #2
+  eor.w in1, in1, in7, lsl #1
+
+  str.w in0, [out_p, #4 * width]
+  str.w in1, [out_p, #5 * width]
+
+  ldr.w in0, [in_p, #0 * width]  // a[0]
+  ldr.w in6, [in_p, #1 * width]  // a[0], upper 32 bits; in6:in7 become temp[1]
+
+  lsr.w in1, in0, #16
+  lsr.w in7, in6, #16
+
+  and.w in0, in0, R10, lsr #16
+  and.w in6, in6, R10, lsr #16
+
+  or_shift_and in0, C3, 8  // same four spreading steps; in0:in1 become temp[0]
+  or_shift_and in1, C3, 8
+  or_shift_and in6, C3, 8
+  or_shift_and in7, C3, 8
+
+  or_shift_and in0, C2, 4
+  or_shift_and in1, C2, 4
+  or_shift_and in6, C2, 4
+  or_shift_and in7, C2, 4
+
+  or_shift_and in0, C1, 2
+  or_shift_and in1, C1, 2
+  or_shift_and in6, C1, 2
+  or_shift_and in7, C1, 2
+
+  or_shift_and in0, C0, 1
+  or_shift_and in1, C0, 1
+  or_shift_and in6, C0, 1
+  or_shift_and in7, C0, 1
+
+  // c[1] = temp[1] ^ temp[4];
+  eor.w in6, in6, in4
+  eor.w in7, in7, in5
+
+  // c[1] ^= (temp[4] << 7) | (t >> 57);
+  // c[1] ^= (temp[4] << 2) | (t >> 62);
+  // c[1] ^= (temp[4] << 1) | (t >> 63);
+  eor.w in6, in6, in3, lsr #25
+  eor.w in6, in6, in3, lsr #30
+  eor.w in6, in6, in3, lsr #31
+
+  eor.w in6, in6, in4, lsl #7
+  eor.w in6, in6, in4, lsl #2
+  eor.w in6, in6, in4, lsl #1
+
+  eor.w in7, in7, in4, lsr #25
+  eor.w in7, in7, in4, lsr #30
+  eor.w in7, in7, in4, lsr #31
+
+  eor.w in7, in7, in5, lsl #7
+  eor.w in7, in7, in5, lsl #2
+  eor.w in7, in7, in5, lsl #1
+
+  str.w in6, [out_p, #2 * width]
+  str.w in7, [out_p, #3 * width]
+
+  // c[0] = temp[0] ^ t;
+  eor.w in0, in0, in2
+  eor.w in1, in1, in3
+
+  // c[0] ^= (t << 7);
+  // c[0] ^= (t << 2);
+  // c[0] ^= (t << 1);
+  eor.w in0, in0, in2, lsl #7
+  eor.w in0, in0, in2, lsl #2
+  eor.w in0, in0, in2, lsl #1
+
+  eor.w in1, in1, in2, lsr #25
+  eor.w in1, in1, in2, lsr #30
+  eor.w in1, in1, in2, lsr #31
+
+  eor.w in1, in1, in3, lsl #7
+  eor.w in1, in1, in3, lsl #2
+  eor.w in1, in1, in3, lsl #1
+
+  str.w in0, [out_p, #0 * width]
+  str.w in1, [out_p, #1 * width]
+
+  pop.w {R4-R10, pc}
+
+  // unlink register name (NOTE(review): in0-in7 are left defined)
+  .unreq in_p
+  .unreq out_p
+
+.macro lut_access0_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  and \sp1, \mask, \b0_1, lsr #\offset  // table byte offset selected by b0_1
+  and \sp0, \mask, \b0_0, lsr #\offset  // table byte offset selected by b0_0
+  add \sp1, \sp1, sp  // the table is addressed relative to sp
+  add \sp0, \sp0, sp
+  // first window: the accumulator in0_0..in0_2 is loaded here, nothing is shifted yet
+  ldr \out1_0, [\sp1, #0]
+  ldr \in0_2, [\sp1, #4]  // word 2 = high word of the b0_1 entry
+
+  ldr \in0_0, [\sp0, #0]  // word 0 = low word of the b0_0 entry
+  ldr \in0_1, [\sp0, #4]
+  eor \in0_1, \in0_1, \out1_0  // word 1 = b0_0 entry high ^ b0_1 entry low
+.endm
+
+.macro lut_access0_1_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  and \sp1, \mask, \b0_1, lsr #\offset
+  and \sp0, \mask, \b0_0, lsr #\offset
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+  // second window: the accumulator grows to four words while it is shifted left by 4
+  lsr \in0_3, \in0_2, #28  // word 3 starts as the bits shifted out of word 2
+  ldr \out1_0, [\sp1, #0]
+  ldr \out1_1, [\sp1, #4]
+
+  lsl \in0_2, \in0_2, #4
+  ldr \out0_0, [\sp0, #0]
+  ldr \out0_1, [\sp0, #4]
+
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  lsl \in0_1, \in0_1, #4
+  orr \in0_1, \in0_1, \in0_0, lsr #28
+  lsl \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0  // b0_1 entry goes into words 1 and 2
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0  // b0_0 entry goes into words 0 and 1
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro lut_access0_1 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  and \sp1, \mask, \b0_1, lsr #\offset
+  and \sp0, \mask, \b0_0, lsr #\offset
+  lsl \in0_3, \in0_3, #4  // shift the whole four-word accumulator left by 4 bits
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+
+  orr \in0_3, \in0_3, \in0_2, lsr #28
+  ldr \out1_0, [\sp1, #0]
+  ldr \out1_1, [\sp1, #4]
+
+  lsl \in0_2, \in0_2, #4
+  ldr \out0_0, [\sp0, #0]
+  ldr \out0_1, [\sp0, #4]
+
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  lsl \in0_1, \in0_1, #4
+  orr \in0_1, \in0_1, \in0_0, lsr #28
+  lsl \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0  // b0_1 entry goes into words 1 and 2
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0  // b0_0 entry goes into words 0 and 1
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro lut_access1 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  lsl \in0_3, \in0_3, #4
+  and \sp1, \mask, \b0_1, lsl #\offset  // left shift here (the other lut_access macros shift right)
+  orr \in0_3, \in0_3, \in0_2, lsr #28
+  and \sp0, \mask, \b0_0, lsl #\offset
+
+  lsl \in0_2, \in0_2, #4
+  add \sp1, \sp1, sp
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  add \sp0, \sp0, sp
+
+  lsl   \in0_1, \in0_1, #4
+  ldmia \sp1, {\out1_0-\out1_1}  // both words of each entry are loaded with one ldmia
+  orr   \in0_1, \in0_1, \in0_0, lsr #28
+  ldmia \sp0, {\out0_0-\out0_1}
+  lsl   \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0  // b0_1 entry goes into words 1 and 2
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0  // b0_0 entry goes into words 0 and 1
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, offset
+  and \mask, \one, \r0_ret, lsr #\offset  // isolate one bit of r0_ret
+  sub \mask, \zero, \mask                 // 0 - bit: all zeros or all ones
+  and \mask0_1, \b0_0, \mask
+  and \mask0_2, \b0_1, \mask
+  // (b0_1:b0_0 & mask) << offset, spread over three words
+  lsl \mask0_0, \mask0_1, #\offset
+  lsr \mask0_1, \mask0_1, #32 - \offset
+  orr \mask0_1, \mask0_1, \mask0_2, lsl #\offset
+  lsr \mask0_2, \mask0_2, #32 - \offset
+
+  eor \in0_1, \in0_1, \mask0_0  // XORed in starting at accumulator word 1
+  eor \in0_2, \in0_2, \mask0_1
+  eor \in0_3, \in0_3, \mask0_2
+.endm
+
+.macro last_mask1 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, offset
+  sub \mask, \zero, \r0_ret, lsr #\offset  // like last_mask0 but without the AND with one
+  and \mask0_1, \b0_0, \mask
+  and \mask0_2, \b0_1, \mask
+  // (b0_1:b0_0 & mask) << offset, spread over three words
+  lsl \mask0_0, \mask0_1, #\offset
+  lsr \mask0_1, \mask0_1, #32 - \offset
+  orr \mask0_1, \mask0_1, \mask0_2, lsl #\offset
+  lsr \mask0_2, \mask0_2, #32 - \offset
+
+  eor \in0_1, \in0_1, \mask0_0  // XORed in starting at accumulator word 1
+  eor \in0_2, \in0_2, \mask0_1
+  eor \in0_3, \in0_3, \mask0_2
+.endm
+
+.global AIMER_NAMESPACE(poly64_mul)  // NOTE(review): no .align 2 here, unlike the entry points above
+.type   AIMER_NAMESPACE(poly64_mul), %function
+AIMER_NAMESPACE(poly64_mul):  // XOR/shift product of the 64-bit values at [R0] (a) and [R1] (b); upper half -> [R2], lower half -> [R3]
+  t0_0    .req R0
+  t0_1    .req R1
+  t1_0    .req R2
+  t1_1    .req R3
+  t2_0    .req R4
+  t2_1    .req R5
+  t3_0    .req R6
+  t3_1    .req R7
+  t4_0    .req R8
+  t4_1    .req R9
+  t5_0    .req R10
+  t5_1    .req R11
+
+  r1_copy .req R12
+  t_base  .req R14 // lr doubles as a scratch pointer; it is saved by the push below
+
+  sp0     .req R12
+  sp1     .req R14
+
+  b0_0    .req R0
+  b0_1    .req R1
+
+  in0_0   .req R2
+  in0_1   .req R3
+  in0_2   .req R4
+  in0_3   .req R5
+
+  out0_0  .req R6
+  out0_1  .req R7
+  out1_0  .req R8
+  out1_1  .req R9
+
+  mask    .req R10
+
+  zero    .req R6
+  one     .req R7
+
+  r0_ret  .req R8
+
+  mask0_0 .req R9
+  mask0_1 .req R11
+  mask0_2 .req R12
+
+  push  {R4-R11, lr}
+  push  {R2-R3}  // keep both output pointers
+
+  ldr   t1_0, [R0, #0]
+  ldr   t1_1, [R0, #4]
+  push  {t1_1}  // keep the unmasked upper word of a for last_mask0/last_mask1
+
+  sub   sp, #128  // allocating space in the stack
+                  // 8 bytes * 16 = 128 bytes
+  mov   t_base, sp
+  mov   r1_copy, R1
+
+  mov   t0_0, #0  // t0 = table entry 0
+  mov   t0_1, #0
+
+  and   t1_1, #0x1FFFFFFF  // t1 = a with bits 61-63 cleared; those bits are handled by last_mask* below
+
+  lsl   t2_1, t1_1, #1  // t2 = t1 << 1 (entry 2)
+  orr   t2_1, t2_1, t1_0, lsr #31
+  lsl   t2_0, t1_0, #1
+
+  eor   t3_0, t1_0, t2_0  // t3 = entry 1 ^ entry 2 (entry 3)
+  eor   t3_1, t1_1, t2_1
+
+  lsl   t4_1, t2_1, #1  // t4 = t1 << 2 (entry 4)
+  orr   t4_1, t4_1, t2_0, lsr #31
+  lsl   t4_0, t2_0, #1
+
+  eor   t5_0, t1_0, t4_0  // t5 = entry 1 ^ entry 4 (entry 5)
+  eor   t5_1, t1_1, t4_1
+
+  stmia t_base!, {t0_0-t5_1}  // entries 0-5: 4 bytes X 12 elements  = 48 bytes
+
+  eor   t0_0, t2_0, t4_0  // t0 = entry 2 ^ entry 4 (entry 6)
+  eor   t0_1, t2_1, t4_1
+
+  lsl   t2_1, t4_1, #1  // t2 = t1 << 3 (entry 8)
+  orr   t2_1, t2_1, t4_0, lsr #31
+  lsl   t2_0, t4_0, #1
+
+  eor   t5_0, t2_0, t3_0  // t5 = entry 8 ^ entry 3 (entry 11)
+  eor   t5_1, t2_1, t3_1
+
+  eor   t3_0, t2_0, t1_0  // t3 = entry 8 ^ entry 1 (entry 9)
+  eor   t3_1, t2_1, t1_1
+
+  eor   t4_0, t4_0, t0_0  // t4 = entry 4 ^ entry 6 ...
+  eor   t4_1, t4_1, t0_1
+
+  eor   t4_0, t4_0, t2_0  // ... ^ entry 8 (entry 10)
+  eor   t4_1, t4_1, t2_1
+
+  eor   t1_0, t1_0, t0_0  // t1 = entry 1 ^ entry 6 (entry 7)
+  eor   t1_1, t1_1, t0_1
+
+  stmia t_base!, {t0_0-t5_1}  // entries 6-11: 4 bytes X 12 elements  = 48 bytes
+
+  eor   t1_0, t5_0, t0_0  // t1 = entry 11 ^ entry 6 (entry 13)
+  eor   t1_1, t5_1, t0_1
+
+  eor   t2_0, t2_0, t0_0  // t2 = entry 8 ^ entry 6 (entry 14)
+  eor   t2_1, t2_1, t0_1
+
+  eor   t3_0, t3_0, t0_0  // t3 = entry 9 ^ entry 6 (entry 15)
+  eor   t3_1, t3_1, t0_1
+
+  eor   t0_0, t4_0, t0_0  // t0 = entry 10 ^ entry 6 (entry 12)
+  eor   t0_1, t4_1, t0_1
+
+  stmia t_base, {t0_0-t3_1}  // entries 12-15: 4 bytes X 8 elements  = 32 bytes
+
+  ldmia r1_copy, {b0_0-b0_1}
+  mov   mask, #0x00000078  // selects a 4-bit window already multiplied by 8, the entry size in bytes
+  // eight 4-bit windows per word of b, taken from the most significant window down
+  lut_access0_0   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 25
+  lut_access0_1_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 21
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 17
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 13
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 9
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 5
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 1
+  lut_access1     sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 3
+
+  mov   zero, #0
+  mov   one,  #1
+  ldr   r0_ret, [sp, #128]  // upper word of a, pushed before the table was allocated
+  // add b << 61, b << 62, b << 63 for the bits of a that were cleared before building the table
+  last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 29
+  last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 30
+  last_mask1 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 31
+
+  ldr   R0, [sp, #132]  // saved R2
+  ldr   R1, [sp, #136]  // saved R3
+  add   sp, #140  // restoring stack
+
+  str   in0_0, [R1, #0]  // lower 64 bits of the product
+  str   in0_1, [R1, #4]
+
+  str   in0_2, [R0, #0]  // upper 64 bits of the product
+  str   in0_3, [R0, #4]
+
+  pop   {R4-R11, pc}
+
+  // unlink register name
+  .unreq t0_0
+  .unreq t0_1
+  .unreq t1_0
+  .unreq t1_1
+  .unreq t2_0
+  .unreq t2_1
+  .unreq t3_0
+  .unreq t3_1
+  .unreq t4_0
+  .unreq t4_1
+  .unreq t5_0
+  .unreq t5_1
+
+  .unreq r1_copy
+  .unreq t_base
+
+  .unreq sp0
+  .unreq sp1
+
+  .unreq b0_0
+  .unreq b0_1
+
+  .unreq in0_0
+  .unreq in0_1
+  .unreq in0_2
+  .unreq in0_3
+
+  .unreq out0_0
+  .unreq out0_1
+  .unreq out1_0
+  .unreq out1_1
+
+  .unreq mask
+
+  .unreq zero
+  .unreq one
+
+  .unreq r0_ret
+
+  .unreq mask0_0
+  .unreq mask0_1
+  .unreq mask0_2
diff --git a/crypto_sign/aimer192f/m4speed/aim2.c b/crypto_sign/aimer192f/m4speed/aim2.c
new file mode 100644
index 00000000..b5dbbc85
--- /dev/null
+++ b/crypto_sign/aimer192f/m4speed/aim2.c
@@ -0,0 +1,491 @@
+// SPDX-License-Identifier: MIT
+
+#include "aim2.h"
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+// inverse Mersenne S-box with e1 = 17: out = in ^ d, where d is
+// (2 ^ 17 - 1) ^ (-1) mod (2 ^ 192 - 1)
+// = 0xad6b56b5ab5ad5ad6ad6b56b5ab5ad5ad6ad6b56b5ab5ad5
+// built below as: ad6b56b5ab5ad5 ad6 ad6b56b5ab5ad5 ad6 ad6b56b5ab5ad5
+void GF_exp_invmer_e_1(GF out, const GF in)
+{
+  size_t i;
+  GF t1 = {0,}, t2 = {0,};
+  GF table_5 = {0,}, table_6 = {0,};
+  GF table_a = {0,}, table_b = {0,}, table_d = {0,};
+
+  // table_d = in ^ 2 (scratch until it becomes in ^ 13 below), t1 = in ^ 4
+  GF_sqr_s(table_d, in);
+  GF_sqr_s(t1, table_d);
+
+  // table_5 = in ^ 5
+  GF_mul_s(table_5, t1, in);
+  // table_6 = in ^ 6
+  GF_mul_s(table_6, table_5, in);
+  // table_a = in ^ 10 = (in ^ 5) ^ 2
+  GF_sqr_s(table_a, table_5);
+  // table_b = in ^ 11
+  GF_mul_s(table_b, table_a, in);
+  // table_d = in ^ 13 = (in ^ 11) * (in ^ 2)
+  GF_mul_s(table_d, table_b, table_d);
+
+  // t1 = in ^ (0xad): four squarings append one hex digit to the exponent
+  GF_sqr_s(t1, table_a);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t2 = in ^ (0xad 6), table_d = in ^ (0xad5)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t2, t1, table_6);
+  GF_mul_s(table_d, t1, table_5);
+
+  // t1 = in ^ (0xad6 b)
+  GF_sqr_s(t1, t2);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xad6b 5)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xad6b5 6)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_6);
+
+  // t1 = in ^ (0xad6b56 b)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xad6b56b 5)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xad6b56b5 a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xad6b56b5a b)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xad6b56b5ab 5)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_5);
+
+  // table_d = in ^ (0xad6b56b5ab5 ad5)
+  for (i = 0; i < 12; i++) // 12 squarings = 3 hex digits
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(table_d, t1, table_d);
+
+  // t1 = in ^ (0xad6b56b5ab5ad5 ad6)
+  GF_sqr_s(t1, table_d);
+  for (i = 1; i < 12; i++) // 11 more, 12 squarings in total
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xad6b56b5ab5ad5ad6 ad6b56b5ab5ad5)
+  for (i = 0; i < 56; i++) // 56 squarings = 14 hex digits
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xad6b56b5ab5ad5ad6ad6b56b5ab5ad5 ad6)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // out = in ^ (0xad6b56b5ab5ad5ad6ad6b56b5ab5ad5ad6 ad6b56b5ab5ad5)
+  for (i = 0; i < 56; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(out, t1, table_d);
+}
+
+// inverse Mersenne S-box with e2 = 47: out = in ^ d, where d is
+// (2 ^ 47 - 1) ^ (-1) mod (2 ^ 192 - 1)
+// = 0xddddddddddddbbbbbbbbbbbb777777777776eeeeeeeeeeed
+// built below as: dddd dddd dddd bb bb bb bb bb bb 77 77 77 77 77 76 ee ee ee ee ee ed
+void GF_exp_invmer_e_2(GF out, const GF in)
+{
+  size_t i;
+  GF t1 = {0,}, t2 = {0,};
+  GF table_6 = {0,}, table_7 = {0,};
+  GF table_b = {0,}, table_d = {0,}, table_e = {0,};
+
+  // table_d = in ^ 2 (scratch until it becomes in ^ 13 below), t1 = in ^ 3
+  GF_sqr_s(table_d, in);
+  GF_mul_s(t1, table_d, in);
+
+  // table_6 = (in ^ 3) ^ 2
+  GF_sqr_s(table_6, t1);
+  // table_7 = in ^ 7
+  GF_mul_s(table_7, table_6, in);
+  // table_b = in ^ 11 = (in ^ 4) * (in ^ 7)
+  GF_sqr_s(table_b, table_d);
+  GF_mul_s(table_b, table_b, table_7);
+  // table_d = in ^ 13
+  GF_mul_s(table_d, table_6, table_7);
+  // table_e = in ^ 14
+  GF_sqr_s(table_e, table_7);
+
+  // table_b = in ^ (0xbb)
+  GF_sqr_s(t1, table_b);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(table_b, t1, table_b);
+
+  // table_7 = in ^ (0x77), table_6 = in ^ (0x76)
+  GF_sqr_s(t1, table_7);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(table_6, t1, table_6);
+  GF_mul_s(table_7, t1, table_7);
+
+  // t2 = in ^ (0xdd)
+  GF_sqr_s(t1, table_d);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t2, t1, table_d);
+
+  // table_e = in ^ (0xee), table_d = in ^ (0xed)
+  GF_sqr_s(t1, table_e);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(table_d, t1, table_d);
+  GF_mul_s(table_e, t1, table_e);
+
+  // t2 = in ^ (0xdd dd)
+  GF_sqr_s(t1, t2);
+  for (i = 1; i < 8; i++) // 7 more, 8 squarings in total
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t2, t1, t2);
+
+  // t1 = in ^ (0xdddd dddd)
+  GF_sqr_s(t1, t2);
+  for (i = 1; i < 16; i++) // 15 more, 16 squarings in total
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xdddddddd dddd)
+  for (i = 0; i < 16; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xdddddddddddd bb)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xddddddddddddbb bb)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xddddddddddddbbbb bb)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xddddddddddddbbbbbb bb)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbb bb)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbb bb)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb 77)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb77 77)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb7777 77)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb777777 77)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb77777777 77)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb7777777777 76)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_6);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb777777777776 ee)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_e);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb777777777776ee ee)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_e);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb777777777776eeee ee)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_e);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb777777777776eeeeee ee)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_e);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb777777777776eeeeeeee ee)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_e);
+
+  // out = in ^ (0xddddddddddddbbbbbbbbbbbb777777777776eeeeeeeeee ed)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(out, t1, table_d);
+}
+
+// Mersenne exponentiation with e_star = 5: out = in ^ (2 ^ 5 - 1) = in ^ 31
+void GF_exp_mer_e_star(GF out, const GF in)
+{
+  GF t1 = {0,};
+  GF t2 = {0,};
+
+  // t2 = in ^ (2 ^ 2 - 1)
+  GF_sqr_s(t1, in);
+  GF_mul_s(t2, t1, in);
+
+  // t1 = in ^ (2 ^ 3 - 1)
+  GF_sqr_s(t1, t2);
+  GF_mul_s(t1, t1, in);
+
+  // out = in ^ (2 ^ 5 - 1) = ((in ^ 7) ^ 4) * (in ^ 3)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(out, t1, t2);
+}
+
+void generate_matrices_L_and_U( // fills matrix_L, matrix_U and vector_b from a hash of iv
+        GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF vector_b,
+        const uint8_t iv[AIM2_IV_SIZE])
+{
+  uint8_t buf[AIM2_NUM_BYTES_FIELD];
+  uint64_t ormask, lmask, umask;
+  hash_instance ctx;
+  GF temp = {0,};
+
+  // absorb iv once; every row and vector_b are squeezed from this one context
+  hash_init(&ctx);
+  hash_update(&ctx, iv, AIM2_IV_SIZE);
+  hash_final(&ctx);
+
+  for (size_t num = 0; num < AIM2_NUM_INPUT_SBOX; num++)
+  {
+    for (size_t row = 0; row < AIM2_NUM_BITS_FIELD; row++)
+    {
+      hash_squeeze(&ctx, buf, AIM2_NUM_BYTES_FIELD);
+      GF_from_bytes(temp, buf); // the same squeezed row feeds both L and U
+
+      ormask = ((uint64_t)1) << (row % 64);  // diagonal bit, set in L and U
+      lmask = ((uint64_t)-1) << (row % 64);  // bit positions >= row % 64
+      umask = ~lmask;                        // bit positions < row % 64
+
+      size_t inter = row / 64; // index of the word holding the diagonal bit
+      size_t col_word;
+      for (col_word = 0; col_word < inter; col_word++)
+      {
+        // words before the diagonal word: L is zero, U is full
+        matrix_L[num][row][col_word] = 0;
+        matrix_U[num][row][col_word] = temp[col_word];
+      }
+      matrix_L[num][row][inter] = (temp[inter] & lmask) | ormask;
+      matrix_U[num][row][inter] = (temp[inter] & umask) | ormask;
+      for (col_word = inter + 1; col_word < AIM2_NUM_WORDS_FIELD; col_word++)
+      {
+        // words after the diagonal word: L is full, U is zero
+        matrix_L[num][row][col_word] = temp[col_word];
+        matrix_U[num][row][col_word] = 0;
+      }
+    }
+  }
+  // NOTE(review): vector_b receives the squeezed bytes directly, without the GF_from_bytes step used for the rows
+  hash_squeeze(&ctx, (uint8_t *)vector_b, AIM2_NUM_BYTES_FIELD);
+  hash_ctx_release(&ctx);
+}
+
+void generate_matrix_LU(GF matrix_A[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+                        GF vector_b,                     // output, filled from iv
+                        const uint8_t iv[AIM2_IV_SIZE])  // read-only
+{
+  GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD]; // both temporaries are
+  GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD]; // automatic (stack) arrays
+
+  generate_matrices_L_and_U(matrix_L, matrix_U, vector_b, iv);
+  // matrix_A[num][i] = GF_transposed_matmul(matrix_U[num][i], matrix_L[num]) for every row i
+  for (size_t num = 0; num < AIM2_NUM_INPUT_SBOX; num++)
+  {
+    for (size_t i = 0; i < AIM2_NUM_BITS_FIELD; i++)
+    {
+      GF_transposed_matmul(matrix_A[num][i], matrix_U[num][i],
+                           (const GF *)matrix_L[num]);
+    }
+  }
+}
+
+void aim2(uint8_t ct[AIM2_NUM_BYTES_FIELD],        // output bytes
+          const uint8_t pt[AIM2_NUM_BYTES_FIELD],  // input bytes
+          const uint8_t iv[AIM2_IV_SIZE])          // seeds L, U and vector_b
+{
+  GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+
+  GF state[AIM2_NUM_INPUT_SBOX];
+  GF pt_GF = {0,}, ct_GF = {0,};
+  GF_from_bytes(pt_GF, pt);
+
+  // generate matrices L and U and vector_b from iv
+  generate_matrices_L_and_U(matrix_L, matrix_U, vector_b, iv);
+
+  // linear component: constant addition
+  GF_add(state[0], pt_GF, aim2_constants[0]);
+  GF_add(state[1], pt_GF, aim2_constants[1]);
+
+  // non-linear component: inverse Mersenne S-box
+  GF_exp_invmer_e_1(state[0], state[0]);
+  GF_exp_invmer_e_2(state[1], state[1]);
+
+  // linear component: affine layer (U is applied first, then L; both branches
+  // are then added together with vector_b)
+  GF_transposed_matmul(state[0], state[0], (const GF *)matrix_U[0]);
+  GF_transposed_matmul(state[0], state[0], (const GF *)matrix_L[0]);
+
+  GF_transposed_matmul(state[1], state[1], (const GF *)matrix_U[1]);
+  GF_transposed_matmul(state[1], state[1], (const GF *)matrix_L[1]);
+
+  GF_add(state[0], state[0], state[1]);
+  GF_add(state[0], state[0], vector_b);
+
+  // non-linear component: Mersenne S-box
+  GF_exp_mer_e_star(state[0], state[0]);
+
+  // linear component: feed-forward (the S-box output is added to pt)
+  GF_add(ct_GF, state[0], pt_GF);
+
+  GF_to_bytes(ct, ct_GF);
+}
+
+void aim2_sbox_outputs(GF sbox_outputs[AIM2_NUM_INPUT_SBOX], const GF pt)
+{
+  // linear component: constant addition (same first two steps as aim2)
+  GF_add(sbox_outputs[0], pt, aim2_constants[0]);
+  GF_add(sbox_outputs[1], pt, aim2_constants[1]);
+
+  // non-linear component: inverse Mersenne S-box, applied in place
+  GF_exp_invmer_e_1(sbox_outputs[0], sbox_outputs[0]);
+  GF_exp_invmer_e_2(sbox_outputs[1], sbox_outputs[1]);
+}
diff --git a/crypto_sign/aimer192f/m4speed/aim2.h b/crypto_sign/aimer192f/m4speed/aim2.h
new file mode 100644
index 00000000..b30d4cb9
--- /dev/null
+++ b/crypto_sign/aimer192f/m4speed/aim2.h
@@ -0,0 +1,435 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef AIM2_H
+#define AIM2_H
+
+#include "field.h"
+#include "params.h"
+#include <stdint.h>
+
+static const GF aim2_constants[AIM2_NUM_INPUT_SBOX] =
+{ // additive constants, one per S-box input; aim2.c adds entry i to the input
+  {0xc0ac29b7c97c50dd, 0xbe5466cf34e90c6c, 0x452821e638d01377}, // [0]: added before GF_exp_invmer_e_1
+  {0xd1310ba698dfb5ac, 0x9216d5d98979fb1b, 0x3f84d5b5b5470917} // [1]: added before GF_exp_invmer_e_2
+};
+
+static const GF aim2_e1_power_matrix[AIM2_NUM_BITS_FIELD] = // AIM2_NUM_BITS_FIELD rows, one GF value (three 64-bit words) each
+{ // NOTE(review): presumably the precomputed map used by GF_exp_invmer_e_1 -- confirm in aim2.c
+  {0x0000000000000001, 0x0000000000000000, 0x0000000000000000}, // row 0: only the lowest bit of the first word is set
+  {0x24187d60404121f6, 0x994d0c36800d12c1, 0x00911dd52a0924f1},
+  {0x764f49362db3c478, 0x3bcc2005010a3fa6, 0x402147d6af1a6ff4},
+  {0x1c0878591079091d, 0x9b08ffde1c878f59, 0x8ff70000000021c2},
+  {0xbc23dcb74c10198f, 0xe23fb48357412666, 0x70031ccb7f97795b},
+  {0xffa72d9a27550570, 0xc7dab56f7d5ade7c, 0x78cd4c6283845a4b},
+  {0x655b34aa00430d9a, 0x0150004209eea37c, 0xea5061fe40551141},
+  {0x291b4f90d5814c36, 0xcda4bfb158be9a9b, 0x0d4558cc51c4127c},
+  {0xbe4eb108521087f8, 0x855a49e49b1f9165, 0xfa15129aaa8d8745},
+  {0xef60386cb35ccf9a, 0x5115765ff710f9f9, 0x205677891921e135},
+  {0xbed705ee53ec571d, 0x97ef8c6dd0851236, 0xdfb8887b08ee7b6d},
+  {0x6731ce99be825c5a, 0x78665e68455482e1, 0x8b867f2046054b3e},
+  {0x008fe70500592609, 0x6419eeb2829f34c3, 0x8f95a35e28a915f4},
+  {0x4dd556b654d54730, 0x07e8d738dc4b2c41, 0x6de823272f319c70},
+  {0xc805945260585e93, 0xf3efb93595438399, 0x387f3dab97add8fd},
+  {0x8825784a2bb54db6, 0x8d1d21f68a9fed14, 0xd72c5de2e4375500},
+  {0xd9fbd5d41179e461, 0xbaa9f9428fe27896, 0x49998ea2c43c70ee},
+  {0xba1e061ac9218b6b, 0x93a1c1ea0a23984a, 0x145015f4bca9f514},
+  {0xb2829eadb1319c61, 0xf21008aca9c587af, 0x491dfc66b48bb406},
+  {0xdc192cc5729969e6, 0x19aeec2c6a3facb8, 0xeaf05f73c034e88f},
+  {0x5cc5d35af8af5039, 0x64bfd6b3c8401142, 0x4d083af0e0cecd4f},
+  {0xbcb663181c16e418, 0x9d73d6e08b40b1cf, 0xe6a19d2ea608b779},
+  {0x8f9e2660cdf64ce3, 0x6e790dfe030df1e7, 0xf36bdb76802d4809},
+  {0x24e27b21fdd534eb, 0x9b2abc8327bb58a6, 0xa60607784f3d2a8a},
+  {0x6470b72d839b493a, 0x3de3bd12dbc9236b, 0xab0e0e81db838cab},
+  {0x9fa25765dfa0dc0c, 0xa4866af77f3c1d39, 0xa22985fd177fb75e},
+  {0x1bd0dcf82dfcbaf6, 0x2778cab77faeae14, 0x144c9d871ac906e8},
+  {0xab206aa0299e585a, 0x1f2a1c115b2b24e7, 0xd683dc1df4f0e8e4},
+  {0x3db096486b11d3fc, 0x1d88f50f57fb1318, 0xfbdbd02cf211be3b},
+  {0x83c0ed680040dbeb, 0x01d5321e9c73822a, 0x5c78f9da86ddc253},
+  {0xed72eb240cfd7027, 0xe43295f2eab71065, 0x7dad74ed8a4daf27},
+  {0x593448e3f55865bc, 0x3dbc22ef1d415b62, 0xff617d36a6e04fd7},
+  {0x79fee82d5e5f6225, 0xe933e7ffba3ad69f, 0x11333262fecf9f21},
+  {0xaccf982f89364968, 0x961868954276eacd, 0x3903286905b4951a},
+  {0x15f9d8aff0e99b99, 0x37d7fc3823e38e15, 0x8f3cf305ce9c3317},
+  {0x5f1db90ec8ff178c, 0xef61eb5b69c0cf16, 0xd6d4428841ba2406},
+  {0x6c1d820160b3e589, 0x1655a37c12244e16, 0x1506fe0d42af221a},
+  {0x776220241d5f52f8, 0xbbd873a1a32d77fc, 0x2967ed932de2646d},
+  {0xb360b6c691f374f5, 0xe152921a89b1bb3a, 0x9bb32e5d9871acf2},
+  {0xbbae8029d2f0211d, 0xdfa58ed49cdc469a, 0x298aa1fd3b5fee94},
+  {0x311334572c4f58e2, 0xbd79cb94c83a4a65, 0x097731c2b9f63b2f},
+  {0x202f161d6f618d78, 0xb30f00f7d63d2b1c, 0xba3ba40cb586c147},
+  {0x6f6de8a66957b811, 0x933c64f745e4cb26, 0xe60acca62b3467da},
+  {0x2d52d8e03eadc408, 0x020b8ada8b0cbcfb, 0x97e520c15d31d866},
+  {0x17f79f53394c41f8, 0x8057746b55d4354d, 0x29944f234150b558},
+  {0xd48d6f8d466f4fb7, 0xe62aa6c05e099abf, 0xe72196d812cdf8ff},
+  {0x31086eee778187b7, 0x5f39e6312ab8e7fc, 0xd2794f291ba18edc},
+  {0x8bb7a2d05d52dd01, 0x898fee2a72a51691, 0xaf83c32d4f112cdf},
+  {0xf219effd62769131, 0x006ad7baac86fb08, 0xae1e7bed2f88d4eb},
+  {0x085e604007b4850e, 0x74969c7dc17959a0, 0x70af70f460fd6854},
+  {0x85048e661ea730d2, 0xccb4840c40f6c89e, 0xcb4b3836c98d0776},
+  {0xac7fadd0308807de, 0x93e5399425e1f409, 0x6cebcde031477957},
+  {0x12b09fb9d6bb04ff, 0xa5b0c0475b17d882, 0x9a2d1dc52a42cbfb},
+  {0x2a89655cb1fec3db, 0xb8a64412d508abdf, 0x3998b588ed04feab},
+  {0xa8687e88bff0829c, 0x671e2f2b99afe070, 0x2c08c6f71aa0fa09},
+  {0xe1ce5c820d6be145, 0x7c9485f929d3a113, 0x35a20e96293d131a},
+  {0xba53e0ea72f26b2a, 0x2c4dc2a431baa81b, 0x19674137360734db},
+  {0xde4269315e846bfb, 0x9ed583db0c4ca349, 0x315852fa0660ab68},
+  {0x00ae2ff5c859fcd1, 0x8a404e1ee645e1db, 0x9feadfee4a6a10b9},
+  {0x098454c0f608253b, 0xbf09d16ec3b96f79, 0xe63451db95697baf},
+  {0xa422cc6c5adc283f, 0xb7854c10a36c12d0, 0x9650b028e25b9107},
+  {0x8da1b75903dd2aa8, 0xef8f3a20c77f4c10, 0x11e6a8d176631e6f},
+  {0xe70563f20a26d72a, 0xc706a9184b4269ec, 0x01707c8cd370854b},
+  {0x4c497f712f722710, 0x40d97c17a9f96a81, 0x61ac088c7242b19b},
+  {0x9c1188e5b2c4043a, 0x15c4ce5e386918fd, 0xc2c19cddc8022f62},
+  {0x334dd52624b37647, 0x0ecfeb52b8db6b3a, 0x7cb0cc6a541d915f},
+  {0x0d2da3de5da05ab9, 0x4c8403040eb7a0a8, 0xaa43178d698e1d16},
+  {0x94dd24ac7d70454e, 0x19c81eacd2305f1d, 0xab7995a48e6230a2},
+  {0xc4c2698143f7ebe6, 0x9a9c3bf3c8dbc9bc, 0xef2ce69e69cf09cf},
+  {0xe4d55e8362bd6084, 0x4bd67382e024dfd0, 0x821aed870355bf63},
+  {0xd76139f98e468054, 0x61f1798f51310a13, 0x29046f782268e0dc},
+  {0xd415fc0d991dd093, 0x40c961038916982c, 0x50c6b0ef248e059b},
+  {0x9964bad18a8082f1, 0x666ff6785e18a4dd, 0x8ef30e5710f8282b},
+  {0xb414e2f6230594fe, 0x1bc6a73e670570f9, 0x58556965657d0723},
+  {0x7923079ff8bc88c9, 0x2009ba12607a4104, 0x79486291900310c9},
+  {0xbee4fd3a8ba864ef, 0x5df270cc7b675b45, 0x8fe410ae3a6416b4},
+  {0xed8ea038500ce1aa, 0x23cfffa4b08f7923, 0x24391c9872e1db52},
+  {0xea11414bd1ee6f54, 0x57a5ebe50ea4869b, 0x18f580aebbed4614},
+  {0x4d0c81d6ef843f2f, 0xfd169854c78d4b18, 0x7c36b2afccb84371},
+  {0x0c639f2dc76998e0, 0xdc8e28abec0a421f, 0xfba0c0a5251cd144},
+  {0x766dda3b823a1b74, 0x7f6d206bbd49261d, 0x710de4ad8beaa62e},
+  {0x7abd0b3c484d3910, 0x58abd14b6ee2e49b, 0x78652fe31e4d6d19},
+  {0x4dce3f2a407a25c2, 0x57d6ce10b19b7b99, 0x29cabd29d03528c3},
+  {0xf03c709f8b55bbc2, 0x10f449ee0641e483, 0xf60bd442dfd1a803},
+  {0x51d8a3af211b35bb, 0x2b0c872b328250e9, 0xb67d77e5c9d6d27a},
+  {0x9a731c8f091b2c24, 0x04cf41a716e1e225, 0x9b354a2d84899ec9},
+  {0x0748672bb3e504fb, 0xda648aaa478a326c, 0x0d85a4a55979e5ca},
+  {0xbb732bb90d147586, 0x446c43c25a19dc66, 0x18523f7f708eff36},
+  {0xc549edb1f37b1b15, 0x719aa23612aac7e4, 0x2c771e685e380ec2},
+  {0xe2b6b4207ad6a4b6, 0xf7cc2a116c9527ba, 0xdf6e5d55b2406221},
+  {0xb67a2baac610e044, 0xd425d94d1ebe4051, 0xb7bd1ce70c015395},
+  {0x64ff5ff72d64a1b1, 0xdaca2b8812d90ae6, 0x79a022efcc594eaf},
+  {0xc93cfa6de67bcacd, 0xa179dce6ffd14aec, 0x31528f0f0f3c6817},
+  {0x3ec18f7af7342039, 0xf8d7aa856a662ed9, 0x097b848460df8308},
+  {0xf037fa04d6ff2eb4, 0x1b6ec290719d4d0a, 0xe20e86a3b38d743e},
+  {0x8aea64bccc94d424, 0x2cc260f4f6b65bad, 0x355d31f6d901a260},
+  {0x140e5ae17cc96cb4, 0x620ee0a86b0eda0a, 0xb3fcecb29d358575},
+  {0x5ec85d1f29af07e2, 0xd6c8834f22331d6a, 0xcef37a820396e162},
+  {0xe344085d2eabc755, 0x6c6b136959c8ef7a, 0xbb22e260fa6a677a},
+  {0x7a64bfaa585ae30a, 0xe317efc967bbe220, 0x9a9780dfb02d4b7e},
+  {0x98c71744cd706ceb, 0xd177e9274ab5f551, 0x8353064dea82d011},
+  {0xff04c178eec23d3e, 0x2f460919349f2d47, 0x78fe5c7e69a969f2},
+  {0x40b0e4b5ba731b12, 0xdfdf6fb48e1eacca, 0x418adb73cc0cac43},
+  {0x07e5547b971dc85a, 0x9bb127d9e57350ef, 0xdb9801dd4d74063c},
+  {0x85c01e6cb0183fd9, 0x3ed03735d2254d39, 0x759b3422ff5ef8f1},
+  {0x6d72fa4b71c48c98, 0x3a991af37f04f9e1, 0xb32059432a68082f},
+  {0x3fe283302875d557, 0x8173481a149eee28, 0xeb7766a31793b0be},
+  {0x7acae2d67f591873, 0xb326c3aa2ed4173a, 0x1946cb0d5f62d04d},
+  {0x23bef9ae772d7f05, 0xe0bfc86b1d88610d, 0x74f165bcee4734eb},
+  {0x1d4726ce666680c3, 0x2ce0e6d607113532, 0xffc5de80c34f2df4},
+  {0xc2c05b149cdd1b58, 0x6944e26394cbe4d2, 0x97958f196f8c4c6b},
+  {0x270456c0b2e40aa0, 0x55d5c764d7670e84, 0x717d55b1ebf4aac6},
+  {0x20bc0c1aa67ad034, 0xd4281becc759401d, 0xa34c23a734c590ac},
+  {0x5847ae572b03bf5c, 0xfcac4377aa016371, 0xc37160769e1a862d},
+  {0x7dd17fc6d6f74010, 0x5b327c27eb1048e0, 0x9bdfc698b132189d},
+  {0xab7a432b47cdddcb, 0xa929bbd83ccbd1f9, 0x4d454da5089a34f2},
+  {0xb39461490efcedca, 0x53d60b8883762f77, 0x38149fe44801d6e1},
+  {0x7c94c03395823033, 0xdeeb603aad8b99f6, 0x6135272e4190f922},
+  {0x253f212e339c57b8, 0x4fbc0d5dd968a708, 0xf66bd639e3fb013b},
+  {0x6607bb8d9f1426d8, 0x0b9156b2a938e184, 0x1d6f7d7b46319a77},
+  {0x408e99af5df09232, 0xea04d07e17d71e98, 0x0961e3735a066ceb},
+  {0x0ac48cb89fc1d495, 0xe5ed5004fadbdcb6, 0xb371ec4e641dbdfd},
+  {0x870fba78bc9a5840, 0xa1372a9ae9b35641, 0xd7b9b31aedb9368d},
+  {0x9ec8171425817f91, 0x46d3a766e6d0c217, 0x6d410a83cdfd91e4},
+  {0xbaaf0e5bac52a284, 0x6184eb30dcfa0676, 0x10c8fb0ed6d0bdc9},
+  {0xac8814d3e0fe8707, 0x86d0ff1167e53b8a, 0x10e6600f84bbd4e6},
+  {0x747c0349c6a589dd, 0xf944627e4ef37152, 0x28e5a0f135a5a9bb},
+  {0x382e5c28e3026945, 0xee877613758af703, 0x2d922be5a1610e7f},
+  {0xcadae8499bb4cdb7, 0xd090031f77613a0d, 0xb775a4e76fd94b4f},
+  {0xd09a761e6898eced, 0x5669242c2f84d5da, 0x3d97c6bded80996e},
+  {0x2f95de059a47e03f, 0xfa75be47169ed83f, 0x87d30a6c8dff4a90},
+  {0xf8588b0cb7a0c692, 0xd246208d9f6dc4fb, 0xe36d575d6c2485c0},
+  {0x48c08c7013df5c58, 0x4d37effdea32dc30, 0xff80378ec9caad7d},
+  {0xf9e43db917658f34, 0xb76c0ff79e41f707, 0x8e4935c0b5c08083},
+  {0xb33f84c0bc9ef48d, 0xaab63f4f9f339a4c, 0xae55cf665e81d500},
+  {0x15e234561c4632f1, 0xe084e7a57d035829, 0xbaa1511cb0ed12a0},
+  {0x74f83ba7ec3568de, 0x1d7ecb2f352fdb0b, 0xd76964def60c29f6},
+  {0xd1c2b81f2e13a757, 0xf84d5af929439b5d, 0xc34a2d0878b81e8d},
+  {0x47767837fdba926b, 0x5683aec561752e96, 0x961ca0e7d4439beb},
+  {0x7d73c95d078b625f, 0x6e621c6b3817a9f1, 0xd300b482fda5d226},
+  {0x2cf83b998a66fb35, 0x4f0359eaa9684bfb, 0x2c460d7b4765cbc7},
+  {0xa5c0e6cf67395406, 0xb659d3e82276235d, 0x2c5c851229561369},
+  {0x3168901c3d8747a6, 0x4541eabd5d866402, 0xb768bb5b1a6b8379},
+  {0xb5fa4b6cdc308417, 0x8100841dbbeb59e8, 0x4db5eb632adc8553},
+  {0x2622070061628fa6, 0xc66a1ed278866e50, 0xfad328db6fb4acba},
+  {0x6734cb1adfc5db87, 0xd7f8cfed34d7e713, 0x259e5c52bef9b101},
+  {0xa077ba5e97f9e1c0, 0x21edc3275eed4b8f, 0xc2ddffec584d31bc},
+  {0xe8074b1519eb9faa, 0xa35f39294a8283ed, 0xffbfa9f0fdcce212},
+  {0x49406434389cd06b, 0x5241069e873cd010, 0xde4f448e7e3c47b6},
+  {0x8cb6dafda57a1b04, 0xb80b06fb012be0f6, 0x6c1f61ef626c5ee2},
+  {0x9e596d56ff39dd82, 0xfd823060d81e563c, 0xfe45b0659666e7bf},
+  {0x713e642578abac3b, 0x1e13b3773dddffd6, 0xf7ebe45d0b4ed62e},
+  {0x0fb29b505409913a, 0xbd66ecfa5053f05e, 0x5172fa12bbd062cf},
+  {0x7a8cd2f2af8db5c7, 0xf1c96d88f03f2f0c, 0xfaa8376f49a0abd5},
+  {0xacc980889b25b5e7, 0x2c34843e6a6d9f3d, 0xa6bf67c68037b6ca},
+  {0xaff8095311a13c10, 0x1d4a259b84ca7804, 0x3cbb9d0b61f7ff43},
+  {0x5662cd5d639dfe13, 0x89c27a983290bab8, 0x92a7d11e497af642},
+  {0x4157aad5c3c645ca, 0xf51297f3f77a30f2, 0x83c9dda7804ac4d8},
+  {0x4e84ffef7ca3be0a, 0x14a7ba9c76da7c08, 0x5c28dc6da027d5a0},
+  {0xb0964b96303be4e5, 0x4615a98b7f22a76d, 0xf222f844d2b37df9},
+  {0x802540711d4f5f7d, 0xf6649bae872a32e3, 0xaed6395da047f447},
+  {0x2f0953d8ce80f600, 0xdcf66d5eaf05752f, 0x209193bacdf14ef8},
+  {0xc6a3ef2332ce576d, 0xb9e01c6c4572a31f, 0xde9e30f16310efde},
+  {0xba02b8398971d6e6, 0xd1bab81c9c5221d6, 0x1c9c2d1f1b7f3f2b},
+  {0xedc228019fbdd60a, 0x2753c3a138bcb6d7, 0x786fd2ba67707c2f},
+  {0x448e2cb6c1407cbf, 0xf7b738377f0cfb97, 0x4c9212bdc0657e9c},
+  {0xc76e32691429c2f9, 0x490232f4e8c043ce, 0x217833736b683230},
+  {0xd1499dc75ffd2a9c, 0xd4b5f702de32b776, 0xd6dfbb898f67a374},
+  {0x3b5a28d4cff86b77, 0x806f6c0571138c8b, 0x54628239f0c0f09f},
+  {0xb8d45dd4a900ea0a, 0x2a9169078690c168, 0xb3657df1647fbd66},
+  {0x08189a6674f4c29c, 0x8915f4636dd5d112, 0x654dc7fe07da3107},
+  {0x5250e18c883794b0, 0x8828b68987cd0d9a, 0x300a18a7c772270d},
+  {0x51d33040e3efaa99, 0xd658da2cb0cb97b0, 0x39038890d157c0af},
+  {0x68f5a5cd07a32b53, 0x46b4f5ec1368cf94, 0xf2e0d23f40742f45},
+  {0x782b44a867a3f208, 0xae64fe82046cd425, 0xb78cf45fe171d435},
+  {0xde012b438c92c4d6, 0x4733810dca874273, 0x206a03d102c15302},
+  {0xbea371badf5b9173, 0x8cbfaa817fd4f717, 0x34bea5affcb319d8},
+  {0x1a26c2090378d01a, 0xf3d15fc5c66a7f39, 0x4de762da9a07d052},
+  {0x3486c8a67bccd6cc, 0x0d10351e2b0e18ac, 0x087106b5da2aba90},
+  {0xbd5c398105759654, 0x932e7ce0d2415118, 0xff7a9395dd694851},
+  {0x6f6615de424f584e, 0x6ca415cbf1ff0b9a, 0x509c3763be9bb7ea},
+  {0xe45a5c178e450e25, 0x48cc200c65039546, 0x2c2d872741a6e8d2},
+  {0x10a487ce7b7ba1f7, 0x8da8831a4adaa217, 0xcb608d431e73d316},
+  {0x480667a3a33a0923, 0x3a6fc63a03c45c96, 0xebed952f29ad80c0},
+  {0x8899df2b4edff733, 0x7b68b7ea18849999, 0xcedaa43cfb6f7f7b},
+  {0x356eff5782ed987f, 0xca6aab13ed43b0ce, 0x9dd8a4a5288bc18a},
+  {0x5ffc38d8fbfdcdb6, 0x697d4c0b82ce34af, 0x3509dc6ecc05993b},
+  {0x83905969be9090dd, 0x2125eb5bbd23d5da, 0x64224c3dfae48ffe},
+  {0xf54512d0b6691741, 0x0cbaec28b636b0bc, 0xbb1d6adcda1edefc},
+  {0x89ea6a9a58cddfdb, 0x845d179babdb73f7, 0xcf74a641c412cff5},
+  {0x65c9f3063d3b266e, 0x560354e0ca062952, 0xc6eb9b218ae96514},
+  {0x8e8c7412b3689e52, 0x99b2ec666a8a4e48, 0x5b4477de15147c03},
+};
+
+static const GF aim2_e2_power_matrix[AIM2_NUM_BITS_FIELD] = // AIM2_NUM_BITS_FIELD rows, one GF value (three 64-bit words) each
+{ // NOTE(review): presumably the precomputed map used by GF_exp_invmer_e_2 -- confirm in aim2.c
+  {0x0000000000000001, 0x0000000000000000, 0x0000000000000000}, // row 0: only the lowest bit of the first word is set
+  {0x75575b2a01927c2b, 0xe38f9eab8f685827, 0x782b0bd5192bca87},
+  {0xaebbaa0e79dffe28, 0xb3542e6782b8ce84, 0x8a972b1b32323be2},
+  {0x62cbd1af5c77da14, 0x3bbc6119877bbc1d, 0x8b6d73bce65ed541},
+  {0x4cd2ae2762f272cc, 0xdd4adc5bfc34ae6b, 0xa3f908a96f0fe449},
+  {0x60cfbdb9b6447e24, 0x721af8263082c01c, 0x68cb54e6fc7104af},
+  {0xa92867af3dc3b730, 0xb2608cc06efe34d0, 0xa3445078ace873ca},
+  {0xeedad86c96afe677, 0x52afb525bd42562f, 0x38cf8ddb97dc96e5},
+  {0x0a3b06f10bbc9562, 0x577b7a04e02c557b, 0x8be00f5765b7e908},
+  {0xaa72b3916d207e20, 0x50f0cce86025ffac, 0x09f7f935bbde0a04},
+  {0xac08b4e71f96174a, 0x16babbd24d02b260, 0x48e9d357af5ba717},
+  {0xe122c9c16beaa8be, 0x07043902949cfad6, 0xf78fd47b58608577},
+  {0x5c473c24ac8ca469, 0xb1da898afda7d7aa, 0xcac72d2cf21a9be3},
+  {0x7da00b91479d06e3, 0xc4c76d79d51eb15a, 0xb6c2e5796630269d},
+  {0xb42bb35d07e100ec, 0x19964fd51c07b0af, 0xffb88b0ef80a102f},
+  {0x60e6beb41a673a07, 0x75bec86a6c06b470, 0x61bb7f05fc39be82},
+  {0xfda48d0189cd0c13, 0x649054858d5374c9, 0xb770a8503a32e8a4},
+  {0x4a376d825f3006c4, 0x8896eb44124e97aa, 0x70e626bebfff29b5},
+  {0xe37a2f298ccf89c0, 0x3c3609a866d94979, 0x356c25d15f10d784},
+  {0x458ca204a347e41a, 0x59568e0a4da4e181, 0xab475a7c61d9014c},
+  {0x8c1a39eb79672160, 0xb373d43893fbd9fe, 0x41ba8d6a7097e9c8},
+  {0x7c01434b5f8e1448, 0x662bd055a2512d4f, 0x652c31c38e992dbc},
+  {0x1eae5c36fe075219, 0x82cb682598bcb1ea, 0x60daaa526a3e9947},
+  {0xf7ab17ab10f03bdf, 0x1e124b56f71a4c37, 0x75df607068cfdcff},
+  {0x1ffe54ada576e3c2, 0x384cb4e86120aaa8, 0xc6a4fefc642071df},
+  {0x1ae57be1013b7efb, 0x28d36534e13a369d, 0x75612cd220210f77},
+  {0xa84cfbb045298f2b, 0x733fdf0216082f1f, 0x0054b363e1fcdb09},
+  {0x8745e66041e62570, 0x980a16636c09d9b2, 0x51695306d0539b47},
+  {0xa67319655b027ef6, 0x4fd02799c207267a, 0x01587af4a65b6fd2},
+  {0x8d991698735bcc88, 0xf14dbd2b19f99a78, 0x6947a3b95199d2f4},
+  {0xe2906490948e4aff, 0x16b2ee7035d98706, 0x78f47845853b1ddc},
+  {0x58d9cdc2dd693cd9, 0x7e9c240b1b252019, 0x5cbd3d458a53ca24},
+  {0x24101759ff01d89b, 0xed8fdd27cdb2d47b, 0x11b0fa26e8d8a743},
+  {0x9da3e8ee96db2f59, 0x68285801543b4ac8, 0x618cc8ad53d51b65},
+  {0xf0b448478f472d56, 0x6044053c293513d2, 0xea2fb63a575a34cb},
+  {0x56bd7f9b430ca7de, 0xf883dbb4c18d2e0b, 0x6c8030ef1a38c730},
+  {0xf2e2c1396125acce, 0x882e926d399fcc33, 0x87e914f3049f22ce},
+  {0x7ec0b0443f81915d, 0x4573c52a818a44f3, 0xafc01f5cc8120f6b},
+  {0x924aed58bfbc33f1, 0x7cbf5617448b59a8, 0xef023ef380d782bb},
+  {0xed78ebbcc2543624, 0x4fbdf96f5a481d8a, 0x7dea022c85973850},
+  {0x4cd0fda73b73aaf8, 0xab714c84882fc5a6, 0x31a12db8b87c1a82},
+  {0x4f55b122e52b04af, 0x2b6abc206fcdea22, 0xeecc6a28e10f3cd9},
+  {0x773b7f263618ea81, 0xfedd6644251162ff, 0x20f124b39fffa2ef},
+  {0xc86672d34c7f9c99, 0xa1a9bedd91ba54ab, 0xe3164453cdbc1680},
+  {0x976193445cc61080, 0x4e8af4d9771f7fde, 0x2d6951afbad5a152},
+  {0xac8104ed45afc3e0, 0x2daa407aee0854a8, 0x93bf8a5f6332934b},
+  {0xadacd0145616a90f, 0x18fcdf471f8e446e, 0xb6cb1d657c5aee1f},
+  {0x39f4888a9f625046, 0x714ced776be006aa, 0x301aab64f4c07bac},
+  {0xfed94c87075ec99b, 0x6527495efabe5878, 0xae4ed05b44c346fa},
+  {0xe8089970ab84a9ad, 0xfa8ef420f612f142, 0x3033e1b424799c03},
+  {0x3de830d471a1c303, 0x1d4648963e64b5e8, 0xb7fc69c1308d744f},
+  {0xf917cc81a21178a2, 0xf51c71d20d3dde0f, 0xc755e70d903eca43},
+  {0xf988b4435c7e0659, 0xe8ec12c9411e644d, 0x011cff135dc46fe5},
+  {0x45eb42b4bc82e615, 0xbb1ea1d87fa2dcc8, 0xbbf258cddfcc5a4e},
+  {0x76c177c889777fa3, 0x771de5ab30476eca, 0xe3dd4d0ea4da4f41},
+  {0x62d43190a74afaab, 0x8c72e6cc25a0906d, 0x6560641e35c269c1},
+  {0x4a473706039e3353, 0x9270c15446432105, 0x508bd6dfcce33617},
+  {0x58e979ef836cb200, 0x64a108a5f68530dc, 0xeeb5a210610292b9},
+  {0x3e8a485122657a2d, 0xb7f7272f3423621b, 0x4c0e2f899ffc6f0f},
+  {0xb03f26ebad2101f3, 0x2bf27f00ccb827ad, 0xf2c32d1c9db42e29},
+  {0xcc5f196397e2bb63, 0x9cf1f95bba0e5fb0, 0xcffa723b8add78c2},
+  {0x5198cabd81774aa6, 0x79e142bd7c3981f1, 0xcfb65a6d42815d8a},
+  {0x91dc7af311207622, 0xf294a4f3c38f447e, 0xdfd67624b63f7997},
+  {0xfb2f51ed0b5b44c1, 0x6eeb2b229427682c, 0xfad555a3f1680200},
+  {0xd043eb034f7557ae, 0x89f917e3d7f663f1, 0xd7f51e2f59ce0302},
+  {0xd1738764ddee76f4, 0x28a966bea5ec647e, 0xa322c656d7bc27d1},
+  {0x0cd66c8dd29514f0, 0xb4e37bf2f01130a9, 0x7db6ecdc81a7a57f},
+  {0xc8cb28a44796dc78, 0x88eb0048501b3765, 0x8ff3fbd6d703c26d},
+  {0x2c5d68650ca4b6f5, 0xa8e391ce83198344, 0x8b9f3219506be9d0},
+  {0x911906127a1ba855, 0x30d5215961ac95e7, 0x71827dfac7504342},
+  {0x1ae4c2e2506d0712, 0xb5caffb8afbcda6e, 0x159080539f7f876e},
+  {0x86571676f6228cdb, 0x3a51f0bfed40380f, 0x5dec5a0cee962a54},
+  {0xf5c3339c01460504, 0x5d55382d4e349ecc, 0xcf81cc12df0b2c9e},
+  {0x89a775997037437a, 0xc86002223b57f27f, 0xfe795feb841f08ef},
+  {0x7da8a9b3f9f43fe4, 0x8494d51c6e215f43, 0xb703f044bc338b9c},
+  {0xf73c2c9d450a092f, 0xce0ae97084884a01, 0x9a647f6d5f970839},
+  {0x87c63573f869cdbb, 0x812d2d8e966e6911, 0x973b425ba1c66dfa},
+  {0x7de5a1e78d630e85, 0x765d7d5a4a6e3cb7, 0x28170eef2a846d99},
+  {0x0b0c630c0f59460d, 0x9c8758a9ee8db258, 0xd3589f9c034f75d5},
+  {0xe1a6d8e757067309, 0xd18498099be244d9, 0x9b10a894502fc4e1},
+  {0xfa14fe8a1dd59c3e, 0x6a9a93b0f1ac862a, 0xdbe4d8d065053ef7},
+  {0x5c94965ff0a8e28e, 0xc2a32a0d57f1faa2, 0x24dc5effe1fa9e37},
+  {0x6b404bba72a24d04, 0xbcd23a38f7981241, 0x93d0c9eb1b9a39ef},
+  {0xa53a198b9e74e59c, 0x17cb3bc05f9608d1, 0x21bcc23eb5e75655},
+  {0x05911f7d3220397f, 0x7915054dcb628314, 0x183a2a8400570cef},
+  {0x2a420bf34788186c, 0x8c83a2945ee3027b, 0x606a65c37a8f2fe3},
+  {0xccf4e83131d54a27, 0xc95466a498499126, 0xef9ac8206968b1f7},
+  {0xe457b2ff12256f1e, 0x57fd60a454e5f68d, 0xf3388bb1de5dd1c2},
+  {0x4addb3e322595749, 0x39e02bd59d8ae504, 0x20284c1ae2f1a65c},
+  {0x9fbb5574795cac4d, 0x9fedac975974c8bb, 0xd307ecf05fd4fd22},
+  {0x2505bb81200f8cbb, 0x2ac9d93c45830708, 0x11ec704af2c49861},
+  {0xfa1702dd351d3b22, 0xbe0dfc13d607f962, 0x82c611b8ccd1e9f2},
+  {0xb7ff038d58626bd7, 0x86e990a7d6acad3b, 0x5010d30fbe2d70a9},
+  {0xc42bda459ef1afca, 0x83c5891e3eff20a0, 0xdefbb485c364fd5a},
+  {0xaada4d9f943df0f1, 0x2618e51a8838b5fe, 0x8f45f0ffff45201f},
+  {0xb55e3891213f972c, 0xdb4f56b16dc4e905, 0x30fd462a4cf268fc},
+  {0x64e007b7010e8c80, 0x2d0de3d26a1748c3, 0xa2e01ed12648c113},
+  {0x5128d2b5c4bac674, 0xb80b46283a340508, 0x1c1f01fe24b17a66},
+  {0x4cb8ab976733595f, 0x403aca262ff117b0, 0xce1698b4f9a54376},
+  {0x7781e71d8805fdc4, 0x40c3c2110800e7a0, 0xe72e9e63999cc311},
+  {0xbb3e3e6501e45c00, 0x9e70bd7de6780a3b, 0x549416aa087fe4c5},
+  {0xae1da809d7eed055, 0x06ba5804e029b01c, 0x490555c99e76bd05},
+  {0x67f3afbbfeee6547, 0x1243b190c38432b1, 0xbab2fa8df7bf2943},
+  {0x6d7197464f15c83c, 0x9283ced1147a6a85, 0x96ba1a0e47d9dd96},
+  {0x9cbb90e485218006, 0x8b5ff83a0210b4d9, 0x1086afcf143b95c2},
+  {0xa07d026b378f963b, 0x2debd80b456cd3e3, 0xc7792b9bc7f54c4a},
+  {0x3d0bec8b88ba06b8, 0x0c13cdfdc4d01e9f, 0x6d256d1087b9c95e},
+  {0x9216a33ea47259ff, 0x2bde0cfcb54abe8d, 0xaaef421825f1b47b},
+  {0xa1aabb09b181ae0f, 0xc14d44d54e3620cd, 0xabb20e2a4d637bcb},
+  {0x2544eba1038d1b04, 0xda1f84aa9bc120c2, 0x41fd7f657a18c45d},
+  {0xadaff973f301d8c3, 0x87dae306486ff1a6, 0x60ec280a2570b8ff},
+  {0x624994b2704d4c20, 0x532232f1cf209482, 0x861b9c2a5a7d0a43},
+  {0x4513aa7db58aea4d, 0x89dfbe8c94798dde, 0xe735f37739441c13},
+  {0x2f534ce65fbe5d87, 0xf8fcb2432339f543, 0x8ea957572a77e395},
+  {0x2456c8d764e7c1a6, 0x7dc7567c507e2e18, 0xd29b13c5db1cd65a},
+  {0x885705a845bb1199, 0xebc702d7e1680421, 0x9aeba22f533cbac9},
+  {0x55c435f803ad3742, 0x695442fe576b3a09, 0x5ca02fab230ee023},
+  {0x0d446bb06a3cbf8b, 0x5bfc8414d84fff9a, 0x157e3384708408a8},
+  {0x7b212d17c02a4054, 0x2b14562733ba6900, 0x7965f7d93122eac0},
+  {0x349446294451df24, 0x2b91f57cdcc289f3, 0x829cb5a03cce767d},
+  {0x2f8e7fa84f0ad401, 0xb3a50f68cba8a638, 0xde440882f84bfd7a},
+  {0xd1ba1db41829f412, 0x9a2c4c23fb8538f7, 0x86ca32d92d99ecb9},
+  {0x8a6db99a627b227c, 0x633c81cf8e52a687, 0x8e58542594d7103e},
+  {0x4c5a928b8610d6cd, 0x6a38a81e5ec41b61, 0x05ac22b201c86322},
+  {0x283c4b53c14f39c0, 0x106fe171df2218c5, 0x4c077d33f17e0107},
+  {0x198b4c90bd33552f, 0x5853a4c2f74596db, 0x1018dd6bf21150d4},
+  {0x47c29e1c2f495b4c, 0x7ec84995131d545b, 0x49e53beaeb94dae0},
+  {0x2678b3f7b548fc9f, 0x63a6b9322f3a574c, 0xef6d85f1091f1aeb},
+  {0xf1391f569cd5fe90, 0x876e8ba956de0238, 0x6cd576e3b8ab6222},
+  {0x827547465967b775, 0x4197e1290368e412, 0xee63a7ef2156fb67},
+  {0x6cb2a919735b34d5, 0x6cc967b756d72395, 0x9a884a65ae74e811},
+  {0xbdebcb5fbfafafc0, 0xb7fc62a4c7947030, 0x554c36728822d8b6},
+  {0x025fef80c960792a, 0xc0f487dcc0ad8059, 0x9714504680995ad0},
+  {0x19ffb11f02502666, 0x482fc0fae8608ad2, 0x781175f6049c62ee},
+  {0xf1fece4f515854e7, 0x6dab52f7b6560106, 0xfa0028f50d672954},
+  {0x844afcd287c1ddba, 0x47234b529fe3ca41, 0x3ca221c08f88140a},
+  {0xfdbbeaaa02badeda, 0xf35a5e21992e2332, 0xa37f6d68d919b65f},
+  {0x6d218f603725748a, 0xb6df3c61103e9c3e, 0xbb7ac1cf4c1f4692},
+  {0x8e6d3eb058cfc260, 0xfbe2f6497287731a, 0xffa78646830d5ce0},
+  {0x8c07c328df449acd, 0x500ba217a7af529f, 0x19ab11b99a1a2a19},
+  {0x42de87a6001d7bc4, 0x6d65941a9ae5138b, 0xcb830271914ce1ee},
+  {0x25f950eb4e2b9669, 0x0c9f7a2279a16278, 0x86503e9de2e76202},
+  {0xedc0f3a86b732556, 0xc7995c7b3ec0ea66, 0x8a4d95b8d19c29ce},
+  {0x01b5ab0eca4d3189, 0xed7898b982b519ad, 0x24c5f841a769f11b},
+  {0xde3eefe1bad32178, 0x493a735c30942df4, 0x8b5ec5bed8e4d565},
+  {0xa974a9d616b752fa, 0x09d37b2ab193ca1c, 0x55b8aaf3af4481ba},
+  {0x84ca6915121b1e09, 0x8831e83e34fac643, 0x05e3db5a89049a2f},
+  {0x5375a9f4aefd0f44, 0xaf272fd031366078, 0xbbd286c07ed80632},
+  {0x9d101a493aa2ebc9, 0x67e3ddfaa73b2b94, 0x45bf06b13a5d6856},
+  {0x6469dfeed8b766bc, 0x41a958a8c84553fc, 0xc3665b3f060a6808},
+  {0x8bbd23b38d0cff32, 0x891f48bb2592fb3f, 0x24c6243ad065453e},
+  {0xf3d1cc12dcb4e302, 0x588dfaa464f518be, 0xfe082e8b4a39cf26},
+  {0x95c521746547be8e, 0x9cbbea72400d1df8, 0x0cfdac076655d579},
+  {0xa6c4c57375f48495, 0xd63f47b41907a3f7, 0x34e17c2df60668d7},
+  {0xa135ca38c26b95c3, 0x2aac9c6b01173258, 0x2d8499bf2ed7c23c},
+  {0xba02892976144352, 0x9e4d9906dc2ae94e, 0x6535b5091d0535a4},
+  {0x6ec4dba2c6f7e949, 0x02d65b71f7db3f86, 0x61c796b0290e7ff0},
+  {0xac044d22d442ff2e, 0x29d00d9db764b6ff, 0x9ec4ff5f21f3216d},
+  {0x26b3c84573c53161, 0xa3037316e91bf8bb, 0x251ed327edf11e39},
+  {0x2917804d2422970c, 0x16119362ba8934be, 0xafa94e1359c77cce},
+  {0x4eac35ec04e84a0b, 0x31b309e5e5d361a5, 0x4171e00956fd334e},
+  {0xa02b9fdd9f6b8162, 0xabd8bc110f4e1f52, 0x75578ed77238fedc},
+  {0xe73f9ad96bd8686d, 0xbdfc49ed2dba8097, 0x054c4bb989c34404},
+  {0xa0d01888aa5b1042, 0x8c33305a0dc075b1, 0x75f81fe0369e7b86},
+  {0x679d711aa88faab7, 0xb03f74deaa29c24c, 0x10a7766990689f5a},
+  {0x827d13e4d6310b6b, 0xc5a73641d06e47d1, 0xf2f0d06e14e2ab1f},
+  {0xcc968649ec63f05e, 0x17cda3a7fc25bfb2, 0x0df1338db25ee18e},
+  {0x7d4acd6c3cf8c18b, 0x4bd734fd562d48ad, 0xae50c4f72f542533},
+  {0xcf438bf70dbe4c62, 0x0019bcea28ce9270, 0xf687acda7ff8c960},
+  {0x5b24783c5318fd09, 0x5623189d31422de8, 0x862fd585eeb3e3f0},
+  {0xf98482f8df7d5e16, 0xccb9fb2d3745fbbf, 0x7d5e1bd364daa7d4},
+  {0x024849574a40a831, 0x48cae56880d67329, 0xfafa85469a93e6b3},
+  {0x944eae6b760bc534, 0x1d1d18f30fec24c3, 0xc64a74b4d0c3181e},
+  {0x19c52990a4e62d2d, 0x37b473c7ed759ef9, 0x04080c0ade3df738},
+  {0xfcc4062c7876c075, 0x48b4cf0b72aae741, 0x3889eef0b66c1bff},
+  {0x49c26471ae06da0b, 0x109da4749a70108b, 0x443b50c74915bd54},
+  {0xbe68bd432e672eb8, 0xbe737af593618ab7, 0x5d537d8c0da1a4e7},
+  {0xa3ca7393ce4e8d7c, 0x0fcf46d53a057c21, 0x7451a590ca6c1db1},
+  {0x79419444b1c149e5, 0x9d577a1e13240b2d, 0x24da1fd0d5db6e4d},
+  {0xe8c3caf37ad5170c, 0x423b4593d3f4c834, 0xff039eaad5042ae3},
+  {0x3bf5913b5615f7f5, 0x2d24b840238f2c84, 0x97bdc5bfeb1d53b7},
+  {0x53538b2293df4606, 0x169029e2d8675ec6, 0x9ab1ac25ee4982a4},
+  {0x75bd284d07f591f8, 0xccdd36b98d68786e, 0x9321ba79d2e56eed},
+  {0xe63236d17de7e69c, 0x9600d5f5cca5b08a, 0x8ff14c81e5d61843},
+  {0xdb079962536683c6, 0x35bb6068eb26bd37, 0xa614c37971ca2e4d},
+  {0xab78167ac83c4064, 0xb6a1928d6f89cdd1, 0xc97cc61d01ffe82f},
+  {0x83e6edd7a512e8b7, 0xe281601e537bc4ec, 0x19d35d2d57518cde},
+  {0xf737f3ddfa7fc9b2, 0x4a8f04a9cb4847be, 0x2946f3355994de91},
+  {0x577ca3baf1f7e1ba, 0x446729b10c51ed7c, 0xab637d9c6e3a5554},
+  {0x4e31798071664def, 0xec15c968e363630d, 0xd7ce5f867f758e48},
+  {0x10525e76bc5a5ed9, 0x1c8a384248ab4398, 0x8f7a522f2e2f3fc5},
+  {0xdee25133572d24bf, 0x37203f7f6c2e0e36, 0x89ba27d9b1233156},
+};
+
+#define GF_exp_invmer_e_1 AIMER_NAMESPACE(GF_exp_invmer_e_1)
+void GF_exp_invmer_e_1(GF out, const GF in); // inverse Mersenne S-box for S-box input 0; aim2.c calls it in place (out == in)
+#define GF_exp_invmer_e_2 AIMER_NAMESPACE(GF_exp_invmer_e_2)
+void GF_exp_invmer_e_2(GF out, const GF in); // inverse Mersenne S-box for S-box input 1; aim2.c calls it in place (out == in)
+#define GF_exp_mer_e_star AIMER_NAMESPACE(GF_exp_mer_e_star)
+void GF_exp_mer_e_star(GF out, const GF in); // Mersenne S-box that aim2() applies after the affine layer, in place (out == in)
+
+#define generate_matrices_L_and_U AIMER_NAMESPACE(generate_matrices_L_and_U)
+void generate_matrices_L_and_U( // called by aim2() to obtain its affine layer; definition not visible from this header
+        GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD], // aim2() passes this uninitialized and reads it afterwards
+        GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD], // aim2() passes this uninitialized and reads it afterwards
+        GF vector_b, // aim2() passes this zero-initialized and adds it after the matrix products
+        const uint8_t iv[AIM2_IV_SIZE]); // read-only (const) input of AIM2_IV_SIZE bytes
+
+#define generate_matrix_LU AIMER_NAMESPACE(generate_matrix_LU)
+void generate_matrix_LU(GF matrix_A[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD], // NOTE(review): presumably an output, one matrix per S-box input -- confirm against the definition
+                        GF vector_b, // NOTE(review): presumably an output like in generate_matrices_L_and_U -- confirm
+                        const uint8_t iv[AIM2_IV_SIZE]); // read-only (const) input of AIM2_IV_SIZE bytes
+
+#define aim2_sbox_outputs AIMER_NAMESPACE(aim2_sbox_outputs)
+void aim2_sbox_outputs(GF sbox_outputs[AIM2_NUM_INPUT_SBOX], const GF pt); // writes sbox_outputs[0] and [1]: pt plus constant i, then the i-th inverse Mersenne S-box
+
+#define aim2 AIMER_NAMESPACE(aim2)
+void aim2(uint8_t ct[AIM2_NUM_BYTES_FIELD], // output block, written through GF_to_bytes in aim2.c
+          const uint8_t pt[AIM2_NUM_BYTES_FIELD], // input block, read through GF_from_bytes in aim2.c
+          const uint8_t iv[AIM2_IV_SIZE]); // forwarded to generate_matrices_L_and_U to build the affine layer
+
+#endif // AIM2_H
diff --git a/crypto_sign/aimer192f/m4speed/api.h b/crypto_sign/aimer192f/m4speed/api.h
new file mode 100644
index 00000000..dba6ebd7
--- /dev/null
+++ b/crypto_sign/aimer192f/m4speed/api.h
@@ -0,0 +1,44 @@
+/*
+NIST-developed software is provided by NIST as a public service. You may use, copy, and distribute copies of the software in any medium, provided that you keep intact this entire notice. You may improve, modify, and create derivative works of the software or any portion of the software, and you may copy and distribute such modifications or works. Modified works should carry a notice stating that you changed the software and should note the date and nature of any such change. Please explicitly acknowledge the National Institute of Standards and Technology as the source of the software.
+ 
+NIST-developed software is expressly provided "AS IS." NIST MAKES NO WARRANTY OF ANY KIND, EXPRESS, IMPLIED, IN FACT, OR ARISING BY OPERATION OF LAW, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND DATA ACCURACY. NIST NEITHER REPRESENTS NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST DOES NOT WARRANT OR MAKE ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE OR THE RESULTS THEREOF, INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY, RELIABILITY, OR USEFULNESS OF THE SOFTWARE.
+ 
+You are solely responsible for determining the appropriateness of using and distributing the software and you assume all risks associated with its use, including but not limited to the risks and costs of program errors, compliance with applicable laws, damage to or loss of data, programs or equipment, and the unavailability or interruption of operation. This software is not intended to be used in any situation where a failure could cause risk of injury or damage to property. The software developed by NIST employees is not subject to copyright protection within the United States.
+*/
+
+#ifndef API_H
+#define API_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define CRYPTO_PUBLICKEYBYTES 48 // NOTE(review): presumably the public key size in bytes -- confirm against sign.c
+#define CRYPTO_SECRETKEYBYTES 72 // NOTE(review): presumably the secret key size in bytes -- confirm against sign.c
+#define CRYPTO_BYTES 13056 // NOTE(review): presumably the maximum signature size in bytes -- confirm against sign.c
+#define CRYPTO_ALGNAME "aimer192f" // same name as the crypto_sign/aimer192f directory this header lives in
+
+#define crypto_sign_keypair AIMER_NAMESPACE(crypto_sign_keypair)
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk); // NOTE(review): presumably fills pk and sk and returns 0 on success -- confirm in sign.c
+
+#define crypto_sign_signature AIMER_NAMESPACE(crypto_sign_signature)
+int crypto_sign_signature(uint8_t *sig, size_t *siglen, // NOTE(review): presumably outputs (detached signature and its length) -- confirm in sign.c
+        const uint8_t *m, size_t mlen, // read-only message buffer and its length
+        const uint8_t *sk); // read-only secret key
+
+#define crypto_sign AIMER_NAMESPACE(crypto_sign)
+int crypto_sign(uint8_t *sm, size_t *smlen, // NOTE(review): presumably outputs (signed message and its length) -- confirm in sign.c
+        const uint8_t *m, size_t mlen, // read-only message buffer and its length
+        const uint8_t *sk); // read-only secret key
+
+#define crypto_sign_verify AIMER_NAMESPACE(crypto_sign_verify)
+int crypto_sign_verify(const uint8_t *sig, size_t siglen, // read-only signature buffer and its length
+        const uint8_t *m, size_t mlen, // read-only message buffer and its length
+        const uint8_t *pk); // read-only public key; NOTE(review): presumably returns 0 when the signature is valid -- confirm in sign.c
+
+#define crypto_sign_open AIMER_NAMESPACE(crypto_sign_open)
+int crypto_sign_open(uint8_t *m, size_t *mlen, // NOTE(review): presumably outputs (recovered message and its length) -- confirm in sign.c
+        const uint8_t *sm, size_t smlen, // read-only signed-message buffer and its length
+        const uint8_t *pk); // read-only public key
+
+#endif
diff --git a/crypto_sign/aimer192f/m4speed/field.c b/crypto_sign/aimer192f/m4speed/field.c
new file mode 100644
index 00000000..91ee3d55
--- /dev/null
+++ b/crypto_sign/aimer192f/m4speed/field.c
@@ -0,0 +1,482 @@
+// SPDX-License-Identifier: MIT
+
+#include "field.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define mask_64     0x00000000ffffffff // low 32 bits of a 64-bit word
+#define mask_final  0x000000000000ffff // low 16 bits
+
+#define mask0_64    0x000000ff000000ff // low byte of each 32-bit lane
+#define mask0       0x000000ff
+
+#define mask1_64    0x000f000f000f000f // low nibble of each 16-bit lane
+#define mask1       0x000f000f
+
+#define mask2_64    0x0303030303030303 // low two bits of each byte
+#define mask2       0x03030303
+
+#define mask3_64    0x1111111111111111 // low bit of each nibble
+#define mask3       0x11111111
+
+#define zero_padding(x0, mask1, mask2, mask3) /* spread an 8-bit x0 so that bit i moves to bit 4*i */ \
+        x0 = (x0 | (x0 << 12)) & mask1; \
+        x0 = (x0 | (x0 << 6 )) & mask2; \
+        x0 = (x0 | (x0 << 3 )) & mask3;
+
+#define inv_zero_padding(x0, mask0, mask1, mask2, mask_final) /* gather bits 0, 4, ..., 60 of x0 into its low 16 bits */ \
+        x0 = (x0 | (x0 >> 3 )) & mask2; \
+        x0 = (x0 | (x0 >> 6 )) & mask1; \
+        x0 = (x0 | (x0 >> 12)) & mask0; \
+        x0 = (x0 | (x0 >> 24)) & mask_final;
+
+void GF_mul(GF c, const GF a, const GF b) // c = a * b, reduced by replacing x^192 with x^7 + x^2 + x + 1
+{
+  uint64_t t[3] = {0,};
+  uint64_t temp[6] = {0,}; // unreduced 6-word product, temp[0] least significant
+  uint64_t sub[6] = {0,}; // pairwise word sums of a and of b, operands of the cross products
+
+  sub[0] = a[0] ^ a[1];
+  sub[1] = b[0] ^ b[1];
+  sub[2] = a[0] ^ a[2];
+  sub[3] = b[0] ^ b[2];
+  sub[4] = a[1] ^ a[2];
+  sub[5] = b[1] ^ b[2];
+
+  poly64_mul(&a[0], &b[0], &t[0], &temp[0]); // a0*b0: c1 output -> t[0], c0 output -> temp[0]
+  poly64_mul(&a[1], &b[1], &t[2], &t[1]); // a1*b1
+  t[0] ^= t[1];
+
+  poly64_mul(&a[2], &b[2], &temp[5], &t[1]); // a2*b2: c1 output becomes the top word temp[5]
+  t[1] ^= t[2];
+
+  temp[1] = t[0] ^ temp[0]; // seed the middle words with the word-product contributions
+  temp[2] = t[1] ^ temp[1];
+  temp[4] = temp[5] ^ t[1];
+  temp[3] = temp[4] ^ t[0];
+
+  poly64_mul(&sub[0], &sub[1], &t[1], &t[0]); // (a0+a1)*(b0+b1) lands on words 1..2
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+
+  poly64_mul(&sub[2], &sub[3], &t[1], &t[0]); // (a0+a2)*(b0+b2) lands on words 2..3
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+
+  poly64_mul(&sub[4], &sub[5], &t[1], &t[0]); // (a1+a2)*(b1+b2) lands on words 3..4
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+
+  t[0] = temp[3] ^ ((temp[5] >> 57) ^ (temp[5] >> 62) ^ (temp[5] >> 63)); // pre-fold the top bits of temp[5] that the shifts below push past word 2
+
+  c[2] = temp[2] ^ temp[5]; // fold high words (temp[5], temp[4], t[0]) with shifts 0, 7, 2, 1
+  c[2] ^= (temp[5] << 7) | (temp[4] >> 57);
+  c[2] ^= (temp[5] << 2) | (temp[4] >> 62);
+  c[2] ^= (temp[5] << 1) | (temp[4] >> 63);
+
+  c[1] = temp[1] ^ temp[4];
+  c[1] ^= (temp[4] << 7) | (t[0] >> 57);
+  c[1] ^= (temp[4] << 2) | (t[0] >> 62);
+  c[1] ^= (temp[4] << 1) | (t[0] >> 63);
+
+  c[0] = temp[0] ^ t[0];
+  c[0] ^= (t[0] << 7);
+  c[0] ^= (t[0] << 2);
+  c[0] ^= (t[0] << 1);
+}
+
+void GF_mul_add(GF c, const GF a, const GF b) // c ^= a * b, same product and reduction steps as GF_mul
+{
+  uint64_t t[3] = {0,};
+  uint64_t temp[6] = {0,}; // unreduced 6-word product, temp[0] least significant
+  uint64_t sub[6] = {0,}; // pairwise word sums of a and of b, operands of the cross products
+
+  sub[0] = a[0] ^ a[1];
+  sub[1] = b[0] ^ b[1];
+  sub[2] = a[0] ^ a[2];
+  sub[3] = b[0] ^ b[2];
+  sub[4] = a[1] ^ a[2];
+  sub[5] = b[1] ^ b[2];
+
+  poly64_mul(&a[0], &b[0], &t[0], &temp[0]); // a0*b0: c1 output -> t[0], c0 output -> temp[0]
+  poly64_mul(&a[1], &b[1], &t[2], &t[1]); // a1*b1
+  t[0] ^= t[1];
+
+  poly64_mul(&a[2], &b[2], &temp[5], &t[1]); // a2*b2: c1 output becomes the top word temp[5]
+  t[1] ^= t[2];
+
+  temp[1] = t[0] ^ temp[0]; // seed the middle words with the word-product contributions
+  temp[2] = t[1] ^ temp[1];
+  temp[4] = temp[5] ^ t[1];
+  temp[3] = temp[4] ^ t[0];
+
+  poly64_mul(&sub[0], &sub[1], &t[1], &t[0]); // (a0+a1)*(b0+b1) lands on words 1..2
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+
+  poly64_mul(&sub[2], &sub[3], &t[1], &t[0]); // (a0+a2)*(b0+b2) lands on words 2..3
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+
+  poly64_mul(&sub[4], &sub[5], &t[1], &t[0]); // (a1+a2)*(b1+b2) lands on words 3..4
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+
+  t[0] = temp[3] ^ ((temp[5] >> 57) ^ (temp[5] >> 62) ^ (temp[5] >> 63)); // pre-fold the top bits of temp[5] that the shifts below push past word 2
+
+  c[2] ^= temp[2] ^ temp[5]; // accumulate into c rather than overwrite it
+  c[2] ^= (temp[5] << 7) | (temp[4] >> 57);
+  c[2] ^= (temp[5] << 2) | (temp[4] >> 62);
+  c[2] ^= (temp[5] << 1) | (temp[4] >> 63);
+
+  c[1] ^= temp[1] ^ temp[4];
+  c[1] ^= (temp[4] << 7) | (t[0] >> 57);
+  c[1] ^= (temp[4] << 2) | (t[0] >> 62);
+  c[1] ^= (temp[4] << 1) | (t[0] >> 63);
+
+  c[0] ^= temp[0] ^ t[0];
+  c[0] ^= (t[0] << 7);
+  c[0] ^= (t[0] << 2);
+  c[0] ^= (t[0] << 1);
+}
+
+void GF_transposed_matmul(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]) // c = XOR of the rows b[k] whose bit k is set in a
+{
+  const uint64_t *a_ptr = a;
+  const GF *b_ptr = b; // advances four rows per inner iteration
+
+  uint64_t temp_c0 = 0;
+  uint64_t temp_c1 = 0;
+  uint64_t temp_c2 = 0;
+  uint64_t mask;
+  for (size_t i = AIM2_NUM_WORDS_FIELD; i; --i, ++a_ptr) // one pass per 64-bit word of a
+  {
+    uint64_t index = *a_ptr;
+    for (size_t j = AIM2_NUM_BITS_WORD; j; j -= 4, index >>= 4, b_ptr += 4) // consume four bits of a at a time, lowest first
+    {
+      mask = -(index & 1); // all ones when the bit is set, zero otherwise: selects the row without branching
+      temp_c0 ^= (b_ptr[0][0] & mask);
+      temp_c1 ^= (b_ptr[0][1] & mask);
+      temp_c2 ^= (b_ptr[0][2] & mask);
+
+      mask = -((index >> 1) & 1);
+      temp_c0 ^= (b_ptr[1][0] & mask);
+      temp_c1 ^= (b_ptr[1][1] & mask);
+      temp_c2 ^= (b_ptr[1][2] & mask);
+
+      mask = -((index >> 2) & 1);
+      temp_c0 ^= (b_ptr[2][0] & mask);
+      temp_c1 ^= (b_ptr[2][1] & mask);
+      temp_c2 ^= (b_ptr[2][2] & mask);
+
+      mask = -((index >> 3) & 1);
+      temp_c0 ^= (b_ptr[3][0] & mask);
+      temp_c1 ^= (b_ptr[3][1] & mask);
+      temp_c2 ^= (b_ptr[3][2] & mask);
+    }
+  }
+  c[0] = temp_c0; // c is written only after a has been fully read
+  c[1] = temp_c1;
+  c[2] = temp_c2;
+}
+
+void GF_transposed_matmul_add(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]) // c ^= XOR of the rows b[k] whose bit k is set in a
+{
+  const uint64_t *a_ptr = a;
+  const GF *b_ptr = b; // advances four rows per inner iteration
+
+  uint64_t temp_c0 = 0;
+  uint64_t temp_c1 = 0;
+  uint64_t temp_c2 = 0;
+  uint64_t mask;
+  for (size_t i = AIM2_NUM_WORDS_FIELD; i; --i, ++a_ptr) // one pass per 64-bit word of a
+  {
+    uint64_t index = *a_ptr;
+    for (size_t j = AIM2_NUM_BITS_WORD; j; j -= 4, index >>= 4, b_ptr += 4) // consume four bits of a at a time, lowest first
+    {
+      mask = -(index & 1); // all ones when the bit is set, zero otherwise: selects the row without branching
+      temp_c0 ^= (b_ptr[0][0] & mask);
+      temp_c1 ^= (b_ptr[0][1] & mask);
+      temp_c2 ^= (b_ptr[0][2] & mask);
+
+      mask = -((index >> 1) & 1);
+      temp_c0 ^= (b_ptr[1][0] & mask);
+      temp_c1 ^= (b_ptr[1][1] & mask);
+      temp_c2 ^= (b_ptr[1][2] & mask);
+
+      mask = -((index >> 2) & 1);
+      temp_c0 ^= (b_ptr[2][0] & mask);
+      temp_c1 ^= (b_ptr[2][1] & mask);
+      temp_c2 ^= (b_ptr[2][2] & mask);
+
+      mask = -((index >> 3) & 1);
+      temp_c0 ^= (b_ptr[3][0] & mask);
+      temp_c1 ^= (b_ptr[3][1] & mask);
+      temp_c2 ^= (b_ptr[3][2] & mask);
+    }
+  }
+  c[0] ^= temp_c0; // accumulate into the existing value of c
+  c[1] ^= temp_c1;
+  c[2] ^= temp_c2;
+}
+
+static void poly64_mul_s(uint64_t *z1, uint64_t *z0, uint64_t x, uint64_t y) // carry-less product of x and y: low word to *z0, high word to *z1
+{
+  // split x: x4 starts as the high 32 bits, x0 as the low 32 bits
+  uint32_t x4 = x >> 32;
+
+  uint32_t x0 = x & mask_64;
+  uint32_t x1 = (x0 >> 8) & mask0;
+  uint32_t x2 = (x0 >> 16) & mask0;
+  uint32_t x3 = (x0 >> 24) & mask0;
+  x0 &= mask0; // x0..x3 now hold the four bytes of the low half
+
+  // bytes of the high half of x
+  uint32_t x5 = (x4 >> 8) & mask0;
+  uint32_t x6 = (x4 >> 16) & mask0;
+  uint32_t x7 = (x4 >> 24) & mask0;
+  x4 &= mask0;
+
+  // split y the same way: y4 starts as the high 32 bits, y0 as the low 32 bits
+  uint32_t y4 = y >> 32;
+
+  uint32_t y0 = y & mask_64;
+  uint32_t y1 = (y0 >> 8) & mask0;
+  uint32_t y2 = (y0 >> 16) & mask0;
+  uint32_t y3 = (y0 >> 24) & mask0;
+  y0 &= mask0;
+
+  // bytes of the high half of y
+  uint32_t y5 = (y4 >> 8) & mask0;
+  uint32_t y6 = (y4 >> 16) & mask0;
+  uint32_t y7 = (y4 >> 24) & mask0;
+  y4 &= mask0;
+
+  // x padding: spread every byte to one bit per nibble
+  zero_padding(x0, mask1, mask2, mask3);
+  zero_padding(x1, mask1, mask2, mask3);
+  zero_padding(x2, mask1, mask2, mask3);
+  zero_padding(x3, mask1, mask2, mask3);
+  zero_padding(x4, mask1, mask2, mask3);
+  zero_padding(x5, mask1, mask2, mask3);
+  zero_padding(x6, mask1, mask2, mask3);
+  zero_padding(x7, mask1, mask2, mask3);
+
+  // y padding
+  zero_padding(y0, mask1, mask2, mask3);
+  zero_padding(y1, mask1, mask2, mask3);
+  zero_padding(y2, mask1, mask2, mask3);
+  zero_padding(y3, mask1, mask2, mask3);
+  zero_padding(y4, mask1, mask2, mask3);
+  zero_padding(y5, mask1, mask2, mask3);
+  zero_padding(y6, mask1, mask2, mask3);
+  zero_padding(y7, mask1, mask2, mask3);
+
+  // x0-3 * y0-3 (low halves); a0..a3 collect the product in spread form
+  uint64_t a0 = ((uint64_t)x0 * (uint64_t)y0) & mask3_64; // keeping the low bit of each nibble turns the integer product into an XOR sum
+  uint64_t a1 = ((((uint64_t)x0 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y0) & mask3_64));
+  a0 ^= (a1 << 32); // a byte offset is 32 spread bits: low part stays in this word, rest carries up
+  a1 = a1 >> 32;
+  a1 ^= ((((uint64_t)x0 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x1 * (uint64_t)y1) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y0) & mask3_64));
+
+  uint64_t a2 = ((((uint64_t)x0 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y2) & mask3_64) ^
+                 (((uint64_t)x2 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y0) & mask3_64));
+  a1 ^= (a2 << 32);
+  a2 = a2 >> 32;
+  a2 ^= ((((uint64_t)x1 * (uint64_t)y3) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x3 * (uint64_t)y1) & mask3_64));
+
+  uint64_t a3 = ((((uint64_t)x2 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y2) & mask3_64));
+  a2 ^= (a3 << 32);
+  a3 = a3 >> 32;
+
+  a3 ^= ((uint64_t)x3 * (uint64_t)y3) & mask3_64;
+
+  // x4-7 * y4-7 (high halves), same schedule as above
+  uint64_t b0 = ((uint64_t)x4 * (uint64_t)y4) & mask3_64;
+  uint64_t b1 = ((((uint64_t)x4 * (uint64_t)y5) & mask3_64) ^
+                 (((uint64_t)x5 * (uint64_t)y4) & mask3_64));
+  b0 ^= (b1 << 32);
+  b1 = b1 >> 32;
+  b1 ^= ((((uint64_t)x4 * (uint64_t)y6) & mask3_64) ^
+         (((uint64_t)x5 * (uint64_t)y5) & mask3_64) ^
+         (((uint64_t)x6 * (uint64_t)y4) & mask3_64));
+
+  uint64_t b2 = ((((uint64_t)x4 * (uint64_t)y7) & mask3_64) ^
+                 (((uint64_t)x5 * (uint64_t)y6) & mask3_64) ^
+                 (((uint64_t)x6 * (uint64_t)y5) & mask3_64) ^
+                 (((uint64_t)x7 * (uint64_t)y4) & mask3_64));
+  b1 ^= (b2 << 32);
+  b2 = b2 >> 32;
+  b2 ^= ((((uint64_t)x5 * (uint64_t)y7) & mask3_64) ^
+         (((uint64_t)x6 * (uint64_t)y6) & mask3_64) ^
+         (((uint64_t)x7 * (uint64_t)y5) & mask3_64));
+
+  uint64_t b3 = ((((uint64_t)x6 * (uint64_t)y7) & mask3_64) ^
+                 (((uint64_t)x7 * (uint64_t)y6) & mask3_64));
+  b2 ^= (b3 << 32);
+  b3 = b3 >> 32;
+
+  b3 ^= ((uint64_t)x7 * (uint64_t)y7) & mask3_64;
+
+  // middle part: (x_low ^ x_high) * (y_low ^ y_high), operands formed in place
+  x0 ^= x4;
+  x1 ^= x5;
+  x2 ^= x6;
+  x3 ^= x7;
+
+  y0 ^= y4;
+  y1 ^= y5;
+  y2 ^= y6;
+  y3 ^= y7;
+
+  uint64_t c0 = ((uint64_t)x0 * (uint64_t)y0) & mask3_64;
+  uint64_t c1 = ((((uint64_t)x0 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y0) & mask3_64));
+  c0 ^= (c1 << 32);
+  c1 = c1 >> 32;
+  c1 ^= ((((uint64_t)x0 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x1 * (uint64_t)y1) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y0) & mask3_64));
+
+  uint64_t c2 = ((((uint64_t)x0 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y2) & mask3_64) ^
+                 (((uint64_t)x2 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y0) & mask3_64));
+  c1 ^= (c2 << 32);
+  c2 = c2 >> 32;
+  c2 ^= ((((uint64_t)x1 * (uint64_t)y3) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x3 * (uint64_t)y1) & mask3_64));
+
+  uint64_t c3 = ((((uint64_t)x2 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y2) & mask3_64));
+  c2 ^= (c3 << 32);
+  c3 = c3 >> 32;
+
+  c3 ^= ((uint64_t)x3 * (uint64_t)y3) & mask3_64;
+
+  c0 = c0 ^ a0 ^ b0; // cross term = middle product minus the low and high products
+  c1 = c1 ^ a1 ^ b1;
+  c2 = c2 ^ a2 ^ b2;
+  c3 = c3 ^ a3 ^ b3;
+
+  a2 ^= c0; // add the cross term two spread words (32 result bits) above the low product
+  a3 ^= c1;
+  b0 ^= c2;
+  b1 ^= c3;
+
+  // result inv_padding: each spread word collapses to 16 result bits
+  inv_zero_padding(a0, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a1, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a2, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a3, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b0, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b1, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b2, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b3, mask0_64, mask1_64, mask2_64, mask_final);
+
+  *z0 = a0 | (a1 << 16) | (a2 << 32) | (a3 << 48); // low 64 bits of the product
+  *z1 = b0 | (b1 << 16) | (b2 << 32) | (b3 << 48); // high 64 bits of the product
+}
+
+void GF_mul_s(GF c, const GF a, const GF b) // c = a * b, same steps as GF_mul but built on the C routine poly64_mul_s
+{
+  uint64_t t[3] = {0,};
+  uint64_t temp[6] = {0,}; // unreduced 6-word product, temp[0] least significant
+
+  poly64_mul_s(&t[0], &temp[0], a[0], b[0]); // a0*b0: high word -> t[0], low word -> temp[0]
+  poly64_mul_s(&t[2], &t[1], a[1], b[1]); // a1*b1
+  t[0] ^= t[1];
+
+  poly64_mul_s(&temp[5], &t[1], a[2], b[2]); // a2*b2: high word becomes the top word temp[5]
+  t[1] ^= t[2];
+
+  temp[1] = t[0] ^ temp[0]; // seed the middle words with the word-product contributions
+  temp[2] = t[1] ^ temp[1];
+  temp[4] = temp[5] ^ t[1];
+  temp[3] = temp[4] ^ t[0];
+
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[1]), (b[0] ^ b[1])); // cross product landing on words 1..2
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[2]), (b[0] ^ b[2])); // cross product landing on words 2..3
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+
+  poly64_mul_s(&t[1], &t[0], (a[1] ^ a[2]), (b[1] ^ b[2])); // cross product landing on words 3..4
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+
+  t[0] = temp[3] ^ ((temp[5] >> 57) ^ (temp[5] >> 62) ^ (temp[5] >> 63)); // pre-fold the top bits of temp[5] that the shifts below push past word 2
+
+  c[2] = temp[2] ^ temp[5]; // fold high words (temp[5], temp[4], t[0]) with shifts 0, 7, 2, 1
+  c[2] ^= (temp[5] << 7) | (temp[4] >> 57);
+  c[2] ^= (temp[5] << 2) | (temp[4] >> 62);
+  c[2] ^= (temp[5] << 1) | (temp[4] >> 63);
+
+  c[1] = temp[1] ^ temp[4];
+  c[1] ^= (temp[4] << 7) | (t[0] >> 57);
+  c[1] ^= (temp[4] << 2) | (t[0] >> 62);
+  c[1] ^= (temp[4] << 1) | (t[0] >> 63);
+
+  c[0] = temp[0] ^ t[0];
+  c[0] ^= (t[0] << 7);
+  c[0] ^= (t[0] << 2);
+  c[0] ^= (t[0] << 1);
+}
+
+void GF_mul_add_s(GF c, const GF a, const GF b) // c ^= a * b, same steps as GF_mul_add but built on poly64_mul_s
+{
+  uint64_t t[3] = {0,};
+  uint64_t temp[6] = {0,}; // unreduced 6-word product, temp[0] least significant
+
+  poly64_mul_s(&t[0], &temp[0], a[0], b[0]); // a0*b0: high word -> t[0], low word -> temp[0]
+  poly64_mul_s(&t[2], &t[1], a[1], b[1]); // a1*b1
+  t[0] ^= t[1];
+
+  poly64_mul_s(&temp[5], &t[1], a[2], b[2]); // a2*b2: high word becomes the top word temp[5]
+  t[1] ^= t[2];
+
+  temp[1] = t[0] ^ temp[0]; // seed the middle words with the word-product contributions
+  temp[2] = t[1] ^ temp[1];
+  temp[4] = temp[5] ^ t[1];
+  temp[3] = temp[4] ^ t[0];
+
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[1]), (b[0] ^ b[1])); // cross product landing on words 1..2
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[2]), (b[0] ^ b[2])); // cross product landing on words 2..3
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+
+  poly64_mul_s(&t[1], &t[0], (a[1] ^ a[2]), (b[1] ^ b[2])); // cross product landing on words 3..4
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+
+  t[0] = temp[3] ^ ((temp[5] >> 57) ^ (temp[5] >> 62) ^ (temp[5] >> 63)); // pre-fold the top bits of temp[5] that the shifts below push past word 2
+
+  c[2] ^= temp[2] ^ temp[5]; // accumulate into c rather than overwrite it
+  c[2] ^= (temp[5] << 7) | (temp[4] >> 57);
+  c[2] ^= (temp[5] << 2) | (temp[4] >> 62);
+  c[2] ^= (temp[5] << 1) | (temp[4] >> 63);
+
+  c[1] ^= temp[1] ^ temp[4];
+  c[1] ^= (temp[4] << 7) | (t[0] >> 57);
+  c[1] ^= (temp[4] << 2) | (t[0] >> 62);
+  c[1] ^= (temp[4] << 1) | (t[0] >> 63);
+
+  c[0] ^= temp[0] ^ t[0];
+  c[0] ^= (t[0] << 7);
+  c[0] ^= (t[0] << 2);
+  c[0] ^= (t[0] << 1);
+}
diff --git a/crypto_sign/aimer192f/m4speed/field.h b/crypto_sign/aimer192f/m4speed/field.h
new file mode 100644
index 00000000..5182adc4
--- /dev/null
+++ b/crypto_sign/aimer192f/m4speed/field.h
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef FIELD_H
+#define FIELD_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+typedef uint64_t GF[3]; // field element as three 64-bit words; GF_mul treats word 0 as least significant
+
+#define poly64_mul AIMER_NAMESPACE(poly64_mul)
+void poly64_mul(const uint64_t *a, const uint64_t *b, uint64_t *c1, uint64_t *c0); // not defined in field.c; GF_mul uses c0/c1 as low/high product words
+#define GF_set0 AIMER_NAMESPACE(GF_set0)
+void GF_set0(GF a);
+#define GF_copy AIMER_NAMESPACE(GF_copy)
+void GF_copy(GF out, const GF in);
+#define GF_to_bytes AIMER_NAMESPACE(GF_to_bytes)
+void GF_to_bytes(uint8_t *out, const GF in);
+#define GF_from_bytes AIMER_NAMESPACE(GF_from_bytes)
+void GF_from_bytes(GF out, const uint8_t *in);
+
+#define GF_add AIMER_NAMESPACE(GF_add)
+void GF_add(GF c, const GF a, const GF b);
+#define GF_mul AIMER_NAMESPACE(GF_mul)
+void GF_mul(GF c, const GF a, const GF b); // c = a * b
+#define GF_mul_add AIMER_NAMESPACE(GF_mul_add)
+void GF_mul_add(GF c, const GF a, const GF b); // c ^= a * b
+#define GF_transposed_matmul AIMER_NAMESPACE(GF_transposed_matmul)
+void GF_transposed_matmul(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]); // c = XOR of rows b[k] selected by the set bits of a
+#define GF_transposed_matmul_add AIMER_NAMESPACE(GF_transposed_matmul_add)
+void GF_transposed_matmul_add(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]); // as above, XORed into c
+
+#define GF_mul_s AIMER_NAMESPACE(GF_mul_s)
+void GF_mul_s(GF c, const GF a, const GF b); // c = a * b via the C routine poly64_mul_s instead of poly64_mul
+#define GF_mul_add_s AIMER_NAMESPACE(GF_mul_add_s)
+void GF_mul_add_s(GF c, const GF a, const GF b); // c ^= a * b via poly64_mul_s
+#define GF_sqr_s AIMER_NAMESPACE(GF_sqr_s)
+void GF_sqr_s(GF c, const GF a); // not defined in field.c
+
+#endif // FIELD_H
diff --git a/crypto_sign/aimer192f/m4speed/hash.c b/crypto_sign/aimer192f/m4speed/hash.c
new file mode 100644
index 00000000..06a6f473
--- /dev/null
+++ b/crypto_sign/aimer192f/m4speed/hash.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+
+#include "hash.h"
+#include <stddef.h>
+#include <stdint.h>
+
+void hash_init(hash_instance *ctx) // start a fresh incremental SHAKE256 state
+{
+  shake256_inc_init(ctx);
+}
+
+void hash_init_prefix(hash_instance *ctx, uint8_t prefix) // fresh state that has already absorbed the one-byte prefix
+{
+  shake256_inc_init(ctx);
+  shake256_inc_absorb(ctx, &prefix, sizeof(prefix)); // the prefix byte is the first input absorbed
+}
+
+void hash_update(hash_instance *ctx, const uint8_t *data, size_t data_len) // absorb data_len bytes of data
+{
+  shake256_inc_absorb(ctx, data, data_len);
+}
+
+void hash_final(hash_instance *ctx) // finalize absorbing; callers in this patch squeeze only after this
+{
+  shake256_inc_finalize(ctx);
+}
+
+void hash_squeeze(hash_instance *ctx, uint8_t *buffer, size_t buffer_len) // write buffer_len output bytes to buffer
+{
+  shake256_inc_squeeze(buffer, buffer_len, ctx); // note the argument order: output, length, then state
+}
+
+void hash_ctx_clone(hash_instance *ctx_dest, const hash_instance *ctx_src) // copy the state of ctx_src into ctx_dest
+{
+  shake256_inc_ctx_clone(ctx_dest, ctx_src);
+}
+
+void hash_ctx_release(hash_instance *ctx) // hand the state back to the SHAKE256 implementation
+{
+  shake256_inc_ctx_release(ctx);
+}
diff --git a/crypto_sign/aimer192f/m4speed/hash.h b/crypto_sign/aimer192f/m4speed/hash.h
new file mode 100644
index 00000000..0fb7c8ef
--- /dev/null
+++ b/crypto_sign/aimer192f/m4speed/hash.h
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef HASH_H
+#define HASH_H
+
+#include "fips202.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+static const uint8_t HASH_PREFIX_0 = 0; // sign.c: message pre-hash (mu)
+static const uint8_t HASH_PREFIX_1 = 1; // sign.c: h_1
+static const uint8_t HASH_PREFIX_2 = 2; // sign.c: h_2
+static const uint8_t HASH_PREFIX_3 = 3; // sign.c: salt and root seeds
+static const uint8_t HASH_PREFIX_4 = 4; // NOTE(review): not used by the signing code in sign.c; presumably tree.c - confirm
+static const uint8_t HASH_PREFIX_5 = 5; // sign.c: per-party commitment and tape
+
+typedef shake256incctx hash_instance; // every hash in this scheme is an incremental SHAKE256 state
+
+#define hash_init AIMER_NAMESPACE(hash_init)
+void hash_init(hash_instance *ctx); // fresh state, no prefix
+#define hash_init_prefix AIMER_NAMESPACE(hash_init_prefix)
+void hash_init_prefix(hash_instance *ctx, uint8_t prefix); // fresh state with the prefix byte absorbed first
+#define hash_update AIMER_NAMESPACE(hash_update)
+void hash_update(hash_instance *ctx, const uint8_t *data, size_t data_len); // absorb data
+#define hash_final AIMER_NAMESPACE(hash_final)
+void hash_final(hash_instance *ctx); // finalize absorbing
+#define hash_squeeze AIMER_NAMESPACE(hash_squeeze)
+void hash_squeeze(hash_instance *ctx, uint8_t *buffer, size_t buffer_len); // produce output bytes
+#define hash_ctx_clone AIMER_NAMESPACE(hash_ctx_clone)
+void hash_ctx_clone(hash_instance *ctx_dest, const hash_instance *ctx_src); // duplicate a state
+#define hash_ctx_release AIMER_NAMESPACE(hash_ctx_release)
+void hash_ctx_release(hash_instance *ctx); // release a state
+
+#endif // HASH_H
diff --git a/crypto_sign/aimer192f/m4speed/params.h b/crypto_sign/aimer192f/m4speed/params.h
new file mode 100644
index 00000000..2415b973
--- /dev/null
+++ b/crypto_sign/aimer192f/m4speed/params.h
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#define AIMER_NAMESPACE(s)          samsungsds_aimer192f_m4speed_##s // prefix pasted onto every namespaced symbol
+
+#define SECURITY_BITS               192                  // security parameter
+#define SECURITY_BYTES              (SECURITY_BITS / 8)  // byte size of security parameter
+
+#define AIM2_NUM_BITS_FIELD         SECURITY_BITS        // number of bits in field element
+#define AIM2_NUM_BYTES_FIELD        SECURITY_BYTES       // number of bytes in field element
+#define AIM2_NUM_WORDS_FIELD        (SECURITY_BITS / 64) // number of 64-bit words in element
+#define AIM2_NUM_BITS_WORD          64                   // number of bits in word
+#define AIM2_IV_SIZE                SECURITY_BYTES       // byte size of AIM2 initial vector
+
+#define AIM2_NUM_INPUT_SBOX         2                    // number of AIM2 input S-boxes
+
+#define AIMER_SALT_SIZE             SECURITY_BYTES       // byte size of salt
+#define AIMER_SEED_SIZE             SECURITY_BYTES       // byte size of seed
+#define AIMER_COMMIT_SIZE           (SECURITY_BYTES * 2) // byte size of commitment
+
+#define AIMER_L                     AIM2_NUM_INPUT_SBOX  // alias for the number of input S-boxes
+#define AIMER_T                     49                   // number of parallel repetitions (Tau)
+#define AIMER_N                     16                   // number of MPC parties (N)
+#define AIMER_LOGN                  4                    // log_2(N)
+
+#endif // PARAMS_H
diff --git a/crypto_sign/aimer192f/m4speed/sign.c b/crypto_sign/aimer192f/m4speed/sign.c
new file mode 100644
index 00000000..98688f9c
--- /dev/null
+++ b/crypto_sign/aimer192f/m4speed/sign.c
@@ -0,0 +1,549 @@
+// SPDX-License-Identifier: MIT
+
+#include "api.h"
+#include "aim2.h"
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include "randombytes.h"
+#include "sign.h"
+#include "tree.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+void commit_and_expand_tape(tape_t *tape, uint8_t *commit, // outputs: the party's tape and its commitment
+                            const hash_instance *ctx_precom, // pre-absorbed state shared by all calls; left unmodified
+                            const uint8_t seed[AIMER_SEED_SIZE],
+                            size_t rep, size_t party)
+{
+  hash_instance ctx;
+  uint8_t buffer[AIMER_SEED_SIZE + 2];
+
+  buffer[0] = (uint8_t)(rep); // only the low byte of rep and of party enters the hash
+  buffer[1] = (uint8_t)(party);
+  memcpy(buffer + 2, seed, AIMER_SEED_SIZE);
+
+  hash_ctx_clone(&ctx, ctx_precom); // work on a private copy of the shared state
+  hash_update(&ctx, buffer, AIMER_SEED_SIZE + 2);
+  hash_final(&ctx);
+  hash_squeeze(&ctx, commit, AIMER_COMMIT_SIZE); // first output bytes: the commitment
+  hash_squeeze(&ctx, (uint8_t *)tape, sizeof(tape_t)); // following bytes fill the tape struct as raw memory
+  hash_ctx_release(&ctx);
+}
+
+void aim2_mpc(mult_chk_t *mult_chk, // in: x_shares[0..AIMER_L]; out: z_shares, x_shares[AIMER_L] updated
+              const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+              const GF ct_GF)
+{
+  // pt + c = t ^ {2 ^ e - 1}
+  // --> t ^ {2 ^ e} + t * c = t * pt
+  // --> z = x * pt
+  GF_mul(mult_chk->z_shares[0], mult_chk->x_shares[0], aim2_constants[0]); // z0 = x0 * constant 0
+  GF_transposed_matmul_add(mult_chk->z_shares[0], mult_chk->x_shares[0],
+                           aim2_e1_power_matrix); // z0 += x0 applied to the e1 power matrix
+  GF_transposed_matmul_add(mult_chk->x_shares[AIMER_L], mult_chk->x_shares[0],
+                           matrix_A[0]); // accumulate the last x share: x0 applied to matrix_A[0]
+
+  GF_mul(mult_chk->z_shares[1], mult_chk->x_shares[1], aim2_constants[1]); // same three steps for the second S-box
+  GF_transposed_matmul_add(mult_chk->z_shares[1], mult_chk->x_shares[1],
+                           aim2_e2_power_matrix);
+  GF_transposed_matmul_add(mult_chk->x_shares[AIMER_L], mult_chk->x_shares[1],
+                           matrix_A[1]);
+
+  // x ^ {2 ^ e - 1} = pt + ct
+  // --> x ^ {2 ^ e} + x * ct = x * pt
+  // --> z = x * pt
+  GF_sqr_s(mult_chk->z_shares[AIMER_L], mult_chk->x_shares[AIMER_L]); // five GF_sqr_s calls chained on the last x share
+  GF_sqr_s(mult_chk->z_shares[AIMER_L], mult_chk->z_shares[AIMER_L]);
+  GF_sqr_s(mult_chk->z_shares[AIMER_L], mult_chk->z_shares[AIMER_L]);
+  GF_sqr_s(mult_chk->z_shares[AIMER_L], mult_chk->z_shares[AIMER_L]);
+  GF_sqr_s(mult_chk->z_shares[AIMER_L], mult_chk->z_shares[AIMER_L]);
+  GF_mul_add(mult_chk->z_shares[AIMER_L], mult_chk->x_shares[AIMER_L], ct_GF); // z += x * ct
+}
+
+// Phase 1: commits to the seeds and the execution views of the parties; fills sign->salt, sign->proofs[].delta_* and sign->h_1
+void run_phase_1(signature_t *sign,
+                 uint8_t commits[AIMER_T][AIMER_N][AIMER_COMMIT_SIZE], // out: per-party commitments
+                 uint8_t nodes[AIMER_T][2 * AIMER_N - 1][AIMER_SEED_SIZE], // out: seed tree nodes, root at index 0
+                 mult_chk_t mult_chk[AIMER_T][AIMER_N], // out: multiplication check inputs
+                 GF alpha_v_shares[AIMER_T][2][AIMER_N], // out: [0] = a shares, [1] = c shares
+                 const uint8_t *sk, const uint8_t *m, size_t mlen)
+{
+  GF pt_GF = {0,}, ct_GF = {0,};
+  GF_from_bytes(pt_GF, sk); // sk starts with the secret field element
+  GF_from_bytes(ct_GF, sk + AIM2_NUM_BYTES_FIELD + AIM2_IV_SIZE); // ct is the last part of the embedded public key
+
+  // message pre-hashing
+  hash_instance ctx;
+  hash_init_prefix(&ctx, HASH_PREFIX_0);
+  hash_update(&ctx, sk + AIM2_NUM_BYTES_FIELD,
+              AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD); // the public key copy stored inside sk
+  hash_update(&ctx, m, mlen);
+  hash_final(&ctx);
+
+  uint8_t mu[AIMER_COMMIT_SIZE];
+  hash_squeeze(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+
+  // compute first L sboxes' outputs
+  GF sbox_outputs[AIMER_L];
+  aim2_sbox_outputs(sbox_outputs, pt_GF);
+
+  // derive the binary matrix and the vector from the initial vector
+  GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+  generate_matrix_LU(matrix_A, vector_b, sk + AIM2_NUM_BYTES_FIELD);
+
+  // generate per-signature randomness
+  uint8_t random[SECURITY_BYTES];
+  randombytes(random, SECURITY_BYTES);
+
+  // generate salt
+  hash_init_prefix(&ctx, HASH_PREFIX_3);
+  hash_update(&ctx, sk, AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx, random, SECURITY_BYTES);
+  hash_final(&ctx);
+  hash_squeeze(&ctx, sign->salt, AIMER_SALT_SIZE);
+
+  // generate root seeds and expand seed trees
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    hash_squeeze(&ctx, nodes[rep][0], AIMER_SEED_SIZE); // root seeds continue the same output stream as the salt
+  }
+  expand_trees(nodes, sign->salt);
+  hash_ctx_release(&ctx);
+
+  // hash_instance for h_1
+  hash_init_prefix(&ctx, HASH_PREFIX_1);
+  hash_update(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx, sign->salt, AIMER_SALT_SIZE);
+
+  hash_instance ctx_precom; // prefix and salt absorbed once, cloned for every party's commitment
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    // initialize adjustment values
+    tape_t delta, tape;
+    memset(&delta, 0, sizeof(tape_t)); // delta accumulates the XOR of all parties' tapes
+
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      // generate execution views and commitments
+      commit_and_expand_tape(&tape, commits[rep][party], &ctx_precom,
+                             nodes[rep][party + AIMER_N - 1], rep, party); // party seeds sit at indices AIMER_N-1 .. 2*AIMER_N-2
+      hash_update(&ctx, commits[rep][party], AIMER_COMMIT_SIZE);
+
+      // compute offsets
+      GF_add(delta.pt_share, delta.pt_share, tape.pt_share);
+      GF_add(delta.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+      GF_add(delta.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+      GF_add(delta.a_share, delta.a_share, tape.a_share);
+      GF_add(delta.c_share, delta.c_share, tape.c_share);
+      GF_set0(mult_chk[rep][party].x_shares[AIMER_L]); // aim2_mpc accumulates into this share, so it must start cleared
+
+      // adjust the last share and prepare the proof and h_1
+      if (party == AIMER_N - 1)
+      {
+        GF_add(delta.pt_share, delta.pt_share, pt_GF); // delta = true value + sum of all tape shares
+        GF_add(delta.t_shares[0], delta.t_shares[0], sbox_outputs[0]);
+        GF_add(delta.t_shares[1], delta.t_shares[1], sbox_outputs[1]);
+        GF_mul_add_s(delta.c_share, pt_GF, delta.a_share); // delta_c = sum of c shares + pt * (sum of a shares)
+
+        GF_to_bytes(sign->proofs[rep].delta_pt_bytes, delta.pt_share);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[0], delta.t_shares[0]);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[1], delta.t_shares[1]);
+        GF_to_bytes(sign->proofs[rep].delta_c_bytes, delta.c_share);
+
+        GF_add(tape.pt_share, delta.pt_share, tape.pt_share); // the last party's shares absorb the offsets
+        GF_add(tape.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+        GF_add(tape.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+        GF_add(tape.c_share, delta.c_share, tape.c_share);
+
+        GF_copy(mult_chk[rep][party].x_shares[AIMER_L], vector_b); // only the last party starts from vector_b
+      }
+
+      // run the MPC simulation and prepare the mult check inputs
+      GF_copy(mult_chk[rep][party].pt_share, tape.pt_share);
+      GF_copy(mult_chk[rep][party].x_shares[0], tape.t_shares[0]);
+      GF_copy(mult_chk[rep][party].x_shares[1], tape.t_shares[1]);
+      GF_copy(alpha_v_shares[rep][0][party], tape.a_share);
+      GF_copy(alpha_v_shares[rep][1][party], tape.c_share);
+
+      aim2_mpc(&mult_chk[rep][party],
+               (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A, ct_GF);
+    }
+
+    // NOTE: depend on the order of values in proof_t
+    hash_update(&ctx, sign->proofs[rep].delta_pt_bytes,
+                AIM2_NUM_BYTES_FIELD * (AIMER_L + 2)); // one read of AIMER_L + 2 field elements starting at delta_pt_bytes
+  }
+  hash_ctx_release(&ctx_precom);
+
+  // commit to salt, (all commitments of parties' seeds,
+  // delta_pt, delta_t, delta_c) for all repetitions
+  hash_final(&ctx);
+  hash_squeeze(&ctx, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+}
+
+void run_phase_2_and_3(signature_t *sign, // reads sign->h_1 and sign->salt, writes sign->h_2
+                       GF alpha_v_shares[AIMER_T][2][AIMER_N], // in: a/c shares; out: alpha/v shares
+                       const mult_chk_t mult_chk[AIMER_T][AIMER_N])
+{
+  GF epsilons[AIMER_L + 1];
+
+  hash_instance ctx_e; // output stream that supplies the epsilons, derived from h_1 alone
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  hash_instance ctx; // accumulates the input of h_2
+  hash_init_prefix(&ctx, HASH_PREFIX_2);
+  hash_update(&ctx, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_update(&ctx, sign->salt, AIMER_SALT_SIZE);
+
+  GF alpha = {0,};
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    GF_set0(alpha);
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons)); // epsilons are filled as raw bytes, fresh for each repetition
+
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      // alpha_share = a_share + sum x_share[i] * eps[i]
+      // v_share = c_share - pt_share * alpha + sum z_share[i] * eps[i]
+      GF_mul_add(alpha_v_shares[rep][0][party],
+                 mult_chk[rep][party].x_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[rep][1][party],
+                 mult_chk[rep][party].z_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[rep][0][party],
+                 mult_chk[rep][party].x_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[rep][1][party],
+                 mult_chk[rep][party].z_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[rep][0][party],
+                 mult_chk[rep][party].x_shares[2], epsilons[2]); // indices 0..2 are written out by hand, matching AIMER_L + 1 == 3
+      GF_mul_add(alpha_v_shares[rep][1][party],
+                 mult_chk[rep][party].z_shares[2], epsilons[2]);
+
+      GF_add(alpha, alpha, alpha_v_shares[rep][0][party]); // opened alpha = sum of all alpha shares
+    }
+
+    // alpha is opened, so we can finish calculating v_share
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      GF_mul_add(alpha_v_shares[rep][1][party],
+                 mult_chk[rep][party].pt_share, alpha);
+    }
+    hash_update(&ctx, (const uint8_t *)alpha_v_shares[rep],
+                AIM2_NUM_BYTES_FIELD * 2 * AIMER_N); // hashes the in-memory words of all alpha and v shares of this repetition
+  }
+  hash_final(&ctx);
+  hash_squeeze(&ctx, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+  hash_ctx_release(&ctx_e);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk) // fills pk and sk; returns 0, or -1 if either pointer is NULL
+{
+  if (!pk || !sk)
+  {
+    return -1;
+  }
+
+  randombytes(sk, AIM2_NUM_BYTES_FIELD); // random secret input, stored at the start of sk
+  randombytes(pk, AIM2_IV_SIZE); // random iv, stored at the start of pk
+
+  aim2(pk + AIM2_IV_SIZE, sk, pk); // AIM2 output from (secret, iv) goes right after the iv in pk
+  memcpy(sk + AIM2_NUM_BYTES_FIELD, pk, AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD); // sk carries a full copy of pk after the secret
+
+  return 0;
+}
+
+int crypto_sign_signature(uint8_t *sig, size_t *siglen, // writes the signature into sig and CRYPTO_BYTES into *siglen; always returns 0
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk)
+{
+  hash_instance ctx;
+  signature_t *sign = (signature_t *)sig; // the output buffer is filled in place through the signature_t layout
+
+  //////////////////////////////////////////////////////////////////////////
+  // Phase 1: Committing to the seeds and the execution views of parties. //
+  //////////////////////////////////////////////////////////////////////////
+
+  // nodes for seed trees
+  uint8_t nodes[AIMER_T][2 * AIMER_N - 1][AIMER_SEED_SIZE];
+
+  // commitments for seeds
+  uint8_t commits[AIMER_T][AIMER_N][AIMER_COMMIT_SIZE];
+
+  // multiplication check inputs
+  mult_chk_t mult_chk[AIMER_T][AIMER_N];
+
+  // multiplication check outputs
+  GF alpha_v_shares[AIMER_T][2][AIMER_N];
+
+  // commitments for phase 1
+  run_phase_1(sign, commits, nodes, mult_chk, alpha_v_shares, sk, m, mlen);
+
+  /////////////////////////////////////////////////////////////////
+  // Phase 2, 3: Challenging and committing to the simulation of //
+  //             the multiplication checking protocol.           //
+  /////////////////////////////////////////////////////////////////
+
+  // compute the commitment of phase 3
+  run_phase_2_and_3(sign, alpha_v_shares,
+                    (const mult_chk_t (*)[AIMER_N])mult_chk);
+
+  //////////////////////////////////////////////////////
+  // Phase 4: Challenging views of the MPC protocols. //
+  //////////////////////////////////////////////////////
+
+  hash_init(&ctx);
+  hash_update(&ctx, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_final(&ctx);
+
+  uint8_t indices[AIMER_T]; // AIMER_N <= 256
+  hash_squeeze(&ctx, indices, AIMER_T); // one challenge byte per repetition
+  hash_ctx_release(&ctx);
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    indices[rep] &= (1 << AIMER_LOGN) - 1; // keep the low AIMER_LOGN bits as the index of the unopened party
+  }
+
+  //////////////////////////////////////////////////////
+  // Phase 5: Opening the views of the MPC protocols. //
+  //////////////////////////////////////////////////////
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    size_t i_bar = indices[rep];
+    reveal_all_but(sign->proofs[rep].reveal_path,
+                   (const uint8_t (*)[AIMER_SEED_SIZE])nodes[rep], i_bar);
+    memcpy(sign->proofs[rep].missing_commitment, commits[rep][i_bar],
+           AIMER_COMMIT_SIZE); // only the unopened party's commitment is shipped
+    GF_to_bytes(sign->proofs[rep].missing_alpha_share_bytes,
+                alpha_v_shares[rep][0][i_bar]);
+  }
+  *siglen = CRYPTO_BYTES;
+
+  return 0;
+}
+
+int crypto_sign(uint8_t *sm, size_t *smlen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk)
+{ // Writes the signed message sm = m || signature; *smlen = mlen + signature size.
+  const int ret = crypto_sign_signature(sm + mlen, smlen, m, mlen, sk);
+  // memmove, not memcpy: m and sm may alias (in-place signing), as in crypto_sign_open.
+  memmove(sm, m, mlen);
+  *smlen += mlen;
+  // Hand back the status of the signing call instead of unconditional success.
+  return ret;
+}
+
+int crypto_sign_verify(const uint8_t *sig, size_t siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *pk)
+{ // Recomputes h_1 and h_2 from sig, m and pk; returns 0 if both match, -1 otherwise.
+  if (siglen != CRYPTO_BYTES) // only signatures of exactly CRYPTO_BYTES are accepted
+  {
+    return -1;
+  }
+
+  const signature_t *sign = (const signature_t *)sig; // sig bytes viewed as signature_t
+
+  GF ct_GF = {0,};
+  GF_from_bytes(ct_GF, pk + AIM2_IV_SIZE); // field element stored after the first AIM2_IV_SIZE bytes of pk
+
+  // derive the binary matrix and the vector from the initial vector
+  GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+  generate_matrix_LU(matrix_A, vector_b, pk);
+
+  hash_instance ctx_e, ctx_h1, ctx_h2;
+
+  // indices = Expand(h_2): one unopened party index (i_bar) per repetition
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  uint8_t indices[AIMER_T]; // AIMER_N <= 256
+  hash_squeeze(&ctx_e, indices, AIMER_T);
+  hash_ctx_release(&ctx_e);
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    indices[rep] &= (1 << AIMER_LOGN) - 1; // keep only the low AIMER_LOGN bits
+  }
+
+  // epsilons = Expand(h_1); ctx_e stays open and is squeezed once per repetition
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  // message pre-hashing: mu is squeezed from a hash over the public key and m
+  uint8_t mu[AIMER_COMMIT_SIZE];
+  hash_init_prefix(&ctx_h1, HASH_PREFIX_0);
+  hash_update(&ctx_h1, pk, AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx_h1, m, mlen);
+  hash_final(&ctx_h1);
+  hash_squeeze(&ctx_h1, mu, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h1);
+
+  // ready for computing h_1' and h_2' (both contexts keep absorbing inside the loop)
+  hash_init_prefix(&ctx_h1, HASH_PREFIX_1);
+  hash_update(&ctx_h1, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_h1, sign->salt, AIMER_SALT_SIZE);
+
+  hash_init_prefix(&ctx_h2, HASH_PREFIX_2);
+  hash_update(&ctx_h2, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_h2, sign->salt, AIMER_SALT_SIZE);
+
+  hash_instance ctx_precom; // salted state handed to every commit_and_expand_tape() call
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    size_t i_bar = indices[rep];
+    uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE]; // filled by reconstruct_tree()
+
+    reconstruct_tree(nodes, sign->salt, sign->proofs[rep].reveal_path,
+                     rep, i_bar);
+
+    GF pt_shares[AIMER_N];
+    GF alpha_v_shares[2][AIMER_N]; // [0] = alpha shares, [1] = v shares
+    GF_set0(alpha_v_shares[1][i_bar]); // accumulator for the unopened v share
+
+    GF epsilons[AIMER_L + 1];
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF alpha = {0,};
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      if (party == i_bar) // unopened party: take commitment and alpha share from the proof
+      {
+        hash_update(&ctx_h1, sign->proofs[rep].missing_commitment,
+                    AIMER_COMMIT_SIZE);
+        GF_from_bytes(alpha_v_shares[0][i_bar],
+                      sign->proofs[rep].missing_alpha_share_bytes);
+        GF_add(alpha, alpha, alpha_v_shares[0][i_bar]);
+        continue;
+      }
+
+      tape_t tape;
+      uint8_t commit[AIMER_COMMIT_SIZE];
+      commit_and_expand_tape(&tape, commit, &ctx_precom,
+                             nodes[AIMER_N + party - 2], rep, party);
+      hash_update(&ctx_h1, commit, AIMER_COMMIT_SIZE);
+
+      // adjust last shares: only party AIMER_N - 1 absorbs the deltas carried in the proof
+      mult_chk_t mult_chk;
+      memset(&mult_chk, 0, sizeof(mult_chk_t));
+      if (party == AIMER_N - 1)
+      {
+        GF temp = {0,};
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_pt_bytes);
+        GF_add(tape.pt_share, tape.pt_share, temp);
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[0]);
+        GF_add(tape.t_shares[0], tape.t_shares[0], temp);
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[1]);
+        GF_add(tape.t_shares[1], tape.t_shares[1], temp);
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_c_bytes);
+        GF_add(tape.c_share, tape.c_share, temp);
+
+        GF_copy(mult_chk.x_shares[AIMER_L], vector_b); // stays zero for every other party
+      }
+
+      // run the MPC simulation and prepare the mult check inputs
+      GF_copy(mult_chk.x_shares[0], tape.t_shares[0]);
+      GF_copy(mult_chk.x_shares[1], tape.t_shares[1]);
+      GF_copy(pt_shares[party], tape.pt_share);
+      GF_copy(alpha_v_shares[0][party], tape.a_share);
+      GF_copy(alpha_v_shares[1][party], tape.c_share);
+      aim2_mpc(&mult_chk, (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A, ct_GF);
+
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[2], epsilons[2]);
+
+      GF_add(alpha, alpha, alpha_v_shares[0][party]);
+    }
+
+    // alpha is opened, so we can finish calculating v_share
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      if (party == i_bar)
+      {
+        continue;
+      }
+
+      GF_mul_add(alpha_v_shares[1][party], pt_shares[party], alpha);
+      GF_add(alpha_v_shares[1][i_bar], alpha_v_shares[1][i_bar], // unopened v share =
+             alpha_v_shares[1][party]);                          // sum of all opened ones
+    }
+
+    // v is opened
+    hash_update(&ctx_h2, (const uint8_t *)alpha_v_shares,
+                sizeof(alpha_v_shares));
+
+    // NOTE: depends on the field order of proof_t (delta_pt, delta_ts, delta_c hashed as one run)
+    hash_update(&ctx_h1, sign->proofs[rep].delta_pt_bytes,
+                AIM2_NUM_BYTES_FIELD * (AIMER_L + 2));
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+
+  uint8_t h_1_prime[AIMER_COMMIT_SIZE];
+  hash_final(&ctx_h1);
+  hash_squeeze(&ctx_h1, h_1_prime, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h1);
+
+  uint8_t h_2_prime[AIMER_COMMIT_SIZE];
+  hash_final(&ctx_h2);
+  hash_squeeze(&ctx_h2, h_2_prime, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h2);
+
+  if (memcmp(h_1_prime, sign->h_1, AIMER_COMMIT_SIZE) != 0 ||
+      memcmp(h_2_prime, sign->h_2, AIMER_COMMIT_SIZE) != 0)
+  {
+    return -1; // a recomputed hash differs from the one in the signature
+  }
+
+  return 0;
+}
+
+int crypto_sign_open(uint8_t *m, size_t *mlen,
+        const uint8_t *sm, size_t smlen,
+        const uint8_t *pk)
+{
+  // sm is laid out as message || signature, the signature taking CRYPTO_BYTES.
+  if (smlen < CRYPTO_BYTES)
+  {
+    return -1; // too short to contain a signature
+  }
+
+  const size_t body_len = smlen - CRYPTO_BYTES;
+  const int status = crypto_sign_verify(sm + body_len, CRYPTO_BYTES,
+                                        sm, body_len, pk);
+  if (status != 0)
+  {
+    return -1; // m and *mlen are left untouched
+  }
+
+  memmove(m, sm, body_len); // memmove tolerates overlapping m and sm
+  *mlen = body_len;
+
+  return 0;
+}
diff --git a/crypto_sign/aimer192f/m4speed/sign.h b/crypto_sign/aimer192f/m4speed/sign.h
new file mode 100644
index 00000000..e64c4350
--- /dev/null
+++ b/crypto_sign/aimer192f/m4speed/sign.h
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef SIGN_H
+#define SIGN_H
+
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+typedef struct tape_t // per-party values written by commit_and_expand_tape()
+{
+  GF pt_share;          // becomes pt_shares[party] in crypto_sign_verify
+  GF t_shares[AIMER_L]; // copied into mult_chk_t.x_shares[0] and [1] before aim2_mpc()
+  GF a_share;           // starting value of the party's alpha share
+  GF c_share;           // starting value of the party's v share
+} tape_t;
+
+typedef struct mult_chk_t // per-party working set passed to aim2_mpc()
+{
+  GF pt_share;              // left zero by crypto_sign_verify, which keeps pt shares separately
+  GF x_shares[AIMER_L + 1]; // each x_shares[k] is weighted by epsilons[k] into the alpha share
+  GF z_shares[AIMER_L + 1]; // each z_shares[k] is weighted by epsilons[k] into the v share
+} mult_chk_t;
+
+typedef struct proof_t // per-repetition part of the signature; field order is hashed, keep it
+{
+  uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE];      // sibling seeds from reveal_all_but()
+  uint8_t missing_commitment[AIMER_COMMIT_SIZE];         // commitment of the unopened party
+  uint8_t delta_pt_bytes[AIM2_NUM_BYTES_FIELD];          // added to the last party's pt share
+  uint8_t delta_ts_bytes[AIMER_L][AIM2_NUM_BYTES_FIELD]; // added to the last party's t shares
+  uint8_t delta_c_bytes[AIM2_NUM_BYTES_FIELD];           // added to the last party's c share
+  uint8_t missing_alpha_share_bytes[AIM2_NUM_BYTES_FIELD]; // alpha share of the unopened party
+} proof_t;
+
+typedef struct signature_t // overlaid directly on the signature byte buffer by sign/verify
+{
+  uint8_t salt[AIMER_SALT_SIZE];
+  uint8_t h_1[AIMER_COMMIT_SIZE]; // expanded into the epsilons during verification
+  uint8_t h_2[AIMER_COMMIT_SIZE]; // expanded into the unopened party indices
+  proof_t proofs[AIMER_T];        // one proof per repetition
+} signature_t;
+
+#define aim2_mpc AIMER_NAMESPACE(aim2_mpc)
+void aim2_mpc(mult_chk_t *mult_chk, // in/out: verify pre-fills x_shares, then reads x_shares and z_shares
+              const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD], // as produced by generate_matrix_LU()
+              const GF ct_GF); // field element decoded from the public key after its iv
+
+#define commit_and_expand_tape AIMER_NAMESPACE(commit_and_expand_tape)
+void commit_and_expand_tape(tape_t *tape, uint8_t *commit, // outputs; verify hashes AIMER_COMMIT_SIZE bytes of commit
+                            const hash_instance *ctx_precom, // salted hash state, const so reusable per call
+                            const uint8_t seed[AIMER_SEED_SIZE], // leaf seed of the party
+                            size_t rep, size_t party); // repetition and party number
+
+#define run_phase_1 AIMER_NAMESPACE(run_phase_1)
+void run_phase_1(signature_t *sign, // signature under construction
+                 uint8_t commits[AIMER_T][AIMER_N][AIMER_COMMIT_SIZE], // read back in phase 5
+                 uint8_t nodes[AIMER_T][2 * AIMER_N - 1][AIMER_SEED_SIZE], // seed trees, read in phase 5
+                 mult_chk_t mult_chk[AIMER_T][AIMER_N], // passed on to run_phase_2_and_3()
+                 GF alpha_v_shares[AIMER_T][2][AIMER_N], // passed on to run_phase_2_and_3()
+                 const uint8_t *sk, const uint8_t *m, size_t mlen); // secret key and message
+
+#define run_phase_2_and_3 AIMER_NAMESPACE(run_phase_2_and_3)
+void run_phase_2_and_3(signature_t *sign, // the signer reads sign->h_2 right after this call
+                       GF alpha_v_shares[AIMER_T][2][AIMER_N], // [rep][0][i] is read as the alpha share in phase 5
+                       const mult_chk_t mult_chk[AIMER_T][AIMER_N]); // read-only inputs from run_phase_1()
+
+#endif // SIGN_H
diff --git a/crypto_sign/aimer192f/m4speed/tree.c b/crypto_sign/aimer192f/m4speed/tree.c
new file mode 100644
index 00000000..84c23d7f
--- /dev/null
+++ b/crypto_sign/aimer192f/m4speed/tree.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: MIT
+
+#include "tree.h"
+#include "hash.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
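+//  Example of tree for [N = 8]: nodes are numbered from 1 (the root), level
+//  by level; d is the depth of a row and the leaves are nodes N .. 2N - 1.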
+//  d = 0: 1
+//  d = 1: 2         3
+//  d = 2: 4   5     6     7
+//  d = 3: 8 9 10 11 12 13 14 15
+
+void expand_trees(uint8_t nodes[AIMER_T][2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                  const uint8_t salt[AIMER_SALT_SIZE])
+{
+  // Hash input per node: rep (1 byte) || 1-based node number (1 byte) || seed.
+  uint8_t msg[AIMER_SEED_SIZE + 2];
+  hash_instance salted, node_ctx;
+  // The prefix and the salt are absorbed once and cloned for every node.
+  hash_init_prefix(&salted, HASH_PREFIX_4);
+  hash_update(&salted, salt, AIMER_SALT_SIZE);
+  for (size_t t = 0; t < AIMER_T; t++)
+  {
+    uint8_t (*tree)[AIMER_SEED_SIZE] = nodes[t];
+    msg[0] = (uint8_t)t;
+    for (size_t parent = 1; parent < AIMER_N; parent++)
+    {
+      msg[1] = (uint8_t)parent;
+      memcpy(&msg[2], tree[parent - 1], AIMER_SEED_SIZE);
+      hash_ctx_clone(&node_ctx, &salted);
+      hash_update(&node_ctx, msg, sizeof(msg));
+      hash_final(&node_ctx);
+      // both children are adjacent in memory, so one squeeze fills them together
+      hash_squeeze(&node_ctx, tree[2 * parent - 1], 2 * AIMER_SEED_SIZE);
+      hash_ctx_release(&node_ctx);
+    }
+  }
+  hash_ctx_release(&salted);
+}
+
+void reveal_all_but(uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                    const uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                    size_t cover_index)
+{
+  // Climb from the hidden leaf (1-based number cover_index + AIMER_N) towards
+  // the root; at each level copy out the sibling seed, so reveal_path[0] holds
+  // the sibling leaf and the last entry a child of the root.
+  size_t level = 0;
+  for (size_t cur = cover_index + AIMER_N; level < AIMER_LOGN; cur >>= 1)
+  {
+    const size_t sibling = cur ^ 1; // flipping the low bit selects the sibling
+    memcpy(reveal_path[level++], nodes[sibling - 1], AIMER_SEED_SIZE);
+  }
+}
+
+void reconstruct_tree(uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE],
+                      const uint8_t salt[AIMER_SALT_SIZE],
+                      const uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                      size_t rep_index,
+                      size_t cover_index)
+{
+  // Node k (1-based, root = 1) is kept at nodes[k - 2]; the root has no slot.
+  const size_t hidden_leaf = cover_index + AIMER_N;
+  uint8_t msg[AIMER_SEED_SIZE + 2];
+  hash_instance salted, node_ctx;
+  size_t level, sibling;
+  hash_init_prefix(&salted, HASH_PREFIX_4);
+  hash_update(&salted, salt, AIMER_SALT_SIZE);
+  for (level = 1; level < AIMER_LOGN; level++)
+  {
+    // plant the revealed sibling of the hidden path on this level ...
+    sibling = (hidden_leaf >> (AIMER_LOGN - level)) ^ 1;
+    memcpy(nodes[sibling - 2], reveal_path[AIMER_LOGN - level], AIMER_SEED_SIZE);
+    // ... then expand every node of the level, the unknown one on the hidden path included
+    for (size_t k = (1U << level); k < (2U << level); k++)
+    {
+      msg[0] = (uint8_t)rep_index;
+      msg[1] = (uint8_t)k;
+      memcpy(&msg[2], nodes[k - 2], AIMER_SEED_SIZE);
+      hash_ctx_clone(&node_ctx, &salted);
+      hash_update(&node_ctx, msg, sizeof(msg));
+      hash_final(&node_ctx);
+      hash_squeeze(&node_ctx, nodes[2 * k - 2], 2 * AIMER_SEED_SIZE);
+      hash_ctx_release(&node_ctx);
+    }
+  }
+  hash_ctx_release(&salted);
+  // leaf level: the sibling of the hidden leaf comes straight from reveal_path[0]
+  sibling = hidden_leaf ^ 1;
+  memcpy(nodes[sibling - 2], reveal_path[0], AIMER_SEED_SIZE);
+}
diff --git a/crypto_sign/aimer192f/m4speed/tree.h b/crypto_sign/aimer192f/m4speed/tree.h
new file mode 100644
index 00000000..b5a27867
--- /dev/null
+++ b/crypto_sign/aimer192f/m4speed/tree.h
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef TREE_H
+#define TREE_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define expand_trees AIMER_NAMESPACE(expand_trees)
+void expand_trees(uint8_t nodes[AIMER_T][2 * AIMER_N - 1][AIMER_SEED_SIZE], // in: nodes[rep][0] (root); out: all other nodes
+                  const uint8_t salt[AIMER_SALT_SIZE]); // absorbed before every node expansion
+
+#define reveal_all_but AIMER_NAMESPACE(reveal_all_but)
+void reveal_all_but(uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE], // out: sibling seeds, leaf level first
+                    const uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE], // one full tree, root at nodes[0]
+                    size_t cover_index); // 0-based leaf whose seed stays hidden
+
+#define reconstruct_tree AIMER_NAMESPACE(reconstruct_tree)
+void reconstruct_tree(uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE], // out: tree without the root slot
+                      const uint8_t salt[AIMER_SALT_SIZE], // same salt as used for expansion
+                      const uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE], // as written by reveal_all_but()
+                      size_t rep_index, // repetition number, hashed as one byte
+                      size_t cover_index); // 0-based hidden leaf; its slot holds no valid seed
+
+#endif // TREE_H
diff --git a/crypto_sign/aimer192f/m4stack/__asm_field.S b/crypto_sign/aimer192f/m4stack/__asm_field.S
new file mode 100644
index 00000000..26575c28
--- /dev/null
+++ b/crypto_sign/aimer192f/m4stack/__asm_field.S
@@ -0,0 +1,617 @@
+#include "params.h"
+
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+.text
+
+.align  2
+.global AIMER_NAMESPACE(GF_to_bytes)
+.global AIMER_NAMESPACE(GF_from_bytes)
+.global AIMER_NAMESPACE(GF_copy)
+.type   AIMER_NAMESPACE(GF_to_bytes), %function
+.type   AIMER_NAMESPACE(GF_from_bytes), %function
+.type   AIMER_NAMESPACE(GF_copy), %function
+AIMER_NAMESPACE(GF_to_bytes):    // three entry points share one body:
+AIMER_NAMESPACE(GF_from_bytes):  // copy six 32-bit words (24 bytes)
+AIMER_NAMESPACE(GF_copy):        // from [R1] to [R0]
+  out_p       .req R0
+  in_p        .req R1
+
+  .equ width, 4  // bytes per word
+
+  ldr.w R2, [in_p, #0 * width]   // R2 and R3 are the only scratch registers
+  ldr.w R3, [in_p, #1 * width]
+  str.w R2, [out_p, #0 * width]
+  str.w R3, [out_p, #1 * width]
+
+  ldr.w R2, [in_p, #2 * width]
+  ldr.w R3, [in_p, #3 * width]
+  str.w R2, [out_p, #2 * width]
+  str.w R3, [out_p, #3 * width]
+
+  ldr.w R2, [in_p, #4 * width]
+  ldr.w R3, [in_p, #5 * width]
+  str.w R2, [out_p, #4 * width]
+  str.w R3, [out_p, #5 * width]
+
+  bx    lr
+
+  .unreq in_p
+  .unreq out_p
+
+.align  2
+.global AIMER_NAMESPACE(GF_set0)
+.type   AIMER_NAMESPACE(GF_set0), %function
+AIMER_NAMESPACE(GF_set0):  // writes zero to the six words at [R0]
+  out_p       .req R0
+
+  .equ width, 4  // bytes per word
+
+  mov.w R2, #0
+  str.w R2, [out_p, #0 * width]
+  str.w R2, [out_p, #1 * width]
+  str.w R2, [out_p, #2 * width]
+  str.w R2, [out_p, #3 * width]
+  str.w R2, [out_p, #4 * width]
+  str.w R2, [out_p, #5 * width]
+
+  bx    lr
+
+  .unreq out_p
+
+.align  2
+.global AIMER_NAMESPACE(GF_add)
+.type   AIMER_NAMESPACE(GF_add), %function
+AIMER_NAMESPACE(GF_add):  // [R0] = [R1] XOR [R2], six words
+  out_p       .req R0
+  in0_p       .req R1
+  in1_p       .req R2
+
+  .equ width, 4  // bytes per word
+
+  ldr.w R3,  [in0_p, #0 * width]  // each word is loaded before its store,
+  ldr.w R12, [in1_p, #0 * width]  // so out_p may equal in0_p or in1_p
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #0 * width]
+
+  ldr.w R3,  [in0_p, #1 * width]
+  ldr.w R12, [in1_p, #1 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #1 * width]
+
+  ldr.w R3,  [in0_p, #2 * width]
+  ldr.w R12, [in1_p, #2 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #2 * width]
+
+  ldr.w R3,  [in0_p, #3 * width]
+  ldr.w R12, [in1_p, #3 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #3 * width]
+
+  ldr.w R3,  [in0_p, #4 * width]
+  ldr.w R12, [in1_p, #4 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #4 * width]
+
+  ldr.w R3,  [in0_p, #5 * width]
+  ldr.w R12, [in1_p, #5 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #5 * width]
+
+  bx    lr
+
+  .unreq out_p
+  .unreq in0_p
+  .unreq in1_p
+
+.macro or_shift_and in_a, con_a, off_a  // in_a = (in_a | (in_a << off_a)) & con_a
+  orr.w \in_a, \in_a, \in_a, lsl #\off_a
+  and.w \in_a, \in_a, \con_a
+.endm
+
+.align  2
+.global AIMER_NAMESPACE(GF_sqr_s)
+.type   AIMER_NAMESPACE(GF_sqr_s), %function
+AIMER_NAMESPACE(GF_sqr_s):  // [R0] = square of the six-word element at [R1], reduced to six words
+  out_p       .req R0
+  in_p        .req R1
+
+  in0         .req R2
+  in1         .req R3
+  in2         .req R4
+  in3         .req R5
+  in4         .req R6
+  in5         .req R7
+  in6         .req R8
+  in7         .req R9
+
+  .equ C0,    1431655765  // 0x55555555
+  .equ C1,    858993459   // 0x33333333
+  .equ C2,    252645135   // 0x0F0F0F0F
+  .equ C3,    16711935    // 0x00FF00FF
+  .equ C4,    4294967295  // 0xFFFFFFFF
+  .equ width, 4
+
+  push.w {R4-R10, lr}
+
+  ldr.w in0, [in_p, #2 * width]  // a[1]
+  ldr.w in2, [in_p, #3 * width]
+  ldr.w in4, [in_p, #4 * width]  // a[2]
+  ldr.w in6, [in_p, #5 * width]
+
+  lsr.w in1, in0, #16  // odd registers take the upper 16 bits of each word
+  lsr.w in3, in2, #16
+  lsr.w in5, in4, #16
+  lsr.w in7, in6, #16
+
+  mov.w R10, C4
+
+  and.w in0, in0, R10, lsr #16  // even registers keep the lower 16 bits
+  and.w in2, in2, R10, lsr #16
+  and.w in4, in4, R10, lsr #16
+  and.w in6, in6, R10, lsr #16
+
+  or_shift_and in0, C3, 8  // four rounds spread every 16-bit half over the even bit positions
+  or_shift_and in1, C3, 8
+  or_shift_and in2, C3, 8
+  or_shift_and in3, C3, 8
+  or_shift_and in4, C3, 8
+  or_shift_and in5, C3, 8
+  or_shift_and in6, C3, 8
+  or_shift_and in7, C3, 8
+
+  or_shift_and in0, C2, 4
+  or_shift_and in1, C2, 4
+  or_shift_and in2, C2, 4
+  or_shift_and in3, C2, 4
+  or_shift_and in4, C2, 4
+  or_shift_and in5, C2, 4
+  or_shift_and in6, C2, 4
+  or_shift_and in7, C2, 4
+
+  or_shift_and in0, C1, 2
+  or_shift_and in1, C1, 2
+  or_shift_and in2, C1, 2
+  or_shift_and in3, C1, 2
+  or_shift_and in4, C1, 2
+  or_shift_and in5, C1, 2
+  or_shift_and in6, C1, 2
+  or_shift_and in7, C1, 2
+
+  or_shift_and in0, C0, 1
+  or_shift_and in1, C0, 1
+  or_shift_and in2, C0, 1
+  or_shift_and in3, C0, 1
+  or_shift_and in4, C0, 1
+  or_shift_and in5, C0, 1
+  or_shift_and in6, C0, 1
+  or_shift_and in7, C0, 1
+
+  // t = temp[3] ^ ((temp[5] >> 57) ^ (temp[5] >> 62) ^ (temp[5] >> 63));
+  eor.w in2, in2, in7, lsr #25
+  eor.w in2, in2, in7, lsr #30
+  eor.w in2, in2, in7, lsr #31
+
+  // c[2] = temp[2] ^ temp[5];
+  eor.w in0, in0, in6
+  eor.w in1, in1, in7
+
+  // c[2] ^= (temp[5] << 7) | (temp[4] >> 57);
+  // c[2] ^= (temp[5] << 2) | (temp[4] >> 62);
+  // c[2] ^= (temp[5] << 1) | (temp[4] >> 63);
+  eor.w in0, in0, in5, lsr #25
+  eor.w in0, in0, in5, lsr #30
+  eor.w in0, in0, in5, lsr #31
+
+  eor.w in0, in0, in6, lsl #7
+  eor.w in0, in0, in6, lsl #2
+  eor.w in0, in0, in6, lsl #1
+
+  eor.w in1, in1, in6, lsr #25
+  eor.w in1, in1, in6, lsr #30
+  eor.w in1, in1, in6, lsr #31
+
+  eor.w in1, in1, in7, lsl #7
+  eor.w in1, in1, in7, lsl #2
+  eor.w in1, in1, in7, lsl #1
+
+  str.w in0, [out_p, #4 * width]  // input words 4 and 5 were read above,
+  str.w in1, [out_p, #5 * width]  // so this is safe when out_p equals in_p
+
+  ldr.w in0, [in_p, #0 * width]  // a[0]
+  ldr.w in6, [in_p, #1 * width]
+
+  lsr.w in1, in0, #16
+  lsr.w in7, in6, #16
+
+  and.w in0, in0, R10, lsr #16
+  and.w in6, in6, R10, lsr #16
+
+  or_shift_and in0, C3, 8
+  or_shift_and in1, C3, 8
+  or_shift_and in6, C3, 8
+  or_shift_and in7, C3, 8
+
+  or_shift_and in0, C2, 4
+  or_shift_and in1, C2, 4
+  or_shift_and in6, C2, 4
+  or_shift_and in7, C2, 4
+
+  or_shift_and in0, C1, 2
+  or_shift_and in1, C1, 2
+  or_shift_and in6, C1, 2
+  or_shift_and in7, C1, 2
+
+  or_shift_and in0, C0, 1
+  or_shift_and in1, C0, 1
+  or_shift_and in6, C0, 1
+  or_shift_and in7, C0, 1
+
+  // c[1] = temp[1] ^ temp[4];
+  eor.w in6, in6, in4
+  eor.w in7, in7, in5
+
+  // c[1] ^= (temp[4] << 7) | (t >> 57);
+  // c[1] ^= (temp[4] << 2) | (t >> 62);
+  // c[1] ^= (temp[4] << 1) | (t >> 63);
+  eor.w in6, in6, in3, lsr #25
+  eor.w in6, in6, in3, lsr #30
+  eor.w in6, in6, in3, lsr #31
+
+  eor.w in6, in6, in4, lsl #7
+  eor.w in6, in6, in4, lsl #2
+  eor.w in6, in6, in4, lsl #1
+
+  eor.w in7, in7, in4, lsr #25
+  eor.w in7, in7, in4, lsr #30
+  eor.w in7, in7, in4, lsr #31
+
+  eor.w in7, in7, in5, lsl #7
+  eor.w in7, in7, in5, lsl #2
+  eor.w in7, in7, in5, lsl #1
+
+  str.w in6, [out_p, #2 * width]
+  str.w in7, [out_p, #3 * width]
+
+  // c[0] = temp[0] ^ t;
+  eor.w in0, in0, in2
+  eor.w in1, in1, in3
+
+  // c[0] ^= (t << 7);
+  // c[0] ^= (t << 2);
+  // c[0] ^= (t << 1);
+  eor.w in0, in0, in2, lsl #7
+  eor.w in0, in0, in2, lsl #2
+  eor.w in0, in0, in2, lsl #1
+
+  eor.w in1, in1, in2, lsr #25
+  eor.w in1, in1, in2, lsr #30
+  eor.w in1, in1, in2, lsr #31
+
+  eor.w in1, in1, in3, lsl #7
+  eor.w in1, in1, in3, lsl #2
+  eor.w in1, in1, in3, lsl #1
+
+  str.w in0, [out_p, #0 * width]
+  str.w in1, [out_p, #1 * width]
+
+  pop.w {R4-R10, pc}
+
+  // unlink register name (in0 .. in7 stay defined)
+  .unreq in_p
+  .unreq out_p
+
+.macro lut_access0_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset  // first window: starts the accumulator in0_0..in0_2
+  and \sp1, \mask, \b0_1, lsr #\offset  // byte offset of the entry chosen by b0_1
+  and \sp0, \mask, \b0_0, lsr #\offset  // byte offset of the entry chosen by b0_0
+  add \sp1, \sp1, sp                    // the table is addressed relative to sp
+  add \sp0, \sp0, sp
+
+  ldr \out1_0, [\sp1, #0]
+  ldr \in0_2, [\sp1, #4]
+
+  ldr \in0_0, [\sp0, #0]
+  ldr \in0_1, [\sp0, #4]
+  eor \in0_1, \in0_1, \out1_0  // the b0_1 entry is added one word higher than the b0_0 entry
+.endm
+
+.macro lut_access0_1_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset  // second window: in0_3 is written for the first time
+  and \sp1, \mask, \b0_1, lsr #\offset
+  and \sp0, \mask, \b0_0, lsr #\offset
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+
+  lsr \in0_3, \in0_2, #28  // in0_3 receives the 4 bits shifted out of in0_2
+  ldr \out1_0, [\sp1, #0]
+  ldr \out1_1, [\sp1, #4]
+
+  lsl \in0_2, \in0_2, #4   // accumulator <<= 4, interleaved with the table loads
+  ldr \out0_0, [\sp0, #0]
+  ldr \out0_1, [\sp0, #4]
+
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  lsl \in0_1, \in0_1, #4
+  orr \in0_1, \in0_1, \in0_0, lsr #28
+  lsl \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0  // add the b0_1 entry at words 1 and 2
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0  // add the b0_0 entry at words 0 and 1
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro lut_access0_1 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset  // generic window: shift all four words by 4, add two entries
+  and \sp1, \mask, \b0_1, lsr #\offset
+  and \sp0, \mask, \b0_0, lsr #\offset
+  lsl \in0_3, \in0_3, #4
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+
+  orr \in0_3, \in0_3, \in0_2, lsr #28
+  ldr \out1_0, [\sp1, #0]
+  ldr \out1_1, [\sp1, #4]
+
+  lsl \in0_2, \in0_2, #4
+  ldr \out0_0, [\sp0, #0]
+  ldr \out0_1, [\sp0, #4]
+
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  lsl \in0_1, \in0_1, #4
+  orr \in0_1, \in0_1, \in0_0, lsr #28
+  lsl \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0  // add the b0_1 entry at words 1 and 2
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0  // add the b0_0 entry at words 0 and 1
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro lut_access1 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset  // last window: the index is formed with a left shift
+  lsl \in0_3, \in0_3, #4
+  and \sp1, \mask, \b0_1, lsl #\offset
+  orr \in0_3, \in0_3, \in0_2, lsr #28
+  and \sp0, \mask, \b0_0, lsl #\offset
+
+  lsl \in0_2, \in0_2, #4
+  add \sp1, \sp1, sp
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  add \sp0, \sp0, sp
+
+  lsl   \in0_1, \in0_1, #4
+  ldmia \sp1, {\out1_0-\out1_1}  // both words of the entry in one load
+  orr   \in0_1, \in0_1, \in0_0, lsr #28
+  ldmia \sp0, {\out0_0-\out0_1}
+  lsl   \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, offset  // adds b << (32 + offset) when bit offset of r0_ret is set
+  and \mask, \one, \r0_ret, lsr #\offset  // isolate the bit ...
+  sub \mask, \zero, \mask                 // ... and stretch it to all-zeros or all-ones (no branch)
+  and \mask0_1, \b0_0, \mask
+  and \mask0_2, \b0_1, \mask
+
+  lsl \mask0_0, \mask0_1, #\offset
+  lsr \mask0_1, \mask0_1, #32 - \offset
+  orr \mask0_1, \mask0_1, \mask0_2, lsl #\offset
+  lsr \mask0_2, \mask0_2, #32 - \offset
+
+  eor \in0_1, \in0_1, \mask0_0
+  eor \in0_2, \in0_2, \mask0_1
+  eor \in0_3, \in0_3, \mask0_2
+.endm
+
+.macro last_mask1 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, offset  // as last_mask0, for a shift that leaves a single bit
+  sub \mask, \zero, \r0_ret, lsr #\offset  // used with offset 31, where the shifted value is 0 or 1
+  and \mask0_1, \b0_0, \mask
+  and \mask0_2, \b0_1, \mask
+
+  lsl \mask0_0, \mask0_1, #\offset
+  lsr \mask0_1, \mask0_1, #32 - \offset
+  orr \mask0_1, \mask0_1, \mask0_2, lsl #\offset
+  lsr \mask0_2, \mask0_2, #32 - \offset
+
+  eor \in0_1, \in0_1, \mask0_0
+  eor \in0_2, \in0_2, \mask0_1
+  eor \in0_3, \in0_3, \mask0_2
+.endm
+
+.global AIMER_NAMESPACE(poly64_mul)
+.type   AIMER_NAMESPACE(poly64_mul), %function
+AIMER_NAMESPACE(poly64_mul):  // XOR-based 64x64 product of [R0] and [R1]: upper half to [R2], lower half to [R3]
+  t0_0    .req R0
+  t0_1    .req R1
+  t1_0    .req R2
+  t1_1    .req R3
+  t2_0    .req R4
+  t2_1    .req R5
+  t3_0    .req R6
+  t3_1    .req R7
+  t4_0    .req R8
+  t4_1    .req R9
+  t5_0    .req R10
+  t5_1    .req R11
+
+  r1_copy .req R12
+  t_base  .req R14  // lr is reused as a pointer; the return address was pushed
+
+  sp0     .req R12
+  sp1     .req R14
+
+  b0_0    .req R0
+  b0_1    .req R1
+
+  in0_0   .req R2
+  in0_1   .req R3
+  in0_2   .req R4
+  in0_3   .req R5
+
+  out0_0  .req R6
+  out0_1  .req R7
+  out1_0  .req R8
+  out1_1  .req R9
+
+  mask    .req R10
+
+  zero    .req R6
+  one     .req R7
+
+  r0_ret  .req R8
+
+  mask0_0 .req R9
+  mask0_1 .req R11
+  mask0_2 .req R12
+
+  push  {R4-R11, lr}
+  push  {R2-R3}   // keep both output pointers
+
+  ldr   t1_0, [R0, #0]
+  ldr   t1_1, [R0, #4]
+  push  {t1_1}    // keep the unmasked upper word of the first operand
+
+  sub   sp, #128  // allocating space in the stack
+                  // 8 bytes * 16 = 128 bytes
+  mov   t_base, sp
+  mov   r1_copy, R1
+
+  mov   t0_0, #0
+  mov   t0_1, #0
+
+  and   t1_1, #0x1FFFFFFF  // clear the top 3 bits; they are handled by last_mask0/1
+
+  lsl   t2_1, t1_1, #1
+  orr   t2_1, t2_1, t1_0, lsr #31
+  lsl   t2_0, t1_0, #1
+
+  eor   t3_0, t1_0, t2_0
+  eor   t3_1, t1_1, t2_1
+
+  lsl   t4_1, t2_1, #1
+  orr   t4_1, t4_1, t2_0, lsr #31
+  lsl   t4_0, t2_0, #1
+
+  eor   t5_0, t1_0, t4_0
+  eor   t5_1, t1_1, t4_1
+
+  stmia t_base!, {t0_0-t5_1}  // 4 bytes X 12 elements  = 48 bytes (table entries 0 .. 5)
+
+  eor   t0_0, t2_0, t4_0
+  eor   t0_1, t2_1, t4_1
+
+  lsl   t2_1, t4_1, #1
+  orr   t2_1, t2_1, t4_0, lsr #31
+  lsl   t2_0, t4_0, #1
+
+  eor   t5_0, t2_0, t3_0
+  eor   t5_1, t2_1, t3_1
+
+  eor   t3_0, t2_0, t1_0
+  eor   t3_1, t2_1, t1_1
+
+  eor   t4_0, t4_0, t0_0
+  eor   t4_1, t4_1, t0_1
+
+  eor   t4_0, t4_0, t2_0
+  eor   t4_1, t4_1, t2_1
+
+  eor   t1_0, t1_0, t0_0
+  eor   t1_1, t1_1, t0_1
+
+  stmia t_base!, {t0_0-t5_1}  // 4 bytes X 12 elements  = 48 bytes (table entries 6 .. 11)
+
+  eor   t1_0, t5_0, t0_0
+  eor   t1_1, t5_1, t0_1
+
+  eor   t2_0, t2_0, t0_0
+  eor   t2_1, t2_1, t0_1
+
+  eor   t3_0, t3_0, t0_0
+  eor   t3_1, t3_1, t0_1
+
+  eor   t0_0, t4_0, t0_0
+  eor   t0_1, t4_1, t0_1
+
+  stmia t_base, {t0_0-t3_1}  // 4 bytes X 8 elements  = 32 bytes (table entries 12 .. 15)
+
+  ldmia r1_copy, {b0_0-b0_1}
+  mov   mask, #0x00000078    // 4-bit window index already scaled by the 8-byte entry size
+
+  lut_access0_0   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 25
+  lut_access0_1_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 21
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 17
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 13
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 9
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 5
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 1
+  lut_access1     sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 3
+
+  mov   zero, #0
+  mov   one,  #1
+  ldr   r0_ret, [sp, #128]  // upper word of the first operand saved before masking
+
+  last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 29
+  last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 30
+  last_mask1 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 31
+
+  ldr   R0, [sp, #132]  // saved R2
+  ldr   R1, [sp, #136]  // saved R3
+  add   sp, #140  // restoring stack: 128 (table) + 4 + 8 (saved words)
+
+  str   in0_0, [R1, #0]  // lower 64 bits of the product
+  str   in0_1, [R1, #4]
+
+  str   in0_2, [R0, #0]  // upper 64 bits of the product
+  str   in0_3, [R0, #4]
+
+  pop   {R4-R11, pc}
+
+  // unlink register name
+  .unreq t0_0
+  .unreq t0_1
+  .unreq t1_0
+  .unreq t1_1
+  .unreq t2_0
+  .unreq t2_1
+  .unreq t3_0
+  .unreq t3_1
+  .unreq t4_0
+  .unreq t4_1
+  .unreq t5_0
+  .unreq t5_1
+
+  .unreq r1_copy
+  .unreq t_base
+
+  .unreq sp0
+  .unreq sp1
+
+  .unreq b0_0
+  .unreq b0_1
+
+  .unreq in0_0
+  .unreq in0_1
+  .unreq in0_2
+  .unreq in0_3
+
+  .unreq out0_0
+  .unreq out0_1
+  .unreq out1_0
+  .unreq out1_1
+
+  .unreq mask
+
+  .unreq zero
+  .unreq one
+
+  .unreq r0_ret
+
+  .unreq mask0_0
+  .unreq mask0_1
+  .unreq mask0_2
diff --git a/crypto_sign/aimer192f/m4stack/aim2.c b/crypto_sign/aimer192f/m4stack/aim2.c
new file mode 100644
index 00000000..b5dbbc85
--- /dev/null
+++ b/crypto_sign/aimer192f/m4stack/aim2.c
@@ -0,0 +1,491 @@
+// SPDX-License-Identifier: MIT
+
+#include "aim2.h"
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+// inverse Mersenne S-box with e1 = 17: out = in ^ d with
+// d = (2 ^ 17 - 1) ^ (-1) mod (2 ^ 192 - 1)
+//   = 0xad6b56b5ab5ad5ad6ad6b56b5ab5ad5ad6ad6b56b5ab5ad5
+// processed in chunks: ad6b56b5ab5ad5 ad6 ad6b56b5ab5ad5 ad6 ad6b56b5ab5ad5
+void GF_exp_invmer_e_1(GF out, const GF in)
+{
+  size_t i;
+  GF t1 = {0,}, t2 = {0,};
+  GF table_5 = {0,}, table_6 = {0,};
+  GF table_a = {0,}, table_b = {0,}, table_d = {0,};
+
+  // table_d = in ^ 2 (temporary), t1 = in ^ 4
+  GF_sqr_s(table_d, in);
+  GF_sqr_s(t1, table_d);
+
+  // table_5 = in ^ 5
+  GF_mul_s(table_5, t1, in);
+  // table_6 = in ^ 6
+  GF_mul_s(table_6, table_5, in);
+  // table_a = in ^ 10 = (in ^ 5) ^ 2
+  GF_sqr_s(table_a, table_5);
+  // table_b = in ^ 11
+  GF_mul_s(table_b, table_a, in);
+  // table_d = in ^ 13 = in ^ 11 * in ^ 2
+  GF_mul_s(table_d, table_b, table_d);
+
+  // t1 = in ^ (0xad) = (in ^ 0xa) ^ 16 * in ^ 0xd
+  GF_sqr_s(t1, table_a);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t2 = in ^ (0xad 6), table_d = in ^ (0xad5); both are reused further down
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t2, t1, table_6);
+  GF_mul_s(table_d, t1, table_5);
+
+  // t1 = in ^ (0xad6 b), continuing from t2
+  GF_sqr_s(t1, t2);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xad6b 5)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xad6b5 6)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_6);
+
+  // t1 = in ^ (0xad6b56 b)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xad6b56b 5)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xad6b56b5 a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xad6b56b5a b)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xad6b56b5ab 5)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_5);
+
+  // table_d = in ^ (0xad6b56b5ab5 ad5): shift by 12 bits, append old table_d
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(table_d, t1, table_d);
+
+  // t1 = in ^ (0xad6b56b5ab5ad5 ad6): 12 squarings in total, then append t2
+  GF_sqr_s(t1, table_d);
+  for (i = 1; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xad6b56b5ab5ad5ad6 ad6b56b5ab5ad5)
+  for (i = 0; i < 56; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xad6b56b5ab5ad5ad6ad6b56b5ab5ad5 ad6)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // out = in ^ (0xad6b56b5ab5ad5ad6ad6b56b5ab5ad5ad6 ad6b56b5ab5ad5)
+  for (i = 0; i < 56; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(out, t1, table_d);  // the only write to out
+}
+
+// inverse Mersenne S-box with e2 = 47: out = in ^ d with
+// d = (2 ^ 47 - 1) ^ (-1) mod (2 ^ 192 - 1)
+//   = 0xddddddddddddbbbbbbbbbbbb777777777776eeeeeeeeeeed
+// dddd dddd dddd bb bb bb bb bb bb 77 77 77 77 77 76 ee ee ee ee ee ed
+void GF_exp_invmer_e_2(GF out, const GF in)
+{
+  size_t i;
+  GF t1 = {0,}, t2 = {0,};
+  GF table_6 = {0,}, table_7 = {0,};
+  GF table_b = {0,}, table_d = {0,}, table_e = {0,};
+
+  // table_d = in ^ 2 (temporary), t1 = in ^ 3
+  GF_sqr_s(table_d, in);
+  GF_mul_s(t1, table_d, in);
+
+  // table_6 = (in ^ 3) ^ 2
+  GF_sqr_s(table_6, t1);
+  // table_7 = in ^ 7
+  GF_mul_s(table_7, table_6, in);
+  // table_b = in ^ 11 = (in ^ 2) ^ 2 * in ^ 7
+  GF_sqr_s(table_b, table_d);
+  GF_mul_s(table_b, table_b, table_7);
+  // table_d = in ^ 13 = in ^ 6 * in ^ 7
+  GF_mul_s(table_d, table_6, table_7);
+  // table_e = in ^ 14 = (in ^ 7) ^ 2
+  GF_sqr_s(table_e, table_7);
+
+  // table_b = in ^ (0xbb)
+  GF_sqr_s(t1, table_b);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(table_b, t1, table_b);
+
+  // table_7 = in ^ (0x77), table_6 = in ^ (0x76); t1 = in ^ (0x70) feeds both
+  GF_sqr_s(t1, table_7);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(table_6, t1, table_6);
+  GF_mul_s(table_7, t1, table_7);
+
+  // t2 = in ^ (0xdd)
+  GF_sqr_s(t1, table_d);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t2, t1, table_d);
+
+  // table_e = in ^ (0xee), table_d = in ^ (0xed); t1 = in ^ (0xe0) feeds both
+  GF_sqr_s(t1, table_e);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(table_d, t1, table_d);
+  GF_mul_s(table_e, t1, table_e);
+
+  // t2 = in ^ (0xdd dd): 8 squarings in total
+  GF_sqr_s(t1, t2);
+  for (i = 1; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t2, t1, t2);
+
+  // t1 = in ^ (0xdddd dddd): 16 squarings in total
+  GF_sqr_s(t1, t2);
+  for (i = 1; i < 16; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xdddddddd dddd)
+  for (i = 0; i < 16; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xdddddddddddd bb)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xddddddddddddbb bb)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xddddddddddddbbbb bb)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xddddddddddddbbbbbb bb)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbb bb)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbb bb)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb 77)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb77 77)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb7777 77)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb777777 77)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb77777777 77)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb7777777777 76)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_6);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb777777777776 ee)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_e);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb777777777776ee ee)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_e);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb777777777776eeee ee)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_e);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb777777777776eeeeee ee)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_e);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb777777777776eeeeeeee ee)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_e);
+
+  // out = in ^ (0xddddddddddddbbbbbbbbbbbb777777777776eeeeeeeeee ed)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(out, t1, table_d);  // the only write to out
+}
+
+// Mersenne S-box with e_star = 5: out = in ^ (2 ^ 5 - 1) = in ^ 31
+void GF_exp_mer_e_star(GF out, const GF in)
+{
+  GF pow_3 = {0,};
+  GF acc = {0,};
+
+  // pow_3 = in ^ 3 = in ^ (2 ^ 2 - 1); acc briefly holds in ^ 2
+  GF_sqr_s(acc, in);
+  GF_mul_s(pow_3, acc, in);
+
+  // acc = in ^ 7 = in ^ (2 ^ 3 - 1), built as (in ^ 3) ^ 2 * in
+  GF_sqr_s(acc, pow_3);
+  GF_mul_s(acc, acc, in);
+
+  // out = in ^ 31 = (in ^ 7) ^ 4 * in ^ 3
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(out, acc, pow_3);
+}
+
+void generate_matrices_L_and_U(
+        GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF vector_b,
+        const uint8_t iv[AIM2_IV_SIZE])
+{
+  // Absorb iv, then squeeze one field element per row and split it: L keeps
+  // bits >= row, U keeps bits <= row, and both get bit `row` forced to 1.
+  uint8_t squeezed[AIM2_NUM_BYTES_FIELD];
+  GF sample = {0,};
+  hash_instance xof;
+
+  hash_init(&xof);
+  hash_update(&xof, iv, AIM2_IV_SIZE);
+  hash_final(&xof);
+
+  for (size_t box = 0; box < AIM2_NUM_INPUT_SBOX; box++)
+  {
+    for (size_t r = 0; r < AIM2_NUM_BITS_FIELD; r++)
+    {
+      const size_t diag_word = r / 64;
+      const uint64_t diag_bit = ((uint64_t)1) << (r % 64);
+      const uint64_t keep_high = ((uint64_t)-1) << (r % 64);
+      const uint64_t keep_low = ~keep_high;
+
+      hash_squeeze(&xof, squeezed, AIM2_NUM_BYTES_FIELD);
+      GF_from_bytes(sample, squeezed);
+
+      // words before the diagonal word: nothing goes to L, everything to U
+      for (size_t w = 0; w < diag_word; w++)
+      {
+        matrix_L[box][r][w] = 0;
+        matrix_U[box][r][w] = sample[w];
+      }
+      // diagonal word: split at bit r % 64 and set the diagonal bit in both
+      matrix_L[box][r][diag_word] = (sample[diag_word] & keep_high) | diag_bit;
+      matrix_U[box][r][diag_word] = (sample[diag_word] & keep_low) | diag_bit;
+      // words after the diagonal word: everything goes to L, nothing to U
+      for (size_t w = diag_word + 1; w < AIM2_NUM_WORDS_FIELD; w++)
+      {
+        matrix_L[box][r][w] = sample[w];
+        matrix_U[box][r][w] = 0;
+      }
+    }
+  }
+  // NOTE(review): vector_b bytes bypass GF_from_bytes here - confirm intended
+  hash_squeeze(&xof, (uint8_t *)vector_b, AIM2_NUM_BYTES_FIELD);
+  hash_ctx_release(&xof);
+}
+
+// For the iv-derived L, U: matrix_A[s][r] = transposed_matmul(U[s][r], L[s]).
+void generate_matrix_LU(GF matrix_A[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+                        GF vector_b, const uint8_t iv[AIM2_IV_SIZE])
+{
+  GF lower[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF upper[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+
+  generate_matrices_L_and_U(lower, upper, vector_b, iv);
+
+  for (size_t box = 0; box < AIM2_NUM_INPUT_SBOX; box++)
+  {
+    const GF *lower_rows = (const GF *)lower[box];
+    for (size_t row = 0; row < AIM2_NUM_BITS_FIELD; row++)
+    {
+      GF_transposed_matmul(matrix_A[box][row], upper[box][row], lower_rows);
+    }
+  }
+}
+
+void aim2(uint8_t ct[AIM2_NUM_BYTES_FIELD],
+          const uint8_t pt[AIM2_NUM_BYTES_FIELD],
+          const uint8_t iv[AIM2_IV_SIZE])
+{
+  // ct = Mer_e_star(affine(invMer_e1(pt + c0), invMer_e2(pt + c1))) + pt,
+  // where the affine layer (matrices L, U and vector b) is derived from iv
+  // and c0, c1 are aim2_constants[0], aim2_constants[1].
+  GF lower[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF upper[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF affine_const = {0,};
+
+  GF branch[AIM2_NUM_INPUT_SBOX];
+  GF x = {0,}, y = {0,};
+
+  // load the input block and generate the iv-dependent matrices and vector
+  GF_from_bytes(x, pt);
+  generate_matrices_L_and_U(lower, upper, affine_const, iv);
+
+  // branch 0: constant addition, e1 inverse Mersenne S-box, then U and L
+  GF_add(branch[0], x, aim2_constants[0]);
+  GF_exp_invmer_e_1(branch[0], branch[0]);
+  GF_transposed_matmul(branch[0], branch[0], (const GF *)upper[0]);
+  GF_transposed_matmul(branch[0], branch[0], (const GF *)lower[0]);
+
+  // branch 1: constant addition, e2 inverse Mersenne S-box, then U and L
+  GF_add(branch[1], x, aim2_constants[1]);
+  GF_exp_invmer_e_2(branch[1], branch[1]);
+  GF_transposed_matmul(branch[1], branch[1], (const GF *)upper[1]);
+  GF_transposed_matmul(branch[1], branch[1], (const GF *)lower[1]);
+
+  // affine layer, final part: merge both branches and add the vector b
+  GF_add(branch[0], branch[0], branch[1]);
+  GF_add(branch[0], branch[0], affine_const);
+
+  // non-linear component: Mersenne S-box
+  GF_exp_mer_e_star(branch[0], branch[0]);
+
+  // linear component: feed-forward of the input, then serialize into ct
+  GF_add(y, branch[0], x);
+  GF_to_bytes(ct, y);
+}
+
+void aim2_sbox_outputs(GF sbox_outputs[AIM2_NUM_INPUT_SBOX], const GF pt)
+{
+  // constant addition: sbox_outputs[i] = GF_add(pt, aim2_constants[i]), i = 0, 1
+  GF_add(sbox_outputs[0], pt, aim2_constants[0]);
+  GF_add(sbox_outputs[1], pt, aim2_constants[1]);
+
+  // in place: e1 inverse Mersenne S-box on entry 0, e2 variant on entry 1
+  GF_exp_invmer_e_1(sbox_outputs[0], sbox_outputs[0]);
+  GF_exp_invmer_e_2(sbox_outputs[1], sbox_outputs[1]);
+}
diff --git a/crypto_sign/aimer192f/m4stack/aim2.h b/crypto_sign/aimer192f/m4stack/aim2.h
new file mode 100644
index 00000000..b30d4cb9
--- /dev/null
+++ b/crypto_sign/aimer192f/m4stack/aim2.h
@@ -0,0 +1,435 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef AIM2_H
+#define AIM2_H
+
+#include "field.h"
+#include "params.h"
+#include <stdint.h>
+
+static const GF aim2_constants[AIM2_NUM_INPUT_SBOX] =
+{ // GF_add()-ed to the plaintext ahead of the matching inverse Mersenne S-box
+  {0xc0ac29b7c97c50dd, 0xbe5466cf34e90c6c, 0x452821e638d01377}, // e1 input
+  {0xd1310ba698dfb5ac, 0x9216d5d98979fb1b, 0x3f84d5b5b5470917}  // e2 input
+};
+
+static const GF aim2_e1_power_matrix[AIM2_NUM_BITS_FIELD] =
+{
+  {0x0000000000000001, 0x0000000000000000, 0x0000000000000000},
+  {0x24187d60404121f6, 0x994d0c36800d12c1, 0x00911dd52a0924f1},
+  {0x764f49362db3c478, 0x3bcc2005010a3fa6, 0x402147d6af1a6ff4},
+  {0x1c0878591079091d, 0x9b08ffde1c878f59, 0x8ff70000000021c2},
+  {0xbc23dcb74c10198f, 0xe23fb48357412666, 0x70031ccb7f97795b},
+  {0xffa72d9a27550570, 0xc7dab56f7d5ade7c, 0x78cd4c6283845a4b},
+  {0x655b34aa00430d9a, 0x0150004209eea37c, 0xea5061fe40551141},
+  {0x291b4f90d5814c36, 0xcda4bfb158be9a9b, 0x0d4558cc51c4127c},
+  {0xbe4eb108521087f8, 0x855a49e49b1f9165, 0xfa15129aaa8d8745},
+  {0xef60386cb35ccf9a, 0x5115765ff710f9f9, 0x205677891921e135},
+  {0xbed705ee53ec571d, 0x97ef8c6dd0851236, 0xdfb8887b08ee7b6d},
+  {0x6731ce99be825c5a, 0x78665e68455482e1, 0x8b867f2046054b3e},
+  {0x008fe70500592609, 0x6419eeb2829f34c3, 0x8f95a35e28a915f4},
+  {0x4dd556b654d54730, 0x07e8d738dc4b2c41, 0x6de823272f319c70},
+  {0xc805945260585e93, 0xf3efb93595438399, 0x387f3dab97add8fd},
+  {0x8825784a2bb54db6, 0x8d1d21f68a9fed14, 0xd72c5de2e4375500},
+  {0xd9fbd5d41179e461, 0xbaa9f9428fe27896, 0x49998ea2c43c70ee},
+  {0xba1e061ac9218b6b, 0x93a1c1ea0a23984a, 0x145015f4bca9f514},
+  {0xb2829eadb1319c61, 0xf21008aca9c587af, 0x491dfc66b48bb406},
+  {0xdc192cc5729969e6, 0x19aeec2c6a3facb8, 0xeaf05f73c034e88f},
+  {0x5cc5d35af8af5039, 0x64bfd6b3c8401142, 0x4d083af0e0cecd4f},
+  {0xbcb663181c16e418, 0x9d73d6e08b40b1cf, 0xe6a19d2ea608b779},
+  {0x8f9e2660cdf64ce3, 0x6e790dfe030df1e7, 0xf36bdb76802d4809},
+  {0x24e27b21fdd534eb, 0x9b2abc8327bb58a6, 0xa60607784f3d2a8a},
+  {0x6470b72d839b493a, 0x3de3bd12dbc9236b, 0xab0e0e81db838cab},
+  {0x9fa25765dfa0dc0c, 0xa4866af77f3c1d39, 0xa22985fd177fb75e},
+  {0x1bd0dcf82dfcbaf6, 0x2778cab77faeae14, 0x144c9d871ac906e8},
+  {0xab206aa0299e585a, 0x1f2a1c115b2b24e7, 0xd683dc1df4f0e8e4},
+  {0x3db096486b11d3fc, 0x1d88f50f57fb1318, 0xfbdbd02cf211be3b},
+  {0x83c0ed680040dbeb, 0x01d5321e9c73822a, 0x5c78f9da86ddc253},
+  {0xed72eb240cfd7027, 0xe43295f2eab71065, 0x7dad74ed8a4daf27},
+  {0x593448e3f55865bc, 0x3dbc22ef1d415b62, 0xff617d36a6e04fd7},
+  {0x79fee82d5e5f6225, 0xe933e7ffba3ad69f, 0x11333262fecf9f21},
+  {0xaccf982f89364968, 0x961868954276eacd, 0x3903286905b4951a},
+  {0x15f9d8aff0e99b99, 0x37d7fc3823e38e15, 0x8f3cf305ce9c3317},
+  {0x5f1db90ec8ff178c, 0xef61eb5b69c0cf16, 0xd6d4428841ba2406},
+  {0x6c1d820160b3e589, 0x1655a37c12244e16, 0x1506fe0d42af221a},
+  {0x776220241d5f52f8, 0xbbd873a1a32d77fc, 0x2967ed932de2646d},
+  {0xb360b6c691f374f5, 0xe152921a89b1bb3a, 0x9bb32e5d9871acf2},
+  {0xbbae8029d2f0211d, 0xdfa58ed49cdc469a, 0x298aa1fd3b5fee94},
+  {0x311334572c4f58e2, 0xbd79cb94c83a4a65, 0x097731c2b9f63b2f},
+  {0x202f161d6f618d78, 0xb30f00f7d63d2b1c, 0xba3ba40cb586c147},
+  {0x6f6de8a66957b811, 0x933c64f745e4cb26, 0xe60acca62b3467da},
+  {0x2d52d8e03eadc408, 0x020b8ada8b0cbcfb, 0x97e520c15d31d866},
+  {0x17f79f53394c41f8, 0x8057746b55d4354d, 0x29944f234150b558},
+  {0xd48d6f8d466f4fb7, 0xe62aa6c05e099abf, 0xe72196d812cdf8ff},
+  {0x31086eee778187b7, 0x5f39e6312ab8e7fc, 0xd2794f291ba18edc},
+  {0x8bb7a2d05d52dd01, 0x898fee2a72a51691, 0xaf83c32d4f112cdf},
+  {0xf219effd62769131, 0x006ad7baac86fb08, 0xae1e7bed2f88d4eb},
+  {0x085e604007b4850e, 0x74969c7dc17959a0, 0x70af70f460fd6854},
+  {0x85048e661ea730d2, 0xccb4840c40f6c89e, 0xcb4b3836c98d0776},
+  {0xac7fadd0308807de, 0x93e5399425e1f409, 0x6cebcde031477957},
+  {0x12b09fb9d6bb04ff, 0xa5b0c0475b17d882, 0x9a2d1dc52a42cbfb},
+  {0x2a89655cb1fec3db, 0xb8a64412d508abdf, 0x3998b588ed04feab},
+  {0xa8687e88bff0829c, 0x671e2f2b99afe070, 0x2c08c6f71aa0fa09},
+  {0xe1ce5c820d6be145, 0x7c9485f929d3a113, 0x35a20e96293d131a},
+  {0xba53e0ea72f26b2a, 0x2c4dc2a431baa81b, 0x19674137360734db},
+  {0xde4269315e846bfb, 0x9ed583db0c4ca349, 0x315852fa0660ab68},
+  {0x00ae2ff5c859fcd1, 0x8a404e1ee645e1db, 0x9feadfee4a6a10b9},
+  {0x098454c0f608253b, 0xbf09d16ec3b96f79, 0xe63451db95697baf},
+  {0xa422cc6c5adc283f, 0xb7854c10a36c12d0, 0x9650b028e25b9107},
+  {0x8da1b75903dd2aa8, 0xef8f3a20c77f4c10, 0x11e6a8d176631e6f},
+  {0xe70563f20a26d72a, 0xc706a9184b4269ec, 0x01707c8cd370854b},
+  {0x4c497f712f722710, 0x40d97c17a9f96a81, 0x61ac088c7242b19b},
+  {0x9c1188e5b2c4043a, 0x15c4ce5e386918fd, 0xc2c19cddc8022f62},
+  {0x334dd52624b37647, 0x0ecfeb52b8db6b3a, 0x7cb0cc6a541d915f},
+  {0x0d2da3de5da05ab9, 0x4c8403040eb7a0a8, 0xaa43178d698e1d16},
+  {0x94dd24ac7d70454e, 0x19c81eacd2305f1d, 0xab7995a48e6230a2},
+  {0xc4c2698143f7ebe6, 0x9a9c3bf3c8dbc9bc, 0xef2ce69e69cf09cf},
+  {0xe4d55e8362bd6084, 0x4bd67382e024dfd0, 0x821aed870355bf63},
+  {0xd76139f98e468054, 0x61f1798f51310a13, 0x29046f782268e0dc},
+  {0xd415fc0d991dd093, 0x40c961038916982c, 0x50c6b0ef248e059b},
+  {0x9964bad18a8082f1, 0x666ff6785e18a4dd, 0x8ef30e5710f8282b},
+  {0xb414e2f6230594fe, 0x1bc6a73e670570f9, 0x58556965657d0723},
+  {0x7923079ff8bc88c9, 0x2009ba12607a4104, 0x79486291900310c9},
+  {0xbee4fd3a8ba864ef, 0x5df270cc7b675b45, 0x8fe410ae3a6416b4},
+  {0xed8ea038500ce1aa, 0x23cfffa4b08f7923, 0x24391c9872e1db52},
+  {0xea11414bd1ee6f54, 0x57a5ebe50ea4869b, 0x18f580aebbed4614},
+  {0x4d0c81d6ef843f2f, 0xfd169854c78d4b18, 0x7c36b2afccb84371},
+  {0x0c639f2dc76998e0, 0xdc8e28abec0a421f, 0xfba0c0a5251cd144},
+  {0x766dda3b823a1b74, 0x7f6d206bbd49261d, 0x710de4ad8beaa62e},
+  {0x7abd0b3c484d3910, 0x58abd14b6ee2e49b, 0x78652fe31e4d6d19},
+  {0x4dce3f2a407a25c2, 0x57d6ce10b19b7b99, 0x29cabd29d03528c3},
+  {0xf03c709f8b55bbc2, 0x10f449ee0641e483, 0xf60bd442dfd1a803},
+  {0x51d8a3af211b35bb, 0x2b0c872b328250e9, 0xb67d77e5c9d6d27a},
+  {0x9a731c8f091b2c24, 0x04cf41a716e1e225, 0x9b354a2d84899ec9},
+  {0x0748672bb3e504fb, 0xda648aaa478a326c, 0x0d85a4a55979e5ca},
+  {0xbb732bb90d147586, 0x446c43c25a19dc66, 0x18523f7f708eff36},
+  {0xc549edb1f37b1b15, 0x719aa23612aac7e4, 0x2c771e685e380ec2},
+  {0xe2b6b4207ad6a4b6, 0xf7cc2a116c9527ba, 0xdf6e5d55b2406221},
+  {0xb67a2baac610e044, 0xd425d94d1ebe4051, 0xb7bd1ce70c015395},
+  {0x64ff5ff72d64a1b1, 0xdaca2b8812d90ae6, 0x79a022efcc594eaf},
+  {0xc93cfa6de67bcacd, 0xa179dce6ffd14aec, 0x31528f0f0f3c6817},
+  {0x3ec18f7af7342039, 0xf8d7aa856a662ed9, 0x097b848460df8308},
+  {0xf037fa04d6ff2eb4, 0x1b6ec290719d4d0a, 0xe20e86a3b38d743e},
+  {0x8aea64bccc94d424, 0x2cc260f4f6b65bad, 0x355d31f6d901a260},
+  {0x140e5ae17cc96cb4, 0x620ee0a86b0eda0a, 0xb3fcecb29d358575},
+  {0x5ec85d1f29af07e2, 0xd6c8834f22331d6a, 0xcef37a820396e162},
+  {0xe344085d2eabc755, 0x6c6b136959c8ef7a, 0xbb22e260fa6a677a},
+  {0x7a64bfaa585ae30a, 0xe317efc967bbe220, 0x9a9780dfb02d4b7e},
+  {0x98c71744cd706ceb, 0xd177e9274ab5f551, 0x8353064dea82d011},
+  {0xff04c178eec23d3e, 0x2f460919349f2d47, 0x78fe5c7e69a969f2},
+  {0x40b0e4b5ba731b12, 0xdfdf6fb48e1eacca, 0x418adb73cc0cac43},
+  {0x07e5547b971dc85a, 0x9bb127d9e57350ef, 0xdb9801dd4d74063c},
+  {0x85c01e6cb0183fd9, 0x3ed03735d2254d39, 0x759b3422ff5ef8f1},
+  {0x6d72fa4b71c48c98, 0x3a991af37f04f9e1, 0xb32059432a68082f},
+  {0x3fe283302875d557, 0x8173481a149eee28, 0xeb7766a31793b0be},
+  {0x7acae2d67f591873, 0xb326c3aa2ed4173a, 0x1946cb0d5f62d04d},
+  {0x23bef9ae772d7f05, 0xe0bfc86b1d88610d, 0x74f165bcee4734eb},
+  {0x1d4726ce666680c3, 0x2ce0e6d607113532, 0xffc5de80c34f2df4},
+  {0xc2c05b149cdd1b58, 0x6944e26394cbe4d2, 0x97958f196f8c4c6b},
+  {0x270456c0b2e40aa0, 0x55d5c764d7670e84, 0x717d55b1ebf4aac6},
+  {0x20bc0c1aa67ad034, 0xd4281becc759401d, 0xa34c23a734c590ac},
+  {0x5847ae572b03bf5c, 0xfcac4377aa016371, 0xc37160769e1a862d},
+  {0x7dd17fc6d6f74010, 0x5b327c27eb1048e0, 0x9bdfc698b132189d},
+  {0xab7a432b47cdddcb, 0xa929bbd83ccbd1f9, 0x4d454da5089a34f2},
+  {0xb39461490efcedca, 0x53d60b8883762f77, 0x38149fe44801d6e1},
+  {0x7c94c03395823033, 0xdeeb603aad8b99f6, 0x6135272e4190f922},
+  {0x253f212e339c57b8, 0x4fbc0d5dd968a708, 0xf66bd639e3fb013b},
+  {0x6607bb8d9f1426d8, 0x0b9156b2a938e184, 0x1d6f7d7b46319a77},
+  {0x408e99af5df09232, 0xea04d07e17d71e98, 0x0961e3735a066ceb},
+  {0x0ac48cb89fc1d495, 0xe5ed5004fadbdcb6, 0xb371ec4e641dbdfd},
+  {0x870fba78bc9a5840, 0xa1372a9ae9b35641, 0xd7b9b31aedb9368d},
+  {0x9ec8171425817f91, 0x46d3a766e6d0c217, 0x6d410a83cdfd91e4},
+  {0xbaaf0e5bac52a284, 0x6184eb30dcfa0676, 0x10c8fb0ed6d0bdc9},
+  {0xac8814d3e0fe8707, 0x86d0ff1167e53b8a, 0x10e6600f84bbd4e6},
+  {0x747c0349c6a589dd, 0xf944627e4ef37152, 0x28e5a0f135a5a9bb},
+  {0x382e5c28e3026945, 0xee877613758af703, 0x2d922be5a1610e7f},
+  {0xcadae8499bb4cdb7, 0xd090031f77613a0d, 0xb775a4e76fd94b4f},
+  {0xd09a761e6898eced, 0x5669242c2f84d5da, 0x3d97c6bded80996e},
+  {0x2f95de059a47e03f, 0xfa75be47169ed83f, 0x87d30a6c8dff4a90},
+  {0xf8588b0cb7a0c692, 0xd246208d9f6dc4fb, 0xe36d575d6c2485c0},
+  {0x48c08c7013df5c58, 0x4d37effdea32dc30, 0xff80378ec9caad7d},
+  {0xf9e43db917658f34, 0xb76c0ff79e41f707, 0x8e4935c0b5c08083},
+  {0xb33f84c0bc9ef48d, 0xaab63f4f9f339a4c, 0xae55cf665e81d500},
+  {0x15e234561c4632f1, 0xe084e7a57d035829, 0xbaa1511cb0ed12a0},
+  {0x74f83ba7ec3568de, 0x1d7ecb2f352fdb0b, 0xd76964def60c29f6},
+  {0xd1c2b81f2e13a757, 0xf84d5af929439b5d, 0xc34a2d0878b81e8d},
+  {0x47767837fdba926b, 0x5683aec561752e96, 0x961ca0e7d4439beb},
+  {0x7d73c95d078b625f, 0x6e621c6b3817a9f1, 0xd300b482fda5d226},
+  {0x2cf83b998a66fb35, 0x4f0359eaa9684bfb, 0x2c460d7b4765cbc7},
+  {0xa5c0e6cf67395406, 0xb659d3e82276235d, 0x2c5c851229561369},
+  {0x3168901c3d8747a6, 0x4541eabd5d866402, 0xb768bb5b1a6b8379},
+  {0xb5fa4b6cdc308417, 0x8100841dbbeb59e8, 0x4db5eb632adc8553},
+  {0x2622070061628fa6, 0xc66a1ed278866e50, 0xfad328db6fb4acba},
+  {0x6734cb1adfc5db87, 0xd7f8cfed34d7e713, 0x259e5c52bef9b101},
+  {0xa077ba5e97f9e1c0, 0x21edc3275eed4b8f, 0xc2ddffec584d31bc},
+  {0xe8074b1519eb9faa, 0xa35f39294a8283ed, 0xffbfa9f0fdcce212},
+  {0x49406434389cd06b, 0x5241069e873cd010, 0xde4f448e7e3c47b6},
+  {0x8cb6dafda57a1b04, 0xb80b06fb012be0f6, 0x6c1f61ef626c5ee2},
+  {0x9e596d56ff39dd82, 0xfd823060d81e563c, 0xfe45b0659666e7bf},
+  {0x713e642578abac3b, 0x1e13b3773dddffd6, 0xf7ebe45d0b4ed62e},
+  {0x0fb29b505409913a, 0xbd66ecfa5053f05e, 0x5172fa12bbd062cf},
+  {0x7a8cd2f2af8db5c7, 0xf1c96d88f03f2f0c, 0xfaa8376f49a0abd5},
+  {0xacc980889b25b5e7, 0x2c34843e6a6d9f3d, 0xa6bf67c68037b6ca},
+  {0xaff8095311a13c10, 0x1d4a259b84ca7804, 0x3cbb9d0b61f7ff43},
+  {0x5662cd5d639dfe13, 0x89c27a983290bab8, 0x92a7d11e497af642},
+  {0x4157aad5c3c645ca, 0xf51297f3f77a30f2, 0x83c9dda7804ac4d8},
+  {0x4e84ffef7ca3be0a, 0x14a7ba9c76da7c08, 0x5c28dc6da027d5a0},
+  {0xb0964b96303be4e5, 0x4615a98b7f22a76d, 0xf222f844d2b37df9},
+  {0x802540711d4f5f7d, 0xf6649bae872a32e3, 0xaed6395da047f447},
+  {0x2f0953d8ce80f600, 0xdcf66d5eaf05752f, 0x209193bacdf14ef8},
+  {0xc6a3ef2332ce576d, 0xb9e01c6c4572a31f, 0xde9e30f16310efde},
+  {0xba02b8398971d6e6, 0xd1bab81c9c5221d6, 0x1c9c2d1f1b7f3f2b},
+  {0xedc228019fbdd60a, 0x2753c3a138bcb6d7, 0x786fd2ba67707c2f},
+  {0x448e2cb6c1407cbf, 0xf7b738377f0cfb97, 0x4c9212bdc0657e9c},
+  {0xc76e32691429c2f9, 0x490232f4e8c043ce, 0x217833736b683230},
+  {0xd1499dc75ffd2a9c, 0xd4b5f702de32b776, 0xd6dfbb898f67a374},
+  {0x3b5a28d4cff86b77, 0x806f6c0571138c8b, 0x54628239f0c0f09f},
+  {0xb8d45dd4a900ea0a, 0x2a9169078690c168, 0xb3657df1647fbd66},
+  {0x08189a6674f4c29c, 0x8915f4636dd5d112, 0x654dc7fe07da3107},
+  {0x5250e18c883794b0, 0x8828b68987cd0d9a, 0x300a18a7c772270d},
+  {0x51d33040e3efaa99, 0xd658da2cb0cb97b0, 0x39038890d157c0af},
+  {0x68f5a5cd07a32b53, 0x46b4f5ec1368cf94, 0xf2e0d23f40742f45},
+  {0x782b44a867a3f208, 0xae64fe82046cd425, 0xb78cf45fe171d435},
+  {0xde012b438c92c4d6, 0x4733810dca874273, 0x206a03d102c15302},
+  {0xbea371badf5b9173, 0x8cbfaa817fd4f717, 0x34bea5affcb319d8},
+  {0x1a26c2090378d01a, 0xf3d15fc5c66a7f39, 0x4de762da9a07d052},
+  {0x3486c8a67bccd6cc, 0x0d10351e2b0e18ac, 0x087106b5da2aba90},
+  {0xbd5c398105759654, 0x932e7ce0d2415118, 0xff7a9395dd694851},
+  {0x6f6615de424f584e, 0x6ca415cbf1ff0b9a, 0x509c3763be9bb7ea},
+  {0xe45a5c178e450e25, 0x48cc200c65039546, 0x2c2d872741a6e8d2},
+  {0x10a487ce7b7ba1f7, 0x8da8831a4adaa217, 0xcb608d431e73d316},
+  {0x480667a3a33a0923, 0x3a6fc63a03c45c96, 0xebed952f29ad80c0},
+  {0x8899df2b4edff733, 0x7b68b7ea18849999, 0xcedaa43cfb6f7f7b},
+  {0x356eff5782ed987f, 0xca6aab13ed43b0ce, 0x9dd8a4a5288bc18a},
+  {0x5ffc38d8fbfdcdb6, 0x697d4c0b82ce34af, 0x3509dc6ecc05993b},
+  {0x83905969be9090dd, 0x2125eb5bbd23d5da, 0x64224c3dfae48ffe},
+  {0xf54512d0b6691741, 0x0cbaec28b636b0bc, 0xbb1d6adcda1edefc},
+  {0x89ea6a9a58cddfdb, 0x845d179babdb73f7, 0xcf74a641c412cff5},
+  {0x65c9f3063d3b266e, 0x560354e0ca062952, 0xc6eb9b218ae96514},
+  {0x8e8c7412b3689e52, 0x99b2ec666a8a4e48, 0x5b4477de15147c03},
+};
+
+static const GF aim2_e2_power_matrix[AIM2_NUM_BITS_FIELD] =
+{
+  {0x0000000000000001, 0x0000000000000000, 0x0000000000000000},
+  {0x75575b2a01927c2b, 0xe38f9eab8f685827, 0x782b0bd5192bca87},
+  {0xaebbaa0e79dffe28, 0xb3542e6782b8ce84, 0x8a972b1b32323be2},
+  {0x62cbd1af5c77da14, 0x3bbc6119877bbc1d, 0x8b6d73bce65ed541},
+  {0x4cd2ae2762f272cc, 0xdd4adc5bfc34ae6b, 0xa3f908a96f0fe449},
+  {0x60cfbdb9b6447e24, 0x721af8263082c01c, 0x68cb54e6fc7104af},
+  {0xa92867af3dc3b730, 0xb2608cc06efe34d0, 0xa3445078ace873ca},
+  {0xeedad86c96afe677, 0x52afb525bd42562f, 0x38cf8ddb97dc96e5},
+  {0x0a3b06f10bbc9562, 0x577b7a04e02c557b, 0x8be00f5765b7e908},
+  {0xaa72b3916d207e20, 0x50f0cce86025ffac, 0x09f7f935bbde0a04},
+  {0xac08b4e71f96174a, 0x16babbd24d02b260, 0x48e9d357af5ba717},
+  {0xe122c9c16beaa8be, 0x07043902949cfad6, 0xf78fd47b58608577},
+  {0x5c473c24ac8ca469, 0xb1da898afda7d7aa, 0xcac72d2cf21a9be3},
+  {0x7da00b91479d06e3, 0xc4c76d79d51eb15a, 0xb6c2e5796630269d},
+  {0xb42bb35d07e100ec, 0x19964fd51c07b0af, 0xffb88b0ef80a102f},
+  {0x60e6beb41a673a07, 0x75bec86a6c06b470, 0x61bb7f05fc39be82},
+  {0xfda48d0189cd0c13, 0x649054858d5374c9, 0xb770a8503a32e8a4},
+  {0x4a376d825f3006c4, 0x8896eb44124e97aa, 0x70e626bebfff29b5},
+  {0xe37a2f298ccf89c0, 0x3c3609a866d94979, 0x356c25d15f10d784},
+  {0x458ca204a347e41a, 0x59568e0a4da4e181, 0xab475a7c61d9014c},
+  {0x8c1a39eb79672160, 0xb373d43893fbd9fe, 0x41ba8d6a7097e9c8},
+  {0x7c01434b5f8e1448, 0x662bd055a2512d4f, 0x652c31c38e992dbc},
+  {0x1eae5c36fe075219, 0x82cb682598bcb1ea, 0x60daaa526a3e9947},
+  {0xf7ab17ab10f03bdf, 0x1e124b56f71a4c37, 0x75df607068cfdcff},
+  {0x1ffe54ada576e3c2, 0x384cb4e86120aaa8, 0xc6a4fefc642071df},
+  {0x1ae57be1013b7efb, 0x28d36534e13a369d, 0x75612cd220210f77},
+  {0xa84cfbb045298f2b, 0x733fdf0216082f1f, 0x0054b363e1fcdb09},
+  {0x8745e66041e62570, 0x980a16636c09d9b2, 0x51695306d0539b47},
+  {0xa67319655b027ef6, 0x4fd02799c207267a, 0x01587af4a65b6fd2},
+  {0x8d991698735bcc88, 0xf14dbd2b19f99a78, 0x6947a3b95199d2f4},
+  {0xe2906490948e4aff, 0x16b2ee7035d98706, 0x78f47845853b1ddc},
+  {0x58d9cdc2dd693cd9, 0x7e9c240b1b252019, 0x5cbd3d458a53ca24},
+  {0x24101759ff01d89b, 0xed8fdd27cdb2d47b, 0x11b0fa26e8d8a743},
+  {0x9da3e8ee96db2f59, 0x68285801543b4ac8, 0x618cc8ad53d51b65},
+  {0xf0b448478f472d56, 0x6044053c293513d2, 0xea2fb63a575a34cb},
+  {0x56bd7f9b430ca7de, 0xf883dbb4c18d2e0b, 0x6c8030ef1a38c730},
+  {0xf2e2c1396125acce, 0x882e926d399fcc33, 0x87e914f3049f22ce},
+  {0x7ec0b0443f81915d, 0x4573c52a818a44f3, 0xafc01f5cc8120f6b},
+  {0x924aed58bfbc33f1, 0x7cbf5617448b59a8, 0xef023ef380d782bb},
+  {0xed78ebbcc2543624, 0x4fbdf96f5a481d8a, 0x7dea022c85973850},
+  {0x4cd0fda73b73aaf8, 0xab714c84882fc5a6, 0x31a12db8b87c1a82},
+  {0x4f55b122e52b04af, 0x2b6abc206fcdea22, 0xeecc6a28e10f3cd9},
+  {0x773b7f263618ea81, 0xfedd6644251162ff, 0x20f124b39fffa2ef},
+  {0xc86672d34c7f9c99, 0xa1a9bedd91ba54ab, 0xe3164453cdbc1680},
+  {0x976193445cc61080, 0x4e8af4d9771f7fde, 0x2d6951afbad5a152},
+  {0xac8104ed45afc3e0, 0x2daa407aee0854a8, 0x93bf8a5f6332934b},
+  {0xadacd0145616a90f, 0x18fcdf471f8e446e, 0xb6cb1d657c5aee1f},
+  {0x39f4888a9f625046, 0x714ced776be006aa, 0x301aab64f4c07bac},
+  {0xfed94c87075ec99b, 0x6527495efabe5878, 0xae4ed05b44c346fa},
+  {0xe8089970ab84a9ad, 0xfa8ef420f612f142, 0x3033e1b424799c03},
+  {0x3de830d471a1c303, 0x1d4648963e64b5e8, 0xb7fc69c1308d744f},
+  {0xf917cc81a21178a2, 0xf51c71d20d3dde0f, 0xc755e70d903eca43},
+  {0xf988b4435c7e0659, 0xe8ec12c9411e644d, 0x011cff135dc46fe5},
+  {0x45eb42b4bc82e615, 0xbb1ea1d87fa2dcc8, 0xbbf258cddfcc5a4e},
+  {0x76c177c889777fa3, 0x771de5ab30476eca, 0xe3dd4d0ea4da4f41},
+  {0x62d43190a74afaab, 0x8c72e6cc25a0906d, 0x6560641e35c269c1},
+  {0x4a473706039e3353, 0x9270c15446432105, 0x508bd6dfcce33617},
+  {0x58e979ef836cb200, 0x64a108a5f68530dc, 0xeeb5a210610292b9},
+  {0x3e8a485122657a2d, 0xb7f7272f3423621b, 0x4c0e2f899ffc6f0f},
+  {0xb03f26ebad2101f3, 0x2bf27f00ccb827ad, 0xf2c32d1c9db42e29},
+  {0xcc5f196397e2bb63, 0x9cf1f95bba0e5fb0, 0xcffa723b8add78c2},
+  {0x5198cabd81774aa6, 0x79e142bd7c3981f1, 0xcfb65a6d42815d8a},
+  {0x91dc7af311207622, 0xf294a4f3c38f447e, 0xdfd67624b63f7997},
+  {0xfb2f51ed0b5b44c1, 0x6eeb2b229427682c, 0xfad555a3f1680200},
+  {0xd043eb034f7557ae, 0x89f917e3d7f663f1, 0xd7f51e2f59ce0302},
+  {0xd1738764ddee76f4, 0x28a966bea5ec647e, 0xa322c656d7bc27d1},
+  {0x0cd66c8dd29514f0, 0xb4e37bf2f01130a9, 0x7db6ecdc81a7a57f},
+  {0xc8cb28a44796dc78, 0x88eb0048501b3765, 0x8ff3fbd6d703c26d},
+  {0x2c5d68650ca4b6f5, 0xa8e391ce83198344, 0x8b9f3219506be9d0},
+  {0x911906127a1ba855, 0x30d5215961ac95e7, 0x71827dfac7504342},
+  {0x1ae4c2e2506d0712, 0xb5caffb8afbcda6e, 0x159080539f7f876e},
+  {0x86571676f6228cdb, 0x3a51f0bfed40380f, 0x5dec5a0cee962a54},
+  {0xf5c3339c01460504, 0x5d55382d4e349ecc, 0xcf81cc12df0b2c9e},
+  {0x89a775997037437a, 0xc86002223b57f27f, 0xfe795feb841f08ef},
+  {0x7da8a9b3f9f43fe4, 0x8494d51c6e215f43, 0xb703f044bc338b9c},
+  {0xf73c2c9d450a092f, 0xce0ae97084884a01, 0x9a647f6d5f970839},
+  {0x87c63573f869cdbb, 0x812d2d8e966e6911, 0x973b425ba1c66dfa},
+  {0x7de5a1e78d630e85, 0x765d7d5a4a6e3cb7, 0x28170eef2a846d99},
+  {0x0b0c630c0f59460d, 0x9c8758a9ee8db258, 0xd3589f9c034f75d5},
+  {0xe1a6d8e757067309, 0xd18498099be244d9, 0x9b10a894502fc4e1},
+  {0xfa14fe8a1dd59c3e, 0x6a9a93b0f1ac862a, 0xdbe4d8d065053ef7},
+  {0x5c94965ff0a8e28e, 0xc2a32a0d57f1faa2, 0x24dc5effe1fa9e37},
+  {0x6b404bba72a24d04, 0xbcd23a38f7981241, 0x93d0c9eb1b9a39ef},
+  {0xa53a198b9e74e59c, 0x17cb3bc05f9608d1, 0x21bcc23eb5e75655},
+  {0x05911f7d3220397f, 0x7915054dcb628314, 0x183a2a8400570cef},
+  {0x2a420bf34788186c, 0x8c83a2945ee3027b, 0x606a65c37a8f2fe3},
+  {0xccf4e83131d54a27, 0xc95466a498499126, 0xef9ac8206968b1f7},
+  {0xe457b2ff12256f1e, 0x57fd60a454e5f68d, 0xf3388bb1de5dd1c2},
+  {0x4addb3e322595749, 0x39e02bd59d8ae504, 0x20284c1ae2f1a65c},
+  {0x9fbb5574795cac4d, 0x9fedac975974c8bb, 0xd307ecf05fd4fd22},
+  {0x2505bb81200f8cbb, 0x2ac9d93c45830708, 0x11ec704af2c49861},
+  {0xfa1702dd351d3b22, 0xbe0dfc13d607f962, 0x82c611b8ccd1e9f2},
+  {0xb7ff038d58626bd7, 0x86e990a7d6acad3b, 0x5010d30fbe2d70a9},
+  {0xc42bda459ef1afca, 0x83c5891e3eff20a0, 0xdefbb485c364fd5a},
+  {0xaada4d9f943df0f1, 0x2618e51a8838b5fe, 0x8f45f0ffff45201f},
+  {0xb55e3891213f972c, 0xdb4f56b16dc4e905, 0x30fd462a4cf268fc},
+  {0x64e007b7010e8c80, 0x2d0de3d26a1748c3, 0xa2e01ed12648c113},
+  {0x5128d2b5c4bac674, 0xb80b46283a340508, 0x1c1f01fe24b17a66},
+  {0x4cb8ab976733595f, 0x403aca262ff117b0, 0xce1698b4f9a54376},
+  {0x7781e71d8805fdc4, 0x40c3c2110800e7a0, 0xe72e9e63999cc311},
+  {0xbb3e3e6501e45c00, 0x9e70bd7de6780a3b, 0x549416aa087fe4c5},
+  {0xae1da809d7eed055, 0x06ba5804e029b01c, 0x490555c99e76bd05},
+  {0x67f3afbbfeee6547, 0x1243b190c38432b1, 0xbab2fa8df7bf2943},
+  {0x6d7197464f15c83c, 0x9283ced1147a6a85, 0x96ba1a0e47d9dd96},
+  {0x9cbb90e485218006, 0x8b5ff83a0210b4d9, 0x1086afcf143b95c2},
+  {0xa07d026b378f963b, 0x2debd80b456cd3e3, 0xc7792b9bc7f54c4a},
+  {0x3d0bec8b88ba06b8, 0x0c13cdfdc4d01e9f, 0x6d256d1087b9c95e},
+  {0x9216a33ea47259ff, 0x2bde0cfcb54abe8d, 0xaaef421825f1b47b},
+  {0xa1aabb09b181ae0f, 0xc14d44d54e3620cd, 0xabb20e2a4d637bcb},
+  {0x2544eba1038d1b04, 0xda1f84aa9bc120c2, 0x41fd7f657a18c45d},
+  {0xadaff973f301d8c3, 0x87dae306486ff1a6, 0x60ec280a2570b8ff},
+  {0x624994b2704d4c20, 0x532232f1cf209482, 0x861b9c2a5a7d0a43},
+  {0x4513aa7db58aea4d, 0x89dfbe8c94798dde, 0xe735f37739441c13},
+  {0x2f534ce65fbe5d87, 0xf8fcb2432339f543, 0x8ea957572a77e395},
+  {0x2456c8d764e7c1a6, 0x7dc7567c507e2e18, 0xd29b13c5db1cd65a},
+  {0x885705a845bb1199, 0xebc702d7e1680421, 0x9aeba22f533cbac9},
+  {0x55c435f803ad3742, 0x695442fe576b3a09, 0x5ca02fab230ee023},
+  {0x0d446bb06a3cbf8b, 0x5bfc8414d84fff9a, 0x157e3384708408a8},
+  {0x7b212d17c02a4054, 0x2b14562733ba6900, 0x7965f7d93122eac0},
+  {0x349446294451df24, 0x2b91f57cdcc289f3, 0x829cb5a03cce767d},
+  {0x2f8e7fa84f0ad401, 0xb3a50f68cba8a638, 0xde440882f84bfd7a},
+  {0xd1ba1db41829f412, 0x9a2c4c23fb8538f7, 0x86ca32d92d99ecb9},
+  {0x8a6db99a627b227c, 0x633c81cf8e52a687, 0x8e58542594d7103e},
+  {0x4c5a928b8610d6cd, 0x6a38a81e5ec41b61, 0x05ac22b201c86322},
+  {0x283c4b53c14f39c0, 0x106fe171df2218c5, 0x4c077d33f17e0107},
+  {0x198b4c90bd33552f, 0x5853a4c2f74596db, 0x1018dd6bf21150d4},
+  {0x47c29e1c2f495b4c, 0x7ec84995131d545b, 0x49e53beaeb94dae0},
+  {0x2678b3f7b548fc9f, 0x63a6b9322f3a574c, 0xef6d85f1091f1aeb},
+  {0xf1391f569cd5fe90, 0x876e8ba956de0238, 0x6cd576e3b8ab6222},
+  {0x827547465967b775, 0x4197e1290368e412, 0xee63a7ef2156fb67},
+  {0x6cb2a919735b34d5, 0x6cc967b756d72395, 0x9a884a65ae74e811},
+  {0xbdebcb5fbfafafc0, 0xb7fc62a4c7947030, 0x554c36728822d8b6},
+  {0x025fef80c960792a, 0xc0f487dcc0ad8059, 0x9714504680995ad0},
+  {0x19ffb11f02502666, 0x482fc0fae8608ad2, 0x781175f6049c62ee},
+  {0xf1fece4f515854e7, 0x6dab52f7b6560106, 0xfa0028f50d672954},
+  {0x844afcd287c1ddba, 0x47234b529fe3ca41, 0x3ca221c08f88140a},
+  {0xfdbbeaaa02badeda, 0xf35a5e21992e2332, 0xa37f6d68d919b65f},
+  {0x6d218f603725748a, 0xb6df3c61103e9c3e, 0xbb7ac1cf4c1f4692},
+  {0x8e6d3eb058cfc260, 0xfbe2f6497287731a, 0xffa78646830d5ce0},
+  {0x8c07c328df449acd, 0x500ba217a7af529f, 0x19ab11b99a1a2a19},
+  {0x42de87a6001d7bc4, 0x6d65941a9ae5138b, 0xcb830271914ce1ee},
+  {0x25f950eb4e2b9669, 0x0c9f7a2279a16278, 0x86503e9de2e76202},
+  {0xedc0f3a86b732556, 0xc7995c7b3ec0ea66, 0x8a4d95b8d19c29ce},
+  {0x01b5ab0eca4d3189, 0xed7898b982b519ad, 0x24c5f841a769f11b},
+  {0xde3eefe1bad32178, 0x493a735c30942df4, 0x8b5ec5bed8e4d565},
+  {0xa974a9d616b752fa, 0x09d37b2ab193ca1c, 0x55b8aaf3af4481ba},
+  {0x84ca6915121b1e09, 0x8831e83e34fac643, 0x05e3db5a89049a2f},
+  {0x5375a9f4aefd0f44, 0xaf272fd031366078, 0xbbd286c07ed80632},
+  {0x9d101a493aa2ebc9, 0x67e3ddfaa73b2b94, 0x45bf06b13a5d6856},
+  {0x6469dfeed8b766bc, 0x41a958a8c84553fc, 0xc3665b3f060a6808},
+  {0x8bbd23b38d0cff32, 0x891f48bb2592fb3f, 0x24c6243ad065453e},
+  {0xf3d1cc12dcb4e302, 0x588dfaa464f518be, 0xfe082e8b4a39cf26},
+  {0x95c521746547be8e, 0x9cbbea72400d1df8, 0x0cfdac076655d579},
+  {0xa6c4c57375f48495, 0xd63f47b41907a3f7, 0x34e17c2df60668d7},
+  {0xa135ca38c26b95c3, 0x2aac9c6b01173258, 0x2d8499bf2ed7c23c},
+  {0xba02892976144352, 0x9e4d9906dc2ae94e, 0x6535b5091d0535a4},
+  {0x6ec4dba2c6f7e949, 0x02d65b71f7db3f86, 0x61c796b0290e7ff0},
+  {0xac044d22d442ff2e, 0x29d00d9db764b6ff, 0x9ec4ff5f21f3216d},
+  {0x26b3c84573c53161, 0xa3037316e91bf8bb, 0x251ed327edf11e39},
+  {0x2917804d2422970c, 0x16119362ba8934be, 0xafa94e1359c77cce},
+  {0x4eac35ec04e84a0b, 0x31b309e5e5d361a5, 0x4171e00956fd334e},
+  {0xa02b9fdd9f6b8162, 0xabd8bc110f4e1f52, 0x75578ed77238fedc},
+  {0xe73f9ad96bd8686d, 0xbdfc49ed2dba8097, 0x054c4bb989c34404},
+  {0xa0d01888aa5b1042, 0x8c33305a0dc075b1, 0x75f81fe0369e7b86},
+  {0x679d711aa88faab7, 0xb03f74deaa29c24c, 0x10a7766990689f5a},
+  {0x827d13e4d6310b6b, 0xc5a73641d06e47d1, 0xf2f0d06e14e2ab1f},
+  {0xcc968649ec63f05e, 0x17cda3a7fc25bfb2, 0x0df1338db25ee18e},
+  {0x7d4acd6c3cf8c18b, 0x4bd734fd562d48ad, 0xae50c4f72f542533},
+  {0xcf438bf70dbe4c62, 0x0019bcea28ce9270, 0xf687acda7ff8c960},
+  {0x5b24783c5318fd09, 0x5623189d31422de8, 0x862fd585eeb3e3f0},
+  {0xf98482f8df7d5e16, 0xccb9fb2d3745fbbf, 0x7d5e1bd364daa7d4},
+  {0x024849574a40a831, 0x48cae56880d67329, 0xfafa85469a93e6b3},
+  {0x944eae6b760bc534, 0x1d1d18f30fec24c3, 0xc64a74b4d0c3181e},
+  {0x19c52990a4e62d2d, 0x37b473c7ed759ef9, 0x04080c0ade3df738},
+  {0xfcc4062c7876c075, 0x48b4cf0b72aae741, 0x3889eef0b66c1bff},
+  {0x49c26471ae06da0b, 0x109da4749a70108b, 0x443b50c74915bd54},
+  {0xbe68bd432e672eb8, 0xbe737af593618ab7, 0x5d537d8c0da1a4e7},
+  {0xa3ca7393ce4e8d7c, 0x0fcf46d53a057c21, 0x7451a590ca6c1db1},
+  {0x79419444b1c149e5, 0x9d577a1e13240b2d, 0x24da1fd0d5db6e4d},
+  {0xe8c3caf37ad5170c, 0x423b4593d3f4c834, 0xff039eaad5042ae3},
+  {0x3bf5913b5615f7f5, 0x2d24b840238f2c84, 0x97bdc5bfeb1d53b7},
+  {0x53538b2293df4606, 0x169029e2d8675ec6, 0x9ab1ac25ee4982a4},
+  {0x75bd284d07f591f8, 0xccdd36b98d68786e, 0x9321ba79d2e56eed},
+  {0xe63236d17de7e69c, 0x9600d5f5cca5b08a, 0x8ff14c81e5d61843},
+  {0xdb079962536683c6, 0x35bb6068eb26bd37, 0xa614c37971ca2e4d},
+  {0xab78167ac83c4064, 0xb6a1928d6f89cdd1, 0xc97cc61d01ffe82f},
+  {0x83e6edd7a512e8b7, 0xe281601e537bc4ec, 0x19d35d2d57518cde},
+  {0xf737f3ddfa7fc9b2, 0x4a8f04a9cb4847be, 0x2946f3355994de91},
+  {0x577ca3baf1f7e1ba, 0x446729b10c51ed7c, 0xab637d9c6e3a5554},
+  {0x4e31798071664def, 0xec15c968e363630d, 0xd7ce5f867f758e48},
+  {0x10525e76bc5a5ed9, 0x1c8a384248ab4398, 0x8f7a522f2e2f3fc5},
+  {0xdee25133572d24bf, 0x37203f7f6c2e0e36, 0x89ba27d9b1233156},
+};
+
+#define GF_exp_invmer_e_1 AIMER_NAMESPACE(GF_exp_invmer_e_1) // exported name is rewritten through AIMER_NAMESPACE
+void GF_exp_invmer_e_1(GF out, const GF in); // prototype only; NOTE(review): name suggests exponentiation by the inverse of Mersenne exponent e1 - confirm in aim2.c
+#define GF_exp_invmer_e_2 AIMER_NAMESPACE(GF_exp_invmer_e_2)
+void GF_exp_invmer_e_2(GF out, const GF in); // prototype only; NOTE(review): presumably the same operation for exponent e2 - confirm in aim2.c
+#define GF_exp_mer_e_star AIMER_NAMESPACE(GF_exp_mer_e_star)
+void GF_exp_mer_e_star(GF out, const GF in); // prototype only; NOTE(review): presumably exponentiation by the Mersenne exponent e* - confirm in aim2.c
+
+#define generate_matrices_L_and_U AIMER_NAMESPACE(generate_matrices_L_and_U)
+void generate_matrices_L_and_U( // prototype only; NOTE(review): presumably derives both matrices and vector_b from iv - confirm in aim2.c
+        GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD], // AIM2_NUM_BITS_FIELD field elements per input S-box
+        GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD], // same dimensions as matrix_L
+        GF vector_b, // not const-qualified, so presumably an output
+        const uint8_t iv[AIM2_IV_SIZE]); // read-only initial vector of AIM2_IV_SIZE bytes
+
+#define generate_matrix_LU AIMER_NAMESPACE(generate_matrix_LU)
+void generate_matrix_LU(GF matrix_A[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD], // prototype only; NOTE(review): matrix_A is presumably the combined L*U matrix per S-box - confirm in aim2.c
+                        GF vector_b, // not const-qualified, so presumably an output
+                        const uint8_t iv[AIM2_IV_SIZE]); // read-only initial vector of AIM2_IV_SIZE bytes
+
+#define aim2_sbox_outputs AIMER_NAMESPACE(aim2_sbox_outputs)
+void aim2_sbox_outputs(GF sbox_outputs[AIM2_NUM_INPUT_SBOX], const GF pt); // prototype only; NOTE(review): presumably fills one output per input S-box from pt - confirm in aim2.c
+
+#define aim2 AIMER_NAMESPACE(aim2)
+void aim2(uint8_t ct[AIM2_NUM_BYTES_FIELD], // prototype only; NOTE(review): ct is presumably the AIM2 output for (pt, iv) - confirm in aim2.c
+          const uint8_t pt[AIM2_NUM_BYTES_FIELD], // read-only input block of AIM2_NUM_BYTES_FIELD bytes
+          const uint8_t iv[AIM2_IV_SIZE]); // read-only initial vector of AIM2_IV_SIZE bytes
+
+#endif // AIM2_H
diff --git a/crypto_sign/aimer192f/m4stack/api.h b/crypto_sign/aimer192f/m4stack/api.h
new file mode 100644
index 00000000..dba6ebd7
--- /dev/null
+++ b/crypto_sign/aimer192f/m4stack/api.h
@@ -0,0 +1,44 @@
+/*
+NIST-developed software is provided by NIST as a public service. You may use, copy, and distribute copies of the software in any medium, provided that you keep intact this entire notice. You may improve, modify, and create derivative works of the software or any portion of the software, and you may copy and distribute such modifications or works. Modified works should carry a notice stating that you changed the software and should note the date and nature of any such change. Please explicitly acknowledge the National Institute of Standards and Technology as the source of the software.
+ 
+NIST-developed software is expressly provided "AS IS." NIST MAKES NO WARRANTY OF ANY KIND, EXPRESS, IMPLIED, IN FACT, OR ARISING BY OPERATION OF LAW, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND DATA ACCURACY. NIST NEITHER REPRESENTS NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST DOES NOT WARRANT OR MAKE ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE OR THE RESULTS THEREOF, INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY, RELIABILITY, OR USEFULNESS OF THE SOFTWARE.
+ 
+You are solely responsible for determining the appropriateness of using and distributing the software and you assume all risks associated with its use, including but not limited to the risks and costs of program errors, compliance with applicable laws, damage to or loss of data, programs or equipment, and the unavailability or interruption of operation. This software is not intended to be used in any situation where a failure could cause risk of injury or damage to property. The software developed by NIST employees is not subject to copyright protection within the United States.
+*/
+
+#ifndef API_H
+#define API_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define CRYPTO_PUBLICKEYBYTES 48 // public key length in bytes
+#define CRYPTO_SECRETKEYBYTES 72 // secret key length in bytes
+#define CRYPTO_BYTES 13056 // NOTE(review): presumably the maximum signature length in bytes - confirm against sign.c
+#define CRYPTO_ALGNAME "aimer192f" // algorithm name string of this parameter set
+
+#define crypto_sign_keypair AIMER_NAMESPACE(crypto_sign_keypair) // exported name is rewritten through AIMER_NAMESPACE
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk); // prototype only; NOTE(review): presumably fills pk/sk and returns 0 on success - confirm in sign.c
+
+#define crypto_sign_signature AIMER_NAMESPACE(crypto_sign_signature)
+int crypto_sign_signature(uint8_t *sig, size_t *siglen, // prototype only; NOTE(review): presumably writes a detached signature and its length - confirm in sign.c
+        const uint8_t *m, size_t mlen, // message bytes and their count (read-only)
+        const uint8_t *sk); // secret key (read-only)
+
+#define crypto_sign AIMER_NAMESPACE(crypto_sign)
+int crypto_sign(uint8_t *sm, size_t *smlen, // prototype only; NOTE(review): presumably writes the signed message (signature combined with m) - confirm layout in sign.c
+        const uint8_t *m, size_t mlen, // message bytes and their count (read-only)
+        const uint8_t *sk); // secret key (read-only)
+
+#define crypto_sign_verify AIMER_NAMESPACE(crypto_sign_verify)
+int crypto_sign_verify(const uint8_t *sig, size_t siglen, // prototype only; NOTE(review): presumably returns 0 when sig is valid for m under pk - confirm in sign.c
+        const uint8_t *m, size_t mlen, // message bytes and their count (read-only)
+        const uint8_t *pk); // public key (read-only)
+
+#define crypto_sign_open AIMER_NAMESPACE(crypto_sign_open)
+int crypto_sign_open(uint8_t *m, size_t *mlen, // prototype only; NOTE(review): presumably recovers m from a signed message after verification - confirm in sign.c
+        const uint8_t *sm, size_t smlen, // signed message bytes and their count (read-only)
+        const uint8_t *pk); // public key (read-only)
+
+#endif
diff --git a/crypto_sign/aimer192f/m4stack/field.c b/crypto_sign/aimer192f/m4stack/field.c
new file mode 100644
index 00000000..91ee3d55
--- /dev/null
+++ b/crypto_sign/aimer192f/m4stack/field.c
@@ -0,0 +1,482 @@
+// SPDX-License-Identifier: MIT
+
+#include "field.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define mask_64     0x00000000ffffffff // low 32 bits of a 64-bit word
+#define mask_final  0x000000000000ffff // low 16 bits; last step of inv_zero_padding
+
+#define mask0_64    0x000000ff000000ff // low byte of each 32-bit half
+#define mask0       0x000000ff // low byte of a 32-bit word
+
+#define mask1_64    0x000f000f000f000f // low nibble of each 16-bit lane
+#define mask1       0x000f000f // 32-bit counterpart of mask1_64
+
+#define mask2_64    0x0303030303030303 // low two bits of each byte
+#define mask2       0x03030303 // 32-bit counterpart of mask2_64
+
+#define mask3_64    0x1111111111111111 // low bit of each nibble
+#define mask3       0x11111111 // 32-bit counterpart of mask3_64
+
+#define zero_padding(x0, mask1, mask2, mask3) /* spreads an 8-bit value in x0 so that bit i moves to bit 4*i; parameter names shadow the mask macros, callers pass the 32-bit constants */ \
+        x0 = (x0 | (x0 << 12)) & mask1; \
+        x0 = (x0 | (x0 << 6 )) & mask2; \
+        x0 = (x0 | (x0 << 3 )) & mask3;
+
+#define inv_zero_padding(x0, mask0, mask1, mask2, mask_final) /* inverse of zero_padding on a 64-bit value: gathers bits 4*i (i = 0..15) into the low 16 bits; callers pass the *_64 constants */ \
+        x0 = (x0 | (x0 >> 3 )) & mask2; \
+        x0 = (x0 | (x0 >> 6 )) & mask1; \
+        x0 = (x0 | (x0 >> 12)) & mask0; \
+        x0 = (x0 | (x0 >> 24)) & mask_final;
+
+void GF_mul(GF c, const GF a, const GF b) // c = a * b on 3-word elements: six 64x64 products combined into a 6-word result, then folded back to 3 words
+{
+  uint64_t t[3] = {0,}; // scratch for the two output words of each poly64_mul call
+  uint64_t temp[6] = {0,}; // unreduced product; temp[3..5] are the words folded back below
+  uint64_t sub[6] = {0,}; // XOR of operand word pairs, inputs of the three cross products
+
+  sub[0] = a[0] ^ a[1];
+  sub[1] = b[0] ^ b[1];
+  sub[2] = a[0] ^ a[2];
+  sub[3] = b[0] ^ b[2];
+  sub[4] = a[1] ^ a[2];
+  sub[5] = b[1] ^ b[2];
+
+  poly64_mul(&a[0], &b[0], &t[0], &temp[0]); // NOTE(review): poly64_mul is defined outside this file; presumably a 64x64 carry-less multiply with args (a, b, high, low) - confirm
+  poly64_mul(&a[1], &b[1], &t[2], &t[1]);
+  t[0] ^= t[1];
+
+  poly64_mul(&a[2], &b[2], &temp[5], &t[1]);
+  t[1] ^= t[2];
+
+  temp[1] = t[0] ^ temp[0]; // pre-load the diagonal-product terms that every middle word needs
+  temp[2] = t[1] ^ temp[1];
+  temp[4] = temp[5] ^ t[1];
+  temp[3] = temp[4] ^ t[0];
+
+  poly64_mul(&sub[0], &sub[1], &t[1], &t[0]); // (a0^a1)*(b0^b1) goes into words 1 and 2
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+
+  poly64_mul(&sub[2], &sub[3], &t[1], &t[0]); // (a0^a2)*(b0^b2) goes into words 2 and 3
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+
+  poly64_mul(&sub[4], &sub[5], &t[1], &t[0]); // (a1^a2)*(b1^b2) goes into words 3 and 4
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+
+  t[0] = temp[3] ^ ((temp[5] >> 57) ^ (temp[5] >> 62) ^ (temp[5] >> 63)); // word 3 plus the bits shifted out of temp[5] by the <<7, <<2, <<1 folds below
+
+  c[2] = temp[2] ^ temp[5]; // fold with shifts 0, 1, 2 and 7, consistent with reduction modulo x^192 + x^7 + x^2 + x + 1
+  c[2] ^= (temp[5] << 7) | (temp[4] >> 57);
+  c[2] ^= (temp[5] << 2) | (temp[4] >> 62);
+  c[2] ^= (temp[5] << 1) | (temp[4] >> 63);
+
+  c[1] = temp[1] ^ temp[4];
+  c[1] ^= (temp[4] << 7) | (t[0] >> 57);
+  c[1] ^= (temp[4] << 2) | (t[0] >> 62);
+  c[1] ^= (temp[4] << 1) | (t[0] >> 63);
+
+  c[0] = temp[0] ^ t[0]; // c is first written only after a and b have been fully read
+  c[0] ^= (t[0] << 7);
+  c[0] ^= (t[0] << 2);
+  c[0] ^= (t[0] << 1);
+}
+
+void GF_mul_add(GF c, const GF a, const GF b) // c ^= a * b: same product and folding as GF_mul, but XOR-accumulated into the existing c
+{
+  uint64_t t[3] = {0,}; // scratch for the two output words of each poly64_mul call
+  uint64_t temp[6] = {0,}; // unreduced product; temp[3..5] are the words folded back below
+  uint64_t sub[6] = {0,}; // XOR of operand word pairs, inputs of the three cross products
+
+  sub[0] = a[0] ^ a[1];
+  sub[1] = b[0] ^ b[1];
+  sub[2] = a[0] ^ a[2];
+  sub[3] = b[0] ^ b[2];
+  sub[4] = a[1] ^ a[2];
+  sub[5] = b[1] ^ b[2];
+
+  poly64_mul(&a[0], &b[0], &t[0], &temp[0]); // NOTE(review): poly64_mul is defined outside this file; presumably a 64x64 carry-less multiply with args (a, b, high, low) - confirm
+  poly64_mul(&a[1], &b[1], &t[2], &t[1]);
+  t[0] ^= t[1];
+
+  poly64_mul(&a[2], &b[2], &temp[5], &t[1]);
+  t[1] ^= t[2];
+
+  temp[1] = t[0] ^ temp[0]; // pre-load the diagonal-product terms that every middle word needs
+  temp[2] = t[1] ^ temp[1];
+  temp[4] = temp[5] ^ t[1];
+  temp[3] = temp[4] ^ t[0];
+
+  poly64_mul(&sub[0], &sub[1], &t[1], &t[0]); // (a0^a1)*(b0^b1) goes into words 1 and 2
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+
+  poly64_mul(&sub[2], &sub[3], &t[1], &t[0]); // (a0^a2)*(b0^b2) goes into words 2 and 3
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+
+  poly64_mul(&sub[4], &sub[5], &t[1], &t[0]); // (a1^a2)*(b1^b2) goes into words 3 and 4
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+
+  t[0] = temp[3] ^ ((temp[5] >> 57) ^ (temp[5] >> 62) ^ (temp[5] >> 63)); // word 3 plus the bits shifted out of temp[5] by the <<7, <<2, <<1 folds below
+
+  c[2] ^= temp[2] ^ temp[5]; // "^=" here (GF_mul uses "="): the reduced product is added to the caller's c
+  c[2] ^= (temp[5] << 7) | (temp[4] >> 57);
+  c[2] ^= (temp[5] << 2) | (temp[4] >> 62);
+  c[2] ^= (temp[5] << 1) | (temp[4] >> 63);
+
+  c[1] ^= temp[1] ^ temp[4];
+  c[1] ^= (temp[4] << 7) | (t[0] >> 57);
+  c[1] ^= (temp[4] << 2) | (t[0] >> 62);
+  c[1] ^= (temp[4] << 1) | (t[0] >> 63);
+
+  c[0] ^= temp[0] ^ t[0];
+  c[0] ^= (t[0] << 7);
+  c[0] ^= (t[0] << 2);
+  c[0] ^= (t[0] << 1);
+}
+
+void GF_transposed_matmul(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]) // c = XOR of the rows b[k] for which bit k of a is set (k = 64 * word + bit, least significant bit first)
+{
+  const uint64_t *a_ptr = a; // walks the words of a
+  const GF *b_ptr = b; // walks the rows of b, four per inner iteration
+
+  uint64_t temp_c0 = 0; // accumulators for the three words of the result
+  uint64_t temp_c1 = 0;
+  uint64_t temp_c2 = 0;
+  uint64_t mask;
+  for (size_t i = AIM2_NUM_WORDS_FIELD; i; --i, ++a_ptr)
+  {
+    uint64_t index = *a_ptr;
+    for (size_t j = AIM2_NUM_BITS_WORD; j; j -= 4, index >>= 4, b_ptr += 4) // consumes four bits of a and four rows of b per pass
+    {
+      mask = -(index & 1); // all ones when the bit is set, zero otherwise: the row is selected without a branch
+      temp_c0 ^= (b_ptr[0][0] & mask);
+      temp_c1 ^= (b_ptr[0][1] & mask);
+      temp_c2 ^= (b_ptr[0][2] & mask);
+
+      mask = -((index >> 1) & 1);
+      temp_c0 ^= (b_ptr[1][0] & mask);
+      temp_c1 ^= (b_ptr[1][1] & mask);
+      temp_c2 ^= (b_ptr[1][2] & mask);
+
+      mask = -((index >> 2) & 1);
+      temp_c0 ^= (b_ptr[2][0] & mask);
+      temp_c1 ^= (b_ptr[2][1] & mask);
+      temp_c2 ^= (b_ptr[2][2] & mask);
+
+      mask = -((index >> 3) & 1);
+      temp_c0 ^= (b_ptr[3][0] & mask);
+      temp_c1 ^= (b_ptr[3][1] & mask);
+      temp_c2 ^= (b_ptr[3][2] & mask);
+    }
+  }
+  c[0] = temp_c0; // c is overwritten only after a has been fully read
+  c[1] = temp_c1;
+  c[2] = temp_c2;
+}
+
+void GF_transposed_matmul_add(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]) // c ^= XOR of the rows b[k] for which bit k of a is set (k = 64 * word + bit, least significant bit first)
+{
+  const uint64_t *a_ptr = a; // walks the words of a
+  const GF *b_ptr = b; // walks the rows of b, four per inner iteration
+
+  uint64_t temp_c0 = 0; // accumulators for the three words of the selected-row sum
+  uint64_t temp_c1 = 0;
+  uint64_t temp_c2 = 0;
+  uint64_t mask;
+  for (size_t i = AIM2_NUM_WORDS_FIELD; i; --i, ++a_ptr)
+  {
+    uint64_t index = *a_ptr;
+    for (size_t j = AIM2_NUM_BITS_WORD; j; j -= 4, index >>= 4, b_ptr += 4) // consumes four bits of a and four rows of b per pass
+    {
+      mask = -(index & 1); // all ones when the bit is set, zero otherwise: the row is selected without a branch
+      temp_c0 ^= (b_ptr[0][0] & mask);
+      temp_c1 ^= (b_ptr[0][1] & mask);
+      temp_c2 ^= (b_ptr[0][2] & mask);
+
+      mask = -((index >> 1) & 1);
+      temp_c0 ^= (b_ptr[1][0] & mask);
+      temp_c1 ^= (b_ptr[1][1] & mask);
+      temp_c2 ^= (b_ptr[1][2] & mask);
+
+      mask = -((index >> 2) & 1);
+      temp_c0 ^= (b_ptr[2][0] & mask);
+      temp_c1 ^= (b_ptr[2][1] & mask);
+      temp_c2 ^= (b_ptr[2][2] & mask);
+
+      mask = -((index >> 3) & 1);
+      temp_c0 ^= (b_ptr[3][0] & mask);
+      temp_c1 ^= (b_ptr[3][1] & mask);
+      temp_c2 ^= (b_ptr[3][2] & mask);
+    }
+  }
+  c[0] ^= temp_c0; // "^=" here (GF_transposed_matmul uses "="): the sum is added to the caller's c
+  c[1] ^= temp_c1;
+  c[2] ^= temp_c2;
+}
+
+static void poly64_mul_s(uint64_t *z1, uint64_t *z0, uint64_t x, uint64_t y) // 64x64 carry-less multiply of x and y built from integer multiplies: *z0 = low 64 bits, *z1 = high 64 bits
+{
+  // x: x4 takes the upper 32 bits, x0..x3 become the bytes of the lower 32 bits
+  uint32_t x4 = x >> 32;
+
+  uint32_t x0 = x & mask_64;
+  uint32_t x1 = (x0 >> 8) & mask0;
+  uint32_t x2 = (x0 >> 16) & mask0;
+  uint32_t x3 = (x0 >> 24) & mask0;
+  x0 &= mask0;
+
+  // x: x4..x7 become the bytes of the upper 32 bits
+  uint32_t x5 = (x4 >> 8) & mask0;
+  uint32_t x6 = (x4 >> 16) & mask0;
+  uint32_t x7 = (x4 >> 24) & mask0;
+  x4 &= mask0;
+
+  // y: y4 takes the upper 32 bits, y0..y3 become the bytes of the lower 32 bits
+  uint32_t y4 = y >> 32;
+
+  uint32_t y0 = y & mask_64;
+  uint32_t y1 = (y0 >> 8) & mask0;
+  uint32_t y2 = (y0 >> 16) & mask0;
+  uint32_t y3 = (y0 >> 24) & mask0;
+  y0 &= mask0;
+
+  // y: y4..y7 become the bytes of the upper 32 bits
+  uint32_t y5 = (y4 >> 8) & mask0;
+  uint32_t y6 = (y4 >> 16) & mask0;
+  uint32_t y7 = (y4 >> 24) & mask0;
+  y4 &= mask0;
+
+  // x padding: every byte is spread to one bit per nibble
+  zero_padding(x0, mask1, mask2, mask3);
+  zero_padding(x1, mask1, mask2, mask3);
+  zero_padding(x2, mask1, mask2, mask3);
+  zero_padding(x3, mask1, mask2, mask3);
+  zero_padding(x4, mask1, mask2, mask3);
+  zero_padding(x5, mask1, mask2, mask3);
+  zero_padding(x6, mask1, mask2, mask3);
+  zero_padding(x7, mask1, mask2, mask3);
+
+  // y padding: every byte is spread to one bit per nibble
+  zero_padding(y0, mask1, mask2, mask3);
+  zero_padding(y1, mask1, mask2, mask3);
+  zero_padding(y2, mask1, mask2, mask3);
+  zero_padding(y3, mask1, mask2, mask3);
+  zero_padding(y4, mask1, mask2, mask3);
+  zero_padding(y5, mask1, mask2, mask3);
+  zero_padding(y6, mask1, mask2, mask3);
+  zero_padding(y7, mask1, mask2, mask3);
+
+  //x0-3 * y0-3: product of the two lower halves, kept spread in a0..a3 (16 result bits each)
+  uint64_t a0 = ((uint64_t)x0 * (uint64_t)y0) & mask3_64; // each nibble sums at most 8 one-bit terms, so "& mask3_64" keeps their parity (the XOR)
+  uint64_t a1 = ((((uint64_t)x0 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y0) & mask3_64));
+  a0 ^= (a1 << 32); // byte-offset-1 terms: lower part lands in a0, the rest stays in a1
+  a1 = a1 >> 32;
+  a1 ^= ((((uint64_t)x0 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x1 * (uint64_t)y1) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y0) & mask3_64));
+
+  uint64_t a2 = ((((uint64_t)x0 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y2) & mask3_64) ^
+                 (((uint64_t)x2 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y0) & mask3_64));
+  a1 ^= (a2 << 32);
+  a2 = a2 >> 32;
+  a2 ^= ((((uint64_t)x1 * (uint64_t)y3) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x3 * (uint64_t)y1) & mask3_64));
+
+  uint64_t a3 = ((((uint64_t)x2 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y2) & mask3_64));
+  a2 ^= (a3 << 32);
+  a3 = a3 >> 32;
+
+  a3 ^= ((uint64_t)x3 * (uint64_t)y3) & mask3_64;
+
+  //x4-7 * y4-7: product of the two upper halves, same schedule, kept spread in b0..b3
+  uint64_t b0 = ((uint64_t)x4 * (uint64_t)y4) & mask3_64;
+  uint64_t b1 = ((((uint64_t)x4 * (uint64_t)y5) & mask3_64) ^
+                 (((uint64_t)x5 * (uint64_t)y4) & mask3_64));
+  b0 ^= (b1 << 32);
+  b1 = b1 >> 32;
+  b1 ^= ((((uint64_t)x4 * (uint64_t)y6) & mask3_64) ^
+         (((uint64_t)x5 * (uint64_t)y5) & mask3_64) ^
+         (((uint64_t)x6 * (uint64_t)y4) & mask3_64));
+
+  uint64_t b2 = ((((uint64_t)x4 * (uint64_t)y7) & mask3_64) ^
+                 (((uint64_t)x5 * (uint64_t)y6) & mask3_64) ^
+                 (((uint64_t)x6 * (uint64_t)y5) & mask3_64) ^
+                 (((uint64_t)x7 * (uint64_t)y4) & mask3_64));
+  b1 ^= (b2 << 32);
+  b2 = b2 >> 32;
+  b2 ^= ((((uint64_t)x5 * (uint64_t)y7) & mask3_64) ^
+         (((uint64_t)x6 * (uint64_t)y6) & mask3_64) ^
+         (((uint64_t)x7 * (uint64_t)y5) & mask3_64));
+
+  uint64_t b3 = ((((uint64_t)x6 * (uint64_t)y7) & mask3_64) ^
+                 (((uint64_t)x7 * (uint64_t)y6) & mask3_64));
+  b2 ^= (b3 << 32);
+  b3 = b3 >> 32;
+
+  b3 ^= ((uint64_t)x7 * (uint64_t)y7) & mask3_64;
+
+  //middle part: (x_low ^ x_high) * (y_low ^ y_high), operands overwritten in place
+  x0 ^= x4;
+  x1 ^= x5;
+  x2 ^= x6;
+  x3 ^= x7;
+
+  y0 ^= y4;
+  y1 ^= y5;
+  y2 ^= y6;
+  y3 ^= y7;
+
+  uint64_t c0 = ((uint64_t)x0 * (uint64_t)y0) & mask3_64;
+  uint64_t c1 = ((((uint64_t)x0 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y0) & mask3_64));
+  c0 ^= (c1 << 32);
+  c1 = c1 >> 32;
+  c1 ^= ((((uint64_t)x0 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x1 * (uint64_t)y1) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y0) & mask3_64));
+
+  uint64_t c2 = ((((uint64_t)x0 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y2) & mask3_64) ^
+                 (((uint64_t)x2 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y0) & mask3_64));
+  c1 ^= (c2 << 32);
+  c2 = c2 >> 32;
+  c2 ^= ((((uint64_t)x1 * (uint64_t)y3) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x3 * (uint64_t)y1) & mask3_64));
+
+  uint64_t c3 = ((((uint64_t)x2 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y2) & mask3_64));
+  c2 ^= (c3 << 32);
+  c3 = c3 >> 32;
+
+  c3 ^= ((uint64_t)x3 * (uint64_t)y3) & mask3_64;
+
+  c0 = c0 ^ a0 ^ b0; // Karatsuba cross term: middle ^ low ^ high, taken before a2/a3/b0/b1 are modified
+  c1 = c1 ^ a1 ^ b1;
+  c2 = c2 ^ a2 ^ b2;
+  c3 = c3 ^ a3 ^ b3;
+
+  a2 ^= c0; // cross term is added two limbs (32 result bits) above the low product
+  a3 ^= c1;
+  b0 ^= c2;
+  b1 ^= c3;
+
+  // result inv_padding: every limb is packed back from one bit per nibble to 16 contiguous bits
+  inv_zero_padding(a0, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a1, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a2, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a3, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b0, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b1, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b2, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b3, mask0_64, mask1_64, mask2_64, mask_final);
+
+  *z0 = a0 | (a1 << 16) | (a2 << 32) | (a3 << 48); // low 64 bits of the product
+  *z1 = b0 | (b1 << 16) | (b2 << 32) | (b3 << 48); // high 64 bits of the product
+}
+
+void GF_mul_s(GF c, const GF a, const GF b) // c = a * b: same combination and folding as GF_mul, but every 64x64 product comes from the C routine poly64_mul_s
+{
+  uint64_t t[3] = {0,}; // scratch for the two output words of each poly64_mul_s call
+  uint64_t temp[6] = {0,}; // unreduced product; temp[3..5] are the words folded back below
+
+  poly64_mul_s(&t[0], &temp[0], a[0], b[0]); // argument order is (high, low, x, y)
+  poly64_mul_s(&t[2], &t[1], a[1], b[1]);
+  t[0] ^= t[1];
+
+  poly64_mul_s(&temp[5], &t[1], a[2], b[2]);
+  t[1] ^= t[2];
+
+  temp[1] = t[0] ^ temp[0]; // pre-load the diagonal-product terms that every middle word needs
+  temp[2] = t[1] ^ temp[1];
+  temp[4] = temp[5] ^ t[1];
+  temp[3] = temp[4] ^ t[0];
+
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[1]), (b[0] ^ b[1])); // cross product into words 1 and 2
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[2]), (b[0] ^ b[2])); // cross product into words 2 and 3
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+
+  poly64_mul_s(&t[1], &t[0], (a[1] ^ a[2]), (b[1] ^ b[2])); // cross product into words 3 and 4
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+
+  t[0] = temp[3] ^ ((temp[5] >> 57) ^ (temp[5] >> 62) ^ (temp[5] >> 63)); // word 3 plus the bits shifted out of temp[5] by the <<7, <<2, <<1 folds below
+
+  c[2] = temp[2] ^ temp[5]; // fold with shifts 0, 1, 2 and 7, consistent with reduction modulo x^192 + x^7 + x^2 + x + 1
+  c[2] ^= (temp[5] << 7) | (temp[4] >> 57);
+  c[2] ^= (temp[5] << 2) | (temp[4] >> 62);
+  c[2] ^= (temp[5] << 1) | (temp[4] >> 63);
+
+  c[1] = temp[1] ^ temp[4];
+  c[1] ^= (temp[4] << 7) | (t[0] >> 57);
+  c[1] ^= (temp[4] << 2) | (t[0] >> 62);
+  c[1] ^= (temp[4] << 1) | (t[0] >> 63);
+
+  c[0] = temp[0] ^ t[0]; // c is first written only after a and b have been fully read
+  c[0] ^= (t[0] << 7);
+  c[0] ^= (t[0] << 2);
+  c[0] ^= (t[0] << 1);
+}
+
+void GF_mul_add_s(GF c, const GF a, const GF b) // c ^= a * b: GF_mul_s with the reduced product XOR-accumulated into the existing c
+{
+  uint64_t t[3] = {0,}; // scratch for the two output words of each poly64_mul_s call
+  uint64_t temp[6] = {0,}; // unreduced product; temp[3..5] are the words folded back below
+
+  poly64_mul_s(&t[0], &temp[0], a[0], b[0]); // argument order is (high, low, x, y)
+  poly64_mul_s(&t[2], &t[1], a[1], b[1]);
+  t[0] ^= t[1];
+
+  poly64_mul_s(&temp[5], &t[1], a[2], b[2]);
+  t[1] ^= t[2];
+
+  temp[1] = t[0] ^ temp[0]; // pre-load the diagonal-product terms that every middle word needs
+  temp[2] = t[1] ^ temp[1];
+  temp[4] = temp[5] ^ t[1];
+  temp[3] = temp[4] ^ t[0];
+
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[1]), (b[0] ^ b[1])); // cross product into words 1 and 2
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[2]), (b[0] ^ b[2])); // cross product into words 2 and 3
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+
+  poly64_mul_s(&t[1], &t[0], (a[1] ^ a[2]), (b[1] ^ b[2])); // cross product into words 3 and 4
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+
+  t[0] = temp[3] ^ ((temp[5] >> 57) ^ (temp[5] >> 62) ^ (temp[5] >> 63)); // word 3 plus the bits shifted out of temp[5] by the <<7, <<2, <<1 folds below
+
+  c[2] ^= temp[2] ^ temp[5]; // "^=" here (GF_mul_s uses "="): the reduced product is added to the caller's c
+  c[2] ^= (temp[5] << 7) | (temp[4] >> 57);
+  c[2] ^= (temp[5] << 2) | (temp[4] >> 62);
+  c[2] ^= (temp[5] << 1) | (temp[4] >> 63);
+
+  c[1] ^= temp[1] ^ temp[4];
+  c[1] ^= (temp[4] << 7) | (t[0] >> 57);
+  c[1] ^= (temp[4] << 2) | (t[0] >> 62);
+  c[1] ^= (temp[4] << 1) | (t[0] >> 63);
+
+  c[0] ^= temp[0] ^ t[0];
+  c[0] ^= (t[0] << 7);
+  c[0] ^= (t[0] << 2);
+  c[0] ^= (t[0] << 1);
+}
diff --git a/crypto_sign/aimer192f/m4stack/field.h b/crypto_sign/aimer192f/m4stack/field.h
new file mode 100644
index 00000000..5182adc4
--- /dev/null
+++ b/crypto_sign/aimer192f/m4stack/field.h
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef FIELD_H
+#define FIELD_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+typedef uint64_t GF[3]; // field element held as three 64-bit words; being an array type, a GF parameter is passed as a pointer
+
+#define poly64_mul AIMER_NAMESPACE(poly64_mul) // exported name is rewritten through AIMER_NAMESPACE
+void poly64_mul(const uint64_t *a, const uint64_t *b, uint64_t *c1, uint64_t *c0); // not defined in field.c; NOTE(review): presumably a 64x64 carry-less multiply (c1 high, c0 low) from __asm_field.S - confirm
+#define GF_set0 AIMER_NAMESPACE(GF_set0)
+void GF_set0(GF a); // not defined in field.c; NOTE(review): presumably zeroes a - confirm
+#define GF_copy AIMER_NAMESPACE(GF_copy)
+void GF_copy(GF out, const GF in); // not defined in field.c; NOTE(review): presumably copies in to out - confirm
+#define GF_to_bytes AIMER_NAMESPACE(GF_to_bytes)
+void GF_to_bytes(uint8_t *out, const GF in); // not defined in field.c; NOTE(review): byte order of the serialisation is not visible here - confirm
+#define GF_from_bytes AIMER_NAMESPACE(GF_from_bytes)
+void GF_from_bytes(GF out, const uint8_t *in); // not defined in field.c; NOTE(review): presumably the inverse of GF_to_bytes - confirm
+
+#define GF_add AIMER_NAMESPACE(GF_add)
+void GF_add(GF c, const GF a, const GF b); // not defined in field.c; NOTE(review): presumably c = a ^ b word-wise - confirm
+#define GF_mul AIMER_NAMESPACE(GF_mul)
+void GF_mul(GF c, const GF a, const GF b); // c = a * b; defined in field.c on top of poly64_mul
+#define GF_mul_add AIMER_NAMESPACE(GF_mul_add)
+void GF_mul_add(GF c, const GF a, const GF b); // c ^= a * b; defined in field.c on top of poly64_mul
+#define GF_transposed_matmul AIMER_NAMESPACE(GF_transposed_matmul)
+void GF_transposed_matmul(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]); // c = XOR of the rows of b selected by the set bits of a; defined in field.c
+#define GF_transposed_matmul_add AIMER_NAMESPACE(GF_transposed_matmul_add)
+void GF_transposed_matmul_add(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]); // same selection, XOR-accumulated into c; defined in field.c
+
+#define GF_mul_s AIMER_NAMESPACE(GF_mul_s)
+void GF_mul_s(GF c, const GF a, const GF b); // c = a * b; field.c variant built on the C routine poly64_mul_s
+#define GF_mul_add_s AIMER_NAMESPACE(GF_mul_add_s)
+void GF_mul_add_s(GF c, const GF a, const GF b); // c ^= a * b; field.c variant built on the C routine poly64_mul_s
+#define GF_sqr_s AIMER_NAMESPACE(GF_sqr_s)
+void GF_sqr_s(GF c, const GF a); // not defined in field.c; NOTE(review): presumably c = a * a - confirm where it is implemented
+
+#endif // FIELD_H
diff --git a/crypto_sign/aimer192f/m4stack/hash.c b/crypto_sign/aimer192f/m4stack/hash.c
new file mode 100644
index 00000000..06a6f473
--- /dev/null
+++ b/crypto_sign/aimer192f/m4stack/hash.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+
+#include "hash.h"
+#include <stddef.h>
+#include <stdint.h>
+
+void hash_init(hash_instance *ctx) // initialises ctx as a fresh incremental SHAKE256 state
+{
+  shake256_inc_init(ctx);
+}
+
+void hash_init_prefix(hash_instance *ctx, uint8_t prefix) // initialises ctx and absorbs the single byte prefix as the first input
+{
+  shake256_inc_init(ctx);
+  shake256_inc_absorb(ctx, &prefix, sizeof(prefix)); // sizeof(prefix) is 1: exactly one byte is absorbed
+}
+
+void hash_update(hash_instance *ctx, const uint8_t *data, size_t data_len) // absorbs data_len bytes from data into ctx
+{
+  shake256_inc_absorb(ctx, data, data_len);
+}
+
+void hash_final(hash_instance *ctx) // finalises ctx via shake256_inc_finalize; this file calls it between hash_update and hash_squeeze
+{
+  shake256_inc_finalize(ctx);
+}
+
+void hash_squeeze(hash_instance *ctx, uint8_t *buffer, size_t buffer_len) // writes buffer_len output bytes from ctx into buffer
+{
+  shake256_inc_squeeze(buffer, buffer_len, ctx); // note the argument order: the fips202 call takes the context last
+}
+
+void hash_ctx_clone(hash_instance *ctx_dest, const hash_instance *ctx_src) // copies the state of ctx_src into ctx_dest; ctx_src is const and left unchanged
+{
+  shake256_inc_ctx_clone(ctx_dest, ctx_src);
+}
+
+void hash_ctx_release(hash_instance *ctx) // hands ctx to shake256_inc_ctx_release; NOTE(review): whether anything is actually freed depends on the fips202 implementation
+{
+  shake256_inc_ctx_release(ctx);
+}
diff --git a/crypto_sign/aimer192f/m4stack/hash.h b/crypto_sign/aimer192f/m4stack/hash.h
new file mode 100644
index 00000000..0fb7c8ef
--- /dev/null
+++ b/crypto_sign/aimer192f/m4stack/hash.h
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef HASH_H
+#define HASH_H
+
+#include "fips202.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+static const uint8_t HASH_PREFIX_0 = 0; // NOTE(review): presumably domain-separation bytes for hash_init_prefix; which hash uses which value is not visible here - confirm in sign.c
+static const uint8_t HASH_PREFIX_1 = 1;
+static const uint8_t HASH_PREFIX_2 = 2;
+static const uint8_t HASH_PREFIX_3 = 3;
+static const uint8_t HASH_PREFIX_4 = 4;
+static const uint8_t HASH_PREFIX_5 = 5; // being static const in a header, each including translation unit gets its own copy
+
+typedef shake256incctx hash_instance; // hash state is the incremental SHAKE256 context type from fips202.h
+
+#define hash_init AIMER_NAMESPACE(hash_init) // exported names are rewritten through AIMER_NAMESPACE
+void hash_init(hash_instance *ctx); // start a fresh SHAKE256 state
+#define hash_init_prefix AIMER_NAMESPACE(hash_init_prefix)
+void hash_init_prefix(hash_instance *ctx, uint8_t prefix); // start a fresh state and absorb one prefix byte
+#define hash_update AIMER_NAMESPACE(hash_update)
+void hash_update(hash_instance *ctx, const uint8_t *data, size_t data_len); // absorb data_len bytes
+#define hash_final AIMER_NAMESPACE(hash_final)
+void hash_final(hash_instance *ctx); // finalise absorbing
+#define hash_squeeze AIMER_NAMESPACE(hash_squeeze)
+void hash_squeeze(hash_instance *ctx, uint8_t *buffer, size_t buffer_len); // produce buffer_len output bytes
+#define hash_ctx_clone AIMER_NAMESPACE(hash_ctx_clone)
+void hash_ctx_clone(hash_instance *ctx_dest, const hash_instance *ctx_src); // copy a state
+#define hash_ctx_release AIMER_NAMESPACE(hash_ctx_release)
+void hash_ctx_release(hash_instance *ctx); // release a state through the fips202 release call
+
+#endif // HASH_H
diff --git a/crypto_sign/aimer192f/m4stack/params.h b/crypto_sign/aimer192f/m4stack/params.h
new file mode 100644
index 00000000..d7e67589
--- /dev/null
+++ b/crypto_sign/aimer192f/m4stack/params.h
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#define AIMER_NAMESPACE(s)          samsungsds_aimer192f_m4stack_##s // pastes this variant's prefix onto identifier s
+
+#define SECURITY_BITS               192                  // security parameter
+#define SECURITY_BYTES              (SECURITY_BITS / 8)  // byte size of security parameter
+
+#define AIM2_NUM_BITS_FIELD         SECURITY_BITS        // number of bits in field element
+#define AIM2_NUM_BYTES_FIELD        SECURITY_BYTES       // number of bytes in field element
+#define AIM2_NUM_WORDS_FIELD        (SECURITY_BITS / 64) // number of 64-bit words in element
+#define AIM2_NUM_BITS_WORD          64                   // number of bits in word
+#define AIM2_IV_SIZE                SECURITY_BYTES       // byte size of AIM2 initial vector
+
+#define AIM2_NUM_INPUT_SBOX         2                    // number of AIM2 input S-boxes
+
+#define AIMER_SALT_SIZE             SECURITY_BYTES       // byte size of salt
+#define AIMER_SEED_SIZE             SECURITY_BYTES       // byte size of seed
+#define AIMER_COMMIT_SIZE           (SECURITY_BYTES * 2) // byte size of commitment
+
+#define AIMER_L                     AIM2_NUM_INPUT_SBOX  // alias for the number of AIM2 input S-boxes
+#define AIMER_T                     49                   // number of parallel repetitions (Tau)
+#define AIMER_N                     16                   // number of MPC parties (N)
+#define AIMER_LOGN                  4                    // log_2(N)
+
+#endif // PARAMS_H
diff --git a/crypto_sign/aimer192f/m4stack/sign.c b/crypto_sign/aimer192f/m4stack/sign.c
new file mode 100644
index 00000000..905b10f8
--- /dev/null
+++ b/crypto_sign/aimer192f/m4stack/sign.c
@@ -0,0 +1,628 @@
+// SPDX-License-Identifier: MIT
+
+#include "api.h"
+#include "aim2.h"
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include "randombytes.h"
+#include "sign.h"
+#include "tree.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+// Hash (rep, party, seed) on top of a copy of ctx_precom; the first
+// AIMER_COMMIT_SIZE output bytes become commit, the next ones fill *tape.
+void commit_and_expand_tape(tape_t *tape, uint8_t *commit,
+                            const hash_instance *ctx_precom,
+                            const uint8_t seed[AIMER_SEED_SIZE],
+                            size_t rep, size_t party)
+{
+  // hash input: rep index || party index || seed (indices cut to one byte)
+  uint8_t input[2 + AIMER_SEED_SIZE] = {(uint8_t)rep, (uint8_t)party};
+  memcpy(&input[2], seed, AIMER_SEED_SIZE);
+
+  hash_instance ctx_tape;
+  hash_ctx_clone(&ctx_tape, ctx_precom);
+  hash_update(&ctx_tape, input, sizeof(input));
+  hash_final(&ctx_tape);
+  hash_squeeze(&ctx_tape, commit, AIMER_COMMIT_SIZE);
+  hash_squeeze(&ctx_tape, (uint8_t *)tape, sizeof(*tape));
+  hash_ctx_release(&ctx_tape);
+}
+
+// Evaluate AIM2 on one party's shares: reads x_shares[0..1] and the initial
+// x_shares[AIMER_L], then fills x_shares[AIMER_L] and every z_shares entry.
+void aim2_mpc(mult_chk_t *mult_chk,
+              const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+              const GF ct_GF)
+{
+  GF *x = mult_chk->x_shares;
+  GF *z = mult_chk->z_shares;
+
+  // input S-boxes: pt + c = t ^ {2 ^ e - 1}
+  // --> t ^ {2 ^ e} + t * c = t * pt, i.e. z = x * pt with x = t
+  GF_mul(z[0], x[0], aim2_constants[0]);
+  GF_transposed_matmul_add(z[0], x[0], aim2_e1_power_matrix);
+  GF_mul(z[1], x[1], aim2_constants[1]);
+  GF_transposed_matmul_add(z[1], x[1], aim2_e2_power_matrix);
+
+  // linear layer: accumulate the A[0] and A[1] products into x[AIMER_L]
+  GF_transposed_matmul_add(x[AIMER_L], x[0], matrix_A[0]);
+  GF_transposed_matmul_add(x[AIMER_L], x[1], matrix_A[1]);
+
+  // output S-box: x ^ {2 ^ e - 1} = pt + ct
+  // --> x ^ {2 ^ e} + x * ct = x * pt, i.e. z = x * pt; five squarings
+  GF_sqr_s(z[AIMER_L], x[AIMER_L]);
+  for (size_t i = 1; i < 5; i++)
+  {
+    GF_sqr_s(z[AIMER_L], z[AIMER_L]);
+  }
+  GF_mul_add(z[AIMER_L], x[AIMER_L], ct_GF);
+}
+
+// Phase 1: commit to every party's tape, store the deltas in sign, derive h_1
+void run_phase_1(signature_t *sign,
+                 const GF pt_GF, const GF sbox_outputs[AIMER_L],
+                 const uint8_t mu[AIMER_COMMIT_SIZE],
+                 const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE])
+{
+  uint8_t commits[AIMER_COMMIT_SIZE]; // one commitment at a time, reused
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE];
+
+  hash_instance ctx;
+
+  // hash_instance for h_1: absorbs mu and salt first (sign->salt must be set)
+  hash_init_prefix(&ctx, HASH_PREFIX_1);
+  hash_update(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx, sign->salt, AIMER_SALT_SIZE);
+
+  hash_instance ctx_precom; // prefix state cloned for each tape commitment
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    // compute parties' seeds using binary tree
+    expand_tree(nodes, sign->salt, rep, root_seeds[rep]);
+
+    // running sums of all parties' tape values for this repetition
+    tape_t delta;
+    memset(&delta, 0, sizeof(tape_t));
+
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      // generate execution views and commitments
+      tape_t tape;
+      commit_and_expand_tape(&tape, commits, &ctx_precom,
+                             nodes[AIMER_N + party - 1],  rep, party);
+      hash_update(&ctx, commits, AIMER_COMMIT_SIZE);
+
+      // compute offsets
+      GF_add(delta.pt_share, delta.pt_share, tape.pt_share);
+      GF_add(delta.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+      GF_add(delta.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+      GF_add(delta.a_share, delta.a_share, tape.a_share);
+      GF_add(delta.c_share, delta.c_share, tape.c_share);
+
+      // after the last party: turn the sums into deltas and store them
+      if (party == AIMER_N - 1)
+      {
+        GF_add(delta.pt_share, delta.pt_share, pt_GF);
+        GF_add(delta.t_shares[0], delta.t_shares[0], sbox_outputs[0]);
+        GF_add(delta.t_shares[1], delta.t_shares[1], sbox_outputs[1]);
+        GF_mul_add_s(delta.c_share, pt_GF, delta.a_share);
+
+        GF_to_bytes(sign->proofs[rep].delta_pt_bytes, delta.pt_share);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[0], delta.t_shares[0]);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[1], delta.t_shares[1]);
+        GF_to_bytes(sign->proofs[rep].delta_c_bytes, delta.c_share);
+      }
+    }
+
+    // NOTE: one call covers delta_pt, delta_ts, delta_c (adjacent in proof_t)
+    hash_update(&ctx, sign->proofs[rep].delta_pt_bytes,
+                AIM2_NUM_BYTES_FIELD * (AIMER_L + 2));
+  }
+  hash_ctx_release(&ctx_precom);
+
+  // commit to salt, (all commitments of parties' seeds,
+  // delta_pt, delta_t, delta_c) for all repetitions
+  hash_final(&ctx);
+  hash_squeeze(&ctx, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+}
+
+void run_phase_1_to_3(signature_t *sign,
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE],
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b, const GF ct_GF)
+{
+  uint8_t commits[AIMER_COMMIT_SIZE]; // written by the tape expansion, not read here
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE];
+
+  GF pt_shares[AIMER_N];
+  GF alpha_v_shares[2][AIMER_N]; // [0]: alpha shares, [1]: v shares
+  GF epsilons[AIMER_L + 1];
+
+  // h_2 absorbs h_1, salt, then the alpha and v shares of every repetition
+  hash_instance ctx;
+  hash_init_prefix(&ctx, HASH_PREFIX_2);
+  hash_update(&ctx, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_update(&ctx, sign->salt, AIMER_SALT_SIZE);
+
+  // epsilons are squeezed, repetition by repetition, from a hash of h_1
+  hash_instance ctx_e;
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  GF alpha = {0,};
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    // compute parties' seeds using binary tree
+    expand_tree(nodes, sign->salt, rep, root_seeds[rep]);
+
+    // deltas are reloaded from the proof for the last party only
+    tape_t delta;
+    memset(&delta, 0, sizeof(tape_t));
+
+    // expand challenge
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF_set0(alpha);
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      // generate execution views and commitments
+      tape_t tape;
+      commit_and_expand_tape(&tape, commits, &ctx_precom,
+                             nodes[AIMER_N + party - 1], rep, party);
+
+      mult_chk_t mult_chk;
+      memset(&mult_chk, 0, sizeof(mult_chk));
+
+      // last party: apply the deltas stored in the proof and take vector_b
+      if (party == AIMER_N - 1)
+      {
+        GF_from_bytes(delta.pt_share, sign->proofs[rep].delta_pt_bytes);
+        GF_from_bytes(delta.t_shares[0], sign->proofs[rep].delta_ts_bytes[0]);
+        GF_from_bytes(delta.t_shares[1], sign->proofs[rep].delta_ts_bytes[1]);
+        GF_from_bytes(delta.c_share, sign->proofs[rep].delta_c_bytes);
+
+        GF_add(tape.pt_share, delta.pt_share, tape.pt_share);
+        GF_add(tape.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+        GF_add(tape.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+        GF_add(tape.c_share, delta.c_share, tape.c_share);
+
+        GF_copy(mult_chk.x_shares[AIMER_L], vector_b);
+      }
+
+      // run the MPC simulation and prepare the mult check inputs
+      GF_copy(mult_chk.x_shares[0], tape.t_shares[0]);
+      GF_copy(mult_chk.x_shares[1], tape.t_shares[1]);
+      GF_copy(pt_shares[party], tape.pt_share);
+      GF_copy(alpha_v_shares[0][party], tape.a_share);
+      GF_copy(alpha_v_shares[1][party], tape.c_share);
+      aim2_mpc(&mult_chk, matrix_A, ct_GF);
+
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[2], epsilons[2]);
+      GF_add(alpha, alpha, alpha_v_shares[0][party]);
+    }
+
+    // alpha is opened, so we can finish calculating v_share
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      GF_mul_add(alpha_v_shares[1][party], pt_shares[party], alpha);
+    }
+    hash_update(&ctx, (const uint8_t *)alpha_v_shares, sizeof(alpha_v_shares));
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+
+  hash_final(&ctx);
+  hash_squeeze(&ctx, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+}
+
+void run_phase_1_to_5(signature_t *sign,
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE],
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b)
+{
+  // challenge parties: one hidden party index per repetition, from h_2
+  hash_instance ctx;
+  hash_init(&ctx);
+  hash_update(&ctx, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_final(&ctx);
+
+  uint8_t indices[AIMER_T]; // AIMER_N <= 256
+  hash_squeeze(&ctx, indices, AIMER_T);
+  hash_ctx_release(&ctx);
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    indices[rep] &= (1 << AIMER_LOGN) - 1; // keep the low AIMER_LOGN bits
+  }
+
+  uint8_t commits[AIMER_COMMIT_SIZE];
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE];
+
+  GF epsilons[AIMER_L + 1];
+
+  // epsilons are squeezed, repetition by repetition, from a hash of h_1
+  hash_instance ctx_e;
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  mult_chk_t mult_chk; // only x_shares is used in this function
+  memset(&mult_chk, 0, sizeof(mult_chk_t));
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    size_t i_bar = indices[rep];
+
+    // compute parties' seeds using binary tree
+    expand_tree(nodes, sign->salt, rep, root_seeds[rep]);
+    reveal_all_but(sign->proofs[rep].reveal_path,
+                   (const uint8_t (*)[AIMER_SEED_SIZE])nodes, i_bar);
+
+    // expand challenge
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF alpha_share = {0,};
+
+    tape_t tape; // tape of the hidden party i_bar only
+    commit_and_expand_tape(&tape, commits, &ctx_precom,
+                           nodes[AIMER_N + i_bar - 1], rep, i_bar);
+    memcpy(sign->proofs[rep].missing_commitment, commits,
+           AIMER_COMMIT_SIZE);
+
+    GF_set0(mult_chk.x_shares[AIMER_L]);
+
+    // hidden party is the last one: apply the proof deltas and take vector_b
+    if (i_bar == AIMER_N - 1)
+    {
+      // initialize adjustment values
+      tape_t delta;
+      memset(&delta, 0, sizeof(tape_t));
+
+      GF_from_bytes(delta.pt_share, sign->proofs[rep].delta_pt_bytes);
+      GF_from_bytes(delta.t_shares[0], sign->proofs[rep].delta_ts_bytes[0]);
+      GF_from_bytes(delta.t_shares[1], sign->proofs[rep].delta_ts_bytes[1]);
+      GF_from_bytes(delta.c_share, sign->proofs[rep].delta_c_bytes);
+
+      GF_add(tape.pt_share, delta.pt_share, tape.pt_share);
+      GF_add(tape.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+      GF_add(tape.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+      GF_add(tape.c_share, delta.c_share, tape.c_share);
+
+      GF_copy(mult_chk.x_shares[AIMER_L], vector_b);
+    }
+
+    for (size_t ell = 0; ell < AIMER_L; ell++)
+    {
+      GF_copy(mult_chk.x_shares[ell], tape.t_shares[ell]);
+
+      // x_* = sum_i A[i] * t[i] + b
+      GF_transposed_matmul_add(mult_chk.x_shares[AIMER_L],
+                               tape.t_shares[ell], matrix_A[ell]);
+    }
+    GF_copy(alpha_share, tape.a_share);
+
+    // alpha share of the hidden party: a + sum_j x[j] * epsilon[j]
+    GF_mul_add(alpha_share, mult_chk.x_shares[0], epsilons[0]);
+    GF_mul_add(alpha_share, mult_chk.x_shares[1], epsilons[1]);
+    GF_mul_add(alpha_share, mult_chk.x_shares[2], epsilons[2]);
+    GF_to_bytes(sign->proofs[rep].missing_alpha_share_bytes, alpha_share);
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Generate a key pair: pk = iv || ct, sk = pt || iv || ct; -1 on NULL input.
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk)
+{
+  if (pk == NULL || sk == NULL)
+  {
+    return -1;
+  }
+  uint8_t *const iv = pk;
+  uint8_t *const ct = pk + AIM2_IV_SIZE;
+  randombytes(sk, AIM2_NUM_BYTES_FIELD); // secret input pt
+  randombytes(iv, AIM2_IV_SIZE);
+  aim2(ct, sk, iv);
+  memcpy(sk + AIM2_NUM_BYTES_FIELD, pk, AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  return 0;
+}
+
+int crypto_sign_signature(uint8_t *sig, size_t *siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk)
+{
+  signature_t *sign = (signature_t *)sig; // signature is built in place in sig
+
+  GF pt_GF = {0,}, ct_GF = {0,};
+  GF_from_bytes(pt_GF, sk); // sk layout: pt || iv || ct
+  GF_from_bytes(ct_GF, sk + AIM2_NUM_BYTES_FIELD + AIM2_IV_SIZE);
+
+  // message pre-hashing: mu covers the iv || ct part of sk and the message
+  hash_instance ctx;
+  hash_init_prefix(&ctx, HASH_PREFIX_0);
+  hash_update(&ctx, sk + AIM2_NUM_BYTES_FIELD,
+              AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx, m, mlen);
+  hash_final(&ctx);
+
+  uint8_t mu[AIMER_COMMIT_SIZE];
+  hash_squeeze(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+
+  // compute first L sboxes' outputs
+  GF sbox_outputs[AIMER_L];
+  aim2_sbox_outputs(sbox_outputs, pt_GF);
+
+  // derive the binary matrix and the vector from the initial vector
+  GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+  generate_matrix_LU(matrix_A, vector_b, sk + AIM2_NUM_BYTES_FIELD);
+
+  // generate per-signature randomness
+  uint8_t random[SECURITY_BYTES];
+  randombytes(random, SECURITY_BYTES);
+
+  // generate salt and root seeds from pt, mu and the fresh randomness
+  hash_instance ctx_roots;
+  hash_init_prefix(&ctx_roots, HASH_PREFIX_3);
+  hash_update(&ctx_roots, sk, AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx_roots, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_roots, random, SECURITY_BYTES);
+  hash_final(&ctx_roots);
+  hash_squeeze(&ctx_roots, sign->salt, AIMER_SALT_SIZE);
+
+  uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE];
+  hash_squeeze(&ctx_roots, (uint8_t *)root_seeds, AIMER_T * AIMER_SEED_SIZE);
+  hash_ctx_release(&ctx_roots);
+  // each phase below re-derives the party tapes from root_seeds
+  run_phase_1(sign, pt_GF, (const GF *)sbox_outputs, mu,
+              (const uint8_t (*)[AIMER_SEED_SIZE])root_seeds);
+
+  run_phase_1_to_3(sign, (const uint8_t (*)[AIMER_SEED_SIZE])root_seeds,
+                   (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A,
+                   vector_b, ct_GF);
+
+  run_phase_1_to_5(sign, (const uint8_t (*)[AIMER_SEED_SIZE])root_seeds,
+                   (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A,
+                   vector_b);
+
+  *siglen = CRYPTO_BYTES; // NOTE(review): presumably sizeof(signature_t) - confirm in api.h
+
+  return 0;
+}
+
+// Writes m || signature to sm; memmove because sm may overlap m (in-place use).
+int crypto_sign(uint8_t *sm, size_t *smlen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk)
+{
+  // sign first: the signature lands behind the message area and m is intact
+  const int ret = crypto_sign_signature(sm + mlen, smlen, m, mlen, sk);
+  memmove(sm, m, mlen);
+  *smlen += mlen;
+  return ret; // propagate the signing status instead of dropping it
+}
+
+int crypto_sign_verify(const uint8_t *sig, size_t siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *pk)
+{
+  if (siglen != CRYPTO_BYTES)
+  {
+    return -1;
+  }
+
+  const signature_t *sign = (const signature_t *)sig; // parsed in place
+
+  GF ct_GF = {0,};
+  GF_from_bytes(ct_GF, pk + AIM2_IV_SIZE); // pk layout: iv || ct
+
+  // derive the binary matrix and the vector from the initial vector
+  GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+  generate_matrix_LU(matrix_A, vector_b, pk);
+
+  hash_instance ctx_e, ctx_h1, ctx_h2;
+
+  // indices = Expand(h_2)
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  uint8_t indices[AIMER_T]; // AIMER_N <= 256
+  hash_squeeze(&ctx_e, indices, AIMER_T);
+  hash_ctx_release(&ctx_e);
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    indices[rep] &= (1 << AIMER_LOGN) - 1; // keep the low AIMER_LOGN bits
+  }
+
+  // epsilons = Expand(h_1)
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  // message pre-hashing
+  uint8_t mu[AIMER_COMMIT_SIZE];
+  hash_init_prefix(&ctx_h1, HASH_PREFIX_0);
+  hash_update(&ctx_h1, pk, AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx_h1, m, mlen);
+  hash_final(&ctx_h1);
+  hash_squeeze(&ctx_h1, mu, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h1);
+
+  // ready for computing h_1' and h_2'
+  hash_init_prefix(&ctx_h1, HASH_PREFIX_1);
+  hash_update(&ctx_h1, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_h1, sign->salt, AIMER_SALT_SIZE);
+
+  hash_init_prefix(&ctx_h2, HASH_PREFIX_2);
+  hash_update(&ctx_h2, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_h2, sign->salt, AIMER_SALT_SIZE);
+
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    size_t i_bar = indices[rep];
+    uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE]; // no root: node k is nodes[k - 2]
+
+    reconstruct_tree(nodes, sign->salt, sign->proofs[rep].reveal_path,
+                     rep, i_bar);
+
+    GF pt_shares[AIMER_N];
+    GF alpha_v_shares[2][AIMER_N]; // [0]: alpha shares, [1]: v shares
+    GF_set0(alpha_v_shares[1][i_bar]); // becomes the sum of the other v shares
+
+    GF epsilons[AIMER_L + 1];
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF alpha = {0,};
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      // hidden party: commitment and alpha share are taken from the proof
+      if (party == i_bar)
+      {
+        hash_update(&ctx_h1, sign->proofs[rep].missing_commitment,
+                    AIMER_COMMIT_SIZE);
+        GF_from_bytes(alpha_v_shares[0][i_bar],
+                      sign->proofs[rep].missing_alpha_share_bytes);
+        GF_add(alpha, alpha, alpha_v_shares[0][i_bar]);
+        continue;
+      }
+
+      tape_t tape;
+      uint8_t commit[AIMER_COMMIT_SIZE];
+      commit_and_expand_tape(&tape, commit, &ctx_precom,
+                             nodes[AIMER_N + party - 2], rep, party);
+      hash_update(&ctx_h1, commit, AIMER_COMMIT_SIZE);
+
+      // adjust last shares
+      mult_chk_t mult_chk;
+      memset(&mult_chk, 0, sizeof(mult_chk_t));
+      if (party == AIMER_N - 1)
+      {
+        GF temp = {0,};
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_pt_bytes);
+        GF_add(tape.pt_share, tape.pt_share, temp);
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[0]);
+        GF_add(tape.t_shares[0], tape.t_shares[0], temp);
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[1]);
+        GF_add(tape.t_shares[1], tape.t_shares[1], temp);
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_c_bytes);
+        GF_add(tape.c_share, tape.c_share, temp);
+
+        GF_copy(mult_chk.x_shares[AIMER_L], vector_b);
+      }
+
+      // run the MPC simulation and prepare the mult check inputs
+      GF_copy(mult_chk.x_shares[0], tape.t_shares[0]);
+      GF_copy(mult_chk.x_shares[1], tape.t_shares[1]);
+      GF_copy(pt_shares[party], tape.pt_share);
+      GF_copy(alpha_v_shares[0][party], tape.a_share);
+      GF_copy(alpha_v_shares[1][party], tape.c_share);
+      aim2_mpc(&mult_chk, (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A, ct_GF);
+
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[2], epsilons[2]);
+
+      GF_add(alpha, alpha, alpha_v_shares[0][party]);
+    }
+
+    // alpha is opened, so we can finish calculating v_share
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      if (party == i_bar)
+      {
+        continue;
+      }
+
+      GF_mul_add(alpha_v_shares[1][party], pt_shares[party], alpha);
+      GF_add(alpha_v_shares[1][i_bar], alpha_v_shares[1][i_bar],
+             alpha_v_shares[1][party]);
+    }
+
+    // v is opened
+    hash_update(&ctx_h2, (const uint8_t *)alpha_v_shares,
+                sizeof(alpha_v_shares));
+
+    // NOTE: one call covers delta_pt, delta_ts, delta_c (adjacent in proof_t)
+    hash_update(&ctx_h1, sign->proofs[rep].delta_pt_bytes,
+                AIM2_NUM_BYTES_FIELD * (AIMER_L + 2));
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+
+  uint8_t h_1_prime[AIMER_COMMIT_SIZE];
+  hash_final(&ctx_h1);
+  hash_squeeze(&ctx_h1, h_1_prime, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h1);
+
+  uint8_t h_2_prime[AIMER_COMMIT_SIZE];
+  hash_final(&ctx_h2);
+  hash_squeeze(&ctx_h2, h_2_prime, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h2);
+
+  // accept only when both recomputed hashes match the ones in the signature
+  if (memcmp(h_1_prime, sign->h_1, AIMER_COMMIT_SIZE) != 0 ||
+      memcmp(h_2_prime, sign->h_2, AIMER_COMMIT_SIZE) != 0)
+  {
+    return -1;
+  }
+
+  return 0;
+}
+
+// Verify a signed message (message || signature) and copy the message out.
+// Returns 0 on success, -1 if sm is too short or the signature is rejected.
+int crypto_sign_open(uint8_t *m, size_t *mlen,
+        const uint8_t *sm, size_t smlen,
+        const uint8_t *pk)
+{
+  if (smlen < CRYPTO_BYTES)
+  {
+    return -1;
+  }
+
+  // the signature occupies the trailing CRYPTO_BYTES of sm
+  const size_t body_len = smlen - CRYPTO_BYTES;
+  if (crypto_sign_verify(sm + body_len, CRYPTO_BYTES, sm, body_len, pk) != 0)
+  {
+    return -1;
+  }
+
+  // memmove tolerates an output buffer that overlaps sm
+  memmove(m, sm, body_len);
+  *mlen = body_len;
+  return 0;
+}
diff --git a/crypto_sign/aimer192f/m4stack/sign.h b/crypto_sign/aimer192f/m4stack/sign.h
new file mode 100644
index 00000000..2e2ab61d
--- /dev/null
+++ b/crypto_sign/aimer192f/m4stack/sign.h
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef SIGN_H
+#define SIGN_H
+
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+typedef struct tape_t // filled as raw bytes by commit_and_expand_tape
+{
+  GF pt_share;          // share of the secret input pt
+  GF t_shares[AIMER_L]; // shares of the input S-box outputs
+  GF a_share;           // starting value of the party's alpha share
+  GF c_share;           // starting value of the party's v share
+} tape_t;
+
+typedef struct mult_chk_t // per-party working set of aim2_mpc
+{
+  GF x_shares[AIMER_L + 1]; // [0..L-1]: S-box output shares, [L]: linear layer result
+  GF z_shares[AIMER_L + 1]; // written by aim2_mpc, one entry per x_shares entry
+} mult_chk_t;
+
+typedef struct proof_t // per-repetition part of the signature
+{
+  uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE]; // sibling seeds, [0] = leaf level
+  uint8_t missing_commitment[AIMER_COMMIT_SIZE];    // commitment of the hidden party
+  uint8_t delta_pt_bytes[AIM2_NUM_BYTES_FIELD];     // delta_pt..delta_c are hashed as one
+  uint8_t delta_ts_bytes[AIMER_L][AIM2_NUM_BYTES_FIELD]; // contiguous run in sign.c:
+  uint8_t delta_c_bytes[AIM2_NUM_BYTES_FIELD];      // keep these three adjacent, in order
+  uint8_t missing_alpha_share_bytes[AIM2_NUM_BYTES_FIELD]; // hidden party's alpha share
+} proof_t;
+
+typedef struct signature_t // sig buffers are cast to this type directly in sign.c
+{
+  uint8_t salt[AIMER_SALT_SIZE];
+  uint8_t h_1[AIMER_COMMIT_SIZE]; // written by run_phase_1
+  uint8_t h_2[AIMER_COMMIT_SIZE]; // written by run_phase_1_to_3
+  proof_t proofs[AIMER_T];        // one proof per repetition
+} signature_t;
+
+#define aim2_mpc AIMER_NAMESPACE(aim2_mpc) // AIM2 evaluation on one party's shares
+void aim2_mpc(mult_chk_t *mult_chk,
+              const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+              const GF ct_GF);
+
+#define commit_and_expand_tape AIMER_NAMESPACE(commit_and_expand_tape) // seed -> commit, tape
+void commit_and_expand_tape(tape_t *tape, uint8_t *commit,
+                            const hash_instance *ctx_precom,
+                            const uint8_t seed[AIMER_SEED_SIZE],
+                            size_t rep, size_t party);
+
+#define run_phase_1 AIMER_NAMESPACE(run_phase_1) // fills the proof deltas and h_1
+void run_phase_1(signature_t *sign,
+                 const GF pt_GF, const GF sbox_outputs[AIMER_L],
+                 const uint8_t mu[AIMER_COMMIT_SIZE],
+                 const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE]);
+
+#define run_phase_1_to_3 AIMER_NAMESPACE(run_phase_1_to_3) // reads h_1, fills h_2
+void run_phase_1_to_3(signature_t *sign,
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE],
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b, const GF ct_GF);
+
+#define run_phase_1_to_5 AIMER_NAMESPACE(run_phase_1_to_5) // reads h_1 and h_2, fills the openings
+void run_phase_1_to_5(signature_t *sign,
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE],
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b);
+
+#endif // SIGN_H
diff --git a/crypto_sign/aimer192f/m4stack/tree.c b/crypto_sign/aimer192f/m4stack/tree.c
new file mode 100644
index 00000000..0585e7fa
--- /dev/null
+++ b/crypto_sign/aimer192f/m4stack/tree.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: MIT
+
+#include "tree.h"
+#include "hash.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+//  Example of tree for [N = 8]
+//  x
+//  d = 0: 1
+//  d = 1: 2         3
+//  d = 2: 4   5     6     7
+//  d = 3: 8 9 10 11 12 13 14 15
+
+// Derive the whole seed tree from root_seed. Tree node k (root = 1) is kept
+// in nodes[k - 1], so the leaves are nodes[AIMER_N - 1 .. 2 * AIMER_N - 2].
+void expand_tree(uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                 const uint8_t salt[AIMER_SALT_SIZE],
+                 size_t rep_index,
+                 const uint8_t root_seed[AIMER_SEED_SIZE])
+{
+  hash_instance base, child;
+  hash_init_prefix(&base, HASH_PREFIX_4);
+  hash_update(&base, salt, AIMER_SALT_SIZE);
+
+  // hash input: repetition index || parent node index || parent seed
+  uint8_t input[2 + AIMER_SEED_SIZE] = {(uint8_t)rep_index};
+  memcpy(nodes[0], root_seed, AIMER_SEED_SIZE);
+  for (size_t parent = 1; parent < AIMER_N; parent++)
+  {
+    input[1] = (uint8_t)parent;
+    memcpy(&input[2], nodes[parent - 1], AIMER_SEED_SIZE);
+    hash_ctx_clone(&child, &base);
+    hash_update(&child, input, sizeof(input));
+    hash_final(&child);
+    // one squeeze fills both children, stored back to back
+    hash_squeeze(&child, nodes[2 * parent - 1], 2 * AIMER_SEED_SIZE);
+    hash_ctx_release(&child);
+  }
+  hash_ctx_release(&base);
+}
+
+// Copy the siblings along the leaf-to-root path of party cover_index into
+// reveal_path, leaf level first; the path nodes themselves are not copied.
+void reveal_all_but(uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                    const uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                    size_t cover_index)
+{
+  // walk upwards from the hidden leaf (tree index cover_index + AIMER_N)
+  size_t depth = 0;
+  for (size_t node = cover_index + AIMER_N; depth < AIMER_LOGN; node /= 2)
+  {
+    const size_t sibling = node ^ 1;
+    memcpy(reveal_path[depth++], nodes[sibling - 1], AIMER_SEED_SIZE);
+  }
+}
+
+// Rebuild the seed tree from the revealed sibling path. The root is omitted,
+// so tree node k lives in nodes[k - 2]; path nodes get no revealed seed.
+void reconstruct_tree(uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE],
+                      const uint8_t salt[AIMER_SALT_SIZE],
+                      const uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                      size_t rep_index,
+                      size_t cover_index)
+{
+  const size_t hidden_leaf = cover_index + AIMER_N;
+  uint8_t buffer[AIMER_SEED_SIZE + 2] = {(uint8_t)rep_index};
+  hash_instance ctx, ctx_;
+  hash_init_prefix(&ctx_, HASH_PREFIX_4);
+  hash_update(&ctx_, salt, AIMER_SALT_SIZE);
+
+  for (size_t depth = 1; depth < AIMER_LOGN; depth++)
+  {
+    // place the revealed sibling of the hidden path at this depth
+    size_t sib = (hidden_leaf >> (AIMER_LOGN - depth)) ^ 1;
+    memcpy(nodes[sib - 2], reveal_path[AIMER_LOGN - depth], AIMER_SEED_SIZE);
+
+    // expand every node of this depth into its two children
+    for (size_t index = (1U << depth); index < (2U << depth); index++)
+    {
+      buffer[1] = (uint8_t)(index);
+      memcpy(buffer + 2, nodes[index - 2], AIMER_SEED_SIZE);
+      hash_ctx_clone(&ctx, &ctx_);
+      hash_update(&ctx, buffer, AIMER_SEED_SIZE + 2);
+      hash_final(&ctx);
+      hash_squeeze(&ctx, nodes[2 * index - 2], AIMER_SEED_SIZE << 1);
+      hash_ctx_release(&ctx);
+    }
+  }
+  hash_ctx_release(&ctx_);
+  // leaf level: the sibling of the hidden leaf comes straight from the path
+  memcpy(nodes[(hidden_leaf ^ 1) - 2], reveal_path[0], AIMER_SEED_SIZE);
+}
diff --git a/crypto_sign/aimer192f/m4stack/tree.h b/crypto_sign/aimer192f/m4stack/tree.h
new file mode 100644
index 00000000..f1354f19
--- /dev/null
+++ b/crypto_sign/aimer192f/m4stack/tree.h
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef TREE_H
+#define TREE_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define expand_tree AIMER_NAMESPACE(expand_tree) // nodes[k - 1] holds tree node k (root = 1)
+void expand_tree(uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                 const uint8_t salt[AIMER_SALT_SIZE],
+                 size_t rep_index,
+                 const uint8_t root_seed[AIMER_SEED_SIZE]);
+
+#define reveal_all_but AIMER_NAMESPACE(reveal_all_but) // same node indexing as expand_tree
+void reveal_all_but(uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                    const uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                    size_t cover_index);
+
+#define reconstruct_tree AIMER_NAMESPACE(reconstruct_tree) // root omitted: nodes[k - 2] holds node k
+void reconstruct_tree(uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE],
+                      const uint8_t salt[AIMER_SALT_SIZE],
+                      const uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                      size_t rep_index,
+                      size_t cover_index);
+
+#endif // TREE_H
diff --git a/crypto_sign/aimer192s/m4speed/__asm_field.S b/crypto_sign/aimer192s/m4speed/__asm_field.S
new file mode 100644
index 00000000..26575c28
--- /dev/null
+++ b/crypto_sign/aimer192s/m4speed/__asm_field.S
@@ -0,0 +1,617 @@
+#include "params.h"
+
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+.text
+
+.align  2
+.global AIMER_NAMESPACE(GF_to_bytes)
+.global AIMER_NAMESPACE(GF_from_bytes)
+.global AIMER_NAMESPACE(GF_copy)
+.type   AIMER_NAMESPACE(GF_to_bytes), %function
+.type   AIMER_NAMESPACE(GF_from_bytes), %function
+.type   AIMER_NAMESPACE(GF_copy), %function
+AIMER_NAMESPACE(GF_to_bytes):    // three symbols, one body:
+AIMER_NAMESPACE(GF_from_bytes):  // copy six 32-bit words (24 bytes)
+AIMER_NAMESPACE(GF_copy):        // from [R1] to [R0], words unchanged
+  out_p       .req R0
+  in_p        .req R1
+  // Scratch is R2/R3 only, so nothing is saved on the stack.
+  .equ width, 4
+  // NOTE(review): single LDR/STR are presumably used because the byte-array side may be unaligned (LDM/STM require word alignment) -- confirm
+  ldr.w R2, [in_p, #0 * width]
+  ldr.w R3, [in_p, #1 * width]
+  str.w R2, [out_p, #0 * width]
+  str.w R3, [out_p, #1 * width]
+
+  ldr.w R2, [in_p, #2 * width]
+  ldr.w R3, [in_p, #3 * width]
+  str.w R2, [out_p, #2 * width]
+  str.w R3, [out_p, #3 * width]
+
+  ldr.w R2, [in_p, #4 * width]
+  ldr.w R3, [in_p, #5 * width]
+  str.w R2, [out_p, #4 * width]
+  str.w R3, [out_p, #5 * width]
+
+  bx    lr
+
+  .unreq in_p
+  .unreq out_p
+
+.align  2
+.global AIMER_NAMESPACE(GF_set0)
+.type   AIMER_NAMESPACE(GF_set0), %function
+AIMER_NAMESPACE(GF_set0):  // stores zero to the six 32-bit words (24 bytes) at [R0]
+  out_p       .req R0
+
+  .equ width, 4
+
+  mov.w R2, #0  // R2 is the only register clobbered
+  str.w R2, [out_p, #0 * width]
+  str.w R2, [out_p, #1 * width]
+  str.w R2, [out_p, #2 * width]
+  str.w R2, [out_p, #3 * width]
+  str.w R2, [out_p, #4 * width]
+  str.w R2, [out_p, #5 * width]
+
+  bx    lr
+
+  .unreq out_p
+
+.align  2
+.global AIMER_NAMESPACE(GF_add)
+.type   AIMER_NAMESPACE(GF_add), %function
+AIMER_NAMESPACE(GF_add):  // [R0] = [R1] XOR [R2] over six 32-bit words (24 bytes)
+  out_p       .req R0
+  in0_p       .req R1
+  in1_p       .req R2
+  // Scratch is R3/R12 only. Each word is loaded before it is stored, so out_p may equal either input.
+  .equ width, 4
+
+  ldr.w R3,  [in0_p, #0 * width]
+  ldr.w R12, [in1_p, #0 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #0 * width]
+
+  ldr.w R3,  [in0_p, #1 * width]
+  ldr.w R12, [in1_p, #1 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #1 * width]
+
+  ldr.w R3,  [in0_p, #2 * width]
+  ldr.w R12, [in1_p, #2 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #2 * width]
+
+  ldr.w R3,  [in0_p, #3 * width]
+  ldr.w R12, [in1_p, #3 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #3 * width]
+
+  ldr.w R3,  [in0_p, #4 * width]
+  ldr.w R12, [in1_p, #4 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #4 * width]
+
+  ldr.w R3,  [in0_p, #5 * width]
+  ldr.w R12, [in1_p, #5 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #5 * width]
+
+  bx    lr
+
+  .unreq out_p
+  .unreq in0_p
+  .unreq in1_p
+
+.macro or_shift_and in_a, con_a, off_a  // in_a = (in_a | (in_a << off_a)) & con_a
+  orr.w \in_a, \in_a, \in_a, lsl #\off_a  // overlay a copy shifted left by off_a bits
+  and.w \in_a, \in_a, \con_a  // keep only the bit positions selected by the constant
+.endm
+
+.align  2
+.global AIMER_NAMESPACE(GF_sqr_s)
+.type   AIMER_NAMESPACE(GF_sqr_s), %function
+AIMER_NAMESPACE(GF_sqr_s):  // [R0] = square of the 192-bit element at [R1]; six 32-bit words in, six out
+  out_p       .req R0
+  in_p        .req R1
+  // in0..in7 hold 32-bit halves (even = low, odd = high) of the 64-bit words temp[] named in the comments below
+  in0         .req R2
+  in1         .req R3
+  in2         .req R4
+  in3         .req R5
+  in4         .req R6
+  in5         .req R7
+  in6         .req R8
+  in7         .req R9
+  // C3..C0 are the masks of the four bit-spreading rounds (shifts 8, 4, 2, 1)
+  .equ C0,    1431655765  // 0x55555555
+  .equ C1,    858993459   // 0x33333333
+  .equ C2,    252645135   // 0x0F0F0F0F
+  .equ C3,    16711935    // 0x00FF00FF
+  .equ C4,    4294967295  // 0xFFFFFFFF
+  .equ width, 4
+
+  push.w {R4-R10, lr}
+  // upper four input words first: once spread they become temp[2] (in0,in1) .. temp[5] (in6,in7)
+  ldr.w in0, [in_p, #2 * width]  // a[1]
+  ldr.w in2, [in_p, #3 * width]
+  ldr.w in4, [in_p, #4 * width]  // a[2]
+  ldr.w in6, [in_p, #5 * width]
+
+  lsr.w in1, in0, #16
+  lsr.w in3, in2, #16
+  lsr.w in5, in4, #16
+  lsr.w in7, in6, #16
+
+  mov.w R10, C4
+  // R10, lsr #16 = 0x0000FFFF: keep the low 16 bits in the even registers
+  and.w in0, in0, R10, lsr #16
+  and.w in2, in2, R10, lsr #16
+  and.w in4, in4, R10, lsr #16
+  and.w in6, in6, R10, lsr #16
+  // spread each 16-bit half over 32 bits: input bit i moves to bit 2i, odd bits end up zero
+  or_shift_and in0, C3, 8
+  or_shift_and in1, C3, 8
+  or_shift_and in2, C3, 8
+  or_shift_and in3, C3, 8
+  or_shift_and in4, C3, 8
+  or_shift_and in5, C3, 8
+  or_shift_and in6, C3, 8
+  or_shift_and in7, C3, 8
+
+  or_shift_and in0, C2, 4
+  or_shift_and in1, C2, 4
+  or_shift_and in2, C2, 4
+  or_shift_and in3, C2, 4
+  or_shift_and in4, C2, 4
+  or_shift_and in5, C2, 4
+  or_shift_and in6, C2, 4
+  or_shift_and in7, C2, 4
+
+  or_shift_and in0, C1, 2
+  or_shift_and in1, C1, 2
+  or_shift_and in2, C1, 2
+  or_shift_and in3, C1, 2
+  or_shift_and in4, C1, 2
+  or_shift_and in5, C1, 2
+  or_shift_and in6, C1, 2
+  or_shift_and in7, C1, 2
+
+  or_shift_and in0, C0, 1
+  or_shift_and in1, C0, 1
+  or_shift_and in2, C0, 1
+  or_shift_and in3, C0, 1
+  or_shift_and in4, C0, 1
+  or_shift_and in5, C0, 1
+  or_shift_and in6, C0, 1
+  or_shift_and in7, C0, 1
+  // reduction: every word above bit 191 is folded back with shifts 0, 1, 2 and 7
+  // t = temp[3] ^ ((temp[5] >> 57) ^ (temp[5] >> 62) ^ (temp[5] >> 63));
+  eor.w in2, in2, in7, lsr #25
+  eor.w in2, in2, in7, lsr #30
+  eor.w in2, in2, in7, lsr #31
+
+  // c[2] = temp[2] ^ temp[5];
+  eor.w in0, in0, in6
+  eor.w in1, in1, in7
+
+  // c[2] ^= (temp[5] << 7) | (temp[4] >> 57);
+  // c[2] ^= (temp[5] << 2) | (temp[4] >> 62);
+  // c[2] ^= (temp[5] << 1) | (temp[4] >> 63);
+  eor.w in0, in0, in5, lsr #25
+  eor.w in0, in0, in5, lsr #30
+  eor.w in0, in0, in5, lsr #31
+
+  eor.w in0, in0, in6, lsl #7
+  eor.w in0, in0, in6, lsl #2
+  eor.w in0, in0, in6, lsl #1
+
+  eor.w in1, in1, in6, lsr #25
+  eor.w in1, in1, in6, lsr #30
+  eor.w in1, in1, in6, lsr #31
+
+  eor.w in1, in1, in7, lsl #7
+  eor.w in1, in1, in7, lsl #2
+  eor.w in1, in1, in7, lsl #1
+
+  str.w in0, [out_p, #4 * width]
+  str.w in1, [out_p, #5 * width]
+  // c[2] is stored; in0/in1 and in6/in7 are reused for temp[0] and temp[1]
+  ldr.w in0, [in_p, #0 * width]  // a[0]
+  ldr.w in6, [in_p, #1 * width]
+
+  lsr.w in1, in0, #16
+  lsr.w in7, in6, #16
+
+  and.w in0, in0, R10, lsr #16
+  and.w in6, in6, R10, lsr #16
+
+  or_shift_and in0, C3, 8
+  or_shift_and in1, C3, 8
+  or_shift_and in6, C3, 8
+  or_shift_and in7, C3, 8
+
+  or_shift_and in0, C2, 4
+  or_shift_and in1, C2, 4
+  or_shift_and in6, C2, 4
+  or_shift_and in7, C2, 4
+
+  or_shift_and in0, C1, 2
+  or_shift_and in1, C1, 2
+  or_shift_and in6, C1, 2
+  or_shift_and in7, C1, 2
+
+  or_shift_and in0, C0, 1
+  or_shift_and in1, C0, 1
+  or_shift_and in6, C0, 1
+  or_shift_and in7, C0, 1
+
+  // c[1] = temp[1] ^ temp[4];
+  eor.w in6, in6, in4
+  eor.w in7, in7, in5
+
+  // c[1] ^= (temp[4] << 7) | (t >> 57);
+  // c[1] ^= (temp[4] << 2) | (t >> 62);
+  // c[1] ^= (temp[4] << 1) | (t >> 63);
+  eor.w in6, in6, in3, lsr #25
+  eor.w in6, in6, in3, lsr #30
+  eor.w in6, in6, in3, lsr #31
+
+  eor.w in6, in6, in4, lsl #7
+  eor.w in6, in6, in4, lsl #2
+  eor.w in6, in6, in4, lsl #1
+
+  eor.w in7, in7, in4, lsr #25
+  eor.w in7, in7, in4, lsr #30
+  eor.w in7, in7, in4, lsr #31
+
+  eor.w in7, in7, in5, lsl #7
+  eor.w in7, in7, in5, lsl #2
+  eor.w in7, in7, in5, lsl #1
+
+  str.w in6, [out_p, #2 * width]
+  str.w in7, [out_p, #3 * width]
+
+  // c[0] = temp[0] ^ t;
+  eor.w in0, in0, in2
+  eor.w in1, in1, in3
+
+  // c[0] ^= (t << 7);
+  // c[0] ^= (t << 2);
+  // c[0] ^= (t << 1);
+  eor.w in0, in0, in2, lsl #7
+  eor.w in0, in0, in2, lsl #2
+  eor.w in0, in0, in2, lsl #1
+
+  eor.w in1, in1, in2, lsr #25
+  eor.w in1, in1, in2, lsr #30
+  eor.w in1, in1, in2, lsr #31
+
+  eor.w in1, in1, in3, lsl #7
+  eor.w in1, in1, in3, lsl #2
+  eor.w in1, in1, in3, lsl #1
+
+  str.w in0, [out_p, #0 * width]
+  str.w in1, [out_p, #1 * width]
+  // every input word was loaded before the same-index output word was stored, so out_p == in_p works
+  pop.w {R4-R10, pc}
+
+  // NOTE(review): only in_p/out_p are released; in0..in7 stay bound to R2..R9 for the rest of the file
+  .unreq in_p
+  .unreq out_p
+
+.macro lut_access0_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  and \sp1, \mask, \b0_1, lsr #\offset  // byte offset of the table entry picked by bits of the high word of b
+  and \sp0, \mask, \b0_0, lsr #\offset  // same bit window taken from the low word of b
+  add \sp1, \sp1, sp  // the table of 8-byte entries starts at sp
+  add \sp0, \sp0, sp
+  // first window: initialise the accumulator as entry(low) ^ (entry(high) << 32); in0_3 is not written here
+  ldr \out1_0, [\sp1, #0]
+  ldr \in0_2, [\sp1, #4]
+
+  ldr \in0_0, [\sp0, #0]
+  ldr \in0_1, [\sp0, #4]
+  eor \in0_1, \in0_1, \out1_0
+.endm
+
+.macro lut_access0_1_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  and \sp1, \mask, \b0_1, lsr #\offset  // table byte offsets for this window (high word, then low word of b)
+  and \sp0, \mask, \b0_0, lsr #\offset
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+  // second window: in0_3 has no previous value, so it is created from the bits shifted out of in0_2
+  lsr \in0_3, \in0_2, #28
+  ldr \out1_0, [\sp1, #0]
+  ldr \out1_1, [\sp1, #4]
+
+  lsl \in0_2, \in0_2, #4
+  ldr \out0_0, [\sp0, #0]
+  ldr \out0_1, [\sp0, #4]
+  // remaining steps of the 4-bit left shift of the accumulator
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  lsl \in0_1, \in0_1, #4
+  orr \in0_1, \in0_1, \in0_0, lsr #28
+  lsl \in0_0, \in0_0, #4
+  // accumulator ^= entry(low) ^ (entry(high) << 32)
+  eor \in0_1, \in0_1, \out1_0
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro lut_access0_1 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  and \sp1, \mask, \b0_1, lsr #\offset  // table byte offsets for this window (high word, then low word of b)
+  and \sp0, \mask, \b0_0, lsr #\offset
+  lsl \in0_3, \in0_3, #4  // general window: the whole 128-bit accumulator in0_3:in0_0 moves left by 4
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+  // the table loads are interleaved with the shift instructions
+  orr \in0_3, \in0_3, \in0_2, lsr #28
+  ldr \out1_0, [\sp1, #0]
+  ldr \out1_1, [\sp1, #4]
+
+  lsl \in0_2, \in0_2, #4
+  ldr \out0_0, [\sp0, #0]
+  ldr \out0_1, [\sp0, #4]
+
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  lsl \in0_1, \in0_1, #4
+  orr \in0_1, \in0_1, \in0_0, lsr #28
+  lsl \in0_0, \in0_0, #4
+  // accumulator ^= entry(low) ^ (entry(high) << 32)
+  eor \in0_1, \in0_1, \out1_0
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro lut_access1 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  lsl \in0_3, \in0_3, #4  // same shift-and-accumulate step as lut_access0_1 ...
+  and \sp1, \mask, \b0_1, lsl #\offset  // ... but the window is moved LEFT into the mask position
+  orr \in0_3, \in0_3, \in0_2, lsr #28
+  and \sp0, \mask, \b0_0, lsl #\offset
+  // used for the lowest bits of each word of b, which sit below the mask and cannot be reached with lsr
+  lsl \in0_2, \in0_2, #4
+  add \sp1, \sp1, sp
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  add \sp0, \sp0, sp
+  // both entry words are fetched with one ldmia each; the table on the stack is word aligned
+  lsl   \in0_1, \in0_1, #4
+  ldmia \sp1, {\out1_0-\out1_1}
+  orr   \in0_1, \in0_1, \in0_0, lsr #28
+  ldmia \sp0, {\out0_0-\out0_1}
+  lsl   \in0_0, \in0_0, #4
+  // accumulator ^= entry(low) ^ (entry(high) << 32)
+  eor \in0_1, \in0_1, \out1_0
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, offset
+  and \mask, \one, \r0_ret, lsr #\offset  // isolate bit number offset of r0_ret (requires one == 1)
+  sub \mask, \zero, \mask  // 0 - bit: all ones when the bit is set, else zero (requires zero == 0)
+  and \mask0_1, \b0_0, \mask  // branch-free select of b (low word) ...
+  and \mask0_2, \b0_1, \mask  // ... and b (high word)
+  // build the selected b shifted left by offset as three words that line up with in0_1..in0_3
+  lsl \mask0_0, \mask0_1, #\offset
+  lsr \mask0_1, \mask0_1, #32 - \offset
+  orr \mask0_1, \mask0_1, \mask0_2, lsl #\offset
+  lsr \mask0_2, \mask0_2, #32 - \offset
+  // in0_0 is untouched: the term starts one word (32 bits) up
+  eor \in0_1, \in0_1, \mask0_0
+  eor \in0_2, \in0_2, \mask0_1
+  eor \in0_3, \in0_3, \mask0_2
+.endm
+
+.macro last_mask1 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, offset
+  sub \mask, \zero, \r0_ret, lsr #\offset  // no AND with one: yields a clean 0 / all-ones mask only for offset 31, as at the single call site
+  and \mask0_1, \b0_0, \mask
+  and \mask0_2, \b0_1, \mask
+  // from here on identical to last_mask0
+  lsl \mask0_0, \mask0_1, #\offset
+  lsr \mask0_1, \mask0_1, #32 - \offset
+  orr \mask0_1, \mask0_1, \mask0_2, lsl #\offset
+  lsr \mask0_2, \mask0_2, #32 - \offset
+
+  eor \in0_1, \in0_1, \mask0_0
+  eor \in0_2, \in0_2, \mask0_1
+  eor \in0_3, \in0_3, \mask0_2
+.endm
+
+.global AIMER_NAMESPACE(poly64_mul)
+.type   AIMER_NAMESPACE(poly64_mul), %function
+AIMER_NAMESPACE(poly64_mul):  // carry-less 64 x 64 -> 128-bit product of a = [R0] and b = [R1]; high 64 bits go to [R2], low 64 bits to [R3]
+  t0_0    .req R0
+  t0_1    .req R1
+  t1_0    .req R2
+  t1_1    .req R3
+  t2_0    .req R4
+  t2_1    .req R5
+  t3_0    .req R6
+  t3_1    .req R7
+  t4_0    .req R8
+  t4_1    .req R9
+  t5_0    .req R10
+  t5_1    .req R11
+  // the names below reuse the same registers in later phases of the routine
+  r1_copy .req R12
+  t_base  .req R14  // write cursor while the table is being filled
+
+  sp0     .req R12
+  sp1     .req R14
+
+  b0_0    .req R0
+  b0_1    .req R1
+  // 128-bit accumulator, least significant word first
+  in0_0   .req R2
+  in0_1   .req R3
+  in0_2   .req R4
+  in0_3   .req R5
+
+  out0_0  .req R6
+  out0_1  .req R7
+  out1_0  .req R8
+  out1_1  .req R9
+
+  mask    .req R10
+
+  zero    .req R6
+  one     .req R7
+
+  r0_ret  .req R8
+
+  mask0_0 .req R9
+  mask0_1 .req R11
+  mask0_2 .req R12
+
+  push  {R4-R11, lr}
+  push  {R2-R3}  // keep both output pointers for the final stores
+
+  ldr   t1_0, [R0, #0]
+  ldr   t1_1, [R0, #4]
+  push  {t1_1}  // unmasked high word of a, read back as r0_ret
+
+  sub   sp, #128  // allocating space in the stack
+                  // 8 bytes * 16 = 128 bytes
+  mov   t_base, sp
+  mov   r1_copy, R1
+  // table entry k (8 bytes each) = k times a, carry-less, for k = 0..15
+  mov   t0_0, #0
+  mov   t0_1, #0
+
+  and   t1_1, #0x1FFFFFFF  // drop the top 3 bits of a so that 8 * a still fits in 64 bits; they are handled by last_mask*
+
+  lsl   t2_1, t1_1, #1
+  orr   t2_1, t2_1, t1_0, lsr #31
+  lsl   t2_0, t1_0, #1
+
+  eor   t3_0, t1_0, t2_0
+  eor   t3_1, t1_1, t2_1
+
+  lsl   t4_1, t2_1, #1
+  orr   t4_1, t4_1, t2_0, lsr #31
+  lsl   t4_0, t2_0, #1
+
+  eor   t5_0, t1_0, t4_0
+  eor   t5_1, t1_1, t4_1
+
+  stmia t_base!, {t0_0-t5_1}  // entries 0..5: 12 words = 48 bytes
+
+  eor   t0_0, t2_0, t4_0
+  eor   t0_1, t2_1, t4_1
+
+  lsl   t2_1, t4_1, #1
+  orr   t2_1, t2_1, t4_0, lsr #31
+  lsl   t2_0, t4_0, #1
+
+  eor   t5_0, t2_0, t3_0
+  eor   t5_1, t2_1, t3_1
+
+  eor   t3_0, t2_0, t1_0
+  eor   t3_1, t2_1, t1_1
+
+  eor   t4_0, t4_0, t0_0
+  eor   t4_1, t4_1, t0_1
+
+  eor   t4_0, t4_0, t2_0
+  eor   t4_1, t4_1, t2_1
+
+  eor   t1_0, t1_0, t0_0
+  eor   t1_1, t1_1, t0_1
+
+  stmia t_base!, {t0_0-t5_1}  // entries 6..11: 12 words = 48 bytes
+
+  eor   t1_0, t5_0, t0_0
+  eor   t1_1, t5_1, t0_1
+
+  eor   t2_0, t2_0, t0_0
+  eor   t2_1, t2_1, t0_1
+
+  eor   t3_0, t3_0, t0_0
+  eor   t3_1, t3_1, t0_1
+
+  eor   t0_0, t4_0, t0_0
+  eor   t0_1, t4_1, t0_1
+
+  stmia t_base, {t0_0-t3_1}  // entries 12..15: 8 words = 32 bytes
+
+  ldmia r1_copy, {b0_0-b0_1}
+  mov   mask, #0x00000078  // 4-bit index already multiplied by the 8-byte entry size
+  // walk b in 4-bit windows from the top of each word down, two lookups (low and high word) per step
+  lut_access0_0   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 25
+  lut_access0_1_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 21
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 17
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 13
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 9
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 5
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 1
+  lut_access1     sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 3
+
+  mov   zero, #0
+  mov   one,  #1
+  ldr   r0_ret, [sp, #128]  // the high word of a pushed before the table was built
+  // add back the three bits of a masked off above (bits 29..31 of its high word), without branching
+  last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 29
+  last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 30
+  last_mask1 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 31
+
+  ldr   R0, [sp, #132]  // saved R2
+  ldr   R1, [sp, #136]  // saved R3
+  add   sp, #140  // restoring stack
+
+  str   in0_0, [R1, #0]
+  str   in0_1, [R1, #4]
+
+  str   in0_2, [R0, #0]
+  str   in0_3, [R0, #4]
+
+  pop   {R4-R11, pc}
+
+  // release every register alias defined for this routine
+  .unreq t0_0
+  .unreq t0_1
+  .unreq t1_0
+  .unreq t1_1
+  .unreq t2_0
+  .unreq t2_1
+  .unreq t3_0
+  .unreq t3_1
+  .unreq t4_0
+  .unreq t4_1
+  .unreq t5_0
+  .unreq t5_1
+
+  .unreq r1_copy
+  .unreq t_base
+
+  .unreq sp0
+  .unreq sp1
+
+  .unreq b0_0
+  .unreq b0_1
+
+  .unreq in0_0
+  .unreq in0_1
+  .unreq in0_2
+  .unreq in0_3
+
+  .unreq out0_0
+  .unreq out0_1
+  .unreq out1_0
+  .unreq out1_1
+
+  .unreq mask
+
+  .unreq zero
+  .unreq one
+
+  .unreq r0_ret
+
+  .unreq mask0_0
+  .unreq mask0_1
+  .unreq mask0_2
diff --git a/crypto_sign/aimer192s/m4speed/aim2.c b/crypto_sign/aimer192s/m4speed/aim2.c
new file mode 100644
index 00000000..b5dbbc85
--- /dev/null
+++ b/crypto_sign/aimer192s/m4speed/aim2.c
@@ -0,0 +1,491 @@
+// SPDX-License-Identifier: MIT
+
+#include "aim2.h"
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+// Inverse Mersenne S-box with e1 = 17: out = in ^ d, where
+// d = (2 ^ 17 - 1) ^ (-1) mod (2 ^ 192 - 1)
+//   = 0xad6b56b5ab5ad5ad6ad6b56b5ab5ad5ad6ad6b56b5ab5ad5,
+// consumed below as the chunks ad6b56b5ab5ad5 ad6 ad6b56b5ab5ad5 ad6 ad6b56b5ab5ad5
+void GF_exp_invmer_e_1(GF out, const GF in)
+{
+  size_t i;
+  GF t1 = {0,}, t2 = {0,};
+  GF table_5 = {0,}, table_6 = {0,};
+  GF table_a = {0,}, table_b = {0,}, table_d = {0,};
+  // Each step is "square k times, multiply by a stored power": it appends k exponent bits.
+  // t1 = in ^ 4 (table_d holds in ^ 2 until it is overwritten with in ^ 13)
+  GF_sqr_s(table_d, in);
+  GF_sqr_s(t1, table_d);
+
+  // table_5 = in ^ 5
+  GF_mul_s(table_5, t1, in);
+  // table_6 = in ^ 6
+  GF_mul_s(table_6, table_5, in);
+  // table_a = in ^ 10 = (in ^ 5) ^ 2
+  GF_sqr_s(table_a, table_5);
+  // table_b = in ^ 11
+  GF_mul_s(table_b, table_a, in);
+  // table_d = in ^ 13 = in ^ 11 * in ^ 2
+  GF_mul_s(table_d, table_b, table_d);
+
+  // t1 = in ^ (0xad)
+  GF_sqr_s(t1, table_a);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t2 = in ^ (0xad 6), table_d = in ^ (0xad5)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t2, t1, table_6);
+  GF_mul_s(table_d, t1, table_5);
+
+  // t1 = in ^ (0xad6 b)
+  GF_sqr_s(t1, t2);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xad6b 5)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xad6b5 6)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_6);
+
+  // t1 = in ^ (0xad6b56 b)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xad6b56b 5)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xad6b56b5 a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xad6b56b5a b)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xad6b56b5ab 5)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_5);
+
+  // table_d = in ^ (0xad6b56b5ab5 ad5): the 56-bit chunk that occurs three times in d
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(table_d, t1, table_d);
+
+  // t1 = in ^ (0xad6b56b5ab5ad5 ad6); the first of the 12 squarings reads table_d
+  GF_sqr_s(t1, table_d);
+  for (i = 1; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xad6b56b5ab5ad5ad6 ad6b56b5ab5ad5)
+  for (i = 0; i < 56; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xad6b56b5ab5ad5ad6ad6b56b5ab5ad5 ad6)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // out = in ^ (0xad6b56b5ab5ad5ad6ad6b56b5ab5ad5ad6 ad6b56b5ab5ad5)
+  for (i = 0; i < 56; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(out, t1, table_d);
+}
+
+// Inverse Mersenne S-box with e2 = 47: out = in ^ d, where
+// d = (2 ^ 47 - 1) ^ (-1) mod (2 ^ 192 - 1)
+//   = 0xddddddddddddbbbbbbbbbbbb777777777776eeeeeeeeeeed,
+// consumed below as dddd dddd dddd bb bb bb bb bb bb 77 77 77 77 77 76 ee ee ee ee ee ed
+void GF_exp_invmer_e_2(GF out, const GF in)
+{
+  size_t i;
+  GF t1 = {0,}, t2 = {0,};
+  GF table_6 = {0,}, table_7 = {0,};
+  GF table_b = {0,}, table_d = {0,}, table_e = {0,};
+  // Each step is "square k times, multiply by a stored power": it appends k exponent bits.
+  // t1 = in ^ 3 (table_d holds in ^ 2 until it is overwritten with in ^ 13)
+  GF_sqr_s(table_d, in);
+  GF_mul_s(t1, table_d, in);
+
+  // table_6 = (in ^ 3) ^ 2
+  GF_sqr_s(table_6, t1);
+  // table_7 = in ^ 7
+  GF_mul_s(table_7, table_6, in);
+  // table_b = in ^ 11 = (in ^ 2) ^ 2 * in ^ 7
+  GF_sqr_s(table_b, table_d);
+  GF_mul_s(table_b, table_b, table_7);
+  // table_d = in ^ 13
+  GF_mul_s(table_d, table_6, table_7);
+  // table_e = in ^ 14
+  GF_sqr_s(table_e, table_7);
+
+  // table_b = in ^ (0xbb)
+  GF_sqr_s(t1, table_b);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(table_b, t1, table_b);
+
+  // table_7 = in ^ (0x77), table_6 = in ^ (0x76); both start from t1 = in ^ (0x70)
+  GF_sqr_s(t1, table_7);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(table_6, t1, table_6);
+  GF_mul_s(table_7, t1, table_7);
+
+  // t2 = in ^ (0xdd)
+  GF_sqr_s(t1, table_d);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t2, t1, table_d);
+
+  // table_e = in ^ (0xee), table_d = in ^ (0xed); both start from t1 = in ^ (0xe0)
+  GF_sqr_s(t1, table_e);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(table_d, t1, table_d);
+  GF_mul_s(table_e, t1, table_e);
+
+  // t2 = in ^ (0xdd dd); the first of the 8 squarings reads t2
+  GF_sqr_s(t1, t2);
+  for (i = 1; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t2, t1, t2);
+
+  // t1 = in ^ (0xdddd dddd); the first of the 16 squarings reads t2
+  GF_sqr_s(t1, t2);
+  for (i = 1; i < 16; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xdddddddd dddd)
+  for (i = 0; i < 16; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t1 = in ^ (0xdddddddddddd bb)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xddddddddddddbb bb)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xddddddddddddbbbb bb)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xddddddddddddbbbbbb bb)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbb bb)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbb bb)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb 77)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb77 77)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb7777 77)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb777777 77)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb77777777 77)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb7777777777 76)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_6);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb777777777776 ee)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_e);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb777777777776ee ee)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_e);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb777777777776eeee ee)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_e);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb777777777776eeeeee ee)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_e);
+
+  // t1 = in ^ (0xddddddddddddbbbbbbbbbbbb777777777776eeeeeeee ee)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_e);
+
+  // out = in ^ (0xddddddddddddbbbbbbbbbbbb777777777776eeeeeeeeee ed)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(out, t1, table_d);
+}
+
+// Mersenne S-box with e_star = 5: out = in ^ (2 ^ 5 - 1) = in ^ 31,
+// reached through the exponents 2, 3, 6, 7, 14, 28, 31.
+void GF_exp_mer_e_star(GF out, const GF in)
+{
+  GF acc = {0,}, pow3 = {0,};
+
+  // pow3 = in ^ 3 (the square passes through acc)
+  GF_sqr_s(acc, in);
+  GF_mul_s(pow3, acc, in);
+
+  // acc = in ^ 7
+  GF_sqr_s(acc, pow3);
+  GF_mul_s(acc, acc, in);
+
+  // acc = in ^ 28, then out = in ^ 28 * in ^ 3
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(out, acc, pow3);
+}
+
+void generate_matrices_L_and_U(
+        GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF vector_b,
+        const uint8_t iv[AIM2_IV_SIZE])
+{
+  // Expands iv through the hash into one (L, U) row pair per S-box input and
+  // bit position, then into vector_b. From each squeezed row, L keeps the bits
+  // above position `row`, U the bits below it; bit `row` is set in both.
+  uint8_t buf[AIM2_NUM_BYTES_FIELD];
+  hash_instance ctx;
+  GF temp = {0,};
+
+  hash_init(&ctx);
+  hash_update(&ctx, iv, AIM2_IV_SIZE);
+  hash_final(&ctx);
+
+  for (size_t num = 0; num < AIM2_NUM_INPUT_SBOX; num++)
+  {
+    for (size_t row = 0; row < AIM2_NUM_BITS_FIELD; row++)
+    {
+      hash_squeeze(&ctx, buf, AIM2_NUM_BYTES_FIELD);
+      GF_from_bytes(temp, buf);
+
+      const uint64_t ormask = ((uint64_t)1) << (row % 64);   // the diagonal bit
+      const uint64_t lmask = ((uint64_t)-1) << (row % 64);   // diagonal bit and above
+      const uint64_t umask = ~lmask;                         // strictly below it
+      const size_t inter = row / 64;                         // word holding the diagonal bit
+      for (size_t col_word = 0; col_word < inter; col_word++)
+      {
+        // L is zero, U is full
+        matrix_L[num][row][col_word] = 0;
+        matrix_U[num][row][col_word] = temp[col_word];
+      }
+      matrix_L[num][row][inter] = (temp[inter] & lmask) | ormask;
+      matrix_U[num][row][inter] = (temp[inter] & umask) | ormask;
+      for (size_t col_word = inter + 1; col_word < AIM2_NUM_WORDS_FIELD; col_word++)
+      {
+        // L is full, U is zero
+        matrix_L[num][row][col_word] = temp[col_word];
+        matrix_U[num][row][col_word] = 0;
+      }
+    }
+  }
+  // vector_b takes the same bytes -> GF route as the rows, not a raw cast of its storage
+  hash_squeeze(&ctx, buf, AIM2_NUM_BYTES_FIELD);
+  GF_from_bytes(vector_b, buf);
+  hash_ctx_release(&ctx);
+}
+
+void generate_matrix_LU(GF matrix_A[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+                        GF vector_b,
+                        const uint8_t iv[AIM2_IV_SIZE])
+{
+  // Expands iv into the two factors and vector_b, then fills each row of matrix_A by
+  // handing the matching row of the U factor and the whole L factor to GF_transposed_matmul.
+  GF lower[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF upper[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  generate_matrices_L_and_U(lower, upper, vector_b, iv);
+  for (size_t sbox = 0; sbox < AIM2_NUM_INPUT_SBOX; sbox++)
+  {
+    for (size_t row = 0; row < AIM2_NUM_BITS_FIELD; row++)
+    {
+      GF_transposed_matmul(matrix_A[sbox][row], upper[sbox][row],
+                           (const GF *)lower[sbox]);
+    }
+  }
+}
+
+void aim2(uint8_t ct[AIM2_NUM_BYTES_FIELD],
+          const uint8_t pt[AIM2_NUM_BYTES_FIELD],
+          const uint8_t iv[AIM2_IV_SIZE])
+{
+  // Evaluates AIM2 on the block pt under iv and writes the result to ct.
+  // All intermediates are locals, so the two S-box branches are simply
+  // computed one after the other.
+  GF lower[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF upper[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF affine_b = {0,};
+  GF branch[AIM2_NUM_INPUT_SBOX];
+  GF pt_elem = {0,}, ct_elem = {0,};
+
+  // unpack the plaintext and derive the affine-layer parameters from iv
+  GF_from_bytes(pt_elem, pt);
+  generate_matrices_L_and_U(lower, upper, affine_b, iv);
+
+  // branch 0: constant addition, e1 inverse Mersenne S-box, then U and L
+  GF_add(branch[0], pt_elem, aim2_constants[0]);
+  GF_exp_invmer_e_1(branch[0], branch[0]);
+  GF_transposed_matmul(branch[0], branch[0], (const GF *)upper[0]);
+  GF_transposed_matmul(branch[0], branch[0], (const GF *)lower[0]);
+
+  // branch 1: constant addition, e2 inverse Mersenne S-box, then U and L
+  GF_add(branch[1], pt_elem, aim2_constants[1]);
+  GF_exp_invmer_e_2(branch[1], branch[1]);
+  GF_transposed_matmul(branch[1], branch[1], (const GF *)upper[1]);
+  GF_transposed_matmul(branch[1], branch[1], (const GF *)lower[1]);
+
+  // affine layer output: sum of both branches plus the iv-derived vector
+  GF_add(branch[0], branch[0], branch[1]);
+  GF_add(branch[0], branch[0], affine_b);
+
+  // non-linear component: Mersenne S-box
+  GF_exp_mer_e_star(branch[0], branch[0]);
+
+  // feed-forward of the plaintext, then serialize
+  GF_add(ct_elem, branch[0], pt_elem);
+
+  GF_to_bytes(ct, ct_elem);
+}
+
+void aim2_sbox_outputs(GF sbox_outputs[AIM2_NUM_INPUT_SBOX], const GF pt)
+{
+  // Constant addition for both branches first (sbox_outputs[i] = pt + aim2_constants[i]),
+  // then each branch's inverse Mersenne S-box in place: e1 for index 0, e2 for index 1.
+  GF *const via_e1 = sbox_outputs, *const via_e2 = sbox_outputs + 1;
+  GF_add(*via_e1, pt, aim2_constants[0]);
+  GF_add(*via_e2, pt, aim2_constants[1]);
+  GF_exp_invmer_e_1(*via_e1, *via_e1);
+  GF_exp_invmer_e_2(*via_e2, *via_e2);
+}
diff --git a/crypto_sign/aimer192s/m4speed/aim2.h b/crypto_sign/aimer192s/m4speed/aim2.h
new file mode 100644
index 00000000..b30d4cb9
--- /dev/null
+++ b/crypto_sign/aimer192s/m4speed/aim2.h
@@ -0,0 +1,435 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef AIM2_H
+#define AIM2_H
+
+#include "field.h"
+#include "params.h"
+#include <stdint.h>
+
+static const GF aim2_constants[AIM2_NUM_INPUT_SBOX] = // one constant per S-box input
+{
+  {0xc0ac29b7c97c50dd, 0xbe5466cf34e90c6c, 0x452821e638d01377}, // [0]: three 64-bit words
+  {0xd1310ba698dfb5ac, 0x9216d5d98979fb1b, 0x3f84d5b5b5470917}  // [1]: three 64-bit words
+};
+
+static const GF aim2_e1_power_matrix[AIM2_NUM_BITS_FIELD] =
+{
+  {0x0000000000000001, 0x0000000000000000, 0x0000000000000000},
+  {0x24187d60404121f6, 0x994d0c36800d12c1, 0x00911dd52a0924f1},
+  {0x764f49362db3c478, 0x3bcc2005010a3fa6, 0x402147d6af1a6ff4},
+  {0x1c0878591079091d, 0x9b08ffde1c878f59, 0x8ff70000000021c2},
+  {0xbc23dcb74c10198f, 0xe23fb48357412666, 0x70031ccb7f97795b},
+  {0xffa72d9a27550570, 0xc7dab56f7d5ade7c, 0x78cd4c6283845a4b},
+  {0x655b34aa00430d9a, 0x0150004209eea37c, 0xea5061fe40551141},
+  {0x291b4f90d5814c36, 0xcda4bfb158be9a9b, 0x0d4558cc51c4127c},
+  {0xbe4eb108521087f8, 0x855a49e49b1f9165, 0xfa15129aaa8d8745},
+  {0xef60386cb35ccf9a, 0x5115765ff710f9f9, 0x205677891921e135},
+  {0xbed705ee53ec571d, 0x97ef8c6dd0851236, 0xdfb8887b08ee7b6d},
+  {0x6731ce99be825c5a, 0x78665e68455482e1, 0x8b867f2046054b3e},
+  {0x008fe70500592609, 0x6419eeb2829f34c3, 0x8f95a35e28a915f4},
+  {0x4dd556b654d54730, 0x07e8d738dc4b2c41, 0x6de823272f319c70},
+  {0xc805945260585e93, 0xf3efb93595438399, 0x387f3dab97add8fd},
+  {0x8825784a2bb54db6, 0x8d1d21f68a9fed14, 0xd72c5de2e4375500},
+  {0xd9fbd5d41179e461, 0xbaa9f9428fe27896, 0x49998ea2c43c70ee},
+  {0xba1e061ac9218b6b, 0x93a1c1ea0a23984a, 0x145015f4bca9f514},
+  {0xb2829eadb1319c61, 0xf21008aca9c587af, 0x491dfc66b48bb406},
+  {0xdc192cc5729969e6, 0x19aeec2c6a3facb8, 0xeaf05f73c034e88f},
+  {0x5cc5d35af8af5039, 0x64bfd6b3c8401142, 0x4d083af0e0cecd4f},
+  {0xbcb663181c16e418, 0x9d73d6e08b40b1cf, 0xe6a19d2ea608b779},
+  {0x8f9e2660cdf64ce3, 0x6e790dfe030df1e7, 0xf36bdb76802d4809},
+  {0x24e27b21fdd534eb, 0x9b2abc8327bb58a6, 0xa60607784f3d2a8a},
+  {0x6470b72d839b493a, 0x3de3bd12dbc9236b, 0xab0e0e81db838cab},
+  {0x9fa25765dfa0dc0c, 0xa4866af77f3c1d39, 0xa22985fd177fb75e},
+  {0x1bd0dcf82dfcbaf6, 0x2778cab77faeae14, 0x144c9d871ac906e8},
+  {0xab206aa0299e585a, 0x1f2a1c115b2b24e7, 0xd683dc1df4f0e8e4},
+  {0x3db096486b11d3fc, 0x1d88f50f57fb1318, 0xfbdbd02cf211be3b},
+  {0x83c0ed680040dbeb, 0x01d5321e9c73822a, 0x5c78f9da86ddc253},
+  {0xed72eb240cfd7027, 0xe43295f2eab71065, 0x7dad74ed8a4daf27},
+  {0x593448e3f55865bc, 0x3dbc22ef1d415b62, 0xff617d36a6e04fd7},
+  {0x79fee82d5e5f6225, 0xe933e7ffba3ad69f, 0x11333262fecf9f21},
+  {0xaccf982f89364968, 0x961868954276eacd, 0x3903286905b4951a},
+  {0x15f9d8aff0e99b99, 0x37d7fc3823e38e15, 0x8f3cf305ce9c3317},
+  {0x5f1db90ec8ff178c, 0xef61eb5b69c0cf16, 0xd6d4428841ba2406},
+  {0x6c1d820160b3e589, 0x1655a37c12244e16, 0x1506fe0d42af221a},
+  {0x776220241d5f52f8, 0xbbd873a1a32d77fc, 0x2967ed932de2646d},
+  {0xb360b6c691f374f5, 0xe152921a89b1bb3a, 0x9bb32e5d9871acf2},
+  {0xbbae8029d2f0211d, 0xdfa58ed49cdc469a, 0x298aa1fd3b5fee94},
+  {0x311334572c4f58e2, 0xbd79cb94c83a4a65, 0x097731c2b9f63b2f},
+  {0x202f161d6f618d78, 0xb30f00f7d63d2b1c, 0xba3ba40cb586c147},
+  {0x6f6de8a66957b811, 0x933c64f745e4cb26, 0xe60acca62b3467da},
+  {0x2d52d8e03eadc408, 0x020b8ada8b0cbcfb, 0x97e520c15d31d866},
+  {0x17f79f53394c41f8, 0x8057746b55d4354d, 0x29944f234150b558},
+  {0xd48d6f8d466f4fb7, 0xe62aa6c05e099abf, 0xe72196d812cdf8ff},
+  {0x31086eee778187b7, 0x5f39e6312ab8e7fc, 0xd2794f291ba18edc},
+  {0x8bb7a2d05d52dd01, 0x898fee2a72a51691, 0xaf83c32d4f112cdf},
+  {0xf219effd62769131, 0x006ad7baac86fb08, 0xae1e7bed2f88d4eb},
+  {0x085e604007b4850e, 0x74969c7dc17959a0, 0x70af70f460fd6854},
+  {0x85048e661ea730d2, 0xccb4840c40f6c89e, 0xcb4b3836c98d0776},
+  {0xac7fadd0308807de, 0x93e5399425e1f409, 0x6cebcde031477957},
+  {0x12b09fb9d6bb04ff, 0xa5b0c0475b17d882, 0x9a2d1dc52a42cbfb},
+  {0x2a89655cb1fec3db, 0xb8a64412d508abdf, 0x3998b588ed04feab},
+  {0xa8687e88bff0829c, 0x671e2f2b99afe070, 0x2c08c6f71aa0fa09},
+  {0xe1ce5c820d6be145, 0x7c9485f929d3a113, 0x35a20e96293d131a},
+  {0xba53e0ea72f26b2a, 0x2c4dc2a431baa81b, 0x19674137360734db},
+  {0xde4269315e846bfb, 0x9ed583db0c4ca349, 0x315852fa0660ab68},
+  {0x00ae2ff5c859fcd1, 0x8a404e1ee645e1db, 0x9feadfee4a6a10b9},
+  {0x098454c0f608253b, 0xbf09d16ec3b96f79, 0xe63451db95697baf},
+  {0xa422cc6c5adc283f, 0xb7854c10a36c12d0, 0x9650b028e25b9107},
+  {0x8da1b75903dd2aa8, 0xef8f3a20c77f4c10, 0x11e6a8d176631e6f},
+  {0xe70563f20a26d72a, 0xc706a9184b4269ec, 0x01707c8cd370854b},
+  {0x4c497f712f722710, 0x40d97c17a9f96a81, 0x61ac088c7242b19b},
+  {0x9c1188e5b2c4043a, 0x15c4ce5e386918fd, 0xc2c19cddc8022f62},
+  {0x334dd52624b37647, 0x0ecfeb52b8db6b3a, 0x7cb0cc6a541d915f},
+  {0x0d2da3de5da05ab9, 0x4c8403040eb7a0a8, 0xaa43178d698e1d16},
+  {0x94dd24ac7d70454e, 0x19c81eacd2305f1d, 0xab7995a48e6230a2},
+  {0xc4c2698143f7ebe6, 0x9a9c3bf3c8dbc9bc, 0xef2ce69e69cf09cf},
+  {0xe4d55e8362bd6084, 0x4bd67382e024dfd0, 0x821aed870355bf63},
+  {0xd76139f98e468054, 0x61f1798f51310a13, 0x29046f782268e0dc},
+  {0xd415fc0d991dd093, 0x40c961038916982c, 0x50c6b0ef248e059b},
+  {0x9964bad18a8082f1, 0x666ff6785e18a4dd, 0x8ef30e5710f8282b},
+  {0xb414e2f6230594fe, 0x1bc6a73e670570f9, 0x58556965657d0723},
+  {0x7923079ff8bc88c9, 0x2009ba12607a4104, 0x79486291900310c9},
+  {0xbee4fd3a8ba864ef, 0x5df270cc7b675b45, 0x8fe410ae3a6416b4},
+  {0xed8ea038500ce1aa, 0x23cfffa4b08f7923, 0x24391c9872e1db52},
+  {0xea11414bd1ee6f54, 0x57a5ebe50ea4869b, 0x18f580aebbed4614},
+  {0x4d0c81d6ef843f2f, 0xfd169854c78d4b18, 0x7c36b2afccb84371},
+  {0x0c639f2dc76998e0, 0xdc8e28abec0a421f, 0xfba0c0a5251cd144},
+  {0x766dda3b823a1b74, 0x7f6d206bbd49261d, 0x710de4ad8beaa62e},
+  {0x7abd0b3c484d3910, 0x58abd14b6ee2e49b, 0x78652fe31e4d6d19},
+  {0x4dce3f2a407a25c2, 0x57d6ce10b19b7b99, 0x29cabd29d03528c3},
+  {0xf03c709f8b55bbc2, 0x10f449ee0641e483, 0xf60bd442dfd1a803},
+  {0x51d8a3af211b35bb, 0x2b0c872b328250e9, 0xb67d77e5c9d6d27a},
+  {0x9a731c8f091b2c24, 0x04cf41a716e1e225, 0x9b354a2d84899ec9},
+  {0x0748672bb3e504fb, 0xda648aaa478a326c, 0x0d85a4a55979e5ca},
+  {0xbb732bb90d147586, 0x446c43c25a19dc66, 0x18523f7f708eff36},
+  {0xc549edb1f37b1b15, 0x719aa23612aac7e4, 0x2c771e685e380ec2},
+  {0xe2b6b4207ad6a4b6, 0xf7cc2a116c9527ba, 0xdf6e5d55b2406221},
+  {0xb67a2baac610e044, 0xd425d94d1ebe4051, 0xb7bd1ce70c015395},
+  {0x64ff5ff72d64a1b1, 0xdaca2b8812d90ae6, 0x79a022efcc594eaf},
+  {0xc93cfa6de67bcacd, 0xa179dce6ffd14aec, 0x31528f0f0f3c6817},
+  {0x3ec18f7af7342039, 0xf8d7aa856a662ed9, 0x097b848460df8308},
+  {0xf037fa04d6ff2eb4, 0x1b6ec290719d4d0a, 0xe20e86a3b38d743e},
+  {0x8aea64bccc94d424, 0x2cc260f4f6b65bad, 0x355d31f6d901a260},
+  {0x140e5ae17cc96cb4, 0x620ee0a86b0eda0a, 0xb3fcecb29d358575},
+  {0x5ec85d1f29af07e2, 0xd6c8834f22331d6a, 0xcef37a820396e162},
+  {0xe344085d2eabc755, 0x6c6b136959c8ef7a, 0xbb22e260fa6a677a},
+  {0x7a64bfaa585ae30a, 0xe317efc967bbe220, 0x9a9780dfb02d4b7e},
+  {0x98c71744cd706ceb, 0xd177e9274ab5f551, 0x8353064dea82d011},
+  {0xff04c178eec23d3e, 0x2f460919349f2d47, 0x78fe5c7e69a969f2},
+  {0x40b0e4b5ba731b12, 0xdfdf6fb48e1eacca, 0x418adb73cc0cac43},
+  {0x07e5547b971dc85a, 0x9bb127d9e57350ef, 0xdb9801dd4d74063c},
+  {0x85c01e6cb0183fd9, 0x3ed03735d2254d39, 0x759b3422ff5ef8f1},
+  {0x6d72fa4b71c48c98, 0x3a991af37f04f9e1, 0xb32059432a68082f},
+  {0x3fe283302875d557, 0x8173481a149eee28, 0xeb7766a31793b0be},
+  {0x7acae2d67f591873, 0xb326c3aa2ed4173a, 0x1946cb0d5f62d04d},
+  {0x23bef9ae772d7f05, 0xe0bfc86b1d88610d, 0x74f165bcee4734eb},
+  {0x1d4726ce666680c3, 0x2ce0e6d607113532, 0xffc5de80c34f2df4},
+  {0xc2c05b149cdd1b58, 0x6944e26394cbe4d2, 0x97958f196f8c4c6b},
+  {0x270456c0b2e40aa0, 0x55d5c764d7670e84, 0x717d55b1ebf4aac6},
+  {0x20bc0c1aa67ad034, 0xd4281becc759401d, 0xa34c23a734c590ac},
+  {0x5847ae572b03bf5c, 0xfcac4377aa016371, 0xc37160769e1a862d},
+  {0x7dd17fc6d6f74010, 0x5b327c27eb1048e0, 0x9bdfc698b132189d},
+  {0xab7a432b47cdddcb, 0xa929bbd83ccbd1f9, 0x4d454da5089a34f2},
+  {0xb39461490efcedca, 0x53d60b8883762f77, 0x38149fe44801d6e1},
+  {0x7c94c03395823033, 0xdeeb603aad8b99f6, 0x6135272e4190f922},
+  {0x253f212e339c57b8, 0x4fbc0d5dd968a708, 0xf66bd639e3fb013b},
+  {0x6607bb8d9f1426d8, 0x0b9156b2a938e184, 0x1d6f7d7b46319a77},
+  {0x408e99af5df09232, 0xea04d07e17d71e98, 0x0961e3735a066ceb},
+  {0x0ac48cb89fc1d495, 0xe5ed5004fadbdcb6, 0xb371ec4e641dbdfd},
+  {0x870fba78bc9a5840, 0xa1372a9ae9b35641, 0xd7b9b31aedb9368d},
+  {0x9ec8171425817f91, 0x46d3a766e6d0c217, 0x6d410a83cdfd91e4},
+  {0xbaaf0e5bac52a284, 0x6184eb30dcfa0676, 0x10c8fb0ed6d0bdc9},
+  {0xac8814d3e0fe8707, 0x86d0ff1167e53b8a, 0x10e6600f84bbd4e6},
+  {0x747c0349c6a589dd, 0xf944627e4ef37152, 0x28e5a0f135a5a9bb},
+  {0x382e5c28e3026945, 0xee877613758af703, 0x2d922be5a1610e7f},
+  {0xcadae8499bb4cdb7, 0xd090031f77613a0d, 0xb775a4e76fd94b4f},
+  {0xd09a761e6898eced, 0x5669242c2f84d5da, 0x3d97c6bded80996e},
+  {0x2f95de059a47e03f, 0xfa75be47169ed83f, 0x87d30a6c8dff4a90},
+  {0xf8588b0cb7a0c692, 0xd246208d9f6dc4fb, 0xe36d575d6c2485c0},
+  {0x48c08c7013df5c58, 0x4d37effdea32dc30, 0xff80378ec9caad7d},
+  {0xf9e43db917658f34, 0xb76c0ff79e41f707, 0x8e4935c0b5c08083},
+  {0xb33f84c0bc9ef48d, 0xaab63f4f9f339a4c, 0xae55cf665e81d500},
+  {0x15e234561c4632f1, 0xe084e7a57d035829, 0xbaa1511cb0ed12a0},
+  {0x74f83ba7ec3568de, 0x1d7ecb2f352fdb0b, 0xd76964def60c29f6},
+  {0xd1c2b81f2e13a757, 0xf84d5af929439b5d, 0xc34a2d0878b81e8d},
+  {0x47767837fdba926b, 0x5683aec561752e96, 0x961ca0e7d4439beb},
+  {0x7d73c95d078b625f, 0x6e621c6b3817a9f1, 0xd300b482fda5d226},
+  {0x2cf83b998a66fb35, 0x4f0359eaa9684bfb, 0x2c460d7b4765cbc7},
+  {0xa5c0e6cf67395406, 0xb659d3e82276235d, 0x2c5c851229561369},
+  {0x3168901c3d8747a6, 0x4541eabd5d866402, 0xb768bb5b1a6b8379},
+  {0xb5fa4b6cdc308417, 0x8100841dbbeb59e8, 0x4db5eb632adc8553},
+  {0x2622070061628fa6, 0xc66a1ed278866e50, 0xfad328db6fb4acba},
+  {0x6734cb1adfc5db87, 0xd7f8cfed34d7e713, 0x259e5c52bef9b101},
+  {0xa077ba5e97f9e1c0, 0x21edc3275eed4b8f, 0xc2ddffec584d31bc},
+  {0xe8074b1519eb9faa, 0xa35f39294a8283ed, 0xffbfa9f0fdcce212},
+  {0x49406434389cd06b, 0x5241069e873cd010, 0xde4f448e7e3c47b6},
+  {0x8cb6dafda57a1b04, 0xb80b06fb012be0f6, 0x6c1f61ef626c5ee2},
+  {0x9e596d56ff39dd82, 0xfd823060d81e563c, 0xfe45b0659666e7bf},
+  {0x713e642578abac3b, 0x1e13b3773dddffd6, 0xf7ebe45d0b4ed62e},
+  {0x0fb29b505409913a, 0xbd66ecfa5053f05e, 0x5172fa12bbd062cf},
+  {0x7a8cd2f2af8db5c7, 0xf1c96d88f03f2f0c, 0xfaa8376f49a0abd5},
+  {0xacc980889b25b5e7, 0x2c34843e6a6d9f3d, 0xa6bf67c68037b6ca},
+  {0xaff8095311a13c10, 0x1d4a259b84ca7804, 0x3cbb9d0b61f7ff43},
+  {0x5662cd5d639dfe13, 0x89c27a983290bab8, 0x92a7d11e497af642},
+  {0x4157aad5c3c645ca, 0xf51297f3f77a30f2, 0x83c9dda7804ac4d8},
+  {0x4e84ffef7ca3be0a, 0x14a7ba9c76da7c08, 0x5c28dc6da027d5a0},
+  {0xb0964b96303be4e5, 0x4615a98b7f22a76d, 0xf222f844d2b37df9},
+  {0x802540711d4f5f7d, 0xf6649bae872a32e3, 0xaed6395da047f447},
+  {0x2f0953d8ce80f600, 0xdcf66d5eaf05752f, 0x209193bacdf14ef8},
+  {0xc6a3ef2332ce576d, 0xb9e01c6c4572a31f, 0xde9e30f16310efde},
+  {0xba02b8398971d6e6, 0xd1bab81c9c5221d6, 0x1c9c2d1f1b7f3f2b},
+  {0xedc228019fbdd60a, 0x2753c3a138bcb6d7, 0x786fd2ba67707c2f},
+  {0x448e2cb6c1407cbf, 0xf7b738377f0cfb97, 0x4c9212bdc0657e9c},
+  {0xc76e32691429c2f9, 0x490232f4e8c043ce, 0x217833736b683230},
+  {0xd1499dc75ffd2a9c, 0xd4b5f702de32b776, 0xd6dfbb898f67a374},
+  {0x3b5a28d4cff86b77, 0x806f6c0571138c8b, 0x54628239f0c0f09f},
+  {0xb8d45dd4a900ea0a, 0x2a9169078690c168, 0xb3657df1647fbd66},
+  {0x08189a6674f4c29c, 0x8915f4636dd5d112, 0x654dc7fe07da3107},
+  {0x5250e18c883794b0, 0x8828b68987cd0d9a, 0x300a18a7c772270d},
+  {0x51d33040e3efaa99, 0xd658da2cb0cb97b0, 0x39038890d157c0af},
+  {0x68f5a5cd07a32b53, 0x46b4f5ec1368cf94, 0xf2e0d23f40742f45},
+  {0x782b44a867a3f208, 0xae64fe82046cd425, 0xb78cf45fe171d435},
+  {0xde012b438c92c4d6, 0x4733810dca874273, 0x206a03d102c15302},
+  {0xbea371badf5b9173, 0x8cbfaa817fd4f717, 0x34bea5affcb319d8},
+  {0x1a26c2090378d01a, 0xf3d15fc5c66a7f39, 0x4de762da9a07d052},
+  {0x3486c8a67bccd6cc, 0x0d10351e2b0e18ac, 0x087106b5da2aba90},
+  {0xbd5c398105759654, 0x932e7ce0d2415118, 0xff7a9395dd694851},
+  {0x6f6615de424f584e, 0x6ca415cbf1ff0b9a, 0x509c3763be9bb7ea},
+  {0xe45a5c178e450e25, 0x48cc200c65039546, 0x2c2d872741a6e8d2},
+  {0x10a487ce7b7ba1f7, 0x8da8831a4adaa217, 0xcb608d431e73d316},
+  {0x480667a3a33a0923, 0x3a6fc63a03c45c96, 0xebed952f29ad80c0},
+  {0x8899df2b4edff733, 0x7b68b7ea18849999, 0xcedaa43cfb6f7f7b},
+  {0x356eff5782ed987f, 0xca6aab13ed43b0ce, 0x9dd8a4a5288bc18a},
+  {0x5ffc38d8fbfdcdb6, 0x697d4c0b82ce34af, 0x3509dc6ecc05993b},
+  {0x83905969be9090dd, 0x2125eb5bbd23d5da, 0x64224c3dfae48ffe},
+  {0xf54512d0b6691741, 0x0cbaec28b636b0bc, 0xbb1d6adcda1edefc},
+  {0x89ea6a9a58cddfdb, 0x845d179babdb73f7, 0xcf74a641c412cff5},
+  {0x65c9f3063d3b266e, 0x560354e0ca062952, 0xc6eb9b218ae96514},
+  {0x8e8c7412b3689e52, 0x99b2ec666a8a4e48, 0x5b4477de15147c03},
+};
+
+static const GF aim2_e2_power_matrix[AIM2_NUM_BITS_FIELD] =
+{
+  {0x0000000000000001, 0x0000000000000000, 0x0000000000000000},
+  {0x75575b2a01927c2b, 0xe38f9eab8f685827, 0x782b0bd5192bca87},
+  {0xaebbaa0e79dffe28, 0xb3542e6782b8ce84, 0x8a972b1b32323be2},
+  {0x62cbd1af5c77da14, 0x3bbc6119877bbc1d, 0x8b6d73bce65ed541},
+  {0x4cd2ae2762f272cc, 0xdd4adc5bfc34ae6b, 0xa3f908a96f0fe449},
+  {0x60cfbdb9b6447e24, 0x721af8263082c01c, 0x68cb54e6fc7104af},
+  {0xa92867af3dc3b730, 0xb2608cc06efe34d0, 0xa3445078ace873ca},
+  {0xeedad86c96afe677, 0x52afb525bd42562f, 0x38cf8ddb97dc96e5},
+  {0x0a3b06f10bbc9562, 0x577b7a04e02c557b, 0x8be00f5765b7e908},
+  {0xaa72b3916d207e20, 0x50f0cce86025ffac, 0x09f7f935bbde0a04},
+  {0xac08b4e71f96174a, 0x16babbd24d02b260, 0x48e9d357af5ba717},
+  {0xe122c9c16beaa8be, 0x07043902949cfad6, 0xf78fd47b58608577},
+  {0x5c473c24ac8ca469, 0xb1da898afda7d7aa, 0xcac72d2cf21a9be3},
+  {0x7da00b91479d06e3, 0xc4c76d79d51eb15a, 0xb6c2e5796630269d},
+  {0xb42bb35d07e100ec, 0x19964fd51c07b0af, 0xffb88b0ef80a102f},
+  {0x60e6beb41a673a07, 0x75bec86a6c06b470, 0x61bb7f05fc39be82},
+  {0xfda48d0189cd0c13, 0x649054858d5374c9, 0xb770a8503a32e8a4},
+  {0x4a376d825f3006c4, 0x8896eb44124e97aa, 0x70e626bebfff29b5},
+  {0xe37a2f298ccf89c0, 0x3c3609a866d94979, 0x356c25d15f10d784},
+  {0x458ca204a347e41a, 0x59568e0a4da4e181, 0xab475a7c61d9014c},
+  {0x8c1a39eb79672160, 0xb373d43893fbd9fe, 0x41ba8d6a7097e9c8},
+  {0x7c01434b5f8e1448, 0x662bd055a2512d4f, 0x652c31c38e992dbc},
+  {0x1eae5c36fe075219, 0x82cb682598bcb1ea, 0x60daaa526a3e9947},
+  {0xf7ab17ab10f03bdf, 0x1e124b56f71a4c37, 0x75df607068cfdcff},
+  {0x1ffe54ada576e3c2, 0x384cb4e86120aaa8, 0xc6a4fefc642071df},
+  {0x1ae57be1013b7efb, 0x28d36534e13a369d, 0x75612cd220210f77},
+  {0xa84cfbb045298f2b, 0x733fdf0216082f1f, 0x0054b363e1fcdb09},
+  {0x8745e66041e62570, 0x980a16636c09d9b2, 0x51695306d0539b47},
+  {0xa67319655b027ef6, 0x4fd02799c207267a, 0x01587af4a65b6fd2},
+  {0x8d991698735bcc88, 0xf14dbd2b19f99a78, 0x6947a3b95199d2f4},
+  {0xe2906490948e4aff, 0x16b2ee7035d98706, 0x78f47845853b1ddc},
+  {0x58d9cdc2dd693cd9, 0x7e9c240b1b252019, 0x5cbd3d458a53ca24},
+  {0x24101759ff01d89b, 0xed8fdd27cdb2d47b, 0x11b0fa26e8d8a743},
+  {0x9da3e8ee96db2f59, 0x68285801543b4ac8, 0x618cc8ad53d51b65},
+  {0xf0b448478f472d56, 0x6044053c293513d2, 0xea2fb63a575a34cb},
+  {0x56bd7f9b430ca7de, 0xf883dbb4c18d2e0b, 0x6c8030ef1a38c730},
+  {0xf2e2c1396125acce, 0x882e926d399fcc33, 0x87e914f3049f22ce},
+  {0x7ec0b0443f81915d, 0x4573c52a818a44f3, 0xafc01f5cc8120f6b},
+  {0x924aed58bfbc33f1, 0x7cbf5617448b59a8, 0xef023ef380d782bb},
+  {0xed78ebbcc2543624, 0x4fbdf96f5a481d8a, 0x7dea022c85973850},
+  {0x4cd0fda73b73aaf8, 0xab714c84882fc5a6, 0x31a12db8b87c1a82},
+  {0x4f55b122e52b04af, 0x2b6abc206fcdea22, 0xeecc6a28e10f3cd9},
+  {0x773b7f263618ea81, 0xfedd6644251162ff, 0x20f124b39fffa2ef},
+  {0xc86672d34c7f9c99, 0xa1a9bedd91ba54ab, 0xe3164453cdbc1680},
+  {0x976193445cc61080, 0x4e8af4d9771f7fde, 0x2d6951afbad5a152},
+  {0xac8104ed45afc3e0, 0x2daa407aee0854a8, 0x93bf8a5f6332934b},
+  {0xadacd0145616a90f, 0x18fcdf471f8e446e, 0xb6cb1d657c5aee1f},
+  {0x39f4888a9f625046, 0x714ced776be006aa, 0x301aab64f4c07bac},
+  {0xfed94c87075ec99b, 0x6527495efabe5878, 0xae4ed05b44c346fa},
+  {0xe8089970ab84a9ad, 0xfa8ef420f612f142, 0x3033e1b424799c03},
+  {0x3de830d471a1c303, 0x1d4648963e64b5e8, 0xb7fc69c1308d744f},
+  {0xf917cc81a21178a2, 0xf51c71d20d3dde0f, 0xc755e70d903eca43},
+  {0xf988b4435c7e0659, 0xe8ec12c9411e644d, 0x011cff135dc46fe5},
+  {0x45eb42b4bc82e615, 0xbb1ea1d87fa2dcc8, 0xbbf258cddfcc5a4e},
+  {0x76c177c889777fa3, 0x771de5ab30476eca, 0xe3dd4d0ea4da4f41},
+  {0x62d43190a74afaab, 0x8c72e6cc25a0906d, 0x6560641e35c269c1},
+  {0x4a473706039e3353, 0x9270c15446432105, 0x508bd6dfcce33617},
+  {0x58e979ef836cb200, 0x64a108a5f68530dc, 0xeeb5a210610292b9},
+  {0x3e8a485122657a2d, 0xb7f7272f3423621b, 0x4c0e2f899ffc6f0f},
+  {0xb03f26ebad2101f3, 0x2bf27f00ccb827ad, 0xf2c32d1c9db42e29},
+  {0xcc5f196397e2bb63, 0x9cf1f95bba0e5fb0, 0xcffa723b8add78c2},
+  {0x5198cabd81774aa6, 0x79e142bd7c3981f1, 0xcfb65a6d42815d8a},
+  {0x91dc7af311207622, 0xf294a4f3c38f447e, 0xdfd67624b63f7997},
+  {0xfb2f51ed0b5b44c1, 0x6eeb2b229427682c, 0xfad555a3f1680200},
+  {0xd043eb034f7557ae, 0x89f917e3d7f663f1, 0xd7f51e2f59ce0302},
+  {0xd1738764ddee76f4, 0x28a966bea5ec647e, 0xa322c656d7bc27d1},
+  {0x0cd66c8dd29514f0, 0xb4e37bf2f01130a9, 0x7db6ecdc81a7a57f},
+  {0xc8cb28a44796dc78, 0x88eb0048501b3765, 0x8ff3fbd6d703c26d},
+  {0x2c5d68650ca4b6f5, 0xa8e391ce83198344, 0x8b9f3219506be9d0},
+  {0x911906127a1ba855, 0x30d5215961ac95e7, 0x71827dfac7504342},
+  {0x1ae4c2e2506d0712, 0xb5caffb8afbcda6e, 0x159080539f7f876e},
+  {0x86571676f6228cdb, 0x3a51f0bfed40380f, 0x5dec5a0cee962a54},
+  {0xf5c3339c01460504, 0x5d55382d4e349ecc, 0xcf81cc12df0b2c9e},
+  {0x89a775997037437a, 0xc86002223b57f27f, 0xfe795feb841f08ef},
+  {0x7da8a9b3f9f43fe4, 0x8494d51c6e215f43, 0xb703f044bc338b9c},
+  {0xf73c2c9d450a092f, 0xce0ae97084884a01, 0x9a647f6d5f970839},
+  {0x87c63573f869cdbb, 0x812d2d8e966e6911, 0x973b425ba1c66dfa},
+  {0x7de5a1e78d630e85, 0x765d7d5a4a6e3cb7, 0x28170eef2a846d99},
+  {0x0b0c630c0f59460d, 0x9c8758a9ee8db258, 0xd3589f9c034f75d5},
+  {0xe1a6d8e757067309, 0xd18498099be244d9, 0x9b10a894502fc4e1},
+  {0xfa14fe8a1dd59c3e, 0x6a9a93b0f1ac862a, 0xdbe4d8d065053ef7},
+  {0x5c94965ff0a8e28e, 0xc2a32a0d57f1faa2, 0x24dc5effe1fa9e37},
+  {0x6b404bba72a24d04, 0xbcd23a38f7981241, 0x93d0c9eb1b9a39ef},
+  {0xa53a198b9e74e59c, 0x17cb3bc05f9608d1, 0x21bcc23eb5e75655},
+  {0x05911f7d3220397f, 0x7915054dcb628314, 0x183a2a8400570cef},
+  {0x2a420bf34788186c, 0x8c83a2945ee3027b, 0x606a65c37a8f2fe3},
+  {0xccf4e83131d54a27, 0xc95466a498499126, 0xef9ac8206968b1f7},
+  {0xe457b2ff12256f1e, 0x57fd60a454e5f68d, 0xf3388bb1de5dd1c2},
+  {0x4addb3e322595749, 0x39e02bd59d8ae504, 0x20284c1ae2f1a65c},
+  {0x9fbb5574795cac4d, 0x9fedac975974c8bb, 0xd307ecf05fd4fd22},
+  {0x2505bb81200f8cbb, 0x2ac9d93c45830708, 0x11ec704af2c49861},
+  {0xfa1702dd351d3b22, 0xbe0dfc13d607f962, 0x82c611b8ccd1e9f2},
+  {0xb7ff038d58626bd7, 0x86e990a7d6acad3b, 0x5010d30fbe2d70a9},
+  {0xc42bda459ef1afca, 0x83c5891e3eff20a0, 0xdefbb485c364fd5a},
+  {0xaada4d9f943df0f1, 0x2618e51a8838b5fe, 0x8f45f0ffff45201f},
+  {0xb55e3891213f972c, 0xdb4f56b16dc4e905, 0x30fd462a4cf268fc},
+  {0x64e007b7010e8c80, 0x2d0de3d26a1748c3, 0xa2e01ed12648c113},
+  {0x5128d2b5c4bac674, 0xb80b46283a340508, 0x1c1f01fe24b17a66},
+  {0x4cb8ab976733595f, 0x403aca262ff117b0, 0xce1698b4f9a54376},
+  {0x7781e71d8805fdc4, 0x40c3c2110800e7a0, 0xe72e9e63999cc311},
+  {0xbb3e3e6501e45c00, 0x9e70bd7de6780a3b, 0x549416aa087fe4c5},
+  {0xae1da809d7eed055, 0x06ba5804e029b01c, 0x490555c99e76bd05},
+  {0x67f3afbbfeee6547, 0x1243b190c38432b1, 0xbab2fa8df7bf2943},
+  {0x6d7197464f15c83c, 0x9283ced1147a6a85, 0x96ba1a0e47d9dd96},
+  {0x9cbb90e485218006, 0x8b5ff83a0210b4d9, 0x1086afcf143b95c2},
+  {0xa07d026b378f963b, 0x2debd80b456cd3e3, 0xc7792b9bc7f54c4a},
+  {0x3d0bec8b88ba06b8, 0x0c13cdfdc4d01e9f, 0x6d256d1087b9c95e},
+  {0x9216a33ea47259ff, 0x2bde0cfcb54abe8d, 0xaaef421825f1b47b},
+  {0xa1aabb09b181ae0f, 0xc14d44d54e3620cd, 0xabb20e2a4d637bcb},
+  {0x2544eba1038d1b04, 0xda1f84aa9bc120c2, 0x41fd7f657a18c45d},
+  {0xadaff973f301d8c3, 0x87dae306486ff1a6, 0x60ec280a2570b8ff},
+  {0x624994b2704d4c20, 0x532232f1cf209482, 0x861b9c2a5a7d0a43},
+  {0x4513aa7db58aea4d, 0x89dfbe8c94798dde, 0xe735f37739441c13},
+  {0x2f534ce65fbe5d87, 0xf8fcb2432339f543, 0x8ea957572a77e395},
+  {0x2456c8d764e7c1a6, 0x7dc7567c507e2e18, 0xd29b13c5db1cd65a},
+  {0x885705a845bb1199, 0xebc702d7e1680421, 0x9aeba22f533cbac9},
+  {0x55c435f803ad3742, 0x695442fe576b3a09, 0x5ca02fab230ee023},
+  {0x0d446bb06a3cbf8b, 0x5bfc8414d84fff9a, 0x157e3384708408a8},
+  {0x7b212d17c02a4054, 0x2b14562733ba6900, 0x7965f7d93122eac0},
+  {0x349446294451df24, 0x2b91f57cdcc289f3, 0x829cb5a03cce767d},
+  {0x2f8e7fa84f0ad401, 0xb3a50f68cba8a638, 0xde440882f84bfd7a},
+  {0xd1ba1db41829f412, 0x9a2c4c23fb8538f7, 0x86ca32d92d99ecb9},
+  {0x8a6db99a627b227c, 0x633c81cf8e52a687, 0x8e58542594d7103e},
+  {0x4c5a928b8610d6cd, 0x6a38a81e5ec41b61, 0x05ac22b201c86322},
+  {0x283c4b53c14f39c0, 0x106fe171df2218c5, 0x4c077d33f17e0107},
+  {0x198b4c90bd33552f, 0x5853a4c2f74596db, 0x1018dd6bf21150d4},
+  {0x47c29e1c2f495b4c, 0x7ec84995131d545b, 0x49e53beaeb94dae0},
+  {0x2678b3f7b548fc9f, 0x63a6b9322f3a574c, 0xef6d85f1091f1aeb},
+  {0xf1391f569cd5fe90, 0x876e8ba956de0238, 0x6cd576e3b8ab6222},
+  {0x827547465967b775, 0x4197e1290368e412, 0xee63a7ef2156fb67},
+  {0x6cb2a919735b34d5, 0x6cc967b756d72395, 0x9a884a65ae74e811},
+  {0xbdebcb5fbfafafc0, 0xb7fc62a4c7947030, 0x554c36728822d8b6},
+  {0x025fef80c960792a, 0xc0f487dcc0ad8059, 0x9714504680995ad0},
+  {0x19ffb11f02502666, 0x482fc0fae8608ad2, 0x781175f6049c62ee},
+  {0xf1fece4f515854e7, 0x6dab52f7b6560106, 0xfa0028f50d672954},
+  {0x844afcd287c1ddba, 0x47234b529fe3ca41, 0x3ca221c08f88140a},
+  {0xfdbbeaaa02badeda, 0xf35a5e21992e2332, 0xa37f6d68d919b65f},
+  {0x6d218f603725748a, 0xb6df3c61103e9c3e, 0xbb7ac1cf4c1f4692},
+  {0x8e6d3eb058cfc260, 0xfbe2f6497287731a, 0xffa78646830d5ce0},
+  {0x8c07c328df449acd, 0x500ba217a7af529f, 0x19ab11b99a1a2a19},
+  {0x42de87a6001d7bc4, 0x6d65941a9ae5138b, 0xcb830271914ce1ee},
+  {0x25f950eb4e2b9669, 0x0c9f7a2279a16278, 0x86503e9de2e76202},
+  {0xedc0f3a86b732556, 0xc7995c7b3ec0ea66, 0x8a4d95b8d19c29ce},
+  {0x01b5ab0eca4d3189, 0xed7898b982b519ad, 0x24c5f841a769f11b},
+  {0xde3eefe1bad32178, 0x493a735c30942df4, 0x8b5ec5bed8e4d565},
+  {0xa974a9d616b752fa, 0x09d37b2ab193ca1c, 0x55b8aaf3af4481ba},
+  {0x84ca6915121b1e09, 0x8831e83e34fac643, 0x05e3db5a89049a2f},
+  {0x5375a9f4aefd0f44, 0xaf272fd031366078, 0xbbd286c07ed80632},
+  {0x9d101a493aa2ebc9, 0x67e3ddfaa73b2b94, 0x45bf06b13a5d6856},
+  {0x6469dfeed8b766bc, 0x41a958a8c84553fc, 0xc3665b3f060a6808},
+  {0x8bbd23b38d0cff32, 0x891f48bb2592fb3f, 0x24c6243ad065453e},
+  {0xf3d1cc12dcb4e302, 0x588dfaa464f518be, 0xfe082e8b4a39cf26},
+  {0x95c521746547be8e, 0x9cbbea72400d1df8, 0x0cfdac076655d579},
+  {0xa6c4c57375f48495, 0xd63f47b41907a3f7, 0x34e17c2df60668d7},
+  {0xa135ca38c26b95c3, 0x2aac9c6b01173258, 0x2d8499bf2ed7c23c},
+  {0xba02892976144352, 0x9e4d9906dc2ae94e, 0x6535b5091d0535a4},
+  {0x6ec4dba2c6f7e949, 0x02d65b71f7db3f86, 0x61c796b0290e7ff0},
+  {0xac044d22d442ff2e, 0x29d00d9db764b6ff, 0x9ec4ff5f21f3216d},
+  {0x26b3c84573c53161, 0xa3037316e91bf8bb, 0x251ed327edf11e39},
+  {0x2917804d2422970c, 0x16119362ba8934be, 0xafa94e1359c77cce},
+  {0x4eac35ec04e84a0b, 0x31b309e5e5d361a5, 0x4171e00956fd334e},
+  {0xa02b9fdd9f6b8162, 0xabd8bc110f4e1f52, 0x75578ed77238fedc},
+  {0xe73f9ad96bd8686d, 0xbdfc49ed2dba8097, 0x054c4bb989c34404},
+  {0xa0d01888aa5b1042, 0x8c33305a0dc075b1, 0x75f81fe0369e7b86},
+  {0x679d711aa88faab7, 0xb03f74deaa29c24c, 0x10a7766990689f5a},
+  {0x827d13e4d6310b6b, 0xc5a73641d06e47d1, 0xf2f0d06e14e2ab1f},
+  {0xcc968649ec63f05e, 0x17cda3a7fc25bfb2, 0x0df1338db25ee18e},
+  {0x7d4acd6c3cf8c18b, 0x4bd734fd562d48ad, 0xae50c4f72f542533},
+  {0xcf438bf70dbe4c62, 0x0019bcea28ce9270, 0xf687acda7ff8c960},
+  {0x5b24783c5318fd09, 0x5623189d31422de8, 0x862fd585eeb3e3f0},
+  {0xf98482f8df7d5e16, 0xccb9fb2d3745fbbf, 0x7d5e1bd364daa7d4},
+  {0x024849574a40a831, 0x48cae56880d67329, 0xfafa85469a93e6b3},
+  {0x944eae6b760bc534, 0x1d1d18f30fec24c3, 0xc64a74b4d0c3181e},
+  {0x19c52990a4e62d2d, 0x37b473c7ed759ef9, 0x04080c0ade3df738},
+  {0xfcc4062c7876c075, 0x48b4cf0b72aae741, 0x3889eef0b66c1bff},
+  {0x49c26471ae06da0b, 0x109da4749a70108b, 0x443b50c74915bd54},
+  {0xbe68bd432e672eb8, 0xbe737af593618ab7, 0x5d537d8c0da1a4e7},
+  {0xa3ca7393ce4e8d7c, 0x0fcf46d53a057c21, 0x7451a590ca6c1db1},
+  {0x79419444b1c149e5, 0x9d577a1e13240b2d, 0x24da1fd0d5db6e4d},
+  {0xe8c3caf37ad5170c, 0x423b4593d3f4c834, 0xff039eaad5042ae3},
+  {0x3bf5913b5615f7f5, 0x2d24b840238f2c84, 0x97bdc5bfeb1d53b7},
+  {0x53538b2293df4606, 0x169029e2d8675ec6, 0x9ab1ac25ee4982a4},
+  {0x75bd284d07f591f8, 0xccdd36b98d68786e, 0x9321ba79d2e56eed},
+  {0xe63236d17de7e69c, 0x9600d5f5cca5b08a, 0x8ff14c81e5d61843},
+  {0xdb079962536683c6, 0x35bb6068eb26bd37, 0xa614c37971ca2e4d},
+  {0xab78167ac83c4064, 0xb6a1928d6f89cdd1, 0xc97cc61d01ffe82f},
+  {0x83e6edd7a512e8b7, 0xe281601e537bc4ec, 0x19d35d2d57518cde},
+  {0xf737f3ddfa7fc9b2, 0x4a8f04a9cb4847be, 0x2946f3355994de91},
+  {0x577ca3baf1f7e1ba, 0x446729b10c51ed7c, 0xab637d9c6e3a5554},
+  {0x4e31798071664def, 0xec15c968e363630d, 0xd7ce5f867f758e48},
+  {0x10525e76bc5a5ed9, 0x1c8a384248ab4398, 0x8f7a522f2e2f3fc5},
+  {0xdee25133572d24bf, 0x37203f7f6c2e0e36, 0x89ba27d9b1233156},
+};
+
+#define GF_exp_invmer_e_1 AIMER_NAMESPACE(GF_exp_invmer_e_1)
+void GF_exp_invmer_e_1(GF out, const GF in); // writes out from in; "e_1" S-box map
+#define GF_exp_invmer_e_2 AIMER_NAMESPACE(GF_exp_invmer_e_2)
+void GF_exp_invmer_e_2(GF out, const GF in); // writes out from in; "e_2" S-box map
+#define GF_exp_mer_e_star AIMER_NAMESPACE(GF_exp_mer_e_star)
+void GF_exp_mer_e_star(GF out, const GF in); // writes out from in; "e_star" S-box map
+
+#define generate_matrices_L_and_U AIMER_NAMESPACE(generate_matrices_L_and_U)
+void generate_matrices_L_and_U(
+        GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF vector_b,
+        const uint8_t iv[AIM2_IV_SIZE]); // iv is read-only; presumably fills the other three
+
+#define generate_matrix_LU AIMER_NAMESPACE(generate_matrix_LU)
+void generate_matrix_LU(GF matrix_A[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+                        GF vector_b,
+                        const uint8_t iv[AIM2_IV_SIZE]); // NOTE(review): single-matrix variant; confirm semantics in aim2.c
+
+#define aim2_sbox_outputs AIMER_NAMESPACE(aim2_sbox_outputs)
+void aim2_sbox_outputs(GF sbox_outputs[AIM2_NUM_INPUT_SBOX], const GF pt); // pt is read-only
+
+#define aim2 AIMER_NAMESPACE(aim2)
+void aim2(uint8_t ct[AIM2_NUM_BYTES_FIELD],
+          const uint8_t pt[AIM2_NUM_BYTES_FIELD],
+          const uint8_t iv[AIM2_IV_SIZE]); // ct is the only non-const buffer (output)
+
+#endif // AIM2_H
diff --git a/crypto_sign/aimer192s/m4speed/api.h b/crypto_sign/aimer192s/m4speed/api.h
new file mode 100644
index 00000000..c4b90d12
--- /dev/null
+++ b/crypto_sign/aimer192s/m4speed/api.h
@@ -0,0 +1,44 @@
+/*
+NIST-developed software is provided by NIST as a public service. You may use, copy, and distribute copies of the software in any medium, provided that you keep intact this entire notice. You may improve, modify, and create derivative works of the software or any portion of the software, and you may copy and distribute such modifications or works. Modified works should carry a notice stating that you changed the software and should note the date and nature of any such change. Please explicitly acknowledge the National Institute of Standards and Technology as the source of the software.
+ 
+NIST-developed software is expressly provided "AS IS." NIST MAKES NO WARRANTY OF ANY KIND, EXPRESS, IMPLIED, IN FACT, OR ARISING BY OPERATION OF LAW, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND DATA ACCURACY. NIST NEITHER REPRESENTS NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST DOES NOT WARRANT OR MAKE ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE OR THE RESULTS THEREOF, INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY, RELIABILITY, OR USEFULNESS OF THE SOFTWARE.
+ 
+You are solely responsible for determining the appropriateness of using and distributing the software and you assume all risks associated with its use, including but not limited to the risks and costs of program errors, compliance with applicable laws, damage to or loss of data, programs or equipment, and the unavailability or interruption of operation. This software is not intended to be used in any situation where a failure could cause risk of injury or damage to property. The software developed by NIST employees is not subject to copyright protection within the United States.
+*/
+
+#ifndef API_H
+#define API_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define CRYPTO_PUBLICKEYBYTES 48 // public key length in bytes
+#define CRYPTO_SECRETKEYBYTES 72 // secret key length in bytes
+#define CRYPTO_BYTES 9120 // signature length in bytes (presumably the maximum; TODO confirm)
+#define CRYPTO_ALGNAME "aimer192s"
+
+#define crypto_sign_keypair AIMER_NAMESPACE(crypto_sign_keypair)
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk); // both buffers are outputs
+
+#define crypto_sign_signature AIMER_NAMESPACE(crypto_sign_signature)
+int crypto_sign_signature(uint8_t *sig, size_t *siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk); // outputs: sig, siglen; m and sk are read-only
+
+#define crypto_sign AIMER_NAMESPACE(crypto_sign)
+int crypto_sign(uint8_t *sm, size_t *smlen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk); // outputs: sm, smlen; m and sk are read-only
+
+#define crypto_sign_verify AIMER_NAMESPACE(crypto_sign_verify)
+int crypto_sign_verify(const uint8_t *sig, size_t siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *pk); // all buffers read-only; result is the return value
+
+#define crypto_sign_open AIMER_NAMESPACE(crypto_sign_open)
+int crypto_sign_open(uint8_t *m, size_t *mlen,
+        const uint8_t *sm, size_t smlen,
+        const uint8_t *pk); // outputs: m, mlen; sm and pk are read-only
+
+#endif
diff --git a/crypto_sign/aimer192s/m4speed/field.c b/crypto_sign/aimer192s/m4speed/field.c
new file mode 100644
index 00000000..91ee3d55
--- /dev/null
+++ b/crypto_sign/aimer192s/m4speed/field.c
@@ -0,0 +1,482 @@
+// SPDX-License-Identifier: MIT
+
+#include "field.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define mask_64     0x00000000ffffffff // keeps the low 32 bits of a 64-bit word
+#define mask_final  0x000000000000ffff // keeps the 16 bits that remain after inv_zero_padding
+// mask0..mask3 are the per-stage masks of the bit spreading; each *_64 form repeats the 32-bit pattern in both halves of a 64-bit word
+#define mask0_64    0x000000ff000000ff
+#define mask0       0x000000ff // one byte
+
+#define mask1_64    0x000f000f000f000f
+#define mask1       0x000f000f // one nibble per 16 bits
+
+#define mask2_64    0x0303030303030303
+#define mask2       0x03030303 // two bits per byte
+
+#define mask3_64    0x1111111111111111
+#define mask3       0x11111111 // every 4th bit: where spread input bits and kept product bits live
+
+#define zero_padding(x0, mask1, mask2, mask3) \
+        x0 = (x0 | (x0 << 12)) & mask1; \
+        x0 = (x0 | (x0 << 6 )) & mask2; \
+        x0 = (x0 | (x0 << 3 )) & mask3;
+
+#define inv_zero_padding(x0, mask0, mask1, mask2, mask_final) \
+        x0 = (x0 | (x0 >> 3 )) & mask2; \
+        x0 = (x0 | (x0 >> 6 )) & mask1; \
+        x0 = (x0 | (x0 >> 12)) & mask0; \
+        x0 = (x0 | (x0 >> 24)) & mask_final;
+// c = a * b for field elements held as three 64-bit limbs: six poly64_mul calls are combined Karatsuba-style into a six-limb product, which is then folded back to three limbs.
+void GF_mul(GF c, const GF a, const GF b)
+{
+  uint64_t t[3] = {0,};
+  uint64_t temp[6] = {0,};
+  uint64_t sub[6] = {0,};
+  // limb sums (XOR) used as operands of the three cross products below
+  sub[0] = a[0] ^ a[1];
+  sub[1] = b[0] ^ b[1];
+  sub[2] = a[0] ^ a[2];
+  sub[3] = b[0] ^ b[2];
+  sub[4] = a[1] ^ a[2];
+  sub[5] = b[1] ^ b[2];
+  // products of matching limbs; poly64_mul's 3rd/4th arguments presumably receive the high/low product word -- confirm in the asm
+  poly64_mul(&a[0], &b[0], &t[0], &temp[0]);
+  poly64_mul(&a[1], &b[1], &t[2], &t[1]);
+  t[0] ^= t[1];
+
+  poly64_mul(&a[2], &b[2], &temp[5], &t[1]);
+  t[1] ^= t[2];
+
+  temp[1] = t[0] ^ temp[0];
+  temp[2] = t[1] ^ temp[1];
+  temp[4] = temp[5] ^ t[1];
+  temp[3] = temp[4] ^ t[0];
+  // cross products: each result is XORed into the two limbs it overlaps
+  poly64_mul(&sub[0], &sub[1], &t[1], &t[0]);
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+
+  poly64_mul(&sub[2], &sub[3], &t[1], &t[0]);
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+
+  poly64_mul(&sub[4], &sub[5], &t[1], &t[0]);
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+  // fold the top limb into limb 3 first: the bits temp[5] pushes out of limb 2 when shifted by 7, 2 and 1 land there
+  t[0] = temp[3] ^ ((temp[5] >> 57) ^ (temp[5] >> 62) ^ (temp[5] >> 63));
+  // limbs 5, 4 and 3 (t[0]) are added back at shifts 0, 1, 2 and 7, i.e. x^192 = x^7 + x^2 + x + 1; c is written only from here on, after the last read of a and b
+  c[2] = temp[2] ^ temp[5];
+  c[2] ^= (temp[5] << 7) | (temp[4] >> 57);
+  c[2] ^= (temp[5] << 2) | (temp[4] >> 62);
+  c[2] ^= (temp[5] << 1) | (temp[4] >> 63);
+
+  c[1] = temp[1] ^ temp[4];
+  c[1] ^= (temp[4] << 7) | (t[0] >> 57);
+  c[1] ^= (temp[4] << 2) | (t[0] >> 62);
+  c[1] ^= (temp[4] << 1) | (t[0] >> 63);
+
+  c[0] = temp[0] ^ t[0];
+  c[0] ^= (t[0] << 7);
+  c[0] ^= (t[0] << 2);
+  c[0] ^= (t[0] << 1);
+}
+// c ^= a * b: the same six-product multiplication and folding as GF_mul, except that the reduced result is XORed into the existing value of c.
+void GF_mul_add(GF c, const GF a, const GF b)
+{
+  uint64_t t[3] = {0,};
+  uint64_t temp[6] = {0,};
+  uint64_t sub[6] = {0,};
+  // limb sums (XOR) used as operands of the three cross products below
+  sub[0] = a[0] ^ a[1];
+  sub[1] = b[0] ^ b[1];
+  sub[2] = a[0] ^ a[2];
+  sub[3] = b[0] ^ b[2];
+  sub[4] = a[1] ^ a[2];
+  sub[5] = b[1] ^ b[2];
+  // products of matching limbs; poly64_mul's 3rd/4th arguments presumably receive the high/low product word -- confirm in the asm
+  poly64_mul(&a[0], &b[0], &t[0], &temp[0]);
+  poly64_mul(&a[1], &b[1], &t[2], &t[1]);
+  t[0] ^= t[1];
+
+  poly64_mul(&a[2], &b[2], &temp[5], &t[1]);
+  t[1] ^= t[2];
+
+  temp[1] = t[0] ^ temp[0];
+  temp[2] = t[1] ^ temp[1];
+  temp[4] = temp[5] ^ t[1];
+  temp[3] = temp[4] ^ t[0];
+  // cross products: each result is XORed into the two limbs it overlaps
+  poly64_mul(&sub[0], &sub[1], &t[1], &t[0]);
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+
+  poly64_mul(&sub[2], &sub[3], &t[1], &t[0]);
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+
+  poly64_mul(&sub[4], &sub[5], &t[1], &t[0]);
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+  // fold the top limb into limb 3 first: the bits temp[5] pushes out of limb 2 when shifted by 7, 2 and 1 land there
+  t[0] = temp[3] ^ ((temp[5] >> 57) ^ (temp[5] >> 62) ^ (temp[5] >> 63));
+  // accumulate (^=) the folded product into c; c is first touched here, after the last read of a and b
+  c[2] ^= temp[2] ^ temp[5];
+  c[2] ^= (temp[5] << 7) | (temp[4] >> 57);
+  c[2] ^= (temp[5] << 2) | (temp[4] >> 62);
+  c[2] ^= (temp[5] << 1) | (temp[4] >> 63);
+
+  c[1] ^= temp[1] ^ temp[4];
+  c[1] ^= (temp[4] << 7) | (t[0] >> 57);
+  c[1] ^= (temp[4] << 2) | (t[0] >> 62);
+  c[1] ^= (temp[4] << 1) | (t[0] >> 63);
+
+  c[0] ^= temp[0] ^ t[0];
+  c[0] ^= (t[0] << 7);
+  c[0] ^= (t[0] << 2);
+  c[0] ^= (t[0] << 1);
+}
+// c = XOR of the rows b[k] selected by the set bits of a; bits are consumed from the least significant bit of a[0] upward while b_ptr walks the rows in order.
+void GF_transposed_matmul(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD])
+{
+  const uint64_t *a_ptr = a;
+  const GF *b_ptr = b;
+  // accumulators for the three result limbs; c itself is written only at the end
+  uint64_t temp_c0 = 0;
+  uint64_t temp_c1 = 0;
+  uint64_t temp_c2 = 0;
+  uint64_t mask;
+  for (size_t i = AIM2_NUM_WORDS_FIELD; i; --i, ++a_ptr)
+  {
+    uint64_t index = *a_ptr;
+    for (size_t j = AIM2_NUM_BITS_WORD; j; j -= 4, index >>= 4, b_ptr += 4) // four bits of a, hence four rows of b, per pass
+    {
+      mask = -(index & 1); // 0 or all ones, so the row is selected without a branch
+      temp_c0 ^= (b_ptr[0][0] & mask);
+      temp_c1 ^= (b_ptr[0][1] & mask);
+      temp_c2 ^= (b_ptr[0][2] & mask);
+
+      mask = -((index >> 1) & 1);
+      temp_c0 ^= (b_ptr[1][0] & mask);
+      temp_c1 ^= (b_ptr[1][1] & mask);
+      temp_c2 ^= (b_ptr[1][2] & mask);
+
+      mask = -((index >> 2) & 1);
+      temp_c0 ^= (b_ptr[2][0] & mask);
+      temp_c1 ^= (b_ptr[2][1] & mask);
+      temp_c2 ^= (b_ptr[2][2] & mask);
+
+      mask = -((index >> 3) & 1);
+      temp_c0 ^= (b_ptr[3][0] & mask);
+      temp_c1 ^= (b_ptr[3][1] & mask);
+      temp_c2 ^= (b_ptr[3][2] & mask);
+    }
+  }
+  c[0] = temp_c0;
+  c[1] = temp_c1;
+  c[2] = temp_c2;
+}
+// c ^= XOR of the rows b[k] selected by the set bits of a: same row selection as GF_transposed_matmul, but the sum is accumulated into the existing value of c.
+void GF_transposed_matmul_add(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD])
+{
+  const uint64_t *a_ptr = a;
+  const GF *b_ptr = b;
+  // accumulators for the three result limbs; c itself is updated only at the end
+  uint64_t temp_c0 = 0;
+  uint64_t temp_c1 = 0;
+  uint64_t temp_c2 = 0;
+  uint64_t mask;
+  for (size_t i = AIM2_NUM_WORDS_FIELD; i; --i, ++a_ptr)
+  {
+    uint64_t index = *a_ptr;
+    for (size_t j = AIM2_NUM_BITS_WORD; j; j -= 4, index >>= 4, b_ptr += 4) // four bits of a, hence four rows of b, per pass
+    {
+      mask = -(index & 1); // 0 or all ones, so the row is selected without a branch
+      temp_c0 ^= (b_ptr[0][0] & mask);
+      temp_c1 ^= (b_ptr[0][1] & mask);
+      temp_c2 ^= (b_ptr[0][2] & mask);
+
+      mask = -((index >> 1) & 1);
+      temp_c0 ^= (b_ptr[1][0] & mask);
+      temp_c1 ^= (b_ptr[1][1] & mask);
+      temp_c2 ^= (b_ptr[1][2] & mask);
+
+      mask = -((index >> 2) & 1);
+      temp_c0 ^= (b_ptr[2][0] & mask);
+      temp_c1 ^= (b_ptr[2][1] & mask);
+      temp_c2 ^= (b_ptr[2][2] & mask);
+
+      mask = -((index >> 3) & 1);
+      temp_c0 ^= (b_ptr[3][0] & mask);
+      temp_c1 ^= (b_ptr[3][1] & mask);
+      temp_c2 ^= (b_ptr[3][2] & mask);
+    }
+  }
+  c[0] ^= temp_c0;
+  c[1] ^= temp_c1;
+  c[2] ^= temp_c2;
+}
+// Carry-less 64x64 -> 128-bit multiplication of x and y built from ordinary integer multiplications on bit-spread bytes; *z0 receives the low 64 product bits, *z1 the high 64.
+static void poly64_mul_s(uint64_t *z1, uint64_t *z0, uint64_t x, uint64_t y)
+{
+  // split x into bytes: x0..x3 come from the low 32 bits, x4..x7 from the high 32 bits
+  uint32_t x4 = x >> 32;
+
+  uint32_t x0 = x & mask_64;
+  uint32_t x1 = (x0 >> 8) & mask0;
+  uint32_t x2 = (x0 >> 16) & mask0;
+  uint32_t x3 = (x0 >> 24) & mask0;
+  x0 &= mask0;
+
+  // x_high
+  uint32_t x5 = (x4 >> 8) & mask0;
+  uint32_t x6 = (x4 >> 16) & mask0;
+  uint32_t x7 = (x4 >> 24) & mask0;
+  x4 &= mask0;
+
+  // split y the same way: y0..y3 from the low 32 bits, y4..y7 from the high 32 bits
+  uint32_t y4 = y >> 32;
+
+  uint32_t y0 = y & mask_64;
+  uint32_t y1 = (y0 >> 8) & mask0;
+  uint32_t y2 = (y0 >> 16) & mask0;
+  uint32_t y3 = (y0 >> 24) & mask0;
+  y0 &= mask0;
+
+  // y_high
+  uint32_t y5 = (y4 >> 8) & mask0;
+  uint32_t y6 = (y4 >> 16) & mask0;
+  uint32_t y7 = (y4 >> 24) & mask0;
+  y4 &= mask0;
+
+  // x padding: spread every byte so that its bits sit at every 4th bit position
+  zero_padding(x0, mask1, mask2, mask3);
+  zero_padding(x1, mask1, mask2, mask3);
+  zero_padding(x2, mask1, mask2, mask3);
+  zero_padding(x3, mask1, mask2, mask3);
+  zero_padding(x4, mask1, mask2, mask3);
+  zero_padding(x5, mask1, mask2, mask3);
+  zero_padding(x6, mask1, mask2, mask3);
+  zero_padding(x7, mask1, mask2, mask3);
+
+  // y padding
+  zero_padding(y0, mask1, mask2, mask3);
+  zero_padding(y1, mask1, mask2, mask3);
+  zero_padding(y2, mask1, mask2, mask3);
+  zero_padding(y3, mask1, mask2, mask3);
+  zero_padding(y4, mask1, mask2, mask3);
+  zero_padding(y5, mask1, mask2, mask3);
+  zero_padding(y6, mask1, mask2, mask3);
+  zero_padding(y7, mask1, mask2, mask3);
+  // In an integer product of two spread bytes at most 8 partial products meet in any 4-bit slot, so "& mask3_64" keeps their XOR and nothing carries into the next slot.
+  //x0-3 * y0-3
+  uint64_t a0 = ((uint64_t)x0 * (uint64_t)y0) & mask3_64;
+  uint64_t a1 = ((((uint64_t)x0 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y0) & mask3_64));
+  a0 ^= (a1 << 32); // odd byte offsets straddle two words: 8 product bits occupy 32 spread bits
+  a1 = a1 >> 32;
+  a1 ^= ((((uint64_t)x0 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x1 * (uint64_t)y1) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y0) & mask3_64));
+
+  uint64_t a2 = ((((uint64_t)x0 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y2) & mask3_64) ^
+                 (((uint64_t)x2 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y0) & mask3_64));
+  a1 ^= (a2 << 32);
+  a2 = a2 >> 32;
+  a2 ^= ((((uint64_t)x1 * (uint64_t)y3) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x3 * (uint64_t)y1) & mask3_64));
+
+  uint64_t a3 = ((((uint64_t)x2 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y2) & mask3_64));
+  a2 ^= (a3 << 32);
+  a3 = a3 >> 32;
+
+  a3 ^= ((uint64_t)x3 * (uint64_t)y3) & mask3_64;
+
+  //x4-7 * y4-7
+  uint64_t b0 = ((uint64_t)x4 * (uint64_t)y4) & mask3_64;
+  uint64_t b1 = ((((uint64_t)x4 * (uint64_t)y5) & mask3_64) ^
+                 (((uint64_t)x5 * (uint64_t)y4) & mask3_64));
+  b0 ^= (b1 << 32);
+  b1 = b1 >> 32;
+  b1 ^= ((((uint64_t)x4 * (uint64_t)y6) & mask3_64) ^
+         (((uint64_t)x5 * (uint64_t)y5) & mask3_64) ^
+         (((uint64_t)x6 * (uint64_t)y4) & mask3_64));
+
+  uint64_t b2 = ((((uint64_t)x4 * (uint64_t)y7) & mask3_64) ^
+                 (((uint64_t)x5 * (uint64_t)y6) & mask3_64) ^
+                 (((uint64_t)x6 * (uint64_t)y5) & mask3_64) ^
+                 (((uint64_t)x7 * (uint64_t)y4) & mask3_64));
+  b1 ^= (b2 << 32);
+  b2 = b2 >> 32;
+  b2 ^= ((((uint64_t)x5 * (uint64_t)y7) & mask3_64) ^
+         (((uint64_t)x6 * (uint64_t)y6) & mask3_64) ^
+         (((uint64_t)x7 * (uint64_t)y5) & mask3_64));
+
+  uint64_t b3 = ((((uint64_t)x6 * (uint64_t)y7) & mask3_64) ^
+                 (((uint64_t)x7 * (uint64_t)y6) & mask3_64));
+  b2 ^= (b3 << 32);
+  b3 = b3 >> 32;
+
+  b3 ^= ((uint64_t)x7 * (uint64_t)y7) & mask3_64;
+
+  //middle part: (x_low ^ x_high) * (y_low ^ y_high), the Karatsuba cross term of the two 32-bit halves
+  x0 ^= x4;
+  x1 ^= x5;
+  x2 ^= x6;
+  x3 ^= x7;
+
+  y0 ^= y4;
+  y1 ^= y5;
+  y2 ^= y6;
+  y3 ^= y7;
+
+  uint64_t c0 = ((uint64_t)x0 * (uint64_t)y0) & mask3_64;
+  uint64_t c1 = ((((uint64_t)x0 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y0) & mask3_64));
+  c0 ^= (c1 << 32);
+  c1 = c1 >> 32;
+  c1 ^= ((((uint64_t)x0 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x1 * (uint64_t)y1) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y0) & mask3_64));
+
+  uint64_t c2 = ((((uint64_t)x0 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y2) & mask3_64) ^
+                 (((uint64_t)x2 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y0) & mask3_64));
+  c1 ^= (c2 << 32);
+  c2 = c2 >> 32;
+  c2 ^= ((((uint64_t)x1 * (uint64_t)y3) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x3 * (uint64_t)y1) & mask3_64));
+
+  uint64_t c3 = ((((uint64_t)x2 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y2) & mask3_64));
+  c2 ^= (c3 << 32);
+  c3 = c3 >> 32;
+
+  c3 ^= ((uint64_t)x3 * (uint64_t)y3) & mask3_64;
+  // remove the low*low and high*high parts from the middle product ...
+  c0 = c0 ^ a0 ^ b0;
+  c1 = c1 ^ a1 ^ b1;
+  c2 = c2 ^ a2 ^ b2;
+  c3 = c3 ^ a3 ^ b3;
+  // ... and add what is left at an offset of 32 product bits, i.e. two 16-bit result words further up
+  a2 ^= c0;
+  a3 ^= c1;
+  b0 ^= c2;
+  b1 ^= c3;
+
+  // result inv_padding: compress each spread word back to 16 contiguous bits
+  inv_zero_padding(a0, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a1, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a2, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a3, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b0, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b1, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b2, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b3, mask0_64, mask1_64, mask2_64, mask_final);
+
+  *z0 = a0 | (a1 << 16) | (a2 << 32) | (a3 << 48);
+  *z1 = b0 | (b1 << 16) | (b2 << 32) | (b3 << 48);
+}
+// c = a * b, structured like GF_mul but with every 64x64 product taken by the C routine poly64_mul_s (outputs first: high word, then low word) and the limb sums passed by value.
+void GF_mul_s(GF c, const GF a, const GF b)
+{
+  uint64_t t[3] = {0,};
+  uint64_t temp[6] = {0,};
+  // products of matching limbs
+  poly64_mul_s(&t[0], &temp[0], a[0], b[0]);
+  poly64_mul_s(&t[2], &t[1], a[1], b[1]);
+  t[0] ^= t[1];
+
+  poly64_mul_s(&temp[5], &t[1], a[2], b[2]);
+  t[1] ^= t[2];
+
+  temp[1] = t[0] ^ temp[0];
+  temp[2] = t[1] ^ temp[1];
+  temp[4] = temp[5] ^ t[1];
+  temp[3] = temp[4] ^ t[0];
+  // cross products of the limb sums: each result is XORed into the two limbs it overlaps
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[1]), (b[0] ^ b[1]));
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[2]), (b[0] ^ b[2]));
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+
+  poly64_mul_s(&t[1], &t[0], (a[1] ^ a[2]), (b[1] ^ b[2]));
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+  // fold the top limb into limb 3 first: the bits temp[5] pushes out of limb 2 when shifted by 7, 2 and 1 land there
+  t[0] = temp[3] ^ ((temp[5] >> 57) ^ (temp[5] >> 62) ^ (temp[5] >> 63));
+  // limbs 5, 4 and 3 (t[0]) are added back at shifts 0, 1, 2 and 7, i.e. x^192 = x^7 + x^2 + x + 1; c is written only from here on
+  c[2] = temp[2] ^ temp[5];
+  c[2] ^= (temp[5] << 7) | (temp[4] >> 57);
+  c[2] ^= (temp[5] << 2) | (temp[4] >> 62);
+  c[2] ^= (temp[5] << 1) | (temp[4] >> 63);
+
+  c[1] = temp[1] ^ temp[4];
+  c[1] ^= (temp[4] << 7) | (t[0] >> 57);
+  c[1] ^= (temp[4] << 2) | (t[0] >> 62);
+  c[1] ^= (temp[4] << 1) | (t[0] >> 63);
+
+  c[0] = temp[0] ^ t[0];
+  c[0] ^= (t[0] << 7);
+  c[0] ^= (t[0] << 2);
+  c[0] ^= (t[0] << 1);
+}
+// c ^= a * b, with every 64x64 product taken by the C routine poly64_mul_s; identical to GF_mul_s except that the reduced result is XORed into the existing value of c.
+void GF_mul_add_s(GF c, const GF a, const GF b)
+{
+  uint64_t t[3] = {0,};
+  uint64_t temp[6] = {0,};
+  // products of matching limbs
+  poly64_mul_s(&t[0], &temp[0], a[0], b[0]);
+  poly64_mul_s(&t[2], &t[1], a[1], b[1]);
+  t[0] ^= t[1];
+
+  poly64_mul_s(&temp[5], &t[1], a[2], b[2]);
+  t[1] ^= t[2];
+
+  temp[1] = t[0] ^ temp[0];
+  temp[2] = t[1] ^ temp[1];
+  temp[4] = temp[5] ^ t[1];
+  temp[3] = temp[4] ^ t[0];
+  // cross products of the limb sums: each result is XORed into the two limbs it overlaps
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[1]), (b[0] ^ b[1]));
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[2]), (b[0] ^ b[2]));
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+
+  poly64_mul_s(&t[1], &t[0], (a[1] ^ a[2]), (b[1] ^ b[2]));
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+  // fold the top limb into limb 3 first: the bits temp[5] pushes out of limb 2 when shifted by 7, 2 and 1 land there
+  t[0] = temp[3] ^ ((temp[5] >> 57) ^ (temp[5] >> 62) ^ (temp[5] >> 63));
+  // accumulate (^=) the folded product into c; c is first touched here, after the last read of a and b
+  c[2] ^= temp[2] ^ temp[5];
+  c[2] ^= (temp[5] << 7) | (temp[4] >> 57);
+  c[2] ^= (temp[5] << 2) | (temp[4] >> 62);
+  c[2] ^= (temp[5] << 1) | (temp[4] >> 63);
+
+  c[1] ^= temp[1] ^ temp[4];
+  c[1] ^= (temp[4] << 7) | (t[0] >> 57);
+  c[1] ^= (temp[4] << 2) | (t[0] >> 62);
+  c[1] ^= (temp[4] << 1) | (t[0] >> 63);
+
+  c[0] ^= temp[0] ^ t[0];
+  c[0] ^= (t[0] << 7);
+  c[0] ^= (t[0] << 2);
+  c[0] ^= (t[0] << 1);
+}
diff --git a/crypto_sign/aimer192s/m4speed/field.h b/crypto_sign/aimer192s/m4speed/field.h
new file mode 100644
index 00000000..5182adc4
--- /dev/null
+++ b/crypto_sign/aimer192s/m4speed/field.h
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef FIELD_H
+#define FIELD_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+typedef uint64_t GF[3]; // one field element: three 64-bit words
+// NOTE(review): poly64_mul, GF_set0, GF_copy, GF_to_bytes, GF_from_bytes, GF_add and GF_sqr_s have no C definition in field.c; presumably they live in the assembly file -- confirm.
+#define poly64_mul AIMER_NAMESPACE(poly64_mul)
+void poly64_mul(const uint64_t *a, const uint64_t *b, uint64_t *c1, uint64_t *c0);
+#define GF_set0 AIMER_NAMESPACE(GF_set0)
+void GF_set0(GF a);
+#define GF_copy AIMER_NAMESPACE(GF_copy)
+void GF_copy(GF out, const GF in);
+#define GF_to_bytes AIMER_NAMESPACE(GF_to_bytes)
+void GF_to_bytes(uint8_t *out, const GF in);
+#define GF_from_bytes AIMER_NAMESPACE(GF_from_bytes)
+void GF_from_bytes(GF out, const uint8_t *in);
+// Arithmetic; in field.c the *_add variants XOR their result into c, the others overwrite c.
+#define GF_add AIMER_NAMESPACE(GF_add)
+void GF_add(GF c, const GF a, const GF b);
+#define GF_mul AIMER_NAMESPACE(GF_mul)
+void GF_mul(GF c, const GF a, const GF b);
+#define GF_mul_add AIMER_NAMESPACE(GF_mul_add)
+void GF_mul_add(GF c, const GF a, const GF b);
+#define GF_transposed_matmul AIMER_NAMESPACE(GF_transposed_matmul)
+void GF_transposed_matmul(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]);
+#define GF_transposed_matmul_add AIMER_NAMESPACE(GF_transposed_matmul_add)
+void GF_transposed_matmul_add(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]);
+// GF_mul_s and GF_mul_add_s are built on the C routine poly64_mul_s in field.c rather than on poly64_mul.
+#define GF_mul_s AIMER_NAMESPACE(GF_mul_s)
+void GF_mul_s(GF c, const GF a, const GF b);
+#define GF_mul_add_s AIMER_NAMESPACE(GF_mul_add_s)
+void GF_mul_add_s(GF c, const GF a, const GF b);
+#define GF_sqr_s AIMER_NAMESPACE(GF_sqr_s)
+void GF_sqr_s(GF c, const GF a);
+
+#endif // FIELD_H
diff --git a/crypto_sign/aimer192s/m4speed/hash.c b/crypto_sign/aimer192s/m4speed/hash.c
new file mode 100644
index 00000000..005b51d1
--- /dev/null
+++ b/crypto_sign/aimer192s/m4speed/hash.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: MIT
+
+#include "hash.h"
+#include "keccakf1600.h"
+#include <stddef.h>
+#include <stdint.h>
+
+// Skips outlen bytes of squeeze output: nothing is copied, only the context is advanced.
+// NOTE(review): presumably mirrors the bookkeeping of shake256_inc_squeeze -- confirm against fips202.
+// state->ctx[25] is treated here as the number of bytes still unread in the current block.
+static void shake256_inc_skip_squeeze(shake256incctx *state, size_t outlen)
+{
+  size_t skip;
+
+  // first use up whatever is still unread in the current block
+  skip = (outlen < state->ctx[25]) ? outlen : (size_t)state->ctx[25];
+  outlen -= skip;
+  state->ctx[25] -= skip;
+
+  // every further block costs one permutation, even if only part of it is skipped
+  while (outlen > 0)
+  {
+    KeccakF1600_StatePermute(state->ctx);
+
+    skip = (outlen < SHAKE256_RATE) ? outlen : SHAKE256_RATE;
+    outlen -= skip;
+    // bytes of this block that stay available for the next squeeze or skip
+    state->ctx[25] = SHAKE256_RATE - skip;
+  }
+}
+// Initialises ctx through shake256_inc_init, with nothing absorbed yet.
+void hash_init(hash_instance *ctx)
+{
+  shake256_inc_init(ctx);
+}
+// Initialises ctx and absorbs the single byte prefix as the first input.
+void hash_init_prefix(hash_instance *ctx, uint8_t prefix)
+{
+  shake256_inc_init(ctx);
+  shake256_inc_absorb(ctx, &prefix, sizeof(prefix));
+}
+// Absorbs data_len bytes starting at data into ctx (forwards to shake256_inc_absorb).
+void hash_update(hash_instance *ctx, const uint8_t *data, size_t data_len)
+{
+  shake256_inc_absorb(ctx, data, data_len);
+}
+// Forwards to shake256_inc_finalize; in this code base it is always called between the last hash_update and the first squeeze.
+void hash_final(hash_instance *ctx)
+{
+  shake256_inc_finalize(ctx);
+}
+// Squeezes buffer_len output bytes from ctx into buffer; note that shake256_inc_squeeze takes its arguments in a different order.
+void hash_squeeze(hash_instance *ctx, uint8_t *buffer, size_t buffer_len)
+{
+  shake256_inc_squeeze(buffer, buffer_len, ctx);
+}
+// Advances the squeeze position of ctx by buffer_len bytes without writing them anywhere (see shake256_inc_skip_squeeze).
+void hash_skip_squeeze(hash_instance *ctx, size_t buffer_len)
+{
+  shake256_inc_skip_squeeze(ctx, buffer_len);
+}
+// Copies the state of ctx_src into ctx_dest via shake256_inc_ctx_clone; sign.c uses this to reuse a partially absorbed context.
+void hash_ctx_clone(hash_instance *ctx_dest, const hash_instance *ctx_src)
+{
+  shake256_inc_ctx_clone(ctx_dest, ctx_src);
+}
+// Releases ctx via shake256_inc_ctx_release; callers pair it with every hash_init, hash_init_prefix or hash_ctx_clone.
+void hash_ctx_release(hash_instance *ctx)
+{
+  shake256_inc_ctx_release(ctx);
+}
diff --git a/crypto_sign/aimer192s/m4speed/hash.h b/crypto_sign/aimer192s/m4speed/hash.h
new file mode 100644
index 00000000..8f3450ea
--- /dev/null
+++ b/crypto_sign/aimer192s/m4speed/hash.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef HASH_H
+#define HASH_H
+
+#include "fips202.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+// One-byte prefixes for hash_init_prefix, which absorbs the byte before any other input; sign.c uses 1 for h_1, 2 for h_2 and 5 for the seed commitments.
+static const uint8_t HASH_PREFIX_0 = 0;
+static const uint8_t HASH_PREFIX_1 = 1;
+static const uint8_t HASH_PREFIX_2 = 2;
+static const uint8_t HASH_PREFIX_3 = 3;
+static const uint8_t HASH_PREFIX_4 = 4;
+static const uint8_t HASH_PREFIX_5 = 5;
+
+typedef shake256incctx hash_instance;
+// Thin wrappers over the shake256_inc_* API (see hash.c); every name is namespaced through AIMER_NAMESPACE.
+#define hash_init AIMER_NAMESPACE(hash_init)
+void hash_init(hash_instance *ctx);
+#define hash_init_prefix AIMER_NAMESPACE(hash_init_prefix)
+void hash_init_prefix(hash_instance *ctx, uint8_t prefix);
+#define hash_update AIMER_NAMESPACE(hash_update)
+void hash_update(hash_instance *ctx, const uint8_t *data, size_t data_len);
+#define hash_final AIMER_NAMESPACE(hash_final)
+void hash_final(hash_instance *ctx);
+#define hash_squeeze AIMER_NAMESPACE(hash_squeeze)
+void hash_squeeze(hash_instance *ctx, uint8_t *buffer, size_t buffer_len);
+#define hash_skip_squeeze AIMER_NAMESPACE(hash_skip_squeeze)
+void hash_skip_squeeze(hash_instance *ctx, size_t buffer_len);
+#define hash_ctx_clone AIMER_NAMESPACE(hash_ctx_clone)
+void hash_ctx_clone(hash_instance *ctx_dest, const hash_instance *ctx_src);
+#define hash_ctx_release AIMER_NAMESPACE(hash_ctx_release)
+void hash_ctx_release(hash_instance *ctx);
+
+#endif // HASH_H
diff --git a/crypto_sign/aimer192s/m4speed/params.h b/crypto_sign/aimer192s/m4speed/params.h
new file mode 100644
index 00000000..209888d7
--- /dev/null
+++ b/crypto_sign/aimer192s/m4speed/params.h
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#define AIMER_NAMESPACE(s)          samsungsds_aimer192s_m4speed_##s // pastes this variant's prefix onto s
+
+#define SECURITY_BITS               192                  // security parameter
+#define SECURITY_BYTES              (SECURITY_BITS / 8)  // byte size of security parameter
+
+#define AIM2_NUM_BITS_FIELD         SECURITY_BITS        // number of bits in field element
+#define AIM2_NUM_BYTES_FIELD        SECURITY_BYTES       // number of bytes in field element
+#define AIM2_NUM_WORDS_FIELD        (SECURITY_BITS / 64) // number of 64-bit words in element
+#define AIM2_NUM_BITS_WORD          64                   // number of bits in word
+#define AIM2_IV_SIZE                SECURITY_BYTES       // byte size of AIM2 initial vector
+
+#define AIM2_NUM_INPUT_SBOX         2                    // number of AIM2 input S-boxes
+
+#define AIMER_SALT_SIZE             SECURITY_BYTES       // byte size of salt
+#define AIMER_SEED_SIZE             SECURITY_BYTES       // byte size of seed
+#define AIMER_COMMIT_SIZE           (SECURITY_BYTES * 2) // byte size of commitment
+
+#define AIMER_L                     AIM2_NUM_INPUT_SBOX  // alias of the input S-box count used by the signing code
+#define AIMER_T                     25                   // number of parallel repetitions (Tau)
+#define AIMER_N                     256                  // number of MPC parties (N)
+#define AIMER_LOGN                  8                    // log_2(N)
+
+#define PRE_TREE_IDX                256                  // sign.c copies 2 * PRE_TREE_IDX - 1 precomputed tree nodes per repetition
+
+#endif // PARAMS_H
diff --git a/crypto_sign/aimer192s/m4speed/sign.c b/crypto_sign/aimer192s/m4speed/sign.c
new file mode 100644
index 00000000..6a16e2e0
--- /dev/null
+++ b/crypto_sign/aimer192s/m4speed/sign.c
@@ -0,0 +1,664 @@
+// SPDX-License-Identifier: MIT
+
+#include "api.h"
+#include "aim2.h"
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include "randombytes.h"
+#include "sign.h"
+#include "tree.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+// Derives one party's commitment and tape from its seed: absorbs (rep, party, seed) into a clone of ctx_precom, then squeezes AIMER_COMMIT_SIZE bytes into commit followed by sizeof(tape_t) bytes into *tape.
+void commit_and_expand_tape(tape_t *tape, uint8_t *commit,
+                            const hash_instance *ctx_precom,
+                            const uint8_t seed[AIMER_SEED_SIZE],
+                            size_t rep, size_t party)
+{
+  hash_instance ctx;
+  uint8_t buffer[AIMER_SEED_SIZE + 2];
+  // rep and party are each truncated to one byte before being hashed
+  buffer[0] = (uint8_t)(rep);
+  buffer[1] = (uint8_t)(party);
+  memcpy(buffer + 2, seed, AIMER_SEED_SIZE);
+  // work on a copy so that ctx_precom can be reused for the next party
+  hash_ctx_clone(&ctx, ctx_precom);
+  hash_update(&ctx, buffer, AIMER_SEED_SIZE + 2);
+  hash_final(&ctx);
+  hash_squeeze(&ctx, commit, AIMER_COMMIT_SIZE);
+  hash_squeeze(&ctx, (uint8_t *)tape, sizeof(tape_t)); // the tape is filled as raw bytes, so it follows tape_t's in-memory layout
+  hash_ctx_release(&ctx);
+}
+// Same derivation as commit_and_expand_tape, but the AIMER_COMMIT_SIZE commitment bytes are skipped instead of returned; only *tape is produced.
+void commit_and_expand_tape_phase_3(tape_t *tape,
+                                    const hash_instance *ctx_precom,
+                                    const uint8_t seed[AIMER_SEED_SIZE],
+                                    size_t rep, size_t party)
+{
+  hash_instance ctx;
+  uint8_t buffer[AIMER_SEED_SIZE + 2];
+  // rep and party are each truncated to one byte before being hashed
+  buffer[0] = (uint8_t)(rep);
+  buffer[1] = (uint8_t)(party);
+  memcpy(buffer + 2, seed, AIMER_SEED_SIZE);
+  // work on a copy so that ctx_precom can be reused for the next party
+  hash_ctx_clone(&ctx, ctx_precom);
+  hash_update(&ctx, buffer, AIMER_SEED_SIZE + 2);
+  hash_final(&ctx);
+  hash_skip_squeeze(&ctx, AIMER_COMMIT_SIZE); // step over the commitment so the tape bytes match commit_and_expand_tape
+  hash_squeeze(&ctx, (uint8_t *)tape, sizeof(tape_t));
+  hash_ctx_release(&ctx);
+}
+// Fills mult_chk->z_shares[0..AIMER_L] from the S-box shares already stored in x_shares[0..1]; x_shares[AIMER_L] is only XORed into here, so the caller must have initialised it.
+void aim2_mpc(mult_chk_t *mult_chk,
+              const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+              const GF ct_GF)
+{
+  // pt + c = t ^ {2 ^ e - 1}
+  // --> t ^ {2 ^ e} + t * c = t * pt
+  // --> z = x * pt
+  GF_mul(mult_chk->z_shares[0], mult_chk->x_shares[0], aim2_constants[0]);
+  GF_transposed_matmul_add(mult_chk->z_shares[0], mult_chk->x_shares[0],
+                           aim2_e1_power_matrix);
+  GF_transposed_matmul_add(mult_chk->x_shares[AIMER_L], mult_chk->x_shares[0],
+                           matrix_A[0]);
+  // same two steps for the second input S-box
+  GF_mul(mult_chk->z_shares[1], mult_chk->x_shares[1], aim2_constants[1]);
+  GF_transposed_matmul_add(mult_chk->z_shares[1], mult_chk->x_shares[1],
+                           aim2_e2_power_matrix);
+  GF_transposed_matmul_add(mult_chk->x_shares[AIMER_L], mult_chk->x_shares[1],
+                           matrix_A[1]);
+  // five GF_sqr_s calls in a row below: presumably x ^ {2 ^ 5} -- GF_sqr_s is not defined in the C sources, confirm
+  // x ^ {2 ^ e - 1} = pt + ct
+  // --> x ^ {2 ^ e} + x * ct = x * pt
+  // --> z = x * pt
+  GF_sqr_s(mult_chk->z_shares[AIMER_L], mult_chk->x_shares[AIMER_L]);
+  GF_sqr_s(mult_chk->z_shares[AIMER_L], mult_chk->z_shares[AIMER_L]);
+  GF_sqr_s(mult_chk->z_shares[AIMER_L], mult_chk->z_shares[AIMER_L]);
+  GF_sqr_s(mult_chk->z_shares[AIMER_L], mult_chk->z_shares[AIMER_L]);
+  GF_sqr_s(mult_chk->z_shares[AIMER_L], mult_chk->z_shares[AIMER_L]);
+  GF_mul_add(mult_chk->z_shares[AIMER_L], mult_chk->x_shares[AIMER_L], ct_GF);
+}
+
+// Phase 1: commits to every party's seed and tape, computes the per-repetition offsets (delta_pt, delta_ts, delta_c) into sign->proofs, and writes sign->h_1; sign->salt must already be set.
+void run_phase_1(signature_t *sign,
+                 const GF pt_GF, const GF sbox_outputs[AIMER_L],
+                 const uint8_t mu[AIMER_COMMIT_SIZE],
+                 const hash_instance *ctx_tree,
+                 const uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE])
+{
+  uint8_t commits[AIMER_COMMIT_SIZE];
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE];
+
+  hash_instance ctx;
+
+  // hash_instance for h_1
+  hash_init_prefix(&ctx, HASH_PREFIX_1);
+  hash_update(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx, sign->salt, AIMER_SALT_SIZE);
+  // partially absorbed state shared by all seed commitments: prefix 5 followed by the salt
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    // compute parties' seeds using binary tree
+    memcpy(nodes, pre_nodes[rep][0], (2 * PRE_TREE_IDX - 1) * AIMER_SEED_SIZE);
+    expand_tree(nodes, ctx_tree, rep);
+
+    // initialize adjustment values
+    tape_t delta;
+    memset(&delta, 0, sizeof(tape_t));
+
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      // generate execution views and commitments
+      tape_t tape;
+      commit_and_expand_tape(&tape, commits, &ctx_precom,
+                             nodes[AIMER_N + party - 1],  rep, party);
+      hash_update(&ctx, commits, AIMER_COMMIT_SIZE);
+
+      // compute offsets: delta accumulates the sum (XOR) of all parties' tape shares
+      GF_add(delta.pt_share, delta.pt_share, tape.pt_share);
+      GF_add(delta.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+      GF_add(delta.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+      GF_add(delta.a_share, delta.a_share, tape.a_share);
+      GF_add(delta.c_share, delta.c_share, tape.c_share);
+
+      // adjust the last share and prepare the proof and h_1
+      if (party == AIMER_N - 1)
+      {
+        GF_add(delta.pt_share, delta.pt_share, pt_GF);
+        GF_add(delta.t_shares[0], delta.t_shares[0], sbox_outputs[0]);
+        GF_add(delta.t_shares[1], delta.t_shares[1], sbox_outputs[1]);
+        GF_mul_add_s(delta.c_share, pt_GF, delta.a_share); // delta_c = (sum of c shares) + pt * (sum of a shares)
+
+        GF_to_bytes(sign->proofs[rep].delta_pt_bytes, delta.pt_share);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[0], delta.t_shares[0]);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[1], delta.t_shares[1]);
+        GF_to_bytes(sign->proofs[rep].delta_c_bytes, delta.c_share);
+      }
+    }
+
+    // NOTE: depend on the order of values in proof_t
+    hash_update(&ctx, sign->proofs[rep].delta_pt_bytes,
+                AIM2_NUM_BYTES_FIELD * (AIMER_L + 2));
+  }
+  hash_ctx_release(&ctx_precom);
+
+  // commit to salt, (all commitments of parties' seeds,
+  // delta_pt, delta_t, delta_c) for all repetitions
+  hash_final(&ctx);
+  hash_squeeze(&ctx, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+}
+// Re-derives every party's tape, runs the AIM2 MPC simulation and the multiplication check for all parties, and hashes the alpha/v shares of every repetition into sign->h_2; reads sign->h_1, sign->salt and the delta fields of sign->proofs.
+void run_phase_1_to_3(signature_t *sign,
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b, const GF ct_GF,
+                      const hash_instance *ctx_tree,
+                      const uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE])
+{
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE];
+
+  GF pt_shares[AIMER_N];
+  GF alpha_v_shares[2][AIMER_N];
+  GF epsilons[AIMER_L + 1];
+
+  // prepare h2
+  hash_instance ctx;
+  hash_init_prefix(&ctx, HASH_PREFIX_2);
+  hash_update(&ctx, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_update(&ctx, sign->salt, AIMER_SALT_SIZE);
+
+  // prepare epsilons: they are squeezed, repetition by repetition, from a hash of h_1
+  hash_instance ctx_e;
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+  // partially absorbed state shared by all tape derivations: prefix 5 followed by the salt
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  GF alpha = {0,};
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    // compute parties' seeds using binary tree
+    memcpy(nodes, pre_nodes[rep][0], (2 * PRE_TREE_IDX - 1) * AIMER_SEED_SIZE);
+    expand_tree(nodes, ctx_tree, rep);
+
+    // initialize adjustment values
+    tape_t delta;
+    memset(&delta, 0, sizeof(tape_t));
+
+    // expand challenge
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF_set0(alpha);
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      // regenerate the party's tape; the commitment bytes are skipped because they are not needed here
+      tape_t tape;
+      commit_and_expand_tape_phase_3(&tape, &ctx_precom,
+                                     nodes[AIMER_N + party - 1], rep, party);
+
+      mult_chk_t mult_chk;
+      memset(&mult_chk, 0, sizeof(mult_chk));
+
+      // last party only: apply the stored deltas to its tape shares and start x_shares[AIMER_L] from vector_b
+      if (party == AIMER_N - 1)
+      {
+        GF_from_bytes(delta.pt_share, sign->proofs[rep].delta_pt_bytes);
+        GF_from_bytes(delta.t_shares[0], sign->proofs[rep].delta_ts_bytes[0]);
+        GF_from_bytes(delta.t_shares[1], sign->proofs[rep].delta_ts_bytes[1]);
+        GF_from_bytes(delta.c_share, sign->proofs[rep].delta_c_bytes);
+
+        GF_add(tape.pt_share, delta.pt_share, tape.pt_share);
+        GF_add(tape.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+        GF_add(tape.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+        GF_add(tape.c_share, delta.c_share, tape.c_share);
+
+        GF_copy(mult_chk.x_shares[AIMER_L], vector_b);
+      }
+
+      // run the MPC simulation and prepare the mult check inputs
+      GF_copy(mult_chk.x_shares[0], tape.t_shares[0]);
+      GF_copy(mult_chk.x_shares[1], tape.t_shares[1]);
+      GF_copy(pt_shares[party], tape.pt_share);
+      GF_copy(alpha_v_shares[0][party], tape.a_share);
+      GF_copy(alpha_v_shares[1][party], tape.c_share);
+      aim2_mpc(&mult_chk, matrix_A, ct_GF);
+      // alpha share = a + sum_k eps[k] * x[k]; the v share starts as c + sum_k eps[k] * z[k]
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[2], epsilons[2]);
+      GF_add(alpha, alpha, alpha_v_shares[0][party]);
+    }
+
+    // alpha is opened, so we can finish calculating v_share
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      GF_mul_add(alpha_v_shares[1][party], pt_shares[party], alpha);
+    }
+    hash_update(&ctx, (const uint8_t *)alpha_v_shares, sizeof(alpha_v_shares)); // hashed as raw memory: all alpha shares first, then all v shares
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+
+  hash_final(&ctx);
+  hash_squeeze(&ctx, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+}
+// Derives the challenged party index of each repetition from sign->h_2 and fills that repetition's proof fields: reveal_path, missing_commitment and missing_alpha_share_bytes.
+void run_phase_1_to_5(signature_t *sign,
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b,
+                      const hash_instance *ctx_tree,
+                      const uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE])
+{
+  // prepare challenge parties
+  hash_instance ctx;
+  hash_init(&ctx);
+  hash_update(&ctx, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_final(&ctx);
+
+  uint8_t indices[AIMER_T]; // AIMER_N <= 256
+  hash_squeeze(&ctx, indices, AIMER_T);
+  hash_ctx_release(&ctx);
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    indices[rep] &= (1 << AIMER_LOGN) - 1; // keep the low AIMER_LOGN bits of each challenge byte
+  }
+
+  uint8_t commits[AIMER_COMMIT_SIZE];
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE];
+
+  GF epsilons[AIMER_L + 1];
+
+  // prepare epsilons: they are squeezed, repetition by repetition, from a hash of h_1
+  hash_instance ctx_e;
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+  // partially absorbed state shared by all tape derivations: prefix 5 followed by the salt
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  mult_chk_t mult_chk;
+  memset(&mult_chk, 0, sizeof(mult_chk_t));
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    size_t i_bar = indices[rep];
+
+    // compute parties' seeds using binary tree
+    memcpy(nodes, pre_nodes[rep][0], (2 * PRE_TREE_IDX - 1) * AIMER_SEED_SIZE);
+    expand_tree(nodes, ctx_tree, rep);
+    reveal_all_but(sign->proofs[rep].reveal_path,
+                   (const uint8_t (*)[AIMER_SEED_SIZE])nodes, i_bar);
+
+    // expand challenge
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF alpha_share = {0,};
+    // only the challenged party i_bar is recomputed: its commitment goes into the proof as missing_commitment
+    tape_t tape;
+    commit_and_expand_tape(&tape, commits, &ctx_precom,
+                           nodes[AIMER_N + i_bar - 1], rep, i_bar);
+    memcpy(sign->proofs[rep].missing_commitment, commits,
+           AIMER_COMMIT_SIZE);
+
+    GF_set0(mult_chk.x_shares[AIMER_L]);
+
+    // last party only: apply the stored deltas to its tape shares and start x_shares[AIMER_L] from vector_b
+    if (i_bar == AIMER_N - 1)
+    {
+      // initialize adjustment values
+      tape_t delta;
+      memset(&delta, 0, sizeof(tape_t));
+
+      GF_from_bytes(delta.pt_share, sign->proofs[rep].delta_pt_bytes);
+      GF_from_bytes(delta.t_shares[0], sign->proofs[rep].delta_ts_bytes[0]);
+      GF_from_bytes(delta.t_shares[1], sign->proofs[rep].delta_ts_bytes[1]);
+      GF_from_bytes(delta.c_share, sign->proofs[rep].delta_c_bytes);
+
+      GF_add(tape.pt_share, delta.pt_share, tape.pt_share);
+      GF_add(tape.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+      GF_add(tape.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+      GF_add(tape.c_share, delta.c_share, tape.c_share);
+
+      GF_copy(mult_chk.x_shares[AIMER_L], vector_b);
+    }
+
+    for (size_t ell = 0; ell < AIMER_L; ell++)
+    {
+      GF_copy(mult_chk.x_shares[ell], tape.t_shares[ell]);
+
+      // x_* = sum_i A[i] * t[i] + b
+      GF_transposed_matmul_add(mult_chk.x_shares[AIMER_L],
+                               tape.t_shares[ell], matrix_A[ell]);
+    }
+    GF_copy(alpha_share, tape.a_share);
+    // alpha share = a + sum_k eps[k] * x[k]; the z shares are not needed for it
+    GF_mul_add(alpha_share, mult_chk.x_shares[0], epsilons[0]);
+    GF_mul_add(alpha_share, mult_chk.x_shares[1], epsilons[1]);
+    GF_mul_add(alpha_share, mult_chk.x_shares[2], epsilons[2]);
+    GF_to_bytes(sign->proofs[rep].missing_alpha_share_bytes, alpha_share);
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk)
+{
+  // Layout: pk = iv || ct and sk = pt || iv || ct; returns -1 on a NULL buffer.
+  if (pk == NULL || sk == NULL)
+  {
+    return -1;
+  }
+
+  randombytes(sk, AIM2_NUM_BYTES_FIELD); // secret input pt
+  randombytes(pk, AIM2_IV_SIZE);         // public initial vector iv
+  uint8_t *ct = pk + AIM2_IV_SIZE;       // output of aim2 is written right after iv
+  aim2(ct, sk, pk);
+  memcpy(&sk[AIM2_NUM_BYTES_FIELD], pk, AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  return 0;
+}
+
+int crypto_sign_signature(uint8_t *sig, size_t *siglen, // writes CRYPTO_BYTES to sig; always returns 0
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk)
+{
+  signature_t *sign = (signature_t *)sig; // the signature is assembled in place in the caller's buffer
+
+  GF pt_GF = {0,}, ct_GF = {0,};
+  GF_from_bytes(pt_GF, sk); // sk starts with the secret input pt
+  GF_from_bytes(ct_GF, sk + AIM2_NUM_BYTES_FIELD + AIM2_IV_SIZE); // ct is the last field element of sk
+
+  // message pre-hashing: mu = H(prefix_0 || iv || ct || m), using the public-key copy stored in sk
+  hash_instance ctx;
+  hash_init_prefix(&ctx, HASH_PREFIX_0);
+  hash_update(&ctx, sk + AIM2_NUM_BYTES_FIELD,
+              AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx, m, mlen);
+  hash_final(&ctx);
+
+  uint8_t mu[AIMER_COMMIT_SIZE];
+  hash_squeeze(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+
+  // compute first L sboxes' outputs
+  GF sbox_outputs[AIMER_L];
+  aim2_sbox_outputs(sbox_outputs, pt_GF);
+
+  // derive the binary matrix and the vector from the initial vector
+  GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+  generate_matrix_LU(matrix_A, vector_b, sk + AIM2_NUM_BYTES_FIELD);
+
+  // generate per-signature randomness
+  uint8_t random[SECURITY_BYTES];
+  randombytes(random, SECURITY_BYTES);
+
+  // generate salt and root seeds from one stream keyed by (pt, mu, random)
+  hash_instance ctx_roots;
+  hash_init_prefix(&ctx_roots, HASH_PREFIX_3);
+  hash_update(&ctx_roots, sk, AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx_roots, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_roots, random, SECURITY_BYTES);
+  hash_final(&ctx_roots);
+  hash_squeeze(&ctx_roots, sign->salt, AIMER_SALT_SIZE); // first squeeze: salt, written straight into the signature
+
+  uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE];
+  hash_squeeze(&ctx_roots, (uint8_t *)root_seeds, AIMER_T * AIMER_SEED_SIZE); // second squeeze: one root per repetition
+  hash_ctx_release(&ctx_roots);
+
+  uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE] = {0,};
+
+  hash_instance ctx_tree; // shared prefix (HASH_PREFIX_4 || salt) cloned for every tree node hash
+  hash_init_prefix(&ctx_tree, HASH_PREFIX_4);
+  hash_update(&ctx_tree, sign->salt, AIMER_SALT_SIZE);
+
+  pre_expand_trees(pre_nodes, &ctx_tree, root_seeds);
+
+  run_phase_1(sign, pt_GF, (const GF *)sbox_outputs, mu,
+              &ctx_tree,
+              (const uint8_t (*)[2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE])pre_nodes);
+  run_phase_1_to_3(sign, (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A,
+                   vector_b, ct_GF, &ctx_tree,
+                   (const uint8_t (*)[2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE])pre_nodes);
+  run_phase_1_to_5(sign, (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A,
+                   vector_b, &ctx_tree,
+                   (const uint8_t (*)[2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE])pre_nodes);
+
+  hash_ctx_release(&ctx_tree);
+
+  *siglen = CRYPTO_BYTES; // NOTE(review): pt_GF, random, root_seeds and pre_nodes are not wiped before returning
+
+  return 0;
+}
+
+int crypto_sign(uint8_t *sm, size_t *smlen, // out: sm = m || signature, *smlen = mlen + signature length
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk)
+{
+  // The signature is written after the message slot; keep the signer's status.
+  int ret = crypto_sign_signature(sm + mlen, smlen, m, mlen, sk);
+
+  memmove(sm, m, mlen); // memmove, not memcpy: signing in place (sm == m) must stay defined
+  *smlen += mlen;
+
+  return ret;
+}
+
+int crypto_sign_verify(const uint8_t *sig, // returns 0 when both recomputed hashes match the signature, -1 otherwise
+        size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk)
+{
+  if (siglen != CRYPTO_BYTES) // signatures have a fixed length
+  {
+    return -1;
+  }
+
+  const signature_t *sign = (const signature_t *)sig;
+
+  GF ct_GF = {0,};
+  GF_from_bytes(ct_GF, pk + AIM2_IV_SIZE); // pk = iv || ct
+
+  // derive the binary matrix and the vector from the initial vector
+  GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+  generate_matrix_LU(matrix_A, vector_b, pk);
+
+  hash_instance ctx_e, ctx_h1, ctx_h2;
+
+  // indices = Expand(h_2)
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  uint8_t indices[AIMER_T]; // AIMER_N <= 256
+  hash_squeeze(&ctx_e, indices, AIMER_T);
+  hash_ctx_release(&ctx_e);
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    indices[rep] &= (1 << AIMER_LOGN) - 1; // keep only the low AIMER_LOGN bits
+  }
+
+  // epsilons = Expand(h_1); ctx_e is reused and squeezed once per repetition below
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  // message pre-hashing: mu = H(prefix_0 || pk || m)
+  uint8_t mu[AIMER_COMMIT_SIZE];
+  hash_init_prefix(&ctx_h1, HASH_PREFIX_0);
+  hash_update(&ctx_h1, pk, AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx_h1, m, mlen);
+  hash_final(&ctx_h1);
+  hash_squeeze(&ctx_h1, mu, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h1);
+
+  // ready for computing h_1' and h_2'
+  hash_init_prefix(&ctx_h1, HASH_PREFIX_1);
+  hash_update(&ctx_h1, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_h1, sign->salt, AIMER_SALT_SIZE);
+
+  hash_init_prefix(&ctx_h2, HASH_PREFIX_2);
+  hash_update(&ctx_h2, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_h2, sign->salt, AIMER_SALT_SIZE);
+
+  hash_instance ctx_precom; // shared prefix (HASH_PREFIX_5 || salt) handed to commit_and_expand_tape
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    size_t i_bar = indices[rep]; // party whose seed was not revealed
+    uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE]; // tree without its root: party p's leaf is nodes[AIMER_N + p - 2]
+
+    reconstruct_tree(nodes, sign->salt, sign->proofs[rep].reveal_path,
+                     rep, i_bar);
+
+    GF pt_shares[AIMER_N]; // entry i_bar is never written nor read
+    GF alpha_v_shares[2][AIMER_N]; // row 0: alpha shares, row 1: v shares
+    GF_set0(alpha_v_shares[1][i_bar]); // accumulates the sum of the other parties' v shares below
+
+    GF epsilons[AIMER_L + 1];
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF alpha = {0,};
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      if (party == i_bar)
+      {
+        // hidden party: its commitment and alpha share come from the signature
+        hash_update(&ctx_h1, sign->proofs[rep].missing_commitment,
+                    AIMER_COMMIT_SIZE);
+        GF_from_bytes(alpha_v_shares[0][i_bar],
+                      sign->proofs[rep].missing_alpha_share_bytes);
+        GF_add(alpha, alpha, alpha_v_shares[0][i_bar]);
+        continue;
+      }
+
+      tape_t tape;
+      uint8_t commit[AIMER_COMMIT_SIZE];
+      commit_and_expand_tape(&tape, commit, &ctx_precom,
+                             nodes[AIMER_N + party - 2], rep, party);
+      hash_update(&ctx_h1, commit, AIMER_COMMIT_SIZE);
+
+      // adjust last shares
+      mult_chk_t mult_chk;
+      memset(&mult_chk, 0, sizeof(mult_chk_t));
+      if (party == AIMER_N - 1)
+      {
+        GF temp = {0,};
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_pt_bytes);
+        GF_add(tape.pt_share, tape.pt_share, temp);
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[0]);
+        GF_add(tape.t_shares[0], tape.t_shares[0], temp);
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[1]);
+        GF_add(tape.t_shares[1], tape.t_shares[1], temp);
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_c_bytes);
+        GF_add(tape.c_share, tape.c_share, temp);
+
+        GF_copy(mult_chk.x_shares[AIMER_L], vector_b); // the constant b enters through the last party only
+      }
+
+      // run the MPC simulation and prepare the mult check inputs
+      GF_copy(mult_chk.x_shares[0], tape.t_shares[0]);
+      GF_copy(mult_chk.x_shares[1], tape.t_shares[1]);
+      GF_copy(pt_shares[party], tape.pt_share);
+      GF_copy(alpha_v_shares[0][party], tape.a_share);
+      GF_copy(alpha_v_shares[1][party], tape.c_share);
+      aim2_mpc(&mult_chk, (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A, ct_GF);
+
+      // alpha share += x_k * eps_k and v share += z_k * eps_k for k = 0..2 (indices are hard-coded)
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[2], epsilons[2]);
+
+      GF_add(alpha, alpha, alpha_v_shares[0][party]);
+    }
+
+    // alpha is opened, so we can finish calculating v_share
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      if (party == i_bar)
+      {
+        continue;
+      }
+
+      GF_mul_add(alpha_v_shares[1][party], pt_shares[party], alpha);
+      GF_add(alpha_v_shares[1][i_bar], alpha_v_shares[1][i_bar],
+             alpha_v_shares[1][party]);
+    }
+
+    // v is opened: both rows of shares are absorbed into h_2'
+    hash_update(&ctx_h2, (const uint8_t *)alpha_v_shares,
+                sizeof(alpha_v_shares));
+
+    // NOTE: one call absorbs delta_pt, delta_ts and delta_c; relies on these members being contiguous in proof_t
+    hash_update(&ctx_h1, sign->proofs[rep].delta_pt_bytes,
+                AIM2_NUM_BYTES_FIELD * (AIMER_L + 2));
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+
+  uint8_t h_1_prime[AIMER_COMMIT_SIZE];
+  hash_final(&ctx_h1);
+  hash_squeeze(&ctx_h1, h_1_prime, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h1);
+
+  uint8_t h_2_prime[AIMER_COMMIT_SIZE];
+  hash_final(&ctx_h2);
+  hash_squeeze(&ctx_h2, h_2_prime, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h2);
+
+  if (memcmp(h_1_prime, sign->h_1, AIMER_COMMIT_SIZE) != 0 || // accept only if both recomputed hashes match
+      memcmp(h_2_prime, sign->h_2, AIMER_COMMIT_SIZE) != 0)
+  {
+    return -1;
+  }
+
+  return 0;
+}
+
+int crypto_sign_open(uint8_t *m, size_t *mlen,
+        const uint8_t *sm, size_t smlen,
+        const uint8_t *pk)
+{
+  // A signed message is the payload followed by CRYPTO_BYTES of signature,
+  // so anything shorter than one signature is rejected up front.
+  if (smlen < CRYPTO_BYTES)
+  {
+    return -1;
+  }
+
+  const size_t payload_len = smlen - CRYPTO_BYTES;
+  const int status = crypto_sign_verify(sm + payload_len, CRYPTO_BYTES,
+                                        sm, payload_len, pk);
+  if (status != 0)
+  {
+    // m and *mlen are left untouched when verification fails
+    return -1;
+  }
+
+  // memmove tolerates m overlapping sm
+  memmove(m, sm, payload_len);
+  *mlen = payload_len;
+  return 0;
+}
diff --git a/crypto_sign/aimer192s/m4speed/sign.h b/crypto_sign/aimer192s/m4speed/sign.h
new file mode 100644
index 00000000..0c168ee0
--- /dev/null
+++ b/crypto_sign/aimer192s/m4speed/sign.h
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef SIGN_H
+#define SIGN_H
+
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+typedef struct tape_t // per-party values filled by commit_and_expand_tape
+{
+  GF pt_share;          // share of the secret input pt
+  GF t_shares[AIMER_L]; // shares used as the first AIMER_L multiplication-check x inputs
+  GF a_share;           // starting value of the party's alpha share
+  GF c_share;           // starting value of the party's v share
+} tape_t;
+
+typedef struct mult_chk_t // one party's inputs to the multiplication check
+{
+  GF x_shares[AIMER_L + 1]; // [0..AIMER_L-1] copy t_shares; [AIMER_L] accumulates sum_i A[i] * t[i] (+ b for the last party)
+  GF z_shares[AIMER_L + 1]; // zeroed by callers before aim2_mpc; presumably filled by aim2_mpc - confirm in aim2.c
+} mult_chk_t;
+
+typedef struct proof_t // per-repetition part of a signature; all members are byte arrays
+{
+  uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE]; // sibling seeds, [0] is the leaf-level sibling of the hidden party
+  uint8_t missing_commitment[AIMER_COMMIT_SIZE];    // commitment of the hidden party
+  uint8_t delta_pt_bytes[AIM2_NUM_BYTES_FIELD];     // keep the three delta_* members adjacent and in this order:
+  uint8_t delta_ts_bytes[AIMER_L][AIM2_NUM_BYTES_FIELD]; // verification hashes (AIMER_L + 2) field elements
+  uint8_t delta_c_bytes[AIM2_NUM_BYTES_FIELD];      // starting at delta_pt_bytes in a single call
+  uint8_t missing_alpha_share_bytes[AIM2_NUM_BYTES_FIELD]; // alpha share of the hidden party
+} proof_t;
+
+typedef struct signature_t // in-memory view of the signature buffer (callers cast uint8_t * to this type)
+{
+  uint8_t salt[AIMER_SALT_SIZE];  // per-signature salt mixed into tree and commitment hashes
+  uint8_t h_1[AIMER_COMMIT_SIZE]; // expanded into the epsilons challenge
+  uint8_t h_2[AIMER_COMMIT_SIZE]; // expanded into the hidden party indices
+  proof_t proofs[AIMER_T];        // one proof per repetition
+} signature_t;
+
+#define aim2_mpc AIMER_NAMESPACE(aim2_mpc)
+void aim2_mpc(mult_chk_t *mult_chk, // in/out: callers preset x_shares[0..AIMER_L] and read x_shares/z_shares afterwards
+              const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+              const GF ct_GF);
+
+#define commit_and_expand_tape AIMER_NAMESPACE(commit_and_expand_tape)
+void commit_and_expand_tape(tape_t *tape, uint8_t *commit, // out: tape and an AIMER_COMMIT_SIZE-byte commitment
+                            const hash_instance *ctx_precom, // callers pass a context preloaded with HASH_PREFIX_5 || salt
+                            const uint8_t seed[AIMER_SEED_SIZE],
+                            size_t rep, size_t party);
+
+#define commit_and_expand_tape_phase_3 AIMER_NAMESPACE(commit_and_expand_tape_phase_3)
+void commit_and_expand_tape_phase_3(tape_t *tape, // same inputs as commit_and_expand_tape, without the commit output
+                                    const hash_instance *ctx_precom,
+                                    const uint8_t seed[AIMER_SEED_SIZE],
+                                    size_t rep, size_t party);
+
+#define run_phase_1 AIMER_NAMESPACE(run_phase_1)
+void run_phase_1(signature_t *sign, // first of three signing passes called in order by crypto_sign_signature
+                 const GF pt_GF, const GF sbox_outputs[AIMER_L],
+                 const uint8_t mu[AIMER_COMMIT_SIZE],
+                 const hash_instance *ctx_tree,
+                 const uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE]);
+
+#define run_phase_1_to_3 AIMER_NAMESPACE(run_phase_1_to_3)
+void run_phase_1_to_3(signature_t *sign, // second pass: ends by squeezing sign->h_2
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b, const GF ct_GF,
+                      const hash_instance *ctx_tree,
+                      const uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE]);
+
+#define run_phase_1_to_5 AIMER_NAMESPACE(run_phase_1_to_5)
+void run_phase_1_to_5(signature_t *sign, // third pass: writes reveal paths, missing commitments and missing alpha shares
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b,
+                      const hash_instance *ctx_tree,
+                      const uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE]);
+
+#endif // SIGN_H
diff --git a/crypto_sign/aimer192s/m4speed/tree.c b/crypto_sign/aimer192s/m4speed/tree.c
new file mode 100644
index 00000000..3f4fa554
--- /dev/null
+++ b/crypto_sign/aimer192s/m4speed/tree.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: MIT
+
+#include "tree.h"
+#include "hash.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+//  Example of tree for [N = 8]
+//  x
+//  d = 0: 1
+//  d = 1: 2         3
+//  d = 2: 4   5     6     7
+//  d = 3: 8 9 10 11 12 13 14 15
+
+void pre_expand_trees(uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE],
+                      const hash_instance *ctx_tree,
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE])
+{
+  // Expands the top of every repetition's seed tree: node k (1-based, stored
+  // at [k - 1]) is hashed as rep || k || seed into its children 2k and 2k + 1.
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    uint8_t (*tree)[AIMER_SEED_SIZE] = pre_nodes[rep];
+    uint8_t msg[2 + AIMER_SEED_SIZE];
+    memcpy(tree[0], root_seeds[rep], AIMER_SEED_SIZE);
+    msg[0] = (uint8_t)rep;
+    for (size_t parent = 1; parent < PRE_TREE_IDX; parent++)
+    {
+      hash_instance node_ctx;
+      msg[1] = (uint8_t)parent;
+      memcpy(&msg[2], tree[parent - 1], AIMER_SEED_SIZE);
+
+      hash_ctx_clone(&node_ctx, ctx_tree);
+      hash_update(&node_ctx, msg, sizeof(msg));
+      hash_final(&node_ctx);
+      hash_squeeze(&node_ctx, tree[2 * parent - 1], 2 * AIMER_SEED_SIZE);
+      hash_ctx_release(&node_ctx);
+    }
+  }
+}
+
+void expand_tree(uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                 const hash_instance *ctx_tree,
+                 size_t rep_index)
+{
+  // Finishes one repetition's tree below the pre-expanded part: each node k
+  // (1-based, stored at nodes[k - 1]) yields children 2k and 2k + 1 from the
+  // ctx_tree state extended with rep_index || k || seed (indices cut to 8 bits).
+  uint8_t msg[2 + AIMER_SEED_SIZE] = {(uint8_t)rep_index};
+
+  for (size_t parent = PRE_TREE_IDX; parent < AIMER_N; parent++)
+  {
+    hash_instance node_ctx;
+
+    msg[1] = (uint8_t)parent;
+    memcpy(&msg[2], nodes[parent - 1], AIMER_SEED_SIZE);
+    hash_ctx_clone(&node_ctx, ctx_tree);
+    hash_update(&node_ctx, msg, sizeof(msg));
+    hash_final(&node_ctx);
+    hash_squeeze(&node_ctx, nodes[2 * parent - 1], 2 * AIMER_SEED_SIZE);
+    hash_ctx_release(&node_ctx);
+  }
+}
+
+void reveal_all_but(uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                    const uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                    size_t cover_index)
+{
+  // Walk from the hidden leaf (1-based heap index cover_index + AIMER_N) up
+  // towards the root, recording the sibling seed at every level; slot 0 of
+  // reveal_path receives the leaf-level sibling.
+  const size_t leaf = cover_index + AIMER_N;
+  for (size_t level = 0; level < AIMER_LOGN; level++)
+  {
+    const size_t sibling = (leaf >> level) ^ 1; // flip the lowest bit
+    memcpy(reveal_path[level], nodes[sibling - 1], AIMER_SEED_SIZE);
+  }
+}
+
+void reconstruct_tree(uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE], // out: tree without root, 1-based node k at nodes[k - 2]
+                      const uint8_t salt[AIMER_SALT_SIZE],
+                      const uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                      size_t rep_index,
+                      size_t cover_index)
+{
+  size_t index, depth, path;
+  uint8_t buffer[AIMER_SEED_SIZE + 2]; // rep_index || node index || seed
+
+  hash_instance ctx, ctx_; // ctx_ holds the shared prefix (HASH_PREFIX_4 || salt), ctx is a per-node clone
+  hash_init_prefix(&ctx_, HASH_PREFIX_4);
+  hash_update(&ctx_, salt, AIMER_SALT_SIZE);
+
+  for (depth = 1; depth < AIMER_LOGN; depth++)
+  {
+    path = ((cover_index + AIMER_N) >> (AIMER_LOGN - depth)) ^ 1; // sibling of the hidden path at this depth
+    memcpy(nodes[path - 2], reveal_path[AIMER_LOGN - depth], AIMER_SEED_SIZE);
+
+    // every node of this depth is expanded, including the hidden-path node whose seed was never written here
+    for (index = (1U << depth); index < (2U << depth); index++)
+    {
+      buffer[0] = (uint8_t)(rep_index);
+      buffer[1] = (uint8_t)(index);
+      memcpy(buffer + 2, nodes[index - 2], AIMER_SEED_SIZE);
+
+      hash_ctx_clone(&ctx, &ctx_);
+      hash_update(&ctx, buffer, AIMER_SEED_SIZE + 2);
+      hash_final(&ctx);
+      hash_squeeze(&ctx, nodes[2 * index - 2], AIMER_SEED_SIZE << 1); // writes both children at once
+      hash_ctx_release(&ctx);
+    }
+  }
+  hash_ctx_release(&ctx_);
+
+  path = ((cover_index + AIMER_N) >> (AIMER_LOGN - depth)) ^ 1; // depth == AIMER_LOGN here: the leaf-level sibling
+  memcpy(nodes[path - 2], reveal_path[AIMER_LOGN - depth], AIMER_SEED_SIZE);
+}
diff --git a/crypto_sign/aimer192s/m4speed/tree.h b/crypto_sign/aimer192s/m4speed/tree.h
new file mode 100644
index 00000000..364c85f7
--- /dev/null
+++ b/crypto_sign/aimer192s/m4speed/tree.h
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef TREE_H
+#define TREE_H
+
+#include "params.h"
+#include "hash.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define pre_expand_trees AIMER_NAMESPACE(pre_expand_trees)
+void pre_expand_trees(uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE], // out: top tree levels per repetition
+                      const hash_instance *ctx_tree,
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE]);
+
+#define expand_tree AIMER_NAMESPACE(expand_tree)
+void expand_tree(uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE], // in/out: first 2 * PRE_TREE_IDX - 1 nodes must be filled
+                 const hash_instance *ctx_tree,
+                 size_t rep_index);
+
+#define reveal_all_but AIMER_NAMESPACE(reveal_all_but)
+void reveal_all_but(uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE], // out: sibling seeds, leaf level first
+                    const uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                    size_t cover_index);
+
+#define reconstruct_tree AIMER_NAMESPACE(reconstruct_tree)
+void reconstruct_tree(uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE], // out: tree stored without its root node
+                      const uint8_t salt[AIMER_SALT_SIZE],
+                      const uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                      size_t rep_index,
+                      size_t cover_index);
+
+#endif // TREE_H
diff --git a/crypto_sign/aimer192s/m4stack/__asm_field.S b/crypto_sign/aimer192s/m4stack/__asm_field.S
new file mode 100644
index 00000000..26575c28
--- /dev/null
+++ b/crypto_sign/aimer192s/m4stack/__asm_field.S
@@ -0,0 +1,617 @@
+#include "params.h"
+
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+.text
+
+.align  2
+.global AIMER_NAMESPACE(GF_to_bytes)
+.global AIMER_NAMESPACE(GF_from_bytes)
+.global AIMER_NAMESPACE(GF_copy)
+.type   AIMER_NAMESPACE(GF_to_bytes), %function
+.type   AIMER_NAMESPACE(GF_from_bytes), %function
+.type   AIMER_NAMESPACE(GF_copy), %function
+AIMER_NAMESPACE(GF_to_bytes):    // three entry points share one body:
+AIMER_NAMESPACE(GF_from_bytes):  // copy six 32-bit words (24 bytes) from [R1] to [R0]
+AIMER_NAMESPACE(GF_copy):        // clobbers R2 and R3 only
+  out_p       .req R0
+  in_p        .req R1
+
+  .equ width, 4  // bytes per word
+
+  ldr.w R2, [in_p, #0 * width]  // NOTE(review): word accesses; C callers pass uint8_t proof fields here,
+  ldr.w R3, [in_p, #1 * width]  // confirm unaligned word loads and stores are acceptable on the target
+  str.w R2, [out_p, #0 * width]
+  str.w R3, [out_p, #1 * width]
+
+  ldr.w R2, [in_p, #2 * width]
+  ldr.w R3, [in_p, #3 * width]
+  str.w R2, [out_p, #2 * width]
+  str.w R3, [out_p, #3 * width]
+
+  ldr.w R2, [in_p, #4 * width]
+  ldr.w R3, [in_p, #5 * width]
+  str.w R2, [out_p, #4 * width]
+  str.w R3, [out_p, #5 * width]
+
+  bx    lr
+
+  .unreq in_p
+  .unreq out_p
+
+.align  2
+.global AIMER_NAMESPACE(GF_set0)
+.type   AIMER_NAMESPACE(GF_set0), %function
+AIMER_NAMESPACE(GF_set0):  // stores six zero words (24 bytes) at [R0]; clobbers R2
+  out_p       .req R0
+
+  .equ width, 4  // bytes per word
+
+  mov.w R2, #0
+  str.w R2, [out_p, #0 * width]
+  str.w R2, [out_p, #1 * width]
+  str.w R2, [out_p, #2 * width]
+  str.w R2, [out_p, #3 * width]
+  str.w R2, [out_p, #4 * width]
+  str.w R2, [out_p, #5 * width]
+
+  bx    lr
+
+  .unreq out_p
+
+.align  2
+.global AIMER_NAMESPACE(GF_add)
+.type   AIMER_NAMESPACE(GF_add), %function
+AIMER_NAMESPACE(GF_add):  // [R0] = [R1] xor [R2] over six words; clobbers R3 and R12
+  out_p       .req R0
+  in0_p       .req R1
+  in1_p       .req R2
+
+  .equ width, 4  // bytes per word
+
+  ldr.w R3,  [in0_p, #0 * width]  // each word is loaded from both inputs before the store,
+  ldr.w R12, [in1_p, #0 * width]  // so out_p may be the same address as in0_p or in1_p
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #0 * width]
+
+  ldr.w R3,  [in0_p, #1 * width]
+  ldr.w R12, [in1_p, #1 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #1 * width]
+
+  ldr.w R3,  [in0_p, #2 * width]
+  ldr.w R12, [in1_p, #2 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #2 * width]
+
+  ldr.w R3,  [in0_p, #3 * width]
+  ldr.w R12, [in1_p, #3 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #3 * width]
+
+  ldr.w R3,  [in0_p, #4 * width]
+  ldr.w R12, [in1_p, #4 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #4 * width]
+
+  ldr.w R3,  [in0_p, #5 * width]
+  ldr.w R12, [in1_p, #5 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #5 * width]
+
+  bx    lr
+
+  .unreq out_p
+  .unreq in0_p
+  .unreq in1_p
+
+.macro or_shift_and in_a, con_a, off_a  // in_a = (in_a | (in_a << off_a)) & con_a, one bit-spreading step
+  orr.w \in_a, \in_a, \in_a, lsl #\off_a
+  and.w \in_a, \in_a, \con_a
+.endm
+
+.align  2
+.global AIMER_NAMESPACE(GF_sqr_s)
+.type   AIMER_NAMESPACE(GF_sqr_s), %function
+AIMER_NAMESPACE(GF_sqr_s):  // squares the 192-bit element at [R1] into [R0]: spread bits, then fold temp[3..5] back
+  out_p       .req R0
+  in_p        .req R1
+
+  in0         .req R2  // in0..in7 hold spread 32-bit halves; these aliases are not released by .unreq below
+  in1         .req R3
+  in2         .req R4
+  in3         .req R5
+  in4         .req R6
+  in5         .req R7
+  in6         .req R8
+  in7         .req R9
+
+  .equ C0,    1431655765  // 0x55555555
+  .equ C1,    858993459   // 0x33333333
+  .equ C2,    252645135   // 0x0F0F0F0F
+  .equ C3,    16711935    // 0x00FF00FF
+  .equ C4,    4294967295  // 0xFFFFFFFF
+  .equ width, 4
+
+  push.w {R4-R10, lr}
+
+  ldr.w in0, [in_p, #2 * width]  // a[1]
+  ldr.w in2, [in_p, #3 * width]
+  ldr.w in4, [in_p, #4 * width]  // a[2]
+  ldr.w in6, [in_p, #5 * width]
+
+  lsr.w in1, in0, #16  // odd registers take the upper halfword of each input word
+  lsr.w in3, in2, #16
+  lsr.w in5, in4, #16
+  lsr.w in7, in6, #16
+
+  mov.w R10, C4
+
+  and.w in0, in0, R10, lsr #16  // even registers keep the lower halfword (mask 0x0000FFFF)
+  and.w in2, in2, R10, lsr #16
+  and.w in4, in4, R10, lsr #16
+  and.w in6, in6, R10, lsr #16
+
+  or_shift_and in0, C3, 8  // four spreading rounds (8, 4, 2, 1) interleave a zero after every input bit
+  or_shift_and in1, C3, 8
+  or_shift_and in2, C3, 8
+  or_shift_and in3, C3, 8
+  or_shift_and in4, C3, 8
+  or_shift_and in5, C3, 8
+  or_shift_and in6, C3, 8
+  or_shift_and in7, C3, 8
+
+  or_shift_and in0, C2, 4
+  or_shift_and in1, C2, 4
+  or_shift_and in2, C2, 4
+  or_shift_and in3, C2, 4
+  or_shift_and in4, C2, 4
+  or_shift_and in5, C2, 4
+  or_shift_and in6, C2, 4
+  or_shift_and in7, C2, 4
+
+  or_shift_and in0, C1, 2
+  or_shift_and in1, C1, 2
+  or_shift_and in2, C1, 2
+  or_shift_and in3, C1, 2
+  or_shift_and in4, C1, 2
+  or_shift_and in5, C1, 2
+  or_shift_and in6, C1, 2
+  or_shift_and in7, C1, 2
+
+  or_shift_and in0, C0, 1
+  or_shift_and in1, C0, 1
+  or_shift_and in2, C0, 1
+  or_shift_and in3, C0, 1
+  or_shift_and in4, C0, 1
+  or_shift_and in5, C0, 1
+  or_shift_and in6, C0, 1
+  or_shift_and in7, C0, 1
+
+  // now in0:in1 = temp[2], in2:in3 = temp[3], in4:in5 = temp[4], in6:in7 = temp[5] (low word first)
+  // t = temp[3] ^ ((temp[5] >> 57) ^ (temp[5] >> 62) ^ (temp[5] >> 63));
+  eor.w in2, in2, in7, lsr #25
+  eor.w in2, in2, in7, lsr #30
+  eor.w in2, in2, in7, lsr #31
+
+  // c[2] = temp[2] ^ temp[5];
+  eor.w in0, in0, in6
+  eor.w in1, in1, in7
+
+  // c[2] ^= (temp[5] << 7) | (temp[4] >> 57);
+  // c[2] ^= (temp[5] << 2) | (temp[4] >> 62);
+  // c[2] ^= (temp[5] << 1) | (temp[4] >> 63);
+  eor.w in0, in0, in5, lsr #25
+  eor.w in0, in0, in5, lsr #30
+  eor.w in0, in0, in5, lsr #31
+
+  eor.w in0, in0, in6, lsl #7
+  eor.w in0, in0, in6, lsl #2
+  eor.w in0, in0, in6, lsl #1
+
+  eor.w in1, in1, in6, lsr #25
+  eor.w in1, in1, in6, lsr #30
+  eor.w in1, in1, in6, lsr #31
+
+  eor.w in1, in1, in7, lsl #7
+  eor.w in1, in1, in7, lsl #2
+  eor.w in1, in1, in7, lsl #1
+
+  str.w in0, [out_p, #4 * width]
+  str.w in1, [out_p, #5 * width]
+
+  ldr.w in0, [in_p, #0 * width]  // a[0]; spreads into in0:in1 = temp[0] and in6:in7 = temp[1]
+  ldr.w in6, [in_p, #1 * width]
+
+  lsr.w in1, in0, #16
+  lsr.w in7, in6, #16
+
+  and.w in0, in0, R10, lsr #16
+  and.w in6, in6, R10, lsr #16
+
+  or_shift_and in0, C3, 8
+  or_shift_and in1, C3, 8
+  or_shift_and in6, C3, 8
+  or_shift_and in7, C3, 8
+
+  or_shift_and in0, C2, 4
+  or_shift_and in1, C2, 4
+  or_shift_and in6, C2, 4
+  or_shift_and in7, C2, 4
+
+  or_shift_and in0, C1, 2
+  or_shift_and in1, C1, 2
+  or_shift_and in6, C1, 2
+  or_shift_and in7, C1, 2
+
+  or_shift_and in0, C0, 1
+  or_shift_and in1, C0, 1
+  or_shift_and in6, C0, 1
+  or_shift_and in7, C0, 1
+
+  // c[1] = temp[1] ^ temp[4];
+  eor.w in6, in6, in4
+  eor.w in7, in7, in5
+
+  // c[1] ^= (temp[4] << 7) | (t >> 57);
+  // c[1] ^= (temp[4] << 2) | (t >> 62);
+  // c[1] ^= (temp[4] << 1) | (t >> 63);
+  eor.w in6, in6, in3, lsr #25
+  eor.w in6, in6, in3, lsr #30
+  eor.w in6, in6, in3, lsr #31
+
+  eor.w in6, in6, in4, lsl #7
+  eor.w in6, in6, in4, lsl #2
+  eor.w in6, in6, in4, lsl #1
+
+  eor.w in7, in7, in4, lsr #25
+  eor.w in7, in7, in4, lsr #30
+  eor.w in7, in7, in4, lsr #31
+
+  eor.w in7, in7, in5, lsl #7
+  eor.w in7, in7, in5, lsl #2
+  eor.w in7, in7, in5, lsl #1
+
+  str.w in6, [out_p, #2 * width]
+  str.w in7, [out_p, #3 * width]
+
+  // c[0] = temp[0] ^ t;
+  eor.w in0, in0, in2
+  eor.w in1, in1, in3
+
+  // c[0] ^= (t << 7);
+  // c[0] ^= (t << 2);
+  // c[0] ^= (t << 1);
+  eor.w in0, in0, in2, lsl #7
+  eor.w in0, in0, in2, lsl #2
+  eor.w in0, in0, in2, lsl #1
+
+  eor.w in1, in1, in2, lsr #25
+  eor.w in1, in1, in2, lsr #30
+  eor.w in1, in1, in2, lsr #31
+
+  eor.w in1, in1, in3, lsl #7
+  eor.w in1, in1, in3, lsl #2
+  eor.w in1, in1, in3, lsl #1
+
+  str.w in0, [out_p, #0 * width]
+  str.w in1, [out_p, #1 * width]
+
+  pop.w {R4-R10, pc}
+
+  // unlink register name
+  .unreq in_p
+  .unreq out_p
+
+.macro lut_access0_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset  // first window: starts the accumulator, no shift
+  and \sp1, \mask, \b0_1, lsr #\offset  // byte offset of the table entry selected by the high word of b
+  and \sp0, \mask, \b0_0, lsr #\offset  // byte offset of the table entry selected by the low word of b
+  add \sp1, \sp1, sp  // the table base is the current stack pointer
+  add \sp0, \sp0, sp
+
+  ldr \out1_0, [\sp1, #0]
+  ldr \in0_2, [\sp1, #4]  // high-word entry lands one word higher: its upper half starts in0_2
+
+  ldr \in0_0, [\sp0, #0]
+  ldr \in0_1, [\sp0, #4]
+  eor \in0_1, \in0_1, \out1_0
+.endm
+
+.macro lut_access0_1_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset  // second window: first shift, creates in0_3
+  and \sp1, \mask, \b0_1, lsr #\offset
+  and \sp0, \mask, \b0_0, lsr #\offset
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+
+  lsr \in0_3, \in0_2, #28  // in0_3 is not live yet, so it is set from the bits shifted out of in0_2
+  ldr \out1_0, [\sp1, #0]
+  ldr \out1_1, [\sp1, #4]
+
+  lsl \in0_2, \in0_2, #4  // accumulator in0_3:in0_2:in0_1:in0_0 is shifted left by 4 bits
+  ldr \out0_0, [\sp0, #0]
+  ldr \out0_1, [\sp0, #4]
+
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  lsl \in0_1, \in0_1, #4
+  orr \in0_1, \in0_1, \in0_0, lsr #28
+  lsl \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0  // add the high-word entry at words 1..2
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0  // add the low-word entry at words 0..1
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro lut_access0_1 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset  // middle windows: shift all four words by 4, then add two entries
+  and \sp1, \mask, \b0_1, lsr #\offset
+  and \sp0, \mask, \b0_0, lsr #\offset
+  lsl \in0_3, \in0_3, #4  // in0_3 is live here, so it is shifted before taking bits from in0_2
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+
+  orr \in0_3, \in0_3, \in0_2, lsr #28
+  ldr \out1_0, [\sp1, #0]
+  ldr \out1_1, [\sp1, #4]
+
+  lsl \in0_2, \in0_2, #4
+  ldr \out0_0, [\sp0, #0]
+  ldr \out0_1, [\sp0, #4]
+
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  lsl \in0_1, \in0_1, #4
+  orr \in0_1, \in0_1, \in0_0, lsr #28
+  lsl \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0  // add the high-word entry at words 1..2
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0  // add the low-word entry at words 0..1
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro lut_access1 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset  // last window: index is formed with a left shift instead of a right shift
+  lsl \in0_3, \in0_3, #4
+  and \sp1, \mask, \b0_1, lsl #\offset  // lsl places the lowest bits of the word under the mask
+  orr \in0_3, \in0_3, \in0_2, lsr #28
+  and \sp0, \mask, \b0_0, lsl #\offset
+
+  lsl \in0_2, \in0_2, #4
+  add \sp1, \sp1, sp
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  add \sp0, \sp0, sp
+
+  lsl   \in0_1, \in0_1, #4
+  ldmia \sp1, {\out1_0-\out1_1}  // both words of an entry are fetched with one ldmia here
+  orr   \in0_1, \in0_1, \in0_0, lsr #28
+  ldmia \sp0, {\out0_0-\out0_1}
+  lsl   \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, offset  // if bit offset of r0_ret is set, xor b << (32 + offset) into the result
+  and \mask, \one, \r0_ret, lsr #\offset  // extract the selected bit as 0 or 1
+  sub \mask, \zero, \mask                 // 0 - bit: all zeros or all ones, so no branch is needed
+  and \mask0_1, \b0_0, \mask
+  and \mask0_2, \b0_1, \mask
+
+  lsl \mask0_0, \mask0_1, #\offset  // 64-bit (b and mask) shifted left by offset, spread over three words
+  lsr \mask0_1, \mask0_1, #32 - \offset
+  orr \mask0_1, \mask0_1, \mask0_2, lsl #\offset
+  lsr \mask0_2, \mask0_2, #32 - \offset
+
+  eor \in0_1, \in0_1, \mask0_0  // applied from word 1 upwards, which adds the extra 32-bit shift
+  eor \in0_2, \in0_2, \mask0_1
+  eor \in0_3, \in0_3, \mask0_2
+.endm
+
+.macro last_mask1 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, offset  // same as last_mask0 without the and; invoked with offset 31
+  sub \mask, \zero, \r0_ret, lsr #\offset  // with offset 31 the shifted value is already 0 or 1
+  and \mask0_1, \b0_0, \mask
+  and \mask0_2, \b0_1, \mask
+
+  lsl \mask0_0, \mask0_1, #\offset
+  lsr \mask0_1, \mask0_1, #32 - \offset
+  orr \mask0_1, \mask0_1, \mask0_2, lsl #\offset
+  lsr \mask0_2, \mask0_2, #32 - \offset
+
+  eor \in0_1, \in0_1, \mask0_0
+  eor \in0_2, \in0_2, \mask0_1
+  eor \in0_3, \in0_3, \mask0_2
+.endm
+
+.global AIMER_NAMESPACE(poly64_mul)
+.type   AIMER_NAMESPACE(poly64_mul), %function
+AIMER_NAMESPACE(poly64_mul):  // carry-less 64x64 multiply: a at [R0], b at [R1]; high 64 bits go to [R2], low 64 bits to [R3]
+  t0_0    .req R0  // t0..t5: register pairs (low word, high word) used while building the table
+  t0_1    .req R1
+  t1_0    .req R2
+  t1_1    .req R3
+  t2_0    .req R4
+  t2_1    .req R5
+  t3_0    .req R6
+  t3_1    .req R7
+  t4_0    .req R8
+  t4_1    .req R9
+  t5_0    .req R10
+  t5_1    .req R11
+
+  r1_copy .req R12
+  t_base  .req R14 
+
+  sp0     .req R12
+  sp1     .req R14
+
+  b0_0    .req R0
+  b0_1    .req R1
+
+  in0_0   .req R2  // in0_0..in0_3: 128-bit accumulator, lowest word first
+  in0_1   .req R3
+  in0_2   .req R4
+  in0_3   .req R5
+
+  out0_0  .req R6
+  out0_1  .req R7
+  out1_0  .req R8
+  out1_1  .req R9
+
+  mask    .req R10
+
+  zero    .req R6
+  one     .req R7
+
+  r0_ret  .req R8
+
+  mask0_0 .req R9
+  mask0_1 .req R11
+  mask0_2 .req R12
+
+  push  {R4-R11, lr}
+  push  {R2-R3}  // keep both output pointers; reloaded from [sp, #132] and [sp, #136] at the end
+
+  ldr   t1_0, [R0, #0]
+  ldr   t1_1, [R0, #4]
+  push  {t1_1}  // keep the unmasked high word of a; reloaded from [sp, #128] for the last_mask steps
+
+  sub   sp, #128  // allocating space in the stack
+                  // 8 bytes * 16 = 128 bytes
+  mov   t_base, sp
+  mov   r1_copy, R1
+
+  mov   t0_0, #0
+  mov   t0_1, #0
+
+  and   t1_1, #0x1FFFFFFF  // drop the top 3 bits of a so the multiples up to 15 * a fit in 64 bits
+
+  lsl   t2_1, t1_1, #1  // t2 = 2 * a
+  orr   t2_1, t2_1, t1_0, lsr #31
+  lsl   t2_0, t1_0, #1
+
+  eor   t3_0, t1_0, t2_0  // t3 = 3 * a
+  eor   t3_1, t1_1, t2_1
+
+  lsl   t4_1, t2_1, #1  // t4 = 4 * a
+  orr   t4_1, t4_1, t2_0, lsr #31
+  lsl   t4_0, t2_0, #1
+
+  eor   t5_0, t1_0, t4_0  // t5 = 5 * a
+  eor   t5_1, t1_1, t4_1
+
+  stmia t_base!, {t0_0-t5_1}  // 4 bytes X 12 elements  = 48 bytes (table entries 0..5)
+
+  eor   t0_0, t2_0, t4_0  // t0 = 6 * a
+  eor   t0_1, t2_1, t4_1
+
+  lsl   t2_1, t4_1, #1  // t2 = 8 * a
+  orr   t2_1, t2_1, t4_0, lsr #31
+  lsl   t2_0, t4_0, #1
+
+  eor   t5_0, t2_0, t3_0  // t5 = 11 * a
+  eor   t5_1, t2_1, t3_1
+
+  eor   t3_0, t2_0, t1_0  // t3 = 9 * a
+  eor   t3_1, t2_1, t1_1
+
+  eor   t4_0, t4_0, t0_0  // t4 = 10 * a after the next two xors
+  eor   t4_1, t4_1, t0_1
+
+  eor   t4_0, t4_0, t2_0
+  eor   t4_1, t4_1, t2_1
+
+  eor   t1_0, t1_0, t0_0  // t1 = 7 * a
+  eor   t1_1, t1_1, t0_1
+
+  stmia t_base!, {t0_0-t5_1}  // 4 bytes X 12 elements  = 48 bytes (table entries 6..11)
+
+  eor   t1_0, t5_0, t0_0  // t1 = 13 * a
+  eor   t1_1, t5_1, t0_1
+
+  eor   t2_0, t2_0, t0_0  // t2 = 14 * a
+  eor   t2_1, t2_1, t0_1
+
+  eor   t3_0, t3_0, t0_0  // t3 = 15 * a
+  eor   t3_1, t3_1, t0_1
+
+  eor   t0_0, t4_0, t0_0  // t0 = 12 * a
+  eor   t0_1, t4_1, t0_1
+
+  stmia t_base, {t0_0-t3_1}  // 4 bytes X 8 elements  = 32 bytes (table entries 12..15)
+
+  ldmia r1_copy, {b0_0-b0_1}
+  mov   mask, #0x00000078  // selects a 4-bit window already scaled by the 8-byte entry size
+
+  lut_access0_0   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 25
+  lut_access0_1_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 21
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 17
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 13
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 9
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 5
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 1
+  lut_access1     sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 3
+
+  mov   zero, #0
+  mov   one,  #1
+  ldr   r0_ret, [sp, #128]  // original high word of a, including the 3 bits masked out of the table
+
+  last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 29
+  last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 30
+  last_mask1 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 31
+
+  ldr   R0, [sp, #132]  // saved R2: destination of the high 64 bits
+  ldr   R1, [sp, #136]  // saved R3: destination of the low 64 bits
+  add   sp, #140  // restoring stack
+
+  str   in0_0, [R1, #0]
+  str   in0_1, [R1, #4]
+
+  str   in0_2, [R0, #0]
+  str   in0_3, [R0, #4]
+
+  pop   {R4-R11, pc}
+
+  // unlink register name
+  .unreq t0_0
+  .unreq t0_1
+  .unreq t1_0
+  .unreq t1_1
+  .unreq t2_0
+  .unreq t2_1
+  .unreq t3_0
+  .unreq t3_1
+  .unreq t4_0
+  .unreq t4_1
+  .unreq t5_0
+  .unreq t5_1
+
+  .unreq r1_copy
+  .unreq t_base
+
+  .unreq sp0
+  .unreq sp1
+
+  .unreq b0_0
+  .unreq b0_1
+
+  .unreq in0_0
+  .unreq in0_1
+  .unreq in0_2
+  .unreq in0_3
+
+  .unreq out0_0
+  .unreq out0_1
+  .unreq out1_0
+  .unreq out1_1
+
+  .unreq mask
+
+  .unreq zero
+  .unreq one
+
+  .unreq r0_ret
+
+  .unreq mask0_0
+  .unreq mask0_1
+  .unreq mask0_2
diff --git a/crypto_sign/aimer192s/m4stack/aim2.c b/crypto_sign/aimer192s/m4stack/aim2.c
new file mode 100644
index 00000000..b5dbbc85
--- /dev/null
+++ b/crypto_sign/aimer192s/m4stack/aim2.c
@@ -0,0 +1,491 @@
+// SPDX-License-Identifier: MIT
+
+#include "aim2.h"
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+// Inverse Mersenne S-box for e1 = 17: out = in ^ e with
+// e = (2 ^ 17 - 1) ^ (-1) mod (2 ^ 192 - 1)
+//   = 0xad6b56b5ab5ad5ad6ad6b56b5ab5ad5ad6ad6b56b5ab5ad5,
+// built from the chunks ad6b56b5ab5ad5 | ad6 | ad6b56b5ab5ad5 | ad6 | ad6b56b5ab5ad5
+void GF_exp_invmer_e_1(GF out, const GF in)
+{
+  size_t k;
+  GF acc = {0}, p_ad6 = {0};
+  GF x5 = {0}, x6 = {0};
+  GF xa = {0}, xb = {0}, xd = {0};
+
+  // xd = in ^ 2 (temporarily), acc = in ^ 4
+  GF_sqr_s(xd, in);
+  GF_sqr_s(acc, xd);
+
+  // x5 = in ^ 5
+  GF_mul_s(x5, acc, in);
+  // x6 = in ^ 6
+  GF_mul_s(x6, x5, in);
+  // xa = in ^ 10 = (in ^ 5) ^ 2
+  GF_sqr_s(xa, x5);
+  // xb = in ^ 11
+  GF_mul_s(xb, xa, in);
+  // xd = in ^ 13 = (in ^ 11) * (in ^ 2)
+  GF_mul_s(xd, xb, xd);
+
+  // acc = in ^ (0xad)
+  GF_sqr_s(acc, xa);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(acc, acc, xd);
+
+  // p_ad6 = in ^ (0xad 6), xd = in ^ (0xad 5)
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(p_ad6, acc, x6);
+  GF_mul_s(xd, acc, x5);
+
+  // acc = in ^ (0xad6 b)
+  GF_sqr_s(acc, p_ad6);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(acc, acc, xb);
+
+  // acc = in ^ (0xad6b 5)
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(acc, acc, x5);
+
+  // acc = in ^ (0xad6b5 6)
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(acc, acc, x6);
+
+  // acc = in ^ (0xad6b56 b)
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(acc, acc, xb);
+
+  // acc = in ^ (0xad6b56b 5)
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(acc, acc, x5);
+
+  // acc = in ^ (0xad6b56b5 a)
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(acc, acc, xa);
+
+  // acc = in ^ (0xad6b56b5a b)
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(acc, acc, xb);
+
+  // acc = in ^ (0xad6b56b5ab 5)
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(acc, acc, x5);
+
+  // xd = in ^ (0xad6b56b5ab5 ad5): shift by three hex digits, append ad5
+  for (k = 12; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(xd, acc, xd);
+
+  // acc = in ^ (0xad6b56b5ab5ad5 ad6): 1 + 11 squarings, append ad6
+  GF_sqr_s(acc, xd);
+  for (k = 11; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, p_ad6);
+
+  // acc = in ^ (0xad6b56b5ab5ad5ad6 ad6b56b5ab5ad5)
+  for (k = 56; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, xd);
+
+  // acc = in ^ (0xad6b56b5ab5ad5ad6ad6b56b5ab5ad5 ad6)
+  for (k = 12; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, p_ad6);
+
+  // out = in ^ (0xad6b56b5ab5ad5ad6ad6b56b5ab5ad5ad6 ad6b56b5ab5ad5)
+  for (k = 56; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(out, acc, xd);
+}
+
+// Inverse Mersenne S-box for e2 = 47: out = in ^ e with
+// e = (2 ^ 47 - 1) ^ (-1) mod (2 ^ 192 - 1)
+//   = 0xddddddddddddbbbbbbbbbbbb777777777776eeeeeeeeeeed,
+// built bytewise as: dd x6, bb x6, 77 x5, 76, ee x5, ed
+void GF_exp_invmer_e_2(GF out, const GF in)
+{
+  size_t k;
+  GF acc = {0}, p_dd = {0};
+  GF x6 = {0}, x7 = {0};
+  GF xb = {0}, xd = {0}, xe = {0};
+
+  // xd = in ^ 2 (temporarily), acc = in ^ 3
+  GF_sqr_s(xd, in);
+  GF_mul_s(acc, xd, in);
+
+  // x6 = in ^ 6 = (in ^ 3) ^ 2
+  GF_sqr_s(x6, acc);
+  // x7 = in ^ 7
+  GF_mul_s(x7, x6, in);
+  // xb = in ^ 11 = (in ^ 4) * (in ^ 7)
+  GF_sqr_s(xb, xd);
+  GF_mul_s(xb, xb, x7);
+  // xd = in ^ 13 = (in ^ 6) * (in ^ 7)
+  GF_mul_s(xd, x6, x7);
+  // xe = in ^ 14 = (in ^ 7) ^ 2
+  GF_sqr_s(xe, x7);
+
+  // xb = in ^ (0xbb)
+  GF_sqr_s(acc, xb);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(xb, acc, xb);
+
+  // x6 = in ^ (0x76), x7 = in ^ (0x77); both reuse acc = in ^ (0x70)
+  GF_sqr_s(acc, x7);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(x6, acc, x6);
+  GF_mul_s(x7, acc, x7);
+
+  // p_dd = in ^ (0xdd)
+  GF_sqr_s(acc, xd);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(p_dd, acc, xd);
+
+  // xd = in ^ (0xed), xe = in ^ (0xee); both reuse acc = in ^ (0xe0)
+  GF_sqr_s(acc, xe);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(xd, acc, xd);
+  GF_mul_s(xe, acc, xe);
+
+  // p_dd = in ^ (0xdd dd): 1 + 7 squarings, append dd
+  GF_sqr_s(acc, p_dd);
+  for (k = 7; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(p_dd, acc, p_dd);
+
+  // acc = in ^ (0xdddd dddd): 1 + 15 squarings, append dddd
+  GF_sqr_s(acc, p_dd);
+  for (k = 15; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, p_dd);
+
+  // acc = in ^ (0xdddddddd dddd)
+  for (k = 16; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, p_dd);
+
+  // acc = in ^ (0xdddddddddddd bb)
+  for (k = 8; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, xb);
+
+  // acc = in ^ (0xddddddddddddbb bb)
+  for (k = 8; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, xb);
+
+  // acc = in ^ (0xddddddddddddbbbb bb)
+  for (k = 8; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, xb);
+
+  // acc = in ^ (0xddddddddddddbbbbbb bb)
+  for (k = 8; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, xb);
+
+  // acc = in ^ (0xddddddddddddbbbbbbbb bb)
+  for (k = 8; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, xb);
+
+  // acc = in ^ (0xddddddddddddbbbbbbbbbb bb)
+  for (k = 8; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, xb);
+
+  // acc = in ^ (0xddddddddddddbbbbbbbbbbbb 77)
+  for (k = 8; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, x7);
+
+  // acc = in ^ (0xddddddddddddbbbbbbbbbbbb77 77)
+  for (k = 8; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, x7);
+
+  // acc = in ^ (0xddddddddddddbbbbbbbbbbbb7777 77)
+  for (k = 8; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, x7);
+
+  // acc = in ^ (0xddddddddddddbbbbbbbbbbbb777777 77)
+  for (k = 8; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, x7);
+
+  // acc = in ^ (0xddddddddddddbbbbbbbbbbbb77777777 77)
+  for (k = 8; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, x7);
+
+  // acc = in ^ (0xddddddddddddbbbbbbbbbbbb7777777777 76)
+  for (k = 8; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, x6);
+
+  // acc = in ^ (0xddddddddddddbbbbbbbbbbbb777777777776 ee)
+  for (k = 8; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, xe);
+
+  // acc = in ^ (0xddddddddddddbbbbbbbbbbbb777777777776ee ee)
+  for (k = 8; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, xe);
+
+  // acc = in ^ (0xddddddddddddbbbbbbbbbbbb777777777776eeee ee)
+  for (k = 8; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, xe);
+
+  // acc = in ^ (0xddddddddddddbbbbbbbbbbbb777777777776eeeeee ee)
+  for (k = 8; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, xe);
+
+  // acc = in ^ (0xddddddddddddbbbbbbbbbbbb777777777776eeeeeeee ee)
+  for (k = 8; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, xe);
+
+  // out = in ^ (0xddddddddddddbbbbbbbbbbbb777777777776eeeeeeeeee ed)
+  for (k = 8; k > 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(out, acc, xd);
+}
+
+// Mersenne S-box with e_star = 5: out = in ^ (2 ^ 5 - 1) = in ^ 31
+void GF_exp_mer_e_star(GF out, const GF in)
+{
+  GF acc = {0};
+  GF in_pow3 = {0};
+
+  // in_pow3 = in ^ 3 = in ^ (2 ^ 2 - 1)
+  GF_sqr_s(acc, in);
+  GF_mul_s(in_pow3, acc, in);
+
+  // acc = in ^ 7 = in ^ (2 ^ 3 - 1)
+  GF_sqr_s(acc, in_pow3);
+  GF_mul_s(acc, acc, in);
+
+  // out = ((in ^ 7) ^ 4) * (in ^ 3) = in ^ 31
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(out, acc, in_pow3);
+}
+
+// Squeezes matrix_L, matrix_U and vector_b out of a hash of iv. Per row r
+// (bit index = 64 * word + bit): matrix_L keeps the squeezed bits above r,
+// matrix_U keeps the squeezed bits below r, and bit r is set to 1 in both.
+void generate_matrices_L_and_U(
+        GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF vector_b,
+        const uint8_t iv[AIM2_IV_SIZE])
+{
+  uint8_t squeezed[AIM2_NUM_BYTES_FIELD];
+  hash_instance ctx;
+  GF sample = {0};
+
+  // feed iv into the hash; everything below is squeezed from ctx in order
+  hash_init(&ctx);
+  hash_update(&ctx, iv, AIM2_IV_SIZE);
+  hash_final(&ctx);
+
+  for (size_t num = 0; num < AIM2_NUM_INPUT_SBOX; num++)
+  {
+    for (size_t row = 0; row < AIM2_NUM_BITS_FIELD; row++)
+    {
+      const size_t diag_word = row / 64;
+      const uint64_t diag_bit = ((uint64_t)1) << (row % 64);
+      const uint64_t keep_l = ((uint64_t)-1) << (row % 64);
+      const uint64_t keep_u = ~keep_l;
+
+      hash_squeeze(&ctx, squeezed, AIM2_NUM_BYTES_FIELD);
+      GF_from_bytes(sample, squeezed);
+      // words below the diagonal word: L is zero, U takes the sample
+      for (size_t w = 0; w < diag_word; w++)
+      {
+        matrix_L[num][row][w] = 0;
+        matrix_U[num][row][w] = sample[w];
+      }
+      matrix_L[num][row][diag_word] = (sample[diag_word] & keep_l) | diag_bit;
+      matrix_U[num][row][diag_word] = (sample[diag_word] & keep_u) | diag_bit;
+      // words above the diagonal word: L takes the sample, U is zero
+      for (size_t w = diag_word + 1; w < AIM2_NUM_WORDS_FIELD; w++)
+      {
+        matrix_L[num][row][w] = sample[w];
+        matrix_U[num][row][w] = 0;
+      }
+    }
+  }
+  // vector_b is squeezed straight into its storage (no GF_from_bytes step here)
+  hash_squeeze(&ctx, (uint8_t *)vector_b, AIM2_NUM_BYTES_FIELD);
+  hash_ctx_release(&ctx);
+}
+
+// matrix_A[num][i] = GF_transposed_matmul(matrix_U[num][i], matrix_L[num]); also fills vector_b.
+void generate_matrix_LU(GF matrix_A[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+                        GF vector_b,
+                        const uint8_t iv[AIM2_IV_SIZE])
+{
+  GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+
+  generate_matrices_L_and_U(matrix_L, matrix_U, vector_b, iv);
+  for (size_t num = 0; num < AIM2_NUM_INPUT_SBOX; num++)
+  {
+    const GF *lower = (const GF *)matrix_L[num];
+    for (size_t row = 0; row < AIM2_NUM_BITS_FIELD; row++)
+    {
+      GF_transposed_matmul(matrix_A[num][row], matrix_U[num][row], lower);
+    }
+  }
+}
+
+// AIM2 evaluation: maps the block pt to ct using the matrices and vector derived from iv.
+void aim2(uint8_t ct[AIM2_NUM_BYTES_FIELD],
+          const uint8_t pt[AIM2_NUM_BYTES_FIELD],
+          const uint8_t iv[AIM2_IV_SIZE])
+{
+  GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0};
+  GF sbox[AIM2_NUM_INPUT_SBOX];
+  GF pt_elem = {0}, ct_elem = {0};
+
+  // iv-dependent part of the affine layer: matrices L, U and vector_b
+  generate_matrices_L_and_U(matrix_L, matrix_U, vector_b, iv);
+
+  // constant addition: one copy of the plaintext per S-box input
+  GF_from_bytes(pt_elem, pt);
+  GF_add(sbox[0], pt_elem, aim2_constants[0]);
+  GF_add(sbox[1], pt_elem, aim2_constants[1]);
+
+  // inverse Mersenne S-boxes, applied in place
+  GF_exp_invmer_e_1(sbox[0], sbox[0]);
+  GF_exp_invmer_e_2(sbox[1], sbox[1]);
+
+  // affine layer: each branch is multiplied by its U, then by its L ...
+  GF_transposed_matmul(sbox[0], sbox[0], (const GF *)matrix_U[0]);
+  GF_transposed_matmul(sbox[0], sbox[0], (const GF *)matrix_L[0]);
+
+  GF_transposed_matmul(sbox[1], sbox[1], (const GF *)matrix_U[1]);
+  GF_transposed_matmul(sbox[1], sbox[1], (const GF *)matrix_L[1]);
+
+  // ... then both branches and vector_b are added into sbox[0]
+  GF_add(sbox[0], sbox[0], sbox[1]);
+  GF_add(sbox[0], sbox[0], vector_b);
+
+  // Mersenne S-box on the combined state
+  GF_exp_mer_e_star(sbox[0], sbox[0]);
+
+  // feed-forward: add the plaintext back in, then serialize to ct
+  GF_add(ct_elem, sbox[0], pt_elem);
+  GF_to_bytes(ct, ct_elem);
+}
+
+void aim2_sbox_outputs(GF sbox_outputs[AIM2_NUM_INPUT_SBOX], const GF pt)
+{
+  // linear component: sbox_outputs[i] = GF_add(pt, aim2_constants[i]) for i = 0, 1
+  GF_add(sbox_outputs[0], pt, aim2_constants[0]);
+  GF_add(sbox_outputs[1], pt, aim2_constants[1]);
+
+  // non-linear component: inverse Mersenne S-boxes in place (e1 on [0], e2 on [1])
+  GF_exp_invmer_e_1(sbox_outputs[0], sbox_outputs[0]);
+  GF_exp_invmer_e_2(sbox_outputs[1], sbox_outputs[1]);
+}
diff --git a/crypto_sign/aimer192s/m4stack/aim2.h b/crypto_sign/aimer192s/m4stack/aim2.h
new file mode 100644
index 00000000..b30d4cb9
--- /dev/null
+++ b/crypto_sign/aimer192s/m4stack/aim2.h
@@ -0,0 +1,435 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef AIM2_H
+#define AIM2_H
+
+#include "field.h"
+#include "params.h"
+#include <stdint.h>
+
+static const GF aim2_constants[AIM2_NUM_INPUT_SBOX] = // entry i is added to the plaintext (GF_add) before S-box i in aim2.c
+{
+  {0xc0ac29b7c97c50dd, 0xbe5466cf34e90c6c, 0x452821e638d01377}, // [0]: feeds GF_exp_invmer_e_1
+  {0xd1310ba698dfb5ac, 0x9216d5d98979fb1b, 0x3f84d5b5b5470917}  // [1]: feeds GF_exp_invmer_e_2
+};
+
+static const GF aim2_e1_power_matrix[AIM2_NUM_BITS_FIELD] = // AIM2_NUM_BITS_FIELD rows, one GF element each
+{ // NOTE(review): presumably the matrix of a GF(2)-linear power map tied to e1; no user is visible here - confirm
+  {0x0000000000000001, 0x0000000000000000, 0x0000000000000000}, // row 0: only the lowest bit of word 0 is set
+  {0x24187d60404121f6, 0x994d0c36800d12c1, 0x00911dd52a0924f1},
+  {0x764f49362db3c478, 0x3bcc2005010a3fa6, 0x402147d6af1a6ff4},
+  {0x1c0878591079091d, 0x9b08ffde1c878f59, 0x8ff70000000021c2},
+  {0xbc23dcb74c10198f, 0xe23fb48357412666, 0x70031ccb7f97795b},
+  {0xffa72d9a27550570, 0xc7dab56f7d5ade7c, 0x78cd4c6283845a4b},
+  {0x655b34aa00430d9a, 0x0150004209eea37c, 0xea5061fe40551141},
+  {0x291b4f90d5814c36, 0xcda4bfb158be9a9b, 0x0d4558cc51c4127c},
+  {0xbe4eb108521087f8, 0x855a49e49b1f9165, 0xfa15129aaa8d8745},
+  {0xef60386cb35ccf9a, 0x5115765ff710f9f9, 0x205677891921e135},
+  {0xbed705ee53ec571d, 0x97ef8c6dd0851236, 0xdfb8887b08ee7b6d},
+  {0x6731ce99be825c5a, 0x78665e68455482e1, 0x8b867f2046054b3e},
+  {0x008fe70500592609, 0x6419eeb2829f34c3, 0x8f95a35e28a915f4},
+  {0x4dd556b654d54730, 0x07e8d738dc4b2c41, 0x6de823272f319c70},
+  {0xc805945260585e93, 0xf3efb93595438399, 0x387f3dab97add8fd},
+  {0x8825784a2bb54db6, 0x8d1d21f68a9fed14, 0xd72c5de2e4375500},
+  {0xd9fbd5d41179e461, 0xbaa9f9428fe27896, 0x49998ea2c43c70ee},
+  {0xba1e061ac9218b6b, 0x93a1c1ea0a23984a, 0x145015f4bca9f514},
+  {0xb2829eadb1319c61, 0xf21008aca9c587af, 0x491dfc66b48bb406},
+  {0xdc192cc5729969e6, 0x19aeec2c6a3facb8, 0xeaf05f73c034e88f},
+  {0x5cc5d35af8af5039, 0x64bfd6b3c8401142, 0x4d083af0e0cecd4f},
+  {0xbcb663181c16e418, 0x9d73d6e08b40b1cf, 0xe6a19d2ea608b779},
+  {0x8f9e2660cdf64ce3, 0x6e790dfe030df1e7, 0xf36bdb76802d4809},
+  {0x24e27b21fdd534eb, 0x9b2abc8327bb58a6, 0xa60607784f3d2a8a},
+  {0x6470b72d839b493a, 0x3de3bd12dbc9236b, 0xab0e0e81db838cab},
+  {0x9fa25765dfa0dc0c, 0xa4866af77f3c1d39, 0xa22985fd177fb75e},
+  {0x1bd0dcf82dfcbaf6, 0x2778cab77faeae14, 0x144c9d871ac906e8},
+  {0xab206aa0299e585a, 0x1f2a1c115b2b24e7, 0xd683dc1df4f0e8e4},
+  {0x3db096486b11d3fc, 0x1d88f50f57fb1318, 0xfbdbd02cf211be3b},
+  {0x83c0ed680040dbeb, 0x01d5321e9c73822a, 0x5c78f9da86ddc253},
+  {0xed72eb240cfd7027, 0xe43295f2eab71065, 0x7dad74ed8a4daf27},
+  {0x593448e3f55865bc, 0x3dbc22ef1d415b62, 0xff617d36a6e04fd7},
+  {0x79fee82d5e5f6225, 0xe933e7ffba3ad69f, 0x11333262fecf9f21},
+  {0xaccf982f89364968, 0x961868954276eacd, 0x3903286905b4951a},
+  {0x15f9d8aff0e99b99, 0x37d7fc3823e38e15, 0x8f3cf305ce9c3317},
+  {0x5f1db90ec8ff178c, 0xef61eb5b69c0cf16, 0xd6d4428841ba2406},
+  {0x6c1d820160b3e589, 0x1655a37c12244e16, 0x1506fe0d42af221a},
+  {0x776220241d5f52f8, 0xbbd873a1a32d77fc, 0x2967ed932de2646d},
+  {0xb360b6c691f374f5, 0xe152921a89b1bb3a, 0x9bb32e5d9871acf2},
+  {0xbbae8029d2f0211d, 0xdfa58ed49cdc469a, 0x298aa1fd3b5fee94},
+  {0x311334572c4f58e2, 0xbd79cb94c83a4a65, 0x097731c2b9f63b2f},
+  {0x202f161d6f618d78, 0xb30f00f7d63d2b1c, 0xba3ba40cb586c147},
+  {0x6f6de8a66957b811, 0x933c64f745e4cb26, 0xe60acca62b3467da},
+  {0x2d52d8e03eadc408, 0x020b8ada8b0cbcfb, 0x97e520c15d31d866},
+  {0x17f79f53394c41f8, 0x8057746b55d4354d, 0x29944f234150b558},
+  {0xd48d6f8d466f4fb7, 0xe62aa6c05e099abf, 0xe72196d812cdf8ff},
+  {0x31086eee778187b7, 0x5f39e6312ab8e7fc, 0xd2794f291ba18edc},
+  {0x8bb7a2d05d52dd01, 0x898fee2a72a51691, 0xaf83c32d4f112cdf},
+  {0xf219effd62769131, 0x006ad7baac86fb08, 0xae1e7bed2f88d4eb},
+  {0x085e604007b4850e, 0x74969c7dc17959a0, 0x70af70f460fd6854},
+  {0x85048e661ea730d2, 0xccb4840c40f6c89e, 0xcb4b3836c98d0776},
+  {0xac7fadd0308807de, 0x93e5399425e1f409, 0x6cebcde031477957},
+  {0x12b09fb9d6bb04ff, 0xa5b0c0475b17d882, 0x9a2d1dc52a42cbfb},
+  {0x2a89655cb1fec3db, 0xb8a64412d508abdf, 0x3998b588ed04feab},
+  {0xa8687e88bff0829c, 0x671e2f2b99afe070, 0x2c08c6f71aa0fa09},
+  {0xe1ce5c820d6be145, 0x7c9485f929d3a113, 0x35a20e96293d131a},
+  {0xba53e0ea72f26b2a, 0x2c4dc2a431baa81b, 0x19674137360734db},
+  {0xde4269315e846bfb, 0x9ed583db0c4ca349, 0x315852fa0660ab68},
+  {0x00ae2ff5c859fcd1, 0x8a404e1ee645e1db, 0x9feadfee4a6a10b9},
+  {0x098454c0f608253b, 0xbf09d16ec3b96f79, 0xe63451db95697baf},
+  {0xa422cc6c5adc283f, 0xb7854c10a36c12d0, 0x9650b028e25b9107},
+  {0x8da1b75903dd2aa8, 0xef8f3a20c77f4c10, 0x11e6a8d176631e6f},
+  {0xe70563f20a26d72a, 0xc706a9184b4269ec, 0x01707c8cd370854b},
+  {0x4c497f712f722710, 0x40d97c17a9f96a81, 0x61ac088c7242b19b},
+  {0x9c1188e5b2c4043a, 0x15c4ce5e386918fd, 0xc2c19cddc8022f62},
+  {0x334dd52624b37647, 0x0ecfeb52b8db6b3a, 0x7cb0cc6a541d915f},
+  {0x0d2da3de5da05ab9, 0x4c8403040eb7a0a8, 0xaa43178d698e1d16},
+  {0x94dd24ac7d70454e, 0x19c81eacd2305f1d, 0xab7995a48e6230a2},
+  {0xc4c2698143f7ebe6, 0x9a9c3bf3c8dbc9bc, 0xef2ce69e69cf09cf},
+  {0xe4d55e8362bd6084, 0x4bd67382e024dfd0, 0x821aed870355bf63},
+  {0xd76139f98e468054, 0x61f1798f51310a13, 0x29046f782268e0dc},
+  {0xd415fc0d991dd093, 0x40c961038916982c, 0x50c6b0ef248e059b},
+  {0x9964bad18a8082f1, 0x666ff6785e18a4dd, 0x8ef30e5710f8282b},
+  {0xb414e2f6230594fe, 0x1bc6a73e670570f9, 0x58556965657d0723},
+  {0x7923079ff8bc88c9, 0x2009ba12607a4104, 0x79486291900310c9},
+  {0xbee4fd3a8ba864ef, 0x5df270cc7b675b45, 0x8fe410ae3a6416b4},
+  {0xed8ea038500ce1aa, 0x23cfffa4b08f7923, 0x24391c9872e1db52},
+  {0xea11414bd1ee6f54, 0x57a5ebe50ea4869b, 0x18f580aebbed4614},
+  {0x4d0c81d6ef843f2f, 0xfd169854c78d4b18, 0x7c36b2afccb84371},
+  {0x0c639f2dc76998e0, 0xdc8e28abec0a421f, 0xfba0c0a5251cd144},
+  {0x766dda3b823a1b74, 0x7f6d206bbd49261d, 0x710de4ad8beaa62e},
+  {0x7abd0b3c484d3910, 0x58abd14b6ee2e49b, 0x78652fe31e4d6d19},
+  {0x4dce3f2a407a25c2, 0x57d6ce10b19b7b99, 0x29cabd29d03528c3},
+  {0xf03c709f8b55bbc2, 0x10f449ee0641e483, 0xf60bd442dfd1a803},
+  {0x51d8a3af211b35bb, 0x2b0c872b328250e9, 0xb67d77e5c9d6d27a},
+  {0x9a731c8f091b2c24, 0x04cf41a716e1e225, 0x9b354a2d84899ec9},
+  {0x0748672bb3e504fb, 0xda648aaa478a326c, 0x0d85a4a55979e5ca},
+  {0xbb732bb90d147586, 0x446c43c25a19dc66, 0x18523f7f708eff36},
+  {0xc549edb1f37b1b15, 0x719aa23612aac7e4, 0x2c771e685e380ec2},
+  {0xe2b6b4207ad6a4b6, 0xf7cc2a116c9527ba, 0xdf6e5d55b2406221},
+  {0xb67a2baac610e044, 0xd425d94d1ebe4051, 0xb7bd1ce70c015395},
+  {0x64ff5ff72d64a1b1, 0xdaca2b8812d90ae6, 0x79a022efcc594eaf},
+  {0xc93cfa6de67bcacd, 0xa179dce6ffd14aec, 0x31528f0f0f3c6817},
+  {0x3ec18f7af7342039, 0xf8d7aa856a662ed9, 0x097b848460df8308},
+  {0xf037fa04d6ff2eb4, 0x1b6ec290719d4d0a, 0xe20e86a3b38d743e},
+  {0x8aea64bccc94d424, 0x2cc260f4f6b65bad, 0x355d31f6d901a260},
+  {0x140e5ae17cc96cb4, 0x620ee0a86b0eda0a, 0xb3fcecb29d358575},
+  {0x5ec85d1f29af07e2, 0xd6c8834f22331d6a, 0xcef37a820396e162},
+  {0xe344085d2eabc755, 0x6c6b136959c8ef7a, 0xbb22e260fa6a677a},
+  {0x7a64bfaa585ae30a, 0xe317efc967bbe220, 0x9a9780dfb02d4b7e},
+  {0x98c71744cd706ceb, 0xd177e9274ab5f551, 0x8353064dea82d011},
+  {0xff04c178eec23d3e, 0x2f460919349f2d47, 0x78fe5c7e69a969f2},
+  {0x40b0e4b5ba731b12, 0xdfdf6fb48e1eacca, 0x418adb73cc0cac43},
+  {0x07e5547b971dc85a, 0x9bb127d9e57350ef, 0xdb9801dd4d74063c},
+  {0x85c01e6cb0183fd9, 0x3ed03735d2254d39, 0x759b3422ff5ef8f1},
+  {0x6d72fa4b71c48c98, 0x3a991af37f04f9e1, 0xb32059432a68082f},
+  {0x3fe283302875d557, 0x8173481a149eee28, 0xeb7766a31793b0be},
+  {0x7acae2d67f591873, 0xb326c3aa2ed4173a, 0x1946cb0d5f62d04d},
+  {0x23bef9ae772d7f05, 0xe0bfc86b1d88610d, 0x74f165bcee4734eb},
+  {0x1d4726ce666680c3, 0x2ce0e6d607113532, 0xffc5de80c34f2df4},
+  {0xc2c05b149cdd1b58, 0x6944e26394cbe4d2, 0x97958f196f8c4c6b},
+  {0x270456c0b2e40aa0, 0x55d5c764d7670e84, 0x717d55b1ebf4aac6},
+  {0x20bc0c1aa67ad034, 0xd4281becc759401d, 0xa34c23a734c590ac},
+  {0x5847ae572b03bf5c, 0xfcac4377aa016371, 0xc37160769e1a862d},
+  {0x7dd17fc6d6f74010, 0x5b327c27eb1048e0, 0x9bdfc698b132189d},
+  {0xab7a432b47cdddcb, 0xa929bbd83ccbd1f9, 0x4d454da5089a34f2},
+  {0xb39461490efcedca, 0x53d60b8883762f77, 0x38149fe44801d6e1},
+  {0x7c94c03395823033, 0xdeeb603aad8b99f6, 0x6135272e4190f922},
+  {0x253f212e339c57b8, 0x4fbc0d5dd968a708, 0xf66bd639e3fb013b},
+  {0x6607bb8d9f1426d8, 0x0b9156b2a938e184, 0x1d6f7d7b46319a77},
+  {0x408e99af5df09232, 0xea04d07e17d71e98, 0x0961e3735a066ceb},
+  {0x0ac48cb89fc1d495, 0xe5ed5004fadbdcb6, 0xb371ec4e641dbdfd},
+  {0x870fba78bc9a5840, 0xa1372a9ae9b35641, 0xd7b9b31aedb9368d},
+  {0x9ec8171425817f91, 0x46d3a766e6d0c217, 0x6d410a83cdfd91e4},
+  {0xbaaf0e5bac52a284, 0x6184eb30dcfa0676, 0x10c8fb0ed6d0bdc9},
+  {0xac8814d3e0fe8707, 0x86d0ff1167e53b8a, 0x10e6600f84bbd4e6},
+  {0x747c0349c6a589dd, 0xf944627e4ef37152, 0x28e5a0f135a5a9bb},
+  {0x382e5c28e3026945, 0xee877613758af703, 0x2d922be5a1610e7f},
+  {0xcadae8499bb4cdb7, 0xd090031f77613a0d, 0xb775a4e76fd94b4f},
+  {0xd09a761e6898eced, 0x5669242c2f84d5da, 0x3d97c6bded80996e},
+  {0x2f95de059a47e03f, 0xfa75be47169ed83f, 0x87d30a6c8dff4a90},
+  {0xf8588b0cb7a0c692, 0xd246208d9f6dc4fb, 0xe36d575d6c2485c0},
+  {0x48c08c7013df5c58, 0x4d37effdea32dc30, 0xff80378ec9caad7d},
+  {0xf9e43db917658f34, 0xb76c0ff79e41f707, 0x8e4935c0b5c08083},
+  {0xb33f84c0bc9ef48d, 0xaab63f4f9f339a4c, 0xae55cf665e81d500},
+  {0x15e234561c4632f1, 0xe084e7a57d035829, 0xbaa1511cb0ed12a0},
+  {0x74f83ba7ec3568de, 0x1d7ecb2f352fdb0b, 0xd76964def60c29f6},
+  {0xd1c2b81f2e13a757, 0xf84d5af929439b5d, 0xc34a2d0878b81e8d},
+  {0x47767837fdba926b, 0x5683aec561752e96, 0x961ca0e7d4439beb},
+  {0x7d73c95d078b625f, 0x6e621c6b3817a9f1, 0xd300b482fda5d226},
+  {0x2cf83b998a66fb35, 0x4f0359eaa9684bfb, 0x2c460d7b4765cbc7},
+  {0xa5c0e6cf67395406, 0xb659d3e82276235d, 0x2c5c851229561369},
+  {0x3168901c3d8747a6, 0x4541eabd5d866402, 0xb768bb5b1a6b8379},
+  {0xb5fa4b6cdc308417, 0x8100841dbbeb59e8, 0x4db5eb632adc8553},
+  {0x2622070061628fa6, 0xc66a1ed278866e50, 0xfad328db6fb4acba},
+  {0x6734cb1adfc5db87, 0xd7f8cfed34d7e713, 0x259e5c52bef9b101},
+  {0xa077ba5e97f9e1c0, 0x21edc3275eed4b8f, 0xc2ddffec584d31bc},
+  {0xe8074b1519eb9faa, 0xa35f39294a8283ed, 0xffbfa9f0fdcce212},
+  {0x49406434389cd06b, 0x5241069e873cd010, 0xde4f448e7e3c47b6},
+  {0x8cb6dafda57a1b04, 0xb80b06fb012be0f6, 0x6c1f61ef626c5ee2},
+  {0x9e596d56ff39dd82, 0xfd823060d81e563c, 0xfe45b0659666e7bf},
+  {0x713e642578abac3b, 0x1e13b3773dddffd6, 0xf7ebe45d0b4ed62e},
+  {0x0fb29b505409913a, 0xbd66ecfa5053f05e, 0x5172fa12bbd062cf},
+  {0x7a8cd2f2af8db5c7, 0xf1c96d88f03f2f0c, 0xfaa8376f49a0abd5},
+  {0xacc980889b25b5e7, 0x2c34843e6a6d9f3d, 0xa6bf67c68037b6ca},
+  {0xaff8095311a13c10, 0x1d4a259b84ca7804, 0x3cbb9d0b61f7ff43},
+  {0x5662cd5d639dfe13, 0x89c27a983290bab8, 0x92a7d11e497af642},
+  {0x4157aad5c3c645ca, 0xf51297f3f77a30f2, 0x83c9dda7804ac4d8},
+  {0x4e84ffef7ca3be0a, 0x14a7ba9c76da7c08, 0x5c28dc6da027d5a0},
+  {0xb0964b96303be4e5, 0x4615a98b7f22a76d, 0xf222f844d2b37df9},
+  {0x802540711d4f5f7d, 0xf6649bae872a32e3, 0xaed6395da047f447},
+  {0x2f0953d8ce80f600, 0xdcf66d5eaf05752f, 0x209193bacdf14ef8},
+  {0xc6a3ef2332ce576d, 0xb9e01c6c4572a31f, 0xde9e30f16310efde},
+  {0xba02b8398971d6e6, 0xd1bab81c9c5221d6, 0x1c9c2d1f1b7f3f2b},
+  {0xedc228019fbdd60a, 0x2753c3a138bcb6d7, 0x786fd2ba67707c2f},
+  {0x448e2cb6c1407cbf, 0xf7b738377f0cfb97, 0x4c9212bdc0657e9c},
+  {0xc76e32691429c2f9, 0x490232f4e8c043ce, 0x217833736b683230},
+  {0xd1499dc75ffd2a9c, 0xd4b5f702de32b776, 0xd6dfbb898f67a374},
+  {0x3b5a28d4cff86b77, 0x806f6c0571138c8b, 0x54628239f0c0f09f},
+  {0xb8d45dd4a900ea0a, 0x2a9169078690c168, 0xb3657df1647fbd66},
+  {0x08189a6674f4c29c, 0x8915f4636dd5d112, 0x654dc7fe07da3107},
+  {0x5250e18c883794b0, 0x8828b68987cd0d9a, 0x300a18a7c772270d},
+  {0x51d33040e3efaa99, 0xd658da2cb0cb97b0, 0x39038890d157c0af},
+  {0x68f5a5cd07a32b53, 0x46b4f5ec1368cf94, 0xf2e0d23f40742f45},
+  {0x782b44a867a3f208, 0xae64fe82046cd425, 0xb78cf45fe171d435},
+  {0xde012b438c92c4d6, 0x4733810dca874273, 0x206a03d102c15302},
+  {0xbea371badf5b9173, 0x8cbfaa817fd4f717, 0x34bea5affcb319d8},
+  {0x1a26c2090378d01a, 0xf3d15fc5c66a7f39, 0x4de762da9a07d052},
+  {0x3486c8a67bccd6cc, 0x0d10351e2b0e18ac, 0x087106b5da2aba90},
+  {0xbd5c398105759654, 0x932e7ce0d2415118, 0xff7a9395dd694851},
+  {0x6f6615de424f584e, 0x6ca415cbf1ff0b9a, 0x509c3763be9bb7ea},
+  {0xe45a5c178e450e25, 0x48cc200c65039546, 0x2c2d872741a6e8d2},
+  {0x10a487ce7b7ba1f7, 0x8da8831a4adaa217, 0xcb608d431e73d316},
+  {0x480667a3a33a0923, 0x3a6fc63a03c45c96, 0xebed952f29ad80c0},
+  {0x8899df2b4edff733, 0x7b68b7ea18849999, 0xcedaa43cfb6f7f7b},
+  {0x356eff5782ed987f, 0xca6aab13ed43b0ce, 0x9dd8a4a5288bc18a},
+  {0x5ffc38d8fbfdcdb6, 0x697d4c0b82ce34af, 0x3509dc6ecc05993b},
+  {0x83905969be9090dd, 0x2125eb5bbd23d5da, 0x64224c3dfae48ffe},
+  {0xf54512d0b6691741, 0x0cbaec28b636b0bc, 0xbb1d6adcda1edefc},
+  {0x89ea6a9a58cddfdb, 0x845d179babdb73f7, 0xcf74a641c412cff5},
+  {0x65c9f3063d3b266e, 0x560354e0ca062952, 0xc6eb9b218ae96514},
+  {0x8e8c7412b3689e52, 0x99b2ec666a8a4e48, 0x5b4477de15147c03},
+};
+
+static const GF aim2_e2_power_matrix[AIM2_NUM_BITS_FIELD] =
+{
+  {0x0000000000000001, 0x0000000000000000, 0x0000000000000000},
+  {0x75575b2a01927c2b, 0xe38f9eab8f685827, 0x782b0bd5192bca87},
+  {0xaebbaa0e79dffe28, 0xb3542e6782b8ce84, 0x8a972b1b32323be2},
+  {0x62cbd1af5c77da14, 0x3bbc6119877bbc1d, 0x8b6d73bce65ed541},
+  {0x4cd2ae2762f272cc, 0xdd4adc5bfc34ae6b, 0xa3f908a96f0fe449},
+  {0x60cfbdb9b6447e24, 0x721af8263082c01c, 0x68cb54e6fc7104af},
+  {0xa92867af3dc3b730, 0xb2608cc06efe34d0, 0xa3445078ace873ca},
+  {0xeedad86c96afe677, 0x52afb525bd42562f, 0x38cf8ddb97dc96e5},
+  {0x0a3b06f10bbc9562, 0x577b7a04e02c557b, 0x8be00f5765b7e908},
+  {0xaa72b3916d207e20, 0x50f0cce86025ffac, 0x09f7f935bbde0a04},
+  {0xac08b4e71f96174a, 0x16babbd24d02b260, 0x48e9d357af5ba717},
+  {0xe122c9c16beaa8be, 0x07043902949cfad6, 0xf78fd47b58608577},
+  {0x5c473c24ac8ca469, 0xb1da898afda7d7aa, 0xcac72d2cf21a9be3},
+  {0x7da00b91479d06e3, 0xc4c76d79d51eb15a, 0xb6c2e5796630269d},
+  {0xb42bb35d07e100ec, 0x19964fd51c07b0af, 0xffb88b0ef80a102f},
+  {0x60e6beb41a673a07, 0x75bec86a6c06b470, 0x61bb7f05fc39be82},
+  {0xfda48d0189cd0c13, 0x649054858d5374c9, 0xb770a8503a32e8a4},
+  {0x4a376d825f3006c4, 0x8896eb44124e97aa, 0x70e626bebfff29b5},
+  {0xe37a2f298ccf89c0, 0x3c3609a866d94979, 0x356c25d15f10d784},
+  {0x458ca204a347e41a, 0x59568e0a4da4e181, 0xab475a7c61d9014c},
+  {0x8c1a39eb79672160, 0xb373d43893fbd9fe, 0x41ba8d6a7097e9c8},
+  {0x7c01434b5f8e1448, 0x662bd055a2512d4f, 0x652c31c38e992dbc},
+  {0x1eae5c36fe075219, 0x82cb682598bcb1ea, 0x60daaa526a3e9947},
+  {0xf7ab17ab10f03bdf, 0x1e124b56f71a4c37, 0x75df607068cfdcff},
+  {0x1ffe54ada576e3c2, 0x384cb4e86120aaa8, 0xc6a4fefc642071df},
+  {0x1ae57be1013b7efb, 0x28d36534e13a369d, 0x75612cd220210f77},
+  {0xa84cfbb045298f2b, 0x733fdf0216082f1f, 0x0054b363e1fcdb09},
+  {0x8745e66041e62570, 0x980a16636c09d9b2, 0x51695306d0539b47},
+  {0xa67319655b027ef6, 0x4fd02799c207267a, 0x01587af4a65b6fd2},
+  {0x8d991698735bcc88, 0xf14dbd2b19f99a78, 0x6947a3b95199d2f4},
+  {0xe2906490948e4aff, 0x16b2ee7035d98706, 0x78f47845853b1ddc},
+  {0x58d9cdc2dd693cd9, 0x7e9c240b1b252019, 0x5cbd3d458a53ca24},
+  {0x24101759ff01d89b, 0xed8fdd27cdb2d47b, 0x11b0fa26e8d8a743},
+  {0x9da3e8ee96db2f59, 0x68285801543b4ac8, 0x618cc8ad53d51b65},
+  {0xf0b448478f472d56, 0x6044053c293513d2, 0xea2fb63a575a34cb},
+  {0x56bd7f9b430ca7de, 0xf883dbb4c18d2e0b, 0x6c8030ef1a38c730},
+  {0xf2e2c1396125acce, 0x882e926d399fcc33, 0x87e914f3049f22ce},
+  {0x7ec0b0443f81915d, 0x4573c52a818a44f3, 0xafc01f5cc8120f6b},
+  {0x924aed58bfbc33f1, 0x7cbf5617448b59a8, 0xef023ef380d782bb},
+  {0xed78ebbcc2543624, 0x4fbdf96f5a481d8a, 0x7dea022c85973850},
+  {0x4cd0fda73b73aaf8, 0xab714c84882fc5a6, 0x31a12db8b87c1a82},
+  {0x4f55b122e52b04af, 0x2b6abc206fcdea22, 0xeecc6a28e10f3cd9},
+  {0x773b7f263618ea81, 0xfedd6644251162ff, 0x20f124b39fffa2ef},
+  {0xc86672d34c7f9c99, 0xa1a9bedd91ba54ab, 0xe3164453cdbc1680},
+  {0x976193445cc61080, 0x4e8af4d9771f7fde, 0x2d6951afbad5a152},
+  {0xac8104ed45afc3e0, 0x2daa407aee0854a8, 0x93bf8a5f6332934b},
+  {0xadacd0145616a90f, 0x18fcdf471f8e446e, 0xb6cb1d657c5aee1f},
+  {0x39f4888a9f625046, 0x714ced776be006aa, 0x301aab64f4c07bac},
+  {0xfed94c87075ec99b, 0x6527495efabe5878, 0xae4ed05b44c346fa},
+  {0xe8089970ab84a9ad, 0xfa8ef420f612f142, 0x3033e1b424799c03},
+  {0x3de830d471a1c303, 0x1d4648963e64b5e8, 0xb7fc69c1308d744f},
+  {0xf917cc81a21178a2, 0xf51c71d20d3dde0f, 0xc755e70d903eca43},
+  {0xf988b4435c7e0659, 0xe8ec12c9411e644d, 0x011cff135dc46fe5},
+  {0x45eb42b4bc82e615, 0xbb1ea1d87fa2dcc8, 0xbbf258cddfcc5a4e},
+  {0x76c177c889777fa3, 0x771de5ab30476eca, 0xe3dd4d0ea4da4f41},
+  {0x62d43190a74afaab, 0x8c72e6cc25a0906d, 0x6560641e35c269c1},
+  {0x4a473706039e3353, 0x9270c15446432105, 0x508bd6dfcce33617},
+  {0x58e979ef836cb200, 0x64a108a5f68530dc, 0xeeb5a210610292b9},
+  {0x3e8a485122657a2d, 0xb7f7272f3423621b, 0x4c0e2f899ffc6f0f},
+  {0xb03f26ebad2101f3, 0x2bf27f00ccb827ad, 0xf2c32d1c9db42e29},
+  {0xcc5f196397e2bb63, 0x9cf1f95bba0e5fb0, 0xcffa723b8add78c2},
+  {0x5198cabd81774aa6, 0x79e142bd7c3981f1, 0xcfb65a6d42815d8a},
+  {0x91dc7af311207622, 0xf294a4f3c38f447e, 0xdfd67624b63f7997},
+  {0xfb2f51ed0b5b44c1, 0x6eeb2b229427682c, 0xfad555a3f1680200},
+  {0xd043eb034f7557ae, 0x89f917e3d7f663f1, 0xd7f51e2f59ce0302},
+  {0xd1738764ddee76f4, 0x28a966bea5ec647e, 0xa322c656d7bc27d1},
+  {0x0cd66c8dd29514f0, 0xb4e37bf2f01130a9, 0x7db6ecdc81a7a57f},
+  {0xc8cb28a44796dc78, 0x88eb0048501b3765, 0x8ff3fbd6d703c26d},
+  {0x2c5d68650ca4b6f5, 0xa8e391ce83198344, 0x8b9f3219506be9d0},
+  {0x911906127a1ba855, 0x30d5215961ac95e7, 0x71827dfac7504342},
+  {0x1ae4c2e2506d0712, 0xb5caffb8afbcda6e, 0x159080539f7f876e},
+  {0x86571676f6228cdb, 0x3a51f0bfed40380f, 0x5dec5a0cee962a54},
+  {0xf5c3339c01460504, 0x5d55382d4e349ecc, 0xcf81cc12df0b2c9e},
+  {0x89a775997037437a, 0xc86002223b57f27f, 0xfe795feb841f08ef},
+  {0x7da8a9b3f9f43fe4, 0x8494d51c6e215f43, 0xb703f044bc338b9c},
+  {0xf73c2c9d450a092f, 0xce0ae97084884a01, 0x9a647f6d5f970839},
+  {0x87c63573f869cdbb, 0x812d2d8e966e6911, 0x973b425ba1c66dfa},
+  {0x7de5a1e78d630e85, 0x765d7d5a4a6e3cb7, 0x28170eef2a846d99},
+  {0x0b0c630c0f59460d, 0x9c8758a9ee8db258, 0xd3589f9c034f75d5},
+  {0xe1a6d8e757067309, 0xd18498099be244d9, 0x9b10a894502fc4e1},
+  {0xfa14fe8a1dd59c3e, 0x6a9a93b0f1ac862a, 0xdbe4d8d065053ef7},
+  {0x5c94965ff0a8e28e, 0xc2a32a0d57f1faa2, 0x24dc5effe1fa9e37},
+  {0x6b404bba72a24d04, 0xbcd23a38f7981241, 0x93d0c9eb1b9a39ef},
+  {0xa53a198b9e74e59c, 0x17cb3bc05f9608d1, 0x21bcc23eb5e75655},
+  {0x05911f7d3220397f, 0x7915054dcb628314, 0x183a2a8400570cef},
+  {0x2a420bf34788186c, 0x8c83a2945ee3027b, 0x606a65c37a8f2fe3},
+  {0xccf4e83131d54a27, 0xc95466a498499126, 0xef9ac8206968b1f7},
+  {0xe457b2ff12256f1e, 0x57fd60a454e5f68d, 0xf3388bb1de5dd1c2},
+  {0x4addb3e322595749, 0x39e02bd59d8ae504, 0x20284c1ae2f1a65c},
+  {0x9fbb5574795cac4d, 0x9fedac975974c8bb, 0xd307ecf05fd4fd22},
+  {0x2505bb81200f8cbb, 0x2ac9d93c45830708, 0x11ec704af2c49861},
+  {0xfa1702dd351d3b22, 0xbe0dfc13d607f962, 0x82c611b8ccd1e9f2},
+  {0xb7ff038d58626bd7, 0x86e990a7d6acad3b, 0x5010d30fbe2d70a9},
+  {0xc42bda459ef1afca, 0x83c5891e3eff20a0, 0xdefbb485c364fd5a},
+  {0xaada4d9f943df0f1, 0x2618e51a8838b5fe, 0x8f45f0ffff45201f},
+  {0xb55e3891213f972c, 0xdb4f56b16dc4e905, 0x30fd462a4cf268fc},
+  {0x64e007b7010e8c80, 0x2d0de3d26a1748c3, 0xa2e01ed12648c113},
+  {0x5128d2b5c4bac674, 0xb80b46283a340508, 0x1c1f01fe24b17a66},
+  {0x4cb8ab976733595f, 0x403aca262ff117b0, 0xce1698b4f9a54376},
+  {0x7781e71d8805fdc4, 0x40c3c2110800e7a0, 0xe72e9e63999cc311},
+  {0xbb3e3e6501e45c00, 0x9e70bd7de6780a3b, 0x549416aa087fe4c5},
+  {0xae1da809d7eed055, 0x06ba5804e029b01c, 0x490555c99e76bd05},
+  {0x67f3afbbfeee6547, 0x1243b190c38432b1, 0xbab2fa8df7bf2943},
+  {0x6d7197464f15c83c, 0x9283ced1147a6a85, 0x96ba1a0e47d9dd96},
+  {0x9cbb90e485218006, 0x8b5ff83a0210b4d9, 0x1086afcf143b95c2},
+  {0xa07d026b378f963b, 0x2debd80b456cd3e3, 0xc7792b9bc7f54c4a},
+  {0x3d0bec8b88ba06b8, 0x0c13cdfdc4d01e9f, 0x6d256d1087b9c95e},
+  {0x9216a33ea47259ff, 0x2bde0cfcb54abe8d, 0xaaef421825f1b47b},
+  {0xa1aabb09b181ae0f, 0xc14d44d54e3620cd, 0xabb20e2a4d637bcb},
+  {0x2544eba1038d1b04, 0xda1f84aa9bc120c2, 0x41fd7f657a18c45d},
+  {0xadaff973f301d8c3, 0x87dae306486ff1a6, 0x60ec280a2570b8ff},
+  {0x624994b2704d4c20, 0x532232f1cf209482, 0x861b9c2a5a7d0a43},
+  {0x4513aa7db58aea4d, 0x89dfbe8c94798dde, 0xe735f37739441c13},
+  {0x2f534ce65fbe5d87, 0xf8fcb2432339f543, 0x8ea957572a77e395},
+  {0x2456c8d764e7c1a6, 0x7dc7567c507e2e18, 0xd29b13c5db1cd65a},
+  {0x885705a845bb1199, 0xebc702d7e1680421, 0x9aeba22f533cbac9},
+  {0x55c435f803ad3742, 0x695442fe576b3a09, 0x5ca02fab230ee023},
+  {0x0d446bb06a3cbf8b, 0x5bfc8414d84fff9a, 0x157e3384708408a8},
+  {0x7b212d17c02a4054, 0x2b14562733ba6900, 0x7965f7d93122eac0},
+  {0x349446294451df24, 0x2b91f57cdcc289f3, 0x829cb5a03cce767d},
+  {0x2f8e7fa84f0ad401, 0xb3a50f68cba8a638, 0xde440882f84bfd7a},
+  {0xd1ba1db41829f412, 0x9a2c4c23fb8538f7, 0x86ca32d92d99ecb9},
+  {0x8a6db99a627b227c, 0x633c81cf8e52a687, 0x8e58542594d7103e},
+  {0x4c5a928b8610d6cd, 0x6a38a81e5ec41b61, 0x05ac22b201c86322},
+  {0x283c4b53c14f39c0, 0x106fe171df2218c5, 0x4c077d33f17e0107},
+  {0x198b4c90bd33552f, 0x5853a4c2f74596db, 0x1018dd6bf21150d4},
+  {0x47c29e1c2f495b4c, 0x7ec84995131d545b, 0x49e53beaeb94dae0},
+  {0x2678b3f7b548fc9f, 0x63a6b9322f3a574c, 0xef6d85f1091f1aeb},
+  {0xf1391f569cd5fe90, 0x876e8ba956de0238, 0x6cd576e3b8ab6222},
+  {0x827547465967b775, 0x4197e1290368e412, 0xee63a7ef2156fb67},
+  {0x6cb2a919735b34d5, 0x6cc967b756d72395, 0x9a884a65ae74e811},
+  {0xbdebcb5fbfafafc0, 0xb7fc62a4c7947030, 0x554c36728822d8b6},
+  {0x025fef80c960792a, 0xc0f487dcc0ad8059, 0x9714504680995ad0},
+  {0x19ffb11f02502666, 0x482fc0fae8608ad2, 0x781175f6049c62ee},
+  {0xf1fece4f515854e7, 0x6dab52f7b6560106, 0xfa0028f50d672954},
+  {0x844afcd287c1ddba, 0x47234b529fe3ca41, 0x3ca221c08f88140a},
+  {0xfdbbeaaa02badeda, 0xf35a5e21992e2332, 0xa37f6d68d919b65f},
+  {0x6d218f603725748a, 0xb6df3c61103e9c3e, 0xbb7ac1cf4c1f4692},
+  {0x8e6d3eb058cfc260, 0xfbe2f6497287731a, 0xffa78646830d5ce0},
+  {0x8c07c328df449acd, 0x500ba217a7af529f, 0x19ab11b99a1a2a19},
+  {0x42de87a6001d7bc4, 0x6d65941a9ae5138b, 0xcb830271914ce1ee},
+  {0x25f950eb4e2b9669, 0x0c9f7a2279a16278, 0x86503e9de2e76202},
+  {0xedc0f3a86b732556, 0xc7995c7b3ec0ea66, 0x8a4d95b8d19c29ce},
+  {0x01b5ab0eca4d3189, 0xed7898b982b519ad, 0x24c5f841a769f11b},
+  {0xde3eefe1bad32178, 0x493a735c30942df4, 0x8b5ec5bed8e4d565},
+  {0xa974a9d616b752fa, 0x09d37b2ab193ca1c, 0x55b8aaf3af4481ba},
+  {0x84ca6915121b1e09, 0x8831e83e34fac643, 0x05e3db5a89049a2f},
+  {0x5375a9f4aefd0f44, 0xaf272fd031366078, 0xbbd286c07ed80632},
+  {0x9d101a493aa2ebc9, 0x67e3ddfaa73b2b94, 0x45bf06b13a5d6856},
+  {0x6469dfeed8b766bc, 0x41a958a8c84553fc, 0xc3665b3f060a6808},
+  {0x8bbd23b38d0cff32, 0x891f48bb2592fb3f, 0x24c6243ad065453e},
+  {0xf3d1cc12dcb4e302, 0x588dfaa464f518be, 0xfe082e8b4a39cf26},
+  {0x95c521746547be8e, 0x9cbbea72400d1df8, 0x0cfdac076655d579},
+  {0xa6c4c57375f48495, 0xd63f47b41907a3f7, 0x34e17c2df60668d7},
+  {0xa135ca38c26b95c3, 0x2aac9c6b01173258, 0x2d8499bf2ed7c23c},
+  {0xba02892976144352, 0x9e4d9906dc2ae94e, 0x6535b5091d0535a4},
+  {0x6ec4dba2c6f7e949, 0x02d65b71f7db3f86, 0x61c796b0290e7ff0},
+  {0xac044d22d442ff2e, 0x29d00d9db764b6ff, 0x9ec4ff5f21f3216d},
+  {0x26b3c84573c53161, 0xa3037316e91bf8bb, 0x251ed327edf11e39},
+  {0x2917804d2422970c, 0x16119362ba8934be, 0xafa94e1359c77cce},
+  {0x4eac35ec04e84a0b, 0x31b309e5e5d361a5, 0x4171e00956fd334e},
+  {0xa02b9fdd9f6b8162, 0xabd8bc110f4e1f52, 0x75578ed77238fedc},
+  {0xe73f9ad96bd8686d, 0xbdfc49ed2dba8097, 0x054c4bb989c34404},
+  {0xa0d01888aa5b1042, 0x8c33305a0dc075b1, 0x75f81fe0369e7b86},
+  {0x679d711aa88faab7, 0xb03f74deaa29c24c, 0x10a7766990689f5a},
+  {0x827d13e4d6310b6b, 0xc5a73641d06e47d1, 0xf2f0d06e14e2ab1f},
+  {0xcc968649ec63f05e, 0x17cda3a7fc25bfb2, 0x0df1338db25ee18e},
+  {0x7d4acd6c3cf8c18b, 0x4bd734fd562d48ad, 0xae50c4f72f542533},
+  {0xcf438bf70dbe4c62, 0x0019bcea28ce9270, 0xf687acda7ff8c960},
+  {0x5b24783c5318fd09, 0x5623189d31422de8, 0x862fd585eeb3e3f0},
+  {0xf98482f8df7d5e16, 0xccb9fb2d3745fbbf, 0x7d5e1bd364daa7d4},
+  {0x024849574a40a831, 0x48cae56880d67329, 0xfafa85469a93e6b3},
+  {0x944eae6b760bc534, 0x1d1d18f30fec24c3, 0xc64a74b4d0c3181e},
+  {0x19c52990a4e62d2d, 0x37b473c7ed759ef9, 0x04080c0ade3df738},
+  {0xfcc4062c7876c075, 0x48b4cf0b72aae741, 0x3889eef0b66c1bff},
+  {0x49c26471ae06da0b, 0x109da4749a70108b, 0x443b50c74915bd54},
+  {0xbe68bd432e672eb8, 0xbe737af593618ab7, 0x5d537d8c0da1a4e7},
+  {0xa3ca7393ce4e8d7c, 0x0fcf46d53a057c21, 0x7451a590ca6c1db1},
+  {0x79419444b1c149e5, 0x9d577a1e13240b2d, 0x24da1fd0d5db6e4d},
+  {0xe8c3caf37ad5170c, 0x423b4593d3f4c834, 0xff039eaad5042ae3},
+  {0x3bf5913b5615f7f5, 0x2d24b840238f2c84, 0x97bdc5bfeb1d53b7},
+  {0x53538b2293df4606, 0x169029e2d8675ec6, 0x9ab1ac25ee4982a4},
+  {0x75bd284d07f591f8, 0xccdd36b98d68786e, 0x9321ba79d2e56eed},
+  {0xe63236d17de7e69c, 0x9600d5f5cca5b08a, 0x8ff14c81e5d61843},
+  {0xdb079962536683c6, 0x35bb6068eb26bd37, 0xa614c37971ca2e4d},
+  {0xab78167ac83c4064, 0xb6a1928d6f89cdd1, 0xc97cc61d01ffe82f},
+  {0x83e6edd7a512e8b7, 0xe281601e537bc4ec, 0x19d35d2d57518cde},
+  {0xf737f3ddfa7fc9b2, 0x4a8f04a9cb4847be, 0x2946f3355994de91},
+  {0x577ca3baf1f7e1ba, 0x446729b10c51ed7c, 0xab637d9c6e3a5554},
+  {0x4e31798071664def, 0xec15c968e363630d, 0xd7ce5f867f758e48},
+  {0x10525e76bc5a5ed9, 0x1c8a384248ab4398, 0x8f7a522f2e2f3fc5},
+  {0xdee25133572d24bf, 0x37203f7f6c2e0e36, 0x89ba27d9b1233156},
+};
+
+#define GF_exp_invmer_e_1 AIMER_NAMESPACE(GF_exp_invmer_e_1) // every public name below is routed through AIMER_NAMESPACE
+void GF_exp_invmer_e_1(GF out, const GF in);
+#define GF_exp_invmer_e_2 AIMER_NAMESPACE(GF_exp_invmer_e_2)
+void GF_exp_invmer_e_2(GF out, const GF in);
+#define GF_exp_mer_e_star AIMER_NAMESPACE(GF_exp_mer_e_star)
+void GF_exp_mer_e_star(GF out, const GF in);
+// NOTE(review): the three GF_exp_* routines above are only declared here; presumably fixed-exponent powers of `in` written to `out` -- confirm in aim2.c.
+#define generate_matrices_L_and_U AIMER_NAMESPACE(generate_matrices_L_and_U)
+void generate_matrices_L_and_U(
+        GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF vector_b,
+        const uint8_t iv[AIM2_IV_SIZE]);
+// Same (iv -> matrices, vector_b) shape as above, but with a single matrix_A output instead of separate L and U.
+#define generate_matrix_LU AIMER_NAMESPACE(generate_matrix_LU)
+void generate_matrix_LU(GF matrix_A[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+                        GF vector_b,
+                        const uint8_t iv[AIM2_IV_SIZE]);
+// sbox_outputs holds AIM2_NUM_INPUT_SBOX field elements; NOTE(review): presumably filled from pt -- confirm in aim2.c.
+#define aim2_sbox_outputs AIMER_NAMESPACE(aim2_sbox_outputs)
+void aim2_sbox_outputs(GF sbox_outputs[AIM2_NUM_INPUT_SBOX], const GF pt);
+// Byte-level interface: ct and pt are AIM2_NUM_BYTES_FIELD bytes each, iv is AIM2_IV_SIZE bytes.
+#define aim2 AIMER_NAMESPACE(aim2)
+void aim2(uint8_t ct[AIM2_NUM_BYTES_FIELD],
+          const uint8_t pt[AIM2_NUM_BYTES_FIELD],
+          const uint8_t iv[AIM2_IV_SIZE]);
+
+#endif // AIM2_H
diff --git a/crypto_sign/aimer192s/m4stack/api.h b/crypto_sign/aimer192s/m4stack/api.h
new file mode 100644
index 00000000..c4b90d12
--- /dev/null
+++ b/crypto_sign/aimer192s/m4stack/api.h
@@ -0,0 +1,44 @@
+/*
+NIST-developed software is provided by NIST as a public service. You may use, copy, and distribute copies of the software in any medium, provided that you keep intact this entire notice. You may improve, modify, and create derivative works of the software or any portion of the software, and you may copy and distribute such modifications or works. Modified works should carry a notice stating that you changed the software and should note the date and nature of any such change. Please explicitly acknowledge the National Institute of Standards and Technology as the source of the software.
+ 
+NIST-developed software is expressly provided "AS IS." NIST MAKES NO WARRANTY OF ANY KIND, EXPRESS, IMPLIED, IN FACT, OR ARISING BY OPERATION OF LAW, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND DATA ACCURACY. NIST NEITHER REPRESENTS NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST DOES NOT WARRANT OR MAKE ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE OR THE RESULTS THEREOF, INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY, RELIABILITY, OR USEFULNESS OF THE SOFTWARE.
+ 
+You are solely responsible for determining the appropriateness of using and distributing the software and you assume all risks associated with its use, including but not limited to the risks and costs of program errors, compliance with applicable laws, damage to or loss of data, programs or equipment, and the unavailability or interruption of operation. This software is not intended to be used in any situation where a failure could cause risk of injury or damage to property. The software developed by NIST employees is not subject to copyright protection within the United States.
+*/
+
+#ifndef API_H
+#define API_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define CRYPTO_PUBLICKEYBYTES 48 // equals 2 * SECURITY_BYTES when SECURITY_BITS is 192
+#define CRYPTO_SECRETKEYBYTES 72 // equals 3 * SECURITY_BYTES when SECURITY_BITS is 192
+#define CRYPTO_BYTES 9120 // NOTE(review): presumably the largest signature length in bytes -- confirm in sign.c
+#define CRYPTO_ALGNAME "aimer192s"
+// All entry points below are renamed through AIMER_NAMESPACE. NOTE(review): return-value convention is not visible here -- confirm in sign.c.
+#define crypto_sign_keypair AIMER_NAMESPACE(crypto_sign_keypair)
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+// Detached form: the signature goes to sig / *siglen, the message m is only read.
+#define crypto_sign_signature AIMER_NAMESPACE(crypto_sign_signature)
+int crypto_sign_signature(uint8_t *sig, size_t *siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk);
+// Combined form: output goes to sm / *smlen. NOTE(review): layout of sm (signature vs. message order) must be checked in sign.c.
+#define crypto_sign AIMER_NAMESPACE(crypto_sign)
+int crypto_sign(uint8_t *sm, size_t *smlen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk);
+// Detached verification: every pointer argument is const, nothing is written back.
+#define crypto_sign_verify AIMER_NAMESPACE(crypto_sign_verify)
+int crypto_sign_verify(const uint8_t *sig, size_t siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *pk);
+// Combined verification: reads sm / smlen and writes the message to m / *mlen.
+#define crypto_sign_open AIMER_NAMESPACE(crypto_sign_open)
+int crypto_sign_open(uint8_t *m, size_t *mlen,
+        const uint8_t *sm, size_t smlen,
+        const uint8_t *pk);
+
+#endif
diff --git a/crypto_sign/aimer192s/m4stack/field.c b/crypto_sign/aimer192s/m4stack/field.c
new file mode 100644
index 00000000..91ee3d55
--- /dev/null
+++ b/crypto_sign/aimer192s/m4stack/field.c
@@ -0,0 +1,482 @@
+// SPDX-License-Identifier: MIT
+
+#include "field.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define mask_64     0x00000000ffffffff  // low 32 bits of a 64-bit word
+#define mask_final  0x000000000000ffff  // low 16 bits; passed as the last mask of inv_zero_padding
+
+#define mask0_64    0x000000ff000000ff  // low byte of each 32-bit half
+#define mask0       0x000000ff          // low byte of a 32-bit word
+
+#define mask1_64    0x000f000f000f000f  // low nibble of each 16-bit lane
+#define mask1       0x000f000f          // 32-bit counterpart of mask1_64
+
+#define mask2_64    0x0303030303030303  // low two bits of each byte
+#define mask2       0x03030303          // 32-bit counterpart of mask2_64
+
+#define mask3_64    0x1111111111111111  // low bit of each nibble
+#define mask3       0x11111111          // 32-bit counterpart of mask3_64
+
+#define zero_padding(x0, mask1, mask2, mask3) \
+        x0 = (x0 | (x0 << 12)) & mask1; \
+        x0 = (x0 | (x0 << 6 )) & mask2; \
+        x0 = (x0 | (x0 << 3 )) & mask3;
+
+#define inv_zero_padding(x0, mask0, mask1, mask2, mask_final) \
+        x0 = (x0 | (x0 >> 3 )) & mask2; \
+        x0 = (x0 | (x0 >> 6 )) & mask1; \
+        x0 = (x0 | (x0 >> 12)) & mask0; \
+        x0 = (x0 | (x0 >> 24)) & mask_final;
+// GF_mul: c = a * b for elements stored as three 64-bit limbs (limb 0 least significant). The 384-bit carry-less product is folded with shifts 0/1/2/7, i.e. reduced modulo x^192 + x^7 + x^2 + x + 1. c is written only after the last read of a and b.
+void GF_mul(GF c, const GF a, const GF b)
+{
+  uint64_t t[3] = {0,};      // scratch words, reused several times below
+  uint64_t temp[6] = {0,};   // the six 64-bit words of the unreduced product
+  uint64_t sub[6] = {0,};    // limb sums feeding the cross products
+  // Pairwise limb sums (XOR) of a and of b for the three Karatsuba-style cross terms.
+  sub[0] = a[0] ^ a[1];
+  sub[1] = b[0] ^ b[1];
+  sub[2] = a[0] ^ a[2];
+  sub[3] = b[0] ^ b[2];
+  sub[4] = a[1] ^ a[2];
+  sub[5] = b[1] ^ b[2];
+  // Products of matching limbs. NOTE(review): poly64_mul is defined outside this file; as used here its 3rd argument receives the high word and its 4th the low word -- confirm.
+  poly64_mul(&a[0], &b[0], &t[0], &temp[0]);
+  poly64_mul(&a[1], &b[1], &t[2], &t[1]);
+  t[0] ^= t[1];              // high(a0*b0) ^ low(a1*b1)
+  // t[1] is overwritten with low(a2*b2); the high word goes straight to temp[5].
+  poly64_mul(&a[2], &b[2], &temp[5], &t[1]);
+  t[1] ^= t[2];              // low(a2*b2) ^ high(a1*b1)
+  // Seed words 1..4 with the combinations of the three diagonal products; order matters because each line uses the previous one.
+  temp[1] = t[0] ^ temp[0];
+  temp[2] = t[1] ^ temp[1];
+  temp[4] = temp[5] ^ t[1];
+  temp[3] = temp[4] ^ t[0];
+  // (a0^a1)*(b0^b1) is added into words 1 and 2.
+  poly64_mul(&sub[0], &sub[1], &t[1], &t[0]);
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+  // (a0^a2)*(b0^b2) is added into words 2 and 3.
+  poly64_mul(&sub[2], &sub[3], &t[1], &t[0]);
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+  // (a1^a2)*(b1^b2) is added into words 3 and 4.
+  poly64_mul(&sub[4], &sub[5], &t[1], &t[0]);
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+  // Reduction starts here: bits of temp[5] that would be shifted out of limb 2 (shifts 7, 2, 1) are folded into word 3 first.
+  t[0] = temp[3] ^ ((temp[5] >> 57) ^ (temp[5] >> 62) ^ (temp[5] >> 63));
+  // Limb 2: word 5 at shifts 0/7/2/1, with the bits shifted out of word 4 filling the low positions.
+  c[2] = temp[2] ^ temp[5];
+  c[2] ^= (temp[5] << 7) | (temp[4] >> 57);
+  c[2] ^= (temp[5] << 2) | (temp[4] >> 62);
+  c[2] ^= (temp[5] << 1) | (temp[4] >> 63);
+  // Limb 1: word 4 at shifts 0/7/2/1, with the bits shifted out of the adjusted word 3 (t[0]).
+  c[1] = temp[1] ^ temp[4];
+  c[1] ^= (temp[4] << 7) | (t[0] >> 57);
+  c[1] ^= (temp[4] << 2) | (t[0] >> 62);
+  c[1] ^= (temp[4] << 1) | (t[0] >> 63);
+  // Limb 0: adjusted word 3 at shifts 0/7/2/1; nothing lower to carry in.
+  c[0] = temp[0] ^ t[0];
+  c[0] ^= (t[0] << 7);
+  c[0] ^= (t[0] << 2);
+  c[0] ^= (t[0] << 1);
+}
+// GF_mul_add: c ^= a * b. Same limb arithmetic and reduction (shifts 0/1/2/7, i.e. modulo x^192 + x^7 + x^2 + x + 1) as GF_mul, but the reduced product is XORed into the existing value of c.
+void GF_mul_add(GF c, const GF a, const GF b)
+{
+  uint64_t t[3] = {0,};      // scratch words, reused several times below
+  uint64_t temp[6] = {0,};   // the six 64-bit words of the unreduced product
+  uint64_t sub[6] = {0,};    // limb sums feeding the cross products
+  // Pairwise limb sums (XOR) of a and of b for the three Karatsuba-style cross terms.
+  sub[0] = a[0] ^ a[1];
+  sub[1] = b[0] ^ b[1];
+  sub[2] = a[0] ^ a[2];
+  sub[3] = b[0] ^ b[2];
+  sub[4] = a[1] ^ a[2];
+  sub[5] = b[1] ^ b[2];
+  // Products of matching limbs. NOTE(review): poly64_mul is defined outside this file; as used here its 3rd argument receives the high word and its 4th the low word -- confirm.
+  poly64_mul(&a[0], &b[0], &t[0], &temp[0]);
+  poly64_mul(&a[1], &b[1], &t[2], &t[1]);
+  t[0] ^= t[1];              // high(a0*b0) ^ low(a1*b1)
+  // t[1] is overwritten with low(a2*b2); the high word goes straight to temp[5].
+  poly64_mul(&a[2], &b[2], &temp[5], &t[1]);
+  t[1] ^= t[2];              // low(a2*b2) ^ high(a1*b1)
+  // Seed words 1..4 with the combinations of the three diagonal products; order matters because each line uses the previous one.
+  temp[1] = t[0] ^ temp[0];
+  temp[2] = t[1] ^ temp[1];
+  temp[4] = temp[5] ^ t[1];
+  temp[3] = temp[4] ^ t[0];
+  // (a0^a1)*(b0^b1) is added into words 1 and 2.
+  poly64_mul(&sub[0], &sub[1], &t[1], &t[0]);
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+  // (a0^a2)*(b0^b2) is added into words 2 and 3.
+  poly64_mul(&sub[2], &sub[3], &t[1], &t[0]);
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+  // (a1^a2)*(b1^b2) is added into words 3 and 4.
+  poly64_mul(&sub[4], &sub[5], &t[1], &t[0]);
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+  // Reduction starts here: bits of temp[5] that would be shifted out of limb 2 (shifts 7, 2, 1) are folded into word 3 first.
+  t[0] = temp[3] ^ ((temp[5] >> 57) ^ (temp[5] >> 62) ^ (temp[5] >> 63));
+  // Limb 2 accumulates (note ^=): word 5 at shifts 0/7/2/1 plus the bits shifted out of word 4.
+  c[2] ^= temp[2] ^ temp[5];
+  c[2] ^= (temp[5] << 7) | (temp[4] >> 57);
+  c[2] ^= (temp[5] << 2) | (temp[4] >> 62);
+  c[2] ^= (temp[5] << 1) | (temp[4] >> 63);
+  // Limb 1 accumulates: word 4 at shifts 0/7/2/1 plus the bits shifted out of the adjusted word 3 (t[0]).
+  c[1] ^= temp[1] ^ temp[4];
+  c[1] ^= (temp[4] << 7) | (t[0] >> 57);
+  c[1] ^= (temp[4] << 2) | (t[0] >> 62);
+  c[1] ^= (temp[4] << 1) | (t[0] >> 63);
+  // Limb 0 accumulates: adjusted word 3 at shifts 0/7/2/1.
+  c[0] ^= temp[0] ^ t[0];
+  c[0] ^= (t[0] << 7);
+  c[0] ^= (t[0] << 2);
+  c[0] ^= (t[0] << 1);
+}
+// GF_transposed_matmul: overwrite c with the XOR of every row b[i] whose bit i is set in a.
+void GF_transposed_matmul(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD])
+{
+  // One running XOR per 64-bit limb of the result.
+  uint64_t acc0 = 0, acc1 = 0, acc2 = 0;
+  const GF *rows = b;
+
+  for (size_t word = 0; word < AIM2_NUM_WORDS_FIELD; ++word)
+  {
+    uint64_t bits = a[word];
+
+    // Walk the word four bits per step, least significant bit first;
+    // rows[0..3] are the matrix rows belonging to those four bits.
+    for (size_t pos = 0; pos < AIM2_NUM_BITS_WORD; pos += 4)
+    {
+      // Each selector is all-ones when its bit of a is set and zero
+      // otherwise, so rows are picked by masking rather than branching.
+      const uint64_t sel0 = -(bits & 1);
+      const uint64_t sel1 = -((bits >> 1) & 1);
+      const uint64_t sel2 = -((bits >> 2) & 1);
+      const uint64_t sel3 = -((bits >> 3) & 1);
+
+      acc0 ^= (rows[0][0] & sel0) ^ (rows[1][0] & sel1) ^
+              (rows[2][0] & sel2) ^ (rows[3][0] & sel3);
+      acc1 ^= (rows[0][1] & sel0) ^ (rows[1][1] & sel1) ^
+              (rows[2][1] & sel2) ^ (rows[3][1] & sel3);
+      acc2 ^= (rows[0][2] & sel0) ^ (rows[1][2] & sel1) ^
+              (rows[2][2] & sel2) ^ (rows[3][2] & sel3);
+
+      // Advance to the next nibble of a and the next four rows of b.
+      bits >>= 4;
+      rows += 4;
+    }
+  }
+
+  // Store the accumulated limbs; the previous content of c is not read.
+  c[0] = acc0;
+  c[1] = acc1;
+  c[2] = acc2;
+}
+// GF_transposed_matmul_add: XOR into c every row b[i] whose bit i is set in a (accumulating variant).
+void GF_transposed_matmul_add(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD])
+{
+  // One running XOR per 64-bit limb of the selected rows.
+  uint64_t acc0 = 0, acc1 = 0, acc2 = 0;
+  const GF *rows = b;
+
+  for (size_t word = 0; word < AIM2_NUM_WORDS_FIELD; ++word)
+  {
+    uint64_t bits = a[word];
+
+    // Walk the word four bits per step, least significant bit first;
+    // rows[0..3] are the matrix rows belonging to those four bits.
+    for (size_t pos = 0; pos < AIM2_NUM_BITS_WORD; pos += 4)
+    {
+      // Each selector is all-ones when its bit of a is set and zero
+      // otherwise, so rows are picked by masking rather than branching.
+      const uint64_t sel0 = -(bits & 1);
+      const uint64_t sel1 = -((bits >> 1) & 1);
+      const uint64_t sel2 = -((bits >> 2) & 1);
+      const uint64_t sel3 = -((bits >> 3) & 1);
+
+      acc0 ^= (rows[0][0] & sel0) ^ (rows[1][0] & sel1) ^
+              (rows[2][0] & sel2) ^ (rows[3][0] & sel3);
+      acc1 ^= (rows[0][1] & sel0) ^ (rows[1][1] & sel1) ^
+              (rows[2][1] & sel2) ^ (rows[3][1] & sel3);
+      acc2 ^= (rows[0][2] & sel0) ^ (rows[1][2] & sel1) ^
+              (rows[2][2] & sel2) ^ (rows[3][2] & sel3);
+
+      // Advance to the next nibble of a and the next four rows of b.
+      bits >>= 4;
+      rows += 4;
+    }
+  }
+
+  // Fold the accumulated limbs into the caller's value of c.
+  c[0] ^= acc0;
+  c[1] ^= acc1;
+  c[2] ^= acc2;
+}
+// poly64_mul_s: carry-less (polynomial over GF(2)) product of the 64-bit words x and y; *z0 receives the low 64 bits, *z1 the high 64 bits. Built only from shifts, masks and integer multiplications.
+static void poly64_mul_s(uint64_t *z1, uint64_t *z0, uint64_t x, uint64_t y)
+{
+  // split x into bytes: x4 first holds the high 32 bits, x0 the low 32 bits
+  uint32_t x4 = x >> 32;
+
+  uint32_t x0 = x & mask_64;
+  uint32_t x1 = (x0 >> 8) & mask0;
+  uint32_t x2 = (x0 >> 16) & mask0;
+  uint32_t x3 = (x0 >> 24) & mask0;
+  x0 &= mask0;
+
+  // bytes 5..7 of x, then x4 is trimmed to byte 4
+  uint32_t x5 = (x4 >> 8) & mask0;
+  uint32_t x6 = (x4 >> 16) & mask0;
+  uint32_t x7 = (x4 >> 24) & mask0;
+  x4 &= mask0;
+
+  // split y the same way: y4 first holds the high 32 bits, y0 the low 32 bits
+  uint32_t y4 = y >> 32;
+
+  uint32_t y0 = y & mask_64;
+  uint32_t y1 = (y0 >> 8) & mask0;
+  uint32_t y2 = (y0 >> 16) & mask0;
+  uint32_t y3 = (y0 >> 24) & mask0;
+  y0 &= mask0;
+
+  // bytes 5..7 of y, then y4 is trimmed to byte 4
+  uint32_t y5 = (y4 >> 8) & mask0;
+  uint32_t y6 = (y4 >> 16) & mask0;
+  uint32_t y7 = (y4 >> 24) & mask0;
+  y4 &= mask0;
+
+  // spread each byte of x so that its 8 bits occupy the low bit of 8 nibbles
+  zero_padding(x0, mask1, mask2, mask3);
+  zero_padding(x1, mask1, mask2, mask3);
+  zero_padding(x2, mask1, mask2, mask3);
+  zero_padding(x3, mask1, mask2, mask3);
+  zero_padding(x4, mask1, mask2, mask3);
+  zero_padding(x5, mask1, mask2, mask3);
+  zero_padding(x6, mask1, mask2, mask3);
+  zero_padding(x7, mask1, mask2, mask3);
+
+  // spread each byte of y the same way
+  zero_padding(y0, mask1, mask2, mask3);
+  zero_padding(y1, mask1, mask2, mask3);
+  zero_padding(y2, mask1, mask2, mask3);
+  zero_padding(y3, mask1, mask2, mask3);
+  zero_padding(y4, mask1, mask2, mask3);
+  zero_padding(y5, mask1, mask2, mask3);
+  zero_padding(y6, mask1, mask2, mask3);
+  zero_padding(y7, mask1, mask2, mask3);
+
+  // low halves (bytes 0..3 of x and y): an integer product of two spread bytes counts, per nibble, the coinciding bit pairs (at most 8, so no carry leaves the nibble); "& mask3_64" keeps the parity, i.e. the XOR. a0..a3 each collect 16 result bits in spread form.
+  uint64_t a0 = ((uint64_t)x0 * (uint64_t)y0) & mask3_64;
+  uint64_t a1 = ((((uint64_t)x0 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y0) & mask3_64));
+  a0 ^= (a1 << 32);
+  a1 = a1 >> 32;
+  a1 ^= ((((uint64_t)x0 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x1 * (uint64_t)y1) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y0) & mask3_64));
+
+  uint64_t a2 = ((((uint64_t)x0 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y2) & mask3_64) ^
+                 (((uint64_t)x2 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y0) & mask3_64));
+  a1 ^= (a2 << 32);
+  a2 = a2 >> 32;
+  a2 ^= ((((uint64_t)x1 * (uint64_t)y3) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x3 * (uint64_t)y1) & mask3_64));
+
+  uint64_t a3 = ((((uint64_t)x2 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y2) & mask3_64));
+  a2 ^= (a3 << 32);
+  a3 = a3 >> 32;
+
+  a3 ^= ((uint64_t)x3 * (uint64_t)y3) & mask3_64;
+
+  // high halves (bytes 4..7 of x and y): same byte-wise scheme, collected in b0..b3
+  uint64_t b0 = ((uint64_t)x4 * (uint64_t)y4) & mask3_64;
+  uint64_t b1 = ((((uint64_t)x4 * (uint64_t)y5) & mask3_64) ^
+                 (((uint64_t)x5 * (uint64_t)y4) & mask3_64));
+  b0 ^= (b1 << 32);
+  b1 = b1 >> 32;
+  b1 ^= ((((uint64_t)x4 * (uint64_t)y6) & mask3_64) ^
+         (((uint64_t)x5 * (uint64_t)y5) & mask3_64) ^
+         (((uint64_t)x6 * (uint64_t)y4) & mask3_64));
+
+  uint64_t b2 = ((((uint64_t)x4 * (uint64_t)y7) & mask3_64) ^
+                 (((uint64_t)x5 * (uint64_t)y6) & mask3_64) ^
+                 (((uint64_t)x6 * (uint64_t)y5) & mask3_64) ^
+                 (((uint64_t)x7 * (uint64_t)y4) & mask3_64));
+  b1 ^= (b2 << 32);
+  b2 = b2 >> 32;
+  b2 ^= ((((uint64_t)x5 * (uint64_t)y7) & mask3_64) ^
+         (((uint64_t)x6 * (uint64_t)y6) & mask3_64) ^
+         (((uint64_t)x7 * (uint64_t)y5) & mask3_64));
+
+  uint64_t b3 = ((((uint64_t)x6 * (uint64_t)y7) & mask3_64) ^
+                 (((uint64_t)x7 * (uint64_t)y6) & mask3_64));
+  b2 ^= (b3 << 32);
+  b3 = b3 >> 32;
+
+  b3 ^= ((uint64_t)x7 * (uint64_t)y7) & mask3_64;
+
+  // middle term: (x_low ^ x_high) * (y_low ^ y_high); the XOR is done directly on the spread bytes
+  x0 ^= x4;
+  x1 ^= x5;
+  x2 ^= x6;
+  x3 ^= x7;
+
+  y0 ^= y4;
+  y1 ^= y5;
+  y2 ^= y6;
+  y3 ^= y7;
+
+  uint64_t c0 = ((uint64_t)x0 * (uint64_t)y0) & mask3_64;
+  uint64_t c1 = ((((uint64_t)x0 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y0) & mask3_64));
+  c0 ^= (c1 << 32);
+  c1 = c1 >> 32;
+  c1 ^= ((((uint64_t)x0 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x1 * (uint64_t)y1) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y0) & mask3_64));
+
+  uint64_t c2 = ((((uint64_t)x0 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y2) & mask3_64) ^
+                 (((uint64_t)x2 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y0) & mask3_64));
+  c1 ^= (c2 << 32);
+  c2 = c2 >> 32;
+  c2 ^= ((((uint64_t)x1 * (uint64_t)y3) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x3 * (uint64_t)y1) & mask3_64));
+
+  uint64_t c3 = ((((uint64_t)x2 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y2) & mask3_64));
+  c2 ^= (c3 << 32);
+  c3 = c3 >> 32;
+
+  c3 ^= ((uint64_t)x3 * (uint64_t)y3) & mask3_64;
+  // remove the low*low and high*high parts so c0..c3 hold only the cross term
+  c0 = c0 ^ a0 ^ b0;
+  c1 = c1 ^ a1 ^ b1;
+  c2 = c2 ^ a2 ^ b2;
+  c3 = c3 ^ a3 ^ b3;
+  // add the cross term two 16-bit units (32 result bits) above the low product
+  a2 ^= c0;
+  a3 ^= c1;
+  b0 ^= c2;
+  b1 ^= c3;
+
+  // compress each spread 64-bit unit back to its 16 result bits
+  inv_zero_padding(a0, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a1, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a2, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a3, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b0, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b1, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b2, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b3, mask0_64, mask1_64, mask2_64, mask_final);
+
+  *z0 = a0 | (a1 << 16) | (a2 << 32) | (a3 << 48);
+  *z1 = b0 | (b1 << 16) | (b2 << 32) | (b3 << 48);
+}
+// GF_mul_s: c = a * b with the same limb formula and reduction (shifts 0/1/2/7) as GF_mul, but every 64x64 product comes from the local C helper poly64_mul_s and the limb sums are passed by value.
+void GF_mul_s(GF c, const GF a, const GF b)
+{
+  uint64_t t[3] = {0,};      // scratch words, reused several times below
+  uint64_t temp[6] = {0,};   // the six 64-bit words of the unreduced product
+  // Products of matching limbs; poly64_mul_s writes the high word through its 1st argument and the low word through its 2nd.
+  poly64_mul_s(&t[0], &temp[0], a[0], b[0]);
+  poly64_mul_s(&t[2], &t[1], a[1], b[1]);
+  t[0] ^= t[1];              // high(a0*b0) ^ low(a1*b1)
+  // t[1] is overwritten with low(a2*b2); the high word goes straight to temp[5].
+  poly64_mul_s(&temp[5], &t[1], a[2], b[2]);
+  t[1] ^= t[2];              // low(a2*b2) ^ high(a1*b1)
+  // Seed words 1..4 with the combinations of the three diagonal products; each line uses the previous one.
+  temp[1] = t[0] ^ temp[0];
+  temp[2] = t[1] ^ temp[1];
+  temp[4] = temp[5] ^ t[1];
+  temp[3] = temp[4] ^ t[0];
+  // (a0^a1)*(b0^b1) is added into words 1 and 2.
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[1]), (b[0] ^ b[1]));
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+  // (a0^a2)*(b0^b2) is added into words 2 and 3.
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[2]), (b[0] ^ b[2]));
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+  // (a1^a2)*(b1^b2) is added into words 3 and 4.
+  poly64_mul_s(&t[1], &t[0], (a[1] ^ a[2]), (b[1] ^ b[2]));
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+  // Reduction: bits of temp[5] that would be shifted out of limb 2 (shifts 7, 2, 1) are folded into word 3 first.
+  t[0] = temp[3] ^ ((temp[5] >> 57) ^ (temp[5] >> 62) ^ (temp[5] >> 63));
+  // Limb 2: word 5 at shifts 0/7/2/1 plus the bits shifted out of word 4.
+  c[2] = temp[2] ^ temp[5];
+  c[2] ^= (temp[5] << 7) | (temp[4] >> 57);
+  c[2] ^= (temp[5] << 2) | (temp[4] >> 62);
+  c[2] ^= (temp[5] << 1) | (temp[4] >> 63);
+  // Limb 1: word 4 at shifts 0/7/2/1 plus the bits shifted out of the adjusted word 3 (t[0]).
+  c[1] = temp[1] ^ temp[4];
+  c[1] ^= (temp[4] << 7) | (t[0] >> 57);
+  c[1] ^= (temp[4] << 2) | (t[0] >> 62);
+  c[1] ^= (temp[4] << 1) | (t[0] >> 63);
+  // Limb 0: adjusted word 3 at shifts 0/7/2/1.
+  c[0] = temp[0] ^ t[0];
+  c[0] ^= (t[0] << 7);
+  c[0] ^= (t[0] << 2);
+  c[0] ^= (t[0] << 1);
+}
+// GF_mul_add_s: c ^= a * b, the accumulating counterpart of GF_mul_s; products come from poly64_mul_s and the reduced result is XORed into the existing value of c.
+void GF_mul_add_s(GF c, const GF a, const GF b)
+{
+  uint64_t t[3] = {0,};      // scratch words, reused several times below
+  uint64_t temp[6] = {0,};   // the six 64-bit words of the unreduced product
+  // Products of matching limbs; poly64_mul_s writes the high word through its 1st argument and the low word through its 2nd.
+  poly64_mul_s(&t[0], &temp[0], a[0], b[0]);
+  poly64_mul_s(&t[2], &t[1], a[1], b[1]);
+  t[0] ^= t[1];              // high(a0*b0) ^ low(a1*b1)
+  // t[1] is overwritten with low(a2*b2); the high word goes straight to temp[5].
+  poly64_mul_s(&temp[5], &t[1], a[2], b[2]);
+  t[1] ^= t[2];              // low(a2*b2) ^ high(a1*b1)
+  // Seed words 1..4 with the combinations of the three diagonal products; each line uses the previous one.
+  temp[1] = t[0] ^ temp[0];
+  temp[2] = t[1] ^ temp[1];
+  temp[4] = temp[5] ^ t[1];
+  temp[3] = temp[4] ^ t[0];
+  // (a0^a1)*(b0^b1) is added into words 1 and 2.
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[1]), (b[0] ^ b[1]));
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+  // (a0^a2)*(b0^b2) is added into words 2 and 3.
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[2]), (b[0] ^ b[2]));
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+  // (a1^a2)*(b1^b2) is added into words 3 and 4.
+  poly64_mul_s(&t[1], &t[0], (a[1] ^ a[2]), (b[1] ^ b[2]));
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+  // Reduction: bits of temp[5] that would be shifted out of limb 2 (shifts 7, 2, 1) are folded into word 3 first.
+  t[0] = temp[3] ^ ((temp[5] >> 57) ^ (temp[5] >> 62) ^ (temp[5] >> 63));
+  // Limb 2 accumulates (note ^=): word 5 at shifts 0/7/2/1 plus the bits shifted out of word 4.
+  c[2] ^= temp[2] ^ temp[5];
+  c[2] ^= (temp[5] << 7) | (temp[4] >> 57);
+  c[2] ^= (temp[5] << 2) | (temp[4] >> 62);
+  c[2] ^= (temp[5] << 1) | (temp[4] >> 63);
+  // Limb 1 accumulates: word 4 at shifts 0/7/2/1 plus the bits shifted out of the adjusted word 3 (t[0]).
+  c[1] ^= temp[1] ^ temp[4];
+  c[1] ^= (temp[4] << 7) | (t[0] >> 57);
+  c[1] ^= (temp[4] << 2) | (t[0] >> 62);
+  c[1] ^= (temp[4] << 1) | (t[0] >> 63);
+  // Limb 0 accumulates: adjusted word 3 at shifts 0/7/2/1.
+  c[0] ^= temp[0] ^ t[0];
+  c[0] ^= (t[0] << 7);
+  c[0] ^= (t[0] << 2);
+  c[0] ^= (t[0] << 1);
+}
diff --git a/crypto_sign/aimer192s/m4stack/field.h b/crypto_sign/aimer192s/m4stack/field.h
new file mode 100644
index 00000000..5182adc4
--- /dev/null
+++ b/crypto_sign/aimer192s/m4stack/field.h
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef FIELD_H
+#define FIELD_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+typedef uint64_t GF[3]; // field element: three 64-bit words (192 bits); an array type, so it decays to a pointer when passed
+// NOTE(review): poly64_mul, GF_set0, GF_copy, GF_to_bytes, GF_from_bytes, GF_add and GF_sqr_s are declared here but not defined in field.c; presumably they live in the assembly file -- confirm.
+#define poly64_mul AIMER_NAMESPACE(poly64_mul)
+void poly64_mul(const uint64_t *a, const uint64_t *b, uint64_t *c1, uint64_t *c0);
+#define GF_set0 AIMER_NAMESPACE(GF_set0)
+void GF_set0(GF a);
+#define GF_copy AIMER_NAMESPACE(GF_copy)
+void GF_copy(GF out, const GF in);
+#define GF_to_bytes AIMER_NAMESPACE(GF_to_bytes)
+void GF_to_bytes(uint8_t *out, const GF in);
+#define GF_from_bytes AIMER_NAMESPACE(GF_from_bytes)
+void GF_from_bytes(GF out, const uint8_t *in);
+// Arithmetic: GF_mul overwrites c with a*b and GF_mul_add XORs a*b into c; the two transposed_matmul routines follow the same overwrite / accumulate split.
+#define GF_add AIMER_NAMESPACE(GF_add)
+void GF_add(GF c, const GF a, const GF b);
+#define GF_mul AIMER_NAMESPACE(GF_mul)
+void GF_mul(GF c, const GF a, const GF b);
+#define GF_mul_add AIMER_NAMESPACE(GF_mul_add)
+void GF_mul_add(GF c, const GF a, const GF b);
+#define GF_transposed_matmul AIMER_NAMESPACE(GF_transposed_matmul)
+void GF_transposed_matmul(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]);
+#define GF_transposed_matmul_add AIMER_NAMESPACE(GF_transposed_matmul_add)
+void GF_transposed_matmul_add(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]);
+// "_s" variants: in field.c, GF_mul_s and GF_mul_add_s obtain their 64x64 products from the C helper poly64_mul_s instead of poly64_mul.
+#define GF_mul_s AIMER_NAMESPACE(GF_mul_s)
+void GF_mul_s(GF c, const GF a, const GF b);
+#define GF_mul_add_s AIMER_NAMESPACE(GF_mul_add_s)
+void GF_mul_add_s(GF c, const GF a, const GF b);
+#define GF_sqr_s AIMER_NAMESPACE(GF_sqr_s)
+void GF_sqr_s(GF c, const GF a);
+
+#endif // FIELD_H
diff --git a/crypto_sign/aimer192s/m4stack/hash.c b/crypto_sign/aimer192s/m4stack/hash.c
new file mode 100644
index 00000000..06a6f473
--- /dev/null
+++ b/crypto_sign/aimer192s/m4stack/hash.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+
+#include "hash.h"
+#include <stddef.h>
+#include <stdint.h>
+// hash_init: initialise ctx through shake256_inc_init; nothing is absorbed yet.
+void hash_init(hash_instance *ctx)
+{
+  shake256_inc_init(ctx);
+}
+// hash_init_prefix: initialise ctx and absorb the single byte `prefix` as the first input byte.
+void hash_init_prefix(hash_instance *ctx, uint8_t prefix)
+{
+  shake256_inc_init(ctx);
+  shake256_inc_absorb(ctx, &prefix, sizeof(prefix)); // sizeof(prefix) is 1: exactly one byte is absorbed
+}
+// hash_update: absorb data_len bytes starting at data into ctx.
+void hash_update(hash_instance *ctx, const uint8_t *data, size_t data_len)
+{
+  shake256_inc_absorb(ctx, data, data_len);
+}
+// hash_final: forward to shake256_inc_finalize; callers in this code base squeeze only after this call.
+void hash_final(hash_instance *ctx)
+{
+  shake256_inc_finalize(ctx);
+}
+// hash_squeeze: write buffer_len output bytes to buffer. Note the argument order flips: this wrapper takes ctx first, shake256_inc_squeeze takes it last.
+void hash_squeeze(hash_instance *ctx, uint8_t *buffer, size_t buffer_len)
+{
+  shake256_inc_squeeze(buffer, buffer_len, ctx);
+}
+// hash_ctx_clone: copy the state of ctx_src into ctx_dest via shake256_inc_ctx_clone; ctx_src is const and left as is.
+void hash_ctx_clone(hash_instance *ctx_dest, const hash_instance *ctx_src)
+{
+  shake256_inc_ctx_clone(ctx_dest, ctx_src);
+}
+// hash_ctx_release: hand ctx to shake256_inc_ctx_release. NOTE(review): whether that frees memory depends on the fips202 implementation in use -- confirm.
+void hash_ctx_release(hash_instance *ctx)
+{
+  shake256_inc_ctx_release(ctx);
+}
diff --git a/crypto_sign/aimer192s/m4stack/hash.h b/crypto_sign/aimer192s/m4stack/hash.h
new file mode 100644
index 00000000..0fb7c8ef
--- /dev/null
+++ b/crypto_sign/aimer192s/m4stack/hash.h
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef HASH_H
+#define HASH_H
+
+#include "fips202.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+static const uint8_t HASH_PREFIX_0 = 0; // one-byte tags 0..5; NOTE(review): presumably passed to hash_init_prefix -- confirm at the call sites
+static const uint8_t HASH_PREFIX_1 = 1;
+static const uint8_t HASH_PREFIX_2 = 2;
+static const uint8_t HASH_PREFIX_3 = 3;
+static const uint8_t HASH_PREFIX_4 = 4;
+static const uint8_t HASH_PREFIX_5 = 5; // static const in a header: each including translation unit gets its own copy
+// The hash state used throughout is the incremental SHAKE256 context type from fips202.h.
+typedef shake256incctx hash_instance;
+// Thin wrappers implemented in hash.c; each forwards to the corresponding shake256_inc_* function.
+#define hash_init AIMER_NAMESPACE(hash_init)
+void hash_init(hash_instance *ctx);
+#define hash_init_prefix AIMER_NAMESPACE(hash_init_prefix)
+void hash_init_prefix(hash_instance *ctx, uint8_t prefix);
+#define hash_update AIMER_NAMESPACE(hash_update)
+void hash_update(hash_instance *ctx, const uint8_t *data, size_t data_len);
+#define hash_final AIMER_NAMESPACE(hash_final)
+void hash_final(hash_instance *ctx);
+#define hash_squeeze AIMER_NAMESPACE(hash_squeeze)
+void hash_squeeze(hash_instance *ctx, uint8_t *buffer, size_t buffer_len);
+#define hash_ctx_clone AIMER_NAMESPACE(hash_ctx_clone)
+void hash_ctx_clone(hash_instance *ctx_dest, const hash_instance *ctx_src);
+#define hash_ctx_release AIMER_NAMESPACE(hash_ctx_release)
+void hash_ctx_release(hash_instance *ctx);
+
+#endif // HASH_H
diff --git a/crypto_sign/aimer192s/m4stack/params.h b/crypto_sign/aimer192s/m4stack/params.h
new file mode 100644
index 00000000..2afef61c
--- /dev/null
+++ b/crypto_sign/aimer192s/m4stack/params.h
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#define AIMER_NAMESPACE(s)          samsungsds_aimer192s_m4stack_##s // pastes a fixed prefix onto every exported symbol name
+
+#define SECURITY_BITS               192                  // security parameter
+#define SECURITY_BYTES              (SECURITY_BITS / 8)  // byte size of security parameter
+
+#define AIM2_NUM_BITS_FIELD         SECURITY_BITS        // number of bits in field element
+#define AIM2_NUM_BYTES_FIELD        SECURITY_BYTES       // number of bytes in field element
+#define AIM2_NUM_WORDS_FIELD        (SECURITY_BITS / 64) // number of 64-bit words in element
+#define AIM2_NUM_BITS_WORD          64                   // number of bits in word
+#define AIM2_IV_SIZE                SECURITY_BYTES       // byte size of AIM2 initial vector
+
+#define AIM2_NUM_INPUT_SBOX         2                    // number of AIM2 input S-boxes
+
+#define AIMER_SALT_SIZE             SECURITY_BYTES       // byte size of salt
+#define AIMER_SEED_SIZE             SECURITY_BYTES       // byte size of seed
+#define AIMER_COMMIT_SIZE           (SECURITY_BYTES * 2) // byte size of commitment
+
+#define AIMER_L                     AIM2_NUM_INPUT_SBOX  // alias: number of AIM2 input S-boxes
+#define AIMER_T                     25                   // number of parallel repetitions (Tau)
+#define AIMER_N                     256                  // number of MPC parties (N)
+#define AIMER_LOGN                  8                    // log_2(N)
+
+#endif // PARAMS_H
diff --git a/crypto_sign/aimer192s/m4stack/sign.c b/crypto_sign/aimer192s/m4stack/sign.c
new file mode 100644
index 00000000..905b10f8
--- /dev/null
+++ b/crypto_sign/aimer192s/m4stack/sign.c
@@ -0,0 +1,628 @@
+// SPDX-License-Identifier: MIT
+
+#include "api.h"
+#include "aim2.h"
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include "randombytes.h"
+#include "sign.h"
+#include "tree.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+void commit_and_expand_tape(tape_t *tape, uint8_t *commit,
+                            const hash_instance *ctx_precom,
+                            const uint8_t seed[AIMER_SEED_SIZE],
+                            size_t rep, size_t party)
+{
+  // hash input: repetition index byte || party index byte || party seed
+  uint8_t input[2 + AIMER_SEED_SIZE] = {(uint8_t)rep, (uint8_t)party};
+  memcpy(&input[2], seed, AIMER_SEED_SIZE);
+
+  // continue from the caller's precomputed hash state
+  hash_instance party_ctx;
+  hash_ctx_clone(&party_ctx, ctx_precom);
+  hash_update(&party_ctx, input, sizeof(input));
+  hash_final(&party_ctx);
+  // the first squeezed bytes are the commitment, the following ones the tape
+  hash_squeeze(&party_ctx, commit, AIMER_COMMIT_SIZE);
+  hash_squeeze(&party_ctx, (uint8_t *)tape, sizeof(*tape));
+  hash_ctx_release(&party_ctx);
+}
+
+void aim2_mpc(mult_chk_t *mult_chk,
+              const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+              const GF ct_GF)
+{
+  GF *x = mult_chk->x_shares;
+  GF *z = mult_chk->z_shares;
+
+  // pt + c = t ^ {2 ^ e - 1}
+  // --> t ^ {2 ^ e} + t * c = t * pt, checked as z = x * pt
+  GF_mul(z[0], x[0], aim2_constants[0]);
+  GF_transposed_matmul_add(z[0], x[0], aim2_e1_power_matrix);
+
+  // same relation for the second input S-box
+  GF_mul(z[1], x[1], aim2_constants[1]);
+  GF_transposed_matmul_add(z[1], x[1], aim2_e2_power_matrix);
+
+  // linear layer: accumulated into x[L], which the callers initialise
+  GF_transposed_matmul_add(x[AIMER_L], x[0], matrix_A[0]);
+  GF_transposed_matmul_add(x[AIMER_L], x[1], matrix_A[1]);
+
+  // x ^ {2 ^ e - 1} = pt + ct
+  // --> x ^ {2 ^ e} + x * ct = x * pt; five squarings in total
+  GF_sqr_s(z[AIMER_L], x[AIMER_L]);
+  for (size_t i = 0; i < 4; i++)
+  {
+    GF_sqr_s(z[AIMER_L], z[AIMER_L]);
+  }
+  GF_mul_add(z[AIMER_L], x[AIMER_L], ct_GF);
+}
+
+// commit to the parties' seeds and derive the offsets for the last shares
+void run_phase_1(signature_t *sign,
+                 const GF pt_GF, const GF sbox_outputs[AIMER_L],
+                 const uint8_t mu[AIMER_COMMIT_SIZE],
+                 const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE])
+{
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE];
+  uint8_t commits[AIMER_COMMIT_SIZE];
+
+  // running hash for h_1, started with mu and the salt
+  hash_instance ctx_h1;
+  hash_init_prefix(&ctx_h1, HASH_PREFIX_1);
+  hash_update(&ctx_h1, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_h1, sign->salt, AIMER_SALT_SIZE);
+
+  // common starting state cloned for every party's commitment
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    proof_t *proof = &sign->proofs[rep];
+
+    // compute parties' seeds using binary tree
+    expand_tree(nodes, sign->salt, rep, root_seeds[rep]);
+
+    // accumulators over all parties' tape values
+    tape_t acc;
+    memset(&acc, 0, sizeof(acc));
+
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      // expand the party's tape and absorb its commitment
+      tape_t tape;
+      commit_and_expand_tape(&tape, commits, &ctx_precom,
+                             nodes[AIMER_N - 1 + party], rep, party);
+      hash_update(&ctx_h1, commits, AIMER_COMMIT_SIZE);
+
+      GF_add(acc.pt_share, acc.pt_share, tape.pt_share);
+      GF_add(acc.a_share, acc.a_share, tape.a_share);
+      GF_add(acc.c_share, acc.c_share, tape.c_share);
+      for (size_t ell = 0; ell < AIMER_L; ell++)
+      {
+        GF_add(acc.t_shares[ell], acc.t_shares[ell], tape.t_shares[ell]);
+      }
+    }
+
+    // once every party is summed, add the real values to obtain the
+    // offsets that adjust the last party's shares
+    GF_add(acc.pt_share, acc.pt_share, pt_GF);
+    GF_mul_add_s(acc.c_share, pt_GF, acc.a_share);
+    GF_to_bytes(proof->delta_pt_bytes, acc.pt_share);
+    GF_to_bytes(proof->delta_c_bytes, acc.c_share);
+    for (size_t ell = 0; ell < AIMER_L; ell++)
+    {
+      GF_add(acc.t_shares[ell], acc.t_shares[ell], sbox_outputs[ell]);
+      GF_to_bytes(proof->delta_ts_bytes[ell], acc.t_shares[ell]);
+    }
+
+    // NOTE: depend on the order of values in proof_t
+    hash_update(&ctx_h1, proof->delta_pt_bytes,
+                AIM2_NUM_BYTES_FIELD * (AIMER_L + 2));
+  }
+  hash_ctx_release(&ctx_precom);
+
+  // h_1 covers mu, salt, all parties' commitments and all offsets
+  hash_final(&ctx_h1);
+  hash_squeeze(&ctx_h1, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h1);
+}
+// re-derives all parties' tapes, runs the MPC simulation and computes h_2
+void run_phase_1_to_3(signature_t *sign,
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE],
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b, const GF ct_GF)
+{
+  uint8_t commits[AIMER_COMMIT_SIZE]; // output of commit_and_expand_tape, not read here
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE];
+
+  GF pt_shares[AIMER_N];
+  GF alpha_v_shares[2][AIMER_N]; // [0]: alpha shares, [1]: v shares
+  GF epsilons[AIMER_L + 1];
+
+  // prepare h2
+  hash_instance ctx;
+  hash_init_prefix(&ctx, HASH_PREFIX_2);
+  hash_update(&ctx, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_update(&ctx, sign->salt, AIMER_SALT_SIZE);
+
+  // prepare epsilons: squeezed from a hash of h_1, one set per repetition
+  hash_instance ctx_e;
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  hash_instance ctx_precom; // common starting state of the per-party hashes
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  GF alpha = {0,};
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    // compute parties' seeds using binary tree
+    expand_tree(nodes, sign->salt, rep, root_seeds[rep]);
+
+    // initialize adjustment values
+    tape_t delta;
+    memset(&delta, 0, sizeof(tape_t));
+
+    // expand challenge
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF_set0(alpha); // alpha is re-accumulated for every repetition
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      // generate execution views and commitments
+      tape_t tape;
+      commit_and_expand_tape(&tape, commits, &ctx_precom,
+                             nodes[AIMER_N + party - 1], rep, party);
+
+      mult_chk_t mult_chk;
+      memset(&mult_chk, 0, sizeof(mult_chk));
+
+      // adjust the last party's shares with the offsets stored in the proof
+      if (party == AIMER_N - 1)
+      {
+        GF_from_bytes(delta.pt_share, sign->proofs[rep].delta_pt_bytes);
+        GF_from_bytes(delta.t_shares[0], sign->proofs[rep].delta_ts_bytes[0]);
+        GF_from_bytes(delta.t_shares[1], sign->proofs[rep].delta_ts_bytes[1]);
+        GF_from_bytes(delta.c_share, sign->proofs[rep].delta_c_bytes);
+
+        GF_add(tape.pt_share, delta.pt_share, tape.pt_share);
+        GF_add(tape.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+        GF_add(tape.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+        GF_add(tape.c_share, delta.c_share, tape.c_share);
+
+        GF_copy(mult_chk.x_shares[AIMER_L], vector_b); // only this party adds b
+      }
+
+      // run the MPC simulation and prepare the mult check inputs
+      GF_copy(mult_chk.x_shares[0], tape.t_shares[0]);
+      GF_copy(mult_chk.x_shares[1], tape.t_shares[1]);
+      GF_copy(pt_shares[party], tape.pt_share);
+      GF_copy(alpha_v_shares[0][party], tape.a_share);
+      GF_copy(alpha_v_shares[1][party], tape.c_share);
+      aim2_mpc(&mult_chk, matrix_A, ct_GF);
+
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[2], epsilons[2]);
+      GF_add(alpha, alpha, alpha_v_shares[0][party]);
+    }
+
+    // alpha is opened, so we can finish calculating v_share
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      GF_mul_add(alpha_v_shares[1][party], pt_shares[party], alpha);
+    }
+    hash_update(&ctx, (const uint8_t *)alpha_v_shares, sizeof(alpha_v_shares));
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+
+  hash_final(&ctx);
+  hash_squeeze(&ctx, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+}
+// derives the unopened party of each repetition from h_2 and completes the proofs
+void run_phase_1_to_5(signature_t *sign,
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE],
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b)
+{
+  // prepare challenge parties
+  hash_instance ctx;
+  hash_init(&ctx);
+  hash_update(&ctx, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_final(&ctx);
+
+  uint8_t indices[AIMER_T]; // AIMER_N <= 256
+  hash_squeeze(&ctx, indices, AIMER_T);
+  hash_ctx_release(&ctx);
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    indices[rep] &= (1 << AIMER_LOGN) - 1; // keep the low AIMER_LOGN bits
+  }
+
+  uint8_t commits[AIMER_COMMIT_SIZE];
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE];
+
+  GF epsilons[AIMER_L + 1];
+
+  // prepare epsilons
+  hash_instance ctx_e;
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  hash_instance ctx_precom; // common starting state of the per-party hashes
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  mult_chk_t mult_chk; // only x_shares is used in this function
+  memset(&mult_chk, 0, sizeof(mult_chk_t));
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    size_t i_bar = indices[rep]; // party that stays unopened in this repetition
+
+    // compute parties' seeds using binary tree
+    expand_tree(nodes, sign->salt, rep, root_seeds[rep]);
+    reveal_all_but(sign->proofs[rep].reveal_path,
+                   (const uint8_t (*)[AIMER_SEED_SIZE])nodes, i_bar);
+
+    // expand challenge
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF alpha_share = {0,};
+
+    tape_t tape; // tape and commitment of the unopened party only
+    commit_and_expand_tape(&tape, commits, &ctx_precom,
+                           nodes[AIMER_N + i_bar - 1], rep, i_bar);
+    memcpy(sign->proofs[rep].missing_commitment, commits,
+           AIMER_COMMIT_SIZE);
+
+    GF_set0(mult_chk.x_shares[AIMER_L]); // reset the accumulator per repetition
+
+    // adjust the last party's shares with the offsets stored in the proof
+    if (i_bar == AIMER_N - 1)
+    {
+      // initialize adjustment values
+      tape_t delta;
+      memset(&delta, 0, sizeof(tape_t));
+
+      GF_from_bytes(delta.pt_share, sign->proofs[rep].delta_pt_bytes);
+      GF_from_bytes(delta.t_shares[0], sign->proofs[rep].delta_ts_bytes[0]);
+      GF_from_bytes(delta.t_shares[1], sign->proofs[rep].delta_ts_bytes[1]);
+      GF_from_bytes(delta.c_share, sign->proofs[rep].delta_c_bytes);
+
+      GF_add(tape.pt_share, delta.pt_share, tape.pt_share);
+      GF_add(tape.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+      GF_add(tape.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+      GF_add(tape.c_share, delta.c_share, tape.c_share);
+
+      GF_copy(mult_chk.x_shares[AIMER_L], vector_b);
+    }
+
+    for (size_t ell = 0; ell < AIMER_L; ell++)
+    {
+      GF_copy(mult_chk.x_shares[ell], tape.t_shares[ell]);
+
+      // x_* = sum_i A[i] * t[i] + b
+      GF_transposed_matmul_add(mult_chk.x_shares[AIMER_L],
+                               tape.t_shares[ell], matrix_A[ell]);
+    }
+    GF_copy(alpha_share, tape.a_share);
+
+    GF_mul_add(alpha_share, mult_chk.x_shares[0], epsilons[0]);
+    GF_mul_add(alpha_share, mult_chk.x_shares[1], epsilons[1]);
+    GF_mul_add(alpha_share, mult_chk.x_shares[2], epsilons[2]);
+    GF_to_bytes(sign->proofs[rep].missing_alpha_share_bytes, alpha_share);
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk)
+{
+  if (pk == NULL || sk == NULL)
+  {
+    return -1;
+  }
+
+  // layout: pk = iv || ct, sk = pt || pk
+  randombytes(&sk[0], AIM2_NUM_BYTES_FIELD);
+  randombytes(&pk[0], AIM2_IV_SIZE);
+  aim2(&pk[AIM2_IV_SIZE], sk, pk);
+  memcpy(&sk[AIM2_NUM_BYTES_FIELD], pk, AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+
+  return 0;
+}
+// signs m with sk (pt || iv || ct); the signature_t is built in place in sig
+int crypto_sign_signature(uint8_t *sig, size_t *siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk)
+{
+  signature_t *sign = (signature_t *)sig; // sig must hold CRYPTO_BYTES bytes
+
+  GF pt_GF = {0,}, ct_GF = {0,};
+  GF_from_bytes(pt_GF, sk);
+  GF_from_bytes(ct_GF, sk + AIM2_NUM_BYTES_FIELD + AIM2_IV_SIZE);
+
+  // message pre-hashing
+  hash_instance ctx;
+  hash_init_prefix(&ctx, HASH_PREFIX_0);
+  hash_update(&ctx, sk + AIM2_NUM_BYTES_FIELD,
+              AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx, m, mlen);
+  hash_final(&ctx);
+
+  uint8_t mu[AIMER_COMMIT_SIZE];
+  hash_squeeze(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+
+  // compute first L sboxes' outputs
+  GF sbox_outputs[AIMER_L];
+  aim2_sbox_outputs(sbox_outputs, pt_GF);
+
+  // derive the binary matrix and the vector from the initial vector
+  GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+  generate_matrix_LU(matrix_A, vector_b, sk + AIM2_NUM_BYTES_FIELD);
+
+  // generate per-signature randomness
+  uint8_t random[SECURITY_BYTES];
+  randombytes(random, SECURITY_BYTES);
+
+  // generate salt and root seeds
+  hash_instance ctx_roots;
+  hash_init_prefix(&ctx_roots, HASH_PREFIX_3);
+  hash_update(&ctx_roots, sk, AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx_roots, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_roots, random, SECURITY_BYTES);
+  hash_final(&ctx_roots);
+  hash_squeeze(&ctx_roots, sign->salt, AIMER_SALT_SIZE); // salt comes first
+
+  uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE]; // one root seed per repetition
+  hash_squeeze(&ctx_roots, (uint8_t *)root_seeds, AIMER_T * AIMER_SEED_SIZE);
+  hash_ctx_release(&ctx_roots);
+  // the three passes below each re-expand the seed trees from root_seeds
+  run_phase_1(sign, pt_GF, (const GF *)sbox_outputs, mu,
+              (const uint8_t (*)[AIMER_SEED_SIZE])root_seeds);
+
+  run_phase_1_to_3(sign, (const uint8_t (*)[AIMER_SEED_SIZE])root_seeds,
+                   (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A,
+                   vector_b, ct_GF);
+
+  run_phase_1_to_5(sign, (const uint8_t (*)[AIMER_SEED_SIZE])root_seeds,
+                   (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A,
+                   vector_b);
+  // NOTE(review): pt_GF, random and root_seeds are not wiped before return
+  *siglen = CRYPTO_BYTES;
+
+  return 0;
+}
+
+int crypto_sign(uint8_t *sm, size_t *smlen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk)
+{
+  // sm = m || signature. The message is moved first (memmove: sm may overlap
+  // m) and signed from sm, so writing the signature cannot clobber it.
+  memmove(sm, m, mlen);
+  const int ret = crypto_sign_signature(sm + mlen, smlen, sm, mlen, sk);
+  *smlen += mlen;
+  return ret; // propagate the signing status instead of always returning 0
+}
+// recomputes h_1 and h_2 from sig, m and pk; returns 0 iff both match, else -1
+int crypto_sign_verify(const uint8_t *sig, size_t siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *pk)
+{
+  if (siglen != CRYPTO_BYTES) // only fixed-length signatures are accepted
+  {
+    return -1;
+  }
+
+  const signature_t *sign = (const signature_t *)sig;
+
+  GF ct_GF = {0,};
+  GF_from_bytes(ct_GF, pk + AIM2_IV_SIZE); // pk = iv || ct
+
+  // derive the binary matrix and the vector from the initial vector
+  GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+  generate_matrix_LU(matrix_A, vector_b, pk);
+
+  hash_instance ctx_e, ctx_h1, ctx_h2;
+
+  // indices = Expand(h_2)
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  uint8_t indices[AIMER_T]; // AIMER_N <= 256
+  hash_squeeze(&ctx_e, indices, AIMER_T);
+  hash_ctx_release(&ctx_e);
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    indices[rep] &= (1 << AIMER_LOGN) - 1; // keep the low AIMER_LOGN bits
+  }
+
+  // epsilons = Expand(h_1)
+  hash_init(&ctx_e); // ctx_e is reused after the release above
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  // message pre-hashing
+  uint8_t mu[AIMER_COMMIT_SIZE];
+  hash_init_prefix(&ctx_h1, HASH_PREFIX_0);
+  hash_update(&ctx_h1, pk, AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx_h1, m, mlen);
+  hash_final(&ctx_h1);
+  hash_squeeze(&ctx_h1, mu, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h1);
+
+  // ready for computing h_1' and h_2'
+  hash_init_prefix(&ctx_h1, HASH_PREFIX_1);
+  hash_update(&ctx_h1, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_h1, sign->salt, AIMER_SALT_SIZE);
+
+  hash_init_prefix(&ctx_h2, HASH_PREFIX_2);
+  hash_update(&ctx_h2, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_h2, sign->salt, AIMER_SALT_SIZE);
+
+  hash_instance ctx_precom; // common starting state of the per-party hashes
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    size_t i_bar = indices[rep]; // the party whose seed is not revealed
+    uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE]; // tree node k at nodes[k - 2]
+
+    reconstruct_tree(nodes, sign->salt, sign->proofs[rep].reveal_path,
+                     rep, i_bar);
+
+    GF pt_shares[AIMER_N]; // entry i_bar is never written nor read
+    GF alpha_v_shares[2][AIMER_N]; // [0]: alpha shares, [1]: v shares
+    GF_set0(alpha_v_shares[1][i_bar]); // accumulates the other parties' v shares
+
+    GF epsilons[AIMER_L + 1];
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF alpha = {0,};
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      if (party == i_bar) // unopened party: take the values from the proof
+      {
+        hash_update(&ctx_h1, sign->proofs[rep].missing_commitment,
+                    AIMER_COMMIT_SIZE);
+        GF_from_bytes(alpha_v_shares[0][i_bar],
+                      sign->proofs[rep].missing_alpha_share_bytes);
+        GF_add(alpha, alpha, alpha_v_shares[0][i_bar]);
+        continue;
+      }
+
+      tape_t tape;
+      uint8_t commit[AIMER_COMMIT_SIZE];
+      commit_and_expand_tape(&tape, commit, &ctx_precom,
+                             nodes[AIMER_N + party - 2], rep, party);
+      hash_update(&ctx_h1, commit, AIMER_COMMIT_SIZE);
+
+      // adjust last shares
+      mult_chk_t mult_chk;
+      memset(&mult_chk, 0, sizeof(mult_chk_t));
+      if (party == AIMER_N - 1)
+      {
+        GF temp = {0,};
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_pt_bytes);
+        GF_add(tape.pt_share, tape.pt_share, temp);
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[0]);
+        GF_add(tape.t_shares[0], tape.t_shares[0], temp);
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[1]);
+        GF_add(tape.t_shares[1], tape.t_shares[1], temp);
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_c_bytes);
+        GF_add(tape.c_share, tape.c_share, temp);
+
+        GF_copy(mult_chk.x_shares[AIMER_L], vector_b); // only this party adds b
+      }
+
+      // run the MPC simulation and prepare the mult check inputs
+      GF_copy(mult_chk.x_shares[0], tape.t_shares[0]);
+      GF_copy(mult_chk.x_shares[1], tape.t_shares[1]);
+      GF_copy(pt_shares[party], tape.pt_share);
+      GF_copy(alpha_v_shares[0][party], tape.a_share);
+      GF_copy(alpha_v_shares[1][party], tape.c_share);
+      aim2_mpc(&mult_chk, (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A, ct_GF);
+
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[2], epsilons[2]);
+
+      GF_add(alpha, alpha, alpha_v_shares[0][party]);
+    }
+
+    // alpha is opened, so we can finish calculating v_share
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      if (party == i_bar)
+      {
+        continue;
+      }
+
+      GF_mul_add(alpha_v_shares[1][party], pt_shares[party], alpha);
+      GF_add(alpha_v_shares[1][i_bar], alpha_v_shares[1][i_bar],
+             alpha_v_shares[1][party]);
+    }
+
+    // v is opened
+    hash_update(&ctx_h2, (const uint8_t *)alpha_v_shares,
+                sizeof(alpha_v_shares));
+
+    // NOTE: depend on the order of values in proof_t
+    hash_update(&ctx_h1, sign->proofs[rep].delta_pt_bytes,
+                AIM2_NUM_BYTES_FIELD * (AIMER_L + 2));
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+
+  uint8_t h_1_prime[AIMER_COMMIT_SIZE];
+  hash_final(&ctx_h1);
+  hash_squeeze(&ctx_h1, h_1_prime, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h1);
+
+  uint8_t h_2_prime[AIMER_COMMIT_SIZE];
+  hash_final(&ctx_h2);
+  hash_squeeze(&ctx_h2, h_2_prime, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h2);
+  // accept only if both recomputed hashes equal those in the signature
+  if (memcmp(h_1_prime, sign->h_1, AIMER_COMMIT_SIZE) != 0 ||
+      memcmp(h_2_prime, sign->h_2, AIMER_COMMIT_SIZE) != 0)
+  {
+    return -1;
+  }
+
+  return 0;
+}
+
+int crypto_sign_open(uint8_t *m, size_t *mlen,
+        const uint8_t *sm, size_t smlen,
+        const uint8_t *pk)
+{
+  // sm = message || signature, where the signature has a fixed size
+  if (smlen < CRYPTO_BYTES)
+  {
+    return -1;
+  }
+  const size_t msg_len = smlen - CRYPTO_BYTES;
+  const uint8_t *sig = sm + msg_len;
+
+  const int status = crypto_sign_verify(sig, CRYPTO_BYTES, sm, msg_len, pk);
+  if (status != 0)
+  {
+    return -1;
+  }
+
+  // write outputs only once verified; memmove tolerates m overlapping sm
+  memmove(m, sm, msg_len);
+  *mlen = msg_len;
+  return 0;
+}
diff --git a/crypto_sign/aimer192s/m4stack/sign.h b/crypto_sign/aimer192s/m4stack/sign.h
new file mode 100644
index 00000000..2e2ab61d
--- /dev/null
+++ b/crypto_sign/aimer192s/m4stack/sign.h
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef SIGN_H
+#define SIGN_H
+
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+typedef struct tape_t // one party's random tape, filled by hash_squeeze
+{
+  GF pt_share;          // share of pt
+  GF t_shares[AIMER_L]; // shares of the input S-box outputs
+  GF a_share;           // share of a, the start value of the alpha share
+  GF c_share;           // share of c, the start value of the v share
+} tape_t;
+
+typedef struct mult_chk_t // inputs of the multiplication check z = x * pt
+{
+  GF x_shares[AIMER_L + 1]; // [0..L-1]: input S-boxes, [L]: linear layer output
+  GF z_shares[AIMER_L + 1]; // computed from x_shares by aim2_mpc
+} mult_chk_t;
+
+typedef struct proof_t // per-repetition part of the signature (byte layout matters)
+{
+  uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE];      // sibling seeds, leaf level first
+  uint8_t missing_commitment[AIMER_COMMIT_SIZE];         // commitment of the unopened party
+  uint8_t delta_pt_bytes[AIM2_NUM_BYTES_FIELD];          // delta_pt, delta_ts, delta_c are
+  uint8_t delta_ts_bytes[AIMER_L][AIM2_NUM_BYTES_FIELD]; // hashed as one contiguous span:
+  uint8_t delta_c_bytes[AIM2_NUM_BYTES_FIELD];           // keep these three in this order
+  uint8_t missing_alpha_share_bytes[AIM2_NUM_BYTES_FIELD]; // alpha share of the unopened party
+} proof_t;
+
+typedef struct signature_t // overlaid directly on the raw signature buffer
+{
+  uint8_t salt[AIMER_SALT_SIZE];  // per-signature salt
+  uint8_t h_1[AIMER_COMMIT_SIZE]; // hash from which the epsilons are expanded
+  uint8_t h_2[AIMER_COMMIT_SIZE]; // hash from which the unopened parties are expanded
+  proof_t proofs[AIMER_T];        // one proof per repetition
+} signature_t;
+// fills z_shares and accumulates x_shares[AIMER_L] from x_shares[0..AIMER_L-1]
+#define aim2_mpc AIMER_NAMESPACE(aim2_mpc)
+void aim2_mpc(mult_chk_t *mult_chk,
+              const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+              const GF ct_GF);
+// hashes (rep, party, seed) on top of ctx_precom into a commitment and a tape
+#define commit_and_expand_tape AIMER_NAMESPACE(commit_and_expand_tape)
+void commit_and_expand_tape(tape_t *tape, uint8_t *commit,
+                            const hash_instance *ctx_precom,
+                            const uint8_t seed[AIMER_SEED_SIZE],
+                            size_t rep, size_t party);
+// writes the per-repetition offsets (delta_*) and h_1 into sign
+#define run_phase_1 AIMER_NAMESPACE(run_phase_1)
+void run_phase_1(signature_t *sign,
+                 const GF pt_GF, const GF sbox_outputs[AIMER_L],
+                 const uint8_t mu[AIMER_COMMIT_SIZE],
+                 const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE]);
+// recomputes all parties' views from root_seeds and writes h_2 into sign
+#define run_phase_1_to_3 AIMER_NAMESPACE(run_phase_1_to_3)
+void run_phase_1_to_3(signature_t *sign,
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE],
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b, const GF ct_GF);
+// derives the unopened parties from h_2 and fills the remaining proof fields
+#define run_phase_1_to_5 AIMER_NAMESPACE(run_phase_1_to_5)
+void run_phase_1_to_5(signature_t *sign,
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE],
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b);
+
+#endif // SIGN_H
diff --git a/crypto_sign/aimer192s/m4stack/tree.c b/crypto_sign/aimer192s/m4stack/tree.c
new file mode 100644
index 00000000..0585e7fa
--- /dev/null
+++ b/crypto_sign/aimer192s/m4stack/tree.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: MIT
+
+#include "tree.h"
+#include "hash.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+//  Example of tree for [N = 8]
+//  x
+//  d = 0: 1
+//  d = 1: 2         3
+//  d = 2: 4   5     6     7
+//  d = 3: 8 9 10 11 12 13 14 15
+
+void expand_tree(uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                 const uint8_t salt[AIMER_SALT_SIZE],
+                 size_t rep_index,
+                 const uint8_t root_seed[AIMER_SEED_SIZE])
+{
+  // state shared by every node expansion: prefix || salt
+  hash_instance ctx_base;
+  hash_init_prefix(&ctx_base, HASH_PREFIX_4);
+  hash_update(&ctx_base, salt, AIMER_SALT_SIZE);
+
+  // tree node k (1-based, root = 1) is stored at nodes[k - 1]
+  memcpy(nodes[0], root_seed, AIMER_SEED_SIZE);
+  uint8_t input[2 + AIMER_SEED_SIZE] = {(uint8_t)rep_index};
+  for (size_t parent = 1; parent < AIMER_N; parent++)
+  {
+    input[1] = (uint8_t)parent;
+    memcpy(&input[2], nodes[parent - 1], AIMER_SEED_SIZE);
+    // one squeeze fills both children, 2 * parent and 2 * parent + 1
+    hash_instance ctx_node;
+    hash_ctx_clone(&ctx_node, &ctx_base);
+    hash_update(&ctx_node, input, sizeof(input));
+    hash_final(&ctx_node);
+    hash_squeeze(&ctx_node, nodes[2 * parent - 1], 2 * AIMER_SEED_SIZE);
+    hash_ctx_release(&ctx_node);
+  }
+  hash_ctx_release(&ctx_base);
+}
+
+void reveal_all_but(uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                    const uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                    size_t cover_index)
+{
+  // walk from the hidden leaf up to (but excluding) the root; tree node k
+  // lives at nodes[k - 1] and its sibling is node k ^ 1
+  size_t node = AIMER_N + cover_index;
+  for (size_t level = 0; level < AIMER_LOGN; level++, node >>= 1)
+  {
+    // record the sibling seed at this level
+    const size_t sibling = node ^ 1;
+    memcpy(reveal_path[level], nodes[sibling - 1], AIMER_SEED_SIZE);
+  }
+}
+// rebuilds the seed tree from reveal_path; tree node k is stored at nodes[k - 2]
+void reconstruct_tree(uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE],
+                      const uint8_t salt[AIMER_SALT_SIZE],
+                      const uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                      size_t rep_index,
+                      size_t cover_index)
+{
+  size_t index, depth, path;
+  uint8_t buffer[AIMER_SEED_SIZE + 2];
+
+  hash_instance ctx, ctx_; // ctx_ holds the state shared by all expansions
+  hash_init_prefix(&ctx_, HASH_PREFIX_4);
+  hash_update(&ctx_, salt, AIMER_SALT_SIZE);
+
+  for (depth = 1; depth < AIMER_LOGN; depth++)
+  {
+    path = ((cover_index + AIMER_N) >> (AIMER_LOGN - depth)) ^ 1; // sibling of the cover path node
+    memcpy(nodes[path - 2], reveal_path[AIMER_LOGN - depth], AIMER_SEED_SIZE);
+    // NOTE(review): the cover path node itself is expanded too; at depth 1 it was never written
+    for (index = (1U << depth); index < (2U << depth); index++)
+    {
+      buffer[0] = (uint8_t)(rep_index);
+      buffer[1] = (uint8_t)(index);
+      memcpy(buffer + 2, nodes[index - 2], AIMER_SEED_SIZE);
+
+      hash_ctx_clone(&ctx, &ctx_);
+      hash_update(&ctx, buffer, AIMER_SEED_SIZE + 2);
+      hash_final(&ctx);
+      hash_squeeze(&ctx, nodes[2 * index - 2], AIMER_SEED_SIZE << 1); // fills both children
+      hash_ctx_release(&ctx);
+    }
+  }
+  hash_ctx_release(&ctx_);
+  // depth == AIMER_LOGN here: place the revealed sibling leaf of cover_index
+  path = ((cover_index + AIMER_N) >> (AIMER_LOGN - depth)) ^ 1;
+  memcpy(nodes[path - 2], reveal_path[AIMER_LOGN - depth], AIMER_SEED_SIZE);
+}
diff --git a/crypto_sign/aimer192s/m4stack/tree.h b/crypto_sign/aimer192s/m4stack/tree.h
new file mode 100644
index 00000000..f1354f19
--- /dev/null
+++ b/crypto_sign/aimer192s/m4stack/tree.h
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef TREE_H
+#define TREE_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+// derives all 2 * AIMER_N - 1 node seeds of one repetition from root_seed
+#define expand_tree AIMER_NAMESPACE(expand_tree)
+void expand_tree(uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                 const uint8_t salt[AIMER_SALT_SIZE],
+                 size_t rep_index,
+                 const uint8_t root_seed[AIMER_SEED_SIZE]);
+// copies the AIMER_LOGN sibling seeds on the path from leaf cover_index upwards
+#define reveal_all_but AIMER_NAMESPACE(reveal_all_but)
+void reveal_all_but(uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                    const uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                    size_t cover_index);
+// rebuilds the tree without its root from reveal_path (node k at nodes[k - 2])
+#define reconstruct_tree AIMER_NAMESPACE(reconstruct_tree)
+void reconstruct_tree(uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE],
+                      const uint8_t salt[AIMER_SALT_SIZE],
+                      const uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                      size_t rep_index,
+                      size_t cover_index);
+
+#endif // TREE_H
diff --git a/crypto_sign/aimer256f/m4speed/__asm_field.S b/crypto_sign/aimer256f/m4speed/__asm_field.S
new file mode 100644
index 00000000..6181c602
--- /dev/null
+++ b/crypto_sign/aimer256f/m4speed/__asm_field.S
@@ -0,0 +1,695 @@
+#include "params.h"
+
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+.text
+
+.align  2  // GF_to_bytes / GF_from_bytes / GF_copy: one shared body, copies 8 words (32 bytes)
+.global AIMER_NAMESPACE(GF_to_bytes)
+.global AIMER_NAMESPACE(GF_from_bytes)
+.global AIMER_NAMESPACE(GF_copy)
+.type   AIMER_NAMESPACE(GF_to_bytes), %function
+.type   AIMER_NAMESPACE(GF_from_bytes), %function
+.type   AIMER_NAMESPACE(GF_copy), %function
+AIMER_NAMESPACE(GF_to_bytes):    // all three labels mark the same address, so
+AIMER_NAMESPACE(GF_from_bytes):  // (de)serialization here is a plain word-for-word copy
+AIMER_NAMESPACE(GF_copy):
+  out_p       .req R0  // destination pointer
+  in_p        .req R1  // source pointer
+
+  .equ width, 4        // bytes per 32-bit word
+
+  ldr.w R2, [in_p, #0 * width]   // words 0..1: load both, then store both
+  ldr.w R3, [in_p, #1 * width]
+  str.w R2, [out_p, #0 * width]
+  str.w R3, [out_p, #1 * width]
+
+  ldr.w R2, [in_p, #2 * width]   // words 2..3
+  ldr.w R3, [in_p, #3 * width]
+  str.w R2, [out_p, #2 * width]
+  str.w R3, [out_p, #3 * width]
+
+  ldr.w R2, [in_p, #4 * width]   // words 4..5
+  ldr.w R3, [in_p, #5 * width]
+  str.w R2, [out_p, #4 * width]
+  str.w R3, [out_p, #5 * width]
+
+  ldr.w R2, [in_p, #6 * width]   // words 6..7
+  ldr.w R3, [in_p, #7 * width]
+  str.w R2, [out_p, #6 * width]
+  str.w R3, [out_p, #7 * width]
+
+  bx    lr                       // only R2 and R3 were used as scratch
+
+  .unreq in_p
+  .unreq out_p
+
+.align  2  // GF_set0(out_p): write zero to all 8 words (32 bytes) at out_p
+.global AIMER_NAMESPACE(GF_set0)
+.type   AIMER_NAMESPACE(GF_set0), %function
+AIMER_NAMESPACE(GF_set0):
+  out_p       .req R0  // destination pointer
+
+  .equ width, 4        // bytes per 32-bit word
+
+  mov.w R2, #0         // R2 is the only scratch register
+  str.w R2, [out_p, #0 * width]
+  str.w R2, [out_p, #1 * width]
+  str.w R2, [out_p, #2 * width]
+  str.w R2, [out_p, #3 * width]
+  str.w R2, [out_p, #4 * width]
+  str.w R2, [out_p, #5 * width]
+  str.w R2, [out_p, #6 * width]
+  str.w R2, [out_p, #7 * width]
+
+  bx    lr
+
+  .unreq out_p
+
+.align  2  // GF_add(out_p, in0_p, in1_p): out = in0 XOR in1 over 8 words (32 bytes)
+.global AIMER_NAMESPACE(GF_add)
+.type   AIMER_NAMESPACE(GF_add), %function
+AIMER_NAMESPACE(GF_add):
+  out_p       .req R0  // destination pointer
+  in0_p       .req R1  // first operand pointer
+  in1_p       .req R2  // second operand pointer
+
+  .equ width, 4        // bytes per 32-bit word
+
+  ldr.w R3,  [in0_p, #0 * width]   // word 0: both operands are loaded before the store,
+  ldr.w R12, [in1_p, #0 * width]   // so out_p may equal either input pointer
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #0 * width]
+
+  ldr.w R3,  [in0_p, #1 * width]   // word 1
+  ldr.w R12, [in1_p, #1 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #1 * width]
+
+  ldr.w R3,  [in0_p, #2 * width]   // word 2
+  ldr.w R12, [in1_p, #2 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #2 * width]
+
+  ldr.w R3,  [in0_p, #3 * width]   // word 3
+  ldr.w R12, [in1_p, #3 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #3 * width]
+
+  ldr.w R3,  [in0_p, #4 * width]   // word 4
+  ldr.w R12, [in1_p, #4 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #4 * width]
+
+  ldr.w R3,  [in0_p, #5 * width]   // word 5
+  ldr.w R12, [in1_p, #5 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #5 * width]
+
+  ldr.w R3,  [in0_p, #6 * width]   // word 6
+  ldr.w R12, [in1_p, #6 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #6 * width]
+
+  ldr.w R3,  [in0_p, #7 * width]   // word 7
+  ldr.w R12, [in1_p, #7 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #7 * width]
+
+  bx    lr                         // scratch registers used: R3 and R12 only
+
+  .unreq out_p
+  .unreq in0_p
+  .unreq in1_p
+
+.macro or_shift_and in_a, con_a, off_a
+  orr.w \in_a, \in_a, \in_a, lsl #\off_a  // in_a |= in_a << off_a
+  and.w \in_a, \in_a, \con_a              // in_a &= con_a; GF_sqr_s chains four of these to spread bits apart
+.endm
+
+.align  2  // GF_sqr_s(out_p, in_p): square a 256-bit GF(2) polynomial and fold it back to 256 bits
+.global AIMER_NAMESPACE(GF_sqr_s)
+.type   AIMER_NAMESPACE(GF_sqr_s), %function
+AIMER_NAMESPACE(GF_sqr_s):
+  out_p       .req R0  // destination pointer (8 words)
+  in_p        .req R1  // source pointer (8 words); all callers in aim2.c also pass out_p == in_p
+
+  in0         .req R2  // in0..in9: working registers; each ends up holding one
+  in1         .req R3  // 16-bit input half spread out to 32 bits
+  in2         .req R4
+  in3         .req R5
+  in4         .req R6
+  in5         .req R7
+  in6         .req R8
+  in7         .req R9
+  in8         .req R10
+  in9         .req R11
+
+  .equ C0,    1431655765  // 0x55555555
+  .equ C1,    858993459   // 0x33333333
+  .equ C2,    252645135   // 0x0F0F0F0F
+  .equ C3,    16711935    // 0x00FF00FF
+  .equ C4,    4294967295  // 0xFFFFFFFF
+  .equ width, 4
+
+  push.w    {R4-R12, lr}         // restored by the pop.w {R4-R12, pc} at the end
+
+  ldr.w in0, [in_p, #4 * width]  // a[2] (low word; a[] are the 64-bit words of the input)
+  ldr.w in2, [in_p, #5 * width]  // a[2] (high word)
+  ldr.w in4, [in_p, #6 * width]  // a[3] (low word)
+  ldr.w in6, [in_p, #7 * width]  // a[3] (high word)
+
+  lsr.w in1, in0, #16            // odd-numbered registers take the upper 16 bits of each word
+  lsr.w in3, in2, #16
+  lsr.w in5, in4, #16
+  lsr.w in7, in6, #16
+
+  mov.w R12, C4                  // R12 = 0xFFFFFFFF, so "R12, lsr #16" is the mask 0x0000FFFF
+
+  and.w in0, in0, R12, lsr #16   // even-numbered registers keep the lower 16 bits
+  and.w in2, in2, R12, lsr #16
+  and.w in4, in4, R12, lsr #16
+  and.w in6, in6, R12, lsr #16
+
+  or_shift_and in0, C3, 8        // spreading step 1 of 4: split into bytes
+  or_shift_and in1, C3, 8
+  or_shift_and in2, C3, 8
+  or_shift_and in3, C3, 8
+  or_shift_and in4, C3, 8
+  or_shift_and in5, C3, 8
+  or_shift_and in6, C3, 8
+  or_shift_and in7, C3, 8
+
+  or_shift_and in0, C2, 4        // step 2 of 4: split into nibbles
+  or_shift_and in1, C2, 4
+  or_shift_and in2, C2, 4
+  or_shift_and in3, C2, 4
+  or_shift_and in4, C2, 4
+  or_shift_and in5, C2, 4
+  or_shift_and in6, C2, 4
+  or_shift_and in7, C2, 4
+
+  or_shift_and in0, C1, 2        // step 3 of 4: split into bit pairs
+  or_shift_and in1, C1, 2
+  or_shift_and in2, C1, 2
+  or_shift_and in3, C1, 2
+  or_shift_and in4, C1, 2
+  or_shift_and in5, C1, 2
+  or_shift_and in6, C1, 2
+  or_shift_and in7, C1, 2
+
+  or_shift_and in0, C0, 1        // step 4 of 4: input bit i now sits at bit 2i, i.e. the square
+  or_shift_and in1, C0, 1        // now (in0,in1) = temp[4], (in2,in3) = temp[5],
+  or_shift_and in2, C0, 1        //     (in4,in5) = temp[6], (in6,in7) = temp[7]
+  or_shift_and in3, C0, 1        // where temp[0..7] are the 64-bit words of the unreduced square
+  or_shift_and in4, C0, 1
+  or_shift_and in5, C0, 1
+  or_shift_and in6, C0, 1
+  or_shift_and in7, C0, 1
+
+  // t = temp[4] ^ ((temp[7] >> 54) ^ (temp[7] >> 59) ^ (temp[7] >> 62));
+  eor.w in0, in0, in7, lsr #22   // the shifted-out bits fit in the low word, so only in0 changes
+  eor.w in0, in0, in7, lsr #27
+  eor.w in0, in0, in7, lsr #30
+
+  push.w {in2, in3}              // temp[5] is parked on the stack to free two registers
+
+  ldr.w in2, [in_p, #2 * width]  // a[1] (low word)
+  ldr.w in8, [in_p, #3 * width]  // a[1] (high word)
+
+  lsr.w in3, in2, #16
+  lsr.w in9, in8, #16
+
+  and.w in2, in2, R12, lsr #16
+  and.w in8, in8, R12, lsr #16
+
+  or_shift_and in2, C3, 8        // same four spreading steps as above
+  or_shift_and in3, C3, 8
+  or_shift_and in8, C3, 8
+  or_shift_and in9, C3, 8
+
+  or_shift_and in2, C2, 4
+  or_shift_and in3, C2, 4
+  or_shift_and in8, C2, 4
+  or_shift_and in9, C2, 4
+
+  or_shift_and in2, C1, 2
+  or_shift_and in3, C1, 2
+  or_shift_and in8, C1, 2
+  or_shift_and in9, C1, 2
+
+  or_shift_and in2, C0, 1        // now (in2,in3) = temp[2] and (in8,in9) = temp[3]
+  or_shift_and in3, C0, 1
+  or_shift_and in8, C0, 1
+  or_shift_and in9, C0, 1
+
+  // c[3] = temp[3] ^ temp[7];
+  eor.w in8, in8, in6
+  eor.w in9, in9, in7
+
+  // c[3] ^= (temp[7] << 10) | (temp[6] >> 54);
+  // c[3] ^= (temp[7] <<  5) | (temp[6] >> 59);  and likewise (temp[7] << 2) | (temp[6] >> 62)
+  eor.w in8, in8, in5, lsr #22
+  eor.w in8, in8, in5, lsr #27
+  eor.w in8, in8, in5, lsr #30
+
+  eor.w in8, in8, in6, lsl #10
+  eor.w in8, in8, in6, lsl #5
+  eor.w in8, in8, in6, lsl #2
+
+  eor.w in9, in9, in6, lsr #22
+  eor.w in9, in9, in6, lsr #27
+  eor.w in9, in9, in6, lsr #30
+
+  eor.w in9, in9, in7, lsl #10
+  eor.w in9, in9, in7, lsl #5
+  eor.w in9, in9, in7, lsl #2
+
+  str.w in8, [out_p, #6 * width]  // input words 6..7 were consumed at the very top
+  str.w in9, [out_p, #7 * width]
+
+  // c[2] = temp[2] ^ temp[6];
+  eor.w in2, in2, in4
+  eor.w in3, in3, in5
+
+  // c[2] ^= (temp[6] << 10) | (temp[5] >> 54);
+  // c[2] ^= (temp[6] <<  5) | (temp[5] >> 59);
+  // c[2] ^= (temp[6] <<  2) | (temp[5] >> 62);
+
+  pop.w {in6, in7}               // temp[5] comes back into in6/in7 (temp[7] is no longer needed)
+
+  eor.w in2, in2, in7, lsr #22
+  eor.w in2, in2, in7, lsr #27
+  eor.w in2, in2, in7, lsr #30
+
+  eor.w in2, in2, in4, lsl #10
+  eor.w in2, in2, in4, lsl #5
+  eor.w in2, in2, in4, lsl #2
+
+  eor.w in3, in3, in4, lsr #22
+  eor.w in3, in3, in4, lsr #27
+  eor.w in3, in3, in4, lsr #30
+
+  eor.w in3, in3, in5, lsl #10
+  eor.w in3, in3, in5, lsl #5
+  eor.w in3, in3, in5, lsl #2
+
+  str.w in2, [out_p, #4 * width]  // input words 4..5 were consumed at the very top
+  str.w in3, [out_p, #5 * width]
+
+  ldr.w in2, [in_p, #0 * width]  // a[0] (low word); still intact, only words 4..7 were stored so far
+  ldr.w in4, [in_p, #1 * width]  // a[0] (high word)
+
+  lsr.w in3, in2, #16
+  lsr.w in5, in4, #16
+
+  and.w in2, in2, R12, lsr #16
+  and.w in4, in4, R12, lsr #16
+
+  or_shift_and in2, C3, 8        // same four spreading steps as above
+  or_shift_and in3, C3, 8
+  or_shift_and in4, C3, 8
+  or_shift_and in5, C3, 8
+
+  or_shift_and in2, C2, 4
+  or_shift_and in3, C2, 4
+  or_shift_and in4, C2, 4
+  or_shift_and in5, C2, 4
+
+  or_shift_and in2, C1, 2
+  or_shift_and in3, C1, 2
+  or_shift_and in4, C1, 2
+  or_shift_and in5, C1, 2
+
+  or_shift_and in2, C0, 1        // now (in2,in3) = temp[0] and (in4,in5) = temp[1]
+  or_shift_and in3, C0, 1
+  or_shift_and in4, C0, 1
+  or_shift_and in5, C0, 1
+
+  // c[1] = temp[1] ^ temp[5];
+  eor.w in4, in4, in6
+  eor.w in5, in5, in7
+
+  // c[1] ^= (temp[5] << 10) | (t >> 54);
+  // c[1] ^= (temp[5] <<  5) | (t >> 59);
+  // c[1] ^= (temp[5] <<  2) | (t >> 62);
+  eor.w in4, in4, in1, lsr #22   // t lives in (in0,in1)
+  eor.w in4, in4, in1, lsr #27
+  eor.w in4, in4, in1, lsr #30
+
+  eor.w in4, in4, in6, lsl #10
+  eor.w in4, in4, in6, lsl #5
+  eor.w in4, in4, in6, lsl #2
+
+  eor.w in5, in5, in6, lsr #22
+  eor.w in5, in5, in6, lsr #27
+  eor.w in5, in5, in6, lsr #30
+
+  eor.w in5, in5, in7, lsl #10
+  eor.w in5, in5, in7, lsl #5
+  eor.w in5, in5, in7, lsl #2
+
+  str.w in4, [out_p, #2 * width]
+  str.w in5, [out_p, #3 * width]
+
+  // c[0] = temp[0] ^ t;
+  eor.w in2, in2, in0
+  eor.w in3, in3, in1
+
+  // c[0] ^= (t << 10);
+  // c[0] ^= (t << 5);
+  // c[0] ^= (t << 2);
+  eor.w in2, in2, in0, lsl #10
+  eor.w in2, in2, in0, lsl #5
+  eor.w in2, in2, in0, lsl #2
+
+  eor.w in3, in3, in0, lsr #22
+  eor.w in3, in3, in0, lsr #27
+  eor.w in3, in3, in0, lsr #30
+
+  eor.w in3, in3, in1, lsl #10
+  eor.w in3, in3, in1, lsl #5
+  eor.w in3, in3, in1, lsl #2
+
+  str.w in2, [out_p, #0 * width]
+  str.w in3, [out_p, #1 * width]
+
+  pop.w {R4-R12, pc}             // restore registers and return
+
+  // release the pointer aliases (in0..in9 stay defined)
+  .unreq in_p
+  .unreq out_p
+
+.macro lut_access0_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  and \sp1, \mask, \b0_1, lsr #\offset  // byte offset into the table, from the selected bits of b0_1
+  and \sp0, \mask, \b0_0, lsr #\offset  // same bit positions of b0_0
+  add \sp1, \sp1, sp                    // the table starts at the current stack pointer
+  add \sp0, \sp0, sp
+
+  ldr \out1_0, [\sp1, #0]               // entry chosen by b0_1 goes one word higher in the accumulator
+  ldr \in0_2, [\sp1, #4]
+
+  ldr \in0_0, [\sp0, #0]                // first lookup: the accumulator is loaded, not xored into
+  ldr \in0_1, [\sp0, #4]
+  eor \in0_1, \in0_1, \out1_0           // out0_0, out0_1, out1_1 and in0_3 are not touched by this variant
+.endm
+
+.macro lut_access0_1_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  and \sp1, \mask, \b0_1, lsr #\offset  // table offsets, as in lut_access0_0
+  and \sp0, \mask, \b0_0, lsr #\offset
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+
+  lsr \in0_3, \in0_2, #28               // in0_3 is created here from the bits shifted out of in0_2
+  ldr \out1_0, [\sp1, #0]
+  ldr \out1_1, [\sp1, #4]
+
+  lsl \in0_2, \in0_2, #4                // accumulator <<= 4 across the words, interleaved with the loads
+  ldr \out0_0, [\sp0, #0]
+  ldr \out0_1, [\sp0, #4]
+
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  lsl \in0_1, \in0_1, #4
+  orr \in0_1, \in0_1, \in0_0, lsr #28
+  lsl \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0           // b0_1 entry is xored into words 1..2
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0           // b0_0 entry is xored into words 0..1
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro lut_access0_1 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  and \sp1, \mask, \b0_1, lsr #\offset  // table offsets, as in lut_access0_0
+  and \sp0, \mask, \b0_0, lsr #\offset
+  lsl \in0_3, \in0_3, #4                // general step: in0_3 already holds data and is shifted too
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+
+  orr \in0_3, \in0_3, \in0_2, lsr #28
+  ldr \out1_0, [\sp1, #0]
+  ldr \out1_1, [\sp1, #4]
+
+  lsl \in0_2, \in0_2, #4                // accumulator <<= 4 across all four words
+  ldr \out0_0, [\sp0, #0]
+  ldr \out0_1, [\sp0, #4]
+
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  lsl \in0_1, \in0_1, #4
+  orr \in0_1, \in0_1, \in0_0, lsr #28
+  lsl \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0           // b0_1 entry is xored into words 1..2
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0           // b0_0 entry is xored into words 0..1
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro lut_access1 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  lsl \in0_3, \in0_3, #4
+  and \sp1, \mask, \b0_1, lsl #\offset  // unlike the other variants the index is shifted LEFT,
+  orr \in0_3, \in0_3, \in0_2, lsr #28
+  and \sp0, \mask, \b0_0, lsl #\offset  // which reaches the lowest bits of b0_0 / b0_1
+
+  lsl \in0_2, \in0_2, #4                // accumulator <<= 4, as in lut_access0_1
+  add \sp1, \sp1, sp
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  add \sp0, \sp0, sp
+
+  lsl   \in0_1, \in0_1, #4
+  ldmia \sp1, {\out1_0-\out1_1}         // both entry words are fetched with one ldmia here
+  orr   \in0_1, \in0_1, \in0_0, lsr #28
+  ldmia \sp0, {\out0_0-\out0_1}
+  lsl   \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0           // b0_1 entry is xored into words 1..2
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0           // b0_0 entry is xored into words 0..1
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, offset
+  and \mask, \one, \r0_ret, lsr #\offset  // isolate bit "offset" of r0_ret (0 or 1)
+  sub \mask, \zero, \mask                 // 0 - bit: all ones when the bit is set, else zero (branch-free)
+  and \mask0_1, \b0_0, \mask              // keep b only when the bit is set
+  and \mask0_2, \b0_1, \mask
+
+  lsl \mask0_0, \mask0_1, #\offset        // (mask0_0, mask0_1, mask0_2) = masked b << offset, 96 bits wide
+  lsr \mask0_1, \mask0_1, #32 - \offset
+  orr \mask0_1, \mask0_1, \mask0_2, lsl #\offset
+  lsr \mask0_2, \mask0_2, #32 - \offset
+
+  eor \in0_1, \in0_1, \mask0_0            // xored in starting at word 1, i.e. a further 32-bit shift
+  eor \in0_2, \in0_2, \mask0_1
+  eor \in0_3, \in0_3, \mask0_2
+.endm
+
+.macro last_mask1 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, offset
+  sub \mask, \zero, \r0_ret, lsr #\offset  // no "and" with one: the single use (offset 31) leaves only one bit
+  and \mask0_1, \b0_0, \mask               // keep b only when the bit is set
+  and \mask0_2, \b0_1, \mask
+
+  lsl \mask0_0, \mask0_1, #\offset         // (mask0_0, mask0_1, mask0_2) = masked b << offset, 96 bits wide
+  lsr \mask0_1, \mask0_1, #32 - \offset
+  orr \mask0_1, \mask0_1, \mask0_2, lsl #\offset
+  lsr \mask0_2, \mask0_2, #32 - \offset
+
+  eor \in0_1, \in0_1, \mask0_0             // xored in starting at word 1, i.e. a further 32-bit shift
+  eor \in0_2, \in0_2, \mask0_1
+  eor \in0_3, \in0_3, \mask0_2
+.endm
+
+.global AIMER_NAMESPACE(poly64_mul)  // 64 x 64 -> 128-bit carry-less (GF(2)[x]) product via a 16-entry table
+.type   AIMER_NAMESPACE(poly64_mul), %function
+AIMER_NAMESPACE(poly64_mul):  // R0 -> operand a, R1 -> operand b, R2 -> high 64 bits out, R3 -> low 64 bits out
+  t0_0    .req R0   // t0..t5: six 64-bit table entries under construction (_0 = low word, _1 = high word)
+  t0_1    .req R1
+  t1_0    .req R2
+  t1_1    .req R3
+  t2_0    .req R4
+  t2_1    .req R5
+  t3_0    .req R6
+  t3_1    .req R7
+  t4_0    .req R8
+  t4_1    .req R9
+  t5_0    .req R10
+  t5_1    .req R11
+
+  r1_copy .req R12  // keeps the b pointer while R1 is reused as t0_1
+  t_base  .req R14 // write cursor into the table
+
+  sp0     .req R12  // table entry address selected by b0_0
+  sp1     .req R14  // table entry address selected by b0_1
+
+  b0_0    .req R0   // operand b, low word
+  b0_1    .req R1   // operand b, high word
+
+  in0_0   .req R2   // in0_0..in0_3: 128-bit accumulator, lowest word first
+  in0_1   .req R3
+  in0_2   .req R4
+  in0_3   .req R5
+
+  out0_0  .req R6   // table entry selected by b0_0
+  out0_1  .req R7
+  out1_0  .req R8   // table entry selected by b0_1
+  out1_1  .req R9
+
+  mask    .req R10  // 0x78 during the lookups, reused as a select mask afterwards
+
+  zero    .req R6
+  one     .req R7
+
+  r0_ret  .req R8   // original high word of a, reloaded for the top-bit correction
+
+  mask0_0 .req R9
+  mask0_1 .req R11
+  mask0_2 .req R12
+
+  push  {R4-R11, lr}
+  push  {R2-R3}     // output pointers, reloaded from [sp, #132] and [sp, #136] below
+
+  ldr   t1_0, [R0, #0]  // t1 = a
+  ldr   t1_1, [R0, #4]
+  push  {t1_1}      // unmasked high word of a, reloaded from [sp, #128] below
+
+  sub   sp, #128  // allocating space in the stack
+                  // 8 bytes * 16 = 128 bytes
+  mov   t_base, sp
+  mov   r1_copy, R1
+
+  mov   t0_0, #0        // t0 = 0 * a
+  mov   t0_1, #0
+
+  and   t1_1, #0x1FFFFFFF  // drop the top 3 bits of a so that shifts by up to 3 stay within 64 bits
+
+  lsl   t2_1, t1_1, #1  // t2 = a << 1 = 2a (multiples are carry-less throughout)
+  orr   t2_1, t2_1, t1_0, lsr #31
+  lsl   t2_0, t1_0, #1
+
+  eor   t3_0, t1_0, t2_0  // t3 = a ^ 2a = 3a
+  eor   t3_1, t1_1, t2_1
+
+  lsl   t4_1, t2_1, #1  // t4 = 2a << 1 = 4a
+  orr   t4_1, t4_1, t2_0, lsr #31
+  lsl   t4_0, t2_0, #1
+
+  eor   t5_0, t1_0, t4_0  // t5 = a ^ 4a = 5a
+  eor   t5_1, t1_1, t4_1
+
+  stmia t_base!, {t0_0-t5_1}  // 4 bytes X 12 elements  = 48 bytes (entries 0..5)
+
+  eor   t0_0, t2_0, t4_0  // t0 = 2a ^ 4a = 6a
+  eor   t0_1, t2_1, t4_1
+
+  lsl   t2_1, t4_1, #1  // t2 = 4a << 1 = 8a
+  orr   t2_1, t2_1, t4_0, lsr #31
+  lsl   t2_0, t4_0, #1
+
+  eor   t5_0, t2_0, t3_0  // t5 = 8a ^ 3a = 11a
+  eor   t5_1, t2_1, t3_1
+
+  eor   t3_0, t2_0, t1_0  // t3 = 8a ^ a = 9a
+  eor   t3_1, t2_1, t1_1
+
+  eor   t4_0, t4_0, t0_0  // t4 = 4a ^ 6a ...
+  eor   t4_1, t4_1, t0_1
+
+  eor   t4_0, t4_0, t2_0  // ... ^ 8a = 10a
+  eor   t4_1, t4_1, t2_1
+
+  eor   t1_0, t1_0, t0_0  // t1 = a ^ 6a = 7a
+  eor   t1_1, t1_1, t0_1
+
+  stmia t_base!, {t0_0-t5_1}  // 4 bytes X 12 elements  = 48 bytes (entries 6..11)
+
+  eor   t1_0, t5_0, t0_0  // t1 = 11a ^ 6a = 13a
+  eor   t1_1, t5_1, t0_1
+
+  eor   t2_0, t2_0, t0_0  // t2 = 8a ^ 6a = 14a
+  eor   t2_1, t2_1, t0_1
+
+  eor   t3_0, t3_0, t0_0  // t3 = 9a ^ 6a = 15a
+  eor   t3_1, t3_1, t0_1
+
+  eor   t0_0, t4_0, t0_0  // t0 = 10a ^ 6a = 12a
+  eor   t0_1, t4_1, t0_1
+
+  stmia t_base, {t0_0-t3_1}  // 4 bytes X 8 elements  = 32 bytes (entries 12..15)
+
+  ldmia r1_copy, {b0_0-b0_1}  // b = *R1 (original second argument)
+  mov   mask, #0x00000078     // 4-bit index already scaled by the 8-byte entry size
+
+  lut_access0_0   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 25
+  lut_access0_1_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 21
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 17
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 13
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 9
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 5
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 1
+  lut_access1     sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 3
+
+  mov   zero, #0
+  mov   one,  #1
+  ldr   r0_ret, [sp, #128]  // unmasked high word of a, pushed before the table was allocated
+
+  last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 29
+  last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 30
+  last_mask1 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 31
+
+  ldr   R0, [sp, #132]  // original R2
+  ldr   R1, [sp, #136]  // original R3
+  add   sp, #140  // restoring stack (128 table + 4 + 8)
+
+  str   in0_0, [R1, #0]  // low 64 bits go to the pointer passed in R3
+  str   in0_1, [R1, #4]
+
+  str   in0_2, [R0, #0]  // high 64 bits go to the pointer passed in R2
+  str   in0_3, [R0, #4]
+
+  pop   {R4-R11, pc}
+
+  // unlink register name
+  .unreq t0_0
+  .unreq t0_1
+  .unreq t1_0
+  .unreq t1_1
+  .unreq t2_0
+  .unreq t2_1
+  .unreq t3_0
+  .unreq t3_1
+  .unreq t4_0
+  .unreq t4_1
+  .unreq t5_0
+  .unreq t5_1
+
+  .unreq r1_copy
+  .unreq t_base
+
+  .unreq sp0
+  .unreq sp1
+
+  .unreq b0_0
+  .unreq b0_1
+
+  .unreq in0_0
+  .unreq in0_1
+  .unreq in0_2
+  .unreq in0_3
+
+  .unreq out0_0
+  .unreq out0_1
+  .unreq out1_0
+  .unreq out1_1
+
+  .unreq mask
+
+  .unreq zero
+  .unreq one
+
+  .unreq r0_ret
+
+  .unreq mask0_0
+  .unreq mask0_1
+  .unreq mask0_2
diff --git a/crypto_sign/aimer256f/m4speed/aim2.c b/crypto_sign/aimer256f/m4speed/aim2.c
new file mode 100644
index 00000000..74e41922
--- /dev/null
+++ b/crypto_sign/aimer256f/m4speed/aim2.c
@@ -0,0 +1,598 @@
+// SPDX-License-Identifier: MIT
+
+#include "aim2.h"
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+// Inverse Mersenne S-box with e1 = 11: out = in ^ d ("^" is exponentiation in every comment here)
+// d = (2 ^ 11 - 1) ^ (-1) mod (2 ^ 256 - 1)
+//   = 0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5
+// chunks used: b6d6dadb5 b6 b6d6dadb5 b6 b6d6dadb5 b6 b6d6dadb5 b6 b6d6dadb5 b6 b6d6dadb5
+void GF_exp_invmer_e_1(GF out, const GF in)
+{
+  size_t i;
+  GF t1 = {0,};  // running power; comments show its exponent with the newest hex chunk split off
+  GF table_5 = {0,}, table_6 = {0,};
+  GF table_a = {0,}, table_b = {0,}, table_d = {0,};
+
+  // table_d = in ^ 2 (for now), t1 = in ^ 4
+  GF_sqr_s(table_d, in);
+  GF_sqr_s(t1, table_d);
+
+  // table_5 = in ^ 5
+  GF_mul_s(table_5, t1, in);
+  // table_6 = in ^ 6
+  GF_mul_s(table_6, table_5, in);
+  // table_a = in ^ 10 = (in ^ 5) ^ 2
+  GF_sqr_s(table_a, table_5);
+  // table_b = in ^ 11
+  GF_mul_s(table_b, table_5, table_6);
+  // table_d = in ^ 13
+  GF_mul_s(table_d, table_b, table_d);
+
+  // table_b = in ^ (0xb6), table_5 = in ^ (0xb5)
+  GF_sqr_s(t1, table_b);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(table_5, t1, table_5);  // from here on table_5 and table_b hold two-digit powers
+  GF_mul_s(table_b, t1, table_6);
+
+  // t1 = in ^ (0xb6 d)
+  GF_sqr_s(t1, table_b);  // four squarings shift the exponent left by one hex digit
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6d 6)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_6);
+
+  // t1 = in ^ (0xb6d6 d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6d6d a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xb6d6da d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // table_5 = in ^ (0xb6d6dad b5)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(table_5, t1, table_5);  // table_5 now holds the repeating 36-bit chunk 0xb6d6dadb5
+
+  // t1 = in ^ (0xb6d6dadb5 b6)
+  GF_sqr_s(t1, table_5);
+  for (i = 1; i < 8; i++)  // starts at 1: the first of the 8 squarings is the line above
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xb6d6dadb5b6 b6d6dadb5)
+  for (i = 0; i < 36; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xb6d6dadb5b6b6d6dadb5 b6)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xb6d6dadb5b6b6d6dadb5b6 b6d6dadb5)
+  for (i = 0; i < 36; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5 b6)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5b6 b6d6dadb5)
+  for (i = 0; i < 36; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5 b6)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5b6 b6d6dadb5)
+  for (i = 0; i < 36; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5 b6)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // out = in ^ (0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5b6 b6d6dadb5)
+  for (i = 0; i < 36; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(out, t1, table_5);  // the only write to out, so callers may pass out == in (aim2 does)
+}
+
+// Inverse Mersenne S-box with e2 = 141: out = in ^ d ("^" is exponentiation in every comment here)
+// d = (2 ^ 141 - 1) ^ (-1) mod (2 ^ 256 - 1)
+//   = 0x2224448889112222444888911222244488911122244448891112224444889111
+// chunks used: 222444 8889112 222444 8889112 222444 889111 222444 4889111 222444 4889111
+void GF_exp_invmer_e_2(GF out, const GF in)
+{
+  size_t i;
+  GF t1 = {0,}, t2 = {0,}, t3 = {0,}, t4 = {0,}, t5 = {0,};  // t1 is the running power, t2..t5 are reusable chunks
+  GF table_9 = {0,};
+
+  // t2 = in ^ (0x11), table_9 = in ^ 9
+  GF_sqr_s(t1, in);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(table_9, t1, in);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t2, t1, in);
+
+  // t3 = in ^ (0x111)
+  GF_sqr_s(t1, t2);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t3, t1, in);
+
+  // t4 = in ^ (0x222444)
+  GF_sqr_s(t1, t3);
+  for (i = 0; i < 10; i++)  // 11 squarings in total with the line above
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t3);  // t1 = in ^ (0x88911)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t4, t1);      // 0x88911 * 4 = 0x222444
+
+  // t1 = in ^ (0x222444 8889)
+  GF_sqr_s(t1, t4);
+  for (i = 1; i < 9; i++)  // starts at 1: 9 squarings in total with the line above
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t3);
+
+  for (i = 0; i < 7; i++)  // 9 + 7 = 16 squarings: (0x111 << 7) + 9 = 0x8889
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_9);
+
+  // t1 = in ^ (0x2224448889 11)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t5 = in ^ (0x222444888911 2)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, in);
+  GF_sqr_s(t5, t1);  // (x << 3 | 1) << 1 appends the hex digit 2
+
+  // t1 = in ^ (0x2224448889112 2224448889112)
+  GF_sqr_s(t1, t5);
+  for (i = 1; i < 52; i++)  // starts at 1: 52 squarings in total (13 hex digits)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t5);
+
+  // t1 = in ^ (0x22244488891122224448889112 222444)
+  for (i = 0; i < 24; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t4);
+
+  // t1 = in ^ (0x22244488891122224448889112222444 889)
+  for (i = 0; i < 5; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  for (i = 0; i < 7; i++)  // 5 + 7 = 12 squarings: (0x11 << 7) + 9 = 0x889
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_9);
+
+  // t1 = in ^ (0x22244488891122224448889112222444889 111)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t3);
+
+  // t1 = in ^ (0x22244488891122224448889112222444889111 222444)
+  for (i = 0; i < 24; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t4);
+
+  // t1 = in ^ (0x22244488891122224448889112222444889111222444 4)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, in);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);  // (x << 2 | 1) << 2 appends the hex digit 4
+
+  // t1 = in ^ (0x222444888911222244488891122224448891112224444 889)
+  for (i = 0; i < 5; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  for (i = 0; i < 7; i++)  // second half of the 889 chunk, as above
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_9);
+
+  // t1 = in ^ (0x222444888911222244488891122224448891112224444889 111)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t3);
+
+  // t1 = in ^ (0x222444888911222244488891122224448891112224444889111 222444)
+  for (i = 0; i < 24; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t4);
+
+  // t1 = in ^ (0x222444888911222244488891122224448891112224444889111222444 4)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, in);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+
+  // t1 = in ^ (0x2224448889112222444888911222244488911122244448891112224444 889)
+  for (i = 0; i < 5; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  for (i = 0; i < 7; i++)  // second half of the 889 chunk, as above
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_9);
+
+  // out = in ^ (0x2224448889112222444888911222244488911122244448891112224444889 111)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(out, t1, t3);  // the only write to out; in is still read mid-chain, so this ordering matters when out == in
+}
+
+// Inverse Mersenne S-box with e3 = 7: out = in ^ d ("^" is exponentiation in every comment here)
+// d = (2 ^ 7 - 1) ^ (-1) mod (2 ^ 256 - 1)
+//   = 0xddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76ed
+// chunks used: ddbb76e ddbb76e ddbb76e ddbb76e ddbb76e ddbb76e ddbb76e ddbb76e ddbb76e d
+void GF_exp_invmer_e_3(GF out, const GF in)
+{
+  size_t i;
+  GF t1 = {0,};  // running power; comments show its exponent with the newest hex chunk split off
+  GF table_6 = {0,}, table_7 = {0,}, table_b = {0,}, table_d = {0,};
+
+  // table_d = in ^ 2 (for now), t1 = in ^ 3
+  GF_sqr_s(table_d, in);
+  GF_mul_s(t1, table_d, in);
+
+  // table_6 = in ^ 6
+  GF_sqr_s(table_6, t1);
+  // table_7 = in ^ 7
+  GF_mul_s(table_7, table_6, in);
+  // table_b = in ^ 11 (via in ^ 4 = (in ^ 2) ^ 2, then times in ^ 7)
+  GF_sqr_s(table_b, table_d);
+  GF_mul_s(table_b, table_7, table_b);
+  // table_d = in ^ 13
+  GF_mul_s(table_d, table_b, table_d);
+
+  // t1 = in ^ 0xdd
+  GF_sqr_s(t1, table_d);  // four squarings shift the exponent left by one hex digit
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ 0xdd b
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ 0xddb b
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ 0xddbb 7
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ 0xddbb7 6
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_6);
+
+  // table_7 = in ^ 0xddbb76 e
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_7);
+  GF_sqr_s(table_7, t1);  // (x << 3 | 7) << 1 appends the hex digit e; table_7 is now the 28-bit chunk
+
+  // t1 = in ^ 0xddbb76e ddbb76e
+  GF_sqr_s(t1, table_7);
+  for (i = 1; i < 28; i++)  // starts at 1: 28 squarings in total with the line above
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ 0xddbb76eddbb76e ddbb76e
+  for (i = 0; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ 0xddbb76eddbb76eddbb76e ddbb76e
+  for (i = 0; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ 0xddbb76eddbb76eddbb76eddbb76e ddbb76e
+  for (i = 0; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ 0xddbb76eddbb76eddbb76eddbb76eddbb76e ddbb76e
+  for (i = 0; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ 0xddbb76eddbb76eddbb76eddbb76eddbb76eddbb76e ddbb76e
+  for (i = 0; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ 0xddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76e ddbb76e
+  for (i = 0; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ 0xddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76e ddbb76e
+  for (i = 0; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // out = in ^ 0xddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76e d
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(out, t1, table_d);  // the only write to out, so callers may pass out == in (aim2 does)
+}
+
+// Mersenne exponentiation with e_star = 3: out = in ^ (2 ^ 3 - 1) = in ^ 7 ("^" is exponentiation)
+void GF_exp_mer_e_star(GF out, const GF in)
+{
+  GF t1 = {0,};  // local accumulator; out is written once, at the end
+
+  // t1 = in ^ (2 ^ 2 - 1)
+  GF_sqr_s(t1, in);
+  GF_mul_s(t1, t1, in);
+
+  // out = in ^ (2 ^ 3 - 1)
+  GF_sqr_s(t1, t1);
+  GF_mul_s(out, t1, in);  // in is read up to this call; aim2 passes out == in
+}
+
+void generate_matrices_L_and_U(  // fills L, U and b from one hash stream absorbed from iv
+        GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],  // out: one row per field bit, per S-box input
+        GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],  // out: same shape, built from the same random rows
+        GF vector_b,                                            // out: squeezed last, after all matrix rows
+        const uint8_t iv[AIM2_IV_SIZE])                         // in: the only input absorbed by the hash
+{
+  uint8_t buf[AIM2_NUM_BYTES_FIELD];
+  uint64_t ormask, lmask, umask;
+  hash_instance ctx;
+  GF temp = {0,};
+
+  // initialize hash
+  hash_init(&ctx);
+  hash_update(&ctx, iv, AIM2_IV_SIZE);
+  hash_final(&ctx);
+
+  for (size_t num = 0; num < AIM2_NUM_INPUT_SBOX; num++)
+  {
+    for (size_t row = 0; row < AIM2_NUM_BITS_FIELD; row++)
+    {
+      hash_squeeze(&ctx, buf, AIM2_NUM_BYTES_FIELD);  // one fresh field element per row
+      GF_from_bytes(temp, buf);
+
+      ormask = ((uint64_t)1) << (row % 64);   // the diagonal bit, forced to 1 in both L and U
+      lmask = ((uint64_t)-1) << (row % 64);   // bits at or above the diagonal within the word
+      umask = ~lmask;                         // bits strictly below the diagonal within the word
+
+      size_t inter = row / 64;                // index of the 64-bit word that holds the diagonal bit
+      size_t col_word;
+      for (col_word = 0; col_word < inter; col_word++)
+      {
+        // L is zero, U is full
+        matrix_L[num][row][col_word] = 0;
+        matrix_U[num][row][col_word] = temp[col_word];
+      }
+      matrix_L[num][row][inter] = (temp[inter] & lmask) | ormask;
+      matrix_U[num][row][inter] = (temp[inter] & umask) | ormask;
+      for (col_word = inter + 1; col_word < AIM2_NUM_WORDS_FIELD; col_word++)
+      {
+        // L is full, U is zero
+        matrix_L[num][row][col_word] = temp[col_word];
+        matrix_U[num][row][col_word] = 0;
+      }
+    }
+  }
+
+  hash_squeeze(&ctx, (uint8_t *)vector_b, AIM2_NUM_BYTES_FIELD);  // written as raw bytes, without GF_from_bytes
+  hash_ctx_release(&ctx);
+}
+
+void generate_matrix_LU(GF matrix_A[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],  // out: one combined matrix per S-box input
+                        GF vector_b,                                            // out: filled by generate_matrices_L_and_U
+                        const uint8_t iv[AIM2_IV_SIZE])                         // in: forwarded unchanged
+{
+  GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];  // both factors live on the stack for the whole call
+  GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+
+  generate_matrices_L_and_U(matrix_L, matrix_U, vector_b, iv);
+
+  for (size_t num = 0; num < AIM2_NUM_INPUT_SBOX; num++)
+  {
+    for (size_t i = 0; i < AIM2_NUM_BITS_FIELD; i++)
+    {
+      GF_transposed_matmul(matrix_A[num][i], matrix_U[num][i],  // row i of A from row i of U and all of L;
+                           (const GF *)matrix_L[num]);          // exact product convention is in GF_transposed_matmul - confirm there
+    }
+  }
+}
+
+void aim2(uint8_t ct[AIM2_NUM_BYTES_FIELD],        // out: result bytes
+          const uint8_t pt[AIM2_NUM_BYTES_FIELD],  // in: input block
+          const uint8_t iv[AIM2_IV_SIZE])          // in: seeds the affine layer (matrices and vector_b)
+{
+  GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];  // both factors are kept on the stack
+  GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+
+  GF state[AIM2_NUM_INPUT_SBOX];  // indices 0..2 are used explicitly below
+  GF pt_GF = {0,}, ct_GF = {0,};
+  GF_from_bytes(pt_GF, pt);
+
+  // generate random matrix
+  generate_matrices_L_and_U(matrix_L, matrix_U, vector_b, iv);
+
+  // linear component: constant addition
+  GF_add(state[0], pt_GF, aim2_constants[0]);
+  GF_add(state[1], pt_GF, aim2_constants[1]);
+  GF_add(state[2], pt_GF, aim2_constants[2]);
+
+  // non-linear component: inverse Mersenne S-box
+  GF_exp_invmer_e_1(state[0], state[0]);
+  GF_exp_invmer_e_2(state[1], state[1]);
+  GF_exp_invmer_e_3(state[2], state[2]);
+
+  // linear component: affine layer
+  GF_transposed_matmul(state[0], state[0], (const GF *)matrix_U[0]);  // U is applied first, then L,
+  GF_transposed_matmul(state[0], state[0], (const GF *)matrix_L[0]);  // each time in place on state[k]
+
+  GF_transposed_matmul(state[1], state[1], (const GF *)matrix_U[1]);
+  GF_transposed_matmul(state[1], state[1], (const GF *)matrix_L[1]);
+
+  GF_transposed_matmul(state[2], state[2], (const GF *)matrix_U[2]);
+  GF_transposed_matmul(state[2], state[2], (const GF *)matrix_L[2]);
+
+  GF_add(state[0], state[0], state[1]);  // state[0] = sum of the three branches plus vector_b
+  GF_add(state[2], state[2], vector_b);
+  GF_add(state[0], state[0], state[2]);
+
+  // non-linear component: Mersenne S-box
+  GF_exp_mer_e_star(state[0], state[0]);
+
+  // linear component: feed-forward
+  GF_add(ct_GF, state[0], pt_GF);
+
+  GF_to_bytes(ct, ct_GF);
+}
+
+void aim2_sbox_outputs(GF sbox_outputs[AIM2_NUM_INPUT_SBOX], const GF pt)  // the first two stages of aim2, per branch
+{
+  // linear component: constant addition
+  GF_add(sbox_outputs[0], pt, aim2_constants[0]);
+  GF_add(sbox_outputs[1], pt, aim2_constants[1]);
+  GF_add(sbox_outputs[2], pt, aim2_constants[2]);
+
+  // non-linear component: inverse Mersenne S-box (each call runs in place on its own slot)
+  GF_exp_invmer_e_1(sbox_outputs[0], sbox_outputs[0]);
+  GF_exp_invmer_e_2(sbox_outputs[1], sbox_outputs[1]);
+  GF_exp_invmer_e_3(sbox_outputs[2], sbox_outputs[2]);
+}
diff --git a/crypto_sign/aimer256f/m4speed/aim2.h b/crypto_sign/aimer256f/m4speed/aim2.h
new file mode 100644
index 00000000..bdc50429
--- /dev/null
+++ b/crypto_sign/aimer256f/m4speed/aim2.h
@@ -0,0 +1,306 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef AIM2_H
+#define AIM2_H
+
+#include "field.h"
+#include "params.h"
+#include <stdint.h>
+
+static const GF aim2_constants[AIM2_NUM_INPUT_SBOX] = // one constant per input S-box, added to the plaintext element before the S-boxes
+{
+  {0x24a19947b3916cf7,0xba7c9045f12c7f99,0xb8e1afed6a267e96,0x2ffd72dbd01adfb7}, // paired with GF_exp_invmer_e_1
+  {0x0d95748f728eb658,0xa458fea3f4933d7e,0x636920d871574e69,0x0801f2e2858efc16}, // paired with GF_exp_invmer_e_2
+  {0xc5d1b023286085f0,0x9c30d5392af26013,0x7b54a41dc25a59b5,0x718bcd5882154aee}  // paired with GF_exp_invmer_e_3
+};
+
+static const GF aim2_e2_power_matrix[AIM2_NUM_BITS_FIELD] =
+{
+  {0x0000000000000001,0x0000000000000000,0x0000000000000000,0x0000000000000000},
+  {0x13269d7dcfc555c3,0x6fe13874c42fedfb,0xc69f003d9d5abb9c,0x05636fd04ebf7feb},
+  {0x7a273dd9fcec7e15,0x42cd3eb54144ea68,0x5a88aaa3ebaacdff,0x527284e39fae2053},
+  {0x56bb9ab537abf542,0x768c3d772850c862,0x0160d91d288fd0e0,0x342e111e0a022022},
+  {0xcdb998ce4b3eee2e,0x78984c4dc99c90aa,0x2bb89f84c00275b6,0x75c6a0cc065fd4ac},
+  {0x74b2cd2360cb32af,0xbde82f7cf42dd1bf,0x7ceed82d54d965c4,0xf4e9f207aa17f2e9},
+  {0x995d5aab614ac6c0,0x1563800b79242f35,0x1d940184c4509090,0xe6558fd024716b90},
+  {0x8d0b793b4375cc8a,0xfcf792217776a3ee,0x5da44008043b7450,0xc77adf87407cf838},
+  {0x00451596f23df45e,0xd8bcbc0d7ae8534f,0x02c26abe3748db45,0xb37e029dc51a4b41},
+  {0x177dbfce6cbc8c0b,0x62cdd72c8cbd2d2a,0x568802d992bd7a2c,0xd0082d2193b6e383},
+  {0x221e6872863f45c6,0xbe5a9bce6c00df76,0x98c076efe1cfcc67,0xa75bdc7ab5c142a9},
+  {0x088d4e8e27e0b74d,0x71046740fe7e6c5a,0x20123cab6052c1d6,0xa7135d055351c99b},
+  {0x46176449341c7657,0x2a7936011468475e,0xc347e166dca96014,0xd79326785eee3555},
+  {0xc6b77e5a8b6dcae9,0x6dc641a8e07c54d4,0x37055c3ed77341a8,0xd75eaedd0ec6f1d1},
+  {0x5240b9b6f3433443,0x7b7d965745400c05,0x4542be5aec50ec53,0x13e6ac8f2aac12a2},
+  {0x66c30b9da469d401,0xcd5dbf02dc359172,0xf16b3e62f8a57e1d,0x362c2bc9345b97ed},
+  {0xb2a65d5f7da755e8,0x11df10d6ddd9eb84,0x433468d75cb64470,0xb4a6ffd454c82b2f},
+  {0x1c87142145f7c112,0xde2854fa4939dc0b,0x10a503b51b7c7a19,0x174f91701431e1b3},
+  {0x60d8fb32b890cec6,0x27d95c11548f693c,0x30fce7ce95e950b3,0x210559008a309578},
+  {0x5de49c870dd8fb60,0x1f480e246bb2c961,0xdc5efcb1f4ee90ae,0x165c3f5b62136c5e},
+  {0xc17b4bbe4b5780a8,0x690f1102a6decffe,0xa26e146710d9cd7d,0xc7f278fb3f02a99d},
+  {0x4fe7916de7e17f1c,0xe9e59586ac0a7185,0x092b72935bc23437,0xa306568e985edbfa},
+  {0xc05330df507b35c8,0x944475d0eb5c89f7,0x34a3653b083969a5,0x97e431e62e205633},
+  {0x19fe581ef3e9a896,0x720ab1851376eff0,0xda5ca1af445dea40,0xe3899fd1cdc93f2f},
+  {0x7a18d867d11567d6,0x14e706af946787cb,0x2ececbd0e726236a,0x66a864e0c387e806},
+  {0x0a0a9e1dc2c9d30d,0xa1bd85358585db7a,0x78f90bb68d83e25e,0x2275165a7e496039},
+  {0x23f2e1a2057c9892,0xb7f503272b51fa8f,0x0ecf56cbb57a6021,0x77f77f889ecb3e74},
+  {0x237633913a45a827,0x3a2c98b4d38d139b,0xbc1dfd5ddab4bb19,0xf2bcbdc105b017fd},
+  {0x9a53645fca466120,0x07335188ef82289a,0x9cdd8f1434ddc4c7,0x25afc28ddf0c0ea5},
+  {0x0166bda62c3c97ac,0x4821343275a35741,0xa4a1f8ef377f5177,0x3008d4b041fc0802},
+  {0xed498663eb9138f0,0xb16289e1ea93949b,0xa2476ced73badf6e,0xb384ce50cdee1d75},
+  {0x25430e5e2ea409d8,0xf8909d2164becc11,0x77663884798e456b,0xe11b963640c6a7da},
+  {0x2a5ce7930313e789,0x01a1b717dd5e72f3,0x674b4810dda58bf3,0xb348d6cffeee2602},
+  {0xe4871c9932b98648,0x90432c7798b61577,0xf803346f3989e611,0x176c5f43490e3127},
+  {0x28b7ff52a8d039f5,0x2549d26014bcb371,0x7705b13fd068e5f0,0x22f60aec7063b440},
+  {0xa90087e5804b094e,0x17b587e9f7b1334c,0x7e9128a8fd49f502,0x10a15de60dcc1259},
+  {0x676fc8232449f7f5,0xa45eba0b86ee4f8d,0x48d0f0583763ed04,0x9430177369350009},
+  {0x8bb187487d0ca392,0x8b34c408cf71198e,0x4c5b9033c740f6cb,0x15165d415ea592e5},
+  {0xe25b8fc9315d8b10,0x6f067bcaaa5db46f,0xc0d574e6df163bcb,0x76d62e45eeb26cb3},
+  {0xc7bb4eaa81af7e21,0xc0c25e2c4da66ca9,0x20a5b7a6ef682683,0xe0c40a42bed8c878},
+  {0x340b283a1f67eb72,0x94c68ac57747d7b1,0xaab540d8883c7e78,0x53ffb196e81fbce0},
+  {0x03d1fe920cc5c8b6,0x2d058e7c02de80d0,0x349140f34518313d,0x52d8d34dce452897},
+  {0x3daf5481e615a4ec,0x1d21ddb2b19865a7,0x28572f8e3caef8c4,0x94f0069367dd5a9c},
+  {0xf97efd31544a2432,0x79cc100bcd1c95c5,0x630dd7dbdcda2efa,0xb0c94889efaeabe6},
+  {0x1855a973cd69d2ac,0xa249d1e68760fda5,0x9bd185166791f0b0,0x73aad654a16f87d5},
+  {0xb64f4c4f69887572,0x0dd0ddfafeaec759,0x9a2b2e01a2dfdf21,0x23e6842e19958e74},
+  {0x47126f2ed9d35243,0x2dd26a5dc07d8ab7,0x5f7a0864bae59fef,0x84bd4c2d7eef707e},
+  {0xc2b75aa6809fde33,0x4e05ff4138a1458a,0x4283e814ca9b30b5,0x46b1bcf0f62d4313},
+  {0x83f0c7c594f6cf9a,0xdb8a4b8e5dfe204e,0x44a803aecd550290,0x96cc8907871fc11e},
+  {0x7ca33f7d36e71a53,0x609b8f2296791418,0xd9e9118ba8ddf5e9,0x813002deae63def5},
+  {0x5e3805abc5d66c85,0xe95aac205db8a39d,0xfad61d269550a976,0xc0c3e22037926992},
+  {0xf3ba3f8e2a564d34,0xfd74426f936299c0,0x23bb54e8112b82e3,0xc5afe8e8365a6000},
+  {0xb733edd6855182ef,0x5ecb1ae3728f48e8,0x3b8b1ce5bf96e304,0xf3aba2a7bfac4c59},
+  {0x78f2ea71794eaef2,0x59f25ef7fe359b84,0xacfd3e59513654c8,0xd1e24fda7d0c3936},
+  {0x288da25da8b17fb3,0xbe107e7feb777a7a,0x166db15573baae6c,0xb5ccbf5cfe3e5135},
+  {0x4637849d0285089d,0x4f671ebc0437c2ce,0x188565bc785f8268,0x712dec2cd1ba005e},
+  {0xa25a6b6a471a00b1,0x6e1a6a380bb57611,0x3ef50b155eddd23d,0xd3788fef109d4e3b},
+  {0x4f403f37eba563c1,0x76a201773cddd009,0x58fba6bec18e06a6,0x11a19d4cbf2a6331},
+  {0xe3e6bbb73066a175,0x9748c56fec4b9fa1,0x406aae141855018c,0xa1410c0e735df446},
+  {0x5e569e71e70eb719,0xa673071887dd4687,0x07055d8d0a23d785,0x74d498384aee1190},
+  {0xa0e8a89b6fb6984d,0x908716f3ce5edf66,0x0a2b9e842b73e729,0xa1b9171e0b83204b},
+  {0xbe7532657aadaa20,0x1b66940116e06582,0x7385fd540009963d,0x847a9b51570e7ff8},
+  {0xe9395fd61662cbe6,0xb3a286d4b91d1353,0x455b0689d3ff2d83,0xd56078fc7681e787},
+  {0x8b470957a3441b8a,0x7df431ebbf7e447b,0x0e0f4fa397edd83c,0xd793865c1388620e},
+  {0x7b29927808bfa739,0x96e65ce20d51654b,0xaa8fcec0d3c045c3,0xe5f31c0e239b4fea},
+  {0x5525c2a74e77bf9e,0x88cf3be85881afff,0x7c81312941d70c3c,0x23d8a44e23a9c737},
+  {0xb869097f96d421f1,0xfc5054b0f253daf5,0x1c241e84b424d6aa,0x32b29f522eb351e7},
+  {0x6a466e2ed7c0ad0b,0x5590c446ea6f583b,0x56d2464d3ee4d099,0x068910c7eb32dd95},
+  {0x71139d1bc66bb641,0xb3a1027da065feea,0xe04294fcf6174557,0x81dae384498adb46},
+  {0xf43ed00c527a209a,0xa5754026d1f22c89,0xc78a8d365f196923,0xf5154817fc84f220},
+  {0xae764c7fe7341054,0xffc86134dc4d880f,0x1b6a1e1530d66862,0x250c95737b7b8284},
+  {0xbfee6b3c1e46c128,0xa78dc08ba0e7251d,0x3a95f11bcef9d4c7,0x34f2831709c6a420},
+  {0xe3a3c1aa9e2407d8,0x4c1a200af1077851,0x8965a32110544d77,0x6354a05036f3f5a7},
+  {0xbd108a58fc17d8a6,0x61b0351824a54794,0x499e7fd9fdd626df,0x850217a6be595511},
+  {0x53f2510fb68b5c61,0x5b122cfd2501b4ba,0x7fc88679758e8262,0x233472936a675422},
+  {0x11965eaffc401c95,0x0af31e003ba1fb12,0x2facfdd6611b7f8b,0xd67eaae060c88abf},
+  {0x6fa46680edff5f3f,0x454b6266e25e87cb,0x9addf096cb1df0af,0xa6de67c1da83476b},
+  {0xbf6f0cb8a600033a,0xf520f28cc3846c4b,0x008f972a2108bd6a,0x55bbe0da272b6cb0},
+  {0x9bf38905d29c13e7,0xc50cd62db6acc3da,0xbb9b791e0d47ac11,0xd54b025508c245d8},
+  {0x3a2547ab532ec9ff,0x79495ddf670c8bc8,0xdf4ed2dcee44e1bc,0xc2e52f1fc1f7d4d5},
+  {0x4800ee52ee97ecda,0xc9d9b772550e380c,0x98506ba8ea5ec019,0x21ffafa8b46c668f},
+  {0x3464a9138085b307,0xf67a192be113e9cc,0xfdd61b66e0e162dc,0xd612aba17d397d2c},
+  {0x16207c45e571aabf,0xf2583066040bf4f7,0x4bc24730dc4d62f5,0x608b3d1e61a60b2f},
+  {0xc2a6d2c707faaab1,0xc9cfa575f99f891a,0x61ea461507f40f96,0x67104299d7331a82},
+  {0xaf1c8fcbed1f1699,0x985767a5dbb95b90,0xd6ae3b3279c96a14,0x275ea501029834e7},
+  {0x4e19e32114de1e9c,0x165f71d116e0afb9,0xe968cbf378c1a2f7,0x912182eb2d02ef2d},
+  {0x6e4e3c81caceef19,0x85f15b2e37fe2cbe,0x8ae88fcc89bb8687,0xe50b4d7659484c7a},
+  {0x80353d06c9930d5c,0x723d1f993acaffad,0x89e273ac935dc5e2,0x51356090a9eecbf9},
+  {0xc3bd743bf118e69e,0x78fe213d42306293,0x90638ea842ff3668,0xb0addcda3683625d},
+  {0xe26008c6b83cc264,0x74bbbd5777680be8,0xa8892126f9cc485a,0x54899977a5cc34a1},
+  {0xd19b2baf7fa0c771,0x39d199b5dfd41569,0x7c3c66294bc7b31d,0x81bb86cd53109ac5},
+  {0xe4a790156b11f26a,0xb496c49018830c99,0xf19e574456b9d549,0x867aa70b9bbd4fd0},
+  {0xb8ce927c2afbcba9,0x3ae3f9d11d478318,0xebdecea6a113ffd6,0x071def720f45ca33},
+  {0xa18c4347c3dba5da,0xc231d50db69b59f6,0x784caea3c01900f9,0x21b179202d1177e0},
+  {0x48d839b0e148b37a,0x119910fe9c00220e,0xf6959f7654a471b7,0x138df428ee1ab05e},
+  {0x2378b25ea2d743c2,0x52a0660820b6ff4b,0xb20d6835419796a6,0x77d41062fb9a7654},
+  {0x1e63666141c834dd,0x534d884045bcdedf,0x07b52ebe10206e92,0x67cb1a5c5d2017bc},
+  {0xbd489efa4249447b,0x81b1f830bdd020d0,0xb8db0042e390a71b,0x90b877cf8d8200e6},
+  {0xd91a2f7fe76f986d,0x2c6fcd64257849b8,0xcec2c4be6ecbe77b,0x5031f045518f6b98},
+  {0x3cc9f99a10cba6b9,0x7df264605ea09f19,0xc6099006fa2f35a0,0xf31aa1999c65f2ed},
+  {0x7322250ccd66f2d2,0xa8cf62816a34838c,0xf7bd30878c6d359b,0x450a14aed0d49014},
+  {0xf753996b7d7c1d54,0x45e2b366fb683eae,0xcef4cef44af75b4a,0xd1e647d51db49a04},
+  {0x257099ec419b94a6,0xd4a8a9f3335fcd10,0xa286788285415010,0x023c9feb9c1e9901},
+  {0x229d6fd7eed1531c,0x04cefb6c19ff0062,0x9130be016eed6e29,0xa1a04435eb4cd39d},
+  {0xefbd279ed0b045c6,0xe8ec58f13b1a927d,0xbabddf060b172c30,0xa5fd98adc4c9d7f8},
+  {0x0f859d44ce18448a,0x07af518284a5a680,0xff7565589bc19136,0x72e50c2e9eaa580a},
+  {0x6470f3d6724b5dd7,0x8b0ebb24be876d22,0xfb604e14fd34a2cc,0x213fc1d31fbb7996},
+  {0x50e1d4f6f24a3685,0x69348d20cb64f7b6,0xa13da095f7678267,0xb63a6ac7a66c3284},
+  {0xb0edaccd9a8698dc,0x73d7ca79b1672272,0xaed4ffd76475e235,0xf36b5b0cbbb22a1b},
+  {0x24acd40ab0b10aba,0xafb39e3ea0656a92,0xcfe743611a51fa5a,0xb4f8251f0f0e0d41},
+  {0xe8036bb95086dfdf,0x3d5d0332c379fb16,0x3029edc150437ed5,0xf561ce7ace559b0f},
+  {0x01047fd87eb154ca,0xce04d75cd86f0d9d,0x33f6d9a762e84d0b,0x52f77f2619632746},
+  {0x3fdf7a3e2584aaa7,0xafdff63009b07776,0x24496f671e85ade5,0x35b2e80c0abfdee5},
+  {0x4bb3e9185acc78b3,0xe5634557a7f532a4,0xa6a979853e645782,0x97e9a6c3f5ed6068},
+  {0x41685f9547d8c651,0x6d4bade8828daeda,0xabe0dcd781a5b523,0x3528952d2a770f19},
+  {0xe4e43b26b587ea84,0xf0f3f420178def6d,0xd48cb1f978a8bb2e,0x25de266fb8567a86},
+  {0x2906276141285c5c,0x045688d8cac52240,0xa1a62b2fa2474687,0x917244641b004f87},
+  {0x73897ebb86a40eb0,0x0df1bc6722ab333e,0xb7815fffa0c79792,0x322111adf2c83d06},
+  {0x4dd181aa27fc54fa,0x47b557267a691a35,0x089b8ed1303c2515,0xe60b63596c40b943},
+  {0xe574bb3f5e1d3fe5,0x7e5e1dd1aaea6c56,0x443b9d58176d285b,0xa2c066cf80f1c62d},
+  {0x9df2b1fe93b4cb69,0x5dc5dcbd7bcd4304,0x4cac45f5c51659e4,0x9039bc7472f02b80},
+  {0x81c7d14b2ff6f3d6,0x76b7422e6f000e01,0x23e23fa520ed280b,0x50a4f9ded0d07978},
+  {0x154548397391fa38,0xb1ec123aeb772341,0x22f40fd3abeee812,0x0342edcc39a77162},
+  {0xa7ef812f5e9d9ba6,0x65de86bcc8071b0d,0x4b9bbe60fe0a1fad,0xf4f8322efc5e2f45},
+  {0x21fbeab48a7c1136,0x42736db042991d3e,0xf78c442fd2ed07a6,0x36228053a90abb56},
+  {0x6ebfcec360d88021,0x7deeafd7cae1b159,0x6f32c272246a4999,0xb2f984f6c2b488dd},
+  {0x76beda6b3d15abc7,0x1bc04ff70ef9d0a9,0x75ec5c46c4854ec0,0x77bd25a817826a51},
+  {0xf79c8d9bd7aaf4f0,0xae5add9fd1454f93,0xd9f264167923d698,0x273bf89c8b33a9ec},
+  {0x20ba5517532e42a7,0xd9991aaa0bcb040f,0x81ec69b31aea8c89,0x823ee1a07f410f90},
+  {0x3e10957041e49998,0x9746fdf5f3deb53d,0xbae6be6d5a7923dd,0xd4aa255a7e60b5f8},
+  {0x453e76f50e50f914,0xba084020e530dd32,0x90e9982f02a0b2e3,0xc1bf6d0c93565fd4},
+  {0xbb44043183434a96,0xb6839987e4d3fbf5,0x780e11ff154ba921,0x46deb765191c6fad},
+  {0xf254860f62ddca11,0x2b40c2147fcc1618,0x9b9df4f2213a87f5,0xf5d9f1982bf72085},
+  {0x9ec887ef1dac7ea8,0xf9b9f41cd1a90cab,0x5106c66727088891,0xa079314a8a7aa0cc},
+  {0xbaca971f705d6820,0x32cf35c216d31b74,0xcda1b48f6a782676,0x42dd0c61745b57af},
+  {0x774f50e70700fa3c,0xfe706a77d17875e0,0x50acd4b9e4f085bc,0xa70b2f3a3373b5cf},
+  {0xa3d467e6532333ed,0x9143409c675fea0a,0x186d4c8b7de757db,0x006e698e91bc1742},
+  {0x042690d62241c815,0xf8a04fddc8420797,0x9ff8cf1394eaada4,0x921b7749e0687334},
+  {0x3ba03d72cd709236,0x12f95d885e21e3d3,0xba18560bb5d4d50a,0xa3607627494476ab},
+  {0xade31f9ca5377f89,0x635510178eec1003,0x3dca939c351bf98d,0x339c87aee1cf78dd},
+  {0xe45a6287cc1287d4,0x7cf6c8c56ed07634,0xadf6eda911dd0200,0x87211a5d3722f0f6},
+  {0x7b07d341c0de902d,0x69838993df5c9429,0xb8921642be862244,0x555819247b006cca},
+  {0x4b8ebd3e261a1065,0xec8c767eb1653ceb,0x482e17c892519544,0xb61af0cc04b533a5},
+  {0x4fb9d38c4e2f7113,0x50030b8523699320,0x5716f5c60cedd7d8,0x0673e662c18aadef},
+  {0x641233031f77a5fb,0x1932c76990a0d465,0xb79ab4fbf32c92e8,0xc0a7370dd0467550},
+  {0xdb899bf50910763a,0xf026477f262eb097,0x76b70a1b2163a0d4,0x93a2873f23165f6e},
+  {0xba2a66c196ce2eb8,0x19383fd3ffab287b,0xaed33c3223646076,0x1274559077e98698},
+  {0x035a94843c44ec7a,0x6de99478a3c009e3,0x8a7ecba43ae87e6e,0x458c9cbfca30c71a},
+  {0xe3695ac8419682c5,0xaeee4d4d0392ec66,0xd99792a67250c187,0x91e0f202f4c924b3},
+  {0x9c784cfbf5192c27,0xc113eee0e80c2eae,0x3f7b5a6101ce5f5e,0x842e2d646ecd9d6e},
+  {0x957028a6befc0d73,0xcbb8df5afe2e23c3,0xc00f5c490d8dafb4,0x67d7ee99cddb8452},
+  {0xac8c3e869f704d2d,0xc928ad50bd4faf6e,0x114a0001a078d1a0,0x8375ad6cc681586b},
+  {0x59a53e3fb149cca9,0x69cf3f7ab419768e,0x79d945a746a788b8,0x979b7e9387ae017f},
+  {0x41f7712568f43935,0xf17647a51bb6cff9,0x593eb0f68e21db19,0x77bf0442e77fdbdd},
+  {0xd430085cbe62c90e,0x445d0af933a0c884,0x92f5c9b29a5de145,0x6778e9aad04a6c94},
+  {0x4914b4bd446c5d64,0x21b19c795fec736d,0x72cf9cdf7fa1c0db,0xf67226412058b23c},
+  {0x1e7346a99e1464a3,0xacb82da3ac217e94,0x4d1f4486473e6c18,0x23274da141c63725},
+  {0xf58a0445c9b4903b,0x4f196615648056a4,0xeaf0d8fc78e51fe3,0xc71e969830bec69e},
+  {0xcec3175fd17dee42,0x6fa60eda34cf3b0f,0x016ff6fe365a227b,0x148ed225daf52abf},
+  {0x5eb5954a6c060dcd,0x67ed2e3411fbde9d,0xdaddfd054f15c5a4,0x80e12ae0d1591ef3},
+  {0xc9c76eda44553b71,0x7c4675538cbdcd1e,0xa2128f16928c1efd,0xc13aaef8cfacc959},
+  {0x525318d3ea7544bd,0x6f3e0f4d85ce7b2a,0x397102e6892ab449,0xd028319bc9ef0676},
+  {0xc55bc06690da6f96,0xea6a73d17ce2969a,0xfa21bd37fa658e1e,0x32d421c8c9a9d437},
+  {0x4f53f0e462a9f4f0,0x18c65d2ba362d43b,0x53b8871400599e70,0xf291e9ac535cfe6e},
+  {0x2a420a66918ee17a,0x4dae04d613a5a05f,0xc12c868048f09ef7,0x900c4ca4fb306ac2},
+  {0x357f0638ee05acbc,0x389db47cc78620f7,0x3c531ff5b9fff02b,0x902c96f5fb2c18f9},
+  {0x57abb6151ae9319f,0x917bd98253c43360,0x36b4e4e17d9c5182,0xc2a4751705897c3b},
+  {0x91ee0ad214084c1b,0xfe17b657a9ea9054,0x7b304880e7a3efb6,0xd497c8cea46cf443},
+  {0xb97e1c63dcc46441,0x22898ec1ecb0f186,0x40dd2915e34e92ae,0x83e63e8886604034},
+  {0xf159f13af4545efc,0x6b0312cbfce549f7,0x1632f9e6624b3c5e,0xc387a21c7c20a6d6},
+  {0xe81b4468c49ba628,0x9962cd4b58abb1e7,0xda2145ce9fe59f2e,0x6021807944cfc8e1},
+  {0x9e98852b17310f23,0x3cbe1c8bceb45120,0x0e165b29c57ec0eb,0x305bf854fb1aea8d},
+  {0x1c3dbdac479d54f7,0x4cda9c1c1bbb1a19,0x7d330c571f17bc88,0x826548b30e26b7d7},
+  {0x446afa2ca1809535,0x8d3c9693ee673350,0x7893a83f58de1ffc,0xb19954f7647195ff},
+  {0x21b77a7b577e945a,0x0c3e91d3f1f89e09,0xdd7b8e8a59fae93c,0x6435f276c4582559},
+  {0x4d0e6426007bc199,0x5c13184bcf7dd24a,0x26f1f87322e213d0,0x97243e676a3eb387},
+  {0x14cbfff5b787dedd,0x355794e80f8cd847,0xf2c951e3c0d77a3d,0xe558cf2f7b5f2991},
+  {0xf87b23ea7452e43c,0x92521695b010b548,0xb7af363918a98cf1,0x473e6304c6f3f9cf},
+  {0xe86f5e030902695a,0x884c59759075978e,0x862a4f44f20c857d,0x2348092c2d62a7ed},
+  {0xbebdf27580f800b3,0x4c82348a99cfaf36,0x7fec6e2fb343c70a,0xd0a2b036a8d95707},
+  {0x59ef03fcb5a57f39,0xfb04bb079290dd73,0x30e0751c7c8e4263,0x4078bcf952cf1a62},
+  {0xa19ffba37095d58a,0x9d164dabb30dff6a,0x16de88d2bac7642a,0x8232b5dca704cbcd},
+  {0x329dfc2b2636492c,0xf0397ad762a31307,0x78adfe730ebe751e,0x5783b8d9d2f05dfd},
+  {0xf2d6e8a736f23aa1,0xe2102f9bd2267093,0xdf2af690beecc500,0x11398c83a817f593},
+  {0xd46565aaafab2385,0xddff3f9a0b99928d,0x5eb2072a49c5a5ab,0x53a03f6a8eb6a094},
+  {0x57fb689ec7092868,0xc2040eb173de1a44,0x810031fb7b19e630,0x53960a9b3b1ef568},
+  {0x40007920454fbf71,0xac025a589e98d1ef,0x9e256036a7fbd143,0xc13cf073bd649440},
+  {0xd06cd6829f0fea3d,0x2a51b1d71d1ac07b,0x3546a5854571bbc6,0xc30b6bf46c0b42fe},
+  {0x62488646a13da231,0xe28973393fb6f682,0x9ed13dc9f5432f8f,0x31b84f2be241c94e},
+  {0x9bb19ea5428d66ae,0xe0080b8616f3babc,0x9610055711788ae5,0x7652d184a46c90fe},
+  {0x112c63f926d9850e,0xe5905a268850e663,0xb9fd3996e6d72608,0xa7aa33543146d58a},
+  {0x77de728df392f575,0x637633946129f8e6,0x72a867e08e3bfce6,0x754f7149e15a365b},
+  {0x3511c4139b98679f,0x56a8a361da8cbe81,0x2a34d15423a9eb45,0x82ae1da57cd32e57},
+  {0xcb3ecb886171f719,0xfcbf82d884e8e020,0xc6d2502bd1e6f6cf,0x80bb7b1db5c2a777},
+  {0x81b8745892f03d2d,0xbc5f38b14116148b,0x4b6d0194055b86d8,0x241dbd17e3eb4ba9},
+  {0x5bbc585152fcd142,0x930f31c230a2050e,0xabf51e10a3e969e5,0x72a0a1c90ead638e},
+  {0xcadb18ff93f7f93e,0x1b8e009b5719bf82,0x743c0ab2c8bc284a,0x7144a02ff1130223},
+  {0x41b95e62522de019,0xcd3465a01c9b93fb,0x236600ff15e70ef3,0x3658cd0c29ea6f20},
+  {0xb9c59bf0b27dc282,0x47955c29304112de,0x3f16c72af19bcb3f,0xa0e568c9c5397d69},
+  {0x9251cf7a209add18,0x8e3a95a336fe4170,0xf28c14a751527126,0xb3d3a9a208590971},
+  {0x5b129f35a37c28ff,0xe3f8ba25b41817b9,0x200b734d2501265f,0x52344985724cceca},
+  {0xa8e27fd1e60dffab,0xa8ea4523b64f5aa4,0xa475b8437f8165a1,0xd644c1691c3c7548},
+  {0x5ddae2f669e64957,0x1fcef31f0b9af756,0x3e6da61c7980074e,0x206f828242ab6764},
+  {0x33144ea9f76bb631,0x9f36e03e21fa3065,0xfe08e97dc86bceb2,0x640b723c98cd7479},
+  {0x1636152634146114,0xc18c0793a80805cd,0x2b106edd3834043c,0x4191bf5c7fbacdf8},
+  {0x429dddfd03ef7bc4,0x4db9b9d6da197cd3,0xe74baaea7f22abc0,0x4364ff1e20f72e64},
+  {0xca8a9a678e94da68,0x6535f14dbed15563,0x98f34f0a20bd3f3c,0xd12c84164701e27f},
+  {0xc02c8d4c379b7ce5,0x7069499c81e1f16e,0x9bf97727b1a05c04,0xf27fa10bb0a78610},
+  {0x4cf536f0cf11a349,0xbd9dfa2a6eb41391,0x565f1d6e23bbbc0d,0xc76bbb697c18cf7f},
+  {0xa17601bde8ac478c,0x8db87c51403e365d,0x4088a87a96d9c622,0x31f82a7918dd0d06},
+  {0x29ee14687120f04e,0xfea2e736c3636d5c,0x7f8c89823855588a,0xf0da86215a008e8c},
+  {0xdd645ec1d816c223,0x0aa7edbc5ba5d0cf,0xfced1c8e126396e5,0x201b07bc6f65eddb},
+  {0xfb25e20cd48f4855,0xa8b3d1435e85371a,0x3ee9acb3f939329e,0xd075efbe502f25a4},
+  {0x0541c9b35049c704,0x94986dc9cd668f39,0x17f4cfb2726cd68b,0x508c14a670636ed4},
+  {0xa2b783ac55d68039,0xc130ab2d841d773e,0xd6d29b14f588465d,0xb790ad979cce43f8},
+  {0x4f8ce0df03c43b98,0xbbda15818c06d7a2,0x380dd95f0f042fdb,0x05f429bccfb597f3},
+  {0xc742e63ad5c5f5e6,0xcbcb225fbbbe33e2,0xa8edf59089d52ced,0xa0e788a338b45f4d},
+  {0x20e95da4bdb0c82f,0x3e63b532cc85e2d9,0x163e3d2b90d4ddaf,0xc71593e07530219a},
+  {0x7992357ab8d37b59,0x4aea96f315f3c064,0x1ba04f945b33146b,0xf65bed5593247ff4},
+  {0x2d4ad59bdce5563d,0x3a24253d449dc88d,0x41c7ffbd062c28f4,0x42734ae219aa9361},
+  {0x644204f2ea9b71e3,0xe551983ade3b5122,0x1bc727382db55ea1,0xe276d03e4bd6fc9a},
+  {0xfb20c1e51a924e81,0x2f795f1d4507decd,0x154de4d0aca02046,0x72ddcd99451381dd},
+  {0xc09ac8020e255c2b,0xa4eff29a2c29d3f3,0x7977c4f4c2f24381,0x349ff7a6efa4d791},
+  {0xea5d2cd9592cb4e6,0xf63dcd3ff0c8104a,0x66d7254c1252ca0d,0x822791068962c667},
+  {0x7b9c477dde2ad4a4,0xd3460c638eb797b0,0x1889eaef7acad771,0xb23db19bd8554e11},
+  {0x6f1c469240cd647f,0x31825907e279b274,0xb97cebbf2c37c29e,0x74ce50e87690b22e},
+  {0xfb92d64637ac0508,0x97999c37b92d0720,0xa23a9e76c1578849,0x66aa9c79979e14fb},
+  {0xcf78e912e65a8877,0xb7dcb878bdeec090,0xe678ac56695a99fc,0x0338870b34c11cee},
+  {0x5529c228e771c374,0xd8ab910e6e0a23f9,0xcd86f7b11bf07839,0xe3358c0867358f64},
+  {0x7c0e69e5db7dc1c3,0x355a9bbca9523a64,0x86985b53d32a3f4b,0xc715ea89b184099b},
+  {0xac499c49b8a4cdc4,0x22485e1df13ea826,0xf91367c2ad8807da,0x863b3b9193879ebd},
+  {0x8086427544d93f9b,0xf378d24905271a4e,0x8a2211f2e881884b,0x27f11aae6fbdeb19},
+  {0xd4d702e312991728,0xd57d86c18df5deb9,0x68b550520aac07f8,0x6163e0c25242d715},
+  {0x0484539b5bd55737,0x69b34b6b4664d575,0xcafeeeea78048b31,0x25a0aca017ec768d},
+  {0x955f03fc32b86250,0xb3ed04233eabbafd,0xb4da5d10fb30568d,0xa1d5c520656d8e7f},
+  {0x18e6772ac0c7b0b5,0x1e8c41bfa134bd72,0x36b1b28f157526b2,0xf5ac9222151e43e1},
+  {0xb500af50c3647566,0x181d28f85aca7575,0x9a16455dfd6341a1,0xc6d058b2c1e37c22},
+  {0x01b46ff0be3c6ef8,0x7f5abf4a7e4a72fe,0xe18780f7372db81b,0x91d1172dbd7c1d3b},
+  {0x62e68a7598567ebd,0x4654b8ed6f377911,0x051bf02a5685ca63,0xd08b010696df1fa9},
+  {0x656ce860674f0d36,0x8bcbf7bc1ef730bf,0x00a0260df392d280,0x33930145fd64eecf},
+  {0x17743293297fc288,0x5b59ca56522bb36a,0xe58ef14098fd4053,0x7444ed68eb16e657},
+  {0x31beae245608121f,0xea349f5c00e7cc25,0xf076aacf6db8c528,0x13c58f0b1e99ac1a},
+  {0x910f9e30c8455d7b,0xc1ebc494beb98220,0x201a3557ec66e851,0x610dd21bbd2f6b9f},
+  {0x317d8fa79aa99e03,0x7b670f771c4590dd,0x77052e1a54ac4638,0x17309eb8c690df96},
+  {0xddae9fdfd80030d4,0x84daf3404eae25e8,0xe93997a2e172c485,0x51f2159ecb7b5e41},
+  {0x9f02a3e12da8bc2c,0x1c746f4b943dc8e5,0xb31951aeeaac4e5e,0x0128a606643b4341},
+  {0xebd158803af98ce2,0x08e82db8ead7c10b,0xba172e80caa61667,0xf61ff900e1918b8a},
+  {0x8c3c570f9ffae2bc,0xff0827921f27e4f5,0x6256d4a0913919b5,0xc1f4fcc60f17957e},
+  {0x648ade6556f9d114,0xf2e85e1746058ffe,0xc9605989ede623cf,0xf3d09098541725a9},
+  {0xc57b49460d911255,0xc0767005f4affb44,0x486c21436602612a,0x87617ddb2a9643c0},
+  {0xc2038cd71c6d3ead,0x8fe1e58a5096a181,0x51cde6590d0f6b27,0xf59bf938475aa39a},
+  {0x9d8138454badbf16,0xaf8306904b15d8a8,0x83bd9fd79c159b39,0xb85db82acdbbf3ae},
+  {0x560807274e8b13db,0xb33b8a036f1617ca,0x72bc05868c923532,0xb7b8ee25c3388851},
+  {0xc042df127c4f6747,0x704ed715ba3ca7d4,0x678f93c55bc0c5d2,0xd2ee482f0bfe6c9a},
+  {0xbd60c5ba33d87b10,0x6c2ff096c60536d6,0x0ce4b4b8c86a8f5b,0x86a0bcebf81d6e4d},
+  {0xf9384ef3a44799c2,0x8b78ec1c676a7fcd,0x5f7c3edb312b00da,0x2390763c1712af67},
+};
+
+#define GF_exp_invmer_e_1 AIMER_NAMESPACE(GF_exp_invmer_e_1)
+void GF_exp_invmer_e_1(GF out, const GF in); // S-box applied to the first input in aim2(); called there with out == in
+#define GF_exp_invmer_e_2 AIMER_NAMESPACE(GF_exp_invmer_e_2)
+void GF_exp_invmer_e_2(GF out, const GF in); // S-box applied to the second input in aim2(); called there with out == in
+#define GF_exp_invmer_e_3 AIMER_NAMESPACE(GF_exp_invmer_e_3)
+void GF_exp_invmer_e_3(GF out, const GF in); // S-box applied to the third input in aim2(); called there with out == in
+#define GF_exp_mer_e_star AIMER_NAMESPACE(GF_exp_mer_e_star)
+void GF_exp_mer_e_star(GF out, const GF in); // final S-box in aim2(), applied to the combined state; called there with out == in
+// Fills matrix_L, matrix_U and vector_b from iv. aim2() multiplies each S-box output by U, then by L, and adds vector_b.
+#define generate_matrices_L_and_U AIMER_NAMESPACE(generate_matrices_L_and_U)
+void generate_matrices_L_and_U(
+        GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF vector_b,
+        const uint8_t iv[AIM2_IV_SIZE]);
+// NOTE(review): presumably fills one combined matrix per S-box plus vector_b from iv; definition not visible here -- confirm in aim2.c.
+#define generate_matrix_LU AIMER_NAMESPACE(generate_matrix_LU)
+void generate_matrix_LU(GF matrix_A[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+                        GF vector_b,
+                        const uint8_t iv[AIM2_IV_SIZE]);
+// Writes into sbox_outputs[i] the result of S-box i applied to (pt + aim2_constants[i]); pt is a field element, not bytes.
+#define aim2_sbox_outputs AIMER_NAMESPACE(aim2_sbox_outputs)
+void aim2_sbox_outputs(GF sbox_outputs[AIM2_NUM_INPUT_SBOX], const GF pt);
+// Byte-level entry point: computes ct from pt under iv (constant addition, S-boxes, affine layer, final S-box, feed-forward of pt).
+#define aim2 AIMER_NAMESPACE(aim2)
+void aim2(uint8_t ct[AIM2_NUM_BYTES_FIELD],
+          const uint8_t pt[AIM2_NUM_BYTES_FIELD],
+          const uint8_t iv[AIM2_IV_SIZE]);
+
+#endif // AIM2_H
diff --git a/crypto_sign/aimer256f/m4speed/api.h b/crypto_sign/aimer256f/m4speed/api.h
new file mode 100644
index 00000000..2bc176d6
--- /dev/null
+++ b/crypto_sign/aimer256f/m4speed/api.h
@@ -0,0 +1,44 @@
+/*
+NIST-developed software is provided by NIST as a public service. You may use, copy, and distribute copies of the software in any medium, provided that you keep intact this entire notice. You may improve, modify, and create derivative works of the software or any portion of the software, and you may copy and distribute such modifications or works. Modified works should carry a notice stating that you changed the software and should note the date and nature of any such change. Please explicitly acknowledge the National Institute of Standards and Technology as the source of the software.
+ 
+NIST-developed software is expressly provided "AS IS." NIST MAKES NO WARRANTY OF ANY KIND, EXPRESS, IMPLIED, IN FACT, OR ARISING BY OPERATION OF LAW, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND DATA ACCURACY. NIST NEITHER REPRESENTS NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST DOES NOT WARRANT OR MAKE ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE OR THE RESULTS THEREOF, INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY, RELIABILITY, OR USEFULNESS OF THE SOFTWARE.
+ 
+You are solely responsible for determining the appropriateness of using and distributing the software and you assume all risks associated with its use, including but not limited to the risks and costs of program errors, compliance with applicable laws, damage to or loss of data, programs or equipment, and the unavailability or interruption of operation. This software is not intended to be used in any situation where a failure could cause risk of injury or damage to property. The software developed by NIST employees is not subject to copyright protection within the United States.
+*/
+
+#ifndef API_H
+#define API_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define CRYPTO_PUBLICKEYBYTES 64 // public-key size in bytes (per the macro name)
+#define CRYPTO_SECRETKEYBYTES 96 // secret-key size in bytes (per the macro name)
+#define CRYPTO_BYTES 25120 // NOTE(review): presumably the (maximum) signature size in bytes -- confirm against sign.c
+#define CRYPTO_ALGNAME "aimer256f" // parameter-set name
+// Key generation into caller-provided pk and sk buffers. NOTE(review): return convention (presumably 0 on success) is not visible here.
+#define crypto_sign_keypair AIMER_NAMESPACE(crypto_sign_keypair)
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+// Detached signing: presumably writes the signature of m[0..mlen) to sig and its length to *siglen -- confirm in sign.c.
+#define crypto_sign_signature AIMER_NAMESPACE(crypto_sign_signature)
+int crypto_sign_signature(uint8_t *sig, size_t *siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk);
+// Combined signing: presumably writes a signed message to sm and its length to *smlen; the byte layout is not visible here.
+#define crypto_sign AIMER_NAMESPACE(crypto_sign)
+int crypto_sign(uint8_t *sm, size_t *smlen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk);
+// Detached verification of sig over m under pk; all inputs are read-only. NOTE(review): confirm which return value means "valid".
+#define crypto_sign_verify AIMER_NAMESPACE(crypto_sign_verify)
+int crypto_sign_verify(const uint8_t *sig, size_t siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *pk);
+// Combined verification: presumably checks sm under pk and, on success, returns the message in m and its length in *mlen.
+#define crypto_sign_open AIMER_NAMESPACE(crypto_sign_open)
+int crypto_sign_open(uint8_t *m, size_t *mlen,
+        const uint8_t *sm, size_t smlen,
+        const uint8_t *pk);
+
+#endif
diff --git a/crypto_sign/aimer256f/m4speed/field.c b/crypto_sign/aimer256f/m4speed/field.c
new file mode 100644
index 00000000..5c27f63a
--- /dev/null
+++ b/crypto_sign/aimer256f/m4speed/field.c
@@ -0,0 +1,608 @@
+// SPDX-License-Identifier: MIT
+
+#include "field.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define mask_64     0x00000000ffffffff // low 32 bits of a 64-bit word
+#define mask_final  0x000000000000ffff // low 16 bits; last mask of inv_zero_padding
+// mask0 keeps the low byte of a 32-bit word. Each maskN_64 below repeats maskN in both 32-bit halves of a 64-bit word.
+#define mask0_64    0x000000ff000000ff
+#define mask0       0x000000ff
+// mask1 keeps the low nibble of every 16-bit lane.
+#define mask1_64    0x000f000f000f000f
+#define mask1       0x000f000f
+// mask2 keeps the low two bits of every byte.
+#define mask2_64    0x0303030303030303
+#define mask2       0x03030303
+// mask3 keeps the lowest bit of every nibble.
+#define mask3_64    0x1111111111111111
+#define mask3       0x11111111
+// zero_padding: shift-or-mask by 12, 6, 3. Given an 8-bit x0 and the masks mask1, mask2, mask3, bit i of x0 ends up at bit 4*i. NOTE(review): expands to three bare statements (no do/while(0)); the parameter names hide the mask macros.
+#define zero_padding(x0, mask1, mask2, mask3) \
+        x0 = (x0 | (x0 << 12)) & mask1; \
+        x0 = (x0 | (x0 << 6 )) & mask2; \
+        x0 = (x0 | (x0 << 3 )) & mask3;
+// inv_zero_padding: the reverse gather (>>3, >>6, >>12, >>24). Given mask0_64, mask1_64, mask2_64, mask_final it packs bits 4*i of a 64-bit x0 into its low 16 bits. NOTE(review): same macro caveats as zero_padding.
+#define inv_zero_padding(x0, mask0, mask1, mask2, mask_final) \
+        x0 = (x0 | (x0 >> 3 )) & mask2; \
+        x0 = (x0 | (x0 >> 6 )) & mask1; \
+        x0 = (x0 | (x0 >> 12)) & mask0; \
+        x0 = (x0 | (x0 >> 24)) & mask_final;
+
+void GF_mul(GF c, const GF a, const GF b) // c = a * b on 4-word elements, reduced with x^256 = x^10 + x^5 + x^2 + 1
+{
+  uint64_t t[4] = {0,}; // scratch words receiving poly64_mul results
+  uint64_t temp[8] = {0,}; // unreduced 8-word product, least-significant word first
+  uint64_t sub[10] = {0,}; // XOR sums of operand words for the Karatsuba cross products
+  // Word sums: inside each half (sub[0..3]), across the halves (sub[4..7]) and over all four words (sub[8], sub[9]).
+  sub[0] = a[0] ^ a[1];
+  sub[1] = b[0] ^ b[1];
+  sub[2] = a[2] ^ a[3];
+  sub[3] = b[2] ^ b[3];
+  sub[4] = a[0] ^ a[2];
+  sub[5] = a[1] ^ a[3];
+  sub[6] = b[0] ^ b[2];
+  sub[7] = b[1] ^ b[3];
+  sub[8] = sub[4] ^ sub[5];
+  sub[9] = sub[6] ^ sub[7];
+  // Matching-word products a[i]*b[i]. poly64_mul is defined elsewhere; from its use here it appears to store the high word via argument 3 and the low word via argument 4 -- TODO confirm.
+  poly64_mul(&a[0], &b[0], &t[0], &temp[0]);
+  poly64_mul(&a[1], &b[1], &t[2], &t[1]);
+  t[0] ^= t[1];
+  // a[2]*b[2]; t[1] then combines its low word with the high word of a[1]*b[1].
+  poly64_mul(&a[2], &b[2], &t[3], &t[1]);
+  t[1] ^= t[2];
+  // a[3]*b[3]; its high word is stored straight into the top word temp[7].
+  poly64_mul(&a[3], &b[3], &temp[7], &t[2]);
+  t[2] ^= t[3];
+  // Seed words 1, 2, 3 and 6 with sums of adjacent words of those four products.
+  temp[6] = temp[7] ^ t[2];
+  temp[3] = t[2] ^ t[1];
+  temp[2] = t[1] ^ t[0];
+  temp[1] = t[0] ^ temp[0];
+  // Cross product (a[0]^a[1]) * (b[0]^b[1]) goes into words 1 and 2.
+  poly64_mul(&sub[0], &sub[1], &t[1], &t[0]);
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+  // Cross product (a[2]^a[3]) * (b[2]^b[3]) goes into words 3 and 6.
+  poly64_mul(&sub[2], &sub[3], &t[1], &t[0]);
+  temp[3] ^= t[0];
+  temp[6] ^= t[1];
+  // Outer Karatsuba level: mix the low-half and high-half sums into the middle words 2..5.
+  temp[5] = temp[7] ^ temp[3];
+  temp[4] = temp[6] ^ temp[2];
+  temp[3] ^= temp[1];
+  temp[2] ^= temp[0];
+  // Products of the across-half sums, merged into the four words t[0..3].
+  poly64_mul(&sub[4], &sub[6], &t[1], &t[0]);
+  poly64_mul(&sub[5], &sub[7], &t[3], &t[2]);
+  t[1] ^= t[2];
+  t[2] = t[1] ^ t[3];
+  t[1] ^= t[0];
+  // Add those four words into words 2..5.
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+  temp[4] ^= t[2];
+  temp[5] ^= t[3];
+  // Product of the all-word sums, added into words 3 and 4.
+  poly64_mul(&sub[8], &sub[9], &t[1], &t[0]);
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+  // Reduction starts here. t[0] = word 4 plus the bits of temp[7] that its shifted copies push past word 3.
+  t[0] = temp[4] ^ ((temp[7] >> 54) ^ (temp[7] >> 59) ^ (temp[7] >> 62));
+  // Fold word 7 into word 3 with copies shifted by 0, 10, 5 and 2; the low bits of each shifted copy come from word 6.
+  c[3] = temp[3] ^ temp[7];
+  c[3] ^= (temp[7] << 10) | (temp[6] >> 54);
+  c[3] ^= (temp[7] <<  5) | (temp[6] >> 59);
+  c[3] ^= (temp[7] <<  2) | (temp[6] >> 62);
+  // Fold word 6 into word 2 the same way; the low bits come from word 5.
+  c[2] = temp[2] ^ temp[6];
+  c[2] ^= (temp[6] << 10) | (temp[5] >> 54);
+  c[2] ^= (temp[6] <<  5) | (temp[5] >> 59);
+  c[2] ^= (temp[6] <<  2) | (temp[5] >> 62);
+  // Fold word 5 into word 1; the low bits come from the corrected word 4 held in t[0].
+  c[1] = temp[1] ^ temp[5];
+  c[1] ^= (temp[5] << 10) | (t[0] >> 54);
+  c[1] ^= (temp[5] <<  5) | (t[0] >> 59);
+  c[1] ^= (temp[5] <<  2) | (t[0] >> 62);
+  // Fold the corrected word 4 into word 0. c is written only from here on, after every read of a and b.
+  c[0] = temp[0] ^ t[0];
+  c[0] ^= (t[0] << 10);
+  c[0] ^= (t[0] <<  5);
+  c[0] ^= (t[0] <<  2);
+}
+
+void GF_mul_add(GF c, const GF a, const GF b) // c ^= a * b on 4-word elements, reduced with x^256 = x^10 + x^5 + x^2 + 1
+{
+  uint64_t t[4] = {0,}; // scratch words receiving poly64_mul results
+  uint64_t temp[8] = {0,}; // unreduced 8-word product, least-significant word first
+  uint64_t sub[10] = {0,}; // XOR sums of operand words for the Karatsuba cross products
+  // Word sums: inside each half (sub[0..3]), across the halves (sub[4..7]) and over all four words (sub[8], sub[9]).
+  sub[0] = a[0] ^ a[1];
+  sub[1] = b[0] ^ b[1];
+  sub[2] = a[2] ^ a[3];
+  sub[3] = b[2] ^ b[3];
+  sub[4] = a[0] ^ a[2];
+  sub[5] = a[1] ^ a[3];
+  sub[6] = b[0] ^ b[2];
+  sub[7] = b[1] ^ b[3];
+  sub[8] = sub[4] ^ sub[5];
+  sub[9] = sub[6] ^ sub[7];
+  // Matching-word products a[i]*b[i]. poly64_mul is defined elsewhere; from its use here it appears to store the high word via argument 3 and the low word via argument 4 -- TODO confirm.
+  poly64_mul(&a[0], &b[0], &t[0], &temp[0]);
+  poly64_mul(&a[1], &b[1], &t[2], &t[1]);
+  t[0] ^= t[1];
+  // a[2]*b[2]; t[1] then combines its low word with the high word of a[1]*b[1].
+  poly64_mul(&a[2], &b[2], &t[3], &t[1]);
+  t[1] ^= t[2];
+  // a[3]*b[3]; its high word is stored straight into the top word temp[7].
+  poly64_mul(&a[3], &b[3], &temp[7], &t[2]);
+  t[2] ^= t[3];
+  // Seed words 1, 2, 3 and 6 with sums of adjacent words of those four products.
+  temp[6] = temp[7] ^ t[2];
+  temp[3] = t[2] ^ t[1];
+  temp[2] = t[1] ^ t[0];
+  temp[1] = t[0] ^ temp[0];
+  // Cross product (a[0]^a[1]) * (b[0]^b[1]) goes into words 1 and 2.
+  poly64_mul(&sub[0], &sub[1], &t[1], &t[0]);
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+  // Cross product (a[2]^a[3]) * (b[2]^b[3]) goes into words 3 and 6.
+  poly64_mul(&sub[2], &sub[3], &t[1], &t[0]);
+  temp[3] ^= t[0];
+  temp[6] ^= t[1];
+  // Outer Karatsuba level: mix the low-half and high-half sums into the middle words 2..5.
+  temp[5] = temp[7] ^ temp[3];
+  temp[4] = temp[6] ^ temp[2];
+  temp[3] ^= temp[1];
+  temp[2] ^= temp[0];
+  // Products of the across-half sums, merged into the four words t[0..3].
+  poly64_mul(&sub[4], &sub[6], &t[1], &t[0]);
+  poly64_mul(&sub[5], &sub[7], &t[3], &t[2]);
+  t[1] ^= t[2];
+  t[2] = t[1] ^ t[3];
+  t[1] ^= t[0];
+  // Add those four words into words 2..5.
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+  temp[4] ^= t[2];
+  temp[5] ^= t[3];
+  // Product of the all-word sums, added into words 3 and 4.
+  poly64_mul(&sub[8], &sub[9], &t[1], &t[0]);
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+  // Reduction starts here. t[0] = word 4 plus the bits of temp[7] that its shifted copies push past word 3.
+  t[0] = temp[4] ^ ((temp[7] >> 54) ^ (temp[7] >> 59) ^ (temp[7] >> 62));
+  // Fold word 7 into word 3 and XOR the result into the existing c[3]; the low bits of each shifted copy come from word 6.
+  c[3] ^= temp[3] ^ temp[7];
+  c[3] ^= (temp[7] << 10) | (temp[6] >> 54);
+  c[3] ^= (temp[7] <<  5) | (temp[6] >> 59);
+  c[3] ^= (temp[7] <<  2) | (temp[6] >> 62);
+  // Fold word 6 into word 2 and accumulate into c[2]; the low bits come from word 5.
+  c[2] ^= temp[2] ^ temp[6];
+  c[2] ^= (temp[6] << 10) | (temp[5] >> 54);
+  c[2] ^= (temp[6] <<  5) | (temp[5] >> 59);
+  c[2] ^= (temp[6] <<  2) | (temp[5] >> 62);
+  // Fold word 5 into word 1 and accumulate into c[1]; the low bits come from the corrected word 4 held in t[0].
+  c[1] ^= temp[1] ^ temp[5];
+  c[1] ^= (temp[5] << 10) | (t[0] >> 54);
+  c[1] ^= (temp[5] <<  5) | (t[0] >> 59);
+  c[1] ^= (temp[5] <<  2) | (t[0] >> 62);
+  // Fold the corrected word 4 into word 0 and accumulate into c[0]. c is touched only from the c[3] lines on, after every read of a and b.
+  c[0] ^= temp[0] ^ t[0];
+  c[0] ^= (t[0] << 10);
+  c[0] ^= (t[0] <<  5);
+  c[0] ^= (t[0] <<  2);
+}
+
+void GF_transposed_matmul(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD])
+{
+  // Computes c = XOR of every row of b whose index is a set bit of a.
+  // Bit positions run from a[0] upwards, least-significant bit first inside
+  // each word, and b is walked row by row in that same order.
+  //
+  // A row is selected by AND-ing it with an all-ones or all-zero word built
+  // from the bit, so no source-level branch depends on the contents of a.
+  //
+  // The sum is gathered in acc[] and only copied to c at the very end, which
+  // is what allows callers to pass the same element as both c and a.
+  uint64_t acc[4] = {0, 0, 0, 0};
+  const GF *rows = b;
+
+  // Words of a, lowest index first.
+  for (size_t w = 0; w < AIM2_NUM_WORDS_FIELD; w++)
+  {
+    uint64_t bits = a[w];
+
+    // Each pass consumes four bits of the word and the matching four rows.
+    for (size_t left = AIM2_NUM_BITS_WORD; left != 0; left -= 4)
+    {
+      for (size_t k = 0; k < 4; k++)
+      {
+        // Unsigned 0 - 1 wraps to all ones; 0 - 0 stays zero.
+        const uint64_t select = (uint64_t)0 - ((bits >> k) & 1);
+
+        acc[0] ^= rows[k][0] & select;
+        acc[1] ^= rows[k][1] & select;
+        acc[2] ^= rows[k][2] & select;
+        acc[3] ^= rows[k][3] & select;
+      }
+
+      // Move on to the next nibble of a and the next four rows of b.
+      bits >>= 4;
+      rows += 4;
+    }
+  }
+
+  // Store the finished result.
+  c[0] = acc[0];
+  c[1] = acc[1];
+  c[2] = acc[2];
+  c[3] = acc[3];
+}
+
+// Accumulating vector-matrix product over GF(2): the XOR of every row b[k]
+// whose bit k is set in a (bit k % 64 of word a[k / 64]) is XORed into c.
+//   c  accumulator, four 64-bit words; its previous content is kept
+//   a  selector bits, four 64-bit words
+//   b  AIM2_NUM_BITS_FIELD rows of four 64-bit words each
+// Rows are picked with all-ones/all-zeros masks, so no branch depends on a.
+// c is only touched after both loops have finished.
+void GF_transposed_matmul_add(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD])
+{
+  uint64_t acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;
+  const GF *rows = b; // next four rows of b to consume
+
+  for (size_t w = 0; w < AIM2_NUM_WORDS_FIELD; w++)
+  {
+    uint64_t bits = a[w];
+
+    // four bits of a, i.e. four rows of b, per iteration
+    for (size_t k = 0; k < AIM2_NUM_BITS_WORD; k += 4)
+    {
+      // selN is all-ones when bit N of the current nibble is set, else zero
+      const uint64_t sel0 = -(bits & 1);
+      const uint64_t sel1 = -((bits >> 1) & 1);
+      const uint64_t sel2 = -((bits >> 2) & 1);
+      const uint64_t sel3 = -((bits >> 3) & 1);
+
+      acc0 ^= (rows[0][0] & sel0) ^ (rows[1][0] & sel1) ^
+              (rows[2][0] & sel2) ^ (rows[3][0] & sel3);
+      acc1 ^= (rows[0][1] & sel0) ^ (rows[1][1] & sel1) ^
+              (rows[2][1] & sel2) ^ (rows[3][1] & sel3);
+      acc2 ^= (rows[0][2] & sel0) ^ (rows[1][2] & sel1) ^
+              (rows[2][2] & sel2) ^ (rows[3][2] & sel3);
+      acc3 ^= (rows[0][3] & sel0) ^ (rows[1][3] & sel1) ^
+              (rows[2][3] & sel2) ^ (rows[3][3] & sel3);
+
+      bits >>= 4;
+      rows += 4;
+    }
+  }
+
+  // fold the accumulated row sum into the caller's value
+  c[0] ^= acc0;
+  c[1] ^= acc1;
+  c[2] ^= acc2;
+  c[3] ^= acc3;
+}
+
+// Multiplies two 64-bit polynomials over GF(2) using ordinary integer multiplies:
+// *z0 receives the low and *z1 the high 64 bits of the 128-bit product. x and y
+// are cut into eight chunks each (x0..x7, y0..y7), padded, multiplied chunk by
+// chunk within each 32-bit half, and the halves are joined by one Karatsuba step.
+// NOTE(review): zero_padding, inv_zero_padding and every mask* constant are defined
+// outside this block; remarks about their effect are assumptions to be confirmed.
+static void poly64_mul_s(uint64_t *z1, uint64_t *z0, uint64_t x, uint64_t y)
+{
+  // x_low: x0..x3 are the chunks at bit offsets 0/8/16/24 of the low half (x4 = high half)
+  uint32_t x4 = x >> 32;
+  uint32_t x0 = x & mask_64;
+  uint32_t x1 = (x0 >> 8) & mask0;
+  uint32_t x2 = (x0 >> 16) & mask0;
+  uint32_t x3 = (x0 >> 24) & mask0;
+  x0 &= mask0;
+
+  // x_high: x4..x7 are the chunks at bit offsets 0/8/16/24 of the high half
+  uint32_t x5 = (x4 >> 8) & mask0;
+  uint32_t x6 = (x4 >> 16) & mask0;
+  uint32_t x7 = (x4 >> 24) & mask0;
+  x4 &= mask0;
+
+  // y_low: same split as for x
+  uint32_t y4 = y >> 32;
+  uint32_t y0 = y & mask_64;
+  uint32_t y1 = (y0 >> 8) & mask0;
+  uint32_t y2 = (y0 >> 16) & mask0;
+  uint32_t y3 = (y0 >> 24) & mask0;
+  y0 &= mask0;
+
+  // y_high
+  uint32_t y5 = (y4 >> 8) & mask0;
+  uint32_t y6 = (y4 >> 16) & mask0;
+  uint32_t y7 = (y4 >> 24) & mask0;
+  y4 &= mask0;
+
+  // x padding (presumably spreads each chunk's bits apart so integer multiplies act carry-less)
+  zero_padding(x0, mask1, mask2, mask3);
+  zero_padding(x1, mask1, mask2, mask3);
+  zero_padding(x2, mask1, mask2, mask3);
+  zero_padding(x3, mask1, mask2, mask3);
+  zero_padding(x4, mask1, mask2, mask3);
+  zero_padding(x5, mask1, mask2, mask3);
+  zero_padding(x6, mask1, mask2, mask3);
+  zero_padding(x7, mask1, mask2, mask3);
+
+  // y padding (same treatment as x)
+  zero_padding(y0, mask1, mask2, mask3);
+  zero_padding(y1, mask1, mask2, mask3);
+  zero_padding(y2, mask1, mask2, mask3);
+  zero_padding(y3, mask1, mask2, mask3);
+  zero_padding(y4, mask1, mask2, mask3);
+  zero_padding(y5, mask1, mask2, mask3);
+  zero_padding(y6, mask1, mask2, mask3);
+  zero_padding(y7, mask1, mask2, mask3);
+
+  // low half x0..x3 * y0..y3 -> padded limbs a0..a3; "& mask3_64" presumably strips carry bits
+  uint64_t a0 = ((uint64_t)x0 * (uint64_t)y0) & mask3_64;
+  uint64_t a1 = ((((uint64_t)x0 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y0) & mask3_64));
+  a0 ^= (a1 << 32); // an odd-offset term straddles two limbs: low part up into a0 ...
+  a1 = a1 >> 32;    // ... high part stays as the bottom of a1
+  a1 ^= ((((uint64_t)x0 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x1 * (uint64_t)y1) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y0) & mask3_64));
+
+  uint64_t a2 = ((((uint64_t)x0 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y2) & mask3_64) ^
+                 (((uint64_t)x2 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y0) & mask3_64));
+  a1 ^= (a2 << 32);
+  a2 = a2 >> 32;
+  a2 ^= ((((uint64_t)x1 * (uint64_t)y3) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x3 * (uint64_t)y1) & mask3_64));
+
+  uint64_t a3 = ((((uint64_t)x2 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y2) & mask3_64));
+  a2 ^= (a3 << 32);
+  a3 = a3 >> 32;
+  a3 ^= ((uint64_t)x3 * (uint64_t)y3) & mask3_64;
+
+  // high half x4..x7 * y4..y7 -> padded limbs b0..b3, same pattern as above
+  uint64_t b0 = ((uint64_t)x4 * (uint64_t)y4) & mask3_64;
+  uint64_t b1 = ((((uint64_t)x4 * (uint64_t)y5) & mask3_64) ^
+                 (((uint64_t)x5 * (uint64_t)y4) & mask3_64));
+  b0 ^= (b1 << 32);
+  b1 = b1 >> 32;
+  b1 ^= ((((uint64_t)x4 * (uint64_t)y6) & mask3_64) ^
+         (((uint64_t)x5 * (uint64_t)y5) & mask3_64) ^
+         (((uint64_t)x6 * (uint64_t)y4) & mask3_64));
+
+  uint64_t b2 = ((((uint64_t)x4 * (uint64_t)y7) & mask3_64) ^
+                 (((uint64_t)x5 * (uint64_t)y6) & mask3_64) ^
+                 (((uint64_t)x6 * (uint64_t)y5) & mask3_64) ^
+                 (((uint64_t)x7 * (uint64_t)y4) & mask3_64));
+  b1 ^= (b2 << 32);
+  b2 = b2 >> 32;
+  b2 ^= ((((uint64_t)x5 * (uint64_t)y7) & mask3_64) ^
+         (((uint64_t)x6 * (uint64_t)y6) & mask3_64) ^
+         (((uint64_t)x7 * (uint64_t)y5) & mask3_64));
+
+  uint64_t b3 = ((((uint64_t)x6 * (uint64_t)y7) & mask3_64) ^
+                 (((uint64_t)x7 * (uint64_t)y6) & mask3_64));
+  b2 ^= (b3 << 32);
+  b3 = b3 >> 32;
+  b3 ^= ((uint64_t)x7 * (uint64_t)y7) & mask3_64;
+
+  // middle part: (x_low ^ x_high) * (y_low ^ y_high) -> c0..c3; x0..x3 / y0..y3 are overwritten
+  x0 ^= x4;
+  x1 ^= x5;
+  x2 ^= x6;
+  x3 ^= x7;
+  y0 ^= y4;
+  y1 ^= y5;
+  y2 ^= y6;
+  y3 ^= y7;
+
+  uint64_t c0 = ((uint64_t)x0 * (uint64_t)y0) & mask3_64;
+  uint64_t c1 = ((((uint64_t)x0 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y0) & mask3_64));
+  c0 ^= (c1 << 32);
+  c1 = c1 >> 32;
+  c1 ^= ((((uint64_t)x0 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x1 * (uint64_t)y1) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y0) & mask3_64));
+
+  uint64_t c2 = ((((uint64_t)x0 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y2) & mask3_64) ^
+                 (((uint64_t)x2 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y0) & mask3_64));
+  c1 ^= (c2 << 32);
+  c2 = c2 >> 32;
+  c2 ^= ((((uint64_t)x1 * (uint64_t)y3) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x3 * (uint64_t)y1) & mask3_64));
+
+  uint64_t c3 = ((((uint64_t)x2 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y2) & mask3_64));
+  c2 ^= (c3 << 32);
+  c3 = c3 >> 32;
+  c3 ^= ((uint64_t)x3 * (uint64_t)y3) & mask3_64;
+  // Karatsuba: middle term = c ^ a ^ b
+  c0 = c0 ^ a0 ^ b0;
+  c1 = c1 ^ a1 ^ b1;
+  c2 = c2 ^ a2 ^ b2;
+  c3 = c3 ^ a3 ^ b3;
+  // add the middle term two limbs (32 result bits) above the low half
+  a2 ^= c0;
+  a3 ^= c1;
+  b0 ^= c2;
+  b1 ^= c3;
+
+  // result inv_padding: each value presumably shrinks to one 16-bit limb of the result
+  inv_zero_padding(a0, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a1, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a2, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a3, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b0, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b1, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b2, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b3, mask0_64, mask1_64, mask2_64, mask_final);
+
+  *z0 = a0 | (a1 << 16) | (a2 << 32) | (a3 << 48); // low 64 bits of the product
+  *z1 = b0 | (b1 << 16) | (b2 << 32) | (b3 << 48); // high 64 bits of the product
+}
+
+// c = a * b in GF(2^256); elements are four 64-bit words, word 0 least
+// significant. The 512-bit product is built in temp[0..7] by two-level Karatsuba
+// (nine poly64_mul_s calls) and then reduced with x^256 = x^10 + x^5 + x^2 + 1,
+// which is what the shift-by-10/5/2 folding at the end implements. a and b are
+// fully read before c is written.
+void GF_mul_s(GF c, const GF a, const GF b)
+{
+  uint64_t t[4] = {0,};
+  uint64_t add[4] = {0,};
+  uint64_t temp[8] = {0,};
+  // four products a[i] * b[i]; adjacent high/low halves are merged in t as we go
+  poly64_mul_s(&t[0], &temp[0], a[0], b[0]);
+  poly64_mul_s(&t[2], &t[1], a[1], b[1]);
+  t[0] ^= t[1];
+  poly64_mul_s(&t[3], &t[1], a[2], b[2]);
+  t[1] ^= t[2];
+  poly64_mul_s(&temp[7], &t[2], a[3], b[3]);
+  t[2] ^= t[3];
+  // begin assembling the half products (a0,a1)*(b0,b1) and (a2,a3)*(b2,b3)
+  temp[6] = temp[7] ^ t[2];
+  temp[3] = t[2] ^ t[1];
+  temp[2] = t[1] ^ t[0];
+  temp[1] = t[0] ^ temp[0];
+  // Karatsuba middle terms of the two half products
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[1]), (b[0] ^ b[1]));
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+  poly64_mul_s(&t[1], &t[0], (a[2] ^ a[3]), (b[2] ^ b[3]));
+  temp[3] ^= t[0];
+  temp[6] ^= t[1];
+  // outer Karatsuba level: fold the half products into the middle words
+  temp[5] = temp[7] ^ temp[3];
+  temp[4] = temp[6] ^ temp[2];
+  temp[3] ^= temp[1];
+  temp[2] ^= temp[0];
+  // middle product (a_lo ^ a_hi) * (b_lo ^ b_hi), again by Karatsuba
+  add[0] = a[0] ^ a[2];
+  add[1] = a[1] ^ a[3];
+  add[2] = b[0] ^ b[2];
+  add[3] = b[1] ^ b[3];
+  poly64_mul_s(&t[1], &t[0], add[0], add[2]);
+  poly64_mul_s(&t[3], &t[2], add[1], add[3]);
+  t[1] ^= t[2];
+  t[2] = t[1] ^ t[3];
+  t[1] ^= t[0];
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+  temp[4] ^= t[2];
+  temp[5] ^= t[3];
+  poly64_mul_s(&t[1], &t[0], (add[0] ^ add[1]), (add[2] ^ add[3]));
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+  // reduction: bits of temp[7] that the shifts below push past x^255 re-enter via t[0]
+  t[0] = temp[4] ^ ((temp[7] >> 54) ^ (temp[7] >> 59) ^ (temp[7] >> 62));
+
+  c[3] = temp[3] ^ temp[7];
+  c[3] ^= (temp[7] << 10) | (temp[6] >> 54);
+  c[3] ^= (temp[7] <<  5) | (temp[6] >> 59);
+  c[3] ^= (temp[7] <<  2) | (temp[6] >> 62);
+
+  c[2] = temp[2] ^ temp[6];
+  c[2] ^= (temp[6] << 10) | (temp[5] >> 54);
+  c[2] ^= (temp[6] <<  5) | (temp[5] >> 59);
+  c[2] ^= (temp[6] <<  2) | (temp[5] >> 62);
+
+  c[1] = temp[1] ^ temp[5];
+  c[1] ^= (temp[5] << 10) | (t[0] >> 54);
+  c[1] ^= (temp[5] <<  5) | (t[0] >> 59);
+  c[1] ^= (temp[5] <<  2) | (t[0] >> 62);
+
+  c[0] = temp[0] ^ t[0];
+  c[0] ^= (t[0] << 10);
+  c[0] ^= (t[0] <<  5);
+  c[0] ^= (t[0] <<  2);
+}
+
+// c ^= a * b in GF(2^256); elements are four 64-bit words, word 0 least
+// significant. The 512-bit product is built in temp[0..7] by two-level Karatsuba
+// (nine poly64_mul_s calls), reduced with x^256 = x^10 + x^5 + x^2 + 1, and the
+// reduced value is XORed into the previous content of c instead of replacing it.
+// a and b are fully read before c is touched.
+void GF_mul_add_s(GF c, const GF a, const GF b)
+{
+  uint64_t t[4] = {0,};
+  uint64_t add[4] = {0,};
+  uint64_t temp[8] = {0,};
+  // four products a[i] * b[i]; adjacent high/low halves are merged in t as we go
+  poly64_mul_s(&t[0], &temp[0], a[0], b[0]);
+  poly64_mul_s(&t[2], &t[1], a[1], b[1]);
+  t[0] ^= t[1];
+  poly64_mul_s(&t[3], &t[1], a[2], b[2]);
+  t[1] ^= t[2];
+  poly64_mul_s(&temp[7], &t[2], a[3], b[3]);
+  t[2] ^= t[3];
+  // begin assembling the half products (a0,a1)*(b0,b1) and (a2,a3)*(b2,b3)
+  temp[6] = temp[7] ^ t[2];
+  temp[3] = t[2] ^ t[1];
+  temp[2] = t[1] ^ t[0];
+  temp[1] = t[0] ^ temp[0];
+  // Karatsuba middle terms of the two half products
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[1]), (b[0] ^ b[1]));
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+  poly64_mul_s(&t[1], &t[0], (a[2] ^ a[3]), (b[2] ^ b[3]));
+  temp[3] ^= t[0];
+  temp[6] ^= t[1];
+  // outer Karatsuba level: fold the half products into the middle words
+  temp[5] = temp[7] ^ temp[3];
+  temp[4] = temp[6] ^ temp[2];
+  temp[3] ^= temp[1];
+  temp[2] ^= temp[0];
+  // middle product (a_lo ^ a_hi) * (b_lo ^ b_hi), again by Karatsuba
+  add[0] = a[0] ^ a[2];
+  add[1] = a[1] ^ a[3];
+  add[2] = b[0] ^ b[2];
+  add[3] = b[1] ^ b[3];
+  poly64_mul_s(&t[1], &t[0], add[0], add[2]);
+  poly64_mul_s(&t[3], &t[2], add[1], add[3]);
+  t[1] ^= t[2];
+  t[2] = t[1] ^ t[3];
+  t[1] ^= t[0];
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+  temp[4] ^= t[2];
+  temp[5] ^= t[3];
+  poly64_mul_s(&t[1], &t[0], (add[0] ^ add[1]), (add[2] ^ add[3]));
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+  // reduction: bits of temp[7] that the shifts below push past x^255 re-enter via t[0]
+  t[0] = temp[4] ^ ((temp[7] >> 54) ^ (temp[7] >> 59) ^ (temp[7] >> 62));
+
+  c[3] ^= temp[3] ^ temp[7];
+  c[3] ^= (temp[7] << 10) | (temp[6] >> 54);
+  c[3] ^= (temp[7] <<  5) | (temp[6] >> 59);
+  c[3] ^= (temp[7] <<  2) | (temp[6] >> 62);
+
+  c[2] ^= temp[2] ^ temp[6];
+  c[2] ^= (temp[6] << 10) | (temp[5] >> 54);
+  c[2] ^= (temp[6] <<  5) | (temp[5] >> 59);
+  c[2] ^= (temp[6] <<  2) | (temp[5] >> 62);
+
+  c[1] ^= temp[1] ^ temp[5];
+  c[1] ^= (temp[5] << 10) | (t[0] >> 54);
+  c[1] ^= (temp[5] <<  5) | (t[0] >> 59);
+  c[1] ^= (temp[5] <<  2) | (t[0] >> 62);
+
+  c[0] ^= temp[0] ^ t[0];
+  c[0] ^= (t[0] << 10);
+  c[0] ^= (t[0] <<  5);
+  c[0] ^= (t[0] <<  2);
+}
diff --git a/crypto_sign/aimer256f/m4speed/field.h b/crypto_sign/aimer256f/m4speed/field.h
new file mode 100644
index 00000000..089ad983
--- /dev/null
+++ b/crypto_sign/aimer256f/m4speed/field.h
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef FIELD_H
+#define FIELD_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+// Field element: four 64-bit words (AIM2_NUM_WORDS_FIELD for this parameter set).
+typedef uint64_t GF[4];
+// Every function name below is prefixed through AIMER_NAMESPACE (see params.h).
+#define poly64_mul AIMER_NAMESPACE(poly64_mul)
+void poly64_mul(const uint64_t *a, const uint64_t *b, uint64_t *c1, uint64_t *c0);
+#define GF_set0 AIMER_NAMESPACE(GF_set0)
+void GF_set0(GF a);
+#define GF_copy AIMER_NAMESPACE(GF_copy)
+void GF_copy(GF out, const GF in);
+#define GF_to_bytes AIMER_NAMESPACE(GF_to_bytes)
+void GF_to_bytes(uint8_t *out, const GF in);
+#define GF_from_bytes AIMER_NAMESPACE(GF_from_bytes)
+void GF_from_bytes(GF out, const uint8_t *in);
+// GF_mul_add and GF_transposed_matmul_add XOR their result into c instead of overwriting it.
+#define GF_add AIMER_NAMESPACE(GF_add)
+void GF_add(GF c, const GF a, const GF b);
+#define GF_mul AIMER_NAMESPACE(GF_mul)
+void GF_mul(GF c, const GF a, const GF b);
+#define GF_mul_add AIMER_NAMESPACE(GF_mul_add)
+void GF_mul_add(GF c, const GF a, const GF b);
+#define GF_transposed_matmul AIMER_NAMESPACE(GF_transposed_matmul)
+void GF_transposed_matmul(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]);
+#define GF_transposed_matmul_add AIMER_NAMESPACE(GF_transposed_matmul_add)
+void GF_transposed_matmul_add(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]);
+// "_s" variants: GF_mul_s and GF_mul_add_s are built on the C routine poly64_mul_s in field.c.
+#define GF_mul_s AIMER_NAMESPACE(GF_mul_s)
+void GF_mul_s(GF c, const GF a, const GF b);
+#define GF_mul_add_s AIMER_NAMESPACE(GF_mul_add_s)
+void GF_mul_add_s(GF c, const GF a, const GF b);
+#define GF_sqr_s AIMER_NAMESPACE(GF_sqr_s)
+void GF_sqr_s(GF c, const GF a);
+
+#endif // FIELD_H
diff --git a/crypto_sign/aimer256f/m4speed/hash.c b/crypto_sign/aimer256f/m4speed/hash.c
new file mode 100644
index 00000000..06a6f473
--- /dev/null
+++ b/crypto_sign/aimer256f/m4speed/hash.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+
+#include "hash.h"
+#include <stddef.h>
+#include <stdint.h>
+
+void hash_init(hash_instance *ctx) // begin a new incremental SHAKE256 computation in ctx
+{
+  shake256_inc_init(ctx);
+}
+
+void hash_init_prefix(hash_instance *ctx, uint8_t prefix) // like hash_init, plus a domain byte
+{
+  shake256_inc_init(ctx);
+  shake256_inc_absorb(ctx, &prefix, sizeof(prefix)); // prefix is the first byte absorbed
+}
+
+void hash_update(hash_instance *ctx, const uint8_t *data, size_t data_len) // absorb more input
+{
+  shake256_inc_absorb(ctx, data, data_len);
+}
+
+void hash_final(hash_instance *ctx) // end the absorb phase; call before hash_squeeze
+{
+  shake256_inc_finalize(ctx);
+}
+
+void hash_squeeze(hash_instance *ctx, uint8_t *buffer, size_t buffer_len) // next output bytes
+{
+  shake256_inc_squeeze(buffer, buffer_len, ctx); // note: the backend takes ctx last
+}
+
+void hash_ctx_clone(hash_instance *ctx_dest, const hash_instance *ctx_src) // copy a hash state
+{
+  shake256_inc_ctx_clone(ctx_dest, ctx_src);
+}
+
+void hash_ctx_release(hash_instance *ctx) // hand the state back to the SHAKE256 backend
+{
+  shake256_inc_ctx_release(ctx);
+}
diff --git a/crypto_sign/aimer256f/m4speed/hash.h b/crypto_sign/aimer256f/m4speed/hash.h
new file mode 100644
index 00000000..0fb7c8ef
--- /dev/null
+++ b/crypto_sign/aimer256f/m4speed/hash.h
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef HASH_H
+#define HASH_H
+
+#include "fips202.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+// Domain-separation bytes for hash_init_prefix; the notes list the uses visible in sign.c.
+static const uint8_t HASH_PREFIX_0 = 0; // message pre-hash (mu)
+static const uint8_t HASH_PREFIX_1 = 1; // h_1
+static const uint8_t HASH_PREFIX_2 = 2; // h_2
+static const uint8_t HASH_PREFIX_3 = 3; // salt and root seeds
+static const uint8_t HASH_PREFIX_4 = 4; // NOTE(review): not used by the signing code in sign.c; confirm in tree.c
+static const uint8_t HASH_PREFIX_5 = 5; // per-party commitment and tape (commit_and_expand_tape)
+
+typedef shake256incctx hash_instance; // incremental SHAKE256 state from fips202.h
+// Thin wrappers over the shake256_inc_* API; names are prefixed through AIMER_NAMESPACE.
+#define hash_init AIMER_NAMESPACE(hash_init)
+void hash_init(hash_instance *ctx);
+#define hash_init_prefix AIMER_NAMESPACE(hash_init_prefix)
+void hash_init_prefix(hash_instance *ctx, uint8_t prefix);
+#define hash_update AIMER_NAMESPACE(hash_update)
+void hash_update(hash_instance *ctx, const uint8_t *data, size_t data_len);
+#define hash_final AIMER_NAMESPACE(hash_final)
+void hash_final(hash_instance *ctx);
+#define hash_squeeze AIMER_NAMESPACE(hash_squeeze)
+void hash_squeeze(hash_instance *ctx, uint8_t *buffer, size_t buffer_len);
+#define hash_ctx_clone AIMER_NAMESPACE(hash_ctx_clone)
+void hash_ctx_clone(hash_instance *ctx_dest, const hash_instance *ctx_src);
+#define hash_ctx_release AIMER_NAMESPACE(hash_ctx_release)
+void hash_ctx_release(hash_instance *ctx);
+
+#endif // HASH_H
diff --git a/crypto_sign/aimer256f/m4speed/params.h b/crypto_sign/aimer256f/m4speed/params.h
new file mode 100644
index 00000000..39714e9a
--- /dev/null
+++ b/crypto_sign/aimer256f/m4speed/params.h
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#define AIMER_NAMESPACE(s)          samsungsds_aimer256f_m4speed_##s // per-build symbol prefix
+
+#define SECURITY_BITS               256                  // security parameter
+#define SECURITY_BYTES              (SECURITY_BITS / 8)  // byte size of security parameter
+
+#define AIM2_NUM_BITS_FIELD         SECURITY_BITS        // number of bits in field element
+#define AIM2_NUM_BYTES_FIELD        SECURITY_BYTES       // number of bytes in field element
+#define AIM2_NUM_WORDS_FIELD        (SECURITY_BITS / 64) // number of 64-bit words in element
+#define AIM2_NUM_BITS_WORD          64                   // number of bits in word
+#define AIM2_IV_SIZE                SECURITY_BYTES       // byte size of AIM2 initial vector
+
+#define AIM2_NUM_INPUT_SBOX         3                    // number of AIM2 input S-boxes
+
+#define AIMER_SALT_SIZE             SECURITY_BYTES       // byte size of salt
+#define AIMER_SEED_SIZE             SECURITY_BYTES       // byte size of seed
+#define AIMER_COMMIT_SIZE           (SECURITY_BYTES * 2) // byte size of commitment
+
+#define AIMER_L                     AIM2_NUM_INPUT_SBOX  // alias of the input S-box count
+#define AIMER_T                     65                   // number of parallel repetitions (Tau)
+#define AIMER_N                     16                   // number of MPC parties (N)
+#define AIMER_LOGN                  4                    // log_2(N)
+
+#endif // PARAMS_H
diff --git a/crypto_sign/aimer256f/m4speed/sign.c b/crypto_sign/aimer256f/m4speed/sign.c
new file mode 100644
index 00000000..ff32211b
--- /dev/null
+++ b/crypto_sign/aimer256f/m4speed/sign.c
@@ -0,0 +1,573 @@
+// SPDX-License-Identifier: MIT
+
+#include "api.h"
+#include "aim2.h"
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include "randombytes.h"
+#include "sign.h"
+#include "tree.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+// Clones ctx_precom, absorbs rep || party || seed (rep and party truncated to
+// one byte each) and squeezes AIMER_COMMIT_SIZE bytes into commit, followed by
+// sizeof(tape_t) bytes into *tape.
+void commit_and_expand_tape(tape_t *tape, uint8_t *commit,
+                            const hash_instance *ctx_precom,
+                            const uint8_t seed[AIMER_SEED_SIZE],
+                            size_t rep, size_t party)
+{
+  uint8_t input[2 + AIMER_SEED_SIZE] = {(uint8_t)rep, (uint8_t)party};
+  memcpy(&input[2], seed, AIMER_SEED_SIZE);
+
+  hash_instance xof;
+  hash_ctx_clone(&xof, ctx_precom);
+  hash_update(&xof, input, sizeof(input));
+  hash_final(&xof);
+  hash_squeeze(&xof, commit, AIMER_COMMIT_SIZE);
+  hash_squeeze(&xof, (uint8_t *)tape, sizeof(*tape));
+  hash_ctx_release(&xof);
+}
+
+// One party's share of the multiplication-check values: every S-box relation
+// x^(2^e - 1) = y is used in product form, so z_shares[i] = x^(2^e) + x * c.
+// x_shares[AIMER_L] must be preset; the matrix_A[i] * x_shares[i] terms are XORed in.
+void aim2_mpc(mult_chk_t *mult_chk,
+              const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+              const GF ct_GF)
+{
+  uint64_t *const x_last = mult_chk->x_shares[AIMER_L];
+  uint64_t *const z_last = mult_chk->z_shares[AIMER_L];
+  // S-box 0: z = x^(2^11) + x * aim2_constants[0]
+  GF_sqr_s(mult_chk->z_shares[0], mult_chk->x_shares[0]);
+  for (size_t left = 10; left > 0; left--)
+  {
+    GF_sqr_s(mult_chk->z_shares[0], mult_chk->z_shares[0]);
+  }
+  GF_mul_add(mult_chk->z_shares[0], mult_chk->x_shares[0], aim2_constants[0]);
+  GF_transposed_matmul_add(x_last, mult_chk->x_shares[0], matrix_A[0]);
+
+  // S-box 1: the x^(2^e) term comes from aim2_e2_power_matrix rather than squarings
+  GF_mul(mult_chk->z_shares[1], mult_chk->x_shares[1], aim2_constants[1]);
+  GF_transposed_matmul_add(mult_chk->z_shares[1], mult_chk->x_shares[1],
+                           aim2_e2_power_matrix);
+  GF_transposed_matmul_add(x_last, mult_chk->x_shares[1], matrix_A[1]);
+
+  // S-box 2: z = x^(2^7) + x * aim2_constants[2]
+  GF_sqr_s(mult_chk->z_shares[2], mult_chk->x_shares[2]);
+  for (size_t left = 6; left > 0; left--)
+  {
+    GF_sqr_s(mult_chk->z_shares[2], mult_chk->z_shares[2]);
+  }
+  GF_mul_add(mult_chk->z_shares[2], mult_chk->x_shares[2], aim2_constants[2]);
+  GF_transposed_matmul_add(x_last, mult_chk->x_shares[2], matrix_A[2]);
+
+  // last S-box, fed by the accumulated affine layer: z = x^(2^3) + x * ct
+  GF_sqr_s(z_last, x_last);
+  GF_sqr_s(z_last, z_last);
+  GF_sqr_s(z_last, z_last);
+  GF_mul_add(z_last, x_last, ct_GF);
+}
+
+// Phase 1 of signing: commits to the seeds and the execution views of the parties.
+// Writes sign->salt, the delta_* fields of every sign->proofs[rep] and sign->h_1,
+// and fills commits, nodes, mult_chk and alpha_v_shares for the later phases.
+// sk is read as pt || iv || ct; m (mlen bytes) is the message being signed.
+void run_phase_1(signature_t *sign,
+                 uint8_t commits[AIMER_T][AIMER_N][AIMER_COMMIT_SIZE],
+                 uint8_t nodes[AIMER_T][2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                 mult_chk_t mult_chk[AIMER_T][AIMER_N],
+                 GF alpha_v_shares[AIMER_T][2][AIMER_N],
+                 const uint8_t *sk, const uint8_t *m, size_t mlen)
+{
+  GF pt_GF = {0,}, ct_GF = {0,};
+  GF_from_bytes(pt_GF, sk);
+  GF_from_bytes(ct_GF, sk + AIM2_NUM_BYTES_FIELD + AIM2_IV_SIZE);
+
+  // message pre-hashing: mu = H(prefix 0 || iv || ct || m)
+  hash_instance ctx;
+  hash_init_prefix(&ctx, HASH_PREFIX_0);
+  hash_update(&ctx, sk + AIM2_NUM_BYTES_FIELD,
+              AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx, m, mlen);
+  hash_final(&ctx);
+  uint8_t mu[AIMER_COMMIT_SIZE];
+  hash_squeeze(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+
+  // compute first L sboxes' outputs
+  GF sbox_outputs[AIMER_L];
+  aim2_sbox_outputs(sbox_outputs, pt_GF);
+
+  // derive the binary matrix and the vector from the initial vector
+  GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+  generate_matrix_LU(matrix_A, vector_b, sk + AIM2_NUM_BYTES_FIELD);
+
+  // generate per-signature randomness
+  uint8_t random[SECURITY_BYTES];
+  randombytes(random, SECURITY_BYTES);
+
+  // generate salt: first output bytes of H(prefix 3 || pt || mu || random)
+  hash_init_prefix(&ctx, HASH_PREFIX_3);
+  hash_update(&ctx, sk, AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx, random, SECURITY_BYTES);
+  hash_final(&ctx);
+  hash_squeeze(&ctx, sign->salt, AIMER_SALT_SIZE);
+
+  // generate root seeds (squeezed from the same XOF, after the salt) and expand seed trees
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    hash_squeeze(&ctx, nodes[rep][0], AIMER_SEED_SIZE);
+  }
+  expand_trees(nodes, sign->salt);
+  hash_ctx_release(&ctx);
+
+  // hash_instance for h_1: absorbs mu, salt, then per repetition all commitments and offsets
+  hash_init_prefix(&ctx, HASH_PREFIX_1);
+  hash_update(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx, sign->salt, AIMER_SALT_SIZE);
+  // state with prefix 5 and the salt absorbed; commit_and_expand_tape clones it per party
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    // initialize adjustment values
+    tape_t delta, tape;
+    memset(&delta, 0, sizeof(tape_t));
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      // generate execution views and commitments
+      commit_and_expand_tape(&tape, commits[rep][party], &ctx_precom,
+                             nodes[rep][party + AIMER_N - 1], rep, party);
+      hash_update(&ctx, commits[rep][party], AIMER_COMMIT_SIZE);
+
+      // compute offsets: delta accumulates the sum of all parties' tape values
+      GF_add(delta.pt_share, delta.pt_share, tape.pt_share);
+      GF_add(delta.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+      GF_add(delta.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+      GF_add(delta.t_shares[2], delta.t_shares[2], tape.t_shares[2]);
+      GF_add(delta.a_share, delta.a_share, tape.a_share);
+      GF_add(delta.c_share, delta.c_share, tape.c_share);
+      GF_set0(mult_chk[rep][party].x_shares[AIMER_L]); // affine accumulator used by aim2_mpc
+
+      // adjust the last share and prepare the proof and h_1
+      if (party == AIMER_N - 1)
+      {
+        GF_add(delta.pt_share, delta.pt_share, pt_GF);
+        GF_add(delta.t_shares[0], delta.t_shares[0], sbox_outputs[0]);
+        GF_add(delta.t_shares[1], delta.t_shares[1], sbox_outputs[1]);
+        GF_add(delta.t_shares[2], delta.t_shares[2], sbox_outputs[2]);
+        GF_mul_add_s(delta.c_share, pt_GF, delta.a_share); // delta_c = sum(c) + pt * sum(a)
+        // publish the offsets in the proof
+        GF_to_bytes(sign->proofs[rep].delta_pt_bytes, delta.pt_share);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[0], delta.t_shares[0]);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[1], delta.t_shares[1]);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[2], delta.t_shares[2]);
+        GF_to_bytes(sign->proofs[rep].delta_c_bytes, delta.c_share);
+        // the last party's tape absorbs the offsets (a_share is left as squeezed)
+        GF_add(tape.pt_share, delta.pt_share, tape.pt_share);
+        GF_add(tape.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+        GF_add(tape.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+        GF_add(tape.t_shares[2], delta.t_shares[2], tape.t_shares[2]);
+        GF_add(tape.c_share, delta.c_share, tape.c_share);
+        // only the last party starts its affine accumulator from vector_b instead of zero
+        GF_copy(mult_chk[rep][party].x_shares[AIMER_L], vector_b);
+      }
+
+      // run the MPC simulation and prepare the mult check inputs
+      GF_copy(mult_chk[rep][party].pt_share, tape.pt_share);
+      GF_copy(mult_chk[rep][party].x_shares[0], tape.t_shares[0]);
+      GF_copy(mult_chk[rep][party].x_shares[1], tape.t_shares[1]);
+      GF_copy(mult_chk[rep][party].x_shares[2], tape.t_shares[2]);
+      GF_copy(alpha_v_shares[rep][0][party], tape.a_share);
+      GF_copy(alpha_v_shares[rep][1][party], tape.c_share);
+
+      aim2_mpc(&mult_chk[rep][party],
+               (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A, ct_GF);
+    }
+
+    // NOTE: depend on the order of values in proof_t
+    hash_update(&ctx, sign->proofs[rep].delta_pt_bytes,
+                AIM2_NUM_BYTES_FIELD * (AIMER_L + 2));
+  }
+  hash_ctx_release(&ctx_precom);
+  // commit to salt, (all commitments of parties' seeds,
+  // delta_pt, delta_t, delta_c) for all repetitions
+  hash_final(&ctx);
+  hash_squeeze(&ctx, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+}
+
+// Phases 2 and 3 of signing. Expands sign->h_1 into AIMER_L + 1 challenges
+// (epsilons) per repetition, folds each party's multiplication-check inputs
+// into its alpha/v shares in alpha_v_shares (updated in place), opens alpha,
+// and hashes all shares together with h_1 and the salt into sign->h_2.
+void run_phase_2_and_3(signature_t *sign,
+                       GF alpha_v_shares[AIMER_T][2][AIMER_N],
+                       const mult_chk_t mult_chk[AIMER_T][AIMER_N])
+{
+  // XOF over h_1 that yields the challenges
+  hash_instance eps_xof;
+  hash_init(&eps_xof);
+  hash_update(&eps_xof, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&eps_xof);
+
+  // running hash for h_2
+  hash_instance h2_ctx;
+  hash_init_prefix(&h2_ctx, HASH_PREFIX_2);
+  hash_update(&h2_ctx, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_update(&h2_ctx, sign->salt, AIMER_SALT_SIZE);
+
+  GF epsilons[AIMER_L + 1];
+  GF alpha = {0,};
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    GF *alpha_shares = alpha_v_shares[rep][0];
+    GF *v_shares = alpha_v_shares[rep][1];
+    const mult_chk_t *chk = mult_chk[rep];
+
+    GF_set0(alpha);
+    // fresh challenges for this repetition
+    hash_squeeze(&eps_xof, (uint8_t *)epsilons, sizeof(epsilons));
+
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      // alpha_share = a_share + sum x_share[i] * eps[i]
+      // v_share = c_share - pt_share * alpha + sum z_share[i] * eps[i]
+      for (size_t i = 0; i <= AIMER_L; i++)
+      {
+        GF_mul_add(alpha_shares[party], chk[party].x_shares[i], epsilons[i]);
+        GF_mul_add(v_shares[party], chk[party].z_shares[i], epsilons[i]);
+      }
+      GF_add(alpha, alpha, alpha_shares[party]);
+    }
+
+    // alpha is now opened; add the pt_share * alpha term to every v_share
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      GF_mul_add(v_shares[party], chk[party].pt_share, alpha);
+    }
+
+    // alpha_v_shares[rep] holds the alpha shares followed by the v shares of all parties
+    hash_update(&h2_ctx, (const uint8_t *)alpha_v_shares[rep],
+                AIM2_NUM_BYTES_FIELD * 2 * AIMER_N);
+  }
+
+  hash_final(&h2_ctx);
+  hash_squeeze(&h2_ctx, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&h2_ctx);
+  hash_ctx_release(&eps_xof);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Generates a key pair: sk = pt || iv || ct and pk = iv || ct, where pt and iv
+// are random and ct is what aim2 writes. Returns -1 on a NULL pointer, else 0.
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk)
+{
+  if (pk == NULL || sk == NULL)
+  {
+    return -1;
+  }
+
+  randombytes(sk, AIM2_NUM_BYTES_FIELD); // pt
+  randombytes(pk, AIM2_IV_SIZE);         // iv
+  aim2(&pk[AIM2_IV_SIZE], sk, pk);       // ct, written right after iv
+  memcpy(&sk[AIM2_NUM_BYTES_FIELD], pk, AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  return 0;
+}
+
+// Creates a detached signature of m (mlen bytes) under sk.
+//   sig     output buffer, filled through a signature_t view
+//   siglen  receives CRYPTO_BYTES
+// Always returns 0. All working state is held in stack arrays whose sizes
+// scale with AIMER_T and AIMER_N.
+int crypto_sign_signature(uint8_t *sig, size_t *siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk)
+{
+  signature_t *sign = (signature_t *)sig;
+
+  // seed-tree nodes, one tree per repetition
+  uint8_t nodes[AIMER_T][2 * AIMER_N - 1][AIMER_SEED_SIZE];
+
+  // per-party seed commitments
+  uint8_t commits[AIMER_T][AIMER_N][AIMER_COMMIT_SIZE];
+
+  // inputs of the multiplication check
+  mult_chk_t mult_chk[AIMER_T][AIMER_N];
+
+  // outputs of the multiplication check
+  GF alpha_v_shares[AIMER_T][2][AIMER_N];
+
+  //
+  // Phase 1: commit to the seeds and the execution views of the parties.
+  //
+  run_phase_1(sign, commits, nodes, mult_chk, alpha_v_shares, sk, m, mlen);
+
+  //
+  // Phases 2 and 3: derive the challenge and commit to the simulation of
+  // the multiplication checking protocol.
+  //
+  run_phase_2_and_3(sign, alpha_v_shares,
+                    (const mult_chk_t (*)[AIMER_N])mult_chk);
+
+  //
+  // Phase 4: challenge the views of the MPC protocol. h_2 is expanded into
+  // one byte per repetition, of which the low AIMER_LOGN bits are used.
+  //
+  uint8_t indices[AIMER_T]; // AIMER_N <= 256
+  hash_instance chal_xof;
+
+  hash_init(&chal_xof);
+  hash_update(&chal_xof, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_final(&chal_xof);
+  hash_squeeze(&chal_xof, indices, AIMER_T);
+  hash_ctx_release(&chal_xof);
+
+  //
+  // Phase 5: open the views. For every repetition store the reveal path for
+  // party i_bar, that party's commitment and its alpha share.
+  //
+  const uint8_t index_mask = (1 << AIMER_LOGN) - 1;
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    const size_t i_bar = indices[rep] & index_mask;
+
+    reveal_all_but(sign->proofs[rep].reveal_path,
+                   (const uint8_t (*)[AIMER_SEED_SIZE])nodes[rep], i_bar);
+    memcpy(sign->proofs[rep].missing_commitment, commits[rep][i_bar],
+           AIMER_COMMIT_SIZE);
+    GF_to_bytes(sign->proofs[rep].missing_alpha_share_bytes,
+                alpha_v_shares[rep][0][i_bar]);
+  }
+
+  *siglen = CRYPTO_BYTES;
+  return 0;
+}
+
+// Produces the signed message sm = m || signature and sets *smlen to its length.
+int crypto_sign(uint8_t *sm, size_t *smlen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk)
+{
+  // sign first: the signature goes behind the message slot and m is still intact
+  int ret = crypto_sign_signature(sm + mlen, smlen, m, mlen, sk);
+  memmove(sm, m, mlen); // sm may equal m (in-place use); memcpy would be undefined there
+  *smlen += mlen;
+  return ret;
+}
+
+int crypto_sign_verify(const uint8_t *sig, size_t siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *pk)
+{
+  if (siglen != CRYPTO_BYTES)
+  {
+    return -1;
+  }
+  // Accept (0) iff h_1, h_2 recomputed from sig, m and pk equal those in sig.
+  const signature_t *sign = (const signature_t *)sig;
+
+  GF ct_GF = {0,};
+  GF_from_bytes(ct_GF, pk + AIM2_IV_SIZE); // pk bytes that follow the iv
+
+  // derive the binary matrix and the vector from the initial vector
+  GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+  generate_matrix_LU(matrix_A, vector_b, pk);
+
+  hash_instance ctx_e, ctx_h1, ctx_h2;
+
+  // indices = Expand(h_2)
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  uint8_t indices[AIMER_T]; // AIMER_N <= 256
+  hash_squeeze(&ctx_e, indices, AIMER_T);
+  hash_ctx_release(&ctx_e);
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    indices[rep] &= (1 << AIMER_LOGN) - 1; // keep the low AIMER_LOGN bits
+  }
+
+  // epsilons = Expand(h_1)
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e); // squeezed once per repetition in the loop below
+
+  // message pre-hashing
+  uint8_t mu[AIMER_COMMIT_SIZE];
+  hash_init_prefix(&ctx_h1, HASH_PREFIX_0);
+  hash_update(&ctx_h1, pk, AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx_h1, m, mlen);
+  hash_final(&ctx_h1);
+  hash_squeeze(&ctx_h1, mu, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h1);
+
+  // ready for computing h_1' and h_2'
+  hash_init_prefix(&ctx_h1, HASH_PREFIX_1);
+  hash_update(&ctx_h1, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_h1, sign->salt, AIMER_SALT_SIZE);
+
+  hash_init_prefix(&ctx_h2, HASH_PREFIX_2);
+  hash_update(&ctx_h2, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_h2, sign->salt, AIMER_SALT_SIZE);
+  // salted prefix state handed to commit_and_expand_tape for every party
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    size_t i_bar = indices[rep]; // party whose seed is not revealed
+    uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE];
+    // rebuild the seed tree of this repetition from the revealed sibling path
+    reconstruct_tree(nodes, sign->salt, sign->proofs[rep].reveal_path,
+                     rep, i_bar);
+    // alpha_v_shares[0][*] hold alpha shares, alpha_v_shares[1][*] v shares
+    GF pt_shares[AIMER_N];
+    GF alpha_v_shares[2][AIMER_N];
+    GF_set0(alpha_v_shares[1][i_bar]);
+
+    GF epsilons[AIMER_L + 1];
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF alpha = {0,};
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      if (party == i_bar)
+      {
+        hash_update(&ctx_h1, sign->proofs[rep].missing_commitment,
+                    AIMER_COMMIT_SIZE);
+        GF_from_bytes(alpha_v_shares[0][i_bar],
+                      sign->proofs[rep].missing_alpha_share_bytes);
+        GF_add(alpha, alpha, alpha_v_shares[0][i_bar]);
+        continue;
+      }
+      // every other party is recomputed from its leaf seed (node N + party)
+      tape_t tape;
+      uint8_t commit[AIMER_COMMIT_SIZE];
+      commit_and_expand_tape(&tape, commit, &ctx_precom,
+                             nodes[AIMER_N + party - 2], rep, party);
+      hash_update(&ctx_h1, commit, AIMER_COMMIT_SIZE);
+
+      // adjust last shares
+      mult_chk_t mult_chk;
+      memset(&mult_chk, 0, sizeof(mult_chk_t));
+      if (party == AIMER_N - 1)
+      {
+        GF temp = {0,};
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_pt_bytes);
+        GF_add(tape.pt_share, tape.pt_share, temp);
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[0]);
+        GF_add(tape.t_shares[0], tape.t_shares[0], temp);
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[1]);
+        GF_add(tape.t_shares[1], tape.t_shares[1], temp);
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[2]);
+        GF_add(tape.t_shares[2], tape.t_shares[2], temp);
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_c_bytes);
+        GF_add(tape.c_share, tape.c_share, temp);
+
+        GF_copy(mult_chk.x_shares[AIMER_L], vector_b);
+      }
+
+      // run the MPC simulation and prepare the mult check inputs
+      GF_copy(mult_chk.x_shares[0], tape.t_shares[0]);
+      GF_copy(mult_chk.x_shares[1], tape.t_shares[1]);
+      GF_copy(mult_chk.x_shares[2], tape.t_shares[2]);
+      GF_copy(pt_shares[party], tape.pt_share);
+      GF_copy(alpha_v_shares[0][party], tape.a_share);
+      GF_copy(alpha_v_shares[1][party], tape.c_share);
+      aim2_mpc(&mult_chk, (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A, ct_GF);
+      // NOTE: indices 0..3 are hard-coded, i.e. this assumes AIMER_L == 3
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[3], epsilons[3]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[3], epsilons[3]);
+
+      GF_add(alpha, alpha, alpha_v_shares[0][party]);
+    }
+
+    // alpha is opened, so we can finish calculating v_share
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      if (party == i_bar)
+      {
+        continue;
+      }
+      // the hidden party's v share becomes the GF_add sum of all the others
+      GF_mul_add(alpha_v_shares[1][party], pt_shares[party], alpha);
+      GF_add(alpha_v_shares[1][i_bar], alpha_v_shares[1][i_bar],
+             alpha_v_shares[1][party]);
+    }
+
+    // v is opened
+    hash_update(&ctx_h2, (const uint8_t *)alpha_v_shares,
+                sizeof(alpha_v_shares));
+
+    // NOTE: depend on the order of values in proof_t
+    hash_update(&ctx_h1, sign->proofs[rep].delta_pt_bytes,
+                AIM2_NUM_BYTES_FIELD * (AIMER_L + 2));
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+
+  uint8_t h_1_prime[AIMER_COMMIT_SIZE];
+  hash_final(&ctx_h1);
+  hash_squeeze(&ctx_h1, h_1_prime, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h1);
+
+  uint8_t h_2_prime[AIMER_COMMIT_SIZE];
+  hash_final(&ctx_h2);
+  hash_squeeze(&ctx_h2, h_2_prime, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h2);
+  // a mismatch in either recomputed digest rejects the signature
+  if (memcmp(h_1_prime, sign->h_1, AIMER_COMMIT_SIZE) != 0 ||
+      memcmp(h_2_prime, sign->h_2, AIMER_COMMIT_SIZE) != 0)
+  {
+    return -1;
+  }
+
+  return 0;
+}
+
+int crypto_sign_open(uint8_t *m, size_t *mlen,
+        const uint8_t *sm, size_t smlen,
+        const uint8_t *pk)
+{
+  // Open sm = m || sig: returns 0 and copies m out when sig verifies under
+  // pk, otherwise returns -1 and leaves m and *mlen untouched.
+  if (smlen < CRYPTO_BYTES)
+  {
+    return -1;
+  }
+
+  // the signature is the trailing CRYPTO_BYTES of sm
+  const size_t body_len = smlen - CRYPTO_BYTES;
+  const int verdict = crypto_sign_verify(sm + body_len, CRYPTO_BYTES,
+                                         sm, body_len, pk);
+  if (verdict != 0)
+  {
+    return -1;
+  }
+  memmove(m, sm, body_len); // memmove tolerates m overlapping sm
+  *mlen = body_len;
+  return 0;
+}
diff --git a/crypto_sign/aimer256f/m4speed/sign.h b/crypto_sign/aimer256f/m4speed/sign.h
new file mode 100644
index 00000000..e64c4350
--- /dev/null
+++ b/crypto_sign/aimer256f/m4speed/sign.h
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef SIGN_H
+#define SIGN_H
+
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+typedef struct tape_t // per-party values produced by commit_and_expand_tape
+{
+  GF pt_share;          // last party adds delta_pt_bytes to it on verify
+  GF t_shares[AIMER_L]; // copied into mult_chk_t.x_shares[0..2] on verify
+  GF a_share;           // starting value of the party's alpha share
+  GF c_share;           // starting value of the party's v share
+} tape_t;
+
+typedef struct mult_chk_t // per-party state passed to aim2_mpc
+{
+  GF pt_share;              // NOTE(review): left zeroed by crypto_sign_verify
+  GF x_shares[AIMER_L + 1]; // folded with epsilons into the alpha share
+  GF z_shares[AIMER_L + 1]; // folded with epsilons into the v share
+} mult_chk_t;
+
+typedef struct proof_t // per-repetition part of the signature, bytes only
+{
+  uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE]; // [0] = sibling of the leaf
+  uint8_t missing_commitment[AIMER_COMMIT_SIZE];    // commitment of hidden party
+  uint8_t delta_pt_bytes[AIM2_NUM_BYTES_FIELD];  // NOTE: keep the delta_* order;
+  uint8_t delta_ts_bytes[AIMER_L][AIM2_NUM_BYTES_FIELD]; // verify hashes all
+  uint8_t delta_c_bytes[AIM2_NUM_BYTES_FIELD];   // AIMER_L + 2 of them as one run
+  uint8_t missing_alpha_share_bytes[AIM2_NUM_BYTES_FIELD]; // hidden alpha share
+} proof_t;
+
+typedef struct signature_t // crypto_sign_verify casts the raw sig bytes to this
+{
+  uint8_t salt[AIMER_SALT_SIZE];
+  uint8_t h_1[AIMER_COMMIT_SIZE]; // expanded into the epsilons on verify
+  uint8_t h_2[AIMER_COMMIT_SIZE]; // expanded into the hidden-party indices
+  proof_t proofs[AIMER_T];        // one proof per repetition
+} signature_t;
+
+#define aim2_mpc AIMER_NAMESPACE(aim2_mpc) // NOTE(review): body not in view
+void aim2_mpc(mult_chk_t *mult_chk, // verify reads x_shares/z_shares afterwards
+              const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+              const GF ct_GF);
+
+#define commit_and_expand_tape AIMER_NAMESPACE(commit_and_expand_tape)
+void commit_and_expand_tape(tape_t *tape, uint8_t *commit, // both are outputs
+                            const hash_instance *ctx_precom, // salted prefix
+                            const uint8_t seed[AIMER_SEED_SIZE], // leaf seed
+                            size_t rep, size_t party);
+
+#define run_phase_1 AIMER_NAMESPACE(run_phase_1) // NOTE(review): body not in view
+void run_phase_1(signature_t *sign, // takes sk: presumably signing-side only
+                 uint8_t commits[AIMER_T][AIMER_N][AIMER_COMMIT_SIZE],
+                 uint8_t nodes[AIMER_T][2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                 mult_chk_t mult_chk[AIMER_T][AIMER_N],
+                 GF alpha_v_shares[AIMER_T][2][AIMER_N],
+                 const uint8_t *sk, const uint8_t *m, size_t mlen);
+
+#define run_phase_2_and_3 AIMER_NAMESPACE(run_phase_2_and_3)
+void run_phase_2_and_3(signature_t *sign, // NOTE(review): body not in view
+                       GF alpha_v_shares[AIMER_T][2][AIMER_N], // not const
+                       const mult_chk_t mult_chk[AIMER_T][AIMER_N]); // read-only
+
+#endif // SIGN_H
diff --git a/crypto_sign/aimer256f/m4speed/tree.c b/crypto_sign/aimer256f/m4speed/tree.c
new file mode 100644
index 00000000..84c23d7f
--- /dev/null
+++ b/crypto_sign/aimer256f/m4speed/tree.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: MIT
+
+#include "tree.h"
+#include "hash.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+//  Example of tree for [N = 8]
+//  x
+//  d = 0: 1
+//  d = 1: 2         3
+//  d = 2: 4   5     6     7
+//  d = 3: 8 9 10 11 12 13 14 15
+
+void expand_trees(uint8_t nodes[AIMER_T][2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                  const uint8_t salt[AIMER_SALT_SIZE])
+{
+  // Fill every seed tree from its root nodes[rep][0]: node i (1-based, kept
+  // at nodes[rep][i - 1]) is hashed into its children 2i and 2i + 1.
+  uint8_t msg[AIMER_SEED_SIZE + 2]; // rep byte || node byte || parent seed
+  hash_instance base, fork;
+  // absorb the prefix and salt once; each node hashes a clone of this state
+  hash_init_prefix(&base, HASH_PREFIX_4);
+  hash_update(&base, salt, AIMER_SALT_SIZE);
+  for (size_t tree = 0; tree < AIMER_T; tree++)
+  {
+    msg[0] = (uint8_t)tree;
+    for (size_t parent = 1; parent < AIMER_N; parent++)
+    {
+      msg[1] = (uint8_t)parent;
+      memcpy(&msg[2], nodes[tree][parent - 1], AIMER_SEED_SIZE);
+      hash_ctx_clone(&fork, &base);
+      hash_update(&fork, msg, sizeof(msg));
+      hash_final(&fork);
+      // one squeeze yields both children, stored back to back
+      hash_squeeze(&fork, nodes[tree][2 * parent - 1], AIMER_SEED_SIZE << 1);
+      hash_ctx_release(&fork);
+    }
+  }
+  hash_ctx_release(&base);
+}
+
+void reveal_all_but(uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                    const uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                    size_t cover_index)
+{
+  // Walk from the leaf (node cover_index + AIMER_N, 1-based) towards the
+  // root and copy out the sibling seed of each level, so reveal_path[0] is
+  // the sibling of the covered leaf. Node i is kept at nodes[i - 1].
+  const size_t leaf = cover_index + AIMER_N;
+  for (size_t level = 0; level < AIMER_LOGN; level++)
+  {
+    const size_t sibling = (leaf >> level) ^ 1; // flip the lowest path bit
+    memcpy(reveal_path[level], nodes[sibling - 1], AIMER_SEED_SIZE);
+  }
+}
+
+void reconstruct_tree(uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE],
+                      const uint8_t salt[AIMER_SALT_SIZE],
+                      const uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                      size_t rep_index,
+                      size_t cover_index)
+{
+  size_t index, depth, path; // node numbers are 1-based, root = 1 (not stored)
+  uint8_t buffer[AIMER_SEED_SIZE + 2]; // rep byte || node byte || parent seed
+  // Rebuilds the seeds derivable from reveal_path; node i is at nodes[i - 2].
+  hash_instance ctx, ctx_;
+  hash_init_prefix(&ctx_, HASH_PREFIX_4);
+  hash_update(&ctx_, salt, AIMER_SALT_SIZE);
+  // Top-down: plant the revealed sibling of each level, then expand the level.
+  for (depth = 1; depth < AIMER_LOGN; depth++)
+  {
+    path = ((cover_index + AIMER_N) >> (AIMER_LOGN - depth)) ^ 1;
+    memcpy(nodes[path - 2], reveal_path[AIMER_LOGN - depth], AIMER_SEED_SIZE);
+    // all nodes of this level are expanded, the unrevealed path node included
+    for (index = (1U << depth); index < (2U << depth); index++)
+    {
+      buffer[0] = (uint8_t)(rep_index);
+      buffer[1] = (uint8_t)(index);
+      memcpy(buffer + 2, nodes[index - 2], AIMER_SEED_SIZE);
+
+      hash_ctx_clone(&ctx, &ctx_);
+      hash_update(&ctx, buffer, AIMER_SEED_SIZE + 2);
+      hash_final(&ctx);
+      hash_squeeze(&ctx, nodes[2 * index - 2], AIMER_SEED_SIZE << 1);
+      hash_ctx_release(&ctx);
+    }
+  }
+  hash_ctx_release(&ctx_);
+  // depth == AIMER_LOGN here: plant the leaf-level sibling from reveal_path[0]
+  path = ((cover_index + AIMER_N) >> (AIMER_LOGN - depth)) ^ 1;
+  memcpy(nodes[path - 2], reveal_path[AIMER_LOGN - depth], AIMER_SEED_SIZE);
+}
diff --git a/crypto_sign/aimer256f/m4speed/tree.h b/crypto_sign/aimer256f/m4speed/tree.h
new file mode 100644
index 00000000..b5a27867
--- /dev/null
+++ b/crypto_sign/aimer256f/m4speed/tree.h
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef TREE_H
+#define TREE_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define expand_trees AIMER_NAMESPACE(expand_trees) // node i at nodes[rep][i - 1]
+void expand_trees(uint8_t nodes[AIMER_T][2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                  const uint8_t salt[AIMER_SALT_SIZE]); // in: roots nodes[rep][0]
+
+#define reveal_all_but AIMER_NAMESPACE(reveal_all_but) // node i at nodes[i - 1]
+void reveal_all_but(uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE], // out
+                    const uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                    size_t cover_index); // leaf whose seed stays hidden
+
+#define reconstruct_tree AIMER_NAMESPACE(reconstruct_tree)
+void reconstruct_tree(uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE], // out
+                      const uint8_t salt[AIMER_SALT_SIZE],
+                      const uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                      size_t rep_index,    // NOTE: node i is at nodes[i - 2],
+                      size_t cover_index); // unlike expand_trees (i - 1)
+
+#endif // TREE_H
diff --git a/crypto_sign/aimer256f/m4stack/__asm_field.S b/crypto_sign/aimer256f/m4stack/__asm_field.S
new file mode 100644
index 00000000..6181c602
--- /dev/null
+++ b/crypto_sign/aimer256f/m4stack/__asm_field.S
@@ -0,0 +1,695 @@
+#include "params.h"
+
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+.text
+
+.align  2
+.global AIMER_NAMESPACE(GF_to_bytes)
+.global AIMER_NAMESPACE(GF_from_bytes)
+.global AIMER_NAMESPACE(GF_copy)
+.type   AIMER_NAMESPACE(GF_to_bytes), %function
+.type   AIMER_NAMESPACE(GF_from_bytes), %function
+.type   AIMER_NAMESPACE(GF_copy), %function
+AIMER_NAMESPACE(GF_to_bytes):    // three entry points share one body:
+AIMER_NAMESPACE(GF_from_bytes):  // copy eight 32-bit words from [R1] to [R0]
+AIMER_NAMESPACE(GF_copy):        // without any byte reordering
+  out_p       .req R0
+  in_p        .req R1
+  // clobbers R2 and R3 only; NOTE(review): word loads on byte buffers
+  .equ width, 4
+  // presumably rely on unaligned access being permitted - confirm
+  ldr.w R2, [in_p, #0 * width]
+  ldr.w R3, [in_p, #1 * width]
+  str.w R2, [out_p, #0 * width]
+  str.w R3, [out_p, #1 * width]
+
+  ldr.w R2, [in_p, #2 * width]
+  ldr.w R3, [in_p, #3 * width]
+  str.w R2, [out_p, #2 * width]
+  str.w R3, [out_p, #3 * width]
+
+  ldr.w R2, [in_p, #4 * width]
+  ldr.w R3, [in_p, #5 * width]
+  str.w R2, [out_p, #4 * width]
+  str.w R3, [out_p, #5 * width]
+
+  ldr.w R2, [in_p, #6 * width]
+  ldr.w R3, [in_p, #7 * width]
+  str.w R2, [out_p, #6 * width]
+  str.w R3, [out_p, #7 * width]
+
+  bx    lr
+
+  .unreq in_p
+  .unreq out_p
+
+.align  2
+.global AIMER_NAMESPACE(GF_set0)
+.type   AIMER_NAMESPACE(GF_set0), %function
+AIMER_NAMESPACE(GF_set0):        // stores eight zero words at [R0]
+  out_p       .req R0
+
+  .equ width, 4
+  // R2 is the only scratch register
+  mov.w R2, #0
+  str.w R2, [out_p, #0 * width]
+  str.w R2, [out_p, #1 * width]
+  str.w R2, [out_p, #2 * width]
+  str.w R2, [out_p, #3 * width]
+  str.w R2, [out_p, #4 * width]
+  str.w R2, [out_p, #5 * width]
+  str.w R2, [out_p, #6 * width]
+  str.w R2, [out_p, #7 * width]
+
+  bx    lr
+
+  .unreq out_p
+
+.align  2
+.global AIMER_NAMESPACE(GF_add)
+.type   AIMER_NAMESPACE(GF_add), %function
+AIMER_NAMESPACE(GF_add):         // [R0] = [R1] XOR [R2] over eight words
+  out_p       .req R0
+  in0_p       .req R1
+  in1_p       .req R2
+  // Each word is loaded from both inputs before it is stored, so out_p may
+  .equ width, 4
+  // equal in0_p or in1_p. Scratch registers: R3 and R12.
+  ldr.w R3,  [in0_p, #0 * width]
+  ldr.w R12, [in1_p, #0 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #0 * width]
+
+  ldr.w R3,  [in0_p, #1 * width]
+  ldr.w R12, [in1_p, #1 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #1 * width]
+
+  ldr.w R3,  [in0_p, #2 * width]
+  ldr.w R12, [in1_p, #2 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #2 * width]
+
+  ldr.w R3,  [in0_p, #3 * width]
+  ldr.w R12, [in1_p, #3 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #3 * width]
+
+  ldr.w R3,  [in0_p, #4 * width]
+  ldr.w R12, [in1_p, #4 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #4 * width]
+
+  ldr.w R3,  [in0_p, #5 * width]
+  ldr.w R12, [in1_p, #5 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #5 * width]
+
+  ldr.w R3,  [in0_p, #6 * width]
+  ldr.w R12, [in1_p, #6 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #6 * width]
+
+  ldr.w R3,  [in0_p, #7 * width]
+  ldr.w R12, [in1_p, #7 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #7 * width]
+
+  bx    lr
+
+  .unreq out_p
+  .unreq in0_p
+  .unreq in1_p
+
+.macro or_shift_and in_a, con_a, off_a  // in_a = (in_a | in_a << off_a) & con_a
+  orr.w \in_a, \in_a, \in_a, lsl #\off_a
+  and.w \in_a, \in_a, \con_a             // one step of spreading bits apart
+.endm
+
+.align  2
+.global AIMER_NAMESPACE(GF_sqr_s)
+.type   AIMER_NAMESPACE(GF_sqr_s), %function
+AIMER_NAMESPACE(GF_sqr_s):       // [R0] = square of the 8-word value at [R1]
+  out_p       .req R0
+  in_p        .req R1
+  // a[k] = 64-bit input limb, temp[k] = limb of the bit-spread input, c[k] = out
+  in0         .req R2
+  in1         .req R3
+  in2         .req R4
+  in3         .req R5
+  in4         .req R6
+  in5         .req R7
+  in6         .req R8
+  in7         .req R9
+  in8         .req R10
+  in9         .req R11
+  // masks for spreading 16 bits over the even bit positions of a word
+  .equ C0,    1431655765  // 0x55555555
+  .equ C1,    858993459   // 0x33333333
+  .equ C2,    252645135   // 0x0F0F0F0F
+  .equ C3,    16711935    // 0x00FF00FF
+  .equ C4,    4294967295  // 0xFFFFFFFF
+  .equ width, 4
+
+  push.w    {R4-R12, lr}
+  // upper half first: words 4..7 become temp[4..7] in in0..in7 (low, high)
+  ldr.w in0, [in_p, #4 * width]  // a[2]
+  ldr.w in2, [in_p, #5 * width]
+  ldr.w in4, [in_p, #6 * width]  // a[3]
+  ldr.w in6, [in_p, #7 * width]
+
+  lsr.w in1, in0, #16
+  lsr.w in3, in2, #16
+  lsr.w in5, in4, #16
+  lsr.w in7, in6, #16
+
+  mov.w R12, C4
+  // keep the low 16 bits (R12 lsr 16 = 0x0000FFFF) in the even registers
+  and.w in0, in0, R12, lsr #16
+  and.w in2, in2, R12, lsr #16
+  and.w in4, in4, R12, lsr #16
+  and.w in6, in6, R12, lsr #16
+
+  or_shift_and in0, C3, 8
+  or_shift_and in1, C3, 8
+  or_shift_and in2, C3, 8
+  or_shift_and in3, C3, 8
+  or_shift_and in4, C3, 8
+  or_shift_and in5, C3, 8
+  or_shift_and in6, C3, 8
+  or_shift_and in7, C3, 8
+
+  or_shift_and in0, C2, 4
+  or_shift_and in1, C2, 4
+  or_shift_and in2, C2, 4
+  or_shift_and in3, C2, 4
+  or_shift_and in4, C2, 4
+  or_shift_and in5, C2, 4
+  or_shift_and in6, C2, 4
+  or_shift_and in7, C2, 4
+
+  or_shift_and in0, C1, 2
+  or_shift_and in1, C1, 2
+  or_shift_and in2, C1, 2
+  or_shift_and in3, C1, 2
+  or_shift_and in4, C1, 2
+  or_shift_and in5, C1, 2
+  or_shift_and in6, C1, 2
+  or_shift_and in7, C1, 2
+
+  or_shift_and in0, C0, 1
+  or_shift_and in1, C0, 1
+  or_shift_and in2, C0, 1
+  or_shift_and in3, C0, 1
+  or_shift_and in4, C0, 1
+  or_shift_and in5, C0, 1
+  or_shift_and in6, C0, 1
+  or_shift_and in7, C0, 1
+
+  // t = temp[4] ^ ((temp[7] >> 54) ^ (temp[7] >> 59) ^ (temp[7] >> 62));
+  eor.w in0, in0, in7, lsr #22
+  eor.w in0, in0, in7, lsr #27
+  eor.w in0, in0, in7, lsr #30
+  // from here on (in0, in1) hold t; temp[5] is parked to free in2, in3
+  push.w {in2, in3}              // temp[5]
+
+  ldr.w in2, [in_p, #2 * width]  // a[1]
+  ldr.w in8, [in_p, #3 * width]
+  // spread a[1] into temp[2] = (in2, in3) and temp[3] = (in8, in9)
+  lsr.w in3, in2, #16
+  lsr.w in9, in8, #16
+
+  and.w in2, in2, R12, lsr #16
+  and.w in8, in8, R12, lsr #16
+
+  or_shift_and in2, C3, 8
+  or_shift_and in3, C3, 8
+  or_shift_and in8, C3, 8
+  or_shift_and in9, C3, 8
+
+  or_shift_and in2, C2, 4
+  or_shift_and in3, C2, 4
+  or_shift_and in8, C2, 4
+  or_shift_and in9, C2, 4
+
+  or_shift_and in2, C1, 2
+  or_shift_and in3, C1, 2
+  or_shift_and in8, C1, 2
+  or_shift_and in9, C1, 2
+
+  or_shift_and in2, C0, 1
+  or_shift_and in3, C0, 1
+  or_shift_and in8, C0, 1
+  or_shift_and in9, C0, 1
+
+  // c[3] = temp[3] ^ temp[7];
+  eor.w in8, in8, in6
+  eor.w in9, in9, in7
+
+  // c[3] ^= (temp[7] << 10) | (temp[6] >> 54);
+  // c[3] ^= (temp[7] <<  5) | (temp[6] >> 59);  (and << 2 | >> 62 as well)
+  eor.w in8, in8, in5, lsr #22
+  eor.w in8, in8, in5, lsr #27
+  eor.w in8, in8, in5, lsr #30
+
+  eor.w in8, in8, in6, lsl #10
+  eor.w in8, in8, in6, lsl #5
+  eor.w in8, in8, in6, lsl #2
+
+  eor.w in9, in9, in6, lsr #22
+  eor.w in9, in9, in6, lsr #27
+  eor.w in9, in9, in6, lsr #30
+
+  eor.w in9, in9, in7, lsl #10
+  eor.w in9, in9, in7, lsl #5
+  eor.w in9, in9, in7, lsl #2
+  // first store to out_p: input words 2..7 have all been loaded by now
+  str.w in8, [out_p, #6 * width]
+  str.w in9, [out_p, #7 * width]
+
+  // c[2] = temp[2] ^ temp[6];
+  eor.w in2, in2, in4
+  eor.w in3, in3, in5
+
+  // c[2] ^= (temp[6] << 10) | (temp[5] >> 54);
+  // c[2] ^= (temp[6] <<  5) | (temp[5] >> 59);
+  // c[2] ^= (temp[6] <<  2) | (temp[5] >> 62);
+  // temp[7] in (in6, in7) is no longer needed, so temp[5] is restored there
+  pop.w {in6, in7}               // temp[5]
+
+  eor.w in2, in2, in7, lsr #22
+  eor.w in2, in2, in7, lsr #27
+  eor.w in2, in2, in7, lsr #30
+
+  eor.w in2, in2, in4, lsl #10
+  eor.w in2, in2, in4, lsl #5
+  eor.w in2, in2, in4, lsl #2
+
+  eor.w in3, in3, in4, lsr #22
+  eor.w in3, in3, in4, lsr #27
+  eor.w in3, in3, in4, lsr #30
+
+  eor.w in3, in3, in5, lsl #10
+  eor.w in3, in3, in5, lsl #5
+  eor.w in3, in3, in5, lsl #2
+
+  str.w in2, [out_p, #4 * width]
+  str.w in3, [out_p, #5 * width]
+  // words 0..1 are read only now; out_p words 0..3 are still unwritten
+  ldr.w in2, [in_p, #0 * width]  // a[0]
+  ldr.w in4, [in_p, #1 * width]
+  // spread a[0] into temp[0] = (in2, in3) and temp[1] = (in4, in5)
+  lsr.w in3, in2, #16
+  lsr.w in5, in4, #16
+
+  and.w in2, in2, R12, lsr #16
+  and.w in4, in4, R12, lsr #16
+
+  or_shift_and in2, C3, 8
+  or_shift_and in3, C3, 8
+  or_shift_and in4, C3, 8
+  or_shift_and in5, C3, 8
+
+  or_shift_and in2, C2, 4
+  or_shift_and in3, C2, 4
+  or_shift_and in4, C2, 4
+  or_shift_and in5, C2, 4
+
+  or_shift_and in2, C1, 2
+  or_shift_and in3, C1, 2
+  or_shift_and in4, C1, 2
+  or_shift_and in5, C1, 2
+
+  or_shift_and in2, C0, 1
+  or_shift_and in3, C0, 1
+  or_shift_and in4, C0, 1
+  or_shift_and in5, C0, 1
+
+  // c[1] = temp[1] ^ temp[5];
+  eor.w in4, in4, in6
+  eor.w in5, in5, in7
+
+  // c[1] ^= (temp[5] << 10) | (t >> 54);
+  // c[1] ^= (temp[5] <<  5) | (t >> 59);
+  // c[1] ^= (temp[5] <<  2) | (t >> 62);
+  eor.w in4, in4, in1, lsr #22
+  eor.w in4, in4, in1, lsr #27
+  eor.w in4, in4, in1, lsr #30
+
+  eor.w in4, in4, in6, lsl #10
+  eor.w in4, in4, in6, lsl #5
+  eor.w in4, in4, in6, lsl #2
+
+  eor.w in5, in5, in6, lsr #22
+  eor.w in5, in5, in6, lsr #27
+  eor.w in5, in5, in6, lsr #30
+
+  eor.w in5, in5, in7, lsl #10
+  eor.w in5, in5, in7, lsl #5
+  eor.w in5, in5, in7, lsl #2
+
+  str.w in4, [out_p, #2 * width]
+  str.w in5, [out_p, #3 * width]
+
+  // c[0] = temp[0] ^ t;
+  eor.w in2, in2, in0
+  eor.w in3, in3, in1
+
+  // c[0] ^= (t << 10);
+  // c[0] ^= (t << 5);
+  // c[0] ^= (t << 2);
+  eor.w in2, in2, in0, lsl #10
+  eor.w in2, in2, in0, lsl #5
+  eor.w in2, in2, in0, lsl #2
+
+  eor.w in3, in3, in0, lsr #22
+  eor.w in3, in3, in0, lsr #27
+  eor.w in3, in3, in0, lsr #30
+
+  eor.w in3, in3, in1, lsl #10
+  eor.w in3, in3, in1, lsl #5
+  eor.w in3, in3, in1, lsl #2
+
+  str.w in2, [out_p, #0 * width]
+  str.w in3, [out_p, #1 * width]
+
+  pop.w {R4-R12, pc}
+  // NOTE(review): in0..in9 are never released with .unreq
+  // unlink register name
+  .unreq in_p
+  .unreq out_p
+
+.macro lut_access0_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  and \sp1, \mask, \b0_1, lsr #\offset  // byte offset of the entry picked by b0_1
+  and \sp0, \mask, \b0_0, lsr #\offset  // byte offset of the entry picked by b0_0
+  add \sp1, \sp1, sp                    // the table base is the current sp
+  add \sp0, \sp0, sp
+  // first window: the accumulator is loaded rather than shifted and XORed
+  ldr \out1_0, [\sp1, #0]
+  ldr \in0_2, [\sp1, #4]
+  // the b0_1 entry lands in words 1..2, the b0_0 entry in words 0..1
+  ldr \in0_0, [\sp0, #0]
+  ldr \in0_1, [\sp0, #4]
+  eor \in0_1, \in0_1, \out1_0
+.endm
+
+.macro lut_access0_1_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  and \sp1, \mask, \b0_1, lsr #\offset
+  and \sp0, \mask, \b0_0, lsr #\offset
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+  // second window: in0_3 is written here for the first time (carry of word 2)
+  lsr \in0_3, \in0_2, #28
+  ldr \out1_0, [\sp1, #0]
+  ldr \out1_1, [\sp1, #4]
+  // shift the accumulator left by 4 bits, interleaved with the table loads
+  lsl \in0_2, \in0_2, #4
+  ldr \out0_0, [\sp0, #0]
+  ldr \out0_1, [\sp0, #4]
+
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  lsl \in0_1, \in0_1, #4
+  orr \in0_1, \in0_1, \in0_0, lsr #28
+  lsl \in0_0, \in0_0, #4
+  // XOR in both entries: b0_1 entry at words 1..2, b0_0 entry at words 0..1
+  eor \in0_1, \in0_1, \out1_0
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro lut_access0_1 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  and \sp1, \mask, \b0_1, lsr #\offset
+  and \sp0, \mask, \b0_0, lsr #\offset
+  lsl \in0_3, \in0_3, #4                // generic window: all four words shift
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+  // accumulator (in0_3:in0_2:in0_1:in0_0) <<= 4, interleaved with the loads
+  orr \in0_3, \in0_3, \in0_2, lsr #28
+  ldr \out1_0, [\sp1, #0]
+  ldr \out1_1, [\sp1, #4]
+
+  lsl \in0_2, \in0_2, #4
+  ldr \out0_0, [\sp0, #0]
+  ldr \out0_1, [\sp0, #4]
+
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  lsl \in0_1, \in0_1, #4
+  orr \in0_1, \in0_1, \in0_0, lsr #28
+  lsl \in0_0, \in0_0, #4
+  // XOR in both entries: b0_1 entry at words 1..2, b0_0 entry at words 0..1
+  eor \in0_1, \in0_1, \out1_0
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro lut_access1 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  lsl \in0_3, \in0_3, #4
+  and \sp1, \mask, \b0_1, lsl #\offset  // lowest window: shift left, not right
+  orr \in0_3, \in0_3, \in0_2, lsr #28
+  and \sp0, \mask, \b0_0, lsl #\offset
+  // same shift-by-4 and XOR as lut_access0_1, with ldmia for the entry loads
+  lsl \in0_2, \in0_2, #4
+  add \sp1, \sp1, sp
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  add \sp0, \sp0, sp
+
+  lsl   \in0_1, \in0_1, #4
+  ldmia \sp1, {\out1_0-\out1_1}
+  orr   \in0_1, \in0_1, \in0_0, lsr #28
+  ldmia \sp0, {\out0_0-\out0_1}
+  lsl   \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, offset
+  and \mask, \one, \r0_ret, lsr #\offset  // bit number offset of r0_ret
+  sub \mask, \zero, \mask                 // 0 or all-ones, without a branch
+  and \mask0_1, \b0_0, \mask
+  and \mask0_2, \b0_1, \mask
+  // masked (b0_1:b0_0) << offset, spread over three words
+  lsl \mask0_0, \mask0_1, #\offset
+  lsr \mask0_1, \mask0_1, #32 - \offset
+  orr \mask0_1, \mask0_1, \mask0_2, lsl #\offset
+  lsr \mask0_2, \mask0_2, #32 - \offset
+  // XORed in starting at word 1, i.e. 32 further bits to the left
+  eor \in0_1, \in0_1, \mask0_0
+  eor \in0_2, \in0_2, \mask0_1
+  eor \in0_3, \in0_3, \mask0_2
+.endm
+
+.macro last_mask1 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, offset
+  sub \mask, \zero, \r0_ret, lsr #\offset // no AND with one, unlike last_mask0
+  and \mask0_1, \b0_0, \mask
+  and \mask0_2, \b0_1, \mask
+  // masked (b0_1:b0_0) << offset, spread over three words
+  lsl \mask0_0, \mask0_1, #\offset
+  lsr \mask0_1, \mask0_1, #32 - \offset
+  orr \mask0_1, \mask0_1, \mask0_2, lsl #\offset
+  lsr \mask0_2, \mask0_2, #32 - \offset
+  // XORed in starting at word 1, i.e. 32 further bits to the left
+  eor \in0_1, \in0_1, \mask0_0
+  eor \in0_2, \in0_2, \mask0_1
+  eor \in0_3, \in0_3, \mask0_2
+.endm
+
+.global AIMER_NAMESPACE(poly64_mul)
+.type   AIMER_NAMESPACE(poly64_mul), %function
+AIMER_NAMESPACE(poly64_mul):     // XOR-based (carry-less) 64 x 64 -> 128 multiply
+  t0_0    .req R0                // a at [R0] (2 words), b at [R1] (2 words)
+  t0_1    .req R1                // result words 2..3 go to [R2], 0..1 to [R3]
+  t1_0    .req R2
+  t1_1    .req R3
+  t2_0    .req R4
+  t2_1    .req R5
+  t3_0    .req R6
+  t3_1    .req R7
+  t4_0    .req R8
+  t4_1    .req R9
+  t5_0    .req R10
+  t5_1    .req R11
+  // the names below alias the same registers in later phases
+  r1_copy .req R12
+  t_base  .req R14
+
+  sp0     .req R12
+  sp1     .req R14
+
+  b0_0    .req R0
+  b0_1    .req R1
+
+  in0_0   .req R2
+  in0_1   .req R3
+  in0_2   .req R4
+  in0_3   .req R5
+
+  out0_0  .req R6
+  out0_1  .req R7
+  out1_0  .req R8
+  out1_1  .req R9
+
+  mask    .req R10
+
+  zero    .req R6
+  one     .req R7
+
+  r0_ret  .req R8
+
+  mask0_0 .req R9
+  mask0_1 .req R11
+  mask0_2 .req R12
+
+  push  {R4-R11, lr}
+  push  {R2-R3}   // output pointers, reloaded from sp + 132 / sp + 136
+
+  ldr   t1_0, [R0, #0]
+  ldr   t1_1, [R0, #4]
+  push  {t1_1}    // unmasked high word of a, reloaded from sp + 128
+
+  sub   sp, #128  // allocating space in the stack
+                  // 8 bytes * 16 = 128 bytes
+  mov   t_base, sp
+  mov   r1_copy, R1
+  // table entry k at sp + 8 * k is k times a, with the top 3 bits of a cleared
+  mov   t0_0, #0
+  mov   t0_1, #0
+
+  and   t1_1, #0x1FFFFFFF
+  // t2 = 2a, t3 = 3a, t4 = 4a, t5 = 5a (shifts and XORs only)
+  lsl   t2_1, t1_1, #1
+  orr   t2_1, t2_1, t1_0, lsr #31
+  lsl   t2_0, t1_0, #1
+
+  eor   t3_0, t1_0, t2_0
+  eor   t3_1, t1_1, t2_1
+
+  lsl   t4_1, t2_1, #1
+  orr   t4_1, t4_1, t2_0, lsr #31
+  lsl   t4_0, t2_0, #1
+
+  eor   t5_0, t1_0, t4_0
+  eor   t5_1, t1_1, t4_1
+  // entries 0..5
+  stmia t_base!, {t0_0-t5_1}  // 4 bytes X 12 elements  = 48 bytes
+  // next six: t0 = 6a, t1 = 7a, t2 = 8a, t3 = 9a, t4 = 10a, t5 = 11a
+  eor   t0_0, t2_0, t4_0
+  eor   t0_1, t2_1, t4_1
+
+  lsl   t2_1, t4_1, #1
+  orr   t2_1, t2_1, t4_0, lsr #31
+  lsl   t2_0, t4_0, #1
+
+  eor   t5_0, t2_0, t3_0
+  eor   t5_1, t2_1, t3_1
+
+  eor   t3_0, t2_0, t1_0
+  eor   t3_1, t2_1, t1_1
+
+  eor   t4_0, t4_0, t0_0
+  eor   t4_1, t4_1, t0_1
+
+  eor   t4_0, t4_0, t2_0
+  eor   t4_1, t4_1, t2_1
+
+  eor   t1_0, t1_0, t0_0
+  eor   t1_1, t1_1, t0_1
+  // entries 6..11
+  stmia t_base!, {t0_0-t5_1}  // 4 bytes X 12 elements  = 48 bytes
+  // last four: t0 = 12a, t1 = 13a, t2 = 14a, t3 = 15a
+  eor   t1_0, t5_0, t0_0
+  eor   t1_1, t5_1, t0_1
+
+  eor   t2_0, t2_0, t0_0
+  eor   t2_1, t2_1, t0_1
+
+  eor   t3_0, t3_0, t0_0
+  eor   t3_1, t3_1, t0_1
+
+  eor   t0_0, t4_0, t0_0
+  eor   t0_1, t4_1, t0_1
+  // entries 12..15
+  stmia t_base, {t0_0-t3_1}  // 4 bytes X 8 elements  = 32 bytes
+
+  ldmia r1_copy, {b0_0-b0_1}
+  mov   mask, #0x00000078     // 4-bit index scaled by the 8-byte entry size
+  // consume b in 4-bit windows from the top; both words share each step
+  lut_access0_0   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 25
+  lut_access0_1_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 21
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 17
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 13
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 9
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 5
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 1
+  lut_access1     sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 3
+
+  mov   zero, #0
+  mov   one,  #1
+  ldr   r0_ret, [sp, #128]
+  // add back the three bits of a that were cleared before building the table
+  last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 29
+  last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 30
+  last_mask1 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 31
+
+  ldr   R0, [sp, #132]  // original R2
+  ldr   R1, [sp, #136]  // original R3
+  add   sp, #140  // restoring stack
+
+  str   in0_0, [R1, #0]
+  str   in0_1, [R1, #4]
+
+  str   in0_2, [R0, #0]
+  str   in0_3, [R0, #4]
+
+  pop   {R4-R11, pc}
+
+  // unlink register name
+  .unreq t0_0
+  .unreq t0_1
+  .unreq t1_0
+  .unreq t1_1
+  .unreq t2_0
+  .unreq t2_1
+  .unreq t3_0
+  .unreq t3_1
+  .unreq t4_0
+  .unreq t4_1
+  .unreq t5_0
+  .unreq t5_1
+
+  .unreq r1_copy
+  .unreq t_base
+
+  .unreq sp0
+  .unreq sp1
+
+  .unreq b0_0
+  .unreq b0_1
+
+  .unreq in0_0
+  .unreq in0_1
+  .unreq in0_2
+  .unreq in0_3
+
+  .unreq out0_0
+  .unreq out0_1
+  .unreq out1_0
+  .unreq out1_1
+
+  .unreq mask
+
+  .unreq zero
+  .unreq one
+
+  .unreq r0_ret
+
+  .unreq mask0_0
+  .unreq mask0_1
+  .unreq mask0_2
diff --git a/crypto_sign/aimer256f/m4stack/aim2.c b/crypto_sign/aimer256f/m4stack/aim2.c
new file mode 100644
index 00000000..74e41922
--- /dev/null
+++ b/crypto_sign/aimer256f/m4stack/aim2.c
@@ -0,0 +1,598 @@
+// SPDX-License-Identifier: MIT
+
+#include "aim2.h"
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+// Inverse Mersenne S-box with e1 = 11: out = in ^ d, where
+// d = (2 ^ 11 - 1) ^ (-1) mod (2 ^ 256 - 1)
+//   = 0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5
+// d is built top-down from: b6d6dadb5 b6 b6d6dadb5 b6 b6d6dadb5 b6 b6d6dadb5 b6 b6d6dadb5 b6 b6d6dadb5
+void GF_exp_invmer_e_1(GF out, const GF in)
+{
+  size_t i;
+  GF t1 = {0,}; // running power of in
+  GF table_5 = {0,}, table_6 = {0,}; // small powers of in; some are reused for longer chunks below
+  GF table_a = {0,}, table_b = {0,}, table_d = {0,};
+
+  // t1 = in ^ 4 (table_d holds in ^ 2 until it is overwritten with in ^ 13)
+  GF_sqr_s(table_d, in);
+  GF_sqr_s(t1, table_d);
+
+  // table_5 = in ^ 5
+  GF_mul_s(table_5, t1, in);
+  // table_6 = in ^ 6
+  GF_mul_s(table_6, table_5, in);
+  // table_a = in ^ 10 = (in ^ 5) ^ 2
+  GF_sqr_s(table_a, table_5);
+  // table_b = in ^ 11 = in ^ 5 * in ^ 6
+  GF_mul_s(table_b, table_5, table_6);
+  // table_d = in ^ 13 = in ^ 11 * in ^ 2
+  GF_mul_s(table_d, table_b, table_d);
+
+  // t1 = in ^ (0xb0); then table_b = in ^ (0xb6), table_5 = in ^ (0xb5)
+  GF_sqr_s(t1, table_b);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(table_5, t1, table_5);
+  GF_mul_s(table_b, t1, table_6);
+
+  // t1 = in ^ (0xb6 d): four squarings shift the exponent by one hex digit
+  GF_sqr_s(t1, table_b);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6d 6)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_6);
+
+  // t1 = in ^ (0xb6d6 d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6d6d a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xb6d6da d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // table_5 = in ^ (0xb6d6dad b5); from here on table_5 is the 36-bit chunk b6d6dadb5
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(table_5, t1, table_5);
+
+  // t1 = in ^ (0xb6d6dadb5 b6); 8 squarings in total, the first one copies table_5 into t1
+  GF_sqr_s(t1, table_5);
+  for (i = 1; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xb6d6dadb5b6 b6d6dadb5); 36 squarings make room for the 9-digit chunk
+  for (i = 0; i < 36; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xb6d6dadb5b6b6d6dadb5 b6)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xb6d6dadb5b6b6d6dadb5b6 b6d6dadb5)
+  for (i = 0; i < 36; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5 b6)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5b6 b6d6dadb5)
+  for (i = 0; i < 36; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5 b6)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5b6 b6d6dadb5)
+  for (i = 0; i < 36; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5 b6)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // out = in ^ (0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5b6 b6d6dadb5)
+  for (i = 0; i < 36; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(out, t1, table_5); // the only write to out in this function
+}
+
+// Inverse Mersenne S-box with e2 = 141: out = in ^ d, where
+// d = (2 ^ 141 - 1) ^ (-1) mod (2 ^ 256 - 1)
+//   = 0x2224448889112222444888911222244488911122244448891112224444889111
+// d is built top-down from: 222444 8889112 222444 8889112 222444 889111 222444 4889111 222444 4889111
+void GF_exp_invmer_e_2(GF out, const GF in)
+{
+  size_t i;
+  GF t1 = {0,}, t2 = {0,}, t3 = {0,}, t4 = {0,}, t5 = {0,}; // t1 is the running power; t2..t5 cache chunks
+  GF table_9 = {0,};
+
+  // t2 = in ^ (0x11), table_9 = in ^ 9
+  GF_sqr_s(t1, in);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(table_9, t1, in); // t1 = in ^ 8 here
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t2, t1, in);
+
+  // t3 = in ^ (0x111)
+  GF_sqr_s(t1, t2);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t3, t1, in);
+
+  // t4 = in ^ (0x222444) = (in ^ 0x88911) ^ 4, with 0x88911 = (0x111 << 11) + 0x111
+  GF_sqr_s(t1, t3);
+  for (i = 0; i < 10; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t3);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t4, t1);
+
+  // t1 = in ^ (0x222444 8889); 0x8889 = (0x111 << 7) + 9, appended in two steps
+  GF_sqr_s(t1, t4);
+  for (i = 1; i < 9; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t3); // 9 squarings so far, counting the one that copied t4 into t1
+
+  for (i = 0; i < 7; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_9);
+
+  // t1 = in ^ (0x2224448889 11)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t5 = in ^ (0x222444888911 2); the trailing digit 2 is (x * 8 + 1) * 2
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, in);
+  GF_sqr_s(t5, t1);
+
+  // t1 = in ^ (0x2224448889112 2224448889112); 52 squarings in total for the 13-digit chunk
+  GF_sqr_s(t1, t5);
+  for (i = 1; i < 52; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t5);
+
+  // t1 = in ^ (0x22244488891122224448889112 222444)
+  for (i = 0; i < 24; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t4);
+
+  // t1 = in ^ (0x22244488891122224448889112222444 889); 0x889 = (0x11 << 7) + 9, in two steps
+  for (i = 0; i < 5; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  for (i = 0; i < 7; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_9);
+
+  // t1 = in ^ (0x22244488891122224448889112222444889 111)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t3);
+
+  // t1 = in ^ (0x22244488891122224448889112222444889111 222444)
+  for (i = 0; i < 24; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t4);
+
+  // t1 = in ^ (0x22244488891122224448889112222444889111222444 4); digit 4 is (x * 4 + 1) * 4
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, in);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+
+  // t1 = in ^ (0x222444888911222244488891122224448891112224444 889)
+  for (i = 0; i < 5; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  for (i = 0; i < 7; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_9);
+
+  // t1 = in ^ (0x222444888911222244488891122224448891112224444889 111)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t3);
+
+  // t1 = in ^ (0x222444888911222244488891122224448891112224444889111 222444)
+  for (i = 0; i < 24; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t4);
+
+  // t1 = in ^ (0x222444888911222244488891122224448891112224444889111222444 4)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, in);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+
+  // t1 = in ^ (0x2224448889112222444888911222244488911122244448891112224444 889)
+  for (i = 0; i < 5; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  for (i = 0; i < 7; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_9);
+
+  // out = in ^ (0x2224448889112222444888911222244488911122244448891112224444889 111)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(out, t1, t3); // the only write to out; in is still read up to the last digit-4 step
+}
+
+// Inverse Mersenne S-box with e3 = 7: out = in ^ d, where
+// d = (2 ^ 7 - 1) ^ (-1) mod (2 ^ 256 - 1)
+//   = 0xddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76ed
+// d is built top-down from: ddbb76e ddbb76e ddbb76e ddbb76e ddbb76e ddbb76e ddbb76e ddbb76e ddbb76e d
+void GF_exp_invmer_e_3(GF out, const GF in)
+{
+  size_t i;
+  GF t1 = {0,}; // running power of in
+  GF table_6 = {0,}, table_7 = {0,}, table_b = {0,}, table_d = {0,}; // small powers, partly reused below
+
+  // t1 = in ^ 3 (table_d holds in ^ 2 until it is overwritten with in ^ 13)
+  GF_sqr_s(table_d, in);
+  GF_mul_s(t1, table_d, in);
+
+  // table_6 = in ^ 6
+  GF_sqr_s(table_6, t1);
+  // table_7 = in ^ 7
+  GF_mul_s(table_7, table_6, in);
+  // table_b = in ^ 11 = in ^ 7 * in ^ 4
+  GF_sqr_s(table_b, table_d); // table_b = in ^ 4 for this one step
+  GF_mul_s(table_b, table_7, table_b);
+  // table_d = in ^ 13 = in ^ 11 * in ^ 2
+  GF_mul_s(table_d, table_b, table_d);
+
+  // t1 = in ^ 0xdd: four squarings shift the exponent by one hex digit
+  GF_sqr_s(t1, table_d);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ 0xdd b
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ 0xddb b
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ 0xddbb 7
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ 0xddbb7 6
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_6);
+
+  // table_7 = in ^ 0xddbb76 e; digit e is (x * 8 + 7) * 2, and table_7 becomes the 28-bit chunk
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_7);
+  GF_sqr_s(table_7, t1);
+
+  // t1 = in ^ 0xddbb76e ddbb76e; 28 squarings in total, the first one copies table_7 into t1
+  GF_sqr_s(t1, table_7);
+  for (i = 1; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ 0xddbb76eddbb76e ddbb76e
+  for (i = 0; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ 0xddbb76eddbb76eddbb76e ddbb76e
+  for (i = 0; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ 0xddbb76eddbb76eddbb76eddbb76e ddbb76e
+  for (i = 0; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ 0xddbb76eddbb76eddbb76eddbb76eddbb76e ddbb76e
+  for (i = 0; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ 0xddbb76eddbb76eddbb76eddbb76eddbb76eddbb76e ddbb76e
+  for (i = 0; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ 0xddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76e ddbb76e
+  for (i = 0; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ 0xddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76e ddbb76e
+  for (i = 0; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // out = in ^ 0xddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76e d
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(out, t1, table_d); // the only write to out in this function
+}
+
+// Mersenne S-box with e_star = 3: out = in ^ (2 ^ 3 - 1) = in ^ 7
+void GF_exp_mer_e_star(GF out, const GF in)
+{
+  GF acc = {0};
+
+  // acc = in ^ 3: square the input, then multiply the input back in
+  GF_sqr_s(acc, in);
+  GF_mul_s(acc, acc, in);
+
+  // out = in ^ 7: one more square-and-multiply step
+  GF_sqr_s(acc, acc);
+  GF_mul_s(out, acc, in);
+}
+
+void generate_matrices_L_and_U(
+        GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF vector_b,
+        const uint8_t iv[AIM2_IV_SIZE])
+{
+  // Hashes iv, then squeezes one sample per (S-box, row) to fill L and U, and finally vector_b.
+  uint8_t row_bytes[AIM2_NUM_BYTES_FIELD];
+  hash_instance xof;
+  GF sample = {0};
+
+  // absorb iv and finalize; everything below comes from hash_squeeze
+  hash_init(&xof);
+  hash_update(&xof, iv, AIM2_IV_SIZE);
+  hash_final(&xof);
+
+  for (size_t num = 0; num < AIM2_NUM_INPUT_SBOX; num++)
+  {
+    for (size_t row = 0; row < AIM2_NUM_BITS_FIELD; row++)
+    {
+      const size_t diag_word = row / 64;
+      const uint64_t diag_bit = ((uint64_t)1) << (row % 64);
+      const uint64_t keep_high = ((uint64_t)-1) << (row % 64);
+      const uint64_t keep_low = ~keep_high;
+
+      hash_squeeze(&xof, row_bytes, AIM2_NUM_BYTES_FIELD);
+      GF_from_bytes(sample, row_bytes);
+
+      // words before the diagonal word: L is zero, U takes the sample
+      for (size_t w = 0; w < diag_word; w++)
+      {
+        matrix_L[num][row][w] = 0;
+        matrix_U[num][row][w] = sample[w];
+      }
+      // diagonal word: L keeps bits from the diagonal up, U the bits below; diagonal forced to 1
+      matrix_L[num][row][diag_word] = (sample[diag_word] & keep_high) | diag_bit;
+      matrix_U[num][row][diag_word] = (sample[diag_word] & keep_low) | diag_bit;
+      // words after the diagonal word: L takes the sample, U is zero
+      for (size_t w = diag_word + 1; w < AIM2_NUM_WORDS_FIELD; w++)
+      {
+        matrix_L[num][row][w] = sample[w];
+        matrix_U[num][row][w] = 0;
+      }
+    }
+  }
+
+  hash_squeeze(&xof, (uint8_t *)vector_b, AIM2_NUM_BYTES_FIELD);
+  hash_ctx_release(&xof);
+}
+
+void generate_matrix_LU(GF matrix_A[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+                        GF vector_b,
+                        const uint8_t iv[AIM2_IV_SIZE])
+{
+  // Derives L, U and vector_b from iv, then fills one row of matrix_A per row of U.
+  GF lower[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF upper[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  generate_matrices_L_and_U(lower, upper, vector_b, iv);
+
+  for (size_t sbox = 0; sbox < AIM2_NUM_INPUT_SBOX; sbox++)
+  {
+    const GF *lower_rows = (const GF *)lower[sbox];
+    for (size_t row = 0; row < AIM2_NUM_BITS_FIELD; row++)
+    {
+      GF_transposed_matmul(matrix_A[sbox][row], upper[sbox][row], lower_rows);
+    }
+  }
+}
+
+void aim2(uint8_t ct[AIM2_NUM_BYTES_FIELD],
+          const uint8_t pt[AIM2_NUM_BYTES_FIELD],
+          const uint8_t iv[AIM2_IV_SIZE])
+{
+  // AIM2 evaluation: three inverse Mersenne branches on pt, an iv-derived affine
+  // layer on each, a Mersenne S-box on their sum, and a final feed-forward of pt.
+  GF lower[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF upper[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF offset = {0};
+  GF branch[AIM2_NUM_INPUT_SBOX];
+  GF input = {0}, output = {0};
+
+  // load the plaintext and derive the affine layer (L, U, offset) from iv
+  GF_from_bytes(input, pt);
+  generate_matrices_L_and_U(lower, upper, offset, iv);
+
+  // branch 0: constant addition, S-box e1, then U followed by L
+  GF_add(branch[0], input, aim2_constants[0]);
+  GF_exp_invmer_e_1(branch[0], branch[0]);
+  GF_transposed_matmul(branch[0], branch[0], (const GF *)upper[0]);
+  GF_transposed_matmul(branch[0], branch[0], (const GF *)lower[0]);
+
+  // branch 1: constant addition, S-box e2, then U followed by L
+  GF_add(branch[1], input, aim2_constants[1]);
+  GF_exp_invmer_e_2(branch[1], branch[1]);
+  GF_transposed_matmul(branch[1], branch[1], (const GF *)upper[1]);
+  GF_transposed_matmul(branch[1], branch[1], (const GF *)lower[1]);
+
+  // branch 2: constant addition, S-box e3, then U followed by L
+  GF_add(branch[2], input, aim2_constants[2]);
+  GF_exp_invmer_e_3(branch[2], branch[2]);
+  GF_transposed_matmul(branch[2], branch[2], (const GF *)upper[2]);
+  GF_transposed_matmul(branch[2], branch[2], (const GF *)lower[2]);
+
+  // sum the three branches together with the offset vector into branch[0]
+  GF_add(branch[0], branch[0], branch[1]);
+  GF_add(branch[2], branch[2], offset);
+  GF_add(branch[0], branch[0], branch[2]);
+
+  // non-linear component: Mersenne S-box on the sum
+  GF_exp_mer_e_star(branch[0], branch[0]);
+
+  // linear component: feed-forward of the plaintext
+  GF_add(output, branch[0], input);
+
+  GF_to_bytes(ct, output);
+}
+
+void aim2_sbox_outputs(GF sbox_outputs[AIM2_NUM_INPUT_SBOX], const GF pt)
+{
+  // constant addition for the three branches handled explicitly below
+  for (size_t k = 0; k < 3; k++)
+  {
+    GF_add(sbox_outputs[k], pt, aim2_constants[k]);
+  }
+  // inverse Mersenne S-boxes e1, e2, e3, each applied in place
+  GF_exp_invmer_e_1(sbox_outputs[0], sbox_outputs[0]);
+  GF_exp_invmer_e_2(sbox_outputs[1], sbox_outputs[1]);
+  GF_exp_invmer_e_3(sbox_outputs[2], sbox_outputs[2]);
+}
diff --git a/crypto_sign/aimer256f/m4stack/aim2.h b/crypto_sign/aimer256f/m4stack/aim2.h
new file mode 100644
index 00000000..bdc50429
--- /dev/null
+++ b/crypto_sign/aimer256f/m4stack/aim2.h
@@ -0,0 +1,306 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef AIM2_H
+#define AIM2_H
+
+#include "field.h"
+#include "params.h"
+#include <stdint.h>
+
+static const GF aim2_constants[AIM2_NUM_INPUT_SBOX] = // one additive constant per input S-box
+{
+  {0x24a19947b3916cf7,0xba7c9045f12c7f99,0xb8e1afed6a267e96,0x2ffd72dbd01adfb7}, // added to pt before GF_exp_invmer_e_1
+  {0x0d95748f728eb658,0xa458fea3f4933d7e,0x636920d871574e69,0x0801f2e2858efc16}, // added to pt before GF_exp_invmer_e_2
+  {0xc5d1b023286085f0,0x9c30d5392af26013,0x7b54a41dc25a59b5,0x718bcd5882154aee}  // added to pt before GF_exp_invmer_e_3
+};
+
+static const GF aim2_e2_power_matrix[AIM2_NUM_BITS_FIELD] =
+{
+  {0x0000000000000001,0x0000000000000000,0x0000000000000000,0x0000000000000000},
+  {0x13269d7dcfc555c3,0x6fe13874c42fedfb,0xc69f003d9d5abb9c,0x05636fd04ebf7feb},
+  {0x7a273dd9fcec7e15,0x42cd3eb54144ea68,0x5a88aaa3ebaacdff,0x527284e39fae2053},
+  {0x56bb9ab537abf542,0x768c3d772850c862,0x0160d91d288fd0e0,0x342e111e0a022022},
+  {0xcdb998ce4b3eee2e,0x78984c4dc99c90aa,0x2bb89f84c00275b6,0x75c6a0cc065fd4ac},
+  {0x74b2cd2360cb32af,0xbde82f7cf42dd1bf,0x7ceed82d54d965c4,0xf4e9f207aa17f2e9},
+  {0x995d5aab614ac6c0,0x1563800b79242f35,0x1d940184c4509090,0xe6558fd024716b90},
+  {0x8d0b793b4375cc8a,0xfcf792217776a3ee,0x5da44008043b7450,0xc77adf87407cf838},
+  {0x00451596f23df45e,0xd8bcbc0d7ae8534f,0x02c26abe3748db45,0xb37e029dc51a4b41},
+  {0x177dbfce6cbc8c0b,0x62cdd72c8cbd2d2a,0x568802d992bd7a2c,0xd0082d2193b6e383},
+  {0x221e6872863f45c6,0xbe5a9bce6c00df76,0x98c076efe1cfcc67,0xa75bdc7ab5c142a9},
+  {0x088d4e8e27e0b74d,0x71046740fe7e6c5a,0x20123cab6052c1d6,0xa7135d055351c99b},
+  {0x46176449341c7657,0x2a7936011468475e,0xc347e166dca96014,0xd79326785eee3555},
+  {0xc6b77e5a8b6dcae9,0x6dc641a8e07c54d4,0x37055c3ed77341a8,0xd75eaedd0ec6f1d1},
+  {0x5240b9b6f3433443,0x7b7d965745400c05,0x4542be5aec50ec53,0x13e6ac8f2aac12a2},
+  {0x66c30b9da469d401,0xcd5dbf02dc359172,0xf16b3e62f8a57e1d,0x362c2bc9345b97ed},
+  {0xb2a65d5f7da755e8,0x11df10d6ddd9eb84,0x433468d75cb64470,0xb4a6ffd454c82b2f},
+  {0x1c87142145f7c112,0xde2854fa4939dc0b,0x10a503b51b7c7a19,0x174f91701431e1b3},
+  {0x60d8fb32b890cec6,0x27d95c11548f693c,0x30fce7ce95e950b3,0x210559008a309578},
+  {0x5de49c870dd8fb60,0x1f480e246bb2c961,0xdc5efcb1f4ee90ae,0x165c3f5b62136c5e},
+  {0xc17b4bbe4b5780a8,0x690f1102a6decffe,0xa26e146710d9cd7d,0xc7f278fb3f02a99d},
+  {0x4fe7916de7e17f1c,0xe9e59586ac0a7185,0x092b72935bc23437,0xa306568e985edbfa},
+  {0xc05330df507b35c8,0x944475d0eb5c89f7,0x34a3653b083969a5,0x97e431e62e205633},
+  {0x19fe581ef3e9a896,0x720ab1851376eff0,0xda5ca1af445dea40,0xe3899fd1cdc93f2f},
+  {0x7a18d867d11567d6,0x14e706af946787cb,0x2ececbd0e726236a,0x66a864e0c387e806},
+  {0x0a0a9e1dc2c9d30d,0xa1bd85358585db7a,0x78f90bb68d83e25e,0x2275165a7e496039},
+  {0x23f2e1a2057c9892,0xb7f503272b51fa8f,0x0ecf56cbb57a6021,0x77f77f889ecb3e74},
+  {0x237633913a45a827,0x3a2c98b4d38d139b,0xbc1dfd5ddab4bb19,0xf2bcbdc105b017fd},
+  {0x9a53645fca466120,0x07335188ef82289a,0x9cdd8f1434ddc4c7,0x25afc28ddf0c0ea5},
+  {0x0166bda62c3c97ac,0x4821343275a35741,0xa4a1f8ef377f5177,0x3008d4b041fc0802},
+  {0xed498663eb9138f0,0xb16289e1ea93949b,0xa2476ced73badf6e,0xb384ce50cdee1d75},
+  {0x25430e5e2ea409d8,0xf8909d2164becc11,0x77663884798e456b,0xe11b963640c6a7da},
+  {0x2a5ce7930313e789,0x01a1b717dd5e72f3,0x674b4810dda58bf3,0xb348d6cffeee2602},
+  {0xe4871c9932b98648,0x90432c7798b61577,0xf803346f3989e611,0x176c5f43490e3127},
+  {0x28b7ff52a8d039f5,0x2549d26014bcb371,0x7705b13fd068e5f0,0x22f60aec7063b440},
+  {0xa90087e5804b094e,0x17b587e9f7b1334c,0x7e9128a8fd49f502,0x10a15de60dcc1259},
+  {0x676fc8232449f7f5,0xa45eba0b86ee4f8d,0x48d0f0583763ed04,0x9430177369350009},
+  {0x8bb187487d0ca392,0x8b34c408cf71198e,0x4c5b9033c740f6cb,0x15165d415ea592e5},
+  {0xe25b8fc9315d8b10,0x6f067bcaaa5db46f,0xc0d574e6df163bcb,0x76d62e45eeb26cb3},
+  {0xc7bb4eaa81af7e21,0xc0c25e2c4da66ca9,0x20a5b7a6ef682683,0xe0c40a42bed8c878},
+  {0x340b283a1f67eb72,0x94c68ac57747d7b1,0xaab540d8883c7e78,0x53ffb196e81fbce0},
+  {0x03d1fe920cc5c8b6,0x2d058e7c02de80d0,0x349140f34518313d,0x52d8d34dce452897},
+  {0x3daf5481e615a4ec,0x1d21ddb2b19865a7,0x28572f8e3caef8c4,0x94f0069367dd5a9c},
+  {0xf97efd31544a2432,0x79cc100bcd1c95c5,0x630dd7dbdcda2efa,0xb0c94889efaeabe6},
+  {0x1855a973cd69d2ac,0xa249d1e68760fda5,0x9bd185166791f0b0,0x73aad654a16f87d5},
+  {0xb64f4c4f69887572,0x0dd0ddfafeaec759,0x9a2b2e01a2dfdf21,0x23e6842e19958e74},
+  {0x47126f2ed9d35243,0x2dd26a5dc07d8ab7,0x5f7a0864bae59fef,0x84bd4c2d7eef707e},
+  {0xc2b75aa6809fde33,0x4e05ff4138a1458a,0x4283e814ca9b30b5,0x46b1bcf0f62d4313},
+  {0x83f0c7c594f6cf9a,0xdb8a4b8e5dfe204e,0x44a803aecd550290,0x96cc8907871fc11e},
+  {0x7ca33f7d36e71a53,0x609b8f2296791418,0xd9e9118ba8ddf5e9,0x813002deae63def5},
+  {0x5e3805abc5d66c85,0xe95aac205db8a39d,0xfad61d269550a976,0xc0c3e22037926992},
+  {0xf3ba3f8e2a564d34,0xfd74426f936299c0,0x23bb54e8112b82e3,0xc5afe8e8365a6000},
+  {0xb733edd6855182ef,0x5ecb1ae3728f48e8,0x3b8b1ce5bf96e304,0xf3aba2a7bfac4c59},
+  {0x78f2ea71794eaef2,0x59f25ef7fe359b84,0xacfd3e59513654c8,0xd1e24fda7d0c3936},
+  {0x288da25da8b17fb3,0xbe107e7feb777a7a,0x166db15573baae6c,0xb5ccbf5cfe3e5135},
+  {0x4637849d0285089d,0x4f671ebc0437c2ce,0x188565bc785f8268,0x712dec2cd1ba005e},
+  {0xa25a6b6a471a00b1,0x6e1a6a380bb57611,0x3ef50b155eddd23d,0xd3788fef109d4e3b},
+  {0x4f403f37eba563c1,0x76a201773cddd009,0x58fba6bec18e06a6,0x11a19d4cbf2a6331},
+  {0xe3e6bbb73066a175,0x9748c56fec4b9fa1,0x406aae141855018c,0xa1410c0e735df446},
+  {0x5e569e71e70eb719,0xa673071887dd4687,0x07055d8d0a23d785,0x74d498384aee1190},
+  {0xa0e8a89b6fb6984d,0x908716f3ce5edf66,0x0a2b9e842b73e729,0xa1b9171e0b83204b},
+  {0xbe7532657aadaa20,0x1b66940116e06582,0x7385fd540009963d,0x847a9b51570e7ff8},
+  {0xe9395fd61662cbe6,0xb3a286d4b91d1353,0x455b0689d3ff2d83,0xd56078fc7681e787},
+  {0x8b470957a3441b8a,0x7df431ebbf7e447b,0x0e0f4fa397edd83c,0xd793865c1388620e},
+  {0x7b29927808bfa739,0x96e65ce20d51654b,0xaa8fcec0d3c045c3,0xe5f31c0e239b4fea},
+  {0x5525c2a74e77bf9e,0x88cf3be85881afff,0x7c81312941d70c3c,0x23d8a44e23a9c737},
+  {0xb869097f96d421f1,0xfc5054b0f253daf5,0x1c241e84b424d6aa,0x32b29f522eb351e7},
+  {0x6a466e2ed7c0ad0b,0x5590c446ea6f583b,0x56d2464d3ee4d099,0x068910c7eb32dd95},
+  {0x71139d1bc66bb641,0xb3a1027da065feea,0xe04294fcf6174557,0x81dae384498adb46},
+  {0xf43ed00c527a209a,0xa5754026d1f22c89,0xc78a8d365f196923,0xf5154817fc84f220},
+  {0xae764c7fe7341054,0xffc86134dc4d880f,0x1b6a1e1530d66862,0x250c95737b7b8284},
+  {0xbfee6b3c1e46c128,0xa78dc08ba0e7251d,0x3a95f11bcef9d4c7,0x34f2831709c6a420},
+  {0xe3a3c1aa9e2407d8,0x4c1a200af1077851,0x8965a32110544d77,0x6354a05036f3f5a7},
+  {0xbd108a58fc17d8a6,0x61b0351824a54794,0x499e7fd9fdd626df,0x850217a6be595511},
+  {0x53f2510fb68b5c61,0x5b122cfd2501b4ba,0x7fc88679758e8262,0x233472936a675422},
+  {0x11965eaffc401c95,0x0af31e003ba1fb12,0x2facfdd6611b7f8b,0xd67eaae060c88abf},
+  {0x6fa46680edff5f3f,0x454b6266e25e87cb,0x9addf096cb1df0af,0xa6de67c1da83476b},
+  {0xbf6f0cb8a600033a,0xf520f28cc3846c4b,0x008f972a2108bd6a,0x55bbe0da272b6cb0},
+  {0x9bf38905d29c13e7,0xc50cd62db6acc3da,0xbb9b791e0d47ac11,0xd54b025508c245d8},
+  {0x3a2547ab532ec9ff,0x79495ddf670c8bc8,0xdf4ed2dcee44e1bc,0xc2e52f1fc1f7d4d5},
+  {0x4800ee52ee97ecda,0xc9d9b772550e380c,0x98506ba8ea5ec019,0x21ffafa8b46c668f},
+  {0x3464a9138085b307,0xf67a192be113e9cc,0xfdd61b66e0e162dc,0xd612aba17d397d2c},
+  {0x16207c45e571aabf,0xf2583066040bf4f7,0x4bc24730dc4d62f5,0x608b3d1e61a60b2f},
+  {0xc2a6d2c707faaab1,0xc9cfa575f99f891a,0x61ea461507f40f96,0x67104299d7331a82},
+  {0xaf1c8fcbed1f1699,0x985767a5dbb95b90,0xd6ae3b3279c96a14,0x275ea501029834e7},
+  {0x4e19e32114de1e9c,0x165f71d116e0afb9,0xe968cbf378c1a2f7,0x912182eb2d02ef2d},
+  {0x6e4e3c81caceef19,0x85f15b2e37fe2cbe,0x8ae88fcc89bb8687,0xe50b4d7659484c7a},
+  {0x80353d06c9930d5c,0x723d1f993acaffad,0x89e273ac935dc5e2,0x51356090a9eecbf9},
+  {0xc3bd743bf118e69e,0x78fe213d42306293,0x90638ea842ff3668,0xb0addcda3683625d},
+  {0xe26008c6b83cc264,0x74bbbd5777680be8,0xa8892126f9cc485a,0x54899977a5cc34a1},
+  {0xd19b2baf7fa0c771,0x39d199b5dfd41569,0x7c3c66294bc7b31d,0x81bb86cd53109ac5},
+  {0xe4a790156b11f26a,0xb496c49018830c99,0xf19e574456b9d549,0x867aa70b9bbd4fd0},
+  {0xb8ce927c2afbcba9,0x3ae3f9d11d478318,0xebdecea6a113ffd6,0x071def720f45ca33},
+  {0xa18c4347c3dba5da,0xc231d50db69b59f6,0x784caea3c01900f9,0x21b179202d1177e0},
+  {0x48d839b0e148b37a,0x119910fe9c00220e,0xf6959f7654a471b7,0x138df428ee1ab05e},
+  {0x2378b25ea2d743c2,0x52a0660820b6ff4b,0xb20d6835419796a6,0x77d41062fb9a7654},
+  {0x1e63666141c834dd,0x534d884045bcdedf,0x07b52ebe10206e92,0x67cb1a5c5d2017bc},
+  {0xbd489efa4249447b,0x81b1f830bdd020d0,0xb8db0042e390a71b,0x90b877cf8d8200e6},
+  {0xd91a2f7fe76f986d,0x2c6fcd64257849b8,0xcec2c4be6ecbe77b,0x5031f045518f6b98},
+  {0x3cc9f99a10cba6b9,0x7df264605ea09f19,0xc6099006fa2f35a0,0xf31aa1999c65f2ed},
+  {0x7322250ccd66f2d2,0xa8cf62816a34838c,0xf7bd30878c6d359b,0x450a14aed0d49014},
+  {0xf753996b7d7c1d54,0x45e2b366fb683eae,0xcef4cef44af75b4a,0xd1e647d51db49a04},
+  {0x257099ec419b94a6,0xd4a8a9f3335fcd10,0xa286788285415010,0x023c9feb9c1e9901},
+  {0x229d6fd7eed1531c,0x04cefb6c19ff0062,0x9130be016eed6e29,0xa1a04435eb4cd39d},
+  {0xefbd279ed0b045c6,0xe8ec58f13b1a927d,0xbabddf060b172c30,0xa5fd98adc4c9d7f8},
+  {0x0f859d44ce18448a,0x07af518284a5a680,0xff7565589bc19136,0x72e50c2e9eaa580a},
+  {0x6470f3d6724b5dd7,0x8b0ebb24be876d22,0xfb604e14fd34a2cc,0x213fc1d31fbb7996},
+  {0x50e1d4f6f24a3685,0x69348d20cb64f7b6,0xa13da095f7678267,0xb63a6ac7a66c3284},
+  {0xb0edaccd9a8698dc,0x73d7ca79b1672272,0xaed4ffd76475e235,0xf36b5b0cbbb22a1b},
+  {0x24acd40ab0b10aba,0xafb39e3ea0656a92,0xcfe743611a51fa5a,0xb4f8251f0f0e0d41},
+  {0xe8036bb95086dfdf,0x3d5d0332c379fb16,0x3029edc150437ed5,0xf561ce7ace559b0f},
+  {0x01047fd87eb154ca,0xce04d75cd86f0d9d,0x33f6d9a762e84d0b,0x52f77f2619632746},
+  {0x3fdf7a3e2584aaa7,0xafdff63009b07776,0x24496f671e85ade5,0x35b2e80c0abfdee5},
+  {0x4bb3e9185acc78b3,0xe5634557a7f532a4,0xa6a979853e645782,0x97e9a6c3f5ed6068},
+  {0x41685f9547d8c651,0x6d4bade8828daeda,0xabe0dcd781a5b523,0x3528952d2a770f19},
+  {0xe4e43b26b587ea84,0xf0f3f420178def6d,0xd48cb1f978a8bb2e,0x25de266fb8567a86},
+  {0x2906276141285c5c,0x045688d8cac52240,0xa1a62b2fa2474687,0x917244641b004f87},
+  {0x73897ebb86a40eb0,0x0df1bc6722ab333e,0xb7815fffa0c79792,0x322111adf2c83d06},
+  {0x4dd181aa27fc54fa,0x47b557267a691a35,0x089b8ed1303c2515,0xe60b63596c40b943},
+  {0xe574bb3f5e1d3fe5,0x7e5e1dd1aaea6c56,0x443b9d58176d285b,0xa2c066cf80f1c62d},
+  {0x9df2b1fe93b4cb69,0x5dc5dcbd7bcd4304,0x4cac45f5c51659e4,0x9039bc7472f02b80},
+  {0x81c7d14b2ff6f3d6,0x76b7422e6f000e01,0x23e23fa520ed280b,0x50a4f9ded0d07978},
+  {0x154548397391fa38,0xb1ec123aeb772341,0x22f40fd3abeee812,0x0342edcc39a77162},
+  {0xa7ef812f5e9d9ba6,0x65de86bcc8071b0d,0x4b9bbe60fe0a1fad,0xf4f8322efc5e2f45},
+  {0x21fbeab48a7c1136,0x42736db042991d3e,0xf78c442fd2ed07a6,0x36228053a90abb56},
+  {0x6ebfcec360d88021,0x7deeafd7cae1b159,0x6f32c272246a4999,0xb2f984f6c2b488dd},
+  {0x76beda6b3d15abc7,0x1bc04ff70ef9d0a9,0x75ec5c46c4854ec0,0x77bd25a817826a51},
+  {0xf79c8d9bd7aaf4f0,0xae5add9fd1454f93,0xd9f264167923d698,0x273bf89c8b33a9ec},
+  {0x20ba5517532e42a7,0xd9991aaa0bcb040f,0x81ec69b31aea8c89,0x823ee1a07f410f90},
+  {0x3e10957041e49998,0x9746fdf5f3deb53d,0xbae6be6d5a7923dd,0xd4aa255a7e60b5f8},
+  {0x453e76f50e50f914,0xba084020e530dd32,0x90e9982f02a0b2e3,0xc1bf6d0c93565fd4},
+  {0xbb44043183434a96,0xb6839987e4d3fbf5,0x780e11ff154ba921,0x46deb765191c6fad},
+  {0xf254860f62ddca11,0x2b40c2147fcc1618,0x9b9df4f2213a87f5,0xf5d9f1982bf72085},
+  {0x9ec887ef1dac7ea8,0xf9b9f41cd1a90cab,0x5106c66727088891,0xa079314a8a7aa0cc},
+  {0xbaca971f705d6820,0x32cf35c216d31b74,0xcda1b48f6a782676,0x42dd0c61745b57af},
+  {0x774f50e70700fa3c,0xfe706a77d17875e0,0x50acd4b9e4f085bc,0xa70b2f3a3373b5cf},
+  {0xa3d467e6532333ed,0x9143409c675fea0a,0x186d4c8b7de757db,0x006e698e91bc1742},
+  {0x042690d62241c815,0xf8a04fddc8420797,0x9ff8cf1394eaada4,0x921b7749e0687334},
+  {0x3ba03d72cd709236,0x12f95d885e21e3d3,0xba18560bb5d4d50a,0xa3607627494476ab},
+  {0xade31f9ca5377f89,0x635510178eec1003,0x3dca939c351bf98d,0x339c87aee1cf78dd},
+  {0xe45a6287cc1287d4,0x7cf6c8c56ed07634,0xadf6eda911dd0200,0x87211a5d3722f0f6},
+  {0x7b07d341c0de902d,0x69838993df5c9429,0xb8921642be862244,0x555819247b006cca},
+  {0x4b8ebd3e261a1065,0xec8c767eb1653ceb,0x482e17c892519544,0xb61af0cc04b533a5},
+  {0x4fb9d38c4e2f7113,0x50030b8523699320,0x5716f5c60cedd7d8,0x0673e662c18aadef},
+  {0x641233031f77a5fb,0x1932c76990a0d465,0xb79ab4fbf32c92e8,0xc0a7370dd0467550},
+  {0xdb899bf50910763a,0xf026477f262eb097,0x76b70a1b2163a0d4,0x93a2873f23165f6e},
+  {0xba2a66c196ce2eb8,0x19383fd3ffab287b,0xaed33c3223646076,0x1274559077e98698},
+  {0x035a94843c44ec7a,0x6de99478a3c009e3,0x8a7ecba43ae87e6e,0x458c9cbfca30c71a},
+  {0xe3695ac8419682c5,0xaeee4d4d0392ec66,0xd99792a67250c187,0x91e0f202f4c924b3},
+  {0x9c784cfbf5192c27,0xc113eee0e80c2eae,0x3f7b5a6101ce5f5e,0x842e2d646ecd9d6e},
+  {0x957028a6befc0d73,0xcbb8df5afe2e23c3,0xc00f5c490d8dafb4,0x67d7ee99cddb8452},
+  {0xac8c3e869f704d2d,0xc928ad50bd4faf6e,0x114a0001a078d1a0,0x8375ad6cc681586b},
+  {0x59a53e3fb149cca9,0x69cf3f7ab419768e,0x79d945a746a788b8,0x979b7e9387ae017f},
+  {0x41f7712568f43935,0xf17647a51bb6cff9,0x593eb0f68e21db19,0x77bf0442e77fdbdd},
+  {0xd430085cbe62c90e,0x445d0af933a0c884,0x92f5c9b29a5de145,0x6778e9aad04a6c94},
+  {0x4914b4bd446c5d64,0x21b19c795fec736d,0x72cf9cdf7fa1c0db,0xf67226412058b23c},
+  {0x1e7346a99e1464a3,0xacb82da3ac217e94,0x4d1f4486473e6c18,0x23274da141c63725},
+  {0xf58a0445c9b4903b,0x4f196615648056a4,0xeaf0d8fc78e51fe3,0xc71e969830bec69e},
+  {0xcec3175fd17dee42,0x6fa60eda34cf3b0f,0x016ff6fe365a227b,0x148ed225daf52abf},
+  {0x5eb5954a6c060dcd,0x67ed2e3411fbde9d,0xdaddfd054f15c5a4,0x80e12ae0d1591ef3},
+  {0xc9c76eda44553b71,0x7c4675538cbdcd1e,0xa2128f16928c1efd,0xc13aaef8cfacc959},
+  {0x525318d3ea7544bd,0x6f3e0f4d85ce7b2a,0x397102e6892ab449,0xd028319bc9ef0676},
+  {0xc55bc06690da6f96,0xea6a73d17ce2969a,0xfa21bd37fa658e1e,0x32d421c8c9a9d437},
+  {0x4f53f0e462a9f4f0,0x18c65d2ba362d43b,0x53b8871400599e70,0xf291e9ac535cfe6e},
+  {0x2a420a66918ee17a,0x4dae04d613a5a05f,0xc12c868048f09ef7,0x900c4ca4fb306ac2},
+  {0x357f0638ee05acbc,0x389db47cc78620f7,0x3c531ff5b9fff02b,0x902c96f5fb2c18f9},
+  {0x57abb6151ae9319f,0x917bd98253c43360,0x36b4e4e17d9c5182,0xc2a4751705897c3b},
+  {0x91ee0ad214084c1b,0xfe17b657a9ea9054,0x7b304880e7a3efb6,0xd497c8cea46cf443},
+  {0xb97e1c63dcc46441,0x22898ec1ecb0f186,0x40dd2915e34e92ae,0x83e63e8886604034},
+  {0xf159f13af4545efc,0x6b0312cbfce549f7,0x1632f9e6624b3c5e,0xc387a21c7c20a6d6},
+  {0xe81b4468c49ba628,0x9962cd4b58abb1e7,0xda2145ce9fe59f2e,0x6021807944cfc8e1},
+  {0x9e98852b17310f23,0x3cbe1c8bceb45120,0x0e165b29c57ec0eb,0x305bf854fb1aea8d},
+  {0x1c3dbdac479d54f7,0x4cda9c1c1bbb1a19,0x7d330c571f17bc88,0x826548b30e26b7d7},
+  {0x446afa2ca1809535,0x8d3c9693ee673350,0x7893a83f58de1ffc,0xb19954f7647195ff},
+  {0x21b77a7b577e945a,0x0c3e91d3f1f89e09,0xdd7b8e8a59fae93c,0x6435f276c4582559},
+  {0x4d0e6426007bc199,0x5c13184bcf7dd24a,0x26f1f87322e213d0,0x97243e676a3eb387},
+  {0x14cbfff5b787dedd,0x355794e80f8cd847,0xf2c951e3c0d77a3d,0xe558cf2f7b5f2991},
+  {0xf87b23ea7452e43c,0x92521695b010b548,0xb7af363918a98cf1,0x473e6304c6f3f9cf},
+  {0xe86f5e030902695a,0x884c59759075978e,0x862a4f44f20c857d,0x2348092c2d62a7ed},
+  {0xbebdf27580f800b3,0x4c82348a99cfaf36,0x7fec6e2fb343c70a,0xd0a2b036a8d95707},
+  {0x59ef03fcb5a57f39,0xfb04bb079290dd73,0x30e0751c7c8e4263,0x4078bcf952cf1a62},
+  {0xa19ffba37095d58a,0x9d164dabb30dff6a,0x16de88d2bac7642a,0x8232b5dca704cbcd},
+  {0x329dfc2b2636492c,0xf0397ad762a31307,0x78adfe730ebe751e,0x5783b8d9d2f05dfd},
+  {0xf2d6e8a736f23aa1,0xe2102f9bd2267093,0xdf2af690beecc500,0x11398c83a817f593},
+  {0xd46565aaafab2385,0xddff3f9a0b99928d,0x5eb2072a49c5a5ab,0x53a03f6a8eb6a094},
+  {0x57fb689ec7092868,0xc2040eb173de1a44,0x810031fb7b19e630,0x53960a9b3b1ef568},
+  {0x40007920454fbf71,0xac025a589e98d1ef,0x9e256036a7fbd143,0xc13cf073bd649440},
+  {0xd06cd6829f0fea3d,0x2a51b1d71d1ac07b,0x3546a5854571bbc6,0xc30b6bf46c0b42fe},
+  {0x62488646a13da231,0xe28973393fb6f682,0x9ed13dc9f5432f8f,0x31b84f2be241c94e},
+  {0x9bb19ea5428d66ae,0xe0080b8616f3babc,0x9610055711788ae5,0x7652d184a46c90fe},
+  {0x112c63f926d9850e,0xe5905a268850e663,0xb9fd3996e6d72608,0xa7aa33543146d58a},
+  {0x77de728df392f575,0x637633946129f8e6,0x72a867e08e3bfce6,0x754f7149e15a365b},
+  {0x3511c4139b98679f,0x56a8a361da8cbe81,0x2a34d15423a9eb45,0x82ae1da57cd32e57},
+  {0xcb3ecb886171f719,0xfcbf82d884e8e020,0xc6d2502bd1e6f6cf,0x80bb7b1db5c2a777},
+  {0x81b8745892f03d2d,0xbc5f38b14116148b,0x4b6d0194055b86d8,0x241dbd17e3eb4ba9},
+  {0x5bbc585152fcd142,0x930f31c230a2050e,0xabf51e10a3e969e5,0x72a0a1c90ead638e},
+  {0xcadb18ff93f7f93e,0x1b8e009b5719bf82,0x743c0ab2c8bc284a,0x7144a02ff1130223},
+  {0x41b95e62522de019,0xcd3465a01c9b93fb,0x236600ff15e70ef3,0x3658cd0c29ea6f20},
+  {0xb9c59bf0b27dc282,0x47955c29304112de,0x3f16c72af19bcb3f,0xa0e568c9c5397d69},
+  {0x9251cf7a209add18,0x8e3a95a336fe4170,0xf28c14a751527126,0xb3d3a9a208590971},
+  {0x5b129f35a37c28ff,0xe3f8ba25b41817b9,0x200b734d2501265f,0x52344985724cceca},
+  {0xa8e27fd1e60dffab,0xa8ea4523b64f5aa4,0xa475b8437f8165a1,0xd644c1691c3c7548},
+  {0x5ddae2f669e64957,0x1fcef31f0b9af756,0x3e6da61c7980074e,0x206f828242ab6764},
+  {0x33144ea9f76bb631,0x9f36e03e21fa3065,0xfe08e97dc86bceb2,0x640b723c98cd7479},
+  {0x1636152634146114,0xc18c0793a80805cd,0x2b106edd3834043c,0x4191bf5c7fbacdf8},
+  {0x429dddfd03ef7bc4,0x4db9b9d6da197cd3,0xe74baaea7f22abc0,0x4364ff1e20f72e64},
+  {0xca8a9a678e94da68,0x6535f14dbed15563,0x98f34f0a20bd3f3c,0xd12c84164701e27f},
+  {0xc02c8d4c379b7ce5,0x7069499c81e1f16e,0x9bf97727b1a05c04,0xf27fa10bb0a78610},
+  {0x4cf536f0cf11a349,0xbd9dfa2a6eb41391,0x565f1d6e23bbbc0d,0xc76bbb697c18cf7f},
+  {0xa17601bde8ac478c,0x8db87c51403e365d,0x4088a87a96d9c622,0x31f82a7918dd0d06},
+  {0x29ee14687120f04e,0xfea2e736c3636d5c,0x7f8c89823855588a,0xf0da86215a008e8c},
+  {0xdd645ec1d816c223,0x0aa7edbc5ba5d0cf,0xfced1c8e126396e5,0x201b07bc6f65eddb},
+  {0xfb25e20cd48f4855,0xa8b3d1435e85371a,0x3ee9acb3f939329e,0xd075efbe502f25a4},
+  {0x0541c9b35049c704,0x94986dc9cd668f39,0x17f4cfb2726cd68b,0x508c14a670636ed4},
+  {0xa2b783ac55d68039,0xc130ab2d841d773e,0xd6d29b14f588465d,0xb790ad979cce43f8},
+  {0x4f8ce0df03c43b98,0xbbda15818c06d7a2,0x380dd95f0f042fdb,0x05f429bccfb597f3},
+  {0xc742e63ad5c5f5e6,0xcbcb225fbbbe33e2,0xa8edf59089d52ced,0xa0e788a338b45f4d},
+  {0x20e95da4bdb0c82f,0x3e63b532cc85e2d9,0x163e3d2b90d4ddaf,0xc71593e07530219a},
+  {0x7992357ab8d37b59,0x4aea96f315f3c064,0x1ba04f945b33146b,0xf65bed5593247ff4},
+  {0x2d4ad59bdce5563d,0x3a24253d449dc88d,0x41c7ffbd062c28f4,0x42734ae219aa9361},
+  {0x644204f2ea9b71e3,0xe551983ade3b5122,0x1bc727382db55ea1,0xe276d03e4bd6fc9a},
+  {0xfb20c1e51a924e81,0x2f795f1d4507decd,0x154de4d0aca02046,0x72ddcd99451381dd},
+  {0xc09ac8020e255c2b,0xa4eff29a2c29d3f3,0x7977c4f4c2f24381,0x349ff7a6efa4d791},
+  {0xea5d2cd9592cb4e6,0xf63dcd3ff0c8104a,0x66d7254c1252ca0d,0x822791068962c667},
+  {0x7b9c477dde2ad4a4,0xd3460c638eb797b0,0x1889eaef7acad771,0xb23db19bd8554e11},
+  {0x6f1c469240cd647f,0x31825907e279b274,0xb97cebbf2c37c29e,0x74ce50e87690b22e},
+  {0xfb92d64637ac0508,0x97999c37b92d0720,0xa23a9e76c1578849,0x66aa9c79979e14fb},
+  {0xcf78e912e65a8877,0xb7dcb878bdeec090,0xe678ac56695a99fc,0x0338870b34c11cee},
+  {0x5529c228e771c374,0xd8ab910e6e0a23f9,0xcd86f7b11bf07839,0xe3358c0867358f64},
+  {0x7c0e69e5db7dc1c3,0x355a9bbca9523a64,0x86985b53d32a3f4b,0xc715ea89b184099b},
+  {0xac499c49b8a4cdc4,0x22485e1df13ea826,0xf91367c2ad8807da,0x863b3b9193879ebd},
+  {0x8086427544d93f9b,0xf378d24905271a4e,0x8a2211f2e881884b,0x27f11aae6fbdeb19},
+  {0xd4d702e312991728,0xd57d86c18df5deb9,0x68b550520aac07f8,0x6163e0c25242d715},
+  {0x0484539b5bd55737,0x69b34b6b4664d575,0xcafeeeea78048b31,0x25a0aca017ec768d},
+  {0x955f03fc32b86250,0xb3ed04233eabbafd,0xb4da5d10fb30568d,0xa1d5c520656d8e7f},
+  {0x18e6772ac0c7b0b5,0x1e8c41bfa134bd72,0x36b1b28f157526b2,0xf5ac9222151e43e1},
+  {0xb500af50c3647566,0x181d28f85aca7575,0x9a16455dfd6341a1,0xc6d058b2c1e37c22},
+  {0x01b46ff0be3c6ef8,0x7f5abf4a7e4a72fe,0xe18780f7372db81b,0x91d1172dbd7c1d3b},
+  {0x62e68a7598567ebd,0x4654b8ed6f377911,0x051bf02a5685ca63,0xd08b010696df1fa9},
+  {0x656ce860674f0d36,0x8bcbf7bc1ef730bf,0x00a0260df392d280,0x33930145fd64eecf},
+  {0x17743293297fc288,0x5b59ca56522bb36a,0xe58ef14098fd4053,0x7444ed68eb16e657},
+  {0x31beae245608121f,0xea349f5c00e7cc25,0xf076aacf6db8c528,0x13c58f0b1e99ac1a},
+  {0x910f9e30c8455d7b,0xc1ebc494beb98220,0x201a3557ec66e851,0x610dd21bbd2f6b9f},
+  {0x317d8fa79aa99e03,0x7b670f771c4590dd,0x77052e1a54ac4638,0x17309eb8c690df96},
+  {0xddae9fdfd80030d4,0x84daf3404eae25e8,0xe93997a2e172c485,0x51f2159ecb7b5e41},
+  {0x9f02a3e12da8bc2c,0x1c746f4b943dc8e5,0xb31951aeeaac4e5e,0x0128a606643b4341},
+  {0xebd158803af98ce2,0x08e82db8ead7c10b,0xba172e80caa61667,0xf61ff900e1918b8a},
+  {0x8c3c570f9ffae2bc,0xff0827921f27e4f5,0x6256d4a0913919b5,0xc1f4fcc60f17957e},
+  {0x648ade6556f9d114,0xf2e85e1746058ffe,0xc9605989ede623cf,0xf3d09098541725a9},
+  {0xc57b49460d911255,0xc0767005f4affb44,0x486c21436602612a,0x87617ddb2a9643c0},
+  {0xc2038cd71c6d3ead,0x8fe1e58a5096a181,0x51cde6590d0f6b27,0xf59bf938475aa39a},
+  {0x9d8138454badbf16,0xaf8306904b15d8a8,0x83bd9fd79c159b39,0xb85db82acdbbf3ae},
+  {0x560807274e8b13db,0xb33b8a036f1617ca,0x72bc05868c923532,0xb7b8ee25c3388851},
+  {0xc042df127c4f6747,0x704ed715ba3ca7d4,0x678f93c55bc0c5d2,0xd2ee482f0bfe6c9a},
+  {0xbd60c5ba33d87b10,0x6c2ff096c60536d6,0x0ce4b4b8c86a8f5b,0x86a0bcebf81d6e4d},
+  {0xf9384ef3a44799c2,0x8b78ec1c676a7fcd,0x5f7c3edb312b00da,0x2390763c1712af67},
+};
+
+#define GF_exp_invmer_e_1 AIMER_NAMESPACE(GF_exp_invmer_e_1) // GF_exp_*: each writes a field element to out computed from in
+void GF_exp_invmer_e_1(GF out, const GF in); // NOTE(review): presumably a fixed-exponent field power (S-box exponent) — confirm in aim2.c
+#define GF_exp_invmer_e_2 AIMER_NAMESPACE(GF_exp_invmer_e_2)
+void GF_exp_invmer_e_2(GF out, const GF in);
+#define GF_exp_invmer_e_3 AIMER_NAMESPACE(GF_exp_invmer_e_3)
+void GF_exp_invmer_e_3(GF out, const GF in);
+#define GF_exp_mer_e_star AIMER_NAMESPACE(GF_exp_mer_e_star)
+void GF_exp_mer_e_star(GF out, const GF in); // NOTE(review): "mer" vs "invmer" presumably forward vs inverse exponent — confirm
+
+#define generate_matrices_L_and_U AIMER_NAMESPACE(generate_matrices_L_and_U)
+void generate_matrices_L_and_U( // NOTE(review): presumably derives the matrices and vector_b from iv — confirm in aim2.c
+        GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD], // non-const: one AIM2_NUM_BITS_FIELD-row matrix per input S-box
+        GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD], // non-const: same shape as matrix_L
+        GF vector_b,                                           // non-const: a single field element
+        const uint8_t iv[AIM2_IV_SIZE]);                       // read-only AIM2_IV_SIZE-byte initial vector
+
+#define generate_matrix_LU AIMER_NAMESPACE(generate_matrix_LU)
+void generate_matrix_LU(GF matrix_A[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD], // non-const: one matrix per input S-box
+                        GF vector_b,                     // non-const: a single field element
+                        const uint8_t iv[AIM2_IV_SIZE]); // read-only IV; NOTE(review): presumably A is the product L*U — confirm
+
+#define aim2_sbox_outputs AIMER_NAMESPACE(aim2_sbox_outputs)
+void aim2_sbox_outputs(GF sbox_outputs[AIM2_NUM_INPUT_SBOX], const GF pt); // fills one field element per input S-box from pt
+
+#define aim2 AIMER_NAMESPACE(aim2)
+void aim2(uint8_t ct[AIM2_NUM_BYTES_FIELD],       // non-const byte buffer; NOTE(review): presumably the AIM2 output — confirm
+          const uint8_t pt[AIM2_NUM_BYTES_FIELD], // read-only input block, byte-encoded
+          const uint8_t iv[AIM2_IV_SIZE]);        // read-only initial vector
+
+#endif // AIM2_H
diff --git a/crypto_sign/aimer256f/m4stack/api.h b/crypto_sign/aimer256f/m4stack/api.h
new file mode 100644
index 00000000..2bc176d6
--- /dev/null
+++ b/crypto_sign/aimer256f/m4stack/api.h
@@ -0,0 +1,44 @@
+/*
+NIST-developed software is provided by NIST as a public service. You may use, copy, and distribute copies of the software in any medium, provided that you keep intact this entire notice. You may improve, modify, and create derivative works of the software or any portion of the software, and you may copy and distribute such modifications or works. Modified works should carry a notice stating that you changed the software and should note the date and nature of any such change. Please explicitly acknowledge the National Institute of Standards and Technology as the source of the software.
+ 
+NIST-developed software is expressly provided "AS IS." NIST MAKES NO WARRANTY OF ANY KIND, EXPRESS, IMPLIED, IN FACT, OR ARISING BY OPERATION OF LAW, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND DATA ACCURACY. NIST NEITHER REPRESENTS NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST DOES NOT WARRANT OR MAKE ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE OR THE RESULTS THEREOF, INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY, RELIABILITY, OR USEFULNESS OF THE SOFTWARE.
+ 
+You are solely responsible for determining the appropriateness of using and distributing the software and you assume all risks associated with its use, including but not limited to the risks and costs of program errors, compliance with applicable laws, damage to or loss of data, programs or equipment, and the unavailability or interruption of operation. This software is not intended to be used in any situation where a failure could cause risk of injury or damage to property. The software developed by NIST employees is not subject to copyright protection within the United States.
+*/
+
+#ifndef API_H
+#define API_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+// Byte sizes exported for this parameter set (aimer256f).
+#define CRYPTO_PUBLICKEYBYTES 64
+#define CRYPTO_SECRETKEYBYTES 96
+#define CRYPTO_BYTES 25120 // NOTE(review): presumably the maximum signature length — confirm against sign.c
+#define CRYPTO_ALGNAME "aimer256f"
+// Every entry point below is first renamed through AIMER_NAMESPACE (params.h), then declared.
+#define crypto_sign_keypair AIMER_NAMESPACE(crypto_sign_keypair)
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk); // writes both buffers; NOTE(review): return convention not visible here
+
+#define crypto_sign_signature AIMER_NAMESPACE(crypto_sign_signature)
+int crypto_sign_signature(uint8_t *sig, size_t *siglen, // outputs: signature bytes and their length
+        const uint8_t *m, size_t mlen,                  // message to sign
+        const uint8_t *sk);                             // secret key (read-only)
+
+#define crypto_sign AIMER_NAMESPACE(crypto_sign)
+int crypto_sign(uint8_t *sm, size_t *smlen, // outputs; NOTE(review): presumably signature combined with message — confirm layout
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk);
+
+#define crypto_sign_verify AIMER_NAMESPACE(crypto_sign_verify)
+int crypto_sign_verify(const uint8_t *sig, size_t siglen, // all inputs are read-only
+        const uint8_t *m, size_t mlen,
+        const uint8_t *pk);
+
+#define crypto_sign_open AIMER_NAMESPACE(crypto_sign_open)
+int crypto_sign_open(uint8_t *m, size_t *mlen, // outputs: recovered message and its length
+        const uint8_t *sm, size_t smlen,       // signed message as produced by crypto_sign
+        const uint8_t *pk);
+
+#endif
diff --git a/crypto_sign/aimer256f/m4stack/field.c b/crypto_sign/aimer256f/m4stack/field.c
new file mode 100644
index 00000000..5c27f63a
--- /dev/null
+++ b/crypto_sign/aimer256f/m4stack/field.c
@@ -0,0 +1,608 @@
+// SPDX-License-Identifier: MIT
+
+#include "field.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define mask_64     0x00000000ffffffff  // low 32 bits of a 64-bit word
+#define mask_final  0x000000000000ffff  // low 16 bits; last mask applied by inv_zero_padding
+// mask0..mask3 (and their *_64 twins) keep 8, 4, 2, 1 bit(s) per 32-, 16-, 8-, 4-bit group.
+#define mask0_64    0x000000ff000000ff
+#define mask0       0x000000ff
+
+#define mask1_64    0x000f000f000f000f
+#define mask1       0x000f000f
+
+#define mask2_64    0x0303030303030303
+#define mask2       0x03030303
+
+#define mask3_64    0x1111111111111111
+#define mask3       0x11111111
+// zero_padding: spread the low 8 bits of x0 to one bit per nibble. Parameters shadow mask1..mask3; expands to 3 bare statements.
+#define zero_padding(x0, mask1, mask2, mask3) \
+        x0 = (x0 | (x0 << 12)) & mask1; \
+        x0 = (x0 | (x0 << 6 )) & mask2; \
+        x0 = (x0 | (x0 << 3 )) & mask3;
+// inv_zero_padding: inverse packing; gathers bit 0 of each nibble of a 64-bit x0 into its low 16 bits. Also bare statements.
+#define inv_zero_padding(x0, mask0, mask1, mask2, mask_final) \
+        x0 = (x0 | (x0 >> 3 )) & mask2; \
+        x0 = (x0 | (x0 >> 6 )) & mask1; \
+        x0 = (x0 | (x0 >> 12)) & mask0; \
+        x0 = (x0 | (x0 >> 24)) & mask_final;
+
+void GF_mul(GF c, const GF a, const GF b)
+{ // c = a * b in GF(2^256); operands are four 64-bit words, word 0 least significant.
+  uint64_t t[4] = {0,};     // scratch for the output words of the latest 64x64 products
+  uint64_t temp[8] = {0,};  // unreduced 512-bit product, temp[0] least significant
+  uint64_t sub[10] = {0,};  // XOR-sums of operand words used as Karatsuba inputs
+  // Two-level Karatsuba: nine poly64_mul calls in total.
+  sub[0] = a[0] ^ a[1];
+  sub[1] = b[0] ^ b[1];
+  sub[2] = a[2] ^ a[3];
+  sub[3] = b[2] ^ b[3];
+  sub[4] = a[0] ^ a[2];
+  sub[5] = a[1] ^ a[3];
+  sub[6] = b[0] ^ b[2];
+  sub[7] = b[1] ^ b[3];
+  sub[8] = sub[4] ^ sub[5];
+  sub[9] = sub[6] ^ sub[7];
+  // Diagonal products a[i]*b[i]. NOTE(review): poly64_mul is defined outside this file; here its 3rd/4th args take the high/low word.
+  poly64_mul(&a[0], &b[0], &t[0], &temp[0]);
+  poly64_mul(&a[1], &b[1], &t[2], &t[1]);
+  t[0] ^= t[1];  // high(a0*b0) ^ low(a1*b1)
+
+  poly64_mul(&a[2], &b[2], &t[3], &t[1]);
+  t[1] ^= t[2];  // low(a2*b2) ^ high(a1*b1)
+
+  poly64_mul(&a[3], &b[3], &temp[7], &t[2]);
+  t[2] ^= t[3];  // low(a3*b3) ^ high(a2*b2)
+
+  temp[6] = temp[7] ^ t[2];
+  temp[3] = t[2] ^ t[1];
+  temp[2] = t[1] ^ t[0];
+  temp[1] = t[0] ^ temp[0];
+  // Inner middle terms: (a0^a1)*(b0^b1) for the low half, (a2^a3)*(b2^b3) for the high half.
+  poly64_mul(&sub[0], &sub[1], &t[1], &t[0]);
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+
+  poly64_mul(&sub[2], &sub[3], &t[1], &t[0]);
+  temp[3] ^= t[0];
+  temp[6] ^= t[1];
+
+  temp[5] = temp[7] ^ temp[3];
+  temp[4] = temp[6] ^ temp[2];
+  temp[3] ^= temp[1];
+  temp[2] ^= temp[0];
+  // Outer middle term (a_lo ^ a_hi)*(b_lo ^ b_hi), itself a 3-product Karatsuba on sub[4..9].
+  poly64_mul(&sub[4], &sub[6], &t[1], &t[0]);
+  poly64_mul(&sub[5], &sub[7], &t[3], &t[2]);
+  t[1] ^= t[2];
+  t[2] = t[1] ^ t[3];
+  t[1] ^= t[0];
+
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+  temp[4] ^= t[2];
+  temp[5] ^= t[3];
+
+  poly64_mul(&sub[8], &sub[9], &t[1], &t[0]);
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+  // Reduce with x^256 = x^10 + x^5 + x^2 + 1: fold temp[4..7] down using shifts 0, 2, 5 and 10.
+  t[0] = temp[4] ^ ((temp[7] >> 54) ^ (temp[7] >> 59) ^ (temp[7] >> 62));  // bits shifted out of temp[7] re-enter at word 4
+
+  c[3] = temp[3] ^ temp[7];
+  c[3] ^= (temp[7] << 10) | (temp[6] >> 54);
+  c[3] ^= (temp[7] <<  5) | (temp[6] >> 59);
+  c[3] ^= (temp[7] <<  2) | (temp[6] >> 62);
+
+  c[2] = temp[2] ^ temp[6];
+  c[2] ^= (temp[6] << 10) | (temp[5] >> 54);
+  c[2] ^= (temp[6] <<  5) | (temp[5] >> 59);
+  c[2] ^= (temp[6] <<  2) | (temp[5] >> 62);
+
+  c[1] = temp[1] ^ temp[5];
+  c[1] ^= (temp[5] << 10) | (t[0] >> 54);
+  c[1] ^= (temp[5] <<  5) | (t[0] >> 59);
+  c[1] ^= (temp[5] <<  2) | (t[0] >> 62);
+
+  c[0] = temp[0] ^ t[0];
+  c[0] ^= (t[0] << 10);
+  c[0] ^= (t[0] <<  5);
+  c[0] ^= (t[0] <<  2);
+}
+
+void GF_mul_add(GF c, const GF a, const GF b)
+{ // c ^= a * b in GF(2^256): the reduced product is XORed into the existing value of c.
+  uint64_t t[4] = {0,};     // scratch for the output words of the latest 64x64 products
+  uint64_t temp[8] = {0,};  // unreduced 512-bit product, temp[0] least significant
+  uint64_t sub[10] = {0,};  // XOR-sums of operand words used as Karatsuba inputs
+  // Two-level Karatsuba: nine poly64_mul calls in total.
+  sub[0] = a[0] ^ a[1];
+  sub[1] = b[0] ^ b[1];
+  sub[2] = a[2] ^ a[3];
+  sub[3] = b[2] ^ b[3];
+  sub[4] = a[0] ^ a[2];
+  sub[5] = a[1] ^ a[3];
+  sub[6] = b[0] ^ b[2];
+  sub[7] = b[1] ^ b[3];
+  sub[8] = sub[4] ^ sub[5];
+  sub[9] = sub[6] ^ sub[7];
+  // Diagonal products a[i]*b[i]. NOTE(review): poly64_mul is defined outside this file; here its 3rd/4th args take the high/low word.
+  poly64_mul(&a[0], &b[0], &t[0], &temp[0]);
+  poly64_mul(&a[1], &b[1], &t[2], &t[1]);
+  t[0] ^= t[1];  // high(a0*b0) ^ low(a1*b1)
+
+  poly64_mul(&a[2], &b[2], &t[3], &t[1]);
+  t[1] ^= t[2];  // low(a2*b2) ^ high(a1*b1)
+
+  poly64_mul(&a[3], &b[3], &temp[7], &t[2]);
+  t[2] ^= t[3];  // low(a3*b3) ^ high(a2*b2)
+
+  temp[6] = temp[7] ^ t[2];
+  temp[3] = t[2] ^ t[1];
+  temp[2] = t[1] ^ t[0];
+  temp[1] = t[0] ^ temp[0];
+  // Inner middle terms: (a0^a1)*(b0^b1) for the low half, (a2^a3)*(b2^b3) for the high half.
+  poly64_mul(&sub[0], &sub[1], &t[1], &t[0]);
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+
+  poly64_mul(&sub[2], &sub[3], &t[1], &t[0]);
+  temp[3] ^= t[0];
+  temp[6] ^= t[1];
+
+  temp[5] = temp[7] ^ temp[3];
+  temp[4] = temp[6] ^ temp[2];
+  temp[3] ^= temp[1];
+  temp[2] ^= temp[0];
+  // Outer middle term (a_lo ^ a_hi)*(b_lo ^ b_hi), itself a 3-product Karatsuba on sub[4..9].
+  poly64_mul(&sub[4], &sub[6], &t[1], &t[0]);
+  poly64_mul(&sub[5], &sub[7], &t[3], &t[2]);
+  t[1] ^= t[2];
+  t[2] = t[1] ^ t[3];
+  t[1] ^= t[0];
+
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+  temp[4] ^= t[2];
+  temp[5] ^= t[3];
+
+  poly64_mul(&sub[8], &sub[9], &t[1], &t[0]);
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+  // Reduce with x^256 = x^10 + x^5 + x^2 + 1: fold temp[4..7] down using shifts 0, 2, 5 and 10.
+  t[0] = temp[4] ^ ((temp[7] >> 54) ^ (temp[7] >> 59) ^ (temp[7] >> 62));  // bits shifted out of temp[7] re-enter at word 4
+
+  c[3] ^= temp[3] ^ temp[7];  // "^=" (not "=") is what makes this the accumulate variant
+  c[3] ^= (temp[7] << 10) | (temp[6] >> 54);
+  c[3] ^= (temp[7] <<  5) | (temp[6] >> 59);
+  c[3] ^= (temp[7] <<  2) | (temp[6] >> 62);
+
+  c[2] ^= temp[2] ^ temp[6];
+  c[2] ^= (temp[6] << 10) | (temp[5] >> 54);
+  c[2] ^= (temp[6] <<  5) | (temp[5] >> 59);
+  c[2] ^= (temp[6] <<  2) | (temp[5] >> 62);
+
+  c[1] ^= temp[1] ^ temp[5];
+  c[1] ^= (temp[5] << 10) | (t[0] >> 54);
+  c[1] ^= (temp[5] <<  5) | (t[0] >> 59);
+  c[1] ^= (temp[5] <<  2) | (t[0] >> 62);
+
+  c[0] ^= temp[0] ^ t[0];
+  c[0] ^= (t[0] << 10);
+  c[0] ^= (t[0] <<  5);
+  c[0] ^= (t[0] <<  2);
+}
+
+void GF_transposed_matmul(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD])
+{ // c = XOR of every row b[i] whose index i is a set bit of a (bit 0 = least significant bit of a[0]).
+  const uint64_t *a_ptr = a;
+  const GF *b_ptr = b;
+  // Accumulate in locals; c is written only after both loops have finished reading a and b.
+  uint64_t temp_c0 = 0;
+  uint64_t temp_c1 = 0;
+  uint64_t temp_c2 = 0;
+  uint64_t temp_c3 = 0;
+  uint64_t mask;
+  for (size_t i = AIM2_NUM_WORDS_FIELD; i; --i, ++a_ptr)
+  {
+    uint64_t index = *a_ptr;  // current word of a, consumed from its least significant bit upward
+    for (size_t j = AIM2_NUM_BITS_WORD; j; j -= 4, index >>= 4, b_ptr += 4)
+    { // four bits of a, hence four consecutive rows of b, per iteration
+      mask = -(index & 1);  // all-ones when the bit is set, zero otherwise: row selection without a branch
+      temp_c0 ^= (b_ptr[0][0] & mask);
+      temp_c1 ^= (b_ptr[0][1] & mask);
+      temp_c2 ^= (b_ptr[0][2] & mask);
+      temp_c3 ^= (b_ptr[0][3] & mask);
+
+      mask = -((index >> 1) & 1);
+      temp_c0 ^= (b_ptr[1][0] & mask);
+      temp_c1 ^= (b_ptr[1][1] & mask);
+      temp_c2 ^= (b_ptr[1][2] & mask);
+      temp_c3 ^= (b_ptr[1][3] & mask);
+
+      mask = -((index >> 2) & 1);
+      temp_c0 ^= (b_ptr[2][0] & mask);
+      temp_c1 ^= (b_ptr[2][1] & mask);
+      temp_c2 ^= (b_ptr[2][2] & mask);
+      temp_c3 ^= (b_ptr[2][3] & mask);
+
+      mask = -((index >> 3) & 1);
+      temp_c0 ^= (b_ptr[3][0] & mask);
+      temp_c1 ^= (b_ptr[3][1] & mask);
+      temp_c2 ^= (b_ptr[3][2] & mask);
+      temp_c3 ^= (b_ptr[3][3] & mask);
+    }
+  }
+  c[0] = temp_c0;  // overwrite c with the accumulated result
+  c[1] = temp_c1;
+  c[2] = temp_c2;
+  c[3] = temp_c3;
+}
+
+void GF_transposed_matmul_add(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD])
+{ // c ^= XOR of every row b[i] whose index i is a set bit of a (bit 0 = least significant bit of a[0]).
+  const uint64_t *a_ptr = a;
+  const GF *b_ptr = b;
+  // Accumulate in locals; c is updated only after both loops have finished reading a and b.
+  uint64_t temp_c0 = 0;
+  uint64_t temp_c1 = 0;
+  uint64_t temp_c2 = 0;
+  uint64_t temp_c3 = 0;
+  uint64_t mask;
+  for (size_t i = AIM2_NUM_WORDS_FIELD; i; --i, ++a_ptr)
+  {
+    uint64_t index = *a_ptr;  // current word of a, consumed from its least significant bit upward
+    for (size_t j = AIM2_NUM_BITS_WORD; j; j -= 4, index >>= 4, b_ptr += 4)
+    { // four bits of a, hence four consecutive rows of b, per iteration
+      mask = -(index & 1);  // all-ones when the bit is set, zero otherwise: row selection without a branch
+      temp_c0 ^= (b_ptr[0][0] & mask);
+      temp_c1 ^= (b_ptr[0][1] & mask);
+      temp_c2 ^= (b_ptr[0][2] & mask);
+      temp_c3 ^= (b_ptr[0][3] & mask);
+
+      mask = -((index >> 1) & 1);
+      temp_c0 ^= (b_ptr[1][0] & mask);
+      temp_c1 ^= (b_ptr[1][1] & mask);
+      temp_c2 ^= (b_ptr[1][2] & mask);
+      temp_c3 ^= (b_ptr[1][3] & mask);
+
+      mask = -((index >> 2) & 1);
+      temp_c0 ^= (b_ptr[2][0] & mask);
+      temp_c1 ^= (b_ptr[2][1] & mask);
+      temp_c2 ^= (b_ptr[2][2] & mask);
+      temp_c3 ^= (b_ptr[2][3] & mask);
+
+      mask = -((index >> 3) & 1);
+      temp_c0 ^= (b_ptr[3][0] & mask);
+      temp_c1 ^= (b_ptr[3][1] & mask);
+      temp_c2 ^= (b_ptr[3][2] & mask);
+      temp_c3 ^= (b_ptr[3][3] & mask);
+    }
+  }
+  c[0] ^= temp_c0;  // XOR into the existing c instead of overwriting it
+  c[1] ^= temp_c1;
+  c[2] ^= temp_c2;
+  c[3] ^= temp_c3;
+}
+
+static void poly64_mul_s(uint64_t *z1, uint64_t *z0, uint64_t x, uint64_t y)
+{ // Carry-less (GF(2)[X]) 64x64 -> 128-bit product via integer multiplies: *z1 = high 64 bits, *z0 = low 64 bits.
+  // split x into bytes: x0..x3 come from the low 32 bits, x4..x7 from the high 32 bits
+  uint32_t x4 = x >> 32;
+
+  uint32_t x0 = x & mask_64;
+  uint32_t x1 = (x0 >> 8) & mask0;
+  uint32_t x2 = (x0 >> 16) & mask0;
+  uint32_t x3 = (x0 >> 24) & mask0;
+  x0 &= mask0;
+
+  // bytes of the high half of x
+  uint32_t x5 = (x4 >> 8) & mask0;
+  uint32_t x6 = (x4 >> 16) & mask0;
+  uint32_t x7 = (x4 >> 24) & mask0;
+  x4 &= mask0;
+
+  // same byte split for y: y0..y3 low half, y4..y7 high half
+  uint32_t y4 = y >> 32;
+
+  uint32_t y0 = y & mask_64;
+  uint32_t y1 = (y0 >> 8) & mask0;
+  uint32_t y2 = (y0 >> 16) & mask0;
+  uint32_t y3 = (y0 >> 24) & mask0;
+  y0 &= mask0;
+
+  // bytes of the high half of y
+  uint32_t y5 = (y4 >> 8) & mask0;
+  uint32_t y6 = (y4 >> 16) & mask0;
+  uint32_t y7 = (y4 >> 24) & mask0;
+  y4 &= mask0;
+
+  // spread each byte to one bit per nibble: a nibble of a product then sums at most 8 terms and cannot carry out
+  zero_padding(x0, mask1, mask2, mask3);
+  zero_padding(x1, mask1, mask2, mask3);
+  zero_padding(x2, mask1, mask2, mask3);
+  zero_padding(x3, mask1, mask2, mask3);
+  zero_padding(x4, mask1, mask2, mask3);
+  zero_padding(x5, mask1, mask2, mask3);
+  zero_padding(x6, mask1, mask2, mask3);
+  zero_padding(x7, mask1, mask2, mask3);
+
+  // same spreading for the bytes of y
+  zero_padding(y0, mask1, mask2, mask3);
+  zero_padding(y1, mask1, mask2, mask3);
+  zero_padding(y2, mask1, mask2, mask3);
+  zero_padding(y3, mask1, mask2, mask3);
+  zero_padding(y4, mask1, mask2, mask3);
+  zero_padding(y5, mask1, mask2, mask3);
+  zero_padding(y6, mask1, mask2, mask3);
+  zero_padding(y7, mask1, mask2, mask3);
+
+  // a = low32(x) * low32(y), schoolbook over bytes; "& mask3_64" keeps each nibble's parity, i.e. the XOR of its terms
+  uint64_t a0 = ((uint64_t)x0 * (uint64_t)y0) & mask3_64;
+  uint64_t a1 = ((((uint64_t)x0 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y0) & mask3_64));
+  a0 ^= (a1 << 32);  // one byte of offset equals 32 spread bits: lower half joins a0 ...
+  a1 = a1 >> 32;     // ... upper half stays as the bottom of a1
+  a1 ^= ((((uint64_t)x0 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x1 * (uint64_t)y1) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y0) & mask3_64));
+
+  uint64_t a2 = ((((uint64_t)x0 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y2) & mask3_64) ^
+                 (((uint64_t)x2 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y0) & mask3_64));
+  a1 ^= (a2 << 32);
+  a2 = a2 >> 32;
+  a2 ^= ((((uint64_t)x1 * (uint64_t)y3) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x3 * (uint64_t)y1) & mask3_64));
+
+  uint64_t a3 = ((((uint64_t)x2 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y2) & mask3_64));
+  a2 ^= (a3 << 32);
+  a3 = a3 >> 32;
+
+  a3 ^= ((uint64_t)x3 * (uint64_t)y3) & mask3_64;
+
+  // b = high32(x) * high32(y), same scheme on bytes 4..7
+  uint64_t b0 = ((uint64_t)x4 * (uint64_t)y4) & mask3_64;
+  uint64_t b1 = ((((uint64_t)x4 * (uint64_t)y5) & mask3_64) ^
+                 (((uint64_t)x5 * (uint64_t)y4) & mask3_64));
+  b0 ^= (b1 << 32);
+  b1 = b1 >> 32;
+  b1 ^= ((((uint64_t)x4 * (uint64_t)y6) & mask3_64) ^
+         (((uint64_t)x5 * (uint64_t)y5) & mask3_64) ^
+         (((uint64_t)x6 * (uint64_t)y4) & mask3_64));
+
+  uint64_t b2 = ((((uint64_t)x4 * (uint64_t)y7) & mask3_64) ^
+                 (((uint64_t)x5 * (uint64_t)y6) & mask3_64) ^
+                 (((uint64_t)x6 * (uint64_t)y5) & mask3_64) ^
+                 (((uint64_t)x7 * (uint64_t)y4) & mask3_64));
+  b1 ^= (b2 << 32);
+  b2 = b2 >> 32;
+  b2 ^= ((((uint64_t)x5 * (uint64_t)y7) & mask3_64) ^
+         (((uint64_t)x6 * (uint64_t)y6) & mask3_64) ^
+         (((uint64_t)x7 * (uint64_t)y5) & mask3_64));
+
+  uint64_t b3 = ((((uint64_t)x6 * (uint64_t)y7) & mask3_64) ^
+                 (((uint64_t)x7 * (uint64_t)y6) & mask3_64));
+  b2 ^= (b3 << 32);
+  b3 = b3 >> 32;
+
+  b3 ^= ((uint64_t)x7 * (uint64_t)y7) & mask3_64;
+
+  // Karatsuba middle term c = (x_lo ^ x_hi) * (y_lo ^ y_hi); XOR of spread bytes equals the spread of their XOR
+  x0 ^= x4;
+  x1 ^= x5;
+  x2 ^= x6;
+  x3 ^= x7;
+
+  y0 ^= y4;
+  y1 ^= y5;
+  y2 ^= y6;
+  y3 ^= y7;
+
+  uint64_t c0 = ((uint64_t)x0 * (uint64_t)y0) & mask3_64;
+  uint64_t c1 = ((((uint64_t)x0 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y0) & mask3_64));
+  c0 ^= (c1 << 32);
+  c1 = c1 >> 32;
+  c1 ^= ((((uint64_t)x0 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x1 * (uint64_t)y1) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y0) & mask3_64));
+
+  uint64_t c2 = ((((uint64_t)x0 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y2) & mask3_64) ^
+                 (((uint64_t)x2 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y0) & mask3_64));
+  c1 ^= (c2 << 32);
+  c2 = c2 >> 32;
+  c2 ^= ((((uint64_t)x1 * (uint64_t)y3) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x3 * (uint64_t)y1) & mask3_64));
+
+  uint64_t c3 = ((((uint64_t)x2 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y2) & mask3_64));
+  c2 ^= (c3 << 32);
+  c3 = c3 >> 32;
+
+  c3 ^= ((uint64_t)x3 * (uint64_t)y3) & mask3_64;
+  // removing a and b from c leaves the cross term x_lo*y_hi ^ x_hi*y_lo
+  c0 = c0 ^ a0 ^ b0;
+  c1 = c1 ^ a1 ^ b1;
+  c2 = c2 ^ a2 ^ b2;
+  c3 = c3 ^ a3 ^ b3;
+  // add the cross term at bit offset 32: each chunk a0..a3, b0..b3 carries 16 result bits, so it starts at a2
+  a2 ^= c0;
+  a3 ^= c1;
+  b0 ^= c2;
+  b1 ^= c3;
+
+  // pack every spread 64-bit chunk back into 16 contiguous bits
+  inv_zero_padding(a0, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a1, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a2, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a3, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b0, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b1, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b2, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b3, mask0_64, mask1_64, mask2_64, mask_final);
+  // concatenate the 16-bit chunks, a0 lowest: a0..a3 form the low word, b0..b3 the high word
+  *z0 = a0 | (a1 << 16) | (a2 << 32) | (a3 << 48);
+  *z1 = b0 | (b1 << 16) | (b2 << 32) | (b3 << 48);
+}
+
+void GF_mul_s(GF c, const GF a, const GF b)
+{ // c = a * b in GF(2^256), same scheme as GF_mul but built on the C routine poly64_mul_s (hi, lo, x, y).
+  uint64_t t[4] = {0,};     // scratch for the output words of the latest 64x64 products
+  uint64_t add[4] = {0,};   // XOR-sums of operand words for the outer Karatsuba middle term
+  uint64_t temp[8] = {0,};  // unreduced 512-bit product, temp[0] least significant
+  // Diagonal products a[i]*b[i]; adjacent high/low words are XORed together as they are produced.
+  poly64_mul_s(&t[0], &temp[0], a[0], b[0]);
+  poly64_mul_s(&t[2], &t[1], a[1], b[1]);
+  t[0] ^= t[1];  // high(a0*b0) ^ low(a1*b1)
+
+  poly64_mul_s(&t[3], &t[1], a[2], b[2]);
+  t[1] ^= t[2];  // low(a2*b2) ^ high(a1*b1)
+
+  poly64_mul_s(&temp[7], &t[2], a[3], b[3]);
+  t[2] ^= t[3];  // low(a3*b3) ^ high(a2*b2)
+
+  temp[6] = temp[7] ^ t[2];
+  temp[3] = t[2] ^ t[1];
+  temp[2] = t[1] ^ t[0];
+  temp[1] = t[0] ^ temp[0];
+  // Inner middle terms: (a0^a1)*(b0^b1) for the low half, (a2^a3)*(b2^b3) for the high half.
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[1]), (b[0] ^ b[1]));
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+
+  poly64_mul_s(&t[1], &t[0], (a[2] ^ a[3]), (b[2] ^ b[3]));
+  temp[3] ^= t[0];
+  temp[6] ^= t[1];
+
+  temp[5] = temp[7] ^ temp[3];
+  temp[4] = temp[6] ^ temp[2];
+  temp[3] ^= temp[1];
+  temp[2] ^= temp[0];
+  // Outer middle term (a_lo ^ a_hi)*(b_lo ^ b_hi), itself a 3-product Karatsuba on add[0..3].
+  add[0] = a[0] ^ a[2];
+  add[1] = a[1] ^ a[3];
+  add[2] = b[0] ^ b[2];
+  add[3] = b[1] ^ b[3];
+  poly64_mul_s(&t[1], &t[0], add[0], add[2]);
+  poly64_mul_s(&t[3], &t[2], add[1], add[3]);
+  t[1] ^= t[2];
+  t[2] = t[1] ^ t[3];
+  t[1] ^= t[0];
+
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+  temp[4] ^= t[2];
+  temp[5] ^= t[3];
+
+  poly64_mul_s(&t[1], &t[0], (add[0] ^ add[1]), (add[2] ^ add[3]));
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+  // Reduce with x^256 = x^10 + x^5 + x^2 + 1: fold temp[4..7] down using shifts 0, 2, 5 and 10.
+  t[0] = temp[4] ^ ((temp[7] >> 54) ^ (temp[7] >> 59) ^ (temp[7] >> 62));  // bits shifted out of temp[7] re-enter at word 4
+
+  c[3] = temp[3] ^ temp[7];
+  c[3] ^= (temp[7] << 10) | (temp[6] >> 54);
+  c[3] ^= (temp[7] <<  5) | (temp[6] >> 59);
+  c[3] ^= (temp[7] <<  2) | (temp[6] >> 62);
+
+  c[2] = temp[2] ^ temp[6];
+  c[2] ^= (temp[6] << 10) | (temp[5] >> 54);
+  c[2] ^= (temp[6] <<  5) | (temp[5] >> 59);
+  c[2] ^= (temp[6] <<  2) | (temp[5] >> 62);
+
+  c[1] = temp[1] ^ temp[5];
+  c[1] ^= (temp[5] << 10) | (t[0] >> 54);
+  c[1] ^= (temp[5] <<  5) | (t[0] >> 59);
+  c[1] ^= (temp[5] <<  2) | (t[0] >> 62);
+
+  c[0] = temp[0] ^ t[0];
+  c[0] ^= (t[0] << 10);
+  c[0] ^= (t[0] <<  5);
+  c[0] ^= (t[0] <<  2);
+}
+
+void GF_mul_add_s(GF c, const GF a, const GF b)
+{ // c ^= a * b in GF(2^256), built on the C routine poly64_mul_s (hi, lo, x, y); the product is XORed into c.
+  uint64_t t[4] = {0,};     // scratch for the output words of the latest 64x64 products
+  uint64_t add[4] = {0,};   // XOR-sums of operand words for the outer Karatsuba middle term
+  uint64_t temp[8] = {0,};  // unreduced 512-bit product, temp[0] least significant
+  // Diagonal products a[i]*b[i]; adjacent high/low words are XORed together as they are produced.
+  poly64_mul_s(&t[0], &temp[0], a[0], b[0]);
+  poly64_mul_s(&t[2], &t[1], a[1], b[1]);
+  t[0] ^= t[1];  // high(a0*b0) ^ low(a1*b1)
+
+  poly64_mul_s(&t[3], &t[1], a[2], b[2]);
+  t[1] ^= t[2];  // low(a2*b2) ^ high(a1*b1)
+
+  poly64_mul_s(&temp[7], &t[2], a[3], b[3]);
+  t[2] ^= t[3];  // low(a3*b3) ^ high(a2*b2)
+
+  temp[6] = temp[7] ^ t[2];
+  temp[3] = t[2] ^ t[1];
+  temp[2] = t[1] ^ t[0];
+  temp[1] = t[0] ^ temp[0];
+  // Inner middle terms: (a0^a1)*(b0^b1) for the low half, (a2^a3)*(b2^b3) for the high half.
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[1]), (b[0] ^ b[1]));
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+
+  poly64_mul_s(&t[1], &t[0], (a[2] ^ a[3]), (b[2] ^ b[3]));
+  temp[3] ^= t[0];
+  temp[6] ^= t[1];
+
+  temp[5] = temp[7] ^ temp[3];
+  temp[4] = temp[6] ^ temp[2];
+  temp[3] ^= temp[1];
+  temp[2] ^= temp[0];
+  // Outer middle term (a_lo ^ a_hi)*(b_lo ^ b_hi), itself a 3-product Karatsuba on add[0..3].
+  add[0] = a[0] ^ a[2];
+  add[1] = a[1] ^ a[3];
+  add[2] = b[0] ^ b[2];
+  add[3] = b[1] ^ b[3];
+  poly64_mul_s(&t[1], &t[0], add[0], add[2]);
+  poly64_mul_s(&t[3], &t[2], add[1], add[3]);
+  t[1] ^= t[2];
+  t[2] = t[1] ^ t[3];
+  t[1] ^= t[0];
+
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+  temp[4] ^= t[2];
+  temp[5] ^= t[3];
+
+  poly64_mul_s(&t[1], &t[0], (add[0] ^ add[1]), (add[2] ^ add[3]));
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+  // Reduce with x^256 = x^10 + x^5 + x^2 + 1: fold temp[4..7] down using shifts 0, 2, 5 and 10.
+  t[0] = temp[4] ^ ((temp[7] >> 54) ^ (temp[7] >> 59) ^ (temp[7] >> 62));  // bits shifted out of temp[7] re-enter at word 4
+
+  c[3] ^= temp[3] ^ temp[7];  // "^=" (not "=") is what makes this the accumulate variant
+  c[3] ^= (temp[7] << 10) | (temp[6] >> 54);
+  c[3] ^= (temp[7] <<  5) | (temp[6] >> 59);
+  c[3] ^= (temp[7] <<  2) | (temp[6] >> 62);
+
+  c[2] ^= temp[2] ^ temp[6];
+  c[2] ^= (temp[6] << 10) | (temp[5] >> 54);
+  c[2] ^= (temp[6] <<  5) | (temp[5] >> 59);
+  c[2] ^= (temp[6] <<  2) | (temp[5] >> 62);
+
+  c[1] ^= temp[1] ^ temp[5];
+  c[1] ^= (temp[5] << 10) | (t[0] >> 54);
+  c[1] ^= (temp[5] <<  5) | (t[0] >> 59);
+  c[1] ^= (temp[5] <<  2) | (t[0] >> 62);
+
+  c[0] ^= temp[0] ^ t[0];
+  c[0] ^= (t[0] << 10);
+  c[0] ^= (t[0] <<  5);
+  c[0] ^= (t[0] <<  2);
+}
diff --git a/crypto_sign/aimer256f/m4stack/field.h b/crypto_sign/aimer256f/m4stack/field.h
new file mode 100644
index 00000000..089ad983
--- /dev/null
+++ b/crypto_sign/aimer256f/m4stack/field.h
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef FIELD_H
+#define FIELD_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+typedef uint64_t GF[4]; // field element as four 64-bit words; being an array type, a GF parameter is passed as a pointer
+// field.c defines GF_mul, GF_mul_add, GF_transposed_matmul(_add), GF_mul_s and GF_mul_add_s; the rest are defined elsewhere.
+#define poly64_mul AIMER_NAMESPACE(poly64_mul)
+void poly64_mul(const uint64_t *a, const uint64_t *b, uint64_t *c1, uint64_t *c0); // 64x64 product; GF_mul treats c1/c0 as high/low word
+#define GF_set0 AIMER_NAMESPACE(GF_set0)
+void GF_set0(GF a);
+#define GF_copy AIMER_NAMESPACE(GF_copy)
+void GF_copy(GF out, const GF in);
+#define GF_to_bytes AIMER_NAMESPACE(GF_to_bytes)
+void GF_to_bytes(uint8_t *out, const GF in); // NOTE(review): byte order of the encoding is not visible here — confirm
+#define GF_from_bytes AIMER_NAMESPACE(GF_from_bytes)
+void GF_from_bytes(GF out, const uint8_t *in);
+// GF_mul_add and GF_transposed_matmul_add XOR their result into c; the plain variants overwrite c.
+#define GF_add AIMER_NAMESPACE(GF_add)
+void GF_add(GF c, const GF a, const GF b);
+#define GF_mul AIMER_NAMESPACE(GF_mul)
+void GF_mul(GF c, const GF a, const GF b);
+#define GF_mul_add AIMER_NAMESPACE(GF_mul_add)
+void GF_mul_add(GF c, const GF a, const GF b);
+#define GF_transposed_matmul AIMER_NAMESPACE(GF_transposed_matmul)
+void GF_transposed_matmul(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]); // b: AIM2_NUM_BITS_FIELD rows, one per bit of a
+#define GF_transposed_matmul_add AIMER_NAMESPACE(GF_transposed_matmul_add)
+void GF_transposed_matmul_add(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]);
+// *_s variants: in field.c, GF_mul_s and GF_mul_add_s use the C routine poly64_mul_s instead of poly64_mul.
+#define GF_mul_s AIMER_NAMESPACE(GF_mul_s)
+void GF_mul_s(GF c, const GF a, const GF b);
+#define GF_mul_add_s AIMER_NAMESPACE(GF_mul_add_s)
+void GF_mul_add_s(GF c, const GF a, const GF b);
+#define GF_sqr_s AIMER_NAMESPACE(GF_sqr_s)
+void GF_sqr_s(GF c, const GF a); // NOTE(review): presumably c = a^2; not defined in field.c — confirm
+
+#endif // FIELD_H
diff --git a/crypto_sign/aimer256f/m4stack/hash.c b/crypto_sign/aimer256f/m4stack/hash.c
new file mode 100644
index 00000000..06a6f473
--- /dev/null
+++ b/crypto_sign/aimer256f/m4stack/hash.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+
+#include "hash.h"
+#include <stddef.h>
+#include <stdint.h>
+
+void hash_init(hash_instance *ctx)
+{ // Initialise ctx through shake256_inc_init (incremental SHAKE256 API from fips202.h).
+  shake256_inc_init(ctx);
+}
+
+void hash_init_prefix(hash_instance *ctx, uint8_t prefix)
+{ // Initialise ctx and absorb the single byte `prefix` as the first input byte.
+  shake256_inc_init(ctx);
+  shake256_inc_absorb(ctx, &prefix, sizeof(prefix));  // sizeof(prefix) is 1: prefix is a uint8_t
+}
+
+void hash_update(hash_instance *ctx, const uint8_t *data, size_t data_len)
+{ // Absorb data_len bytes starting at data into ctx.
+  shake256_inc_absorb(ctx, data, data_len);
+}
+
+void hash_final(hash_instance *ctx)
+{ // Finalise the absorb phase of ctx via shake256_inc_finalize.
+  shake256_inc_finalize(ctx);
+}
+
+void hash_squeeze(hash_instance *ctx, uint8_t *buffer, size_t buffer_len)
+{ // Squeeze buffer_len output bytes from ctx into buffer.
+  shake256_inc_squeeze(buffer, buffer_len, ctx);  // argument order differs from this wrapper: output first, context last
+}
+
+void hash_ctx_clone(hash_instance *ctx_dest, const hash_instance *ctx_src)
+{ // Copy the state in ctx_src into ctx_dest via shake256_inc_ctx_clone; ctx_src is read-only.
+  shake256_inc_ctx_clone(ctx_dest, ctx_src);
+}
+
+void hash_ctx_release(hash_instance *ctx)
+{ // Release ctx via shake256_inc_ctx_release; commit_and_expand_tape (sign.c) calls this once it is done with a context.
+  shake256_inc_ctx_release(ctx);
+}
diff --git a/crypto_sign/aimer256f/m4stack/hash.h b/crypto_sign/aimer256f/m4stack/hash.h
new file mode 100644
index 00000000..0fb7c8ef
--- /dev/null
+++ b/crypto_sign/aimer256f/m4stack/hash.h
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef HASH_H
+#define HASH_H
+
+#include "fips202.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+// One-byte constants; NOTE(review): presumably the domain-separation values passed to hash_init_prefix — confirm at call sites.
+static const uint8_t HASH_PREFIX_0 = 0;
+static const uint8_t HASH_PREFIX_1 = 1;
+static const uint8_t HASH_PREFIX_2 = 2;
+static const uint8_t HASH_PREFIX_3 = 3;
+static const uint8_t HASH_PREFIX_4 = 4;
+static const uint8_t HASH_PREFIX_5 = 5;
+
+typedef shake256incctx hash_instance; // hash state is the incremental SHAKE256 context from fips202.h
+// Thin wrappers over the shake256_inc_* functions; see hash.c.
+#define hash_init AIMER_NAMESPACE(hash_init)
+void hash_init(hash_instance *ctx);
+#define hash_init_prefix AIMER_NAMESPACE(hash_init_prefix)
+void hash_init_prefix(hash_instance *ctx, uint8_t prefix); // init, then absorb the one-byte prefix
+#define hash_update AIMER_NAMESPACE(hash_update)
+void hash_update(hash_instance *ctx, const uint8_t *data, size_t data_len);
+#define hash_final AIMER_NAMESPACE(hash_final)
+void hash_final(hash_instance *ctx);
+#define hash_squeeze AIMER_NAMESPACE(hash_squeeze)
+void hash_squeeze(hash_instance *ctx, uint8_t *buffer, size_t buffer_len); // context first, unlike shake256_inc_squeeze
+#define hash_ctx_clone AIMER_NAMESPACE(hash_ctx_clone)
+void hash_ctx_clone(hash_instance *ctx_dest, const hash_instance *ctx_src);
+#define hash_ctx_release AIMER_NAMESPACE(hash_ctx_release)
+void hash_ctx_release(hash_instance *ctx);
+
+#endif // HASH_H
diff --git a/crypto_sign/aimer256f/m4stack/params.h b/crypto_sign/aimer256f/m4stack/params.h
new file mode 100644
index 00000000..3134a863
--- /dev/null
+++ b/crypto_sign/aimer256f/m4stack/params.h
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#define AIMER_NAMESPACE(s)          samsungsds_aimer256f_m4stack_##s // pastes this variant's prefix onto symbol s
+
+#define SECURITY_BITS               256                  // security parameter
+#define SECURITY_BYTES              (SECURITY_BITS / 8)  // byte size of security parameter
+
+#define AIM2_NUM_BITS_FIELD         SECURITY_BITS        // number of bits in field element
+#define AIM2_NUM_BYTES_FIELD        SECURITY_BYTES       // number of bytes in field element
+#define AIM2_NUM_WORDS_FIELD        (SECURITY_BITS / 64) // number of 64-bit words in element
+#define AIM2_NUM_BITS_WORD          64                   // number of bits in word
+#define AIM2_IV_SIZE                SECURITY_BYTES       // byte size of AIM2 initial vector
+
+#define AIM2_NUM_INPUT_SBOX         3                    // number of AIM2 input S-boxes
+
+#define AIMER_SALT_SIZE             SECURITY_BYTES       // byte size of salt
+#define AIMER_SEED_SIZE             SECURITY_BYTES       // byte size of seed
+#define AIMER_COMMIT_SIZE           (SECURITY_BYTES * 2) // byte size of commitment
+
+#define AIMER_L                     AIM2_NUM_INPUT_SBOX  // AIMer-side alias for the number of input S-boxes
+#define AIMER_T                     65                   // number of parallel repetitions (Tau)
+#define AIMER_N                     16                   // number of MPC parties (N)
+#define AIMER_LOGN                  4                    // log_2(N)
+
+#endif // PARAMS_H
diff --git a/crypto_sign/aimer256f/m4stack/sign.c b/crypto_sign/aimer256f/m4stack/sign.c
new file mode 100644
index 00000000..601718ad
--- /dev/null
+++ b/crypto_sign/aimer256f/m4stack/sign.c
@@ -0,0 +1,654 @@
+// SPDX-License-Identifier: MIT
+
+#include "api.h"
+#include "aim2.h"
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include "randombytes.h"
+#include "sign.h"
+#include "tree.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+void commit_and_expand_tape(tape_t *tape, uint8_t *commit,
+                            const hash_instance *ctx_precom,
+                            const uint8_t seed[AIMER_SEED_SIZE],
+                            size_t rep, size_t party)
+{
+  hash_instance ctx;
+  uint8_t buffer[AIMER_SEED_SIZE + 2];
+  // One hash call yields both outputs: commit[AIMER_COMMIT_SIZE], then *tape.
+  buffer[0] = (uint8_t)(rep);   // only the low 8 bits of rep are absorbed
+  buffer[1] = (uint8_t)(party); // only the low 8 bits of party are absorbed
+  memcpy(buffer + 2, seed, AIMER_SEED_SIZE);
+
+  hash_ctx_clone(&ctx, ctx_precom); // start from the caller's state; ctx_precom itself is not modified here
+  hash_update(&ctx, buffer, AIMER_SEED_SIZE + 2);
+  hash_final(&ctx);
+  hash_squeeze(&ctx, commit, AIMER_COMMIT_SIZE);       // squeezed first: the commitment
+  hash_squeeze(&ctx, (uint8_t *)tape, sizeof(tape_t)); // squeezed second: written over *tape as raw bytes
+  hash_ctx_release(&ctx);
+}
+
+void aim2_mpc(mult_chk_t *mult_chk,
+              const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+              const GF ct_GF)
+{
+  // Fills z_shares[0..AIMER_L] and x_shares[AIMER_L] from x_shares[0..AIMER_L-1].
+  // pt + c = t ^ {2 ^ e - 1}  -->  t ^ {2 ^ e} + t * c = t * pt
+  // --> z = x * pt
+  GF_sqr_s(mult_chk->z_shares[0], mult_chk->x_shares[0]);
+  for (size_t i = 1; i < 11; i++) // 10 further GF_sqr_s calls, 11 in total for input 0
+  {
+    GF_sqr_s(mult_chk->z_shares[0], mult_chk->z_shares[0]); 
+  }
+  GF_mul_add(mult_chk->z_shares[0], mult_chk->x_shares[0], aim2_constants[0]);
+  GF_transposed_matmul_add(mult_chk->x_shares[AIMER_L], mult_chk->x_shares[0], // presumably accumulates: callers preload
+                           matrix_A[0]);                                       // x_shares[AIMER_L] with 0 or vector_b
+
+  GF_mul(mult_chk->z_shares[1], mult_chk->x_shares[1], aim2_constants[1]); // input 1: no GF_sqr_s chain,
+  GF_transposed_matmul_add(mult_chk->z_shares[1], mult_chk->x_shares[1],   // z is built from GF_mul plus
+                           aim2_e2_power_matrix);                          // aim2_e2_power_matrix instead
+  GF_transposed_matmul_add(mult_chk->x_shares[AIMER_L], mult_chk->x_shares[1],
+                           matrix_A[1]);
+
+  GF_sqr_s(mult_chk->z_shares[2], mult_chk->x_shares[2]);
+  for (size_t i = 1; i < 7; i++) // 6 further GF_sqr_s calls, 7 in total for input 2
+  {
+    GF_sqr_s(mult_chk->z_shares[2], mult_chk->z_shares[2]);
+  }
+  GF_mul_add(mult_chk->z_shares[2], mult_chk->x_shares[2], aim2_constants[2]);
+  GF_transposed_matmul_add(mult_chk->x_shares[AIMER_L], mult_chk->x_shares[2],
+                           matrix_A[2]);
+
+  // x ^ {2 ^ e - 1} = pt + ct
+  // --> x ^ {2 ^ e} + x * ct = x * pt
+  // --> z = x * pt   (three GF_sqr_s calls below, applied to x_shares[AIMER_L])
+  GF_sqr_s(mult_chk->z_shares[AIMER_L], mult_chk->x_shares[AIMER_L]);
+  GF_sqr_s(mult_chk->z_shares[AIMER_L], mult_chk->z_shares[AIMER_L]);
+  GF_sqr_s(mult_chk->z_shares[AIMER_L], mult_chk->z_shares[AIMER_L]);
+  GF_mul_add(mult_chk->z_shares[AIMER_L], mult_chk->x_shares[AIMER_L], ct_GF);
+}
+
+// Phase 1: commit to all parties' seeds/tapes, store the per-repetition deltas in sign->proofs, derive sign->h_1.
+void run_phase_1(signature_t *sign,
+                 const GF pt_GF, const GF sbox_outputs[AIMER_L],
+                 const uint8_t mu[AIMER_COMMIT_SIZE],
+                 const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE])
+{
+  uint8_t commits[AIMER_COMMIT_SIZE];              // one commitment at a time; absorbed right after it is produced
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE]; // seed tree of the current repetition only
+
+  hash_instance ctx;
+
+  // hash_instance for h_1; sign->salt must already be set by the caller
+  hash_init_prefix(&ctx, HASH_PREFIX_1);
+  hash_update(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx, sign->salt, AIMER_SALT_SIZE);
+
+  hash_instance ctx_precom; // shared prefix state cloned by every commit_and_expand_tape call
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    // compute parties' seeds using binary tree
+    expand_tree(nodes, sign->salt, rep, root_seeds[rep]);
+
+    // initialize adjustment values
+    tape_t delta;
+    memset(&delta, 0, sizeof(tape_t));
+
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      // generate execution views and commitments
+      tape_t tape;
+      commit_and_expand_tape(&tape, commits, &ctx_precom,
+                             nodes[AIMER_N + party - 1],  rep, party); // leaf seeds start at nodes[AIMER_N - 1]
+      hash_update(&ctx, commits, AIMER_COMMIT_SIZE);
+
+      // compute offsets: delta accumulates the sum of all parties' tape values
+      GF_add(delta.pt_share, delta.pt_share, tape.pt_share);
+      GF_add(delta.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+      GF_add(delta.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+      GF_add(delta.t_shares[2], delta.t_shares[2], tape.t_shares[2]);
+      GF_add(delta.a_share, delta.a_share, tape.a_share);
+      GF_add(delta.c_share, delta.c_share, tape.c_share);
+
+      // adjust the last share and prepare the proof and h_1
+      if (party == AIMER_N - 1)
+      {
+        GF_add(delta.pt_share, delta.pt_share, pt_GF);
+        GF_add(delta.t_shares[0], delta.t_shares[0], sbox_outputs[0]);
+        GF_add(delta.t_shares[1], delta.t_shares[1], sbox_outputs[1]);
+        GF_add(delta.t_shares[2], delta.t_shares[2], sbox_outputs[2]);
+        GF_mul_add_s(delta.c_share, pt_GF, delta.a_share); // presumably delta_c += pt * (sum of a shares)
+
+        GF_to_bytes(sign->proofs[rep].delta_pt_bytes, delta.pt_share);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[0], delta.t_shares[0]);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[1], delta.t_shares[1]);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[2], delta.t_shares[2]);
+        GF_to_bytes(sign->proofs[rep].delta_c_bytes, delta.c_share);
+      }
+    }
+
+    // NOTE: depend on the order of values in proof_t (delta_pt, delta_ts[0..2], delta_c read as one run)
+    hash_update(&ctx, sign->proofs[rep].delta_pt_bytes,
+                AIM2_NUM_BYTES_FIELD * (AIMER_L + 2));
+  }
+  hash_ctx_release(&ctx_precom);
+
+  // commit to salt, (all commitments of parties' seeds,
+  // delta_pt, delta_t, delta_c) for all repetitions
+  hash_final(&ctx);
+  hash_squeeze(&ctx, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+}
+
+void run_phase_1_to_3(signature_t *sign,
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE],
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b, const GF ct_GF)
+{
+  uint8_t commits[AIMER_COMMIT_SIZE];              // by-product of commit_and_expand_tape; not absorbed in this pass
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE]; // seed tree, re-expanded per repetition
+
+  GF pt_shares[AIMER_N];
+  GF alpha_v_shares[2][AIMER_N]; // [0][party] = alpha share, [1][party] = v share
+  GF epsilons[AIMER_L + 1];
+
+  // prepare h_2: reads sign->h_1 and sign->salt, so run_phase_1 must have run first
+  hash_instance ctx;
+  hash_init_prefix(&ctx, HASH_PREFIX_2);
+  hash_update(&ctx, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_update(&ctx, sign->salt, AIMER_SALT_SIZE);
+
+  // prepare epsilons: one stream expanded from h_1, squeezed once per repetition below
+  hash_instance ctx_e;
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  hash_instance ctx_precom; // same prefix/salt state as in run_phase_1, so tapes are regenerated identically
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  GF alpha = {0,};
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    // compute parties' seeds using binary tree
+    expand_tree(nodes, sign->salt, rep, root_seeds[rep]);
+
+    // scratch for the deltas read back from the proof (used for the last party only)
+    tape_t delta;
+    memset(&delta, 0, sizeof(tape_t));
+
+    // expand challenge
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF_set0(alpha); // alpha is re-accumulated from scratch for every repetition
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      // generate execution views and commitments
+      tape_t tape;
+      commit_and_expand_tape(&tape, commits, &ctx_precom,
+                             nodes[AIMER_N + party - 1], rep, party);
+
+      mult_chk_t mult_chk;
+      memset(&mult_chk, 0, sizeof(mult_chk)); // x_shares[AIMER_L] starts at 0 for every party but the last
+
+      // last party: apply the deltas stored in sign->proofs[rep] and start x_shares[AIMER_L] from vector_b
+      if (party == AIMER_N - 1)
+      {
+        GF_from_bytes(delta.pt_share, sign->proofs[rep].delta_pt_bytes);
+        GF_from_bytes(delta.t_shares[0], sign->proofs[rep].delta_ts_bytes[0]);
+        GF_from_bytes(delta.t_shares[1], sign->proofs[rep].delta_ts_bytes[1]);
+        GF_from_bytes(delta.t_shares[2], sign->proofs[rep].delta_ts_bytes[2]);
+        GF_from_bytes(delta.c_share, sign->proofs[rep].delta_c_bytes);
+
+        GF_add(tape.pt_share, delta.pt_share, tape.pt_share);
+        GF_add(tape.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+        GF_add(tape.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+        GF_add(tape.t_shares[2], delta.t_shares[2], tape.t_shares[2]);
+        GF_add(tape.c_share, delta.c_share, tape.c_share);
+
+        GF_copy(mult_chk.x_shares[AIMER_L], vector_b);
+      }
+
+      // run the MPC simulation and prepare the mult check inputs
+      GF_copy(mult_chk.x_shares[0], tape.t_shares[0]);
+      GF_copy(mult_chk.x_shares[1], tape.t_shares[1]);
+      GF_copy(mult_chk.x_shares[2], tape.t_shares[2]);
+      GF_copy(pt_shares[party], tape.pt_share);
+      GF_copy(alpha_v_shares[0][party], tape.a_share);
+      GF_copy(alpha_v_shares[1][party], tape.c_share);
+      aim2_mpc(&mult_chk, matrix_A, ct_GF);
+
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[0], epsilons[0]); // alpha share: a + sum x[i] * eps[i]
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[0], epsilons[0]); // v share:     c + sum z[i] * eps[i]
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[3], epsilons[3]); // index 3 == AIMER_L
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[3], epsilons[3]);
+      GF_add(alpha, alpha, alpha_v_shares[0][party]);
+    }
+
+    // alpha is opened, so we can finish calculating v_share
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      GF_mul_add(alpha_v_shares[1][party], pt_shares[party], alpha);
+    }
+    hash_update(&ctx, (const uint8_t *)alpha_v_shares, sizeof(alpha_v_shares)); // absorbed as raw GF memory
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+
+  hash_final(&ctx);
+  hash_squeeze(&ctx, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+}
+
+void run_phase_1_to_5(signature_t *sign,
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE],
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b)
+{
+  // prepare challenge parties: one byte per repetition expanded from sign->h_2
+  hash_instance ctx;
+  hash_init(&ctx);
+  hash_update(&ctx, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_final(&ctx);
+
+  uint8_t indices[AIMER_T]; // AIMER_N <= 256
+  hash_squeeze(&ctx, indices, AIMER_T);
+  hash_ctx_release(&ctx);
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    indices[rep] &= (1 << AIMER_LOGN) - 1; // keep the low AIMER_LOGN bits: a party index below 1 << AIMER_LOGN
+  }
+
+  uint8_t commits[AIMER_COMMIT_SIZE];
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE];
+
+  GF epsilons[AIMER_L + 1];
+
+  // prepare epsilons: same stream as in run_phase_1_to_3 (expanded from sign->h_1)
+  hash_instance ctx_e;
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  mult_chk_t mult_chk; // only x_shares is used in this function
+  memset(&mult_chk, 0, sizeof(mult_chk_t));
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    size_t i_bar = indices[rep]; // the party whose seed stays hidden in this repetition
+
+    // compute parties' seeds using binary tree
+    expand_tree(nodes, sign->salt, rep, root_seeds[rep]);
+    reveal_all_but(sign->proofs[rep].reveal_path,
+                   (const uint8_t (*)[AIMER_SEED_SIZE])nodes, i_bar);
+
+    // expand challenge
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF alpha_share = {0,};
+
+    tape_t tape; // only the hidden party's tape is regenerated here
+    commit_and_expand_tape(&tape, commits, &ctx_precom,
+                           nodes[AIMER_N + i_bar - 1], rep, i_bar);
+    memcpy(sign->proofs[rep].missing_commitment, commits,
+           AIMER_COMMIT_SIZE);
+
+    GF_set0(mult_chk.x_shares[AIMER_L]); // reset the accumulator reused across repetitions
+
+    // hidden party is the last one: apply the stored deltas and start x_shares[AIMER_L] from vector_b
+    if (i_bar == AIMER_N - 1)
+    {
+      // scratch for the deltas read back from the proof
+      tape_t delta;
+      memset(&delta, 0, sizeof(tape_t));
+
+      GF_from_bytes(delta.pt_share, sign->proofs[rep].delta_pt_bytes);
+      GF_from_bytes(delta.t_shares[0], sign->proofs[rep].delta_ts_bytes[0]);
+      GF_from_bytes(delta.t_shares[1], sign->proofs[rep].delta_ts_bytes[1]);
+      GF_from_bytes(delta.t_shares[2], sign->proofs[rep].delta_ts_bytes[2]);
+      GF_from_bytes(delta.c_share, sign->proofs[rep].delta_c_bytes);
+
+      GF_add(tape.pt_share, delta.pt_share, tape.pt_share);
+      GF_add(tape.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+      GF_add(tape.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+      GF_add(tape.t_shares[2], delta.t_shares[2], tape.t_shares[2]);
+      GF_add(tape.c_share, delta.c_share, tape.c_share);
+
+      GF_copy(mult_chk.x_shares[AIMER_L], vector_b);
+    }
+
+    for (size_t ell = 0; ell < AIMER_L; ell++)
+    {
+      GF_copy(mult_chk.x_shares[ell], tape.t_shares[ell]);
+
+      // x_* = sum_i A[i] * t[i] + b
+      GF_transposed_matmul_add(mult_chk.x_shares[AIMER_L],
+                               tape.t_shares[ell], matrix_A[ell]);
+    }
+    GF_copy(alpha_share, tape.a_share);
+
+    GF_mul_add(alpha_share, mult_chk.x_shares[0], epsilons[0]); // alpha share: a + sum x[i] * eps[i]
+    GF_mul_add(alpha_share, mult_chk.x_shares[1], epsilons[1]);
+    GF_mul_add(alpha_share, mult_chk.x_shares[2], epsilons[2]);
+    GF_mul_add(alpha_share, mult_chk.x_shares[3], epsilons[3]); // index 3 == AIMER_L
+    GF_to_bytes(sign->proofs[rep].missing_alpha_share_bytes, alpha_share);
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk)
+{
+  if (!pk || !sk)
+  {
+    return -1; // the only failure path: a NULL output pointer
+  }
+  // pk = iv || aim2 output; sk = random field bytes || copy of pk
+  randombytes(sk, AIM2_NUM_BYTES_FIELD); // secret part: first AIM2_NUM_BYTES_FIELD bytes of sk
+  randombytes(pk, AIM2_IV_SIZE);         // random initial vector: first AIM2_IV_SIZE bytes of pk
+
+  aim2(pk + AIM2_IV_SIZE, sk, pk); // output written after the iv; inputs are the secret bytes and the iv
+  memcpy(sk + AIM2_NUM_BYTES_FIELD, pk, AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+
+  return 0;
+}
+
+int crypto_sign_signature(uint8_t *sig, size_t *siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk)
+{
+  signature_t *sign = (signature_t *)sig; // signature is built in place; assumes sizeof(signature_t) == CRYPTO_BYTES - TODO confirm against api.h
+
+  GF pt_GF = {0,}, ct_GF = {0,};
+  GF_from_bytes(pt_GF, sk);                                        // secret bytes at the start of sk
+  GF_from_bytes(ct_GF, sk + AIM2_NUM_BYTES_FIELD + AIM2_IV_SIZE); // aim2 output stored after the iv in sk
+
+  // message pre-hashing: mu = H(prefix 0, iv || ct, m)
+  hash_instance ctx;
+  hash_init_prefix(&ctx, HASH_PREFIX_0);
+  hash_update(&ctx, sk + AIM2_NUM_BYTES_FIELD,
+              AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx, m, mlen);
+  hash_final(&ctx);
+
+  uint8_t mu[AIMER_COMMIT_SIZE];
+  hash_squeeze(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+
+  // compute first L sboxes' outputs
+  GF sbox_outputs[AIMER_L];
+  aim2_sbox_outputs(sbox_outputs, pt_GF);
+
+  // derive the binary matrix and the vector from the initial vector
+  GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+  generate_matrix_LU(matrix_A, vector_b, sk + AIM2_NUM_BYTES_FIELD);
+
+  // generate per-signature randomness
+  uint8_t random[SECURITY_BYTES];
+  randombytes(random, SECURITY_BYTES);
+
+  // generate salt and root seeds from (secret bytes, mu, fresh randomness)
+  hash_instance ctx_roots;
+  hash_init_prefix(&ctx_roots, HASH_PREFIX_3);
+  hash_update(&ctx_roots, sk, AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx_roots, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_roots, random, SECURITY_BYTES);
+  hash_final(&ctx_roots);
+  hash_squeeze(&ctx_roots, sign->salt, AIMER_SALT_SIZE); // salt is squeezed first and goes straight into the signature
+
+  uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE];
+  hash_squeeze(&ctx_roots, (uint8_t *)root_seeds, AIMER_T * AIMER_SEED_SIZE);
+  hash_ctx_release(&ctx_roots);
+  // Three passes over the same root seeds: each pass re-derives trees and tapes from them.
+  run_phase_1(sign, pt_GF, (const GF *)sbox_outputs, mu,
+              (const uint8_t (*)[AIMER_SEED_SIZE])root_seeds); // writes deltas and h_1
+
+  run_phase_1_to_3(sign, (const uint8_t (*)[AIMER_SEED_SIZE])root_seeds,
+                   (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A,
+                   vector_b, ct_GF); // writes h_2
+
+  run_phase_1_to_5(sign, (const uint8_t (*)[AIMER_SEED_SIZE])root_seeds,
+                   (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A,
+                   vector_b); // writes reveal paths, missing commitments and alpha shares
+  // NOTE(review): pt_GF, sbox_outputs, random and root_seeds are not wiped before returning.
+  *siglen = CRYPTO_BYTES;
+
+  return 0;
+}
+
+int crypto_sign(uint8_t *sm, size_t *smlen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk)
+{
+  crypto_sign_signature(sm + mlen, smlen, m, mlen, sk); // layout: m || signature
+  // memmove keeps in-place signing (sm == m) defined; memcpy must not overlap
+  memmove(sm, m, mlen);
+  *smlen += mlen; // signature length set above, plus the message length
+
+  return 0;
+}
+
+int crypto_sign_verify(const uint8_t *sig, size_t siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *pk)
+{
+  if (siglen != CRYPTO_BYTES) // only exact-length signatures are parsed
+  {
+    return -1;
+  }
+
+  const signature_t *sign = (const signature_t *)sig; // fields are read directly from the caller's buffer
+
+  GF ct_GF = {0,};
+  GF_from_bytes(ct_GF, pk + AIM2_IV_SIZE); // pk = iv || ct
+
+  // derive the binary matrix and the vector from the initial vector
+  GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+  generate_matrix_LU(matrix_A, vector_b, pk);
+
+  hash_instance ctx_e, ctx_h1, ctx_h2; // ctx_e and ctx_h1 are each released and re-initialized once below
+
+  // indices = Expand(h_2)
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  uint8_t indices[AIMER_T]; // AIMER_N <= 256
+  hash_squeeze(&ctx_e, indices, AIMER_T);
+  hash_ctx_release(&ctx_e);
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    indices[rep] &= (1 << AIMER_LOGN) - 1; // same masking as the signer's run_phase_1_to_5
+  }
+
+  // epsilons = Expand(h_1)
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  // message pre-hashing: mu = H(prefix 0, pk, m)
+  uint8_t mu[AIMER_COMMIT_SIZE];
+  hash_init_prefix(&ctx_h1, HASH_PREFIX_0);
+  hash_update(&ctx_h1, pk, AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx_h1, m, mlen);
+  hash_final(&ctx_h1);
+  hash_squeeze(&ctx_h1, mu, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h1);
+
+  // ready for computing h_1' and h_2'
+  hash_init_prefix(&ctx_h1, HASH_PREFIX_1);
+  hash_update(&ctx_h1, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_h1, sign->salt, AIMER_SALT_SIZE);
+
+  hash_init_prefix(&ctx_h2, HASH_PREFIX_2);
+  hash_update(&ctx_h2, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_h2, sign->salt, AIMER_SALT_SIZE);
+
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    size_t i_bar = indices[rep];                     // hidden party of this repetition
+    uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE]; // no root entry: tree node k is stored at nodes[k - 2]
+
+    reconstruct_tree(nodes, sign->salt, sign->proofs[rep].reveal_path,
+                     rep, i_bar);
+
+    GF pt_shares[AIMER_N];         // entry i_bar is never written or read
+    GF alpha_v_shares[2][AIMER_N]; // [0] = alpha shares, [1] = v shares
+    GF_set0(alpha_v_shares[1][i_bar]); // accumulator: ends up as the sum of the other parties' v shares
+
+    GF epsilons[AIMER_L + 1];
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF alpha = {0,};
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      if (party == i_bar) // hidden party: commitment and alpha share are taken from the proof
+      {
+        hash_update(&ctx_h1, sign->proofs[rep].missing_commitment,
+                    AIMER_COMMIT_SIZE);
+        GF_from_bytes(alpha_v_shares[0][i_bar],
+                      sign->proofs[rep].missing_alpha_share_bytes);
+        GF_add(alpha, alpha, alpha_v_shares[0][i_bar]);
+        continue;
+      }
+
+      tape_t tape;
+      uint8_t commit[AIMER_COMMIT_SIZE];
+      commit_and_expand_tape(&tape, commit, &ctx_precom,
+                             nodes[AIMER_N + party - 2], rep, party); // leaf offset is -2 here (signer uses -1)
+      hash_update(&ctx_h1, commit, AIMER_COMMIT_SIZE);
+
+      // adjust last shares
+      mult_chk_t mult_chk;
+      memset(&mult_chk, 0, sizeof(mult_chk_t));
+      if (party == AIMER_N - 1)
+      {
+        GF temp = {0,};
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_pt_bytes);
+        GF_add(tape.pt_share, tape.pt_share, temp);
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[0]);
+        GF_add(tape.t_shares[0], tape.t_shares[0], temp);
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[1]);
+        GF_add(tape.t_shares[1], tape.t_shares[1], temp);
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[2]);
+        GF_add(tape.t_shares[2], tape.t_shares[2], temp);
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_c_bytes);
+        GF_add(tape.c_share, tape.c_share, temp);
+
+        GF_copy(mult_chk.x_shares[AIMER_L], vector_b);
+      }
+
+      // run the MPC simulation and prepare the mult check inputs
+      GF_copy(mult_chk.x_shares[0], tape.t_shares[0]);
+      GF_copy(mult_chk.x_shares[1], tape.t_shares[1]);
+      GF_copy(mult_chk.x_shares[2], tape.t_shares[2]);
+      GF_copy(pt_shares[party], tape.pt_share);
+      GF_copy(alpha_v_shares[0][party], tape.a_share);
+      GF_copy(alpha_v_shares[1][party], tape.c_share);
+      aim2_mpc(&mult_chk, (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A, ct_GF);
+
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[0], epsilons[0]); // alpha share: a + sum x[i] * eps[i]
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[0], epsilons[0]); // v share:     c + sum z[i] * eps[i]
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[3], epsilons[3]); // index 3 == AIMER_L
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[3], epsilons[3]);
+
+      GF_add(alpha, alpha, alpha_v_shares[0][party]);
+    }
+
+    // alpha is opened, so we can finish calculating v_share
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      if (party == i_bar)
+      {
+        continue;
+      }
+
+      GF_mul_add(alpha_v_shares[1][party], pt_shares[party], alpha);
+      GF_add(alpha_v_shares[1][i_bar], alpha_v_shares[1][i_bar],
+             alpha_v_shares[1][party]);
+    }
+
+    // v is opened
+    hash_update(&ctx_h2, (const uint8_t *)alpha_v_shares,
+                sizeof(alpha_v_shares));
+
+    // NOTE: depend on the order of values in proof_t (delta_pt, delta_ts[0..2], delta_c read as one run)
+    hash_update(&ctx_h1, sign->proofs[rep].delta_pt_bytes,
+                AIM2_NUM_BYTES_FIELD * (AIMER_L + 2));
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+
+  uint8_t h_1_prime[AIMER_COMMIT_SIZE];
+  hash_final(&ctx_h1);
+  hash_squeeze(&ctx_h1, h_1_prime, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h1);
+
+  uint8_t h_2_prime[AIMER_COMMIT_SIZE];
+  hash_final(&ctx_h2);
+  hash_squeeze(&ctx_h2, h_2_prime, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h2);
+
+  if (memcmp(h_1_prime, sign->h_1, AIMER_COMMIT_SIZE) != 0 || // accept only if both recomputed
+      memcmp(h_2_prime, sign->h_2, AIMER_COMMIT_SIZE) != 0)   // hashes equal the ones in the signature
+  {
+    return -1;
+  }
+
+  return 0;
+}
+
+int crypto_sign_open(uint8_t *m, size_t *mlen,
+        const uint8_t *sm, size_t smlen,
+        const uint8_t *pk)
+{
+  if (smlen < CRYPTO_BYTES) // too short to contain a signature; also guards the subtraction below
+  {
+    return -1;
+  }
+  // sm layout: message || signature, with the signature in the last CRYPTO_BYTES bytes
+  const size_t message_len = smlen - CRYPTO_BYTES;
+  const uint8_t *message = sm;
+  const uint8_t *signature = sm + message_len;
+
+  if (crypto_sign_verify(signature, CRYPTO_BYTES, message, message_len, pk))
+  {
+    return -1; // on failure neither m nor *mlen is written
+  }
+
+  memmove(m, message, message_len); // memmove: m may overlap sm
+  *mlen = message_len;
+
+  return 0;
+}
diff --git a/crypto_sign/aimer256f/m4stack/sign.h b/crypto_sign/aimer256f/m4stack/sign.h
new file mode 100644
index 00000000..2e2ab61d
--- /dev/null
+++ b/crypto_sign/aimer256f/m4stack/sign.h
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef SIGN_H
+#define SIGN_H
+
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+typedef struct tape_t // one party's tape; commit_and_expand_tape fills it as raw bytes from hash_squeeze
+{
+  GF pt_share;          // summed over parties and offset by pt_GF in run_phase_1
+  GF t_shares[AIMER_L]; // summed over parties and offset by sbox_outputs[] in run_phase_1
+  GF a_share;           // starting value of the party's alpha share
+  GF c_share;           // starting value of the party's v share
+} tape_t;
+
+typedef struct mult_chk_t // working values of aim2_mpc for one party
+{
+  GF x_shares[AIMER_L + 1]; // [0..AIMER_L-1] copied from tape.t_shares; [AIMER_L] accumulated by aim2_mpc
+  GF z_shares[AIMER_L + 1]; // written by aim2_mpc, one per x_shares entry
+} mult_chk_t;
+
+typedef struct proof_t // per-repetition part of the signature; byte arrays only
+{
+  uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE];      // sibling seeds written by reveal_all_but
+  uint8_t missing_commitment[AIMER_COMMIT_SIZE];         // commitment of the hidden party
+  uint8_t delta_pt_bytes[AIM2_NUM_BYTES_FIELD];          // start of the run hashed into h_1: the next
+  uint8_t delta_ts_bytes[AIMER_L][AIM2_NUM_BYTES_FIELD]; // two members must follow directly, in this
+  uint8_t delta_c_bytes[AIM2_NUM_BYTES_FIELD];           // order (AIMER_L + 2 field elements in total)
+  uint8_t missing_alpha_share_bytes[AIM2_NUM_BYTES_FIELD]; // alpha share of the hidden party
+} proof_t;
+
+typedef struct signature_t // sign.c casts the caller's signature buffer to this type
+{
+  uint8_t salt[AIMER_SALT_SIZE];  // squeezed in crypto_sign_signature
+  uint8_t h_1[AIMER_COMMIT_SIZE]; // written by run_phase_1
+  uint8_t h_2[AIMER_COMMIT_SIZE]; // written by run_phase_1_to_3
+  proof_t proofs[AIMER_T];        // one proof per repetition
+} signature_t;
+
+#define aim2_mpc AIMER_NAMESPACE(aim2_mpc) // fills z_shares[] and x_shares[AIMER_L] of *mult_chk
+void aim2_mpc(mult_chk_t *mult_chk,
+              const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+              const GF ct_GF);
+
+#define commit_and_expand_tape AIMER_NAMESPACE(commit_and_expand_tape) // seed -> commit[AIMER_COMMIT_SIZE] and *tape
+void commit_and_expand_tape(tape_t *tape, uint8_t *commit,
+                            const hash_instance *ctx_precom,
+                            const uint8_t seed[AIMER_SEED_SIZE],
+                            size_t rep, size_t party);
+
+#define run_phase_1 AIMER_NAMESPACE(run_phase_1) // reads sign->salt; writes the delta fields of sign->proofs and sign->h_1
+void run_phase_1(signature_t *sign,
+                 const GF pt_GF, const GF sbox_outputs[AIMER_L],
+                 const uint8_t mu[AIMER_COMMIT_SIZE],
+                 const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE]);
+
+#define run_phase_1_to_3 AIMER_NAMESPACE(run_phase_1_to_3) // reads salt, h_1 and the deltas; writes sign->h_2
+void run_phase_1_to_3(signature_t *sign,
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE],
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b, const GF ct_GF);
+
+#define run_phase_1_to_5 AIMER_NAMESPACE(run_phase_1_to_5) // reads salt, h_1, h_2; writes reveal_path, missing_commitment, missing_alpha_share_bytes
+void run_phase_1_to_5(signature_t *sign,
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE],
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b);
+
+#endif // SIGN_H
diff --git a/crypto_sign/aimer256f/m4stack/tree.c b/crypto_sign/aimer256f/m4stack/tree.c
new file mode 100644
index 00000000..0585e7fa
--- /dev/null
+++ b/crypto_sign/aimer256f/m4stack/tree.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: MIT
+
+#include "tree.h"
+#include "hash.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+//  Example of tree for [N = 8]
+//  x
+//  d = 0: 1
+//  d = 1: 2         3
+//  d = 2: 4   5     6     7
+//  d = 3: 8 9 10 11 12 13 14 15
+
+void expand_tree(uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                 const uint8_t salt[AIMER_SALT_SIZE],
+                 size_t rep_index,
+                 const uint8_t root_seed[AIMER_SEED_SIZE])
+{
+  size_t index;
+  uint8_t buffer[AIMER_SEED_SIZE + 2]; // rep byte || node-index byte || parent seed
+  // Fills all 2 * AIMER_N - 1 seeds; tree node k (1-based, root = 1) is stored at nodes[k - 1].
+  hash_instance ctx, ctx_;
+  hash_init_prefix(&ctx_, HASH_PREFIX_4); // ctx_ holds the shared prefix || salt state, cloned per node
+  hash_update(&ctx_, salt, AIMER_SALT_SIZE);
+
+  memcpy(nodes[0], root_seed, AIMER_SEED_SIZE);
+  buffer[0] = (uint8_t)(rep_index); // only the low 8 bits of rep_index are absorbed
+  for (index = 1; index < AIMER_N; index++) // every internal node, parents before children
+  {
+    buffer[1] = (uint8_t)(index);
+    memcpy(buffer + 2, nodes[index - 1], AIMER_SEED_SIZE);
+
+    hash_ctx_clone(&ctx, &ctx_);
+    hash_update(&ctx, buffer, AIMER_SEED_SIZE + 2);
+    hash_final(&ctx);
+    hash_squeeze(&ctx, nodes[2 * index - 1], AIMER_SEED_SIZE << 1); // two seeds at once: children 2*index and 2*index + 1
+    hash_ctx_release(&ctx);
+  }
+  hash_ctx_release(&ctx_);
+}
+
+void reveal_all_but(uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                    const uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                    size_t cover_index)
+{
+  size_t index = cover_index + AIMER_N; // 1-based tree index of the leaf to keep hidden
+  for (size_t depth = 0; depth < AIMER_LOGN; depth++) // reveal_path[0] is the leaf-level sibling
+  {
+    // index ^ 1 is sibling index; the -1 converts the 1-based tree index to the nodes[] offset
+    memcpy(reveal_path[depth], nodes[(index ^ 1) - 1], AIMER_SEED_SIZE);
+
+    // go to parent node
+    index >>= 1;
+  }
+}
+
+void reconstruct_tree(uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE],
+                      const uint8_t salt[AIMER_SALT_SIZE],
+                      const uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                      size_t rep_index,
+                      size_t cover_index)
+{
+  size_t index, depth, path;
+  uint8_t buffer[AIMER_SEED_SIZE + 2]; // rep byte || node-index byte || parent seed
+  // The root is not stored here: tree node k (1-based) lives at nodes[k - 2].
+  hash_instance ctx, ctx_;
+  hash_init_prefix(&ctx_, HASH_PREFIX_4);
+  hash_update(&ctx_, salt, AIMER_SALT_SIZE);
+
+  for (depth = 1; depth < AIMER_LOGN; depth++)
+  {
+    path = ((cover_index + AIMER_N) >> (AIMER_LOGN - depth)) ^ 1; // sibling of the hidden path at this depth
+    memcpy(nodes[path - 2], reveal_path[AIMER_LOGN - depth], AIMER_SEED_SIZE);
+    // NOTE(review): the hidden-path node of this level is expanded too, although its entry was never written.
+    for (index = (1U << depth); index < (2U << depth); index++) // every node at this depth
+    {
+      buffer[0] = (uint8_t)(rep_index);
+      buffer[1] = (uint8_t)(index);
+      memcpy(buffer + 2, nodes[index - 2], AIMER_SEED_SIZE);
+
+      hash_ctx_clone(&ctx, &ctx_);
+      hash_update(&ctx, buffer, AIMER_SEED_SIZE + 2);
+      hash_final(&ctx);
+      hash_squeeze(&ctx, nodes[2 * index - 2], AIMER_SEED_SIZE << 1); // two seeds at once: children 2*index and 2*index + 1
+      hash_ctx_release(&ctx);
+    }
+  }
+  hash_ctx_release(&ctx_);
+  // depth == AIMER_LOGN here: overwrite the leaf-level sibling with reveal_path[0]
+  path = ((cover_index + AIMER_N) >> (AIMER_LOGN - depth)) ^ 1;
+  memcpy(nodes[path - 2], reveal_path[AIMER_LOGN - depth], AIMER_SEED_SIZE);
+}
diff --git a/crypto_sign/aimer256f/m4stack/tree.h b/crypto_sign/aimer256f/m4stack/tree.h
new file mode 100644
index 00000000..f1354f19
--- /dev/null
+++ b/crypto_sign/aimer256f/m4stack/tree.h
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef TREE_H
+#define TREE_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define expand_tree AIMER_NAMESPACE(expand_tree) // root seed -> all 2 * AIMER_N - 1 seeds; node k at nodes[k - 1]
+void expand_tree(uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                 const uint8_t salt[AIMER_SALT_SIZE],
+                 size_t rep_index,
+                 const uint8_t root_seed[AIMER_SEED_SIZE]);
+
+#define reveal_all_but AIMER_NAMESPACE(reveal_all_but) // copies the AIMER_LOGN sibling seeds along the path of leaf cover_index
+void reveal_all_but(uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                    const uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                    size_t cover_index);
+
+#define reconstruct_tree AIMER_NAMESPACE(reconstruct_tree) // rebuilds seeds from reveal_path; no root entry, node k at nodes[k - 2]
+void reconstruct_tree(uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE],
+                      const uint8_t salt[AIMER_SALT_SIZE],
+                      const uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                      size_t rep_index,
+                      size_t cover_index);
+
+#endif // TREE_H
diff --git a/crypto_sign/aimer256s/m4speed/__asm_field.S b/crypto_sign/aimer256s/m4speed/__asm_field.S
new file mode 100644
index 00000000..6181c602
--- /dev/null
+++ b/crypto_sign/aimer256s/m4speed/__asm_field.S
@@ -0,0 +1,695 @@
+#include "params.h"
+
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+.text
+
+.align  2
+.global AIMER_NAMESPACE(GF_to_bytes)
+.global AIMER_NAMESPACE(GF_from_bytes)
+.global AIMER_NAMESPACE(GF_copy)
+.type   AIMER_NAMESPACE(GF_to_bytes), %function
+.type   AIMER_NAMESPACE(GF_from_bytes), %function
+.type   AIMER_NAMESPACE(GF_copy), %function
+AIMER_NAMESPACE(GF_to_bytes):    // the three symbols label one routine:
+AIMER_NAMESPACE(GF_from_bytes):  // copy eight 32-bit words (32 bytes)
+AIMER_NAMESPACE(GF_copy):        // from [R1] to [R0]
+  out_p       .req R0            // destination pointer
+  in_p        .req R1            // source pointer
+
+  .equ width, 4                  // bytes per 32-bit word
+  // Words move in pairs through R2/R3 only, so no register is saved.
+  ldr.w R2, [in_p, #0 * width]
+  ldr.w R3, [in_p, #1 * width]
+  str.w R2, [out_p, #0 * width]
+  str.w R3, [out_p, #1 * width]
+
+  ldr.w R2, [in_p, #2 * width]
+  ldr.w R3, [in_p, #3 * width]
+  str.w R2, [out_p, #2 * width]
+  str.w R3, [out_p, #3 * width]
+
+  ldr.w R2, [in_p, #4 * width]
+  ldr.w R3, [in_p, #5 * width]
+  str.w R2, [out_p, #4 * width]
+  str.w R3, [out_p, #5 * width]
+
+  ldr.w R2, [in_p, #6 * width]
+  ldr.w R3, [in_p, #7 * width]
+  str.w R2, [out_p, #6 * width]
+  str.w R3, [out_p, #7 * width]
+  // NOTE(review): the byte-buffer aliases presumably rely on the core accepting unaligned ldr/str - confirm.
+  bx    lr
+
+  .unreq in_p
+  .unreq out_p
+
+.align  2
+.global AIMER_NAMESPACE(GF_set0)
+.type   AIMER_NAMESPACE(GF_set0), %function
+AIMER_NAMESPACE(GF_set0):        // store zero to the eight 32-bit words at [R0]
+  out_p       .req R0            // destination pointer
+
+  .equ width, 4                  // bytes per 32-bit word
+
+  mov.w R2, #0                   // R2 is the only scratch register
+  str.w R2, [out_p, #0 * width]
+  str.w R2, [out_p, #1 * width]
+  str.w R2, [out_p, #2 * width]
+  str.w R2, [out_p, #3 * width]
+  str.w R2, [out_p, #4 * width]
+  str.w R2, [out_p, #5 * width]
+  str.w R2, [out_p, #6 * width]
+  str.w R2, [out_p, #7 * width]
+
+  bx    lr
+
+  .unreq out_p
+
+.align  2
+.global AIMER_NAMESPACE(GF_add)
+.type   AIMER_NAMESPACE(GF_add), %function
+AIMER_NAMESPACE(GF_add):         // [R0] = [R1] xor [R2], eight 32-bit words
+  out_p       .req R0            // destination pointer
+  in0_p       .req R1            // first operand
+  in1_p       .req R2            // second operand
+
+  .equ width, 4                  // bytes per 32-bit word
+  // One word per group; R3 and R12 are the only scratch registers.
+  ldr.w R3,  [in0_p, #0 * width]
+  ldr.w R12, [in1_p, #0 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #0 * width]
+
+  ldr.w R3,  [in0_p, #1 * width]
+  ldr.w R12, [in1_p, #1 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #1 * width]
+
+  ldr.w R3,  [in0_p, #2 * width]
+  ldr.w R12, [in1_p, #2 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #2 * width]
+
+  ldr.w R3,  [in0_p, #3 * width]
+  ldr.w R12, [in1_p, #3 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #3 * width]
+
+  ldr.w R3,  [in0_p, #4 * width]
+  ldr.w R12, [in1_p, #4 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #4 * width]
+
+  ldr.w R3,  [in0_p, #5 * width]
+  ldr.w R12, [in1_p, #5 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #5 * width]
+
+  ldr.w R3,  [in0_p, #6 * width]
+  ldr.w R12, [in1_p, #6 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #6 * width]
+
+  ldr.w R3,  [in0_p, #7 * width]
+  ldr.w R12, [in1_p, #7 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #7 * width]
+
+  bx    lr
+
+  .unreq out_p
+  .unreq in0_p
+  .unreq in1_p
+
+.macro or_shift_and in_a, con_a, off_a
+  orr.w \in_a, \in_a, \in_a, lsl #\off_a   // merge the register with a copy shifted left by off_a
+  and.w \in_a, \in_a, \con_a               // keep only the bits selected by the constant con_a
+.endm
+
+.align  2
+.global AIMER_NAMESPACE(GF_sqr_s)
+.type   AIMER_NAMESPACE(GF_sqr_s), %function
+AIMER_NAMESPACE(GF_sqr_s):       // eight words at [R0] = square of the eight words at [R1]
+  out_p       .req R0
+  in_p        .req R1
+  // in0..in9 carry 32-bit halves of the 64-bit temp[] words named in the comments below.
+  in0         .req R2
+  in1         .req R3
+  in2         .req R4
+  in3         .req R5
+  in4         .req R6
+  in5         .req R7
+  in6         .req R8
+  in7         .req R9
+  in8         .req R10
+  in9         .req R11
+  // C0..C3 are the masks of the four bit-spreading rounds; C4 >> 16 keeps a low halfword.
+  .equ C0,    1431655765  // 0x55555555
+  .equ C1,    858993459   // 0x33333333
+  .equ C2,    252645135   // 0x0F0F0F0F
+  .equ C3,    16711935    // 0x00FF00FF
+  .equ C4,    4294967295  // 0xFFFFFFFF
+  .equ width, 4
+
+  push.w    {R4-R12, lr}
+  // Upper half of the input first: a[2] and a[3] are spread into temp[4..7].
+  ldr.w in0, [in_p, #4 * width]  // a[2]
+  ldr.w in2, [in_p, #5 * width]
+  ldr.w in4, [in_p, #6 * width]  // a[3]
+  ldr.w in6, [in_p, #7 * width]  
+
+  lsr.w in1, in0, #16
+  lsr.w in3, in2, #16
+  lsr.w in5, in4, #16
+  lsr.w in7, in6, #16
+
+  mov.w R12, C4
+
+  and.w in0, in0, R12, lsr #16
+  and.w in2, in2, R12, lsr #16
+  and.w in4, in4, R12, lsr #16
+  and.w in6, in6, R12, lsr #16
+  // Four or_shift_and rounds move bit i of each halfword to bit 2i of its word.
+  or_shift_and in0, C3, 8
+  or_shift_and in1, C3, 8
+  or_shift_and in2, C3, 8
+  or_shift_and in3, C3, 8
+  or_shift_and in4, C3, 8
+  or_shift_and in5, C3, 8
+  or_shift_and in6, C3, 8
+  or_shift_and in7, C3, 8
+
+  or_shift_and in0, C2, 4
+  or_shift_and in1, C2, 4
+  or_shift_and in2, C2, 4
+  or_shift_and in3, C2, 4
+  or_shift_and in4, C2, 4
+  or_shift_and in5, C2, 4
+  or_shift_and in6, C2, 4
+  or_shift_and in7, C2, 4
+
+  or_shift_and in0, C1, 2
+  or_shift_and in1, C1, 2
+  or_shift_and in2, C1, 2
+  or_shift_and in3, C1, 2
+  or_shift_and in4, C1, 2
+  or_shift_and in5, C1, 2
+  or_shift_and in6, C1, 2
+  or_shift_and in7, C1, 2
+
+  or_shift_and in0, C0, 1
+  or_shift_and in1, C0, 1
+  or_shift_and in2, C0, 1
+  or_shift_and in3, C0, 1
+  or_shift_and in4, C0, 1
+  or_shift_and in5, C0, 1
+  or_shift_and in6, C0, 1
+  or_shift_and in7, C0, 1
+
+  // t = temp[4] ^ ((temp[7] >> 54) ^ (temp[7] >> 59) ^ (temp[7] >> 62));
+  eor.w in0, in0, in7, lsr #22
+  eor.w in0, in0, in7, lsr #27
+  eor.w in0, in0, in7, lsr #30
+
+  push.w {in2, in3}              // temp[5]
+
+  ldr.w in2, [in_p, #2 * width]  // a[1]
+  ldr.w in8, [in_p, #3 * width]
+
+  lsr.w in3, in2, #16
+  lsr.w in9, in8, #16
+
+  and.w in2, in2, R12, lsr #16
+  and.w in8, in8, R12, lsr #16
+
+  or_shift_and in2, C3, 8
+  or_shift_and in3, C3, 8
+  or_shift_and in8, C3, 8
+  or_shift_and in9, C3, 8
+
+  or_shift_and in2, C2, 4
+  or_shift_and in3, C2, 4
+  or_shift_and in8, C2, 4
+  or_shift_and in9, C2, 4
+
+  or_shift_and in2, C1, 2
+  or_shift_and in3, C1, 2
+  or_shift_and in8, C1, 2
+  or_shift_and in9, C1, 2
+
+  or_shift_and in2, C0, 1
+  or_shift_and in3, C0, 1
+  or_shift_and in8, C0, 1
+  or_shift_and in9, C0, 1
+  // Reduction: temp[i + 4] is folded into c[i] with shifts 0, 2, 5, 10 (x^256 = x^10 + x^5 + x^2 + 1).
+  // c[3] = temp[3] ^ temp[7];
+  eor.w in8, in8, in6
+  eor.w in9, in9, in7
+
+  // c[3] ^= (temp[7] << 10) | (temp[6] >> 54);  c[3] ^= (temp[7] <<  5) | (temp[6] >> 59);
+  // c[3] ^= (temp[7] <<  2) | (temp[6] >> 62);
+  eor.w in8, in8, in5, lsr #22
+  eor.w in8, in8, in5, lsr #27
+  eor.w in8, in8, in5, lsr #30
+
+  eor.w in8, in8, in6, lsl #10
+  eor.w in8, in8, in6, lsl #5
+  eor.w in8, in8, in6, lsl #2
+
+  eor.w in9, in9, in6, lsr #22
+  eor.w in9, in9, in6, lsr #27
+  eor.w in9, in9, in6, lsr #30
+
+  eor.w in9, in9, in7, lsl #10
+  eor.w in9, in9, in7, lsl #5
+  eor.w in9, in9, in7, lsl #2
+
+  str.w in8, [out_p, #6 * width]
+  str.w in9, [out_p, #7 * width]
+
+  // c[2] = temp[2] ^ temp[6];
+  eor.w in2, in2, in4
+  eor.w in3, in3, in5
+
+  // c[2] ^= (temp[6] << 10) | (temp[5] >> 54);
+  // c[2] ^= (temp[6] <<  5) | (temp[5] >> 59);
+  // c[2] ^= (temp[6] <<  2) | (temp[5] >> 62);
+
+  pop.w {in6, in7}               // temp[5], saved above, reuses the registers that held temp[7]
+
+  eor.w in2, in2, in7, lsr #22
+  eor.w in2, in2, in7, lsr #27
+  eor.w in2, in2, in7, lsr #30
+
+  eor.w in2, in2, in4, lsl #10
+  eor.w in2, in2, in4, lsl #5
+  eor.w in2, in2, in4, lsl #2
+
+  eor.w in3, in3, in4, lsr #22
+  eor.w in3, in3, in4, lsr #27
+  eor.w in3, in3, in4, lsr #30
+
+  eor.w in3, in3, in5, lsl #10
+  eor.w in3, in3, in5, lsl #5
+  eor.w in3, in3, in5, lsl #2
+
+  str.w in2, [out_p, #4 * width]
+  str.w in3, [out_p, #5 * width]
+
+  ldr.w in2, [in_p, #0 * width]  // a[0]
+  ldr.w in4, [in_p, #1 * width]
+
+  lsr.w in3, in2, #16
+  lsr.w in5, in4, #16
+
+  and.w in2, in2, R12, lsr #16
+  and.w in4, in4, R12, lsr #16
+
+  or_shift_and in2, C3, 8
+  or_shift_and in3, C3, 8
+  or_shift_and in4, C3, 8
+  or_shift_and in5, C3, 8
+
+  or_shift_and in2, C2, 4
+  or_shift_and in3, C2, 4
+  or_shift_and in4, C2, 4
+  or_shift_and in5, C2, 4
+
+  or_shift_and in2, C1, 2
+  or_shift_and in3, C1, 2
+  or_shift_and in4, C1, 2
+  or_shift_and in5, C1, 2
+
+  or_shift_and in2, C0, 1
+  or_shift_and in3, C0, 1
+  or_shift_and in4, C0, 1
+  or_shift_and in5, C0, 1
+
+  // c[1] = temp[1] ^ temp[5];
+  eor.w in4, in4, in6
+  eor.w in5, in5, in7
+
+  // c[1] ^= (temp[5] << 10) | (t >> 54);
+  // c[1] ^= (temp[5] <<  5) | (t >> 59);
+  // c[1] ^= (temp[5] <<  2) | (t >> 62);
+  eor.w in4, in4, in1, lsr #22
+  eor.w in4, in4, in1, lsr #27
+  eor.w in4, in4, in1, lsr #30
+
+  eor.w in4, in4, in6, lsl #10
+  eor.w in4, in4, in6, lsl #5
+  eor.w in4, in4, in6, lsl #2
+
+  eor.w in5, in5, in6, lsr #22
+  eor.w in5, in5, in6, lsr #27
+  eor.w in5, in5, in6, lsr #30
+
+  eor.w in5, in5, in7, lsl #10
+  eor.w in5, in5, in7, lsl #5
+  eor.w in5, in5, in7, lsl #2
+
+  str.w in4, [out_p, #2 * width]
+  str.w in5, [out_p, #3 * width]
+
+  // c[0] = temp[0] ^ t;
+  eor.w in2, in2, in0
+  eor.w in3, in3, in1
+
+  // c[0] ^= (t << 10);
+  // c[0] ^= (t << 5);
+  // c[0] ^= (t << 2);
+  eor.w in2, in2, in0, lsl #10
+  eor.w in2, in2, in0, lsl #5
+  eor.w in2, in2, in0, lsl #2
+
+  eor.w in3, in3, in0, lsr #22
+  eor.w in3, in3, in0, lsr #27
+  eor.w in3, in3, in0, lsr #30
+
+  eor.w in3, in3, in1, lsl #10
+  eor.w in3, in3, in1, lsl #5
+  eor.w in3, in3, in1, lsl #2
+
+  str.w in2, [out_p, #0 * width]
+  str.w in3, [out_p, #1 * width]
+
+  pop.w {R4-R12, pc}
+
+  // unlink register names (only the two pointer aliases; in0..in9 stay defined)
+  .unreq in_p
+  .unreq out_p
+
+.macro lut_access0_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  and \sp1, \mask, \b0_1, lsr #\offset   // masked index taken from b0_1
+  and \sp0, \mask, \b0_0, lsr #\offset   // masked index taken from b0_0
+  add \sp1, \sp1, sp                     // both indices are offsets from sp
+  add \sp0, \sp0, sp
+  // First lookup: the accumulator words in0_0..in0_2 are loaded here rather than xored into.
+  ldr \out1_0, [\sp1, #0]
+  ldr \in0_2, [\sp1, #4]
+
+  ldr \in0_0, [\sp0, #0]
+  ldr \in0_1, [\sp0, #4]
+  eor \in0_1, \in0_1, \out1_0            // the b0_1 entry lands one word above the b0_0 entry
+.endm
+
+.macro lut_access0_1_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  and \sp1, \mask, \b0_1, lsr #\offset
+  and \sp0, \mask, \b0_0, lsr #\offset
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+  // Second lookup: in0_3 is created from the bits shifted out of in0_2, then in0_0..in0_2 move left by 4.
+  lsr \in0_3, \in0_2, #28
+  ldr \out1_0, [\sp1, #0]
+  ldr \out1_1, [\sp1, #4]
+
+  lsl \in0_2, \in0_2, #4
+  ldr \out0_0, [\sp0, #0]
+  ldr \out0_1, [\sp0, #4]
+
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  lsl \in0_1, \in0_1, #4
+  orr \in0_1, \in0_1, \in0_0, lsr #28
+  lsl \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0            // b0_1 entry goes into words 1 and 2
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0            // b0_0 entry goes into words 0 and 1
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro lut_access0_1 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  and \sp1, \mask, \b0_1, lsr #\offset
+  and \sp0, \mask, \b0_0, lsr #\offset
+  lsl \in0_3, \in0_3, #4
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+  // General step: shift the four-word accumulator in0_0..in0_3 left by 4, then xor both entries in.
+  orr \in0_3, \in0_3, \in0_2, lsr #28
+  ldr \out1_0, [\sp1, #0]
+  ldr \out1_1, [\sp1, #4]
+
+  lsl \in0_2, \in0_2, #4
+  ldr \out0_0, [\sp0, #0]
+  ldr \out0_1, [\sp0, #4]
+
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  lsl \in0_1, \in0_1, #4
+  orr \in0_1, \in0_1, \in0_0, lsr #28
+  lsl \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0            // b0_1 entry goes into words 1 and 2
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0            // b0_0 entry goes into words 0 and 1
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro lut_access1 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  lsl \in0_3, \in0_3, #4
+  and \sp1, \mask, \b0_1, lsl #\offset   // index is shifted left here, not right as in lut_access0_1
+  orr \in0_3, \in0_3, \in0_2, lsr #28
+  and \sp0, \mask, \b0_0, lsl #\offset
+  // Same accumulate step as lut_access0_1, with the entries fetched by ldmia.
+  lsl \in0_2, \in0_2, #4
+  add \sp1, \sp1, sp
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  add \sp0, \sp0, sp
+
+  lsl   \in0_1, \in0_1, #4
+  ldmia \sp1, {\out1_0-\out1_1}
+  orr   \in0_1, \in0_1, \in0_0, lsr #28
+  ldmia \sp0, {\out0_0-\out0_1}
+  lsl   \in0_0, \in0_0, #4
+
+  eor \in0_1, \in0_1, \out1_0
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, offset
+  and \mask, \one, \r0_ret, lsr #\offset   // isolate bit number offset of r0_ret
+  sub \mask, \zero, \mask                  // zero minus the bit: all ones when the bit is set
+  and \mask0_1, \b0_0, \mask
+  and \mask0_2, \b0_1, \mask
+  // Shift the masked b0_1:b0_0 pair left by offset across three words, without branching.
+  lsl \mask0_0, \mask0_1, #\offset
+  lsr \mask0_1, \mask0_1, #32 - \offset
+  orr \mask0_1, \mask0_1, \mask0_2, lsl #\offset
+  lsr \mask0_2, \mask0_2, #32 - \offset
+
+  eor \in0_1, \in0_1, \mask0_0             // xored in starting at accumulator word 1
+  eor \in0_2, \in0_2, \mask0_1
+  eor \in0_3, \in0_3, \mask0_2
+.endm
+
+.macro last_mask1 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, offset
+  sub \mask, \zero, \r0_ret, lsr #\offset  // no and with one here, unlike last_mask0
+  and \mask0_1, \b0_0, \mask
+  and \mask0_2, \b0_1, \mask
+  // Remaining steps are identical to last_mask0.
+  lsl \mask0_0, \mask0_1, #\offset
+  lsr \mask0_1, \mask0_1, #32 - \offset
+  orr \mask0_1, \mask0_1, \mask0_2, lsl #\offset
+  lsr \mask0_2, \mask0_2, #32 - \offset
+
+  eor \in0_1, \in0_1, \mask0_0
+  eor \in0_2, \in0_2, \mask0_1
+  eor \in0_3, \in0_3, \mask0_2
+.endm
+
+.global AIMER_NAMESPACE(poly64_mul)
+.type   AIMER_NAMESPACE(poly64_mul), %function
+AIMER_NAMESPACE(poly64_mul):     // xor-and-shift product of the 64-bit values at [R0] and [R1]; high half to [R2], low half to [R3]
+  t0_0    .req R0
+  t0_1    .req R1
+  t1_0    .req R2
+  t1_1    .req R3
+  t2_0    .req R4
+  t2_1    .req R5
+  t3_0    .req R6
+  t3_1    .req R7
+  t4_0    .req R8
+  t4_1    .req R9
+  t5_0    .req R10
+  t5_1    .req R11
+  // The t names serve the table build; the names below reuse the same registers in later phases.
+  r1_copy .req R12
+  t_base  .req R14 
+
+  sp0     .req R12
+  sp1     .req R14
+
+  b0_0    .req R0
+  b0_1    .req R1
+
+  in0_0   .req R2
+  in0_1   .req R3
+  in0_2   .req R4
+  in0_3   .req R5
+
+  out0_0  .req R6
+  out0_1  .req R7
+  out1_0  .req R8
+  out1_1  .req R9
+
+  mask    .req R10
+
+  zero    .req R6
+  one     .req R7
+
+  r0_ret  .req R8
+
+  mask0_0 .req R9
+  mask0_1 .req R11
+  mask0_2 .req R12
+
+  push  {R4-R11, lr}
+  push  {R2-R3}           // the two output pointers, reloaded from [sp, #132] and [sp, #136] at the end
+
+  ldr   t1_0, [R0, #0]
+  ldr   t1_1, [R0, #4]
+  push  {t1_1}            // unmasked high word of the first operand, read back as r0_ret
+
+  sub   sp, #128  // allocating space in the stack
+                  // 8 bytes * 16 = 128 bytes
+  mov   t_base, sp
+  mov   r1_copy, R1
+
+  mov   t0_0, #0          // table entry 0 is zero
+  mov   t0_1, #0
+
+  and   t1_1, #0x1FFFFFFF // drop the top three bits; last_mask0/last_mask1 account for them later
+
+  lsl   t2_1, t1_1, #1    // t2 = t1 << 1
+  orr   t2_1, t2_1, t1_0, lsr #31
+  lsl   t2_0, t1_0, #1
+
+  eor   t3_0, t1_0, t2_0  // t3 = t1 ^ t2
+  eor   t3_1, t1_1, t2_1
+
+  lsl   t4_1, t2_1, #1    // t4 = t2 << 1
+  orr   t4_1, t4_1, t2_0, lsr #31
+  lsl   t4_0, t2_0, #1
+
+  eor   t5_0, t1_0, t4_0  // t5 = t1 ^ t4
+  eor   t5_1, t1_1, t4_1
+
+  stmia t_base!, {t0_0-t5_1}  // 4 bytes X 12 elements  = 48 bytes (entries 0..5)
+
+  eor   t0_0, t2_0, t4_0  // entry 6 = entry 2 ^ entry 4
+  eor   t0_1, t2_1, t4_1
+
+  lsl   t2_1, t4_1, #1    // entry 8 = entry 4 << 1
+  orr   t2_1, t2_1, t4_0, lsr #31
+  lsl   t2_0, t4_0, #1
+
+  eor   t5_0, t2_0, t3_0  // entry 11 = entry 8 ^ entry 3
+  eor   t5_1, t2_1, t3_1
+
+  eor   t3_0, t2_0, t1_0  // entry 9 = entry 8 ^ entry 1
+  eor   t3_1, t2_1, t1_1
+
+  eor   t4_0, t4_0, t0_0  // entry 4 ^ entry 6 ...
+  eor   t4_1, t4_1, t0_1
+
+  eor   t4_0, t4_0, t2_0  // ... ^ entry 8 gives entry 10
+  eor   t4_1, t4_1, t2_1
+
+  eor   t1_0, t1_0, t0_0  // entry 7 = entry 1 ^ entry 6
+  eor   t1_1, t1_1, t0_1
+
+  stmia t_base!, {t0_0-t5_1}  // 4 bytes X 12 elements  = 48 bytes (entries 6..11)
+
+  eor   t1_0, t5_0, t0_0  // entry 13 = entry 11 ^ entry 6
+  eor   t1_1, t5_1, t0_1
+
+  eor   t2_0, t2_0, t0_0  // entry 14 = entry 8 ^ entry 6
+  eor   t2_1, t2_1, t0_1
+
+  eor   t3_0, t3_0, t0_0  // entry 15 = entry 9 ^ entry 6
+  eor   t3_1, t3_1, t0_1
+
+  eor   t0_0, t4_0, t0_0  // entry 12 = entry 10 ^ entry 6
+  eor   t0_1, t4_1, t0_1
+
+  stmia t_base, {t0_0-t3_1}  // 4 bytes X 8 elements  = 32 bytes (entries 12..15)
+
+  ldmia r1_copy, {b0_0-b0_1}  // second operand, consumed four bits at a time
+  mov   mask, #0x00000078     // four index bits, already scaled by the 8-byte entry size
+
+  lut_access0_0   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 25
+  lut_access0_1_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 21
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 17
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 13
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 9
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 5
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 1
+  lut_access1     sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 3
+
+  mov   zero, #0
+  mov   one,  #1
+  ldr   r0_ret, [sp, #128]    // high word of the first operand pushed before the table area
+  // Bits 29, 30 and 31 of that word were masked out of the table; add their contribution now.
+  last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 29
+  last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 30
+  last_mask1 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 31
+
+  ldr   R0, [sp, #132]        // pointer passed in R2
+  ldr   R1, [sp, #136]        // pointer passed in R3
+  add   sp, #140  // restoring stack
+
+  str   in0_0, [R1, #0]       // low 64 bits of the product
+  str   in0_1, [R1, #4]
+
+  str   in0_2, [R0, #0]       // high 64 bits of the product
+  str   in0_3, [R0, #4]
+
+  pop   {R4-R11, pc}
+
+  // unlink register name
+  .unreq t0_0
+  .unreq t0_1
+  .unreq t1_0
+  .unreq t1_1
+  .unreq t2_0
+  .unreq t2_1
+  .unreq t3_0
+  .unreq t3_1
+  .unreq t4_0
+  .unreq t4_1
+  .unreq t5_0
+  .unreq t5_1
+
+  .unreq r1_copy
+  .unreq t_base
+
+  .unreq sp0
+  .unreq sp1
+
+  .unreq b0_0
+  .unreq b0_1
+
+  .unreq in0_0
+  .unreq in0_1
+  .unreq in0_2
+  .unreq in0_3
+
+  .unreq out0_0
+  .unreq out0_1
+  .unreq out1_0
+  .unreq out1_1
+
+  .unreq mask
+
+  .unreq zero
+  .unreq one
+
+  .unreq r0_ret
+
+  .unreq mask0_0
+  .unreq mask0_1
+  .unreq mask0_2
diff --git a/crypto_sign/aimer256s/m4speed/aim2.c b/crypto_sign/aimer256s/m4speed/aim2.c
new file mode 100644
index 00000000..74e41922
--- /dev/null
+++ b/crypto_sign/aimer256s/m4speed/aim2.c
@@ -0,0 +1,598 @@
+// SPDX-License-Identifier: MIT
+
+#include "aim2.h"
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+// Inverse Mersenne S-box for e1 = 11: out = in ^ d with
+// d = (2 ^ 11 - 1) ^ (-1) mod (2 ^ 256 - 1)
+//   = 0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5
+// consumed from the top as: b6d6dadb5, then (b6, b6d6dadb5) five times
+void GF_exp_invmer_e_1(GF out, const GF in)
+{
+  size_t k;
+  GF acc = {0};
+  GF pw_5 = {0}, pw_6 = {0};
+  GF pw_a = {0}, pw_b = {0}, pw_d = {0};
+
+  // pw_d = in ^ 2 (temporarily), acc = in ^ 4
+  GF_sqr_s(pw_d, in);
+  GF_sqr_s(acc, pw_d);
+
+  // pw_5 = in ^ 5
+  GF_mul_s(pw_5, acc, in);
+  // pw_6 = in ^ 6
+  GF_mul_s(pw_6, pw_5, in);
+  // pw_a = in ^ 10, the square of in ^ 5
+  GF_sqr_s(pw_a, pw_5);
+  // pw_b = in ^ 11 = in ^ 5 * in ^ 6
+  GF_mul_s(pw_b, pw_5, pw_6);
+  // pw_d = in ^ 13 = in ^ 11 * in ^ 2
+  GF_mul_s(pw_d, pw_b, pw_d);
+
+  // acc = in ^ (0xb0); from it pw_5 becomes in ^ (0xb5) and pw_b in ^ (0xb6)
+  GF_sqr_s(acc, pw_b);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(pw_5, acc, pw_5);
+  GF_mul_s(pw_b, acc, pw_6);
+
+  // acc = in ^ (0xb6 d): one hex digit is four squarings plus one multiply
+  GF_sqr_s(acc, pw_b);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(acc, acc, pw_d);
+
+  // acc = in ^ (0xb6d 6)
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(acc, acc, pw_6);
+
+  // acc = in ^ (0xb6d6 d)
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(acc, acc, pw_d);
+
+  // acc = in ^ (0xb6d6d a)
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(acc, acc, pw_a);
+
+  // acc = in ^ (0xb6d6da d)
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(acc, acc, pw_d);
+
+  // pw_5 = in ^ (0xb6d6dad b5), the nine-digit block reused below
+  for (k = 8; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(pw_5, acc, pw_5);
+
+  // acc = in ^ (0xb6d6dadb5 b6): eight squarings in total, the first out of place
+  GF_sqr_s(acc, pw_5);
+  for (k = 7; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, pw_b);
+
+  // acc = in ^ (0xb6d6dadb5b6 b6d6dadb5)
+  for (k = 36; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, pw_5);
+
+  // acc = in ^ (0xb6d6dadb5b6b6d6dadb5 b6)
+  for (k = 8; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, pw_b);
+
+  // acc = in ^ (0xb6d6dadb5b6b6d6dadb5b6 b6d6dadb5)
+  for (k = 36; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, pw_5);
+
+  // acc = in ^ (0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5 b6)
+  for (k = 8; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, pw_b);
+
+  // acc = in ^ (0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5b6 b6d6dadb5)
+  for (k = 36; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, pw_5);
+
+  // acc = in ^ (0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5 b6)
+  for (k = 8; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, pw_b);
+
+  // acc = in ^ (0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5b6 b6d6dadb5)
+  for (k = 36; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, pw_5);
+
+  // acc = in ^ (0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5 b6)
+  for (k = 8; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, pw_b);
+
+  // out = in ^ (0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5b6 b6d6dadb5)
+  for (k = 36; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(out, acc, pw_5);
+}
+
+// Inverse Mersenne S-box for e2 = 141: out = in ^ d with
+// d = (2 ^ 141 - 1) ^ (-1) mod (2 ^ 256 - 1)
+//   = 0x2224448889112222444888911222244488911122244448891112224444889111
+// split as 2224448889112 2224448889112 222444 889 111 222444 4 889 111 222444 4 889 111
+void GF_exp_invmer_e_2(GF out, const GF in)
+{
+  size_t k;
+  GF acc = {0}, x11 = {0}, x111 = {0}, x222444 = {0}, xhead = {0};
+  GF x9 = {0};
+
+  // x9 = in ^ 9 and x11 = in ^ (0x11), both branching off in ^ 8
+  GF_sqr_s(acc, in);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(x9, acc, in);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(x11, acc, in);
+
+  // x111 = in ^ (0x111)
+  GF_sqr_s(acc, x11);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(x111, acc, in);
+
+  // x222444 = in ^ (0x222444) = (in ^ (0x88911)) ^ 4
+  GF_sqr_s(acc, x111);
+  for (k = 10; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, x111);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(x222444, acc);
+
+  // acc = in ^ (0x222444 8889): shift by 9 bits, add 0x111, shift by 7, add 9
+  GF_sqr_s(acc, x222444);
+  for (k = 8; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, x111);
+
+  for (k = 7; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, x9);
+
+  // acc = in ^ (0x2224448889 11)
+  for (k = 8; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, x11);
+
+  // xhead = in ^ (0x222444888911 2), the 13-digit block that opens the exponent
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(acc, acc, in);
+  GF_sqr_s(xhead, acc);
+
+  // acc = in ^ (0x2224448889112 2224448889112): 52 squarings, the first out of place
+  GF_sqr_s(acc, xhead);
+  for (k = 51; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, xhead);
+
+  // acc = in ^ (0x22244488891122224448889112 222444)
+  for (k = 24; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, x222444);
+
+  // acc = in ^ (0x22244488891122224448889112222444 889): 0x889 = (0x11 << 7) + 9
+  for (k = 5; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, x11);
+
+  for (k = 7; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, x9);
+
+  // acc = in ^ (0x22244488891122224448889112222444889 111)
+  for (k = 12; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, x111);
+
+  // acc = in ^ (0x22244488891122224448889112222444889111 222444)
+  for (k = 24; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, x222444);
+
+  // acc = in ^ (0x22244488891122224448889112222444889111222444 4)
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(acc, acc, in);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+
+  // acc = in ^ (0x222444888911222244488891122224448891112224444 889)
+  for (k = 5; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, x11);
+
+  for (k = 7; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, x9);
+
+  // acc = in ^ (0x222444888911222244488891122224448891112224444889 111)
+  for (k = 12; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, x111);
+
+  // acc = in ^ (0x222444888911222244488891122224448891112224444889111 222444)
+  for (k = 24; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, x222444);
+
+  // acc = in ^ (0x222444888911222244488891122224448891112224444889111222444 4)
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(acc, acc, in);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+
+  // acc = in ^ (0x2224448889112222444888911222244488911122244448891112224444 889)
+  for (k = 5; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, x11);
+
+  for (k = 7; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, x9);
+
+  // out = in ^ (0x2224448889112222444888911222244488911122244448891112224444889 111)
+  for (k = 12; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(out, acc, x111);
+}
+
+// Inverse Mersenne S-box for e3 = 7: out = in ^ d with
+// d = (2 ^ 7 - 1) ^ (-1) mod (2 ^ 256 - 1)
+//   = 0xddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76ed
+// i.e. the seven-digit block ddbb76e nine times, followed by a final d
+void GF_exp_invmer_e_3(GF out, const GF in)
+{
+  size_t k;
+  GF acc = {0};
+  GF pw_6 = {0}, pw_7 = {0}, pw_b = {0}, pw_d = {0};
+
+  // pw_d = in ^ 2 (temporarily), acc = in ^ 3
+  GF_sqr_s(pw_d, in);
+  GF_mul_s(acc, pw_d, in);
+
+  // pw_6 = in ^ 6
+  GF_sqr_s(pw_6, acc);
+  // pw_7 = in ^ 7
+  GF_mul_s(pw_7, pw_6, in);
+  // pw_b = in ^ 11 = in ^ 7 * in ^ 4
+  GF_sqr_s(pw_b, pw_d);
+  GF_mul_s(pw_b, pw_7, pw_b);
+  // pw_d = in ^ 13 = in ^ 11 * in ^ 2
+  GF_mul_s(pw_d, pw_b, pw_d);
+
+  // acc = in ^ 0xdd
+  GF_sqr_s(acc, pw_d);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(acc, acc, pw_d);
+
+  // acc = in ^ 0xdd b
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(acc, acc, pw_b);
+
+  // acc = in ^ 0xddb b
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(acc, acc, pw_b);
+
+  // acc = in ^ 0xddbb 7
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(acc, acc, pw_7);
+
+  // acc = in ^ 0xddbb7 6
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(acc, acc, pw_6);
+
+  // pw_7 = in ^ 0xddbb76 e: append 7 after three squarings, then square once more
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(acc, acc, pw_7);
+  GF_sqr_s(pw_7, acc);
+
+  // acc = in ^ 0xddbb76e ddbb76e: 28 squarings, the first out of place
+  GF_sqr_s(acc, pw_7);
+  for (k = 27; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, pw_7);
+
+  // acc = in ^ 0xddbb76eddbb76e ddbb76e
+  for (k = 28; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, pw_7);
+
+  // acc = in ^ 0xddbb76eddbb76eddbb76e ddbb76e
+  for (k = 28; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, pw_7);
+
+  // acc = in ^ 0xddbb76eddbb76eddbb76eddbb76e ddbb76e
+  for (k = 28; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, pw_7);
+
+  // acc = in ^ 0xddbb76eddbb76eddbb76eddbb76eddbb76e ddbb76e
+  for (k = 28; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, pw_7);
+
+  // acc = in ^ 0xddbb76eddbb76eddbb76eddbb76eddbb76eddbb76e ddbb76e
+  for (k = 28; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, pw_7);
+
+  // acc = in ^ 0xddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76e ddbb76e
+  for (k = 28; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, pw_7);
+
+  // acc = in ^ 0xddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76e ddbb76e
+  for (k = 28; k != 0; k--)
+  {
+    GF_sqr_s(acc, acc);
+  }
+  GF_mul_s(acc, acc, pw_7);
+
+  // out = in ^ 0xddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76e d
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_sqr_s(acc, acc);
+  GF_mul_s(out, acc, pw_d);
+}
+
+// Mersenne S-box for e_star = 3: out = in ^ (2 ^ 3 - 1) = in ^ 7
+void GF_exp_mer_e_star(GF out, const GF in)
+{
+  GF pw = {0};
+
+  // pw = in ^ 3: square, then multiply by in
+  GF_sqr_s(pw, in);
+  GF_mul_s(pw, pw, in);
+
+  // out = in ^ 7: the same square-and-multiply step once more
+  GF_sqr_s(pw, pw);
+  GF_mul_s(out, pw, in);
+}
+
+// Derive matrix_L, matrix_U and vector_b from iv.  The hash absorbs iv once;
+// each matrix row then consumes one field-sized squeeze.  In row r, L keeps
+// the squeezed bits at columns above r and U those below r; bit r itself is
+// forced to 1 in both.  vector_b comes from one final squeeze.
+void generate_matrices_L_and_U(
+        GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF vector_b,
+        const uint8_t iv[AIM2_IV_SIZE])
+{
+  uint8_t buf[AIM2_NUM_BYTES_FIELD];
+  hash_instance ctx;
+  GF temp = {0,};
+
+  hash_init(&ctx);
+  hash_update(&ctx, iv, AIM2_IV_SIZE);
+  hash_final(&ctx);
+
+  for (size_t num = 0; num < AIM2_NUM_INPUT_SBOX; num++)
+  {
+    for (size_t row = 0; row < AIM2_NUM_BITS_FIELD; row++)
+    {
+      const size_t inter = row / 64;  // word that holds the diagonal bit
+      const uint64_t ormask = ((uint64_t)1) << (row % 64);
+      const uint64_t lmask = ((uint64_t)-1) << (row % 64);
+      const uint64_t umask = ~lmask;
+
+      hash_squeeze(&ctx, buf, AIM2_NUM_BYTES_FIELD);
+      GF_from_bytes(temp, buf);
+      for (size_t w = 0; w < inter; w++)
+      { // words below the diagonal word: L is zero, U is full
+        matrix_L[num][row][w] = 0;
+        matrix_U[num][row][w] = temp[w];
+      }
+      matrix_L[num][row][inter] = (temp[inter] & lmask) | ormask;
+      matrix_U[num][row][inter] = (temp[inter] & umask) | ormask;
+      for (size_t w = inter + 1; w < AIM2_NUM_WORDS_FIELD; w++)
+      { // words above the diagonal word: L is full, U is zero
+        matrix_L[num][row][w] = temp[w];
+        matrix_U[num][row][w] = 0;
+      }
+    }
+  }
+
+  // Use the same bytes-to-field conversion as the rows, not a raw pointer cast.
+  hash_squeeze(&ctx, buf, AIM2_NUM_BYTES_FIELD);
+  GF_from_bytes(vector_b, buf);
+  hash_ctx_release(&ctx);
+}
+
+void generate_matrix_LU(GF matrix_A[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+                        GF vector_b,
+                        const uint8_t iv[AIM2_IV_SIZE])
+{
+  // The two factors are expanded from iv first, then combined row by row.
+  GF lower[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF upper[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  size_t sbox, row;
+  generate_matrices_L_and_U(lower, upper, vector_b, iv);
+  for (sbox = 0; sbox < AIM2_NUM_INPUT_SBOX; sbox++)
+  {
+    const GF *factor_L = (const GF *)lower[sbox];
+    for (row = 0; row < AIM2_NUM_BITS_FIELD; row++)
+    {
+      GF_transposed_matmul(matrix_A[sbox][row], upper[sbox][row], factor_L);
+    }
+  }
+}
+
+// Evaluate AIM2 on the field element encoded in pt and write the result to ct;
+// iv selects the matrices and the constant vector of the affine layer.
+void aim2(uint8_t ct[AIM2_NUM_BYTES_FIELD],
+          const uint8_t pt[AIM2_NUM_BYTES_FIELD],
+          const uint8_t iv[AIM2_IV_SIZE])
+{
+  GF lower[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF upper[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF affine_const = {0};
+  GF sbox[AIM2_NUM_INPUT_SBOX];
+  GF plain = {0}, cipher = {0};
+  size_t k;
+
+  // load the plaintext element and expand the iv-dependent affine layer
+  GF_from_bytes(plain, pt);
+  generate_matrices_L_and_U(lower, upper, affine_const, iv);
+
+  // linear component: every S-box input is the plaintext plus its own constant
+  for (k = 0; k < AIM2_NUM_INPUT_SBOX; k++)
+  {
+    GF_add(sbox[k], plain, aim2_constants[k]);
+  }
+
+  // non-linear component: the three inverse Mersenne S-boxes
+  GF_exp_invmer_e_1(sbox[0], sbox[0]);
+  GF_exp_invmer_e_2(sbox[1], sbox[1]);
+  GF_exp_invmer_e_3(sbox[2], sbox[2]);
+
+  // linear component: apply U, then L, to each S-box output
+  for (k = 0; k < AIM2_NUM_INPUT_SBOX; k++)
+  {
+    GF_transposed_matmul(sbox[k], sbox[k], (const GF *)upper[k]);
+    GF_transposed_matmul(sbox[k], sbox[k], (const GF *)lower[k]);
+  }
+
+  // sum the three branches together with the affine constant
+  GF_add(sbox[0], sbox[0], sbox[1]);
+  GF_add(sbox[2], sbox[2], affine_const);
+  GF_add(sbox[0], sbox[0], sbox[2]);
+
+  // non-linear component: Mersenne S-box
+  GF_exp_mer_e_star(sbox[0], sbox[0]);
+
+  // linear component: feed-forward of the plaintext
+  GF_add(cipher, sbox[0], plain);
+  GF_to_bytes(ct, cipher);
+}
+
+void aim2_sbox_outputs(GF sbox_outputs[AIM2_NUM_INPUT_SBOX], const GF pt)
+{
+  // linear component: slot k starts as pt plus the k-th constant
+  for (size_t k = 0; k < AIM2_NUM_INPUT_SBOX; k++)
+  {
+    GF_add(sbox_outputs[k], pt, aim2_constants[k]);
+  }
+  // non-linear component: each slot goes through its own inverse Mersenne S-box
+  GF_exp_invmer_e_1(sbox_outputs[0], sbox_outputs[0]);
+  GF_exp_invmer_e_2(sbox_outputs[1], sbox_outputs[1]);
+  GF_exp_invmer_e_3(sbox_outputs[2], sbox_outputs[2]);
+}
diff --git a/crypto_sign/aimer256s/m4speed/aim2.h b/crypto_sign/aimer256s/m4speed/aim2.h
new file mode 100644
index 00000000..bdc50429
--- /dev/null
+++ b/crypto_sign/aimer256s/m4speed/aim2.h
@@ -0,0 +1,306 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef AIM2_H
+#define AIM2_H
+
+#include "field.h"
+#include "params.h"
+#include <stdint.h>
+// Per-branch constants: aim2() and aim2_sbox_outputs() add aim2_constants[i] to the input before input S-box i. Each row is one GF element given as four 64-bit words.
+static const GF aim2_constants[AIM2_NUM_INPUT_SBOX] =
+{
+  {0x24a19947b3916cf7,0xba7c9045f12c7f99,0xb8e1afed6a267e96,0x2ffd72dbd01adfb7},
+  {0x0d95748f728eb658,0xa458fea3f4933d7e,0x636920d871574e69,0x0801f2e2858efc16},
+  {0xc5d1b023286085f0,0x9c30d5392af26013,0x7b54a41dc25a59b5,0x718bcd5882154aee}
+};
+// Precomputed table of AIM2_NUM_BITS_FIELD field elements (row 0 is the element 1). NOTE(review): presumably the matrix GF_exp_invmer_e_2 multiplies by -- confirm in aim2.c.
+static const GF aim2_e2_power_matrix[AIM2_NUM_BITS_FIELD] =
+{
+  {0x0000000000000001,0x0000000000000000,0x0000000000000000,0x0000000000000000},
+  {0x13269d7dcfc555c3,0x6fe13874c42fedfb,0xc69f003d9d5abb9c,0x05636fd04ebf7feb},
+  {0x7a273dd9fcec7e15,0x42cd3eb54144ea68,0x5a88aaa3ebaacdff,0x527284e39fae2053},
+  {0x56bb9ab537abf542,0x768c3d772850c862,0x0160d91d288fd0e0,0x342e111e0a022022},
+  {0xcdb998ce4b3eee2e,0x78984c4dc99c90aa,0x2bb89f84c00275b6,0x75c6a0cc065fd4ac},
+  {0x74b2cd2360cb32af,0xbde82f7cf42dd1bf,0x7ceed82d54d965c4,0xf4e9f207aa17f2e9},
+  {0x995d5aab614ac6c0,0x1563800b79242f35,0x1d940184c4509090,0xe6558fd024716b90},
+  {0x8d0b793b4375cc8a,0xfcf792217776a3ee,0x5da44008043b7450,0xc77adf87407cf838},
+  {0x00451596f23df45e,0xd8bcbc0d7ae8534f,0x02c26abe3748db45,0xb37e029dc51a4b41},
+  {0x177dbfce6cbc8c0b,0x62cdd72c8cbd2d2a,0x568802d992bd7a2c,0xd0082d2193b6e383},
+  {0x221e6872863f45c6,0xbe5a9bce6c00df76,0x98c076efe1cfcc67,0xa75bdc7ab5c142a9},
+  {0x088d4e8e27e0b74d,0x71046740fe7e6c5a,0x20123cab6052c1d6,0xa7135d055351c99b},
+  {0x46176449341c7657,0x2a7936011468475e,0xc347e166dca96014,0xd79326785eee3555},
+  {0xc6b77e5a8b6dcae9,0x6dc641a8e07c54d4,0x37055c3ed77341a8,0xd75eaedd0ec6f1d1},
+  {0x5240b9b6f3433443,0x7b7d965745400c05,0x4542be5aec50ec53,0x13e6ac8f2aac12a2},
+  {0x66c30b9da469d401,0xcd5dbf02dc359172,0xf16b3e62f8a57e1d,0x362c2bc9345b97ed},
+  {0xb2a65d5f7da755e8,0x11df10d6ddd9eb84,0x433468d75cb64470,0xb4a6ffd454c82b2f},
+  {0x1c87142145f7c112,0xde2854fa4939dc0b,0x10a503b51b7c7a19,0x174f91701431e1b3},
+  {0x60d8fb32b890cec6,0x27d95c11548f693c,0x30fce7ce95e950b3,0x210559008a309578},
+  {0x5de49c870dd8fb60,0x1f480e246bb2c961,0xdc5efcb1f4ee90ae,0x165c3f5b62136c5e},
+  {0xc17b4bbe4b5780a8,0x690f1102a6decffe,0xa26e146710d9cd7d,0xc7f278fb3f02a99d},
+  {0x4fe7916de7e17f1c,0xe9e59586ac0a7185,0x092b72935bc23437,0xa306568e985edbfa},
+  {0xc05330df507b35c8,0x944475d0eb5c89f7,0x34a3653b083969a5,0x97e431e62e205633},
+  {0x19fe581ef3e9a896,0x720ab1851376eff0,0xda5ca1af445dea40,0xe3899fd1cdc93f2f},
+  {0x7a18d867d11567d6,0x14e706af946787cb,0x2ececbd0e726236a,0x66a864e0c387e806},
+  {0x0a0a9e1dc2c9d30d,0xa1bd85358585db7a,0x78f90bb68d83e25e,0x2275165a7e496039},
+  {0x23f2e1a2057c9892,0xb7f503272b51fa8f,0x0ecf56cbb57a6021,0x77f77f889ecb3e74},
+  {0x237633913a45a827,0x3a2c98b4d38d139b,0xbc1dfd5ddab4bb19,0xf2bcbdc105b017fd},
+  {0x9a53645fca466120,0x07335188ef82289a,0x9cdd8f1434ddc4c7,0x25afc28ddf0c0ea5},
+  {0x0166bda62c3c97ac,0x4821343275a35741,0xa4a1f8ef377f5177,0x3008d4b041fc0802},
+  {0xed498663eb9138f0,0xb16289e1ea93949b,0xa2476ced73badf6e,0xb384ce50cdee1d75},
+  {0x25430e5e2ea409d8,0xf8909d2164becc11,0x77663884798e456b,0xe11b963640c6a7da},
+  {0x2a5ce7930313e789,0x01a1b717dd5e72f3,0x674b4810dda58bf3,0xb348d6cffeee2602},
+  {0xe4871c9932b98648,0x90432c7798b61577,0xf803346f3989e611,0x176c5f43490e3127},
+  {0x28b7ff52a8d039f5,0x2549d26014bcb371,0x7705b13fd068e5f0,0x22f60aec7063b440},
+  {0xa90087e5804b094e,0x17b587e9f7b1334c,0x7e9128a8fd49f502,0x10a15de60dcc1259},
+  {0x676fc8232449f7f5,0xa45eba0b86ee4f8d,0x48d0f0583763ed04,0x9430177369350009},
+  {0x8bb187487d0ca392,0x8b34c408cf71198e,0x4c5b9033c740f6cb,0x15165d415ea592e5},
+  {0xe25b8fc9315d8b10,0x6f067bcaaa5db46f,0xc0d574e6df163bcb,0x76d62e45eeb26cb3},
+  {0xc7bb4eaa81af7e21,0xc0c25e2c4da66ca9,0x20a5b7a6ef682683,0xe0c40a42bed8c878},
+  {0x340b283a1f67eb72,0x94c68ac57747d7b1,0xaab540d8883c7e78,0x53ffb196e81fbce0},
+  {0x03d1fe920cc5c8b6,0x2d058e7c02de80d0,0x349140f34518313d,0x52d8d34dce452897},
+  {0x3daf5481e615a4ec,0x1d21ddb2b19865a7,0x28572f8e3caef8c4,0x94f0069367dd5a9c},
+  {0xf97efd31544a2432,0x79cc100bcd1c95c5,0x630dd7dbdcda2efa,0xb0c94889efaeabe6},
+  {0x1855a973cd69d2ac,0xa249d1e68760fda5,0x9bd185166791f0b0,0x73aad654a16f87d5},
+  {0xb64f4c4f69887572,0x0dd0ddfafeaec759,0x9a2b2e01a2dfdf21,0x23e6842e19958e74},
+  {0x47126f2ed9d35243,0x2dd26a5dc07d8ab7,0x5f7a0864bae59fef,0x84bd4c2d7eef707e},
+  {0xc2b75aa6809fde33,0x4e05ff4138a1458a,0x4283e814ca9b30b5,0x46b1bcf0f62d4313},
+  {0x83f0c7c594f6cf9a,0xdb8a4b8e5dfe204e,0x44a803aecd550290,0x96cc8907871fc11e},
+  {0x7ca33f7d36e71a53,0x609b8f2296791418,0xd9e9118ba8ddf5e9,0x813002deae63def5},
+  {0x5e3805abc5d66c85,0xe95aac205db8a39d,0xfad61d269550a976,0xc0c3e22037926992},
+  {0xf3ba3f8e2a564d34,0xfd74426f936299c0,0x23bb54e8112b82e3,0xc5afe8e8365a6000},
+  {0xb733edd6855182ef,0x5ecb1ae3728f48e8,0x3b8b1ce5bf96e304,0xf3aba2a7bfac4c59},
+  {0x78f2ea71794eaef2,0x59f25ef7fe359b84,0xacfd3e59513654c8,0xd1e24fda7d0c3936},
+  {0x288da25da8b17fb3,0xbe107e7feb777a7a,0x166db15573baae6c,0xb5ccbf5cfe3e5135},
+  {0x4637849d0285089d,0x4f671ebc0437c2ce,0x188565bc785f8268,0x712dec2cd1ba005e},
+  {0xa25a6b6a471a00b1,0x6e1a6a380bb57611,0x3ef50b155eddd23d,0xd3788fef109d4e3b},
+  {0x4f403f37eba563c1,0x76a201773cddd009,0x58fba6bec18e06a6,0x11a19d4cbf2a6331},
+  {0xe3e6bbb73066a175,0x9748c56fec4b9fa1,0x406aae141855018c,0xa1410c0e735df446},
+  {0x5e569e71e70eb719,0xa673071887dd4687,0x07055d8d0a23d785,0x74d498384aee1190},
+  {0xa0e8a89b6fb6984d,0x908716f3ce5edf66,0x0a2b9e842b73e729,0xa1b9171e0b83204b},
+  {0xbe7532657aadaa20,0x1b66940116e06582,0x7385fd540009963d,0x847a9b51570e7ff8},
+  {0xe9395fd61662cbe6,0xb3a286d4b91d1353,0x455b0689d3ff2d83,0xd56078fc7681e787},
+  {0x8b470957a3441b8a,0x7df431ebbf7e447b,0x0e0f4fa397edd83c,0xd793865c1388620e},
+  {0x7b29927808bfa739,0x96e65ce20d51654b,0xaa8fcec0d3c045c3,0xe5f31c0e239b4fea},
+  {0x5525c2a74e77bf9e,0x88cf3be85881afff,0x7c81312941d70c3c,0x23d8a44e23a9c737},
+  {0xb869097f96d421f1,0xfc5054b0f253daf5,0x1c241e84b424d6aa,0x32b29f522eb351e7},
+  {0x6a466e2ed7c0ad0b,0x5590c446ea6f583b,0x56d2464d3ee4d099,0x068910c7eb32dd95},
+  {0x71139d1bc66bb641,0xb3a1027da065feea,0xe04294fcf6174557,0x81dae384498adb46},
+  {0xf43ed00c527a209a,0xa5754026d1f22c89,0xc78a8d365f196923,0xf5154817fc84f220},
+  {0xae764c7fe7341054,0xffc86134dc4d880f,0x1b6a1e1530d66862,0x250c95737b7b8284},
+  {0xbfee6b3c1e46c128,0xa78dc08ba0e7251d,0x3a95f11bcef9d4c7,0x34f2831709c6a420},
+  {0xe3a3c1aa9e2407d8,0x4c1a200af1077851,0x8965a32110544d77,0x6354a05036f3f5a7},
+  {0xbd108a58fc17d8a6,0x61b0351824a54794,0x499e7fd9fdd626df,0x850217a6be595511},
+  {0x53f2510fb68b5c61,0x5b122cfd2501b4ba,0x7fc88679758e8262,0x233472936a675422},
+  {0x11965eaffc401c95,0x0af31e003ba1fb12,0x2facfdd6611b7f8b,0xd67eaae060c88abf},
+  {0x6fa46680edff5f3f,0x454b6266e25e87cb,0x9addf096cb1df0af,0xa6de67c1da83476b},
+  {0xbf6f0cb8a600033a,0xf520f28cc3846c4b,0x008f972a2108bd6a,0x55bbe0da272b6cb0},
+  {0x9bf38905d29c13e7,0xc50cd62db6acc3da,0xbb9b791e0d47ac11,0xd54b025508c245d8},
+  {0x3a2547ab532ec9ff,0x79495ddf670c8bc8,0xdf4ed2dcee44e1bc,0xc2e52f1fc1f7d4d5},
+  {0x4800ee52ee97ecda,0xc9d9b772550e380c,0x98506ba8ea5ec019,0x21ffafa8b46c668f},
+  {0x3464a9138085b307,0xf67a192be113e9cc,0xfdd61b66e0e162dc,0xd612aba17d397d2c},
+  {0x16207c45e571aabf,0xf2583066040bf4f7,0x4bc24730dc4d62f5,0x608b3d1e61a60b2f},
+  {0xc2a6d2c707faaab1,0xc9cfa575f99f891a,0x61ea461507f40f96,0x67104299d7331a82},
+  {0xaf1c8fcbed1f1699,0x985767a5dbb95b90,0xd6ae3b3279c96a14,0x275ea501029834e7},
+  {0x4e19e32114de1e9c,0x165f71d116e0afb9,0xe968cbf378c1a2f7,0x912182eb2d02ef2d},
+  {0x6e4e3c81caceef19,0x85f15b2e37fe2cbe,0x8ae88fcc89bb8687,0xe50b4d7659484c7a},
+  {0x80353d06c9930d5c,0x723d1f993acaffad,0x89e273ac935dc5e2,0x51356090a9eecbf9},
+  {0xc3bd743bf118e69e,0x78fe213d42306293,0x90638ea842ff3668,0xb0addcda3683625d},
+  {0xe26008c6b83cc264,0x74bbbd5777680be8,0xa8892126f9cc485a,0x54899977a5cc34a1},
+  {0xd19b2baf7fa0c771,0x39d199b5dfd41569,0x7c3c66294bc7b31d,0x81bb86cd53109ac5},
+  {0xe4a790156b11f26a,0xb496c49018830c99,0xf19e574456b9d549,0x867aa70b9bbd4fd0},
+  {0xb8ce927c2afbcba9,0x3ae3f9d11d478318,0xebdecea6a113ffd6,0x071def720f45ca33},
+  {0xa18c4347c3dba5da,0xc231d50db69b59f6,0x784caea3c01900f9,0x21b179202d1177e0},
+  {0x48d839b0e148b37a,0x119910fe9c00220e,0xf6959f7654a471b7,0x138df428ee1ab05e},
+  {0x2378b25ea2d743c2,0x52a0660820b6ff4b,0xb20d6835419796a6,0x77d41062fb9a7654},
+  {0x1e63666141c834dd,0x534d884045bcdedf,0x07b52ebe10206e92,0x67cb1a5c5d2017bc},
+  {0xbd489efa4249447b,0x81b1f830bdd020d0,0xb8db0042e390a71b,0x90b877cf8d8200e6},
+  {0xd91a2f7fe76f986d,0x2c6fcd64257849b8,0xcec2c4be6ecbe77b,0x5031f045518f6b98},
+  {0x3cc9f99a10cba6b9,0x7df264605ea09f19,0xc6099006fa2f35a0,0xf31aa1999c65f2ed},
+  {0x7322250ccd66f2d2,0xa8cf62816a34838c,0xf7bd30878c6d359b,0x450a14aed0d49014},
+  {0xf753996b7d7c1d54,0x45e2b366fb683eae,0xcef4cef44af75b4a,0xd1e647d51db49a04},
+  {0x257099ec419b94a6,0xd4a8a9f3335fcd10,0xa286788285415010,0x023c9feb9c1e9901},
+  {0x229d6fd7eed1531c,0x04cefb6c19ff0062,0x9130be016eed6e29,0xa1a04435eb4cd39d},
+  {0xefbd279ed0b045c6,0xe8ec58f13b1a927d,0xbabddf060b172c30,0xa5fd98adc4c9d7f8},
+  {0x0f859d44ce18448a,0x07af518284a5a680,0xff7565589bc19136,0x72e50c2e9eaa580a},
+  {0x6470f3d6724b5dd7,0x8b0ebb24be876d22,0xfb604e14fd34a2cc,0x213fc1d31fbb7996},
+  {0x50e1d4f6f24a3685,0x69348d20cb64f7b6,0xa13da095f7678267,0xb63a6ac7a66c3284},
+  {0xb0edaccd9a8698dc,0x73d7ca79b1672272,0xaed4ffd76475e235,0xf36b5b0cbbb22a1b},
+  {0x24acd40ab0b10aba,0xafb39e3ea0656a92,0xcfe743611a51fa5a,0xb4f8251f0f0e0d41},
+  {0xe8036bb95086dfdf,0x3d5d0332c379fb16,0x3029edc150437ed5,0xf561ce7ace559b0f},
+  {0x01047fd87eb154ca,0xce04d75cd86f0d9d,0x33f6d9a762e84d0b,0x52f77f2619632746},
+  {0x3fdf7a3e2584aaa7,0xafdff63009b07776,0x24496f671e85ade5,0x35b2e80c0abfdee5},
+  {0x4bb3e9185acc78b3,0xe5634557a7f532a4,0xa6a979853e645782,0x97e9a6c3f5ed6068},
+  {0x41685f9547d8c651,0x6d4bade8828daeda,0xabe0dcd781a5b523,0x3528952d2a770f19},
+  {0xe4e43b26b587ea84,0xf0f3f420178def6d,0xd48cb1f978a8bb2e,0x25de266fb8567a86},
+  {0x2906276141285c5c,0x045688d8cac52240,0xa1a62b2fa2474687,0x917244641b004f87},
+  {0x73897ebb86a40eb0,0x0df1bc6722ab333e,0xb7815fffa0c79792,0x322111adf2c83d06},
+  {0x4dd181aa27fc54fa,0x47b557267a691a35,0x089b8ed1303c2515,0xe60b63596c40b943},
+  {0xe574bb3f5e1d3fe5,0x7e5e1dd1aaea6c56,0x443b9d58176d285b,0xa2c066cf80f1c62d},
+  {0x9df2b1fe93b4cb69,0x5dc5dcbd7bcd4304,0x4cac45f5c51659e4,0x9039bc7472f02b80},
+  {0x81c7d14b2ff6f3d6,0x76b7422e6f000e01,0x23e23fa520ed280b,0x50a4f9ded0d07978},
+  {0x154548397391fa38,0xb1ec123aeb772341,0x22f40fd3abeee812,0x0342edcc39a77162},
+  {0xa7ef812f5e9d9ba6,0x65de86bcc8071b0d,0x4b9bbe60fe0a1fad,0xf4f8322efc5e2f45},
+  {0x21fbeab48a7c1136,0x42736db042991d3e,0xf78c442fd2ed07a6,0x36228053a90abb56},
+  {0x6ebfcec360d88021,0x7deeafd7cae1b159,0x6f32c272246a4999,0xb2f984f6c2b488dd},
+  {0x76beda6b3d15abc7,0x1bc04ff70ef9d0a9,0x75ec5c46c4854ec0,0x77bd25a817826a51},
+  {0xf79c8d9bd7aaf4f0,0xae5add9fd1454f93,0xd9f264167923d698,0x273bf89c8b33a9ec},
+  {0x20ba5517532e42a7,0xd9991aaa0bcb040f,0x81ec69b31aea8c89,0x823ee1a07f410f90},
+  {0x3e10957041e49998,0x9746fdf5f3deb53d,0xbae6be6d5a7923dd,0xd4aa255a7e60b5f8},
+  {0x453e76f50e50f914,0xba084020e530dd32,0x90e9982f02a0b2e3,0xc1bf6d0c93565fd4},
+  {0xbb44043183434a96,0xb6839987e4d3fbf5,0x780e11ff154ba921,0x46deb765191c6fad},
+  {0xf254860f62ddca11,0x2b40c2147fcc1618,0x9b9df4f2213a87f5,0xf5d9f1982bf72085},
+  {0x9ec887ef1dac7ea8,0xf9b9f41cd1a90cab,0x5106c66727088891,0xa079314a8a7aa0cc},
+  {0xbaca971f705d6820,0x32cf35c216d31b74,0xcda1b48f6a782676,0x42dd0c61745b57af},
+  {0x774f50e70700fa3c,0xfe706a77d17875e0,0x50acd4b9e4f085bc,0xa70b2f3a3373b5cf},
+  {0xa3d467e6532333ed,0x9143409c675fea0a,0x186d4c8b7de757db,0x006e698e91bc1742},
+  {0x042690d62241c815,0xf8a04fddc8420797,0x9ff8cf1394eaada4,0x921b7749e0687334},
+  {0x3ba03d72cd709236,0x12f95d885e21e3d3,0xba18560bb5d4d50a,0xa3607627494476ab},
+  {0xade31f9ca5377f89,0x635510178eec1003,0x3dca939c351bf98d,0x339c87aee1cf78dd},
+  {0xe45a6287cc1287d4,0x7cf6c8c56ed07634,0xadf6eda911dd0200,0x87211a5d3722f0f6},
+  {0x7b07d341c0de902d,0x69838993df5c9429,0xb8921642be862244,0x555819247b006cca},
+  {0x4b8ebd3e261a1065,0xec8c767eb1653ceb,0x482e17c892519544,0xb61af0cc04b533a5},
+  {0x4fb9d38c4e2f7113,0x50030b8523699320,0x5716f5c60cedd7d8,0x0673e662c18aadef},
+  {0x641233031f77a5fb,0x1932c76990a0d465,0xb79ab4fbf32c92e8,0xc0a7370dd0467550},
+  {0xdb899bf50910763a,0xf026477f262eb097,0x76b70a1b2163a0d4,0x93a2873f23165f6e},
+  {0xba2a66c196ce2eb8,0x19383fd3ffab287b,0xaed33c3223646076,0x1274559077e98698},
+  {0x035a94843c44ec7a,0x6de99478a3c009e3,0x8a7ecba43ae87e6e,0x458c9cbfca30c71a},
+  {0xe3695ac8419682c5,0xaeee4d4d0392ec66,0xd99792a67250c187,0x91e0f202f4c924b3},
+  {0x9c784cfbf5192c27,0xc113eee0e80c2eae,0x3f7b5a6101ce5f5e,0x842e2d646ecd9d6e},
+  {0x957028a6befc0d73,0xcbb8df5afe2e23c3,0xc00f5c490d8dafb4,0x67d7ee99cddb8452},
+  {0xac8c3e869f704d2d,0xc928ad50bd4faf6e,0x114a0001a078d1a0,0x8375ad6cc681586b},
+  {0x59a53e3fb149cca9,0x69cf3f7ab419768e,0x79d945a746a788b8,0x979b7e9387ae017f},
+  {0x41f7712568f43935,0xf17647a51bb6cff9,0x593eb0f68e21db19,0x77bf0442e77fdbdd},
+  {0xd430085cbe62c90e,0x445d0af933a0c884,0x92f5c9b29a5de145,0x6778e9aad04a6c94},
+  {0x4914b4bd446c5d64,0x21b19c795fec736d,0x72cf9cdf7fa1c0db,0xf67226412058b23c},
+  {0x1e7346a99e1464a3,0xacb82da3ac217e94,0x4d1f4486473e6c18,0x23274da141c63725},
+  {0xf58a0445c9b4903b,0x4f196615648056a4,0xeaf0d8fc78e51fe3,0xc71e969830bec69e},
+  {0xcec3175fd17dee42,0x6fa60eda34cf3b0f,0x016ff6fe365a227b,0x148ed225daf52abf},
+  {0x5eb5954a6c060dcd,0x67ed2e3411fbde9d,0xdaddfd054f15c5a4,0x80e12ae0d1591ef3},
+  {0xc9c76eda44553b71,0x7c4675538cbdcd1e,0xa2128f16928c1efd,0xc13aaef8cfacc959},
+  {0x525318d3ea7544bd,0x6f3e0f4d85ce7b2a,0x397102e6892ab449,0xd028319bc9ef0676},
+  {0xc55bc06690da6f96,0xea6a73d17ce2969a,0xfa21bd37fa658e1e,0x32d421c8c9a9d437},
+  {0x4f53f0e462a9f4f0,0x18c65d2ba362d43b,0x53b8871400599e70,0xf291e9ac535cfe6e},
+  {0x2a420a66918ee17a,0x4dae04d613a5a05f,0xc12c868048f09ef7,0x900c4ca4fb306ac2},
+  {0x357f0638ee05acbc,0x389db47cc78620f7,0x3c531ff5b9fff02b,0x902c96f5fb2c18f9},
+  {0x57abb6151ae9319f,0x917bd98253c43360,0x36b4e4e17d9c5182,0xc2a4751705897c3b},
+  {0x91ee0ad214084c1b,0xfe17b657a9ea9054,0x7b304880e7a3efb6,0xd497c8cea46cf443},
+  {0xb97e1c63dcc46441,0x22898ec1ecb0f186,0x40dd2915e34e92ae,0x83e63e8886604034},
+  {0xf159f13af4545efc,0x6b0312cbfce549f7,0x1632f9e6624b3c5e,0xc387a21c7c20a6d6},
+  {0xe81b4468c49ba628,0x9962cd4b58abb1e7,0xda2145ce9fe59f2e,0x6021807944cfc8e1},
+  {0x9e98852b17310f23,0x3cbe1c8bceb45120,0x0e165b29c57ec0eb,0x305bf854fb1aea8d},
+  {0x1c3dbdac479d54f7,0x4cda9c1c1bbb1a19,0x7d330c571f17bc88,0x826548b30e26b7d7},
+  {0x446afa2ca1809535,0x8d3c9693ee673350,0x7893a83f58de1ffc,0xb19954f7647195ff},
+  {0x21b77a7b577e945a,0x0c3e91d3f1f89e09,0xdd7b8e8a59fae93c,0x6435f276c4582559},
+  {0x4d0e6426007bc199,0x5c13184bcf7dd24a,0x26f1f87322e213d0,0x97243e676a3eb387},
+  {0x14cbfff5b787dedd,0x355794e80f8cd847,0xf2c951e3c0d77a3d,0xe558cf2f7b5f2991},
+  {0xf87b23ea7452e43c,0x92521695b010b548,0xb7af363918a98cf1,0x473e6304c6f3f9cf},
+  {0xe86f5e030902695a,0x884c59759075978e,0x862a4f44f20c857d,0x2348092c2d62a7ed},
+  {0xbebdf27580f800b3,0x4c82348a99cfaf36,0x7fec6e2fb343c70a,0xd0a2b036a8d95707},
+  {0x59ef03fcb5a57f39,0xfb04bb079290dd73,0x30e0751c7c8e4263,0x4078bcf952cf1a62},
+  {0xa19ffba37095d58a,0x9d164dabb30dff6a,0x16de88d2bac7642a,0x8232b5dca704cbcd},
+  {0x329dfc2b2636492c,0xf0397ad762a31307,0x78adfe730ebe751e,0x5783b8d9d2f05dfd},
+  {0xf2d6e8a736f23aa1,0xe2102f9bd2267093,0xdf2af690beecc500,0x11398c83a817f593},
+  {0xd46565aaafab2385,0xddff3f9a0b99928d,0x5eb2072a49c5a5ab,0x53a03f6a8eb6a094},
+  {0x57fb689ec7092868,0xc2040eb173de1a44,0x810031fb7b19e630,0x53960a9b3b1ef568},
+  {0x40007920454fbf71,0xac025a589e98d1ef,0x9e256036a7fbd143,0xc13cf073bd649440},
+  {0xd06cd6829f0fea3d,0x2a51b1d71d1ac07b,0x3546a5854571bbc6,0xc30b6bf46c0b42fe},
+  {0x62488646a13da231,0xe28973393fb6f682,0x9ed13dc9f5432f8f,0x31b84f2be241c94e},
+  {0x9bb19ea5428d66ae,0xe0080b8616f3babc,0x9610055711788ae5,0x7652d184a46c90fe},
+  {0x112c63f926d9850e,0xe5905a268850e663,0xb9fd3996e6d72608,0xa7aa33543146d58a},
+  {0x77de728df392f575,0x637633946129f8e6,0x72a867e08e3bfce6,0x754f7149e15a365b},
+  {0x3511c4139b98679f,0x56a8a361da8cbe81,0x2a34d15423a9eb45,0x82ae1da57cd32e57},
+  {0xcb3ecb886171f719,0xfcbf82d884e8e020,0xc6d2502bd1e6f6cf,0x80bb7b1db5c2a777},
+  {0x81b8745892f03d2d,0xbc5f38b14116148b,0x4b6d0194055b86d8,0x241dbd17e3eb4ba9},
+  {0x5bbc585152fcd142,0x930f31c230a2050e,0xabf51e10a3e969e5,0x72a0a1c90ead638e},
+  {0xcadb18ff93f7f93e,0x1b8e009b5719bf82,0x743c0ab2c8bc284a,0x7144a02ff1130223},
+  {0x41b95e62522de019,0xcd3465a01c9b93fb,0x236600ff15e70ef3,0x3658cd0c29ea6f20},
+  {0xb9c59bf0b27dc282,0x47955c29304112de,0x3f16c72af19bcb3f,0xa0e568c9c5397d69},
+  {0x9251cf7a209add18,0x8e3a95a336fe4170,0xf28c14a751527126,0xb3d3a9a208590971},
+  {0x5b129f35a37c28ff,0xe3f8ba25b41817b9,0x200b734d2501265f,0x52344985724cceca},
+  {0xa8e27fd1e60dffab,0xa8ea4523b64f5aa4,0xa475b8437f8165a1,0xd644c1691c3c7548},
+  {0x5ddae2f669e64957,0x1fcef31f0b9af756,0x3e6da61c7980074e,0x206f828242ab6764},
+  {0x33144ea9f76bb631,0x9f36e03e21fa3065,0xfe08e97dc86bceb2,0x640b723c98cd7479},
+  {0x1636152634146114,0xc18c0793a80805cd,0x2b106edd3834043c,0x4191bf5c7fbacdf8},
+  {0x429dddfd03ef7bc4,0x4db9b9d6da197cd3,0xe74baaea7f22abc0,0x4364ff1e20f72e64},
+  {0xca8a9a678e94da68,0x6535f14dbed15563,0x98f34f0a20bd3f3c,0xd12c84164701e27f},
+  {0xc02c8d4c379b7ce5,0x7069499c81e1f16e,0x9bf97727b1a05c04,0xf27fa10bb0a78610},
+  {0x4cf536f0cf11a349,0xbd9dfa2a6eb41391,0x565f1d6e23bbbc0d,0xc76bbb697c18cf7f},
+  {0xa17601bde8ac478c,0x8db87c51403e365d,0x4088a87a96d9c622,0x31f82a7918dd0d06},
+  {0x29ee14687120f04e,0xfea2e736c3636d5c,0x7f8c89823855588a,0xf0da86215a008e8c},
+  {0xdd645ec1d816c223,0x0aa7edbc5ba5d0cf,0xfced1c8e126396e5,0x201b07bc6f65eddb},
+  {0xfb25e20cd48f4855,0xa8b3d1435e85371a,0x3ee9acb3f939329e,0xd075efbe502f25a4},
+  {0x0541c9b35049c704,0x94986dc9cd668f39,0x17f4cfb2726cd68b,0x508c14a670636ed4},
+  {0xa2b783ac55d68039,0xc130ab2d841d773e,0xd6d29b14f588465d,0xb790ad979cce43f8},
+  {0x4f8ce0df03c43b98,0xbbda15818c06d7a2,0x380dd95f0f042fdb,0x05f429bccfb597f3},
+  {0xc742e63ad5c5f5e6,0xcbcb225fbbbe33e2,0xa8edf59089d52ced,0xa0e788a338b45f4d},
+  {0x20e95da4bdb0c82f,0x3e63b532cc85e2d9,0x163e3d2b90d4ddaf,0xc71593e07530219a},
+  {0x7992357ab8d37b59,0x4aea96f315f3c064,0x1ba04f945b33146b,0xf65bed5593247ff4},
+  {0x2d4ad59bdce5563d,0x3a24253d449dc88d,0x41c7ffbd062c28f4,0x42734ae219aa9361},
+  {0x644204f2ea9b71e3,0xe551983ade3b5122,0x1bc727382db55ea1,0xe276d03e4bd6fc9a},
+  {0xfb20c1e51a924e81,0x2f795f1d4507decd,0x154de4d0aca02046,0x72ddcd99451381dd},
+  {0xc09ac8020e255c2b,0xa4eff29a2c29d3f3,0x7977c4f4c2f24381,0x349ff7a6efa4d791},
+  {0xea5d2cd9592cb4e6,0xf63dcd3ff0c8104a,0x66d7254c1252ca0d,0x822791068962c667},
+  {0x7b9c477dde2ad4a4,0xd3460c638eb797b0,0x1889eaef7acad771,0xb23db19bd8554e11},
+  {0x6f1c469240cd647f,0x31825907e279b274,0xb97cebbf2c37c29e,0x74ce50e87690b22e},
+  {0xfb92d64637ac0508,0x97999c37b92d0720,0xa23a9e76c1578849,0x66aa9c79979e14fb},
+  {0xcf78e912e65a8877,0xb7dcb878bdeec090,0xe678ac56695a99fc,0x0338870b34c11cee},
+  {0x5529c228e771c374,0xd8ab910e6e0a23f9,0xcd86f7b11bf07839,0xe3358c0867358f64},
+  {0x7c0e69e5db7dc1c3,0x355a9bbca9523a64,0x86985b53d32a3f4b,0xc715ea89b184099b},
+  {0xac499c49b8a4cdc4,0x22485e1df13ea826,0xf91367c2ad8807da,0x863b3b9193879ebd},
+  {0x8086427544d93f9b,0xf378d24905271a4e,0x8a2211f2e881884b,0x27f11aae6fbdeb19},
+  {0xd4d702e312991728,0xd57d86c18df5deb9,0x68b550520aac07f8,0x6163e0c25242d715},
+  {0x0484539b5bd55737,0x69b34b6b4664d575,0xcafeeeea78048b31,0x25a0aca017ec768d},
+  {0x955f03fc32b86250,0xb3ed04233eabbafd,0xb4da5d10fb30568d,0xa1d5c520656d8e7f},
+  {0x18e6772ac0c7b0b5,0x1e8c41bfa134bd72,0x36b1b28f157526b2,0xf5ac9222151e43e1},
+  {0xb500af50c3647566,0x181d28f85aca7575,0x9a16455dfd6341a1,0xc6d058b2c1e37c22},
+  {0x01b46ff0be3c6ef8,0x7f5abf4a7e4a72fe,0xe18780f7372db81b,0x91d1172dbd7c1d3b},
+  {0x62e68a7598567ebd,0x4654b8ed6f377911,0x051bf02a5685ca63,0xd08b010696df1fa9},
+  {0x656ce860674f0d36,0x8bcbf7bc1ef730bf,0x00a0260df392d280,0x33930145fd64eecf},
+  {0x17743293297fc288,0x5b59ca56522bb36a,0xe58ef14098fd4053,0x7444ed68eb16e657},
+  {0x31beae245608121f,0xea349f5c00e7cc25,0xf076aacf6db8c528,0x13c58f0b1e99ac1a},
+  {0x910f9e30c8455d7b,0xc1ebc494beb98220,0x201a3557ec66e851,0x610dd21bbd2f6b9f},
+  {0x317d8fa79aa99e03,0x7b670f771c4590dd,0x77052e1a54ac4638,0x17309eb8c690df96},
+  {0xddae9fdfd80030d4,0x84daf3404eae25e8,0xe93997a2e172c485,0x51f2159ecb7b5e41},
+  {0x9f02a3e12da8bc2c,0x1c746f4b943dc8e5,0xb31951aeeaac4e5e,0x0128a606643b4341},
+  {0xebd158803af98ce2,0x08e82db8ead7c10b,0xba172e80caa61667,0xf61ff900e1918b8a},
+  {0x8c3c570f9ffae2bc,0xff0827921f27e4f5,0x6256d4a0913919b5,0xc1f4fcc60f17957e},
+  {0x648ade6556f9d114,0xf2e85e1746058ffe,0xc9605989ede623cf,0xf3d09098541725a9},
+  {0xc57b49460d911255,0xc0767005f4affb44,0x486c21436602612a,0x87617ddb2a9643c0},
+  {0xc2038cd71c6d3ead,0x8fe1e58a5096a181,0x51cde6590d0f6b27,0xf59bf938475aa39a},
+  {0x9d8138454badbf16,0xaf8306904b15d8a8,0x83bd9fd79c159b39,0xb85db82acdbbf3ae},
+  {0x560807274e8b13db,0xb33b8a036f1617ca,0x72bc05868c923532,0xb7b8ee25c3388851},
+  {0xc042df127c4f6747,0x704ed715ba3ca7d4,0x678f93c55bc0c5d2,0xd2ee482f0bfe6c9a},
+  {0xbd60c5ba33d87b10,0x6c2ff096c60536d6,0x0ce4b4b8c86a8f5b,0x86a0bcebf81d6e4d},
+  {0xf9384ef3a44799c2,0x8b78ec1c676a7fcd,0x5f7c3edb312b00da,0x2390763c1712af67},
+};
+// S-box functions on field elements (out, in): aim2() calls each of them in place, i.e. with out == in.
+#define GF_exp_invmer_e_1 AIMER_NAMESPACE(GF_exp_invmer_e_1)
+void GF_exp_invmer_e_1(GF out, const GF in);  // "inverse Mersenne S-box" of input branch 0 in aim2()
+#define GF_exp_invmer_e_2 AIMER_NAMESPACE(GF_exp_invmer_e_2)
+void GF_exp_invmer_e_2(GF out, const GF in);  // "inverse Mersenne S-box" of input branch 1 in aim2()
+#define GF_exp_invmer_e_3 AIMER_NAMESPACE(GF_exp_invmer_e_3)
+void GF_exp_invmer_e_3(GF out, const GF in);  // "inverse Mersenne S-box" of input branch 2 in aim2()
+#define GF_exp_mer_e_star AIMER_NAMESPACE(GF_exp_mer_e_star)
+void GF_exp_mer_e_star(GF out, const GF in);  // "Mersenne S-box" aim2() applies to the combined state
+// Derives from iv the per-branch matrices L and U and the vector b that aim2() uses in its affine layer (outputs: matrix_L, matrix_U, vector_b).
+#define generate_matrices_L_and_U AIMER_NAMESPACE(generate_matrices_L_and_U)
+void generate_matrices_L_and_U(
+        GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF vector_b,
+        const uint8_t iv[AIM2_IV_SIZE]);
+// Takes iv (const) and has non-const matrix_A and vector_b. NOTE(review): presumably fills matrix_A[i] with the combined L*U matrix of branch i -- confirm in aim2.c.
+#define generate_matrix_LU AIMER_NAMESPACE(generate_matrix_LU)
+void generate_matrix_LU(GF matrix_A[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+                        GF vector_b,
+                        const uint8_t iv[AIM2_IV_SIZE]);
+// Writes the outputs of the three input S-boxes for pt: sbox_outputs[i] is S-box i applied to pt + aim2_constants[i].
+#define aim2_sbox_outputs AIMER_NAMESPACE(aim2_sbox_outputs)
+void aim2_sbox_outputs(GF sbox_outputs[AIM2_NUM_INPUT_SBOX], const GF pt);
+// AIM2 function on byte strings: reads pt and iv, writes ct; ct is the final S-box output added to pt (feed-forward).
+#define aim2 AIMER_NAMESPACE(aim2)
+void aim2(uint8_t ct[AIM2_NUM_BYTES_FIELD],
+          const uint8_t pt[AIM2_NUM_BYTES_FIELD],
+          const uint8_t iv[AIM2_IV_SIZE]);
+
+#endif // AIM2_H
diff --git a/crypto_sign/aimer256s/m4speed/api.h b/crypto_sign/aimer256s/m4speed/api.h
new file mode 100644
index 00000000..b19100cf
--- /dev/null
+++ b/crypto_sign/aimer256s/m4speed/api.h
@@ -0,0 +1,44 @@
+/*
+NIST-developed software is provided by NIST as a public service. You may use, copy, and distribute copies of the software in any medium, provided that you keep intact this entire notice. You may improve, modify, and create derivative works of the software or any portion of the software, and you may copy and distribute such modifications or works. Modified works should carry a notice stating that you changed the software and should note the date and nature of any such change. Please explicitly acknowledge the National Institute of Standards and Technology as the source of the software.
+ 
+NIST-developed software is expressly provided "AS IS." NIST MAKES NO WARRANTY OF ANY KIND, EXPRESS, IMPLIED, IN FACT, OR ARISING BY OPERATION OF LAW, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND DATA ACCURACY. NIST NEITHER REPRESENTS NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST DOES NOT WARRANT OR MAKE ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE OR THE RESULTS THEREOF, INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY, RELIABILITY, OR USEFULNESS OF THE SOFTWARE.
+ 
+You are solely responsible for determining the appropriateness of using and distributing the software and you assume all risks associated with its use, including but not limited to the risks and costs of program errors, compliance with applicable laws, damage to or loss of data, programs or equipment, and the unavailability or interruption of operation. This software is not intended to be used in any situation where a failure could cause risk of injury or damage to property. The software developed by NIST employees is not subject to copyright protection within the United States.
+*/
+
+#ifndef API_H
+#define API_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+// Byte sizes and scheme name. NOTE(review): by NIST API convention CRYPTO_BYTES is the signature size bound -- confirm against sign.c.
+#define CRYPTO_PUBLICKEYBYTES 64
+#define CRYPTO_SECRETKEYBYTES 96
+#define CRYPTO_BYTES 17056
+#define CRYPTO_ALGNAME "aimer256s"
+// Key generation into pk and sk. NOTE(review): buffer sizes (CRYPTO_*KEYBYTES) and 0-on-success return assumed from the NIST API convention -- confirm in sign.c.
+#define crypto_sign_keypair AIMER_NAMESPACE(crypto_sign_keypair)
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+// Signs the mlen-byte message m with sk; sig and *siglen are the outputs. NOTE(review): presumably a detached signature of at most CRYPTO_BYTES -- confirm in sign.c.
+#define crypto_sign_signature AIMER_NAMESPACE(crypto_sign_signature)
+int crypto_sign_signature(uint8_t *sig, size_t *siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk);
+// Signs the mlen-byte message m with sk; sm and *smlen are the outputs. NOTE(review): presumably sm holds signature plus message (NIST signed-message form) -- confirm layout in sign.c.
+#define crypto_sign AIMER_NAMESPACE(crypto_sign)
+int crypto_sign(uint8_t *sm, size_t *smlen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk);
+// Checks signature sig (siglen bytes) over the mlen-byte message m with pk; all inputs are const. NOTE(review): return convention (0 = valid) assumed from the NIST API -- confirm in sign.c.
+#define crypto_sign_verify AIMER_NAMESPACE(crypto_sign_verify)
+int crypto_sign_verify(const uint8_t *sig, size_t siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *pk);
+// Takes the smlen-byte signed message sm and pk; m and *mlen are the outputs. NOTE(review): presumably verifies sm and recovers the message on success -- confirm in sign.c.
+#define crypto_sign_open AIMER_NAMESPACE(crypto_sign_open)
+int crypto_sign_open(uint8_t *m, size_t *mlen,
+        const uint8_t *sm, size_t smlen,
+        const uint8_t *pk);
+
+#endif
diff --git a/crypto_sign/aimer256s/m4speed/field.c b/crypto_sign/aimer256s/m4speed/field.c
new file mode 100644
index 00000000..5c27f63a
--- /dev/null
+++ b/crypto_sign/aimer256s/m4speed/field.c
@@ -0,0 +1,608 @@
+// SPDX-License-Identifier: MIT
+
+#include "field.h"
+#include <stddef.h>
+#include <stdint.h>
+// Bit masks for the bit-spreading helpers below; each maskN_64 repeats the 32-bit maskN pattern in both halves of a 64-bit word.
+#define mask_64     0x00000000ffffffff  // low 32 bits
+#define mask_final  0x000000000000ffff  // low 16 bits
+
+#define mask0_64    0x000000ff000000ff  // low byte of each 32-bit half
+#define mask0       0x000000ff
+
+#define mask1_64    0x000f000f000f000f  // low nibble of each 16-bit group
+#define mask1       0x000f000f
+
+#define mask2_64    0x0303030303030303  // low two bits of each byte
+#define mask2       0x03030303
+
+#define mask3_64    0x1111111111111111  // lowest bit of each nibble
+#define mask3       0x11111111
+// zero_padding: with masks of the mask1/mask2/mask3 pattern and x0 holding one 8-bit value per 32-bit lane, moves bit i to bit 4*i. Expands to three statements assigning x0; parameter names shadow the mask macros above.
+#define zero_padding(x0, mask1, mask2, mask3) \
+        x0 = (x0 | (x0 << 12)) & mask1; \
+        x0 = (x0 | (x0 << 6 )) & mask2; \
+        x0 = (x0 | (x0 << 3 )) & mask3;
+// inv_zero_padding: reverse of zero_padding for x0 with data only at bits 4*i; the final >> 24 step merges the byte at bit 32 into bits 8..15 (mask_final). Expands to four statements assigning x0.
+#define inv_zero_padding(x0, mask0, mask1, mask2, mask_final) \
+        x0 = (x0 | (x0 >> 3 )) & mask2; \
+        x0 = (x0 | (x0 >> 6 )) & mask1; \
+        x0 = (x0 | (x0 >> 12)) & mask0; \
+        x0 = (x0 | (x0 >> 24)) & mask_final;
+// c = a * b in GF(2^256): 4x4-word carry-less product by Karatsuba (nine poly64_mul calls), reduced modulo x^256 + x^10 + x^5 + x^2 + 1. c is written only after a and b are fully read, so it may alias either.
+void GF_mul(GF c, const GF a, const GF b)
+{
+  uint64_t t[4] = {0,};  // scratch words for poly64_mul outputs
+  uint64_t temp[8] = {0,};  // unreduced 512-bit product, temp[0] is the least significant word
+  uint64_t sub[10] = {0,};  // XOR sums of operand words used as Karatsuba operands
+  // sub[0..3]: sum of the two words inside each 128-bit half; sub[4..7]: words of (low half ^ high half); sub[8..9]: sum of those two words
+  sub[0] = a[0] ^ a[1];
+  sub[1] = b[0] ^ b[1];
+  sub[2] = a[2] ^ a[3];
+  sub[3] = b[2] ^ b[3];
+  sub[4] = a[0] ^ a[2];
+  sub[5] = a[1] ^ a[3];
+  sub[6] = b[0] ^ b[2];
+  sub[7] = b[1] ^ b[3];
+  sub[8] = sub[4] ^ sub[5];
+  sub[9] = sub[6] ^ sub[7];
+  // word products a[i]*b[i]; as used here poly64_mul's 3rd argument receives the high word and the 4th the low word (temp[0] = product word 0, temp[7] = word 7)
+  poly64_mul(&a[0], &b[0], &t[0], &temp[0]);
+  poly64_mul(&a[1], &b[1], &t[2], &t[1]);
+  t[0] ^= t[1];
+
+  poly64_mul(&a[2], &b[2], &t[3], &t[1]);
+  t[1] ^= t[2];
+
+  poly64_mul(&a[3], &b[3], &temp[7], &t[2]);
+  t[2] ^= t[3];
+  // interleaved recombination: temp[] accumulates overlapping product words while t[] is reused as scratch, so statement order matters
+  temp[6] = temp[7] ^ t[2];
+  temp[3] = t[2] ^ t[1];
+  temp[2] = t[1] ^ t[0];
+  temp[1] = t[0] ^ temp[0];
+  // cross product inside the low half: (a0^a1)*(b0^b1)
+  poly64_mul(&sub[0], &sub[1], &t[1], &t[0]);
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+  // cross product inside the high half: (a2^a3)*(b2^b3)
+  poly64_mul(&sub[2], &sub[3], &t[1], &t[0]);
+  temp[3] ^= t[0];
+  temp[6] ^= t[1];
+
+  temp[5] = temp[7] ^ temp[3];
+  temp[4] = temp[6] ^ temp[2];
+  temp[3] ^= temp[1];
+  temp[2] ^= temp[0];
+  // middle term (low half ^ high half of a) * (low half ^ high half of b), itself built from three word products
+  poly64_mul(&sub[4], &sub[6], &t[1], &t[0]);
+  poly64_mul(&sub[5], &sub[7], &t[3], &t[2]);
+  t[1] ^= t[2];
+  t[2] = t[1] ^ t[3];
+  t[1] ^= t[0];
+
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+  temp[4] ^= t[2];
+  temp[5] ^= t[3];
+  // third word product of the middle term: sub[8]*sub[9]
+  poly64_mul(&sub[8], &sub[9], &t[1], &t[0]);
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+  // reduction (x^256 = x^10 + x^5 + x^2 + 1): the bits of temp[7] that the shifts by 10/5/2 push past bit 255 are added to word 4 first, so the folding below reduces them too
+  t[0] = temp[4] ^ ((temp[7] >> 54) ^ (temp[7] >> 59) ^ (temp[7] >> 62));
+  // fold high words into low words: word k gets high word k times (x^10 + x^5 + x^2 + 1), with the top bits of the next lower high word shifted in
+  c[3] = temp[3] ^ temp[7];
+  c[3] ^= (temp[7] << 10) | (temp[6] >> 54);
+  c[3] ^= (temp[7] <<  5) | (temp[6] >> 59);
+  c[3] ^= (temp[7] <<  2) | (temp[6] >> 62);
+
+  c[2] = temp[2] ^ temp[6];
+  c[2] ^= (temp[6] << 10) | (temp[5] >> 54);
+  c[2] ^= (temp[6] <<  5) | (temp[5] >> 59);
+  c[2] ^= (temp[6] <<  2) | (temp[5] >> 62);
+
+  c[1] = temp[1] ^ temp[5];
+  c[1] ^= (temp[5] << 10) | (t[0] >> 54);
+  c[1] ^= (temp[5] <<  5) | (t[0] >> 59);
+  c[1] ^= (temp[5] <<  2) | (t[0] >> 62);
+
+  c[0] = temp[0] ^ t[0];
+  c[0] ^= (t[0] << 10);
+  c[0] ^= (t[0] <<  5);
+  c[0] ^= (t[0] <<  2);
+}
+// c ^= a * b in GF(2^256): same Karatsuba product and reduction modulo x^256 + x^10 + x^5 + x^2 + 1 as GF_mul, but the reduced words are XORed into the existing value of c.
+void GF_mul_add(GF c, const GF a, const GF b)
+{
+  uint64_t t[4] = {0,};  // scratch words for poly64_mul outputs
+  uint64_t temp[8] = {0,};  // unreduced 512-bit product, temp[0] is the least significant word
+  uint64_t sub[10] = {0,};  // XOR sums of operand words used as Karatsuba operands
+  // sub[0..3]: sum of the two words inside each 128-bit half; sub[4..7]: words of (low half ^ high half); sub[8..9]: sum of those two words
+  sub[0] = a[0] ^ a[1];
+  sub[1] = b[0] ^ b[1];
+  sub[2] = a[2] ^ a[3];
+  sub[3] = b[2] ^ b[3];
+  sub[4] = a[0] ^ a[2];
+  sub[5] = a[1] ^ a[3];
+  sub[6] = b[0] ^ b[2];
+  sub[7] = b[1] ^ b[3];
+  sub[8] = sub[4] ^ sub[5];
+  sub[9] = sub[6] ^ sub[7];
+  // word products a[i]*b[i]; as used here poly64_mul's 3rd argument receives the high word and the 4th the low word (temp[0] = product word 0, temp[7] = word 7)
+  poly64_mul(&a[0], &b[0], &t[0], &temp[0]);
+  poly64_mul(&a[1], &b[1], &t[2], &t[1]);
+  t[0] ^= t[1];
+
+  poly64_mul(&a[2], &b[2], &t[3], &t[1]);
+  t[1] ^= t[2];
+
+  poly64_mul(&a[3], &b[3], &temp[7], &t[2]);
+  t[2] ^= t[3];
+  // interleaved recombination: temp[] accumulates overlapping product words while t[] is reused as scratch, so statement order matters
+  temp[6] = temp[7] ^ t[2];
+  temp[3] = t[2] ^ t[1];
+  temp[2] = t[1] ^ t[0];
+  temp[1] = t[0] ^ temp[0];
+  // cross product inside the low half: (a0^a1)*(b0^b1)
+  poly64_mul(&sub[0], &sub[1], &t[1], &t[0]);
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+  // cross product inside the high half: (a2^a3)*(b2^b3)
+  poly64_mul(&sub[2], &sub[3], &t[1], &t[0]);
+  temp[3] ^= t[0];
+  temp[6] ^= t[1];
+
+  temp[5] = temp[7] ^ temp[3];
+  temp[4] = temp[6] ^ temp[2];
+  temp[3] ^= temp[1];
+  temp[2] ^= temp[0];
+  // middle term (low half ^ high half of a) * (low half ^ high half of b), itself built from three word products
+  poly64_mul(&sub[4], &sub[6], &t[1], &t[0]);
+  poly64_mul(&sub[5], &sub[7], &t[3], &t[2]);
+  t[1] ^= t[2];
+  t[2] = t[1] ^ t[3];
+  t[1] ^= t[0];
+
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+  temp[4] ^= t[2];
+  temp[5] ^= t[3];
+  // third word product of the middle term: sub[8]*sub[9]
+  poly64_mul(&sub[8], &sub[9], &t[1], &t[0]);
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+  // reduction (x^256 = x^10 + x^5 + x^2 + 1): the bits of temp[7] that the shifts by 10/5/2 push past bit 255 are added to word 4 first, so the folding below reduces them too
+  t[0] = temp[4] ^ ((temp[7] >> 54) ^ (temp[7] >> 59) ^ (temp[7] >> 62));
+  // accumulate into c: word k gets high word k times (x^10 + x^5 + x^2 + 1), with the top bits of the next lower high word shifted in
+  c[3] ^= temp[3] ^ temp[7];
+  c[3] ^= (temp[7] << 10) | (temp[6] >> 54);
+  c[3] ^= (temp[7] <<  5) | (temp[6] >> 59);
+  c[3] ^= (temp[7] <<  2) | (temp[6] >> 62);
+
+  c[2] ^= temp[2] ^ temp[6];
+  c[2] ^= (temp[6] << 10) | (temp[5] >> 54);
+  c[2] ^= (temp[6] <<  5) | (temp[5] >> 59);
+  c[2] ^= (temp[6] <<  2) | (temp[5] >> 62);
+
+  c[1] ^= temp[1] ^ temp[5];
+  c[1] ^= (temp[5] << 10) | (t[0] >> 54);
+  c[1] ^= (temp[5] <<  5) | (t[0] >> 59);
+  c[1] ^= (temp[5] <<  2) | (t[0] >> 62);
+
+  c[0] ^= temp[0] ^ t[0];
+  c[0] ^= (t[0] << 10);
+  c[0] ^= (t[0] <<  5);
+  c[0] ^= (t[0] <<  2);
+}
+// c = XOR of the rows b[AIM2_NUM_BITS_WORD*w + k] for which bit k of word a[w] is set. The sum is built in locals and stored at the end, so c may alias a (aim2() calls it with c == a).
+void GF_transposed_matmul(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD])
+{
+  const uint64_t *a_ptr = a;
+  const GF *b_ptr = b;
+  // running XOR of the selected rows, one accumulator per 64-bit word of a row (rows are assumed to be four words)
+  uint64_t temp_c0 = 0;
+  uint64_t temp_c1 = 0;
+  uint64_t temp_c2 = 0;
+  uint64_t temp_c3 = 0;
+  uint64_t mask;
+  for (size_t i = AIM2_NUM_WORDS_FIELD; i; --i, ++a_ptr)
+  {
+    uint64_t index = *a_ptr;
+    for (size_t j = AIM2_NUM_BITS_WORD; j; j -= 4, index >>= 4, b_ptr += 4)  // four bits/rows per pass; terminates only if AIM2_NUM_BITS_WORD is a multiple of 4
+    {
+      mask = -(index & 1);  // all ones if the bit is set, else zero: the row is selected by AND, not by a branch on the bit
+      temp_c0 ^= (b_ptr[0][0] & mask);
+      temp_c1 ^= (b_ptr[0][1] & mask);
+      temp_c2 ^= (b_ptr[0][2] & mask);
+      temp_c3 ^= (b_ptr[0][3] & mask);
+
+      mask = -((index >> 1) & 1);
+      temp_c0 ^= (b_ptr[1][0] & mask);
+      temp_c1 ^= (b_ptr[1][1] & mask);
+      temp_c2 ^= (b_ptr[1][2] & mask);
+      temp_c3 ^= (b_ptr[1][3] & mask);
+
+      mask = -((index >> 2) & 1);
+      temp_c0 ^= (b_ptr[2][0] & mask);
+      temp_c1 ^= (b_ptr[2][1] & mask);
+      temp_c2 ^= (b_ptr[2][2] & mask);
+      temp_c3 ^= (b_ptr[2][3] & mask);
+
+      mask = -((index >> 3) & 1);
+      temp_c0 ^= (b_ptr[3][0] & mask);
+      temp_c1 ^= (b_ptr[3][1] & mask);
+      temp_c2 ^= (b_ptr[3][2] & mask);
+      temp_c3 ^= (b_ptr[3][3] & mask);
+    }
+  }
+  c[0] = temp_c0;
+  c[1] = temp_c1;
+  c[2] = temp_c2;
+  c[3] = temp_c3;
+}
+
+void GF_transposed_matmul_add(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD])
+{
+  // c ^= XOR of every row b[k] whose selector bit k of a is set; bit k is
+  // bit (k % 64) of word a[k / 64].  The selection uses all-ones / all-zero
+  // masks, so every row is read no matter what a contains.
+
+  uint64_t sum0 = 0;
+  uint64_t sum1 = 0;
+  uint64_t sum2 = 0;
+  uint64_t sum3 = 0;
+
+  // group points at the four rows that belong to the current four bits
+  const GF *group = b;
+  for (size_t word = 0; word < AIM2_NUM_WORDS_FIELD; word++)
+  {
+    uint64_t selector = a[word];
+
+    // four selector bits per step, least significant bit first
+    for (size_t left = AIM2_NUM_BITS_WORD; left != 0; left -= 4)
+    {
+      const uint64_t m0 = -(selector & 1);
+      const uint64_t m1 = -((selector >> 1) & 1);
+      const uint64_t m2 = -((selector >> 2) & 1);
+      const uint64_t m3 = -((selector >> 3) & 1);
+
+      sum0 ^= (group[0][0] & m0) ^ (group[1][0] & m1) ^
+              (group[2][0] & m2) ^ (group[3][0] & m3);
+      sum1 ^= (group[0][1] & m0) ^ (group[1][1] & m1) ^
+              (group[2][1] & m2) ^ (group[3][1] & m3);
+      sum2 ^= (group[0][2] & m0) ^ (group[1][2] & m1) ^
+              (group[2][2] & m2) ^ (group[3][2] & m3);
+      sum3 ^= (group[0][3] & m0) ^ (group[1][3] & m1) ^
+              (group[2][3] & m2) ^ (group[3][3] & m3);
+
+      selector >>= 4;
+      group += 4;
+    }
+  }
+
+  // accumulate into c instead of overwriting it
+  c[0] ^= sum0;
+  c[1] ^= sum1;
+  c[2] ^= sum2;
+  c[3] ^= sum3;
+}
+
+static void poly64_mul_s(uint64_t *z1, uint64_t *z0, uint64_t x, uint64_t y)
+{
+  // z1:z0 (z1 = upper word) receives the product of x and y, assembled Karatsuba-style with XOR.
+  uint32_t x4 = x >> 32; // upper 32 bits of x, split further below
+  // x, lower half: pieces taken at bit offsets 0, 8, 16 and 24, each masked with mask0
+  uint32_t x0 = x & mask_64;
+  uint32_t x1 = (x0 >> 8) & mask0;
+  uint32_t x2 = (x0 >> 16) & mask0;
+  uint32_t x3 = (x0 >> 24) & mask0;
+  x0 &= mask0;
+
+  // x, upper half: same split
+  uint32_t x5 = (x4 >> 8) & mask0;
+  uint32_t x6 = (x4 >> 16) & mask0;
+  uint32_t x7 = (x4 >> 24) & mask0;
+  x4 &= mask0;
+
+  // y is split exactly like x: y4..y7 from the upper half, y0..y3 from the lower half
+  uint32_t y4 = y >> 32;
+
+  uint32_t y0 = y & mask_64;
+  uint32_t y1 = (y0 >> 8) & mask0;
+  uint32_t y2 = (y0 >> 16) & mask0;
+  uint32_t y3 = (y0 >> 24) & mask0;
+  y0 &= mask0;
+
+  // y, upper half
+  uint32_t y5 = (y4 >> 8) & mask0;
+  uint32_t y6 = (y4 >> 16) & mask0;
+  uint32_t y7 = (y4 >> 24) & mask0;
+  y4 &= mask0;
+
+  // x pieces go through the zero_padding macro (defined outside this function)
+  zero_padding(x0, mask1, mask2, mask3);
+  zero_padding(x1, mask1, mask2, mask3);
+  zero_padding(x2, mask1, mask2, mask3);
+  zero_padding(x3, mask1, mask2, mask3);
+  zero_padding(x4, mask1, mask2, mask3);
+  zero_padding(x5, mask1, mask2, mask3);
+  zero_padding(x6, mask1, mask2, mask3);
+  zero_padding(x7, mask1, mask2, mask3);
+
+  // y pieces likewise
+  zero_padding(y0, mask1, mask2, mask3);
+  zero_padding(y1, mask1, mask2, mask3);
+  zero_padding(y2, mask1, mask2, mask3);
+  zero_padding(y3, mask1, mask2, mask3);
+  zero_padding(y4, mask1, mask2, mask3);
+  zero_padding(y5, mask1, mask2, mask3);
+  zero_padding(y6, mask1, mask2, mask3);
+  zero_padding(y7, mask1, mask2, mask3);
+  // a3:a2:a1:a0 = (x0..x3) * (y0..y3).  Partial products x_i * y_j are masked with mask3_64 and
+  // XORed per column i + j; column n lands at bit offset 32 * n, so odd columns straddle two words.
+  uint64_t a0 = ((uint64_t)x0 * (uint64_t)y0) & mask3_64;
+  uint64_t a1 = ((((uint64_t)x0 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y0) & mask3_64));
+  a0 ^= (a1 << 32);
+  a1 = a1 >> 32;
+  a1 ^= ((((uint64_t)x0 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x1 * (uint64_t)y1) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y0) & mask3_64));
+
+  uint64_t a2 = ((((uint64_t)x0 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y2) & mask3_64) ^
+                 (((uint64_t)x2 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y0) & mask3_64));
+  a1 ^= (a2 << 32);
+  a2 = a2 >> 32;
+  a2 ^= ((((uint64_t)x1 * (uint64_t)y3) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x3 * (uint64_t)y1) & mask3_64));
+
+  uint64_t a3 = ((((uint64_t)x2 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y2) & mask3_64));
+  a2 ^= (a3 << 32);
+  a3 = a3 >> 32;
+
+  a3 ^= ((uint64_t)x3 * (uint64_t)y3) & mask3_64;
+
+  // b3:b2:b1:b0 = (x4..x7) * (y4..y7), same scheme
+  uint64_t b0 = ((uint64_t)x4 * (uint64_t)y4) & mask3_64;
+  uint64_t b1 = ((((uint64_t)x4 * (uint64_t)y5) & mask3_64) ^
+                 (((uint64_t)x5 * (uint64_t)y4) & mask3_64));
+  b0 ^= (b1 << 32);
+  b1 = b1 >> 32;
+  b1 ^= ((((uint64_t)x4 * (uint64_t)y6) & mask3_64) ^
+         (((uint64_t)x5 * (uint64_t)y5) & mask3_64) ^
+         (((uint64_t)x6 * (uint64_t)y4) & mask3_64));
+
+  uint64_t b2 = ((((uint64_t)x4 * (uint64_t)y7) & mask3_64) ^
+                 (((uint64_t)x5 * (uint64_t)y6) & mask3_64) ^
+                 (((uint64_t)x6 * (uint64_t)y5) & mask3_64) ^
+                 (((uint64_t)x7 * (uint64_t)y4) & mask3_64));
+  b1 ^= (b2 << 32);
+  b2 = b2 >> 32;
+  b2 ^= ((((uint64_t)x5 * (uint64_t)y7) & mask3_64) ^
+         (((uint64_t)x6 * (uint64_t)y6) & mask3_64) ^
+         (((uint64_t)x7 * (uint64_t)y5) & mask3_64));
+
+  uint64_t b3 = ((((uint64_t)x6 * (uint64_t)y7) & mask3_64) ^
+                 (((uint64_t)x7 * (uint64_t)y6) & mask3_64));
+  b2 ^= (b3 << 32);
+  b3 = b3 >> 32;
+
+  b3 ^= ((uint64_t)x7 * (uint64_t)y7) & mask3_64;
+
+  // middle term: c3:c2:c1:c0 = (x0..x3 ^ x4..x7) * (y0..y3 ^ y4..y7), same scheme
+  x0 ^= x4;
+  x1 ^= x5;
+  x2 ^= x6;
+  x3 ^= x7;
+
+  y0 ^= y4;
+  y1 ^= y5;
+  y2 ^= y6;
+  y3 ^= y7;
+
+  uint64_t c0 = ((uint64_t)x0 * (uint64_t)y0) & mask3_64;
+  uint64_t c1 = ((((uint64_t)x0 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y0) & mask3_64));
+  c0 ^= (c1 << 32);
+  c1 = c1 >> 32;
+  c1 ^= ((((uint64_t)x0 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x1 * (uint64_t)y1) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y0) & mask3_64));
+
+  uint64_t c2 = ((((uint64_t)x0 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y2) & mask3_64) ^
+                 (((uint64_t)x2 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y0) & mask3_64));
+  c1 ^= (c2 << 32);
+  c2 = c2 >> 32;
+  c2 ^= ((((uint64_t)x1 * (uint64_t)y3) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x3 * (uint64_t)y1) & mask3_64));
+
+  uint64_t c3 = ((((uint64_t)x2 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y2) & mask3_64));
+  c2 ^= (c3 << 32);
+  c3 = c3 >> 32;
+
+  c3 ^= ((uint64_t)x3 * (uint64_t)y3) & mask3_64;
+  // Karatsuba: remove a and b from c, which leaves the cross term of the two halves
+  c0 = c0 ^ a0 ^ b0;
+  c1 = c1 ^ a1 ^ b1;
+  c2 = c2 ^ a2 ^ b2;
+  c3 = c3 ^ a3 ^ b3;
+  // add the cross term two words above a0, i.e. into a2, a3, b0 and b1
+  a2 ^= c0;
+  a3 ^= c1;
+  b0 ^= c2;
+  b1 ^= c3;
+
+  // every accumulator goes through inv_zero_padding (defined outside this function)
+  inv_zero_padding(a0, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a1, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a2, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a3, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b0, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b1, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b2, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b3, mask0_64, mask1_64, mask2_64, mask_final);
+  // each output word packs four accumulators at bit offsets 0, 16, 32 and 48
+  *z0 = a0 | (a1 << 16) | (a2 << 32) | (a3 << 48);
+  *z1 = b0 | (b1 << 16) | (b2 << 32) | (b3 << 48);
+}
+
+void GF_mul_s(GF c, const GF a, const GF b)
+{
+  uint64_t t[4] = {0,};
+  uint64_t add[4] = {0,};
+  uint64_t temp[8] = {0,};
+  // c = a * b: nine poly64_mul_s calls build the 8-word product temp[0..7], which is then reduced.
+  poly64_mul_s(&t[0], &temp[0], a[0], b[0]);
+  poly64_mul_s(&t[2], &t[1], a[1], b[1]);
+  t[0] ^= t[1];
+
+  poly64_mul_s(&t[3], &t[1], a[2], b[2]);
+  t[1] ^= t[2];
+
+  poly64_mul_s(&temp[7], &t[2], a[3], b[3]);
+  t[2] ^= t[3];
+
+  temp[6] = temp[7] ^ t[2];
+  temp[3] = t[2] ^ t[1];
+  temp[2] = t[1] ^ t[0];
+  temp[1] = t[0] ^ temp[0];
+  // cross products within the lower word pair (a[0], a[1]) and within the upper pair (a[2], a[3])
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[1]), (b[0] ^ b[1]));
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+
+  poly64_mul_s(&t[1], &t[0], (a[2] ^ a[3]), (b[2] ^ b[3]));
+  temp[3] ^= t[0];
+  temp[6] ^= t[1];
+
+  temp[5] = temp[7] ^ temp[3];
+  temp[4] = temp[6] ^ temp[2];
+  temp[3] ^= temp[1];
+  temp[2] ^= temp[0];
+  // products of the XORed halves (lower pair ^ upper pair), added into temp[2..5]
+  add[0] = a[0] ^ a[2];
+  add[1] = a[1] ^ a[3];
+  add[2] = b[0] ^ b[2];
+  add[3] = b[1] ^ b[3];
+  poly64_mul_s(&t[1], &t[0], add[0], add[2]);
+  poly64_mul_s(&t[3], &t[2], add[1], add[3]);
+  t[1] ^= t[2];
+  t[2] = t[1] ^ t[3];
+  t[1] ^= t[0];
+
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+  temp[4] ^= t[2];
+  temp[5] ^= t[3];
+
+  poly64_mul_s(&t[1], &t[0], (add[0] ^ add[1]), (add[2] ^ add[3]));
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+  // reduction: temp[4..7] is folded into the low words with shifts 0, 2, 5 and 10
+  t[0] = temp[4] ^ ((temp[7] >> 54) ^ (temp[7] >> 59) ^ (temp[7] >> 62));
+  // t[0] above is temp[4] plus the bits of temp[7] that those shifts push past the top word
+  c[3] = temp[3] ^ temp[7];
+  c[3] ^= (temp[7] << 10) | (temp[6] >> 54);
+  c[3] ^= (temp[7] <<  5) | (temp[6] >> 59);
+  c[3] ^= (temp[7] <<  2) | (temp[6] >> 62);
+
+  c[2] = temp[2] ^ temp[6];
+  c[2] ^= (temp[6] << 10) | (temp[5] >> 54);
+  c[2] ^= (temp[6] <<  5) | (temp[5] >> 59);
+  c[2] ^= (temp[6] <<  2) | (temp[5] >> 62);
+
+  c[1] = temp[1] ^ temp[5];
+  c[1] ^= (temp[5] << 10) | (t[0] >> 54);
+  c[1] ^= (temp[5] <<  5) | (t[0] >> 59);
+  c[1] ^= (temp[5] <<  2) | (t[0] >> 62);
+
+  c[0] = temp[0] ^ t[0];
+  c[0] ^= (t[0] << 10);
+  c[0] ^= (t[0] <<  5);
+  c[0] ^= (t[0] <<  2);
+}
+
+void GF_mul_add_s(GF c, const GF a, const GF b)
+{
+  uint64_t t[4] = {0,};
+  uint64_t add[4] = {0,};
+  uint64_t temp[8] = {0,};
+  // c ^= a * b: nine poly64_mul_s calls build the 8-word product temp[0..7], which is then reduced.
+  poly64_mul_s(&t[0], &temp[0], a[0], b[0]);
+  poly64_mul_s(&t[2], &t[1], a[1], b[1]);
+  t[0] ^= t[1];
+
+  poly64_mul_s(&t[3], &t[1], a[2], b[2]);
+  t[1] ^= t[2];
+
+  poly64_mul_s(&temp[7], &t[2], a[3], b[3]);
+  t[2] ^= t[3];
+
+  temp[6] = temp[7] ^ t[2];
+  temp[3] = t[2] ^ t[1];
+  temp[2] = t[1] ^ t[0];
+  temp[1] = t[0] ^ temp[0];
+  // cross products within the lower word pair (a[0], a[1]) and within the upper pair (a[2], a[3])
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[1]), (b[0] ^ b[1]));
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+
+  poly64_mul_s(&t[1], &t[0], (a[2] ^ a[3]), (b[2] ^ b[3]));
+  temp[3] ^= t[0];
+  temp[6] ^= t[1];
+
+  temp[5] = temp[7] ^ temp[3];
+  temp[4] = temp[6] ^ temp[2];
+  temp[3] ^= temp[1];
+  temp[2] ^= temp[0];
+  // products of the XORed halves (lower pair ^ upper pair), added into temp[2..5]
+  add[0] = a[0] ^ a[2];
+  add[1] = a[1] ^ a[3];
+  add[2] = b[0] ^ b[2];
+  add[3] = b[1] ^ b[3];
+  poly64_mul_s(&t[1], &t[0], add[0], add[2]);
+  poly64_mul_s(&t[3], &t[2], add[1], add[3]);
+  t[1] ^= t[2];
+  t[2] = t[1] ^ t[3];
+  t[1] ^= t[0];
+
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+  temp[4] ^= t[2];
+  temp[5] ^= t[3];
+
+  poly64_mul_s(&t[1], &t[0], (add[0] ^ add[1]), (add[2] ^ add[3]));
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+  // reduction: temp[4..7] is folded into the low words with shifts 0, 2, 5 and 10
+  t[0] = temp[4] ^ ((temp[7] >> 54) ^ (temp[7] >> 59) ^ (temp[7] >> 62));
+  // unlike GF_mul_s, every word of the reduced product is XORed into c (note the ^= below)
+  c[3] ^= temp[3] ^ temp[7];
+  c[3] ^= (temp[7] << 10) | (temp[6] >> 54);
+  c[3] ^= (temp[7] <<  5) | (temp[6] >> 59);
+  c[3] ^= (temp[7] <<  2) | (temp[6] >> 62);
+
+  c[2] ^= temp[2] ^ temp[6];
+  c[2] ^= (temp[6] << 10) | (temp[5] >> 54);
+  c[2] ^= (temp[6] <<  5) | (temp[5] >> 59);
+  c[2] ^= (temp[6] <<  2) | (temp[5] >> 62);
+
+  c[1] ^= temp[1] ^ temp[5];
+  c[1] ^= (temp[5] << 10) | (t[0] >> 54);
+  c[1] ^= (temp[5] <<  5) | (t[0] >> 59);
+  c[1] ^= (temp[5] <<  2) | (t[0] >> 62);
+
+  c[0] ^= temp[0] ^ t[0];
+  c[0] ^= (t[0] << 10);
+  c[0] ^= (t[0] <<  5);
+  c[0] ^= (t[0] <<  2);
+}
diff --git a/crypto_sign/aimer256s/m4speed/field.h b/crypto_sign/aimer256s/m4speed/field.h
new file mode 100644
index 00000000..089ad983
--- /dev/null
+++ b/crypto_sign/aimer256s/m4speed/field.h
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef FIELD_H
+#define FIELD_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+// Field element: AIM2_NUM_BITS_FIELD = 256 bits held in four 64-bit words.
+typedef uint64_t GF[4];
+// Every function name below is mapped through AIMER_NAMESPACE (see params.h).
+#define poly64_mul AIMER_NAMESPACE(poly64_mul)
+void poly64_mul(const uint64_t *a, const uint64_t *b, uint64_t *c1, uint64_t *c0);
+#define GF_set0 AIMER_NAMESPACE(GF_set0)
+void GF_set0(GF a);
+#define GF_copy AIMER_NAMESPACE(GF_copy)
+void GF_copy(GF out, const GF in);
+#define GF_to_bytes AIMER_NAMESPACE(GF_to_bytes)
+void GF_to_bytes(uint8_t *out, const GF in);
+#define GF_from_bytes AIMER_NAMESPACE(GF_from_bytes)
+void GF_from_bytes(GF out, const uint8_t *in);
+// GF_transposed_matmul overwrites c, GF_transposed_matmul_add XORs its result into c.
+#define GF_add AIMER_NAMESPACE(GF_add)
+void GF_add(GF c, const GF a, const GF b);
+#define GF_mul AIMER_NAMESPACE(GF_mul)
+void GF_mul(GF c, const GF a, const GF b);
+#define GF_mul_add AIMER_NAMESPACE(GF_mul_add)
+void GF_mul_add(GF c, const GF a, const GF b);
+#define GF_transposed_matmul AIMER_NAMESPACE(GF_transposed_matmul)
+void GF_transposed_matmul(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]);
+#define GF_transposed_matmul_add AIMER_NAMESPACE(GF_transposed_matmul_add)
+void GF_transposed_matmul_add(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]);
+// GF_mul_s overwrites c with the product, GF_mul_add_s XORs the product into c.
+#define GF_mul_s AIMER_NAMESPACE(GF_mul_s)
+void GF_mul_s(GF c, const GF a, const GF b);
+#define GF_mul_add_s AIMER_NAMESPACE(GF_mul_add_s)
+void GF_mul_add_s(GF c, const GF a, const GF b);
+#define GF_sqr_s AIMER_NAMESPACE(GF_sqr_s)
+void GF_sqr_s(GF c, const GF a);
+
+#endif // FIELD_H
diff --git a/crypto_sign/aimer256s/m4speed/hash.c b/crypto_sign/aimer256s/m4speed/hash.c
new file mode 100644
index 00000000..005b51d1
--- /dev/null
+++ b/crypto_sign/aimer256s/m4speed/hash.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: MIT
+
+#include "hash.h"
+#include "keccakf1600.h"
+#include <stddef.h>
+#include <stdint.h>
+
+static void shake256_inc_skip_squeeze(shake256incctx *state, size_t outlen)
+{
+  // Advances the SHAKE256 output by outlen bytes without storing them.
+  // ctx[25] is treated as the number of output bytes still unread in the
+  // current block; those are used up before the state is permuted again.
+  size_t skip = outlen;
+  if (skip > state->ctx[25])
+  {
+    skip = (size_t)state->ctx[25];
+  }
+  outlen -= skip;
+  state->ctx[25] -= skip;
+
+  while (outlen > 0)
+  {
+    KeccakF1600_StatePermute(state->ctx);
+
+    // a fresh block offers SHAKE256_RATE bytes; take no more than needed
+    skip = (outlen < SHAKE256_RATE) ? outlen : SHAKE256_RATE;
+    outlen -= skip;
+    state->ctx[25] = SHAKE256_RATE - skip;
+  }
+}
+
+void hash_init(hash_instance *ctx)
+{
+  shake256_inc_init(ctx); // start an incremental SHAKE256 computation with nothing absorbed yet
+}
+
+void hash_init_prefix(hash_instance *ctx, uint8_t prefix)
+{
+  shake256_inc_init(ctx);
+  shake256_inc_absorb(ctx, &prefix, sizeof(prefix)); // the prefix is the first byte absorbed
+}
+
+void hash_update(hash_instance *ctx, const uint8_t *data, size_t data_len)
+{
+  shake256_inc_absorb(ctx, data, data_len); // absorb data_len bytes starting at data
+}
+
+void hash_final(hash_instance *ctx)
+{
+  shake256_inc_finalize(ctx); // ends absorbing; callers in sign.c squeeze only after this call
+}
+
+void hash_squeeze(hash_instance *ctx, uint8_t *buffer, size_t buffer_len)
+{
+  shake256_inc_squeeze(buffer, buffer_len, ctx); // note the order: output buffer first, context last
+}
+
+void hash_skip_squeeze(hash_instance *ctx, size_t buffer_len)
+{
+  shake256_inc_skip_squeeze(ctx, buffer_len); // advance the output by buffer_len bytes, storing none
+}
+
+void hash_ctx_clone(hash_instance *ctx_dest, const hash_instance *ctx_src)
+{
+  shake256_inc_ctx_clone(ctx_dest, ctx_src); // ctx_dest becomes a copy of ctx_src; ctx_src is not modified
+}
+
+void hash_ctx_release(hash_instance *ctx)
+{
+  shake256_inc_ctx_release(ctx); // forwarded unchanged; call once per context when done with it
+}
diff --git a/crypto_sign/aimer256s/m4speed/hash.h b/crypto_sign/aimer256s/m4speed/hash.h
new file mode 100644
index 00000000..8f3450ea
--- /dev/null
+++ b/crypto_sign/aimer256s/m4speed/hash.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef HASH_H
+#define HASH_H
+
+#include "fips202.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+// One-byte prefixes for hash_init_prefix, which absorbs the prefix before any other data.
+static const uint8_t HASH_PREFIX_0 = 0;
+static const uint8_t HASH_PREFIX_1 = 1;
+static const uint8_t HASH_PREFIX_2 = 2;
+static const uint8_t HASH_PREFIX_3 = 3;
+static const uint8_t HASH_PREFIX_4 = 4;
+static const uint8_t HASH_PREFIX_5 = 5;
+// All hash_* functions operate on an incremental SHAKE256 context.
+typedef shake256incctx hash_instance;
+// hash_squeeze stores output bytes; hash_skip_squeeze advances the output without storing any.
+#define hash_init AIMER_NAMESPACE(hash_init)
+void hash_init(hash_instance *ctx);
+#define hash_init_prefix AIMER_NAMESPACE(hash_init_prefix)
+void hash_init_prefix(hash_instance *ctx, uint8_t prefix);
+#define hash_update AIMER_NAMESPACE(hash_update)
+void hash_update(hash_instance *ctx, const uint8_t *data, size_t data_len);
+#define hash_final AIMER_NAMESPACE(hash_final)
+void hash_final(hash_instance *ctx);
+#define hash_squeeze AIMER_NAMESPACE(hash_squeeze)
+void hash_squeeze(hash_instance *ctx, uint8_t *buffer, size_t buffer_len);
+#define hash_skip_squeeze AIMER_NAMESPACE(hash_skip_squeeze)
+void hash_skip_squeeze(hash_instance *ctx, size_t buffer_len);
+#define hash_ctx_clone AIMER_NAMESPACE(hash_ctx_clone)
+void hash_ctx_clone(hash_instance *ctx_dest, const hash_instance *ctx_src);
+#define hash_ctx_release AIMER_NAMESPACE(hash_ctx_release)
+void hash_ctx_release(hash_instance *ctx);
+
+#endif // HASH_H
diff --git a/crypto_sign/aimer256s/m4speed/params.h b/crypto_sign/aimer256s/m4speed/params.h
new file mode 100644
index 00000000..97da601d
--- /dev/null
+++ b/crypto_sign/aimer256s/m4speed/params.h
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef PARAMS_H
+#define PARAMS_H
+// prefix pasted in front of every namespaced symbol of this parameter set
+#define AIMER_NAMESPACE(s)          samsungsds_aimer256s_m4speed_##s
+
+#define SECURITY_BITS               256                  // security parameter
+#define SECURITY_BYTES              (SECURITY_BITS / 8)  // byte size of security parameter
+
+#define AIM2_NUM_BITS_FIELD         SECURITY_BITS        // number of bits in field element
+#define AIM2_NUM_BYTES_FIELD        SECURITY_BYTES       // number of bytes in field element
+#define AIM2_NUM_WORDS_FIELD        (SECURITY_BITS / 64) // number of 64-bit words in element
+#define AIM2_NUM_BITS_WORD          64                   // number of bits in word
+#define AIM2_IV_SIZE                SECURITY_BYTES       // byte size of AIM2 initial vector
+
+#define AIM2_NUM_INPUT_SBOX         3                    // number of AIM2 input S-boxes
+
+#define AIMER_SALT_SIZE             SECURITY_BYTES       // byte size of salt
+#define AIMER_SEED_SIZE             SECURITY_BYTES       // byte size of seed
+#define AIMER_COMMIT_SIZE           (SECURITY_BYTES * 2) // byte size of commitment
+
+#define AIMER_L                     AIM2_NUM_INPUT_SBOX  // short alias used by the signing code
+#define AIMER_T                     33                   // number of parallel repetitions (Tau)
+#define AIMER_N                     256                  // number of MPC parties (N)
+#define AIMER_LOGN                  8                    // log_2(N)
+
+#define PRE_TREE_IDX                256                  // pre_nodes in sign.c hold 2 * PRE_TREE_IDX - 1 nodes per repetition
+
+#endif // PARAMS_H
diff --git a/crypto_sign/aimer256s/m4speed/sign.c b/crypto_sign/aimer256s/m4speed/sign.c
new file mode 100644
index 00000000..b983fafc
--- /dev/null
+++ b/crypto_sign/aimer256s/m4speed/sign.c
@@ -0,0 +1,690 @@
+// SPDX-License-Identifier: MIT
+
+#include "api.h"
+#include "aim2.h"
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include "randombytes.h"
+#include "sign.h"
+#include "tree.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+void commit_and_expand_tape(tape_t *tape, uint8_t *commit,
+                            const hash_instance *ctx_precom,
+                            const uint8_t seed[AIMER_SEED_SIZE],
+                            size_t rep, size_t party)
+{
+  // hash input: repetition byte | party byte | seed
+  uint8_t input[2 + AIMER_SEED_SIZE] = { (uint8_t)rep, (uint8_t)party };
+  memcpy(&input[2], seed, AIMER_SEED_SIZE);
+
+  // continue from a private copy of the precomputed state
+  hash_instance state;
+  hash_ctx_clone(&state, ctx_precom);
+  hash_update(&state, input, sizeof(input));
+  hash_final(&state);
+  // output stream: AIMER_COMMIT_SIZE commitment bytes first, then the tape
+  hash_squeeze(&state, commit, AIMER_COMMIT_SIZE);
+  hash_squeeze(&state, (uint8_t *)tape, sizeof(tape_t));
+  hash_ctx_release(&state);
+}
+
+void commit_and_expand_tape_phase_3(tape_t *tape,
+                                    const hash_instance *ctx_precom,
+                                    const uint8_t seed[AIMER_SEED_SIZE],
+                                    size_t rep, size_t party)
+{
+  // hash input: repetition byte | party byte | seed
+  uint8_t input[2 + AIMER_SEED_SIZE] = { (uint8_t)rep, (uint8_t)party };
+  memcpy(&input[2], seed, AIMER_SEED_SIZE);
+
+  // continue from a private copy of the precomputed state
+  hash_instance state;
+  hash_ctx_clone(&state, ctx_precom);
+  hash_update(&state, input, sizeof(input));
+  hash_final(&state);
+  // the leading AIMER_COMMIT_SIZE output bytes are skipped, only the tape is kept
+  hash_skip_squeeze(&state, AIMER_COMMIT_SIZE);
+  hash_squeeze(&state, (uint8_t *)tape, sizeof(tape_t));
+  hash_ctx_release(&state);
+}
+
+void aim2_mpc(mult_chk_t *mult_chk,
+              const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+              const GF ct_GF)
+{
+  // For every S-box relation  pt + c = t ^ {2 ^ e - 1}  the check uses the
+  // multiplied-out form  t ^ {2 ^ e} + t * c = t * pt,  i.e.  z = x * pt.
+  GF *const x = mult_chk->x_shares;
+  GF *const z = mult_chk->z_shares;
+
+  // input S-box 0: eleven squarings in total, then add x[0] * constant
+  GF_sqr_s(z[0], x[0]);
+  for (size_t more = 10; more != 0; more--)
+  {
+    GF_sqr_s(z[0], z[0]);
+  }
+  GF_mul_add(z[0], x[0], aim2_constants[0]);
+  GF_transposed_matmul_add(x[AIMER_L], x[0], matrix_A[0]);
+
+  // input S-box 1: uses aim2_e2_power_matrix in place of a squaring chain
+  GF_mul(z[1], x[1], aim2_constants[1]);
+  GF_transposed_matmul_add(z[1], x[1], aim2_e2_power_matrix);
+  GF_transposed_matmul_add(x[AIMER_L], x[1], matrix_A[1]);
+
+  // input S-box 2: seven squarings in total, then add x[2] * constant
+  GF_sqr_s(z[2], x[2]);
+  for (size_t more = 6; more != 0; more--)
+  {
+    GF_sqr_s(z[2], z[2]);
+  }
+  GF_mul_add(z[2], x[2], aim2_constants[2]);
+  GF_transposed_matmul_add(x[AIMER_L], x[2], matrix_A[2]);
+
+  // output S-box, x ^ {2 ^ e - 1} = pt + ct: three squarings of the
+  // accumulated x[AIMER_L], then add x[AIMER_L] * ct
+  GF_sqr_s(z[AIMER_L], x[AIMER_L]);
+  GF_sqr_s(z[AIMER_L], z[AIMER_L]);
+  GF_sqr_s(z[AIMER_L], z[AIMER_L]);
+  GF_mul_add(z[AIMER_L], x[AIMER_L], ct_GF);
+}
+
+// Phase 1: commit to the seeds and the execution views (tapes) of all parties and derive h_1.
+void run_phase_1(signature_t *sign,
+                 const GF pt_GF, const GF sbox_outputs[AIMER_L],
+                 const uint8_t mu[AIMER_COMMIT_SIZE],
+                 const hash_instance *ctx_tree,
+                 const uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE])
+{
+  uint8_t commits[AIMER_COMMIT_SIZE];
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE];
+
+  hash_instance ctx;
+
+  // hash_instance for h_1: prefix 1, then mu, then the salt
+  hash_init_prefix(&ctx, HASH_PREFIX_1);
+  hash_update(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx, sign->salt, AIMER_SALT_SIZE);
+  // state shared by all party commitments: prefix 5, then the salt
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    // compute parties' seeds using binary tree, starting from the stored nodes of this repetition
+    memcpy(nodes, pre_nodes[rep][0], (2 * PRE_TREE_IDX - 1) * AIMER_SEED_SIZE);
+    expand_tree(nodes, ctx_tree, rep);
+
+    // delta accumulates the sum of all parties' tape values
+    tape_t delta;
+    memset(&delta, 0, sizeof(tape_t));
+
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      // generate execution views and commitments; each commitment is absorbed into h_1
+      tape_t tape;
+      commit_and_expand_tape(&tape, commits, &ctx_precom,
+                             nodes[AIMER_N + party - 1],  rep, party);
+      hash_update(&ctx, commits, AIMER_COMMIT_SIZE);
+
+      // compute offsets
+      GF_add(delta.pt_share, delta.pt_share, tape.pt_share);
+      GF_add(delta.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+      GF_add(delta.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+      GF_add(delta.t_shares[2], delta.t_shares[2], tape.t_shares[2]);
+      GF_add(delta.a_share, delta.a_share, tape.a_share);
+      GF_add(delta.c_share, delta.c_share, tape.c_share);
+
+      // after the last party: turn the sums into the offsets that are stored in the proof
+      if (party == AIMER_N - 1)
+      {
+        GF_add(delta.pt_share, delta.pt_share, pt_GF);
+        GF_add(delta.t_shares[0], delta.t_shares[0], sbox_outputs[0]);
+        GF_add(delta.t_shares[1], delta.t_shares[1], sbox_outputs[1]);
+        GF_add(delta.t_shares[2], delta.t_shares[2], sbox_outputs[2]);
+        GF_mul_add_s(delta.c_share, pt_GF, delta.a_share); // delta_c += pt * delta_a
+
+        GF_to_bytes(sign->proofs[rep].delta_pt_bytes, delta.pt_share);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[0], delta.t_shares[0]);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[1], delta.t_shares[1]);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[2], delta.t_shares[2]);
+        GF_to_bytes(sign->proofs[rep].delta_c_bytes, delta.c_share);
+      }
+    }
+
+    // NOTE: one call absorbs AIMER_L + 2 field elements from delta_pt_bytes on, so it depends on the order of values in proof_t
+    hash_update(&ctx, sign->proofs[rep].delta_pt_bytes,
+                AIM2_NUM_BYTES_FIELD * (AIMER_L + 2));
+  }
+  hash_ctx_release(&ctx_precom);
+
+  // commit to salt, (all commitments of parties' seeds,
+  // delta_pt, delta_t, delta_c) for all repetitions
+  hash_final(&ctx);
+  hash_squeeze(&ctx, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+}
+
+void run_phase_1_to_3(signature_t *sign,
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b, const GF ct_GF,
+                      const hash_instance *ctx_tree,
+                      const uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE])
+{
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE];
+  // Regenerates every party's tape, runs aim2_mpc on it and hashes all alpha / v shares into h_2.
+  GF pt_shares[AIMER_N];
+  GF alpha_v_shares[2][AIMER_N];
+  GF epsilons[AIMER_L + 1];
+
+  // prepare h_2: prefix 2, then h_1, then the salt
+  hash_instance ctx;
+  hash_init_prefix(&ctx, HASH_PREFIX_2);
+  hash_update(&ctx, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_update(&ctx, sign->salt, AIMER_SALT_SIZE);
+
+  // prepare epsilons: squeezed from a hash of h_1, one batch per repetition
+  hash_instance ctx_e;
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+  // state shared by all tape expansions: prefix 5, then the salt
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  GF alpha = {0,};
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    // compute parties' seeds using binary tree
+    memcpy(nodes, pre_nodes[rep][0], (2 * PRE_TREE_IDX - 1) * AIMER_SEED_SIZE);
+    expand_tree(nodes, ctx_tree, rep);
+
+    // offsets of this repetition; filled from the proof when the last party is reached
+    tape_t delta;
+    memset(&delta, 0, sizeof(tape_t));
+
+    // expand challenge
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF_set0(alpha);
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      // regenerate the execution view only; the commitment bytes are skipped
+      tape_t tape;
+      commit_and_expand_tape_phase_3(&tape, &ctx_precom,
+                                     nodes[AIMER_N + party - 1], rep, party);
+
+      mult_chk_t mult_chk;
+      memset(&mult_chk, 0, sizeof(mult_chk));
+
+      // last party: apply the offsets stored in the proof to its tape; it also starts from vector_b
+      if (party == AIMER_N - 1)
+      {
+        GF_from_bytes(delta.pt_share, sign->proofs[rep].delta_pt_bytes);
+        GF_from_bytes(delta.t_shares[0], sign->proofs[rep].delta_ts_bytes[0]);
+        GF_from_bytes(delta.t_shares[1], sign->proofs[rep].delta_ts_bytes[1]);
+        GF_from_bytes(delta.t_shares[2], sign->proofs[rep].delta_ts_bytes[2]);
+        GF_from_bytes(delta.c_share, sign->proofs[rep].delta_c_bytes);
+
+        GF_add(tape.pt_share, delta.pt_share, tape.pt_share);
+        GF_add(tape.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+        GF_add(tape.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+        GF_add(tape.t_shares[2], delta.t_shares[2], tape.t_shares[2]);
+        GF_add(tape.c_share, delta.c_share, tape.c_share);
+
+        GF_copy(mult_chk.x_shares[AIMER_L], vector_b);
+      }
+
+      // run the MPC simulation and prepare the mult check inputs
+      GF_copy(mult_chk.x_shares[0], tape.t_shares[0]);
+      GF_copy(mult_chk.x_shares[1], tape.t_shares[1]);
+      GF_copy(mult_chk.x_shares[2], tape.t_shares[2]);
+      GF_copy(pt_shares[party], tape.pt_share);
+      GF_copy(alpha_v_shares[0][party], tape.a_share);
+      GF_copy(alpha_v_shares[1][party], tape.c_share);
+      aim2_mpc(&mult_chk, matrix_A, ct_GF);
+      // alpha share += x_k * epsilon_k and v share += z_k * epsilon_k for all AIMER_L + 1 checks
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[3], epsilons[3]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[3], epsilons[3]);
+      GF_add(alpha, alpha, alpha_v_shares[0][party]);
+    }
+
+    // alpha is opened, so we can finish calculating v_share
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      GF_mul_add(alpha_v_shares[1][party], pt_shares[party], alpha);
+    }
+    hash_update(&ctx, (const uint8_t *)alpha_v_shares, sizeof(alpha_v_shares));
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+
+  hash_final(&ctx);
+  hash_squeeze(&ctx, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+}
+
+void run_phase_1_to_5(signature_t *sign,
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b,
+                      const hash_instance *ctx_tree,
+                      const uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE])
+{
+  // Fills in, per repetition, the reveal path, the hidden party's commitment and its alpha share.
+  hash_instance ctx;
+  hash_init(&ctx);
+  hash_update(&ctx, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_final(&ctx);
+  // challenge: one hidden party index per repetition, squeezed from the hash of h_2
+  uint8_t indices[AIMER_T]; // AIMER_N <= 256
+  hash_squeeze(&ctx, indices, AIMER_T);
+  hash_ctx_release(&ctx);
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    indices[rep] &= (1 << AIMER_LOGN) - 1; // keep the low AIMER_LOGN bits
+  }
+
+  uint8_t commits[AIMER_COMMIT_SIZE];
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE];
+
+  GF epsilons[AIMER_L + 1];
+
+  // prepare epsilons: squeezed from a hash of h_1, one batch per repetition
+  hash_instance ctx_e;
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+  // state shared by all tape expansions: prefix 5, then the salt
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  mult_chk_t mult_chk;
+  memset(&mult_chk, 0, sizeof(mult_chk_t));
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    size_t i_bar = indices[rep];
+
+    // compute parties' seeds using binary tree
+    memcpy(nodes, pre_nodes[rep][0], (2 * PRE_TREE_IDX - 1) * AIMER_SEED_SIZE);
+    expand_tree(nodes, ctx_tree, rep);
+    reveal_all_but(sign->proofs[rep].reveal_path,
+                   (const uint8_t (*)[AIMER_SEED_SIZE])nodes, i_bar);
+
+    // expand challenge
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF alpha_share = {0,};
+    // recompute the tape and the commitment of the hidden party i_bar
+    tape_t tape;
+    commit_and_expand_tape(&tape, commits, &ctx_precom,
+                           nodes[AIMER_N + i_bar - 1], rep, i_bar);
+    memcpy(sign->proofs[rep].missing_commitment, commits,
+           AIMER_COMMIT_SIZE);
+
+    GF_set0(mult_chk.x_shares[AIMER_L]);
+
+    // if the hidden party is the last one, its tape takes the offsets and it starts from vector_b
+    if (i_bar == AIMER_N - 1)
+    {
+      // offsets as stored in the proof
+      tape_t delta;
+      memset(&delta, 0, sizeof(tape_t));
+
+      GF_from_bytes(delta.pt_share, sign->proofs[rep].delta_pt_bytes);
+      GF_from_bytes(delta.t_shares[0], sign->proofs[rep].delta_ts_bytes[0]);
+      GF_from_bytes(delta.t_shares[1], sign->proofs[rep].delta_ts_bytes[1]);
+      GF_from_bytes(delta.t_shares[2], sign->proofs[rep].delta_ts_bytes[2]);
+      GF_from_bytes(delta.c_share, sign->proofs[rep].delta_c_bytes);
+
+      GF_add(tape.pt_share, delta.pt_share, tape.pt_share);
+      GF_add(tape.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+      GF_add(tape.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+      GF_add(tape.t_shares[2], delta.t_shares[2], tape.t_shares[2]);
+      GF_add(tape.c_share, delta.c_share, tape.c_share);
+
+      GF_copy(mult_chk.x_shares[AIMER_L], vector_b);
+    }
+
+    for (size_t ell = 0; ell < AIMER_L; ell++)
+    {
+      GF_copy(mult_chk.x_shares[ell], tape.t_shares[ell]);
+
+      // x_* = sum_i A[i] * t[i] + b
+      GF_transposed_matmul_add(mult_chk.x_shares[AIMER_L],
+                               tape.t_shares[ell], matrix_A[ell]);
+    }
+    GF_copy(alpha_share, tape.a_share);
+    // alpha share of the hidden party = a share + sum_k x_k * epsilon_k
+    GF_mul_add(alpha_share, mult_chk.x_shares[0], epsilons[0]);
+    GF_mul_add(alpha_share, mult_chk.x_shares[1], epsilons[1]);
+    GF_mul_add(alpha_share, mult_chk.x_shares[2], epsilons[2]);
+    GF_mul_add(alpha_share, mult_chk.x_shares[3], epsilons[3]);
+    GF_to_bytes(sign->proofs[rep].missing_alpha_share_bytes, alpha_share);
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk)
+{
+  // Key layout produced here: pk = iv || ct and sk = pt || pk (ct as written by aim2).
+  if (pk == NULL || sk == NULL)
+  {
+    return -1;
+  }
+
+  uint8_t *const ct = pk + AIM2_IV_SIZE;
+  randombytes(sk, AIM2_NUM_BYTES_FIELD); // pt: fresh secret input
+  randombytes(pk, AIM2_IV_SIZE);         // iv: fresh public initial vector
+  aim2(ct, sk, pk);
+  memcpy(sk + AIM2_NUM_BYTES_FIELD, pk, AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  return 0;
+}
+
+int crypto_sign_signature(uint8_t *sig, size_t *siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk)
+{
+  signature_t *sign = (signature_t *)sig; // the signature is assembled in place in sig
+  // Detached signing; sk is laid out as pt || iv || ct (see crypto_sign_keypair).
+  GF pt_GF = {0,}, ct_GF = {0,};
+  GF_from_bytes(pt_GF, sk);
+  GF_from_bytes(ct_GF, sk + AIM2_NUM_BYTES_FIELD + AIM2_IV_SIZE);
+
+  // message pre-hashing: mu is squeezed after absorbing the public part (iv || ct) and m
+  hash_instance ctx;
+  hash_init_prefix(&ctx, HASH_PREFIX_0);
+  hash_update(&ctx, sk + AIM2_NUM_BYTES_FIELD,
+              AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx, m, mlen);
+  hash_final(&ctx);
+
+  uint8_t mu[AIMER_COMMIT_SIZE];
+  hash_squeeze(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+
+  // compute first L sboxes' outputs
+  GF sbox_outputs[AIMER_L];
+  aim2_sbox_outputs(sbox_outputs, pt_GF);
+
+  // derive the binary matrix and the vector from the initial vector
+  GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+  generate_matrix_LU(matrix_A, vector_b, sk + AIM2_NUM_BYTES_FIELD);
+
+  // generate per-signature randomness
+  uint8_t random[SECURITY_BYTES];
+  randombytes(random, SECURITY_BYTES);
+
+  // derive the salt and the root seeds from one hash over (pt, mu, random)
+  hash_instance ctx_roots;
+  hash_init_prefix(&ctx_roots, HASH_PREFIX_3);
+  hash_update(&ctx_roots, sk, AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx_roots, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_roots, random, SECURITY_BYTES);
+  hash_final(&ctx_roots);
+  hash_squeeze(&ctx_roots, sign->salt, AIMER_SALT_SIZE);
+  // keep squeezing the same state: one root seed per repetition
+  uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE];
+  hash_squeeze(&ctx_roots, (uint8_t *)root_seeds, AIMER_T * AIMER_SEED_SIZE);
+  hash_ctx_release(&ctx_roots);
+
+  uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE] = {0,};
+  // tree-hash state after the prefix and the salt; pre_expand_trees clones it per node
+  hash_instance ctx_tree;
+  hash_init_prefix(&ctx_tree, HASH_PREFIX_4);
+  hash_update(&ctx_tree, sign->salt, AIMER_SALT_SIZE);
+
+  pre_expand_trees(pre_nodes, &ctx_tree, root_seeds);
+  // the three passes below all start from the same pre_nodes and ctx_tree
+  run_phase_1(sign, pt_GF, (const GF *)sbox_outputs, mu,
+              &ctx_tree,
+              (const uint8_t (*)[2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE])pre_nodes);
+  run_phase_1_to_3(sign, (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A,
+                   vector_b, ct_GF, &ctx_tree,
+                   (const uint8_t (*)[2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE])pre_nodes);
+  run_phase_1_to_5(sign, (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A,
+                   vector_b, &ctx_tree,
+                   (const uint8_t (*)[2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE])pre_nodes);
+
+  hash_ctx_release(&ctx_tree);
+
+  *siglen = CRYPTO_BYTES; // signatures have a fixed size
+
+  return 0;
+}
+
+int crypto_sign(uint8_t *sm, size_t *smlen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk)
+{
+  // Signed message layout: sm = m || sig. The signature goes in first, directly
+  // behind the slot reserved for the message; *smlen then holds the sig length.
+  const int ret = crypto_sign_signature(sm + mlen, smlen, m, mlen, sk);
+  // memmove, as in crypto_sign_open: m and sm may overlap (e.g. sm == m).
+  memmove(sm, m, mlen);
+  *smlen += mlen;
+  return ret; // propagate a signing failure instead of always reporting 0
+}
+
+int crypto_sign_verify(const uint8_t *sig,
+        size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk)
+{
+  if (siglen != CRYPTO_BYTES)
+  {
+    return -1;
+  }
+  // Returns 0 iff the recomputed h_1' and h_2' equal the h_1 and h_2 stored in sig.
+  const signature_t *sign = (const signature_t *)sig;
+
+  GF ct_GF = {0,};
+  GF_from_bytes(ct_GF, pk + AIM2_IV_SIZE);
+
+  // derive the binary matrix and the vector from the initial vector
+  GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+  generate_matrix_LU(matrix_A, vector_b, pk);
+
+  hash_instance ctx_e, ctx_h1, ctx_h2;
+
+  // indices = Expand(h_2)
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  uint8_t indices[AIMER_T]; // AIMER_N <= 256
+  hash_squeeze(&ctx_e, indices, AIMER_T);
+  hash_ctx_release(&ctx_e);
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    indices[rep] &= (1 << AIMER_LOGN) - 1; // keep the low AIMER_LOGN bits
+  }
+
+  // epsilons = Expand(h_1); squeezed repetition by repetition inside the main loop
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  // message pre-hashing
+  uint8_t mu[AIMER_COMMIT_SIZE];
+  hash_init_prefix(&ctx_h1, HASH_PREFIX_0);
+  hash_update(&ctx_h1, pk, AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx_h1, m, mlen);
+  hash_final(&ctx_h1);
+  hash_squeeze(&ctx_h1, mu, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h1);
+
+  // ready for computing h_1' and h_2'
+  hash_init_prefix(&ctx_h1, HASH_PREFIX_1);
+  hash_update(&ctx_h1, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_h1, sign->salt, AIMER_SALT_SIZE);
+
+  hash_init_prefix(&ctx_h2, HASH_PREFIX_2);
+  hash_update(&ctx_h2, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_h2, sign->salt, AIMER_SALT_SIZE);
+  // prefix state handed (as const) to every commit_and_expand_tape call
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    size_t i_bar = indices[rep]; // the party whose seed is not revealed
+    uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE]; // node k sits in slot k - 2
+
+    reconstruct_tree(nodes, sign->salt, sign->proofs[rep].reveal_path,
+                     rep, i_bar);
+
+    GF pt_shares[AIMER_N];
+    GF alpha_v_shares[2][AIMER_N]; // [0]: alpha shares, [1]: v shares
+    GF_set0(alpha_v_shares[1][i_bar]); // will hold the sum of the other v shares
+
+    GF epsilons[AIMER_L + 1];
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF alpha = {0,};
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      if (party == i_bar)
+      {
+        // unopened party: commitment and alpha share come from the proof
+        hash_update(&ctx_h1, sign->proofs[rep].missing_commitment,
+                    AIMER_COMMIT_SIZE);
+        GF_from_bytes(alpha_v_shares[0][i_bar],
+                      sign->proofs[rep].missing_alpha_share_bytes);
+        GF_add(alpha, alpha, alpha_v_shares[0][i_bar]);
+        continue;
+      }
+
+      tape_t tape;
+      uint8_t commit[AIMER_COMMIT_SIZE];
+      commit_and_expand_tape(&tape, commit, &ctx_precom,
+                             nodes[AIMER_N + party - 2], rep, party);
+      hash_update(&ctx_h1, commit, AIMER_COMMIT_SIZE);
+
+      // adjust last shares
+      mult_chk_t mult_chk;
+      memset(&mult_chk, 0, sizeof(mult_chk_t));
+      if (party == AIMER_N - 1)
+      {
+        GF temp = {0,};
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_pt_bytes);
+        GF_add(tape.pt_share, tape.pt_share, temp);
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[0]);
+        GF_add(tape.t_shares[0], tape.t_shares[0], temp);
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[1]);
+        GF_add(tape.t_shares[1], tape.t_shares[1], temp);
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[2]);
+        GF_add(tape.t_shares[2], tape.t_shares[2], temp);
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_c_bytes);
+        GF_add(tape.c_share, tape.c_share, temp);
+
+        GF_copy(mult_chk.x_shares[AIMER_L], vector_b);
+      }
+
+      // run the MPC simulation and prepare the mult check inputs
+      GF_copy(mult_chk.x_shares[0], tape.t_shares[0]);
+      GF_copy(mult_chk.x_shares[1], tape.t_shares[1]);
+      GF_copy(mult_chk.x_shares[2], tape.t_shares[2]);
+      GF_copy(pt_shares[party], tape.pt_share);
+      GF_copy(alpha_v_shares[0][party], tape.a_share);
+      GF_copy(alpha_v_shares[1][party], tape.c_share);
+      aim2_mpc(&mult_chk, (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A, ct_GF);
+      // alpha share += sum_j x_shares[j] * eps[j]; v share += sum_j z_shares[j] * eps[j]
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[3], epsilons[3]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[3], epsilons[3]);
+
+      GF_add(alpha, alpha, alpha_v_shares[0][party]);
+    }
+
+    // alpha is opened, so we can finish calculating v_share
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      if (party == i_bar)
+      {
+        continue;
+      }
+
+      GF_mul_add(alpha_v_shares[1][party], pt_shares[party], alpha);
+      GF_add(alpha_v_shares[1][i_bar], alpha_v_shares[1][i_bar],
+             alpha_v_shares[1][party]);
+    }
+
+    // v is opened
+    hash_update(&ctx_h2, (const uint8_t *)alpha_v_shares,
+                sizeof(alpha_v_shares));
+
+    // NOTE: one call covers delta_pt/delta_ts/delta_c, relying on their order in proof_t
+    hash_update(&ctx_h1, sign->proofs[rep].delta_pt_bytes,
+                AIM2_NUM_BYTES_FIELD * (AIMER_L + 2));
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+
+  uint8_t h_1_prime[AIMER_COMMIT_SIZE];
+  hash_final(&ctx_h1);
+  hash_squeeze(&ctx_h1, h_1_prime, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h1);
+
+  uint8_t h_2_prime[AIMER_COMMIT_SIZE];
+  hash_final(&ctx_h2);
+  hash_squeeze(&ctx_h2, h_2_prime, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h2);
+
+  if (memcmp(h_1_prime, sign->h_1, AIMER_COMMIT_SIZE) != 0 ||
+      memcmp(h_2_prime, sign->h_2, AIMER_COMMIT_SIZE) != 0)
+  {
+    return -1;
+  }
+
+  return 0;
+}
+
+int crypto_sign_open(uint8_t *m, size_t *mlen,
+        const uint8_t *sm, size_t smlen,
+        const uint8_t *pk)
+{
+  // sm = message || signature, where the signature always has CRYPTO_BYTES bytes
+  if (smlen < CRYPTO_BYTES)
+  {
+    return -1;
+  }
+
+  // check the signature before anything is written to the outputs
+  const size_t body_len = smlen - CRYPTO_BYTES;
+  const int rejected = crypto_sign_verify(sm + body_len, CRYPTO_BYTES,
+                                          sm, body_len, pk);
+  if (rejected != 0)
+  {
+    return -1;
+  }
+
+  // memmove stays well-defined even when m overlaps sm
+  memmove(m, sm, body_len);
+  *mlen = body_len;
+
+  return 0;
+}
diff --git a/crypto_sign/aimer256s/m4speed/sign.h b/crypto_sign/aimer256s/m4speed/sign.h
new file mode 100644
index 00000000..0c168ee0
--- /dev/null
+++ b/crypto_sign/aimer256s/m4speed/sign.h
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef SIGN_H
+#define SIGN_H
+
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+typedef struct tape_t // per-party values filled in by commit_and_expand_tape
+{
+  GF pt_share;          // pt share; delta_pt is added for party AIMER_N - 1
+  GF t_shares[AIMER_L]; // t shares; delta_ts is added for party AIMER_N - 1
+  GF a_share;           // starting value of the party's alpha share
+  GF c_share;           // starting value of the party's v share; delta_c is added for party AIMER_N - 1
+} tape_t;
+
+typedef struct mult_chk_t // working set handed to aim2_mpc
+{
+  GF x_shares[AIMER_L + 1]; // [0..AIMER_L-1]: copies of the tape t_shares; [AIMER_L]: starts at 0 (vector_b for the last party)
+  GF z_shares[AIMER_L + 1]; // combined with the epsilons into the v share; presumably written by aim2_mpc - confirm
+} mult_chk_t;
+
+typedef struct proof_t // per-repetition part of a signature
+{
+  uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE]; // sibling seeds, leaf level first (see reveal_all_but)
+  uint8_t missing_commitment[AIMER_COMMIT_SIZE]; // commitment of the unopened party
+  uint8_t delta_pt_bytes[AIM2_NUM_BYTES_FIELD]; // keep the three delta_* fields adjacent and in this
+  uint8_t delta_ts_bytes[AIMER_L][AIM2_NUM_BYTES_FIELD]; // order: crypto_sign_verify hashes them as one
+  uint8_t delta_c_bytes[AIM2_NUM_BYTES_FIELD]; // span of AIM2_NUM_BYTES_FIELD * (AIMER_L + 2) bytes
+  uint8_t missing_alpha_share_bytes[AIM2_NUM_BYTES_FIELD]; // alpha share of the unopened party
+} proof_t;
+
+typedef struct signature_t // byte-array-only view that sign/verify overlay on the raw signature buffer
+{
+  uint8_t salt[AIMER_SALT_SIZE];  // absorbed into the tree, commitment, h_1 and h_2 hashes
+  uint8_t h_1[AIMER_COMMIT_SIZE]; // expanded into the epsilons
+  uint8_t h_2[AIMER_COMMIT_SIZE]; // expanded into the unopened-party indices
+  proof_t proofs[AIMER_T];        // one proof per repetition
+} signature_t;
+
+#define aim2_mpc AIMER_NAMESPACE(aim2_mpc)
+void aim2_mpc(mult_chk_t *mult_chk, // in/out: caller presets x_shares; x_shares[AIMER_L] and z_shares are read afterwards
+              const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD], // matrix from generate_matrix_LU
+              const GF ct_GF); // ct part of the public key as a field element
+
+#define commit_and_expand_tape AIMER_NAMESPACE(commit_and_expand_tape)
+void commit_and_expand_tape(tape_t *tape, uint8_t *commit, // out: tape and AIMER_COMMIT_SIZE-byte commitment
+                            const hash_instance *ctx_precom, // in: prefix state (HASH_PREFIX_5, salt), not modified
+                            const uint8_t seed[AIMER_SEED_SIZE], // in: the party's leaf seed
+                            size_t rep, size_t party); // in: repetition and party index
+
+#define commit_and_expand_tape_phase_3 AIMER_NAMESPACE(commit_and_expand_tape_phase_3)
+void commit_and_expand_tape_phase_3(tape_t *tape, // out: tape only; unlike commit_and_expand_tape there is no commit output
+                                    const hash_instance *ctx_precom, // in: prefix state, not modified
+                                    const uint8_t seed[AIMER_SEED_SIZE], // in: the party's leaf seed
+                                    size_t rep, size_t party); // in: repetition and party index
+
+#define run_phase_1 AIMER_NAMESPACE(run_phase_1)
+void run_phase_1(signature_t *sign, // in/out: signature under construction (salt already set by the caller)
+                 const GF pt_GF, const GF sbox_outputs[AIMER_L], // in: secret input and its first AIMER_L sbox outputs
+                 const uint8_t mu[AIMER_COMMIT_SIZE], // in: pre-hashed message
+                 const hash_instance *ctx_tree, // in: tree-hash prefix state, not modified
+                 const uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE]); // in: output of pre_expand_trees
+
+#define run_phase_1_to_3 AIMER_NAMESPACE(run_phase_1_to_3)
+void run_phase_1_to_3(signature_t *sign, // in/out: signature under construction; called after run_phase_1
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD], // in: matrix from generate_matrix_LU
+                      const GF vector_b, const GF ct_GF, // in: vector from generate_matrix_LU, ct of the key pair
+                      const hash_instance *ctx_tree, // in: tree-hash prefix state, not modified
+                      const uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE]); // in: output of pre_expand_trees
+
+#define run_phase_1_to_5 AIMER_NAMESPACE(run_phase_1_to_5)
+void run_phase_1_to_5(signature_t *sign, // in/out: fills reveal_path, missing_commitment and missing_alpha_share_bytes of each proof
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD], // in: matrix from generate_matrix_LU
+                      const GF vector_b, // in: vector from generate_matrix_LU
+                      const hash_instance *ctx_tree, // in: tree-hash prefix state, passed on to expand_tree
+                      const uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE]); // in: output of pre_expand_trees
+
+#endif // SIGN_H
diff --git a/crypto_sign/aimer256s/m4speed/tree.c b/crypto_sign/aimer256s/m4speed/tree.c
new file mode 100644
index 00000000..3f4fa554
--- /dev/null
+++ b/crypto_sign/aimer256s/m4speed/tree.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: MIT
+
+#include "tree.h"
+#include "hash.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+//  Example of tree for [N = 8]
+//  x
+//  d = 0: 1
+//  d = 1: 2         3
+//  d = 2: 4   5     6     7
+//  d = 3: 8 9 10 11 12 13 14 15
+
+void pre_expand_trees(uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE],
+                      const hash_instance *ctx_tree,
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE])
+{
+  // Hash input of one expansion: repetition byte || node byte || parent seed.
+  uint8_t msg[AIMER_SEED_SIZE + 2];
+  hash_instance ctx_node;
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    uint8_t (*tree)[AIMER_SEED_SIZE] = pre_nodes[rep];
+    msg[0] = (uint8_t)(rep);
+    memcpy(tree[0], root_seeds[rep], AIMER_SEED_SIZE); // node 1 (the root) lives in slot 0
+    for (size_t parent = 1; parent < PRE_TREE_IDX; parent++)
+    {
+      msg[1] = (uint8_t)(parent);
+      memcpy(msg + 2, tree[parent - 1], AIMER_SEED_SIZE);
+
+      // one squeeze yields both children, stored in slots 2 * parent - 1 and 2 * parent
+      hash_ctx_clone(&ctx_node, ctx_tree);
+      hash_update(&ctx_node, msg, sizeof(msg));
+      hash_final(&ctx_node);
+      hash_squeeze(&ctx_node, tree[2 * parent - 1], AIMER_SEED_SIZE << 1);
+      hash_ctx_release(&ctx_node);
+    }
+  }
+}
+
+void expand_tree(uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                 const hash_instance *ctx_tree,
+                 size_t rep_index)
+{
+  // Expands the inner nodes PRE_TREE_IDX .. AIMER_N - 1; every node below
+  // PRE_TREE_IDX must already be in place. Node k is stored in slot k - 1.
+  uint8_t msg[AIMER_SEED_SIZE + 2] = {(uint8_t)(rep_index)};
+  hash_instance ctx_node;
+
+  for (size_t parent = PRE_TREE_IDX; parent < AIMER_N; parent++)
+  {
+    // msg = repetition byte || node byte || parent seed
+    msg[1] = (uint8_t)(parent);
+    memcpy(msg + 2, nodes[parent - 1], AIMER_SEED_SIZE);
+
+    hash_ctx_clone(&ctx_node, ctx_tree);
+    hash_update(&ctx_node, msg, sizeof(msg));
+    hash_final(&ctx_node);
+    hash_squeeze(&ctx_node, nodes[2 * parent - 1], AIMER_SEED_SIZE << 1);
+    hash_ctx_release(&ctx_node);
+  }
+}
+
+void reveal_all_but(uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                    const uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                    size_t cover_index)
+{
+  // Walk from the hidden leaf (node cover_index + AIMER_N) towards the root and
+  // record the sibling seed at every level; node k is stored in slot k - 1.
+  size_t node = cover_index + AIMER_N;
+  size_t level = 0;
+  while (level < AIMER_LOGN)
+  {
+    memcpy(reveal_path[level++], nodes[(node ^ 1) - 1], AIMER_SEED_SIZE);
+    node /= 2; // step up to the parent
+  }
+}
+
+void reconstruct_tree(uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE],
+                      const uint8_t salt[AIMER_SALT_SIZE],
+                      const uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                      size_t rep_index,
+                      size_t cover_index)
+{
+  // The root is not part of this array, so node k is stored in slot k - 2.
+  // reveal_path[j] is the sibling of the hidden path j levels above the leaves.
+  const size_t hidden_leaf = cover_index + AIMER_N;
+  uint8_t msg[AIMER_SEED_SIZE + 2];
+  hash_instance ctx_node, ctx_prefix;
+
+  hash_init_prefix(&ctx_prefix, HASH_PREFIX_4);
+  hash_update(&ctx_prefix, salt, AIMER_SALT_SIZE);
+  msg[0] = (uint8_t)(rep_index);
+
+  for (size_t level = 1; level < AIMER_LOGN; level++)
+  {
+    // Drop in the revealed sibling of this level, then expand the whole level.
+    // This also expands the slot on the hidden path, whose content is not a real seed.
+    size_t sibling = (hidden_leaf >> (AIMER_LOGN - level)) ^ 1;
+    memcpy(nodes[sibling - 2], reveal_path[AIMER_LOGN - level], AIMER_SEED_SIZE);
+    for (size_t parent = (1U << level); parent < (2U << level); parent++)
+    {
+      msg[1] = (uint8_t)(parent);
+      memcpy(msg + 2, nodes[parent - 2], AIMER_SEED_SIZE);
+      hash_ctx_clone(&ctx_node, &ctx_prefix);
+      hash_update(&ctx_node, msg, sizeof(msg));
+      hash_final(&ctx_node);
+      hash_squeeze(&ctx_node, nodes[2 * parent - 2], AIMER_SEED_SIZE << 1);
+      hash_ctx_release(&ctx_node);
+    }
+  }
+  hash_ctx_release(&ctx_prefix);
+  memcpy(nodes[(hidden_leaf ^ 1) - 2], reveal_path[0], AIMER_SEED_SIZE); // leaf-level sibling
+}
diff --git a/crypto_sign/aimer256s/m4speed/tree.h b/crypto_sign/aimer256s/m4speed/tree.h
new file mode 100644
index 00000000..364c85f7
--- /dev/null
+++ b/crypto_sign/aimer256s/m4speed/tree.h
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef TREE_H
+#define TREE_H
+
+#include "params.h"
+#include "hash.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define pre_expand_trees AIMER_NAMESPACE(pre_expand_trees)
+void pre_expand_trees(uint8_t pre_nodes[AIMER_T][2 * PRE_TREE_IDX - 1][AIMER_SEED_SIZE], // out: nodes 1 .. 2 * PRE_TREE_IDX - 1 of every tree (node k in slot k - 1)
+                      const hash_instance *ctx_tree, // in: prefix state, cloned for every node expansion
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE]); // in: one root seed per repetition
+
+#define expand_tree AIMER_NAMESPACE(expand_tree)
+void expand_tree(uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE], // in/out: top nodes preloaded; children of nodes PRE_TREE_IDX .. AIMER_N - 1 are written
+                 const hash_instance *ctx_tree, // in: prefix state, cloned for every node expansion
+                 size_t rep_index); // in: repetition index, hashed as a single byte
+
+#define reveal_all_but AIMER_NAMESPACE(reveal_all_but)
+void reveal_all_but(uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE], // out: sibling seeds along the path, leaf level first
+                    const uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE], // in: full tree, node k in slot k - 1
+                    size_t cover_index); // in: index of the leaf that stays hidden
+
+#define reconstruct_tree AIMER_NAMESPACE(reconstruct_tree)
+void reconstruct_tree(uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE], // out: tree without the root, node k in slot k - 2
+                      const uint8_t salt[AIMER_SALT_SIZE], // in: absorbed after HASH_PREFIX_4
+                      const uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE], // in: path produced by reveal_all_but
+                      size_t rep_index, // in: repetition index, hashed as a single byte
+                      size_t cover_index); // in: index of the hidden leaf; its slot holds no valid seed
+
+#endif // TREE_H
diff --git a/crypto_sign/aimer256s/m4stack/__asm_field.S b/crypto_sign/aimer256s/m4stack/__asm_field.S
new file mode 100644
index 00000000..6181c602
--- /dev/null
+++ b/crypto_sign/aimer256s/m4stack/__asm_field.S
@@ -0,0 +1,695 @@
+#include "params.h"
+
+.syntax unified
+.cpu cortex-m4
+.thumb
+
+.text
+
+.align  2
+.global AIMER_NAMESPACE(GF_to_bytes)
+.global AIMER_NAMESPACE(GF_from_bytes)
+.global AIMER_NAMESPACE(GF_copy)
+.type   AIMER_NAMESPACE(GF_to_bytes), %function
+.type   AIMER_NAMESPACE(GF_from_bytes), %function
+.type   AIMER_NAMESPACE(GF_copy), %function
+AIMER_NAMESPACE(GF_to_bytes):
+AIMER_NAMESPACE(GF_from_bytes):
+AIMER_NAMESPACE(GF_copy):
+  out_p       .req R0
+  in_p        .req R1
+  // All three entry points share this body: copy 8 words (32 bytes) from in_p to out_p.
+  .equ width, 4
+  // Only R2 and R3 are used as scratch, so no register has to be saved.
+  ldr.w R2, [in_p, #0 * width]
+  ldr.w R3, [in_p, #1 * width]
+  str.w R2, [out_p, #0 * width]
+  str.w R3, [out_p, #1 * width]
+
+  ldr.w R2, [in_p, #2 * width]
+  ldr.w R3, [in_p, #3 * width]
+  str.w R2, [out_p, #2 * width]
+  str.w R3, [out_p, #3 * width]
+
+  ldr.w R2, [in_p, #4 * width]
+  ldr.w R3, [in_p, #5 * width]
+  str.w R2, [out_p, #4 * width]
+  str.w R3, [out_p, #5 * width]
+
+  ldr.w R2, [in_p, #6 * width]
+  ldr.w R3, [in_p, #7 * width]
+  str.w R2, [out_p, #6 * width]
+  str.w R3, [out_p, #7 * width]
+
+  bx    lr
+
+  .unreq in_p
+  .unreq out_p
+
+.align  2
+.global AIMER_NAMESPACE(GF_set0)
+.type   AIMER_NAMESPACE(GF_set0), %function
+AIMER_NAMESPACE(GF_set0):
+  out_p       .req R0
+  // Stores eight zero words (32 bytes) at out_p; R2 is the only scratch register.
+  .equ width, 4
+
+  mov.w R2, #0
+  str.w R2, [out_p, #0 * width]
+  str.w R2, [out_p, #1 * width]
+  str.w R2, [out_p, #2 * width]
+  str.w R2, [out_p, #3 * width]
+  str.w R2, [out_p, #4 * width]
+  str.w R2, [out_p, #5 * width]
+  str.w R2, [out_p, #6 * width]
+  str.w R2, [out_p, #7 * width]
+
+  bx    lr
+
+  .unreq out_p
+
+.align  2
+.global AIMER_NAMESPACE(GF_add)
+.type   AIMER_NAMESPACE(GF_add), %function
+AIMER_NAMESPACE(GF_add):
+  out_p       .req R0
+  in0_p       .req R1
+  in1_p       .req R2
+  // out = in0 XOR in1 over 8 words (32 bytes); scratch registers: R3 and R12.
+  .equ width, 4
+  // Each word is loaded from both inputs before it is stored, so out_p may equal an input pointer.
+  ldr.w R3,  [in0_p, #0 * width]
+  ldr.w R12, [in1_p, #0 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #0 * width]
+
+  ldr.w R3,  [in0_p, #1 * width]
+  ldr.w R12, [in1_p, #1 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #1 * width]
+
+  ldr.w R3,  [in0_p, #2 * width]
+  ldr.w R12, [in1_p, #2 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #2 * width]
+
+  ldr.w R3,  [in0_p, #3 * width]
+  ldr.w R12, [in1_p, #3 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #3 * width]
+
+  ldr.w R3,  [in0_p, #4 * width]
+  ldr.w R12, [in1_p, #4 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #4 * width]
+
+  ldr.w R3,  [in0_p, #5 * width]
+  ldr.w R12, [in1_p, #5 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #5 * width]
+
+  ldr.w R3,  [in0_p, #6 * width]
+  ldr.w R12, [in1_p, #6 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #6 * width]
+
+  ldr.w R3,  [in0_p, #7 * width]
+  ldr.w R12, [in1_p, #7 * width]
+  eor.w R3, R3, R12
+  str.w R3,  [out_p, #7 * width]
+
+  bx    lr
+
+  .unreq out_p
+  .unreq in0_p
+  .unreq in1_p
+
+.macro or_shift_and in_a, con_a, off_a // in_a = (in_a | (in_a << off_a)) & con_a
+  orr.w \in_a, \in_a, \in_a, lsl #\off_a
+  and.w \in_a, \in_a, \con_a // con_a is an immediate (C0..C3 in GF_sqr_s)
+.endm
+
+.align  2
+.global AIMER_NAMESPACE(GF_sqr_s)
+.type   AIMER_NAMESPACE(GF_sqr_s), %function
+AIMER_NAMESPACE(GF_sqr_s):
+  out_p       .req R0
+  in_p        .req R1
+
+  in0         .req R2
+  in1         .req R3
+  in2         .req R4
+  in3         .req R5
+  in4         .req R6
+  in5         .req R7
+  in6         .req R8
+  in7         .req R9
+  in8         .req R10
+  in9         .req R11
+  // Squaring of an 8-word element: every 16-bit half is bit-spread into one word
+  .equ C0,    1431655765  // 0x55555555
+  .equ C1,    858993459   // 0x33333333
+  .equ C2,    252645135   // 0x0F0F0F0F
+  .equ C3,    16711935    // 0x00FF00FF
+  .equ C4,    4294967295  // 0xFFFFFFFF
+  .equ width, 4
+  // (temp[0..7], 64 bits each), then temp[4..7] is folded back with shifts 0, 2, 5, 10.
+  push.w    {R4-R12, lr}
+
+  ldr.w in0, [in_p, #4 * width]  // a[2]
+  ldr.w in2, [in_p, #5 * width]
+  ldr.w in4, [in_p, #6 * width]  // a[3]
+  ldr.w in6, [in_p, #7 * width]  
+
+  lsr.w in1, in0, #16
+  lsr.w in3, in2, #16
+  lsr.w in5, in4, #16
+  lsr.w in7, in6, #16
+
+  mov.w R12, C4
+
+  and.w in0, in0, R12, lsr #16
+  and.w in2, in2, R12, lsr #16
+  and.w in4, in4, R12, lsr #16
+  and.w in6, in6, R12, lsr #16
+
+  or_shift_and in0, C3, 8
+  or_shift_and in1, C3, 8
+  or_shift_and in2, C3, 8
+  or_shift_and in3, C3, 8
+  or_shift_and in4, C3, 8
+  or_shift_and in5, C3, 8
+  or_shift_and in6, C3, 8
+  or_shift_and in7, C3, 8
+
+  or_shift_and in0, C2, 4
+  or_shift_and in1, C2, 4
+  or_shift_and in2, C2, 4
+  or_shift_and in3, C2, 4
+  or_shift_and in4, C2, 4
+  or_shift_and in5, C2, 4
+  or_shift_and in6, C2, 4
+  or_shift_and in7, C2, 4
+
+  or_shift_and in0, C1, 2
+  or_shift_and in1, C1, 2
+  or_shift_and in2, C1, 2
+  or_shift_and in3, C1, 2
+  or_shift_and in4, C1, 2
+  or_shift_and in5, C1, 2
+  or_shift_and in6, C1, 2
+  or_shift_and in7, C1, 2
+
+  or_shift_and in0, C0, 1
+  or_shift_and in1, C0, 1
+  or_shift_and in2, C0, 1
+  or_shift_and in3, C0, 1
+  or_shift_and in4, C0, 1
+  or_shift_and in5, C0, 1
+  or_shift_and in6, C0, 1
+  or_shift_and in7, C0, 1
+  // now (in0,in1) = temp[4], (in2,in3) = temp[5], (in4,in5) = temp[6], (in6,in7) = temp[7]
+  // t = temp[4] ^ ((temp[7] >> 54) ^ (temp[7] >> 59) ^ (temp[7] >> 62));
+  eor.w in0, in0, in7, lsr #22
+  eor.w in0, in0, in7, lsr #27
+  eor.w in0, in0, in7, lsr #30
+
+  push.w {in2, in3}              // temp[5]
+
+  ldr.w in2, [in_p, #2 * width]  // a[1]
+  ldr.w in8, [in_p, #3 * width]
+
+  lsr.w in3, in2, #16
+  lsr.w in9, in8, #16
+
+  and.w in2, in2, R12, lsr #16
+  and.w in8, in8, R12, lsr #16
+
+  or_shift_and in2, C3, 8
+  or_shift_and in3, C3, 8
+  or_shift_and in8, C3, 8
+  or_shift_and in9, C3, 8
+
+  or_shift_and in2, C2, 4
+  or_shift_and in3, C2, 4
+  or_shift_and in8, C2, 4
+  or_shift_and in9, C2, 4
+
+  or_shift_and in2, C1, 2
+  or_shift_and in3, C1, 2
+  or_shift_and in8, C1, 2
+  or_shift_and in9, C1, 2
+
+  or_shift_and in2, C0, 1
+  or_shift_and in3, C0, 1
+  or_shift_and in8, C0, 1
+  or_shift_and in9, C0, 1
+  // now (in2,in3) = temp[2] and (in8,in9) = temp[3]
+  // c[3] = temp[3] ^ temp[7];
+  eor.w in8, in8, in6
+  eor.w in9, in9, in7
+
+  // c[3] ^= (temp[7] << 10) | (temp[6] >> 54);  c[3] ^= (temp[7] <<  5) | (temp[6] >> 59);
+  // c[3] ^= (temp[7] <<  2) | (temp[6] >> 62);
+  eor.w in8, in8, in5, lsr #22
+  eor.w in8, in8, in5, lsr #27
+  eor.w in8, in8, in5, lsr #30
+
+  eor.w in8, in8, in6, lsl #10
+  eor.w in8, in8, in6, lsl #5
+  eor.w in8, in8, in6, lsl #2
+
+  eor.w in9, in9, in6, lsr #22
+  eor.w in9, in9, in6, lsr #27
+  eor.w in9, in9, in6, lsr #30
+
+  eor.w in9, in9, in7, lsl #10
+  eor.w in9, in9, in7, lsl #5
+  eor.w in9, in9, in7, lsl #2
+
+  str.w in8, [out_p, #6 * width]
+  str.w in9, [out_p, #7 * width]
+
+  // c[2] = temp[2] ^ temp[6];
+  eor.w in2, in2, in4
+  eor.w in3, in3, in5
+
+  // c[2] ^= (temp[6] << 10) | (temp[5] >> 54);
+  // c[2] ^= (temp[6] <<  5) | (temp[5] >> 59);
+  // c[2] ^= (temp[6] <<  2) | (temp[5] >> 62);
+  // temp[7] is no longer needed, so in6/in7 are reused for temp[5]
+  pop.w {in6, in7}               // temp[5]
+
+  eor.w in2, in2, in7, lsr #22
+  eor.w in2, in2, in7, lsr #27
+  eor.w in2, in2, in7, lsr #30
+
+  eor.w in2, in2, in4, lsl #10
+  eor.w in2, in2, in4, lsl #5
+  eor.w in2, in2, in4, lsl #2
+
+  eor.w in3, in3, in4, lsr #22
+  eor.w in3, in3, in4, lsr #27
+  eor.w in3, in3, in4, lsr #30
+
+  eor.w in3, in3, in5, lsl #10
+  eor.w in3, in3, in5, lsl #5
+  eor.w in3, in3, in5, lsl #2
+
+  str.w in2, [out_p, #4 * width]
+  str.w in3, [out_p, #5 * width]
+
+  ldr.w in2, [in_p, #0 * width]  // a[0]
+  ldr.w in4, [in_p, #1 * width]
+
+  lsr.w in3, in2, #16
+  lsr.w in5, in4, #16
+
+  and.w in2, in2, R12, lsr #16
+  and.w in4, in4, R12, lsr #16
+
+  or_shift_and in2, C3, 8
+  or_shift_and in3, C3, 8
+  or_shift_and in4, C3, 8
+  or_shift_and in5, C3, 8
+
+  or_shift_and in2, C2, 4
+  or_shift_and in3, C2, 4
+  or_shift_and in4, C2, 4
+  or_shift_and in5, C2, 4
+
+  or_shift_and in2, C1, 2
+  or_shift_and in3, C1, 2
+  or_shift_and in4, C1, 2
+  or_shift_and in5, C1, 2
+
+  or_shift_and in2, C0, 1
+  or_shift_and in3, C0, 1
+  or_shift_and in4, C0, 1
+  or_shift_and in5, C0, 1
+  // now (in2,in3) = temp[0] and (in4,in5) = temp[1]
+  // c[1] = temp[1] ^ temp[5];
+  eor.w in4, in4, in6
+  eor.w in5, in5, in7
+
+  // c[1] ^= (temp[5] << 10) | (t >> 54);
+  // c[1] ^= (temp[5] <<  5) | (t >> 59);
+  // c[1] ^= (temp[5] <<  2) | (t >> 62);
+  eor.w in4, in4, in1, lsr #22
+  eor.w in4, in4, in1, lsr #27
+  eor.w in4, in4, in1, lsr #30
+
+  eor.w in4, in4, in6, lsl #10
+  eor.w in4, in4, in6, lsl #5
+  eor.w in4, in4, in6, lsl #2
+
+  eor.w in5, in5, in6, lsr #22
+  eor.w in5, in5, in6, lsr #27
+  eor.w in5, in5, in6, lsr #30
+
+  eor.w in5, in5, in7, lsl #10
+  eor.w in5, in5, in7, lsl #5
+  eor.w in5, in5, in7, lsl #2
+
+  str.w in4, [out_p, #2 * width]
+  str.w in5, [out_p, #3 * width]
+
+  // c[0] = temp[0] ^ t;
+  eor.w in2, in2, in0
+  eor.w in3, in3, in1
+
+  // c[0] ^= (t << 10);
+  // c[0] ^= (t << 5);
+  // c[0] ^= (t << 2);
+  eor.w in2, in2, in0, lsl #10
+  eor.w in2, in2, in0, lsl #5
+  eor.w in2, in2, in0, lsl #2
+
+  eor.w in3, in3, in0, lsr #22
+  eor.w in3, in3, in0, lsr #27
+  eor.w in3, in3, in0, lsr #30
+
+  eor.w in3, in3, in1, lsl #10
+  eor.w in3, in3, in1, lsl #5
+  eor.w in3, in3, in1, lsl #2
+
+  str.w in2, [out_p, #0 * width]
+  str.w in3, [out_p, #1 * width]
+
+  pop.w {R4-R12, pc}
+
+  // unlink register name (NOTE: the in0-in9 aliases stay defined after this point)
+  .unreq in_p
+  .unreq out_p
+
+.macro lut_access0_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  and \sp1, \mask, \b0_1, lsr #\offset // byte offset of the table entry picked by the b0_1 window
+  and \sp0, \mask, \b0_0, lsr #\offset // same for the b0_0 window
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+  // first window: start the accumulator as entry(b0_0) ^ (entry(b0_1) << 32)
+  ldr \out1_0, [\sp1, #0]
+  ldr \in0_2, [\sp1, #4]
+
+  ldr \in0_0, [\sp0, #0]
+  ldr \in0_1, [\sp0, #4]
+  eor \in0_1, \in0_1, \out1_0
+.endm
+
+.macro lut_access0_1_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  and \sp1, \mask, \b0_1, lsr #\offset
+  and \sp0, \mask, \b0_0, lsr #\offset
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+  // second window: in0_3 is created here from the bits shifted out of in0_2
+  lsr \in0_3, \in0_2, #28
+  ldr \out1_0, [\sp1, #0]
+  ldr \out1_1, [\sp1, #4]
+
+  lsl \in0_2, \in0_2, #4
+  ldr \out0_0, [\sp0, #0]
+  ldr \out0_1, [\sp0, #4]
+
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  lsl \in0_1, \in0_1, #4
+  orr \in0_1, \in0_1, \in0_0, lsr #28
+  lsl \in0_0, \in0_0, #4
+  // accumulator (now shifted left by 4) ^= entry(b0_0) ^ (entry(b0_1) << 32)
+  eor \in0_1, \in0_1, \out1_0
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro lut_access0_1 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  and \sp1, \mask, \b0_1, lsr #\offset
+  and \sp0, \mask, \b0_0, lsr #\offset
+  lsl \in0_3, \in0_3, #4
+  add \sp1, \sp1, sp
+  add \sp0, \sp0, sp
+  // generic window: shift the four-word accumulator left by 4, interleaved with the loads
+  orr \in0_3, \in0_3, \in0_2, lsr #28
+  ldr \out1_0, [\sp1, #0]
+  ldr \out1_1, [\sp1, #4]
+
+  lsl \in0_2, \in0_2, #4
+  ldr \out0_0, [\sp0, #0]
+  ldr \out0_1, [\sp0, #4]
+
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  lsl \in0_1, \in0_1, #4
+  orr \in0_1, \in0_1, \in0_0, lsr #28
+  lsl \in0_0, \in0_0, #4
+  // accumulator ^= entry(b0_0) ^ (entry(b0_1) << 32)
+  eor \in0_1, \in0_1, \out1_0
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro lut_access1 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, offset
+  lsl \in0_3, \in0_3, #4
+  and \sp1, \mask, \b0_1, lsl #\offset // last window: the index is shifted left (lsl), not right
+  orr \in0_3, \in0_3, \in0_2, lsr #28
+  and \sp0, \mask, \b0_0, lsl #\offset
+
+  lsl \in0_2, \in0_2, #4
+  add \sp1, \sp1, sp
+  orr \in0_2, \in0_2, \in0_1, lsr #28
+  add \sp0, \sp0, sp
+
+  lsl   \in0_1, \in0_1, #4
+  ldmia \sp1, {\out1_0-\out1_1}
+  orr   \in0_1, \in0_1, \in0_0, lsr #28
+  ldmia \sp0, {\out0_0-\out0_1}
+  lsl   \in0_0, \in0_0, #4
+  // accumulator ^= entry(b0_0) ^ (entry(b0_1) << 32)
+  eor \in0_1, \in0_1, \out1_0
+  eor \in0_2, \in0_2, \out1_1
+  eor \in0_0, \in0_0, \out0_0
+  eor \in0_1, \in0_1, \out0_1
+.endm
+
+.macro last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, offset
+  and \mask, \one, \r0_ret, lsr #\offset // bit number offset of r0_ret
+  sub \mask, \zero, \mask                // 0 - bit: all-ones if the bit is set, else 0 (branch-free)
+  and \mask0_1, \b0_0, \mask
+  and \mask0_2, \b0_1, \mask
+  // (mask0_0, mask0_1, mask0_2) = masked (b0_0, b0_1) shifted left by offset bits
+  lsl \mask0_0, \mask0_1, #\offset
+  lsr \mask0_1, \mask0_1, #32 - \offset
+  orr \mask0_1, \mask0_1, \mask0_2, lsl #\offset
+  lsr \mask0_2, \mask0_2, #32 - \offset
+  // added one word up, i.e. a total shift of 32 + offset bits
+  eor \in0_1, \in0_1, \mask0_0
+  eor \in0_2, \in0_2, \mask0_1
+  eor \in0_3, \in0_3, \mask0_2
+.endm
+
+.macro last_mask1 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, offset
+  sub \mask, \zero, \r0_ret, lsr #\offset // used with offset 31: the shifted value is already 0 or 1
+  and \mask0_1, \b0_0, \mask
+  and \mask0_2, \b0_1, \mask
+  // (mask0_0, mask0_1, mask0_2) = masked (b0_0, b0_1) shifted left by offset bits
+  lsl \mask0_0, \mask0_1, #\offset
+  lsr \mask0_1, \mask0_1, #32 - \offset
+  orr \mask0_1, \mask0_1, \mask0_2, lsl #\offset
+  lsr \mask0_2, \mask0_2, #32 - \offset
+  // added one word up, i.e. a total shift of 32 + offset bits
+  eor \in0_1, \in0_1, \mask0_0
+  eor \in0_2, \in0_2, \mask0_1
+  eor \in0_3, \in0_3, \mask0_2
+.endm
+
+.global AIMER_NAMESPACE(poly64_mul)
+.type   AIMER_NAMESPACE(poly64_mul), %function
+AIMER_NAMESPACE(poly64_mul):
+  t0_0    .req R0
+  t0_1    .req R1
+  t1_0    .req R2
+  t1_1    .req R3
+  t2_0    .req R4
+  t2_1    .req R5
+  t3_0    .req R6
+  t3_1    .req R7
+  t4_0    .req R8
+  t4_1    .req R9
+  t5_0    .req R10
+  t5_1    .req R11
+
+  r1_copy .req R12
+  t_base  .req R14 
+
+  sp0     .req R12
+  sp1     .req R14
+
+  b0_0    .req R0
+  b0_1    .req R1
+
+  in0_0   .req R2
+  in0_1   .req R3
+  in0_2   .req R4
+  in0_3   .req R5
+
+  out0_0  .req R6
+  out0_1  .req R7
+  out1_0  .req R8
+  out1_1  .req R9
+
+  mask    .req R10
+
+  zero    .req R6
+  one     .req R7
+
+  r0_ret  .req R8
+
+  mask0_0 .req R9
+  mask0_1 .req R11
+  mask0_2 .req R12
+  // In: R0 -> a (2 words), R1 -> b (2 words). Out: R2 -> high 2 words, R3 -> low 2 words.
+  push  {R4-R11, lr}
+  push  {R2-R3}
+
+  ldr   t1_0, [R0, #0]
+  ldr   t1_1, [R0, #4]
+  push  {t1_1}    // unmasked high word of a, reloaded later as r0_ret
+
+  sub   sp, #128  // allocating space in the stack
+                  // 8 bytes * 16 = 128 bytes
+  mov   t_base, sp
+  mov   r1_copy, R1
+
+  mov   t0_0, #0
+  mov   t0_1, #0
+  // clear the top three bits of a so that every 4-bit multiple still fits in two words
+  and   t1_1, #0x1FFFFFFF
+
+  lsl   t2_1, t1_1, #1
+  orr   t2_1, t2_1, t1_0, lsr #31
+  lsl   t2_0, t1_0, #1
+
+  eor   t3_0, t1_0, t2_0
+  eor   t3_1, t1_1, t2_1
+
+  lsl   t4_1, t2_1, #1
+  orr   t4_1, t4_1, t2_0, lsr #31
+  lsl   t4_0, t2_0, #1
+
+  eor   t5_0, t1_0, t4_0
+  eor   t5_1, t1_1, t4_1
+  // table entries 0..5 (carry-less multiples of the masked a)
+  stmia t_base!, {t0_0-t5_1}  // 4 bytes X 12 elements  = 48 bytes
+
+  eor   t0_0, t2_0, t4_0
+  eor   t0_1, t2_1, t4_1
+
+  lsl   t2_1, t4_1, #1
+  orr   t2_1, t2_1, t4_0, lsr #31
+  lsl   t2_0, t4_0, #1
+
+  eor   t5_0, t2_0, t3_0
+  eor   t5_1, t2_1, t3_1
+
+  eor   t3_0, t2_0, t1_0
+  eor   t3_1, t2_1, t1_1
+
+  eor   t4_0, t4_0, t0_0
+  eor   t4_1, t4_1, t0_1
+
+  eor   t4_0, t4_0, t2_0
+  eor   t4_1, t4_1, t2_1
+
+  eor   t1_0, t1_0, t0_0
+  eor   t1_1, t1_1, t0_1
+  // table entries 6..11
+  stmia t_base!, {t0_0-t5_1}  // 4 bytes X 12 elements  = 48 bytes
+
+  eor   t1_0, t5_0, t0_0
+  eor   t1_1, t5_1, t0_1
+
+  eor   t2_0, t2_0, t0_0
+  eor   t2_1, t2_1, t0_1
+
+  eor   t3_0, t3_0, t0_0
+  eor   t3_1, t3_1, t0_1
+
+  eor   t0_0, t4_0, t0_0
+  eor   t0_1, t4_1, t0_1
+  // table entries 12..15
+  stmia t_base, {t0_0-t3_1}  // 4 bytes X 8 elements  = 32 bytes
+
+  ldmia r1_copy, {b0_0-b0_1}
+  mov   mask, #0x00000078     // 4-bit window index, pre-scaled by the 8-byte entry size
+  // consume b four bits at a time, most significant window of each word first
+  lut_access0_0   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 25
+  lut_access0_1_0 sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 21
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 17
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 13
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 9
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 5
+  lut_access0_1   sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 1
+  lut_access1     sp0, sp1, mask, b0_0, b0_1, out0_0, out0_1, out1_0, out1_1, in0_0, in0_1, in0_2, in0_3, 3
+  // add the contribution of the three bits of a that were masked off above
+  mov   zero, #0
+  mov   one,  #1
+  ldr   r0_ret, [sp, #128]
+
+  last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 29
+  last_mask0 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 30
+  last_mask1 mask, one, r0_ret, zero, mask0_0, mask0_1, mask0_2, b0_0, b0_1, in0_1, in0_2, in0_3, 31
+  // reload the two output pointers that were pushed on entry (original R2 and R3)
+  ldr   R0, [sp, #132]
+  ldr   R1, [sp, #136]
+  add   sp, #140  // restoring stack
+
+  str   in0_0, [R1, #0]
+  str   in0_1, [R1, #4]
+
+  str   in0_2, [R0, #0]
+  str   in0_3, [R0, #4]
+
+  pop   {R4-R11, pc}
+
+  // unlink register name
+  .unreq t0_0
+  .unreq t0_1
+  .unreq t1_0
+  .unreq t1_1
+  .unreq t2_0
+  .unreq t2_1
+  .unreq t3_0
+  .unreq t3_1
+  .unreq t4_0
+  .unreq t4_1
+  .unreq t5_0
+  .unreq t5_1
+
+  .unreq r1_copy
+  .unreq t_base
+
+  .unreq sp0
+  .unreq sp1
+
+  .unreq b0_0
+  .unreq b0_1
+
+  .unreq in0_0
+  .unreq in0_1
+  .unreq in0_2
+  .unreq in0_3
+
+  .unreq out0_0
+  .unreq out0_1
+  .unreq out1_0
+  .unreq out1_1
+
+  .unreq mask
+
+  .unreq zero
+  .unreq one
+
+  .unreq r0_ret
+
+  .unreq mask0_0
+  .unreq mask0_1
+  .unreq mask0_2
diff --git a/crypto_sign/aimer256s/m4stack/aim2.c b/crypto_sign/aimer256s/m4stack/aim2.c
new file mode 100644
index 00000000..74e41922
--- /dev/null
+++ b/crypto_sign/aimer256s/m4stack/aim2.c
@@ -0,0 +1,598 @@
+// SPDX-License-Identifier: MIT
+
+#include "aim2.h"
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+// inverse Mersenne S-box with e1 = 11: computes out = in ^ d, where d is
+// (2 ^ 11 - 1) ^ (-1) mod (2 ^ 256 - 1)
+// = 0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5
+// b6d6dadb5 b6 b6d6dadb5 b6 b6d6dadb5 b6 b6d6dadb5 b6 b6d6dadb5 b6 b6d6dadb5
+void GF_exp_invmer_e_1(GF out, const GF in)
+{ // fixed square-and-multiply chain over the hex digits of d; out is written only by the last multiply
+  size_t i;
+  GF t1 = {0,};
+  GF table_5 = {0,}, table_6 = {0,};
+  GF table_a = {0,}, table_b = {0,}, table_d = {0,}; // table_5, table_b, table_d are reused below
+
+  // table_d = in ^ 2 (temporarily), t1 = in ^ 4
+  GF_sqr_s(table_d, in);
+  GF_sqr_s(t1, table_d);
+
+  // table_5 = in ^ 5
+  GF_mul_s(table_5, t1, in);
+  // table_6 = in ^ 6
+  GF_mul_s(table_6, table_5, in);
+  // table_a = in ^ 10 = (in ^ 5) ^ 2
+  GF_sqr_s(table_a, table_5);
+  // table_b = in ^ 11 = in ^ 5 * in ^ 6
+  GF_mul_s(table_b, table_5, table_6);
+  // table_d = in ^ 13 = in ^ 11 * in ^ 2
+  GF_mul_s(table_d, table_b, table_d);
+
+  // table_b = in ^ (0xb6), table_5 = in ^ (0xb5)
+  GF_sqr_s(t1, table_b);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1); // t1 = in ^ (0xb0)
+  GF_mul_s(table_5, t1, table_5);
+  GF_mul_s(table_b, t1, table_6);
+
+  // t1 = in ^ (0xb6 d)
+  GF_sqr_s(t1, table_b);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6d 6)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_6);
+
+  // t1 = in ^ (0xb6d6 d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ (0xb6d6d a)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_a);
+
+  // t1 = in ^ (0xb6d6da d)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // table_5 = in ^ (0xb6d6dad b5); from here on table_5 holds this 36-bit block
+  for (i = 0; i < 8; i++) // 8 squarings = shift the exponent by two hex digits
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(table_5, t1, table_5);
+
+  // t1 = in ^ (0xb6d6dadb5 b6)
+  GF_sqr_s(t1, table_5); // first of the 8 squarings; the loop below does the other 7
+  for (i = 1; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xb6d6dadb5b6 b6d6dadb5)
+  for (i = 0; i < 36; i++) // 36 squarings = shift the exponent by nine hex digits
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xb6d6dadb5b6b6d6dadb5 b6)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xb6d6dadb5b6b6d6dadb5b6 b6d6dadb5)
+  for (i = 0; i < 36; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5 b6)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5b6 b6d6dadb5)
+  for (i = 0; i < 36; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5 b6)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ (0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5b6 b6d6dadb5)
+  for (i = 0; i < 36; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_5);
+
+  // t1 = in ^ (0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5 b6)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_b);
+
+  // out = in ^ (0xb6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5b6b6d6dadb5b6 b6d6dadb5)
+  for (i = 0; i < 36; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(out, t1, table_5); // all 64 hex digits of d consumed
+}
+
+// inverse Mersenne S-box with e2 = 141: computes out = in ^ d, where d is
+// (2 ^ 141 - 1) ^ (-1) mod (2 ^ 256 - 1)
+// = 0x2224448889112222444888911222244488911122244448891112224444889111
+// 222444 8889112 222444 8889112 222444 889111 222444 4889111 222444 4889111
+void GF_exp_invmer_e_2(GF out, const GF in)
+{ // fixed square-and-multiply chain over the hex digits of d; out is written only by the last multiply
+  size_t i;
+  GF t1 = {0,}, t2 = {0,}, t3 = {0,}, t4 = {0,}, t5 = {0,};
+  GF table_9 = {0,};
+
+  // t2 = in ^ (0x11), table_9 = in ^ 9
+  GF_sqr_s(t1, in);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1); // t1 = in ^ 8
+  GF_mul_s(table_9, t1, in);
+  GF_sqr_s(t1, t1); // t1 = in ^ 16
+  GF_mul_s(t2, t1, in);
+
+  // t3 = in ^ (0x111)
+  GF_sqr_s(t1, t2);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t3, t1, in);
+
+  // t4 = in ^ (0x222444)
+  GF_sqr_s(t1, t3); // 1 + 10 squarings: t1 = in ^ (0x111 << 11)
+  for (i = 0; i < 10; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t3); // t1 = in ^ (0x88911)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t4, t1); // 0x88911 << 2 = 0x222444
+
+  // t1 = in ^ (0x222444 8889)
+  GF_sqr_s(t1, t4); // 1 + 8 squarings, then multiply by in ^ (0x111)
+  for (i = 1; i < 9; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t3);
+
+  for (i = 0; i < 7; i++) // 7 more squarings, then in ^ 9: (0x111 << 7) + 9 = 0x8889
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_9);
+
+  // t1 = in ^ (0x2224448889 11)
+  for (i = 0; i < 8; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  // t5 = in ^ (0x222444888911 2)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, in); // 3 squarings, multiply, 1 squaring appends the hex digit 2
+  GF_sqr_s(t5, t1);
+
+  // t1 = in ^ (0x2224448889112 2224448889112)
+  GF_sqr_s(t1, t5); // 1 + 51 squarings = shift by the 52 bits (13 hex digits) of t5's exponent
+  for (i = 1; i < 52; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t5);
+
+  // t1 = in ^ (0x22244488891122224448889112 222444)
+  for (i = 0; i < 24; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t4);
+
+  // t1 = in ^ (0x22244488891122224448889112222444 889)
+  for (i = 0; i < 5; i++) // 5 squarings * in ^ (0x11), then 7 squarings * in ^ 9: (0x11 << 7) + 9 = 0x889
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  for (i = 0; i < 7; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_9);
+
+  // t1 = in ^ (0x22244488891122224448889112222444889 111)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t3);
+
+  // t1 = in ^ (0x22244488891122224448889112222444889111 222444)
+  for (i = 0; i < 24; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t4);
+
+  // t1 = in ^ (0x22244488891122224448889112222444889111222444 4)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, in); // 2 squarings, multiply, 2 squarings appends the hex digit 4
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+
+  // t1 = in ^ (0x222444888911222244488891122224448891112224444 889)
+  for (i = 0; i < 5; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  for (i = 0; i < 7; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_9);
+
+  // t1 = in ^ (0x222444888911222244488891122224448891112224444889 111)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t3);
+
+  // t1 = in ^ (0x222444888911222244488891122224448891112224444889111 222444)
+  for (i = 0; i < 24; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t4);
+
+  // t1 = in ^ (0x222444888911222244488891122224448891112224444889111222444 4)
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, in);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+
+  // t1 = in ^ (0x2224448889112222444888911222244488911122244448891112224444 889)
+  for (i = 0; i < 5; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, t2);
+
+  for (i = 0; i < 7; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_9);
+
+  // out = in ^ (0x2224448889112222444888911222244488911122244448891112224444889 111)
+  for (i = 0; i < 12; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(out, t1, t3); // all 64 hex digits of d consumed
+}
+
+// inverse Mersenne S-box with e3 = 7: computes out = in ^ d, where d is
+// (2 ^ 7 - 1) ^ (-1) mod (2 ^ 256 - 1)
+// = 0xddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76ed
+// ddbb76e ddbb76e ddbb76e ddbb76e ddbb76e ddbb76e ddbb76e ddbb76e ddbb76e d
+void GF_exp_invmer_e_3(GF out, const GF in)
+{ // fixed square-and-multiply chain over the hex digits of d; out is written only by the last multiply
+  size_t i;
+  GF t1 = {0,};
+  GF table_6 = {0,}, table_7 = {0,}, table_b = {0,}, table_d = {0,}; // table_7 and table_d are reused below
+
+  // table_d = in ^ 2 (temporarily), t1 = in ^ 3
+  GF_sqr_s(table_d, in);
+  GF_mul_s(t1, table_d, in);
+
+  // table_6 = in ^ 6
+  GF_sqr_s(table_6, t1);
+  // table_7 = in ^ 7
+  GF_mul_s(table_7, table_6, in);
+  // table_b = in ^ 11 = in ^ 7 * in ^ 4
+  GF_sqr_s(table_b, table_d); // table_b = in ^ 4 for a moment
+  GF_mul_s(table_b, table_7, table_b);
+  // table_d = in ^ 13 = in ^ 11 * in ^ 2
+  GF_mul_s(table_d, table_b, table_d);
+
+  // t1 = in ^ 0xdd
+  GF_sqr_s(t1, table_d);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_d);
+
+  // t1 = in ^ 0xdd b
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ 0xddb b
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_b);
+
+  // t1 = in ^ 0xddbb 7
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ 0xddbb7 6
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_6);
+
+  // table_7 = in ^ 0xddbb76 e; from here on table_7 holds this 28-bit block
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(t1, t1, table_7); // 3 squarings, * in ^ 7, 1 squaring appends the hex digit e
+  GF_sqr_s(table_7, t1);
+
+  // t1 = in ^ 0xddbb76e ddbb76e
+  GF_sqr_s(t1, table_7); // 1 + 27 squarings = shift by 28 bits (seven hex digits)
+  for (i = 1; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ 0xddbb76eddbb76e ddbb76e
+  for (i = 0; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ 0xddbb76eddbb76eddbb76e ddbb76e
+  for (i = 0; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ 0xddbb76eddbb76eddbb76eddbb76e ddbb76e
+  for (i = 0; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ 0xddbb76eddbb76eddbb76eddbb76eddbb76e ddbb76e
+  for (i = 0; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ 0xddbb76eddbb76eddbb76eddbb76eddbb76eddbb76e ddbb76e
+  for (i = 0; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ 0xddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76e ddbb76e
+  for (i = 0; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // t1 = in ^ 0xddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76e ddbb76e
+  for (i = 0; i < 28; i++)
+  {
+    GF_sqr_s(t1, t1);
+  }
+  GF_mul_s(t1, t1, table_7);
+
+  // out = in ^ 0xddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76eddbb76e d
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_sqr_s(t1, t1);
+  GF_mul_s(out, t1, table_d); // nine 28-bit blocks plus the final digit d = 64 hex digits
+}
+
+// Mersenne exponentiation with e_star = 3: out = in ^ (2 ^ 3 - 1) = in ^ 7
+void GF_exp_mer_e_star(GF out, const GF in)
+{
+  GF t1 = {0,};
+
+  // t1 = in ^ (2 ^ 2 - 1) = in ^ 3
+  GF_sqr_s(t1, in);
+  GF_mul_s(t1, t1, in);
+
+  // out = in ^ (2 ^ 3 - 1) = (in ^ 3) ^ 2 * in
+  GF_sqr_s(t1, t1);
+  GF_mul_s(out, t1, in); // in is read here, after which out is written for the first time
+}
+
+void generate_matrices_L_and_U(
+        GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF vector_b,
+        const uint8_t iv[AIM2_IV_SIZE])
+{ // fills matrix_L, matrix_U and vector_b from a hash stream seeded only with iv
+  uint8_t buf[AIM2_NUM_BYTES_FIELD];
+  uint64_t ormask, lmask, umask;
+  hash_instance ctx;
+  GF temp = {0,};
+
+  // initialize hash: absorb iv, then switch to squeezing
+  hash_init(&ctx);
+  hash_update(&ctx, iv, AIM2_IV_SIZE);
+  hash_final(&ctx);
+
+  for (size_t num = 0; num < AIM2_NUM_INPUT_SBOX; num++)
+  {
+    for (size_t row = 0; row < AIM2_NUM_BITS_FIELD; row++)
+    {
+      hash_squeeze(&ctx, buf, AIM2_NUM_BYTES_FIELD); // one squeezed field element feeds both L and U for this row
+      GF_from_bytes(temp, buf);
+
+      ormask = ((uint64_t)1) << (row % 64); // the bit at column == row, forced to 1 in both L and U
+      lmask = ((uint64_t)-1) << (row % 64); // bits at and above that position within the word
+      umask = ~lmask; // bits strictly below that position within the word
+
+      size_t inter = row / 64; // index of the 64-bit word that contains column == row
+      size_t col_word;
+      for (col_word = 0; col_word < inter; col_word++)
+      {
+        // L is zero, U is full
+        matrix_L[num][row][col_word] = 0;
+        matrix_U[num][row][col_word] = temp[col_word];
+      }
+      matrix_L[num][row][inter] = (temp[inter] & lmask) | ormask;
+      matrix_U[num][row][inter] = (temp[inter] & umask) | ormask;
+      for (col_word = inter + 1; col_word < AIM2_NUM_WORDS_FIELD; col_word++)
+      {
+        // L is full, U is zero
+        matrix_L[num][row][col_word] = temp[col_word];
+        matrix_U[num][row][col_word] = 0;
+      }
+    }
+  }
+
+  hash_squeeze(&ctx, (uint8_t *)vector_b, AIM2_NUM_BYTES_FIELD); // NOTE(review): raw bytes, no GF_from_bytes as for the rows above; presumably relies on a little-endian target, confirm
+  hash_ctx_release(&ctx);
+}
+
+void generate_matrix_LU(GF matrix_A[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+                        GF vector_b,
+                        const uint8_t iv[AIM2_IV_SIZE])
+{ // derives L, U and vector_b from iv, then combines L and U into matrix_A row by row
+  GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD]; // both temporaries are automatic (function-local) arrays
+  GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+
+  generate_matrices_L_and_U(matrix_L, matrix_U, vector_b, iv);
+
+  for (size_t num = 0; num < AIM2_NUM_INPUT_SBOX; num++)
+  {
+    for (size_t i = 0; i < AIM2_NUM_BITS_FIELD; i++)
+    {
+      GF_transposed_matmul(matrix_A[num][i], matrix_U[num][i], // row i of A from row i of U and all of L
+                           (const GF *)matrix_L[num]);
+    }
+  }
+}
+
+void aim2(uint8_t ct[AIM2_NUM_BYTES_FIELD],
+          const uint8_t pt[AIM2_NUM_BYTES_FIELD],
+          const uint8_t iv[AIM2_IV_SIZE])
+{ // computes ct from pt and iv; every step below is a fixed sequence of field operations
+  GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+
+  GF state[AIM2_NUM_INPUT_SBOX];
+  GF pt_GF = {0,}, ct_GF = {0,};
+  GF_from_bytes(pt_GF, pt);
+
+  // generate random matrix: L, U and vector_b are derived from iv only
+  generate_matrices_L_and_U(matrix_L, matrix_U, vector_b, iv);
+
+  // linear component: constant addition
+  GF_add(state[0], pt_GF, aim2_constants[0]);
+  GF_add(state[1], pt_GF, aim2_constants[1]);
+  GF_add(state[2], pt_GF, aim2_constants[2]);
+
+  // non-linear component: inverse Mersenne S-box (each call works in place on its state word)
+  GF_exp_invmer_e_1(state[0], state[0]);
+  GF_exp_invmer_e_2(state[1], state[1]);
+  GF_exp_invmer_e_3(state[2], state[2]);
+
+  // linear component: affine layer (U is applied first, then L, per S-box output)
+  GF_transposed_matmul(state[0], state[0], (const GF *)matrix_U[0]);
+  GF_transposed_matmul(state[0], state[0], (const GF *)matrix_L[0]);
+
+  GF_transposed_matmul(state[1], state[1], (const GF *)matrix_U[1]);
+  GF_transposed_matmul(state[1], state[1], (const GF *)matrix_L[1]);
+
+  GF_transposed_matmul(state[2], state[2], (const GF *)matrix_U[2]);
+  GF_transposed_matmul(state[2], state[2], (const GF *)matrix_L[2]);
+
+  GF_add(state[0], state[0], state[1]); // state[0] accumulates the sum of all three branches plus vector_b
+  GF_add(state[2], state[2], vector_b);
+  GF_add(state[0], state[0], state[2]);
+
+  // non-linear component: Mersenne S-box
+  GF_exp_mer_e_star(state[0], state[0]);
+
+  // linear component: feed-forward (the input pt_GF is added back in)
+  GF_add(ct_GF, state[0], pt_GF);
+
+  GF_to_bytes(ct, ct_GF);
+}
+
+void aim2_sbox_outputs(GF sbox_outputs[AIM2_NUM_INPUT_SBOX], const GF pt)
+{ // fills sbox_outputs[0..2] with the three inverse Mersenne S-box outputs for pt (the first two steps of aim2())
+  // linear component: constant addition
+  GF_add(sbox_outputs[0], pt, aim2_constants[0]);
+  GF_add(sbox_outputs[1], pt, aim2_constants[1]);
+  GF_add(sbox_outputs[2], pt, aim2_constants[2]);
+
+  // non-linear component: inverse Mersenne S-box (each call works in place)
+  GF_exp_invmer_e_1(sbox_outputs[0], sbox_outputs[0]);
+  GF_exp_invmer_e_2(sbox_outputs[1], sbox_outputs[1]);
+  GF_exp_invmer_e_3(sbox_outputs[2], sbox_outputs[2]);
+}
diff --git a/crypto_sign/aimer256s/m4stack/aim2.h b/crypto_sign/aimer256s/m4stack/aim2.h
new file mode 100644
index 00000000..bdc50429
--- /dev/null
+++ b/crypto_sign/aimer256s/m4stack/aim2.h
@@ -0,0 +1,306 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef AIM2_H
+#define AIM2_H
+
+#include "field.h"
+#include "params.h"
+#include <stdint.h>
+
+static const GF aim2_constants[AIM2_NUM_INPUT_SBOX] =
+{ // entry i is added to the input before inverse Mersenne S-box i (see aim2() and aim2_sbox_outputs() in aim2.c)
+  {0x24a19947b3916cf7,0xba7c9045f12c7f99,0xb8e1afed6a267e96,0x2ffd72dbd01adfb7},
+  {0x0d95748f728eb658,0xa458fea3f4933d7e,0x636920d871574e69,0x0801f2e2858efc16},
+  {0xc5d1b023286085f0,0x9c30d5392af26013,0x7b54a41dc25a59b5,0x718bcd5882154aee}
+};
+
+static const GF aim2_e2_power_matrix[AIM2_NUM_BITS_FIELD] =
+{
+  {0x0000000000000001,0x0000000000000000,0x0000000000000000,0x0000000000000000},
+  {0x13269d7dcfc555c3,0x6fe13874c42fedfb,0xc69f003d9d5abb9c,0x05636fd04ebf7feb},
+  {0x7a273dd9fcec7e15,0x42cd3eb54144ea68,0x5a88aaa3ebaacdff,0x527284e39fae2053},
+  {0x56bb9ab537abf542,0x768c3d772850c862,0x0160d91d288fd0e0,0x342e111e0a022022},
+  {0xcdb998ce4b3eee2e,0x78984c4dc99c90aa,0x2bb89f84c00275b6,0x75c6a0cc065fd4ac},
+  {0x74b2cd2360cb32af,0xbde82f7cf42dd1bf,0x7ceed82d54d965c4,0xf4e9f207aa17f2e9},
+  {0x995d5aab614ac6c0,0x1563800b79242f35,0x1d940184c4509090,0xe6558fd024716b90},
+  {0x8d0b793b4375cc8a,0xfcf792217776a3ee,0x5da44008043b7450,0xc77adf87407cf838},
+  {0x00451596f23df45e,0xd8bcbc0d7ae8534f,0x02c26abe3748db45,0xb37e029dc51a4b41},
+  {0x177dbfce6cbc8c0b,0x62cdd72c8cbd2d2a,0x568802d992bd7a2c,0xd0082d2193b6e383},
+  {0x221e6872863f45c6,0xbe5a9bce6c00df76,0x98c076efe1cfcc67,0xa75bdc7ab5c142a9},
+  {0x088d4e8e27e0b74d,0x71046740fe7e6c5a,0x20123cab6052c1d6,0xa7135d055351c99b},
+  {0x46176449341c7657,0x2a7936011468475e,0xc347e166dca96014,0xd79326785eee3555},
+  {0xc6b77e5a8b6dcae9,0x6dc641a8e07c54d4,0x37055c3ed77341a8,0xd75eaedd0ec6f1d1},
+  {0x5240b9b6f3433443,0x7b7d965745400c05,0x4542be5aec50ec53,0x13e6ac8f2aac12a2},
+  {0x66c30b9da469d401,0xcd5dbf02dc359172,0xf16b3e62f8a57e1d,0x362c2bc9345b97ed},
+  {0xb2a65d5f7da755e8,0x11df10d6ddd9eb84,0x433468d75cb64470,0xb4a6ffd454c82b2f},
+  {0x1c87142145f7c112,0xde2854fa4939dc0b,0x10a503b51b7c7a19,0x174f91701431e1b3},
+  {0x60d8fb32b890cec6,0x27d95c11548f693c,0x30fce7ce95e950b3,0x210559008a309578},
+  {0x5de49c870dd8fb60,0x1f480e246bb2c961,0xdc5efcb1f4ee90ae,0x165c3f5b62136c5e},
+  {0xc17b4bbe4b5780a8,0x690f1102a6decffe,0xa26e146710d9cd7d,0xc7f278fb3f02a99d},
+  {0x4fe7916de7e17f1c,0xe9e59586ac0a7185,0x092b72935bc23437,0xa306568e985edbfa},
+  {0xc05330df507b35c8,0x944475d0eb5c89f7,0x34a3653b083969a5,0x97e431e62e205633},
+  {0x19fe581ef3e9a896,0x720ab1851376eff0,0xda5ca1af445dea40,0xe3899fd1cdc93f2f},
+  {0x7a18d867d11567d6,0x14e706af946787cb,0x2ececbd0e726236a,0x66a864e0c387e806},
+  {0x0a0a9e1dc2c9d30d,0xa1bd85358585db7a,0x78f90bb68d83e25e,0x2275165a7e496039},
+  {0x23f2e1a2057c9892,0xb7f503272b51fa8f,0x0ecf56cbb57a6021,0x77f77f889ecb3e74},
+  {0x237633913a45a827,0x3a2c98b4d38d139b,0xbc1dfd5ddab4bb19,0xf2bcbdc105b017fd},
+  {0x9a53645fca466120,0x07335188ef82289a,0x9cdd8f1434ddc4c7,0x25afc28ddf0c0ea5},
+  {0x0166bda62c3c97ac,0x4821343275a35741,0xa4a1f8ef377f5177,0x3008d4b041fc0802},
+  {0xed498663eb9138f0,0xb16289e1ea93949b,0xa2476ced73badf6e,0xb384ce50cdee1d75},
+  {0x25430e5e2ea409d8,0xf8909d2164becc11,0x77663884798e456b,0xe11b963640c6a7da},
+  {0x2a5ce7930313e789,0x01a1b717dd5e72f3,0x674b4810dda58bf3,0xb348d6cffeee2602},
+  {0xe4871c9932b98648,0x90432c7798b61577,0xf803346f3989e611,0x176c5f43490e3127},
+  {0x28b7ff52a8d039f5,0x2549d26014bcb371,0x7705b13fd068e5f0,0x22f60aec7063b440},
+  {0xa90087e5804b094e,0x17b587e9f7b1334c,0x7e9128a8fd49f502,0x10a15de60dcc1259},
+  {0x676fc8232449f7f5,0xa45eba0b86ee4f8d,0x48d0f0583763ed04,0x9430177369350009},
+  {0x8bb187487d0ca392,0x8b34c408cf71198e,0x4c5b9033c740f6cb,0x15165d415ea592e5},
+  {0xe25b8fc9315d8b10,0x6f067bcaaa5db46f,0xc0d574e6df163bcb,0x76d62e45eeb26cb3},
+  {0xc7bb4eaa81af7e21,0xc0c25e2c4da66ca9,0x20a5b7a6ef682683,0xe0c40a42bed8c878},
+  {0x340b283a1f67eb72,0x94c68ac57747d7b1,0xaab540d8883c7e78,0x53ffb196e81fbce0},
+  {0x03d1fe920cc5c8b6,0x2d058e7c02de80d0,0x349140f34518313d,0x52d8d34dce452897},
+  {0x3daf5481e615a4ec,0x1d21ddb2b19865a7,0x28572f8e3caef8c4,0x94f0069367dd5a9c},
+  {0xf97efd31544a2432,0x79cc100bcd1c95c5,0x630dd7dbdcda2efa,0xb0c94889efaeabe6},
+  {0x1855a973cd69d2ac,0xa249d1e68760fda5,0x9bd185166791f0b0,0x73aad654a16f87d5},
+  {0xb64f4c4f69887572,0x0dd0ddfafeaec759,0x9a2b2e01a2dfdf21,0x23e6842e19958e74},
+  {0x47126f2ed9d35243,0x2dd26a5dc07d8ab7,0x5f7a0864bae59fef,0x84bd4c2d7eef707e},
+  {0xc2b75aa6809fde33,0x4e05ff4138a1458a,0x4283e814ca9b30b5,0x46b1bcf0f62d4313},
+  {0x83f0c7c594f6cf9a,0xdb8a4b8e5dfe204e,0x44a803aecd550290,0x96cc8907871fc11e},
+  {0x7ca33f7d36e71a53,0x609b8f2296791418,0xd9e9118ba8ddf5e9,0x813002deae63def5},
+  {0x5e3805abc5d66c85,0xe95aac205db8a39d,0xfad61d269550a976,0xc0c3e22037926992},
+  {0xf3ba3f8e2a564d34,0xfd74426f936299c0,0x23bb54e8112b82e3,0xc5afe8e8365a6000},
+  {0xb733edd6855182ef,0x5ecb1ae3728f48e8,0x3b8b1ce5bf96e304,0xf3aba2a7bfac4c59},
+  {0x78f2ea71794eaef2,0x59f25ef7fe359b84,0xacfd3e59513654c8,0xd1e24fda7d0c3936},
+  {0x288da25da8b17fb3,0xbe107e7feb777a7a,0x166db15573baae6c,0xb5ccbf5cfe3e5135},
+  {0x4637849d0285089d,0x4f671ebc0437c2ce,0x188565bc785f8268,0x712dec2cd1ba005e},
+  {0xa25a6b6a471a00b1,0x6e1a6a380bb57611,0x3ef50b155eddd23d,0xd3788fef109d4e3b},
+  {0x4f403f37eba563c1,0x76a201773cddd009,0x58fba6bec18e06a6,0x11a19d4cbf2a6331},
+  {0xe3e6bbb73066a175,0x9748c56fec4b9fa1,0x406aae141855018c,0xa1410c0e735df446},
+  {0x5e569e71e70eb719,0xa673071887dd4687,0x07055d8d0a23d785,0x74d498384aee1190},
+  {0xa0e8a89b6fb6984d,0x908716f3ce5edf66,0x0a2b9e842b73e729,0xa1b9171e0b83204b},
+  {0xbe7532657aadaa20,0x1b66940116e06582,0x7385fd540009963d,0x847a9b51570e7ff8},
+  {0xe9395fd61662cbe6,0xb3a286d4b91d1353,0x455b0689d3ff2d83,0xd56078fc7681e787},
+  {0x8b470957a3441b8a,0x7df431ebbf7e447b,0x0e0f4fa397edd83c,0xd793865c1388620e},
+  {0x7b29927808bfa739,0x96e65ce20d51654b,0xaa8fcec0d3c045c3,0xe5f31c0e239b4fea},
+  {0x5525c2a74e77bf9e,0x88cf3be85881afff,0x7c81312941d70c3c,0x23d8a44e23a9c737},
+  {0xb869097f96d421f1,0xfc5054b0f253daf5,0x1c241e84b424d6aa,0x32b29f522eb351e7},
+  {0x6a466e2ed7c0ad0b,0x5590c446ea6f583b,0x56d2464d3ee4d099,0x068910c7eb32dd95},
+  {0x71139d1bc66bb641,0xb3a1027da065feea,0xe04294fcf6174557,0x81dae384498adb46},
+  {0xf43ed00c527a209a,0xa5754026d1f22c89,0xc78a8d365f196923,0xf5154817fc84f220},
+  {0xae764c7fe7341054,0xffc86134dc4d880f,0x1b6a1e1530d66862,0x250c95737b7b8284},
+  {0xbfee6b3c1e46c128,0xa78dc08ba0e7251d,0x3a95f11bcef9d4c7,0x34f2831709c6a420},
+  {0xe3a3c1aa9e2407d8,0x4c1a200af1077851,0x8965a32110544d77,0x6354a05036f3f5a7},
+  {0xbd108a58fc17d8a6,0x61b0351824a54794,0x499e7fd9fdd626df,0x850217a6be595511},
+  {0x53f2510fb68b5c61,0x5b122cfd2501b4ba,0x7fc88679758e8262,0x233472936a675422},
+  {0x11965eaffc401c95,0x0af31e003ba1fb12,0x2facfdd6611b7f8b,0xd67eaae060c88abf},
+  {0x6fa46680edff5f3f,0x454b6266e25e87cb,0x9addf096cb1df0af,0xa6de67c1da83476b},
+  {0xbf6f0cb8a600033a,0xf520f28cc3846c4b,0x008f972a2108bd6a,0x55bbe0da272b6cb0},
+  {0x9bf38905d29c13e7,0xc50cd62db6acc3da,0xbb9b791e0d47ac11,0xd54b025508c245d8},
+  {0x3a2547ab532ec9ff,0x79495ddf670c8bc8,0xdf4ed2dcee44e1bc,0xc2e52f1fc1f7d4d5},
+  {0x4800ee52ee97ecda,0xc9d9b772550e380c,0x98506ba8ea5ec019,0x21ffafa8b46c668f},
+  {0x3464a9138085b307,0xf67a192be113e9cc,0xfdd61b66e0e162dc,0xd612aba17d397d2c},
+  {0x16207c45e571aabf,0xf2583066040bf4f7,0x4bc24730dc4d62f5,0x608b3d1e61a60b2f},
+  {0xc2a6d2c707faaab1,0xc9cfa575f99f891a,0x61ea461507f40f96,0x67104299d7331a82},
+  {0xaf1c8fcbed1f1699,0x985767a5dbb95b90,0xd6ae3b3279c96a14,0x275ea501029834e7},
+  {0x4e19e32114de1e9c,0x165f71d116e0afb9,0xe968cbf378c1a2f7,0x912182eb2d02ef2d},
+  {0x6e4e3c81caceef19,0x85f15b2e37fe2cbe,0x8ae88fcc89bb8687,0xe50b4d7659484c7a},
+  {0x80353d06c9930d5c,0x723d1f993acaffad,0x89e273ac935dc5e2,0x51356090a9eecbf9},
+  {0xc3bd743bf118e69e,0x78fe213d42306293,0x90638ea842ff3668,0xb0addcda3683625d},
+  {0xe26008c6b83cc264,0x74bbbd5777680be8,0xa8892126f9cc485a,0x54899977a5cc34a1},
+  {0xd19b2baf7fa0c771,0x39d199b5dfd41569,0x7c3c66294bc7b31d,0x81bb86cd53109ac5},
+  {0xe4a790156b11f26a,0xb496c49018830c99,0xf19e574456b9d549,0x867aa70b9bbd4fd0},
+  {0xb8ce927c2afbcba9,0x3ae3f9d11d478318,0xebdecea6a113ffd6,0x071def720f45ca33},
+  {0xa18c4347c3dba5da,0xc231d50db69b59f6,0x784caea3c01900f9,0x21b179202d1177e0},
+  {0x48d839b0e148b37a,0x119910fe9c00220e,0xf6959f7654a471b7,0x138df428ee1ab05e},
+  {0x2378b25ea2d743c2,0x52a0660820b6ff4b,0xb20d6835419796a6,0x77d41062fb9a7654},
+  {0x1e63666141c834dd,0x534d884045bcdedf,0x07b52ebe10206e92,0x67cb1a5c5d2017bc},
+  {0xbd489efa4249447b,0x81b1f830bdd020d0,0xb8db0042e390a71b,0x90b877cf8d8200e6},
+  {0xd91a2f7fe76f986d,0x2c6fcd64257849b8,0xcec2c4be6ecbe77b,0x5031f045518f6b98},
+  {0x3cc9f99a10cba6b9,0x7df264605ea09f19,0xc6099006fa2f35a0,0xf31aa1999c65f2ed},
+  {0x7322250ccd66f2d2,0xa8cf62816a34838c,0xf7bd30878c6d359b,0x450a14aed0d49014},
+  {0xf753996b7d7c1d54,0x45e2b366fb683eae,0xcef4cef44af75b4a,0xd1e647d51db49a04},
+  {0x257099ec419b94a6,0xd4a8a9f3335fcd10,0xa286788285415010,0x023c9feb9c1e9901},
+  {0x229d6fd7eed1531c,0x04cefb6c19ff0062,0x9130be016eed6e29,0xa1a04435eb4cd39d},
+  {0xefbd279ed0b045c6,0xe8ec58f13b1a927d,0xbabddf060b172c30,0xa5fd98adc4c9d7f8},
+  {0x0f859d44ce18448a,0x07af518284a5a680,0xff7565589bc19136,0x72e50c2e9eaa580a},
+  {0x6470f3d6724b5dd7,0x8b0ebb24be876d22,0xfb604e14fd34a2cc,0x213fc1d31fbb7996},
+  {0x50e1d4f6f24a3685,0x69348d20cb64f7b6,0xa13da095f7678267,0xb63a6ac7a66c3284},
+  {0xb0edaccd9a8698dc,0x73d7ca79b1672272,0xaed4ffd76475e235,0xf36b5b0cbbb22a1b},
+  {0x24acd40ab0b10aba,0xafb39e3ea0656a92,0xcfe743611a51fa5a,0xb4f8251f0f0e0d41},
+  {0xe8036bb95086dfdf,0x3d5d0332c379fb16,0x3029edc150437ed5,0xf561ce7ace559b0f},
+  {0x01047fd87eb154ca,0xce04d75cd86f0d9d,0x33f6d9a762e84d0b,0x52f77f2619632746},
+  {0x3fdf7a3e2584aaa7,0xafdff63009b07776,0x24496f671e85ade5,0x35b2e80c0abfdee5},
+  {0x4bb3e9185acc78b3,0xe5634557a7f532a4,0xa6a979853e645782,0x97e9a6c3f5ed6068},
+  {0x41685f9547d8c651,0x6d4bade8828daeda,0xabe0dcd781a5b523,0x3528952d2a770f19},
+  {0xe4e43b26b587ea84,0xf0f3f420178def6d,0xd48cb1f978a8bb2e,0x25de266fb8567a86},
+  {0x2906276141285c5c,0x045688d8cac52240,0xa1a62b2fa2474687,0x917244641b004f87},
+  {0x73897ebb86a40eb0,0x0df1bc6722ab333e,0xb7815fffa0c79792,0x322111adf2c83d06},
+  {0x4dd181aa27fc54fa,0x47b557267a691a35,0x089b8ed1303c2515,0xe60b63596c40b943},
+  {0xe574bb3f5e1d3fe5,0x7e5e1dd1aaea6c56,0x443b9d58176d285b,0xa2c066cf80f1c62d},
+  {0x9df2b1fe93b4cb69,0x5dc5dcbd7bcd4304,0x4cac45f5c51659e4,0x9039bc7472f02b80},
+  {0x81c7d14b2ff6f3d6,0x76b7422e6f000e01,0x23e23fa520ed280b,0x50a4f9ded0d07978},
+  {0x154548397391fa38,0xb1ec123aeb772341,0x22f40fd3abeee812,0x0342edcc39a77162},
+  {0xa7ef812f5e9d9ba6,0x65de86bcc8071b0d,0x4b9bbe60fe0a1fad,0xf4f8322efc5e2f45},
+  {0x21fbeab48a7c1136,0x42736db042991d3e,0xf78c442fd2ed07a6,0x36228053a90abb56},
+  {0x6ebfcec360d88021,0x7deeafd7cae1b159,0x6f32c272246a4999,0xb2f984f6c2b488dd},
+  {0x76beda6b3d15abc7,0x1bc04ff70ef9d0a9,0x75ec5c46c4854ec0,0x77bd25a817826a51},
+  {0xf79c8d9bd7aaf4f0,0xae5add9fd1454f93,0xd9f264167923d698,0x273bf89c8b33a9ec},
+  {0x20ba5517532e42a7,0xd9991aaa0bcb040f,0x81ec69b31aea8c89,0x823ee1a07f410f90},
+  {0x3e10957041e49998,0x9746fdf5f3deb53d,0xbae6be6d5a7923dd,0xd4aa255a7e60b5f8},
+  {0x453e76f50e50f914,0xba084020e530dd32,0x90e9982f02a0b2e3,0xc1bf6d0c93565fd4},
+  {0xbb44043183434a96,0xb6839987e4d3fbf5,0x780e11ff154ba921,0x46deb765191c6fad},
+  {0xf254860f62ddca11,0x2b40c2147fcc1618,0x9b9df4f2213a87f5,0xf5d9f1982bf72085},
+  {0x9ec887ef1dac7ea8,0xf9b9f41cd1a90cab,0x5106c66727088891,0xa079314a8a7aa0cc},
+  {0xbaca971f705d6820,0x32cf35c216d31b74,0xcda1b48f6a782676,0x42dd0c61745b57af},
+  {0x774f50e70700fa3c,0xfe706a77d17875e0,0x50acd4b9e4f085bc,0xa70b2f3a3373b5cf},
+  {0xa3d467e6532333ed,0x9143409c675fea0a,0x186d4c8b7de757db,0x006e698e91bc1742},
+  {0x042690d62241c815,0xf8a04fddc8420797,0x9ff8cf1394eaada4,0x921b7749e0687334},
+  {0x3ba03d72cd709236,0x12f95d885e21e3d3,0xba18560bb5d4d50a,0xa3607627494476ab},
+  {0xade31f9ca5377f89,0x635510178eec1003,0x3dca939c351bf98d,0x339c87aee1cf78dd},
+  {0xe45a6287cc1287d4,0x7cf6c8c56ed07634,0xadf6eda911dd0200,0x87211a5d3722f0f6},
+  {0x7b07d341c0de902d,0x69838993df5c9429,0xb8921642be862244,0x555819247b006cca},
+  {0x4b8ebd3e261a1065,0xec8c767eb1653ceb,0x482e17c892519544,0xb61af0cc04b533a5},
+  {0x4fb9d38c4e2f7113,0x50030b8523699320,0x5716f5c60cedd7d8,0x0673e662c18aadef},
+  {0x641233031f77a5fb,0x1932c76990a0d465,0xb79ab4fbf32c92e8,0xc0a7370dd0467550},
+  {0xdb899bf50910763a,0xf026477f262eb097,0x76b70a1b2163a0d4,0x93a2873f23165f6e},
+  {0xba2a66c196ce2eb8,0x19383fd3ffab287b,0xaed33c3223646076,0x1274559077e98698},
+  {0x035a94843c44ec7a,0x6de99478a3c009e3,0x8a7ecba43ae87e6e,0x458c9cbfca30c71a},
+  {0xe3695ac8419682c5,0xaeee4d4d0392ec66,0xd99792a67250c187,0x91e0f202f4c924b3},
+  {0x9c784cfbf5192c27,0xc113eee0e80c2eae,0x3f7b5a6101ce5f5e,0x842e2d646ecd9d6e},
+  {0x957028a6befc0d73,0xcbb8df5afe2e23c3,0xc00f5c490d8dafb4,0x67d7ee99cddb8452},
+  {0xac8c3e869f704d2d,0xc928ad50bd4faf6e,0x114a0001a078d1a0,0x8375ad6cc681586b},
+  {0x59a53e3fb149cca9,0x69cf3f7ab419768e,0x79d945a746a788b8,0x979b7e9387ae017f},
+  {0x41f7712568f43935,0xf17647a51bb6cff9,0x593eb0f68e21db19,0x77bf0442e77fdbdd},
+  {0xd430085cbe62c90e,0x445d0af933a0c884,0x92f5c9b29a5de145,0x6778e9aad04a6c94},
+  {0x4914b4bd446c5d64,0x21b19c795fec736d,0x72cf9cdf7fa1c0db,0xf67226412058b23c},
+  {0x1e7346a99e1464a3,0xacb82da3ac217e94,0x4d1f4486473e6c18,0x23274da141c63725},
+  {0xf58a0445c9b4903b,0x4f196615648056a4,0xeaf0d8fc78e51fe3,0xc71e969830bec69e},
+  {0xcec3175fd17dee42,0x6fa60eda34cf3b0f,0x016ff6fe365a227b,0x148ed225daf52abf},
+  {0x5eb5954a6c060dcd,0x67ed2e3411fbde9d,0xdaddfd054f15c5a4,0x80e12ae0d1591ef3},
+  {0xc9c76eda44553b71,0x7c4675538cbdcd1e,0xa2128f16928c1efd,0xc13aaef8cfacc959},
+  {0x525318d3ea7544bd,0x6f3e0f4d85ce7b2a,0x397102e6892ab449,0xd028319bc9ef0676},
+  {0xc55bc06690da6f96,0xea6a73d17ce2969a,0xfa21bd37fa658e1e,0x32d421c8c9a9d437},
+  {0x4f53f0e462a9f4f0,0x18c65d2ba362d43b,0x53b8871400599e70,0xf291e9ac535cfe6e},
+  {0x2a420a66918ee17a,0x4dae04d613a5a05f,0xc12c868048f09ef7,0x900c4ca4fb306ac2},
+  {0x357f0638ee05acbc,0x389db47cc78620f7,0x3c531ff5b9fff02b,0x902c96f5fb2c18f9},
+  {0x57abb6151ae9319f,0x917bd98253c43360,0x36b4e4e17d9c5182,0xc2a4751705897c3b},
+  {0x91ee0ad214084c1b,0xfe17b657a9ea9054,0x7b304880e7a3efb6,0xd497c8cea46cf443},
+  {0xb97e1c63dcc46441,0x22898ec1ecb0f186,0x40dd2915e34e92ae,0x83e63e8886604034},
+  {0xf159f13af4545efc,0x6b0312cbfce549f7,0x1632f9e6624b3c5e,0xc387a21c7c20a6d6},
+  {0xe81b4468c49ba628,0x9962cd4b58abb1e7,0xda2145ce9fe59f2e,0x6021807944cfc8e1},
+  {0x9e98852b17310f23,0x3cbe1c8bceb45120,0x0e165b29c57ec0eb,0x305bf854fb1aea8d},
+  {0x1c3dbdac479d54f7,0x4cda9c1c1bbb1a19,0x7d330c571f17bc88,0x826548b30e26b7d7},
+  {0x446afa2ca1809535,0x8d3c9693ee673350,0x7893a83f58de1ffc,0xb19954f7647195ff},
+  {0x21b77a7b577e945a,0x0c3e91d3f1f89e09,0xdd7b8e8a59fae93c,0x6435f276c4582559},
+  {0x4d0e6426007bc199,0x5c13184bcf7dd24a,0x26f1f87322e213d0,0x97243e676a3eb387},
+  {0x14cbfff5b787dedd,0x355794e80f8cd847,0xf2c951e3c0d77a3d,0xe558cf2f7b5f2991},
+  {0xf87b23ea7452e43c,0x92521695b010b548,0xb7af363918a98cf1,0x473e6304c6f3f9cf},
+  {0xe86f5e030902695a,0x884c59759075978e,0x862a4f44f20c857d,0x2348092c2d62a7ed},
+  {0xbebdf27580f800b3,0x4c82348a99cfaf36,0x7fec6e2fb343c70a,0xd0a2b036a8d95707},
+  {0x59ef03fcb5a57f39,0xfb04bb079290dd73,0x30e0751c7c8e4263,0x4078bcf952cf1a62},
+  {0xa19ffba37095d58a,0x9d164dabb30dff6a,0x16de88d2bac7642a,0x8232b5dca704cbcd},
+  {0x329dfc2b2636492c,0xf0397ad762a31307,0x78adfe730ebe751e,0x5783b8d9d2f05dfd},
+  {0xf2d6e8a736f23aa1,0xe2102f9bd2267093,0xdf2af690beecc500,0x11398c83a817f593},
+  {0xd46565aaafab2385,0xddff3f9a0b99928d,0x5eb2072a49c5a5ab,0x53a03f6a8eb6a094},
+  {0x57fb689ec7092868,0xc2040eb173de1a44,0x810031fb7b19e630,0x53960a9b3b1ef568},
+  {0x40007920454fbf71,0xac025a589e98d1ef,0x9e256036a7fbd143,0xc13cf073bd649440},
+  {0xd06cd6829f0fea3d,0x2a51b1d71d1ac07b,0x3546a5854571bbc6,0xc30b6bf46c0b42fe},
+  {0x62488646a13da231,0xe28973393fb6f682,0x9ed13dc9f5432f8f,0x31b84f2be241c94e},
+  {0x9bb19ea5428d66ae,0xe0080b8616f3babc,0x9610055711788ae5,0x7652d184a46c90fe},
+  {0x112c63f926d9850e,0xe5905a268850e663,0xb9fd3996e6d72608,0xa7aa33543146d58a},
+  {0x77de728df392f575,0x637633946129f8e6,0x72a867e08e3bfce6,0x754f7149e15a365b},
+  {0x3511c4139b98679f,0x56a8a361da8cbe81,0x2a34d15423a9eb45,0x82ae1da57cd32e57},
+  {0xcb3ecb886171f719,0xfcbf82d884e8e020,0xc6d2502bd1e6f6cf,0x80bb7b1db5c2a777},
+  {0x81b8745892f03d2d,0xbc5f38b14116148b,0x4b6d0194055b86d8,0x241dbd17e3eb4ba9},
+  {0x5bbc585152fcd142,0x930f31c230a2050e,0xabf51e10a3e969e5,0x72a0a1c90ead638e},
+  {0xcadb18ff93f7f93e,0x1b8e009b5719bf82,0x743c0ab2c8bc284a,0x7144a02ff1130223},
+  {0x41b95e62522de019,0xcd3465a01c9b93fb,0x236600ff15e70ef3,0x3658cd0c29ea6f20},
+  {0xb9c59bf0b27dc282,0x47955c29304112de,0x3f16c72af19bcb3f,0xa0e568c9c5397d69},
+  {0x9251cf7a209add18,0x8e3a95a336fe4170,0xf28c14a751527126,0xb3d3a9a208590971},
+  {0x5b129f35a37c28ff,0xe3f8ba25b41817b9,0x200b734d2501265f,0x52344985724cceca},
+  {0xa8e27fd1e60dffab,0xa8ea4523b64f5aa4,0xa475b8437f8165a1,0xd644c1691c3c7548},
+  {0x5ddae2f669e64957,0x1fcef31f0b9af756,0x3e6da61c7980074e,0x206f828242ab6764},
+  {0x33144ea9f76bb631,0x9f36e03e21fa3065,0xfe08e97dc86bceb2,0x640b723c98cd7479},
+  {0x1636152634146114,0xc18c0793a80805cd,0x2b106edd3834043c,0x4191bf5c7fbacdf8},
+  {0x429dddfd03ef7bc4,0x4db9b9d6da197cd3,0xe74baaea7f22abc0,0x4364ff1e20f72e64},
+  {0xca8a9a678e94da68,0x6535f14dbed15563,0x98f34f0a20bd3f3c,0xd12c84164701e27f},
+  {0xc02c8d4c379b7ce5,0x7069499c81e1f16e,0x9bf97727b1a05c04,0xf27fa10bb0a78610},
+  {0x4cf536f0cf11a349,0xbd9dfa2a6eb41391,0x565f1d6e23bbbc0d,0xc76bbb697c18cf7f},
+  {0xa17601bde8ac478c,0x8db87c51403e365d,0x4088a87a96d9c622,0x31f82a7918dd0d06},
+  {0x29ee14687120f04e,0xfea2e736c3636d5c,0x7f8c89823855588a,0xf0da86215a008e8c},
+  {0xdd645ec1d816c223,0x0aa7edbc5ba5d0cf,0xfced1c8e126396e5,0x201b07bc6f65eddb},
+  {0xfb25e20cd48f4855,0xa8b3d1435e85371a,0x3ee9acb3f939329e,0xd075efbe502f25a4},
+  {0x0541c9b35049c704,0x94986dc9cd668f39,0x17f4cfb2726cd68b,0x508c14a670636ed4},
+  {0xa2b783ac55d68039,0xc130ab2d841d773e,0xd6d29b14f588465d,0xb790ad979cce43f8},
+  {0x4f8ce0df03c43b98,0xbbda15818c06d7a2,0x380dd95f0f042fdb,0x05f429bccfb597f3},
+  {0xc742e63ad5c5f5e6,0xcbcb225fbbbe33e2,0xa8edf59089d52ced,0xa0e788a338b45f4d},
+  {0x20e95da4bdb0c82f,0x3e63b532cc85e2d9,0x163e3d2b90d4ddaf,0xc71593e07530219a},
+  {0x7992357ab8d37b59,0x4aea96f315f3c064,0x1ba04f945b33146b,0xf65bed5593247ff4},
+  {0x2d4ad59bdce5563d,0x3a24253d449dc88d,0x41c7ffbd062c28f4,0x42734ae219aa9361},
+  {0x644204f2ea9b71e3,0xe551983ade3b5122,0x1bc727382db55ea1,0xe276d03e4bd6fc9a},
+  {0xfb20c1e51a924e81,0x2f795f1d4507decd,0x154de4d0aca02046,0x72ddcd99451381dd},
+  {0xc09ac8020e255c2b,0xa4eff29a2c29d3f3,0x7977c4f4c2f24381,0x349ff7a6efa4d791},
+  {0xea5d2cd9592cb4e6,0xf63dcd3ff0c8104a,0x66d7254c1252ca0d,0x822791068962c667},
+  {0x7b9c477dde2ad4a4,0xd3460c638eb797b0,0x1889eaef7acad771,0xb23db19bd8554e11},
+  {0x6f1c469240cd647f,0x31825907e279b274,0xb97cebbf2c37c29e,0x74ce50e87690b22e},
+  {0xfb92d64637ac0508,0x97999c37b92d0720,0xa23a9e76c1578849,0x66aa9c79979e14fb},
+  {0xcf78e912e65a8877,0xb7dcb878bdeec090,0xe678ac56695a99fc,0x0338870b34c11cee},
+  {0x5529c228e771c374,0xd8ab910e6e0a23f9,0xcd86f7b11bf07839,0xe3358c0867358f64},
+  {0x7c0e69e5db7dc1c3,0x355a9bbca9523a64,0x86985b53d32a3f4b,0xc715ea89b184099b},
+  {0xac499c49b8a4cdc4,0x22485e1df13ea826,0xf91367c2ad8807da,0x863b3b9193879ebd},
+  {0x8086427544d93f9b,0xf378d24905271a4e,0x8a2211f2e881884b,0x27f11aae6fbdeb19},
+  {0xd4d702e312991728,0xd57d86c18df5deb9,0x68b550520aac07f8,0x6163e0c25242d715},
+  {0x0484539b5bd55737,0x69b34b6b4664d575,0xcafeeeea78048b31,0x25a0aca017ec768d},
+  {0x955f03fc32b86250,0xb3ed04233eabbafd,0xb4da5d10fb30568d,0xa1d5c520656d8e7f},
+  {0x18e6772ac0c7b0b5,0x1e8c41bfa134bd72,0x36b1b28f157526b2,0xf5ac9222151e43e1},
+  {0xb500af50c3647566,0x181d28f85aca7575,0x9a16455dfd6341a1,0xc6d058b2c1e37c22},
+  {0x01b46ff0be3c6ef8,0x7f5abf4a7e4a72fe,0xe18780f7372db81b,0x91d1172dbd7c1d3b},
+  {0x62e68a7598567ebd,0x4654b8ed6f377911,0x051bf02a5685ca63,0xd08b010696df1fa9},
+  {0x656ce860674f0d36,0x8bcbf7bc1ef730bf,0x00a0260df392d280,0x33930145fd64eecf},
+  {0x17743293297fc288,0x5b59ca56522bb36a,0xe58ef14098fd4053,0x7444ed68eb16e657},
+  {0x31beae245608121f,0xea349f5c00e7cc25,0xf076aacf6db8c528,0x13c58f0b1e99ac1a},
+  {0x910f9e30c8455d7b,0xc1ebc494beb98220,0x201a3557ec66e851,0x610dd21bbd2f6b9f},
+  {0x317d8fa79aa99e03,0x7b670f771c4590dd,0x77052e1a54ac4638,0x17309eb8c690df96},
+  {0xddae9fdfd80030d4,0x84daf3404eae25e8,0xe93997a2e172c485,0x51f2159ecb7b5e41},
+  {0x9f02a3e12da8bc2c,0x1c746f4b943dc8e5,0xb31951aeeaac4e5e,0x0128a606643b4341},
+  {0xebd158803af98ce2,0x08e82db8ead7c10b,0xba172e80caa61667,0xf61ff900e1918b8a},
+  {0x8c3c570f9ffae2bc,0xff0827921f27e4f5,0x6256d4a0913919b5,0xc1f4fcc60f17957e},
+  {0x648ade6556f9d114,0xf2e85e1746058ffe,0xc9605989ede623cf,0xf3d09098541725a9},
+  {0xc57b49460d911255,0xc0767005f4affb44,0x486c21436602612a,0x87617ddb2a9643c0},
+  {0xc2038cd71c6d3ead,0x8fe1e58a5096a181,0x51cde6590d0f6b27,0xf59bf938475aa39a},
+  {0x9d8138454badbf16,0xaf8306904b15d8a8,0x83bd9fd79c159b39,0xb85db82acdbbf3ae},
+  {0x560807274e8b13db,0xb33b8a036f1617ca,0x72bc05868c923532,0xb7b8ee25c3388851},
+  {0xc042df127c4f6747,0x704ed715ba3ca7d4,0x678f93c55bc0c5d2,0xd2ee482f0bfe6c9a},
+  {0xbd60c5ba33d87b10,0x6c2ff096c60536d6,0x0ce4b4b8c86a8f5b,0x86a0bcebf81d6e4d},
+  {0xf9384ef3a44799c2,0x8b78ec1c676a7fcd,0x5f7c3edb312b00da,0x2390763c1712af67},
+};
+
+#define GF_exp_invmer_e_1 AIMER_NAMESPACE(GF_exp_invmer_e_1)
+void GF_exp_invmer_e_1(GF out, const GF in);
+#define GF_exp_invmer_e_2 AIMER_NAMESPACE(GF_exp_invmer_e_2)
+void GF_exp_invmer_e_2(GF out, const GF in);
+#define GF_exp_invmer_e_3 AIMER_NAMESPACE(GF_exp_invmer_e_3)
+void GF_exp_invmer_e_3(GF out, const GF in);
+#define GF_exp_mer_e_star AIMER_NAMESPACE(GF_exp_mer_e_star)
+void GF_exp_mer_e_star(GF out, const GF in);
+
+#define generate_matrices_L_and_U AIMER_NAMESPACE(generate_matrices_L_and_U)
+void generate_matrices_L_and_U(
+        GF matrix_L[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF matrix_U[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+        GF vector_b,
+        const uint8_t iv[AIM2_IV_SIZE]);
+
+#define generate_matrix_LU AIMER_NAMESPACE(generate_matrix_LU)
+void generate_matrix_LU(GF matrix_A[AIM2_NUM_INPUT_SBOX][AIM2_NUM_BITS_FIELD],
+                        GF vector_b,
+                        const uint8_t iv[AIM2_IV_SIZE]);
+
+#define aim2_sbox_outputs AIMER_NAMESPACE(aim2_sbox_outputs)
+void aim2_sbox_outputs(GF sbox_outputs[AIM2_NUM_INPUT_SBOX], const GF pt);
+
+#define aim2 AIMER_NAMESPACE(aim2)
+void aim2(uint8_t ct[AIM2_NUM_BYTES_FIELD],
+          const uint8_t pt[AIM2_NUM_BYTES_FIELD],
+          const uint8_t iv[AIM2_IV_SIZE]);
+
+#endif // AIM2_H
diff --git a/crypto_sign/aimer256s/m4stack/api.h b/crypto_sign/aimer256s/m4stack/api.h
new file mode 100644
index 00000000..b19100cf
--- /dev/null
+++ b/crypto_sign/aimer256s/m4stack/api.h
@@ -0,0 +1,44 @@
+/*
+NIST-developed software is provided by NIST as a public service. You may use, copy, and distribute copies of the software in any medium, provided that you keep intact this entire notice. You may improve, modify, and create derivative works of the software or any portion of the software, and you may copy and distribute such modifications or works. Modified works should carry a notice stating that you changed the software and should note the date and nature of any such change. Please explicitly acknowledge the National Institute of Standards and Technology as the source of the software.
+ 
+NIST-developed software is expressly provided "AS IS." NIST MAKES NO WARRANTY OF ANY KIND, EXPRESS, IMPLIED, IN FACT, OR ARISING BY OPERATION OF LAW, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND DATA ACCURACY. NIST NEITHER REPRESENTS NOR WARRANTS THAT THE OPERATION OF THE SOFTWARE WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ANY DEFECTS WILL BE CORRECTED. NIST DOES NOT WARRANT OR MAKE ANY REPRESENTATIONS REGARDING THE USE OF THE SOFTWARE OR THE RESULTS THEREOF, INCLUDING BUT NOT LIMITED TO THE CORRECTNESS, ACCURACY, RELIABILITY, OR USEFULNESS OF THE SOFTWARE.
+ 
+You are solely responsible for determining the appropriateness of using and distributing the software and you assume all risks associated with its use, including but not limited to the risks and costs of program errors, compliance with applicable laws, damage to or loss of data, programs or equipment, and the unavailability or interruption of operation. This software is not intended to be used in any situation where a failure could cause risk of injury or damage to property. The software developed by NIST employees is not subject to copyright protection within the United States.
+*/
+
+#ifndef API_H
+#define API_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define CRYPTO_PUBLICKEYBYTES 64
+#define CRYPTO_SECRETKEYBYTES 96
+#define CRYPTO_BYTES 17056
+#define CRYPTO_ALGNAME "aimer256s"
+
+#define crypto_sign_keypair AIMER_NAMESPACE(crypto_sign_keypair)
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+
+#define crypto_sign_signature AIMER_NAMESPACE(crypto_sign_signature)
+int crypto_sign_signature(uint8_t *sig, size_t *siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk);
+
+#define crypto_sign AIMER_NAMESPACE(crypto_sign)
+int crypto_sign(uint8_t *sm, size_t *smlen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk);
+
+#define crypto_sign_verify AIMER_NAMESPACE(crypto_sign_verify)
+int crypto_sign_verify(const uint8_t *sig, size_t siglen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *pk);
+
+#define crypto_sign_open AIMER_NAMESPACE(crypto_sign_open)
+int crypto_sign_open(uint8_t *m, size_t *mlen,
+        const uint8_t *sm, size_t smlen,
+        const uint8_t *pk);
+
+#endif
diff --git a/crypto_sign/aimer256s/m4stack/field.c b/crypto_sign/aimer256s/m4stack/field.c
new file mode 100644
index 00000000..5c27f63a
--- /dev/null
+++ b/crypto_sign/aimer256s/m4stack/field.c
@@ -0,0 +1,608 @@
+// SPDX-License-Identifier: MIT
+
+#include "field.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define mask_64     0x00000000ffffffff
+#define mask_final  0x000000000000ffff
+
+#define mask0_64    0x000000ff000000ff
+#define mask0       0x000000ff
+
+#define mask1_64    0x000f000f000f000f
+#define mask1       0x000f000f
+
+#define mask2_64    0x0303030303030303
+#define mask2       0x03030303
+
+#define mask3_64    0x1111111111111111
+#define mask3       0x11111111
+
+#define zero_padding(x0, mask1, mask2, mask3) \
+        x0 = (x0 | (x0 << 12)) & mask1; \
+        x0 = (x0 | (x0 << 6 )) & mask2; \
+        x0 = (x0 | (x0 << 3 )) & mask3;
+
+#define inv_zero_padding(x0, mask0, mask1, mask2, mask_final) \
+        x0 = (x0 | (x0 >> 3 )) & mask2; \
+        x0 = (x0 | (x0 >> 6 )) & mask1; \
+        x0 = (x0 | (x0 >> 12)) & mask0; \
+        x0 = (x0 | (x0 >> 24)) & mask_final;
+// c = a * b in GF(2^256): two-level Karatsuba on 64-bit limbs (nine poly64_mul calls), then reduction with x^256 = x^10 + x^5 + x^2 + 1. c is written only after a and b have been read, so it may alias either input.
+void GF_mul(GF c, const GF a, const GF b)
+{
+  uint64_t t[4] = {0,};
+  uint64_t temp[8] = {0,};
+  uint64_t sub[10] = {0,};
+  // Limb sums used as Karatsuba operands: within each half, across the halves, and over all four limbs.
+  sub[0] = a[0] ^ a[1];
+  sub[1] = b[0] ^ b[1];
+  sub[2] = a[2] ^ a[3];
+  sub[3] = b[2] ^ b[3];
+  sub[4] = a[0] ^ a[2];
+  sub[5] = a[1] ^ a[3];
+  sub[6] = b[0] ^ b[2];
+  sub[7] = b[1] ^ b[3];
+  sub[8] = sub[4] ^ sub[5];
+  sub[9] = sub[6] ^ sub[7];
+  // Limb products a[i] * b[i]: the third poly64_mul argument receives the high word, the fourth the low word.
+  poly64_mul(&a[0], &b[0], &t[0], &temp[0]);
+  poly64_mul(&a[1], &b[1], &t[2], &t[1]);
+  t[0] ^= t[1];
+
+  poly64_mul(&a[2], &b[2], &t[3], &t[1]);
+  t[1] ^= t[2];
+
+  poly64_mul(&a[3], &b[3], &temp[7], &t[2]);
+  t[2] ^= t[3];
+
+  temp[6] = temp[7] ^ t[2];
+  temp[3] = t[2] ^ t[1];
+  temp[2] = t[1] ^ t[0];
+  temp[1] = t[0] ^ temp[0];
+  // Cross term of the low half: (a0 ^ a1) * (b0 ^ b1).
+  poly64_mul(&sub[0], &sub[1], &t[1], &t[0]);
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+  // Cross term of the high half: (a2 ^ a3) * (b2 ^ b3).
+  poly64_mul(&sub[2], &sub[3], &t[1], &t[0]);
+  temp[3] ^= t[0];
+  temp[6] ^= t[1];
+  // Add the low-half and high-half products into the middle words temp[2..5] (outer Karatsuba step).
+  temp[5] = temp[7] ^ temp[3];
+  temp[4] = temp[6] ^ temp[2];
+  temp[3] ^= temp[1];
+  temp[2] ^= temp[0];
+  // Middle term (a_lo ^ a_hi) * (b_lo ^ b_hi), itself computed with three 64-bit products on sub[4..9].
+  poly64_mul(&sub[4], &sub[6], &t[1], &t[0]);
+  poly64_mul(&sub[5], &sub[7], &t[3], &t[2]);
+  t[1] ^= t[2];
+  t[2] = t[1] ^ t[3];
+  t[1] ^= t[0];
+
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+  temp[4] ^= t[2];
+  temp[5] ^= t[3];
+
+  poly64_mul(&sub[8], &sub[9], &t[1], &t[0]);
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+  // Reduction: the bits that the << 10, << 5 and << 2 folds push out of temp[7] wrap around once more into the lowest high word.
+  t[0] = temp[4] ^ ((temp[7] >> 54) ^ (temp[7] >> 59) ^ (temp[7] >> 62));
+  // c = low ^ high ^ (high << 10) ^ (high << 5) ^ (high << 2), with high = {t[0], temp[5], temp[6], temp[7]}.
+  c[3] = temp[3] ^ temp[7];
+  c[3] ^= (temp[7] << 10) | (temp[6] >> 54);
+  c[3] ^= (temp[7] <<  5) | (temp[6] >> 59);
+  c[3] ^= (temp[7] <<  2) | (temp[6] >> 62);
+
+  c[2] = temp[2] ^ temp[6];
+  c[2] ^= (temp[6] << 10) | (temp[5] >> 54);
+  c[2] ^= (temp[6] <<  5) | (temp[5] >> 59);
+  c[2] ^= (temp[6] <<  2) | (temp[5] >> 62);
+
+  c[1] = temp[1] ^ temp[5];
+  c[1] ^= (temp[5] << 10) | (t[0] >> 54);
+  c[1] ^= (temp[5] <<  5) | (t[0] >> 59);
+  c[1] ^= (temp[5] <<  2) | (t[0] >> 62);
+
+  c[0] = temp[0] ^ t[0];
+  c[0] ^= (t[0] << 10);
+  c[0] ^= (t[0] <<  5);
+  c[0] ^= (t[0] <<  2);
+}
+// c ^= a * b in GF(2^256) (multiply-accumulate): same Karatsuba product and reduction with x^256 = x^10 + x^5 + x^2 + 1 as a plain multiply, but the reduced result is XORed into the existing contents of c.
+void GF_mul_add(GF c, const GF a, const GF b)
+{
+  uint64_t t[4] = {0,};
+  uint64_t temp[8] = {0,};
+  uint64_t sub[10] = {0,};
+  // Limb sums used as Karatsuba operands: within each half, across the halves, and over all four limbs.
+  sub[0] = a[0] ^ a[1];
+  sub[1] = b[0] ^ b[1];
+  sub[2] = a[2] ^ a[3];
+  sub[3] = b[2] ^ b[3];
+  sub[4] = a[0] ^ a[2];
+  sub[5] = a[1] ^ a[3];
+  sub[6] = b[0] ^ b[2];
+  sub[7] = b[1] ^ b[3];
+  sub[8] = sub[4] ^ sub[5];
+  sub[9] = sub[6] ^ sub[7];
+  // Limb products a[i] * b[i]: the third poly64_mul argument receives the high word, the fourth the low word.
+  poly64_mul(&a[0], &b[0], &t[0], &temp[0]);
+  poly64_mul(&a[1], &b[1], &t[2], &t[1]);
+  t[0] ^= t[1];
+
+  poly64_mul(&a[2], &b[2], &t[3], &t[1]);
+  t[1] ^= t[2];
+
+  poly64_mul(&a[3], &b[3], &temp[7], &t[2]);
+  t[2] ^= t[3];
+
+  temp[6] = temp[7] ^ t[2];
+  temp[3] = t[2] ^ t[1];
+  temp[2] = t[1] ^ t[0];
+  temp[1] = t[0] ^ temp[0];
+  // Cross term of the low half: (a0 ^ a1) * (b0 ^ b1).
+  poly64_mul(&sub[0], &sub[1], &t[1], &t[0]);
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+  // Cross term of the high half: (a2 ^ a3) * (b2 ^ b3).
+  poly64_mul(&sub[2], &sub[3], &t[1], &t[0]);
+  temp[3] ^= t[0];
+  temp[6] ^= t[1];
+  // Add the low-half and high-half products into the middle words temp[2..5] (outer Karatsuba step).
+  temp[5] = temp[7] ^ temp[3];
+  temp[4] = temp[6] ^ temp[2];
+  temp[3] ^= temp[1];
+  temp[2] ^= temp[0];
+  // Middle term (a_lo ^ a_hi) * (b_lo ^ b_hi), itself computed with three 64-bit products on sub[4..9].
+  poly64_mul(&sub[4], &sub[6], &t[1], &t[0]);
+  poly64_mul(&sub[5], &sub[7], &t[3], &t[2]);
+  t[1] ^= t[2];
+  t[2] = t[1] ^ t[3];
+  t[1] ^= t[0];
+
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+  temp[4] ^= t[2];
+  temp[5] ^= t[3];
+
+  poly64_mul(&sub[8], &sub[9], &t[1], &t[0]);
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+  // Reduction: the bits that the << 10, << 5 and << 2 folds push out of temp[7] wrap around once more into the lowest high word.
+  t[0] = temp[4] ^ ((temp[7] >> 54) ^ (temp[7] >> 59) ^ (temp[7] >> 62));
+  // Accumulate low ^ high ^ (high << 10) ^ (high << 5) ^ (high << 2) into c, with high = {t[0], temp[5], temp[6], temp[7]}.
+  c[3] ^= temp[3] ^ temp[7];
+  c[3] ^= (temp[7] << 10) | (temp[6] >> 54);
+  c[3] ^= (temp[7] <<  5) | (temp[6] >> 59);
+  c[3] ^= (temp[7] <<  2) | (temp[6] >> 62);
+
+  c[2] ^= temp[2] ^ temp[6];
+  c[2] ^= (temp[6] << 10) | (temp[5] >> 54);
+  c[2] ^= (temp[6] <<  5) | (temp[5] >> 59);
+  c[2] ^= (temp[6] <<  2) | (temp[5] >> 62);
+
+  c[1] ^= temp[1] ^ temp[5];
+  c[1] ^= (temp[5] << 10) | (t[0] >> 54);
+  c[1] ^= (temp[5] <<  5) | (t[0] >> 59);
+  c[1] ^= (temp[5] <<  2) | (t[0] >> 62);
+
+  c[0] ^= temp[0] ^ t[0];
+  c[0] ^= (t[0] << 10);
+  c[0] ^= (t[0] <<  5);
+  c[0] ^= (t[0] <<  2);
+}
+// c = a * b over GF(2): XOR together the rows b[k] for every set bit k of a.
+// Bit j (counted from the LSB) of word a[i] selects row 64 * i + j of b.
+// Each row is one GF element, i.e. four 64-bit words.
+void GF_transposed_matmul(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD])
+{
+  // Accumulators for the four 64-bit words of the result. c itself is only
+  // written after the loops, once every word of a has been read.
+  uint64_t acc0 = 0;
+  uint64_t acc1 = 0;
+  uint64_t acc2 = 0;
+  uint64_t acc3 = 0;
+
+  // row walks b in order; it advances by four rows per inner iteration and
+  // is not reset between words of a.
+  const GF *row = b;
+
+  for (size_t w = 0; w < AIM2_NUM_WORDS_FIELD; w++)
+  {
+    // Bits of the current word, consumed four at a time from the LSB.
+    uint64_t bits = a[w];
+
+    for (size_t k = 0; k < AIM2_NUM_BITS_WORD; k += 4)
+    {
+      for (size_t r = 0; r < 4; r++)
+      {
+        // sel is all ones when bit r is set and zero otherwise, so the row
+        // is added (or not) without a data-dependent branch.
+        const uint64_t sel = -((bits >> r) & 1);
+
+        acc0 ^= row[r][0] & sel;
+        acc1 ^= row[r][1] & sel;
+        acc2 ^= row[r][2] & sel;
+        acc3 ^= row[r][3] & sel;
+      }
+
+      bits >>= 4;
+      row += 4;
+    }
+  }
+
+  // Plain store: any previous contents of c are overwritten.
+  c[0] = acc0;
+  c[1] = acc1;
+  c[2] = acc2;
+  c[3] = acc3;
+}
+// c ^= a * b over GF(2): XOR into c the rows b[k] for every set bit k of a.
+// Bit j (counted from the LSB) of word a[i] selects row 64 * i + j of b.
+// Each row is one GF element, i.e. four 64-bit words.
+void GF_transposed_matmul_add(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD])
+{
+  // Accumulators for the four 64-bit words of the product. c itself is only
+  // touched after the loops, once every word of a has been read.
+  uint64_t acc0 = 0;
+  uint64_t acc1 = 0;
+  uint64_t acc2 = 0;
+  uint64_t acc3 = 0;
+
+  // row walks b in order; it advances by four rows per inner iteration and
+  // is not reset between words of a.
+  const GF *row = b;
+
+  for (size_t w = 0; w < AIM2_NUM_WORDS_FIELD; w++)
+  {
+    // Bits of the current word, consumed four at a time from the LSB.
+    uint64_t bits = a[w];
+
+    for (size_t k = 0; k < AIM2_NUM_BITS_WORD; k += 4)
+    {
+      for (size_t r = 0; r < 4; r++)
+      {
+        // sel is all ones when bit r is set and zero otherwise, so the row
+        // is added (or not) without a data-dependent branch.
+        const uint64_t sel = -((bits >> r) & 1);
+
+        acc0 ^= row[r][0] & sel;
+        acc1 ^= row[r][1] & sel;
+        acc2 ^= row[r][2] & sel;
+        acc3 ^= row[r][3] & sel;
+      }
+
+      bits >>= 4;
+      row += 4;
+    }
+  }
+
+  // Accumulate: the product is XORed into the existing contents of c.
+  c[0] ^= acc0;
+  c[1] ^= acc1;
+  c[2] ^= acc2;
+  c[3] ^= acc3;
+}
+// 64x64 -> 128-bit carry-less (GF(2)[x]) multiplication of x and y using only shifts, masks and integer multiplies: *z1 receives the high 64 bits, *z0 the low 64 bits.
+static void poly64_mul_s(uint64_t *z1, uint64_t *z0, uint64_t x, uint64_t y)
+{
+  // Split x into eight bytes: x4..x7 come from the high 32 bits, x0..x3 from the low 32 bits.
+  uint32_t x4 = x >> 32;
+
+  uint32_t x0 = x & mask_64;
+  uint32_t x1 = (x0 >> 8) & mask0;
+  uint32_t x2 = (x0 >> 16) & mask0;
+  uint32_t x3 = (x0 >> 24) & mask0;
+  x0 &= mask0;
+
+  // Bytes of the high half of x.
+  uint32_t x5 = (x4 >> 8) & mask0;
+  uint32_t x6 = (x4 >> 16) & mask0;
+  uint32_t x7 = (x4 >> 24) & mask0;
+  x4 &= mask0;
+
+  // Same byte split for y: y4..y7 from the high 32 bits, y0..y3 from the low 32 bits.
+  uint32_t y4 = y >> 32;
+
+  uint32_t y0 = y & mask_64;
+  uint32_t y1 = (y0 >> 8) & mask0;
+  uint32_t y2 = (y0 >> 16) & mask0;
+  uint32_t y3 = (y0 >> 24) & mask0;
+  y0 &= mask0;
+
+  // Bytes of the high half of y.
+  uint32_t y5 = (y4 >> 8) & mask0;
+  uint32_t y6 = (y4 >> 16) & mask0;
+  uint32_t y7 = (y4 >> 24) & mask0;
+  y4 &= mask0;
+
+  // Spread every byte of x over 32 bits: one data bit per nibble (bit i -> bit 4*i).
+  zero_padding(x0, mask1, mask2, mask3);
+  zero_padding(x1, mask1, mask2, mask3);
+  zero_padding(x2, mask1, mask2, mask3);
+  zero_padding(x3, mask1, mask2, mask3);
+  zero_padding(x4, mask1, mask2, mask3);
+  zero_padding(x5, mask1, mask2, mask3);
+  zero_padding(x6, mask1, mask2, mask3);
+  zero_padding(x7, mask1, mask2, mask3);
+
+  // Same spreading for the bytes of y.
+  zero_padding(y0, mask1, mask2, mask3);
+  zero_padding(y1, mask1, mask2, mask3);
+  zero_padding(y2, mask1, mask2, mask3);
+  zero_padding(y3, mask1, mask2, mask3);
+  zero_padding(y4, mask1, mask2, mask3);
+  zero_padding(y5, mask1, mask2, mask3);
+  zero_padding(y6, mask1, mask2, mask3);
+  zero_padding(y7, mask1, mask2, mask3);
+  // An integer product of two spread bytes counts, in each nibble, the bit pairs contributing to that position (at most 8, so nothing carries into the next nibble); & mask3_64 keeps the parity, i.e. their XOR.
+  // Low halves (x0..x3) * (y0..y3): a0..a3 each hold 16 result bits in spread form; byte-offset partial sums are split between neighbours with << 32 / >> 32.
+  uint64_t a0 = ((uint64_t)x0 * (uint64_t)y0) & mask3_64;
+  uint64_t a1 = ((((uint64_t)x0 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y0) & mask3_64));
+  a0 ^= (a1 << 32);
+  a1 = a1 >> 32;
+  a1 ^= ((((uint64_t)x0 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x1 * (uint64_t)y1) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y0) & mask3_64));
+
+  uint64_t a2 = ((((uint64_t)x0 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y2) & mask3_64) ^
+                 (((uint64_t)x2 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y0) & mask3_64));
+  a1 ^= (a2 << 32);
+  a2 = a2 >> 32;
+  a2 ^= ((((uint64_t)x1 * (uint64_t)y3) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x3 * (uint64_t)y1) & mask3_64));
+
+  uint64_t a3 = ((((uint64_t)x2 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y2) & mask3_64));
+  a2 ^= (a3 << 32);
+  a3 = a3 >> 32;
+
+  a3 ^= ((uint64_t)x3 * (uint64_t)y3) & mask3_64;
+
+  // High halves (x4..x7) * (y4..y7): same scheme, into b0..b3.
+  uint64_t b0 = ((uint64_t)x4 * (uint64_t)y4) & mask3_64;
+  uint64_t b1 = ((((uint64_t)x4 * (uint64_t)y5) & mask3_64) ^
+                 (((uint64_t)x5 * (uint64_t)y4) & mask3_64));
+  b0 ^= (b1 << 32);
+  b1 = b1 >> 32;
+  b1 ^= ((((uint64_t)x4 * (uint64_t)y6) & mask3_64) ^
+         (((uint64_t)x5 * (uint64_t)y5) & mask3_64) ^
+         (((uint64_t)x6 * (uint64_t)y4) & mask3_64));
+
+  uint64_t b2 = ((((uint64_t)x4 * (uint64_t)y7) & mask3_64) ^
+                 (((uint64_t)x5 * (uint64_t)y6) & mask3_64) ^
+                 (((uint64_t)x6 * (uint64_t)y5) & mask3_64) ^
+                 (((uint64_t)x7 * (uint64_t)y4) & mask3_64));
+  b1 ^= (b2 << 32);
+  b2 = b2 >> 32;
+  b2 ^= ((((uint64_t)x5 * (uint64_t)y7) & mask3_64) ^
+         (((uint64_t)x6 * (uint64_t)y6) & mask3_64) ^
+         (((uint64_t)x7 * (uint64_t)y5) & mask3_64));
+
+  uint64_t b3 = ((((uint64_t)x6 * (uint64_t)y7) & mask3_64) ^
+                 (((uint64_t)x7 * (uint64_t)y6) & mask3_64));
+  b2 ^= (b3 << 32);
+  b3 = b3 >> 32;
+
+  b3 ^= ((uint64_t)x7 * (uint64_t)y7) & mask3_64;
+
+  // Karatsuba middle part: (x_lo ^ x_hi) * (y_lo ^ y_hi), same scheme, into c0..c3.
+  x0 ^= x4;
+  x1 ^= x5;
+  x2 ^= x6;
+  x3 ^= x7;
+
+  y0 ^= y4;
+  y1 ^= y5;
+  y2 ^= y6;
+  y3 ^= y7;
+
+  uint64_t c0 = ((uint64_t)x0 * (uint64_t)y0) & mask3_64;
+  uint64_t c1 = ((((uint64_t)x0 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y0) & mask3_64));
+  c0 ^= (c1 << 32);
+  c1 = c1 >> 32;
+  c1 ^= ((((uint64_t)x0 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x1 * (uint64_t)y1) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y0) & mask3_64));
+
+  uint64_t c2 = ((((uint64_t)x0 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x1 * (uint64_t)y2) & mask3_64) ^
+                 (((uint64_t)x2 * (uint64_t)y1) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y0) & mask3_64));
+  c1 ^= (c2 << 32);
+  c2 = c2 >> 32;
+  c2 ^= ((((uint64_t)x1 * (uint64_t)y3) & mask3_64) ^
+         (((uint64_t)x2 * (uint64_t)y2) & mask3_64) ^
+         (((uint64_t)x3 * (uint64_t)y1) & mask3_64));
+
+  uint64_t c3 = ((((uint64_t)x2 * (uint64_t)y3) & mask3_64) ^
+                 (((uint64_t)x3 * (uint64_t)y2) & mask3_64));
+  c2 ^= (c3 << 32);
+  c3 = c3 >> 32;
+
+  c3 ^= ((uint64_t)x3 * (uint64_t)y3) & mask3_64;
+  // Removing the low and high products leaves the cross term x_lo * y_hi ^ x_hi * y_lo.
+  c0 = c0 ^ a0 ^ b0;
+  c1 = c1 ^ a1 ^ b1;
+  c2 = c2 ^ a2 ^ b2;
+  c3 = c3 ^ a3 ^ b3;
+  // Add the cross term at a 32-bit offset, i.e. two 16-bit chunks above the low product.
+  a2 ^= c0;
+  a3 ^= c1;
+  b0 ^= c2;
+  b1 ^= c3;
+
+  // Compress every spread chunk back to 16 contiguous bits.
+  inv_zero_padding(a0, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a1, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a2, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(a3, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b0, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b1, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b2, mask0_64, mask1_64, mask2_64, mask_final);
+  inv_zero_padding(b3, mask0_64, mask1_64, mask2_64, mask_final);
+  // Assemble the eight 16-bit chunks into the low (z0) and high (z1) result words.
+  *z0 = a0 | (a1 << 16) | (a2 << 32) | (a3 << 48);
+  *z1 = b0 | (b1 << 16) | (b2 << 32) | (b3 << 48);
+}
+// c = a * b in GF(2^256), built on poly64_mul_s: two-level Karatsuba on 64-bit limbs (nine limb products), then reduction with x^256 = x^10 + x^5 + x^2 + 1. c is written only after a and b have been read.
+void GF_mul_s(GF c, const GF a, const GF b)
+{
+  uint64_t t[4] = {0,};
+  uint64_t add[4] = {0,};
+  uint64_t temp[8] = {0,};
+  // Limb products a[i] * b[i]: poly64_mul_s stores the high word through its first pointer, the low word through its second.
+  poly64_mul_s(&t[0], &temp[0], a[0], b[0]);
+  poly64_mul_s(&t[2], &t[1], a[1], b[1]);
+  t[0] ^= t[1];
+
+  poly64_mul_s(&t[3], &t[1], a[2], b[2]);
+  t[1] ^= t[2];
+
+  poly64_mul_s(&temp[7], &t[2], a[3], b[3]);
+  t[2] ^= t[3];
+
+  temp[6] = temp[7] ^ t[2];
+  temp[3] = t[2] ^ t[1];
+  temp[2] = t[1] ^ t[0];
+  temp[1] = t[0] ^ temp[0];
+  // Cross term of the low half: (a0 ^ a1) * (b0 ^ b1).
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[1]), (b[0] ^ b[1]));
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+  // Cross term of the high half: (a2 ^ a3) * (b2 ^ b3).
+  poly64_mul_s(&t[1], &t[0], (a[2] ^ a[3]), (b[2] ^ b[3]));
+  temp[3] ^= t[0];
+  temp[6] ^= t[1];
+  // Add the low-half and high-half products into the middle words temp[2..5] (outer Karatsuba step).
+  temp[5] = temp[7] ^ temp[3];
+  temp[4] = temp[6] ^ temp[2];
+  temp[3] ^= temp[1];
+  temp[2] ^= temp[0];
+  // Middle term (a_lo ^ a_hi) * (b_lo ^ b_hi): add[] holds its two-limb operands, multiplied with three limb products.
+  add[0] = a[0] ^ a[2];
+  add[1] = a[1] ^ a[3];
+  add[2] = b[0] ^ b[2];
+  add[3] = b[1] ^ b[3];
+  poly64_mul_s(&t[1], &t[0], add[0], add[2]);
+  poly64_mul_s(&t[3], &t[2], add[1], add[3]);
+  t[1] ^= t[2];
+  t[2] = t[1] ^ t[3];
+  t[1] ^= t[0];
+
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+  temp[4] ^= t[2];
+  temp[5] ^= t[3];
+
+  poly64_mul_s(&t[1], &t[0], (add[0] ^ add[1]), (add[2] ^ add[3]));
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+  // Reduction: the bits that the << 10, << 5 and << 2 folds push out of temp[7] wrap around once more into the lowest high word.
+  t[0] = temp[4] ^ ((temp[7] >> 54) ^ (temp[7] >> 59) ^ (temp[7] >> 62));
+  // c = low ^ high ^ (high << 10) ^ (high << 5) ^ (high << 2), with high = {t[0], temp[5], temp[6], temp[7]}.
+  c[3] = temp[3] ^ temp[7];
+  c[3] ^= (temp[7] << 10) | (temp[6] >> 54);
+  c[3] ^= (temp[7] <<  5) | (temp[6] >> 59);
+  c[3] ^= (temp[7] <<  2) | (temp[6] >> 62);
+
+  c[2] = temp[2] ^ temp[6];
+  c[2] ^= (temp[6] << 10) | (temp[5] >> 54);
+  c[2] ^= (temp[6] <<  5) | (temp[5] >> 59);
+  c[2] ^= (temp[6] <<  2) | (temp[5] >> 62);
+
+  c[1] = temp[1] ^ temp[5];
+  c[1] ^= (temp[5] << 10) | (t[0] >> 54);
+  c[1] ^= (temp[5] <<  5) | (t[0] >> 59);
+  c[1] ^= (temp[5] <<  2) | (t[0] >> 62);
+
+  c[0] = temp[0] ^ t[0];
+  c[0] ^= (t[0] << 10);
+  c[0] ^= (t[0] <<  5);
+  c[0] ^= (t[0] <<  2);
+}
+// c ^= a * b in GF(2^256) (multiply-accumulate), built on poly64_mul_s: Karatsuba product on 64-bit limbs, reduction with x^256 = x^10 + x^5 + x^2 + 1, and the reduced result is XORed into the existing contents of c.
+void GF_mul_add_s(GF c, const GF a, const GF b)
+{
+  uint64_t t[4] = {0,};
+  uint64_t add[4] = {0,};
+  uint64_t temp[8] = {0,};
+  // Limb products a[i] * b[i]: poly64_mul_s stores the high word through its first pointer, the low word through its second.
+  poly64_mul_s(&t[0], &temp[0], a[0], b[0]);
+  poly64_mul_s(&t[2], &t[1], a[1], b[1]);
+  t[0] ^= t[1];
+
+  poly64_mul_s(&t[3], &t[1], a[2], b[2]);
+  t[1] ^= t[2];
+
+  poly64_mul_s(&temp[7], &t[2], a[3], b[3]);
+  t[2] ^= t[3];
+
+  temp[6] = temp[7] ^ t[2];
+  temp[3] = t[2] ^ t[1];
+  temp[2] = t[1] ^ t[0];
+  temp[1] = t[0] ^ temp[0];
+  // Cross term of the low half: (a0 ^ a1) * (b0 ^ b1).
+  poly64_mul_s(&t[1], &t[0], (a[0] ^ a[1]), (b[0] ^ b[1]));
+  temp[1] ^= t[0];
+  temp[2] ^= t[1];
+  // Cross term of the high half: (a2 ^ a3) * (b2 ^ b3).
+  poly64_mul_s(&t[1], &t[0], (a[2] ^ a[3]), (b[2] ^ b[3]));
+  temp[3] ^= t[0];
+  temp[6] ^= t[1];
+  // Add the low-half and high-half products into the middle words temp[2..5] (outer Karatsuba step).
+  temp[5] = temp[7] ^ temp[3];
+  temp[4] = temp[6] ^ temp[2];
+  temp[3] ^= temp[1];
+  temp[2] ^= temp[0];
+  // Middle term (a_lo ^ a_hi) * (b_lo ^ b_hi): add[] holds its two-limb operands, multiplied with three limb products.
+  add[0] = a[0] ^ a[2];
+  add[1] = a[1] ^ a[3];
+  add[2] = b[0] ^ b[2];
+  add[3] = b[1] ^ b[3];
+  poly64_mul_s(&t[1], &t[0], add[0], add[2]);
+  poly64_mul_s(&t[3], &t[2], add[1], add[3]);
+  t[1] ^= t[2];
+  t[2] = t[1] ^ t[3];
+  t[1] ^= t[0];
+
+  temp[2] ^= t[0];
+  temp[3] ^= t[1];
+  temp[4] ^= t[2];
+  temp[5] ^= t[3];
+
+  poly64_mul_s(&t[1], &t[0], (add[0] ^ add[1]), (add[2] ^ add[3]));
+  temp[3] ^= t[0];
+  temp[4] ^= t[1];
+  // Reduction: the bits that the << 10, << 5 and << 2 folds push out of temp[7] wrap around once more into the lowest high word.
+  t[0] = temp[4] ^ ((temp[7] >> 54) ^ (temp[7] >> 59) ^ (temp[7] >> 62));
+  // Accumulate low ^ high ^ (high << 10) ^ (high << 5) ^ (high << 2) into c, with high = {t[0], temp[5], temp[6], temp[7]}.
+  c[3] ^= temp[3] ^ temp[7];
+  c[3] ^= (temp[7] << 10) | (temp[6] >> 54);
+  c[3] ^= (temp[7] <<  5) | (temp[6] >> 59);
+  c[3] ^= (temp[7] <<  2) | (temp[6] >> 62);
+
+  c[2] ^= temp[2] ^ temp[6];
+  c[2] ^= (temp[6] << 10) | (temp[5] >> 54);
+  c[2] ^= (temp[6] <<  5) | (temp[5] >> 59);
+  c[2] ^= (temp[6] <<  2) | (temp[5] >> 62);
+
+  c[1] ^= temp[1] ^ temp[5];
+  c[1] ^= (temp[5] << 10) | (t[0] >> 54);
+  c[1] ^= (temp[5] <<  5) | (t[0] >> 59);
+  c[1] ^= (temp[5] <<  2) | (t[0] >> 62);
+
+  c[0] ^= temp[0] ^ t[0];
+  c[0] ^= (t[0] << 10);
+  c[0] ^= (t[0] <<  5);
+  c[0] ^= (t[0] <<  2);
+}
diff --git a/crypto_sign/aimer256s/m4stack/field.h b/crypto_sign/aimer256s/m4stack/field.h
new file mode 100644
index 00000000..089ad983
--- /dev/null
+++ b/crypto_sign/aimer256s/m4stack/field.h
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef FIELD_H
+#define FIELD_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+typedef uint64_t GF[4];
+
+#define poly64_mul AIMER_NAMESPACE(poly64_mul)
+void poly64_mul(const uint64_t *a, const uint64_t *b, uint64_t *c1, uint64_t *c0);
+#define GF_set0 AIMER_NAMESPACE(GF_set0)
+void GF_set0(GF a);
+#define GF_copy AIMER_NAMESPACE(GF_copy)
+void GF_copy(GF out, const GF in);
+#define GF_to_bytes AIMER_NAMESPACE(GF_to_bytes)
+void GF_to_bytes(uint8_t *out, const GF in);
+#define GF_from_bytes AIMER_NAMESPACE(GF_from_bytes)
+void GF_from_bytes(GF out, const uint8_t *in);
+
+#define GF_add AIMER_NAMESPACE(GF_add)
+void GF_add(GF c, const GF a, const GF b);
+#define GF_mul AIMER_NAMESPACE(GF_mul)
+void GF_mul(GF c, const GF a, const GF b);
+#define GF_mul_add AIMER_NAMESPACE(GF_mul_add)
+void GF_mul_add(GF c, const GF a, const GF b);
+#define GF_transposed_matmul AIMER_NAMESPACE(GF_transposed_matmul)
+void GF_transposed_matmul(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]);
+#define GF_transposed_matmul_add AIMER_NAMESPACE(GF_transposed_matmul_add)
+void GF_transposed_matmul_add(GF c, const GF a, const GF b[AIM2_NUM_BITS_FIELD]);
+
+#define GF_mul_s AIMER_NAMESPACE(GF_mul_s)
+void GF_mul_s(GF c, const GF a, const GF b);
+#define GF_mul_add_s AIMER_NAMESPACE(GF_mul_add_s)
+void GF_mul_add_s(GF c, const GF a, const GF b);
+#define GF_sqr_s AIMER_NAMESPACE(GF_sqr_s)
+void GF_sqr_s(GF c, const GF a);
+
+#endif // FIELD_H
diff --git a/crypto_sign/aimer256s/m4stack/hash.c b/crypto_sign/aimer256s/m4stack/hash.c
new file mode 100644
index 00000000..06a6f473
--- /dev/null
+++ b/crypto_sign/aimer256s/m4stack/hash.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: MIT
+
+#include "hash.h"
+#include <stddef.h>
+#include <stdint.h>
+// Starts a new incremental SHAKE256 computation in ctx (forwards to shake256_inc_init).
+void hash_init(hash_instance *ctx)
+{
+  shake256_inc_init(ctx);
+}
+// Starts a new incremental SHAKE256 computation in ctx and absorbs the single byte prefix as its first input.
+void hash_init_prefix(hash_instance *ctx, uint8_t prefix)
+{
+  shake256_inc_init(ctx);
+  shake256_inc_absorb(ctx, &prefix, sizeof(prefix));
+}
+// Absorbs data_len bytes starting at data into ctx (forwards to shake256_inc_absorb).
+void hash_update(hash_instance *ctx, const uint8_t *data, size_t data_len)
+{
+  shake256_inc_absorb(ctx, data, data_len);
+}
+// Ends the absorb phase of ctx (forwards to shake256_inc_finalize).
+void hash_final(hash_instance *ctx)
+{
+  shake256_inc_finalize(ctx);
+}
+// Squeezes buffer_len output bytes from ctx into buffer. Note that shake256_inc_squeeze takes the output buffer first and the context last.
+void hash_squeeze(hash_instance *ctx, uint8_t *buffer, size_t buffer_len)
+{
+  shake256_inc_squeeze(buffer, buffer_len, ctx);
+}
+// Duplicates ctx_src into ctx_dest by forwarding to shake256_inc_ctx_clone; ctx_src is passed as const.
+void hash_ctx_clone(hash_instance *ctx_dest, const hash_instance *ctx_src)
+{
+  shake256_inc_ctx_clone(ctx_dest, ctx_src);
+}
+// Releases ctx by forwarding to shake256_inc_ctx_release (presumably frees whatever the fips202 backend holds for it -- confirm there).
+void hash_ctx_release(hash_instance *ctx)
+{
+  shake256_inc_ctx_release(ctx);
+}
diff --git a/crypto_sign/aimer256s/m4stack/hash.h b/crypto_sign/aimer256s/m4stack/hash.h
new file mode 100644
index 00000000..0fb7c8ef
--- /dev/null
+++ b/crypto_sign/aimer256s/m4stack/hash.h
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef HASH_H
+#define HASH_H
+
+#include "fips202.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+static const uint8_t HASH_PREFIX_0 = 0;
+static const uint8_t HASH_PREFIX_1 = 1;
+static const uint8_t HASH_PREFIX_2 = 2;
+static const uint8_t HASH_PREFIX_3 = 3;
+static const uint8_t HASH_PREFIX_4 = 4;
+static const uint8_t HASH_PREFIX_5 = 5;
+
+typedef shake256incctx hash_instance;
+
+#define hash_init AIMER_NAMESPACE(hash_init)
+void hash_init(hash_instance *ctx);
+#define hash_init_prefix AIMER_NAMESPACE(hash_init_prefix)
+void hash_init_prefix(hash_instance *ctx, uint8_t prefix);
+#define hash_update AIMER_NAMESPACE(hash_update)
+void hash_update(hash_instance *ctx, const uint8_t *data, size_t data_len);
+#define hash_final AIMER_NAMESPACE(hash_final)
+void hash_final(hash_instance *ctx);
+#define hash_squeeze AIMER_NAMESPACE(hash_squeeze)
+void hash_squeeze(hash_instance *ctx, uint8_t *buffer, size_t buffer_len);
+#define hash_ctx_clone AIMER_NAMESPACE(hash_ctx_clone)
+void hash_ctx_clone(hash_instance *ctx_dest, const hash_instance *ctx_src);
+#define hash_ctx_release AIMER_NAMESPACE(hash_ctx_release)
+void hash_ctx_release(hash_instance *ctx);
+
+#endif // HASH_H
diff --git a/crypto_sign/aimer256s/m4stack/params.h b/crypto_sign/aimer256s/m4stack/params.h
new file mode 100644
index 00000000..63a4c90f
--- /dev/null
+++ b/crypto_sign/aimer256s/m4stack/params.h
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef PARAMS_H
+#define PARAMS_H
+
+#define AIMER_NAMESPACE(s)          samsungsds_aimer256s_m4stack_##s
+
+#define SECURITY_BITS               256                  // security parameter
+#define SECURITY_BYTES              (SECURITY_BITS / 8)  // byte size of security parameter
+
+#define AIM2_NUM_BITS_FIELD         SECURITY_BITS        // number of bits in field element
+#define AIM2_NUM_BYTES_FIELD        SECURITY_BYTES       // number of bytes in field element
+#define AIM2_NUM_WORDS_FIELD        (SECURITY_BITS / 64) // number of 64-bit words in element
+#define AIM2_NUM_BITS_WORD          64                   // number of bits in word
+#define AIM2_IV_SIZE                SECURITY_BYTES       // byte size of AIM2 initial vector
+
+#define AIM2_NUM_INPUT_SBOX         3                    // number of AIM2 input S-boxes
+
+#define AIMER_SALT_SIZE             SECURITY_BYTES       // byte size of salt
+#define AIMER_SEED_SIZE             SECURITY_BYTES       // byte size of seed
+#define AIMER_COMMIT_SIZE           (SECURITY_BYTES * 2) // byte size of commitment
+
+#define AIMER_L                     AIM2_NUM_INPUT_SBOX
+#define AIMER_T                     33                   // number of parallel repetitions (Tau)
+#define AIMER_N                     256                  // number of MPC parties (N)
+#define AIMER_LOGN                  8                    // log_2(N)
+
+#endif // PARAMS_H
diff --git a/crypto_sign/aimer256s/m4stack/sign.c b/crypto_sign/aimer256s/m4stack/sign.c
new file mode 100644
index 00000000..601718ad
--- /dev/null
+++ b/crypto_sign/aimer256s/m4stack/sign.c
@@ -0,0 +1,654 @@
+// SPDX-License-Identifier: MIT
+
+#include "api.h"
+#include "aim2.h"
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include "randombytes.h"
+#include "sign.h"
+#include "tree.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+void commit_and_expand_tape(tape_t *tape, uint8_t *commit,
+                            const hash_instance *ctx_precom,
+                            const uint8_t seed[AIMER_SEED_SIZE],
+                            size_t rep, size_t party)
+{
+  // absorbed input: (uint8_t)rep || (uint8_t)party || seed (indices keep only their low byte)
+  uint8_t msg[2 + AIMER_SEED_SIZE];
+  msg[0] = (uint8_t)rep;
+  msg[1] = (uint8_t)party;
+  memcpy(&msg[2], seed, AIMER_SEED_SIZE);
+
+  hash_instance h; // private clone, so the caller's ctx_precom is left untouched
+  hash_ctx_clone(&h, ctx_precom);
+  hash_update(&h, msg, sizeof(msg));
+  hash_final(&h);
+  hash_squeeze(&h, commit, AIMER_COMMIT_SIZE);      // first output: the commitment
+  hash_squeeze(&h, (uint8_t *)tape, sizeof(*tape)); // next output: raw bytes of the tape
+  hash_ctx_release(&h);
+}
+
+void aim2_mpc(mult_chk_t *mult_chk, // in: x_shares[0..L-1] and the initial x_shares[L]; out: x_shares[L], z_shares[0..L]
+              const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+              const GF ct_GF)
+{
+  // S-box relation: pt + c = t ^ {2 ^ e - 1}
+  // --> t ^ {2 ^ e} + t * c = t * pt
+  // --> z = x * pt, with x = t; z_shares receives the left-hand side
+  GF_sqr_s(mult_chk->z_shares[0], mult_chk->x_shares[0]);
+  for (size_t i = 1; i < 11; i++) // 11 GF_sqr_s calls in total for S-box 0
+  {
+    GF_sqr_s(mult_chk->z_shares[0], mult_chk->z_shares[0]); 
+  }
+  GF_mul_add(mult_chk->z_shares[0], mult_chk->x_shares[0], aim2_constants[0]);
+  GF_transposed_matmul_add(mult_chk->x_shares[AIMER_L], mult_chk->x_shares[0],
+                           matrix_A[0]);
+  // S-box 1: aim2_e2_power_matrix is used where S-boxes 0 and 2 use a squaring loop
+  GF_mul(mult_chk->z_shares[1], mult_chk->x_shares[1], aim2_constants[1]);
+  GF_transposed_matmul_add(mult_chk->z_shares[1], mult_chk->x_shares[1],
+                           aim2_e2_power_matrix);
+  GF_transposed_matmul_add(mult_chk->x_shares[AIMER_L], mult_chk->x_shares[1],
+                           matrix_A[1]);
+
+  GF_sqr_s(mult_chk->z_shares[2], mult_chk->x_shares[2]);
+  for (size_t i = 1; i < 7; i++) // 7 GF_sqr_s calls in total for S-box 2
+  {
+    GF_sqr_s(mult_chk->z_shares[2], mult_chk->z_shares[2]);
+  }
+  GF_mul_add(mult_chk->z_shares[2], mult_chk->x_shares[2], aim2_constants[2]);
+  GF_transposed_matmul_add(mult_chk->x_shares[AIMER_L], mult_chk->x_shares[2],
+                           matrix_A[2]);
+
+  // output S-box on x = x_shares[L]: x ^ {2 ^ e - 1} = pt + ct
+  // --> x ^ {2 ^ e} + x * ct = x * pt
+  // --> z = x * pt (three GF_sqr_s calls below, then the ct_GF term is added)
+  GF_sqr_s(mult_chk->z_shares[AIMER_L], mult_chk->x_shares[AIMER_L]);
+  GF_sqr_s(mult_chk->z_shares[AIMER_L], mult_chk->z_shares[AIMER_L]);
+  GF_sqr_s(mult_chk->z_shares[AIMER_L], mult_chk->z_shares[AIMER_L]);
+  GF_mul_add(mult_chk->z_shares[AIMER_L], mult_chk->x_shares[AIMER_L], ct_GF);
+}
+
+// Phase 1: commit to the seeds and the execution views of the parties.
+void run_phase_1(signature_t *sign, // in: salt; out: proofs[].delta_*_bytes and h_1
+                 const GF pt_GF, const GF sbox_outputs[AIMER_L],
+                 const uint8_t mu[AIMER_COMMIT_SIZE], // absorbed first into the h_1 hash
+                 const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE]) // one seed-tree root per repetition
+{
+  uint8_t commits[AIMER_COMMIT_SIZE]; // commitment of the party currently processed
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE]; // seed tree of the current repetition
+
+  hash_instance ctx;
+
+  // hash_instance for h_1
+  hash_init_prefix(&ctx, HASH_PREFIX_1);
+  hash_update(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx, sign->salt, AIMER_SALT_SIZE);
+  // salt-prefixed state that commit_and_expand_tape clones for every party
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    // compute parties' seeds using binary tree
+    expand_tree(nodes, sign->salt, rep, root_seeds[rep]);
+
+    // initialize adjustment values
+    tape_t delta;
+    memset(&delta, 0, sizeof(tape_t));
+
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      // generate execution views and commitments
+      tape_t tape;
+      commit_and_expand_tape(&tape, commits, &ctx_precom,
+                             nodes[AIMER_N + party - 1],  rep, party); // leaf seeds are the last AIMER_N rows of nodes[]
+      hash_update(&ctx, commits, AIMER_COMMIT_SIZE);
+
+      // compute offsets: delta accumulates the tape values of all parties
+      GF_add(delta.pt_share, delta.pt_share, tape.pt_share);
+      GF_add(delta.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+      GF_add(delta.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+      GF_add(delta.t_shares[2], delta.t_shares[2], tape.t_shares[2]);
+      GF_add(delta.a_share, delta.a_share, tape.a_share);
+      GF_add(delta.c_share, delta.c_share, tape.c_share);
+
+      // adjust the last share and prepare the proof and h_1
+      if (party == AIMER_N - 1)
+      {
+        GF_add(delta.pt_share, delta.pt_share, pt_GF);
+        GF_add(delta.t_shares[0], delta.t_shares[0], sbox_outputs[0]);
+        GF_add(delta.t_shares[1], delta.t_shares[1], sbox_outputs[1]);
+        GF_add(delta.t_shares[2], delta.t_shares[2], sbox_outputs[2]);
+        GF_mul_add_s(delta.c_share, pt_GF, delta.a_share); // presumably c += pt * a -- confirm in field.h
+
+        GF_to_bytes(sign->proofs[rep].delta_pt_bytes, delta.pt_share);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[0], delta.t_shares[0]);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[1], delta.t_shares[1]);
+        GF_to_bytes(sign->proofs[rep].delta_ts_bytes[2], delta.t_shares[2]);
+        GF_to_bytes(sign->proofs[rep].delta_c_bytes, delta.c_share);
+      }
+    }
+
+    // NOTE: depend on the order of values in proof_t (delta_pt, delta_ts[0..L-1], delta_c hashed as one span)
+    hash_update(&ctx, sign->proofs[rep].delta_pt_bytes,
+                AIM2_NUM_BYTES_FIELD * (AIMER_L + 2));
+  }
+  hash_ctx_release(&ctx_precom);
+
+  // commit to salt, (all commitments of parties' seeds,
+  // delta_pt, delta_t, delta_c) for all repetitions
+  hash_final(&ctx);
+  hash_squeeze(&ctx, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+}
+
+void run_phase_1_to_3(signature_t *sign, // phases 1-3: re-derive all tapes, run the MPC check, write sign->h_2
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE],
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b, const GF ct_GF)
+{
+  uint8_t commits[AIMER_COMMIT_SIZE]; // written by commit_and_expand_tape, not read in this function
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE];
+
+  GF pt_shares[AIMER_N];
+  GF alpha_v_shares[2][AIMER_N]; // row 0: alpha shares, row 1: v shares (per party)
+  GF epsilons[AIMER_L + 1];
+
+  // prepare h2
+  hash_instance ctx;
+  hash_init_prefix(&ctx, HASH_PREFIX_2);
+  hash_update(&ctx, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_update(&ctx, sign->salt, AIMER_SALT_SIZE);
+
+  // prepare epsilons: ctx_e is squeezed once per repetition below
+  hash_instance ctx_e;
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+  // same salt-prefixed commitment state as in run_phase_1, so the tapes come out identical
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  GF alpha = {0,};
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    // compute parties' seeds using binary tree
+    expand_tree(nodes, sign->salt, rep, root_seeds[rep]);
+
+    // initialize adjustment values
+    tape_t delta;
+    memset(&delta, 0, sizeof(tape_t));
+
+    // expand challenge
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF_set0(alpha);
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      // generate execution views and commitments
+      tape_t tape;
+      commit_and_expand_tape(&tape, commits, &ctx_precom,
+                             nodes[AIMER_N + party - 1], rep, party);
+
+      mult_chk_t mult_chk;
+      memset(&mult_chk, 0, sizeof(mult_chk)); // x_shares[L] starts at zero for every party
+
+      // adjust the last party's shares with the deltas already stored in the proof
+      if (party == AIMER_N - 1)
+      {
+        GF_from_bytes(delta.pt_share, sign->proofs[rep].delta_pt_bytes);
+        GF_from_bytes(delta.t_shares[0], sign->proofs[rep].delta_ts_bytes[0]);
+        GF_from_bytes(delta.t_shares[1], sign->proofs[rep].delta_ts_bytes[1]);
+        GF_from_bytes(delta.t_shares[2], sign->proofs[rep].delta_ts_bytes[2]);
+        GF_from_bytes(delta.c_share, sign->proofs[rep].delta_c_bytes);
+
+        GF_add(tape.pt_share, delta.pt_share, tape.pt_share);
+        GF_add(tape.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+        GF_add(tape.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+        GF_add(tape.t_shares[2], delta.t_shares[2], tape.t_shares[2]);
+        GF_add(tape.c_share, delta.c_share, tape.c_share);
+
+        GF_copy(mult_chk.x_shares[AIMER_L], vector_b); // the constant vector_b is added by the last party only
+      }
+
+      // run the MPC simulation and prepare the mult check inputs
+      GF_copy(mult_chk.x_shares[0], tape.t_shares[0]);
+      GF_copy(mult_chk.x_shares[1], tape.t_shares[1]);
+      GF_copy(mult_chk.x_shares[2], tape.t_shares[2]);
+      GF_copy(pt_shares[party], tape.pt_share);
+      GF_copy(alpha_v_shares[0][party], tape.a_share);
+      GF_copy(alpha_v_shares[1][party], tape.c_share);
+      aim2_mpc(&mult_chk, matrix_A, ct_GF);
+      // alpha share = a + sum_j eps_j * x_j ; v share starts as c + sum_j eps_j * z_j
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[3], epsilons[3]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[3], epsilons[3]);
+      GF_add(alpha, alpha, alpha_v_shares[0][party]); // alpha = sum of all alpha shares
+    }
+
+    // alpha is opened, so we can finish calculating v_share
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      GF_mul_add(alpha_v_shares[1][party], pt_shares[party], alpha);
+    }
+    hash_update(&ctx, (const uint8_t *)alpha_v_shares, sizeof(alpha_v_shares)); // raw in-memory bytes of both rows
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+
+  hash_final(&ctx);
+  hash_squeeze(&ctx, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+}
+
+void run_phase_1_to_5(signature_t *sign, // phases 4-5: pick the hidden party per repetition and fill the rest of proofs[]
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE],
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b)
+{
+  // prepare challenge parties
+  hash_instance ctx;
+  hash_init(&ctx);
+  hash_update(&ctx, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_final(&ctx);
+
+  uint8_t indices[AIMER_T]; // AIMER_N <= 256
+  hash_squeeze(&ctx, indices, AIMER_T);
+  hash_ctx_release(&ctx);
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    indices[rep] &= (1 << AIMER_LOGN) - 1; // keep the low AIMER_LOGN bits: hidden party index for this repetition
+  }
+
+  uint8_t commits[AIMER_COMMIT_SIZE];
+  uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE];
+
+  GF epsilons[AIMER_L + 1];
+
+  // prepare epsilons
+  hash_instance ctx_e;
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  mult_chk_t mult_chk; // only x_shares is used in this function
+  memset(&mult_chk, 0, sizeof(mult_chk_t));
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    size_t i_bar = indices[rep];
+
+    // compute parties' seeds using binary tree
+    expand_tree(nodes, sign->salt, rep, root_seeds[rep]);
+    reveal_all_but(sign->proofs[rep].reveal_path,
+                   (const uint8_t (*)[AIMER_SEED_SIZE])nodes, i_bar);
+
+    // expand challenge
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF alpha_share = {0,};
+    // only the hidden party's tape is recomputed here; its commitment goes into the proof
+    tape_t tape;
+    commit_and_expand_tape(&tape, commits, &ctx_precom,
+                           nodes[AIMER_N + i_bar - 1], rep, i_bar);
+    memcpy(sign->proofs[rep].missing_commitment, commits,
+           AIMER_COMMIT_SIZE);
+
+    GF_set0(mult_chk.x_shares[AIMER_L]); // reset the accumulator reused across repetitions
+
+    // adjust the shares with the stored deltas when the hidden party is the last one
+    if (i_bar == AIMER_N - 1)
+    {
+      // initialize adjustment values
+      tape_t delta;
+      memset(&delta, 0, sizeof(tape_t));
+
+      GF_from_bytes(delta.pt_share, sign->proofs[rep].delta_pt_bytes);
+      GF_from_bytes(delta.t_shares[0], sign->proofs[rep].delta_ts_bytes[0]);
+      GF_from_bytes(delta.t_shares[1], sign->proofs[rep].delta_ts_bytes[1]);
+      GF_from_bytes(delta.t_shares[2], sign->proofs[rep].delta_ts_bytes[2]);
+      GF_from_bytes(delta.c_share, sign->proofs[rep].delta_c_bytes);
+
+      GF_add(tape.pt_share, delta.pt_share, tape.pt_share);
+      GF_add(tape.t_shares[0], delta.t_shares[0], tape.t_shares[0]);
+      GF_add(tape.t_shares[1], delta.t_shares[1], tape.t_shares[1]);
+      GF_add(tape.t_shares[2], delta.t_shares[2], tape.t_shares[2]);
+      GF_add(tape.c_share, delta.c_share, tape.c_share);
+
+      GF_copy(mult_chk.x_shares[AIMER_L], vector_b);
+    }
+
+    for (size_t ell = 0; ell < AIMER_L; ell++)
+    {
+      GF_copy(mult_chk.x_shares[ell], tape.t_shares[ell]);
+
+      // x_* = sum_i A[i] * t[i] + b
+      GF_transposed_matmul_add(mult_chk.x_shares[AIMER_L],
+                               tape.t_shares[ell], matrix_A[ell]);
+    }
+    GF_copy(alpha_share, tape.a_share);
+    // alpha share of the hidden party = a + sum_j eps_j * x_j
+    GF_mul_add(alpha_share, mult_chk.x_shares[0], epsilons[0]);
+    GF_mul_add(alpha_share, mult_chk.x_shares[1], epsilons[1]);
+    GF_mul_add(alpha_share, mult_chk.x_shares[2], epsilons[2]);
+    GF_mul_add(alpha_share, mult_chk.x_shares[3], epsilons[3]);
+    GF_to_bytes(sign->proofs[rep].missing_alpha_share_bytes, alpha_share);
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+int crypto_sign_keypair(uint8_t *pk, uint8_t *sk)
+{
+  // key layout: pk = iv || ct, sk = pt || iv || ct; returns -1 on a NULL argument, else 0
+  if (pk == NULL || sk == NULL)
+  {
+    return -1;
+  }
+  uint8_t *const iv = pk;
+  uint8_t *const ct = pk + AIM2_IV_SIZE;
+  randombytes(sk, AIM2_NUM_BYTES_FIELD); // secret input is drawn first, then the iv
+  randombytes(iv, AIM2_IV_SIZE);
+  aim2(ct, sk, iv);
+  memcpy(sk + AIM2_NUM_BYTES_FIELD, pk, AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  return 0;
+}
+
+int crypto_sign_signature(uint8_t *sig, size_t *siglen, // out: signature bytes, *siglen = CRYPTO_BYTES; always returns 0
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk) // layout read below: pt || iv || ct
+{
+  signature_t *sign = (signature_t *)sig; // the signature is built in place inside the caller's buffer
+
+  GF pt_GF = {0,}, ct_GF = {0,};
+  GF_from_bytes(pt_GF, sk);
+  GF_from_bytes(ct_GF, sk + AIM2_NUM_BYTES_FIELD + AIM2_IV_SIZE);
+
+  // message pre-hashing: mu = H(prefix 0, iv || ct, m)
+  hash_instance ctx;
+  hash_init_prefix(&ctx, HASH_PREFIX_0);
+  hash_update(&ctx, sk + AIM2_NUM_BYTES_FIELD,
+              AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx, m, mlen);
+  hash_final(&ctx);
+
+  uint8_t mu[AIMER_COMMIT_SIZE];
+  hash_squeeze(&ctx, mu, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx);
+
+  // compute first L sboxes' outputs
+  GF sbox_outputs[AIMER_L];
+  aim2_sbox_outputs(sbox_outputs, pt_GF);
+
+  // derive the binary matrix and the vector from the initial vector
+  GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+  generate_matrix_LU(matrix_A, vector_b, sk + AIM2_NUM_BYTES_FIELD);
+
+  // generate per-signature randomness
+  uint8_t random[SECURITY_BYTES];
+  randombytes(random, SECURITY_BYTES);
+
+  // generate salt and root seeds from (secret input, mu, fresh randomness)
+  hash_instance ctx_roots;
+  hash_init_prefix(&ctx_roots, HASH_PREFIX_3);
+  hash_update(&ctx_roots, sk, AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx_roots, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_roots, random, SECURITY_BYTES);
+  hash_final(&ctx_roots);
+  hash_squeeze(&ctx_roots, sign->salt, AIMER_SALT_SIZE); // first output: salt
+
+  uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE];
+  hash_squeeze(&ctx_roots, (uint8_t *)root_seeds, AIMER_T * AIMER_SEED_SIZE); // then one root seed per repetition
+  hash_ctx_release(&ctx_roots);
+  // the three passes below each re-derive the tapes from root_seeds; nothing per-party is kept between them
+  run_phase_1(sign, pt_GF, (const GF *)sbox_outputs, mu,
+              (const uint8_t (*)[AIMER_SEED_SIZE])root_seeds);
+
+  run_phase_1_to_3(sign, (const uint8_t (*)[AIMER_SEED_SIZE])root_seeds,
+                   (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A,
+                   vector_b, ct_GF);
+
+  run_phase_1_to_5(sign, (const uint8_t (*)[AIMER_SEED_SIZE])root_seeds,
+                   (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A,
+                   vector_b);
+
+  *siglen = CRYPTO_BYTES; // NOTE(review): presumably equals sizeof(signature_t) -- confirm in api.h
+
+  return 0;
+}
+
+int crypto_sign(uint8_t *sm, size_t *smlen,
+        const uint8_t *m, size_t mlen,
+        const uint8_t *sk)
+{
+  // signed message layout: sm = m || signature; sign first, while m is still intact
+  crypto_sign_signature(sm + mlen, smlen, m, mlen, sk);
+  // memmove, not memcpy: stays well-defined when the caller signs in place (m == sm)
+  memmove(sm, m, mlen);
+  *smlen += mlen; // crypto_sign_signature stored the signature length; add the message
+  return 0; // crypto_sign_signature has no failure path
+}
+
+int crypto_sign_verify(const uint8_t *sig, size_t siglen, // returns 0 if the signature is accepted, -1 otherwise
+        const uint8_t *m, size_t mlen,
+        const uint8_t *pk) // layout read below: iv || ct
+{
+  if (siglen != CRYPTO_BYTES)
+  {
+    return -1;
+  }
+
+  const signature_t *sign = (const signature_t *)sig;
+
+  GF ct_GF = {0,};
+  GF_from_bytes(ct_GF, pk + AIM2_IV_SIZE);
+
+  // derive the binary matrix and the vector from the initial vector
+  GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD];
+  GF vector_b = {0,};
+  generate_matrix_LU(matrix_A, vector_b, pk);
+
+  hash_instance ctx_e, ctx_h1, ctx_h2;
+
+  // indices = Expand(h_2)
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_2, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  uint8_t indices[AIMER_T]; // AIMER_N <= 256
+  hash_squeeze(&ctx_e, indices, AIMER_T);
+  hash_ctx_release(&ctx_e);
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    indices[rep] &= (1 << AIMER_LOGN) - 1; // hidden party index of each repetition
+  }
+
+  // epsilons = Expand(h_1); ctx_e is reused and squeezed once per repetition
+  hash_init(&ctx_e);
+  hash_update(&ctx_e, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_final(&ctx_e);
+
+  // message pre-hashing (ctx_h1 is used as a scratch context here, then re-initialised)
+  uint8_t mu[AIMER_COMMIT_SIZE];
+  hash_init_prefix(&ctx_h1, HASH_PREFIX_0);
+  hash_update(&ctx_h1, pk, AIM2_IV_SIZE + AIM2_NUM_BYTES_FIELD);
+  hash_update(&ctx_h1, m, mlen);
+  hash_final(&ctx_h1);
+  hash_squeeze(&ctx_h1, mu, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h1);
+
+  // ready for computing h_1' and h_2'
+  hash_init_prefix(&ctx_h1, HASH_PREFIX_1);
+  hash_update(&ctx_h1, mu, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_h1, sign->salt, AIMER_SALT_SIZE);
+
+  hash_init_prefix(&ctx_h2, HASH_PREFIX_2);
+  hash_update(&ctx_h2, sign->h_1, AIMER_COMMIT_SIZE);
+  hash_update(&ctx_h2, sign->salt, AIMER_SALT_SIZE);
+
+  hash_instance ctx_precom;
+  hash_init_prefix(&ctx_precom, HASH_PREFIX_5);
+  hash_update(&ctx_precom, sign->salt, AIMER_SALT_SIZE);
+
+  for (size_t rep = 0; rep < AIMER_T; rep++)
+  {
+    size_t i_bar = indices[rep];
+    uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE]; // seed tree without its root; not initialised here
+
+    reconstruct_tree(nodes, sign->salt, sign->proofs[rep].reveal_path,
+                     rep, i_bar);
+
+    GF pt_shares[AIMER_N];
+    GF alpha_v_shares[2][AIMER_N]; // row 0: alpha shares, row 1: v shares
+    GF_set0(alpha_v_shares[1][i_bar]); // hidden party's v share is accumulated from the other parties below
+
+    GF epsilons[AIMER_L + 1];
+    hash_squeeze(&ctx_e, (uint8_t *)epsilons, sizeof(epsilons));
+
+    GF alpha = {0,};
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      if (party == i_bar) // hidden party: take its commitment and alpha share from the proof
+      {
+        hash_update(&ctx_h1, sign->proofs[rep].missing_commitment,
+                    AIMER_COMMIT_SIZE);
+        GF_from_bytes(alpha_v_shares[0][i_bar],
+                      sign->proofs[rep].missing_alpha_share_bytes);
+        GF_add(alpha, alpha, alpha_v_shares[0][i_bar]);
+        continue;
+      }
+
+      tape_t tape;
+      uint8_t commit[AIMER_COMMIT_SIZE];
+      commit_and_expand_tape(&tape, commit, &ctx_precom,
+                             nodes[AIMER_N + party - 2], rep, party); // "- 2": this nodes[] has no root entry
+      hash_update(&ctx_h1, commit, AIMER_COMMIT_SIZE);
+
+      // adjust last shares
+      mult_chk_t mult_chk;
+      memset(&mult_chk, 0, sizeof(mult_chk_t));
+      if (party == AIMER_N - 1)
+      {
+        GF temp = {0,};
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_pt_bytes);
+        GF_add(tape.pt_share, tape.pt_share, temp);
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[0]);
+        GF_add(tape.t_shares[0], tape.t_shares[0], temp);
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[1]);
+        GF_add(tape.t_shares[1], tape.t_shares[1], temp);
+        GF_from_bytes(temp, sign->proofs[rep].delta_ts_bytes[2]);
+        GF_add(tape.t_shares[2], tape.t_shares[2], temp);
+
+        GF_from_bytes(temp, sign->proofs[rep].delta_c_bytes);
+        GF_add(tape.c_share, tape.c_share, temp);
+
+        GF_copy(mult_chk.x_shares[AIMER_L], vector_b);
+      }
+
+      // run the MPC simulation and prepare the mult check inputs
+      GF_copy(mult_chk.x_shares[0], tape.t_shares[0]);
+      GF_copy(mult_chk.x_shares[1], tape.t_shares[1]);
+      GF_copy(mult_chk.x_shares[2], tape.t_shares[2]);
+      GF_copy(pt_shares[party], tape.pt_share);
+      GF_copy(alpha_v_shares[0][party], tape.a_share);
+      GF_copy(alpha_v_shares[1][party], tape.c_share);
+      aim2_mpc(&mult_chk, (const GF (*)[AIM2_NUM_BITS_FIELD])matrix_A, ct_GF);
+      // same epsilon-weighted accumulation as in run_phase_1_to_3
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[0], epsilons[0]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[1], epsilons[1]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[2], epsilons[2]);
+      GF_mul_add(alpha_v_shares[0][party], mult_chk.x_shares[3], epsilons[3]);
+      GF_mul_add(alpha_v_shares[1][party], mult_chk.z_shares[3], epsilons[3]);
+
+      GF_add(alpha, alpha, alpha_v_shares[0][party]);
+    }
+
+    // alpha is opened, so we can finish calculating v_share
+    for (size_t party = 0; party < AIMER_N; party++)
+    {
+      if (party == i_bar)
+      {
+        continue;
+      }
+
+      GF_mul_add(alpha_v_shares[1][party], pt_shares[party], alpha);
+      GF_add(alpha_v_shares[1][i_bar], alpha_v_shares[1][i_bar],
+             alpha_v_shares[1][party]); // hidden v share := sum of the other parties' v shares
+    }
+
+    // v is opened
+    hash_update(&ctx_h2, (const uint8_t *)alpha_v_shares,
+                sizeof(alpha_v_shares));
+
+    // NOTE: depend on the order of values in proof_t (delta_pt, delta_ts[0..L-1], delta_c hashed as one span)
+    hash_update(&ctx_h1, sign->proofs[rep].delta_pt_bytes,
+                AIM2_NUM_BYTES_FIELD * (AIMER_L + 2));
+  }
+  hash_ctx_release(&ctx_e);
+  hash_ctx_release(&ctx_precom);
+
+  uint8_t h_1_prime[AIMER_COMMIT_SIZE];
+  hash_final(&ctx_h1);
+  hash_squeeze(&ctx_h1, h_1_prime, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h1);
+
+  uint8_t h_2_prime[AIMER_COMMIT_SIZE];
+  hash_final(&ctx_h2);
+  hash_squeeze(&ctx_h2, h_2_prime, AIMER_COMMIT_SIZE);
+  hash_ctx_release(&ctx_h2);
+  // accept only if both recomputed hashes match the ones carried in the signature
+  if (memcmp(h_1_prime, sign->h_1, AIMER_COMMIT_SIZE) != 0 ||
+      memcmp(h_2_prime, sign->h_2, AIMER_COMMIT_SIZE) != 0)
+  {
+    return -1;
+  }
+
+  return 0;
+}
+
+int crypto_sign_open(uint8_t *m, size_t *mlen,
+        const uint8_t *sm, size_t smlen,
+        const uint8_t *pk)
+{
+  // Signed message layout: message bytes followed by a CRYPTO_BYTES signature.
+  // Returns 0 and writes m / *mlen on success; returns -1 (outputs untouched)
+  // when sm is too short to hold a signature or the signature is rejected.
+  if (smlen < CRYPTO_BYTES)
+  {
+    return -1;
+  }
+
+  const size_t msg_len = smlen - CRYPTO_BYTES;
+  const int rc = crypto_sign_verify(sm + msg_len, CRYPTO_BYTES, sm, msg_len, pk);
+  if (rc != 0)
+  {
+    return -1;
+  }
+
+  memmove(m, sm, msg_len); // memmove stays well-defined if m overlaps sm
+  *mlen = msg_len;
+  return 0;
+}
diff --git a/crypto_sign/aimer256s/m4stack/sign.h b/crypto_sign/aimer256s/m4stack/sign.h
new file mode 100644
index 00000000..2e2ab61d
--- /dev/null
+++ b/crypto_sign/aimer256s/m4stack/sign.h
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef SIGN_H
+#define SIGN_H
+
+#include "field.h"
+#include "hash.h"
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+typedef struct tape_t // per-party random tape; sign.c fills it directly from hash output bytes
+{
+  GF pt_share;           // share of the secret input pt
+  GF t_shares[AIMER_L];  // shares of the AIMER_L S-box values t
+  GF a_share;            // share of a (initial value of the alpha share)
+  GF c_share;            // share of c (initial value of the v share)
+} tape_t;
+
+typedef struct mult_chk_t // inputs of the multiplication check, one (x, z) pair per S-box
+{
+  GF x_shares[AIMER_L + 1]; // [0..L-1]: input S-boxes, [L]: the S-box checked against ct in aim2_mpc
+  GF z_shares[AIMER_L + 1]; // filled by aim2_mpc, same indexing as x_shares
+} mult_chk_t;
+
+typedef struct proof_t // per-repetition part of the signature
+{
+  uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE];      // sibling seeds, [0] = leaf level (see reveal_all_but)
+  uint8_t missing_commitment[AIMER_COMMIT_SIZE];         // commitment of the hidden party
+  uint8_t delta_pt_bytes[AIM2_NUM_BYTES_FIELD];          // delta_pt, delta_ts, delta_c must stay adjacent and in this
+  uint8_t delta_ts_bytes[AIMER_L][AIM2_NUM_BYTES_FIELD]; // order: sign.c hashes them as one span starting
+  uint8_t delta_c_bytes[AIM2_NUM_BYTES_FIELD];           // at delta_pt_bytes
+  uint8_t missing_alpha_share_bytes[AIM2_NUM_BYTES_FIELD]; // alpha share of the hidden party
+} proof_t;
+
+typedef struct signature_t // sign.c casts the raw signature buffer to this type, so the layout is the byte format
+{
+  uint8_t salt[AIMER_SALT_SIZE];
+  uint8_t h_1[AIMER_COMMIT_SIZE]; // hash over the commitments and deltas; also expanded into the epsilons
+  uint8_t h_2[AIMER_COMMIT_SIZE]; // hash over the alpha / v shares; also expanded into the hidden party indices
+  proof_t proofs[AIMER_T];        // one proof per repetition
+} signature_t;
+
+#define aim2_mpc AIMER_NAMESPACE(aim2_mpc)
+void aim2_mpc(mult_chk_t *mult_chk,
+              const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+              const GF ct_GF); // fills z_shares[] and accumulates x_shares[AIMER_L] for one party
+
+#define commit_and_expand_tape AIMER_NAMESPACE(commit_and_expand_tape)
+void commit_and_expand_tape(tape_t *tape, uint8_t *commit,
+                            const hash_instance *ctx_precom,
+                            const uint8_t seed[AIMER_SEED_SIZE],
+                            size_t rep, size_t party); // derives one party's commitment and tape from its seed
+
+#define run_phase_1 AIMER_NAMESPACE(run_phase_1)
+void run_phase_1(signature_t *sign,
+                 const GF pt_GF, const GF sbox_outputs[AIMER_L],
+                 const uint8_t mu[AIMER_COMMIT_SIZE],
+                 const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE]); // writes proofs[].delta_*_bytes and h_1
+
+#define run_phase_1_to_3 AIMER_NAMESPACE(run_phase_1_to_3)
+void run_phase_1_to_3(signature_t *sign,
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE],
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b, const GF ct_GF); // reads h_1 and the deltas, writes h_2
+
+#define run_phase_1_to_5 AIMER_NAMESPACE(run_phase_1_to_5)
+void run_phase_1_to_5(signature_t *sign,
+                      const uint8_t root_seeds[AIMER_T][AIMER_SEED_SIZE],
+                      const GF matrix_A[AIMER_L][AIM2_NUM_BITS_FIELD],
+                      const GF vector_b); // reads h_1 / h_2, writes reveal_path, missing_commitment, missing_alpha_share_bytes
+
+#endif // SIGN_H
diff --git a/crypto_sign/aimer256s/m4stack/tree.c b/crypto_sign/aimer256s/m4stack/tree.c
new file mode 100644
index 00000000..0585e7fa
--- /dev/null
+++ b/crypto_sign/aimer256s/m4stack/tree.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: MIT
+
+#include "tree.h"
+#include "hash.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+//  Example of tree for [N = 8]
+//  x
+//  d = 0: 1
+//  d = 1: 2         3
+//  d = 2: 4   5     6     7
+//  d = 3: 8 9 10 11 12 13 14 15
+
+void expand_tree(uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE], // out: full seed tree, node k (1-based) at nodes[k - 1]
+                 const uint8_t salt[AIMER_SALT_SIZE],
+                 size_t rep_index,
+                 const uint8_t root_seed[AIMER_SEED_SIZE])
+{
+  size_t index;
+  uint8_t buffer[AIMER_SEED_SIZE + 2]; // (uint8_t)rep_index || (uint8_t)index || parent seed
+
+  hash_instance ctx, ctx_;
+  hash_init_prefix(&ctx_, HASH_PREFIX_4); // ctx_: salt-prefixed state cloned for every node
+  hash_update(&ctx_, salt, AIMER_SALT_SIZE);
+
+  memcpy(nodes[0], root_seed, AIMER_SEED_SIZE);
+  buffer[0] = (uint8_t)(rep_index);
+  for (index = 1; index < AIMER_N; index++) // every internal node, parents before children
+  {
+    buffer[1] = (uint8_t)(index);
+    memcpy(buffer + 2, nodes[index - 1], AIMER_SEED_SIZE);
+
+    hash_ctx_clone(&ctx, &ctx_);
+    hash_update(&ctx, buffer, AIMER_SEED_SIZE + 2);
+    hash_final(&ctx);
+    hash_squeeze(&ctx, nodes[2 * index - 1], AIMER_SEED_SIZE << 1); // two seeds at once: children 2*index and 2*index + 1
+    hash_ctx_release(&ctx);
+  }
+  hash_ctx_release(&ctx_);
+}
+
+void reveal_all_but(uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                    const uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                    size_t cover_index)
+{
+  // Walk from the hidden leaf (1-based tree id cover_index + AIMER_N) towards
+  // the root and copy, at every level, the seed of the current node's sibling.
+  // nodes[] is stored 0-based, hence the "- 1".
+  size_t node = cover_index + AIMER_N;
+  for (size_t level = 0; level < AIMER_LOGN; level++, node >>= 1)
+  {
+    const size_t sibling = node ^ 1;
+    memcpy(reveal_path[level], nodes[sibling - 1], AIMER_SEED_SIZE);
+  }
+}
+
+void reconstruct_tree(uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE], // out: tree without the root, node k (1-based) at nodes[k - 2]
+                      const uint8_t salt[AIMER_SALT_SIZE],
+                      const uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                      size_t rep_index,
+                      size_t cover_index) // leaf whose seed stays unknown
+{
+  size_t index, depth, path;
+  uint8_t buffer[AIMER_SEED_SIZE + 2];
+
+  hash_instance ctx, ctx_;
+  hash_init_prefix(&ctx_, HASH_PREFIX_4);
+  hash_update(&ctx_, salt, AIMER_SALT_SIZE);
+
+  for (depth = 1; depth < AIMER_LOGN; depth++)
+  {
+    path = ((cover_index + AIMER_N) >> (AIMER_LOGN - depth)) ^ 1; // sibling of the hidden path's node at this depth
+    memcpy(nodes[path - 2], reveal_path[AIMER_LOGN - depth], AIMER_SEED_SIZE);
+    // NOTE(review): the hidden-path node at this depth is never written before being hashed below
+    for (index = (1U << depth); index < (2U << depth); index++) // all nodes of this depth
+    {
+      buffer[0] = (uint8_t)(rep_index);
+      buffer[1] = (uint8_t)(index);
+      memcpy(buffer + 2, nodes[index - 2], AIMER_SEED_SIZE);
+
+      hash_ctx_clone(&ctx, &ctx_);
+      hash_update(&ctx, buffer, AIMER_SEED_SIZE + 2);
+      hash_final(&ctx);
+      hash_squeeze(&ctx, nodes[2 * index - 2], AIMER_SEED_SIZE << 1); // both children in one squeeze
+      hash_ctx_release(&ctx);
+    }
+  }
+  hash_ctx_release(&ctx_);
+  // depth == AIMER_LOGN here: place the revealed sibling leaf from reveal_path[0]
+  path = ((cover_index + AIMER_N) >> (AIMER_LOGN - depth)) ^ 1;
+  memcpy(nodes[path - 2], reveal_path[AIMER_LOGN - depth], AIMER_SEED_SIZE);
+}
diff --git a/crypto_sign/aimer256s/m4stack/tree.h b/crypto_sign/aimer256s/m4stack/tree.h
new file mode 100644
index 00000000..f1354f19
--- /dev/null
+++ b/crypto_sign/aimer256s/m4stack/tree.h
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef TREE_H
+#define TREE_H
+
+#include "params.h"
+#include <stddef.h>
+#include <stdint.h>
+
+#define expand_tree AIMER_NAMESPACE(expand_tree)
+void expand_tree(uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                 const uint8_t salt[AIMER_SALT_SIZE],
+                 size_t rep_index,
+                 const uint8_t root_seed[AIMER_SEED_SIZE]); // derives the whole seed tree from root_seed
+
+#define reveal_all_but AIMER_NAMESPACE(reveal_all_but)
+void reveal_all_but(uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                    const uint8_t nodes[2 * AIMER_N - 1][AIMER_SEED_SIZE],
+                    size_t cover_index); // copies the sibling seeds along the path from leaf cover_index to the root
+
+#define reconstruct_tree AIMER_NAMESPACE(reconstruct_tree)
+void reconstruct_tree(uint8_t nodes[2 * AIMER_N - 2][AIMER_SEED_SIZE],
+                      const uint8_t salt[AIMER_SALT_SIZE],
+                      const uint8_t reveal_path[AIMER_LOGN][AIMER_SEED_SIZE],
+                      size_t rep_index,
+                      size_t cover_index); // rebuilds the tree (root omitted) from reveal_path; leaf cover_index stays unknown
+
+#endif // TREE_H