diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index f502c2c6f0..dbcdabee66 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1,5 +1,7 @@
 name: Test Py
-on: [push]
+on:
+  push:
+    branches-ignore: [tvb_kernels]
 
 jobs:
   build:
diff --git a/.github/workflows/contrib-tests.yml b/.github/workflows/contrib-tests.yml
index 815b30f890..61d4b99076 100644
--- a/.github/workflows/contrib-tests.yml
+++ b/.github/workflows/contrib-tests.yml
@@ -1,5 +1,7 @@
 name: Check tvb-contrib
-on: [push]
+on:
+  push:
+    branches-ignore: [tvb_kernels]
 
 jobs:
   build:
diff --git a/.github/workflows/kernels.yml b/.github/workflows/kernels.yml
new file mode 100644
index 0000000000..c7a32ddfa1
--- /dev/null
+++ b/.github/workflows/kernels.yml
@@ -0,0 +1,80 @@
+name: build & test kernel library
+on:
+  push:
+
+# only run latest push
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  pip-tvbk:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: true
+      matrix:
+        python-version: [ "3.10", ]
+
+    steps:
+
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        id: setPy
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: install tools and dependencies
+        run: python3 -m pip install -U pip pytest scipy ctypesgen
+
+      - name: build & teste
+        run: |
+          cd tvb_kernels
+          pip install .
+          pytest
+
+
+  spack-tvbk:
+    runs-on: ubuntu-latest
+    steps:
+
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: setup spack
+        uses: spack/setup-spack@v2
+        with:
+          ref: develop
+          buildcache: true
+          color: true
+          path: spack
+
+      - name: install deps w/ spack
+        shell: spack-bash {0}
+        run: |
+          cd tvb_kernels
+          spack -e . concretize
+          spack -e . install
+          spack env activate .
+          spack env status
+          pip install pytest ctypesgen
+
+      - name: build & test
+        shell: spack-bash {0}
+        run: |
+          cd tvb_kernels
+          spack env activate .
+          pip install .
+          pytest
+
+      # https://github.com/spack/setup-spack?tab=readme-ov-file#example-caching-your-own-binaries-for-public-repositories
+      - name: Push packages and update index
+        run: |
+          cd tvb_kernels
+          spack -e . mirror set --push --oci-username ${{ github.actor }} --oci-password "${{ secrets.GITHUB_TOKEN }}" local-buildcache
+          spack -e . buildcache push --base-image ubuntu:22.04 --update-index local-buildcache
+        if: ${{ !cancelled() }}
diff --git a/.github/workflows/notebooks.yml b/.github/workflows/notebooks.yml
index dcb1c9ef41..6795c76876 100644
--- a/.github/workflows/notebooks.yml
+++ b/.github/workflows/notebooks.yml
@@ -1,5 +1,7 @@
 name: Test Notebooks
-on: [push]
+on:
+  push:
+    branches-ignore: [tvb_kernels]
 
 jobs:
   build:
@@ -59,4 +61,4 @@ jobs:
 
       - name: run notebooks
         run: |
-          python ./tvb_build/notebook_runner.py ./tvb_documentation/demos
\ No newline at end of file
+          python ./tvb_build/notebook_runner.py ./tvb_documentation/demos
diff --git a/.github/workflows/pg-tests.yml b/.github/workflows/pg-tests.yml
index ac9f074465..8f1a9ee067 100644
--- a/.github/workflows/pg-tests.yml
+++ b/.github/workflows/pg-tests.yml
@@ -1,5 +1,7 @@
 name: Test PG
-on: [push]
+on:
+  push:
+    branches-ignore: [tvb_kernels]
 
 jobs:
   build:
diff --git a/.github/workflows/win-tests.yml b/.github/workflows/win-tests.yml
index f4e087dad3..4993a784f7 100644
--- a/.github/workflows/win-tests.yml
+++ b/.github/workflows/win-tests.yml
@@ -1,5 +1,7 @@
 name: Test Win
-on: [push]
+on:
+  push:
+    branches-ignore: [tvb_kernels]
 
 jobs:
   build:
diff --git a/tvb_kernels/.gitignore b/tvb_kernels/.gitignore
new file mode 100644
index 0000000000..cf8713b9c5
--- /dev/null
+++ b/tvb_kernels/.gitignore
@@ -0,0 +1,14 @@
+.cache
+build
+*.jpg
+build
+.spack-env
+spack.lock
+*.whl
+*.so
+*.o
+tvb_kernels/_ctg_tvbk.py
+tvbk.js
+tvbk.wasm
+compile_commands.json
+_ctg_*.py
diff --git a/tvb_kernels/README.md b/tvb_kernels/README.md
new file mode 100644
index 0000000000..60ed194728
--- /dev/null
+++ b/tvb_kernels/README.md
@@ -0,0 +1,39 @@
+# tvb_kernels
+
+This is a library of computational kernels for TVB.
+
+## scope
+
+in order of priority
+
+- [x] sparse delay coupling functions
+- [ ] fused heun neural mass model step functions
+- [ ] neural ODE
+- [ ] bold / tavg monitors
+
+which will then be used internally by TVB.
+
+## building
+
+No stable versions are available, so users can install from
+source with `pip install .` or to just get a wheel `pip wheel .`
+
+For development, prefer
+```
+pip install nanobind scikit-build-core[pyproject]
+pip install --no-build-isolation -Ceditable.rebuild=true -ve .
+```
+ 
+## roadmap
+
+- improve api (ownership, checking etc)
+- variants
+  - single
+  - batches
+  - spatialized parameters
+- cuda ports for Jax, CuPy and Torch users 
+- mex functions?
+- scalable components
+  - domains
+  - projections
+  - small vm to automate inner loop
diff --git a/tvb_kernels/ghci-build.yml b/tvb_kernels/ghci-build.yml
new file mode 120000
index 0000000000..0592cd0e6d
--- /dev/null
+++ b/tvb_kernels/ghci-build.yml
@@ -0,0 +1 @@
+../.github/workflows/kernels.yml
\ No newline at end of file
diff --git a/tvb_kernels/pyproject.toml b/tvb_kernels/pyproject.toml
new file mode 100644
index 0000000000..a064f9445a
--- /dev/null
+++ b/tvb_kernels/pyproject.toml
@@ -0,0 +1,46 @@
+[build-system]
+requires = ["hatchling", "hatch-build-scripts", "ctypesgen"]
+build-backend = "hatchling.build"
+
+[project]
+name = "tvb_kernels"
+version = "0.0.1"
+description = "TVB computational kernels"
+readme = "README.md"
+requires-python = ">=3.8"
+authors = [
+    { name = "Marmaduke Woodman", email = "marmaduke.woodman@univ-amu.fr" },
+]
+classifiers = [ ]
+dependencies = [ "numpy" ]
+
+[project.urls]
+Homepage = "https://github.com/the-virtual-brain/tvb-root"
+
+[tool.cibuildwheel]
+build-verbosity = 1
+test-command = "pytest"
+test-requires = "pytest"
+
+skip = ["pp*"] # don't build pypy wheels
+archs = ["auto"]
+
+# [tool.hatch.version]
+# path = "tvb_kernels/version.py"
+
+[tool.hatch.build]
+include = [
+    "tvb_kernels/*.py",
+]
+
+[[tool.hatch.build.hooks.build-scripts.scripts]]
+work_dir = "src"
+out_dir = "tvb_kernels"
+commands = [ 
+    "make -B libtvbk.so",
+    "ctypesgen -l libtvbk.so tvbk.h > _ctg_tvbk.py",
+]
+artifacts = [ 
+    "libtvbk.so",
+    "_ctg_tvbk.py",
+]
diff --git a/tvb_kernels/spack.yaml b/tvb_kernels/spack.yaml
new file mode 100644
index 0000000000..bcaa0ccb8f
--- /dev/null
+++ b/tvb_kernels/spack.yaml
@@ -0,0 +1,14 @@
+spack:
+
+  specs:
+  - py-numpy
+  - py-scipy
+  - py-pip
+
+  concretizer:
+    unify: true
+
+  mirrors:
+    local-buildcache:
+      url: oci://ghcr.io/maedoc/spack-buildcache
+      signed: false
diff --git a/tvb_kernels/src/Makefile b/tvb_kernels/src/Makefile
new file mode 100644
index 0000000000..df6afaecdf
--- /dev/null
+++ b/tvb_kernels/src/Makefile
@@ -0,0 +1,19 @@
+# run `bear -- make -B -j` to get compile_commands.json
+
+CC = gcc
+CFLAGS = -msse4.2 -O3 -funroll-loops -fopenmp-simd
+SRC = tvbk_conn.c
+OBJ = $(patsubst %.c,%.o,$(SRC))
+SO = libtvbk.so
+ctg = ../tvb_kernels/_ctg_tvbk.py
+
+all: $(SO) $(ctg)
+
+$(SO) : $(OBJ)
+	$(CC) -shared $< -o $@
+	
+$(ctg): tvbk.h
+	ctypesgen -l $(SO) $< > $@
+
+tvbk.wasm: em.cpp tvbk.h $(SRC)
+	emcc -lembind -o tvbk.js em.cpp $(SRC)
diff --git a/tvb_kernels/src/em.cpp b/tvb_kernels/src/em.cpp
new file mode 100644
index 0000000000..5c849d6f8c
--- /dev/null
+++ b/tvb_kernels/src/em.cpp
@@ -0,0 +1,19 @@
+#include <emscripten/bind.h>
+
+using namespace emscripten;
+
+extern "C" {
+#include "tvbk.h"
+}
+
+class Conn {
+  const tvbk_conn conn;
+
+public:
+  Conn() : conn({0}) {}
+  void cx_nop(uint32_t t) { tvbk_cx_nop(&conn, t); }
+};
+
+EMSCRIPTEN_BINDINGS(tvb_kernels) {
+  class_<Conn>("Conn").constructor().function("cx_nop", &Conn::cx_nop);
+}
diff --git a/tvb_kernels/src/tvbk.h b/tvb_kernels/src/tvbk.h
new file mode 100644
index 0000000000..21963af00b
--- /dev/null
+++ b/tvb_kernels/src/tvbk.h
@@ -0,0 +1,66 @@
+#pragma once
+
+#include <stdint.h>
+
+typedef struct tvbk_params tvbk_params;
+
+struct tvbk_params {
+  const uint32_t count;
+  const float *const values;
+};
+
+/* a afferent coupling buffer into which the cx functions
+   accumulate their results */
+typedef struct tvbk_cx tvbk_cx;
+
+struct tvbk_cx {
+  /* values for 1st and 2nd Heun stage respectively.
+     each shaped (num_node, ) */
+  float *const cx1;
+  float *const cx2;
+  /* delay buffer (num_node, num_time)*/
+  float *const buf;
+  const uint32_t num_node;
+  const uint32_t num_time; // horizon, power of 2
+};
+
+typedef struct tvbk_conn tvbk_conn;
+
+struct tvbk_conn {
+  const int num_node;
+  const int num_nonzero;
+  const int num_cvar;
+  const float *const weights;    // (num_nonzero,)
+  const uint32_t *const indices; // (num_nonzero,)
+  const uint32_t *const indptr;  // (num_nodes+1,)
+  const uint32_t *const idelays; // (num_nonzero,)
+};
+
+/* not currently used */
+typedef struct tvbk_sim tvbk_sim;
+struct tvbk_sim {
+  // keep invariant stuff at the top, per sim stuff below
+  const int rng_seed;
+  const int num_node;
+  const int num_svar;
+  const int num_time;
+  const int num_params;
+  const int num_spatial_params;
+  const float dt;
+  const int oversample; // TODO "oversample" for stability,
+  const int num_skip;   // skip per output sample
+  float *z_scale;       // (num_svar), sigma*sqrt(dt)
+
+  // parameters
+  const tvbk_params global_params;
+  const tvbk_params spatial_params;
+
+  float *state_trace; // (num_time//num_skip, num_svar, num_nodes)
+  float *states;      // full states (num_svar, num_nodes)
+
+  const tvbk_conn conn;
+};
+
+void tvbk_cx_j(const tvbk_cx *cx, const tvbk_conn *conn, uint32_t t);
+void tvbk_cx_i(const tvbk_cx *cx, const tvbk_conn *conn, uint32_t t);
+void tvbk_cx_nop(const tvbk_cx *cx, const tvbk_conn *conn, uint32_t t);
diff --git a/tvb_kernels/src/tvbk_conn.c b/tvb_kernels/src/tvbk_conn.c
new file mode 100644
index 0000000000..8b1ee37b3a
--- /dev/null
+++ b/tvb_kernels/src/tvbk_conn.c
@@ -0,0 +1,53 @@
+#include "tvbk.h"
+
+static void cx_all_j(const tvbk_cx *cx, const tvbk_conn *c, uint32_t t,
+                     uint32_t j) {
+  uint32_t wrap_mask = cx->num_time - 1; // assume num_time is power of 2
+  float *const buf = cx->buf + j * cx->num_time;
+  uint32_t th = t + cx->num_time;
+#pragma omp simd
+  for (uint32_t l = c->indptr[j]; l < c->indptr[j + 1]; l++) {
+    uint32_t i = c->indices[l];
+    float w = c->weights[l];
+    uint32_t d = c->idelays[l];
+    uint32_t p1 = (th - d) & wrap_mask;
+    uint32_t p2 = (th - d + 1) & wrap_mask;
+    cx->cx1[i] += w * buf[p1];
+    cx->cx2[i] += w * buf[p2];
+  }
+}
+
+void tvbk_cx_j(const tvbk_cx *cx, const tvbk_conn *c, uint32_t t) {
+#pragma omp simd
+  for (int i = 0; i < c->num_node; i++)
+    cx->cx1[i] = cx->cx2[i] = 0.0f;
+  for (int j = 0; j < c->num_node; j++)
+    cx_all_j(cx, c, t, j);
+}
+
+void tvbk_cx_i(const tvbk_cx *cx, const tvbk_conn *c, uint32_t t) {
+  uint32_t wrap_mask = cx->num_time - 1; // assume num_time is power of 2
+  uint32_t th = t + cx->num_time;
+#pragma omp simd
+  for (int i = 0; i < c->num_node; i++) {
+    float cx1 = 0.f, cx2 = 0.f;
+    for (uint32_t l = c->indptr[i]; l < c->indptr[i + 1]; l++) {
+      uint32_t j = c->indices[l];
+      float w = c->weights[l];
+      uint32_t d = c->idelays[l];
+      uint32_t p1 = (th - d) & wrap_mask;
+      uint32_t p2 = (th - d + 1) & wrap_mask;
+      cx1 += w * cx->buf[j * cx->num_time + p1];
+      cx2 += w * cx->buf[j * cx->num_time + p2];
+    }
+    cx->cx1[i] = cx1;
+    cx->cx2[i] = cx2;
+  }
+}
+
+void tvbk_cx_nop(const tvbk_cx *cx, const tvbk_conn *c, uint32_t t) {
+  (void)c;
+  (void)t;
+
+  return;
+}
diff --git a/tvb_kernels/test_conn.py b/tvb_kernels/test_conn.py
new file mode 100644
index 0000000000..353705b0aa
--- /dev/null
+++ b/tvb_kernels/test_conn.py
@@ -0,0 +1,62 @@
+import numpy as np
+import scipy.sparse
+import enum
+
+import tvb_kernels
+
+
+def rand_weights(seed=43, sparsity=0.4, dt=0.1, horizon=256, num_node=90):
+    np.random.seed(seed)
+    horizonm1 = horizon - 1
+    assert ((horizon) & (horizonm1)) == 0
+    weights, lengths = np.random.rand(2, num_node, num_node).astype('f')
+    lengths[:] *= 0.8
+    lengths *= (horizon*dt*0.8)
+    zero_mask = weights < (1-sparsity)
+    weights[zero_mask] = 0
+    return weights, lengths
+
+
+def base_setup(mode: tvb_kernels.CxMode = tvb_kernels.CxMode.CX_J):
+    cv = 1.0
+    dt = 0.1
+    num_node = 90
+    horizon = 256
+    weights, lengths = rand_weights(num_node=num_node, horizon=horizon, dt=dt)
+    cx = tvb_kernels.Cx(num_node, horizon)
+    conn = tvb_kernels.Conn(cx, dt, cv, weights, lengths, mode=mode)
+
+    # then we can test
+    cx.buf[:] = np.r_[:1.0:1j*num_node *
+                      horizon].reshape(num_node, horizon).astype('f')*4.0
+
+    # impl simple numpy version
+    def make_cfun_np():
+        zero_mask = weights == 0
+        csr_weights = scipy.sparse.csr_matrix(weights)
+        idelays = (lengths[~zero_mask]/cv/dt).astype('i')+2
+        idelays2 = -horizon + np.c_[idelays, idelays-1].T
+        assert idelays2.shape == (2, csr_weights.nnz)
+
+        def cfun_np(t):
+            _ = cx.buf[csr_weights.indices, (t-idelays2) % horizon]
+            _ *= csr_weights.data
+            _ = np.add.reduceat(_, csr_weights.indptr[:-1], axis=1)
+            return _  # (2, num_node)
+        return cfun_np
+
+    return conn, cx, make_cfun_np()
+
+
+def test_conn_kernels():
+    connj, cxj, cfun_np = base_setup(tvb_kernels.CxMode.CX_J)
+    conni, cxi, _ = base_setup(tvb_kernels.CxMode.CX_I)
+
+    for t in range(1024):
+        cx = cfun_np(t)
+        connj(t)
+        conni(t)
+        np.testing.assert_allclose(cx[0], cxj.cx1, 1e-4, 1e-6)
+        np.testing.assert_allclose(cx[1], cxj.cx2, 1e-4, 1e-6)
+        np.testing.assert_allclose(cx[0], cxi.cx1, 1e-4, 1e-6)
+        np.testing.assert_allclose(cx[1], cxi.cx2, 1e-4, 1e-6)
diff --git a/tvb_kernels/tvb_kernels/__init__.py b/tvb_kernels/tvb_kernels/__init__.py
new file mode 100644
index 0000000000..77264fa21b
--- /dev/null
+++ b/tvb_kernels/tvb_kernels/__init__.py
@@ -0,0 +1,84 @@
+import enum
+import numpy as np
+import ctypes
+
+# TODO mv functionality used to C
+import scipy.sparse
+
+from . import _ctg_tvbk
+
+
+def _to_ct(a):
+    val_type = {
+        'float32': ctypes.c_float,
+        'uint32': ctypes.c_uint32,
+        'int32': ctypes.c_int32
+    }[a.dtype.name]
+    return a.ctypes.data_as(ctypes.POINTER(val_type))
+
+
+class CxMode(enum.Enum):
+    CX_J = 1
+    CX_I = 2
+    CX_NOP = 3
+
+
+class Cx:
+    def __init__(self, num_node, num_time):
+        if num_time & (num_time - 1) != 0:
+            raise ValueError('num_time not power of two')
+        self.cx1, self.cx2 = np.zeros((2, num_node), 'f')
+        self.buf = np.zeros((num_node, num_time), 'f')
+        self._cx = _ctg_tvbk.tvbk_cx(
+            cx1=_to_ct(self.cx1),
+            cx2=_to_ct(self.cx2),
+            buf=_to_ct(self.buf),
+            num_node=num_node,
+            num_time=num_time 
+        )
+
+
+class Conn:
+
+    def __init__(self, cx, dt, cv, weights, lengths, mode: CxMode = CxMode.CX_J):
+        self.mode = mode
+        assert weights.shape[0] == weights.shape[1]
+        assert weights.shape == lengths.shape
+        self.num_node = weights.shape[0]
+        zero_mask = weights == 0
+        if mode == CxMode.CX_J:
+            # cx_j requires csc format, cx_i requires csr
+            spw = scipy.sparse.csc_matrix(weights)
+            # since numpy is row major, we need to transpose zero mask then ravel
+            nnz_ravel = np.argwhere(~zero_mask.T.copy().ravel())[:, 0]
+            # then we also need to extract them in the transposed order
+            idelays = (lengths.T.copy().ravel()[
+                       nnz_ravel] / cv / dt).astype('i') + 2
+        elif mode == CxMode.CX_I:
+            spw = scipy.sparse.csr_matrix(weights)
+            nnz_ravel = np.argwhere(~zero_mask.ravel())[:, 0]
+            idelays = (lengths.ravel()[nnz_ravel] / cv / dt).astype('i') + 2
+        else:
+            raise NotImplementedError
+
+        # retain references to arrays so they aren't collected
+        self.idelays = idelays.astype(np.uint32)
+        self.weights = spw.data.astype(np.float32)
+        self.indices = spw.indices.astype(np.uint32)
+        self.indptr = spw.indptr.astype(np.uint32)
+        self.cx = cx
+        self._conn = _ctg_tvbk.tvbk_conn(
+            num_node=weights.shape[0],
+            num_nonzero=self.indices.size,
+            num_cvar=1,
+            weights=_to_ct(self.weights),
+            indices=_to_ct(self.indices),
+            indptr=_to_ct(self.indptr),
+            idelays=_to_ct(self.idelays),
+        )
+
+    def __call__(self, t):
+        if self.mode == CxMode.CX_J:
+            _ctg_tvbk.tvbk_cx_j(self.cx._cx, self._conn, t)
+        if self.mode == CxMode.CX_I:
+            _ctg_tvbk.tvbk_cx_i(self.cx._cx, self._conn, t)