diff --git a/scf/halide/.gitignore b/scf/halide/.gitignore index cba65b3..f4553ce 100644 --- a/scf/halide/.gitignore +++ b/scf/halide/.gitignore @@ -1,13 +1,7 @@ callgrind.out.* -*.o scf.x *_gen -*.a -twoel.a -twoel.h -twoel.schedule.h -*.halide_generated.cpp -*.registration.cpp +twoel.* *.stmt *.trace *.png diff --git a/scf/halide/Makefile b/scf/halide/Makefile index d8eb3fe..41ee222 100644 --- a/scf/halide/Makefile +++ b/scf/halide/Makefile @@ -1,27 +1,25 @@ ## Makefile for SCF-C -CC = clang++ -CXX = clang++ -HALIDE_INCDIR=../../halide/build/include -HALIDE_LIBDIR=../../halide/build/lib -CFLAGS = -O3 -mtune=native -march=native -g -Wall -Wno-deprecated -I$(HALIDE_INCDIR) +CC = clang++-10 +CXX = clang++-10 +CFLAGS = -O3 -mtune=native -march=native -g -Wall -Wno-deprecated CXXFLAGS = $(CFLAGS) -CLINK = -L$(HALIDE_LIBDIR) -ldl -lpthread -CXXLINK = -L$(HALIDE_LIBDIR) -lHalide +CLINK = -ldl -lpthread +CXXLINK = -lHalide TARGET=host-x86-64 #TARGET=$(TARGET)-profile MACHINE_PARAMS=32,16777216,40 # to use manually defined scheduler in twoel_gen.cpp, leave AUTOSCHEDULER commented. 
-AUTOSCHEDULER=Mullapudi2016 +#AUTOSCHEDULER=Mullapudi2016 #AUTOSCHEDULER=Li2018 #AUTOSCHEDULER=Adams2019 ifneq ($(AUTOSCHEDULER),) -SCHEDLIB=../../halide/build/lib/libauto_schedule.so +SCHEDLIB=/usr/local/lib/x86_64-linux-gnu/libauto_schedule.so ifeq ($(AUTOSCHEDULER),Li2018) -SCHEDLIB=../../halide/build/lib/libgradient_autoscheduler.so +SCHEDLIB=/usr/local/lib/x86_64-linux-gnu/libgradient_autoscheduler.so endif endif @@ -42,21 +40,15 @@ trace scf.trace: scf.x be.inpt HL_TRACE_FILE=scf.trace ./scf.x pictures: scf.trace - ../../halide/build/bin/HalideTraceDump -i scf.trace -t png + HalideTraceDump -i scf.trace -t png -VIDEO_WIDTH=500 -VIDEO_HEIGHT=400 +VIDEO_WIDTH=1920 +VIDEO_HEIGHT=1080 video: scf.trace rm -f scf.mp4 - cat scf.trace | ../../halide/build/bin/HalideTraceViz \ - --size $(VIDEO_WIDTH) $(VIDEO_HEIGHT) --zoom 4 --timestep 8 --hold 100 --decay 5 5 --gray \ - --move 32 48 --func g_fock_out --move 32 40 --label g_fock_out "g_fock out" 1 --move 240 48 --func g_dens --move 240 40 --label g_dens g_dens 1 \ - --move 32 248 --func x --move 32 240 --label x x 1 --move 240 248 --func expnt --move 240 240 --label expnt expnt 1 \ - --move 32 292 --func y --move 32 284 --label y y 1 --move 240 292 --func rnorm --move 240 284 --label rnorm rnorm 1 \ - --move 32 340 --func z --move 32 332 --label z z 1 \ - --zoom 2 \ - --move 32 200 --func fm --move 32 192 --label fm fm 1 \ - | ffmpeg -f rawvideo -pix_fmt bgr32 -s $(VIDEO_WIDTH)x$(VIDEO_HEIGHT) -i /dev/stdin -c:v h264 scf.mp4 + cat scf.trace | HalideTraceViz \ + --size $(VIDEO_WIDTH) $(VIDEO_HEIGHT) --zoom 4 --timestep 32 --hold 100 --decay 5 5 --gray --auto_layout \ + | ffmpeg -loglevel warning -f rawvideo -pix_fmt bgr32 -s $(VIDEO_WIDTH)x$(VIDEO_HEIGHT) -i /dev/stdin -c:v h264 scf.mp4 vg: scf.x valgrind ./scf.x @@ -64,15 +56,8 @@ vg: scf.x cg: scf.x valgrind --tool=callgrind ./scf.x -twoel_gen: twoel_gen.cpp - $(CXX) $(CXXFLAGS) -o $@ ../../halide/tools/GenGen.cpp $^ $(CXXLINK) - -twoel.a: twoel_gen Makefile -ifeq 
($(AUTOSCHEDULER),) - LD_LIBRARY_PATH=$(HALIDE_LIBDIR) ./twoel_gen -g twoel -o . -e static_library,h,schedule,cpp,stmt,registration target=$(TARGET) machine_params=$(MACHINE_PARAMS) -else - LD_LIBRARY_PATH=$(HALIDE_LIBDIR) ./twoel_gen -g twoel -o . -e static_library,h,schedule,cpp,stmt,registration -p $(SCHEDLIB) -s $(AUTOSCHEDULER) target=$(TARGET) auto_schedule=true machine_params=$(MACHINE_PARAMS) -endif +twoel.a: tools/twoel_gen.py tools/decompose.py + time python3 tools/twoel_gen.py %.o: %.c $(CC) $(CFLAGS) -c -o $@ $< @@ -80,4 +65,5 @@ endif scf.o: twoel.a clean: - rm -f *~ *.o $(MTARGET) *.a *_gen *.stmt *.halide_generated.cpp twoel.h twoel.schedule.h twoel.registration.cpp *.trace *.png *.mp4 + rm -rf *~ *.o $(MTARGET) twoel.* *.trace *.png *.mp4 tools/__pycache__ + make -C test clean diff --git a/scf/halide/be.inpt b/scf/halide/be.inpt index 46a6c83..9a6bf26 100644 --- a/scf/halide/be.inpt +++ b/scf/halide/be.inpt @@ -1,4 +1,4 @@ -30 +8 4 0.000 0.000 0.000 4 4.000 0.000 0.000 @@ -8,25 +8,3 @@ 4 20.000 0.000 0.000 4 24.000 0.000 0.000 4 28.000 0.000 0.000 -4 32.000 0.000 0.000 -4 36.000 0.000 0.000 -4 40.000 0.000 0.000 -4 44.000 0.000 0.000 -4 48.000 0.000 0.000 -4 52.000 0.000 0.000 -4 56.000 0.000 0.000 -4 60.000 0.000 0.000 -4 64.000 0.000 0.000 -4 68.000 0.000 0.000 -4 72.000 0.000 0.000 -4 76.000 0.000 0.000 -4 80.000 0.000 0.000 -4 84.000 0.000 0.000 -4 88.000 0.000 0.000 -4 92.000 0.000 0.000 -4 96.000 0.000 0.000 -4 100.000 0.000 0.000 -4 104.000 0.000 0.000 -4 108.000 0.000 0.000 -4 112.000 0.000 0.000 -4 116.000 0.000 0.000 diff --git a/scf/halide/scf.c b/scf/halide/scf.c index 58f22b0..f5ff0d7 100644 --- a/scf/halide/scf.c +++ b/scf/halide/scf.c @@ -84,13 +84,14 @@ int main(int argc, char **argv) { // compute the two particle contributions to the fock matrix and get the total energy; { - Halide::Runtime::Buffer etwo_buffer = Halide::Runtime::Buffer::make_scalar(); + Halide::Runtime::Buffer etwo_buffer(1); extern double rdelta, delta, delo2; // 
integ.c extern Halide::Runtime::Buffer fm; // integ.c int error = twoel(delo2, delta, rdelta, expnt_buf, rnorm_buf, x_buf, y_buf, z_buf, fm_buf, *g_fock_buf, g_dens_buf, etwo_buffer, *g_fock_out_buf); assert(!error); swap_g_fock(); #ifdef TRACING + printf("twoel took %f seconds\n", timer()); exit(0); #endif /* TRACING */ etwo = etwo_buffer(0); diff --git a/scf/halide/test/.gitignore b/scf/halide/test/.gitignore new file mode 100644 index 0000000..d86fed6 --- /dev/null +++ b/scf/halide/test/.gitignore @@ -0,0 +1,7 @@ +/twoel_g.* +/twoel.* +faketwoel.trace +/fake + +perf.data +/__pycache__ diff --git a/scf/halide/test/Makefile b/scf/halide/test/Makefile new file mode 100644 index 0000000..638b83c --- /dev/null +++ b/scf/halide/test/Makefile @@ -0,0 +1,43 @@ +## Makefile for SCF-C + +CXX=clang++-10 +CXXFLAGS=-O2 -g -Wall -fdiagnostics-color=always +CXXLINK=-ldl -lpthread + +THREADS=1 +TILE_SIZE=15 +VECTOR_SIZE=4 +DATA_SIZE=120 +PERF_FUNCS=4D_ij_low_kl_low_pairs_low +PERF_FUNCS=all + +TRACE_DATA_SIZE=15 +TRACE_VIDEO_WIDTH=1920 +TRACE_VIDEO_HEIGHT=1080 +TRACE_FUNCS=all + +run: fake + HL_NUM_THREADS=$(THREADS) ./fake $(DATA_SIZE) + +perf: fake + HL_NUM_THREADS=$(THREADS) perf record ./fake $(DATA_SIZE) + perf report + +twoel.a twoel.h: fakegen.py ../tools/twoel_gen.py ../tools/decompose.py Makefile + python3 fakegen.py $(PERF_FUNCS) tilesize=$(TILE_SIZE) vectorsize=$(VECTOR_SIZE) + +fake: fake.cpp twoel.a twoel.h + $(CXX) $(CXXFLAGS) fake.cpp twoel.a -o $@ $(CXXLINK) + +twoel.trace: faketwoel.py ../tools/twoel_gen.py ../tools/decompose.py Makefile + rm -f $@ + HL_NUM_THREADS=$(THREADS) HL_TRACE_FILE=$@ python3 faketwoel.py all datasize=$(TRACE_DATA_SIZE) tilesize=$(TILE_SIZE) vectorsize=$(VECTOR_SIZE) itercount=0 tracing=True + +video: twoel.trace + rm -f twoel.mp4 + time cat twoel.trace | HalideTraceViz \ + --size $(TRACE_VIDEO_WIDTH) $(TRACE_VIDEO_HEIGHT) --zoom 4 --timestep 15 --hold 100 --decay 5 5 --gray --auto_layout \ + | ffmpeg -loglevel warning -f rawvideo 
-pix_fmt bgr32 -s $(TRACE_VIDEO_WIDTH)x$(TRACE_VIDEO_HEIGHT) -i /dev/stdin -c:v h264 twoel.mp4 + +clean: + rm -rf *.trace twoel.* twoel_g.* fake perf.* __pycache__ diff --git a/scf/halide/test/fake.cpp b/scf/halide/test/fake.cpp new file mode 100644 index 0000000..1db50f2 --- /dev/null +++ b/scf/halide/test/fake.cpp @@ -0,0 +1,122 @@ +/* Standalone program that calls twoel() the same way SCF does, and measures how long it takes. */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "twoel.h" + +using namespace Halide::Runtime; + +int N; + +double gen0d() { + return drand48(); +} + +Buffer gen1d(int I=0) { + if(I == 0) + I = N; + Buffer rv(I); + for(int i = 0; i < I; i++) + rv(i) = drand48(); + return rv; +} + +Buffer gen2d(int I=0, int J=0) { + if(I == 0) + I = N; + if(J == 0) + J = N; + Buffer rv(I, J); + for(int i = 0; i < I; i++) + for(int j = 0; j < J; j++) + rv(i, j) = drand48(); + return rv; +} + +double timestamp() { + double rv; + struct timeval tv; + gettimeofday(&tv, NULL); + rv = tv.tv_usec; + rv /= 1000000; + rv += tv.tv_sec; + return rv; +} + +long cputickstamp() { + struct tms tms; + times(&tms); + return tms.tms_utime; +} + +int main(int argc, char **argv) { + if(argc < 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + N = strtol(argv[1], NULL, 0); + srand(2); + srand48(rand()); + + double delo2 = gen0d(); + double delta = gen0d(); + double rdelta = gen0d(); + Buffer expnt = gen1d(); + Buffer rnorm = gen1d(); + Buffer x = gen1d(); + Buffer y = gen1d(); + Buffer z = gen1d(); + Buffer fm = gen2d(1002, 5); + Buffer g_fock_in = gen2d(); + Buffer g_dens = gen2d(); + Buffer g_fock_out = gen2d(); + Buffer rv = gen1d(); + + // dry run + int error = twoel(delo2, delta, rdelta, expnt, rnorm, x, y, z, fm, g_fock_in, g_dens, rv, g_fock_out); + if(error) { + fprintf(stderr, "twoel failed with code %d\n", error); + return 1; + } + + // benchmark it + std::vector throughputs = {}; + 
for(int trial = 0; trial < 4; trial++) { + double start_walltime = timestamp(); + clock_t start_cputicks = cputickstamp(); + int itercount; + for(itercount = 0; timestamp() - start_walltime < 5.0; itercount++) { + twoel(delo2, delta, rdelta, expnt, rnorm, x, y, z, fm, g_fock_in, g_dens, rv, g_fock_out); + } + clock_t cputicks = cputickstamp() - start_cputicks; + double walltime = timestamp() - start_walltime; + double cputime = (double)cputicks / sysconf(_SC_CLK_TCK); + double per_walltime = walltime / itercount; + double per_cputime = cputime / itercount; + double throughput = (double)1.0 * N * N * N * N / per_walltime; + printf("%d iterations in %.3f seconds, %.3f seconds of cpu time, %.3e seconds per iter, %.3e cpu seconds per iter, %.3e effective iters per second\n", itercount, walltime, cputime, per_walltime, per_cputime, throughput); + throughputs.push_back(throughput); + } + // sort and stringify the throughput values + std::sort(throughputs.begin(), throughputs.end()); + std::ostringstream stringify; + for(int i = 0; i < throughputs.size(); i++) { + if(i) + stringify << ", "; + stringify << throughputs[i]; + } + printf("throughputs: {%s}\n", stringify.str().c_str()); + + return 0; +} diff --git a/scf/halide/test/fakegen.py b/scf/halide/test/fakegen.py new file mode 100644 index 0000000..001f4a0 --- /dev/null +++ b/scf/halide/test/fakegen.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 + +'''generate a twoel.a and twoel.h with only the specified zones, and with any scheduling params (thread count, vector/block sizes) passed through''' + +import sys +sys.path.append('../tools') +import halide as hl +import twoel_gen + +def gen_twoel(zone_name, **kwargs): + + # get JIT pipeline + zone_names = zone_name.split(",") + myzones = [] + for zone in zones.loops: + if zone_name == 'all' or zone.name in zone_names: + myzones.append(zone) + if len(myzones) == 0: + if zone_name == 'list': + print([z['name'] for z in zones.loops]) + else: + print("no zone %s found"%zone_name) 
+ exit(1) + if "target_name" in kwargs: + target_name = kwargs["target_name"] + del kwargs["target_name"] + else: + target_name = "x86-64-linux-avx-avx2-f16c-fma-sse41-profile-disable_llvm_loop_opt" + zones.loops = myzones + gen = twoel_gen.Generate_twoel(loopnests=zones, **kwargs) + gen.generate_twoel() + p = gen.pipeline + print("generating for target", target_name) + target = hl.Target(target_name) + p.compile_to( + { + hl.Output.c_header: "twoel.h", + hl.Output.c_source: "twoel.cpp", + hl.Output.static_library: "twoel.a", + hl.Output.stmt: "twoel.stmt", + hl.Output.stmt_html: "twoel.html", + # the following outputs are useful for running it from python + #hl.Output.object: "twoel.o", + #hl.Output.python_extension: "twoel.py.cpp", + }, list(gen.inputs.values()), "twoel", target + ) + +if __name__ == "__main__": + if len(sys.argv) == 1: + print("Usage: %s "%sys.argv[0]) + exit(1) + + zone_name = sys.argv[1] + + kwargs = {} + for param in sys.argv[2:]: + k, v = param.split("=") + try: + v = int(v) + except: + try: + v = bool(v) + except: + pass + kwargs[k] = v + + zones = twoel_gen.define_original_twoel_zone().split_recursive() + + gen_twoel(zone_name, **kwargs) diff --git a/scf/halide/test/faketwoel.py b/scf/halide/test/faketwoel.py new file mode 100644 index 0000000..68a94f7 --- /dev/null +++ b/scf/halide/test/faketwoel.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 + +'''JIT-compile twoel with the specified zones and benchmark it, for quick feedback on scheduling changes''' + +import sys +import time +sys.path.append('../tools') +import numpy as np +import halide as hl +import twoel_gen + +def call_twoel(zone_name, seed=2, datasize=15, itercount=10, target_name="host-disable_llvm_loop_opt", **kwargs): + N = datasize + seed = 2 + + inputs = [ + { "name": "delo2" , "d": 0, "value": 0.001 }, + { "name": "delta" , "d": 0, "value": 0.001 }, + { "name": "rdelta" , "d": 0, "value": 0.001 }, + { "name": "expnt" , "d": 1, "value": 0.00001 }, + { "name": "rnorm" , "d": 1 
}, + { "name": "x" , "d": 1 }, + { "name": "y" , "d": 1 }, + { "name": "z" , "d": 1 }, + { "name": "fm" , "d": 2, "shape": [1002, 5] }, + { "name": "g_fock" , "d": 2 }, + { "name": "g_dens" , "d": 2 }, + { "name": "g_trace", "d": 4, "value": 0.0 }, + ] + + outputs = [ + { "name": "rv" , "d": 1, "shape": [1] }, + { "name": "g_fock", "d": 2 }, + ] + + inputs = { x["name"]: x for x in inputs } + outputs = { x["name"]: x for x in outputs } + + # generate input data + print("input/output size is", N, "^2") + buffers = [] + buffers_by_name = {} + np.random.seed(seed) + for key in inputs: + param = inputs[key] + if param['d'] == 0: + thing = 0.2 + else: + shape = [N]*param['d'] + if 'shape' in param: + shape = param['shape'] + thing = hl.Buffer(hl.Float(64), shape, name=key) + if 'value' in param: + if param['value'] != 0.0: + for pos in np.ndindex(*shape): + thing[pos] = param['value'] + else: + values = np.random.rand(*shape) - 0.5 + for pos in np.ndindex(*shape): + thing[pos] = values[pos] + buffers.append(thing) + buffers_by_name[key] = thing + + # get JIT pipeline + zones = twoel_gen.define_original_twoel_zone().split_recursive() + zone_names = zone_name.split(",") + myzones = [] + for zone in zones.loops: + if zone_name == 'all' or zone['name'] in zone_names: + myzones.append(zone) + if len(myzones) == 0: + if zone_name == 'list': + print([z.name for z in zones]) + else: + print("no zone %s found"%zone_name) + exit(1) + zones.loops = myzones + gen = twoel_gen.Generate_twoel(loopnests=zones, **kwargs) + gen.generate_twoel() + p = gen.pipeline + target = hl.Target(target_name) + zone_names = [ z.name for z in myzones ] + print("compiling zones", zone_names, "for target", target) + p.compile_jit(target) + # plug in the parameter values + for param in gen.inputs.values(): + name = param.name() + if name in buffers_by_name: + thing = buffers_by_name[name] + elif name.endswith("_in") and name[:-3] in buffers_by_name: + name = name[:-3] + thing = buffers_by_name[name] + 
else: + raise KeyError(name) + param.set(thing) + + # dry-run + p.realize(N, N) + print(itercount, "timed runs") + + if itercount == 0: + # when generating trace output, just doing the dry-run is enough. + return 0.0, 0.0 + + # benchmark it + walltime = 0.0 + cputime = 0.0 + for _ in range(itercount): + cpu_before = time.process_time() + wall_before = time.time() + + rv, g_fock_out = p.realize(N, N) + + cpu_after = time.process_time() + wall_after = time.time() + + walltime += wall_after - wall_before + cputime += cpu_after - cpu_before + print("walltime: %.3f"%walltime) + print("cputime: %.3f"%cputime) + walltime /= itercount + cputime /= itercount + print("walltime per iter: %.3e"%walltime) + print("cputime per iter: %.3e"%cputime) + throughput = N * N * N * N / walltime + print("throughput: %.3e g() calls per second (roughly)"%throughput) + rv = rv[0] + g_fock_out = np.array(g_fock_out) + return walltime, cputime + + +if __name__ == "__main__": + + zone_name = sys.argv[1] + + kwargs = {} + for param in sys.argv[2:]: + k,v = param.split("=") + try: + v = int(v) + except: + try: + v = bool(v) + except: + pass + kwargs[k] = v + call_twoel(zone_name, **kwargs) diff --git a/scf/halide/tools/.gitignore b/scf/halide/tools/.gitignore new file mode 100644 index 0000000..1b11c61 --- /dev/null +++ b/scf/halide/tools/.gitignore @@ -0,0 +1,2 @@ +/__pycache__ +/media diff --git a/scf/halide/tools/Decompose.webp b/scf/halide/tools/Decompose.webp new file mode 100644 index 0000000..35985c5 Binary files /dev/null and b/scf/halide/tools/Decompose.webp differ diff --git a/scf/halide/tools/README.md b/scf/halide/tools/README.md new file mode 100644 index 0000000..56f1eb6 --- /dev/null +++ b/scf/halide/tools/README.md @@ -0,0 +1,189 @@ +# Twoel generator for Halide + +This is a Halide generator for the twoel() function in SCF. + +# Goals + +The main focus of this code is symmetry decomposition. The generator uses +symmetry to minimize the number of times the g() function is called. 
+ +# Overall workflow + +The main loop of twoel is encoded as a data structure, by the define_original_twoel() function. This main-loop is then decomposed by the code in decompose.py into 6 sets of loops, covering a much smaller iteration space than the original. 1 of these loops is still 4-dimensional, but it covers ⅛ of the original iteration space. The other 5 loops are 3-dimensional or less. + +Once these sets of loops are calculated, a Halide algorithm is produced, scheduled, and compiled. + +# Details + +## twoel + +The execution time of the SCF application is dominated by the `twoel()` function. + +In pseudocode, the core loop of twoel looks like this: + +```python +for i in 0..N: + for j in 0..N: + for k in 0..N: + for l in 0..N: + fock[i,j] += g(i,j,k,l) * dens[k,l] + fock[i,k] += g(i,j,k,l) * dens[j,l] / -2 +``` + +`N` is the problem size; it is 15 times the number of atoms being simulated. `dens[]` is an input data structure. `g()` is a function which is expensive to calculate. + +`twoel()` is called by a convergence loop. The initial contents of `fock[]` are the values produced by the previous round. + +The implementation of `g()` is not listed here, but it is also part of the twoel algorithm. + +## Loop nests + +The core data structure is a LoopNest. This represents a set of nested loops, the iteration space it covers, and the set of updates that occur in the innermost loop. + +The original twoel() LoopNest looks like this: + +```python +g = Function("g", ["i","j","k","l"]) +zone = LoopNest( + name="twoel", + symmetries=[ (g, "i", "j"), (g, "k", "l"), (g, ["i","j"], ["k","l"]) ], + updates=[ + Update(Function("fock", ["i","j"]), [g, Function("dens", ["k","l"])]), + Update(Function("fock", ["i","k"]), [g, Function("dens", ["j","l"])], coeff=-0.5), + ] +) +``` + + +This loopnest covers a 4-dimensional iteration space. It takes g() and d() as inputs, and writes output to f(). 
+ +It has no conditions, so it covers the entire 4-dimensional rectangular prism. It has 3 symmetries defined, which can be decomposed to reduce overall work. There are 2 updates in the inner-most loop, which call the same g(), multiplied by different positions in d() with different coefficients, with the result added to different positions in f(). + +A decomposed LoopNest might look like this: + +```python +g = Function("g", [i,i,k,k]) +LoopNest( + name='twoel_i_eq_j_k_eq_l_ij_lt_kl', + conditions=[ + Condition([Index('i')], [Index('j')], '=='), + Condition([Index('k')], [Index('l')], '=='), + Condition([Index('i')], [Index('k')], '<') + ], + updates=[ + Update(Function('fock', [Index('i'), Index('i')]), [g, Function('dens', ['k', 'k'])]), + Update(Function('fock', [Index('i'), Index('k')]), [g, Function('dens', ['i', 'k'])], coeff=-0.5), + Update(Function('fock', [Index('k'), Index('i')]), [g, Function('dens', ['k', 'i'])], coeff=-0.5), + Update(Function('fock', [Index('k'), Index('k')]), [g, Function('dens', ['i', 'i'])]) + ] +) +``` + +This decomposed loop-nest is 2-dimensional, with the condition `i < k`, so its iteration space is the lower triangle of a matrix. + +## Decomposition + +Symmetry decomposition is done in `decompose.py`. + +Symmetry decomposition is accomplished by the following steps: + +1. Identify a symmetry. Example: g has the symmetry i == j, meaning `g(i,j)` is equal to `g(j,i)`. +1. Split the loop into 3 pieces. call the pieces "upper", "equal", and "lower". The loops are identical, except each of them has an added condition that constricts its iteration space. The conditions are: `i < j` (upper), `i == j` (equal), `i > j` (lower). +1. Swap indexes `i` and `j` in the third loop, so that the condition is `i < j`. +1. Move the updates from "lower" into "upper". +1. Discard "lower". +1. Consolidate updates and conditions. +1. The new LoopNests take advantage of the Symmetry to do the same work, while covering less iteration space. 
+ +Here is a visual representation of that: + +![Animation of symmetry decomposition](Decompose.webp) + +The Symmetry definitions act as a to-do list for the decomposition code. The code recursively decomposes along the Symmetry definitions until none remain. + +The `g()` function in twoel has 3 symmetries: + +```python +symmetries=[ (g, "i", "j"), (g, "k", "l"), (g, ["i","j"], ["k","l"]) ], +``` + +Or, represented positionally: + +```python +symmetries=[ + Symmetry(g, 0, 1), + Symmetry(g, 2, 3), + Symmetry(g, [0,1], [2,3]), +] +``` + +This means: `i = j`, `k = l`, and `(i,j) = (k,l)`. They are internally expressed in terms of index-positions. This representation is necessary to keep track of indexes through multiple decompositions without getting confused. + +With these 3 symmetries, the original loop can be decomposed into 8 separate LoopNests, much like a truth table over 3 boolean values: + +1. `i < j`, `k < l`, `(ij) < (kl)` +2. `i < j`, `k < l`, `(ij) = (kl)`, simplifies to 2D over `g(i,j,i,j)`, where `i < j` +3. `i < j`, `k = l`, `(ij) < (kl)`, simplifies to 3D over `g(i,j,k,k)`, where `i < j` and `(ij) < (kk)` +4. `i < j`, `k = l`, `(ij) = (kl)`, contradictory +5. `i = j`, `k < l`, `(ij) < (kl)`, simplifies to 3D over `g(i,i,k,l)`, where `k < l` and `(ii) < (kl)` +6. `i = j`, `k < l`, `(ij) = (kl)`, contradictory +7. `i = j`, `k = l`, `(ij) < (kl)`, simplifies to 2D over `g(i,i,k,k)`, where `i < k` +8. `i = j`, `k = l`, `(ij) = (kl)`, simplifies to 1D over `g(i,i,i,i)` + +Two of these are contradictory and are detected as such, so the final decomposed LoopNest count is 6. + +### Contradictory conditions + +Loops #4 and #6 are contradictory. 
+ +* #4: + * `(ij) = (kl)` implies `i = k` and `j = l` + * `i = k`, `k = l`, `j = l` implies `i = j` + * `i = j` and `i < j` cannot both be true +* #6: + * `(ij) = (kl)` implies `i = k` and `j = l` + * `i = k`, `i = j`, `j = l` implies `k = l` + * `k = l` and `k < l` cannot both be true + +Contradiction essentially means that the loop condition will never be true, and the LoopNest can be discarded entirely. These contradictions result from the fact that the original symmetries are not completely independent of each other, they overlap. + +The decomposition code detects these contradictions by performing symbolic execution with a small problem size, and comparing the results. If the new set of LoopNests do not produce the same result as the previous LoopNest, the decomposition was invalid and is discarded. + +This works well for twoel, because the output function is 2-dimensional and the number of updates is relatively small. This symbolic execution would have scalability problems in cases where the output functions have many indexes. It may be possible to identify overlapping symmetries through direct logical analysis, before attempting to split, which would be more scalable in that case. + +### Mergeable loops + +Loops #3 and #5 are mirror images of each other: + +* #3: `i < j`, `k = l`, `(ij) < (kl)`, simplifies to 3D over `g(i,j,k,k)`, where `i < j` and `(ij) < (kk)` +* #5: `i = j`, `k < l`, `(ij) < (kl)`, simplifies to 3D over `g(i,i,k,l)`, where `k < l` and `(ii) < (kl)` + +It is possible to merge these two together, into a single loop with conditions `i = j`, `k < l`, `(ij) != (kl)`. The logic to merge these is not implemented yet, as this is a lower dimensional loop. Merging these loops could save around 3% of the total execution time. + +## Generation + +The file `twoel_gen.py` contains a Halide implementation of SCF's twoel function. The symmetry decomposition, and generation of Halide funcs from the decomposed loops is done in `decompose.py`. 
+ +### algorithm + +Each LoopNest is generated as a separate Halide `Func`. Each Func has its own `RDom` to control the iteration space, and a set of updates. The updates are all put into the same Func update, using `mux()` and `unroll()`, as described [here](https://github.com/halide/Halide/issues/5017#issuecomment-641586195). This occurs in the `LoopNest.generate_halide()` method. + +Finally, the result of each LoopNest is summed together to form the output Func. This final summation occurs in the `DecomposedLoops.generate_halide()` method. + +### schedule + +For now, the schedule is baked into `twoel_gen.py` directly. This schedule is a work in progress. + +It is mostly focused on the 4D loopnest, as that is the bottleneck. A simpler schedule is applied to the other loopnests. + +The 4D schedule unrolls over `u`, which is the Update ID. It uses `rfactor()` to create internal buffering across the `i` and `j` indexes. It vectorizes over `i` and parallelizes over `j`. + +#### scheduling TODO + +Here are some things that can be improved, and ideas to try. + +* I have attempted to produce tiles of `g()` at a time, but have not yet succeeded in doing this. +* Somehow, rfactor() sometimes increases the total amount of computation, beyond simple gathering of results at the end. I don't understand why it does this. An example of this is [here](https://github.com/halide/Halide/issues/5017#issuecomment-667421538). +* The current schedule with 1 thread is able to get within an order of magnitude of the serial C code (with symmetry but not schwarz negligibility checks). However, increasing the thread count does not increase the performance. It is probably possible to go faster. +* The schedule should be less embedded in the generation, and more accessible to end users in general. It should be easy to try different scheduling directives, or generate a GPU version with a schedule suitable for GPUs. +* It is using too much memory for intermediate buffers. 
the input problem size is greater than 15 atoms (120 basis functions), it allocates more than 4GB and Halide aborts. It should be possible to compute a small hypercube of `g()` at a time, or one hypercube per thread, and fully consume it before moving on. However, I haven't found a good, performant way to do that yet. diff --git a/scf/halide/tools/decompose.py b/scf/halide/tools/decompose.py new file mode 100644 index 0000000..e20f2c3 --- /dev/null +++ b/scf/halide/tools/decompose.py @@ -0,0 +1,1175 @@ +#!/usr/bin/env python3 + +import logging +import halide as hl + +def die(s): + '''report a fatal error''' + logging.critical(s) + raise Exception(s) + +class Function: + '''a specific tensor access in the RHS update, like g(i,j,k,l)''' + def __init__(self, name, indexes): + if isinstance(indexes, set): + indexes = [*indexes] + if not isinstance(indexes, list): + die("function indexes should be a list (or set)") + for index in indexes: + if not isinstance(index, str): + die("function indexes should be a list of index names") + self.name = name + self.indexes = indexes + + def update_indexes(self, updates): + '''apply an index-permutation from a symmetry decomposition''' + for i in range(len(self.indexes)): + if self.indexes[i] in updates: + self.indexes[i] = updates[self.indexes[i]] + + def transpose(self, lhs, rhs): + '''swap the two sets of indexes''' + updates = {} + for l, r in zip(lhs, rhs): + updates[l.name] = r + updates[r.name] = l + self.update_indexes(updates) + + def copy(self): + return Function(self.name, [*self.indexes]) + + def __str__(self): + indexes = [ str(i) for i in self.indexes ] + indexes = ",".join(indexes) + return self.name + "(" + indexes + ")" + + def __repr__(self): + things = [self.name, self.indexes] + return "Function(" + ", ".join([repr(x) for x in things]) + ")" + + +class Symmetry: + '''a symmetric equality statement, like f(i,j) == f(j,i)''' + def __init__(self, func, lhs, rhs): + if not isinstance(func, Function): + die("symmetry 
function should be a Function") + if isinstance(lhs, int): + lhs = [lhs] + if isinstance(rhs, int): + rhs = [rhs] + if not isinstance(lhs, list) or not isinstance(rhs, list): + die("symmetry lhs and rhs must be lists") + seen_index_positions = set() + for index in lhs: + if not isinstance(index, int): + die("symmetry lhs must be a list of ints") + if index < 0 or index >= len(func.indexes): + die("symmetry lhs is out of bounds") + if index in seen_index_positions: + die("index " + index + " mentioned twice in lhs") + seen_index_positions.add(index) + for index in rhs: + if not isinstance(index, int): + die("symmetry rhs must be a list of ints") + if index < 0 or index >= len(func.indexes): + die("symmetry rhs is out of bounds") + if index in seen_index_positions: + die("index " + index + " in both lhs and rhs") + if len(lhs) != len(rhs): + die("symmetry lhs and rhs must be of same length") + self.func = func + self.lhs = lhs + self.rhs = rhs + self.totals = set(lhs + rhs) + + def apply(self, func): + '''swap indexes in the given func''' + func = func.copy() + for l, r in zip(self.lhs, self.rhs): + func.indexes[l], func.indexes[r] = func.indexes[r], func.indexes[l] + return func + + def copy(self): + return Symmetry(self.func.copy(), self.lhs.copy(), self.rhs.copy()) + + def __str__(self): + permuted = self.apply(self.func) + return "<" + str(self.func) + " = " + str(permuted) + ">" + + def __repr__(self): + things = [self.func, self.lhs, self.rhs] + return "Symmetry(" + ", ".join([repr(x) for x in things]) + ")" + + +class Condition: + '''a boolean expression that constricts the iteration space, like i < j''' + def __init__(self, lhs, rhs, op="!="): + if not isinstance(lhs, list): + lhs = [lhs] + if not isinstance(rhs, list): + rhs = [rhs] + for l in lhs: + if not isinstance(l, str): + die("lhs must contain index names") + for r in rhs: + if not isinstance(r, str): + die("rhs must contain index names") + if len(lhs) != len(rhs): + die("condition lhs and rhs must be 
equally sized") + if op not in [ '==', '!=', '<', '<=', '>', '>=' ]: + die("simple condition op is not recognized") + self.lhs = lhs + self.rhs = rhs + self.op = op + + def canonicalize(self): + '''organize the condition in a standard form''' + reverses = { + ">": "<", + ">=": "<=", + } + associatives = set(["==", "!="]) + if self.op in reverses: + self.op, self.lhs, self.rhs = reverses[self.op], self.rhs, self.lhs + if self.op in associatives: + lhs_name = "".join(self.lhs) + rhs_name = "".join(self.rhs) + if lhs_name > rhs_name: + self.lhs, self.rhs = self.rhs, self.lhs + + def simplify(self): + '''reduce complexity, possibly resulting in multiple (simpler) Conditions''' + rv = [] + if self.op == "==": + # get rid of tautologies (i==i) + new_lhs = [] + new_rhs = [] + for l, r in zip(self.lhs, self.rhs): + if str(l) != str(r): + new_lhs.append(l) + new_rhs.append(r) + if len(new_lhs) > 0: + self.lhs = new_lhs + self.rhs = new_rhs + # sort the operands: j==i becomes i==j + if self.lhs[0] > self.rhs[0]: + self.rhs, self.lhs = self.lhs, self.rhs + rv.append(self) + elif self.op == "!=": + # sort the operands: j!=i becomes i!=j + if self.lhs[0].name > self.rhs[0].name: + self.rhs, self.lhs = self.lhs, self.rhs + rv.append(self) + else: + rv.append(self) + return rv + + def update_indexes(self, updates): + '''apply an index-permutation from a symmetry decomposition''' + new_lhs = [] + new_rhs = [] + for l, r in zip(self.lhs, self.rhs): + if l in updates: + l = updates[l] + if r in updates: + r = updates[r] + new_lhs.append(l) + new_rhs.append(r) + self.lhs = new_lhs + self.rhs = new_rhs + + def transpose(self, lhs, rhs): + '''swap the two sets of indexes''' + self.update_indexes({ lhs.name: rhs, rhs.name: lhs }) + + def generate(self, indexes, N=None): + '''apply condition logic to produce a boolean expression''' + lhs = indexes[self.lhs[0]] + rhs = indexes[self.rhs[0]] + if len(self.lhs) > 1: + if N is None: + die("can't generate tuple condition without a size 
expression") + for l in self.lhs[1:]: + lhs *= N + lhs += indexes[l] + for r in self.rhs[1:]: + rhs *= N + rhs += indexes[r] + if self.op == "<": + return lhs < rhs + elif self.op == "<=": + return lhs <= rhs + elif self.op == ">": + return lhs > rhs + elif self.op == ">=": + return lhs >= rhs + elif self.op == "==": + return lhs == rhs + elif self.op == "!=": + return lhs != rhs + + def symbol_name(self): + '''generate a string for this condition which can become part of a C symbol name''' + # stringify in a way that is valid as a C label + # assumes the index names are valid + text = { + "<": "lt", + ">": "gt", + "<=": "le", + ">=": "ge", + "==": "eq", + "!=": "ne", + } + lhs = "".join(self.lhs) + rhs = "".join(self.rhs) + return "_".join([lhs, text[self.op], rhs]) + + def copy(self): + return Condition(self.lhs.copy(), self.rhs.copy(), self.op) + + def __str__(self): + return "(" + "".join([str(x) for x in self.lhs]) + " " + self.op + " " + "".join([str(x) for x in self.rhs]) + ")" + + def __repr__(self): + things = [self.lhs, self.rhs, self.op] + return "Condition(" + ", ".join([repr(x) for x in things]) + ")" + + +class Update: + '''a specific statement in the inner-most loop, like f(i,k) += g(i,j,k,l) * d(j,l) * -0.5''' + # self.out = output tensor + # self.elements = array of input tensors + # self.op = reduction operator + # ("*" means the input elements are multiplied) + # self.coeff = scalar coefficient + def __init__(self, out, elements, op="+", coeff=1.0): + if not isinstance(out, Function): + die("update out should be a Function") + if isinstance(elements, Function): + elements = [ elements ] + if not isinstance(elements, list): + die("elements should be a list of functions") + for element in elements: + if not isinstance(element, Function): + die("elements should be a list of Function") + if op not in "+-/*^": + die("unknown op") + self.out = out + self.elements = elements + self.op = op + self.coeff = coeff + + def indexes(self): + '''determine the 
set of indexes referenced by this update''' + rv = set() + for index in self.out.indexes: + rv.add(index) + for element in self.elements: + for index in element.indexes: + rv.add(index) + # sort keys by name + return sorted(list(rv)) + + def update_indexes(self, updates): + '''apply an index-permutation from a symmetry decomposition''' + self.out.update_indexes(updates) + for element in self.elements: + element.update_indexes(updates) + + def transpose(self, lhs, rhs): + '''swap the two sets of indexes''' + self.update_indexes({ lhs.name: rhs, rhs.name: lhs }) + + def copy(self): + return Update(self.out.copy(), [e.copy() for e in self.elements], self.op, self.coeff) + + def __str__(self): + pieces = self.elements + pieces = [ str(p) for p in pieces ] + if self.coeff != 1.0: + pieces.append(str(self.coeff)) + pieces = " * ".join(pieces) + return str(self.out) + " " + self.op + "= " + pieces + ";" + + def __repr__(self): + things = [self.out, self.elements] + things = [repr(x) for x in things] + if self.op != "+": + things.append("op=" + repr(self.op)) + if self.coeff != 1.0: + things.append("coeff=" + repr(self.coeff)) + return "Update(" + ", ".join(things) + ")" + + + +class Testbed(): + '''uses symbolic execution to determine a loopnest's execution pattern. 
used for correctness testing''' + def __init__(self, zones, symmetries=None, N=8): + if symmetries is None: + symmetries = [] + symmetries_by_name = {} + for symmetry in symmetries: + name = symmetry.func.name + if name not in symmetries_by_name: + symmetries_by_name[name] = [] + symmetries_by_name[name].append(symmetry) + symmetries = symmetries_by_name + self.N = N + self.zones = zones + self.symmetries = symmetries + + def funcstr(self, func, index_values): + '''generate string represented a permuted function call''' + values = [index_values[i] for i in func.indexes] + if func.name in self.symmetries: + for symmetry in self.symmetries[func.name]: + lhs = [values[x] for x in symmetry.lhs] + rhs = [values[x] for x in symmetry.rhs] + l = lhs[0] + r = rhs[0] + for la in lhs[1:]: + l *= self.N + l += la + for ra in rhs[1:]: + r *= self.N + r += ra + if l > r: + for li, ri in zip(symmetry.lhs, symmetry.rhs): + values[li], values[ri] = values[ri], values[li] + rv = ",".join([str(x) for x in values]) + rv = func.name + "(" + rv + ")" + return rv + + def run(self): + '''do the symbolic execution''' + from itertools import product + outputs = {} + for zone in self.zones: + indexes = zone.indexes() + # enumerate indexes + position_space = [range(self.N) for x in indexes] + for position in product(*position_space): + # apply conditions + index_values = { i:v for i,v in zip(indexes, position) } + for a, b in zone.equalities.items(): + index_values[a] = index_values[b] + skip = False + for condition in zone.conditions: + if condition.generate(index_values, self.N) is False: + skip = True + break + if skip: + continue + + # apply updates + for update in zone.updates: + output = self.funcstr(update.out, index_values) + if output not in outputs: + outputs[output] = {} + elements = update.op.join([self.funcstr(x,index_values) for x in update.elements]) + if elements not in outputs[output]: + outputs[output][elements] = 0.0 + outputs[output][elements] += update.coeff + return 
outputs + + +class LoopNest: + '''a nested loop structure''' + # self.name = string + # self.equalities = dictionary of past indexes which have been eliminated/lowered + # self.conditions = list of Condition objects + # represents the subset of the enumeration space that this loop nest should cover + # self.symmetries = list of Symmetry objects + # represents the set of symmetries that have yet to be decomposed + # self.updates = list of Update objects + # represents the set of work that should be done in the inner-most loop + def __init__(self, name, conditions=None, symmetries=None, updates=None, equalities=None): + if not isinstance(name, str): + die("loop name should be a string") + if len(name) == 0: + die("loop name should be non-empty") + if conditions is None: + conditions = [] + if equalities is None: + equalities = {} + if not isinstance(conditions, list): + die("loop conditions should be a list of conditions") + for condition in conditions: + if not isinstance(condition, Condition): + die("loop conditions should be a list of Condition") + if symmetries is None: + symmetries = [] + if not isinstance(symmetries, list): + die("loop symmetries should be a list of symmetries") + for i in range(len(symmetries)): + symmetry = symmetries[i] + if isinstance(symmetry, tuple): + func, lhs, rhs = symmetry + string_mapping = {} + for j in range(len(func.indexes)): + string_mapping[func.indexes[j]] = j + if isinstance(lhs, str) or isinstance(lhs, int): + lhs = [lhs] + if isinstance(rhs, str) or isinstance(rhs, int): + rhs = [rhs] + if len(lhs) != len(rhs): + die("Symmetry lhs and rhs must be equal length") + for j in range(len(lhs)): + if lhs[j] in string_mapping: + lhs[j] = string_mapping[lhs[j]] + if rhs[j] in string_mapping: + rhs[j] = string_mapping[rhs[j]] + symmetry = Symmetry(func, lhs, rhs) + symmetries[i] = symmetry + elif not isinstance(symmetry, Symmetry): + die("loop symmetries should be a list of Symmetry") + if not isinstance(updates, list): + die("loop 
updates should be a list of updates") + for update in updates: + if not isinstance(update, Update): + die("loop updates should be a list of Update") + self.name = name + self.conditions = conditions + self.symmetries = symmetries + self.updates = updates + self.equalities = equalities + + def indexes(self): + '''determine the set of indexes referenced by this update''' + found = set() + for update in self.updates: + for index in update.indexes(): + found.add(index) + return sorted(list(found)) + + def split(self, orig_by): + '''split a nested loop up across a symmetric axis''' + # apply symmetry transformation + # this optimization benefits applications where elements can be reused across the updates. + # given a loop nest, and an assertion that two (sets of) indexes are equivalent/swappable, take advantage of the symmetry to do less work. + # specifically, break the loop nest apart into 3 pieces: + # upper: l < r + # equal: l = r + # lower: l > r + # then use index-swapping to rewrite "lower" to look like "upper", and merge them together. + # the result is "upper" and "equal". + # "upper" result is a loop-nest with half of the iteration space, but twice the updates. + # "equal" result is a loop-nest of complexity N-1, covering the leftovers (on the diagonal). 
+ + by = orig_by + if isinstance(by, int): + by = self.symmetries[by] + func = by.func + indexes = func.indexes.copy() + + lhs_positions = by.lhs + rhs_positions = by.rhs + lhs_indexes = [indexes[x] for x in lhs_positions] + rhs_indexes = [indexes[x] for x in rhs_positions] + lhs_name = "".join(lhs_indexes) + rhs_name = "".join(rhs_indexes) + transpose_indexes = {} + for l, r in zip(lhs_indexes, rhs_indexes): + transpose_indexes[l] = r + transpose_indexes[r] = l + upper_condition = Condition(lhs_indexes, rhs_indexes, "<") + equal_conditions = [ Condition(l, r, "==") for l, r in zip(lhs_indexes, rhs_indexes) ] + lower_condition = Condition(lhs_indexes, rhs_indexes, ">") + + logging.debug("loop.split: lhs is %s", lhs_name) + logging.debug("loop.split: rhs is %s", rhs_name) + logging.debug("transpose_indexes is %s", transpose_indexes) + + upper = self.copy() + equal = self.copy() + lower = self.copy() + upper.conditions.append(upper_condition) + equal.conditions += equal_conditions + lower.conditions.append(lower_condition) + upper.name += "_" + lhs_name + "_lt_" + rhs_name + equal.name += "_" + lhs_name + "_eq_" + rhs_name + lower.name += "_" + lhs_name + "_gt_" + rhs_name + lower.update_indexes(transpose_indexes) + upper.merge_symmetric(lower, by) + upper.symmetries = [s.copy() for s in self.symmetries if s != by] + equal.symmetries = [s.copy() for s in self.symmetries if s != by] + for l, r in zip(lhs_indexes, rhs_indexes): + equal.equalities[r] = l + equal.simplify() + upper.simplify() + + # test whether the results are correct + # (poor man's test for logical contradictions and stupid bugs) + testbed_orig = Testbed([self], self.symmetries) + testbed_split = Testbed([equal, upper], self.symmetries) + updates_orig = testbed_orig.run() + updates_split = testbed_split.run() + if updates_orig != updates_split: + # output LoopNests are not equivalent to input LoopNest + # report the differences to debug log + import difflib + import json + data_orig = 
json.dumps(updates_orig , indent=2, sort_keys=True).split("\n") + data_split = json.dumps(updates_split, indent=2, sort_keys=True).split("\n") + logging.debug("difference in computation coverage:") + diff = difflib.unified_diff(data_orig, data_split, fromfile=self.name, tofile="split zones") + for line in diff: + logging.debug(line) + + logging.info("splitting %s by %s failed; leaving it alone", self.name, by) + self.symmetries = [x for x in self.symmetries if x != by] + return [self] + + return [upper, equal] + + def update_indexes(self, updates): + '''apply an index-permutation from a symmetry decomposition''' + for i in range(len(self.conditions)): + self.conditions[i].update_indexes(updates) + for i in range(len(self.updates)): + self.updates[i].update_indexes(updates) + + def merge_symmetric(self, other, along): + '''concatenate updates, but preserve the index-order of the func that was split''' + if len(self.updates) != len(other.updates): + die("merge_symmetric can only be called on loop nests that are the mirrror image of each other.") + # find the common element corresponding to the symmetry + along_element = along.func + symmetric_element = None + transposed_element = None + for self_update, other_update in zip(self.updates, other.updates): + if symmetric_element is not None: + break + for self_element, other_element in zip(self_update.elements, other_update.elements): + logging.debug("comparing update element %s to symmetry element %s", self_element, along_element) + if str(self_element.name) == str(along_element.name): + symmetric_element = self_element + transposed_element = other_element + break + else: + die("symmetry func not found in update elements") + + for self_update, other_update in zip(self.updates, other.updates): + other_elements = [] + for element in other_update.elements: + if str(element) == str(transposed_element): + element = symmetric_element.copy() + other_elements.append(element) + new_update = other_update.copy() + 
new_update.elements = other_elements + self.updates.append(new_update) + + def simplify(self): + '''consolidatate and simplify everything we can''' + before = "" + while str(self) != before: + self.simplify_conditions() + self.simplify_equalities() + self.simplify_updates() + self.simplify_symmetries() + before = str(self) + + def simplify_equalities(self): + '''if i == j, then go and replace j with i in all the symmetries and updates''' + simplifications = {} + for condition in self.conditions: + condition.simplify() + if condition.op == "==": + for l, r in zip(condition.lhs, condition.rhs): + simplifications[r] = l + + # reduce update indexes + for update in self.updates: + update.out.update_indexes(simplifications) + for element in update.elements: + element.update_indexes(simplifications) + + def simplify_symmetries(self): + '''eliminate redundant symmetries, and symmetries that don't apply to any updates''' + + # remove redundant symmetries (symmetries which stringify to the same string) + new_symmetries = [] + symmetry_strings = {} + for symmetry in self.symmetries: + if str(symmetry) not in symmetry_strings: + symmetry_strings[str(symmetry)] = 1 + new_symmetries.append(symmetry) + self.symmetries = new_symmetries + + # figure out which symmetries apply to updates + symmetry_func_names = {} + update_funcs_by_name = {} + for symmetry in self.symmetries: + symmetry_func_names[symmetry.func.name] = symmetry.func + for update in self.updates: + for func in update.elements: + if func.name not in symmetry_func_names: + continue + if func.name not in update_funcs_by_name: + update_funcs_by_name[func.name] = [func] + else: + if str(func) != str(update_funcs_by_name[func.name][0]): + logging.error(update_funcs_by_name[func.name][0]) + logging.error(func) + die("inconsistent calls to " + func.name + " between updates") + update_funcs_by_name[func.name].append(func) + + # remove the ones that don't + new_symmetries = [] + for symmetry in self.symmetries: + 
logging.debug("considering symmetry for elimination: %s", symmetry) + logging.debug("against update funcs: %s", update_funcs_by_name[symmetry.func.name]) + update_indexes = {} + for func in update_funcs_by_name[symmetry.func.name]: + for index in func.indexes: + update_indexes[index] = index + func = symmetry.func + symmetry_affects_updates = False + for l, r in zip(symmetry.lhs, symmetry.rhs): + l, r = func.indexes[l], func.indexes[r] + if l in update_indexes or r in update_indexes: + symmetry_affects_updates = True + break + if symmetry_affects_updates: + new_symmetries.append(symmetry) + self.symmetries = new_symmetries + + def simplify_conditions(self): + '''canonicalize/simplify conditions, remove redundant ones''' + + # canonicalize all conditions, remove redundant ones + new_conditions = [] + for condition in self.conditions: + condition.canonicalize() + for new_condition in condition.simplify(): + new_condition.canonicalize() + new_conditions.append(new_condition) + self.conditions = new_conditions + + # apply equalities + equalities = {} + for condition in self.conditions: + if condition.op == '==': + for l, r in zip(condition.lhs, condition.rhs): + equalities[r] = l + + # turn tuple conditions into simple conditions + for condition in self.conditions: + if len(condition.rhs) == 1: + continue + redundant = set() + lhs_seen = set() + rhs_seen = set() + for i in range(len(condition.lhs)): + l = condition.lhs[i] + r = condition.rhs[i] + if l in equalities: + l = equalities[l] + if r in equalities: + r = equalities[r] + if l in lhs_seen and r in rhs_seen: + redundant.add(i) + lhs_seen.add(l) + rhs_seen.add(r) + new_lhs = [] + new_rhs = [] + for i in range(len(condition.lhs)): + if i not in redundant: + new_lhs.append(condition.lhs[i]) + new_rhs.append(condition.rhs[i]) + condition.lhs = new_lhs + condition.rhs = new_rhs + + def simplify_updates(self): + '''merge updates which share the same pattern by adding coefficients together''' + coefficients = {} + 
tuples = {} + for update in self.updates: + coeff = update.coeff + key = [ str(update.out) ] + key += [ str(e) for e in update.elements] + key = "/".join(key) + if key not in coefficients: + coefficients[key] = 0.0 + coefficients[key] += coeff + tuples[key] = (update.out, update.elements) + new_updates = [] + for key in sorted(coefficients.keys()): + coeff = coefficients[key] + out, elements = tuples[key] + update = Update(out=out, elements=elements, coeff=coeff) + new_updates.append(update) + self.updates = new_updates + + def split_recursive(self): + '''split all symmetries recursively''' + rvs = [self] + making_progress = True + while making_progress: + making_progress = False + new_rvs = [] + for thing in rvs: + if len(thing.symmetries): + logging.debug("splitting thing %s", thing) + new_things = thing.split(0) + new_rvs += new_things + making_progress = True + else: + new_rvs.append(thing) + rvs = new_rvs + return DecomposedLoops(self, rvs) + + def generate_halide(self, app, sizes): + '''produce a Halide func implementing this loopnest''' + if len(sizes) < len(self.indexes()): + raise Exception("called without enough sizes") + name = self.name + g = app.funcs["g"] + g_dens = app.clamps["g_dens"] + self.simplify() + logging.info("generating zone %s", name) + # each symmetry zone has its own iteration space, implemented as an RDom with a where() clause. 
+ distinct_iters = self.indexes() + logging.debug("distinct iters: %s", distinct_iters) + + piece_count = len(self.updates) + + iter_name_mapping = {k:k for k in app.vars} + doing_something_useful = True + while doing_something_useful: + doing_something_useful = False + for condition in self.conditions: + if condition.op == "==": + for lhs, rhs in zip(condition.lhs, condition.rhs): + if iter_name_mapping[rhs] != iter_name_mapping[lhs]: + doing_something_useful = True + iter_name_mapping[rhs] = iter_name_mapping[lhs] + + logging.debug("iter_name_mapping: %s", iter_name_mapping) + logging.debug("piece_count: %s", piece_count) + + rdom_iters = [(0, piece_count)] + for size, index in zip(sizes, self.indexes()): + rdom_iters.append((0, size)) + logging.debug("rdom iters: %s", rdom_iters) + r = hl.RDom(rdom_iters, name+"_dom") + # set local variables for RVars + expanded_iters = {} + distinct_iters = [r[i] for i in range(len(r))] + assigned_already = {} + ru = distinct_iters.pop(0) + for a, b in iter_name_mapping.items(): + if b in assigned_already: + expanded_iters[a] = assigned_already[b] + else: + iterator = distinct_iters.pop(0) + expanded_iters[a] = iterator + assigned_already[b] = iterator + logging.debug("expanded_iters: %s", expanded_iters) + + for condition in self.conditions: + if condition.op == "==": + continue + logging.debug("generating where condition %s", condition) + expression = condition.generate(expanded_iters, N=sizes[0]) + if expression is not None: + r.where(expression) + logging.debug("resulting where clause: %s", r) + + for update in self.updates: + logging.debug(update) + + # generate this nested loop + + def maybe_mux(s): + '''wrap multiple Exprs in mux()''' + if len(set(s)) == 1: + return s[0] + else: + return hl.mux(hl.Expr(ru), s) + + zone_func = hl.Func(name) + zone_func_initial_params = list(app.vars.values())[0:len(self.updates[0].out.indexes)] + zone_func.__setitem__(zone_func_initial_params, hl.f64(0.0)) + left_hand_sides = [[] for x in 
self.updates[0].out.indexes] + right_hand_sides = [] + for update in self.updates: + # LHS indexes + for i in range(len(update.out.indexes)): + index = update.out.indexes[i] + left_hand_sides[i].append(expanded_iters[index]) + # RHS + rhs = None + for element in update.elements: + if element.name in app.clamps: + func = app.clamps[element.name] + elif element.name in app.funcs: + func = app.funcs[element.name] + else: + logging.critical("func %s not found"%element.name) + func_args = [expanded_iters[x] for x in element.indexes] + value = func.__getitem__(func_args) + if rhs is None: + rhs = value + else: + rhs *= value + if update.coeff != 1.0: + rhs *= update.coeff + right_hand_sides.append(rhs) + + left_hand_sides = [ maybe_mux(x) for x in left_hand_sides ] + right_hand_sides = maybe_mux(right_hand_sides) + lhs = zone_func.__getitem__(left_hand_sides) + zone_func.__setitem__(left_hand_sides, lhs + right_hand_sides) + logging.debug("%s[%s, %s] += %s", name, left_hand_sides, right_hand_sides) + + app.funcs[name] = zone_func + app.loopnest_funcs[name] = { "func": zone_func, "loopnest": self, "iters": expanded_iters, "rdom": r, "unroll": ru } + return zone_func + + + def copy(self): + return LoopNest( + name=self.name, + conditions=[c.copy() for c in self.conditions], + symmetries=[s.copy() for s in self.symmetries], + updates=[u.copy() for u in self.updates], + equalities={k: v for k,v in self.equalities.items()}, + ) + + def __str__(self): + lines = ["# name: " + self.name] + for equality_lhs, equality_rhs in self.equalities.items(): + lines.append("# equality: %s == %s"%(equality_lhs, equality_rhs)) + for condition in self.conditions: + lines.append("# condition: " + str(condition)) + for symmetry in self.symmetries: + lines.append("# symmetry: " + str(symmetry)) + for update in self.updates: + lines.append(str(update)) + return "\n".join(lines) + + def __repr__(self): + things = [self.name, self.conditions, self.symmetries, self.updates] + return "LoopNest(" + ", 
".join([repr(x) for x in things]) + ")\n" + + +class DecomposedLoops(): + '''a set of nested loop structures which, together, implement the original algorithm''' + def __init__(self, orig, loops): + self.orig = orig + self.loops = loops + + def generate_halide(self, app, sizes, precursors=None): + params = list(app.vars.values())[0:len(self.orig.updates[0].out.indexes)] + expr = None + if precursors is not None: + for input in precursors: + if expr is None: + expr = input.__getitem__(params) + else: + expr += input.__getitem__(params) + for loop in self.loops: + func = loop.generate_halide(app, sizes) + if expr is None: + expr = func.__getitem__(params) + else: + expr += func.__getitem__(params) + output_func = hl.Func(self.orig.name) + output_func.__setitem__(params, expr) + + app.funcs[self.orig.name] = output_func + return output_func + + def __str__(self): + lines = ["original:", str(self.orig)] + for loop in self.loops: + lines.append(str(loop)) + return "\n".join(lines) + + def __repr__(self): + repr_loops = ", ".join([repr(x) for x in self.loops]) + repr_loops = "[" + repr_loops + "]" + return "DecomposedLoops(%s, %s)"%(repr(self.orig), repr_loops) + + +# test all of the above +import unittest + +class test_Symmetry(unittest.TestCase): + def test_stringify(self): + indexes = list("ijkl") + function = Function("g", indexes) + symmetry = Symmetry(function, [0,1], [2,3]) + self.assertEqual("", str(symmetry)) + + def test_copy(self): + indexes = list("ijkl") + function = Function("g", indexes) + symmetry1 = Symmetry(function, [0,1], [2,3]) + symmetry2 = symmetry1.copy() + self.assertNotEqual(id(symmetry1), id(symmetry2)) + self.assertNotEqual(id(symmetry1.func), id(symmetry2.func)) + self.assertNotEqual(id(symmetry1.lhs), id(symmetry2.lhs)) + self.assertNotEqual(id(symmetry1.rhs), id(symmetry2.rhs)) + + +class test_Update(unittest.TestCase): + def test_stringify(self): + indexes = list("ijkl") + out = indexes[0:2] + in_ = indexes[2:4] + g = Function("g", 
indexes) + out = Function("f", out) + in_ = Function("d", in_) + update = Update(out, [g, in_]) + self.assertEqual("f(i,j) += g(i,j,k,l) * d(k,l);", str(update)) + update = Update(out, [g, in_], op="*", coeff=-0.5) + self.assertEqual("f(i,j) *= g(i,j,k,l) * d(k,l) * -0.5;", str(update)) + +class test_Condition(unittest.TestCase): + def test_stringify(self): + i, j, k, l = "ijkl" + ij = Condition(i, j, "<") + self.assertEqual("(i < j)", str(ij)) + kl = Condition(k, l, "<=") + self.assertEqual("(k <= l)", str(kl)) + ijkl = Condition([i,j], [k,l], ">") + self.assertEqual("(ij > kl)", str(ijkl)) + +class test_LoopNest(unittest.TestCase): + def define_original_twoel(self): + i,j,k,l = 'ijkl' + g = Function("g", [i,j,k,l]) + zone = LoopNest( + name="twoel", + conditions=[], + symmetries=[ + Symmetry(g, 0, 1), + Symmetry(g, 2, 3), + Symmetry(g, [0,1], [2,3]), + ], + updates=[ + Update(Function("g_fock", [i,j]), [g, Function("g_dens", [k,l])]), + Update(Function("g_fock", [i,k]), [g, Function("g_dens", [j,l])], coeff=-0.5), + ] + ) + return zone + + def test_definition(self): + zone = self.define_original_twoel() + + def test_simplified_numeric_symmetries(self): + i,j,k,l = 'ijkl' + g = Function("g", [i,j,k,l]) + zone = LoopNest( + name="twoel", + # numeric indexes + symmetries=[ (g, 0, 1), (g, 2, 3), (g, [0,1], [2,3]) ], + updates=[ + Update(Function("f", [i,j]), [g, Function("d", [k,l])]), + Update(Function("f", [i,k]), [g, Function("d", [j,l])], coeff=-0.5), + ] + ) + zone.split_recursive() + + def test_simplified_string_symmetries(self): + i,j,k,l = 'ijkl' + g = Function("g", [i,j,k,l]) + zone = LoopNest( + name="twoel", + # numeric indexes + symmetries=[ (g, "i", "j"), (g, "k", "l"), (g, ["i","j"], ["k","l"]) ], + updates=[ + Update(Function("f", [i,j]), [g, Function("d", [k,l])]), + Update(Function("f", [i,k]), [g, Function("d", [j,l])], coeff=-0.5), + ] + ) + zone.split_recursive() + + def test_stringify(self): + zone = self.define_original_twoel() + s = str(zone) + 
self.assertEqual(""" +# name: twoel +# symmetry: +# symmetry: +# symmetry: +g_fock(i,j) += g(i,j,k,l) * g_dens(k,l); +g_fock(i,k) += g(i,j,k,l) * g_dens(j,l) * -0.5; +""", "\n"+s+"\n") + + def test_indexes(self): + zone = self.define_original_twoel() + self.assertEqual(["i","j","k","l"], list(zone.indexes())) + zone1, zone2 = zone.split(0) + self.assertEqual(["i","j","k","l"], list(zone1.indexes())) + self.assertEqual(["i","k","l"], list(zone2.indexes())) + + def test_split_simple(self): + zone = self.define_original_twoel() + zone1, zone2 = zone.split(0) + + i, j, k, l = list("ijkl") + expected_zone1 = LoopNest( + name='twoel_i_lt_j', conditions=[Condition(i, j, '<')], + symmetries=[ + Symmetry(Function('g', [i, j, k, l]), 2, 3), + Symmetry(Function('g', [i, j, k, l]), [0,1], [2,3]), + ], + updates=[ + Update(Function('g_fock', [i, j]), [Function('g', [i, j, k, l]), Function('g_dens', [k, l])]), + Update(Function('g_fock', [i, k]), [Function('g', [i, j, k, l]), Function('g_dens', [j, l])], coeff=-0.5), + Update(Function('g_fock', [j, i]), [Function('g', [i, j, k, l]), Function('g_dens', [k, l])]), + Update(Function('g_fock', [j, k]), [Function('g', [i, j, k, l]), Function('g_dens', [i, l])], coeff=-0.5), + ] + ) + expected_zone2 = LoopNest( + name='twoel_i_eq_j', conditions=[Condition(i, j, '==')], + symmetries=[ + Symmetry(Function('g', [i, j, k, l]), 2, 3), + Symmetry(Function('g', [i, j, k, l]), [0,1], [2,3]), + ], + equalities={ + j: i + }, + updates=[ + Update(Function('g_fock', [i, i]), [Function('g', [i, i, k, l]), Function('g_dens', [k, l])]), + Update(Function('g_fock', [i, k]), [Function('g', [i, i, k, l]), Function('g_dens', [i, l])], coeff=-0.5), + ] + ) + + self.assertEqual(str(expected_zone1), str(zone1)) + self.assertEqual(str(expected_zone2), str(zone2)) + + testbed_orig = Testbed([zone], zone.symmetries) + testbed_split = Testbed([zone1, zone2], zone.symmetries) + updates_orig = testbed_orig.run() + updates_split = testbed_split.run() + 
self.assertEqual(updates_orig, updates_split) + + def test_split_tuple(self): + zone = self.define_original_twoel() + zone1, zone2 = zone.split(2) + + i, j, k, l = list("ijkl") + expected_zone1 = LoopNest( + name='twoel_ij_lt_kl', conditions=[Condition([i,j], [k,l], '<')], + symmetries=[ + Symmetry(Function('g', [i, j, k, l]), 0, 1), + Symmetry(Function('g', [i, j, k, l]), 2, 3), + ], + updates=[ + Update(Function('g_fock', [i, j]), [Function('g', [i, j, k, l]), Function('g_dens', [k, l])]), + Update(Function('g_fock', [i, k]), [Function('g', [i, j, k, l]), Function('g_dens', [j, l])], coeff=-0.5), + Update(Function('g_fock', [k, i]), [Function('g', [i, j, k, l]), Function('g_dens', [l, j])], coeff=-0.5), + Update(Function('g_fock', [k, l]), [Function('g', [i, j, k, l]), Function('g_dens', [i, j])]), + ] + ) + expected_zone2 = LoopNest( + name='twoel_ij_eq_kl', conditions=[Condition(i, k, '=='), Condition(j, l, '==')], + symmetries=[ + Symmetry(Function('g', [i, j, k, l]), 0, 1), + ], + equalities={ + k: i, + l: j + }, + updates=[ + Update(Function('g_fock', [i, i]), [Function('g', [i, j, i, j]), Function('g_dens', [j, j])], coeff=-0.5), + Update(Function('g_fock', [i, j]), [Function('g', [i, j, i, j]), Function('g_dens', [i, j])]), + ] + ) + + self.assertEqual(str(expected_zone1), str(zone1)) + self.assertEqual(str(expected_zone2), str(zone2)) + + testbed_orig = Testbed([zone], zone.symmetries) + testbed_split = Testbed([zone1, zone2], zone.symmetries) + updates_orig = testbed_orig.run() + updates_split = testbed_split.run() + self.assertEqual(updates_orig, updates_split) + + def test_split_recursive(self): + zone = self.define_original_twoel() + decomposed = zone.split_recursive() + zones = decomposed.loops + self.assertEqual(6, len(zones)) + logging.info("original zone: %s", zone) + logging.info("split into %d zones.", len(zones)) + ranks = {} + for newzone in zones: + rank = len(newzone.indexes()) + if rank not in ranks: + ranks[rank] = {} + 
ranks[rank][newzone.name] = newzone + for rank in reversed(sorted(ranks.keys())): + logging.info("# RANK %s", rank) + for name in sorted(ranks[rank].keys()): + newzone = ranks[rank][name] + logging.info("# ZONE") + logging.info(newzone) + + testbed_orig = Testbed([zone], zone.symmetries) + testbed_split = Testbed(zones, zone.symmetries) + updates_orig = testbed_orig.run() + updates_split = testbed_split.run() + self.maxDiff = 2000 + self.assertEqual(updates_orig, updates_split) + + def test_generate_halide(self): + zone = self.define_original_twoel() + decomposed = zone.split_recursive() + self.vars = {k: hl.Var(k) for k in "ijkl"} + i, j, k, l = [self.vars[k] for k in "ijkl"] + g_dens = hl.Func("g_dens") + g_dens[i,j] = i * j + g = hl.Func("g") + g[i,j,k,l] = hl.cos(i*j) * hl.sin(k*l) + self.inputs = {"g": g, "g_dens": g_dens} + self.clamps = {"g": g, "g_dens": g_dens} + self.funcs = {"g": g, "g_dens": g_dens} + self.loopnest_funcs = {} + func = decomposed.generate_halide(self, [8, 8, 8, 8]) + + +class test_Testbed(unittest.TestCase): + def test_run(self): + f1 = Function("f1", ["i","j"]) + z = LoopNest( + name="z", + symmetries=[ (f1, "i", "j") ], + updates=[ Update(Function("f2", ["i","j"]), [f1]) ] + ) + z1 = LoopNest( + name="z1", + conditions=[Condition("i","j","<")], + updates=[ + Update(Function("f2", ["i","j"]), [f1]), + Update(Function("f2", ["j","i"]), [f1]) + ] + ) + z2 = LoopNest( + name="z2", + conditions=[Condition("i","j","==")], + updates=[ Update(Function("f2", ["i","i"]), [f1]) ] + ) + t1 = Testbed([z], z.symmetries) + t2 = Testbed([z1, z2]) + r1 = t1.run() + r2 = t2.run() + import json + s1 = json.dumps(r1, indent=2, sort_keys=True).split("\n") + s2 = json.dumps(r2, indent=2, sort_keys=True).split("\n") + if s1 != s2: + import difflib + diff = difflib.unified_diff(s1, s2, fromfile=z.name, tofile="split zones") + for line in diff: + logging.error(line) + self.assertEqual(r1, r2) + + +if __name__ == '__main__': + # run the test suite + 
logging.basicConfig(level=logging.WARNING)
+    unittest.main()
diff --git a/scf/halide/tools/halidetrace2tensorvis.py b/scf/halide/tools/halidetrace2tensorvis.py
new file mode 100644
index 0000000..6d07da4
--- /dev/null
+++ b/scf/halide/tools/halidetrace2tensorvis.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+
+import sys
+
+if len(sys.argv) < 2:
+    raise Exception("Usage: %s <tracefile> [xlimit [ylimit]]" % sys.argv[0])
+
+infn = sys.argv[1]
+xlimit = None
+ylimit = None
+if len(sys.argv) > 2:
+    xlimit = int(sys.argv[2])
+if len(sys.argv) > 3:
+    ylimit = int(sys.argv[3])
+
+data_aliases = {
+    "constant_exterior$8.0": "g",
+}
+
+def parse_tag(tag):
+    # Parse a Halide "func_type_and_dim" trace tag. Whitespace-separated layout:
+    #   func_type_and_dim len(func_types) [type_code data_size_in_bits type_lanes]+ num_indices [min extent]+
+    # example: "func_type_and_dim 1 2 64 1 4 0 8 0 8 0 8 0 8"
+    #   -> one func type (code 2, 64 bits, 1 lane), 4 indices each spanning [0, 8)
+    fields = tag.split(" ")
+    tagname = fields[0]
+    if tagname != "func_type_and_dim":
+        return {"unknown": True}
+    num_func_types = int(fields[1])  # was a str, which made range() below raise TypeError
+    offset = 2  # fields[0] is the tag name, fields[1] the type count; triples start at 2 (was 1: off by one)
+    func_types = []
+    for i in range(num_func_types):
+        func_types.append({ "type_code": int(fields[offset]), "data_size_bits": int(fields[offset+1]), "type_lanes": int(fields[offset+2]) })
+        offset += 3
+    n = int(fields[offset])
+    offset += 1
+    shape = []
+    for i in range(n):
+        shape.append({"min": int(fields[offset]), "max": int(fields[offset+1])})
+        offset += 2
+    return { "types": func_types, "shape": shape }  # dropped undefined "bits" (NameError); sizes live in func_types[*]["data_size_bits"]
+
+with open(infn, "r") as f:
+    for line in f:
+        line = line.strip()
+        type = line.split(" ")[0]
+        line = line[len(type)+1:]
+        if type == "Begin":
+            print("begin", line)
+        elif type == "End":
+            print("end", line)
+            break
+        elif type == "Tag":
+            func, tag = line.split(" tag = ")
+            tag = tag[1:len(tag)-1]
+            print("tag input:", tag)
+            tag = parse_tag(tag)
+            print("tag func", func, "tag", tag)
+        elif type == "Load":
+            print("load", line)
+        elif type == "Store":
+            print("store", line)
diff --git a/scf/halide/tools/twoel_gen.py b/scf/halide/tools/twoel_gen.py
new file mode 100755
index
0000000..0f94f44 --- /dev/null +++ b/scf/halide/tools/twoel_gen.py @@ -0,0 +1,332 @@ +#!/usr/bin/env python3 + +import halide as hl +from math import pi + +'''generate halide code to take advantage of the symmetry of g()''' + +import logging + +from decompose import Function, Condition, Symmetry, Update, LoopNest, DecomposedLoops + +def define_original_twoel_zone(): + '''base definition of the twoel() function, to be decomposed/optimized''' + g = Function("g", ["i","j","k","l"]) + loop = LoopNest( + name="g_fock", + symmetries=[ (g, "i", "j"), (g, "k", "l"), (g, ["i","j"], ["k","l"]) ], + updates=[ + Update(Function("g_fock", ["i","j"]), [g, Function("g_dens", ["k","l"])]), + Update(Function("g_fock", ["i","k"]), [g, Function("g_dens", ["j","l"])], coeff=-0.5), + ] + ) + return loop + +class Generate_twoel(): + ''' SCF twoel() function generator ''' + def __init__(self, loopnests=None, tracing=False, tracing_g=False, tilesize=30, vectorsize=8): + self.tracing = tracing + self.tracing_g = tracing_g + self.tilesize = tilesize + self.vectorsize = vectorsize + self.vars = {} + self.funcs = {} + self.inputs = {} + self.clamps = {} + self.outputs = {} + self.loopnests = loopnests + self.loopnest_funcs = {} + + def add_funcs_by_name(self, funcs): + ''' add funcs to list of funcs (for bookkeeping) ''' + for func in funcs: + self.funcs[func.name()] = func + + def generate_library(self): + ''' generate a static library that implements twoel ''' + self.generate_twoel() + self.compile() + + def generate_twoel(self): + ''' generate a Halide pipeline in memory that implements twoel ''' + self.setup_vars() + self.setup_inputs() + self.gen_g() + self.gen_main_loop() + self.gen_outputs() + self.schedule() + self.gen_pipeline() + + def setup_vars(self): + for c in "ijkl": + v = hl.Var(c) + self.vars[c] = v + + def setup_inputs(self): + # input scalars + delo2 = hl.Param(hl.Float(64), "delo2") + delta = hl.Param(hl.Float(64), "delta") + rdelta = hl.Param(hl.Float(64), "rdelta") + + # 
input vectors + expnt_in = hl.ImageParam(hl.Float(64), 1, "expnt_in") + rnorm_in = hl.ImageParam(hl.Float(64), 1, "rnorm_in") + x_in = hl.ImageParam(hl.Float(64), 1, "x_in") + y_in = hl.ImageParam(hl.Float(64), 1, "y_in") + z_in = hl.ImageParam(hl.Float(64), 1, "z_in") + + # input matrices + fm_in = hl.ImageParam(hl.Float(64), 2, "fm_in") + g_fock_in_in = hl.ImageParam(hl.Float(64), 2, "g_fock_in") + g_dens_in = hl.ImageParam(hl.Float(64), 2, "g_dens_in") + + self.inputs.update({ x.name(): x for x in [delo2, delta, rdelta, expnt_in, rnorm_in, x_in, y_in, z_in, fm_in, g_fock_in_in, g_dens_in] }) + + # clamp all inputs, to prevent out-of-bounds errors from odd tile sizes and such + expnt = hl.BoundaryConditions.constant_exterior(expnt_in , 0) + rnorm = hl.BoundaryConditions.constant_exterior(rnorm_in , 0) + x = hl.BoundaryConditions.constant_exterior(x_in , 0) + y = hl.BoundaryConditions.constant_exterior(y_in , 0) + z = hl.BoundaryConditions.constant_exterior(z_in , 0) + fm = hl.BoundaryConditions.constant_exterior(fm_in , 0) + g_fock_in = hl.BoundaryConditions.constant_exterior(g_fock_in_in, 0) + g_dens = hl.BoundaryConditions.constant_exterior(g_dens_in , 0) + + self.clamps.update({ "expnt": expnt, "rnorm": rnorm, "x": x, "y": y, "z": z, "fm": fm, "g_fock_in_clamped": g_fock_in, "g_dens": g_dens }) + + # nbfn=number of basis functions. 
This is our problem size + self.nbfn = g_fock_in_in.height() + + + def gen_g(self): + ''' define g() function ''' + # vars + i, j, k, l = [self.vars[c] for c in "ijkl"] + # clamped inputs + x, y, z, expnt, fm, rnorm = [self.clamps[c] for c in ["x", "y", "z", "expnt", "fm", "rnorm"]] + # unclamped input (for sizing) + fm_in = self.inputs["fm_in"] + # scalar inputs + delo2, delta, rdelta = [self.inputs[c] for c in ["delo2", "delta", "rdelta"]] + + dx = hl.Func("dx") + dy = hl.Func("dy") + dz = hl.Func("dz") + r2 = hl.Func("g_r2") + expnt2 = hl.Func("expnt2") + expnt_inv = hl.Func("expnt_inv") + self.add_funcs_by_name([dx, dy, dz, r2, expnt2, expnt_inv]) + + + dx[i,j] = x[i] - x[j] + dy[i,j] = y[i] - y[j] + dz[i,j] = z[i] - z[j] + + r2[i,j] = dx[i,j] * dx[i,j] + dy[i,j] * dy[i,j] + dz[i,j] * dz[i,j] + + expnt2[i,j] = expnt[i] + expnt[j] + expnt_inv[i,j] = hl.f64(1.0) / expnt2[i,j] + + + fac2 = hl.Func("fac2") + ex_arg = hl.Func("ex_arg") + ex = hl.Func("ex") + denom = hl.Func("denom") + fac4d = hl.Func("fac4d") + self.add_funcs_by_name([fac2, ex_arg, ex, denom, fac4d]) + fac2[i,j] = expnt[i] * expnt[j] * expnt_inv[i,j] + ex_arg[i,j,k,l] = -fac2[i,j] * r2[i,j] - fac2[k,l] * r2[k,l] + ex[i,j,k,l] = hl.select(ex_arg[i,j,k,l] < hl.f64(-37.0), hl.f64(0.0), hl.exp(ex_arg[i,j,k,l])) + denom[i,j,k,l] = expnt2[i,j] * expnt2[k,l] * hl.sqrt(expnt2[i,j] + expnt2[k,l]) + fac4d[i,j,k,l] = expnt2[i,j] * expnt2[k,l] / (expnt2[i,j] + expnt2[k,l]) + + + x2 = hl.Func("g_x2") + y2 = hl.Func("g_y2") + z2 = hl.Func("g_z2") + rpq2 = hl.Func("rpq2") + self.add_funcs_by_name([x2, y2, z2, rpq2]) + x2[i,j] = (x[i] * expnt[i] + x[j] * expnt[j]) * expnt_inv[i,j] + y2[i,j] = (y[i] * expnt[i] + y[j] * expnt[j]) * expnt_inv[i,j] + z2[i,j] = (z[i] * expnt[i] + z[j] * expnt[j]) * expnt_inv[i,j] + rpq2[i,j,k,l] = ( + (x2[i,j] - x2[k,l]) * (x2[i,j] - x2[k,l]) + + (y2[i,j] - y2[k,l]) * (y2[i,j] - y2[k,l]) + + (z2[i,j] - z2[k,l]) * (z2[i,j] - z2[k,l])) + + + + f0t = hl.Func("f0t") + f0n = hl.Func("f0n") + 
f0x = hl.Func("f0x") + f0val = hl.Func("f0val") + self.add_funcs_by_name([f0t, f0n, f0x, f0val]) + f0t[i,j,k,l] = fac4d[i,j,k,l] * rpq2[i,j,k,l] + f0n[i,j,k,l] = hl.clamp(hl.i32((f0t[i,j,k,l] + delo2) * rdelta), fm_in.dim(0).min(), fm_in.dim(0).max()) + f0x[i,j,k,l] = delta * f0n[i,j,k,l] - f0t[i,j,k,l] + f0val[i,j,k,l] = hl.select(f0t[i,j,k,l] >= hl.f64(28.0), + hl.f64(0.88622692545276) / hl.sqrt(f0t[i,j,k,l]), + fm[f0n[i,j,k,l],0] + + f0x[i,j,k,l] * (fm[f0n[i,j,k,l],1] + + f0x[i,j,k,l] * hl.f64(0.5) * (fm[f0n[i,j,k,l],2] + + f0x[i,j,k,l] * hl.f64(1./3.) * (fm[f0n[i,j,k,l],3] + + f0x[i,j,k,l] * hl.f64(0.25) * fm[f0n[i,j,k,l],4])))) + + g = hl.Func("g") + self.add_funcs_by_name([g]) + + if self.tracing and self.tracing_g: + g_trace_in = hl.ImageParam(hl.Float(64), 4, "g_trace_in") + g_trace = hl.BoundaryConditions.constant_exterior(g_trace_in, 0) + self.inputs["g_trace_in"] = g_trace_in + self.clamps["g_trace"] = g_trace + g_trace.compute_root() + g[i,j,k,l] = (hl.f64(2.00) * hl.f64(pow(pi, 2.50)) / denom[i,j,k,l]) * ex[i,j,k,l] * f0val[i,j,k,l] * rnorm[i] * rnorm[j] * rnorm[k] * rnorm[l] + g_trace[i,j,k,l] + else: + g_trace = None + g[i,j,k,l] = (hl.f64(2.00) * hl.f64(pow(pi, 2.50)) / denom[i,j,k,l]) * ex[i,j,k,l] * f0val[i,j,k,l] * rnorm[i] * rnorm[j] * rnorm[k] * rnorm[l] + + + def gen_main_loop(self): + ''' define the big tensor contraction (with symmetry decomposition) ''' + i, j = [self.vars[c] for c in "ij"] + g_fock_components = [ self.clamps["g_fock_in_clamped"] ] + zones = self.loopnests + nbfn = self.nbfn + loop_extents = [nbfn, nbfn, nbfn, nbfn] + if zones is None: + original_twoel = define_original_twoel_zone() + zones = original_twoel.split_recursive() + # uncomment the next line to disable symmetry decomposition + #zones = DecomposedLoops(original_twoel, [original_twoel]) + zones.generate_halide(self, loop_extents, g_fock_components) + + + def gen_outputs(self): + ''' define the outputs ''' + nbfn = self.nbfn + i, j = [self.vars[c] for c in "ij"] + 
g_fock = self.funcs["g_fock"] + g_dens = self.clamps["g_dens"] + # output scalars + rv = hl.Func("rv") + + # output matrix + g_fock_out = hl.Func("g_fock_out") + self.funcs .update({"rv": rv, "g_fock_out": g_fock_out}) + self.outputs.update({"rv": rv, "g_fock_out": g_fock_out}) + + g_fock_out[i, j] = g_fock[i,j] + + rv[i] = hl.f64(0.0) + r_rv = hl.RDom([(0, nbfn), (0, nbfn)]) + rv[0] += g_fock[r_rv] * g_dens[r_rv] + rv[0] *= hl.f64(0.5) + + + def schedule(self): + ''' apply a CPU schedule ''' + # scheduling + i, j, k, l = [self.vars[c] for c in "ijkl"] + vectorsize = self.vectorsize + tilesize = self.tilesize + logging.info("scheduling") + if_, jf, kf, lf = [ hl.Var(c+"f") for c in "ijkl" ] # rfactor vars + io, jo, ko, lo = [ hl.Var(c+"o") for c in "ijkl" ] # block outer variables + outer_vars = [ io, jo, ko, lo ] + ii, ji, ki, li = [ hl.Var(c+"i") for c in "ijkl" ] # block inner variables + inner_vars = [ ii, ji, ki, li ] + gio, gjo, gko, glo = [ hl.RVar("g"+c+"o") for c in "ijkl" ] # block outer reduction variables + outer_rvars = [ gio, gjo, gko, glo ] + gii, gji, gki, gli = [ hl.RVar("g"+c+"i") for c in "ijkl" ] # block inner reduction variables + inner_rvars = [ gii, gji, gki, gli ] + jii = hl.Var("jii") # fused i + ji + kolo = hl.Var("kolo") # fused ko + lo + gkolo = hl.RVar("gkolo") # fused gko + glo + ic, jc = hl._0, hl._1 # indexes for clamp funcs + + # schedule the pieces of g + + for clamped_1d_input in [ "expnt", "rnorm", "x", "y", "z", "fm", "g_fock_in_clamped", "g_dens" ]: + clamped_1d_input = self.clamps[clamped_1d_input] + clamped_1d_input.compute_root().vectorize(ic, vectorsize) + for g_precomputed_matrix in [ "expnt2", "fac2", "g_r2", "g_x2", "g_y2", "g_z2" ]: + g_precomputed_matrix = self.funcs[g_precomputed_matrix] + g_precomputed_matrix.compute_root().vectorize(i, vectorsize) + + self.funcs["ex_arg"].compute_inline() + self.funcs["expnt_inv"].compute_inline() + for generic_4d_thing in ["denom", "ex", "fac4d", "rpq2"]: + generic_4d_thing = 
self.funcs[generic_4d_thing] + generic_4d_thing.compute_inline() + + for zone_name, zone_record in self.loopnest_funcs.items(): + logging.info("scheduling zone %s", zone_name) + func = zone_record['func'] + ur = zone_record['unroll'] + iters = zone_record['iters'] + gi = iters['i'] + r = zone_record['rdom'] + riter = [r[i] for i in range(1, len(r))] # skip r[0], the unroll factor + rinner = riter[0] + router = riter[-1] + func.compute_root().parallel(j).vectorize(i, vectorsize) + func.update().dump_argument_list() + if len(riter) == 4: + ir, jr, kr, lr = riter + func.update().reorder(ur, ir, kr, lr, jr).unroll(ur) + func_intm = func.update().rfactor([[jr, jf], [ir, if_]]) + func_intm.compute_root().update().reorder(ur, if_, kr, lr, jf).unroll(ur).vectorize(if_, vectorsize).parallel(jf) + self.funcs[func_intm.name()] = func_intm + func.update().atomic().vectorize(ir, vectorsize) + #g.in_(func_intm).reorder(i, k, l, j).compute_at(func_intm, if_).store_at(func_intm, lr).vectorize(i, vectorsize) + else: + func.update().atomic().unroll(ur).vectorize(rinner, vectorsize).parallel(router, 8) + if logging.root.level < logging.INFO: + logging.debug("function %s loop nest:", zone_name) + func.print_loop_nest() + + self.funcs['g_fock'].compute_root() + + # tracing + if self.tracing: + for func_name in self.funcs: + if func_name != "g": + self.funcs[func_name].trace_stores() + self.funcs[func_name].trace_loads() + + + def gen_pipeline(self): + '''define the Halide pipeline that generates the outputs''' + + logging.info("generating halide pipeline") + + rv, g_fock_out = [self.outputs[f] for f in ["rv", "g_fock_out"]] + + # return the pipeline + self.pipeline = hl.Pipeline([rv, g_fock_out]) + return self.pipeline, self.outputs, self.inputs + + + def compile(self): + self.pipeline.compile_to( + { + hl.Output.c_header: "twoel.h", + hl.Output.c_source: "twoel.cpp", + hl.Output.static_library: "twoel.a", + hl.Output.stmt: "twoel.stmt", + #hl.Output.stmt_html: "twoel.html", + # the 
following outputs are useful for running it from python + #hl.Output.object: "twoel.o", + #hl.Output.python_extension: "twoel.py.cpp", + }, list(self.inputs.values()), "twoel" + ) + + + +if __name__ == "__main__": + logging.basicConfig(level=logging.WARNING) + gen = Generate_twoel() + gen.generate_library() + logging.debug({"outputs": gen.outputs}) + logging.debug({"inputs": gen.inputs}) diff --git a/scf/halide/tools/vis.py b/scf/halide/tools/vis.py new file mode 100644 index 0000000..94b2787 --- /dev/null +++ b/scf/halide/tools/vis.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 + +from manim import * + +class Decompose(Scene): + # storyboard: + # original (tensor-expression) code on the upper left + # original loops on the lower left + # original iteration space (square) on the lower right + # original complexity (N^2) in the upper right + + # iteration space square splits into 3: lower triangle, diagonal, upper triangle + # loops split into 3: i < j, i == j, i > j + + # lower triangle merges with upper triangle + # i < j merges with i > j + # complexity divides by 2 + + def construct(self): + + # loop structure in top left + loopsA = Text("loops:", font="Times") + loopsS1 = TextMobject('for i $\\in$ (0..N):') + loopsS2 = TextMobject('for j $\\in$ (0..N):') + loopsS3 = TextMobject('$f(i, j) = g(i, j)$') + loopsA.to_corner(UP+LEFT) + loopsS1.next_to(loopsA , DOWN).align_to(loopsA , LEFT).scale(0.8) + loopsS2.next_to(loopsS1, DOWN).align_to(loopsS1, LEFT).scale(0.8).shift(0.2*RIGHT) + loopsS3.next_to(loopsS2, DOWN).align_to(loopsS2, LEFT).scale(0.8).shift(0.2*RIGHT) + self.add(loopsA, loopsS1, loopsS2, loopsS3) + + # iteration space square in bottom right + spaceA = Text("iteration space:", font="Times") + space1 = Square() + space2 = Square() + space3 = Square() + space1.set_fill(PINK, opacity=0.5) + space2.set_fill(GOLD, opacity=0.5) + space3.set_fill(GREEN, opacity=0.5) + spaceA.to_corner(UP+RIGHT) + space1.next_to(spaceA, DOWN) + space2.align_to(space1, UP+LEFT) + 
space3.align_to(space1, UP+LEFT) + self.add(spaceA, space1, space2, space3) + + # draw arrows + arrow1S = Arrow(loopsS3, space2.get_edge_center(LEFT)) + self.add(arrow1S) + + # intro period + self.wait(5) + + # split up iteration spaces + spaceA2 = Text("iteration space:", font="Times") + spaceU = Polygon(np.array([0,1,0]), np.array([1,0,0]), np.array([1,1,0])) + spaceD = Line(np.array([1,0,0]), np.array([0,1,0])) + spaceL = Polygon(np.array([0,0,0]), np.array([0,1,0]), np.array([1,0,0])) + spaceU.scale(2).set_fill(PINK, opacity=0.5) + spaceD.scale(2).set_fill(GOLD, opacity=0.5) + spaceL.scale(2).set_fill(GREEN, opacity=0.5) + spaceA2.to_corner(UP+RIGHT) + spaceU.next_to(spaceA2, DOWN) + spaceD.next_to(spaceU, DOWN) + spaceL.next_to(spaceD, DOWN) + + # split up loops + loopsA2 = Text("loops:", font="Times") + loopsU1 = TextMobject('for i $\\in$ (0..N):') + loopsU2 = TextMobject('for j $\\in$ (0..i-1):') + loopsU3 = TextMobject('$f(i, j) = g(i, j)$') + loopsU4 = TextMobject('$f(j, i) = g(i, j)$') + loopsUDline = Line(np.array([0,0,0]), np.array([1,0,0])) + loopsD1 = TextMobject('for i $\\in$ (0..N):') + loopsD2 = TextMobject('$f(i, i) = g(i, i)$') + loopsDLline = Line(np.array([0,0,0]), np.array([1,0,0])) + loopsL1 = TextMobject('for i $\\in$ (0..N):') + loopsL2 = TextMobject('for j $\\in$ (i+1..N):') + loopsL3 = TextMobject('$f(i, j) = g(i, j)$') + loopsA2.to_corner(UP+LEFT) + loopsU1 .next_to(loopsA2 , DOWN).align_to(loopsA2, LEFT).scale(0.8) + loopsU2 .next_to(loopsU1 , DOWN).align_to(loopsU1, LEFT).scale(0.8).shift(0.2*RIGHT) + loopsU3 .next_to(loopsU2 , DOWN).align_to(loopsU2, LEFT).scale(0.8).shift(0.2*RIGHT) + loopsU4 .next_to(loopsU3 , DOWN).align_to(loopsU2, LEFT).scale(0.8).shift(0.2*RIGHT) + loopsUDline.next_to(loopsU4 , DOWN).align_to(loopsU1, LEFT).scale(0.8) + loopsD1 .next_to(loopsUDline, DOWN).align_to(loopsU1, LEFT).scale(0.8) + loopsD2 .next_to(loopsD1 , DOWN).align_to(loopsD1, LEFT).scale(0.8).shift(0.2*RIGHT) + loopsDLline.next_to(loopsD2 , 
DOWN).align_to(loopsU1, LEFT).scale(0.8) + loopsL1 .next_to(loopsDLline, DOWN).align_to(loopsU1, LEFT).scale(0.8) + loopsL2 .next_to(loopsL1 , DOWN).align_to(loopsL1, LEFT).scale(0.8).shift(0.2*RIGHT) + loopsL3 .next_to(loopsL2 , DOWN).align_to(loopsL2, LEFT).scale(0.8).shift(0.2*RIGHT) + + # draw arrows from loops to spaces + arrowL = Arrow(loopsL3, spaceL.get_edge_center(LEFT)) + arrowD = Arrow(loopsD2, spaceD.get_edge_center(LEFT)) + arrowU = Arrow(loopsU3, spaceU.get_edge_center(LEFT)) + + self.play(ReplacementTransform(space1, spaceU) , ReplacementTransform(space2, spaceD) , ReplacementTransform(space3, spaceL), ReplacementTransform(spaceA, spaceA2), + ReplacementTransform(loopsS1, loopsL1), ReplacementTransform(loopsS2, loopsL2), ReplacementTransform(loopsS3, loopsL3), ReplacementTransform(loopsA, loopsA2), ReplacementTransform(arrow1S, arrowL), + FadeIn(loopsUDline), FadeIn(loopsD1), FadeIn(loopsD2), FadeIn(loopsDLline), FadeIn(loopsU1), FadeIn(loopsU2), FadeIn(loopsU3), FadeIn(arrowD), FadeIn(arrowU)) + self.wait(3) + + # transpose L + loopsLTA = Text("(transpose)", font="Times") + loopsLT2 = TextMobject('for j $\\in$ (0..i-1):') + loopsLT3 = TextMobject('$f(j, i) = g(i, j)$') + loopsLT2.next_to(loopsL1 , DOWN).align_to(loopsL1 , LEFT).scale(0.8).shift(0.2*RIGHT) + loopsLT3.next_to(loopsLT2, DOWN).align_to(loopsLT2, LEFT).scale(0.8).shift(0.2*RIGHT) + loopsLTA.next_to(loopsLT2, RIGHT).scale(0.5) + + self.play(ReplacementTransform(loopsL2, loopsLT2), ReplacementTransform(loopsL3, loopsLT3), FadeIn(loopsLTA)) + + self.wait(3) + + # consolidate loops and space + self.play(ReplacementTransform(loopsL1, loopsU1), ReplacementTransform(loopsLT2, loopsU2), ReplacementTransform(loopsLT3, loopsU4), ReplacementTransform(spaceL, spaceU), ReplacementTransform(arrowL, arrowU), + FadeOut(loopsDLline), FadeOut(loopsLTA)) + #self.play(FadeOut(loopsL1), FadeOut(loopsL2), FadeOut(loopsL3), FadeOut(spaceL), FadeOut(arrowL)) + + self.wait(3) + + +config["frame_rate"] = 20 diff 
--git a/scf/halide/tools/vis.sh b/scf/halide/tools/vis.sh new file mode 100755 index 0000000..7fa2bfb --- /dev/null +++ b/scf/halide/tools/vis.sh @@ -0,0 +1,6 @@ +#!/bin/bash +rm -rf media/images/vis media/videos/vis +time manim vis.py +mkdir -p media/images/vis/ +ffmpeg -i media/videos/vis/*/Decompose.mp4 -vf scale=800:450 -f image2 media/images/vis/Decompose%07d.png +time img2webp -o Decompose.webp -min_size -d 50 -loop 0 media/images/vis/*.png diff --git a/scf/halide/twoel_gen.cpp b/scf/halide/twoel_gen.cpp deleted file mode 100644 index a98c238..0000000 --- a/scf/halide/twoel_gen.cpp +++ /dev/null @@ -1,147 +0,0 @@ -#include -#include -using namespace Halide; - -#define pi 3.141592653589793 -#define tol2e 0.000001 - -class twoel_generator : public Halide::Generator { -public: - - /* input scalars */ - Input delo2{"delo2"}; - Input delta{"delta"}; - Input rdelta{"rdelta"}; - - /* output scalars */ - Output rv{"rv"}; - - /* input vectors */ - Input> expnt_in{"expnt_in", 1}; - Input> rnorm_in{"rnorm_in", 1}; - Input> x_in{"x_in", 1}; - Input> y_in{"y_in", 1}; - Input> z_in{"z_in", 1}; - - /* input matrices */ - Input> fm_in{"fm_in", 2}; - Input> g_fock_in_in{"g_fock_in", 2}; - Input> g_dens_in{"g_dens_in", 2}; - - /* output matrix */ - Output> g_fock_out{"g_fock_out", 2}; - - Var i{"i"}, j{"j"}, k{"k"}, l{"l"}; - - void generate() { - Expr nbfn = g_fock_in_in.height(); - - /* clamp all inputs, to prevent out-of-bounds errors from odd tile sizes and such */ - Func expnt, rnorm, x, y, z, fm, g_fock_in, g_dens; - - expnt = BoundaryConditions::constant_exterior(expnt_in , 0); - rnorm = BoundaryConditions::constant_exterior(rnorm_in , 0); - x = BoundaryConditions::constant_exterior(x_in , 0); - y = BoundaryConditions::constant_exterior(y_in , 0); - z = BoundaryConditions::constant_exterior(z_in , 0); - fm = BoundaryConditions::constant_exterior(fm_in , 0); - g_fock_in = BoundaryConditions::constant_exterior(g_fock_in_in, 0); - g_dens = 
BoundaryConditions::constant_exterior(g_dens_in , 0); - - Func dx{"dx"}, dy{"dy"}, dz{"dz"}, r2{"r2"}, expnt2{"expnt2"}, expnt_inv{"expnt_inv"}; - - dx(i,j) = x(i) - x(j); - dy(i,j) = y(i) - y(j); - dz(i,j) = z(i) - z(j); - - r2(i,j) = dx(i,j) * dx(i,j) + dy(i,j) * dy(i,j) + dz(i,j) * dz(i,j); - - expnt2(i,j) = expnt(i) + expnt(j); - expnt_inv(i,j) = Expr(1.0) / expnt2(i,j); - - Func fac2{"fac2"}, ex_arg{"ex_arg"}, ex{"ex"}, denom{"denom"}, fac4d{"fac4d"}; - fac2(i,j) = expnt(i) * expnt(j) * expnt_inv(i,j); - ex_arg(i,j,k,l) = -fac2(i,j) * r2(i,j) - fac2(k,l) * r2(k,l); - ex(i,j,k,l) = select(ex_arg(i,j,k,l) < Expr(-37.0), Expr(0.0), exp(ex_arg(i,j,k,l))); - denom(i,j,k,l) = expnt2(i,j) * expnt2(k,l) * sqrt(expnt2(i,j) + expnt2(k,l)); - fac4d(i,j,k,l) = expnt2(i,j) * expnt2(k,l) / (expnt2(i,j) + expnt2(k,l)); - - Func x2{"x2"}, y2{"y2"}, z2{"z2"}, rpq2{"rpq2"}; - x2(i,j) = (x(i) * expnt(i) + x(j) * expnt(j)) * expnt_inv(i,j); - y2(i,j) = (y(i) * expnt(i) + y(j) * expnt(j)) * expnt_inv(i,j); - z2(i,j) = (z(i) * expnt(i) + z(j) * expnt(j)) * expnt_inv(i,j); - rpq2(i,j,k,l) = - (x2(i,j) - x2(k,l)) * (x2(i,j) - x2(k,l)) - + (y2(i,j) - y2(k,l)) * (y2(i,j) - y2(k,l)) - + (z2(i,j) - z2(k,l)) * (z2(i,j) - z2(k,l)); - - Func f0t{"f0t"}, f0n{"f0n"}, f0x{"f0x"}, f0val{"f0val"}; - f0t(i,j,k,l) = fac4d(i,j,k,l) * rpq2(i,j,k,l); - f0n(i,j,k,l) = clamp(cast((f0t(i,j,k,l) + delo2) * rdelta), fm_in.dim(0).min(), fm_in.dim(0).max()); - f0x(i,j,k,l) = delta * f0n(i,j,k,l) - f0t(i,j,k,l); - f0val(i,j,k,l) = select(f0t(i,j,k,l) >= Expr(28.0), - Expr(0.88622692545276) / sqrt(f0t(i,j,k,l)), - fm(f0n(i,j,k,l),0) - + f0x(i,j,k,l) * (fm(f0n(i,j,k,l),1) - + f0x(i,j,k,l) * Expr(0.5) * (fm(f0n(i,j,k,l),2) - + f0x(i,j,k,l) * Expr(1./3.) 
* (fm(f0n(i,j,k,l),3) - + f0x(i,j,k,l) * Expr(0.25) * fm(f0n(i,j,k,l),4))))); - - Func gg{"gg"}; - gg(i, j, k, l) = (Expr(2.00) * Expr(pow(pi, 2.50)) / denom(i,j,k,l)) * ex(i,j,k,l) * f0val(i,j,k,l) * rnorm(i) * rnorm(j) * rnorm(k) * rnorm(l); - - RDom r(0, nbfn, 0, nbfn); - - Func g_fock{"g_fock"}; - g_fock(i,j) = g_fock_in(i,j) - + sum(gg(i,j,r.x,r.y) * g_dens(r.x,r.y)) - - sum(gg(i,r.x,j,r.y) * g_dens(r.x,r.y)) * Expr(0.5); - - g_fock_out(i,j) = g_fock(i,j); - rv() = sum(g_fock(r.x,r.y) * g_dens(r.x,r.y)) * Expr(0.5); - -#ifdef TRACING - gg.trace_realizations(); - gg.trace_loads(); - gg.trace_stores(); - g_dens_in.trace_loads(); - fm_in.trace_loads(); - x_in.trace_loads(); - y_in.trace_loads(); - z_in.trace_loads(); - expnt_in.trace_loads(); - rnorm_in.trace_loads(); - g_fock_in_in.trace_loads(); - g_fock_out.trace_stores(); -#endif /* TRACING */ - } - - void schedule() { - if(auto_schedule) { - /* input scalars */ - delta.set_estimate(0.014); - delo2.set_estimate(0.007); - rdelta.set_estimate(71.4285); - /* input vectors */ - x_in.dim(0).set_estimate(0, 1024); - y_in.dim(0).set_estimate(0, 1024); - z_in.dim(0).set_estimate(0, 1024); - expnt_in.dim(0).set_estimate(0, 1024); - rnorm_in.dim(0).set_estimate(0, 1024); - /* input matrices */ - g_fock_in_in.dim(0).set_estimate(0, 1024); - g_fock_in_in.dim(1).set_estimate(0, 1024); - g_dens_in.dim(0).set_estimate(0, 1024); - g_dens_in.dim(1).set_estimate(0, 1024); - fm_in.dim(0).set_estimate(0, 2001); - fm_in.dim(1).set_estimate(0, 5); - /* output matrix */ - g_fock_out.set_estimate(i, 0, 1024); - g_fock_out.set_estimate(j, 0, 1024); - } else { - /* insert schedule here */ - } - } -}; - -HALIDE_REGISTER_GENERATOR(twoel_generator, twoel); diff --git a/scf/original/Makefile b/scf/original/Makefile index ce42b08..46ff0b6 100755 --- a/scf/original/Makefile +++ b/scf/original/Makefile @@ -1,6 +1,6 @@ ## Makefile for SCF-C -CC = clang +CC = clang-10 CFLAGS = -O3 -mtune=native -march=native OBJFILES = input.o integ.o output.o 
timer.o scf.o diagonalize.o diff --git a/scf/serial/Makefile b/scf/serial/Makefile index ad92c7a..c13737d 100644 --- a/scf/serial/Makefile +++ b/scf/serial/Makefile @@ -1,11 +1,17 @@ ## Makefile for SCF-C -CC = clang++ +CC = clang++-10 +# nothing #CFLAGS = -O3 -mtune=native -march=native -Wall -Wno-deprecated -DNO_PRECALC +# sparsity #CFLAGS = -O3 -mtune=native -march=native -Wall -Wno-deprecated -DNO_PRECALC -DNEGLIGIBILITY_CHECKS +# precomputation #CFLAGS = -O3 -mtune=native -march=native -Wall -Wno-deprecated +# symmetry #CFLAGS = -O3 -mtune=native -march=native -Wall -Wno-deprecated -DNO_PRECALC -DSYMMETRY +# sparsity, symmetry, precomputation CFLAGS = -O3 -mtune=native -march=native -Wall -Wno-deprecated -DNEGLIGIBILITY_CHECKS -DSYMMETRY + LDFLAGS = OBJFILES = input.o integ.o output.o timer.o scf.o diagonalize.o twoel.o