Skip to content

Commit

Permalink
scf: squashmerge scf/symmetry-halide into master
Browse files Browse the repository at this point in the history
  • Loading branch information
Infinoid committed Aug 28, 2020
1 parent 583fc57 commit 49c1131
Show file tree
Hide file tree
Showing 20 changed files with 2,306 additions and 212 deletions.
8 changes: 1 addition & 7 deletions scf/halide/.gitignore
Original file line number Diff line number Diff line change
@@ -1,13 +1,7 @@
callgrind.out.*
*.o
scf.x
*_gen
*.a
twoel.a
twoel.h
twoel.schedule.h
*.halide_generated.cpp
*.registration.cpp
twoel.*
*.stmt
*.trace
*.png
Expand Down
50 changes: 18 additions & 32 deletions scf/halide/Makefile
Original file line number Diff line number Diff line change
@@ -1,27 +1,25 @@
## Makefile for SCF-C

CC = clang++
CXX = clang++
HALIDE_INCDIR=../../halide/build/include
HALIDE_LIBDIR=../../halide/build/lib
CFLAGS = -O3 -mtune=native -march=native -g -Wall -Wno-deprecated -I$(HALIDE_INCDIR)
CC = clang++-10
CXX = clang++-10
CFLAGS = -O3 -mtune=native -march=native -g -Wall -Wno-deprecated
CXXFLAGS = $(CFLAGS)
CLINK = -L$(HALIDE_LIBDIR) -ldl -lpthread
CXXLINK = -L$(HALIDE_LIBDIR) -lHalide
CLINK = -ldl -lpthread
CXXLINK = -lHalide

TARGET=host-x86-64
#TARGET=$(TARGET)-profile
MACHINE_PARAMS=32,16777216,40

# to use manually defined scheduler in twoel_gen.cpp, leave AUTOSCHEDULER commented.
AUTOSCHEDULER=Mullapudi2016
#AUTOSCHEDULER=Mullapudi2016
#AUTOSCHEDULER=Li2018
#AUTOSCHEDULER=Adams2019

ifneq ($(AUTOSCHEDULER),)
SCHEDLIB=../../halide/build/lib/libauto_schedule.so
SCHEDLIB=/usr/local/lib/x86_64-linux-gnu/libauto_schedule.so
ifeq ($(AUTOSCHEDULER),Li2018)
SCHEDLIB=../../halide/build/lib/libgradient_autoscheduler.so
SCHEDLIB=/usr/local/lib/x86_64-linux-gnu/libgradient_autoscheduler.so
endif
endif

Expand All @@ -42,42 +40,30 @@ trace scf.trace: scf.x be.inpt
HL_TRACE_FILE=scf.trace ./scf.x

pictures: scf.trace
../../halide/build/bin/HalideTraceDump -i scf.trace -t png
HalideTraceDump -i scf.trace -t png

VIDEO_WIDTH=500
VIDEO_HEIGHT=400
VIDEO_WIDTH=1920
VIDEO_HEIGHT=1080
video: scf.trace
rm -f scf.mp4
cat scf.trace | ../../halide/build/bin/HalideTraceViz \
--size $(VIDEO_WIDTH) $(VIDEO_HEIGHT) --zoom 4 --timestep 8 --hold 100 --decay 5 5 --gray \
--move 32 48 --func g_fock_out --move 32 40 --label g_fock_out "g_fock out" 1 --move 240 48 --func g_dens --move 240 40 --label g_dens g_dens 1 \
--move 32 248 --func x --move 32 240 --label x x 1 --move 240 248 --func expnt --move 240 240 --label expnt expnt 1 \
--move 32 292 --func y --move 32 284 --label y y 1 --move 240 292 --func rnorm --move 240 284 --label rnorm rnorm 1 \
--move 32 340 --func z --move 32 332 --label z z 1 \
--zoom 2 \
--move 32 200 --func fm --move 32 192 --label fm fm 1 \
| ffmpeg -f rawvideo -pix_fmt bgr32 -s $(VIDEO_WIDTH)x$(VIDEO_HEIGHT) -i /dev/stdin -c:v h264 scf.mp4
cat scf.trace | HalideTraceViz \
--size $(VIDEO_WIDTH) $(VIDEO_HEIGHT) --zoom 4 --timestep 32 --hold 100 --decay 5 5 --gray --auto_layout \
| ffmpeg -loglevel warning -f rawvideo -pix_fmt bgr32 -s $(VIDEO_WIDTH)x$(VIDEO_HEIGHT) -i /dev/stdin -c:v h264 scf.mp4

vg: scf.x
valgrind ./scf.x

cg: scf.x
valgrind --tool=callgrind ./scf.x

twoel_gen: twoel_gen.cpp
$(CXX) $(CXXFLAGS) -o $@ ../../halide/tools/GenGen.cpp $^ $(CXXLINK)

twoel.a: twoel_gen Makefile
ifeq ($(AUTOSCHEDULER),)
LD_LIBRARY_PATH=$(HALIDE_LIBDIR) ./twoel_gen -g twoel -o . -e static_library,h,schedule,cpp,stmt,registration target=$(TARGET) machine_params=$(MACHINE_PARAMS)
else
LD_LIBRARY_PATH=$(HALIDE_LIBDIR) ./twoel_gen -g twoel -o . -e static_library,h,schedule,cpp,stmt,registration -p $(SCHEDLIB) -s $(AUTOSCHEDULER) target=$(TARGET) auto_schedule=true machine_params=$(MACHINE_PARAMS)
endif
twoel.a: tools/twoel_gen.py tools/decompose.py
time python3 tools/twoel_gen.py

%.o: %.c
$(CC) $(CFLAGS) -c -o $@ $<

scf.o: twoel.a

clean:
rm -f *~ *.o $(MTARGET) *.a *_gen *.stmt *.halide_generated.cpp twoel.h twoel.schedule.h twoel.registration.cpp *.trace *.png *.mp4
rm -rf *~ *.o $(MTARGET) twoel.* *.trace *.png *.mp4 tools/__pycache__
make -C test clean
24 changes: 1 addition & 23 deletions scf/halide/be.inpt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
30
8

4 0.000 0.000 0.000
4 4.000 0.000 0.000
Expand All @@ -8,25 +8,3 @@
4 20.000 0.000 0.000
4 24.000 0.000 0.000
4 28.000 0.000 0.000
4 32.000 0.000 0.000
4 36.000 0.000 0.000
4 40.000 0.000 0.000
4 44.000 0.000 0.000
4 48.000 0.000 0.000
4 52.000 0.000 0.000
4 56.000 0.000 0.000
4 60.000 0.000 0.000
4 64.000 0.000 0.000
4 68.000 0.000 0.000
4 72.000 0.000 0.000
4 76.000 0.000 0.000
4 80.000 0.000 0.000
4 84.000 0.000 0.000
4 88.000 0.000 0.000
4 92.000 0.000 0.000
4 96.000 0.000 0.000
4 100.000 0.000 0.000
4 104.000 0.000 0.000
4 108.000 0.000 0.000
4 112.000 0.000 0.000
4 116.000 0.000 0.000
3 changes: 2 additions & 1 deletion scf/halide/scf.c
Original file line number Diff line number Diff line change
Expand Up @@ -84,13 +84,14 @@ int main(int argc, char **argv) {

// compute the two particle contributions to the fock matrix and get the total energy;
{
Halide::Runtime::Buffer<double> etwo_buffer = Halide::Runtime::Buffer<double>::make_scalar();
Halide::Runtime::Buffer<double, 1> etwo_buffer(1);
extern double rdelta, delta, delo2; // integ.c
extern Halide::Runtime::Buffer<double> fm; // integ.c
int error = twoel(delo2, delta, rdelta, expnt_buf, rnorm_buf, x_buf, y_buf, z_buf, fm_buf, *g_fock_buf, g_dens_buf, etwo_buffer, *g_fock_out_buf);
assert(!error);
swap_g_fock();
#ifdef TRACING
printf("twoel took %f seconds\n", timer());
exit(0);
#endif /* TRACING */
etwo = etwo_buffer(0);
Expand Down
7 changes: 7 additions & 0 deletions scf/halide/test/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
/twoel_g.*
/twoel.*
faketwoel.trace
/fake

perf.data
/__pycache__
43 changes: 43 additions & 0 deletions scf/halide/test/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
## Makefile for SCF-C
## Builds and drives "fake", a standalone harness that calls the generated
## twoel() pipeline, plus trace/video helpers for the same pipeline.

# Drop a half-written target if its recipe fails, so a broken twoel.a or
# twoel.trace is never mistaken for an up-to-date one.
.DELETE_ON_ERROR:

# These goals are actions, not files; without this a stray file named
# "clean" or "run" would silently disable the goal.
.PHONY: run perf video clean

CXX=clang++-10
CXXFLAGS=-O2 -g -Wall -fdiagnostics-color=always
CXXLINK=-ldl -lpthread

THREADS=1
TILE_SIZE=15
VECTOR_SIZE=4
DATA_SIZE=120
# Zone selection handed to fakegen.py.  To profile a single zone, override
# on the command line, e.g.  make PERF_FUNCS=4D_ij_low_kl_low_pairs_low
PERF_FUNCS=all

TRACE_DATA_SIZE=15
TRACE_VIDEO_WIDTH=1920
TRACE_VIDEO_HEIGHT=1080
TRACE_FUNCS=all

# Default goal: build the harness and run it once.
run: fake
	HL_NUM_THREADS=$(THREADS) ./fake $(DATA_SIZE)

perf: fake
	HL_NUM_THREADS=$(THREADS) perf record ./fake $(DATA_SIZE)
	perf report

# fakegen.py emits twoel.a and twoel.h (and other twoel.* files) in a single
# run.  Only twoel.a carries the generator recipe, so a parallel build cannot
# start the generator twice; twoel.h is chained behind twoel.a.
twoel.a: fakegen.py ../tools/twoel_gen.py ../tools/decompose.py Makefile
	python3 fakegen.py $(PERF_FUNCS) tilesize=$(TILE_SIZE) vectorsize=$(VECTOR_SIZE)

# If twoel.h was deleted by hand, force the generator to run again; otherwise
# just refresh its timestamp so it is not older than twoel.a.
twoel.h: twoel.a
	@test -f $@ || { rm -f twoel.a && $(MAKE) twoel.a; }
	@touch $@

fake: fake.cpp twoel.a twoel.h
	$(CXX) $(CXXFLAGS) fake.cpp twoel.a -o $@ $(CXXLINK)

twoel.trace: faketwoel.py ../tools/twoel_gen.py ../tools/decompose.py Makefile
	rm -f $@
	HL_NUM_THREADS=$(THREADS) HL_TRACE_FILE=$@ python3 faketwoel.py all datasize=$(TRACE_DATA_SIZE) tilesize=$(TILE_SIZE) vectorsize=$(VECTOR_SIZE) itercount=0 tracing=True

video: twoel.trace
	rm -f twoel.mp4
	time cat twoel.trace | HalideTraceViz \
		--size $(TRACE_VIDEO_WIDTH) $(TRACE_VIDEO_HEIGHT) --zoom 4 --timestep 15 --hold 100 --decay 5 5 --gray --auto_layout \
		| ffmpeg -loglevel warning -f rawvideo -pix_fmt bgr32 -s $(TRACE_VIDEO_WIDTH)x$(TRACE_VIDEO_HEIGHT) -i /dev/stdin -c:v h264 twoel.mp4

clean:
	rm -rf *.trace twoel.* twoel_g.* fake perf.* __pycache__
122 changes: 122 additions & 0 deletions scf/halide/test/fake.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
/* Standalone program that calls twoel() the same way SCF does, and measures how long it takes. */

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <sys/times.h>
#include <unistd.h>

#include <algorithm>
#include <vector>
#include <sstream>

#include <HalideBuffer.h>

#include "twoel.h"

using namespace Halide::Runtime;

int N;

/* Produce one scalar test value: the next number handed out by drand48(). */
double gen0d() {
    const double sample = drand48();
    return sample;
}

/*
 * Build a 1-D buffer of drand48() values.
 * I is the requested length; 0 (the default) means "use the global N".
 */
Buffer<double> gen1d(int I=0) {
    const int extent = (I == 0) ? N : I;
    Buffer<double> rv(extent);
    int idx = 0;
    while(idx < extent) {
        rv(idx) = drand48();
        ++idx;
    }
    return rv;
}

/*
 * Build a 2-D buffer of drand48() values.
 * I and J are the requested extents of the first and second index; either
 * one left at 0 (the default) falls back to the global N.
 * Elements are filled with the first index in the outer loop, so the value
 * each element receives for a given seed is fixed by that order.
 */
Buffer<double> gen2d(int I=0, int J=0) {
    const int extent0 = (I == 0) ? N : I;
    const int extent1 = (J == 0) ? N : J;
    Buffer<double> rv(extent0, extent1);
    for(int a = 0; a < extent0; ++a) {
        for(int b = 0; b < extent1; ++b) {
            rv(a, b) = drand48();
        }
    }
    return rv;
}

/* Current gettimeofday() reading as seconds, with the microsecond part as the fraction. */
double timestamp() {
    struct timeval now;
    gettimeofday(&now, NULL);
    const double fraction = (double)now.tv_usec / 1000000;
    return fraction + now.tv_sec;
}

/* User-mode CPU time of this process in clock ticks, as reported by times(). */
long cputickstamp() {
    struct tms usage;
    times(&usage);
    const long user_ticks = usage.tms_utime;
    return user_ticks;
}

int main(int argc, char **argv) {
if(argc < 2) {
fprintf(stderr, "Usage: %s <N>\n", argv[0]);
return 1;
}
N = strtol(argv[1], NULL, 0);
srand(2);
srand48(rand());

double delo2 = gen0d();
double delta = gen0d();
double rdelta = gen0d();
Buffer<double> expnt = gen1d();
Buffer<double> rnorm = gen1d();
Buffer<double> x = gen1d();
Buffer<double> y = gen1d();
Buffer<double> z = gen1d();
Buffer<double> fm = gen2d(1002, 5);
Buffer<double> g_fock_in = gen2d();
Buffer<double> g_dens = gen2d();
Buffer<double> g_fock_out = gen2d();
Buffer<double> rv = gen1d();

// dry run
int error = twoel(delo2, delta, rdelta, expnt, rnorm, x, y, z, fm, g_fock_in, g_dens, rv, g_fock_out);
if(error) {
fprintf(stderr, "twoel failed with code %d\n", error);
return 1;
}

// benchmark it
std::vector<double> throughputs = {};
for(int trial = 0; trial < 4; trial++) {
double start_walltime = timestamp();
clock_t start_cputicks = cputickstamp();
int itercount;
for(itercount = 0; timestamp() - start_walltime < 5.0; itercount++) {
twoel(delo2, delta, rdelta, expnt, rnorm, x, y, z, fm, g_fock_in, g_dens, rv, g_fock_out);
}
clock_t cputicks = cputickstamp() - start_cputicks;
double walltime = timestamp() - start_walltime;
double cputime = (double)cputicks / sysconf(_SC_CLK_TCK);
double per_walltime = walltime / itercount;
double per_cputime = cputime / itercount;
double throughput = (double)1.0 * N * N * N * N / per_walltime;
printf("%d iterations in %.3f seconds, %.3f seconds of cpu time, %.3e seconds per iter, %.3e cpu seconds per iter, %.3e effective iters per second\n", itercount, walltime, cputime, per_walltime, per_cputime, throughput);
throughputs.push_back(throughput);
}
// sort and stringify the throughput values
std::sort(throughputs.begin(), throughputs.end());
std::ostringstream stringify;
for(int i = 0; i < throughputs.size(); i++) {
if(i)
stringify << ", ";
stringify << throughputs[i];
}
printf("throughputs: {%s}\n", stringify.str().c_str());

return 0;
}
69 changes: 69 additions & 0 deletions scf/halide/test/fakegen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/usr/bin/env python3

'''generate a twoel.a and twoel.h with only the specified zones, and with any scheduling params (thread count, vector/block sizes) passed through'''

import sys
sys.path.append('../tools')
import halide as hl
import twoel_gen

def gen_twoel(zone_name, **kwargs):
    """Compile twoel restricted to the requested zones and write twoel.* outputs.

    zone_name: 'all', 'list', or a comma-separated list of zone names.
        'list' (when it matches no zone) prints the available zone names.
    kwargs: passed through to twoel_gen.Generate_twoel, except for
        'target_name', which is consumed here as the Halide target string.

    Reads the module-level `zones` object (set up by the __main__ block) and
    replaces its `loops` attribute with the selected subset.
    Exits with status 1 when no zone matches.
    """
    # pick the zones that were asked for
    zone_names = zone_name.split(",")
    myzones = [
        zone for zone in zones.loops
        if zone_name == 'all' or zone.name in zone_names
    ]
    if len(myzones) == 0:
        if zone_name == 'list':
            # use attribute access, matching the selection above
            print([z.name for z in zones.loops])
        else:
            print("no zone %s found"%zone_name)
        sys.exit(1)

    # target_name is ours; everything else belongs to the generator
    target_name = kwargs.pop(
        "target_name",
        "x86-64-linux-avx-avx2-f16c-fma-sse41-profile-disable_llvm_loop_opt",
    )

    zones.loops = myzones
    gen = twoel_gen.Generate_twoel(loopnests=zones, **kwargs)
    gen.generate_twoel()
    p = gen.pipeline
    print("generating for target", target_name)
    target = hl.Target(target_name)
    p.compile_to(
        {
            hl.Output.c_header: "twoel.h",
            hl.Output.c_source: "twoel.cpp",
            hl.Output.static_library: "twoel.a",
            hl.Output.stmt: "twoel.stmt",
            hl.Output.stmt_html: "twoel.html",
            # the following outputs are useful for running it from python
            #hl.Output.object: "twoel.o",
            #hl.Output.python_extension: "twoel.py.cpp",
        }, list(gen.inputs.values()), "twoel", target
    )

def parse_kwarg(param):
    """Split one 'key=value' command-line word into (key, typed_value).

    The value becomes an int when it parses as one, a bool when it is
    'true'/'false' (any letter case), and is otherwise kept as a string.
    Only the first '=' separates key from value, so values may contain '='.
    Raises ValueError when the word has no '=' or an empty key.
    """
    key, sep, raw = param.partition("=")
    if not sep or not key:
        raise ValueError("expected key=value, got %r" % (param,))
    try:
        return key, int(raw)
    except ValueError:
        pass
    lowered = raw.lower()
    if lowered in ("true", "false"):
        # bool(raw) would be True for any non-empty text, including "False"
        return key, lowered == "true"
    return key, raw


if __name__ == "__main__":
    if len(sys.argv) == 1:
        print("Usage: %s <zonename>"%sys.argv[0])
        sys.exit(1)

    zone_name = sys.argv[1]

    # remaining words are key=value scheduling/generator parameters
    kwargs = {}
    for param in sys.argv[2:]:
        k, v = parse_kwarg(param)
        kwargs[k] = v

    # gen_twoel reads this module-level name
    zones = twoel_gen.define_original_twoel_zone().split_recursive()

    gen_twoel(zone_name, **kwargs)
Loading

0 comments on commit 49c1131

Please sign in to comment.