From 7d64d0a8e5c6e511018cf0566e78dc082babbf70 Mon Sep 17 00:00:00 2001
From: Mamy Ratsimbazafy
Date: Fri, 2 Aug 2024 10:41:03 +0200
Subject: [PATCH] research: update LLVM x86 compiler

---
 constantine/math_compiler/ir.nim            |   9 +-
 research/codegen/nim.cfg                    |   1 -
 research/codegen/x86_inlineasm.nim          |   6 +-
 research/codegen/{x86.nim => x86_instr.nim} |   9 +-
 research/codegen/x86_poc.nim                | 327 ++++++++++++++++++++
 5 files changed, 342 insertions(+), 10 deletions(-)
 delete mode 100644 research/codegen/nim.cfg
 rename research/codegen/{x86.nim => x86_instr.nim} (94%)
 create mode 100644 research/codegen/x86_poc.nim

diff --git a/constantine/math_compiler/ir.nim b/constantine/math_compiler/ir.nim
index 43aa8eaf..1523fdab 100644
--- a/constantine/math_compiler/ir.nim
+++ b/constantine/math_compiler/ir.nim
@@ -7,7 +7,8 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.

 import
-  constantine/math/config/[curves, precompute],
+  constantine/named/algebras,
+  constantine/named/deriv/precompute,
   constantine/math/io/io_bigints,
   constantine/platforms/[primitives, bithacks],
   constantine/platforms/llvm/llvm,
@@ -30,6 +31,7 @@ type

   Backend* = enum
     bkNvidiaPTX
+    bkX86_64_Linux

   FnDef* = tuple[fnTy: TypeRef, fnImpl: ValueRef]
     # calling getTypeOf on a ValueRef function
@@ -51,6 +53,9 @@ proc new*(T: type Assembler_LLVM, backend: Backend, moduleName: cstring): Assemb
     result.module.setTarget("nvptx64-nvidia-cuda")
     # Datalayout for NVVM IR 1.8 (CUDA 11.6)
     result.module.setDataLayout("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64")
+  of bkX86_64_Linux:
+    {.warning : "The x86 LLVM backend is incomplete and for research purposes only".}
+    result.module.setTarget("x86_64-pc-linux-gnu")

   result.builder = result.ctx.createBuilder()
   result.i1_t = result.ctx.int1_t()
@@ -188,7 +193,7 @@ type
     spareBits*: uint8

   CurveMetadata* = object
-    curve*: Curve
+    curve*: Algebra
     prefix*: string
     wordSize*: WordSize
     fp*: FieldConst
diff --git a/research/codegen/nim.cfg b/research/codegen/nim.cfg
deleted file mode 100644
index c1344003..00000000
--- a/research/codegen/nim.cfg
+++ /dev/null
@@ -1 +0,0 @@
---path:constantine/platforms/code_generator
diff --git a/research/codegen/x86_inlineasm.nim b/research/codegen/x86_inlineasm.nim
index 9f604b98..d3e49761 100644
--- a/research/codegen/x86_inlineasm.nim
+++ b/research/codegen/x86_inlineasm.nim
@@ -8,7 +8,7 @@

 import
   std/[macros, strutils],
-  ./llvm
+  constantine/platforms/llvm/llvm

 # ############################################################
 #
@@ -85,7 +85,7 @@ macro genInstr(body: untyped): untyped =

     instrBody.add quote do:
-      let `asmString` = if numBits == 64: static(`instr` & "q") & static(" " & `instrParam`)
+      let `asmString` = if `numBits` == 64: static(`instr` & "q") & static(" " & `instrParam`)
                         else: static(`instr` & "l") & static(" " & `instrParam`)

     instrBody.add quote do:
@@ -206,4 +206,4 @@ genInstr():
   op adcx_rr: ("adcx", "%2, %0;", "=r,%0,r", [lhs, rhs])
   op adcx_rm: ("adcx", "%2, %0;", "=r,0,m", [lhs, rhs])
   op adox_rr: ("adox", "%2, %0;", "=r,%0,r", [lhs, rhs])
-  op adox_rm: ("adox", "%2, %0;", "=r,0,m", [lhs, rhs])
\ No newline at end of file
+  op adox_rm: ("adox", "%2, %0;", "=r,0,m", [lhs, rhs])
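+
+  # Legend for the LLVM inline-asm constraint strings above (editor's note):
+  #   "=r"  output in any general-purpose register
+  #   "0"   input tied to the same register as operand 0 (the output)
+  #   "%"   the operand it prefixes is commutable with the next input
+  #   "r"   input in any register; "m" input in memory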
diff --git a/research/codegen/x86.nim b/research/codegen/x86_instr.nim
similarity index 94%
rename from research/codegen/x86.nim
rename to research/codegen/x86_instr.nim
index 2dd83ab9..1e6211df 100644
--- a/research/codegen/x86.nim
+++ b/research/codegen/x86_instr.nim
@@ -7,10 +7,11 @@
 # at your option. This file may not be copied, modified, or distributed except according to those terms.

 import
-  ./bindings/c_abi,
-  ./llvm, ./ir,
-  ./x86_inlineasm,
-  ../primitives
+  constantine/platforms/llvm/bindings/c_abi,
+  constantine/platforms/llvm/llvm,
+  constantine/platforms/primitives,
+  constantine/math_compiler/ir,
+  ./x86_inlineasm

 export x86_inlineasm
diff --git a/research/codegen/x86_poc.nim b/research/codegen/x86_poc.nim
new file mode 100644
index 00000000..30782615
--- /dev/null
+++ b/research/codegen/x86_poc.nim
@@ -0,0 +1,327 @@
+# Constantine
+# Copyright (c) 2018-2019 Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  constantine/platforms/llvm/llvm,
+  constantine/platforms/primitives,
+  constantine/math_compiler/ir,
+  ./x86_instr
+
+echo "LLVM JIT compiler: Multiplication with MULX/ADOX/ADCX"
+
+proc big_mul_gen(asy: Assembler_LLVM): FnDef =
+
+  let procName = "big_mul_64x4"
+  let N = 4
+  let ty = array_t(asy.i64_t, N)
+  let pty = pointer_t(ty)
+
+  let bigMulTy = function_t(asy.void_t, [pty, pty, pty])
+  let bigMulKernel = asy.module.addFunction(cstring procName, bigMulTy)
+  let blck = asy.ctx.appendBasicBlock(bigMulKernel, "bigMulBody")
+  asy.builder.positionAtEnd(blck)
+
+  let bld = asy.builder
+
+  let (hiTy, hiKernel) = asy.defHi(64)
+  proc hi(builder: BuilderRef, a: ValueRef): ValueRef =
+    return builder.call2(
+      hiTy, hiKernel,
+      [a], "hi64_"
+    )
+
+  let (loTy, loKernel) = asy.defLo(64)
+  proc lo(builder: BuilderRef, a: ValueRef): ValueRef =
+    return builder.call2(
+      loTy, loKernel,
+      [a], "lo64_"
+    )
+
+  let (mulExtTy, mulExtKernel) = asy.defMulExt(64)
+  bld.positionAtEnd(blck)
+
+  proc mulx(builder: BuilderRef, a, b: ValueRef): tuple[hi, lo: ValueRef] =
+    # LLVM does not support multiple return values at the moment
+    # https://nondot.org/sabre/LLVMNotes/MultipleReturnValues.txt
+    # so we don't create a dedicated LLVM function
+    let t = builder.call2(
+      mulExtTy, mulExtKernel,
+      [a, b], "mulx64_"
+    )
+
+    builder.positionAtEnd(blck)
+    let lo = builder.lo(t)
+    let hi = builder.hi(t)
+    return (hi, lo)
+
+  let r = bld.asArray(bigMulKernel.getParam(0), ty)
+  let a = bld.asArray(bigMulKernel.getParam(1), ty)
+  let b = bld.asArray(bigMulKernel.getParam(2), ty)
+
+  let t = bld.makeArray(ty)
+
+  block: # i = 0
+    # TODO: properly implement add/adc in pure LLVM
+
+    # TODO: ensure flags are cleared properly, the compiler might optimize this away
+    t[0] = bld.`xor`(t[0], t[0])
+    let (hi, lo) = bld.mulx(a[0], b[0])
+    r[0] = lo
+    t[0] = hi
+
+    for j in 1 ..< N:
+      let (hi, lo) = bld.mulx(a[j], b[0])
+      t[j] = hi
+      # SHOWSTOPPER: LLVM ERROR: Inline asm not supported by this streamer because we don't have an asm parser for this target
+      discard bld.adcx_rr(t[j-1], lo) # Replace by LLVM IR uadd_with_overflow (see the defAdc64 sketch below)
+
+    # SHOWSTOPPER: LLVM ERROR: Inline asm not supported by this streamer because we don't have an asm parser for this target
+    discard bld.adcx_rr(t[N-1], 0)
+
+  # TODO: rotate the t array
+
+  # TODO: implement iterations i in 1 ..< N
+
+  bld.store(r, t)
+  bld.retVoid()
+  return (bigMulTy, bigMulKernel)
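+
+# Editor's sketch for the "Replace by LLVM IR uadd_with_overflow" TODO above.
+# Instead of the @llvm.uadd.with.overflow.i64 intrinsic (whose binding is not
+# shown in this PoC), it widens to i128 so that only zext/add builder calls of
+# the kind defMulExt/defHi/defLo already rely on are needed. Assumptions, not
+# confirmed Constantine API: asy.i128_t exists alongside asy.i64_t, and the
+# llvm wrapper exposes zext/add/ret builders. Not wired into big_mul_gen; the
+# point is that the carry lives in a plain value (bit 64 of the result), so
+# the backend can select ADC/ADCX itself without inline asm.
+proc defAdc64(asy: Assembler_LLVM): FnDef =
+  ## Computes a + b + carryIn (carryIn in {0, 1}), returned as an i128
+  ## whose low 64 bits are the sum and whose bit 64 is the carry out.
+  let adcTy = function_t(asy.i128_t, [asy.i64_t, asy.i64_t, asy.i64_t])
+  let adcKernel = asy.module.addFunction(cstring "hw_adc64", adcTy)
+  let blck = asy.ctx.appendBasicBlock(adcKernel, "adcBody")
+  let bld = asy.builder
+  bld.positionAtEnd(blck)
+  # Widen the three operands; the sum of two i64 plus a carry fits in 65 bits.
+  let a = bld.zext(adcKernel.getParam(0), asy.i128_t)
+  let b = bld.zext(adcKernel.getParam(1), asy.i128_t)
+  let c = bld.zext(adcKernel.getParam(2), asy.i128_t)
+  bld.ret(bld.add(bld.add(a, b), c))
+  return (adcTy, adcKernel)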
+when isMainModule:
+  # Same Assembler_LLVM abstraction as for Nvidia PTX, now with the x86-64 backend
+  let asy = Assembler_LLVM.new(bkX86_64_Linux, cstring("x86_poc"))
+  let bigMul = asy.big_mul_gen()
+
+  asy.module.verify(AbortProcessAction)
+
+  echo "========================================="
+  echo "LLVM IR\n"
+
+  echo asy.module
+  echo "========================================="
+
+  var engine: ExecutionEngineRef
+  initializeFullNativeTarget()
+  createJITCompilerForModule(engine, asy.module, optLevel = 0)
+
+  let jitMul = cast[proc(r: var array[4, uint64], a, b: array[4, uint64]){.noconv.}](
+    engine.getFunctionAddress("big_mul_64x4")
+  )
+
+  var r: array[4, uint64]
+  r.jitMul([uint64 1, 2, 3, 4], [uint64 1, 1, 1, 1])
+  echo "jitMul = ", r
+
+  # block:
+  #   Cleanup - Assembler_LLVM is auto-managed
+  #   engine.dispose() # also destroys the module attached to it, which double-frees Assembler_LLVM's asy.module
+  echo "LLVM JIT - calling big_mul_64x4 SUCCESS"
+
+  # --------------------------------------------
+  # See the assembly. Note that it may differ from what the JIT compiler executed.
+
+  const triple = "x86_64-pc-linux-gnu"
+
+  let machine = createTargetMachine(
+    target = toTarget(triple),
+    triple = triple,
+    cpu = "",
+    features = "adx,bmi2", # TODO: check the proper way to pass target features (likely "+adx,+bmi2", LLVM enables features with a '+' prefix)
+    level = CodeGenLevelAggressive,
+    reloc = RelocDefault,
+    codeModel = CodeModelDefault
+  )
+
+  let pbo = createPassBuilderOptions()
+  pbo.setMergeFunctions()
+  let err = asy.module.runPasses(
+    "default,function-attrs,memcpyopt,sroa,mem2reg,gvn,dse,instcombine,inline,adce",
+    machine,
+    pbo
+  )
+  if not err.pointer().isNil():
+    writeStackTrace()
+    let errMsg = err.getErrorMessage()
+    stderr.write("\"codegenX86_64\" for module '" & astToStr(module) & "' " & $instantiationInfo() &
+                 " exited with error: " & $cstring(errMsg) & '\n')
+    errMsg.dispose()
+    quit 1
+
+  echo "========================================="
+  echo "Assembly\n"
+
+  echo machine.emitToString(asy.module, AssemblyFile)
+  echo "========================================="
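+
+# Editor's note on the output below: the adcx_rr results in big_mul_gen are
+# `discard`ed rather than stored back into t, so the additions are dead code.
+# The optimized assembly therefore contains no adcx/adox at all, and only the
+# high words of each 64x64 product reach r; with these small inputs every high
+# word is 0, which matches jitMul printing [0, 0, 0, 0].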
+
+# Output
+# ------------------------------------------------------------------
+
+#[
+  LLVM JIT compiler: Multiplication with MULX/ADOX/ADCX
+  =========================================
+  LLVM IR
+
+  ; ModuleID = 'x86_poc'
+  source_filename = "x86_poc"
+  target triple = "x86_64-pc-linux-gnu"
+
+  define void @big_mul_64x4(ptr %0, ptr %1, ptr %2) {
+  bigMulBody:
+    %3 = alloca [4 x i64], align 8
+    %4 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 0
+    %5 = load i64, ptr %4, align 4
+    %6 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 0
+    %7 = load i64, ptr %6, align 4
+    %8 = xor i64 %5, %7
+    %9 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 0
+    store i64 %8, ptr %9, align 4
+    %10 = getelementptr inbounds [4 x i64], ptr %1, i32 0, i32 0
+    %11 = load i64, ptr %10, align 4
+    %12 = getelementptr inbounds [4 x i64], ptr %2, i32 0, i32 0
+    %13 = load i64, ptr %12, align 4
+    %mulx64_ = call i128 @hw_mulExt64(i64 %11, i64 %13)
+    %lo64_ = call i64 @hw_lo64(i128 %mulx64_)
+    %hi64_ = call i64 @hw_hi64(i128 %mulx64_)
+    %14 = getelementptr inbounds [4 x i64], ptr %0, i32 0, i32 0
+    store i64 %lo64_, ptr %14, align 4
+    %15 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 0
+    store i64 %hi64_, ptr %15, align 4
+    %16 = getelementptr inbounds [4 x i64], ptr %1, i32 0, i32 1
+    %17 = load i64, ptr %16, align 4
+    %18 = getelementptr inbounds [4 x i64], ptr %2, i32 0, i32 0
+    %19 = load i64, ptr %18, align 4
+    %mulx64_1 = call i128 @hw_mulExt64(i64 %17, i64 %19)
+    %lo64_2 = call i64 @hw_lo64(i128 %mulx64_1)
+    %hi64_3 = call i64 @hw_hi64(i128 %mulx64_1)
+    %20 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 1
+    store i64 %hi64_3, ptr %20, align 4
+    %21 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 0
+    %22 = load i64, ptr %21, align 4
+    %23 = call i64 asm "adcxq %2, %0;", "=r,%0,r"(i64 %22, i64 %lo64_2)
+    %24 = getelementptr inbounds [4 x i64], ptr %1, i32 0, i32 2
+    %25 = load i64, ptr %24, align 4
+    %26 = getelementptr inbounds [4 x i64], ptr %2, i32 0, i32 0
+    %27 = load i64, ptr %26, align 4
+    %mulx64_4 = call i128 @hw_mulExt64(i64 %25, i64 %27)
+    %lo64_5 = call i64 @hw_lo64(i128 %mulx64_4)
+    %hi64_6 = call i64 @hw_hi64(i128 %mulx64_4)
+    %28 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 2
+    store i64 %hi64_6, ptr %28, align 4
+    %29 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 1
+    %30 = load i64, ptr %29, align 4
+    %31 = call i64 asm "adcxq %2, %0;", "=r,%0,r"(i64 %30, i64 %lo64_5)
+    %32 = getelementptr inbounds [4 x i64], ptr %1, i32 0, i32 3
+    %33 = load i64, ptr %32, align 4
+    %34 = getelementptr inbounds [4 x i64], ptr %2, i32 0, i32 0
+    %35 = load i64, ptr %34, align 4
+    %mulx64_7 = call i128 @hw_mulExt64(i64 %33, i64 %35)
+    %lo64_8 = call i64 @hw_lo64(i128 %mulx64_7)
+    %hi64_9 = call i64 @hw_hi64(i128 %mulx64_7)
+    %36 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 3
+    store i64 %hi64_9, ptr %36, align 4
+    %37 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 2
+    %38 = load i64, ptr %37, align 4
+    %39 = call i64 asm "adcxq %2, %0;", "=r,%0,r"(i64 %38, i64 %lo64_8)
+    %40 = getelementptr inbounds [4 x i64], ptr %3, i32 0, i32 3
+    %41 = load i64, ptr %40, align 4
+    %42 = call i64 asm "adcxq %2, %0;", "=r,%0,r"(i64 %41, i64 0)
+    %43 = load [4 x i64], ptr %3, align 4
+    store [4 x i64] %43, ptr %0, align 4
+    ret void
+  }
+
+  define i64 @hw_hi64(i128 %0) {
+  hiBody:
+    %1 = lshr i128 %0, 64
+    %2 = trunc i128 %1 to i64
+    ret i64 %2
+  }
+
+  define i64 @hw_lo64(i128 %0) {
+  loBody:
+    %1 = trunc i128 %0 to i64
+    ret i64 %1
+  }
+
+  define i128 @hw_mulExt64(i64 %0, i64 %1) {
+  mulExtBody:
+    %2 = zext i64 %0 to i128
+    %3 = zext i64 %1 to i128
+    %4 = mul i128 %2, %3
+    ret i128 %4
+  }
+
+  =========================================
+  jitMul = [0, 0, 0, 0]
+  LLVM JIT - calling big_mul_64x4 SUCCESS
+  =========================================
+  Assembly
+
+      .text
+      .file "x86_poc"
+      .globl big_mul_64x4
+      .p2align 4, 0x90
+      .type big_mul_64x4,@function
+  big_mul_64x4:
+      .cfi_startproc
+      movq %rdx, %rcx
+      movq (%rdx), %rax
+      mulq (%rsi)
+      movq %rdx, %r8
+      movq %rax, (%rdi)
+      movq (%rcx), %rcx
+      movq %rcx, %rax
+      mulq 8(%rsi)
+      movq %rdx, %r9
+      movq %rcx, %rax
+      mulq 16(%rsi)
+      movq %rdx, %r10
+      movq %rcx, %rax
+      mulq 24(%rsi)
+      movq %r8, (%rdi)
+      movq %r9, 8(%rdi)
+      movq %r10, 16(%rdi)
+      movq %rdx, 24(%rdi)
+      retq
+  .Lfunc_end0:
+      .size big_mul_64x4, .Lfunc_end0-big_mul_64x4
+      .cfi_endproc
+
+      .globl hw_hi64
+      .p2align 4, 0x90
+      .type hw_hi64,@function
+  hw_hi64:
+      movq %rsi, %rax
+      retq
+  .Lfunc_end1:
+      .size hw_hi64, .Lfunc_end1-hw_hi64
+
+      .globl hw_lo64
+      .p2align 4, 0x90
+      .type hw_lo64,@function
+  hw_lo64:
+      movq %rdi, %rax
+      retq
+  .Lfunc_end2:
+      .size hw_lo64, .Lfunc_end2-hw_lo64
+
+      .globl hw_mulExt64
+      .p2align 4, 0x90
+      .type hw_mulExt64,@function
+  hw_mulExt64:
+      movq %rsi, %rax
+      mulq %rdi
+      retq
+  .Lfunc_end3:
+      .size hw_mulExt64, .Lfunc_end3-hw_mulExt64
+
+      .section ".note.GNU-stack","",@progbits
+
+  =========================================
+]#