EnzymeAD · wsmoses · Dec 17, 2024 · Dec 17, 2024 · Dec 17, 2024 · Dec 17, 2024
diff --git a/deps/ReactantExtra/API.cpp b/deps/ReactantExtra/API.cpp
@@ -50,7 +50,6 @@
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/pjrt/pjrt_api.h"
 #include "xla/pjrt/pjrt_c_api_client.h"
-#include "xla/service/cpu/simple_orc_jit.h"
 
 #include "xla/python/ifrt/hlo/hlo_program.h"
 #include "llvm/MC/TargetRegistry.h"
@@ -460,15 +459,13 @@ extern "C" void XLAExecute(xla::PjRtLoadedExecutable* exec, int num_args, PjRtBu
     }
 }
 
+void prepareRegistry(mlir::DialectRegistry &registry);
+
 extern "C" void RegisterDialects(MlirContext cctx) {
   mlir::MLIRContext &context = *unwrap(cctx);
-  context.loadDialect<mlir::arith::ArithDialect>();
-  context.loadDialect<mlir::enzyme::EnzymeDialect>();
-  context.loadDialect<mlir::tensor::TensorDialect>();
-  context.loadDialect<mlir::func::FuncDialect>();
-  context.loadDialect<mlir::mhlo::MhloDialect>();
-  context.loadDialect<mlir::stablehlo::StablehloDialect>();
-  context.loadDialect<mlir::chlo::ChloDialect>();
+  DialectRegistry registry;
+  prepareRegistry(registry);
+  context.appendDialectRegistry(registry);
 }
 
 #include "mlir/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.h"

diff --git a/deps/ReactantExtra/BUILD b/deps/ReactantExtra/BUILD
@@ -358,6 +358,7 @@ cc_library(
         ],
 
     ) + [
+	"@enzyme_ad//src/enzyme_ad/jax:RegistryUtils.cpp",
         # "@com_google_protobuf//:src/google/protobuf/io/coded_stream.cc",
         "@xla//xla:xla.pb.cc",
         "@xla//xla:xla_data.pb.cc",
@@ -426,6 +427,7 @@ cc_library(
 "-Wl,-exported_symbol,_ifrt_*",
 "-Wl,-exported_symbol,_RegisterCustomCallTarget",
 "-Wl,-exported_symbol,_ConvertLLVMToMLIR",
+"-Wl,-exported_symbol,_EnzymeGPUCustomCall",
     ]}),
     deps = [
                 "@enzyme//:EnzymeMLIR",
@@ -469,6 +471,9 @@ cc_library(
         "@xla//xla/pjrt:pjrt_c_api_client",
         "@xla//xla/pjrt/cpu:cpu_client",
 
+        "@xla//xla/service:metrics_proto_cc",
+        "@xla//xla/service:metrics_proto_cc_impl",
+
         "@xla//xla/service/cpu:cpu_compiler",
         "@xla//xla/stream_executor/tpu:tpu_on_demand_compiler",
         "@xla//xla/stream_executor/tpu:tpu_executor",

diff --git a/deps/ReactantExtra/WORKSPACE b/deps/ReactantExtra/WORKSPACE
@@ -9,7 +9,7 @@ http_archive(
     urls = ["https://github.com/wsmoses/nsync/archive/{commit}.tar.gz".format(commit = NSYNC_COMMIT)],
 )
 
-ENZYMEXLA_COMMIT = "f6587e37ff7298f2a1a273b08c24d69fca7ff30f"
+ENZYMEXLA_COMMIT = "fb483c06f697990c60cc3c0bda7fb1d730fca3de"
 ENZYMEXLA_SHA256 = ""
 
 http_archive(
@@ -19,6 +19,27 @@ http_archive(
     urls = ["https://github.com/EnzymeAD/Enzyme-JAX/archive/{commit}.tar.gz".format(commit = ENZYMEXLA_COMMIT)],
 )
 
+
+# Hedron's Compile Commands Extractor for Bazel
+# https://github.com/hedronvision/bazel-compile-commands-extractor
+http_archive(
+    name = "hedron_compile_commands",
+
+    # Replace the commit hash (4f28899228fb3ad0126897876f147ca15026151e) in both places (below) with the latest (https://github.com/hedronvision/bazel-compile-commands-extractor/commits/main), rather than using the stale one here.
+    # Even better, set up Renovate and let it do the work for you (see "Suggestion: Updates" in the README).
+    url = "https://github.com/hedronvision/bazel-compile-commands-extractor/archive/4f28899228fb3ad0126897876f147ca15026151e.tar.gz",
+    strip_prefix = "bazel-compile-commands-extractor-4f28899228fb3ad0126897876f147ca15026151e",
+    # When you first run this tool, it'll recommend a sha256 hash to put here with a message like: "DEBUG: Rule 'hedron_compile_commands' indicated that a canonical reproducible form can be obtained by modifying arguments sha256 = ..."
+)
+load("@hedron_compile_commands//:workspace_setup.bzl", "hedron_compile_commands_setup")
+hedron_compile_commands_setup()
+load("@hedron_compile_commands//:workspace_setup_transitive.bzl", "hedron_compile_commands_setup_transitive")
+hedron_compile_commands_setup_transitive()
+load("@hedron_compile_commands//:workspace_setup_transitive_transitive.bzl", "hedron_compile_commands_setup_transitive_transitive")
+hedron_compile_commands_setup_transitive_transitive()
+load("@hedron_compile_commands//:workspace_setup_transitive_transitive_transitive.bzl", "hedron_compile_commands_setup_transitive_transitive_transitive")
+hedron_compile_commands_setup_transitive_transitive_transitive()
+
 load("@enzyme_ad//:workspace.bzl", "JAX_COMMIT", "JAX_SHA256", "ENZYME_COMMIT", "ENZYME_SHA256", "XLA_PATCHES")
 
 XLA_PATCHES = XLA_PATCHES + [
@@ -57,19 +78,6 @@ sed -i.bak0 "s/patch_cmds = \\[/patch_cmds = \\[\\\"find . -type f -name config.
 # """,
 ]
 
-http_archive(
-    name = "rules_cc",
-    sha256 = "85723d827f080c5e927334f1fb18a294c0b3f94fee6d6b45945f5cdae6ea0fd4",
-    strip_prefix = "rules_cc-c8c38f8c710cbbf834283e4777916b68261b359c",
-    urls = [
-        "https://github.com/bazelbuild/rules_cc/archive/c8c38f8c710cbbf834283e4777916b68261b359c.tar.gz",
-    ],
-)
-
-load("@rules_cc//cc:repositories.bzl", "rules_cc_dependencies")
-
-rules_cc_dependencies()
-
 LLVM_TARGETS = select({
     "@bazel_tools//src/conditions:windows": ["AMDGPU", "NVPTX"],
     "@bazel_tools//src/conditions:darwin": [],

diff --git a/ext/ReactantCUDAExt.jl b/ext/ReactantCUDAExt.jl
@@ -317,7 +317,7 @@ function transpose_val(val)
     return MLIR.IR.result(MLIR.Dialects.stablehlo.transpose(val; permutation=attr), 1)
 end
 
-Reactant.@reactant_override @noinline function (func::LLVMFunc{F,tt})(args...; convert=Val(false), blocks::CuDim=1, threads::CuDim=1,
+Reactant.@reactant_overlay @noinline function (func::LLVMFunc{F,tt})(args...; convert=Val(false), blocks::CuDim=1, threads::CuDim=1,
                 cooperative::Bool=false, shmem::Integer=0, call_kwargs...) where{F, tt}
-Reactant.@reactant_overlay @noinline function (func::LLVMFunc{F,tt})(args...; convert=Val(false), blocks::CuDim=1, threads::CuDim=1,
-                cooperative::Bool=false, shmem::Integer=0, call_kwargs...) where{F, tt}
+Reactant.@reactant_overlay @noinline function (func::LLVMFunc{F,tt})(
+    args...;
+    convert=Val(false),
+    blocks::CuDim=1,
+    threads::CuDim=1,
+    cooperative::Bool=false,
+    shmem::Integer=0,
+    call_kwargs...,
+) where {F,tt}
-Reactant.@reactant_overlay @noinline function (func::LLVMFunc{F,tt})(args...; convert=Val(false), blocks::CuDim=1, threads::CuDim=1,
-                cooperative::Bool=false, shmem::Integer=0, call_kwargs...) where{F, tt}
+Reactant.@reactant_overlay @noinline function (func::LLVMFunc{F,tt})(
+    args...;
+    convert=Val(false),
+    blocks::CuDim=1,
+    threads::CuDim=1,
+    cooperative::Bool=false,
+    shmem::Integer=0,
+    call_kwargs...,
+) where {F,tt}
     @show call_kwargs
 
@@ -352,10 +352,34 @@ Reactant.@reactant_override @noinline function (func::LLVMFunc{F,tt})(args...; c
 
     fname = Reactant.TracedUtils.get_attribute_by_name(func.entry, "sym_name")
     # Force public for now while we don't have real users
-    MLIR.IR.rmattr!(func.entry, "sym_visibility")
+    # MLIR.IR.rmattr!(func.entry, "sym_visibility")
+
+    operands = MLIR.IR.Value[]
+    for idx in (blockdim.x, blockdim.y, blockdim.z, threaddim.x, threaddim.y, threaddim.z, shmem)
+        push!(operands, Reactant.TracedUtils.promote_to(Reactant.TracedRNumber{Int}, idx).mlir_data)
-    for idx in (blockdim.x, blockdim.y, blockdim.z, threaddim.x, threaddim.y, threaddim.z, shmem)
-        push!(operands, Reactant.TracedUtils.promote_to(Reactant.TracedRNumber{Int}, idx).mlir_data)
+    for idx in
+        (blockdim.x, blockdim.y, blockdim.z, threaddim.x, threaddim.y, threaddim.z, shmem)
+        push!(
+            operands,
+            Reactant.TracedUtils.promote_to(Reactant.TracedRNumber{Int}, idx).mlir_data,
+        )
-    for idx in (blockdim.x, blockdim.y, blockdim.z, threaddim.x, threaddim.y, threaddim.z, shmem)
-        push!(operands, Reactant.TracedUtils.promote_to(Reactant.TracedRNumber{Int}, idx).mlir_data)
+    for idx in
+        (blockdim.x, blockdim.y, blockdim.z, threaddim.x, threaddim.y, threaddim.z, shmem)
+        push!(
+            operands,
+            Reactant.TracedUtils.promote_to(Reactant.TracedRNumber{Int}, idx).mlir_data,
+        )
+    end
+    for arg in mlir_args
+	push!(operands, arg)
-	push!(operands, arg)
+        push!(operands, arg)
-	push!(operands, arg)
+        push!(operands, arg)
+    end
+    owned_regions = MLIR.IR.Region[]
+    successors = MLIR.IR.Block[]
+    attributes = MLIR.IR.NamedAttribute[
+	MLIR.IR.NamedAttribute("fn", fname),
+	MLIR.IR.NamedAttribute("output_operand_aliases", output_operand_aliases)
+    ]
+
+    location = MLIR.IR.Location()
+    call = MLIR.IR.create_operation(
+        "enzymexla.kern_call",
+        location;
+        operands,
+        owned_regions,
+        successors,
+        attributes,
+        results=restys,
+        result_inference=false,
+    )
 
-    call = MLIR.Dialects.stablehlo.custom_call(mlir_args; result_0=restys, call_target_name="reactant_gpu_call", output_operand_aliases, backend_config=MLIR.IR.Attribute(fname))
-    # call = MLIR.Dialects.stablehlo.custom_call(mlir_args; result_0=restys, call_target_name="reactant_gpu_call", output_operand_aliases, backend_config=MLIR.IR.Attribute(func.mod))
     for (i, res) in enumerate(rarrays)
        res.mlir_data = transpose_val(MLIR.IR.result(call, i))
-       res.mlir_data = transpose_val(MLIR.IR.result(call, i))
+        res.mlir_data = transpose_val(MLIR.IR.result(call, i))
-       res.mlir_data = transpose_val(MLIR.IR.result(call, i))
+        res.mlir_data = transpose_val(MLIR.IR.result(call, i))
     end
@@ -379,7 +403,7 @@ function compiler_cache(ctx::MLIR.IR.Context)
     return cache
 end
 
-Reactant.@reactant_override @noinline function CUDA.cufunction(f::F, tt::TT=Tuple{}; kwargs...) where {F,TT}
+Reactant.@reactant_overlay @noinline function CUDA.cufunction(f::F, tt::TT=Tuple{}; kwargs...) where {F,TT}
-Reactant.@reactant_overlay @noinline function CUDA.cufunction(f::F, tt::TT=Tuple{}; kwargs...) where {F,TT}
+Reactant.@reactant_overlay @noinline function CUDA.cufunction(
+    f::F, tt::TT=Tuple{}; kwargs...
+) where {F,TT}
-Reactant.@reactant_overlay @noinline function CUDA.cufunction(f::F, tt::TT=Tuple{}; kwargs...) where {F,TT}
+Reactant.@reactant_overlay @noinline function CUDA.cufunction(
+    f::F, tt::TT=Tuple{}; kwargs...
+) where {F,TT}
     res = Base.@lock CUDA.cufunction_lock begin
         # compile the function
 	cache = compiler_cache(MLIR.IR.context())
-	cache = compiler_cache(MLIR.IR.context())
+        cache = compiler_cache(MLIR.IR.context())
-	cache = compiler_cache(MLIR.IR.context())
+        cache = compiler_cache(MLIR.IR.context())

diff --git a/src/Compiler.jl b/src/Compiler.jl
@@ -305,6 +305,22 @@ function compile_mlir!(mod, f, args; optimize::Union{Bool,Symbol}=true)
     optimize isa Bool && (optimize = ifelse(optimize, :all, :none))
 
     if optimize === :all
+        run_pass_pipeline!(mod, join([opt_passes, "enzyme-batch", opt_passes], ","))
+        run_pass_pipeline!(mod, "enzyme,arith-raise{stablehlo=true}"; enable_verifier=false)
+        run_pass_pipeline!(
+            mod,
+            join(
+                [
+                    "canonicalize",
+                    "remove-unnecessary-enzyme-ops",
+                    "enzyme-simplify-math",
+                    opt_passes,
+		    "lower-kernel"
+                ],
+                ',',
+            ),
+        )
+    elseif optimize === :before_kernel
         run_pass_pipeline!(mod, join([opt_passes, "enzyme-batch", opt_passes], ","))
         run_pass_pipeline!(mod, "enzyme,arith-raise{stablehlo=true}"; enable_verifier=false)
         run_pass_pipeline!(
@@ -340,6 +356,7 @@ function compile_mlir!(mod, f, args; optimize::Union{Bool,Symbol}=true)
                     "remove-unnecessary-enzyme-ops",
                     "enzyme-simplify-math",
                     opt_passes,
+		    "lower-kernel"
                 ],
                 ',',
             ),
@@ -348,7 +365,7 @@ function compile_mlir!(mod, f, args; optimize::Union{Bool,Symbol}=true)
         run_pass_pipeline!(mod, join([opt_passes, "enzyme-batch", opt_passes], ","))
         run_pass_pipeline!(mod, "enzyme,arith-raise{stablehlo=true}"; enable_verifier=false)
         run_pass_pipeline!(
-            mod, "canonicalize,remove-unnecessary-enzyme-ops,enzyme-simplify-math"
+            mod, "canonicalize,remove-unnecessary-enzyme-ops,enzyme-simplify-math,lower-kernel"
         )
     elseif optimize !== :none
         error("Invalid optimize option: $(Meta.quot(optimize))")

diff --git a/src/XLA.jl b/src/XLA.jl
@@ -131,6 +131,7 @@ function __init__()
         end
     end
 
+    @ccall MLIR.API.mlir_c.RegisterCustomCallTarget("enzymexla_gpu"::Cstring, cglobal((:EnzymeGPUCustomCall, MLIR.API.mlir_c))::Ptr{Cvoid}, "CUDA"::Cstring)::Cvoid
-    @ccall MLIR.API.mlir_c.RegisterCustomCallTarget("enzymexla_gpu"::Cstring, cglobal((:EnzymeGPUCustomCall, MLIR.API.mlir_c))::Ptr{Cvoid}, "CUDA"::Cstring)::Cvoid
+    @ccall MLIR.API.mlir_c.RegisterCustomCallTarget(
+        "enzymexla_gpu"::Cstring,
+        cglobal((:EnzymeGPUCustomCall, MLIR.API.mlir_c))::Ptr{Cvoid},
+        "CUDA"::Cstring,
+    )::Cvoid
-    @ccall MLIR.API.mlir_c.RegisterCustomCallTarget("enzymexla_gpu"::Cstring, cglobal((:EnzymeGPUCustomCall, MLIR.API.mlir_c))::Ptr{Cvoid}, "CUDA"::Cstring)::Cvoid
+    @ccall MLIR.API.mlir_c.RegisterCustomCallTarget(
+        "enzymexla_gpu"::Cstring,
+        cglobal((:EnzymeGPUCustomCall, MLIR.API.mlir_c))::Ptr{Cvoid},
+        "CUDA"::Cstring,
+    )::Cvoid
     return nothing
 end
 

diff --git a/test/cuda.jl b/test/cuda.jl
@@ -19,6 +19,7 @@ end
     oA = collect(1:1:64)
     A = Reactant.to_rarray(oA)
     @show @code_hlo optimize=false square!(A)
+    @show @code_hlo optimize=:before_kernel square!(A)
-    @show @code_hlo optimize=false square!(A)
-    @show @code_hlo optimize=:before_kernel square!(A)
+    @show @code_hlo optimize = false square!(A)
+    @show @code_hlo optimize = :before_kernel square!(A)
-    @show @code_hlo optimize=false square!(A)
-    @show @code_hlo optimize=:before_kernel square!(A)
+    @show @code_hlo optimize = false square!(A)
+    @show @code_hlo optimize = :before_kernel square!(A)
     @show @code_hlo square!(A)
     func = @compile square!(A)
     @test all(Array(A) .≈ (oA .* oA))

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -41,36 +41,4 @@ end
 
 const REACTANT_TEST_GROUP = lowercase(get(ENV, "REACTANT_TEST_GROUP", "all"))
 
-@testset "Reactant.jl Tests" begin
-    if REACTANT_TEST_GROUP == "all" || REACTANT_TEST_GROUP == "core"
-        @safetestset "Layout" include("layout.jl")
-        @safetestset "Tracing" include("tracing.jl")
-        @safetestset "Basic" include("basic.jl")
-        @safetestset "Autodiff" include("autodiff.jl")
-        @safetestset "Complex" include("complex.jl")
-        @safetestset "Broadcast" include("bcast.jl")
-        @safetestset "Struct" include("struct.jl")
-        @safetestset "Closure" include("closure.jl")
-        @safetestset "Compile" include("compile.jl")
-        @safetestset "Buffer Donation" include("buffer_donation.jl")
-        @safetestset "Shortcuts to MLIR ops" include("ops.jl")
-        @safetestset "Wrapped Arrays" include("wrapped_arrays.jl")
-        @safetestset "Control Flow" include("control_flow.jl")
-    end
-
-    if REACTANT_TEST_GROUP == "all" || REACTANT_TEST_GROUP == "integration"
-        @safetestset "Linear Algebra" include("integration/linear_algebra.jl")
-        @safetestset "AbstractFFTs" include("integration/fft.jl")
-    end
-
-    if REACTANT_TEST_GROUP == "all" || REACTANT_TEST_GROUP == "neural_networks"
-        @testset "Neural Networks" begin
-            @safetestset "NNlib Primitives" include("nn/nnlib.jl")
-            @safetestset "Flux.jl Integration" include("nn/flux.jl")
-            if Sys.islinux()
-                @safetestset "LuxLib Primitives" include("nn/luxlib.jl")
-                @safetestset "Lux Integration" include("nn/lux.jl")
-            end
-        end
-    end
-end
+include("cuda.jl")