From 6f947ac46ff8ecd24d47f34c741542325ee8cba8 Mon Sep 17 00:00:00 2001
From: Shizhi Tang
Date: Wed, 17 Apr 2024 17:15:53 +0800
Subject: [PATCH] Emulate float16 on CPU but with warnings (#609)

* Emulate float16 on CPU but with warnings

* Disable CPU float16 in CUDA codegen for now
---
 .gitmodules                     |  3 +++
 3rd-party/half                  |  1 +
 runtime/cpu_runtime.h           |  2 ++
 src/codegen/code_gen_cuda.cc    |  2 +-
 src/codegen/detail/code_gen_c.h |  8 ++++++++
 test/40.codegen/cpu/test_cpu.py | 29 +++++++++++++++++++++++++++++
 6 files changed, 44 insertions(+), 1 deletion(-)
 create mode 160000 3rd-party/half

diff --git a/.gitmodules b/.gitmodules
index 909273d34..bfee93bc1 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -24,3 +24,6 @@
 [submodule "3rd-party/cutlass"]
 	path = 3rd-party/cutlass
 	url = ../../NVIDIA/cutlass.git
+[submodule "3rd-party/half"]
+	path = 3rd-party/half
+	url = ../../suruoxi/half.git
diff --git a/3rd-party/half b/3rd-party/half
new file mode 160000
index 000000000..7cd91f2a3
--- /dev/null
+++ b/3rd-party/half
@@ -0,0 +1 @@
+Subproject commit 7cd91f2a3b5feba92a0eb44ed314e0ddb9962d89
diff --git a/runtime/cpu_runtime.h b/runtime/cpu_runtime.h
index ab84422a9..417093726 100644
--- a/runtime/cpu_runtime.h
+++ b/runtime/cpu_runtime.h
@@ -19,6 +19,8 @@
 #include "mdspan.h"
 #include "unchecked_opt.h"
 
+#include "../3rd-party/half/include/half.hpp"
+
 #define restrict __restrict__
 
 #define __ByValArray std::array
diff --git a/src/codegen/code_gen_cuda.cc b/src/codegen/code_gen_cuda.cc
index e5e27b710..19af26042 100644
--- a/src/codegen/code_gen_cuda.cc
+++ b/src/codegen/code_gen_cuda.cc
@@ -113,7 +113,7 @@ void CodeGenCUDA::genMdPtrDef(const VarDef &def,
 }
 
 std::string CodeGenCUDA::gen(const DataType &dtype) {
-    if (dtype == DataType::Float16) {
+    if (dtype.base() == DataType::Float16) {
         return "__half";
     } else {
         return CodeGenC::gen(dtype);
diff --git a/src/codegen/detail/code_gen_c.h b/src/codegen/detail/code_gen_c.h
index 8adc76e93..2349f1ee5 100644
--- a/src/codegen/detail/code_gen_c.h
+++ b/src/codegen/detail/code_gen_c.h
@@ -767,6 +767,14 @@ std::string CodeGenC::gen(const DataType &dtype) {
         return "double";
     case DataType::Float32:
         return "float";
+    case DataType::Float16:
+        WARNING(
+            "float16 arithmetics on CPU is supported via emulation and comes "
+            "with a performance cost, which is only for compatibility purpose. "
+            "If you intend to do float32 computation on float16 variables, "
+            "please convert them explicitly. Please ignore this warning if you "
+            "are only allocating buffers and not performing arithmetics.");
+        return "half_float::half"; // From 3rd-party/half
     case DataType::Int64:
         return "int64_t";
     case DataType::Int32:
diff --git a/test/40.codegen/cpu/test_cpu.py b/test/40.codegen/cpu/test_cpu.py
index 6de4ac5e7..24bcc7c7b 100644
--- a/test/40.codegen/cpu/test_cpu.py
+++ b/test/40.codegen/cpu/test_cpu.py
@@ -201,3 +201,32 @@ def test(x, y):
 
     y_std = np.array([2, 3, 4, 5], dtype="int32")
     assert np.array_equal(y_np, y_std)
+
+
+def test_float16_compute():
+    # Not testing float16 I/O here
+
+    @ft.transform
+    def test(x, y):
+        x: ft.Var[(4,), "float32", "input"]
+        y: ft.Var[(), "float32", "output"]
+        x16 = ft.empty((4,), "float16")
+        y16 = ft.empty((), "float16")
+        for j in range(4):
+            x16[j] = ft.cast(x[j], "float16")
+        y16[...] = 0
+        for j in range(4):
+            y16[...] += x16[j]
+        y[...] = ft.cast(y16[...], "float32")
+
+    func = ft.lower(test, verbose=1)
+    code = ft.codegen(func, verbose=True)
+    x_np = np.random.uniform(size=(4,)).astype("float32")
+    y_np = np.zeros((), dtype="float32")
+    x_arr = ft.array(x_np)
+    y_arr = ft.array(y_np)
+    ft.build_binary(code)(x=x_arr, y=y_arr)
+    y_np = y_arr.numpy()
+
+    y_std = np.sum(x_np.astype("float16")).astype("float32")
+    assert np.all(np.isclose(y_np, y_std, atol=1e-2))
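Note for reviewers (not part of the patch): the standalone sketch below illustrates what the generated CPU code relies on once CodeGenC emits `half_float::half` for float16 buffers. All arithmetic on that type is emulated in software by the vendored half library, which is why the WARNING above is emitted. The include path and the sample values here are assumptions for illustration only.

// Minimal sketch of the float16 emulation path, mirroring the Python test:
// cast four float32 values to float16, accumulate in float16, widen back.
#include <iostream>

#include "3rd-party/half/include/half.hpp" // path assumed; adjust to your checkout

int main() {
    half_float::half x16[4];
    for (int j = 0; j < 4; ++j) {
        x16[j] = half_float::half(0.25f * (j + 1)); // explicit cast to float16
    }
    half_float::half y16(0.0f);
    for (int j = 0; j < 4; ++j) {
        y16 += x16[j]; // emulated in software; no native CPU float16 arithmetic
    }
    float y = static_cast<float>(y16); // explicit widening back to float32
    std::cout << y << std::endl;       // 0.25 + 0.5 + 0.75 + 1.0 == 2.5 exactly
    return 0;
}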