Emulate float16 on CPU but with warnings (#609)
* Emulate float16 on CPU but with warnings

* Disable CPU float16 in CUDA codegen for now
roastduck authored Apr 17, 2024
1 parent a27d6da commit 6f947ac
Showing 6 changed files with 44 additions and 1 deletion.
.gitmodules (3 additions, 0 deletions)
@@ -24,3 +24,6 @@
 [submodule "3rd-party/cutlass"]
 	path = 3rd-party/cutlass
 	url = ../../NVIDIA/cutlass.git
+[submodule "3rd-party/half"]
+	path = 3rd-party/half
+	url = ../../suruoxi/half.git
3rd-party/half (1 addition, 0 deletions)
Submodule half added at 7cd91f
runtime/cpu_runtime.h (2 additions, 0 deletions)
@@ -19,6 +19,8 @@
#include "mdspan.h"
#include "unchecked_opt.h"

#include "../3rd-party/half/include/half.hpp"

#define restrict __restrict__
#define __ByValArray std::array

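The newly bundled header provides half_float::half, a software-emulated IEEE 754 binary16 type whose arithmetic operators run entirely on the CPU. A minimal sketch of what generated CPU code can now rely on (the include path and the sum_half helper are illustrative, not part of this commit):

#include <vector>
#include "../3rd-party/half/include/half.hpp"

// Hypothetical helper: sums emulated float16 values. Each += below goes
// through the half library's software arithmetic, which is the performance
// cost the new codegen warning (see src/codegen/detail/code_gen_c.h below)
// refers to.
half_float::half sum_half(const std::vector<half_float::half> &xs) {
    half_float::half acc(0.0f);
    for (const half_float::half &x : xs) {
        acc += x;
    }
    return acc;
}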
src/codegen/code_gen_cuda.cc (1 addition, 1 deletion)
@@ -113,7 +113,7 @@ void CodeGenCUDA::genMdPtrDef(const VarDef &def,
 }
 
 std::string CodeGenCUDA::gen(const DataType &dtype) {
-    if (dtype == DataType::Float16) {
+    if (dtype.base() == DataType::Float16) {
         return "__half";
     } else {
         return CodeGenC::gen(dtype);
src/codegen/detail/code_gen_c.h (8 additions, 0 deletions)
@@ -767,6 +767,14 @@ std::string CodeGenC<Stream>::gen(const DataType &dtype) {
return "double";
case DataType::Float32:
return "float";
case DataType::Float16:
WARNING(
"float16 arithmetics on CPU is supported via emulation and comes "
"with a performance cost, which is only for compatibility purpose. "
"If you intend to do float32 computation on float16 variables, "
"please convert them explicitly. Please ignore this warning if you "
"are only allocating buffers and not performing arithmetics.");
return "half_float::half"; // From 3rd-party/half
case DataType::Int64:
return "int64_t";
case DataType::Int32:
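The warning's advice is to convert explicitly when the intent is float32 math over float16 storage. A hedged sketch of that pattern in terms of the bundled half library (sum_in_float32 is an illustrative helper, not generated code):

#include <vector>
#include "../3rd-party/half/include/half.hpp"

// Keep the buffer in float16 but do the arithmetic in float32: widen each
// element once, accumulate in float, and avoid the emulated half-precision
// operators entirely.
float sum_in_float32(const std::vector<half_float::half> &xs) {
    float acc = 0.0f;
    for (const half_float::half &x : xs) {
        acc += static_cast<float>(x); // explicit widening conversion
    }
    return acc;
}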
test/40.codegen/cpu/test_cpu.py (29 additions, 0 deletions)
@@ -201,3 +201,32 @@ def test(x, y):

     y_std = np.array([2, 3, 4, 5], dtype="int32")
     assert np.array_equal(y_np, y_std)
+
+
+def test_float16_compute():
+    # Not testing float16 I/O here
+
+    @ft.transform
+    def test(x, y):
+        x: ft.Var[(4,), "float32", "input"]
+        y: ft.Var[(), "float32", "output"]
+        x16 = ft.empty((4,), "float16")
+        y16 = ft.empty((), "float16")
+        for j in range(4):
+            x16[j] = ft.cast(x[j], "float16")
+        y16[...] = 0
+        for j in range(4):
+            y16[...] += x16[j]
+        y[...] = ft.cast(y16[...], "float32")
+
+    func = ft.lower(test, verbose=1)
+    code = ft.codegen(func, verbose=True)
+    x_np = np.random.uniform(size=(4,)).astype("float32")
+    y_np = np.zeros((), dtype="float32")
+    x_arr = ft.array(x_np)
+    y_arr = ft.array(y_np)
+    ft.build_binary(code)(x=x_arr, y=y_arr)
+    y_np = y_arr.numpy()
+
+    y_std = np.sum(x_np.astype("float16")).astype("float32")
+    assert np.all(np.isclose(y_np, y_std, atol=1e-2))
