From 6f947ac46ff8ecd24d47f34c741542325ee8cba8 Mon Sep 17 00:00:00 2001
From: Shizhi Tang
Date: Wed, 17 Apr 2024 17:15:53 +0800
Subject: [PATCH] Emulate float16 on CPU but with warnings (#609)

* Emulate float16 on CPU but with warnings

* Disable CPU float16 in CUDA codegen for now
---
 .gitmodules                     |  3 +++
 3rd-party/half                  |  1 +
 runtime/cpu_runtime.h           |  2 ++
 src/codegen/code_gen_cuda.cc    |  2 +-
 src/codegen/detail/code_gen_c.h |  8 ++++++++
 test/40.codegen/cpu/test_cpu.py | 29 +++++++++++++++++++++++++++++
 6 files changed, 44 insertions(+), 1 deletion(-)
 create mode 160000 3rd-party/half

diff --git a/.gitmodules b/.gitmodules
index 909273d34..bfee93bc1 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -24,3 +24,6 @@
 [submodule "3rd-party/cutlass"]
 	path = 3rd-party/cutlass
 	url = ../../NVIDIA/cutlass.git
+[submodule "3rd-party/half"]
+	path = 3rd-party/half
+	url = ../../suruoxi/half.git
diff --git a/3rd-party/half b/3rd-party/half
new file mode 160000
index 000000000..7cd91f2a3
--- /dev/null
+++ b/3rd-party/half
@@ -0,0 +1 @@
+Subproject commit 7cd91f2a3b5feba92a0eb44ed314e0ddb9962d89
diff --git a/runtime/cpu_runtime.h b/runtime/cpu_runtime.h
index ab84422a9..417093726 100644
--- a/runtime/cpu_runtime.h
+++ b/runtime/cpu_runtime.h
@@ -19,6 +19,8 @@
 #include "mdspan.h"
 #include "unchecked_opt.h"
 
+#include "../3rd-party/half/include/half.hpp"
+
 #define restrict __restrict__
 
 #define __ByValArray std::array
diff --git a/src/codegen/code_gen_cuda.cc b/src/codegen/code_gen_cuda.cc
index e5e27b710..19af26042 100644
--- a/src/codegen/code_gen_cuda.cc
+++ b/src/codegen/code_gen_cuda.cc
@@ -113,7 +113,7 @@ void CodeGenCUDA::genMdPtrDef(const VarDef &def,
 }
 
 std::string CodeGenCUDA::gen(const DataType &dtype) {
-    if (dtype == DataType::Float16) {
+    if (dtype.base() == DataType::Float16) {
         return "__half";
     } else {
         return CodeGenC::gen(dtype);
diff --git a/src/codegen/detail/code_gen_c.h b/src/codegen/detail/code_gen_c.h
index 8adc76e93..2349f1ee5 100644
--- a/src/codegen/detail/code_gen_c.h
+++ b/src/codegen/detail/code_gen_c.h
@@ -767,6 +767,14 @@ std::string CodeGenC::gen(const DataType &dtype) {
         return "double";
     case DataType::Float32:
         return "float";
+    case DataType::Float16:
+        WARNING(
+            "float16 arithmetics on CPU is supported via emulation and comes "
+            "with a performance cost, which is only for compatibility purpose. "
+            "If you intend to do float32 computation on float16 variables, "
+            "please convert them explicitly. Please ignore this warning if you "
+            "are only allocating buffers and not performing arithmetics.");
+        return "half_float::half"; // From 3rd-party/half
     case DataType::Int64:
         return "int64_t";
     case DataType::Int32:
diff --git a/test/40.codegen/cpu/test_cpu.py b/test/40.codegen/cpu/test_cpu.py
index 6de4ac5e7..24bcc7c7b 100644
--- a/test/40.codegen/cpu/test_cpu.py
+++ b/test/40.codegen/cpu/test_cpu.py
@@ -201,3 +201,32 @@ def test(x, y):
 
     y_std = np.array([2, 3, 4, 5], dtype="int32")
     assert np.array_equal(y_np, y_std)
+
+
+def test_float16_compute():
+    # Not testing float16 I/O here
+
+    @ft.transform
+    def test(x, y):
+        x: ft.Var[(4,), "float32", "input"]
+        y: ft.Var[(), "float32", "output"]
+        x16 = ft.empty((4,), "float16")
+        y16 = ft.empty((), "float16")
+        for j in range(4):
+            x16[j] = ft.cast(x[j], "float16")
+        y16[...] = 0
+        for j in range(4):
+            y16[...] += x16[j]
+        y[...] = ft.cast(y16[...], "float32")
+
+    func = ft.lower(test, verbose=1)
+    code = ft.codegen(func, verbose=True)
+    x_np = np.random.uniform(size=(4,)).astype("float32")
+    y_np = np.zeros((), dtype="float32")
+    x_arr = ft.array(x_np)
+    y_arr = ft.array(y_np)
+    ft.build_binary(code)(x=x_arr, y=y_arr)
+    y_np = y_arr.numpy()
+
+    y_std = np.sum(x_np.astype("float16")).astype("float32")
+    assert np.all(np.isclose(y_np, y_std, atol=1e-2))
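Note for reviewers (not part of the patch): the standalone sketch below illustrates what the generated CPU code relies on once CodeGenC emits `half_float::half` for float16 buffers. All arithmetic on that type is emulated in software by the vendored half library, which is why the WARNING above is emitted. The include path and the sample values here are assumptions for illustration only.

// Minimal sketch of the float16 emulation path, mirroring the Python test:
// cast four float32 values to float16, accumulate in float16, widen back.
#include <iostream>

#include "3rd-party/half/include/half.hpp" // path assumed; adjust to your checkout

int main() {
    half_float::half x16[4];
    for (int j = 0; j < 4; ++j) {
        x16[j] = half_float::half(0.25f * (j + 1)); // explicit cast to float16
    }
    half_float::half y16(0.0f);
    for (int j = 0; j < 4; ++j) {
        y16 += x16[j]; // emulated in software; no native CPU float16 arithmetic
    }
    float y = static_cast<float>(y16); // explicit widening back to float32
    std::cout << y << std::endl;       // 0.25 + 0.5 + 0.75 + 1.0 == 2.5 exactly
    return 0;
}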