Skip to content
This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Commit

Permalink
Support two layers of SYCL
Browse files Browse the repository at this point in the history
  • Loading branch information
ThanatosShinji authored and luoyu-intel committed Jun 4, 2024
1 parent 16d119a commit 0dcacb1
Show file tree
Hide file tree
Showing 6 changed files with 91 additions and 236 deletions.
2 changes: 1 addition & 1 deletion CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@
"description": "x64 SYCL",
"inherits": "x64-debug-sycl",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Release"
"CMAKE_BUILD_TYPE": "RelWithDebInfo"
}
}
]
Expand Down
15 changes: 15 additions & 0 deletions neural_speed/core/layers/ne_bestla.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,21 @@ static inline int ne_nrows(const struct ne_tensor* tensor) {
return tensor->ne[1] * tensor->ne[2] * tensor->ne[3];
}

// Decide which backend should execute the given op.
//
// @param src0  First operand; for NE_OP_MUL_MAT this is the weight tensor.
// @param src1  Second operand; currently unused, reserved for future per-op
//              checks (e.g. activation dtype/shape constraints).
// @param op    The operation being scheduled.
// @return NE_BACKEND_SYCL when the op can be offloaded to the SYCL device,
//         NE_BACKEND_CPU otherwise.
ne_backend bestla_backend_support(struct ne_tensor* src0, struct ne_tensor* src1, enum ne_op op) {
  (void)src1;  // reserved for future use; silence unused-parameter warnings
  ne_backend bk = NE_BACKEND_CPU;
  switch (op) {
    case NE_OP_MUL_MAT:
      // Offload matmul only when the weight tensor is BTLA-packed,
      // since that is the layout the SYCL kernels consume.
      if (src0->type == NE_TYPE_BTLA) {
        bk = NE_BACKEND_SYCL;
      }
      break;
    default:
      break;
  }
  return bk;
}

bool bestla_sycl_support(struct ne_tensor* node) {
bool support = false;
switch (node->op) {
Expand Down
1 change: 0 additions & 1 deletion neural_speed/core/layers/ne_bestla_sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,6 @@ void bestla_device_memcpy(void* dstptr, const void* srcptr, size_t size, void* q
if (queue && srcptr && dstptr) {
auto ptr = (sycl::queue*)queue;
ptr->memcpy(dstptr, srcptr, size);
ptr->wait();
}
}

Expand Down
1 change: 1 addition & 0 deletions neural_speed/core/ne_bestla.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ void bestla_mul(int batch, int vsize, const float* tensor, const float* vector,
void bestla_add(int batch, int vsize, const float* tensor, const float* vector, int vstep, float* out);

bool bestla_sycl_support(struct ne_tensor* node);
enum ne_backend bestla_backend_support(struct ne_tensor* src0, struct ne_tensor* src1, enum ne_op op);
bool bestla_support(struct ne_tensor* node, int n_threads, size_t* workspace, size_t* dev_workspace);

#ifdef NS_SYCL
Expand Down
Loading

0 comments on commit 0dcacb1

Please sign in to comment.