This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Commit 0ce1574
compile on linux
luoyu-intel committed Jun 21, 2024
1 parent 7f40ae9

Showing 2 changed files with 82 additions and 79 deletions.
17 changes: 10 additions & 7 deletions neural_speed/core/CMakeLists.txt
@@ -16,28 +16,31 @@ find_package(Threads REQUIRED)
file(GLOB layers_srcs "layers/*.cpp")
file(GLOB test_srcs "layers/*test*.cpp")
list(REMOVE_ITEM layers_srcs ${test_srcs})
set(sources ne_layers.c ${layers_srcs})

add_shareable_library_w_warning(ne_layers "${sources}")
add_library_w_warning(ne_layers_cpp "${layers_srcs}")
target_include_directories(ne_layers_cpp PUBLIC .)

target_include_directories(ne_layers PUBLIC .)
target_compile_features(ne_layers PUBLIC c_std_11) # don't bump
set_target_properties(ne_layers PROPERTIES POSITION_INDEPENDENT_CODE ON)
if (NS_TP)
find_package(oneCCL REQUIRED)
find_package(MPI REQUIRED)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_library(parallel_context STATIC parallel_context.cpp)
target_link_libraries(ne_layers PUBLIC Threads::Threads bestla ne_vec MPI::MPI_CXX ccl parallel_context)
target_link_libraries(ne_layers_cpp PUBLIC Threads::Threads bestla ne_vec MPI::MPI_CXX ccl parallel_context)
else ()
target_link_libraries(ne_layers PUBLIC Threads::Threads bestla ne_vec)
target_link_libraries(ne_layers_cpp PUBLIC Threads::Threads bestla ne_vec)
endif()


add_shareable_library_w_warning(ne_layers ne_layers.c)
set_target_properties(ne_layers PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(ne_layers PUBLIC .)
target_compile_features(ne_layers PUBLIC c_std_11) # don't bump
if(NOT WIN32)
target_link_libraries(ne_layers PUBLIC rt)
else()
target_link_options(ne_layers PUBLIC /STACK:5242880 /F5242880)
endif()
target_link_libraries(ne_layers PRIVATE ne_layers_cpp)


if (NS_BUILD_TESTS)
144 changes: 72 additions & 72 deletions neural_speed/core/layers/ne_bestla_sycl.cpp
Expand Up @@ -175,15 +175,15 @@ void bestla_device_mul_f32(const struct ne_compute_params* params, const struct

auto q = (sycl::queue*)params->dev_queue;

const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const size_t ne00 = src0->ne[0];
const size_t ne01 = src0->ne[1];
const size_t ne02 = src0->ne[2];
const size_t ne03 = src0->ne[3];

const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t ne13 = src1->ne[3];
const size_t ne10 = src1->ne[0];
const size_t ne11 = src1->ne[1];
const size_t ne12 = src1->ne[2];
const size_t ne13 = src1->ne[3];

const size_t nb00 = src0->nb[0];
const size_t nb01 = src0->nb[1];
@@ -237,15 +237,15 @@ void bestla_device_add_f32(const struct ne_compute_params* params, const struct

auto q = (sycl::queue*)params->dev_queue;

const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const size_t ne00 = src0->ne[0];
const size_t ne01 = src0->ne[1];
const size_t ne02 = src0->ne[2];
const size_t ne03 = src0->ne[3];

const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t ne13 = src1->ne[3];
const size_t ne10 = src1->ne[0];
const size_t ne11 = src1->ne[1];
const size_t ne12 = src1->ne[2];
const size_t ne13 = src1->ne[3];

const size_t nb00 = src0->nb[0];
const size_t nb01 = src0->nb[1];
@@ -299,10 +299,10 @@ void bestla_device_elewise_f32(const struct ne_compute_params* params, const str

auto q = (sycl::queue*)params->dev_queue;
auto op = dst->op;
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const size_t ne00 = src0->ne[0];
const size_t ne01 = src0->ne[1];
const size_t ne02 = src0->ne[2];
const size_t ne03 = src0->ne[3];

auto srcptr = (float*)src0->data;
auto dstptr = (float*)dst->data;
@@ -330,10 +330,10 @@ void bestla_device_rms_norm_f32(const struct ne_compute_params* params, const st
auto q = (sycl::queue*)params->dev_queue;
float eps;
memcpy(&eps, dst->op_params, sizeof(float));
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const size_t ne00 = src0->ne[0];
const size_t ne01 = src0->ne[1];
const size_t ne02 = src0->ne[2];
const size_t ne03 = src0->ne[3];
const size_t nb00 = src0->nb[0];
const size_t nb01 = src0->nb[1];
const size_t nb02 = src0->nb[2];
@@ -342,9 +342,9 @@ void bestla_device_rms_norm_f32(const struct ne_compute_params* params, const st
const size_t nb1 = dst->nb[1];
const size_t nb2 = dst->nb[2];
const size_t nb3 = dst->nb[3];
int64_t constexpr WgSize = 1024;
size_t constexpr WgSize = 1024;
int constexpr SgSize = 16;
int64_t ne00_ = bestla::utils::padto_le(ne00, WgSize);
size_t ne00_ = bestla::utils::padto_le(ne00, WgSize);
auto src0ptr = (float*)src0->data;
auto dstptr = (float*)dst->data;
auto ev = q->submit([&](sycl::handler& cgh) {
@@ -366,7 +366,7 @@ void bestla_device_rms_norm_f32(const struct ne_compute_params* params, const st
float* dst_ptr = (float*)((char*)dstptr + i03 * nb3 + i02 * nb2 + i01 * nb1);
float* src0_ptr = (float*)((char*)src0ptr + i03 * nb03 + i02 * nb02 + i01 * nb01);
float sum = 0.0;
int64_t i00 = wg_loc_id;
size_t i00 = wg_loc_id;
for (; i00 < ne00_; i00 += WgSize) {
sum += (src0_ptr[i00] * src0_ptr[i00]);
}
@@ -411,7 +411,7 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) {

// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
static void rope_yarn(float theta_extrap, float freq_scale, float corr_dims0, float corr_dims1, int64_t i0,
static void rope_yarn(float theta_extrap, float freq_scale, float corr_dims0, float corr_dims1, size_t i0,
float ext_factor, float mscale, float* cos_theta, float* sin_theta) {
// Get n-d rotational scaling corrected for extrapolation
float theta_interp = freq_scale * theta_extrap;
@@ -474,15 +474,15 @@ void bestla_device_rope_f32(const struct ne_compute_params* params, const struct
const int64_t n_keep = ((int32_t*)src1->data)[ROPE_NKEEP_IDX];
assert(n_past >= 0);

const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const size_t ne00 = src0->ne[0];
const size_t ne01 = src0->ne[1];
const size_t ne02 = src0->ne[2];
const size_t ne03 = src0->ne[3];

const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];
const int64_t ne2 = dst->ne[2];
const int64_t ne3 = dst->ne[3];
const size_t ne0 = dst->ne[0];
const size_t ne1 = dst->ne[1];
const size_t ne2 = dst->ne[2];
const size_t ne3 = dst->ne[3];

const size_t nb00 = src0->nb[0];
const size_t nb01 = src0->nb[1];
@@ -520,9 +520,9 @@ void bestla_device_rope_f32(const struct ne_compute_params* params, const struct
i /= ne2;
int i3 = i % ne3;

const int64_t p = n_past + i2;
const size_t p = n_past + i2;
float theta_base = (float)p;
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
for (size_t i0 = 0; i0 < ne0; i0 += 2) {
float cos_theta, sin_theta;
rope_yarn(theta_base, freq_scale, corr_dims0, corr_dims1, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);

@@ -551,15 +551,15 @@ void bestla_device_dup_f32(const struct ne_compute_params* params, const struct
}

auto q = (sycl::queue*)params->dev_queue;
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const size_t ne00 = src0->ne[0];
const size_t ne01 = src0->ne[1];
const size_t ne02 = src0->ne[2];
const size_t ne03 = src0->ne[3];

const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];
const int64_t ne2 = dst->ne[2];
const int64_t ne3 = dst->ne[3];
const size_t ne0 = dst->ne[0];
const size_t ne1 = dst->ne[1];
const size_t ne2 = dst->ne[2];
const size_t ne3 = dst->ne[3];

const size_t nb00 = src0->nb[0];
const size_t nb01 = src0->nb[1];
@@ -666,17 +666,17 @@ class MHA {
int jj = wg_loc_id * 2;
for (; jj < seq_acc_pad; jj += WgSize * 2) {
auto s2 = *(TC*)&slm[jj];
s2[0] = std::expf(s2[0] - fmax);
s2[1] = std::expf(s2[1] - fmax);
s2[0] = std::exp(s2[0] - fmax);
s2[1] = std::exp(s2[1] - fmax);
fsums += s2[0];
fsums += s2[1];
*(TC*)&slm[jj] = s2;
}
if (jj < seq_acc) {
slm[jj] = std::expf(float(slm[jj]) - fmax);
slm[jj] = std::exp(float(slm[jj]) - fmax);
fsums += slm[jj];
if (jj + 1 < seq_acc) {
slm[jj + 1] = std::expf(float(slm[jj + 1]) - fmax);
slm[jj + 1] = std::exp(float(slm[jj + 1]) - fmax);
fsums += slm[jj + 1];
}
}
@@ -797,12 +797,12 @@ class MHA {
int jj = wg_loc_id;
for (; jj < seq_acc_pad; jj += SgSize) {
auto s = slm[jj];
s = std::expf(s - fmax);
s = std::exp(s - fmax);
fsums += s;
slm[jj] = s;
}
if (jj < seq_acc) {
auto s = std::expf(float(slm[jj]) - fmax);
auto s = std::exp(float(slm[jj]) - fmax);
fsums += s;
slm[jj] = s;
}
@@ -850,26 +850,26 @@ void bestla_device_mha_f32(const struct ne_compute_params* params, const struct
return;
}
auto q = (sycl::queue*)params->dev_queue;
const int64_t neq0 = _q->ne[0];
const int64_t neq1 = _q->ne[1];
const int64_t neq2 = _q->ne[2];
const int64_t neq3 = _q->ne[3];

const int64_t nek0 = k->ne[0];
const int64_t nek1 = k->ne[1];
const int64_t nek2 = k->ne[2];
// const int64_t nek3 = k->ne[3];

const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];

const int64_t headsize = neq0;
const int64_t headnum = neq1;
const int64_t heads_kv = nek2;
const int64_t embedsize = headnum * headsize;
const int64_t seq_cur = neq2;
const int64_t seq_all = nek1;
const int64_t batch = neq3;
const size_t neq0 = _q->ne[0];
const size_t neq1 = _q->ne[1];
const size_t neq2 = _q->ne[2];
const size_t neq3 = _q->ne[3];

const size_t nek0 = k->ne[0];
const size_t nek1 = k->ne[1];
const size_t nek2 = k->ne[2];
// const size_t nek3 = k->ne[3];

const size_t ne0 = dst->ne[0];
const size_t ne1 = dst->ne[1];

const size_t headsize = neq0;
const size_t headnum = neq1;
const size_t heads_kv = nek2;
const size_t embedsize = headnum * headsize;
const size_t seq_cur = neq2;
const size_t seq_all = nek1;
const size_t batch = neq3;
auto scale = *(float*)dst->padding;
auto n_ctx = *(uint32_t*)&dst->padding[4];
auto Qptr = (float*)_q->data;
