Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CPU Kernel Tests #1439

Open
wants to merge 25 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
05144e5
test_utils refactor, local_cpu_allocator
oOTigger Jul 11, 2024
3bb8ff6
test utils modification, cast, reverse, and replicate cpu kernels
oOTigger Jul 12, 2024
968cd6d
combine kernel
oOTigger Jul 14, 2024
723515b
combine kernels .h file
oOTigger Jul 14, 2024
ba586ae
Implementations for methods for machine_views and associated modules …
Marsella8 Jul 19, 2024
e6e2161
test utils logic cleanup, reverse cpu_kernel pedagogical implementatio…
oOTigger Jul 31, 2024
29a2cf3
Merge branch 'repo-refactor' into cpu-kernels-tests
oOTigger Sep 20, 2024
366bd94
Merge branch 'repo-refactor' into cpu-kernels-tests
oOTigger Sep 24, 2024
c9c33fd
cpu_kernel's refactor, generic tensor accessor indexing
oOTigger Oct 8, 2024
2a5b38a
Merge branch 'repo-refactor' into cpu-kernels-tests
oOTigger Oct 8, 2024
d50914c
accessor.h formatting
oOTigger Oct 8, 2024
f1f2698
mk_runtime_error formatting
oOTigger Oct 8, 2024
a7422f7
reverse_kernels include
oOTigger Oct 8, 2024
ee19931
Merge branch 'repo-refactor' into cpu-kernels-tests
oOTigger Oct 15, 2024
5863880
test_utils refactor and clarity
oOTigger Oct 15, 2024
e869ace
formatting
oOTigger Oct 15, 2024
de230cb
comment removal reverse_kernels
oOTigger Oct 15, 2024
3fc8718
Issue #1435, tests for managed stream and handle
oOTigger Oct 16, 2024
d1c9e90
#1435 formatting
oOTigger Oct 16, 2024
7106dec
#1409 issue, change datatype for linear kernels away from void *
oOTigger Oct 16, 2024
51c3eb7
R & W accessor changes, minimize code bloat
oOTigger Nov 5, 2024
878cff1
code formatting and refactor
oOTigger Nov 16, 2024
42f1fce
issue #1502 & issue #1540
oOTigger Nov 22, 2024
8f05203
format check
oOTigger Nov 22, 2024
8db629d
Merge branch 'master' into cpu-kernels-tests
lockshaw Dec 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions lib/kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ target_link_libraries(
cudnn
nccl
utils
pcg
)

define_ff_vars(${project_target})
Expand Down
185 changes: 133 additions & 52 deletions lib/kernels/include/kernels/accessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,79 @@
#include "device.h"
#include "kernels/ff_handle.h"
#include "op-attrs/datatype.h"
#include "pcg/device_type.dtg.h"
#include "utils/exception.h"
#include "utils/required.h"

namespace FlexFlow {

struct Allocator;

/**
 * Read-only view over a typed, shaped tensor buffer.
 *
 * Holds a non-owning `void const *` plus the runtime datatype, shape, and
 * device the buffer lives on. Typed access is checked at runtime:
 * `get<DT>()` and `at<DT>()` throw if `DT` does not match `data_type`,
 * and `at<DT>()` additionally refuses to dereference non-CPU memory.
 */
class GenericTensorAccessorR {
public:
  /**
   * Returns the underlying buffer as a typed pointer.
   * Throws via mk_runtime_error if DT != this->data_type.
   */
  template <DataType DT>
  typename data_type_enum_to_class<DT>::type const *get() const {
    if (this->data_type == DT) {
      return static_cast<real_type_t<DT> const *>(this->ptr);
    } else {
      throw mk_runtime_error(fmt::format(
          "Invalid access data type ({} != {})", this->data_type, DT));
    }
  }

  int32_t const *get_int32_ptr() const;
  int64_t const *get_int64_ptr() const;
  float const *get_float_ptr() const;
  double const *get_double_ptr() const;
  half const *get_half_ptr() const;

  GenericTensorAccessorR() = delete;

  GenericTensorAccessorR(DataType data_type,
                         ArrayShape const &shape,
                         void const *ptr,
                         DeviceType device_type);

  bool operator==(GenericTensorAccessorR const &) const;
  bool operator!=(GenericTensorAccessorR const &) const;

  /**
   * Element access by multi-dimensional index.
   *
   * Only valid for CPU-resident tensors (dereferencing device memory from
   * host code would be undefined); throws otherwise. Also throws if DT does
   * not match the tensor's runtime datatype. The index-to-offset mapping is
   * delegated to calculate_index_offset(), declared below and defined in
   * the .cc file.
   */
  template <DataType DT>
  real_type_t<DT> const &at(std::vector<size_t> const &indices) const {
    if (this->device_type != DeviceType::CPU) {
      throw mk_runtime_error("Calling at() on non-CPU allocated tensor");
    }
    if (this->data_type != DT) {
      throw mk_runtime_error(fmt::format(
          "Invalid access data type ({} != {})", this->data_type, DT));
    }

    using T = real_type_t<DT>;

    T const *data_ptr = static_cast<T const *>(this->ptr);
    size_t offset = calculate_index_offset(indices);

    return data_ptr[offset];
  }

public:
  DataType data_type;
  ArrayShape shape;
  void const *ptr; // non-owning; lifetime managed by the allocator/caller
  DeviceType device_type;

private:
  // Reference-tuple over all fields, used to implement ==/!= in the .cc file.
  std::tuple<decltype(data_type) const &,
             decltype(shape) const &,
             decltype(ptr) const &,
             decltype(device_type) const &>
      tie() const;

  // Maps a multi-dimensional index to a flat offset into `ptr`
  // (presumably row-major over `shape` — confirm against the .cc definition).
  size_t calculate_index_offset(std::vector<size_t> const &indices) const;
};

std::string format_as(GenericTensorAccessorR const &);
std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &);

class GenericTensorAccessorW {
public:
template <DataType DT>
Expand All @@ -28,64 +96,72 @@
double *get_double_ptr() const;
half *get_half_ptr() const;

public:
DataType data_type;
ArrayShape shape;
req<void *> ptr;
};
FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorW,
data_type,
shape,
ptr);
GenericTensorAccessorW() = delete;

std::string format_as(GenericTensorAccessorW const &);
std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &);
GenericTensorAccessorW(DataType data_type,
ArrayShape const &shape,
void *ptr,
DeviceType device_type);

bool operator==(GenericTensorAccessorW const &) const;
bool operator!=(GenericTensorAccessorW const &) const;

operator GenericTensorAccessorR() const;

class GenericTensorAccessorR {
public:
template <DataType DT>
typename data_type_enum_to_class<DT>::type const *get() const {
if (this->data_type == DT) {
return static_cast<real_type_t<DT> const *>(this->ptr);
} else {
real_type_t<DT> &at(std::vector<size_t> const &indices) {

Check warning on line 112 in lib/kernels/include/kernels/accessor.h

View check run for this annotation

Codecov / codecov/patch

lib/kernels/include/kernels/accessor.h#L112

Added line #L112 was not covered by tests
if (this->device_type != DeviceType::CPU) {
throw mk_runtime_error("Calling at() on non-CPU allocated tensor");

Check warning on line 114 in lib/kernels/include/kernels/accessor.h

View check run for this annotation

Codecov / codecov/patch

lib/kernels/include/kernels/accessor.h#L114

Added line #L114 was not covered by tests
}
if (this->data_type != DT) {
throw mk_runtime_error(fmt::format(
"Invalid access data type ({} != {})", this->data_type, DT));
}

using T = real_type_t<DT>;

T *data_ptr = static_cast<T *>(this->ptr);
size_t offset = calculate_index_offset(indices);

Check warning on line 124 in lib/kernels/include/kernels/accessor.h

View check run for this annotation

Codecov / codecov/patch

lib/kernels/include/kernels/accessor.h#L123-L124

Added lines #L123 - L124 were not covered by tests

return data_ptr[offset];

Check warning on line 126 in lib/kernels/include/kernels/accessor.h

View check run for this annotation

Codecov / codecov/patch

lib/kernels/include/kernels/accessor.h#L126

Added line #L126 was not covered by tests
}

int32_t const *get_int32_ptr() const;
int64_t const *get_int64_ptr() const;
float const *get_float_ptr() const;
double const *get_double_ptr() const;
half const *get_half_ptr() const;
template <DataType DT>
real_type_t<DT> &at(std::vector<size_t> const &indices) const {
if (this->device_type != DeviceType::CPU) {
throw mk_runtime_error("Calling at() on non-CPU allocated tensor");
}
if (this->data_type != DT) {
throw mk_runtime_error(fmt::format(
"Invalid access data type ({} != {})", this->data_type, DT));
}

using T = real_type_t<DT>;

T const *data_ptr = static_cast<T const *>(this->ptr);
size_t offset = calculate_index_offset(indices);

return data_ptr[offset];
}

public:
DataType data_type;
ArrayShape shape;
req<void const *> ptr;
};
FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(GenericTensorAccessorR,
data_type,
shape,
ptr);
void *ptr;
DeviceType device_type;

std::string format_as(GenericTensorAccessorR const &);
std::ostream &operator<<(std::ostream &, GenericTensorAccessorR const &);
private:
std::tuple<decltype(data_type) const &,
decltype(shape) const &,
decltype(ptr) const &,
decltype(device_type) const &>
tie() const;

int32_t *get_int32_ptr(GenericTensorAccessorW const &);
int64_t *get_int64_ptr(GenericTensorAccessorW const &);
float *get_float_ptr(GenericTensorAccessorW const &);
double *get_double_ptr(GenericTensorAccessorW const &);
half *get_half_ptr(GenericTensorAccessorW const &);
std::vector<int32_t *>
get_int32_ptrs(std::vector<GenericTensorAccessorW> const &);
std::vector<int64_t *>
get_int64_ptrs(std::vector<GenericTensorAccessorW> const &);
std::vector<float *>
get_float_ptrs(std::vector<GenericTensorAccessorW> const &);
std::vector<double *>
get_double_ptrs(std::vector<GenericTensorAccessorW> const &);
std::vector<half *> get_half_ptrs(std::vector<GenericTensorAccessorW> const &);
size_t calculate_index_offset(std::vector<size_t> const &indices) const;
};

std::string format_as(GenericTensorAccessorW const &);
std::ostream &operator<<(std::ostream &, GenericTensorAccessorW const &);

static_assert(is_fmtable<req<DataType> const &>::value, "");

Expand Down Expand Up @@ -150,21 +226,26 @@
GenericTensorAccessorR read_only_accessor_from_write_accessor(
GenericTensorAccessorW const &write_accessor);

bool is_shape_and_dtype_equal(GenericTensorAccessorW const &acc1,
GenericTensorAccessorW const &acc2);

bool shape_and_dtype_matches(GenericTensorAccessorW const &accessor,
ArrayShape const &expected_shape,
DataType const &expected_dtype);
bool is_shape_and_dtype_equal(GenericTensorAccessorR const &acc1,
GenericTensorAccessorR const &acc2);

bool shape_and_dtype_matches(GenericTensorAccessorR const &accessor,
ArrayShape const &expected_shape,
DataType const &expected_dtype);

std::pair<ArrayShape, DataType>
get_shape_and_datatype(GenericTensorAccessorR const &accessor);
std::pair<ArrayShape, DataType>
get_shape_and_datatype(GenericTensorAccessorW const &accessor);

void copy_accessor_data_to_l_from_r(GenericTensorAccessorW &dst_accessor,
GenericTensorAccessorR const &src_accessor);

GenericTensorAccessorR
copy_tensor_accessor_r(GenericTensorAccessorR const &src_accessor,
Allocator &allocator);

GenericTensorAccessorW
copy_tensor_accessor_w(GenericTensorAccessorW const &src_accessor,
Allocator &allocator);

} // namespace FlexFlow

Expand Down
7 changes: 6 additions & 1 deletion lib/kernels/include/kernels/allocation.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#ifndef _FLEXFLOW_KERNELS_ALLOCATION_H
#define _FLEXFLOW_KERNELS_ALLOCATION_H

#include "accessor.h"
#include "kernels/accessor.h"
#include <cstddef>
#include <memory>

Expand All @@ -11,16 +11,21 @@ struct IAllocator {
virtual void *allocate(size_t) = 0;
virtual void deallocate(void *) = 0;

virtual DeviceType get_allocation_device_type() const = 0;

virtual ~IAllocator() = default;
};

struct Allocator {
Allocator() = delete;

GenericTensorAccessorW allocate_tensor(TensorShape const &tensor_shape);

void *allocate(size_t mem_size);
void deallocate(void *ptr);

DeviceType get_allocation_device_type() const;

template <typename T, typename... Args>
static typename std::enable_if<std::is_base_of<IAllocator, T>::value,
Allocator>::type
Expand Down
6 changes: 2 additions & 4 deletions lib/kernels/include/kernels/attention_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,7 @@ FF_VISITABLE_STRUCT_NO_EQ(MHAPerDeviceState,
std::string format_as(MHAPerDeviceState const &x);
std::ostream &operator<<(std::ostream &s, MHAPerDeviceState const &x);

namespace Kernels {
namespace MultiHeadAttention {
namespace Kernels::MultiHeadAttention {

MHAPerDeviceState init_kernel(PerDeviceFFHandle const &,
Allocator &,
Expand Down Expand Up @@ -105,8 +104,7 @@ void backward_kernel(ffStream_t stream,
void cleanup_kernel(Allocator &allocator,
MHAPerDeviceState const &device_state);

} // namespace MultiHeadAttention
} // namespace Kernels
} // namespace Kernels::MultiHeadAttention
} // namespace FlexFlow

#endif
8 changes: 2 additions & 6 deletions lib/kernels/include/kernels/batch_matmul_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,7 @@
#include "kernels/allocation.h"
#include "kernels/ff_handle.h"

namespace FlexFlow {
namespace Kernels {
namespace BatchMatmul {
namespace FlexFlow::Kernels::BatchMatmul {

void forward_kernel(ffStream_t stream,
PerDeviceFFHandle const &handle,
Expand Down Expand Up @@ -35,8 +33,6 @@ void backward_kernel(ffStream_t stream,
int k,
int batch);

} // namespace BatchMatmul
} // namespace Kernels
} // namespace FlexFlow
} // namespace FlexFlow::Kernels::BatchMatmul

#endif
6 changes: 2 additions & 4 deletions lib/kernels/include/kernels/batch_norm_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,7 @@ FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(BatchNormPerDeviceState,
output_w,
relu);

namespace Kernels {
namespace BatchNorm {
namespace Kernels::BatchNorm {

BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle,
Allocator allocator,
Expand Down Expand Up @@ -81,8 +80,7 @@ void cleanup_kernel(Allocator allocator,
bool relu,
float *runningMean);

} // namespace BatchNorm
} // namespace Kernels
} // namespace Kernels::BatchNorm
} // namespace FlexFlow

#endif
18 changes: 4 additions & 14 deletions lib/kernels/include/kernels/cast_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,17 @@

#include "device.h"
#include "kernels/accessor.h"
#include "kernels/ff_handle.h"
#include "op-attrs/activation.dtg.h"

namespace FlexFlow {
namespace Kernels {
namespace Cast {
namespace FlexFlow::Kernels::Cast {

void forward_kernel(ffStream_t stream,
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output,
DataType input_type,
DataType output_type);
GenericTensorAccessorW const &output);

void backward_kernel(ffStream_t stream,
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output,
DataType input_type,
DataType output_type);
GenericTensorAccessorW const &output);

} // namespace Cast
} // namespace Kernels
} // namespace FlexFlow
} // namespace FlexFlow::Kernels::Cast

#endif
17 changes: 17 additions & 0 deletions lib/kernels/include/kernels/cast_kernels_cpu.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#ifndef _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H
#define _FLEXFLOW_OPS_KERNELS_CAST_KERNELS_CPU_H

#include "device.h"
#include "kernels/accessor.h"

namespace FlexFlow::Kernels::Cast {

// CPU implementation of the Cast op's forward pass: presumably converts each
// element of `input` to `output`'s datatype, with the source/destination
// types taken from the accessors themselves — confirm against the .cc file.
void cpu_forward_kernel(GenericTensorAccessorR const &input,
                        GenericTensorAccessorW const &output);

// CPU implementation of the Cast op's backward pass; `input` here is the
// incoming gradient and `output` the gradient to populate — confirm
// naming/semantics against the .cc implementation and the CUDA kernel.
void cpu_backward_kernel(GenericTensorAccessorR const &input,
                         GenericTensorAccessorW const &output);

} // namespace FlexFlow::Kernels::Cast

#endif
Loading
Loading