diff --git a/neural_speed/core/ne_layers.c b/neural_speed/core/ne_layers.c index a6154bda2..5a275f0d5 100644 --- a/neural_speed/core/ne_layers.c +++ b/neural_speed/core/ne_layers.c @@ -10882,6 +10882,7 @@ void ne_graph_compute(struct ne_context* ctx, struct ne_cgraph* cgraph) { } void ne_graph_profiling(const struct ne_cgraph* cgraph) { +#ifdef NS_PERF int64_t perf_total_per_op_us[NE_OP_COUNT] = {0}; NE_PRINT("=== GRAPH Profiling ===\n"); @@ -10904,6 +10905,10 @@ void ne_graph_profiling(const struct ne_cgraph* cgraph) { } NE_PRINT("perf_total_per_op_us[%24s] = %7.3f ms\n", "INNER PRODUCT", (double)ip_duration / 1000.0); NE_PRINT("========================================\n"); + +#else + NE_PRINT("\n[Warning] To collect profiling data, please recompile with NS_PROFILING=ON.\n"); +#endif } void ne_graph_reset(struct ne_cgraph* cgraph) { diff --git a/setup.py b/setup.py index fbda29af2..319cb6030 100644 --- a/setup.py +++ b/setup.py @@ -93,6 +93,7 @@ def build_extension(self, ext: CMakeExtension) -> None: extdir = ext_fullpath.parent.resolve() output_dir = f"{extdir}{os.sep}" + NS_PROFILING_ENV = os.environ.get("NS_PROFILING", "OFF") cmake_args = [ f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={output_dir}", f"-DCMAKE_RUNTIME_OUTPUT_DIRECTORY={output_dir}", @@ -104,6 +105,7 @@ def build_extension(self, ext: CMakeExtension) -> None: f"-DNS_WITH_AVX2={'ON' if NS_WITH_AVX2 else 'OFF'}", f"-DNS_WITH_TESTS=OFF", f"-DNS_PYTHON_API=ON", + f"-DNS_PROFILING={NS_PROFILING_ENV}", ] if sys.platform == "linux": # relative_rpath cmake_args.append('-DCMAKE_BUILD_RPATH=$ORIGIN/')