diff --git a/dory/Hardware_targets/PULP/Common/Templates/network.c.t b/dory/Hardware_targets/PULP/Common/Templates/network.c.t index c81189f7..698eb346 100644 --- a/dory/Hardware_targets/PULP/Common/Templates/network.c.t +++ b/dory/Hardware_targets/PULP/Common/Templates/network.c.t @@ -104,7 +104,7 @@ void ${prefix}execute_layer_fork(void *args) { if (pi_core_id() == 0) pmsis_l1_malloc_free(layer_args->L1_buffer, ${l1_buffer}); } -void ${prefix}network_run(void *l2_buffer, size_t l2_buffer_size, void *l2_final_output, int exec${", void *L2_input_h" if not l3_supported else ""}) +struct ${prefix}network_run_token ${prefix}network_run_async(void *l2_buffer, size_t l2_buffer_size, void *l2_final_output, int exec${", void *L2_input_h" if not l3_supported else ""}) { struct pi_device cluster_dev = {0}; struct pi_cluster_conf conf; @@ -132,12 +132,24 @@ void ${prefix}network_run(void *l2_buffer, size_t l2_buffer_size, void *l2_final cluster_task.stack_size = ${master_stack}; cluster_task.slave_stack_size = ${slave_stack}; pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); - pi_cluster_close(&cluster_dev); + return (struct ${prefix}network_run_token) { + .cluster_dev = cluster_dev + }; +} + +void ${prefix}network_run_wait(struct ${prefix}network_run_token token) +{ + pi_cluster_close(&token.cluster_dev); % if 'Perf_final' in verbose_level: print_perf("Final", ${prefix}cycle_network_execution, ${MACs}); % endif } +void ${prefix}network_run(void *l2_buffer, size_t l2_buffer_size, void *l2_final_output, int exec${", void *L2_input_h" if not l3_supported else ""}) +{ + ${prefix}network_run_wait(${prefix}network_run_async(l2_buffer, l2_buffer_size, l2_final_output, exec${", L2_input_h" if not l3_supported else ""})); +} + void ${prefix}network_run_cluster(void *args) { unsigned int * real_args = (unsigned int *) args; void * l2_buffer = (void *) real_args[0]; diff --git a/dory/Hardware_targets/PULP/Common/Templates/network.h.t 
b/dory/Hardware_targets/PULP/Common/Templates/network.h.t index bbb15d56..595aebaf 100644 --- a/dory/Hardware_targets/PULP/Common/Templates/network.h.t +++ b/dory/Hardware_targets/PULP/Common/Templates/network.h.t @@ -20,9 +20,6 @@ #ifndef __${prefix.upper()}NETWORK_H__ #define __${prefix.upper()}NETWORK_H__ -% if sdk == 'gap_sdk': -#include "pulp.h" -% endif <% l3_supported = DORY_HW_graph[0].HW_description['memory']['levels'] > 2 single_input = n_inputs==1 @@ -31,6 +28,12 @@ #include "${prefix}weights_definition.h" % endif #include +#include "pmsis.h" + + +struct ${prefix}network_run_token { + struct pi_device cluster_dev; +}; % if l3_supported: @@ -38,6 +41,8 @@ void ${prefix}network_terminate(); void ${prefix}network_initialize(); % endif void ${prefix}network_run_cluster(void * args); +struct ${prefix}network_run_token ${prefix}network_run_async(void *l2_buffer, size_t l2_buffer_size, void *l2_final_output, int exec${", void *L2_input_h" if not l3_supported else ""}); +void ${prefix}network_run_wait(struct ${prefix}network_run_token token); void ${prefix}network_run(void *l2_buffer, size_t l2_buffer_size, void *l2_final_output, int exec${", void *L2_input_h" if not l3_supported else ""}); void ${prefix}execute_layer_fork(void *arg);