diff --git a/sw/apps/transformer/src/transformer.h b/sw/apps/transformer/src/transformer.h index 0b00796c4..ddbf70038 100644 --- a/sw/apps/transformer/src/transformer.h +++ b/sw/apps/transformer/src/transformer.h @@ -838,13 +838,14 @@ static inline void transformer_layer_fp64(transformer_layer_fp64_t *const l) { // double used_memory_kB = (double)((uint64_t)tcdm_ptr - (uint64_t)snrt_l1_next()) / 1024.0f; // dump_debug(used_memory_kB); - // determine the column offset for the current cluster + // determine the column offset of the ifmap for the current cluster uint32_t cluster_ifmap_offset = cluster_id * l->seq_len * l->positional_embeddings_fa; - if (core_id == 0) { - dump_idx(cluster_id); - dump_idx(cluster_ifmap_offset); - } - uint32_t cluster_weights_offset = cluster_id * l->positional_embeddings_fa * B_c_lin2; + // if (core_id == 0) { + // dump_idx(cluster_id); + // dump_idx(cluster_ifmap_offset); + // } + // determine the column offset of the weights for the current cluster + uint32_t cluster_weights_offset = cluster_id * l->positional_embeddings_fa * l->embeddings_lin2; uint32_t start_loop_outer = snrt_mcycle(); for (int t_r = 0; t_r < T_r_lin2; t_r++) { @@ -867,12 +868,44 @@ static inline void transformer_layer_fp64(transformer_layer_fp64_t *const l) { snrt_dma_wait_all(); - for (int i = 0; i < B_r_lin2 * l->positional_embeddings_fa; i++) { - dump_idx(i + ifmap_offset); - dump_debug(ifmap_lin2[i]); - // printf("ifmap[%d] = %f\n", i, ifmap_lin2[i]); + // for (int i = 0; i < B_r_lin2 * l->positional_embeddings_fa; i++) { + // dump_idx(i + ifmap_offset); + // dump_debug(ifmap_lin2[i]); + // // printf("ifmap[%d] = %f\n", i + ifmap_offset, ifmap_lin2[i]); + // } + + } + uint32_t end_dma = snrt_mcycle(); + + snrt_cluster_hw_barrier(); + + uint32_t start_loop_inner = snrt_mcycle(); + for (int t_c = 0; t_c < T_c_lin2; t_c++) { + // weights: P x B_c + uint32_t weights_offset = t_c * B_c_lin2 * l->positional_embeddings_fa + cluster_weights_offset; + uint32_t start_dma = snrt_mcycle(); + if (!snrt_is_compute_core()) { + // load the weights tile + snrt_dma_txid_t txid_weights = + snrt_dma_start_2d( + weights_lin2, /* dst */ + l->weights_lin2 + weights_offset, /* src */ + B_c_lin2 * sizeof(double), /* size */ + B_c_lin2 * sizeof(double), /* dst_stride */ + l->positional_embeddings_fa * sizeof(double), /* src_stride */ + l->positional_embeddings_fa); /* repetitions */ + + snrt_dma_wait_all(); + + for (int i = 0; i < B_c_lin2 * l->positional_embeddings_fa; i++) { + dump_idx(i + weights_offset); + dump_debug(weights_lin2[i]); + // printf("weights[%d] = %f\n", i + weights_offset, weights_lin2[i]); + } } + uint32_t end_dma = snrt_mcycle(); + snrt_cluster_hw_barrier(); } }