Skip to content

Commit

Permalink
Merge pull request #22 from TurakhiaLab/optimize-banding
Browse files Browse the repository at this point in the history
Optimize banding
  • Loading branch information
ioeddk authored May 30, 2024
2 parents 6259c85 + 459de50 commit f9a0a4a
Show file tree
Hide file tree
Showing 25 changed files with 1,992 additions and 199 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -g -fsanitize=address -fno-inline -D
# "-fsanitize=address" flag was used to check the stack smashing with Google Address Sanitizer. Use this flag with CLang and
# run the program to check.

set(DP_HLS_HOME "/home/centos/workspace/DP-HLS")
set(DP_HLS_HOME "/home/centos/workspace/banding/DP-HLS")

set(EXECUTABLE_TARGETS
baseline_local_linear
Expand Down
11 changes: 11 additions & 0 deletions README_1.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Usage
There is a folder for the full compilation of each kernel.

After creating a folder for the kernel, please soft link the `src`, `include`, and `common` folders under the kernel folder.
Then copy the `Makefile` and `utils.mk` to the current folder. Do not forget to modify the proper kernel file.

# Notice
When migrating a `params.h` file, remember to add the Vitis library header and to remove the definitions of `MAX_QUERY_LENGTH`, `MAX_REFERENCE_LENGTH`, `PE_NUM`, and `N_BLOCKS`. Also remember to add the common definitions shared across different kernels.

# Note
We cannot put the array-partition pragma in the same block where the `hls::vector` is declared; otherwise HLS reports an error about conflicting array-partition and aggregate pragmas, because the aggregate is applied when the vector is declared.
6 changes: 6 additions & 0 deletions cosim_scaling.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
#
# Runs py-hls/auto_cosim.py with `--simulate True` once for each of the four
# scaling configurations (global_linear and global_dtw, PE and block scaling).
#
# The repository root defaults to the original hard-coded location and can be
# overridden by exporting DP_HLS_HOME before calling this script.

DP_HLS_HOME="${DP_HLS_HOME:-/home/centos/workspace/DP-HLS}"
AUTO_COSIM="${DP_HLS_HOME}/py-hls/auto_cosim.py"
CONFIG_DIR="${DP_HLS_HOME}/compile_configs"

python "${AUTO_COSIM}" --config "${CONFIG_DIR}/scaling_global_linear/global_linear_scaling_pe.json" --simulate True
python "${AUTO_COSIM}" --config "${CONFIG_DIR}/scaling_global_linear/global_linear_scaling_blocks.json" --simulate True
python "${AUTO_COSIM}" --config "${CONFIG_DIR}/scaling_global_dtw/global_dtw_scaling_pes.json" --simulate True
python "${AUTO_COSIM}" --config "${CONFIG_DIR}/scaling_global_dtw/global_dtw_scaling_blocks.json" --simulate True
5 changes: 5 additions & 0 deletions further_throughput_1.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
#
# Runs py-hls/parallel_compile.py (`--compile True --num_workers 1 --all True`)
# for the two-piece-affine, global-affine and local-linear configurations,
# one after another.
#
# The repository root defaults to the original hard-coded location and can be
# overridden by exporting DP_HLS_HOME before calling this script.

DP_HLS_HOME="${DP_HLS_HOME:-/home/centos/workspace/DP-HLS}"
PARALLEL_COMPILE="${DP_HLS_HOME}/py-hls/parallel_compile.py"
CONFIG_DIR="${DP_HLS_HOME}/compile_configs"

python "${PARALLEL_COMPILE}" --config "${CONFIG_DIR}/global_two_piece_affine/two_piece_affine_common_config_max.json" --compile True --num_workers 1 --all True
python "${PARALLEL_COMPILE}" --config "${CONFIG_DIR}/global_affine/global_affine_common_config.json" --compile True --num_workers 1 --all True
python "${PARALLEL_COMPILE}" --config "${CONFIG_DIR}/local_linear/local_linear_common_configs.json" --compile True --num_workers 1 --all True
4 changes: 4 additions & 0 deletions further_throughput_2.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
#
# Runs py-hls/parallel_compile.py (`--compile True --num_workers 1 --all True`)
# for the overlap_suffix_prefix and semiglobal_short_long configurations,
# one after another.
#
# The repository root defaults to the original hard-coded location and can be
# overridden by exporting DP_HLS_HOME before calling this script.

DP_HLS_HOME="${DP_HLS_HOME:-/home/centos/workspace/DP-HLS}"
PARALLEL_COMPILE="${DP_HLS_HOME}/py-hls/parallel_compile.py"
CONFIG_DIR="${DP_HLS_HOME}/compile_configs"

python "${PARALLEL_COMPILE}" --config "${CONFIG_DIR}/overlap_suffix_prefix/overlap_suffix_prefix_config.json" --compile True --num_workers 1 --all True
# NOTE(review): the semiglobal_short_long folder uses a file named
# local_linear_common_configs.json — confirm this is the intended config.
python "${PARALLEL_COMPILE}" --config "${CONFIG_DIR}/semiglobal_short_long/local_linear_common_configs.json" --compile True --num_workers 1 --all True
8 changes: 3 additions & 5 deletions include/PE.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,9 @@ namespace PE
* @param tbp Traceback Poitner Out
*/
void PEUnrollFixedSep(
dp_mem_block_t &dp_mem,
const input_char_block_t &qry,
const input_char_block_t &ref,
const idx_t wavefront,
const idx_t (&l_lim)[PE_NUM], const idx_t (&u_lim)[PE_NUM],
const dp_mem_block_t dp_mem,
const input_char_block_t qry,
const input_char_block_t ref,
const Penalties penalties,
wavefront_scores_inf_t &score,
tbp_vec_t &tbp);
Expand Down
11 changes: 4 additions & 7 deletions include/align.h
Original file line number Diff line number Diff line change
Expand Up @@ -349,9 +349,9 @@ namespace Align
const chunk_col_scores_inf_t &init_col_scr,
score_vec_t (&init_row_scr)[MAX_REFERENCE_LENGTH],
idx_t p_cols, const idx_t ck_idx,
const idx_t (&local_l_lim)[PE_NUM], const idx_t (&local_u_lim)[PE_NUM],
idx_t &l_lim_reg, idx_t &u_lim_reg,
const bool (&col_pred)[PE_NUM],
const idx_t global_query_length, const idx_t reference_length,
const idx_t global_query_length, const idx_t local_query_length, const idx_t reference_length,
const Penalties &penalties,
ScorePack (&max)[PE_NUM], // write out so must pass by reference
tbp_t (&chunk_tbp_out)[PE_NUM][TBMEM_SIZE]
Expand All @@ -376,7 +376,7 @@ namespace Align
*/
void MapPredicate(
const idx_t (&local_l_lim)[PE_NUM], const idx_t (&local_u_lim)[PE_NUM],
const idx_t ck_start_col, const bool (&col_pred)[PE_NUM],
const idx_t (&virtual_cols)[PE_NUM], const bool (&col_pred)[PE_NUM],
bool (&predicate)[PE_NUM]);

template <typename IDX_T, int MAX_QUERY_LENGTH_, int BANDWIDTH_>
Expand All @@ -401,21 +401,18 @@ namespace Align
void PrepareLocals(
const char_t (&query)[MAX_QUERY_LENGTH_],
const score_vec_t (&init_col_scr)[MAX_QUERY_LENGTH_],
const idx_t (&l_lim)[MAX_QUERY_LENGTH_], const idx_t (&u_lim)[MAX_QUERY_LENGTH_],
char_t (&local_query)[PE_NUM_],
chunk_col_scores_inf_t &init_col_scr_local,
idx_t (&local_l_lim)[PE_NUM_], idx_t (&local_u_lim)[PE_NUM_],
bool (&col_pred)[PE_NUM_], const idx_t local_query_len,
const idx_t idx)
{
FixedBandingPrepareChunkLocals:
init_col_scr_local[0] = init_col_scr_local[PE_NUM_]; // backup the last element from previous chunk
for (int i = 0; i < PE_NUM_; i++)
{
init_col_scr_local[i + 1] = init_col_scr[idx + i];
local_query[i] = query[idx + i];
col_pred[i] = i < local_query_len;
local_l_lim[i] = l_lim[idx + i];
local_u_lim[i] = u_lim[idx + i];
}
}
}
Expand Down
94 changes: 94 additions & 0 deletions include/debug.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <array>
#include <string>
// #include <experimental/filesystem>
#include <map>
#include <fstream>
#include <unordered_map>
#include <hls_vector.h>
Expand Down Expand Up @@ -36,6 +37,27 @@ class Container {
array<array<tbr_t, MAX_REFERENCE_LENGTH>, MAX_QUERY_LENGTH> tb_mat_kernel;
array<array<char, MAX_REFERENCE_LENGTH>, MAX_QUERY_LENGTH> tb_mat_cpp; // this need to be translated

std::map<std::pair<uint, uint>, std::array<std::array<std::array<float, N_LAYERS>, PE_NUM+1>, 2>> wf_dp_mem;
std::map<std::pair<uint, uint>, std::array<bool, PE_NUM>> wf_predicates;

/**
 * @brief Per-cell debug record stored in scores_infos.
 *
 * Every member has a default initializer so that a default-constructed
 * record never carries indeterminate values into the dump routine.
 */
struct score_info {
    float up[N_LAYERS] = {};    // filled from dp_mem[pe][0] by set_score_info_dependency
    float left[N_LAYERS] = {};  // filled from dp_mem[pe+1][0] by set_score_info_dependency
    float diag[N_LAYERS] = {};  // filled from dp_mem[pe][1] by set_score_info_dependency
    float write[N_LAYERS] = {}; // NOTE(review): not populated by any helper visible in this header
    bool pred = false;          // predicate flag set by set_score_info_predicates
    bool exiting = false;       // when true, exiting_pe is printed by dump_scores_infos
    bool entering = false;      // when true, entering_pe is printed by dump_scores_infos
    int entering_pe = 0;
    int exiting_pe = 0;
};

/**
* @brief Record the information of scores, with their coordinate as the index
*
*/
std::map<std::pair<int, int>, score_info> scores_infos;

Container() {};

void cast_scores();
Expand All @@ -45,6 +67,78 @@ class Container {

void set_score(int chunk_row_offset, int chunk_col_offset, int pe_num, int wavefront, score_vec_t vals, bool pred);
void set_scores_wf(int chunk_row_offset, int chunk_col_offset, int wavefront, score_vec_t vals[PE_NUM], bool predicates[PE_NUM]);

/**
 * @brief Store a float copy of a DP memory block in wf_dp_mem.
 *
 * The copy is keyed by (ck_idx, wf_idx). The stored layout is
 * [second index][first index][layer], i.e. the first two axes of
 * dp_mem are swapped in the snapshot.
 *
 * @param ck_idx First component of the map key.
 * @param wf_idx Second component of the map key.
 * @param dp_mem Block to copy; read as dp_mem[0..PE_NUM][0..1][0..N_LAYERS-1].
 */
template <typename IDX_T>
void set_wf_dp_mem(IDX_T ck_idx, IDX_T wf_idx, dp_mem_block_t dp_mem){
    std::array<std::array<std::array<float, N_LAYERS>, PE_NUM+1>, 2> snapshot;
    for (int slot = 0; slot < 2; slot++){
        for (int pe = 0; pe < PE_NUM+1; pe++){
            for (int layer = 0; layer < N_LAYERS; layer++){
                snapshot[slot][pe][layer] = dp_mem[pe][slot][layer];
            }
        }
    }
    wf_dp_mem[std::make_pair(ck_idx, wf_idx)] = snapshot;
}

template <typename IDX_T>
void set_score_info_dependency(IDX_T chunk_offset, IDX_T wf_idx, dp_mem_block_t dp_mem){
for (int i = 0; i < PE_NUM; i++){
score_info curr_info;
scores_infos[std::make_pair(chunk_offset + i, wf_idx - i)] = curr_info;
for (int k = 0; k < N_LAYERS; k++){
scores_infos[std::make_pair(chunk_offset + i, wf_idx - i)].up[k] = dp_mem[i][0][k];
scores_infos[std::make_pair(chunk_offset + i, wf_idx - i)].left[k] = dp_mem[i+1][0][k];
scores_infos[std::make_pair(chunk_offset + i, wf_idx - i)].diag[k] = dp_mem[i][1][k];
}

}
}

template <typename IDX_T>
void set_score_info_entering_exiting(IDX_T chunk_offset, IDX_T wf_idx, bool entering, bool exiting, int entering_pe, int exiting_pe){
for (int i = 0; i < PE_NUM; i++){
scores_infos[std::make_pair(chunk_offset + i, wf_idx - i)].entering = entering;
scores_infos[std::make_pair(chunk_offset + i, wf_idx - i)].exiting = exiting;
scores_infos[std::make_pair(chunk_offset + i, wf_idx - i)].entering_pe = entering_pe;
scores_infos[std::make_pair(chunk_offset + i, wf_idx - i)].exiting_pe = exiting_pe;
}
}

/**
 * @brief Write every record in scores_infos to a debug file.
 *
 * Emits a "Scores: " header, then one line per (coordinate, layer) with the
 * up/left/diag values and the predicate flag. The entering/exiting PE is
 * appended only when the corresponding flag is set.
 *
 * @tparam N_LAYERS_ Number of layers printed for each record.
 * @param file Output stream the lines are written to.
 */
template <int N_LAYERS_>
void dump_scores_infos(ofstream &file){
    file << "Scores: " << endl;
    for (auto it = this->scores_infos.begin(); it != this->scores_infos.end(); ++it) {
        const int coord_first = it->first.first;
        const int coord_second = it->first.second;
        const score_info &info = it->second;
        for (int layer = 0; layer < N_LAYERS_; layer++){
            file << "Coordinate: (" << coord_first << ", " << coord_second << "), Layer: " << layer;
            file << ", Up: " << info.up[layer];
            file << ", Left: " << info.left[layer];
            file << ", Diag: " << info.diag[layer];
            file << ", Pred: " << info.pred << ", ";
            if (info.entering){
                file << "Entering PE: " << info.entering_pe << ", ";
            }
            if (info.exiting){
                file << "Exiting PE: " << info.exiting_pe << ", ";
            }
            file << endl;
        }
    }
}
// set score info predicate
template <typename IDX_T>
void set_score_info_predicates(IDX_T ck_offset, IDX_T wf_idx, bool preds[PE_NUM]){
for (int i = 0; i < PE_NUM; i++){
this->scores_infos[std::make_pair(ck_offset + i, wf_idx-i)].pred = preds[i];
}
}

// template <typename IDX_T>
// void dump_tb_info(ofstream &file){

// }

void compare_scores(array<array<array<float, MAX_REFERENCE_LENGTH>, MAX_QUERY_LENGTH>, N_LAYERS> scores_sol,
int query_len, int ref_len);
Expand Down
21 changes: 11 additions & 10 deletions include/solution_viterbi.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ void viterbi_solution(std::string query, std::string reference, PENALTY_T &penal
double del_up;
double ins_left;
double main_diag, main_up, main_left;
double ins_diag, del_diag;

if (i == 0 && j == 0)
{
Expand All @@ -97,10 +98,10 @@ void viterbi_solution(std::string query, std::string reference, PENALTY_T &penal
main_left = initial_col[0][1];

ins_left = initial_col[0][0];
// ins_diag = 0;
ins_diag = 0;

del_up = initial_row[0][2];
// del_diag = 0;
del_diag = 0;
}
else if (i == 0 && j > 0) // In first row, not column
{
Expand All @@ -109,10 +110,10 @@ void viterbi_solution(std::string query, std::string reference, PENALTY_T &penal
main_left = score_mat[1][i][j - 1];

ins_left = score_mat[0][0][j - 1];
// ins_diag = initial_row[j - 1][0];
ins_diag = initial_row[j - 1][0];

del_up = initial_row[j][2];
// del_diag = initial_row[j - 1][2];
del_diag = initial_row[j - 1][2];
}
else if (i > 0 && j == 0)
{
Expand All @@ -121,10 +122,10 @@ void viterbi_solution(std::string query, std::string reference, PENALTY_T &penal
main_left = initial_col[i][1];

ins_left = initial_col[i][0];
// ins_diag = initial_col[i - 1][0];
ins_diag = initial_col[i - 1][0];

del_up = score_mat[2][i - 1][j];
// del_diag = initial_col[i - 1][2];
del_diag = initial_col[i - 1][2];
}
else
{
Expand All @@ -133,10 +134,10 @@ void viterbi_solution(std::string query, std::string reference, PENALTY_T &penal
main_left = score_mat[1][i][j - 1];

ins_left = score_mat[0][i][j - 1];
// ins_diag = score_mat[0][i-1][j-1];
ins_diag = score_mat[0][i-1][j-1];

del_up = score_mat[2][i - 1][j];
// del_diag = score_mat[2][i-1][j-1];
del_diag = score_mat[2][i-1][j-1];
}

double del_write, ins_write, main_write; // values write to the score matrix
Expand All @@ -152,8 +153,8 @@ void viterbi_solution(std::string query, std::string reference, PENALTY_T &penal
del_write = penalties.transition[4][HostUtils::Sequence::base_to_num(reference[j])] + (del_open_b ? del_open : del_extend);

double main_match = penalties.log_1_m_2_lambda + main_diag;
double main_ins = penalties.log_mu + ins_left;
double main_del = penalties.log_mu + del_up;
double main_ins = penalties.log_mu + ins_diag;
double main_del = penalties.log_mu + del_diag;

double main_max = main_match;
main_max = main_max > main_ins ? main_max : main_ins;
Expand Down
15 changes: 11 additions & 4 deletions include/solutions.h
Original file line number Diff line number Diff line change
Expand Up @@ -341,10 +341,17 @@ void fixed_banding_global_linear_solution(std::string query, std::string referen
int llim[SOL_MAX_QUERY_LENGTH], ulim[SOL_MAX_QUERY_LENGTH];
for (int i = 0; i < SOL_MAX_QUERY_LENGTH; i++)
{
llim[i] = max(0, i - SOL_BANDWIDTH);
ulim[i] = min(SOL_MAX_REFERENCE_LENGTH - 1, i + SOL_BANDWIDTH - 1);
llim[i] = i - SOL_BANDWIDTH;
ulim[i] = i + SOL_BANDWIDTH - 1;
}

// print ulim
for (int i = 0; i < SOL_MAX_QUERY_LENGTH; i++) {
cout << ulim[i] << " ";

}
cout << endl;

// Fill in the DP matrix and traceback matrix
for (int i = 0; i < query.length(); i++)
{
Expand Down Expand Up @@ -2593,7 +2600,7 @@ void global_two_piece_affine_solution(std::string query, std::string reference,
template <typename T, int M, int N>
void print_matrix(array<array<T, N>, M> &mat, string name, std::set<std::tuple<int, int, int>> incorrect_coordinates, int layer_k)
{
int width = 6;
int width = 2;
cout << name << endl;
for (int i = 0; i < M; i++)
{
Expand Down Expand Up @@ -2629,7 +2636,7 @@ void fprint_matrix(ofstream &file, array<array<T, N>, M> &mat, string name)
template <typename T, int M, int N>
void fprint_matrix(ofstream &file, array<array<T, N>, M> &mat, string query, string reference, string name)
{
int width = 2;
int width = 3;
file << name << endl;
file << std::right << std::setw(width) << " ";
file << std::right << std::setw(width) << " ";
Expand Down
28 changes: 28 additions & 0 deletions include/traceback.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,14 @@ namespace Traceback
int ck_idx, int pe_idx, int col_idx, int v_row, int v_col)
{

#ifdef CMAKEDEBUG
// set the initial state of the traceback to be AL_END
for (int i = 0; i < MAX_QUERY_LENGTH + MAX_REFERENCE_LENGTH; i++)
{
traceback_out[i] = AL_END;
}
#endif

#pragma HLS bind_storage variable = traceback_out type = fifo impl = uram
int pe = pe_idx; // row index, but in tbmat
int col = col_idx;
Expand All @@ -126,6 +134,26 @@ namespace Traceback
TB_STATE state;
ALIGN_TYPE::Traceback::StateInit(tbmat[pe][col], state);

#ifdef CMAKEDEBUG
// print the content fo tbmat
// Please print it with certain width for the first row, which is the index

for (int j = 0; j < TBMEM_SIZE; j++)
{
std::cout << std::setw(3) << j << " ";
}
std::cout << std::endl;
for (int i = 0; i < PE_NUM; i++)
{
for (int j = 0; j < TBMEM_SIZE; j++)
{
std::cout << std::setw(3) << tbmat[i][j].to_int() << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
#endif

traceback_loop:
while (navigation != AL_END) // Now solely this flag determines whether to stop the traceback.
{
Expand Down
Loading

0 comments on commit f9a0a4a

Please sign in to comment.