diff --git a/CMakeLists.txt b/CMakeLists.txt index 578fb09c..4e55560d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -164,7 +164,6 @@ add_executable(test_csim_local_two_piece_affine "kernels/local_two_piece_affine/kernel_local_two_piece_affine.cpp" ${COMMON_SRCS}) - add_executable(test_global_affine_real_data "testbench/test_csim_global_affine_real_data.cpp" "kernels/global_affine/kernel_global_affine.cpp" diff --git a/include/PE.h b/include/PE.h index b7ef46ce..551c0c11 100644 --- a/include/PE.h +++ b/include/PE.h @@ -29,6 +29,9 @@ namespace PE const input_char_block_t qry, const input_char_block_t ref, const Penalties penalties, +#ifdef LOCAL_TRANSITION_MATRIX + const type_t (&transitions)[PE_NUM][TRANSITION_MATRIX_SIZE][TRANSITION_MATRIX_SIZE], +#endif tbp_vec_t &tbp); /** @@ -47,6 +50,9 @@ namespace PE const input_char_block_t &qry, const input_char_block_t &ref, const Penalties penalties, +#ifdef LOCAL_TRANSITION_MATRIX + const type_t (&transitions)[PE_NUM][TRANSITION_MATRIX_SIZE][TRANSITION_MATRIX_SIZE], +#endif wavefront_scores_inf_t &score, tbp_vec_t &tbp); @@ -66,6 +72,9 @@ namespace PE const input_char_block_t qry, const input_char_block_t ref, const Penalties penalties, +#ifdef LOCAL_TRANSITION_MATRIX + const type_t (&transitions)[PE_NUM][TRANSITION_MATRIX_SIZE][TRANSITION_MATRIX_SIZE], +#endif wavefront_scores_inf_t &score, tbp_vec_t &tbp); diff --git a/include/align.h b/include/align.h index 0bbdc39c..e20528ad 100644 --- a/include/align.h +++ b/include/align.h @@ -254,6 +254,9 @@ namespace Align const idx_t query_length, const idx_t reference_length, const Penalties &penalties, +#ifdef LOCAL_TRANSITION_MATRIX + const type_t (&transitions)[TRANSITION_MATRIX_SIZE][TRANSITION_MATRIX_SIZE], +#endif idx_t &tb_i, idx_t &tb_j #ifndef NO_TRACEBACK , tbr_t (&tb_out)[MAX_REFERENCE_LENGTH + MAX_QUERY_LENGTH] @@ -286,6 +289,9 @@ namespace Align idx_t global_query_length, idx_t query_length, idx_t reference_length, const bool (&col_pred)[PE_NUM], const 
Penalties &penalties, +#ifdef LOCAL_TRANSITION_MATRIX + const type_t (&transitions)[PE_NUM][TRANSITION_MATRIX_SIZE][TRANSITION_MATRIX_SIZE], +#endif ScorePack (&max)[PE_NUM] #ifndef NO_TRACEBACK , tbp_t (&chunk_tbp_out)[PE_NUM][TBMEM_SIZE] @@ -329,6 +335,9 @@ namespace Align const idx_t query_length, const idx_t reference_length, const Penalties &penalties, +#ifdef LOCAL_TRANSITION_MATRIX + const type_t (&transitions)[TRANSITION_MATRIX_SIZE][TRANSITION_MATRIX_SIZE], +#endif idx_t &tb_i, idx_t &tb_j #ifndef NO_TRACEBACK , tbr_t (&tb_out)[MAX_REFERENCE_LENGTH + MAX_QUERY_LENGTH] @@ -362,6 +371,9 @@ namespace Align const bool (&col_pred)[PE_NUM], const idx_t global_query_length, const idx_t local_query_length, const idx_t reference_length, const Penalties &penalties, +#ifdef LOCAL_TRANSITION_MATRIX + const type_t (&transitions)[PE_NUM][TRANSITION_MATRIX_SIZE][TRANSITION_MATRIX_SIZE], +#endif ScorePack (&max)[PE_NUM] // write out so must pass by reference #ifndef NO_TRACEBACK , tbp_t (&chunk_tbp_out)[PE_NUM][TBMEM_SIZE] diff --git a/include/frontend.h b/include/frontend.h index 38396221..49062a33 100644 --- a/include/frontend.h +++ b/include/frontend.h @@ -19,6 +19,9 @@ namespace ALIGN_TYPE score_vec_t diag_prev, score_vec_t left_prev, const Penalties penalties, +#ifdef LOCAL_TRANSITION_MATRIX + const type_t transitions[TRANSITION_MATRIX_SIZE][TRANSITION_MATRIX_SIZE], +#endif score_vec_t &write_score, tbp_t &write_traceback); diff --git a/include/seq_align_multiple.h b/include/seq_align_multiple.h index 520a5e4c..b878f8ea 100644 --- a/include/seq_align_multiple.h +++ b/include/seq_align_multiple.h @@ -24,7 +24,10 @@ extern "C" { char_t (&references)[MAX_REFERENCE_LENGTH][N_BLOCKS], idx_t (&query_lengths)[N_BLOCKS], idx_t (&reference_lengths)[N_BLOCKS], - Penalties (&penalties)[N_BLOCKS], + const Penalties (&penalties)[N_BLOCKS], +#ifdef LOCAL_TRANSITION_MATRIX + const type_t (&transitions)[TRANSITION_MATRIX_SIZE][TRANSITION_MATRIX_SIZE], +#endif idx_t 
(&tb_is)[N_BLOCKS], idx_t (&tb_js)[N_BLOCKS] #ifndef NO_TRACEBACK , tbr_t (&tb_streams)[MAX_REFERENCE_LENGTH + MAX_QUERY_LENGTH][N_BLOCKS] diff --git a/kernels/dtw/params.h b/kernels/dtw/params.h index 8c40d9f2..f8670a55 100644 --- a/kernels/dtw/params.h +++ b/kernels/dtw/params.h @@ -13,9 +13,10 @@ #define PE_NUM 32 #define LAYER_MAXIMIUM 0 // We need to indicate from which layer (main matrix) is the maximum score stored. +#define BANDING RECTANGULAR + typedef ap_fixed<32, 26> num_t; -#define BANDING RECTANGULAR // Primitive Types struct char_t_st { @@ -30,6 +31,7 @@ struct char_t_st { }; typedef char_t_st char_t; // Sequence Alphabet + typedef ap_fixed<32,26> type_t; // Scores Type typedef short idx_t; // Indexing Type, could be much less than 32. ap_uint<8> typedef ap_uint<2> tbp_t; // Traceback Pointer Type diff --git a/kernels/global_linear/params.h b/kernels/global_linear/params.h index 13a6f1cf..3e5912d8 100644 --- a/kernels/global_linear/params.h +++ b/kernels/global_linear/params.h @@ -36,25 +36,6 @@ typedef ap_uint<2> tbp_t; // Traceback Pointer Type #define DEBUG_OUTPUT_PATH "/home/yic033@AD.UCSD.EDU/DP-HLS-Debug/global_affine/" #define DEBUG_FILENAME "debug_kernel" -struct ScorePack{ - type_t score; - idx_t row; - idx_t col; - idx_t p_col; - idx_t ck; - idx_t pe; - - // Default Constructor - ScorePack() { - score = 0; - row = 0; - col = 0; - p_col = 0; - ck = 0; - pe = 0; - } -}; - struct Penalties { type_t open; type_t extend; diff --git a/kernels/local_affine/kernel_local_affine.cpp b/kernels/local_affine/kernel_local_affine.cpp index 7b0446e7..6caea87f 100644 --- a/kernels/local_affine/kernel_local_affine.cpp +++ b/kernels/local_affine/kernel_local_affine.cpp @@ -13,9 +13,9 @@ void LocalAffine::PE::Compute(char_t local_query_val, // Define Traceback Pointer Navigation Direction const type_t insert_open = left_prev[1] + penalties.open + penalties.extend; // Insert open - const type_t insert_extend = left_prev[0] + penalties.open; // insert extend + 
const type_t insert_extend = left_prev[0] + penalties.extend; // insert extend const type_t delete_open = up_prev[1] + penalties.open + penalties.extend; // delete open - const type_t delete_extend = up_prev[2] + penalties.open; // delete extend + const type_t delete_extend = up_prev[2] + penalties.extend; // delete extend bool insert_open_b = insert_open > insert_extend; bool delete_open_b = delete_open > delete_extend; diff --git a/kernels/protein_local_affine/kernel_protein_local_affine.cpp b/kernels/protein_local_affine/kernel_protein_local_affine.cpp index 468e15ac..cb5a56e9 100644 --- a/kernels/protein_local_affine/kernel_protein_local_affine.cpp +++ b/kernels/protein_local_affine/kernel_protein_local_affine.cpp @@ -6,16 +6,19 @@ void ProteinLocalAffine::PE::Compute(char_t local_query_val, score_vec_t diag_prev, score_vec_t left_prev, const Penalties penalties, +#ifdef LOCAL_TRANSITION_MATRIX + const type_t transitions[TRANSITION_MATRIX_SIZE][TRANSITION_MATRIX_SIZE], +#endif score_vec_t &write_score, tbp_t &write_traceback) { // Define Traceback Pointer Navigation Direction - const type_t insert_open = left_prev[1] + penalties.open + penalties.extend; // Insert open - const type_t insert_extend = left_prev[0] + penalties.open; // insert extend - const type_t delete_open = up_prev[1] + penalties.open + penalties.extend; // delete open - const type_t delete_extend = up_prev[2] + penalties.open; // delete extend + const type_t insert_open = left_prev[1] + penalties.open; // Insert open + const type_t insert_extend = left_prev[0] + penalties.extend; // insert extend + const type_t delete_open = up_prev[1] + penalties.open; // delete open + const type_t delete_extend = up_prev[2] + penalties.extend; // delete extend bool insert_open_b = insert_open > insert_extend; bool delete_open_b = delete_open > delete_extend; @@ -24,8 +27,10 @@ void ProteinLocalAffine::PE::Compute(char_t local_query_val, tbp_t insert_tb = insert_open_b ? 
(tbp_t) 0 : TB_IMAT; tbp_t delete_tb = delete_open_b ? (tbp_t) 0 : TB_DMAT; - - const type_t match = (local_query_val == local_reference_val) ? diag_prev[1] + penalties.match : diag_prev[1] + penalties.mismatch; + // transition SW + // std::cout << "accessing" << local_query_val.to_int() << "and" << local_reference_val.to_int() << std::endl; + const type_t match = transitions[local_query_val][local_reference_val] + diag_prev[1]; + // std::cout << "accessed" << std::endl; type_t max_value = write_score[0] > write_score[2] ? write_score[0] : write_score[2]; // compare between insertion and deletion max_value = max_value > match ? max_value : match; @@ -68,16 +73,12 @@ void ProteinLocalAffine::InitializeScores( InitializeColumnScores: for (int i = 0; i < MAX_QUERY_LENGTH; i++) { - init_col_scr[i][0] = NINF; - init_col_scr[i][1] = 0.0; - init_col_scr[i][2] = 0.0; + init_col_scr[i] = {NINF, 0,0 }; } InitializeRowScores: for (int i = 0; i < MAX_REFERENCE_LENGTH; i++) { - init_row_scr[i][0] = 0.0; - init_row_scr[i][1] = 0.0; - init_row_scr[i][2] = NINF; + init_row_scr[i] = {0, 0, NINF}; } } diff --git a/kernels/protein_local_affine/params.h b/kernels/protein_local_affine/params.h index 831fd2dd..548dc0e9 100644 --- a/kernels/protein_local_affine/params.h +++ b/kernels/protein_local_affine/params.h @@ -10,6 +10,11 @@ #define BANDING RECTANGULAR +#define LOCAL_TRANSITION_MATRIX +#define TRANSITION_MATRIX_SIZE 20 + +#define SCORED + #define ALIGN_TYPE ProteinLocalAffine #define N_BLOCKS 1 #define N_LAYERS 3 @@ -46,9 +51,6 @@ typedef ap_uint<4> tbp_t; // Traceback Pointer Type struct Penalties { type_t open; type_t extend; - type_t mismatch; - type_t match; - type_t linear_gap; }; enum TB_STATE { diff --git a/kernels/sdtw/kernel_sdtw.cpp b/kernels/sdtw/kernel_sdtw.cpp index 89b3502d..f11ab9a6 100644 --- a/kernels/sdtw/kernel_sdtw.cpp +++ b/kernels/sdtw/kernel_sdtw.cpp @@ -35,6 +35,7 @@ void SDTW::PE::Compute(char_t local_query_val, // The RTL code uses left and diagonal is 
because possible the query and reference is transposed. // find max from diagonal and left write_score[0] = (diag_prev[0] < up_prev[0] ? diag_prev[0] : up_prev[0]) + abs(local_query_val - local_reference_val); + } void SDTW::UpdatePEMaximum( diff --git a/kernels/sdtw/params.h b/kernels/sdtw/params.h index c964305e..a5df27ba 100644 --- a/kernels/sdtw/params.h +++ b/kernels/sdtw/params.h @@ -18,7 +18,7 @@ */ #define MAX_QUERY_LENGTH 500 // This is read length in SF -#define MAX_REFERENCE_LENGTH 1000 // This is Reference length in SF. +#define MAX_REFERENCE_LENGTH 60000 // This is Reference length in SF. // The default PE_NUM is 100. diff --git a/some notes b/some notes index 897ffa21..d8816a5d 100644 --- a/some notes +++ b/some notes @@ -12,4 +12,7 @@ /usr/lib64/libOpenCL.so /usr/lib64/libOpenCL.so.1 /usr/lib64/libOpenCL.so.1.0.0 -(base) [centos@ip-172-31-21-210 DP-HLS]$ \ No newline at end of file +(base) [centos@ip-172-31-21-210 DP-HLS]$ + +Kernel Aligned Query : MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPSEKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLDAKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHLEKDLVKDFKALV +Kernel Aligned Reference: ___________________________________________________________________________________MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQTCASGFCTSQPLCARIKKTQVCGLRYSSKGKDPLVSAEWDSRGAPYV_RCTYDADL_IDTQAQV \ No newline at end of file diff --git a/src/align.cpp b/src/align.cpp index 15c56995..ac28fbe4 100644 --- a/src/align.cpp +++ b/src/align.cpp @@ -134,6 +134,9 @@ void Align::Rectangular::ChunkCompute( idx_t global_query_length, idx_t query_length, idx_t reference_length, const bool (&col_pred)[PE_NUM], const Penalties &penalties, +#ifdef LOCAL_TRANSITION_MATRIX + const type_t (&transitions)[PE_NUM][TRANSITION_MATRIX_SIZE][TRANSITION_MATRIX_SIZE], +#endif ScorePack (&max)[PE_NUM] #ifndef NO_TRACEBACK , tbp_t (&chunk_tbp_out)[PE_NUM][TBMEM_SIZE] @@ -163,6 +166,14 @@ void 
Align::Rectangular::ChunkCompute( dp_mem_block_t dp_mem; score_vec_t score_buff[PE_NUM + 1]; +#ifdef CMAKEDEBUG + // clear local reference buffer + for (int i = 0; i < PE_NUM; i++) + { + local_reference[i] = ZERO_CHAR; + } +#endif + #pragma HLS array_partition variable = local_query type = complete #pragma HLS array_partition variable = local_reference type = complete #pragma HLS array_partition variable = dp_mem type = complete @@ -187,6 +198,9 @@ void Align::Rectangular::ChunkCompute( local_query, local_reference, penalties, +#ifdef LOCAL_TRANSITION_MATRIX + transitions, +#endif score_buff, tbp_out); @@ -308,17 +322,22 @@ void Align::PreserveRowScore( } } -void Align::FindMax::ReductionMaxScores(ScorePack (&packs)[PE_NUM], ScorePack &global_max, idx_t &max_pe) +void Align::FindMax::ReductionMaxScores(ScorePack (&packs)[PE_NUM], ScorePack &global_max, idx_t &max_pe_out) { + idx_t max_pe = 0; + type_t max_score = packs[0].score; + ReductionMax: for (idx_t i = 0; i < PE_NUM; i++) { - if (packs[i].score > packs[max_pe].score) + if (packs[i].score > max_score) { + max_score = packs[i].score; max_pe = i; } } global_max = packs[max_pe]; + max_pe_out = max_pe; } void Align::CopyColScore(chunk_col_scores_inf_t &init_col_scr_local, score_vec_t (&init_col_scr)[MAX_QUERY_LENGTH], idx_t idx) @@ -359,6 +378,9 @@ void Align::Rectangular::AlignStatic( const idx_t query_length, const idx_t reference_length, const Penalties &penalties, +#ifdef LOCAL_TRANSITION_MATRIX + const type_t (&transitions)[TRANSITION_MATRIX_SIZE][TRANSITION_MATRIX_SIZE], +#endif idx_t &tb_i, idx_t &tb_j #ifndef NO_TRACEBACK , tbr_t (&tb_out)[MAX_REFERENCE_LENGTH + MAX_QUERY_LENGTH] @@ -384,6 +406,25 @@ void Align::Rectangular::AlignStatic( #pragma HLS array_partition variable = tbp_matrix type = cyclic factor = PRAGMA_PE_NUM dim = 1 #endif +#ifdef LOCAL_TRANSITION_MATRIX + type_t local_transitions[PE_NUM][TRANSITION_MATRIX_SIZE][TRANSITION_MATRIX_SIZE]; +#pragma HLS array_partition variable = 
local_transitions type = complete dim = 1 +#pragma HLS bind_storage variable = local_transitions type = ram_1p impl = bram + // fill out the local transition matrix + for (idx_t i = 0; i < TRANSITION_MATRIX_SIZE; i++) + { + for (idx_t j = 0; j < TRANSITION_MATRIX_SIZE; j++) + { + for (idx_t k = 0; k < PE_NUM; k++) + { +#pragma HLS unroll + local_transitions[k][i][j] = transitions[i][j]; + } + } + } +#endif + + #ifdef CMAKEDEBUG #ifndef NO_TRACEBACK // initialize tbp_matrix with TB_PH @@ -433,6 +474,9 @@ void Align::Rectangular::AlignStatic( reference_length, col_pred, penalties, +#ifdef LOCAL_TRANSITION_MATRIX + local_transitions, +#endif local_max #ifndef NO_TRACEBACK , tbp_matrix @@ -443,7 +487,8 @@ void Align::Rectangular::AlignStatic( ); } - idx_t max_pe = 0; + + idx_t max_pe; Align::FindMax::ReductionMaxScores(local_max, maximum, max_pe); // >>> Traceback >>> @@ -516,6 +561,9 @@ void Align::Fixed::AlignStatic( const idx_t query_length, const idx_t reference_length, const Penalties &penalties, +#ifdef LOCAL_TRANSITION_MATRIX + const type_t (&transitions)[TRANSITION_MATRIX_SIZE][TRANSITION_MATRIX_SIZE], +#endif idx_t &tb_i, idx_t &tb_j #ifndef NO_TRACEBACK , tbr_t (&tb_out)[MAX_REFERENCE_LENGTH + MAX_QUERY_LENGTH] @@ -541,6 +589,25 @@ void Align::Fixed::AlignStatic( tbp_t tbp_matrix[PE_NUM][TBMEM_SIZE]; #endif +#ifdef LOCAL_TRANSITION_MATRIX + type_t local_transitions[PE_NUM][TRANSITION_MATRIX_SIZE][TRANSITION_MATRIX_SIZE]; +#pragma HLS array_partition variable = local_transitions type = complete dim = 1 +#pragma HLS bind_storage variable = local_transitions type = ram_1p impl = bram + // fill out the local transition matrix + for (idx_t i = 0; i < TRANSITION_MATRIX_SIZE; i++) + { + for (idx_t j = 0; j < TRANSITION_MATRIX_SIZE; j++) + { + for (idx_t k = 0; k < PE_NUM; k++) + { +#pragma HLS unroll + local_transitions[k][i][j] = transitions[i][j]; + } + } + } +#endif + + #ifdef CMAKEDEBUG #ifndef NO_TRACEBACK // initialize tbp_matrix with TB_PH @@ -624,6 +691,9 @@ 
void Align::Fixed::AlignStatic( col_pred, query_length, local_query_length, reference_length, penalties, +#ifdef LOCAL_TRANSITION_MATRIX + local_transitions, +#endif local_max #ifndef NO_TRACEBACK , tbp_matrix @@ -672,6 +742,9 @@ void Align::Fixed::ChunkCompute( const bool (&col_pred)[PE_NUM], const idx_t global_query_length, const idx_t local_query_length, const idx_t reference_length, const Penalties &penalties, +#ifdef LOCAL_TRANSITION_MATRIX + const type_t (&transitions)[PE_NUM][TRANSITION_MATRIX_SIZE][TRANSITION_MATRIX_SIZE], +#endif ScorePack (&max)[PE_NUM] // write out so must pass by reference #ifndef NO_TRACEBACK , tbp_t (&chunk_tbp_out)[PE_NUM][TBMEM_SIZE] @@ -779,6 +852,9 @@ void Align::Fixed::ChunkCompute( local_query, local_reference, penalties, +#ifdef LOCAL_TRANSITION_MATRIX + transitions, +#endif score_buff, tbp_out); diff --git a/src/hosts/host_protein_sw.cpp b/src/hosts/host_protein_sw.cpp new file mode 100644 index 00000000..3bf0fffe --- /dev/null +++ b/src/hosts/host_protein_sw.cpp @@ -0,0 +1,185 @@ +// FIXME: Wheird that if including params.h, the host code will not compile with weird bug. +// Need a more elegant way. 
Currently just redefine those types + +#include "xcl2.hpp" +#include +#include +#include +#include +#include "host_utils.h" +#include "dp_hls_common.h" +#include +#include + + +int main(int argc, char **argv) { + if (argc != 2) { + std::cout << "Usage: " << argv[0] << " " << std::endl; + return EXIT_FAILURE; + } + + std::string binaryFile = argv[1]; + cl_int err; + cl::Context context; + cl::Kernel krnl_seq_align; + cl::CommandQueue q; + + // Allocate memory for each array + // std::vector> querys_chars(N_BLOCKS * MAX_QUERY_LENGTH); + // std::vector> references_chars(N_BLOCKS * MAX_REFERENCE_LENGTH); + std::vector> querys(N_BLOCKS * MAX_QUERY_LENGTH); + std::vector> references(N_BLOCKS * MAX_REFERENCE_LENGTH); + std::vector> query_lengths(N_BLOCKS); + std::vector> reference_lengths(N_BLOCKS); + std::vector> penalties(N_BLOCKS); // Assuming a single penalties struct + std::vector> traceback_start_is(N_BLOCKS); // Allocate buffer for the starting row and column of the buffer + std::vector> traceback_start_js(N_BLOCKS); + std::vector> tb_streams(N_BLOCKS * (MAX_REFERENCE_LENGTH + MAX_QUERY_LENGTH)); + + + + // Initialize data + char alphabet[] = {'A', 'T', 'C', 'G'}; // currently putting just random sequence here + string querys_strings = Random::Sequence<4>(alphabet, N_BLOCKS * MAX_QUERY_LENGTH); + string references_strings = Random::Sequence<4>(alphabet, N_BLOCKS * MAX_REFERENCE_LENGTH); + const char *query_ptr = querys_strings.c_str(); + const char *reference_ptr = references_strings.c_str(); + for (int i = 0; i < N_BLOCKS; i++) { + query_lengths[i] = MAX_QUERY_LENGTH; + reference_lengths[i] = MAX_REFERENCE_LENGTH; + for (int j = 0; j < MAX_QUERY_LENGTH; j++) { + querys[i * MAX_QUERY_LENGTH + j] = (type_t) HostUtils::Sequence::base_to_num(*query_ptr++); + } + for (int j = 0; j < MAX_REFERENCE_LENGTH; j++) { + references[i * MAX_REFERENCE_LENGTH + j] = (type_t) HostUtils::Sequence::base_to_num(*reference_ptr++); + } + // Initialize Penalties + penalties[i].open = 
type_t(-2); + penalties[i].extend = type_t(-1); + } + + // OPENCL HOST CODE AREA START + auto devices = xcl::get_xil_devices(); + auto fileBuf = xcl::read_binary_file(binaryFile); + cl::Program::Binaries bins{{fileBuf.data(), fileBuf.size()}}; + bool valid_device = false; + for (unsigned int i = 0; i < devices.size(); i++) { + auto device = devices[i]; + OCL_CHECK(err, context = cl::Context(device, nullptr, nullptr, nullptr, &err)); + OCL_CHECK(err, q = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err)); + std::cout << "Trying to program device[" << i << "]: " << device.getInfo() << std::endl; + cl::Program program(context, {device}, bins, nullptr, &err); + if (err != CL_SUCCESS) { + std::cout << "Failed to program device[" << i << "] with xclbin file!\n"; + } else { + std::cout << "Device[" << i << "]: program successful!\n"; + OCL_CHECK(err, krnl_seq_align = cl::Kernel(program, "seq_align_multiple_static", &err)); + valid_device = true; + break; + } + } + if (!valid_device) { + std::cout << "Failed to program any device found, exit!\n"; + exit(EXIT_FAILURE); + } + + // Allocate Buffers in Global Memory and set kernel arguments + OCL_CHECK(err, cl::Buffer buffer_querys(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, + sizeof(char_t) * querys.size(), querys.data(), &err)); + OCL_CHECK(err, cl::Buffer buffer_references(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, + sizeof(char_t) * references.size(), references.data(), &err)); + OCL_CHECK(err, cl::Buffer buffer_query_lengths(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, + sizeof(idx_t) * query_lengths.size(), query_lengths.data(), &err)); + OCL_CHECK(err, cl::Buffer buffer_reference_lengths(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, + sizeof(idx_t) * reference_lengths.size(), reference_lengths.data(), &err)); + OCL_CHECK(err, cl::Buffer buffer_penalties(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, + sizeof(Penalties) * penalties.size(), penalties.data(), &err)); + OCL_CHECK(err, 
cl::Buffer buffer_traceback_start_is(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY, + sizeof(idx_t) * traceback_start_is.size(), traceback_start_is.data(), &err)); + OCL_CHECK(err, cl::Buffer buffer_traceback_start_js(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY, + sizeof(idx_t) * traceback_start_js.size(), traceback_start_js.data(), &err)); + OCL_CHECK(err, cl::Buffer buffer_tb_streams(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY, + sizeof(tbr_t) * tb_streams.size(), tb_streams.data(), &err)); + + + // Set Kernel Arguments + OCL_CHECK(err, err = krnl_seq_align.setArg(0, buffer_querys)); + OCL_CHECK(err, err = krnl_seq_align.setArg(1, buffer_references)); + OCL_CHECK(err, err = krnl_seq_align.setArg(2, buffer_query_lengths)); + OCL_CHECK(err, err = krnl_seq_align.setArg(3, buffer_reference_lengths)); + OCL_CHECK(err, err = krnl_seq_align.setArg(4, buffer_penalties)); + OCL_CHECK(err, err = krnl_seq_align.setArg(5, buffer_traceback_start_is)); + OCL_CHECK(err, err = krnl_seq_align.setArg(6, buffer_traceback_start_js)); + OCL_CHECK(err, err = krnl_seq_align.setArg(7, buffer_tb_streams)); + + // Copy input data to device global memory + auto start = std::chrono::high_resolution_clock::now(); + OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_querys, buffer_references, buffer_query_lengths, + buffer_reference_lengths, buffer_penalties}, 0 /* 0 means from host*/)); + + // Launch the Kernel + OCL_CHECK(err, err = q.enqueueTask(krnl_seq_align)); + + + // Copy Result from Device Global Memory to Host Local Memory + OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_traceback_start_is, buffer_traceback_start_js, buffer_tb_streams}, CL_MIGRATE_MEM_OBJECT_HOST)); + q.finish(); + auto end = std::chrono::high_resolution_clock::now(); + + // OPENCL HOST CODE AREA END + + // Print raw traceback pointer streams + for (int i = 0; i < N_BLOCKS; i++) { + std::cout << "Query: " << querys_strings.substr(i * MAX_QUERY_LENGTH, MAX_QUERY_LENGTH) << std::endl; + 
std::cout << "Reference: " << references_strings.substr(i * MAX_REFERENCE_LENGTH, MAX_REFERENCE_LENGTH) << std::endl; + std::cout << "Traceback: " << std::endl; + for (int j = 0; j < MAX_QUERY_LENGTH + MAX_REFERENCE_LENGTH; j++) { + std::cout << tb_streams[i * (MAX_QUERY_LENGTH + MAX_REFERENCE_LENGTH) + j]; + } + std::cout << std::endl; + } + + // set up the array to store the traceback lengthes + // string query_strings_primitive[N_BLOCKS]; + // string reference_strings_primitive[N_BLOCKS]; + // for (int i = 0; i < N_BLOCKS; i++){ + // query_strings_primitive[i] = querys_strings.substr(i * MAX_QUERY_LENGTH, MAX_QUERY_LENGTH); + // reference_strings_primitive[i] = references_strings.substr(i * MAX_REFERENCE_LENGTH, MAX_REFERENCE_LENGTH); + // } + + // tbr_t tb_streams_primitive[N_BLOCKS][MAX_REFERENCE_LENGTH + MAX_QUERY_LENGTH]; + // for (int i = 0; i < N_BLOCKS; i++){ + // for (int j = 0; j < MAX_REFERENCE_LENGTH + MAX_QUERY_LENGTH; j++){ + // tb_streams_primitive[i][j] = tb_streams[i * (MAX_QUERY_LENGTH + MAX_REFERENCE_LENGTH) + j]; + // } + // } + + // int tb_qry_lengths[N_BLOCKS]; + // int tb_ref_lengths[N_BLOCKS]; + // for (int i = 0; i < N_BLOCKS; i++){ + // tb_qry_lengths[i] = traceback_start_is[i]; + // tb_ref_lengths[i] = traceback_start_js[i]; + // } + // std::cout << "Reconstructing Traceback" << std::endl; + // array, N_BLOCKS> kernel_alignments; + // kernel_alignments = ReconstructTracebackBlocks( + // query_strings_primitive, + // reference_strings_primitive, + // tb_qry_lengths, tb_ref_lengths, + // tb_streams_primitive); + + // // Print Actual Alignments + // for (int i = 0; i < N_BLOCKS; i++){ + // std::cout << "Block " << i << " Results" << std::endl; + // std::cout << "Query : " << query_strings_primitive[i] << std::endl; + // std::cout << "Reference: " << reference_strings_primitive[i] << std::endl; + // std::cout << "Kernel Aligned Query : " << kernel_alignments[i]["query"] << std::endl; + // std::cout << "Kernel Aligned Reference: " << 
kernel_alignments[i]["reference"] << std::endl << std::endl; + // } + + // Print time + std::cout << "Kernel execution time: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; + + std::cout << "Kernel execution complete." << std::endl; + return EXIT_SUCCESS; +} diff --git a/src/pe.cpp b/src/pe.cpp index d010f09b..1dc0d103 100644 --- a/src/pe.cpp +++ b/src/pe.cpp @@ -22,6 +22,9 @@ void PE::PEUnroll( const input_char_block_t qry, const input_char_block_t ref, const Penalties penalties, +#ifdef LOCAL_TRANSITION_MATRIX + const type_t (&transitions)[PE_NUM][TRANSITION_MATRIX_SIZE][TRANSITION_MATRIX_SIZE], +#endif tbp_vec_t &tbp) { #pragma HLS array_partition variable = dp_mem dim = 0 type = complete @@ -37,6 +40,9 @@ void PE::PEUnroll( dp_mem[i][2], dp_mem[i+1][1], penalties, +#ifdef LOCAL_TRANSITION_MATRIX + transitions[i], +#endif dp_mem[i+1][0], tbp[i]); } @@ -47,6 +53,9 @@ void PE::PEUnrollSep( const input_char_block_t &qry, const input_char_block_t &ref, const Penalties penalties, +#ifdef LOCAL_TRANSITION_MATRIX + const type_t (&local_transitions)[PE_NUM][TRANSITION_MATRIX_SIZE][TRANSITION_MATRIX_SIZE], +#endif wavefront_scores_inf_t &score, tbp_vec_t &tbp) { @@ -64,6 +73,9 @@ void PE::PEUnrollSep( dp_mem[i][1], dp_mem[i+1][0], penalties, +#ifdef LOCAL_TRANSITION_MATRIX + local_transitions[i], +#endif score[i+1], tbp[i]); } @@ -74,6 +86,9 @@ void PE::PEUnrollFixedSep( const input_char_block_t qry, const input_char_block_t ref, const Penalties penalties, +#ifdef LOCAL_TRANSITION_MATRIX + const type_t (&transitions)[PE_NUM][TRANSITION_MATRIX_SIZE][TRANSITION_MATRIX_SIZE], +#endif wavefront_scores_inf_t &score, tbp_vec_t &tbp){ @@ -92,6 +107,9 @@ void PE::PEUnrollFixedSep( dp_mem[i][1], dp_mem[i+1][0], penalties, +#ifdef LOCAL_TRANSITION_MATRIX + transitions[i], +#endif score[i+1], tbp[i]); #ifdef CMAKEDEBUG diff --git a/src/seq_align_multiple.cpp b/src/seq_align_multiple.cpp index 7d3bf71e..a1617eb2 100644 --- a/src/seq_align_multiple.cpp +++ 
b/src/seq_align_multiple.cpp @@ -38,7 +38,10 @@ extern "C" char_t (&references)[MAX_REFERENCE_LENGTH][N_BLOCKS], idx_t (&query_lengths)[N_BLOCKS], idx_t (&reference_lengths)[N_BLOCKS], - Penalties (&penalties)[N_BLOCKS], + const Penalties (&penalties)[N_BLOCKS], +#ifdef LOCAL_TRANSITION_MATRIX + const type_t (&transitions)[TRANSITION_MATRIX_SIZE][TRANSITION_MATRIX_SIZE], +#endif idx_t (&tb_is)[N_BLOCKS], idx_t (&tb_js)[N_BLOCKS] #ifndef NO_TRACEBACK , tbr_t (&tb_streams)[MAX_REFERENCE_LENGTH + MAX_QUERY_LENGTH][N_BLOCKS] @@ -64,6 +67,10 @@ extern "C" #ifdef SCORED type_t scores_b[N_BLOCKS]; #endif +#ifdef LOCAL_TRANSITION_MATRIX + type_t transitions_block[N_BLOCKS][TRANSITION_MATRIX_SIZE][TRANSITION_MATRIX_SIZE]; +#endif + // Attempted to use URAM but it didn't work. // #pragma HLS bind_storage variable = tb_streams_b type = fifo impl = uram // #pragma HLS bind_storage variable = querys_b type = ram_1p impl = uram @@ -85,6 +92,11 @@ extern "C" #pragma HLS array_partition variable = scores_b type = complete dim = 1 #endif +#ifdef LOCAL_TRANSITION_MATRIX +#pragma HLS array_partition variable = transitions_block type = complete dim = 1 +#endif + + // F1 doesn't support axis on the top level. But for other FPGA it might be more optimized. 
// #pragma HLS interface mode = axis port = querys_b // #pragma HLS interface mode = axis port = references_b @@ -133,6 +145,21 @@ extern "C" penalties_b[i] = penalties[i]; } +#ifdef LOCAL_TRANSITION_MATRIX + // broadcast the local transitions to each block + for (idx_t i = 0; i < TRANSITION_MATRIX_SIZE; i++) + { + for (idx_t j = 0; j < TRANSITION_MATRIX_SIZE; j++) + { + for (idx_t k = 0; k < N_BLOCKS; k++) + { +#pragma HLS unroll + transitions_block[k][i][j] = transitions[i][j]; + } + } + } +#endif + for (int i = 0; i < N_BLOCKS; i++) { #pragma HLS unroll @@ -145,6 +172,9 @@ extern "C" query_lengths_b[i], reference_lengths_b[i], penalties_b[i], +#ifdef LOCAL_TRANSITION_MATRIX + transitions_block[i], +#endif tb_is_b[i], tb_js_b[i] #ifndef NO_TRACEBACK , tb_streams_b[i] diff --git a/testbench/test_csim_banding_local_affine_scored.cpp b/testbench/test_csim_banding_local_affine_scored.cpp index 18e78669..14dda172 100644 --- a/testbench/test_csim_banding_local_affine_scored.cpp +++ b/testbench/test_csim_banding_local_affine_scored.cpp @@ -13,8 +13,8 @@ using namespace std; -#define INPUT_QUERY_LENGTH 128 -#define INPUT_REFERENCE_LENGTH 128 +#define INPUT_QUERY_LENGTH 256 +#define INPUT_REFERENCE_LENGTH 256 char_t base_to_num(char base) { diff --git a/testbench/test_csim_profile_alignment.cpp b/testbench/test_csim_profile_alignment.cpp index 13efb140..6a8cc8f1 100644 --- a/testbench/test_csim_profile_alignment.cpp +++ b/testbench/test_csim_profile_alignment.cpp @@ -12,8 +12,8 @@ using json = nlohmann::json; // Define length of actual alignment sequences -#define MAX_QUERY -#define MAX_REFERENCE +#define INPUT_QUERY_LENGTH 256 +#define INPUT_REFERENCE_LENGTH 256 // Define a navigation to character mapping that is used to print the traceback in terms of navigations U, L, M and etc. 
std::string navigation_to_char(tbp_t nav) @@ -128,7 +128,7 @@ int main() reference_n_h = HostUtils::Sequence::MultipleSequencesToProfileAlign(reference_c_h, reference_c_h[0].length()); // prepare very first reference sequence - for (int seq_id = 1; seq_id < 1; seq_id++) // in single pairwise setting, run only one pair wise profile alignment. + for (int seq_id = 1; seq_id < 2; seq_id++) // in single pairwise setting, run only one pair wise profile alignment. { // print something to terminal output_file << "Aligning " << species_names[seq_id - 1] << std::endl; @@ -175,8 +175,8 @@ int main() // Set the lengths of the sequences for (int i = 0; i < N_BLOCKS; i++) { - qry_len_d[i] = query_c_h[0].size(); - ref_len_d[i] = reference_c_h[0].size(); + qry_len_d[i] = query_c_h[0].size() < INPUT_QUERY_LENGTH ? query_c_h[0].size() : INPUT_QUERY_LENGTH; + ref_len_d[i] = reference_c_h[0].size() < INPUT_REFERENCE_LENGTH ? reference_c_h[0].size() : INPUT_REFERENCE_LENGTH; } output_file << "Data prepared" << std::endl; diff --git a/testbench/test_csim_protein_local_affine.cpp b/testbench/test_csim_protein_local_affine.cpp index 0300674c..40c013bf 100644 --- a/testbench/test_csim_protein_local_affine.cpp +++ b/testbench/test_csim_protein_local_affine.cpp @@ -25,12 +25,39 @@ struct Penalties_sol { int main(){ - char alphabet[4] = {'A', 'T', 'G', 'C'}; std::vector all_sequences = HostUtils::IO::readFasta("/home/centos/workspace/DP-HLS/data/uniprot_sprot.fasta"); + // BLOSUM62 transition matrix source: ftp://ftp.ncbi.nih.gov/blast/matrices/BLOSUM62 + // FIXME: ChatGPT OCR Converted this, I'm not sure it's fully correct. 
+ const type_t transitions[20][20] = { + { 4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0}, + {-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3}, + {-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3}, + {-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3}, + { 0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1}, + {-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2}, + {-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2}, + { 0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3}, + {-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3}, + {-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3}, + {-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1}, + {-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2}, + {-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1}, + {-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1}, + {-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2}, + { 1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2}, + { 0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0}, + {-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3}, + {-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1}, + { 0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4}, + }; + + string query_string = all_sequences[0]; string reference_string = all_sequences[1]; + type_t scores[N_BLOCKS]; + query_string = query_string.length() < INPUT_QUERY_LENGTH ? query_string : query_string.substr(0, INPUT_QUERY_LENGTH); reference_string = reference_string.length() < INPUT_REFERENCE_LENGTH ? 
reference_string : reference_string.substr(0, INPUT_REFERENCE_LENGTH); @@ -39,18 +66,16 @@ int main(){ for (int i = 0; i < N_BLOCKS; i++){ penalties[i].extend = -0.5; penalties[i].open = -10; - penalties[i].match = 7; - penalties[i].mismatch = -2.5; } - // Struct for penalties in solution - Penalties_sol penalties_sol[N_BLOCKS]; - for (Penalties_sol &penalty : penalties_sol) { - penalty.extend = -0.5; - penalty.open = -10; - penalty.match = 7; - penalty.mismatch = -2.5; - } + // // Struct for penalties in solution + // Penalties_sol penalties_sol[N_BLOCKS]; + // for (Penalties_sol &penalty : penalties_sol) { + // penalty.extend = -0.5; + // penalty.open = -10; + // penalty.match = 7; + // penalty.mismatch = -2.5; + // } // Reference and Query Strings std::vector query(query_string.begin(), query_string.end()); @@ -99,6 +124,24 @@ int main(){ } } + // for debugging purposes, print the query and reference in the device buffer in integer form + for (int b = 0; b < N_BLOCKS; b++) + { + cout << "Query Buffer Block " << b << endl; + for (int i = 0; i < query.size(); i++) + { + cout << query_buff[i][b].to_int() << " "; + } + cout << endl; + cout << "Reference Buffer Block " << b << endl; + for (int i = 0; i < reference.size(); i++) + { + cout << reference_buff[i][b].to_int() << " "; + } + cout << endl; + } + + // Fill the lengths of the query and reference for (int b = 0; b < N_BLOCKS; b++) { @@ -119,8 +162,12 @@ int main(){ qry_lengths, ref_lengths, penalties, + transitions, tb_is_d, tb_js_d, tb_streams +#ifdef SCORED + , scores +#endif #ifdef CMAKEDEBUG , debuggers #endif @@ -136,7 +183,7 @@ int main(){ array, MAX_QUERY_LENGTH>, N_LAYERS> sol_score_mat; array, MAX_QUERY_LENGTH> sol_tb_mat; map alignments; - local_affine_solution(query_string, reference_string, penalties_sol[0], sol_score_mat, sol_tb_mat, alignments); + // local_affine_solution(query_string, reference_string, penalties_sol[0], sol_score_mat, sol_tb_mat, alignments); // print_matrix(sol_score_mat[0], 
"Solution Score Matrix Layer 0"); // print_matrix(sol_tb_mat, "Solution Traceback Matrix"); cout << "Solution Aligned Query : " << alignments["query"] << endl; @@ -174,7 +221,8 @@ int main(){ cout << "Kernel " << i << " Traceback, Start Row: " << tb_is_h[i] << ", Start Column: " << tb_js_h[i] << endl; cout << "Kernel Aligned Query : " << kernel_alignments[0]["query"] << endl; cout << "Kernel Aligned Reference: " << kernel_alignments[0]["reference"] << endl; +#ifdef SCORED + cout << "Kernel " << i << " Score: " << scores[i] << endl; +#endif } - - } \ No newline at end of file diff --git a/testbench/test_csim_sdtw.cpp b/testbench/test_csim_sdtw.cpp index 5513689b..14e66cba 100644 --- a/testbench/test_csim_sdtw.cpp +++ b/testbench/test_csim_sdtw.cpp @@ -16,8 +16,9 @@ using namespace std; +// the dataset they prepared for artifact evaluation uses 8-bit unsigned integers #define INPUT_QUERY_LENGTH 500 -#define INPUT_REFERENCE_LENGTH 900 +#define INPUT_REFERENCE_LENGTH 59800 // this is the actual reference length in the dataset char_t base_to_num(char base) {