Skip to content

Commit

Permalink
igpu transfer (#46)
Browse files Browse the repository at this point in the history
Co-authored-by: hyojongk <[email protected]>
  • Loading branch information
hyesoon and hyojongk authored Nov 13, 2020
1 parent ad51942 commit a74efdb
Show file tree
Hide file tree
Showing 29 changed files with 217 additions and 164 deletions.
2 changes: 1 addition & 1 deletion def/general.stat.def
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ DEF_STAT(EXE_TIME, COUNT, NO_RATIO)
DEF_STAT(NUM_REPEAT, COUNT, NO_RATIO)

DEF_STAT(CYC_COUNT_X86, COUNT, NO_RATIO)
DEF_STAT(CYC_COUNT_PTX, COUNT, NO_RATIO)
DEF_STAT(CYC_COUNT_ACC, COUNT, NO_RATIO)

DEF_STAT(AVG_BLOCK_EXE_CYCLE, COUNT, NO_RATIO)
DEF_STAT(AVG_BLOCK_EXE_CYCLE_BASE, COUNT, NO_RATIO)
Expand Down
19 changes: 14 additions & 5 deletions macsimComponent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,16 @@ macsimComponent::macsimComponent(ComponentId_t id, Params& params)
m_clock_freq,
new Clock::Handler<macsimComponent>(this, &macsimComponent::ticReceived));

m_ptx_core = params.find<bool>("ptx_core", 0);
if (params.find<bool>("ptx_core", 0)) {
m_acc_type = PTX_CORE;
m_acc_core = 1;
} else if (params.find<bool>("igpu_core", 0)) {
m_acc_type = IGPU_CORE;
m_acc_core = 1;
} else {
m_acc_core = 0;
m_acc_type = NO_ACC;
}
m_num_link = params.find<uint32_t>("num_link", 1);
configureLinks(params, tc);

Expand Down Expand Up @@ -150,7 +159,7 @@ void macsimComponent::configureLinks(SST::Params& params, TimeConverter* tc) {
m_data_cache_requests.push_back(std::map<uint64_t, uint64_t>());
m_data_cache_responses.push_back(std::set<uint64_t>());

if (m_ptx_core) {
if (m_acc_core) {
auto ccache_link = loadUserSubComponent<Interfaces::SimpleMem>(
"core" + std::to_string(l) + "-ccache", ComponentInfo::SHARE_NONE, tc,
new Interfaces::SimpleMem::Handler<macsimComponent>(
Expand Down Expand Up @@ -194,7 +203,7 @@ void macsimComponent::configureLinks(SST::Params& params, TimeConverter* tc) {
m_data_cache_request_counters = std::vector<uint64_t>(m_num_link, 0);
m_data_cache_response_counters = std::vector<uint64_t>(m_num_link, 0);

if (m_ptx_core) {
if (m_acc_core) {
m_const_cache_request_counters = std::vector<uint64_t>(m_num_link, 0);
m_const_cache_response_counters = std::vector<uint64_t>(m_num_link, 0);
m_texture_cache_request_counters = std::vector<uint64_t>(m_num_link, 0);
Expand Down Expand Up @@ -275,7 +284,7 @@ void macsimComponent::setup() {
new Callback<macsimComponent, bool, int, uint64_t>(
this, &macsimComponent::strobeDataCacheRespQ);

if (m_ptx_core) {
if (m_acc_core) {
CallbackSendConstCacheRequest* scr =
new Callback<macsimComponent, void, int, uint64_t, uint64_t, int>(
this, &macsimComponent::sendConstCacheRequest);
Expand Down Expand Up @@ -347,7 +356,7 @@ bool macsimComponent::ticReceived(Cycle_t) {
// Debugging
if (m_cycle % 100000 == 0) {
for (unsigned int l = 0; l < m_num_link; ++l) {
if (m_ptx_core) {
if (m_acc_core) {
MSC_DEBUG(
"Core[%2d] I$: (%lu, %lu), D$: (%lu, %lu) C$: (%lu, %lu), T$: (%lu, "
"%lu)\n",
Expand Down
3 changes: 2 additions & 1 deletion macsimComponent.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,8 @@ class macsimComponent : public SST::Component

macsim_c *m_macsim;
bool m_sim_running;
bool m_ptx_core;
bool m_acc_core;
ACC_Type m_acc_type;
bool m_cube_connected;
bool m_debug_all;
int64_t m_debug_addr;
Expand Down
121 changes: 74 additions & 47 deletions src/config.h

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions src/dram_ctrl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -578,9 +578,9 @@ void dram_ctrl_c::send(void) {
for (auto I = m_output_buffer->begin(), E = m_output_buffer->end(); I != E;
++I) {
mem_req_s* req = (*I);
if (req_type_allowed[req->m_ptx] == false) continue;
if (req_type_allowed[req->m_acc] == false) continue;

req_type_checked[req->m_ptx] = true;
req_type_checked[req->m_acc] = true;
req->m_msg_type = NOC_FILL;

bool insert_packet =
Expand Down Expand Up @@ -764,7 +764,7 @@ void dram_ctrl_c::channel_schedule_data(void) {
m_current_list[bank]->m_req->m_id);
ASSERT(m_current_list[bank]->m_state == DRAM_DATA);
m_data_ready[bank] = acquire_data_bus(
ii, m_current_list[bank]->m_size, m_current_list[bank]->m_req->m_ptx);
ii, m_current_list[bank]->m_size, m_current_list[bank]->m_req->m_acc);
m_data_avail[bank] = ULLONG_MAX;
m_current_list[bank]->m_state = DRAM_DATA_WAIT;
} else
Expand Down
10 changes: 5 additions & 5 deletions src/exec.cc
Original file line number Diff line number Diff line change
Expand Up @@ -538,7 +538,7 @@ bool exec_c::exec(int thread_id, int entry, uop_c* uop) {
use_port(thread_id, entry);

// GPU : if we use load-block policy, block current thread due to load instruction
if (uop_latency == -1 && m_ptx_sim &&
if (uop_latency == -1 && m_acc_sim &&
*m_simBase->m_knobs->KNOB_FETCH_ONLY_LOAD_READY) {
m_frontend->set_load_wait(uop->m_thread_id, uop->m_uop_num);

Expand Down Expand Up @@ -741,7 +741,7 @@ void exec_c::br_exec(uop_c* uop) {
}

// GPU : stall on branch policy
if (m_ptx_sim && *m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR) {
if (m_acc_sim && *m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR) {
m_frontend->set_br_ready(uop->m_thread_id);
}
}
Expand Down Expand Up @@ -793,7 +793,7 @@ void exec_c::run_a_cycle(void) {
if (responseArrived) {
DEBUG_CORE(m_core_id, "key found: 0x%lx, addr = 0x%llx\n", key,
uop->m_vaddr);
if (m_ptx_sim || m_igpu_sim) {
if (m_acc_sim || m_igpu_sim) {
if (uop->m_parent_uop) {
uop_c* puop = uop->m_parent_uop;
++puop->m_num_child_uops_done;
Expand Down Expand Up @@ -883,7 +883,7 @@ int exec_c::access_data_cache(uop_c* uop) {
auto i = m_uop_buffer.find(key);
ASSERTM(m_uop_buffer.end() == i, "uop has already been executed!\n");

int block_size = m_ptx_sim ? KNOB(KNOB_L1_SMALL_LINE_SIZE)->getValue()
int block_size = m_acc_sim ? KNOB(KNOB_L1_SMALL_LINE_SIZE)->getValue()
: KNOB(KNOB_L1_LARGE_LINE_SIZE)->getValue();
// Addr block_addr = uop->m_vaddr & ~((uint64_t)block_size-1);

Expand Down Expand Up @@ -936,7 +936,7 @@ int exec_c::access_data_cache(uop_c* uop) {
}

int exec_c::access_const_texture_cache(uop_c* uop) {
ASSERT(m_ptx_sim);
ASSERT(m_acc_sim);
ASSERT(uop->m_mem_type == MEM_LD_CM || uop->m_mem_type == MEM_LD_TM);

// assign unique key to each memory request; this will be used later in time for strobing
Expand Down
3 changes: 2 additions & 1 deletion src/exec.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,8 +184,9 @@ class exec_c
uns16 m_mem_sched_rate; /**< memory schedule rate */
uns16 m_fp_sched_rate; /**< fp schedule rate */
uns8 m_dcache_cycles; /**< L1 cache latency */
bool m_ptx_sim; /**< gpu simulation */
bool m_acc_sim; /**< accelerator simulation */
bool m_igpu_sim; /**< intel gpu simulation */
bool m_ptx_sim; /**< PTX simulation */
int m_latency[NUM_UOP_TYPES]; /**< latency map */
Counter m_cur_core_cycle; /**< current core cycle */
int m_max_port[max_ALLOCQ]; /**< maximum port */
Expand Down
40 changes: 22 additions & 18 deletions src/frontend.cc
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,8 @@ void frontend_c::run_a_cycle(void) {
// fetch every KNOB_FETCH_RATIO cycle
// CPU : every cycle
// NVIDIA G80 : 1/4 cycles, NVIDIA Fermi: 1/2 cycles
// check core type for the fetch rate
// Hyesoon (Aug 2020): please check whether this needs to be changed for hetero and igpu
if (m_fetch_ratio != 1) {
m_fetch_modulo++;
if (m_fetch_modulo == m_fetch_ratio)
Expand Down Expand Up @@ -300,7 +302,7 @@ void frontend_c::run_a_cycle(void) {

// TONAGESH
// nagesh - comments for BAR are incomplete...
if (m_knob_ptx_sim) {
if (m_ptx_sim) {
// handling of BAR instruction in PTX - can/should this be moved?
// do we have any blocks for which all warps have reached (retired)
// their next barrier?
Expand Down Expand Up @@ -346,7 +348,7 @@ FRONTEND_MODE frontend_c::process_ifetch(unsigned int tid,

// First time : set up traces for current thread
if (fetch_data->m_first_time) {
m_simBase->m_trace_reader->setup_trace(m_core_id, tid, m_knob_ptx_sim);
m_simBase->m_trace_reader->setup_trace(m_core_id, tid, m_ptx_sim);
fetch_data->m_first_time = false;

++m_core->m_inst_fetched[tid]; /*! initial increase */
Expand All @@ -356,11 +358,18 @@ FRONTEND_MODE frontend_c::process_ifetch(unsigned int tid,

// set up initial fetch address
thread_s *thread = m_core->get_trace_info(tid);
if (thread->m_ptx) {
trace_info_gpu_s *prev_trace_info =
static_cast<trace_info_gpu_s *>(thread->m_prev_trace_info);
fetch_data->m_MT_scheduler.m_next_fetch_addr =
prev_trace_info->m_inst_addr;
if (thread->m_acc) {
if (m_ptx_sim) {
trace_info_gpu_s *prev_trace_info =
static_cast<trace_info_gpu_s *>(thread->m_prev_trace_info);
fetch_data->m_MT_scheduler.m_next_fetch_addr =
prev_trace_info->m_inst_addr;
} else if (m_igpu_sim) {
trace_info_igpu_s *prev_trace_info =
static_cast<trace_info_igpu_s *>(thread->m_prev_trace_info);
fetch_data->m_MT_scheduler.m_next_fetch_addr =
prev_trace_info->m_instruction_addr;
}
} else {
if (KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "x86") {
trace_info_cpu_s *prev_trace_info =
Expand All @@ -372,11 +381,6 @@ FRONTEND_MODE frontend_c::process_ifetch(unsigned int tid,
static_cast<trace_info_a64_s *>(thread->m_prev_trace_info);
fetch_data->m_MT_scheduler.m_next_fetch_addr =
prev_trace_info->m_instruction_addr;
} else if (KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "igpu") {
trace_info_igpu_s *prev_trace_info =
static_cast<trace_info_igpu_s *>(thread->m_prev_trace_info);
fetch_data->m_MT_scheduler.m_next_fetch_addr =
prev_trace_info->m_instruction_addr;
} else {
ASSERTM(0, "Wrong core type %s\n",
KNOB(KNOB_LARGE_CORE_TYPE)->getValue().c_str());
Expand Down Expand Up @@ -457,8 +461,8 @@ FRONTEND_MODE frontend_c::process_ifetch(unsigned int tid,
ASSERT(new_uop);

// read an uop from the traces
if (!m_simBase->m_trace_reader->get_uops_from_traces(
m_core_id, new_uop, tid, m_knob_ptx_sim)) {
if (!m_simBase->m_trace_reader->get_uops_from_traces(m_core_id, new_uop,
tid, m_ptx_sim)) {
// couldn't get an uop
DEBUG_CORE(m_core_id, "not success\n");
m_uop_pool->release_entry(new_uop->free());
Expand Down Expand Up @@ -631,7 +635,7 @@ bool frontend_c::access_icache(int tid, Addr fetch_addr,
int result = m_simBase->m_memory->new_mem_req(
MRT_IFETCH, line_addr, m_knob_icache_line_size, false, false, 0, NULL,
icache_fill_line_wrapper, m_core->get_unique_uop_num(), NULL, m_core_id,
tid, m_knob_ptx_sim);
tid, m_ptx_sim);

// mshr full
if (!result) return false;
Expand Down Expand Up @@ -712,7 +716,7 @@ bool frontend_c::icache_fill_line(mem_req_s *req) {
if (m_icache->access_cache(req->m_addr, &line_addr, false, req->m_appl_id) ==
NULL) {
m_icache->insert_cache(req->m_addr, &line_addr, &repl_line_addr,
req->m_appl_id, req->m_ptx);
req->m_appl_id, req->m_acc);
POWER_CORE_EVENT(req->m_core_id, POWER_ICACHE_W);
}

Expand Down Expand Up @@ -806,7 +810,7 @@ int frontend_c::predict_bpu(uop_c *uop) {
// no branch prediction
else {
// GPU : stall on branch policy, stop fetching
if (m_knob_ptx_sim && *m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR) {
if (m_ptx_sim && *m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR) {
set_br_wait(uop->m_thread_id);
mispredicted = false;
}
Expand Down Expand Up @@ -906,7 +910,7 @@ int frontend_c::fetch_rr(void) {
}

// check the thread is ready to fetch
if (m_knob_ptx_sim) {
if (m_ptx_sim) {
// GPU : stall on branch policy, check whether previous branch has been resolved
if (*m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR &&
!check_br_ready(fetch_id)) {
Expand Down
4 changes: 3 additions & 1 deletion src/frontend.h
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,9 @@ class frontend_c
uns m_knob_icache_line_size; /**< icache line size */
bool m_fe_stall; /**< frontend stalled */
bool m_fe_running; /**< enabled frontend */
bool m_knob_ptx_sim; /**< GPU simulation */
bool m_ptx_sim; /**< PTX simulation */
bool m_igpu_sim; /**< iGPU simulation */
bool m_acc_sim; /**< Accelerator simulation */
bool m_ready_thread_available; /**< ready thread available */
bool m_last_fetch_tid_failed;
core_c* m_core; /**< core pointer */
Expand Down
5 changes: 5 additions & 0 deletions src/global_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,4 +80,9 @@ typedef enum uop_latency_map { // enum for x86 latency maps - Michael
NUM_LATENCY_MAPS
} latency_map;

/**
 * Kind of accelerator core. Enumerator values are consecutive, starting
 * at 0 for the "no accelerator" case.
 */
typedef enum _ACC_Type_enum {
  NO_ACC = 0,   /**< no accelerator present */
  PTX_CORE = 1, /**< PTX core */
  IGPU_CORE = 2 /**< IGPU core */
} ACC_Type;
#endif
6 changes: 3 additions & 3 deletions src/macsim.cc
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,7 @@ void macsim_c::init_cores(int num_max_core) {

// insert to the core type pool
if (static_cast<string>(*m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "ptx")
m_ptx_core_pool.push(ii);
m_acc_core_pool.push(ii);
else
m_x86_core_pool.push(ii);
}
Expand All @@ -352,7 +352,7 @@ void macsim_c::init_cores(int num_max_core) {
// insert to the core type pool
if (static_cast<string>(*m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) ==
"ptx")
m_ptx_core_pool.push(ii + total_core);
m_acc_core_pool.push(ii + total_core);
else
m_x86_core_pool.push(ii + total_core);
}
Expand All @@ -367,7 +367,7 @@ void macsim_c::init_cores(int num_max_core) {

// insert to the core type pool
if (static_cast<string>(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "ptx")
m_ptx_core_pool.push(ii + total_core);
m_acc_core_pool.push(ii + total_core);
else
m_x86_core_pool.push(ii + total_core);
}
Expand Down
2 changes: 1 addition & 1 deletion src/macsim.h
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ class macsim_c
// process manager
process_manager_c *m_process_manager; /**< process manager */
queue<int> m_x86_core_pool; /**< x86 cores pool */
queue<int> m_ptx_core_pool; /**< GPU cores pool */
queue<int> m_acc_core_pool; /**< GPU cores pool */
multi_key_map_c *m_block_id_mapper; /**< block id mapper */

// data structure pools (to reduce overhead of memory allocation)
Expand Down
13 changes: 6 additions & 7 deletions src/manifold/models/iris/iris_srcs/components/simpleRouter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -249,13 +249,12 @@ SimpleRouter::handle_link_arrival( int port, LinkData* data )
}

//track flit_id of tail flit
cout << manifold::kernel::Manifold::NowTicks() << ",p,"
<< ((HeadFlit*)data->f)->req->m_id << ","
<< ((HeadFlit*)data->f)->req->m_ptx << ","
<< mem_state_copy[((HeadFlit*)data->f)->req->m_state] << ","
<< mem_req_noc_type_name[((HeadFlit*)data->f)->req->m_msg_type] << ","
<< node_id << "," << ((HeadFlit*)data->f)->dst_node << ","
<< endl;
cout << manifold::kernel::Manifold::NowTicks() << ",p,"
<< ((HeadFlit*)data->f)->req->m_id << ","
<< ((HeadFlit*)data->f)->req->m_acc << ","
<< mem_state_copy[((HeadFlit*)data->f)->req->m_state] << ","
<< mem_req_noc_type_name[((HeadFlit*)data->f)->req->m_msg_type] << ","
<< node_id << "," << ((HeadFlit*)data->f)->dst_node << "," << endl;
/*
cout << manifold::kernel::Manifold::NowTicks() << ",b," << node_id << ",";
for(uint i=0; i<ports; i++)
Expand Down
Loading

0 comments on commit a74efdb

Please sign in to comment.