igpu transfer (#46)

Co-authored-by: hyojongk <[email protected]>
gthparch · Nov 13, 2020 · a74efdb · a74efdb
1 parent ad51942
commit a74efdb
Show file tree

Hide file tree

Showing 29 changed files with 217 additions and 164 deletions.
diff --git a/def/general.stat.def b/def/general.stat.def
@@ -38,7 +38,7 @@ DEF_STAT(EXE_TIME, COUNT, NO_RATIO)
 DEF_STAT(NUM_REPEAT, COUNT, NO_RATIO)
 
 DEF_STAT(CYC_COUNT_X86, COUNT, NO_RATIO)
-DEF_STAT(CYC_COUNT_PTX, COUNT, NO_RATIO)
+DEF_STAT(CYC_COUNT_ACC, COUNT, NO_RATIO)
 
 DEF_STAT(AVG_BLOCK_EXE_CYCLE, COUNT, NO_RATIO)
 DEF_STAT(AVG_BLOCK_EXE_CYCLE_BASE, COUNT, NO_RATIO)

diff --git a/macsimComponent.cpp b/macsimComponent.cpp
@@ -60,7 +60,16 @@ macsimComponent::macsimComponent(ComponentId_t id, Params& params)
     m_clock_freq,
     new Clock::Handler<macsimComponent>(this, &macsimComponent::ticReceived));
 
-  m_ptx_core = params.find<bool>("ptx_core", 0);
+  if (params.find<bool>("ptx_core", 0)) {
+    m_acc_type = PTX_CORE;
+    m_acc_core = 1;
+  } else if (params.find<bool>("igpu_core", 0)) {
+    m_acc_type = IGPU_CORE;
+    m_acc_core = 1;
+  } else {
+    m_acc_core = 0;
+    m_acc_type = NO_ACC;
+  }
   m_num_link = params.find<uint32_t>("num_link", 1);
   configureLinks(params, tc);
 
@@ -150,7 +159,7 @@ void macsimComponent::configureLinks(SST::Params& params, TimeConverter* tc) {
     m_data_cache_requests.push_back(std::map<uint64_t, uint64_t>());
     m_data_cache_responses.push_back(std::set<uint64_t>());
 
-    if (m_ptx_core) {
+    if (m_acc_core) {
       auto ccache_link = loadUserSubComponent<Interfaces::SimpleMem>(
         "core" + std::to_string(l) + "-ccache", ComponentInfo::SHARE_NONE, tc,
         new Interfaces::SimpleMem::Handler<macsimComponent>(
@@ -194,7 +203,7 @@ void macsimComponent::configureLinks(SST::Params& params, TimeConverter* tc) {
   m_data_cache_request_counters = std::vector<uint64_t>(m_num_link, 0);
   m_data_cache_response_counters = std::vector<uint64_t>(m_num_link, 0);
 
-  if (m_ptx_core) {
+  if (m_acc_core) {
     m_const_cache_request_counters = std::vector<uint64_t>(m_num_link, 0);
     m_const_cache_response_counters = std::vector<uint64_t>(m_num_link, 0);
     m_texture_cache_request_counters = std::vector<uint64_t>(m_num_link, 0);
@@ -275,7 +284,7 @@ void macsimComponent::setup() {
     new Callback<macsimComponent, bool, int, uint64_t>(
       this, &macsimComponent::strobeDataCacheRespQ);
 
-  if (m_ptx_core) {
+  if (m_acc_core) {
     CallbackSendConstCacheRequest* scr =
       new Callback<macsimComponent, void, int, uint64_t, uint64_t, int>(
         this, &macsimComponent::sendConstCacheRequest);
@@ -347,7 +356,7 @@ bool macsimComponent::ticReceived(Cycle_t) {
   // Debugging
   if (m_cycle % 100000 == 0) {
     for (unsigned int l = 0; l < m_num_link; ++l) {
-      if (m_ptx_core) {
+      if (m_acc_core) {
         MSC_DEBUG(
           "Core[%2d] I$: (%lu, %lu), D$: (%lu, %lu) C$: (%lu, %lu), T$: (%lu, "
           "%lu)\n",

diff --git a/macsimComponent.h b/macsimComponent.h
@@ -105,7 +105,8 @@ class macsimComponent : public SST::Component
 
   macsim_c *m_macsim;
   bool m_sim_running;
-  bool m_ptx_core;
+  bool m_acc_core;
+  ACC_Type m_acc_type;
   bool m_cube_connected;
   bool m_debug_all;
   int64_t m_debug_addr;

diff --git a/src/config.h b/src/config.h
diff --git a/src/dram_ctrl.cc b/src/dram_ctrl.cc
@@ -578,9 +578,9 @@ void dram_ctrl_c::send(void) {
     for (auto I = m_output_buffer->begin(), E = m_output_buffer->end(); I != E;
          ++I) {
       mem_req_s* req = (*I);
-      if (req_type_allowed[req->m_ptx] == false) continue;
+      if (req_type_allowed[req->m_acc] == false) continue;
 
-      req_type_checked[req->m_ptx] = true;
+      req_type_checked[req->m_acc] = true;
       req->m_msg_type = NOC_FILL;
 
       bool insert_packet =
@@ -764,7 +764,7 @@ void dram_ctrl_c::channel_schedule_data(void) {
               m_current_list[bank]->m_req->m_id);
         ASSERT(m_current_list[bank]->m_state == DRAM_DATA);
         m_data_ready[bank] = acquire_data_bus(
-          ii, m_current_list[bank]->m_size, m_current_list[bank]->m_req->m_ptx);
+          ii, m_current_list[bank]->m_size, m_current_list[bank]->m_req->m_acc);
         m_data_avail[bank] = ULLONG_MAX;
         m_current_list[bank]->m_state = DRAM_DATA_WAIT;
       } else

diff --git a/src/exec.cc b/src/exec.cc
@@ -538,7 +538,7 @@ bool exec_c::exec(int thread_id, int entry, uop_c* uop) {
       use_port(thread_id, entry);
 
       // GPU : if we use load-block policy, block current thread due to load instruction
-      if (uop_latency == -1 && m_ptx_sim &&
+      if (uop_latency == -1 && m_acc_sim &&
           *m_simBase->m_knobs->KNOB_FETCH_ONLY_LOAD_READY) {
         m_frontend->set_load_wait(uop->m_thread_id, uop->m_uop_num);
 
@@ -741,7 +741,7 @@ void exec_c::br_exec(uop_c* uop) {
   }
 
   // GPU : stall on branch policy
-  if (m_ptx_sim && *m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR) {
+  if (m_acc_sim && *m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR) {
     m_frontend->set_br_ready(uop->m_thread_id);
   }
 }
@@ -793,7 +793,7 @@ void exec_c::run_a_cycle(void) {
     if (responseArrived) {
       DEBUG_CORE(m_core_id, "key found: 0x%lx, addr = 0x%llx\n", key,
                  uop->m_vaddr);
-      if (m_ptx_sim || m_igpu_sim) {
+      if (m_acc_sim || m_igpu_sim) {
         if (uop->m_parent_uop) {
           uop_c* puop = uop->m_parent_uop;
           ++puop->m_num_child_uops_done;
@@ -883,7 +883,7 @@ int exec_c::access_data_cache(uop_c* uop) {
   auto i = m_uop_buffer.find(key);
   ASSERTM(m_uop_buffer.end() == i, "uop has already been executed!\n");
 
-  int block_size = m_ptx_sim ? KNOB(KNOB_L1_SMALL_LINE_SIZE)->getValue()
+  int block_size = m_acc_sim ? KNOB(KNOB_L1_SMALL_LINE_SIZE)->getValue()
                              : KNOB(KNOB_L1_LARGE_LINE_SIZE)->getValue();
   // Addr block_addr = uop->m_vaddr & ~((uint64_t)block_size-1);
 
@@ -936,7 +936,7 @@ int exec_c::access_data_cache(uop_c* uop) {
 }
 
 int exec_c::access_const_texture_cache(uop_c* uop) {
-  ASSERT(m_ptx_sim);
+  ASSERT(m_acc_sim);
   ASSERT(uop->m_mem_type == MEM_LD_CM || uop->m_mem_type == MEM_LD_TM);
 
   // assign unique key to each memory request; this will be used later in time for strobbing

diff --git a/src/exec.h b/src/exec.h
@@ -184,8 +184,9 @@ class exec_c
   uns16 m_mem_sched_rate; /**< memory schedule rate */
   uns16 m_fp_sched_rate; /**< fp schedule rate */
   uns8 m_dcache_cycles; /**< L1 cache latency */
-  bool m_ptx_sim; /**< gpu simulation */
+  bool m_acc_sim; /**< gpu simulation */
   bool m_igpu_sim; /**< intel gpu simulation */
+  bool m_ptx_sim; /**< PTX simulation */
   int m_latency[NUM_UOP_TYPES]; /**< latency map */
   Counter m_cur_core_cycle; /**< current core cycle */
   int m_max_port[max_ALLOCQ]; /**< maximum port */

diff --git a/src/frontend.cc b/src/frontend.cc
@@ -191,6 +191,8 @@ void frontend_c::run_a_cycle(void) {
   // fetch every KNOB_FETCH_RATIO cycle
   // CPU : every cycle
   // NVIDIA G80 : 1/4 cycles, NVIDIA Fermi: 1/2 cycles
+  // check core type for the fetch rate
+  // Hyesoon: Aug-2020 please check whether this needs to be changed with hetero and igpu
   if (m_fetch_ratio != 1) {
     m_fetch_modulo++;
     if (m_fetch_modulo == m_fetch_ratio)
@@ -300,7 +302,7 @@ void frontend_c::run_a_cycle(void) {
 
   // TONAGESH
   // nagesh - comments for BAR are incomplete...
-  if (m_knob_ptx_sim) {
+  if (m_ptx_sim) {
     // handling of BAR instruction in PTX - can/should this be moved?
     // do we have any blocks for which all warps have reached (retired)
     // their next barrier?
@@ -346,7 +348,7 @@ FRONTEND_MODE frontend_c::process_ifetch(unsigned int tid,
 
   // First time : set up traces for current thread
   if (fetch_data->m_first_time) {
-    m_simBase->m_trace_reader->setup_trace(m_core_id, tid, m_knob_ptx_sim);
+    m_simBase->m_trace_reader->setup_trace(m_core_id, tid, m_ptx_sim);
     fetch_data->m_first_time = false;
 
     ++m_core->m_inst_fetched[tid]; /*! initial increase */
@@ -356,11 +358,18 @@ FRONTEND_MODE frontend_c::process_ifetch(unsigned int tid,
 
     // set up initial fetch address
     thread_s *thread = m_core->get_trace_info(tid);
-    if (thread->m_ptx) {
-      trace_info_gpu_s *prev_trace_info =
-        static_cast<trace_info_gpu_s *>(thread->m_prev_trace_info);
-      fetch_data->m_MT_scheduler.m_next_fetch_addr =
-        prev_trace_info->m_inst_addr;
+    if (thread->m_acc) {
+      if (m_ptx_sim) {
+        trace_info_gpu_s *prev_trace_info =
+          static_cast<trace_info_gpu_s *>(thread->m_prev_trace_info);
+        fetch_data->m_MT_scheduler.m_next_fetch_addr =
+          prev_trace_info->m_inst_addr;
+      } else if (m_igpu_sim) {
+        trace_info_igpu_s *prev_trace_info =
+          static_cast<trace_info_igpu_s *>(thread->m_prev_trace_info);
+        fetch_data->m_MT_scheduler.m_next_fetch_addr =
+          prev_trace_info->m_instruction_addr;
+      }
     } else {
       if (KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "x86") {
         trace_info_cpu_s *prev_trace_info =
@@ -372,11 +381,6 @@ FRONTEND_MODE frontend_c::process_ifetch(unsigned int tid,
           static_cast<trace_info_a64_s *>(thread->m_prev_trace_info);
         fetch_data->m_MT_scheduler.m_next_fetch_addr =
           prev_trace_info->m_instruction_addr;
-      } else if (KNOB(KNOB_LARGE_CORE_TYPE)->getValue() == "igpu") {
-        trace_info_igpu_s *prev_trace_info =
-          static_cast<trace_info_igpu_s *>(thread->m_prev_trace_info);
-        fetch_data->m_MT_scheduler.m_next_fetch_addr =
-          prev_trace_info->m_instruction_addr;
       } else {
         ASSERTM(0, "Wrong core type %s\n",
                 KNOB(KNOB_LARGE_CORE_TYPE)->getValue().c_str());
@@ -457,8 +461,8 @@ FRONTEND_MODE frontend_c::process_ifetch(unsigned int tid,
         ASSERT(new_uop);
 
         // read an uop from the traces
-        if (!m_simBase->m_trace_reader->get_uops_from_traces(
-              m_core_id, new_uop, tid, m_knob_ptx_sim)) {
+        if (!m_simBase->m_trace_reader->get_uops_from_traces(m_core_id, new_uop,
+                                                             tid, m_ptx_sim)) {
           // couldn't get an uop
           DEBUG_CORE(m_core_id, "not success\n");
           m_uop_pool->release_entry(new_uop->free());
@@ -631,7 +635,7 @@ bool frontend_c::access_icache(int tid, Addr fetch_addr,
     int result = m_simBase->m_memory->new_mem_req(
       MRT_IFETCH, line_addr, m_knob_icache_line_size, false, false, 0, NULL,
       icache_fill_line_wrapper, m_core->get_unique_uop_num(), NULL, m_core_id,
-      tid, m_knob_ptx_sim);
+      tid, m_ptx_sim);
 
     // mshr full
     if (!result) return false;
@@ -712,7 +716,7 @@ bool frontend_c::icache_fill_line(mem_req_s *req) {
   if (m_icache->access_cache(req->m_addr, &line_addr, false, req->m_appl_id) ==
       NULL) {
     m_icache->insert_cache(req->m_addr, &line_addr, &repl_line_addr,
-                           req->m_appl_id, req->m_ptx);
+                           req->m_appl_id, req->m_acc);
     POWER_CORE_EVENT(req->m_core_id, POWER_ICACHE_W);
   }
 
@@ -806,7 +810,7 @@ int frontend_c::predict_bpu(uop_c *uop) {
   // no branch prediction
   else {
     // GPU : stall on branch policy, stop fetching
-    if (m_knob_ptx_sim && *m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR) {
+    if (m_ptx_sim && *m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR) {
       set_br_wait(uop->m_thread_id);
       mispredicted = false;
     }
@@ -906,7 +910,7 @@ int frontend_c::fetch_rr(void) {
     }
 
     // check the thread is ready to fetch
-    if (m_knob_ptx_sim) {
+    if (m_ptx_sim) {
       // GPU : stall on branch policy, check whether previous branch has been resolved
       if (*m_simBase->m_knobs->KNOB_MT_NO_FETCH_BR &&
           !check_br_ready(fetch_id)) {

diff --git a/src/frontend.h b/src/frontend.h
@@ -407,7 +407,9 @@ class frontend_c
   uns m_knob_icache_line_size; /**< icache line size */
   bool m_fe_stall; /**< frontend stalled */
   bool m_fe_running; /**< enabled frontend */
-  bool m_knob_ptx_sim; /**< GPU simulation */
+  bool m_ptx_sim; /**< PTX simulation */
+  bool m_igpu_sim; /**< iGPU simulation */
+  bool m_acc_sim; /**< Accelerator simulation */
   bool m_ready_thread_available; /**< ready thread available */
   bool m_last_fetch_tid_failed;
   core_c* m_core; /**< core pointer */

diff --git a/src/global_types.h b/src/global_types.h
@@ -80,4 +80,9 @@ typedef enum uop_latency_map {  // enum for x86 latency maps - Michael
   NUM_LATENCY_MAPS
 } latency_map;
 
+typedef enum _ACC_Type_enum {
+  NO_ACC = 0, /**< no accelerator */
+  PTX_CORE, /**< PTX core */
+  IGPU_CORE /**< IGPU core */
+} ACC_Type;
 #endif
diff --git a/src/macsim.cc b/src/macsim.cc
@@ -336,7 +336,7 @@ void macsim_c::init_cores(int num_max_core) {
 
     // insert to the core type pool
     if (static_cast<string>(*m_simBase->m_knobs->KNOB_LARGE_CORE_TYPE) == "ptx")
-      m_ptx_core_pool.push(ii);
+      m_acc_core_pool.push(ii);
     else
       m_x86_core_pool.push(ii);
   }
@@ -352,7 +352,7 @@ void macsim_c::init_cores(int num_max_core) {
     // insert to the core type pool
     if (static_cast<string>(*m_simBase->m_knobs->KNOB_MEDIUM_CORE_TYPE) ==
         "ptx")
-      m_ptx_core_pool.push(ii + total_core);
+      m_acc_core_pool.push(ii + total_core);
     else
       m_x86_core_pool.push(ii + total_core);
   }
@@ -367,7 +367,7 @@ void macsim_c::init_cores(int num_max_core) {
 
     // insert to the core type pool
     if (static_cast<string>(*m_simBase->m_knobs->KNOB_CORE_TYPE) == "ptx")
-      m_ptx_core_pool.push(ii + total_core);
+      m_acc_core_pool.push(ii + total_core);
     else
       m_x86_core_pool.push(ii + total_core);
   }

diff --git a/src/macsim.h b/src/macsim.h
@@ -243,7 +243,7 @@ class macsim_c
   // process manager
   process_manager_c *m_process_manager; /**< process manager */
   queue<int> m_x86_core_pool; /**< x86 cores pool */
-  queue<int> m_ptx_core_pool; /**< GPU cores pool */
+  queue<int> m_acc_core_pool; /**< GPU cores pool */
   multi_key_map_c *m_block_id_mapper; /**< block id mapper */
 
   // data structure pools (to reduce overhead of memory allocation)

diff --git a/src/manifold/models/iris/iris_srcs/components/simpleRouter.cc b/src/manifold/models/iris/iris_srcs/components/simpleRouter.cc
@@ -249,13 +249,12 @@ SimpleRouter::handle_link_arrival( int port, LinkData* data )
   }
 
   //track flit_id of tail flit
-                    cout << manifold::kernel::Manifold::NowTicks() << ",p," 
-                        << ((HeadFlit*)data->f)->req->m_id << "," 
-                        << ((HeadFlit*)data->f)->req->m_ptx << ","
-                        << mem_state_copy[((HeadFlit*)data->f)->req->m_state] << ","
-                        << mem_req_noc_type_name[((HeadFlit*)data->f)->req->m_msg_type] << ","
-                        << node_id << "," << ((HeadFlit*)data->f)->dst_node << ","
-                        << endl;
+  cout << manifold::kernel::Manifold::NowTicks() << ",p,"
+       << ((HeadFlit*)data->f)->req->m_id << ","
+       << ((HeadFlit*)data->f)->req->m_acc << ","
+       << mem_state_copy[((HeadFlit*)data->f)->req->m_state] << ","
+       << mem_req_noc_type_name[((HeadFlit*)data->f)->req->m_msg_type] << ","
+       << node_id << "," << ((HeadFlit*)data->f)->dst_node << "," << endl;
          /*           
 		    cout << manifold::kernel::Manifold::NowTicks() << ",b," << node_id << ","; 
                     for(uint i=0; i<ports; i++)