cschpc · me2x · Jan 11, 2023 · Jan 12, 2023 · Jan 19, 2023
diff --git a/3d/hip/core_hip.cpp b/3d/hip/core_hip.cpp
@@ -179,9 +179,99 @@ __global__ void evolve_kernel(double *currdata, double *prevdata, double a, doub
     }
 }
 
-void evolve(Field& curr, Field& prev, const double a, const double dt)
+// Update the temperature values using five-point stencil */
+__global__ void evolve_kernel2(double *currdata, double *prevdata, double a, double dt, int nx, int ny, int nz,
+                       double inv_dx2, double inv_dy2, double inv_dz2, int max_threads)
 {
 
+    //linearize the thread id to have coalesced accesses
+    int ythr_offs = threadIdx.y * blockDim.x + threadIdx.x;
+    int zthr_offs = threadIdx.z * blockDim.y * blockDim.x+ ythr_offs;
+    int xblock_offs = blockIdx.x * blockDim.z * blockDim.y * blockDim.x + zthr_offs;
+    int yblock_offs = blockIdx.y * blockDim.z * blockDim.y * blockDim.x * gridDim.x + xblock_offs;
+    int tid = blockIdx.z * blockDim.z * blockDim.y * blockDim.x * gridDim.x * gridDim.y + yblock_offs;
+
+
+    //i only want to run significative computations, so i need to map the tid to the real memory addresses
+    //this means that i have some "holes" where the halo is mapped.
+    //to avoid too many uncoalesced accesses, we decide to "mask" the small skip (i.e. the halo on the central dimension)
+    //but we will avoid to evaluate the big skip (i.e. the halo surface when we change on the third dimension)
+    int start_offs = (ny + 2) * (nx + 2) +  (nx + 2);// 1 face + 1 line are the halos, must not be computated.
+    int big_skip_size = (ny + 2) + (nx + 2) ;//not completely sure, could be 2* nx+2 or 2*ny+2
+    int num_big_skips = tid / ((nx+2)*ny); //again not completely sure, could be 2nx or 2ny
+    int ind = tid + start_offs + num_big_skips * big_skip_size;
+
+
+    //keep load coalesced (i.e. do load also for thread that must not compute and write)
+    auto tmpdata = prevdata[ind];
+    auto ip = prevdata[ind + ((nx+2)*(ny+2))];
+    auto im = prevdata[ind - ((nx+2)*(ny+2))];
+    auto jp = prevdata[ind + (nx+2)];
+    auto jm = prevdata[ind - (nx+2)];
+    auto kp = prevdata[ind + 1];
+    auto km = prevdata[ind - 1];
+
+
+    //here we really exclude from the computation the "small skips", they are the first and the last on the innermost dimension
+    //e.g., if nx is 4, we need to exclude tid 0,5,6,11,12,... because there is where the halo is.
+    if (!((tid % (nx+2)) == 0 || ((tid - (nx+1)) % (nx+2)) == 0) && tid < max_threads)
+
+      //these printfs could be useful to understand the mapping of tid to memory idx.
+      //to use them you need to reduce the total dimension of the problem to something manageable (e.g. 4,4,4)
+      //and initialize the grid values with understandable numbers (i.e. 100*x+10*y+z)
+      //in this way you will have a set of print with all the "active" indexes
+
+      /*printf ("thread is block %d,%d,%d, is thread %d,%d,%d, tid %d, ind is %d, and value %f \n ",
+        (int)blockIdx.x,(int)blockIdx.y,(int)blockIdx.z,
+        (int)threadIdx.x,(int)threadIdx.y,(int)threadIdx.z,
+        tid,ind,
+        prevdata[ind]);*/
+        /*printf("ind is %f and my halo are %f %f %f %f %f %f \n",
+        prevdata[ind],
+        prevdata[ind+(nx+2)],           //y
+        prevdata[ind-(nx+2)],           //y
+        prevdata[ind+((nx+2)*(ny+2))],  //x   //i think the x-z inversion is due to fortran col major
+        prevdata[ind-((nx+2)*(ny+2))],  //x   //i think the x-z inversion is due to fortran col major
+        prevdata[ind+1],                //z   //i think the x-z inversion is due to fortran col major
+        prevdata[ind-1]                 //z   //i think the x-z inversion is due to fortran col major
+        );*/
+
+      tmpdata = tmpdata + a * dt * (
+                  ( ip - 2.0 * tmpdata + im ) * inv_dz2 + //c++ matrix, row major. "farthest" point is the z dimension
+                  ( jp - 2.0 * tmpdata + jm ) * inv_dy2 +
+                  ( kp - 2.0 * tmpdata + km ) * inv_dx2
+      );
+      //keep store coalesced. note that we used tmpdata local variable only for this reason.
+      //halo points will load and store it, without updating.
+      currdata[ind] = tmpdata;
+
+
+}
+
+
+void evolve(Field& curr, Field& prev, const double a, const double dt)
+{
+  //note that in the innermost dimension we do nx +2 to insert the "small skips"
+  //in the set of threads running. ny and nz don't need to be increased.
+    int max_threads = (curr.nx+2)* (curr.ny) * (curr.nz);
+    int blocks,grids;
+
+    //i'd like to use the optimal occupancy but the API seems to be broken.
+    //I'm not sure what is happening there but cannot use that
+    //hipOccupancyMaxPotentialBlockSize(&grids,&blocks,evolve_kernel, 0, max_threads);
+
+    //lets assign a block size of 256, this parameter can be played with.
+    blocks = 256;
+    grids = (max_threads + blocks -1) / blocks;
+
+    auto inv_dx2 = 1.0 / (prev.dx * prev.dx);
+    auto inv_dy2 = 1.0 / (prev.dy * prev.dy);
+    auto inv_dz2 = 1.0 / (prev.dz * prev.dz);
+
+    hipLaunchKernelGGL((evolve_kernel2), grids,blocks, 0, 0, curr.devdata(), prev.devdata(), a, dt, curr.nx, curr.ny, curr.nz,inv_dx2,inv_dy2,inv_dz2, max_threads );
+
+ //old evolve kernel here. just swap the comments.
+/*
   int nx = curr.nx;
   int ny = curr.ny;
   int nz = curr.nz;
@@ -225,7 +315,7 @@ void evolve(Field& curr, Field& prev, const double a, const double dt)
                                                 inv_dx2, inv_dy2, inv_dz2);
   hipDeviceSynchronize();
   CHECK_ERROR_MSG("evolve");
-
+*/
 }
 
 void evolve_interior(Field& curr, Field& prev, const double a, const double dt, hipStream_t *streams)

diff --git a/3d/hipfort/Makefile b/3d/hipfort/Makefile
@@ -0,0 +1,45 @@
+ifeq ($(COMP),)
+COMP=cray
+endif
+
+export HIPFORT_HOME=./hipfort/hipfortbin
+include Makefile.hipfort 
+
+
+
+EXE=heat_hipfort
+OBJS=main.o kernel.o heat_mod.o core.o utilities.o io.o setup.o 
+
+all: $(EXE)
+
+hipfort:
+	@git clone https://github.com/ROCmSoftwarePlatform/hipfort.git
+
+
+./hipfort/hipfortbin/lib/libhipfort-amdgcn.a: hipfort
+	@echo "running the command"
+	@bash install_hipfort.sh $(COMP)
+
+kernel.o: kernel.cpp ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
+heat_mod.o: heat_mod.F90 ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
+core.o: core.F90 heat_mod.o ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
+utilities.o: utilities.F90 heat_mod.o ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
+io.o: io.F90 heat_mod.o ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
+setup.o: setup.F90 heat_mod.o utilities.o io.o ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
+main.o: main.F90 heat_mod.o core.o io.o setup.o utilities.o ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
+
+
+
+$(EXE): $(OBJS) hipfort/hipfortbin/lib/libhipfort-amdgcn.a
+	$(FC) $(OBJS) -o $@ $(LDFLAGS) $(LIBS) $(LINKOPTS)
+
+%.o: %.cpp
+	$(CXX) -c $< -o $@
+
+%.o: %.F90
+	$(FC) -DGPU_MPI -c $< -o $@
+
+
+.PHONY: clean
+clean:
+	-/bin/rm -f $(EXE) a.out *.o *.mod *.png *~
diff --git a/3d/hipfort/Makefile.hipfort b/3d/hipfort/Makefile.hipfort
@@ -0,0 +1,51 @@
+
+#
+# Makefile.hipfort: Include Makefile to set values LINKOPTS, CXX, and FC 
+#                   to support compilation with HIPFORT.
+#
+# This file is meant to be included by other Makefiles.  The includer must 
+# set HIPFORT_HOME to the build or install location of HIPFORT.  For example:
+#
+#    HIPFORT_HOME ?= /opt/rocm/hipfort
+#    include $(HIPFORT_HOME)/bin/Makefile.hipfort
+#  
+# If the caller does not set HIPFORT_ARCHGPU, this Makefile will call 
+# a self-identification utility called myarchgpu.  Please create an issue
+# in the hipfort repo if myarchgpu returns unknown.
+#
+# To avoid self identification set HIPFORT_ARCHGPU.  Examples:
+#    HIPFORT_ARCHGPU = amdgcn-gfx906
+#    HIPFORT_ARCHGPU = nvptx-sm_70
+#
+
+export ROCM_PATH=/opt/rocm
+HIPFORT_ARCHGPU ?= $(shell $(HIPFORT_HOME)/libexec/hipfort/myarchgpu)
+ARCH = $(firstword $(subst -, ,$(HIPFORT_ARCHGPU)))
+#HIPFORT_COMPILER ?= /opt/cray/pe/gcc/11.2.0/bin/gfortran
+#HIPFORT_COMPILER ?= /opt/cray/pe/cce/14.0.2/bin/crayftn
+HIPFORT_COMPILER ?= ftn
+#CUDA_PATH=${CUDA_PATH:-/usr/local/cuda}
+#ROCM_PATH=${ROCM_PATH:-/opt/rocm}
+DEVICE_LIB_PATH  ?= $(ROCM_PATH)/lib
+HIP_CLANG_PATH   ?= $(ROCM_PATH)/llvm/bin
+HIP_PLATFORM=${HIP_PLATFORM:-amd}
+
+MOD_DIR  = $(HIPFORT_HOME)/include/hipfort/$(ARCH)
+LIB_DIR  = $(HIPFORT_HOME)/lib
+
+GPU = $(strip $(subst ., ,$(suffix $(subst -,.,$(HIPFORT_ARCHGPU)))))
+ifeq (nvptx,$(findstring nvptx,$(HIPFORT_ARCHGPU)))
+  UNAMEP     = $(shell uname -p)
+  LINKOPTS   = -L$(LIB_DIR) -lhipfort-$(ARCH) -L$(CUDA_PATH)/targets/$(UNAMEP)-linux/lib $(LINKOPTS) -lcudart
+  HIPCC_ENV  = HIP_PLATFORM=nvcc
+  HIPCC_OPTS = "--gpu-architecture=$(GPU)"
+else
+  LINKOPTS   = -L$(LIB_DIR) -lhipfort-$(ARCH) -L$(ROCM_PATH)/lib $(HIPCC_LIBS) -lamdhip64 -Wl,-rpath=$(ROCM_PATH)/lib
+  HIPCC_ENV  = HIP_PLATFORM=$(HIP_PLATFORM) DEVICE_LIB_PATH=$(DEVICE_LIB_PATH) HIP_CLANG_PATH=$(HIP_CLANG_PATH)
+  HIPCC_OPTS = -fno-gpu-rdc -fPIC --offload-arch=$(GPU)
+endif
+
+LINKOPTS += -lstdc++
+CXX      = $(ROCM_PATH)/bin/hipcc $(HIPCC_OPTS)
+FC_OPTS  = "-DHIPFORT_ARCH=\"$(ARCH)\""
+FC       = $(HIPFORT_COMPILER) -Rb -I$(MOD_DIR) $(FC_OPTS)
diff --git a/3d/hipfort/core.F90 b/3d/hipfort/core.F90
@@ -0,0 +1,97 @@
+! Main solver routines for heat equation solver
+module core
+  use heat
+  use iso_c_binding
+  use hipfort
+  use hipfort_check
+
+  !binding the evolve function to the c wrapper that includes the cpp/hip kernel.
+  !note that I'm working with the field_c struct, that is a struct declared in both c and fortran
+  !that contains the details of the field plus a c_ptr that is pointing to the DEVICE memory.
+  !kernel details:
+  ! Compute one time step of temperature evolution
+  ! Arguments:
+  !   curr (type(field_c)): current temperature values
+  !   prev (type(field_c)): values from previous time step
+  !   a (real(c_double)): update equation constant
+  !   dt (real(c_double)): time step value
+
+  interface
+    subroutine evolve(curr, prev, a, dt) , bind(c,name="evolve")
+      use iso_c_binding
+      import field_c
+      implicit none
+
+      type(field_c), target, intent(inout) :: curr, prev
+      real(c_double),value :: a, dt
+
+    end subroutine evolve
+  end interface
+
+
+contains
+
+  ! Exchange the boundary data between MPI tasks
+  subroutine exchange(field0, parallel)
+    use mpi
+
+    implicit none
+
+    type(field), target, intent(inout) :: field0
+    type(parallel_data), intent(in) :: parallel
+
+    !!could be a cray issue. have to check with other compilers
+    real(c_double), dimension(:,:,:),allocatable, target :: data !!!!!! THIS MUST BE ALLOCATABLE NOT POINTER FOR MULTIDIMENSIONAL THINGS !!!!!!!
+
+    integer :: buf_size
+
+    integer :: ierr
+
+    call c_f_pointer (field0%inner_field%data , data, shape(field0%data))
+    !write(*,*) '#@#@#@#@ data pointer shape is ', shape(data), 'old ptr shape is ', shape(field0%data)
+    buf_size = (field0%inner_field%nx + 2) * (field0%inner_field%ny + 2)
+
+    !#ifdef GPU_MPI
+    !for now let's suppose that it's always device2device
+    !TODO manage data transfer via host
+
+    ! Send to left, receive from right
+    call mpi_sendrecv(data(:, :, 1), buf_size, MPI_DOUBLE_PRECISION, &
+         & parallel%nleft, 11, &
+         & data(:, :, field0%inner_field%nz+2), buf_size, MPI_DOUBLE_PRECISION, &
+         & parallel%nright, 11, &
+         & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+
+    ! Send to right, receive from left
+    call mpi_sendrecv(data(:, :, field0%inner_field%nz+1), buf_size, MPI_DOUBLE_PRECISION, &
+         & parallel%nright, 12, &
+         & data(:, :, 1), buf_size, MPI_DOUBLE_PRECISION,&
+         & parallel%nleft, 12, &
+         & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+    !write(15,*) '########mpi stuff here, data pointer is at ', c_loc(data(1,1,1)), 'doublecheck ', c_loc(data), 'triplecheck', field0%inner_field%data
+    !write(15,*) '########mpi stuff here, second element is at ', c_loc(data(1,1,2)), 'or ', c_loc(data(2,1,1))
+    !write(15,*) 'size of data is', size(data), 'address of last thing is ', c_loc(data(6,6,4)), 'and it contains ',data(6,6,4)
+    !write(15,*) 'first element over data is ',c_loc(data(7,6,4)), 'that contains ',data(7,6,4), ' or ', c_loc(data(6,6,5)),' that contains ', data(6,6,5) !want to be sure they are out of bounds.
+    !write(15,*) 'first sendrecv data (left send, right recv), send ',c_loc(data(:,:,2)),' recv ', c_loc(data(:, :, field0%inner_field%nz + 2))
+    !write(15,*) 'second sendrecv data, send ',c_loc(data(:, :, field0%inner_field%nz+1)),' recv ', c_loc(data(:, :, 1))
+    !write(15,*) 'field host data is at ', c_loc(field0%data), ' and device data is at ', field0%inner_field%data, ' data pointer is at ', c_loc(data)
+    !write(15,*) 'buf size is ', buf_size
+    !write(15,*) 'shape of data is ', shape(data)
+    !call flush (15)
+    !write(15,*) 'buf_size', buf_size
+    !call flush (15)
+    !write(15,*) 'precision', MPI_DOUBLE_PRECISION
+    !call flush (15)
+    !write(15,*) 'right proc' ,parallel%nright
+    !call flush (15)
+    !write(15,*) 'pointer' ,data(:, :, 1)
+    !call flush (15)
+    !write(15,*) 'left', parallel%nleft
+    !call flush (15)
+
+    !write(15,*) 'second done'
+
+
+  end subroutine exchange
+
+end module core