diff --git a/3d/hip/core_hip.cpp b/3d/hip/core_hip.cpp
index 90461cb..9109067 100644
--- a/3d/hip/core_hip.cpp
+++ b/3d/hip/core_hip.cpp
@@ -179,9 +179,99 @@ __global__ void evolve_kernel(double *currdata, double *prevdata, double a, doub
     }
 }
 
-void evolve(Field& curr, Field& prev, const double a, const double dt)
+// Update the temperature values using a seven-point stencil
+__global__ void evolve_kernel2(double *currdata, double *prevdata, double a, double dt, int nx, int ny, int nz,
+                               double inv_dx2, double inv_dy2, double inv_dz2, int max_threads)
 {
+  // Linearize the thread id to get coalesced accesses
+  int ythr_offs = threadIdx.y * blockDim.x + threadIdx.x;
+  int zthr_offs = threadIdx.z * blockDim.y * blockDim.x + ythr_offs;
+  int xblock_offs = blockIdx.x * blockDim.z * blockDim.y * blockDim.x + zthr_offs;
+  int yblock_offs = blockIdx.y * blockDim.z * blockDim.y * blockDim.x * gridDim.x + xblock_offs;
+  int tid = blockIdx.z * blockDim.z * blockDim.y * blockDim.x * gridDim.x * gridDim.y + yblock_offs;
+
+  // We only want to run meaningful computations, so the tid must be mapped to the real memory
+  // addresses; this means there are "holes" where the halo lives. To limit uncoalesced accesses
+  // we "mask" the small skip (the two halo cells on each row of the innermost dimension) inside
+  // the threads, but we avoid launching threads at all for the big skip (the halo rows crossed
+  // when moving to the next face of the grid).
+  int start_offs = (ny + 2) * (nx + 2) + (nx + 2); // 1 face + 1 line of halo, must not be computed
+  int big_skip_size = 2 * (nx + 2);                // two halo rows of length nx+2 are crossed at each face
+                                                   // boundary (equals the earlier (ny+2)+(nx+2) when nx == ny)
+  int num_big_skips = tid / ((nx + 2) * ny);       // one big skip per completed face of (nx+2)*ny threads
+  int ind = tid + start_offs + num_big_skips * big_skip_size;
+
+  // Keep loads coalesced (i.e. load also in threads that must not compute and write).
+  // Note: threads with tid >= max_threads load too; on small grids the rounded-up launch
+  // can read past the end of the buffer.
+  auto tmpdata = prevdata[ind];
+  auto ip = prevdata[ind + ((nx + 2) * (ny + 2))];
+  auto im = prevdata[ind - ((nx + 2) * (ny + 2))];
+  auto jp = prevdata[ind + (nx + 2)];
+  auto jm = prevdata[ind - (nx + 2)];
+  auto kp = prevdata[ind + 1];
+  auto km = prevdata[ind - 1];
+
+  // Here we really exclude the "small skips" from the computation; they are the first and the
+  // last entries of each row of the innermost dimension. E.g., if nx is 4, we must exclude
+  // tid 0,5,6,11,12,... because that is where the halo sits.
+  if (!((tid % (nx + 2)) == 0 || ((tid - (nx + 1)) % (nx + 2)) == 0) && tid < max_threads)
+
+  // These printfs can be useful to understand the mapping of tid to memory idx. To use them,
+  // reduce the total problem size to something manageable (e.g. 4,4,4) and initialize the grid
+  // values with recognizable numbers (i.e. 100*x+10*y+z); you then get one print per "active" index.
+  /*printf("thread is block %d,%d,%d, is thread %d,%d,%d, tid %d, ind is %d, and value %f \n",
+         (int)blockIdx.x, (int)blockIdx.y, (int)blockIdx.z,
+         (int)threadIdx.x, (int)threadIdx.y, (int)threadIdx.z,
+         tid, ind,
+         prevdata[ind]);*/
+  /*printf("ind is %f and my halo are %f %f %f %f %f %f \n",
+         prevdata[ind],
+         prevdata[ind+(nx+2)],          // y
+         prevdata[ind-(nx+2)],          // y
+         prevdata[ind+((nx+2)*(ny+2))], // x -- the x-z inversion is probably due to Fortran being column major
+         prevdata[ind-((nx+2)*(ny+2))], // x
+         prevdata[ind+1],               // z
+         prevdata[ind-1]                // z
+         );*/
+
+  tmpdata = tmpdata + a * dt * (
+      ( ip - 2.0 * tmpdata + im ) * inv_dz2 + // C++ matrix, row major: the "farthest" stride is the z dimension
+      ( jp - 2.0 * tmpdata + jm ) * inv_dy2 +
+      ( kp - 2.0 * tmpdata + km ) * inv_dx2
+      );
+  // Keep the store coalesced as well; the tmpdata local exists only for this reason.
+  // Halo points load it and store it back without updating.
+  currdata[ind] = tmpdata;
+}
+
+void evolve(Field& curr, Field& prev, const double a, const double dt)
+{
+  // Note that on the innermost dimension we use nx+2 to include the "small skips" in the set
+  // of running threads; ny and nz do not need to be increased.
+  int max_threads = (curr.nx + 2) * (curr.ny) * (curr.nz);
+  int blocks, grids;
+
+  // We would like to use the optimal occupancy, but the API did not work for us
+  // (unclear what happens there), so it stays disabled:
+  //hipOccupancyMaxPotentialBlockSize(&grids, &blocks, evolve_kernel, 0, max_threads);
+
+  // Let's assign a block size of 256; this parameter can be played with.
+  blocks = 256;
+  grids = (max_threads + blocks - 1) / blocks;
+
+  auto inv_dx2 = 1.0 / (prev.dx * prev.dx);
+  auto inv_dy2 = 1.0 / (prev.dy * prev.dy);
+  auto inv_dz2 = 1.0 / (prev.dz * prev.dz);
+
+  hipLaunchKernelGGL((evolve_kernel2), grids, blocks, 0, 0, curr.devdata(), prev.devdata(), a, dt,
+                     curr.nx, curr.ny, curr.nz, inv_dx2, inv_dy2, inv_dz2, max_threads);
+
+  // The old evolve kernel follows, commented out; swap the comments to use it.
+/*
   int nx = curr.nx;
   int ny = curr.ny;
   int nz = curr.nz;
@@ -225,7 +315,7 @@ void evolve(Field& curr, Field& prev, const double a, const double dt)
                  inv_dx2, inv_dy2, inv_dz2);
   hipDeviceSynchronize();
   CHECK_ERROR_MSG("evolve");
-
+*/
 }
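// ---------------------------------------------------------------------------
// Illustration only (not part of the build): a minimal host-side sketch that
// replays the tid -> ind arithmetic of evolve_kernel2 for a tiny grid, so the
// halo masking and the big skips can be checked on the CPU instead of
// printf-debugging the kernel. The grid size and the standalone main() are
// assumptions made for the example.
// ---------------------------------------------------------------------------
#if 0
#include <cstdio>

int main()
{
  const int nx = 4, ny = 4, nz = 4;
  const int max_threads = (nx + 2) * ny * nz;
  const int start_offs = (ny + 2) * (nx + 2) + (nx + 2);
  const int big_skip_size = 2 * (nx + 2);
  for (int tid = 0; tid < max_threads; ++tid) {
    const int num_big_skips = tid / ((nx + 2) * ny);
    const int ind = tid + start_offs + num_big_skips * big_skip_size;
    const bool halo = (tid % (nx + 2)) == 0 || ((tid - (nx + 1)) % (nx + 2)) == 0;
    // decode ind back into (x, y, z) coordinates of the (nx+2)x(ny+2)x(nz+2) box;
    // non-masked tids should land on x in [1,nx], y in [1,ny], z in [1,nz]
    const int z = ind / ((nx + 2) * (ny + 2));
    const int rem = ind % ((nx + 2) * (ny + 2));
    const int y = rem / (nx + 2), x = rem % (nx + 2);
    printf("tid %3d -> ind %3d (x=%d y=%d z=%d)%s\n",
           tid, ind, x, y, z, halo ? " [halo, masked]" : "");
  }
  return 0;
}
#endif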
 
 void evolve_interior(Field& curr, Field& prev, const double a, const double dt, hipStream_t *streams)
diff --git a/3d/hipfort/Makefile b/3d/hipfort/Makefile
new file mode 100644
index 0000000..6af2610
--- /dev/null
+++ b/3d/hipfort/Makefile
@@ -0,0 +1,45 @@
+ifeq ($(COMP),)
+COMP=cray
+endif
+
+export HIPFORT_HOME=./hipfort/hipfortbin
+include Makefile.hipfort
+
+EXE=heat_hipfort
+OBJS=main.o kernel.o heat_mod.o core.o utilities.o io.o setup.o
+
+all: $(EXE)
+
+hipfort:
+	@git clone https://github.com/ROCmSoftwarePlatform/hipfort.git
+
+./hipfort/hipfortbin/lib/libhipfort-amdgcn.a: hipfort
+	@echo "Building and installing hipfort"
+	@bash install_hipfort.sh $(COMP)
+
+kernel.o: kernel.cpp ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
+heat_mod.o: heat_mod.F90 ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
+core.o: core.F90 heat_mod.o ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
+utilities.o: utilities.F90 heat_mod.o ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
+io.o: io.F90 heat_mod.o ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
+setup.o: setup.F90 heat_mod.o utilities.o io.o ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
+main.o: main.F90 heat_mod.o core.o io.o setup.o utilities.o ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
+
+$(EXE): $(OBJS) hipfort/hipfortbin/lib/libhipfort-amdgcn.a
+	$(FC) $(OBJS) -o $@ $(LDFLAGS) $(LIBS) $(LINKOPTS)
+
+%.o: %.cpp
+	$(CXX) -c $< -o $@
+
+%.o: %.F90
+	$(FC) -DGPU_MPI -c $< -o $@
+
+.PHONY: clean
+clean:
+	-/bin/rm -f $(EXE) a.out *.o *.mod *.png *~
diff --git a/3d/hipfort/Makefile.hipfort b/3d/hipfort/Makefile.hipfort
new file mode 100644
index 0000000..5756d81
--- /dev/null
+++ b/3d/hipfort/Makefile.hipfort
@@ -0,0 +1,51 @@
+#
+# Makefile.hipfort: Include Makefile that sets LINKOPTS, CXX, and FC
+# to support compilation with HIPFORT.
+#
+# This file is meant to be included by other Makefiles. The includer must
+# set HIPFORT_HOME to the build or install location of HIPFORT. For example:
+#
+#   HIPFORT_HOME ?= /opt/rocm/hipfort
+#   include $(HIPFORT_HOME)/bin/Makefile.hipfort
+#
+# If the caller does not set HIPFORT_ARCHGPU, this Makefile will call
+# a self-identification utility called myarchgpu. Please create an issue
+# in the hipfort repo if myarchgpu returns unknown.
+#
+# To avoid self-identification, set HIPFORT_ARCHGPU. Examples:
+#   HIPFORT_ARCHGPU = amdgcn-gfx906
+#   HIPFORT_ARCHGPU = nvptx-sm_70
+#
+
+export ROCM_PATH=/opt/rocm
+HIPFORT_ARCHGPU ?= $(shell $(HIPFORT_HOME)/libexec/hipfort/myarchgpu)
+ARCH = $(firstword $(subst -, ,$(HIPFORT_ARCHGPU)))
+#HIPFORT_COMPILER ?= /opt/cray/pe/gcc/11.2.0/bin/gfortran
+#HIPFORT_COMPILER ?= /opt/cray/pe/cce/14.0.2/bin/crayftn
+HIPFORT_COMPILER ?= ftn
+#CUDA_PATH=${CUDA_PATH:-/usr/local/cuda}
+#ROCM_PATH=${ROCM_PATH:-/opt/rocm}
+DEVICE_LIB_PATH ?= $(ROCM_PATH)/lib
+HIP_CLANG_PATH ?= $(ROCM_PATH)/llvm/bin
+HIP_PLATFORM=${HIP_PLATFORM:-amd}
+
+MOD_DIR = $(HIPFORT_HOME)/include/hipfort/$(ARCH)
+LIB_DIR = $(HIPFORT_HOME)/lib
+
+GPU = $(strip $(subst ., ,$(suffix $(subst -,.,$(HIPFORT_ARCHGPU)))))
+ifeq (nvptx,$(findstring nvptx,$(HIPFORT_ARCHGPU)))
+  UNAMEP = $(shell uname -p)
+  LINKOPTS = -L$(LIB_DIR) -lhipfort-$(ARCH) -L$(CUDA_PATH)/targets/$(UNAMEP)-linux/lib -lcudart
+  HIPCC_ENV = HIP_PLATFORM=nvcc
+  HIPCC_OPTS = "--gpu-architecture=$(GPU)"
+else
+  LINKOPTS = -L$(LIB_DIR) -lhipfort-$(ARCH) -L$(ROCM_PATH)/lib $(HIPCC_LIBS) -lamdhip64 -Wl,-rpath=$(ROCM_PATH)/lib
+  HIPCC_ENV = HIP_PLATFORM=$(HIP_PLATFORM) DEVICE_LIB_PATH=$(DEVICE_LIB_PATH) HIP_CLANG_PATH=$(HIP_CLANG_PATH)
+  HIPCC_OPTS = -fno-gpu-rdc -fPIC --offload-arch=$(GPU)
+endif
+
+LINKOPTS += -lstdc++
+CXX = $(ROCM_PATH)/bin/hipcc $(HIPCC_OPTS)
+FC_OPTS = "-DHIPFORT_ARCH=\"$(ARCH)\""
+FC = $(HIPFORT_COMPILER) -Rb -I$(MOD_DIR) $(FC_OPTS)
diff --git a/3d/hipfort/core.F90 b/3d/hipfort/core.F90
new file mode 100644
index 0000000..5f1bb59
--- /dev/null
+++ b/3d/hipfort/core.F90
@@ -0,0 +1,97 @@
+! Main solver routines for heat equation solver
+module core
+  use heat
+  use iso_c_binding
+  use hipfort
+  use hipfort_check
+
+  ! Bind the evolve function to the C wrapper that contains the cpp/hip kernel.
+  ! Note that this works with the field_c struct, which is declared on both the C
+  ! and the Fortran side and contains the details of the field plus a c_ptr that
+  ! points to the DEVICE memory.
+  ! Kernel details:
+  !   Compute one time step of temperature evolution
+  !   Arguments:
+  !     curr (type(field_c)): current temperature values
+  !     prev (type(field_c)): values from previous time step
+  !     a (real(c_double)): update equation constant
+  !     dt (real(c_double)): time step value
+  interface
+     subroutine evolve(curr, prev, a, dt) bind(c, name="evolve")
+       use iso_c_binding
+       import field_c
+       implicit none
+
+       type(field_c), target, intent(inout) :: curr, prev
+       real(c_double), value :: a, dt
+
+     end subroutine evolve
+  end interface
+
+contains
+
+  ! Exchange the boundary data between MPI tasks
+  subroutine exchange(field0, parallel)
+    use mpi
+
+    implicit none
+
+    type(field), target, intent(inout) :: field0
+    type(parallel_data), intent(in) :: parallel
+
+    !! Could be a Cray issue, has to be checked with other compilers:
+    !! THIS MUST BE ALLOCATABLE, NOT POINTER, FOR MULTIDIMENSIONAL THINGS
+    real(c_double), dimension(:,:,:), allocatable, target :: data
+
+    integer :: buf_size
+    integer :: ierr
+
+    call c_f_pointer(field0%inner_field%data, data, shape(field0%data))
+    !write(*,*) '#@#@#@#@ data pointer shape is ', shape(data), 'old ptr shape is ', shape(field0%data)
+    buf_size = (field0%inner_field%nx + 2) * (field0%inner_field%ny + 2)
+
+    !#ifdef GPU_MPI
+    ! For now assume the transfers are always device-to-device
+    ! TODO: manage the data transfer via host
+
+    ! Send to left, receive from right. Note that the pointer from c_f_pointer
+    ! is 1-based, so plane 2 is the first interior plane (plane 1 is the halo),
+    ! as the debug prints below confirm.
+    call mpi_sendrecv(data(:, :, 2), buf_size, MPI_DOUBLE_PRECISION, &
+         & parallel%nleft, 11, &
+         & data(:, :, field0%inner_field%nz+2), buf_size, MPI_DOUBLE_PRECISION, &
+         & parallel%nright, 11, &
+         & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+
+    ! Send to right, receive from left
+    call mpi_sendrecv(data(:, :, field0%inner_field%nz+1), buf_size, MPI_DOUBLE_PRECISION, &
+         & parallel%nright, 12, &
+         & data(:, :, 1), buf_size, MPI_DOUBLE_PRECISION, &
+         & parallel%nleft, 12, &
+         & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+
+    ! Debug prints, kept because they were essential to get the addressing right:
+    !write(15,*) '########mpi stuff here, data pointer is at ', c_loc(data(1,1,1)), 'doublecheck ', c_loc(data), 'triplecheck', field0%inner_field%data
+    !write(15,*) '########mpi stuff here, second element is at ', c_loc(data(1,1,2)), 'or ', c_loc(data(2,1,1))
+    !write(15,*) 'size of data is', size(data), 'address of last thing is ', c_loc(data(6,6,4)), 'and it contains ', data(6,6,4)
+    !write(15,*) 'first element over data is ', c_loc(data(7,6,4)), 'that contains ', data(7,6,4), ' or ', c_loc(data(6,6,5)), ' that contains ', data(6,6,5) ! want to be sure they are out of bounds
+    !write(15,*) 'first sendrecv data (left send, right recv), send ', c_loc(data(:,:,2)), ' recv ', c_loc(data(:, :, field0%inner_field%nz + 2))
+    !write(15,*) 'second sendrecv data, send ', c_loc(data(:, :, field0%inner_field%nz+1)), ' recv ', c_loc(data(:, :, 1))
+    !write(15,*) 'field host data is at ', c_loc(field0%data), ' and device data is at ', field0%inner_field%data, ' data pointer is at ', c_loc(data)
+    !write(15,*) 'buf size is ', buf_size
+    !write(15,*) 'shape of data is ', shape(data)
+    !call flush (15)
+    !write(15,*) 'buf_size', buf_size
+    !call flush (15)
+    !write(15,*) 'precision', MPI_DOUBLE_PRECISION
+    !call flush (15)
+    !write(15,*) 'right proc', parallel%nright
+    !call flush (15)
+    !write(15,*) 'pointer', data(:, :, 1)
+    !call flush (15)
+    !write(15,*) 'left', parallel%nleft
+    !call flush (15)
+    !write(15,*) 'second done'
+
+  end subroutine exchange
+
+end module core
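// ---------------------------------------------------------------------------
// Illustration only (not part of the diff): a plain C view of what exchange()
// above does once the device pointer is recovered. With a GPU-aware MPI the
// raw pointer from field_c's data member can be handed to MPI_Sendrecv
// directly; plane p of the (nx+2) x (ny+2) x (nz+2) box starts at offset
// p*(nx+2)*(ny+2), with planes 1..nz interior and planes 0 and nz+1 the halos.
// The function name and argument list are assumptions made for the sketch.
// ---------------------------------------------------------------------------
#include <mpi.h>

static void exchange_c(double *devdata, int nx, int ny, int nz,
                       int nleft, int nright)
{
  const int plane = (nx + 2) * (ny + 2); // doubles per z-plane

  // send first interior plane to the left, receive into the right halo plane
  MPI_Sendrecv(devdata + 1L * plane, plane, MPI_DOUBLE, nleft, 11,
               devdata + (nz + 1L) * plane, plane, MPI_DOUBLE, nright, 11,
               MPI_COMM_WORLD, MPI_STATUS_IGNORE);

  // send last interior plane to the right, receive into the left halo plane
  MPI_Sendrecv(devdata + (long)nz * plane, plane, MPI_DOUBLE, nright, 12,
               devdata, plane, MPI_DOUBLE, nleft, 12,
               MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}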
diff --git a/3d/hipfort/heat_mod.F90 b/3d/hipfort/heat_mod.F90
new file mode 100644
index 0000000..c975f7d
--- /dev/null
+++ b/3d/hipfort/heat_mod.F90
@@ -0,0 +1,138 @@
+! Field metadata for heat equation solver
+module heat
+  use iso_fortran_env, only : REAL64
+  use iso_c_binding
+  use hipfort
+  use hipfort_check
+
+  implicit none
+
+  integer, parameter :: dp = REAL64
+  real(dp), parameter :: DX = 0.01, DY = 0.01, DZ = 0.01  ! Fixed grid spacing
+
+  ! Data type structures. We want one struct containing all the details usable from
+  ! the GPU side (i.e. the C side of the program, device memory), and a wrapper of
+  ! that struct containing the Fortran declaration of the memory (i.e. the host
+  ! memory). The device memory must be declared as a c_ptr and is allocated and
+  ! managed manually with hipfort calls. The host memory has to be a pointer
+  ! (allocatable,target was rejected; the reason is unclear and the actual compiler
+  ! error was not recorded).
+  type, bind(c) :: field_c
+     integer (c_int) :: nx  ! local dimensions of the field
+     integer (c_int) :: ny
+     integer (c_int) :: nz
+     integer (c_int) :: nx_full  ! global dimensions of the field
+     integer (c_int) :: ny_full
+     integer (c_int) :: nz_full
+     real(c_double) :: dx
+     real(c_double) :: dy
+     real(c_double) :: dz
+     type(c_ptr) :: data = c_null_ptr
+  end type field_c
+
+  type :: field
+     type(field_c) :: inner_field
+     real(c_double), dimension(:,:,:), pointer :: data
+  end type field
+
+  type :: parallel_data
+     integer :: size
+     integer :: rank
+     integer :: nleft, nright  ! Ranks of neighbouring MPI tasks
+     integer (c_int) :: dev_count  ! c_int because it is passed to the hipfort functions
+  end type parallel_data
+
+contains
+  ! Initialize the field type metadata
+  ! Arguments:
+  !   field0 (type(field)): input field
+  !   nx, ny, nz: field dimensions
+  subroutine set_field_dimensions(field0, nx, ny, nz, parallel)
+    implicit none
+
+    type(field), intent(out) :: field0
+    integer, intent(in) :: nx, ny, nz
+    type(parallel_data), intent(in) :: parallel
+
+    integer :: nx_local, ny_local, nz_local
+
+    nx_local = nx
+    ny_local = ny
+    nz_local = nz / parallel%size
+
+    field0%inner_field%dx = DX
+    field0%inner_field%dy = DY
+    field0%inner_field%dz = DZ
+    field0%inner_field%nx = nx_local
+    field0%inner_field%ny = ny_local
+    field0%inner_field%nz = nz_local
+    field0%inner_field%nx_full = nx
+    field0%inner_field%ny_full = ny
+    field0%inner_field%nz_full = nz
+
+  end subroutine set_field_dimensions
+
+  subroutine parallel_setup(parallel, nx, ny, nz)
+    use mpi
+
+    implicit none
+
+    type(parallel_data), intent(out) :: parallel
+    integer, intent(in), optional :: nx, ny, nz
+
+    integer :: nz_local
+    integer :: ierr
+
+    integer :: node_rank, node_procs, my_device
+    integer :: intranodecomm
+
+    call mpi_comm_size(MPI_COMM_WORLD, parallel%size, ierr)
+
+    if (present(nz)) then
+       nz_local = nz / parallel%size
+       if (nz_local * parallel%size /= nz) then
+          write(*,*) 'Cannot divide grid evenly to processors'
+          call mpi_abort(MPI_COMM_WORLD, -2, ierr)
+       end if
+    end if
+
+    call mpi_comm_rank(MPI_COMM_WORLD, parallel%rank, ierr)
+
+    parallel%nleft = parallel%rank - 1
+    parallel%nright = parallel%rank + 1
+
+    if (parallel%nleft < 0) then
+       parallel%nleft = MPI_PROC_NULL
+    end if
+    if (parallel%nright > parallel%size - 1) then
+       parallel%nright = MPI_PROC_NULL
+    end if
+
+    parallel%dev_count = 0
+
+    ! Pick the GPU by node-local rank
+    call mpi_comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, intranodecomm, ierr)
+
+    call mpi_comm_rank(intranodecomm, node_rank, ierr)
+    call mpi_comm_size(intranodecomm, node_procs, ierr)
+
+    call mpi_comm_free(intranodecomm, ierr)
+
+    call hipCheck(hipGetDeviceCount(parallel%dev_count))
+    my_device = mod(node_rank, parallel%dev_count)
+
+    if (node_procs > parallel%dev_count) then
+       if (parallel%rank == 0) then
+          write(*, '(A,I4,A,I4)') 'Oversubscribing GPUs: MPI tasks per node: ', &
+               & node_procs, ' GPUs per node: ', parallel%dev_count
+       end if
+    end if
+
+    call hipCheck(hipSetDevice(my_device))
+
+  end subroutine parallel_setup
+
+end module heat
diff --git a/3d/hipfort/install_hipfort.sh b/3d/hipfort/install_hipfort.sh
new file mode 100644
index 0000000..6ca3802
--- /dev/null
+++ b/3d/hipfort/install_hipfort.sh
@@ -0,0 +1,12 @@
+cd hipfort
+mkdir build
+cd build
+echo "compiler is $1"
+if [ "$1" = "cray" ]; then
+cmake .. -DHIPFORT_COMPILER=ftn -DHIPFORT_COMPILER_FLAGS="-ffree -e Z" -DCMAKE_INSTALL_PREFIX=../hipfortbin
+elif [ "$1" = "gnu" ]; then
+cmake .. -DHIPFORT_COMPILER=gfortran -DCMAKE_Fortran_FLAGS=" -ffree-form -cpp -ffree-line-length-none -fmax-errors=5 -std=f2008 -fno-underscoring" -DCMAKE_INSTALL_PREFIX=../hipfortbin
+fi
+
+make
+make install
diff --git a/3d/hipfort/io.F90 b/3d/hipfort/io.F90
new file mode 100644
index 0000000..7b26f76
--- /dev/null
+++ b/3d/hipfort/io.F90
@@ -0,0 +1,126 @@
+! I/O routines for heat equation solver
+module io
+  use heat
+  use mpi
+  use iso_c_binding
+  use hipfort
+  use hipfort_check
+
+contains
+
+  ! Output routine, saves the temperature distribution as a png image
+  ! Arguments:
+  !   curr (type(field)): variable with the temperature data
+  !   iter (integer): index of the time step
+  ! Not yet supported; the old code is left commented out. The only required
+  ! change SHOULD be to add the memcpy from device to host.
+  subroutine write_field(curr, iter, parallel)
+
+!#ifndef DISABLE_PNG
+!    use pngwriter
+!#endif
+!    implicit none
+!    type(field), intent(in) :: curr
+!    integer, intent(in) :: iter
+!    type(parallel_data), intent(in) :: parallel
+!
+!    character(len=85) :: filename
+!
+!    integer :: stat
+!    real(dp), dimension(:,:,:), allocatable, target :: full_data
+!    integer :: p, ierr
+!
+!    if (parallel%rank == 0) then
+!       allocate(full_data(curr%nx_full, curr%ny_full, curr%nz_full))
+!       ! Copy rank #0 data to the global array
+!       full_data(1:curr%nx, 1:curr%ny, 1:curr%nz) = curr%data(1:curr&
+!            &%nx, 1:curr%ny, 1:curr%nz)
+!
+!       ! Receive data from other ranks
+!       do p = 1, parallel%size - 1
+!          call mpi_recv(full_data(1:curr%nx, 1:curr%ny, p*curr%nz + 1:(p + 1) * curr%nz), &
+!               & curr%nx * curr%ny * curr%nz, MPI_DOUBLE_PRECISION, p, 22, &
+!               & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+!       end do
+!       write(filename,'(A5,I4.4,A4,A)') 'heat_', iter, '.png'
+!#ifdef DISABLE_PNG
+!       if (parallel%rank == 0) then
+!          write(*,*) "No libpng, file not written"
+!       end if
+!#else
+!       stat = save_png(full_data(:, :, curr%nz_full / 2), curr%nx_full, curr%ny_full, filename)
+!#endif
+!       deallocate(full_data)
+!    else
+!       ! Send data
+!       call mpi_send(curr%data(1:curr%nx, 1:curr%ny, 1:curr%nz), curr%nx * curr%ny * curr%nz, MPI_DOUBLE_PRECISION, 0, 22, &
+!            & MPI_COMM_WORLD, ierr)
+!    end if
+
+  end subroutine write_field
+
+  ! Reads the temperature distribution from an input file
+  ! Arguments:
+  !   field0 (type(field)): field variable that will store the read data
+  !   filename (char): name of the input file
+  ! Note that this version assumes the input data to be in C memory layout.
+  ! Untested routine, taken from openacc/fortran; only the hipfort calls were added.
+  subroutine read_field(field0, filename, parallel)
+
+    implicit none
+    type(field), intent(out) :: field0
+    character(len=85), intent(in) :: filename
+    type(parallel_data), intent(out) :: parallel
+
+    integer :: nx, ny, nz, i, ierr
+    character(len=2) :: dummy
+
+    real(dp), dimension(:,:,:), allocatable :: full_data, inner_data
+
+    open(10, file=filename)
+    ! Read the header
+    read(10, *) dummy, nx, ny, nz
+
+    call parallel_setup(parallel, nx, ny, nz)
+    call set_field_dimensions(field0, nx, ny, nz, parallel)
+
+    ! The arrays for the temperature field contain also a halo region
+    allocate(field0%data(0:field0%inner_field%nx+1, 0:field0%inner_field%ny+1, 0:field0%inner_field%nz+1))
+
+    call hipCheck(hipMalloc(field0%inner_field%data, size(field0%data) * 8_c_size_t))
+
+    allocate(inner_data(field0%inner_field%nx, field0%inner_field%ny, field0%inner_field%nz))
+
+    if (parallel%rank == 0) then
+       allocate(full_data(nx, ny, nz))
+       ! Read the data
+       do i = 1, nx
+          read(10, *) full_data(i, 1:ny, 1)
+       end do
+    end if
+
+    call mpi_scatter(full_data, nx * field0%inner_field%ny, MPI_DOUBLE_PRECISION, inner_data, &
+         & nx * field0%inner_field%ny, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
+    ! Copy to the full array containing also the boundaries
+    field0%data(1:field0%inner_field%nx, 1:field0%inner_field%ny, 1:field0%inner_field%nz) = inner_data(:,:,:)
+    ! Copy the data array to the GPU array
+    call hipCheck(hipMemcpy(field0%inner_field%data, c_loc(field0%data), size(field0%data) * 8_c_size_t, hipMemcpyHostToDevice))
+
+    ! Set the boundary values (2D version kept for reference)
+    ! field0%data(1:field0%nx, 0) = field0%data(1:field0%nx, 1)
+    ! field0%data(1:field0%nx, field0%ny + 1) = field0%data(1:field0%nx, field0%ny)
+    ! field0%data(0, 0:field0%ny + 1) = field0%data(1, 0:field0%ny + 1)
+    ! field0%data(field0%nx + 1, 0:field0%ny + 1) = field0%data(field0%nx, 0:field0%ny + 1)
+
+    close(10)
+    deallocate(inner_data)
+    if (parallel%rank == 0) then
+       deallocate(full_data)
+    end if
+
+  end subroutine read_field
+
+end module io
diff --git a/3d/hipfort/kernel.cpp b/3d/hipfort/kernel.cpp
new file mode 100644
index 0000000..f76d2b4
--- /dev/null
+++ b/3d/hipfort/kernel.cpp
@@ -0,0 +1,123 @@
+#include <hip/hip_runtime.h>
+
+// Same data struct as declared on the Fortran side (field_c in heat_mod.F90).
+struct Field {
+  // nx, ny and nz are the true dimensions of the field. The temperature matrix
+  // also contains ghost layers, so it has dimensions (nx+2) x (ny+2) x (nz+2).
+  int nx;  // Local dimensions of the field
+  int ny;
+  int nz;
+  int nx_full;  // Global dimensions of the field
+  int ny_full;
+  int nz_full;
+  double dx = 0.01;  // Grid spacing
+  double dy = 0.01;
+  double dz = 0.01;
+  double* data;
+};
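// ---------------------------------------------------------------------------
// Illustration only (not part of the build): the struct above must stay
// layout-compatible with the bind(c) type field_c in heat_mod.F90. A
// standard-layout check plus an offsetof probe would catch accidental drift
// (a reordered or added member) at compile time. The expected offset assumes
// 4-byte int and 8-byte double.
// ---------------------------------------------------------------------------
#if 0
#include <cstddef>
#include <type_traits>
static_assert(std::is_standard_layout<Field>::value,
              "Field must mirror the bind(c) field_c type");
static_assert(offsetof(Field, data) == 6 * sizeof(int) + 3 * sizeof(double),
              "device pointer must follow the six ints and three doubles");
#endif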
+// Update the temperature values using a seven-point stencil
+__global__ void evolve_kernel(double *currdata, double *prevdata, double a, double dt, int nx, int ny, int nz,
+                              double inv_dx2, double inv_dy2, double inv_dz2, int max_threads)
+{
+  // Linearize the thread id to get coalesced accesses
+  int ythr_offs = threadIdx.y * blockDim.x + threadIdx.x;
+  int zthr_offs = threadIdx.z * blockDim.y * blockDim.x + ythr_offs;
+  int xblock_offs = blockIdx.x * blockDim.z * blockDim.y * blockDim.x + zthr_offs;
+  int yblock_offs = blockIdx.y * blockDim.z * blockDim.y * blockDim.x * gridDim.x + xblock_offs;
+  int tid = blockIdx.z * blockDim.z * blockDim.y * blockDim.x * gridDim.x * gridDim.y + yblock_offs;
+
+  // We only want to run meaningful computations, so the tid must be mapped to the real memory
+  // addresses; this means there are "holes" where the halo lives. To limit uncoalesced accesses
+  // we "mask" the small skip (the halo cells on the innermost dimension) inside the threads, but
+  // we avoid launching threads at all for the big skip (the halo rows crossed when moving to the
+  // next face of the grid).
+  int start_offs = (ny + 2) * (nx + 2) + (nx + 2); // 1 face + 1 line of halo, must not be computed
+  int big_skip_size = 2 * (nx + 2);                // two halo rows of length nx+2 are crossed at each face
+                                                   // boundary (equals the earlier (ny+2)+(nx+2) when nx == ny)
+  int num_big_skips = tid / ((nx + 2) * ny);       // one big skip per completed face of (nx+2)*ny threads
+  int ind = tid + start_offs + num_big_skips * big_skip_size;
+
+  // Keep loads coalesced (i.e. load also in threads that must not compute)
+  auto tmpdata = prevdata[ind];
+  auto ip = prevdata[ind + ((nx + 2) * (ny + 2))];
+  auto im = prevdata[ind - ((nx + 2) * (ny + 2))];
+  auto jp = prevdata[ind + (nx + 2)];
+  auto jm = prevdata[ind - (nx + 2)];
+  auto kp = prevdata[ind + 1];
+  auto km = prevdata[ind - 1];
+
+  // Here we really exclude the "small skips" from the computation; they are the first and the
+  // last entries of each row of the innermost dimension. E.g., if nx is 4, we must exclude
+  // tid 0,5,6,11,12,... because that is where the halo sits.
+  if (!((tid % (nx + 2)) == 0 || ((tid - (nx + 1)) % (nx + 2)) == 0) && tid < max_threads)
+
+  // These printfs can be useful to understand the mapping of tid to memory idx. To use them,
+  // reduce the total problem size to something manageable (e.g. 4,4,4) and initialize the grid
+  // values with recognizable numbers (i.e. 100*x+10*y+z); you then get one print per "active" index.
+  /*printf("thread is block %d,%d,%d, is thread %d,%d,%d, tid %d, ind is %d, and value %f \n",
+         (int)blockIdx.x, (int)blockIdx.y, (int)blockIdx.z,
+         (int)threadIdx.x, (int)threadIdx.y, (int)threadIdx.z,
+         tid, ind,
+         prevdata[ind]);*/
+  /*printf("ind is %f and my halo are %f %f %f %f %f %f \n",
+         prevdata[ind],
+         prevdata[ind+(nx+2)],          // y
+         prevdata[ind-(nx+2)],          // y
+         prevdata[ind+((nx+2)*(ny+2))], // x -- the x-z inversion is probably due to Fortran being column major
+         prevdata[ind-((nx+2)*(ny+2))], // x
+         prevdata[ind+1],               // z
+         prevdata[ind-1]                // z
+         );*/
+
+  tmpdata = tmpdata + a * dt * (
+      ( ip - 2.0 * tmpdata + im ) * inv_dx2 + // Fortran matrix, column major: the farthest stride is labeled x here
+      ( jp - 2.0 * tmpdata + jm ) * inv_dy2 +
+      ( kp - 2.0 * tmpdata + km ) * inv_dz2
+      );
+  // Keep the store coalesced as well; the tmpdata local exists only for this reason.
+  // Halo points load it and store it back without updating.
+  currdata[ind] = tmpdata;
+}
+
+// We tried to keep everything in C++ as an attempt to make the HIP API work; that failed,
+// but the structure is kept anyway because it allows templating of kernels and the like,
+// so it stays as a "good practice" thing.
+void evolvecpp(Field* curr, const Field* prev, const double a, const double dt)
+{
+  // Note that on the innermost dimension we use nx+2 to include the "small skips" in the set
+  // of running threads; ny and nz do not need to be increased.
+  int max_threads = (curr->nx + 2) * (curr->ny) * (curr->nz);
+  int blocks, grids;
+
+  // We would like to use the optimal occupancy, but the API did not work for us
+  // (unclear what happens there), so it stays disabled:
+  //hipOccupancyMaxPotentialBlockSize(&grids, &blocks, evolve_kernel, 0, max_threads);
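+  // Illustration only (kept disabled): the usual pattern for the occupancy API,
+  // in case it starts behaving. Note that it returns a suggested block size and
+  // the *minimum* grid size for full occupancy -- not a grid sized to cover
+  // max_threads elements -- so the grid still has to be recomputed afterwards;
+  // that mismatch may be what looked broken above.
+#if 0
+  int min_grids = 0, opt_blocks = 0;
+  hipOccupancyMaxPotentialBlockSize(&min_grids, &opt_blocks, evolve_kernel, 0, 0);
+  blocks = opt_blocks;
+  grids = (max_threads + blocks - 1) / blocks;
+#endif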
+  // Failing that, let's assign a block size of 256; this parameter can be played with.
+  blocks = 256;
+  grids = (max_threads + blocks - 1) / blocks;
+
+  auto inv_dx2 = 1.0 / (prev->dx * prev->dx);
+  auto inv_dy2 = 1.0 / (prev->dy * prev->dy);
+  auto inv_dz2 = 1.0 / (prev->dz * prev->dz);
+
+  hipLaunchKernelGGL((evolve_kernel), grids, blocks, 0, 0, curr->data, prev->data, a, dt,
+                     curr->nx, curr->ny, curr->nz, inv_dx2, inv_dy2, inv_dz2, max_threads);
+}
+
+extern "C"
+{
+  void evolve(Field* curr, const Field* prev, const double a, const double dt)
+  {
+    // Launch the wrapper
+    evolvecpp(curr, prev, a, dt);
+  }
+}
diff --git a/3d/hipfort/launcher.sh b/3d/hipfort/launcher.sh
new file mode 100644
index 0000000..778acde
--- /dev/null
+++ b/3d/hipfort/launcher.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+#SBATCH --partition=eap
+#SBATCH --account=project_462000007
+#SBATCH --time=02:10:10
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=8
+#SBATCH --cpus-per-task=1
+#SBATCH --gpus-per-node=8
+#SBATCH -e slurm.err
+#SBATCH -o slurm.out
+#SBATCH --exclusive
+
+source /users/vitaliem/prova_hip/heat-equation/3d/hipfort/modules
+make
+srun /users/vitaliem/prova_hip/heat_eq/3d/hipfort/heat_hipfort
diff --git a/3d/hipfort/main.F90 b/3d/hipfort/main.F90
new file mode 100644
index 0000000..10cdd21
--- /dev/null
+++ b/3d/hipfort/main.F90
@@ -0,0 +1,85 @@
+! Heat equation solver in 3D.
+
+program heat_solve
+  use heat
+  use core
+  use io
+  use setup
+  use utilities
+  use mpi
+  use iso_c_binding
+  use hipfort
+  use hipfort_check
+
+  implicit none
+
+  real(dp), parameter :: a = 0.5  ! Diffusion constant
+  type(field) :: current, previous  ! Current and previous temperature fields
+
+  real(dp) :: dt  ! Time step
+  integer :: nsteps  ! Number of time steps
+  integer, parameter :: image_interval = 1500  ! Image output interval
+
+  type(parallel_data) :: parallelization
+  integer :: ierr
+
+  integer :: iter
+
+  real(dp) :: average_temp  ! Average temperature
+
+  real(kind=dp) :: start_time, stop_time  ! Timers
+
+  call mpi_init(ierr)
+
+  call initialize(current, previous, nsteps, parallelization)
+
+  ! Draw the picture of the initial state
+  !call write_field(current, 0, parallelization)
+
+  average_temp = average(current, parallelization)
+  if (parallelization % rank == 0) then
+     write(*,*) 'Average temperature at start: ', average_temp
+  end if
+
+  ! Largest stable time step
+  dt = current%inner_field%dx**2 * current%inner_field%dy**2 * current%inner_field%dz**2 / &
+       & (2.0 * a * (current%inner_field%dx**2 + current%inner_field%dy**2 + current%inner_field%dz**2))
+
+  ! Main iteration loop, save a picture every
+  ! image_interval steps
+
+  start_time = mpi_wtime()
+
+  do iter = 1, nsteps
+
+     ! sendrecv is blocking, so there is no need for barriers and the like
+     call exchange(previous, parallelization)
+
+     call evolve(current%inner_field, previous%inner_field, a, dt)
+
+     ! Output not supported yet
+     !if (mod(iter, image_interval) == 0) then
+     !   call update_host(current)
+     !   call write_field(current, iter, parallelization)
+     !end if
+     call swap_fields(current, previous)
+  end do
+
+  stop_time = mpi_wtime()
+
+  ! Average temperature for reference
+  average_temp = average(previous, parallelization)
+
+  if (parallelization % rank == 0) then
+     write(*,'(A,F7.3,A)') 'Iteration took ', stop_time - start_time, ' seconds.'
+     write(*,'(A,F9.6)') 'Average temperature: ', average_temp
+     if (command_argument_count() == 0) then
+        write(*,'(A,F9.6)') 'Reference value with default arguments: ', 63.834223_dp
+     end if
+  end if
+
+  call finalize(current, previous)
+
+  call mpi_finalize(ierr)
+
+end program heat_solve
diff --git a/3d/hipfort/pngwriter_mod.F90 b/3d/hipfort/pngwriter_mod.F90
new file mode 100644
index 0000000..f6fe31b
--- /dev/null
+++ b/3d/hipfort/pngwriter_mod.F90
@@ -0,0 +1,41 @@
+! PNG writer for heat equation solver
+module pngwriter
+  use heat
+
+contains
+
+  function save_png(data, nx, ny, fname) result(stat)
+
+    use, intrinsic :: ISO_C_BINDING
+    implicit none
+
+    real(dp), dimension(:,:), intent(in) :: data
+    integer, intent(in) :: nx, ny
+    character(len=*), intent(in) :: fname
+    integer :: stat
+
+    ! Interface for the save_png C function
+    interface
+       ! The C function definition is
+       !   int save_png(double *data, const int nx, const int ny,
+       !                const char *fname)
+       function save_png_c(data, nx, ny, fname, order) &
+            & bind(C,name="save_png") result(stat)
+         use, intrinsic :: ISO_C_BINDING
+         implicit none
+         real(kind=C_DOUBLE) :: data(*)
+         integer(kind=C_INT), value, intent(IN) :: nx, ny
+         character(kind=C_CHAR), intent(IN) :: fname(*)
+         character(kind=C_CHAR), value, intent(IN) :: order
+         integer(kind=C_INT) :: stat
+       end function save_png_c
+    end interface
+
+    stat = save_png_c(data, nx, ny, trim(fname) // C_NULL_CHAR, 'f')
+    if (stat /= 0) then
+       write(*,*) 'save_png returned error!'
+    end if
+
+  end function save_png
+
+end module pngwriter
diff --git a/3d/hipfort/setup.F90 b/3d/hipfort/setup.F90
new file mode 100644
index 0000000..01511ea
--- /dev/null
+++ b/3d/hipfort/setup.F90
@@ -0,0 +1,195 @@
+! Setup routines for heat equation solver
+module setup
+  use heat
+  use iso_c_binding
+  use hipfort
+  use hipfort_check
+
+contains
+
+  subroutine initialize(previous, current, nsteps, parallel)
+    use utilities
+    use io
+
+    implicit none
+
+    type(field), intent(out) :: previous, current
+    integer, intent(out) :: nsteps
+    type(parallel_data), intent(out) :: parallel
+
+    integer :: height, width, length
+    logical :: using_input_file
+    character(len=85) :: input_file, arg  ! Input file name and command line arguments
+
+    ! Default values for grid size and time steps
+    height = 800
+    width = 800
+    length = 800
+    nsteps = 500
+    using_input_file = .false.
+
+    ! Read in the command line arguments and
+    ! set up the needed variables
+    select case(command_argument_count())
+    case(0) ! No arguments -> default values
+    case(1) ! One argument -> input file name
+       using_input_file = .true.
+       call get_command_argument(1, input_file)
+    case(2) ! Two arguments -> input file name and number of steps
+       using_input_file = .true.
+       call get_command_argument(1, input_file)
+       call get_command_argument(2, arg)
+       read(arg, *) nsteps
+    case(4) ! Four arguments -> height, width, length and nsteps
+       call get_command_argument(1, arg)
+       read(arg, *) height
+       call get_command_argument(2, arg)
+       read(arg, *) width
+       call get_command_argument(3, arg)
+       read(arg, *) length
+       call get_command_argument(4, arg)
+       read(arg, *) nsteps
+    case default
+       call usage()
+       stop
+    end select
+    ! Initialize the fields according to the command line arguments
+    if (using_input_file) then
+       call read_field(previous, input_file, parallel)
+       call copy_fields(previous, current)
+    else
+       call parallel_setup(parallel, height, width, length)
+       call set_field_dimensions(previous, height, width, length, parallel)
+       call set_field_dimensions(current, height, width, length, parallel)
+       call generate_field(previous, parallel)
+       call copy_fields(previous, current)
+    end if
+
+    if (parallel%rank == 0) then
+       write(*,'(A, I5, A, I5, A, I5, A, I5)') &
+            & 'Simulation parameters: height: ', height, ' width: ', width, &
+            & ' length: ', length, ' time steps: ', nsteps
+       write(*,'(A, I5)') 'Number of MPI tasks: ', parallel%size
+       write(*,'(A, I5)') 'Number of GPUs: ', parallel%dev_count
+    end if
+
+  end subroutine initialize
+
+  ! Generate the initial temperature field. The pattern is a disc with a radius
+  ! of about nx_full / 6 in the center of the grid.
+  ! Boundary conditions are (different) constant temperatures outside the grid.
+  subroutine generate_field(field0, parallel)
+    use heat
+
+    implicit none
+
+    type(field), intent(inout) :: field0
+    type(parallel_data), intent(in) :: parallel
+
+    real(dp) :: radius
+    integer :: i, j, k, ds2
+
+    ! The arrays for the field contain also a halo region
+    allocate(field0%data(0:field0%inner_field%nx+1, 0:field0%inner_field%ny+1, 0:field0%inner_field%nz+1))
+    ! Allocate the same amount of memory on the device
+    call hipCheck(hipMalloc(field0%inner_field%data, size(field0%data) * 8_c_size_t))
+
+    ! Radius of the disk
+    radius = (field0%inner_field%nx_full + field0%inner_field%ny_full + field0%inner_field%nz_full) / 18.0_dp
+
+    do k = 0, field0%inner_field%nz + 1
+       do j = 0, field0%inner_field%ny + 1
+          do i = 0, field0%inner_field%nx + 1
+             ! Squared distance from the center of the global grid
+             ds2 = int((i - field0%inner_field%nx_full / 2.0_dp + 1)**2 + &
+                  & (j - field0%inner_field%ny_full / 2.0_dp + 1)**2 + &
+                  & (k + parallel%rank * field0%inner_field%nz - field0%inner_field%nz_full / 2.0_dp + 1)**2)
+             if (ds2 < radius**2) then
+                field0%data(i,j,k) = 5.0_dp
+             else
+                field0%data(i,j,k) = 65.0_dp
+             end if
+             ! This "alternative initialization" can be used with a reduced grid size to
+             ! follow what is going on via the printfs in the kernel:
+             !if (parallel%rank == 0) then
+             !   field0%data(i,j,k) = i*100+j*10+k
+             !else
+             !   field0%data(i,j,k) = 1.0_dp
+             !endif
+          end do
+       end do
+    end do
+
+    ! Boundary conditions
+    if (parallel%rank == 0) then
+       field0%data(:,:,0) = 20.0_dp
+    end if
+    if (parallel%rank == parallel%size - 1) then
+       field0%data(:,:,field0%inner_field%nz+1) = 35.0_dp
+    end if
+
+    field0%data(0,:,:) = 20.0_dp
+    field0%data(field0%inner_field%nx+1,:,:) = 35.0_dp
+    field0%data(:,0,:) = 20.0_dp
+    field0%data(:,field0%inner_field%ny+1,:) = 35.0_dp
+
+    ! And this print gives a visual impression of the grid.
+    ! Never use it with the full grid!
+    !!if (parallel%rank == 0) then
+    !  do i = 0, field0%inner_field%nx+1
+    !     do j = 0, field0%inner_field%ny+1
+    !        do k = 0, field0%inner_field%nz+1
+    !           write (*,"(F15.10)", advance="no") field0%data(i,j,k)
+    !        enddo
+    !        write (*,*) parallel%rank, 'end of line'
+    !     enddo
+    !     write (*,*) parallel%rank, 'end of page'
+    !  enddo
+    !!endif
+    !write(*,*) '####### END OF INITIALIZATION OF THE MATRIX ######'
+
+    ! Copy the data array to the GPU array
+    call hipCheck(hipMemcpy(field0%inner_field%data, c_loc(field0%data), size(field0%data) * 8_c_size_t, hipMemcpyHostToDevice))
+
+  end subroutine generate_field
+
+  ! Clean up routine for field type
+  ! Arguments:
+  !   field0, field1 (type(field)): field variables to be cleared
+  subroutine finalize(field0, field1)
+    use heat
+    use hipfort
+
+    implicit none
+
+    type(field), intent(inout) :: field0, field1
+
+    ! Clean up the GPU memory
+    call hipCheck(hipFree(field0%inner_field%data))
+    call hipCheck(hipFree(field1%inner_field%data))
+
+    deallocate(field0%data)
+    deallocate(field1%data)
+
+  end subroutine finalize
+
+  ! Helper routine that prints out a simple usage message if the
+  ! user gives an unsupported number of arguments
+  subroutine usage()
+    implicit none
+    character(len=256) :: buf
+
+    call get_command_argument(0, buf)
+    write (*,'(A)') 'Usage:'
+    write (*,'(A, " (default values will be used)")') trim(buf)
+    write (*,'(A, " <input_file>")') trim(buf)
+    write (*,'(A, " <input_file> <nsteps>")') trim(buf)
+    write (*,'(A, " <height> <width> <length> <nsteps>")') trim(buf)
+  end subroutine usage
+
+end module setup
diff --git a/3d/hipfort/utilities.F90 b/3d/hipfort/utilities.F90
new file mode 100644
index 0000000..13bcf25
--- /dev/null
+++ b/3d/hipfort/utilities.F90
@@ -0,0 +1,119 @@
+! Utility routines for heat equation solver
+! NOTE: This file does not need to be edited!
+module utilities
+  use heat
+  use iso_c_binding
+  use hipfort
+  use hipfort_check
+
+contains
+
+  ! Swap the data fields of two variables of type field
+  ! Arguments:
+  !   curr, prev (type(field)): the two variables that are swapped
+  subroutine swap_fields(curr, prev)
+
+    implicit none
+
+    type(field), intent(inout) :: curr, prev
+    real(dp), pointer, dimension(:,:,:) :: tmp
+    type(c_ptr) :: tmp_cptr
+
+    ! Now that these are pointers, move_alloc is no longer needed
+    tmp => curr%data        !call move_alloc(curr%data, tmp)
+    curr%data => prev%data  !call move_alloc(prev%data, curr%data)
+    prev%data => tmp        !call move_alloc(tmp, prev%data)
+    ! The c_ptr to the device memory must be swapped as well
+    tmp_cptr = curr%inner_field%data
+    curr%inner_field%data = prev%inner_field%data
+    prev%inner_field%data = tmp_cptr
+    ! TODO: wouldn't it be better to hold the structs as pointers and just swap the structs?
+
+  end subroutine swap_fields
+
+  ! Copy the data from one field to another
+  ! Arguments:
+  !   from_field (type(field)): variable to copy from
+  !   to_field (type(field)): variable to copy to
+  ! FIXME: UPDATE FOR NEW DATA STRUCT
+  subroutine copy_fields(from_field, to_field)
+
+    implicit none
+
+    type(field), intent(in) :: from_field
+    type(field), intent(inout) :: to_field  ! inout so the existing pointers can be inspected below
+
+    ! Consistency checks
+    if (.not. associated(from_field%data) .or. .not. c_associated(from_field%inner_field%data)) then
+       write (*,*) "Can not copy from a field without allocated data"
+       stop
+    end if
+    if (.not. associated(to_field%data)) then
+       ! Target is not initialized; allocate memory
+       allocate(to_field%data(lbound(from_field%data, 1):ubound(from_field%data, 1), &
+            & lbound(from_field%data, 2):ubound(from_field%data, 2), &
+            & lbound(from_field%data, 3):ubound(from_field%data, 3)))
+    else if (any(shape(from_field%data) /= shape(to_field%data))) then
+       write (*,*) "Wrong field data sizes in copy routine"
+       print *, shape(from_field%data), shape(to_field%data)
+       stop
+    end if
+
+    if (.not. c_associated(to_field%inner_field%data)) then
+       ! https://github.com/ROCmSoftwarePlatform/hipfort/blob/master/README.md
+       call hipCheck(hipMalloc(to_field%inner_field%data, size(from_field%data) * 8_c_size_t))
+    end if
+
+    ! Valid even with pointers: this is an assignment of values, see
+    ! https://stackoverflow.com/questions/61127387/fortran-pointer-assignment-difference-between-and
+    to_field%data = from_field%data
+
+    call hipCheck(hipmemcpydtod(to_field%inner_field%data, from_field%inner_field%data, size(from_field%data) * 8_c_size_t))
+
+    to_field%inner_field%nx = from_field%inner_field%nx
+    to_field%inner_field%ny = from_field%inner_field%ny
+    to_field%inner_field%nz = from_field%inner_field%nz
+    to_field%inner_field%nx_full = from_field%inner_field%nx_full
+    to_field%inner_field%ny_full = from_field%inner_field%ny_full
+    to_field%inner_field%nz_full = from_field%inner_field%nz_full
+    to_field%inner_field%dx = from_field%inner_field%dx
+    to_field%inner_field%dy = from_field%inner_field%dy
+    to_field%inner_field%dz = from_field%inner_field%dz
+  end subroutine copy_fields
+
+  function average(field0, parallel)
+    use mpi
+
+    implicit none
+
+    real(dp) :: average
+    type(field) :: field0
+    type(parallel_data), intent(in) :: parallel
+
+    real(dp) :: local_average
+    integer :: rc
+    integer :: i, j, k
+
+    ! Update the host memory
+    call hipCheck(hipMemcpy(c_loc(field0%data), field0%inner_field%data, size(field0%data) * 8_c_size_t, hipMemcpyDeviceToHost))
+
+    ! Set of prints to be used with small grids and other initialization values to
+    ! peek under the hood of the matrix after the evolve kernel has run:
+    !!if (parallel%rank == 1) then
+    !  do i = 0, field0%inner_field%nx+1
+    !     do j = 0, field0%inner_field%ny+1
+    !        do k = 0, field0%inner_field%nz+1
+    !           write (*,"(F15.10)", advance="no") field0%data(i,j,k)
+    !        enddo
+    !        write (*,*) parallel%rank, 'end of line'
+    !     enddo
+    !     write (*,*) parallel%rank, 'end of page'
+    !  enddo
+    !!endif
+
+    local_average = sum(field0%data(1:field0%inner_field%nx, 1:field0%inner_field%ny, 1:field0%inner_field%nz))
+    call mpi_allreduce(local_average, average, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+         & MPI_COMM_WORLD, rc)
+    average = average / (field0%inner_field%nx_full * field0%inner_field%ny_full * field0%inner_field%nz_full)
+
+  end function average
+
+end module utilities