diff --git a/3d/hip/core_hip.cpp b/3d/hip/core_hip.cpp
index 90461cb..9109067 100644
--- a/3d/hip/core_hip.cpp
+++ b/3d/hip/core_hip.cpp
@@ -179,9 +179,99 @@ __global__ void evolve_kernel(double *currdata, double *prevdata, double a, doub
     }
 }
 
-void evolve(Field& curr, Field& prev, const double a, const double dt)
+// Update the temperature values using a seven-point stencil
+__global__ void evolve_kernel2(double *currdata, double *prevdata, double a, double dt, int nx, int ny, int nz,
+                               double inv_dx2, double inv_dy2, double inv_dz2, int max_threads)
 {
+  // Linearize the thread id to get coalesced accesses
+  int ythr_offs = threadIdx.y * blockDim.x + threadIdx.x;
+  int zthr_offs = threadIdx.z * blockDim.y * blockDim.x + ythr_offs;
+  int xblock_offs = blockIdx.x * blockDim.z * blockDim.y * blockDim.x + zthr_offs;
+  int yblock_offs = blockIdx.y * blockDim.z * blockDim.y * blockDim.x * gridDim.x + xblock_offs;
+  int tid = blockIdx.z * blockDim.z * blockDim.y * blockDim.x * gridDim.x * gridDim.y + yblock_offs;
+
+  // We only want to run meaningful computations, so the tid must be mapped to the real memory
+  // addresses; this means there are "holes" where the halo lives. To limit uncoalesced accesses
+  // we "mask" the small skip (the two halo cells on each row of the innermost dimension) inside
+  // the threads, but we avoid launching threads at all for the big skip (the halo rows crossed
+  // when moving to the next face of the grid).
+  int start_offs = (ny + 2) * (nx + 2) + (nx + 2); // 1 face + 1 line of halo, must not be computed
+  int big_skip_size = 2 * (nx + 2);                // two halo rows of length nx+2 are crossed at each face
+                                                   // boundary (equals the earlier (ny+2)+(nx+2) when nx == ny)
+  int num_big_skips = tid / ((nx + 2) * ny);       // one big skip per completed face of (nx+2)*ny threads
+  int ind = tid + start_offs + num_big_skips * big_skip_size;
+
+  // Keep loads coalesced (i.e. load also in threads that must not compute and write).
+  // Note: threads with tid >= max_threads load too; on small grids the rounded-up launch
+  // can read past the end of the buffer.
+  auto tmpdata = prevdata[ind];
+  auto ip = prevdata[ind + ((nx + 2) * (ny + 2))];
+  auto im = prevdata[ind - ((nx + 2) * (ny + 2))];
+  auto jp = prevdata[ind + (nx + 2)];
+  auto jm = prevdata[ind - (nx + 2)];
+  auto kp = prevdata[ind + 1];
+  auto km = prevdata[ind - 1];
+
+  // Here we really exclude the "small skips" from the computation; they are the first and the
+  // last entries of each row of the innermost dimension. E.g., if nx is 4, we must exclude
+  // tid 0,5,6,11,12,... because that is where the halo sits.
+  if (!((tid % (nx + 2)) == 0 || ((tid - (nx + 1)) % (nx + 2)) == 0) && tid < max_threads)
+
+  // These printfs can be useful to understand the mapping of tid to memory idx. To use them,
+  // reduce the total problem size to something manageable (e.g. 4,4,4) and initialize the grid
+  // values with recognizable numbers (i.e. 100*x+10*y+z); you then get one print per "active" index.
+  /*printf("thread is block %d,%d,%d, is thread %d,%d,%d, tid %d, ind is %d, and value %f \n",
+         (int)blockIdx.x, (int)blockIdx.y, (int)blockIdx.z,
+         (int)threadIdx.x, (int)threadIdx.y, (int)threadIdx.z,
+         tid, ind,
+         prevdata[ind]);*/
+  /*printf("ind is %f and my halo are %f %f %f %f %f %f \n",
+         prevdata[ind],
+         prevdata[ind+(nx+2)],          // y
+         prevdata[ind-(nx+2)],          // y
+         prevdata[ind+((nx+2)*(ny+2))], // x -- the x-z inversion is probably due to Fortran being column major
+         prevdata[ind-((nx+2)*(ny+2))], // x
+         prevdata[ind+1],               // z
+         prevdata[ind-1]                // z
+         );*/
+
+  tmpdata = tmpdata + a * dt * (
+      ( ip - 2.0 * tmpdata + im ) * inv_dz2 + // C++ matrix, row major: the "farthest" stride is the z dimension
+      ( jp - 2.0 * tmpdata + jm ) * inv_dy2 +
+      ( kp - 2.0 * tmpdata + km ) * inv_dx2
+      );
+  // Keep the store coalesced as well; the tmpdata local exists only for this reason.
+  // Halo points load it and store it back without updating.
+  currdata[ind] = tmpdata;
+}
+
+void evolve(Field& curr, Field& prev, const double a, const double dt)
+{
+  // Note that on the innermost dimension we use nx+2 to include the "small skips" in the set
+  // of running threads; ny and nz do not need to be increased.
+  int max_threads = (curr.nx + 2) * (curr.ny) * (curr.nz);
+  int blocks, grids;
+
+  // We would like to use the optimal occupancy, but the API did not work for us
+  // (unclear what happens there), so it stays disabled:
+  //hipOccupancyMaxPotentialBlockSize(&grids, &blocks, evolve_kernel, 0, max_threads);
+
+  // Let's assign a block size of 256; this parameter can be played with.
+  blocks = 256;
+  grids = (max_threads + blocks - 1) / blocks;
+
+  auto inv_dx2 = 1.0 / (prev.dx * prev.dx);
+  auto inv_dy2 = 1.0 / (prev.dy * prev.dy);
+  auto inv_dz2 = 1.0 / (prev.dz * prev.dz);
+
+  hipLaunchKernelGGL((evolve_kernel2), grids, blocks, 0, 0, curr.devdata(), prev.devdata(), a, dt,
+                     curr.nx, curr.ny, curr.nz, inv_dx2, inv_dy2, inv_dz2, max_threads);
+
+  // The old evolve kernel follows, commented out; swap the comments to use it.
+/*
   int nx = curr.nx;
   int ny = curr.ny;
   int nz = curr.nz;
@@ -225,7 +315,7 @@ void evolve(Field& curr, Field& prev, const double a, const double dt)
                  inv_dx2, inv_dy2, inv_dz2);
   hipDeviceSynchronize();
   CHECK_ERROR_MSG("evolve");
-
+*/
 }
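// ---------------------------------------------------------------------------
// Illustration only (not part of the build): a minimal host-side sketch that
// replays the tid -> ind arithmetic of evolve_kernel2 for a tiny grid, so the
// halo masking and the big skips can be checked on the CPU instead of
// printf-debugging the kernel. The grid size and the standalone main() are
// assumptions made for the example.
// ---------------------------------------------------------------------------
#if 0
#include <cstdio>

int main()
{
  const int nx = 4, ny = 4, nz = 4;
  const int max_threads = (nx + 2) * ny * nz;
  const int start_offs = (ny + 2) * (nx + 2) + (nx + 2);
  const int big_skip_size = 2 * (nx + 2);
  for (int tid = 0; tid < max_threads; ++tid) {
    const int num_big_skips = tid / ((nx + 2) * ny);
    const int ind = tid + start_offs + num_big_skips * big_skip_size;
    const bool halo = (tid % (nx + 2)) == 0 || ((tid - (nx + 1)) % (nx + 2)) == 0;
    // decode ind back into (x, y, z) coordinates of the (nx+2)x(ny+2)x(nz+2) box;
    // non-masked tids should land on x in [1,nx], y in [1,ny], z in [1,nz]
    const int z = ind / ((nx + 2) * (ny + 2));
    const int rem = ind % ((nx + 2) * (ny + 2));
    const int y = rem / (nx + 2), x = rem % (nx + 2);
    printf("tid %3d -> ind %3d (x=%d y=%d z=%d)%s\n",
           tid, ind, x, y, z, halo ? " [halo, masked]" : "");
  }
  return 0;
}
#endif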
 
 void evolve_interior(Field& curr, Field& prev, const double a, const double dt, hipStream_t *streams)
diff --git a/3d/hipfort/Makefile b/3d/hipfort/Makefile
new file mode 100644
index 0000000..6af2610
--- /dev/null
+++ b/3d/hipfort/Makefile
@@ -0,0 +1,45 @@
+ifeq ($(COMP),)
+COMP=cray
+endif
+
+export HIPFORT_HOME=./hipfort/hipfortbin
+include Makefile.hipfort
+
+EXE=heat_hipfort
+OBJS=main.o kernel.o heat_mod.o core.o utilities.o io.o setup.o
+
+all: $(EXE)
+
+hipfort:
+	@git clone https://github.com/ROCmSoftwarePlatform/hipfort.git
+
+./hipfort/hipfortbin/lib/libhipfort-amdgcn.a: hipfort
+	@echo "Building and installing hipfort"
+	@bash install_hipfort.sh $(COMP)
+
+kernel.o: kernel.cpp ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
+heat_mod.o: heat_mod.F90 ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
+core.o: core.F90 heat_mod.o ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
+utilities.o: utilities.F90 heat_mod.o ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
+io.o: io.F90 heat_mod.o ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
+setup.o: setup.F90 heat_mod.o utilities.o io.o ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
+main.o: main.F90 heat_mod.o core.o io.o setup.o utilities.o ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
+
+$(EXE): $(OBJS) hipfort/hipfortbin/lib/libhipfort-amdgcn.a
+	$(FC) $(OBJS) -o $@ $(LDFLAGS) $(LIBS) $(LINKOPTS)
+
+%.o: %.cpp
+	$(CXX) -c $< -o $@
+
+%.o: %.F90
+	$(FC) -DGPU_MPI -c $< -o $@
+
+.PHONY: clean
+clean:
+	-/bin/rm -f $(EXE) a.out *.o *.mod *.png *~
diff --git a/3d/hipfort/Makefile.hipfort b/3d/hipfort/Makefile.hipfort
new file mode 100644
index 0000000..5756d81
--- /dev/null
+++ b/3d/hipfort/Makefile.hipfort
@@ -0,0 +1,51 @@
+#
+# Makefile.hipfort: Include Makefile that sets LINKOPTS, CXX, and FC
+# to support compilation with HIPFORT.
+#
+# This file is meant to be included by other Makefiles. The includer must
+# set HIPFORT_HOME to the build or install location of HIPFORT. For example:
+#
+#   HIPFORT_HOME ?= /opt/rocm/hipfort
+#   include $(HIPFORT_HOME)/bin/Makefile.hipfort
+#
+# If the caller does not set HIPFORT_ARCHGPU, this Makefile will call
+# a self-identification utility called myarchgpu. Please create an issue
+# in the hipfort repo if myarchgpu returns unknown.
+#
+# To avoid self-identification, set HIPFORT_ARCHGPU. Examples:
+#   HIPFORT_ARCHGPU = amdgcn-gfx906
+#   HIPFORT_ARCHGPU = nvptx-sm_70
+#
+
+export ROCM_PATH=/opt/rocm
+HIPFORT_ARCHGPU ?= $(shell $(HIPFORT_HOME)/libexec/hipfort/myarchgpu)
+ARCH = $(firstword $(subst -, ,$(HIPFORT_ARCHGPU)))
+#HIPFORT_COMPILER ?= /opt/cray/pe/gcc/11.2.0/bin/gfortran
+#HIPFORT_COMPILER ?= /opt/cray/pe/cce/14.0.2/bin/crayftn
+HIPFORT_COMPILER ?= ftn
+#CUDA_PATH=${CUDA_PATH:-/usr/local/cuda}
+#ROCM_PATH=${ROCM_PATH:-/opt/rocm}
+DEVICE_LIB_PATH ?= $(ROCM_PATH)/lib
+HIP_CLANG_PATH ?= $(ROCM_PATH)/llvm/bin
+HIP_PLATFORM=${HIP_PLATFORM:-amd}
+
+MOD_DIR = $(HIPFORT_HOME)/include/hipfort/$(ARCH)
+LIB_DIR = $(HIPFORT_HOME)/lib
+
+GPU = $(strip $(subst ., ,$(suffix $(subst -,.,$(HIPFORT_ARCHGPU)))))
+ifeq (nvptx,$(findstring nvptx,$(HIPFORT_ARCHGPU)))
+  UNAMEP = $(shell uname -p)
+  LINKOPTS = -L$(LIB_DIR) -lhipfort-$(ARCH) -L$(CUDA_PATH)/targets/$(UNAMEP)-linux/lib -lcudart
+  HIPCC_ENV = HIP_PLATFORM=nvcc
+  HIPCC_OPTS = "--gpu-architecture=$(GPU)"
+else
+  LINKOPTS = -L$(LIB_DIR) -lhipfort-$(ARCH) -L$(ROCM_PATH)/lib $(HIPCC_LIBS) -lamdhip64 -Wl,-rpath=$(ROCM_PATH)/lib
+  HIPCC_ENV = HIP_PLATFORM=$(HIP_PLATFORM) DEVICE_LIB_PATH=$(DEVICE_LIB_PATH) HIP_CLANG_PATH=$(HIP_CLANG_PATH)
+  HIPCC_OPTS = -fno-gpu-rdc -fPIC --offload-arch=$(GPU)
+endif
+
+LINKOPTS += -lstdc++
+CXX = $(ROCM_PATH)/bin/hipcc $(HIPCC_OPTS)
+FC_OPTS = "-DHIPFORT_ARCH=\"$(ARCH)\""
+FC = $(HIPFORT_COMPILER) -Rb -I$(MOD_DIR) $(FC_OPTS)
diff --git a/3d/hipfort/core.F90 b/3d/hipfort/core.F90
new file mode 100644
index 0000000..5f1bb59
--- /dev/null
+++ b/3d/hipfort/core.F90
@@ -0,0 +1,97 @@
+! Main solver routines for heat equation solver
+module core
+  use heat
+  use iso_c_binding
+  use hipfort
+  use hipfort_check
+
+  ! Bind the evolve function to the C wrapper that contains the cpp/hip kernel.
+  ! Note that this works with the field_c struct, which is declared on both the C
+  ! and the Fortran side and contains the details of the field plus a c_ptr that
+  ! points to the DEVICE memory.
+  ! Kernel details:
+  !   Compute one time step of temperature evolution
+  !   Arguments:
+  !     curr (type(field_c)): current temperature values
+  !     prev (type(field_c)): values from previous time step
+  !     a (real(c_double)): update equation constant
+  !     dt (real(c_double)): time step value
+  interface
+     subroutine evolve(curr, prev, a, dt) bind(c, name="evolve")
+       use iso_c_binding
+       import field_c
+       implicit none
+
+       type(field_c), target, intent(inout) :: curr, prev
+       real(c_double), value :: a, dt
+
+     end subroutine evolve
+  end interface
+
+contains
+
+  ! Exchange the boundary data between MPI tasks
+  subroutine exchange(field0, parallel)
+    use mpi
+
+    implicit none
+
+    type(field), target, intent(inout) :: field0
+    type(parallel_data), intent(in) :: parallel
+
+    !! Could be a Cray issue, has to be checked with other compilers:
+    !! THIS MUST BE ALLOCATABLE, NOT POINTER, FOR MULTIDIMENSIONAL THINGS
+    real(c_double), dimension(:,:,:), allocatable, target :: data
+
+    integer :: buf_size
+    integer :: ierr
+
+    call c_f_pointer(field0%inner_field%data, data, shape(field0%data))
+    !write(*,*) '#@#@#@#@ data pointer shape is ', shape(data), 'old ptr shape is ', shape(field0%data)
+    buf_size = (field0%inner_field%nx + 2) * (field0%inner_field%ny + 2)
+
+    !#ifdef GPU_MPI
+    ! For now assume the transfers are always device-to-device
+    ! TODO: manage the data transfer via host
+
+    ! Send to left, receive from right. Note that the pointer from c_f_pointer
+    ! is 1-based, so plane 2 is the first interior plane (plane 1 is the halo),
+    ! as the debug prints below confirm.
+    call mpi_sendrecv(data(:, :, 2), buf_size, MPI_DOUBLE_PRECISION, &
+         & parallel%nleft, 11, &
+         & data(:, :, field0%inner_field%nz+2), buf_size, MPI_DOUBLE_PRECISION, &
+         & parallel%nright, 11, &
+         & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+
+    ! Send to right, receive from left
+    call mpi_sendrecv(data(:, :, field0%inner_field%nz+1), buf_size, MPI_DOUBLE_PRECISION, &
+         & parallel%nright, 12, &
+         & data(:, :, 1), buf_size, MPI_DOUBLE_PRECISION, &
+         & parallel%nleft, 12, &
+         & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+
+    ! Debug prints, kept because they were essential to get the addressing right:
+    !write(15,*) '########mpi stuff here, data pointer is at ', c_loc(data(1,1,1)), 'doublecheck ', c_loc(data), 'triplecheck', field0%inner_field%data
+    !write(15,*) '########mpi stuff here, second element is at ', c_loc(data(1,1,2)), 'or ', c_loc(data(2,1,1))
+    !write(15,*) 'size of data is', size(data), 'address of last thing is ', c_loc(data(6,6,4)), 'and it contains ', data(6,6,4)
+    !write(15,*) 'first element over data is ', c_loc(data(7,6,4)), 'that contains ', data(7,6,4), ' or ', c_loc(data(6,6,5)), ' that contains ', data(6,6,5) ! want to be sure they are out of bounds
+    !write(15,*) 'first sendrecv data (left send, right recv), send ', c_loc(data(:,:,2)), ' recv ', c_loc(data(:, :, field0%inner_field%nz + 2))
+    !write(15,*) 'second sendrecv data, send ', c_loc(data(:, :, field0%inner_field%nz+1)), ' recv ', c_loc(data(:, :, 1))
+    !write(15,*) 'field host data is at ', c_loc(field0%data), ' and device data is at ', field0%inner_field%data, ' data pointer is at ', c_loc(data)
+    !write(15,*) 'buf size is ', buf_size
+    !write(15,*) 'shape of data is ', shape(data)
+    !call flush (15)
+    !write(15,*) 'buf_size', buf_size
+    !call flush (15)
+    !write(15,*) 'precision', MPI_DOUBLE_PRECISION
+    !call flush (15)
+    !write(15,*) 'right proc', parallel%nright
+    !call flush (15)
+    !write(15,*) 'pointer', data(:, :, 1)
+    !call flush (15)
+    !write(15,*) 'left', parallel%nleft
+    !call flush (15)
+    !write(15,*) 'second done'
+
+  end subroutine exchange
+
+end module core
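// ---------------------------------------------------------------------------
// Illustration only (not part of the diff): a plain C view of what exchange()
// above does once the device pointer is recovered. With a GPU-aware MPI the
// raw pointer from field_c's data member can be handed to MPI_Sendrecv
// directly; plane p of the (nx+2) x (ny+2) x (nz+2) box starts at offset
// p*(nx+2)*(ny+2), with planes 1..nz interior and planes 0 and nz+1 the halos.
// The function name and argument list are assumptions made for the sketch.
// ---------------------------------------------------------------------------
#include <mpi.h>

static void exchange_c(double *devdata, int nx, int ny, int nz,
                       int nleft, int nright)
{
  const int plane = (nx + 2) * (ny + 2); // doubles per z-plane

  // send first interior plane to the left, receive into the right halo plane
  MPI_Sendrecv(devdata + 1L * plane, plane, MPI_DOUBLE, nleft, 11,
               devdata + (nz + 1L) * plane, plane, MPI_DOUBLE, nright, 11,
               MPI_COMM_WORLD, MPI_STATUS_IGNORE);

  // send last interior plane to the right, receive into the left halo plane
  MPI_Sendrecv(devdata + (long)nz * plane, plane, MPI_DOUBLE, nright, 12,
               devdata, plane, MPI_DOUBLE, nleft, 12,
               MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}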
diff --git a/3d/hipfort/heat_mod.F90 b/3d/hipfort/heat_mod.F90
new file mode 100644
index 0000000..c975f7d
--- /dev/null
+++ b/3d/hipfort/heat_mod.F90
@@ -0,0 +1,138 @@
+! Field metadata for heat equation solver
+module heat
+  use iso_fortran_env, only : REAL64
+  use iso_c_binding
+  use hipfort
+  use hipfort_check
+
+  implicit none
+
+  integer, parameter :: dp = REAL64
+  real(dp), parameter :: DX = 0.01, DY = 0.01, DZ = 0.01  ! Fixed grid spacing
+
+  ! Data type structures. We want one struct containing all the details usable from
+  ! the GPU side (i.e. the C side of the program, device memory), and a wrapper of
+  ! that struct containing the Fortran declaration of the memory (i.e. the host
+  ! memory). The device memory must be declared as a c_ptr and is allocated and
+  ! managed manually with hipfort calls. The host memory has to be a pointer
+  ! (allocatable,target was rejected; the reason is unclear and the actual compiler
+  ! error was not recorded).
+  type, bind(c) :: field_c
+     integer (c_int) :: nx  ! local dimensions of the field
+     integer (c_int) :: ny
+     integer (c_int) :: nz
+     integer (c_int) :: nx_full  ! global dimensions of the field
+     integer (c_int) :: ny_full
+     integer (c_int) :: nz_full
+     real(c_double) :: dx
+     real(c_double) :: dy
+     real(c_double) :: dz
+     type(c_ptr) :: data = c_null_ptr
+  end type field_c
+
+  type :: field
+     type(field_c) :: inner_field
+     real(c_double), dimension(:,:,:), pointer :: data
+  end type field
+
+  type :: parallel_data
+     integer :: size
+     integer :: rank
+     integer :: nleft, nright  ! Ranks of neighbouring MPI tasks
+     integer (c_int) :: dev_count  ! c_int because it is passed to the hipfort functions
+  end type parallel_data
+
+contains
+  ! Initialize the field type metadata
+  ! Arguments:
+  !   field0 (type(field)): input field
+  !   nx, ny, nz: field dimensions
+  subroutine set_field_dimensions(field0, nx, ny, nz, parallel)
+    implicit none
+
+    type(field), intent(out) :: field0
+    integer, intent(in) :: nx, ny, nz
+    type(parallel_data), intent(in) :: parallel
+
+    integer :: nx_local, ny_local, nz_local
+
+    nx_local = nx
+    ny_local = ny
+    nz_local = nz / parallel%size
+
+    field0%inner_field%dx = DX
+    field0%inner_field%dy = DY
+    field0%inner_field%dz = DZ
+    field0%inner_field%nx = nx_local
+    field0%inner_field%ny = ny_local
+    field0%inner_field%nz = nz_local
+    field0%inner_field%nx_full = nx
+    field0%inner_field%ny_full = ny
+    field0%inner_field%nz_full = nz
+
+  end subroutine set_field_dimensions
+
+  subroutine parallel_setup(parallel, nx, ny, nz)
+    use mpi
+
+    implicit none
+
+    type(parallel_data), intent(out) :: parallel
+    integer, intent(in), optional :: nx, ny, nz
+
+    integer :: nz_local
+    integer :: ierr
+
+    integer :: node_rank, node_procs, my_device
+    integer :: intranodecomm
+
+    call mpi_comm_size(MPI_COMM_WORLD, parallel%size, ierr)
+
+    if (present(nz)) then
+       nz_local = nz / parallel%size
+       if (nz_local * parallel%size /= nz) then
+          write(*,*) 'Cannot divide grid evenly to processors'
+          call mpi_abort(MPI_COMM_WORLD, -2, ierr)
+       end if
+    end if
+
+    call mpi_comm_rank(MPI_COMM_WORLD, parallel%rank, ierr)
+
+    parallel%nleft = parallel%rank - 1
+    parallel%nright = parallel%rank + 1
+
+    if (parallel%nleft < 0) then
+       parallel%nleft = MPI_PROC_NULL
+    end if
+    if (parallel%nright > parallel%size - 1) then
+       parallel%nright = MPI_PROC_NULL
+    end if
+
+    parallel%dev_count = 0
+
+    ! Pick the GPU by node-local rank
+    call mpi_comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, intranodecomm, ierr)
+
+    call mpi_comm_rank(intranodecomm, node_rank, ierr)
+    call mpi_comm_size(intranodecomm, node_procs, ierr)
+
+    call mpi_comm_free(intranodecomm, ierr)
+
+    call hipCheck(hipGetDeviceCount(parallel%dev_count))
+    my_device = mod(node_rank, parallel%dev_count)
+
+    if (node_procs > parallel%dev_count) then
+       if (parallel%rank == 0) then
+          write(*, '(A,I4,A,I4)') 'Oversubscribing GPUs: MPI tasks per node: ', &
+               & node_procs, ' GPUs per node: ', parallel%dev_count
+       end if
+    end if
+
+    call hipCheck(hipSetDevice(my_device))
+
+  end subroutine parallel_setup
+
+end module heat
diff --git a/3d/hipfort/install_hipfort.sh b/3d/hipfort/install_hipfort.sh
new file mode 100644
index 0000000..6ca3802
--- /dev/null
+++ b/3d/hipfort/install_hipfort.sh
@@ -0,0 +1,12 @@
+cd hipfort
+mkdir build
+cd build
+echo "compiler is $1"
+if [ "$1" = "cray" ]; then
+cmake .. -DHIPFORT_COMPILER=ftn -DHIPFORT_COMPILER_FLAGS="-ffree -e Z" -DCMAKE_INSTALL_PREFIX=../hipfortbin
+elif [ "$1" = "gnu" ]; then
+cmake .. -DHIPFORT_COMPILER=gfortran -DCMAKE_Fortran_FLAGS=" -ffree-form -cpp -ffree-line-length-none -fmax-errors=5 -std=f2008 -fno-underscoring" -DCMAKE_INSTALL_PREFIX=../hipfortbin
+fi
+
+make
+make install
diff --git a/3d/hipfort/io.F90 b/3d/hipfort/io.F90
new file mode 100644
index 0000000..7b26f76
--- /dev/null
+++ b/3d/hipfort/io.F90
@@ -0,0 +1,126 @@
+! I/O routines for heat equation solver
+module io
+  use heat
+  use mpi
+  use iso_c_binding
+  use hipfort
+  use hipfort_check
+
+contains
+
+  ! Output routine, saves the temperature distribution as a png image
+  ! Arguments:
+  !   curr (type(field)): variable with the temperature data
+  !   iter (integer): index of the time step
+  ! Not yet supported; the old code is left commented out. The only required
+  ! change SHOULD be to add the memcpy from device to host.
+  subroutine write_field(curr, iter, parallel)
+
+!#ifndef DISABLE_PNG
+!    use pngwriter
+!#endif
+!    implicit none
+!    type(field), intent(in) :: curr
+!    integer, intent(in) :: iter
+!    type(parallel_data), intent(in) :: parallel
+!
+!    character(len=85) :: filename
+!
+!    integer :: stat
+!    real(dp), dimension(:,:,:), allocatable, target :: full_data
+!    integer :: p, ierr
+!
+!    if (parallel%rank == 0) then
+!       allocate(full_data(curr%nx_full, curr%ny_full, curr%nz_full))
+!       ! Copy rank #0 data to the global array
+!       full_data(1:curr%nx, 1:curr%ny, 1:curr%nz) = curr%data(1:curr&
+!            &%nx, 1:curr%ny, 1:curr%nz)
+!
+!       ! Receive data from other ranks
+!       do p = 1, parallel%size - 1
+!          call mpi_recv(full_data(1:curr%nx, 1:curr%ny, p*curr%nz + 1:(p + 1) * curr%nz), &
+!               & curr%nx * curr%ny * curr%nz, MPI_DOUBLE_PRECISION, p, 22, &
+!               & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
+!       end do
+!       write(filename,'(A5,I4.4,A4,A)') 'heat_', iter, '.png'
+!#ifdef DISABLE_PNG
+!       if (parallel%rank == 0) then
+!          write(*,*) "No libpng, file not written"
+!       end if
+!#else
+!       stat = save_png(full_data(:, :, curr%nz_full / 2), curr%nx_full, curr%ny_full, filename)
+!#endif
+!       deallocate(full_data)
+!    else
+!       ! Send data
+!       call mpi_send(curr%data(1:curr%nx, 1:curr%ny, 1:curr%nz), curr%nx * curr%ny * curr%nz, MPI_DOUBLE_PRECISION, 0, 22, &
+!            & MPI_COMM_WORLD, ierr)
+!    end if
+
+  end subroutine write_field
+
+  ! Reads the temperature distribution from an input file
+  ! Arguments:
+  !   field0 (type(field)): field variable that will store the read data
+  !   filename (char): name of the input file
+  ! Note that this version assumes the input data to be in C memory layout.
+  ! Untested routine, taken from openacc/fortran; only the hipfort calls were added.
+  subroutine read_field(field0, filename, parallel)
+
+    implicit none
+    type(field), intent(out) :: field0
+    character(len=85), intent(in) :: filename
+    type(parallel_data), intent(out) :: parallel
+
+    integer :: nx, ny, nz, i, ierr
+    character(len=2) :: dummy
+
+    real(dp), dimension(:,:,:), allocatable :: full_data, inner_data
+
+    open(10, file=filename)
+    ! Read the header
+    read(10, *) dummy, nx, ny, nz
+
+    call parallel_setup(parallel, nx, ny, nz)
+    call set_field_dimensions(field0, nx, ny, nz, parallel)
+
+    ! The arrays for the temperature field contain also a halo region
+    allocate(field0%data(0:field0%inner_field%nx+1, 0:field0%inner_field%ny+1, 0:field0%inner_field%nz+1))
+
+    call hipCheck(hipMalloc(field0%inner_field%data, size(field0%data) * 8_c_size_t))
+
+    allocate(inner_data(field0%inner_field%nx, field0%inner_field%ny, field0%inner_field%nz))
+
+    if (parallel%rank == 0) then
+       allocate(full_data(nx, ny, nz))
+       ! Read the data
+       do i = 1, nx
+          read(10, *) full_data(i, 1:ny, 1)
+       end do
+    end if
+
+    call mpi_scatter(full_data, nx * field0%inner_field%ny, MPI_DOUBLE_PRECISION, inner_data, &
+         & nx * field0%inner_field%ny, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
+    ! Copy to the full array containing also the boundaries
+    field0%data(1:field0%inner_field%nx, 1:field0%inner_field%ny, 1:field0%inner_field%nz) = inner_data(:,:,:)
+    ! Copy the data array to the GPU array
+    call hipCheck(hipMemcpy(field0%inner_field%data, c_loc(field0%data), size(field0%data) * 8_c_size_t, hipMemcpyHostToDevice))
+
+    ! Set the boundary values (2D version kept for reference)
+    ! field0%data(1:field0%nx, 0) = field0%data(1:field0%nx, 1)
+    ! field0%data(1:field0%nx, field0%ny + 1) = field0%data(1:field0%nx, field0%ny)
+    ! field0%data(0, 0:field0%ny + 1) = field0%data(1, 0:field0%ny + 1)
+    ! field0%data(field0%nx + 1, 0:field0%ny + 1) = field0%data(field0%nx, 0:field0%ny + 1)
+
+    close(10)
+    deallocate(inner_data)
+    if (parallel%rank == 0) then
+       deallocate(full_data)
+    end if
+
+  end subroutine read_field
+
+end module io
diff --git a/3d/hipfort/kernel.cpp b/3d/hipfort/kernel.cpp
new file mode 100644
index 0000000..f76d2b4
--- /dev/null
+++ b/3d/hipfort/kernel.cpp
@@ -0,0 +1,123 @@
+#include <hip/hip_runtime.h>
+
+// Same data struct as declared on the Fortran side (field_c in heat_mod.F90).
+struct Field {
+  // nx, ny and nz are the true dimensions of the field. The temperature matrix
+  // also contains ghost layers, so it has dimensions (nx+2) x (ny+2) x (nz+2).
+  int nx;  // Local dimensions of the field
+  int ny;
+  int nz;
+  int nx_full;  // Global dimensions of the field
+  int ny_full;
+  int nz_full;
+  double dx = 0.01;  // Grid spacing
+  double dy = 0.01;
+  double dz = 0.01;
+  double* data;
+};
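// ---------------------------------------------------------------------------
// Illustration only (not part of the build): the struct above must stay
// layout-compatible with the bind(c) type field_c in heat_mod.F90. A
// standard-layout check plus an offsetof probe would catch accidental drift
// (a reordered or added member) at compile time. The expected offset assumes
// 4-byte int and 8-byte double.
// ---------------------------------------------------------------------------
#if 0
#include <cstddef>
#include <type_traits>
static_assert(std::is_standard_layout<Field>::value,
              "Field must mirror the bind(c) field_c type");
static_assert(offsetof(Field, data) == 6 * sizeof(int) + 3 * sizeof(double),
              "device pointer must follow the six ints and three doubles");
#endif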
+// Update the temperature values using a seven-point stencil
+__global__ void evolve_kernel(double *currdata, double *prevdata, double a, double dt, int nx, int ny, int nz,
+                              double inv_dx2, double inv_dy2, double inv_dz2, int max_threads)
+{
+  // Linearize the thread id to get coalesced accesses
+  int ythr_offs = threadIdx.y * blockDim.x + threadIdx.x;
+  int zthr_offs = threadIdx.z * blockDim.y * blockDim.x + ythr_offs;
+  int xblock_offs = blockIdx.x * blockDim.z * blockDim.y * blockDim.x + zthr_offs;
+  int yblock_offs = blockIdx.y * blockDim.z * blockDim.y * blockDim.x * gridDim.x + xblock_offs;
+  int tid = blockIdx.z * blockDim.z * blockDim.y * blockDim.x * gridDim.x * gridDim.y + yblock_offs;
+
+  // We only want to run meaningful computations, so the tid must be mapped to the real memory
+  // addresses; this means there are "holes" where the halo lives. To limit uncoalesced accesses
+  // we "mask" the small skip (the halo cells on the innermost dimension) inside the threads, but
+  // we avoid launching threads at all for the big skip (the halo rows crossed when moving to the
+  // next face of the grid).
+  int start_offs = (ny + 2) * (nx + 2) + (nx + 2); // 1 face + 1 line of halo, must not be computed
+  int big_skip_size = 2 * (nx + 2);                // two halo rows of length nx+2 are crossed at each face
+                                                   // boundary (equals the earlier (ny+2)+(nx+2) when nx == ny)
+  int num_big_skips = tid / ((nx + 2) * ny);       // one big skip per completed face of (nx+2)*ny threads
+  int ind = tid + start_offs + num_big_skips * big_skip_size;
+
+  // Keep loads coalesced (i.e. load also in threads that must not compute)
+  auto tmpdata = prevdata[ind];
+  auto ip = prevdata[ind + ((nx + 2) * (ny + 2))];
+  auto im = prevdata[ind - ((nx + 2) * (ny + 2))];
+  auto jp = prevdata[ind + (nx + 2)];
+  auto jm = prevdata[ind - (nx + 2)];
+  auto kp = prevdata[ind + 1];
+  auto km = prevdata[ind - 1];
+
+  // Here we really exclude the "small skips" from the computation; they are the first and the
+  // last entries of each row of the innermost dimension. E.g., if nx is 4, we must exclude
+  // tid 0,5,6,11,12,... because that is where the halo sits.
+  if (!((tid % (nx + 2)) == 0 || ((tid - (nx + 1)) % (nx + 2)) == 0) && tid < max_threads)
+
+  // These printfs can be useful to understand the mapping of tid to memory idx. To use them,
+  // reduce the total problem size to something manageable (e.g. 4,4,4) and initialize the grid
+  // values with recognizable numbers (i.e. 100*x+10*y+z); you then get one print per "active" index.
+  /*printf("thread is block %d,%d,%d, is thread %d,%d,%d, tid %d, ind is %d, and value %f \n",
+         (int)blockIdx.x, (int)blockIdx.y, (int)blockIdx.z,
+         (int)threadIdx.x, (int)threadIdx.y, (int)threadIdx.z,
+         tid, ind,
+         prevdata[ind]);*/
+  /*printf("ind is %f and my halo are %f %f %f %f %f %f \n",
+         prevdata[ind],
+         prevdata[ind+(nx+2)],          // y
+         prevdata[ind-(nx+2)],          // y
+         prevdata[ind+((nx+2)*(ny+2))], // x -- the x-z inversion is probably due to Fortran being column major
+         prevdata[ind-((nx+2)*(ny+2))], // x
+         prevdata[ind+1],               // z
+         prevdata[ind-1]                // z
+         );*/
+
+  tmpdata = tmpdata + a * dt * (
+      ( ip - 2.0 * tmpdata + im ) * inv_dx2 + // Fortran matrix, column major: the farthest stride is labeled x here
+      ( jp - 2.0 * tmpdata + jm ) * inv_dy2 +
+      ( kp - 2.0 * tmpdata + km ) * inv_dz2
+      );
+  // Keep the store coalesced as well; the tmpdata local exists only for this reason.
+  // Halo points load it and store it back without updating.
+  currdata[ind] = tmpdata;
+}
+
+// We tried to keep everything in C++ as an attempt to make the HIP API work; that failed,
+// but the structure is kept anyway because it allows templating of kernels and the like,
+// so it stays as a "good practice" thing.
+void evolvecpp(Field* curr, const Field* prev, const double a, const double dt)
+{
+  // Note that on the innermost dimension we use nx+2 to include the "small skips" in the set
+  // of running threads; ny and nz do not need to be increased.
+  int max_threads = (curr->nx + 2) * (curr->ny) * (curr->nz);
+  int blocks, grids;
+
+  // We would like to use the optimal occupancy, but the API did not work for us
+  // (unclear what happens there), so it stays disabled:
+  //hipOccupancyMaxPotentialBlockSize(&grids, &blocks, evolve_kernel, 0, max_threads);
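+  // Illustration only (kept disabled): the usual pattern for the occupancy API,
+  // in case it starts behaving. Note that it returns a suggested block size and
+  // the *minimum* grid size for full occupancy -- not a grid sized to cover
+  // max_threads elements -- so the grid still has to be recomputed afterwards;
+  // that mismatch may be what looked broken above.
+#if 0
+  int min_grids = 0, opt_blocks = 0;
+  hipOccupancyMaxPotentialBlockSize(&min_grids, &opt_blocks, evolve_kernel, 0, 0);
+  blocks = opt_blocks;
+  grids = (max_threads + blocks - 1) / blocks;
+#endif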
+  // Failing that, let's assign a block size of 256; this parameter can be played with.
+  blocks = 256;
+  grids = (max_threads + blocks - 1) / blocks;
+
+  auto inv_dx2 = 1.0 / (prev->dx * prev->dx);
+  auto inv_dy2 = 1.0 / (prev->dy * prev->dy);
+  auto inv_dz2 = 1.0 / (prev->dz * prev->dz);
+
+  hipLaunchKernelGGL((evolve_kernel), grids, blocks, 0, 0, curr->data, prev->data, a, dt,
+                     curr->nx, curr->ny, curr->nz, inv_dx2, inv_dy2, inv_dz2, max_threads);
+}
+
+extern "C"
+{
+  void evolve(Field* curr, const Field* prev, const double a, const double dt)
+  {
+    // Launch the wrapper
+    evolvecpp(curr, prev, a, dt);
+  }
+}
diff --git a/3d/hipfort/launcher.sh b/3d/hipfort/launcher.sh
new file mode 100644
index 0000000..778acde
--- /dev/null
+++ b/3d/hipfort/launcher.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+#SBATCH --partition=eap
+#SBATCH --account=project_462000007
+#SBATCH --time=02:10:10
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=8
+#SBATCH --cpus-per-task=1
+#SBATCH --gpus-per-node=8
+#SBATCH -e slurm.err
+#SBATCH -o slurm.out
+#SBATCH --exclusive
+
+source /users/vitaliem/prova_hip/heat-equation/3d/hipfort/modules
+make
+srun /users/vitaliem/prova_hip/heat_eq/3d/hipfort/heat_hipfort
diff --git a/3d/hipfort/main.F90 b/3d/hipfort/main.F90
new file mode 100644
index 0000000..10cdd21
--- /dev/null
+++ b/3d/hipfort/main.F90
@@ -0,0 +1,85 @@
+! Heat equation solver in 3D.
+
+program heat_solve
+  use heat
+  use core
+  use io
+  use setup
+  use utilities
+  use mpi
+  use iso_c_binding
+  use hipfort
+  use hipfort_check
+
+  implicit none
+
+  real(dp), parameter :: a = 0.5  ! Diffusion constant
+  type(field) :: current, previous  ! Current and previous temperature fields
+
+  real(dp) :: dt  ! Time step
+  integer :: nsteps  ! Number of time steps
+  integer, parameter :: image_interval = 1500  ! Image output interval
+
+  type(parallel_data) :: parallelization
+  integer :: ierr
+
+  integer :: iter
+
+  real(dp) :: average_temp  ! Average temperature
+
+  real(kind=dp) :: start_time, stop_time  ! Timers
+
+  call mpi_init(ierr)
+
+  call initialize(current, previous, nsteps, parallelization)
+
+  ! Draw the picture of the initial state
+  !call write_field(current, 0, parallelization)
+
+  average_temp = average(current, parallelization)
+  if (parallelization % rank == 0) then
+     write(*,*) 'Average temperature at start: ', average_temp
+  end if
+
+  ! Largest stable time step
+  dt = current%inner_field%dx**2 * current%inner_field%dy**2 * current%inner_field%dz**2 / &
+       & (2.0 * a * (current%inner_field%dx**2 + current%inner_field%dy**2 + current%inner_field%dz**2))
+
+  ! Main iteration loop, save a picture every
+  ! image_interval steps
+
+  start_time = mpi_wtime()
+
+  do iter = 1, nsteps
+
+     ! sendrecv is blocking, so there is no need for barriers and the like
+     call exchange(previous, parallelization)
+
+     call evolve(current%inner_field, previous%inner_field, a, dt)
+
+     ! Output not supported yet
+     !if (mod(iter, image_interval) == 0) then
+     !   call update_host(current)
+     !   call write_field(current, iter, parallelization)
+     !end if
+     call swap_fields(current, previous)
+  end do
+
+  stop_time = mpi_wtime()
+
+  ! Average temperature for reference
+  average_temp = average(previous, parallelization)
+
+  if (parallelization % rank == 0) then
+     write(*,'(A,F7.3,A)') 'Iteration took ', stop_time - start_time, ' seconds.'
+     write(*,'(A,F9.6)') 'Average temperature: ', average_temp
+     if (command_argument_count() == 0) then
+        write(*,'(A,F9.6)') 'Reference value with default arguments: ', 63.834223_dp
+     end if
+  end if
+
+  call finalize(current, previous)
+
+  call mpi_finalize(ierr)
+
+end program heat_solve
diff --git a/3d/hipfort/pngwriter_mod.F90 b/3d/hipfort/pngwriter_mod.F90
new file mode 100644
index 0000000..f6fe31b
--- /dev/null
+++ b/3d/hipfort/pngwriter_mod.F90
@@ -0,0 +1,41 @@
+! PNG writer for heat equation solver
+module pngwriter
+  use heat
+
+contains
+
+  function save_png(data, nx, ny, fname) result(stat)
+
+    use, intrinsic :: ISO_C_BINDING
+    implicit none
+
+    real(dp), dimension(:,:), intent(in) :: data
+    integer, intent(in) :: nx, ny
+    character(len=*), intent(in) :: fname
+    integer :: stat
+
+    ! Interface for the save_png C function
+    interface
+       ! The C function definition is
+       !   int save_png(double *data, const int nx, const int ny,
+       !                const char *fname)
+       function save_png_c(data, nx, ny, fname, order) &
+            & bind(C,name="save_png") result(stat)
+         use, intrinsic :: ISO_C_BINDING
+         implicit none
+         real(kind=C_DOUBLE) :: data(*)
+         integer(kind=C_INT), value, intent(IN) :: nx, ny
+         character(kind=C_CHAR), intent(IN) :: fname(*)
+         character(kind=C_CHAR), value, intent(IN) :: order
+         integer(kind=C_INT) :: stat
+       end function save_png_c
+    end interface
+
+    stat = save_png_c(data, nx, ny, trim(fname) // C_NULL_CHAR, 'f')
+    if (stat /= 0) then
+       write(*,*) 'save_png returned error!'
+    end if
+
+  end function save_png
+
+end module pngwriter
diff --git a/3d/hipfort/setup.F90 b/3d/hipfort/setup.F90
new file mode 100644
index 0000000..01511ea
--- /dev/null
+++ b/3d/hipfort/setup.F90
@@ -0,0 +1,195 @@
+! Setup routines for heat equation solver
+module setup
+  use heat
+  use iso_c_binding
+  use hipfort
+  use hipfort_check
+
+contains
+
+  subroutine initialize(previous, current, nsteps, parallel)
+    use utilities
+    use io
+
+    implicit none
+
+    type(field), intent(out) :: previous, current
+    integer, intent(out) :: nsteps
+    type(parallel_data), intent(out) :: parallel
+
+    integer :: height, width, length
+    logical :: using_input_file
+    character(len=85) :: input_file, arg  ! Input file name and command line arguments
+
+    ! Default values for grid size and time steps
+    height = 800
+    width = 800
+    length = 800
+    nsteps = 500
+    using_input_file = .false.
+
+    ! Read in the command line arguments and
+    ! set up the needed variables
+    select case(command_argument_count())
+    case(0) ! No arguments -> default values
+    case(1) ! One argument -> input file name
+       using_input_file = .true.
+       call get_command_argument(1, input_file)
+    case(2) ! Two arguments -> input file name and number of steps
+       using_input_file = .true.
+       call get_command_argument(1, input_file)
+       call get_command_argument(2, arg)
+       read(arg, *) nsteps
+    case(4) ! Four arguments -> height, width, length and nsteps
+       call get_command_argument(1, arg)
+       read(arg, *) height
+       call get_command_argument(2, arg)
+       read(arg, *) width
+       call get_command_argument(3, arg)
+       read(arg, *) length
+       call get_command_argument(4, arg)
+       read(arg, *) nsteps
+    case default
+       call usage()
+       stop
+    end select
+    ! Initialize the fields according to the command line arguments
+    if (using_input_file) then
+       call read_field(previous, input_file, parallel)
+       call copy_fields(previous, current)
+    else
+       call parallel_setup(parallel, height, width, length)
+       call set_field_dimensions(previous, height, width, length, parallel)
+       call set_field_dimensions(current, height, width, length, parallel)
+       call generate_field(previous, parallel)
+       call copy_fields(previous, current)
+    end if
+
+    if (parallel%rank == 0) then
+       write(*,'(A, I5, A, I5, A, I5, A, I5)') &
+            & 'Simulation parameters: height: ', height, ' width: ', width, &
+            & ' length: ', length, ' time steps: ', nsteps
+       write(*,'(A, I5)') 'Number of MPI tasks: ', parallel%size
+       write(*,'(A, I5)') 'Number of GPUs: ', parallel%dev_count
+    end if
+
+  end subroutine initialize
+
+  ! Generate the initial temperature field. The pattern is a disc with a radius
+  ! of about nx_full / 6 in the center of the grid.
+  ! Boundary conditions are (different) constant temperatures outside the grid.
+  subroutine generate_field(field0, parallel)
+    use heat
+
+    implicit none
+
+    type(field), intent(inout) :: field0
+    type(parallel_data), intent(in) :: parallel
+
+    real(dp) :: radius
+    integer :: i, j, k, ds2
+
+    ! The arrays for the field contain also a halo region
+    allocate(field0%data(0:field0%inner_field%nx+1, 0:field0%inner_field%ny+1, 0:field0%inner_field%nz+1))
+    ! Allocate the same amount of memory on the device
+    call hipCheck(hipMalloc(field0%inner_field%data, size(field0%data) * 8_c_size_t))
+
+    ! Radius of the disk
+    radius = (field0%inner_field%nx_full + field0%inner_field%ny_full + field0%inner_field%nz_full) / 18.0_dp
+
+    do k = 0, field0%inner_field%nz + 1
+       do j = 0, field0%inner_field%ny + 1
+          do i = 0, field0%inner_field%nx + 1
+             ! Squared distance from the center of the global grid
+             ds2 = int((i - field0%inner_field%nx_full / 2.0_dp + 1)**2 + &
+                  & (j - field0%inner_field%ny_full / 2.0_dp + 1)**2 + &
+                  & (k + parallel%rank * field0%inner_field%nz - field0%inner_field%nz_full / 2.0_dp + 1)**2)
+             if (ds2 < radius**2) then
+                field0%data(i,j,k) = 5.0_dp
+             else
+                field0%data(i,j,k) = 65.0_dp
+             end if
+             ! This "alternative initialization" can be used with a reduced grid size to
+             ! follow what is going on via the printfs in the kernel:
+             !if (parallel%rank == 0) then
+             !   field0%data(i,j,k) = i*100+j*10+k
+             !else
+             !   field0%data(i,j,k) = 1.0_dp
+             !endif
+          end do
+       end do
+    end do
+
+    ! Boundary conditions
+    if (parallel%rank == 0) then
+       field0%data(:,:,0) = 20.0_dp
+    end if
+    if (parallel%rank == parallel%size - 1) then
+       field0%data(:,:,field0%inner_field%nz+1) = 35.0_dp
+    end if
+
+    field0%data(0,:,:) = 20.0_dp
+    field0%data(field0%inner_field%nx+1,:,:) = 35.0_dp
+    field0%data(:,0,:) = 20.0_dp
+    field0%data(:,field0%inner_field%ny+1,:) = 35.0_dp
+
+    ! And this print gives a visual impression of the grid.
+    ! Never use it with the full grid!
+    !!if (parallel%rank == 0) then
+    !  do i = 0, field0%inner_field%nx+1
+    !     do j = 0, field0%inner_field%ny+1
+    !        do k = 0, field0%inner_field%nz+1
+    !           write (*,"(F15.10)", advance="no") field0%data(i,j,k)
+    !        enddo
+    !        write (*,*) parallel%rank, 'end of line'
+    !     enddo
+    !     write (*,*) parallel%rank, 'end of page'
+    !  enddo
+    !!endif
+    !write(*,*) '####### END OF INITIALIZATION OF THE MATRIX ######'
+
+    ! Copy the data array to the GPU array
+    call hipCheck(hipMemcpy(field0%inner_field%data, c_loc(field0%data), size(field0%data) * 8_c_size_t, hipMemcpyHostToDevice))
+
+  end subroutine generate_field
+
+  ! Clean up routine for field type
+  ! Arguments:
+  !   field0, field1 (type(field)): field variables to be cleared
+  subroutine finalize(field0, field1)
+    use heat
+    use hipfort
+
+    implicit none
+
+    type(field), intent(inout) :: field0, field1
+
+    ! Clean up the GPU memory
+    call hipCheck(hipFree(field0%inner_field%data))
+    call hipCheck(hipFree(field1%inner_field%data))
+
+    deallocate(field0%data)
+    deallocate(field1%data)
+
+  end subroutine finalize
+
+  ! Helper routine that prints out a simple usage message if the
+  ! user gives an unsupported number of arguments
+  subroutine usage()
+    implicit none
+    character(len=256) :: buf
+
+    call get_command_argument(0, buf)
+    write (*,'(A)') 'Usage:'
+    write (*,'(A, " (default values will be used)")') trim(buf)
+    write (*,'(A, " <input_file>")') trim(buf)
+    write (*,'(A, " <input_file> <nsteps>")') trim(buf)
+    write (*,'(A, " <height> <width> <length> <nsteps>")') trim(buf)
+  end subroutine usage
+
+end module setup
diff --git a/3d/hipfort/utilities.F90 b/3d/hipfort/utilities.F90
new file mode 100644
index 0000000..13bcf25
--- /dev/null
+++ b/3d/hipfort/utilities.F90
@@ -0,0 +1,119 @@
+! Utility routines for heat equation solver
+! NOTE: This file does not need to be edited!
+module utilities
+  use heat
+  use iso_c_binding
+  use hipfort
+  use hipfort_check
+
+contains
+
+  ! Swap the data fields of two variables of type field
+  ! Arguments:
+  !   curr, prev (type(field)): the two variables that are swapped
+  subroutine swap_fields(curr, prev)
+
+    implicit none
+
+    type(field), intent(inout) :: curr, prev
+    real(dp), pointer, dimension(:,:,:) :: tmp
+    type(c_ptr) :: tmp_cptr
+
+    ! Now that these are pointers, move_alloc is no longer needed
+    tmp => curr%data        !call move_alloc(curr%data, tmp)
+    curr%data => prev%data  !call move_alloc(prev%data, curr%data)
+    prev%data => tmp        !call move_alloc(tmp, prev%data)
+    ! The c_ptr to the device memory must be swapped as well
+    tmp_cptr = curr%inner_field%data
+    curr%inner_field%data = prev%inner_field%data
+    prev%inner_field%data = tmp_cptr
+    ! TODO: wouldn't it be better to hold the structs as pointers and just swap the structs?
+
+  end subroutine swap_fields
+
+  ! Copy the data from one field to another
+  ! Arguments:
+  !   from_field (type(field)): variable to copy from
+  !   to_field (type(field)): variable to copy to
+  ! FIXME: UPDATE FOR NEW DATA STRUCT
+  subroutine copy_fields(from_field, to_field)
+
+    implicit none
+
+    type(field), intent(in) :: from_field
+    type(field), intent(inout) :: to_field  ! inout so the existing pointers can be inspected below
+
+    ! Consistency checks
+    if (.not. associated(from_field%data) .or. .not. c_associated(from_field%inner_field%data)) then
+       write (*,*) "Can not copy from a field without allocated data"
+       stop
+    end if
+    if (.not. associated(to_field%data)) then
+       ! Target is not initialized; allocate memory
+       allocate(to_field%data(lbound(from_field%data, 1):ubound(from_field%data, 1), &
+            & lbound(from_field%data, 2):ubound(from_field%data, 2), &
+            & lbound(from_field%data, 3):ubound(from_field%data, 3)))
+    else if (any(shape(from_field%data) /= shape(to_field%data))) then
+       write (*,*) "Wrong field data sizes in copy routine"
+       print *, shape(from_field%data), shape(to_field%data)
+       stop
+    end if
+
+    if (.not. c_associated(to_field%inner_field%data)) then
+       ! https://github.com/ROCmSoftwarePlatform/hipfort/blob/master/README.md
+       call hipCheck(hipMalloc(to_field%inner_field%data, size(from_field%data) * 8_c_size_t))
+    end if
+
+    ! Valid even with pointers: this is an assignment of values, see
+    ! https://stackoverflow.com/questions/61127387/fortran-pointer-assignment-difference-between-and
+    to_field%data = from_field%data
+
+    call hipCheck(hipmemcpydtod(to_field%inner_field%data, from_field%inner_field%data, size(from_field%data) * 8_c_size_t))
+
+    to_field%inner_field%nx = from_field%inner_field%nx
+    to_field%inner_field%ny = from_field%inner_field%ny
+    to_field%inner_field%nz = from_field%inner_field%nz
+    to_field%inner_field%nx_full = from_field%inner_field%nx_full
+    to_field%inner_field%ny_full = from_field%inner_field%ny_full
+    to_field%inner_field%nz_full = from_field%inner_field%nz_full
+    to_field%inner_field%dx = from_field%inner_field%dx
+    to_field%inner_field%dy = from_field%inner_field%dy
+    to_field%inner_field%dz = from_field%inner_field%dz
+  end subroutine copy_fields
+
+  function average(field0, parallel)
+    use mpi
+
+    implicit none
+
+    real(dp) :: average
+    type(field) :: field0
+    type(parallel_data), intent(in) :: parallel
+
+    real(dp) :: local_average
+    integer :: rc
+    integer :: i, j, k
+
+    ! Update the host memory
+    call hipCheck(hipMemcpy(c_loc(field0%data), field0%inner_field%data, size(field0%data) * 8_c_size_t, hipMemcpyDeviceToHost))
+
+    ! Set of prints to be used with small grids and other initialization values to
+    ! peek under the hood of the matrix after the evolve kernel has run:
+    !!if (parallel%rank == 1) then
+    !  do i = 0, field0%inner_field%nx+1
+    !     do j = 0, field0%inner_field%ny+1
+    !        do k = 0, field0%inner_field%nz+1
+    !           write (*,"(F15.10)", advance="no") field0%data(i,j,k)
+    !        enddo
+    !        write (*,*) parallel%rank, 'end of line'
+    !     enddo
+    !     write (*,*) parallel%rank, 'end of page'
+    !  enddo
+    !!endif
+
+    local_average = sum(field0%data(1:field0%inner_field%nx, 1:field0%inner_field%ny, 1:field0%inner_field%nz))
+    call mpi_allreduce(local_average, average, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+         & MPI_COMM_WORLD, rc)
+    average = average / (field0%inner_field%nx_full * field0%inner_field%ny_full * field0%inner_field%nz_full)
+
+  end function average
+
+end module utilities