diff --git a/src/trans/gpu/CMakeLists.txt b/src/trans/gpu/CMakeLists.txt index 07fb0bad3..75f6da976 100644 --- a/src/trans/gpu/CMakeLists.txt +++ b/src/trans/gpu/CMakeLists.txt @@ -11,6 +11,7 @@ list( APPEND trans_gpu_common_src algor/ext_acc.F90 algor/c_hipmemgetinfo.cpp + algor/hip_allocator_mod.F90 algor/buffered_allocator_mod.F90 algor/device_mod.F90 algor/growing_allocator_mod.F90 diff --git a/src/trans/gpu/algor/buffered_allocator_mod.F90 b/src/trans/gpu/algor/buffered_allocator_mod.F90 index 34b9c42aa..4a3f57179 100644 --- a/src/trans/gpu/algor/buffered_allocator_mod.F90 +++ b/src/trans/gpu/algor/buffered_allocator_mod.F90 @@ -17,7 +17,7 @@ MODULE BUFFERED_ALLOCATOR_MOD IMPLICIT NONE PRIVATE - PUBLIC :: BUFFERED_ALLOCATOR, ALLOCATION_RESERVATION_HANDLE, RESERVE, ASSIGN_PTR, GET_ALLOCATION + PUBLIC :: BUFFERED_ALLOCATOR, ALLOCATION_RESERVATION_HANDLE, RESERVE, RESERVE_GAM, ASSIGN_PTR, GET_ALLOCATION, GET_ALLOCATION_GAM PUBLIC :: MAKE_BUFFERED_ALLOCATOR, INSTANTIATE_ALLOCATOR ! The buffered allocator uses double buffering. The idea is that the allocator @@ -44,10 +44,19 @@ MODULE BUFFERED_ALLOCATOR_MOD INTEGER(KIND=C_SIZE_T) :: BUFR_SZ(0:NBUF-1) INTEGER(KIND=JPIM) :: NEXT_BUF TYPE(GROWING_ALLOCATION_TYPE), POINTER :: PTR + + ! GPU Aware MPI versions + INTEGER(KIND=C_SIZE_T) :: GAM_BUFR_SZ(0:NBUF-1) + INTEGER(KIND=JPIM) :: GAM_NEXT_BUF + TYPE(GROWING_ALLOCATION_TYPE), POINTER :: GAM_PTR END TYPE TYPE ALLOCATION_RESERVATION_HANDLE INTEGER(KIND=C_SIZE_T) :: SZ INTEGER(KIND=JPIM) :: BUF + + ! 
GPU Aware MPI versions + INTEGER(KIND=C_SIZE_T) :: GAM_SZ + INTEGER(KIND=JPIM) :: GAM_BUF END TYPE INTERFACE ASSIGN_PTR @@ -67,6 +76,9 @@ FUNCTION MAKE_BUFFERED_ALLOCATOR() MAKE_BUFFERED_ALLOCATOR%BUFR_SZ(:) = 0 MAKE_BUFFERED_ALLOCATOR%NEXT_BUF = 0 + + MAKE_BUFFERED_ALLOCATOR%GAM_BUFR_SZ(:) = 0 + MAKE_BUFFERED_ALLOCATOR%GAM_NEXT_BUF = 0 END FUNCTION MAKE_BUFFERED_ALLOCATOR FUNCTION RESERVE(ALLOCATOR, SZ) @@ -83,8 +95,22 @@ FUNCTION RESERVE(ALLOCATOR, SZ) ALLOCATOR%NEXT_BUF = MOD(ALLOCATOR%NEXT_BUF+1,NBUF) END FUNCTION RESERVE + FUNCTION RESERVE_GAM(ALLOCATOR, SZ) + IMPLICIT NONE + TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR + INTEGER(KIND=C_SIZE_T), INTENT(IN) :: SZ + + TYPE(ALLOCATION_RESERVATION_HANDLE) :: RESERVE_GAM + + ALLOCATOR%GAM_BUFR_SZ(ALLOCATOR%GAM_NEXT_BUF) = MAX(ALLOCATOR%GAM_BUFR_SZ(ALLOCATOR%GAM_NEXT_BUF),SZ) + RESERVE_GAM%GAM_BUF = ALLOCATOR%GAM_NEXT_BUF + RESERVE_GAM%GAM_SZ = SZ + + ALLOCATOR%GAM_NEXT_BUF = MOD(ALLOCATOR%GAM_NEXT_BUF+1,NBUF) + END FUNCTION RESERVE_GAM + SUBROUTINE INSTANTIATE_ALLOCATOR(ALLOCATOR, GROWING_ALLOCATION) - USE GROWING_ALLOCATOR_MOD, ONLY: REALLOCATE_GROWING_ALLOCATION + USE GROWING_ALLOCATOR_MOD, ONLY: REALLOCATE_GROWING_ALLOCATION, REALLOCATE_GROWING_GAM_ALLOCATION IMPLICIT NONE TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR !!TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN), POINTER :: GROWING_ALLOCATION @@ -97,6 +123,13 @@ SUBROUTINE INSTANTIATE_ALLOCATOR(ALLOCATOR, GROWING_ALLOCATION) ALLOCATOR%PTR => GROWING_ALLOCATION CALL REALLOCATE_GROWING_ALLOCATION(GROWING_ALLOCATION, SUM(ALLOCATOR%BUFR_SZ)) + + DO I = 0, NBUF-1 + ALLOCATOR%GAM_BUFR_SZ(I) = ALIGN(ALLOCATOR%GAM_BUFR_SZ(I),128) + ENDDO + ALLOCATOR%GAM_PTR => GROWING_ALLOCATION + + CALL REALLOCATE_GROWING_GAM_ALLOCATION(GROWING_ALLOCATION, SUM(ALLOCATOR%GAM_BUFR_SZ)) END SUBROUTINE FUNCTION GET_ALLOCATION(ALLOCATOR, RESERVATION) @@ -117,6 +150,24 @@ FUNCTION GET_ALLOCATION(ALLOCATOR, RESERVATION) ENDIF END FUNCTION GET_ALLOCATION + FUNCTION 
GET_ALLOCATION_GAM(ALLOCATOR, RESERVATION) + IMPLICIT NONE + TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR + TYPE(ALLOCATION_RESERVATION_HANDLE), INTENT(IN) :: RESERVATION + + INTEGER(KIND=C_INT8_T), POINTER :: GET_ALLOCATION_GAM(:) + + IF (RESERVATION%GAM_SZ > ALLOCATOR%GAM_BUFR_SZ(RESERVATION%GAM_BUF)) THEN + CALL ABORT_TRANS( "Logical Error in GET_ALLOCATION_GAM") + ENDIF + IF (RESERVATION%GAM_BUF == 0) THEN + GET_ALLOCATION_GAM(1:) => ALLOCATOR%GAM_PTR%GAM_PTR(1:RESERVATION%GAM_SZ) + ELSE + GET_ALLOCATION_GAM(1:) => ALLOCATOR%GAM_PTR%GAM_PTR(SUM(ALLOCATOR%GAM_BUFR_SZ(0:RESERVATION%GAM_BUF-1))+1: & + SUM(ALLOCATOR%GAM_BUFR_SZ(0:RESERVATION%GAM_BUF-1))+RESERVATION%GAM_SZ) + ENDIF + END FUNCTION GET_ALLOCATION_GAM + SUBROUTINE ASSIGN_PTR_FLOAT(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALUE, SET_STREAM) USE ISO_C_BINDING, ONLY: C_FLOAT, C_F_POINTER, C_SIZEOF IMPLICIT NONE diff --git a/src/trans/gpu/algor/growing_allocator_mod.F90 b/src/trans/gpu/algor/growing_allocator_mod.F90 index f8de0fc90..5c6e1abb7 100644 --- a/src/trans/gpu/algor/growing_allocator_mod.F90 +++ b/src/trans/gpu/algor/growing_allocator_mod.F90 @@ -1,14 +1,15 @@ MODULE GROWING_ALLOCATOR_MOD - USE ISO_C_BINDING, ONLY: C_INT8_T + USE HIP_ALLOCATOR_MOD + USE ISO_C_BINDING, ONLY: C_INT8_T, C_PTR PRIVATE PUBLIC :: GROWING_ALLOCATION_TYPE - PUBLIC :: REALLOCATE_GROWING_ALLOCATION, REGISTER_FREE_FUNCTION + PUBLIC :: REALLOCATE_GROWING_ALLOCATION, REGISTER_FREE_FUNCTION, REALLOCATE_GROWING_GAM_ALLOCATION ABSTRACT INTERFACE SUBROUTINE FREE_FUNC_PROC(PTR, SZ) BIND(C) - USE ISO_C_BINDING, ONLY: C_SIZE_T, C_INT8_T + USE ISO_C_BINDING, ONLY: C_SIZE_T, C_INT8_T, C_PTR IMPLICIT NONE INTEGER(KIND=C_INT8_T), TARGET :: PTR(:) INTEGER(C_SIZE_T), VALUE :: SZ @@ -20,9 +21,13 @@ SUBROUTINE FREE_FUNC_PROC(PTR, SZ) BIND(C) END TYPE TYPE GROWING_ALLOCATION_TYPE + ! Regular allocations INTEGER(KIND=C_INT8_T), POINTER :: PTR(:) TYPE(FREE_FUNC_TYPE) :: FREE_FUNCS(10) INTEGER :: FREE_FUNCS_SZ + ! 
GPU aware MPI weirdness + INTEGER(KIND=C_INT8_T), POINTER :: GAM_PTR(:) + INTEGER(KIND=C_INT8_T), POINTER :: GAM_DEV_PTR(:) END TYPE CONTAINS @@ -54,6 +59,47 @@ SUBROUTINE REALLOCATE_GROWING_ALLOCATION(ALLOC, SZ) ENDIF END SUBROUTINE + SUBROUTINE REALLOCATE_GROWING_GAM_ALLOCATION(ALLOC, SZ) + USE ISO_C_BINDING + USE OPENACC + USE TPM_GEN, ONLY: NOUT + USE HIP_ALLOCATOR_MOD, ONLY: DEVICE_ALLOCATE, DEVICE_FREE + IMPLICIT NONE + TYPE(GROWING_ALLOCATION_TYPE), INTENT(INOUT) :: ALLOC + INTEGER(C_SIZE_T) :: SZ + INTEGER :: I + + ! Deallocate existing pointer + IF (ASSOCIATED(ALLOC%GAM_PTR) .AND. SZ > SIZE(ALLOC%GAM_PTR, 1, C_SIZE_T)) THEN + WRITE(NOUT,*) "WARNING: REALLOCATING GROWING POINTER CAUSING GRAPH REINSTANTIATION" + DO I = 1, ALLOC%FREE_FUNCS_SZ + CALL ALLOC%FREE_FUNCS(I)%FUNC(ALLOC%GAM_PTR, & + SIZE(ALLOC%GAM_PTR, 1, C_SIZE_T)) + ENDDO +#ifdef __HIP_PLATFORM_AMD__ + CALL DEVICE_FREE(ALLOC%GAM_DEV_PTR) +#else + !$ACC EXIT DATA DELETE(ALLOC%GAM_PTR) + DEALLOCATE(ALLOC%GAM_PTR) +#endif + NULLIFY(ALLOC%GAM_PTR) + ENDIF + + IF (.NOT. ASSOCIATED(ALLOC%GAM_PTR)) THEN +#ifdef __HIP_PLATFORM_AMD__ + ! This should be moved to an ACC_MALLOC or something similar but it doesn't seem to work. 
+      CALL DEVICE_ALLOCATE(ALLOC%GAM_DEV_PTR,SZ)
+      !ALLOC%GAM_DEV_PTR = ACC_MALLOC(SZ)
+      CALL ACC_MAP_DATA(ALLOC%GAM_DEV_PTR, C_LOC(ALLOC%GAM_DEV_PTR),SZ)
+      CALL C_F_POINTER(C_LOC(ALLOC%GAM_DEV_PTR), ALLOC%GAM_PTR, [SZ])
+#else
+      ALLOCATE(ALLOC%GAM_PTR(SZ))
+      !$ACC ENTER DATA CREATE(ALLOC%GAM_PTR)
+#endif
+      ALLOC%FREE_FUNCS_SZ = 0
+    ENDIF
+  END SUBROUTINE
+
   SUBROUTINE REGISTER_FREE_FUNCTION(ALLOC, FREE_FUNC)
     USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS
     IMPLICIT NONE
diff --git a/src/trans/gpu/algor/hip_allocator_mod.F90 b/src/trans/gpu/algor/hip_allocator_mod.F90
new file mode 100644
index 000000000..b282f1506
--- /dev/null
+++ b/src/trans/gpu/algor/hip_allocator_mod.F90
@@ -0,0 +1,71 @@
+MODULE HIP_ALLOCATOR_MOD
+  ! Thin Fortran wrappers around the HIP runtime calls hipMalloc and hipFree.
+  ! DEVICE_ALLOCATE associates a byte-array pointer with the address returned
+  ! by hipMalloc; DEVICE_FREE hands that address back to hipFree.
+  USE ISO_C_BINDING
+
+  IMPLICIT NONE
+  SAVE
+  PRIVATE
+
+  PUBLIC :: DEVICE_ALLOCATE, DEVICE_FREE
+
+  INTERFACE
+    ! C prototype: hipError_t hipMalloc(void** ptr, size_t size)
+    FUNCTION HIPMALLOC(CPTR, PSIZE) BIND(C, NAME="hipMalloc") RESULT(IERR)
+      USE ISO_C_BINDING, ONLY : C_PTR, C_SIZE_T, C_INT
+      IMPLICIT NONE
+      TYPE(C_PTR) :: CPTR                 ! by reference (void**): receives the address
+      INTEGER(C_SIZE_T), VALUE :: PSIZE
+      INTEGER(C_INT) :: IERR
+    END FUNCTION HIPMALLOC
+
+    ! C prototype: hipError_t hipFree(void* ptr)
+    FUNCTION HIPFREE(PTR) BIND(C, NAME="hipFree") RESULT(IERR)
+      USE ISO_C_BINDING, ONLY : C_PTR, C_INT
+      IMPLICIT NONE
+      TYPE(C_PTR), VALUE :: PTR           ! by value (void*): the address itself
+      INTEGER(C_INT) :: IERR
+    END FUNCTION HIPFREE
+  END INTERFACE
+
+CONTAINS
+
+  SUBROUTINE DEVICE_ALLOCATE(X, PSIZE)
+    ! Allocate PSIZE bytes with hipMalloc and associate X with them.
+    !   X     : on return, associated with the new allocation, bounds 1:PSIZE
+    !   PSIZE : number of bytes requested
+    ! Calls ABORT_TRANS if hipMalloc returns a non-zero status.
+    USE ISO_C_BINDING, ONLY : C_PTR, C_SIZE_T, C_INT8_T, C_INT, C_NULL_PTR, C_F_POINTER
+    USE ABORT_TRANS_MOD, ONLY : ABORT_TRANS
+    IMPLICIT NONE
+    INTEGER(C_INT8_T), DIMENSION(:), POINTER, INTENT(INOUT) :: X
+    INTEGER(C_SIZE_T), VALUE :: PSIZE
+    TYPE(C_PTR) :: PTR
+    INTEGER(C_INT) :: IERR
+
+    ! hipMalloc only writes to PTR, so X (which may not be associated yet)
+    ! is never inspected before the call.
+    PTR = C_NULL_PTR
+    IERR = HIPMALLOC(PTR, PSIZE)
+    IF (IERR /= 0) CALL ABORT_TRANS("DEVICE_ALLOCATE: hipMalloc failed")
+    CALL C_F_POINTER(PTR, X, [PSIZE])
+  END SUBROUTINE DEVICE_ALLOCATE
+
+  SUBROUTINE DEVICE_FREE(X)
+    ! Release memory previously obtained through DEVICE_ALLOCATE.
+    !   X : pointer set by DEVICE_ALLOCATE; nullified on return
+    ! Calls ABORT_TRANS if hipFree returns a non-zero status.
+    USE ISO_C_BINDING, ONLY : C_PTR, C_INT8_T, C_INT, C_LOC
+    USE ABORT_TRANS_MOD, ONLY : ABORT_TRANS
+    IMPLICIT NONE
+    INTEGER(C_INT8_T), DIMENSION(:), POINTER, INTENT(INOUT) :: X
+    INTEGER(C_INT) :: IERR
+
+    IF (.NOT. ASSOCIATED(X)) RETURN
+    IERR = HIPFREE(C_LOC(X))
+    IF (IERR /= 0) CALL ABORT_TRANS("DEVICE_FREE: hipFree failed")
+    NULLIFY(X)
+  END SUBROUTINE DEVICE_FREE
+
+END MODULE HIP_ALLOCATOR_MOD
diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90
index 5c325985b..57ecca3bf 100755
--- a/src/trans/gpu/internal/trgtol_mod.F90
+++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -12,19 +12,24 @@ MODULE TRGTOL_MOD USE BUFFERED_ALLOCATOR_MOD, ONLY: ALLOCATION_RESERVATION_HANDLE + USE ISO_C_BINDING, ONLY: C_SIZE_T IMPLICIT NONE PRIVATE PUBLIC :: TRGTOL_HANDLE, TRGTOL, PREPARE_TRGTOL TYPE TRGTOL_HANDLE - TYPE(ALLOCATION_RESERVATION_HANDLE) :: HCOMBUFS, HCOMBUFR_AND_REEL + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HCOMBUFS_COMBUFR + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HREEL + INTEGER(KIND=C_SIZE_T) :: COMBUFS_START + INTEGER(KIND=C_SIZE_T) :: COMBUFR_START + INTEGER(KIND=C_SIZE_T) :: REEL_START END TYPE CONTAINS FUNCTION PREPARE_TRGTOL(ALLOCATOR,KF_GP,KF_FS) RESULT(HTRGTOL) USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT USE TPM_DISTR, ONLY: D - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE + USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE, RESERVE_GAM USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF IMPLICIT NONE @@ -37,11 +42,16 @@ FUNCTION PREPARE_TRGTOL(ALLOCATOR,KF_GP,KF_FS) RESULT(HTRGTOL) INTEGER(KIND=C_SIZE_T) :: NELEM - HTRGTOL%HCOMBUFS = RESERVE(ALLOCATOR, INT(KF_GP*D%NGPTOT,KIND=C_SIZE_T)*C_SIZEOF(DUMMY)) + HTRGTOL%COMBUFS_START = 1 + NELEM = ALIGN(INT(KF_GP*D%NGPTOT,KIND=C_SIZE_T)*C_SIZEOF(DUMMY), 128) - NELEM = INT(KF_FS*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(DUMMY) ! ZCOMBUFR - NELEM = NELEM + INT(KF_FS*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(DUMMY) ! PREEL_REAL - HTRGTOL%HCOMBUFR_AND_REEL = RESERVE(ALLOCATOR, NELEM) + HTRGTOL%COMBUFR_START = NELEM + 1 + NELEM = NELEM + ALIGN(INT(KF_FS*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(DUMMY),128) + HTRGTOL%HCOMBUFS_COMBUFR = RESERVE_GAM(ALLOCATOR, NELEM) + + HTRGTOL%REEL_START = 1 + NELEM = ALIGN(INT(KF_FS*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(DUMMY),128) ! 
PREEL_REAL + HTRGTOL%HREEL = RESERVE(ALLOCATOR, NELEM) END FUNCTION PREPARE_TRGTOL SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,& @@ -120,7 +130,7 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX USE TPM_TRANS, ONLY: NPROMA USE ISO_C_BINDING, ONLY: C_SIZE_T, C_FLOAT, C_DOUBLE, C_INT8_T, C_SIZEOF - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION + USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION, GET_ALLOCATION_GAM USE OPENACC_EXT, ONLY: EXT_ACC_ARR_DESC, EXT_ACC_PASS, EXT_ACC_CREATE, & & EXT_ACC_DELETE USE OPENACC, ONLY: ACC_HANDLE_KIND @@ -326,10 +336,8 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ENDDO block - CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL),& - & INT(KF_FS*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(PREEL_REAL(1))+1_C_SIZE_T, & - & INT(KF_FS*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(PREEL_REAL(1))) - !!CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL), size1, size2) + CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HREEL),& + & HTRGTOL%REEL_START, INT(KF_FS*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(PREEL_REAL(1))) end block #ifdef OMPGPU @@ -458,8 +466,8 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ENDDO IF (ISEND_COUNTS > 0) THEN - CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFS),& - & 1_C_SIZE_T, INT(ICOMBUFS_OFFSET(ISEND_COUNTS+1),KIND=C_SIZE_T)*C_SIZEOF(ZCOMBUFS(1))) + CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION_GAM(ALLOCATOR, HTRGTOL%HCOMBUFS_COMBUFR),& + & HTRGTOL%COMBUFS_START, INT(ICOMBUFS_OFFSET(ISEND_COUNTS+1),KIND=C_SIZE_T)*C_SIZEOF(ZCOMBUFS(1))) ENDIF !....Pack loop......................................................... 
@@ -567,8 +575,8 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, CALL GSTATS(411,0) IF (IRECV_COUNTS > 0) THEN - CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL),& - & 1_C_SIZE_T, INT(ICOMBUFR_OFFSET(IRECV_COUNTS+1),KIND=C_SIZE_T)*C_SIZEOF(ZCOMBUFR(1))) + CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION_GAM(ALLOCATOR, HTRGTOL%HCOMBUFS_COMBUFR),& + & HTRGTOL%COMBUFR_START, INT(ICOMBUFR_OFFSET(IRECV_COUNTS+1),KIND=C_SIZE_T)*C_SIZEOF(ZCOMBUFR(1))) ENDIF #ifdef OMPGPU #endif diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index cc4d4426f..7d9c3574d 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -12,19 +12,22 @@ MODULE TRLTOG_MOD USE BUFFERED_ALLOCATOR_MOD, ONLY: ALLOCATION_RESERVATION_HANDLE + USE ISO_C_BINDING, ONLY: C_SIZE_T IMPLICIT NONE PRIVATE PUBLIC :: TRLTOG, TRLTOG_HANDLE, PREPARE_TRLTOG TYPE TRLTOG_HANDLE - TYPE(ALLOCATION_RESERVATION_HANDLE) :: HCOMBUFR_AND_COMBUFS + TYPE(ALLOCATION_RESERVATION_HANDLE) :: HCOMBUFR_COMBUFS + INTEGER(KIND=C_SIZE_T) :: COMBUFS_START + INTEGER(KIND=C_SIZE_T) :: COMBUFR_START END TYPE CONTAINS FUNCTION PREPARE_TRLTOG(ALLOCATOR,KF_FS,KF_GP) RESULT(HTRLTOG) USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT USE TPM_DISTR, ONLY: D - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE + USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE_GAM USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF IMPLICIT NONE @@ -37,10 +40,13 @@ FUNCTION PREPARE_TRLTOG(ALLOCATOR,KF_FS,KF_GP) RESULT(HTRLTOG) INTEGER(KIND=C_SIZE_T) :: NELEM + HTRLTOG%COMBUFR_START = 1 NELEM = ALIGN(INT(KF_GP*D%NGPTOT,KIND=C_SIZE_T)*C_SIZEOF(DUMMY),128) ! 
ZCOMBUFR - NELEM = ALIGN(NELEM + INT(KF_FS*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(DUMMY),128) !ZCOMBUFS upper bound - HTRLTOG%HCOMBUFR_AND_COMBUFS = RESERVE(ALLOCATOR, NELEM) + HTRLTOG%COMBUFS_START = 1 + NELEM + NELEM = NELEM + ALIGN(INT(KF_FS*D%NLENGTF,KIND=C_SIZE_T)*C_SIZEOF(DUMMY),128) !ZCOMBUFS upper bound + + HTRLTOG%HCOMBUFR_COMBUFS = RESERVE_GAM(ALLOCATOR, NELEM) END FUNCTION PREPARE_TRLTOG SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KPTRGP,& @@ -120,7 +126,7 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, #endif USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX USE TPM_TRANS, ONLY: LDIVGP, LSCDERS, LUVDER, LVORGP, NPROMA - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION + USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION_GAM USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF USE OPENACC_EXT, ONLY: EXT_ACC_ARR_DESC, EXT_ACC_PASS, EXT_ACC_CREATE, & & EXT_ACC_DELETE @@ -646,13 +652,12 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ENDDO IF (IRECV_COUNTS > 0) THEN - CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION(ALLOCATOR, HTRLTOG%HCOMBUFR_AND_COMBUFS),& - & 1_C_SIZE_T, INT(ICOMBUFR_OFFSET(IRECV_COUNTS+1),KIND=C_SIZE_T)*C_SIZEOF(ZCOMBUFR(1))) + CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION_GAM(ALLOCATOR, HTRLTOG%HCOMBUFR_COMBUFS),& + & HTRLTOG%COMBUFR_START, INT(ICOMBUFR_OFFSET(IRECV_COUNTS+1),KIND=C_SIZE_T)*C_SIZEOF(ZCOMBUFR(1))) ENDIF IF (ISEND_COUNTS > 0) THEN - CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION(ALLOCATOR, HTRLTOG%HCOMBUFR_AND_COMBUFS),& - & ALIGN(INT(KF_GP*D%NGPTOT,KIND=C_SIZE_T)*C_SIZEOF(ZCOMBUFR(1)),128)+1_C_SIZE_T, & - & INT(ICOMBUFS_OFFSET(ISEND_COUNTS+1),KIND=C_SIZE_T)*C_SIZEOF(ZCOMBUFS(1))) + CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION_GAM(ALLOCATOR, HTRLTOG%HCOMBUFR_COMBUFS),& + & HTRLTOG%COMBUFS_START, INT(ICOMBUFS_OFFSET(ISEND_COUNTS+1),KIND=C_SIZE_T)*C_SIZEOF(ZCOMBUFS(1))) ENDIF #ifdef OMPGPU diff 
--git a/src/trans/gpu/internal/trltom_mod.F90 b/src/trans/gpu/internal/trltom_mod.F90 index 60738be3f..a72133a71 100755 --- a/src/trans/gpu/internal/trltom_mod.F90 +++ b/src/trans/gpu/internal/trltom_mod.F90 @@ -23,7 +23,7 @@ MODULE TRLTOM_MOD FUNCTION PREPARE_TRLTOM(ALLOCATOR, KF_FS) RESULT(HTRLTOM) USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT USE TPM_DISTR, ONLY: D - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE + USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE_GAM USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF IMPLICIT NONE @@ -34,7 +34,7 @@ FUNCTION PREPARE_TRLTOM(ALLOCATOR, KF_FS) RESULT(HTRLTOM) REAL(KIND=JPRBT) :: DUMMY - HTRLTOM%HPFBUF = RESERVE(ALLOCATOR, INT(D%NLENGT1B*2*KF_FS,KIND=C_SIZE_T)*C_SIZEOF(DUMMY)) + HTRLTOM%HPFBUF = RESERVE_GAM(ALLOCATOR, INT(D%NLENGT1B*2*KF_FS,KIND=C_SIZE_T)*C_SIZEOF(DUMMY)) END FUNCTION SUBROUTINE TRLTOM(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) @@ -98,7 +98,7 @@ SUBROUTINE TRLTOM(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) ! Missing: MPI_ALLTOALLV on purpose due to cray-mpi bug (see https://github.com/ecmwf-ifs/ectrans/pull/157) #endif USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION + USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION_GAM USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS @@ -133,7 +133,7 @@ SUBROUTINE TRLTOM(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) IF (LHOOK) CALL DR_HOOK('TRLTOM',0,ZHOOK_HANDLE) - CALL ASSIGN_PTR(PFBUF, GET_ALLOCATION(ALLOCATOR, HTRLTOM%HPFBUF),& + CALL ASSIGN_PTR(PFBUF, GET_ALLOCATION_GAM(ALLOCATOR, HTRLTOM%HPFBUF),& & 1_C_SIZE_T, INT(D%NLENGT1B*2*KF_FS,KIND=C_SIZE_T)*C_SIZEOF(PFBUF(1))) #ifdef OMPGPU diff --git a/src/trans/gpu/internal/trltom_pack_unpack.F90 b/src/trans/gpu/internal/trltom_pack_unpack.F90 index d77da0f80..085f60ad4 100755 --- a/src/trans/gpu/internal/trltom_pack_unpack.F90 +++ 
b/src/trans/gpu/internal/trltom_pack_unpack.F90 @@ -33,7 +33,7 @@ FUNCTION PREPARE_TRLTOM_PACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_PACK) USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT USE TPM_DISTR, ONLY: D USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE + USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE_GAM IMPLICIT NONE @@ -43,7 +43,7 @@ FUNCTION PREPARE_TRLTOM_PACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_PACK) REAL(KIND=JPRBT) :: DUMMY - HTRLTOM_PACK%HFOUBUF_IN = RESERVE(ALLOCATOR, INT(D%NLENGT0B*KF_FS*2,KIND=C_SIZE_T)*C_SIZEOF(DUMMY)) + HTRLTOM_PACK%HFOUBUF_IN = RESERVE_GAM(ALLOCATOR, INT(D%NLENGT0B*KF_FS*2,KIND=C_SIZE_T)*C_SIZEOF(DUMMY)) END FUNCTION PREPARE_TRLTOM_PACK SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) @@ -69,7 +69,7 @@ SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) ! ------------------------------------------------------------------ - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION + USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION_GAM USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT USE TPM_DISTR, ONLY: D, MYSETW, D_NSTAGTF, D_NPNTGTB0, D_NPTRLS, D_NDGL_FS USE TPM_GEOMETRY, ONLY: G_NMEN, G_NLOEN @@ -89,7 +89,7 @@ SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) REAL(KIND=JPRBT) :: SCAL - CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRLTOM_PACK%HFOUBUF_IN),& + CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION_GAM(ALLOCATOR, HTRLTOM_PACK%HFOUBUF_IN),& & 1_C_SIZE_T, INT(D%NLENGT0B*KF_FS*2,KIND=C_SIZE_T)*C_SIZEOF(FOUBUF_IN(1))) #ifdef OMPGPU diff --git a/src/trans/gpu/internal/trmtol_mod.F90 b/src/trans/gpu/internal/trmtol_mod.F90 index 957cb2c03..d34e8dfa4 100755 --- a/src/trans/gpu/internal/trmtol_mod.F90 +++ b/src/trans/gpu/internal/trmtol_mod.F90 @@ -23,7 +23,7 @@ MODULE TRMTOL_MOD FUNCTION PREPARE_TRMTOL(ALLOCATOR, KF_LEG) RESULT(HTRMTOL) USE 
PARKIND_ECTRANS, ONLY: JPIM, JPRBT USE TPM_DISTR, ONLY: D - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE + USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE_GAM USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF IMPLICIT NONE @@ -34,7 +34,7 @@ FUNCTION PREPARE_TRMTOL(ALLOCATOR, KF_LEG) RESULT(HTRMTOL) REAL(KIND=JPRBT) :: DUMMY - HTRMTOL%HPFBUF = RESERVE(ALLOCATOR, INT(D%NLENGT0B*2*KF_LEG,KIND=C_SIZE_T)*C_SIZEOF(DUMMY)) + HTRMTOL%HPFBUF = RESERVE_GAM(ALLOCATOR, INT(D%NLENGT0B*2*KF_LEG,KIND=C_SIZE_T)*C_SIZEOF(DUMMY)) END FUNCTION SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) @@ -97,7 +97,7 @@ SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) USE MPI_F08, ONLY: MPI_COMM, MPI_FLOAT, MPI_DOUBLE ! Missing: MPI_ALLTOALLV on purpose due to cray-mpi bug (see https://github.com/ecmwf-ifs/ectrans/pull/157) #endif - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION + USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION_GAM USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS @@ -134,7 +134,7 @@ SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) IF (LHOOK) CALL DR_HOOK('TRMTOL',0,ZHOOK_HANDLE) - CALL ASSIGN_PTR(PFBUF, GET_ALLOCATION(ALLOCATOR, HTRMTOL%HPFBUF),& + CALL ASSIGN_PTR(PFBUF, GET_ALLOCATION_GAM(ALLOCATOR, HTRMTOL%HPFBUF),& & 1_C_SIZE_T, INT(D%NLENGT0B*2*KF_LEG,KIND=C_SIZE_T)*C_SIZEOF(PFBUF(1))) IF(NPROC > 1) THEN diff --git a/src/trans/gpu/internal/trmtol_pack_unpack.F90 b/src/trans/gpu/internal/trmtol_pack_unpack.F90 index 4468af603..bfc228fb0 100755 --- a/src/trans/gpu/internal/trmtol_pack_unpack.F90 +++ b/src/trans/gpu/internal/trmtol_pack_unpack.F90 @@ -29,7 +29,7 @@ FUNCTION PREPARE_TRMTOL_PACK(ALLOCATOR,KF_LEG) RESULT(HTRMTOL_PACK) USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT USE TPM_DISTR, ONLY: D USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF - USE BUFFERED_ALLOCATOR_MOD, ONLY: 
BUFFERED_ALLOCATOR, RESERVE + USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE_GAM IMPLICIT NONE @@ -43,7 +43,7 @@ FUNCTION PREPARE_TRMTOL_PACK(ALLOCATOR,KF_LEG) RESULT(HTRMTOL_PACK) REAL(KIND=JPRBT) :: ZPRBT_DUMMY IALLOC_SZ = INT(D%NLENGT1B*2*KF_LEG,KIND=C_SIZE_T)*C_SIZEOF(ZPRBT_DUMMY) - HTRMTOL_PACK%HFOUBUF_IN = RESERVE(ALLOCATOR, int(IALLOC_SZ,kind=c_size_t)) + HTRMTOL_PACK%HFOUBUF_IN = RESERVE_GAM(ALLOCATOR, int(IALLOC_SZ,kind=c_size_t)) END FUNCTION SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,KF_LEG) @@ -90,7 +90,7 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I USE TPM_GEOMETRY, ONLY: G_NDGLU USE TPM_DISTR, ONLY: D, D_NUMP, D_MYMS, D_NPNTGTB1, D_OFFSETS_GEMM1 USE LEINV_MOD, ONLY: LEINV_STRIDES - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION + USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION_GAM USE ISO_C_BINDING, ONLY: C_SIZE_T, C_SIZEOF IMPLICIT NONE @@ -115,7 +115,7 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I IF (LHOOK) CALL DR_HOOK('TRMTOL_PACK',0,ZHOOK_HANDLE) - CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRMTOL_PACK%HFOUBUF_IN),& + CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION_GAM(ALLOCATOR, HTRMTOL_PACK%HFOUBUF_IN),& & 1_C_SIZE_T, INT(D%NLENGT1B*2*KF_LEG,KIND=C_SIZE_T)*C_SIZEOF(FOUBUF_IN(1))) CALL LEINV_STRIDES(KF_LEG,IOUT_STRIDES0=IOUT_STRIDES0,IOUT_SIZE=IOUT_SIZE,&