Merge pull request #43 from ecmwf-ifs/nabr-release-1.4

Release 1.4.0
ecmwf-ifs · Mar 1, 2023 · 7da2893 · 7da2893
2 parents f4a90b6 + 65ae813
commit 7da2893
Show file tree

Hide file tree

Showing 75 changed files with 21,125 additions and 315 deletions.
diff --git a/.github/scripts/.verify-targets.sh.swp b/.github/scripts/.verify-targets.sh.swp
diff --git a/.github/scripts/run-targets.sh b/.github/scripts/run-targets.sh
@@ -11,13 +11,17 @@ skipped_targets=(dwarf-cloudsc-gpu-claw)
 if [[ "$arch" == *"nvhpc"* ]]
 then
   # Skip GPU targets if built with nvhpc (don't have GPU in test runner)
-  skipped_targets+=(dwarf-cloudsc-gpu-scc dwarf-cloudsc-gpu-scc-hoist dwarf-cloudsc-gpu-omp-scc-hoist)
+  skipped_targets+=(dwarf-cloudsc-gpu-scc dwarf-cloudsc-gpu-scc-hoist dwarf-cloudsc-gpu-omp-scc-hoist dwarf-cloudsc-gpu-scc-field)
 
   # Skip GPU targets from Loki if built with nvhpc (don't have GPU in test runner)
   skipped_targets+=(dwarf-cloudsc-loki-claw-gpu dwarf-cloudsc-loki-scc dwarf-cloudsc-loki-scc-hoist)
 
+  # Skip CUDA targets if built with nvhpc
+  skipped_targets+=(dwarf-cloudsc-gpu-scc-cuf dwarf-cloudsc-gpu-scc-cuf-k-caching)
+  skipped_targets+=(dwarf-cloudsc-loki-scc-cuf-hoist dwarf-cloudsc-loki-scc-cuf-parametrise)
+  skipped_targets+=(dwarf-cloudsc-cuda dwarf-cloudsc-cuda-hoist dwarf-cloudsc-cuda-k-caching)
   # Skip C target if built with nvhpc, segfaults for unknown reasons
-  skipped_targets+=(dwarf-cloudsc-c)
+  skipped_targets+=(dwarf-cloudsc-c dwarf-cloudsc-loki-c)
 fi
 
 exit_code=0

diff --git a/.github/scripts/verify-targets.sh b/.github/scripts/verify-targets.sh
@@ -22,6 +22,15 @@ then
   then
     targets+=(dwarf-cloudsc-gpu-claw)
   fi
+  if [[ "$cuda_flag" == "--with-cuda" ]]
+  then
+    targets+=(dwarf-cloudsc-gpu-scc-cuf dwarf-cloudsc-gpu-scc-cuf-k-caching)
+    targets+=(dwarf-cloudsc-gpu-scc-field)
+  fi
+  if [[ "$cuda_flag" == "--with-cuda" && "$io_library_flag" == "--with-serialbox" ]]
+  then
+      targets+=(dwarf-cloudsc-cuda dwarf-cloudsc-cuda-hoist dwarf-cloudsc-cuda-k-caching)
+  fi
 fi
 
 if [[ "$loki_flag" == "--with-loki" ]]
@@ -36,6 +45,10 @@ then
   then
     targets+=(dwarf-cloudsc-loki-claw-cpu dwarf-cloudsc-loki-claw-gpu)
   fi
+  if [[ "$cuda_flag" == "--with-cuda" ]]
+  then
+    targets+=(dwarf-cloudsc-loki-scc-cuf-hoist dwarf-cloudsc-loki-scc-cuf-parametrise)
+  fi
 fi
 
 #

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -38,6 +38,8 @@ jobs:
 
         gpu_flag: ['', '--with-gpu']  # GPU-variants enabled
 
+        cuda_flag: ['']  # Enable CUDA variants
+
         loki_flag: ['', '--with-loki']  # Loki source-to-source translation enabled
 
         claw_flag: ['']  # Flag to enable CLAW-generated variants
@@ -49,11 +51,15 @@ jobs:
             mpi_flag: ''
             prec_flag: ''
             gpu_flag: '--with-gpu'
+            cuda_flag: '--with-cuda'
+            loki_flag: '--with-loki'
           - arch: github/ubuntu/nvhpc/21.9
             io_library_flag: '--with-serialbox'
             mpi_flag: ''
             prec_flag: ''
             gpu_flag: '--with-gpu'
+            cuda_flag: '--with-cuda'
+            loki_flag: '--with-loki'
 
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
@@ -99,14 +105,15 @@ jobs:
           ./cloudsc-bundle build --retry-verbose \
           --arch=arch/${{ matrix.arch }} ${{ matrix.prec_flag }} \
           ${{ matrix.mpi_flag }} ${{ matrix.io_library_flag }} ${{ matrix.gpu_flag }} \
-          ${{ matrix.claw_flag}} ${{ matrix.loki_flag }}
+          ${{ matrix.claw_flag}} ${{ matrix.loki_flag }} ${{ matrix.cuda_flag }}
 
       # Verify targets exist
       - name: Verify targets
         env:
           io_library_flag: ${{ matrix.io_library_flag }}
           prec_flag: ${{ matrix.prec_flag }}
           gpu_flag: ${{ matrix.gpu_flag }}
+          cuda_flag: ${{ matrix.cuda_flag }}
           loki_flag: ${{ matrix.loki_flag }}
           claw_flag: ${{ matrix.claw_flag }}
         run: .github/scripts/verify-targets.sh

diff --git a/AUTHORS.md b/AUTHORS.md
@@ -16,6 +16,7 @@
 - Z. Piotrowski (ECMWF)
 - B. Reuter (ECMWF)
 - D. Salmond (ECMWF)
+- M. Staneker (ECMWF)
 - M. Tiedtke (ECMWF)
 - A. Tompkins (ECMWF)
 - S. Ubbiali (ETH Zuerich)

diff --git a/README.md b/README.md
@@ -60,6 +60,29 @@ Balthasar Reuter ([email protected])
   move parameter structures to constant memory. To enable this variant,
   a suitable CUDA installation is required and the `--with-cuda` flag
   needs to be passed at the build stage.
+- **dwarf-cloudsc-gpu-scc-cuf-k-caching**: GPU-enabled and further
+  optimized version of CLOUDSC that uses the SCC loop layout in
+  combination with loop fusion and temporary local array demotion, implemented
+  using CUDA-Fortran (CUF). To enable this variant,
+  a suitable CUDA installation is required and the `--with-cuda` flag
+  needs to be passed at the build stage.
+- **CUDA C prototypes**: To enable these variants, a suitable 
+  CUDA installation is required and the `--with-cuda` flag needs
+  to be passed at the build stage.
+  - **dwarf-cloudsc-cuda**: GPU-enabled, CUDA C version of CLOUDSC.
+  - **dwarf-cloudsc-cuda-hoist**: GPU-enabled, optimized CUDA C version
+    of CLOUDSC including host side hoisted temporary local variables.
+  - **dwarf-cloudsc-cuda-k-caching**: GPU-enabled, further optimized CUDA
+    C version of CLOUDSC including loop fusion and temporary local
+    array demotion.
+- **dwarf-cloudsc-gpu-scc-field**: GPU-enabled and optimized version of
+  CLOUDSC that uses the SCC loop layout, and a dedicated Fortran FIELD
+  API to manage device offload and copyback. The intent is to demonstrate
+  the explicit use of pinned host memory to speed up data transfers, as
+  provided by the shipped prototype implementation, and investigate the
+  effect of different data storage allocation layouts. To enable this
+  variant, a suitable CUDA installation is required and the
+  `--with-cuda` flag needs to be passed at the build stage.
 
 ## Download and Installation
 
@@ -249,6 +272,27 @@ srun bash -c "CUDA_VISIBLE_DEVICES=\$SLURM_LOCALID bin/dwarf-cloudsc-gpu-scc-hoi
 
 In principle, the same should work for multi-node execution (`-N 2`, `-N 4` etc.) once interconnect issues are resolved.
 
+### GPU runs: Timing device kernels and data transfers
+
+For GPU-enabled runs two internal timer results are reported:
+
+* The isolated compute time of the main compute kernel on device (where `#BLKS == 1`)
+* The overall time of the execution loop including data offload and copyback
+
+It is important to note that due to the nature of the kernel, data
+transfer overheads will dominate timings, and that most supported GPU
+variants aim to optimise compute kernel timings only. However, a
+dedicated variant `dwarf-cloudsc-gpu-scc-field` has been added to
+explore host-side memory pinning, which improves data transfer times,
+and alternative data layout strategies. By default, this will allocate
+each array variable individually in pinned memory. A runtime flag
+`CLOUDSC_PACKED_STORAGE=ON` can be used to enable "packed" storage,
+where multiple arrays are stored in a single base allocation, e.g.
+
+```sh
+NV_ACC_CUDA_HEAPSIZE=8G CLOUDSC_PACKED_STORAGE=ON ./bin/dwarf-cloudsc-gpu-scc-field 1 80000 128
+```
+
 ## Loki transformations for CLOUDSC
 
 [Loki](https://github.com/ecmwf-ifs/loki) is an in-house developed

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-1.3.0
+1.4.0
diff --git a/arch/toolchains/ecmwf-hpc2020-nvhpc.cmake b/arch/toolchains/ecmwf-hpc2020-nvhpc.cmake
@@ -37,6 +37,14 @@ set( OpenACC_Fortran_FLAGS "-acc=gpu -mp=gpu -gpu=cc80,lineinfo,fastmath" CACHE
 # Enable this to get more detailed compiler output
 # set( OpenACC_Fortran_FLAGS "${OpenACC_Fortran_FLAGS} -Minfo" )
 
+####################################################################
+# CUDA FLAGS
+####################################################################
+
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+  set(CMAKE_CUDA_ARCHITECTURES 80)
+endif()
+
 ####################################################################
 # COMMON FLAGS
 ####################################################################

diff --git a/arch/toolchains/ecmwf-volta-pgi-gpu.cmake b/arch/toolchains/ecmwf-volta-pgi-gpu.cmake
@@ -50,6 +50,8 @@ set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -Ktrap=fp")
 set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -Kieee")
 set(ECBUILD_Fortran_FLAGS "${ECBUILD_Fortran_FLAGS} -Mdaz")
 
+set(ECBUILD_Fortran_LINK_FLAGS "-gpu=pinned")
+
 set( ECBUILD_Fortran_FLAGS_BIT "-O2 -gopt" )
 
 set( ECBUILD_C_FLAGS "-O2 -gopt -traceback" )

diff --git a/benchmark/include/include_patternset.yml b/benchmark/include/include_patternset.yml
@@ -158,45 +158,45 @@ patternset:
 
   - name: timing_pattern
     pattern:
-      - {name: thr_time,   type: int, _: '(?:$jube_pat_nint\s+){6}:\s+$jube_pat_int\s+$jube_pat_nint\s+@\s+(?:rank#$jube_pat_nint:)?core#'} #$jube_pat_nint'}  # C-version doesn't print core number?
-      - {name: thr_mflops, type: int, _: '(?:$jube_pat_nint\s+){6}:\s+$jube_pat_nint\s+$jube_pat_int\s+@\s+(?:rank#$jube_pat_nint:)?core#'} #$jube_pat_nint'}
-      - {name: rnk_time,   type: int, _: '(?:$jube_pat_nint\s+){6}:\s+$jube_pat_int\s+$jube_pat_nint\s+:\s+TOTAL\s@\srank#$jube_pat_nint'}
-      - {name: rnk_mflops, type: int, _: '(?:$jube_pat_nint\s+){6}:\s+$jube_pat_nint\s+$jube_pat_int\s+:\s+TOTAL\s@\srank#$jube_pat_nint'}
-      - {name: tot_time,   type: int, _: '(?:$jube_pat_nint\s*x\s*)?(?:$jube_pat_nint\s+){6}:\s+$jube_pat_int\s+$jube_pat_nint\s+(?::\s+)?TOTAL(?!\s@)'}
-      - {name: tot_mflops, type: int, _: '(?:$jube_pat_nint\s*x\s*)?(?:$jube_pat_nint\s+){6}:\s+$jube_pat_nint\s+$jube_pat_int\s+(?::\s+)?TOTAL(?!\s@)'}
-      - {name: tot_nproc,  type: int, _: '$jube_pat_int\s*x\s*(?:$jube_pat_nint\s+){6}:\s+(?:$jube_pat_nint\s+){2}(?::\s+)?TOTAL(?!\s@)'}
-      - {name: tot_numomp, type: int, _: '(?:$jube_pat_nint\s*x\s*)?$jube_pat_int\s+(?:$jube_pat_nint\s+){5}:\s+(?:$jube_pat_nint\s+){2}(?::\s+)?TOTAL(?!\s@)'}
-      - {name: tot_ngptot, type: int, _: '(?:$jube_pat_nint\s*x\s*)?$jube_pat_nint\s+$jube_pat_int\s+(?:$jube_pat_nint\s+){4}:\s+(?:$jube_pat_nint\s+){2}(?::\s+)?TOTAL(?!\s@)'}
-      - {name: tot_ngpblks,type: int, _: '(?:$jube_pat_nint\s*x\s*)?(?:$jube_pat_nint\s+){3}$jube_pat_int\s+(?:$jube_pat_nint\s+){2}:\s+(?:$jube_pat_nint\s+){2}(?::\s+)?TOTAL(?!\s@)'}
-      - {name: tot_nproma, type: int, _: '(?:$jube_pat_nint\s*x\s*)?(?:$jube_pat_nint\s+){4}$jube_pat_int\s+$jube_pat_nint\s+:\s+(?:$jube_pat_nint\s+){2}(?::\s+)?TOTAL(?!\s@)'}
+      - {name: thr_time,   type: int, _: '(?:$jube_pat_nint\s+){6}:\s+$jube_pat_int\s+(?:$jube_pat_nint\s+){2}@\s+(?:rank#$jube_pat_nint:)?core#'} #$jube_pat_nint'}  # C-version doesn't print core number?
+      - {name: thr_mflops, type: int, _: '(?:$jube_pat_nint\s+){6}:\s+$jube_pat_nint\s+$jube_pat_int\s+$jube_pat_nint\s+@\s+(?:rank#$jube_pat_nint:)?core#'} #$jube_pat_nint'}
+      - {name: rnk_time,   type: int, _: '(?:$jube_pat_nint\s+){6}:\s+$jube_pat_int\s+(?:$jube_pat_nint\s+){2}:\s+TOTAL\s@\srank#$jube_pat_nint'}
+      - {name: rnk_mflops, type: int, _: '(?:$jube_pat_nint\s+){6}:\s+$jube_pat_nint\s+$jube_pat_int\s+$jube_pat_nint\s+:\s+TOTAL\s@\srank#$jube_pat_nint'}
+      - {name: tot_time,   type: int, _: '(?:$jube_pat_nint\s*x\s*)?(?:$jube_pat_nint\s+){6}:\s+$jube_pat_int\s+(?:$jube_pat_nint\s+){2}(?::\s+)?TOTAL(?!\s@)'}
+      - {name: tot_mflops, type: int, _: '(?:$jube_pat_nint\s*x\s*)?(?:$jube_pat_nint\s+){6}:\s+$jube_pat_nint\s+$jube_pat_int\s+$jube_pat_nint\s+(?::\s+)?TOTAL(?!\s@)'}
+      - {name: tot_nproc,  type: int, _: '$jube_pat_int\s*x\s*(?:$jube_pat_nint\s+){6}:\s+(?:$jube_pat_nint\s+){3}(?::\s+)?TOTAL(?!\s@)'}
+      - {name: tot_numomp, type: int, _: '(?:$jube_pat_nint\s*x\s*)?$jube_pat_int\s+(?:$jube_pat_nint\s+){5}:\s+(?:$jube_pat_nint\s+){3}(?::\s+)?TOTAL(?!\s@)'}
+      - {name: tot_ngptot, type: int, _: '(?:$jube_pat_nint\s*x\s*)?$jube_pat_nint\s+$jube_pat_int\s+(?:$jube_pat_nint\s+){4}:\s+(?:$jube_pat_nint\s+){3}(?::\s+)?TOTAL(?!\s@)'}
+      - {name: tot_ngpblks,type: int, _: '(?:$jube_pat_nint\s*x\s*)?(?:$jube_pat_nint\s+){3}$jube_pat_int\s+(?:$jube_pat_nint\s+){2}:\s+(?:$jube_pat_nint\s+){3}(?::\s+)?TOTAL(?!\s@)'}
+      - {name: tot_nproma, type: int, _: '(?:$jube_pat_nint\s*x\s*)?(?:$jube_pat_nint\s+){4}$jube_pat_int\s+$jube_pat_nint\s+:\s+(?:$jube_pat_nint\s+){3}(?::\s+)?TOTAL(?!\s@)'}
 
-      # NUMOMP    NGPTOT  #GP-cols     #BLKS    NPROMA tid# : Time(msec)  MFlops/s
-      #      8     16384      2048       128        16    0 :        295       866 @ core#22
-      #      8     16384      2048       128        16    1 :        284       899 @ core#4
-      #      8     16384      2048       128        16    2 :        282       905 @ core#16
-      #      8     16384      2048       128        16    3 :        239      1067 @ core#1
-      #      8     16384      2048       128        16    4 :        261       975 @ core#2
-      #      8     16384      2048       128        16    5 :        266       959 @ core#3
-      #      8     16384      2048       128        16    6 :        267       955 @ core#21
-      #      8     16384      2048       128        16    7 :        273       934 @ core#23
-      #      8     16384     16384      1024        16   -1 :        295      6931 : TOTAL
+      # NUMOMP    NGPTOT  #GP-cols     #BLKS    NPROMA tid# : Time(msec)  MFlops/s  col/s
+      #      8     16384      2048       128        16    0 :        295       866   1320  @ core#22
+      #      8     16384      2048       128        16    1 :        284       899   1320  @ core#4
+      #      8     16384      2048       128        16    2 :        282       905   1320  @ core#16
+      #      8     16384      2048       128        16    3 :        239      1067   1320  @ core#1
+      #      8     16384      2048       128        16    4 :        261       975   1320  @ core#2
+      #      8     16384      2048       128        16    5 :        266       959   1320  @ core#3
+      #      8     16384      2048       128        16    6 :        267       955   1320  @ core#21
+      #      8     16384      2048       128        16    7 :        273       934   1320  @ core#23
+      #      8     16384     16384      1024        16   -1 :        295      6931   1320  : TOTAL
 
       # NUMPROC=8, NUMOMP=1, NGPTOTG=16384, NPROMA=16, NGPBLKS=128
-      # NUMOMP    NGPTOT  #GP-cols     #BLKS    NPROMA tid# : Time(msec)  MFlops/s
-      #      1      2048      2048       128        16    0 :        237      1075 @ rank#0:core#20
-      #      1      2048      2048       128        16   -1 :        237      1075 : TOTAL @ rank#0
-      #      1      2048      2048       128        16    0 :        230      1109 @ rank#1:core#11
-      #      1      2048      2048       128        16   -1 :        230      1109 : TOTAL @ rank#1
-      #      1      2048      2048       128        16    0 :        281       906 @ rank#2:core#6
-      #      1      2048      2048       128        16   -1 :        281       906 : TOTAL @ rank#2
-      #      1      2048      2048       128        16    0 :        254      1002 @ rank#3:core#24
-      #      1      2048      2048       128        16   -1 :        254      1002 : TOTAL @ rank#3
-      #      1      2048      2048       128        16    0 :        271       940 @ rank#4:core#3
-      #      1      2048      2048       128        16   -1 :        271       940 : TOTAL @ rank#4
-      #      1      2048      2048       128        16    0 :        249      1025 @ rank#5:core#25
-      #      1      2048      2048       128        16   -1 :        249      1025 : TOTAL @ rank#5
-      #      1      2048      2048       128        16    0 :        235      1086 @ rank#6:core#1
-      #      1      2048      2048       128        16   -1 :        235      1086 : TOTAL @ rank#6
-      #      1      2048      2048       128        16    0 :        243      1050 @ rank#7:core#15
-      #      1      2048      2048       128        16   -1 :        243      1050 : TOTAL @ rank#7
-      #  8 x 1     16384     16384      1024        16   -1 :        281      8193 : TOTAL
+      # NUMOMP    NGPTOT  #GP-cols     #BLKS    NPROMA tid# : Time(msec)  MFlops/s  col/s
+      #      1      2048      2048       128        16    0 :        237      1075   1320  @ rank#0:core#20
+      #      1      2048      2048       128        16   -1 :        237      1075   1320  : TOTAL @ rank#0
+      #      1      2048      2048       128        16    0 :        230      1109   1320  @ rank#1:core#11
+      #      1      2048      2048       128        16   -1 :        230      1109   1320  : TOTAL @ rank#1
+      #      1      2048      2048       128        16    0 :        281       906   1320  @ rank#2:core#6
+      #      1      2048      2048       128        16   -1 :        281       906   1320  : TOTAL @ rank#2
+      #      1      2048      2048       128        16    0 :        254      1002   1320  @ rank#3:core#24
+      #      1      2048      2048       128        16   -1 :        254      1002   1320  : TOTAL @ rank#3
+      #      1      2048      2048       128        16    0 :        271       940   1320  @ rank#4:core#3
+      #      1      2048      2048       128        16   -1 :        271       940   1320  : TOTAL @ rank#4
+      #      1      2048      2048       128        16    0 :        249      1025   1320  @ rank#5:core#25
+      #      1      2048      2048       128        16   -1 :        249      1025   1320  : TOTAL @ rank#5
+      #      1      2048      2048       128        16    0 :        235      1086   1320  @ rank#6:core#1
+      #      1      2048      2048       128        16   -1 :        235      1086   1320  : TOTAL @ rank#6
+      #      1      2048      2048       128        16    0 :        243      1050   1320  @ rank#7:core#15
+      #      1      2048      2048       128        16   -1 :        243      1050   1320  : TOTAL @ rank#7
+      #  8 x 1     16384     16384      1024        16   -1 :        281      8193   1320  : TOTAL
diff --git a/bundle.yml b/bundle.yml
@@ -64,6 +64,8 @@ options :
         cmake : >
             ENABLE_CUDA=ON
             ENABLE_CLOUDSC_GPU_SCC_CUF=ON
+            ENABLE_CLOUDSC_GPU_SCC_CUF_K_CACHING=ON
+            ENABLE_CLOUDSC_GPU_SCC_FIELD=ON
 
     - with-mpi :
         help  : Enable MPI-parallel kernel
@@ -104,6 +106,10 @@ options :
         help  : Build the C version of CLOUDSC [ON|OFF]
         cmake : ENABLE_CLOUDSC_C={{value}}
 
+    - cloudsc-cuda :
+        help  : Build the CUDA C version of CLOUDSC [ON|OFF]
+        cmake : ENABLE_CLOUDSC_CUDA={{value}}
+
     - cloudsc-gpu-claw :
         help  : Build the deprecated CLAW-based GPU version CLOUDSC  [ON|OFF]
         cmake : ENABLE_CLOUDSC_GPU_CLAW={{value}}

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -10,5 +10,6 @@ add_subdirectory(prototype1)
 add_subdirectory(common)
 add_subdirectory(cloudsc_fortran)
 add_subdirectory(cloudsc_c)
+add_subdirectory(cloudsc_cuda)
 add_subdirectory(cloudsc_gpu)
 add_subdirectory(cloudsc_loki)
diff --git a/src/cloudsc_c/CMakeLists.txt b/src/cloudsc_c/CMakeLists.txt
@@ -22,8 +22,11 @@ if( HAVE_CLOUDSC_C )
         INSTALL_HEADERS LISTED
         SOURCES
             cloudsc/yoecldp_c.h
+            cloudsc/yoecldp_c.c
             cloudsc/yoethf_c.h
+            cloudsc/yoethf_c.c
             cloudsc/yomcst_c.h
+            cloudsc/yomcst_c.c
             cloudsc/load_state.h
             cloudsc/load_state.c
             cloudsc/cloudsc_c.h
@@ -64,7 +67,10 @@ if( HAVE_CLOUDSC_C )
         COMMAND bin/dwarf-cloudsc-c
         ARGS 4 100 16
         WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/../../..
-        OMP 4
+        # The OpenMP C runtime for some reason doesn't appreciate setting the number of threads
+        # in the application when OMP_NUM_THREADS is set, therefore we disable this here
+        # OMP 4
+        ENVIRONMENT OMP_STACKSIZE=1G
         CONDITION HAVE_OMP
     )
 

diff --git a/src/cloudsc_c/cloudsc/cloudsc_c.h b/src/cloudsc_c/cloudsc/cloudsc_c.h
@@ -8,6 +8,9 @@
  * nor does it submit to any jurisdiction.
  */
 
+#ifndef CLOUDSC_C_H
+#define CLOUDSC_C_H
+
 #include "yomcst_c.h"
 #include "yoethf_c.h"
 #include "yoecldp_c.h"
@@ -24,3 +27,5 @@ int cloudsc_c(int kidia, int kfdia, int klon, int klev, double ptsphy, double *
 	      double * restrict v_pfsqif, double * restrict v_pfcqnng, double * restrict v_pfcqlng, double * restrict v_pfsqrf, double * restrict v_pfsqsf, double * restrict v_pfcqrng,
 	      double * restrict v_pfcqsng, double * restrict v_pfsqltur, double * restrict v_pfsqitur, double * restrict v_pfplsl, double * restrict v_pfplsn, double * restrict v_pfhpsl,
 	      double * restrict v_pfhpsn);
+
+#endif