diff --git a/.github/workflows/nemo_tests.yml b/.github/workflows/nemo_tests.yml index 1ef75f6fcb..089c54bb3a 100644 --- a/.github/workflows/nemo_tests.yml +++ b/.github/workflows/nemo_tests.yml @@ -130,6 +130,8 @@ jobs: module load perl/${PERL_VERSION} make clean export NEMOV4=1 # Enables specific NEMOV4 exclusions in the PSyclone transformation script + export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" + export REPRODUCIBLE=1 make -j ${NUM_PARALLEL} openmp_gpu make -j ${NUM_PARALLEL} compile-openmp_gpu export NV_ACC_POOL_THRESHOLD=75 @@ -171,27 +173,39 @@ jobs: - name: NEMO MetOffice OpenACC loops for GPU id: nemo_acc_loops run: | + source /archive/psyclone-spack/psyclone-spack-Jun25/spack-repo/share/spack/setup-env.sh + spack unload && spack load nemo-build-environment%nvhpc . .runner_venv/bin/activate + + # Set up envvars export PSYCLONE_NEMO_DIR=${GITHUB_WORKSPACE}/examples/nemo/scripts export PROFILE_HOME=${GITHUB_WORKSPACE}/lib/profiling/nvidia/ export NEMO_DIR=${PREFIX}/UKMO-NEMOv4 - cd $PSYCLONE_NEMO_DIR - module load nvidia-hpcsdk/${NVFORTRAN_VERSION} - module load hdf5/${HDF5_VERSION} netcdf-c/${NETCDF_C_VERSION} netcdf-fortran/${NETCDF_FORTRAN_VERSION} - module load perl/${PERL_VERSION} - make clean - make -j ${NUM_PARALLEL} openacc_loops - COMPILER_ARCH=linux_nvidia_acc_gpu make -j ${NUM_PARALLEL} compile-openacc_loops + export TEST_DIR=SPITZ12_ACC_LOOPS_NVHPC + export PSYCLONE_COMPILER=$MPIF90 + export MPIF90=psyclonefc + export PARALLEL_DIRECTIVES="acc_offloading" + export REPRODUCIBLE=1 + export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py" + export FCFLAGS="-i4 -Mr8 -O2 -Mnovect -Mnofma -g -acc -mp=gpu -gpu=mem:managed,math_uniform" + export NEMOV4=1 # Enables specific NEMOV4 exclusions in the PSyclone transformation script + + # Compile + cd ${PREFIX}/UKMO-NEMOv4 + cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm + rm -rf cfgs/${TEST_DIR} + ./makenemo -r SPITZ12 -m 
linux_spack -n ${TEST_DIR} \ + add_key "IEEE_IS_NAN=ieee_is_nan key_nosignedzero" \ + del_key "key_iomput key_mpp_mpi key_si3" -j ${NUM_PARALLEL} + + # Run test export NV_ACC_POOL_THRESHOLD=75 export CUDA_VISIBLE_DEVICES=1 - make run-openacc_loops - # Check the output is as expected (TODO #2895: improve numerical reproducibility) - make output-openacc_loops | grep -q " it : 10" || (echo "Error: 'it : 10' not found!" & false) - make output-openacc_loops | grep -q "|ssh|_max: 0.259483" || (echo "Error: '|ssh|_max: 0.259483' not found!" & false) - make output-openacc_loops | grep -q "|U|_max: 0.458515" || (echo "Error: '|U|_max: 0.458515' not found!" & false) - make output-openacc_loops | grep -q "S_min: 0.482686" || (echo "Error: 'S_min: 0.482686' not found!" & false) - make output-openacc_loops | grep -q "S_max: 0.407622" || (echo "Error: 'S_max: 0.407622' not found!" & false) - export VAR_TIME=$(grep -A 1 "Elapsed Time" <(make -s time-openacc_loops) | head -n 2 | tail -n 1 | awk '{print $1}') + cd cfgs/${TEST_DIR}/EXP00/ + ln -sf /archive/psyclone-tests/nemo-inputs/UKMO-eORCA1_GO8_NEMOv4/*.nc . + ./nemo + diff run.stat $PSYCLONE_NEMO_DIR/KGOs/run.stat.nemo4.spitz12.nvhpc.10steps + export VAR_TIME=$(grep -A 1 "Elapsed Time" timing.output | head -n 2 | tail -n 1 | awk '{print $1}') echo "time=${VAR_TIME}" >> "${GITHUB_OUTPUT}" # PSyclone, compile and run ECMWF NEMO with OpenMP for CPUs. 
This uses @@ -205,21 +219,25 @@ jobs: source .runner_venv/bin/activate # Set up envvars + export TEST_DIR=SPITZ12_ACC_LOOPS_NVHPC export PSYCLONE_NEMO_DIR=${GITHUB_WORKSPACE}/examples/nemo/scripts export PSYCLONE_COMPILER=$MPIF90 export MPIF90=psyclonefc - export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/omp_cpu_trans.py -I ${MPI_HOME}/include" + export PARALLEL_DIRECTIVES="omp_threading" + export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py -I ${MPI_HOME}/include" export FCFLAGS="-i4 -r8 -O2 -heap-arrays -fp-model=precise -g -qopenmp" export NEMOV4=1 # Enables specific NEMOV4 exclusions in the PSyclone transformation script # Compile cd ${PREFIX}/ECMWF-NEMOv4 - ./makenemo -r SPITZ12 -m linux_spack -n SPITZ12_psyclone \ + cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm + rm -rf cfgs/${TEST_DIR} + ./makenemo -r SPITZ12 -m linux_spack -n ${TEST_DIR} \ add_key "IEEE_IS_NAN=ieee_is_nan key_nosignedzero" \ del_key "key_iomput" -j ${NUM_PARALLEL} # Run NEMO - cd cfgs/SPITZ12_psyclone/EXP00/ + cd cfgs/${TEST_DIR}/EXP00/ ln -sf /archive/psyclone-tests/nemo-inputs/ECMWF-eORCA1_GO8/* . 
export OMP_NUM_THREADS=12 ./nemo @@ -240,6 +258,8 @@ jobs: module load perl/${PERL_VERSION} make clean export NEMOV4=1 # Enables specific NEMOV4 exclusions in the PSyclone transformation script + export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" + export REPRODUCIBLE=1 export ASYNC_PARALLEL=1 make -j ${NUM_PARALLEL} openmp_gpu make -j ${NUM_PARALLEL} compile-openmp_gpu diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index 77fc44b06e..5077a3fbaa 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -107,7 +107,7 @@ jobs: # Set up FCM: PATHs are loaded from SPACK, we only need to set the FCFLAGS cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm - export FCFLAGS="-fdefault-real-8 -O2 -fcray-pointer -ffree-line-length-none -g" + export FCFLAGS="-fdefault-real-8 -O2 -mno-fma -fno-tree-vectorize -fcray-pointer -ffree-line-length-none -g" # Clean up and compile rm -rf tests/${TEST_DIR} @@ -202,8 +202,9 @@ jobs: cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm export PSYCLONE_COMPILER=$MPIF90 export MPIF90=psyclonefc - export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/omp_cpu_trans.py" - export FCFLAGS="-fdefault-real-8 -O2 -fcray-pointer -ffree-line-length-none -g -fopenmp" + export PARALLEL_DIRECTIVES="omp_threading" + export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py" + export FCFLAGS="-fdefault-real-8 -O2 -mno-fma -fno-tree-vectorize -fcray-pointer -ffree-line-length-none -g -fopenmp" # Clean up and compile rm -rf tests/${TEST_DIR} @@ -245,7 +246,8 @@ jobs: export REPRODUCIBLE=1 export PSYCLONE_COMPILER=$MPIF90 export MPIF90=psyclonefc - export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py" + export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" + export PSYCLONE_OPTS="--enable-cache -l output -s 
${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py" # Clean up and compile rm -rf tests/${TEST_DIR} ./makenemo -r BENCH -m linux_spack_profile -n ${TEST_DIR} -j ${NUM_PARALLEL} -v 1 @@ -296,12 +298,13 @@ jobs: cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" + export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" export REPRODUCIBLE=1 # Clean up and compile rm -rf cfgs/${TEST_DIR} - ./makenemo -r GOSI10p0.0_like_eORCA1 -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ - -j ${NUM_PARALLEL} -v 1 + ./makenemo -r GOSI10p0.0_like_eORCA1 -m linux_spack -n ${TEST_DIR} \ + -p ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py -j ${NUM_PARALLEL} -v 1 # Run test cd $NEMO_DIR/cfgs/${TEST_DIR}/EXP00 @@ -309,7 +312,7 @@ jobs: # Make sure mpi has been built with cuda support ompi_info --parsable --all | grep mpi_built_with_cuda_support:value # Run with round robin allocations of GPUs to MPI ranks - mpirun -n 2 sh -c 'CUDA_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK ./nemo' + OMP_NUM_THREADS=4 mpirun -n 2 sh -c 'CUDA_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK ./nemo' diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.orca1.nvhpc.10steps run.stat export VAR_TIME=$(grep "local MPI proces" timing.output | head -n 1 | awk '{print $5}' | tr -d s) echo "time=${VAR_TIME}" >> "${GITHUB_OUTPUT}" @@ -331,11 +334,13 @@ jobs: cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" + export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" export REPRODUCIBLE=1 # Clean up and compile rm -rf cfgs/${TEST_DIR} - ./makenemo -r ORCA2_ICE_PISCES -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ + ./makenemo -r ORCA2_ICE_PISCES -m linux_spack -n ${TEST_DIR} \ + -p ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py \ add_key 
"key_mpi_off key_nosignedzero" -j ${NUM_PARALLEL} -v 1 # Run test @@ -374,12 +379,13 @@ jobs: export ENABLE_PROFILING=1 # We compile with "-O2 -Mnofma -Mnovect -gpu=math_uniform" to permit comparison of the results. export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" + export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" export REPRODUCIBLE=1 export ASYNC_PARALLEL=1 # Clean up and compile rm -rf tests/${TEST_DIR} - ./makenemo -r BENCH -m linux_spack_profile -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ - -j ${NUM_PARALLEL} -v 1 + ./makenemo -r BENCH -m linux_spack_profile -n ${TEST_DIR} \ + -p ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py -j ${NUM_PARALLEL} -v 1 # Run reproducible test cd $NEMO_DIR/tests/${TEST_DIR}/EXP00 @@ -398,8 +404,8 @@ jobs: rm -rf tests/${TEST_DIR} export NV_ACC_POOL_THRESHOLD=75 export CUDA_VISIBLE_DEVICES=1 - ./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ - -j ${NUM_PARALLEL} -v 1 + ./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} \ + -p ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py -j ${NUM_PARALLEL} -v 1 # Run non-reproducible test cd $NEMO_DIR/tests/${TEST_DIR}/EXP00 cp $PSYCLONE_NEMO_DIR/KGOs/namelist_cfg_bench_small namelist_cfg @@ -423,13 +429,15 @@ jobs: # We compile at "-O2 -Mnofma -Mnovect -gpu=math_uniform" to permit comparison of the results. 
cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm + export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" export REPRODUCIBLE=1 export ASYNC_PARALLEL=1 # Clean up and compile rm -rf cfgs/${TEST_DIR} - ./makenemo -r ORCA2_ICE_PISCES -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ + ./makenemo -r ORCA2_ICE_PISCES -m linux_spack -n ${TEST_DIR} \ + -p ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py \ add_key "key_mpi_off key_nosignedzero" -j ${NUM_PARALLEL} -v 1 # Run test @@ -459,14 +467,15 @@ jobs: # We compile at "-O2 -Mnofma -Mnovect -gpu=math_uniform" to permit comparison of the results. cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm + export PARALLEL_DIRECTIVES="omp_offloading" export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" export REPRODUCIBLE=1 export ASYNC_PARALLEL=1 # Clean up and compile rm -rf cfgs/${TEST_DIR} - ./makenemo -r GOSI10p0.0_like_eORCA1 -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ - -j ${NUM_PARALLEL} -v 1 + ./makenemo -r GOSI10p0.0_like_eORCA1 -m linux_spack -n ${TEST_DIR} \ + -p ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py -j ${NUM_PARALLEL} -v 1 # Run test cd $NEMO_DIR/cfgs/${TEST_DIR}/EXP00 diff --git a/examples/nemo/README.md b/examples/nemo/README.md index 0ed3998b1a..8faad4da7b 100644 --- a/examples/nemo/README.md +++ b/examples/nemo/README.md @@ -34,6 +34,7 @@ POSSIBILITY OF SUCH DAMAGE. Author A. R. Porter, STFC Daresbury Lab Modified by R. W. Ford, STFC Daresbury Lab Modified by J. Henrichs, Bureau of Meteorology +Modified by S. 
Siso, STFC Daresbury Lab --> @@ -54,63 +55,8 @@ Contains: ## Scripts -Contains the scripts used to process the NEMO code base and to add profiling -instrumentation (https://psyclone.readthedocs.io/en/latest/user_guide/profiling.html) -and OpenACC or OpenMP directives: - -1. `process_nemo.py` is a driver script that allows the user to specify - which files to process with PSyclone, the transformation script to use - and where to put the outputs: - - $ ./process_nemo.py -h - usage: process_nemo.py [-h] [-o OUT_DIR] [-s SCRIPT_FILE] [-x] - input_file [input_file ...] - - Process the specified NEMO source files using PSyclone - - positional arguments: - input_file One or more NEMO pre-processed source files - - optional arguments: - -h, --help show this help message and exit - -o OUT_DIR Destination directory for processed source files - -s SCRIPT_FILE PSyclone transformation script - -x exit immediately if PSyclone fails - -p add profiling instrumentation to the PROFILE_ONLY file - list. Note that files processed by the SCRIPT_FILE may - be introducing profiling instrumentation as part of - that script. - - In addition to the command-line flags, the script itself contains two - variables that may be used to control its behaviour: - - - `EXCLUDED_FILES`: list of filenames that PSyclone will not attempt to process. - - `PROFILE_ONLY`: list of filenames to add profiling instrumentation but - do not attempt to further process by PSyclone. - - Finally, the precise invocation to use when running PSyclone may be - specified by setting the `PSYCLONE` environment variable. If this is not set - then `psyclone` must be in the user's PATH. - -2. PSyclone transformation scripts: - - `kernels_trans.py` adds OpenACC kernel directives and places fine-grained - profiling instrumentation around any regions that haven't had OpenACC - added. - - `omp_cpu_trans.py` adds OpenMP directives for CPU threading parallelism. 
- - `omp_gpu_trans.py` adds OpenMP offloading directives for GPU acceleration. -  -These scripts are a *work in progress* and are being developed to work on the -MO_GO8 configuration of NEMO supplied by the Met Office. This configuration is -based on version 4.0.2 of NEMO and is compiled using: - -    ./makenemo -n MO_GO8_GPU -r SPITZ12 -m linux_nvfortran_gpu \ -        del_key "key_iomput key_mpp_mpi" add_key "key_nosignedzero" - -(where you will need an `arch/arch-linux_nvfortran_gpu.fcm` FCM configuration -file specifying how to use the NVIDIA compiler). - -If you are applying PSyclone to any other version or configuration of NEMO then -these scripts should serve as a useful starting point. +Contains a collection of example scripts and the instructions to process the NEMO code. These +are tested in our integration tests against NEMOv4.0.2 and NEMOv5.0.   ## Example 1  diff --git a/examples/nemo/eg1/Makefile b/examples/nemo/eg1/Makefile index d113cdfda2..d3473525c5 100644 --- a/examples/nemo/eg1/Makefile +++ b/examples/nemo/eg1/Makefile @@ -40,8 +40,8 @@ include ../../common.mk  transform: 	${PSYCLONE} -s ./openmp_cpu_levels_trans.py ../code/tra_adv.F90 	${PSYCLONE} -s ./openmp_gpu_levels_trans.py ../code/tra_adv.F90 -	${PSYCLONE} -s ../scripts/omp_cpu_trans.py ../code/tra_adv.F90 -	${PSYCLONE} -s ../scripts/omp_gpu_trans.py ../code/tra_adv.F90 +	PARALLEL_DIRECTIVES="omp_threading" ${PSYCLONE} -s ../scripts/insert_loop_parallelism.py ../code/tra_adv.F90 +	PARALLEL_DIRECTIVES="omp_offloading" ${PSYCLONE} -s ../scripts/insert_loop_parallelism.py ../code/tra_adv.F90  compile: transform 	@echo "No compilation supported for nemo/eg1"  diff --git a/examples/nemo/eg2/Makefile b/examples/nemo/eg2/Makefile index 65517ce33b..cb2b2b3c61 100644 --- a/examples/nemo/eg2/Makefile +++ b/examples/nemo/eg2/Makefile @@ -43,8 +43,8 @@ transform: omp_levels  omp_levels: 	${PSYCLONE} -s ./omp_levels_trans.py ../code/traldf_iso.F90 -	${PSYCLONE} -s ../scripts/omp_cpu_trans.py ../code/traldf_iso.F90 - 
${PSYCLONE} -s ../scripts/omp_gpu_trans.py ../code/traldf_iso.F90 +	PARALLEL_DIRECTIVES="omp_threading" ${PSYCLONE} -s ../scripts/insert_loop_parallelism.py ../code/traldf_iso.F90 +	PARALLEL_DIRECTIVES="omp_offloading" ${PSYCLONE} -s ../scripts/insert_loop_parallelism.py ../code/traldf_iso.F90  compile: transform diff --git a/examples/nemo/scripts/KGOs/arch-linux_spack.fcm b/examples/nemo/scripts/KGOs/arch-linux_spack.fcm index 94943258f3..b2b23e196e 100644 --- a/examples/nemo/scripts/KGOs/arch-linux_spack.fcm +++ b/examples/nemo/scripts/KGOs/arch-linux_spack.fcm @@ -9,7 +9,7 @@ %NCDF_INC -I${NCDF_F_HOME}/include -I${NCDF_C_HOME}/include -I${HDF5_HOME}/include %NCDF_LIB -L${NCDF_F_HOME}/lib -lnetcdff -L${NCDF_C_HOME}/lib -lnetcdf -%CPP cpp -Dkey_nosignedzero +%CPP cpp -Dkey_nosignedzero -I${MPI_HOME}/include %FC ${MPIF90} -c %FCFLAGS ${FCFLAGS} %FFLAGS %FCFLAGS diff --git a/examples/nemo/scripts/KGOs/arch-linux_spack_profile.fcm b/examples/nemo/scripts/KGOs/arch-linux_spack_profile.fcm index 2e6c8df745..151861e342 100644 --- a/examples/nemo/scripts/KGOs/arch-linux_spack_profile.fcm +++ b/examples/nemo/scripts/KGOs/arch-linux_spack_profile.fcm @@ -12,9 +12,7 @@ %PROFILE_INC -I${PROFILING_DIR} %PROFILE_LIB -L${PROFILING_DIR} -lnvtx_prof -L${CUDA_HOME}/lib64 -cuda -lnvToolsExt - - -%CPP cpp -Dkey_nosignedzero +%CPP cpp -Dkey_nosignedzero -I${MPI_HOME}/include %FC ${MPIF90} -c %FCFLAGS ${FCFLAGS} %FFLAGS %FCFLAGS diff --git a/examples/nemo/scripts/KGOs/run.stat.bench.gfortran.small.10steps b/examples/nemo/scripts/KGOs/run.stat.bench.gfortran.small.10steps index 0f9fc042a7..b0f8f90060 100644 --- a/examples/nemo/scripts/KGOs/run.stat.bench.gfortran.small.10steps +++ b/examples/nemo/scripts/KGOs/run.stat.bench.gfortran.small.10steps @@ -1,10 +1,10 @@ - it : 1 |ssh|_max: 0.2336851764570053D+01 |U|_max: 0.7053248015576865D-02 |V|_max: 0.2308346115751064D-02 S_min: 0.2996908779321225D+02 S_max: 0.3101392941293399D+02 - it : 2 |ssh|_max: 0.3739162010288019D+01 |U|_max: 
0.1029843199699749D-01 |V|_max: 0.9493800242777233D-02 S_min: 0.2996911000748410D+02 S_max: 0.3101392863147436D+02 - it : 3 |ssh|_max: 0.4224443521973881D+01 |U|_max: 0.1349433227267360D-01 |V|_max: 0.2284885234302297D-01 S_min: 0.2996913553478158D+02 S_max: 0.3101392784904396D+02 - it : 4 |ssh|_max: 0.4659313564999673D+01 |U|_max: 0.1490637483763983D-01 |V|_max: 0.4048444554221592D-01 S_min: 0.2996916129319160D+02 S_max: 0.3101392717586670D+02 - it : 5 |ssh|_max: 0.4949503007019537D+01 |U|_max: 0.1145357177492709D-01 |V|_max: 0.5364770396337241D-01 S_min: 0.2996918706295251D+02 S_max: 0.3101392651315916D+02 - it : 6 |ssh|_max: 0.5140472974504101D+01 |U|_max: 0.1064859943349832D-01 |V|_max: 0.6818865538920099D-01 S_min: 0.2996921262561763D+02 S_max: 0.3101392594056642D+02 - it : 7 |ssh|_max: 0.5229361171698524D+01 |U|_max: 0.7814316351518531D-02 |V|_max: 0.8358086738712590D-01 S_min: 0.2996923864577586D+02 S_max: 0.3101392538498657D+02 - it : 8 |ssh|_max: 0.5220719217849657D+01 |U|_max: 0.1141515836387377D-01 |V|_max: 0.9761604183740114D-01 S_min: 0.2996926417117689D+02 S_max: 0.3101392490495336D+02 - it : 9 |ssh|_max: 0.5145297949564463D+01 |U|_max: 0.1416399592481803D-01 |V|_max: 0.1152759253498275D+00 S_min: 0.2996929035879930D+02 S_max: 0.3101392444612747D+02 - it : 10 |ssh|_max: 0.4979557010366619D+01 |U|_max: 0.1986785874281591D-01 |V|_max: 0.1303543987480588D+00 S_min: 0.2996931641421843D+02 S_max: 0.3101392405137850D+02 + it : 1 |ssh|_max: 0.2336851764570087D+01 |U|_max: 0.7053248015579857D-02 |V|_max: 0.2308346115756259D-02 S_min: 0.2996908779321225D+02 S_max: 0.3101392941293399D+02 + it : 2 |ssh|_max: 0.3739162010287973D+01 |U|_max: 0.1029843199698906D-01 |V|_max: 0.9493800242775713D-02 S_min: 0.2996911000748410D+02 S_max: 0.3101392863147436D+02 + it : 3 |ssh|_max: 0.4224443521974239D+01 |U|_max: 0.1349433227265986D-01 |V|_max: 0.2284885234301404D-01 S_min: 0.2996913553478157D+02 S_max: 0.3101392784904396D+02 + it : 4 |ssh|_max: 0.4659313564999622D+01 
|U|_max: 0.1490637483762341D-01 |V|_max: 0.4048444554220138D-01 S_min: 0.2996916129319160D+02 S_max: 0.3101392717586671D+02 + it : 5 |ssh|_max: 0.4949503007019767D+01 |U|_max: 0.1145357177490677D-01 |V|_max: 0.5364770396337813D-01 S_min: 0.2996918706295251D+02 S_max: 0.3101392651315916D+02 + it : 6 |ssh|_max: 0.5140472974504293D+01 |U|_max: 0.1064859943349158D-01 |V|_max: 0.6818865538921454D-01 S_min: 0.2996921262561763D+02 S_max: 0.3101392594056643D+02 + it : 7 |ssh|_max: 0.5229361171698655D+01 |U|_max: 0.7814316351505392D-02 |V|_max: 0.8358086738711774D-01 S_min: 0.2996923864577587D+02 S_max: 0.3101392538498657D+02 + it : 8 |ssh|_max: 0.5220719217849857D+01 |U|_max: 0.1141515836389672D-01 |V|_max: 0.9761604183737865D-01 S_min: 0.2996926417117689D+02 S_max: 0.3101392490495337D+02 + it : 9 |ssh|_max: 0.5145297949564862D+01 |U|_max: 0.1416399592482473D-01 |V|_max: 0.1152759253497909D+00 S_min: 0.2996929035879930D+02 S_max: 0.3101392444612748D+02 + it : 10 |ssh|_max: 0.4979557010366737D+01 |U|_max: 0.1986785874282448D-01 |V|_max: 0.1303543987480547D+00 S_min: 0.2996931641421842D+02 S_max: 0.3101392405137852D+02 diff --git a/examples/nemo/scripts/Makefile b/examples/nemo/scripts/Makefile index c1828e0e8a..9ebeb09634 100644 --- a/examples/nemo/scripts/Makefile +++ b/examples/nemo/scripts/Makefile @@ -61,16 +61,16 @@ psycloned-passthrough/%.f90: ${ROOT_SRC}%.f90 psycloned-passthrough psyclone -s passthrough.py -l output -I ${ROOT_SRC} -I ${MPI_INC_DIR} -o $@ $< psycloned-openmp_cpu/%.f90: ${ROOT_SRC}%.f90 psycloned-openmp_cpu - psyclone -s omp_cpu_trans.py -l output -I ${ROOT_SRC} -I ${MPI_INC_DIR} -o $@ $< + psyclone -s insert_loop_parallelism.py -l output -I ${ROOT_SRC} -I ${MPI_INC_DIR} -o $@ $< psycloned-openmp_gpu/%.f90: ${ROOT_SRC}%.f90 psycloned-openmp_gpu - psyclone -s omp_gpu_trans.py -l output -I ${ROOT_SRC} -I ${MPI_INC_DIR} -o $@ $< + psyclone -s insert_loop_parallelism.py -l output -I ${ROOT_SRC} -I ${MPI_INC_DIR} -o $@ $< psycloned-openacc_kernels/%.f90: 
${ROOT_SRC}%.f90 psycloned-openacc_kernels psyclone -s acc_kernels_trans.py -l output -I ${ROOT_SRC} -I ${MPI_INC_DIR} -o $@ $< psycloned-openacc_loops/%.f90: ${ROOT_SRC}%.f90 psycloned-openacc_loops - psyclone -s acc_loops_trans.py -l output -I ${ROOT_SRC} -o $@ $< + psyclone -s insert_loop_parallelism.py -l output -I ${ROOT_SRC} -o $@ $< # Get the number of Makefile parallel jobs to pass it to the makenemo MAKE_PID := $(shell echo $$PPID) diff --git a/examples/nemo/scripts/README.md b/examples/nemo/scripts/README.md new file mode 100644 index 0000000000..6eb7945a5f --- /dev/null +++ b/examples/nemo/scripts/README.md @@ -0,0 +1,223 @@ + + +# PSyclone NEMO Examples + +This directory contains various examples showing how to apply PSyclone to +transform the source code of the NEMO ocean model. + +> [!Important] +> The NEMO build system, `makenemo`, has the ability to apply PSyclone +> scripts that come with the NEMO repository with the `-p` flag (see +> [the NEMO user guide](https://sites.nemo-ocean.io/user-guide/psyclone.html)), +> but these are pinned to a particular release of PSyclone and have constraints +> defined in `mk/sct_psyclone.sh` script. By contrast, the process presented in +> this README uses the experimental `psyclonefc` compiler wrapper command which +> bypasses the `makenemo -p` and instead intercepts any compilation command and +> wraps it with a PSyclone call followed by a compiler call. +> This is the recommended way to apply upstream PSyclone transformations, as it +> is not constrained by the file-exclusions and backward compatibility guarantees +> of the scripts inside the NEMO repository. 
+ +## Downloading the NEMO source and data files + +To test the examples you can download NEMO and its input data as follows: +```bash +git clone https://forge.nemo-ocean.eu/nemo/nemo.git --branch 5.0 --single-branch +wget https://gws-access.jasmin.ac.uk/public/nemo/sette_inputs/r5.0.0/ORCA2_ICE_v5.0.0.tar.gz +tar -xzf ORCA2_ICE_v5.0.0.tar.gz +``` + +The examples have been tested with NEMOv4.0.2 (SPITZ12 configuration) and +NEMOv5.0 (BENCH and ORCA2_ICE_PISCES configurations), but we aim to support +any version of NEMO. If you encounter any issues applying these examples +please report them to the authors. + + +## Set up environment variables + +In order to provide a flexible system that works with different directives and +compilers we provide a parameterised transformation script +`insert_loop_parallelism.py` and a parameterised NEMO arch file +`KGOs/arch-linux_spack.fcm`, both with multiple environment variables that need +to be adjusted depending on your desired optimisation target. + +First of all, the arch file has an `MPIF90` variable to choose the compiler; this +needs to be set to `psyclonefc`. This is a compiler wrapper utility that +substitutes its calls with: an invocation to PSyclone to process the given +source file (using the options provided in `PSYCLONE_OPTS`) followed by an +invocation to a compiler (provided by `PSYCLONE_COMPILER`). + +For example, to apply the `insert_loop_parallelism.py` and compile it with +`mpif90` we can use the following set up: + +```bash +export MPIF90=psyclonefc +export PSYCLONE_COMPILER=mpif90 +export PSYCLONE_OPTS="-l output -s ${PSYCLONE_NEMO_EXAMPLES_DIR}/insert_loop_parallelism.py" +``` + +This transformation script is in turn parameterised with a `PARALLEL_DIRECTIVES` +variable that has to be consistently set up with the chosen `FCFLAGS` flags. 
+ +For instance, for the `nvfortran` compiler, you can choose between: +- Serial transformations with no parallel directives +```bash +export PARALLEL_DIRECTIVES="" +export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g" +``` + +- Inserting OpenMP CPU threading parallelism +```bash +export PARALLEL_DIRECTIVES="omp_threading" +export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp" +``` + +- Inserting OpenMP GPU offloading with reproducible build flags +```bash +export PARALLEL_DIRECTIVES="omp_offloading" +export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" +export REPRODUCIBLE=1 +``` + +- Inserting OpenACC GPU offloading with reproducible build flags (-mp=gpu is needed for reproducibility) +```bash +export PARALLEL_DIRECTIVES="acc_offloading" +export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -acc=gpu -mp=gpu -gpu=mem:managed,math_uniform" +export REPRODUCIBLE=1 +``` + +- Hybrid directives (what cannot be offloaded falls back to threading) and fast GPU flags +```bash +unset REPRODUCIBLE +export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" +export FCFLAGS="-i4 -Mr8 -O3 -mp=gpu -gpu=mem:managed" +``` + +> [!Note] +> Currently, NEMOv4 and NEMOv5 take different optimisation paths, so it is +> important to also set: +> +> ```bash +> export NEMOV4=1 +> ``` +> when applying the transformations to NEMOv4. + +TODO: Mention `ASYNC_PARALLEL`, `ENABLE_INLINING`, `PROFILING` + +## Compiling and running the application + +Once the environment variables are set, use the `makenemo` command with +the desired NEMO configuration and keys. For example: + +```bash +./makenemo -r ORCA2_ICE_PISCES -m linux_spack -n ORCA2_psycloned ... +``` + +If everything worked you will see PSyclone generated files in the +`<config>/BLD/tmp` directory and the final binary in the +`<config>/EXP00` directory. + +You can run this binary using the appropriate command from the configuration +and inserted programming model. 
For example, for a hybrid +MPI+OMP offloading+OMP threading you can do: + +```bash +# Prepare problem +ln -sf ${ORCA2_INPUTS}/ORCA2_ICE_v5.0.0/* cfgs/ORCA2_psycloned/EXP00/. +cd cfgs/ORCA2_psycloned/EXP00 +# Reduce num of iterations and add timing/runstat +sed -i "s/nn_itend.*/nn_itend = 10/" namelist_cfg +sed -i "s/ln_icebergs.*/ln_icebergs = .false./" namelist_cfg +sed -i "s/\&namctl.*/\&namctl\n ln_timing = .true. \n sn_cfctl%l_runstat = .true.\n/" namelist_cfg +# Run problem +OMP_NUM_THREADS=4 CUDA_VISIBLE_DEVICES=1,2 mpirun -n 2 ./nemo +``` + +## Identifying the cause of issues + +A difficulty of working with code-transformation scripts is that it is possible +to incorrectly transform a file's semantics while still creating valid Fortran. + +This means that the transformation will succeed and the generated code will +compile, but the results will diverge. This gets more complicated with parallel +programming because certain operations like reductions or atomics are not +always reproducible. Therefore, to understand what causes the results to diverge +it is useful to apply the transformations step-by-step while checking if the +`run.stat` values change. Some useful steps are: + +- Start by building NEMO *without* `psyclonefc` and with conservative optimisation flags + and run it serially (O2, no vectorisation, no-fma). Then store the generated `run.stat`. +- Then switch to using `psyclonefc` with the `PSYCLONE_OPTS="-s passthrough.py"`, + this will make PSyclone process all files but without applying any + transformations. Check if the results still match. +- Then build it with `PSYCLONE_OPTS="-s insert_loop_parallelism.py"` but keeping + the `PARALLEL_DIRECTIVES=""` empty. This will apply serial transformations but + no directives yet. +- Then run it with `REPRODUCIBLE=1 PARALLEL_DIRECTIVES="omp_threading" PSYCLONE_OPTS="-s insert_loop_parallelism.py"` + and see if the results still match. 
+- Finally, run it with `REPRODUCIBLE=1 PARALLEL_DIRECTIVES="omp_offloading" PSYCLONE_OPTS="-s insert_loop_parallelism.py"` + +Alongside finding which step is causing the divergence we may want to find +which file(s) are causing it. This folder also contains a `do_file_by_file.sh` +script that builds NEMO many times, each with only one file being transformed, +and compares the results with the stored `run.stat`. + + +## Tuning the generated implementation + +Since this is now a two-step process, there are two locations where you can modify +files that will alter the output result. The first is manually modifying the original +source code. For this we recommend using the built-in `makenemo` functionality +that allows you to point to a directory with patched source files: + +```bash +./makenemo -e <src_dir> ... +``` + +In addition to the source, you can also modify the recipe that PSyclone uses to +transform the code. In this example you can do so by changing any detail of the +`insert_loop_parallelism.py` transformation script, but the `FILES_TO_SKIP` +global variable is particularly relevant as it allows PSyclone to skip processing +the listed files. If modifying a particular file is known to cause problems or +performance regressions, include it in this list. + +You can also do both. For example if you want to provide a modified file that +already includes directives, you need to reference it with the `-e <src_dir>` flag +and add it to `FILES_TO_SKIP` (otherwise PSyclone would ignore the given directives +and try to insert its own). This is currently the optimal approach for `seaice` +and `lbclnk.f90` GPU offloading. 
diff --git a/examples/nemo/scripts/acc_loops_trans.py b/examples/nemo/scripts/acc_loops_trans.py deleted file mode 100755 index c7271fc48c..0000000000 --- a/examples/nemo/scripts/acc_loops_trans.py +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/env python -# ----------------------------------------------------------------------------- -# BSD 3-Clause License -# -# Copyright (c) 2023-2026, Science and Technology Facilities Council. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. 
-# ----------------------------------------------------------------------------- -# Authors: S. Siso, STFC Daresbury Lab - -''' PSyclone transformation script showing the introduction of OpenACC loop -directives into Nemo code. ''' - -from utils import ( - insert_explicit_loop_parallelism, normalise_loops, add_profiling, - NOT_PERFORMANT, NEMO_MODULES_TO_IMPORT) -from psyclone.psyir.nodes import Routine -from psyclone.transformations import ( - ACCParallelTrans, ACCLoopTrans, ACCRoutineTrans) - -# Enable the insertion of profiling hooks during the transformation script -PROFILING_ENABLED = True - -# Whether to chase the imported modules to improve symbol information (it can -# also be a list of module filenames to limit the chasing to only specific -# modules). This has to be used in combination with '-I' command flag in order -# to point to the module location directory. We also strongly recommend using -# the '--enable-cache' flag to reduce the performance overhead. -RESOLVE_IMPORTS = NEMO_MODULES_TO_IMPORT - -# List of all files that psyclone will skip processing -FILES_TO_SKIP = NOT_PERFORMANT - - -def trans(psyir): - ''' Add OpenACC Parallel and Loop directives to all loops, including the - implicit ones, to parallelise the code and execute it in an acceleration - device. - - :param psyir: the PSyIR of the provided file. 
- :type psyir: :py:class:`psyclone.psyir.nodes.FileContainer` - - ''' - acc_region_trans = ACCParallelTrans(default_present=False) - acc_loop_trans = ACCLoopTrans() - - # TODO #2317: Has structure accesses that can not be offloaded and has - # a problematic range to loop expansion of (1:1) - if psyir.name.startswith("obs_"): - print("Skipping", psyir.name) - return - - for subroutine in psyir.walk(Routine): - print(f"Transforming subroutine: {subroutine.name}") - - if PROFILING_ENABLED: - add_profiling(subroutine.children) - - # S-0074-Illegal number or type of arguments to ubound [and lbound] - # - keyword argument array; and NVFORTRAN-S-0082-Illegal substring - # expression for variable filtide - if subroutine.name in ("bdytide_init", "sbc_cpl_init"): - print("Skipping", subroutine.name) - continue - - # OpenACC fails in the following routines with the Compiler error: - # Could not find allocated-variable index for symbol - xxx - # This all happen on characters arrays, e.g. cd_nat - if subroutine.name in ("lbc_nfd_2d_ptr", "lbc_nfd_3d_ptr", - "lbc_nfd_4d_ptr", "bdy_dyn", "dia_obs_init"): - print("Skipping", subroutine.name) - continue - - normalise_loops( - subroutine, - hoist_local_arrays=True, - convert_array_notation=True, - convert_range_loops=True, - hoist_expressions=True - ) - - # These are functions that are called from inside parallel regions, - # annotate them with 'acc routine' - if subroutine.name.lower().startswith("sign_"): - ACCRoutineTrans().apply(subroutine) - print(f"Marked {subroutine.name} as GPU-enabled") - continue - - insert_explicit_loop_parallelism( - subroutine, - region_directive_trans=acc_region_trans, - loop_directive_trans=acc_loop_trans, - # Collapse is necessary to give GPUs enough parallel items - collapse=True, - ) diff --git a/examples/nemo/scripts/insert_loop_parallelism.py b/examples/nemo/scripts/insert_loop_parallelism.py new file mode 100755 index 0000000000..035e6e0714 --- /dev/null +++ 
b/examples/nemo/scripts/insert_loop_parallelism.py @@ -0,0 +1,345 @@ +#!/usr/bin/env python +# ----------------------------------------------------------------------------- +# BSD 3-Clause License +# +# Copyright (c) 2021-2026, Science and Technology Facilities Council. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# ----------------------------------------------------------------------------- +# Authors: S. 
Siso, STFC Daresbury Lab + +''' PSyclone transformation script showing the introduction of OpenMP for GPU +directives into Nemo code. ''' + +import os +import sys +from utils import ( + add_profiling, inline_calls, insert_explicit_loop_parallelism, + normalise_loops, NEMO_MODULES_TO_IMPORT) +from psyclone.psyir.nodes import Routine, Loop +from psyclone.psyir.transformations import ( + OMPTargetTrans, OMPDeclareTargetTrans) +from psyclone.transformations import ( + OMPLoopTrans, TransformationError) +from psyclone.transformations import ( + ACCParallelTrans, ACCLoopTrans, ACCRoutineTrans) + + +# This environment variable informs if this is targeting NEMOv4 +NEMOV4 = os.environ.get('NEMOV4', False) + +# This environment variable informs which parallelisation directives to use +# It supports acc_offloading, omp_offloading and omp_threading +# They can be combined, e.g PARALLEL_DIRECTIVES='omp_offloading+omp_threading', +# or use none to just apply the serial transformations +PARALLEL_DIRECTIVES = os.environ.get('PARALLEL_DIRECTIVES', '') + +# By default, allow optimisations that may change the results, e.g. reductions, +# offloading intrinsics without math_uniform, ... +REPRODUCIBLE = os.environ.get('REPRODUCIBLE', False) + +# This environment variable informs if profiling hooks have to be inserted. +PROFILING_ENABLED = os.environ.get('ENABLE_PROFILING', False) + +# By default, we don't do module inlining as it's still under development. +INLINING_ENABLED = os.environ.get('ENABLE_INLINING', False) + +# This environment variable informs if we're enabling asynchronous +# parallelism. +ASYNC_PARALLEL = os.environ.get('ASYNC_PARALLEL', False) + +# Whether to chase the imported modules to improve symbol information (it can +# also be a list of module filenames to limit the chasing to only specific +# modules). This has to be used in combination with '-I' command flag in order +# to point to the module location directory. 
We also strongly recommend using +# the '--enable-cache' flag to reduce the performance overhead. +RESOLVE_IMPORTS = NEMO_MODULES_TO_IMPORT + +# List of all files that psyclone will skip processing +FILES_TO_SKIP = [] + +# There files are skipped because transforming them degrade the performance +SKIP_FOR_PERFORMANCE = [ + "iom.f90", + "iom_nf90.f90", + "iom_def.f90", + "timing.f90", + "histcom.f90", +] + +# These files change the results from the baseline when psyclone adds +# parallelisation directives +PARALLELISATION_ISSUES = [] + +# These files change the results from the baseline when psyclone adds +# offloading directives +OFFLOADING_ISSUES = [] + +if not NEMOV4: + FILES_TO_SKIP.extend([ + # Fail in nvfortran when enabling seaice + "icefrm.f90", # Has unsupported implicit symbol declaration + ]) + + SKIP_FOR_PERFORMANCE.extend([ + "lbclnk.f90", + ]) + + PARALLELISATION_ISSUES.extend([ + "ldfc1d_c2d.f90", + "tramle.f90", + "traqsr.f90", + ]) + + OFFLOADING_ISSUES.extend([ + # Produces different output results + "zdftke.f90", + # The following issues only affect BENCH (because ice is enabled?) + # Runtime Error: Illegal address during kernel execution + "trcrad.f90", + # nvhpc > 24.11 - Signal 11 issues + "icerst.f90", # When enabling ice* parallelisation + "trcbbl.f90", + "trabbc.f90", + "bdyice.f90", + "sedfunc.f90", + "stpmlf.f90", + "trddyn.f90", + "trczdf.f90", + "trcice_pisces.f90", + "dtatsd.f90", + "trcatf.f90", + "stp2d.f90", + ]) + + # if "acc_offloading" in PARALLEL_DIRECTIVES: + # OFFLOADING_ISSUES.extend([ + # # Fail in OpenACC ORCA2_ICE_PISCES + # "dynzdf.f90", + # "trabbl.f90", + # "trazdf.f90", + # "zdfsh2.f90", + # ]) + +ASYNC_ISSUES = [ + # Runtime Error: (CUDA_ERROR_LAUNCH_FAILED): Launch failed + # (often invalid pointer dereference) in get_cstrgsurf + "sbcclo.f90", + "trcldf.f90", + # Runtime Error: Illegal address during kernel execution with + # asynchronicity. 
+ "zdfiwm.f90", + "zdfsh2.f90", + # Diverging results with asynchronicity + "traadv_fct.f90", + "bdy_oce.f90", +] + + +def select_transformations(): + ''' + Use the PARALLEL_DIRECTIVES global to select what specific transformations + to apply to insert the desired directives. + ''' + process_directives = PARALLEL_DIRECTIVES + + if 'omp_offloading' in process_directives: + offload_region_trans = OMPTargetTrans() + mark_for_gpu_trans = OMPDeclareTargetTrans() + if NEMOV4: + # TODO #2895: Explore why loop/teams loop diverge for NEMOv4 + gpu_loop_trans = OMPLoopTrans(omp_schedule="none") + gpu_loop_trans.omp_directive = "loop" + else: + gpu_loop_trans = OMPLoopTrans(omp_schedule="none") + gpu_loop_trans.omp_directive = "teamsloop" + process_directives = process_directives.replace('omp_offloading', '') + elif 'acc_offloading' in process_directives: + offload_region_trans = ACCParallelTrans(default_present=False) + mark_for_gpu_trans = ACCRoutineTrans() + gpu_loop_trans = ACCLoopTrans() + process_directives = process_directives.replace('acc_offloading', '') + else: + offload_region_trans = None + mark_for_gpu_trans = None + gpu_loop_trans = None + + if 'omp_threading' in process_directives: + cpu_loop_trans = OMPLoopTrans(omp_schedule="static") + cpu_loop_trans.omp_directive = "paralleldo" + process_directives = process_directives.replace('omp_threading', '') + else: + cpu_loop_trans = None + + process_directives = process_directives.replace('+', '') + if process_directives != '': + sys.exit(f"Unknown PARALLEL_DIRECTIVES: {process_directives}") + + return (offload_region_trans, mark_for_gpu_trans, + gpu_loop_trans, cpu_loop_trans) + + +def filter_files_by_name(name: str) -> bool: + ''' + :returns: whether to transform a file with the given name. Contrary to + FILES_TO_SKIP, this will still run the files through psyclone. + ''' + # The two options below are useful for file-by-file exhaustive tests. 
+ # If the environment has ONLY_FILE defined, only process that one file and + # known-good files that need a "declare target" inside. + only_file = os.environ.get('ONLY_FILE', False) + if only_file: + files_to_do = [only_file] + if "offloading" in PARALLEL_DIRECTIVES: + files_to_do.extend( + ["lib_fortran.f90", "solfrac_mod.f90", "sbc_phy.f90"]) + if name not in files_to_do: + return True + # If the environment has ALL_BUT_FILE defined, process all files but + # the one named file. + all_but_file = os.environ.get('ALL_BUT_FILE', False) + if all_but_file and name == all_but_file: + return True + + # These work but are skipped to improve performance, they could be in the + # FILES_TO_SKIP global parameter, but in this script, for testing purposes, + # we exclude them here so the PSyclone frontend and backend are still + # tested and it also allows to insert profiling hooks later on. + if name in SKIP_FOR_PERFORMANCE: + return True + + # Parallelising ICE or ICB currently causes a noticeable slowdown + # On nemo_main it can be just: if name.startswith("icethd"): + if not NEMOV4 and name.startswith("ice"): + return True + if name.startswith("icb"): + return True + + # This file fails for gcc NEMOv5 BENCH + if not NEMOV4 and name == "icedyn_rhg_evp.f90": + return True + + return False + + +def trans(psyir): + ''' Normalise and add directives to all possible loops, including the + implicit ones. + + :param psyir: the PSyIR of the provided file. 
+ :type psyir: :py:class:`psyclone.psyir.nodes.FileContainer` + + ''' + if filter_files_by_name(psyir.name): + return + + (offload_region_trans, mark_for_gpu_trans, gpu_loop_trans, + cpu_loop_trans) = select_transformations() + + disable_profiling_for = [] + enable_async = ASYNC_PARALLEL and psyir.name not in ASYNC_ISSUES + privatise_arrays = not (NEMOV4 or "acc" in PARALLEL_DIRECTIVES) + + for subroutine in psyir.walk(Routine): + + # Skip initialisation and diagnostic subroutines + if (subroutine.name.endswith('_alloc') or + subroutine.name.endswith('_init') or + subroutine.name.startswith('init_') or + subroutine.name.startswith('Agrif') or + subroutine.name.startswith('dia_') or + subroutine.name == 'dom_msk' or + subroutine.name == 'dom_zgr' or + subroutine.name == 'dom_ngb'): + continue + + normalise_loops( + subroutine, + hoist_local_arrays=False, + convert_array_notation=True, + # See issue #3022 + loopify_array_intrinsics=psyir.name != "getincom.f90", + convert_range_loops=True, + increase_array_ranks=not NEMOV4, + hoist_expressions=True + ) + + # Perform module-inlining of called routines. 
+ if INLINING_ENABLED: + inline_calls(subroutine) + + # These are functions that are called from inside parallel regions, + # annotate them with 'omp declare target' + if ( + mark_for_gpu_trans and + (subroutine.name.lower().startswith("sign_") + or subroutine.name.lower() == "solfrac" + or (psyir.name == "sbc_phy.f90" and not subroutine.walk(Loop))) + ): + try: + mark_for_gpu_trans.apply(subroutine) + print(f"Marked {subroutine.name} as GPU-enabled") + except TransformationError as err: + print(err) + # We continue parallelising inside the routine, but this could + # change if the parallelisation directives added below are not + # nestable, in that case we could add a 'continue' here + disable_profiling_for.append(subroutine.name) + + elif (psyir.name not in PARALLELISATION_ISSUES + OFFLOADING_ISSUES + and gpu_loop_trans): + print( + f"Adding offload directives to subroutine: {subroutine.name}") + insert_explicit_loop_parallelism( + subroutine, + region_directive_trans=offload_region_trans, + loop_directive_trans=gpu_loop_trans, + collapse=True, + privatise_arrays=privatise_arrays, + enable_reductions=not REPRODUCIBLE, + uniform_intrinsics_only=REPRODUCIBLE, + asynchronous_parallelism=enable_async, + ) + elif psyir.name not in PARALLELISATION_ISSUES and cpu_loop_trans: + # These have issues offloading, but we can still do threading + print(f"Adding OpenMP threading to subroutine: {subroutine.name}") + insert_explicit_loop_parallelism( + subroutine, + loop_directive_trans=cpu_loop_trans, + collapse=False, + privatise_arrays=privatise_arrays, + enable_reductions=not REPRODUCIBLE, + asynchronous_parallelism=enable_async, + ) + + # Iterate again and add profiling hooks when needed + for subroutine in psyir.walk(Routine): + if PROFILING_ENABLED and subroutine.name not in disable_profiling_for: + print(f"Adding profiling hooks to subroutine: {subroutine.name}") + add_profiling(subroutine.children) diff --git a/examples/nemo/scripts/omp_cpu_trans.py 
b/examples/nemo/scripts/omp_cpu_trans.py deleted file mode 100755 index 367e178423..0000000000 --- a/examples/nemo/scripts/omp_cpu_trans.py +++ /dev/null @@ -1,132 +0,0 @@ -#!/usr/bin/env python -# ----------------------------------------------------------------------------- -# BSD 3-Clause License -# -# Copyright (c) 2021-2026, Science and Technology Facilities Council. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -# ----------------------------------------------------------------------------- -# Authors: S. 
Siso, STFC Daresbury Lab - -''' PSyclone transformation script to insert OpenMP for CPU -directives into Nemo code. Tested with ECMWF Nemo 4.0 code. ''' - -import os -from utils import ( - insert_explicit_loop_parallelism, normalise_loops, add_profiling, - PARALLELISATION_ISSUES, NEMO_MODULES_TO_IMPORT) -from psyclone.psyir.nodes import Routine -from psyclone.transformations import OMPLoopTrans - -# Enable the insertion of profiling hooks during the transformation script -PROFILING_ENABLED = False - -# Whether to chase the imported modules to improve symbol information (it can -# also be a list of module filenames to limit the chasing to only specific -# modules). This has to be used in combination with '-I' command flag in order -# to point to the module location directory. We also strongly recommend using -# the '--enable-cache' flag to reduce the performance overhead. -RESOLVE_IMPORTS = NEMO_MODULES_TO_IMPORT - -# A environment variable can inform if this is targeting NEMOv4, in which case -# array privatisation is disabled. -NEMOV4 = os.environ.get('NEMOV4', False) - -# By default, allow optimisations that may change the results, e.g. reductions -REPRODUCIBLE = os.environ.get('REPRODUCIBLE', False) - -# List of all files that psyclone will skip processing -FILES_TO_SKIP = [] -if not NEMOV4: - # TODO #3112: These produce diverging run.stat results in gcc NEMOv5 BENCH - FILES_TO_SKIP = [ - "dynhpg.f90", - "dynspg_ts.f90", - "sbcssm.f90", - "tramle.f90", - "trazdf.f90", - ] - -if PROFILING_ENABLED: - # Fails with profiling enabled. issue #2723 - FILES_TO_SKIP.append("mppini.f90") - - -def trans(psyir): - ''' Add OpenMP Parallel and Do directives to all loops, including the - implicit ones. - - :param psyir: the PSyIR of the provided file. - :type psyir: :py:class:`psyclone.psyir.nodes.FileContainer` - - ''' - # If the environemnt has ONLY_FILE defined, only process that one file and - # nothing else. This is useful for file-by-file exhaustive tests. 
- only_do_file = os.environ.get('ONLY_FILE', False) - if only_do_file and psyir.name != only_do_file: - return - - # Parallelising this file currently causes a noticeable slowdown - if psyir.name.startswith("icethd"): - return - - # This file fails for gcc NEMOv5 BENCH - if not NEMOV4 and psyir.name == "icedyn_rhg_evp.f90": - return - - omp_parallel_trans = None - omp_loop_trans = OMPLoopTrans(omp_schedule="static") - omp_loop_trans.omp_directive = "paralleldo" - - for subroutine in psyir.walk(Routine): - print(f"Adding OpenMP threading to subroutine: {subroutine.name}") - - if PROFILING_ENABLED: - add_profiling(subroutine.children) - - normalise_loops( - subroutine, - hoist_local_arrays=False, - convert_array_notation=True, - # See issue #3022 - loopify_array_intrinsics=psyir.name != "getincom.f90", - convert_range_loops=True, - hoist_expressions=False, - scalarise_loops=False - ) - - if psyir.name not in PARALLELISATION_ISSUES: - insert_explicit_loop_parallelism( - subroutine, - region_directive_trans=omp_parallel_trans, - loop_directive_trans=omp_loop_trans, - collapse=False, - privatise_arrays=not NEMOV4, - enable_reductions=not REPRODUCIBLE, - ) diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py deleted file mode 100755 index 42047274bd..0000000000 --- a/examples/nemo/scripts/omp_gpu_trans.py +++ /dev/null @@ -1,268 +0,0 @@ -#!/usr/bin/env python -# ----------------------------------------------------------------------------- -# BSD 3-Clause License -# -# Copyright (c) 2021-2026, Science and Technology Facilities Council. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. 
-# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -# ----------------------------------------------------------------------------- -# Authors: S. Siso, STFC Daresbury Lab - -''' PSyclone transformation script showing the introduction of OpenMP for GPU -directives into Nemo code. ''' - -import os -from utils import ( - add_profiling, inline_calls, insert_explicit_loop_parallelism, - normalise_loops, PARALLELISATION_ISSUES, NEMO_MODULES_TO_IMPORT) -from psyclone.psyir.nodes import Routine, Loop -from psyclone.psyir.transformations import ( - OMPTargetTrans, OMPDeclareTargetTrans) -from psyclone.transformations import ( - OMPLoopTrans, TransformationError) - - -# This environment variable informs if profiling hooks have to be inserted. 
-PROFILING_ENABLED = os.environ.get('ENABLE_PROFILING', False) - -# By default, we don't do module inlining as it's still under development. -INLINING_ENABLED = os.environ.get('ENABLE_INLINING', False) - -# By default, we allow all device intrinsics (not only the reproducible ones) -REPRODUCIBLE = os.environ.get('REPRODUCIBLE', False) - -# This environment variable informs if this is targeting NEMOv4, in which case -# array privatisation is disabled and some more files excluded -NEMOV4 = os.environ.get('NEMOV4', False) - -# This environment variable informs if we're enabling asynchronous -# parallelism. -ASYNC_PARALLEL = os.environ.get('ASYNC_PARALLEL', False) - -# Whether to chase the imported modules to improve symbol information (it can -# also be a list of module filenames to limit the chasing to only specific -# modules). This has to be used in combination with '-I' command flag in order -# to point to the module location directory. We also strongly recommend using -# the '--enable-cache' flag to reduce the performance overhead. -RESOLVE_IMPORTS = NEMO_MODULES_TO_IMPORT - -# List of all files that psyclone will skip processing -FILES_TO_SKIP = [ - "icefrm.f90", # Has an unsupported implicit symbol declaration -] - -NEMOV5_EXCLUSIONS = [] - -NEMOV4_EXCLUSIONS = [ - "dynspg_ts.f90", - "tranxt.f90", -] - -SKIP_FOR_PERFORMANCE = [ - "iom.f90", - "iom_nf90.f90", - "iom_def.f90", - "timing.f90", - "histcom.f90", -] - -OFFLOADING_ISSUES = [ - # Produces different output results - "zdftke.f90", - # The following issues only affect BENCH (because ice is enabled?) 
- # Runtime Error: Illegal address during kernel execution - "trcrad.f90", - # Signal 11 issues - "trcbbl.f90", - "bdyice.f90", - "sedfunc.f90", - "stpmlf.f90", - "trddyn.f90", - "trczdf.f90", - "trcice_pisces.f90", - "dtatsd.f90", - "trcatf.f90", - "stp2d.f90", - "trabbc.f90", -] - -ASYNC_ISSUES = [ - # TODO #3220: Explore the cause of the async issues - # Runtime Error: (CUDA_ERROR_LAUNCH_FAILED): Launch failed - # (often invalid pointer dereference) in get_cstrgsurf - "sbcclo.f90", - "trcldf.f90", - # Runtime Error: Illegal address during kernel execution with - # asynchronicity. - "zdfiwm.f90", - "zdfsh2.f90", - # Diverging results with asynchronicity - "traadv_fct.f90", -] - - -def trans(psyir): - ''' Add OpenMP Target and Loop directives to all loops, including the - implicit ones, to parallelise the code and execute it in an acceleration - device. - - :param psyir: the PSyIR of the provided file. - :type psyir: :py:class:`psyclone.psyir.nodes.FileContainer` - - ''' - # The two options below are useful for file-by-file exhaustive tests. - # If the environemnt has ONLY_FILE defined, only process that one file and - # known-good files that need a "declare target" inside. - only_do_file = os.environ.get('ONLY_FILE', False) - only_do_files = (only_do_file, "lib_fortran.f90", "solfrac_mod.f90") - if only_do_file and psyir.name not in only_do_files: - return - # If the environemnt has ALL_BUT_FILE defined, process all files but - # the one named file. 
- all_but_file = os.environ.get('ALL_BUT_FILE', False) - if all_but_file and psyir.name == all_but_file: - return - - omp_target_trans = OMPTargetTrans() - if NEMOV4: - # TODO #2895: Explore why loop/teams loop diverge for NEMOv4 - omp_gpu_loop_trans = OMPLoopTrans(omp_schedule="none") - omp_gpu_loop_trans.omp_directive = "loop" - else: - omp_gpu_loop_trans = OMPLoopTrans(omp_schedule="none") - omp_gpu_loop_trans.omp_directive = "teamsloop" - omp_cpu_loop_trans = OMPLoopTrans(omp_schedule="static") - omp_cpu_loop_trans.omp_directive = "paralleldo" - - disable_profiling_for = [] - enable_async = ASYNC_PARALLEL and psyir.name not in ASYNC_ISSUES - - for subroutine in psyir.walk(Routine): - - # The exclusion below could be in the FILES_TO_SKIP global parameter, - # but in this script, for testing purposes, we exclude them here so the - # PSyclone frontend and backend are still tested and it also allows to - # insert profiling hooks later on. - if psyir.name in SKIP_FOR_PERFORMANCE: - continue - if NEMOV4 and psyir.name in NEMOV4_EXCLUSIONS: - continue - if not NEMOV4 and psyir.name in NEMOV5_EXCLUSIONS: - continue - # ICE routines do not perform well on GPU, so we skip them - if psyir.name.startswith("ice"): - continue - # Skip initialisation and diagnostic subroutines - if (subroutine.name.endswith('_alloc') or - subroutine.name.endswith('_init') or - subroutine.name.startswith('Agrif') or - subroutine.name.startswith('dia_') or - subroutine.name == 'dom_msk' or - subroutine.name == 'dom_zgr' or - subroutine.name == 'dom_ngb'): - continue - - normalise_loops( - subroutine, - hoist_local_arrays=False, - convert_array_notation=True, - # See issue #3022 - loopify_array_intrinsics=psyir.name != "getincom.f90", - convert_range_loops=True, - increase_array_ranks=not NEMOV4, - hoist_expressions=True - ) - # Perform module-inlining of called routines. 
- if INLINING_ENABLED: - inline_calls(subroutine) - - # These are functions that are called from inside parallel regions, - # annotate them with 'omp declare target' - if ( - subroutine.name.lower().startswith("sign_") - or subroutine.name.lower() == "solfrac" - or (psyir.name == "sbc_phy.f90" and not subroutine.walk(Loop)) - ): - try: - OMPDeclareTargetTrans().apply(subroutine) - print(f"Marked {subroutine.name} as GPU-enabled") - except TransformationError as err: - print(err) - # We continue parallelising inside the routine, but this could - # change if the parallelisation directives added below are not - # nestable, in that case we could add a 'continue' here - disable_profiling_for.append(subroutine.name) - - if NEMOV4: - # For nemo4 always offload but without privatisation - print(f"Adding OpenMP offloading to subroutine: {subroutine.name}") - insert_explicit_loop_parallelism( - subroutine, - region_directive_trans=omp_target_trans, - loop_directive_trans=omp_gpu_loop_trans, - collapse=True, - privatise_arrays=False, - asynchronous_parallelism=enable_async, - uniform_intrinsics_only=REPRODUCIBLE, - enable_reductions=not REPRODUCIBLE - ) - elif psyir.name not in PARALLELISATION_ISSUES + OFFLOADING_ISSUES: - print(f"Adding OpenMP offloading to subroutine: {subroutine.name}") - insert_explicit_loop_parallelism( - subroutine, - region_directive_trans=omp_target_trans, - loop_directive_trans=omp_gpu_loop_trans, - collapse=True, - asynchronous_parallelism=enable_async, - privatise_arrays=True, - uniform_intrinsics_only=REPRODUCIBLE, - enable_reductions=not REPRODUCIBLE - ) - elif psyir.name not in PARALLELISATION_ISSUES: - # This have issues offloading, but we can still do OpenMP threading - print(f"Adding OpenMP threading to subroutine: {subroutine.name}") - # If asynchronous parallelism is enabled, these subroutines in - # sbcclo.f90 fail if they're parallelised on the CPU. 
- if (ASYNC_PARALLEL and subroutine.name in - ("get_cssrcsurf", "get_cstrgsurf")): - continue - insert_explicit_loop_parallelism( - subroutine, - loop_directive_trans=omp_cpu_loop_trans, - asynchronous_parallelism=enable_async, - privatise_arrays=True, - ) - - # Iterate again and add profiling hooks when needed - for subroutine in psyir.walk(Routine): - if psyir.name in SKIP_FOR_PERFORMANCE: - continue - if PROFILING_ENABLED and subroutine.name not in disable_profiling_for: - print(f"Adding profiling hooks to subroutine: {subroutine.name}") - add_profiling(subroutine.children) diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index 8161eaa16c..565537e0c2 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -201,6 +201,8 @@ def normalise_loops( :param hoist_expressions: whether to hoist bounds and loop invariant statements out of the loop nest. ''' + filename = schedule.root.name + nemo_v4 = os.environ.get('NEMOV4', False) if hoist_local_arrays and schedule.name not in CONTAINS_STMT_FUNCTIONS: # Apply the HoistLocalArraysTrans when possible, it cannot be applied # to files with statement functions because it will attempt to put the @@ -226,17 +228,22 @@ def normalise_loops( print(err.value) if convert_range_loops: - # Convert all array implicit loops to explicit loops - explicit_loops = ArrayAssignment2LoopsTrans() - for assignment in schedule.walk(Assignment): + if schedule.name in ("fld_def",): # TODO #2951: Fix array assignments with dependencies - if schedule.name in ("fld_def",): - continue - try: - explicit_loops.apply( - assignment, options={'verbose': True}) - except TransformationError: - pass + pass + elif nemo_v4 and filename == "dynspg_ts.f90": + # TODO #3256: Is there an issue with the L/UBOUND intrinsics + # that this transformation adds? 
+ pass + else: + # Convert all array implicit loops to explicit loops + explicit_loops = ArrayAssignment2LoopsTrans() + for assignment in schedule.walk(Assignment): + try: + explicit_loops.apply( + assignment, options={'verbose': True}) + except TransformationError: + pass if scalarise_loops: # Apply scalarisation to every loop. Execute this in reverse order diff --git a/src/psyclone/psyir/nodes/structure_reference.py b/src/psyclone/psyir/nodes/structure_reference.py index 9983849b5a..fff61cbb05 100644 --- a/src/psyclone/psyir/nodes/structure_reference.py +++ b/src/psyclone/psyir/nodes/structure_reference.py @@ -348,8 +348,11 @@ def _get_cursor_shape(cursor, cursor_type): if not isinstance(cursor_type, (UnresolvedType, UnsupportedType)): # Once we've hit an Unresolved/UnsupportedType the cursor_type # will remain set to that as we can't do any better. - cursor_type = cursor_type.components[ - cursor.name.lower()].datatype + try: + cursor_type = cursor_type.components[ + cursor.name.lower()].datatype + except KeyError: + return UnresolvedType() try: cursor_shape = _get_cursor_shape(cursor, cursor_type) except NotImplementedError: diff --git a/src/psyclone/tests/psyir/nodes/structure_reference_test.py b/src/psyclone/tests/psyir/nodes/structure_reference_test.py index 6d4311a7ab..612c6b9f99 100644 --- a/src/psyclone/tests/psyir/nodes/structure_reference_test.py +++ b/src/psyclone/tests/psyir/nodes/structure_reference_test.py @@ -263,6 +263,11 @@ def test_struc_ref_datatype(): sref0 = nodes.StructureReference.create(ssym0, ["nx"]) assert sref0.datatype == symbols.INTEGER_TYPE + # If the type component is not found (e.g. 
it is inherited, which psyclone + # does not support), return UnresolvedType + sref = nodes.StructureReference.create(ssym0, ["not_specified"]) + assert sref.datatype == symbols.UnresolvedType() + # Symbol with type defined by DataTypeSymbol grid_type_symbol = symbols.DataTypeSymbol("grid_type", grid_type) ssym = symbols.DataSymbol("grid", grid_type_symbol) diff --git a/src/psyclone/tests/psyir/transformations/transformations_test.py b/src/psyclone/tests/psyir/transformations/transformations_test.py index bcb9e0176f..8b2d150ef3 100644 --- a/src/psyclone/tests/psyir/transformations/transformations_test.py +++ b/src/psyclone/tests/psyir/transformations/transformations_test.py @@ -123,7 +123,7 @@ def test_accparalleltrans_validate(fortran_reader): ''' Test that ACCParallelTrans validation fails if it contains non-allowed constructs. ''' - omptargettrans = ACCParallelTrans() + accparalleltrans = ACCParallelTrans() code = ''' function myfunc(a) @@ -134,6 +134,8 @@ def test_accparalleltrans_validate(fortran_reader): integer, dimension(10, 10) :: A integer :: i integer :: j + character*8 :: a, b + character :: c(8), d(8) do i = 1, 10 do j = 1, 10 A(i, j) = myfunc(3) @@ -149,35 +151,62 @@ def test_accparalleltrans_validate(fortran_reader): A(i,j) = GET_COMMAND(2) end do end do + do i = 1, 8 + a(i) = b(i) + end do + do i = 1, 8 + c(i) = d(i) + end do end subroutine ''' psyir = fortran_reader.psyir_from_source(code) loops = psyir.walk(Loop, stop_type=Loop) with pytest.raises(TransformationError) as err: - omptargettrans.validate(loops[0]) + accparalleltrans.validate(loops[0]) assert ("'myfunc' is not available on the accelerator device, and " "therefore it cannot be called from within an ACC parallel region." 
in str(err.value)) with pytest.raises(TransformationError) as err: - omptargettrans.validate(loops[1]) + accparalleltrans.validate(loops[1]) assert ("Nodes of type 'CodeBlock' cannot be enclosed by a ACCParallel" "Trans transformation" in str(err.value)) with pytest.raises(TransformationError) as err: - omptargettrans.validate(loops[2]) + accparalleltrans.validate(loops[2]) assert ("'GET_COMMAND' is not available on the default accelerator " "device. Use the 'device_string' option to specify a different " "device." in str(err.value)) with pytest.raises(TransformationError) as err: - omptargettrans.validate(loops[2], options={'device_string': - 'nvfortran-all'}) + accparalleltrans.validate(loops[2], options={'device_string': + 'nvfortran-all'}) assert ("'GET_COMMAND' is not available on the 'nvfortran-all' accelerator" " device. Use the 'device_string' option to specify a different " "device." in str(err.value)) + # Character substrings and no verbose option + with pytest.raises(TransformationError) as err: + accparalleltrans.validate(loops[3]) + assert ("ACCParallelTrans doesn't enclose regions that uses characters, " + "but found: b(i), use the 'allow_strings' transformation option " + "to offload this region." in str(err.value)) + assert loops[3].preceding_comment == "" + + # Character array and verbose option + with pytest.raises(TransformationError) as err: + accparalleltrans.validate(loops[4], options={'verbose': True}) + assert ("ACCParallelTrans doesn't enclose regions that uses characters, " + "but found: c(i), use the 'allow_strings' transformation option " + "to offload this region." 
in str(err.value))
+    assert ("but found: c(i), use the 'allow_strings'"
+            in loops[4].preceding_comment)
+
+    # These validate with the right option
+    accparalleltrans.validate(loops[3], options={'allow_strings': True})
+    accparalleltrans.validate(loops[4], options={'allow_strings': True})
+
 
 def test_accenterdata():
     ''' Generic tests for the ACCEnterDataTrans class '''
diff --git a/src/psyclone/transformations.py b/src/psyclone/transformations.py
index 0315ebba53..39bf6f5442 100644
--- a/src/psyclone/transformations.py
+++ b/src/psyclone/transformations.py
@@ -61,7 +61,7 @@
     ACCDataDirective, ACCDirective, ACCEnterDataDirective, ACCKernelsDirective,
     ACCLoopDirective, ACCParallelDirective, ACCRoutineDirective,
     Call, CodeBlock, Directive, Literal, Loop, Node,
-    Return, Schedule, PSyDataNode, IntrinsicCall)
+    Reference, Return, Schedule, PSyDataNode, IntrinsicCall)
 from psyclone.psyir.nodes.acc_mixins import ACCAsyncMixin
 from psyclone.psyir.nodes.array_mixin import ArrayMixin
 from psyclone.psyir.nodes.omp_directives import (
@@ -1186,9 +1186,18 @@ def validate(self, node_list, options=None):
             avoid using unsupported nodes inside a region.
         :param bool options["default_present"]: this flag controls if the
             inserted directive should include the default_present clause.
+        :param bool options["allow_strings"]: whether to allow the
+            transformation on assignments involving character types. Defaults
+            to False.
+        :param bool options["verbose"]: whether to attach the reason for a
+            failed validation as a preceding comment on the offending node.
+            Defaults to False.
'''
         node_list = self.get_node_list(node_list)
+        verbose = options.get("verbose", False) if options else False
+        device_string = options.get("device_string", "") if options else ""
+        allow_strings = options.get("allow_strings", False) if options else False
         super().validate(node_list, options)
         if options is not None and "default_present" in options:
             if not isinstance(options["default_present"], bool):
@@ -1196,8 +1205,26 @@ def validate(self, node_list, options=None):
                     f"The provided 'default_present' option must be a "
                     f"boolean, but found '{options['default_present']}'."
                 )
-        device_string = options.get("device_string", "") if options else ""
         for node in node_list:
+            if not allow_strings:
+                # Check there are no character assignments in the region
+                for datanode in node.walk((Reference, Literal),
+                                          stop_type=Reference):
+                    dtype = datanode.datatype
+                    # Don't allow CHARACTERS on GPU
+                    if hasattr(dtype, "intrinsic"):
+                        if dtype.intrinsic == ScalarType.Intrinsic.CHARACTER:
+                            message = (
+                                f"ACCParallelTrans doesn't enclose regions "
+                                f"that uses characters, but found: "
+                                f"{datanode.debug_string()}, use the "
+                                f"'allow_strings' transformation option to "
+                                f"offload this region."
+                            )
+                            if verbose:
+                                node.preceding_comment = message
+                            raise TransformationError(message)
+
             for call in node.walk(Call):
                 if not call.is_available_on_device(device_string):
                     if isinstance(call, IntrinsicCall):
@@ -1230,6 +1257,9 @@ def apply(self, target_nodes, options=None):
             avoid using unsupported nodes inside a region.
         :param bool options["default_present"]: this flag controls if the
             inserted directive should include the default_present clause.
+        :param bool options["allow_strings"]: whether to allow the
+            transformation on assignments involving character types. Defaults
+            to False.
 
         '''
         if not options: