diff --git a/.github/workflows/nemo_tests.yml b/.github/workflows/nemo_tests.yml index 1ef75f6fcb..089c54bb3a 100644 --- a/.github/workflows/nemo_tests.yml +++ b/.github/workflows/nemo_tests.yml @@ -130,6 +130,8 @@ jobs: module load perl/${PERL_VERSION} make clean export NEMOV4=1 # Enables specific NEMOV4 exclusions in the PSyclone transformation script + export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" + export REPRODUCIBLE=1 make -j ${NUM_PARALLEL} openmp_gpu make -j ${NUM_PARALLEL} compile-openmp_gpu export NV_ACC_POOL_THRESHOLD=75 @@ -171,27 +173,39 @@ jobs: - name: NEMO MetOffice OpenACC loops for GPU id: nemo_acc_loops run: | + source /archive/psyclone-spack/psyclone-spack-Jun25/spack-repo/share/spack/setup-env.sh + spack unload && spack load nemo-build-environment%nvhpc . .runner_venv/bin/activate + + # Set up envvars export PSYCLONE_NEMO_DIR=${GITHUB_WORKSPACE}/examples/nemo/scripts export PROFILE_HOME=${GITHUB_WORKSPACE}/lib/profiling/nvidia/ export NEMO_DIR=${PREFIX}/UKMO-NEMOv4 - cd $PSYCLONE_NEMO_DIR - module load nvidia-hpcsdk/${NVFORTRAN_VERSION} - module load hdf5/${HDF5_VERSION} netcdf-c/${NETCDF_C_VERSION} netcdf-fortran/${NETCDF_FORTRAN_VERSION} - module load perl/${PERL_VERSION} - make clean - make -j ${NUM_PARALLEL} openacc_loops - COMPILER_ARCH=linux_nvidia_acc_gpu make -j ${NUM_PARALLEL} compile-openacc_loops + export TEST_DIR=SPITZ12_ACC_LOOPS_NVHPC + export PSYCLONE_COMPILER=$MPIF90 + export MPIF90=psyclonefc + export PARALLEL_DIRECTIVES="acc_offloading" + export REPRODUCIBLE=1 + export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py" + export FCFLAGS="-i4 -Mr8 -O2 -Mnovect -Mnofma -g -acc -mp=gpu -gpu=mem:managed,math_uniform" + export NEMOV4=1 # Enables specific NEMOV4 exclusions in the PSyclone transformation script + + # Compile + cd ${PREFIX}/UKMO-NEMOv4 + cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm + rm -rf cfgs/${TEST_DIR} + ./makenemo -r SPITZ12 -m 
linux_spack -n ${TEST_DIR} \ + add_key "IEEE_IS_NAN=ieee_is_nan key_nosignedzero" \ + del_key "key_iomput key_mpp_mpi key_si3" -j ${NUM_PARALLEL} + + # Run test export NV_ACC_POOL_THRESHOLD=75 export CUDA_VISIBLE_DEVICES=1 - make run-openacc_loops - # Check the output is as expected (TODO #2895: improve numerical reproducibility) - make output-openacc_loops | grep -q " it : 10" || (echo "Error: 'it : 10' not found!" & false) - make output-openacc_loops | grep -q "|ssh|_max: 0.259483" || (echo "Error: '|ssh|_max: 0.259483' not found!" & false) - make output-openacc_loops | grep -q "|U|_max: 0.458515" || (echo "Error: '|U|_max: 0.458515' not found!" & false) - make output-openacc_loops | grep -q "S_min: 0.482686" || (echo "Error: 'S_min: 0.482686' not found!" & false) - make output-openacc_loops | grep -q "S_max: 0.407622" || (echo "Error: 'S_max: 0.407622' not found!" & false) - export VAR_TIME=$(grep -A 1 "Elapsed Time" <(make -s time-openacc_loops) | head -n 2 | tail -n 1 | awk '{print $1}') + cd cfgs/${TEST_DIR}/EXP00/ + ln -sf /archive/psyclone-tests/nemo-inputs/UKMO-eORCA1_GO8_NEMOv4/*.nc . + ./nemo + diff run.stat $PSYCLONE_NEMO_DIR/KGOs/run.stat.nemo4.spitz12.nvhpc.10steps + export VAR_TIME=$(grep -A 1 "Elapsed Time" timing.output | head -n 2 | tail -n 1 | awk '{print $1}') echo "time=${VAR_TIME}" >> "${GITHUB_OUTPUT}" # PSyclone, compile and run ECMWF NEMO with OpenMP for CPUs. 
This uses @@ -205,21 +219,25 @@ jobs: source .runner_venv/bin/activate # Set up envvars + export TEST_DIR=SPITZ12_ACC_LOOPS_NVHPC export PSYCLONE_NEMO_DIR=${GITHUB_WORKSPACE}/examples/nemo/scripts export PSYCLONE_COMPILER=$MPIF90 export MPIF90=psyclonefc - export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/omp_cpu_trans.py -I ${MPI_HOME}/include" + export PARALLEL_DIRECTIVES="omp_threading" + export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py -I ${MPI_HOME}/include" export FCFLAGS="-i4 -r8 -O2 -heap-arrays -fp-model=precise -g -qopenmp" export NEMOV4=1 # Enables specific NEMOV4 exclusions in the PSyclone transformation script # Compile cd ${PREFIX}/ECMWF-NEMOv4 - ./makenemo -r SPITZ12 -m linux_spack -n SPITZ12_psyclone \ + cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm + rm -rf cfgs/${TEST_DIR} + ./makenemo -r SPITZ12 -m linux_spack -n ${TEST_DIR} \ add_key "IEEE_IS_NAN=ieee_is_nan key_nosignedzero" \ del_key "key_iomput" -j ${NUM_PARALLEL} # Run NEMO - cd cfgs/SPITZ12_psyclone/EXP00/ + cd cfgs/${TEST_DIR}/EXP00/ ln -sf /archive/psyclone-tests/nemo-inputs/ECMWF-eORCA1_GO8/* . 
export OMP_NUM_THREADS=12 ./nemo @@ -240,6 +258,8 @@ jobs: module load perl/${PERL_VERSION} make clean export NEMOV4=1 # Enables specific NEMOV4 exclusions in the PSyclone transformation script + export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" + export REPRODUCIBLE=1 export ASYNC_PARALLEL=1 make -j ${NUM_PARALLEL} openmp_gpu make -j ${NUM_PARALLEL} compile-openmp_gpu diff --git a/.github/workflows/nemo_v5_tests.yml b/.github/workflows/nemo_v5_tests.yml index 77fc44b06e..5077a3fbaa 100644 --- a/.github/workflows/nemo_v5_tests.yml +++ b/.github/workflows/nemo_v5_tests.yml @@ -107,7 +107,7 @@ jobs: # Set up FCM: PATHs are loaded from SPACK, we only need to set the FCFLAGS cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm - export FCFLAGS="-fdefault-real-8 -O2 -fcray-pointer -ffree-line-length-none -g" + export FCFLAGS="-fdefault-real-8 -O2 -mno-fma -fno-tree-vectorize -fcray-pointer -ffree-line-length-none -g" # Clean up and compile rm -rf tests/${TEST_DIR} @@ -202,8 +202,9 @@ jobs: cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm export PSYCLONE_COMPILER=$MPIF90 export MPIF90=psyclonefc - export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/omp_cpu_trans.py" - export FCFLAGS="-fdefault-real-8 -O2 -fcray-pointer -ffree-line-length-none -g -fopenmp" + export PARALLEL_DIRECTIVES="omp_threading" + export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py" + export FCFLAGS="-fdefault-real-8 -O2 -mno-fma -fno-tree-vectorize -fcray-pointer -ffree-line-length-none -g -fopenmp" # Clean up and compile rm -rf tests/${TEST_DIR} @@ -245,7 +246,8 @@ jobs: export REPRODUCIBLE=1 export PSYCLONE_COMPILER=$MPIF90 export MPIF90=psyclonefc - export PSYCLONE_OPTS="--enable-cache -l output -s ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py" + export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" + export PSYCLONE_OPTS="--enable-cache -l output -s 
${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py" # Clean up and compile rm -rf tests/${TEST_DIR} ./makenemo -r BENCH -m linux_spack_profile -n ${TEST_DIR} -j ${NUM_PARALLEL} -v 1 @@ -296,12 +298,13 @@ jobs: cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" + export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" export REPRODUCIBLE=1 # Clean up and compile rm -rf cfgs/${TEST_DIR} - ./makenemo -r GOSI10p0.0_like_eORCA1 -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ - -j ${NUM_PARALLEL} -v 1 + ./makenemo -r GOSI10p0.0_like_eORCA1 -m linux_spack -n ${TEST_DIR} \ + -p ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py -j ${NUM_PARALLEL} -v 1 # Run test cd $NEMO_DIR/cfgs/${TEST_DIR}/EXP00 @@ -309,7 +312,7 @@ jobs: # Make sure mpi has been built with cuda support ompi_info --parsable --all | grep mpi_built_with_cuda_support:value # Run with round robin allocations of GPUs to MPI ranks - mpirun -n 2 sh -c 'CUDA_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK ./nemo' + OMP_NUM_THREADS=4 mpirun -n 2 sh -c 'CUDA_VISIBLE_DEVICES=$OMPI_COMM_WORLD_LOCAL_RANK ./nemo' diff $PSYCLONE_NEMO_DIR/KGOs/run.stat.orca1.nvhpc.10steps run.stat export VAR_TIME=$(grep "local MPI proces" timing.output | head -n 1 | awk '{print $5}' | tr -d s) echo "time=${VAR_TIME}" >> "${GITHUB_OUTPUT}" @@ -331,11 +334,13 @@ jobs: cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" + export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" export REPRODUCIBLE=1 # Clean up and compile rm -rf cfgs/${TEST_DIR} - ./makenemo -r ORCA2_ICE_PISCES -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ + ./makenemo -r ORCA2_ICE_PISCES -m linux_spack -n ${TEST_DIR} \ + -p ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py \ add_key 
"key_mpi_off key_nosignedzero" -j ${NUM_PARALLEL} -v 1 # Run test @@ -374,12 +379,13 @@ jobs: export ENABLE_PROFILING=1 # We compile with "-O2 -Mnofma -Mnovect -gpu=math_uniform" to permit comparison of the results. export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" + export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" export REPRODUCIBLE=1 export ASYNC_PARALLEL=1 # Clean up and compile rm -rf tests/${TEST_DIR} - ./makenemo -r BENCH -m linux_spack_profile -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ - -j ${NUM_PARALLEL} -v 1 + ./makenemo -r BENCH -m linux_spack_profile -n ${TEST_DIR} \ + -p ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py -j ${NUM_PARALLEL} -v 1 # Run reproducible test cd $NEMO_DIR/tests/${TEST_DIR}/EXP00 @@ -398,8 +404,8 @@ jobs: rm -rf tests/${TEST_DIR} export NV_ACC_POOL_THRESHOLD=75 export CUDA_VISIBLE_DEVICES=1 - ./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ - -j ${NUM_PARALLEL} -v 1 + ./makenemo -r BENCH -m linux_spack -n ${TEST_DIR} \ + -p ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py -j ${NUM_PARALLEL} -v 1 # Run non-reproducible test cd $NEMO_DIR/tests/${TEST_DIR}/EXP00 cp $PSYCLONE_NEMO_DIR/KGOs/namelist_cfg_bench_small namelist_cfg @@ -423,13 +429,15 @@ jobs: # We compile at "-O2 -Mnofma -Mnovect -gpu=math_uniform" to permit comparison of the results. 
cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm + export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" export REPRODUCIBLE=1 export ASYNC_PARALLEL=1 # Clean up and compile rm -rf cfgs/${TEST_DIR} - ./makenemo -r ORCA2_ICE_PISCES -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ + ./makenemo -r ORCA2_ICE_PISCES -m linux_spack -n ${TEST_DIR} \ + -p ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py \ add_key "key_mpi_off key_nosignedzero" -j ${NUM_PARALLEL} -v 1 # Run test @@ -459,14 +467,15 @@ jobs: # We compile at "-O2 -Mnofma -Mnovect -gpu=math_uniform" to permit comparison of the results. cd $NEMO_DIR cp $PSYCLONE_NEMO_DIR/KGOs/arch-linux_spack.fcm arch/arch-linux_spack.fcm + export PARALLEL_DIRECTIVES="omp_offloading" export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" export REPRODUCIBLE=1 export ASYNC_PARALLEL=1 # Clean up and compile rm -rf cfgs/${TEST_DIR} - ./makenemo -r GOSI10p0.0_like_eORCA1 -m linux_spack -n ${TEST_DIR} -p ${PSYCLONE_NEMO_DIR}/omp_gpu_trans.py \ - -j ${NUM_PARALLEL} -v 1 + ./makenemo -r GOSI10p0.0_like_eORCA1 -m linux_spack -n ${TEST_DIR} \ + -p ${PSYCLONE_NEMO_DIR}/insert_loop_parallelism.py -j ${NUM_PARALLEL} -v 1 # Run test cd $NEMO_DIR/cfgs/${TEST_DIR}/EXP00 diff --git a/examples/nemo/README.md b/examples/nemo/README.md index 0ed3998b1a..8faad4da7b 100644 --- a/examples/nemo/README.md +++ b/examples/nemo/README.md @@ -34,6 +34,7 @@ POSSIBILITY OF SUCH DAMAGE. Author A. R. Porter, STFC Daresbury Lab Modified by R. W. Ford, STFC Daresbury Lab Modified by J. Henrichs, Bureau of Meteorology +Modified by S. 
Siso, STFC Daresbury Lab --> @@ -54,63 +55,8 @@ Contains: ## Scripts -Contains the scripts used to process the NEMO code base and to add profiling -instrumentation (https://psyclone.readthedocs.io/en/latest/user_guide/profiling.html) -and OpenACC or OpenMP directives: - -1. `process_nemo.py` is a driver script that allows the user to specify - which files to process with PSyclone, the transformation script to use - and where to put the outputs: - - $ ./process_nemo.py -h - usage: process_nemo.py [-h] [-o OUT_DIR] [-s SCRIPT_FILE] [-x] - input_file [input_file ...] - - Process the specified NEMO source files using PSyclone - - positional arguments: - input_file One or more NEMO pre-processed source files - - optional arguments: - -h, --help show this help message and exit - -o OUT_DIR Destination directory for processed source files - -s SCRIPT_FILE PSyclone transformation script - -x exit immediately if PSyclone fails - -p add profiling instrumentation to the PROFILE_ONLY file - list. Note that files processed by the SCRIPT_FILE may - be introducing profiling instrumentation as part of - that script. - - In addition to the command-line flags, the script itself contains two - variables that may be used to control its behaviour: - - - `EXCLUDED_FILES`: list of filenames that PSyclone will not attempt to process. - - `PROFILE_ONLY`: list of filenames to add profiling instrumentation but - do not attempt to further process by PSyclone. - - Finally, the precise invocation to use when running PSyclone may be - specified by setting the `PSYCLONE` environment variable. If this is not set - then `psyclone` must be in the user's PATH. - -2. PSyclone transformation scripts: - - `kernels_trans.py` adds OpenACC kernel directives and places fine-grained - profiling instrumentation around any regions that haven't had OpenACC - added. - - `omp_cpu_trans.py` adds OpenMP directives for CPU threading parallelism. 
- - `omp_gpu_trans.py` adds OpenMP offloading directives for GPU acceleration. -  -These scripts are a *work in progress* and are being developed to work on the -MO_GO8 configuration of NEMO supplied by the Met Office. This configuration is -based on version 4.0.2 of NEMO and is compiled using: - -    ./makenemo -n MO_GO8_GPU -r SPITZ12 -m linux_nvfortran_gpu \ -        del_key "key_iomput key_mpp_mpi" add_key "key_nosignedzero" - -(where you will need an `arch/arch-linux_nvfortran_gpu.fcm` FCM configuration -file specifying how to use the NVIDIA compiler). - -If you are applying PSyclone to any other version or configuration of NEMO then -these scripts should serve as a useful starting point. +Contains a collection of example scripts and the instructions to process the NEMO code. These +are tested in our integration tests against NEMOv4.0.2 and NEMOv5.0.   ## Example 1  diff --git a/examples/nemo/eg1/Makefile b/examples/nemo/eg1/Makefile index d113cdfda2..d3473525c5 100644 --- a/examples/nemo/eg1/Makefile +++ b/examples/nemo/eg1/Makefile @@ -40,8 +40,8 @@ include ../../common.mk  transform: 	${PSYCLONE} -s ./openmp_cpu_levels_trans.py ../code/tra_adv.F90 	${PSYCLONE} -s ./openmp_gpu_levels_trans.py ../code/tra_adv.F90 -	${PSYCLONE} -s ../scripts/omp_cpu_trans.py ../code/tra_adv.F90 -	${PSYCLONE} -s ../scripts/omp_gpu_trans.py ../code/tra_adv.F90 +	PARALLEL_DIRECTIVES="omp_threading" ${PSYCLONE} -s ../scripts/insert_loop_parallelism.py ../code/tra_adv.F90 +	PARALLEL_DIRECTIVES="omp_offloading" ${PSYCLONE} -s ../scripts/insert_loop_parallelism.py ../code/tra_adv.F90  compile: transform 	@echo "No compilation supported for nemo/eg1"  diff --git a/examples/nemo/eg2/Makefile b/examples/nemo/eg2/Makefile index 65517ce33b..cb2b2b3c61 100644 --- a/examples/nemo/eg2/Makefile +++ b/examples/nemo/eg2/Makefile @@ -43,8 +43,8 @@ transform: omp_levels  omp_levels: 	${PSYCLONE} -s ./omp_levels_trans.py ../code/traldf_iso.F90 -	${PSYCLONE} -s ../scripts/omp_cpu_trans.py ../code/traldf_iso.F90 - 
${PSYCLONE} -s ../scripts/omp_gpu_trans.py ../code/traldf_iso.F90 +	PARALLEL_DIRECTIVES="omp_threading" ${PSYCLONE} -s ../scripts/insert_loop_parallelism.py ../code/traldf_iso.F90 +	PARALLEL_DIRECTIVES="omp_offloading" ${PSYCLONE} -s ../scripts/insert_loop_parallelism.py ../code/traldf_iso.F90  compile: transform diff --git a/examples/nemo/scripts/KGOs/arch-linux_spack.fcm b/examples/nemo/scripts/KGOs/arch-linux_spack.fcm index 94943258f3..b2b23e196e 100644 --- a/examples/nemo/scripts/KGOs/arch-linux_spack.fcm +++ b/examples/nemo/scripts/KGOs/arch-linux_spack.fcm @@ -9,7 +9,7 @@ %NCDF_INC -I${NCDF_F_HOME}/include -I${NCDF_C_HOME}/include -I${HDF5_HOME}/include %NCDF_LIB -L${NCDF_F_HOME}/lib -lnetcdff -L${NCDF_C_HOME}/lib -lnetcdf -%CPP cpp -Dkey_nosignedzero +%CPP cpp -Dkey_nosignedzero -I${MPI_HOME}/include %FC ${MPIF90} -c %FCFLAGS ${FCFLAGS} %FFLAGS %FCFLAGS diff --git a/examples/nemo/scripts/KGOs/arch-linux_spack_profile.fcm b/examples/nemo/scripts/KGOs/arch-linux_spack_profile.fcm index 2e6c8df745..151861e342 100644 --- a/examples/nemo/scripts/KGOs/arch-linux_spack_profile.fcm +++ b/examples/nemo/scripts/KGOs/arch-linux_spack_profile.fcm @@ -12,9 +12,7 @@ %PROFILE_INC -I${PROFILING_DIR} %PROFILE_LIB -L${PROFILING_DIR} -lnvtx_prof -L${CUDA_HOME}/lib64 -cuda -lnvToolsExt - - -%CPP cpp -Dkey_nosignedzero +%CPP cpp -Dkey_nosignedzero -I${MPI_HOME}/include %FC ${MPIF90} -c %FCFLAGS ${FCFLAGS} %FFLAGS %FCFLAGS diff --git a/examples/nemo/scripts/KGOs/run.stat.bench.gfortran.small.10steps b/examples/nemo/scripts/KGOs/run.stat.bench.gfortran.small.10steps index 0f9fc042a7..b0f8f90060 100644 --- a/examples/nemo/scripts/KGOs/run.stat.bench.gfortran.small.10steps +++ b/examples/nemo/scripts/KGOs/run.stat.bench.gfortran.small.10steps @@ -1,10 +1,10 @@ - it : 1 |ssh|_max: 0.2336851764570053D+01 |U|_max: 0.7053248015576865D-02 |V|_max: 0.2308346115751064D-02 S_min: 0.2996908779321225D+02 S_max: 0.3101392941293399D+02 - it : 2 |ssh|_max: 0.3739162010288019D+01 |U|_max: 
0.1029843199699749D-01 |V|_max: 0.9493800242777233D-02 S_min: 0.2996911000748410D+02 S_max: 0.3101392863147436D+02 - it : 3 |ssh|_max: 0.4224443521973881D+01 |U|_max: 0.1349433227267360D-01 |V|_max: 0.2284885234302297D-01 S_min: 0.2996913553478158D+02 S_max: 0.3101392784904396D+02 - it : 4 |ssh|_max: 0.4659313564999673D+01 |U|_max: 0.1490637483763983D-01 |V|_max: 0.4048444554221592D-01 S_min: 0.2996916129319160D+02 S_max: 0.3101392717586670D+02 - it : 5 |ssh|_max: 0.4949503007019537D+01 |U|_max: 0.1145357177492709D-01 |V|_max: 0.5364770396337241D-01 S_min: 0.2996918706295251D+02 S_max: 0.3101392651315916D+02 - it : 6 |ssh|_max: 0.5140472974504101D+01 |U|_max: 0.1064859943349832D-01 |V|_max: 0.6818865538920099D-01 S_min: 0.2996921262561763D+02 S_max: 0.3101392594056642D+02 - it : 7 |ssh|_max: 0.5229361171698524D+01 |U|_max: 0.7814316351518531D-02 |V|_max: 0.8358086738712590D-01 S_min: 0.2996923864577586D+02 S_max: 0.3101392538498657D+02 - it : 8 |ssh|_max: 0.5220719217849657D+01 |U|_max: 0.1141515836387377D-01 |V|_max: 0.9761604183740114D-01 S_min: 0.2996926417117689D+02 S_max: 0.3101392490495336D+02 - it : 9 |ssh|_max: 0.5145297949564463D+01 |U|_max: 0.1416399592481803D-01 |V|_max: 0.1152759253498275D+00 S_min: 0.2996929035879930D+02 S_max: 0.3101392444612747D+02 - it : 10 |ssh|_max: 0.4979557010366619D+01 |U|_max: 0.1986785874281591D-01 |V|_max: 0.1303543987480588D+00 S_min: 0.2996931641421843D+02 S_max: 0.3101392405137850D+02 + it : 1 |ssh|_max: 0.2336851764570087D+01 |U|_max: 0.7053248015579857D-02 |V|_max: 0.2308346115756259D-02 S_min: 0.2996908779321225D+02 S_max: 0.3101392941293399D+02 + it : 2 |ssh|_max: 0.3739162010287973D+01 |U|_max: 0.1029843199698906D-01 |V|_max: 0.9493800242775713D-02 S_min: 0.2996911000748410D+02 S_max: 0.3101392863147436D+02 + it : 3 |ssh|_max: 0.4224443521974239D+01 |U|_max: 0.1349433227265986D-01 |V|_max: 0.2284885234301404D-01 S_min: 0.2996913553478157D+02 S_max: 0.3101392784904396D+02 + it : 4 |ssh|_max: 0.4659313564999622D+01 
|U|_max: 0.1490637483762341D-01 |V|_max: 0.4048444554220138D-01 S_min: 0.2996916129319160D+02 S_max: 0.3101392717586671D+02 + it : 5 |ssh|_max: 0.4949503007019767D+01 |U|_max: 0.1145357177490677D-01 |V|_max: 0.5364770396337813D-01 S_min: 0.2996918706295251D+02 S_max: 0.3101392651315916D+02 + it : 6 |ssh|_max: 0.5140472974504293D+01 |U|_max: 0.1064859943349158D-01 |V|_max: 0.6818865538921454D-01 S_min: 0.2996921262561763D+02 S_max: 0.3101392594056643D+02 + it : 7 |ssh|_max: 0.5229361171698655D+01 |U|_max: 0.7814316351505392D-02 |V|_max: 0.8358086738711774D-01 S_min: 0.2996923864577587D+02 S_max: 0.3101392538498657D+02 + it : 8 |ssh|_max: 0.5220719217849857D+01 |U|_max: 0.1141515836389672D-01 |V|_max: 0.9761604183737865D-01 S_min: 0.2996926417117689D+02 S_max: 0.3101392490495337D+02 + it : 9 |ssh|_max: 0.5145297949564862D+01 |U|_max: 0.1416399592482473D-01 |V|_max: 0.1152759253497909D+00 S_min: 0.2996929035879930D+02 S_max: 0.3101392444612748D+02 + it : 10 |ssh|_max: 0.4979557010366737D+01 |U|_max: 0.1986785874282448D-01 |V|_max: 0.1303543987480547D+00 S_min: 0.2996931641421842D+02 S_max: 0.3101392405137852D+02 diff --git a/examples/nemo/scripts/Makefile b/examples/nemo/scripts/Makefile index c1828e0e8a..9ebeb09634 100644 --- a/examples/nemo/scripts/Makefile +++ b/examples/nemo/scripts/Makefile @@ -61,16 +61,16 @@ psycloned-passthrough/%.f90: ${ROOT_SRC}%.f90 psycloned-passthrough psyclone -s passthrough.py -l output -I ${ROOT_SRC} -I ${MPI_INC_DIR} -o $@ $< psycloned-openmp_cpu/%.f90: ${ROOT_SRC}%.f90 psycloned-openmp_cpu - psyclone -s omp_cpu_trans.py -l output -I ${ROOT_SRC} -I ${MPI_INC_DIR} -o $@ $< + psyclone -s insert_loop_parallelism.py -l output -I ${ROOT_SRC} -I ${MPI_INC_DIR} -o $@ $< psycloned-openmp_gpu/%.f90: ${ROOT_SRC}%.f90 psycloned-openmp_gpu - psyclone -s omp_gpu_trans.py -l output -I ${ROOT_SRC} -I ${MPI_INC_DIR} -o $@ $< + psyclone -s insert_loop_parallelism.py -l output -I ${ROOT_SRC} -I ${MPI_INC_DIR} -o $@ $< psycloned-openacc_kernels/%.f90: 
${ROOT_SRC}%.f90 psycloned-openacc_kernels psyclone -s acc_kernels_trans.py -l output -I ${ROOT_SRC} -I ${MPI_INC_DIR} -o $@ $< psycloned-openacc_loops/%.f90: ${ROOT_SRC}%.f90 psycloned-openacc_loops - psyclone -s acc_loops_trans.py -l output -I ${ROOT_SRC} -o $@ $< + psyclone -s insert_loop_parallelism.py -l output -I ${ROOT_SRC} -o $@ $< # Get the number of Makefile parallel jobs to pass it to the makenemo MAKE_PID := $(shell echo $$PPID) diff --git a/examples/nemo/scripts/README.md b/examples/nemo/scripts/README.md new file mode 100644 index 0000000000..6eb7945a5f --- /dev/null +++ b/examples/nemo/scripts/README.md @@ -0,0 +1,223 @@ + + +# PSyclone NEMO Examples + +This directory contains various examples showing how to apply PSyclone to +transform the source code of the NEMO ocean model. + +> [!Important] +> The NEMO build system, `makenemo`, has the ability to apply PSyclone +> scripts that come with the NEMO repository with the `-p` flag (see +> [the NEMO user guide](https://sites.nemo-ocean.io/user-guide/psyclone.html)), +> but these are pinned to a particular release of PSyclone and have constraints +> defined in `mk/sct_psyclone.sh` script. By contrast, the process presented in +> this README uses the experimental `psyclonefc` compiler wrapper command which +> bypasses the `makenemo -p` and instead intercepts any compilation command and +> wraps it with a PSyclone call followed by a compiler call. +> This is the recommended way to apply upstream PSyclone transformations, as it +> is not constrained by the file-exclusions and backward compatibility guarantees +> of the scripts inside the NEMO repository. 
+ +## Downloading the NEMO source and data files + +To test the examples you can download NEMO and its input data as follows: +```bash +git clone https://forge.nemo-ocean.eu/nemo/nemo.git --branch 5.0 --single-branch +wget https://gws-access.jasmin.ac.uk/public/nemo/sette_inputs/r5.0.0/ORCA2_ICE_v5.0.0.tar.gz +tar -xzf ORCA2_ICE_v5.0.0.tar.gz +``` + +The examples have been tested with NEMOv4.0.2 (SPITZ12 configuration) and +NEMOv5.0 (BENCH and ORCA2_ICE_PISCES configurations), but we aim to support +any version of NEMO. If you encounter any issues applying these examples +please report them to the authors. + + +## Set up environment variables + +In order to provide a flexible system that works with different directives and +compilers we provide a parameterised transformation script +`insert_loop_parallelism.py` and a parameterised NEMO arch file +`KGOs/arch-linux_spack.fcm`, both with multiple environment variables that need +to be adjusted depending on your desired optimisation target. + +First of all, the arch file has an `MPIF90` variable to choose the compiler; this +needs to be set to `psyclonefc`. This is a compiler wrapper utility that +substitutes its calls with: an invocation to PSyclone to process the given +source file (using the options provided in `PSYCLONE_OPTS`) followed by an +invocation to a compiler (provided by `PSYCLONE_COMPILER`). + +For example, to apply the `insert_loop_parallelism.py` and compile it with +`mpif90` we can use the following set up: + +```bash +export MPIF90=psyclonefc +export PSYCLONE_COMPILER=mpif90 +export PSYCLONE_OPTS="-l output -s ${PSYCLONE_NEMO_EXAMPLES_DIR}/insert_loop_parallelism.py" +``` + +This transformation script is in turn parameterised with a `PARALLEL_DIRECTIVES` +variable that has to be consistently set up with the chosen `FCFLAGS` flags. 
+ +For instance, for the `nvfortran` compiler, you can choose between: +- Serial transformations with no parallel directives +```bash +export PARALLEL_DIRECTIVES="" +export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g" +``` + +- Inserting OpenMP CPU threading parallelism +```bash +export PARALLEL_DIRECTIVES="omp_threading" +export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp" +``` + +- Inserting OpenMP GPU offloading with reproducible build flags +```bash +export PARALLEL_DIRECTIVES="omp_offloading" +export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -mp=gpu -gpu=mem:managed,math_uniform" +export REPRODUCIBLE=1 +``` + +- Inserting OpenACC GPU offloading with reproducible build flags (-mp=gpu is needed for reproducibility) +```bash +export PARALLEL_DIRECTIVES="acc_offloading" +export FCFLAGS="-i4 -Mr8 -O2 -Mnofma -Mnovect -g -acc=gpu -mp=gpu -gpu=mem:managed,math_uniform" +export REPRODUCIBLE=1 +``` + +- Hybrid directives (what cannot be offloaded falls back to threading) and fast GPU flags +```bash +unset REPRODUCIBLE +export PARALLEL_DIRECTIVES="omp_offloading+omp_threading" +export FCFLAGS="-i4 -Mr8 -O3 -mp=gpu -gpu=mem:managed" +``` + +> [!Note] +> Currently, NEMOv4 and NEMOv5 take different optimisation paths, so it is +> important to also set: +> +> ```bash +> export NEMOV4=1 +> ``` +> when applying the transformations to NEMOv4. + +TODO: Mention `ASYNC_PARALLEL`, `ENABLE_INLINING`, `PROFILING` + +## Compiling and running the application + +Once the environment variables are set, use the `makenemo` command with +the desired NEMO configuration and keys. For example: + +```bash +./makenemo -r ORCA2_ICE_PISCES -m linux_spack -n ORCA2_psycloned ... +``` + +If everything worked you will see PSyclone generated files in the +`<config>/BLD/tmp` directory and the final binary in the +`<config>/EXP00` directory. + +You can run this binary using the appropriate command from the configuration +and inserted programming model. 
For example, for a hybrid +MPI+OMP offloading+OMP threading you can do: + +```bash +# Prepare problem +ln -sf ${ORCA2_INPUTS}/ORCA2_ICE_v5.0.0/* cfgs/ORCA2_psycloned/EXP00/. +cd cfgs/ORCA2_psycloned/EXP00 +# Reduce num of iterations and add timing/runstat +sed -i "s/nn_itend.*/nn_itend = 10/" namelist_cfg +sed -i "s/ln_icebergs.*/ln_icebergs = .false./" namelist_cfg +sed -i "s/\&namctl.*/\&namctl\n ln_timing = .true. \n sn_cfctl%l_runstat = .true.\n/" namelist_cfg +# Run problem +OMP_NUM_THREADS=4 CUDA_VISIBLE_DEVICES=1,2 mpirun -n 2 ./nemo +``` + +## Identifying the cause of issues + +A difficulty of working with code-transformation scripts is that it is possible +to incorrectly transform a file's semantics while still creating valid Fortran. + +This means that the transformation will succeed and the generated code will +compile, but the results will diverge. This gets more complicated with parallel +programming because certain operations like reductions or atomics are not +always reproducible. Therefore, to understand what causes the results to diverge +it is useful to apply the transformations step-by-step while checking if the +`run.stat` values change. Some useful steps are: + +- Start by building NEMO *without* `psyclonefc` and with conservative optimisation flags + and run it serially (O2, no vectorisation, no-fma). Then store the generated `run.stat`. +- Then switch to using `psyclonefc` with the `PSYCLONE_OPTS="-s passthrough.py"`, + this will make PSyclone process all files but without applying any + transformations. Check if the results still match. +- Then build it with `PSYCLONE_OPTS="-s insert_loop_parallelism.py"` but keeping + the `PARALLEL_DIRECTIVES=""` empty. This will apply serial transformations but + no directives yet. +- Then run it with `REPRODUCIBLE=1 PARALLEL_DIRECTIVES="omp_threading" PSYCLONE_OPTS="-s insert_loop_parallelism.py"` + and see if the results still match. 
+- Finally, run it with `REPRODUCIBLE=1 PARALLEL_DIRECTIVES="omp_offloading" PSYCLONE_OPTS="-s insert_loop_parallelism.py"` + +Alongside finding which step is causing the divergence we may want to find +which file(s) are causing it. This folder also contains a `do_file_by_file.sh` +script that builds NEMO many times, each with only one file being transformed, +and compares the results with the stored `run.stat`. + + +## Tuning the generated implementation + +Since this is now a two-step process, there are two locations where you can modify +files that will alter the output result. The first is manually modifying the original +source code. For this we recommend using the built-in `makenemo` functionality +that allows you to point to a directory with patched source files: + +```bash +./makenemo -e <src_dir> ... +``` + +In addition to the source, you can also modify the recipe that PSyclone uses to +transform the code. In this example you can do so by changing any detail of the +`insert_loop_parallelism.py` transformation script, but the `FILES_TO_SKIP` +global variable is particularly relevant as it allows PSyclone to skip processing +the listed files. If modifying a particular file is known to cause problems or +performance regressions, include it in this list. + +You can also do both. For example if you want to provide a modified file that +already includes directives, you need to reference it with the `-e <src_dir>` flag +and add it to `FILES_TO_SKIP` (otherwise PSyclone would ignore the given directives +and try to insert its own). This is currently the optimal approach for `seaice` +and `lbclnk.f90` GPU offloading. 
diff --git a/examples/nemo/scripts/acc_loops_trans.py b/examples/nemo/scripts/acc_loops_trans.py deleted file mode 100755 index c7271fc48c..0000000000 --- a/examples/nemo/scripts/acc_loops_trans.py +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/env python -# ----------------------------------------------------------------------------- -# BSD 3-Clause License -# -# Copyright (c) 2023-2026, Science and Technology Facilities Council. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. 
-# ----------------------------------------------------------------------------- -# Authors: S. Siso, STFC Daresbury Lab - -''' PSyclone transformation script showing the introduction of OpenACC loop -directives into Nemo code. ''' - -from utils import ( - insert_explicit_loop_parallelism, normalise_loops, add_profiling, - NOT_PERFORMANT, NEMO_MODULES_TO_IMPORT) -from psyclone.psyir.nodes import Routine -from psyclone.transformations import ( - ACCParallelTrans, ACCLoopTrans, ACCRoutineTrans) - -# Enable the insertion of profiling hooks during the transformation script -PROFILING_ENABLED = True - -# Whether to chase the imported modules to improve symbol information (it can -# also be a list of module filenames to limit the chasing to only specific -# modules). This has to be used in combination with '-I' command flag in order -# to point to the module location directory. We also strongly recommend using -# the '--enable-cache' flag to reduce the performance overhead. -RESOLVE_IMPORTS = NEMO_MODULES_TO_IMPORT - -# List of all files that psyclone will skip processing -FILES_TO_SKIP = NOT_PERFORMANT - - -def trans(psyir): - ''' Add OpenACC Parallel and Loop directives to all loops, including the - implicit ones, to parallelise the code and execute it in an acceleration - device. - - :param psyir: the PSyIR of the provided file. 
- :type psyir: :py:class:`psyclone.psyir.nodes.FileContainer` - - ''' - acc_region_trans = ACCParallelTrans(default_present=False) - acc_loop_trans = ACCLoopTrans() - - # TODO #2317: Has structure accesses that can not be offloaded and has - # a problematic range to loop expansion of (1:1) - if psyir.name.startswith("obs_"): - print("Skipping", psyir.name) - return - - for subroutine in psyir.walk(Routine): - print(f"Transforming subroutine: {subroutine.name}") - - if PROFILING_ENABLED: - add_profiling(subroutine.children) - - # S-0074-Illegal number or type of arguments to ubound [and lbound] - # - keyword argument array; and NVFORTRAN-S-0082-Illegal substring - # expression for variable filtide - if subroutine.name in ("bdytide_init", "sbc_cpl_init"): - print("Skipping", subroutine.name) - continue - - # OpenACC fails in the following routines with the Compiler error: - # Could not find allocated-variable index for symbol - xxx - # This all happen on characters arrays, e.g. cd_nat - if subroutine.name in ("lbc_nfd_2d_ptr", "lbc_nfd_3d_ptr", - "lbc_nfd_4d_ptr", "bdy_dyn", "dia_obs_init"): - print("Skipping", subroutine.name) - continue - - normalise_loops( - subroutine, - hoist_local_arrays=True, - convert_array_notation=True, - convert_range_loops=True, - hoist_expressions=True - ) - - # These are functions that are called from inside parallel regions, - # annotate them with 'acc routine' - if subroutine.name.lower().startswith("sign_"): - ACCRoutineTrans().apply(subroutine) - print(f"Marked {subroutine.name} as GPU-enabled") - continue - - insert_explicit_loop_parallelism( - subroutine, - region_directive_trans=acc_region_trans, - loop_directive_trans=acc_loop_trans, - # Collapse is necessary to give GPUs enough parallel items - collapse=True, - ) diff --git a/examples/nemo/scripts/insert_loop_parallelism.py b/examples/nemo/scripts/insert_loop_parallelism.py new file mode 100755 index 0000000000..035e6e0714 --- /dev/null +++ 
b/examples/nemo/scripts/insert_loop_parallelism.py @@ -0,0 +1,345 @@ +#!/usr/bin/env python +# ----------------------------------------------------------------------------- +# BSD 3-Clause License +# +# Copyright (c) 2021-2026, Science and Technology Facilities Council. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# ----------------------------------------------------------------------------- +# Authors: S. 
Siso, STFC Daresbury Lab + +''' PSyclone transformation script showing the introduction of OpenMP for GPU +directives into Nemo code. ''' + +import os +import sys +from utils import ( + add_profiling, inline_calls, insert_explicit_loop_parallelism, + normalise_loops, NEMO_MODULES_TO_IMPORT) +from psyclone.psyir.nodes import Routine, Loop +from psyclone.psyir.transformations import ( + OMPTargetTrans, OMPDeclareTargetTrans) +from psyclone.transformations import ( + OMPLoopTrans, TransformationError) +from psyclone.transformations import ( + ACCParallelTrans, ACCLoopTrans, ACCRoutineTrans) + + +# This environment variable informs if this is targeting NEMOv4 +NEMOV4 = os.environ.get('NEMOV4', False) + +# This environment variable informs which parallelisation directives to use +# It supports acc_offloading, omp_offloading and omp_threading +# They can be combined, e.g PARALLEL_DIRECTIVES='omp_offloading+omp_threading', +# or use none to just apply the serial transformations +PARALLEL_DIRECTIVES = os.environ.get('PARALLEL_DIRECTIVES', '') + +# By default, allow optimisations that may change the results, e.g. reductions, +# offloading intrinsics without math_uniform, ... +REPRODUCIBLE = os.environ.get('REPRODUCIBLE', False) + +# This environment variable informs if profiling hooks have to be inserted. +PROFILING_ENABLED = os.environ.get('ENABLE_PROFILING', False) + +# By default, we don't do module inlining as it's still under development. +INLINING_ENABLED = os.environ.get('ENABLE_INLINING', False) + +# This environment variable informs if we're enabling asynchronous +# parallelism. +ASYNC_PARALLEL = os.environ.get('ASYNC_PARALLEL', False) + +# Whether to chase the imported modules to improve symbol information (it can +# also be a list of module filenames to limit the chasing to only specific +# modules). This has to be used in combination with '-I' command flag in order +# to point to the module location directory. 
We also strongly recommend using +# the '--enable-cache' flag to reduce the performance overhead. +RESOLVE_IMPORTS = NEMO_MODULES_TO_IMPORT + +# List of all files that psyclone will skip processing +FILES_TO_SKIP = [] + +# There files are skipped because transforming them degrade the performance +SKIP_FOR_PERFORMANCE = [ + "iom.f90", + "iom_nf90.f90", + "iom_def.f90", + "timing.f90", + "histcom.f90", +] + +# These files change the results from the baseline when psyclone adds +# parallelisation directives +PARALLELISATION_ISSUES = [] + +# These files change the results from the baseline when psyclone adds +# offloading directives +OFFLOADING_ISSUES = [] + +if not NEMOV4: + FILES_TO_SKIP.extend([ + # Fail in nvfortran when enabling seaice + "icefrm.f90", # Has unsupported implicit symbol declaration + ]) + + SKIP_FOR_PERFORMANCE.extend([ + "lbclnk.f90", + ]) + + PARALLELISATION_ISSUES.extend([ + "ldfc1d_c2d.f90", + "tramle.f90", + "traqsr.f90", + ]) + + OFFLOADING_ISSUES.extend([ + # Produces different output results + "zdftke.f90", + # The following issues only affect BENCH (because ice is enabled?) + # Runtime Error: Illegal address during kernel execution + "trcrad.f90", + # nvhpc > 24.11 - Signal 11 issues + "icerst.f90", # When enabling ice* parallelisation + "trcbbl.f90", + "trabbc.f90", + "bdyice.f90", + "sedfunc.f90", + "stpmlf.f90", + "trddyn.f90", + "trczdf.f90", + "trcice_pisces.f90", + "dtatsd.f90", + "trcatf.f90", + "stp2d.f90", + ]) + + # if "acc_offloading" in PARALLEL_DIRECTIVES: + # OFFLOADING_ISSUES.extend([ + # # Fail in OpenACC ORCA2_ICE_PISCES + # "dynzdf.f90", + # "trabbl.f90", + # "trazdf.f90", + # "zdfsh2.f90", + # ]) + +ASYNC_ISSUES = [ + # Runtime Error: (CUDA_ERROR_LAUNCH_FAILED): Launch failed + # (often invalid pointer dereference) in get_cstrgsurf + "sbcclo.f90", + "trcldf.f90", + # Runtime Error: Illegal address during kernel execution with + # asynchronicity. 
+ "zdfiwm.f90", + "zdfsh2.f90", + # Diverging results with asynchronicity + "traadv_fct.f90", + "bdy_oce.f90", +] + + +def select_transformations(): + ''' + Use the PARALLEL_DIRECTIVES global to select what specific transformations + to apply to insert the desired directives. + ''' + process_directives = PARALLEL_DIRECTIVES + + if 'omp_offloading' in process_directives: + offload_region_trans = OMPTargetTrans() + mark_for_gpu_trans = OMPDeclareTargetTrans() + if NEMOV4: + # TODO #2895: Explore why loop/teams loop diverge for NEMOv4 + gpu_loop_trans = OMPLoopTrans(omp_schedule="none") + gpu_loop_trans.omp_directive = "loop" + else: + gpu_loop_trans = OMPLoopTrans(omp_schedule="none") + gpu_loop_trans.omp_directive = "teamsloop" + process_directives = process_directives.replace('omp_offloading', '') + elif 'acc_offloading' in process_directives: + offload_region_trans = ACCParallelTrans(default_present=False) + mark_for_gpu_trans = ACCRoutineTrans() + gpu_loop_trans = ACCLoopTrans() + process_directives = process_directives.replace('acc_offloading', '') + else: + offload_region_trans = None + mark_for_gpu_trans = None + gpu_loop_trans = None + + if 'omp_threading' in process_directives: + cpu_loop_trans = OMPLoopTrans(omp_schedule="static") + cpu_loop_trans.omp_directive = "paralleldo" + process_directives = process_directives.replace('omp_threading', '') + else: + cpu_loop_trans = None + + process_directives = process_directives.replace('+', '') + if process_directives != '': + sys.exit(f"Unknown PARALLEL_DIRECTIVES: {process_directives}") + + return (offload_region_trans, mark_for_gpu_trans, + gpu_loop_trans, cpu_loop_trans) + + +def filter_files_by_name(name: str) -> bool: + ''' + :returns: whether to transform a file with the given name. Contrary to + FILES_TO_SKIP, this will still run the files through psyclone. + ''' + # The two options below are useful for file-by-file exhaustive tests. 
+ # If the environment has ONLY_FILE defined, only process that one file and + # known-good files that need a "declare target" inside. + only_file = os.environ.get('ONLY_FILE', False) + if only_file: + files_to_do = [only_file] + if "offloading" in PARALLEL_DIRECTIVES: + files_to_do.extend( + ["lib_fortran.f90", "solfrac_mod.f90", "sbc_phy.f90"]) + if name not in files_to_do: + return True + # If the environment has ALL_BUT_FILE defined, process all files but + # the one named file. + all_but_file = os.environ.get('ALL_BUT_FILE', False) + if all_but_file and name == all_but_file: + return True + + # These work but are skipped to improve performance, they could be in the + # FILES_TO_SKIP global parameter, but in this script, for testing purposes, + # we exclude them here so the PSyclone frontend and backend are still + # tested and it also allows to insert profiling hooks later on. + if name in SKIP_FOR_PERFORMANCE: + return True + + # Parallelising ICE or ICB currently causes a noticeable slowdown + # On nemo_main it can be just: if name.startswith("icethd"): + if not NEMOV4 and name.startswith("ice"): + return True + if name.startswith("icb"): + return True + + # This file fails for gcc NEMOv5 BENCH + if not NEMOV4 and name == "icedyn_rhg_evp.f90": + return True + + return False + + +def trans(psyir): + ''' Normalise and add directives to all possible loops, including the + implicit ones. + + :param psyir: the PSyIR of the provided file. 
+ :type psyir: :py:class:`psyclone.psyir.nodes.FileContainer` + + ''' + if filter_files_by_name(psyir.name): + return + + (offload_region_trans, mark_for_gpu_trans, gpu_loop_trans, + cpu_loop_trans) = select_transformations() + + disable_profiling_for = [] + enable_async = ASYNC_PARALLEL and psyir.name not in ASYNC_ISSUES + privatise_arrays = not (NEMOV4 or "acc" in PARALLEL_DIRECTIVES) + + for subroutine in psyir.walk(Routine): + + # Skip initialisation and diagnostic subroutines + if (subroutine.name.endswith('_alloc') or + subroutine.name.endswith('_init') or + subroutine.name.startswith('init_') or + subroutine.name.startswith('Agrif') or + subroutine.name.startswith('dia_') or + subroutine.name == 'dom_msk' or + subroutine.name == 'dom_zgr' or + subroutine.name == 'dom_ngb'): + continue + + normalise_loops( + subroutine, + hoist_local_arrays=False, + convert_array_notation=True, + # See issue #3022 + loopify_array_intrinsics=psyir.name != "getincom.f90", + convert_range_loops=True, + increase_array_ranks=not NEMOV4, + hoist_expressions=True + ) + + # Perform module-inlining of called routines. 
+ if INLINING_ENABLED: + inline_calls(subroutine) + + # These are functions that are called from inside parallel regions, + # annotate them with 'omp declare target' + if ( + mark_for_gpu_trans and + (subroutine.name.lower().startswith("sign_") + or subroutine.name.lower() == "solfrac" + or (psyir.name == "sbc_phy.f90" and not subroutine.walk(Loop))) + ): + try: + mark_for_gpu_trans.apply(subroutine) + print(f"Marked {subroutine.name} as GPU-enabled") + except TransformationError as err: + print(err) + # We continue parallelising inside the routine, but this could + # change if the parallelisation directives added below are not + # nestable, in that case we could add a 'continue' here + disable_profiling_for.append(subroutine.name) + + elif (psyir.name not in PARALLELISATION_ISSUES + OFFLOADING_ISSUES + and gpu_loop_trans): + print( + f"Adding offload directives to subroutine: {subroutine.name}") + insert_explicit_loop_parallelism( + subroutine, + region_directive_trans=offload_region_trans, + loop_directive_trans=gpu_loop_trans, + collapse=True, + privatise_arrays=privatise_arrays, + enable_reductions=not REPRODUCIBLE, + uniform_intrinsics_only=REPRODUCIBLE, + asynchronous_parallelism=enable_async, + ) + elif psyir.name not in PARALLELISATION_ISSUES and cpu_loop_trans: + # These have issues offloading, but we can still do threading + print(f"Adding OpenMP threading to subroutine: {subroutine.name}") + insert_explicit_loop_parallelism( + subroutine, + loop_directive_trans=cpu_loop_trans, + collapse=False, + privatise_arrays=privatise_arrays, + enable_reductions=not REPRODUCIBLE, + asynchronous_parallelism=enable_async, + ) + + # Iterate again and add profiling hooks when needed + for subroutine in psyir.walk(Routine): + if PROFILING_ENABLED and subroutine.name not in disable_profiling_for: + print(f"Adding profiling hooks to subroutine: {subroutine.name}") + add_profiling(subroutine.children) diff --git a/examples/nemo/scripts/omp_cpu_trans.py 
b/examples/nemo/scripts/omp_cpu_trans.py deleted file mode 100755 index 367e178423..0000000000 --- a/examples/nemo/scripts/omp_cpu_trans.py +++ /dev/null @@ -1,132 +0,0 @@ -#!/usr/bin/env python -# ----------------------------------------------------------------------------- -# BSD 3-Clause License -# -# Copyright (c) 2021-2026, Science and Technology Facilities Council. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -# ----------------------------------------------------------------------------- -# Authors: S. 
Siso, STFC Daresbury Lab - -''' PSyclone transformation script to insert OpenMP for CPU -directives into Nemo code. Tested with ECMWF Nemo 4.0 code. ''' - -import os -from utils import ( - insert_explicit_loop_parallelism, normalise_loops, add_profiling, - PARALLELISATION_ISSUES, NEMO_MODULES_TO_IMPORT) -from psyclone.psyir.nodes import Routine -from psyclone.transformations import OMPLoopTrans - -# Enable the insertion of profiling hooks during the transformation script -PROFILING_ENABLED = False - -# Whether to chase the imported modules to improve symbol information (it can -# also be a list of module filenames to limit the chasing to only specific -# modules). This has to be used in combination with '-I' command flag in order -# to point to the module location directory. We also strongly recommend using -# the '--enable-cache' flag to reduce the performance overhead. -RESOLVE_IMPORTS = NEMO_MODULES_TO_IMPORT - -# A environment variable can inform if this is targeting NEMOv4, in which case -# array privatisation is disabled. -NEMOV4 = os.environ.get('NEMOV4', False) - -# By default, allow optimisations that may change the results, e.g. reductions -REPRODUCIBLE = os.environ.get('REPRODUCIBLE', False) - -# List of all files that psyclone will skip processing -FILES_TO_SKIP = [] -if not NEMOV4: - # TODO #3112: These produce diverging run.stat results in gcc NEMOv5 BENCH - FILES_TO_SKIP = [ - "dynhpg.f90", - "dynspg_ts.f90", - "sbcssm.f90", - "tramle.f90", - "trazdf.f90", - ] - -if PROFILING_ENABLED: - # Fails with profiling enabled. issue #2723 - FILES_TO_SKIP.append("mppini.f90") - - -def trans(psyir): - ''' Add OpenMP Parallel and Do directives to all loops, including the - implicit ones. - - :param psyir: the PSyIR of the provided file. - :type psyir: :py:class:`psyclone.psyir.nodes.FileContainer` - - ''' - # If the environemnt has ONLY_FILE defined, only process that one file and - # nothing else. This is useful for file-by-file exhaustive tests. 
- only_do_file = os.environ.get('ONLY_FILE', False) - if only_do_file and psyir.name != only_do_file: - return - - # Parallelising this file currently causes a noticeable slowdown - if psyir.name.startswith("icethd"): - return - - # This file fails for gcc NEMOv5 BENCH - if not NEMOV4 and psyir.name == "icedyn_rhg_evp.f90": - return - - omp_parallel_trans = None - omp_loop_trans = OMPLoopTrans(omp_schedule="static") - omp_loop_trans.omp_directive = "paralleldo" - - for subroutine in psyir.walk(Routine): - print(f"Adding OpenMP threading to subroutine: {subroutine.name}") - - if PROFILING_ENABLED: - add_profiling(subroutine.children) - - normalise_loops( - subroutine, - hoist_local_arrays=False, - convert_array_notation=True, - # See issue #3022 - loopify_array_intrinsics=psyir.name != "getincom.f90", - convert_range_loops=True, - hoist_expressions=False, - scalarise_loops=False - ) - - if psyir.name not in PARALLELISATION_ISSUES: - insert_explicit_loop_parallelism( - subroutine, - region_directive_trans=omp_parallel_trans, - loop_directive_trans=omp_loop_trans, - collapse=False, - privatise_arrays=not NEMOV4, - enable_reductions=not REPRODUCIBLE, - ) diff --git a/examples/nemo/scripts/omp_gpu_trans.py b/examples/nemo/scripts/omp_gpu_trans.py deleted file mode 100755 index 42047274bd..0000000000 --- a/examples/nemo/scripts/omp_gpu_trans.py +++ /dev/null @@ -1,268 +0,0 @@ -#!/usr/bin/env python -# ----------------------------------------------------------------------------- -# BSD 3-Clause License -# -# Copyright (c) 2021-2026, Science and Technology Facilities Council. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. 
-# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of the copyright holder nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -# ----------------------------------------------------------------------------- -# Authors: S. Siso, STFC Daresbury Lab - -''' PSyclone transformation script showing the introduction of OpenMP for GPU -directives into Nemo code. ''' - -import os -from utils import ( - add_profiling, inline_calls, insert_explicit_loop_parallelism, - normalise_loops, PARALLELISATION_ISSUES, NEMO_MODULES_TO_IMPORT) -from psyclone.psyir.nodes import Routine, Loop -from psyclone.psyir.transformations import ( - OMPTargetTrans, OMPDeclareTargetTrans) -from psyclone.transformations import ( - OMPLoopTrans, TransformationError) - - -# This environment variable informs if profiling hooks have to be inserted. 
-PROFILING_ENABLED = os.environ.get('ENABLE_PROFILING', False) - -# By default, we don't do module inlining as it's still under development. -INLINING_ENABLED = os.environ.get('ENABLE_INLINING', False) - -# By default, we allow all device intrinsics (not only the reproducible ones) -REPRODUCIBLE = os.environ.get('REPRODUCIBLE', False) - -# This environment variable informs if this is targeting NEMOv4, in which case -# array privatisation is disabled and some more files excluded -NEMOV4 = os.environ.get('NEMOV4', False) - -# This environment variable informs if we're enabling asynchronous -# parallelism. -ASYNC_PARALLEL = os.environ.get('ASYNC_PARALLEL', False) - -# Whether to chase the imported modules to improve symbol information (it can -# also be a list of module filenames to limit the chasing to only specific -# modules). This has to be used in combination with '-I' command flag in order -# to point to the module location directory. We also strongly recommend using -# the '--enable-cache' flag to reduce the performance overhead. -RESOLVE_IMPORTS = NEMO_MODULES_TO_IMPORT - -# List of all files that psyclone will skip processing -FILES_TO_SKIP = [ - "icefrm.f90", # Has an unsupported implicit symbol declaration -] - -NEMOV5_EXCLUSIONS = [] - -NEMOV4_EXCLUSIONS = [ - "dynspg_ts.f90", - "tranxt.f90", -] - -SKIP_FOR_PERFORMANCE = [ - "iom.f90", - "iom_nf90.f90", - "iom_def.f90", - "timing.f90", - "histcom.f90", -] - -OFFLOADING_ISSUES = [ - # Produces different output results - "zdftke.f90", - # The following issues only affect BENCH (because ice is enabled?) 
- # Runtime Error: Illegal address during kernel execution - "trcrad.f90", - # Signal 11 issues - "trcbbl.f90", - "bdyice.f90", - "sedfunc.f90", - "stpmlf.f90", - "trddyn.f90", - "trczdf.f90", - "trcice_pisces.f90", - "dtatsd.f90", - "trcatf.f90", - "stp2d.f90", - "trabbc.f90", -] - -ASYNC_ISSUES = [ - # TODO #3220: Explore the cause of the async issues - # Runtime Error: (CUDA_ERROR_LAUNCH_FAILED): Launch failed - # (often invalid pointer dereference) in get_cstrgsurf - "sbcclo.f90", - "trcldf.f90", - # Runtime Error: Illegal address during kernel execution with - # asynchronicity. - "zdfiwm.f90", - "zdfsh2.f90", - # Diverging results with asynchronicity - "traadv_fct.f90", -] - - -def trans(psyir): - ''' Add OpenMP Target and Loop directives to all loops, including the - implicit ones, to parallelise the code and execute it in an acceleration - device. - - :param psyir: the PSyIR of the provided file. - :type psyir: :py:class:`psyclone.psyir.nodes.FileContainer` - - ''' - # The two options below are useful for file-by-file exhaustive tests. - # If the environemnt has ONLY_FILE defined, only process that one file and - # known-good files that need a "declare target" inside. - only_do_file = os.environ.get('ONLY_FILE', False) - only_do_files = (only_do_file, "lib_fortran.f90", "solfrac_mod.f90") - if only_do_file and psyir.name not in only_do_files: - return - # If the environemnt has ALL_BUT_FILE defined, process all files but - # the one named file. 
- all_but_file = os.environ.get('ALL_BUT_FILE', False) - if all_but_file and psyir.name == all_but_file: - return - - omp_target_trans = OMPTargetTrans() - if NEMOV4: - # TODO #2895: Explore why loop/teams loop diverge for NEMOv4 - omp_gpu_loop_trans = OMPLoopTrans(omp_schedule="none") - omp_gpu_loop_trans.omp_directive = "loop" - else: - omp_gpu_loop_trans = OMPLoopTrans(omp_schedule="none") - omp_gpu_loop_trans.omp_directive = "teamsloop" - omp_cpu_loop_trans = OMPLoopTrans(omp_schedule="static") - omp_cpu_loop_trans.omp_directive = "paralleldo" - - disable_profiling_for = [] - enable_async = ASYNC_PARALLEL and psyir.name not in ASYNC_ISSUES - - for subroutine in psyir.walk(Routine): - - # The exclusion below could be in the FILES_TO_SKIP global parameter, - # but in this script, for testing purposes, we exclude them here so the - # PSyclone frontend and backend are still tested and it also allows to - # insert profiling hooks later on. - if psyir.name in SKIP_FOR_PERFORMANCE: - continue - if NEMOV4 and psyir.name in NEMOV4_EXCLUSIONS: - continue - if not NEMOV4 and psyir.name in NEMOV5_EXCLUSIONS: - continue - # ICE routines do not perform well on GPU, so we skip them - if psyir.name.startswith("ice"): - continue - # Skip initialisation and diagnostic subroutines - if (subroutine.name.endswith('_alloc') or - subroutine.name.endswith('_init') or - subroutine.name.startswith('Agrif') or - subroutine.name.startswith('dia_') or - subroutine.name == 'dom_msk' or - subroutine.name == 'dom_zgr' or - subroutine.name == 'dom_ngb'): - continue - - normalise_loops( - subroutine, - hoist_local_arrays=False, - convert_array_notation=True, - # See issue #3022 - loopify_array_intrinsics=psyir.name != "getincom.f90", - convert_range_loops=True, - increase_array_ranks=not NEMOV4, - hoist_expressions=True - ) - # Perform module-inlining of called routines. 
- if INLINING_ENABLED: - inline_calls(subroutine) - - # These are functions that are called from inside parallel regions, - # annotate them with 'omp declare target' - if ( - subroutine.name.lower().startswith("sign_") - or subroutine.name.lower() == "solfrac" - or (psyir.name == "sbc_phy.f90" and not subroutine.walk(Loop)) - ): - try: - OMPDeclareTargetTrans().apply(subroutine) - print(f"Marked {subroutine.name} as GPU-enabled") - except TransformationError as err: - print(err) - # We continue parallelising inside the routine, but this could - # change if the parallelisation directives added below are not - # nestable, in that case we could add a 'continue' here - disable_profiling_for.append(subroutine.name) - - if NEMOV4: - # For nemo4 always offload but without privatisation - print(f"Adding OpenMP offloading to subroutine: {subroutine.name}") - insert_explicit_loop_parallelism( - subroutine, - region_directive_trans=omp_target_trans, - loop_directive_trans=omp_gpu_loop_trans, - collapse=True, - privatise_arrays=False, - asynchronous_parallelism=enable_async, - uniform_intrinsics_only=REPRODUCIBLE, - enable_reductions=not REPRODUCIBLE - ) - elif psyir.name not in PARALLELISATION_ISSUES + OFFLOADING_ISSUES: - print(f"Adding OpenMP offloading to subroutine: {subroutine.name}") - insert_explicit_loop_parallelism( - subroutine, - region_directive_trans=omp_target_trans, - loop_directive_trans=omp_gpu_loop_trans, - collapse=True, - asynchronous_parallelism=enable_async, - privatise_arrays=True, - uniform_intrinsics_only=REPRODUCIBLE, - enable_reductions=not REPRODUCIBLE - ) - elif psyir.name not in PARALLELISATION_ISSUES: - # This have issues offloading, but we can still do OpenMP threading - print(f"Adding OpenMP threading to subroutine: {subroutine.name}") - # If asynchronous parallelism is enabled, these subroutines in - # sbcclo.f90 fail if they're parallelised on the CPU. 
- if (ASYNC_PARALLEL and subroutine.name in - ("get_cssrcsurf", "get_cstrgsurf")): - continue - insert_explicit_loop_parallelism( - subroutine, - loop_directive_trans=omp_cpu_loop_trans, - asynchronous_parallelism=enable_async, - privatise_arrays=True, - ) - - # Iterate again and add profiling hooks when needed - for subroutine in psyir.walk(Routine): - if psyir.name in SKIP_FOR_PERFORMANCE: - continue - if PROFILING_ENABLED and subroutine.name not in disable_profiling_for: - print(f"Adding profiling hooks to subroutine: {subroutine.name}") - add_profiling(subroutine.children) diff --git a/examples/nemo/scripts/utils.py b/examples/nemo/scripts/utils.py index 8161eaa16c..565537e0c2 100755 --- a/examples/nemo/scripts/utils.py +++ b/examples/nemo/scripts/utils.py @@ -201,6 +201,8 @@ def normalise_loops( :param hoist_expressions: whether to hoist bounds and loop invariant statements out of the loop nest. ''' + filename = schedule.root.name + nemo_v4 = os.environ.get('NEMOV4', False) if hoist_local_arrays and schedule.name not in CONTAINS_STMT_FUNCTIONS: # Apply the HoistLocalArraysTrans when possible, it cannot be applied # to files with statement functions because it will attempt to put the @@ -226,17 +228,22 @@ def normalise_loops( print(err.value) if convert_range_loops: - # Convert all array implicit loops to explicit loops - explicit_loops = ArrayAssignment2LoopsTrans() - for assignment in schedule.walk(Assignment): + if schedule.name in ("fld_def",): # TODO #2951: Fix array assignments with dependencies - if schedule.name in ("fld_def",): - continue - try: - explicit_loops.apply( - assignment, options={'verbose': True}) - except TransformationError: - pass + pass + elif nemo_v4 and filename == "dynspg_ts.f90": + # TODO #3256: Is there an issue with the L/UBOUND intrinsics + # that this transformation adds? 
+ pass + else: + # Convert all array implicit loops to explicit loops + explicit_loops = ArrayAssignment2LoopsTrans() + for assignment in schedule.walk(Assignment): + try: + explicit_loops.apply( + assignment, options={'verbose': True}) + except TransformationError: + pass if scalarise_loops: # Apply scalarisation to every loop. Execute this in reverse order diff --git a/src/psyclone/psyir/nodes/structure_reference.py b/src/psyclone/psyir/nodes/structure_reference.py index 9983849b5a..fff61cbb05 100644 --- a/src/psyclone/psyir/nodes/structure_reference.py +++ b/src/psyclone/psyir/nodes/structure_reference.py @@ -348,8 +348,11 @@ def _get_cursor_shape(cursor, cursor_type): if not isinstance(cursor_type, (UnresolvedType, UnsupportedType)): # Once we've hit an Unresolved/UnsupportedType the cursor_type # will remain set to that as we can't do any better. - cursor_type = cursor_type.components[ - cursor.name.lower()].datatype + try: + cursor_type = cursor_type.components[ + cursor.name.lower()].datatype + except KeyError: + return UnresolvedType() try: cursor_shape = _get_cursor_shape(cursor, cursor_type) except NotImplementedError: diff --git a/src/psyclone/tests/psyir/nodes/structure_reference_test.py b/src/psyclone/tests/psyir/nodes/structure_reference_test.py index 6d4311a7ab..612c6b9f99 100644 --- a/src/psyclone/tests/psyir/nodes/structure_reference_test.py +++ b/src/psyclone/tests/psyir/nodes/structure_reference_test.py @@ -263,6 +263,11 @@ def test_struc_ref_datatype(): sref0 = nodes.StructureReference.create(ssym0, ["nx"]) assert sref0.datatype == symbols.INTEGER_TYPE + # If the type component is not found (e.g. 
it is inherited, which psyclone + # does not support), return UnresolvedType + sref = nodes.StructureReference.create(ssym0, ["not_specified"]) + assert sref.datatype == symbols.UnresolvedType() + # Symbol with type defined by DataTypeSymbol grid_type_symbol = symbols.DataTypeSymbol("grid_type", grid_type) ssym = symbols.DataSymbol("grid", grid_type_symbol) diff --git a/src/psyclone/tests/psyir/transformations/transformations_test.py b/src/psyclone/tests/psyir/transformations/transformations_test.py index bcb9e0176f..8b2d150ef3 100644 --- a/src/psyclone/tests/psyir/transformations/transformations_test.py +++ b/src/psyclone/tests/psyir/transformations/transformations_test.py @@ -123,7 +123,7 @@ def test_accparalleltrans_validate(fortran_reader): ''' Test that ACCParallelTrans validation fails if it contains non-allowed constructs. ''' - omptargettrans = ACCParallelTrans() + accparalleltrans = ACCParallelTrans() code = ''' function myfunc(a) @@ -134,6 +134,8 @@ def test_accparalleltrans_validate(fortran_reader): integer, dimension(10, 10) :: A integer :: i integer :: j + character*8 :: a, b + character :: c(8), d(8) do i = 1, 10 do j = 1, 10 A(i, j) = myfunc(3) @@ -149,35 +151,62 @@ def test_accparalleltrans_validate(fortran_reader): A(i,j) = GET_COMMAND(2) end do end do + do i = 1, 8 + a(i) = b(i) + end do + do i = 1, 8 + c(i) = d(i) + end do end subroutine ''' psyir = fortran_reader.psyir_from_source(code) loops = psyir.walk(Loop, stop_type=Loop) with pytest.raises(TransformationError) as err: - omptargettrans.validate(loops[0]) + accparalleltrans.validate(loops[0]) assert ("'myfunc' is not available on the accelerator device, and " "therefore it cannot be called from within an ACC parallel region." 
in str(err.value)) with pytest.raises(TransformationError) as err: - omptargettrans.validate(loops[1]) + accparalleltrans.validate(loops[1]) assert ("Nodes of type 'CodeBlock' cannot be enclosed by a ACCParallel" "Trans transformation" in str(err.value)) with pytest.raises(TransformationError) as err: - omptargettrans.validate(loops[2]) + accparalleltrans.validate(loops[2]) assert ("'GET_COMMAND' is not available on the default accelerator " "device. Use the 'device_string' option to specify a different " "device." in str(err.value)) with pytest.raises(TransformationError) as err: - omptargettrans.validate(loops[2], options={'device_string': - 'nvfortran-all'}) + accparalleltrans.validate(loops[2], options={'device_string': + 'nvfortran-all'}) assert ("'GET_COMMAND' is not available on the 'nvfortran-all' accelerator" " device. Use the 'device_string' option to specify a different " "device." in str(err.value)) + # Character substrings and no verbose option + with pytest.raises(TransformationError) as err: + accparalleltrans.validate(loops[3]) + assert ("ACCParallelTrans doesn't enclose regions that uses characters, " + "but found: b(i), use the 'allow_strings' transformation option " + "to offload this region." in str(err.value)) + assert loops[3].preceding_comment == "" + + # Character array and verbose option + with pytest.raises(TransformationError) as err: + accparalleltrans.validate(loops[4], options={'verbose': True}) + assert ("ACCParallelTrans doesn't enclose regions that uses characters, " + "but found: c(i), use the 'allow_strings' transformation option " + "to offload this region." 
in str(err.value))
+    assert ("but found: c(i), use the 'allow_strings'"
+            in loops[4].preceding_comment)
+
+    # These validate with the right option
+    accparalleltrans.validate(loops[3], options={'allow_strings': True})
+    accparalleltrans.validate(loops[4], options={'allow_strings': True})
+
 
 def test_accenterdata():
     ''' Generic tests for the ACCEnterDataTrans class '''
diff --git a/src/psyclone/transformations.py b/src/psyclone/transformations.py
index 0315ebba53..39bf6f5442 100644
--- a/src/psyclone/transformations.py
+++ b/src/psyclone/transformations.py
@@ -61,7 +61,7 @@
     ACCDataDirective, ACCDirective, ACCEnterDataDirective, ACCKernelsDirective,
     ACCLoopDirective, ACCParallelDirective, ACCRoutineDirective,
     Call, CodeBlock, Directive, Literal, Loop, Node,
-    Return, Schedule, PSyDataNode, IntrinsicCall)
+    Reference, Return, Schedule, PSyDataNode, IntrinsicCall)
 from psyclone.psyir.nodes.acc_mixins import ACCAsyncMixin
 from psyclone.psyir.nodes.array_mixin import ArrayMixin
 from psyclone.psyir.nodes.omp_directives import (
@@ -1186,9 +1186,18 @@ def validate(self, node_list, options=None):
             avoid using unsupported nodes inside a region.
         :param bool options["default_present"]: this flag controls if the
             inserted directive should include the default_present clause.
+        :param bool options["allow_strings"]: whether to allow the
+            transformation on assignments involving character types. Defaults
+            to False.
+        :param bool options["verbose"]: whether to attach the reason for a
+            failed validation as a preceding comment on the offending node.
+            Defaults to False.
'''
         node_list = self.get_node_list(node_list)
+        verbose = options.get("verbose", False) if options else False
+        device_string = options.get("device_string", "") if options else ""
+        allow_strings = options.get("allow_strings", False) if options else False
         super().validate(node_list, options)
         if options is not None and "default_present" in options:
             if not isinstance(options["default_present"], bool):
@@ -1196,8 +1205,26 @@ def validate(self, node_list, options=None):
                     f"The provided 'default_present' option must be a "
                     f"boolean, but found '{options['default_present']}'."
                 )
-        device_string = options.get("device_string", "") if options else ""
         for node in node_list:
+            if not allow_strings:
+                # Check there are no character assignments in the region
+                for datanode in node.walk((Reference, Literal),
+                                          stop_type=Reference):
+                    dtype = datanode.datatype
+                    # Don't allow CHARACTERS on GPU
+                    if hasattr(dtype, "intrinsic"):
+                        if dtype.intrinsic == ScalarType.Intrinsic.CHARACTER:
+                            message = (
+                                f"ACCParallelTrans doesn't enclose regions "
+                                f"that uses characters, but found: "
+                                f"{datanode.debug_string()}, use the "
+                                f"'allow_strings' transformation option to "
+                                f"offload this region."
+                            )
+                            if verbose:
+                                node.preceding_comment = message
+                            raise TransformationError(message)
+
             for call in node.walk(Call):
                 if not call.is_available_on_device(device_string):
                     if isinstance(call, IntrinsicCall):
@@ -1230,6 +1257,9 @@ def apply(self, target_nodes, options=None):
             avoid using unsupported nodes inside a region.
         :param bool options["default_present"]: this flag controls if the
             inserted directive should include the default_present clause.
+        :param bool options["allow_strings"]: whether to allow the
+            transformation on assignments involving character types. Defaults
+            to False.
 
         '''
         if not options: