diff --git a/ci/cscs/cscs-ci-cd.md b/ci/cscs/cscs-ci-cd.md index 9691155b6..5636dd990 100644 --- a/ci/cscs/cscs-ci-cd.md +++ b/ci/cscs/cscs-ci-cd.md @@ -1,5 +1,5 @@ ## CI/CD PR testing on ALPS @ CSCS -Unit tests and Integration tests will be performed for each PR made to IPPL on ALPS at CSCS, currently, the (daint) gh200 nodes and (beverin) mi300a nodes are being used for tests (other architectures can/will be added on request and subject to availability). Each PR that is tested will receive either a green tick (pass), or a red cross (fail). +Unit tests and Integration tests will be performed for each PR made to IPPL on ALPS at CSCS, currently, the (daint) NVidia gh200 nodes, (beverin) AMD mi300a nodes and (eiger) AMD Zen2 multicore/OpenMP nodes are being used for tests (other architectures can/will be added on request and subject to availability). Each PR that is tested will receive either a green tick (pass), or a red cross (fail). Clicking on the Tick/Cross will take you to the pipeline information page where you can drill down to see individual stages/steps * what stages/steps exist (and dependencies), currently there are @@ -21,7 +21,7 @@ PRs **submitted via forks** can be tested by adding a comment of the form Example: run multiple testing pipelines ``` -cscs-ci run cscs-ci-gh200,cscs-ci-mi300 +cscs-ci run cscs-ci-gh200,cscs-ci-mi300,cscs-ci-openmp ``` Example: run only the gh200 testing pipeline ``` @@ -68,6 +68,9 @@ ippl-root       ├── cuda       │   ├── build_sm90.yml       │   └── run_sm90.yml +    ├── openmp +    │   ├── build_openmp.yml +    │   └── run_openmp.yml       └── rocm          ├── build_rocm-6.3.yml          └── run_rocm-6.3.yml @@ -93,7 +96,14 @@ Essential information on how the yaml files are structured and what sections mea **CSCS CI admin console** https://cicd-ext-mw.cscs.ch/ci/setup/ui?repo=2663791694469788 the ID 2663791694469788 refers to the internal project number granted to the IPPL projet to submit tasks to ALPS for testing. 
-On this pages, per mission can be set and pipeline properties can be changed. +On this page, permissions can be set and pipeline properties can be changed. (Note: CSCS staff members can directly register a project by clicking on register new project at the bottom right of the CI overview page). + +**github settings webhooks** +``` +payload url : https://cicd-ext-mw.cscs.ch/ci/webhook_ci?id=`XXXXXXX` +payload type: application/json +Secret : issued when new CI/CD project created +``` Note that the pipeline names can be set via the admin console, the names **cscs-ci-gh200** and **cscs-ci-mi300** were chosen to represent the architectures that they are run on, but are otherwise completely arbitrary and if changed the adjusted names must be used to trigger pipeline checks when done manually as described above. @@ -101,3 +111,20 @@ The pipeline entry points are set on the admin page to point to `ci/cscs/cscs-gh200.yml` and `ci/cscs/cscs-mi300.yml` if these yaml files are moved to renamed, the pipeline entry points must be edited accordingly. For information on how to setup authorization keys for firecrest launching of CSCS jobs, please consult the pages above. + +**CSCS setup steps (for future reference)** + * Github: In the project settings setup an application/json webhook and set the payload url (eg `https://cicd-ext-mw.cscs.ch/ci/webhook_ci?id=0123456789012`). 
The secret is taken from the CSCS CI admin console when you created the CI testing project + * Github: Create a fine grained API token using your personal account, it might look like `github_pat_01234567890123456789abcdefabcdef`, it needs + * Read access to metadata + * Read/Write access to commit statuses + * The API token will be pasted into the CSCS CI admin console under the global config "Notification token" + * Create an "Application" (eg named: ippl-testing or opalx-testing) in the developer.cscs.ch dev-portal console and add subscriptions to the firecrest interfaces for respective access + * firecrest-HPC (for daint/eiger) + * firecrest-beverin (mi300) + * Create OAuth tokens (key/secret) in the firecrest realm and paste those into the CI admin console in the admin section under firecrest key/secret entries. Set accounts for csstaff, project accounts, user names in permissions + * Setup pipelines like 'cscs-ci-gh200' to match the `ci/cscs/cscs-gh200.yml` yaml entrypoint + * On github check webhooks/response to see if PRs are triggering anything + * Check https://gitlab.com/cscs-ci/ci-testing/webhook-ci/mirrors/6151408209445194/694321096757981/ `build/pipelines` to see CI triggered activity + * On a PR, use `cscs-ci run` or `cscs-ci run cscs-ci-gh200, ...` to trigger a pipeline + * To retrigger master branch rebuild/test - go to the main github page of project, locate red cross (or green tick) at top of project, it shows a list of failed or successful builds. Locate the pipeline you want, and copy the link under "details", paste it into address bar, but remove `?iid=808&type=gitlab` and then page will show info. Click bottom right corner "login to restart jobs" and master branch CI will be triggered. 
+ diff --git a/ci/cscs/cuda/run_sm90.yml b/ci/cscs/cuda/run_sm90.yml index f9e2ab762..79a42c630 100644 --- a/ci/cscs/cuda/run_sm90.yml +++ b/ci/cscs/cuda/run_sm90.yml @@ -87,13 +87,13 @@ ippl-test-cuda12-sm90-release-4-ranks: # --------------------------------------------------------- # 4 ranks release : known failing tests # --------------------------------------------------------- -ippl-test-cuda12-sm90-release-failing-4-ranks: - needs: ["ippl-build-cuda12-sm90-release"] - extends: .ippl-test-cuda-common - variables: - SLURM_NTASKS: 4 - BUILD_DIR: "build-$CI_COMMIT_SHORT_SHA-release" - BUILD_TYPE: Release - TEST_INFO: "4-ranks-known-failing" - TEST_ARGS: "-R known_fail" - allow_failure: true +# ippl-test-cuda12-sm90-release-failing-4-ranks: +# needs: ["ippl-build-cuda12-sm90-release"] +# extends: .ippl-test-cuda-common +# variables: +# SLURM_NTASKS: 4 +# BUILD_DIR: "build-$CI_COMMIT_SHORT_SHA-release" +# BUILD_TYPE: Release +# TEST_INFO: "4-ranks-known-failing" +# TEST_ARGS: "-R known_fail" +# allow_failure: true diff --git a/ci/cscs/dashboard-configure-build.cmake b/ci/cscs/dashboard-configure-build.cmake index ed460f165..419adb6f2 100644 --- a/ci/cscs/dashboard-configure-build.cmake +++ b/ci/cscs/dashboard-configure-build.cmake @@ -24,39 +24,66 @@ set(CTEST_BUILD_NAME "${CDASH_LABEL}-${BUILD_ARCH}-${BUILD_TYPE}-${TEST_INFO}") set(CTEST_SOURCE_DIRECTORY "$ENV{CI_PROJECT_DIR}") set(CTEST_BINARY_DIRECTORY "${BUILD_DIR}") set(CTEST_CMAKE_GENERATOR "Ninja") -set(CTEST_GROUP "Pull_Requests") set(CTEST_GROUP "Experimental") # --- start a new build in CDash --- ctest_start(Experimental GROUP "${CTEST_GROUP}") +# --- Initialize base configure command as a CMake LIST --- set(CTEST_CONFIGURE_COMMAND "${CMAKE_COMMAND}") -set(CTEST_CONFIGURE_COMMAND "${CTEST_CONFIGURE_COMMAND} -S${CTEST_SOURCE_DIRECTORY}") -set(CTEST_CONFIGURE_COMMAND "${CTEST_CONFIGURE_COMMAND} -B${CTEST_BINARY_DIRECTORY}") -set(CTEST_CONFIGURE_COMMAND "${CTEST_CONFIGURE_COMMAND} 
-G${CTEST_CMAKE_GENERATOR}") -set(CTEST_CONFIGURE_COMMAND "${CTEST_CONFIGURE_COMMAND} --preset=${PRESET}") -set(CTEST_CONFIGURE_COMMAND "${CTEST_CONFIGURE_COMMAND} -DCMAKE_BUILD_TYPE=${BUILD_TYPE}") -set(CTEST_CONFIGURE_COMMAND "${CTEST_CONFIGURE_COMMAND} -DCMAKE_BUILD_RPATH_USE_ORIGIN=ON") -if(DEFINED IPPL_PLATFORMS) - set(CTEST_CONFIGURE_COMMAND "${CTEST_CONFIGURE_COMMAND} -DIPPL_PLATFORMS=${IPPL_PLATFORMS}") -endif() -if(DEFINED IPPL_OPENMP_THREADS) - set(CTEST_CONFIGURE_COMMAND - "${CTEST_CONFIGURE_COMMAND} -DIPPL_OPENMP_THREADS=${IPPL_OPENMP_THREADS}") -endif() -set(CTEST_CONFIGURE_COMMAND "${CTEST_CONFIGURE_COMMAND} -DIPPL_ENABLE_SOLVERS=ON") -set(CTEST_CONFIGURE_COMMAND "${CTEST_CONFIGURE_COMMAND} -DIPPL_MARK_FAILING_TESTS=ON") -set(CTEST_CONFIGURE_COMMAND "${CTEST_CONFIGURE_COMMAND} -DMPIEXEC_EXECUTABLE=${MPIEXEC_EXECUTABLE}") -set(CTEST_CONFIGURE_COMMAND "${CTEST_CONFIGURE_COMMAND} -DMPIEXEC_PREFLAGS=${MPIEXEC_PREFLAGS}") -set(CTEST_CONFIGURE_COMMAND - "${CTEST_CONFIGURE_COMMAND} -DMPIEXEC_MAX_NUMPROCS=${MPIEXEC_MAX_NUMPROCS}") -if(DEFINED Heffte_VERSION) - set(CTEST_CONFIGURE_COMMAND "${CTEST_CONFIGURE_COMMAND} -DHeffte_VERSION=${Heffte_VERSION}") -endif() -if(DEFINED Kokkos_VERSION) - set(CTEST_CONFIGURE_COMMAND "${CTEST_CONFIGURE_COMMAND} -DKokkos_VERSION=${Kokkos_VERSION}") +string(APPEND CTEST_CONFIGURE_COMMAND " -S${CTEST_SOURCE_DIRECTORY}") +string(APPEND CTEST_CONFIGURE_COMMAND " -B${CTEST_BINARY_DIRECTORY}") +string(APPEND CTEST_CONFIGURE_COMMAND " -G${CTEST_CMAKE_GENERATOR}") +string(APPEND CTEST_CONFIGURE_COMMAND " --preset=${PRESET}") +string(APPEND CTEST_CONFIGURE_COMMAND " -DCMAKE_BUILD_TYPE=${BUILD_TYPE}") + +# --------------------------------- +# cmake-format: off +# --------------------------------- +# --- Forward variables cleanly --- +set(VARS_TO_FORWARD + IPPL_PLATFORMS + IPPL_OPENMP_THREADS + IPPL_ENABLE_SCRIPTS + Heffte_VERSION + Kokkos_VERSION + MPIEXEC_EXECUTABLE + MPIEXEC_PREFLAGS + MPIEXEC_MAX_NUMPROCS +) + +foreach(VAR IN LISTS 
VARS_TO_FORWARD) + if(DEFINED ${VAR}) + set(VAL "${${VAR}}") + + if("${VAL}" MATCHES ";") + # 1. Force CMake to treat it as a literal string cache entry to preserve semicolons + string(APPEND CTEST_CONFIGURE_COMMAND " -D${VAR}:STRING=${VAL}") + + # 2. Expose it to the local CTest script scope so CTest's test launcher + # can parse it as a native list during the execution phase. + set(${VAR} "${VAL}") + else() + # Standard scalar variable (no semicolons) + string(APPEND CTEST_CONFIGURE_COMMAND " -D${VAR}=${VAL}") + endif() + endif() +endforeach() +# --------------------------------- +# cmake-format: on +# --------------------------------- + +# --- Append remaining static flags --- +string(APPEND CTEST_CONFIGURE_COMMAND " -DCMAKE_BUILD_RPATH_USE_ORIGIN=ON") +string(APPEND CTEST_CONFIGURE_COMMAND " -DIPPL_ENABLE_SOLVERS=ON") +string(APPEND CTEST_CONFIGURE_COMMAND " -DIPPL_MARK_FAILING_TESTS=ON") + +if(DEFINED Kokkos_ARCH_FLAG) + string(APPEND CTEST_CONFIGURE_COMMAND " -D${Kokkos_ARCH_FLAG}=ON") endif() -set(CTEST_CONFIGURE_COMMAND "${CTEST_CONFIGURE_COMMAND} -D${Kokkos_ARCH_FLAG}=ON") + +# --- Output our configure command for debug purposes--- +message("Final CTest configure command: ${CTEST_CONFIGURE_COMMAND}") # --- configure & build --- ctest_configure(RETURN_VALUE configure_result) diff --git a/ci/cscs/openmp/run_openmp.yml b/ci/cscs/openmp/run_openmp.yml index d8d9eda74..ac309bd6d 100644 --- a/ci/cscs/openmp/run_openmp.yml +++ b/ci/cscs/openmp/run_openmp.yml @@ -89,14 +89,14 @@ ippl-test-openmp-release-4-ranks: # --------------------------------------------------------- # 4 ranks release : known failing tests # --------------------------------------------------------- -ippl-test-openmp-release-failing-4-ranks: - needs: ["ippl-build-openmp-release"] - extends: .ippl-test-openmp-common - variables: - SLURM_NTASKS: 4 - SLURM_CPUS_PER_TASK: 32 - BUILD_DIR: "build-$CI_COMMIT_SHORT_SHA-release" - BUILD_TYPE: Release - TEST_INFO: "4-ranks-known-failing" - TEST_ARGS: "-R 
known_fail" - allow_failure: true +# ippl-test-openmp-release-failing-4-ranks: +# needs: ["ippl-build-openmp-release"] +# extends: .ippl-test-openmp-common +# variables: +# SLURM_NTASKS: 4 +# SLURM_CPUS_PER_TASK: 32 +# BUILD_DIR: "build-$CI_COMMIT_SHORT_SHA-release" +# BUILD_TYPE: Release +# TEST_INFO: "4-ranks-known-failing" +# TEST_ARGS: "-R known_fail" +# allow_failure: true diff --git a/ci/cscs/rocm/build_rocm-6.3.yml b/ci/cscs/rocm/build_rocm-6.3.yml index dcd65e5fd..339eb8347 100644 --- a/ci/cscs/rocm/build_rocm-6.3.yml +++ b/ci/cscs/rocm/build_rocm-6.3.yml @@ -5,7 +5,7 @@ variables: ROCM6_3_UENV: "prgenv-gnu/25.07-6.3.3:v12" WITH_UENV_VIEW: "default" SCRATCH: "/capstor/scratch/cscs/biddisco" - SRUN_FLAGS: "--uenv=${ROCM6_3_UENV};--view=${WITH_UENV_VIEW};--repo=$SCRATCH/.uenv-images-ci-beverin;$WRAPPER" + SRUN_FLAGS: "--uenv=${ROCM6_3_UENV};--view=${WITH_UENV_VIEW};--repo=$SCRATCH/.uenv-images-ci-beverin" BUILD_ARCH: "mi300" CTEST_SITE: "Beverin mi300 ROCm 6.3.3" @@ -28,6 +28,7 @@ variables: - export CXX=amdclang++ - export HSA_PATH=$( find /user-environment/ -name hsa-rocr-dev\* ) - export CMAKE_PREFIX_PATH=$HSA_PATH:$CMAKE_PREFIX_PATH + - export WRAPPER="$( pwd )/$BUILD_DIR/scripts/landau/strong-scaling-alps/wrapper-mi300.sh" script: - >- ctest -V -S $CI_PROJECT_DIR/ci/cscs/dashboard-configure-build.cmake @@ -36,11 +37,12 @@ variables: -DCDASH_LABEL=$CDASH_LABEL -DBUILD_TYPE=$BUILD_TYPE -DBUILD_DIR=$BUILD_PATH + -DIPPL_ENABLE_SCRIPTS=ON -DBUILD_ARCH=$BUILD_ARCH -DKokkos_VERSION=git.4.7.02 -DKokkos_ARCH_FLAG=Kokkos_ARCH_AMD_GFX942_APU -DMPIEXEC_EXECUTABLE=/usr/bin/srun - -DMPIEXEC_PREFLAGS="$SRUN_FLAGS" + -DMPIEXEC_PREFLAGS="$SRUN_FLAGS;${WRAPPER}" -DMPIEXEC_MAX_NUMPROCS=4 # -DHeffte_ENABLE_GPU_AWARE_MPI=OFF - echo "Build directory size (before cleanup):" $(du -sh $BUILD_PATH | cut -f1) diff --git a/ci/cscs/rocm/run_rocm-6.3.yml b/ci/cscs/rocm/run_rocm-6.3.yml index ba49a9740..8d10574cb 100644 --- a/ci/cscs/rocm/run_rocm-6.3.yml +++ 
b/ci/cscs/rocm/run_rocm-6.3.yml @@ -36,6 +36,8 @@ variables: # reset custom build name for cdash - export CTEST_BUILD_NAME="${CDASH_LABEL}-${BUILD_ARCH}-${BUILD_TYPE}-${TEST_INFO}" - find $BUILD_PATH -type f -name \*.xml -exec sed -i "s|BuildName=.*|BuildName=\"${CTEST_BUILD_NAME}\"|g" {} \; + - find $BUILD_PATH -name \*wrapper\* -exec realpath {} + + - grep MPIEXEC $BUILD_PATH/CMakeCache.txt # ctest should now have valid file paths, (use srun to launch with uenv setup) - ctest -V --output-on-failure --timeout 60 $TEST_ARGS -S $CI_PROJECT_DIR/ci/cscs/dashboard-test.cmake -DCTEST_SITE="$CTEST_SITE" @@ -87,13 +89,13 @@ ippl-test-rocm6_3-release-4-ranks: # --------------------------------------------------------- # 4 ranks release : known failing tests # --------------------------------------------------------- -ippl-test-rocm6_3-release-failing-4-ranks: - needs: ["ippl-build-rocm6_3-release"] - extends: .ippl-test-rocm-common - variables: - SLURM_NTASKS: 4 - BUILD_DIR: "build-$CI_COMMIT_SHORT_SHA-release" - BUILD_TYPE: Release - TEST_INFO: "4-ranks-known-failing" - TEST_ARGS: "-R known_fail" - allow_failure: true +# ippl-test-rocm6_3-release-failing-4-ranks: +# needs: ["ippl-build-rocm6_3-release"] +# extends: .ippl-test-rocm-common +# variables: +# SLURM_NTASKS: 4 +# BUILD_DIR: "build-$CI_COMMIT_SHORT_SHA-release" +# BUILD_TYPE: Release +# TEST_INFO: "4-ranks-known-failing" +# TEST_ARGS: "-R known_fail" +# allow_failure: true