From 4da5ee3ce04fd160a905789cb1fb4a3dedefc079 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Thu, 9 Apr 2026 08:00:31 +0200 Subject: [PATCH 01/27] SRE-3704 ci: Fault injection testing stage on VM/bare metal unitTestPost() already processes nlt-junit.xml via the testResults parameter it receives. The bare 'junit testResults: nlt-junit.xml' call that follows is redundant and has no failure protection: it uses the default healthScaleFactor so when fault injection tests intentionally produce failures in nlt-junit.xml it marks the build FAILURE immediately, overriding the controlled result handling done by unitTestPost(). When node_local_test.py runs with --no-root, DAOS logs are written to /localhome/jenkins/build/nlt_logs/ instead of /tmp/. The existing rsync only fetches from /tmp/, leaving nlt_logs/ empty and causing: No artifacts found that match the file pattern "nlt_logs/". Configuration error? Add a second rsync from build/nlt_logs/ to collect logs from the --no-root code path. The '|| true' ensures non-fatal behavior when the path does not exist (plain NLT runs without --no-root). Jenkinsfile: simplify NLT fault injection recordIssues call The vm_test/nlt-errors.json issue scanning for the 'NLT Fault injection testing' stage is now handled by unitTestPost() in pipeline-lib, so remove it from the explicit recordIssues call here. fault_status fallback only based on PATH - Add fallback `fault_status` detection: if the primary detection via `$PREFIX/bin` fails, try resolving `fault_status` via `$PATH`, improving robustness when the binary is installed via RPM rather than built in-tree. 
Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-python-bandit: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-el8-gcc: true Skip-build-leap15-gcc: true Skip-func-test-el9: true --- Jenkinsfile | 105 +++++++++------------------------ ci/docker_nlt.sh | 42 ------------- ci/unit/test_nlt.sh | 8 +-- ci/unit/test_nlt_node.sh | 14 ++++- ci/unit/test_nlt_post.sh | 9 ++- utils/docker/Dockerfile.ubuntu | 2 +- utils/node_local_test.py | 8 +++ 7 files changed, 59 insertions(+), 129 deletions(-) delete mode 100755 ci/docker_nlt.sh diff --git a/Jenkinsfile b/Jenkinsfile index 55a3418f689..043ccaa4960 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -47,42 +47,6 @@ void job_step_update(def value=currentBuild.currentResult) { jobStatusUpdate(job_status_internal, env.STAGE_NAME, value) } -Map nlt_test() { - // groovylint-disable-next-line NoJavaUtilDate - Date startDate = new Date() - try { - unstash('nltr') - } catch (e) { - print 'Unstash failed, results from NLT stage will not be included' - } - sh label: 'Fault injection testing using NLT', - script: './ci/docker_nlt.sh --class-name fault-injection fi' - List filesList = [] - filesList.addAll(findFiles(glob: '*.memcheck.xml')) - int vgfail = 0 - int vgerr = 0 - if (filesList) { - String rcs = sh label: 'Check for Valgrind errors', - script: "grep -E ')' ${filesList.join(' ')} || true", - returnStdout: true - if (rcs) { - vgfail = 1 - } - String suite = sanitizedStageName() - junitSimpleReport suite: suite, - file: suite + '_valgrind_results.xml', - fails: vgfail, - errors: vgerr, - name: 'Valgrind_Memcheck', - class: 'Valgrind', - message: 'Valgrind Memcheck error detected', - testdata: rcs - } - int runTime = durationSeconds(startDate) - Map runData = ['nlttest_time': runTime] - return runData -} - // For master, this is just some wildly high number String next_version() { return 
'1000' @@ -351,6 +315,9 @@ pipeline { booleanParam(name: 'CI_FUNCTIONAL_leap15_TEST', defaultValue: false, description: 'Run the Functional on Leap 15 test stage') + booleanParam(name: 'CI_FUNCTIONAL_sles15_TEST', + defaultValue: false, + description: 'Run the Functional on SLES 15 test stage') booleanParam(name: 'CI_FUNCTIONAL_ubuntu20_TEST', defaultValue: false, description: 'Run the Functional on Ubuntu 20.04 test stage') @@ -391,8 +358,11 @@ pipeline { defaultValue: 'ci_vm9', description: 'Label to use for 9 VM functional tests') string(name: 'CI_NLT_1_LABEL', - defaultValue: 'ci_nlt_1', + defaultValue: 'ci_nlt_vm1', description: 'Label to use for NLT tests') + string(name: 'CI_FI_1_LABEL', + defaultValue: 'ci_fi_vm1', + description: 'Label to use for Fault Injection (FI) tests') string(name: 'FUNCTIONAL_HARDWARE_MEDIUM_LABEL', defaultValue: 'ci_nvme5', description: 'Label to use for the Functional Hardware Medium (MD on SSD) stages') @@ -743,7 +713,7 @@ pipeline { } steps { job_step_update( - unitTest(timeout_time: 60, + unitTest(timeout_time: 240, inst_repos: daosRepos(), test_script: 'ci/unit/test_nlt.sh', unstash_opt: true, @@ -978,63 +948,43 @@ pipeline { } } // post } // stage('Functional on Ubuntu 20.04') - stage('Fault injection testing') { + stage('NLT Fault injection testing') { when { beforeAgent true expression { !skipStage() } } agent { - dockerfile { - filename 'utils/docker/Dockerfile.el.9' - label 'docker_runner_fi' - additionalBuildArgs dockerBuildArgs(repo_type: 'stable', - parallel_build: true, - deps_build: true) + - ' --build-arg POINT_RELEASE=.7 ' - args '--tmpfs /mnt/daos_0' - } + label params.CI_FI_1_LABEL } steps { + /* job_step_update(nlt_test()) */ job_step_update( - sconsBuild(parallel_build: true, - scons_args: 'PREFIX=/opt/daos TARGET_TYPE=release BUILD_TYPE=debug', - build_deps: 'no')) - job_step_update(nlt_test()) - // recordCoverage(tools: [[parser: 'COBERTURA', pattern:'nltr.xml']], - // skipPublishingChecks: true, - // id: 
'fir', name: 'Fault Injection Report') + unitTest(timeout_time: 600, + inst_repos: daosRepos(), + test_script: 'ci/unit/test_nlt.sh --memcheck no' + + ' --system-ram-reserved 4 --server-debug WARN' + + ' --log-usage-import nltr.json' + + ' --log-usage-save nltr.xml' + + ' --class-name fault-injection fi', + unstash_opt: true, + unstash_tests: false, + inst_rpms: unitPackages(target: 'el9') + ' daos-client-tests', + image_version: 'el9.7')) } post { always { + unitTestPost artifacts: ['nlt_logs/'], + testResults: 'nlt-junit.xml', + always_script: 'ci/unit/test_nlt_post.sh' discoverGitReferenceBuild referenceJob: 'daos-stack/daos/master', scm: 'daos-stack/daos', requiredResult: hudson.model.Result.UNSTABLE - recordIssues enabledForFailure: true, - /* ignore warning/errors from PMDK logging system */ - filters: [excludeFile('pmdk/.+')], - failOnError: false, - ignoreQualityGate: true, - qualityGates: [[threshold: 1, type: 'TOTAL_ERROR'], - [threshold: 1, type: 'TOTAL_HIGH'], - [threshold: 1, type: 'NEW_NORMAL', unstable: true], - [threshold: 1, type: 'NEW_LOW', unstable: true]], - tools: [issues(pattern: 'nlt-errors.json', - name: 'Fault injection issues', - id: 'Fault_Injection'), - issues(pattern: 'nlt-client-leaks.json', - name: 'Fault injection leaks', - id: 'NLT_client')], - scm: 'daos-stack/daos' - junit testResults: 'nlt-junit.xml' - stash name: 'fault-inject-valgrind', - includes: '*.memcheck.xml', - allowEmpty: true archiveArtifacts artifacts: 'nlt_logs/fault-injection/', allowEmptyArchive: true job_status_update() } } - } // stage('Fault injection testing') + } // stage('NLT Fault injection testing') stage('Test RPMs on EL 9.6') { when { beforeAgent true @@ -1255,8 +1205,7 @@ pipeline { post { always { valgrindReportPublish valgrind_stashes: ['nlt-memcheck', - 'unit-memcheck', - 'fault-inject-valgrind'] + 'unit-memcheck'] job_status_update('final_status') jobStatusWrite(job_status_internal) } diff --git a/ci/docker_nlt.sh b/ci/docker_nlt.sh deleted file mode 
100755 index a6d85eba771..00000000000 --- a/ci/docker_nlt.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash - -# Script for running NLT in a docker container. This is called from Jenkinsfile -# where needed, and is a cheat way of running setup_daos_server_helper under sudo -# and NLT itself from a single script. - -set -e - -set -x - -. utils/sl/setup_local.sh - -ps auwx -sudo --preserve-env=SL_PREFIX,SL_SPDK_PREFIX ./utils/setup_daos_server_helper.sh - -TMP_DIR=$(mktemp -d) - -cp utils/node_local_test.py utils/nlt_server.yaml .build_vars.json "$TMP_DIR" -cp src/tests/ftest/cart/util/cart_logparse.py src/tests/ftest/cart/util/cart_logtest.py "$TMP_DIR" -if [ -e nltr.json ] -then - cp nltr.json "$TMP_DIR" -fi - -pushd "$TMP_DIR" - -set +e - -sudo --preserve-env=VIRTUAL_ENV,PATH ./node_local_test.py \ - --no-root --memcheck no --system-ram-reserved 48 --server-debug WARN \ - --log-usage-import nltr.json --log-usage-save nltr.xml "$@" - -RC=$? -set -e -popd - -cp "$TMP_DIR"/*.json . -cp "$TMP_DIR"/*.xml . -sudo chmod -R o+r "$TMP_DIR"/nlt_logs -cp -r "$TMP_DIR"/nlt_logs . - -exit $RC diff --git a/ci/unit/test_nlt.sh b/ci/unit/test_nlt.sh index b8176aca873..23e3bc8b549 100755 --- a/ci/unit/test_nlt.sh +++ b/ci/unit/test_nlt.sh @@ -13,7 +13,7 @@ mydir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" # Copy over the install tree and some of the build tree. 
rsync -rlpt -z -e "ssh $SSH_KEY_ARGS" .build_vars* opt-daos.tar utils requirements-utest.txt jenkins@"$NODE":build/ -# shellcheck disable=SC2029 -ssh -tt "$SSH_KEY_ARGS" jenkins@"$NODE" "DAOS_HTTPS_PROXY=\"${DAOS_HTTPS_PROXY:-}\" \ - DAOS_NO_PROXY=\"${DAOS_NO_PROXY:-}\" \ - $(cat "$mydir/test_nlt_node.sh")" +ssh -T "$SSH_KEY_ARGS" jenkins@"$NODE" \ + "DAOS_HTTPS_PROXY=\"${DAOS_HTTPS_PROXY:-}\" \ + DAOS_NO_PROXY=\"${DAOS_NO_PROXY:-}\" \ + bash -s -- $*" < "$mydir/test_nlt_node.sh" diff --git a/ci/unit/test_nlt_node.sh b/ci/unit/test_nlt_node.sh index 62a734f3bcf..085e8cfc75b 100755 --- a/ci/unit/test_nlt_node.sh +++ b/ci/unit/test_nlt_node.sh @@ -45,7 +45,15 @@ pip install /opt/daos/lib/daos/python/ sudo prlimit --nofile=1024:262144 --pid $$ prlimit -n -HTTPS_PROXY="${DAOS_HTTPS_PROXY:-}" \ +if [ "$#" -eq 0 ]; then + set -- --max-log-size 1950MiB \ + --system-ram-reserved 4 \ + --dfuse-dir /localhome/jenkins/ \ + --log-usage-save nltir.xml \ + --log-usage-export nltr.json all +fi + +exec env \ + HTTPS_PROXY="${DAOS_HTTPS_PROXY:-}" \ NO_PROXY="${DAOS_NO_PROXY:-}" \ - ./utils/node_local_test.py --max-log-size 1950MiB \ - --dfuse-dir /localhome/jenkins/ --log-usage-save nltir.xml --log-usage-export nltr.json all + ./utils/node_local_test.py "$@" diff --git a/ci/unit/test_nlt_post.sh b/ci/unit/test_nlt_post.sh index c46a63dac2f..b92b475c064 100755 --- a/ci/unit/test_nlt_post.sh +++ b/ci/unit/test_nlt_post.sh @@ -16,7 +16,14 @@ mkdir nlt_logs rsync -v -dprt -e "ssh $SSH_KEY_ARGS" jenkins@"$NODE":/tmp/ \ --filter="include dnt*.log" --filter="include dnt*.log.bz2" \ --filter="include dnt_fi_*_logs" \ - --filter="exclude *" nlt_logs/ + --filter="exclude *" nlt_logs/ || true + +# When running with --no-root, DAOS logs go to build/nlt_logs/ on the node +# instead of /tmp/, so fetch them from there as well. 
+rsync -v -rlpt -e "ssh $SSH_KEY_ARGS" jenkins@"$NODE":build/nlt_logs/ \ + --filter="include dnt*.log" --filter="include dnt*.log.bz2" \ + --filter="include dnt_fi_*_logs" --filter="include */" \ + --filter="exclude *" nlt_logs/ || true rsync -v -dpt -z -e "ssh $SSH_KEY_ARGS" jenkins@"$NODE":build/ \ --filter="include nlt*.json" --filter="include dnt*.xml" \ diff --git a/utils/docker/Dockerfile.ubuntu b/utils/docker/Dockerfile.ubuntu index 9ad069d710a..7897e40f929 100644 --- a/utils/docker/Dockerfile.ubuntu +++ b/utils/docker/Dockerfile.ubuntu @@ -1,6 +1,6 @@ # Copyright 2018-2024 Intel Corporation # Copyright 2025 Google LLC -# Copyright 2025-2026 Hewlett Packard Enterprise Development LP +# Copyright 2025 Hewlett Packard Enterprise Development LP # All rights reserved. # # 'recipe' for Docker to build an image of Ubuntu-based environment for building the DAOS project. diff --git a/utils/node_local_test.py b/utils/node_local_test.py index 724843e87b5..d26f74b9f43 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -6757,6 +6757,14 @@ def run(wf, args): run_fi = True else: print("Unable to detect fault injection feature, skipping testing") + print("Use fallback on $PATH") + fs = subprocess.run(['fault_status'], check=False) + print(fs) + if fs.returncode == 0: + run_fi = True + else: + print("Unable to detect fault injection feature - fall back does not work, " + "skipping testing") if run_fi: args.server_debug = 'INFO' From 56b5b7238b75ca3e0f96c5c1e3d6453dab0e4f8d Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Mon, 11 May 2026 22:48:49 +0200 Subject: [PATCH 02/27] Test with 14 cores Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-python-bandit: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-el8-gcc: true Skip-build-leap15-gcc: true Skip-func-test-el9: true Skip-func-test-leap15: true --- 
utils/node_local_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/node_local_test.py b/utils/node_local_test.py index d26f74b9f43..0003ea104d3 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -5924,7 +5924,7 @@ def _prep(self): # pylint: disable-next=no-member num_cores = len(os.sched_getaffinity(0)) - if num_cores < 20: + if num_cores < 14: max_child = 1 else: max_child = int(num_cores / 4 * 3) From 51bd85cf35791847fa1590868019aea214700e1f Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Thu, 7 May 2026 16:22:10 +0200 Subject: [PATCH 03/27] nlt: remove ABT_STACK_OVERFLOW_CHECK=mprotect from nlt_server.yaml mprotect-based Argobots ULT stack overflow checking causes a TLB shootdown IPI on every stack allocation/deallocation. On KVM hosts running multiple VMs in parallel this results in VM exits across all vCPUs, significantly increasing latency under concurrent load. Remove the setting to use the default (no overflow check), which is acceptable for a CI/test environment where crashes are already caught by the test harness. Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-python-bandit: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-el8-gcc: true Skip-build-leap15-gcc: true Skip-func-test-el9: true --- Jenkinsfile | 5 +++-- ci/unit/test_nlt_node.sh | 1 - utils/docker/Dockerfile.ubuntu | 2 +- utils/nlt_server.yaml | 1 - 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 043ccaa4960..3f010da4073 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -19,6 +19,7 @@ // To use a test branch (i.e. PR) until it lands to master // I.e. 
for testing library changes //@Library(value='pipeline-lib@your_branch') _ +@Library(value=['pipeline-lib@grom72/SRE-3704','system-pipeline-lib@grom72/SRE-3704']) _ /* groovylint-disable-next-line CompileStatic */ job_status_internal = [:] @@ -713,7 +714,7 @@ pipeline { } steps { job_step_update( - unitTest(timeout_time: 240, + unitTest(timeout_time: 60, inst_repos: daosRepos(), test_script: 'ci/unit/test_nlt.sh', unstash_opt: true, @@ -959,7 +960,7 @@ pipeline { steps { /* job_step_update(nlt_test()) */ job_step_update( - unitTest(timeout_time: 600, + unitTest(timeout_time: 240, inst_repos: daosRepos(), test_script: 'ci/unit/test_nlt.sh --memcheck no' + ' --system-ram-reserved 4 --server-debug WARN' + diff --git a/ci/unit/test_nlt_node.sh b/ci/unit/test_nlt_node.sh index 085e8cfc75b..554e553d325 100755 --- a/ci/unit/test_nlt_node.sh +++ b/ci/unit/test_nlt_node.sh @@ -6,7 +6,6 @@ set -uex sudo bash -c 'echo 1 > /proc/sys/kernel/sysrq' -sudo mkdir -p /mnt/daos # using mmap()'ed ULT stacks requires to bump system default if [ "$(sudo sysctl -n vm.max_map_count)" -lt "1000000" ] ; then sudo sysctl vm.max_map_count=1000000 diff --git a/utils/docker/Dockerfile.ubuntu b/utils/docker/Dockerfile.ubuntu index 7897e40f929..9ad069d710a 100644 --- a/utils/docker/Dockerfile.ubuntu +++ b/utils/docker/Dockerfile.ubuntu @@ -1,6 +1,6 @@ # Copyright 2018-2024 Intel Corporation # Copyright 2025 Google LLC -# Copyright 2025 Hewlett Packard Enterprise Development LP +# Copyright 2025-2026 Hewlett Packard Enterprise Development LP # All rights reserved. # # 'recipe' for Docker to build an image of Ubuntu-based environment for building the DAOS project. 
diff --git a/utils/nlt_server.yaml b/utils/nlt_server.yaml index 438e55718be..5a9cc7f4c61 100644 --- a/utils/nlt_server.yaml +++ b/utils/nlt_server.yaml @@ -14,7 +14,6 @@ engines: - DAOS_MD_CAP=1024 - DAOS_STRICT_SHUTDOWN=1 - DAOS_TARGET_OVERSUBSCRIBE=1 - - ABT_STACK_OVERFLOW_CHECK=mprotect storage: - class: ram From d21036dbca48035db12f753e99606aaf72065ce7 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Tue, 5 May 2026 21:47:06 +0200 Subject: [PATCH 04/27] Revert "DAOS-623 test: add allowed error for FI (#17959)" This reverts commit b03decb9b3d40d810ed1bc48f9ecd8e1ce683729. Signed-off-by: Tomasz Gromadzki Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-python-bandit: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-el8-gcc: true Skip-build-leap15-gcc: true Skip-func-test-el9: true --- src/tests/ftest/cart/util/cart_logtest.py | 5 ----- utils/node_local_test.py | 7 ------- 2 files changed, 12 deletions(-) diff --git a/src/tests/ftest/cart/util/cart_logtest.py b/src/tests/ftest/cart/util/cart_logtest.py index d203ce0784e..a500cf2046c 100755 --- a/src/tests/ftest/cart/util/cart_logtest.py +++ b/src/tests/ftest/cart/util/cart_logtest.py @@ -225,7 +225,6 @@ def __init__(self, log_iter, quiet=False): self.fi_triggered = False self.fi_location = None self.skip_suffixes = [] - self.skip_substrings = [] self._tracers = [] self.ftest_mode = False @@ -444,10 +443,6 @@ def _check_pid_from_log_file(self, pid, abort_on_warning, leak_wf, show_memleaks show = False if show and any(map(line.get_msg().endswith, self.skip_suffixes)): show = False - if show: - line_msg = line.get_msg().casefold() - if any(sub in line_msg for sub in self.skip_substrings): - show = False if show: # Allow WARNING or ERROR messages, but anything higher like assert should # trigger a failure. 
diff --git a/utils/node_local_test.py b/utils/node_local_test.py index 0003ea104d3..8ca973d78e3 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -5025,13 +5025,6 @@ def sizeof_fmt(num, suffix='B'): if ignore_busy: lto.skip_suffixes.append(" DER_BUSY(-1012): 'Device or resource busy'") - lto.skip_substrings.extend([ - 'sluggish ec boundary report from rank', - 'sluggish stable epoch reporting', - 'progress callback was not called for too long', - 'rpc failed; rc:', - ]) - try: lto.check_log_file(abort_on_warning=True, show_memleaks=show_memleaks, From ec43c0314fecae7828fc4ac1e94b3dfb7e56a443 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Wed, 6 May 2026 18:38:36 +0200 Subject: [PATCH 05/27] Disable maldet fo CI nodes Signed-off-by: Tomasz Gromadzki --- ci/provisioning/post_provision_config_common_functions.sh | 6 ++++++ ci/unit/test_nlt_node.sh | 6 ++++++ ci/unit/test_nlt_post.sh | 1 + 3 files changed, 13 insertions(+) diff --git a/ci/provisioning/post_provision_config_common_functions.sh b/ci/provisioning/post_provision_config_common_functions.sh index 46fba4b21c2..68dd4d3da7f 100755 --- a/ci/provisioning/post_provision_config_common_functions.sh +++ b/ci/provisioning/post_provision_config_common_functions.sh @@ -321,6 +321,12 @@ post_provision_config_nodes() { dnf -y erase fuse3\* fi + # maldet brings additional load on CPU during tests (e,g, NLT tests) + if command -v maldet &>/dev/null; then + systemctl stop maldet 2>/dev/null || true + systemctl disable maldet 2>/dev/null || true + fi + if [ -n "$CONFIG_POWER_ONLY" ]; then rm -f "$REPOS_DIR"/*_job_daos-stack_job_*_job_*.repo time dnf -y erase fio fuse ior-hpc mpich-autoload \ diff --git a/ci/unit/test_nlt_node.sh b/ci/unit/test_nlt_node.sh index 554e553d325..fdf03fec352 100755 --- a/ci/unit/test_nlt_node.sh +++ b/ci/unit/test_nlt_node.sh @@ -46,13 +46,19 @@ prlimit -n if [ "$#" -eq 0 ]; then set -- --max-log-size 1950MiB \ + --class-name nlt \ --system-ram-reserved 4 \ --dfuse-dir 
/localhome/jenkins/ \ --log-usage-save nltir.xml \ --log-usage-export nltr.json all fi +mkdir -p nlt_logs +sudo mount -t tmpfs tmpfs nlt_logs +sudo chown jenkins:jenkins nlt_logs + exec env \ + TMPDIR="$(pwd)/nlt_logs" \ HTTPS_PROXY="${DAOS_HTTPS_PROXY:-}" \ NO_PROXY="${DAOS_NO_PROXY:-}" \ ./utils/node_local_test.py "$@" diff --git a/ci/unit/test_nlt_post.sh b/ci/unit/test_nlt_post.sh index b92b475c064..a70fa14c0e3 100755 --- a/ci/unit/test_nlt_post.sh +++ b/ci/unit/test_nlt_post.sh @@ -29,5 +29,6 @@ rsync -v -dpt -z -e "ssh $SSH_KEY_ARGS" jenkins@"$NODE":build/ \ --filter="include nlt*.json" --filter="include dnt*.xml" \ --filter="include nltir.xml" --filter="include nltr.json" \ --filter="include nlt-junit.xml" --filter="exclude *" ./ + mkdir -p vm_test mv nlt-errors.json vm_test/ From 2c6c9f7e4d02b6b67af73ee3b7da6bcc8eaf0818 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Tue, 12 May 2026 12:56:40 +0200 Subject: [PATCH 06/27] Revert "Disable maldet fo CI nodes" This reverts commit ec43c0314fecae7828fc4ac1e94b3dfb7e56a443. 
Signed-off-by: Tomasz Gromadzki --- ci/provisioning/post_provision_config_common_functions.sh | 6 ------ ci/unit/test_nlt_node.sh | 6 ------ ci/unit/test_nlt_post.sh | 1 - 3 files changed, 13 deletions(-) diff --git a/ci/provisioning/post_provision_config_common_functions.sh b/ci/provisioning/post_provision_config_common_functions.sh index 68dd4d3da7f..46fba4b21c2 100755 --- a/ci/provisioning/post_provision_config_common_functions.sh +++ b/ci/provisioning/post_provision_config_common_functions.sh @@ -321,12 +321,6 @@ post_provision_config_nodes() { dnf -y erase fuse3\* fi - # maldet brings additional load on CPU during tests (e,g, NLT tests) - if command -v maldet &>/dev/null; then - systemctl stop maldet 2>/dev/null || true - systemctl disable maldet 2>/dev/null || true - fi - if [ -n "$CONFIG_POWER_ONLY" ]; then rm -f "$REPOS_DIR"/*_job_daos-stack_job_*_job_*.repo time dnf -y erase fio fuse ior-hpc mpich-autoload \ diff --git a/ci/unit/test_nlt_node.sh b/ci/unit/test_nlt_node.sh index fdf03fec352..554e553d325 100755 --- a/ci/unit/test_nlt_node.sh +++ b/ci/unit/test_nlt_node.sh @@ -46,19 +46,13 @@ prlimit -n if [ "$#" -eq 0 ]; then set -- --max-log-size 1950MiB \ - --class-name nlt \ --system-ram-reserved 4 \ --dfuse-dir /localhome/jenkins/ \ --log-usage-save nltir.xml \ --log-usage-export nltr.json all fi -mkdir -p nlt_logs -sudo mount -t tmpfs tmpfs nlt_logs -sudo chown jenkins:jenkins nlt_logs - exec env \ - TMPDIR="$(pwd)/nlt_logs" \ HTTPS_PROXY="${DAOS_HTTPS_PROXY:-}" \ NO_PROXY="${DAOS_NO_PROXY:-}" \ ./utils/node_local_test.py "$@" diff --git a/ci/unit/test_nlt_post.sh b/ci/unit/test_nlt_post.sh index a70fa14c0e3..b92b475c064 100755 --- a/ci/unit/test_nlt_post.sh +++ b/ci/unit/test_nlt_post.sh @@ -29,6 +29,5 @@ rsync -v -dpt -z -e "ssh $SSH_KEY_ARGS" jenkins@"$NODE":build/ \ --filter="include nlt*.json" --filter="include dnt*.xml" \ --filter="include nltir.xml" --filter="include nltr.json" \ --filter="include nlt-junit.xml" --filter="exclude *" ./ - mkdir -p 
vm_test mv nlt-errors.json vm_test/ From 015e4e437d123e9ea1d1731e3627e1b34080e311 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Tue, 12 May 2026 12:57:45 +0200 Subject: [PATCH 07/27] Reapply "Disable maldet fo CI nodes" This reverts commit 2c6c9f7e4d02b6b67af73ee3b7da6bcc8eaf0818. Signed-off-by: Tomasz Gromadzki --- ci/provisioning/post_provision_config_common_functions.sh | 6 ++++++ ci/unit/test_nlt_node.sh | 6 ++++++ ci/unit/test_nlt_post.sh | 1 + 3 files changed, 13 insertions(+) diff --git a/ci/provisioning/post_provision_config_common_functions.sh b/ci/provisioning/post_provision_config_common_functions.sh index 46fba4b21c2..68dd4d3da7f 100755 --- a/ci/provisioning/post_provision_config_common_functions.sh +++ b/ci/provisioning/post_provision_config_common_functions.sh @@ -321,6 +321,12 @@ post_provision_config_nodes() { dnf -y erase fuse3\* fi + # maldet brings additional load on CPU during tests (e,g, NLT tests) + if command -v maldet &>/dev/null; then + systemctl stop maldet 2>/dev/null || true + systemctl disable maldet 2>/dev/null || true + fi + if [ -n "$CONFIG_POWER_ONLY" ]; then rm -f "$REPOS_DIR"/*_job_daos-stack_job_*_job_*.repo time dnf -y erase fio fuse ior-hpc mpich-autoload \ diff --git a/ci/unit/test_nlt_node.sh b/ci/unit/test_nlt_node.sh index 554e553d325..fdf03fec352 100755 --- a/ci/unit/test_nlt_node.sh +++ b/ci/unit/test_nlt_node.sh @@ -46,13 +46,19 @@ prlimit -n if [ "$#" -eq 0 ]; then set -- --max-log-size 1950MiB \ + --class-name nlt \ --system-ram-reserved 4 \ --dfuse-dir /localhome/jenkins/ \ --log-usage-save nltir.xml \ --log-usage-export nltr.json all fi +mkdir -p nlt_logs +sudo mount -t tmpfs tmpfs nlt_logs +sudo chown jenkins:jenkins nlt_logs + exec env \ + TMPDIR="$(pwd)/nlt_logs" \ HTTPS_PROXY="${DAOS_HTTPS_PROXY:-}" \ NO_PROXY="${DAOS_NO_PROXY:-}" \ ./utils/node_local_test.py "$@" diff --git a/ci/unit/test_nlt_post.sh b/ci/unit/test_nlt_post.sh index b92b475c064..a70fa14c0e3 100755 --- a/ci/unit/test_nlt_post.sh +++ 
b/ci/unit/test_nlt_post.sh @@ -29,5 +29,6 @@ rsync -v -dpt -z -e "ssh $SSH_KEY_ARGS" jenkins@"$NODE":build/ \ --filter="include nlt*.json" --filter="include dnt*.xml" \ --filter="include nltir.xml" --filter="include nltr.json" \ --filter="include nlt-junit.xml" --filter="exclude *" ./ + mkdir -p vm_test mv nlt-errors.json vm_test/ From af5d3fec19083ec4e039d3830038734a22e8cac8 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Tue, 12 May 2026 12:59:00 +0200 Subject: [PATCH 08/27] Revert maldet fix - maldet should be disabled in an image Signed-off-by: Tomasz Gromadzki --- ci/provisioning/post_provision_config_common_functions.sh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ci/provisioning/post_provision_config_common_functions.sh b/ci/provisioning/post_provision_config_common_functions.sh index 68dd4d3da7f..46fba4b21c2 100755 --- a/ci/provisioning/post_provision_config_common_functions.sh +++ b/ci/provisioning/post_provision_config_common_functions.sh @@ -321,12 +321,6 @@ post_provision_config_nodes() { dnf -y erase fuse3\* fi - # maldet brings additional load on CPU during tests (e,g, NLT tests) - if command -v maldet &>/dev/null; then - systemctl stop maldet 2>/dev/null || true - systemctl disable maldet 2>/dev/null || true - fi - if [ -n "$CONFIG_POWER_ONLY" ]; then rm -f "$REPOS_DIR"/*_job_daos-stack_job_*_job_*.repo time dnf -y erase fio fuse ior-hpc mpich-autoload \ From 26a8c509648c9849a407ea8551cbd040cd0c4ad2 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Tue, 12 May 2026 13:00:26 +0200 Subject: [PATCH 09/27] Revert SLES 15 fix - should be added by another PR Signed-off-by: Tomasz Gromadzki --- Jenkinsfile | 3 --- 1 file changed, 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 3f010da4073..0d828b65f1b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -316,9 +316,6 @@ pipeline { booleanParam(name: 'CI_FUNCTIONAL_leap15_TEST', defaultValue: false, description: 'Run the Functional on Leap 15 test stage') - booleanParam(name: 
'CI_FUNCTIONAL_sles15_TEST', - defaultValue: false, - description: 'Run the Functional on SLES 15 test stage') booleanParam(name: 'CI_FUNCTIONAL_ubuntu20_TEST', defaultValue: false, description: 'Run the Functional on Ubuntu 20.04 test stage') From 29ef8653e5734de4a58164c661bb144a7807ddc8 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Tue, 12 May 2026 13:06:49 +0200 Subject: [PATCH 10/27] Fix: copy only what is needed Signed-off-by: Tomasz Gromadzki --- ci/unit/test_nlt_post.sh | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/ci/unit/test_nlt_post.sh b/ci/unit/test_nlt_post.sh index a70fa14c0e3..bde2687ce22 100755 --- a/ci/unit/test_nlt_post.sh +++ b/ci/unit/test_nlt_post.sh @@ -13,17 +13,13 @@ mkdir nlt_logs # Copy any log files. Use rsync filters here to allow us to specify # all files we want to copy, as it's much more flexible than using # standard wildcards. -rsync -v -dprt -e "ssh $SSH_KEY_ARGS" jenkins@"$NODE":/tmp/ \ - --filter="include dnt*.log" --filter="include dnt*.log.bz2" \ - --filter="include dnt_fi_*_logs" \ - --filter="exclude *" nlt_logs/ || true # When running with --no-root, DAOS logs go to build/nlt_logs/ on the node # instead of /tmp/, so fetch them from there as well. 
rsync -v -rlpt -e "ssh $SSH_KEY_ARGS" jenkins@"$NODE":build/nlt_logs/ \ --filter="include dnt*.log" --filter="include dnt*.log.bz2" \ --filter="include dnt_fi_*_logs" --filter="include */" \ - --filter="exclude *" nlt_logs/ || true + --filter="exclude *" nlt_logs/ rsync -v -dpt -z -e "ssh $SSH_KEY_ARGS" jenkins@"$NODE":build/ \ --filter="include nlt*.json" --filter="include dnt*.xml" \ From 855a046b91e37ffb5e317938a02fcfa992b3bccc Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Tue, 12 May 2026 13:11:03 +0200 Subject: [PATCH 11/27] Fix: minor fixes to address review comments Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-python-bandit: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-el8-gcc: true Skip-build-leap15-gcc: true Skip-func-test-el9: true Skip-func-test-leap15: true --- Jenkinsfile | 1 - utils/node_local_test.py | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 0d828b65f1b..e628ef32300 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -955,7 +955,6 @@ pipeline { label params.CI_FI_1_LABEL } steps { - /* job_step_update(nlt_test()) */ job_step_update( unitTest(timeout_time: 240, inst_repos: daosRepos(), diff --git a/utils/node_local_test.py b/utils/node_local_test.py index 8ca973d78e3..a3cc7d9e1ba 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -6749,15 +6749,13 @@ def run(wf, args): if fs.returncode == 0: run_fi = True else: - print("Unable to detect fault injection feature, skipping testing") print("Use fallback on $PATH") fs = subprocess.run(['fault_status'], check=False) print(fs) if fs.returncode == 0: run_fi = True else: - print("Unable to detect fault injection feature - fall back does not work, " - "skipping testing") + print("Unable to detect fault injection feature - skipping FI testing") if run_fi: args.server_debug = 
'INFO' From e0de010436cbe0443cfe6fc2188fb2b26008778e Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Tue, 12 May 2026 13:21:49 +0200 Subject: [PATCH 12/27] Ensure 4GiB tmpfs for logs Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-python-bandit: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-el8-gcc: true Skip-build-leap15-gcc: true Skip-func-test-el9: true Skip-func-test-leap15: true --- ci/unit/test_nlt_node.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ci/unit/test_nlt_node.sh b/ci/unit/test_nlt_node.sh index fdf03fec352..a4725a3f01f 100755 --- a/ci/unit/test_nlt_node.sh +++ b/ci/unit/test_nlt_node.sh @@ -54,7 +54,13 @@ if [ "$#" -eq 0 ]; then fi mkdir -p nlt_logs -sudo mount -t tmpfs tmpfs nlt_logs +avail_line=$(grep '^MemAvailable:' /proc/meminfo) +avail_mem_kib=${avail_line//[^0-9]/} +if [ "$avail_mem_kib" -lt $((4 * 1024 * 1024)) ]; then + echo "ERROR: Less than 4GiB RAM available for nlt_logs tmpfs (${avail_mem_kib} KiB)" >&2 + exit 1 +fi +sudo mount -t tmpfs -o size=4g tmpfs nlt_logs sudo chown jenkins:jenkins nlt_logs exec env \ From da2d5fa1845611b13e7ba8ba7413909c31782c63 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Tue, 12 May 2026 15:16:44 +0200 Subject: [PATCH 13/27] Simplify fault_status detection. 
Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-python-bandit: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-el8-gcc: true Skip-build-leap15-gcc: true Skip-func-test-el9: true Skip-func-test-leap15: true --- utils/node_local_test.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/utils/node_local_test.py b/utils/node_local_test.py index a3cc7d9e1ba..6d9cc2340aa 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -6744,18 +6744,14 @@ def run(wf, args): run_fi = False if args.perf_check or fi_test or fi_test_dfuse: - fs = subprocess.run([os.path.join(conf['PREFIX'], 'bin', 'fault_status')], check=False) + fi_env = os.environ.copy() + fi_env['PATH'] = f'{join(conf["PREFIX"], "bin")}:{fi_env["PATH"]}' + fs = subprocess.run(['fault_status'], check=False, env=fi_env) print(fs) if fs.returncode == 0: run_fi = True else: - print("Use fallback on $PATH") - fs = subprocess.run(['fault_status'], check=False) - print(fs) - if fs.returncode == 0: - run_fi = True - else: - print("Unable to detect fault injection feature - skipping FI testing") + print("Unable to detect fault injection feature - skipping FI testing") if run_fi: args.server_debug = 'INFO' From 3d148d5d561e44b0389e455ebad985b5bc5422a5 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Tue, 12 May 2026 15:33:05 +0200 Subject: [PATCH 14/27] Provide nlt parameters directly from Jenkinsfile. 
Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-python-bandit: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-el8-gcc: true Skip-build-leap15-gcc: true Skip-func-test-el9: true Skip-func-test-leap15: true --- Jenkinsfile | 10 ++++++++-- ci/unit/test_nlt_node.sh | 9 --------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index e628ef32300..f42fd3b220c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -713,8 +713,14 @@ pipeline { job_step_update( unitTest(timeout_time: 60, inst_repos: daosRepos(), - test_script: 'ci/unit/test_nlt.sh', - unstash_opt: true, + test_script: 'ci/unit/test_nlt.sh' + + ' --system-ram-reserved 4' + + ' --max-log-size 1950MiB' + + ' --dfuse-dir /localhome/jenkins/' + + ' --log-usage-save nltir.xml' + + ' --log-usage-export nltr.json' + + ' --class-name nlt all', + unstash_opt: true, unstash_tests: false, inst_rpms: unitPackages(target: 'el9'), image_version: 'el9.7')) diff --git a/ci/unit/test_nlt_node.sh b/ci/unit/test_nlt_node.sh index a4725a3f01f..45107cb3d36 100755 --- a/ci/unit/test_nlt_node.sh +++ b/ci/unit/test_nlt_node.sh @@ -44,15 +44,6 @@ pip install /opt/daos/lib/daos/python/ sudo prlimit --nofile=1024:262144 --pid $$ prlimit -n -if [ "$#" -eq 0 ]; then - set -- --max-log-size 1950MiB \ - --class-name nlt \ - --system-ram-reserved 4 \ - --dfuse-dir /localhome/jenkins/ \ - --log-usage-save nltir.xml \ - --log-usage-export nltr.json all -fi - mkdir -p nlt_logs avail_line=$(grep '^MemAvailable:' /proc/meminfo) avail_mem_kib=${avail_line//[^0-9]/} From e26506b0b349d9dc3950032f6f637e3febf6a310 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Tue, 12 May 2026 15:41:39 +0200 Subject: [PATCH 15/27] Code format Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-python-bandit: true Skip-unit-test: true Skip-unit-test-memcheck: 
true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-el8-gcc: true Skip-build-leap15-gcc: true Skip-func-test-el9: true Skip-func-test-leap15: true --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index f42fd3b220c..332fecbc8bc 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -720,7 +720,7 @@ pipeline { ' --log-usage-save nltir.xml' + ' --log-usage-export nltr.json' + ' --class-name nlt all', - unstash_opt: true, + unstash_opt: true, unstash_tests: false, inst_rpms: unitPackages(target: 'el9'), image_version: 'el9.7')) From 8d94534b4c021c58ba2ee29fb569ff2bd65de036 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Thu, 14 May 2026 09:53:23 +0200 Subject: [PATCH 16/27] unitTest with prov_env_vars parameter Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-python-bandit: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-el8-gcc: true Skip-build-leap15-gcc: true Skip-func-test-el9: true Skip-func-test-leap15: true --- Jenkinsfile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 332fecbc8bc..90ca5a4ee7f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -723,7 +723,8 @@ pipeline { unstash_opt: true, unstash_tests: false, inst_rpms: unitPackages(target: 'el9'), - image_version: 'el9.7')) + image_version: 'el9.7', + prov_env_vars: 'VM_CPUS=14')) // recordCoverage(tools: [[parser: 'COBERTURA', pattern:'nltir.xml']], // skipPublishingChecks: true, // id: 'tlc', name: 'Fault Injection Interim Report') @@ -972,7 +973,8 @@ pipeline { unstash_opt: true, unstash_tests: false, inst_rpms: unitPackages(target: 'el9') + ' daos-client-tests', - image_version: 'el9.7')) + image_version: 'el9.7', + prov_env_vars: 'VM_CPUS=14')) } post { always { From 
1d559fe585434e0de0363bb8eb90108f94592d82 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Thu, 14 May 2026 11:40:31 +0200 Subject: [PATCH 17/27] Address review comments Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-python-bandit: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-el8-gcc: true Skip-build-leap15-gcc: true Skip-func-test-el9: true Skip-func-test-leap15: true --- ci/unit/test_nlt_node.sh | 15 ++++----------- ci/unit/test_nlt_post.sh | 4 ++-- 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/ci/unit/test_nlt_node.sh b/ci/unit/test_nlt_node.sh index 45107cb3d36..5775cedd03e 100755 --- a/ci/unit/test_nlt_node.sh +++ b/ci/unit/test_nlt_node.sh @@ -45,17 +45,10 @@ sudo prlimit --nofile=1024:262144 --pid $$ prlimit -n mkdir -p nlt_logs -avail_line=$(grep '^MemAvailable:' /proc/meminfo) -avail_mem_kib=${avail_line//[^0-9]/} -if [ "$avail_mem_kib" -lt $((4 * 1024 * 1024)) ]; then - echo "ERROR: Less than 4GiB RAM available for nlt_logs tmpfs (${avail_mem_kib} KiB)" >&2 - exit 1 -fi sudo mount -t tmpfs -o size=4g tmpfs nlt_logs sudo chown jenkins:jenkins nlt_logs -exec env \ - TMPDIR="$(pwd)/nlt_logs" \ - HTTPS_PROXY="${DAOS_HTTPS_PROXY:-}" \ - NO_PROXY="${DAOS_NO_PROXY:-}" \ - ./utils/node_local_test.py "$@" +TMPDIR="$(pwd)/nlt_logs" \ +HTTPS_PROXY="${DAOS_HTTPS_PROXY:-}" \ +NO_PROXY="${DAOS_NO_PROXY:-}" \ +exec ./utils/node_local_test.py "$@" diff --git a/ci/unit/test_nlt_post.sh b/ci/unit/test_nlt_post.sh index bde2687ce22..db39d9c7c3d 100755 --- a/ci/unit/test_nlt_post.sh +++ b/ci/unit/test_nlt_post.sh @@ -14,8 +14,8 @@ mkdir nlt_logs # all files we want to copy, as it's much more flexible than using # standard wildcards. -# When running with --no-root, DAOS logs go to build/nlt_logs/ on the node -# instead of /tmp/, so fetch them from there as well. 
+# Assuming that node_local_test.py is run with --class-name, +# the logs will be in build/nlt_logs/ on the node. rsync -v -rlpt -e "ssh $SSH_KEY_ARGS" jenkins@"$NODE":build/nlt_logs/ \ --filter="include dnt*.log" --filter="include dnt*.log.bz2" \ --filter="include dnt_fi_*_logs" --filter="include */" \ From c7b1d07b325967eb54f8615089c7058fb6b819ab Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Thu, 14 May 2026 11:47:59 +0200 Subject: [PATCH 18/27] Remove join from f-string Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-python-bandit: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-el8-gcc: true Skip-build-leap15-gcc: true Skip-func-test-el9: true Skip-func-test-leap15: true --- utils/node_local_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/node_local_test.py b/utils/node_local_test.py index 6d9cc2340aa..c51b26ad33f 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -6745,7 +6745,7 @@ def run(wf, args): if args.perf_check or fi_test or fi_test_dfuse: fi_env = os.environ.copy() - fi_env['PATH'] = f'{join(conf["PREFIX"], "bin")}:{fi_env["PATH"]}' + fi_env['PATH'] = f'{conf["PREFIX"]}/bin:{fi_env["PATH"]}' fs = subprocess.run(['fault_status'], check=False, env=fi_env) print(fs) if fs.returncode == 0: From 9d48314dc9bd647963e188d727860527ea7aec79 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Thu, 14 May 2026 13:35:46 +0200 Subject: [PATCH 19/27] Fix indent Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-python-bandit: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-el8-gcc: true Skip-build-leap15-gcc: true Skip-func-test-el9: true Skip-func-test-leap15: true --- ci/unit/test_nlt_node.sh | 6 +++--- 1 file 
changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/unit/test_nlt_node.sh b/ci/unit/test_nlt_node.sh index 5775cedd03e..baee27c9a20 100755 --- a/ci/unit/test_nlt_node.sh +++ b/ci/unit/test_nlt_node.sh @@ -49,6 +49,6 @@ sudo mount -t tmpfs -o size=4g tmpfs nlt_logs sudo chown jenkins:jenkins nlt_logs TMPDIR="$(pwd)/nlt_logs" \ -HTTPS_PROXY="${DAOS_HTTPS_PROXY:-}" \ -NO_PROXY="${DAOS_NO_PROXY:-}" \ -exec ./utils/node_local_test.py "$@" + HTTPS_PROXY="${DAOS_HTTPS_PROXY:-}" \ + NO_PROXY="${DAOS_NO_PROXY:-}" \ + exec ./utils/node_local_test.py "$@" From 97ca76e9aa1fab9a26231a3e20ceda42a9cca9c9 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Thu, 14 May 2026 13:38:32 +0200 Subject: [PATCH 20/27] Let NLT control diskspace Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-python-bandit: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-el8-gcc: true Skip-build-leap15-gcc: true Skip-func-test-el9: true Skip-func-test-leap15: true --- ci/unit/test_nlt_node.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/unit/test_nlt_node.sh b/ci/unit/test_nlt_node.sh index baee27c9a20..d59688ab5a5 100755 --- a/ci/unit/test_nlt_node.sh +++ b/ci/unit/test_nlt_node.sh @@ -45,7 +45,7 @@ sudo prlimit --nofile=1024:262144 --pid $$ prlimit -n mkdir -p nlt_logs -sudo mount -t tmpfs -o size=4g tmpfs nlt_logs +sudo mount -t tmpfs tmpfs nlt_logs sudo chown jenkins:jenkins nlt_logs TMPDIR="$(pwd)/nlt_logs" \ From 14bfc4eddcf4e44bb626dbc6016f03d7f306d3f4 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Thu, 14 May 2026 19:41:54 +0200 Subject: [PATCH 21/27] system-pipeline-lib changes landed Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-python-bandit: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true 
Skip-func-hw-test: true Skip-build-el8-gcc: true Skip-build-leap15-gcc: true Skip-func-test-el9: true Skip-func-test-leap15: true --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 90ca5a4ee7f..25b4df3f8e7 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -19,7 +19,7 @@ // To use a test branch (i.e. PR) until it lands to master // I.e. for testing library changes //@Library(value='pipeline-lib@your_branch') _ -@Library(value=['pipeline-lib@grom72/SRE-3704','system-pipeline-lib@grom72/SRE-3704']) _ +@Library(value=['pipeline-lib@grom72/SRE-3704']) _ /* groovylint-disable-next-line CompileStatic */ job_status_internal = [:] From 6af6041bcb4c47a24cfa5b7d58c4aa3b17c38323 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Fri, 15 May 2026 13:22:35 +0200 Subject: [PATCH 22/27] ci: explicitly pass NLT/FI parameters to unitTest and unitTestPost pipeline-lib now supports overriding NLT/FI defaults (always_script, testResults, valgrind_pattern, with_valgrind, NLT, FI) via the config map, taking priority over the values auto-detected from the stage name by parseStageInfo. Make the Jenkinsfile stages explicit to take advantage of this and to make the stage configuration self-documenting. 
NLT stage (unitTest call): - Add with_valgrind: 'memcheck', valgrind_pattern: '*memcheck.xml', always_script: 'ci/unit/test_nlt_post.sh', testResults: 'nlt-junit.xml' NLT stage (unitTestPost call): - Remove always_script (now passed to unitTest above) - Add NLT: true to explicitly activate the NLT post-processing block (recordIssues, discoverGitReferenceBuild) instead of relying on stage name detection - Add valgrind_pattern: '*memcheck.xml' for the valgrind_stash NLT Fault injection testing stage (unitTest call): - Add always_script: 'ci/unit/test_nlt_post.sh', testResults: 'nlt-junit.xml' - Add with_valgrind: '' to explicitly suppress valgrind for FI NLT Fault injection testing stage (unitTestPost call): - Replace always_script with FI: true to explicitly activate fault injection post-processing (nlt-client-leaks.json, 'Fault injection' naming, discoverGitReferenceBuild) instead of relying on the now- removed stage name auto-detection of FI in parseStageInfo Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-python-bandit: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-el8-gcc: true Skip-build-leap15-gcc: true Skip-func-test-el9: true Skip-func-test-leap15: true --- Jenkinsfile | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 25b4df3f8e7..9448db206fc 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -720,6 +720,10 @@ pipeline { ' --log-usage-save nltir.xml' + ' --log-usage-export nltr.json' + ' --class-name nlt all', + with_valgrind: 'memcheck' + valgrind_pattern: '*memcheck.xml', + always_script: 'ci/unit/test_nlt_post.sh', + testResults: 'nlt-junit.xml', unstash_opt: true, unstash_tests: false, inst_rpms: unitPackages(target: 'el9'), @@ -734,8 +738,9 @@ pipeline { always { unitTestPost artifacts: ['nlt_logs/'], testResults: 'nlt-junit.xml', - always_script: 
'ci/unit/test_nlt_post.sh', - valgrind_stash: 'nlt-memcheck' + valgrind_stash: 'nlt-memcheck', + valgrind_pattern: '*memcheck.xml', + NLT: true recordIssues enabledForFailure: true, failOnError: false, ignoreQualityGate: true, @@ -970,6 +975,9 @@ pipeline { ' --log-usage-import nltr.json' + ' --log-usage-save nltr.xml' + ' --class-name fault-injection fi', + with_valgrind: '', + always_script: 'ci/unit/test_nlt_post.sh', + testResults: 'nlt-junit.xml', unstash_opt: true, unstash_tests: false, inst_rpms: unitPackages(target: 'el9') + ' daos-client-tests', @@ -980,7 +988,7 @@ pipeline { always { unitTestPost artifacts: ['nlt_logs/'], testResults: 'nlt-junit.xml', - always_script: 'ci/unit/test_nlt_post.sh' + FI: true discoverGitReferenceBuild referenceJob: 'daos-stack/daos/master', scm: 'daos-stack/daos', requiredResult: hudson.model.Result.UNSTABLE From 81a40f50defbb8a2bd835a81cae68238e9a29f7f Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Fri, 15 May 2026 13:28:26 +0200 Subject: [PATCH 23/27] Fix missing comma Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-python-bandit: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-el8-gcc: true Skip-build-leap15-gcc: true Skip-func-test-el9: true Skip-func-test-leap15: true --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 9448db206fc..ca364c1d51f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -720,7 +720,7 @@ pipeline { ' --log-usage-save nltir.xml' + ' --log-usage-export nltr.json' + ' --class-name nlt all', - with_valgrind: 'memcheck' + with_valgrind: 'memcheck', valgrind_pattern: '*memcheck.xml', always_script: 'ci/unit/test_nlt_post.sh', testResults: 'nlt-junit.xml', From 8caf3ca92d9c21919c467c4cc4dfa3b2169e4203 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Fri, 15 May 2026 16:15:58 +0200 Subject: 
[PATCH 24/27] Fix: config with_valgrind override stage_info Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-python-bandit: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-el8-gcc: true Skip-build-leap15-gcc: true Skip-func-test-el9: true Skip-func-test-leap15: true --- Jenkinsfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Jenkinsfile b/Jenkinsfile index ca364c1d51f..5342c8e4fb7 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -988,6 +988,7 @@ pipeline { always { unitTestPost artifacts: ['nlt_logs/'], testResults: 'nlt-junit.xml', + with_valgrind: '', FI: true discoverGitReferenceBuild referenceJob: 'daos-stack/daos/master', scm: 'daos-stack/daos', From a82e340e3e18306691feae804c7e6bcb8ae2177f Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Sat, 16 May 2026 13:50:22 +0200 Subject: [PATCH 25/27] Restore original stage name NLT in stage name is no longer needed as required information is transferred via parameters of unitTest and unitTestPost procedures. 
Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-python-bandit: true Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-el8-gcc: true Skip-build-leap15-gcc: true Skip-func-test-el9: true Skip-func-test-leap15: true --- Jenkinsfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 5342c8e4fb7..91e6f8cf049 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -958,7 +958,7 @@ pipeline { } } // post } // stage('Functional on Ubuntu 20.04') - stage('NLT Fault injection testing') { + stage('Fault injection testing') { when { beforeAgent true expression { !skipStage() } @@ -998,7 +998,7 @@ pipeline { job_status_update() } } - } // stage('NLT Fault injection testing') + } // stage('Fault injection testing') stage('Test RPMs on EL 9.6') { when { beforeAgent true From b0435f32a92e812504a2bc7ff776be3582900376 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Sat, 16 May 2026 13:53:39 +0200 Subject: [PATCH 26/27] Trigger build with Python Bandit check Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-el8-gcc: true Skip-build-leap15-gcc: true Skip-func-test-el9: true Skip-func-test-leap15: true From 68a39cf388da9bf751171605bfb1cee927ccb4e5 Mon Sep 17 00:00:00 2001 From: Tomasz Gromadzki Date: Thu, 21 May 2026 09:50:31 +0200 Subject: [PATCH 27/27] All pipeline-lib changes landed to master. 
Signed-off-by: Tomasz Gromadzki Priority: 2 Cancel-prev-build: false Skip-unit-test: true Skip-unit-test-memcheck: true Skip-func-vm-all: true Skip-test-el-9-rpms: true Skip-test-leap-15-rpms: true Skip-func-hw-test: true Skip-build-el8-gcc: true Skip-build-leap15-gcc: true Skip-func-test-el9: true Skip-func-test-leap15: true Signed-off-by: Tomasz Gromadzki --- Jenkinsfile | 1 - 1 file changed, 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index bb17af25d9a..a855f6c67d7 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -19,7 +19,6 @@ // To use a test branch (i.e. PR) until it lands to master // I.e. for testing library changes //@Library(value='pipeline-lib@your_branch') _ -@Library(value=['pipeline-lib@grom72/SRE-3704']) _ /* groovylint-disable-next-line CompileStatic */ job_status_internal = [:]