Skip to content
Open
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
4da5ee3
SRE-3704 ci: Fault injection testing stage on VM/bare metal
grom72 Apr 9, 2026
56b5b72
Test with 14 cores
grom72 May 11, 2026
51bd85c
nlt: remove ABT_STACK_OVERFLOW_CHECK=mprotect from nlt_server.yaml
grom72 May 7, 2026
d21036d
Revert "DAOS-623 test: add allowed error for FI (#17959)"
grom72 May 5, 2026
ec43c03
Disable maldet fo CI nodes
grom72 May 6, 2026
2c6c9f7
Revert "Disable maldet fo CI nodes"
grom72 May 12, 2026
015e4e4
Reapply "Disable maldet fo CI nodes"
grom72 May 12, 2026
af5d3fe
Revert maldet fix - maldet should be disabled in an image
grom72 May 12, 2026
26a8c50
Revert SLES 15 fix - should be added by another PR
grom72 May 12, 2026
29ef865
Fix: copy only what is needed
grom72 May 12, 2026
855a046
Fix: minor fixes to address review comments
grom72 May 12, 2026
e0de010
Ensure 4GiB tmpfs for logs
grom72 May 12, 2026
da2d5fa
Simplify fault_status detection.
grom72 May 12, 2026
3d148d5
Provide nlt parameters directly from Jenkinsfile.
grom72 May 12, 2026
e26506b
Code format
grom72 May 12, 2026
8d94534
unitTest with prov_env_vars parameter
grom72 May 14, 2026
1d559fe
Address review comments
grom72 May 14, 2026
c7b1d07
Remove join from f-string
grom72 May 14, 2026
9d48314
Fix ident
grom72 May 14, 2026
97ca76e
Let NLT control diskspace
grom72 May 14, 2026
14bfc4e
system-pipeline-lib changes landed
grom72 May 14, 2026
6af6041
ci: explicitly pass NLT/FI parameters to unitTest and unitTestPost
grom72 May 15, 2026
81a40f5
Fix missing comma
grom72 May 15, 2026
8caf3ca
Fix: config with_valgrind override stage_info
grom72 May 15, 2026
a82e340
Restor original stage name
grom72 May 16, 2026
b0435f3
Trigger build with Python Bandit check
grom72 May 16, 2026
5587a42
Merge remote-tracking branch 'origin/master' into grom72/SRE-3704-CI-…
grom72 May 18, 2026
68a39cf
All pipeline-lib changes landed to master.
grom72 May 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 33 additions & 79 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
// To use a test branch (i.e. PR) until it lands to master
// I.e. for testing library changes
//@Library(value='pipeline-lib@your_branch') _
@Library(value=['pipeline-lib@grom72/SRE-3704','system-pipeline-lib@grom72/SRE-3704']) _
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Of course, this will need to be reverted.


/* groovylint-disable-next-line CompileStatic */
job_status_internal = [:]
Expand Down Expand Up @@ -47,42 +48,6 @@ void job_step_update(def value=currentBuild.currentResult) {
jobStatusUpdate(job_status_internal, env.STAGE_NAME, value)
}

/**
 * Run the NLT fault-injection tests and publish a Valgrind Memcheck summary.
 *
 * Executes ./ci/docker_nlt.sh for the fault-injection class, then scans any
 * *.memcheck.xml files returned by findFiles() for <error> entries and
 * records the outcome through junitSimpleReport.
 *
 * @return a Map with the single key 'nlttest_time' holding the value of
 *         durationSeconds(startDate) (presumably the elapsed time in
 *         seconds - confirm against the pipeline library).
 */
Map nlt_test() {
// Start of the measured interval; handed to durationSeconds() at the end.
// groovylint-disable-next-line NoJavaUtilDate
Date startDate = new Date()
// The 'nltr' stash is optional: a failed unstash is reported but not fatal.
try {
unstash('nltr')
} catch (e) {
print 'Unstash failed, results from NLT stage will not be included'
}
sh label: 'Fault injection testing using NLT',
script: './ci/docker_nlt.sh --class-name fault-injection fi'
// Collect the Valgrind memcheck reports matched by the glob.
List filesList = []
filesList.addAll(findFiles(glob: '*.memcheck.xml'))
// vgfail becomes 1 when any <error> element is found; vgerr is never
// modified below, so the report always carries zero errors.
int vgfail = 0
int vgerr = 0
// The JUnit report is only generated when at least one memcheck file exists.
if (filesList) {
// '|| true' keeps the step from failing when grep matches nothing (grep
// then exits non-zero); in that case the captured stdout is empty.
String rcs = sh label: 'Check for Valgrind errors',
script: "grep -E '<error( |>)' ${filesList.join(' ')} || true",
returnStdout: true
if (rcs) {
vgfail = 1
}
// The suite name is also used as the prefix of the results file name.
String suite = sanitizedStageName()
junitSimpleReport suite: suite,
file: suite + '_valgrind_results.xml',
fails: vgfail,
errors: vgerr,
name: 'Valgrind_Memcheck',
class: 'Valgrind',
message: 'Valgrind Memcheck error detected',
testdata: rcs
}
int runTime = durationSeconds(startDate)
Map runData = ['nlttest_time': runTime]
return runData
}

// For master, this is just some wildly high number
String next_version() {
return '1000'
Expand Down Expand Up @@ -391,8 +356,11 @@ pipeline {
defaultValue: 'ci_vm9',
description: 'Label to use for 9 VM functional tests')
string(name: 'CI_NLT_1_LABEL',
defaultValue: 'ci_nlt_1',
defaultValue: 'ci_nlt_vm1',
description: 'Label to use for NLT tests')
string(name: 'CI_FI_1_LABEL',
defaultValue: 'ci_fi_vm1',
description: 'Label to use for Fault Injection (FI) tests')
string(name: 'FUNCTIONAL_HARDWARE_MEDIUM_LABEL',
defaultValue: 'ci_nvme5',
description: 'Label to use for the Functional Hardware Medium (MD on SSD) stages')
Expand Down Expand Up @@ -745,11 +713,18 @@ pipeline {
job_step_update(
unitTest(timeout_time: 60,
inst_repos: daosRepos(),
test_script: 'ci/unit/test_nlt.sh',
test_script: 'ci/unit/test_nlt.sh' +
' --system-ram-reserved 4' +
' --max-log-size 1950MiB' +
' --dfuse-dir /localhome/jenkins/' +
' --log-usage-save nltir.xml' +
' --log-usage-export nltr.json' +
' --class-name nlt all',
unstash_opt: true,
unstash_tests: false,
inst_rpms: unitPackages(target: 'el9'),
image_version: 'el9.7'))
image_version: 'el9.7',
prov_env_vars: 'VM_CPUS=14'))
// recordCoverage(tools: [[parser: 'COBERTURA', pattern:'nltir.xml']],
// skipPublishingChecks: true,
// id: 'tlc', name: 'Fault Injection Interim Report')
Expand Down Expand Up @@ -978,63 +953,43 @@ pipeline {
}
} // post
} // stage('Functional on Ubuntu 20.04')
stage('Fault injection testing') {
stage('NLT Fault injection testing') {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI this will require an update to merge requirements

Copy link
Copy Markdown
Contributor Author

@grom72 grom72 May 14, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Restored the original name in a82e340.

when {
beforeAgent true
expression { !skipStage() }
}
agent {
dockerfile {
filename 'utils/docker/Dockerfile.el.9'
label 'docker_runner_fi'
additionalBuildArgs dockerBuildArgs(repo_type: 'stable',
parallel_build: true,
deps_build: true) +
' --build-arg POINT_RELEASE=.7 '
args '--tmpfs /mnt/daos_0'
}
label params.CI_FI_1_LABEL
}
steps {
job_step_update(
sconsBuild(parallel_build: true,
scons_args: 'PREFIX=/opt/daos TARGET_TYPE=release BUILD_TYPE=debug',
build_deps: 'no'))
job_step_update(nlt_test())
// recordCoverage(tools: [[parser: 'COBERTURA', pattern:'nltr.xml']],
// skipPublishingChecks: true,
// id: 'fir', name: 'Fault Injection Report')
unitTest(timeout_time: 240,
inst_repos: daosRepos(),
test_script: 'ci/unit/test_nlt.sh --memcheck no' +
' --system-ram-reserved 4 --server-debug WARN' +
' --log-usage-import nltr.json' +
' --log-usage-save nltr.xml' +
' --class-name fault-injection fi',
unstash_opt: true,
unstash_tests: false,
inst_rpms: unitPackages(target: 'el9') + ' daos-client-tests',
image_version: 'el9.7',
prov_env_vars: 'VM_CPUS=14'))
}
post {
always {
unitTestPost artifacts: ['nlt_logs/'],
testResults: 'nlt-junit.xml',
always_script: 'ci/unit/test_nlt_post.sh'
discoverGitReferenceBuild referenceJob: 'daos-stack/daos/master',
scm: 'daos-stack/daos',
requiredResult: hudson.model.Result.UNSTABLE
recordIssues enabledForFailure: true,
/* ignore warning/errors from PMDK logging system */
filters: [excludeFile('pmdk/.+')],
failOnError: false,
ignoreQualityGate: true,
qualityGates: [[threshold: 1, type: 'TOTAL_ERROR'],
[threshold: 1, type: 'TOTAL_HIGH'],
[threshold: 1, type: 'NEW_NORMAL', unstable: true],
[threshold: 1, type: 'NEW_LOW', unstable: true]],
tools: [issues(pattern: 'nlt-errors.json',
name: 'Fault injection issues',
id: 'Fault_Injection'),
issues(pattern: 'nlt-client-leaks.json',
name: 'Fault injection leaks',
id: 'NLT_client')],
scm: 'daos-stack/daos'
junit testResults: 'nlt-junit.xml'
stash name: 'fault-inject-valgrind',
includes: '*.memcheck.xml',
allowEmpty: true
archiveArtifacts artifacts: 'nlt_logs/fault-injection/',
allowEmptyArchive: true
job_status_update()
}
}
} // stage('Fault injection testing')
} // stage('NLT Fault injection testing')
stage('Test RPMs on EL 9.6') {
when {
beforeAgent true
Expand Down Expand Up @@ -1255,8 +1210,7 @@ pipeline {
post {
always {
valgrindReportPublish valgrind_stashes: ['nlt-memcheck',
'unit-memcheck',
'fault-inject-valgrind']
'unit-memcheck']
job_status_update('final_status')
jobStatusWrite(job_status_internal)
}
Expand Down
42 changes: 0 additions & 42 deletions ci/docker_nlt.sh

This file was deleted.

8 changes: 4 additions & 4 deletions ci/unit/test_nlt.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ mydir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
# Copy over the install tree and some of the build tree.
rsync -rlpt -z -e "ssh $SSH_KEY_ARGS" .build_vars* opt-daos.tar utils requirements-utest.txt jenkins@"$NODE":build/

# shellcheck disable=SC2029
ssh -tt "$SSH_KEY_ARGS" jenkins@"$NODE" "DAOS_HTTPS_PROXY=\"${DAOS_HTTPS_PROXY:-}\" \
DAOS_NO_PROXY=\"${DAOS_NO_PROXY:-}\" \
$(cat "$mydir/test_nlt_node.sh")"
ssh -T "$SSH_KEY_ARGS" jenkins@"$NODE" \
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not familiar with -tt vs -T. What's the impact here?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No impact on the end result. -T is more common and is the suggested approach for CI/CD scripting.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

-tt will force a pseudo-terminal whereas -T will disable it. So I would think -T is preferred as long as we don't run anything that requires a pseudo-terminal, which I imagine would fail in an obvious way.

"DAOS_HTTPS_PROXY=\"${DAOS_HTTPS_PROXY:-}\" \
DAOS_NO_PROXY=\"${DAOS_NO_PROXY:-}\" \
bash -s -- $*" < "$mydir/test_nlt_node.sh"
11 changes: 7 additions & 4 deletions ci/unit/test_nlt_node.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
set -uex

sudo bash -c 'echo 1 > /proc/sys/kernel/sysrq'
sudo mkdir -p /mnt/daos
# Using mmap()'ed ULT stacks requires bumping the system default
if [ "$(sudo sysctl -n vm.max_map_count)" -lt "1000000" ] ; then
sudo sysctl vm.max_map_count=1000000
Expand Down Expand Up @@ -45,7 +44,11 @@ pip install /opt/daos/lib/daos/python/
sudo prlimit --nofile=1024:262144 --pid $$
prlimit -n

HTTPS_PROXY="${DAOS_HTTPS_PROXY:-}" \
mkdir -p nlt_logs
sudo mount -t tmpfs -o size=4g tmpfs nlt_logs
sudo chown jenkins:jenkins nlt_logs

TMPDIR="$(pwd)/nlt_logs" \
HTTPS_PROXY="${DAOS_HTTPS_PROXY:-}" \
NO_PROXY="${DAOS_NO_PROXY:-}" \
./utils/node_local_test.py --max-log-size 1950MiB \
--dfuse-dir /localhome/jenkins/ --log-usage-save nltir.xml --log-usage-export nltr.json all
exec ./utils/node_local_test.py "$@"
8 changes: 6 additions & 2 deletions ci/unit/test_nlt_post.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,18 @@ mkdir nlt_logs
# Copy any log files. Use rsync filters here to allow us to specify
# all files we want to copy, as it's much more flexible than using
# standard wildcards.
rsync -v -dprt -e "ssh $SSH_KEY_ARGS" jenkins@"$NODE":/tmp/ \

# Assuming that node_local_test.py is run with --class-name,
# the logs will be in build/nlt_logs/ on the node.
rsync -v -rlpt -e "ssh $SSH_KEY_ARGS" jenkins@"$NODE":build/nlt_logs/ \
--filter="include dnt*.log" --filter="include dnt*.log.bz2" \
--filter="include dnt_fi_*_logs" \
--filter="include dnt_fi_*_logs" --filter="include */" \
--filter="exclude *" nlt_logs/

rsync -v -dpt -z -e "ssh $SSH_KEY_ARGS" jenkins@"$NODE":build/ \
--filter="include nlt*.json" --filter="include dnt*.xml" \
--filter="include nltir.xml" --filter="include nltr.json" \
--filter="include nlt-junit.xml" --filter="exclude *" ./

mkdir -p vm_test
mv nlt-errors.json vm_test/
5 changes: 0 additions & 5 deletions src/tests/ftest/cart/util/cart_logtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,6 @@ def __init__(self, log_iter, quiet=False):
self.fi_triggered = False
self.fi_location = None
self.skip_suffixes = []
self.skip_substrings = []
self._tracers = []
self.ftest_mode = False

Expand Down Expand Up @@ -444,10 +443,6 @@ def _check_pid_from_log_file(self, pid, abort_on_warning, leak_wf, show_memleaks
show = False
if show and any(map(line.get_msg().endswith, self.skip_suffixes)):
show = False
if show:
line_msg = line.get_msg().casefold()
if any(sub in line_msg for sub in self.skip_substrings):
show = False
if show:
# Allow WARNING or ERROR messages, but anything higher like assert should
# trigger a failure.
Expand Down
1 change: 0 additions & 1 deletion utils/nlt_server.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ engines:
- DAOS_MD_CAP=1024
- DAOS_STRICT_SHUTDOWN=1
- DAOS_TARGET_OVERSUBSCRIBE=1
- ABT_STACK_OVERFLOW_CHECK=mprotect
storage:
-
class: ram
Expand Down
15 changes: 5 additions & 10 deletions utils/node_local_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5025,13 +5025,6 @@ def sizeof_fmt(num, suffix='B'):
if ignore_busy:
lto.skip_suffixes.append(" DER_BUSY(-1012): 'Device or resource busy'")

lto.skip_substrings.extend([
'sluggish ec boundary report from rank',
'sluggish stable epoch reporting',
'progress callback was not called for too long',
'rpc failed; rc:',
])

try:
lto.check_log_file(abort_on_warning=True,
show_memleaks=show_memleaks,
Expand Down Expand Up @@ -5924,7 +5917,7 @@ def _prep(self):
# pylint: disable-next=no-member
num_cores = len(os.sched_getaffinity(0))

if num_cores < 20:
if num_cores < 14:
max_child = 1
else:
max_child = int(num_cores / 4 * 3)
Expand Down Expand Up @@ -6751,12 +6744,14 @@ def run(wf, args):
run_fi = False

if args.perf_check or fi_test or fi_test_dfuse:
fs = subprocess.run([os.path.join(conf['PREFIX'], 'bin', 'fault_status')], check=False)
fi_env = os.environ.copy()
fi_env['PATH'] = f'{conf["PREFIX"]}/bin:{fi_env["PATH"]}'
fs = subprocess.run(['fault_status'], check=False, env=fi_env)
Comment thread
daltonbohning marked this conversation as resolved.
print(fs)
if fs.returncode == 0:
run_fi = True
else:
print("Unable to detect fault injection feature, skipping testing")
print("Unable to detect fault injection feature - skipping FI testing")

if run_fi:
args.server_debug = 'INFO'
Expand Down
Loading