-
Notifications
You must be signed in to change notification settings - Fork 151
300 lines (269 loc) · 12.3 KB
/
run-bench.yml
File metadata and controls
300 lines (269 loc) · 12.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
name: Run Bench Main
# Triggers: manual dispatch (with optional dataset regex, branch list, and inline
# YAML config) or pull requests against main that touch Java sources or POMs.
on:
  workflow_dispatch:
    inputs:
      benchmark_config:
        description: 'Benchmark dataset regex (leave empty for all)'
        required: false
        default: ''
      branches:
        description: 'Space-separated list of branches to benchmark'
        required: false
        default: 'main'
      custom_config:
        description: 'Custom YAML configuration content (will override autoDefault.yml)'
        required: false
        type: string
        default: ''
  pull_request:
    types: [opened, synchronize, ready_for_review]
    branches:
      - main
    paths:
      - '**/src/main/java/**'
      - 'pom.xml'
      - '**/pom.xml'
jobs:
  # Job to generate the matrix configuration.
  # Builds a JSON matrix of {jdk, isa, branch}; the branch axis depends on the
  # triggering event (PR: main + head ref; dispatch: user-supplied list; else main).
  generate-matrix:
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - name: Generate matrix
        id: set-matrix
        run: |
          # Print event information for debugging
          echo "Event name: ${{ github.event_name }}"
          echo "Branches input: '${{ github.event.inputs.branches }}'"
          # Default branches based on event type
          if [[ "${{ github.event_name }}" == "pull_request" ]]; then
            echo "Pull request detected. Using main and PR branch: ${{ github.head_ref }}"
            BRANCHES='["main", "${{ github.head_ref }}"]'
          elif [[ "${{ github.event_name }}" == "workflow_dispatch" && -n "${{ github.event.inputs.branches }}" ]]; then
            # Parse space-separated branches input into JSON array
            echo "Workflow dispatch with branches input detected"
            BRANCHES_INPUT="${{ github.event.inputs.branches }}"
            BRANCHES="["
            for branch in $BRANCHES_INPUT; do
              if [[ "$BRANCHES" != "[" ]]; then
                BRANCHES="$BRANCHES, "
              fi
              BRANCHES="$BRANCHES\"$branch\""
              echo "Adding branch to matrix: $branch"
            done
            BRANCHES="$BRANCHES]"
          else
            echo "Default event type. Using main branch only"
            BRANCHES='["main"]'
          fi
          echo "Generated branches matrix: $BRANCHES"
          echo "matrix={\"jdk\":[24],\"isa\":[\"isa-avx512f\"],\"branch\":$BRANCHES}" >> $GITHUB_OUTPUT
test-avx512:
needs: generate-matrix
concurrency:
group: ${{ matrix.isa }}-${{ matrix.jdk }}-${{ matrix.branch }}
cancel-in-progress: false
strategy:
matrix: ${{ fromJSON(needs.generate-matrix.outputs.matrix) }}
runs-on: ${{ matrix.isa }}
steps:
- name: verify-avx512
run: |
# avx2 is included just for illustration
required="avx2 avx512f avx512cd avx512bw avx512dq avx512v"
printf "required ISA feature flags: %s\n" "${required}"
flags="$(lscpu|grep '^Flags'|cut -d: -f2)"
output=""
for flag in ${required} ; do
if [[ " $flags " == *"${flag}"* ]]
then output="${output} $flag(OK)"
else output="${output} $flag(FAIL)"
fi ; done
printf "%s\n" ${output}
if [[ " $output " == *"FAIL"* ]] ; then exit 2 ; fi
- name: Set up GCC
run: |
sudo apt install -y gcc
- uses: actions/checkout@v4
- name: Set up JDK ${{ matrix.jdk }}
uses: actions/setup-java@v3
with:
java-version: ${{ matrix.jdk }}
distribution: temurin
cache: maven
- name: Get version from pom.xml
id: get-version
run: |
VERSION=$(grep -o '<version>[^<]*</version>' pom.xml | head -1 | sed 's/<version>\(.*\)<\/version>/\1/')
if [[ "$VERSION" == *'${revision}'* ]]; then
REVISION=$(grep -o '<revision>[^<]*</revision>' pom.xml | head -1 | sed 's/<revision>\(.*\)<\/revision>/\1/')
if [ -n "$REVISION" ]; then
VERSION=${VERSION//\$\{revision\}/$REVISION}
fi
fi
echo "version=$VERSION" >> $GITHUB_OUTPUT
echo "Current branch has version $VERSION"
# Print debug information about the current job
- name: Print job information
run: |
echo "Running benchmark for:"
echo " - Branch: ${{ matrix.branch }}"
echo " - JDK: ${{ matrix.jdk }}"
echo " - ISA: ${{ matrix.isa }}"
# Checkout the branch specified in the matrix
- name: Checkout branch
uses: actions/checkout@v4
with:
ref: ${{ matrix.branch }}
fetch-depth: 0
# ==========================================
# Decode and write the protected dataset catalog
#
# TO UPDATE THIS SECRET:
# 1. On your local machine, run:
# base64 -i jvector-examples/yaml-configs/dataset-catalogs/protected-catalog.yaml
# 2. Go to GitHub Repo -> Settings -> Secrets and variables -> Actions
# 3. Update the PROTECTED_CATALOG_YAML secret with the new Base64 string.
# ==========================================
- name: Inject Protected Catalog
run: |
mkdir -p jvector-examples/yaml-configs/dataset-catalogs
echo "${{ secrets.PROTECTED_CATALOG_YAML }}" | base64 -d > jvector-examples/yaml-configs/dataset-catalogs/protected-catalog.yaml
# Create a directory to store benchmark results
- name: Create results directory
run: mkdir -p benchmark_results
# Build the branch
- name: Build branch
run: mvn -B -Punix-amd64-profile package --file pom.xml
# Run the benchmark if jvector-examples exists
- name: Run benchmark
id: run-benchmark
run: |
# Check if jvector-examples directory and AutoBenchYAML class exist
if [ ! -d "jvector-examples" ]; then
echo "Warning: jvector-examples directory not found in branch ${{ matrix.branch }}. Skipping benchmark."
exit 0
fi
# Check if the jar with dependencies was built
JAR_COUNT=$(ls jvector-examples/target/jvector-examples-*-jar-with-dependencies.jar 2>/dev/null | wc -l)
if [ "$JAR_COUNT" -eq 0 ]; then
echo "Warning: No jar with dependencies found in branch ${{ matrix.branch }}. Skipping benchmark."
exit 0
fi
# Determine available memory and set heap size to half of it
TOTAL_MEM_GB=$(free -g | awk '/^Mem:/ {print $2}')
# Ensure we have a valid number, default to 16GB total (8GB heap) if detection fails
if [[ -z "$TOTAL_MEM_GB" ]] || [[ "$TOTAL_MEM_GB" -le 0 ]]; then
echo "Warning: Could not detect memory size, defaulting to 16GB total memory (8GB heap)"
TOTAL_MEM_GB=16
fi
HALF_MEM_GB=$((TOTAL_MEM_GB / 2))
# Ensure minimum heap size of 1GB
if [[ "$HALF_MEM_GB" -lt 1 ]]; then
HALF_MEM_GB=1
fi
echo "Total memory: ${TOTAL_MEM_GB}GB, using ${HALF_MEM_GB}GB for Java heap"
# Run the benchmark
echo "Running benchmark for branch ${{ matrix.branch }}"
# Determine optional benchmark config argument from workflow input
BENCH_ARG="${{ github.event.inputs.benchmark_config }}"
if [[ -z "$BENCH_ARG" ]]; then
echo "No benchmark_config provided; running with default dataset selection."
BENCH_SUFFIX=""
else
echo "Using benchmark_config: '$BENCH_ARG'"
BENCH_SUFFIX=" $BENCH_ARG"
fi
# Handle custom configuration if provided
CUSTOM_CONFIG="${{ github.event.inputs.custom_config }}"
CONFIG_ARG=""
if [[ -n "$CUSTOM_CONFIG" ]]; then
echo "Custom configuration provided, creating temporary config file..."
CUSTOM_CONFIG_FILE="custom-benchmark-config.yml"
echo "$CUSTOM_CONFIG" > "$CUSTOM_CONFIG_FILE"
CONFIG_ARG="--config $CUSTOM_CONFIG_FILE"
echo "Using custom config: $CUSTOM_CONFIG_FILE"
else
echo "No custom configuration provided, using default autoDefault.yml"
fi
# Sanitize branch name for filenames: replace any non-alphanumeric, dash or underscore with underscore
SAFE_BRANCH=$(echo "${{ matrix.branch }}" | sed 's/[^A-Za-z0-9_-]/_/g')
echo "safe_branch=$SAFE_BRANCH" >> $GITHUB_OUTPUT
if [[ "${{ github.event_name }}" == "pull_request" ]]; then
java ${{ matrix.jdk >= 20 && '--enable-native-access=ALL-UNNAMED --add-modules=jdk.incubator.vector' || '' }} \
${{ matrix.jdk >= 22 && '-Djvector.experimental.enable_native_vectorization=true' || '' }} \
-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/heap_dump/ -Xmx${HALF_MEM_GB}g \
-cp jvector-examples/target/jvector-examples-*-jar-with-dependencies.jar io.github.jbellis.jvector.example.AutoBenchYAML --output ${SAFE_BRANCH}-bench-results ${CONFIG_ARG} dpr-1M
else
java ${{ matrix.jdk >= 20 && '--enable-native-access=ALL-UNNAMED --add-modules=jdk.incubator.vector' || '' }} \
${{ matrix.jdk >= 22 && '-Djvector.experimental.enable_native_vectorization=true' || '' }} \
-XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=/tmp/heap_dump/ -Xmx${HALF_MEM_GB}g \
-cp jvector-examples/target/jvector-examples-*-jar-with-dependencies.jar io.github.jbellis.jvector.example.AutoBenchYAML --output ${SAFE_BRANCH}-bench-results ${CONFIG_ARG}${BENCH_SUFFIX:+ }${BENCH_ARG}
fi
# Move the results to the benchmark_results directory
mv ${SAFE_BRANCH}-bench-results.csv benchmark_results/ || true
mv ${SAFE_BRANCH}-bench-results.json benchmark_results/ || true
echo "Completed benchmarks for branch: ${{ matrix.branch }}"
- name: Upload Individual Benchmark Results
uses: actions/upload-artifact@v4
with:
name: benchmark-results-${{ matrix.isa }}-jdk${{ matrix.jdk }}-${{ steps.run-benchmark.outputs.safe_branch }}
path: |
benchmark_results/*.csv
benchmark_results/*.json
if-no-files-found: warn
# Job to combine results and create visualizations
combine-results:
needs: test-avx512
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Download all benchmark results
uses: actions/download-artifact@v4
with:
pattern: benchmark-results-*
path: all-benchmark-results
merge-multiple: true
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.x'
- name: Install Python Dependencies
run: |
python -m pip install --upgrade pip
pip install matplotlib numpy psutil
- name: Generate visualization using visualize_benchmarks.py
run: |
# Discover all downloaded CSV benchmark result files
shopt -s globstar nullglob
echo "Listing downloaded artifact directory structure:"
ls -R all-benchmark-results || true
files=(all-benchmark-results/**/*.csv)
if [ ${#files[@]} -eq 0 ]; then
echo "No CSVs found under all-benchmark-results. Searching repo as fallback..."
files=(**/*.csv)
fi
echo "Found ${#files[@]} CSV files"
for f in "${files[@]}"; do echo " - $f"; done
# Check if any files were found
if [ ${#files[@]} -eq 0 ]; then
echo "No benchmark result files found. Skipping visualization generation."
echo "This can happen when benchmarks are skipped due to missing dependencies or other issues."
# Create empty output directory to satisfy artifact upload
mkdir -p benchmark_reports
echo "No benchmark results were available for visualization." > benchmark_reports/no_results.txt
exit 0
fi
# Ensure output directory matches the script's default/output expectation
OUTPUT_DIR="benchmark_reports"
# Run the visualization script with all files, default threshold (5.0)
python visualize_benchmarks.py --output-dir "$OUTPUT_DIR" "${files[@]}"
- name: Upload visualization artifacts
uses: actions/upload-artifact@v4
with:
name: benchmark-comparison-results
path: |
benchmark_reports/**
retention-days: 90