perf: Optimize PQ distance look up table calculation for subvector sizes 4,8 & 16 on AVX-512

Raghuveer Devulapalli · Raghuveer Devulapalli · commit 70cd2fbc1378 · 2026-04-07T03:31:37.000Z
Add SIMD fast paths in calculate_partial_sums_dot_f32_512 and
calculate_partial_sums_euclidean_f32_512 for the two most common PQ
subvector sizes:

- size == 4: broadcast a 128-bit query fragment across all four 128-bit
  lanes of a ZMM register, load four consecutive centroids at once, and
  reduce each lane independently using two shuffle+add pairs. Produces 4
  partial sums per loop iteration instead of 1.

- size == 8: broadcast a 256-bit query fragment across both 256-bit
  halves of a ZMM register, load two consecutive centroids at once, and
  reduce across 128-bit lanes followed by within-lane shuffles. Produces
  2 partial sums per loop iteration instead of 1.

- size == 16: query and the centroid fit into a ZMM register, load
  the query into zmm and then loop over the centroids. Produces one
  partial sum per loop iteration, but prevents having to load the
  query multiple times.

Both paths fall back to the default way of computing dot_product_f32
/ euclidean_f32 in a loop for any tail elements or unsupported
sizes.
diff --git a/jvector-native/src/main/c/jvector_simd.c b/jvector-native/src/main/c/jvector_simd.c
@@ -261,15 +261,158 @@ JV_INLINE float euclidean_f32(const float* a, int aoffset, const float* b, int b
 
 JV_INLINE void calculate_partial_sums_dot_f32_512(const float* codebook, int codebookIndex, int size, int clusterCount, const float* query, int queryOffset, float* partialSums) {
     int codebookBase = codebookIndex * clusterCount;
-    for (int i = 0; i < clusterCount; i++) {
-      partialSums[codebookBase + i] = dot_product_f32(codebook, i * size, query, queryOffset, size);
+    float tempdat[16];
+    if (size == 4) {
+        int i = 0;
+        // use a zmm register to calculate 4 partial sums in parallel:
+        __m128 q = _mm_loadu_ps(query + queryOffset);
+        __m512 qq = _mm512_broadcast_f32x4(q); // broadcast 128-bit query to all 4 lanes
+        for (; i + 4 <= clusterCount; i += 4) {
+            // load four consecutive centroids from the codebook into zmm
+            __m512 c = _mm512_loadu_ps(codebook + i * size);
+            __m512 sum = _mm512_fmadd_ps(c, qq, _mm512_setzero_ps());
+            // horizontal reduce: within each 128-bit lane independently
+            // Step 1: swap neighboring elements within 128-bit lanes
+            __m512 temp = _mm512_shuffle_ps(sum, sum, _MM_SHUFFLE(2, 3, 0, 1));
+            sum = _mm512_add_ps(sum, temp);
+            // Step 2: swap 32-bit pairs within 128-bit lanes
+            temp = _mm512_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2));
+            sum = _mm512_add_ps(sum, temp);
+            // extract results from position 0 of each 128-bit lane
+            _mm512_storeu_ps(tempdat, sum);
+            partialSums[codebookBase + i]     = tempdat[0];
+            partialSums[codebookBase + i + 1] = tempdat[4];
+            partialSums[codebookBase + i + 2] = tempdat[8];
+            partialSums[codebookBase + i + 3] = tempdat[12];
+        }
+        for (; i < clusterCount; i++) {
+          partialSums[codebookBase + i] = dot_product_f32(codebook, i * size, query, queryOffset, size);
+        }
+    }
+    else if (size == 8) {
+        int i = 0;
+        // use a zmm register to calculate 2 partial sums in parallel:
+        __m256 q = _mm256_loadu_ps(query + queryOffset);
+        __m512 qq = _mm512_broadcast_f32x8(q); // 8 cycles, but have to do it just once outside the loop
+        for (; i + 2 <= clusterCount; i += 2) {
+            // load two consecutive centroids from the codebook into zmm
+            __m512 c1 = _mm512_loadu_ps(codebook + i * size);
+            __m512 sum = _mm512_fmadd_ps(c1, qq, _mm512_setzero_ps());
+            // horizontal reduce: per 256 bit lanes
+            // Step 1: swap neighbouring 128 bits and add to sum across lanes
+            __m512 temp = _mm512_shuffle_f32x4(sum, sum, _MM_SHUFFLE(2, 3, 0, 1)); // swap 128-bit lanes
+            sum = _mm512_add_ps(sum, temp);
+            // Step 2: Shuffle and add to sum within lanes
+            temp = _mm512_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2));
+            sum = _mm512_add_ps(sum, temp);
+            // step 3: shuffle neighboring lanes:
+            temp = _mm512_shuffle_ps(sum, sum, _MM_SHUFFLE(2, 3, 0, 1));
+            sum = _mm512_add_ps(sum, temp);
+            // extract results: may be there is a better way?
+            // Store is cheap and loading them should happen from the store buffers, so this may be faster than shuffling and extracting:
+            // Although its tempting, avoid using vcompress (a high latency instruction)
+            //_mm512_mask_compressstoreu_ps(ans, 0x8080, sum);
+            _mm512_storeu_ps(tempdat, sum);
+            partialSums[codebookBase + i] = tempdat[0];
+            partialSums[codebookBase + i + 1] = tempdat[8];
+        }
+        for (; i < clusterCount; i++) {
+          partialSums[codebookBase + i] = dot_product_f32(codebook, i * size, query, queryOffset, size);
+        }
+    }
+    else if (size == 16) {
+        int i = 0;
+        __m512 q = _mm512_loadu_ps(query + queryOffset);
+        for (; i < clusterCount; i += 1) {
+            __m512 c1 = _mm512_loadu_ps(codebook + i * size);
+            __m512 sum = _mm512_fmadd_ps(c1, c1, _mm512_setzero_ps());
+            partialSums[codebookBase + i] = _mm512_reduce_add_ps(sum);
+        }
+    }
+    else {
+        for (int i = 0; i < clusterCount; i++) {
+          partialSums[codebookBase + i] = dot_product_f32(codebook, i * size, query, queryOffset, size);
+        }
     }
 }
 
 JV_INLINE void calculate_partial_sums_euclidean_f32_512(const float* codebook, int codebookIndex, int size, int clusterCount, const float* query, int queryOffset, float* partialSums) {
     int codebookBase = codebookIndex * clusterCount;
-    for (int i = 0; i < clusterCount; i++) {
-      partialSums[codebookBase + i] = euclidean_f32(codebook, i * size, query, queryOffset, size);
+    float tempdat[16];
+    if (size == 4) {
+        int i = 0;
+        // use a zmm register to calculate 4 partial sums in parallel:
+        __m128 q = _mm_loadu_ps(query + queryOffset);
+        __m512 qq = _mm512_broadcast_f32x4(q); // broadcast 128-bit query to all 4 lanes
+        for (; i + 4 <= clusterCount; i += 4) {
+            // load four consecutive centroids from the codebook into zmm
+            __m512 c = _mm512_loadu_ps(codebook + i * size);
+            __m512 diff = _mm512_sub_ps(c, qq);
+            __m512 sum = _mm512_fmadd_ps(diff, diff, _mm512_setzero_ps());
+            // horizontal reduce: within each 128-bit lane independently
+            // Step 1: swap neighboring elements within 128-bit lanes
+            __m512 temp = _mm512_shuffle_ps(sum, sum, _MM_SHUFFLE(2, 3, 0, 1));
+            sum = _mm512_add_ps(sum, temp);
+            // Step 2: swap 32-bit pairs within 128-bit lanes
+            temp = _mm512_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2));
+            sum = _mm512_add_ps(sum, temp);
+            // extract results from position 0 of each 128-bit lane
+            _mm512_storeu_ps(tempdat, sum);
+            partialSums[codebookBase + i]     = tempdat[0];
+            partialSums[codebookBase + i + 1] = tempdat[4];
+            partialSums[codebookBase + i + 2] = tempdat[8];
+            partialSums[codebookBase + i + 3] = tempdat[12];
+        }
+        for (; i < clusterCount; i++) {
+          partialSums[codebookBase + i] = euclidean_f32(codebook, i * size, query, queryOffset, size);
+        }
+    }
+    else if (size == 8) {
+        int i = 0;
+        // use a zmm register to calculate 2 partial sums in parallel:
+        __m256 q = _mm256_loadu_ps(query + queryOffset);
+        __m512 qq = _mm512_broadcast_f32x8(q); // 8 cycles, but have to do it just once outside the loop
+        for (; i + 2 <= clusterCount; i += 2) {
+            // load two consecutive centroids from the codebook into zmm
+            __m512 c1 = _mm512_loadu_ps(codebook + i * size);
+            __m512 diff = _mm512_sub_ps(c1, qq);
+            __m512 sum = _mm512_fmadd_ps(diff, diff, _mm512_setzero_ps());
+            // horizontal reduce: per 256 bit lanes
+            // Step 1: swap neighbouring 128 bits and add to sum across lanes
+            __m512 temp = _mm512_shuffle_f32x4(sum, sum, _MM_SHUFFLE(2, 3, 0, 1)); // swap 128-bit lanes
+            sum = _mm512_add_ps(sum, temp);
+            // Step 2: Shuffle and add to sum within lanes
+            temp = _mm512_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 0, 3, 2));
+            sum = _mm512_add_ps(sum, temp);
+            // step 3: shuffle neighboring lanes:
+            temp = _mm512_shuffle_ps(sum, sum, _MM_SHUFFLE(2, 3, 0, 1));
+            sum = _mm512_add_ps(sum, temp);
+            // extract results: may be there is a better way?
+            // Store is cheap and loading them should happen from the store buffers, so this may be faster than shuffling and extracting:
+            // Although its tempting, avoid using vcompress (a high latency instruction)
+            //_mm512_mask_compressstoreu_ps(ans, 0x8080, sum);
+            _mm512_storeu_ps(tempdat, sum);
+            partialSums[codebookBase + i] = tempdat[0];
+            partialSums[codebookBase + i + 1] = tempdat[8];
+        }
+        for (; i < clusterCount; i++) {
+          partialSums[codebookBase + i] = euclidean_f32(codebook, i * size, query, queryOffset, size);
+        }
+    }
+    else if (size == 16) {
+        int i = 0;
+        __m512 q = _mm512_loadu_ps(query + queryOffset);
+        for (; i < clusterCount; i += 1) {
+            __m512 c1 = _mm512_loadu_ps(codebook + i * size);
+            __m512 diff = _mm512_sub_ps(c1, qq);
+            __m512 sum = _mm512_fmadd_ps(diff, diff, _mm512_setzero_ps());
+            partialSums[codebookBase + i] = _mm512_reduce_add_ps(sum);
+        }
+    }
+    else {
+        for (int i = 0; i < clusterCount; i++) {
+          partialSums[codebookBase + i] = euclidean_f32(codebook, i * size, query, queryOffset, size);
+        }
     }
 }