diff --git a/libvmaf/src/feature/feature_extractor.c b/libvmaf/src/feature/feature_extractor.c index ef17b901f..49a41892c 100644 --- a/libvmaf/src/feature/feature_extractor.c +++ b/libvmaf/src/feature/feature_extractor.c @@ -46,6 +46,7 @@ extern VmafFeatureExtractor vmaf_fex_psnr; extern VmafFeatureExtractor vmaf_fex_psnr_hvs; extern VmafFeatureExtractor vmaf_fex_integer_adm; extern VmafFeatureExtractor vmaf_fex_integer_motion; +extern VmafFeatureExtractor vmaf_fex_integer_motion_v2; extern VmafFeatureExtractor vmaf_fex_integer_vif; extern VmafFeatureExtractor vmaf_fex_cambi; #if HAVE_CUDA @@ -71,6 +72,7 @@ static VmafFeatureExtractor *feature_extractor_list[] = { &vmaf_fex_psnr_hvs, &vmaf_fex_integer_adm, &vmaf_fex_integer_motion, + &vmaf_fex_integer_motion_v2, &vmaf_fex_integer_vif, &vmaf_fex_cambi, #if HAVE_CUDA diff --git a/libvmaf/src/feature/feature_extractor.h b/libvmaf/src/feature/feature_extractor.h index 574436e76..6d8cb7f11 100644 --- a/libvmaf/src/feature/feature_extractor.h +++ b/libvmaf/src/feature/feature_extractor.h @@ -38,6 +38,7 @@ enum VmafFeatureExtractorFlags { VMAF_FEATURE_EXTRACTOR_TEMPORAL = 1 << 0, VMAF_FEATURE_EXTRACTOR_CUDA = 1 << 1, VMAF_FEATURE_FRAME_SYNC = 1 << 2, + VMAF_FEATURE_EXTRACTOR_PREV_REF = 1 << 3, }; typedef struct VmafFeatureExtractor { @@ -97,6 +98,7 @@ typedef struct VmafFeatureExtractor { #endif VmafFrameSyncContext *framesync; + VmafPicture prev_ref; ///< Previous reference picture, set by framework. } VmafFeatureExtractor; diff --git a/libvmaf/src/feature/integer_motion_v2.c b/libvmaf/src/feature/integer_motion_v2.c new file mode 100644 index 000000000..85c3b1772 --- /dev/null +++ b/libvmaf/src/feature/integer_motion_v2.c @@ -0,0 +1,291 @@ +/** + * + * Copyright 2016-2025 Netflix, Inc. + * + * Licensed under the BSD+Patent License (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSDplusPatent + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +// Pipelined motion feature extractor. +// +// Computes the same motion score as integer_motion.c but without storing +// blurred frames across extract calls. Instead, it exploits the linearity +// of convolution: SAD(blur[N-1], blur[N]) == sum(|blur(f[N-1] - f[N])|). +// +// The frame difference, blur, and absolute-sum are fused into a single +// row-at-a-time pipeline, requiring only one row of scratch memory. +// +// The framework provides the previous reference frame via fex->prev_ref, +// making each extract call stateless with respect to pixel data. 
+ +#include +#include +#include + +#include "cpu.h" +#include "feature_collector.h" +#include "feature_extractor.h" +#include "integer_motion.h" + +#if ARCH_X86 +#include "x86/motion_v2_avx2.h" +#if HAVE_AVX512 +#include "x86/motion_v2_avx512.h" +#endif +#endif + +typedef uint64_t (*motion_pipeline_fn)(const uint8_t *, ptrdiff_t, + const uint8_t *, ptrdiff_t, + int32_t *, unsigned, unsigned, + unsigned bpc); + +typedef struct MotionV2State { + int32_t *y_row; + unsigned w, h, bpc; + motion_pipeline_fn pipeline; +} MotionV2State; + +static inline int mirror(int idx, int size) +{ + if (idx < 0) return -idx; + if (idx >= size) return 2 * size - idx - 1; + return idx; +} + +static uint64_t +motion_score_pipeline_8(const uint8_t *prev, ptrdiff_t prev_stride, + const uint8_t *cur, ptrdiff_t cur_stride, + int32_t *y_row, unsigned w, unsigned h, + unsigned bpc) +{ + (void)bpc; + const int radius = filter_width / 2; + const int32_t y_round = 1 << 7; + const int32_t x_round = 1 << 15; + + uint64_t sad = 0; + + for (unsigned i = 0; i < h; i++) { + // Fused diff + y_conv for row i (shift by 8, matching v1 precision) + int32_t any_nonzero = 0; + for (unsigned j = 0; j < w; j++) { + int32_t accum = 0; + for (int k = 0; k < filter_width; k++) { + const int row = mirror((int)i - radius + k, (int)h); + int32_t diff = prev[row * prev_stride + j] + - cur[row * cur_stride + j]; + accum += (int32_t)filter[k] * diff; + } + y_row[j] = (accum + y_round) >> 8; + any_nonzero |= y_row[j]; + } + + if (!any_nonzero) continue; + + // x_conv + abs + accumulate for row i + uint32_t row_sad = 0; + for (unsigned j = 0; j < w; j++) { + int64_t accum = 0; + for (int k = 0; k < filter_width; k++) { + const int col = mirror((int)j - radius + k, (int)w); + accum += (int64_t)filter[k] * y_row[col]; + } + int32_t val = (int32_t)((accum + x_round) >> 16); + row_sad += abs(val); + } + sad += row_sad; + } + + return sad; +} + +static inline uint64_t +motion_score_pipeline_16(const uint8_t *prev_u8, ptrdiff_t 
prev_stride, + const uint8_t *cur_u8, ptrdiff_t cur_stride, + int32_t *y_row, unsigned w, unsigned h, + unsigned bpc) +{ + const uint16_t *prev = (const uint16_t *)prev_u8; + const uint16_t *cur = (const uint16_t *)cur_u8; + const ptrdiff_t p_stride = prev_stride / 2; + const ptrdiff_t c_stride = cur_stride / 2; + + const int radius = filter_width / 2; + const int32_t y_round = 1 << (bpc - 1); + const int32_t x_round = 1 << 15; + + uint64_t sad = 0; + + for (unsigned i = 0; i < h; i++) { + // Fused diff + y_conv for row i + int32_t any_nonzero = 0; + for (unsigned j = 0; j < w; j++) { + int64_t accum = 0; + for (int k = 0; k < filter_width; k++) { + const int row = mirror((int)i - radius + k, (int)h); + int32_t diff = prev[row * p_stride + j] + - cur[row * c_stride + j]; + accum += (int64_t)filter[k] * diff; + } + y_row[j] = (int32_t)((accum + y_round) >> bpc); + any_nonzero |= y_row[j]; + } + + if (!any_nonzero) continue; + + // x_conv + abs + accumulate for row i + uint32_t row_sad = 0; + for (unsigned j = 0; j < w; j++) { + int64_t accum = 0; + for (int k = 0; k < filter_width; k++) { + const int col = mirror((int)j - radius + k, (int)w); + accum += (int64_t)filter[k] * y_row[col]; + } + int32_t val = (int32_t)((accum + x_round) >> 16); + row_sad += abs(val); + } + sad += row_sad; + } + + return sad; +} + +static int init(VmafFeatureExtractor *fex, enum VmafPixelFormat pix_fmt, + unsigned bpc, unsigned w, unsigned h) +{ + (void) pix_fmt; + MotionV2State *s = fex->priv; + + s->w = w; + s->h = h; + s->bpc = bpc; + + s->y_row = malloc(sizeof(*s->y_row) * w); + if (!s->y_row) return -ENOMEM; + + if (bpc == 8) + s->pipeline = motion_score_pipeline_8; + else + s->pipeline = motion_score_pipeline_16; + +#if ARCH_X86 + if (vmaf_get_cpu_flags() & VMAF_X86_CPU_FLAG_AVX2) { + if (bpc == 8) + s->pipeline = motion_score_pipeline_8_avx2; + else + s->pipeline = motion_score_pipeline_16_avx2; + } +#if HAVE_AVX512 + if (vmaf_get_cpu_flags() & VMAF_X86_CPU_FLAG_AVX512) { + if 
(bpc == 8) + s->pipeline = motion_score_pipeline_8_avx512; + else + s->pipeline = motion_score_pipeline_16_avx512; + } +#endif +#endif + + return 0; +} + +static int extract(VmafFeatureExtractor *fex, + VmafPicture *ref_pic, VmafPicture *ref_pic_90, + VmafPicture *dist_pic, VmafPicture *dist_pic_90, + unsigned index, VmafFeatureCollector *feature_collector) +{ + MotionV2State *s = fex->priv; + + (void) dist_pic; + (void) ref_pic_90; + (void) dist_pic_90; + + if (index == 0) { + return vmaf_feature_collector_append(feature_collector, + "VMAF_integer_feature_motion_v2_sad_score", 0., index); + } + + if (!fex->prev_ref.ref) + return -EINVAL; + + const unsigned w = s->w; + const unsigned h = s->h; + const uint8_t *prev_data = (const uint8_t *)fex->prev_ref.data[0]; + const uint8_t *cur_data = (const uint8_t *)ref_pic->data[0]; + + uint64_t sad = s->pipeline(prev_data, fex->prev_ref.stride[0], + cur_data, ref_pic->stride[0], + s->y_row, w, h, s->bpc); + + double score = (double)sad / 256. / (w * h); + + return vmaf_feature_collector_append(feature_collector, + "VMAF_integer_feature_motion_v2_sad_score", score, index); +} + +static int close_fex(VmafFeatureExtractor *fex) +{ + MotionV2State *s = fex->priv; + free(s->y_row); + return 0; +} + +static int flush(VmafFeatureExtractor *fex, + VmafFeatureCollector *feature_collector) +{ + (void) fex; + + unsigned n_frames = 0; + double dummy; + while (!vmaf_feature_collector_get_score(feature_collector, + "VMAF_integer_feature_motion_v2_sad_score", &dummy, n_frames)) + n_frames++; + + if (n_frames < 2) return 1; + + for (unsigned i = 0; i < n_frames; i++) { + double score_cur, score_next; + vmaf_feature_collector_get_score(feature_collector, + "VMAF_integer_feature_motion_v2_sad_score", &score_cur, i); + + double motion2; + if (i + 1 < n_frames) { + vmaf_feature_collector_get_score(feature_collector, + "VMAF_integer_feature_motion_v2_sad_score", &score_next, i + 1); + motion2 = score_cur < score_next ? 
score_cur : score_next; + } else { + motion2 = score_cur; + } + + vmaf_feature_collector_append(feature_collector, + "VMAF_integer_feature_motion2_v2_score", motion2, i); + } + + return 1; +} + +static const char *provided_features[] = { + "VMAF_integer_feature_motion_v2_sad_score", + "VMAF_integer_feature_motion2_v2_score", + NULL +}; + +VmafFeatureExtractor vmaf_fex_integer_motion_v2 = { + .name = "motion_v2", + .init = init, + .extract = extract, + .flush = flush, + .close = close_fex, + .priv_size = sizeof(MotionV2State), + .provided_features = provided_features, + .flags = VMAF_FEATURE_EXTRACTOR_PREV_REF, +}; diff --git a/libvmaf/src/feature/x86/motion_v2_avx2.c b/libvmaf/src/feature/x86/motion_v2_avx2.c new file mode 100644 index 000000000..77b5e5c17 --- /dev/null +++ b/libvmaf/src/feature/x86/motion_v2_avx2.c @@ -0,0 +1,335 @@ +/** + * + * Copyright 2016-2025 Netflix, Inc. + * + * Licensed under the BSD+Patent License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSDplusPatent + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#include +#include +#include +#include + +#include "feature/integer_motion.h" + +static inline int mirror(int idx, int size) +{ + if (idx < 0) return -idx; + if (idx >= size) return 2 * size - idx - 1; + return idx; +} + +// Emulate arithmetic right shift of int64 by 16 in AVX2. +// AVX2 lacks srai_epi64; this uses the blend trick: +// low dwords come from logical shift, high dwords from arithmetic shift. 
+static inline __m256i srai_epi64_16(__m256i v) +{ + __m256i lo = _mm256_srli_epi64(v, 16); + __m256i hi = _mm256_srai_epi32(v, 16); + return _mm256_blend_epi32(lo, hi, 0xAA); +} + +// SIMD phase 2: x_conv + abs + SAD for one row of int32 y_row. +// Processes 8 int32 columns at a time via mullo_epi32 + int64 accumulation. +static inline uint32_t +x_conv_row_sad_avx2(const int32_t *y_row, unsigned w) +{ + const __m256i g0 = _mm256_set1_epi32(3571); + const __m256i g1 = _mm256_set1_epi32(16004); + const __m256i g2 = _mm256_set1_epi32(26386); + const __m256i round64 = _mm256_set1_epi64x(1 << 15); + const __m256i perm_idx = _mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0); + + uint32_t row_sad = 0; + + // Scalar left edge (columns 0, 1) — mirror boundary + unsigned j; + for (j = 0; j < 2 && j < w; j++) { + int64_t accum = 0; + for (int k = 0; k < 5; k++) { + int col = mirror((int)j - 2 + k, (int)w); + accum += (int64_t)filter[k] * y_row[col]; + } + int32_t val = (int32_t)((accum + (1 << 15)) >> 16); + row_sad += abs(val); + } + + // SIMD middle: need y_row[j-2]..y_row[j+9], so j+10 <= w + __m256i sad_acc = _mm256_setzero_si256(); + for (; j + 10 <= w; j += 8) { + __m256i y0 = _mm256_loadu_si256((__m256i*)(y_row + j - 2)); + __m256i y1 = _mm256_loadu_si256((__m256i*)(y_row + j - 1)); + __m256i y2 = _mm256_loadu_si256((__m256i*)(y_row + j)); + __m256i y3 = _mm256_loadu_si256((__m256i*)(y_row + j + 1)); + __m256i y4 = _mm256_loadu_si256((__m256i*)(y_row + j + 2)); + + // Each product fits in int32 + __m256i p0 = _mm256_mullo_epi32(y0, g0); + __m256i p1 = _mm256_mullo_epi32(y1, g1); + __m256i p2 = _mm256_mullo_epi32(y2, g2); + __m256i p3 = _mm256_mullo_epi32(y3, g1); + __m256i p4 = _mm256_mullo_epi32(y4, g0); + + // Safe pairs that fit in int32 + __m256i s04 = _mm256_add_epi32(p0, p4); + __m256i s13 = _mm256_add_epi32(p1, p3); + + // Widen to int64 and accumulate (lo 4 elements) + __m256i acc_lo = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(s04)); + acc_lo = 
_mm256_add_epi64(acc_lo, _mm256_cvtepi32_epi64(_mm256_castsi256_si128(s13))); + acc_lo = _mm256_add_epi64(acc_lo, _mm256_cvtepi32_epi64(_mm256_castsi256_si128(p2))); + + // hi 4 elements + __m256i acc_hi = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s04, 1)); + acc_hi = _mm256_add_epi64(acc_hi, _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s13, 1))); + acc_hi = _mm256_add_epi64(acc_hi, _mm256_cvtepi32_epi64(_mm256_extracti128_si256(p2, 1))); + + // Round and arithmetic right shift >>16 + acc_lo = srai_epi64_16(_mm256_add_epi64(acc_lo, round64)); + acc_hi = srai_epi64_16(_mm256_add_epi64(acc_hi, round64)); + + // Pack int64 -> int32 (gather low 32 bits of each 64-bit lane) + __m128i res_lo = _mm256_castsi256_si128( + _mm256_permutevar8x32_epi32(acc_lo, perm_idx)); + __m128i res_hi = _mm256_castsi256_si128( + _mm256_permutevar8x32_epi32(acc_hi, perm_idx)); + + // Combine into 8 x int32, abs, accumulate + __m256i result = _mm256_inserti128_si256( + _mm256_castsi128_si256(res_lo), res_hi, 1); + __m256i abs_result = _mm256_abs_epi32(result); + + sad_acc = _mm256_add_epi32(sad_acc, abs_result); + } + + // Horizontal reduction of sad_acc (8 x int32 -> scalar) + __m128i lo128 = _mm256_castsi256_si128(sad_acc); + __m128i hi128 = _mm256_extracti128_si256(sad_acc, 1); + __m128i sum128 = _mm_add_epi32(lo128, hi128); + sum128 = _mm_add_epi32(sum128, + _mm_shuffle_epi32(sum128, _MM_SHUFFLE(1, 0, 3, 2))); + sum128 = _mm_add_epi32(sum128, + _mm_shuffle_epi32(sum128, _MM_SHUFFLE(0, 1, 0, 1))); + row_sad += (uint32_t)_mm_cvtsi128_si32(sum128); + + // Scalar right edge + tail + for (; j < w; j++) { + int64_t accum = 0; + for (int k = 0; k < 5; k++) { + int col = mirror((int)j - 2 + k, (int)w); + accum += (int64_t)filter[k] * y_row[col]; + } + int32_t val = (int32_t)((accum + (1 << 15)) >> 16); + row_sad += abs(val); + } + + return row_sad; +} + +uint64_t motion_score_pipeline_16_avx2(const uint8_t *prev_u8, ptrdiff_t prev_stride, + const uint8_t *cur_u8, ptrdiff_t cur_stride, + 
int32_t *y_row, unsigned w, unsigned h, + unsigned bpc) +{ + const uint16_t *prev = (const uint16_t *)prev_u8; + const uint16_t *cur = (const uint16_t *)cur_u8; + const ptrdiff_t p_stride = prev_stride / 2; + const ptrdiff_t c_stride = cur_stride / 2; + + const __m256i g0 = _mm256_set1_epi32(3571); + const __m256i g1 = _mm256_set1_epi32(16004); + const __m256i g2 = _mm256_set1_epi32(26386); + const __m256i round64 = _mm256_set1_epi64x(1 << (bpc - 1)); + const __m256i bpc_vec = _mm256_set1_epi64x(bpc); + const __m256i perm_idx = _mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0); + + uint64_t sad = 0; + + for (unsigned i = 0; i < h; i++) { + const uint16_t *pp[5], *cp[5]; + for (int k = 0; k < 5; k++) { + int r = mirror((int)i - 2 + k, (int)h); + pp[k] = prev + r * p_stride; + cp[k] = cur + r * c_stride; + } + + // Phase 1: diff + y_conv -> y_row (8 pixels at a time, int64 accum) + unsigned j; + __m256i nz_acc = _mm256_setzero_si256(); + for (j = 0; j + 8 <= w; j += 8) { + __m256i d0 = _mm256_sub_epi32( + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(pp[0] + j))), + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(cp[0] + j)))); + __m256i d1 = _mm256_sub_epi32( + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(pp[1] + j))), + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(cp[1] + j)))); + __m256i d2 = _mm256_sub_epi32( + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(pp[2] + j))), + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(cp[2] + j)))); + __m256i d3 = _mm256_sub_epi32( + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(pp[3] + j))), + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(cp[3] + j)))); + __m256i d4 = _mm256_sub_epi32( + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(pp[4] + j))), + _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i*)(cp[4] + j)))); + + __m256i prod0 = _mm256_mullo_epi32(d0, g0); + __m256i prod1 = _mm256_mullo_epi32(d1, g1); + __m256i prod2 = _mm256_mullo_epi32(d2, g2); + __m256i prod3 = _mm256_mullo_epi32(d3, g1); + __m256i prod4 = 
_mm256_mullo_epi32(d4, g0); + + __m256i acc_lo = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(prod0)); + acc_lo = _mm256_add_epi64(acc_lo, _mm256_cvtepi32_epi64(_mm256_castsi256_si128(prod1))); + acc_lo = _mm256_add_epi64(acc_lo, _mm256_cvtepi32_epi64(_mm256_castsi256_si128(prod2))); + acc_lo = _mm256_add_epi64(acc_lo, _mm256_cvtepi32_epi64(_mm256_castsi256_si128(prod3))); + acc_lo = _mm256_add_epi64(acc_lo, _mm256_cvtepi32_epi64(_mm256_castsi256_si128(prod4))); + + __m256i acc_hi = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(prod0, 1)); + acc_hi = _mm256_add_epi64(acc_hi, _mm256_cvtepi32_epi64(_mm256_extracti128_si256(prod1, 1))); + acc_hi = _mm256_add_epi64(acc_hi, _mm256_cvtepi32_epi64(_mm256_extracti128_si256(prod2, 1))); + acc_hi = _mm256_add_epi64(acc_hi, _mm256_cvtepi32_epi64(_mm256_extracti128_si256(prod3, 1))); + acc_hi = _mm256_add_epi64(acc_hi, _mm256_cvtepi32_epi64(_mm256_extracti128_si256(prod4, 1))); + + acc_lo = _mm256_srlv_epi64(_mm256_add_epi64(acc_lo, round64), bpc_vec); + acc_hi = _mm256_srlv_epi64(_mm256_add_epi64(acc_hi, round64), bpc_vec); + + __m128i res_lo = _mm256_castsi256_si128( + _mm256_permutevar8x32_epi32(acc_lo, perm_idx)); + __m128i res_hi = _mm256_castsi256_si128( + _mm256_permutevar8x32_epi32(acc_hi, perm_idx)); + + __m256i result = _mm256_inserti128_si256( + _mm256_castsi128_si256(res_lo), res_hi, 1); + _mm256_storeu_si256((__m256i*)(y_row + j), result); + nz_acc = _mm256_or_si256(nz_acc, result); + } + + // Scalar tail for phase 1 + int32_t nz_tail = 0; + for (; j < w; j++) { + int64_t accum = 0; + for (int k = 0; k < 5; k++) { + int32_t diff = pp[k][j] - cp[k][j]; + accum += (int64_t)filter[k] * diff; + } + y_row[j] = (int32_t)((accum + (1 << (bpc - 1))) >> bpc); + nz_tail |= y_row[j]; + } + + if (_mm256_testz_si256(nz_acc, nz_acc) && !nz_tail) continue; + + // Phase 2: SIMD x_conv + abs + accumulate + sad += x_conv_row_sad_avx2(y_row, w); + } + + return sad; +} + +uint64_t motion_score_pipeline_8_avx2(const uint8_t *prev, 
ptrdiff_t prev_stride, + const uint8_t *cur, ptrdiff_t cur_stride, + int32_t *y_row, unsigned w, unsigned h, + unsigned bpc) +{ + (void)bpc; + const __m256i f0 = _mm256_set1_epi16(3571); + const __m256i f1 = _mm256_set1_epi16(16004); + const __m256i f2 = _mm256_set1_epi16(26386); + const __m256i round8 = _mm256_set1_epi32(1 << 7); + + uint64_t sad = 0; + + for (unsigned i = 0; i < h; i++) { + const uint8_t *p[5], *c[5]; + for (int k = 0; k < 5; k++) { + int r = mirror((int)i - 2 + k, (int)h); + p[k] = prev + r * prev_stride; + c[k] = cur + r * cur_stride; + } + + // Phase 1: diff + y_conv -> y_row (16 columns at a time, shift >>8) + unsigned j; + __m256i nz_acc = _mm256_setzero_si256(); + for (j = 0; j + 16 <= w; j += 16) { + __m256i d0 = _mm256_sub_epi16( + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(p[0] + j))), + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(c[0] + j)))); + __m256i d1 = _mm256_sub_epi16( + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(p[1] + j))), + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(c[1] + j)))); + __m256i d2 = _mm256_sub_epi16( + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(p[2] + j))), + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(c[2] + j)))); + __m256i d3 = _mm256_sub_epi16( + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(p[3] + j))), + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(c[3] + j)))); + __m256i d4 = _mm256_sub_epi16( + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(p[4] + j))), + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(c[4] + j)))); + + __m256i lo = _mm256_mullo_epi16(d0, f0); + __m256i hi = _mm256_mulhi_epi16(d0, f0); + __m256i acc_lo = _mm256_unpacklo_epi16(lo, hi); + __m256i acc_hi = _mm256_unpackhi_epi16(lo, hi); + + lo = _mm256_mullo_epi16(d1, f1); + hi = _mm256_mulhi_epi16(d1, f1); + acc_lo = _mm256_add_epi32(acc_lo, _mm256_unpacklo_epi16(lo, hi)); + acc_hi = _mm256_add_epi32(acc_hi, _mm256_unpackhi_epi16(lo, hi)); + + lo = _mm256_mullo_epi16(d2, f2); + hi = _mm256_mulhi_epi16(d2, 
f2); + acc_lo = _mm256_add_epi32(acc_lo, _mm256_unpacklo_epi16(lo, hi)); + acc_hi = _mm256_add_epi32(acc_hi, _mm256_unpackhi_epi16(lo, hi)); + + lo = _mm256_mullo_epi16(d3, f1); + hi = _mm256_mulhi_epi16(d3, f1); + acc_lo = _mm256_add_epi32(acc_lo, _mm256_unpacklo_epi16(lo, hi)); + acc_hi = _mm256_add_epi32(acc_hi, _mm256_unpackhi_epi16(lo, hi)); + + lo = _mm256_mullo_epi16(d4, f0); + hi = _mm256_mulhi_epi16(d4, f0); + acc_lo = _mm256_add_epi32(acc_lo, _mm256_unpacklo_epi16(lo, hi)); + acc_hi = _mm256_add_epi32(acc_hi, _mm256_unpackhi_epi16(lo, hi)); + + acc_lo = _mm256_srai_epi32(_mm256_add_epi32(acc_lo, round8), 8); + acc_hi = _mm256_srai_epi32(_mm256_add_epi32(acc_hi, round8), 8); + + __m256i cols_0_7 = _mm256_permute2x128_si256(acc_lo, acc_hi, 0x20); + __m256i cols_8_15 = _mm256_permute2x128_si256(acc_lo, acc_hi, 0x31); + _mm256_storeu_si256((__m256i*)(y_row + j), cols_0_7); + _mm256_storeu_si256((__m256i*)(y_row + j + 8), cols_8_15); + nz_acc = _mm256_or_si256(nz_acc, _mm256_or_si256(cols_0_7, cols_8_15)); + } + + // Scalar tail for phase 1 + int32_t nz_tail = 0; + for (; j < w; j++) { + int32_t accum = 0; + for (int k = 0; k < 5; k++) { + int32_t diff = p[k][j] - c[k][j]; + accum += (int32_t)filter[k] * diff; + } + y_row[j] = (accum + (1 << 7)) >> 8; + nz_tail |= y_row[j]; + } + + if (_mm256_testz_si256(nz_acc, nz_acc) && !nz_tail) continue; + + // Phase 2: SIMD x_conv + abs + accumulate + sad += x_conv_row_sad_avx2(y_row, w); + } + + return sad; +} diff --git a/libvmaf/src/feature/x86/motion_v2_avx2.h b/libvmaf/src/feature/x86/motion_v2_avx2.h new file mode 100644 index 000000000..333243af2 --- /dev/null +++ b/libvmaf/src/feature/x86/motion_v2_avx2.h @@ -0,0 +1,35 @@ +/** + * + * Copyright 2016-2025 Netflix, Inc. + * + * Licensed under the BSD+Patent License (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSDplusPatent + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +#ifndef X86_AVX2_MOTION_V2_H_ +#define X86_AVX2_MOTION_V2_H_ + +#include +#include + +uint64_t motion_score_pipeline_8_avx2(const uint8_t *prev, ptrdiff_t prev_stride, + const uint8_t *cur, ptrdiff_t cur_stride, + int32_t *y_row, unsigned w, unsigned h, + unsigned bpc); + +uint64_t motion_score_pipeline_16_avx2(const uint8_t *prev, ptrdiff_t prev_stride, + const uint8_t *cur, ptrdiff_t cur_stride, + int32_t *y_row, unsigned w, unsigned h, + unsigned bpc); + +#endif /* X86_AVX2_MOTION_V2_H_ */ diff --git a/libvmaf/src/feature/x86/motion_v2_avx512.c b/libvmaf/src/feature/x86/motion_v2_avx512.c new file mode 100644 index 000000000..218fc61eb --- /dev/null +++ b/libvmaf/src/feature/x86/motion_v2_avx512.c @@ -0,0 +1,314 @@ +/** + * + * Copyright 2016-2025 Netflix, Inc. + * + * Licensed under the BSD+Patent License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/BSDplusPatent + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + */ + +#include +#include +#include +#include + +#include "feature/integer_motion.h" + +static inline int mirror(int idx, int size) +{ + if (idx < 0) return -idx; + if (idx >= size) return 2 * size - idx - 1; + return idx; +} + +// SIMD phase 2: x_conv + abs + SAD for one row of int32 y_row. +// Processes 16 int32 columns at a time via mullo_epi32 + int64 accumulation. +static inline uint32_t +x_conv_row_sad_avx512(const int32_t *y_row, unsigned w) +{ + const __m512i g0 = _mm512_set1_epi32(3571); + const __m512i g1 = _mm512_set1_epi32(16004); + const __m512i g2 = _mm512_set1_epi32(26386); + const __m512i round64 = _mm512_set1_epi64(1 << 15); + + uint32_t row_sad = 0; + + // Scalar left edge (columns 0, 1) — mirror boundary + unsigned j; + for (j = 0; j < 2 && j < w; j++) { + int64_t accum = 0; + for (int k = 0; k < 5; k++) { + int col = mirror((int)j - 2 + k, (int)w); + accum += (int64_t)filter[k] * y_row[col]; + } + int32_t val = (int32_t)((accum + (1 << 15)) >> 16); + row_sad += abs(val); + } + + // SIMD middle: need y_row[j-2]..y_row[j+17], so j+18 <= w + __m512i sad_acc = _mm512_setzero_si512(); + for (; j + 18 <= w; j += 16) { + __m512i y0 = _mm512_loadu_si512((__m512i*)(y_row + j - 2)); + __m512i y1 = _mm512_loadu_si512((__m512i*)(y_row + j - 1)); + __m512i y2 = _mm512_loadu_si512((__m512i*)(y_row + j)); + __m512i y3 = _mm512_loadu_si512((__m512i*)(y_row + j + 1)); + __m512i y4 = _mm512_loadu_si512((__m512i*)(y_row + j + 2)); + + // Each product fits in int32 + __m512i p0 = _mm512_mullo_epi32(y0, g0); + __m512i p1 = _mm512_mullo_epi32(y1, g1); + __m512i p2 = _mm512_mullo_epi32(y2, g2); + __m512i p3 = _mm512_mullo_epi32(y3, g1); + __m512i p4 = _mm512_mullo_epi32(y4, g0); + + // Safe pairs that fit in int32 + __m512i s04 = _mm512_add_epi32(p0, p4); + __m512i s13 = _mm512_add_epi32(p1, p3); + + // Widen to int64 and accumulate (lo 8 elements) + __m512i acc_lo = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(s04)); + acc_lo = _mm512_add_epi64(acc_lo, 
_mm512_cvtepi32_epi64(_mm512_castsi512_si256(s13))); + acc_lo = _mm512_add_epi64(acc_lo, _mm512_cvtepi32_epi64(_mm512_castsi512_si256(p2))); + + // hi 8 elements + __m512i acc_hi = _mm512_cvtepi32_epi64(_mm512_extracti64x4_epi64(s04, 1)); + acc_hi = _mm512_add_epi64(acc_hi, _mm512_cvtepi32_epi64(_mm512_extracti64x4_epi64(s13, 1))); + acc_hi = _mm512_add_epi64(acc_hi, _mm512_cvtepi32_epi64(_mm512_extracti64x4_epi64(p2, 1))); + + // Round and arithmetic right shift >>16 (native in AVX-512) + acc_lo = _mm512_srai_epi64(_mm512_add_epi64(acc_lo, round64), 16); + acc_hi = _mm512_srai_epi64(_mm512_add_epi64(acc_hi, round64), 16); + + // Narrow int64 -> int32 (signed saturation) + __m256i res_lo = _mm512_cvtsepi64_epi32(acc_lo); + __m256i res_hi = _mm512_cvtsepi64_epi32(acc_hi); + + // Combine into 16 x int32, abs, accumulate + __m512i result = _mm512_inserti64x4(_mm512_castsi256_si512(res_lo), res_hi, 1); + __m512i abs_result = _mm512_abs_epi32(result); + + sad_acc = _mm512_add_epi32(sad_acc, abs_result); + } + + row_sad += (uint32_t)_mm512_reduce_add_epi32(sad_acc); + + // Scalar right edge + tail + for (; j < w; j++) { + int64_t accum = 0; + for (int k = 0; k < 5; k++) { + int col = mirror((int)j - 2 + k, (int)w); + accum += (int64_t)filter[k] * y_row[col]; + } + int32_t val = (int32_t)((accum + (1 << 15)) >> 16); + row_sad += abs(val); + } + + return row_sad; +} + +uint64_t motion_score_pipeline_16_avx512(const uint8_t *prev_u8, ptrdiff_t prev_stride, + const uint8_t *cur_u8, ptrdiff_t cur_stride, + int32_t *y_row, unsigned w, unsigned h, + unsigned bpc) +{ + const uint16_t *prev = (const uint16_t *)prev_u8; + const uint16_t *cur = (const uint16_t *)cur_u8; + const ptrdiff_t p_stride = prev_stride / 2; + const ptrdiff_t c_stride = cur_stride / 2; + + const __m512i g0 = _mm512_set1_epi32(3571); + const __m512i g1 = _mm512_set1_epi32(16004); + const __m512i g2 = _mm512_set1_epi32(26386); + const __m512i round64 = _mm512_set1_epi64(1 << (bpc - 1)); + const __m512i bpc_vec 
= _mm512_set1_epi64(bpc); + + uint64_t sad = 0; + + for (unsigned i = 0; i < h; i++) { + const uint16_t *pp[5], *cp[5]; + for (int k = 0; k < 5; k++) { + int r = mirror((int)i - 2 + k, (int)h); + pp[k] = prev + r * p_stride; + cp[k] = cur + r * c_stride; + } + + unsigned j; + __m512i nz_acc = _mm512_setzero_si512(); + for (j = 0; j + 16 <= w; j += 16) { + __m512i d0 = _mm512_sub_epi32( + _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)(pp[0] + j))), + _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)(cp[0] + j)))); + __m512i d1 = _mm512_sub_epi32( + _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)(pp[1] + j))), + _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)(cp[1] + j)))); + __m512i d2 = _mm512_sub_epi32( + _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)(pp[2] + j))), + _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)(cp[2] + j)))); + __m512i d3 = _mm512_sub_epi32( + _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)(pp[3] + j))), + _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)(cp[3] + j)))); + __m512i d4 = _mm512_sub_epi32( + _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)(pp[4] + j))), + _mm512_cvtepu16_epi32(_mm256_loadu_si256((__m256i*)(cp[4] + j)))); + + __m512i prod0 = _mm512_mullo_epi32(d0, g0); + __m512i prod1 = _mm512_mullo_epi32(d1, g1); + __m512i prod2 = _mm512_mullo_epi32(d2, g2); + __m512i prod3 = _mm512_mullo_epi32(d3, g1); + __m512i prod4 = _mm512_mullo_epi32(d4, g0); + + __m512i acc_lo = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(prod0)); + acc_lo = _mm512_add_epi64(acc_lo, _mm512_cvtepi32_epi64(_mm512_castsi512_si256(prod1))); + acc_lo = _mm512_add_epi64(acc_lo, _mm512_cvtepi32_epi64(_mm512_castsi512_si256(prod2))); + acc_lo = _mm512_add_epi64(acc_lo, _mm512_cvtepi32_epi64(_mm512_castsi512_si256(prod3))); + acc_lo = _mm512_add_epi64(acc_lo, _mm512_cvtepi32_epi64(_mm512_castsi512_si256(prod4))); + + __m512i acc_hi = _mm512_cvtepi32_epi64(_mm512_extracti64x4_epi64(prod0, 1)); + acc_hi = _mm512_add_epi64(acc_hi, 
_mm512_cvtepi32_epi64(_mm512_extracti64x4_epi64(prod1, 1)));
+            acc_hi = _mm512_add_epi64(acc_hi, _mm512_cvtepi32_epi64(_mm512_extracti64x4_epi64(prod2, 1)));
+            acc_hi = _mm512_add_epi64(acc_hi, _mm512_cvtepi32_epi64(_mm512_extracti64x4_epi64(prod3, 1)));
+            acc_hi = _mm512_add_epi64(acc_hi, _mm512_cvtepi32_epi64(_mm512_extracti64x4_epi64(prod4, 1)));
+
+            // Round-to-nearest then arithmetic shift by bpc, matching the
+            // scalar tail's (accum + (1 << (bpc - 1))) >> bpc.
+            acc_lo = _mm512_srav_epi64(_mm512_add_epi64(acc_lo, round64), bpc_vec);
+            acc_hi = _mm512_srav_epi64(_mm512_add_epi64(acc_hi, round64), bpc_vec);
+
+            // Saturating 64->32 narrow.  After >>bpc the magnitudes are on the
+            // order of 2^16, so saturation should never actually trigger and
+            // this stays consistent with the plain cast in the scalar tail.
+            __m256i res_lo = _mm512_cvtsepi64_epi32(acc_lo);
+            __m256i res_hi = _mm512_cvtsepi64_epi32(acc_hi);
+
+            __m512i result = _mm512_inserti64x4(_mm512_castsi256_si512(res_lo), res_hi, 1);
+            _mm512_storeu_si512((__m512i*)(y_row + j), result);
+            // OR into nz_acc so an all-zero blurred row can be skipped below.
+            nz_acc = _mm512_or_si512(nz_acc, result);
+        }
+
+        // Scalar tail for columns past the last full 16-wide vector; same
+        // taps (file-scope filter[]) and same rounding as the vector path.
+        int32_t nz_tail = 0;
+        for (; j < w; j++) {
+            int64_t accum = 0;
+            for (int k = 0; k < 5; k++) {
+                int32_t diff = pp[k][j] - cp[k][j];
+                accum += (int64_t)filter[k] * diff;
+            }
+            y_row[j] = (int32_t)((accum + (1 << (bpc - 1))) >> bpc);
+            nz_tail |= y_row[j];
+        }
+
+        // A row whose blurred difference is entirely zero contributes zero to
+        // the SAD, so the horizontal pass can be skipped outright.
+        if (_mm512_test_epi32_mask(nz_acc, nz_acc) == 0 && !nz_tail) continue;
+
+        sad += x_conv_row_sad_avx512(y_row, w);
+    }
+
+    return sad;
+}
+
+// 8-bit row-pipelined motion kernel: for each output row, form the 5-tap
+// vertically blurred difference of prev and cur (taps {3571, 16004, 26386,
+// 16004, 3571}, mirrored edges via mirror()), then accumulate the row's
+// contribution through x_conv_row_sad_avx512() (file-local helper;
+// presumably the horizontal blur + abs-sum pass -- confirm).  y_row is one
+// row (w int32 values) of caller-provided scratch.  Returns the raw SAD
+// accumulated over all h rows.
+uint64_t motion_score_pipeline_8_avx512(const uint8_t *prev, ptrdiff_t prev_stride,
+                                        const uint8_t *cur, ptrdiff_t cur_stride,
+                                        int32_t *y_row, unsigned w, unsigned h,
+                                        unsigned bpc)
+{
+    // bpc is unused: the 8-bit path hardcodes round 1<<7 and shift 8.
+    (void)bpc;
+    const __m512i f0 = _mm512_set1_epi16(3571);
+    const __m512i f1 = _mm512_set1_epi16(16004);
+    const __m512i f2 = _mm512_set1_epi16(26386);
+    const __m512i round8 = _mm512_set1_epi32(1 << 7);
+
+    uint64_t sad = 0;
+
+    for (unsigned i = 0; i < h; i++) {
+        // Row pointers for the 5 vertical taps, mirrored at the borders.
+        const uint8_t *p[5], *c[5];
+        for (int k = 0; k < 5; k++) {
+            int r = mirror((int)i - 2 + k, (int)h);
+            p[k] = prev + r * prev_stride;
+            c[k] = cur + r * cur_stride;
+        }
+
+        unsigned j;
+        __m512i nz_acc = _mm512_setzero_si512();
+        for (j = 0; j + 32 <= w; j += 32) {
+            // Widen 32 bytes to 16-bit lanes and difference prev - cur;
+            // d fits int16 (range [-255, 255]).
+            __m512i d0 = _mm512_sub_epi16(
+                _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(p[0] + j))),
+                _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(c[0] + j))));
+            __m512i d1 = _mm512_sub_epi16(
+                _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(p[1] + j))),
+                _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(c[1] + j))));
+            __m512i d2 = _mm512_sub_epi16(
+                _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(p[2] + j))),
+                _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(c[2] + j))));
+            __m512i d3 = _mm512_sub_epi16(
+                _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(p[3] + j))),
+                _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(c[3] + j))));
+            __m512i d4 = _mm512_sub_epi16(
+                _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(p[4] + j))),
+                _mm512_cvtepu8_epi16(_mm256_loadu_si256((__m256i*)(c[4] + j))));
+
+            // Signed 16x16 -> 32-bit products: mullo/mulhi give the low and
+            // high 16-bit halves, unpacklo/hi interleave them back into full
+            // int32 lanes.  |d| <= 255 and taps <= 26386, so the 5-tap sum
+            // (max 255 * 65536) fits int32 without overflow.
+            __m512i lo = _mm512_mullo_epi16(d0, f0);
+            __m512i hi = _mm512_mulhi_epi16(d0, f0);
+            __m512i acc_lo = _mm512_unpacklo_epi16(lo, hi);
+            __m512i acc_hi = _mm512_unpackhi_epi16(lo, hi);
+
+            lo = _mm512_mullo_epi16(d1, f1); hi = _mm512_mulhi_epi16(d1, f1);
+            acc_lo = _mm512_add_epi32(acc_lo, _mm512_unpacklo_epi16(lo, hi));
+            acc_hi = _mm512_add_epi32(acc_hi, _mm512_unpackhi_epi16(lo, hi));
+
+            lo = _mm512_mullo_epi16(d2, f2); hi = _mm512_mulhi_epi16(d2, f2);
+            acc_lo = _mm512_add_epi32(acc_lo, _mm512_unpacklo_epi16(lo, hi));
+            acc_hi = _mm512_add_epi32(acc_hi, _mm512_unpackhi_epi16(lo, hi));
+
+            lo = _mm512_mullo_epi16(d3, f1); hi = _mm512_mulhi_epi16(d3, f1);
+            acc_lo = _mm512_add_epi32(acc_lo, _mm512_unpacklo_epi16(lo, hi));
+            acc_hi = _mm512_add_epi32(acc_hi, _mm512_unpackhi_epi16(lo, hi));
+
+            lo = _mm512_mullo_epi16(d4, f0); hi = _mm512_mulhi_epi16(d4, f0);
+            acc_lo = _mm512_add_epi32(acc_lo, _mm512_unpacklo_epi16(lo, hi));
+            acc_hi = _mm512_add_epi32(acc_hi, _mm512_unpackhi_epi16(lo, hi));
+
+            // Round-to-nearest and arithmetic shift, matching the scalar
+            // tail's (accum + (1 << 7)) >> 8.
+            acc_lo = _mm512_srai_epi32(_mm512_add_epi32(acc_lo, round8), 8);
+            acc_hi = _mm512_srai_epi32(_mm512_add_epi32(acc_hi, round8), 8);
+
+            __m256i lo_lo = _mm512_castsi512_si256(acc_lo);
+            __m256i lo_hi = _mm512_extracti64x4_epi64(acc_lo, 1);
+            __m256i hi_lo = _mm512_castsi512_si256(acc_hi);
+            __m256i hi_hi = _mm512_extracti64x4_epi64(acc_hi, 1);
+
+            // unpacklo/hi operate within 128-bit lanes, so the int32 results
+            // are lane-scrambled (acc_lo holds columns 0-3/8-11/16-19/24-27,
+            // acc_hi the complementary quads).  These cross-lane shuffles
+            // restore ascending column order before the contiguous stores.
+            __m256i cols_0_7 = _mm256_permute2x128_si256(lo_lo, hi_lo, 0x20);
+            __m256i cols_8_15 = _mm256_permute2x128_si256(lo_lo, hi_lo, 0x31);
+            __m256i cols_16_23 = _mm256_permute2x128_si256(lo_hi, hi_hi, 0x20);
+            __m256i cols_24_31 = _mm256_permute2x128_si256(lo_hi, hi_hi, 0x31);
+
+            _mm256_storeu_si256((__m256i*)(y_row + j), cols_0_7);
+            _mm256_storeu_si256((__m256i*)(y_row + j + 8), cols_8_15);
+            _mm256_storeu_si256((__m256i*)(y_row + j + 16), cols_16_23);
+            _mm256_storeu_si256((__m256i*)(y_row + j + 24), cols_24_31);
+
+            // Fold everything just stored into nz_acc for the all-zero test.
+            __m512i stored = _mm512_inserti64x4(
+                _mm512_castsi256_si512(
+                    _mm256_or_si256(cols_0_7, cols_8_15)),
+                _mm256_or_si256(cols_16_23, cols_24_31), 1);
+            nz_acc = _mm512_or_si512(nz_acc, stored);
+        }
+
+        // Scalar tail for columns past the last full 32-wide vector; same
+        // taps (file-scope filter[], values matching f0/f1/f2) and rounding.
+        int32_t nz_tail = 0;
+        for (; j < w; j++) {
+            int32_t accum = 0;
+            for (int k = 0; k < 5; k++) {
+                int32_t diff = p[k][j] - c[k][j];
+                accum += (int32_t)filter[k] * diff;
+            }
+            y_row[j] = (accum + (1 << 7)) >> 8;
+            nz_tail |= y_row[j];
+        }
+
+        // Skip the horizontal pass for rows whose blurred difference is zero.
+        if (_mm512_test_epi32_mask(nz_acc, nz_acc) == 0 && !nz_tail) continue;
+
+        sad += x_conv_row_sad_avx512(y_row, w);
+    }
+
+    return sad;
+}
diff --git a/libvmaf/src/feature/x86/motion_v2_avx512.h b/libvmaf/src/feature/x86/motion_v2_avx512.h
new file mode 100644
index 000000000..f4eda6a71
--- /dev/null
+++ b/libvmaf/src/feature/x86/motion_v2_avx512.h
@@ -0,0 +1,35 @@
+/**
+ *
+ * Copyright 2016-2025 Netflix, Inc.
+ *
+ * Licensed under the BSD+Patent License (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://opensource.org/licenses/BSDplusPatent
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#ifndef X86_AVX512_MOTION_V2_H_
+#define X86_AVX512_MOTION_V2_H_
+
+/* Fixed: the two include directives below had lost their operands (bare
+ * "#include").  Restored from the types used by the prototypes:
+ * uint8_t/int32_t/uint64_t need <stdint.h>, ptrdiff_t needs <stddef.h>. */
+#include <stdint.h>
+#include <stddef.h>
+
+/* AVX-512 row-pipelined motion kernels: blur the prev/cur frame difference
+ * with the 5-tap kernel and return the accumulated SAD over all rows.
+ * y_row is one row (w int32 values) of caller-provided scratch. */
+uint64_t motion_score_pipeline_8_avx512(const uint8_t *prev, ptrdiff_t prev_stride,
+                                        const uint8_t *cur, ptrdiff_t cur_stride,
+                                        int32_t *y_row, unsigned w, unsigned h,
+                                        unsigned bpc);
+
+/* NOTE(review): the 16-bit implementation indexes its rows through
+ * const uint16_t * pointers; confirm whether this prototype should take
+ * const uint16_t *prev/cur (with an element stride) rather than uint8_t *. */
+uint64_t motion_score_pipeline_16_avx512(const uint8_t *prev, ptrdiff_t prev_stride,
+                                        const uint8_t *cur, ptrdiff_t cur_stride,
+                                        int32_t *y_row, unsigned w, unsigned h,
+                                        unsigned bpc);
+
+#endif /* X86_AVX512_MOTION_V2_H_ */
diff --git a/libvmaf/src/libvmaf.c b/libvmaf/src/libvmaf.c
index ee95b202b..49bd059be 100644
--- a/libvmaf/src/libvmaf.c
+++ b/libvmaf/src/libvmaf.c
@@ -90,6 +90,7 @@ typedef struct VmafContext {
     } pic_params;
     unsigned pic_cnt;
     bool flushed;
+    VmafPicture prev_ref; // previous ref pic for PREV_REF extractors (in-order only)
 } VmafContext;
 
 #ifdef VMAF_BATCH_THREADING
@@ -348,6 +349,8 @@ int vmaf_close(VmafContext *vmaf)
     if (!vmaf) return -EINVAL;
     vmaf_thread_pool_wait(vmaf->thread_pool);
+    if (vmaf->prev_ref.ref)
+        vmaf_picture_unref(&vmaf->prev_ref);
     vmaf_framesync_destroy(vmaf->framesync);
     feature_extractor_vector_destroy(&(vmaf->registered_feature_extractors));
     vmaf_feature_collector_destroy(vmaf->feature_collector);
@@ -483,7 +486,7 @@ int vmaf_use_features_from_model_collection(VmafContext *vmaf,
 
 struct ThreadData {
     VmafFeatureExtractorContext *fex_ctx;
-    VmafPicture ref, dist;
+    VmafPicture ref, dist, prev_ref;
     unsigned index;
     VmafFeatureCollector *feature_collector;
     VmafFeatureExtractorContextPool *fex_ctx_pool;
@@ -494,9 +497,19
@@ static void threaded_extract_func(void *e, void **thread_data)
 {
     (void) thread_data;
     struct ThreadData *f = e;
+
+    // Hand the previous reference picture to the extractor as a borrowed
+    // (non-owning) struct copy; it is valid only for the duration of the
+    // extract call and is cleared again immediately afterwards.
+    // NOTE(review): assumes a pooled fex_ctx is checked out by exactly one
+    // thread at a time -- confirm the pool guarantees exclusive use.
+    if (f->prev_ref.ref)
+        f->fex_ctx->fex->prev_ref = f->prev_ref;
+
     f->err = vmaf_feature_extractor_context_extract(f->fex_ctx, &f->ref, NULL,
                                                     &f->dist, NULL, f->index,
                                                     f->feature_collector);
+
+    if (f->prev_ref.ref) {
+        memset(&f->fex_ctx->fex->prev_ref, 0, sizeof(f->fex_ctx->fex->prev_ref));
+        vmaf_picture_unref(&f->prev_ref);
+    }
+
+    // NOTE(review): pre-existing -- this assignment overwrites any error the
+    // extract call stored in f->err; consider preserving the first error.
     f->err = vmaf_fex_ctx_pool_release(f->fex_ctx_pool, f->fex_ctx);
     vmaf_picture_unref(&f->ref);
     vmaf_picture_unref(&f->dist);
@@ -504,7 +517,7 @@ static void threaded_extract_batch_func(void *e, void **thread_data)
 
 #ifdef VMAF_BATCH_THREADING
 struct ThreadDataBatch {
-    VmafPicture ref, dist;
+    VmafPicture ref, dist, prev_ref;
     unsigned index;
     VmafFeatureCollector *feature_collector;
     RegisteredFeatureExtractors *registered_fex;
@@ -551,11 +564,21 @@ static void threaded_extract_batch_func(void *e, void **thread_data)
                 if (err) { f->err = err; break; }
             }
 
+            // Same borrowed-copy handoff as threaded_extract_func, applied
+            // per extractor inside the batch loop.
+            if (fex->flags & VMAF_FEATURE_EXTRACTOR_PREV_REF) {
+                if (f->prev_ref.ref)
+                    td->fex_ctx[i]->fex->prev_ref = f->prev_ref;
+            }
+
             int err = vmaf_feature_extractor_context_extract(td->fex_ctx[i],
                                                              &f->ref, NULL,
                                                              &f->dist, NULL,
                                                              f->index,
                                                              f->feature_collector);
+
+            if (fex->flags & VMAF_FEATURE_EXTRACTOR_PREV_REF)
+                memset(&td->fex_ctx[i]->fex->prev_ref, 0,
+                       sizeof(td->fex_ctx[i]->fex->prev_ref));
+
             if (err) {
                 f->err = err;
                 break;
@@ -563,6 +586,8 @@ static void threaded_extract_batch_func(void *e, void **thread_data)
             }
         }
 unref:
+    if (f->prev_ref.ref)
+        vmaf_picture_unref(&f->prev_ref);
     vmaf_picture_unref(&f->ref);
     vmaf_picture_unref(&f->dist);
 }
@@ -597,14 +622,21 @@ static int threaded_read_pictures(VmafContext *vmaf, VmafPicture *ref,
                                                &fex_ctx);
         if (err) return err;
 
-        VmafPicture pic_a, pic_b;
+        VmafPicture pic_a, pic_b, prev_ref = { 0 };
         vmaf_picture_ref(&pic_a, ref);
         vmaf_picture_ref(&pic_b, dist);
 
+        // Take a counted reference on frame N-1's ref for the worker thread.
+        if ((fex->flags & VMAF_FEATURE_EXTRACTOR_PREV_REF) &&
+            vmaf->prev_ref.ref)
+        {
vmaf_picture_ref(&prev_ref, &vmaf->prev_ref);
+        }
+
         struct ThreadData data = {
             .fex_ctx = fex_ctx,
             .ref = pic_a,
             .dist = pic_b,
+            .prev_ref = prev_ref,
             .index = index,
             .feature_collector = vmaf->feature_collector,
             .fex_ctx_pool = vmaf->fex_ctx_pool,
@@ -616,10 +648,15 @@ static int threaded_read_pictures(VmafContext *vmaf, VmafPicture *ref,
         if (err) {
             vmaf_picture_unref(&pic_a);
             vmaf_picture_unref(&pic_b);
+            if (prev_ref.ref) vmaf_picture_unref(&prev_ref);
             return err;
         }
     }
 
+    // Advance the sliding "previous reference" to this frame.  Workers queued
+    // above hold their own counted refs, so dropping ours here is safe.
+    if (vmaf->prev_ref.ref)
+        vmaf_picture_unref(&vmaf->prev_ref);
+    vmaf_picture_ref(&vmaf->prev_ref, ref);
+
     return vmaf_picture_unref(ref) | vmaf_picture_unref(dist);
 }
 
@@ -633,13 +670,17 @@ static int threaded_read_pictures_batch(VmafContext *vmaf, VmafPicture *ref,
     int err = 0;
 
-    VmafPicture pic_a, pic_b;
+    VmafPicture pic_a, pic_b, prev_ref = { 0 };
     vmaf_picture_ref(&pic_a, ref);
     vmaf_picture_ref(&pic_b, dist);
 
+    // Batch path takes the prev-ref unconditionally; the per-extractor
+    // PREV_REF flag is checked inside threaded_extract_batch_func.
+    if (vmaf->prev_ref.ref)
+        vmaf_picture_ref(&prev_ref, &vmaf->prev_ref);
+
     struct ThreadDataBatch data = {
         .ref = pic_a,
         .dist = pic_b,
+        .prev_ref = prev_ref,
         .index = index,
         .feature_collector = vmaf->feature_collector,
         .registered_fex = &vmaf->registered_feature_extractors,
@@ -652,9 +693,14 @@ static int threaded_read_pictures_batch(VmafContext *vmaf, VmafPicture *ref,
     if (err) {
         vmaf_picture_unref(&pic_a);
         vmaf_picture_unref(&pic_b);
+        if (prev_ref.ref) vmaf_picture_unref(&prev_ref);
         return err;
     }
 
+    if (vmaf->prev_ref.ref)
+        vmaf_picture_unref(&vmaf->prev_ref);
+    vmaf_picture_ref(&vmaf->prev_ref, ref);
+
     return vmaf_picture_unref(ref) | vmaf_picture_unref(dist);
 }
 #endif // VMAF_BATCH_THREADING
 
@@ -706,6 +752,22 @@ static int flush_context_threaded(VmafContext *vmaf)
     err |= vmaf_fex_ctx_pool_flush(vmaf->fex_ctx_pool, vmaf->feature_collector);
 #endif
 
+    {
+        RegisteredFeatureExtractors rfe = vmaf->registered_feature_extractors;
+        for (unsigned i = 0; i < rfe.cnt; i++) {
+            VmafFeatureExtractor *fex = rfe.fex_ctx[i]->fex;
+            if (fex->flags & VMAF_FEATURE_EXTRACTOR_TEMPORAL)
+                continue;
+            if (fex->flags & VMAF_FEATURE_EXTRACTOR_CUDA)
+                continue;
+            if (!fex->flush)
+                continue;
+            int flush_err = 0;
+            // NOTE(review): spins until flush() returns non-zero; verify the
+            // convention (positive == done, negative == error) matches
+            // vmaf_feature_extractor_context_flush, and that extractors
+            // already flushed by the pool above tolerate a second flush.
+            while (!(flush_err = fex->flush(fex, vmaf->feature_collector)));
+            if (flush_err < 0) err |= flush_err;
+        }
+    }
+
     if (!err) vmaf->flushed = true;
     return err;
 }
 
@@ -918,9 +980,19 @@ int vmaf_read_pictures(VmafContext *vmaf, VmafPicture *ref, VmafPicture *dist,
                 &dist_device : &dist_host;
 #endif
 
+        // Serial path: lend the previous reference to PREV_REF extractors as
+        // a borrowed struct copy (no refcount taken), cleared after the call.
+        if ((fex_ctx->fex->flags & VMAF_FEATURE_EXTRACTOR_PREV_REF) &&
+            vmaf->prev_ref.ref)
+        {
+            fex_ctx->fex->prev_ref = vmaf->prev_ref;
+        }
+
         err = vmaf_feature_extractor_context_extract(fex_ctx, ref, NULL, dist,
                                                      NULL, index,
                                                      vmaf->feature_collector);
+
+        if (fex_ctx->fex->flags & VMAF_FEATURE_EXTRACTOR_PREV_REF)
+            memset(&fex_ctx->fex->prev_ref, 0, sizeof(fex_ctx->fex->prev_ref));
+
         if (err) return err;
     }
 
@@ -938,6 +1010,10 @@ int vmaf_read_pictures(VmafContext *vmaf, VmafPicture *ref, VmafPicture *dist,
         return threaded_read_pictures(vmaf, ref, dist, index);
 #endif
     }
+
+    // NOTE(review): in the CUDA build the extractors were handed
+    // ref_host/ref_device above, yet prev_ref stores the original `ref` --
+    // confirm PREV_REF extractors are never combined with the CUDA path.
+    if (vmaf->prev_ref.ref)
+        vmaf_picture_unref(&vmaf->prev_ref);
+    vmaf_picture_ref(&vmaf->prev_ref, ref);
 
 #ifdef HAVE_CUDA
     if (ref_host.priv) err |= vmaf_picture_unref(&ref_host);
diff --git a/libvmaf/src/meson.build b/libvmaf/src/meson.build
index 2cc4cea3d..423e027de 100644
--- a/libvmaf/src/meson.build
+++ b/libvmaf/src/meson.build
@@ -239,6 +239,7 @@ if is_asm_enabled
     x86_avx2_sources = [
         feature_src_dir + 'common/convolution_avx.c',
         feature_src_dir + 'x86/motion_avx2.c',
+        feature_src_dir + 'x86/motion_v2_avx2.c',
         feature_src_dir + 'x86/vif_avx2.c',
         feature_src_dir + 'x86/adm_avx2.c',
         feature_src_dir + 'x86/cambi_avx2.c',
@@ -256,6 +257,7 @@ if is_asm_enabled
     if is_avx512_enabled and is_avx512_supported
         x86_avx512_sources = [
             feature_src_dir + 'x86/motion_avx512.c',
+            feature_src_dir + 'x86/motion_v2_avx512.c',
             feature_src_dir + 'x86/vif_avx512.c',
             feature_src_dir + 'x86/adm_avx512.c',
         ]
@@ -440,6 +442,7 @@ libvmaf_feature_sources = [
     feature_src_dir + 'integer_adm.c',
     feature_src_dir +
'feature_collector.c',
     feature_src_dir + 'integer_motion.c',
+    # pipelined motion implementation (integer_motion_v2)
+    feature_src_dir + 'integer_motion_v2.c',
     feature_src_dir + 'integer_vif.c',
     feature_src_dir + 'ciede.c',
     feature_src_dir + 'common/alignment.c',