diff --git a/arch/ffmpeg-v4l2-request-fourier/0007-h264-qpel-mc20-daedalus-fourier.patch b/arch/ffmpeg-v4l2-request-fourier/0007-h264-qpel-mc20-daedalus-fourier.patch new file mode 100644 index 0000000000..9b27f3e0e3 --- /dev/null +++ b/arch/ffmpeg-v4l2-request-fourier/0007-h264-qpel-mc20-daedalus-fourier.patch @@ -0,0 +1,139 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Markus Fritsche +Date: Sat, 23 May 2026 12:00:00 +0200 +Subject: [PATCH] avcodec/aarch64/h264qpel: route 8x8 mc20 through + daedalus-fourier +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +H264QpelContext.put_h264_qpel_pixels_tab[1][2] (8x8 luma horizontal +half-pel, 6-tap "put" variant — the canonical representative of the +H.264 luma motion-compensation family) now dispatches through +daedalus_recipe_dispatch_h264_qpel_mc20 instead of +ff_put_h264_qpel8_mc20_neon. + +Cycle 9 of the daedalus-v4l2#11 step 2 substitution arc; closes the +4-cycle libavcodec.so substitution sequence (6 IDCT 4x4 / 7 IDCT 8x8 / +8 luma-v deblock / 9 qpel mc20). + +The recipe layer picks the substrate. Per docs/k9_h264qpel_mc20.md +the verdict is CPU NEON: per-block 7.6 ns at 131 Mblock/s gives 135x +margin over 30 fps 1080p, and the QPU dispatch floor (~250 ns) +makes any V3D shader strictly worse. Substitution is plumbing-only, +NEON-by-recipe — same daedalus_ctx_create_no_qpu pthread_once +context shape the cycles 6/7/8 shims already own (kept SEPARATE +from the H264DSP shim's ctx because H264QPEL is its own libavcodec +Makefile module and link order does not guarantee a single .o +owns the ctx symbol; one extra ~µs init per process, paid lazily). + +Other H.264 luma MC variants (mc02, mc11, mc22 etc.) and the 16x16 +size tier stay on the in-tree NEON .S code. 
Per the cycle-9 phase-1 +rationale, mc20 8x8 is representative of the whole family's per-block +cost — extending the substitution to other variants would multiply +recipe-lookup overhead without changing the substrate verdict. + +Bit-exact against ff_put_h264_qpel8_mc20_neon (daedalus-fourier +cycle 9 green; M1 = 100% bit-exact across 10000 random blocks). + +No SONAME change, no Depends change. + +Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 cycle 9. +--- + libavcodec/aarch64/Makefile | 3 +- + libavcodec/aarch64/h264_qpel_daedalus.c | 50 ++++++++++++++++++++++ + libavcodec/aarch64/h264qpel_init_aarch64.c | 4 +- + 3 files changed, 55 insertions(+), 2 deletions(-) + create mode 100644 libavcodec/aarch64/h264_qpel_daedalus.c + +diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile +--- a/libavcodec/aarch64/Makefile ++++ b/libavcodec/aarch64/Makefile +@@ -7,7 +7,8 @@ OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o \ + aarch64/h264_idct_daedalus.o + OBJS-$(CONFIG_HUFFYUVDSP) += aarch64/huffyuvdsp_init_aarch64.o + OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_init.o +-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o ++OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o \ ++ aarch64/h264_qpel_daedalus.o + OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o + OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_init_aarch64.o + OBJS-$(CONFIG_ME_CMP) += aarch64/me_cmp_init_aarch64.o +diff --git a/libavcodec/aarch64/h264_qpel_daedalus.c b/libavcodec/aarch64/h264_qpel_daedalus.c +new file mode 100644 +--- /dev/null ++++ b/libavcodec/aarch64/h264_qpel_daedalus.c +@@ -0,0 +1,50 @@ ++/* ++ * H.264 luma qpel mc20 (8x8, horizontal half-pel, 6-tap "put") ++ * — daedalus-fourier substitution shim. ++ * ++ * Routes H264QpelContext.put_h264_qpel_pixels_tab[1][2] through ++ * daedalus_recipe_dispatch_h264_qpel_mc20 instead of ++ * ff_put_h264_qpel8_mc20_neon. 
The recipe layer picks the substrate ++ * (CPU NEON for cycle 9; QPU not viable — per-block 7.6 ns vs ++ * ~250 ns QPU dispatch floor, see docs/k9_h264qpel_mc20.md). ++ * ++ * Sibling to libavcodec/aarch64/h264_idct_daedalus.c. We keep a ++ * SEPARATE process-global pthread_once context here instead of ++ * sharing the H264DSP one because H264QPEL is its own libavcodec ++ * Makefile module and link order does not guarantee a single .o ++ * owns the ctx symbol. The cost is one extra ++ * daedalus_ctx_create_no_qpu (~µs) per process; daemon and host ++ * processes pay this lazily on first MC call. ++ * ++ * FFmpeg H264QpelContext convention: both dst and src use a SINGLE ++ * stride and `src` already points at the leftmost OUTPUT column ++ * (col 0); the 6-tap filter reads cols -2..+3. This matches ++ * daedalus_recipe_dispatch_h264_qpel_mc20's documented contract ++ * directly, so dst_off = src_off = 0. ++ */ ++ ++#include ++#include ++#include ++ ++#include ++ ++#include "libavutil/attributes.h" ++ ++static daedalus_ctx *g_dctx; ++static pthread_once_t g_dctx_once = PTHREAD_ONCE_INIT; ++ ++static void daedalus_ctx_init_once(void) ++{ ++ g_dctx = daedalus_ctx_create_no_qpu(); ++} ++ ++void ff_put_h264_qpel8_mc20_daedalus(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); ++ ++void ff_put_h264_qpel8_mc20_daedalus(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) ++{ ++ static const daedalus_h264_qpel_meta meta = { .dst_off = 0, .src_off = 0 }; ++ pthread_once(&g_dctx_once, daedalus_ctx_init_once); ++ daedalus_recipe_dispatch_h264_qpel_mc20(g_dctx, dst, src, (size_t)stride, ++ 1, &meta); ++} +diff --git a/libavcodec/aarch64/h264qpel_init_aarch64.c b/libavcodec/aarch64/h264qpel_init_aarch64.c +--- a/libavcodec/aarch64/h264qpel_init_aarch64.c ++++ b/libavcodec/aarch64/h264qpel_init_aarch64.c +@@ -47,6 +47,8 @@ void ff_put_h264_qpel8_mc00_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t str + void ff_put_h264_qpel8_mc10_neon(uint8_t *dst, const uint8_t *src, 
ptrdiff_t stride); + void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); + void ff_put_h264_qpel8_mc30_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); ++void ff_put_h264_qpel8_mc20_daedalus(uint8_t *dst, const uint8_t *src, ++ ptrdiff_t stride); + void ff_put_h264_qpel8_mc01_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); + void ff_put_h264_qpel8_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); + void ff_put_h264_qpel8_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +@@ -184,7 +186,7 @@ av_cold void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth) + + c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon; + c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon; +- c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon; ++ c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_daedalus; + c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon; + c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon; + c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon; +-- +2.47.3 diff --git a/arch/ffmpeg-v4l2-request-fourier/PKGBUILD b/arch/ffmpeg-v4l2-request-fourier/PKGBUILD index fb927e8d7e..27c41ce2ee 100644 --- a/arch/ffmpeg-v4l2-request-fourier/PKGBUILD +++ b/arch/ffmpeg-v4l2-request-fourier/PKGBUILD @@ -24,13 +24,13 @@ _srcname=FFmpeg _version='8.1' _commit='b57fbbe50c9b2656fad86a1a7eeabfd2b2a50935' # v4l2-request-n8.1 tip 2026-04-24 pkgver=8.1.r123329.b57fbbe -pkgrel=9 # pkgrel=9 — restore AV_CODEC_FLAG_LOW_DELAY for H.264 (2026-05-22) +pkgrel=10 # pkgrel=10 — H.264 luma qpel mc20 daedalus-fourier substitution (cycle 9, 2026-05-23) epoch=2 -# daedalus-fourier pin — first kernel substitution in libavcodec -# (cycle 6 H.264 IDCT 4x4). Same SHA as the daedalus-v4l2 daemon's -# inline build; lockstep with that until the public API rolls. 
-_daedalus_fourier_commit='d87239d8172307d9a1b93c95cbed116d175b85cc' +# daedalus-fourier pin. 209a421 = PR #2 merge (Phase 8c — public API +# gains daedalus_recipe_dispatch_h264_qpel_mc20 + DAEDALUS_KERNEL_H264_QPEL_MC20). +# Cycle 9 closes the libavcodec.so substitution arc started at cycle 6. +_daedalus_fourier_commit='209a4218bcb98b91c04f07ad61513bb04adb13ad' pkgdesc='FFmpeg with V4L2 Request API hwaccel (Rockchip / Allwinner stateless decode)' arch=('aarch64') url='https://github.com/Kwiboo/FFmpeg' @@ -93,8 +93,9 @@ source=("git+https://github.com/Kwiboo/FFmpeg.git#commit=${_commit}" '0003-h264-idct4-daedalus-fourier.patch' '0004-h264-idct8-daedalus-fourier.patch' '0005-h264-deblock-luma-v-daedalus-fourier.patch' - '0006-h264-restore-low-delay.patch') -sha256sums=('SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP') + '0006-h264-restore-low-delay.patch' + '0007-h264-qpel-mc20-daedalus-fourier.patch') +sha256sums=('SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP') pkgver() { cd "${_srcname}" @@ -111,6 +112,7 @@ prepare() { patch -Np1 -i "${srcdir}/0004-h264-idct8-daedalus-fourier.patch" patch -Np1 -i "${srcdir}/0005-h264-deblock-luma-v-daedalus-fourier.patch" patch -Np1 -i "${srcdir}/0006-h264-restore-low-delay.patch" + patch -Np1 -i "${srcdir}/0007-h264-qpel-mc20-daedalus-fourier.patch" } build() { diff --git a/debian/ffmpeg-v4l2-request-fourier/0007-h264-qpel-mc20-daedalus-fourier.patch b/debian/ffmpeg-v4l2-request-fourier/0007-h264-qpel-mc20-daedalus-fourier.patch new file mode 100644 index 0000000000..9b27f3e0e3 --- /dev/null +++ b/debian/ffmpeg-v4l2-request-fourier/0007-h264-qpel-mc20-daedalus-fourier.patch @@ -0,0 +1,139 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Markus Fritsche +Date: Sat, 23 May 2026 12:00:00 +0200 +Subject: [PATCH] avcodec/aarch64/h264qpel: route 8x8 mc20 through + daedalus-fourier +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + 
+H264QpelContext.put_h264_qpel_pixels_tab[1][2] (8x8 luma horizontal +half-pel, 6-tap "put" variant — the canonical representative of the +H.264 luma motion-compensation family) now dispatches through +daedalus_recipe_dispatch_h264_qpel_mc20 instead of +ff_put_h264_qpel8_mc20_neon. + +Cycle 9 of the daedalus-v4l2#11 step 2 substitution arc; closes the +4-cycle libavcodec.so substitution sequence (6 IDCT 4x4 / 7 IDCT 8x8 / +8 luma-v deblock / 9 qpel mc20). + +The recipe layer picks the substrate. Per docs/k9_h264qpel_mc20.md +the verdict is CPU NEON: per-block 7.6 ns at 131 Mblock/s gives 135x +margin over 30 fps 1080p, and the QPU dispatch floor (~250 ns) +makes any V3D shader strictly worse. Substitution is plumbing-only, +NEON-by-recipe — same daedalus_ctx_create_no_qpu pthread_once +context shape the cycles 6/7/8 shims already own (kept SEPARATE +from the H264DSP shim's ctx because H264QPEL is its own libavcodec +Makefile module and link order does not guarantee a single .o +owns the ctx symbol; one extra ~µs init per process, paid lazily). + +Other H.264 luma MC variants (mc02, mc11, mc22 etc.) and the 16x16 +size tier stay on the in-tree NEON .S code. Per the cycle-9 phase-1 +rationale, mc20 8x8 is representative of the whole family's per-block +cost — extending the substitution to other variants would multiply +recipe-lookup overhead without changing the substrate verdict. + +Bit-exact against ff_put_h264_qpel8_mc20_neon (daedalus-fourier +cycle 9 green; M1 = 100% bit-exact across 10000 random blocks). + +No SONAME change, no Depends change. + +Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 cycle 9. 
+--- + libavcodec/aarch64/Makefile | 3 +- + libavcodec/aarch64/h264_qpel_daedalus.c | 50 ++++++++++++++++++++++ + libavcodec/aarch64/h264qpel_init_aarch64.c | 4 +- + 3 files changed, 55 insertions(+), 2 deletions(-) + create mode 100644 libavcodec/aarch64/h264_qpel_daedalus.c + +diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile +--- a/libavcodec/aarch64/Makefile ++++ b/libavcodec/aarch64/Makefile +@@ -7,7 +7,8 @@ OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o \ + aarch64/h264_idct_daedalus.o + OBJS-$(CONFIG_HUFFYUVDSP) += aarch64/huffyuvdsp_init_aarch64.o + OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_init.o +-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o ++OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o \ ++ aarch64/h264_qpel_daedalus.o + OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o + OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_init_aarch64.o + OBJS-$(CONFIG_ME_CMP) += aarch64/me_cmp_init_aarch64.o +diff --git a/libavcodec/aarch64/h264_qpel_daedalus.c b/libavcodec/aarch64/h264_qpel_daedalus.c +new file mode 100644 +--- /dev/null ++++ b/libavcodec/aarch64/h264_qpel_daedalus.c +@@ -0,0 +1,50 @@ ++/* ++ * H.264 luma qpel mc20 (8x8, horizontal half-pel, 6-tap "put") ++ * — daedalus-fourier substitution shim. ++ * ++ * Routes H264QpelContext.put_h264_qpel_pixels_tab[1][2] through ++ * daedalus_recipe_dispatch_h264_qpel_mc20 instead of ++ * ff_put_h264_qpel8_mc20_neon. The recipe layer picks the substrate ++ * (CPU NEON for cycle 9; QPU not viable — per-block 7.6 ns vs ++ * ~250 ns QPU dispatch floor, see docs/k9_h264qpel_mc20.md). ++ * ++ * Sibling to libavcodec/aarch64/h264_idct_daedalus.c. We keep a ++ * SEPARATE process-global pthread_once context here instead of ++ * sharing the H264DSP one because H264QPEL is its own libavcodec ++ * Makefile module and link order does not guarantee a single .o ++ * owns the ctx symbol. 
The cost is one extra ++ * daedalus_ctx_create_no_qpu (~µs) per process; daemon and host ++ * processes pay this lazily on first MC call. ++ * ++ * FFmpeg H264QpelContext convention: both dst and src use a SINGLE ++ * stride and `src` already points at the leftmost OUTPUT column ++ * (col 0); the 6-tap filter reads cols -2..+3. This matches ++ * daedalus_recipe_dispatch_h264_qpel_mc20's documented contract ++ * directly, so dst_off = src_off = 0. ++ */ ++ ++#include ++#include ++#include ++ ++#include ++ ++#include "libavutil/attributes.h" ++ ++static daedalus_ctx *g_dctx; ++static pthread_once_t g_dctx_once = PTHREAD_ONCE_INIT; ++ ++static void daedalus_ctx_init_once(void) ++{ ++ g_dctx = daedalus_ctx_create_no_qpu(); ++} ++ ++void ff_put_h264_qpel8_mc20_daedalus(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); ++ ++void ff_put_h264_qpel8_mc20_daedalus(uint8_t *dst, const uint8_t *src, ptrdiff_t stride) ++{ ++ static const daedalus_h264_qpel_meta meta = { .dst_off = 0, .src_off = 0 }; ++ pthread_once(&g_dctx_once, daedalus_ctx_init_once); ++ daedalus_recipe_dispatch_h264_qpel_mc20(g_dctx, dst, src, (size_t)stride, ++ 1, &meta); ++} +diff --git a/libavcodec/aarch64/h264qpel_init_aarch64.c b/libavcodec/aarch64/h264qpel_init_aarch64.c +--- a/libavcodec/aarch64/h264qpel_init_aarch64.c ++++ b/libavcodec/aarch64/h264qpel_init_aarch64.c +@@ -47,6 +47,8 @@ void ff_put_h264_qpel8_mc00_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t str + void ff_put_h264_qpel8_mc10_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); + void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); + void ff_put_h264_qpel8_mc30_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); ++void ff_put_h264_qpel8_mc20_daedalus(uint8_t *dst, const uint8_t *src, ++ ptrdiff_t stride); + void ff_put_h264_qpel8_mc01_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); + void ff_put_h264_qpel8_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); + void 
ff_put_h264_qpel8_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); +@@ -184,7 +186,7 @@ av_cold void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth) + + c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon; + c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon; +- c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon; ++ c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_daedalus; + c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon; + c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon; + c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon; +-- +2.47.3 diff --git a/debian/ffmpeg-v4l2-request-fourier/build-deb.sh b/debian/ffmpeg-v4l2-request-fourier/build-deb.sh index b7bb34fb49..e5e7ace66b 100755 --- a/debian/ffmpeg-v4l2-request-fourier/build-deb.sh +++ b/debian/ffmpeg-v4l2-request-fourier/build-deb.sh @@ -33,18 +33,19 @@ FFMPEG_VERSION=8.1 # epoch 2 matches Debian's stock ffmpeg (currently 7:7.1.x in trixie); # +rfourier suffix to avoid colliding with upstream/Debian rebuilds. PKGVER=2:${FFMPEG_VERSION}+rfourier+gb57fbbe -PKGREL=9 # pkgrel=9 — restore AV_CODEC_FLAG_LOW_DELAY semantics in the - # H.264 decoder (FFmpeg 8.x dropped them). Fixes the 2-1-4-3 - # B-frame pair-swap that re-appeared in Firefox YouTube after - # the SONAME 61→62 jump (PR #75) silently neutered the - # daemon's ctx->flags |= AV_CODEC_FLAG_LOW_DELAY at - # daemon/src/decoder.c:202. Substitution arc unchanged. - # (2026-05-22) +PKGREL=10 # pkgrel=10 — H.264 luma qpel mc20 daedalus-fourier substitution + # (cycle 9 of the daedalus-v4l2#11 step 2 substitution arc; closes + # the libavcodec.so substitution sequence 6 IDCT4 / 7 IDCT8 / + # 8 luma-v deblock / 9 qpel mc20). Pulls daedalus-fourier PR #2 + # which extends the public API with + # daedalus_recipe_dispatch_h264_qpel_mc20. 
(2026-05-23) -# daedalus-fourier pin — first kernel substitution in libavcodec (cycle 6 -# H.264 IDCT 4x4). Same SHA as the daedalus-v4l2 daemon already ships -# inline; rev in lockstep with the daemon when the public API rolls. -DAEDALUS_FOURIER_COMMIT=d87239d8172307d9a1b93c95cbed116d175b85cc +# daedalus-fourier pin. 209a421 = daedalus-fourier PR #2 merge — public +# API now exposes daedalus_recipe_dispatch_h264_qpel_mc20 + +# DAEDALUS_KERNEL_H264_QPEL_MC20. Cycle 9 plumbs the arc's final kernel +# (8x8 qpel mc20) through the recipe layer. Daemon-side build (debian/daedalus-v4l2) +# can bump in a follow-up; this PR only changes the libavcodec.so consumer. +DAEDALUS_FOURIER_COMMIT=209a4218bcb98b91c04f07ad61513bb04adb13ad HERE=$(dirname "$(readlink -f "$0")") @@ -72,6 +73,7 @@ patch -Np1 -i "$HERE/0003-h264-idct4-daedalus-fourier.patch" patch -Np1 -i "$HERE/0004-h264-idct8-daedalus-fourier.patch" patch -Np1 -i "$HERE/0005-h264-deblock-luma-v-daedalus-fourier.patch" patch -Np1 -i "$HERE/0006-h264-restore-low-delay.patch" +patch -Np1 -i "$HERE/0007-h264-qpel-mc20-daedalus-fourier.patch" # --- daedalus-fourier: fetch + build static .a with PIC, install to a # per-build prefix; libavcodec.so links it into the shared object so diff --git a/debian/ffmpeg-v4l2-request-fourier/debian/changelog b/debian/ffmpeg-v4l2-request-fourier/debian/changelog index 90e6b78008..84c4bf51d8 100644 --- a/debian/ffmpeg-v4l2-request-fourier/debian/changelog +++ b/debian/ffmpeg-v4l2-request-fourier/debian/changelog @@ -1,3 +1,37 @@ +ffmpeg-v4l2-request-fourier (2:8.1+rfourier+gb57fbbe-10) bookworm trixie; urgency=medium + + * Add 0007-h264-qpel-mc20-daedalus-fourier.patch — + H264QpelContext.put_h264_qpel_pixels_tab[1][2] (8x8 luma + horizontal half-pel, 6-tap "put" — the canonical representative + of the H.264 luma motion-compensation family) now dispatches + through daedalus_recipe_dispatch_h264_qpel_mc20 instead of + ff_put_h264_qpel8_mc20_neon.
Cycle 9 of the daedalus-v4l2#11 + step 2 substitution arc; closes the 4-cycle libavcodec.so + substitution sequence (6 IDCT4 / 7 IDCT8 / 8 luma-v deblock / + 9 qpel mc20). + * Bumps daedalus-fourier pin d87239d → 209a421 (PR #2 — public + API extended with daedalus_recipe_dispatch_h264_qpel_mc20 + + DAEDALUS_KERNEL_H264_QPEL_MC20). + * Cycle 9 is "CPU primary; QPU pointless" per + docs/k9_h264qpel_mc20.md. Per-block 7.6 ns at 131 Mblock/s + gives 135x margin over 30 fps 1080p; QPU dispatch floor at + ~250 ns makes any V3D shader strictly worse. Substitution + is plumbing-only, NEON-by-recipe — same + daedalus_ctx_create_no_qpu pthread_once shape the cycles 6/7/8 + shims already own (kept SEPARATE from the H264DSP shim's ctx + because H264QPEL is its own libavcodec Makefile module and + link order does not guarantee a single .o owns the ctx symbol; + one extra ~µs init per process, paid lazily on first MC call). + * Other H.264 luma MC variants (mc02, mc11, mc22 etc.) and the + 16x16 size tier stay on the in-tree NEON .S code. Per the + cycle-9 phase-1 rationale, mc20 8x8 is representative of the + whole family's per-block cost. + * Bit-exact against ff_put_h264_qpel8_mc20_neon (daedalus-fourier + cycle 9 green; 10000/10000 random blocks). + * No SONAME change, no Depends change. + + -- Markus Fritsche Sat, 23 May 2026 12:00:00 +0000 + ffmpeg-v4l2-request-fourier (2:8.1+rfourier+gb57fbbe-9) bookworm trixie; urgency=medium * Add 0006-h264-restore-low-delay.patch — restore the documented