From 9b1c106dc567904b07e768e139c722da683acefb Mon Sep 17 00:00:00 2001 From: claude-noether Date: Mon, 25 May 2026 00:00:46 +0200 Subject: [PATCH] h264: deblock bS=4 intra variants (luma + chroma, V + H) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the deblock matrix: adds the four bS=4 intra-strength loop filters used at I-MB edges (and other boundaries where H.264 §8.7.2.1 forces boundary strength to 4). After this PR fourier covers all 8 standard 8-bit 4:2:0 deblock combinations: bS<4 bS=4 ----- ----- luma_v ✓ (cycle 8 QPU) ✓ (CPU) luma_h ✓ (CPU, PR #9) ✓ (CPU) chrm_v ✓ (CPU, PR #10) ✓ (CPU) chrm_h ✓ (CPU, PR #10) ✓ (CPU) Scope: - 4 new kernel enums (LV_INTRA=13, LH_INTRA=14, CV_INTRA=15, CH_INTRA=16), all → CPU substrate in the recipe table. - 4 new public dispatch fns + 4 recipe wrappers (defined via two DEFINE_INTRA_DISPATCH / DEFINE_INTRA_RECIPE macros to keep the boilerplate tight). - 4 new extern decls for the vendored ff_h264_{v,h}_loop_filter_{luma,chroma}_intra_neon symbols. - C reference: tests/h264_intra_loop_filter_ref.c covers all four orientations. Algorithm per H.264 §8.7.2.4: Luma: per-side strong/weak filter selector strong_p = (|p2-p0| < β) AND (|p0-q0| < (α>>2)+2) strong_q = (|q2-q0| < β) AND (|p0-q0| < (α>>2)+2) Strong updates p0/p1/p2 (and mirror); weak updates p0 only. Chroma: always weak, only p0/q0 updated. - daedalus_h264_deblock_meta is REUSED for intra dispatches; the tc0[] field is ignored (bS=4 hardcodes the strength). Callers can build a single edge list and route by kernel without an extra struct. - Test refactor: an intra_test_spec table + run_intra_test helper drives all four orientations through one harness, keeping the new test surface compact (~50 LOC for 4 kernels vs ~200 if each had its own test_deblock_*_intra fn). Verified on hertz (Pi 5 / V3D 7.1): $ ./build/test_api_h264 === Phase 8a API smoke: H.264 kernels via recipe dispatch === ... 
H.264 deblock luma v intra: 1024/1024 bytes bit-exact (100.0000%) H.264 deblock luma h intra: 1024/1024 bytes bit-exact (100.0000%) H.264 deblock chroma v intra: 256/256 bytes bit-exact (100.0000%) H.264 deblock chroma h intra: 256/256 bytes bit-exact (100.0000%) ... All 11 H.264 kernels bit-exact PASS — the deblock matrix is closed. The bit-exact match on first try is meaningful for these kernels: the strong/weak filter selector + per-side asymmetry would have surfaced any sign / shift / rounding mistake immediately. The C reference is now a usable spec checkpoint for the eventual QPU shader work. QPU shader follow-up: not in this PR. The intra path's 3-cell per-side update + strong/weak branch is structurally more complex than the bS<4 path that already has a V shader (v3d_h264deblock.spv). Per the prior R-band logic for deblock, intra edges are < 20% of total deblock work at typical bit-rates, so NEON-only at ~ 10 ns/edge fits comfortably in the budget. --- CMakeLists.txt | 1 + include/daedalus.h | 48 ++++++++ src/daedalus_core.c | 106 +++++++++++++++++ tests/h264_intra_loop_filter_ref.c | 184 +++++++++++++++++++++++++++++ tests/test_api_h264.c | 84 +++++++++++++ 5 files changed, 423 insertions(+) create mode 100644 tests/h264_intra_loop_filter_ref.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 367c414..b27f4e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -521,6 +521,7 @@ add_executable(test_api_h264 tests/h264_deblock_ref.c tests/h264_h_loop_filter_luma_ref.c tests/h264_chroma_loop_filter_ref.c + tests/h264_intra_loop_filter_ref.c tests/h264_qpel8_mc20_ref.c ) target_link_libraries(test_api_h264 PRIVATE daedalus_core) diff --git a/include/daedalus.h b/include/daedalus.h index 8fcbebd..02944e3 100644 --- a/include/daedalus.h +++ b/include/daedalus.h @@ -315,6 +315,50 @@ int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrat uint8_t *dst, size_t dst_stride, size_t n_edges, const daedalus_h264_deblock_meta *meta); +/* H.264 
bS=4 "intra" loop filters — used at I-MB and inter + * macroblock boundaries where boundary strength is forced to 4 per + * H.264 §8.7.2.1. Different algorithm from bS<4: per-side strong + * vs weak filter decided by a sample-gradient threshold test (luma only); + * chroma is always weak. No tc0 — the daedalus_h264_deblock_meta + * struct's tc0[] field is IGNORED for intra dispatches (callers can + * leave it uninitialised or share a single edge list across both + * intra and non-intra kernels). + * + * Reuses the same meta layout as bS<4 dispatches for alpha + beta + + * dst_off; tile geometry per orientation is identical to the bS<4 + * sibling (16-col / 16-row luma; 8-col / 8-row chroma). + * + * QPU shaders not implemented for any of the four; AUTO routes to + * CPU NEON. Explicit QPU: -1 if ctx has a QPU, else CPU fallback. + */ +int daedalus_recipe_dispatch_h264_deblock_luma_v_intra(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta); +int daedalus_dispatch_h264_deblock_luma_v_intra(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta); + +int daedalus_recipe_dispatch_h264_deblock_luma_h_intra(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta); +int daedalus_dispatch_h264_deblock_luma_h_intra(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta); + +int daedalus_recipe_dispatch_h264_deblock_chroma_v_intra(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta); +int daedalus_dispatch_h264_deblock_chroma_v_intra(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta); + +int daedalus_recipe_dispatch_h264_deblock_chroma_h_intra(daedalus_ctx *ctx, + uint8_t 
*dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta); +int daedalus_dispatch_h264_deblock_chroma_h_intra(daedalus_ctx *ctx, daedalus_substrate sub, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta); + /* ------------------------------------------------------------------- * H.264 luma qpel mc20 (8×8, horizontal half-pel) — cycle 9 * (CPU by recipe; per-block 7.6 ns NEON, QPU not viable — see @@ -364,6 +408,10 @@ typedef enum { DAEDALUS_KERNEL_H264_DEBLOCK_LH = 10, DAEDALUS_KERNEL_H264_DEBLOCK_CV = 11, DAEDALUS_KERNEL_H264_DEBLOCK_CH = 12, + DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA = 13, + DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA = 14, + DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA = 15, + DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA = 16, } daedalus_kernel; daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k); diff --git a/src/daedalus_core.c b/src/daedalus_core.c index 699beb3..0334581 100644 --- a/src/daedalus_core.c +++ b/src/daedalus_core.c @@ -133,6 +133,10 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k) case DAEDALUS_KERNEL_H264_DEBLOCK_LH: return DAEDALUS_SUBSTRATE_CPU; /* QPU H shader pending */ case DAEDALUS_KERNEL_H264_DEBLOCK_CV: return DAEDALUS_SUBSTRATE_CPU; /* chroma QPU pending */ case DAEDALUS_KERNEL_H264_DEBLOCK_CH: return DAEDALUS_SUBSTRATE_CPU; /* chroma QPU pending */ + case DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA: return DAEDALUS_SUBSTRATE_CPU; /* bS=4 luma QPU pending */ + case DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA: return DAEDALUS_SUBSTRATE_CPU; + case DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA: return DAEDALUS_SUBSTRATE_CPU; /* bS=4 chroma QPU pending */ + case DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA: return DAEDALUS_SUBSTRATE_CPU; case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc20.spv */ } return DAEDALUS_SUBSTRATE_CPU; @@ -164,6 +168,14 @@ extern void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int 
alpha, int beta, int8_t *tc0); extern void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0); +extern void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); +extern void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); +extern void ff_h264_v_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); +extern void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta); extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride); @@ -320,6 +332,63 @@ static int dispatch_h264_deblock_chroma_h_cpu(daedalus_ctx *ctx, return 0; } +/* --- bS=4 intra variants. Note: the daedalus_h264_deblock_meta + * struct's tc0[] field is unused for intra (the spec hardcodes the + * strength). We accept the same meta type so callers can build a + * single edge-list and route by kernel — saves an extra struct. 
+ */ +static int dispatch_h264_deblock_luma_v_intra_cpu(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta) +{ + (void) ctx; + for (size_t i = 0; i < n_edges; i++) { + ff_h264_v_loop_filter_luma_intra_neon(dst + meta[i].dst_off, + (ptrdiff_t) dst_stride, + meta[i].alpha, meta[i].beta); + } + return 0; +} + +static int dispatch_h264_deblock_luma_h_intra_cpu(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta) +{ + (void) ctx; + for (size_t i = 0; i < n_edges; i++) { + ff_h264_h_loop_filter_luma_intra_neon(dst + meta[i].dst_off, + (ptrdiff_t) dst_stride, + meta[i].alpha, meta[i].beta); + } + return 0; +} + +static int dispatch_h264_deblock_chroma_v_intra_cpu(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta) +{ + (void) ctx; + for (size_t i = 0; i < n_edges; i++) { + ff_h264_v_loop_filter_chroma_intra_neon(dst + meta[i].dst_off, + (ptrdiff_t) dst_stride, + meta[i].alpha, meta[i].beta); + } + return 0; +} + +static int dispatch_h264_deblock_chroma_h_intra_cpu(daedalus_ctx *ctx, + uint8_t *dst, size_t dst_stride, + size_t n_edges, const daedalus_h264_deblock_meta *meta) +{ + (void) ctx; + for (size_t i = 0; i < n_edges; i++) { + ff_h264_h_loop_filter_chroma_intra_neon(dst + meta[i].dst_off, + (ptrdiff_t) dst_stride, + meta[i].alpha, meta[i].beta); + } + return 0; +} + static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx, uint8_t *dst, const uint8_t *src, size_t stride, size_t n_blocks, const daedalus_h264_qpel_meta *meta) @@ -1270,6 +1339,27 @@ int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrat return dispatch_h264_deblock_chroma_h_cpu(ctx, dst, dst_stride, n_edges, meta); } +#define DEFINE_INTRA_DISPATCH(name, kernel, cpu_fn) \ +int daedalus_dispatch_h264_deblock_ ## name (daedalus_ctx *ctx, \ + daedalus_substrate sub, uint8_t *dst, size_t dst_stride, \ + 
size_t n_edges, const daedalus_h264_deblock_meta *meta) \ +{ \ + daedalus_substrate eff = sub; \ + if (eff == DAEDALUS_SUBSTRATE_AUTO) \ + eff = daedalus_recipe_substrate_for(kernel); \ + if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) \ + eff = DAEDALUS_SUBSTRATE_CPU; \ + if (eff == DAEDALUS_SUBSTRATE_QPU) return -1; \ + return cpu_fn(ctx, dst, dst_stride, n_edges, meta); \ +} + +DEFINE_INTRA_DISPATCH(luma_v_intra, DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA, dispatch_h264_deblock_luma_v_intra_cpu) +DEFINE_INTRA_DISPATCH(luma_h_intra, DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA, dispatch_h264_deblock_luma_h_intra_cpu) +DEFINE_INTRA_DISPATCH(chroma_v_intra, DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA, dispatch_h264_deblock_chroma_v_intra_cpu) +DEFINE_INTRA_DISPATCH(chroma_h_intra, DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA, dispatch_h264_deblock_chroma_h_intra_cpu) + +#undef DEFINE_INTRA_DISPATCH + int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub, uint8_t *dst, const uint8_t *src, size_t stride, size_t n_blocks, const daedalus_h264_qpel_meta *meta) @@ -1381,6 +1471,22 @@ int daedalus_recipe_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, dst, dst_stride, n_edges, meta); } +#define DEFINE_INTRA_RECIPE(name) \ +int daedalus_recipe_dispatch_h264_deblock_ ## name (daedalus_ctx *ctx, \ + uint8_t *dst, size_t dst_stride, \ + size_t n_edges, const daedalus_h264_deblock_meta *meta) \ +{ \ + return daedalus_dispatch_h264_deblock_ ## name (ctx, DAEDALUS_SUBSTRATE_AUTO, \ + dst, dst_stride, n_edges, meta); \ +} + +DEFINE_INTRA_RECIPE(luma_v_intra) +DEFINE_INTRA_RECIPE(luma_h_intra) +DEFINE_INTRA_RECIPE(chroma_v_intra) +DEFINE_INTRA_RECIPE(chroma_h_intra) + +#undef DEFINE_INTRA_RECIPE + int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, uint8_t *dst, const uint8_t *src, size_t stride, size_t n_blocks, const daedalus_h264_qpel_meta *meta) diff --git a/tests/h264_intra_loop_filter_ref.c b/tests/h264_intra_loop_filter_ref.c new file mode 
100644 index 0000000..9f1f372 --- /dev/null +++ b/tests/h264_intra_loop_filter_ref.c @@ -0,0 +1,184 @@ +/* + * Standalone bit-exact C reference for H.264 luma + chroma "intra" + * loop filters (bS = 4 variant, used at I-MB edges where the + * boundary strength is forced to 4). Covers all four orientations: + * + * v_loop_filter_luma_intra — 16 cols × 8 rows, edge between + * rows -1 and 0 + * h_loop_filter_luma_intra — 8 cols × 16 rows, edge between + * cols -1 and 0 + * v_loop_filter_chroma_intra — 8 cols × 4 rows + * h_loop_filter_chroma_intra — 4 cols × 8 rows + * + * Mirrors FFmpeg's `ff_h264_{v,h}_loop_filter_{luma,chroma}_intra_neon` + * in external/ffmpeg-snapshot/libavcodec/aarch64/h264dsp_neon.S. + * + * Algorithm per H.264 §8.7.2.4 (bS=4): + * + * Preconditions (same as bS<4): + * |p0-q0| < α AND |p1-p0| < β AND |q1-q0| < β + * + * Luma — strong/weak filter selector per side: + * strong_p = (|p2-p0| < β) AND (|p0-q0| < (α>>2)+2) + * strong_q = (|q2-q0| < β) AND (|p0-q0| < (α>>2)+2) + * + * If strong_p, update p0/p1/p2: + * p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3 + * p1' = (p2 + p1 + p0 + q0 + 2) >> 2 + * p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3 + * Else weak (single cell): + * p0' = (2*p1 + p0 + q1 + 2) >> 2 + * Mirror for q-side. + * + * Chroma — always weak (no strong/weak selector): + * p0' = (2*p1 + p0 + q1 + 2) >> 2 + * q0' = (2*q1 + q0 + p1 + 2) >> 2 + * Chroma never updates p1/p2/q1/q2. + * + * Signature (no tc0 in the intra path — the daedalus_h264_deblock_meta + * struct's tc0 field is ignored at the dispatch layer): + * void(uint8_t *pix, ptrdiff_t stride, int alpha, int beta); + * + * License: LGPL-2.1-or-later (matches FFmpeg upstream). + */ +#include <stddef.h> +#include <stdint.h> + +static inline int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; } +static inline int abs_i(int x) { return x < 0 ? 
-x : x; } + +/* --- luma intra, one column across the horizontal edge --- */ +static void h264_luma_intra_cell_v(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta) +{ + int p3 = pix[-4*stride], p2 = pix[-3*stride]; + int p1 = pix[-2*stride], p0 = pix[-1*stride]; + int q0 = pix[ 0*stride], q1 = pix[ 1*stride]; + int q2 = pix[ 2*stride], q3 = pix[ 3*stride]; + + if (abs_i(p0 - q0) >= alpha) return; + if (abs_i(p1 - p0) >= beta) return; + if (abs_i(q1 - q0) >= beta) return; + + int strong_common = abs_i(p0 - q0) < ((alpha >> 2) + 2); + int strong_p = strong_common && (abs_i(p2 - p0) < beta); + int strong_q = strong_common && (abs_i(q2 - q0) < beta); + + if (strong_p) { + pix[-1*stride] = (uint8_t) clip_u8((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3); + pix[-2*stride] = (uint8_t) clip_u8((p2 + p1 + p0 + q0 + 2) >> 2); + pix[-3*stride] = (uint8_t) clip_u8((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3); + } else { + pix[-1*stride] = (uint8_t) clip_u8((2*p1 + p0 + q1 + 2) >> 2); + } + + if (strong_q) { + pix[ 0*stride] = (uint8_t) clip_u8((q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3); + pix[ 1*stride] = (uint8_t) clip_u8((q2 + q1 + q0 + p0 + 2) >> 2); + pix[ 2*stride] = (uint8_t) clip_u8((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3); + } else { + pix[ 0*stride] = (uint8_t) clip_u8((2*q1 + q0 + p1 + 2) >> 2); + } +} + +/* --- luma intra, one row across the vertical edge --- */ +static void h264_luma_intra_cell_h(uint8_t *pix, int alpha, int beta) +{ + int p3 = pix[-4], p2 = pix[-3], p1 = pix[-2], p0 = pix[-1]; + int q0 = pix[ 0], q1 = pix[ 1], q2 = pix[ 2], q3 = pix[ 3]; + + if (abs_i(p0 - q0) >= alpha) return; + if (abs_i(p1 - p0) >= beta) return; + if (abs_i(q1 - q0) >= beta) return; + + int strong_common = abs_i(p0 - q0) < ((alpha >> 2) + 2); + int strong_p = strong_common && (abs_i(p2 - p0) < beta); + int strong_q = strong_common && (abs_i(q2 - q0) < beta); + + if (strong_p) { + pix[-1] = (uint8_t) clip_u8((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3); + pix[-2] = (uint8_t) clip_u8((p2 + p1 + 
p0 + q0 + 2) >> 2); + pix[-3] = (uint8_t) clip_u8((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3); + } else { + pix[-1] = (uint8_t) clip_u8((2*p1 + p0 + q1 + 2) >> 2); + } + + if (strong_q) { + pix[ 0] = (uint8_t) clip_u8((q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3); + pix[ 1] = (uint8_t) clip_u8((q2 + q1 + q0 + p0 + 2) >> 2); + pix[ 2] = (uint8_t) clip_u8((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3); + } else { + pix[ 0] = (uint8_t) clip_u8((2*q1 + q0 + p1 + 2) >> 2); + } +} + +/* --- chroma intra, one column across the horizontal edge --- */ +static void h264_chroma_intra_cell_v(uint8_t *pix, ptrdiff_t stride, + int alpha, int beta) +{ + int p1 = pix[-2*stride], p0 = pix[-1*stride]; + int q0 = pix[ 0*stride], q1 = pix[ 1*stride]; + + if (abs_i(p0 - q0) >= alpha) return; + if (abs_i(p1 - p0) >= beta) return; + if (abs_i(q1 - q0) >= beta) return; + + pix[-1*stride] = (uint8_t) clip_u8((2*p1 + p0 + q1 + 2) >> 2); + pix[ 0*stride] = (uint8_t) clip_u8((2*q1 + q0 + p1 + 2) >> 2); +} + +/* --- chroma intra, one row across the vertical edge --- */ +static void h264_chroma_intra_cell_h(uint8_t *pix, int alpha, int beta) +{ + int p1 = pix[-2], p0 = pix[-1]; + int q0 = pix[ 0], q1 = pix[ 1]; + + if (abs_i(p0 - q0) >= alpha) return; + if (abs_i(p1 - p0) >= beta) return; + if (abs_i(q1 - q0) >= beta) return; + + pix[-1] = (uint8_t) clip_u8((2*p1 + p0 + q1 + 2) >> 2); + pix[ 0] = (uint8_t) clip_u8((2*q1 + q0 + p1 + 2) >> 2); +} + +/* --- public refs --- */ + +void daedalus_h264_v_loop_filter_luma_intra_ref( + uint8_t *pix, ptrdiff_t stride, int alpha, int beta) +{ + /* Note: the FFmpeg .S `h264_loop_filter_start_intra` macro + * returns early if (alpha|beta) == 0. 
For non-zero alpha or + * non-zero beta it runs the filter; the per-cell preconditions + * (abs(p0-q0)n_edges * t->tile_stride * t->tile_rows; + daedalus_ctx *ctx = daedalus_ctx_create(); + if (!ctx) return 1; + + uint8_t *dst = malloc((size_t) total); + uint8_t *dst_ref = malloc((size_t) total); + daedalus_h264_deblock_meta *meta = calloc((size_t) t->n_edges, sizeof(*meta)); + if (!dst || !dst_ref || !meta) return 1; + + for (int i = 0; i < total; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff); + int tile_bytes = t->tile_stride * t->tile_rows; + for (int i = 0; i < t->n_edges; i++) { + meta[i].dst_off = (uint32_t)(i * tile_bytes + t->edge_off); + meta[i].alpha = (int)(xs() % 64) + 1; + meta[i].beta = (int)(xs() % 16) + 1; + /* tc0[] unused for intra; leave at 0 from calloc. */ + } + for (int i = 0; i < t->n_edges; i++) { + t->ref(dst_ref + meta[i].dst_off, + (ptrdiff_t) t->tile_stride, + meta[i].alpha, meta[i].beta); + } + int rc = t->dispatch(ctx, dst, (size_t) t->tile_stride, + (size_t) t->n_edges, meta); + if (rc) { fprintf(stderr, "%s dispatch rc=%d\n", t->name, rc); return 1; } + + int diff = 0; + for (int i = 0; i < total; i++) if (dst[i] != dst_ref[i]) diff++; + printf(" H.264 deblock %s: %d/%d bytes bit-exact (%.4f%%)\n", + t->name, total - diff, total, 100.0 * (total - diff) / total); + + free(meta); free(dst_ref); free(dst); + daedalus_ctx_destroy(ctx); + return diff == 0 ? 
0 : 1; +} + +static int test_deblock_intra_all(void) +{ + intra_test_spec specs[] = { + { "luma v intra", 8, 16, 8, 4 * 16, + daedalus_h264_v_loop_filter_luma_intra_ref, + daedalus_recipe_dispatch_h264_deblock_luma_v_intra }, + { "luma h intra", 8, 8, 16, 4, + daedalus_h264_h_loop_filter_luma_intra_ref, + daedalus_recipe_dispatch_h264_deblock_luma_h_intra }, + { "chroma v intra", 8, 8, 4, 2 * 8, + daedalus_h264_v_loop_filter_chroma_intra_ref, + daedalus_recipe_dispatch_h264_deblock_chroma_v_intra }, + { "chroma h intra", 8, 4, 8, 2, + daedalus_h264_h_loop_filter_chroma_intra_ref, + daedalus_recipe_dispatch_h264_deblock_chroma_h_intra }, + }; + int fail = 0; + for (size_t i = 0; i < sizeof(specs)/sizeof(specs[0]); i++) + fail |= run_intra_test(&specs[i]); + return fail; +} + static int test_qpel_mc20(void) { /* Cycle 9 — one 8x8 block per 16-wide row-tile, 8 tiles. Each tile @@ -336,6 +417,8 @@ int main(void) (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV)); printf(" H264_DEBLOCK_CH recipe substrate: %d (CPU)\n", (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH)); + printf(" H264_DEBLOCK_*_INTRA recipe substrate: %d (CPU, bS=4 set)\n", + (int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA)); int fail = 0; fail |= test_idct4(); @@ -344,6 +427,7 @@ int main(void) fail |= test_deblock_h(); fail |= test_deblock_chroma_v(); fail |= test_deblock_chroma_h(); + fail |= test_deblock_intra_all(); fail |= test_qpel_mc20(); return fail; } -- 2.47.3