h264: deblock bS=4 intra variants (luma + chroma, V + H)
Closes the deblock matrix: adds the four bS=4 intra-strength loop
filters used at I-MB edges (and other boundaries where H.264
§8.7.2.1 forces boundary strength to 4). After this PR fourier
covers all 8 standard 8-bit 4:2:0 deblock combinations:
          bS<4                bS=4
          -----               -----
  luma_v  ✓ (cycle 8 QPU)     ✓ (CPU)
  luma_h  ✓ (CPU, PR #9)      ✓ (CPU)
  chrm_v  ✓ (CPU, PR #10)     ✓ (CPU)
  chrm_h  ✓ (CPU, PR #10)     ✓ (CPU)
Scope:
- 4 new kernel enums (LV_INTRA=13, LH_INTRA=14, CV_INTRA=15,
CH_INTRA=16), all → CPU substrate in the recipe table.
- 4 new public dispatch fns + 4 recipe wrappers (defined via two
DEFINE_INTRA_DISPATCH / DEFINE_INTRA_RECIPE macros to keep the
boilerplate tight).
- 4 new extern decls for the vendored
ff_h264_{v,h}_loop_filter_{luma,chroma}_intra_neon symbols.
- C reference: tests/h264_intra_loop_filter_ref.c covers all four
orientations. Algorithm per H.264 §8.7.2.3:
Luma: per-side strong/weak filter selector
strong_p = (|p2-p0| < β) AND (|p0-q0| < (α>>2)+2)
strong_q = (|q2-q0| < β) AND (|p0-q0| < (α>>2)+2)
Strong updates p0/p1/p2 (and mirror); weak updates p0 only.
Chroma: always weak, only p0/q0 updated.
- daedalus_h264_deblock_meta is REUSED for intra dispatches; the
tc0[] field is ignored (bS=4 hardcodes the strength). Callers
can build a single edge list and route by kernel without an
extra struct.
- Test refactor: an intra_test_spec table + run_intra_test helper
drives all four orientations through one harness, keeping the
new test surface compact (~50 LOC for 4 kernels vs ~200 if each
had its own test_deblock_*_intra fn).
Verified on hertz (Pi 5 / V3D 7.1):
$ ./build/test_api_h264
=== Phase 8a API smoke: H.264 kernels via recipe dispatch ===
...
H.264 deblock luma v intra: 1024/1024 bytes bit-exact (100.0000%)
H.264 deblock luma h intra: 1024/1024 bytes bit-exact (100.0000%)
H.264 deblock chroma v intra: 256/256 bytes bit-exact (100.0000%)
H.264 deblock chroma h intra: 256/256 bytes bit-exact (100.0000%)
...
All 11 H.264 kernels bit-exact PASS — the deblock matrix is closed.
The bit-exact match on first try is meaningful for these kernels:
the strong/weak filter selector + per-side asymmetry would have
surfaced any sign / shift / rounding mistake immediately. The
C reference is now a usable spec checkpoint for the eventual QPU
shader work.
QPU shader follow-up: not in this PR. The intra path's 3-cell
per-side update + strong/weak branch is structurally more complex
than the bS<4 path that already has a V shader (v3d_h264deblock.spv).
Per the prior R-band logic for deblock, intra edges are < 20% of
total deblock work at typical bit-rates, so NEON-only at ~ 10 ns/edge
fits comfortably in the budget.
This commit is contained in:
@@ -521,6 +521,7 @@ add_executable(test_api_h264
|
|||||||
tests/h264_deblock_ref.c
|
tests/h264_deblock_ref.c
|
||||||
tests/h264_h_loop_filter_luma_ref.c
|
tests/h264_h_loop_filter_luma_ref.c
|
||||||
tests/h264_chroma_loop_filter_ref.c
|
tests/h264_chroma_loop_filter_ref.c
|
||||||
|
tests/h264_intra_loop_filter_ref.c
|
||||||
tests/h264_qpel8_mc20_ref.c
|
tests/h264_qpel8_mc20_ref.c
|
||||||
)
|
)
|
||||||
target_link_libraries(test_api_h264 PRIVATE daedalus_core)
|
target_link_libraries(test_api_h264 PRIVATE daedalus_core)
|
||||||
|
|||||||
@@ -315,6 +315,50 @@ int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrat
|
|||||||
uint8_t *dst, size_t dst_stride,
|
uint8_t *dst, size_t dst_stride,
|
||||||
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||||
|
|
||||||
|
/* H.264 bS=4 "intra" loop filters — used at I-MB and inter
|
||||||
|
* macroblock boundaries where boundary strength is forced to 4 per
|
||||||
|
* H.264 §8.7.2.1. Different algorithm from bS<4: per-side strong
|
||||||
|
* vs weak filter decided by a per-side sample-difference test (luma only);
|
||||||
|
* chroma is always weak. No tc0 — the daedalus_h264_deblock_meta
|
||||||
|
* struct's tc0[] field is IGNORED for intra dispatches (callers can
|
||||||
|
* leave it uninitialised or share a single edge list across both
|
||||||
|
* intra and non-intra kernels).
|
||||||
|
*
|
||||||
|
* Reuses the same meta layout as bS<4 dispatches for alpha + beta +
|
||||||
|
* dst_off; tile geometry per orientation is identical to the bS<4
|
||||||
|
* sibling (16-col / 16-row luma; 8-col / 8-row chroma).
|
||||||
|
*
|
||||||
|
* QPU shaders not implemented for any of the four; recipe routes
|
||||||
|
* AUTO to CPU NEON. Explicit SUBSTRATE_QPU returns -1 (fast fail) when
* the ctx has a QPU; on a ctx without one it falls back to CPU NEON.
|
||||||
|
*/
|
||||||
|
int daedalus_recipe_dispatch_h264_deblock_luma_v_intra(daedalus_ctx *ctx,
|
||||||
|
uint8_t *dst, size_t dst_stride,
|
||||||
|
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||||
|
int daedalus_dispatch_h264_deblock_luma_v_intra(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||||
|
uint8_t *dst, size_t dst_stride,
|
||||||
|
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||||
|
|
||||||
|
int daedalus_recipe_dispatch_h264_deblock_luma_h_intra(daedalus_ctx *ctx,
|
||||||
|
uint8_t *dst, size_t dst_stride,
|
||||||
|
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||||
|
int daedalus_dispatch_h264_deblock_luma_h_intra(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||||
|
uint8_t *dst, size_t dst_stride,
|
||||||
|
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||||
|
|
||||||
|
int daedalus_recipe_dispatch_h264_deblock_chroma_v_intra(daedalus_ctx *ctx,
|
||||||
|
uint8_t *dst, size_t dst_stride,
|
||||||
|
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||||
|
int daedalus_dispatch_h264_deblock_chroma_v_intra(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||||
|
uint8_t *dst, size_t dst_stride,
|
||||||
|
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||||
|
|
||||||
|
int daedalus_recipe_dispatch_h264_deblock_chroma_h_intra(daedalus_ctx *ctx,
|
||||||
|
uint8_t *dst, size_t dst_stride,
|
||||||
|
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||||
|
int daedalus_dispatch_h264_deblock_chroma_h_intra(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||||
|
uint8_t *dst, size_t dst_stride,
|
||||||
|
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||||
|
|
||||||
/* -------------------------------------------------------------------
|
/* -------------------------------------------------------------------
|
||||||
* H.264 luma qpel mc20 (8×8, horizontal half-pel) — cycle 9
|
* H.264 luma qpel mc20 (8×8, horizontal half-pel) — cycle 9
|
||||||
* (CPU by recipe; per-block 7.6 ns NEON, QPU not viable — see
|
* (CPU by recipe; per-block 7.6 ns NEON, QPU not viable — see
|
||||||
@@ -364,6 +408,10 @@ typedef enum {
|
|||||||
DAEDALUS_KERNEL_H264_DEBLOCK_LH = 10,
|
DAEDALUS_KERNEL_H264_DEBLOCK_LH = 10,
|
||||||
DAEDALUS_KERNEL_H264_DEBLOCK_CV = 11,
|
DAEDALUS_KERNEL_H264_DEBLOCK_CV = 11,
|
||||||
DAEDALUS_KERNEL_H264_DEBLOCK_CH = 12,
|
DAEDALUS_KERNEL_H264_DEBLOCK_CH = 12,
|
||||||
|
DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA = 13,
|
||||||
|
DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA = 14,
|
||||||
|
DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA = 15,
|
||||||
|
DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA = 16,
|
||||||
} daedalus_kernel;
|
} daedalus_kernel;
|
||||||
|
|
||||||
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
|
daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k);
|
||||||
|
|||||||
@@ -133,6 +133,10 @@ daedalus_substrate daedalus_recipe_substrate_for(daedalus_kernel k)
|
|||||||
case DAEDALUS_KERNEL_H264_DEBLOCK_LH: return DAEDALUS_SUBSTRATE_CPU; /* QPU H shader pending */
|
case DAEDALUS_KERNEL_H264_DEBLOCK_LH: return DAEDALUS_SUBSTRATE_CPU; /* QPU H shader pending */
|
||||||
case DAEDALUS_KERNEL_H264_DEBLOCK_CV: return DAEDALUS_SUBSTRATE_CPU; /* chroma QPU pending */
|
case DAEDALUS_KERNEL_H264_DEBLOCK_CV: return DAEDALUS_SUBSTRATE_CPU; /* chroma QPU pending */
|
||||||
case DAEDALUS_KERNEL_H264_DEBLOCK_CH: return DAEDALUS_SUBSTRATE_CPU; /* chroma QPU pending */
|
case DAEDALUS_KERNEL_H264_DEBLOCK_CH: return DAEDALUS_SUBSTRATE_CPU; /* chroma QPU pending */
|
||||||
|
case DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA: return DAEDALUS_SUBSTRATE_CPU; /* bS=4 luma QPU pending */
|
||||||
|
case DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA: return DAEDALUS_SUBSTRATE_CPU;
|
||||||
|
case DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA: return DAEDALUS_SUBSTRATE_CPU; /* bS=4 chroma QPU pending */
|
||||||
|
case DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA: return DAEDALUS_SUBSTRATE_CPU;
|
||||||
case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc20.spv */
|
case DAEDALUS_KERNEL_H264_QPEL_MC20: return DAEDALUS_SUBSTRATE_QPU; /* v3d_h264_qpel_mc20.spv */
|
||||||
}
|
}
|
||||||
return DAEDALUS_SUBSTRATE_CPU;
|
return DAEDALUS_SUBSTRATE_CPU;
|
||||||
@@ -164,6 +168,14 @@ extern void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
|
|||||||
int alpha, int beta, int8_t *tc0);
|
int alpha, int beta, int8_t *tc0);
|
||||||
extern void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
|
extern void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, ptrdiff_t stride,
|
||||||
int alpha, int beta, int8_t *tc0);
|
int alpha, int beta, int8_t *tc0);
|
||||||
|
extern void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
int alpha, int beta);
|
||||||
|
extern void ff_h264_h_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
int alpha, int beta);
|
||||||
|
extern void ff_h264_v_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
int alpha, int beta);
|
||||||
|
extern void ff_h264_h_loop_filter_chroma_intra_neon(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
int alpha, int beta);
|
||||||
extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
|
extern void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src,
|
||||||
ptrdiff_t stride);
|
ptrdiff_t stride);
|
||||||
|
|
||||||
@@ -320,6 +332,63 @@ static int dispatch_h264_deblock_chroma_h_cpu(daedalus_ctx *ctx,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* --- bS=4 intra variants. Note: the daedalus_h264_deblock_meta
|
||||||
|
* struct's tc0[] field is unused for intra (the spec hardcodes the
|
||||||
|
* strength). We accept the same meta type so callers can build a
|
||||||
|
* single edge-list and route by kernel — saves an extra struct.
|
||||||
|
*/
|
||||||
|
/*
 * CPU path for the bS=4 luma "v" loop filter.
 *
 * Calls ff_h264_v_loop_filter_luma_intra_neon once per meta[] entry, in
 * index order, at dst + meta[i].dst_off with dst_stride as the row pitch.
 * Only dst_off, alpha and beta are read from each entry; ctx is unused.
 * Always returns 0.
 */
static int dispatch_h264_deblock_luma_v_intra_cpu(daedalus_ctx *ctx,
        uint8_t *dst, size_t dst_stride,
        size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;
    size_t edge_idx = 0;

    (void) ctx;

    while (edge_idx < n_edges) {
        const daedalus_h264_deblock_meta *edge = &meta[edge_idx];

        ff_h264_v_loop_filter_luma_intra_neon(dst + edge->dst_off, pitch,
                                              edge->alpha, edge->beta);
        edge_idx++;
    }
    return 0;
}
|
||||||
|
|
||||||
|
/*
 * CPU path for the bS=4 luma "h" loop filter.
 *
 * Calls ff_h264_h_loop_filter_luma_intra_neon once per meta[] entry, in
 * index order, at dst + meta[i].dst_off with dst_stride as the row pitch.
 * Only dst_off, alpha and beta are read from each entry; ctx is unused.
 * Always returns 0.
 */
static int dispatch_h264_deblock_luma_h_intra_cpu(daedalus_ctx *ctx,
        uint8_t *dst, size_t dst_stride,
        size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;
    size_t edge_idx = 0;

    (void) ctx;

    while (edge_idx < n_edges) {
        const daedalus_h264_deblock_meta *edge = &meta[edge_idx];

        ff_h264_h_loop_filter_luma_intra_neon(dst + edge->dst_off, pitch,
                                              edge->alpha, edge->beta);
        edge_idx++;
    }
    return 0;
}
|
||||||
|
|
||||||
|
/*
 * CPU path for the bS=4 chroma "v" loop filter.
 *
 * Calls ff_h264_v_loop_filter_chroma_intra_neon once per meta[] entry, in
 * index order, at dst + meta[i].dst_off with dst_stride as the row pitch.
 * Only dst_off, alpha and beta are read from each entry; ctx is unused.
 * Always returns 0.
 */
static int dispatch_h264_deblock_chroma_v_intra_cpu(daedalus_ctx *ctx,
        uint8_t *dst, size_t dst_stride,
        size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;
    size_t edge_idx = 0;

    (void) ctx;

    while (edge_idx < n_edges) {
        const daedalus_h264_deblock_meta *edge = &meta[edge_idx];

        ff_h264_v_loop_filter_chroma_intra_neon(dst + edge->dst_off, pitch,
                                                edge->alpha, edge->beta);
        edge_idx++;
    }
    return 0;
}
|
||||||
|
|
||||||
|
/*
 * CPU path for the bS=4 chroma "h" loop filter.
 *
 * Calls ff_h264_h_loop_filter_chroma_intra_neon once per meta[] entry, in
 * index order, at dst + meta[i].dst_off with dst_stride as the row pitch.
 * Only dst_off, alpha and beta are read from each entry; ctx is unused.
 * Always returns 0.
 */
static int dispatch_h264_deblock_chroma_h_intra_cpu(daedalus_ctx *ctx,
        uint8_t *dst, size_t dst_stride,
        size_t n_edges, const daedalus_h264_deblock_meta *meta)
{
    const ptrdiff_t pitch = (ptrdiff_t) dst_stride;
    size_t edge_idx = 0;

    (void) ctx;

    while (edge_idx < n_edges) {
        const daedalus_h264_deblock_meta *edge = &meta[edge_idx];

        ff_h264_h_loop_filter_chroma_intra_neon(dst + edge->dst_off, pitch,
                                                edge->alpha, edge->beta);
        edge_idx++;
    }
    return 0;
}
|
||||||
|
|
||||||
static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx,
|
static int dispatch_h264_qpel_mc20_cpu(daedalus_ctx *ctx,
|
||||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta)
|
size_t n_blocks, const daedalus_h264_qpel_meta *meta)
|
||||||
@@ -1270,6 +1339,27 @@ int daedalus_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx, daedalus_substrat
|
|||||||
return dispatch_h264_deblock_chroma_h_cpu(ctx, dst, dst_stride, n_edges, meta);
|
return dispatch_h264_deblock_chroma_h_cpu(ctx, dst, dst_stride, n_edges, meta);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define DEFINE_INTRA_DISPATCH(name, kernel, cpu_fn) \
|
||||||
|
int daedalus_dispatch_h264_deblock_ ## name (daedalus_ctx *ctx, \
|
||||||
|
daedalus_substrate sub, uint8_t *dst, size_t dst_stride, \
|
||||||
|
size_t n_edges, const daedalus_h264_deblock_meta *meta) \
|
||||||
|
{ \
|
||||||
|
daedalus_substrate eff = sub; \
|
||||||
|
if (eff == DAEDALUS_SUBSTRATE_AUTO) \
|
||||||
|
eff = daedalus_recipe_substrate_for(kernel); \
|
||||||
|
if (eff == DAEDALUS_SUBSTRATE_QPU && !daedalus_ctx_has_qpu(ctx)) \
|
||||||
|
eff = DAEDALUS_SUBSTRATE_CPU; \
|
||||||
|
if (eff == DAEDALUS_SUBSTRATE_QPU) return -1; \
|
||||||
|
return cpu_fn(ctx, dst, dst_stride, n_edges, meta); \
|
||||||
|
}
|
||||||
|
|
||||||
|
DEFINE_INTRA_DISPATCH(luma_v_intra, DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA, dispatch_h264_deblock_luma_v_intra_cpu)
|
||||||
|
DEFINE_INTRA_DISPATCH(luma_h_intra, DAEDALUS_KERNEL_H264_DEBLOCK_LH_INTRA, dispatch_h264_deblock_luma_h_intra_cpu)
|
||||||
|
DEFINE_INTRA_DISPATCH(chroma_v_intra, DAEDALUS_KERNEL_H264_DEBLOCK_CV_INTRA, dispatch_h264_deblock_chroma_v_intra_cpu)
|
||||||
|
DEFINE_INTRA_DISPATCH(chroma_h_intra, DAEDALUS_KERNEL_H264_DEBLOCK_CH_INTRA, dispatch_h264_deblock_chroma_h_intra_cpu)
|
||||||
|
|
||||||
|
#undef DEFINE_INTRA_DISPATCH
|
||||||
|
|
||||||
int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
|
int daedalus_dispatch_h264_qpel_mc20(daedalus_ctx *ctx, daedalus_substrate sub,
|
||||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta)
|
size_t n_blocks, const daedalus_h264_qpel_meta *meta)
|
||||||
@@ -1381,6 +1471,22 @@ int daedalus_recipe_dispatch_h264_deblock_chroma_h(daedalus_ctx *ctx,
|
|||||||
dst, dst_stride, n_edges, meta);
|
dst, dst_stride, n_edges, meta);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define DEFINE_INTRA_RECIPE(name) \
|
||||||
|
int daedalus_recipe_dispatch_h264_deblock_ ## name (daedalus_ctx *ctx, \
|
||||||
|
uint8_t *dst, size_t dst_stride, \
|
||||||
|
size_t n_edges, const daedalus_h264_deblock_meta *meta) \
|
||||||
|
{ \
|
||||||
|
return daedalus_dispatch_h264_deblock_ ## name (ctx, DAEDALUS_SUBSTRATE_AUTO, \
|
||||||
|
dst, dst_stride, n_edges, meta); \
|
||||||
|
}
|
||||||
|
|
||||||
|
DEFINE_INTRA_RECIPE(luma_v_intra)
|
||||||
|
DEFINE_INTRA_RECIPE(luma_h_intra)
|
||||||
|
DEFINE_INTRA_RECIPE(chroma_v_intra)
|
||||||
|
DEFINE_INTRA_RECIPE(chroma_h_intra)
|
||||||
|
|
||||||
|
#undef DEFINE_INTRA_RECIPE
|
||||||
|
|
||||||
int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
|
int daedalus_recipe_dispatch_h264_qpel_mc20(daedalus_ctx *ctx,
|
||||||
uint8_t *dst, const uint8_t *src, size_t stride,
|
uint8_t *dst, const uint8_t *src, size_t stride,
|
||||||
size_t n_blocks, const daedalus_h264_qpel_meta *meta)
|
size_t n_blocks, const daedalus_h264_qpel_meta *meta)
|
||||||
|
|||||||
@@ -0,0 +1,184 @@
|
|||||||
|
/*
|
||||||
|
* Standalone bit-exact C reference for H.264 luma + chroma "intra"
|
||||||
|
* loop filters (bS = 4 variant, used at I-MB edges where the
|
||||||
|
* boundary strength is forced to 4). Covers all four orientations:
|
||||||
|
*
|
||||||
|
* v_loop_filter_luma_intra — 16 cols × 8 rows, edge between
|
||||||
|
* rows -1 and 0
|
||||||
|
* h_loop_filter_luma_intra — 8 cols × 16 rows, edge between
|
||||||
|
* cols -1 and 0
|
||||||
|
* v_loop_filter_chroma_intra — 8 cols × 4 rows
|
||||||
|
* h_loop_filter_chroma_intra — 4 cols × 8 rows
|
||||||
|
*
|
||||||
|
* Mirrors FFmpeg's `ff_h264_{v,h}_loop_filter_{luma,chroma}_intra_neon`
|
||||||
|
* in external/ffmpeg-snapshot/libavcodec/aarch64/h264dsp_neon.S.
|
||||||
|
*
|
||||||
|
* Algorithm per H.264 §8.7.2.3 (bS=4):
|
||||||
|
*
|
||||||
|
* Preconditions (same as bS<4):
|
||||||
|
* |p0-q0| < α AND |p1-p0| < β AND |q1-q0| < β
|
||||||
|
*
|
||||||
|
* Luma — strong/weak filter selector per side:
|
||||||
|
* strong_p = (|p2-p0| < β) AND (|p0-q0| < (α>>2)+2)
|
||||||
|
* strong_q = (|q2-q0| < β) AND (|p0-q0| < (α>>2)+2)
|
||||||
|
*
|
||||||
|
* If strong_p, update p0/p1/p2:
|
||||||
|
* p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
|
||||||
|
* p1' = (p2 + p1 + p0 + q0 + 2) >> 2
|
||||||
|
* p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
|
||||||
|
* Else weak (single cell):
|
||||||
|
* p0' = (2*p1 + p0 + q1 + 2) >> 2
|
||||||
|
* Mirror for q-side.
|
||||||
|
*
|
||||||
|
* Chroma — always weak (no strong/weak selector):
|
||||||
|
* p0' = (2*p1 + p0 + q1 + 2) >> 2
|
||||||
|
* q0' = (2*q1 + q0 + p1 + 2) >> 2
|
||||||
|
* Chroma never updates p1/p2/q1/q2.
|
||||||
|
*
|
||||||
|
* Signature (no tc0 in the intra path — the daedalus_h264_deblock_meta
|
||||||
|
* struct's tc0 field is ignored at the dispatch layer):
|
||||||
|
* void(uint8_t *pix, ptrdiff_t stride, int alpha, int beta);
|
||||||
|
*
|
||||||
|
* License: LGPL-2.1-or-later (matches FFmpeg upstream).
|
||||||
|
*/
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stddef.h>
|
||||||
|
|
||||||
|
/* Saturate an int to the unsigned 8-bit sample range [0, 255]. */
static inline int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

/* Absolute value of an int. */
static inline int abs_i(int x) { return x < 0 ? -x : x; }

/*
 * bS=4 luma filter for ONE line of eight samples crossing the edge.
 *
 *   pix    points at q0 (first sample on the far side of the edge)
 *   step   distance in bytes between consecutive samples across the edge
 *          (the row pitch for a horizontal edge, 1 for a vertical edge)
 *   alpha  threshold on |p0-q0|
 *   beta   threshold on |p1-p0|, |q1-q0|, |p2-p0| and |q2-q0|
 *
 * All eight samples p3..q3 are read before anything is written, so the
 * q-side update uses the ORIGINAL p-side values (and vice versa).
 * Nothing is modified unless all three preconditions hold.  Each side
 * then independently takes the strong filter (three samples rewritten)
 * or the weak filter (only the sample adjacent to the edge rewritten).
 */
static void h264_luma_intra_line(uint8_t *pix, ptrdiff_t step,
                                 int alpha, int beta)
{
    const int p3 = pix[-4 * step], p2 = pix[-3 * step];
    const int p1 = pix[-2 * step], p0 = pix[-1 * step];
    const int q0 = pix[ 0 * step], q1 = pix[ 1 * step];
    const int q2 = pix[ 2 * step], q3 = pix[ 3 * step];

    if (abs_i(p0 - q0) >= alpha ||
        abs_i(p1 - p0) >= beta  ||
        abs_i(q1 - q0) >= beta) {
        return;
    }

    /* Shared half of the strong-filter test; the other half is per side. */
    const int small_step = abs_i(p0 - q0) < ((alpha >> 2) + 2);
    const int strong_p   = small_step && (abs_i(p2 - p0) < beta);
    const int strong_q   = small_step && (abs_i(q2 - q0) < beta);

    if (strong_p) {
        pix[-1 * step] = (uint8_t) clip_u8((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3);
        pix[-2 * step] = (uint8_t) clip_u8((p2 + p1 + p0 + q0 + 2) >> 2);
        pix[-3 * step] = (uint8_t) clip_u8((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3);
    } else {
        pix[-1 * step] = (uint8_t) clip_u8((2*p1 + p0 + q1 + 2) >> 2);
    }

    if (strong_q) {
        pix[ 0 * step] = (uint8_t) clip_u8((q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3);
        pix[ 1 * step] = (uint8_t) clip_u8((q2 + q1 + q0 + p0 + 2) >> 2);
        pix[ 2 * step] = (uint8_t) clip_u8((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3);
    } else {
        pix[ 0 * step] = (uint8_t) clip_u8((2*q1 + q0 + p1 + 2) >> 2);
    }
}

/* --- luma intra, one column across the horizontal edge --- */
static void h264_luma_intra_cell_v(uint8_t *pix, ptrdiff_t stride,
                                   int alpha, int beta)
{
    h264_luma_intra_line(pix, stride, alpha, beta);
}

/* --- luma intra, one row across the vertical edge --- */
static void h264_luma_intra_cell_h(uint8_t *pix, int alpha, int beta)
{
    h264_luma_intra_line(pix, 1, alpha, beta);
}
|
||||||
|
|
||||||
|
/* --- chroma intra, one column across the horizontal edge --- */
|
||||||
|
/* |a - b| for two int sample values. */
static inline int h264_chroma_abs_diff(int a, int b)
{
    return a < b ? b - a : a - b;
}

/*
 * bS=4 chroma filter for ONE line of four samples crossing the edge.
 *
 *   pix    points at q0 (first sample on the far side of the edge)
 *   step   distance in bytes between consecutive samples across the edge
 *          (the row pitch for a horizontal edge, 1 for a vertical edge)
 *   alpha  threshold on |p0-q0|
 *   beta   threshold on |p1-p0| and |q1-q0|
 *
 * Chroma always uses the weak filter: only p0 and q0 are rewritten, and
 * only when all three preconditions hold.  Both outputs are computed
 * from the original (pre-filter) samples.
 *
 * No saturation is needed: with 8-bit inputs the filter sum lies in
 * [2, 4*255 + 2], so after >> 2 the result is already within [0, 255].
 */
static void h264_chroma_intra_line(uint8_t *pix, ptrdiff_t step,
                                   int alpha, int beta)
{
    const int p1 = pix[-2 * step], p0 = pix[-1 * step];
    const int q0 = pix[ 0 * step], q1 = pix[ 1 * step];

    if (h264_chroma_abs_diff(p0, q0) >= alpha ||
        h264_chroma_abs_diff(p1, p0) >= beta  ||
        h264_chroma_abs_diff(q1, q0) >= beta) {
        return;
    }

    pix[-1 * step] = (uint8_t) ((2*p1 + p0 + q1 + 2) >> 2);
    pix[ 0 * step] = (uint8_t) ((2*q1 + q0 + p1 + 2) >> 2);
}

/* --- chroma intra, one column across the horizontal edge --- */
static void h264_chroma_intra_cell_v(uint8_t *pix, ptrdiff_t stride,
                                     int alpha, int beta)
{
    h264_chroma_intra_line(pix, stride, alpha, beta);
}

/* --- chroma intra, one row across the vertical edge --- */
static void h264_chroma_intra_cell_h(uint8_t *pix, int alpha, int beta)
{
    h264_chroma_intra_line(pix, 1, alpha, beta);
}
|
||||||
|
|
||||||
|
/* --- public refs --- */
|
||||||
|
|
||||||
|
/*
 * Reference bS=4 luma filter across a horizontal edge.
 *
 * pix points at the first sample of the row just below the edge; the
 * filter is applied to 16 consecutive columns, left to right.
 *
 * The FFmpeg .S `h264_loop_filter_start_intra` macro returns early when
 * (alpha|beta) == 0, and the same shortcut is taken here.  For any other
 * alpha/beta the per-column preconditions decide whether a column is
 * actually modified.  Unlike the bS<4 path there is no per-segment tc0
 * value, so no group of columns is skipped wholesale.
 */
void daedalus_h264_v_loop_filter_luma_intra_ref(
    uint8_t *pix, ptrdiff_t stride, int alpha, int beta)
{
    enum { LUMA_EDGE_LEN = 16 };

    if (alpha == 0 && beta == 0)
        return;

    for (int col = 0; col != LUMA_EDGE_LEN; ++col)
        h264_luma_intra_cell_v(&pix[col], stride, alpha, beta);
}
|
||||||
|
|
||||||
|
/*
 * Reference bS=4 luma filter across a vertical edge.
 *
 * pix points at the first sample to the right of the edge in the top
 * row; the filter is applied to 16 consecutive rows, top to bottom,
 * `stride` bytes apart.  Does nothing when alpha and beta are both 0.
 */
void daedalus_h264_h_loop_filter_luma_intra_ref(
    uint8_t *pix, ptrdiff_t stride, int alpha, int beta)
{
    enum { LUMA_EDGE_LEN = 16 };

    if (alpha == 0 && beta == 0)
        return;

    for (int row = 0; row != LUMA_EDGE_LEN; ++row)
        h264_luma_intra_cell_h(pix + row * stride, alpha, beta);
}
|
||||||
|
|
||||||
|
/*
 * Reference bS=4 chroma filter across a horizontal edge.
 *
 * pix points at the first sample of the row just below the edge; the
 * filter is applied to 8 consecutive columns, left to right.
 * Does nothing when alpha and beta are both 0.
 */
void daedalus_h264_v_loop_filter_chroma_intra_ref(
    uint8_t *pix, ptrdiff_t stride, int alpha, int beta)
{
    enum { CHROMA_EDGE_LEN = 8 };

    if (alpha == 0 && beta == 0)
        return;

    for (int col = 0; col != CHROMA_EDGE_LEN; ++col)
        h264_chroma_intra_cell_v(&pix[col], stride, alpha, beta);
}
|
||||||
|
|
||||||
|
/*
 * Reference bS=4 chroma filter across a vertical edge.
 *
 * pix points at the first sample to the right of the edge in the top
 * row; the filter is applied to 8 consecutive rows, top to bottom,
 * `stride` bytes apart.  Does nothing when alpha and beta are both 0.
 */
void daedalus_h264_h_loop_filter_chroma_intra_ref(
    uint8_t *pix, ptrdiff_t stride, int alpha, int beta)
{
    enum { CHROMA_EDGE_LEN = 8 };

    if (alpha == 0 && beta == 0)
        return;

    for (int row = 0; row != CHROMA_EDGE_LEN; ++row)
        h264_chroma_intra_cell_h(pix + row * stride, alpha, beta);
}
|
||||||
@@ -22,6 +22,14 @@ extern void daedalus_h264_v_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t strid
|
|||||||
int alpha, int beta, int8_t tc0[4]);
|
int alpha, int beta, int8_t tc0[4]);
|
||||||
extern void daedalus_h264_h_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride,
|
extern void daedalus_h264_h_loop_filter_chroma_ref(uint8_t *pix, ptrdiff_t stride,
|
||||||
int alpha, int beta, int8_t tc0[4]);
|
int alpha, int beta, int8_t tc0[4]);
|
||||||
|
extern void daedalus_h264_v_loop_filter_luma_intra_ref(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
int alpha, int beta);
|
||||||
|
extern void daedalus_h264_h_loop_filter_luma_intra_ref(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
int alpha, int beta);
|
||||||
|
extern void daedalus_h264_v_loop_filter_chroma_intra_ref(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
int alpha, int beta);
|
||||||
|
extern void daedalus_h264_h_loop_filter_chroma_intra_ref(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
int alpha, int beta);
|
||||||
extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
|
extern void daedalus_h264_v_loop_filter_luma_ref(uint8_t *pix, ptrdiff_t stride,
|
||||||
int alpha, int beta, int8_t tc0[4]);
|
int alpha, int beta, int8_t tc0[4]);
|
||||||
extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
|
extern void daedalus_put_h264_qpel8_mc20_ref(uint8_t *dst, const uint8_t *src,
|
||||||
@@ -278,6 +286,79 @@ static int test_deblock_chroma_h(void)
|
|||||||
return diff == 0 ? 0 : 1;
|
return diff == 0 ? 0 : 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* --- bS=4 intra-strength deblock tests ---
|
||||||
|
* Tile geometry per orientation matches the bS<4 variant; only the
|
||||||
|
* dispatch + reference function change. alpha/beta are non-trivial
|
||||||
|
* (the C ref + NEON both early-return when alpha|beta == 0).
|
||||||
|
*/
|
||||||
|
typedef struct {
|
||||||
|
const char *name;
|
||||||
|
int n_edges, tile_stride, tile_rows, edge_off;
|
||||||
|
void (*ref)(uint8_t *pix, ptrdiff_t stride, int alpha, int beta);
|
||||||
|
int (*dispatch)(daedalus_ctx *ctx, uint8_t *dst, size_t dst_stride,
|
||||||
|
size_t n_edges, const daedalus_h264_deblock_meta *meta);
|
||||||
|
} intra_test_spec;
|
||||||
|
|
||||||
|
static int run_intra_test(const intra_test_spec *t)
|
||||||
|
{
|
||||||
|
int total = t->n_edges * t->tile_stride * t->tile_rows;
|
||||||
|
daedalus_ctx *ctx = daedalus_ctx_create();
|
||||||
|
if (!ctx) return 1;
|
||||||
|
|
||||||
|
uint8_t *dst = malloc((size_t) total);
|
||||||
|
uint8_t *dst_ref = malloc((size_t) total);
|
||||||
|
daedalus_h264_deblock_meta *meta = calloc((size_t) t->n_edges, sizeof(*meta));
|
||||||
|
if (!dst || !dst_ref || !meta) return 1;
|
||||||
|
|
||||||
|
for (int i = 0; i < total; i++) dst[i] = dst_ref[i] = (uint8_t)(xs() & 0xff);
|
||||||
|
int tile_bytes = t->tile_stride * t->tile_rows;
|
||||||
|
for (int i = 0; i < t->n_edges; i++) {
|
||||||
|
meta[i].dst_off = (uint32_t)(i * tile_bytes + t->edge_off);
|
||||||
|
meta[i].alpha = (int)(xs() % 64) + 1;
|
||||||
|
meta[i].beta = (int)(xs() % 16) + 1;
|
||||||
|
/* tc0[] unused for intra; leave at 0 from calloc. */
|
||||||
|
}
|
||||||
|
for (int i = 0; i < t->n_edges; i++) {
|
||||||
|
t->ref(dst_ref + meta[i].dst_off,
|
||||||
|
(ptrdiff_t) t->tile_stride,
|
||||||
|
meta[i].alpha, meta[i].beta);
|
||||||
|
}
|
||||||
|
int rc = t->dispatch(ctx, dst, (size_t) t->tile_stride,
|
||||||
|
(size_t) t->n_edges, meta);
|
||||||
|
if (rc) { fprintf(stderr, "%s dispatch rc=%d\n", t->name, rc); return 1; }
|
||||||
|
|
||||||
|
int diff = 0;
|
||||||
|
for (int i = 0; i < total; i++) if (dst[i] != dst_ref[i]) diff++;
|
||||||
|
printf(" H.264 deblock %s: %d/%d bytes bit-exact (%.4f%%)\n",
|
||||||
|
t->name, total - diff, total, 100.0 * (total - diff) / total);
|
||||||
|
|
||||||
|
free(meta); free(dst_ref); free(dst);
|
||||||
|
daedalus_ctx_destroy(ctx);
|
||||||
|
return diff == 0 ? 0 : 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int test_deblock_intra_all(void)
|
||||||
|
{
|
||||||
|
intra_test_spec specs[] = {
|
||||||
|
{ "luma v intra", 8, 16, 8, 4 * 16,
|
||||||
|
daedalus_h264_v_loop_filter_luma_intra_ref,
|
||||||
|
daedalus_recipe_dispatch_h264_deblock_luma_v_intra },
|
||||||
|
{ "luma h intra", 8, 8, 16, 4,
|
||||||
|
daedalus_h264_h_loop_filter_luma_intra_ref,
|
||||||
|
daedalus_recipe_dispatch_h264_deblock_luma_h_intra },
|
||||||
|
{ "chroma v intra", 8, 8, 4, 2 * 8,
|
||||||
|
daedalus_h264_v_loop_filter_chroma_intra_ref,
|
||||||
|
daedalus_recipe_dispatch_h264_deblock_chroma_v_intra },
|
||||||
|
{ "chroma h intra", 8, 4, 8, 2,
|
||||||
|
daedalus_h264_h_loop_filter_chroma_intra_ref,
|
||||||
|
daedalus_recipe_dispatch_h264_deblock_chroma_h_intra },
|
||||||
|
};
|
||||||
|
int fail = 0;
|
||||||
|
for (size_t i = 0; i < sizeof(specs)/sizeof(specs[0]); i++)
|
||||||
|
fail |= run_intra_test(&specs[i]);
|
||||||
|
return fail;
|
||||||
|
}
|
||||||
|
|
||||||
static int test_qpel_mc20(void)
|
static int test_qpel_mc20(void)
|
||||||
{
|
{
|
||||||
/* Cycle 9 — one 8x8 block per 16-wide row-tile, 8 tiles. Each tile
|
/* Cycle 9 — one 8x8 block per 16-wide row-tile, 8 tiles. Each tile
|
||||||
@@ -336,6 +417,8 @@ int main(void)
|
|||||||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV));
|
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CV));
|
||||||
printf(" H264_DEBLOCK_CH recipe substrate: %d (CPU)\n",
|
printf(" H264_DEBLOCK_CH recipe substrate: %d (CPU)\n",
|
||||||
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH));
|
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_CH));
|
||||||
|
printf(" H264_DEBLOCK_*_INTRA recipe substrate: %d (CPU, bS=4 set)\n",
|
||||||
|
(int) daedalus_recipe_substrate_for(DAEDALUS_KERNEL_H264_DEBLOCK_LV_INTRA));
|
||||||
|
|
||||||
int fail = 0;
|
int fail = 0;
|
||||||
fail |= test_idct4();
|
fail |= test_idct4();
|
||||||
@@ -344,6 +427,7 @@ int main(void)
|
|||||||
fail |= test_deblock_h();
|
fail |= test_deblock_h();
|
||||||
fail |= test_deblock_chroma_v();
|
fail |= test_deblock_chroma_v();
|
||||||
fail |= test_deblock_chroma_h();
|
fail |= test_deblock_chroma_h();
|
||||||
|
fail |= test_deblock_intra_all();
|
||||||
fail |= test_qpel_mc20();
|
fail |= test_qpel_mc20();
|
||||||
return fail;
|
return fail;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user