h264: promote remaining intra prediction modes (17) to public API

Follows PR #26 (Intra_4x4 luma) with the same promotion pattern for the rest of the intra prediction primitive set:

- Intra_16x16 luma (4 modes, PR #13) — V/H/DC/Plane
- Intra_8x8 chroma (4 modes, PR #14) — DC/H/V/Plane (4:2:0)
- Intra_8x8 luma (9 modes, PRs #21 + #22) — High profile, with 1-2-1 pre-filter

3 file moves via `git mv`, ~17 function renames stripping the `_ref` suffix. Test binaries rewired to link daedalus_core instead of compiling the (now moved) ref files directly. No code change — pure plumbing for substitution-arc consumers.

26 intra prediction modes total now in the public API after this PR.

Verified on hertz:

- test_intra_pred_16x16: 5/5 PASS
- test_intra_pred_chroma8x8: 5/5 PASS
- test_intra_pred_8x8_luma: 11/11 PASS

All via public symbols (test binaries linked against daedalus_core).

Unblocks marfrit-packages substitution arc patch 0014 — wires H264PredContext.pred4x4[], pred16x16[], pred8x8[], pred8x8l[] through daedalus alongside the existing IDCT / deblock / qpel / DC Hadamard substitutions. After 0014 lands, the libavcodec.so built by marfrit-packages will have EVERY hot-path pixel-math kernel of an H.264 8-bit 4:2:0 decode routing through daedalus — the substitution arc is feature-complete for the campaign target (Pi 5 Firefox YouTube playback).
2026-05-25 15:37:44 +02:00
parent 31c68d0d0e
commit cb3aef3dac
8 changed files with 108 additions and 73 deletions
@@ -1,106 +0,0 @@
-/*
- * Standalone bit-exact C reference for H.264 luma Intra_16x16
- * prediction modes (per H.264 spec §8.3.2).  All 4 modes.
- *
- * Mode index → name (per H.264 Table 7-15):
- *   0 = Vertical
- *   1 = Horizontal
- *   2 = DC
- *   3 = Plane
- *
- * Calling convention (FFmpeg-style, matches the Intra_4x4 ref):
- *   pred_16x16_<mode>(uint8_t *dst, ptrdiff_t stride)
- *
- * `dst` points at row 0, col 0 of the 16x16 output block.  Neighbours:
- *   top[0..15]  = dst[-stride + 0 .. -stride + 15]
- *   top-left    = dst[-stride - 1]
- *   left[0..15] = dst[ 0*stride - 1 .. 15*stride - 1]
- *
- * AVAILABILITY: assumes all neighbours valid (interior-MB case).  The
- * H.264 spec defines fallback for boundary cases (DC averages just
- * the available side, etc.); the eventual libavcodec intercept
- * handles availability before calling.
- *
- * License: BSD-2-Clause.
- */
-#include <stdint.h>
-#include <stddef.h>
-
-static inline int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }
-
-/* Mode 0 — Vertical: each col = top[col]. */
-void daedalus_h264_pred_16x16_vertical_ref(uint8_t *dst, ptrdiff_t stride)
-{
-    const uint8_t *top = dst - stride;
-    for (int r = 0; r < 16; r++)
-        for (int c = 0; c < 16; c++) dst[r * stride + c] = top[c];
-}
-
-/* Mode 1 — Horizontal: each row = left[row]. */
-void daedalus_h264_pred_16x16_horizontal_ref(uint8_t *dst, ptrdiff_t stride)
-{
-    for (int r = 0; r < 16; r++) {
-        uint8_t l = dst[r * stride - 1];
-        for (int c = 0; c < 16; c++) dst[r * stride + c] = l;
-    }
-}
-
-/* Mode 2 — DC: ((sum_top16 + sum_left16 + 16) >> 5) broadcast. */
-void daedalus_h264_pred_16x16_dc_ref(uint8_t *dst, ptrdiff_t stride)
-{
-    const uint8_t *top = dst - stride;
-    int sum = 16;  /* rounding for >> 5 over 32 samples */
-    for (int i = 0; i < 16; i++) sum += top[i];
-    for (int i = 0; i < 16; i++) sum += dst[i * stride - 1];
-    uint8_t v = (uint8_t)(sum >> 5);
-    for (int r = 0; r < 16; r++)
-        for (int c = 0; c < 16; c++) dst[r * stride + c] = v;
-}
-
-/* Mode 3 — Plane (per H.264 §8.3.2.4):
- *   H = sum_{i=0..7} (i+1) * (p[7+i+1, -1] - p[7-i-1, -1])
- *     = sum_{i=0..7} (i+1) * (top[8+i] - top[6-i])
- *   V = sum_{j=0..7} (j+1) * (p[-1, 7+j+1] - p[-1, 7-j-1])
- *     = sum_{j=0..7} (j+1) * (left[8+j] - left[6-j])
- *   b = (5*H + 32) >> 6
- *   c = (5*V + 32) >> 6
- *   a = 16 * (p[-1, 15] + p[15, -1])
- *     = 16 * (left[15] + top[15])
- *   pred[y][x] = Clip1((a + b*(x-7) + c*(y-7) + 16) >> 5)
- *
- * Note: spec indexing uses [x, y] with x = col, y = row (or vice
- * versa depending on the section).  Here I use the FFmpeg convention
- * pred[y][x] = pred[row][col]; the H = horizontal-slope formula uses
- * the TOP row's left-vs-right asymmetry; V = vertical-slope uses the
- * LEFT col's top-vs-bottom asymmetry.  The top-left corner p[-1,-1]
- * DOES participate in both sums: the last term (i = 7 / j = 7) has
- * index 6 - 7 = -1, so it reads top[-1] — the same way the chroma
- * 8x8 plane mode reads it at i = 3 / j = 3.
- */
-void daedalus_h264_pred_16x16_plane_ref(uint8_t *dst, ptrdiff_t stride)
-{
-    const uint8_t *top = dst - stride;
-    /* H accumulates differences across the right vs left half of the
-     * top row.  Per spec, the top-left p[-1,-1] participates: i=7 uses
-     * p[15,-1] - p[-1,-1].  We include it by reading top[-1]. */
-    int H = 0, V = 0;
-    for (int i = 0; i < 8; i++) {
-        int t_right = top[8 + i];
-        int t_left  = (i == 7) ? top[-1] : top[6 - i];
-        H += (i + 1) * (t_right - t_left);
-    }
-    for (int j = 0; j < 8; j++) {
-        int l_bot = dst[(8 + j) * stride - 1];
-        int l_top = (j == 7) ? top[-1] : dst[(6 - j) * stride - 1];
-        V += (j + 1) * (l_bot - l_top);
-    }
-    int b = (5 * H + 32) >> 6;
-    int c = (5 * V + 32) >> 6;
-    int a = 16 * (dst[15 * stride - 1] + top[15]);
-    for (int y = 0; y < 16; y++) {
-        for (int x = 0; x < 16; x++) {
-            int v = (a + b * (x - 7) + c * (y - 7) + 16) >> 5;
-            dst[y * stride + x] = (uint8_t) clip_u8(v);
-        }
-    }
-}
@@ -1,305 +0,0 @@
-/*
- * Standalone bit-exact C reference for H.264 luma Intra_8x8
- * prediction modes (per H.264 spec §8.3.2.1).  High-profile-only
- * MB type — Baseline/Main/Extended profiles don't see Intra_8x8.
- *
- * Distinct from Intra_4x4 in two ways:
- *
- *   1. REFERENCE SAMPLE FILTERING (§8.3.2.1.1).  The 25 raw
- *      neighbour samples are pre-filtered with a 1-2-1 smoothing
- *      filter BEFORE prediction.  The filtering has spec-defined
- *      boundary handling at the corners and the right-edge of the
- *      top-row extension.
- *
- *   2. SCALE.  All 9 prediction modes operate at 8x8 with the
- *      filtered samples (Intra_4x4 operates at 4x4 with the raw
- *      samples).
- *
- * This file implements the filter + all 9 modes: the 3 simple modes
- * (Vertical, Horizontal, DC) and the 6 directional modes (DDL, DDR,
- * VR, HD, VL, HU at 8x8) — same template, different formulas; the
- * spec sections are cited in the per-mode comments below.
- *
- * Calling convention (FFmpeg-style):
- *   pred_8x8_<mode>_ref(uint8_t *dst, ptrdiff_t stride)
- *
- * `dst` points at row 0 col 0 of the 8x8 output block.  Reads from
- *   top[0..15]  = dst[-stride + 0..15]
- *   top-left    = dst[-stride - 1]
- *   left[0..7]  = dst[ 0*stride - 1 .. 7*stride - 1]
- *
- * AVAILABILITY: assumes all neighbours valid (interior-MB case).
- *
- * License: BSD-2-Clause.
- */
-#include <stdint.h>
-#include <stddef.h>
-#include <string.h>
-
-static inline int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }
-
-/* H.264 §8.3.2.1.1 reference sample filtering.  Filters the 25 raw
- * samples around the 8x8 block into a `filt` array with the same
- * indices.  When called against an "all neighbours available" tile,
- * the filtered output uses these spec-defined formulas:
- *
- *   filt[top -1] (= filtered top-left) = (top[0] + 2*tl + left[0] + 2) >> 2
- *
- *   filt[top  0] = (tl + 2*top[0] + top[1] + 2) >> 2
- *   filt[top  i] for 1<=i<=14 = (top[i-1] + 2*top[i] + top[i+1] + 2) >> 2
- *   filt[top 15] = (top[14] + 3*top[15] + 2) >> 2    (boundary)
- *
- *   filt[left 0] = (tl + 2*left[0] + left[1] + 2) >> 2
- *   filt[left j] for 1<=j<=6 = (left[j-1] + 2*left[j] + left[j+1] + 2) >> 2
- *   filt[left 7] = (left[6] + 3*left[7] + 2) >> 2    (boundary)
- *
- * Reads neighbours from the dst buffer; writes filtered values to
- * a caller-provided 25-element array indexed as:
- *   filt[0]      = filtered top-left
- *   filt[1..16]  = filtered top[0..15]
- *   filt[17..24] = filtered left[0..7]
- */
-static void filter_refs(const uint8_t *dst, ptrdiff_t stride,
-                         uint8_t filt[25])
-{
-    int tl = dst[-stride - 1];
-    int t[16];
-    for (int i = 0; i < 16; i++) t[i] = dst[-stride + i];
-    int l[8];
-    for (int j = 0; j < 8; j++) l[j] = dst[j * stride - 1];
-
-    /* Filtered top-left. */
-    filt[0] = (uint8_t)((t[0] + 2*tl + l[0] + 2) >> 2);
-
-    /* Filtered top. */
-    filt[1] = (uint8_t)((tl + 2*t[0] + t[1] + 2) >> 2);
-    for (int i = 1; i <= 14; i++)
-        filt[1 + i] = (uint8_t)((t[i-1] + 2*t[i] + t[i+1] + 2) >> 2);
-    filt[1 + 15] = (uint8_t)((t[14] + 3*t[15] + 2) >> 2);
-
-    /* Filtered left. */
-    filt[17 + 0] = (uint8_t)((tl + 2*l[0] + l[1] + 2) >> 2);
-    for (int j = 1; j <= 6; j++)
-        filt[17 + j] = (uint8_t)((l[j-1] + 2*l[j] + l[j+1] + 2) >> 2);
-    filt[17 + 7] = (uint8_t)((l[6] + 3*l[7] + 2) >> 2);
-}
-
-/* Convenience macros for accessing the filt[] array by spec-style index. */
-#define FT(i)  filt[1 + (i)]    /* filtered top[i],  i in 0..15  */
-#define FL(j)  filt[17 + (j)]   /* filtered left[j], j in 0..7   */
-#define FTL    filt[0]          /* filtered top-left              */
-
-/* Mode 0 Vertical (§8.3.2.1.2): pred[r,c] = filt_top[c]. */
-void daedalus_h264_pred_8x8l_vertical_ref(uint8_t *dst, ptrdiff_t stride)
-{
-    uint8_t filt[25];
-    filter_refs(dst, stride, filt);
-    for (int r = 0; r < 8; r++)
-        for (int c = 0; c < 8; c++) dst[r * stride + c] = FT(c);
-}
-
-/* Mode 1 Horizontal (§8.3.2.1.3): pred[r,c] = filt_left[r]. */
-void daedalus_h264_pred_8x8l_horizontal_ref(uint8_t *dst, ptrdiff_t stride)
-{
-    uint8_t filt[25];
-    filter_refs(dst, stride, filt);
-    for (int r = 0; r < 8; r++)
-        for (int c = 0; c < 8; c++) dst[r * stride + c] = FL(r);
-}
-
-/* Mode 2 DC (§8.3.2.1.4): ((sum_filt_top[0..7] + sum_filt_left[0..7]
- * + 8) >> 4) broadcast.  Note the +8 (not +4 like 4x4): there are
- * 16 samples summed total, so >> 4 with half-step rounding +8. */
-void daedalus_h264_pred_8x8l_dc_ref(uint8_t *dst, ptrdiff_t stride)
-{
-    uint8_t filt[25];
-    filter_refs(dst, stride, filt);
-    int sum = 8;
-    for (int i = 0; i < 8; i++) sum += FT(i);
-    for (int j = 0; j < 8; j++) sum += FL(j);
-    uint8_t v = (uint8_t)(sum >> 4);
-    for (int r = 0; r < 8; r++)
-        for (int c = 0; c < 8; c++) dst[r * stride + c] = v;
-}
-
-/* --- 6 directional modes for Intra_8x8 (H.264 §8.3.2.1.5..§8.3.2.1.10).
- * Transcribed from FFmpeg libavcodec/h264pred_template.c
- * pred8x8l_{down_left, down_right, vertical_right, horizontal_down,
- * vertical_left, horizontal_up} (LGPL-2.1+ in the original; algorithm
- * reproduced here for test purposes).
- *
- * All 6 use the same FILTERED reference samples produced by
- * filter_refs() above.  Mapping from FFmpeg's t0..t15 / l0..l7 / lt
- * notation:
- *     tN = FT(N)   for N in 0..15
- *     lN = FL(N)   for N in 0..7
- *     lt = FTL
- *
- * SRC(x,y) maps to dst[y*stride + x] (col x, row y).
- */
-#define SRC(x, y) dst[(y) * stride + (x)]
-#define T(i)  FT(i)
-#define L(j)  FL(j)
-#define LT    FTL
-
-/* Mode 3 DDL (Diagonal_Down_Left) — uses TOP + TOP_RIGHT, no LEFT. */
-void daedalus_h264_pred_8x8l_ddl_ref(uint8_t *dst, ptrdiff_t stride)
-{
-    uint8_t filt[25];
-    filter_refs(dst, stride, filt);
-    SRC(0,0)= (T(0) + 2*T(1) + T(2) + 2) >> 2;
-    SRC(0,1)=SRC(1,0)= (T(1) + 2*T(2) + T(3) + 2) >> 2;
-    SRC(0,2)=SRC(1,1)=SRC(2,0)= (T(2) + 2*T(3) + T(4) + 2) >> 2;
-    SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (T(3) + 2*T(4) + T(5) + 2) >> 2;
-    SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (T(4) + 2*T(5) + T(6) + 2) >> 2;
-    SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (T(5) + 2*T(6) + T(7) + 2) >> 2;
-    SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (T(6) + 2*T(7) + T(8) + 2) >> 2;
-    SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (T(7) + 2*T(8) + T(9) + 2) >> 2;
-    SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (T(8) + 2*T(9) + T(10) + 2) >> 2;
-    SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (T(9) + 2*T(10) + T(11) + 2) >> 2;
-    SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (T(10) + 2*T(11) + T(12) + 2) >> 2;
-    SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (T(11) + 2*T(12) + T(13) + 2) >> 2;
-    SRC(5,7)=SRC(6,6)=SRC(7,5)= (T(12) + 2*T(13) + T(14) + 2) >> 2;
-    SRC(6,7)=SRC(7,6)= (T(13) + 2*T(14) + T(15) + 2) >> 2;
-    SRC(7,7)= (T(14) + 3*T(15) + 2) >> 2;
-}
-
-/* Mode 4 DDR (Diagonal_Down_Right). */
-void daedalus_h264_pred_8x8l_ddr_ref(uint8_t *dst, ptrdiff_t stride)
-{
-    uint8_t filt[25];
-    filter_refs(dst, stride, filt);
-    SRC(0,7)= (L(7) + 2*L(6) + L(5) + 2) >> 2;
-    SRC(0,6)=SRC(1,7)= (L(6) + 2*L(5) + L(4) + 2) >> 2;
-    SRC(0,5)=SRC(1,6)=SRC(2,7)= (L(5) + 2*L(4) + L(3) + 2) >> 2;
-    SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (L(4) + 2*L(3) + L(2) + 2) >> 2;
-    SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (L(3) + 2*L(2) + L(1) + 2) >> 2;
-    SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (L(2) + 2*L(1) + L(0) + 2) >> 2;
-    SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (L(1) + 2*L(0) + LT + 2) >> 2;
-    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (L(0) + 2*LT + T(0) + 2) >> 2;
-    SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (LT + 2*T(0) + T(1) + 2) >> 2;
-    SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (T(0) + 2*T(1) + T(2) + 2) >> 2;
-    SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (T(1) + 2*T(2) + T(3) + 2) >> 2;
-    SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (T(2) + 2*T(3) + T(4) + 2) >> 2;
-    SRC(5,0)=SRC(6,1)=SRC(7,2)= (T(3) + 2*T(4) + T(5) + 2) >> 2;
-    SRC(6,0)=SRC(7,1)= (T(4) + 2*T(5) + T(6) + 2) >> 2;
-    SRC(7,0)= (T(5) + 2*T(6) + T(7) + 2) >> 2;
-}
-
-/* Mode 5 VR (Vertical_Right). */
-void daedalus_h264_pred_8x8l_vr_ref(uint8_t *dst, ptrdiff_t stride)
-{
-    uint8_t filt[25];
-    filter_refs(dst, stride, filt);
-    SRC(0,6)= (L(5) + 2*L(4) + L(3) + 2) >> 2;
-    SRC(0,7)= (L(6) + 2*L(5) + L(4) + 2) >> 2;
-    SRC(0,4)=SRC(1,6)= (L(3) + 2*L(2) + L(1) + 2) >> 2;
-    SRC(0,5)=SRC(1,7)= (L(4) + 2*L(3) + L(2) + 2) >> 2;
-    SRC(0,2)=SRC(1,4)=SRC(2,6)= (L(1) + 2*L(0) + LT + 2) >> 2;
-    SRC(0,3)=SRC(1,5)=SRC(2,7)= (L(2) + 2*L(1) + L(0) + 2) >> 2;
-    SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (L(0) + 2*LT + T(0) + 2) >> 2;
-    SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (LT + T(0) + 1) >> 1;
-    SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (LT + 2*T(0) + T(1) + 2) >> 2;
-    SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (T(0) + T(1) + 1) >> 1;
-    SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (T(0) + 2*T(1) + T(2) + 2) >> 2;
-    SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (T(1) + T(2) + 1) >> 1;
-    SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (T(1) + 2*T(2) + T(3) + 2) >> 2;
-    SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (T(2) + T(3) + 1) >> 1;
-    SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (T(2) + 2*T(3) + T(4) + 2) >> 2;
-    SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (T(3) + T(4) + 1) >> 1;
-    SRC(5,1)=SRC(6,3)=SRC(7,5)= (T(3) + 2*T(4) + T(5) + 2) >> 2;
-    SRC(5,0)=SRC(6,2)=SRC(7,4)= (T(4) + T(5) + 1) >> 1;
-    SRC(6,1)=SRC(7,3)= (T(4) + 2*T(5) + T(6) + 2) >> 2;
-    SRC(6,0)=SRC(7,2)= (T(5) + T(6) + 1) >> 1;
-    SRC(7,1)= (T(5) + 2*T(6) + T(7) + 2) >> 2;
-    SRC(7,0)= (T(6) + T(7) + 1) >> 1;
-}
-
-/* Mode 6 HD (Horizontal_Down). */
-void daedalus_h264_pred_8x8l_hd_ref(uint8_t *dst, ptrdiff_t stride)
-{
-    uint8_t filt[25];
-    filter_refs(dst, stride, filt);
-    SRC(0,7)= (L(6) + L(7) + 1) >> 1;
-    SRC(1,7)= (L(5) + 2*L(6) + L(7) + 2) >> 2;
-    SRC(0,6)=SRC(2,7)= (L(5) + L(6) + 1) >> 1;
-    SRC(1,6)=SRC(3,7)= (L(4) + 2*L(5) + L(6) + 2) >> 2;
-    SRC(0,5)=SRC(2,6)=SRC(4,7)= (L(4) + L(5) + 1) >> 1;
-    SRC(1,5)=SRC(3,6)=SRC(5,7)= (L(3) + 2*L(4) + L(5) + 2) >> 2;
-    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (L(3) + L(4) + 1) >> 1;
-    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (L(2) + 2*L(3) + L(4) + 2) >> 2;
-    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (L(2) + L(3) + 1) >> 1;
-    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (L(1) + 2*L(2) + L(3) + 2) >> 2;
-    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (L(1) + L(2) + 1) >> 1;
-    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (L(0) + 2*L(1) + L(2) + 2) >> 2;
-    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (L(0) + L(1) + 1) >> 1;
-    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (LT + 2*L(0) + L(1) + 2) >> 2;
-    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (LT + L(0) + 1) >> 1;
-    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (L(0) + 2*LT + T(0) + 2) >> 2;
-    SRC(2,0)=SRC(4,1)=SRC(6,2)= (T(1) + 2*T(0) + LT + 2) >> 2;
-    SRC(3,0)=SRC(5,1)=SRC(7,2)= (T(2) + 2*T(1) + T(0) + 2) >> 2;
-    SRC(4,0)=SRC(6,1)= (T(3) + 2*T(2) + T(1) + 2) >> 2;
-    SRC(5,0)=SRC(7,1)= (T(4) + 2*T(3) + T(2) + 2) >> 2;
-    SRC(6,0)= (T(5) + 2*T(4) + T(3) + 2) >> 2;
-    SRC(7,0)= (T(6) + 2*T(5) + T(4) + 2) >> 2;
-}
-
-/* Mode 7 VL (Vertical_Left) — uses TOP + TOP_RIGHT only. */
-void daedalus_h264_pred_8x8l_vl_ref(uint8_t *dst, ptrdiff_t stride)
-{
-    uint8_t filt[25];
-    filter_refs(dst, stride, filt);
-    SRC(0,0)= (T(0) + T(1) + 1) >> 1;
-    SRC(0,1)= (T(0) + 2*T(1) + T(2) + 2) >> 2;
-    SRC(0,2)=SRC(1,0)= (T(1) + T(2) + 1) >> 1;
-    SRC(0,3)=SRC(1,1)= (T(1) + 2*T(2) + T(3) + 2) >> 2;
-    SRC(0,4)=SRC(1,2)=SRC(2,0)= (T(2) + T(3) + 1) >> 1;
-    SRC(0,5)=SRC(1,3)=SRC(2,1)= (T(2) + 2*T(3) + T(4) + 2) >> 2;
-    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (T(3) + T(4) + 1) >> 1;
-    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (T(3) + 2*T(4) + T(5) + 2) >> 2;
-    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (T(4) + T(5) + 1) >> 1;
-    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (T(4) + 2*T(5) + T(6) + 2) >> 2;
-    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (T(5) + T(6) + 1) >> 1;
-    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (T(5) + 2*T(6) + T(7) + 2) >> 2;
-    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (T(6) + T(7) + 1) >> 1;
-    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (T(6) + 2*T(7) + T(8) + 2) >> 2;
-    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (T(7) + T(8) + 1) >> 1;
-    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (T(7) + 2*T(8) + T(9) + 2) >> 2;
-    SRC(5,6)=SRC(6,4)=SRC(7,2)= (T(8) + T(9) + 1) >> 1;
-    SRC(5,7)=SRC(6,5)=SRC(7,3)= (T(8) + 2*T(9) + T(10) + 2) >> 2;
-    SRC(6,6)=SRC(7,4)= (T(9) + T(10) + 1) >> 1;
-    SRC(6,7)=SRC(7,5)= (T(9) + 2*T(10) + T(11) + 2) >> 2;
-    SRC(7,6)= (T(10) + T(11) + 1) >> 1;
-    SRC(7,7)= (T(10) + 2*T(11) + T(12) + 2) >> 2;
-}
-
-/* Mode 8 HU (Horizontal_Up) — uses LEFT only. */
-void daedalus_h264_pred_8x8l_hu_ref(uint8_t *dst, ptrdiff_t stride)
-{
-    uint8_t filt[25];
-    filter_refs(dst, stride, filt);
-    SRC(0,0)= (L(0) + L(1) + 1) >> 1;
-    SRC(1,0)= (L(0) + 2*L(1) + L(2) + 2) >> 2;
-    SRC(0,1)=SRC(2,0)= (L(1) + L(2) + 1) >> 1;
-    SRC(1,1)=SRC(3,0)= (L(1) + 2*L(2) + L(3) + 2) >> 2;
-    SRC(0,2)=SRC(2,1)=SRC(4,0)= (L(2) + L(3) + 1) >> 1;
-    SRC(1,2)=SRC(3,1)=SRC(5,0)= (L(2) + 2*L(3) + L(4) + 2) >> 2;
-    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (L(3) + L(4) + 1) >> 1;
-    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (L(3) + 2*L(4) + L(5) + 2) >> 2;
-    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (L(4) + L(5) + 1) >> 1;
-    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (L(4) + 2*L(5) + L(6) + 2) >> 2;
-    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (L(5) + L(6) + 1) >> 1;
-    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (L(5) + 2*L(6) + L(7) + 2) >> 2;
-    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (L(6) + L(7) + 1) >> 1;
-    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (L(6) + 3*L(7) + 2) >> 2;
-    /* 20 positions all = L(7) per FFmpeg lines 1097-1100. */
-    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
-    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
-    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
-    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= L(7);
-}
-
-#undef SRC
-#undef T
-#undef L
-#undef LT
@@ -1,123 +0,0 @@
-/*
- * Standalone bit-exact C reference for H.264 chroma Intra_8x8
- * prediction modes (per H.264 §8.3.3), used for both Cb and Cr
- * planes at 4:2:0.  All 4 modes.
- *
- * Mode index → name (per H.264 Table 7-16):
- *   0 = DC          (per-quadrant — asymmetric, see §8.3.3.2)
- *   1 = Horizontal
- *   2 = Vertical
- *   3 = Plane       (slope coefficient 34, distinct from luma's 5)
- *
- * Calling convention (same shape as luma intra refs):
- *   pred_chroma8x8_<mode>(uint8_t *dst, ptrdiff_t stride)
- *
- * `dst` points at row 0, col 0 of the 8x8 output block (single
- * component plane — Cb or Cr, dispatched independently).  Neighbours:
- *   top[0..7]   = dst[-stride + 0 .. -stride + 7]
- *   top-left    = dst[-stride - 1]
- *   left[0..7]  = dst[ 0*stride - 1 .. 7*stride - 1]
- *
- * AVAILABILITY: assumes all neighbours valid (interior-MB case).
- * The H.264 spec defines per-quadrant fallback for the DC mode at
- * MB boundaries; that's caller-side via the libavcodec intercept.
- *
- * License: BSD-2-Clause.
- */
-#include <stdint.h>
-#include <stddef.h>
-
-static inline int clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }
-
-/* Mode 0 — DC (per-quadrant, 4:2:0 layout per §8.3.3.2).
- *
- * The 8×8 block is split into four 4×4 quadrants.  For interior
- * MBs (all neighbours available), the DC value per quadrant uses:
- *   (0,0) top-left  : (sum_top[0..3] + sum_left[0..3] + 4) >> 3
- *   (0,1) top-right : (sum_top[4..7]                  + 2) >> 2
- *   (1,0) bot-left  : (sum_left[4..7]                 + 2) >> 2
- *   (1,1) bot-right : (sum_top[4..7] + sum_left[4..7] + 4) >> 3
- *
- * The asymmetry mirrors what neighbours are "logically available"
- * for each quadrant in the spec's availability model.  Top-right
- * quadrant ignores the top-left-half because that half is "vertically
- * above" the top-left quadrant; the spec uses top[4..7] only.
- */
-void daedalus_h264_pred_chroma8x8_dc_ref(uint8_t *dst, ptrdiff_t stride)
-{
-    const uint8_t *top = dst - stride;
-    int top_lo = 0, top_hi = 0, left_lo = 0, left_hi = 0;
-    for (int i = 0; i < 4; i++) {
-        top_lo  += top[i];
-        top_hi  += top[4 + i];
-        left_lo += dst[i * stride - 1];
-        left_hi += dst[(4 + i) * stride - 1];
-    }
-    uint8_t dc00 = (uint8_t)((top_lo  + left_lo + 4) >> 3);  /* top-left */
-    uint8_t dc01 = (uint8_t)((top_hi             + 2) >> 2); /* top-right */
-    uint8_t dc10 = (uint8_t)((           left_hi + 2) >> 2); /* bot-left  */
-    uint8_t dc11 = (uint8_t)((top_hi  + left_hi + 4) >> 3);  /* bot-right */
-    for (int r = 0; r < 4; r++) {
-        for (int c = 0; c < 4; c++) {
-            dst[(    r) * stride +     c    ] = dc00;
-            dst[(    r) * stride + 4 + c    ] = dc01;
-            dst[(4 + r) * stride +     c    ] = dc10;
-            dst[(4 + r) * stride + 4 + c    ] = dc11;
-        }
-    }
-}
-
-/* Mode 1 — Horizontal: each row = left[row]. */
-void daedalus_h264_pred_chroma8x8_horizontal_ref(uint8_t *dst, ptrdiff_t stride)
-{
-    for (int r = 0; r < 8; r++) {
-        uint8_t l = dst[r * stride - 1];
-        for (int c = 0; c < 8; c++) dst[r * stride + c] = l;
-    }
-}
-
-/* Mode 2 — Vertical: each col = top[col]. */
-void daedalus_h264_pred_chroma8x8_vertical_ref(uint8_t *dst, ptrdiff_t stride)
-{
-    const uint8_t *top = dst - stride;
-    for (int r = 0; r < 8; r++)
-        for (int c = 0; c < 8; c++) dst[r * stride + c] = top[c];
-}
-
-/* Mode 3 — Plane (per H.264 §8.3.3.4):
- *   H = sum_{i=0..3} (i+1) * (p[4+i, -1]  - p[2-i, -1])    ; i=3 uses p[-1,-1]
- *   V = sum_{j=0..3} (j+1) * (p[-1, 4+j]  - p[-1, 2-j])    ; j=3 uses p[-1,-1]
- *   b = (34 * H + 32) >> 6
- *   c = (34 * V + 32) >> 6
- *   a = 16 * (p[-1, 7] + p[7, -1])
- *   pred[y][x] = Clip1((a + b*(x - 3) + c*(y - 3) + 16) >> 5)
- *
- * Distinct from the Intra_16x16 luma Plane:
- *   - Slope coefficient is 34 (not 5).
- *   - Centre is (x-3, y-3) (not x-7, y-7).
- *   - Spans 4 differences per sum (not 8).
- */
-void daedalus_h264_pred_chroma8x8_plane_ref(uint8_t *dst, ptrdiff_t stride)
-{
-    const uint8_t *top = dst - stride;
-    int H = 0, V = 0;
-    for (int i = 0; i < 4; i++) {
-        int t_right = top[4 + i];
-        int t_left  = (i == 3) ? top[-1] : top[2 - i];
-        H += (i + 1) * (t_right - t_left);
-    }
-    for (int j = 0; j < 4; j++) {
-        int l_bot = dst[(4 + j) * stride - 1];
-        int l_top = (j == 3) ? top[-1] : dst[(2 - j) * stride - 1];
-        V += (j + 1) * (l_bot - l_top);
-    }
-    int b = (34 * H + 32) >> 6;
-    int c = (34 * V + 32) >> 6;
-    int a = 16 * (dst[7 * stride - 1] + top[7]);
-    for (int y = 0; y < 8; y++) {
-        for (int x = 0; x < 8; x++) {
-            int v = (a + b * (x - 3) + c * (y - 3) + 16) >> 5;
-            dst[y * stride + x] = (uint8_t) clip_u8(v);
-        }
-    }
-}
@@ -18,10 +18,10 @@
 #include <stdio.h>
 #include <string.h>

-extern void daedalus_h264_pred_16x16_vertical_ref(uint8_t *dst, ptrdiff_t stride);
-extern void daedalus_h264_pred_16x16_horizontal_ref(uint8_t *dst, ptrdiff_t stride);
-extern void daedalus_h264_pred_16x16_dc_ref(uint8_t *dst, ptrdiff_t stride);
-extern void daedalus_h264_pred_16x16_plane_ref(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_16x16_vertical(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_16x16_horizontal(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_16x16_dc(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_16x16_plane(uint8_t *dst, ptrdiff_t stride);

 #define STRIDE 17
 #define ROWS   17
@@ -84,7 +84,7 @@ int main(void)
        int t[16], l[16];
        for (int i = 0; i < 16; i++) { t[i] = 10 + i; l[i] = 0; }
        set_ctx(buf, 0, t, l);
-        daedalus_h264_pred_16x16_vertical_ref(&buf[1][1], STRIDE);
+        daedalus_h264_pred_16x16_vertical(&buf[1][1], STRIDE);
        struct vertical_ctx vc = { t };
        fail |= check(buf, "Vertical (mode 0)", expect_vertical, &vc);
    }
@@ -95,7 +95,7 @@ int main(void)
        int t[16] = {0}, l[16];
        for (int i = 0; i < 16; i++) l[i] = 50 + i;
        set_ctx(buf, 0, t, l);
-        daedalus_h264_pred_16x16_horizontal_ref(&buf[1][1], STRIDE);
+        daedalus_h264_pred_16x16_horizontal(&buf[1][1], STRIDE);
        struct horizontal_ctx hc = { l };
        fail |= check(buf, "Horizontal (mode 1)", expect_horizontal, &hc);
    }
@@ -108,7 +108,7 @@ int main(void)
        int t[16], l[16];
        for (int i = 0; i < 16; i++) { t[i] = 2; l[i] = 6; }
        set_ctx(buf, 99, t, l);
-        daedalus_h264_pred_16x16_dc_ref(&buf[1][1], STRIDE);
+        daedalus_h264_pred_16x16_dc(&buf[1][1], STRIDE);
        uint8_t exp_val = 4;
        fail |= check(buf, "DC (mode 2)", expect_uniform, &exp_val);
    }
@@ -123,7 +123,7 @@ int main(void)
        int t[16], l[16];
        for (int i = 0; i < 16; i++) { t[i] = 100; l[i] = 100; }
        set_ctx(buf, 100, t, l);   /* uniform tl too — H/V sums actually zero */
-        daedalus_h264_pred_16x16_plane_ref(&buf[1][1], STRIDE);
+        daedalus_h264_pred_16x16_plane(&buf[1][1], STRIDE);
        uint8_t exp_val = 100;
        fail |= check(buf, "Plane (mode 3, uniform)", expect_uniform, &exp_val);
    }
@@ -150,7 +150,7 @@ int main(void)
        int t[16], l[16];
        for (int i = 0; i < 16; i++) { t[i] = i; l[i] = i; }
        set_ctx(buf, 0, t, l);
-        daedalus_h264_pred_16x16_plane_ref(&buf[1][1], STRIDE);
+        daedalus_h264_pred_16x16_plane(&buf[1][1], STRIDE);
        uint8_t tl_actual = buf[1 + 0][1 + 0];
        uint8_t br_actual = buf[1 + 15][1 + 15];
        int spot_fail = 0;
@@ -14,15 +14,15 @@
 #include <stdio.h>
 #include <string.h>

-extern void daedalus_h264_pred_8x8l_vertical_ref(uint8_t *dst, ptrdiff_t stride);
-extern void daedalus_h264_pred_8x8l_horizontal_ref(uint8_t *dst, ptrdiff_t stride);
-extern void daedalus_h264_pred_8x8l_dc_ref(uint8_t *dst, ptrdiff_t stride);
-extern void daedalus_h264_pred_8x8l_ddl_ref(uint8_t *dst, ptrdiff_t stride);
-extern void daedalus_h264_pred_8x8l_ddr_ref(uint8_t *dst, ptrdiff_t stride);
-extern void daedalus_h264_pred_8x8l_vr_ref(uint8_t *dst, ptrdiff_t stride);
-extern void daedalus_h264_pred_8x8l_hd_ref(uint8_t *dst, ptrdiff_t stride);
-extern void daedalus_h264_pred_8x8l_vl_ref(uint8_t *dst, ptrdiff_t stride);
-extern void daedalus_h264_pred_8x8l_hu_ref(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_8x8l_vertical(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_8x8l_horizontal(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_8x8l_dc(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_8x8l_ddl(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_8x8l_ddr(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_8x8l_vr(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_8x8l_hd(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_8x8l_vl(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_8x8l_hu(uint8_t *dst, ptrdiff_t stride);

 #define STRIDE 17
 #define ROWS   9
@@ -61,7 +61,7 @@ int main(void)
        for (int i = 0; i < 16; i++) t[i] = 50;
        for (int j = 0; j < 8; j++)  l[j] = 0;
        set_ctx(buf, 50, t, l);
-        daedalus_h264_pred_8x8l_vertical_ref(&buf[1][1], STRIDE);
+        daedalus_h264_pred_8x8l_vertical(&buf[1][1], STRIDE);
        fail |= check_uniform(buf, "Vertical (mode 0, uniform top)", 50);
    }

@@ -71,7 +71,7 @@ int main(void)
        int t[16] = {0}, l[8];
        for (int j = 0; j < 8; j++) l[j] = 70;
        set_ctx(buf, 70, t, l);
-        daedalus_h264_pred_8x8l_horizontal_ref(&buf[1][1], STRIDE);
+        daedalus_h264_pred_8x8l_horizontal(&buf[1][1], STRIDE);
        fail |= check_uniform(buf, "Horizontal (mode 1, uniform left)", 70);
    }

@@ -84,7 +84,7 @@ int main(void)
        for (int i = 0; i < 16; i++) t[i] = 33;
        for (int j = 0; j < 8; j++)  l[j] = 33;
        set_ctx(buf, 33, t, l);
-        daedalus_h264_pred_8x8l_dc_ref(&buf[1][1], STRIDE);
+        daedalus_h264_pred_8x8l_dc(&buf[1][1], STRIDE);
        fail |= check_uniform(buf, "DC (mode 2, uniform)", 33);
    }

@@ -108,7 +108,7 @@ int main(void)
        int t[16], l[8] = {0};
        for (int i = 0; i < 16; i++) t[i] = i;
        set_ctx(buf, 0, t, l);
-        daedalus_h264_pred_8x8l_vertical_ref(&buf[1][1], STRIDE);
+        daedalus_h264_pred_8x8l_vertical(&buf[1][1], STRIDE);
        int diff = 0;
        for (int r = 0; r < 8; r++)
            for (int c = 0; c < 8; c++)
@@ -129,7 +129,7 @@ int main(void)
        int t[16] = {0}, l[8];
        for (int j = 0; j < 8; j++) l[j] = j;
        set_ctx(buf, 0, t, l);
-        daedalus_h264_pred_8x8l_horizontal_ref(&buf[1][1], STRIDE);
+        daedalus_h264_pred_8x8l_horizontal(&buf[1][1], STRIDE);
        int diff = 0;
        for (int r = 0; r < 8; r++)
            for (int c = 0; c < 8; c++)
@@ -146,12 +146,12 @@ int main(void)
    {
        typedef void (*pred_fn_t)(uint8_t *dst, ptrdiff_t stride);
        struct { const char *name; pred_fn_t fn; } modes[] = {
-            { "DDL (mode 3, uniform)",        daedalus_h264_pred_8x8l_ddl_ref },
-            { "DDR (mode 4, uniform)",        daedalus_h264_pred_8x8l_ddr_ref },
-            { "VR (mode 5, uniform)",         daedalus_h264_pred_8x8l_vr_ref  },
-            { "HD (mode 6, uniform)",         daedalus_h264_pred_8x8l_hd_ref  },
-            { "VL (mode 7, uniform)",         daedalus_h264_pred_8x8l_vl_ref  },
-            { "HU (mode 8, uniform)",         daedalus_h264_pred_8x8l_hu_ref  },
+            { "DDL (mode 3, uniform)",        daedalus_h264_pred_8x8l_ddl },
+            { "DDR (mode 4, uniform)",        daedalus_h264_pred_8x8l_ddr },
+            { "VR (mode 5, uniform)",         daedalus_h264_pred_8x8l_vr  },
+            { "HD (mode 6, uniform)",         daedalus_h264_pred_8x8l_hd  },
+            { "VL (mode 7, uniform)",         daedalus_h264_pred_8x8l_vl  },
+            { "HU (mode 8, uniform)",         daedalus_h264_pred_8x8l_hu  },
        };
        for (size_t i = 0; i < sizeof(modes)/sizeof(modes[0]); i++) {
            uint8_t buf[ROWS][STRIDE];
@@ -16,10 +16,10 @@
 #include <stdio.h>
 #include <string.h>

-extern void daedalus_h264_pred_chroma8x8_dc_ref(uint8_t *dst, ptrdiff_t stride);
-extern void daedalus_h264_pred_chroma8x8_horizontal_ref(uint8_t *dst, ptrdiff_t stride);
-extern void daedalus_h264_pred_chroma8x8_vertical_ref(uint8_t *dst, ptrdiff_t stride);
-extern void daedalus_h264_pred_chroma8x8_plane_ref(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_chroma8x8_dc(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_chroma8x8_horizontal(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_chroma8x8_vertical(uint8_t *dst, ptrdiff_t stride);
+extern void daedalus_h264_pred_chroma8x8_plane(uint8_t *dst, ptrdiff_t stride);

 #define STRIDE 9
 #define ROWS   9
@@ -69,7 +69,7 @@ int main(void)
        uint8_t buf[ROWS][STRIDE];
        int t[8] = {0}, l[8] = {10, 20, 30, 40, 50, 60, 70, 80};
        set_ctx(buf, 0, t, l);
-        daedalus_h264_pred_chroma8x8_horizontal_ref(&buf[1][1], STRIDE);
+        daedalus_h264_pred_chroma8x8_horizontal(&buf[1][1], STRIDE);
        uint8_t exp[8][8];
        for (int r = 0; r < 8; r++) for (int c = 0; c < 8; c++) exp[r][c] = (uint8_t) l[r];
        fail |= check_per_cell(buf, "Horizontal (mode 1)", exp);
@@ -80,7 +80,7 @@ int main(void)
        uint8_t buf[ROWS][STRIDE];
        int t[8] = {15, 25, 35, 45, 55, 65, 75, 85}, l[8] = {0};
        set_ctx(buf, 0, t, l);
-        daedalus_h264_pred_chroma8x8_vertical_ref(&buf[1][1], STRIDE);
+        daedalus_h264_pred_chroma8x8_vertical(&buf[1][1], STRIDE);
        uint8_t exp[8][8];
        for (int r = 0; r < 8; r++) for (int c = 0; c < 8; c++) exp[r][c] = (uint8_t) t[c];
        fail |= check_per_cell(buf, "Vertical (mode 2)", exp);
@@ -104,7 +104,7 @@ int main(void)
        int t[8] = { 8, 8, 8, 8,  16, 16, 16, 16 };
        int l[8] = { 24, 24, 24, 24,  40, 40, 40, 40 };
        set_ctx(buf, 99, t, l);
-        daedalus_h264_pred_chroma8x8_dc_ref(&buf[1][1], STRIDE);
+        daedalus_h264_pred_chroma8x8_dc(&buf[1][1], STRIDE);
        uint8_t exp[8][8] = {
            {16,16,16,16, 16,16,16,16},
            {16,16,16,16, 16,16,16,16},
@@ -125,7 +125,7 @@ int main(void)
        int t[8], l[8];
        for (int i = 0; i < 8; i++) { t[i] = 100; l[i] = 100; }
        set_ctx(buf, 100, t, l);
-        daedalus_h264_pred_chroma8x8_plane_ref(&buf[1][1], STRIDE);
+        daedalus_h264_pred_chroma8x8_plane(&buf[1][1], STRIDE);
        uint8_t exp[8][8];
        for (int r = 0; r < 8; r++) for (int c = 0; c < 8; c++) exp[r][c] = 100;
        fail |= check_per_cell(buf, "Plane uniform (mode 3)", exp);
@@ -153,7 +153,7 @@ int main(void)
        int t[8], l[8];
        for (int i = 0; i < 8; i++) { t[i] = i; l[i] = i; }
        set_ctx(buf, 0, t, l);
-        daedalus_h264_pred_chroma8x8_plane_ref(&buf[1][1], STRIDE);
+        daedalus_h264_pred_chroma8x8_plane(&buf[1][1], STRIDE);
        uint8_t tl_actual = buf[1 + 0][1 + 0];
        uint8_t br_actual = buf[1 + 7][1 + 7];
        int spot_fail = 0;