cb3aef3dac
Follows PR #26 (Intra_4x4 luma) with the same promotion pattern for the rest of the intra prediction primitive set: Intra_16x16 luma (4 modes, PR #13) — V/H/DC/Plane Intra_8x8 chroma (4 modes, PR #14) — DC/H/V/Plane (4:2:0) Intra_8x8 luma (9 modes, PRs #21 + #22) — High profile, with 1-2-1 pre-filter 3 file moves via `git mv`, ~17 function renames stripping the `_ref` suffix. Test binaries rewired to link daedalus_core instead of compiling the (now moved) ref files directly. No code change — pure plumbing for substitution-arc consumers. 26 intra prediction modes total now in the public API after this PR. Verified on hertz: test_intra_pred_16x16: 5/5 PASS test_intra_pred_chroma8x8: 5/5 PASS test_intra_pred_8x8_luma: 11/11 PASS All via public symbols (test binaries linked against daedalus_core). Unblocks marfrit-packages substitution arc patch 0014 — wires H264PredContext.pred4x4[], pred16x16[], pred8x8[], pred8x8l[] through daedalus alongside the existing IDCT / deblock / qpel / DC Hadamard substitutions. After 0014 lands, the libavcodec.so built by marfrit-packages will have EVERY hot-path pixel-math kernel of an H.264 8-bit 4:2:0 decode routing through daedalus — the substitution arc is feature- complete for the campaign target (Pi 5 Firefox YouTube playback).
107 lines
3.9 KiB
C
107 lines
3.9 KiB
C
/*
|
|
* Standalone bit-exact C reference for H.264 luma Intra_16x16
|
|
* prediction modes (per H.264 spec §8.3.2). All 4 modes.
|
|
*
|
|
* Mode index → name (per H.264 Table 7-15):
|
|
* 0 = Vertical
|
|
* 1 = Horizontal
|
|
* 2 = DC
|
|
* 3 = Plane
|
|
*
|
|
* Calling convention (FFmpeg-style, matches the Intra_4x4 ref):
|
|
* daedalus_h264_pred_16x16_<mode>(uint8_t *dst, ptrdiff_t stride)
|
|
*
|
|
* `dst` points at row 0, col 0 of the 16x16 output block. Neighbours:
|
|
* top[0..15] = dst[-stride + 0 .. -stride + 15]
|
|
* top-left = dst[-stride - 1]
|
|
* left[0..15] = dst[ 0*stride - 1 .. 15*stride - 1]
|
|
*
|
|
* AVAILABILITY: assumes all neighbours valid (interior-MB case). The
|
|
* H.264 spec defines fallback for boundary cases (DC averages just
|
|
* the available side, etc.); the eventual libavcodec intercept
|
|
* handles availability before calling.
|
|
*
|
|
* License: BSD-2-Clause.
|
|
*/
|
|
#include <stdint.h>
|
|
#include <stddef.h>
|
|
|
|
/* Saturate v to the 8-bit sample range: values below 0 become 0, values
 * above 255 become 255, everything else is returned unchanged. */
static inline int clip_u8(int v)
{
    if (v < 0) {
        return 0;
    }
    if (v > 255) {
        return 255;
    }
    return v;
}
|
|
|
|
/*
 * Mode 0 — Vertical.
 *
 * Fills the 16x16 block at `dst` so that every row is a copy of the 16
 * samples immediately above the block, i.e. dst[-stride + 0 .. -stride + 15].
 * `stride` is the byte distance between vertically adjacent samples.
 * Only the 256 samples of the block itself are written.
 */
void daedalus_h264_pred_16x16_vertical(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;

    for (int y = 0; y < 16; y++) {
        uint8_t *row = dst + y * stride;

        for (int x = 0; x < 16; x++) {
            row[x] = above[x];
        }
    }
}
|
|
|
|
/*
 * Mode 1 — Horizontal.
 *
 * Fills the 16x16 block at `dst` so that every sample in row r equals the
 * sample just left of that row, dst[r * stride - 1]. The left neighbour is
 * read before the row is written; only the block's own samples are modified.
 */
void daedalus_h264_pred_16x16_horizontal(uint8_t *dst, ptrdiff_t stride)
{
    for (int y = 0; y < 16; y++) {
        uint8_t *row = dst + y * stride;
        const uint8_t left_sample = row[-1];

        for (int x = 0; x < 16; x++) {
            row[x] = left_sample;
        }
    }
}
|
|
|
|
/*
 * Mode 2 — DC.
 *
 * Averages the 16 samples above the block (dst[-stride + 0..15]) and the 16
 * samples to its left (dst[k * stride - 1], k = 0..15), rounding to nearest:
 *     dc = (sum_top + sum_left + 16) >> 5
 * and writes that single value to all 256 samples of the block.
 * Both neighbour edges are read unconditionally.
 */
void daedalus_h264_pred_16x16_dc(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;
    int top_sum = 0;
    int left_sum = 0;

    for (int k = 0; k < 16; k++) {
        top_sum += above[k];
        left_sum += dst[k * stride - 1];
    }

    /* +16 is the rounding term for the divide-by-32 that follows. */
    const uint8_t dc = (uint8_t)((top_sum + left_sum + 16) >> 5);

    for (int y = 0; y < 16; y++) {
        uint8_t *row = dst + y * stride;

        for (int x = 0; x < 16; x++) {
            row[x] = dc;
        }
    }
}
|
|
|
|
/* Mode 3 — Plane (per H.264 §8.3.2.4):
 *   H = sum_{i=0..7} (i+1) * (p[8+i, -1] - p[6-i, -1])
 *     = sum_{i=0..7} (i+1) * (top[8+i]  - top[6-i])
 *   V = sum_{j=0..7} (j+1) * (p[-1, 8+j] - p[-1, 6-j])
 *     = sum_{j=0..7} (j+1) * (left[8+j] - left[6-j])
 *   b = (5*H + 32) >> 6
 *   c = (5*V + 32) >> 6
 *   a = 16 * (p[-1, 15] + p[15, -1])
 *     = 16 * (left[15] + top[15])
 *   pred[y][x] = Clip1((a + b*(x-7) + c*(y-7) + 16) >> 5)
 *
 * Indexing: p[x, y] has x = column, y = row; the output is addressed as
 * pred[y][x] = pred[row][col]. H measures the horizontal slope from the
 * TOP row's right-vs-left asymmetry; V measures the vertical slope from
 * the LEFT column's bottom-vs-top asymmetry.
 *
 * The top-left corner p[-1,-1] DOES participate in both sums: for
 * i == 7 (resp. j == 7) the subtracted index 6-i (resp. 6-j) is -1,
 * and top[-1] and left[-1] are the same sample, dst[-stride - 1].
 * Plain indexing therefore reaches the corner without a special case.
 *
 * Reads: top[-1..15] and left[0..15]. Writes: the 256 block samples only.
 * All neighbours are read unconditionally.
 *
 * NOTE: the intermediate values can be negative, and `>>` on a negative
 * int is implementation-defined in C; this code (like the formulas above)
 * relies on an arithmetic shift.
 */
void daedalus_h264_pred_16x16_plane(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *top = dst - stride;   /* top[x]           == p[x, -1]  */
    const uint8_t *left = dst - 1;       /* left[y * stride] == p[-1, y]  */

    int H = 0;
    int V = 0;
    for (int k = 0; k < 8; k++) {
        const int weight = k + 1;

        /* At k == 7 both subtracted terms resolve to the corner
         * dst[-stride - 1]. */
        H += weight * (top[8 + k] - top[6 - k]);
        V += weight * (left[(8 + k) * stride] - left[(6 - k) * stride]);
    }

    const int b = (5 * H + 32) >> 6;
    const int c = (5 * V + 32) >> 6;
    const int a = 16 * (left[15 * stride] + top[15]);

    for (int y = 0; y < 16; y++) {
        uint8_t *row = dst + y * stride;
        /* Row-invariant part of the plane equation, rounding included. */
        const int row_base = a + c * (y - 7) + 16;

        for (int x = 0; x < 16; x++) {
            const int v = (row_base + b * (x - 7)) >> 5;
            row[x] = (uint8_t) clip_u8(v);
        }
    }
}
|