/* daedalus-fourier/src/h264_intra_pred_16x16.c */

/*
 * Standalone bit-exact C reference for H.264 luma Intra_16x16
 * prediction modes (per H.264 spec §8.3.2).  All 4 modes.
 *
 * Mode index → name (per H.264 Table 7-15):
 *   0 = Vertical
 *   1 = Horizontal
 *   2 = DC
 *   3 = Plane
 *
 * Calling convention (FFmpeg-style, matches the Intra_4x4 ref):
 *   pred_16x16_<mode>(uint8_t *dst, ptrdiff_t stride)
 *
 * `dst` points at row 0, col 0 of the 16x16 output block.  Neighbours:
 *   top[0..15]  = dst[-stride + 0 .. -stride + 15]
 *   top-left    = dst[-stride - 1]
 *   left[0..15] = dst[ 0*stride - 1 .. 15*stride - 1]
 *
 * AVAILABILITY: assumes all neighbours valid (interior-MB case).  The
 * H.264 spec defines fallback for boundary cases (DC averages just
 * the available side, etc.); the eventual libavcodec intercept
 * handles availability before calling.
 *
 * License: BSD-2-Clause.
 */
#include <stdint.h>
#include <stddef.h>

/* Clamp v to the unsigned 8-bit sample range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v < 0) {
        return 0;
    }
    if (v > 255) {
        return 255;
    }
    return v;
}

/* Mode 0 — Vertical.
 *
 * Fills the 16x16 block at `dst` so that every row is a copy of the 16
 * samples directly above the block (dst[-stride + 0 .. -stride + 15]).
 *
 *   dst    : row 0, col 0 of the output block; the row above it must be
 *            readable.
 *   stride : distance in bytes between vertically adjacent samples.
 */
void daedalus_h264_pred_16x16_vertical(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;

    for (int y = 0; y < 16; y++) {
        uint8_t *row = dst + y * stride;
        for (int x = 0; x < 16; x++) {
            row[x] = above[x];
        }
    }
}

/* Mode 1 — Horizontal.
 *
 * Fills each of the 16 rows of the block at `dst` with the sample
 * immediately to the left of that row (dst[row * stride - 1]).
 *
 *   dst    : row 0, col 0 of the output block; the column to its left
 *            must be readable.
 *   stride : distance in bytes between vertically adjacent samples.
 */
void daedalus_h264_pred_16x16_horizontal(uint8_t *dst, ptrdiff_t stride)
{
    for (int y = 0; y < 16; y++) {
        uint8_t *row = dst + y * stride;
        const uint8_t left = row[-1];
        int x = 0;

        while (x < 16) {
            row[x] = left;
            x++;
        }
    }
}

/* Mode 2 — DC.
 *
 * Averages the 16 samples above the block and the 16 samples to its
 * left, rounding to nearest: (sum_top16 + sum_left16 + 16) >> 5, and
 * writes that single value to all 256 output samples.
 *
 *   dst    : row 0, col 0 of the output block; the row above and the
 *            column to the left must both be readable.
 *   stride : distance in bytes between vertically adjacent samples.
 */
void daedalus_h264_pred_16x16_dc(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;
    const uint8_t *left = dst - 1;   /* left[k * stride] is the neighbour of row k */
    int total = 0;

    for (int k = 0; k < 16; k++) {
        total += above[k] + left[k * stride];
    }

    /* +16 is the round-to-nearest bias for a divide by 32 samples. */
    const uint8_t dc = (uint8_t)((total + 16) >> 5);

    for (int y = 0; y < 16; y++) {
        uint8_t *row = dst + y * stride;
        for (int x = 0; x < 16; x++) {
            row[x] = dc;
        }
    }
}

/* Mode 3 — Plane (per H.264 §8.3.2.4):
 *   H = sum_{i=0..7} (i+1) * (p[8+i, -1] - p[6-i, -1])
 *     = sum_{i=0..7} (i+1) * (top[8+i] - top[6-i])
 *   V = sum_{j=0..7} (j+1) * (p[-1, 8+j] - p[-1, 6-j])
 *     = sum_{j=0..7} (j+1) * (left[8+j] - left[6-j])
 *   b = (5*H + 32) >> 6
 *   c = (5*V + 32) >> 6
 *   a = 16 * (p[-1, 15] + p[15, -1])
 *     = 16 * (left[15] + top[15])
 *   pred[y][x] = Clip1((a + b*(x-7) + c*(y-7) + 16) >> 5)
 *
 * Indexing: p[x, y] is written with x = column and y = row, so
 * p[., -1] is the row above the block (top) and p[-1, .] is the column
 * to its left (left).  The output uses pred[y][x] = pred[row][col].
 * H measures the horizontal slope from the top row's right-vs-left
 * asymmetry; V measures the vertical slope from the left column's
 * bottom-vs-top asymmetry.
 *
 * The top-left corner p[-1,-1] DOES participate in both sums: the last
 * term (i = 7, j = 7) is 8 * (p[15,-1] - p[-1,-1]) for H and
 * 8 * (p[-1,15] - p[-1,-1]) for V.  In this layout top[-1] and
 * left[-1] are the same byte, dst[-stride - 1], so the general index
 * expression reaches it without any special case.
 *
 *   dst    : row 0, col 0 of the output block; the row above, the
 *            column to the left and the top-left corner sample must
 *            all be readable.
 *   stride : distance in bytes between vertically adjacent samples.
 */
void daedalus_h264_pred_16x16_plane(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *top = dst - stride;   /* top[-1] is the corner p[-1,-1]    */
    const uint8_t *left = dst - 1;       /* left[j * stride] is p[-1, j];     */
                                         /* left[-stride] is the same corner  */
    int H = 0;
    int V = 0;

    for (int k = 0; k < 8; k++) {
        const int weight = k + 1;
        /* At k == 7 the subtracted sample is index -1, i.e. the corner. */
        H += weight * (top[8 + k] - top[6 - k]);
        V += weight * (left[(8 + k) * stride] - left[(6 - k) * stride]);
    }

    /* H and V may be negative.  The shifts below (and the >> 5 in the
     * fill loop) rely on arithmetic, sign-extending right shift of
     * negative ints, which ISO C leaves implementation-defined. */
    const int b = (5 * H + 32) >> 6;
    const int c = (5 * V + 32) >> 6;
    const int a = 16 * (left[15 * stride] + top[15]);

    for (int y = 0; y < 16; y++) {
        uint8_t *row = dst + y * stride;
        /* Row-invariant part of the plane equation, incl. rounding. */
        const int row_base = a + c * (y - 7) + 16;

        for (int x = 0; x < 16; x++) {
            const int v = (row_base + b * (x - 7)) >> 5;
            row[x] = (uint8_t) clip_u8(v);
        }
    }
}