/* daedalus-fourier/src/h264_intra_pred_chroma8x8.c */

/*
 * Standalone bit-exact C reference for H.264 chroma Intra_8x8
 * prediction modes (per H.264 §8.3.3), used for both Cb and Cr
 * planes at 4:2:0.  All 4 modes.
 *
 * Mode index → name (per H.264 Table 7-16):
 *   0 = DC          (per-quadrant — asymmetric, see §8.3.3.2)
 *   1 = Horizontal
 *   2 = Vertical
 *   3 = Plane       (slope coefficient 34, distinct from luma's 5)
 *
 * Calling convention (same shape as luma intra refs):
 *   void daedalus_h264_pred_chroma8x8_<mode>(uint8_t *dst, ptrdiff_t stride)
 *   with <mode> one of: dc, horizontal, vertical, plane.
 *
 * `dst` points at row 0, col 0 of the 8x8 output block (single
 * component plane — Cb or Cr, dispatched independently).  Neighbours:
 *   top[0..7]   = dst[-stride + 0 .. -stride + 7]
 *   top-left    = dst[-stride - 1]
 *   left[0..7]  = dst[ 0*stride - 1 .. 7*stride - 1]
 *
 * AVAILABILITY: assumes all neighbours valid (interior-MB case).
 * The H.264 spec defines per-quadrant fallback for the DC mode at
 * MB boundaries; that's caller-side via the libavcodec intercept.
 *
 * License: BSD-2-Clause.
 */
#include <stdint.h>
#include <stddef.h>

/* Saturate v to the unsigned 8-bit sample range [0, 255]. */
static inline int clip_u8(int v)
{
    if (v < 0) {
        return 0;
    }
    if (v > 255) {
        return 255;
    }
    return v;
}

/* Mode 0 — DC, evaluated independently for each 4x4 quadrant of the
 * 8x8 block (all neighbours assumed available).
 *
 * With top[x] = dst[-stride + x] and left[y] = dst[y*stride - 1]:
 *   top-left     : (sum top[0..3] + sum left[0..3] + 4) >> 3
 *   top-right    : (sum top[4..7]                  + 2) >> 2
 *   bottom-left  : (sum left[4..7]                 + 2) >> 2
 *   bottom-right : (sum top[4..7] + sum left[4..7] + 4) >> 3
 *
 * The two off-diagonal quadrants average a single edge only: the
 * top-right quadrant ignores the left column and the bottom-left
 * quadrant ignores the top row.  The diagonal quadrants average the
 * four top and four left samples that share their columns and rows.
 *
 * dst    : row 0, col 0 of the 8x8 block; overwritten with the prediction.
 * stride : offset (in bytes, since samples are uint8_t) between
 *          vertically adjacent samples.
 *
 * All eight top and all eight left samples are read before any store.
 */
void daedalus_h264_pred_chroma8x8_dc(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;
    int sum_top[2]  = { 0, 0 };   /* [0] = cols 0..3, [1] = cols 4..7 */
    int sum_left[2] = { 0, 0 };   /* [0] = rows 0..3, [1] = rows 4..7 */

    for (int k = 0; k < 8; k++) {
        sum_top[k >> 2]  += above[k];
        sum_left[k >> 2] += dst[k * stride - 1];
    }

    /* Each sum is at most 8 * 255 + 4 before the shift, so every result
     * fits in a byte. */
    const uint8_t dc_tl = (uint8_t)((sum_top[0] + sum_left[0] + 4) >> 3);
    const uint8_t dc_tr = (uint8_t)((sum_top[1] + 2) >> 2);
    const uint8_t dc_bl = (uint8_t)((sum_left[1] + 2) >> 2);
    const uint8_t dc_br = (uint8_t)((sum_top[1] + sum_left[1] + 4) >> 3);

    /* Walk the 16 positions of one quadrant and stamp the matching
     * sample of all four quadrants from each position. */
    for (int k = 0; k < 16; k++) {
        uint8_t *p = dst + (k >> 2) * stride + (k & 3);

        p[0]              = dc_tl;
        p[4]              = dc_tr;
        p[4 * stride]     = dc_bl;
        p[4 * stride + 4] = dc_br;
    }
}

/* Mode 1 — Horizontal: each of the 8 rows is filled with that row's
 * left neighbour.
 *
 * dst    : row 0, col 0 of the 8x8 block; overwritten in place.
 * stride : offset between vertically adjacent samples.
 *
 * Reads dst[r*stride - 1] for r = 0..7; the top row is not consulted.
 */
void daedalus_h264_pred_chroma8x8_horizontal(uint8_t *dst, ptrdiff_t stride)
{
    for (int y = 0; y < 8; y++) {
        uint8_t *row = dst + y * stride;
        const uint8_t left = row[-1];

        for (int x = 0; x < 8; x++) {
            row[x] = left;
        }
    }
}

/* Mode 2 — Vertical: each of the 8 columns is filled with the sample
 * directly above the block in that column.
 *
 * dst    : row 0, col 0 of the 8x8 block; overwritten in place.
 * stride : offset between vertically adjacent samples.
 *
 * Reads dst[-stride + 0 .. -stride + 7]; the left column is not consulted.
 */
void daedalus_h264_pred_chroma8x8_vertical(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;

    for (int y = 0; y < 8; y++) {
        uint8_t *row = dst + y * stride;

        for (int x = 0; x < 8; x++) {
            row[x] = above[x];
        }
    }
}

/* Mode 3 — Plane: fills the block with a linear ramp derived from the
 * top row, the left column and the top-left corner.
 *
 * With top[x] = dst[-stride + x] and left[y] = dst[y*stride - 1]
 * (top[-1] and left[-1] both address the corner dst[-stride - 1]):
 *   H = sum_{k=1..4} k * (top[3 + k]  - top[3 - k])
 *   V = sum_{k=1..4} k * (left[3 + k] - left[3 - k])
 *   b = (34 * H + 32) >> 6
 *   c = (34 * V + 32) >> 6
 *   a = 16 * (left[7] + top[7])
 *   pred[y][x] = clip_u8((a + b*(x - 3) + c*(y - 3) + 16) >> 5)
 *
 * Distinct from the Intra_16x16 luma Plane:
 *   - Slope coefficient is 34 (not 5).
 *   - Centre is (x-3, y-3) (not x-7, y-7).
 *   - Spans 4 differences per sum (not 8).
 *
 * dst    : row 0, col 0 of the 8x8 block; overwritten in place.
 * stride : offset between vertically adjacent samples.
 *
 * H, V and the per-sample sums may be negative; the `>>` operations
 * rely on the compiler shifting negative ints arithmetically, which C
 * leaves implementation-defined.
 */
void daedalus_h264_pred_chroma8x8_plane(uint8_t *dst, ptrdiff_t stride)
{
    const uint8_t *above = dst - stride;
    int grad_h = 0;
    int grad_v = 0;

    /* Weighted differences of samples mirrored around index 3; at k == 4
     * the lower index is -1, i.e. the top-left corner sample. */
    for (int k = 1; k <= 4; k++) {
        grad_h += k * (above[3 + k] - above[3 - k]);
        grad_v += k * (dst[(3 + k) * stride - 1] - dst[(3 - k) * stride - 1]);
    }

    const int slope_x = (34 * grad_h + 32) >> 6;
    const int slope_y = (34 * grad_v + 32) >> 6;
    const int base    = 16 * (dst[7 * stride - 1] + above[7]);

    /* Un-shifted ramp value at (x, y) = (0, 0), rounding term included:
     * a + b*(0 - 3) + c*(0 - 3) + 16. */
    const int origin = base + 16 - 3 * (slope_x + slope_y);

    for (int y = 0; y < 8; y++) {
        uint8_t *row = dst + y * stride;
        int acc = origin + slope_y * y;

        for (int x = 0; x < 8; x++) {
            row[x] = (uint8_t)clip_u8(acc >> 5);
            acc += slope_x;   /* step one sample to the right */
        }
    }
}