daedalus-fourier/tests/h264_idct4_ref.c

/*
 * Standalone bit-exact C reference for H.264 4x4 inverse integer
 * transform + add. Algorithm per H.264 spec §8.5.12.1 (4x4 IT for
 * blocks coded with TransformBypassFlag = 0).
 *
 * Mirrors FFmpeg `ff_h264_idct_add_neon` in
 * external/ffmpeg-snapshot/libavcodec/aarch64/h264idct_neon.S
 * (n7.1.3 pin). Destructively zeroes `block` to match upstream
 * convention (post-call block must be zero for the H.264 conformance
 * residual loop).
 *
 * Signature mirrors the NEON convention:
 *   void(uint8_t *dst, int16_t *block, ptrdiff_t stride);
 *
 * License: LGPL-2.1-or-later (matches FFmpeg upstream the algorithm
 * was transcribed from). Spec is H.264 ITU-T Rec H.264 / ISO/IEC
 * 14496-10.
 */
#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Saturate v into [0, 255]: values below 0 become 0, values above 255
 * become 255, anything else is returned unchanged. */
static inline int clip_u8(int v)
{
    if (v < 0) {
        return 0;
    }
    if (v > 255) {
        return 255;
    }
    return v;
}

/* One 4-point 1D stage of the inverse transform (H.264 spec §8.5.12.1).
 *
 * Reads d[0..3] and writes h_c[0..3]. The even-indexed inputs form a
 * sum/difference pair; each odd-indexed input is combined with the
 * other one halved (>> 1). All four intermediates are computed before
 * any output is stored. */
static inline void h264_idct4_butterfly(const int d[4], int h_c[4])
{
    const int even_sum  = d[0] + d[2];
    const int even_diff = d[0] - d[2];
    const int odd_diff  = (d[1] >> 1) - d[3];
    const int odd_sum   = d[1] + (d[3] >> 1);

    /* Outer outputs use the sums, inner outputs use the differences. */
    h_c[0] = even_sum + odd_sum;
    h_c[3] = even_sum - odd_sum;
    h_c[1] = even_diff + odd_diff;
    h_c[2] = even_diff - odd_diff;
}

/*
 * Inverse-transform the 16 coefficients in `block`, add the rounded
 * result to the 4x4 pixel area at `dst`, and clear `block`.
 *
 *   dst    - top-left pixel of the 4x4 area; row r starts at
 *            dst + r * stride. Read and overwritten in place.
 *   block  - 16 coefficients, read as block[c*4 + r] for row r,
 *            column c (column-major). All 16 are zero on return.
 *   stride - distance in bytes between successive rows of dst.
 */
void daedalus_h264_idct_add_ref(uint8_t *dst, int16_t *block, ptrdiff_t stride)
{
    enum { N = 4 };

    int after_rows[N][N];   /* [row][col], output of the first pass  */
    int after_cols[N][N];   /* [row][col], output of the second pass */
    int in[N];
    int out[N];
    int row;
    int col;

    /* First pass: one butterfly per row. The four inputs of a row sit
     * N entries apart in `block` because of the column-major layout.
     * Running this pass first keeps the same order as the upstream
     * routine this reference mirrors. */
    for (row = 0; row < N; row++) {
        for (col = 0; col < N; col++) {
            in[col] = block[col * N + row];
        }
        h264_idct4_butterfly(in, out);
        for (col = 0; col < N; col++) {
            after_rows[row][col] = out[col];
        }
    }

    /* Second pass: one butterfly per column of the row-pass result. */
    for (col = 0; col < N; col++) {
        for (row = 0; row < N; row++) {
            in[row] = after_rows[row][col];
        }
        h264_idct4_butterfly(in, out);
        for (row = 0; row < N; row++) {
            after_cols[row][col] = out[row];
        }
    }

    /* Scale with rounding ((x + 32) >> 6), add to the existing pixel
     * and saturate to 8 bits. */
    for (row = 0; row < N; row++) {
        uint8_t *px = dst + row * stride;
        for (col = 0; col < N; col++) {
            const int residual = (after_cols[row][col] + 32) >> 6;
            px[col] = (uint8_t) clip_u8(px[col] + residual);
        }
    }

    /* Leave the coefficient block zeroed for the caller. */
    memset(block, 0, (N * N) * sizeof(int16_t));
}