From adaabb1f63de7c8a8934d73398d224d6584447e5 Mon Sep 17 00:00:00 2001
From: claude-noether <claude-noether@noreply.localhost>
Date: Sun, 24 May 2026 22:41:05 +0200
Subject: [PATCH] phase1: IDCT 8x8 dispatch (High profile
 transform_size_8x8_flag)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds the High-profile 8x8 luma transform path alongside the existing
4x4 dispatch.  flush_frame now partitions macroblocks by each MB's
transform_8x8 flag and issues a separate luma dispatch per partition:

  - mb.transform_8x8 == 0 (Baseline/Main) → coeffs[0..256) interpreted
    as 16 4x4 blocks, fed to daedalus_recipe_dispatch_h264_idct4
    (existing behaviour, unchanged).
  - mb.transform_8x8 == 1 (High)          → coeffs[0..256) interpreted
    as 4 8x8 blocks (64 int16 each, column-major), fed to the new
    daedalus_recipe_dispatch_h264_idct8 call.

Both luma partitions can be non-empty in the same frame (FFmpeg sets
the flag per-MB).  Each non-empty partition costs one
vkQueueSubmit + vkQueueWaitIdle; empty partitions are skipped
(common case: Baseline streams skip the 8x8 dispatch entirely).

Chroma is unchanged — 4:2:0 chroma always uses the 4x4 transform.

API surface:
  - New uint8_t `transform_8x8` field in `struct daedalus_decoder_mb_input`
    (after deblock_*).  Backwards-compatible at the source level
    because the field defaults to 0 with C99 designated initialisers
    or {0} struct zeroing, both of which select the existing 4x4
    path.  ABI is pre-0.1 (per the header doc) so structural change
    is fine.
  - Mirrored in `struct daedalus_decoder_mb_desc` (internal layout).

Test changes:

  - test_idct_bitexact now exercises a mixed-mode frame: every odd
    raster MB uses 8x8, every even uses 4x4 (so flush_frame's
    partitioning is also under test, not just the underlying shaders).
  - 8x8 C reference (h264_idct8_butterfly + ref_idct8_add)
    transcribed from daedalus-fourier tests/h264_idct8_ref.c per
    H.264 §8.5.13.2.  Block layout column-major; +32 >> 6 rounding;
    add-to-predicted; clip255.
  - Reference luma compute branches per MB on the same mb_8x8[]
    array used to set the input flag.

Verified on hertz (Pi 5 / V3D 7.1 / daedalus-fourier 0.1.0):

  $ ./build/test_idct_bitexact
  test_idct_bitexact: 320x240 (300 MBs), seed=0xfeedface5a5a5a5a
  MB mix: 150 4x4 MBs, 150 8x8 MBs
  Y bytes total:  76800
  Y bytes diff:   0 (0.0000%)
  Cb bytes total: 19200  diff: 0 (0.0000%)
  Cr bytes total: 19200  diff: 0 (0.0000%)
  BIT-EXACT PASS (Y + Cb + Cr)

  $ ctest --test-dir build
  100% tests passed, 0 tests failed out of 2

Bit-exact PASS first try for the 8x8 path — 150 8x8 MBs × 4 blocks =
600 8x8 IDCTs against the spec C reference, identical output.
Validates both the daedalus-fourier IDCT 8x8 shader (already gated
by its own cycle-7 bit-exact test, now also gated end-to-end through
our flush_frame), and our 8x8 layout assumptions (column-major coeffs,
raster sb_y*2+sb_x block order, top-left = mb*16 + sb*8).

What's NOT covered yet (deferred):

  - Z-scan permutation for FFmpeg compatibility (libavcodec intercept
    patch's concern; both 4x4 and 8x8 z-scans differ).
  - Chroma DC / luma Intra16x16 DC Hadamard pre-pass.
  - Mixed intra/inter MB handling — currently all MBs treated as
    residual-only (predicted=0).

Closes the "IDCT 8x8 (High profile)" item from PR #3's deferred list.
---
 include/daedalus_decoder.h |  16 ++++-
 src/daedalus_decoder.c     | 131 +++++++++++++++++++++++--------------
 src/internal.h             |   3 +-
 tests/test_idct_bitexact.c | 126 +++++++++++++++++++++++++++++------
 4 files changed, 203 insertions(+), 73 deletions(-)

diff --git a/include/daedalus_decoder.h b/include/daedalus_decoder.h
index f1a626c..925a953 100644
--- a/include/daedalus_decoder.h
+++ b/include/daedalus_decoder.h
@@ -75,9 +75,19 @@ struct daedalus_decoder_mb_input {
     int8_t   deblock_alpha_c0;
     int8_t   deblock_beta;
 
-    /* Transform coefficients — 256 luma (4x4 x 16) + 64 cb + 64 cr,
-     * column-major within each 4x4 block (matches FFmpeg convention).
-     * Caller-owned; copied during append. */
+    /* High-profile 8x8 transform selector.
+     *   0 = the 256-int16 luma section of coeffs[] holds 16 4x4 blocks
+     *       (16 coeffs each, raster sb_y*4+sb_x); the chroma section is
+     *       always 4x4.
+     *   1 = the 256-int16 luma section holds 4 8x8 blocks (64 coeffs
+     *       each, raster sb_y*2+sb_x).  Set per H.264's
+     *       transform_size_8x8_flag.  Chroma remains 4x4 (4:2:0).
+     */
+    uint8_t  transform_8x8;
+
+    /* Transform coefficients — 256 luma + 64 cb + 64 cr int16, all
+     * column-major within each 4x4 or 8x8 block (matches FFmpeg
+     * convention).  Caller-owned; copied during append. */
     const int16_t *coeffs;       /* points at exactly 384 int16_t */
 };
 
diff --git a/src/daedalus_decoder.c b/src/daedalus_decoder.c
index d130f56..74934f0 100644
--- a/src/daedalus_decoder.c
+++ b/src/daedalus_decoder.c
@@ -119,6 +119,7 @@ int daedalus_decoder_append_mb(daedalus_decoder *dec,
     d->deblock_disable   = mb->deblock_disable;
     d->deblock_alpha_c0  = mb->deblock_alpha_c0;
     d->deblock_beta      = mb->deblock_beta;
+    d->transform_8x8     = mb->transform_8x8;
 
     memcpy(&dec->coeffs[(size_t) expected * 384],
            mb->coeffs,
@@ -179,74 +180,104 @@ int daedalus_decoder_flush_frame(daedalus_decoder *dec,
 
     int rc = 0;
 
-    /* ---- Build frame-scaled luma-4x4 dispatch ---- */
+    /* ---- Build frame-scaled luma dispatches (4x4 + 8x8) ---- */
 
-    const size_t n_luma_blocks_per_mb = 16;
-    const size_t n_luma_blocks = (size_t) dec->n_mbs * n_luma_blocks_per_mb;
-
-    /* Scratch Y plane — coded-size byte buffer.  Zero-initialised so
-     * the IDCT-ADD-clip operation reduces to clip255(IDCT) per block
-     * (predicted=0 because no intra/MC has run yet). */
+    /* Two partitions of the per-MB luma section based on each MB's
+     * transform_8x8 flag:
+     *
+     *   transform_8x8 == 0  →  16 4x4 blocks contribute to the 4x4
+     *                          dispatch (16 coeffs each).
+     *   transform_8x8 == 1  →  4 8x8 blocks contribute to the 8x8
+     *                          dispatch (64 coeffs each).
+     *
+     * Both partitions can be non-empty in the same frame (FFmpeg sets
+     * transform_size_8x8_flag per MB), so we allocate worst-case for
+     * each and track actual counts.
+     */
     const size_t y_stride_int = (size_t) dec->width;
     const size_t y_size = y_stride_int * (size_t) dec->height;
     uint8_t *scratch_y = calloc(1, y_size);
-    int16_t *flat_coeffs = malloc(n_luma_blocks * 16 * sizeof(int16_t));
-    daedalus_h264_block_meta *meta = malloc(
-        n_luma_blocks * sizeof(daedalus_h264_block_meta));
 
-    if (!scratch_y || !flat_coeffs || !meta) {
+    const size_t worst_4x4 = (size_t) dec->n_mbs * 16;
+    const size_t worst_8x8 = (size_t) dec->n_mbs * 4;
+    int16_t                  *coeffs4 = malloc(worst_4x4 * 16 * sizeof(int16_t));
+    int16_t                  *coeffs8 = malloc(worst_8x8 * 64 * sizeof(int16_t));
+    daedalus_h264_block_meta *meta4   = malloc(worst_4x4 * sizeof(*meta4));
+    daedalus_h264_block_meta *meta8   = malloc(worst_8x8 * sizeof(*meta8));
+
+    if (!scratch_y || !coeffs4 || !coeffs8 || !meta4 || !meta8) {
         rc = -1;
         goto cleanup;
     }
 
-    /* Raster-order layout: walk each MB, then each of its 16 luma 4×4
-     * sub-blocks in raster order (sb_y=0..3 outer, sb_x=0..3 inner).
+    /* Walk MBs in raster order, append each MB's luma blocks to the
+     * partition selected by its transform_8x8 flag.
      *
-     * NB: H.264's actual per-MB 4×4 coefficient scan order is the
-     * z-scan from spec §6.4.3 / fig 6-10.  We're using a flat raster
-     * here because Phase 1 stage 1 only validates the dispatch
-     * round-trip; bit-exact against an FFmpeg reference requires the
-     * z-scan permutation and is a follow-on test.  The per-MB
-     * coeffs[] field's first 256 entries are interpreted as 16
-     * consecutive 4×4 blocks in the same raster order on the input
-     * side, so this is self-consistent for the validation. */
-    size_t bi = 0;
+     * NB: per-MB 4x4 / 8x8 coefficient ORDER inside the H.264 bitstream
+     * follows the z-scan from spec §6.4.3 / fig 6-10.  We're using
+     * flat raster on the input side too (sb_y outer, sb_x inner) for
+     * Phase 1 self-consistency; the z-scan permutation is the
+     * libavcodec-intercept patch's responsibility.
+     */
+    size_t bi4 = 0, bi8 = 0;
     for (int mb_y = 0; mb_y < dec->mb_height; mb_y++) {
         for (int mb_x = 0; mb_x < dec->mb_width; mb_x++) {
             int mb_idx = mb_y * dec->mb_width + mb_x;
+            const struct daedalus_decoder_mb_desc *d = &dec->mb_descs[mb_idx];
             const int16_t *mb_coeffs = &dec->coeffs[(size_t) mb_idx * 384];
 
-            for (int sb_y = 0; sb_y < 4; sb_y++) {
-                for (int sb_x = 0; sb_x < 4; sb_x++) {
-                    /* Block top-left pixel in the coded Y plane. */
-                    size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 4;
-                    size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 4;
-                    meta[bi].dst_off = (uint32_t) (px_y * y_stride_int + px_x);
-
-                    /* Copy 16 coeffs for this block from the per-MB
-                     * coeffs[] (luma offset = block_idx * 16). */
-                    int block_in_mb = sb_y * 4 + sb_x;
-                    memcpy(&flat_coeffs[bi * 16],
-                           &mb_coeffs[block_in_mb * 16],
-                           16 * sizeof(int16_t));
-                    bi++;
+            if (d->transform_8x8) {
+                /* 4 luma 8x8 blocks, raster sb_y*2+sb_x. */
+                for (int sb_y = 0; sb_y < 2; sb_y++) {
+                    for (int sb_x = 0; sb_x < 2; sb_x++) {
+                        size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 8;
+                        size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 8;
+                        meta8[bi8].dst_off = (uint32_t)
+                            (px_y * y_stride_int + px_x);
+                        int block_in_mb = sb_y * 2 + sb_x;
+                        memcpy(&coeffs8[bi8 * 64],
+                               &mb_coeffs[block_in_mb * 64],
+                               64 * sizeof(int16_t));
+                        bi8++;
+                    }
+                }
+            } else {
+                /* 16 luma 4x4 blocks, raster sb_y*4+sb_x. */
+                for (int sb_y = 0; sb_y < 4; sb_y++) {
+                    for (int sb_x = 0; sb_x < 4; sb_x++) {
+                        size_t px_y = (size_t) mb_y * 16 + (size_t) sb_y * 4;
+                        size_t px_x = (size_t) mb_x * 16 + (size_t) sb_x * 4;
+                        meta4[bi4].dst_off = (uint32_t)
+                            (px_y * y_stride_int + px_x);
+                        int block_in_mb = sb_y * 4 + sb_x;
+                        memcpy(&coeffs4[bi4 * 16],
+                               &mb_coeffs[block_in_mb * 16],
+                               16 * sizeof(int16_t));
+                        bi4++;
+                    }
                 }
             }
         }
     }
-    /* assert bi == n_luma_blocks; the loop math guarantees it */
+    /* assert bi4 + bi8*4 == n_mbs*16; loop math guarantees it */
 
-    /* ---- One Vulkan submit + wait for the whole frame's luma IDCT.
+    /* ---- One Vulkan submit + wait per non-empty luma partition.
      * AUTO substrate picks QPU per the post-decree recipe table; falls
-     * back to CPU NEON if the daedalus-fourier ctx wasn't QPU-capable. */
-    int dr = daedalus_recipe_dispatch_h264_idct4(dec->dctx,
-                                                  scratch_y, y_stride_int,
-                                                  flat_coeffs,
-                                                  n_luma_blocks,
-                                                  meta);
-    if (dr != 0) {
-        rc = -3;  /* GPU dispatch failure */
-        goto cleanup;
+     * back to CPU NEON if the daedalus-fourier ctx wasn't QPU-capable.
+     * Skipping the dispatch when the partition is empty avoids the
+     * shader-pool warm-up cost on the common case (Baseline/Main
+     * streams are all-4x4, so the 8x8 dispatch is never issued). */
+    if (bi4 > 0) {
+        int dr = daedalus_recipe_dispatch_h264_idct4(dec->dctx,
+                                                      scratch_y, y_stride_int,
+                                                      coeffs4, bi4, meta4);
+        if (dr != 0) { rc = -3; goto cleanup; }
+    }
+    if (bi8 > 0) {
+        int dr = daedalus_recipe_dispatch_h264_idct8(dec->dctx,
+                                                      scratch_y, y_stride_int,
+                                                      coeffs8, bi8, meta8);
+        if (dr != 0) { rc = -3; goto cleanup; }
     }
 
     /* ---- Copy Y out to caller's plane at the requested stride. ---- */
@@ -362,8 +393,10 @@ int daedalus_decoder_flush_frame(daedalus_decoder *dec,
     }
 
 cleanup:
-    free(meta);
-    free(flat_coeffs);
+    free(meta8);
+    free(meta4);
+    free(coeffs8);
+    free(coeffs4);
     free(scratch_y);
     dec->mbs_appended = 0;
     return rc;
diff --git a/src/internal.h b/src/internal.h
index bb006e4..637f1e0 100644
--- a/src/internal.h
+++ b/src/internal.h
@@ -41,7 +41,8 @@ struct daedalus_decoder_mb_desc {
     uint8_t  deblock_disable;
     int8_t   deblock_alpha_c0;
     int8_t   deblock_beta;
-    uint8_t  _pad1;
+    uint8_t  transform_8x8;     /* 0 = 16 luma blocks of 4x4,
+                                 * 1 = 4 luma blocks of 8x8. */
 };
 
 struct daedalus_decoder {
diff --git a/tests/test_idct_bitexact.c b/tests/test_idct_bitexact.c
index 5b76226..8b5a92f 100644
--- a/tests/test_idct_bitexact.c
+++ b/tests/test_idct_bitexact.c
@@ -19,13 +19,15 @@
  * layout is a separate concern (handled in the eventual libavcodec-
  * intercept patch).
  *
- * Covers BOTH luma (Y plane, 16 blocks/MB) and chroma (UV plane,
- * 4 Cb + 4 Cr blocks/MB, NV12-interleaved).  Random coeffs for all
- * three components; reference IDCT applied per block.  The chroma
+ * Covers Y (4x4 + 8x8) and chroma (4x4 Cb + Cr, NV12-interleaved).
+ * Half the MBs use transform_8x8=1 (4 luma 8x8 blocks), half use
+ * transform_8x8=0 (16 luma 4x4 blocks); both partitions are
+ * exercised in the same frame so the flush_frame partitioning logic
+ * is also under test, not just the underlying shaders.  Random coeffs
+ * for all components; reference IDCT applied per block.  The chroma
  * compare deinterleaves NV12 UV back into separate Cb/Cr expectations.
  *
  * Not in scope (covered by other tests / future PRs):
- *   - IDCT 8×8 (Phase 1 follow-on)
  *   - Chroma DC / Intra16x16 DC Hadamard pre-pass
  *   - bit-exactness against real H.264 streams (test-vector PR)
  *   - non-zero predicted pixels (intra prediction lands in Stage 2a)
@@ -66,6 +68,65 @@ static void h264_idct4_butterfly(const int d[4], int out[4])
     out[3] = e - h;
 }
 
+/* 1D 8-point butterfly per H.264 §8.5.13.2.  Transcribed from
+ * daedalus-fourier tests/h264_idct8_ref.c (LGPL-2.1+ in the original —
+ * algorithm reproduced here for test purposes, no copy of code). */
+static void h264_idct8_butterfly(const int d[8], int g[8])
+{   /* d[]: 8 inputs (only read); g[]: 8 outputs (all overwritten). */
+    int e[8], f[8];                 /* results of stage 1 and stage 2 */
+    e[0] = d[0] + d[4];             /* stage 1: d[] -> e[] */
+    e[1] = -d[3] + d[5] - d[7] - (d[7] >> 1);  /* odd e[] use odd d[] only */
+    e[2] = d[0] - d[4];             /* even e[] use even d[] only */
+    e[3] = d[1] + d[7] - d[3] - (d[3] >> 1);
+    e[4] = (d[2] >> 1) - d[6];      /* >> assumes arithmetic shift */
+    e[5] = -d[1] + d[7] + d[5] + (d[5] >> 1);
+    e[6] = d[2] + (d[6] >> 1);
+    e[7] = d[3] + d[5] + d[1] + (d[1] >> 1);
+
+    f[0] = e[0] + e[6];             /* stage 2: e[] -> f[] */
+    f[1] = e[1] + (e[7] >> 2);
+    f[2] = e[2] + e[4];
+    f[3] = e[3] + (e[5] >> 2);
+    f[4] = e[2] - e[4];
+    f[5] = (e[3] >> 2) - e[5];
+    f[6] = e[0] - e[6];
+    f[7] = e[7] - (e[1] >> 2);
+
+    g[0] = f[0] + f[7];             /* stage 3: g[k], g[7-k] = sum, diff */
+    g[1] = f[2] + f[5];             /* of the same f[] pair */
+    g[2] = f[4] + f[3];
+    g[3] = f[6] + f[1];
+    g[4] = f[6] - f[1];
+    g[5] = f[4] - f[3];
+    g[6] = f[2] - f[5];
+    g[7] = f[0] - f[7];
+}
+
+static void ref_idct8_add(uint8_t *dst, ptrdiff_t stride, const int16_t *block)
+{
+    /* block layout COLUMN-MAJOR: block[c*8 + r] = coef at (row=r, col=c). */
+    int tmp[8][8];                    /* pass-1 output, indexed [row][col] */
+    for (int r = 0; r < 8; r++) {     /* pass 1: 1D transform of each row */
+        int d[8];
+        for (int c = 0; c < 8; c++) d[c] = block[c * 8 + r];
+        int g[8];
+        h264_idct8_butterfly(d, g);
+        for (int c = 0; c < 8; c++) tmp[r][c] = g[c];
+    }
+    int col_out[8][8];                /* pass-2 output, indexed [row][col] */
+    for (int c = 0; c < 8; c++) {     /* pass 2: 1D transform of each column */
+        int d[8];
+        for (int r = 0; r < 8; r++) d[r] = tmp[r][c];
+        int g[8];
+        h264_idct8_butterfly(d, g);
+        for (int r = 0; r < 8; r++) col_out[r][c] = g[r];
+    }
+    for (int r = 0; r < 8; r++)       /* round (+32 >> 6) and add into dst */
+        for (int c = 0; c < 8; c++)   /* clip_u8: presumably clamps 0..255 */
+            dst[r * stride + c] = (uint8_t) clip_u8(
+                dst[r * stride + c] + ((col_out[r][c] + 32) >> 6));
+}
+
 static void ref_idct4_add(uint8_t *dst, ptrdiff_t stride, const int16_t *block)
 {
     /* block layout: COLUMN-MAJOR (matches FFmpeg + daedalus-fourier):
@@ -131,20 +192,31 @@ int main(int argc, char **argv)
         }
     }
 
+    /* Per-MB transform mode (deterministic split: every odd raster MB
+     * is 8x8, every even is 4x4 — exercises BOTH partitions in the
+     * same frame so the flush_frame partitioning logic is under test). */
+    uint8_t *mb_8x8 = malloc((size_t) n_mbs);
+    if (!mb_8x8) { fprintf(stderr, "alloc fail\n"); return 1; }
+    for (int i = 0; i < n_mbs; i++) mb_8x8[i] = (i & 1) ? 1 : 0;
+
     /* Append in raster order. */
     struct daedalus_decoder_mb_input mb = {0};
+    int n_8x8_mbs = 0, n_4x4_mbs = 0;
     for (int my = 0; my < mb_h; my++) {
         for (int mx = 0; mx < mb_w; mx++) {
             int idx = my * mb_w + mx;
             mb.mb_x = (uint16_t) mx;
             mb.mb_y = (uint16_t) my;
             mb.coeffs = per_mb_coeffs[idx];
+            mb.transform_8x8 = mb_8x8[idx];
+            if (mb_8x8[idx]) n_8x8_mbs++; else n_4x4_mbs++;
             if (daedalus_decoder_append_mb(dec, &mb) != 0) {
                 fprintf(stderr, "append (%d,%d) failed\n", mx, my);
                 return 1;
             }
         }
     }
+    printf("MB mix: %d 4x4 MBs, %d 8x8 MBs\n", n_4x4_mbs, n_8x8_mbs);
 
     /* Flush — exercise BOTH the luma path (out_y) and the chroma path
      * (out_uv set to non-NULL so flush_frame runs the chroma dispatch
@@ -162,27 +234,40 @@ int main(int argc, char **argv)
     }
 
     /* Compute the reference output: same per-MB → flat raster block
-     * layout as flush_frame uses. */
+     * layout as flush_frame uses.  Branch per MB on transform_8x8. */
     uint8_t *ref_y = calloc(1, y_size);
     if (!ref_y) return 1;
-    /* Need a destructively-mutable copy because the reference IDCT
-     * doesn't actually mutate, but the GPU's IDCT shader does zero
-     * the coeffs.  Our reference doesn't zero; that's fine because we
-     * use a fresh copy per block. */
-    int16_t block_scratch[16];
+    int16_t block_scratch[64];  /* large enough for 8x8 */
     for (int my = 0; my < mb_h; my++) {
         for (int mx = 0; mx < mb_w; mx++) {
             int mb_idx = my * mb_w + mx;
-            for (int sb_y = 0; sb_y < 4; sb_y++) {
-                for (int sb_x = 0; sb_x < 4; sb_x++) {
-                    int block_in_mb = sb_y * 4 + sb_x;
-                    memcpy(block_scratch,
-                           &per_mb_coeffs[mb_idx][block_in_mb * 16],
-                           16 * sizeof(int16_t));
-                    size_t px_y = (size_t) my * 16 + (size_t) sb_y * 4;
-                    size_t px_x = (size_t) mx * 16 + (size_t) sb_x * 4;
-                    ref_idct4_add(&ref_y[px_y * (size_t) width + px_x],
-                                  width, block_scratch);
+            if (mb_8x8[mb_idx]) {
+                /* 4 luma 8x8 blocks, raster sb_y*2+sb_x. */
+                for (int sb_y = 0; sb_y < 2; sb_y++) {
+                    for (int sb_x = 0; sb_x < 2; sb_x++) {
+                        int block_in_mb = sb_y * 2 + sb_x;
+                        memcpy(block_scratch,
+                               &per_mb_coeffs[mb_idx][block_in_mb * 64],
+                               64 * sizeof(int16_t));
+                        size_t px_y = (size_t) my * 16 + (size_t) sb_y * 8;
+                        size_t px_x = (size_t) mx * 16 + (size_t) sb_x * 8;
+                        ref_idct8_add(&ref_y[px_y * (size_t) width + px_x],
+                                      width, block_scratch);
+                    }
+                }
+            } else {
+                /* 16 luma 4x4 blocks, raster sb_y*4+sb_x. */
+                for (int sb_y = 0; sb_y < 4; sb_y++) {
+                    for (int sb_x = 0; sb_x < 4; sb_x++) {
+                        int block_in_mb = sb_y * 4 + sb_x;
+                        memcpy(block_scratch,
+                               &per_mb_coeffs[mb_idx][block_in_mb * 16],
+                               16 * sizeof(int16_t));
+                        size_t px_y = (size_t) my * 16 + (size_t) sb_y * 4;
+                        size_t px_x = (size_t) mx * 16 + (size_t) sb_x * 4;
+                        ref_idct4_add(&ref_y[px_y * (size_t) width + px_x],
+                                      width, block_scratch);
+                    }
                 }
             }
         }
@@ -278,6 +363,7 @@ int main(int argc, char **argv)
     free(ref_y);
     free(gpu_uv);
     free(gpu_y);
+    free(mb_8x8);
     free(per_mb_coeffs);
     daedalus_decoder_destroy(dec);
 
-- 
2.47.3