diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1e5e080..184debd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -136,6 +136,21 @@ add_test(NAME idct_bitexact_cpu COMMAND test_idct_bitexact 320 240
 # gets slow we'll split into a CTest LABEL for opt-in.
 add_test(NAME idct_bitexact_1080p COMMAND test_idct_bitexact 1920 1088)
 
+# ---- Stage 2 PR-b deblock smoke ------------------------------------
+#
+# Validates flush_frame's per-frame deblock dispatch (luma + chroma,
+# V + H, bS<4 + bS=4 intra — up to 8 dispatches added after IDCT).
+# Strategy: same input through substrate=CPU and substrate=QPU, assert
+# byte-exact match (transitive bit-exact gate — daedalus-fourier's own
+# test_api_h264 already validates each substrate against a C reference,
+# so CPU-QPU equivalence here means both match the spec).  Plus an
+# anti-no-op check: run a third pass with edges removed and assert
+# different output, proving deblock actually ran.
+add_executable(test_deblock_smoke tests/test_deblock_smoke.c)
+target_link_libraries(test_deblock_smoke PRIVATE daedalus_decoder)
+target_compile_options(test_deblock_smoke PRIVATE -O2)  # -O2 is appended to this target's compile options
+add_test(NAME deblock_smoke COMMAND test_deblock_smoke)  # no args: main() falls back to 320x240 and its built-in seed
+
 # ---- Benchmarks (not gated by ctest) ------------------------------
 #
 # Build-time only; user runs them by hand when checking perf.  Adding
diff --git a/tests/test_deblock_smoke.c b/tests/test_deblock_smoke.c
new file mode 100644
index 0000000..16d5524
--- /dev/null
+++ b/tests/test_deblock_smoke.c
@@ -0,0 +1,281 @@
+/* SPDX-License-Identifier: BSD-2-Clause */
+/*
+ * test_deblock_smoke — Stage 2 PR-b smoke test for flush_frame's
+ * per-frame deblock dispatch.
+ *
+ * Strategy
+ * --------
+ *
+ * Bit-exact-against-C-reference would require transcribing ~400 lines
+ * of FFmpeg's deblock kernels into this test.  daedalus-fourier's
+ * tests/test_api_h264 already does that for both CPU NEON and V3D QPU
+ * substrates per kernel.  So here we instead validate the daedalus-
+ * decoder's *dispatch wiring* — that the frame's edge list correctly
+ * partitions into (plane × orient × bS-band) buckets, with correct
+ * dst_off math, and reaches both backends identically:
+ *
+ *   1. Build a frame with random coeffs + predicted + edges.
+ *   2. Decode it with substrate=CPU → out_cpu.
+ *   3. Decode it again (same input!) with substrate=QPU → out_qpu.
+ *   4. Assert out_cpu == out_qpu byte-for-byte.
+ *
+ * Plus an anti-no-op check:
+ *
+ *   5. Decode a third time with n_edges=0 on every MB → out_no_deblock.
+ *   6. Assert out_cpu != out_no_deblock (some bytes differ — deblock
+ *      actually fired and changed pixels).
+ *
+ * The CPU↔QPU equivalence combined with daedalus-fourier's own kernel-
+ * level bit-exact gate gives transitive proof of spec-correct dispatch
+ * routing.  This test is cheap (sub-second on QVGA) so it runs in
+ * every ctest invocation.
+ *
+ * Not in scope:
+ *   - Spec-exact deblock semantics (caller's bS / alpha / beta derivation
+ *     per H.264 §8.7 is the integrator's responsibility; the decoder
+ *     just routes whatever edges it receives).
+ *   - Frame-boundary edge handling (caller MUST set bS=0 there; we
+ *     generate edges that respect this).
+ */
+
+#include "daedalus_decoder.h"
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static uint64_t xs64_state;  /* PRNG state; main() seeds it.  0 is remapped inside xs64(). */
+static uint64_t xs64(void)   /* one xorshift64 (13/7/17) step: advances and returns the state */
+{
+    uint64_t x = xs64_state ? xs64_state : 0x9e3779b97f4a7c15ULL;  /* 0 is a fixed point: would yield 0 forever */
+    x ^= x << 13; x ^= x >> 7; x ^= x << 17;
+    return xs64_state = x;
+}
+
+/* Fill `out` with the synthetic deblock edge list for macroblock
+ * (mb_x, mb_y) and return how many entries were written (always 16).
+ *
+ * `out` must have room for at least 16 entries.  Edges are emitted
+ * group by group, edge_idx ascending inside each group:
+ *
+ *   orient  plane     edges
+ *   V (0)   luma (0)  4
+ *   H (1)   luma (0)  4
+ *   V (0)   Cb   (1)  2
+ *   H (1)   Cb   (1)  2
+ *   V (0)   Cr   (2)  2
+ *   H (1)   Cr   (2)  2
+ *
+ * edge_idx 0 of every group is the MB-boundary edge.  It gets bS=4
+ * (the "intra" path) unless it lies on the frame boundary — mb_x==0
+ * for V edges, mb_y==0 for H edges — in which case it gets bS=0.
+ * Every other (internal) edge gets a random bS in {1, 2, 3}.
+ *
+ * alpha is drawn from [20, 59], beta from [8, 23] and each tc0[] entry
+ * from [0, 7].  Per edge the xs64() draws happen in the order bS
+ * (internal edges only), alpha, beta, tc0[0..3]; that order determines
+ * the generated data for a given seed.
+ *
+ * last_mb_x / last_mb_y are accepted but not used. */
+static int build_mb_edges(int mb_x, int mb_y, int last_mb_x, int last_mb_y,
+                          struct daedalus_decoder_edge *out)
+{
+    /* One row per (orientation, plane) group, in emission order. */
+    static const struct {
+        int orient;   /* 0 = V, 1 = H */
+        int plane;    /* 0 = luma, 1 = Cb, 2 = Cr */
+        int count;    /* edges in the group */
+    } groups[] = {
+        { 0, 0, 4 },
+        { 1, 0, 4 },
+        { 0, 1, 2 },
+        { 1, 1, 2 },
+        { 0, 2, 2 },
+        { 1, 2, 2 },
+    };
+    const int n_groups = (int) (sizeof groups / sizeof groups[0]);
+    int written = 0;
+
+    (void) last_mb_x;
+    (void) last_mb_y;
+
+    for (int g = 0; g < n_groups; g++) {
+        /* Only edge 0 can sit on the frame boundary: left column for
+         * V edges, top row for H edges. */
+        const int on_frame_edge =
+            (groups[g].orient == 0) ? (mb_x == 0) : (mb_y == 0);
+
+        for (int e = 0; e < groups[g].count; e++) {
+            struct daedalus_decoder_edge *edge = &out[written];
+
+            edge->mb_x     = (uint16_t) mb_x;
+            edge->mb_y     = (uint16_t) mb_y;
+            edge->edge_idx = (uint8_t)  e;
+            edge->orient   = (uint8_t)  groups[g].orient;
+            edge->plane    = (uint8_t)  groups[g].plane;
+
+            /* bS first: internal edges consume one PRNG draw here. */
+            if (e != 0)
+                edge->bS = (uint8_t) (1 + xs64() % 3);
+            else if (on_frame_edge)
+                edge->bS = 0;
+            else
+                edge->bS = 4;
+
+            edge->alpha = (uint8_t) (20 + (int) (xs64() % 40));
+            edge->beta  = (uint8_t) ( 8 + (int) (xs64() % 16));
+            for (int s = 0; s < 4; s++)
+                edge->tc0[s] = (int8_t) (xs64() % 8);
+            written++;
+        }
+    }
+    return written;  /* 16 */
+}
+
+/* Decode one frame on substrate `sub`: append every MB in raster order
+ * (16 edges each, or none when with_edges is 0), then flush to out_y /
+ * out_uv with stride `width`.  Returns 0 on success, -1 on any failure. */
+static int run_once(daedalus_decoder *dec, daedalus_decoder_substrate sub,
+                    int mb_w, int mb_h,
+                    const int16_t (*per_mb_coeffs)[384],
+                    const uint8_t (*per_mb_pred)[384],
+                    const struct daedalus_decoder_edge (*per_mb_edges)[16],
+                    int with_edges,
+                    int width, int height,
+                    uint8_t *out_y, uint8_t *out_uv)
+{
+    struct daedalus_decoder_mb_input input = {0};
+    const size_t stride = (size_t) width;  /* passed as both the Y and the UV stride */
+    (void) height;  /* accepted but not used */
+    if (daedalus_decoder_set_substrate(dec, sub) != 0) {
+        fprintf(stderr, "set_substrate failed\n");
+        return -1;
+    }
+    for (int row = 0; row < mb_h; row++) {
+        for (int col = 0; col < mb_w; col++) {
+            const int slot = row * mb_w + col;  /* raster index into the per-MB arrays */
+            input.mb_x          = (uint16_t) col;
+            input.mb_y          = (uint16_t) row;
+            input.coeffs        = per_mb_coeffs[slot];
+            input.predicted     = per_mb_pred[slot];
+            input.transform_8x8 = 0;
+            input.edges         = with_edges ? per_mb_edges[slot] : NULL;
+            input.n_edges       = with_edges ? 16 : 0;
+            if (daedalus_decoder_append_mb(dec, &input) == 0)
+                continue;
+            fprintf(stderr, "append (%d,%d) failed\n", col, row);
+            return -1;
+        }
+    }
+    const int flush_rc = daedalus_decoder_flush_frame(dec, out_y, stride,
+                                                      out_uv, stride);
+    if (flush_rc != 0)
+        fprintf(stderr, "flush_frame rc=%d sub=%d\n", flush_rc, (int) sub);
+    return flush_rc != 0 ? -1 : 0;
+}
+
+/* Usage: test_deblock_smoke [width [height [seed]]].  Returns 0 on PASS
+ * or SKIP (decoder could not be created) and 1 on any failure.  Every
+ * exit after argument checking goes through `done`, so nothing leaks. */
+int main(int argc, char **argv)
+{
+    int width  = argc > 1 ? atoi(argv[1]) : 320;
+    int height = argc > 2 ? atoi(argv[2]) : 240;
+    uint64_t seed = argc > 3 ? strtoull(argv[3], NULL, 0) : 0xdeadbeefcafebabeULL;
+    int rc = 1;  /* cleared only on the PASS and SKIP paths */
+    int edge_total = 0, edge_non_skip = 0;
+    size_t y_diffs = 0, uv_diffs = 0, y_changed = 0, uv_changed = 0;
+    int16_t (*coeffs)[384] = NULL;  /* NULL-initialised: `done` frees all of these */
+    uint8_t (*pred)[384]   = NULL;
+    struct daedalus_decoder_edge (*edges)[16] = NULL;
+    daedalus_decoder *dec = NULL;
+    uint8_t *out_cpu_y = NULL, *out_cpu_uv = NULL, *out_qpu_y = NULL;
+    uint8_t *out_qpu_uv = NULL, *out_nodb_y = NULL, *out_nodb_uv = NULL;
+
+    /* atoi() gives 0 for junk: below 16 means zero MBs or a wrapped size_t size. */
+    if (width < 16 || height < 16) {
+        fprintf(stderr, "bad frame size %dx%d (need >= 16x16)\n", width, height);
+        return 1;
+    }
+    xs64_state = seed;
+    int mb_w = width / 16, mb_h = height / 16, n_mbs = mb_w * mb_h;
+    size_t y_size = (size_t) width * height, uv_size = y_size / 2;
+    printf("test_deblock_smoke: %dx%d (%d MBs), seed=0x%llx\n",
+           width, height, n_mbs, (unsigned long long) seed);  /* all 64 seed bits */
+
+    coeffs = malloc((size_t) n_mbs * sizeof(*coeffs));
+    pred   = malloc((size_t) n_mbs * sizeof(*pred));
+    edges  = malloc((size_t) n_mbs * sizeof(*edges));
+    if (!coeffs || !pred || !edges) { fprintf(stderr, "alloc fail\n"); goto done; }
+    for (int mb = 0; mb < n_mbs; mb++) {
+        for (int i = 0; i < 384; i++) {
+            coeffs[mb][i] = (int16_t)((int)(xs64() % 1024) - 512);
+            pred[mb][i]   = (uint8_t)(xs64() & 0xff);
+        }
+    }
+    for (int my = 0; my < mb_h; my++) {
+        for (int mx = 0; mx < mb_w; mx++) {
+            int idx = my * mb_w + mx;
+            int n = build_mb_edges(mx, my, mb_w - 1, mb_h - 1, edges[idx]);
+            edge_total += n;
+            for (int k = 0; k < n; k++)
+                if (edges[idx][k].bS != 0) edge_non_skip++;
+        }
+    }
+    printf("edges total=%d non-skip=%d (frame boundaries skipped)\n",
+           edge_total, edge_non_skip);
+
+    dec = daedalus_decoder_create(width, height);
+    if (!dec) {
+        fprintf(stderr, "SKIP: ctx create failed (Vulkan / V3D7 unavailable)\n");
+        rc = 0;
+        goto done;
+    }
+    out_cpu_y  = malloc(y_size);  out_cpu_uv  = malloc(uv_size);
+    out_qpu_y  = malloc(y_size);  out_qpu_uv  = malloc(uv_size);
+    out_nodb_y = malloc(y_size);  out_nodb_uv = malloc(uv_size);
+    if (!out_cpu_y || !out_cpu_uv || !out_qpu_y || !out_qpu_uv ||
+        !out_nodb_y || !out_nodb_uv) { fprintf(stderr, "alloc fail\n"); goto done; }
+
+    /* Passes: CPU + edges, QPU + edges, CPU without edges (IDCT-only baseline). */
+    if (run_once(dec, DAEDALUS_DECODER_SUBSTRATE_CPU, mb_w, mb_h, coeffs, pred, edges,
+                 /*with_edges*/1, width, height, out_cpu_y, out_cpu_uv) != 0) goto done;
+    if (run_once(dec, DAEDALUS_DECODER_SUBSTRATE_QPU, mb_w, mb_h, coeffs, pred, edges,
+                 /*with_edges*/1, width, height, out_qpu_y, out_qpu_uv) != 0) goto done;
+    if (run_once(dec, DAEDALUS_DECODER_SUBSTRATE_CPU, mb_w, mb_h, coeffs, pred, edges,
+                 /*with_edges*/0, width, height, out_nodb_y, out_nodb_uv) != 0) goto done;
+
+    /* Check 1: CPU vs QPU byte-exact. */
+    for (size_t i = 0; i < y_size; i++)
+        if (out_cpu_y[i] != out_qpu_y[i]) y_diffs++;
+    for (size_t i = 0; i < uv_size; i++)
+        if (out_cpu_uv[i] != out_qpu_uv[i]) uv_diffs++;
+    printf("CPU vs QPU: Y diff %zu/%zu, UV diff %zu/%zu\n",
+           y_diffs, y_size, uv_diffs, uv_size);
+    if (y_diffs != 0 || uv_diffs != 0) {
+        fprintf(stderr, "FAIL: CPU and QPU outputs differ — dispatch wiring broken\n");
+        goto done;
+    }
+
+    /* Check 2: with-edges vs no-edges different → deblock actually ran. */
+    for (size_t i = 0; i < y_size; i++)
+        if (out_cpu_y[i] != out_nodb_y[i]) y_changed++;
+    for (size_t i = 0; i < uv_size; i++)
+        if (out_cpu_uv[i] != out_nodb_uv[i]) uv_changed++;
+    printf("With vs without deblock: Y changed %zu/%zu, UV changed %zu/%zu\n",
+           y_changed, y_size, uv_changed, uv_size);
+    if (y_changed == 0 && uv_changed == 0) {
+        fprintf(stderr, "FAIL: deblock produced no pixel changes — likely a no-op\n");
+        goto done;
+    }
+    printf("PASS (CPU≡QPU, deblock fired)\n");
+    rc = 0;
+done:
+    if (dec) daedalus_decoder_destroy(dec);
+    free(out_nodb_uv); free(out_nodb_y);
+    free(out_qpu_uv);  free(out_qpu_y);
+    free(out_cpu_uv);  free(out_cpu_y);
+    free(edges); free(pred); free(coeffs);
+    return rc;
+}