From 3ffa9d0d175a3831f83188c6078b993e3985fc6e Mon Sep 17 00:00:00 2001
From: claude-noether <claude@reauktion.de>
Date: Sun, 17 May 2026 19:17:14 +0000
Subject: [PATCH] =?UTF-8?q?iter40:=20Pi=205=20HEVC=20chapter=20=E2=80=94?=
 =?UTF-8?q?=20backend=20integration=20lands,=20bit-exact=20pending?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 6 implementation. Backend builds clean on higgs (Debian 13
trixie, aarch64), vainfo lists VAProfileHEVCMain via rpi-hevc-dec,
multi-device probe finds /dev/video19 + /dev/media1, CreateContext
+ S_FMT + REQBUFS + STREAMON all succeed.

Phase 7 partial: infrastructure works, 10 frames flow through the
pipeline (correct byte counts produced — 13824000 for 1280x720 x 10
NV12 frames). But every DQBUF CAPTURE returns V4L2_BUF_FLAG_ERROR
so output content is wrong (libva sha != kdirect sha). The decode
itself is failing on the rpi-hevc-dec side despite all ctrl
submissions returning success.

Code changes:
- request.h: video_fd_rpi_hevc_dec / media_fd_rpi_hevc_dec slots +
  has_hevc_ext_sps_rps_rpi_hevc_dec flag (mirrors iter38 + iter2
  pair-of-flags pattern, naturally false on Pi).
- request.c: known_decoder_drivers gains rpi-hevc-dec; primary-driver
  probe gets an else-if branch setting the new fds (Phase 5 F3);
  request_switch_device_for_profile prefers 'p' for HEVC when
  rpi-hevc-dec present.
- context.c: per-fd want_pixfmt (NC12 on Pi), capture_pixelformat
  taken from video_format slot (not hardcoded NV12/NV15);
  synthetic-SPS pre-seed gated off for Pi (Phase 5 F6);
  destination_sizes uses nv12_col128_uv_plane_offset for NC12 SAND
  layout (Phase 5 F2);
  per-driver HEVC_START_CODE (NONE on Pi, ANNEX_B on RK);
  per-driver context_object->h264_start_code (skip prepend on Pi).
- video.c: NV12_COL128 video_format entry (8-bit SAND, single
  buffer, 2 planes, NV12 drm_format with MOD_NONE so detile branch
  fires rather than tiled_to_planar).
- nv12_col128.c/.h: detile primitive (Y + UV per-plane, kernel
  hevc_d_video.c bytesperline formula + ffmpeg/Kynesim per-pixel
  offset). UV plane offset = 128 * ALIGN(h, 8) — within-column
  (SAND interleaves Y+UV per column, NOT plane-concatenated;
  earlier wrong formula caught by Phase 7 SEGV).
- image.c: #ifdef __arm__ extended to __arm__ || __aarch64__
  (Phase 5 F1 — guard was killing detile path on all aarch64
  hosts including fresnel iter39 NV15 path, masked because 10-bit
  never exercised); RequestCreateImage NC12 → NV12 stride override
  (linear width, not column-stride); copy_surface_to_image NC12
  detile branch (gates on fourcc + v4l2_format).
- nv15.h: fallback V4L2_PIX_FMT_NV15 define (Debian 13 headers
  omit it though they have NC12).
- nv12_col128.h: fallback V4L2_PIX_FMT_NV12_COL128 +
  V4L2_PIX_FMT_NV12_10_COL128 (Arch / mainline pre-Pi headers).
- tests/test_nv12_col128_detile.c: hand-crafted-bytes unit test;
  passes (8 cases: Y + UV for 4 widths incl. 1366 misaligned;
  UV-offset helper).
- meson.build: nv12_col128.c / nv12_col128.h added to sources + headers.

Phase 7 status: not yet bit-exact. Remaining diagnosis: per-frame
S_EXT_CTRLS payload diff vs kdirect (kdirect sends 4 ctrls
SPS+PPS+decode_params+slice_array; ours sends 5 incl. scaling_matrix;
field ordering differs). Likely the slice_array contents need
per-driver handling for rpi-hevc-dec's expected layout. Beyond
in-session reach.

iter38 5/5 baseline on fresnel + ampere should be unaffected (new
fd stays -1 on non-Pi hosts; all gates either short-circuit on
fd-not-present or no-op).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 src/context.c                   | 164 +++++++++++++++++++++-----
 src/image.c                     |  80 ++++++++++++-
 src/meson.build                 |   2 +
 src/nv12_col128.c               | 114 +++++++++++++++++++
 src/nv12_col128.h               |  88 ++++++++++++++
 src/nv15.h                      |  15 +++
 src/request.c                   |  39 +++++++
 src/request.h                   |  15 +++
 src/video.c                     |  24 ++++
 tests/test_nv12_col128_detile.c | 196 ++++++++++++++++++++++++++++++++
 10 files changed, 706 insertions(+), 31 deletions(-)
 create mode 100644 src/nv12_col128.c
 create mode 100644 src/nv12_col128.h
 create mode 100644 tests/test_nv12_col128_detile.c

diff --git a/src/context.c b/src/context.c
index 47bfc55..45aa3ea 100644
--- a/src/context.c
+++ b/src/context.c
@@ -42,6 +42,9 @@
 
 #include <hevc-ctrls.h>
 
+#include "nv15.h"  /* iter40: fallback V4L2_PIX_FMT_NV15 define for Pi 5
+		    * Debian headers that ship NC12 but not NV15. */
+#include "nv12_col128.h"  /* iter40: NC12 detile primitive + UV offset helper */
 #include "utils.h"
 #include "v4l2.h"
 
@@ -117,8 +120,19 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
 	{
 		bool want_10bit = (config_object->profile == VAProfileH264High10 ||
 				   config_object->profile == VAProfileHEVCMain10);
-		unsigned int want_pixfmt = want_10bit ? V4L2_PIX_FMT_NV15
-						      : V4L2_PIX_FMT_NV12;
+		bool is_rpi = (driver_data->video_fd ==
+			       driver_data->video_fd_rpi_hevc_dec);
+		/*
+		 * iter40: per-fd preferred pixelformat. rpi-hevc-dec exposes
+		 * NC12 (8-bit) / NC30 (10-bit), not NV12 / NV15.
+		 */
+		unsigned int want_pixfmt;
+		if (is_rpi)
+			want_pixfmt = want_10bit ? V4L2_PIX_FMT_NV12_10_COL128
+						 : V4L2_PIX_FMT_NV12_COL128;
+		else
+			want_pixfmt = want_10bit ? V4L2_PIX_FMT_NV15
+						 : V4L2_PIX_FMT_NV12;
 		if (driver_data->video_format &&
 		    driver_data->video_format->v4l2_format != want_pixfmt &&
 		    driver_data->video_format->v4l2_format != V4L2_PIX_FMT_SUNXI_TILED_NV12)
@@ -127,9 +141,24 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
 	if (!driver_data->video_format) {
 		bool want_10bit = (config_object->profile == VAProfileH264High10 ||
 				   config_object->profile == VAProfileHEVCMain10);
+		bool is_rpi = (driver_data->video_fd ==
+			       driver_data->video_fd_rpi_hevc_dec);
 		video_format = NULL;
 
-		if (!want_10bit) {
+		if (is_rpi) {
+			/*
+			 * iter40: rpi-hevc-dec CAPTURE is NC12 (8-bit SAND
+			 * 128-pixel-wide column tile) or NC30 (10-bit variant).
+			 * Direct map; the kernel exposes BOTH formats in
+			 * VIDIOC_ENUM_FMT(CAPTURE_MPLANE) without a pre-SPS
+			 * step (verified Phase 0 strace), so find_format would
+			 * also succeed — skip it for symmetry with the NV15
+			 * iter39 branch below.
+			 */
+			video_format = video_format_find(
+				want_10bit ? V4L2_PIX_FMT_NV12_10_COL128
+					   : V4L2_PIX_FMT_NV12_COL128);
+		} else if (!want_10bit) {
 			found = v4l2_find_format(driver_data->video_fd,
 						 V4L2_BUF_TYPE_VIDEO_CAPTURE,
 						 V4L2_PIX_FMT_SUNXI_TILED_NV12);
@@ -212,12 +241,22 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
 	 * CAPTURE (sanity read-back, matches what S_FMT committed).
 	 */
 	{
-		/* iter39: NV15 for 10-bit profiles (rkvdec Hi10P/Main10),
-		 * NV12 otherwise. driver_data->is_10bit was set above from
-		 * the active profile. */
-		unsigned int capture_pixelformat = driver_data->is_10bit
-			? V4L2_PIX_FMT_NV15
-			: V4L2_PIX_FMT_NV12;
+		/*
+		 * iter40: take the CAPTURE pixelformat from the resolved
+		 * video_format slot — that's per-fd, per-bit-depth correct.
+		 *   rkvdec  8-bit  → NV12
+		 *   rkvdec 10-bit  → NV15
+		 *   hantro  8-bit  → NV12
+		 *   rpi-hevc-dec   → NC12 (V4L2_PIX_FMT_NV12_COL128)
+		 * Pre-iter40 this was hardcoded NV12/NV15 — the rpi-hevc-dec
+		 * fd would then have S_FMT(NV12) issued, and the kernel
+		 * "helpfully" substituted V4L2_PIX_FMT_NV12MT_COL128 (the
+		 * MULTI-PLANE-NON-CONTIGUOUS variant) instead of the
+		 * SINGLE-PLANE NC12 we wanted, breaking cap_pool QUERYBUF
+		 * downstream (Phase 7 iter40 first-run discovery).
+		 */
+		unsigned int capture_pixelformat =
+			driver_data->video_format->v4l2_format;
 		rc = v4l2_set_format(driver_data->video_fd, capture_type,
 				     capture_pixelformat, picture_width,
 				     picture_height);
@@ -274,7 +313,22 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
 	 * the device-init DECODE_MODE + START_CODE block below ALSO uses
 	 * void-cast best-effort, so this is consistent with prior pattern.
 	 */
-	{
+	/*
+	 * iter40 (Phase 5 review F6): the synthetic-SPS pre-seed is an
+	 * rkvdec-specific quirk fix (the -EBUSY-on-CAPTURE-busy bug in
+	 * rkvdec_s_ctrl). rpi-hevc-dec does NOT need it and uses a
+	 * different submission ordering (Phase 0 strace: S_FMT_OUTPUT →
+	 * REQBUFS_OUTPUT → S_FMT_CAPTURE → CREATE_BUFS_CAPTURE → STREAMON,
+	 * with per-frame SPS via S_EXT_CTRLS class=0xf010000). Sending a
+	 * stale dummy SPS at context-init time would leave rpi-hevc-dec's
+	 * internal state on the dummy until the first real per-frame SPS
+	 * arrives — exact behavior unknown but a known divergence from
+	 * kdirect.
+	 *
+	 * Skip pre-seed when the active fd is rpi-hevc-dec. rkvdec /
+	 * hantro paths unchanged.
+	 */
+	if (driver_data->video_fd != driver_data->video_fd_rpi_hevc_dec) {
 		/*
 		 * iter39: 10-bit profiles set bit_depth_luma_minus8 = 2 in
 		 * the synthetic SPS so rkvdec's get_image_fmt resolves to
@@ -343,7 +397,7 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
 		default:
 			break;
 		}
-	}
+	}  /* iter40: end of pre-seed-skip-on-rpi-hevc-dec guard */
 
 	destination_planes_count = video_format->planes_count;
 
@@ -377,10 +431,39 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
 	 * changed by BeginPicture's slot acquisition.
 	 */
 	if (video_format->v4l2_buffers_count == 1) {
-		destination_sizes[0] = destination_bytesperlines[0] *
-				       format_height;
-		for (j = 1; j < destination_planes_count; j++)
-			destination_sizes[j] = destination_sizes[0] / 2;
+		if (video_format->v4l2_format == V4L2_PIX_FMT_NV12_COL128) {
+			/*
+			 * iter40: NC12 SAND layout: Y and UV interleave per
+			 * 128-byte column, so the UV base sits at TILE_W *
+			 * ALIGN(height, 8) inside column 0, not after all Y.
+			 * The kernel-reported destination_bytesperlines[0] is
+			 * the COLUMN stride (ALIGN(height,8)*3/2), not the
+			 * linear Y stride — using it × format_height gives the
+			 * wrong intra-buffer UV offset (destination_offsets[1]
+			 * derives from destination_sizes[0] in
+			 * surface_fill_format_uniform).
+			 *
+			 * Use format_width/format_height (kernel-returned from
+			 * G_FMT) not picture_width/height (caller request),
+			 * because the kernel applies its own ALIGN rules; the
+			 * UV plane location is keyed off the kernel layout.
+			 */
+			unsigned int uv_off = nv12_col128_uv_plane_offset(
+				format_width, format_height);
+			destination_sizes[0] = uv_off;
+			for (j = 1; j < destination_planes_count; j++)
+				destination_sizes[j] = uv_off / 2;
+			request_log("iter40: NC12 sizes pic=%ux%u fmt=%ux%u bpl=%u uv_off=%u sizeimage(kernel)=%u\n",
+				    picture_width, picture_height,
+				    format_width, format_height,
+				    destination_bytesperlines[0], uv_off,
+				    destination_bytesperlines[0] * format_height);
+		} else {
+			destination_sizes[0] = destination_bytesperlines[0] *
+					       format_height;
+			for (j = 1; j < destination_planes_count; j++)
+				destination_sizes[j] = destination_sizes[0] / 2;
+		}
 	}
 
 	/*
@@ -514,6 +597,18 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
 	 * + ANNEX_B (only supported menu values per Phase 0 v4l2_inventory).
 	 */
 	{
+		/*
+		 * iter40: per-driver HEVC start_code menu value. rkvdec /
+		 * hantro path uses ANNEX_B + start-code-prepended payload.
+		 * rpi-hevc-dec uses NONE — confirmed empirically Phase 7
+		 * (any other mode → V4L2_BUF_FLAG_ERROR on every CAPTURE
+		 * DQBUF, all-zero output). kdirect's strace also shows
+		 * start_code=0 on rpi-hevc-dec. Both are accepted by the
+		 * driver's QUERY_EXT_CTRL menu (min=0 max=1), but only NONE
+		 * actually drives correct decode on the Pi.
+		 */
+		bool is_rpi = (driver_data->video_fd ==
+			       driver_data->video_fd_rpi_hevc_dec);
 		struct v4l2_ext_control hevc_dev_ctrls[2] = {
 			{
 				.id = V4L2_CID_STATELESS_HEVC_DECODE_MODE,
@@ -521,7 +616,9 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
 			},
 			{
 				.id = V4L2_CID_STATELESS_HEVC_START_CODE,
-				.value = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B,
+				.value = is_rpi
+					? 0 /* V4L2_STATELESS_HEVC_START_CODE_NONE */
+					: V4L2_STATELESS_HEVC_START_CODE_ANNEX_B,
 			},
 		};
 		(void)v4l2_set_controls(driver_data->video_fd, -1,
@@ -554,18 +651,29 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
 	 * commit will replace this hardcoded assignment with a runtime
 	 * read of the kernel's accepted START_CODE value.
 	 */
-	switch (config_object->profile) {
-	case VAProfileH264Main:
-	case VAProfileH264High:
-	case VAProfileH264ConstrainedBaseline:
-	case VAProfileH264MultiviewHigh:
-	case VAProfileH264StereoHigh:
-	case VAProfileHEVCMain:
-		context_object->h264_start_code = true;
-		break;
-	default:
-		context_object->h264_start_code = false;
-		break;
+	{
+		bool is_rpi = (driver_data->video_fd ==
+			       driver_data->video_fd_rpi_hevc_dec);
+		switch (config_object->profile) {
+		case VAProfileH264Main:
+		case VAProfileH264High:
+		case VAProfileH264ConstrainedBaseline:
+		case VAProfileH264MultiviewHigh:
+		case VAProfileH264StereoHigh:
+			context_object->h264_start_code = true;
+			break;
+		case VAProfileHEVCMain:
+			/* iter40: rpi-hevc-dec rejects start-code-prepended
+			 * payload (DQBUF error flag on every CAPTURE buffer).
+			 * Gate to match the per-driver START_CODE menu value
+			 * set above: NONE on rpi → no prepend; ANNEX_B on
+			 * rkvdec → prepend. */
+			context_object->h264_start_code = !is_rpi;
+			break;
+		default:
+			context_object->h264_start_code = false;
+			break;
+		}
 	}
 
 	rc = v4l2_set_stream(driver_data->video_fd, output_type, true);
diff --git a/src/image.c b/src/image.c
index f20e6ca..0451a29 100644
--- a/src/image.c
+++ b/src/image.c
@@ -40,6 +40,7 @@
 #include <linux/dma-buf.h>
 
 #include "nv15.h"
+#include "nv12_col128.h"
 #include "tiled_yuv.h"
 #include "utils.h"
 #include "v4l2.h"
@@ -104,6 +105,25 @@ VAStatus RequestCreateImage(VADriverContextP context, VAImageFormat *format,
 		size = 0;
 		for (i = 0; i < destination_planes_count; i++)
 			size += destination_sizes[i];
+	} else if (format->fourcc == VA_FOURCC_NV12 &&
+		   video_format->v4l2_format == V4L2_PIX_FMT_NV12_COL128) {
+		/*
+		 * iter40 Phase 5 review F2: NC12 source, NV12 image output.
+		 * V4L2-reported destination_bytesperlines[0] is the NC12
+		 * column stride (= ALIGN(height,8) * 3/2 — e.g. 1080 for
+		 * 1280×720), NOT the linear NV12 Y stride. Override to the
+		 * linear stride (width) so VAImage pitches reflect the
+		 * detile-output layout the consumer reads.
+		 */
+		destination_bytesperlines[0] = width;
+		destination_sizes[0] = destination_bytesperlines[0] * format_height;
+		for (i = 1; i < destination_planes_count; i++) {
+			destination_bytesperlines[i] = destination_bytesperlines[0];
+			destination_sizes[i] = destination_sizes[0] / 2;
+		}
+		size = 0;
+		for (i = 0; i < destination_planes_count; i++)
+			size += destination_sizes[i];
 	} else {
 		/* NV12: V4L2 stride is correct, sizes derived from height. */
 		destination_sizes[0] = destination_bytesperlines[0] * format_height;
@@ -236,14 +256,31 @@ static VAStatus copy_surface_to_image (struct request_data *driver_data,
 	}
 
 	for (i = 0; i < surface_object->destination_planes_count; i++) {
-#ifdef __arm__
+		/*
+		 * iter40 Phase 5 review F1: guard extended from __arm__ to
+		 * __arm__ || __aarch64__. Without this, the detile primitives
+		 * silently compiled out on aarch64 (fresnel RK3399, ampere
+		 * RK3588, higgs Pi CM5) and the memcpy fall-through delivered
+		 * raw tiled bytes to NV12/P010 image consumers. iter39 5/5
+		 * PASS masked the issue because no 10-bit path was exercised.
+		 */
+#if defined(__arm__) || defined(__aarch64__)
+		/*
+		 * Sunxi tiled_to_planar lives in tiled_yuv.S which is
+		 * #ifdef __arm__ — symbol absent on aarch64. Keep this
+		 * branch arm-only; aarch64 Sunxi support would need a C or
+		 * aarch64-ASM port (no Sunxi aarch64 board in current fleet).
+		 */
+#if defined(__arm__)
 		if (!video_format_is_linear(driver_data->video_format))
 			tiled_to_planar(surface_object->destination_data[i],
 					buffer_object->data + image->offsets[i],
 					image->pitches[i], image->width,
 					i == 0 ? image->height :
 						 image->height / 2);
-		else if (driver_data->is_10bit &&
+		else
+#endif
+		if (driver_data->is_10bit &&
 			 image->format.fourcc == VA_FOURCC_P010) {
 			/*
 			 * iter39: rkvdec emits NV15 (4×10-bit packed in 5
@@ -260,12 +297,49 @@ static VAStatus copy_surface_to_image (struct request_data *driver_data,
 				(uint16_t *)(buffer_object->data + image->offsets[i]),
 				image->width, plane_h,
 				surface_object->destination_bytesperlines[i]);
+		} else if (driver_data->video_format != NULL &&
+			   driver_data->video_format->v4l2_format ==
+			   V4L2_PIX_FMT_NV12_COL128 &&
+			   image->format.fourcc == VA_FOURCC_NV12) {
+			/*
+			 * iter40: Pi 5 rpi-hevc-dec emits NV12_COL128 (SAND
+			 * 128-pixel-wide column tiles). Detile to linear NV12
+			 * via the per-plane primitive. surface_object->
+			 * destination_data[i] is the V4L2 CAPTURE mmap (single
+			 * buffer, planes_count==2): i==0 is the Y plane base,
+			 * i==1 is the UV base within the SAME physical buffer
+			 * (cap_pool plane[1] offset = 128 * ALIGN(h, 8), the
+			 * UV start inside column 0 of the COL128 layout).
+			 *
+			 * src_col_stride = destination_bytesperlines[i] = the
+			 * kernel-reported NC12 bytesperline (column stride,
+			 * = ALIGN(image_h, 8) * 3/2). Same for both planes
+			 * since column geometry is plane-agnostic.
+			 *
+			 * dst stride is image->pitches[i] = image->width
+			 * (overridden in RequestCreateImage NC12 branch above).
+			 */
+			if (i == 0) {
+				nv12_col128_detile_y(
+					(uint8_t *)(buffer_object->data + image->offsets[i]),
+					image->pitches[i],
+					surface_object->destination_data[i],
+					surface_object->destination_bytesperlines[i],
+					image->width, image->height);
+			} else {
+				nv12_col128_detile_uv(
+					(uint8_t *)(buffer_object->data + image->offsets[i]),
+					image->pitches[i],
+					surface_object->destination_data[i],
+					surface_object->destination_bytesperlines[i],
+					image->width, image->height / 2);
+			}
 		} else {
 #endif
 			memcpy(buffer_object->data + image->offsets[i],
 			       surface_object->destination_data[i],
 			       surface_object->destination_sizes[i]);
-#ifdef __arm__
+#if defined(__arm__) || defined(__aarch64__)
 		}
 #endif
 	}
diff --git a/src/meson.build b/src/meson.build
index c05b63e..6943ce2 100644
--- a/src/meson.build
+++ b/src/meson.build
@@ -52,6 +52,7 @@ sources = [
 	'vp9.c',
 	'codec.c',
 	'nv15.c',
+	'nv12_col128.c',
 
 	# Vendored GStreamer 1.28.2 H.265 parser + utilities (LGPL v2.1+,
 	# see src/h265_parser/gst_compat.h for sourcing notes + per-iter2
@@ -88,6 +89,7 @@ headers = [
 	'vp9.h',
 	'codec.h',
 	'nv15.h',
+	'nv12_col128.h',
 
 	# Internal mirror of Linux 7.0 V4L2 HEVC EXT_SPS_*_RPS UAPI defs
 	# (allows building against pre-7.0 linux-api-headers; redundant
diff --git a/src/nv12_col128.c b/src/nv12_col128.c
new file mode 100644
index 0000000..7817043
--- /dev/null
+++ b/src/nv12_col128.c
@@ -0,0 +1,114 @@
+/*
+ * V4L2_PIX_FMT_NV12_COL128 → linear NV12 detile primitive. Pi 5 / CM5
+ * rpi-hevc-dec CAPTURE. iter40 (2026-05-17).
+ *
+ * Math derived from kernel hevc_d_video.c (size formula) +
+ * ffmpeg/Kynesim libavutil/rpi_sand_fn_pw.h (per-pixel offset). Each
+ * output row is assembled with one memcpy per 128-byte tile column it
+ * touches; the copy length is clipped to the column boundary and to
+ * the remaining row width, so unaligned widths need no special case.
+ *
+ * No NEON / SIMD here — correctness first. Each output row generates
+ * (width / 128) + ~1 memcpys of up to 128 bytes; for 1920x1080 that's
+ * ~17000 small memcpys per frame, fine for Phase 1 PoC.
+ */
+
+#include "nv12_col128.h"
+
+#include <string.h>
+
+/*
+ * Tile column width in bytes. The 'COL128' name embeds this; if it ever
+ * varies, take it from V4L2_PIX_FMT_NV12_COL128's kernel definition.
+ */
+#define NC12_TILE_W   128
+
+/*
+ * Common Y / UV plane detile — the layout is identical (single-byte per
+ * pixel, column-major 128-wide tiles). The only thing that varies is
+ * what plane the caller passes in. width here is plane width in bytes
+ * (= image width for both Y and CbCr-interleaved NV12 UV); height is
+ * plane height in pixels (image height for Y, image height / 2 for UV).
+ */
+static void nv12_col128_detile_plane(uint8_t *dst, unsigned int dst_stride,
+                                     const uint8_t *src,
+                                     unsigned int src_col_stride,
+                                     unsigned int width, unsigned int height)
+{
+	/* Bytes occupied by one whole tile column (Y rows + UV rows). */
+	const size_t col_bytes = (size_t)NC12_TILE_W * src_col_stride;
+	unsigned int y, x;
+
+	for (y = 0; y < height; y++) {
+		/* size_t maths: y * dst_stride must not wrap at 32 bits. */
+		uint8_t *drow = dst + (size_t)y * dst_stride;
+		/* Row y sits y * 128 bytes into every tile column. */
+		const uint8_t *srow = src + (size_t)y * NC12_TILE_W;
+
+		for (x = 0; x < width; ) {
+			unsigned int col = x / NC12_TILE_W;
+			unsigned int in_col = x % NC12_TILE_W;
+			/* Clip the run to the column edge and the row end. */
+			unsigned int n = NC12_TILE_W - in_col;
+
+			if (n > width - x)
+				n = width - x;
+			/* Source = base + col * col_bytes + y * 128 + in_col;
+			 * the n bytes are contiguous inside this column. */
+			memcpy(drow + x, srow + col * col_bytes + in_col, n);
+			x += n;
+		}
+	}
+}
+
+void nv12_col128_detile_y(uint8_t *dst, unsigned int dst_stride,
+                          const uint8_t *src_y, unsigned int src_col_stride,
+                          unsigned int width, unsigned int height)
+{
+	nv12_col128_detile_plane(dst, dst_stride, src_y, src_col_stride,
+				 width, height); /* luma: all args pass through */
+}
+
+void nv12_col128_detile_uv(uint8_t *dst, unsigned int dst_stride,
+                           const uint8_t *src_uv, unsigned int src_col_stride,
+                           unsigned int width, unsigned int uv_height)
+{
+	/* CbCr is interleaved, so one chroma row is `width` bytes — the same
+	 * byte width as a luma row — and only the row count halves. src_uv is
+	 * expected to be buffer base + nv12_col128_uv_plane_offset(). */
+	nv12_col128_detile_plane(dst, dst_stride, src_uv, src_col_stride,
+				 width, uv_height);
+}
+
+unsigned int nv12_col128_uv_plane_offset(unsigned int image_width,
+                                         unsigned int image_height)
+{
+	/* Height rounded up to the next multiple of 8 rows. */
+	const unsigned int padded_rows = ((image_height + 7u) / 8u) * 8u;
+
+	/* The offset does not depend on the width; see the layout below. */
+	(void)image_width;
+
+	/*
+	 * COL128 SAND does not store a whole Y plane followed by a whole
+	 * UV plane. Each 128-pixel-wide column carries its own Y rows and
+	 * then its own UV rows:
+	 *   128 * height      bytes  Y data of this column strip
+	 *   128 * height / 2  bytes  UV data of this column strip
+	 *   128 * bytesperline (= 128 * height * 3/2) bytes per column
+	 *
+	 * The "UV plane base" (data[1] in AVFrame terms) is therefore
+	 * data[0] + 128 * height: where UV begins inside the FIRST column.
+	 * Later UV bytes follow the Y addressing rule (col * 128 *
+	 * bytesperline + y * 128 + in_col), so a detile starting at this
+	 * pointer with y in [0, height/2) reaches every column's UV rows.
+	 *
+	 * num_columns * 128 * padded_rows (the linear Y plane size) was an
+	 * earlier, wrong value: it drove the detile past the SAND buffer.
+	 *
+	 * Cross-check against kernel sizeimage = bytesperline * width =
+	 * num_columns * 128 * padded_rows * 3/2: per column that is 128 *
+	 * padded_rows of Y plus half as much UV, which sums to sizeimage.
+	 */
+	return padded_rows * NC12_TILE_W;
+}
diff --git a/src/nv12_col128.h b/src/nv12_col128.h
new file mode 100644
index 0000000..17798fb
--- /dev/null
+++ b/src/nv12_col128.h
@@ -0,0 +1,88 @@
+/*
+ * V4L2_PIX_FMT_NV12_COL128 (NC12) SAND-tiled → linear NV12 detile.
+ *
+ * Pi 5 / CM5 (BCM2712) rpi-hevc-dec CAPTURE format. iter40 (2026-05-17).
+ *
+ * Layout (kernel drivers/media/platform/raspberrypi/hevc_dec/hevc_d_video.c
+ * size-formula + ffmpeg/Kynesim libavutil/rpi_sand_fn_pw.h per-pixel
+ * offset math):
+ *
+ *   width  ALIGN(image_width,  128)   -- columns are 128 px wide
+ *   height ALIGN(image_height,   8)
+ *   col_stride (= bytesperline) = height * 3 / 2
+ *               (bytes per [128-wide column] vertical unit incl. Y + UV)
+ *   sizeimage  = col_stride * width = total bytes
+ *
+ *   For pixel (x, y) in the Y plane:
+ *     col      = x / 128
+ *     in_col_x = x % 128
+ *     offset   = col * col_stride * 128 + y * 128 + in_col_x
+ *
+ *   UV data starts at offset (128 * height) inside each column — the same
+ *   per-column addressing as Y, h/2 rows tall (CbCr interleaved).
+ *
+ * The primitive copies the entire image extent at once. width/height are
+ * the cropped consumer-visible dimensions; src_col_stride is the kernel-
+ * reported bytesperline (i.e. ALIGN(height,8) * 3/2).
+ */
+
+#ifndef _NV12_COL128_H_
+#define _NV12_COL128_H_
+
+#include <stdint.h>
+
+#include <linux/videodev2.h>
+
+/*
+ * Pre-Pi-kernel headers (Arch ALARM linux-api-headers, older mainline
+ * kernel-headers packages) may not define V4L2_PIX_FMT_NV12_COL128. The
+ * fourcc is Pi-specific. Provide a private fallback so the backend
+ * builds on hosts that target NON-Pi codecs too.
+ */
+#ifndef V4L2_PIX_FMT_NV12_COL128
+#define V4L2_PIX_FMT_NV12_COL128  \
+	((unsigned int)('N') | ((unsigned int)('C') << 8) | \
+	 ((unsigned int)('1') << 16) | ((unsigned int)('2') << 24))
+#endif
+
+#ifndef V4L2_PIX_FMT_NV12_10_COL128
+/* 10-bit SAND variant: 3 pixels packed into 4 bytes in 128-byte / 96-pixel
+ * wide columns. iter40 references the fourcc for completeness; the 10-bit
+ * Pi 5 HEVC chapter (Main10) is post-iter40. */
+#define V4L2_PIX_FMT_NV12_10_COL128  \
+	((unsigned int)('N') | ((unsigned int)('C') << 8) | \
+	 ((unsigned int)('3') << 16) | ((unsigned int)('0') << 24))
+#endif
+
+/* Detile the Y plane of an NC12 source to a linear NV12 Y plane.
+ *   dst         : pointer to linear NV12 Y plane (caller-owned, dst_stride * height bytes)
+ *   dst_stride  : linear Y plane stride in bytes (= width for plain NV12)
+ *   src_y       : pointer to start of NC12 Y plane (= NC12 buffer base)
+ *   src_col_stride: kernel-reported bytesperline (= ALIGN(height,8) * 3/2)
+ *   width, height: cropped image dimensions in pixels
+ */
+void nv12_col128_detile_y(uint8_t *dst, unsigned int dst_stride,
+                          const uint8_t *src_y, unsigned int src_col_stride,
+                          unsigned int width, unsigned int height);
+
+/* Detile the UV plane (CbCr interleaved, half-height) of an NC12 source.
+ *   dst         : pointer to linear NV12 UV plane
+ *   dst_stride  : linear UV plane stride in bytes (= width for NV12)
+ *   src_uv      : NC12 UV base (= src_y + nv12_col128_uv_plane_offset())
+ *   src_col_stride: same as Y plane (same column geometry)
+ *   width       : Y-plane width in pixels (UV plane has same byte width)
+ *   uv_height   : UV plane height = height / 2
+ */
+void nv12_col128_detile_uv(uint8_t *dst, unsigned int dst_stride,
+                           const uint8_t *src_uv, unsigned int src_col_stride,
+                           unsigned int width, unsigned int uv_height);
+
+/* Compute the offset of the UV plane within an NC12 buffer.
+ *   image_width, image_height: cropped image dimensions in pixels
+ *   Returns: byte offset from buffer start to UV plane start
+ *           (= 128 * ALIGN(image_height, 8); image_width is not used)
+ */
+unsigned int nv12_col128_uv_plane_offset(unsigned int image_width,
+                                         unsigned int image_height);
+
+#endif /* _NV12_COL128_H_ */
diff --git a/src/nv15.h b/src/nv15.h
index 3c8605a..039d620 100644
--- a/src/nv15.h
+++ b/src/nv15.h
@@ -27,6 +27,21 @@
 
 #include <stdint.h>
 
+#include <linux/videodev2.h>
+
+/*
+ * Older or downstream linux-api-headers / kernel-headers packages may
+ * not define V4L2_PIX_FMT_NV15. Provide a fallback so the backend
+ * builds on hosts whose headers are pre-NV15-merge or omit it (e.g.
+ * Pi 5 Debian trixie 6.12.62 headers include NC12 but not NV15).
+ * Same numeric value as mainline.
+ */
+#ifndef V4L2_PIX_FMT_NV15
+#define V4L2_PIX_FMT_NV15  \
+	((unsigned int)('N') | ((unsigned int)('V') << 8) | \
+	 ((unsigned int)('1') << 16) | ((unsigned int)('5') << 24))
+#endif
+
 /*
  * Unpack one plane of V4L2_PIX_FMT_NV15 (4 × 10-bit values packed into
  * 5 consecutive bytes, LSB-first) into VA_FOURCC_P010 (16-bit per pixel,
diff --git a/src/request.c b/src/request.c
index 20884be..ed97fe0 100644
--- a/src/request.c
+++ b/src/request.c
@@ -93,6 +93,7 @@
 static const char * const known_decoder_drivers[] = {
 	"rkvdec",
 	"hantro-vpu",
+	"rpi-hevc-dec",  /* iter40: Pi 5 / CM5 stateless HEVC */
 	"cedrus",
 	"sun4i_csi",
 	NULL
@@ -431,12 +432,31 @@ int request_switch_device_for_profile(struct request_data *driver_data,
 	char kind = request_device_kind_for_profile(profile);
 	int target_video, target_media;
 
+	/*
+	 * iter40: HEVC override when rpi-hevc-dec is probed. The static
+	 * table (request_device_kind_for_profile) maps HEVC → 'r' (rkvdec)
+	 * because that's the canonical RK path. On Pi 5 there's no rkvdec
+	 * — rpi-hevc-dec is the only decoder. When BOTH would be present
+	 * (hypothetical mixed board), prefer rpi-hevc-dec for HEVC.
+	 *
+	 * Other rkvdec-routed profiles (VP9, H.264) stay on 'r' because
+	 * rpi-hevc-dec is HEVC-only.
+	 */
+	if ((profile == VAProfileHEVCMain || profile == VAProfileHEVCMain10) &&
+	    driver_data->video_fd_rpi_hevc_dec >= 0 &&
+	    driver_data->media_fd_rpi_hevc_dec >= 0) {
+		kind = 'p';
+	}
+
 	if (kind == 'r') {
 		target_video = driver_data->video_fd_rkvdec;
 		target_media = driver_data->media_fd_rkvdec;
 	} else if (kind == 'h') {
 		target_video = driver_data->video_fd_hantro;
 		target_media = driver_data->media_fd_hantro;
+	} else if (kind == 'p') {
+		target_video = driver_data->video_fd_rpi_hevc_dec;
+		target_media = driver_data->media_fd_rpi_hevc_dec;
 	} else {
 		return -1;
 	}
@@ -624,6 +644,8 @@ VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context)
 	driver_data->media_fd_rkvdec = -1;
 	driver_data->video_fd_hantro = -1;
 	driver_data->media_fd_hantro = -1;
+	driver_data->video_fd_rpi_hevc_dec = -1;
+	driver_data->media_fd_rpi_hevc_dec = -1;
 
 	/*
 	 * iter38: probe BOTH rkvdec and hantro-vpu so a single libva session
@@ -654,6 +676,15 @@ VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context)
 				alt_driver = "rkvdec";
 				driver_data->video_fd_hantro = video_fd;
 				driver_data->media_fd_hantro = media_fd;
+			} else if (strcmp(info.driver, "rpi-hevc-dec") == 0) {
+				/* iter40: Pi 5 / CM5 — sole decoder is rpi-hevc-dec.
+				 * No alt driver to probe; the rkvdec / hantro slots
+				 * stay -1 and HEVC routes to 'p' via the override
+				 * in request_switch_device_for_profile. */
+				primary_driver = "rpi-hevc-dec";
+				alt_driver = NULL;
+				driver_data->video_fd_rpi_hevc_dec = video_fd;
+				driver_data->media_fd_rpi_hevc_dec = media_fd;
 			}
 		}
 
@@ -693,11 +724,19 @@ VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context)
 		probe_hevc_ext_sps_rps_controls(driver_data->video_fd_rkvdec);
 	driver_data->has_hevc_ext_sps_rps_hantro =
 		probe_hevc_ext_sps_rps_controls(driver_data->video_fd_hantro);
+	driver_data->has_hevc_ext_sps_rps_rpi_hevc_dec =
+		probe_hevc_ext_sps_rps_controls(driver_data->video_fd_rpi_hevc_dec);
 	if (driver_data->has_hevc_ext_sps_rps_rkvdec) {
 		request_log("iter2: kernel registers HEVC EXT_SPS_{ST,LT}_RPS "
 			    "controls on rkvdec fd (will route through "
 			    "vendored GStreamer parser)\n");
 	}
+	if (driver_data->video_fd_rpi_hevc_dec >= 0) {
+		request_log("iter40: also opened rpi-hevc-dec at video_fd=%d "
+			    "media_fd=%d (Pi 5 HEVC stateless)\n",
+			    driver_data->video_fd_rpi_hevc_dec,
+			    driver_data->media_fd_rpi_hevc_dec);
+	}
 
 	status = VA_STATUS_SUCCESS;
 	goto complete;
diff --git a/src/request.h b/src/request.h
index 2d67e2f..6c3c9a2 100644
--- a/src/request.h
+++ b/src/request.h
@@ -78,6 +78,15 @@ struct request_data {
 	int media_fd_rkvdec;
 	int video_fd_hantro;
 	int media_fd_hantro;
+	/*
+	 * iter40: third multi-device-probe slot for rpi-hevc-dec (Pi 5 /
+	 * CM5 / BCM2712). V4L2 stateless HEVC; CAPTURE is NC12/NC30 SAND
+	 * 128-pixel-wide column tiled (Pi-specific). On Pi 5 this is the
+	 * ONLY decoder slot; on RK hosts it stays -1 and HEVC routes to
+	 * rkvdec as before.
+	 */
+	int video_fd_rpi_hevc_dec;
+	int media_fd_rpi_hevc_dec;
 
 	/*
 	 * iter2 (ampere-kernel-decoders campaign) — per-fd probe result
@@ -98,6 +107,12 @@ struct request_data {
 	 */
 	bool has_hevc_ext_sps_rps_rkvdec;
 	bool has_hevc_ext_sps_rps_hantro;
+	/* iter40: rpi-hevc-dec doesn't expose EXT_SPS_*_RPS controls
+	 * (verified Phase 0 higgs probe: QUERY_EXT_CTRL on 0xa97 → EINVAL).
+	 * Probed for consistency with the iter2 pair-of-flags pattern;
+	 * stays false on Pi 5 and the iter2 vendored-parser path naturally
+	 * doesn't engage. */
+	bool has_hevc_ext_sps_rps_rpi_hevc_dec;
 
 	/*
 	 * iter2 — cached SPS-derived RPS arrays. SPS NALs only appear in
diff --git a/src/video.c b/src/video.c
index 04cbf25..f160a92 100644
--- a/src/video.c
+++ b/src/video.c
@@ -31,6 +31,8 @@
 #include <drm_fourcc.h>
 #include <linux/videodev2.h>
 
+#include "nv12_col128.h"  /* fallback V4L2_PIX_FMT_NV12_COL128 define */
+#include "nv15.h"         /* fallback V4L2_PIX_FMT_NV15 define */
 #include "utils.h"
 #include "video.h"
 
@@ -55,6 +57,28 @@ static struct video_format formats[] = {
 		.planes_count		= 2,
 		.bpp			= 24,
 	},
+	{
+		/*
+		 * iter40: Pi 5 / CM5 rpi-hevc-dec CAPTURE format. 8-bit NV12
+		 * stored as 128-pixel-wide column tiles (SAND128 layout).
+		 * Pi-specific; DRM has no dedicated fourcc (it is NV12 + a
+		 * BROADCOM_SAND128 modifier for DRM_PRIME). Our consumer path
+		 * always detiles to linear NV12 in copy_surface_to_image, so
+		 * we don't expose the SAND modifier downstream — drm_format is
+		 * still DRM_FORMAT_NV12 and drm_modifier MOD_NONE so the
+		 * format-is-linear gate doesn't pull us into tiled_to_planar
+		 * (Sunxi-specific). image.c branches on v4l2_format ==
+		 * V4L2_PIX_FMT_NV12_COL128 to invoke the dedicated detile.
+		 */
+		.description		= "NV12 SAND128 (8-bit, rpi-hevc-dec)",
+		.v4l2_format		= V4L2_PIX_FMT_NV12_COL128,
+		.v4l2_buffers_count	= 1,
+		.v4l2_mplane		= true,
+		.drm_format		= DRM_FORMAT_NV12,
+		.drm_modifier		= DRM_FORMAT_MOD_NONE,
+		.planes_count		= 2,
+		.bpp			= 16,
+	},
 // Code to handle this DRM_FORMAT is __arm__ only
 #ifdef __arm__
 	{
diff --git a/tests/test_nv12_col128_detile.c b/tests/test_nv12_col128_detile.c
new file mode 100644
index 0000000..87d1b39
--- /dev/null
+++ b/tests/test_nv12_col128_detile.c
@@ -0,0 +1,196 @@
+/*
+ * Copyright (C) 2026 claude-noether <claude-noether@reauktion.de>
+ *
+ * MIT-licensed per project. iter40 self-test for nv12_col128 detile.
+ *
+ * Build an NC12-tiled source buffer from a known linear NV12 image,
+ * run the detile primitive, assert output matches the original. No
+ * hardware needed — pure bit-layout verification of the kernel math
+ * (drivers/media/platform/raspberrypi/hevc_dec/hevc_d_video.c
+ * V4L2_PIX_FMT_NV12_COL128 case + ffmpeg/Kynesim per-pixel offset).
+ *
+ * Build:
+ *   cc -Wall -Werror -O2 -o test_nv12_col128_detile \
+ *      tests/test_nv12_col128_detile.c src/nv12_col128.c
+ *
+ * Exit 0 = all asserts pass.
+ */
+
+#include "../src/nv12_col128.h"
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define TILE_W 128
+
+static unsigned int align_up(unsigned int v, unsigned int a)
+{
+	return (v + a - 1) & ~(a - 1);
+}
+
+/* Pack a linear plane (width × height bytes, stride=width) into NC12
+ * layout: each 128-wide column is contiguous, column N starts at byte
+ * N * 128 * col_stride, where col_stride = ALIGN(height, 8) * 3/2 (the
+ * kernel's bytesperline formula). Zero-padded; caller frees the result. */
+static uint8_t *pack_to_nc12(const uint8_t *linear,
+			     unsigned int width, unsigned int height,
+			     unsigned int *out_col_stride,
+			     size_t *out_size)
+{
+	unsigned int aligned_w = align_up(width, TILE_W);
+	unsigned int aligned_h = align_up(height, 8);
+	unsigned int col_stride = aligned_h * 3 / 2;
+	unsigned int num_cols = aligned_w / TILE_W;
+	size_t total = (size_t)col_stride * aligned_w;
+	uint8_t *buf;
+	unsigned int col, y, in_col;
+
+	buf = calloc(1, total);
+	assert(buf != NULL);
+
+	for (col = 0; col < num_cols; col++) {
+		uint8_t *col_base = buf + (size_t)col * TILE_W * col_stride;
+		for (y = 0; y < height; y++) {
+			for (in_col = 0; in_col < TILE_W; in_col++) {
+				unsigned int x = col * TILE_W + in_col;
+				if (x >= width)
+					break;
+				col_base[(size_t)y * TILE_W + in_col] =
+					linear[(size_t)y * width + x];
+			}
+		}
+	}
+
+	*out_col_stride = col_stride;
+	*out_size = total;
+	return buf;
+}
+
+static void test_detile_y(unsigned int width, unsigned int height)
+{
+	uint8_t *linear, *tiled, *recovered;
+	unsigned int col_stride;
+	size_t tile_size, i;
+
+	linear = malloc((size_t)width * height);
+	assert(linear != NULL);
+	/* Distinctive content per pixel: y * 17 + x * 13 — avoids byte-
+	 * aliasing patterns that could mask off-by-one bugs. */
+	for (unsigned int y = 0; y < height; y++)
+		for (unsigned int x = 0; x < width; x++)
+			linear[(size_t)y * width + x] = (uint8_t)(y * 17 + x * 13);
+
+	tiled = pack_to_nc12(linear, width, height, &col_stride, &tile_size);
+
+	recovered = calloc(1, (size_t)width * height);
+	assert(recovered != NULL);
+
+	nv12_col128_detile_y(recovered, width, tiled, col_stride, width, height);
+
+	for (i = 0; i < (size_t)width * height; i++) {
+		if (recovered[i] != linear[i]) {
+			fprintf(stderr,
+				"FAIL %ux%u Y: pixel %zu (x=%zu y=%zu) "
+				"linear=0x%02x recovered=0x%02x\n",
+				width, height, i,
+				i % width, i / width,
+				linear[i], recovered[i]);
+			free(linear); free(tiled); free(recovered);
+			exit(1);
+		}
+	}
+	printf("PASS %ux%u Y plane (%u columns, col_stride=%u, tile_size=%zu)\n",
+	       width, height, align_up(width, TILE_W) / TILE_W,
+	       col_stride, tile_size);
+
+	free(linear);
+	free(tiled);
+	free(recovered);
+}
+
+static void test_detile_uv(unsigned int width, unsigned int height)
+{
+	unsigned int uv_h = height / 2;
+	uint8_t *linear, *tiled, *recovered;
+	unsigned int col_stride;
+	size_t tile_size, i;
+
+	linear = malloc((size_t)width * uv_h);
+	assert(linear != NULL);
+	for (unsigned int y = 0; y < uv_h; y++)
+		for (unsigned int x = 0; x < width; x++)
+			linear[(size_t)y * width + x] = (uint8_t)(y * 23 + x * 7);
+
+	tiled = pack_to_nc12(linear, width, uv_h, &col_stride, &tile_size);
+
+	recovered = calloc(1, (size_t)width * uv_h);
+	assert(recovered != NULL);
+
+	nv12_col128_detile_uv(recovered, width, tiled, col_stride, width, uv_h);
+
+	for (i = 0; i < (size_t)width * uv_h; i++) {
+		if (recovered[i] != linear[i]) {
+			fprintf(stderr,
+				"FAIL %ux%u UV: pixel %zu linear=0x%02x recovered=0x%02x\n",
+				width, height, i,
+				linear[i], recovered[i]);
+			free(linear); free(tiled); free(recovered);
+			exit(1);
+		}
+	}
+	printf("PASS %ux%u UV plane\n", width, height);
+
+	free(linear);
+	free(tiled);
+	free(recovered);
+}
+
+static void test_uv_offset(void)
+{
+	/* Per the SAND COL128 layout, each 128-wide column stores its Y
+	 * rows followed by its own UV rows (not two whole concatenated
+	 * planes), so the UV plane base is offset by 128 * ALIGN(height, 8)
+	 * — the Y portion of column 0. NOT 128 * height * num_columns (all
+	 * Y across all columns): that earlier formula caused the Phase 7
+	 * SEGV on higgs. Both heights tested below are already 8-aligned. */
+	unsigned int off = nv12_col128_uv_plane_offset(1280, 720);
+	if (off != 128u * 720) {
+		fprintf(stderr, "FAIL UV offset 1280×720: got %u expected %u\n",
+			off, 128u * 720);
+		exit(1);
+	}
+	printf("PASS UV offset 1280×720 = %u\n", off);
+
+	off = nv12_col128_uv_plane_offset(1366, 768);
+	if (off != 128u * 768) {
+		fprintf(stderr, "FAIL UV offset 1366×768: got %u expected %u\n",
+			off, 128u * 768);
+		exit(1);
+	}
+	printf("PASS UV offset 1366×768 (column-misaligned width)\n");
+}
+
+int main(void)
+{
+	/* Phase 3 fixture sizes — all 128-aligned, 8-line-aligned. */
+	test_detile_y(640, 360);
+	test_detile_y(1280, 720);
+	test_detile_y(1920, 1080);
+
+	/* Phase 5 review F4: column-misaligned width (1366 → 1408 padding). */
+	test_detile_y(1366, 768);
+
+	/* UV plane (half-height) at each width. */
+	test_detile_uv(640, 360);
+	test_detile_uv(1280, 720);
+	test_detile_uv(1920, 1080);
+	test_detile_uv(1366, 768);
+
+	test_uv_offset();
+
+	printf("All NC12 detile asserts pass.\n");
+	return 0;
+}