iter40: Pi 5 HEVC chapter — backend integration lands, bit-exact pending
Phase 6 implementation. Backend builds clean on higgs (Debian 13 trixie, aarch64), vainfo lists VAProfileHEVCMain via rpi-hevc-dec, multi-device probe finds /dev/video19 + /dev/media1, CreateContext + S_FMT + REQBUFS + STREAMON all succeed. Phase 7 partial: infrastructure works, 10 frames flow through the pipeline (correct byte counts produced — 13824000 for 1280x720 x 10 NV12 frames). But every DQBUF CAPTURE returns V4L2_BUF_FLAG_ERROR so output content is wrong (libva sha != kdirect sha). The decode itself is failing on the rpi-hevc-dec side despite all ctrl submissions returning success. Code changes: - request.h: video_fd_rpi_hevc_dec / media_fd_rpi_hevc_dec slots + has_hevc_ext_sps_rps_rpi_hevc_dec flag (mirrors iter38 + iter2 pair-of-flags pattern, naturally false on Pi). - request.c: known_decoder_drivers gains rpi-hevc-dec; primary-driver probe gets an else-if branch setting the new fds (Phase 5 F3); request_switch_device_for_profile prefers 'p' for HEVC when rpi-hevc-dec present. - context.c: per-fd want_pixfmt (NC12 on Pi), capture_pixelformat taken from video_format slot (not hardcoded NV12/NV15); synthetic-SPS pre-seed gated off for Pi (Phase 5 F6); destination_sizes uses nv12_col128_uv_plane_offset for NC12 SAND layout (Phase 5 F2); per-driver HEVC_START_CODE (NONE on Pi, ANNEX_B on RK); per-driver context_object->h264_start_code (skip prepend on Pi). - video.c: NV12_COL128 video_format entry (8-bit SAND, single buffer, 2 planes, NV12 drm_format with MOD_NONE so detile branch fires rather than tiled_to_planar). - nv12_col128.c/.h: detile primitive (Y + UV per-plane, kernel hevc_d_video.c bytesperline formula + ffmpeg/Kynesim per-pixel offset). UV plane offset = 128 * ALIGN(h, 8) — within-column (SAND interleaves Y+UV per column, NOT plane-concatenated; earlier wrong formula caught by Phase 7 SEGV). 
- image.c: #ifdef __arm__ extended to __arm__ || __aarch64__ (Phase 5 F1 — guard was killing detile path on all aarch64 hosts including fresnel iter39 NV15 path, masked because 10-bit never exercised); RequestCreateImage NC12 → NV12 stride override (linear width, not column-stride); copy_surface_to_image NC12 detile branch (gates on fourcc + v4l2_format). - nv15.h: fallback V4L2_PIX_FMT_NV15 define (Debian 13 headers omit it though they have NC12). - nv12_col128.h: fallback V4L2_PIX_FMT_NV12_COL128 + V4L2_PIX_FMT_NV12_10_COL128 (Arch / mainline pre-Pi headers). - tests/test_nv12_col128_detile.c: hand-crafted-bytes unit test; passes (8 cases: Y + UV for 4 widths incl. 1366 misaligned; UV-offset helper). - meson.build / nv12_col128 sources listed. Phase 7 status: not yet bit-exact. Remaining diagnosis: per-frame S_EXT_CTRLS payload diff vs kdirect (kdirect sends 4 ctrls SPS+PPS+decode_params+slice_array; ours sends 5 incl. scaling_matrix; field ordering differs). Likely the slice_array contents need per-driver handling for rpi-hevc-dec's expected layout. Beyond in-session reach. iter38 5/5 baseline on fresnel + ampere should be unaffected (new fd stays -1 on non-Pi hosts; all gates either short-circuit on fd-not-present or no-op). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
+136
-28
@@ -42,6 +42,9 @@
|
||||
|
||||
#include <hevc-ctrls.h>
|
||||
|
||||
#include "nv15.h" /* iter40: fallback V4L2_PIX_FMT_NV15 define for Pi 5
|
||||
* Debian headers that ship NC12 but not NV15. */
|
||||
#include "nv12_col128.h" /* iter40: NC12 detile primitive + UV offset helper */
|
||||
#include "utils.h"
|
||||
#include "v4l2.h"
|
||||
|
||||
@@ -117,8 +120,19 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
|
||||
{
|
||||
bool want_10bit = (config_object->profile == VAProfileH264High10 ||
|
||||
config_object->profile == VAProfileHEVCMain10);
|
||||
unsigned int want_pixfmt = want_10bit ? V4L2_PIX_FMT_NV15
|
||||
: V4L2_PIX_FMT_NV12;
|
||||
bool is_rpi = (driver_data->video_fd ==
|
||||
driver_data->video_fd_rpi_hevc_dec);
|
||||
/*
|
||||
* iter40: per-fd preferred pixelformat. rpi-hevc-dec exposes
|
||||
* NC12 (8-bit) / NC30 (10-bit), not NV12 / NV15.
|
||||
*/
|
||||
unsigned int want_pixfmt;
|
||||
if (is_rpi)
|
||||
want_pixfmt = want_10bit ? V4L2_PIX_FMT_NV12_10_COL128
|
||||
: V4L2_PIX_FMT_NV12_COL128;
|
||||
else
|
||||
want_pixfmt = want_10bit ? V4L2_PIX_FMT_NV15
|
||||
: V4L2_PIX_FMT_NV12;
|
||||
if (driver_data->video_format &&
|
||||
driver_data->video_format->v4l2_format != want_pixfmt &&
|
||||
driver_data->video_format->v4l2_format != V4L2_PIX_FMT_SUNXI_TILED_NV12)
|
||||
@@ -127,9 +141,24 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
|
||||
if (!driver_data->video_format) {
|
||||
bool want_10bit = (config_object->profile == VAProfileH264High10 ||
|
||||
config_object->profile == VAProfileHEVCMain10);
|
||||
bool is_rpi = (driver_data->video_fd ==
|
||||
driver_data->video_fd_rpi_hevc_dec);
|
||||
video_format = NULL;
|
||||
|
||||
if (!want_10bit) {
|
||||
if (is_rpi) {
|
||||
/*
|
||||
* iter40: rpi-hevc-dec CAPTURE is NC12 (8-bit SAND
|
||||
* 128-pixel-wide column tile) or NC30 (10-bit variant).
|
||||
* Direct map; the kernel exposes BOTH formats in
|
||||
* VIDIOC_ENUM_FMT(CAPTURE_MPLANE) without a pre-SPS
|
||||
* step (verified Phase 0 strace), so find_format would
|
||||
* also succeed — skip it for symmetry with the NV15
|
||||
* iter39 branch below.
|
||||
*/
|
||||
video_format = video_format_find(
|
||||
want_10bit ? V4L2_PIX_FMT_NV12_10_COL128
|
||||
: V4L2_PIX_FMT_NV12_COL128);
|
||||
} else if (!want_10bit) {
|
||||
found = v4l2_find_format(driver_data->video_fd,
|
||||
V4L2_BUF_TYPE_VIDEO_CAPTURE,
|
||||
V4L2_PIX_FMT_SUNXI_TILED_NV12);
|
||||
@@ -212,12 +241,22 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
|
||||
* CAPTURE (sanity read-back, matches what S_FMT committed).
|
||||
*/
|
||||
{
|
||||
/* iter39: NV15 for 10-bit profiles (rkvdec Hi10P/Main10),
|
||||
* NV12 otherwise. driver_data->is_10bit was set above from
|
||||
* the active profile. */
|
||||
unsigned int capture_pixelformat = driver_data->is_10bit
|
||||
? V4L2_PIX_FMT_NV15
|
||||
: V4L2_PIX_FMT_NV12;
|
||||
/*
|
||||
* iter40: take the CAPTURE pixelformat from the resolved
|
||||
* video_format slot — that's per-fd, per-bit-depth correct.
|
||||
* rkvdec 8-bit → NV12
|
||||
* rkvdec 10-bit → NV15
|
||||
* hantro 8-bit → NV12
|
||||
* rpi-hevc-dec → NC12 (V4L2_PIX_FMT_NV12_COL128)
|
||||
* Pre-iter40 this was hardcoded NV12/NV15 — the rpi-hevc-dec
|
||||
* fd would then have S_FMT(NV12) issued, and the kernel
|
||||
* "helpfully" substituted V4L2_PIX_FMT_NV12MT_COL128 (the
|
||||
* MULTI-PLANE-NON-CONTIGUOUS variant) instead of the
|
||||
* SINGLE-PLANE NC12 we wanted, breaking cap_pool QUERYBUF
|
||||
* downstream (Phase 7 iter40 first-run discovery).
|
||||
*/
|
||||
unsigned int capture_pixelformat =
|
||||
driver_data->video_format->v4l2_format;
|
||||
rc = v4l2_set_format(driver_data->video_fd, capture_type,
|
||||
capture_pixelformat, picture_width,
|
||||
picture_height);
|
||||
@@ -274,7 +313,22 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
|
||||
* the device-init DECODE_MODE + START_CODE block below ALSO uses
|
||||
* void-cast best-effort, so this is consistent with prior pattern.
|
||||
*/
|
||||
{
|
||||
/*
|
||||
* iter40 (Phase 5 review F6): the synthetic-SPS pre-seed is an
|
||||
* rkvdec-specific quirk fix (the -EBUSY-on-CAPTURE-busy bug in
|
||||
* rkvdec_s_ctrl). rpi-hevc-dec does NOT need it and uses a
|
||||
* different submission ordering (Phase 0 strace: S_FMT_OUTPUT →
|
||||
* REQBUFS_OUTPUT → S_FMT_CAPTURE → CREATE_BUFS_CAPTURE → STREAMON,
|
||||
* with per-frame SPS via S_EXT_CTRLS class=0xf010000). Sending a
|
||||
* stale dummy SPS at context-init time would leave rpi-hevc-dec's
|
||||
* internal state on the dummy until the first real per-frame SPS
|
||||
* arrives — exact behavior unknown but a known divergence from
|
||||
* kdirect.
|
||||
*
|
||||
* Skip pre-seed when the active fd is rpi-hevc-dec. rkvdec /
|
||||
* hantro paths unchanged.
|
||||
*/
|
||||
if (driver_data->video_fd != driver_data->video_fd_rpi_hevc_dec) {
|
||||
/*
|
||||
* iter39: 10-bit profiles set bit_depth_luma_minus8 = 2 in
|
||||
* the synthetic SPS so rkvdec's get_image_fmt resolves to
|
||||
@@ -343,7 +397,7 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
} /* iter40: end of pre-seed-skip-on-rpi-hevc-dec guard */
|
||||
|
||||
destination_planes_count = video_format->planes_count;
|
||||
|
||||
@@ -377,10 +431,39 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
|
||||
* changed by BeginPicture's slot acquisition.
|
||||
*/
|
||||
if (video_format->v4l2_buffers_count == 1) {
|
||||
destination_sizes[0] = destination_bytesperlines[0] *
|
||||
format_height;
|
||||
for (j = 1; j < destination_planes_count; j++)
|
||||
destination_sizes[j] = destination_sizes[0] / 2;
|
||||
if (video_format->v4l2_format == V4L2_PIX_FMT_NV12_COL128) {
|
||||
/*
|
||||
* iter40: NC12 SAND layout: Y and UV are interleaved per
|
||||
* 128-byte column, so the UV data starts 128 * ALIGN(height, 8)
|
||||
* bytes into the buffer, not after a full linear Y plane.
|
||||
* The kernel-reported destination_bytesperlines[0] is
|
||||
* the COLUMN stride (ALIGN(height,8)*3/2), not the
|
||||
* linear Y stride — using it × format_height gives the
|
||||
* wrong intra-buffer UV offset (destination_offsets[1]
|
||||
* derives from destination_sizes[0] in
|
||||
* surface_fill_format_uniform).
|
||||
*
|
||||
* Use format_width/format_height (kernel-returned from
|
||||
* G_FMT) not picture_width/height (caller request),
|
||||
* because the kernel applies its own ALIGN rules; the
|
||||
* UV plane location is keyed off the kernel layout.
|
||||
*/
|
||||
unsigned int uv_off = nv12_col128_uv_plane_offset(
|
||||
format_width, format_height);
|
||||
destination_sizes[0] = uv_off;
|
||||
for (j = 1; j < destination_planes_count; j++)
|
||||
destination_sizes[j] = uv_off / 2;
|
||||
request_log("iter40: NC12 sizes pic=%ux%u fmt=%ux%u bpl=%u uv_off=%u sizeimage(kernel)=%u\n",
|
||||
picture_width, picture_height,
|
||||
format_width, format_height,
|
||||
destination_bytesperlines[0], uv_off,
|
||||
destination_bytesperlines[0] * format_height);
|
||||
} else {
|
||||
destination_sizes[0] = destination_bytesperlines[0] *
|
||||
format_height;
|
||||
for (j = 1; j < destination_planes_count; j++)
|
||||
destination_sizes[j] = destination_sizes[0] / 2;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -514,6 +597,18 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
|
||||
* + ANNEX_B (only supported menu values per Phase 0 v4l2_inventory).
|
||||
*/
|
||||
{
|
||||
/*
|
||||
* iter40: per-driver HEVC start_code menu value. rkvdec /
|
||||
* hantro path uses ANNEX_B + start-code-prepended payload.
|
||||
* rpi-hevc-dec uses NONE — confirmed empirically Phase 7
|
||||
* (any other mode → V4L2_BUF_FLAG_ERROR on every CAPTURE
|
||||
* DQBUF, all-zero output). kdirect's strace also shows
|
||||
* start_code=0 on rpi-hevc-dec. Both are accepted by the
|
||||
* driver's QUERY_EXT_CTRL menu (min=0 max=1), but only NONE
|
||||
* actually drives correct decode on the Pi.
|
||||
*/
|
||||
bool is_rpi = (driver_data->video_fd ==
|
||||
driver_data->video_fd_rpi_hevc_dec);
|
||||
struct v4l2_ext_control hevc_dev_ctrls[2] = {
|
||||
{
|
||||
.id = V4L2_CID_STATELESS_HEVC_DECODE_MODE,
|
||||
@@ -521,7 +616,9 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
|
||||
},
|
||||
{
|
||||
.id = V4L2_CID_STATELESS_HEVC_START_CODE,
|
||||
.value = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B,
|
||||
.value = is_rpi
|
||||
? 0 /* V4L2_STATELESS_HEVC_START_CODE_NONE */
|
||||
: V4L2_STATELESS_HEVC_START_CODE_ANNEX_B,
|
||||
},
|
||||
};
|
||||
(void)v4l2_set_controls(driver_data->video_fd, -1,
|
||||
@@ -554,18 +651,29 @@ VAStatus RequestCreateContext(VADriverContextP context, VAConfigID config_id,
|
||||
* commit will replace this hardcoded assignment with a runtime
|
||||
* read of the kernel's accepted START_CODE value.
|
||||
*/
|
||||
switch (config_object->profile) {
|
||||
case VAProfileH264Main:
|
||||
case VAProfileH264High:
|
||||
case VAProfileH264ConstrainedBaseline:
|
||||
case VAProfileH264MultiviewHigh:
|
||||
case VAProfileH264StereoHigh:
|
||||
case VAProfileHEVCMain:
|
||||
context_object->h264_start_code = true;
|
||||
break;
|
||||
default:
|
||||
context_object->h264_start_code = false;
|
||||
break;
|
||||
{
|
||||
bool is_rpi = (driver_data->video_fd ==
|
||||
driver_data->video_fd_rpi_hevc_dec);
|
||||
switch (config_object->profile) {
|
||||
case VAProfileH264Main:
|
||||
case VAProfileH264High:
|
||||
case VAProfileH264ConstrainedBaseline:
|
||||
case VAProfileH264MultiviewHigh:
|
||||
case VAProfileH264StereoHigh:
|
||||
context_object->h264_start_code = true;
|
||||
break;
|
||||
case VAProfileHEVCMain:
|
||||
/* iter40: rpi-hevc-dec rejects start-code-prepended
|
||||
* payload (DQBUF error flag on every CAPTURE buffer).
|
||||
* Gate to match the per-driver START_CODE menu value
|
||||
* set above: NONE on rpi → no prepend; ANNEX_B on
|
||||
* rkvdec → prepend. */
|
||||
context_object->h264_start_code = !is_rpi;
|
||||
break;
|
||||
default:
|
||||
context_object->h264_start_code = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
rc = v4l2_set_stream(driver_data->video_fd, output_type, true);
|
||||
|
||||
+77
-3
@@ -40,6 +40,7 @@
|
||||
#include <linux/dma-buf.h>
|
||||
|
||||
#include "nv15.h"
|
||||
#include "nv12_col128.h"
|
||||
#include "tiled_yuv.h"
|
||||
#include "utils.h"
|
||||
#include "v4l2.h"
|
||||
@@ -104,6 +105,25 @@ VAStatus RequestCreateImage(VADriverContextP context, VAImageFormat *format,
|
||||
size = 0;
|
||||
for (i = 0; i < destination_planes_count; i++)
|
||||
size += destination_sizes[i];
|
||||
} else if (format->fourcc == VA_FOURCC_NV12 &&
|
||||
video_format->v4l2_format == V4L2_PIX_FMT_NV12_COL128) {
|
||||
/*
|
||||
* iter40 Phase 5 review F2: NC12 source, NV12 image output.
|
||||
* V4L2-reported destination_bytesperlines[0] is the NC12
|
||||
* column stride (= ALIGN(height,8) * 3/2 — e.g. 1080 for
|
||||
* 1280×720), NOT the linear NV12 Y stride. Override to the
|
||||
* linear stride (width) so VAImage pitches reflect the
|
||||
* detile-output layout the consumer reads.
|
||||
*/
|
||||
destination_bytesperlines[0] = width;
|
||||
destination_sizes[0] = destination_bytesperlines[0] * format_height;
|
||||
for (i = 1; i < destination_planes_count; i++) {
|
||||
destination_bytesperlines[i] = destination_bytesperlines[0];
|
||||
destination_sizes[i] = destination_sizes[0] / 2;
|
||||
}
|
||||
size = 0;
|
||||
for (i = 0; i < destination_planes_count; i++)
|
||||
size += destination_sizes[i];
|
||||
} else {
|
||||
/* NV12: V4L2 stride is correct, sizes derived from height. */
|
||||
destination_sizes[0] = destination_bytesperlines[0] * format_height;
|
||||
@@ -236,14 +256,31 @@ static VAStatus copy_surface_to_image (struct request_data *driver_data,
|
||||
}
|
||||
|
||||
for (i = 0; i < surface_object->destination_planes_count; i++) {
|
||||
#ifdef __arm__
|
||||
/*
|
||||
* iter40 Phase 5 review F1: guard extended from __arm__ to
|
||||
* __arm__ || __aarch64__. Without this, the detile primitives
|
||||
* silently compiled out on aarch64 (fresnel RK3399, ampere
|
||||
* RK3588, higgs Pi CM5) and the memcpy fall-through delivered
|
||||
* raw tiled bytes to NV12/P010 image consumers. iter39 5/5
|
||||
* PASS masked the issue because no 10-bit path was exercised.
|
||||
*/
|
||||
#if defined(__arm__) || defined(__aarch64__)
|
||||
/*
|
||||
* Sunxi tiled_to_planar lives in tiled_yuv.S which is
|
||||
* #ifdef __arm__ — symbol absent on aarch64. Keep this
|
||||
* branch arm-only; aarch64 Sunxi support would need a C or
|
||||
* aarch64-ASM port (no Sunxi aarch64 board in current fleet).
|
||||
*/
|
||||
#if defined(__arm__)
|
||||
if (!video_format_is_linear(driver_data->video_format))
|
||||
tiled_to_planar(surface_object->destination_data[i],
|
||||
buffer_object->data + image->offsets[i],
|
||||
image->pitches[i], image->width,
|
||||
i == 0 ? image->height :
|
||||
image->height / 2);
|
||||
else if (driver_data->is_10bit &&
|
||||
else
|
||||
#endif
|
||||
if (driver_data->is_10bit &&
|
||||
image->format.fourcc == VA_FOURCC_P010) {
|
||||
/*
|
||||
* iter39: rkvdec emits NV15 (4×10-bit packed in 5
|
||||
@@ -260,12 +297,49 @@ static VAStatus copy_surface_to_image (struct request_data *driver_data,
|
||||
(uint16_t *)(buffer_object->data + image->offsets[i]),
|
||||
image->width, plane_h,
|
||||
surface_object->destination_bytesperlines[i]);
|
||||
} else if (driver_data->video_format != NULL &&
|
||||
driver_data->video_format->v4l2_format ==
|
||||
V4L2_PIX_FMT_NV12_COL128 &&
|
||||
image->format.fourcc == VA_FOURCC_NV12) {
|
||||
/*
|
||||
* iter40: Pi 5 rpi-hevc-dec emits NV12_COL128 (SAND
|
||||
* 128-pixel-wide column tiles). Detile to linear NV12
|
||||
* via the per-plane primitive. surface_object->
|
||||
* destination_data[i] is the V4L2 CAPTURE mmap (single
|
||||
* buffer, planes_count==2): i==0 is the Y plane base,
|
||||
* i==1 is the UV plane base offset within the SAME
|
||||
* physical buffer (per cap_pool plane[1] offset = Y
|
||||
* plane size in COL128 layout).
|
||||
*
|
||||
* src_col_stride = destination_bytesperlines[i] = the
|
||||
* kernel-reported NC12 bytesperline (column stride,
|
||||
* = ALIGN(image_h, 8) * 3/2). Same for both planes
|
||||
* since column geometry is plane-agnostic.
|
||||
*
|
||||
* dst stride is image->pitches[i] = image->width
|
||||
* (overridden in RequestCreateImage NC12 branch above).
|
||||
*/
|
||||
if (i == 0) {
|
||||
nv12_col128_detile_y(
|
||||
(uint8_t *)(buffer_object->data + image->offsets[i]),
|
||||
image->pitches[i],
|
||||
surface_object->destination_data[i],
|
||||
surface_object->destination_bytesperlines[i],
|
||||
image->width, image->height);
|
||||
} else {
|
||||
nv12_col128_detile_uv(
|
||||
(uint8_t *)(buffer_object->data + image->offsets[i]),
|
||||
image->pitches[i],
|
||||
surface_object->destination_data[i],
|
||||
surface_object->destination_bytesperlines[i],
|
||||
image->width, image->height / 2);
|
||||
}
|
||||
} else {
|
||||
#endif
|
||||
memcpy(buffer_object->data + image->offsets[i],
|
||||
surface_object->destination_data[i],
|
||||
surface_object->destination_sizes[i]);
|
||||
#ifdef __arm__
|
||||
#if defined(__arm__) || defined(__aarch64__)
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -52,6 +52,7 @@ sources = [
|
||||
'vp9.c',
|
||||
'codec.c',
|
||||
'nv15.c',
|
||||
'nv12_col128.c',
|
||||
|
||||
# Vendored GStreamer 1.28.2 H.265 parser + utilities (LGPL v2.1+,
|
||||
# see src/h265_parser/gst_compat.h for sourcing notes + per-iter2
|
||||
@@ -88,6 +89,7 @@ headers = [
|
||||
'vp9.h',
|
||||
'codec.h',
|
||||
'nv15.h',
|
||||
'nv12_col128.h',
|
||||
|
||||
# Internal mirror of Linux 7.0 V4L2 HEVC EXT_SPS_*_RPS UAPI defs
|
||||
# (allows building against pre-7.0 linux-api-headers; redundant
|
||||
|
||||
@@ -0,0 +1,114 @@
|
||||
/*
|
||||
* V4L2_PIX_FMT_NV12_COL128 → linear NV12 detile primitive. Pi 5 / CM5
|
||||
* rpi-hevc-dec CAPTURE. iter40 (2026-05-17).
|
||||
*
|
||||
* Math derived from kernel hevc_d_video.c (size formula) +
|
||||
* ffmpeg/Kynesim libavutil/rpi_sand_fn_pw.h (per-pixel offset). Each
|
||||
* output row is copied with one memcpy per 128-byte tile column it
|
||||
* crosses; every row starts at x = 0, so chunks are column-aligned and
|
||||
* only the last chunk of a row can be shorter than 128 bytes.
|
||||
*
|
||||
* No NEON / SIMD here — correctness first. Each output row generates
|
||||
* (width / 128) + ~1 memcpys of up to 128 bytes; for 1920x1080 that's
|
||||
* ~17000 small memcpys per frame, fine for Phase 1 PoC.
|
||||
*/
|
||||
|
||||
#include "nv12_col128.h"
|
||||
|
||||
#include <string.h>
|
||||
|
||||
/*
 * Width of one SAND tile column in bytes. The "COL128" suffix of
 * V4L2_PIX_FMT_NV12_COL128 names this value.
 */
#define NC12_TILE_W 128

/*
 * Detile one plane (Y, or interleaved CbCr) of a COL128 buffer into a
 * linear plane. Both planes use the same one-byte-per-sample, column-major
 * layout, so the only difference is which base pointer and height the
 * caller passes.
 *
 *   dst            linear destination, at least dst_stride * height bytes
 *   dst_stride     bytes between consecutive destination rows
 *   src            first byte of this plane inside tile column 0
 *   src_col_stride rows per tile column; one column therefore occupies
 *                  NC12_TILE_W * src_col_stride bytes
 *   width          plane width in bytes
 *   height         plane height in rows
 *
 * Source byte for (x, y):
 *   src + (x / 128) * 128 * src_col_stride + y * 128 + (x % 128)
 *
 * Every row starts at x = 0, so each copy begins on a column boundary:
 * width / 128 full 128-byte chunks followed by at most one shorter tail.
 */
static void nv12_col128_detile_plane(uint8_t *dst, unsigned int dst_stride,
				     const uint8_t *src,
				     unsigned int src_col_stride,
				     unsigned int width, unsigned int height)
{
	/* Loop invariants: bytes per tile column and the row's chunk split. */
	const size_t col_bytes = (size_t)NC12_TILE_W * src_col_stride;
	const unsigned int full_cols = width / NC12_TILE_W;
	const unsigned int tail = width % NC12_TILE_W;
	unsigned int y, col;

	for (y = 0; y < height; y++) {
		/*
		 * Widen before multiplying so the destination offset cannot
		 * wrap in 32-bit arithmetic (the source side already did).
		 */
		uint8_t *drow = dst + (size_t)y * dst_stride;
		const uint8_t *srow = src + (size_t)y * NC12_TILE_W;

		for (col = 0; col < full_cols; col++)
			memcpy(drow + (size_t)col * NC12_TILE_W,
			       srow + (size_t)col * col_bytes, NC12_TILE_W);

		/* Partial last column when width is not a multiple of 128. */
		if (tail)
			memcpy(drow + (size_t)full_cols * NC12_TILE_W,
			       srow + (size_t)full_cols * col_bytes, tail);
	}
}
|
||||
|
||||
/*
 * Detile the luma plane of an NC12 buffer into a linear NV12 Y plane.
 * Luma is one byte per pixel, so the plane is `width` bytes wide and
 * `height` rows tall; all the work is done by the shared plane helper.
 */
void nv12_col128_detile_y(uint8_t *dst, unsigned int dst_stride,
			  const uint8_t *src_y, unsigned int src_col_stride,
			  unsigned int width, unsigned int height)
{
	nv12_col128_detile_plane(dst, dst_stride,
				 src_y, src_col_stride,
				 width, height);
}
|
||||
|
||||
/*
 * Detile the chroma plane of an NC12 buffer into a linear NV12 UV plane.
 *
 * Chroma is stored as interleaved Cb/Cr pairs, one pair per two luma
 * pixels horizontally, so a chroma row is as many bytes wide as a luma
 * row. The caller passes the chroma row count in uv_height.
 */
void nv12_col128_detile_uv(uint8_t *dst, unsigned int dst_stride,
			   const uint8_t *src_uv, unsigned int src_col_stride,
			   unsigned int width, unsigned int uv_height)
{
	nv12_col128_detile_plane(dst, dst_stride,
				 src_uv, src_col_stride,
				 width, uv_height);
}
|
||||
|
||||
unsigned int nv12_col128_uv_plane_offset(unsigned int image_width,
|
||||
unsigned int image_height)
|
||||
{
|
||||
unsigned int aligned_h = (image_height + 7) & ~7u;
|
||||
|
||||
/*
|
||||
* In the COL128 SAND layout, Y and UV are NOT separate planes
|
||||
* concatenated end-to-end. Within EACH 128-pixel-wide column:
|
||||
* first 128 * height bytes = Y data for this column strip
|
||||
* next 128 * height / 2 bytes = UV data for this column strip
|
||||
* total 128 * bytesperline (= 128 * height * 3/2) bytes per column
|
||||
*
|
||||
* The "UV plane base" pointer (data[1] in AVFrame convention) is
|
||||
* just data[0] + (128 * height) — the offset of the UV bytes
|
||||
* WITHIN the first column. All subsequent UV bytes are reached by
|
||||
* the same column-stride arithmetic the Y plane uses (col *
|
||||
* 128 * bytesperline + y * 128 + in_col), so passing this offset
|
||||
* pointer + iterating y over [0, height/2) traverses all UV rows
|
||||
* across all columns correctly.
|
||||
*
|
||||
* Earlier wrong formula was num_columns * 128 * aligned_h (i.e.
|
||||
* sizeof(linear Y plane)) — that pushed past the end of the SAND
|
||||
* buffer because the layout isn't planes-end-to-end.
|
||||
*
|
||||
* Cross-check: kernel sizeimage = bytesperline * width =
|
||||
* (aligned_h * 3/2) * num_columns * 128 = num_columns * 128 *
|
||||
* aligned_h * 3/2. Per column: 128 * aligned_h * 3/2. Y portion
|
||||
* per column: 128 * aligned_h. UV portion per column: half of Y.
|
||||
* Sum across columns: matches sizeimage.
|
||||
*/
|
||||
return NC12_TILE_W * aligned_h;
|
||||
}
|
||||
@@ -0,0 +1,88 @@
|
||||
/*
|
||||
* V4L2_PIX_FMT_NV12_COL128 (NC12) SAND-tiled → linear NV12 detile.
|
||||
*
|
||||
* Pi 5 / CM5 (BCM2712) rpi-hevc-dec CAPTURE format. iter40 (2026-05-17).
|
||||
*
|
||||
* Layout (kernel drivers/media/platform/raspberrypi/hevc_dec/hevc_d_video.c
|
||||
* size-formula + ffmpeg/Kynesim libavutil/rpi_sand_fn_pw.h per-pixel
|
||||
* offset math):
|
||||
*
|
||||
* width ALIGN(image_width, 128) -- columns are 128 px wide
|
||||
* height ALIGN(image_height, 8)
|
||||
* col_stride (= bytesperline) = height * 3 / 2
|
||||
* (bytes per [128-wide column] vertical unit incl. Y + UV)
|
||||
* sizeimage = col_stride * width = total bytes
|
||||
*
|
||||
* For pixel (x, y) in the Y plane:
|
||||
* col = x / 128
|
||||
* in_col_x = x % 128
|
||||
* offset = col * col_stride * 128 + y * 128 + in_col_x
|
||||
*
|
||||
* UV data starts at offset (128 * height) inside each column — the same
|
||||
* per-column layout, h/2 rows tall (CbCr interleaved).
|
||||
*
|
||||
* The primitive copies the entire image extent at once. width/height are
|
||||
* the cropped consumer-visible dimensions; src_col_stride is the kernel-
|
||||
* reported bytesperline (i.e. ALIGN(height,8) * 3/2).
|
||||
*/
|
||||
|
||||
#ifndef _NV12_COL128_H_
|
||||
#define _NV12_COL128_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include <linux/videodev2.h>
|
||||
|
||||
/*
|
||||
* Pre-Pi-kernel headers (Arch ALARM linux-api-headers, older mainline
|
||||
* kernel-headers packages) may not define V4L2_PIX_FMT_NV12_COL128. The
|
||||
* fourcc is Pi-specific. Provide a private fallback so the backend
|
||||
* builds on hosts that target NON-Pi codecs too.
|
||||
*/
|
||||
#ifndef V4L2_PIX_FMT_NV12_COL128
|
||||
#define V4L2_PIX_FMT_NV12_COL128 \
|
||||
((unsigned int)('N') | ((unsigned int)('C') << 8) | \
|
||||
((unsigned int)('1') << 16) | ((unsigned int)('2') << 24))
|
||||
#endif
|
||||
|
||||
#ifndef V4L2_PIX_FMT_NV12_10_COL128
|
||||
/* 10-bit SAND variant: 3 pixels packed into 4 bytes in 128-byte / 96-pixel
|
||||
* wide columns. iter40 references the fourcc for completeness; the 10-bit
|
||||
* Pi 5 HEVC chapter (Main10) is post-iter40. */
|
||||
#define V4L2_PIX_FMT_NV12_10_COL128 \
|
||||
((unsigned int)('N') | ((unsigned int)('C') << 8) | \
|
||||
((unsigned int)('3') << 16) | ((unsigned int)('0') << 24))
|
||||
#endif
|
||||
|
||||
/* Detile the Y plane of an NC12 source to a linear NV12 Y plane.
|
||||
* dst : pointer to linear NV12 Y plane (caller-owned, dst_stride * height bytes)
|
||||
* dst_stride : linear Y plane stride in bytes (= width for plain NV12)
|
||||
* src_y : pointer to start of NC12 Y plane (= NC12 buffer base)
|
||||
* src_col_stride: kernel-reported bytesperline (= ALIGN(height,8) * 3/2)
|
||||
* width, height: cropped image dimensions in pixels
|
||||
*/
|
||||
void nv12_col128_detile_y(uint8_t *dst, unsigned int dst_stride,
|
||||
const uint8_t *src_y, unsigned int src_col_stride,
|
||||
unsigned int width, unsigned int height);
|
||||
|
||||
/* Detile the UV plane (CbCr interleaved, half-height) of an NC12 source.
|
||||
* dst : pointer to linear NV12 UV plane
|
||||
* dst_stride : linear UV plane stride in bytes (= width for NV12)
|
||||
* src_uv : pointer to the UV rows of column 0 (= src_y + nv12_col128_uv_plane_offset())
|
||||
* src_col_stride: same as Y plane (same column geometry)
|
||||
* width : Y-plane width in pixels (UV plane has same byte width)
|
||||
* uv_height : UV plane height = height / 2
|
||||
*/
|
||||
void nv12_col128_detile_uv(uint8_t *dst, unsigned int dst_stride,
|
||||
const uint8_t *src_uv, unsigned int src_col_stride,
|
||||
unsigned int width, unsigned int uv_height);
|
||||
|
||||
/* Compute the offset of the UV plane within an NC12 buffer.
|
||||
* image_width, image_height: cropped image dimensions in pixels
|
||||
* Returns: byte offset from buffer start to UV plane start
|
||||
* (= 128 * ALIGN(image_height, 8): UV rows follow the Y rows inside column 0)
|
||||
*/
|
||||
unsigned int nv12_col128_uv_plane_offset(unsigned int image_width,
|
||||
unsigned int image_height);
|
||||
|
||||
#endif /* _NV12_COL128_H_ */
|
||||
+15
@@ -27,6 +27,21 @@
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include <linux/videodev2.h>
|
||||
|
||||
/*
|
||||
* Older or downstream linux-api-headers / kernel-headers packages may
|
||||
* not define V4L2_PIX_FMT_NV15. Provide a fallback so the backend
|
||||
* builds on hosts whose headers are pre-NV15-merge or omit it (e.g.
|
||||
* Pi 5 Debian trixie 6.12.62 headers include NC12 but not NV15).
|
||||
* Same numeric value as mainline.
|
||||
*/
|
||||
#ifndef V4L2_PIX_FMT_NV15
|
||||
#define V4L2_PIX_FMT_NV15 \
|
||||
((unsigned int)('N') | ((unsigned int)('V') << 8) | \
|
||||
((unsigned int)('1') << 16) | ((unsigned int)('5') << 24))
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Unpack one plane of V4L2_PIX_FMT_NV15 (4 × 10-bit values packed into
|
||||
* 5 consecutive bytes, LSB-first) into VA_FOURCC_P010 (16-bit per pixel,
|
||||
|
||||
@@ -93,6 +93,7 @@
|
||||
static const char * const known_decoder_drivers[] = {
|
||||
"rkvdec",
|
||||
"hantro-vpu",
|
||||
"rpi-hevc-dec", /* iter40: Pi 5 / CM5 stateless HEVC */
|
||||
"cedrus",
|
||||
"sun4i_csi",
|
||||
NULL
|
||||
@@ -431,12 +432,31 @@ int request_switch_device_for_profile(struct request_data *driver_data,
|
||||
char kind = request_device_kind_for_profile(profile);
|
||||
int target_video, target_media;
|
||||
|
||||
/*
|
||||
* iter40: HEVC override when rpi-hevc-dec is probed. The static
|
||||
* table (request_device_kind_for_profile) maps HEVC → 'r' (rkvdec)
|
||||
* because that's the canonical RK path. On Pi 5 there's no rkvdec
|
||||
* — rpi-hevc-dec is the only decoder. When BOTH would be present
|
||||
* (hypothetical mixed board), prefer rpi-hevc-dec for HEVC.
|
||||
*
|
||||
* Other rkvdec-routed profiles (VP9, H.264) stay on 'r' because
|
||||
* rpi-hevc-dec is HEVC-only.
|
||||
*/
|
||||
if ((profile == VAProfileHEVCMain || profile == VAProfileHEVCMain10) &&
|
||||
driver_data->video_fd_rpi_hevc_dec >= 0 &&
|
||||
driver_data->media_fd_rpi_hevc_dec >= 0) {
|
||||
kind = 'p';
|
||||
}
|
||||
|
||||
if (kind == 'r') {
|
||||
target_video = driver_data->video_fd_rkvdec;
|
||||
target_media = driver_data->media_fd_rkvdec;
|
||||
} else if (kind == 'h') {
|
||||
target_video = driver_data->video_fd_hantro;
|
||||
target_media = driver_data->media_fd_hantro;
|
||||
} else if (kind == 'p') {
|
||||
target_video = driver_data->video_fd_rpi_hevc_dec;
|
||||
target_media = driver_data->media_fd_rpi_hevc_dec;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
@@ -624,6 +644,8 @@ VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context)
|
||||
driver_data->media_fd_rkvdec = -1;
|
||||
driver_data->video_fd_hantro = -1;
|
||||
driver_data->media_fd_hantro = -1;
|
||||
driver_data->video_fd_rpi_hevc_dec = -1;
|
||||
driver_data->media_fd_rpi_hevc_dec = -1;
|
||||
|
||||
/*
|
||||
* iter38: probe BOTH rkvdec and hantro-vpu so a single libva session
|
||||
@@ -654,6 +676,15 @@ VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context)
|
||||
alt_driver = "rkvdec";
|
||||
driver_data->video_fd_hantro = video_fd;
|
||||
driver_data->media_fd_hantro = media_fd;
|
||||
} else if (strcmp(info.driver, "rpi-hevc-dec") == 0) {
|
||||
/* iter40: Pi 5 / CM5 — sole decoder is rpi-hevc-dec.
|
||||
* No alt driver to probe; the rkvdec / hantro slots
|
||||
* stay -1 and HEVC routes to 'p' via
|
||||
* the override in request_switch_device_for_profile. */
|
||||
primary_driver = "rpi-hevc-dec";
|
||||
alt_driver = NULL;
|
||||
driver_data->video_fd_rpi_hevc_dec = video_fd;
|
||||
driver_data->media_fd_rpi_hevc_dec = media_fd;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -693,11 +724,19 @@ VAStatus VA_DRIVER_INIT_FUNC(VADriverContextP context)
|
||||
probe_hevc_ext_sps_rps_controls(driver_data->video_fd_rkvdec);
|
||||
driver_data->has_hevc_ext_sps_rps_hantro =
|
||||
probe_hevc_ext_sps_rps_controls(driver_data->video_fd_hantro);
|
||||
driver_data->has_hevc_ext_sps_rps_rpi_hevc_dec =
|
||||
probe_hevc_ext_sps_rps_controls(driver_data->video_fd_rpi_hevc_dec);
|
||||
if (driver_data->has_hevc_ext_sps_rps_rkvdec) {
|
||||
request_log("iter2: kernel registers HEVC EXT_SPS_{ST,LT}_RPS "
|
||||
"controls on rkvdec fd (will route through "
|
||||
"vendored GStreamer parser)\n");
|
||||
}
|
||||
if (driver_data->video_fd_rpi_hevc_dec >= 0) {
|
||||
request_log("iter40: also opened rpi-hevc-dec at video_fd=%d "
|
||||
"media_fd=%d (Pi 5 HEVC stateless)\n",
|
||||
driver_data->video_fd_rpi_hevc_dec,
|
||||
driver_data->media_fd_rpi_hevc_dec);
|
||||
}
|
||||
|
||||
status = VA_STATUS_SUCCESS;
|
||||
goto complete;
|
||||
|
||||
@@ -78,6 +78,15 @@ struct request_data {
|
||||
int media_fd_rkvdec;
|
||||
int video_fd_hantro;
|
||||
int media_fd_hantro;
|
||||
/*
|
||||
* iter40: third multi-device-probe slot for rpi-hevc-dec (Pi 5 /
|
||||
* CM5 / BCM2712). V4L2 stateless HEVC; CAPTURE is NC12/NC30 SAND
|
||||
* 128-pixel-wide column tiled (Pi-specific). On Pi 5 this is the
|
||||
* ONLY decoder slot; on RK hosts it stays -1 and HEVC routes to
|
||||
* rkvdec as before.
|
||||
*/
|
||||
int video_fd_rpi_hevc_dec;
|
||||
int media_fd_rpi_hevc_dec;
|
||||
|
||||
/*
|
||||
* iter2 (ampere-kernel-decoders campaign) — per-fd probe result
|
||||
@@ -98,6 +107,12 @@ struct request_data {
|
||||
*/
|
||||
bool has_hevc_ext_sps_rps_rkvdec;
|
||||
bool has_hevc_ext_sps_rps_hantro;
|
||||
/* iter40: rpi-hevc-dec doesn't expose EXT_SPS_*_RPS controls
|
||||
* (verified Phase 0 higgs probe: QUERY_EXT_CTRL on 0xa97 → EINVAL).
|
||||
* Probed for consistency with the iter2 pair-of-flags pattern;
|
||||
* stays false on Pi 5 and the iter2 vendored-parser path naturally
|
||||
* doesn't engage. */
|
||||
bool has_hevc_ext_sps_rps_rpi_hevc_dec;
|
||||
|
||||
/*
|
||||
* iter2 — cached SPS-derived RPS arrays. SPS NALs only appear in
|
||||
|
||||
+24
@@ -31,6 +31,8 @@
|
||||
#include <drm_fourcc.h>
|
||||
#include <linux/videodev2.h>
|
||||
|
||||
#include "nv12_col128.h" /* fallback V4L2_PIX_FMT_NV12_COL128 define */
|
||||
#include "nv15.h" /* fallback V4L2_PIX_FMT_NV15 define */
|
||||
#include "utils.h"
|
||||
#include "video.h"
|
||||
|
||||
@@ -55,6 +57,28 @@ static struct video_format formats[] = {
|
||||
.planes_count = 2,
|
||||
.bpp = 24,
|
||||
},
|
||||
{
|
||||
/*
|
||||
* iter40: Pi 5 / CM5 rpi-hevc-dec CAPTURE format. 8-bit NV12
|
||||
* stored as 128-pixel-wide column tiles (SAND128 layout).
|
||||
* Pi-specific; not in mainline drm_fourcc.h (uses NV12 + a
|
||||
* BROADCOM_SAND128 modifier for DRM_PRIME). Our consumer path
|
||||
* always detiles to linear NV12 in copy_surface_to_image, so
|
||||
* we don't expose the SAND modifier downstream — drm_format is
|
||||
* still DRM_FORMAT_NV12 and drm_modifier MOD_NONE so the
|
||||
* format-is-linear gate doesn't pull us into tiled_to_planar
|
||||
* (Sunxi-specific). image.c branches on v4l2_format ==
|
||||
* V4L2_PIX_FMT_NV12_COL128 to invoke the dedicated detile.
|
||||
*/
|
||||
.description = "NV12 SAND128 (8-bit, rpi-hevc-dec)",
|
||||
.v4l2_format = V4L2_PIX_FMT_NV12_COL128,
|
||||
.v4l2_buffers_count = 1,
|
||||
.v4l2_mplane = true,
|
||||
.drm_format = DRM_FORMAT_NV12,
|
||||
.drm_modifier = DRM_FORMAT_MOD_NONE,
|
||||
.planes_count = 2,
|
||||
.bpp = 16,
|
||||
},
|
||||
// Code to handle this DRM_FORMAT is __arm__ only
|
||||
#ifdef __arm__
|
||||
{
|
||||
|
||||
@@ -0,0 +1,196 @@
|
||||
/*
|
||||
* Copyright (C) 2026 claude-noether <claude-noether@reauktion.de>
|
||||
*
|
||||
* MIT-licensed per project. iter40 self-test for nv12_col128 detile.
|
||||
*
|
||||
* Build an NC12-tiled source buffer from a known linear NV12 image,
|
||||
* run the detile primitive, assert output matches the original. No
|
||||
* hardware needed — pure bit-layout verification of the kernel math
|
||||
* (drivers/media/platform/raspberrypi/hevc_dec/hevc_d_video.c
|
||||
* V4L2_PIX_FMT_NV12_COL128 case + ffmpeg/Kynesim per-pixel offset).
|
||||
*
|
||||
* Build:
|
||||
* cc -Wall -Werror -O2 -o test_nv12_col128_detile \
|
||||
* tests/test_nv12_col128_detile.c src/nv12_col128.c
|
||||
*
|
||||
* Exit 0 = all asserts pass.
|
||||
*/
|
||||
|
||||
#include "../src/nv12_col128.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#define TILE_W 128
|
||||
|
||||
/*
 * Round v up to the next multiple of a.
 *
 * Uses mask arithmetic, so the result is only a true multiple of a when a
 * is a power of two (the callers in this file pass 8 and TILE_W).
 */
static unsigned int align_up(unsigned int v, unsigned int a)
{
	const unsigned int low_bits = a - 1u;

	return (v + low_bits) & ~low_bits;
}
|
||||
|
||||
/* Pack a linear plane (width × height bytes, stride=width) into NC12
|
||||
* layout: each 128-wide column held contiguously, columns at offsets
|
||||
* col * col_stride * 128. col_stride is the kernel-reported bytesperline
|
||||
* = ALIGN(height, 8) * 3/2. Returns the buffer + sizes. */
|
||||
static uint8_t *pack_to_nc12(const uint8_t *linear,
|
||||
unsigned int width, unsigned int height,
|
||||
unsigned int *out_col_stride,
|
||||
size_t *out_size)
|
||||
{
|
||||
unsigned int aligned_w = align_up(width, TILE_W);
|
||||
unsigned int aligned_h = align_up(height, 8);
|
||||
unsigned int col_stride = aligned_h * 3 / 2;
|
||||
unsigned int num_cols = aligned_w / TILE_W;
|
||||
size_t total = (size_t)col_stride * aligned_w;
|
||||
uint8_t *buf;
|
||||
unsigned int col, y, in_col;
|
||||
|
||||
buf = calloc(1, total);
|
||||
assert(buf != NULL);
|
||||
|
||||
for (col = 0; col < num_cols; col++) {
|
||||
uint8_t *col_base = buf + (size_t)col * TILE_W * col_stride;
|
||||
for (y = 0; y < height; y++) {
|
||||
for (in_col = 0; in_col < TILE_W; in_col++) {
|
||||
unsigned int x = col * TILE_W + in_col;
|
||||
if (x >= width)
|
||||
break;
|
||||
col_base[(size_t)y * TILE_W + in_col] =
|
||||
linear[(size_t)y * width + x];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*out_col_stride = col_stride;
|
||||
*out_size = total;
|
||||
return buf;
|
||||
}
|
||||
|
||||
static void test_detile_y(unsigned int width, unsigned int height)
|
||||
{
|
||||
uint8_t *linear, *tiled, *recovered;
|
||||
unsigned int col_stride;
|
||||
size_t tile_size, i;
|
||||
|
||||
linear = malloc((size_t)width * height);
|
||||
assert(linear != NULL);
|
||||
/* Distinctive content per pixel: y * 17 + x * 13 — avoids byte-
|
||||
* aliasing patterns that could mask off-by-one bugs. */
|
||||
for (unsigned int y = 0; y < height; y++)
|
||||
for (unsigned int x = 0; x < width; x++)
|
||||
linear[(size_t)y * width + x] = (uint8_t)(y * 17 + x * 13);
|
||||
|
||||
tiled = pack_to_nc12(linear, width, height, &col_stride, &tile_size);
|
||||
|
||||
recovered = calloc(1, (size_t)width * height);
|
||||
assert(recovered != NULL);
|
||||
|
||||
nv12_col128_detile_y(recovered, width, tiled, col_stride, width, height);
|
||||
|
||||
for (i = 0; i < (size_t)width * height; i++) {
|
||||
if (recovered[i] != linear[i]) {
|
||||
fprintf(stderr,
|
||||
"FAIL %ux%u Y: pixel %zu (x=%zu y=%zu) "
|
||||
"linear=0x%02x recovered=0x%02x\n",
|
||||
width, height, i,
|
||||
i % width, i / width,
|
||||
linear[i], recovered[i]);
|
||||
free(linear); free(tiled); free(recovered);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
printf("PASS %ux%u Y plane (%u columns, col_stride=%u, tile_size=%zu)\n",
|
||||
width, height, align_up(width, TILE_W) / TILE_W,
|
||||
col_stride, tile_size);
|
||||
|
||||
free(linear);
|
||||
free(tiled);
|
||||
free(recovered);
|
||||
}
|
||||
|
||||
/*
 * Round-trip check for the UV-plane detile at one frame size.
 *
 * width/height are the frame dimensions; the plane under test has
 * height / 2 rows of `width` bytes. The plane is tiled with
 * pack_to_nc12(), detiled with nv12_col128_detile_uv() and compared byte
 * for byte. The first mismatch is reported to stderr and the process
 * exits with status 1; otherwise a PASS line is printed.
 */
static void test_detile_uv(unsigned int width, unsigned int height)
{
	const unsigned int rows = height / 2;
	const size_t n_bytes = (size_t)width * rows;
	unsigned int col_stride;
	size_t tile_size;
	size_t bad;
	uint8_t *reference;
	uint8_t *packed;
	uint8_t *output;

	reference = malloc(n_bytes);
	assert(reference != NULL);

	/* Byte value row * 23 + column * 7, truncated to a byte. */
	for (size_t idx = 0; idx < n_bytes; idx++) {
		const size_t row = idx / width;
		const size_t col = idx % width;

		reference[idx] = (uint8_t)(row * 23 + col * 7);
	}

	/* The packer is handed the half-height plane, so col_stride here is
	 * derived from `rows`, not from the full frame height. */
	packed = pack_to_nc12(reference, width, rows, &col_stride, &tile_size);

	output = calloc(1, n_bytes);
	assert(output != NULL);

	nv12_col128_detile_uv(output, width, packed, col_stride, width, rows);

	/* Locate the first differing byte, if any. */
	bad = 0;
	while (bad < n_bytes && output[bad] == reference[bad])
		bad++;

	if (bad < n_bytes) {
		fprintf(stderr,
			"FAIL %ux%u UV: pixel %zu linear=0x%02x recovered=0x%02x\n",
			width, height, bad,
			reference[bad], output[bad]);
		free(reference);
		free(packed);
		free(output);
		exit(1);
	}

	printf("PASS %ux%u UV plane\n", width, height);

	free(output);
	free(packed);
	free(reference);
}
|
||||
|
||||
/*
 * Check nv12_col128_uv_plane_offset() against hand-computed values.
 *
 * Expected value is 128 * height for both fixtures: in the SAND COL128
 * layout Y and UV are interleaved within each column rather than stored
 * as two concatenated planes, so the UV base sits right after the Y part
 * of column 0 (128 * ALIGN(height, 8)). It is NOT the total size of Y
 * across all columns; that earlier formula was wrong and was caught by a
 * SEGV during Phase 7 on higgs.
 *
 * Both fixture heights are already multiples of 8. Exits with status 1
 * on the first mismatch.
 */
static void test_uv_offset(void)
{
	const unsigned int want_720 = 128u * 720;
	const unsigned int want_768 = 128u * 768;
	unsigned int got;

	got = nv12_col128_uv_plane_offset(1280, 720);
	if (got == want_720) {
		printf("PASS UV offset 1280×720 = %u\n", got);
	} else {
		fprintf(stderr, "FAIL UV offset 1280×720: got %u expected %u\n",
			got, want_720);
		exit(1);
	}

	/* Width that is not a multiple of the 128-byte column width. */
	got = nv12_col128_uv_plane_offset(1366, 768);
	if (got == want_768) {
		printf("PASS UV offset 1366×768 (column-misaligned width)\n");
	} else {
		fprintf(stderr, "FAIL UV offset 1366×768: got %u expected %u\n",
			got, want_768);
		exit(1);
	}
}
|
||||
|
||||
/*
 * Test driver: runs the Y-plane round trip at every fixture size, then the
 * UV-plane round trip at the same sizes, then the UV-offset check. Each
 * test exits the process with status 1 on failure, so reaching the final
 * printf means everything passed.
 */
int main(void)
{
	static const struct {
		unsigned int width;
		unsigned int height;
	} sizes[] = {
		/* Phase 3 fixture sizes — all 128-aligned, 8-line-aligned. */
		{ 640, 360 },
		{ 1280, 720 },
		{ 1920, 1080 },
		/* Phase 5 review F4: column-misaligned width (1366 → 1408 padding). */
		{ 1366, 768 },
	};
	const size_t n_sizes = sizeof(sizes) / sizeof(sizes[0]);
	size_t k;

	for (k = 0; k < n_sizes; k++)
		test_detile_y(sizes[k].width, sizes[k].height);

	/* UV plane (half-height) at each width. */
	for (k = 0; k < n_sizes; k++)
		test_detile_uv(sizes[k].width, sizes[k].height);

	test_uv_offset();

	printf("All NC12 detile asserts pass.\n");
	return 0;
}
|
||||
Reference in New Issue
Block a user