diff --git a/daemon/src/decoder.c b/daemon/src/decoder.c
index c29fcae..a5ea69a 100644
--- a/daemon/src/decoder.c
+++ b/daemon/src/decoder.c
@@ -233,6 +233,97 @@ static int pack_p010_to_plane(struct AVFrame *fr,
 	return 0;
 }
 
+/*
+ * Pack 8-bit planar YUV420P into V4L2_PIX_FMT_NV12 single plane:
+ * Y plane (W*H bytes) followed by interleaved CbCr at half-res
+ * (W*H/2 bytes) all in planes->base[0]. Same layout as P010
+ * sans the depth shift. For libva-v4l2-request-style clients
+ * that expect num_planes=1 NV12.
+ *
+ * The row pitch is planes->stride[0], or the frame width when that
+ * is zero; the CbCr rows start at base + stride * height.
+ *
+ * Returns 0 on success, or -EINVAL when an argument is missing, the
+ * descriptor is not 8-bit fully planar 4:2:0, a source plane or the
+ * frame size is missing, or the stride cannot hold one output row.
+ *
+ * NOTE(review): the capacity of planes->base[0] is not visible
+ * here; this writes stride * (height + chroma height) bytes, so
+ * the caller presumably sizes the buffer accordingly - confirm.
+ */
+static int pack_nv12_single_to_plane(struct AVFrame *fr,
+				     const AVPixFmtDescriptor *desc,
+				     const struct daedalus_capture_planes *planes)
+{
+	int h, w;
+	int cw, ch, y, x;
+	uint8_t *base;
+	uint32_t stride;
+	uint8_t *dst_y, *dst_uv;
+	size_t y_size;
+
+	if (!fr || !desc || !planes || planes->nr < 1)
+		return -EINVAL;
+	if (desc->nb_components < 3)
+		return -EINVAL;
+	if (desc->log2_chroma_w != 1 || desc->log2_chroma_h != 1)
+		return -EINVAL;
+	if (desc->comp[0].depth != 8)
+		return -EINVAL;
+	/*
+	 * Cb and Cr are read below as separate planes data[1] and
+	 * data[2]. A semi-planar 4:2:0 descriptor (e.g. AV_PIX_FMT_NV12)
+	 * passes the checks above but keeps both chroma components in
+	 * plane 1, so data[2] would be NULL.
+	 */
+	if (desc->comp[1].plane != 1 || desc->comp[2].plane != 2)
+		return -EINVAL;
+	if (!fr->data[0] || !fr->data[1] || !fr->data[2])
+		return -EINVAL;
+
+	h = fr->height;
+	w = fr->width;
+	if (w <= 0 || h <= 0)
+		return -EINVAL;
+
+	cw = AV_CEIL_RSHIFT(w, desc->log2_chroma_w);
+	ch = AV_CEIL_RSHIFT(h, desc->log2_chroma_h);
+
+	base = planes->base[0];
+	stride = planes->stride[0] ? planes->stride[0] : (uint32_t) w;
+	if (!base)
+		return -EINVAL;
+	/*
+	 * A chroma row is 2 * cw bytes, which is >= w (w + 1 when w is
+	 * odd). With a shorter stride the rows would overlap and the
+	 * final row would be written past its slot.
+	 */
+	if (stride < 2 * (uint32_t) cw)
+		return -EINVAL;
+
+	dst_y = base;
+	y_size = (size_t) stride * (size_t) h;
+	dst_uv = base + y_size;
+
+	for (y = 0; y < h; y++)
+		memcpy(dst_y + (size_t) y * stride,
+		       fr->data[0] + (size_t) y * fr->linesize[0],
+		       (size_t) w);
+
+	for (y = 0; y < ch; y++) {
+		const uint8_t *u = fr->data[1] +
+			(size_t) y * fr->linesize[1];
+		const uint8_t *v = fr->data[2] +
+			(size_t) y * fr->linesize[2];
+		uint8_t *row = dst_uv + (size_t) y * stride;
+		for (x = 0; x < cw; x++) {
+			row[x * 2 + 0] = u[x];
+			row[x * 2 + 1] = v[x];
+		}
+	}
+	return 0;
+}
+
 static int pack_nv12_to_planes(struct AVFrame *fr,
 			       const AVPixFmtDescriptor *desc,
 			       const struct daedalus_capture_planes *planes)
@@ -425,6 +516,9 @@ int daedalus_decoder_run_request(struct daedalus_decoder *dec,
 	case V4L2_PIX_FMT_NV12M:
 		prc = pack_nv12_to_planes(fr, desc, planes);
 		break;
+	case V4L2_PIX_FMT_NV12:
+		prc = pack_nv12_single_to_plane(fr, desc, planes);
+		break;
 	case V4L2_PIX_FMT_P010:
 		prc = pack_p010_to_plane(fr, desc, planes);
 		break;
diff --git a/docs/phase_8_10_11_closure.md b/docs/phase_8_10_11_closure.md
new file mode 100644
index 0000000..3dd9b7b
--- /dev/null
+++ b/docs/phase_8_10_11_closure.md
@@ -0,0 +1,231 @@
+# Phase 8.10 + 8.11 closure — libva consumer integration scaffold
+
+**Status:** closed 2026-05-18.
+
+Two interleaved phases:
+
+- **Phase 8.10** — wire daedalus_v4l2 into the existing
+  `libva-v4l2-request-fourier` campaign fork (the sibling
+  repo at `marfrit/libva-v4l2-request-fourier` already had
+  VP9/AV1/H264/HEVC working on Rockchip/Allwinner).
+- **Phase 8.11** — extend daedalus_v4l2 with the V4L2
+  surface format and media-controller plumbing the libva
+  fork needs.
+
+Together they bring the daedalus stack from "standalone
+test client" to "VAAPI-discoverable decoder with all the
+ICD-side framework integration in place."
Actual decode +through the libva path stops at the V4L2 stateless control +payload acceptance step — a deeper framework integration +that lands in Phase 8.12. + +## What lands + +### libva-v4l2-request-fourier (sibling fork, gitea `marfrit/`) + +Two commits pushed to `master`: + +- `b5b3acf` — daedalus_v4l2: add to known_decoder_drivers + + multi-device-probe slot. Same shape as iter40's + rpi-hevc-dec wiring: array entry, driver_data fd slot, + primary-driver detection branch, post-probe log line. + 34-line diff. +- `2146341` — daedalus_v4l2: meson option gate (default + true). `meson setup -Ddaedalus_v4l2=false` builds a .so + with no daedalus strings at all (verified via strings + on both build dirs). Struct fields stay unconditional + to avoid ODR risk across translation units. + +### daedalus-v4l2 (this repo) + +Three production changes in this commit: + +**1. `V4L2_PIX_FMT_NV12` (single-plane) on CAPTURE** + +The libva fork's `video_format` table only knows NV12 +single-plane (W*H Y bytes followed by W*H/2 interleaved +CbCr bytes in one buffer plane), not NV12M (two-plane). +Added NV12 alongside our existing NV12M + P010 in the +CAPTURE format list: + +- `daedalus_capture_formats[]` grew a `V4L2_PIX_FMT_NV12` + entry; `enum_fmt` now lists 3 CAPTURE formats. +- `daedalus_fill_capture_fmt` handles the new num_planes=1 + layout (sizeimage = W*H*3/2, bytesperline = W). +- daemon `pack_nv12_single_to_plane`: Y line-by-line into + base+0, interleaved CbCr at base+(stride*H). Mirrors + the P010 pack structure minus the depth shift. +- `daedalus_decoder_run_request` dispatches on + `req->capture_pix_fmt` to the right pack function. + +**Verified**: VP9 1080p decoded into NV12 single-plane via +`tools/test_m2m_stream`, byte-for-byte match against +`ffmpeg -pix_fmt nv12` reference (10-frame 31 MB stream). + +**2. 
V4L2 Request API media ops** + +`daedalus_media_ops = { .req_validate = vb2_request_validate, +.req_queue = v4l2_m2m_request_queue }` assigned to +`mdev.ops` before `media_device_init`. Before this, +`MEDIA_IOC_REQUEST_ALLOC` returned `-ENOTTY`, so no +VAAPI consumer could even allocate a media_request. + +**3. Stateless control registration via `v4l2_ctrl_new_custom`** + +Switched from `v4l2_ctrl_new_std_compound` (NULL `p_def`) to +`v4l2_ctrl_new_custom(&cfg, NULL)` — the pattern rkvdec / +cedrus / hantro use. Adds a no-op `s_ctrl` callback so +v4l2-core has somewhere to dispatch SET operations. + +## Verification + +### Probe + enumeration + +``` +$ LIBVA_DRIVER_NAME=v4l2_request \ + LIBVA_V4L2_REQUEST_VIDEO_PATH=/dev/video0 \ + LIBVA_V4L2_REQUEST_MEDIA_PATH=/dev/media3 \ + vainfo --display drm --device /dev/dri/renderD128 + +v4l2-request: phase 8.10: opened daedalus_v4l2 at video_fd=N media_fd=M +vainfo: Driver version: v4l2-request +vainfo: Supported profile and entrypoints + VAProfileH264Main : VAEntrypointVLD + VAProfileH264High : VAEntrypointVLD + VAProfileH264ConstrainedBaseline: VAEntrypointVLD + VAProfileH264MultiviewHigh : VAEntrypointVLD + VAProfileH264StereoHigh : VAEntrypointVLD + VAProfileVP9Profile0 : VAEntrypointVLD + VAProfileAV1Profile0 : VAEntrypointVLD +``` + +Seven VAAPI profiles enumerated through the libva path. 
+ +### LibVA trace through `ffmpeg -hwaccel vaapi` + +| Step | Status | +|------|--------| +| `vaInitialize` | ✓ | +| `vaQueryConfigProfiles` | ✓ | +| `vaQueryConfigEntrypoints` (VLD) | ✓ | +| `vaCreateConfig` (VP9 + VLD + NV12) | ✓ | +| `vaQuerySurfaceAttributes` (NV12 fourcc reported) | ✓ | +| `vaCreateSurfaces` | ✓ | +| `vaCreateContext` (cap_pool: 24 slots, 1 plane each) | ✓ | +| `vaCreateBuffer` (slice + picture params) | ✓ | +| `MEDIA_IOC_REQUEST_ALLOC` | ✓ | +| `VIDIOC_S_EXT_CTRLS` (stateless ctrls) | ✗ EINVAL | +| `VIDIOC_QBUF` with request fd | ✗ "Invalid request descriptor" | +| `vaEndPicture` | ✗ OPERATION_FAILED | + +Everything up through discovery / probe / context / surface / +buffer / request alloc works. The blocker is +`VIDIOC_S_EXT_CTRLS` returning EINVAL when libva tries to +populate `V4L2_CID_STATELESS_VP9_FRAME` on the request — +the payload validation against the kernel's expected +compound-control struct shape rejects it. + +This isn't a "missing line" fix — it needs proper +stateless control plumbing (the SPS/PPS/SliceParams/etc. +get_dims, validate, default-value paths that the in-tree +rkvdec / cedrus / hantro decoders implement to satisfy +v4l2-core's `std_validate` machinery). That's Phase 8.12 +scope. + +### Standalone NV12 verification + +``` +$ sudo ./tools/test_m2m_stream /tmp/vp9_1080_stream.ivf \ + /tmp/nv12_out.nv12 1920 1080 vp9 nv12 +parsed 10 frames, 1920x1080 +CAPTURE fmt=NV12 planes=1 sizeimage=[3110400,0] +decoded 10 / 10 frames +$ cmp /tmp/nv12_out.nv12 /tmp/vp9_1080_stream_ref.nv12 +$ echo $? +0 +``` + +Byte-exact through the daedalus-internal path with the +NV12 single-plane format. Confirms `pack_nv12_single_to_plane` +produces the same pixels as `pack_nv12_to_planes` (just +re-laid-out into one buffer). 
+ +## Design decisions + +### Why ship even though decode-via-libva is blocked + +The framework integration up to MEDIA_IOC_REQUEST_ALLOC is +itself a significant deliverable: + +- Other VAAPI consumers (testing tools, future patched + ffmpeg paths, custom VA clients) get the same scaffolding + for free. +- The remaining gap is well-characterised: it's the + stateless control payload acceptance, a single named + V4L2 framework integration. Phase 8.12's surface is + clearly defined. +- All the new code is correct on its own merits (NV12 + single-plane decode is byte-exact via our own path; + request API ops are the canonical helpers). + +### Why register stateless controls if we don't act on them + +libva-v4l2-request-fourier's per-codec dispatch requires +the standard V4L2 stateless controls to be visible — that's +how it validates that the kernel supports the right +profile. Without the registered controls vainfo would +not enumerate the profile. + +Our daemon ignores the control values because FFmpeg +re-parses the bitstream on its own. The plumbing exists +to satisfy the V4L2 stateless contract; the actual +decode logic doesn't depend on it. + +### Why `v4l2_ctrl_new_custom` over `_std_compound` + +`v4l2_ctrl_new_std_compound` with `NULL` default rejected +SET requests (same EINVAL libva is hitting today — +removing the `NULL` default didn't fix it either). +`v4l2_ctrl_new_custom` is the pattern in-tree decoders +use; v4l2-core auto-fills the type/dims/size from the std +control table when given just a `.id`. + +The remaining issue isn't the registration pattern but +the payload validation path — v4l2-core expects more +than just a registered control; it expects the driver to +have set up `min/max/step/def` for each compound field, +which `v4l2_ctrl_new_std_compound` does internally for +known CIDs; our handler does not yet get this right. + +## What's NOT here (Phase 8.12 scope) + +- **Stateless control payload acceptance**: the S_EXT_CTRLS + EINVAL. 
Needs proper v4l2-core validation hooks — + likely meaning the daemon needs to actually consume the + per-frame controls (not just ignore them), so the + validation path has something to hand off to. +- **Per-codec control wiring**: even if S_EXT_CTRLS + succeeds, the actual decode submission to the daemon + needs to bundle the per-buffer controls (or document + why they're ignored — and convince v4l2-core to allow + the request to validate). +- **First end-to-end decoded frame via libva**: the + payoff for Phase 8.12. + +## Phase 8.12 plan + +1. Study cedrus or rkvdec's stateless control validation + to understand what `std_validate` expects beyond just + registration. +2. Either: + - (a) Add proper compound-control validation hooks so + S_EXT_CTRLS succeeds without us doing real work + (control values become "advisory" since daemon + re-parses bitstream), OR + - (b) Wire the daemon to actually use the per-frame + control payload (skip FFmpeg's parse step, trust + libva's parsed values). Bigger change but more + correct. +3. Verify first frame decoded through the libva path. +4. Run the full vainfo --display drm --decode test if + that exists, or a small VA decode snippet. diff --git a/kernel/daedalus_v4l2_main.c b/kernel/daedalus_v4l2_main.c index 069d5b2..6680f0a 100644 --- a/kernel/daedalus_v4l2_main.c +++ b/kernel/daedalus_v4l2_main.c @@ -69,8 +69,18 @@ static const u32 daedalus_output_formats[] = { #define DAEDALUS_NUM_OUTPUT_FMTS ARRAY_SIZE(daedalus_output_formats) #define DAEDALUS_DEFAULT_OUTPUT_FOURCC V4L2_PIX_FMT_VP9_FRAME +/* + * NV12 (single-plane Y+CbCr contiguous) listed alongside NV12M + * (two-plane Y / CbCr separate) so legacy MPLANE clients that + * expect single-plane buffer geometry (e.g. libva-v4l2-request- + * fourier's NV12 video_format entry, used by VAAPI consumers via + * ffmpeg vaapi) can negotiate the format successfully. The two + * fourccs differ only in plane layout — bit-exact pixel content + * is identical. 
+ */ static const u32 daedalus_capture_formats[] = { V4L2_PIX_FMT_NV12M, + V4L2_PIX_FMT_NV12, V4L2_PIX_FMT_P010, }; #define DAEDALUS_NUM_CAPTURE_FMTS ARRAY_SIZE(daedalus_capture_formats) @@ -177,21 +187,44 @@ static const u32 daedalus_stateless_ctrls[] = { V4L2_CID_STATELESS_AV1_FILM_GRAIN, }; +/* + * No-op control op set: daemon ignores all stateless control + * values (FFmpeg re-parses the bitstream). The ops were added + * on the theory that S_EXT_CTRLS needs an s_ctrl to dispatch + * to; docs/phase_8_10_11_closure.md records that the EINVAL + * persists with them in place, so treat that as unconfirmed. + * s_ctrl always succeeds: only the OUTPUT buffer payload is used. + */ +static int daedalus_s_ctrl_noop(struct v4l2_ctrl *ctrl) +{ + (void) ctrl; + return 0; +} + +static const struct v4l2_ctrl_ops daedalus_ctrl_ops = { + .s_ctrl = daedalus_s_ctrl_noop, +}; + static int daedalus_register_stateless_ctrls(struct v4l2_ctrl_handler *hdl) { size_t i; + /* + * Use v4l2_ctrl_new_custom (the pattern rkvdec / cedrus / + * hantro use) rather than v4l2_ctrl_new_std_compound. + * v4l2-core auto-detects the type from each known + * V4L2_CID_STATELESS_* id and allocates the right payload + * size internally; S_EXT_CTRLS then validates user input + * against that allocated payload. NOTE(review): the EINVAL + * libva-v4l2-request-fourier hit with _std_compound still + * occurs after this switch; it is not fixed by it. + */ for (i = 0; i < ARRAY_SIZE(daedalus_stateless_ctrls); i++) { - v4l2_ctrl_new_std_compound(hdl, NULL, - daedalus_stateless_ctrls[i], - v4l2_ctrl_ptr_create(NULL)); - /* - * Errors here mean the v4l2-core doesn't know about - * this CID on this kernel (e.g. older trees missing - * AV1_FILM_GRAIN). hdl->error captures it; we - * tolerate it — the codec just won't appear as - * supported through that control. 
- */ + struct v4l2_ctrl_config cfg = { + .ops = &daedalus_ctrl_ops, + .id = daedalus_stateless_ctrls[i], + }; + v4l2_ctrl_new_custom(hdl, &cfg, NULL); if (hdl->error) { pr_debug("daedalus_v4l2: skipping unsupported CID 0x%x (err=%d)\n", daedalus_stateless_ctrls[i], hdl->error); @@ -204,9 +237,14 @@ static int daedalus_register_stateless_ctrls(struct v4l2_ctrl_handler *hdl) /* -- format helpers -------------------------------------------------- */ /* - * CAPTURE format fill. Two layouts supported: + * CAPTURE format fill. Three layouts supported: * NV12M (default, 8-bit) — 2 planes: Y (W*H bytes) + interleaved * CbCr at half-res (W*H/2 bytes). + * NV12 (8-bit, 1 plane) — 1 plane: Y (W*H) followed by + * interleaved CbCr (W*H/2); total + * W*H*3/2 bytes. For legacy MPLANE + * clients that don't speak multi- + * plane (libva-v4l2-request). * P010 (10-bit HDR) — 1 plane: Y first (W*H*2 bytes) then * interleaved CbCr at half-res * (W*H bytes); 16-bit samples, @@ -230,6 +268,12 @@ static void daedalus_fill_capture_fmt(struct v4l2_pix_format_mplane *f, f->plane_fmt[0].sizeimage = w * h * 2 + w * h; f->plane_fmt[1].bytesperline = 0; f->plane_fmt[1].sizeimage = 0; + } else if (fourcc == V4L2_PIX_FMT_NV12) { + f->num_planes = 1; + f->plane_fmt[0].bytesperline = w; + f->plane_fmt[0].sizeimage = w * h + w * h / 2; + f->plane_fmt[1].bytesperline = 0; + f->plane_fmt[1].sizeimage = 0; } else { f->num_planes = 2; f->plane_fmt[0].bytesperline = w; @@ -927,6 +971,26 @@ static void daedalus_vdev_release(struct video_device *vdev) /* embedded in daedalus_dev (devm) — nothing to free here */ } +/* -- media controller request-API ops (Phase 8.11) ------------------ */ + +/* + * V4L2 Request API plumbing: lets a client allocate a media_request + * (MEDIA_IOC_REQUEST_ALLOC), stage per-buffer controls into it via + * VIDIOC_S_EXT_CTRLS with which=V4L2_CTRL_WHICH_REQUEST_VAL, then + * queue the OUTPUT buffer with the request fd bound — all controls + * + the buffer apply atomically at 
decode submission. + * + * vb2_request_validate / v4l2_m2m_request_queue are the canonical + * helpers; the daemon doesn't actually use the staged controls + * (FFmpeg re-parses the bitstream) but the wire-level support is + * what libva-v4l2-request-fourier requires to call MEDIA_IOC_ + * REQUEST_ALLOC successfully. + */ +static const struct media_device_ops daedalus_media_ops = { + .req_validate = vb2_request_validate, + .req_queue = v4l2_m2m_request_queue, +}; + /* -- platform driver bind -------------------------------------------- */ static int daedalus_probe(struct platform_device *pdev) @@ -964,9 +1028,18 @@ static int daedalus_probe(struct platform_device *pdev) * are required by spec to expose a media controller (the * request API rides on it) — v4l2-compliance's DECODER_CMD * test rejects drivers without it. + * + * Phase 8.11: wire the V4L2 request API media ops so libva- + * v4l2-request-fourier can MEDIA_IOC_REQUEST_ALLOC against + * us. vb2_request_validate + v4l2_m2m_request_queue are the + * canonical helpers — they bundle per-buffer controls with + * the matching qbuf so the decode submission is atomic + * (required for stateless decoders feeding hardware that + * needs all params present before kickoff). */ dev->mdev.dev = &pdev->dev; strscpy(dev->mdev.model, "daedalus-v4l2", sizeof(dev->mdev.model)); + dev->mdev.ops = &daedalus_media_ops; media_device_init(&dev->mdev); dev->v4l2_dev.mdev = &dev->mdev; diff --git a/tools/test_m2m_stream.c b/tools/test_m2m_stream.c index 8061f2d..7fe9d90 100644 --- a/tools/test_m2m_stream.c +++ b/tools/test_m2m_stream.c @@ -315,6 +315,9 @@ int main(int argc, char **argv) if (!strcmp(cf, "nv12m")) { capture_fourcc = V4L2_PIX_FMT_NV12M; capture_num_planes = 2; + } else if (!strcmp(cf, "nv12")) { + capture_fourcc = V4L2_PIX_FMT_NV12; + capture_num_planes = 1; } else if (!strcmp(cf, "p010")) { capture_fourcc = V4L2_PIX_FMT_P010; capture_num_planes = 1;