Files
daedalus-v4l2/daemon/src/decoder.c
T
marfrit 8c1d9960c4 DAEMON-PPS: synthesise H.264 SPS/PPS NAL units from V4L2 controls
libva-v4l2-request-fourier (and any V4L2-stateless-API consumer)
passes H.264 SPS/PPS as separate V4L2_CID_STATELESS_H264_{SPS,PPS}
controls; only the slice NAL goes into the OUTPUT buffer.  This is
correct per the V4L2 stateless contract.  But libavcodec — which
the daedalus daemon uses for actual decode (Option γ) — wants a
self-contained AnnexB stream including SPS+PPS before any slice.
Result on higgs: "non-existing PPS 0 referenced" + decode_slice_
header errors on every H.264 frame, even after LIBVA-1 and -2
routing correctly delivered the request to the daemon.

Fix splits across kernel + daemon, keeping the kernel module as a
thin transport and putting the actual NAL encoding in userspace:

  include/daedalus_v4l2_proto.h:
    Add struct daedalus_h264_meta (the four v4l2_ctrl_h264_*
    structs the kernel collects) and DAEDALUS_REQ_FLAG_H264_META
    (set in req.flags when the meta block is present between the
    daedalus_req_decode prefix and the slice bitstream).

  kernel/daedalus_v4l2_main.c:
    Add daedalus_collect_h264_meta() — reads the H.264 ctrl values
    from the bound media_request via v4l2_ctrl_find +
    ctrl->p_cur.p_h264_*.  device_run() calls it on H.264 codec_id,
    copies the structs into the REQ_DECODE payload between the
    prefix and bitstream, and sets the flag.  Payload size is
    bounds-checked against DAEDALUS_PROTO_MAX_PAYLOAD so an over-
    sized slice + meta fails loud instead of truncating.

  daemon/src/bitstream_writer.{c,h}:
    New module — MSB-first bit packer with H.264 Exp-Golomb ue(v)
    and se(v) coding + rbsp_trailing_bits alignment.  Sticky
    overflow flag so callers can verify the output buffer wasn't
    truncated.

  daemon/src/h264_nal_synth.{c,h}:
    New module — turns v4l2_ctrl_h264_sps / v4l2_ctrl_h264_pps
    into AnnexB-framed NAL units per ITU-T H.264 7.3.2.1 / 7.3.2.2.
    Emits emulation prevention bytes (0x03 after every 00 00 in the
    EBSP) and the 4-byte start code (0x00000001).  Coverage matches
    what V4L2 stateless surface gives us: VUI parameters and full
    scaling matrices are NOT emitted (V4L2 doesn't carry them — the
    seq_scaling_matrix_present_flag is set to 0 and libavcodec uses
    flat defaults, which matches the de-facto behaviour of most
    H.264 streams libva-v4l2-request drives).

  daemon/src/decoder.c:
    daedalus_decoder_run_request() now takes an optional
    h264_meta parameter.  For codec_id == H264 with meta != NULL,
    synthesises SPS+PPS NAL units, allocates a combined
    [SPS][PPS][slice] buffer (+ AV_INPUT_BUFFER_PADDING_SIZE), and
    feeds that to avcodec_send_packet instead of the raw slice.
    VP9/AV1 path unchanged (frames are self-contained).  Cleanup
    now goes through a unified `out:` label so the assembled
    buffer is always freed on every exit (including the existing
    decoder_open_codec / no-frame / receive_frame failure paths).

  daemon/src/chardev_client.c:
    handle_req_decode() peels off the optional meta block when the
    flag is set, passes it through to the decoder, and updates
    the payload-length consistency check (now allows for an extra
    sizeof(daedalus_h264_meta) when the flag is on).

Build (boltzmann aarch64): clean compile of all daemon sources,
including bitstream_writer + h264_nal_synth + the refactored
decoder.c.  Kernel module compile to be verified via DKMS rebuild
on higgs in the marfrit-packages bump that follows.

Test plan: with this commit + a marfrit-packages daedalus pin
bump, higgs's ffmpeg -hwaccel vaapi -i h264_test.mp4 should
produce a successful decode (vs. the previous "non-existing PPS 0
referenced" failure).  The daemon log should show:
  decoder: opened h264 context
  decoder: h264 prepended SPS=NB PPS=MB slice=KB
  decoder: OK 320x240 fmt=0 (yuv420p) fnv1a=0x...

VP9 / AV1 behaviour unchanged — they don't carry meta and the
existing per-frame self-describing path still applies.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-20 17:35:24 +02:00

561 lines
16 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/* SPDX-License-Identifier: BSD-2-Clause */
/*
* decoder.c — FFmpeg-driven decode helper for daedalus-v4l2 daemon.
*/
#include "decoder.h"
#include "ffmpeg_loader.h"
#include "h264_nal_synth.h"
#include "log.h"
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <linux/videodev2.h>
#include <libavcodec/avcodec.h>
#include <libavutil/pixfmt.h>
/*
* FNV-1a 32-bit hash. Used as a compact digest of the decoded
* frame's YUV planes so the kernel can verify "the daemon produced
* the expected pixels" without shipping the full frame through the
* 64-KiB-capped chardev wire protocol. Phase 8.5's dmabuf path
* carries actual pixel data; this digest stays useful as a
* cross-host sanity check.
*/
/* Seed for a fresh digest: the 32-bit FNV-1a offset basis. */
static uint32_t fnv1a32_init(void)
{
	const uint32_t offset_basis = 0x811c9dc5u;

	return offset_basis;
}
/*
 * Fold @len bytes starting at @data into the running FNV-1a state @h:
 * each byte is XORed in and the state is then multiplied by the 32-bit
 * FNV prime (arithmetic wraps modulo 2^32).  Returns the updated state;
 * a zero @len returns @h unchanged without touching @data.
 */
static uint32_t fnv1a32_update(uint32_t h, const uint8_t *data, size_t len)
{
	const uint32_t fnv_prime = 0x01000193u;
	const uint8_t *cursor = data;
	size_t remaining = len;

	while (remaining-- > 0) {
		h ^= *cursor++;
		h *= fnv_prime;
	}
	return h;
}
/*
 * Fold one image plane into the running digest @h.
 *
 * @p points at the first row; each of the @height rows contributes its
 * first @w bytes, and consecutive rows start @stride bytes apart.  Any
 * bytes between @w and @stride (libav's row alignment padding) are
 * skipped, so the digest matches a tightly packed dump of the plane
 * such as `ffmpeg -f rawvideo` reference output.
 *
 * Returns the updated digest.
 */
static uint32_t fnv1a32_plane(uint32_t h, const uint8_t *p,
			      int w, int height, int stride)
{
	int row = 0;

	while (row < height) {
		const uint8_t *line = p + (size_t) row * (size_t) stride;

		h = fnv1a32_update(h, line, (size_t) w);
		row++;
	}
	return h;
}
/*
 * Reset @dec to all-zero, remember @loader, and allocate the reusable
 * AVPacket and AVFrame through the loader's function table.
 *
 * Returns 0 on success or -ENOMEM if either allocation fails; when the
 * frame allocation fails the packet allocated just before is released
 * again.  Codec contexts are not opened here.
 */
int daedalus_decoder_init(struct daedalus_decoder *dec,
			  struct ffmpeg_loader *loader)
{
	memset(dec, 0, sizeof(*dec));
	dec->loader = loader;

	dec->pkt = loader->av_packet_alloc();
	if (dec->pkt == NULL)
		return -ENOMEM;

	dec->frame = loader->av_frame_alloc();
	if (dec->frame != NULL)
		return 0;

	/* Frame allocation failed: undo the packet allocation. */
	loader->av_packet_free(&dec->pkt);
	return -ENOMEM;
}
void daedalus_decoder_cleanup(struct daedalus_decoder *dec)
{
if (!dec || !dec->loader)
return;
if (dec->ctx_vp9)
dec->loader->avcodec_free_context(&dec->ctx_vp9);
if (dec->ctx_av1)
dec->loader->avcodec_free_context(&dec->ctx_av1);
if (dec->ctx_h264)
dec->loader->avcodec_free_context(&dec->ctx_h264);
if (dec->frame)
dec->loader->av_frame_free(&dec->frame);
if (dec->pkt)
dec->loader->av_packet_free(&dec->pkt);
memset(dec, 0, sizeof(*dec));
}
/*
 * Hand back, through @out, the AVCodecContext for @codec_id.  The
 * context is opened on first use and cached in @dec, so later calls
 * for the same codec return the cached pointer.
 *
 * Returns 0 on success, -ENOSYS for a codec_id that is not VP9, AV1 or
 * H.264, -ENOMEM if the context cannot be allocated, and -EIO if the
 * decoder lookup or avcodec_open2() fails.
 */
static int decoder_open_codec(struct daedalus_decoder *dec, uint32_t codec_id,
			      struct AVCodecContext **out)
{
	struct ffmpeg_loader *fm = dec->loader;
	struct AVCodecContext **slot;
	struct AVCodecContext *fresh;
	const struct AVCodec *decoder;
	enum AVCodecID av_id;
	int err;

	/* Map the wire codec id to libavcodec's id and our cache slot. */
	if (codec_id == DAEDALUS_CODEC_VP9) {
		av_id = AV_CODEC_ID_VP9;
		slot = &dec->ctx_vp9;
	} else if (codec_id == DAEDALUS_CODEC_AV1) {
		av_id = AV_CODEC_ID_AV1;
		slot = &dec->ctx_av1;
	} else if (codec_id == DAEDALUS_CODEC_H264) {
		av_id = AV_CODEC_ID_H264;
		slot = &dec->ctx_h264;
	} else {
		log_warn("decoder: unknown codec_id %u", codec_id);
		return -ENOSYS;
	}

	/* Already opened by an earlier request. */
	if (*slot != NULL) {
		*out = *slot;
		return 0;
	}

	decoder = fm->avcodec_find_decoder(av_id);
	if (decoder == NULL) {
		log_err("decoder: avcodec_find_decoder(%d) returned NULL", av_id);
		return -EIO;
	}

	fresh = fm->avcodec_alloc_context3(decoder);
	if (fresh == NULL)
		return -ENOMEM;

	err = fm->avcodec_open2(fresh, decoder, NULL);
	if (err < 0) {
		log_err("decoder: avcodec_open2 failed: %d", err);
		fm->avcodec_free_context(&fresh);
		return -EIO;
	}

	*slot = fresh;
	*out = fresh;
	log_info("decoder: opened %s context", decoder->name);
	return 0;
}
/*
* pack_nv12_to_planes() — defined further below, after the two
* single-plane packers:
*
* Pack the decoded YUV planes into NV12M layout across two
* mapped CAPTURE planes:
* planes[0] = Y, written w bytes per row with stride dst_y_stride
* planes[1] = interleaved CbCr at half-res, two bytes per chroma
* sample, written cw*2 bytes per row with stride
* dst_uv_stride
*
* Source stride padding (fr->linesize[*]) is stripped; destination
* stride padding (dst_stride - row_bytes) is left as-is — the V4L2
* client knows the format's bytesperline and walks accordingly.
*
* Returns 0 on success, -EINVAL if the source is not planar 4:2:0
* (Phase 8.6 still expects yuv420p-class outputs; 8.7 widens).
*/
/*
 * Pack a 10-bit 4:2:0 frame held in three separate source planes
 * (YUV420P10LE-style: one 16-bit word per sample, value in the low 10
 * bits) into the single destination plane planes->base[0] using the
 * V4L2_PIX_FMT_P010 layout: all Y rows first, then the interleaved
 * CbCr rows starting at stride * height.  Each sample is shifted left
 * by 6 so the 10 significant bits are MSB-aligned and the low 6 bits
 * are zero.
 *
 * planes->stride[0] is used for both the Y and the CbCr rows; when it
 * is 0 a tightly packed Y row (width * 2 bytes) is assumed.  Source
 * stride padding (fr->linesize[*]) is stripped.
 *
 * Returns 0 on success, or -EINVAL when:
 *   - @desc or @planes is missing, or no destination plane is mapped;
 *   - the source is not 3-component 4:2:0 with 10-bit samples;
 *   - any of the three source planes is absent (for instance a
 *     semi-planar frame, which carries no third data pointer);
 *   - the destination stride cannot hold one Y row or one CbCr row —
 *     packing anyway would spill each row into the next and the last
 *     row past the end of the area described by the stride.
 */
static int pack_p010_to_plane(struct AVFrame *fr,
			      const AVPixFmtDescriptor *desc,
			      const struct daedalus_capture_planes *planes)
{
	int h = fr->height;
	int w = fr->width;
	int cw, ch, y, x;
	uint8_t *base;
	uint32_t stride;
	uint8_t *dst_y, *dst_uv;
	size_t y_size, row_y, row_uv;

	if (!desc || !planes || planes->nr < 1)
		return -EINVAL;
	if (desc->nb_components < 3)
		return -EINVAL;
	if (desc->log2_chroma_w != 1 || desc->log2_chroma_h != 1)
		return -EINVAL;
	/* Only 10-bit-per-sample sources packed into 16 bits per
	 * libav convention.  Anything else needs its own path. */
	if (desc->comp[0].depth != 10)
		return -EINVAL;
	/* The loops below read Y, Cb and Cr from three distinct planes. */
	if (!fr->data[0] || !fr->data[1] || !fr->data[2])
		return -EINVAL;

	cw = AV_CEIL_RSHIFT(w, desc->log2_chroma_w);
	ch = AV_CEIL_RSHIFT(h, desc->log2_chroma_h);

	base = planes->base[0];
	stride = planes->stride[0] ? planes->stride[0] : (uint32_t) (w * 2);
	if (!base)
		return -EINVAL;

	/* Bytes actually written per row: 2 per luma sample, and 2 per
	 * Cb plus 2 per Cr for every chroma position. */
	row_y = (size_t) w * 2u;
	row_uv = (size_t) cw * 4u;
	if (stride < row_y || stride < row_uv)
		return -EINVAL;

	dst_y = base;
	y_size = (size_t) stride * (size_t) h;
	dst_uv = base + y_size;

	/* Y plane: shift 10-bit → MSB-aligned 16-bit. */
	for (y = 0; y < h; y++) {
		const uint16_t *src = (const uint16_t *) (fr->data[0] +
					(size_t) y * fr->linesize[0]);
		uint16_t *dst = (uint16_t *) (dst_y +
					(size_t) y * stride);

		for (x = 0; x < w; x++)
			dst[x] = (uint16_t) (src[x] << 6);
	}

	/* Interleave Cb/Cr at half-res, also MSB-aligned. */
	for (y = 0; y < ch; y++) {
		const uint16_t *u = (const uint16_t *) (fr->data[1] +
					(size_t) y * fr->linesize[1]);
		const uint16_t *v = (const uint16_t *) (fr->data[2] +
					(size_t) y * fr->linesize[2]);
		uint16_t *dst = (uint16_t *) (dst_uv +
					(size_t) y * stride);

		for (x = 0; x < cw; x++) {
			dst[x * 2 + 0] = (uint16_t) (u[x] << 6);
			dst[x * 2 + 1] = (uint16_t) (v[x] << 6);
		}
	}
	return 0;
}
/*
 * Pack an 8-bit 4:2:0 frame held in three separate source planes
 * (YUV420P-style) into the single destination plane planes->base[0]
 * using the V4L2_PIX_FMT_NV12 layout: all Y rows first, then the
 * interleaved CbCr rows starting at stride * height.  Same layout as
 * the P010 packer minus the depth shift; intended for
 * libva-v4l2-request-style clients that expect num_planes=1 NV12.
 *
 * planes->stride[0] is used for both the Y and the CbCr rows; when it
 * is 0 a tightly packed Y row (width bytes) is assumed.  Source stride
 * padding (fr->linesize[*]) is stripped.
 *
 * Returns 0 on success, or -EINVAL when:
 *   - @desc or @planes is missing, or no destination plane is mapped;
 *   - the source is not 3-component 4:2:0 with 8-bit samples;
 *   - any of the three source planes is absent (for instance a
 *     semi-planar frame, which carries no third data pointer);
 *   - the destination stride cannot hold one Y row or one CbCr row —
 *     packing anyway would spill each row into the next and the last
 *     row past the end of the area described by the stride.
 */
static int pack_nv12_single_to_plane(struct AVFrame *fr,
				     const AVPixFmtDescriptor *desc,
				     const struct daedalus_capture_planes *planes)
{
	int h = fr->height;
	int w = fr->width;
	int cw, ch, y, x;
	uint8_t *base;
	uint32_t stride;
	uint8_t *dst_y, *dst_uv;
	size_t y_size, row_y, row_uv;

	if (!desc || !planes || planes->nr < 1)
		return -EINVAL;
	if (desc->nb_components < 3)
		return -EINVAL;
	if (desc->log2_chroma_w != 1 || desc->log2_chroma_h != 1)
		return -EINVAL;
	if (desc->comp[0].depth != 8)
		return -EINVAL;
	/* The loops below read Y, Cb and Cr from three distinct planes. */
	if (!fr->data[0] || !fr->data[1] || !fr->data[2])
		return -EINVAL;

	cw = AV_CEIL_RSHIFT(w, desc->log2_chroma_w);
	ch = AV_CEIL_RSHIFT(h, desc->log2_chroma_h);

	base = planes->base[0];
	stride = planes->stride[0] ? planes->stride[0] : (uint32_t) w;
	if (!base)
		return -EINVAL;

	/* Bytes actually written per row: one per luma sample, and one
	 * Cb plus one Cr byte for every chroma position. */
	row_y = (size_t) w;
	row_uv = (size_t) cw * 2u;
	if (stride < row_y || stride < row_uv)
		return -EINVAL;

	dst_y = base;
	y_size = (size_t) stride * (size_t) h;
	dst_uv = base + y_size;

	/* Y plane copy — strip source stride padding. */
	for (y = 0; y < h; y++)
		memcpy(dst_y + (size_t) y * stride,
		       fr->data[0] + (size_t) y * fr->linesize[0],
		       row_y);

	/* Interleave Cb and Cr behind the Y rows. */
	for (y = 0; y < ch; y++) {
		const uint8_t *u = fr->data[1] +
			(size_t) y * fr->linesize[1];
		const uint8_t *v = fr->data[2] +
			(size_t) y * fr->linesize[2];
		uint8_t *row = dst_uv + (size_t) y * stride;

		for (x = 0; x < cw; x++) {
			row[x * 2 + 0] = u[x];
			row[x * 2 + 1] = v[x];
		}
	}
	return 0;
}
static int pack_nv12_to_planes(struct AVFrame *fr,
const AVPixFmtDescriptor *desc,
const struct daedalus_capture_planes *planes)
{
int h = fr->height;
int w = fr->width;
int cw, ch;
size_t row_y, row_uv;
int y, x;
uint8_t *dst_y, *dst_uv;
uint32_t dst_y_stride, dst_uv_stride;
if (!desc || !planes || planes->nr < 2)
return -EINVAL;
if (desc->nb_components < 3)
return -EINVAL;
if (desc->log2_chroma_w != 1 || desc->log2_chroma_h != 1)
return -EINVAL; /* not 4:2:0 — would need a different pack */
cw = AV_CEIL_RSHIFT(w, desc->log2_chroma_w);
ch = AV_CEIL_RSHIFT(h, desc->log2_chroma_h);
dst_y = planes->base[0];
dst_uv = planes->base[1];
dst_y_stride = planes->stride[0] ? planes->stride[0] : (uint32_t) w;
dst_uv_stride = planes->stride[1] ? planes->stride[1] : (uint32_t) (cw * 2);
row_y = (size_t) w;
row_uv = (size_t) cw * 2u;
if (!dst_y || !dst_uv)
return -EINVAL;
/* Y plane copy — strip source stride padding. */
for (y = 0; y < h; y++)
memcpy(dst_y + (size_t) y * dst_y_stride,
fr->data[0] + (size_t) y * fr->linesize[0],
row_y);
/* Interleave Cb and Cr into NV12 chroma plane. */
for (y = 0; y < ch; y++) {
const uint8_t *u = fr->data[1] +
(size_t) y * fr->linesize[1];
const uint8_t *v = fr->data[2] +
(size_t) y * fr->linesize[2];
uint8_t *row = dst_uv + (size_t) y * dst_uv_stride;
for (x = 0; x < cw; x++) {
row[x * 2 + 0] = u[x];
row[x * 2 + 1] = v[x];
}
}
(void) row_uv;
return 0;
}
int daedalus_decoder_run_request(struct daedalus_decoder *dec,
const struct daedalus_req_decode *req,
const uint8_t *bitstream,
const struct daedalus_h264_meta *h264_meta,
struct daedalus_resp_frame *resp,
const struct daedalus_capture_planes *planes)
{
struct ffmpeg_loader *fm = dec->loader;
struct AVCodecContext *ctx = NULL;
uint8_t *assembled = NULL;
size_t assembled_len = 0;
int rc;
memset(resp, 0, sizeof(*resp));
resp->codec_id = req->codec_id;
rc = decoder_open_codec(dec, req->codec_id, &ctx);
if (rc == -ENOSYS) {
resp->status = DAEDALUS_DECODE_ERR_CODEC;
goto out;
}
if (rc < 0) {
resp->status = DAEDALUS_DECODE_ERR_OPEN;
goto out;
}
fm->av_packet_unref(dec->pkt);
/*
* H.264 path: libavcodec needs SPS+PPS NAL units BEFORE the
* slice can be decoded. libva-v4l2-request passes those as
* separate V4L2 controls (per the stateless API), so the
* daedalus kernel module forwards them to us as struct
* daedalus_h264_meta. Synthesise AnnexB SPS+PPS NALs from
* the structs and prepend them to @bitstream before feeding
* libavcodec.
*/
if (req->codec_id == DAEDALUS_CODEC_H264 && h264_meta) {
uint8_t sps_nal[256];
uint8_t pps_nal[128];
size_t sps_len, pps_len;
sps_len = h264_synth_sps(&h264_meta->sps,
sps_nal, sizeof(sps_nal));
pps_len = h264_synth_pps(&h264_meta->pps,
pps_nal, sizeof(pps_nal));
if (sps_len == 0 || pps_len == 0) {
log_err("decoder: SPS/PPS NAL synth failed (sps=%zu pps=%zu)",
sps_len, pps_len);
resp->status = DAEDALUS_DECODE_ERR_SEND;
goto out;
}
assembled_len = sps_len + pps_len + req->bitstream_len;
assembled = malloc(assembled_len + AV_INPUT_BUFFER_PADDING_SIZE);
if (!assembled) {
resp->status = DAEDALUS_DECODE_ERR_SEND;
goto out;
}
memcpy(assembled, sps_nal, sps_len);
memcpy(assembled + sps_len, pps_nal, pps_len);
memcpy(assembled + sps_len + pps_len,
bitstream, req->bitstream_len);
memset(assembled + assembled_len, 0,
AV_INPUT_BUFFER_PADDING_SIZE);
dec->pkt->data = assembled;
dec->pkt->size = (int) assembled_len;
log_debug("decoder: h264 prepended SPS=%zuB PPS=%zuB slice=%uB",
sps_len, pps_len, req->bitstream_len);
} else {
/*
* VP9/AV1: bitstream is self-contained per frame, point the
* AVPacket at it directly. Cast away const — AVPacket->data
* is non-const but avcodec_send_packet doesn't mutate it.
*/
dec->pkt->data = (uint8_t *) (uintptr_t) bitstream;
dec->pkt->size = (int) req->bitstream_len;
}
rc = fm->avcodec_send_packet(ctx, dec->pkt);
if (rc < 0) {
log_err("decoder: avcodec_send_packet failed: %d", rc);
resp->status = DAEDALUS_DECODE_ERR_SEND;
goto out;
}
fm->av_frame_unref(dec->frame);
rc = fm->avcodec_receive_frame(ctx, dec->frame);
if (rc == AVERROR(EAGAIN) || rc == AVERROR_EOF) {
log_debug("decoder: no frame ready yet (rc=%d)", rc);
resp->status = DAEDALUS_DECODE_NO_FRAME;
goto out;
}
if (rc < 0) {
log_err("decoder: avcodec_receive_frame failed: %d", rc);
resp->status = DAEDALUS_DECODE_ERR_RECV;
goto out;
}
{
struct AVFrame *fr = dec->frame;
const AVPixFmtDescriptor *desc =
fm->av_pix_fmt_desc_get(fr->format);
uint32_t h = fnv1a32_init();
uint32_t luma_len = 0, chroma_len = 0;
resp->status = DAEDALUS_DECODE_OK;
resp->width = (uint32_t) fr->width;
resp->height = (uint32_t) fr->height;
resp->pix_fmt = fr->format;
/*
* Walk every plane reported by the AVPixFmtDescriptor.
* For each component, byte width = ((plane_w *
* step_minus1) >> 0) — but the descriptor only tells
* us which plane each component sits in, not the
* plane's byte stride per pixel. In practice for the
* formats we care about (YUV420P, YUV422P, YUV444P,
* GBRP, NV12), each plane has exactly one component
* at 1 byte/sample. Hash each plane at
* (width >> log2_chroma_w) × (height >> log2_chroma_h)
* for chroma planes, full-size for plane 0.
*
* This generalises cleanly to anything 8-bit-per-
* sample-per-plane; 10/12-bit (P010, YUV420P10LE) will
* need depth handling when Phase 8.6 brings HDR
* content into play.
*/
if (!desc) {
log_warn("decoder: no descriptor for pix_fmt %d",
fr->format);
} else {
int p, max_plane = 0;
int i;
for (i = 0; i < desc->nb_components; i++) {
if (desc->comp[i].plane > max_plane)
max_plane = desc->comp[i].plane;
}
for (p = 0; p <= max_plane; p++) {
int pw, ph;
if (!fr->data[p] || !fr->linesize[p])
continue;
if (p == 0) {
pw = fr->width;
ph = fr->height;
luma_len += (uint32_t) pw *
(uint32_t) ph;
} else {
pw = AV_CEIL_RSHIFT(fr->width,
desc->log2_chroma_w);
ph = AV_CEIL_RSHIFT(fr->height,
desc->log2_chroma_h);
chroma_len += (uint32_t) pw *
(uint32_t) ph;
}
h = fnv1a32_plane(h, fr->data[p], pw, ph,
fr->linesize[p]);
}
}
resp->luma_len = luma_len;
resp->chroma_len = chroma_len;
resp->fnv1a_yuv = h;
/*
* Pack pixels directly into the mapped CAPTURE dmabuf
* planes. Dispatch on the V4L2 fourcc the kernel
* negotiated:
* V4L2_PIX_FMT_NV12M (default, 8-bit, 2 planes)
* V4L2_PIX_FMT_P010 (10-bit HDR, 1 plane)
*/
if (planes && planes->nr >= 1) {
int prc = 0;
switch (req->capture_pix_fmt) {
case V4L2_PIX_FMT_NV12M:
prc = pack_nv12_to_planes(fr, desc, planes);
break;
case V4L2_PIX_FMT_NV12:
prc = pack_nv12_single_to_plane(fr, desc, planes);
break;
case V4L2_PIX_FMT_P010:
prc = pack_p010_to_plane(fr, desc, planes);
break;
default:
log_warn("decoder: unsupported capture fourcc 0x%08x",
req->capture_pix_fmt);
prc = -EINVAL;
break;
}
if (prc < 0)
log_warn("decoder: pack failed (pix_fmt=%d cap_fourcc=0x%08x) — kernel will see metadata only",
fr->format, req->capture_pix_fmt);
}
log_info("decoder: OK %dx%d fmt=%d (%s) fnv1a=0x%08x luma=%u chroma=%u",
fr->width, fr->height, fr->format,
desc ? desc->name : "?",
h, luma_len, chroma_len);
}
fm->av_frame_unref(dec->frame);
out:
free(assembled);
(void) assembled_len;
return 0;
}