/* SPDX-License-Identifier: BSD-2-Clause */
/*
 * bitstream_writer — MSB-first bit packer with Exp-Golomb coding.
 *
 * This part of the patch adds daemon/src/bitstream_writer.h and
 * daemon/src/bitstream_writer.c, and lists src/bitstream_writer.c and
 * src/h264_nal_synth.c as sources of daedalus_v4l2_daemon in
 * daemon/CMakeLists.txt.  The interface and the implementation are
 * shown together below.
 *
 * Used by h264_nal_synth to emit AnnexB SPS / PPS NAL units the
 * daemon prepends to libva-provided slice data before feeding
 * libavcodec.  The writer offers the minimum set of primitives the
 * H.264 SPS/PPS RBSPs need:
 *
 *   put_u(v, n)  — n-bit unsigned, MSB first
 *   put_ue(v)    — Exp-Golomb unsigned, ue(v)
 *   put_se(v)    — Exp-Golomb signed, se(v)
 *   align()      — rbsp_trailing_bits: stop_one + zero-pad to byte
 *
 * No allocations — the caller hands in a fixed buffer and the writer
 * tracks a (byte, bit) cursor in it.  Errors are sticky: running out
 * of buffer space, or asking for a value that cannot be encoded, sets
 * an error flag that is never cleared; callers check bsw_overflowed()
 * once at the end instead of after every call.
 */
#ifndef DAEDALUS_BITSTREAM_WRITER_H
#define DAEDALUS_BITSTREAM_WRITER_H

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct bs_writer {
	uint8_t *buf;		/* caller-owned output buffer */
	size_t   cap;		/* usable size of @buf in bytes */
	size_t   pos_bytes;	/* number of completely written bytes */
	int      pos_bit;	/* 0..7, MSB-first within the current byte */
	bool     overflow;	/* sticky error flag, see bsw_overflowed() */
};

void bsw_init(struct bs_writer *bs, uint8_t *buf, size_t cap);
void bsw_put_u(struct bs_writer *bs, uint32_t v, int n);
void bsw_put_ue(struct bs_writer *bs, uint32_t v);
void bsw_put_se(struct bs_writer *bs, int32_t v);

/* Align to the next byte boundary by appending rbsp_trailing_bits:
 * a single '1' followed by '0's up to the byte boundary.  After
 * this call bsw_bytes() is the RBSP length. */
void bsw_align_rbsp(struct bs_writer *bs);

/* Byte count of payload written so far.  If pos_bit != 0 the
 * partially filled byte is not counted; finalise with
 * bsw_align_rbsp() first. */
size_t bsw_bytes(const struct bs_writer *bs);

/* True once any write ran past the buffer or was given a value that
 * cannot be encoded.  The output must be discarded in that case. */
static inline bool bsw_overflowed(const struct bs_writer *bs)
{
	return bs->overflow;
}

#endif /* DAEDALUS_BITSTREAM_WRITER_H */

/* ---- implementation (bitstream_writer.c) ---- */

#include <string.h>

/*
 * Attach @bs to the caller's buffer and reset the cursor and the
 * error flag.  A NULL @buf is treated as a zero-capacity buffer so
 * that the first write reports an error instead of dereferencing
 * NULL.
 */
void bsw_init(struct bs_writer *bs, uint8_t *buf, size_t cap)
{
	bs->buf = buf;
	bs->cap = buf ? cap : 0;
	bs->pos_bytes = 0;
	bs->pos_bit = 0;
	bs->overflow = false;
	if (bs->cap)
		buf[0] = 0;
}

/*
 * Append the low @n bits of @v, most significant bit first.
 *
 * @n == 0 is a valid no-op (bsw_put_ue relies on it for v == 0).
 * @n outside 0..32 is a caller bug and sets the sticky error flag.
 * Running out of buffer space sets the same flag; bits that fitted
 * before the overrun stay in the buffer.
 */
void bsw_put_u(struct bs_writer *bs, uint32_t v, int n)
{
	int i;

	if (n == 0)
		return;
	if (n < 0 || n > 32) {
		bs->overflow = true;
		return;
	}

	for (i = n - 1; i >= 0; i--) {
		uint8_t bit = (uint8_t) ((v >> i) & 1u);

		if (bs->pos_bytes >= bs->cap) {
			bs->overflow = true;
			return;
		}

		/* Clear each byte on first touch so the OR below starts
		 * from a known state whatever the buffer held before. */
		if (bs->pos_bit == 0)
			bs->buf[bs->pos_bytes] = 0;

		bs->buf[bs->pos_bytes] |= (uint8_t) (bit << (7 - bs->pos_bit));
		if (++bs->pos_bit == 8) {
			bs->pos_bit = 0;
			bs->pos_bytes++;
		}
	}
}

/*
 * Exp-Golomb ue(v) — H.264 9.1.
 *
 *   codeNum         = v
 *   leadingZeroBits = floor(log2(codeNum + 1))
 *   code            = leadingZeroBits zeros, then the
 *                     (leadingZeroBits + 1)-bit value codeNum + 1
 *
 * Total length is 2 * leadingZeroBits + 1 bits: v = 0 is the single
 * bit "1", v in [1,2] takes 3 bits, and so on.
 *
 * codeNum + 1 must fit in 32 bits, so v == UINT32_MAX cannot be
 * encoded here; it sets the sticky error flag and writes nothing
 * (previously the addition wrapped to 0 and a stray '0' bit was
 * emitted).
 */
void bsw_put_ue(struct bs_writer *bs, uint32_t v)
{
	uint32_t code;
	int zeros = 0;

	if (v == UINT32_MAX) {
		bs->overflow = true;
		return;
	}

	code = v + 1u;
	/* zeros ends up as the index of the highest set bit of code;
	 * it never exceeds 31, so the shift stays defined. */
	while ((code >> zeros) > 1u)
		zeros++;

	bsw_put_u(bs, 0u, zeros);
	/* the leading '1' plus the @zeros low bits of code */
	bsw_put_u(bs, code, zeros + 1);
}

/*
 * Exp-Golomb se(v) — H.264 9.1.1.  Signed values are interleaved:
 *   0 -> 0, 1 -> 1, -1 -> 2, 2 -> 3, -2 -> 4, ...
 * i.e. codeNum = 2 * |v| - (v > 0 ? 1 : 0).
 *
 * The mapping is done in unsigned arithmetic so that large magnitudes
 * do not overflow a signed int.  INT32_MIN would need codeNum 2^32,
 * which is not representable; it sets the sticky error flag.
 */
void bsw_put_se(struct bs_writer *bs, int32_t v)
{
	uint32_t code_num;

	if (v == INT32_MIN) {
		bs->overflow = true;
		return;
	}

	if (v > 0)
		code_num = 2u * (uint32_t) v - 1u;
	else
		code_num = 2u * (0u - (uint32_t) v);	/* 2 * |v| */

	bsw_put_ue(bs, code_num);
}

/*
 * Append rbsp_trailing_bits: the rbsp_stop_one_bit followed by zero
 * bits up to the next byte boundary.
 *
 * The padding loop terminates even after an overrun: the writer only
 * runs out of space at a byte boundary, where pos_bit is already 0.
 */
void bsw_align_rbsp(struct bs_writer *bs)
{
	bsw_put_u(bs, 1u, 1);
	while (bs->pos_bit != 0)
		bsw_put_u(bs, 0u, 1);
}

/* Number of complete bytes written so far. */
size_t bsw_bytes(const struct bs_writer *bs)
{
	return bs->pos_bytes;
}
H.264 meta block follows req when the flag is set; + * bitstream comes after meta. */ + if (req.flags & DAEDALUS_REQ_FLAG_H264_META) + meta_len = sizeof(struct daedalus_h264_meta); + meta_off = sizeof(req); + + if ((size_t) req.bitstream_len + sizeof(req) + meta_len != + hdr->payload_len) { + log_err("REQ_DECODE cookie=%u: bitstream_len %u + meta %zu inconsistent with payload_len %u", + hdr->cookie, req.bitstream_len, meta_len, + hdr->payload_len); memset(&resp, 0, sizeof(resp)); resp.status = DAEDALUS_DECODE_ERR_RECV; return send_response(cli, DAEDALUS_MSG_RESP_FRAME, hdr->cookie, &resp, sizeof(resp)); } + if (meta_len) + h264_meta = (const struct daedalus_h264_meta *) + (payload + meta_off); - log_info("REQ_DECODE cookie=%u codec=%u bitstream=%u bytes capture=%ux%u %u planes", + log_info("REQ_DECODE cookie=%u codec=%u bitstream=%u bytes meta=%s capture=%ux%u %u planes", hdr->cookie, req.codec_id, req.bitstream_len, + h264_meta ? "h264" : "none", req.capture_width, req.capture_height, req.capture_num_planes); @@ -181,7 +196,9 @@ static int handle_req_decode(struct chardev_client *cli, } rc = daedalus_decoder_run_request(cli->decoder, &req, - payload + sizeof(req), &resp, + payload + meta_off + meta_len, + h264_meta, + &resp, planes.nr ? 
&planes : NULL); decoded = (rc >= 0); diff --git a/daemon/src/decoder.c b/daemon/src/decoder.c index a5ea69a..9d85ee8 100644 --- a/daemon/src/decoder.c +++ b/daemon/src/decoder.c @@ -4,6 +4,7 @@ */ #include "decoder.h" #include "ffmpeg_loader.h" +#include "h264_nal_synth.h" #include "log.h" #include @@ -350,11 +351,14 @@ static int pack_nv12_to_planes(struct AVFrame *fr, int daedalus_decoder_run_request(struct daedalus_decoder *dec, const struct daedalus_req_decode *req, const uint8_t *bitstream, + const struct daedalus_h264_meta *h264_meta, struct daedalus_resp_frame *resp, const struct daedalus_capture_planes *planes) { struct ffmpeg_loader *fm = dec->loader; struct AVCodecContext *ctx = NULL; + uint8_t *assembled = NULL; + size_t assembled_len = 0; int rc; memset(resp, 0, sizeof(*resp)); @@ -363,32 +367,72 @@ int daedalus_decoder_run_request(struct daedalus_decoder *dec, rc = decoder_open_codec(dec, req->codec_id, &ctx); if (rc == -ENOSYS) { resp->status = DAEDALUS_DECODE_ERR_CODEC; - return 0; + goto out; } if (rc < 0) { resp->status = DAEDALUS_DECODE_ERR_OPEN; - return 0; + goto out; } fm->av_packet_unref(dec->pkt); + /* - * The kernel's REQ_DECODE payload is borrowed memory we'll - * free as soon as this function returns. Pointing the - * AVPacket at it directly is safe because avcodec_send_packet - * either fully consumes the input or copies it internally — - * by the time we return we no longer reference @bitstream. - * - * We cast away const because AVPacket->data is non-const in - * the FFmpeg API; we promise not to mutate the buffer. + * H.264 path: libavcodec needs SPS+PPS NAL units BEFORE the + * slice can be decoded. libva-v4l2-request passes those as + * separate V4L2 controls (per the stateless API), so the + * daedalus kernel module forwards them to us as struct + * daedalus_h264_meta. Synthesise AnnexB SPS+PPS NALs from + * the structs and prepend them to @bitstream before feeding + * libavcodec. 
*/ - dec->pkt->data = (uint8_t *) (uintptr_t) bitstream; - dec->pkt->size = (int) req->bitstream_len; + if (req->codec_id == DAEDALUS_CODEC_H264 && h264_meta) { + uint8_t sps_nal[256]; + uint8_t pps_nal[128]; + size_t sps_len, pps_len; + + sps_len = h264_synth_sps(&h264_meta->sps, + sps_nal, sizeof(sps_nal)); + pps_len = h264_synth_pps(&h264_meta->pps, + pps_nal, sizeof(pps_nal)); + if (sps_len == 0 || pps_len == 0) { + log_err("decoder: SPS/PPS NAL synth failed (sps=%zu pps=%zu)", + sps_len, pps_len); + resp->status = DAEDALUS_DECODE_ERR_SEND; + goto out; + } + + assembled_len = sps_len + pps_len + req->bitstream_len; + assembled = malloc(assembled_len + AV_INPUT_BUFFER_PADDING_SIZE); + if (!assembled) { + resp->status = DAEDALUS_DECODE_ERR_SEND; + goto out; + } + memcpy(assembled, sps_nal, sps_len); + memcpy(assembled + sps_len, pps_nal, pps_len); + memcpy(assembled + sps_len + pps_len, + bitstream, req->bitstream_len); + memset(assembled + assembled_len, 0, + AV_INPUT_BUFFER_PADDING_SIZE); + + dec->pkt->data = assembled; + dec->pkt->size = (int) assembled_len; + log_debug("decoder: h264 prepended SPS=%zuB PPS=%zuB slice=%uB", + sps_len, pps_len, req->bitstream_len); + } else { + /* + * VP9/AV1: bitstream is self-contained per frame, point the + * AVPacket at it directly. Cast away const — AVPacket->data + * is non-const but avcodec_send_packet doesn't mutate it. 
+ */ + dec->pkt->data = (uint8_t *) (uintptr_t) bitstream; + dec->pkt->size = (int) req->bitstream_len; + } rc = fm->avcodec_send_packet(ctx, dec->pkt); if (rc < 0) { log_err("decoder: avcodec_send_packet failed: %d", rc); resp->status = DAEDALUS_DECODE_ERR_SEND; - return 0; + goto out; } fm->av_frame_unref(dec->frame); @@ -396,12 +440,12 @@ int daedalus_decoder_run_request(struct daedalus_decoder *dec, if (rc == AVERROR(EAGAIN) || rc == AVERROR_EOF) { log_debug("decoder: no frame ready yet (rc=%d)", rc); resp->status = DAEDALUS_DECODE_NO_FRAME; - return 0; + goto out; } if (rc < 0) { log_err("decoder: avcodec_receive_frame failed: %d", rc); resp->status = DAEDALUS_DECODE_ERR_RECV; - return 0; + goto out; } { @@ -508,5 +552,9 @@ int daedalus_decoder_run_request(struct daedalus_decoder *dec, } fm->av_frame_unref(dec->frame); + +out: + free(assembled); + (void) assembled_len; return 0; } diff --git a/daemon/src/decoder.h b/daemon/src/decoder.h index fe980b7..f6eba56 100644 --- a/daemon/src/decoder.h +++ b/daemon/src/decoder.h @@ -60,6 +60,12 @@ void daedalus_decoder_cleanup(struct daedalus_decoder *dec); * @dec: initialised decoder * @req: REQ_DECODE prefix (from the wire) * @bitstream: bitstream blob (req->bitstream_len bytes) + * @h264_meta: optional H.264 SPS/PPS metadata; non-NULL only when + * codec_id == H264 and the kernel set DAEDALUS_REQ_FLAG_ + * H264_META. Used to synthesise the AnnexB SPS+PPS NALs + * libavcodec needs before any slice (libva-v4l2-request + * passes only the slice in @bitstream per the V4L2 + * stateless API contract). NULL for VP9/AV1 paths. * @resp: caller-allocated RESP_FRAME output (zeroed by callee) * @planes: mapped CAPTURE planes (Phase 8.6 dmabuf path). 
If * NULL or planes->nr == 0, the decoder runs but @@ -75,6 +81,7 @@ void daedalus_decoder_cleanup(struct daedalus_decoder *dec); int daedalus_decoder_run_request(struct daedalus_decoder *dec, const struct daedalus_req_decode *req, const uint8_t *bitstream, + const struct daedalus_h264_meta *h264_meta, struct daedalus_resp_frame *resp, const struct daedalus_capture_planes *planes); diff --git a/daemon/src/h264_nal_synth.c b/daemon/src/h264_nal_synth.c new file mode 100644 index 0000000..3ae8e86 --- /dev/null +++ b/daemon/src/h264_nal_synth.c @@ -0,0 +1,270 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ +/* + * h264_nal_synth.c — encode SPS / PPS NAL units from the V4L2 + * stateless control structs. See header for design rationale. + * + * Spec references are to ITU-T H.264 (08/2021) section 7.3.2. + * The RBSP encodings here cover the common profiles libva-v4l2- + * request drives (Constrained Baseline, Main, High up to Hi10). + * VUI parameters and seq_scaling_list payloads are NOT emitted — + * we set the corresponding present flags to 0, which produces a + * valid SPS / PPS that libavcodec accepts (it just uses default + * scaling matrices and no VUI-derived timing). That matches the + * V4L2 stateless control surface: the kernel controls don't carry + * VUI fields either, so synthesising them would require fabrication. + */ +#include "h264_nal_synth.h" +#include "bitstream_writer.h" + +#include + +#define NAL_SPS 7 +#define NAL_PPS 8 +#define NAL_REF_IDC_HIGHEST 3 + +/* + * Profiles that carry the "chroma_format_idc and friends" extension + * block per H.264 7.3.2.1.1. Any profile_idc not in this list skips + * the chroma_format_idc/bit_depth/scaling/transform_bypass fields. 
+ */ +static bool sps_has_chroma_format_block(uint8_t profile_idc) +{ + switch (profile_idc) { + case 100: case 110: case 122: case 244: + case 44: case 83: case 86: + case 118: case 128: case 138: case 139: + case 134: case 135: + return true; + default: + return false; + } +} + +/* + * Insert emulation prevention bytes into @rbsp[0..len) and copy the + * result into @out. Returns the number of bytes written to @out. + * If the result would exceed @out_cap, returns 0. + * + * Rule (H.264 7.4.1.1): in the byte stream, any subsequence + * 0x00 0x00 0x00, 0x00 0x00 0x01, 0x00 0x00 0x02, or 0x00 0x00 0x03 + * inside the EBSP must be expanded to 0x00 0x00 0x03 . + * Practically: scan the RBSP, after every "0x00 0x00" output the + * 0x03 escape if the next byte is <= 0x03. + */ +static size_t emulation_prevent(const uint8_t *rbsp, size_t len, + uint8_t *out, size_t out_cap) +{ + size_t i, w = 0; + int zeros = 0; + + for (i = 0; i < len; i++) { + uint8_t b = rbsp[i]; + + if (zeros >= 2 && b <= 0x03) { + if (w >= out_cap) + return 0; + out[w++] = 0x03; + zeros = 0; + } + + if (w >= out_cap) + return 0; + out[w++] = b; + + if (b == 0x00) + zeros++; + else + zeros = 0; + } + + return w; +} + +/* + * Emit AnnexB start code + nal_unit_header + EBSP into @out. + * @rbsp/@rbsp_len is the raw RBSP (already byte-aligned). + * Returns total bytes written, or 0 on overflow. + */ +static size_t wrap_nal_annexb(uint8_t nal_unit_type, uint8_t nal_ref_idc, + const uint8_t *rbsp, size_t rbsp_len, + uint8_t *out, size_t out_cap) +{ + uint8_t header; + size_t w = 0, ebsp_len; + + if (out_cap < 5) + return 0; + + /* start code: 0x00 0x00 0x00 0x01 (4-byte form — safe for any + * concatenation with other NALs since 3-byte form would risk + * confusion when the preceding NAL ends in 0x00). 
*/ + out[w++] = 0x00; + out[w++] = 0x00; + out[w++] = 0x00; + out[w++] = 0x01; + + header = (uint8_t) (((nal_ref_idc & 0x3) << 5) | + (nal_unit_type & 0x1f)); + out[w++] = header; + + ebsp_len = emulation_prevent(rbsp, rbsp_len, out + w, out_cap - w); + if (ebsp_len == 0 && rbsp_len > 0) + return 0; + w += ebsp_len; + return w; +} + +size_t h264_synth_sps(const struct v4l2_ctrl_h264_sps *sps, + uint8_t *out, size_t out_cap) +{ + uint8_t rbsp[512]; + struct bs_writer bs; + uint32_t flags = sps->flags; + bool has_chroma_block = sps_has_chroma_format_block(sps->profile_idc); + bool frame_mbs_only = !!(flags & V4L2_H264_SPS_FLAG_FRAME_MBS_ONLY); + + bsw_init(&bs, rbsp, sizeof(rbsp)); + + bsw_put_u(&bs, sps->profile_idc, 8); + bsw_put_u(&bs, sps->constraint_set_flags, 8); + bsw_put_u(&bs, sps->level_idc, 8); + bsw_put_ue(&bs, sps->seq_parameter_set_id); + + if (has_chroma_block) { + bsw_put_ue(&bs, sps->chroma_format_idc); + if (sps->chroma_format_idc == 3) { + bsw_put_u(&bs, + (flags & V4L2_H264_SPS_FLAG_SEPARATE_COLOUR_PLANE) ? 1 : 0, + 1); + } + bsw_put_ue(&bs, sps->bit_depth_luma_minus8); + bsw_put_ue(&bs, sps->bit_depth_chroma_minus8); + bsw_put_u(&bs, + (flags & V4L2_H264_SPS_FLAG_QPPRIME_Y_ZERO_TRANSFORM_BYPASS) ? 1 : 0, + 1); + /* seq_scaling_matrix_present_flag = 0 — let libavcodec + * use default scaling matrices. V4L2 ships scaling + * matrices via a separate control which we don't fold + * into SPS here (the libavcodec decoder ignores them + * for default-flat content anyway). */ + bsw_put_u(&bs, 0u, 1); + } + + bsw_put_ue(&bs, sps->log2_max_frame_num_minus4); + bsw_put_ue(&bs, sps->pic_order_cnt_type); + + if (sps->pic_order_cnt_type == 0) { + bsw_put_ue(&bs, sps->log2_max_pic_order_cnt_lsb_minus4); + } else if (sps->pic_order_cnt_type == 1) { + uint32_t n = sps->num_ref_frames_in_pic_order_cnt_cycle; + uint32_t i; + bsw_put_u(&bs, + (flags & V4L2_H264_SPS_FLAG_DELTA_PIC_ORDER_ALWAYS_ZERO) ? 
1 : 0, + 1); + bsw_put_se(&bs, sps->offset_for_non_ref_pic); + bsw_put_se(&bs, sps->offset_for_top_to_bottom_field); + bsw_put_ue(&bs, n); + if (n > 255) + return 0; + for (i = 0; i < n; i++) + bsw_put_se(&bs, sps->offset_for_ref_frame[i]); + } + + bsw_put_ue(&bs, sps->max_num_ref_frames); + bsw_put_u(&bs, + (flags & V4L2_H264_SPS_FLAG_GAPS_IN_FRAME_NUM_VALUE_ALLOWED) ? 1 : 0, + 1); + bsw_put_ue(&bs, sps->pic_width_in_mbs_minus1); + bsw_put_ue(&bs, sps->pic_height_in_map_units_minus1); + bsw_put_u(&bs, frame_mbs_only ? 1u : 0u, 1); + if (!frame_mbs_only) { + bsw_put_u(&bs, + (flags & V4L2_H264_SPS_FLAG_MB_ADAPTIVE_FRAME_FIELD) ? 1 : 0, + 1); + } + bsw_put_u(&bs, + (flags & V4L2_H264_SPS_FLAG_DIRECT_8X8_INFERENCE) ? 1 : 0, + 1); + + /* frame_cropping_flag = 0 — V4L2 SPS doesn't carry crop offsets. + * libva/ffmpeg uses the surface dimensions from + * VAPictureParameterBufferH264 directly so the SPS crop is + * informational for libavcodec output sizing only; absent crop + * means the daemon's output equals the encoded size, which + * matches our wire protocol's capture_width/height. */ + bsw_put_u(&bs, 0u, 1); + /* vui_parameters_present_flag = 0 */ + bsw_put_u(&bs, 0u, 1); + + bsw_align_rbsp(&bs); + if (bsw_overflowed(&bs)) + return 0; + + return wrap_nal_annexb(NAL_SPS, NAL_REF_IDC_HIGHEST, + rbsp, bsw_bytes(&bs), out, out_cap); +} + +size_t h264_synth_pps(const struct v4l2_ctrl_h264_pps *pps, + uint8_t *out, size_t out_cap) +{ + uint8_t rbsp[128]; + struct bs_writer bs; + uint16_t flags = pps->flags; + bool transform_8x8 = !!(flags & V4L2_H264_PPS_FLAG_TRANSFORM_8X8_MODE); + + bsw_init(&bs, rbsp, sizeof(rbsp)); + + bsw_put_ue(&bs, pps->pic_parameter_set_id); + bsw_put_ue(&bs, pps->seq_parameter_set_id); + bsw_put_u(&bs, + (flags & V4L2_H264_PPS_FLAG_ENTROPY_CODING_MODE) ? 1 : 0, + 1); + bsw_put_u(&bs, + (flags & V4L2_H264_PPS_FLAG_BOTTOM_FIELD_PIC_ORDER_IN_FRAME_PRESENT) ? 
1 : 0, + 1); + bsw_put_ue(&bs, pps->num_slice_groups_minus1); + /* Slice-group map types only meaningful when num_slice_groups_minus1 > 0; + * V4L2 stateless decode path doesn't surface slice group maps, so we + * assume single-slice-group (0) — this is the overwhelming common case. */ + bsw_put_ue(&bs, pps->num_ref_idx_l0_default_active_minus1); + bsw_put_ue(&bs, pps->num_ref_idx_l1_default_active_minus1); + bsw_put_u(&bs, + (flags & V4L2_H264_PPS_FLAG_WEIGHTED_PRED) ? 1 : 0, + 1); + bsw_put_u(&bs, pps->weighted_bipred_idc, 2); + bsw_put_se(&bs, pps->pic_init_qp_minus26); + bsw_put_se(&bs, pps->pic_init_qs_minus26); + bsw_put_se(&bs, pps->chroma_qp_index_offset); + bsw_put_u(&bs, + (flags & V4L2_H264_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT) ? 1 : 0, + 1); + bsw_put_u(&bs, + (flags & V4L2_H264_PPS_FLAG_CONSTRAINED_INTRA_PRED) ? 1 : 0, + 1); + bsw_put_u(&bs, + (flags & V4L2_H264_PPS_FLAG_REDUNDANT_PIC_CNT_PRESENT) ? 1 : 0, + 1); + + /* The "more_rbsp_data()" section: only emit when we actually have + * something to say. If transform_8x8 is set OR the second chroma + * offset differs from the first, write the extended trailer; + * otherwise stop here and let rbsp_trailing_bits close out. This + * matches what ffmpeg expects — too-short PPS with default values + * is fine. */ + if (transform_8x8 || + pps->second_chroma_qp_index_offset != pps->chroma_qp_index_offset) { + bsw_put_u(&bs, transform_8x8 ? 1u : 0u, 1); + /* pic_scaling_matrix_present_flag = 0 — let libavcodec + * use defaults; we'd need full scaling list serialisation + * to do better and it rarely matters for stateless decode. 
*/ + bsw_put_u(&bs, 0u, 1); + bsw_put_se(&bs, pps->second_chroma_qp_index_offset); + } + + bsw_align_rbsp(&bs); + if (bsw_overflowed(&bs)) + return 0; + + return wrap_nal_annexb(NAL_PPS, NAL_REF_IDC_HIGHEST, + rbsp, bsw_bytes(&bs), out, out_cap); +} diff --git a/daemon/src/h264_nal_synth.h b/daemon/src/h264_nal_synth.h new file mode 100644 index 0000000..a34140b --- /dev/null +++ b/daemon/src/h264_nal_synth.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: BSD-2-Clause */ +/* + * h264_nal_synth.h — synthesise AnnexB SPS + PPS NAL units from the + * v4l2_ctrl_h264_sps / v4l2_ctrl_h264_pps structures the libva driver + * sets via V4L2 stateless controls and the daedalus kernel module + * forwards over the chardev as struct daedalus_h264_meta. + * + * libavcodec needs SPS+PPS NAL units BEFORE any slice NAL to bind + * the slice's pic_parameter_set_id reference; libva-v4l2-request + * passes only the slice in the OUTPUT buffer (per the V4L2 stateless + * H.264 contract). The daemon bridges by reconstructing SPS+PPS + * NAL bytes from the structured controls and prepending them to the + * bitstream the daemon hands libavcodec. + * + * Output is AnnexB-framed: 0x00 0x00 0x00 0x01 start code + NAL. + * Emulation prevention (insert 0x03 after any 0x00 0x00 in the RBSP) + * is handled by this module so the consumer can concatenate raw. + */ +#ifndef DAEDALUS_H264_NAL_SYNTH_H +#define DAEDALUS_H264_NAL_SYNTH_H + +#include +#include + +#include + +/* + * Encode an AnnexB SPS NAL into out[]. Returns the number of bytes + * written, or 0 on overflow / malformed input. out_cap must be at + * least 256 bytes to handle worst-case SPS with full offset_for_ref_ + * frame[] cycle. + */ +size_t h264_synth_sps(const struct v4l2_ctrl_h264_sps *sps, + uint8_t *out, size_t out_cap); + +/* + * Encode an AnnexB PPS NAL into out[]. Returns the number of bytes + * written, or 0 on overflow. out_cap should be at least 64 bytes; + * PPS NALs are small. 
+ */ +size_t h264_synth_pps(const struct v4l2_ctrl_h264_pps *pps, + uint8_t *out, size_t out_cap); + +#endif /* DAEDALUS_H264_NAL_SYNTH_H */ diff --git a/include/daedalus_v4l2_proto.h b/include/daedalus_v4l2_proto.h index 9f3f4ac..04e2dc2 100644 --- a/include/daedalus_v4l2_proto.h +++ b/include/daedalus_v4l2_proto.h @@ -25,6 +25,7 @@ #define DAEDALUS_V4L2_PROTO_H #include +#include #define DAEDALUS_PROTO_MAGIC 0x44303456u /* 'D04V' */ #define DAEDALUS_PROTO_VERSION 0u /* pre-1.0 */ @@ -90,10 +91,22 @@ enum daedalus_codec_id { DAEDALUS_CODEC_H264 = 3, }; +/** + * DAEDALUS_REQ_FLAG_H264_META - daedalus_req_decode.flags bit + * + * Set when a struct daedalus_h264_meta is present between the + * daedalus_req_decode prefix and the slice bitstream. Required for + * H.264 (codec_id == DAEDALUS_CODEC_H264) since libavcodec needs + * SPS/PPS that the V4L2 stateless API delivers as separate ctrls, + * not in the OUTPUT buffer. Other codecs ignore this bit. + */ +#define DAEDALUS_REQ_FLAG_H264_META 0x00000001u + /** * struct daedalus_req_decode - REQ_DECODE payload prefix * @codec_id: enum daedalus_codec_id * @bitstream_len: bytes of bitstream following this struct + * (after any optional metadata blocks — see flags) * @capture_width: CAPTURE buffer width in pixels * @capture_height: CAPTURE buffer height in pixels * @capture_pix_fmt: V4L2 fourcc of the CAPTURE format @@ -103,10 +116,17 @@ enum daedalus_codec_id { * @capture_plane_size: per-plane sizeimage from V4L2 S_FMT * (plane[0..N-1]). Unused entries = 0. * @capture_plane_stride: per-plane bytesperline from V4L2 S_FMT. - * @flags: reserved, must be zero + * @flags: bitmask of DAEDALUS_REQ_FLAG_* * - * Total payload_len for a REQ_DECODE = sizeof(struct - * daedalus_req_decode) + bitstream_len. 
+ * Wire layout for a REQ_DECODE payload: + * + * struct daedalus_req_decode req; + * IF (req.flags & DAEDALUS_REQ_FLAG_H264_META): + * struct daedalus_h264_meta meta; + * u8 bitstream[req.bitstream_len]; + * + * Total payload_len = sizeof(req) + (meta ? sizeof(meta) : 0) + * + req.bitstream_len. * * The daemon uses (capture_*) to fetch + mmap the right CAPTURE * plane via DAEDALUS_IOC_GET_DMABUF, then decodes pixels @@ -124,6 +144,32 @@ struct daedalus_req_decode { __u32 flags; }; +/** + * struct daedalus_h264_meta - H.264 stateless-decode metadata + * + * Optional block following the daedalus_req_decode prefix when + * DAEDALUS_REQ_FLAG_H264_META is set in req.flags. Carries the + * structured controls the kernel collected from + * V4L2_CID_STATELESS_H264_* — the daemon converts them into + * AnnexB SPS+PPS NAL units (via an Exp-Golomb writer) and + * prepends those NAL units to the slice bitstream before + * handing it to libavcodec. + * + * The kernel never inspects these fields beyond capturing them + * verbatim from the v4l2_ctrl_handler at device_run time; the + * field semantics are governed entirely by the linux uABI + * V4L2 stateless H.264 control definitions. + * + * Wire-stable across phases. If the kernel V4L2 H.264 control + * structs grow new fields the protocol version bumps with them. 
+ */ +struct daedalus_h264_meta { + struct v4l2_ctrl_h264_sps sps; + struct v4l2_ctrl_h264_pps pps; + struct v4l2_ctrl_h264_scaling_matrix scaling_matrix; + struct v4l2_ctrl_h264_decode_params decode_params; +}; + /** * enum daedalus_decode_status - RESP_FRAME outcome codes * @DAEDALUS_DECODE_OK: frame produced; fields below populated diff --git a/kernel/daedalus_v4l2_main.c b/kernel/daedalus_v4l2_main.c index b066c02..ffc616b 100644 --- a/kernel/daedalus_v4l2_main.c +++ b/kernel/daedalus_v4l2_main.c @@ -190,12 +190,14 @@ static const u32 daedalus_stateless_ctrls[] = { }; /* - * No-op control op set: daemon ignores all stateless control - * values (FFmpeg re-parses the bitstream). But v4l2-core requires - * ops to be present on a ctrl_handler that processes SET requests - * — without it, S_EXT_CTRLS rejects with EINVAL on validate. - * Always-success s_ctrl is the right shape for "we accept whatever - * you tell us but actually act on the OUTPUT buffer payload alone." + * Always-success s_ctrl op. v4l2-core requires ops to be present on + * a ctrl_handler that processes SET requests — without it, + * S_EXT_CTRLS rejects with EINVAL on validate. We don't act on + * values here at SET time; for H.264, device_run later reads the + * (request-bound) values from p_cur and ships them to the daemon + * via struct daedalus_h264_meta. For VP9/AV1 we still rely on + * FFmpeg re-parsing the bitstream — those formats are self- + * describing per frame. */ static int daedalus_s_ctrl_noop(struct v4l2_ctrl *ctrl) { @@ -207,6 +209,44 @@ static const struct v4l2_ctrl_ops daedalus_ctrl_ops = { .s_ctrl = daedalus_s_ctrl_noop, }; +/* + * Copy the current H.264 stateless control values into a + * daedalus_h264_meta scratch buffer. Returns true if all four + * required controls (SPS, PPS, scaling matrix, decode params) had + * data on the ctrl handler — caller then ships the meta block in + * REQ_DECODE. 
Returns false if any control was missing (caller + * skips the meta block; daemon will likely fail the decode, but + * with a clear "no SPS" error from libavcodec rather than a + * confusing protocol mismatch). + * + * The ctrl_handler's p_cur values are bound to the in-flight + * media_request by v4l2_ctrl_request_setup, which v4l2-m2m calls + * before device_run for stateless decoders. + */ +static bool daedalus_collect_h264_meta(struct daedalus_ctx *ctx, + struct daedalus_h264_meta *meta) +{ + struct v4l2_ctrl *c_sps, *c_pps, *c_sm, *c_dp; + + c_sps = v4l2_ctrl_find(&ctx->hdl, V4L2_CID_STATELESS_H264_SPS); + c_pps = v4l2_ctrl_find(&ctx->hdl, V4L2_CID_STATELESS_H264_PPS); + c_sm = v4l2_ctrl_find(&ctx->hdl, V4L2_CID_STATELESS_H264_SCALING_MATRIX); + c_dp = v4l2_ctrl_find(&ctx->hdl, V4L2_CID_STATELESS_H264_DECODE_PARAMS); + + if (!c_sps || !c_pps || !c_sm || !c_dp) + return false; + if (!c_sps->p_cur.p_h264_sps || !c_pps->p_cur.p_h264_pps || + !c_sm->p_cur.p_h264_scaling_matrix || + !c_dp->p_cur.p_h264_decode_params) + return false; + + meta->sps = *c_sps->p_cur.p_h264_sps; + meta->pps = *c_pps->p_cur.p_h264_pps; + meta->scaling_matrix = *c_sm->p_cur.p_h264_scaling_matrix; + meta->decode_params = *c_dp->p_cur.p_h264_decode_params; + return true; +} + static int daedalus_register_stateless_ctrls(struct v4l2_ctrl_handler *hdl) { size_t i; @@ -637,36 +677,71 @@ static void daedalus_device_run(void *priv) goto fail_buf_error; } - payload_len = sizeof(*req) + blen; - req = kmalloc(payload_len, GFP_KERNEL); - if (!req) - goto fail_buf_error; - memset(req, 0, sizeof(*req)); - - req->codec_id = daedalus_fourcc_to_codec_id(ctx->src_fmt.pixelformat); - if (!req->codec_id) { - v4l2_err(&dev->v4l2_dev, - "device_run: unsupported OUTPUT pixelformat 0x%08x\n", - ctx->src_fmt.pixelformat); - kfree(req); - req = NULL; - goto fail_buf_error; - } - req->bitstream_len = (u32) blen; - req->capture_width = ctx->dst_fmt.width; - req->capture_height = ctx->dst_fmt.height; - 
req->capture_pix_fmt = ctx->dst_fmt.pixelformat; - req->capture_num_planes = ctx->dst_fmt.num_planes; { - unsigned int p; - for (p = 0; p < ctx->dst_fmt.num_planes && p < 3; p++) { - req->capture_plane_size[p] = - ctx->dst_fmt.plane_fmt[p].sizeimage; - req->capture_plane_stride[p] = - ctx->dst_fmt.plane_fmt[p].bytesperline; + u32 cid = daedalus_fourcc_to_codec_id(ctx->src_fmt.pixelformat); + size_t meta_len = 0; + struct daedalus_h264_meta meta_local; + bool have_h264_meta = false; + + if (!cid) { + v4l2_err(&dev->v4l2_dev, + "device_run: unsupported OUTPUT pixelformat 0x%08x\n", + ctx->src_fmt.pixelformat); + goto fail_buf_error; } + + /* + * H.264 needs SPS/PPS/scaling-matrix/decode-params shipped + * to the daemon alongside the slice bitstream — libavcodec + * can't decode slices without them. VP9/AV1 are self- + * describing so we skip the meta block for those. + */ + if (cid == DAEDALUS_CODEC_H264) { + memset(&meta_local, 0, sizeof(meta_local)); + have_h264_meta = daedalus_collect_h264_meta(ctx, &meta_local); + if (have_h264_meta) + meta_len = sizeof(meta_local); + else + v4l2_warn(&dev->v4l2_dev, + "device_run: H.264 frame without SPS/PPS controls — daemon will fail decode\n"); + } + + payload_len = sizeof(*req) + meta_len + blen; + if (payload_len > DAEDALUS_PROTO_MAX_PAYLOAD) { + v4l2_err(&dev->v4l2_dev, + "device_run: payload %zu exceeds chardev cap %u\n", + payload_len, + (unsigned int) DAEDALUS_PROTO_MAX_PAYLOAD); + goto fail_buf_error; + } + + req = kmalloc(payload_len, GFP_KERNEL); + if (!req) + goto fail_buf_error; + memset(req, 0, sizeof(*req)); + + req->codec_id = cid; + req->bitstream_len = (u32) blen; + req->capture_width = ctx->dst_fmt.width; + req->capture_height = ctx->dst_fmt.height; + req->capture_pix_fmt = ctx->dst_fmt.pixelformat; + req->capture_num_planes = ctx->dst_fmt.num_planes; + { + unsigned int p; + for (p = 0; p < ctx->dst_fmt.num_planes && p < 3; p++) { + req->capture_plane_size[p] = + ctx->dst_fmt.plane_fmt[p].sizeimage; + 
req->capture_plane_stride[p] = + ctx->dst_fmt.plane_fmt[p].bytesperline; + } + } + if (have_h264_meta) { + req->flags |= DAEDALUS_REQ_FLAG_H264_META; + memcpy((u8 *) req + sizeof(*req), + &meta_local, sizeof(meta_local)); + } + memcpy((u8 *) req + sizeof(*req) + meta_len, bitstream, blen); } - memcpy((u8 *) req + sizeof(*req), bitstream, blen); inf = kzalloc(sizeof(*inf), GFP_KERNEL); if (!inf)