/* SPDX-License-Identifier: BSD-2-Clause */
/*
 * decoder.c — FFmpeg-driven decode helper for daedalus-v4l2 daemon.
 */
#include "decoder.h"
#include "ffmpeg_loader.h"
#include "h264_nal_synth.h"
#include "log.h"

#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>

#include <linux/videodev2.h>

#include <libavcodec/avcodec.h>
#include <libavutil/pixfmt.h>

/*
 * FNV-1a 32-bit hash.  Used as a compact digest of the decoded
 * frame's YUV planes so the kernel can verify "the daemon produced
 * the expected pixels" without shipping the full frame through the
 * 64-KiB-capped chardev wire protocol.  Phase 8.5's dmabuf path
 * carries actual pixel data; this digest stays useful as a
 * cross-host sanity check.
 */
static uint32_t fnv1a32_init(void)
{
	/* Starting value of the 32-bit FNV-1a hash (the offset basis). */
	const uint32_t offset_basis = 0x811c9dc5u;

	return offset_basis;
}

static uint32_t fnv1a32_update(uint32_t h, const uint8_t *data, size_t len)
{
	/*
	 * One FNV-1a step per input byte: XOR the byte into the hash,
	 * then multiply by the 32-bit FNV prime (wrapping mod 2^32).
	 * With len == 0 the hash is returned unchanged and @data is
	 * never read.
	 */
	while (len > 0) {
		h ^= *data;
		h *= 0x01000193u;
		data++;
		len--;
	}
	return h;
}

/*
 * Fold one image plane into the running FNV-1a hash @h and return
 * the updated hash.
 *
 * @p points at the first row and consecutive rows start @stride
 * bytes apart.  Only the first @w bytes of each of the @height rows
 * are hashed; the bytes between @w and @stride (libav's row
 * alignment padding) are skipped so the digest matches the tightly
 * packed layout of `ffmpeg -f rawvideo` reference output.
 */
static uint32_t fnv1a32_plane(uint32_t h, const uint8_t *p,
			      int w, int height, int stride)
{
	const size_t row_bytes = (size_t) w;
	const size_t pitch = (size_t) stride;
	int row = 0;

	while (row < height) {
		h = fnv1a32_update(h, p + (size_t) row * pitch, row_bytes);
		row++;
	}
	return h;
}

/*
 * Reset @dec to all-zero, remember @loader, and allocate the
 * AVPacket and AVFrame the decoder keeps for its whole lifetime.
 * Codec contexts are not opened here.
 *
 * Returns 0 on success or -ENOMEM if either allocation fails; on
 * failure nothing allocated here is left behind in @dec.
 */
int daedalus_decoder_init(struct daedalus_decoder *dec,
			  struct ffmpeg_loader *loader)
{
	memset(dec, 0, sizeof(*dec));
	dec->loader = loader;

	dec->pkt = loader->av_packet_alloc();
	if (dec->pkt == NULL)
		return -ENOMEM;

	dec->frame = loader->av_frame_alloc();
	if (dec->frame != NULL)
		return 0;

	/* Frame allocation failed: give the packet back before bailing. */
	loader->av_packet_free(&dec->pkt);
	return -ENOMEM;
}

void daedalus_decoder_cleanup(struct daedalus_decoder *dec)
{
	if (!dec || !dec->loader)
		return;
	if (dec->ctx_vp9)
		dec->loader->avcodec_free_context(&dec->ctx_vp9);
	if (dec->ctx_av1)
		dec->loader->avcodec_free_context(&dec->ctx_av1);
	if (dec->ctx_h264)
		dec->loader->avcodec_free_context(&dec->ctx_h264);
	if (dec->frame)
		dec->loader->av_frame_free(&dec->frame);
	if (dec->pkt)
		dec->loader->av_packet_free(&dec->pkt);
	memset(dec, 0, sizeof(*dec));
}

/*
 * Lazily open the AVCodecContext for codec_id.  Returns 0 on
 * success, -ENOSYS on unknown codec, -EIO on FFmpeg failure.
 */
static int decoder_open_codec(struct daedalus_decoder *dec, uint32_t codec_id,
			      struct AVCodecContext **out)
{
	struct ffmpeg_loader *fm = dec->loader;
	const struct AVCodec *codec;
	struct AVCodecContext *ctx;
	enum AVCodecID av_id;
	struct AVCodecContext **cache;
	int rc;

	switch (codec_id) {
	case DAEDALUS_CODEC_VP9:
		av_id = AV_CODEC_ID_VP9;
		cache = &dec->ctx_vp9;
		break;
	case DAEDALUS_CODEC_AV1:
		av_id = AV_CODEC_ID_AV1;
		cache = &dec->ctx_av1;
		break;
	case DAEDALUS_CODEC_H264:
		av_id = AV_CODEC_ID_H264;
		cache = &dec->ctx_h264;
		break;
	default:
		log_warn("decoder: unknown codec_id %u", codec_id);
		return -ENOSYS;
	}

	if (*cache) {
		*out = *cache;
		return 0;
	}

	codec = fm->avcodec_find_decoder(av_id);
	if (!codec) {
		log_err("decoder: avcodec_find_decoder(%d) returned NULL", av_id);
		return -EIO;
	}
	ctx = fm->avcodec_alloc_context3(codec);
	if (!ctx)
		return -ENOMEM;
	rc = fm->avcodec_open2(ctx, codec, NULL);
	if (rc < 0) {
		log_err("decoder: avcodec_open2 failed: %d", rc);
		fm->avcodec_free_context(&ctx);
		return -EIO;
	}

	*cache = ctx;
	*out = ctx;
	log_info("decoder: opened %s context", codec->name);
	return 0;
}

/*
 * NOTE: this comment documents pack_nv12_to_planes(), which is
 * defined further below, after the P010 and single-plane NV12
 * packers.
 *
 * Pack the decoded YUV planes into NV12M layout across two
 * mapped CAPTURE planes:
 *   planes[0] = Y, written w bytes per row with stride dst_y_stride
 *   planes[1] = interleaved CbCr at half-res, two bytes per chroma
 *               sample pair, written cw*2 bytes per row with stride
 *               dst_uv_stride
 *
 * Source stride padding (fr->linesize[*]) is stripped; destination
 * stride padding (dst_stride - row_bytes) is left as-is — the V4L2
 * client knows the format's bytesperline and walks accordingly.
 *
 * Returns 0 on success, -EINVAL if the source is not planar 4:2:0
 * (Phase 8.6 still expects yuv420p-class outputs; 8.7 widens).
 */
/*
 * Pack 10-bit planar YUV420P10LE into V4L2_PIX_FMT_P010 single
 * plane: Y plane (width × 2 bytes per row, height rows) followed by
 * an interleaved CbCr plane at half-res (cw × 2 samples × 2 bytes
 * per row, ch rows).  P010 stores 10-bit samples in 16-bit words,
 * MSB-aligned (low 6 bits zero).  libav's YUV420P10LE delivers
 * 10-bit samples in the LOW 10 bits, so every sample is shifted
 * left by 6.
 *
 * The single-plane layout means Y and CbCr are concatenated in
 * planes->base[0].  planes->stride[0] is the Y stride and is also
 * used for the CbCr rows; when it is 0 a tightly packed stride of
 * width × 2 bytes is assumed.
 *
 * Returns 0 on success, -EINVAL when:
 *   - @desc or @planes is NULL, or no destination plane is mapped;
 *   - the source is not 3-component 4:2:0 at 10 bits per sample;
 *   - the source is not fully planar (Y, Cb, Cr each in their own
 *     plane) or one of its plane pointers is NULL.  Semi-planar
 *     10-bit formats satisfy the depth/subsampling checks but keep
 *     Cb and Cr together in data[1], leaving data[2] NULL;
 *   - the destination stride is too small to hold one chroma row
 *     (which is never narrower than a luma row), since writing
 *     would then spill into the next row and past the last one.
 *
 * NOTE(review): the destination buffer size is not visible here, so
 * only the per-row stride can be validated, not the total extent.
 */
static int pack_p010_to_plane(struct AVFrame *fr,
			      const AVPixFmtDescriptor *desc,
			      const struct daedalus_capture_planes *planes)
{
	int h = fr->height;
	int w = fr->width;
	int cw, ch, y, x;
	uint8_t *base;
	uint32_t stride;
	uint8_t *dst_y, *dst_uv;
	size_t y_size, row_bytes;

	if (!desc || !planes || planes->nr < 1)
		return -EINVAL;
	if (desc->nb_components < 3)
		return -EINVAL;
	if (desc->log2_chroma_w != 1 || desc->log2_chroma_h != 1)
		return -EINVAL;
	/* Only 10-bit-per-sample sources packed into 16 bits per
	 * libav convention.  Anything else needs its own path. */
	if (desc->comp[0].depth != 10)
		return -EINVAL;
	/* The loops below read Cb from data[1] and Cr from data[2]. */
	if (desc->comp[1].plane != 1 || desc->comp[2].plane != 2)
		return -EINVAL;
	if (!fr->data[0] || !fr->data[1] || !fr->data[2])
		return -EINVAL;

	cw = AV_CEIL_RSHIFT(w, desc->log2_chroma_w);
	ch = AV_CEIL_RSHIFT(h, desc->log2_chroma_h);

	base   = planes->base[0];
	stride = planes->stride[0] ? planes->stride[0] : (uint32_t) (w * 2);
	if (!base)
		return -EINVAL;

	/* Widest row written: cw Cb/Cr pairs of 16-bit words (>= w * 2). */
	row_bytes = (size_t) cw * 2u * sizeof(uint16_t);
	if ((size_t) stride < row_bytes)
		return -EINVAL;

	dst_y  = base;
	y_size = (size_t) stride * (size_t) h;
	dst_uv = base + y_size;

	/* Y plane: shift 10-bit → MSB-aligned 16-bit. */
	for (y = 0; y < h; y++) {
		const uint16_t *src = (const uint16_t *) (fr->data[0] +
				(size_t) y * fr->linesize[0]);
		uint16_t *dst = (uint16_t *) (dst_y +
				(size_t) y * stride);
		for (x = 0; x < w; x++)
			dst[x] = (uint16_t) (src[x] << 6);
	}

	/* Interleave Cb/Cr at half-res, also MSB-aligned. */
	for (y = 0; y < ch; y++) {
		const uint16_t *u = (const uint16_t *) (fr->data[1] +
				(size_t) y * fr->linesize[1]);
		const uint16_t *v = (const uint16_t *) (fr->data[2] +
				(size_t) y * fr->linesize[2]);
		uint16_t *dst = (uint16_t *) (dst_uv +
				(size_t) y * stride);
		for (x = 0; x < cw; x++) {
			dst[x * 2 + 0] = (uint16_t) (u[x] << 6);
			dst[x * 2 + 1] = (uint16_t) (v[x] << 6);
		}
	}
	return 0;
}

/*
 * Pack 8-bit planar YUV420P into V4L2_PIX_FMT_NV12 single plane:
 * Y plane followed by interleaved CbCr at half-res, all in
 * planes->base[0].  Same layout as P010 sans the depth shift.  For
 * libva-v4l2-request-style clients that expect num_planes=1 NV12.
 *
 * planes->stride[0] is used for both the Y rows and the CbCr rows;
 * when it is 0 a tightly packed stride of width bytes is assumed.
 * Source row padding (fr->linesize[*]) is stripped.
 *
 * Returns 0 on success, -EINVAL when:
 *   - @desc or @planes is NULL, or no destination plane is mapped;
 *   - the source is not 3-component 4:2:0 at 8 bits per sample;
 *   - the source is not fully planar (Y, Cb, Cr each in their own
 *     plane) or one of its plane pointers is NULL.  A semi-planar
 *     source such as NV12 satisfies the depth/subsampling checks
 *     but leaves data[2] NULL;
 *   - the destination stride is too small to hold one chroma row
 *     (cw × 2 bytes, never narrower than a luma row), since writing
 *     would then spill into the next row and past the last one.
 *
 * NOTE(review): the destination buffer size is not visible here, so
 * only the per-row stride can be validated, not the total extent.
 */
static int pack_nv12_single_to_plane(struct AVFrame *fr,
				     const AVPixFmtDescriptor *desc,
				     const struct daedalus_capture_planes *planes)
{
	int h = fr->height;
	int w = fr->width;
	int cw, ch, y, x;
	uint8_t *base;
	uint32_t stride;
	uint8_t *dst_y, *dst_uv;
	size_t y_size, row_bytes;

	if (!desc || !planes || planes->nr < 1)
		return -EINVAL;
	if (desc->nb_components < 3)
		return -EINVAL;
	if (desc->log2_chroma_w != 1 || desc->log2_chroma_h != 1)
		return -EINVAL;
	if (desc->comp[0].depth != 8)
		return -EINVAL;
	/* The loops below read Cb from data[1] and Cr from data[2]. */
	if (desc->comp[1].plane != 1 || desc->comp[2].plane != 2)
		return -EINVAL;
	if (!fr->data[0] || !fr->data[1] || !fr->data[2])
		return -EINVAL;

	cw = AV_CEIL_RSHIFT(w, desc->log2_chroma_w);
	ch = AV_CEIL_RSHIFT(h, desc->log2_chroma_h);

	base   = planes->base[0];
	stride = planes->stride[0] ? planes->stride[0] : (uint32_t) w;
	if (!base)
		return -EINVAL;

	/* Widest row written: cw interleaved Cb/Cr byte pairs (>= w). */
	row_bytes = (size_t) cw * 2u;
	if ((size_t) stride < row_bytes)
		return -EINVAL;

	dst_y  = base;
	y_size = (size_t) stride * (size_t) h;
	dst_uv = base + y_size;

	/* Y plane copy — strip source stride padding. */
	for (y = 0; y < h; y++)
		memcpy(dst_y + (size_t) y * stride,
		       fr->data[0] + (size_t) y * fr->linesize[0],
		       (size_t) w);

	/* Interleave Cb and Cr directly after the Y rows. */
	for (y = 0; y < ch; y++) {
		const uint8_t *u = fr->data[1] +
				   (size_t) y * fr->linesize[1];
		const uint8_t *v = fr->data[2] +
				   (size_t) y * fr->linesize[2];
		uint8_t *row = dst_uv + (size_t) y * stride;
		for (x = 0; x < cw; x++) {
			row[x * 2 + 0] = u[x];
			row[x * 2 + 1] = v[x];
		}
	}
	return 0;
}

static int pack_nv12_to_planes(struct AVFrame *fr,
			       const AVPixFmtDescriptor *desc,
			       const struct daedalus_capture_planes *planes)
{
	int h = fr->height;
	int w = fr->width;
	int cw, ch;
	size_t row_y, row_uv;
	int y, x;
	uint8_t *dst_y, *dst_uv;
	uint32_t dst_y_stride, dst_uv_stride;

	if (!desc || !planes || planes->nr < 2)
		return -EINVAL;
	if (desc->nb_components < 3)
		return -EINVAL;
	if (desc->log2_chroma_w != 1 || desc->log2_chroma_h != 1)
		return -EINVAL;	/* not 4:2:0 — would need a different pack */

	cw = AV_CEIL_RSHIFT(w, desc->log2_chroma_w);
	ch = AV_CEIL_RSHIFT(h, desc->log2_chroma_h);

	dst_y		= planes->base[0];
	dst_uv		= planes->base[1];
	dst_y_stride	= planes->stride[0] ? planes->stride[0] : (uint32_t) w;
	dst_uv_stride	= planes->stride[1] ? planes->stride[1] : (uint32_t) (cw * 2);

	row_y  = (size_t) w;
	row_uv = (size_t) cw * 2u;

	if (!dst_y || !dst_uv)
		return -EINVAL;

	/* Y plane copy — strip source stride padding. */
	for (y = 0; y < h; y++)
		memcpy(dst_y + (size_t) y * dst_y_stride,
		       fr->data[0] + (size_t) y * fr->linesize[0],
		       row_y);

	/* Interleave Cb and Cr into NV12 chroma plane. */
	for (y = 0; y < ch; y++) {
		const uint8_t *u = fr->data[1] +
				   (size_t) y * fr->linesize[1];
		const uint8_t *v = fr->data[2] +
				   (size_t) y * fr->linesize[2];
		uint8_t *row = dst_uv + (size_t) y * dst_uv_stride;
		for (x = 0; x < cw; x++) {
			row[x * 2 + 0] = u[x];
			row[x * 2 + 1] = v[x];
		}
	}
	(void) row_uv;
	return 0;
}

int daedalus_decoder_run_request(struct daedalus_decoder *dec,
				 const struct daedalus_req_decode *req,
				 const uint8_t *bitstream,
				 const struct daedalus_h264_meta *h264_meta,
				 struct daedalus_resp_frame *resp,
				 const struct daedalus_capture_planes *planes)
{
	struct ffmpeg_loader *fm = dec->loader;
	struct AVCodecContext *ctx = NULL;
	uint8_t *assembled = NULL;
	size_t assembled_len = 0;
	int rc;

	memset(resp, 0, sizeof(*resp));
	resp->codec_id = req->codec_id;

	rc = decoder_open_codec(dec, req->codec_id, &ctx);
	if (rc == -ENOSYS) {
		resp->status = DAEDALUS_DECODE_ERR_CODEC;
		goto out;
	}
	if (rc < 0) {
		resp->status = DAEDALUS_DECODE_ERR_OPEN;
		goto out;
	}

	fm->av_packet_unref(dec->pkt);

	/*
	 * H.264 path: libavcodec needs SPS+PPS NAL units BEFORE the
	 * slice can be decoded.  libva-v4l2-request passes those as
	 * separate V4L2 controls (per the stateless API), so the
	 * daedalus kernel module forwards them to us as struct
	 * daedalus_h264_meta.  Synthesise AnnexB SPS+PPS NALs from
	 * the structs and prepend them to @bitstream before feeding
	 * libavcodec.
	 */
	if (req->codec_id == DAEDALUS_CODEC_H264 && h264_meta) {
		uint8_t sps_nal[256];
		uint8_t pps_nal[128];
		size_t sps_len, pps_len;

		sps_len = h264_synth_sps(&h264_meta->sps,
					 sps_nal, sizeof(sps_nal));
		pps_len = h264_synth_pps(&h264_meta->pps,
					 pps_nal, sizeof(pps_nal));
		if (sps_len == 0 || pps_len == 0) {
			log_err("decoder: SPS/PPS NAL synth failed (sps=%zu pps=%zu)",
				sps_len, pps_len);
			resp->status = DAEDALUS_DECODE_ERR_SEND;
			goto out;
		}

		assembled_len = sps_len + pps_len + req->bitstream_len;
		assembled = malloc(assembled_len + AV_INPUT_BUFFER_PADDING_SIZE);
		if (!assembled) {
			resp->status = DAEDALUS_DECODE_ERR_SEND;
			goto out;
		}
		memcpy(assembled, sps_nal, sps_len);
		memcpy(assembled + sps_len, pps_nal, pps_len);
		memcpy(assembled + sps_len + pps_len,
		       bitstream, req->bitstream_len);
		memset(assembled + assembled_len, 0,
		       AV_INPUT_BUFFER_PADDING_SIZE);

		dec->pkt->data = assembled;
		dec->pkt->size = (int) assembled_len;
		log_debug("decoder: h264 prepended SPS=%zuB PPS=%zuB slice=%uB",
			  sps_len, pps_len, req->bitstream_len);
	} else {
		/*
		 * VP9/AV1: bitstream is self-contained per frame, point the
		 * AVPacket at it directly.  Cast away const — AVPacket->data
		 * is non-const but avcodec_send_packet doesn't mutate it.
		 */
		dec->pkt->data = (uint8_t *) (uintptr_t) bitstream;
		dec->pkt->size = (int) req->bitstream_len;
	}

	rc = fm->avcodec_send_packet(ctx, dec->pkt);
	if (rc < 0) {
		log_err("decoder: avcodec_send_packet failed: %d", rc);
		resp->status = DAEDALUS_DECODE_ERR_SEND;
		goto out;
	}

	fm->av_frame_unref(dec->frame);
	rc = fm->avcodec_receive_frame(ctx, dec->frame);
	if (rc == AVERROR(EAGAIN) || rc == AVERROR_EOF) {
		log_debug("decoder: no frame ready yet (rc=%d)", rc);
		resp->status = DAEDALUS_DECODE_NO_FRAME;
		goto out;
	}
	if (rc < 0) {
		log_err("decoder: avcodec_receive_frame failed: %d", rc);
		resp->status = DAEDALUS_DECODE_ERR_RECV;
		goto out;
	}

	{
		struct AVFrame *fr = dec->frame;
		const AVPixFmtDescriptor *desc =
			fm->av_pix_fmt_desc_get(fr->format);
		uint32_t h = fnv1a32_init();
		uint32_t luma_len = 0, chroma_len = 0;

		resp->status	= DAEDALUS_DECODE_OK;
		resp->width	= (uint32_t) fr->width;
		resp->height	= (uint32_t) fr->height;
		resp->pix_fmt	= fr->format;

		/*
		 * Walk every plane reported by the AVPixFmtDescriptor.
		 * For each component, byte width = ((plane_w *
		 * step_minus1) >> 0) — but the descriptor only tells
		 * us which plane each component sits in, not the
		 * plane's byte stride per pixel.  In practice for the
		 * formats we care about (YUV420P, YUV422P, YUV444P,
		 * GBRP, NV12), each plane has exactly one component
		 * at 1 byte/sample.  Hash each plane at
		 * (width >> log2_chroma_w) × (height >> log2_chroma_h)
		 * for chroma planes, full-size for plane 0.
		 *
		 * This generalises cleanly to anything 8-bit-per-
		 * sample-per-plane; 10/12-bit (P010, YUV420P10LE) will
		 * need depth handling when Phase 8.6 brings HDR
		 * content into play.
		 */
		if (!desc) {
			log_warn("decoder: no descriptor for pix_fmt %d",
				 fr->format);
		} else {
			int p, max_plane = 0;
			int i;

			for (i = 0; i < desc->nb_components; i++) {
				if (desc->comp[i].plane > max_plane)
					max_plane = desc->comp[i].plane;
			}

			for (p = 0; p <= max_plane; p++) {
				int pw, ph;
				if (!fr->data[p] || !fr->linesize[p])
					continue;
				if (p == 0) {
					pw = fr->width;
					ph = fr->height;
					luma_len += (uint32_t) pw *
						    (uint32_t) ph;
				} else {
					pw = AV_CEIL_RSHIFT(fr->width,
							    desc->log2_chroma_w);
					ph = AV_CEIL_RSHIFT(fr->height,
							    desc->log2_chroma_h);
					chroma_len += (uint32_t) pw *
						      (uint32_t) ph;
				}
				h = fnv1a32_plane(h, fr->data[p], pw, ph,
						  fr->linesize[p]);
			}
		}

		resp->luma_len	 = luma_len;
		resp->chroma_len = chroma_len;
		resp->fnv1a_yuv	 = h;

		/*
		 * Pack pixels directly into the mapped CAPTURE dmabuf
		 * planes.  Dispatch on the V4L2 fourcc the kernel
		 * negotiated:
		 *   V4L2_PIX_FMT_NV12M (default, 8-bit, 2 planes)
		 *   V4L2_PIX_FMT_P010  (10-bit HDR, 1 plane)
		 */
		if (planes && planes->nr >= 1) {
			int prc = 0;
			switch (req->capture_pix_fmt) {
			case V4L2_PIX_FMT_NV12M:
				prc = pack_nv12_to_planes(fr, desc, planes);
				break;
			case V4L2_PIX_FMT_NV12:
				prc = pack_nv12_single_to_plane(fr, desc, planes);
				break;
			case V4L2_PIX_FMT_P010:
				prc = pack_p010_to_plane(fr, desc, planes);
				break;
			default:
				log_warn("decoder: unsupported capture fourcc 0x%08x",
					 req->capture_pix_fmt);
				prc = -EINVAL;
				break;
			}
			if (prc < 0)
				log_warn("decoder: pack failed (pix_fmt=%d cap_fourcc=0x%08x) — kernel will see metadata only",
					 fr->format, req->capture_pix_fmt);
		}

		log_info("decoder: OK %dx%d fmt=%d (%s) fnv1a=0x%08x luma=%u chroma=%u",
			 fr->width, fr->height, fr->format,
			 desc ? desc->name : "?",
			 h, luma_len, chroma_len);
	}

	fm->av_frame_unref(dec->frame);

out:
	free(assembled);
	(void) assembled_len;
	return 0;
}