/*
 * Copyright (C) 2007 Intel Corporation
 * Copyright (C) 2016 Florent Revest <florent.revest@free-electrons.com>
 * Copyright (C) 2018 Paul Kocialkowski <paul.kocialkowski@bootlin.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "image.h"
#include "buffer.h"
#include "request.h"
#include "surface.h"
#include "video.h"

#include <assert.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

#include <sys/ioctl.h>

#include <linux/dma-buf.h>

#include "tiled_yuv.h"
#include "utils.h"
#include "v4l2.h"

/*
 * Create a VA image and its backing VAImageBufferType buffer.
 *
 * The buffer size is the sum of the per-buffer sizes reported by V4L2 for
 * the current capture format. The per-plane pitches and sizes are then
 * derived from the first plane's bytesperline, assuming an NV12-like layout
 * (every further plane is half the size of the first one).
 *
 * @context: driver context; pDriverData holds the request_data.
 * @format:  image format, copied verbatim into the resulting image.
 * @width:   image width, stored verbatim in the resulting image.
 * @height:  image height, stored verbatim in the resulting image.
 * @image:   output; fully overwritten on success.
 *
 * Returns VA_STATUS_SUCCESS, VA_STATUS_ERROR_OPERATION_FAILED when no video
 * format is set, the V4L2 format query fails or the plane count does not
 * fit the image description, VA_STATUS_ERROR_ALLOCATION_FAILED when no
 * image object could be obtained, or the status of RequestCreateBuffer.
 */
VAStatus RequestCreateImage(VADriverContextP context, VAImageFormat *format,
			    int width, int height, VAImage *image)
{
	struct request_data *driver_data = context->pDriverData;
	unsigned int destination_sizes[VIDEO_MAX_PLANES];
	unsigned int destination_bytesperlines[VIDEO_MAX_PLANES];
	unsigned int destination_planes_count;
	unsigned int planes_count;
	unsigned int format_width, format_height;
	unsigned int size;
	unsigned int offset;
	unsigned int capture_type;
	struct video_format *video_format;
	struct object_image *image_object;
	VABufferID buffer_id;
	VAImageID id;
	VAStatus status;
	unsigned int i;
	int rc;

	video_format = driver_data->video_format;
	if (video_format == NULL)
		return VA_STATUS_ERROR_OPERATION_FAILED;

	capture_type = v4l2_type_video_capture(video_format->v4l2_mplane);

	/*
	 * FIXME: This should be replaced by per-pixelformat handling to
	 * determine the logical plane offsets and sizes.
	 */
	rc = v4l2_get_format(driver_data->video_fd, capture_type,
			     &format_width, &format_height,
			     destination_bytesperlines, destination_sizes,
			     &planes_count);
	if (rc < 0)
		return VA_STATUS_ERROR_OPERATION_FAILED;

	destination_planes_count = video_format->planes_count;

	/*
	 * The logical plane count indexes both the local per-plane arrays
	 * and the fixed-size pitches/offsets arrays of the VAImage: refuse
	 * anything that would write past either of them.
	 */
	if (destination_planes_count > VIDEO_MAX_PLANES ||
	    destination_planes_count >
		    sizeof(image->pitches) / sizeof(image->pitches[0]))
		return VA_STATUS_ERROR_OPERATION_FAILED;

	/* The size returned by V4L2 covers buffers, not logical planes. */
	size = 0;
	for (i = 0; i < planes_count; i++)
		size += destination_sizes[i];

	/* Here we calculate the sizes assuming NV12. */

	destination_sizes[0] = destination_bytesperlines[0] * format_height;

	for (i = 1; i < destination_planes_count; i++) {
		destination_bytesperlines[i] = destination_bytesperlines[0];
		destination_sizes[i] = destination_sizes[0] / 2;
	}

	id = object_heap_allocate(&driver_data->image_heap);
	image_object = IMAGE(driver_data, id);
	if (image_object == NULL)
		return VA_STATUS_ERROR_ALLOCATION_FAILED;

	status = RequestCreateBuffer(context, 0, VAImageBufferType, size, 1,
				     NULL, &buffer_id);
	if (status != VA_STATUS_SUCCESS) {
		/* Do not leave a half-initialized image object behind. */
		object_heap_free(&driver_data->image_heap,
				 (struct object_base *)image_object);
		return status;
	}

	memset(image, 0, sizeof(*image));

	image->format = *format;
	image->width = width;
	image->height = height;
	image->buf = buffer_id;
	image->image_id = id;

	image->num_planes = destination_planes_count;
	image->data_size = size;

	/*
	 * Planes are laid out back to back: each offset is the sum of the
	 * sizes of all previous planes, not just of the one right before.
	 */
	offset = 0;
	for (i = 0; i < image->num_planes; i++) {
		image->pitches[i] = destination_bytesperlines[i];
		image->offsets[i] = offset;
		offset += destination_sizes[i];
	}

	image_object->image = *image;

	return VA_STATUS_SUCCESS;
}

/*
 * Destroy a VA image: release its backing buffer first and, only if that
 * succeeded, give the image object back to the driver's image heap.
 *
 * Returns VA_STATUS_ERROR_INVALID_IMAGE when image_id does not resolve to an
 * image object, otherwise the status of RequestDestroyBuffer.
 */
VAStatus RequestDestroyImage(VADriverContextP context, VAImageID image_id)
{
	struct request_data *driver_data = context->pDriverData;
	struct object_image *target = IMAGE(driver_data, image_id);
	VAStatus result;

	if (target == NULL)
		return VA_STATUS_ERROR_INVALID_IMAGE;

	result = RequestDestroyBuffer(context, target->image.buf);

	/* The image object is kept when its buffer could not be destroyed. */
	if (result == VA_STATUS_SUCCESS)
		object_heap_free(&driver_data->image_heap,
				 (struct object_base *)target);

	return result;
}

/*
 * Copy the decoded picture of a surface into the buffer backing an image.
 *
 * The capture buffer is exported as dma-buf fds on a best-effort basis so
 * that the CPU read can be bracketed by DMA_BUF_IOCTL_SYNC START/END. Every
 * exported fd is synced with END and closed before returning, on the error
 * path as well as on success.
 *
 * @driver_data:    driver state (video fd, video format, object heaps).
 * @surface_object: source surface; its destination_* fields may be
 *                  overwritten from the linked decode surface (see below).
 * @image:          destination image; offsets/pitches select where each
 *                  plane lands in the image buffer.
 *
 * Returns VA_STATUS_SUCCESS, VA_STATUS_ERROR_INVALID_BUFFER when the image
 * buffer cannot be resolved, or VA_STATUS_ERROR_OPERATION_FAILED when the
 * source or destination of a plane copy is NULL.
 */
static VAStatus copy_surface_to_image (struct request_data *driver_data,
				       struct object_surface *surface_object,
				       VAImage *image)
{
	struct object_buffer *buffer_object;
	VAStatus status = VA_STATUS_SUCCESS;
	unsigned int i;
	int sync_fds[VIDEO_MAX_PLANES];
	unsigned int n_sync_fds = 0;

	buffer_object = BUFFER(driver_data, image->buf);
	if (buffer_object == NULL)
		return VA_STATUS_ERROR_INVALID_BUFFER;

	for (i = 0; i < VIDEO_MAX_PLANES; i++)
		sync_fds[i] = -1;

	/*
	 * iter13 α-17: explicit cache sync around the CAPTURE buffer read.
	 *
	 * The CAPTURE buffer is V4L2_MEMORY_MMAP and was mapped at
	 * cap_pool_init time with cached attributes. Kernel decode writes to
	 * the buffer via DMA, which doesn't propagate to the CPU's cache
	 * observer for that virtual mapping. Reading from
	 * surface_object->destination_data[] without an explicit cache
	 * invalidation returns stale data — observed empirically as Bug 4
	 * (H.264 partial-fill) and Bug 5 (HEVC all-zero) when libva went
	 * through the SAME readback path that kdirect ffmpeg-v4l2request +
	 * DRM_PRIME-mmap successfully reads (kdirect's drm-prime mmap
	 * implicitly handles sync).
	 *
	 * DMA_BUF_IOCTL_SYNC(START | READ) makes the CPU mapping coherent
	 * with the producing engine's writes; END releases the sync.
	 * Per V4L2 + dma-buf spec, this is the userspace contract for
	 * cached-mmap'd buffers (Tomasz Figa, linaro-mm-sig 2024-07-11).
	 *
	 * Requires a dma-buf fd: get one via VIDIOC_EXPBUF, sync, close.
	 * Per-call cost is one ioctl pair + one fd open/close per plane.
	 * Could be optimised by caching the EXPBUF fd on the cap_pool slot,
	 * but doing it just-in-time keeps the lifecycle uncomplicated. The
	 * EXPBUF fd's dup count doesn't affect the V4L2 buffer's underlying
	 * pages; closing the fd is a no-op on memory.
	 *
	 * If EXPBUF fails (e.g., consumer-held EXPBUF prevents a second one
	 * — only true for hantro G1 oddity), we skip the sync silently. The
	 * existing pre-iter13 behavior is preserved on the error path.
	 *
	 * The sync is also skipped when the surface reports more buffers
	 * than sync_fds[] can hold, so the export can never write past the
	 * end of the array.
	 */
	if (surface_object->current_slot != NULL &&
	    driver_data->video_format != NULL &&
	    surface_object->destination_buffers_count <= VIDEO_MAX_PLANES) {
		unsigned int capture_type =
			v4l2_type_video_capture(driver_data->video_format->v4l2_mplane);
		if (v4l2_export_buffer(driver_data->video_fd, capture_type,
				       surface_object->destination_index,
				       O_RDONLY, sync_fds,
				       surface_object->destination_buffers_count) >= 0) {
			n_sync_fds = surface_object->destination_buffers_count;
			for (i = 0; i < n_sync_fds; i++) {
				struct dma_buf_sync s = {
					.flags = DMA_BUF_SYNC_START |
						 DMA_BUF_SYNC_READ,
				};
				/* failure is non-fatal: we continue with the read */
				(void)ioctl(sync_fds[i], DMA_BUF_IOCTL_SYNC, &s);
			}
		}
	}

	/*
	 * AV1 film_grain: when this surface is the display surface of a
	 * decode (current_display_picture != current_frame with apply_grain=1),
	 * its slot is NULL because BeginPicture only fired on the decode
	 * surface. Follow the back-link set in av1_set_controls and borrow
	 * the decode surface's destination_data + sizes for the copy.
	 */
	if (surface_object->current_slot == NULL &&
	    surface_object->linked_decode_surface_id != VA_INVALID_SURFACE) {
		struct object_surface *decode_surface =
			SURFACE(driver_data,
				surface_object->linked_decode_surface_id);
		if (decode_surface != NULL &&
		    decode_surface->current_slot != NULL) {
			/* Mirror the fields we read below. The surface heap
			 * pointer is stable for the surface's lifetime; we
			 * only need destination_data + destination_sizes +
			 * destination_planes_count from it. */
			surface_object->destination_planes_count =
				decode_surface->destination_planes_count;
			for (i = 0; i < decode_surface->destination_planes_count; i++) {
				surface_object->destination_data[i] =
					decode_surface->destination_data[i];
				surface_object->destination_sizes[i] =
					decode_surface->destination_sizes[i];
			}
		}
	}

	for (i = 0; i < surface_object->destination_planes_count; i++) {
		/* AV1 Phase 3 diag: surface NULL-deref hunt. */
		if (buffer_object->data == NULL ||
		    surface_object->destination_data[i] == NULL) {
			request_log("copy_surface_to_image NULL i=%u "
				    "buf_data=%p dest_data=%p dest_size=%u "
				    "planes=%u slot=%p linked=0x%x\n",
				    i, (void *)buffer_object->data,
				    (void *)surface_object->destination_data[i],
				    surface_object->destination_sizes[i],
				    surface_object->destination_planes_count,
				    (void *)surface_object->current_slot,
				    surface_object->linked_decode_surface_id);
			/*
			 * Do not return from here: the exported dma-buf fds
			 * still have to be synced with END and closed below.
			 */
			status = VA_STATUS_ERROR_OPERATION_FAILED;
			break;
		}
#ifdef __arm__
		if (!video_format_is_linear(driver_data->video_format))
			tiled_to_planar(surface_object->destination_data[i],
					buffer_object->data + image->offsets[i],
					image->pitches[i], image->width,
					i == 0 ? image->height :
						 image->height / 2);
		else {
#endif
			memcpy(buffer_object->data + image->offsets[i],
			       surface_object->destination_data[i],
			       surface_object->destination_sizes[i]);
#ifdef __arm__
		}
#endif
	}

	/*
	 * iter13 α-17: release cache sync. END pairs with each START, and
	 * every exported fd is closed whether or not the copy succeeded.
	 */
	for (i = 0; i < n_sync_fds; i++) {
		struct dma_buf_sync s = {
			.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ,
		};
		(void)ioctl(sync_fds[i], DMA_BUF_IOCTL_SYNC, &s);
		close(sync_fds[i]);
	}

	return status;
}

/*
 * Derive an NV12 image from a surface.
 *
 * A new image (with its own buffer) is created with the surface dimensions
 * and, when the surface has a CAPTURE slot bound, the decoded picture is
 * copied into it. The image buffer is then tagged with the id of the
 * surface it was derived from. A surface that is still rendering is synced
 * first.
 *
 * @context:    driver context; pDriverData holds the request_data.
 * @surface_id: surface to derive the image from.
 * @image:      output image description. On failure after the image was
 *              created, the image is destroyed again and its ids are set
 *              to VA_INVALID_ID.
 *
 * Returns VA_STATUS_SUCCESS, VA_STATUS_ERROR_INVALID_SURFACE for an unknown
 * surface, VA_STATUS_ERROR_INVALID_BUFFER when the image buffer cannot be
 * resolved, or the failing status of RequestSyncSurface, RequestCreateImage
 * or the surface copy.
 */
VAStatus RequestDeriveImage(VADriverContextP context, VASurfaceID surface_id,
			    VAImage *image)
{
	struct request_data *driver_data = context->pDriverData;
	struct object_surface *surface_object;
	struct object_buffer *buffer_object;
	VAImageFormat format;
	VAStatus status;

	surface_object = SURFACE(driver_data, surface_id);
	if (surface_object == NULL)
		return VA_STATUS_ERROR_INVALID_SURFACE;

	if (surface_object->status == VASurfaceRendering) {
		status = RequestSyncSurface(context, surface_id);
		if (status != VA_STATUS_SUCCESS)
			return status;
	}

	/* Fully populate VAImageFormat to match QueryImageFormats output. */
	memset(&format, 0, sizeof(format));
	format.fourcc = VA_FOURCC_NV12;
	format.byte_order = VA_LSB_FIRST;
	format.bits_per_pixel = 12;

	status = RequestCreateImage(context, &format, surface_object->width,
				    surface_object->height, image);
	if (status != VA_STATUS_SUCCESS)
		return status;

	/*
	 * Iter2 Fix 3: skip the surface→image copy when no CAPTURE slot is
	 * bound. ffmpeg's av_hwframe_ctx_init probes vaDeriveImage on a
	 * never-decoded surface to learn the format; it doesn't read the
	 * data. With the cap_pool decoupling, destination_data[] is NULL
	 * until BeginPicture binds a slot — copying from a NULL source
	 * crashed in memcpy. The image's buffer remains zero-initialized;
	 * subsequent post-decode DeriveImage on the same surface (after
	 * BeginPicture has bound a slot) does the real copy.
	 */
	if (surface_object->current_slot != NULL) {
		status = copy_surface_to_image (driver_data, surface_object,
						image);
		if (status != VA_STATUS_SUCCESS)
			goto error_image;
	}

	buffer_object = BUFFER(driver_data, image->buf);
	if (buffer_object == NULL) {
		status = VA_STATUS_ERROR_INVALID_BUFFER;
		goto error_image;
	}

	surface_object->status = VASurfaceReady;
	buffer_object->derived_surface_id = surface_id;

	return VA_STATUS_SUCCESS;

error_image:
	/*
	 * The image and its buffer were created above and the caller will
	 * not destroy an image it never successfully received: release them
	 * here (best effort) and invalidate the ids handed back.
	 */
	(void)RequestDestroyImage(context, image->image_id);
	image->image_id = VA_INVALID_ID;
	image->buf = VA_INVALID_ID;

	return status;
}

/*
 * Report the image formats supported by the driver: a single NV12 entry.
 *
 * @context:       driver context (not used here).
 * @formats:       output array; only the first entry is written.
 * @formats_count: output; set to 1.
 *
 * Always returns VA_STATUS_SUCCESS.
 */
VAStatus RequestQueryImageFormats(VADriverContextP context,
				  VAImageFormat *formats, int *formats_count)
{
	VAImageFormat *nv12 = &formats[0];

	/*
	 * Publish a fully populated NV12 descriptor, not just the fourcc:
	 * consumers (FFmpeg's hwcontext_vaapi, mpv, Firefox) read
	 * byte_order and bits_per_pixel as well, and leaving those
	 * uninitialized would expose whatever garbage the caller's buffer
	 * held, giving non-deterministic behavior. Mesa's
	 * gallium/frontends/va/image.c::vlVaQueryImageFormats and
	 * intel-vaapi-driver's i965_drv_video.c both advertise NV12 with
	 * byte_order=VA_LSB_FIRST and bits_per_pixel=12.
	 *
	 * depth and the red/green/blue/alpha masks describe RGB bit layouts
	 * and carry no meaning for YUV formats: clearing the whole entry
	 * first leaves them at zero.
	 */
	memset(nv12, 0, sizeof(*nv12));
	nv12->fourcc = VA_FOURCC_NV12;
	nv12->byte_order = VA_LSB_FIRST;
	nv12->bits_per_pixel = 12;

	*formats_count = 1;

	return VA_STATUS_SUCCESS;
}

/*
 * Image palettes are not supported by this driver: every call is rejected
 * with VA_STATUS_ERROR_UNIMPLEMENTED and no argument is looked at.
 */
VAStatus RequestSetImagePalette(VADriverContextP context, VAImageID image_id,
				unsigned char *palette)
{
	(void)context;
	(void)image_id;
	(void)palette;

	return VA_STATUS_ERROR_UNIMPLEMENTED;
}

/*
 * Read back a surface into an existing image.
 *
 * Only a full-image readback is supported: the requested rectangle must
 * start at (0, 0) and match the image's width and height exactly.
 *
 * Returns VA_STATUS_ERROR_INVALID_SURFACE or VA_STATUS_ERROR_INVALID_IMAGE
 * when the respective id cannot be resolved,
 * VA_STATUS_ERROR_UNIMPLEMENTED for a partial rectangle, otherwise the
 * status of the surface copy.
 */
VAStatus RequestGetImage(VADriverContextP context, VASurfaceID surface_id,
			 int x, int y, unsigned int width, unsigned int height,
			 VAImageID image_id)
{
	struct request_data *driver_data = context->pDriverData;
	struct object_surface *source;
	struct object_image *target;
	VAImage *destination;
	int covers_whole_image;

	source = SURFACE(driver_data, surface_id);
	if (source == NULL)
		return VA_STATUS_ERROR_INVALID_SURFACE;

	target = IMAGE(driver_data, image_id);
	if (target == NULL)
		return VA_STATUS_ERROR_INVALID_IMAGE;

	destination = &target->image;

	covers_whole_image = x == 0 && y == 0 &&
			     width == destination->width &&
			     height == destination->height;
	if (!covers_whole_image)
		return VA_STATUS_ERROR_UNIMPLEMENTED;

	return copy_surface_to_image (driver_data, source, destination);
}

/*
 * Uploading an image into a surface is not supported by this driver: every
 * call is rejected with VA_STATUS_ERROR_UNIMPLEMENTED and no argument is
 * looked at.
 */
VAStatus RequestPutImage(VADriverContextP context, VASurfaceID surface_id,
			 VAImageID image, int src_x, int src_y,
			 unsigned int src_width, unsigned int src_height,
			 int dst_x, int dst_y, unsigned int dst_width,
			 unsigned int dst_height)
{
	(void)context;
	(void)surface_id;
	(void)image;
	(void)src_x;
	(void)src_y;
	(void)src_width;
	(void)src_height;
	(void)dst_x;
	(void)dst_y;
	(void)dst_width;
	(void)dst_height;

	return VA_STATUS_ERROR_UNIMPLEMENTED;
}