forked from marfrit/marfrit-packages
Compare commits
32 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | 7708744eb2 | | |
| | 5d97cf15d6 | | |
| | 58f67d4b2c | | |
| | 685f85c22e | | |
| | 6896853544 | | |
| | fd56eca3cb | | |
| | 91022b390e | | |
| | b736dd0529 | | |
| | 0bfc4ab03e | | |
| | 8729c2db92 | | |
| | d449ec1073 | | |
| | 9d30c34be9 | | |
| | 1ca18ac130 | | |
| | cf9eef6cfa | | |
| | 5c69460722 | | |
| | d11a52405d | | |
| | 29e0852d11 | | |
| | 510a31622c | | |
| | db9ae16da9 | | |
| | 493c762967 | | |
| | 7ecbcb3c1b | | |
| | 360e8eb6bf | | |
| | 4db64917bc | | |
| | 6288536223 | | |
| | 09d8813507 | | |
| | 8a3186b53c | | |
| | b81e2251c2 | | |
| | e7cc22e42d | | |
| | 62b6b0a700 | | |
| | a8f4a70887 | | |
| | 6ee8f2748e | | |
| | 711a921e66 | | |
+137
-1
@@ -935,7 +935,7 @@ jobs:
|
|||||||
libfontconfig-dev libfribidi-dev libgmp-dev libgnutls28-dev \
|
libfontconfig-dev libfribidi-dev libgmp-dev libgnutls28-dev \
|
||||||
libmp3lame-dev libass-dev libdav1d-dev libdrm-dev \
|
libmp3lame-dev libass-dev libdav1d-dev libdrm-dev \
|
||||||
libfreetype-dev libpulse-dev libva-dev libvorbis-dev libvpx-dev \
|
libfreetype-dev libpulse-dev libva-dev libvorbis-dev libvpx-dev \
|
||||||
libwebp-dev libx264-dev libx265-dev libxml2-dev libopus-dev \
|
libwebp-dev libx264-dev libx265-dev libopus-dev \
|
||||||
libvulkan-dev glslang-tools \
|
libvulkan-dev glslang-tools \
|
||||||
v4l-utils liblzma-dev zlib1g-dev \
|
v4l-utils liblzma-dev zlib1g-dev \
|
||||||
curl ca-certificates openssh-client rsync dpkg-dev
|
curl ca-certificates openssh-client rsync dpkg-dev
|
||||||
@@ -1417,6 +1417,142 @@ jobs:
|
|||||||
-e 'ssh -i /root/.ssh/id_ed25519' \
|
-e 'ssh -i /root/.ssh/id_ed25519' \
|
||||||
./ mfritsche@nc.reauktion.de:arch/aarch64/
|
./ mfritsche@nc.reauktion.de:arch/aarch64/
|
||||||
|
|
||||||
|
- name: wipe secrets
|
||||||
|
if: always()
|
||||||
|
run: |
|
||||||
|
rm -f /root/repo_pass /root/.ssh/id_ed25519
|
||||||
|
rm -f /root/.ssh/id_ed25519_hertz
|
||||||
|
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
# mesa-panvk-bifrost-video (aarch64 only) — sibling adding VK_KHR_video_decode_h264
|
||||||
|
# via the V4L2 hantro VPU. Phase 4 byte-exact validated 2026-05-21.
|
||||||
|
# Co-installs at /usr/lib/panvk-bifrost-video/ (parallel to r4); opt-in
|
||||||
|
# via VK_ICD_FILENAMES (no launcher shipped — uses standard Vulkan loader).
|
||||||
|
#
|
||||||
|
# Build is slow (~30-60min on actrunner-aarch64): full Mesa-from-source.
|
||||||
|
# Standalone job — no `needs:` since it doesn't depend on the fourier
|
||||||
|
# codec stack. continue-on-error so a build hiccup doesn't block other
|
||||||
|
# jobs in the same workflow run.
|
||||||
|
# -------------------------------------------------------------------------
|
||||||
|
mesa-panvk-bifrost-video-aarch64:
|
||||||
|
runs-on: arch-aarch64
|
||||||
|
continue-on-error: true
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: skip if already published
|
||||||
|
id: skip-check
|
||||||
|
run: |
|
||||||
|
set -e
|
||||||
|
result=$(./.gitea/scripts/check-already-published.sh arch/mesa-panvk-bifrost-video)
|
||||||
|
echo "$result" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "decision: $result"
|
||||||
|
|
||||||
|
- name: bootstrap runner (idempotent)
|
||||||
|
if: steps.skip-check.outputs.skip != '1'
|
||||||
|
run: |
|
||||||
|
set -e
|
||||||
|
retry() { for i in 1 2 3; do "$@" && return 0; rc=$?; echo "retry $i (exit=$rc)" >&2; sleep $((i*5)); done; return 1; }
|
||||||
|
retry pacman -Syu --noconfirm --needed base-devel git rsync gnupg openssh sudo
|
||||||
|
|
||||||
|
- name: import signing key
|
||||||
|
if: steps.skip-check.outputs.skip != '1'
|
||||||
|
env:
|
||||||
|
PRIV: ${{ secrets.MARFRIT_REPO_PRIVATE_KEY }}
|
||||||
|
PASS: ${{ secrets.MARFRIT_REPO_PASSPHRASE }}
|
||||||
|
run: |
|
||||||
|
set -e
|
||||||
|
gpgconf --homedir /root/.gnupg --kill all 2>/dev/null || true
|
||||||
|
rm -rf /root/.gnupg /root/repo_pass
|
||||||
|
mkdir -m700 -p /root/.gnupg
|
||||||
|
printf '%s' "$PASS" > /root/repo_pass
|
||||||
|
chmod 600 /root/repo_pass
|
||||||
|
printf '%s\n' "$PRIV" | gpg --batch --import
|
||||||
|
echo "92D5E96D8F63C75E4116AA1FF5C8C4603D0D250C:6:" | gpg --import-ownertrust
|
||||||
|
|
||||||
|
- name: install deploy ssh key
|
||||||
|
if: steps.skip-check.outputs.skip != '1'
|
||||||
|
env:
|
||||||
|
KEY: ${{ secrets.MARFRIT_REPO_DEPLOY_KEY }}
|
||||||
|
run: |
|
||||||
|
mkdir -m700 -p /root/.ssh
|
||||||
|
printf '%s\n' "$KEY" > /root/.ssh/id_ed25519
|
||||||
|
chmod 600 /root/.ssh/id_ed25519
|
||||||
|
ssh-keyscan -t ed25519 nc.reauktion.de > /root/.ssh/known_hosts 2>/dev/null
|
||||||
|
|
||||||
|
- name: makepkg mesa-panvk-bifrost-video
|
||||||
|
if: steps.skip-check.outputs.skip != '1'
|
||||||
|
run: |
|
||||||
|
set -e
|
||||||
|
rm -rf /tmp/build-mesa-panvk-bifrost-video
|
||||||
|
cp -r arch/mesa-panvk-bifrost-video /tmp/build-mesa-panvk-bifrost-video
|
||||||
|
chown -R builder:builder /tmp/build-mesa-panvk-bifrost-video
|
||||||
|
cd /tmp/build-mesa-panvk-bifrost-video
|
||||||
|
# MAKEFLAGS for parallel build; runner is multi-core.
|
||||||
|
# --skipinteg because sha256sums=SKIP in PKGBUILD (matches the
|
||||||
|
# fourier-fork PKGBUILD convention).
|
||||||
|
sudo -u builder -H env MAKEFLAGS="-j60" \
|
||||||
|
makepkg --nocheck --noconfirm --syncdeps --cleanbuild --skipinteg
|
||||||
|
ls -la *.pkg.tar.* | grep -v "\.sig$"
|
||||||
|
|
||||||
|
- name: sign mesa-panvk-bifrost-video
|
||||||
|
if: steps.skip-check.outputs.skip != '1'
|
||||||
|
run: |
|
||||||
|
set -e
|
||||||
|
cd /tmp/build-mesa-panvk-bifrost-video
|
||||||
|
for f in *.pkg.tar.xz *.pkg.tar.zst *.pkg.tar.gz; do
|
||||||
|
[ -f "$f" ] || continue
|
||||||
|
gpg --batch --pinentry-mode loopback --passphrase-file /root/repo_pass \
|
||||||
|
--detach-sign --yes -u 92D5E96D8F63C75E4116AA1FF5C8C4603D0D250C "$f"
|
||||||
|
done
|
||||||
|
|
||||||
|
- name: update aarch64 repo db
|
||||||
|
if: steps.skip-check.outputs.skip != '1'
|
||||||
|
run: |
|
||||||
|
set -e
|
||||||
|
mkdir -p /tmp/arch-stage-mesa-panvk-video
|
||||||
|
cd /tmp/arch-stage-mesa-panvk-video
|
||||||
|
rm -f *
|
||||||
|
for f in marfrit.db.tar.gz marfrit.db.tar.gz.sig marfrit.files.tar.gz marfrit.files.tar.gz.sig; do
|
||||||
|
curl -sSLf "https://packages.reauktion.de/arch/aarch64/$f" -o "$f" || rm -f "$f"
|
||||||
|
done
|
||||||
|
for ext in xz zst gz; do
|
||||||
|
ls /tmp/build-mesa-panvk-bifrost-video/*.pkg.tar.$ext 2>/dev/null && \
|
||||||
|
mv /tmp/build-mesa-panvk-bifrost-video/*.pkg.tar.$ext /tmp/build-mesa-panvk-bifrost-video/*.pkg.tar.$ext.sig .
|
||||||
|
done || true
|
||||||
|
export GNUPGHOME=/root/.gnupg
|
||||||
|
printf 'pinentry-mode loopback\npassphrase-file /root/repo_pass\n' > /root/.gnupg/gpg.conf
|
||||||
|
printf 'allow-loopback-pinentry\n' > /root/.gnupg/gpg-agent.conf
|
||||||
|
gpg-connect-agent reloadagent /bye
|
||||||
|
pkgs=()
|
||||||
|
for ext in xz zst gz; do
|
||||||
|
for f in *.pkg.tar.$ext; do [ -f "$f" ] && pkgs+=("$f"); done
|
||||||
|
done
|
||||||
|
if [ -f marfrit.db.tar.gz ]; then
|
||||||
|
for f in "${pkgs[@]}"; do
|
||||||
|
name=$(echo "$f" | sed -E 's/-[0-9].*//')
|
||||||
|
repo-remove --sign --key 92D5E96D8F63C75E4116AA1FF5C8C4603D0D250C \
|
||||||
|
marfrit.db.tar.gz "$name" 2>/dev/null || true
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
repo-add --new --sign --key 92D5E96D8F63C75E4116AA1FF5C8C4603D0D250C \
|
||||||
|
--verify marfrit.db.tar.gz "${pkgs[@]}"
|
||||||
|
ln -sf marfrit.db.tar.gz marfrit.db
|
||||||
|
ln -sf marfrit.files.tar.gz marfrit.files
|
||||||
|
ln -sf marfrit.db.tar.gz.sig marfrit.db.sig
|
||||||
|
rm -f marfrit.files.sig
|
||||||
|
|
||||||
|
- name: publish to aarch64
|
||||||
|
if: steps.skip-check.outputs.skip != '1'
|
||||||
|
run: |
|
||||||
|
set -e
|
||||||
|
retry() { for i in 1 2 3; do "$@" && return 0; rc=$?; echo "retry $i (exit=$rc)" >&2; sleep $((i*5)); done; return 1; }
|
||||||
|
cd /tmp/arch-stage-mesa-panvk-video
|
||||||
|
retry rsync -avL --copy-unsafe-links \
|
||||||
|
-e 'ssh -i /root/.ssh/id_ed25519' \
|
||||||
|
./ mfritsche@nc.reauktion.de:arch/aarch64/
|
||||||
|
|
||||||
- name: wipe secrets
|
- name: wipe secrets
|
||||||
if: always()
|
if: always()
|
||||||
run: rm -f /root/repo_pass /root/.ssh/id_ed25519
|
run: rm -f /root/repo_pass /root/.ssh/id_ed25519
|
||||||
|
|
||||||
|
|||||||
@@ -8,13 +8,13 @@
|
|||||||
# NEXT.md alongside this PKGBUILD for the full rationale and the
|
# NEXT.md alongside this PKGBUILD for the full rationale and the
|
||||||
# validation log on PineTab2 (RK3566).
|
# validation log on PineTab2 (RK3566).
|
||||||
#
|
#
|
||||||
# Multi-arch: builds natively on x86_64 and aarch64. The x86_64 path
|
# Cross-compiled from x86_64 using chromium's bundled clang (upstream
|
||||||
# is primarily a development / CI host; the runtime target audience is
|
# LLVM doesn't ship clang 23+ yet; chromium's internal fork is required).
|
||||||
# aarch64. The two patches are architecture-independent.
|
# Runtime target is aarch64. The three patches are architecture-independent.
|
||||||
|
|
||||||
pkgname=chromium-fourier
|
pkgname=chromium-fourier
|
||||||
pkgver=147.0.7727.116
|
pkgver=148.0.7778.178
|
||||||
pkgrel=2
|
pkgrel=1
|
||||||
epoch=1
|
epoch=1
|
||||||
pkgdesc='Chromium with V4L2VDA HW video decode unlocked for mainline Linux Wayland on Rockchip'
|
pkgdesc='Chromium with V4L2VDA HW video decode unlocked for mainline Linux Wayland on Rockchip'
|
||||||
arch=('aarch64' 'x86_64')
|
arch=('aarch64' 'x86_64')
|
||||||
@@ -150,7 +150,6 @@ build() {
|
|||||||
'symbol_level=0'
|
'symbol_level=0'
|
||||||
'is_cfi=false'
|
'is_cfi=false'
|
||||||
'treat_warnings_as_errors=false'
|
'treat_warnings_as_errors=false'
|
||||||
'enable_nacl=false'
|
|
||||||
'enable_widevine=false'
|
'enable_widevine=false'
|
||||||
|
|
||||||
# System toolchain (clang/lld from pacman)
|
# System toolchain (clang/lld from pacman)
|
||||||
|
|||||||
@@ -73,16 +73,15 @@ diff --git a/ui/ozone/common/native_pixmap_egl_binding.cc b/ui/ozone/common/nati
|
|||||||
index 31877f4459..6855c1093e 100644
|
index 31877f4459..6855c1093e 100644
|
||||||
--- a/ui/ozone/common/native_pixmap_egl_binding.cc
|
--- a/ui/ozone/common/native_pixmap_egl_binding.cc
|
||||||
+++ b/ui/ozone/common/native_pixmap_egl_binding.cc
|
+++ b/ui/ozone/common/native_pixmap_egl_binding.cc
|
||||||
@@ -6,10 +6,13 @@
|
@@ -6,9 +6,12 @@
|
||||||
|
|
||||||
#include <array>
|
#include <array>
|
||||||
|
|
||||||
+#include "base/containers/flat_map.h"
|
+#include "base/containers/flat_map.h"
|
||||||
#include "base/logging.h"
|
#include "base/logging.h"
|
||||||
#include "base/memory/scoped_refptr.h"
|
#include "base/memory/scoped_refptr.h"
|
||||||
+#include "base/no_destructor.h"
|
+#include "base/no_destructor.h"
|
||||||
#include "base/notreached.h"
|
#include "base/notreached.h"
|
||||||
#include "base/numerics/safe_conversions.h"
|
|
||||||
+#include "base/synchronization/lock.h"
|
+#include "base/synchronization/lock.h"
|
||||||
#include "ui/gfx/linux/drm_util_linux.h"
|
#include "ui/gfx/linux/drm_util_linux.h"
|
||||||
#include "ui/gl/gl_bindings.h"
|
#include "ui/gl/gl_bindings.h"
|
||||||
|
|||||||
@@ -23,10 +23,10 @@ _module=daedalus_v4l2
|
|||||||
# content-equivalent to f0d4186 plus PR #4 (cosmetic menu ctrls).
|
# content-equivalent to f0d4186 plus PR #4 (cosmetic menu ctrls).
|
||||||
# PROTO_VERSION drops 1 → 0; lock-step install with
|
# PROTO_VERSION drops 1 → 0; lock-step install with
|
||||||
# daedalus-v4l2 0.1.0.r33.5d8b436 REQUIRED.
|
# daedalus-v4l2 0.1.0.r33.5d8b436 REQUIRED.
|
||||||
_commit=5d8b4369e58ab947d1c56b1f718293c57c6065b5
|
_commit=872eec505eb91b561892d02a0526749348ddc121
|
||||||
|
|
||||||
pkgver=0.1.0.r33.5d8b436
|
pkgver=0.1.0.r45.872eec5
|
||||||
pkgrel=1 # reset for new upstream pin (5d8b436 — revert parking design)
|
pkgrel=1 # reset for new upstream pin (872eec5 — PROTO_MAX_PAYLOAD 64 KiB -> 1 MiB, closes #19); lock-step with daedalus-v4l2 0.1.0.r45.872eec5 REQUIRED
|
||||||
pkgdesc="V4L2 stateless decoder shim kernel module (DKMS) — Pi 5 / CM5"
|
pkgdesc="V4L2 stateless decoder shim kernel module (DKMS) — Pi 5 / CM5"
|
||||||
arch=('any')
|
arch=('any')
|
||||||
url="https://git.reauktion.de/reauktion/daedalus-v4l2"
|
url="https://git.reauktion.de/reauktion/daedalus-v4l2"
|
||||||
|
|||||||
@@ -23,12 +23,12 @@ _upstreampkg=daedalus-v4l2
|
|||||||
# (daedalus-v4l2#11). Daemon still needs daedalus-fourier at
|
# (daedalus-v4l2#11). Daemon still needs daedalus-fourier at
|
||||||
# build time (Arch packaging for that is a follow-up; Debian side
|
# build time (Arch packaging for that is a follow-up; Debian side
|
||||||
# fetches inline via build-deb.sh).
|
# fetches inline via build-deb.sh).
|
||||||
_commit=6e6dfa144da7bc7fa8be50c8da91d7d1c6132a2c
|
_commit=872eec505eb91b561892d02a0526749348ddc121
|
||||||
|
|
||||||
# 0.1.0 (pre-1.0) + commit count + short sha. Bump the .Y on each
|
# 0.1.0 (pre-1.0) + commit count + short sha. Bump the .Y on each
|
||||||
# Phase 8.x close. pkgver() recomputes at build time.
|
# Phase 8.x close. pkgver() recomputes at build time.
|
||||||
pkgver=0.1.0.r41.6e6dfa1
|
pkgver=0.1.0.r45.872eec5
|
||||||
pkgrel=1 # reset for new upstream pin (6e6dfa1 — soname 62 via /opt/fourier)
|
pkgrel=1 # reset for new upstream pin (872eec5 — PROTO_MAX_PAYLOAD 64 KiB -> 1 MiB, closes #19); lock-step with daedalus-v4l2-dkms 0.1.0.r45.872eec5 REQUIRED
|
||||||
pkgdesc="Userspace daemon for the daedalus-v4l2 V4L2 stateless decoder shim (VP9/AV1/H.264 on Pi 5 / CM5)"
|
pkgdesc="Userspace daemon for the daedalus-v4l2 V4L2 stateless decoder shim (VP9/AV1/H.264 on Pi 5 / CM5)"
|
||||||
arch=('aarch64')
|
arch=('aarch64')
|
||||||
url="https://git.reauktion.de/reauktion/daedalus-v4l2"
|
url="https://git.reauktion.de/reauktion/daedalus-v4l2"
|
||||||
|
|||||||
@@ -0,0 +1,107 @@
|
|||||||
|
From 1b286ddb4efaca26ec9b9e290e989fec77dc1c77 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Markus Fritsche <mfritsche@reauktion.de>
|
||||||
|
Date: Fri, 22 May 2026 10:18:21 +0200
|
||||||
|
Subject: [PATCH] avcodec/aarch64/h264dsp: route H.264 8x8 IDCT through
|
||||||
|
daedalus-fourier
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Type: text/plain; charset=UTF-8
|
||||||
|
Content-Transfer-Encoding: 8bit
|
||||||
|
|
||||||
|
H264DSPContext.idct8_add (called per 8x8 block from the High-profile
|
||||||
|
intra-8x8-DCT decode path in h264_mb.c) now dispatches through
|
||||||
|
daedalus_recipe_dispatch_h264_idct8 instead of ff_h264_idct8_add_neon.
|
||||||
|
|
||||||
|
The recipe layer picks the substrate; for cycle 7 (H.264 IDCT 8x8)
|
||||||
|
the recipe is CPU NEON, so this is effectively a NEON-to-NEON
|
||||||
|
substitution layered on top of the cycle-6 IDCT 4x4 wiring. Same
|
||||||
|
pthread_once global context, same destructive-zero semantics; FFmpeg
|
||||||
|
column-major 8x8 storage block[r + 8*c] matches daedalus's convention.
|
||||||
|
|
||||||
|
Bulk path c->idct8_add4 (used for inter 8x8-DCT macroblocks) remains
|
||||||
|
on the in-tree NEON .S code and will be batched through
|
||||||
|
daedalus_recipe_dispatch_h264_idct8 with n_blocks>1 in a follow-up.
|
||||||
|
|
||||||
|
Bit-exact against ff_h264_idct8_add_neon (daedalus-fourier cycle 7
|
||||||
|
green).
|
||||||
|
|
||||||
|
Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 cycle 7.
|
||||||
|
---
|
||||||
|
libavcodec/aarch64/h264_idct_daedalus.c | 29 ++++++++++++++++-------
|
||||||
|
libavcodec/aarch64/h264dsp_init_aarch64.c | 3 ++-
|
||||||
|
2 files changed, 23 insertions(+), 9 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c
|
||||||
|
index 538d223..cbb98af 100644
|
||||||
|
--- a/libavcodec/aarch64/h264_idct_daedalus.c
|
||||||
|
+++ b/libavcodec/aarch64/h264_idct_daedalus.c
|
||||||
|
@@ -1,14 +1,16 @@
|
||||||
|
/*
|
||||||
|
- * H.264 4x4 IDCT + add — daedalus-fourier substitution shim.
|
||||||
|
+ * H.264 4x4 / 8x8 IDCT + add — daedalus-fourier substitution shims.
|
||||||
|
*
|
||||||
|
- * Routes H264DSPContext.idct_add through
|
||||||
|
- * daedalus_recipe_dispatch_h264_idct4 instead of ff_h264_idct_add_neon.
|
||||||
|
- * The recipe layer picks the substrate (CPU NEON by default for
|
||||||
|
- * cycle 6; future cycles may dispatch to V3D opportunistically).
|
||||||
|
+ * Routes H264DSPContext.idct_add → daedalus_recipe_dispatch_h264_idct4
|
||||||
|
+ * H264DSPContext.idct8_add → daedalus_recipe_dispatch_h264_idct8
|
||||||
|
+ * instead of the in-tree ff_h264_idct{,8}_add_neon assembly. The
|
||||||
|
+ * recipe layer picks the substrate (CPU NEON by default for cycles
|
||||||
|
+ * 6 + 7; future cycles may dispatch to V3D opportunistically).
|
||||||
|
*
|
||||||
|
- * FFmpeg's 4x4 block memory layout matches daedalus's column-major
|
||||||
|
- * convention: block[r + 4*c] = coefficient at (row r, col c). Both
|
||||||
|
- * sides destructively zero the block after the transform.
|
||||||
|
+ * FFmpeg's 4x4 and 8x8 block memory layouts match daedalus's
|
||||||
|
+ * column-major convention: block[r + N*c] = coefficient at
|
||||||
|
+ * (row r, col c) for N ∈ {4, 8}. Both sides destructively zero the
|
||||||
|
+ * block after the transform.
|
||||||
|
*
|
||||||
|
* The library context is process-global and lazily initialised under
|
||||||
|
* pthread_once. We pick the no-QPU constructor here because
|
||||||
|
@@ -37,6 +39,7 @@ static void daedalus_ctx_init_once(void)
|
||||||
|
}
|
||||||
|
|
||||||
|
void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride);
|
||||||
|
+void ff_h264_idct8_add_daedalus(uint8_t *dst, int16_t *block, int stride);
|
||||||
|
|
||||||
|
void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride)
|
||||||
|
{
|
||||||
|
@@ -47,3 +50,13 @@ void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride)
|
||||||
|
daedalus_recipe_dispatch_h264_idct4(g_dctx, dst, (size_t)stride,
|
||||||
|
block, 1, &meta);
|
||||||
|
}
|
||||||
|
+
|
||||||
|
+void ff_h264_idct8_add_daedalus(uint8_t *dst, int16_t *block, int stride)
|
||||||
|
+{
|
||||||
|
+ static const daedalus_h264_block_meta meta = { .dst_off = 0 };
|
||||||
|
+
|
||||||
|
+ pthread_once(&g_dctx_once, daedalus_ctx_init_once);
|
||||||
|
+
|
||||||
|
+ daedalus_recipe_dispatch_h264_idct8(g_dctx, dst, (size_t)stride,
|
||||||
|
+ block, 1, &meta);
|
||||||
|
+}
|
||||||
|
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
|
||||||
|
index b993df2..741e551 100644
|
||||||
|
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c
|
||||||
|
+++ b/libavcodec/aarch64/h264dsp_init_aarch64.c
|
||||||
|
@@ -79,6 +79,7 @@ void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
|
||||||
|
const uint8_t nnzc[15 * 8]);
|
||||||
|
|
||||||
|
void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride);
|
||||||
|
+void ff_h264_idct8_add_daedalus(uint8_t *dst, int16_t *block, int stride);
|
||||||
|
void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
|
||||||
|
void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
|
||||||
|
int16_t *block, int stride,
|
||||||
|
@@ -146,7 +147,7 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
|
||||||
|
c->idct_add16intra = ff_h264_idct_add16intra_neon;
|
||||||
|
if (chroma_format_idc <= 1)
|
||||||
|
c->idct_add8 = ff_h264_idct_add8_neon;
|
||||||
|
- c->idct8_add = ff_h264_idct8_add_neon;
|
||||||
|
+ c->idct8_add = ff_h264_idct8_add_daedalus;
|
||||||
|
c->idct8_dc_add = ff_h264_idct8_dc_add_neon;
|
||||||
|
c->idct8_add4 = ff_h264_idct8_add4_neon;
|
||||||
|
} else if (have_neon(cpu_flags) && bit_depth == 10) {
|
||||||
|
--
|
||||||
|
2.47.3
|
||||||
|
|
||||||
@@ -0,0 +1,121 @@
|
|||||||
|
From 68731c41d7ea68be0e912b128cb4e71fb56e8263 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Markus Fritsche <mfritsche@reauktion.de>
|
||||||
|
Date: Fri, 22 May 2026 12:15:16 +0200
|
||||||
|
Subject: [PATCH] avcodec/aarch64/h264dsp: route H.264 luma-v deblock through
|
||||||
|
daedalus-fourier
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Type: text/plain; charset=UTF-8
|
||||||
|
Content-Transfer-Encoding: 8bit
|
||||||
|
|
||||||
|
H264DSPContext.v_loop_filter_luma (non-intra bS<4 vertical luma
|
||||||
|
deblock, called per macroblock-row edge from the slice deblock
|
||||||
|
loop) now dispatches through
|
||||||
|
daedalus_recipe_dispatch_h264_deblock_luma_v instead of
|
||||||
|
ff_h264_v_loop_filter_luma_neon.
|
||||||
|
|
||||||
|
The recipe layer picks the substrate; for cycle 8 the daedalus
|
||||||
|
docstring marks the kernel "CPU primary; QPU opportunistic", but
|
||||||
|
the libavcodec.so context here is built with
|
||||||
|
daedalus_ctx_create_no_qpu — process-global pthread_once init,
|
||||||
|
shared with cycles 6/7. QPU opportunism stays gated off until a
|
||||||
|
follow-up adds an explicit feature flag (no implicit Vulkan init
|
||||||
|
in arbitrary host processes). In the meantime cycle 8 is a
|
||||||
|
plumbing-only substitution, NEON-to-NEON via the daedalus recipe.
|
||||||
|
|
||||||
|
Intra (bS=4) loop filter — c->v_loop_filter_luma_intra — stays on
|
||||||
|
the in-tree NEON .S code; daedalus's daedalus_h264_deblock_meta
|
||||||
|
only covers the non-intra path per its docstring.
|
||||||
|
|
||||||
|
FFmpeg `int alpha/beta/int8_t tc0[4]` → daedalus_h264_deblock_meta
|
||||||
|
(int32_t alpha/beta + inline int8_t tc0[4]). pix already points
|
||||||
|
to row 0 of the bottom block per FFmpeg's deblock convention,
|
||||||
|
satisfying daedalus's `dst_off >= 4 * dst_stride` constraint.
|
||||||
|
|
||||||
|
Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 cycle 8.
|
||||||
|
---
|
||||||
|
libavcodec/aarch64/h264_idct_daedalus.c | 36 +++++++++++++++++++----
|
||||||
|
libavcodec/aarch64/h264dsp_init_aarch64.c | 4 ++-
|
||||||
|
2 files changed, 33 insertions(+), 7 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c
|
||||||
|
index cbb98af..92365fa 100644
|
||||||
|
--- a/libavcodec/aarch64/h264_idct_daedalus.c
|
||||||
|
+++ b/libavcodec/aarch64/h264_idct_daedalus.c
|
||||||
|
@@ -1,11 +1,14 @@
|
||||||
|
/*
|
||||||
|
- * H.264 4x4 / 8x8 IDCT + add — daedalus-fourier substitution shims.
|
||||||
|
+ * H.264 4x4 / 8x8 IDCT + luma-v deblock — daedalus-fourier substitution shims.
|
||||||
|
*
|
||||||
|
- * Routes H264DSPContext.idct_add → daedalus_recipe_dispatch_h264_idct4
|
||||||
|
- * H264DSPContext.idct8_add → daedalus_recipe_dispatch_h264_idct8
|
||||||
|
- * instead of the in-tree ff_h264_idct{,8}_add_neon assembly. The
|
||||||
|
- * recipe layer picks the substrate (CPU NEON by default for cycles
|
||||||
|
- * 6 + 7; future cycles may dispatch to V3D opportunistically).
|
||||||
|
+ * Routes H264DSPContext.idct_add → daedalus_recipe_dispatch_h264_idct4
|
||||||
|
+ * H264DSPContext.idct8_add → daedalus_recipe_dispatch_h264_idct8
|
||||||
|
+ * H264DSPContext.v_loop_filter_luma → daedalus_recipe_dispatch_h264_deblock_luma_v
|
||||||
|
+ * instead of the in-tree ff_h264_*_neon assembly. The recipe layer
|
||||||
|
+ * picks the substrate (CPU NEON for cycles 6 + 7 by default; cycle 8
|
||||||
|
+ * is CPU primary with QPU opportunistic — the ctx below is no-QPU,
|
||||||
|
+ * so cycle 8 stays on the CPU NEON path until a separate change
|
||||||
|
+ * gates QPU init on a daedalus-fourier feature flag).
|
||||||
|
*
|
||||||
|
* FFmpeg's 4x4 and 8x8 block memory layouts match daedalus's
|
||||||
|
* column-major convention: block[r + N*c] = coefficient at
|
||||||
|
@@ -40,6 +43,8 @@ static void daedalus_ctx_init_once(void)
|
||||||
|
|
||||||
|
void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride);
|
||||||
|
void ff_h264_idct8_add_daedalus(uint8_t *dst, int16_t *block, int stride);
|
||||||
|
+void ff_h264_v_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
+ int alpha, int beta, int8_t *tc0);
|
||||||
|
|
||||||
|
void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride)
|
||||||
|
{
|
||||||
|
@@ -60,3 +65,22 @@ void ff_h264_idct8_add_daedalus(uint8_t *dst, int16_t *block, int stride)
|
||||||
|
daedalus_recipe_dispatch_h264_idct8(g_dctx, dst, (size_t)stride,
|
||||||
|
block, 1, &meta);
|
||||||
|
}
|
||||||
|
+
|
||||||
|
+void ff_h264_v_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
+ int alpha, int beta, int8_t *tc0)
|
||||||
|
+{
|
||||||
|
+ daedalus_h264_deblock_meta meta = {
|
||||||
|
+ .dst_off = 0,
|
||||||
|
+ .alpha = alpha,
|
||||||
|
+ .beta = beta,
|
||||||
|
+ };
|
||||||
|
+ meta.tc0[0] = tc0[0];
|
||||||
|
+ meta.tc0[1] = tc0[1];
|
||||||
|
+ meta.tc0[2] = tc0[2];
|
||||||
|
+ meta.tc0[3] = tc0[3];
|
||||||
|
+
|
||||||
|
+ pthread_once(&g_dctx_once, daedalus_ctx_init_once);
|
||||||
|
+
|
||||||
|
+ daedalus_recipe_dispatch_h264_deblock_luma_v(g_dctx, pix, (size_t)stride,
|
||||||
|
+ 1, &meta);
|
||||||
|
+}
|
||||||
|
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
|
||||||
|
index 741e551..85ac381 100644
|
||||||
|
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c
|
||||||
|
+++ b/libavcodec/aarch64/h264dsp_init_aarch64.c
|
||||||
|
@@ -27,6 +27,8 @@
|
||||||
|
|
||||||
|
void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
|
||||||
|
int beta, int8_t *tc0);
|
||||||
|
+void ff_h264_v_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
+ int alpha, int beta, int8_t *tc0);
|
||||||
|
void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
|
||||||
|
int beta, int8_t *tc0);
|
||||||
|
void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
|
||||||
|
@@ -114,7 +116,7 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
|
||||||
|
int cpu_flags = av_get_cpu_flags();
|
||||||
|
|
||||||
|
if (have_neon(cpu_flags) && bit_depth == 8) {
|
||||||
|
- c->v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon;
|
||||||
|
+ c->v_loop_filter_luma = ff_h264_v_loop_filter_luma_daedalus;
|
||||||
|
c->h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon;
|
||||||
|
c->v_loop_filter_luma_intra= ff_h264_v_loop_filter_luma_intra_neon;
|
||||||
|
c->h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_neon;
|
||||||
|
--
|
||||||
|
2.47.3
|
||||||
|
|
||||||
@@ -0,0 +1,82 @@
|
|||||||
|
From 0d1292ea99bc4e5fa2da438259fa01a2374e3e04 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Markus Fritsche <mfritsche@reauktion.de>
|
||||||
|
Date: Fri, 22 May 2026 14:18:25 +0200
|
||||||
|
Subject: [PATCH] avcodec/h264: restore AV_CODEC_FLAG_LOW_DELAY semantics
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Type: text/plain; charset=UTF-8
|
||||||
|
Content-Transfer-Encoding: 8bit
|
||||||
|
|
||||||
|
FFmpeg 8.x dropped the H.264 decoder's low_delay path —
|
||||||
|
AV_CODEC_FLAG_LOW_DELAY no longer prevents
|
||||||
|
h264_select_output_frame from running the display-order DPB
|
||||||
|
output queue. V4L2-stateless-style consumers (daedalus-v4l2
|
||||||
|
daemon, libva-v4l2-request-fourier) that set the flag end up
|
||||||
|
seeing the 2-1-4-3 pair-swap pattern on B-frame streams again.
|
||||||
|
|
||||||
|
Restore the documented semantics:
|
||||||
|
|
||||||
|
- Early-exit at the top of h264_select_output_frame when the
|
||||||
|
flag is set: emit the just-decoded picture immediately as
|
||||||
|
next_output_pic, mirror the corruption / recovery-point
|
||||||
|
tracking the main path performs, and skip the entire
|
||||||
|
delayed_pic[] / POC reorder machinery.
|
||||||
|
|
||||||
|
- Suppress the SPS-driven has_b_frames clobber in
|
||||||
|
h264_field_start when the flag is set, so the per-slice
|
||||||
|
bitstream_restriction_flag re-pickup cannot reintroduce a
|
||||||
|
nonzero reorder buffer mid-stream.
|
||||||
|
|
||||||
|
This is a fork-only change required by the daedalus-v4l2 daemon's
|
||||||
|
one-frame-per-send_packet contract; upstream FFmpeg consumers that
|
||||||
|
expect display-order output remain untouched (flag default = off).
|
||||||
|
|
||||||
|
Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 deblock
|
||||||
|
+ flag-restoration follow-up.
|
||||||
|
---
|
||||||
|
libavcodec/h264_slice.c | 23 +++++++++++++++++++++++
|
||||||
|
1 file changed, 23 insertions(+)
|
||||||
|
|
||||||
|
diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c
|
||||||
|
index 97fab70..a7bfbd6 100644
|
||||||
|
--- a/libavcodec/h264_slice.c
|
||||||
|
+++ b/libavcodec/h264_slice.c
|
||||||
|
@@ -1308,6 +1308,28 @@ static int h264_select_output_frame(H264Context *h)
|
||||||
|
cur->mmco_reset = h->mmco_reset;
|
||||||
|
h->mmco_reset = 0;
|
||||||
|
|
||||||
|
+ /* AV_CODEC_FLAG_LOW_DELAY restore (FFmpeg 8.x dropped the H.264
|
||||||
|
+ * decoder's low_delay path). Bypass the display-order DPB
|
||||||
|
+ * output queue: emit the just-decoded picture immediately, in
|
||||||
|
+ * decode order, one per send_packet. V4L2-stateless-style
|
||||||
|
+ * consumers (daedalus-v4l2 daemon, libva-v4l2-request-fourier)
|
||||||
|
+ * do their own POC-based reorder downstream and require this
|
||||||
|
+ * behaviour. */
|
||||||
|
+ if (h->avctx->flags & AV_CODEC_FLAG_LOW_DELAY) {
|
||||||
|
+ h->next_output_pic = cur;
|
||||||
|
+ h->next_outputed_poc = cur->poc;
|
||||||
|
+ h->frame_recovered |= cur->recovered;
|
||||||
|
+ cur->recovered |= h->frame_recovered & FRAME_RECOVERED_SEI;
|
||||||
|
+ if (!cur->recovered) {
|
||||||
|
+ if (!(h->avctx->flags & AV_CODEC_FLAG_OUTPUT_CORRUPT) &&
|
||||||
|
+ !(h->avctx->flags2 & AV_CODEC_FLAG2_SHOW_ALL))
|
||||||
|
+ h->next_output_pic = NULL;
|
||||||
|
+ else
|
||||||
|
+ cur->f->flags |= AV_FRAME_FLAG_CORRUPT;
|
||||||
|
+ }
|
||||||
|
+ return 0;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
if (sps->bitstream_restriction_flag ||
|
||||||
|
h->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT) {
|
||||||
|
h->avctx->has_b_frames = FFMAX(h->avctx->has_b_frames, sps->num_reorder_frames);
|
||||||
|
@@ -1415,6 +1437,7 @@ static int h264_field_start(H264Context *h, const H264SliceContext *sl,
|
||||||
|
sps = h->ps.sps;
|
||||||
|
|
||||||
|
if (sps->bitstream_restriction_flag &&
|
||||||
|
+ !(h->avctx->flags & AV_CODEC_FLAG_LOW_DELAY) &&
|
||||||
|
h->avctx->has_b_frames < sps->num_reorder_frames) {
|
||||||
|
h->avctx->has_b_frames = sps->num_reorder_frames;
|
||||||
|
}
|
||||||
|
--
|
||||||
|
2.47.3
|
||||||
|
|
||||||
@@ -0,0 +1,139 @@
|
|||||||
|
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Markus Fritsche <mfritsche@reauktion.de>
|
||||||
|
Date: Sat, 23 May 2026 12:00:00 +0200
|
||||||
|
Subject: [PATCH] avcodec/aarch64/h264qpel: route 8x8 mc20 through
|
||||||
|
daedalus-fourier
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Type: text/plain; charset=UTF-8
|
||||||
|
Content-Transfer-Encoding: 8bit
|
||||||
|
|
||||||
|
H264QpelContext.put_h264_qpel_pixels_tab[1][2] (8x8 luma horizontal
|
||||||
|
half-pel, 6-tap "put" variant — the canonical representative of the
|
||||||
|
H.264 luma motion-compensation family) now dispatches through
|
||||||
|
daedalus_recipe_dispatch_h264_qpel_mc20 instead of
|
||||||
|
ff_put_h264_qpel8_mc20_neon.
|
||||||
|
|
||||||
|
Cycle 9 of the daedalus-v4l2#11 step 2 substitution arc; closes the
|
||||||
|
4-cycle libavcodec.so substitution sequence (6 IDCT 4x4 / 7 IDCT 8x8 /
|
||||||
|
8 luma-v deblock / 9 qpel mc20).
|
||||||
|
|
||||||
|
The recipe layer picks the substrate. Per docs/k9_h264qpel_mc20.md
|
||||||
|
the verdict is CPU NEON: per-block 7.6 ns at 131 Mblock/s gives 135x
|
||||||
|
margin over 30 fps 1080p, and the QPU dispatch floor (~250 ns)
|
||||||
|
makes any V3D shader strictly worse. Substitution is plumbing-only,
|
||||||
|
NEON-by-recipe — same daedalus_ctx_create_no_qpu pthread_once
|
||||||
|
context shape the cycles 6/7/8 shims already own (kept SEPARATE
|
||||||
|
from the H264DSP shim's ctx because H264QPEL is its own libavcodec
|
||||||
|
Makefile module and link order does not guarantee a single .o
|
||||||
|
owns the ctx symbol; one extra ~µs init per process, paid lazily).
|
||||||
|
|
||||||
|
Other H.264 luma MC variants (mc02, mc11, mc22 etc.) and the 16x16
|
||||||
|
size tier stay on the in-tree NEON .S code. Per the cycle-9 phase-1
|
||||||
|
rationale, mc20 8x8 is representative of the whole family's per-block
|
||||||
|
cost — extending the substitution to other variants would multiply
|
||||||
|
recipe-lookup overhead without changing the substrate verdict.
|
||||||
|
|
||||||
|
Bit-exact against ff_put_h264_qpel8_mc20_neon (daedalus-fourier
|
||||||
|
cycle 9 green; M1 = 100% bit-exact across 10000 random blocks).
|
||||||
|
|
||||||
|
No SONAME change, no Depends change.
|
||||||
|
|
||||||
|
Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 cycle 9.
|
||||||
|
---
|
||||||
|
libavcodec/aarch64/Makefile | 3 +-
|
||||||
|
libavcodec/aarch64/h264_qpel_daedalus.c | 50 ++++++++++++++++++++++
|
||||||
|
libavcodec/aarch64/h264qpel_init_aarch64.c | 4 +-
|
||||||
|
3 files changed, 55 insertions(+), 2 deletions(-)
|
||||||
|
create mode 100644 libavcodec/aarch64/h264_qpel_daedalus.c
|
||||||
|
|
||||||
|
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
|
||||||
|
--- a/libavcodec/aarch64/Makefile
|
||||||
|
+++ b/libavcodec/aarch64/Makefile
|
||||||
|
@@ -7,7 +7,8 @@ OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o \
|
||||||
|
aarch64/h264_idct_daedalus.o
|
||||||
|
OBJS-$(CONFIG_HUFFYUVDSP) += aarch64/huffyuvdsp_init_aarch64.o
|
||||||
|
OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_init.o
|
||||||
|
-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o
|
||||||
|
+OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o \
|
||||||
|
+ aarch64/h264_qpel_daedalus.o
|
||||||
|
OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o
|
||||||
|
OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_init_aarch64.o
|
||||||
|
OBJS-$(CONFIG_ME_CMP) += aarch64/me_cmp_init_aarch64.o
|
||||||
|
diff --git a/libavcodec/aarch64/h264_qpel_daedalus.c b/libavcodec/aarch64/h264_qpel_daedalus.c
|
||||||
|
new file mode 100644
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/libavcodec/aarch64/h264_qpel_daedalus.c
|
||||||
|
@@ -0,0 +1,50 @@
|
||||||
|
+/*
|
||||||
|
+ * H.264 luma qpel mc20 (8x8, horizontal half-pel, 6-tap "put")
|
||||||
|
+ * — daedalus-fourier substitution shim.
|
||||||
|
+ *
|
||||||
|
+ * Routes H264QpelContext.put_h264_qpel_pixels_tab[1][2] through
|
||||||
|
+ * daedalus_recipe_dispatch_h264_qpel_mc20 instead of
|
||||||
|
+ * ff_put_h264_qpel8_mc20_neon. The recipe layer picks the substrate
|
||||||
|
+ * (CPU NEON for cycle 9; QPU not viable — per-block 7.6 ns vs
|
||||||
|
+ * ~250 ns QPU dispatch floor, see docs/k9_h264qpel_mc20.md).
|
||||||
|
+ *
|
||||||
|
+ * Sibling to libavcodec/aarch64/h264_idct_daedalus.c. We keep a
|
||||||
|
+ * SEPARATE process-global pthread_once context here instead of
|
||||||
|
+ * sharing the H264DSP one because H264QPEL is its own libavcodec
|
||||||
|
+ * Makefile module and link order does not guarantee a single .o
|
||||||
|
+ * owns the ctx symbol. The cost is one extra
|
||||||
|
+ * daedalus_ctx_create_no_qpu (~µs) per process; daemon and host
|
||||||
|
+ * processes pay this lazily on first MC call.
|
||||||
|
+ *
|
||||||
|
+ * FFmpeg H264QpelContext convention: both dst and src use a SINGLE
|
||||||
|
+ * stride and `src` already points at the leftmost OUTPUT column
|
||||||
|
+ * (col 0); the 6-tap filter reads cols -2..+3. This matches
|
||||||
|
+ * daedalus_recipe_dispatch_h264_qpel_mc20's documented contract
|
||||||
|
+ * directly, so dst_off = src_off = 0.
|
||||||
|
+ */
|
||||||
|
+
|
||||||
|
+#include <pthread.h>
|
||||||
|
+#include <stddef.h>
|
||||||
|
+#include <stdint.h>
|
||||||
|
+
|
||||||
|
+#include <daedalus.h>
|
||||||
|
+
|
||||||
|
+#include "libavutil/attributes.h"
|
||||||
|
+
|
||||||
|
+static daedalus_ctx *g_dctx;
|
||||||
|
+static pthread_once_t g_dctx_once = PTHREAD_ONCE_INIT;
|
||||||
|
+
|
||||||
|
+static void daedalus_ctx_init_once(void)
|
||||||
|
+{
|
||||||
|
+ g_dctx = daedalus_ctx_create_no_qpu();
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+void ff_put_h264_qpel8_mc20_daedalus(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||||
|
+
|
||||||
|
+void ff_put_h264_qpel8_mc20_daedalus(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
|
||||||
|
+{
|
||||||
|
+ static const daedalus_h264_qpel_meta meta = { .dst_off = 0, .src_off = 0 };
|
||||||
|
+ pthread_once(&g_dctx_once, daedalus_ctx_init_once);
|
||||||
|
+ daedalus_recipe_dispatch_h264_qpel_mc20(g_dctx, dst, src, (size_t)stride,
|
||||||
|
+ 1, &meta);
|
||||||
|
+}
|
||||||
|
diff --git a/libavcodec/aarch64/h264qpel_init_aarch64.c b/libavcodec/aarch64/h264qpel_init_aarch64.c
|
||||||
|
--- a/libavcodec/aarch64/h264qpel_init_aarch64.c
|
||||||
|
+++ b/libavcodec/aarch64/h264qpel_init_aarch64.c
|
||||||
|
@@ -47,6 +47,8 @@ void ff_put_h264_qpel8_mc00_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t str
|
||||||
|
void ff_put_h264_qpel8_mc10_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||||
|
void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||||
|
void ff_put_h264_qpel8_mc30_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||||
|
+void ff_put_h264_qpel8_mc20_daedalus(uint8_t *dst, const uint8_t *src,
|
||||||
|
+ ptrdiff_t stride);
|
||||||
|
void ff_put_h264_qpel8_mc01_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||||
|
void ff_put_h264_qpel8_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||||
|
void ff_put_h264_qpel8_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||||
|
@@ -184,7 +186,7 @@ av_cold void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth)
|
||||||
|
|
||||||
|
c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon;
|
||||||
|
c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon;
|
||||||
|
- c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon;
|
||||||
|
+ c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_daedalus;
|
||||||
|
c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon;
|
||||||
|
c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon;
|
||||||
|
c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon;
|
||||||
|
--
|
||||||
|
2.47.3
|
||||||
@@ -24,13 +24,13 @@ _srcname=FFmpeg
|
|||||||
_version='8.1'
|
_version='8.1'
|
||||||
_commit='b57fbbe50c9b2656fad86a1a7eeabfd2b2a50935' # v4l2-request-n8.1 tip 2026-04-24
|
_commit='b57fbbe50c9b2656fad86a1a7eeabfd2b2a50935' # v4l2-request-n8.1 tip 2026-04-24
|
||||||
pkgver=8.1.r123329.b57fbbe
|
pkgver=8.1.r123329.b57fbbe
|
||||||
pkgrel=6 # pkgrel=6 — H.264 IDCT 4x4 daedalus-fourier substitution (2026-05-21)
|
pkgrel=10 # pkgrel=10 — H.264 luma qpel mc20 daedalus-fourier substitution (cycle 9, 2026-05-23)
|
||||||
epoch=2
|
epoch=2
|
||||||
|
|
||||||
# daedalus-fourier pin — first kernel substitution in libavcodec
|
# daedalus-fourier pin. 209a421 = PR #2 merge (Phase 8c — public API
|
||||||
# (cycle 6 H.264 IDCT 4x4). Same SHA as the daedalus-v4l2 daemon's
|
# gains daedalus_recipe_dispatch_h264_qpel_mc20 + DAEDALUS_KERNEL_H264_QPEL_MC20).
|
||||||
# inline build; lockstep with that until the public API rolls.
|
# Cycle 9 closes the libavcodec.so substitution arc started at cycle 6.
|
||||||
_daedalus_fourier_commit='d87239d8172307d9a1b93c95cbed116d175b85cc'
|
_daedalus_fourier_commit='209a4218bcb98b91c04f07ad61513bb04adb13ad'
|
||||||
pkgdesc='FFmpeg with V4L2 Request API hwaccel (Rockchip / Allwinner stateless decode)'
|
pkgdesc='FFmpeg with V4L2 Request API hwaccel (Rockchip / Allwinner stateless decode)'
|
||||||
arch=('aarch64')
|
arch=('aarch64')
|
||||||
url='https://github.com/Kwiboo/FFmpeg'
|
url='https://github.com/Kwiboo/FFmpeg'
|
||||||
@@ -90,8 +90,12 @@ source=("git+https://github.com/Kwiboo/FFmpeg.git#commit=${_commit}"
|
|||||||
"daedalus-fourier-${_daedalus_fourier_commit}.tar.gz::https://git.reauktion.de/marfrit/daedalus-fourier/archive/${_daedalus_fourier_commit}.tar.gz"
|
"daedalus-fourier-${_daedalus_fourier_commit}.tar.gz::https://git.reauktion.de/marfrit/daedalus-fourier/archive/${_daedalus_fourier_commit}.tar.gz"
|
||||||
'0001-libudev-bypass-fallback.patch'
|
'0001-libudev-bypass-fallback.patch'
|
||||||
'0002-nv15-to-p010-unpack.patch'
|
'0002-nv15-to-p010-unpack.patch'
|
||||||
'0003-h264-idct4-daedalus-fourier.patch')
|
'0003-h264-idct4-daedalus-fourier.patch'
|
||||||
sha256sums=('SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP')
|
'0004-h264-idct8-daedalus-fourier.patch'
|
||||||
|
'0005-h264-deblock-luma-v-daedalus-fourier.patch'
|
||||||
|
'0006-h264-restore-low-delay.patch'
|
||||||
|
'0007-h264-qpel-mc20-daedalus-fourier.patch')
|
||||||
|
sha256sums=('SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP' 'SKIP')
|
||||||
|
|
||||||
pkgver() {
|
pkgver() {
|
||||||
cd "${_srcname}"
|
cd "${_srcname}"
|
||||||
@@ -105,6 +109,10 @@ prepare() {
|
|||||||
patch -Np1 -i "${srcdir}/0001-libudev-bypass-fallback.patch"
|
patch -Np1 -i "${srcdir}/0001-libudev-bypass-fallback.patch"
|
||||||
patch -Np1 -i "${srcdir}/0002-nv15-to-p010-unpack.patch"
|
patch -Np1 -i "${srcdir}/0002-nv15-to-p010-unpack.patch"
|
||||||
patch -Np1 -i "${srcdir}/0003-h264-idct4-daedalus-fourier.patch"
|
patch -Np1 -i "${srcdir}/0003-h264-idct4-daedalus-fourier.patch"
|
||||||
|
patch -Np1 -i "${srcdir}/0004-h264-idct8-daedalus-fourier.patch"
|
||||||
|
patch -Np1 -i "${srcdir}/0005-h264-deblock-luma-v-daedalus-fourier.patch"
|
||||||
|
patch -Np1 -i "${srcdir}/0006-h264-restore-low-delay.patch"
|
||||||
|
patch -Np1 -i "${srcdir}/0007-h264-qpel-mc20-daedalus-fourier.patch"
|
||||||
}
|
}
|
||||||
|
|
||||||
build() {
|
build() {
|
||||||
|
|||||||
+57
@@ -0,0 +1,57 @@
|
|||||||
|
From: claude-noether (on behalf of mfritsche)
|
||||||
|
Date: 2026-05-19
|
||||||
|
Subject: panvk: expose VK_KHR/EXT_robustness2 + nullDescriptor on Bifrost (PAN_ARCH 6/7)
|
||||||
|
|
||||||
|
Without this, Mesa's Zink driver refuses to use PanVk-Bifrost as its Vulkan
|
||||||
|
backend, falling back silently to llvmpipe (software rasterizer) for all
|
||||||
|
GL-via-Zink on Bifrost SBCs. That defeats the entire purpose of having a
|
||||||
|
Vulkan driver on Bifrost — GL acceleration via Zink is the most natural
|
||||||
|
near-term consumer.
|
||||||
|
|
||||||
|
panvk_vX_nir_lower_descriptors.c:1309 and panvk_vX_shader.c:1355 already
|
||||||
|
plumb dev->vk.enabled_features.nullDescriptor arch-agnostically — the gate
|
||||||
|
at panvk_vX_physical_device.c was set conservatively when Bifrost was
|
||||||
|
unmaintained, not because of hardware incapability.
|
||||||
|
|
||||||
|
iter1–7 of the panvk-bifrost campaign proved fundamental driver functions
|
||||||
|
on Mali-G52 r1 MC1 (PAN_ARCH=7). This patch is the iter8 follow-up.
|
||||||
|
|
||||||
|
robustBufferAccess2 and robustImageAccess2 are NOT flipped — they're
|
||||||
|
independent rb2 features Zink doesn't require, gated differently
|
||||||
|
(robustBufferAccess2 = PAN_ARCH >= 11, robustImageAccess2 = false), and
|
||||||
|
out of scope for iter8.
|
||||||
|
|
||||||
|
---
|
||||||
|
src/panfrost/vulkan/panvk_vX_physical_device.c | 6 +++---
|
||||||
|
1 file changed, 3 insertions(+), 3 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/src/panfrost/vulkan/panvk_vX_physical_device.c b/src/panfrost/vulkan/panvk_vX_physical_device.c
|
||||||
|
--- a/src/panfrost/vulkan/panvk_vX_physical_device.c
|
||||||
|
+++ b/src/panfrost/vulkan/panvk_vX_physical_device.c
|
||||||
|
@@ -91,7 +91,7 @@ get_device_extensions(const struct panvk_physical_device *device,
|
||||||
|
.KHR_pipeline_binary = true,
|
||||||
|
.KHR_pipeline_executable_properties = true,
|
||||||
|
.KHR_pipeline_library = true,
|
||||||
|
- .KHR_robustness2 = PAN_ARCH >= 10,
|
||||||
|
+ .KHR_robustness2 = true,
|
||||||
|
.KHR_sampler_mirror_clamp_to_edge = true,
|
||||||
|
.KHR_sampler_ycbcr_conversion = true,
|
||||||
|
.KHR_separate_depth_stencil_layouts = true,
|
||||||
|
@@ -168,7 +168,7 @@ get_device_extensions(const struct panvk_physical_device *device,
|
||||||
|
.EXT_queue_family_foreign = true,
|
||||||
|
.EXT_robustness = pan_arch(device->kmod.dev->props.gpu_id) >= 9,
|
||||||
|
.EXT_image_robustness = true,
|
||||||
|
- .EXT_robustness2 = PAN_ARCH >= 10,
|
||||||
|
+ .EXT_robustness2 = true,
|
||||||
|
.EXT_sampler_filter_minmax = PAN_ARCH >= 10,
|
||||||
|
.EXT_scalar_block_layout = true,
|
||||||
|
.EXT_separate_stencil_usage = true,
|
||||||
|
@@ -493,7 +493,7 @@ get_device_features(const struct panvk_physical_device *device,
|
||||||
|
/* VK_KHR_robustness2 */
|
||||||
|
.robustBufferAccess2 = PAN_ARCH >= 11,
|
||||||
|
.robustImageAccess2 = false,
|
||||||
|
- .nullDescriptor = PAN_ARCH >= 10,
|
||||||
|
+ .nullDescriptor = true,
|
||||||
|
|
||||||
|
/* VK_KHR_shader_clock */
|
||||||
|
.shaderSubgroupClock = device->kmod.dev->props.gpu_can_query_timestamp,
|
||||||
@@ -0,0 +1,47 @@
|
|||||||
|
From: claude-noether (on behalf of mfritsche)
|
||||||
|
Date: 2026-05-20
|
||||||
|
Subject: panvk: expose Vulkan 1.1 + 1.2 on Bifrost (PAN_ARCH 6/7)
|
||||||
|
|
||||||
|
ANGLE (Chromium's GL stack) requires apiVersion >= 1.1 to initialize. Without
|
||||||
|
this, Brave / Chromium's GPU process fails at GL info collection:
|
||||||
|
|
||||||
|
vk_renderer.cpp:2659 (initialize): ANGLE Requires a minimum Vulkan device
|
||||||
|
version of 1.1
|
||||||
|
Display::initialize error 0: Internal Vulkan error (-9): The requested
|
||||||
|
version of Vulkan is not supported by the driver
|
||||||
|
|
||||||
|
Stack-up with iter8's robustness2 patch enables ANGLE → PanVk-Bifrost →
|
||||||
|
Skia (via --enable-features=Vulkan) on Bifrost SBCs.
|
||||||
|
|
||||||
|
PanVk-Bifrost already supports the bulk of 1.1-promoted features as extensions
|
||||||
|
(multiview, maintenance1-3, descriptor update template, 16-bit storage,
|
||||||
|
sampler ycbcr, variable pointers, etc. — all
|
||||||
|
visible in iter0 vulkaninfo). The version bump primarily bundles them.
|
||||||
|
|
||||||
|
Risk: Vulkan 1.1 has features beyond what iter1–7 exercised (protected memory,
|
||||||
|
full subgroup ops). Specific app failures will be characterizable.
|
||||||
|
|
||||||
|
1.2 is also flipped — Brave's Vulkan path may want descriptor indexing,
|
||||||
|
buffer device address, etc. (all listed in iter0 vulkaninfo as supported
|
||||||
|
extensions, just gated as 1.0-with-extensions, not 1.2-core).
|
||||||
|
|
||||||
|
---
|
||||||
|
src/panfrost/vulkan/panvk_vX_physical_device.c | 4 ++--
|
||||||
|
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/src/panfrost/vulkan/panvk_vX_physical_device.c b/src/panfrost/vulkan/panvk_vX_physical_device.c
|
||||||
|
--- a/src/panfrost/vulkan/panvk_vX_physical_device.c
|
||||||
|
+++ b/src/panfrost/vulkan/panvk_vX_physical_device.c
|
||||||
|
@@ -38,8 +38,8 @@ get_device_extensions(const struct panvk_physical_device *device,
|
||||||
|
struct vk_device_extension_table *ext)
|
||||||
|
{
|
||||||
|
*ext = (struct vk_device_extension_table){
|
||||||
|
- .KHR_8bit_storage = true,
|
||||||
|
- .KHR_16bit_storage = true,
|
||||||
|
- bool has_vk1_1 = PAN_ARCH >= 10;
|
||||||
|
- bool has_vk1_2 = PAN_ARCH >= 10;
|
||||||
|
+ .KHR_8bit_storage = true,
|
||||||
|
+ .KHR_16bit_storage = true,
|
||||||
|
+ bool has_vk1_1 = true;
|
||||||
|
+ bool has_vk1_2 = true;
|
||||||
|
*ext = (struct vk_device_extension_table){
|
||||||
@@ -0,0 +1,328 @@
|
|||||||
|
--- a/src/panfrost/vulkan/panvk_shader.h 2026-04-29 22:19:00.000000000 +0200
|
||||||
|
+++ b/src/panfrost/vulkan/panvk_shader.h 2026-05-20 18:52:53.312698258 +0200
|
||||||
|
@@ -150,6 +150,10 @@
|
||||||
|
struct {
|
||||||
|
#if PAN_ARCH < 9
|
||||||
|
int32_t raw_vertex_offset;
|
||||||
|
+ uint32_t num_vertices; /* iter13: XFB needs per-draw vertex count */
|
||||||
|
+ /* aligned_u64 attribute below inserts the 4-byte alignment gap
|
||||||
|
+ * after num_vertices automatically — no explicit pad needed. */
|
||||||
|
+ aligned_u64 xfb_address[4]; /* iter13: 4 transform feedback buffer base addresses */
|
||||||
|
#endif
|
||||||
|
int32_t first_vertex;
|
||||||
|
int32_t base_instance;
|
||||||
|
--- a/src/panfrost/vulkan/panvk_vX_physical_device.c 2026-05-20 19:09:29.711145446 +0200
|
||||||
|
+++ b/src/panfrost/vulkan/panvk_vX_physical_device.c 2026-05-20 18:52:54.832720445 +0200
|
||||||
|
@@ -169,6 +169,7 @@
|
||||||
|
.EXT_provoking_vertex = true,
|
||||||
|
.EXT_queue_family_foreign = true,
|
||||||
|
.EXT_robustness2 = true,
|
||||||
|
+ .EXT_transform_feedback = PAN_ARCH < 9, /* iter13: JM-class only for now */
|
||||||
|
.EXT_sampler_filter_minmax = PAN_ARCH >= 10,
|
||||||
|
.EXT_scalar_block_layout = true,
|
||||||
|
.EXT_separate_stencil_usage = true,
|
||||||
|
@@ -495,6 +496,10 @@
|
||||||
|
.robustImageAccess2 = false,
|
||||||
|
.nullDescriptor = true,
|
||||||
|
|
||||||
|
+ /* VK_EXT_transform_feedback (iter13) */
|
||||||
|
+ .transformFeedback = PAN_ARCH < 9,
|
||||||
|
+ .geometryStreams = false,
|
||||||
|
+
|
||||||
|
/* VK_KHR_shader_clock */
|
||||||
|
.shaderSubgroupClock = device->kmod.dev->props.gpu_can_query_timestamp,
|
||||||
|
.shaderDeviceClock = device->kmod.dev->props.timestamp_device_coherent,
|
||||||
|
@@ -1020,6 +1025,18 @@
|
||||||
|
.robustStorageBufferAccessSizeAlignment = 1,
|
||||||
|
.robustUniformBufferAccessSizeAlignment = 1,
|
||||||
|
|
||||||
|
+ /* VK_EXT_transform_feedback (iter13) */
|
||||||
|
+ .maxTransformFeedbackStreams = 1,
|
||||||
|
+ .maxTransformFeedbackBuffers = 4,
|
||||||
|
+ .maxTransformFeedbackBufferSize = UINT32_MAX,
|
||||||
|
+ .maxTransformFeedbackStreamDataSize = 512,
|
||||||
|
+ .maxTransformFeedbackBufferDataSize = 512,
|
||||||
|
+ .maxTransformFeedbackBufferDataStride = 2048,
|
||||||
|
+ .transformFeedbackQueries = false,
|
||||||
|
+ .transformFeedbackStreamsLinesTriangles = false,
|
||||||
|
+ .transformFeedbackRasterizationStreamSelect = false,
|
||||||
|
+ .transformFeedbackDraw = false,
|
||||||
|
+
|
||||||
|
/* VK_EXT_shader_object */
|
||||||
|
/* We do not currently support VK_EXT_shader_object but this is used
|
||||||
|
* internally by vk_shader
|
||||||
|
--- a/src/panfrost/vulkan/panvk_vX_shader.c 2026-04-29 22:19:00.000000000 +0200
|
||||||
|
+++ b/src/panfrost/vulkan/panvk_vX_shader.c 2026-05-20 18:52:56.556745611 +0200
|
||||||
|
@@ -21,6 +21,7 @@
|
||||||
|
#include "panvk_physical_device.h"
|
||||||
|
#include "panvk_sampler.h"
|
||||||
|
#include "panvk_shader.h"
|
||||||
|
+#include "pan_nir.h" /* iter13: pan_nir_lower_xfb */
|
||||||
|
|
||||||
|
#include "spirv/nir_spirv.h"
|
||||||
|
#include "util/memstream.h"
|
||||||
|
@@ -100,6 +101,20 @@
|
||||||
|
case nir_intrinsic_load_raw_vertex_offset_pan:
|
||||||
|
val = load_sysval(b, graphics, bit_size, vs.raw_vertex_offset);
|
||||||
|
break;
|
||||||
|
+ case nir_intrinsic_load_num_vertices: /* iter13: XFB index calc */
|
||||||
|
+ val = load_sysval(b, graphics, bit_size, vs.num_vertices);
|
||||||
|
+ break;
|
||||||
|
+ case nir_intrinsic_load_xfb_address: { /* iter13: XFB buffer N base address */
|
||||||
|
+ unsigned idx = nir_intrinsic_base(intr);
|
||||||
|
+ switch (idx) {
|
||||||
|
+ case 0: val = load_sysval(b, graphics, bit_size, vs.xfb_address[0]); break;
|
||||||
|
+ case 1: val = load_sysval(b, graphics, bit_size, vs.xfb_address[1]); break;
|
||||||
|
+ case 2: val = load_sysval(b, graphics, bit_size, vs.xfb_address[2]); break;
|
||||||
|
+ case 3: val = load_sysval(b, graphics, bit_size, vs.xfb_address[3]); break;
|
||||||
|
+ default: return false;
|
||||||
|
+ }
|
||||||
|
+ break;
|
||||||
|
+ }
|
||||||
|
case nir_intrinsic_load_layer_id:
|
||||||
|
assert(b->shader->info.stage == MESA_SHADER_FRAGMENT);
|
||||||
|
val = load_sysval(b, graphics, bit_size, layer_id);
|
||||||
|
@@ -457,6 +472,7 @@
|
||||||
|
core_max_id);
|
||||||
|
|
||||||
|
pan_preprocess_nir(nir, pdev->kmod.dev->props.gpu_id);
|
||||||
|
+
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
@@ -870,6 +886,18 @@
|
||||||
|
nir_var_shader_in | nir_var_shader_out, UINT32_MAX);
|
||||||
|
NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
|
||||||
|
glsl_type_size, nir_lower_io_use_interpolated_input_intrinsics);
|
||||||
|
+
|
||||||
|
+#if PAN_ARCH < 9
|
||||||
|
+ /* iter13: VK_EXT_transform_feedback — runs AFTER nir_lower_io so that
|
||||||
|
+ * shader outputs are now store_output intrinsics that pan_nir_lower_xfb
|
||||||
|
+ * can rewrite to nir_store_global+nir_load_xfb_address. */
|
||||||
|
+ if (nir->info.stage == MESA_SHADER_VERTEX &&
|
||||||
|
+ nir->info.has_transform_feedback_varyings) {
|
||||||
|
+ NIR_PASS(_, nir, nir_opt_constant_folding);
|
||||||
|
+ NIR_PASS(_, nir, nir_io_add_intrinsic_xfb_info);
|
||||||
|
+ NIR_PASS(_, nir, pan_nir_lower_xfb);
|
||||||
|
+ }
|
||||||
|
+#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static VkResult
|
||||||
|
@@ -1288,6 +1316,9 @@
|
||||||
|
.view_mask = (state && state->rp) ? state->rp->view_mask : 0,
|
||||||
|
.robust2_modes = robust2_modes,
|
||||||
|
.robust_descriptors = dev->vk.enabled_features.nullDescriptor,
|
||||||
|
+ /* iter13: XFB shaders must disable IDVS (matches Panfrost-Gallium). */
|
||||||
|
+ .no_idvs = (info->stage == MESA_SHADER_VERTEX) &&
|
||||||
|
+ info->nir->info.has_transform_feedback_varyings,
|
||||||
|
};
|
||||||
|
|
||||||
|
switch (info->stage) {
|
||||||
|
--- a/src/panfrost/vulkan/panvk_cmd_draw.h 2026-04-29 22:19:00.000000000 +0200
|
||||||
|
+++ b/src/panfrost/vulkan/panvk_cmd_draw.h 2026-05-20 18:52:57.748763011 +0200
|
||||||
|
@@ -135,6 +135,19 @@
|
||||||
|
struct panvk_graphics_sysvals sysvals;
|
||||||
|
|
||||||
|
#if PAN_ARCH < 9
|
||||||
|
+ /* iter13: VK_EXT_transform_feedback state (JM-class only for now). */
|
||||||
|
+ struct {
|
||||||
|
+ bool active;
|
||||||
|
+ uint32_t buffer_count;
|
||||||
|
+ struct {
|
||||||
|
+ uint64_t addr;
|
||||||
|
+ uint64_t offset;
|
||||||
|
+ uint64_t size;
|
||||||
|
+ } buffers[4];
|
||||||
|
+ } xfb;
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
+#if PAN_ARCH < 9
|
||||||
|
struct panvk_shader_link link;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
--- a/src/panfrost/vulkan/panvk_vX_cmd_draw.c 2026-04-29 22:19:00.000000000 +0200
|
||||||
|
+++ b/src/panfrost/vulkan/panvk_vX_cmd_draw.c 2026-05-20 19:10:23.031919662 +0200
|
||||||
|
@@ -10,6 +10,7 @@
|
||||||
|
#include "panvk_entrypoints.h"
|
||||||
|
|
||||||
|
#include "pan_desc.h"
|
||||||
|
+#include "pan_compiler.h" /* PAN_SHADER_OOB_ADDRESS */
|
||||||
|
#include "pan_util.h"
|
||||||
|
|
||||||
|
static void
|
||||||
|
@@ -722,6 +723,35 @@
|
||||||
|
set_gfx_sysval(cmdbuf, dirty_sysvals, vs.raw_vertex_offset,
|
||||||
|
info->vertex.raw_offset);
|
||||||
|
set_gfx_sysval(cmdbuf, dirty_sysvals, layer_id, info->layer_id);
|
||||||
|
+
|
||||||
|
+ /* iter13: VK_EXT_transform_feedback sysvals — always set (per draw),
|
||||||
|
+ * reflect bound XFB state. set_gfx_sysval is a no-op if value unchanged. */
|
||||||
|
+ set_gfx_sysval(cmdbuf, dirty_sysvals, vs.num_vertices, info->vertex.count);
|
||||||
|
+ {
|
||||||
|
+ const struct panvk_cmd_graphics_state *_gfx = &cmdbuf->state.gfx;
|
||||||
|
+ /* iter13: default each XFB buffer address to PAN_SHADER_OOB_ADDRESS
|
||||||
|
+ * (= 1<<63). This is the Panfrost-Gallium memory-sink idiom — the
|
||||||
|
+ * Bifrost MMU silently discards stores to this address, so a pipeline
|
||||||
|
+ * with XFB outputs used in a non-XFB draw (or in an XFB draw with
|
||||||
|
+ * fewer bound buffers than the shader declares) is safe instead of
|
||||||
|
+ * faulting. See gallium/drivers/panfrost/pan_cmdstream.c PAN_SYSVAL_XFB. */
|
||||||
|
+ uint64_t _xa0 = PAN_SHADER_OOB_ADDRESS, _xa1 = PAN_SHADER_OOB_ADDRESS,
|
||||||
|
+ _xa2 = PAN_SHADER_OOB_ADDRESS, _xa3 = PAN_SHADER_OOB_ADDRESS;
|
||||||
|
+ if (_gfx->xfb.active) {
|
||||||
|
+ if (_gfx->xfb.buffer_count > 0 && _gfx->xfb.buffers[0].addr)
|
||||||
|
+ _xa0 = _gfx->xfb.buffers[0].addr + _gfx->xfb.buffers[0].offset;
|
||||||
|
+ if (_gfx->xfb.buffer_count > 1 && _gfx->xfb.buffers[1].addr)
|
||||||
|
+ _xa1 = _gfx->xfb.buffers[1].addr + _gfx->xfb.buffers[1].offset;
|
||||||
|
+ if (_gfx->xfb.buffer_count > 2 && _gfx->xfb.buffers[2].addr)
|
||||||
|
+ _xa2 = _gfx->xfb.buffers[2].addr + _gfx->xfb.buffers[2].offset;
|
||||||
|
+ if (_gfx->xfb.buffer_count > 3 && _gfx->xfb.buffers[3].addr)
|
||||||
|
+ _xa3 = _gfx->xfb.buffers[3].addr + _gfx->xfb.buffers[3].offset;
|
||||||
|
+ }
|
||||||
|
+ set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_address[0], _xa0);
|
||||||
|
+ set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_address[1], _xa1);
|
||||||
|
+ set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_address[2], _xa2);
|
||||||
|
+ set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_address[3], _xa3);
|
||||||
|
+ }
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (dyn_gfx_state_dirty(cmdbuf, CB_BLEND_CONSTANTS)) {
|
||||||
|
--- a/src/panfrost/vulkan/meson.build 2026-04-29 22:19:00.000000000 +0200
|
||||||
|
+++ b/src/panfrost/vulkan/meson.build 2026-05-20 18:53:04.484861338 +0200
|
||||||
|
@@ -73,6 +73,7 @@
|
||||||
|
jm_inc_dir = ['jm']
|
||||||
|
jm_files = [
|
||||||
|
'jm/panvk_vX_bind_queue.c',
|
||||||
|
+ 'jm/panvk_vX_cmd_xfb.c', # iter13
|
||||||
|
'jm/panvk_vX_cmd_buffer.c',
|
||||||
|
'jm/panvk_vX_cmd_dispatch.c',
|
||||||
|
'jm/panvk_vX_cmd_draw.c',
|
||||||
|
--- a/src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c 2026-04-29 22:19:00.000000000 +0200
|
||||||
|
+++ b/src/panfrost/vulkan/jm/panvk_vX_cmd_buffer.c 2026-05-20 19:10:26.163965149 +0200
|
||||||
|
@@ -473,5 +473,12 @@
|
||||||
|
|
||||||
|
vk_command_buffer_begin(&cmdbuf->vk, pBeginInfo);
|
||||||
|
|
||||||
|
+#if PAN_ARCH < 9
|
||||||
|
+ /* iter13: clear XFB state on Begin so a reused command buffer does not
|
||||||
|
+ * inherit stale xfb.buffer_count / xfb.active / xfb.buffers[] from a
|
||||||
|
+ * prior recording. */
|
||||||
|
+ memset(&cmdbuf->state.gfx.xfb, 0, sizeof(cmdbuf->state.gfx.xfb));
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
return VK_SUCCESS;
|
||||||
|
}
|
||||||
|
--- a/src/panfrost/vulkan/jm/panvk_vX_cmd_xfb.c 2026-05-18 12:50:53.067999996 +0200
|
||||||
|
+++ b/src/panfrost/vulkan/jm/panvk_vX_cmd_xfb.c 2026-05-20 19:10:27.175979847 +0200
|
||||||
|
@@ -0,0 +1,111 @@
|
||||||
|
+/*
|
||||||
|
+ * Copyright © 2026 mfritsche / claude-noether
|
||||||
|
+ * SPDX-License-Identifier: MIT
|
||||||
|
+ *
|
||||||
|
+ * iter13: VK_EXT_transform_feedback command handlers for the JM
|
||||||
|
+ * architecture path (Bifrost v6/v7; advertised only when PAN_ARCH < 9).
|
||||||
|
+ *
|
||||||
|
+ * The runtime contract:
|
||||||
|
+ * - vkCmdBindTransformFeedbackBuffersEXT: stash (gpu_addr, offset, size)
|
||||||
|
+ * for each slot into cmdbuf->state.gfx.xfb.buffers[].
|
||||||
|
+ * - vkCmdBeginTransformFeedbackEXT: set cmdbuf->state.gfx.xfb.active = true.
|
||||||
|
+ * No explicit dirty marking: the next draw's set_gfx_sysval re-emits vs.xfb_address[].
|
||||||
|
+ * - vkCmdEndTransformFeedbackEXT: set active = false.
|
||||||
|
+ *
|
||||||
|
+ * Counter buffers (firstCounterBuffer/counterBufferCount/pCounterBuffers/
|
||||||
|
+ * pCounterBufferOffsets) are accepted by API but ignored — v1 doesn't
|
||||||
|
+ * support pause/resume. transformFeedbackDraw is advertised as false.
|
||||||
|
+ *
|
||||||
|
+ * Per-draw integration: jm/panvk_vX_cmd_draw.c reads cmdbuf->state.gfx.xfb
|
||||||
|
+ * and populates vs.xfb_address[i] for shader use. The pan_nir_lower_xfb
|
||||||
|
+ * pass in panvk_vX_shader.c emits nir_load_xfb_address(i) which lowers
|
||||||
|
+ * (via panvk_vX_shader.c sysval handler) to a load from the per-draw
|
||||||
|
+ * sysval push area.
|
||||||
|
+ */
|
||||||
|
+
|
||||||
|
+#include "vk_log.h"
|
||||||
|
+#include "util/log.h"
|
||||||
|
+
|
||||||
|
+#include "panvk_cmd_buffer.h"
|
||||||
|
+#include "panvk_cmd_draw.h"
|
||||||
|
+#include "panvk_buffer.h"
|
||||||
|
+#include "panvk_entrypoints.h"
|
||||||
|
+
|
||||||
|
+VKAPI_ATTR void VKAPI_CALL
|
||||||
|
+panvk_per_arch(CmdBindTransformFeedbackBuffersEXT)(
|
||||||
|
+ VkCommandBuffer commandBuffer,
|
||||||
|
+ uint32_t firstBinding,
|
||||||
|
+ uint32_t bindingCount,
|
||||||
|
+ const VkBuffer *pBuffers,
|
||||||
|
+ const VkDeviceSize *pOffsets,
|
||||||
|
+ const VkDeviceSize *pSizes)
|
||||||
|
+{
|
||||||
|
+ VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
|
||||||
|
+ struct panvk_cmd_graphics_state *gfx = &cmdbuf->state.gfx;
|
||||||
|
+
|
||||||
|
+ for (uint32_t i = 0; i < bindingCount; i++) {
|
||||||
|
+ uint32_t slot = firstBinding + i;
|
||||||
|
+ if (slot >= 4)
|
||||||
|
+ continue;
|
||||||
|
+
|
||||||
|
+ VK_FROM_HANDLE(panvk_buffer, buf, pBuffers[i]);
|
||||||
|
+ gfx->xfb.buffers[slot].addr = panvk_buffer_gpu_ptr(buf, 0);
|
||||||
|
+ gfx->xfb.buffers[slot].offset = pOffsets[i];
|
||||||
|
+ gfx->xfb.buffers[slot].size =
|
||||||
|
+ (pSizes != NULL && pSizes[i] != VK_WHOLE_SIZE)
|
||||||
|
+ ? pSizes[i]
|
||||||
|
+ : (buf->vk.size - pOffsets[i]);
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ if (firstBinding + bindingCount > gfx->xfb.buffer_count)
|
||||||
|
+ gfx->xfb.buffer_count = firstBinding + bindingCount;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+VKAPI_ATTR void VKAPI_CALL
|
||||||
|
+panvk_per_arch(CmdBeginTransformFeedbackEXT)(
|
||||||
|
+ VkCommandBuffer commandBuffer,
|
||||||
|
+ uint32_t firstCounterBuffer,
|
||||||
|
+ uint32_t counterBufferCount,
|
||||||
|
+ const VkBuffer *pCounterBuffers,
|
||||||
|
+ const VkDeviceSize *pCounterBufferOffsets)
|
||||||
|
+{
|
||||||
|
+ VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
|
||||||
|
+ struct panvk_cmd_graphics_state *gfx = &cmdbuf->state.gfx;
|
||||||
|
+
|
||||||
|
+ /* Counter buffers ignored in v1 — see VkPhysicalDeviceTransformFeedback
|
||||||
|
+ * PropertiesEXT.transformFeedbackDraw = false in panvk_vX_physical_device.c.
|
||||||
|
+ * App is spec-compliant if it does not pass counter buffers (which our
|
||||||
|
+ * features advertisement allows), but warn loudly if it does so we do not
|
||||||
|
+ * silently produce wrong capture state. */
|
||||||
|
+ (void)firstCounterBuffer;
|
||||||
|
+ (void)pCounterBufferOffsets;
|
||||||
|
+ if (counterBufferCount > 0 && pCounterBuffers != NULL) {
|
||||||
|
+ mesa_logw("panvk: CmdBeginTransformFeedbackEXT: counter buffers not "
|
||||||
|
+ "implemented (transformFeedbackDraw=false); XFB resume will "
|
||||||
|
+ "restart at buffer offset 0");
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ gfx->xfb.active = true;
|
||||||
|
+ /* Per-draw set_gfx_sysval picks up the change automatically — no
|
||||||
|
+ * explicit dirty marking required (set_gfx_sysval uses memcmp +
|
||||||
|
+ * BITSET to detect state diffs and re-emit sysvals). */
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+VKAPI_ATTR void VKAPI_CALL
|
||||||
|
+panvk_per_arch(CmdEndTransformFeedbackEXT)(
|
||||||
|
+ VkCommandBuffer commandBuffer,
|
||||||
|
+ uint32_t firstCounterBuffer,
|
||||||
|
+ uint32_t counterBufferCount,
|
||||||
|
+ const VkBuffer *pCounterBuffers,
|
||||||
|
+ const VkDeviceSize *pCounterBufferOffsets)
|
||||||
|
+{
|
||||||
|
+ VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
|
||||||
|
+ struct panvk_cmd_graphics_state *gfx = &cmdbuf->state.gfx;
|
||||||
|
+
|
||||||
|
+ (void)firstCounterBuffer;
|
||||||
|
+ (void)counterBufferCount;
|
||||||
|
+ (void)pCounterBuffers;
|
||||||
|
+ (void)pCounterBufferOffsets;
|
||||||
|
+
|
||||||
|
+ gfx->xfb.active = false;
|
||||||
|
+}
|
||||||
@@ -0,0 +1,629 @@
|
|||||||
|
diff -urN a/src/panfrost/vulkan/meson.build b/src/panfrost/vulkan/meson.build
|
||||||
|
--- a/src/panfrost/vulkan/meson.build 2026-05-21 14:04:02.529474145 +0200
|
||||||
|
+++ b/src/panfrost/vulkan/meson.build 2026-05-21 14:04:04.106755486 +0200
|
||||||
|
@@ -123,6 +123,7 @@
|
||||||
|
'panvk_vX_nir_lower_input_attachment_loads.c',
|
||||||
|
'panvk_vX_sampler.c',
|
||||||
|
'panvk_vX_shader.c',
|
||||||
|
+ 'panvk_vX_xfb_lower.c',
|
||||||
|
sha1_h,
|
||||||
|
]
|
||||||
|
|
||||||
|
diff -urN a/src/panfrost/vulkan/panvk_shader.h b/src/panfrost/vulkan/panvk_shader.h
|
||||||
|
--- a/src/panfrost/vulkan/panvk_shader.h 2026-05-21 14:04:02.525251986 +0200
|
||||||
|
+++ b/src/panfrost/vulkan/panvk_shader.h 2026-05-21 14:04:04.084251800 +0200
|
||||||
|
@@ -154,6 +154,8 @@
|
||||||
|
/* aligned_u64 attribute below inserts the 4-byte alignment gap
|
||||||
|
* after num_vertices automatically — no explicit pad needed. */
|
||||||
|
aligned_u64 xfb_address[4]; /* iter13: 4 transform feedback buffer base addresses */
|
||||||
|
+ uint32_t xfb_topology; /* iter17: panvk_xfb_topology enum value */
|
||||||
|
+ uint32_t xfb_output_count; /* iter17: per-instance output verts after decomp */
|
||||||
|
#endif
|
||||||
|
int32_t first_vertex;
|
||||||
|
int32_t base_instance;
|
||||||
|
@@ -569,4 +571,76 @@
|
||||||
|
struct pan_compute_dim local_size, const void *bin_ptr, size_t bin_size,
|
||||||
|
struct panvk_shader **shader_out);
|
||||||
|
|
||||||
|
+
|
||||||
|
+#if PAN_ARCH < 9
|
||||||
|
+/* iter17: encoding for vs.xfb_topology sysval. Maps VkPrimitiveTopology values
|
||||||
|
+ * we need to distinguish at shader runtime for XFB capture. LIST topologies
|
||||||
|
+ * use the iter13 single-store fast path; non-LIST need per-vertex decomposition. */
|
||||||
|
+enum panvk_xfb_topology {
|
||||||
|
+ PANVK_XFB_TOPO_LIST = 0,
|
||||||
|
+ PANVK_XFB_TOPO_LINE_STRIP = 1,
|
||||||
|
+ PANVK_XFB_TOPO_TRI_STRIP = 2,
|
||||||
|
+ PANVK_XFB_TOPO_TRI_FAN = 3,
|
||||||
|
+ PANVK_XFB_TOPO_LINE_LIST_ADJ = 4,
|
||||||
|
+ PANVK_XFB_TOPO_LINE_STRIP_ADJ = 5,
|
||||||
|
+ PANVK_XFB_TOPO_TRI_LIST_ADJ = 6,
|
||||||
|
+ PANVK_XFB_TOPO_TRI_STRIP_ADJ = 7,
|
||||||
|
+};
|
||||||
|
+
|
||||||
|
+#include "panvk_macros.h"
|
||||||
|
+struct nir_shader;
|
||||||
|
+bool panvk_per_arch(nir_lower_xfb)(struct nir_shader *nir);
|
||||||
|
+
|
||||||
|
+/* Map VkPrimitiveTopology to panvk_xfb_topology enum (driver-side helper). */
|
||||||
|
+static inline uint32_t
|
||||||
|
+panvk_vk_topology_to_xfb_enum(VkPrimitiveTopology topo)
|
||||||
|
+{
|
||||||
|
+ switch (topo) {
|
||||||
|
+ case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
|
||||||
|
+ return PANVK_XFB_TOPO_LINE_STRIP;
|
||||||
|
+ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
|
||||||
|
+ return PANVK_XFB_TOPO_TRI_STRIP;
|
||||||
|
+ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
|
||||||
|
+ return PANVK_XFB_TOPO_TRI_FAN;
|
||||||
|
+ case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
|
||||||
|
+ return PANVK_XFB_TOPO_LINE_LIST_ADJ;
|
||||||
|
+ case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
|
||||||
|
+ return PANVK_XFB_TOPO_LINE_STRIP_ADJ;
|
||||||
|
+ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
|
||||||
|
+ return PANVK_XFB_TOPO_TRI_LIST_ADJ;
|
||||||
|
+ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
|
||||||
|
+ return PANVK_XFB_TOPO_TRI_STRIP_ADJ;
|
||||||
|
+ case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
|
||||||
|
+ case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
|
||||||
|
+ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
|
||||||
|
+ default:
|
||||||
|
+ return PANVK_XFB_TOPO_LIST;
|
||||||
|
+ }
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+/* Compute the per-instance output vertex count for a given (topology, input count). */
|
||||||
|
+static inline uint32_t
|
||||||
|
+panvk_xfb_output_count(VkPrimitiveTopology topo, uint32_t input_count)
|
||||||
|
+{
|
||||||
|
+ switch (topo) {
|
||||||
|
+ case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
|
||||||
|
+ return input_count >= 1 ? 2u * (input_count - 1u) : 0u;
|
||||||
|
+ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
|
||||||
|
+ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
|
||||||
|
+ return input_count >= 2 ? 3u * (input_count - 2u) : 0u;
|
||||||
|
+ case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
|
||||||
|
+ return (input_count / 4u) * 2u;
|
||||||
|
+ case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
|
||||||
|
+ return input_count >= 3 ? 2u * (input_count - 3u) : 0u;
|
||||||
|
+ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
|
||||||
|
+ return (input_count / 6u) * 3u;
|
||||||
|
+ case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
|
||||||
|
+ return input_count >= 6 ? 3u * (input_count / 2u - 2u) : 0u;
|
||||||
|
+ default:
|
||||||
|
+ return input_count; /* LIST topologies: 1:1 mapping */
|
||||||
|
+ }
|
||||||
|
+}
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
+
|
||||||
|
#endif
|
||||||
|
diff -urN a/src/panfrost/vulkan/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/panvk_vX_cmd_draw.c
|
||||||
|
--- a/src/panfrost/vulkan/panvk_vX_cmd_draw.c 2026-05-21 14:04:02.528576354 +0200
|
||||||
|
+++ b/src/panfrost/vulkan/panvk_vX_cmd_draw.c 2026-05-21 14:04:04.091357598 +0200
|
||||||
|
@@ -727,6 +727,20 @@
|
||||||
|
/* iter13: VK_EXT_transform_feedback sysvals — always set (per draw),
|
||||||
|
* reflect bound XFB state. set_gfx_sysval is a no-op if value unchanged. */
|
||||||
|
set_gfx_sysval(cmdbuf, dirty_sysvals, vs.num_vertices, info->vertex.count);
|
||||||
|
+
|
||||||
|
+ /* iter17: XFB primitive-decomposition sysvals.
|
||||||
|
+ * xfb_topology = enum value for the current bound topology.
|
||||||
|
+ * xfb_output_count = per-instance output vertex count after decomposition.
|
||||||
|
+ * For LIST topologies, output_count == input vertex count and the shader
|
||||||
|
+ * takes the iter13 single-store fast path. */
|
||||||
|
+ {
|
||||||
|
+ VkPrimitiveTopology vk_topo =
|
||||||
|
+ cmdbuf->vk.dynamic_graphics_state.ia.primitive_topology;
|
||||||
|
+ uint32_t topo_enum = panvk_vk_topology_to_xfb_enum(vk_topo);
|
||||||
|
+ uint32_t out_count = panvk_xfb_output_count(vk_topo, info->vertex.count);
|
||||||
|
+ set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_topology, topo_enum);
|
||||||
|
+ set_gfx_sysval(cmdbuf, dirty_sysvals, vs.xfb_output_count, out_count);
|
||||||
|
+ }
|
||||||
|
{
|
||||||
|
const struct panvk_cmd_graphics_state *_gfx = &cmdbuf->state.gfx;
|
||||||
|
/* iter13: default each XFB buffer address to PAN_SHADER_OOB_ADDRESS
|
||||||
|
diff -urN a/src/panfrost/vulkan/panvk_vX_shader.c b/src/panfrost/vulkan/panvk_vX_shader.c
|
||||||
|
--- a/src/panfrost/vulkan/panvk_vX_shader.c 2026-05-21 14:04:02.527576494 +0200
|
||||||
|
+++ b/src/panfrost/vulkan/panvk_vX_shader.c 2026-05-21 14:04:04.098356619 +0200
|
||||||
|
@@ -895,7 +895,10 @@
|
||||||
|
nir->info.has_transform_feedback_varyings) {
|
||||||
|
NIR_PASS(_, nir, nir_opt_constant_folding);
|
||||||
|
NIR_PASS(_, nir, nir_io_add_intrinsic_xfb_info);
|
||||||
|
- NIR_PASS(_, nir, pan_nir_lower_xfb);
|
||||||
|
+ /* iter17: panvk-specific replacement for pan_nir_lower_xfb that handles
|
||||||
|
+ * primitive decomposition for non-LIST topologies. Single-store LIST
|
||||||
|
+ * fast path matches iter13 behavior. */
|
||||||
|
+ NIR_PASS(_, nir, panvk_per_arch(nir_lower_xfb));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
diff -urN a/src/panfrost/vulkan/panvk_vX_xfb_lower.c b/src/panfrost/vulkan/panvk_vX_xfb_lower.c
|
||||||
|
--- a/src/panfrost/vulkan/panvk_vX_xfb_lower.c 1970-01-01 01:00:00.000000000 +0100
|
||||||
|
+++ b/src/panfrost/vulkan/panvk_vX_xfb_lower.c 2026-05-21 14:04:04.115354242 +0200
|
||||||
|
@@ -0,0 +1,486 @@
|
||||||
|
+/*
|
||||||
|
+ * Copyright © 2026 mfritsche / claude-noether
|
||||||
|
+ * SPDX-License-Identifier: MIT
|
||||||
|
+ *
|
||||||
|
+ * iter17: panvk-specific replacement for pan_nir_lower_xfb that handles
|
||||||
|
+ * primitive decomposition for transform_feedback on non-LIST topologies
|
||||||
|
+ * (TRIANGLE_STRIP/FAN, LINE_STRIP, *_WITH_ADJACENCY).
|
||||||
|
+ *
|
||||||
|
+ * Approach: emit a topology dispatch at the start of each store_output
|
||||||
|
+ * lowering. The shader reads vs.xfb_topology sysval at runtime and branches
|
||||||
|
+ * into per-topology emission logic. For each affected topology, the lowered
|
||||||
|
+ * code emits guarded conditional stores — one per primitive this vertex
|
||||||
|
+ * contributes to, computing the output buffer position via primitive index
|
||||||
|
+ * and slot within the decomposed primitive.
|
||||||
|
+ *
|
||||||
|
+ * For LIST topologies (POINT/LINE/TRIANGLE LIST), takes a fast path that
|
||||||
|
+ * matches iter13's single-store behavior.
|
||||||
|
+ *
|
||||||
|
+ * For TRIANGLE_FAN, the central vertex (v=0) contributes to ALL primitives
|
||||||
|
+ * as slot 2 — handled via a NIR loop bounded by num_vertices.
|
||||||
|
+ *
|
||||||
|
+ * See ~/src/panvk-bifrost/iter17/phase{0,1,2}_*.md for full design context.
|
||||||
|
+ */
|
||||||
|
+
|
||||||
|
+#include "panvk_macros.h"
|
||||||
|
+
|
||||||
|
+#if PAN_ARCH < 9
|
||||||
|
+
|
||||||
|
+#include "panvk_shader.h"
|
||||||
|
+
|
||||||
|
+#include "compiler/nir/nir_builder.h"
|
||||||
|
+#include "pan_nir.h"
|
||||||
|
+
|
||||||
|
+#include <vulkan/vulkan_core.h>
|
||||||
|
+
|
||||||
|
+/* ----- Address arithmetic ----- */
|
||||||
|
+
|
||||||
|
+static nir_def *
|
||||||
|
+xfb_store_addr(nir_builder *b, nir_def *buf, nir_def *out_idx,
|
||||||
|
+ uint16_t stride, uint16_t offset_bytes)
|
||||||
|
+{
|
||||||
|
+ nir_def *byte_off = nir_iadd_imm(b,
|
||||||
|
+ nir_imul_imm(b, out_idx, stride), offset_bytes);
|
||||||
|
+ return nir_iadd(b, buf, nir_u2u64(b, byte_off));
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+static void
|
||||||
|
+emit_list_store(nir_builder *b, nir_def *buf, nir_def *output_count,
|
||||||
|
+ nir_def *instance_id, nir_def *raw_vid, nir_def *value,
|
||||||
|
+ uint16_t stride, uint16_t offset_bytes)
|
||||||
|
+{
|
||||||
|
+ nir_def *out_idx = nir_iadd(b,
|
||||||
|
+ nir_imul(b, instance_id, output_count), raw_vid);
|
||||||
|
+ nir_def *addr = xfb_store_addr(b, buf, out_idx, stride, offset_bytes);
|
||||||
|
+ nir_store_global(b, value, addr);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+static void
|
||||||
|
+emit_prim_store(nir_builder *b, nir_def *buf, nir_def *output_count,
|
||||||
|
+ nir_def *instance_id, nir_def *eligible,
|
||||||
|
+ nir_def *prim_idx, nir_def *slot,
|
||||||
|
+ uint32_t verts_per_prim,
|
||||||
|
+ nir_def *value, uint16_t stride, uint16_t offset_bytes)
|
||||||
|
+{
|
||||||
|
+ nir_push_if(b, eligible);
|
||||||
|
+ {
|
||||||
|
+ nir_def *out_idx = nir_iadd(b,
|
||||||
|
+ nir_imul(b, instance_id, output_count),
|
||||||
|
+ nir_iadd(b, nir_imul_imm(b, prim_idx, verts_per_prim), slot));
|
||||||
|
+ nir_def *addr = xfb_store_addr(b, buf, out_idx, stride, offset_bytes);
|
||||||
|
+ nir_store_global(b, value, addr);
|
||||||
|
+ }
|
||||||
|
+ nir_pop_if(b, NULL);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+/* ----- Per-topology emission ----- */
|
||||||
|
+
|
||||||
|
+/* TRIANGLE_STRIP: vertex v contributes to prims v, v-1, v-2 (per eligibility). */
|
||||||
|
+static void
|
||||||
|
+emit_tri_strip(nir_builder *b, nir_def *v, nir_def *N,
|
||||||
|
+ nir_def *buf, nir_def *output_count, nir_def *instance_id,
|
||||||
|
+ nir_def *value, uint16_t stride, uint16_t offset_bytes)
|
||||||
|
+{
|
||||||
|
+ nir_def *Nm2 = nir_iadd_imm(b, N, -2);
|
||||||
|
+ nir_def *Nm1 = nir_iadd_imm(b, N, -1);
|
||||||
|
+
|
||||||
|
+ /* Prim v, slot 0: v < N-2 */
|
||||||
|
+ emit_prim_store(b, buf, output_count, instance_id,
|
||||||
|
+ nir_ult(b, v, Nm2),
|
||||||
|
+ v, nir_imm_int(b, 0), 3, value, stride, offset_bytes);
|
||||||
|
+
|
||||||
|
+ /* Prim v-1, slot = 1 if prim even else 2: 1 <= v < N-1 */
|
||||||
|
+ {
|
||||||
|
+ nir_def *prim = nir_iadd_imm(b, v, -1);
|
||||||
|
+ nir_def *parity = nir_iand_imm(b, prim, 1u);
|
||||||
|
+ nir_def *slot = nir_iadd_imm(b, parity, 1);
|
||||||
|
+ nir_def *eligible = nir_iand(b,
|
||||||
|
+ nir_uge(b, v, nir_imm_int(b, 1)),
|
||||||
|
+ nir_ult(b, v, Nm1));
|
||||||
|
+ emit_prim_store(b, buf, output_count, instance_id, eligible,
|
||||||
|
+ prim, slot, 3, value, stride, offset_bytes);
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ /* Prim v-2, slot = 2 if prim even else 1: 2 <= v < N */
|
||||||
|
+ {
|
||||||
|
+ nir_def *prim = nir_iadd_imm(b, v, -2);
|
||||||
|
+ nir_def *parity = nir_iand_imm(b, prim, 1u);
|
||||||
|
+ nir_def *slot = nir_isub(b, nir_imm_int(b, 2), parity);
|
||||||
|
+ nir_def *eligible = nir_iand(b,
|
||||||
|
+ nir_uge(b, v, nir_imm_int(b, 2)),
|
||||||
|
+ nir_ult(b, v, N));
|
||||||
|
+ emit_prim_store(b, buf, output_count, instance_id, eligible,
|
||||||
|
+ prim, slot, 3, value, stride, offset_bytes);
|
||||||
|
+ }
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+/* LINE_STRIP: vertex v contributes to prim v slot 0 + prim v-1 slot 1. */
|
||||||
|
+static void
|
||||||
|
+emit_line_strip(nir_builder *b, nir_def *v, nir_def *N,
|
||||||
|
+ nir_def *buf, nir_def *output_count, nir_def *instance_id,
|
||||||
|
+ nir_def *value, uint16_t stride, uint16_t offset_bytes)
|
||||||
|
+{
|
||||||
|
+ nir_def *Nm1 = nir_iadd_imm(b, N, -1);
|
||||||
|
+
|
||||||
|
+ /* Prim v, slot 0: v < N-1 */
|
||||||
|
+ emit_prim_store(b, buf, output_count, instance_id,
|
||||||
|
+ nir_ult(b, v, Nm1),
|
||||||
|
+ v, nir_imm_int(b, 0), 2, value, stride, offset_bytes);
|
||||||
|
+
|
||||||
|
+ /* Prim v-1, slot 1: 1 <= v < N */
|
||||||
|
+ {
|
||||||
|
+ nir_def *prim = nir_iadd_imm(b, v, -1);
|
||||||
|
+ nir_def *eligible = nir_iand(b,
|
||||||
|
+ nir_uge(b, v, nir_imm_int(b, 1)),
|
||||||
|
+ nir_ult(b, v, N));
|
||||||
|
+ emit_prim_store(b, buf, output_count, instance_id, eligible,
|
||||||
|
+ prim, nir_imm_int(b, 1), 2, value, stride, offset_bytes);
|
||||||
|
+ }
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+/* TRIANGLE_FAN: prim p emits {p+1, p+2, 0}.
|
||||||
|
+ * vertex v=0: contributes to ALL prims as slot 2 (loop required)
|
||||||
|
+ * vertex v>=1: contributes to prim v-1 as slot 0 (if 1 <= v <= N-2)
|
||||||
|
+ * vertex v>=2: contributes to prim v-2 as slot 1 (if 2 <= v <= N-1)
|
||||||
|
+ */
|
||||||
|
+static void
|
||||||
|
+emit_tri_fan(nir_builder *b, nir_def *v, nir_def *N,
|
||||||
|
+ nir_def *buf, nir_def *output_count, nir_def *instance_id,
|
||||||
|
+ nir_def *value, uint16_t stride, uint16_t offset_bytes)
|
||||||
|
+{
|
||||||
|
+ nir_def *Nm1 = nir_iadd_imm(b, N, -1);
|
||||||
|
+ nir_def *Nm2 = nir_iadd_imm(b, N, -2);
|
||||||
|
+
|
||||||
|
+ /* Prim v-1, slot 0: 1 <= v < N-1 */
|
||||||
|
+ {
|
||||||
|
+ nir_def *prim = nir_iadd_imm(b, v, -1);
|
||||||
|
+ nir_def *eligible = nir_iand(b,
|
||||||
|
+ nir_uge(b, v, nir_imm_int(b, 1)),
|
||||||
|
+ nir_ult(b, v, Nm1));
|
||||||
|
+ emit_prim_store(b, buf, output_count, instance_id, eligible,
|
||||||
|
+ prim, nir_imm_int(b, 0), 3, value, stride, offset_bytes);
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ /* Prim v-2, slot 1: 2 <= v < N */
|
||||||
|
+ {
|
||||||
|
+ nir_def *prim = nir_iadd_imm(b, v, -2);
|
||||||
|
+ nir_def *eligible = nir_iand(b,
|
||||||
|
+ nir_uge(b, v, nir_imm_int(b, 2)),
|
||||||
|
+ nir_ult(b, v, N));
|
||||||
|
+ emit_prim_store(b, buf, output_count, instance_id, eligible,
|
||||||
|
+ prim, nir_imm_int(b, 1), 3, value, stride, offset_bytes);
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ /* Central vertex (v == 0): loop over all prims, write to slot 2. */
|
||||||
|
+ nir_push_if(b, nir_ieq_imm(b, v, 0));
|
||||||
|
+ {
|
||||||
|
+ nir_variable *p_var = nir_local_variable_create(b->impl,
|
||||||
|
+ glsl_uint_type(), "fan_p");
|
||||||
|
+ nir_store_var(b, p_var, nir_imm_int(b, 0), 0x1);
|
||||||
|
+ nir_push_loop(b);
|
||||||
|
+ {
|
||||||
|
+ nir_def *p = nir_load_var(b, p_var);
|
||||||
|
+ nir_push_if(b, nir_uge(b, p, Nm2));
|
||||||
|
+ {
|
||||||
|
+ nir_jump(b, nir_jump_break);
|
||||||
|
+ }
|
||||||
|
+ nir_pop_if(b, NULL);
|
||||||
|
+
|
||||||
|
+ nir_def *out_idx = nir_iadd(b,
|
||||||
|
+ nir_imul(b, instance_id, output_count),
|
||||||
|
+ nir_iadd_imm(b, nir_imul_imm(b, p, 3), 2));
|
||||||
|
+ nir_def *addr = xfb_store_addr(b, buf, out_idx, stride, offset_bytes);
|
||||||
|
+ nir_store_global(b, value, addr);
|
||||||
|
+
|
||||||
|
+ nir_store_var(b, p_var, nir_iadd_imm(b, p, 1), 0x1);
|
||||||
|
+ }
|
||||||
|
+ nir_pop_loop(b, NULL);
|
||||||
|
+ }
|
||||||
|
+ nir_pop_if(b, NULL);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+/* LINE_LIST_WITH_ADJACENCY: 4-vertex groups [4i..4i+3]; output {4i+1, 4i+2}.
|
||||||
|
+ * v contributes if v%4 == 1: prim v/4 slot 0 (only if v/4 < N/4)
|
||||||
|
+ * v contributes if v%4 == 2: prim v/4 slot 1 (only if v/4 < N/4)
|
||||||
|
+ */
|
||||||
|
+static void
|
||||||
|
+emit_line_list_adj(nir_builder *b, nir_def *v, nir_def *N,
|
||||||
|
+ nir_def *buf, nir_def *output_count, nir_def *instance_id,
|
||||||
|
+ nir_def *value, uint16_t stride, uint16_t offset_bytes)
|
||||||
|
+{
|
||||||
|
+ /* Vertices past the last full group of 4 form no primitive: never store them. */
|
||||||
|
+ nir_def *vmod4 = nir_iand_imm(b, v, 3u);
|
||||||
|
+ nir_def *prim = nir_ushr_imm(b, v, 2); /* v / 4 */
|
||||||
|
+ nir_def *complete = nir_ult(b, prim, nir_ushr_imm(b, N, 2)); /* prim < N/4 */
|
||||||
|
+ emit_prim_store(b, buf, output_count, instance_id,
|
||||||
|
+ nir_iand(b, nir_ieq_imm(b, vmod4, 1), complete),
|
||||||
|
+ prim, nir_imm_int(b, 0), 2, value, stride, offset_bytes);
|
||||||
|
+
|
||||||
|
+ emit_prim_store(b, buf, output_count, instance_id,
|
||||||
|
+ nir_iand(b, nir_ieq_imm(b, vmod4, 2), complete),
|
||||||
|
+ prim, nir_imm_int(b, 1), 2, value, stride, offset_bytes);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+/* LINE_STRIP_WITH_ADJACENCY: N vertices give N-3 prims; prim p emits {p+1, p+2}.
|
||||||
|
+ * v contributes to prim v-1 slot 0 (1 <= v <= N-3)
|
||||||
|
+ * v contributes to prim v-2 slot 1 (2 <= v <= N-2)
|
||||||
|
+ */
|
||||||
|
+static void
|
||||||
|
+emit_line_strip_adj(nir_builder *b, nir_def *v, nir_def *N,
|
||||||
|
+ nir_def *buf, nir_def *output_count, nir_def *instance_id,
|
||||||
|
+ nir_def *value, uint16_t stride, uint16_t offset_bytes)
|
||||||
|
+{
|
||||||
|
+ nir_def *Nm1 = nir_iadd_imm(b, N, -1);
|
||||||
|
+ nir_def *Nm2 = nir_iadd_imm(b, N, -2);
|
||||||
|
+
|
||||||
|
+ /* Prim v-1, slot 0: prim < N-3 ⇔ 1 <= v <= N-3 ⇔ v >= 1 AND v < N-2 */
|
||||||
|
+ {
|
||||||
|
+ nir_def *prim = nir_iadd_imm(b, v, -1);
|
||||||
|
+ nir_def *eligible = nir_iand(b,
|
||||||
|
+ nir_uge(b, v, nir_imm_int(b, 1)),
|
||||||
|
+ nir_ult(b, v, Nm2));
|
||||||
|
+ /* Caller gates on output_count > 0, i.e. N >= 4, so N-2 cannot wrap. */
|
||||||
|
+ emit_prim_store(b, buf, output_count, instance_id, eligible,
|
||||||
|
+ prim, nir_imm_int(b, 0), 2, value, stride, offset_bytes);
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ /* Prim v-2, slot 1: prim < N-3 ⇔ 2 <= v <= N-2 ⇔ v >= 2 AND v < N-1 */
|
||||||
|
+ {
|
||||||
|
+ nir_def *prim = nir_iadd_imm(b, v, -2);
|
||||||
|
+ nir_def *eligible = nir_iand(b,
|
||||||
|
+ nir_uge(b, v, nir_imm_int(b, 2)),
|
||||||
|
+ nir_ult(b, v, Nm1));
|
||||||
|
+ emit_prim_store(b, buf, output_count, instance_id, eligible,
|
||||||
|
+ prim, nir_imm_int(b, 1), 2, value, stride, offset_bytes);
|
||||||
|
+ }
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+/* TRIANGLE_LIST_WITH_ADJACENCY: 6-vertex groups; output {6i, 6i+2, 6i+4}.
|
||||||
|
+ * v contributes if v%6 == 0: prim v/6 slot 0 (only if v/6 < N/6)
|
||||||
|
+ * v contributes if v%6 == 2: prim v/6 slot 1 (only if v/6 < N/6)
|
||||||
|
+ * v contributes if v%6 == 4: prim v/6 slot 2 (only if v/6 < N/6)
|
||||||
|
+ */
|
||||||
|
+static void
|
||||||
|
+emit_tri_list_adj(nir_builder *b, nir_def *v, nir_def *N,
|
||||||
|
+ nir_def *buf, nir_def *output_count, nir_def *instance_id,
|
||||||
|
+ nir_def *value, uint16_t stride, uint16_t offset_bytes)
|
||||||
|
+{
|
||||||
|
+ /* Vertices past the last full group of 6 form no primitive: never store them. */
|
||||||
|
+ nir_def *vmod6 = nir_umod_imm(b, v, 6);
|
||||||
|
+ nir_def *prim = nir_udiv_imm(b, v, 6);
|
||||||
|
+ nir_def *complete = nir_ult(b, prim, nir_udiv_imm(b, N, 6)); /* prim < N/6 */
|
||||||
|
+ for (uint32_t slot = 0; slot < 3; slot++) {
|
||||||
|
+ emit_prim_store(b, buf, output_count, instance_id,
|
||||||
|
+ nir_iand(b, nir_ieq_imm(b, vmod6, slot * 2), complete),
|
||||||
|
+ prim, nir_imm_int(b, slot), 3, value, stride, offset_bytes);
|
||||||
|
+ }
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+/* TRIANGLE_STRIP_WITH_ADJACENCY: prim i emits:
|
||||||
|
+ * even i: {2i, 2i+2, 2i+4} (slots 0, 1, 2 ← input indices 2i, 2i+2, 2i+4)
|
||||||
|
+ * odd i: {2i, 2i+4, 2i+2} (slots 0, 1, 2 ← input indices 2i, 2i+4, 2i+2)
|
||||||
|
+ *
|
||||||
|
+ * Only EVEN input vertices contribute (since all output indices are 2*something).
|
||||||
|
+ * For even input v:
|
||||||
|
+ * prim v/2 slot 0 (always, if v/2 < N/2-2)
|
||||||
|
+ * prim (v-2)/2 slot 1 if (v-2)/2 even, slot 2 if odd (when v >= 2)
|
||||||
|
+ * prim (v-4)/2 slot 2 if (v-4)/2 even, slot 1 if odd (when v >= 4)
|
||||||
|
+ */
|
||||||
|
+static void
|
||||||
|
+emit_tri_strip_adj(nir_builder *b, nir_def *v, nir_def *N,
|
||||||
|
+ nir_def *buf, nir_def *output_count, nir_def *instance_id,
|
||||||
|
+ nir_def *value, uint16_t stride, uint16_t offset_bytes)
|
||||||
|
+{
|
||||||
|
+ /* Bail for odd input vertices — they never contribute. */
|
||||||
|
+ nir_def *v_is_even = nir_ieq_imm(b, nir_iand_imm(b, v, 1u), 0);
|
||||||
|
+ nir_push_if(b, v_is_even);
|
||||||
|
+ {
|
||||||
|
+ nir_def *N_half = nir_ushr_imm(b, N, 1);
|
||||||
|
+ nir_def *max_prim = nir_iadd_imm(b, N_half, -2); /* N/2 - 2 */
|
||||||
|
+ nir_def *v_half = nir_ushr_imm(b, v, 1);
|
||||||
|
+
|
||||||
|
+ /* Prim v/2 slot 0: v/2 < N/2 - 2 */
|
||||||
|
+ emit_prim_store(b, buf, output_count, instance_id,
|
||||||
|
+ nir_ult(b, v_half, max_prim),
|
||||||
|
+ v_half, nir_imm_int(b, 0), 3, value, stride, offset_bytes);
|
||||||
|
+
|
||||||
|
+ /* Prim (v-2)/2 = v/2 - 1: v >= 2 AND prim < N/2-2 */
|
||||||
|
+ {
|
||||||
|
+ nir_def *prim = nir_iadd_imm(b, v_half, -1);
|
||||||
|
+ nir_def *parity = nir_iand_imm(b, prim, 1u);
|
||||||
|
+ nir_def *slot = nir_iadd_imm(b, parity, 1); /* even→1, odd→2 */
|
||||||
|
+ nir_def *eligible = nir_iand(b,
|
||||||
|
+ nir_uge(b, v, nir_imm_int(b, 2)),
|
||||||
|
+ nir_ult(b, prim, max_prim));
|
||||||
|
+ emit_prim_store(b, buf, output_count, instance_id, eligible,
|
||||||
|
+ prim, slot, 3, value, stride, offset_bytes);
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ /* Prim (v-4)/2 = v/2 - 2: v >= 4 AND prim < N/2-2 */
|
||||||
|
+ {
|
||||||
|
+ nir_def *prim = nir_iadd_imm(b, v_half, -2);
|
||||||
|
+ nir_def *parity = nir_iand_imm(b, prim, 1u);
|
||||||
|
+ nir_def *slot = nir_isub(b, nir_imm_int(b, 2), parity); /* even→2, odd→1 */
|
||||||
|
+ nir_def *eligible = nir_iand(b,
|
||||||
|
+ nir_uge(b, v, nir_imm_int(b, 4)),
|
||||||
|
+ nir_ult(b, prim, max_prim));
|
||||||
|
+ emit_prim_store(b, buf, output_count, instance_id, eligible,
|
||||||
|
+ prim, slot, 3, value, stride, offset_bytes);
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+ nir_pop_if(b, NULL);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+/* ----- Main lowering: per store_output XFB channel ----- */
|
||||||
|
+
|
||||||
|
+static void
|
||||||
|
+lower_xfb_output_iter17(nir_builder *b, nir_intrinsic_instr *intr,
|
||||||
|
+ unsigned channel_idx, unsigned num_components,
|
||||||
|
+ unsigned buffer, unsigned offset_words)
|
||||||
|
+{
|
||||||
|
+ assert(buffer < MAX_XFB_BUFFERS);
|
||||||
|
+ assert(nir_intrinsic_component(intr) == 0);
|
||||||
|
+
|
||||||
|
+ uint16_t stride = b->shader->info.xfb_stride[buffer] * 4;
|
||||||
|
+ assert(stride != 0);
|
||||||
|
+ uint16_t offset_bytes = offset_words * 4;
|
||||||
|
+
|
||||||
|
+ BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
|
||||||
|
+ BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
|
||||||
|
+
|
||||||
|
+ nir_def *topology = load_sysval(b, graphics, 32, vs.xfb_topology);
|
||||||
|
+ nir_def *out_count = load_sysval(b, graphics, 32, vs.xfb_output_count);
|
||||||
|
+ nir_def *N = nir_load_num_vertices(b);
|
||||||
|
+ nir_def *v = nir_load_raw_vertex_id_pan(b);
|
||||||
|
+ nir_def *instance = nir_load_instance_id(b);
|
||||||
|
+ nir_def *buf = nir_load_xfb_address(b, 64, .base = buffer);
|
||||||
|
+
|
||||||
|
+ nir_def *src = intr->src[0].ssa;
|
||||||
|
+ nir_component_mask_t mask = nir_component_mask(num_components);
|
||||||
|
+ nir_def *value = nir_channels(b, src, mask << channel_idx);
|
||||||
|
+
|
||||||
|
+ /* Topology dispatch ladder. LIST first (fast path). */
|
||||||
|
+ nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LIST));
|
||||||
|
+ {
|
||||||
|
+ emit_list_store(b, buf, out_count, instance, v, value,
|
||||||
|
+ stride, offset_bytes);
|
||||||
|
+ }
|
||||||
|
+ nir_push_else(b, NULL);
|
||||||
|
+ {
|
||||||
|
+ /* iter17 Janet Finding 3: gate all non-LIST emission on
|
||||||
|
+ * output_count > 0. For degenerate input counts (N < min required
|
||||||
|
+ * for the topology), output_count is 0 and we must emit NO stores
|
||||||
|
+ * — otherwise N-2 / N-3 / etc. arithmetic underflows in the
|
||||||
|
+ * eligibility predicates and we falsely fire stores. */
|
||||||
|
+ nir_push_if(b, nir_ult(b, nir_imm_int(b, 0), out_count));
|
||||||
|
+ {
|
||||||
|
+ nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_TRI_STRIP));
|
||||||
|
+ {
|
||||||
|
+ emit_tri_strip(b, v, N, buf, out_count, instance, value,
|
||||||
|
+ stride, offset_bytes);
|
||||||
|
+ }
|
||||||
|
+ nir_push_else(b, NULL);
|
||||||
|
+ {
|
||||||
|
+ nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LINE_STRIP));
|
||||||
|
+ {
|
||||||
|
+ emit_line_strip(b, v, N, buf, out_count, instance, value,
|
||||||
|
+ stride, offset_bytes);
|
||||||
|
+ }
|
||||||
|
+ nir_push_else(b, NULL);
|
||||||
|
+ {
|
||||||
|
+ nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_TRI_FAN));
|
||||||
|
+ {
|
||||||
|
+ emit_tri_fan(b, v, N, buf, out_count, instance, value,
|
||||||
|
+ stride, offset_bytes);
|
||||||
|
+ }
|
||||||
|
+ nir_push_else(b, NULL);
|
||||||
|
+ {
|
||||||
|
+ nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LINE_LIST_ADJ));
|
||||||
|
+ {
|
||||||
|
+ emit_line_list_adj(b, v, N, buf, out_count, instance, value,
|
||||||
|
+ stride, offset_bytes);
|
||||||
|
+ }
|
||||||
|
+ nir_push_else(b, NULL);
|
||||||
|
+ {
|
||||||
|
+ nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_LINE_STRIP_ADJ));
|
||||||
|
+ {
|
||||||
|
+ emit_line_strip_adj(b, v, N, buf, out_count, instance, value,
|
||||||
|
+ stride, offset_bytes);
|
||||||
|
+ }
|
||||||
|
+ nir_push_else(b, NULL);
|
||||||
|
+ {
|
||||||
|
+ nir_push_if(b, nir_ieq_imm(b, topology, PANVK_XFB_TOPO_TRI_LIST_ADJ));
|
||||||
|
+ {
|
||||||
|
+ emit_tri_list_adj(b, v, N, buf, out_count, instance, value,
|
||||||
|
+ stride, offset_bytes);
|
||||||
|
+ }
|
||||||
|
+ nir_push_else(b, NULL);
|
||||||
|
+ {
|
||||||
|
+ /* TRI_STRIP_ADJ — last case */
|
||||||
|
+ emit_tri_strip_adj(b, v, N, buf, out_count, instance, value,
|
||||||
|
+ stride, offset_bytes);
|
||||||
|
+ }
|
||||||
|
+ nir_pop_if(b, NULL);
|
||||||
|
+ }
|
||||||
|
+ nir_pop_if(b, NULL);
|
||||||
|
+ }
|
||||||
|
+ nir_pop_if(b, NULL);
|
||||||
|
+ }
|
||||||
|
+ nir_pop_if(b, NULL);
|
||||||
|
+ }
|
||||||
|
+ nir_pop_if(b, NULL);
|
||||||
|
+ }
|
||||||
|
+ nir_pop_if(b, NULL);
|
||||||
|
+ }
|
||||||
|
+ nir_pop_if(b, NULL); /* Janet Finding 3: close output_count > 0 guard */
|
||||||
|
+ }
|
||||||
|
+ nir_pop_if(b, NULL);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+/* Mirror of pan_nir_lower_xfb's lower_xfb: load_vertex_id rewrite +
|
||||||
|
+ * dispatch store_output through our topology-aware emission. */
|
||||||
|
+static bool
|
||||||
|
+lower_xfb_iter17(nir_builder *b, nir_intrinsic_instr *intr,
|
||||||
|
+ UNUSED void *data)
|
||||||
|
+{
|
||||||
|
+ if (intr->intrinsic == nir_intrinsic_load_vertex_id) {
|
||||||
|
+ b->cursor = nir_instr_remove(&intr->instr);
|
||||||
|
+ nir_def *repl = nir_iadd(b, nir_load_raw_vertex_id_pan(b),
|
||||||
|
+ nir_load_raw_vertex_offset_pan(b));
|
||||||
|
+ nir_def_rewrite_uses(&intr->def, repl);
|
||||||
|
+ return true;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ if (intr->intrinsic != nir_intrinsic_store_output)
|
||||||
|
+ return false;
|
||||||
|
+
|
||||||
|
+ bool progress = false;
|
||||||
|
+ b->cursor = nir_before_instr(&intr->instr);
|
||||||
|
+
|
||||||
|
+ /* io_xfb has only out[0,1]; the other 2 channels are in io_xfb2.
|
||||||
|
+ * Outer loop selects which annotation; inner picks which channel. */
|
||||||
|
+ for (unsigned i = 0; i < 2; ++i) {
|
||||||
|
+ nir_io_xfb xfb = i ? nir_intrinsic_io_xfb2(intr)
|
||||||
|
+ : nir_intrinsic_io_xfb(intr);
|
||||||
|
+ for (unsigned j = 0; j < 2; ++j) {
|
||||||
|
+ if (!xfb.out[j].num_components)
|
||||||
|
+ continue;
|
||||||
|
+ lower_xfb_output_iter17(b, intr, i * 2 + j, xfb.out[j].num_components,
|
||||||
|
+ xfb.out[j].buffer, xfb.out[j].offset);
|
||||||
|
+ progress = true;
|
||||||
|
+ }
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ if (progress)
|
||||||
|
+ nir_instr_remove(&intr->instr);
|
||||||
|
+ return progress;
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+bool
|
||||||
|
+panvk_per_arch(nir_lower_xfb)(nir_shader *nir)
|
||||||
|
+{
|
||||||
|
+ /* The lowering inserts ifs and a loop (new CFG), so no metadata is preserved. */
|
||||||
|
+ return nir_shader_intrinsics_pass(nir, lower_xfb_iter17, nir_metadata_none, NULL);
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+#endif /* PAN_ARCH < 9 */
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,181 @@
|
|||||||
|
# Maintainer: Markus Fritsche <fritsche.markus@gmail.com>
|
||||||
|
#
|
||||||
|
# mesa-panvk-bifrost-video — sibling of mesa-panvk-bifrost (r4) that adds
|
||||||
|
# VK_KHR_video_decode_h264 on Mali Bifrost SBCs (PAN_ARCH 6/7) backed by
|
||||||
|
# the SoC's V4L2-stateless hantro VPU (RK3566/RK3568).
|
||||||
|
#
|
||||||
|
# Campaign: ~/src/panvk-bifrost-video/ — Phase 4 byte-exact validated
|
||||||
|
# 2026-05-21 (48/48 BBB display frames match ffmpeg+libva-v4l2-request-
|
||||||
|
# fourier byte-for-byte on the same hantro). Phase 5 second-model review
|
||||||
|
# completed; load-bearing findings (output_map OOB, static counter,
|
||||||
|
# session_init unwind, probe_hantro gate) all applied.
|
||||||
|
#
|
||||||
|
# What it does (on top of r4):
|
||||||
|
# - 0001..0004: inherited from mesa-panvk-bifrost (robustness2/null-
|
||||||
|
# descriptor, vk1.1/1.2 advertisement, EXT_transform_feedback, XFB
|
||||||
|
# primitive decomposition) — symlinked from the r4 package directory
|
||||||
|
# so the patches don't drift between siblings.
|
||||||
|
# - 0005: VK_KHR_video_queue + VK_KHR_video_decode_queue +
|
||||||
|
# VK_KHR_video_decode_h264 backed by V4L2-stateless hantro.
|
||||||
|
# Touches 14 files in src/panfrost/vulkan/; full diff in
|
||||||
|
# 0005-panvk-bifrost-video-KHR-video-decode-h264.patch.
|
||||||
|
#
|
||||||
|
# Co-existence:
|
||||||
|
# - Installs to /usr/lib/panvk-bifrost-video/ (parallel to r4's
|
||||||
|
# /usr/lib/panvk-bifrost/). Pick at runtime via VK_ICD_FILENAMES.
|
||||||
|
# - r4 stays the recommended default for the Chromium-GPU-process
|
||||||
|
# consumer (no video needed there). Use this package when the
|
||||||
|
# consumer wants Vulkan video decode (mpv-fourier, ffmpeg-vulkan,
|
||||||
|
# future Chromium-VulkanVideoDecoder).
|
||||||
|
#
|
||||||
|
# Phase 1 limitations to know about (documented in source comments):
|
||||||
|
# - Single video session per device (active_video singleton)
|
||||||
|
# - Synchronous decode at record time — no pipelining yet
|
||||||
|
# - Hardcoded /dev/video1 + /dev/media0 (matches RK3566/68, blocks
|
||||||
|
# other SoCs without a topology-walk port)
|
||||||
|
# - Bitstream source buffer assumed HOST_VISIBLE (true on panvk-
|
||||||
|
# bifrost, would need fallback on other backends)
|
||||||
|
#
|
||||||
|
# Build target: arch-aarch64 runner via marfrit-packages Gitea Actions.
|
||||||
|
# Mesa build is slow (~30-60min on Cortex-A55).
|
||||||
|
|
||||||
|
# --- Package identity -------------------------------------------------------
pkgname=mesa-panvk-bifrost-video
_mesaver=26.0.6                 # upstream Mesa release the patches apply to
pkgver=26.0.6.r5.video1
pkgrel=1
pkgdesc="Patched Mesa libvulkan_panfrost.so adding VK_KHR_video_decode_h264 on Bifrost SBCs (sibling of mesa-panvk-bifrost-r4)"
arch=('aarch64')
url="https://git.reauktion.de/marfrit/panvk-bifrost"
license=('MIT')

# --- Runtime dependencies ---------------------------------------------------
depends=(
  'mesa'   # for shared mesa runtime libs
  'libdrm'
  'wayland'
  'libxcb'
  'libx11'
  'libxshmfence'
  'zlib'
  'zstd'
  'libelf'
  'libffi'
  'expat'
  'llvm-libs'
  'lm_sensors'
)

# --- Build-only dependencies ------------------------------------------------
# libdrm is intentionally not repeated here: it is already in depends=(),
# and makepkg installs depends=() before building.
makedepends=(
  'meson'
  'ninja'
  'glslang'
  'python-mako'
  'python-packaging'
  'wayland-protocols'
  'libxrandr'
  'xorgproto'
  'llvm'
  'libclc'
  'spirv-llvm-translator'
  'spirv-tools'
  'rust-bindgen'
  'patch'
)

# --- Sources ----------------------------------------------------------------
# Order matters: sha256sums=() below is positional and must stay in step.
source=(
  "https://archive.mesa3d.org/mesa-${_mesaver}.tar.xz"
  "0001-panvk-expose-robustness2-nullDescriptor-bifrost.patch"
  "0002-panvk-expose-vulkan-1.1-1.2-on-bifrost.patch"
  "0003-panvk-bifrost-vk-ext-transform-feedback.patch"
  "0004-panvk-bifrost-xfb-primitive-decomposition.patch"
  "0005-panvk-bifrost-video-KHR-video-decode-h264.patch"
  "icd.json"
)
# Mesa tarball checksum matches the sibling r4 package — same upstream version.
# NOTE(review): 'SKIP' on the first entry means the downloaded tarball is not
# integrity-checked by makepkg; consider pinning the real sha256 here.
sha256sums=(
  'SKIP'   # mesa tarball — co-trust w/ r4 sibling
  'SKIP'   # patches are local
  'SKIP'
  'SKIP'
  'SKIP'
  'SKIP'
  'SKIP'   # icd.json
)
|
||||||
|
|
||||||
|
# Patch the unpacked Mesa tree and verify that every expected change landed.
#
# Globals:   _mesaver (read), srcdir (read)
# Returns:   0 when all edits/patches applied and all sanity checks pass;
#            1 (with a message on stderr naming the failing step) otherwise.
#
# Every step fails explicitly instead of relying on the caller running with
# errexit, so a missing marker or file is reported by name.
prepare() {
  cd "mesa-${_mesaver}" || return 1

  local vk=src/panfrost/vulkan
  local physdev="${vk}/panvk_vX_physical_device.c"
  local item

  # r1+r2: small sed-based edits inherited from r4 (verbatim from the
  # sibling PKGBUILD — keep in sync).
  sed -i \
    -e 's|\.KHR_robustness2 = PAN_ARCH >= 10,|.KHR_robustness2 = true,|' \
    -e 's|\.EXT_robustness2 = PAN_ARCH >= 10,|.EXT_robustness2 = true,|' \
    -e 's|\.nullDescriptor = PAN_ARCH >= 10,|.nullDescriptor = true,|' \
    -e 's|bool has_vk1_1 = PAN_ARCH >= 10;|bool has_vk1_1 = true;|' \
    -e 's|bool has_vk1_2 = PAN_ARCH >= 10;|bool has_vk1_2 = true;|' \
    "${physdev}" || return 1

  # r3: EXT_transform_feedback for Bifrost.
  # r4: XFB primitive decomposition NIR pass.
  # video: VK_KHR_video_decode_h264 via V4L2-hantro.
  for item in \
    0003-panvk-bifrost-vk-ext-transform-feedback.patch \
    0004-panvk-bifrost-xfb-primitive-decomposition.patch \
    0005-panvk-bifrost-video-KHR-video-decode-h264.patch
  do
    if ! patch -p1 < "${srcdir}/${item}"; then
      printf 'prepare: failed to apply %s\n' "${item}" >&2
      return 1
    fi
  done

  # Markers that must be present in the physical-device source afterwards
  # (r1..r4 inherited edits, then the video-queue gate). Matched literally.
  local -a physdev_markers=(
    'KHR_robustness2 = true,'
    'EXT_robustness2 = true,'
    'nullDescriptor = true,'
    'has_vk1_1 = true;'
    'has_vk1_2 = true;'
    'EXT_transform_feedback = PAN_ARCH < 9,'
    'KHR_video_queue = PAN_ARCH < 9 && panvk_v4l2_probe_hantro()'
  )
  for item in "${physdev_markers[@]}"; do
    if ! grep -qF -- "${item}" "${physdev}"; then
      printf 'prepare: sanity check failed: "%s" not found in %s\n' \
        "${item}" "${physdev}" >&2
      return 1
    fi
  done

  # Markers in other files, encoded as "<file under $vk>:<literal text>".
  local -a other_markers=(
    'panvk_vX_shader.c:panvk_per_arch(nir_lower_xfb)'
    'panvk_device.h:PANVK_QUEUE_FAMILY_VIDEO_DECODE'
    'meson.build:panvk_v4l2_h264_slice_header.c'
    'panvk_vX_device.c:panvk_video_queue_submit_noop'
  )
  for item in "${other_markers[@]}"; do
    if ! grep -qF -- "${item#*:}" "${vk}/${item%%:*}"; then
      printf 'prepare: sanity check failed: "%s" not found in %s\n' \
        "${item#*:}" "${vk}/${item%%:*}" >&2
      return 1
    fi
  done

  # Files the r3/r4/video patches are expected to have created.
  local -a new_files=(
    jm/panvk_vX_cmd_xfb.c
    panvk_vX_xfb_lower.c
    panvk_video_decode.c
    panvk_video_decode.h
    panvk_v4l2.c
    panvk_v4l2_h264.c
    panvk_v4l2_h264_slice_header.c
    panvk_v4l2_h264_slice_header.h
  )
  for item in "${new_files[@]}"; do
    if [[ ! -f "${vk}/${item}" ]]; then
      printf 'prepare: sanity check failed: missing file %s\n' \
        "${vk}/${item}" >&2
      return 1
    fi
  done
}
|
||||||
|
|
||||||
|
# Configure and compile the Vulkan-only panfrost driver inside the Mesa tree.
# Globals: _mesaver (read)
build() {
  # Mirror r4's narrow build profile.
  local -a meson_opts=(
    --prefix=/usr
    --libdir=lib
    --buildtype=release
    -Dvulkan-drivers=panfrost
    -Dgallium-drivers=
    -Dplatforms=wayland,x11
    -Dglx=disabled
    -Degl=disabled
    -Dgles1=disabled
    -Dgles2=disabled
    -Dvulkan-layers=
    -Dtools=
    -Dgallium-rusticl=false
    -Dmicrosoft-clc=disabled
  )

  cd "mesa-${_mesaver}"
  meson setup build/ "${meson_opts[@]}"
  meson compile -C build
}
|
||||||
|
|
||||||
|
# Stage the built driver and its ICD manifest under $pkgdir.
# Globals: srcdir, pkgdir, _mesaver (all read)
package() {
  # Co-install path — parallel to r4's /usr/lib/panvk-bifrost/.
  local dest="$pkgdir/usr/lib/panvk-bifrost-video"

  cd "${srcdir}/mesa-${_mesaver}"

  install -Dm755 build/src/panfrost/vulkan/libvulkan_panfrost.so \
    "$dest/libvulkan_panfrost.so"

  # ICD JSON pointing at the video build. Opt-in via VK_ICD_FILENAMES;
  # NOT in /usr/share/vulkan/icd.d/ so it doesn't override stock or r4.
  install -Dm644 "$srcdir/icd.json" "$dest/icd.json"
}
|
||||||
@@ -0,0 +1,40 @@
|
|||||||
|
# mesa-panvk-bifrost-video
|
||||||
|
|
||||||
|
Patched Mesa `libvulkan_panfrost.so` that **adds `VK_KHR_video_decode_h264`** on Mali Bifrost SBCs (PAN_ARCH 6/7, RK3566/RK3568 class hardware), backed by the SoC's V4L2-stateless **hantro** VPU.
|
||||||
|
|
||||||
|
This is a **sibling** of [mesa-panvk-bifrost](../mesa-panvk-bifrost/) (the r4 package that exposes Bifrost to Chromium's Vulkan compositor). Pick this one when the consumer wants Vulkan **video decode** in addition; pick r4 for compositor-only.
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Phase 4 byte-exact validated 2026-05-21: 48/48 unique BBB display frames decoded by this package are byte-identical to `ffmpeg+libva-v4l2-request-fourier` running on the same hantro hardware. Phase 5 second-model review completed; all load-bearing findings addressed. First publish via marfrit-packages CI 2026-05-22 (PR #79 merge did not auto-fire Actions; this re-trigger restores the standard build/sign/publish path).
|
||||||
|
|
||||||
|
## How to use
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# Co-installs alongside r4 and stock mesa.
|
||||||
|
sudo pacman -S mesa-panvk-bifrost-video
|
||||||
|
|
||||||
|
# Opt in (not on the default loader search path).
|
||||||
|
export VK_ICD_FILENAMES=/usr/lib/panvk-bifrost-video/icd.json
|
||||||
|
export PAN_I_WANT_A_BROKEN_VULKAN_DRIVER=1 # mesa-upstream gate
|
||||||
|
|
||||||
|
# Run a Vulkan video consumer.
|
||||||
|
vulkan-video-dec-simple-test -i your.h264 --codec h264 --noPresent --maxFrameCount 50
|
||||||
|
# or
|
||||||
|
ffmpeg -hwaccel vulkan -i your.mp4 ...
|
||||||
|
```
|
||||||
|
|
||||||
|
## Phase 1 limitations
|
||||||
|
|
||||||
|
Documented in source comments and worth knowing before relying on this in production:
|
||||||
|
|
||||||
|
- **Single video session per device.** Concurrent `VkVideoSessionKHR` on the same device clobber each other (`active_video` singleton). Sufficient for current single-stream consumers.
|
||||||
|
- **Synchronous decode at record time.** The full V4L2 ioctl dance runs to completion inside `vkCmdDecodeVideoKHR`. No pipelining. Throughput is bounded by hantro's ~1.16× realtime on 1080p H.264.
|
||||||
|
- **Hardcoded `/dev/video1` + `/dev/media0`.** Matches RK3566/68 but won't work on other SoCs without a topology-walk port (see `libva-v4l2-request-fourier` for the full version).
|
||||||
|
- **Bitstream source buffer assumed HOST_VISIBLE.** True on panvk-bifrost (no DEVICE_LOCAL-only memory types exist), but the code silently skips decode if the app bound the buffer to non-host-visible memory.
|
||||||
|
|
||||||
|
## Co-existence
|
||||||
|
|
||||||
|
- Installs to `/usr/lib/panvk-bifrost-video/` — parallel to r4's `/usr/lib/panvk-bifrost/` and stock `/usr/lib/`.
|
||||||
|
- Opt-in via `VK_ICD_FILENAMES`; does NOT register itself in `/usr/share/vulkan/icd.d/`.
|
||||||
|
- Three drivers coexist without conflict; the user picks at runtime which to use.
|
||||||
@@ -0,0 +1,7 @@
|
|||||||
|
{
|
||||||
|
"ICD": {
|
||||||
|
"api_version": "1.4.335",
|
||||||
|
"library_path": "/usr/lib/panvk-bifrost-video/libvulkan_panfrost.so"
|
||||||
|
},
|
||||||
|
"file_format_version": "1.0.1"
|
||||||
|
}
|
||||||
@@ -0,0 +1,50 @@
|
|||||||
|
From: marfrit-packages noether <claude-noether@reauktion.de>
|
||||||
|
Subject: [PATCH] panvk: report fragmentStoresAndAtomics = true on Bifrost
|
||||||
|
|
||||||
|
Backports Mesa main's unconditional advertisement of
|
||||||
|
fragmentStoresAndAtomics for panvk (snapshot ref: src/panfrost/vulkan/
|
||||||
|
panvk_vX_physical_device.c at commit-time 2026-05-06; the line reads
|
||||||
|
`.fragmentStoresAndAtomics = true,` on main with no PAN_ARCH gate).
|
||||||
|
|
||||||
|
Motivation: Chromium Dawn's WebGPU initializer in
|
||||||
|
third_party/dawn/src/dawn/native/vulkan/PhysicalDeviceVk.cpp:250
|
||||||
|
unconditionally rejects any Vulkan adapter that doesn't advertise this
|
||||||
|
feature, causing Dawn to fall back to the SwiftShader CPU adapter
|
||||||
|
on PineTab2 / RK3566 / Mali-G52 r1 MC1 (PAN_ARCH 7). With this patch the
|
||||||
|
device advertises true, satisfying Dawn's gate. Tracked at
|
||||||
|
https://git.reauktion.de/marfrit/panvk-bifrost/issues/2.
|
||||||
|
|
||||||
|
The disjunction with `instance->force_enable_shader_atomics` is
|
||||||
|
preserved as a marker rather than a working switch: in compiler terms it's dead code
|
||||||
|
(`true || X == true`), but it leaves the DRI option
|
||||||
|
`pan_force_enable_shader_atomics` semantically wired so future
|
||||||
|
rebases or downstream debugging can see the link to the runtime knob.
|
||||||
|
|
||||||
|
Caveat: the existing DRI option's description in src/util/driconf.h
|
||||||
|
still labels this as "may not work reliably and is for debug purposes
|
||||||
|
only". Mesa main's choice to ship it as default-on for all panvk
|
||||||
|
architectures (including Bifrost, which is non-conformant per the
|
||||||
|
PAN_I_WANT_A_BROKEN_VULKAN_DRIVER gate) reflects an upstream judgment
|
||||||
|
that the practical risk is acceptable. Verify-before-ship for this
|
||||||
|
package: dEQP-VK.glsl.atomic_operations.* + dEQP-VK.image.store.*
|
||||||
|
deltas vs the r4 baseline must show no new fails. Pass counts may rise
|
||||||
|
(tests that previously NotSupported now run); the load-bearing line is
|
||||||
|
the Failed column staying at zero.
|
||||||
|
|
||||||
|
---
|
||||||
|
src/panfrost/vulkan/panvk_vX_physical_device.c | 3 +--
|
||||||
|
1 file changed, 1 insertion(+), 2 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/src/panfrost/vulkan/panvk_vX_physical_device.c b/src/panfrost/vulkan/panvk_vX_physical_device.c
|
||||||
|
--- a/src/panfrost/vulkan/panvk_vX_physical_device.c
|
||||||
|
+++ b/src/panfrost/vulkan/panvk_vX_physical_device.c
|
||||||
|
@@ -280,8 +280,7 @@
|
||||||
|
.vertexPipelineStoresAndAtomics =
|
||||||
|
(PAN_ARCH >= 13 && instance->enable_vertex_pipeline_stores_atomics) ||
|
||||||
|
instance->force_enable_shader_atomics,
|
||||||
|
- .fragmentStoresAndAtomics =
|
||||||
|
- (PAN_ARCH >= 10) || instance->force_enable_shader_atomics,
|
||||||
|
+ .fragmentStoresAndAtomics = true || instance->force_enable_shader_atomics,
|
||||||
|
.shaderTessellationAndGeometryPointSize = false,
|
||||||
|
.shaderImageGatherExtended = true,
|
||||||
|
.shaderStorageImageExtendedFormats = true,
|
||||||
@@ -0,0 +1,51 @@
|
|||||||
|
From: marfrit-packages noether <claude-noether@reauktion.de>
|
||||||
|
Subject: [PATCH] panvk: advertise VK_EXT_legacy_dithering on Bifrost
|
||||||
|
|
||||||
|
Backports Mesa main's flip — vanilla 26.0.6 doesn't have the extension
|
||||||
|
in the panvk advertisement list; main does (line 172 / 647 on snapshot
|
||||||
|
617da94, 2026-05-06).
|
||||||
|
|
||||||
|
VK_EXT_legacy_dithering exposes the classic OpenGL-style dithering
|
||||||
|
behavior to Vulkan apps. Pure-software composition; no new HW path.
|
||||||
|
ARM's own libmali driver release r51p0 (BXODROIDN2PL, Aug 2024) lists
|
||||||
|
this extension in its Vulkan implementation for ODROID-N2 boards
|
||||||
|
using the same Mali-G52 architecture family — confirms ARM ships it
|
||||||
|
for Mali-G52-class hardware.
|
||||||
|
|
||||||
|
Consumer benefit: dithering matters for low-bit-depth framebuffers
|
||||||
|
(RGB565 / RGB5A1 — common on portable / battery-saving renders)
|
||||||
|
where banding is visible. DXVK / vkd3d-proton both opt in when
|
||||||
|
available.
|
||||||
|
|
||||||
|
Verify-before-ship: vulkaninfo lists the extension and
|
||||||
|
VkPhysicalDeviceLegacyDitheringFeaturesEXT.legacyDithering == true.
|
||||||
|
|
||||||
|
Cross-refs:
|
||||||
|
- marfrit/panvk-bifrost research/r6_r7_mali_g52_feature_audit_2026-05-24.md
|
||||||
|
- ARM blob r51p0 strings dump (in-blob extension confirmed)
|
||||||
|
|
||||||
|
---
|
||||||
|
src/panfrost/vulkan/panvk_vX_physical_device.c | 4 ++++
|
||||||
|
1 file changed, 4 insertions(+)
|
||||||
|
|
||||||
|
diff --git a/src/panfrost/vulkan/panvk_vX_physical_device.c b/src/panfrost/vulkan/panvk_vX_physical_device.c
|
||||||
|
--- a/src/panfrost/vulkan/panvk_vX_physical_device.c
|
||||||
|
+++ b/src/panfrost/vulkan/panvk_vX_physical_device.c
|
||||||
|
@@ -156,6 +156,7 @@
|
||||||
|
.EXT_image_drm_format_modifier = true,
|
||||||
|
.EXT_image_robustness = true,
|
||||||
|
.EXT_index_type_uint8 = true,
|
||||||
|
+ .EXT_legacy_dithering = true,
|
||||||
|
.EXT_line_rasterization = true,
|
||||||
|
.EXT_load_store_op_none = true,
|
||||||
|
.EXT_non_seamless_cube_map = true,
|
||||||
|
@@ -552,6 +553,9 @@
|
||||||
|
|
||||||
|
/* VK_EXT_multisampled_render_to_single_sampled */
|
||||||
|
.multisampledRenderToSingleSampled = true,
|
||||||
|
+
|
||||||
|
+ /* VK_EXT_legacy_dithering */
|
||||||
|
+ .legacyDithering = true,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
@@ -30,11 +30,11 @@
|
|||||||
|
|
||||||
pkgname=mesa-panvk-bifrost
|
pkgname=mesa-panvk-bifrost
|
||||||
_mesaver=26.0.6
|
_mesaver=26.0.6
|
||||||
pkgver=26.0.6.r4
|
pkgver=26.0.6.r6
|
||||||
pkgrel=1
|
pkgrel=1
|
||||||
pkgdesc="Patched Mesa libvulkan_panfrost.so exposing Bifrost-gen Mali to Vulkan apps (panvk-bifrost campaign)"
|
pkgdesc="Patched Mesa libvulkan_panfrost.so exposing Bifrost-gen Mali to Vulkan apps (panvk-bifrost campaign)"
|
||||||
arch=('aarch64')
|
arch=('aarch64')
|
||||||
url="https://github.com/marfrit/panvk-bifrost"
|
url="https://git.reauktion.de/marfrit/panvk-bifrost"
|
||||||
license=('MIT')
|
license=('MIT')
|
||||||
|
|
||||||
# We co-install at /usr/lib/panvk-bifrost/ so no conflicts with stock mesa.
|
# We co-install at /usr/lib/panvk-bifrost/ so no conflicts with stock mesa.
|
||||||
@@ -81,6 +81,8 @@ source=(
|
|||||||
"0002-panvk-expose-vulkan-1.1-1.2-on-bifrost.patch"
|
"0002-panvk-expose-vulkan-1.1-1.2-on-bifrost.patch"
|
||||||
"0003-panvk-bifrost-vk-ext-transform-feedback.patch"
|
"0003-panvk-bifrost-vk-ext-transform-feedback.patch"
|
||||||
"0004-panvk-bifrost-xfb-primitive-decomposition.patch"
|
"0004-panvk-bifrost-xfb-primitive-decomposition.patch"
|
||||||
|
"0005-panvk-bifrost-fragment-stores-atomics.patch"
|
||||||
|
"0006-panvk-bifrost-legacy-dithering.patch"
|
||||||
"brave-vulkan"
|
"brave-vulkan"
|
||||||
"icd.json"
|
"icd.json"
|
||||||
)
|
)
|
||||||
@@ -92,6 +94,8 @@ sha256sums=(
|
|||||||
'SKIP'
|
'SKIP'
|
||||||
'SKIP'
|
'SKIP'
|
||||||
'SKIP'
|
'SKIP'
|
||||||
|
'SKIP'
|
||||||
|
'SKIP'
|
||||||
)
|
)
|
||||||
|
|
||||||
prepare() {
|
prepare() {
|
||||||
@@ -127,6 +131,27 @@ prepare() {
|
|||||||
# Phase-doc context: ~/src/panvk-bifrost/iter17/phase{0,1,2,4,5,6,8}_*.md.
|
# Phase-doc context: ~/src/panvk-bifrost/iter17/phase{0,1,2,4,5,6,8}_*.md.
|
||||||
patch -p1 < "${srcdir}/0004-panvk-bifrost-xfb-primitive-decomposition.patch"
|
patch -p1 < "${srcdir}/0004-panvk-bifrost-xfb-primitive-decomposition.patch"
|
||||||
|
|
||||||
|
# r5 (2026-05-23): advertise .fragmentStoresAndAtomics = true on Bifrost
|
||||||
|
# to satisfy Chromium Dawn's WebGPU init gate
|
||||||
|
# (third_party/dawn/src/dawn/native/vulkan/PhysicalDeviceVk.cpp:250).
|
||||||
|
# Backports Mesa main's unconditional flip (same line as on main as of
|
||||||
|
# 2026-05-06). Disjunction with instance->force_enable_shader_atomics
|
||||||
|
# is preserved as a documented kill-switch even though the compiler
|
||||||
|
# folds it away. Closes marfrit/panvk-bifrost#2.
|
||||||
|
# Verify-before-ship: dEQP-VK.glsl.atomic_operations.* and
|
||||||
|
# dEQP-VK.image.store.* show no new Failed vs r4 baseline.
|
||||||
|
patch -p1 < "${srcdir}/0005-panvk-bifrost-fragment-stores-atomics.patch"
|
||||||
|
|
||||||
|
# r6 (2026-05-25): advertise VK_EXT_legacy_dithering. Backports Mesa
|
||||||
|
# main's unconditional flip. Pure-software composition; vk_render_pass
|
||||||
|
# already gates on enabled_features.legacyDithering and panvk_vX_blend
|
||||||
|
# + pan_format already plumb the dithered BLEND descriptor (BFMT2 table
|
||||||
|
# has MALI_BLEND_AU encodings for RGB565/RGB5A1/RGBA4/RGB10A2 on
|
||||||
|
# PAN_ARCH 7). Closes the EXT_legacy_dithering gap surfaced by
|
||||||
|
# marfrit/panvk-bifrost research/r6_r7_*. ARM blob r51p0 confirms the
|
||||||
|
# extension as Mali-G52-architecture supported.
|
||||||
|
patch -p1 < "${srcdir}/0006-panvk-bifrost-legacy-dithering.patch"
|
||||||
|
|
||||||
# Sanity-check the patches landed.
|
# Sanity-check the patches landed.
|
||||||
grep -q "KHR_robustness2 = true," src/panfrost/vulkan/panvk_vX_physical_device.c
|
grep -q "KHR_robustness2 = true," src/panfrost/vulkan/panvk_vX_physical_device.c
|
||||||
grep -q "EXT_robustness2 = true," src/panfrost/vulkan/panvk_vX_physical_device.c
|
grep -q "EXT_robustness2 = true," src/panfrost/vulkan/panvk_vX_physical_device.c
|
||||||
@@ -138,6 +163,11 @@ prepare() {
|
|||||||
test -f src/panfrost/vulkan/jm/panvk_vX_cmd_xfb.c
|
test -f src/panfrost/vulkan/jm/panvk_vX_cmd_xfb.c
|
||||||
# iter17 sanity: pan_nir_lower_xfb call site has been replaced; new file present.
|
# iter17 sanity: pan_nir_lower_xfb call site has been replaced; new file present.
|
||||||
grep -q "panvk_per_arch(nir_lower_xfb)" src/panfrost/vulkan/panvk_vX_shader.c
|
grep -q "panvk_per_arch(nir_lower_xfb)" src/panfrost/vulkan/panvk_vX_shader.c
|
||||||
|
# r5 sanity: fragmentStoresAndAtomics = true patch landed
|
||||||
|
grep -q "fragmentStoresAndAtomics = true ||" src/panfrost/vulkan/panvk_vX_physical_device.c
|
||||||
|
# r6 sanity: VK_EXT_legacy_dithering advertised
|
||||||
|
grep -q '\.EXT_legacy_dithering = true,' src/panfrost/vulkan/panvk_vX_physical_device.c
|
||||||
|
grep -q '\.legacyDithering = true,' src/panfrost/vulkan/panvk_vX_physical_device.c
|
||||||
grep -q "xfb_topology" src/panfrost/vulkan/panvk_shader.h
|
grep -q "xfb_topology" src/panfrost/vulkan/panvk_shader.h
|
||||||
grep -q "panvk_xfb_topology" src/panfrost/vulkan/panvk_shader.h
|
grep -q "panvk_xfb_topology" src/panfrost/vulkan/panvk_shader.h
|
||||||
test -f src/panfrost/vulkan/panvk_vX_xfb_lower.c
|
test -f src/panfrost/vulkan/panvk_vX_xfb_lower.c
|
||||||
|
|||||||
+150
@@ -0,0 +1,150 @@
|
|||||||
|
#!/bin/bash
# Package pre-built chromium-fourier artifacts into a .deb.
#
# Chromium can't be compiled natively on any available aarch64 runner
# (clang version wall — chromium requires its internal clang fork).
# The build is cross-compiled on CT 220 (data, x86_64 Ryzen 7).
# This script expects the build artifacts to exist at BUILD_DIR
# (default: fetched from CT 220 via SSH).
#
# Environment:
#   BUILD_DIR  optional; directory holding the chromium out/Default
#              artifacts. When unset/empty they are streamed over SSH.
# Output:
#   A .deb written next to this script; its path is echoed on success.
#
# Sibling Arch package: ../../arch/chromium-fourier/PKGBUILD
set -euo pipefail

PKGVER=148.0.7778.178
EPOCH=1
PKGREL=1
ARCH=arm64

HERE=$(dirname "$(readlink -f "$0")")
# 1779854400 == 2026-05-27 04:00:00 UTC.
# NOTE(review): debian/changelog dates this release 2026-05-24; confirm
# whether the timestamp or the changelog is the intended reference.
export SOURCE_DATE_EPOCH=1779854400

BUILD_DIR="${BUILD_DIR:-}"

work=$(mktemp -d)
# Single quotes: expand $work when the trap fires, and keep it quoted.
trap 'rm -rf -- "$work"' EXIT

if [ -z "$BUILD_DIR" ]; then
  echo "BUILD_DIR not set — fetching artifacts from CT 220 on data..."
  BUILD_DIR="$work/artifacts"
  mkdir -p "$BUILD_DIR"
  # Stream a tar of the needed files out of the container and unpack locally.
  ssh root@data "pct exec 220 -- tar -cf - -C /build/chromium/src/out/Default \
    chrome chrome_crashpad_handler \
    libEGL.so libGLESv2.so libvk_swiftshader.so libvulkan.so.1 \
    vk_swiftshader_icd.json \
    chrome_100_percent.pak chrome_200_percent.pak resources.pak \
    v8_context_snapshot.bin snapshot_blob.bin icudtl.dat \
    locales/" | tar -xf - -C "$BUILD_DIR"
fi

ROOT="$work/pkgroot"

# Mandatory binaries: install fails (and aborts the script) if either is absent.
install -Dm755 "$BUILD_DIR/chrome" "$ROOT/usr/lib/chromium/chromium"
install -Dm755 "$BUILD_DIR/chrome_crashpad_handler" "$ROOT/usr/lib/chromium/chrome_crashpad_handler"

# Optional shared libraries: packaged only when the build produced them.
for so in libEGL.so libGLESv2.so libvk_swiftshader.so libvulkan.so.1; do
  if [ -f "$BUILD_DIR/$so" ]; then
    install -Dm755 "$BUILD_DIR/$so" "$ROOT/usr/lib/chromium/$so"
  fi
done

# Optional ICD manifests. An unmatched glob stays literal and is skipped by -f.
for icd in "$BUILD_DIR"/*_icd.json; do
  if [ -f "$icd" ]; then
    install -Dm644 "$icd" "$ROOT/usr/lib/chromium/$(basename "$icd")"
  fi
done

# Optional resource/data files.
for f in chrome_100_percent.pak chrome_200_percent.pak resources.pak \
         v8_context_snapshot.bin snapshot_blob.bin icudtl.dat; do
  if [ -f "$BUILD_DIR/$f" ]; then
    install -Dm644 "$BUILD_DIR/$f" "$ROOT/usr/lib/chromium/$f"
  fi
done

if [ -d "$BUILD_DIR/locales" ]; then
  install -dm755 "$ROOT/usr/lib/chromium/locales"
  # "locales/." copies the directory contents and, unlike "locales/"*,
  # does not fail when the directory is empty.
  cp -r "$BUILD_DIR/locales/." "$ROOT/usr/lib/chromium/locales/"
fi

# Launcher wrapper. The quoted heredoc delimiter keeps the body literal.
install -dm755 "$ROOT/usr/bin"
cat > "$ROOT/usr/bin/chromium-fourier" <<'LAUNCHER'
#!/bin/bash
USER_HANDLES_VULKAN=0
for arg in "$@"; do
  case "$arg" in
    --use-vulkan*|--enable-features=*Vulkan*|--disable-features=*Vulkan*|--use-angle=vulkan*)
      USER_HANDLES_VULKAN=1
      break
      ;;
  esac
done

vulkan_default=()
if [ "$USER_HANDLES_VULKAN" = 0 ]; then
  vulkan_default=(--disable-features=Vulkan)
fi

exec /usr/lib/chromium/chromium \
  --ozone-platform=wayland \
  --use-gl=angle --use-angle=gles \
  --enable-features=AcceleratedVideoDecoder \
  "${vulkan_default[@]}" \
  "$@"
LAUNCHER
chmod 0755 "$ROOT/usr/bin/chromium-fourier"

mkdir -p "$ROOT/usr/share/doc/chromium-fourier" "$ROOT/DEBIAN"
install -Dm644 "$HERE/debian/copyright" \
  "$ROOT/usr/share/doc/chromium-fourier/copyright"
install -Dm644 "$HERE/debian/changelog" \
  "$ROOT/usr/share/doc/chromium-fourier/changelog.Debian"
# -n omits the name/timestamp from the gzip header.
gzip -9 -n "$ROOT/usr/share/doc/chromium-fourier/changelog.Debian"

# Installed-Size is the staged tree's size in KiB (du -sk), measured before
# the control file itself is written.
ISIZE=$(du -sk "$ROOT" | awk '{print $1}')
# Continuation lines of a control field must start with whitespace.
cat > "$ROOT/DEBIAN/control" <<EOF
Package: chromium-fourier
Version: ${EPOCH}:${PKGVER}-${PKGREL}
Section: web
Priority: optional
Architecture: ${ARCH}
Installed-Size: ${ISIZE}
Depends: libasound2,
 libatk-bridge2.0-0,
 libatk1.0-0,
 libcairo2,
 libcups2,
 libdbus-1-3,
 libdrm2,
 libexpat1,
 libfontconfig1,
 libfreetype6,
 libgbm1,
 libglib2.0-0,
 libgtk-3-0,
 libnspr4,
 libnss3,
 libpango-1.0-0,
 libpulse0,
 libva2,
 libwayland-client0,
 libx11-6,
 libxcb1,
 libxkbcommon0,
 libpipewire-0.3-0,
 fonts-liberation,
 v4l-utils
Provides: www-browser
Conflicts: chromium
Maintainer: Markus Fritsche <mfritsche@reauktion.de>
Homepage: https://www.chromium.org/
Description: Chromium with V4L2 HW video decode for Rockchip (Wayland + mainline)
 Chromium ${PKGVER} with three patches enabling V4L2 hardware video
 decoding on mainline Linux / Wayland for Rockchip SoCs (RK3566 hantro,
 RK3588 VDPU381).
 .
 Cross-compiled from x86_64 using chromium's bundled clang (upstream
 LLVM cannot compile chromium). Runtime target is aarch64.
 .
 Patches: enable-v4l2-decoder-default, wayland-allow-direct-egl-gles2,
 nv12-external-oes-on-modifier-external-only.
 .
 Launcher at /usr/bin/chromium-fourier defaults to Wayland + ANGLE/GLES
 with Vulkan disabled (panvk on RK3566 breaks V4L2 dispatch).
EOF

# The epoch's ':' is written URL-encoded (%3a) in the output file name.
DEB_OUT="chromium-fourier_${EPOCH}%3a${PKGVER}-${PKGREL}_${ARCH}.deb"
dpkg-deb --root-owner-group --build "$ROOT" "$HERE/$DEB_OUT"
echo "built: $HERE/$DEB_OUT"
|
||||||
+8
@@ -0,0 +1,8 @@
|
|||||||
|
chromium-fourier (1:148.0.7778.178-1) trixie; urgency=medium
|
||||||
|
|
||||||
|
* Chromium 148.0.7778.178 with V4L2 HW decode patches for Rockchip.
|
||||||
|
* Cross-compiled from x86_64 using chromium's bundled clang.
|
||||||
|
* Three fourier patches: enable-v4l2-decoder-default,
|
||||||
|
wayland-allow-direct-egl-gles2, nv12-external-oes-on-modifier-external-only.
|
||||||
|
|
||||||
|
-- Markus Fritsche <mfritsche@reauktion.de> Sun, 24 May 2026 09:00:00 +0200
|
||||||
+32
@@ -0,0 +1,32 @@
|
|||||||
|
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
|
||||||
|
Upstream-Name: Chromium
|
||||||
|
Upstream-Contact: chromium-dev@chromium.org
|
||||||
|
Source: https://www.chromium.org/
|
||||||
|
|
||||||
|
Files: *
|
||||||
|
Copyright: The Chromium Authors
|
||||||
|
License: BSD-3-Clause
|
||||||
|
|
||||||
|
Files: debian/*
|
||||||
|
Copyright: 2026 Markus Fritsche <mfritsche@reauktion.de>
|
||||||
|
License: BSD-3-Clause
|
||||||
|
|
||||||
|
License: BSD-3-Clause
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
.
|
||||||
|
1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
this list of conditions and the following disclaimer.
|
||||||
|
.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
.
|
||||||
|
3. Neither the name of the copyright holder nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED.
|
||||||
+3
-3
@@ -14,9 +14,9 @@
|
|||||||
# Sibling userspace package: ../daedalus-v4l2/build-deb.sh
|
# Sibling userspace package: ../daedalus-v4l2/build-deb.sh
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
UPSTREAM_COMMIT=5d8b4369e58ab947d1c56b1f718293c57c6065b5
|
UPSTREAM_COMMIT=872eec505eb91b561892d02a0526749348ddc121
|
||||||
PKGVER=0.1.0+r33+g5d8b436
|
PKGVER=0.1.0+r45+g872eec5
|
||||||
PKGREL=1 # reset for new upstream pin (5d8b436 — revert parking design); still carries the #64 multi-kernel postinst fix
|
PKGREL=1 # reset for new upstream pin (872eec5 — PROTO_MAX_PAYLOAD 64 KiB -> 1 MiB, closes #19); lock-step with daedalus-v4l2 0.1.0+r45+g872eec5 REQUIRED
|
||||||
MODULE_NAME=daedalus_v4l2
|
MODULE_NAME=daedalus_v4l2
|
||||||
|
|
||||||
HERE=$(dirname "$(readlink -f "$0")")
|
HERE=$(dirname "$(readlink -f "$0")")
|
||||||
|
|||||||
+21
@@ -1,3 +1,24 @@
|
|||||||
|
daedalus-v4l2-dkms (0.1.0+r45+g872eec5-1) bookworm trixie; urgency=medium
|
||||||
|
|
||||||
|
* Bump to 872eec5 — picks up daedalus-v4l2 PR #20 (closes #19).
|
||||||
|
Wire-protocol cap DAEDALUS_PROTO_MAX_PAYLOAD raised from 64 KiB
|
||||||
|
to 1 MiB in include/daedalus_v4l2_proto.h. The kernel module
|
||||||
|
inherits the larger DAEDALUS_MAX_BITSTREAM via the same #define
|
||||||
|
and daedalus_fill_output_fmt now reports OUTPUT_MPLANE
|
||||||
|
sizeimage = ~1 MiB instead of 65484.
|
||||||
|
* Skips the r33 -> r45 commit range — between 5d8b436 and 872eec5
|
||||||
|
only one kernel/include change landed (the PROTO_MAX_PAYLOAD
|
||||||
|
bump above). The intervening daemon-only bumps (r37 / r39 /
|
||||||
|
r41 / r43) didn't touch kernel/ or include/ at all.
|
||||||
|
* Effective wire cap is min(kernel, daemon) — lock-step install
|
||||||
|
WITH daedalus-v4l2 0.1.0+r45+g872eec5 REQUIRED.
|
||||||
|
* Allocations (kmemdup / kmalloc on payload, vb2 plane backing)
|
||||||
|
are dynamic and sized per-payload at runtime; the bump only
|
||||||
|
sets the ceiling. KMALLOC_MAX_SIZE on aarch64 SLUB is several
|
||||||
|
MiB so 1 MiB is well within bounds.
|
||||||
|
|
||||||
|
-- Markus Fritsche <mfritsche@reauktion.de> Fri, 22 May 2026 21:00:00 +0000
|
||||||
|
|
||||||
daedalus-v4l2-dkms (0.1.0+r33+g5d8b436-1) bookworm trixie; urgency=medium
|
daedalus-v4l2-dkms (0.1.0+r33+g5d8b436-1) bookworm trixie; urgency=medium
|
||||||
|
|
||||||
* Bump to 5d8b436 — reverts daedalus-v4l2 PRs #7 + #8. Kernel
|
* Bump to 5d8b436 — reverts daedalus-v4l2 PRs #7 + #8. Kernel
|
||||||
|
|||||||
Vendored
+3
-3
@@ -19,9 +19,9 @@ set -euo pipefail
|
|||||||
# source tree we own in marfrit-packages. Headers + .pc files
|
# source tree we own in marfrit-packages. Headers + .pc files
|
||||||
# come from ffmpeg-v4l2-request-fourier (installed by the CI
|
# come from ffmpeg-v4l2-request-fourier (installed by the CI
|
||||||
# workflow before this script runs; see PKG_CONFIG_PATH below).
|
# workflow before this script runs; see PKG_CONFIG_PATH below).
|
||||||
UPSTREAM_COMMIT=6e6dfa144da7bc7fa8be50c8da91d7d1c6132a2c
|
UPSTREAM_COMMIT=872eec505eb91b561892d02a0526749348ddc121
|
||||||
PKGVER=0.1.0+r41+g6e6dfa1
|
PKGVER=0.1.0+r45+g872eec5
|
||||||
PKGREL=1 # reset for new upstream pin (6e6dfa1 — soname 62 via /opt/fourier)
|
PKGREL=1 # reset for new upstream pin (872eec5 — PROTO_MAX_PAYLOAD 64 KiB -> 1 MiB, closes #19); lock-step with daedalus-v4l2-dkms 0.1.0+r45+g872eec5 REQUIRED
|
||||||
|
|
||||||
# daedalus-fourier pin. d87239d = marfrit/daedalus-fourier PR #1 merge
|
# daedalus-fourier pin. d87239d = marfrit/daedalus-fourier PR #1 merge
|
||||||
# (install rules + pkg-config, enables this consumer to find_package
|
# (install rules + pkg-config, enables this consumer to find_package
|
||||||
|
|||||||
+43
@@ -1,3 +1,46 @@
|
|||||||
|
daedalus-v4l2 (0.1.0+r45+g872eec5-1) bookworm trixie; urgency=medium
|
||||||
|
|
||||||
|
* Bump to 872eec5 — picks up daedalus-v4l2 PR #20 (closes #19).
|
||||||
|
Wire-protocol cap DAEDALUS_PROTO_MAX_PAYLOAD raised from 64 KiB
|
||||||
|
to 1 MiB. DAEDALUS_MAX_BITSTREAM follows; daedalus_fill_output_fmt
|
||||||
|
now reports OUTPUT_MPLANE sizeimage = ~1 MiB instead of 65484.
|
||||||
|
libva-v4l2-request-fourier's S_FMT-driven OUTPUT-pool resize
|
||||||
|
finally succeeds; Firefox no longer falls off to libmozavcodec
|
||||||
|
SW when an H.264 slice exceeds 64 KiB (routine on any
|
||||||
|
720p+ stream).
|
||||||
|
* #define-only change in include/daedalus_v4l2_proto.h; struct
|
||||||
|
layout unchanged. But effective cap is min(kernel, daemon) —
|
||||||
|
lock-step install of this package WITH
|
||||||
|
daedalus-v4l2-dkms 0.1.0+r45+g872eec5 REQUIRED.
|
||||||
|
* Daemon-side allocations are dynamic (malloc-on-payload), so
|
||||||
|
the practical growth is one ~1 MiB read buffer per daemon
|
||||||
|
process at startup. Negligible on Pi 5 / 8 GB.
|
||||||
|
* Moves r43 -> r45, the same 872eec5 pin daedalus-v4l2-dkms jumps to
|
||||||
|
(which had been stuck at r33+g5d8b436 since the parking-design
|
||||||
|
revert because the kernel module didn't change in r37/r39/r41/r43).
|
||||||
|
|
||||||
|
-- Markus Fritsche <mfritsche@reauktion.de> Fri, 22 May 2026 21:00:00 +0000
|
||||||
|
|
||||||
|
daedalus-v4l2 (0.1.0+r43+g1d8f5af-1) bookworm trixie; urgency=medium
|
||||||
|
|
||||||
|
* Bump to 1d8f5af — picks up daedalus-v4l2 PR #18 (closes #17).
|
||||||
|
Daemon now drops degenerate (<4 byte) bitstreams at the REQ_DECODE
|
||||||
|
entry instead of letting avcodec_send_packet return
|
||||||
|
AVERROR_INVALIDDATA. Reply RESP_FRAME with status=
|
||||||
|
DAEDALUS_DECODE_NO_FRAME so libva's V4L2 surface pool stays
|
||||||
|
healthy.
|
||||||
|
* Fixes the Firefox YouTube avc1 pause→resume regression observed
|
||||||
|
on higgs: libva-v4l2-request-fourier flushes a 3-byte stub
|
||||||
|
(presumably a bare NAL start code) into OUTPUT_MPLANE at the
|
||||||
|
pause boundary; the old INVALIDDATA error path made Firefox
|
||||||
|
fall off to libmozavcodec SW for the rest of the session. With
|
||||||
|
this filter the daemon logs the sentinel as 'tiny bitstream 3
|
||||||
|
bytes — dropping as no-op' and the next real REQ_DECODE
|
||||||
|
proceeds normally.
|
||||||
|
* Wire protocol unchanged. No daedalus-v4l2-dkms bump needed.
|
||||||
|
|
||||||
|
-- Markus Fritsche <mfritsche@reauktion.de> Fri, 22 May 2026 17:30:00 +0000
|
||||||
|
|
||||||
daedalus-v4l2 (0.1.0+r41+g6e6dfa1-1) bookworm trixie; urgency=medium
|
daedalus-v4l2 (0.1.0+r41+g6e6dfa1-1) bookworm trixie; urgency=medium
|
||||||
|
|
||||||
* Bump to 6e6dfa1 — daedalus-v4l2 PR #16. Daemon dlopens Kwiboo
|
* Bump to 6e6dfa1 — daedalus-v4l2 PR #16. Daemon dlopens Kwiboo
|
||||||
|
|||||||
@@ -0,0 +1,107 @@
|
|||||||
|
From 1b286ddb4efaca26ec9b9e290e989fec77dc1c77 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Markus Fritsche <mfritsche@reauktion.de>
|
||||||
|
Date: Fri, 22 May 2026 10:18:21 +0200
|
||||||
|
Subject: [PATCH] avcodec/aarch64/h264dsp: route H.264 8x8 IDCT through
|
||||||
|
daedalus-fourier
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Type: text/plain; charset=UTF-8
|
||||||
|
Content-Transfer-Encoding: 8bit
|
||||||
|
|
||||||
|
H264DSPContext.idct8_add (called per 8x8 block from the High-profile
|
||||||
|
intra-8x8-DCT decode path in h264_mb.c) now dispatches through
|
||||||
|
daedalus_recipe_dispatch_h264_idct8 instead of ff_h264_idct8_add_neon.
|
||||||
|
|
||||||
|
The recipe layer picks the substrate; for cycle 7 (H.264 IDCT 8x8)
|
||||||
|
the recipe is CPU NEON, so this is effectively a NEON-to-NEON
|
||||||
|
substitution layered on top of the cycle-6 IDCT 4x4 wiring. Same
|
||||||
|
pthread_once global context, same destructive-zero semantics; FFmpeg
|
||||||
|
column-major 8x8 storage block[r + 8*c] matches daedalus's convention.
|
||||||
|
|
||||||
|
Bulk path c->idct8_add4 (used for inter 8x8-DCT macroblocks) remains
|
||||||
|
on the in-tree NEON .S code and will be batched through
|
||||||
|
daedalus_recipe_dispatch_h264_idct8 with n_blocks>1 in a follow-up.
|
||||||
|
|
||||||
|
Bit-exact against ff_h264_idct8_add_neon (daedalus-fourier cycle 7
|
||||||
|
green).
|
||||||
|
|
||||||
|
Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 cycle 7.
|
||||||
|
---
|
||||||
|
libavcodec/aarch64/h264_idct_daedalus.c | 29 ++++++++++++++++-------
|
||||||
|
libavcodec/aarch64/h264dsp_init_aarch64.c | 3 ++-
|
||||||
|
2 files changed, 23 insertions(+), 9 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c
|
||||||
|
index 538d223..cbb98af 100644
|
||||||
|
--- a/libavcodec/aarch64/h264_idct_daedalus.c
|
||||||
|
+++ b/libavcodec/aarch64/h264_idct_daedalus.c
|
||||||
|
@@ -1,14 +1,16 @@
|
||||||
|
/*
|
||||||
|
- * H.264 4x4 IDCT + add — daedalus-fourier substitution shim.
|
||||||
|
+ * H.264 4x4 / 8x8 IDCT + add — daedalus-fourier substitution shims.
|
||||||
|
*
|
||||||
|
- * Routes H264DSPContext.idct_add through
|
||||||
|
- * daedalus_recipe_dispatch_h264_idct4 instead of ff_h264_idct_add_neon.
|
||||||
|
- * The recipe layer picks the substrate (CPU NEON by default for
|
||||||
|
- * cycle 6; future cycles may dispatch to V3D opportunistically).
|
||||||
|
+ * Routes H264DSPContext.idct_add → daedalus_recipe_dispatch_h264_idct4
|
||||||
|
+ * H264DSPContext.idct8_add → daedalus_recipe_dispatch_h264_idct8
|
||||||
|
+ * instead of the in-tree ff_h264_idct{,8}_add_neon assembly. The
|
||||||
|
+ * recipe layer picks the substrate (CPU NEON by default for cycles
|
||||||
|
+ * 6 + 7; future cycles may dispatch to V3D opportunistically).
|
||||||
|
*
|
||||||
|
- * FFmpeg's 4x4 block memory layout matches daedalus's column-major
|
||||||
|
- * convention: block[r + 4*c] = coefficient at (row r, col c). Both
|
||||||
|
- * sides destructively zero the block after the transform.
|
||||||
|
+ * FFmpeg's 4x4 and 8x8 block memory layouts match daedalus's
|
||||||
|
+ * column-major convention: block[r + N*c] = coefficient at
|
||||||
|
+ * (row r, col c) for N ∈ {4, 8}. Both sides destructively zero the
|
||||||
|
+ * block after the transform.
|
||||||
|
*
|
||||||
|
* The library context is process-global and lazily initialised under
|
||||||
|
* pthread_once. We pick the no-QPU constructor here because
|
||||||
|
@@ -37,6 +39,7 @@ static void daedalus_ctx_init_once(void)
|
||||||
|
}
|
||||||
|
|
||||||
|
void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride);
|
||||||
|
+void ff_h264_idct8_add_daedalus(uint8_t *dst, int16_t *block, int stride);
|
||||||
|
|
||||||
|
void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride)
|
||||||
|
{
|
||||||
|
@@ -47,3 +50,13 @@ void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride)
|
||||||
|
daedalus_recipe_dispatch_h264_idct4(g_dctx, dst, (size_t)stride,
|
||||||
|
block, 1, &meta);
|
||||||
|
}
|
||||||
|
+
|
||||||
|
+void ff_h264_idct8_add_daedalus(uint8_t *dst, int16_t *block, int stride)
|
||||||
|
+{
|
||||||
|
+ static const daedalus_h264_block_meta meta = { .dst_off = 0 };
|
||||||
|
+
|
||||||
|
+ pthread_once(&g_dctx_once, daedalus_ctx_init_once);
|
||||||
|
+
|
||||||
|
+ daedalus_recipe_dispatch_h264_idct8(g_dctx, dst, (size_t)stride,
|
||||||
|
+ block, 1, &meta);
|
||||||
|
+}
|
||||||
|
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
|
||||||
|
index b993df2..741e551 100644
|
||||||
|
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c
|
||||||
|
+++ b/libavcodec/aarch64/h264dsp_init_aarch64.c
|
||||||
|
@@ -79,6 +79,7 @@ void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
|
||||||
|
const uint8_t nnzc[15 * 8]);
|
||||||
|
|
||||||
|
void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride);
|
||||||
|
+void ff_h264_idct8_add_daedalus(uint8_t *dst, int16_t *block, int stride);
|
||||||
|
void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
|
||||||
|
void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
|
||||||
|
int16_t *block, int stride,
|
||||||
|
@@ -146,7 +147,7 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
|
||||||
|
c->idct_add16intra = ff_h264_idct_add16intra_neon;
|
||||||
|
if (chroma_format_idc <= 1)
|
||||||
|
c->idct_add8 = ff_h264_idct_add8_neon;
|
||||||
|
- c->idct8_add = ff_h264_idct8_add_neon;
|
||||||
|
+ c->idct8_add = ff_h264_idct8_add_daedalus;
|
||||||
|
c->idct8_dc_add = ff_h264_idct8_dc_add_neon;
|
||||||
|
c->idct8_add4 = ff_h264_idct8_add4_neon;
|
||||||
|
} else if (have_neon(cpu_flags) && bit_depth == 10) {
|
||||||
|
--
|
||||||
|
2.47.3
|
||||||
|
|
||||||
+121
@@ -0,0 +1,121 @@
|
|||||||
|
From 68731c41d7ea68be0e912b128cb4e71fb56e8263 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Markus Fritsche <mfritsche@reauktion.de>
|
||||||
|
Date: Fri, 22 May 2026 12:15:16 +0200
|
||||||
|
Subject: [PATCH] avcodec/aarch64/h264dsp: route H.264 luma-v deblock through
|
||||||
|
daedalus-fourier
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Type: text/plain; charset=UTF-8
|
||||||
|
Content-Transfer-Encoding: 8bit
|
||||||
|
|
||||||
|
H264DSPContext.v_loop_filter_luma (non-intra bS<4 vertical luma
|
||||||
|
deblock, called per macroblock-row edge from the slice deblock
|
||||||
|
loop) now dispatches through
|
||||||
|
daedalus_recipe_dispatch_h264_deblock_luma_v instead of
|
||||||
|
ff_h264_v_loop_filter_luma_neon.
|
||||||
|
|
||||||
|
The recipe layer picks the substrate; for cycle 8 the daedalus
|
||||||
|
docstring marks the kernel "CPU primary; QPU opportunistic", but
|
||||||
|
the libavcodec.so context here is built with
|
||||||
|
daedalus_ctx_create_no_qpu — process-global pthread_once init,
|
||||||
|
shared with cycles 6/7. QPU opportunism stays gated off until a
|
||||||
|
follow-up adds an explicit feature flag (no implicit Vulkan init
|
||||||
|
in arbitrary host processes). In the meantime cycle 8 is a
|
||||||
|
plumbing-only substitution, NEON-to-NEON via the daedalus recipe.
|
||||||
|
|
||||||
|
Intra (bS=4) loop filter — c->v_loop_filter_luma_intra — stays on
|
||||||
|
the in-tree NEON .S code; daedalus's daedalus_h264_deblock_meta
|
||||||
|
only covers the non-intra path per its docstring.
|
||||||
|
|
||||||
|
FFmpeg `int alpha/beta/int8_t tc0[4]` → daedalus_h264_deblock_meta
|
||||||
|
(int32_t alpha/beta + inline int8_t tc0[4]). pix already points
|
||||||
|
to row 0 of the bottom block per FFmpeg's deblock convention,
|
||||||
|
satisfying daedalus's `dst_off >= 4 * dst_stride` constraint.
|
||||||
|
|
||||||
|
Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 cycle 8.
|
||||||
|
---
|
||||||
|
libavcodec/aarch64/h264_idct_daedalus.c | 36 +++++++++++++++++++----
|
||||||
|
libavcodec/aarch64/h264dsp_init_aarch64.c | 4 ++-
|
||||||
|
2 files changed, 33 insertions(+), 7 deletions(-)
|
||||||
|
|
||||||
|
diff --git a/libavcodec/aarch64/h264_idct_daedalus.c b/libavcodec/aarch64/h264_idct_daedalus.c
|
||||||
|
index cbb98af..92365fa 100644
|
||||||
|
--- a/libavcodec/aarch64/h264_idct_daedalus.c
|
||||||
|
+++ b/libavcodec/aarch64/h264_idct_daedalus.c
|
||||||
|
@@ -1,11 +1,14 @@
|
||||||
|
/*
|
||||||
|
- * H.264 4x4 / 8x8 IDCT + add — daedalus-fourier substitution shims.
|
||||||
|
+ * H.264 4x4 / 8x8 IDCT + luma-v deblock — daedalus-fourier substitution shims.
|
||||||
|
*
|
||||||
|
- * Routes H264DSPContext.idct_add → daedalus_recipe_dispatch_h264_idct4
|
||||||
|
- * H264DSPContext.idct8_add → daedalus_recipe_dispatch_h264_idct8
|
||||||
|
- * instead of the in-tree ff_h264_idct{,8}_add_neon assembly. The
|
||||||
|
- * recipe layer picks the substrate (CPU NEON by default for cycles
|
||||||
|
- * 6 + 7; future cycles may dispatch to V3D opportunistically).
|
||||||
|
+ * Routes H264DSPContext.idct_add → daedalus_recipe_dispatch_h264_idct4
|
||||||
|
+ * H264DSPContext.idct8_add → daedalus_recipe_dispatch_h264_idct8
|
||||||
|
+ * H264DSPContext.v_loop_filter_luma → daedalus_recipe_dispatch_h264_deblock_luma_v
|
||||||
|
+ * instead of the in-tree ff_h264_*_neon assembly. The recipe layer
|
||||||
|
+ * picks the substrate (CPU NEON for cycles 6 + 7 by default; cycle 8
|
||||||
|
+ * is CPU primary with QPU opportunistic — the ctx below is no-QPU,
|
||||||
|
+ * so cycle 8 stays on the CPU NEON path until a separate change
|
||||||
|
+ * gates QPU init on a daedalus-fourier feature flag).
|
||||||
|
*
|
||||||
|
* FFmpeg's 4x4 and 8x8 block memory layouts match daedalus's
|
||||||
|
* column-major convention: block[r + N*c] = coefficient at
|
||||||
|
@@ -40,6 +43,8 @@ static void daedalus_ctx_init_once(void)
|
||||||
|
|
||||||
|
void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride);
|
||||||
|
void ff_h264_idct8_add_daedalus(uint8_t *dst, int16_t *block, int stride);
|
||||||
|
+void ff_h264_v_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
+ int alpha, int beta, int8_t *tc0);
|
||||||
|
|
||||||
|
void ff_h264_idct_add_daedalus(uint8_t *dst, int16_t *block, int stride)
|
||||||
|
{
|
||||||
|
@@ -60,3 +65,22 @@ void ff_h264_idct8_add_daedalus(uint8_t *dst, int16_t *block, int stride)
|
||||||
|
daedalus_recipe_dispatch_h264_idct8(g_dctx, dst, (size_t)stride,
|
||||||
|
block, 1, &meta);
|
||||||
|
}
|
||||||
|
+
|
||||||
|
+void ff_h264_v_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
+ int alpha, int beta, int8_t *tc0)
|
||||||
|
+{
|
||||||
|
+ daedalus_h264_deblock_meta meta = {
|
||||||
|
+ .dst_off = 0,
|
||||||
|
+ .alpha = alpha,
|
||||||
|
+ .beta = beta,
|
||||||
|
+ };
|
||||||
|
+ meta.tc0[0] = tc0[0];
|
||||||
|
+ meta.tc0[1] = tc0[1];
|
||||||
|
+ meta.tc0[2] = tc0[2];
|
||||||
|
+ meta.tc0[3] = tc0[3];
|
||||||
|
+
|
||||||
|
+ pthread_once(&g_dctx_once, daedalus_ctx_init_once);
|
||||||
|
+
|
||||||
|
+ daedalus_recipe_dispatch_h264_deblock_luma_v(g_dctx, pix, (size_t)stride,
|
||||||
|
+ 1, &meta);
|
||||||
|
+}
|
||||||
|
diff --git a/libavcodec/aarch64/h264dsp_init_aarch64.c b/libavcodec/aarch64/h264dsp_init_aarch64.c
|
||||||
|
index 741e551..85ac381 100644
|
||||||
|
--- a/libavcodec/aarch64/h264dsp_init_aarch64.c
|
||||||
|
+++ b/libavcodec/aarch64/h264dsp_init_aarch64.c
|
||||||
|
@@ -27,6 +27,8 @@
|
||||||
|
|
||||||
|
void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
|
||||||
|
int beta, int8_t *tc0);
|
||||||
|
+void ff_h264_v_loop_filter_luma_daedalus(uint8_t *pix, ptrdiff_t stride,
|
||||||
|
+ int alpha, int beta, int8_t *tc0);
|
||||||
|
void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
|
||||||
|
int beta, int8_t *tc0);
|
||||||
|
void ff_h264_v_loop_filter_luma_intra_neon(uint8_t *pix, ptrdiff_t stride, int alpha,
|
||||||
|
@@ -114,7 +116,7 @@ av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
|
||||||
|
int cpu_flags = av_get_cpu_flags();
|
||||||
|
|
||||||
|
if (have_neon(cpu_flags) && bit_depth == 8) {
|
||||||
|
- c->v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon;
|
||||||
|
+ c->v_loop_filter_luma = ff_h264_v_loop_filter_luma_daedalus;
|
||||||
|
c->h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon;
|
||||||
|
c->v_loop_filter_luma_intra= ff_h264_v_loop_filter_luma_intra_neon;
|
||||||
|
c->h_loop_filter_luma_intra= ff_h264_h_loop_filter_luma_intra_neon;
|
||||||
|
--
|
||||||
|
2.47.3
|
||||||
|
|
||||||
@@ -0,0 +1,82 @@
|
|||||||
|
From 0d1292ea99bc4e5fa2da438259fa01a2374e3e04 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Markus Fritsche <mfritsche@reauktion.de>
|
||||||
|
Date: Fri, 22 May 2026 14:18:25 +0200
|
||||||
|
Subject: [PATCH] avcodec/h264: restore AV_CODEC_FLAG_LOW_DELAY semantics
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Type: text/plain; charset=UTF-8
|
||||||
|
Content-Transfer-Encoding: 8bit
|
||||||
|
|
||||||
|
FFmpeg 8.x dropped the H.264 decoder's low_delay path —
|
||||||
|
AV_CODEC_FLAG_LOW_DELAY no longer prevents
|
||||||
|
h264_select_output_frame from running the display-order DPB
|
||||||
|
output queue. V4L2-stateless-style consumers (daedalus-v4l2
|
||||||
|
daemon, libva-v4l2-request-fourier) that set the flag end up
|
||||||
|
seeing the 2-1-4-3 pair-swap pattern on B-frame streams again.
|
||||||
|
|
||||||
|
Restore the documented semantics:
|
||||||
|
|
||||||
|
- Early-exit at the top of h264_select_output_frame when the
|
||||||
|
flag is set: emit the just-decoded picture immediately as
|
||||||
|
next_output_pic, mirror the corruption / recovery-point
|
||||||
|
tracking the main path performs, and skip the entire
|
||||||
|
delayed_pic[] / POC reorder machinery.
|
||||||
|
|
||||||
|
- Suppress the SPS-driven has_b_frames clobber in
|
||||||
|
h264_field_start when the flag is set, so the per-slice
|
||||||
|
bitstream_restriction_flag re-pickup cannot reintroduce a
|
||||||
|
nonzero reorder buffer mid-stream.
|
||||||
|
|
||||||
|
This is a fork-only change required by the daedalus-v4l2 daemon's
|
||||||
|
one-frame-per-send_packet contract; upstream FFmpeg consumers that
|
||||||
|
expect display-order output remain untouched (flag default = off).
|
||||||
|
|
||||||
|
Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 deblock
|
||||||
|
+ flag-restoration follow-up.
|
||||||
|
---
|
||||||
|
libavcodec/h264_slice.c | 23 +++++++++++++++++++++++
|
||||||
|
1 file changed, 23 insertions(+)
|
||||||
|
|
||||||
|
diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c
|
||||||
|
index 97fab70..a7bfbd6 100644
|
||||||
|
--- a/libavcodec/h264_slice.c
|
||||||
|
+++ b/libavcodec/h264_slice.c
|
||||||
|
@@ -1308,6 +1308,28 @@ static int h264_select_output_frame(H264Context *h)
|
||||||
|
cur->mmco_reset = h->mmco_reset;
|
||||||
|
h->mmco_reset = 0;
|
||||||
|
|
||||||
|
+ /* AV_CODEC_FLAG_LOW_DELAY restore (FFmpeg 8.x dropped the H.264
|
||||||
|
+ * decoder's low_delay path). Bypass the display-order DPB
|
||||||
|
+ * output queue: emit the just-decoded picture immediately, in
|
||||||
|
+ * decode order, one per send_packet. V4L2-stateless-style
|
||||||
|
+ * consumers (daedalus-v4l2 daemon, libva-v4l2-request-fourier)
|
||||||
|
+ * do their own POC-based reorder downstream and require this
|
||||||
|
+ * behaviour. */
|
||||||
|
+ if (h->avctx->flags & AV_CODEC_FLAG_LOW_DELAY) {
|
||||||
|
+ h->next_output_pic = cur;
|
||||||
|
+ h->next_outputed_poc = cur->poc;
|
||||||
|
+ h->frame_recovered |= cur->recovered;
|
||||||
|
+ cur->recovered |= h->frame_recovered & FRAME_RECOVERED_SEI;
|
||||||
|
+ if (!cur->recovered) {
|
||||||
|
+ if (!(h->avctx->flags & AV_CODEC_FLAG_OUTPUT_CORRUPT) &&
|
||||||
|
+ !(h->avctx->flags2 & AV_CODEC_FLAG2_SHOW_ALL))
|
||||||
|
+ h->next_output_pic = NULL;
|
||||||
|
+ else
|
||||||
|
+ cur->f->flags |= AV_FRAME_FLAG_CORRUPT;
|
||||||
|
+ }
|
||||||
|
+ return 0;
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
if (sps->bitstream_restriction_flag ||
|
||||||
|
h->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT) {
|
||||||
|
h->avctx->has_b_frames = FFMAX(h->avctx->has_b_frames, sps->num_reorder_frames);
|
||||||
|
@@ -1415,6 +1437,7 @@ static int h264_field_start(H264Context *h, const H264SliceContext *sl,
|
||||||
|
sps = h->ps.sps;
|
||||||
|
|
||||||
|
if (sps->bitstream_restriction_flag &&
|
||||||
|
+ !(h->avctx->flags & AV_CODEC_FLAG_LOW_DELAY) &&
|
||||||
|
h->avctx->has_b_frames < sps->num_reorder_frames) {
|
||||||
|
h->avctx->has_b_frames = sps->num_reorder_frames;
|
||||||
|
}
|
||||||
|
--
|
||||||
|
2.47.3
|
||||||
|
|
||||||
+139
@@ -0,0 +1,139 @@
|
|||||||
|
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Markus Fritsche <mfritsche@reauktion.de>
|
||||||
|
Date: Sat, 23 May 2026 12:00:00 +0200
|
||||||
|
Subject: [PATCH] avcodec/aarch64/h264qpel: route 8x8 mc20 through
|
||||||
|
daedalus-fourier
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Type: text/plain; charset=UTF-8
|
||||||
|
Content-Transfer-Encoding: 8bit
|
||||||
|
|
||||||
|
H264QpelContext.put_h264_qpel_pixels_tab[1][2] (8x8 luma horizontal
|
||||||
|
half-pel, 6-tap "put" variant — the canonical representative of the
|
||||||
|
H.264 luma motion-compensation family) now dispatches through
|
||||||
|
daedalus_recipe_dispatch_h264_qpel_mc20 instead of
|
||||||
|
ff_put_h264_qpel8_mc20_neon.
|
||||||
|
|
||||||
|
Cycle 9 of the daedalus-v4l2#11 step 2 substitution arc; closes the
|
||||||
|
4-cycle libavcodec.so substitution sequence (6 IDCT 4x4 / 7 IDCT 8x8 /
|
||||||
|
8 luma-v deblock / 9 qpel mc20).
|
||||||
|
|
||||||
|
The recipe layer picks the substrate. Per docs/k9_h264qpel_mc20.md
|
||||||
|
the verdict is CPU NEON: per-block 7.6 ns at 131 Mblock/s gives 135x
|
||||||
|
margin over 30 fps 1080p, and the QPU dispatch floor (~250 ns)
|
||||||
|
makes any V3D shader strictly worse. Substitution is plumbing-only,
|
||||||
|
NEON-by-recipe — same daedalus_ctx_create_no_qpu pthread_once
|
||||||
|
context shape the cycles 6/7/8 shims already own (kept SEPARATE
|
||||||
|
from the H264DSP shim's ctx because H264QPEL is its own libavcodec
|
||||||
|
Makefile module and link order does not guarantee a single .o
|
||||||
|
owns the ctx symbol; one extra ~µs init per process, paid lazily).
|
||||||
|
|
||||||
|
Other H.264 luma MC variants (mc02, mc11, mc22 etc.) and the 16x16
|
||||||
|
size tier stay on the in-tree NEON .S code. Per the cycle-9 phase-1
|
||||||
|
rationale, mc20 8x8 is representative of the whole family's per-block
|
||||||
|
cost — extending the substitution to other variants would multiply
|
||||||
|
recipe-lookup overhead without changing the substrate verdict.
|
||||||
|
|
||||||
|
Bit-exact against ff_put_h264_qpel8_mc20_neon (daedalus-fourier
|
||||||
|
cycle 9 green; M1 = 100% bit-exact across 10000 random blocks).
|
||||||
|
|
||||||
|
No SONAME change, no Depends change.
|
||||||
|
|
||||||
|
Refs reauktion/daedalus-v4l2#11 — substitution arc step 2 cycle 9.
|
||||||
|
---
|
||||||
|
libavcodec/aarch64/Makefile | 3 +-
|
||||||
|
libavcodec/aarch64/h264_qpel_daedalus.c | 50 ++++++++++++++++++++++
|
||||||
|
libavcodec/aarch64/h264qpel_init_aarch64.c | 4 +-
|
||||||
|
3 files changed, 55 insertions(+), 2 deletions(-)
|
||||||
|
create mode 100644 libavcodec/aarch64/h264_qpel_daedalus.c
|
||||||
|
|
||||||
|
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
|
||||||
|
--- a/libavcodec/aarch64/Makefile
|
||||||
|
+++ b/libavcodec/aarch64/Makefile
|
||||||
|
@@ -7,7 +7,8 @@ OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o \
|
||||||
|
aarch64/h264_idct_daedalus.o
|
||||||
|
OBJS-$(CONFIG_HUFFYUVDSP) += aarch64/huffyuvdsp_init_aarch64.o
|
||||||
|
OBJS-$(CONFIG_H264PRED) += aarch64/h264pred_init.o
|
||||||
|
-OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o
|
||||||
|
+OBJS-$(CONFIG_H264QPEL) += aarch64/h264qpel_init_aarch64.o \
|
||||||
|
+ aarch64/h264_qpel_daedalus.o
|
||||||
|
OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o
|
||||||
|
OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_init_aarch64.o
|
||||||
|
OBJS-$(CONFIG_ME_CMP) += aarch64/me_cmp_init_aarch64.o
|
||||||
|
diff --git a/libavcodec/aarch64/h264_qpel_daedalus.c b/libavcodec/aarch64/h264_qpel_daedalus.c
|
||||||
|
new file mode 100644
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/libavcodec/aarch64/h264_qpel_daedalus.c
|
||||||
|
@@ -0,0 +1,50 @@
|
||||||
|
+/*
|
||||||
|
+ * H.264 luma qpel mc20 (8x8, horizontal half-pel, 6-tap "put")
|
||||||
|
+ * — daedalus-fourier substitution shim.
|
||||||
|
+ *
|
||||||
|
+ * Routes H264QpelContext.put_h264_qpel_pixels_tab[1][2] through
|
||||||
|
+ * daedalus_recipe_dispatch_h264_qpel_mc20 instead of
|
||||||
|
+ * ff_put_h264_qpel8_mc20_neon. The recipe layer picks the substrate
|
||||||
|
+ * (CPU NEON for cycle 9; QPU not viable — per-block 7.6 ns vs
|
||||||
|
+ * ~250 ns QPU dispatch floor, see docs/k9_h264qpel_mc20.md).
|
||||||
|
+ *
|
||||||
|
+ * Sibling to libavcodec/aarch64/h264_idct_daedalus.c. We keep a
|
||||||
|
+ * SEPARATE process-global pthread_once context here instead of
|
||||||
|
+ * sharing the H264DSP one because H264QPEL is its own libavcodec
|
||||||
|
+ * Makefile module and link order does not guarantee a single .o
|
||||||
|
+ * owns the ctx symbol. The cost is one extra
|
||||||
|
+ * daedalus_ctx_create_no_qpu (~µs) per process; daemon and host
|
||||||
|
+ * processes pay this lazily on first MC call.
|
||||||
|
+ *
|
||||||
|
+ * FFmpeg H264QpelContext convention: both dst and src use a SINGLE
|
||||||
|
+ * stride and `src` already points at the leftmost OUTPUT column
|
||||||
|
+ * (col 0); the 6-tap filter reads cols -2..+3. This matches
|
||||||
|
+ * daedalus_recipe_dispatch_h264_qpel_mc20's documented contract
|
||||||
|
+ * directly, so dst_off = src_off = 0.
|
||||||
|
+ */
|
||||||
|
+
|
||||||
|
+#include <pthread.h>
|
||||||
|
+#include <stddef.h>
|
||||||
|
+#include <stdint.h>
|
||||||
|
+
|
||||||
|
+#include <daedalus.h>
|
||||||
|
+
|
||||||
|
+#include "libavutil/attributes.h"
|
||||||
|
+
|
||||||
|
+static daedalus_ctx *g_dctx;
|
||||||
|
+static pthread_once_t g_dctx_once = PTHREAD_ONCE_INIT;
|
||||||
|
+
|
||||||
|
+static void daedalus_ctx_init_once(void)
|
||||||
|
+{
|
||||||
|
+ g_dctx = daedalus_ctx_create_no_qpu();
|
||||||
|
+}
|
||||||
|
+
|
||||||
|
+void ff_put_h264_qpel8_mc20_daedalus(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||||
|
+
|
||||||
|
+void ff_put_h264_qpel8_mc20_daedalus(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
|
||||||
|
+{
|
||||||
|
+ static const daedalus_h264_qpel_meta meta = { .dst_off = 0, .src_off = 0 };
|
||||||
|
+ pthread_once(&g_dctx_once, daedalus_ctx_init_once);
|
||||||
|
+ daedalus_recipe_dispatch_h264_qpel_mc20(g_dctx, dst, src, (size_t)stride,
|
||||||
|
+ 1, &meta);
|
||||||
|
+}
|
||||||
|
diff --git a/libavcodec/aarch64/h264qpel_init_aarch64.c b/libavcodec/aarch64/h264qpel_init_aarch64.c
|
||||||
|
--- a/libavcodec/aarch64/h264qpel_init_aarch64.c
|
||||||
|
+++ b/libavcodec/aarch64/h264qpel_init_aarch64.c
|
||||||
|
@@ -47,6 +47,8 @@ void ff_put_h264_qpel8_mc00_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t str
|
||||||
|
void ff_put_h264_qpel8_mc10_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||||
|
void ff_put_h264_qpel8_mc20_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||||
|
void ff_put_h264_qpel8_mc30_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||||
|
+void ff_put_h264_qpel8_mc20_daedalus(uint8_t *dst, const uint8_t *src,
|
||||||
|
+ ptrdiff_t stride);
|
||||||
|
void ff_put_h264_qpel8_mc01_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||||
|
void ff_put_h264_qpel8_mc11_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||||
|
void ff_put_h264_qpel8_mc21_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride);
|
||||||
|
@@ -184,7 +186,7 @@ av_cold void ff_h264qpel_init_aarch64(H264QpelContext *c, int bit_depth)
|
||||||
|
|
||||||
|
c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon;
|
||||||
|
c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon;
|
||||||
|
- c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon;
|
||||||
|
+ c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_daedalus;
|
||||||
|
c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon;
|
||||||
|
c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon;
|
||||||
|
c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon;
|
||||||
|
--
|
||||||
|
2.47.3
|
||||||
+16
-10
@@ -33,15 +33,19 @@ FFMPEG_VERSION=8.1
|
|||||||
# epoch 2 matches Debian's stock ffmpeg (currently 7:7.1.x in trixie);
|
# epoch 2 matches Debian's stock ffmpeg (currently 7:7.1.x in trixie);
|
||||||
# +rfourier suffix to avoid colliding with upstream/Debian rebuilds.
|
# +rfourier suffix to avoid colliding with upstream/Debian rebuilds.
|
||||||
PKGVER=2:${FFMPEG_VERSION}+rfourier+gb57fbbe
|
PKGVER=2:${FFMPEG_VERSION}+rfourier+gb57fbbe
|
||||||
PKGREL=5 # pkgrel=5 — H.264 IDCT 4x4 daedalus-fourier substitution; skip past
|
PKGREL=10 # pkgrel=10 — H.264 luma qpel mc20 daedalus-fourier substitution
|
||||||
# an orphan -4 .deb sitting in the apt pool that made
|
# (cycle 9 of the daedalus-v4l2#11 step 2 substitution arc; closes
|
||||||
# check-already-published.sh's `pool_ver ge source_full` short-
|
# the libavcodec.so substitution sequence 6 IDCT4 / 7 IDCT8 /
|
||||||
# circuit the previous -3 build (PR #76). (2026-05-21)
|
# 8 luma-v deblock / 9 qpel mc20). Pulls daedalus-fourier PR #2
|
||||||
|
# which extends the public API with
|
||||||
|
# daedalus_recipe_dispatch_h264_qpel_mc20. (2026-05-23)
|
||||||
|
|
||||||
# daedalus-fourier pin — first kernel substitution in libavcodec (cycle 6
|
# daedalus-fourier pin. 209a421 = daedalus-fourier PR #2 merge — public
|
||||||
# H.264 IDCT 4x4). Same SHA as the daedalus-v4l2 daemon already ships
|
# API now exposes daedalus_recipe_dispatch_h264_qpel_mc20 +
|
||||||
# inline; rev in lockstep with the daemon when the public API rolls.
|
# DAEDALUS_KERNEL_H264_QPEL_MC20. Cycle 9 plumbs the last H.264 NEON
|
||||||
DAEDALUS_FOURIER_COMMIT=d87239d8172307d9a1b93c95cbed116d175b85cc
|
# kernel through the recipe layer. Daemon-side build (debian/daedalus-v4l2)
|
||||||
|
# can bump in a follow-up; this PR only changes the libavcodec.so consumer.
|
||||||
|
DAEDALUS_FOURIER_COMMIT=209a4218bcb98b91c04f07ad61513bb04adb13ad
|
||||||
|
|
||||||
HERE=$(dirname "$(readlink -f "$0")")
|
HERE=$(dirname "$(readlink -f "$0")")
|
||||||
|
|
||||||
@@ -66,6 +70,10 @@ fi
|
|||||||
patch -Np1 -i "$HERE/0001-libudev-bypass-fallback.patch"
|
patch -Np1 -i "$HERE/0001-libudev-bypass-fallback.patch"
|
||||||
patch -Np1 -i "$HERE/0002-nv15-to-p010-unpack.patch"
|
patch -Np1 -i "$HERE/0002-nv15-to-p010-unpack.patch"
|
||||||
patch -Np1 -i "$HERE/0003-h264-idct4-daedalus-fourier.patch"
|
patch -Np1 -i "$HERE/0003-h264-idct4-daedalus-fourier.patch"
|
||||||
|
patch -Np1 -i "$HERE/0004-h264-idct8-daedalus-fourier.patch"
|
||||||
|
patch -Np1 -i "$HERE/0005-h264-deblock-luma-v-daedalus-fourier.patch"
|
||||||
|
patch -Np1 -i "$HERE/0006-h264-restore-low-delay.patch"
|
||||||
|
patch -Np1 -i "$HERE/0007-h264-qpel-mc20-daedalus-fourier.patch"
|
||||||
|
|
||||||
# --- daedalus-fourier: fetch + build static .a with PIC, install to a
|
# --- daedalus-fourier: fetch + build static .a with PIC, install to a
|
||||||
# per-build prefix; libavcodec.so links it into the shared object so
|
# per-build prefix; libavcodec.so links it into the shared object so
|
||||||
@@ -134,7 +142,6 @@ cd "$work/FFmpeg"
|
|||||||
--enable-libass \
|
--enable-libass \
|
||||||
--enable-libfreetype \
|
--enable-libfreetype \
|
||||||
--enable-libfribidi \
|
--enable-libfribidi \
|
||||||
--enable-libxml2 \
|
|
||||||
--enable-libpulse \
|
--enable-libpulse \
|
||||||
--enable-libdav1d \
|
--enable-libdav1d \
|
||||||
--enable-libopus \
|
--enable-libopus \
|
||||||
@@ -190,7 +197,6 @@ Depends: libc6,
|
|||||||
libfontconfig1,
|
libfontconfig1,
|
||||||
libfreetype6,
|
libfreetype6,
|
||||||
libfribidi0,
|
libfribidi0,
|
||||||
libxml2,
|
|
||||||
libpulse0,
|
libpulse0,
|
||||||
libdav1d7 | libdav1d6,
|
libdav1d7 | libdav1d6,
|
||||||
libopus0,
|
libopus0,
|
||||||
|
|||||||
+123
@@ -1,3 +1,126 @@
|
|||||||
|
ffmpeg-v4l2-request-fourier (2:8.1+rfourier+gb57fbbe-10) bookworm trixie; urgency=medium
|
||||||
|
|
||||||
|
* Add 0007-h264-qpel-mc20-daedalus-fourier.patch —
|
||||||
|
H264QpelContext.put_h264_qpel_pixels_tab[1][2] (8x8 luma
|
||||||
|
horizontal half-pel, 6-tap "put" — the canonical representative
|
||||||
|
of the H.264 luma motion-compensation family) now dispatches
|
||||||
|
through daedalus_recipe_dispatch_h264_qpel_mc20 instead of
|
||||||
|
ff_put_h264_qpel8_mc20_neon. Cycle 9 of the daedalus-v4l2#11
|
||||||
|
step 2 substitution arc; closes the 4-cycle libavcodec.so
|
||||||
|
substitution sequence (6 IDCT4 / 7 IDCT8 / 8 luma-v deblock /
|
||||||
|
9 qpel mc20).
|
||||||
|
* Bumps daedalus-fourier pin d87239d → 209a421 (PR #2 — public
|
||||||
|
API extended with daedalus_recipe_dispatch_h264_qpel_mc20 +
|
||||||
|
DAEDALUS_KERNEL_H264_QPEL_MC20).
|
||||||
|
* Cycle 9 is "CPU primary; QPU pointless" per
|
||||||
|
docs/k9_h264qpel_mc20.md. Per-block 7.6 ns at 131 Mblock/s
|
||||||
|
gives 135x margin over 30 fps 1080p; QPU dispatch floor at
|
||||||
|
~250 ns makes any V3D shader strictly worse. Substitution
|
||||||
|
is plumbing-only, NEON-by-recipe — same
|
||||||
|
daedalus_ctx_create_no_qpu pthread_once shape the cycles 6/7/8
|
||||||
|
shims already own (kept SEPARATE from the H264DSP shim's ctx
|
||||||
|
because H264QPEL is its own libavcodec Makefile module and
|
||||||
|
link order does not guarantee a single .o owns the ctx symbol;
|
||||||
|
one extra ~µs init per process, paid lazily on first MC call).
|
||||||
|
* Other H.264 luma MC variants (mc02, mc11, mc22 etc.) and the
|
||||||
|
16x16 size tier stay on the in-tree NEON .S code. Per the
|
||||||
|
cycle-9 phase-1 rationale, mc20 8x8 is representative of the
|
||||||
|
whole family's per-block cost.
|
||||||
|
* Bit-exact against ff_put_h264_qpel8_mc20_neon (daedalus-fourier
|
||||||
|
cycle 9 green; 10000/10000 random blocks).
|
||||||
|
* No SONAME change, no Depends change.
|
||||||
|
|
||||||
|
-- Markus Fritsche <mfritsche@reauktion.de> Sat, 23 May 2026 12:00:00 +0000
|
||||||
|
|
||||||
|
ffmpeg-v4l2-request-fourier (2:8.1+rfourier+gb57fbbe-9) bookworm trixie; urgency=medium
|
||||||
|
|
||||||
|
* Add 0006-h264-restore-low-delay.patch — restore the documented
|
||||||
|
AV_CODEC_FLAG_LOW_DELAY semantics in the H.264 decoder. FFmpeg
|
||||||
|
8.x dropped the H.264 low_delay code path entirely; setting the
|
||||||
|
flag at avcodec_open2 no longer prevents the display-order DPB
|
||||||
|
output queue from running. Visible on Firefox YouTube as the
|
||||||
|
2-1-4-3 B-frame pair-swap, re-introduced silently by the
|
||||||
|
SONAME 61→62 jump in daedalus-v4l2 PR #16.
|
||||||
|
* h264_select_output_frame: early-exit when LOW_DELAY is set;
|
||||||
|
emit the just-decoded picture as next_output_pic, mirror the
|
||||||
|
corruption / recovery-point tracking, skip delayed_pic[] and
|
||||||
|
the POC reorder machinery entirely.
|
||||||
|
* h264_field_start: suppress the SPS-driven
|
||||||
|
has_b_frames = sps->num_reorder_frames clobber when LOW_DELAY
|
||||||
|
is set — without this the per-slice bitstream_restriction_flag
|
||||||
|
re-pickup would reintroduce a nonzero reorder buffer mid-
|
||||||
|
stream.
|
||||||
|
* Restores the same one-frame-per-send_packet contract the
|
||||||
|
daedalus-v4l2 daemon's decoder.c already relies on (the flag
|
||||||
|
is set unconditionally for H.264). No daemon-side change.
|
||||||
|
* No SONAME change, no Depends change.
|
||||||
|
|
||||||
|
-- Markus Fritsche <mfritsche@reauktion.de> Fri, 22 May 2026 13:30:00 +0000
|
||||||
|
|
||||||
|
ffmpeg-v4l2-request-fourier (2:8.1+rfourier+gb57fbbe-8) bookworm trixie; urgency=medium
|
||||||
|
|
||||||
|
* Add 0005-h264-deblock-luma-v-daedalus-fourier.patch —
|
||||||
|
H264DSPContext.v_loop_filter_luma (non-intra bS<4 vertical luma
|
||||||
|
deblock, called per macroblock-row edge from the slice deblock
|
||||||
|
loop in libavcodec/h264_loopfilter.c) now dispatches through
|
||||||
|
daedalus_recipe_dispatch_h264_deblock_luma_v instead of
|
||||||
|
ff_h264_v_loop_filter_luma_neon. Cycle 8 of the daedalus-v4l2#11
|
||||||
|
step 2 substitution arc.
|
||||||
|
* Cycle 8 is marked "CPU primary; QPU opportunistic" in
|
||||||
|
daedalus-fourier, but the libavcodec.so context here uses
|
||||||
|
daedalus_ctx_create_no_qpu (process-global pthread_once,
|
||||||
|
shared with cycles 6/7). Opportunistic QPU is deferred to a
|
||||||
|
separate change that gates Vulkan init on a feature flag, to
|
||||||
|
avoid implicit Vulkan init in arbitrary host processes. For
|
||||||
|
now cycle 8 is plumbing-only — NEON-by-recipe.
|
||||||
|
* Intra (bS=4) loop filter c->v_loop_filter_luma_intra stays on
|
||||||
|
the in-tree NEON .S code; daedalus's daedalus_h264_deblock_meta
|
||||||
|
only covers the non-intra path per its API docstring.
|
||||||
|
* Bit-exact against ff_h264_v_loop_filter_luma_neon (daedalus-fourier
|
||||||
|
cycle 8 green).
|
||||||
|
* No SONAME change, no Depends change.
|
||||||
|
|
||||||
|
-- Markus Fritsche <mfritsche@reauktion.de> Fri, 22 May 2026 12:30:00 +0000
|
||||||
|
|
||||||
|
ffmpeg-v4l2-request-fourier (2:8.1+rfourier+gb57fbbe-7) bookworm trixie; urgency=medium
|
||||||
|
|
||||||
|
* Add 0004-h264-idct8-daedalus-fourier.patch — H264DSPContext.idct8_add
|
||||||
|
(per-block 8x8 IDCT, called from the High-profile intra-8x8-DCT
|
||||||
|
macroblock path in libavcodec/h264_mb.c) now dispatches through
|
||||||
|
daedalus_recipe_dispatch_h264_idct8 instead of
|
||||||
|
ff_h264_idct8_add_neon. Cycle 7 of the daedalus-v4l2#11 step 2
|
||||||
|
substitution arc — NEON-by-recipe, same pthread_once context the
|
||||||
|
cycle-6 IDCT 4x4 shim already owns.
|
||||||
|
* Bit-exact against ff_h264_idct8_add_neon (daedalus-fourier cycle 7
|
||||||
|
green; FFmpeg 8x8 block storage block[r + 8*c] matches daedalus
|
||||||
|
column-major convention).
|
||||||
|
* Bulk c->idct8_add4 (inter 8x8-DCT macroblocks) stays on the
|
||||||
|
in-tree NEON .S code; batched substitution lands later.
|
||||||
|
* No SONAME change, no Depends change.
|
||||||
|
|
||||||
|
-- Markus Fritsche <mfritsche@reauktion.de> Fri, 22 May 2026 10:30:00 +0000
|
||||||
|
|
||||||
|
ffmpeg-v4l2-request-fourier (2:8.1+rfourier+gb57fbbe-6) bookworm trixie; urgency=medium
|
||||||
|
|
||||||
|
* Drop --enable-libxml2 + libxml2 Depends — the Gitea
|
||||||
|
debian-aarch64 runner ships libxml2 ≥ 2.14 (SONAME 16) while
|
||||||
|
Debian trixie targets 2.12 (SONAME 2). -5 built fine, then
|
||||||
|
failed to load on higgs trixie:
|
||||||
|
dlopen(libavformat.so.62): libxml2.so.16:
|
||||||
|
cannot open shared object file
|
||||||
|
Neither the daedalus-v4l2 daemon (direct AVPacket feed —
|
||||||
|
libavformat used only for the in-tree v4l2request hwaccel
|
||||||
|
glue) nor mpv-fourier (Lua + yt-dlp + mpv's stream code do
|
||||||
|
DASH/HLS) nor firefox-fourier (gecko-media DASH demux)
|
||||||
|
consumes FFmpeg's libxml2-backed DASH demuxer, so dropping is
|
||||||
|
feature-neutral. Mirrors the libva trixie/runner ABI-skew
|
||||||
|
workaround documented in PR #62.
|
||||||
|
* CI workflow build-deps lose libxml2-dev for the same reason.
|
||||||
|
* No source code change beyond configure flags + Depends.
|
||||||
|
Substitution stays as PRs #76/#77 landed.
|
||||||
|
|
||||||
|
-- Markus Fritsche <mfritsche@reauktion.de> Thu, 21 May 2026 23:30:00 +0000
|
||||||
|
|
||||||
ffmpeg-v4l2-request-fourier (2:8.1+rfourier+gb57fbbe-5) bookworm trixie; urgency=medium
|
ffmpeg-v4l2-request-fourier (2:8.1+rfourier+gb57fbbe-5) bookworm trixie; urgency=medium
|
||||||
|
|
||||||
* pkgrel-only bump (3 → 5) to force a rebuild of the H.264 IDCT 4x4
|
* pkgrel-only bump (3 → 5) to force a rebuild of the H.264 IDCT 4x4
|
||||||
|
|||||||
Reference in New Issue
Block a user