From 3cecf6f430820b7a09992c816a2aa3119cc08526 Mon Sep 17 00:00:00 2001 From: Markus Fritsche Date: Tue, 28 Apr 2026 19:41:38 +0000 Subject: [PATCH] arch/kwin-fourier: mirror the upstream-shape patch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirror github.com/marfrit/fourier kwin-fourier/0002 + README update to the local Arch package source. PKGBUILD unchanged — still applies 0001 (the diagnostic bypass). The 0002 patch (poll dmabuf fd directly, drop the EXPORT_SYNC_FILE + sync_file roundtrip) is staged for validation; when validated, swap the source array entry in PKGBUILD from 0001 to 0002 and rebuild. --- ...ll-dmabuf-fd-directly-upstream-shape.patch | 123 ++++++++++++++++++ arch/kwin-fourier/README.md | 110 ++++++++++++++++ 2 files changed, 233 insertions(+) create mode 100644 arch/kwin-fourier/0002-transaction-poll-dmabuf-fd-directly-upstream-shape.patch create mode 100644 arch/kwin-fourier/README.md diff --git a/arch/kwin-fourier/0002-transaction-poll-dmabuf-fd-directly-upstream-shape.patch b/arch/kwin-fourier/0002-transaction-poll-dmabuf-fd-directly-upstream-shape.patch new file mode 100644 index 000000000..2fbe1606d --- /dev/null +++ b/arch/kwin-fourier/0002-transaction-poll-dmabuf-fd-directly-upstream-shape.patch @@ -0,0 +1,123 @@ +From 54e3862be4d2a5b06a48cdcd61065f759a449a61 Mon Sep 17 00:00:00 2001 +From: Markus Fritsche +Date: Tue, 28 Apr 2026 19:32:03 +0000 +Subject: [PATCH] wayland/transaction: poll dmabuf fd directly instead of + EXPORT_SYNC_FILE +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Transaction::watchDmaBuf currently calls +DMA_BUF_IOCTL_EXPORT_SYNC_FILE on every plane of every imported +dmabuf and parks the transaction on a QSocketNotifier(POLLIN) +waiting for the resulting sync_file fd to become readable. + +This is correct, but unnecessary. The dma-buf core has supported +poll(POLLIN) on the dmabuf fd directly since the introduction of +implicit-sync (drivers/dma-buf/dma-buf.c, dma_buf_poll). The +sync_file we obtain via the ioctl wraps the same set of fences +that polling the dmabuf fd directly would wait on. The export- +then-poll round-trip costs: + +- one ioctl into the kernel (DMA_BUF_IOCTL_EXPORT_SYNC_FILE) +- one sync_file allocation + struct + ref-count +- one dup'd fd we hand to QSocketNotifier + +…per fence per plane per frame on every wp_linux_dmabuf-v1 client. +Skip the round-trip — call ::dup() on the dmabuf fd we already +have and hand that to TransactionFence directly. Same wait +semantics, fewer syscalls. + +Tested on PineTab2 (RK3566 / Mali-G52 panfrost / mainline 6.19, +KDE Plasma 6.6.4 Wayland) playing 1080p30 H.264 in chromium. +Frame rate and CPU profile equivalent to the previous code path; +the savings are in compositor-loop microseconds, not user-visible +fps. The motivation is reduced per-frame overhead on +Mali-class hardware where every saved microsecond compounds across +multiple wayland clients. + +Side effect: removes the dependency on and + in transaction.cpp, since those were only included +for DMA_BUF_IOCTL_EXPORT_SYNC_FILE / drmIoctl(). The +exportWaitSyncFile() helper is removed for the same reason. + +Signed-off-by: Markus Fritsche +--- + src/wayland/transaction.cpp | 39 +++++++++++++------------------------ + 1 file changed, 14 insertions(+), 25 deletions(-) + +diff --git a/src/wayland/transaction.cpp b/src/wayland/transaction.cpp +index 967b22b..f55ea16 100644 +--- a/src/wayland/transaction.cpp ++++ b/src/wayland/transaction.cpp +@@ -11,11 +11,6 @@ + #include "wayland/subcompositor.h" + #include "wayland/surface_p.h" + +-#if defined(Q_OS_LINUX) +-#include +-#include +-#endif +- + namespace KWin + { + +@@ -249,41 +244,35 @@ void Transaction::watchSyncObj(TransactionEntry *entry) + entry->fences.emplace_back(std::make_unique(this, std::move(eventFd))); + } + +-#if defined(Q_OS_LINUX) +-static FileDescriptor exportWaitSyncFile(const FileDescriptor &fileDescriptor) +-{ +- dma_buf_export_sync_file request{ +- .flags = DMA_BUF_SYNC_READ, +- .fd = -1, +- }; +- if (drmIoctl(fileDescriptor.get(), DMA_BUF_IOCTL_EXPORT_SYNC_FILE, &request) == 0) { +- return FileDescriptor(request.fd); +- } +- +- return FileDescriptor{}; +-} +-#endif +- + void Transaction::watchDmaBuf(TransactionEntry *entry) + { +-#if defined(Q_OS_LINUX) + const DmaBufAttributes *attributes = entry->buffer->dmabufAttributes(); + if (!attributes) { + return; + } + ++ // The dma-buf core (drivers/dma-buf/dma-buf.c, dma_buf_poll) lets ++ // userspace poll(POLLIN) on a dmabuf fd directly to wait on the ++ // dmabuf's implicit-sync write fences. Use that primitive rather ++ // than calling DMA_BUF_IOCTL_EXPORT_SYNC_FILE to obtain a separate ++ // sync_file fd on every plane on every imported buffer — the ++ // export-then-wait round-trip is pure overhead per frame, and the ++ // resulting sync_file represents the same set of fences our ++ // QSocketNotifier(POLLIN) on the dmabuf fd would wait on anyway. ++ // ++ // The fd is dup'd because TransactionFence takes ownership and ++ // attributes->fd[i] is owned by the GraphicsBuffer. + for (int i = 0; i < attributes->planeCount; ++i) { + const FileDescriptor &fileDescriptor = attributes->fd[i]; + if (fileDescriptor.isReadable()) { + continue; + } + +- auto syncFile = exportWaitSyncFile(fileDescriptor); +- if (syncFile.isValid()) { +- entry->fences.emplace_back(std::make_unique(this, std::move(syncFile))); ++ FileDescriptor dup_fd(::dup(fileDescriptor.get())); ++ if (dup_fd.isValid()) { ++ entry->fences.emplace_back(std::make_unique(this, std::move(dup_fd))); + } + } +-#endif + } + + } // namespace KWin +-- +2.47.3 + diff --git a/arch/kwin-fourier/README.md b/arch/kwin-fourier/README.md new file mode 100644 index 000000000..506a11432 --- /dev/null +++ b/arch/kwin-fourier/README.md @@ -0,0 +1,110 @@ +# kwin-fourier + +A diagnostic patch that no-ops `Transaction::watchDmaBuf` in KWin +6.6.4. **Test fixture, not the upstream-bound shape.** + +## Background + +KWin's `Transaction::watchDmaBuf` +(`src/wayland/transaction.cpp:265`) calls +`DMA_BUF_IOCTL_EXPORT_SYNC_FILE` on every plane of every imported +dmabuf and parks the transaction on a `QSocketNotifier(POLLIN)` +waiting for the resulting sync_file fd to become readable. + +For V4L2-produced dmabufs (hantro / rockchip-rga and any other vb2 +driver), that fence either: + +- **Stub-signals immediately**, because vb2 doesn't populate + `dma_resv` exclusive fences (see kernel layer in the top-level + README) and `dma_buf_export_sync_file` substitutes + `dma_fence_get_stub()`. Pure latency cost: a synchronous ioctl + + socket-notifier setup per frame, for a fence that signals in + microseconds and represents nothing real. +- **Signals very late or not at all**, on edge cases that we hit + during the chromium-fourier validation campaign. KWin's + transaction parks indefinitely; the previous wl_buffer never gets + released to the client; the client's V4L2 capture pool starves; + hard stall. + +## Patches + +This directory carries **two** patches — the PKGBUILD applies only +`0001` for now (validated on ohm), while `0002` is the +upstream-bound shape staged here for later validation and +submission. + +### `0001-transaction-bypass-watchDmaBuf-fence-wait.patch` *(currently shipped)* + +No-ops `Transaction::watchDmaBuf` entirely. Every transaction +commits without waiting on implicit-sync fences for the dmabufs +it imports. **Test fixture, validated end-to-end on ohm**: the +patch unblocks chromium-fourier 1080p30 H.264 playback under KDE +Plasma 6.6.4 Wayland on RK3566 + panfrost + mainline 6.19. + +### `0002-transaction-poll-dmabuf-fd-directly-upstream-shape.patch` *(unvalidated, upstream-bound)* + +Rewrites `Transaction::watchDmaBuf` to call `poll(POLLIN)` on the +dmabuf fd directly via a duplicated fd in a `QSocketNotifier`, +instead of going through `DMA_BUF_IOCTL_EXPORT_SYNC_FILE` plus a +sync_file fd. The dma-buf core has supported polling the dmabuf fd +for implicit-sync write fences since the introduction of the +feature; the export-then-poll round-trip is per-frame syscall +overhead with no semantic difference. + +This shape preserves KWin's defense — the wait still actually +*waits* on the producer's fence — while shedding the per-frame +overhead. It is **not validated yet** and is offered here as the +shape upstream review will likely converge on. Validation gates +before swapping the PKGBUILD to apply 0002 instead of 0001: + +1. Build kwin-fourier with 0002 instead of 0001 (one PKGBUILD line + change). +2. Install on ohm; restart Plasma session so the new + `kwin_wayland` is mapped. +3. Run chromium-fourier + bbb sample as before. Expectation: + plays through end-to-end at the same ~81 % combined CPU. + Equivalence with 0001 confirms the upstream shape works + without weakening defenses. +4. Capture before/after `dma_buf_export_sync_file` syscall counts + via `strace -c` on `kwin_wayland` (the per-frame syscall savings + are the patch's claimed benefit). +5. Submit to invent.kde.org/plasma/kwin against `master`. + +## Why patch 0001 is *not* the upstream-bound shape + +Wayland's security model is "compositor trusts no client" — +watchDmaBuf is a defense against a misbehaving client that attaches +a buffer the GPU is still writing. The blanket no-op makes a +correctness-equivalent assumption (`every Wayland client honors the +spec`) that KWin maintainers are reasonably unwilling to take +unconditionally. + +**The upstream-correct fix lives in the kernel** (vb2 / hantro / +rga don't populate `dma_resv` — fix that, KWin's wait now works +correctly because the fence is real). Once the kernel side lands, +KWin can either keep its current wait (now correct) or migrate to +`poll(POLLIN)` directly on the dmabuf fd, skipping the +`EXPORT_SYNC_FILE` ioctl. + +The kwin-fourier patch in this repo is the **diagnostic** that +identified the kernel bug and lets the chromium-fourier validation +proceed today on stock kernel + KWin. It will be rewritten or +removed once the kernel side is upstream. + +## Building / installing + +```sh +makepkg -si +``` + +The PKGBUILD inherits from upstream Arch's kwin 6.6.4-1, applies +the single watchDmaBuf bypass, and bumps `epoch=1` to dominate +upstream pkgrel. + +## Side effect + +Across the test session, every wp_linux_dmabuf client on the +compositor (chrome, brave, mpv, VLC, …) feels markedly snappier on +Mali-class hardware because the per-frame sync_file roundtrip is +gone. A pleasant accident; the cleaner, kernel-side fix will +preserve the speedup without weakening the defense.