v3d_runner: buffer pool for QPU dispatch hot path
Per the QPU-default substrate decree 2026-05-23: the per-dispatch
vkAllocateMemory in dispatch_*_qpu was the biggest single fixable
contributor to QPU dispatch overhead. This commit pools v3d_buffer
allocations by power-of-2 size class, so the second and subsequent
dispatches hit a freelist instead of paying ~10-50us of Mesa-V3D7
memory-allocation cost per call.
API additions (v3d_runner.h):
- v3d_runner_acquire_buffer(): pulls from per-bucket freelist;
falls through to v3d_runner_create_buffer() on miss.
- v3d_runner_release_buffer(): pushes back onto the freelist; the
backing VkBuffer/VkDeviceMemory are only actually released
(vkFreeMemory) in v3d_runner_destroy().
- v3d_runner_pool_total_bytes(): diagnostic watermark.
Size classes are 2^8..2^23 (256 B to 8 MiB). Oversize requests fall
through to the non-pooled path (vkAllocateMemory) at both ends (acquire
and release) — the pool stays correct; it just degenerates to the old
behaviour for those calls.
Migration: daedalus_core.c dispatch_*_qpu paths globally swap
create_buffer → acquire_buffer and destroy_buffer → release_buffer.
All five QPU dispatch functions (idct8 / lpf / mc_8h / cdef /
h264_deblock) now reuse buffers across calls. test_api_idct stays
bit-exact (4096/4096 bytes on CPU/QPU/AUTO substrates on hertz).
Microbench (tests/bench_pool_overhead.c) on hertz (Pi 5,
6.18.29+rpt-rpi-2712, V3D 7.1):
call 0: 434.89 us (cold — 3x vkAllocateMemory)
call 1: 100.06 us (pool hit on all 3 buffers)
steady-state:
p50: 76.44 us
p90: 90.52 us
mean: 77.95 us
first-call / steady-state ratio: 5.7x
The remaining ~76us steady-state is dominated by vkQueueWaitIdle +
shader execution + per-call descriptor-set update + command-buffer
allocation — to be addressed in follow-on tasks 161 (persistent cmdbuf)
and 162 (dmabuf import for dst, which eliminates the memcpy in/out).
Refs daedalus-fourier task #160.
This commit is contained in:
+43
-43
@@ -291,13 +291,13 @@ static int dispatch_idct8_qpu(daedalus_ctx *ctx,
|
||||
}
|
||||
|
||||
v3d_buffer buf_coeffs = {0}, buf_dst = {0}, buf_meta = {0};
|
||||
if (v3d_runner_create_buffer(ctx->runner, coeff_bytes, &buf_coeffs)) return -1;
|
||||
if (v3d_runner_create_buffer(ctx->runner, max_byte_touched, &buf_dst)) {
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); return -1;
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, coeff_bytes, &buf_coeffs)) return -1;
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, max_byte_touched, &buf_dst)) {
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_coeffs); return -1;
|
||||
}
|
||||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta)) {
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs); return -1;
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &buf_meta)) {
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_coeffs); return -1;
|
||||
}
|
||||
|
||||
/* Upload. Coeffs and meta are straight copies. Dst we copy the
|
||||
@@ -344,15 +344,15 @@ static int dispatch_idct8_qpu(daedalus_ctx *ctx,
|
||||
/* Read-back dst. */
|
||||
memcpy(dst, buf_dst.mapped, max_byte_touched);
|
||||
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_meta);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_coeffs);
|
||||
return 0;
|
||||
|
||||
fail:
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_coeffs);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_meta);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_coeffs);
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -424,9 +424,9 @@ static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8,
|
||||
size_t dst_window_size = hi - lo;
|
||||
|
||||
v3d_buffer buf_meta = {0}, buf_dst = {0};
|
||||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &buf_meta)) return -1;
|
||||
if (v3d_runner_create_buffer(ctx->runner, dst_window_size, &buf_dst)) {
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_meta); return -1;
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &buf_meta)) return -1;
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, dst_window_size, &buf_dst)) {
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_meta); return -1;
|
||||
}
|
||||
|
||||
memcpy(buf_dst.mapped, dst + lo, dst_window_size);
|
||||
@@ -468,12 +468,12 @@ static int dispatch_lpf_qpu(daedalus_ctx *ctx, int wd_8,
|
||||
|
||||
memcpy(dst + lo, buf_dst.mapped, dst_window_size);
|
||||
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_meta);
|
||||
return 0;
|
||||
fail:
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &buf_meta);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_dst);
|
||||
v3d_runner_release_buffer(ctx->runner, &buf_meta);
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -509,9 +509,9 @@ static int dispatch_mc_8h_qpu(daedalus_ctx *ctx,
|
||||
}
|
||||
|
||||
v3d_buffer bm = {0}, bd = {0}, bs = {0};
|
||||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
|
||||
if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
|
||||
if (v3d_runner_create_buffer(ctx->runner, src_max, &bs)) { v3d_runner_destroy_buffer(ctx->runner, &bd); v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, src_max, &bs)) { v3d_runner_release_buffer(ctx->runner, &bd); v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
|
||||
|
||||
memcpy(bs.mapped, src, src_max);
|
||||
memcpy(bd.mapped, dst, dst_max);
|
||||
@@ -545,14 +545,14 @@ static int dispatch_mc_8h_qpu(daedalus_ctx *ctx,
|
||||
|
||||
memcpy(dst, bd.mapped, dst_max);
|
||||
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bs);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||
v3d_runner_release_buffer(ctx->runner, &bs);
|
||||
v3d_runner_release_buffer(ctx->runner, &bd);
|
||||
v3d_runner_release_buffer(ctx->runner, &bm);
|
||||
return 0;
|
||||
fail:
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bs);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||
v3d_runner_release_buffer(ctx->runner, &bs);
|
||||
v3d_runner_release_buffer(ctx->runner, &bd);
|
||||
v3d_runner_release_buffer(ctx->runner, &bm);
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -588,9 +588,9 @@ static int dispatch_cdef_qpu(daedalus_ctx *ctx,
|
||||
size_t tmp_bytes = tmp_max_u16 * sizeof(uint16_t);
|
||||
|
||||
v3d_buffer bm = {0}, bd = {0}, bt = {0};
|
||||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
|
||||
if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
|
||||
if (v3d_runner_create_buffer(ctx->runner, tmp_bytes, &bt)) { v3d_runner_destroy_buffer(ctx->runner, &bd); v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, tmp_bytes, &bt)) { v3d_runner_release_buffer(ctx->runner, &bd); v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
|
||||
|
||||
/* tmp may need padding before block-origin offset (caller-allocated). Just
|
||||
* copy from caller; we assume meta[i].tmp_off_u16 is consistent with how
|
||||
@@ -630,14 +630,14 @@ static int dispatch_cdef_qpu(daedalus_ctx *ctx,
|
||||
|
||||
memcpy(dst, bd.mapped, dst_max);
|
||||
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bt);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||
v3d_runner_release_buffer(ctx->runner, &bt);
|
||||
v3d_runner_release_buffer(ctx->runner, &bd);
|
||||
v3d_runner_release_buffer(ctx->runner, &bm);
|
||||
return 0;
|
||||
fail:
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bt);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||
v3d_runner_release_buffer(ctx->runner, &bt);
|
||||
v3d_runner_release_buffer(ctx->runner, &bd);
|
||||
v3d_runner_release_buffer(ctx->runner, &bm);
|
||||
return -1;
|
||||
}
|
||||
|
||||
@@ -670,8 +670,8 @@ static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx,
|
||||
}
|
||||
|
||||
v3d_buffer bm = {0}, bd = {0};
|
||||
if (v3d_runner_create_buffer(ctx->runner, meta_bytes, &bm)) return -1;
|
||||
if (v3d_runner_create_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_destroy_buffer(ctx->runner, &bm); return -1; }
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, meta_bytes, &bm)) return -1;
|
||||
if (v3d_runner_acquire_buffer(ctx->runner, dst_max, &bd)) { v3d_runner_release_buffer(ctx->runner, &bm); return -1; }
|
||||
|
||||
memcpy(bd.mapped, dst, dst_max);
|
||||
uint32_t *m = bm.mapped;
|
||||
@@ -706,12 +706,12 @@ static int dispatch_h264_deblock_qpu(daedalus_ctx *ctx,
|
||||
|
||||
memcpy(dst, bd.mapped, dst_max);
|
||||
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||
v3d_runner_release_buffer(ctx->runner, &bd);
|
||||
v3d_runner_release_buffer(ctx->runner, &bm);
|
||||
return 0;
|
||||
fail:
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bd);
|
||||
v3d_runner_destroy_buffer(ctx->runner, &bm);
|
||||
v3d_runner_release_buffer(ctx->runner, &bd);
|
||||
v3d_runner_release_buffer(ctx->runner, &bm);
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user