/* * iter16 winding-order regression probe for PanVk-Bifrost. * * Phase 3 of iter16. The 162 CTS dEQP-VK.transform_feedback.simple.winding_* * failures (catalogued in iter15) all share the same root cause: iter13's * pan_nir_lower_xfb captures one entry per VS invocation, which for non-LIST * topologies gives ONE OUTPUT PER INPUT VERTEX. The Vulkan spec requires * primitive-decomposed capture: an N-vertex triangle strip must produce * 3*(N-2) captured entries with the right per-primitive winding order. * * This probe exercises the canonical case: triangle strip with 8 input * vertices, expecting 18 captured entries arranged as 6 triangles. The * verifier accepts any rotation within each primitive (per CTS's rule) * but enforces the winding direction. * * Pre-iter16 behavior (current iter13/r3 driver): captured count = 8 * → PROBE FAILS (under-capture). * Post-iter16 behavior: captured count = 18 in decomposed order * → PROBE PASSES. * * Parameterized so we can add LINE_STRIP, TRIANGLE_FAN, *_ADJACENCY tests * as the fix expands in Phase 4. For now, only TRIANGLE_STRIP is wired up. */ #include #include #include #include #include #include #include #define VSPV_PATH "probe_winding.vert.spv" #define STEP(name) do { fprintf(stderr, "[step] " name "\n"); fflush(stderr); } while (0) #define VK_CHECK(call) do { \ VkResult _r = (call); \ if (_r != VK_SUCCESS) { \ fprintf(stderr, "[fail] " #call " => %d at %s:%d\n", \ (int)_r, __FILE__, __LINE__); \ exit(2); \ } \ } while (0) /* ---- Per-topology expected-output helper (mirrors CTS) ---- */ /* * For input vertex count N and topology T, returns the decomposed primitive * count and per-primitive vertex layout. CTS test logic uses identical lambdas * in vktTransformFeedbackSimpleTests.cpp around line 1241. */ struct topo_decomp { VkPrimitiveTopology topology; const char *name; uint32_t verts_per_prim; uint32_t (*prim_count)(uint32_t input_count); /* Fills out[verts_per_prim] with the input-vertex-IDs that should appear * in primitive prim_idx (in CTS winding order; rotations are accepted at * verify time). */ void (*expected)(uint32_t prim_idx, uint32_t *out); }; /* TRIANGLE_STRIP: 3*(N-2) outputs. * Even prim i: {i, i+1, i+2} * Odd prim i: {i, i+2, i+1} */ static uint32_t prim_count_tri_strip(uint32_t n) { return (n >= 2) ? (n - 2) : 0; } static void expected_tri_strip(uint32_t i, uint32_t *out) { uint32_t iMod2 = i & 1u; out[0] = i; out[1] = i + 1 + iMod2; out[2] = i + 2 - iMod2; } /* LINE_STRIP: 2*(N-1) outputs. Each prim i: {i, i+1} */ static uint32_t prim_count_line_strip(uint32_t n) { return (n >= 1) ? (n - 1) : 0; } static void expected_line_strip(uint32_t i, uint32_t *out) { out[0] = i; out[1] = i + 1u; } /* TRIANGLE_FAN: 3*(N-2) outputs. Each prim i: {i+1, i+2, 0} */ static uint32_t prim_count_tri_fan(uint32_t n) { return (n >= 2) ? (n - 2) : 0; } static void expected_tri_fan(uint32_t i, uint32_t *out) { out[0] = i + 1u; out[1] = i + 2u; out[2] = 0u; } static const struct topo_decomp TOPO_TESTS[] = { { VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP, "TRIANGLE_STRIP", 3, prim_count_tri_strip, expected_tri_strip }, { VK_PRIMITIVE_TOPOLOGY_LINE_STRIP, "LINE_STRIP", 2, prim_count_line_strip, expected_line_strip }, { VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN, "TRIANGLE_FAN", 3, prim_count_tri_fan, expected_tri_fan }, }; #define NUM_TOPO_TESTS (sizeof(TOPO_TESTS) / sizeof(TOPO_TESTS[0])) /* ---- Vulkan plumbing ---- */ static uint32_t *read_spv(const char *path, size_t *out_bytes) { FILE *f = fopen(path, "rb"); if (!f) { fprintf(stderr, "[fail] open %s: %s\n", path, strerror(errno)); exit(3); } fseek(f, 0, SEEK_END); long n = ftell(f); fseek(f, 0, SEEK_SET); uint32_t *buf = malloc((size_t)n); fread(buf, 1, (size_t)n, f); fclose(f); *out_bytes = (size_t)n; return buf; } static uint32_t pick_host_visible(const VkPhysicalDeviceMemoryProperties *mp, uint32_t type_bits) { VkMemoryPropertyFlags want = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT; for (uint32_t i = 0; i < mp->memoryTypeCount; i++) { if ((type_bits & (1u << i)) && (mp->memoryTypes[i].propertyFlags & want) == want) return i; } fprintf(stderr, "[fail] no HOST_VISIBLE+COHERENT memtype\n"); exit(4); } /* ---- Verifier (rotation-aware, mirrors CTS verifyVertexDataWithWinding) ---- */ /* Returns 1 if got[verts_per_prim] is a rotation of ref[verts_per_prim], 0 else. */ static int rotations_match(const uint32_t *ref, const uint32_t *got, uint32_t vpp) { for (uint32_t start = 0; start < vpp; start++) { int ok = 1; for (uint32_t v = 0; v < vpp; v++) { uint32_t r = ref[(start + v) % vpp]; if (r != got[v]) { ok = 0; break; } } if (ok) return 1; } return 0; } /* Returns number of mismatched primitives. Prints details for each mismatch. */ static int verify_winding(const struct topo_decomp *t, uint32_t input_count, const uint32_t *got, uint32_t got_count) { uint32_t expected_prims = t->prim_count(input_count); uint32_t expected_count = expected_prims * t->verts_per_prim; if (got_count != expected_count) { fprintf(stderr, "[diff] %s: captured count %u, expected %u " "(%u prims × %u verts)\n", t->name, got_count, expected_count, expected_prims, t->verts_per_prim); return -1; } int mismatches = 0; for (uint32_t p = 0; p < expected_prims; p++) { uint32_t ref[8] = {0}; t->expected(p, ref); const uint32_t *prim_got = got + p * t->verts_per_prim; if (!rotations_match(ref, prim_got, t->verts_per_prim)) { fprintf(stderr, "[diff] %s prim %u: expected rotation of {", t->name, p); for (uint32_t v = 0; v < t->verts_per_prim; v++) fprintf(stderr, "%s%u", v ? "," : "", ref[v]); fprintf(stderr, "} got {"); for (uint32_t v = 0; v < t->verts_per_prim; v++) fprintf(stderr, "%s%u", v ? "," : "", prim_got[v]); fprintf(stderr, "}\n"); mismatches++; } } return mismatches; } /* ---- Per-topology test ---- */ static int run_one_topology(VkDevice dev, VkQueue queue, uint32_t qfam, VkRenderPass dummy_rp, PFN_vkCmdBindTransformFeedbackBuffersEXT pBindXfb, PFN_vkCmdBeginTransformFeedbackEXT pBeginXfb, PFN_vkCmdEndTransformFeedbackEXT pEndXfb, PFN_vkCmdBeginRenderingKHR pBeginRendering, PFN_vkCmdEndRenderingKHR pEndRendering, VkPhysicalDeviceMemoryProperties *mp, VkShaderModule vsm, const struct topo_decomp *t, uint32_t input_count) { /* Capacity: expected_prims × verts_per_prim × 4. Pad to 64 entries * (256 bytes) so iter13's under-capture is visible (sentinel-filled tail). */ const uint32_t buf_words = 64; const VkDeviceSize buf_bytes = buf_words * sizeof(uint32_t); fprintf(stderr, "\n=== %s with %u input verts ===\n", t->name, input_count); /* XFB capture buffer */ VkBufferCreateInfo bci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .size = buf_bytes, .usage = VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, }; VkBuffer xfb_buf; VK_CHECK(vkCreateBuffer(dev, &bci, NULL, &xfb_buf)); VkMemoryRequirements mr; vkGetBufferMemoryRequirements(dev, xfb_buf, &mr); VkMemoryAllocateInfo mai = { .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, .allocationSize = mr.size, .memoryTypeIndex = pick_host_visible(mp, mr.memoryTypeBits), }; VkDeviceMemory xfb_mem; VK_CHECK(vkAllocateMemory(dev, &mai, NULL, &xfb_mem)); VK_CHECK(vkBindBufferMemory(dev, xfb_buf, xfb_mem, 0)); void *mapped; VK_CHECK(vkMapMemory(dev, xfb_mem, 0, VK_WHOLE_SIZE, 0, &mapped)); /* Sentinel-fill so we can distinguish "captured 0xDEADBEEF" from * "GPU didn't write here" — under-capture leaves the tail at sentinel. */ uint32_t *u32 = (uint32_t *)mapped; for (uint32_t i = 0; i < buf_words; i++) u32[i] = 0xDEADBEEFu; /* Pipeline */ VkPipelineLayoutCreateInfo plci = { .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, }; VkPipelineLayout pl; VK_CHECK(vkCreatePipelineLayout(dev, &plci, NULL, &pl)); VkPipelineShaderStageCreateInfo stages[1] = { { .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, .stage = VK_SHADER_STAGE_VERTEX_BIT, .module = vsm, .pName = "main" }, }; VkPipelineVertexInputStateCreateInfo vi = { .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, }; VkPipelineInputAssemblyStateCreateInfo ia = { .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, .topology = t->topology, }; VkViewport vp_dummy = { 0, 0, 1, 1, 0.0f, 1.0f }; VkRect2D sc_dummy = {{0,0}, {1,1}}; VkPipelineViewportStateCreateInfo vp = { .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, .viewportCount = 1, .pViewports = &vp_dummy, .scissorCount = 1, .pScissors = &sc_dummy, }; VkPipelineRasterizationStateCreateInfo rs = { .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, .rasterizerDiscardEnable = VK_TRUE, .polygonMode = VK_POLYGON_MODE_FILL, .cullMode = VK_CULL_MODE_NONE, .lineWidth = 1.0f, }; VkPipelineMultisampleStateCreateInfo ms = { .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT, }; VkPipelineRenderingCreateInfoKHR pri = { .sType = VK_STRUCTURE_TYPE_PIPELINE_RENDERING_CREATE_INFO_KHR, .colorAttachmentCount = 0, }; VkGraphicsPipelineCreateInfo gpci = { .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, .pNext = &pri, .stageCount = 1, .pStages = stages, .pVertexInputState = &vi, .pInputAssemblyState = &ia, .pViewportState = &vp, .pRasterizationState = &rs, .pMultisampleState = &ms, .layout = pl, }; VkPipeline pipe; VK_CHECK(vkCreateGraphicsPipelines(dev, VK_NULL_HANDLE, 1, &gpci, NULL, &pipe)); /* Command buffer */ VkCommandPoolCreateInfo cpoolci = { .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, .queueFamilyIndex = qfam, }; VkCommandPool cpool; VK_CHECK(vkCreateCommandPool(dev, &cpoolci, NULL, &cpool)); VkCommandBufferAllocateInfo cbai = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, .commandPool = cpool, .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, .commandBufferCount = 1, }; VkCommandBuffer cb; VK_CHECK(vkAllocateCommandBuffers(dev, &cbai, &cb)); VkCommandBufferBeginInfo cbbi = { .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, }; VK_CHECK(vkBeginCommandBuffer(cb, &cbbi)); VkDeviceSize xfb_off = 0, xfb_size = buf_bytes; pBindXfb(cb, 0, 1, &xfb_buf, &xfb_off, &xfb_size); VkRenderingInfoKHR ri = { .sType = VK_STRUCTURE_TYPE_RENDERING_INFO_KHR, .renderArea = {{0,0}, {1,1}}, .layerCount = 1, .colorAttachmentCount = 0, }; pBeginRendering(cb, &ri); vkCmdBindPipeline(cb, VK_PIPELINE_BIND_POINT_GRAPHICS, pipe); pBeginXfb(cb, 0, 0, NULL, NULL); vkCmdDraw(cb, input_count, 1, 0, 0); pEndXfb(cb, 0, 0, NULL, NULL); pEndRendering(cb); VkBufferMemoryBarrier bb = { .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, .srcAccessMask = VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT, .dstAccessMask = VK_ACCESS_HOST_READ_BIT, .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .buffer = xfb_buf, .offset = 0, .size = VK_WHOLE_SIZE, }; vkCmdPipelineBarrier(cb, VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT, VK_PIPELINE_STAGE_HOST_BIT, 0, 0, NULL, 1, &bb, 0, NULL); VK_CHECK(vkEndCommandBuffer(cb)); /* Submit + wait */ VkFenceCreateInfo fci = { .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO }; VkFence fence; VK_CHECK(vkCreateFence(dev, &fci, NULL, &fence)); VkSubmitInfo si = { .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, .commandBufferCount = 1, .pCommandBuffers = &cb, }; VK_CHECK(vkQueueSubmit(queue, 1, &si, fence)); VkResult wr = vkWaitForFences(dev, 1, &fence, VK_TRUE, 10ULL * 1000 * 1000 * 1000); if (wr != VK_SUCCESS) { fprintf(stderr, "[fail] %s: vkWaitForFences => %d\n", t->name, wr); return -1; } /* Read back: count contiguous non-sentinel words from offset 0. */ uint32_t captured_count = 0; while (captured_count < buf_words && u32[captured_count] != 0xDEADBEEFu) captured_count++; fprintf(stderr, "[info] %s: captured %u entries (sentinel-stopped)\n", t->name, captured_count); /* Print first few for debugging */ if (captured_count > 0) { fprintf(stderr, "[info] first 8: "); for (uint32_t i = 0; i < captured_count && i < 8; i++) fprintf(stderr, "%u%s", u32[i], (i + 1 < 8 && i + 1 < captured_count) ? "," : ""); fprintf(stderr, "\n"); } int mismatches = verify_winding(t, input_count, u32, captured_count); /* Teardown */ vkUnmapMemory(dev, xfb_mem); vkDestroyFence(dev, fence, NULL); vkDestroyCommandPool(dev, cpool, NULL); vkDestroyPipeline(dev, pipe, NULL); vkDestroyPipelineLayout(dev, pl, NULL); vkDestroyBuffer(dev, xfb_buf, NULL); vkFreeMemory(dev, xfb_mem, NULL); (void)dummy_rp; return mismatches; } /* ---- main: bring up Vulkan, run all topology tests ---- */ int main(int argc, char **argv) { /* Optional CLI: limit to one topology by name */ const char *only = NULL; if (argc > 1) only = argv[1]; STEP("vkCreateInstance"); VkApplicationInfo app = { .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO, .pApplicationName = "panvk-bifrost iter16 winding probe", .apiVersion = VK_API_VERSION_1_0, }; const char *inst_exts[] = { "VK_KHR_get_physical_device_properties2" }; VkInstanceCreateInfo ici = { .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, .pApplicationInfo = &app, .enabledExtensionCount = 1, .ppEnabledExtensionNames = inst_exts, }; VkInstance inst; VK_CHECK(vkCreateInstance(&ici, NULL, &inst)); uint32_t n_phys = 0; VK_CHECK(vkEnumeratePhysicalDevices(inst, &n_phys, NULL)); VkPhysicalDevice *phys = calloc(n_phys, sizeof(*phys)); VK_CHECK(vkEnumeratePhysicalDevices(inst, &n_phys, phys)); VkPhysicalDevice gpu = phys[0]; VkPhysicalDeviceMemoryProperties mp; vkGetPhysicalDeviceMemoryProperties(gpu, &mp); uint32_t n_qf = 0; vkGetPhysicalDeviceQueueFamilyProperties(gpu, &n_qf, NULL); VkQueueFamilyProperties *qfp = calloc(n_qf, sizeof(*qfp)); vkGetPhysicalDeviceQueueFamilyProperties(gpu, &n_qf, qfp); uint32_t qfam = UINT32_MAX; for (uint32_t i = 0; i < n_qf; i++) if (qfp[i].queueFlags & VK_QUEUE_GRAPHICS_BIT) { qfam = i; break; } STEP("vkCreateDevice"); const char *dev_exts[] = { "VK_KHR_multiview", "VK_KHR_maintenance2", "VK_KHR_create_renderpass2", "VK_KHR_depth_stencil_resolve", "VK_KHR_dynamic_rendering", "VK_EXT_transform_feedback", }; VkPhysicalDeviceTransformFeedbackFeaturesEXT enable_xfb = { .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_FEATURES_EXT, .transformFeedback = VK_TRUE, .geometryStreams = VK_FALSE, }; VkPhysicalDeviceDynamicRenderingFeaturesKHR dyn_feat = { .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DYNAMIC_RENDERING_FEATURES_KHR, .pNext = &enable_xfb, .dynamicRendering = VK_TRUE, }; float qprio = 1.0f; VkDeviceQueueCreateInfo qci = { .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, .queueFamilyIndex = qfam, .queueCount = 1, .pQueuePriorities = &qprio, }; VkDeviceCreateInfo dci = { .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, .pNext = &dyn_feat, .queueCreateInfoCount = 1, .pQueueCreateInfos = &qci, .enabledExtensionCount = sizeof(dev_exts)/sizeof(dev_exts[0]), .ppEnabledExtensionNames = dev_exts, }; VkDevice dev; VK_CHECK(vkCreateDevice(gpu, &dci, NULL, &dev)); VkQueue queue; vkGetDeviceQueue(dev, qfam, 0, &queue); PFN_vkCmdBindTransformFeedbackBuffersEXT pBindXfb = (PFN_vkCmdBindTransformFeedbackBuffersEXT)vkGetDeviceProcAddr( dev, "vkCmdBindTransformFeedbackBuffersEXT"); PFN_vkCmdBeginTransformFeedbackEXT pBeginXfb = (PFN_vkCmdBeginTransformFeedbackEXT)vkGetDeviceProcAddr( dev, "vkCmdBeginTransformFeedbackEXT"); PFN_vkCmdEndTransformFeedbackEXT pEndXfb = (PFN_vkCmdEndTransformFeedbackEXT)vkGetDeviceProcAddr( dev, "vkCmdEndTransformFeedbackEXT"); PFN_vkCmdBeginRenderingKHR pBeginRendering = (PFN_vkCmdBeginRenderingKHR)vkGetDeviceProcAddr(dev, "vkCmdBeginRenderingKHR"); PFN_vkCmdEndRenderingKHR pEndRendering = (PFN_vkCmdEndRenderingKHR)vkGetDeviceProcAddr(dev, "vkCmdEndRenderingKHR"); /* Shader (shared across topology iterations) */ size_t spv_bytes = 0; uint32_t *spv = read_spv(VSPV_PATH, &spv_bytes); VkShaderModuleCreateInfo smci = { .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, .codeSize = spv_bytes, .pCode = spv, }; VkShaderModule vsm; VK_CHECK(vkCreateShaderModule(dev, &smci, NULL, &vsm)); free(spv); /* Run each topology test */ int total_fail = 0; int total_tested = 0; for (size_t i = 0; i < NUM_TOPO_TESTS; i++) { const struct topo_decomp *t = &TOPO_TESTS[i]; if (only && strcmp(only, t->name) != 0) continue; total_tested++; int rc = run_one_topology(dev, queue, qfam, VK_NULL_HANDLE, pBindXfb, pBeginXfb, pEndXfb, pBeginRendering, pEndRendering, &mp, vsm, t, 8u); if (rc != 0) { total_fail++; fprintf(stderr, "[FAIL] %s: %d mismatch(es)\n", t->name, rc); } else { fprintf(stderr, "[PASS] %s\n", t->name); } } vkDestroyShaderModule(dev, vsm, NULL); vkDestroyDevice(dev, NULL); vkDestroyInstance(inst, NULL); free(phys); free(qfp); fprintf(stderr, "\n=== SUMMARY: %d/%d topology tests passed ===\n", total_tested - total_fail, total_tested); return total_fail == 0 ? 0 : 1; }