#define TYPE_NAME  "vec4"
#define TYPE_SIZE  (TYPE_ELEMS*4)
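/* insert_first(): emits the GLSL that samples the centre pixel (s1) and the
 * four offset pixels (s2[0..3]) for the current research offsets, then packs
 * their squared differences into one TYPE_NAME (vec4) lane per offset. */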
GLSLF(4, s1    = texture(input_img[%i], pos + ivec2(%i + %s, %i + %s))[%i];
      ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
GLSLF(4, s2[0] = texture(input_img[%i], pos + offs[0] + ivec2(%i + %s, %i + %s))[%i];
      ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
GLSLF(4, s2[1] = texture(input_img[%i], pos + offs[1] + ivec2(%i + %s, %i + %s))[%i];
      ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
GLSLF(4, s2[2] = texture(input_img[%i], pos + offs[2] + ivec2(%i + %s, %i + %s))[%i];
      ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
GLSLF(4, s2[3] = texture(input_img[%i], pos + offs[3] + ivec2(%i + %s, %i + %s))[%i];
      ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
GLSLC(4, s2 = (s1 - s2) * (s1 - s2); );
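/* Horizontal pass: each invocation owns nb_rows rows of the integral image
 * and accumulates a running prefix_sum along x, rewriting dst.v[] in place
 * so every element ends up holding the sum of everything to its left. */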
GLSLF(1, pos.y = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
GLSLC(1, barrier(); );
GLSLC(2, #pragma unroll(1) );
GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
GLSLC(3, prefix_sum = DTYPE(0); );
GLSLC(3, offset = int_stride * uint64_t(pos.y + r); );
GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
GLSLF(3, for (pos.x = 0; pos.x < width[%i]; pos.x++) { ,plane);
insert_first(shd, 0, "r", 0, plane, comp);
GLSLC(4, s2 = dst.v[pos.x]; );
GLSLC(4, dst.v[pos.x] = s2 + prefix_sum; );
GLSLC(4, prefix_sum += s2; );
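/* Vertical pass: the same in-place prefix sum, accumulated down the columns
 * through the per-row psum[] registers, completing the summed-area table. */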
GLSLF(1, pos.x = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
GLSLC(1, #pragma unroll(1) );
GLSLF(1, for (r = 0; r < %i; r++) ,nb_rows);
GLSLC(2, psum[r] = DTYPE(0); );
GLSLC(1, barrier(); );
GLSLF(2, for (pos.y = 0; pos.y < height[%i]; pos.y++) { ,plane);
GLSLC(3, offset = int_stride * uint64_t(pos.y); );
GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
GLSLC(3, #pragma unroll(1) );
GLSLF(3, for (r = 0; r < %i; r++) { ,nb_rows);
insert_first(shd, 0, "r", 1, plane, comp);
GLSLC(4, s2 = dst.v[pos.x + r]; );
GLSLC(4, dst.v[pos.x + r] = s2 + psum[r]; );
GLSLC(4, psum[r] += s2; );
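/* Weights pass: for each pixel, the patch difference over the patch box is
 * read back from the summed-area table with four corner taps (d + a - b - c),
 * turned into an NLMeans weight w = exp(patch_diff * strength), and
 * accumulated, together with w*src, into the per-plane weights/sums buffers;
 * atomics are used when t > 1 dispatches may write the same pixel. */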
int t, int dst_comp, int plane, int comp)
GLSLF(1, p = patch_size[%i]; ,dst_comp);
GLSLC(1, barrier(); );
GLSLF(2, if (gl_GlobalInvocationID.x*%i >= width[%i]) ,nb_rows, plane);
GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
GLSLF(3, pos.x = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
GLSLF(1, for (pos.x = 0; pos.x < width[%i]; pos.x++) { ,plane);
GLSLF(2, if (gl_GlobalInvocationID.x*%i >= height[%i]) ,nb_rows, plane);
GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
GLSLF(3, pos.y = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
GLSLC(3, a = DTYPE(0); );
GLSLC(3, b = DTYPE(0); );
GLSLC(3, c = DTYPE(0); );
GLSLC(3, d = DTYPE(0); );
GLSLC(3, lt = ((pos.x - p) < 0) || ((pos.y - p) < 0); );
GLSLF(3, src[0] = texture(input_img[%i], pos + offs[0])[%i]; ,plane, comp);
GLSLF(3, src[1] = texture(input_img[%i], pos + offs[1])[%i]; ,plane, comp);
GLSLF(3, src[2] = texture(input_img[%i], pos + offs[2])[%i]; ,plane, comp);
GLSLF(3, src[3] = texture(input_img[%i], pos + offs[3])[%i]; ,plane, comp);
GLSLC(3, if (lt == false) { );
GLSLC(3, offset = int_stride * uint64_t(pos.y - p); );
GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
GLSLC(4, a = dst.v[pos.x - p]; );
GLSLC(4, c = dst.v[pos.x + p]; );
GLSLC(3, offset = int_stride * uint64_t(pos.y + p); );
GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
GLSLC(4, b = dst.v[pos.x - p]; );
GLSLC(4, d = dst.v[pos.x + p]; );
GLSLC(3, patch_diff = d + a - b - c; );
GLSLF(3, w = exp(patch_diff * strength[%i]); ,dst_comp);
GLSLC(3, w_sum = w[0] + w[1] + w[2] + w[3]; );
GLSLC(3, sum = dot(w, src*255); );
GLSLF(3, atomicAdd(weights_%i[pos.y*ws_stride[%i] + pos.x], w_sum); ,dst_comp, dst_comp);
GLSLF(3, atomicAdd(sums_%i[pos.y*ws_stride[%i] + pos.x], sum); ,dst_comp, dst_comp);
GLSLF(3, weights_%i[pos.y*ws_stride[%i] + pos.x] += w_sum; ,dst_comp, dst_comp);
GLSLF(3, sums_%i[pos.y*ws_stride[%i] + pos.x] += sum; ,dst_comp, dst_comp);
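/* Push constants for the weights shader; the members must match the
 * std430 pushConstants block emitted in the pipeline init below. */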
typedef struct HorizontalPushData {
    uint32_t ws_stride[4];
    VkDeviceAddress integral_base;
    uint64_t integral_size;
    uint32_t xyoffs_start;
} HorizontalPushData;
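/* Workgroup sizing: start from the device's maxComputeWorkGroupSize limit
 * and, when a plane dimension exceeds it, raise wg_rows until
 * wg_size*wg_rows covers the largest dimension. */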
void *spv_opaque = NULL;
uint32_t max_wg = vkctx->props.properties.limits.maxComputeWorkGroupSize[0];
int wg_size, wg_rows;
if (max_wg > max_dim) {
} else if (max_wg < max_dim) {
    while (wg_size*wg_rows < max_dim)
VK_SHADER_STAGE_COMPUTE_BIT,
(const char *[]) { "GL_EXT_buffer_reference",
                   "GL_EXT_buffer_reference2" }, 2,
GLSLC(0, #extension GL_EXT_shader_atomic_float : require );
GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require );
GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) buffer DataBuffer { );
GLSLC(1, DTYPE v[]; );
GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
GLSLC(1, uvec4 ws_stride; );
GLSLC(1, ivec4 patch_size; );
GLSLC(1, vec4 strength; );
GLSLC(1, DataBuffer integral_base; );
GLSLC(1, uint64_t integral_size; );
GLSLC(1, uint64_t int_stride; );
GLSLC(1, uint xyoffs_start; );
VK_SHADER_STAGE_COMPUTE_BIT);
.type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.name = "weights_buffer_0",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float weights_0[];",
.name = "sums_buffer_0",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float sums_0[];",
.name = "weights_buffer_1",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float weights_1[];",
.name = "sums_buffer_1",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float sums_1[];",
.name = "weights_buffer_2",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float weights_2[];",
.name = "sums_buffer_2",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float sums_2[];",
.name = "weights_buffer_3",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float weights_3[];",
.name = "sums_buffer_3",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float sums_3[];",
.name = "xyoffsets_buffer",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.mem_quali = "readonly",
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "ivec2 xyoffsets[];",
GLSLC(1, float s1; );
GLSLC(1, DTYPE s2; );
GLSLC(1, DTYPE prefix_sum; );
GLSLF(1, DTYPE psum[%i]; ,*nb_rows);
GLSLC(1, DataBuffer integral_data; );
GLSLC(1, int invoc_idx = int(gl_WorkGroupID.z); );
GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset); );
GLSLC(1, DTYPE patch_diff; );
GLSLC(1, float w_sum; );
GLSLC(1, float sum; );
for (int i = 0; i < desc->nb_components; i++) {
RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
typedef struct DenoisePushData {
    uint32_t ws_stride[4];
void *spv_opaque = NULL;
VK_SHADER_STAGE_COMPUTE_BIT,
(const char *[]) { "GL_EXT_buffer_reference",
                   "GL_EXT_buffer_reference2" }, 2,
GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
GLSLC(1, uvec4 ws_stride; );
VK_SHADER_STAGE_COMPUTE_BIT);
.type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.name = "output_img",
.type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
.mem_quali = "writeonly",
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.name = "weights_buffer_0",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.mem_quali = "readonly",
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float weights_0[];",
.name = "sums_buffer_0",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.mem_quali = "readonly",
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float sums_0[];",
.name = "weights_buffer_1",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.mem_quali = "readonly",
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float weights_1[];",
.name = "sums_buffer_1",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.mem_quali = "readonly",
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float sums_1[];",
.name = "weights_buffer_2",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.mem_quali = "readonly",
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float weights_2[];",
.name = "sums_buffer_2",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.mem_quali = "readonly",
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float sums_2[];",
.name = "weights_buffer_3",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.mem_quali = "readonly",
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float weights_3[];",
.name = "sums_buffer_3",
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.mem_quali = "readonly",
.stages = VK_SHADER_STAGE_COMPUTE_BIT,
.buf_content = "float sums_3[];",
GLSLC(1, const ivec2 pos = ivec2(gl_GlobalInvocationID.xy); );
GLSLC(1, const uint plane = uint(gl_WorkGroupID.z); );
GLSLC(1, float w_sum; );
GLSLC(1, float sum; );
GLSLC(1, size = imageSize(output_img[plane]); );
for (int c = 0; c < desc->nb_components; c++) {
GLSLF(2, r[%i] = (sum + src[%i]*255) / (1.0 + w_sum) / 255; ,off, off);
GLSLC(1, imageStore(output_img[plane], pos, r); );
RET(spv->compile_shader(vkctx, spv, shd, &spv_data, &spv_len, "main", &spv_opaque));
int xcnt = 0, ycnt = 0;
int offsets_dispatched = 0, nb_dispatches = 0;
if (!(s->opts.r & 1)) {
if (!(s->opts.p & 1)) {
for (int i = 0; i < 4; i++) {
    double str = (s->opts.sc[i] > 1.0) ? s->opts.sc[i] : s->opts.s;
    int ps = (s->opts.pc[i] ? s->opts.pc[i] : s->opts.p);
    str = 255.0*255.0 / str;
    s->strength[i] = str;
    s->patch[i] = ps / 2;
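/* A window of radius rad holds (2*rad + 1)^2 pixels; the centre compares
 * against itself and is skipped, leaving (2*rad + 1)^2 - 1 offsets. */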
s->nb_offsets = (2*rad + 1)*(2*rad + 1) - 1;
s->xoffsets = av_malloc(s->nb_offsets*sizeof(*s->xoffsets));
s->yoffsets = av_malloc(s->nb_offsets*sizeof(*s->yoffsets));
for (int x = -rad; x <= rad; x++) {
    for (int y = -rad; y <= rad; y++) {
        s->xoffsets[xcnt++] = x;
        s->yoffsets[ycnt++] = y;
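/* Upload the offsets as interleaved (x, y) pairs so the shader can index
 * them directly as ivec2 xyoffsets[]. */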
VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT |
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT));
for (int i = 0; i < 2*s->nb_offsets; i += 2) {
    offsets_buf[i + 0] = s->xoffsets[i >> 1];
    offsets_buf[i + 1] = s->yoffsets[i >> 1];
645 "disabling dispatch parallelism\n");
649 spv = ff_vk_spirv_init();
RET(init_weights_pipeline(vkctx, &s->e, &s->shd_weights, s->sampler,
                          spv, s->vkctx.output_width, s->vkctx.output_height,
RET(init_denoise_pipeline(vkctx, &s->e, &s->shd_denoise, s->sampler,
&s->xyoffsets_buf, 0, s->xyoffsets_buf.size,
VK_FORMAT_UNDEFINED));
int wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS, s->opts.t);
wg_invoc = FFMIN(wg_invoc, vkctx->props.properties.limits.maxComputeWorkGroupCount[2]);
} while (offsets_dispatched < s->nb_offsets);
s->nb_offsets, nb_dispatches);
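/* denoise_pass(): waits on the weights/sums writes with a buffer barrier,
 * then dispatches the denoise shader, one workgroup layer per plane
 * (gl_WorkGroupID.z selects the plane). */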
VkBufferMemoryBarrier2 buf_bar[8];
DenoisePushData pd = {
    { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
VK_SHADER_STAGE_COMPUTE_BIT,
buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
    .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
    .srcStageMask = ws_vk->stage,
    .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
    .srcAccessMask = ws_vk->access,
    .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .buffer = ws_vk->buf,
vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
    .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
    .pBufferMemoryBarriers = buf_bar,
    .bufferMemoryBarrierCount = nb_buf_bar,
ws_vk->stage = buf_bar[0].dstStageMask;
ws_vk->access = buf_bar[0].dstAccessMask;
vk->CmdDispatch(exec->buf,
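/* Per-frame setup: plane sizes are aligned to the denoise shader's local
 * size, the weights/sums allocations are laid out back to back in one
 * buffer, and the integral image is sized so that each of the opts.t
 * parallel dispatches gets its own slice. */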
int plane_heights[4];
int offsets_dispatched = 0;
VkDeviceSize weights_offs[4];
VkDeviceSize sums_offs[4];
uint32_t ws_stride[4];
size_t ws_total_size = 0;
VkImageMemoryBarrier2 img_bar[8];
VkBufferMemoryBarrier2 buf_bar[8];
int_stride = s->shd_weights.lg_size[0]*s->pl_weights_rows*TYPE_SIZE;
int_size = s->shd_weights.lg_size[0]*s->pl_weights_rows*int_stride;
for (int i = 0; i < desc->nb_components; i++) {
    plane_widths[i] = FFALIGN(plane_widths[i], s->shd_denoise.lg_size[0]);
    plane_heights[i] = FFALIGN(plane_heights[i], s->shd_denoise.lg_size[1]);
    ws_stride[i] = plane_widths[i];
    ws_size[i] = ws_stride[i] * plane_heights[i] * sizeof(float);
    ws_total_size += ws_size[i];
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
s->opts.t * int_size,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
VK_BUFFER_USAGE_TRANSFER_DST_BIT |
VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
sums_offs[0] = ws_total_size;
for (int i = 1; i < desc->nb_components; i++) {
    weights_offs[i] = weights_offs[i - 1] + ws_size[i - 1];
    sums_offs[i] = sums_offs[i - 1] + ws_size[i - 1];
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
VK_ACCESS_SHADER_READ_BIT,
VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
VK_QUEUE_FAMILY_IGNORED);
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
VK_ACCESS_SHADER_WRITE_BIT,
VK_IMAGE_LAYOUT_GENERAL,
VK_QUEUE_FAMILY_IGNORED);
buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
    .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
    .srcStageMask = ws_vk->stage,
    .dstStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT,
    .srcAccessMask = ws_vk->access,
    .dstAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT,
    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .buffer = ws_vk->buf,
buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
    .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
    .srcStageMask = integral_vk->stage,
    .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
    .srcAccessMask = integral_vk->access,
    .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                     VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .buffer = integral_vk->buf,
    .size = integral_vk->size,
vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
    .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
    .pImageMemoryBarriers = img_bar,
    .imageMemoryBarrierCount = nb_img_bar,
    .pBufferMemoryBarriers = buf_bar,
    .bufferMemoryBarrierCount = nb_buf_bar,
ws_vk->stage = buf_bar[0].dstStageMask;
ws_vk->access = buf_bar[0].dstAccessMask;
integral_vk->stage = buf_bar[1].dstStageMask;
integral_vk->access = buf_bar[1].dstAccessMask;
vk->CmdFillBuffer(exec->buf, ws_vk->buf, 0, ws_vk->size, 0x0);
buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
    .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
    .srcStageMask = ws_vk->stage,
    .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
    .srcAccessMask = ws_vk->access,
    .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                     VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .buffer = ws_vk->buf,
vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
    .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
    .pBufferMemoryBarriers = buf_bar,
    .bufferMemoryBarrierCount = nb_buf_bar,
ws_vk->stage = buf_bar[0].dstStageMask;
ws_vk->access = buf_bar[0].dstAccessMask;
VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
for (int i = 0; i < desc->nb_components; i++) {
    ws_vk, weights_offs[i], ws_size[i],
    VK_FORMAT_UNDEFINED));
    ws_vk, sums_offs[i], ws_size[i],
    VK_FORMAT_UNDEFINED));
VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
VK_IMAGE_LAYOUT_GENERAL, s->sampler);
for (int i = 0; i < desc->nb_components; i++) {
    ws_vk, weights_offs[i], ws_size[i],
    VK_FORMAT_UNDEFINED));
    ws_vk, sums_offs[i], ws_size[i],
    VK_FORMAT_UNDEFINED));
HorizontalPushData pd = {
    { plane_widths[0], plane_widths[1], plane_widths[2], plane_widths[3] },
    { plane_heights[0], plane_heights[1], plane_heights[2], plane_heights[3] },
    { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
    { s->patch[0], s->patch[1], s->patch[2], s->patch[3] },
    { s->strength[0], s->strength[1], s->strength[2], s->strength[3], },
(uint64_t)int_stride,
VK_SHADER_STAGE_COMPUTE_BIT,
if (offsets_dispatched) {
    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
        .srcStageMask = integral_vk->stage,
        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
        .srcAccessMask = integral_vk->access,
        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                         VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = integral_vk->buf,
        .size = integral_vk->size,
    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
        .pBufferMemoryBarriers = buf_bar,
        .bufferMemoryBarrierCount = nb_buf_bar,
    integral_vk->stage = buf_bar[0].dstStageMask;
    integral_vk->access = buf_bar[0].dstAccessMask;
wg_invoc = FFMIN(wg_invoc, vkctx->props.properties.limits.maxComputeWorkGroupCount[2]);
vk->CmdDispatch(exec->buf, 1, 1, wg_invoc);
} while (offsets_dispatched < s->nb_offsets);
RET(denoise_pass(s, exec, ws_vk, ws_stride));
#define OFFSET(x) offsetof(NLMeansVulkanContext, x)
#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
static const AVOption nlmeans_vulkan_options[] = {
static const AVFilterPad nlmeans_vulkan_inputs[] = {
    .filter_frame = &nlmeans_vulkan_filter_frame,
static const AVFilterPad nlmeans_vulkan_outputs[] = {
.name = "nlmeans_vulkan",
.uninit = &nlmeans_vulkan_uninit,
.priv_class = &nlmeans_vulkan_class,