#define TYPE_NAME "vec4"
#define TYPE_SIZE (TYPE_ELEMS*4)
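/* TYPE_ELEMS is defined nearby (not shown in this excerpt); given TYPE_NAME
 * "vec4" it is presumably 4, so TYPE_SIZE is 16 bytes and the shaders below
 * evaluate four research-window offsets per invocation (hence the
 * vec4-valued s2, w and offs arrays). */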
             ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
    GLSLF(4, s2[0] = texture(input_img[%i], pos + offs[0] + ivec2(%i + %s, %i + %s))[%i];
             ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
    GLSLF(4, s2[1] = texture(input_img[%i], pos + offs[1] + ivec2(%i + %s, %i + %s))[%i];
             ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
    GLSLF(4, s2[2] = texture(input_img[%i], pos + offs[2] + ivec2(%i + %s, %i + %s))[%i];
             ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
    GLSLF(4, s2[3] = texture(input_img[%i], pos + offs[3] + ivec2(%i + %s, %i + %s))[%i];
             ,plane, horiz ? r : 0, horiz ? off : "0", !horiz ? r : 0, !horiz ? off : "0", comp);
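    /* insert_first() emits GLSL that samples the centre pixel (s1, whose
     * GLSLF line precedes this fragment in the full source; its argument
     * list survives above) and the four offset-shifted samples s2[0..3],
     * one per packed offset. The horiz flag selects whether the row index r
     * and the textual offset off shift the x or the y coordinate. */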
    GLSLF(1, pos.y = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
    GLSLC(1, barrier(); );
    GLSLC(2, #pragma unroll(1) );
    GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
    GLSLC(3, prefix_sum = DTYPE(0); );
    GLSLC(3, offset = int_stride * uint64_t(pos.y + r); );
    GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
    GLSLF(3, for (pos.x = 0; pos.x < width[%i]; pos.x++) { ,plane);
    insert_first(shd, 0, "r", 0, plane, comp);
    GLSLC(4, s2 = dst.v[pos.x]; );
    GLSLC(4, dst.v[pos.x] = s2 + prefix_sum; );
    GLSLC(4, prefix_sum += s2; );
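    /* In-place inclusive prefix sum along a row: after the loop,
     * dst.v[x] holds the sum of the original entries 0..x, e.g.
     * {a, b, c} -> {a, a+b, a+b+c}. */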
    GLSLF(1, pos.x = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
    GLSLC(1, #pragma unroll(1) );
    GLSLF(1, for (r = 0; r < %i; r++) ,nb_rows);
    GLSLC(2, psum[r] = DTYPE(0); );
    GLSLC(1, barrier(); );
    GLSLF(2, for (pos.y = 0; pos.y < height[%i]; pos.y++) { ,plane);
    GLSLC(3, offset = int_stride * uint64_t(pos.y); );
    GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
    GLSLC(3, #pragma unroll(1) );
    GLSLF(3, for (r = 0; r < %i; r++) { ,nb_rows);
    insert_first(shd, 0, "r", 1, plane, comp);
    GLSLC(4, s2 = dst.v[pos.x + r]; );
    GLSLC(4, dst.v[pos.x + r] = s2 + psum[r]; );
    GLSLC(4, psum[r] += s2; );
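    /* Vertical counterpart: psum[r] keeps a running column sum for each of
     * the nb_rows columns handled by one invocation; after both passes the
     * buffer is a summed-area table of the squared patch differences. */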
                                int t, int dst_comp, int plane, int comp)
    GLSLF(1, p = patch_size[%i]; ,dst_comp);
    GLSLC(1, barrier(); );
    GLSLF(2, if (gl_GlobalInvocationID.x*%i >= width[%i]) ,nb_rows, plane);
    GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
    GLSLF(3, pos.x = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
    GLSLF(1, for (pos.x = 0; pos.x < width[%i]; pos.x++) { ,plane);
    GLSLF(2, if (gl_GlobalInvocationID.x*%i >= height[%i]) ,nb_rows, plane);
    GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
    GLSLF(3, pos.y = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
    GLSLC(3, a = DTYPE(0); );
    GLSLC(3, b = DTYPE(0); );
    GLSLC(3, c = DTYPE(0); );
    GLSLC(3, d = DTYPE(0); );
    GLSLC(3, lt = ((pos.x - p) < 0) || ((pos.y - p) < 0); );
    GLSLF(3, src[0] = texture(input_img[%i], pos + offs[0])[%i]; ,plane, comp);
    GLSLF(3, src[1] = texture(input_img[%i], pos + offs[1])[%i]; ,plane, comp);
    GLSLF(3, src[2] = texture(input_img[%i], pos + offs[2])[%i]; ,plane, comp);
    GLSLF(3, src[3] = texture(input_img[%i], pos + offs[3])[%i]; ,plane, comp);
    GLSLC(3, if (lt == false) { );
    GLSLC(3, offset = int_stride * uint64_t(pos.y - p); );
    GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
    GLSLC(4, a = dst.v[pos.x - p]; );
    GLSLC(4, c = dst.v[pos.x + p]; );
    GLSLC(3, offset = int_stride * uint64_t(pos.y + p); );
    GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
    GLSLC(4, b = dst.v[pos.x - p]; );
    GLSLC(4, d = dst.v[pos.x + p]; );
    GLSLC(3, patch_diff = d + a - b - c; );
    GLSLF(3, w = exp(patch_diff * strength[%i]); ,dst_comp);
    GLSLC(3, w_sum = w[0] + w[1] + w[2] + w[3]; );
    GLSLC(3, sum = dot(w, src*255); );
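    /* Summed-area-table box sum: with corners a = (x-p, y-p), b = (x-p, y+p),
     * c = (x+p, y-p) and d = (x+p, y+p), the patch-difference total over the
     * 2p x 2p window is d + a - b - c, and the NL-means weight follows as
     * w = exp(patch_diff * strength), vectorized over the four offsets. */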
    GLSLF(3, atomicAdd(weights_%i[pos.y*ws_stride[%i] + pos.x], w_sum); ,dst_comp, dst_comp);
    GLSLF(3, atomicAdd(sums_%i[pos.y*ws_stride[%i] + pos.x], sum); ,dst_comp, dst_comp);
    GLSLF(3, weights_%i[pos.y*ws_stride[%i] + pos.x] += w_sum; ,dst_comp, dst_comp);
    GLSLF(3, sums_%i[pos.y*ws_stride[%i] + pos.x] += sum; ,dst_comp, dst_comp);
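    /* The atomicAdd variants are presumably emitted when t > 1 (the branch
     * itself is elided here): parallel dispatches may accumulate into the
     * same weights/sums texel, so they need GL_EXT_shader_atomic_float;
     * the plain += forms serve the single-dispatch case. */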
typedef struct HorizontalPushData {
    uint32_t ws_stride[4];
    VkDeviceAddress integral_base;
    uint64_t integral_size;
    uint32_t xyoffs_start;
} HorizontalPushData;
    void *spv_opaque = NULL;
    uint32_t max_wg = vkctx->props.properties.limits.maxComputeWorkGroupSize[0];
    int wg_size, wg_rows;
    if (max_wg > max_dim) {
    } else if (max_wg < max_dim) {
        while (wg_size*wg_rows < max_dim)
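    /* Workgroup sizing: if the image dimension exceeds the device's
     * maxComputeWorkGroupSize[0], each invocation covers wg_rows rows (or
     * columns) so that wg_size*wg_rows still spans the image; the elided
     * branch bodies presumably set wg_size and bump wg_rows. */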
    GLSLC(0, #extension GL_EXT_shader_atomic_float : require );
    GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require );
    GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) buffer DataBuffer { );
    GLSLC(1, DTYPE v[]; );
    GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
    GLSLC(1, uvec4 ws_stride; );
    GLSLC(1, ivec4 patch_size; );
    GLSLC(1, vec4 strength; );
    GLSLC(1, DataBuffer integral_base; );
    GLSLC(1, uint64_t integral_size; );
    GLSLC(1, uint64_t int_stride; );
    GLSLC(1, uint xyoffs_start; );
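    /* This push-constant block must stay layout-compatible (std430) with the
     * HorizontalPushData struct above, which is what gets pushed when the
     * command buffer is recorded. */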
        .type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .name = "weights_buffer_0",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float weights_0[];",
        .name = "sums_buffer_0",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float sums_0[];",
        .name = "weights_buffer_1",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float weights_1[];",
        .name = "sums_buffer_1",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float sums_1[];",
        .name = "weights_buffer_2",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float weights_2[];",
        .name = "sums_buffer_2",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float sums_2[];",
        .name = "weights_buffer_3",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float weights_3[];",
        .name = "sums_buffer_3",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float sums_3[];",
        .name = "xyoffsets_buffer",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali = "readonly",
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "ivec2 xyoffsets[];",
    GLSLC(1, DataBuffer dst; );
    GLSLC(1, DTYPE prefix_sum; );
    GLSLF(1, DTYPE psum[%i]; ,*nb_rows);
    GLSLC(1, DataBuffer integral_data; );
    GLSLC(1, int invoc_idx = int(gl_WorkGroupID.z); );
    GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) + offset); );
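    /* Each z-layer of a dispatch (invoc_idx) gets its own slice of the
     * integral buffer; offset is presumably invoc_idx * integral_size
     * (computed on an elided line), turned into a typed pointer via
     * buffer_reference arithmetic. */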
    GLSLC(1, DTYPE patch_diff; );
    GLSLC(1, float w_sum; );
    GLSLC(1, float sum; );
    for (int i = 0; i < desc->nb_components; i++) {
    RET(spv->compile_shader(spv, vkctx, shd, &spv_data, &spv_len, "main", &spv_opaque));
typedef struct DenoisePushData {
    uint32_t ws_stride[4];
} DenoisePushData;
    void *spv_opaque = NULL;
                      VK_SHADER_STAGE_COMPUTE_BIT, 0));
    GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
    GLSLC(1, uvec4 ws_stride; );
        .type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .name = "output_img",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
        .mem_quali = "writeonly",
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .name = "weights_buffer_0",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali = "readonly",
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float weights_0[];",
        .name = "sums_buffer_0",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali = "readonly",
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float sums_0[];",
        .name = "weights_buffer_1",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali = "readonly",
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float weights_1[];",
        .name = "sums_buffer_1",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali = "readonly",
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float sums_1[];",
        .name = "weights_buffer_2",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali = "readonly",
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float weights_2[];",
        .name = "sums_buffer_2",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali = "readonly",
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float sums_2[];",
        .name = "weights_buffer_3",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali = "readonly",
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float weights_3[];",
        .name = "sums_buffer_3",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali = "readonly",
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float sums_3[];",
    GLSLC(1, const ivec2 pos = ivec2(gl_GlobalInvocationID.xy); );
    GLSLC(1, const uint plane = uint(gl_WorkGroupID.z); );
    GLSLC(1, float w_sum; );
    GLSLC(1, float sum; );
    GLSLC(1, size = imageSize(output_img[plane]); );
    for (int c = 0; c < desc->nb_components; c++) {
        GLSLF(2, r[%i] = (sum + src[%i]*255) / (1.0 + w_sum) / 255; ,off, off);
    GLSLC(1, imageStore(output_img[plane], pos, r); );
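    /* NL-means normalization: the output is the weighted neighbour sum plus
     * the centre pixel at implicit weight 1, divided by (1 + w_sum); the
     * *255 / 255 round-trip matches the [0,255] scale on which the weights
     * were accumulated. */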
    RET(spv->compile_shader(spv, vkctx, shd, &spv_data, &spv_len, "main", &spv_opaque));
    int xcnt = 0, ycnt = 0;
    int offsets_dispatched = 0, nb_dispatches = 0;
    if (!(s->opts.r & 1)) {
    if (!(s->opts.p & 1)) {
    for (int i = 0; i < 4; i++) {
        double str = (s->opts.sc[i] > 1.0) ? s->opts.sc[i] : s->opts.s;
        int ps = (s->opts.pc[i] ? s->opts.pc[i] : s->opts.p);
        str = 255.0*255.0 / str;
        s->strength[i] = str;
        s->patch[i] = ps / 2;
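    /* Per-component strength (sc) and patch size (pc) options override the
     * global s and p; the odd-radius adjustments above have elided bodies.
     * strength is pre-scaled to the [0,255] sample range so the shader can
     * evaluate exp(patch_diff * strength) directly, and patch stores the
     * half-size used for the integral-image corner reads. */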
    s->nb_offsets = (2*rad + 1)*(2*rad + 1) - 1;
    s->xoffsets = av_malloc(s->nb_offsets*sizeof(*s->xoffsets));
    s->yoffsets = av_malloc(s->nb_offsets*sizeof(*s->yoffsets));
    for (int x = -rad; x <= rad; x++) {
        for (int y = -rad; y <= rad; y++) {
            s->xoffsets[xcnt++] = x;
            s->yoffsets[ycnt++] = y;
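    /* Enumerates every displacement of the (2*rad + 1)^2 research window
     * except (0,0), hence the "- 1" in nb_offsets; the centre pixel is
     * folded in by the denoise pass with weight 1 (the x == 0 && y == 0
     * skip is elided here). */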
        VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT |
        VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
        VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
        VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT));
    for (int i = 0; i < 2*s->nb_offsets; i += 2) {
        offsets_buf[i + 0] = s->xoffsets[i >> 1];
        offsets_buf[i + 1] = s->yoffsets[i >> 1];
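    /* Interleaves the two offset arrays into (x, y) pairs, matching the
     * ivec2 xyoffsets[] layout of the read-only SSBO declared earlier. */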
636 "disabling dispatch parallelism\n");
640 spv = ff_vk_spirv_init();
    RET(init_weights_pipeline(vkctx, &s->e, &s->pl_weights, &s->shd_weights, s->sampler,
                              spv, s->vkctx.output_width, s->vkctx.output_height,
    RET(init_denoise_pipeline(vkctx, &s->e, &s->pl_denoise, &s->shd_denoise, s->sampler,
                              s->xyoffsets_buf.address, s->xyoffsets_buf.size,
                              VK_FORMAT_UNDEFINED));
        int wg_invoc = FFMIN((s->nb_offsets - offsets_dispatched)/TYPE_ELEMS, s->opts.t);
        wg_invoc = FFMIN(wg_invoc, vkctx->props.properties.limits.maxComputeWorkGroupCount[2]);
    } while (offsets_dispatched < s->nb_offsets);
           s->nb_offsets, nb_dispatches);
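    /* Each dispatch covers TYPE_ELEMS offsets per z-layer and at most opts.t
     * layers (clamped to the device workgroup-count limit); this loop only
     * counts how many dispatches a full research window will need. */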
    VkBufferMemoryBarrier2 buf_bar[8];
                           0, sizeof(DenoisePushData), &(DenoisePushData) {
                               { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
        .srcStageMask = ws_vk->stage,
        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
        .srcAccessMask = ws_vk->access,
        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = ws_vk->buf,
    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
        .pBufferMemoryBarriers = buf_bar,
        .bufferMemoryBarrierCount = nb_buf_bar,
    ws_vk->stage = buf_bar[0].dstStageMask;
    ws_vk->access = buf_bar[0].dstAccessMask;
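    /* Makes the weights pass's SSBO writes visible to the denoise shader's
     * reads, then records the buffer's new stage/access so the next barrier
     * can use them as its source scope. */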
    vk->CmdDispatch(exec->buf,
    int plane_heights[4];
    int offsets_dispatched = 0;
    VkDeviceAddress weights_addr[4];
    VkDeviceAddress sums_addr[4];
    uint32_t ws_stride[4];
    size_t ws_total_size = 0;
    VkImageMemoryBarrier2 img_bar[8];
    VkBufferMemoryBarrier2 buf_bar[8];
    int_stride = s->pl_weights.wg_size[0]*s->pl_weights_rows*TYPE_SIZE;
    int_size = s->pl_weights.wg_size[0]*s->pl_weights_rows*int_stride;
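    /* The integral image is square, wg_size[0]*pl_weights_rows texels per
     * side at TYPE_SIZE bytes each; the buffer below is allocated at
     * opts.t * int_size so every parallel dispatch owns a private copy. */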
    for (int i = 0; i < desc->nb_components; i++) {
        plane_widths[i] = FFALIGN(plane_widths[i], s->pl_denoise.wg_size[0]);
        plane_heights[i] = FFALIGN(plane_heights[i], s->pl_denoise.wg_size[1]);
        ws_stride[i] = plane_widths[i];
        ws_size[i] = ws_stride[i] * plane_heights[i] * sizeof(float);
        ws_total_size += ws_size[i];
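    /* Plane sizes are padded to denoise-workgroup multiples; each plane then
     * gets a float weights array and a float sums array of stride * height
     * entries. */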
        VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
        VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
        s->opts.t * int_size,
        VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
        VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
        VK_BUFFER_USAGE_TRANSFER_DST_BIT |
        VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
        VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
    weights_addr[0] = ws_vk->address;
    sums_addr[0] = ws_vk->address + ws_total_size;
    for (int i = 1; i < desc->nb_components; i++) {
        weights_addr[i] = weights_addr[i - 1] + ws_size[i - 1];
        sums_addr[i] = sums_addr[i - 1] + ws_size[i - 1];
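    /* A single device-local buffer is carved up by device address: all
     * per-plane weights arrays first, then all sums arrays starting
     * ws_total_size bytes in. */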
        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
        VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
        VK_ACCESS_SHADER_READ_BIT,
        VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
        VK_QUEUE_FAMILY_IGNORED);
        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
        VK_ACCESS_SHADER_WRITE_BIT,
        VK_IMAGE_LAYOUT_GENERAL,
        VK_QUEUE_FAMILY_IGNORED);
    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
        .srcStageMask = ws_vk->stage,
        .dstStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT,
        .srcAccessMask = ws_vk->access,
        .dstAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = ws_vk->buf,
    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
        .srcStageMask = integral_vk->stage,
        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
        .srcAccessMask = integral_vk->access,
        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                         VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = integral_vk->buf,
        .size = integral_vk->size,
    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
        .pImageMemoryBarriers = img_bar,
        .imageMemoryBarrierCount = nb_img_bar,
        .pBufferMemoryBarriers = buf_bar,
        .bufferMemoryBarrierCount = nb_buf_bar,
    ws_vk->stage = buf_bar[0].dstStageMask;
    ws_vk->access = buf_bar[0].dstAccessMask;
    integral_vk->stage = buf_bar[1].dstStageMask;
    integral_vk->access = buf_bar[1].dstAccessMask;
    vk->CmdFillBuffer(exec->buf, ws_vk->buf, 0, ws_vk->size, 0x0);
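    /* weights/sums must start zeroed since the weights pass only ever
     * accumulates into them (+= or atomicAdd); the barrier above moved the
     * buffer into the transfer stage so this fill is ordered correctly. */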
    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
        .srcStageMask = ws_vk->stage,
        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
        .srcAccessMask = ws_vk->access,
        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                         VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = ws_vk->buf,
    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
        .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
        .pBufferMemoryBarriers = buf_bar,
        .bufferMemoryBarrierCount = nb_buf_bar,
    ws_vk->stage = buf_bar[0].dstStageMask;
    ws_vk->access = buf_bar[0].dstAccessMask;
        VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
    for (int i = 0; i < desc->nb_components; i++) {
            weights_addr[i], ws_size[i],
            VK_FORMAT_UNDEFINED));
            sums_addr[i], ws_size[i],
            VK_FORMAT_UNDEFINED));
        VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
        VK_IMAGE_LAYOUT_GENERAL, s->sampler);
    for (int i = 0; i < desc->nb_components; i++) {
            weights_addr[i], ws_size[i],
            VK_FORMAT_UNDEFINED));
            sums_addr[i], ws_size[i],
            VK_FORMAT_UNDEFINED));
    HorizontalPushData pd = {
        { plane_widths[0], plane_widths[1], plane_widths[2], plane_widths[3] },
        { plane_heights[0], plane_heights[1], plane_heights[2], plane_heights[3] },
        { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
        { s->patch[0], s->patch[1], s->patch[2], s->patch[3] },
        { s->strength[0], s->strength[1], s->strength[2], s->strength[3], },
        (uint64_t)int_stride,
    if (offsets_dispatched) {
        buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
            .srcStageMask = integral_vk->stage,
            .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
            .srcAccessMask = integral_vk->access,
            .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                             VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
            .buffer = integral_vk->buf,
            .size = integral_vk->size,
        vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
            .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
            .pBufferMemoryBarriers = buf_bar,
            .bufferMemoryBarrierCount = nb_buf_bar,
        integral_vk->stage = buf_bar[1].dstStageMask;
        integral_vk->access = buf_bar[1].dstAccessMask;
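        /* The integral buffer is reused between offset batches, so each
         * batch after the first must wait on the previous dispatch's reads
         * and writes before overwriting it. */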
                               0, sizeof(pd), &pd);
        wg_invoc = FFMIN(wg_invoc, vkctx->props.properties.limits.maxComputeWorkGroupCount[2]);
        vk->CmdDispatch(exec->buf, 1, 1, wg_invoc);
    } while (offsets_dispatched < s->nb_offsets);
    RET(denoise_pass(s, exec, ws_vk, ws_stride));
#define OFFSET(x) offsetof(NLMeansVulkanContext, x)
#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
static const AVOption nlmeans_vulkan_options[] = {
static const AVFilterPad nlmeans_vulkan_inputs[] = {
        .filter_frame = &nlmeans_vulkan_filter_frame,
static const AVFilterPad nlmeans_vulkan_outputs[] = {
    .name = "nlmeans_vulkan",
    .uninit = &nlmeans_vulkan_uninit,
    .priv_class = &nlmeans_vulkan_class,