/* GLSL type name used for the per-pixel accumulator in generated shaders
 * (one float per processed component; vec4 covers up to 4 components). */
#define TYPE_NAME "vec4"
/* Byte size of one TYPE_NAME element in the integral buffer:
 * TYPE_ELEMS components, 4 bytes (one float) each. */
#define TYPE_SIZE (TYPE_ELEMS*4)
74 ,plane, horiz ?
r : 0, horiz ? off :
"0", !horiz ?
r : 0, !horiz ? off :
"0",
comp);
76 GLSLF(4,
s2[0] = texture(input_img[%
i],
pos + offs[0] + ivec2(%
i + %
s, %
i + %
s))[%
i];
77 ,plane, horiz ?
r : 0, horiz ? off :
"0", !horiz ?
r : 0, !horiz ? off :
"0",
comp);
78 GLSLF(4,
s2[1] = texture(input_img[%
i],
pos + offs[1] + ivec2(%
i + %
s, %
i + %
s))[%
i];
79 ,plane, horiz ?
r : 0, horiz ? off :
"0", !horiz ?
r : 0, !horiz ? off :
"0",
comp);
80 GLSLF(4,
s2[2] = texture(input_img[%
i],
pos + offs[2] + ivec2(%
i + %
s, %
i + %
s))[%
i];
81 ,plane, horiz ?
r : 0, horiz ? off :
"0", !horiz ?
r : 0, !horiz ? off :
"0",
comp);
82 GLSLF(4,
s2[3] = texture(input_img[%
i],
pos + offs[3] + ivec2(%
i + %
s, %
i + %
s))[%
i];
83 ,plane, horiz ?
r : 0, horiz ? off :
"0", !horiz ?
r : 0, !horiz ? off :
"0",
comp);
90 GLSLF(1,
pos.y =
int(gl_GlobalInvocationID.x) * %
i; ,nb_rows);
92 GLSLC(1, barrier(); );
95 GLSLC(2, #pragma unroll(1) );
96 GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
97 GLSLC(3, prefix_sum = DTYPE(0); );
98 GLSLC(3, offset = int_stride * uint64_t(pos.y + r); );
99 GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
101 GLSLF(3, for (pos.x = 0; pos.x < width[%i]; pos.x++) { ,plane);
103 insert_first(shd, 0,
"r", 0, plane, comp);
105 GLSLC(4, s2 = dst.v[pos.x]; );
106 GLSLC(4, dst.v[pos.x] = s2 + prefix_sum; );
107 GLSLC(4, prefix_sum += s2; );
116 GLSLF(1,
pos.x =
int(gl_GlobalInvocationID.x) * %
i; ,nb_rows);
117 GLSLC(1, #pragma unroll(1) );
118 GLSLF(1,
for (
r = 0;
r < %
i;
r++) ,nb_rows);
119 GLSLC(2, psum[
r] = DTYPE(0); );
122 GLSLC(1, barrier(); );
125 GLSLF(2, for (pos.y = 0; pos.y < height[%i]; pos.y++) { ,plane);
126 GLSLC(3, offset = int_stride * uint64_t(pos.y); );
127 GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
129 GLSLC(3, #pragma unroll(1) );
130 GLSLF(3, for (r = 0; r < %i; r++) { ,nb_rows);
132 insert_first(shd, 0,
"r", 1, plane, comp);
134 GLSLC(4, s2 = dst.v[pos.x + r]; );
135 GLSLC(4, dst.v[pos.x + r] = s2 + psum[r]; );
136 GLSLC(4, psum[r] += s2; );
144 int t,
int dst_comp,
int plane,
int comp)
146 GLSLF(1, p = patch_size[%
i]; ,dst_comp);
148 GLSLC(1, barrier(); );
152 GLSLF(2, if (gl_GlobalInvocationID.x*%i >= width[%i]) ,nb_rows, plane);
154 GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
155 GLSLF(3, pos.x = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
157 GLSLF(1, for (pos.x = 0; pos.x < width[%i]; pos.x++) { ,plane);
158 GLSLF(2, if (gl_GlobalInvocationID.x*%i >= height[%i]) ,nb_rows, plane);
160 GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
161 GLSLF(3, pos.y = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
164 GLSLC(3, a = DTYPE(0); );
165 GLSLC(3, b = DTYPE(0); );
166 GLSLC(3, c = DTYPE(0); );
167 GLSLC(3, d = DTYPE(0); );
169 GLSLC(3, lt = ((pos.x - p) < 0) || ((pos.y - p) < 0); );
171 GLSLF(3, src[0] = texture(input_img[%i], pos + offs[0])[%i]; ,plane, comp);
172 GLSLF(3, src[1] = texture(input_img[%i], pos + offs[1])[%i]; ,plane, comp);
173 GLSLF(3, src[2] = texture(input_img[%i], pos + offs[2])[%i]; ,plane, comp);
174 GLSLF(3, src[3] = texture(input_img[%i], pos + offs[3])[%i]; ,plane, comp);
176 GLSLC(3, if (lt == false) { );
177 GLSLC(3, offset = int_stride * uint64_t(pos.y - p); );
178 GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
179 GLSLC(4, a = dst.v[pos.x - p]; );
180 GLSLC(4, c = dst.v[pos.x + p]; );
181 GLSLC(3, offset = int_stride * uint64_t(pos.y + p); );
182 GLSLC(3, dst = DataBuffer(uint64_t(integral_data) + offset); );
183 GLSLC(4, b = dst.v[pos.x - p]; );
184 GLSLC(4, d = dst.v[pos.x + p]; );
187 GLSLC(3, patch_diff = d + a - b - c; );
188 GLSLF(3, w = exp(patch_diff * strength[%i]); ,dst_comp);
189 GLSLC(3, w_sum = w[0] + w[1] + w[2] + w[3]; );
190 GLSLC(3, sum = dot(w, src*255); );
193 GLSLF(3, atomicAdd(weights_%i[pos.y*ws_stride[%i] + pos.x], w_sum); ,dst_comp, dst_comp);
194 GLSLF(3, atomicAdd(sums_%i[pos.y*ws_stride[%i] + pos.x], sum); ,dst_comp, dst_comp);
196 GLSLF(3, weights_%i[pos.y*ws_stride[%i] + pos.x] += w_sum; ,dst_comp, dst_comp);
197 GLSLF(3, sums_%i[pos.y*ws_stride[%i] + pos.x] += sum; ,dst_comp, dst_comp);
203 typedef struct HorizontalPushData {
206 uint32_t ws_stride[4];
209 VkDeviceAddress integral_base;
210 uint64_t integral_size;
212 uint32_t xyoffs_start;
213 } HorizontalPushData;
225 void *spv_opaque =
NULL;
228 uint32_t max_wg = vkctx->
props.properties.limits.maxComputeWorkGroupSize[0];
229 int wg_size, wg_rows;
235 if (max_wg > max_dim) {
237 }
else if (max_wg < max_dim) {
239 while (wg_size*wg_rows < max_dim)
248 GLSLC(0, #extension GL_EXT_shader_atomic_float : require );
249 GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require );
254 GLSLC(0,
layout(buffer_reference, buffer_reference_align = T_ALIGN)
buffer DataBuffer { );
255 GLSLC(1, DTYPE v[]; );
258 GLSLC(0,
layout(push_constant, std430) uniform pushConstants { );
261 GLSLC(1, uvec4 ws_stride; );
262 GLSLC(1, ivec4 patch_size; );
263 GLSLC(1, vec4 strength; );
264 GLSLC(1, DataBuffer integral_base; );
265 GLSLC(1, uint64_t integral_size; );
266 GLSLC(1, uint64_t int_stride; );
267 GLSLC(1, uint xyoffs_start; );
276 .type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
279 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
283 .name =
"weights_buffer_0",
284 .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
285 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
286 .buf_content =
"float weights_0[];",
289 .name =
"sums_buffer_0",
290 .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
291 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
292 .buf_content =
"float sums_0[];",
295 .name =
"weights_buffer_1",
296 .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
297 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
298 .buf_content =
"float weights_1[];",
301 .name =
"sums_buffer_1",
302 .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
303 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
304 .buf_content =
"float sums_1[];",
307 .name =
"weights_buffer_2",
308 .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
309 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
310 .buf_content =
"float weights_2[];",
313 .name =
"sums_buffer_2",
314 .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
315 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
316 .buf_content =
"float sums_2[];",
319 .name =
"weights_buffer_3",
320 .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
321 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
322 .buf_content =
"float weights_3[];",
325 .name =
"sums_buffer_3",
326 .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
327 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
328 .buf_content =
"float sums_3[];",
335 .
name =
"xyoffsets_buffer",
336 .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
337 .mem_quali =
"readonly",
338 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
339 .buf_content =
"ivec2 xyoffsets[];",
348 GLSLC(1, DataBuffer dst; );
351 GLSLC(1, DTYPE prefix_sum; );
352 GLSLF(1, DTYPE psum[%
i]; ,*nb_rows);
357 GLSLC(1, DataBuffer integral_data; );
360 GLSLC(1,
int invoc_idx =
int(gl_WorkGroupID.z); );
363 GLSLC(1, integral_data = DataBuffer(uint64_t(integral_base) +
offset); );
372 GLSLC(1, DTYPE patch_diff; );
380 GLSLC(1,
float w_sum; );
381 GLSLC(1,
float sum; );
387 for (
int i = 0;
i <
desc->nb_components;
i++) {
402 RET(spv->
compile_shader(spv, vkctx, shd, &spv_data, &spv_len,
"main", &spv_opaque));
415 typedef struct DenoisePushData {
416 uint32_t ws_stride[4];
427 void *spv_opaque =
NULL;
431 VK_SHADER_STAGE_COMPUTE_BIT, 0));
435 GLSLC(0,
layout(push_constant, std430) uniform pushConstants { );
436 GLSLC(1, uvec4 ws_stride; );
444 .type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
447 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
451 .name =
"output_img",
452 .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
454 .mem_quali =
"writeonly",
457 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
460 .name =
"weights_buffer_0",
461 .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
462 .mem_quali =
"readonly",
463 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
464 .buf_content =
"float weights_0[];",
467 .name =
"sums_buffer_0",
468 .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
469 .mem_quali =
"readonly",
470 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
471 .buf_content =
"float sums_0[];",
474 .name =
"weights_buffer_1",
475 .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
476 .mem_quali =
"readonly",
477 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
478 .buf_content =
"float weights_1[];",
481 .name =
"sums_buffer_1",
482 .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
483 .mem_quali =
"readonly",
484 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
485 .buf_content =
"float sums_1[];",
488 .name =
"weights_buffer_2",
489 .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
490 .mem_quali =
"readonly",
491 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
492 .buf_content =
"float weights_2[];",
495 .name =
"sums_buffer_2",
496 .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
497 .mem_quali =
"readonly",
498 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
499 .buf_content =
"float sums_2[];",
502 .name =
"weights_buffer_3",
503 .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
504 .mem_quali =
"readonly",
505 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
506 .buf_content =
"float weights_3[];",
509 .name =
"sums_buffer_3",
510 .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
511 .mem_quali =
"readonly",
512 .stages = VK_SHADER_STAGE_COMPUTE_BIT,
513 .buf_content =
"float sums_3[];",
521 GLSLC(1,
const ivec2
pos = ivec2(gl_GlobalInvocationID.xy); );
522 GLSLC(1,
const uint plane = uint(gl_WorkGroupID.z); );
524 GLSLC(1,
float w_sum; );
525 GLSLC(1,
float sum; );
529 GLSLC(1,
size = imageSize(output_img[plane]); );
535 for (
int c = 0;
c <
desc->nb_components;
c++) {
540 GLSLF(2,
r[%
i] = (sum +
src[%
i]*255) / (1.0 + w_sum) / 255; ,off, off);
544 GLSLC(1, imageStore(output_img[plane],
pos,
r); );
547 RET(spv->
compile_shader(spv, vkctx, shd, &spv_data, &spv_len,
"main", &spv_opaque));
563 int xcnt = 0, ycnt = 0;
569 int offsets_dispatched = 0, nb_dispatches = 0;
576 if (!(
s->opts.r & 1)) {
582 if (!(
s->opts.p & 1)) {
588 for (
int i = 0;
i < 4;
i++) {
589 double str = (
s->opts.sc[
i] > 1.0) ?
s->opts.sc[
i] :
s->opts.s;
590 int ps = (
s->opts.pc[
i] ?
s->opts.pc[
i] :
s->opts.p);
593 str = 255.0*255.0 / str;
594 s->strength[
i] = str;
600 s->patch[
i] = ps / 2;
604 s->nb_offsets = (2*rad + 1)*(2*rad + 1) - 1;
605 s->xoffsets =
av_malloc(
s->nb_offsets*
sizeof(*
s->xoffsets));
606 s->yoffsets =
av_malloc(
s->nb_offsets*
sizeof(*
s->yoffsets));
609 for (
int x = -rad; x <= rad; x++) {
610 for (
int y = -rad; y <= rad; y++) {
614 s->xoffsets[xcnt++] = x;
615 s->yoffsets[ycnt++] = y;
621 VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT |
622 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
623 VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
624 VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT));
627 for (
int i = 0;
i < 2*
s->nb_offsets;
i += 2) {
628 offsets_buf[
i + 0] =
s->xoffsets[
i >> 1];
629 offsets_buf[
i + 1] =
s->yoffsets[
i >> 1];
637 "disabling dispatch parallelism\n");
641 spv = ff_vk_spirv_init();
651 RET(init_weights_pipeline(vkctx, &
s->e, &
s->pl_weights, &
s->shd_weights,
s->sampler,
652 spv,
s->vkctx.output_width,
s->vkctx.output_height,
655 RET(init_denoise_pipeline(vkctx, &
s->e, &
s->pl_denoise, &
s->shd_denoise,
s->sampler,
659 s->xyoffsets_buf.address,
s->xyoffsets_buf.size,
660 VK_FORMAT_UNDEFINED));
663 int wg_invoc =
FFMIN((
s->nb_offsets - offsets_dispatched)/
TYPE_ELEMS,
s->opts.t);
664 wg_invoc =
FFMIN(wg_invoc, vkctx->
props.properties.limits.maxComputeWorkGroupCount[2]);
667 }
while (offsets_dispatched < s->nb_offsets);
670 s->nb_offsets, nb_dispatches);
686 VkBufferMemoryBarrier2 buf_bar[8];
694 0,
sizeof(DenoisePushData), &(DenoisePushData) {
695 { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
698 buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
699 .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
700 .srcStageMask = ws_vk->
stage,
701 .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
702 .srcAccessMask = ws_vk->
access,
703 .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
704 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
705 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
706 .buffer = ws_vk->
buf,
711 vk->CmdPipelineBarrier2(exec->
buf, &(VkDependencyInfo) {
712 .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
713 .pBufferMemoryBarriers = buf_bar,
714 .bufferMemoryBarrierCount = nb_buf_bar,
716 ws_vk->
stage = buf_bar[0].dstStageMask;
717 ws_vk->
access = buf_bar[0].dstAccessMask;
720 vk->CmdDispatch(exec->
buf,
740 int plane_heights[4];
742 int offsets_dispatched = 0;
753 VkDeviceAddress weights_addr[4];
754 VkDeviceAddress sums_addr[4];
755 uint32_t ws_stride[4];
757 size_t ws_total_size = 0;
762 VkImageMemoryBarrier2 img_bar[8];
764 VkBufferMemoryBarrier2 buf_bar[8];
775 int_stride =
s->pl_weights.wg_size[0]*
s->pl_weights_rows*
TYPE_SIZE;
776 int_size =
s->pl_weights.wg_size[0]*
s->pl_weights_rows*int_stride;
779 for (
int i = 0;
i <
desc->nb_components;
i++) {
782 plane_widths[
i] =
FFALIGN(plane_widths[
i],
s->pl_denoise.wg_size[0]);
783 plane_heights[
i] =
FFALIGN(plane_heights[
i],
s->pl_denoise.wg_size[1]);
785 ws_stride[
i] = plane_widths[
i];
786 ws_size[
i] = ws_stride[
i] * plane_heights[
i] *
sizeof(
float);
787 ws_total_size += ws_size[
i];
792 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
793 VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
795 s->opts.t * int_size,
796 VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
802 VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
803 VK_BUFFER_USAGE_TRANSFER_DST_BIT |
804 VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
807 VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
812 weights_addr[0] = ws_vk->
address;
813 sums_addr[0] = ws_vk->
address + ws_total_size;
814 for (
int i = 1;
i <
desc->nb_components;
i++) {
815 weights_addr[
i] = weights_addr[
i - 1] + ws_size[
i - 1];
816 sums_addr[
i] = sums_addr[
i - 1] + ws_size[
i - 1];
832 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
833 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
835 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
836 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
847 VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
850 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
851 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
852 VK_ACCESS_SHADER_READ_BIT,
853 VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
854 VK_QUEUE_FAMILY_IGNORED);
859 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
860 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
861 VK_ACCESS_SHADER_WRITE_BIT,
862 VK_IMAGE_LAYOUT_GENERAL,
863 VK_QUEUE_FAMILY_IGNORED);
866 buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
867 .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
868 .srcStageMask = ws_vk->
stage,
869 .dstStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT,
870 .srcAccessMask = ws_vk->
access,
871 .dstAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT,
872 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
873 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
874 .buffer = ws_vk->
buf,
878 buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
879 .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
880 .srcStageMask = integral_vk->
stage,
881 .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
882 .srcAccessMask = integral_vk->
access,
883 .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
884 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
885 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
886 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
887 .buffer = integral_vk->
buf,
888 .size = integral_vk->
size,
892 vk->CmdPipelineBarrier2(exec->
buf, &(VkDependencyInfo) {
893 .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
894 .pImageMemoryBarriers = img_bar,
895 .imageMemoryBarrierCount = nb_img_bar,
896 .pBufferMemoryBarriers = buf_bar,
897 .bufferMemoryBarrierCount = nb_buf_bar,
899 ws_vk->
stage = buf_bar[0].dstStageMask;
900 ws_vk->
access = buf_bar[0].dstAccessMask;
901 integral_vk->
stage = buf_bar[1].dstStageMask;
902 integral_vk->
access = buf_bar[1].dstAccessMask;
905 vk->CmdFillBuffer(exec->
buf, ws_vk->
buf, 0, ws_vk->
size, 0x0);
908 buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
909 .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
910 .srcStageMask = ws_vk->
stage,
911 .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
912 .srcAccessMask = ws_vk->
access,
913 .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
914 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
915 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
916 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
917 .buffer = ws_vk->
buf,
922 vk->CmdPipelineBarrier2(exec->
buf, &(VkDependencyInfo) {
923 .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
924 .pBufferMemoryBarriers = buf_bar,
925 .bufferMemoryBarrierCount = nb_buf_bar,
927 ws_vk->
stage = buf_bar[0].dstStageMask;
928 ws_vk->
access = buf_bar[0].dstAccessMask;
932 VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
934 for (
int i = 0;
i <
desc->nb_components;
i++) {
936 weights_addr[
i], ws_size[
i],
937 VK_FORMAT_UNDEFINED));
939 sums_addr[
i], ws_size[
i],
940 VK_FORMAT_UNDEFINED));
945 VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
948 VK_IMAGE_LAYOUT_GENERAL,
s->sampler);
949 for (
int i = 0;
i <
desc->nb_components;
i++) {
951 weights_addr[
i], ws_size[
i],
952 VK_FORMAT_UNDEFINED));
954 sums_addr[
i], ws_size[
i],
955 VK_FORMAT_UNDEFINED));
963 HorizontalPushData pd = {
964 { plane_widths[0], plane_widths[1], plane_widths[2], plane_widths[3] },
965 { plane_heights[0], plane_heights[1], plane_heights[2], plane_heights[3] },
966 { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
967 {
s->patch[0],
s->patch[1],
s->patch[2],
s->patch[3] },
968 {
s->strength[0],
s->strength[1],
s->strength[2],
s->strength[2], },
971 (uint64_t)int_stride,
975 if (offsets_dispatched) {
977 buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
978 .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
979 .srcStageMask = integral_vk->
stage,
980 .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
981 .srcAccessMask = integral_vk->
access,
982 .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
983 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
984 .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
985 .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
986 .buffer = integral_vk->
buf,
987 .size = integral_vk->
size,
991 vk->CmdPipelineBarrier2(exec->
buf, &(VkDependencyInfo) {
992 .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
993 .pBufferMemoryBarriers = buf_bar,
994 .bufferMemoryBarrierCount = nb_buf_bar,
996 integral_vk->
stage = buf_bar[1].dstStageMask;
997 integral_vk->
access = buf_bar[1].dstAccessMask;
1002 0,
sizeof(pd), &pd);
1005 wg_invoc =
FFMIN(wg_invoc, vkctx->
props.properties.limits.maxComputeWorkGroupCount[2]);
1008 vk->CmdDispatch(exec->
buf, 1, 1, wg_invoc);
1011 }
while (offsets_dispatched < s->nb_offsets);
1013 RET(denoise_pass(
s, exec, ws_vk, ws_stride));
/* Byte offset of an option field inside the filter's private context,
 * as required by the AVOption table below. */
#define OFFSET(x) offsetof(NLMeansVulkanContext, x)
/* Standard option flags: settable per-stream for video filtering. */
#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
1064 static const AVOption nlmeans_vulkan_options[] = {
1085 static const AVFilterPad nlmeans_vulkan_inputs[] = {
1089 .filter_frame = &nlmeans_vulkan_filter_frame,
1094 static const AVFilterPad nlmeans_vulkan_outputs[] = {
1103 .
name =
"nlmeans_vulkan",
1107 .
uninit = &nlmeans_vulkan_uninit,
1111 .priv_class = &nlmeans_vulkan_class,