#define TYPE_NAME "vec4"
#define TYPE_SIZE (TYPE_ELEMS*4)
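/* Element type used for the integral image: TYPE_NAME is the GLSL type and
 * TYPE_SIZE its size in bytes (TYPE_ELEMS components of 4 bytes each). */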
    GLSLF(2, s1 = texture(input_img[%i], ivec2(x + %i, y + %i))[%i];
             ,plane, horiz ? r : 0, !horiz ? r : 0, comp);
    GLSLF(2, s2[0] = texture(input_img[%i], ivec2(x + %i + xoffs[0], y + %i + yoffs[0]))[%i];
             ,plane, horiz ? r : 0, !horiz ? r : 0, comp);
    GLSLF(2, s2[1] = texture(input_img[%i], ivec2(x + %i + xoffs[1], y + %i + yoffs[1]))[%i];
             ,plane, horiz ? r : 0, !horiz ? r : 0, comp);
    GLSLF(2, s2[2] = texture(input_img[%i], ivec2(x + %i + xoffs[2], y + %i + yoffs[2]))[%i];
             ,plane, horiz ? r : 0, !horiz ? r : 0, comp);
    GLSLF(2, s2[3] = texture(input_img[%i], ivec2(x + %i + xoffs[3], y + %i + yoffs[3]))[%i];
             ,plane, horiz ? r : 0, !horiz ? r : 0, comp);
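    /* 16-lane path: one fetch per lane; lane i addresses component i % 4 of
     * vector i / 4 of s2. */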
    for (int i = 0; i < 16; i++) {
        GLSLF(2, s2[%i][%i] = texture(input_img[%i], ivec2(x + %i + xoffs[%i], y + %i + yoffs[%i]))[%i];
                 ,i / 4, i % 4, plane, horiz ? r : 0, i, !horiz ? r : 0, i, comp);
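
    /* Row pass over the integral image: dst points at the start of row y and
     * prefix_sum() scans it with a stride of 1 element. */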
    GLSLF(1, x = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
    GLSLC(1, controlBarrier(gl_ScopeWorkgroup, gl_ScopeWorkgroup,
                            gl_StorageSemanticsBuffer,
                            gl_SemanticsAcquireRelease |
                            gl_SemanticsMakeAvailable |
                            gl_SemanticsMakeVisible); );
    GLSLC(2, offset = uint64_t(int_stride)*y*T_ALIGN; );
    GLSLC(2, dst = DataBuffer(uint64_t(integral_data) + offset); );
    for (int r = 0; r < nb_rows; r++) {
    GLSLC(2, barrier(); );
    GLSLC(2, prefix_sum(dst, 1, dst, 1); );
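
    /* Column pass: dst now points at column x and prefix_sum() walks it with a
     * stride of int_stride elements, completing the 2D summed-area table. */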
    GLSLF(1, y = int(gl_GlobalInvocationID.x) * %i; ,nb_rows);
    GLSLC(1, controlBarrier(gl_ScopeWorkgroup, gl_ScopeWorkgroup,
                            gl_StorageSemanticsBuffer,
                            gl_SemanticsAcquireRelease |
                            gl_SemanticsMakeAvailable |
                            gl_SemanticsMakeVisible); );
    GLSLC(2, dst = DataBuffer(uint64_t(integral_data) + x*T_ALIGN); );
    for (int r = 0; r < nb_rows; r++) {
        GLSLF(2, integral_data.v[(y + %i)*int_stride + x] = s2; ,r);
    GLSLC(2, barrier(); );
    GLSLC(2, prefix_sum(dst, int_stride, dst, int_stride); );
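
    /* Weights pass: p is the patch radius for this component; patch differences
     * are read back from the integral image, turned into exp() weights and
     * accumulated into the per-component weights_/sums_ buffers. */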
                           int t, int dst_comp, int plane, int comp)
    GLSLF(1, p = patch_size[%i]; ,dst_comp);
    GLSLC(1, controlBarrier(gl_ScopeWorkgroup, gl_ScopeWorkgroup,
                            gl_StorageSemanticsBuffer,
                            gl_SemanticsAcquireRelease |
                            gl_SemanticsMakeAvailable |
                            gl_SemanticsMakeVisible); );
    GLSLC(1, barrier(); );
    GLSLF(2, if (gl_GlobalInvocationID.x*%i >= width[%i]) ,nb_rows, plane);
    GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
    GLSLF(3, x = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
    GLSLF(2, if (gl_GlobalInvocationID.x*%i >= height[%i]) ,nb_rows, plane);
    GLSLF(2, for (r = 0; r < %i; r++) { ,nb_rows);
    GLSLF(3, y = int(gl_GlobalInvocationID.x) * %i + r; ,nb_rows);
    GLSLC(3, lt = ((x - p) < 0) || ((y - p) < 0); );
    GLSLF(3, src[0] = texture(input_img[%i], ivec2(x + xoffs[0], y + yoffs[0]))[%i]; ,plane, comp);
    GLSLF(3, src[1] = texture(input_img[%i], ivec2(x + xoffs[1], y + yoffs[1]))[%i]; ,plane, comp);
    GLSLF(3, src[2] = texture(input_img[%i], ivec2(x + xoffs[2], y + yoffs[2]))[%i]; ,plane, comp);
    GLSLF(3, src[3] = texture(input_img[%i], ivec2(x + xoffs[3], y + yoffs[3]))[%i]; ,plane, comp);
    for (int i = 0; i < 16; i++)
        GLSLF(3, src[%i][%i] = texture(input_img[%i], ivec2(x + xoffs[%i], y + yoffs[%i]))[%i];
    GLSLC(3, if (lt == false) { );
    GLSLC(4,     a = integral_data.v[(y - p)*int_stride + x - p]; );
    GLSLC(4,     c = integral_data.v[(y - p)*int_stride + x + p]; );
    GLSLC(4,     b = integral_data.v[(y + p)*int_stride + x - p]; );
    GLSLC(4,     d = integral_data.v[(y + p)*int_stride + x + p]; );
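    /* Summed-area-table lookup: the four corner samples combine (a + d - b - c)
     * into the patch-difference sum over the (2p+1)x(2p+1) window around (x, y). */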
    GLSLF(3, w = exp(patch_diff * strength[%i]); ,dst_comp);
    GLSLC(3, w_sum = w[0] + w[1] + w[2] + w[3]; );
    for (int i = 0; i < 4; i++)
        GLSLF(3, w[%i] = exp(patch_diff[%i] * strength[%i]); ,i, i, dst_comp);
    for (int i = 0; i < 4; i++)
             ,!i ? "=" : "+=", i, i, i, i);
    for (int i = 0; i < 4; i++)
             ,!i ? "=" : "+=", i, i);
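    /* Accumulate into the per-component weight/sum planes. The atomicAdd() path
     * presumably depends on GL_EXT_shader_atomic_float; the plain += variant
     * looks like the fallback used when dispatch parallelism is disabled. */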
    GLSLF(3, atomicAdd(weights_%i[y*ws_stride[%i] + x], w_sum); ,dst_comp, dst_comp);
    GLSLF(3, atomicAdd(sums_%i[y*ws_stride[%i] + x], sum); ,dst_comp, dst_comp);
    GLSLF(3, weights_%i[y*ws_stride[%i] + x] += w_sum; ,dst_comp, dst_comp);
    GLSLF(3, sums_%i[y*ws_stride[%i] + x] += sum; ,dst_comp, dst_comp);
typedef struct HorizontalPushData {
    VkDeviceAddress integral_data;
    VkDeviceAddress state_data;
    uint32_t ws_stride[4];
} HorizontalPushData;
    void *spv_opaque = NULL;
    uint32_t max_wg = vkctx->props.properties.limits.maxComputeWorkGroupSize[0];
    int max_shm = vkctx->props.properties.limits.maxComputeSharedMemorySize;
    int wg_size, wg_rows;
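
    /* Round the maximum workgroup size down to a power of two. */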
    max_wg = 1 << (31 - ff_clz(max_wg));
    if (max_wg > max_dim) {
        wg_size = max_wg / (max_wg / max_dim);
    } else if (max_wg < max_dim) {
        while (wg_size*wg_rows < max_dim)
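
    /* Shader preamble: the generated GLSL relies on float atomics, 64-bit
     * integers (used for the uint64_t buffer-address arithmetic) and the
     * Vulkan memory model with explicit scope/semantics. */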
    GLSLC(0, #extension GL_EXT_shader_atomic_float : require );
    GLSLC(0, #extension GL_ARB_gpu_shader_int64 : require );
    GLSLC(0, #pragma use_vulkan_memory_model );
    GLSLC(0, #extension GL_KHR_memory_scope_semantics : enable );
    GLSLF(0, #define N_ROWS %i ,*nb_rows);
    GLSLC(0, #define WG_SIZE (gl_WorkGroupSize.x) );
    GLSLC(0, #define PARTITION_SIZE (N_ROWS*WG_SIZE) );
    GLSLC(0, layout(buffer_reference, buffer_reference_align = T_ALIGN) coherent buffer DataBuffer { );
    GLSLC(1, DTYPE v[]; );
    GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
    GLSLC(1, coherent DataBuffer integral_data; );
    GLSLC(1, uvec4 ws_stride; );
    GLSLC(1, ivec4 patch_size; );
    GLSLC(1, vec4 strength; );
    GLSLC(1, uint int_stride; );
        .type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .name = "weights_buffer_0",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float weights_0[];",
        .name = "sums_buffer_0",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float sums_0[];",
        .name = "weights_buffer_1",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float weights_1[];",
        .name = "sums_buffer_1",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float sums_1[];",
        .name = "weights_buffer_2",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float weights_2[];",
        .name = "sums_buffer_2",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float sums_2[];",
        .name = "weights_buffer_3",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float weights_3[];",
        .name = "sums_buffer_3",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float sums_3[];",
    GLSLC(1, DataBuffer dst; );
    GLSLC(1, DTYPE patch_diff; );
    GLSLC(1, float w_sum; );
    GLSLC(1, float sum; );
    for (int i = 0; i < desc->nb_components; i++) {
    RET(spv->compile_shader(spv, vkctx, shd, &spv_data, &spv_len,
                            "main", &spv_opaque));
typedef struct DenoisePushData {
    uint32_t ws_stride[4];

    void *spv_opaque = NULL;
                                   VK_SHADER_STAGE_COMPUTE_BIT, 0));
    GLSLC(0, layout(push_constant, std430) uniform pushConstants { );
    GLSLC(1, uvec4 ws_stride; );
        .type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .name = "output_img",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
        .mem_quali = "writeonly",
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .name = "weights_buffer_0",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali = "readonly",
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float weights_0[];",
        .name = "sums_buffer_0",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali = "readonly",
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float sums_0[];",
        .name = "weights_buffer_1",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali = "readonly",
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float weights_1[];",
        .name = "sums_buffer_1",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali = "readonly",
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float sums_1[];",
        .name = "weights_buffer_2",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali = "readonly",
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float weights_2[];",
        .name = "sums_buffer_2",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali = "readonly",
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float sums_2[];",
        .name = "weights_buffer_3",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali = "readonly",
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float weights_3[];",
        .name = "sums_buffer_3",
        .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
        .mem_quali = "readonly",
        .stages = VK_SHADER_STAGE_COMPUTE_BIT,
        .buf_content = "float sums_3[];",
    GLSLC(1, const ivec2 pos = ivec2(gl_GlobalInvocationID.xy); );
    GLSLC(1, float w_sum; );
    GLSLC(1, float sum; );
    for (int c = 0; c < desc->nb_components; c++) {
        if (desc->comp[c].plane == i) {
            GLSLF(1, r[%i] = (sum + src[%i]*255) / (1.0 + w_sum) / 255; ,off, off);
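    /* Final pixel: weighted average of the candidate pixels plus the source
     * pixel itself with weight 1; the sums appear to be accumulated on a
     * 0-255 scale, hence the *255 and /255 around the division. */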
    RET(spv->compile_shader(spv, vkctx, shd, &spv_data, &spv_len,
                            "main", &spv_opaque));
    int xcnt = 0, ycnt = 0;

    if (!(s->opts.r & 1)) {

    if (!(s->opts.p & 1)) {
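
    /* Per-component options: sc[i]/pc[i] override the global strength/patch
     * size when set; strength is mapped to a 255^2/s factor used by the exp()
     * weights, and the patch size is stored as a radius (half the size). */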
    for (int i = 0; i < 4; i++) {
        double str = (s->opts.sc[i] > 1.0) ? s->opts.sc[i] : s->opts.s;
        int ps = (s->opts.pc[i] ? s->opts.pc[i] : s->opts.p);

        str = 255.0*255.0 / str;
        s->strength[i] = str;

        s->patch[i] = ps / 2;
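
    /* (2*rad + 1)^2 - 1 research-window offsets; the (0, 0) offset is
     * presumably excluded inside the loop. */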
    s->nb_offsets = (2*rad + 1)*(2*rad + 1) - 1;
    s->xoffsets = av_malloc(s->nb_offsets*sizeof(*s->xoffsets));
    s->yoffsets = av_malloc(s->nb_offsets*sizeof(*s->yoffsets));

    for (int x = -rad; x <= rad; x++) {
        for (int y = -rad; y <= rad; y++) {
            s->xoffsets[xcnt++] = x;
            s->yoffsets[ycnt++] = y;

               "disabling dispatch parallelism\n");

    if (!vkctx->feats_12.vulkanMemoryModel) {

    spv = ff_vk_spirv_init();
    RET(init_weights_pipeline(vkctx, &s->e, &s->pl_weights, &s->shd_weights, s->sampler,
                              spv, s->vkctx.output_width, s->vkctx.output_height,

    RET(init_denoise_pipeline(vkctx, &s->e, &s->pl_denoise, &s->shd_denoise, s->sampler,
    VkBufferMemoryBarrier2 buf_bar[8];

                           0, sizeof(DenoisePushData), &(DenoisePushData) {
                               { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
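
    /* Make the weights/sums written by the weights pass visible to this
     * compute dispatch before it reads them. */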
    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
        .srcStageMask = ws_vk->stage,
        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
        .srcAccessMask = ws_vk->access,
        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = ws_vk->buf,

    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
            .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
            .pBufferMemoryBarriers = buf_bar,
            .bufferMemoryBarrierCount = nb_buf_bar,

    ws_vk->stage = buf_bar[0].dstStageMask;
    ws_vk->access = buf_bar[0].dstAccessMask;

    vk->CmdDispatch(exec->buf,
    int plane_heights[4];
    VkDeviceAddress weights_addr[4];
    VkDeviceAddress sums_addr[4];
    uint32_t ws_stride[4];
    size_t ws_total_size = 0;
    VkImageMemoryBarrier2 img_bar[8];
    VkBufferMemoryBarrier2 buf_bar[8];

    int_stride = s->pl_weights.wg_size[0]*s->pl_weights_rows;
    int_size = int_stride * int_stride * TYPE_SIZE;
    for (int i = 0; i < desc->nb_components; i++) {
        plane_widths[i] = FFALIGN(plane_widths[i], s->pl_denoise.wg_size[0]);
        plane_heights[i] = FFALIGN(plane_heights[i], s->pl_denoise.wg_size[1]);

        ws_stride[i] = plane_widths[i];
        ws_size[i] = ws_stride[i] * plane_heights[i] * sizeof(float);
        ws_total_size += ws_size[i];
                            VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
                            VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
                            s->opts.t * int_size,
                            VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);

                            VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
                            VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
                            s->opts.t * state_size,
                            VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);

                            VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
                            VK_BUFFER_USAGE_TRANSFER_DST_BIT |
                            VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
                            VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
    weights_addr[0] = ws_vk->address;
    sums_addr[0] = ws_vk->address + ws_total_size;
    for (int i = 1; i < desc->nb_components; i++) {
        weights_addr[i] = weights_addr[i - 1] + ws_size[i - 1];
        sums_addr[i] = sums_addr[i - 1] + ws_size[i - 1];
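
    /* All weight and sum planes are sub-allocated from one pooled buffer: the
     * per-component weights arrays come first, the sums follow after
     * ws_total_size bytes, and each plane's address is derived by offsetting. */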
                        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));
                        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT));

                        VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
                        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
                        VK_ACCESS_SHADER_READ_BIT,
                        VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
                        VK_QUEUE_FAMILY_IGNORED);

                        VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
                        VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
                        VK_ACCESS_SHADER_WRITE_BIT,
                        VK_IMAGE_LAYOUT_GENERAL,
                        VK_QUEUE_FAMILY_IGNORED);
    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
        .srcStageMask = ws_vk->stage,
        .dstStageMask = VK_PIPELINE_STAGE_2_TRANSFER_BIT,
        .srcAccessMask = ws_vk->access,
        .dstAccessMask = VK_ACCESS_2_TRANSFER_WRITE_BIT,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = ws_vk->buf,

    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
            .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
            .pImageMemoryBarriers = img_bar,
            .imageMemoryBarrierCount = nb_img_bar,
            .pBufferMemoryBarriers = buf_bar,
            .bufferMemoryBarrierCount = nb_buf_bar,

    ws_vk->stage = buf_bar[0].dstStageMask;
    ws_vk->access = buf_bar[0].dstAccessMask;
    vk->CmdFillBuffer(exec->buf, ws_vk->buf, 0, ws_vk->size, 0x0);
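    /* Zero the pooled weights/sums buffer before the weights pass starts
     * accumulating into it. */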
    buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
        .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
        .srcStageMask = ws_vk->stage,
        .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
        .srcAccessMask = ws_vk->access,
        .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                         VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .buffer = ws_vk->buf,

    vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
            .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
            .pBufferMemoryBarriers = buf_bar,
            .bufferMemoryBarrierCount = nb_buf_bar,

    ws_vk->stage = buf_bar[0].dstStageMask;
    ws_vk->access = buf_bar[0].dstAccessMask;
                                   VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,

    for (int i = 0; i < desc->nb_components; i++) {
                                    weights_addr[i], ws_size[i],
                                    VK_FORMAT_UNDEFINED));
                                    sums_addr[i], ws_size[i],
                                    VK_FORMAT_UNDEFINED));

                                   VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
                                   VK_IMAGE_LAYOUT_GENERAL, s->sampler);
    for (int i = 0; i < desc->nb_components; i++) {
                                    weights_addr[i], ws_size[i],
                                    VK_FORMAT_UNDEFINED));
                                    sums_addr[i], ws_size[i],
                                    VK_FORMAT_UNDEFINED));
        int *xoffs = s->xoffsets + i;
        int *yoffs = s->yoffsets + i;
        HorizontalPushData pd = {
            integral_vk->address + t_offset*int_size,
            state_vk->address + t_offset*state_size,
            { plane_widths[0], plane_widths[1], plane_widths[2], plane_widths[3] },
            { plane_heights[0], plane_heights[1], plane_heights[2], plane_heights[3] },
            { ws_stride[0], ws_stride[1], ws_stride[2], ws_stride[3] },
            { s->patch[0], s->patch[1], s->patch[2], s->patch[3] },
            { s->strength[0], s->strength[1], s->strength[2], s->strength[3], },
        memcpy(pd.xoffs, xoffs, sizeof(pd.xoffs));
        memcpy(pd.yoffs, yoffs, sizeof(pd.yoffs));
        buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
            .srcStageMask = integral_vk->stage,
            .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
            .srcAccessMask = integral_vk->access,
            .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                             VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
            .buffer = integral_vk->buf,
            .size = integral_vk->size,

        buf_bar[nb_buf_bar++] = (VkBufferMemoryBarrier2) {
            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2,
            .srcStageMask = state_vk->stage,
            .dstStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
            .srcAccessMask = state_vk->access,
            .dstAccessMask = VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
                             VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT,
            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
            .buffer = state_vk->buf,
            .size = state_vk->size,
        vk->CmdPipelineBarrier2(exec->buf, &(VkDependencyInfo) {
                .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
                .pBufferMemoryBarriers = buf_bar,
                .bufferMemoryBarrierCount = nb_buf_bar,

        integral_vk->stage = buf_bar[0].dstStageMask;
        integral_vk->access = buf_bar[0].dstAccessMask;
        state_vk->stage = buf_bar[1].dstStageMask;
        state_vk->access = buf_bar[1].dstAccessMask;
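
        /* Rotate through the s->opts.t copies of the integral/state buffers
         * (they were allocated with t copies above), presumably so consecutive
         * dispatches use separate scratch space. */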
        t_offset = (t_offset + 1) % s->opts.t;

                               0, sizeof(pd), &pd);

        vk->CmdDispatch(exec->buf, 1, 1, 1);

    RET(denoise_pass(s, exec, ws_vk, ws_stride));
#define OFFSET(x) offsetof(NLMeansVulkanContext, x)
#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
static const AVOption nlmeans_vulkan_options[] = {

static const AVFilterPad nlmeans_vulkan_inputs[] = {
        .filter_frame = &nlmeans_vulkan_filter_frame,

static const AVFilterPad nlmeans_vulkan_outputs[] = {

    .name = "nlmeans_vulkan",
    .uninit = &nlmeans_vulkan_uninit,
    .priv_class = &nlmeans_vulkan_class,