Blender V5.0
source/blender/compositor/algorithms/intern/convolve.cc
Go to the documentation of this file.
/* SPDX-FileCopyrightText: 2025 Blender Authors
 *
 * SPDX-License-Identifier: GPL-2.0-or-later */
4
#include <complex>
#include <numeric>

#include "BLI_array.hh"
#include "BLI_assert.h"
#include "BLI_enumerable_thread_specific.hh"
#include "BLI_fftw.hh"
#include "BLI_index_range.hh"
#include "BLI_memory_utils.hh"
#include "BLI_task.hh"

#if defined(WITH_FFTW3)
#  include <fftw3.h>
#endif

#include "COM_context.hh"
#include "COM_result.hh"
#include "COM_utilities.hh"

#include "GPU_texture.hh"
25
26namespace blender::compositor {
27
28void convolve(Context &context,
29 const Result &input,
30 const Result &kernel,
32 const bool normalize_kernel)
33{
34#if defined(WITH_FFTW3)
36 BLI_assert(kernel.type() == ResultType::Float || kernel.type() == ResultType::Color);
38
39 /* Since we will be doing a circular convolution, we need to zero pad the input image by the
40 * kernel size and vice versa to avoid the kernel affecting the pixels at the other side of
41 * image. The kernel size is limited by the image size since it will have no effect on the image
42 * during convolution. */
43 const int2 image_size = input.domain().size;
44 const int2 kernel_size = kernel.domain().size;
45 const int2 needed_padding_amount = math::max(kernel_size, image_size);
46 const int2 needed_spatial_size = image_size + needed_padding_amount - 1;
47 const int2 spatial_size = fftw::optimal_size_for_real_transform(needed_spatial_size);
48
49 /* The FFTW real to complex transforms utilizes the hermitian symmetry of real transforms and
50 * stores only half the output since the other half is redundant, so we only allocate half of
51 * the first dimension. See Section 4.3.4 Real-data DFT Array Format in the FFTW manual for
52 * more information. */
53 const int2 frequency_size = int2(spatial_size.x / 2 + 1, spatial_size.y);
54
55 constexpr int input_channels_count = 4;
56 const int64_t spatial_pixels_count = int64_t(spatial_size.x) * spatial_size.y;
57 const int64_t frequency_pixels_count = int64_t(frequency_size.x) * frequency_size.y;
58
59 /* A structure to gather all buffers that need to be forward transformed from the real to the
60 * frequency domain. */
61 struct ForwardTransformTask {
62 float *input;
63 std::complex<float> *output;
64 };
65 Vector<ForwardTransformTask> forward_transform_tasks;
66
67 /* Allocate a real buffer and a complex buffer for each of the input channels for the FFT input
68 * and output respectively, then add a forward transform task for it. */
69 Array<float *> image_spatial_domain_channels(input_channels_count);
70 Array<std::complex<float> *> image_frequency_domain_channels(input_channels_count);
71 for (const int channel : image_spatial_domain_channels.index_range()) {
72 image_spatial_domain_channels[channel] = fftwf_alloc_real(spatial_pixels_count);
73 image_frequency_domain_channels[channel] = reinterpret_cast<std::complex<float> *>(
74 fftwf_alloc_complex(frequency_pixels_count));
75 forward_transform_tasks.append(ForwardTransformTask{image_spatial_domain_channels[channel],
76 image_frequency_domain_channels[channel]});
77 }
78
79 BLI_SCOPED_DEFER([&]() {
80 for (const int channel : image_spatial_domain_channels.index_range()) {
81 fftwf_free(image_spatial_domain_channels[channel]);
82 fftwf_free(image_frequency_domain_channels[channel]);
83 }
84 });
85
86 const int kernel_channels_count = kernel.channels_count();
87 const bool is_color_kernel = kernel_channels_count == 4;
88
89 /* Allocate a real buffer and a complex buffer for each of the kernel channels for the FFT input
90 * and output respectively, then add a forward transform task for it. */
91 Array<float *> kernel_spatial_domain_channels(kernel_channels_count);
92 Array<std::complex<float> *> kernel_frequency_domain_channels(kernel_channels_count);
93 for (const int channel : kernel_spatial_domain_channels.index_range()) {
94 kernel_spatial_domain_channels[channel] = fftwf_alloc_real(spatial_pixels_count);
95 kernel_frequency_domain_channels[channel] = reinterpret_cast<std::complex<float> *>(
96 fftwf_alloc_complex(frequency_pixels_count));
97 forward_transform_tasks.append(ForwardTransformTask{
98 kernel_spatial_domain_channels[channel], kernel_frequency_domain_channels[channel]});
99 }
100
101 BLI_SCOPED_DEFER([&]() {
102 for (const int channel : kernel_spatial_domain_channels.index_range()) {
103 fftwf_free(kernel_spatial_domain_channels[channel]);
104 fftwf_free(kernel_frequency_domain_channels[channel]);
105 }
106 });
107
108 /* Create a real to complex and complex to real plans to transform the image to the frequency
109 * domain.
110 *
111 * Notice that FFTW provides an advanced interface as per Section 4.4.2 Advanced Real-data DFTs
112 * to transform all image channels simultaneously with interleaved pixel layouts. But profiling
113 * showed better performance when running a single plan in parallel for all image channels with a
114 * planner pixel format, so this is what we will be doing.
115 *
116 * The input and output buffers here are dummy buffers and still not initialized, because they
117 * are required by the planner internally for planning and their data will be overwritten. So
118 * make sure not to initialize the buffers before creating the plan. */
119 fftwf_plan forward_plan = fftwf_plan_dft_r2c_2d(
120 spatial_size.y,
121 spatial_size.x,
122 image_spatial_domain_channels[0],
123 reinterpret_cast<fftwf_complex *>(image_frequency_domain_channels[0]),
124 FFTW_ESTIMATE);
125 fftwf_plan backward_plan = fftwf_plan_dft_c2r_2d(
126 spatial_size.y,
127 spatial_size.x,
128 reinterpret_cast<fftwf_complex *>(image_frequency_domain_channels[0]),
129 image_spatial_domain_channels[0],
130 FFTW_ESTIMATE);
131
132 BLI_SCOPED_DEFER([&]() {
133 fftwf_destroy_plan(forward_plan);
134 fftwf_destroy_plan(backward_plan);
135 });
136
137 /* Download GPU results to CPU for GPU contexts. */
138 Result input_cpu = context.use_gpu() ? input.download_to_cpu() : input;
139 Result kernel_cpu = context.use_gpu() ? kernel.download_to_cpu() : kernel;
140
141 BLI_SCOPED_DEFER([&]() {
142 if (context.use_gpu()) {
143 input_cpu.release();
144 kernel_cpu.release();
145 }
146 });
147
148 /* Zero pad the image to the required spatial domain size, storing each channel in planar
149 * format for better cache locality, that is, RRRR...GGGG...BBBB...AAAA. */
150 threading::memory_bandwidth_bound_task(spatial_pixels_count * sizeof(float), [&]() {
151 parallel_for(spatial_size, [&](const int2 texel) {
152 const float4 pixel_color = input_cpu.load_pixel_zero<float4>(texel);
153 for (const int channel : IndexRange(input_channels_count)) {
154 float *buffer = image_spatial_domain_channels[channel];
155 const int64_t index = texel.y * int64_t(spatial_size.x) + texel.x;
156 buffer[index] = pixel_color[channel];
157 }
158 });
159 });
160
161 /* Use doubles to sum the kernel since floats are not stable with threaded summation. We always
162 * use a double4 even for float kernels for generality, in that case, only the first component
163 * is initialized. */
164 threading::EnumerableThreadSpecific<double4> sum_by_thread([]() { return double4(0.0); });
165
166 /* Compute the kernel while zero padding to match the spatial size. */
167 const int2 kernel_center = kernel_size / 2;
168 parallel_for(spatial_size, [&](const int2 texel) {
169 /* We offset the computed kernel with wrap around such that it is centered at the zero
170 * point, which is the expected format for doing circular convolutions in the frequency
171 * domain. */
172 const int2 centered_texel = kernel_center - texel;
173 const int2 wrapped_texel = int2(mod_i(centered_texel.x, spatial_size.x),
174 mod_i(centered_texel.y, spatial_size.y));
175
176 const float4 kernel_value = is_color_kernel ?
177 kernel_cpu.load_pixel_zero<float4>(wrapped_texel) :
178 float4(kernel_cpu.load_pixel_zero<float>(wrapped_texel));
179 for (const int channel : IndexRange(kernel_channels_count)) {
180 float *buffer = kernel_spatial_domain_channels[channel];
181 buffer[texel.x + texel.y * int64_t(spatial_size.x)] = kernel_value[channel];
182 }
183 sum_by_thread.local() += double4(kernel_value);
184 });
185
186 /* The computed kernel is not normalized and should be normalized, but instead of normalizing the
187 * kernel during computation, we normalize it in the frequency domain when convolving the kernel
188 * to the image since we will be doing sample normalization anyways. This is okay since the
189 * Fourier transform is linear. */
190 const float4 sum = float4(
191 std::accumulate(sum_by_thread.begin(), sum_by_thread.end(), double4(0.0)));
192 const float4 sanitized_sum = float4(sum[0] == 0.0f ? 1.0f : sum[0],
193 sum[1] == 0.0f ? 1.0f : sum[1],
194 sum[2] == 0.0f ? 1.0f : sum[2],
195 sum[3] == 0.0f ? 1.0f : sum[3]);
196 const float4 normalization_factor = normalize_kernel ? sanitized_sum : float4(1.0f);
197
198 /* Transform all necessary data from the real domain to the frequency domain. */
200 forward_transform_tasks.index_range(), 1, [&](const IndexRange sub_range) {
201 for (const int64_t i : sub_range) {
202 fftwf_execute_dft_r2c(
203 forward_plan,
204 forward_transform_tasks[i].input,
205 reinterpret_cast<fftwf_complex *>(forward_transform_tasks[i].output));
206 }
207 });
208
209 /* Multiply the kernel and the image in the frequency domain to perform the convolution. The
210 * FFT is not normalized, meaning the result of the FFT followed by an inverse FFT will result
211 * in an image that is scaled by a factor of the product of the width and height, so we take
212 * that into account by dividing by that scale. See Section 4.8.6 Multi-dimensional Transforms
213 * of the FFTW manual for more information. */
214 const float4 normalization_scale = float(spatial_size.x) * spatial_size.y * normalization_factor;
215 threading::parallel_for(IndexRange(frequency_size.y), 1, [&](const IndexRange sub_y_range) {
216 for (const int64_t channel : IndexRange(input_channels_count)) {
217 const int kernel_channel = is_color_kernel ? channel : 0;
218 std::complex<float> *image_buffer = image_frequency_domain_channels[channel];
219 const std::complex<float> *kernel_buffer = kernel_frequency_domain_channels[kernel_channel];
220 for (const int64_t y : sub_y_range) {
221 for (const int64_t x : IndexRange(frequency_size.x)) {
222 const int64_t index = x + y * int64_t(frequency_size.x);
223 image_buffer[index] *= kernel_buffer[index] / normalization_scale[kernel_channel];
224 }
225 }
226 }
227 });
228
229 /* Transform channels from the frequency domain to the real domain. */
230 threading::parallel_for(IndexRange(input_channels_count), 1, [&](const IndexRange sub_range) {
231 for (const int64_t channel : sub_range) {
232 fftwf_execute_dft_c2r(
233 backward_plan,
234 reinterpret_cast<fftwf_complex *>(image_frequency_domain_channels[channel]),
235 image_spatial_domain_channels[channel]);
236 }
237 });
238
239 Result output_cpu = context.create_result(input.type());
240 output_cpu.allocate_texture(input.domain(), true, ResultStorageType::CPU);
241
242 /* Copy the result to the output. */
243 threading::memory_bandwidth_bound_task(input.size_in_bytes(), [&]() {
244 parallel_for(image_size, [&](const int2 texel) {
245 float4 color = float4(0.0f);
246 for (const int channel : IndexRange(input_channels_count)) {
247 const int64_t index = texel.x + texel.y * int64_t(spatial_size.x);
248 color[channel] = image_spatial_domain_channels[channel][index];
249 }
250 output_cpu.store_pixel(texel, color);
251 });
252 });
253
254 if (context.use_gpu()) {
255 Result output_gpu = output_cpu.upload_to_gpu(true);
256 output.steal_data(output_gpu);
257 output_cpu.release();
258 }
259 else {
260 output.steal_data(output_cpu);
261 }
262#else
263 UNUSED_VARS(kernel, normalize_kernel);
264 output.allocate_texture(input.domain());
265 if (context.use_gpu()) {
267 }
268 else {
269 parallel_for(output.domain().size, [&](const int2 texel) {
270 output.store_pixel(texel, input.load_pixel<float4>(texel));
271 });
272 }
273#endif
274}
275
276} // namespace blender::compositor
#define BLI_assert(a)
Definition BLI_assert.h:46
MINLINE int mod_i(int i, int n)
#define BLI_SCOPED_DEFER(function_to_defer)
#define UNUSED_VARS(...)
void GPU_texture_copy(blender::gpu::Texture *dst, blender::gpu::Texture *src)
long long int int64_t
static T sum(const btAlignedObjectArray< T > &items)
IndexRange index_range() const
Definition BLI_array.hh:360
void append(const T &value)
IndexRange index_range() const
Result download_to_cpu() const
Definition result.cc:474
T load_pixel_zero(const int2 &texel) const
static ResultType type(blender::gpu::TextureFormat format)
Definition result.cc:261
const Domain & domain() const
int64_t channels_count() const
nullptr float
#define input
#define output
void convolve(Context &context, const Result &input, const Result &kernel, Result &output, const bool normalize_kernel)
void parallel_for(const int2 range, const Function &function)
int context(const bContext *C, const char *member, bContextDataResult *result)
int optimal_size_for_real_transform(int size)
Definition fftw.cc:59
T max(const T &a, const T &b)
void parallel_for(const IndexRange range, const int64_t grain_size, const Function &function, const TaskSizeHints &size_hints=detail::TaskSizeHints_Static(1))
Definition BLI_task.hh:93
void memory_bandwidth_bound_task(const int64_t approximate_bytes_touched, const Function &function)
Definition BLI_task.hh:265
VecBase< float, 4 > float4
VecBase< int32_t, 2 > int2
VecBase< double, 4 > double4