32 const bool normalize_kernel)
34#if defined(WITH_FFTW3)
43 const int2 image_size =
input.domain().size;
45 const int2 needed_padding_amount =
math::max(kernel_size, image_size);
46 const int2 needed_spatial_size = image_size + needed_padding_amount - 1;
53 const int2 frequency_size =
int2(spatial_size.x / 2 + 1, spatial_size.y);
55 constexpr int input_channels_count = 4;
56 const int64_t spatial_pixels_count =
int64_t(spatial_size.x) * spatial_size.y;
57 const int64_t frequency_pixels_count =
int64_t(frequency_size.x) * frequency_size.y;
61 struct ForwardTransformTask {
63 std::complex<float> *
output;
69 Array<float *> image_spatial_domain_channels(input_channels_count);
71 for (
const int channel : image_spatial_domain_channels.
index_range()) {
72 image_spatial_domain_channels[channel] = fftwf_alloc_real(spatial_pixels_count);
73 image_frequency_domain_channels[channel] =
reinterpret_cast<std::complex<float> *
>(
74 fftwf_alloc_complex(frequency_pixels_count));
75 forward_transform_tasks.
append(ForwardTransformTask{image_spatial_domain_channels[channel],
76 image_frequency_domain_channels[channel]});
80 for (
const int channel : image_spatial_domain_channels.
index_range()) {
81 fftwf_free(image_spatial_domain_channels[channel]);
82 fftwf_free(image_frequency_domain_channels[channel]);
87 const bool is_color_kernel = kernel_channels_count == 4;
91 Array<float *> kernel_spatial_domain_channels(kernel_channels_count);
93 for (
const int channel : kernel_spatial_domain_channels.
index_range()) {
94 kernel_spatial_domain_channels[channel] = fftwf_alloc_real(spatial_pixels_count);
95 kernel_frequency_domain_channels[channel] =
reinterpret_cast<std::complex<float> *
>(
96 fftwf_alloc_complex(frequency_pixels_count));
97 forward_transform_tasks.
append(ForwardTransformTask{
98 kernel_spatial_domain_channels[channel], kernel_frequency_domain_channels[channel]});
102 for (
const int channel : kernel_spatial_domain_channels.
index_range()) {
103 fftwf_free(kernel_spatial_domain_channels[channel]);
104 fftwf_free(kernel_frequency_domain_channels[channel]);
119 fftwf_plan forward_plan = fftwf_plan_dft_r2c_2d(
122 image_spatial_domain_channels[0],
123 reinterpret_cast<fftwf_complex *
>(image_frequency_domain_channels[0]),
125 fftwf_plan backward_plan = fftwf_plan_dft_c2r_2d(
128 reinterpret_cast<fftwf_complex *
>(image_frequency_domain_channels[0]),
129 image_spatial_domain_channels[0],
133 fftwf_destroy_plan(forward_plan);
134 fftwf_destroy_plan(backward_plan);
142 if (context.use_gpu()) {
152 const float4 pixel_color = input_cpu.load_pixel_zero<
float4>(texel);
153 for (
const int channel :
IndexRange(input_channels_count)) {
154 float *buffer = image_spatial_domain_channels[channel];
155 const int64_t index = texel.y *
int64_t(spatial_size.x) + texel.x;
156 buffer[index] = pixel_color[channel];
167 const int2 kernel_center = kernel_size / 2;
172 const int2 centered_texel = kernel_center - texel;
173 const int2 wrapped_texel =
int2(
mod_i(centered_texel.x, spatial_size.x),
174 mod_i(centered_texel.y, spatial_size.y));
176 const float4 kernel_value = is_color_kernel ?
179 for (
const int channel :
IndexRange(kernel_channels_count)) {
180 float *buffer = kernel_spatial_domain_channels[channel];
181 buffer[texel.x + texel.y *
int64_t(spatial_size.x)] = kernel_value[channel];
183 sum_by_thread.local() +=
double4(kernel_value);
191 std::accumulate(sum_by_thread.begin(), sum_by_thread.end(),
double4(0.0)));
193 sum[1] == 0.0f ? 1.0f :
sum[1],
194 sum[2] == 0.0f ? 1.0f :
sum[2],
195 sum[3] == 0.0f ? 1.0f :
sum[3]);
196 const float4 normalization_factor = normalize_kernel ? sanitized_sum :
float4(1.0f);
201 for (const int64_t i : sub_range) {
202 fftwf_execute_dft_r2c(
204 forward_transform_tasks[i].input,
205 reinterpret_cast<fftwf_complex *>(forward_transform_tasks[i].output));
214 const float4 normalization_scale =
float(spatial_size.x) * spatial_size.y * normalization_factor;
216 for (const int64_t channel : IndexRange(input_channels_count)) {
217 const int kernel_channel = is_color_kernel ? channel : 0;
218 std::complex<float> *image_buffer = image_frequency_domain_channels[channel];
219 const std::complex<float> *kernel_buffer = kernel_frequency_domain_channels[kernel_channel];
220 for (const int64_t y : sub_y_range) {
221 for (const int64_t x : IndexRange(frequency_size.x)) {
222 const int64_t index = x + y * int64_t(frequency_size.x);
223 image_buffer[index] *= kernel_buffer[index] / normalization_scale[kernel_channel];
231 for (
const int64_t channel : sub_range) {
232 fftwf_execute_dft_c2r(
234 reinterpret_cast<fftwf_complex *
>(image_frequency_domain_channels[channel]),
235 image_spatial_domain_channels[channel]);
239 Result output_cpu =
context.create_result(
input.type());
240 output_cpu.allocate_texture(
input.domain(),
true, ResultStorageType::CPU);
243 threading::memory_bandwidth_bound_task(
input.size_in_bytes(), [&]() {
244 parallel_for(image_size, [&](const int2 texel) {
245 float4 color = float4(0.0f);
246 for (const int channel : IndexRange(input_channels_count)) {
247 const int64_t index = texel.x + texel.y * int64_t(spatial_size.x);
248 color[channel] = image_spatial_domain_channels[channel][index];
250 output_cpu.store_pixel(texel, color);
255 Result output_gpu = output_cpu.upload_to_gpu(
true);
256 output.steal_data(output_gpu);
257 output_cpu.release();
260 output.steal_data(output_cpu);
270 output.store_pixel(texel, input.load_pixel<float4>(texel));
void GPU_texture_copy(blender::gpu::Texture *dst, blender::gpu::Texture *src)
static T sum(const btAlignedObjectArray< T > &items)