Blender V5.0
deriche_gaussian_blur.cc
Go to the documentation of this file.
1/* SPDX-FileCopyrightText: 2024 Blender Authors
2 *
3 * SPDX-License-Identifier: GPL-2.0-or-later */
4
5#include "BLI_assert.h"
6#include "BLI_math_vector.hh"
8
9#include "GPU_shader.hh"
10
11#include "COM_context.hh"
12#include "COM_result.hh"
13#include "COM_utilities.hh"
14
17
18namespace blender::compositor {
19
20#define FILTER_ORDER 4
21
22/* See sum_causal_and_non_causal_results. */
24 const Result &causal_input,
25 const Result &non_causal_input,
27{
28 gpu::Shader *shader = context.get_shader("compositor_deriche_gaussian_blur_sum");
29 GPU_shader_bind(shader);
30
31 causal_input.bind_as_texture(shader, "causal_input_tx");
32 non_causal_input.bind_as_texture(shader, "non_causal_input_tx");
33
34 const Domain domain = causal_input.domain();
35 const Domain transposed_domain = domain.transposed();
36 output.allocate_texture(transposed_domain);
37 output.bind_as_image(shader, "output_img");
38
40
42 causal_input.unbind_as_texture();
43 non_causal_input.unbind_as_texture();
44 output.unbind_as_image();
45}
46
47/* See sum_causal_and_non_causal_results. */
48static void sum_causal_and_non_causal_results_cpu(const Result &causal_input,
49 const Result &non_causal_input,
51{
52 const Domain domain = causal_input.domain();
53 const Domain transposed_domain = domain.transposed();
54 output.allocate_texture(transposed_domain);
55
56 parallel_for(domain.size, [&](const int2 texel) {
57 /* The Deriche filter is a parallel interconnection filter, meaning its output is the sum of
58 * its causal and non causal filters. */
59 float4 filter_output = causal_input.load_pixel<float4>(texel) +
60 non_causal_input.load_pixel<float4>(texel);
61
62 /* Write the color using the transposed texel. See the sum_causal_and_non_causal_results method
63 * in the deriche_gaussian_blur.cc file for more information on the rational behind this. */
64 output.store_pixel(int2(texel.y, texel.x), filter_output);
65 });
66}
67
68/* Sum the causal and non causal outputs of the filter and write the sum to the output. This is
69 * because the Deriche filter is a parallel interconnection filter, meaning its output is the sum
70 * of its causal and non causal filters. The output is expected not to be allocated as it will be
71 * allocated internally.
72 *
73 * The output is allocated and written transposed, that is, with a height equivalent to the width
74 * of the input and vice versa. This is done as a performance optimization. The blur pass will
75 * blur the image horizontally and write it to the intermediate output transposed. Then the
76 * vertical pass will execute the same horizontal blur shader, but since its input is transposed,
77 * it will effectively do a vertical blur and write to the output transposed, effectively undoing
78 * the transposition in the horizontal pass. This is done to improve spatial cache locality in the
79 * shader and to avoid having two separate shaders for each blur pass. */
81 const Result &causal_input,
82 const Result &non_causal_input,
84{
85 if (context.use_gpu()) {
86 sum_causal_and_non_causal_results_gpu(context, causal_input, non_causal_input, output);
87 }
88 else {
89 sum_causal_and_non_causal_results_cpu(causal_input, non_causal_input, output);
90 }
91}
92
93static void blur_pass_gpu(Context &context,
94 const Result &input,
95 Result &causal_result,
96 Result &non_causal_result,
97 const float sigma)
98{
99 gpu::Shader *shader = context.get_shader("compositor_deriche_gaussian_blur");
100 GPU_shader_bind(shader);
101
102 const DericheGaussianCoefficients &coefficients =
103 context.cache_manager().deriche_gaussian_coefficients.get(context, sigma);
104
106 "causal_feedforward_coefficients",
109 "non_causal_feedforward_coefficients",
112 shader, "feedback_coefficients", float4(coefficients.feedback_coefficients()));
114 shader, "causal_boundary_coefficient", float(coefficients.causal_boundary_coefficient()));
116 "non_causal_boundary_coefficient",
117 float(coefficients.non_causal_boundary_coefficient()));
118
119 input.bind_as_texture(shader, "input_tx");
120
121 const Domain domain = input.domain();
122 causal_result.allocate_texture(domain);
123 non_causal_result.allocate_texture(domain);
124 causal_result.bind_as_image(shader, "causal_output_img");
125 non_causal_result.bind_as_image(shader, "non_causal_output_img");
126
127 /* The second dispatch dimension is two dispatches, one for the causal filter and one for the non
128 * causal one. */
129 compute_dispatch_threads_at_least(shader, int2(domain.size.y, 2), int2(128, 2));
130
132 input.unbind_as_texture();
133 causal_result.unbind_as_image();
134 non_causal_result.unbind_as_image();
135}
136
137static void blur_pass_cpu(Context &context,
138 const Result &input,
139 Result &causal_output,
140 Result &non_causal_output,
141 const float sigma)
142{
143 const DericheGaussianCoefficients &coefficients =
144 context.cache_manager().deriche_gaussian_coefficients.get(context, sigma);
145
146 const float4 causal_feedforward_coefficients = float4(
147 coefficients.causal_feedforward_coefficients());
148 const float4 non_causal_feedforward_coefficients = float4(
150 const float4 feedback_coefficients = float4(coefficients.feedback_coefficients());
151 const float causal_boundary_coefficient = float(coefficients.causal_boundary_coefficient());
152 const float non_causal_boundary_coefficient = float(
153 coefficients.non_causal_boundary_coefficient());
154
155 const Domain domain = input.domain();
156 causal_output.allocate_texture(domain);
157 non_causal_output.allocate_texture(domain);
158
159 /* The first dispatch dimension is two dispatches, one for the causal filter and one for the non
160 * causal one. */
161 const int2 parallel_for_size = int2(2, domain.size.y);
162 /* Blur the input horizontally by applying a fourth order IIR filter approximating a Gaussian
163 * filter using Deriche's design method. This is based on the following paper:
164 *
165 * Deriche, Rachid. Recursively implementating the Gaussian and its derivatives. Diss. INRIA,
166 * 1993.
167 *
168 * We run two filters per row in parallel, one for the causal filter and one for the non causal
169 * filter, storing the result of each separately. See the DericheGaussianCoefficients class and
170 * the implementation for more information. */
171 parallel_for(parallel_for_size, [&](const int2 invocation) {
172 /* The code runs parallel across rows but serially across columns. */
173 int y = invocation.y;
174 int width = input.domain().size.x;
175
176 /* The second dispatch dimension is two dispatches, one for the causal filter and one for the
177 * non causal one. */
178 bool is_causal = invocation.x == 0;
179 float4 feedforward_coefficients = is_causal ? causal_feedforward_coefficients :
180 non_causal_feedforward_coefficients;
181 float boundary_coefficient = is_causal ? causal_boundary_coefficient :
182 non_causal_boundary_coefficient;
183
184 /* Create an array that holds the last FILTER_ORDER inputs along with the current input. The
185 * current input is at index 0 and the oldest input is at index FILTER_ORDER. We assume Neumann
186 * boundary condition, so we initialize all inputs by the boundary pixel. */
187 int2 boundary_texel = is_causal ? int2(0, y) : int2(width - 1, y);
188 float4 input_boundary = input.load_pixel<float4>(boundary_texel);
189 float4 inputs[FILTER_ORDER + 1] = {
190 input_boundary, input_boundary, input_boundary, input_boundary, input_boundary};
191
192 /* Create an array that holds the last FILTER_ORDER outputs along with the current output. The
193 * current output is at index 0 and the oldest output is at index FILTER_ORDER. We assume
194 * Neumann boundary condition, so we initialize all outputs by the boundary pixel multiplied by
195 * the boundary coefficient. See the DericheGaussianCoefficients class for more information on
196 * the boundary handing. */
197 float4 output_boundary = input_boundary * boundary_coefficient;
199 output_boundary, output_boundary, output_boundary, output_boundary, output_boundary};
200
201 for (int x = 0; x < width; x++) {
202 /* Run forward across rows for the causal filter and backward for the non causal filter. */
203 int2 texel = is_causal ? int2(x, y) : int2(width - 1 - x, y);
204 inputs[0] = input.load_pixel<float4>(texel);
205
206 /* Compute Equation (28) for the causal filter or Equation (29) for the non causal filter.
207 * The only difference is that the non causal filter ignores the current value and starts
208 * from the previous input, as can be seen in the subscript of the first input term in both
209 * equations. So add one while indexing the non causal inputs. */
210 outputs[0] = float4(0.0f);
211 int first_input_index = is_causal ? 0 : 1;
212 for (int i = 0; i < FILTER_ORDER; i++) {
213 outputs[0] += feedforward_coefficients[i] * inputs[first_input_index + i];
214 outputs[0] -= feedback_coefficients[i] * outputs[i + 1];
215 }
216
217 /* Store the causal and non causal outputs independently, then sum them in a separate shader
218 * dispatch for better parallelism. */
219 if (is_causal) {
220 causal_output.store_pixel(texel, outputs[0]);
221 }
222 else {
223 non_causal_output.store_pixel(texel, outputs[0]);
224 }
225
226 /* Shift the inputs temporally by one. The oldest input is discarded, while the current input
227 * will retain its value but will be overwritten with the new current value in the next
228 * iteration. */
229 for (int i = FILTER_ORDER; i >= 1; i--) {
230 inputs[i] = inputs[i - 1];
231 }
232
233 /* Shift the outputs temporally by one. The oldest output is discarded, while the current
234 * output will retain its value but will be overwritten with the new current value in the
235 * next iteration. */
236 for (int i = FILTER_ORDER; i >= 1; i--) {
237 outputs[i] = outputs[i - 1];
238 }
239 }
240 });
241}
242
243static void blur_pass(Context &context, const Result &input, Result &output, const float sigma)
244{
245 Result causal_result = context.create_result(ResultType::Color);
246 Result non_causal_result = context.create_result(ResultType::Color);
247
248 if (context.use_gpu()) {
249 blur_pass_gpu(context, input, causal_result, non_causal_result, sigma);
250 }
251 else {
252 blur_pass_cpu(context, input, causal_result, non_causal_result, sigma);
253 }
254
255 sum_causal_and_non_causal_results(context, causal_result, non_causal_result, output);
256 causal_result.release();
257 non_causal_result.release();
258}
259
261 const Result &input,
262 Result &output,
263 const float2 &sigma)
264{
265 BLI_assert_msg(math::reduce_max(sigma) >= 3.0f,
266 "Deriche filter is slower and less accurate than direct convolution for sigma "
267 "values less 3. Use direct convolution blur instead.");
268 BLI_assert_msg(math::reduce_max(sigma) < 32.0f,
269 "Deriche filter is not accurate nor numerically stable for sigma values larger "
270 "than 32. Use Van Vliet filter instead.");
271
272 Result horizontal_pass_result = context.create_result(ResultType::Color);
273 blur_pass(context, input, horizontal_pass_result, sigma.x);
274 blur_pass(context, horizontal_pass_result, output, sigma.y);
275 horizontal_pass_result.release();
276}
277
278} // namespace blender::compositor
#define BLI_assert_msg(a, msg)
Definition BLI_assert.h:53
void GPU_shader_uniform_1f(blender::gpu::Shader *sh, const char *name, float value)
void GPU_shader_uniform_4fv(blender::gpu::Shader *sh, const char *name, const float data[4])
void GPU_shader_bind(blender::gpu::Shader *shader, const blender::gpu::shader::SpecializationConstants *constants_state=nullptr)
void GPU_shader_unbind()
Domain transposed() const
Definition domain.cc:29
void store_pixel(const int2 &texel, const T &pixel_value)
void allocate_texture(const Domain domain, const bool from_pool=true, const std::optional< ResultStorageType > storage_type=std::nullopt)
Definition result.cc:389
void unbind_as_texture() const
Definition result.cc:511
void bind_as_texture(gpu::Shader *shader, const char *texture_name) const
Definition result.cc:487
const Domain & domain() const
void unbind_as_image() const
Definition result.cc:517
void bind_as_image(gpu::Shader *shader, const char *image_name, bool read=false) const
Definition result.cc:498
nullptr float
#define FILTER_ORDER
#define input
#define output
static void sum_causal_and_non_causal_results_cpu(const Result &causal_input, const Result &non_causal_input, Result &output)
void compute_dispatch_threads_at_least(gpu::Shader *shader, int2 threads_range, int2 local_size=int2(16))
Definition utilities.cc:196
static void sum_causal_and_non_causal_results(Context &context, const Result &causal_input, const Result &non_causal_input, Result &output)
static void blur_pass_cpu(Context &context, const Result &input, Result &causal_output, Result &non_causal_output, const float sigma)
void deriche_gaussian_blur(Context &context, const Result &input, Result &output, const float2 &sigma)
void parallel_for(const int2 range, const Function &function)
static void blur_pass_gpu(Context &context, const Result &input, Result &causal_result, Result &non_causal_result, const float sigma)
static void sum_causal_and_non_causal_results_gpu(Context &context, const Result &causal_input, const Result &non_causal_input, Result &output)
static void blur_pass(Context &context, const Result &input, Result &output, const float sigma)
T reduce_max(const VecBase< T, Size > &a)
VecBase< float, 4 > float4
VecBase< int32_t, 2 > int2
VecBase< float, 2 > float2
static blender::bke::bNodeSocketTemplate outputs[]
static blender::bke::bNodeSocketTemplate inputs[]
i
Definition text_draw.cc:230