Blender V5.0
summed_area_table.cc
Go to the documentation of this file.
1/* SPDX-FileCopyrightText: 2023 Blender Authors
2 *
3 * SPDX-License-Identifier: GPL-2.0-or-later */
4
5#include "BLI_assert.h"
6#include "BLI_index_range.hh"
7#include "BLI_math_vector.hh"
9#include "BLI_task.hh"
10
11#include "GPU_compute.hh"
12#include "GPU_shader.hh"
13
14#include "COM_context.hh"
15#include "COM_result.hh"
16
18
19namespace blender::compositor {
20
21/* ------------------------------------------------------------------------------------------------
22 * Summed Area Table
23 *
24 * An implementation of the summed area table algorithm from the paper:
25 *
26 * Nehab, Diego, et al. "GPU-efficient recursive filtering and summed-area tables."
27 *
28 * This file is a straightforward implementation of each of the four passes described in
29 * Algorithm SAT in section 6 of the paper. Note that we use Blender's convention of first
30 * quadrant images, so we call prologues horizontal or X prologues, and we call transposed
31 * prologues vertical or Y prologues. See each of the functions for more details. */
32
34{
35 switch (operation) {
37 return "compositor_summed_area_table_compute_incomplete_prologues_identity";
39 return "compositor_summed_area_table_compute_incomplete_prologues_square";
40 }
41
43 return "";
44}
45
46/* Computes the horizontal and vertical incomplete prologues from the given input using equations
47 * (42) and (43) to implement the first pass of Algorithm SAT. Those equations accumulatively sum
48 * each row in each block, writing the final sum to the X incomplete block, then sum each column in
49 * the X accumulatively summed block, writing the final sum to the Y incomplete block. The output
50 * is the prologues along the horizontal and vertical directions, where the accumulation axis is
51 * stored along the vertical axis, so the X prologues are stored transposed for better cache
52 * locality. */
56 Result &incomplete_x_prologues,
57 Result &incomplete_y_prologues)
58{
59 gpu::Shader *shader = context.get_shader(get_compute_incomplete_prologues_shader(operation),
61 GPU_shader_bind(shader);
62
63 input.bind_as_texture(shader, "input_tx");
64
65 const int2 group_size = int2(16);
66 const int2 input_size = input.domain().size;
67 const int2 number_of_groups = math::divide_ceil(input_size, group_size);
68
69 incomplete_x_prologues.allocate_texture(Domain(int2(input_size.y, number_of_groups.x)));
70 incomplete_x_prologues.bind_as_image(shader, "incomplete_x_prologues_img");
71
72 incomplete_y_prologues.allocate_texture(Domain(int2(input_size.x, number_of_groups.y)));
73 incomplete_y_prologues.bind_as_image(shader, "incomplete_y_prologues_img");
74
75 GPU_compute_dispatch(shader, number_of_groups.x, number_of_groups.y, 1);
76
78 input.unbind_as_texture();
79 incomplete_x_prologues.unbind_as_image();
80 incomplete_y_prologues.unbind_as_image();
81}
82
83/* Computes the complete X prologues and their sum from the incomplete X prologues using equation
84 * (44) to implement the second pass of Algorithm SAT. That equation simply sum the incomplete
85 * prologue and all incomplete prologues before it, writing the sum to the complete prologue. Then,
86 * each of the complete prologues is summed using parallel reduction writing the sum to the output
87 * sum for each block. The shader runs in parallel vertically, but serially horizontally. Note that
88 * the input incomplete X prologues and output complete X prologues are stored transposed for
89 * better cache locality, but the output sum is stored straight, not transposed. */
92 Result &incomplete_x_prologues,
93 Result &complete_x_prologues,
94 Result &complete_x_prologues_sum)
95{
96 gpu::Shader *shader = context.get_shader(
97 "compositor_summed_area_table_compute_complete_x_prologues", ResultPrecision::Full);
98 GPU_shader_bind(shader);
99
100 incomplete_x_prologues.bind_as_texture(shader, "incomplete_x_prologues_tx");
101
102 const int2 group_size = int2(16);
103 const int2 input_size = input.domain().size;
104 const int2 number_of_groups = math::divide_ceil(input_size, group_size);
105
106 complete_x_prologues.allocate_texture(incomplete_x_prologues.domain());
107 complete_x_prologues.bind_as_image(shader, "complete_x_prologues_img");
108
109 complete_x_prologues_sum.allocate_texture(Domain(number_of_groups));
110 complete_x_prologues_sum.bind_as_image(shader, "complete_x_prologues_sum_img");
111
112 GPU_compute_dispatch(shader, number_of_groups.y, 1, 1);
113
115 incomplete_x_prologues.unbind_as_texture();
116 complete_x_prologues.unbind_as_image();
117 complete_x_prologues_sum.unbind_as_image();
118}
119
120/* Computes the complete Y prologues from the incomplete Y prologues using equation (45) to
121 * implement the third pass of Algorithm SAT. That equation simply sum the incomplete prologue and
122 * all incomplete prologues before it, then adds the sum of the complete X prologue for the same
123 * block, writing the sum to the complete prologue. The shader runs in parallel horizontally, but
124 * serially vertically. */
126 Result &input,
127 Result &incomplete_y_prologues,
128 Result &complete_x_prologues_sum,
129 Result &complete_y_prologues)
130{
131 gpu::Shader *shader = context.get_shader(
132 "compositor_summed_area_table_compute_complete_y_prologues", ResultPrecision::Full);
133 GPU_shader_bind(shader);
134
135 incomplete_y_prologues.bind_as_texture(shader, "incomplete_y_prologues_tx");
136 complete_x_prologues_sum.bind_as_texture(shader, "complete_x_prologues_sum_tx");
137
138 const int2 group_size = int2(16);
139 const int2 input_size = input.domain().size;
140 const int2 number_of_groups = math::divide_ceil(input_size, group_size);
141
142 complete_y_prologues.allocate_texture(incomplete_y_prologues.domain());
143 complete_y_prologues.bind_as_image(shader, "complete_y_prologues_img");
144
145 GPU_compute_dispatch(shader, number_of_groups.x, 1, 1);
146
148 incomplete_y_prologues.unbind_as_texture();
149 complete_x_prologues_sum.unbind_as_texture();
150 complete_y_prologues.unbind_as_image();
151}
152
154{
155 switch (operation) {
157 return "compositor_summed_area_table_compute_complete_blocks_identity";
159 return "compositor_summed_area_table_compute_complete_blocks_square";
160 }
161
163 return "";
164}
165
166/* Computes the final summed area table blocks from the complete X and Y prologues using equation
167 * (41) to implement the fourth pass of Algorithm SAT. That equation simply uses an intermediate
168 * shared memory to cascade the accumulation of rows and then column in each block using the
169 * prologues as initial values and writes each step of the latter accumulation to the output. */
170static void compute_complete_blocks(Context &context,
171 Result &input,
172 Result &complete_x_prologues,
173 Result &complete_y_prologues,
174 SummedAreaTableOperation operation,
175 Result &output)
176{
177 gpu::Shader *shader = context.get_shader(get_compute_complete_blocks_shader(operation),
179 GPU_shader_bind(shader);
180
181 input.bind_as_texture(shader, "input_tx");
182 complete_x_prologues.bind_as_texture(shader, "complete_x_prologues_tx");
183 complete_y_prologues.bind_as_texture(shader, "complete_y_prologues_tx");
184
185 output.allocate_texture(input.domain());
186 output.bind_as_image(shader, "output_img", true);
187
188 const int2 group_size = int2(16);
189 const int2 input_size = input.domain().size;
190 const int2 number_of_groups = math::divide_ceil(input_size, group_size);
191
192 GPU_compute_dispatch(shader, number_of_groups.x, number_of_groups.y, 1);
193
195 input.unbind_as_texture();
196 complete_x_prologues.unbind_as_texture();
197 complete_y_prologues.unbind_as_texture();
198 output.unbind_as_image();
199}
200
201static void summed_area_table_gpu(Context &context,
202 Result &input,
203 Result &output,
204 SummedAreaTableOperation operation)
205{
206 Result incomplete_x_prologues = context.create_result(ResultType::Color, ResultPrecision::Full);
207 Result incomplete_y_prologues = context.create_result(ResultType::Color, ResultPrecision::Full);
209 context, input, operation, incomplete_x_prologues, incomplete_y_prologues);
210
211 Result complete_x_prologues = context.create_result(ResultType::Color, ResultPrecision::Full);
212 Result complete_x_prologues_sum = context.create_result(ResultType::Color,
215 context, input, incomplete_x_prologues, complete_x_prologues, complete_x_prologues_sum);
216 incomplete_x_prologues.release();
217
218 Result complete_y_prologues = context.create_result(ResultType::Color, ResultPrecision::Full);
220 context, input, incomplete_y_prologues, complete_x_prologues_sum, complete_y_prologues);
221 incomplete_y_prologues.release();
222 complete_x_prologues_sum.release();
223
225 context, input, complete_x_prologues, complete_y_prologues, operation, output);
226 complete_x_prologues.release();
227 complete_y_prologues.release();
228}
229
230/* Computes the summed area table as a cascade of a horizontal summing pass followed by a vertical
231 * summing pass. */
233 Result &output,
234 SummedAreaTableOperation operation)
235{
236 output.allocate_texture(input.domain());
237
238 /* Horizontal summing pass. */
239 const int2 size = input.domain().size;
240 threading::parallel_for(IndexRange(size.y), 1, [&](const IndexRange range_y) {
241 for (const int y : range_y) {
242 float4 accumulated_color = float4(0.0f);
243 for (const int x : IndexRange(size.x)) {
244 const int2 texel = int2(x, y);
245 const float4 color = input.load_pixel<float4>(texel);
246 accumulated_color += operation == SummedAreaTableOperation::Square ? color * color : color;
247 output.store_pixel(texel, accumulated_color);
248 }
249 }
250 });
251
252 /* Vertical summing pass. */
253 threading::parallel_for(IndexRange(size.x), 1, [&](const IndexRange range_x) {
254 for (const int x : range_x) {
255 float4 accumulated_color = float4(0.0f);
256 for (const int y : IndexRange(size.y)) {
257 const int2 texel = int2(x, y);
258 const float4 color = output.load_pixel<float4>(texel);
259 accumulated_color += color;
260 output.store_pixel(texel, accumulated_color);
261 }
262 }
263 });
264}
265
267 Result &input,
268 Result &output,
269 SummedAreaTableOperation operation)
270{
271 if (context.use_gpu()) {
272 summed_area_table_gpu(context, input, output, operation);
273 }
274 else {
276 }
277}
278
279} // namespace blender::compositor
#define BLI_assert_unreachable()
Definition BLI_assert.h:93
void GPU_compute_dispatch(blender::gpu::Shader *shader, uint groups_x_len, uint groups_y_len, uint groups_z_len, const blender::gpu::shader::SpecializationConstants *constants_state=nullptr)
void GPU_shader_bind(blender::gpu::Shader *shader, const blender::gpu::shader::SpecializationConstants *constants_state=nullptr)
void GPU_shader_unbind()
static DBVT_INLINE btScalar size(const btDbvtVolume &a)
Definition btDbvt.cpp:52
void allocate_texture(const Domain domain, const bool from_pool=true, const std::optional< ResultStorageType > storage_type=std::nullopt)
Definition result.cc:389
void unbind_as_texture() const
Definition result.cc:511
void bind_as_texture(gpu::Shader *shader, const char *texture_name) const
Definition result.cc:487
const Domain & domain() const
void unbind_as_image() const
Definition result.cc:517
void bind_as_image(gpu::Shader *shader, const char *image_name, bool read=false) const
Definition result.cc:498
#define input
#define output
void summed_area_table(Context &context, Result &input, Result &output, SummedAreaTableOperation operation=SummedAreaTableOperation::Identity)
static void compute_complete_blocks(Context &context, Result &input, Result &complete_x_prologues, Result &complete_y_prologues, SummedAreaTableOperation operation, Result &output)
static const char * get_compute_complete_blocks_shader(SummedAreaTableOperation operation)
static void compute_complete_y_prologues(Context &context, Result &input, Result &incomplete_y_prologues, Result &complete_x_prologues_sum, Result &complete_y_prologues)
static void summed_area_table_cpu(Result &input, Result &output, SummedAreaTableOperation operation)
static void summed_area_table_gpu(Context &context, Result &input, Result &output, SummedAreaTableOperation operation)
static void compute_complete_x_prologues(Context &context, Result &input, Result &incomplete_x_prologues, Result &complete_x_prologues, Result &complete_x_prologues_sum)
static void compute_incomplete_prologues(Context &context, Result &input, SummedAreaTableOperation operation, Result &incomplete_x_prologues, Result &incomplete_y_prologues)
static const char * get_compute_incomplete_prologues_shader(SummedAreaTableOperation operation)
VecBase< T, Size > divide_ceil(const VecBase< T, Size > &a, const VecBase< T, Size > &b)
void parallel_for(const IndexRange range, const int64_t grain_size, const Function &function, const TaskSizeHints &size_hints=detail::TaskSizeHints_Static(1))
Definition BLI_task.hh:93
VecBase< int32_t, 2 > int2