Blender V4.3
summed_area_table.cc
Go to the documentation of this file.
1/* SPDX-FileCopyrightText: 2023 Blender Authors
2 *
3 * SPDX-License-Identifier: GPL-2.0-or-later */
4
5#include "BLI_assert.h"
6#include "BLI_math_base.hh"
7#include "BLI_math_vector.hh"
9
10#include "GPU_compute.hh"
11#include "GPU_shader.hh"
12#include "GPU_texture.hh"
13
14#include "COM_context.hh"
15#include "COM_result.hh"
16#include "COM_utilities.hh"
17
19
21
22/* ------------------------------------------------------------------------------------------------
23 * Summed Area Table
24 *
25 * An implementation of the summed area table algorithm from the paper:
26 *
27 * Nehab, Diego, et al. "GPU-efficient recursive filtering and summed-area tables."
28 *
29 * This file is a straightforward implementation of each of the four passes described in
30 * Algorithm SAT in section 6 of the paper. Note that we use Blender's convention of first
31 * quadrant images, so we call prologues horizontal or X prologues, and we call transposed
32 * prologues vertical or Y prologues. See each of the functions for more details. */
33
35{
36 switch (operation) {
38 return "compositor_summed_area_table_compute_incomplete_prologues_identity";
40 return "compositor_summed_area_table_compute_incomplete_prologues_square";
41 }
42
44 return "";
45}
46
47/* Computes the horizontal and vertical incomplete prologues from the given input using equations
48 * (42) and (43) to implement the first pass of Algorithm SAT. Those equations accumulatively sum
49 * each row in each block, writing the final sum to the X incomplete block, then sum each column in
50 * the X accumulatively summed block, writing the final sum to the Y incomplete block. The output
51 * is the prologues along the horizontal and vertical directions, where the accumulation axis is
52 * stored along the vertical axis, so the X prologues are stored transposed for better cache
53 * locality. */
55 Result &input,
57 Result &incomplete_x_prologues,
58 Result &incomplete_y_prologues)
59{
60 GPUShader *shader = context.get_shader(get_compute_incomplete_prologues_shader(operation),
62 GPU_shader_bind(shader);
63
64 input.bind_as_texture(shader, "input_tx");
65
66 const int2 group_size = int2(16);
67 const int2 input_size = input.domain().size;
68 const int2 number_of_groups = math::divide_ceil(input_size, group_size);
69
70 incomplete_x_prologues.allocate_texture(Domain(int2(input_size.y, number_of_groups.x)));
71 incomplete_x_prologues.bind_as_image(shader, "incomplete_x_prologues_img");
72
73 incomplete_y_prologues.allocate_texture(Domain(int2(input_size.x, number_of_groups.y)));
74 incomplete_y_prologues.bind_as_image(shader, "incomplete_y_prologues_img");
75
76 GPU_compute_dispatch(shader, number_of_groups.x, number_of_groups.y, 1);
77
79 input.unbind_as_texture();
80 incomplete_x_prologues.unbind_as_image();
81 incomplete_y_prologues.unbind_as_image();
82}
83
84/* Computes the complete X prologues and their sum from the incomplete X prologues using equation
85 * (44) to implement the second pass of Algorithm SAT. That equation simply sum the incomplete
86 * prologue and all incomplete prologues before it, writing the sum to the complete prologue. Then,
87 * each of the complete prologues is summed using parallel reduction writing the sum to the output
88 * sum for each block. The shader runs in parallel vertically, but serially horizontally. Note that
89 * the input incomplete X prologues and output complete X prologues are stored transposed for
90 * better cache locality, but the output sum is stored straight, not transposed. */
92 Result &input,
93 Result &incomplete_x_prologues,
94 Result &complete_x_prologues,
95 Result &complete_x_prologues_sum)
96{
97 GPUShader *shader = context.get_shader(
98 "compositor_summed_area_table_compute_complete_x_prologues", ResultPrecision::Full);
99 GPU_shader_bind(shader);
100
101 incomplete_x_prologues.bind_as_texture(shader, "incomplete_x_prologues_tx");
102
103 const int2 group_size = int2(16);
104 const int2 input_size = input.domain().size;
105 const int2 number_of_groups = math::divide_ceil(input_size, group_size);
106
107 complete_x_prologues.allocate_texture(incomplete_x_prologues.domain());
108 complete_x_prologues.bind_as_image(shader, "complete_x_prologues_img");
109
110 complete_x_prologues_sum.allocate_texture(Domain(number_of_groups));
111 complete_x_prologues_sum.bind_as_image(shader, "complete_x_prologues_sum_img");
112
113 GPU_compute_dispatch(shader, number_of_groups.y, 1, 1);
114
116 incomplete_x_prologues.unbind_as_texture();
117 complete_x_prologues.unbind_as_image();
118 complete_x_prologues_sum.unbind_as_image();
119}
120
121/* Computes the complete Y prologues from the incomplete Y prologues using equation (45) to
122 * implement the third pass of Algorithm SAT. That equation simply sum the incomplete prologue and
123 * all incomplete prologues before it, then adds the sum of the complete X prologue for the same
124 * block, writing the sum to the complete prologue. The shader runs in parallel horizontally, but
125 * serially vertically. */
127 Result &input,
128 Result &incomplete_y_prologues,
129 Result &complete_x_prologues_sum,
130 Result &complete_y_prologues)
131{
132 GPUShader *shader = context.get_shader(
133 "compositor_summed_area_table_compute_complete_y_prologues", ResultPrecision::Full);
134 GPU_shader_bind(shader);
135
136 incomplete_y_prologues.bind_as_texture(shader, "incomplete_y_prologues_tx");
137 complete_x_prologues_sum.bind_as_texture(shader, "complete_x_prologues_sum_tx");
138
139 const int2 group_size = int2(16);
140 const int2 input_size = input.domain().size;
141 const int2 number_of_groups = math::divide_ceil(input_size, group_size);
142
143 complete_y_prologues.allocate_texture(incomplete_y_prologues.domain());
144 complete_y_prologues.bind_as_image(shader, "complete_y_prologues_img");
145
146 GPU_compute_dispatch(shader, number_of_groups.x, 1, 1);
147
149 incomplete_y_prologues.unbind_as_texture();
150 complete_x_prologues_sum.unbind_as_texture();
151 complete_y_prologues.unbind_as_image();
152}
153
155{
156 switch (operation) {
158 return "compositor_summed_area_table_compute_complete_blocks_identity";
160 return "compositor_summed_area_table_compute_complete_blocks_square";
161 }
162
164 return "";
165}
166
167/* Computes the final summed area table blocks from the complete X and Y prologues using equation
168 * (41) to implement the fourth pass of Algorithm SAT. That equation simply uses an intermediate
169 * shared memory to cascade the accumulation of rows and then column in each block using the
170 * prologues as initial values and writes each step of the latter accumulation to the output. */
171static void compute_complete_blocks(Context &context,
172 Result &input,
173 Result &complete_x_prologues,
174 Result &complete_y_prologues,
175 SummedAreaTableOperation operation,
176 Result &output)
177{
178 GPUShader *shader = context.get_shader(get_compute_complete_blocks_shader(operation),
180 GPU_shader_bind(shader);
181
182 input.bind_as_texture(shader, "input_tx");
183 complete_x_prologues.bind_as_texture(shader, "complete_x_prologues_tx");
184 complete_y_prologues.bind_as_texture(shader, "complete_y_prologues_tx");
185
186 output.allocate_texture(input.domain());
187 output.bind_as_image(shader, "output_img", true);
188
189 const int2 group_size = int2(16);
190 const int2 input_size = input.domain().size;
191 const int2 number_of_groups = math::divide_ceil(input_size, group_size);
192
193 GPU_compute_dispatch(shader, number_of_groups.x, number_of_groups.y, 1);
194
196 input.unbind_as_texture();
197 complete_x_prologues.unbind_as_texture();
198 complete_y_prologues.unbind_as_texture();
199 output.unbind_as_image();
200}
201
203 Result &input,
204 Result &output,
205 SummedAreaTableOperation operation)
206{
207 Result incomplete_x_prologues = context.create_result(ResultType::Color, ResultPrecision::Full);
208 Result incomplete_y_prologues = context.create_result(ResultType::Color, ResultPrecision::Full);
210 context, input, operation, incomplete_x_prologues, incomplete_y_prologues);
211
212 Result complete_x_prologues = context.create_result(ResultType::Color, ResultPrecision::Full);
213 Result complete_x_prologues_sum = context.create_result(ResultType::Color,
216 context, input, incomplete_x_prologues, complete_x_prologues, complete_x_prologues_sum);
217 incomplete_x_prologues.release();
218
219 Result complete_y_prologues = context.create_result(ResultType::Color, ResultPrecision::Full);
221 context, input, incomplete_y_prologues, complete_x_prologues_sum, complete_y_prologues);
222 incomplete_y_prologues.release();
223 complete_x_prologues_sum.release();
224
226 context, input, complete_x_prologues, complete_y_prologues, operation, output);
227 complete_x_prologues.release();
228 complete_y_prologues.release();
229}
230
231} // namespace blender::realtime_compositor
#define BLI_assert_unreachable()
Definition BLI_assert.h:97
void GPU_compute_dispatch(GPUShader *shader, uint groups_x_len, uint groups_y_len, uint groups_z_len)
void GPU_shader_bind(GPUShader *shader)
void GPU_shader_unbind()
struct GPUShader GPUShader
void bind_as_image(GPUShader *shader, const char *image_name, bool read=false) const
Definition result.cc:264
const Domain & domain() const
Definition result.cc:712
void allocate_texture(Domain domain, bool from_pool=true)
Definition result.cc:204
void bind_as_texture(GPUShader *shader, const char *texture_name) const
Definition result.cc:253
VecBase< T, Size > divide_ceil(const VecBase< T, Size > &a, const VecBase< T, Size > &b)
static void compute_complete_blocks(Context &context, Result &input, Result &complete_x_prologues, Result &complete_y_prologues, SummedAreaTableOperation operation, Result &output)
static void compute_complete_y_prologues(Context &context, Result &input, Result &incomplete_y_prologues, Result &complete_x_prologues_sum, Result &complete_y_prologues)
static void compute_complete_x_prologues(Context &context, Result &input, Result &incomplete_x_prologues, Result &complete_x_prologues, Result &complete_x_prologues_sum)
static const char * get_compute_complete_blocks_shader(SummedAreaTableOperation operation)
static const char * get_compute_incomplete_prologues_shader(SummedAreaTableOperation operation)
void summed_area_table(Context &context, Result &input, Result &output, SummedAreaTableOperation operation=SummedAreaTableOperation::Identity)
static void compute_incomplete_prologues(Context &context, Result &input, SummedAreaTableOperation operation, Result &incomplete_x_prologues, Result &incomplete_y_prologues)
VecBase< int32_t, 2 > int2