Blender V4.3
scheduler.cc
Go to the documentation of this file.
1/* SPDX-FileCopyrightText: 2023 Blender Authors
2 *
3 * SPDX-License-Identifier: GPL-2.0-or-later */
4
5#include "BLI_map.hh"
6#include "BLI_set.hh"
7#include "BLI_stack.hh"
8#include "BLI_vector.hh"
9#include "BLI_vector_set.hh"
10
12
13#include "BKE_node_runtime.hh"
14
15#include "COM_context.hh"
16#include "COM_scheduler.hh"
17#include "COM_utilities.hh"
18
20
21using namespace nodes::derived_node_tree_types;
22
23/* Add the viewer node which is marked as NODE_DO_OUTPUT in the given context to the given stack.
24 * If no viewer nodes were found, composite nodes can be added as a fallback
25 * viewer node. */
26static bool add_viewer_nodes_in_context(const DTreeContext *context, Stack<DNode> &node_stack)
27{
28 for (const bNode *node : context->btree().nodes_by_type("CompositorNodeViewer")) {
29 if (node->flag & NODE_DO_OUTPUT && !(node->flag & NODE_MUTED)) {
30 node_stack.push(DNode(context, node));
31 return true;
32 }
33 }
34
35 /* The active Composite node was already added, no need to add it again, see the next block. */
36 if (!node_stack.is_empty() && node_stack.peek()->type == CMP_NODE_COMPOSITE) {
37 return false;
38 }
39
40 /* No active viewers exist in this context, try to add the Composite node as a fallback viewer if
41 * it was not already added. */
42 for (const bNode *node : context->btree().nodes_by_type("CompositorNodeComposite")) {
43 if (node->flag & NODE_DO_OUTPUT && !(node->flag & NODE_MUTED)) {
44 node_stack.push(DNode(context, node));
45 return true;
46 }
47 }
48
49 return false;
50}
51
52/* Add the output nodes whose result should be computed to the given stack. This includes File
53 * Output, Composite, and Viewer nodes. Viewer nodes are a special case, as only the nodes that
54 * satisfies the requirements in the add_viewer_nodes_in_context function are added. First, the
55 * active context is searched for viewer nodes, if non were found, the root context is searched.
56 * For more information on what contexts mean here, see the DerivedNodeTree::active_context()
57 * function. */
58static void add_output_nodes(const Context &context,
59 const DerivedNodeTree &tree,
60 Stack<DNode> &node_stack)
61{
62 const DTreeContext &root_context = tree.root_context();
63
64 /* Only add File Output nodes if the context supports them. */
65 if (context.use_file_output()) {
66 for (const bNode *node : root_context.btree().nodes_by_type("CompositorNodeOutputFile")) {
67 if (!(node->flag & NODE_MUTED)) {
68 node_stack.push(DNode(&root_context, node));
69 }
70 }
71 }
72
73 /* Only add the Composite output node if the context supports composite outputs. The active
74 * Composite node may still be added as a fallback viewer output below. */
75 if (context.use_composite_output()) {
76 for (const bNode *node : root_context.btree().nodes_by_type("CompositorNodeComposite")) {
77 if (node->flag & NODE_DO_OUTPUT && !(node->flag & NODE_MUTED)) {
78 node_stack.push(DNode(&root_context, node));
79 break;
80 }
81 }
82 }
83
84 const DTreeContext &active_context = tree.active_context();
85 const bool viewer_was_added = add_viewer_nodes_in_context(&active_context, node_stack);
86
87 /* An active viewer was added, no need to search further. */
88 if (viewer_was_added) {
89 return;
90 }
91
92 /* If the active context is the root one and no viewer nodes were found, we consider this node
93 * tree to have no viewer nodes, even if one of the non-active descendants have viewer nodes. */
94 if (active_context.is_root()) {
95 return;
96 }
97
98 /* The active context doesn't have a viewer node, search in the root context as a fallback. */
99 add_viewer_nodes_in_context(&tree.root_context(), node_stack);
100}
101
102/* A type representing a mapping that associates each node with a heuristic estimation of the
103 * number of intermediate buffers needed to compute it and all of its dependencies. See the
104 * compute_number_of_needed_buffers function for more information. */
106
107/* Compute a heuristic estimation of the number of intermediate buffers needed to compute each node
108 * and all of its dependencies for all nodes that the given node depends on. The output is a map
109 * that maps each node with the number of intermediate buffers needed to compute it and all of its
110 * dependencies.
111 *
112 * Consider a node that takes n number of buffers as an input from a number of node dependencies,
113 * which we shall call the input nodes. The node also computes and outputs m number of buffers.
114 * In order for the node to compute its output, a number of intermediate buffers will be needed.
115 * Since the node takes n buffers and outputs m buffers, then the number of buffers directly
116 * needed by the node is (n + m). But each of the input buffers are computed by a node that, in
117 * turn, needs a number of buffers to compute its output. So the total number of buffers needed
118 * to compute the output of the node is max(n + m, d) where d is the number of buffers needed by
119 * the input node that needs the largest number of buffers. We only consider the input node that
120 * needs the largest number of buffers, because those buffers can be reused by any input node
121 * that needs a lesser number of buffers.
122 *
123 * Pixel nodes, however, are a special case because links between two pixel nodes inside the same
124 * pixel operation don't pass a buffer, but a single value in the pixel processor. So for pixel
125 * nodes, only inputs and outputs linked to nodes that are not pixel nodes should be considered.
126 * Note that this might not actually be true, because the compiler may decide to split a pixel
127 * operation into multiples ones that will pass buffers, but this is not something that can be
128 * known at scheduling-time. See the discussion in COM_compile_state.hh, COM_evaluator.hh, and
129 * COM_shader_operation.hh for more information. In the node tree shown below, node 4 will have
130 * exactly the same number of needed buffers by node 3, because its inputs and outputs are all
131 * internally linked in the pixel operation.
132 *
133 * Pixel Operation
134 * +------------------------------------------------------+
135 * .------------. | .------------. .------------. .------------. | .------------.
136 * | Node 1 | | | Node 3 | | Node 4 | | Node 5 | | | Node 6 |
137 * | |----|--| |--| |------| |--|--| |
138 * | | .-|--| | | | .---| | | | |
139 * '------------' | | '------------' '------------' | '------------' | '------------'
140 * | +----------------------------------|-------------------+
141 * .------------. | |
142 * | Node 2 | | |
143 * | |--'------------------------------------'
144 * | |
145 * '------------'
146 *
147 * Note that the computed output is not guaranteed to be accurate, and will not be in most cases.
148 * The computation is merely a heuristic estimation that works well in most cases. This is due to a
149 * number of reasons:
150 * - The node tree is actually a graph that allows output sharing, which is not something that was
151 * taken into consideration in this implementation because it is difficult to correctly consider.
152 * - Each node may allocate any number of internal buffers, which is not taken into account in this
153 * implementation because it rarely affects the output and is done by very few nodes.
154 * - The compiler may decide to compiler the schedule differently depending on runtime information
155 * which we can merely speculate at scheduling-time as described above. */
157{
158 NeededBuffers needed_buffers;
159
160 /* A stack of nodes used to traverse the node tree starting from the output nodes. */
161 Stack<DNode> node_stack = output_nodes;
162
163 /* Traverse the node tree in a post order depth first manner and compute the number of needed
164 * buffers for each node. Post order traversal guarantee that all the node dependencies of each
165 * node are computed before it. This is done by pushing all the uncomputed node dependencies to
166 * the node stack first and only popping and computing the node when all its node dependencies
167 * were computed. */
168 while (!node_stack.is_empty()) {
169 /* Do not pop the node immediately, as it may turn out that we can't compute its number of
170 * needed buffers just yet because its dependencies weren't computed, it will be popped later
171 * when needed. */
172 DNode &node = node_stack.peek();
173
174 /* Go over the node dependencies connected to the inputs of the node and push them to the node
175 * stack if they were not computed already. */
176 Set<DNode> pushed_nodes;
177 for (const bNodeSocket *input : node->input_sockets()) {
178 const DInputSocket dinput{node.context(), input};
179
180 /* Get the output linked to the input. If it is null, that means the input is unlinked and
181 * has no dependency node. */
182 const DOutputSocket doutput = get_output_linked_to_input(dinput);
183 if (!doutput) {
184 continue;
185 }
186
187 /* The node dependency was already computed or pushed before, so skip it. */
188 if (needed_buffers.contains(doutput.node()) || pushed_nodes.contains(doutput.node())) {
189 continue;
190 }
191
192 /* The output node needs to be computed, push the node dependency to the node stack and
193 * indicate that it was pushed. */
194 node_stack.push(doutput.node());
195 pushed_nodes.add_new(doutput.node());
196 }
197
198 /* If any of the node dependencies were pushed, that means that not all of them were computed
199 * and consequently we can't compute the number of needed buffers for this node just yet. */
200 if (!pushed_nodes.is_empty()) {
201 continue;
202 }
203
204 /* We don't need to store the result of the pop because we already peeked at it before. */
205 node_stack.pop();
206
207 /* Compute the number of buffers that the node takes as an input as well as the number of
208 * buffers needed to compute the most demanding of the node dependencies. */
209 int number_of_input_buffers = 0;
210 int buffers_needed_by_dependencies = 0;
211 for (const bNodeSocket *input : node->input_sockets()) {
212 const DInputSocket dinput{node.context(), input};
213
214 /* Get the output linked to the input. If it is null, that means the input is unlinked.
215 * Unlinked inputs do not take a buffer, so skip those inputs. */
216 const DOutputSocket doutput = get_output_linked_to_input(dinput);
217 if (!doutput) {
218 continue;
219 }
220
221 /* Since this input is linked, if the link is not between two pixel nodes, it means that the
222 * node takes a buffer through this input and so we increment the number of input buffers. */
223 if (!is_pixel_node(node) || !is_pixel_node(doutput.node())) {
224 number_of_input_buffers++;
225 }
226
227 /* If the number of buffers needed by the node dependency is more than the total number of
228 * buffers needed by the dependencies, then update the latter to be the former. This is
229 * computing the "d" in the aforementioned equation "max(n + m, d)". */
230 const int buffers_needed_by_dependency = needed_buffers.lookup(doutput.node());
231 if (buffers_needed_by_dependency > buffers_needed_by_dependencies) {
232 buffers_needed_by_dependencies = buffers_needed_by_dependency;
233 }
234 }
235
236 /* Compute the number of buffers that will be computed/output by this node. */
237 int number_of_output_buffers = 0;
238 for (const bNodeSocket *output : node->output_sockets()) {
239 const DOutputSocket doutput{node.context(), output};
240
241 /* The output is not linked, it outputs no buffer. */
242 if (!output->is_logically_linked()) {
243 continue;
244 }
245
246 /* If any of the links is not between two pixel nodes, it means that the node outputs
247 * a buffer through this output and so we increment the number of output buffers. */
249 number_of_output_buffers++;
250 }
251 }
252
253 /* Compute the heuristic estimation of the number of needed intermediate buffers to compute
254 * this node and all of its dependencies. This is computing the aforementioned equation
255 * "max(n + m, d)". */
256 const int total_buffers = std::max(number_of_input_buffers + number_of_output_buffers,
257 buffers_needed_by_dependencies);
258 needed_buffers.add(node, total_buffers);
259 }
260
261 return needed_buffers;
262}
263
264/* There are multiple different possible orders of evaluating a node graph, each of which needs
265 * to allocate a number of intermediate buffers to store its intermediate results. It follows
266 * that we need to find the evaluation order which uses the least amount of intermediate buffers.
267 * For instance, consider a node that takes two input buffers A and B. Each of those buffers is
268 * computed through a number of nodes constituting a sub-graph whose root is the node that
269 * outputs that buffer. Suppose the number of intermediate buffers needed to compute A and B are
270 * N(A) and N(B) respectively and N(A) > N(B). Then evaluating the sub-graph computing A would be
271 * a better option than that of B, because had B was computed first, its outputs will need to be
272 * stored in extra buffers in addition to the buffers needed by A. The number of buffers needed by
273 * each node is estimated as described in the compute_number_of_needed_buffers function.
274 *
275 * This is a heuristic generalization of the Sethi-Ullman algorithm, a generalization that
276 * doesn't always guarantee an optimal evaluation order, as the optimal evaluation order is very
277 * difficult to compute, however, this method works well in most cases. Moreover it assumes that
278 * all buffers will have roughly the same size, which may not always be the case. */
279Schedule compute_schedule(const Context &context, const DerivedNodeTree &tree)
280{
281 Schedule schedule;
282
283 /* A stack of nodes used to traverse the node tree starting from the output nodes. */
284 Stack<DNode> node_stack;
285
286 /* Add the output nodes whose result should be computed to the stack. */
287 add_output_nodes(context, tree, node_stack);
288
289 /* No output nodes, the node tree has no effect, return an empty schedule. */
290 if (node_stack.is_empty()) {
291 return schedule;
292 }
293
294 /* Compute the number of buffers needed by each node connected to the outputs. */
295 const NeededBuffers needed_buffers = compute_number_of_needed_buffers(node_stack);
296
297 /* Traverse the node tree in a post order depth first manner, scheduling the nodes in an order
298 * informed by the number of buffers needed by each node. Post order traversal guarantee that all
299 * the node dependencies of each node are scheduled before it. This is done by pushing all the
300 * unscheduled node dependencies to the node stack first and only popping and scheduling the node
301 * when all its node dependencies were scheduled. */
302 while (!node_stack.is_empty()) {
303 /* Do not pop the node immediately, as it may turn out that we can't schedule it just yet
304 * because its dependencies weren't scheduled, it will be popped later when needed. */
305 DNode &node = node_stack.peek();
306
307 /* Compute the nodes directly connected to the node inputs sorted by their needed buffers such
308 * that the node with the lowest number of needed buffers comes first. Note that we actually
309 * want the node with the highest number of needed buffers to be schedule first, but since
310 * those are pushed to the traversal stack, we need to push them in reverse order. */
311 Vector<DNode> sorted_dependency_nodes;
312 for (const bNodeSocket *input : node->input_sockets()) {
313 const DInputSocket dinput{node.context(), input};
314
315 /* Get the output linked to the input. If it is null, that means the input is unlinked and
316 * has no dependency node, so skip it. */
317 const DOutputSocket doutput = get_output_linked_to_input(dinput);
318 if (!doutput) {
319 continue;
320 }
321
322 /* The dependency node was added before, so skip it. The number of dependency nodes is very
323 * small, typically less than 3, so a linear search is okay. */
324 if (sorted_dependency_nodes.contains(doutput.node())) {
325 continue;
326 }
327
328 /* The dependency node was already schedule, so skip it. */
329 if (schedule.contains(doutput.node())) {
330 continue;
331 }
332
333 /* Sort in ascending order on insertion, the number of dependency nodes is very small,
334 * typically less than 3, so insertion sort is okay. */
335 int insertion_position = 0;
336 for (int i = 0; i < sorted_dependency_nodes.size(); i++) {
337 if (needed_buffers.lookup(doutput.node()) >
338 needed_buffers.lookup(sorted_dependency_nodes[i]))
339 {
340 insertion_position++;
341 }
342 else {
343 break;
344 }
345 }
346 sorted_dependency_nodes.insert(insertion_position, doutput.node());
347 }
348
349 /* Push the sorted dependency nodes to the node stack in order. */
350 for (const DNode &dependency_node : sorted_dependency_nodes) {
351 node_stack.push(dependency_node);
352 }
353
354 /* If there are no sorted dependency nodes, that means they were all already scheduled or that
355 * none exists in the first place, so we can pop and schedule the node now. */
356 if (sorted_dependency_nodes.is_empty()) {
357 /* The node might have already been scheduled, so we don't use add_new here and simply don't
358 * add it if it was already scheduled. */
359 schedule.add(node_stack.pop());
360 }
361 }
362
363 return schedule;
364}
365
366} // namespace blender::realtime_compositor
#define CMP_NODE_COMPOSITE
Definition BKE_node.hh:1033
@ NODE_DO_OUTPUT
@ NODE_MUTED
bool add(const Key &key, const Value &value)
Definition BLI_map.hh:271
const Value & lookup(const Key &key) const
Definition BLI_map.hh:506
bool contains(const Key &key) const
Definition BLI_map.hh:329
bool contains(const Key &key) const
Definition BLI_set.hh:291
bool is_empty() const
Definition BLI_set.hh:572
void add_new(const Key &key)
Definition BLI_set.hh:233
bool is_empty() const
Definition BLI_stack.hh:308
void push(const T &value)
Definition BLI_stack.hh:213
bool add(const Key &key)
bool contains(const Key &key) const
int64_t size() const
bool contains(const T &value) const
void insert(const int64_t insert_index, const T &value)
bool is_empty() const
const DTreeContext * context() const
const bNodeTree & btree() const
KDTree_3d * tree
bool is_pixel_node(DNode node)
Definition utilities.cc:104
bool is_output_linked_to_node_conditioned(DOutputSocket output, FunctionRef< bool(DNode)> condition)
Definition utilities.cc:78
DOutputSocket get_output_linked_to_input(DInputSocket input)
Definition utilities.cc:47
static void add_output_nodes(const Context &context, const DerivedNodeTree &tree, Stack< DNode > &node_stack)
Definition scheduler.cc:58
static bool add_viewer_nodes_in_context(const DTreeContext *context, Stack< DNode > &node_stack)
Definition scheduler.cc:26
Schedule compute_schedule(const Context &context, const DerivedNodeTree &tree)
Definition scheduler.cc:279
static NeededBuffers compute_number_of_needed_buffers(Stack< DNode > &output_nodes)
Definition scheduler.cc:156