Blender V5.0
device/queue.h
/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
 *
 * SPDX-License-Identifier: Apache-2.0 */

#pragma once

#include "device/kernel.h"

#include "device/graphics_interop.h"
#include "util/log.h"
#include "util/map.h"
#include "util/string.h"
#include "util/unique_ptr.h"

CCL_NAMESPACE_BEGIN

class Device;
class device_memory;

struct KernelWorkTile;
/* Container for device kernel arguments with type correctness ensured by API. */
struct DeviceKernelArguments {

  enum Type {
    POINTER,
    INT32,
    FLOAT32,
    BOOLEAN,
    KERNEL_FILM_CONVERT,
  };

  static const int MAX_ARGS = 18;
  Type types[MAX_ARGS];
  void *values[MAX_ARGS];
  size_t sizes[MAX_ARGS];
  size_t count = 0;

  DeviceKernelArguments() = default;

  template<class T> DeviceKernelArguments(const T *arg)
  {
    add(arg);
  }

  template<class T, class... Args> DeviceKernelArguments(const T *first, Args... args)
  {
    add(first);
    add(args...);
  }
  void add(const KernelFilmConvert *value)
  {
    add(KERNEL_FILM_CONVERT, value, sizeof(KernelFilmConvert));
  }
  void add(const device_ptr *value)
  {
    add(POINTER, value, sizeof(device_ptr));
  }
  void add(const int32_t *value)
  {
    add(INT32, value, sizeof(int32_t));
  }
  void add(const float *value)
  {
    add(FLOAT32, value, sizeof(float));
  }
  void add(const Type type, const void *value, const size_t size)
  {
    assert(count < MAX_ARGS);

    types[count] = type;
    values[count] = (void *)value;
    sizes[count] = size;
    count++;
  }
  template<typename T, typename... Args> void add(const T *first, Args... args)
  {
    add(first);
    add(args...);
  }
};
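
/* Usage sketch of DeviceKernelArguments (illustrative only; the variable names and kernel
 * below are not part of this header). Note that add() stores the pointers, not copies, so
 * the referenced values must stay alive until the kernel is enqueued:
 *
 *   device_ptr buffer = render_buffers.device_pointer;  // hypothetical device_memory
 *   int32_t num_samples = 16;
 *   float exposure = 1.0f;
 *
 *   DeviceKernelArguments args(&buffer, &num_samples, &exposure);
 *   // `args` now holds {POINTER, INT32, FLOAT32} entries, ready for DeviceQueue::enqueue().
 */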

/* Abstraction of a command queue for a device.
 * Provides an API to schedule kernel execution in a specific queue with minimal possible
 * overhead from the driver side.
 *
 * This class encapsulates all properties needed for command execution. */
class DeviceQueue {
 public:
  virtual ~DeviceQueue();

  /* Number of concurrent states to process for the integrator,
   * based on the number of cores and/or available memory. */
  virtual int num_concurrent_states(const size_t state_size) const = 0;

  /* Number of states which keeps the device occupied with work without losing performance.
   * The renderer will add more work (when available) when the number of active paths falls
   * below this value. */
  virtual int num_concurrent_busy_states(const size_t state_size) const = 0;

  /* Number of partitions of sorted shaders, which improves memory locality of
   * integrator state fetch at the cost of decreased coherence for shader kernel execution. */
  virtual int num_sort_partitions(int max_num_paths, uint max_scene_shaders) const
  {
    /* Sort partitioning becomes less effective when more shaders are in the wavefront. In lieu
     * of a more sophisticated heuristic we simply disable sort partitioning if the shader count
     * is high. */
    if (max_scene_shaders < 300) {
      return max(max_num_paths / 65536, 1);
    }
    else {
      return 1;
    }
  }
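
  /* Worked example of the default heuristic above (illustrative numbers): with
   * max_num_paths = 1048576 and max_scene_shaders = 200, this returns
   * max(1048576 / 65536, 1) = 16 sort partitions; with 300 or more scene shaders
   * it always returns 1, i.e. sort partitioning is disabled. */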

  /* Does the device support local atomic sorting kernels (INTEGRATOR_SORT_BUCKET_PASS and
   * INTEGRATOR_SORT_WRITE_PASS)? */
  virtual bool supports_local_atomic_sort() const
  {
    return false;
  }

  /* Initialize execution of kernels on this queue.
   *
   * Will, for example, load all data required by the kernels from Device to global or path
   * state.
   *
   * Use this method after device synchronization has finished, before enqueueing any kernels. */
  virtual void init_execution() = 0;

  /* Enqueue kernel execution.
   *
   * Execute the kernel work_size times on the device.
   * Supported argument types:
   * - int: pass pointer to the int
   * - device memory: pass pointer to device_memory.device_pointer
   * Return false if there was an error executing this or a previous kernel. */
  virtual bool enqueue(DeviceKernel kernel,
                       const int work_size,
                       const DeviceKernelArguments &args) = 0;
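
  /* Illustrative call pattern (a sketch; the kernel name and variables below are
   * hypothetical, not taken from this API):
   *
   *   DeviceKernelArguments args(&path_index, &num_paths);
   *   if (!queue->enqueue(DEVICE_KERNEL_INTEGRATOR_RESET, num_paths, args)) {
   *     return false;  // a device error occurred in this or an earlier kernel
   *   }
   */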

  /* Wait until all enqueued kernels have finished execution.
   * Return false if there was an error executing any of the enqueued kernels. */
  virtual bool synchronize() = 0;

  /* Copy memory to/from the device as part of the command queue, to ensure
   * operations are done in order without having to synchronize. */
  virtual void zero_to_device(device_memory &mem) = 0;
  virtual void copy_to_device(device_memory &mem) = 0;
  virtual void copy_from_device(device_memory &mem) = 0;
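
  /* Because these transfers are part of the queue, they are ordered with respect to the
   * kernels enqueued around them. A typical pattern (sketch, hypothetical buffer names)
   * therefore needs only a single synchronization point at the end:
   *
   *   queue->copy_to_device(input_buffer);
   *   queue->enqueue(kernel, work_size, args);
   *   queue->copy_from_device(output_buffer);
   *   if (!queue->synchronize()) {
   *     // an error occurred in one of the enqueued operations
   *   }
   */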

  /* Graphics resources interoperability.
   *
   * Interoperability here means that the device is capable of computing its result directly
   * into an OpenGL (or other graphics library) buffer. */

  /* Create a graphics interoperability context which will take care of mapping a graphics
   * resource as a buffer writable by kernels of this device. */
  virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create()
  {
    LOG_FATAL << "Request of GPU interop of a device which does not support it.";
    return nullptr;
  }

  /* Device this queue has been created for. */
  Device *device = nullptr;

  virtual void *native_queue()
  {
    return nullptr;
  }

 protected:
  /* Hide construction so that allocation via `Device` API is enforced. */
  explicit DeviceQueue(Device *device);

  /* Implementations call these from the corresponding methods to generate debugging logs. */
  void debug_init_execution();
  void debug_enqueue_begin(DeviceKernel kernel, const int work_size);
  void debug_enqueue_end();
  void debug_synchronize();
  string debug_active_kernels();
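
  /* Implementations typically bracket their own enqueue()/synchronize() with these hooks.
   * A sketch for a hypothetical backend queue class (FooDeviceQueue is not part of Cycles):
   *
   *   bool FooDeviceQueue::enqueue(DeviceKernel kernel,
   *                                const int work_size,
   *                                const DeviceKernelArguments &args)
   *   {
   *     debug_enqueue_begin(kernel, work_size);
   *     // ... launch the kernel through the native GPU API ...
   *     debug_enqueue_end();
   *     return true;
   *   }
   */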

  /* Combination of kernels enqueued together since the last synchronize. */
  DeviceKernelMask last_kernels_enqueued_;
  /* Time of the last synchronize call. */
  double last_sync_time_ = 0.0;
  /* Accumulated execution time for combinations of kernels launched together. */
  map<DeviceKernelMask, double> stats_kernel_time_;
  /* If true, performance statistics in the debug logs will focus on kernels, and an explicit
   * queue synchronization will be added after each kernel execution. */
  bool is_per_kernel_performance_;
};
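
/* Lifecycle sketch (illustrative): queues are obtained from the owning Device rather than
 * constructed directly, since the constructor is protected. The creation call below is an
 * assumption about the Device API and may be named differently per backend:
 *
 *   unique_ptr<DeviceQueue> queue = device->gpu_queue_create();  // hypothetical factory
 *   queue->init_execution();
 *   queue->enqueue(kernel, work_size, args);
 *   queue->synchronize();
 */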

CCL_NAMESPACE_END