Blender V4.3
device/queue.h
Go to the documentation of this file.
1/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
2 *
3 * SPDX-License-Identifier: Apache-2.0 */
4
5#pragma once
6
7#include "device/kernel.h"
8
10#include "util/debug.h"
11#include "util/log.h"
12#include "util/map.h"
13#include "util/string.h"
14#include "util/unique_ptr.h"
15
// Forward declarations: the full definitions of these device types are not
// needed by this header.
// NOTE(review): original line 16 was lost when this listing was extracted --
// given the "#define CCL_NAMESPACE_END" entry in the index below, it was
// presumably "CCL_NAMESPACE_BEGIN". Restore before compiling.
17
18class Device;
19class device_memory;
20
21struct KernelWorkTile;
22
23/* Container for device kernel arguments with type correctness ensured by API. */
// NOTE(review): the "struct DeviceKernelArguments {" declaration (original
// line 25) and original lines 26-32 -- presumably an "enum Type" providing at
// least POINTER, INT32 and FLOAT32, which the add() overloads below use --
// were lost when this listing was extracted. Original line 40 (apparently a
// default constructor) is missing as well. Confirm against the upstream
// header before compiling.
25
33
// Parallel fixed-capacity arrays: argument slot i is described by types[i],
// values[i] and sizes[i]; `count` is the number of filled slots. add() stores
// only the pointer to each value (see below), not a copy, so the pointed-to
// values presumably must outlive the enqueue that consumes this container.
34 static const int MAX_ARGS = 18;
35 Type types[MAX_ARGS];
36 void *values[MAX_ARGS];
37 size_t sizes[MAX_ARGS];
38 size_t count = 0;
39
41
42 template<class T> DeviceKernelArguments(const T *arg)
43 {
44 add(arg);
45 }
46
47 template<class T, class... Args> DeviceKernelArguments(const T *first, Args... args)
48 {
49 add(first);
50 add(args...);
51 }
52
53 void add(const KernelFilmConvert *value)
54 {
56 }
57 void add(const device_ptr *value)
58 {
59 add(POINTER, value, sizeof(device_ptr));
60 }
61 void add(const int32_t *value)
62 {
63 add(INT32, value, sizeof(int32_t));
64 }
65 void add(const float *value)
66 {
67 add(FLOAT32, value, sizeof(float));
68 }
69 void add(const Type type, const void *value, size_t size)
70 {
71 assert(count < MAX_ARGS);
72
73 types[count] = type;
74 values[count] = (void *)value;
75 sizes[count] = size;
76 count++;
77 }
/* Append several arguments, in order, by recursing one at a time. */
template<typename T, typename... Args> void add(const T *head, Args... tail)
{
  add(head);
  add(tail...);
}
};
84
85/* Abstraction of a command queue for a device.
86 * Provides API to schedule kernel execution in a specific queue with minimal possible overhead
87 * from driver side.
88 *
89 * This class encapsulates all properties needed for command execution. */
// NOTE(review): original line 90 ("class DeviceQueue { ...") was lost when
// this listing was extracted -- only the "public:" label below survives.
// Restore before compiling.
91 public:
92 virtual ~DeviceQueue();
93
94 /* Number of concurrent states to process for integrator,
95 * based on number of cores and/or available memory. */
96 virtual int num_concurrent_states(const size_t state_size) const = 0;
97
98 /* Number of states which keeps the device occupied with work without losing performance.
99 * The renderer will add more work (when available) when number of active paths falls below this
100 * value. */
101 virtual int num_concurrent_busy_states(const size_t state_size) const = 0;
102
103 /* Number of elements in a partition of sorted shaders, that improves memory locality of
104 * integrator state fetch at the cost of decreased coherence for shader kernel execution. */
105 virtual int num_sort_partition_elements() const
106 {
107 return 65536;
108 }
109
110 /* Does device support local atomic sorting kernels (INTEGRATOR_SORT_BUCKET_PASS and
111 * INTEGRATOR_SORT_WRITE_PASS)? */
112 virtual bool supports_local_atomic_sort() const
113 {
114 return false;
115 }
116
117 /* Initialize execution of kernels on this queue.
118 *
119 * Will, for example, load all data required by the kernels from Device to global or path state.
120 *
121 * Use this method after device synchronization has finished before enqueueing any kernels. */
122 virtual void init_execution() = 0;
123
124 /* Enqueue kernel execution.
125 *
126 * Execute the kernel work_size times on the device.
127 * Supported argument types:
128 * - int: pass pointer to the int
129 * - device memory: pass pointer to device_memory.device_pointer
130 * Return false if there was an error executing this or a previous kernel. */
131 virtual bool enqueue(DeviceKernel kernel,
132 const int work_size,
133 DeviceKernelArguments const &args) = 0;
134
135 /* Wait until all enqueued kernels have finished execution.
136 * Return false if there was an error executing any of the enqueued kernels. */
137 virtual bool synchronize() = 0;
138
139 /* Copy memory to/from device as part of the command queue, to ensure
140 * operations are done in order without having to synchronize. */
141 virtual void zero_to_device(device_memory &mem) = 0;
142 virtual void copy_to_device(device_memory &mem) = 0;
143 virtual void copy_from_device(device_memory &mem) = 0;
144
145 /* Graphics resources interoperability.
146 *
147 * The interoperability comes here by the meaning that the device is capable of computing result
148 * directly into an OpenGL (or other graphics library) buffer. */
149
150 /* Create graphics interoperability context which will be taking care of mapping graphics
151 * resource as a buffer writable by kernels of this device. */
152 virtual unique_ptr<DeviceGraphicsInterop> graphics_interop_create()
153 {
154 LOG(FATAL) << "Request of GPU interop of a device which does not support it.";
155 return nullptr;
156 }
157
158 /* Device this queue has been created for. */
// NOTE(review): the member declaration on original line 159 was lost when
// this listing was extracted -- per the "Device * device" entry in the index
// below it was presumably "Device *device;". Restore before compiling.
160
161 virtual void *native_queue()
162 {
163 return nullptr;
164 }
165
166 protected:
167 /* Hide construction so that allocation via `Device` API is enforced. */
168 explicit DeviceQueue(Device *device);
169
170 /* Implementations call these from the corresponding methods to generate debugging logs. */
// NOTE(review): original line 171 was lost in extraction; per the index below
// it declared "void debug_init_execution();". Restore before compiling.
172 void debug_enqueue_begin(DeviceKernel kernel, const int work_size);
173 void debug_enqueue_end();
174 void debug_synchronize();
175 string debug_active_kernels();
176
177 /* Combination of kernels enqueued together since last synchronize. */
// NOTE(review): member on original line 178 was lost in extraction; per the
// index below: "DeviceKernelMask last_kernels_enqueued_;".
179 /* Time of synchronize call. */
// NOTE(review): member on original line 180 was lost in extraction; per the
// index below: "double last_sync_time_;".
181 /* Accumulated execution time for combinations of kernels launched together. */
182 map<DeviceKernelMask, double> stats_kernel_time_;
183 /* If true, performance statistics in the debugging logs will focus on kernels,
184 * and an explicit queue synchronization will be added after each kernel execution. */
// NOTE(review): member on original line 185 was lost in extraction; per the
// index below: "bool is_per_kernel_performance_;".
186};
187
static DBVT_INLINE btScalar size(const btDbvtVolume &a)
Definition btDbvt.cpp:52
double last_sync_time_
virtual int num_sort_partition_elements() const
DeviceKernelMask last_kernels_enqueued_
void debug_enqueue_end()
Definition queue.cpp:75
virtual int num_concurrent_busy_states(const size_t state_size) const =0
virtual void copy_from_device(device_memory &mem)=0
void debug_synchronize()
Definition queue.cpp:82
bool is_per_kernel_performance_
virtual bool supports_local_atomic_sort() const
virtual int num_concurrent_states(const size_t state_size) const =0
Device * device
virtual void init_execution()=0
virtual void copy_to_device(device_memory &mem)=0
map< DeviceKernelMask, double > stats_kernel_time_
virtual unique_ptr< DeviceGraphicsInterop > graphics_interop_create()
string debug_active_kernels()
Definition queue.cpp:101
virtual ~DeviceQueue()
Definition queue.cpp:25
virtual bool synchronize()=0
virtual bool enqueue(DeviceKernel kernel, const int work_size, DeviceKernelArguments const &args)=0
virtual void * native_queue()
void debug_init_execution()
Definition queue.cpp:56
DeviceQueue(Device *device)
Definition queue.cpp:15
void debug_enqueue_begin(DeviceKernel kernel, const int work_size)
Definition queue.cpp:65
virtual void zero_to_device(device_memory &mem)=0
#define CCL_NAMESPACE_END
uint64_t DeviceKernelMask
ccl_gpu_kernel_postfix ccl_global const int ccl_global float const int work_size
DeviceKernel
#define LOG(severity)
Definition log.h:33
#define T
signed int int32_t
Definition stdint.h:77
static const int MAX_ARGS
void add(const device_ptr *value)
DeviceKernelArguments(const T *arg)
void add(const T *first, Args... args)
void add(const Type type, const void *value, size_t size)
void add(const float *value)
void add(const int32_t *value)
DeviceKernelArguments(const T *first, Args... args)
size_t sizes[MAX_ARGS]
void add(const KernelFilmConvert *value)
uint64_t device_ptr
Definition util/types.h:45