Blender V4.3
kernel/device/cuda/config.h
Go to the documentation of this file.
1/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
2 *
3 * SPDX-License-Identifier: Apache-2.0 */
4
5/* Device data taken from CUDA occupancy calculator.
6 *
7 * Terminology
8 * - CUDA GPUs have multiple streaming multiprocessors
9 * - Each multiprocessor executes multiple thread blocks
10 * - Each thread block contains a number of threads, also known as the block size
11 * - Multiprocessors have a fixed number of registers, and the amount of registers
12 * used by each threads limits the number of threads per block.
13 */
14
15/* 3.0 and 3.5 */
16#if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
17# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
18# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
19# define GPU_BLOCK_MAX_THREADS 1024
20# define GPU_THREAD_MAX_REGISTERS 63
21
22/* tunable parameters */
23# define GPU_KERNEL_BLOCK_NUM_THREADS 256
24# define GPU_KERNEL_MAX_REGISTERS 63
25
26/* 3.2 */
27#elif __CUDA_ARCH__ == 320
28# define GPU_MULTIPRESSOR_MAX_REGISTERS 32768
29# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
30# define GPU_BLOCK_MAX_THREADS 1024
31# define GPU_THREAD_MAX_REGISTERS 63
32
33/* tunable parameters */
34# define GPU_KERNEL_BLOCK_NUM_THREADS 256
35# define GPU_KERNEL_MAX_REGISTERS 63
36
37/* 3.7 */
38#elif __CUDA_ARCH__ == 370
39# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
40# define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
41# define GPU_BLOCK_MAX_THREADS 1024
42# define GPU_THREAD_MAX_REGISTERS 255
43
44/* tunable parameters */
45# define GPU_KERNEL_BLOCK_NUM_THREADS 256
46# define GPU_KERNEL_MAX_REGISTERS 63
47
48/* 5.x, 6.x */
49#elif __CUDA_ARCH__ <= 699
50# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
51# define GPU_MULTIPROCESSOR_MAX_BLOCKS 32
52# define GPU_BLOCK_MAX_THREADS 1024
53# define GPU_THREAD_MAX_REGISTERS 255
54
55/* tunable parameters */
56# define GPU_KERNEL_BLOCK_NUM_THREADS 256
57/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of
58 * registers */
59# if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600
60# define GPU_KERNEL_MAX_REGISTERS 64
61# else
62# define GPU_KERNEL_MAX_REGISTERS 48
63# endif
64
65/* 7.x, 8.x */
66#elif __CUDA_ARCH__ <= 899
67# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
68# define GPU_MULTIPROCESSOR_MAX_BLOCKS 32
69# define GPU_BLOCK_MAX_THREADS 1024
70# define GPU_THREAD_MAX_REGISTERS 255
71
72/* tunable parameters */
73# define GPU_KERNEL_BLOCK_NUM_THREADS 512
74# define GPU_KERNEL_MAX_REGISTERS 96
75
76/* unknown architecture */
77#else
78# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
79#endif
80
81/* Compute number of threads per block and minimum blocks per multiprocessor
82 * given the maximum number of registers per thread. */
83#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
84 extern "C" __global__ void __launch_bounds__(block_num_threads, \
85 GPU_MULTIPRESSOR_MAX_REGISTERS / \
86 (block_num_threads * thread_num_registers))
87
88#define ccl_gpu_kernel_threads(block_num_threads) \
89 extern "C" __global__ void __launch_bounds__(block_num_threads)
90
91#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__)
92#define ccl_gpu_kernel_postfix
93
94#define ccl_gpu_kernel_call(x) x
95#define ccl_gpu_kernel_within_bounds(i, n) ((i) < (n))
96
97/* Define a function object where "func" is the lambda body, and additional parameters are used to
98 * specify captured state */
99#define ccl_gpu_kernel_lambda(func, ...) \
100 struct KernelLambda { \
101 __VA_ARGS__; \
102 __device__ int operator()(const int state) \
103 { \
104 return (func); \
105 } \
106 } ccl_gpu_kernel_lambda_pass
107
108/* sanity checks */
109
110#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
111# error "Maximum number of threads per block exceeded"
112#endif
113
114#if GPU_MULTIPRESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \
115 GPU_MULTIPROCESSOR_MAX_BLOCKS
116# error "Maximum number of blocks per multiprocessor exceeded"
117#endif
118
119#if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS
120# error "Maximum number of registers per thread exceeded"
121#endif