Blender V5.0
kernel/device/cuda/config.h
Go to the documentation of this file.
1/* SPDX-FileCopyrightText: 2011-2022 Blender Foundation
2 *
3 * SPDX-License-Identifier: Apache-2.0 */
4
5/* Device data taken from CUDA occupancy calculator.
6 *
7 * Terminology
8 * - CUDA GPUs have multiple streaming multiprocessors
9 * - Each multiprocessor executes multiple thread blocks
10 * - Each thread block contains a number of threads, also known as the block size
11 * - Multiprocessors have a fixed number of registers, and the amount of registers
12 * used by each threads limits the number of threads per block.
13 */
14
15/* 5.x, 6.x */
16#if __CUDA_ARCH__ <= 699
17# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
18# define GPU_MULTIPROCESSOR_MAX_BLOCKS 32
19# define GPU_BLOCK_MAX_THREADS 1024
20# define GPU_THREAD_MAX_REGISTERS 255
21
22/* tunable parameters */
23# define GPU_KERNEL_BLOCK_NUM_THREADS 256
24/* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of
25 * registers */
26# if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600
27# define GPU_KERNEL_MAX_REGISTERS 64
28# else
29# define GPU_KERNEL_MAX_REGISTERS 48
30# endif
31
32/* 7.x, 8.x, 12.x */
33#elif __CUDA_ARCH__ <= 1299
34# define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
35# define GPU_MULTIPROCESSOR_MAX_BLOCKS 32
36# define GPU_BLOCK_MAX_THREADS 1024
37# define GPU_THREAD_MAX_REGISTERS 255
38
39/* tunable parameters */
40# define GPU_KERNEL_BLOCK_NUM_THREADS 384
41# define GPU_KERNEL_MAX_REGISTERS 168
42
43/* unknown architecture */
44#else
45# error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
46#endif
47
48/* Compute number of threads per block and minimum blocks per multiprocessor
49 * given the maximum number of registers per thread. */
50#define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
51 extern "C" __global__ void __launch_bounds__(block_num_threads, \
52 GPU_MULTIPRESSOR_MAX_REGISTERS / \
53 (block_num_threads * thread_num_registers))
54
55#define ccl_gpu_kernel_threads(block_num_threads) \
56 extern "C" __global__ void __launch_bounds__(block_num_threads)
57
58#define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__)
59#define ccl_gpu_kernel_postfix
60
61#define ccl_gpu_kernel_call(x) x
62#define ccl_gpu_kernel_within_bounds(i, n) ((i) < (n))
63
64/* Define a function object where "func" is the lambda body, and additional parameters are used to
65 * specify captured state */
66#define ccl_gpu_kernel_lambda(func, ...) \
67 struct KernelLambda { \
68 __VA_ARGS__; \
69 __device__ int operator()(const int state) \
70 { \
71 return (func); \
72 } \
73 } ccl_gpu_kernel_lambda_pass
74
75/* sanity checks */
76
77#if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
78# error "Maximum number of threads per block exceeded"
79#endif
80
81#if GPU_MULTIPRESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \
82 GPU_MULTIPROCESSOR_MAX_BLOCKS
83# error "Maximum number of blocks per multiprocessor exceeded"
84#endif
85
86#if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS
87# error "Maximum number of registers per thread exceeded"
88#endif