17 last_kernels_enqueued_(0),
19 is_per_kernel_performance_(
false)
31 stats_sorted.push_back(stat);
34 sort(stats_sorted.begin(),
36 [](
const pair<DeviceKernelMask, double> &a,
const pair<DeviceKernelMask, double> &
b) {
37 return a.second > b.second;
42 for (
const auto &[mask, time] : stats_sorted) {
45 << std::setprecision(5) << std::right << time
50 VLOG_DEVICE_STATS <<
"GPU queue total time: " << std::fixed << std::setprecision(5)
85 const double new_time =
time_dt();
87 VLOG_DEVICE_STATS <<
"GPU queue synchronize, elapsed " << std::setw(10) << elapsed_time <<
"s";
static DBVT_INLINE btDbvtNode * sort(btDbvtNode *n, btDbvtNode *&r)
DeviceKernelMask last_kernels_enqueued_
bool is_per_kernel_performance_
map< DeviceKernelMask, double > stats_kernel_time_
string debug_active_kernels()
virtual bool synchronize()=0
void debug_init_execution()
DeviceQueue(Device *device)
void debug_enqueue_begin(DeviceKernel kernel, const int work_size)
local_group_size(16, 16) .push_constant(Type b
#define CCL_NAMESPACE_END
string device_kernel_mask_as_string(DeviceKernelMask mask)
const char * device_kernel_as_string(DeviceKernel kernel)
ccl_gpu_kernel_postfix ccl_global const int ccl_global float const int work_size
#define VLOG_DEVICE_STATS_IS_ON
#define VLOG_DEVICE_STATS
unsigned __int64 uint64_t
CCL_NAMESPACE_BEGIN double time_dt()