PAPI  5.7.0.0
linux-cuda.c
Go to the documentation of this file.
1 
18 //-----------------------------------------------------------------------------
19 // A basic assumption here (and in other components) is that we put as much of
20 // the computational load of this component into the initialization stage and
21 // the "adding" stage for events (update_control), because users are likely not
22 // measuring performance at those times, but may well be reading these events
23 // when performance matters. So we want the read operation lightweight, but we
24 // can remember tables and such at startup and when servicing a PAPI_add().
25 //-----------------------------------------------------------------------------
26 
27 #include <dlfcn.h>
28 #include <cupti.h>
29 #include <cuda_runtime_api.h>
30 
31 #include "papi.h"
32 #include "papi_memory.h"
33 #include "papi_internal.h"
34 #include "papi_vector.h"
35 
36 // We use a define so we can use it as a static array dimension. Increase as needed.
37 #define PAPICUDA_MAX_COUNTERS 512
38 
39 // #define PAPICUDA_KERNEL_REPLAY_MODE
40 
41 
42 // Contains device list, pointer to device description, and the list of all available events.
43 typedef struct papicuda_context {
45  struct papicuda_device_desc *deviceArray;
46  uint32_t availEventSize;
47  CUpti_ActivityKind *availEventKind;
49  uint32_t *availEventIDArray;
51  struct papicuda_name_desc *availEventDesc;
53 
54 /* Store the name and description for an event */
55 typedef struct papicuda_name_desc {
57  char description[PAPI_2MAX_STR_LEN];
58  uint16_t numMetricEvents; // 0=event, if a metric, size of metricEvents array below.
59  CUpti_EventID *metricEvents; // NULL for cuda events, an array of member events if a metric.
60  CUpti_MetricValueKind MV_Kind; // eg. % or counter or rate, etc. Needed to compute metric from individual events.
62 
63 /* For a device, store device description */
64 typedef struct papicuda_device_desc {
65  CUdevice cuDev;
66  int deviceNum;
67  char deviceName[PAPI_MIN_STR_LEN];
68  uint32_t maxDomains; /* number of domains per device */
69  CUpti_EventDomainID *domainIDArray; /* Array[maxDomains] of domain IDs */
70  uint32_t *domainIDNumEvents; /* Array[maxDomains] of num of events in that domain */
72 
73 // For each active cuda context (one measuring something) we also track the
74 // cuda device number it is on. We track in separate arrays for each reading
75 // method. cuda metrics and nvlink metrics require multiple events to be read,
76 // these are then arithmetically combined to produce the metric value. The
77 // allEvents array stores all the actual events; i.e. metrics are deconstructed
78 // to their individual events and stored there, as well as regular events, so
79 // we can perform an analysis of how to read with cuptiEventGroupSetsCreate().
80 
81 typedef struct papicuda_active_cucontext_s {
82  CUcontext cuCtx;
83  int deviceNum;
84 
85  uint32_t ctxActiveCount; // Count of entries in ctxActiveEvents.
86  uint32_t ctxActiveEvents [PAPICUDA_MAX_COUNTERS]; // index into gctrl->activeEventXXXX arrays, so we can store values.
87 
88  uint32_t allEventsCount; // entries in allEvents array.
89  CUpti_EventID allEvents [PAPICUDA_MAX_COUNTERS]; // allEvents, including sub-events of metrics. (no metric Ids in here).
90  uint64_t allEventValues [PAPICUDA_MAX_COUNTERS]; // aggregated event values.
91 
92  CUpti_EventGroupSets *eventGroupSets; // Built during add, to save time not doing it at read.
94 
95 // Control structure tracks array of active contexts and active events
96 // in the order the user requested them; along with associated
97 // values and types (to save lookup time).
98 typedef struct papicuda_control {
102  int activeEventIndex [PAPICUDA_MAX_COUNTERS]; // index into gctxt->availEventXXXXX arrays.
103  long long activeEventValues [PAPICUDA_MAX_COUNTERS]; // values we will return.
104  CUpti_MetricValueKind activeEventKind [PAPICUDA_MAX_COUNTERS]; // For metrics: double, uint64, % or throughput. Needed to compute metric from individual events.
105  uint64_t cuptiStartTimestampNs; // needed to compute duration for some metrics.
106  uint64_t cuptiReadTimestampNs; // ..
108 
109 // file handles used to access cuda libraries with dlopen
110 static void *dl1 = NULL;
111 static void *dl2 = NULL;
112 static void *dl3 = NULL;
113 
114 /* The PAPI side (external) variable as a global */
116 
117 /* Global variable for hardware description, event and metric lists */
119 
120 /* This global variable points to the head of the control state list */
122 
123 /* Macros for error checking... each arg is only referenced/evaluated once */
/* CHECK_PRINT_EVAL: evaluate `checkcond` exactly once into an int. When the
 * result is non-zero (i.e. the error condition holds), log the stringified
 * condition together with `str` through SUBDBG and then execute the
 * caller-supplied statement `evalthis` (the callers in this file pass a
 * `return`). The local is kept under the name `_cond` because `evalthis` is
 * expanded inside its scope. */
#define CHECK_PRINT_EVAL( checkcond, str, evalthis )                          \
    do {                                                                      \
        int _cond = (checkcond);                                              \
        if (_cond != 0) {                                                     \
            SUBDBG("error: condition %s failed: %s.\n", #checkcond, str);     \
            evalthis;                                                         \
        }                                                                     \
    } while (0)
132 
/* CUDA_CALL: issue a CUDA runtime API call once and capture its cudaError_t.
 * For any result other than cudaSuccess, log the stringified call and the
 * numeric error code through SUBDBG, then execute `handleerror`. `_status`
 * remains in scope while `handleerror` runs. */
#define CUDA_CALL( call, handleerror )                                              \
    do {                                                                            \
        cudaError_t _status = (call);                                               \
        if (cudaSuccess != _status) {                                               \
            SUBDBG("error: function %s failed with error %d.\n", #call, _status);   \
            handleerror;                                                            \
        }                                                                           \
    } while (0)
141 
/* CU_CALL: issue a CUDA driver API call once and capture its CUresult. For
 * any result other than CUDA_SUCCESS, log the stringified call and the
 * numeric error code through SUBDBG, then execute `handleerror`. `_status`
 * remains in scope while `handleerror` runs. */
#define CU_CALL( call, handleerror )                                                \
    do {                                                                            \
        CUresult _status = (call);                                                  \
        if (CUDA_SUCCESS != _status) {                                              \
            SUBDBG("error: function %s failed with error %d.\n", #call, _status);   \
            /* Debug aid, normally disabled: */                                     \
            /* fprintf(stderr,"Line %i CU_CALL error function %s failed with error %d.\n", __LINE__, #call, _status); */ \
            handleerror;                                                            \
        }                                                                           \
    } while (0)
151 
152 
/* CUPTI_CALL: issue a CUPTI API call once and capture its CUptiResult. For
 * any result other than CUPTI_SUCCESS, translate the code to text through the
 * dynamically resolved cuptiGetResultString, log the stringified call and
 * that text through SUBDBG, then execute `handleerror`.
 *
 * `errstr` starts as NULL and is replaced by a fixed fallback string if the
 * lookup leaves it NULL, so the "%s" conversion never receives an
 * indeterminate or NULL pointer when the lookup itself fails.
 * `_status` and `errstr` remain in scope while `handleerror` runs. */
#define CUPTI_CALL(call, handleerror)                                               \
    do {                                                                            \
        CUptiResult _status = (call);                                               \
        if (_status != CUPTI_SUCCESS) {                                             \
            const char *errstr = NULL;                                              \
            (*cuptiGetResultStringPtr)(_status, &errstr);                           \
            if (errstr == NULL) {                                                   \
                errstr = "unknown CUPTI error";                                     \
            }                                                                       \
            SUBDBG("error: function %s failed with error %s.\n", #call, errstr);    \
            /* Debug aid, normally disabled: */                                     \
            /* fprintf(stderr, "Line %i CUPTI_CALL macro '%s' failed with error '%s'.\n", __LINE__, #call, errstr); */ \
            handleerror;                                                            \
        }                                                                           \
    } while (0)
164 
/* Buffer size in bytes (32 KiB). */
#define BUF_SIZE (32 * 1024)
/* Alignment in bytes. */
#define ALIGN_SIZE (8)
/* ALIGN_BUFFER: yield `buffer` advanced to the next address that is a
 * multiple of `align`, or `buffer` itself when it is already aligned.
 * The low bits are extracted with a mask, so `align` must be a power of two.
 * `buffer` is expanded more than once, so pass an expression without side
 * effects. */
#define ALIGN_BUFFER(buffer, align) \
    ((buffer) + (((uintptr_t) (buffer) & ((align)-1)) \
                    ? ((align) - ((uintptr_t) (buffer) & ((align)-1))) \
                    : 0))
169 
170 /* Function prototypes */
172 
173 /* ****** CHANGE PROTOTYPES TO DECLARE CUDA LIBRARY SYMBOLS AS WEAK **********
174  * This is done so that a version of PAPI built with the cuda component can *
175  * be installed on a system which does not have the cuda libraries installed. *
176  * *
177  * If this is done without these prototypes, then all papi services on the *
178  * system without the cuda libraries installed will fail. The PAPI libraries *
179  * contain references to the cuda libraries which are not installed. The *
180  * load of PAPI commands fails because the cuda library references can not be *
181  * resolved. *
182  * *
183  * This also defines pointers to the cuda library functions that we call. *
184  * These function pointers will be resolved with dlopen/dlsym calls at *
185  * component initialization time. The component then calls the cuda library *
186  * functions through these function pointers. *
187  *******************************************************************************/
188 void (*_dl_non_dynamic_init) (void) __attribute__ ((weak));
189 
190 #define CUAPIWEAK __attribute__( ( weak ) )
191 #define DECLARECUFUNC(funcname, funcsig) CUresult CUAPIWEAK funcname funcsig; CUresult( *funcname##Ptr ) funcsig;
192 DECLARECUFUNC(cuCtxGetCurrent, (CUcontext *));
193 DECLARECUFUNC(cuCtxSetCurrent, (CUcontext));
194 DECLARECUFUNC(cuCtxDestroy, (CUcontext));
195 DECLARECUFUNC(cuCtxCreate, (CUcontext *pctx, unsigned int flags, CUdevice dev));
196 DECLARECUFUNC(cuDeviceGet, (CUdevice *, int));
197 DECLARECUFUNC(cuDeviceGetCount, (int *));
198 DECLARECUFUNC(cuDeviceGetName, (char *, int, CUdevice));
199 DECLARECUFUNC(cuInit, (unsigned int));
200 DECLARECUFUNC(cuCtxPopCurrent, (CUcontext * pctx));
201 DECLARECUFUNC(cuCtxPushCurrent, (CUcontext pctx));
202 DECLARECUFUNC(cuCtxSynchronize, ());
203 
204 #define CUDAAPIWEAK __attribute__( ( weak ) )
205 #define DECLARECUDAFUNC(funcname, funcsig) cudaError_t CUDAAPIWEAK funcname funcsig; cudaError_t( *funcname##Ptr ) funcsig;
208 DECLARECUDAFUNC(cudaFree, (void *));
209 
210 #define CUPTIAPIWEAK __attribute__( ( weak ) )
211 #define DECLARECUPTIFUNC(funcname, funcsig) CUptiResult CUPTIAPIWEAK funcname funcsig; CUptiResult( *funcname##Ptr ) funcsig;
212 /* CUptiResult CUPTIAPIWEAK cuptiDeviceEnumEventDomains( CUdevice, size_t *, CUpti_EventDomainID * ); */
213 /* CUptiResult( *cuptiDeviceEnumEventDomainsPtr )( CUdevice, size_t *, CUpti_EventDomainID * ); */
214 DECLARECUPTIFUNC(cuptiDeviceEnumMetrics, (CUdevice device, size_t * arraySizeBytes, CUpti_MetricID * metricArray));
215 DECLARECUPTIFUNC(cuptiDeviceGetEventDomainAttribute, (CUdevice device, CUpti_EventDomainID eventDomain, CUpti_EventDomainAttribute attrib, size_t * valueSize, void *value));
216 DECLARECUPTIFUNC(cuptiDeviceGetNumMetrics, (CUdevice device, uint32_t * numMetrics));
217 DECLARECUPTIFUNC(cuptiEventGroupGetAttribute, (CUpti_EventGroup eventGroup, CUpti_EventGroupAttribute attrib, size_t * valueSize, void *value));
218 DECLARECUPTIFUNC(cuptiEventGroupReadEvent, (CUpti_EventGroup eventGroup, CUpti_ReadEventFlags flags, CUpti_EventID event, size_t * eventValueBufferSizeBytes, uint64_t * eventValueBuffer));
219 DECLARECUPTIFUNC(cuptiEventGroupSetAttribute, (CUpti_EventGroup eventGroup, CUpti_EventGroupAttribute attrib, size_t valueSize, void *value));
220 DECLARECUPTIFUNC(cuptiEventGroupSetDisable, (CUpti_EventGroupSet * eventGroupSet));
221 DECLARECUPTIFUNC(cuptiEventGroupSetEnable, (CUpti_EventGroupSet * eventGroupSet));
222 DECLARECUPTIFUNC(cuptiEventGroupSetsCreate, (CUcontext context, size_t eventIdArraySizeBytes, CUpti_EventID * eventIdArray, CUpti_EventGroupSets ** eventGroupPasses));
223 DECLARECUPTIFUNC(cuptiMetricCreateEventGroupSets, (CUcontext context, size_t metricIdArraySizeBytes, CUpti_MetricID * metricIdArray, CUpti_EventGroupSets ** eventGroupPasses));
224 DECLARECUPTIFUNC(cuptiEventGroupSetsDestroy, (CUpti_EventGroupSets * eventGroupSets));
225 DECLARECUPTIFUNC(cuptiMetricGetRequiredEventGroupSets, (CUcontext ctx, CUpti_MetricID metricId, CUpti_EventGroupSets **thisEventGroupSet));
226 DECLARECUPTIFUNC(cuptiGetTimestamp, (uint64_t * timestamp));
227 DECLARECUPTIFUNC(cuptiMetricEnumEvents, (CUpti_MetricID metric, size_t * eventIdArraySizeBytes, CUpti_EventID * eventIdArray));
228 DECLARECUPTIFUNC(cuptiMetricGetAttribute, (CUpti_MetricID metric, CUpti_MetricAttribute attrib, size_t * valueSize, void *value));
229 DECLARECUPTIFUNC(cuptiMetricGetNumEvents, (CUpti_MetricID metric, uint32_t * numEvents));
230 DECLARECUPTIFUNC(cuptiMetricGetValue, (CUdevice device, CUpti_MetricID metric, size_t eventIdArraySizeBytes, CUpti_EventID * eventIdArray, size_t eventValueArraySizeBytes, uint64_t * eventValueArray, uint64_t timeDuration, CUpti_MetricValue * metricValue));
231 DECLARECUPTIFUNC(cuptiSetEventCollectionMode, (CUcontext context, CUpti_EventCollectionMode mode));
232 DECLARECUPTIFUNC(cuptiDeviceEnumEventDomains, (CUdevice, size_t *, CUpti_EventDomainID *));
233 DECLARECUPTIFUNC(cuptiDeviceGetNumEventDomains, (CUdevice, uint32_t *));
234 DECLARECUPTIFUNC(cuptiEventDomainEnumEvents, (CUpti_EventDomainID, size_t *, CUpti_EventID *));
235 DECLARECUPTIFUNC(cuptiEventDomainGetAttribute, (CUpti_EventDomainID eventDomain, CUpti_EventDomainAttribute attrib, size_t * valueSize, void *value));
236 DECLARECUPTIFUNC(cuptiEventDomainGetNumEvents, (CUpti_EventDomainID, uint32_t *));
237 DECLARECUPTIFUNC(cuptiEventGetAttribute, (CUpti_EventID, CUpti_EventAttribute, size_t *, void *));
238 DECLARECUPTIFUNC(cuptiEventGroupAddEvent, (CUpti_EventGroup, CUpti_EventID));
239 DECLARECUPTIFUNC(cuptiEventGroupCreate, (CUcontext, CUpti_EventGroup *, uint32_t));
240 DECLARECUPTIFUNC(cuptiEventGroupDestroy, (CUpti_EventGroup));
241 DECLARECUPTIFUNC(cuptiEventGroupDisable, (CUpti_EventGroup));
242 DECLARECUPTIFUNC(cuptiEventGroupEnable, (CUpti_EventGroup));
243 DECLARECUPTIFUNC(cuptiEventGroupReadAllEvents, (CUpti_EventGroup, CUpti_ReadEventFlags, size_t *, uint64_t *, size_t *, CUpti_EventID *, size_t *));
244 DECLARECUPTIFUNC(cuptiEventGroupResetAllEvents, (CUpti_EventGroup));
245 DECLARECUPTIFUNC(cuptiGetResultString, (CUptiResult result, const char **str));
246 DECLARECUPTIFUNC(cuptiEnableKernelReplayMode, ( CUcontext context ));
247 DECLARECUPTIFUNC(cuptiDisableKernelReplayMode, ( CUcontext context ));
248 
249 
250 /*****************************************************************************
251  ******** BEGIN FUNCTIONS USED INTERNALLY SPECIFIC TO THIS COMPONENT *********
252  *****************************************************************************/
253 
254 /*
255  * Link the necessary CUDA libraries to use the cuda component. If any of them can not be found, then
256  * the CUDA component will just be disabled. This is done at runtime so that a version of PAPI built
257  * with the CUDA component can be installed and used on systems which have the CUDA libraries installed
258  * and on systems where these libraries are not installed.
259  */
260 static int papicuda_linkCudaLibraries()
261 {
262 #define DLSYM_AND_CHECK( dllib, name ) dlsym( dllib, name ); if ( dlerror()!=NULL ) { strncpy( _cuda_vector.cmp_info.disabled_reason, "A CUDA required function was not found in dynamic libs", PAPI_MAX_STR_LEN ); return ( PAPI_ENOSUPP ); }
263 
264  /* Attempt to guess if we were statically linked to libc, if so bail */
265  if(_dl_non_dynamic_init != NULL) {
266  strncpy(_cuda_vector.cmp_info.disabled_reason, "The CUDA component does not support statically linking to libc.", PAPI_MAX_STR_LEN);
267  return PAPI_ENOSUPP;
268  }
269  /* Need to link in the cuda libraries, if not found disable the component */
270  dl1 = dlopen("libcuda.so", RTLD_NOW | RTLD_GLOBAL);
271  CHECK_PRINT_EVAL(!dl1, "CUDA library libcuda.so not found.", return (PAPI_ENOSUPP));
272  cuCtxGetCurrentPtr = DLSYM_AND_CHECK(dl1, "cuCtxGetCurrent");
273  cuCtxSetCurrentPtr = DLSYM_AND_CHECK(dl1, "cuCtxSetCurrent");
274  cuDeviceGetPtr = DLSYM_AND_CHECK(dl1, "cuDeviceGet");
275  cuDeviceGetCountPtr = DLSYM_AND_CHECK(dl1, "cuDeviceGetCount");
276  cuDeviceGetNamePtr = DLSYM_AND_CHECK(dl1, "cuDeviceGetName");
277  cuInitPtr = DLSYM_AND_CHECK(dl1, "cuInit");
278  cuCtxPopCurrentPtr = DLSYM_AND_CHECK(dl1, "cuCtxPopCurrent");
279  cuCtxPushCurrentPtr = DLSYM_AND_CHECK(dl1, "cuCtxPushCurrent");
280  cuCtxDestroyPtr = DLSYM_AND_CHECK(dl1, "cuCtxDestroy");
281  cuCtxCreatePtr = DLSYM_AND_CHECK(dl1, "cuCtxCreate");
282  cuCtxSynchronizePtr = DLSYM_AND_CHECK(dl1, "cuCtxSynchronize");
283 
284  dl2 = dlopen("libcudart.so", RTLD_NOW | RTLD_GLOBAL | RTLD_NODELETE);
285  CHECK_PRINT_EVAL(!dl2, "CUDA runtime library libcudart.so not found.", return (PAPI_ENOSUPP));
286  cudaGetDevicePtr = DLSYM_AND_CHECK(dl2, "cudaGetDevice");
287  cudaSetDevicePtr = DLSYM_AND_CHECK(dl2, "cudaSetDevice");
288  cudaFreePtr = DLSYM_AND_CHECK(dl2, "cudaFree");
289 
290  dl3 = dlopen("libcupti.so", RTLD_NOW | RTLD_GLOBAL);
291  CHECK_PRINT_EVAL(!dl3, "CUDA Profiling Tools Interface (CUPTI) library libcupti.so not found.", return (PAPI_ENOSUPP));
292  /* The macro DLSYM_AND_CHECK results in the expansion example below */
293  /* cuptiDeviceEnumEventDomainsPtr = dlsym( dl3, "cuptiDeviceEnumEventDomains" ); */
294  /* if ( dlerror()!=NULL ) { strncpy( _cuda_vector.cmp_info.disabled_reason, "A CUDA required function was not found in dynamic libs", PAPI_MAX_STR_LEN ); return ( PAPI_ENOSUPP ); } */
295  cuptiDeviceEnumMetricsPtr = DLSYM_AND_CHECK(dl3, "cuptiDeviceEnumMetrics");
296  cuptiDeviceGetEventDomainAttributePtr = DLSYM_AND_CHECK(dl3, "cuptiDeviceGetEventDomainAttribute");
297  cuptiDeviceGetNumMetricsPtr = DLSYM_AND_CHECK(dl3, "cuptiDeviceGetNumMetrics");
298  cuptiEventGroupGetAttributePtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupGetAttribute");
299  cuptiEventGroupReadEventPtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupReadEvent");
300  cuptiEventGroupSetAttributePtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupSetAttribute");
301  cuptiMetricGetRequiredEventGroupSetsPtr = DLSYM_AND_CHECK(dl3, "cuptiMetricGetRequiredEventGroupSets");
302  cuptiEventGroupSetDisablePtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupSetDisable");
303  cuptiEventGroupSetEnablePtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupSetEnable");
304  cuptiEventGroupSetsCreatePtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupSetsCreate");
305  cuptiEventGroupSetsDestroyPtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupSetsDestroy");
306  cuptiGetTimestampPtr = DLSYM_AND_CHECK(dl3, "cuptiGetTimestamp");
307  cuptiMetricEnumEventsPtr = DLSYM_AND_CHECK(dl3, "cuptiMetricEnumEvents");
308  cuptiMetricGetAttributePtr = DLSYM_AND_CHECK(dl3, "cuptiMetricGetAttribute");
309  cuptiMetricGetNumEventsPtr = DLSYM_AND_CHECK(dl3, "cuptiMetricGetNumEvents");
310  cuptiMetricGetValuePtr = DLSYM_AND_CHECK(dl3, "cuptiMetricGetValue");
311  cuptiMetricCreateEventGroupSetsPtr = DLSYM_AND_CHECK(dl3, "cuptiMetricCreateEventGroupSets");
312  cuptiSetEventCollectionModePtr = DLSYM_AND_CHECK(dl3, "cuptiSetEventCollectionMode");
313  cuptiDeviceEnumEventDomainsPtr = DLSYM_AND_CHECK(dl3, "cuptiDeviceEnumEventDomains");
314  cuptiDeviceGetNumEventDomainsPtr = DLSYM_AND_CHECK(dl3, "cuptiDeviceGetNumEventDomains");
315  cuptiEventDomainEnumEventsPtr = DLSYM_AND_CHECK(dl3, "cuptiEventDomainEnumEvents");
316  cuptiEventDomainGetAttributePtr = DLSYM_AND_CHECK(dl3, "cuptiEventDomainGetAttribute");
317  cuptiEventDomainGetNumEventsPtr = DLSYM_AND_CHECK(dl3, "cuptiEventDomainGetNumEvents");
318  cuptiEventGetAttributePtr = DLSYM_AND_CHECK(dl3, "cuptiEventGetAttribute");
319  cuptiEventGroupAddEventPtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupAddEvent");
320  cuptiEventGroupCreatePtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupCreate");
321  cuptiEventGroupDestroyPtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupDestroy");
322  cuptiEventGroupDisablePtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupDisable");
323  cuptiEventGroupEnablePtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupEnable");
324  cuptiEventGroupReadAllEventsPtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupReadAllEvents");
325  cuptiEventGroupResetAllEventsPtr = DLSYM_AND_CHECK(dl3, "cuptiEventGroupResetAllEvents");
326  cuptiGetResultStringPtr = DLSYM_AND_CHECK(dl3, "cuptiGetResultString");
327  cuptiEnableKernelReplayModePtr = DLSYM_AND_CHECK(dl3, "cuptiEnableKernelReplayMode");
328  cuptiDisableKernelReplayModePtr = DLSYM_AND_CHECK(dl3, "cuptiEnableKernelReplayMode");
329  return (PAPI_OK);
330 }
331 
332 
334 {
335  SUBDBG("Entering\n");
336  CUresult cuErr;
337  int deviceNum;
338  uint32_t domainNum, eventNum;
339  papicuda_device_desc_t *mydevice;
340  char tmpStr[PAPI_MIN_STR_LEN];
341  tmpStr[PAPI_MIN_STR_LEN - 1] = '\0';
342  size_t tmpSizeBytes;
343  int ii;
344  uint32_t maxEventSize;
345 
346  /* How many CUDA devices do we have? */
347  cuErr = (*cuDeviceGetCountPtr) (&gctxt->deviceCount);
348  if(cuErr == CUDA_ERROR_NOT_INITIALIZED) {
349  /* If CUDA not initialized, initialize CUDA and retry the device list */
350  /* This is required for some of the PAPI tools, that do not call the init functions */
351  if(((*cuInitPtr) (0)) != CUDA_SUCCESS) {
352  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA cannot be found and initialized (cuInit failed).", PAPI_MAX_STR_LEN);
353  return PAPI_ENOSUPP;
354  }
355  CU_CALL((*cuDeviceGetCountPtr) (&gctxt->deviceCount), return (PAPI_EMISC));
356  }
357 
358  if(gctxt->deviceCount == 0) {
359  strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA initialized but no CUDA devices found.", PAPI_MAX_STR_LEN);
360  return PAPI_ENOSUPP;
361  }
362  SUBDBG("Found %d devices\n", gctxt->deviceCount);
363 
364  /* allocate memory for device information */
366  CHECK_PRINT_EVAL(!gctxt->deviceArray, "ERROR CUDA: Could not allocate memory for CUDA device structure", return (PAPI_ENOMEM));
367 
368  /* For each device, get domains and domain-events counts */
369  maxEventSize = 0;
370  for(deviceNum = 0; deviceNum < gctxt->deviceCount; deviceNum++) {
371  mydevice = &gctxt->deviceArray[deviceNum];
372  /* Get device id, name, numeventdomains for each device */
373  CU_CALL((*cuDeviceGetPtr) (&mydevice->cuDev, deviceNum), // get CUdevice.
374  return (PAPI_EMISC)); // .. on failure.
375 
376  CU_CALL((*cuDeviceGetNamePtr) (mydevice->deviceName, // get device name,
377  PAPI_MIN_STR_LEN - 1, mydevice->cuDev), // .. max length,
378  return (PAPI_EMISC)); // .. on failure.
379 
380  mydevice->deviceName[PAPI_MIN_STR_LEN - 1] = '\0'; // z-terminate it.
381 
382  CUPTI_CALL((*cuptiDeviceGetNumEventDomainsPtr) // get number of domains,
383  (mydevice->cuDev, &mydevice->maxDomains),
384  return (PAPI_EMISC)); // .. on failure.
385 
386  /* Allocate space to hold domain IDs */
387  mydevice->domainIDArray = (CUpti_EventDomainID *) papi_calloc(
388  mydevice->maxDomains, sizeof(CUpti_EventDomainID));
389 
390  CHECK_PRINT_EVAL(!mydevice->domainIDArray, "ERROR CUDA: Could not allocate memory for CUDA device domains", return (PAPI_ENOMEM));
391 
392  /* Put domain ids into allocated space */
393  size_t domainarraysize = mydevice->maxDomains * sizeof(CUpti_EventDomainID);
394  CUPTI_CALL((*cuptiDeviceEnumEventDomainsPtr) // enumerate domain ids into space.
395  (mydevice->cuDev, &domainarraysize, mydevice->domainIDArray),
396  return (PAPI_EMISC)); // .. on failure.
397 
398  /* Allocate space to hold domain event counts */
399  mydevice->domainIDNumEvents = (uint32_t *) papi_calloc(mydevice->maxDomains, sizeof(uint32_t));
400  CHECK_PRINT_EVAL(!mydevice->domainIDNumEvents, "ERROR CUDA: Could not allocate memory for domain event counts", return (PAPI_ENOMEM));
401 
402  /* For each domain, get event counts in domainNumEvents[] */
403  for(domainNum = 0; domainNum < mydevice->maxDomains; domainNum++) { // For each domain,
404  CUpti_EventDomainID domainID = mydevice->domainIDArray[domainNum]; // .. make a copy of the domain ID.
405  /* Get num events in domain */
406  CUPTI_CALL((*cuptiEventDomainGetNumEventsPtr) // Get number of events in this domain,
407  (domainID, &mydevice->domainIDNumEvents[domainNum]), // .. store in array.
408  return (PAPI_EMISC)); // .. on failure.
409 
410  maxEventSize += mydevice->domainIDNumEvents[domainNum]; // keep track of overall number of events.
411  } // end for each domain.
412  } // end of for each device.
413 
414  // Increase maxEventSize for metrics on this device.
415  for(deviceNum = 0; deviceNum < gctxt->deviceCount; deviceNum++) { // for each device,
416  uint32_t maxMetrics = 0;
417  CUptiResult cuptiRet;
418  mydevice = &gctxt->deviceArray[deviceNum]; // Get papicuda_device_desc pointer.
419  cuptiRet = (*cuptiDeviceGetNumMetricsPtr) (mydevice->cuDev, &maxMetrics); // Read the # metrics on this device.
420  if (cuptiRet != CUPTI_SUCCESS || maxMetrics < 1) continue; // If no metrics, skip to next device.
421  maxEventSize += maxMetrics; // make room for metrics we discover later.
422  } // end for each device.
423 
424  /* Allocate space for all events and descriptors */
425  gctxt->availEventKind = (CUpti_ActivityKind *) papi_calloc(maxEventSize, sizeof(CUpti_ActivityKind));
426  CHECK_PRINT_EVAL(!gctxt->availEventKind, "ERROR CUDA: Could not allocate memory", return (PAPI_ENOMEM));
427  gctxt->availEventDeviceNum = (int *) papi_calloc(maxEventSize, sizeof(int));
428  CHECK_PRINT_EVAL(!gctxt->availEventDeviceNum, "ERROR CUDA: Could not allocate memory", return (PAPI_ENOMEM));
429  gctxt->availEventIDArray = (CUpti_EventID *) papi_calloc(maxEventSize, sizeof(CUpti_EventID));
430  CHECK_PRINT_EVAL(!gctxt->availEventIDArray, "ERROR CUDA: Could not allocate memory", return (PAPI_ENOMEM));
431  gctxt->availEventIsBeingMeasuredInEventset = (uint32_t *) papi_calloc(maxEventSize, sizeof(uint32_t));
432  CHECK_PRINT_EVAL(!gctxt->availEventIsBeingMeasuredInEventset, "ERROR CUDA: Could not allocate memory", return (PAPI_ENOMEM));
433  gctxt->availEventDesc = (papicuda_name_desc_t *) papi_calloc(maxEventSize, sizeof(papicuda_name_desc_t));
434  CHECK_PRINT_EVAL(!gctxt->availEventDesc, "ERROR CUDA: Could not allocate memory", return (PAPI_ENOMEM));
435 
436  // Record all events on each device, and their descriptions.
437  uint32_t idxEventArray = 0;
438  for(deviceNum = 0; deviceNum < gctxt->deviceCount; deviceNum++) { // loop through each device.
439  mydevice = &gctxt->deviceArray[deviceNum]; // get a pointer to the papicuda_device_desc struct.
440 
441  // For each domain, get and store event IDs, names, descriptions.
442  for(domainNum = 0; domainNum < mydevice->maxDomains; domainNum++) { // loop through the domains in this device.
443 
444  /* Get domain id */
445  CUpti_EventDomainID domainID = mydevice->domainIDArray[domainNum]; // get the domain id,
446  uint32_t domainNumEvents = mydevice->domainIDNumEvents[domainNum]; // get the number of events in it.
447 
448  // SUBDBG( "For device %d domain %d domainID %d numEvents %d\n", mydevice->cuDev, domainNum, domainID, domainNumEvents );
449 
450  CUpti_EventID *domainEventIDArray = // Make space for the events in this domain.
451  (CUpti_EventID *) papi_calloc(domainNumEvents, sizeof(CUpti_EventID)); // ..
452  CHECK_PRINT_EVAL(!domainEventIDArray, "ERROR CUDA: Could not allocate memory for events", return (PAPI_ENOMEM));
453 
454  size_t domainEventArraySize = domainNumEvents * sizeof(CUpti_EventID); // compute size of array we allocated.
455  CUPTI_CALL((*cuptiEventDomainEnumEventsPtr) // Enumerate the events in the domain,
456  (domainID, &domainEventArraySize, domainEventIDArray), // ..
457  return (PAPI_EMISC)); // .. on failure, exit.
458 
459  for(eventNum = 0; eventNum < domainNumEvents; eventNum++) { // Loop through the events in this domain.
460  CUpti_EventID myeventCuptiEventId = domainEventIDArray[eventNum]; // .. get this event,
461  gctxt->availEventKind[idxEventArray] = CUPTI_ACTIVITY_KIND_EVENT; // .. record the kind,
462  gctxt->availEventIDArray[idxEventArray] = myeventCuptiEventId; // .. record the id,
463  gctxt->availEventDeviceNum[idxEventArray] = deviceNum; // .. record the device number,
464 
465  tmpSizeBytes = PAPI_MIN_STR_LEN - 1 * sizeof(char); // .. compute size of name,
466  CUPTI_CALL((*cuptiEventGetAttributePtr) (myeventCuptiEventId, // .. Get the event name seen by cupti,
467  CUPTI_EVENT_ATTR_NAME, &tmpSizeBytes, tmpStr), // .. into tmpStr.
468  return (PAPI_EMISC)); // .. on failure, exit routine.
469 
470  snprintf(gctxt->availEventDesc[idxEventArray].name, PAPI_MIN_STR_LEN, // record expaneded name for papi user.
471  "event:%s:device=%d", tmpStr, deviceNum);
472  gctxt->availEventDesc[idxEventArray].name[PAPI_MIN_STR_LEN - 1] = '\0'; // ensure null termination.
473  char *nameTmpPtr = gctxt->availEventDesc[idxEventArray].name; // For looping, get pointer to name.
474  for(ii = 0; ii < (int) strlen(nameTmpPtr); ii++) { // Replace spaces with underscores.
475  if(nameTmpPtr[ii] == ' ') nameTmpPtr[ii] = '_'; // ..
476  }
477 
478  /* Save description in the native event array */
479  tmpSizeBytes = PAPI_2MAX_STR_LEN - 1 * sizeof(char); // Most space to use for description.
480  CUPTI_CALL((*cuptiEventGetAttributePtr) (myeventCuptiEventId, // Get it,
481  CUPTI_EVENT_ATTR_SHORT_DESCRIPTION, &tmpSizeBytes, // .. Set limit (and recieve bytes written),
482  gctxt->availEventDesc[idxEventArray].description), // .. in the description.
483  return (PAPI_EMISC)); // .. on failure.
484  gctxt->availEventDesc[idxEventArray].description[PAPI_2MAX_STR_LEN - 1] = '\0'; // Ensure null terminator.
485  gctxt->availEventDesc[idxEventArray].numMetricEvents = 0; // Not a metric.
486  gctxt->availEventDesc[idxEventArray].metricEvents = NULL; // No space allocated.
487  /* Increment index past events in this domain to start of next domain */
488  idxEventArray++; // Bump total number of events.
489  } // end of events in this domain.
490 
491  papi_free(domainEventIDArray); // done with temp space.
492  } // end of domain loop within device.
493  } // end of device loop, for events.
494 
495  // Now we retrieve and store all METRIC info for each device; this includes
496  // both cuda metrics and nvlink metrics.
497  SUBDBG("Checking for metrics\n");
498  for (deviceNum = 0; deviceNum < gctxt->deviceCount; deviceNum++) {
499  uint32_t maxMetrics = 0, i, j;
500  CUpti_MetricID *metricIdList = NULL;
501  CUptiResult cuptiRet;
502  mydevice = &gctxt->deviceArray[deviceNum]; // Get papicuda_device_desc pointer.
503  cuptiRet = (*cuptiDeviceGetNumMetricsPtr) (mydevice->cuDev, &maxMetrics); // Read the # metrics on this device.
504  if (cuptiRet != CUPTI_SUCCESS || maxMetrics < 1) continue; // If no metrics, skip to next device.
505 
506  SUBDBG("Device %d: Checking each of the (maxMetrics) %d metrics\n", deviceNum, maxMetrics);
507 
508  // Make a temporary list of the metric Ids to add to the available named collectables.
509  size_t size = maxMetrics * sizeof(CUpti_EventID);
510  metricIdList = (CUpti_MetricID *) papi_calloc(maxMetrics, sizeof(CUpti_EventID));
511  CHECK_PRINT_EVAL(metricIdList == NULL, "Out of memory", return (PAPI_ENOMEM));
512 
513  CUPTI_CALL((*cuptiDeviceEnumMetricsPtr) // Enumerate the metric Ids for this device,
514  (mydevice->cuDev, &size, metricIdList), // .. into metricIdList.
515  return (PAPI_EMISC)); // .. On failure, but should work, we have metrics!
516 
517  // Elimination loop for metrics we cannot support.
518  int saveDeviceNum = 0;
519  CUDA_CALL((*cudaGetDevicePtr) (&saveDeviceNum), return (PAPI_EMISC)); // save caller's device num.
520 
521  for (i=0, j=0; i<maxMetrics; i++) { // process each metric Id.
522  size = PAPI_MIN_STR_LEN-1; // Most bytes allowed to be written.
523  CUPTI_CALL((*cuptiMetricGetAttributePtr) (metricIdList[i], // Get the name.
524  CUPTI_METRIC_ATTR_NAME, &size, (uint8_t *) tmpStr),
525  return (PAPI_EMISC));
526 
527  // Note that 'size' also returned total bytes written.
528  tmpStr[size] = '\0';
529 
530  if (strcmp("branch_efficiency", tmpStr) == 0) continue; // If it is branch efficiency, skip it.
531 
532  // We'd like to reject anything requiring more than 1
533  // set, but there is a problem I cannot find; I have
534  // been unable to create a CUcontext here so I can
535  // execute the CreateEventGroups. I've tried both
536  // ways, it returns an error saying no cuda devices
537  // available. There does not seem to be a way to get
538  // the number of "sets" (passes) for a metric without
539  // having a context.
540 
541  // CUpti_EventGroupSets *thisEventGroupSets = NULL;
542  //CUPTI_CALL ((*cuptiMetricCreateEventGroupSetsPtr) (
543  // tempContext,
544  // sizeof(CUpti_MetricID),
545  // &metricIdList[i],
546  // &thisEventGroupSets),
547  // return (PAPI_EMISC));
548  //
549  //int numSets = 0; // # of sets (passes) required.
550  //if (thisEventGroupSets != NULL) {
551  // numSets=thisEventGroupSets->numSets; // Get sets if a grouping is necessary.
552  // CUPTI_CALL((*cuptiEventGroupSetsDestroyPtr) (thisEventGroupSets), // Done with this.
553  // return (PAPI_EMISC));
554  //}
555  //
556  //if (numSets > 1) continue; // skip this metric too many passes.
557 
558  metricIdList[j++] = metricIdList[i]; // we are compressing if we skipped any.
559  } // end elimination loop.
560 
561  // Done with eliminations, the rest are valid.
562  maxMetrics = j; // Change the number to process.
563 
564  // Eliminations accomplished, now add the valid metric Ids to the list.
565  for(i = 0; i < maxMetrics; i++) { // for each id,
566  gctxt->availEventIDArray[idxEventArray] = metricIdList[i]; // add to the list of collectables.
567  gctxt->availEventKind[idxEventArray] = CUPTI_ACTIVITY_KIND_METRIC; // Indicate it is a metric.
568  gctxt->availEventDeviceNum[idxEventArray] = deviceNum; // remember the device number.
569  size = PAPI_MIN_STR_LEN;
570  CUPTI_CALL((*cuptiMetricGetAttributePtr) (metricIdList[i], // Get the name, fail if we cannot.
571  CUPTI_METRIC_ATTR_NAME, &size, (uint8_t *) tmpStr),
572  return (PAPI_EMISC));
573 
574  if (size >= PAPI_MIN_STR_LEN) { // Truncate if we don't have room for the name.
575  gctxt->availEventDesc[idxEventArray].name[PAPI_MIN_STR_LEN - 1] = '\0';
576  }
577 
578  size_t MV_KindSize = sizeof(CUpti_MetricValueKind);
579  CUPTI_CALL((*cuptiMetricGetAttributePtr) // Collect the metric kind.
580  (metricIdList[i], CUPTI_METRIC_ATTR_VALUE_KIND, &MV_KindSize, // .. for this metric,
581  &gctxt->availEventDesc[idxEventArray].MV_Kind), // .. store in the event description,
582  return (PAPI_EMISC)); // .. on failure, but should always work.
583 
584  snprintf(gctxt->availEventDesc[idxEventArray].name, PAPI_MIN_STR_LEN, // .. develop name for papi user in tmpStr.
585  "metric:%s:device=%d", tmpStr, deviceNum);
586 
587  size = PAPI_2MAX_STR_LEN-1; // Most bytes to return.
588  CUPTI_CALL((*cuptiMetricGetAttributePtr) // Collect the long description.
589  (metricIdList[i], CUPTI_METRIC_ATTR_LONG_DESCRIPTION, &size, // .. for this metric, no more than size.
590  (uint8_t *) gctxt->availEventDesc[idxEventArray].description), // .. and store in event description.
591  return (PAPI_EMISC)); // .. on failure, but should always work.
592 
593  // Note that 'size' also returned total bytes written.
594  gctxt->availEventDesc[idxEventArray].description[size] = '\0'; // Always z-terminate.
595 
596  // Now we get all the sub-events of this metric.
597  uint32_t numSubs;
598  CUpti_MetricID itemId = metricIdList[i]; //.. shortcut to metric id.
599  CUPTI_CALL((*cuptiMetricGetNumEventsPtr) (itemId, &numSubs), // .. Get number of sub-events in metric.
600  return (PAPI_EINVAL)); // .. on failure of call.
601 
602  size_t sizeBytes = numSubs * sizeof(CUpti_EventID); // .. compute size of array we need.
603  CUpti_EventID *subEventIds = papi_malloc(sizeBytes); // .. Make the space.
604  CHECK_PRINT_EVAL(subEventIds == NULL, "Malloc failed", // .. If malloc fails,
605  return (PAPI_ENOMEM));
606 
607  CUPTI_CALL((*cuptiMetricEnumEventsPtr) // .. Enumrate events in the metric.
608  (itemId, &sizeBytes, subEventIds), // .. store in array.
609  return (PAPI_EINVAL)); // .. If cupti call fails.
610 
611  gctxt->availEventDesc[idxEventArray].metricEvents = subEventIds; // .. Copy the array pointer for IDs.
612  gctxt->availEventDesc[idxEventArray].numMetricEvents = numSubs; // .. Copy number of elements in it.
613 
614  idxEventArray++; // count another collectable found.
615  } // end maxMetrics loop.
616 
617  papi_free(metricIdList); // Done with this enumeration of metrics.
618  // Part of problem above, cannot create tempContext for unknown reason.
619  // CU_CALL((*cuCtxDestroyPtr) (tempContext), return (PAPI_EMISC)); // destroy the temporary context.
620  CUDA_CALL((*cudaSetDevicePtr) (saveDeviceNum), return (PAPI_EMISC)); // set the device pointer back to caller.
621  } // end 'for each device'.
622 
623  gctxt->availEventSize = idxEventArray;
624 
625  /* return 0 if everything went OK */
626  return 0;
627 } // end papicuda_add_native_events
628 
629 
630 /*
631  This routine tries to convert all CUPTI values to long long values.
632  If the CUPTI value is an integer type, it is cast to long long. If
633  the CUPTI value is a percent, it is multiplied by 100 to return the
634  integer percentage. If the CUPTI value is a double, the value
635  is cast to long long... this can be a severe truncation.
636  */
637 static int papicuda_convert_metric_value_to_long_long(CUpti_MetricValue metricValue, CUpti_MetricValueKind valueKind, long long int *papiValue)
638 {
639  union {
640  long long ll;
641  double fp;
642  } tmpValue;
643 
644  SUBDBG("Try to convert the CUPTI metric value kind (index %d) to PAPI value (long long or double)\n", valueKind);
645  switch (valueKind) {
646  case CUPTI_METRIC_VALUE_KIND_DOUBLE:
647  SUBDBG("Metric double %f\n", metricValue.metricValueDouble);
648  tmpValue.ll = (long long)(metricValue.metricValueDouble);
649  //CHECK_PRINT_EVAL(tmpValue.fp - metricValue.metricValueDouble > 1e-6, "Error converting metric\n", return (PAPI_EMISC));
650  break;
651  case CUPTI_METRIC_VALUE_KIND_UINT64:
652  SUBDBG("Metric uint64 = %llu\n", (unsigned long long) metricValue.metricValueUint64);
653  tmpValue.ll = (long long) (metricValue.metricValueUint64);
654  CHECK_PRINT_EVAL(tmpValue.ll - metricValue.metricValueUint64 > 1e-6, "Error converting metric\n", return (PAPI_EMISC));
655  break;
656  case CUPTI_METRIC_VALUE_KIND_INT64:
657  SUBDBG("Metric int64 = %lld\n", (long long) metricValue.metricValueInt64);
658  tmpValue.ll = (long long) (metricValue.metricValueInt64);
659  CHECK_PRINT_EVAL(tmpValue.ll - metricValue.metricValueInt64 > 1e-6, "Error converting metric\n", return (PAPI_EMISC));
660  break;
661  case CUPTI_METRIC_VALUE_KIND_PERCENT:
662  SUBDBG("Metric percent = %f%%\n", metricValue.metricValuePercent);
663  tmpValue.ll = (long long)(metricValue.metricValuePercent*100);
664  //CHECK_PRINT_EVAL(tmpValue.ll - metricValue.metricValuePercent > 1e-6, "Error converting metric\n", return (PAPI_EMISC));
665  break;
666  case CUPTI_METRIC_VALUE_KIND_THROUGHPUT:
667  SUBDBG("Metric throughput %llu bytes/sec\n", (unsigned long long) metricValue.metricValueThroughput);
668  tmpValue.ll = (long long) (metricValue.metricValueThroughput);
669  CHECK_PRINT_EVAL(tmpValue.ll - metricValue.metricValueThroughput > 1e-6, "Error converting metric\n", return (PAPI_EMISC));
670  break;
671  case CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL:
672  SUBDBG("Metric utilization level %u\n", (unsigned int) metricValue.metricValueUtilizationLevel);
673  tmpValue.ll = (long long) (metricValue.metricValueUtilizationLevel);
674  CHECK_PRINT_EVAL(tmpValue.ll - metricValue.metricValueUtilizationLevel > 1e-6, "Error converting metric\n", return (PAPI_EMISC));
675  break;
676  default:
677  CHECK_PRINT_EVAL(1, "ERROR: unsupported metric value kind", return (PAPI_EINVAL));
678  exit(-1);
679  }
680 
681  *papiValue = tmpValue.ll;
682  return (PAPI_OK);
683 } // end routine
684 
685 
686 /* ****************************************************************************
687  ******************* BEGIN PAPI's COMPONENT REQUIRED FUNCTIONS *************
688  **************************************************************************** */
689 
690 /*
691  * This is called whenever a thread is initialized.
692  */
694 {
695  (void) ctx;
696  SUBDBG("Entering\n");
697 
698  return PAPI_OK;
699 }
700 
701 
702 /* Initialize hardware counters, setup the function vector table
703  * and get hardware information, this routine is called when the
704  * PAPI process is initialized (i.e., PAPI_library_init)
705  */
706 /* NOTE: only called by main thread (not by every thread) !!! Starting
707  in CUDA 4.0, multiple CPU threads can access the same CUDA
708  context. This is a much easier programming model than pre-4.0 as
709  threads - using the same context - can share memory, data,
710  etc. It's possible to create a different context for each
711  thread. That's why CUDA context creation is done in
712  CUDA_init_component() (called only by main thread) rather than
713  CUDA_init() or CUDA_init_control_state() (both called by each
714  thread). */
716 {
717  SUBDBG("Entering with component idx: %d\n", cidx);
718  int rv;
719 
720  /* link in all the cuda libraries and resolve the symbols we need to use */
721  if(papicuda_linkCudaLibraries() != PAPI_OK) {
722  SUBDBG("Dynamic link of CUDA libraries failed, component will be disabled.\n");
723  SUBDBG("See disable reason in papi_component_avail output for more details.\n");
724  return (PAPI_ENOSUPP);
725  }
726 
727  /* Create the structure */
730 
731  /* Get list of all native CUDA events supported */
733  if(rv != 0)
734  return (rv);
735 
736  /* Export some information */
741 
742  return (PAPI_OK);
743 } // end init_component
744 
745 
746 /* Setup a counter control state.
747  * In general a control state holds the hardware info for an
748  * EventSet.
749  */
751 {
752  SUBDBG("Entering\n");
753  (void) ctrl;
755 
756  CHECK_PRINT_EVAL(!gctxt, "Error: The PAPI CUDA component needs to be initialized first", return (PAPI_ENOINIT));
757  /* If no events were found during the initial component initialization, return error */
759  strncpy(_cuda_vector.cmp_info.disabled_reason, "ERROR CUDA: No events exist", PAPI_MAX_STR_LEN);
760  return (PAPI_EMISC);
761  }
762  /* If it does not exist, create the global structure to hold CUDA contexts and active events */
767  }
768 
769  return PAPI_OK;
770 } // end papicuda_init_control_state
771 
772 /* Triggered by eventset operations like add or remove. For CUDA, needs to be
773  * called multiple times from each separate CUDA context with the events to be
774  * measured from that context. For each context, create eventgroups for the
775  * events.
776  */
777 
778 /* Note: NativeInfo_t is defined in papi_internal.h */
780  NativeInfo_t * nativeInfo, int nativeCount, hwd_context_t * ctx)
781 {
782  SUBDBG("Entering with nativeCount %d\n", nativeCount);
783  (void) ctx;
784  papicuda_control_t *gctrl = global_papicuda_control; // We don't use the passed-in parameter, we use a global.
785  papicuda_context_t *gctxt = global_papicuda_context; // We don't use the passed-in parameter, we use a global.
786  int currDeviceNum;
787  CUcontext currCuCtx;
788  int eventContextIdx;
789  CUcontext eventCuCtx;
790  int index, ii, ee, cc;
791 
792  /* Return if no events */
793  if(nativeCount == 0)
794  return (PAPI_OK);
795 
796  /* Get deviceNum, initialize context if needed via free, get context */
797  CUDA_CALL((*cudaGetDevicePtr) (&currDeviceNum), return (PAPI_EMISC));
798  SUBDBG("currDeviceNum %d \n", currDeviceNum);
799 
800  CUDA_CALL((*cudaFreePtr) (NULL), return (PAPI_EMISC));
801  CU_CALL((*cuCtxGetCurrentPtr) (&currCuCtx), return (PAPI_EMISC));
802  SUBDBG("currDeviceNum %d cuCtx %p \n", currDeviceNum, currCuCtx);
803 
804  /* Handle user request of events to be monitored */
805  for (ii = 0; ii < nativeCount; ii++) { // For each event provided by caller,
806  index = nativeInfo[ii].ni_event; // Get the index of the event (in the global context).
807  char *eventName = gctxt->availEventDesc[index].name; // Shortcut to name.
808  int numMetricEvents= gctxt->availEventDesc[index].numMetricEvents; // Get if this is an event (=0) or metric (>0).
809  int eventDeviceNum = gctxt->availEventDeviceNum[index]; // Device number for this event.
810  (void) eventName; // Useful in checkpoint and debug, don't warn if not used.
811 
812  /* if this event is already added continue to next ii, if not, mark it as being added */
813  if (gctxt->availEventIsBeingMeasuredInEventset[index] == 1) { // If already being collected, skip it.
814  SUBDBG("Skipping event %s which is already added\n", eventName);
815  continue;
816  } else {
817  gctxt->availEventIsBeingMeasuredInEventset[index] = 1; // If not being collected yet, flag it as being collected now.
818  }
819 
820  /* Find context/control in papicuda, creating it if does not exist */
821  for(cc = 0; cc < (int) gctrl->countOfActiveCUContexts; cc++) { // Scan all active contexts.
822  CHECK_PRINT_EVAL(cc >= PAPICUDA_MAX_COUNTERS, "Exceeded hardcoded maximum number of contexts (PAPICUDA_MAX_COUNTERS)", return (PAPI_EMISC));
823 
824  if(gctrl->arrayOfActiveCUContexts[cc]->deviceNum == eventDeviceNum) { // If this cuda context is for the device for this event,
825  eventCuCtx = gctrl->arrayOfActiveCUContexts[cc]->cuCtx; // Remember that context.
826  SUBDBG("Event %s device %d already has a cuCtx %p registered\n", eventName, eventDeviceNum, eventCuCtx);
827 
828  if(eventCuCtx != currCuCtx) // If that is not our CURRENT context, push and make it so.
829  CU_CALL((*cuCtxPushCurrentPtr) (eventCuCtx), // .. Stack the current counter, replace with this one.
830  return (PAPI_EMISC)); // .. .. on failure.
831  break; // .. exit the loop.
832  } // end if found.
833  } // end loop through active contexts.
834 
835  if(cc == (int) gctrl->countOfActiveCUContexts) { // If we never found the context, create one.
836  SUBDBG("Event %s device %d does not have a cuCtx registered yet...\n", eventName, eventDeviceNum);
837  if(currDeviceNum != eventDeviceNum) { // .. If we need to switch to another device,
838  CUDA_CALL((*cudaSetDevicePtr) (eventDeviceNum), // .. .. set the device pointer to the event's device.
839  return (PAPI_EMISC)); // .. .. .. (on faiure).
840  CUDA_CALL((*cudaFreePtr) (NULL), return (PAPI_EMISC)); // .. .. This is a no-op, but used to force init of a context.
841  CU_CALL((*cuCtxGetCurrentPtr) (&eventCuCtx), // .. .. So we can get a pointer to it.
842  return (PAPI_EMISC)); // .. .. .. On failure.
843  } else { // .. If we are already on the right device,
844  eventCuCtx = currCuCtx; // .. .. just get the current context.
845  }
846 
847  gctrl->arrayOfActiveCUContexts[cc] = papi_calloc(1, sizeof(papicuda_active_cucontext_t)); // allocate a structure.
848  CHECK_PRINT_EVAL(gctrl->arrayOfActiveCUContexts[cc] == NULL, "Memory allocation for new active context failed", return (PAPI_ENOMEM));
849  gctrl->arrayOfActiveCUContexts[cc]->deviceNum = eventDeviceNum; // Fill in everything.
850  gctrl->arrayOfActiveCUContexts[cc]->cuCtx = eventCuCtx;
851  gctrl->arrayOfActiveCUContexts[cc]->allEventsCount = 0; // All events read by this context on this device.
852  gctrl->arrayOfActiveCUContexts[cc]->ctxActiveCount = 0; // active events being read by this context on this device.
853  gctrl->countOfActiveCUContexts++;
854  SUBDBG("Added a new context deviceNum %d cuCtx %p ... now countOfActiveCUContexts is %d\n", eventDeviceNum, eventCuCtx, gctrl->countOfActiveCUContexts);
855  } // end if we needed to create a new context.
856 
857  //---------------------------------------------------------------------
858  // We found the context, or created it, and the index is in cc.
859  //---------------------------------------------------------------------
860  eventContextIdx = cc;
861  papicuda_active_cucontext_t *eventctrl = gctrl->arrayOfActiveCUContexts[eventContextIdx]; // get the context for this event.
862 
863  // We need to get all the events (or sub-events of a metric) and add
864  // them to our list of all events. Note we only check if we exceed the
865  // bounds of the allEvents[] array; everything added to any other array
866  // results in at least ONE add to allEvents[], so it will fail before
867  // or coincident with any other array. TC
868 
869  CUpti_EventID itemId = gctxt->availEventIDArray[index]; // event (or metric) ID.
870 
871  if (numMetricEvents == 0) { // Dealing with a simple event.
872  eventctrl->allEvents[eventctrl->allEventsCount++] = itemId; // add to aggregate list, count it.
873  if (eventctrl->allEventsCount >= PAPICUDA_MAX_COUNTERS) { // .. Fail if we exceed size of array.
874  SUBDBG("Num events (generated by metric) exceeded PAPICUDA_MAX_COUNTERS\n");
875  return(PAPI_EINVAL);
876  }
877  } else { // dealing with a metric.
878  // cuda events and metrics have already been skipped if duplicates,
879  // but we can't say the same for sub-events of a metric. We need to
880  // check we don't duplicate them in allEvents.
881 
882  for(ee = 0; ee < numMetricEvents; ee++) { // For each event retrieved,
883  int aeIdx;
884  CUpti_EventID myId = gctxt->availEventDesc[index].metricEvents[ee]; // collect the sub-event ID.
885 
886  for (aeIdx=0; aeIdx<(int) eventctrl->allEventsCount; aeIdx++) { // loop through existing events.
887  if (eventctrl->allEvents[aeIdx] == myId) break; // break out if duplicate found.
888  }
889 
890  if (aeIdx < (int) eventctrl->allEventsCount) continue; // Don't add if already present.
891  eventctrl->allEvents[eventctrl->allEventsCount++] = myId; // add event to the all array.
892 
893  if (eventctrl->allEventsCount >= PAPICUDA_MAX_COUNTERS) { // Fail if we exceed size of array.
894  SUBDBG("Num events (generated by metric) exceeded PAPICUDA_MAX_COUNTERS\n");
895  return(PAPI_EINVAL);
896  }
897  } // end for each event in metric.
898  } // end if we must process all sub-events of a metric.
899 
900  // Record index of this active event back into the nativeInfo
901  // structure.
902 
903  nativeInfo[ii].ni_position = gctrl->activeEventCount;
904 
905  // Record index of this active event within this context. We need this
906  // so after we read this context, we can move values (or compute
907  // metrics and move values) into their proper position within the
908  // activeValues[] array.
909 
910  eventctrl->ctxActiveEvents[eventctrl->ctxActiveCount++] = // within this active_cucontext.
911  gctrl->activeEventCount; // ..
912 
913  // Record in internal gctrl arrays.
914  // so we have a succinct list of active events and metrics; this will
915  // be useful for performance especially on metrics, where we must
916  // compose values.
917 
918  CHECK_PRINT_EVAL(gctrl->activeEventCount == PAPICUDA_MAX_COUNTERS - 1, "Exceeded maximum num of events (PAPI_MAX_COUNTERS)", return (PAPI_EMISC));
919  gctrl->activeEventIndex[gctrl->activeEventCount] = index;
920  gctrl->activeEventValues[gctrl->activeEventCount] = 0;
921  gctrl->activeEventCount++;
922 
923  // EventGroupSets does an analysis to creates 'sets' of events that
924  // can be collected simultaneously, i.e. the application must be
925  // run once per set. CUpti calls these 'passes'. We don't allow
926  // such combinations, there is no way to tell a PAPI user to run
927  // their application multiple times. WITHIN a single set are
928  // EventGroups which are collected simultaneously but must be read
929  // separately because each group applies to a separate domain. So
930  // we don't mind that; but we must exit with an invalid combination
931  // if numsets > 1, indicating the most recent event requested
932  // cannot be collected simultaneously with the others.
933 
934  // We destroy any existing eventGroupSets, and then create one for the
935  // new set of events.
936 
937  SUBDBG("Create eventGroupSets for context (destroy pre-existing) (nativeCount %d, allEventsCount %d) \n", gctrl->activeEventCount, eventctrl->allEventsCount);
938  if(eventctrl->allEventsCount > 0) { // If we have events...
939  // SUBDBG("Destroy previous eventGroupPasses for the context \n");
940  if(eventctrl->eventGroupSets != NULL) { // if we have a previous analysis;
941  CUPTI_CALL((*cuptiEventGroupSetsDestroyPtr) // .. Destroy it.
942  (eventctrl->eventGroupSets), return (PAPI_EMISC)); // .. If we can't, return error.
943  eventctrl->eventGroupSets = NULL; // .. Reset pointer.
944  }
945 
946  size_t sizeBytes = (eventctrl->allEventsCount) * sizeof(CUpti_EventID); // compute bytes in the array.
947 
948  // SUBDBG("About to create eventGroupPasses for the context (sizeBytes %zu) \n", sizeBytes);
949 #ifdef PAPICUDA_KERNEL_REPLAY_MODE
950  CUPTI_CALL((*cuptiEnableKernelReplayModePtr) (eventCuCtx),
951  return (PAPI_ECMP));
952  CUPTI_CALL((*cuptiEventGroupSetsCreatePtr)
953  (eventCuCtx, sizeBytes, eventctrl->allEvents,
954  &eventctrl->eventGroupSets),
955  return (PAPI_ECMP));
956 
957 #else // Normal operation.
958  CUPTI_CALL((*cuptiSetEventCollectionModePtr)
959  (eventCuCtx,CUPTI_EVENT_COLLECTION_MODE_CONTINUOUS),
960  return(PAPI_ECMP));
961 
962 // CUPTI provides two routines to create EventGroupSets, one is used
963 // here cuptiEventGroupSetsCreate(), the other is for metrics, it will
964 // automatically collect the events needed for a metric. It is called
965 // cuptiMetricCreateEventGroupSets(). We have checked and these two routines
966 // produce groups of the same size with the same event IDs, and work equally.
967 
968  CUPTI_CALL((*cuptiEventGroupSetsCreatePtr)
969  (eventCuCtx, sizeBytes, eventctrl->allEvents,
970  &eventctrl->eventGroupSets),
971  return (PAPI_EMISC));
972 
973  if (eventctrl->eventGroupSets->numSets > 1) { // If more than one pass is required,
974  SUBDBG("Error occurred: The combined CUPTI events cannot be collected simultaneously ... try different events\n");
975  papicuda_cleanup_eventset(ctrl); // Will do cuptiEventGroupSetsDestroy() to clean up memory.
976  return(PAPI_ECOMBO);
977  } else {
978  SUBDBG("Created eventGroupSets. nativeCount %d, allEventsCount %d. Sets (passes-required) = %d) \n", gctrl->activeEventCount, eventctrl->allEventsCount, eventctrl->eventGroupSets->numSets);
979  }
980 
981 #endif // #if/#else/#endif on PAPICUDA_KERNEL_REPLAY_MODE
982 
983  } // end if we had any events.
984 
985  if(eventCuCtx != currCuCtx) // restore original context for caller, if we changed it.
986  CU_CALL((*cuCtxPopCurrentPtr) (&eventCuCtx), return (PAPI_EMISC));
987 
988  }
989  return (PAPI_OK);
990 } // end PAPI_update_control_state.
991 
992 
993 /* Triggered by PAPI_start().
994  * For CUDA component, switch to each context and start all eventgroups.
995 */
997 {
998  SUBDBG("Entering\n");
999  (void) ctx;
1000  (void) ctrl;
1002  // papicuda_context_t *gctxt = global_papicuda_context;
1003  uint32_t ii, gg, cc;
1004  int saveDeviceNum = -1;
1005 
1006  SUBDBG("Reset all active event values\n");
1007  for(ii = 0; ii < gctrl->activeEventCount; ii++) // These are the values we will return.
1008  gctrl->activeEventValues[ii] = 0;
1009 
1010  SUBDBG("Save current context, then switch to each active device/context and enable eventgroups\n");
1011  CUDA_CALL((*cudaGetDevicePtr) (&saveDeviceNum), return (PAPI_EMISC));
1012  CUPTI_CALL((*cuptiGetTimestampPtr) (&gctrl->cuptiStartTimestampNs), return (PAPI_EMISC));
1013 
1014  for(cc = 0; cc < gctrl->countOfActiveCUContexts; cc++) { // For each context,
1015  int eventDeviceNum = gctrl->arrayOfActiveCUContexts[cc]->deviceNum; // .. get device number.
1016  CUcontext eventCuCtx = gctrl->arrayOfActiveCUContexts[cc]->cuCtx; // .. get this context,
1017  SUBDBG("Set to device %d cuCtx %p \n", eventDeviceNum, eventCuCtx);
1018  if(eventDeviceNum != saveDeviceNum) { // .. If we need to switch,
1019  CU_CALL((*cuCtxPushCurrentPtr) (eventCuCtx), return (PAPI_EMISC)); // .. .. push current on stack, use this one.
1020  }
1021 
1022  CUpti_EventGroupSets *eventGroupSets = // .. Shortcut to eventGroupSets for this context.
1023  gctrl->arrayOfActiveCUContexts[cc]->eventGroupSets; // ..
1024  CUpti_EventGroupSet *groupset = &eventGroupSets->sets[0]; // .. There can be only one set of groups.
1025  for(gg = 0; gg < groupset->numEventGroups; gg++) { // .. For each group within this groupset,
1026  uint32_t one = 1;
1027  CUPTI_CALL((*cuptiEventGroupSetAttributePtr) ( // .. .. Say we want to profile all domains.
1028  groupset->eventGroups[gg],
1029  CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES,
1030  sizeof(uint32_t), &one),
1031  return (PAPI_EMISC)); // .. .. on failure of call.
1032  } // end for each group.
1033 
1034  CUPTI_CALL((*cuptiEventGroupSetEnablePtr) (groupset), // .. Enable all groups in set (start collecting).
1035  return (PAPI_EMISC)); // .. on failure of call.
1036 
1037  if(eventDeviceNum != saveDeviceNum) { // .. If we pushed a context,
1038  CU_CALL((*cuCtxPopCurrentPtr) (&eventCuCtx), return (PAPI_EMISC)); // .. Pop it.
1039  }
1040  } // end of loop on all contexts.
1041 
1042  return (PAPI_OK); // We started all groups.
1043 } // end routine.
1044 
1045 // Triggered by PAPI_read(). For CUDA component, switch to each context, read
1046 // all the eventgroups, and put the values in the correct places. Note that
1047 // parameters (ctx, ctrl, flags) are all ignored. The design of this component
1048 // doesn't pay attention to PAPI EventSets, because ONLY ONE is ever allowed
1049 // for a component. So instead of maintaining ctx and ctrl, we use global
1050 // variables to keep track of the one and only eventset. Note that **values is
1051 // where we have to give PAPI the address of an array of the values we read (or
1052 // composed).
1053 
1054 static int papicuda_read(hwd_context_t * ctx, hwd_control_state_t * ctrl, long long **values, int flags)
1055 {
1056  SUBDBG("Entering\n");
1057  (void) ctx;
1058  (void) ctrl;
1059  (void) flags;
1062  uint32_t gg, i, j, cc;
1063  int saveDeviceNum;
1064 
1065  // Get read time stamp
1066  CUPTI_CALL((*cuptiGetTimestampPtr) // Read current timestamp.
1067  (&gctrl->cuptiReadTimestampNs),
1068  return (PAPI_EMISC));
1069  uint64_t durationNs = gctrl->cuptiReadTimestampNs -
1070  gctrl->cuptiStartTimestampNs; // compute duration from start.
1071  gctrl->cuptiStartTimestampNs = gctrl->cuptiReadTimestampNs; // Change start to value just read.
1072 
1073  SUBDBG("Save current context, then switch to each active device/context and enable context-specific eventgroups\n");
1074  CUDA_CALL((*cudaGetDevicePtr) (&saveDeviceNum), return (PAPI_EMISC)); // Save Caller's current device number on entry.
1075 
1076  for(cc = 0; cc < gctrl->countOfActiveCUContexts; cc++) { // For each active context,
1077  papicuda_active_cucontext_t *activeCuCtxt =
1078  gctrl->arrayOfActiveCUContexts[cc]; // A shortcut.
1079  int currDeviceNum = activeCuCtxt->deviceNum; // Get the device number.
1080  CUcontext currCuCtx = activeCuCtxt->cuCtx; // Get the actual CUcontext.
1081 
1082  SUBDBG("Set to device %d cuCtx %p \n", currDeviceNum, currCuCtx);
1083  if(currDeviceNum != saveDeviceNum) { // If my current is not the same as callers,
1084  CU_CALL((*cuCtxPushCurrentPtr) (currCuCtx), return (PAPI_EMISC)); // .. Push the current, and replace with mine.
1085  // Note, cuCtxPushCurrent() implicitly includes a cudaSetDevice().
1086  } else { // If my current IS the same as callers,
1087  CU_CALL((*cuCtxSetCurrentPtr) (currCuCtx), return (PAPI_EMISC)); // .. No push. Just set the current.
1088  }
1089 
1090  CU_CALL((*cuCtxSynchronizePtr) (), return (PAPI_EMISC)); // Block until device finishes all prior tasks.
1091  CUpti_EventGroupSets *myEventGroupSets = activeCuCtxt->eventGroupSets; // Make a copy of pointer to EventGroupSets.
1092 
1093  uint32_t numEvents, numInstances, numTotalInstances;
1094  size_t sizeofuint32num = sizeof(uint32_t);
1095  CUpti_EventDomainID groupDomainID;
1096  size_t groupDomainIDSize = sizeof(groupDomainID);
1097  CUdevice cudevice = gctxt->deviceArray[currDeviceNum].cuDev; // Make a copy of the current device.
1098 
1099  // For each pass, we get the event groups that can be read together.
1100  // But since elsewhere, we don't allow events to be added that would
1101  // REQUIRE more than one pass, this will always be just ONE pass. So we
1102  // only need to loop over the groups.
1103 
1104  CUpti_EventGroupSet *groupset = &myEventGroupSets->sets[0]; // The one and only set.
1105  SUBDBG("Read events in this context\n");
1106  int AEIdx = 0; // we will be over-writing the allEvents array.
1107 
1108  for (gg = 0; gg < groupset->numEventGroups; gg++) { // process each eventgroup within the groupset.
1109  CUpti_EventGroup group = groupset->eventGroups[gg]; // Shortcut to the group.
1110 
1111  CUPTI_CALL((*cuptiEventGroupGetAttributePtr) // Get 'groupDomainID' for this group.
1112  (group, CUPTI_EVENT_GROUP_ATTR_EVENT_DOMAIN_ID,
1113  &groupDomainIDSize, &groupDomainID),
1114  return (PAPI_EMISC));
1115 
1116  // 'numTotalInstances' and 'numInstances are needed for scaling
1117  // the values retrieved. (Nvidia instructions and samples).
1118  CUPTI_CALL((*cuptiDeviceGetEventDomainAttributePtr) // Get 'numTotalInstances' for this domain.
1119  (cudevice,
1120  groupDomainID,
1121  CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT,
1122  &sizeofuint32num,
1123  &numTotalInstances),
1124  return (PAPI_EMISC));
1125 
1126  CUPTI_CALL((*cuptiEventGroupGetAttributePtr) // Get 'numInstances' for this domain.
1127  (group,
1128  CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT,
1129  &sizeofuint32num,
1130  &numInstances),
1131  return (PAPI_EMISC));
1132 
1133  CUPTI_CALL((*cuptiEventGroupGetAttributePtr) // Get 'numEvents' in this group.
1134  (group,
1135  CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS,
1136  &sizeofuint32num,
1137  &numEvents),
1138  return (PAPI_EMISC));
1139 
1140  // Now we will read all events in this group; aggregate the values
1141  // and then distribute them. We do not calculate metrics here;
1142  // wait until all groups are read and all values are available.
1143 
1144  size_t resultArrayBytes = sizeof(uint64_t) * numEvents * numTotalInstances;
1145  size_t eventIdArrayBytes = sizeof(CUpti_EventID) * numEvents;
1146  size_t numCountersRead = 2;
1147 
1148  CUpti_EventID *eventIdArray = (CUpti_EventID *) papi_malloc(eventIdArrayBytes);
1149  uint64_t *resultArray = (uint64_t *) papi_malloc(resultArrayBytes);
1150  uint64_t *aggrResultArray = (uint64_t *) papi_calloc(numEvents, sizeof(uint64_t));
1151 
1152  for (i=0; i<(resultArrayBytes/sizeof(uint64_t)); i++) resultArray[i]=0;
1153 
1154  if (eventIdArray == NULL || resultArray == NULL || aggrResultArray == NULL) {
1155  fprintf(stderr, "%s:%i failed to allocate memory.\n", __FILE__, __LINE__);
1156  return(PAPI_EMISC);
1157  }
1158 
1159  CUPTI_CALL( (*cuptiEventGroupReadAllEventsPtr) // Read all events.
1160  (group, CUPTI_EVENT_READ_FLAG_NONE, // This flag is the only allowed flag.
1161  &resultArrayBytes, resultArray,
1162  &eventIdArrayBytes, eventIdArray,
1163  &numCountersRead),
1164  return (PAPI_EMISC));
1165 
1166  // Now (per Nvidia) we must sum up all domains for each event.
1167  // Arrangement of 2-d Array returned in resultArray:
1168  // domain instance 0: event0 event1 ... eventN
1169  // domain instance 1: event0 event1 ... eventN
1170  // ...
1171  // domain instance M: event0 event1 ... eventN
1172  // But we accumulate by column, event[0], event[1], etc.
1173 
1174  for (i = 0; i < numEvents; i++) { // outer loop is column (event) we are on.
1175  for (j = 0; j < numTotalInstances; j++) { // inner loop is row (instance) we are on.
1176  aggrResultArray[i] += resultArray[i + numEvents * j]; // accumulate the column.
1177  }
1178  }
1179 
1180  // We received an eventIdArray; note this is not necessarily in the
1181  // same order as we added them; CUpti can reorder them when sorting
1182  // them into groups. However, the total number of events must be
1183  // the same, so now as we read each group, we just overwrite the
1184  // allEvents[] and allEventValues[] arrays. It doesn't make a
1185  // difference to cuptiGetMetricValue what order the events appear
1186  // in.
1187 
1188  // After all these groups are read, allEvents will be complete, and
1189  // we can use it to compute the metrics and move metric and event
1190  // values back into user order.
1191 
1192  for (i=0; i<numEvents; i++) { // For each event in eventIdArray (just this group),
1193  CUpti_EventID myId = eventIdArray[i]; // shortcut for the event id within this group.
1194  activeCuCtxt->allEvents[AEIdx] = myId; // Overwrite All Events id.
1195  activeCuCtxt->allEventValues[AEIdx++] = aggrResultArray[i]; // Overwrite all events value; increment position.
1196  } // end loop for each event.
1197 
1198  papi_free(eventIdArray);
1199  papi_free(resultArray);
1200  papi_free(aggrResultArray);
1201  } // end of an event group.
1202 
1203  // We have finished all event groups within this context; allEvents[]
1204  // and allEventValues[] are populated. Now we compute metrics and move
1205  // event values. We do that by looping through the events assigned to
1206  // this context, and we must back track to the activeEventIdx[] and
1207  // activeEventValues[] array in gctrl. We have kept our indexes into
1208  // that array, in ctxActive[].
1209 
1210  uint32_t ctxActiveCount = activeCuCtxt->ctxActiveCount; // Number of (papi user) events in this context.
1211  uint32_t *ctxActive = activeCuCtxt->ctxActiveEvents; // index of each event in gctrl->activeEventXXXX.
1212 
1213  for (j=0; j<ctxActiveCount; j++) { // Search for matching active event.
1214  uint32_t activeIdx, availIdx;
1215 
1216  activeIdx=ctxActive[j]; // get index into activeEventIdx.
1217  availIdx = gctrl->activeEventIndex[activeIdx]; // Get the availEventIdx.
1218  CUpti_EventID thisEventId = gctxt->availEventIDArray[availIdx]; // Get the event ID (or metric ID).
1219  struct papicuda_name_desc *myDesc=&(gctxt->availEventDesc[availIdx]); // get pointer to the description.
1220 
1221  if (myDesc->numMetricEvents == 0) { // If this is a simple cuda event (not a metric),
1222  int k;
1223  for (k=0; k<AEIdx; k++) { // search the array for this event id.
1224  if (activeCuCtxt->allEvents[k] == thisEventId) { // If I found the event,
1225  gctrl->activeEventValues[activeIdx] = // Record the value,
1226  activeCuCtxt->allEventValues[k];
1227  break; // break out of the search loop.
1228  } // end if I found it.
1229  } // end search loop.
1230 
1231  continue; // Jump to next in ctxActiveCount.
1232  } else { // If I found a metric, I must compute it.
1233  CUpti_MetricValue myValue; // Space for a return.
1234  CUPTI_CALL( (*cuptiMetricGetValue) // Get the value,
1235  (cudevice, thisEventId, // device and metric Id,
1236  AEIdx * sizeof(CUpti_EventID), // size of event list,
1237  activeCuCtxt->allEvents, // the event list.
1238  AEIdx * sizeof(uint64_t), // size of corresponding event values,
1239  activeCuCtxt->allEventValues, // the event values.
1240  durationNs, &myValue), // duration (for rates), and where to return the value.
1241  return(PAPI_EMISC)); // In case of error.
1242 
1243  papicuda_convert_metric_value_to_long_long( // convert the value computed to long long and store it.
1244  myValue, myDesc->MV_Kind,
1245  &gctrl->activeEventValues[activeIdx]);
1246  }
1247  } // end loop on active events in this context.
1248 
1249  if(currDeviceNum != saveDeviceNum) { // If we had to change the context from user's,
1250  CUDA_CALL((*cudaSetDevicePtr) (saveDeviceNum), // set the device pointer to the user's original.
1251  return (PAPI_EMISC)); // .. .. (on faiure).
1252  CU_CALL((*cuCtxPopCurrentPtr) (&currCuCtx), return (PAPI_EMISC)); // .. pop the pushed context back to user's.
1253  }
1254  } // end of loop for each active context.
1255 
1256  *values = gctrl->activeEventValues; // Return ptr to the list of computed values to user.
1257  return (PAPI_OK);
1258 } // end of papicuda_read().
1259 
1260 /* Triggered by PAPI_stop() */
1262 {
1263  SUBDBG("Entering\n");
1264  (void) ctx;
1265  (void) ctrl;
1267  uint32_t cc, ss;
1268  int saveDeviceNum;
1269 
1270  SUBDBG("Save current context, then switch to each active device/context and enable eventgroups\n");
1271  CUDA_CALL((*cudaGetDevicePtr) (&saveDeviceNum), return (PAPI_EMISC));
1272  for(cc = 0; cc < gctrl->countOfActiveCUContexts; cc++) {
1273  int currDeviceNum = gctrl->arrayOfActiveCUContexts[cc]->deviceNum;
1274  CUcontext currCuCtx = gctrl->arrayOfActiveCUContexts[cc]->cuCtx;
1275  SUBDBG("Set to device %d cuCtx %p \n", currDeviceNum, currCuCtx);
1276  if(currDeviceNum != saveDeviceNum)
1277  CU_CALL((*cuCtxPushCurrentPtr) (currCuCtx), return (PAPI_EMISC));
1278  else
1279  CU_CALL((*cuCtxSetCurrentPtr) (currCuCtx), return (PAPI_EMISC));
1280  CUpti_EventGroupSets *currEventGroupSets = gctrl->arrayOfActiveCUContexts[cc]->eventGroupSets;
1281  for (ss=0; ss<currEventGroupSets->numSets; ss++) { // For each group in the set,
1282  CUpti_EventGroupSet groupset = currEventGroupSets->sets[ss]; // get the set,
1283  CUPTI_CALL((*cuptiEventGroupSetDisablePtr) (&groupset), // disable the whole set.
1284  return (PAPI_EMISC)); // .. on failure.
1285  }
1286  /* Pop the pushed context */
1287  if(currDeviceNum != saveDeviceNum)
1288  CU_CALL((*cuCtxPopCurrentPtr) (&currCuCtx), return (PAPI_EMISC));
1289 
1290  }
1291  return (PAPI_OK);
1292 } // end of papicuda_stop.
1293 
1294 
1295 /*
1296  * Disable and destroy the CUDA eventGroup
1297  */
1299 {
1300  SUBDBG("Entering\n");
1301  (void) ctrl; // Don't need this parameter.
1304  // papicuda_active_cucontext_t *currctrl;
1305  uint32_t cc;
1306  int saveDeviceNum;
1307  unsigned int ui;
1308 
1309  SUBDBG("Save current context, then switch to each active device/context and enable eventgroups\n");
1310  CUDA_CALL((*cudaGetDevicePtr) (&saveDeviceNum), return (PAPI_EMISC));
1311  for(cc = 0; cc < gctrl->countOfActiveCUContexts; cc++) {
1312  CUcontext currCuCtx = gctrl->arrayOfActiveCUContexts[cc]->cuCtx;
1313  int currDeviceNum = gctrl->arrayOfActiveCUContexts[cc]->deviceNum;
1314  CUpti_EventGroupSets *currEventGroupSets = gctrl->arrayOfActiveCUContexts[cc]->eventGroupSets;
1315  if(currDeviceNum != saveDeviceNum)
1316  CU_CALL((*cuCtxPushCurrentPtr) (currCuCtx), return (PAPI_EMISC));
1317  else
1318  CU_CALL((*cuCtxSetCurrentPtr) (currCuCtx), return (PAPI_EMISC));
1319  //CUPTI_CALL((*cuptiEventGroupSetsDestroyPtr) (currEventGroupPasses), return (PAPI_EMISC));
1320  (*cuptiEventGroupSetsDestroyPtr) (currEventGroupSets);
1321  gctrl->arrayOfActiveCUContexts[cc]->eventGroupSets = NULL;
1322  papi_free( gctrl->arrayOfActiveCUContexts[cc] );
1323  /* Pop the pushed context */
1324  if(currDeviceNum != saveDeviceNum)
1325  CU_CALL((*cuCtxPopCurrentPtr) (&currCuCtx), return (PAPI_EMISC));
1326  }
1327  /* Record that there are no active contexts or events */
1328  for (ui=0; ui<gctrl->activeEventCount; ui++) { // For each active event,
1329  int idx = gctrl->activeEventIndex[ui]; // .. Get its index...
1330  gctxt->availEventIsBeingMeasuredInEventset[idx] = 0; // .. No longer being measured.
1331  }
1332 
1333  gctrl->countOfActiveCUContexts = 0;
1334  gctrl->activeEventCount = 0;
1335  return (PAPI_OK);
1336 } // end papicuda_cleanup_eventset
1337 
1338 
1339 /* Called at thread shutdown. Does nothing in the CUDA component. */
1341 {
1342  SUBDBG("Entering\n");
1343  (void) ctx;
1344 
1345  return (PAPI_OK);
1346 }
1347 
1348 /* Triggered by PAPI_shutdown() and frees memory allocated in the CUDA component. */
1350 {
1351  SUBDBG("Entering\n");
1354  int deviceNum;
1355  uint32_t i, cc;
1356  /* Free context */
1357  if(gctxt) {
1358  for(deviceNum = 0; deviceNum < gctxt->deviceCount; deviceNum++) {
1359  papicuda_device_desc_t *mydevice = &gctxt->deviceArray[deviceNum];
1360  papi_free(mydevice->domainIDArray);
1361  papi_free(mydevice->domainIDNumEvents);
1362  }
1363 
1364  for (i=0; i<gctxt->availEventSize; i++) { // For every event in this context,
1365  struct papicuda_name_desc *desc = &(gctxt->availEventDesc[i]); // get a name description.
1366  if (desc->numMetricEvents > 0) { // If we have any sub-events,
1367  papi_free(desc->metricEvents); // .. Free the list of sub-events.
1368  }
1369  } // end for every available event.
1370 
1371  papi_free(gctxt->availEventIDArray);
1373  papi_free(gctxt->availEventKind);
1375  papi_free(gctxt->availEventDesc);
1376  papi_free(gctxt->deviceArray);
1377  papi_free(gctxt);
1378  global_papicuda_context = gctxt = NULL;
1379  }
1380  /* Free control */
1381  if(gctrl) {
1382  for(cc = 0; cc < gctrl->countOfActiveCUContexts; cc++) {
1383 #ifdef PAPICUDA_KERNEL_REPLAY_MODE
1384  CUcontext currCuCtx = gctrl->arrayOfActiveCUContexts[cc]->cuCtx;
1385  CUPTI_CALL((*cuptiDisableKernelReplayModePtr) (currCuCtx), return (PAPI_EMISC));
1386 #endif
1387  if(gctrl->arrayOfActiveCUContexts[cc] != NULL)
1388  papi_free(gctrl->arrayOfActiveCUContexts[cc]);
1389  }
1390  papi_free(gctrl);
1391  global_papicuda_control = gctrl = NULL;
1392  }
1393  // close the dynamic libraries needed by this component (opened in the init substrate call)
1394  dlclose(dl1);
1395  dlclose(dl2);
1396  dlclose(dl3);
1397  return (PAPI_OK);
1398 } // end papicuda_shutdown_component().
1399 
1400 
1401 /* Triggered by PAPI_reset() but only if the EventSet is currently
1402  * running. If the eventset is not currently running, then the saved
1403  * value in the EventSet is set to zero without calling this
1404  * routine. */
1406 {
1407  (void) ctx;
1408  (void) ctrl;
1410  uint32_t gg, ii, cc, ss;
1411  int saveDeviceNum;
1412 
1413  SUBDBG("Reset all active event values\n");
1414  for(ii = 0; ii < gctrl->activeEventCount; ii++)
1415  gctrl->activeEventValues[ii] = 0;
1416 
1417  SUBDBG("Save current context, then switch to each active device/context and reset\n");
1418  CUDA_CALL((*cudaGetDevicePtr) (&saveDeviceNum), return (PAPI_EMISC));
1419  for(cc = 0; cc < gctrl->countOfActiveCUContexts; cc++) {
1420  CUcontext currCuCtx = gctrl->arrayOfActiveCUContexts[cc]->cuCtx;
1421  int currDeviceNum = gctrl->arrayOfActiveCUContexts[cc]->deviceNum;
1422  if(currDeviceNum != saveDeviceNum)
1423  CU_CALL((*cuCtxPushCurrentPtr) (currCuCtx), return (PAPI_EMISC));
1424  else
1425  CU_CALL((*cuCtxSetCurrentPtr) (currCuCtx), return (PAPI_EMISC));
1426  CUpti_EventGroupSets *currEventGroupSets = gctrl->arrayOfActiveCUContexts[cc]->eventGroupSets;
1427  for (ss=0; ss<currEventGroupSets->numSets; ss++) {
1428  CUpti_EventGroupSet groupset = currEventGroupSets->sets[ss];
1429  for(gg = 0; gg < groupset.numEventGroups; gg++) {
1430  CUpti_EventGroup group = groupset.eventGroups[gg];
1431  CUPTI_CALL((*cuptiEventGroupResetAllEventsPtr) (group), return (PAPI_EMISC));
1432  }
1433  CUPTI_CALL((*cuptiEventGroupSetEnablePtr) (&groupset), return (PAPI_EMISC));
1434  }
1435  if(currDeviceNum != saveDeviceNum)
1436  CU_CALL((*cuCtxPopCurrentPtr) (&currCuCtx), return (PAPI_EMISC));
1437  }
1438  return (PAPI_OK);
1439 } // end papicuda_reset().
1440 
1441 
1442 /* This function sets various options in the component - Does nothing in the CUDA component.
1443  @param[in] ctx -- hardware context
1444  @param[in] code valid are PAPI_SET_DEFDOM, PAPI_SET_DOMAIN, PAPI_SETDEFGRN, PAPI_SET_GRANUL and PAPI_SET_INHERIT
1445  @param[in] option -- options to be set
1446 */
1447 static int papicuda_ctrl(hwd_context_t * ctx, int code, _papi_int_option_t * option)
1448 {
1449  SUBDBG("Entering\n");
1450  (void) ctx;
1451  (void) code;
1452  (void) option;
1453  return (PAPI_OK);
1454 }
1455 
1456 /*
1457  * This function has to set the bits needed to count different domains
1458  * In particular: PAPI_DOM_USER, PAPI_DOM_KERNEL PAPI_DOM_OTHER
1459  * By default return PAPI_EINVAL if none of those are specified
1460  * and PAPI_OK with success
1461  * PAPI_DOM_USER is only user context is counted
1462  * PAPI_DOM_KERNEL is only the Kernel/OS context is counted
1463  * PAPI_DOM_OTHER is Exception/transient mode (like user TLB misses)
1464  * PAPI_DOM_ALL is all of the domains
1465  */
1466 static int papicuda_set_domain(hwd_control_state_t * ctrl, int domain)
1467 {
1468  SUBDBG("Entering\n");
1469  (void) ctrl;
1470  if((PAPI_DOM_USER & domain) || (PAPI_DOM_KERNEL & domain) || (PAPI_DOM_OTHER & domain) || (PAPI_DOM_ALL & domain))
1471  return (PAPI_OK);
1472  else
1473  return (PAPI_EINVAL);
1474  return (PAPI_OK);
1475 }
1476 
1477 
1478 /* Enumerate Native Events.
1479  * @param EventCode is the event of interest
1480  * @param modifier is one of PAPI_ENUM_FIRST, PAPI_ENUM_EVENTS
1481  */
1482 static int papicuda_ntv_enum_events(unsigned int *EventCode, int modifier)
1483 {
1484  // SUBDBG( "Entering (get next event after %u)\n", *EventCode );
1485  switch (modifier) {
1486  case PAPI_ENUM_FIRST:
1487  *EventCode = 0;
1488  return (PAPI_OK);
1489  break;
1490  case PAPI_ENUM_EVENTS:
1491  if(*EventCode < global_papicuda_context->availEventSize - 1) {
1492  *EventCode = *EventCode + 1;
1493  return (PAPI_OK);
1494  } else
1495  return (PAPI_ENOEVNT);
1496  break;
1497  default:
1498  return (PAPI_EINVAL);
1499  }
1500  return (PAPI_OK);
1501 }
1502 
1503 
1504 /* Takes a native event code and passes back the name
1505  * @param EventCode is the native event code
1506  * @param name is a pointer for the name to be copied to
1507  * @param len is the size of the name string
1508  */
1509 static int papicuda_ntv_code_to_name(unsigned int EventCode, char *name, int len)
1510 {
1511  // SUBDBG( "Entering EventCode %d\n", EventCode );
1512  unsigned int index = EventCode;
1514  if(index < gctxt->availEventSize) {
1515  strncpy(name, gctxt->availEventDesc[index].name, len);
1516  } else {
1517  return (PAPI_EINVAL);
1518  }
1519  // SUBDBG( "Exit: EventCode %d: Name %s\n", EventCode, name );
1520  return (PAPI_OK);
1521 }
1522 
1523 
1524 /* Takes a native event code and passes back the event description
1525  * @param EventCode is the native event code
1526  * @param descr is a pointer for the description to be copied to
1527  * @param len is the size of the descr string
1528  */
1529 static int papicuda_ntv_code_to_descr(unsigned int EventCode, char *name, int len)
1530 {
1531  // SUBDBG( "Entering\n" );
1532  unsigned int index = EventCode;
1534  if(index < gctxt->availEventSize) {
1535  strncpy(name, gctxt->availEventDesc[index].description, len);
1536  } else {
1537  return (PAPI_EINVAL);
1538  }
1539  return (PAPI_OK);
1540 }
1541 
1542 
1543 /* Vector that points to entry points for the component */
1545  .cmp_info = {
1546  /* default component information (unspecified values are initialized to 0) */
1547  .name = "cuda",
1548  .short_name = "cuda",
1549  .version = "5.1",
1550  .description = "CUDA events and metrics via NVIDIA CuPTI interfaces",
1551  .num_mpx_cntrs = PAPICUDA_MAX_COUNTERS,
1552  .num_cntrs = PAPICUDA_MAX_COUNTERS,
1553  .default_domain = PAPI_DOM_USER,
1554  .default_granularity = PAPI_GRN_THR,
1555  .available_granularities = PAPI_GRN_THR,
1556  .hardware_intr_sig = PAPI_INT_SIGNAL,
1557  /* component specific cmp_info initializations */
1558  .fast_real_timer = 0,
1559  .fast_virtual_timer = 0,
1560  .attach = 0,
1561  .attach_must_ptrace = 0,
1562  .available_domains = PAPI_DOM_USER | PAPI_DOM_KERNEL,
1563  }
1564  ,
1565  /* sizes of framework-opaque component-private structures... these are all unused in this component */
1566  .size = {
1567  .context = 1, /* sizeof( papicuda_context_t ), */
1568  .control_state = 1, /* sizeof( papicuda_control_t ), */
1569  .reg_value = 1, /* sizeof( papicuda_register_t ), */
1570  .reg_alloc = 1, /* sizeof( papicuda_reg_alloc_t ), */
1571  }
1572  ,
1573  /* function pointers in this component */
1574  .start = papicuda_start, /* ( hwd_context_t * ctx, hwd_control_state_t * ctrl ) */
1575  .stop = papicuda_stop, /* ( hwd_context_t * ctx, hwd_control_state_t * ctrl ) */
1576  .read = papicuda_read, /* ( hwd_context_t * ctx, hwd_control_state_t * ctrl, long_long ** events, int flags ) */
1577  .reset = papicuda_reset, /* ( hwd_context_t * ctx, hwd_control_state_t * ctrl ) */
1578  .cleanup_eventset = papicuda_cleanup_eventset, /* ( hwd_control_state_t * ctrl ) */
1579 
1580  .init_component = papicuda_init_component, /* ( int cidx ) */
1581  .init_thread = papicuda_init_thread, /* ( hwd_context_t * ctx ) */
1582  .init_control_state = papicuda_init_control_state, /* ( hwd_control_state_t * ctrl ) */
1583  .update_control_state = papicuda_update_control_state, /* ( hwd_control_state_t * ptr, NativeInfo_t * native, int count, hwd_context_t * ctx ) */
1584 
1585  .ctl = papicuda_ctrl, /* ( hwd_context_t * ctx, int code, _papi_int_option_t * option ) */
1586  .set_domain = papicuda_set_domain, /* ( hwd_control_state_t * cntrl, int domain ) */
1587  .ntv_enum_events = papicuda_ntv_enum_events, /* ( unsigned int *EventCode, int modifier ) */
1588  .ntv_code_to_name = papicuda_ntv_code_to_name, /* ( unsigned int EventCode, char *name, int len ) */
1589  .ntv_code_to_descr = papicuda_ntv_code_to_descr, /* ( unsigned int EventCode, char *name, int len ) */
1590  .shutdown_thread = papicuda_shutdown_thread, /* ( hwd_context_t * ctx ) */
1591  .shutdown_component = papicuda_shutdown_component, /* ( void ) */
1592 };
1593 
1594 //-------------------------------------------------------------------------------------------------
1595 // This routine is an adaptation from 'readMetricValue' in nvlink_bandwidth_cupti_only.cu; where
1596 // it is shown to work. Note that a metric can consist of more than one event, so the number of
1597 // events and the number of metrics does not have to match.
1598 // 'eventGroup' should contain the events needed to read the
1599 // 'numEvents' is the number of events needed to read to compute the metrics.
1600 // 'metricId' is the array of METRICS, and
1601 // 'numMetrics" is the number of them, and also applies to the arrays 'values' and 'myKinds'.
1602 // 'dev is the CUDevice needed to compute the metric. We don't need to switch the context, that is
1603 // already done by the caller so we are pointing at the correct context.
1604 //-------------------------------------------------------------------------------------------------
1605 void readMetricValue(CUpti_EventGroup eventGroup,
1606  uint32_t numEvents, // array COLS in results,
1607  uint64_t numTotalInstances, // array ROWS in results,
1608  CUdevice dev, // current Device structure.
1609  uint32_t numMetrics,
1610  CUpti_MetricID *metricId,
1611  CUpti_MetricValueKind *myKinds,
1612  long long int *values,
1613  uint64_t timeDuration)
1614 {
1615  size_t bufferSizeBytes, numCountersRead;
1616  uint64_t *eventValueArray = NULL;
1617  CUpti_EventID *eventIdArray;
1618  size_t arraySizeBytes = 0;
1619  uint64_t *aggrEventValueArray = NULL;
1620  size_t aggrEventValueArraySize;
1621  uint32_t i = 0, j = 0;
1622 
1623  arraySizeBytes = sizeof(CUpti_EventID) * numEvents;
1624  bufferSizeBytes = sizeof(uint64_t) * numEvents * numTotalInstances;
1625 
1626  eventValueArray = (uint64_t *) malloc(bufferSizeBytes);
1627 
1628  eventIdArray = (CUpti_EventID *) malloc(arraySizeBytes);
1629 
1630  aggrEventValueArray = (uint64_t *) calloc(numEvents, sizeof(uint64_t));
1631 
1632  aggrEventValueArraySize = sizeof(uint64_t) * numEvents;
1633 
1634  CUPTI_CALL( (*cuptiEventGroupReadAllEvents)
1635  (eventGroup, CUPTI_EVENT_READ_FLAG_NONE, &bufferSizeBytes,
1636  eventValueArray, &arraySizeBytes, eventIdArray, &numCountersRead),
1637  return);
1638 
1639  // Arrangement of 2-d Array returned in eventValueArray:
1640  // domain instance 0: event0 event1 ... eventN
1641  // domain instance 1: event0 event1 ... eventN
1642  // ...
1643  // domain instance M: event0 event1 ... eventN
1644  // But we accumulate by column, event[0], event[1], etc.
1645 
1646  for (i = 0; i < numEvents; i++) { // outer loop is column (event) we are on.
1647  for (j = 0; j < numTotalInstances; j++) { // inner loop is row (instance) we are on.
1648  aggrEventValueArray[i] += eventValueArray[i + numEvents * j];
1649  }
1650  }
1651 
1652  // After aggregation, we use the data to compose the metrics.
1653  for (i = 0; i < numMetrics; i++) {
1654  CUpti_MetricValue metricValue;
1655  CUPTI_CALL( (*cuptiMetricGetValue)
1656  (dev, metricId[i], arraySizeBytes, eventIdArray,
1657  aggrEventValueArraySize, aggrEventValueArray,
1658  timeDuration, &metricValue),
1659  return);
1660 
1661  papicuda_convert_metric_value_to_long_long(metricValue, myKinds[i], &values[i]);
1662  }
1663 
1664  free(eventValueArray);
1665  free(eventIdArray);
1666 } // end readMetricValue.
1667 
1668 
#define PAPI_OK
Definition: fpapi.h:105
char disabled_reason[PAPI_MAX_STR_LEN]
Definition: papi.h:637
CUpti_MetricValueKind MV_Kind
Definition: linux-cuda.c:60
static papicuda_control_t * global_papicuda_control
Definition: linux-cuda.c:121
static int papicuda_start(hwd_context_t *ctx, hwd_control_state_t *ctrl)
Definition: linux-cuda.c:996
#define PAPI_ENOMEM
Definition: fpapi.h:107
static const char * name
Definition: fork_overflow.c:31
static int papicuda_init_thread(hwd_context_t *ctx)
Definition: linux-cuda.c:693
uint64_t allEventValues[PAPICUDA_MAX_COUNTERS]
Definition: linux-cuda.c:90
#define PAPI_DOM_KERNEL
Definition: fpapi.h:22
#define PAPI_EINVAL
Definition: fpapi.h:106
uint64_t cuptiStartTimestampNs
Definition: linux-cuda.c:105
#define PAPI_GRN_THR
Definition: fpapi.h:67
static papicuda_context_t * global_papicuda_context
Definition: linux-cuda.c:118
static int papicuda_update_control_state(hwd_control_state_t *ctrl, NativeInfo_t *nativeInfo, int nativeCount, hwd_context_t *ctx)
Definition: linux-cuda.c:779
#define PAPICUDA_MAX_COUNTERS
Definition: linux-cuda.c:37
uint32_t * domainIDNumEvents
Definition: linux-cuda.c:70
#define papi_free(a)
Definition: papi_memory.h:35
uint32_t * availEventIDArray
Definition: linux-cuda.c:49
#define PAPI_ENOSUPP
Definition: fpapi.h:123
#define PAPI_ECOMBO
Definition: fpapi.h:129
#define papi_malloc(a)
Definition: papi_memory.h:34
int * availEventDeviceNum
Definition: linux-cuda.c:48
int cudaSetDevice(int devnum, int n1, int n2, int n3, void *ptr1)
#define CU_CALL(call, handleerror)
Definition: linux-cuda.c:142
static int papicuda_set_domain(hwd_control_state_t *ctrl, int domain)
Definition: linux-cuda.c:1466
static int papicuda_ntv_code_to_name(unsigned int EventCode, char *name, int len)
Definition: linux-cuda.c:1509
#define DLSYM_AND_CHECK(dllib, name)
static int papicuda_read(hwd_context_t *ctx, hwd_control_state_t *ctrl, long long **values, int flags)
Definition: linux-cuda.c:1054
PAPI_component_info_t cmp_info
Definition: papi_vector.h:20
static int papicuda_convert_metric_value_to_long_long(CUpti_MetricValue metricValue, CUpti_MetricValueKind valueKind, long long int *papiValue)
Definition: linux-cuda.c:637
static FILE * fp
Return codes and api definitions.
static void * dl1
Definition: linux-cuda.c:110
#define CHECK_PRINT_EVAL(checkcond, str, evalthis)
Definition: linux-cuda.c:124
#define PAPI_DOM_OTHER
Definition: fpapi.h:23
#define PAPI_ECMP
Definition: fpapi.h:109
struct papicuda_name_desc * availEventDesc
Definition: linux-cuda.c:51
#define PAPI_2MAX_STR_LEN
Definition: papi.h:467
static int cidx
static void * dl2
Definition: linux-cuda.c:111
CUresult CUDAAPI cuInit(unsigned int myInt)
Definition: benchSANVML.c:48
CUpti_EventDomainID * domainIDArray
Definition: linux-cuda.c:69
#define PAPI_EMISC
Definition: fpapi.h:119
static int papicuda_stop(hwd_context_t *ctx, hwd_control_state_t *ctrl)
Definition: linux-cuda.c:1261
int one
CUpti_EventID allEvents[PAPICUDA_MAX_COUNTERS]
Definition: linux-cuda.c:89
static int papicuda_reset(hwd_context_t *ctx, hwd_control_state_t *ctrl)
Definition: linux-cuda.c:1405
static int papicuda_cleanup_eventset(hwd_control_state_t *ctrl)
Definition: linux-cuda.c:1298
#define PAPI_DOM_USER
Definition: fpapi.h:21
#define PAPI_MIN_STR_LEN
Definition: fpapi.h:41
__attribute__((constructor))
Definition: init_fini.c:12
papicuda_active_cucontext_t * arrayOfActiveCUContexts[PAPICUDA_MAX_COUNTERS]
Definition: linux-cuda.c:100
uint32_t availEventSize
Definition: linux-cuda.c:46
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
void(* _dl_non_dynamic_init)(void)
Definition: linux-cuda.c:188
#define CUDA_CALL(call, handleerror)
Definition: linux-cuda.c:133
static int papicuda_ntv_enum_events(unsigned int *EventCode, int modifier)
Definition: linux-cuda.c:1482
#define DECLARECUFUNC(funcname, funcsig)
static void * dl3
Definition: linux-cuda.c:112
char name[PAPI_MAX_STR_LEN]
Definition: papi.h:630
int papicuda_shutdown_thread(hwd_context_t *ctx)
Definition: linux-cuda.c:1340
#define PAPI_INT_SIGNAL
Definition: papi_internal.h:53
papi_vector_t _cuda_vector
Definition: linux-cuda.c:115
CUpti_EventGroupSets * eventGroupSets
Definition: linux-cuda.c:92
static int papicuda_shutdown_component(void)
Definition: linux-cuda.c:1349
static int papicuda_ntv_code_to_descr(unsigned int EventCode, char *name, int len)
Definition: linux-cuda.c:1529
struct papicuda_device_desc * deviceArray
Definition: linux-cuda.c:45
static int papicuda_add_native_events(papicuda_context_t *gctxt)
Definition: linux-cuda.c:333
uint32_t countOfActiveCUContexts
Definition: linux-cuda.c:99
static int papicuda_init_control_state(hwd_control_state_t *ctrl)
Definition: linux-cuda.c:750
CUresult CUDAAPI(* cuInitPtr)(unsigned int)
Definition: benchSANVML.c:47
CUpti_EventID * metricEvents
Definition: linux-cuda.c:59
static int papicuda_ctrl(hwd_context_t *ctx, int code, _papi_int_option_t *option)
Definition: linux-cuda.c:1447
long long activeEventValues[PAPICUDA_MAX_COUNTERS]
Definition: linux-cuda.c:103
uint16_t numMetricEvents
Definition: linux-cuda.c:58
#define PAPI_ENOEVNT
Definition: fpapi.h:112
#define DECLARECUPTIFUNC(funcname, funcsig)
char deviceName[PAPI_MIN_STR_LEN]
Definition: linux-cuda.c:67
static int papicuda_init_component(int cidx)
Definition: linux-cuda.c:715
#define PAPI_ENOINIT
Definition: fpapi.h:121
void readMetricValue(CUpti_EventGroup eventGroup, uint32_t numEvents, uint64_t numTotalInstances, CUdevice dev, uint32_t numMetrics, CUpti_MetricID *metricId, CUpti_MetricValueKind *myKinds, long long int *values, uint64_t timeDuration)
Definition: linux-cuda.c:1605
CUpti_ActivityKind * availEventKind
Definition: linux-cuda.c:47
static long long values[NUM_EVENTS]
Definition: init_fini.c:10
int activeEventIndex[PAPICUDA_MAX_COUNTERS]
Definition: linux-cuda.c:102
cudaError_t CUDARTAPI cudaGetDevice(int *dest)
Definition: benchSANVML.c:57
uint32_t activeEventCount
Definition: linux-cuda.c:101
void exit()
#define PAPI_DOM_ALL
Definition: fpapi.h:25
uint32_t * availEventIsBeingMeasuredInEventset
Definition: linux-cuda.c:50
uint32_t ctxActiveEvents[PAPICUDA_MAX_COUNTERS]
Definition: linux-cuda.c:86
uint64_t cuptiReadTimestampNs
Definition: linux-cuda.c:106
#define papi_calloc(a, b)
Definition: papi_memory.h:37
#define CUPTI_CALL(call, handleerror)
Definition: linux-cuda.c:153
cudaError_t(* cudaGetDevicePtr)(int *)
Definition: benchSANVML.c:53
int i
Definition: fileop.c:140
#define DECLARECUDAFUNC(funcname, funcsig)
#define PAPI_MAX_STR_LEN
Definition: fpapi.h:43