29 #include <cuda_runtime_api.h> 37 #define PAPICUDA_MAX_COUNTERS 512 43 typedef struct papicuda_context {
55 typedef struct papicuda_name_desc {
64 typedef struct papicuda_device_desc {
81 typedef struct papicuda_active_cucontext_s {
98 typedef struct papicuda_control {
124 #define CHECK_PRINT_EVAL( checkcond, str, evalthis ) \ 126 int _cond = (checkcond); \ 128 SUBDBG("error: condition %s failed: %s.\n", #checkcond, str); \ 133 #define CUDA_CALL( call, handleerror ) \ 135 cudaError_t _status = (call); \ 136 if (_status != cudaSuccess) { \ 137 SUBDBG("error: function %s failed with error %d.\n", #call, _status); \ 142 #define CU_CALL( call, handleerror ) \ 144 CUresult _status = (call); \ 145 if (_status != CUDA_SUCCESS) { \ 146 SUBDBG("error: function %s failed with error %d.\n", #call, _status); \ 153 #define CUPTI_CALL(call, handleerror) \ 155 CUptiResult _status = (call); \ 156 if (_status != CUPTI_SUCCESS) { \ 157 const char *errstr; \ 158 (*cuptiGetResultStringPtr)(_status, &errstr); \ 159 SUBDBG("error: function %s failed with error %s.\n", #call, errstr); \ 165 #define BUF_SIZE (32 * 1024) 166 #define ALIGN_SIZE (8) 167 #define ALIGN_BUFFER(buffer, align) \ 168 (((uintptr_t) (buffer) & ((align)-1)) ? ((buffer) + (align) - ((uintptr_t) (buffer) & ((align)-1))) : (buffer)) 190 #define CUAPIWEAK __attribute__( ( weak ) ) 191 #define DECLARECUFUNC(funcname, funcsig) CUresult CUAPIWEAK funcname funcsig; CUresult( *funcname##Ptr ) funcsig; 195 DECLARECUFUNC(cuCtxCreate, (CUcontext *pctx,
unsigned int flags, CUdevice dev));
204 #define CUDAAPIWEAK __attribute__( ( weak ) ) 205 #define DECLARECUDAFUNC(funcname, funcsig) cudaError_t CUDAAPIWEAK funcname funcsig; cudaError_t( *funcname##Ptr ) funcsig; 210 #define CUPTIAPIWEAK __attribute__( ( weak ) ) 211 #define DECLARECUPTIFUNC(funcname, funcsig) CUptiResult CUPTIAPIWEAK funcname funcsig; CUptiResult( *funcname##Ptr ) funcsig; 214 DECLARECUPTIFUNC(cuptiDeviceEnumMetrics, (CUdevice device,
size_t * arraySizeBytes, CUpti_MetricID * metricArray));
/*
 * CUPTI entry points used by this component.
 *
 * Each DECLARECUPTIFUNC(name, sig) line expands to two declarations: a weak
 * prototype `CUptiResult name sig;` and a function pointer
 * `CUptiResult (*namePtr) sig;`.  The pointers are filled in through
 * DLSYM_AND_CHECK (a dlsym() lookup) in papicuda_linkCudaLibraries(), and the
 * component invokes CUPTI through them, e.g. (*cuptiEventGetAttributePtr)(...).
 * Some prototypes below omit parameter names; only the types matter for the
 * generated declarations.
 */
215 DECLARECUPTIFUNC(cuptiDeviceGetEventDomainAttribute, (CUdevice device, CUpti_EventDomainID eventDomain, CUpti_EventDomainAttribute attrib,
size_t * valueSize,
void *value));
216 DECLARECUPTIFUNC(cuptiDeviceGetNumMetrics, (CUdevice device, uint32_t * numMetrics));
217 DECLARECUPTIFUNC(cuptiEventGroupGetAttribute, (CUpti_EventGroup eventGroup, CUpti_EventGroupAttribute attrib,
size_t * valueSize,
void *value));
218 DECLARECUPTIFUNC(cuptiEventGroupReadEvent, (CUpti_EventGroup eventGroup, CUpti_ReadEventFlags flags, CUpti_EventID event,
size_t * eventValueBufferSizeBytes, uint64_t * eventValueBuffer));
219 DECLARECUPTIFUNC(cuptiEventGroupSetAttribute, (CUpti_EventGroup eventGroup, CUpti_EventGroupAttribute attrib,
size_t valueSize,
void *value));
220 DECLARECUPTIFUNC(cuptiEventGroupSetDisable, (CUpti_EventGroupSet * eventGroupSet));
221 DECLARECUPTIFUNC(cuptiEventGroupSetEnable, (CUpti_EventGroupSet * eventGroupSet));
222 DECLARECUPTIFUNC(cuptiEventGroupSetsCreate, (CUcontext context,
size_t eventIdArraySizeBytes, CUpti_EventID * eventIdArray, CUpti_EventGroupSets ** eventGroupPasses));
223 DECLARECUPTIFUNC(cuptiMetricCreateEventGroupSets, (CUcontext context,
size_t metricIdArraySizeBytes, CUpti_MetricID * metricIdArray, CUpti_EventGroupSets ** eventGroupPasses));
224 DECLARECUPTIFUNC(cuptiEventGroupSetsDestroy, (CUpti_EventGroupSets * eventGroupSets));
225 DECLARECUPTIFUNC(cuptiMetricGetRequiredEventGroupSets, (CUcontext ctx, CUpti_MetricID metricId, CUpti_EventGroupSets **thisEventGroupSet));
227 DECLARECUPTIFUNC(cuptiMetricEnumEvents, (CUpti_MetricID metric,
size_t * eventIdArraySizeBytes, CUpti_EventID * eventIdArray));
228 DECLARECUPTIFUNC(cuptiMetricGetAttribute, (CUpti_MetricID metric, CUpti_MetricAttribute attrib,
size_t * valueSize,
void *value));
229 DECLARECUPTIFUNC(cuptiMetricGetNumEvents, (CUpti_MetricID metric, uint32_t * numEvents));
230 DECLARECUPTIFUNC(cuptiMetricGetValue, (CUdevice device, CUpti_MetricID metric,
size_t eventIdArraySizeBytes, CUpti_EventID * eventIdArray,
size_t eventValueArraySizeBytes, uint64_t * eventValueArray, uint64_t timeDuration, CUpti_MetricValue * metricValue));
231 DECLARECUPTIFUNC(cuptiSetEventCollectionMode, (CUcontext context, CUpti_EventCollectionMode mode));
232 DECLARECUPTIFUNC(cuptiDeviceEnumEventDomains, (CUdevice,
size_t *, CUpti_EventDomainID *));
234 DECLARECUPTIFUNC(cuptiEventDomainEnumEvents, (CUpti_EventDomainID,
size_t *, CUpti_EventID *));
235 DECLARECUPTIFUNC(cuptiEventDomainGetAttribute, (CUpti_EventDomainID eventDomain, CUpti_EventDomainAttribute attrib,
size_t * valueSize,
void *value));
236 DECLARECUPTIFUNC(cuptiEventDomainGetNumEvents, (CUpti_EventDomainID, uint32_t *));
237 DECLARECUPTIFUNC(cuptiEventGetAttribute, (CUpti_EventID, CUpti_EventAttribute,
size_t *,
void *));
238 DECLARECUPTIFUNC(cuptiEventGroupAddEvent, (CUpti_EventGroup, CUpti_EventID));
239 DECLARECUPTIFUNC(cuptiEventGroupCreate, (CUcontext, CUpti_EventGroup *, uint32_t));
243 DECLARECUPTIFUNC(cuptiEventGroupReadAllEvents, (CUpti_EventGroup, CUpti_ReadEventFlags,
size_t *, uint64_t *,
size_t *, CUpti_EventID *,
size_t *));
245 DECLARECUPTIFUNC(cuptiGetResultString, (CUptiResult result,
const char **str));
260 static int papicuda_linkCudaLibraries()
262 #define DLSYM_AND_CHECK( dllib, name ) dlsym( dllib, name ); if ( dlerror()!=NULL ) { strncpy( _cuda_vector.cmp_info.disabled_reason, "A CUDA required function was not found in dynamic libs", PAPI_MAX_STR_LEN ); return ( PAPI_ENOSUPP ); } 270 dl1 = dlopen(
"libcuda.so", RTLD_NOW | RTLD_GLOBAL);
284 dl2 = dlopen(
"libcudart.so", RTLD_NOW | RTLD_GLOBAL | RTLD_NODELETE);
290 dl3 = dlopen(
"libcupti.so", RTLD_NOW | RTLD_GLOBAL);
296 cuptiDeviceGetEventDomainAttributePtr =
DLSYM_AND_CHECK(
dl3,
"cuptiDeviceGetEventDomainAttribute");
301 cuptiMetricGetRequiredEventGroupSetsPtr =
DLSYM_AND_CHECK(
dl3,
"cuptiMetricGetRequiredEventGroupSets");
311 cuptiMetricCreateEventGroupSetsPtr =
DLSYM_AND_CHECK(
dl3,
"cuptiMetricCreateEventGroupSets");
314 cuptiDeviceGetNumEventDomainsPtr =
DLSYM_AND_CHECK(
dl3,
"cuptiDeviceGetNumEventDomains");
316 cuptiEventDomainGetAttributePtr =
DLSYM_AND_CHECK(
dl3,
"cuptiEventDomainGetAttribute");
317 cuptiEventDomainGetNumEventsPtr =
DLSYM_AND_CHECK(
dl3,
"cuptiEventDomainGetNumEvents");
324 cuptiEventGroupReadAllEventsPtr =
DLSYM_AND_CHECK(
dl3,
"cuptiEventGroupReadAllEvents");
325 cuptiEventGroupResetAllEventsPtr =
DLSYM_AND_CHECK(
dl3,
"cuptiEventGroupResetAllEvents");
/* Must name the Disable symbol: looking up "cuptiEnableKernelReplayMode" here
 * would make cuptiDisableKernelReplayModePtr invoke the enable routine. */
328 cuptiDisableKernelReplayModePtr =
DLSYM_AND_CHECK(
dl3,
"cuptiDisableKernelReplayMode");
338 uint32_t domainNum, eventNum;
344 uint32_t maxEventSize;
347 cuErr = (*cuDeviceGetCountPtr) (&gctxt->
deviceCount);
348 if(cuErr == CUDA_ERROR_NOT_INITIALIZED) {
370 for(deviceNum = 0; deviceNum < gctxt->
deviceCount; deviceNum++) {
373 CU_CALL((*cuDeviceGetPtr) (&mydevice->
cuDev, deviceNum),
382 CUPTI_CALL((*cuptiDeviceGetNumEventDomainsPtr)
388 mydevice->
maxDomains,
sizeof(CUpti_EventDomainID));
393 size_t domainarraysize = mydevice->
maxDomains *
sizeof(CUpti_EventDomainID);
403 for(domainNum = 0; domainNum < mydevice->
maxDomains; domainNum++) {
404 CUpti_EventDomainID domainID = mydevice->
domainIDArray[domainNum];
415 for(deviceNum = 0; deviceNum < gctxt->
deviceCount; deviceNum++) {
416 uint32_t maxMetrics = 0;
417 CUptiResult cuptiRet;
419 cuptiRet = (*cuptiDeviceGetNumMetricsPtr) (mydevice->
cuDev, &maxMetrics);
420 if (cuptiRet != CUPTI_SUCCESS || maxMetrics < 1)
continue;
421 maxEventSize += maxMetrics;
437 uint32_t idxEventArray = 0;
438 for(deviceNum = 0; deviceNum < gctxt->
deviceCount; deviceNum++) {
442 for(domainNum = 0; domainNum < mydevice->
maxDomains; domainNum++) {
445 CUpti_EventDomainID domainID = mydevice->
domainIDArray[domainNum];
450 CUpti_EventID *domainEventIDArray =
451 (CUpti_EventID *)
papi_calloc(domainNumEvents,
sizeof(CUpti_EventID));
454 size_t domainEventArraySize = domainNumEvents *
sizeof(CUpti_EventID);
456 (domainID, &domainEventArraySize, domainEventIDArray),
459 for(eventNum = 0; eventNum < domainNumEvents; eventNum++) {
460 CUpti_EventID myeventCuptiEventId = domainEventIDArray[eventNum];
466 CUPTI_CALL((*cuptiEventGetAttributePtr) (myeventCuptiEventId,
467 CUPTI_EVENT_ATTR_NAME, &tmpSizeBytes, tmpStr),
471 "event:%s:device=%d", tmpStr, deviceNum);
474 for(ii = 0; ii < (int) strlen(nameTmpPtr); ii++) {
475 if(nameTmpPtr[ii] ==
' ') nameTmpPtr[ii] =
'_';
480 CUPTI_CALL((*cuptiEventGetAttributePtr) (myeventCuptiEventId,
481 CUPTI_EVENT_ATTR_SHORT_DESCRIPTION, &tmpSizeBytes,
497 SUBDBG(
"Checking for metrics\n");
498 for (deviceNum = 0; deviceNum < gctxt->
deviceCount; deviceNum++) {
499 uint32_t maxMetrics = 0,
i, j;
500 CUpti_MetricID *metricIdList = NULL;
501 CUptiResult cuptiRet;
503 cuptiRet = (*cuptiDeviceGetNumMetricsPtr) (mydevice->
cuDev, &maxMetrics);
504 if (cuptiRet != CUPTI_SUCCESS || maxMetrics < 1)
continue;
506 SUBDBG(
"Device %d: Checking each of the (maxMetrics) %d metrics\n", deviceNum, maxMetrics);
/* metricIdList stores CUpti_MetricID values, so both the byte count handed to
 * the enumeration call and the allocation are sized by that element type. */
509 size_t size = maxMetrics *
sizeof(CUpti_MetricID);
510 metricIdList = (CUpti_MetricID *)
papi_calloc(maxMetrics,
sizeof(CUpti_MetricID));
514 (mydevice->
cuDev, &size, metricIdList),
518 int saveDeviceNum = 0;
521 for (
i=0, j=0;
i<maxMetrics;
i++) {
523 CUPTI_CALL((*cuptiMetricGetAttributePtr) (metricIdList[
i],
524 CUPTI_METRIC_ATTR_NAME, &size, (uint8_t *) tmpStr),
530 if (strcmp(
"branch_efficiency", tmpStr) == 0)
continue;
558 metricIdList[j++] = metricIdList[
i];
565 for(
i = 0;
i < maxMetrics;
i++) {
567 gctxt->
availEventKind[idxEventArray] = CUPTI_ACTIVITY_KIND_METRIC;
570 CUPTI_CALL((*cuptiMetricGetAttributePtr) (metricIdList[
i],
571 CUPTI_METRIC_ATTR_NAME, &size, (uint8_t *) tmpStr),
578 size_t MV_KindSize =
sizeof(CUpti_MetricValueKind);
580 (metricIdList[
i], CUPTI_METRIC_ATTR_VALUE_KIND, &MV_KindSize,
585 "metric:%s:device=%d", tmpStr, deviceNum);
589 (metricIdList[
i], CUPTI_METRIC_ATTR_LONG_DESCRIPTION, &size,
598 CUpti_MetricID itemId = metricIdList[
i];
599 CUPTI_CALL((*cuptiMetricGetNumEventsPtr) (itemId, &numSubs),
602 size_t sizeBytes = numSubs *
sizeof(CUpti_EventID);
603 CUpti_EventID *subEventIds =
papi_malloc(sizeBytes);
608 (itemId, &sizeBytes, subEventIds),
644 SUBDBG(
"Try to convert the CUPTI metric value kind (index %d) to PAPI value (long long or double)\n", valueKind);
646 case CUPTI_METRIC_VALUE_KIND_DOUBLE:
647 SUBDBG(
"Metric double %f\n", metricValue.metricValueDouble);
648 tmpValue.ll = (
long long)(metricValue.metricValueDouble);
651 case CUPTI_METRIC_VALUE_KIND_UINT64:
652 SUBDBG(
"Metric uint64 = %llu\n", (
unsigned long long) metricValue.metricValueUint64);
653 tmpValue.ll = (
long long) (metricValue.metricValueUint64);
656 case CUPTI_METRIC_VALUE_KIND_INT64:
657 SUBDBG(
"Metric int64 = %lld\n", (
long long) metricValue.metricValueInt64);
658 tmpValue.ll = (
long long) (metricValue.metricValueInt64);
661 case CUPTI_METRIC_VALUE_KIND_PERCENT:
662 SUBDBG(
"Metric percent = %f%%\n", metricValue.metricValuePercent);
663 tmpValue.ll = (
long long)(metricValue.metricValuePercent*100);
666 case CUPTI_METRIC_VALUE_KIND_THROUGHPUT:
667 SUBDBG(
"Metric throughput %llu bytes/sec\n", (
unsigned long long) metricValue.metricValueThroughput);
668 tmpValue.ll = (
long long) (metricValue.metricValueThroughput);
671 case CUPTI_METRIC_VALUE_KIND_UTILIZATION_LEVEL:
672 SUBDBG(
"Metric utilization level %u\n", (
unsigned int) metricValue.metricValueUtilizationLevel);
673 tmpValue.ll = (
long long) (metricValue.metricValueUtilizationLevel);
674 CHECK_PRINT_EVAL(tmpValue.ll - metricValue.metricValueUtilizationLevel > 1e-6,
"Error converting metric\n", return (
PAPI_EMISC));
681 *papiValue = tmpValue.ll;
717 SUBDBG(
"Entering with component idx: %d\n",
cidx);
721 if(papicuda_linkCudaLibraries() !=
PAPI_OK) {
722 SUBDBG(
"Dynamic link of CUDA libraries failed, component will be disabled.\n");
723 SUBDBG(
"See disable reason in papi_component_avail output for more details.\n");
782 SUBDBG(
"Entering with nativeCount %d\n", nativeCount);
789 CUcontext eventCuCtx;
790 int index, ii, ee, cc;
798 SUBDBG(
"currDeviceNum %d \n", currDeviceNum);
802 SUBDBG(
"currDeviceNum %d cuCtx %p \n", currDeviceNum, currCuCtx);
805 for (ii = 0; ii < nativeCount; ii++) {
808 int numMetricEvents= gctxt->
availEventDesc[index].numMetricEvents;
814 SUBDBG(
"Skipping event %s which is already added\n", eventName);
826 SUBDBG(
"Event %s device %d already has a cuCtx %p registered\n", eventName, eventDeviceNum, eventCuCtx);
828 if(eventCuCtx != currCuCtx)
829 CU_CALL((*cuCtxPushCurrentPtr) (eventCuCtx),
836 SUBDBG(
"Event %s device %d does not have a cuCtx registered yet...\n", eventName, eventDeviceNum);
837 if(currDeviceNum != eventDeviceNum) {
838 CUDA_CALL((*cudaSetDevicePtr) (eventDeviceNum),
841 CU_CALL((*cuCtxGetCurrentPtr) (&eventCuCtx),
844 eventCuCtx = currCuCtx;
854 SUBDBG(
"Added a new context deviceNum %d cuCtx %p ... now countOfActiveCUContexts is %d\n", eventDeviceNum, eventCuCtx, gctrl->
countOfActiveCUContexts);
860 eventContextIdx = cc;
871 if (numMetricEvents == 0) {
874 SUBDBG(
"Num events (generated by metric) exceeded PAPICUDA_MAX_COUNTERS\n");
882 for(ee = 0; ee < numMetricEvents; ee++) {
884 CUpti_EventID myId = gctxt->
availEventDesc[index].metricEvents[ee];
887 if (eventctrl->
allEvents[aeIdx] == myId)
break;
894 SUBDBG(
"Num events (generated by metric) exceeded PAPICUDA_MAX_COUNTERS\n");
946 size_t sizeBytes = (eventctrl->
allEventsCount) *
sizeof(CUpti_EventID);
949 #ifdef PAPICUDA_KERNEL_REPLAY_MODE 950 CUPTI_CALL((*cuptiEnableKernelReplayModePtr) (eventCuCtx),
953 (eventCuCtx, sizeBytes, eventctrl->
allEvents,
957 #else // Normal operation. 959 (eventCuCtx,CUPTI_EVENT_COLLECTION_MODE_CONTINUOUS),
969 (eventCuCtx, sizeBytes, eventctrl->
allEvents,
974 SUBDBG(
"Error occurred: The combined CUPTI events cannot be collected simultaneously ... try different events\n");
981 #endif // #if/#else/#endif on PAPICUDA_KERNEL_REPLAY_MODE 985 if(eventCuCtx != currCuCtx)
1003 uint32_t ii, gg, cc;
1004 int saveDeviceNum = -1;
1006 SUBDBG(
"Reset all active event values\n");
1010 SUBDBG(
"Save current context, then switch to each active device/context and enable eventgroups\n");
1017 SUBDBG(
"Set to device %d cuCtx %p \n", eventDeviceNum, eventCuCtx);
1018 if(eventDeviceNum != saveDeviceNum) {
1022 CUpti_EventGroupSets *eventGroupSets =
1024 CUpti_EventGroupSet *groupset = &eventGroupSets->sets[0];
1025 for(gg = 0; gg < groupset->numEventGroups; gg++) {
1027 CUPTI_CALL((*cuptiEventGroupSetAttributePtr) (
1028 groupset->eventGroups[gg],
1029 CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES,
1030 sizeof(uint32_t), &
one),
1034 CUPTI_CALL((*cuptiEventGroupSetEnablePtr) (groupset),
1037 if(eventDeviceNum != saveDeviceNum) {
1062 uint32_t gg,
i, j, cc;
1073 SUBDBG(
"Save current context, then switch to each active device/context and enable context-specific eventgroups\n");
1079 int currDeviceNum = activeCuCtxt->
deviceNum;
1080 CUcontext currCuCtx = activeCuCtxt->
cuCtx;
1082 SUBDBG(
"Set to device %d cuCtx %p \n", currDeviceNum, currCuCtx);
1083 if(currDeviceNum != saveDeviceNum) {
1091 CUpti_EventGroupSets *myEventGroupSets = activeCuCtxt->
eventGroupSets;
1093 uint32_t numEvents, numInstances, numTotalInstances;
1094 size_t sizeofuint32num =
sizeof(uint32_t);
1095 CUpti_EventDomainID groupDomainID;
1096 size_t groupDomainIDSize =
sizeof(groupDomainID);
1097 CUdevice cudevice = gctxt->
deviceArray[currDeviceNum].cuDev;
1104 CUpti_EventGroupSet *groupset = &myEventGroupSets->sets[0];
1105 SUBDBG(
"Read events in this context\n");
1108 for (gg = 0; gg < groupset->numEventGroups; gg++) {
1109 CUpti_EventGroup group = groupset->eventGroups[gg];
1112 (group, CUPTI_EVENT_GROUP_ATTR_EVENT_DOMAIN_ID,
1113 &groupDomainIDSize, &groupDomainID),
1118 CUPTI_CALL((*cuptiDeviceGetEventDomainAttributePtr)
1121 CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT,
1123 &numTotalInstances),
1128 CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT,
1135 CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS,
1144 size_t resultArrayBytes =
sizeof(uint64_t) * numEvents * numTotalInstances;
1145 size_t eventIdArrayBytes =
sizeof(CUpti_EventID) * numEvents;
1146 size_t numCountersRead = 2;
1148 CUpti_EventID *eventIdArray = (CUpti_EventID *)
papi_malloc(eventIdArrayBytes);
/* Obtain the per-instance result buffer already zeroed.  Clearing it with a
 * hand-written loop here would write through resultArray before the NULL
 * check that follows these allocations. */
1149 uint64_t *resultArray = (uint64_t *)
papi_calloc(resultArrayBytes /
sizeof(uint64_t),
sizeof(uint64_t));
/* One accumulator per event, zero-initialised for the per-instance sum. */
1150 uint64_t *aggrResultArray = (uint64_t *)
papi_calloc(numEvents,
sizeof(uint64_t));
1154 if (eventIdArray == NULL || resultArray == NULL || aggrResultArray == NULL) {
1155 fprintf(stderr,
"%s:%i failed to allocate memory.\n", __FILE__, __LINE__);
1159 CUPTI_CALL( (*cuptiEventGroupReadAllEventsPtr)
1160 (group, CUPTI_EVENT_READ_FLAG_NONE,
1161 &resultArrayBytes, resultArray,
1162 &eventIdArrayBytes, eventIdArray,
1174 for (
i = 0;
i < numEvents;
i++) {
1175 for (j = 0; j < numTotalInstances; j++) {
1176 aggrResultArray[
i] += resultArray[
i + numEvents * j];
1192 for (
i=0;
i<numEvents;
i++) {
1193 CUpti_EventID myId = eventIdArray[
i];
1213 for (j=0; j<ctxActiveCount; j++) {
1214 uint32_t activeIdx, availIdx;
1216 activeIdx=ctxActive[j];
1219 struct papicuda_name_desc *myDesc=&(gctxt->
availEventDesc[availIdx]);
1221 if (myDesc->numMetricEvents == 0) {
1223 for (k=0; k<AEIdx; k++) {
1224 if (activeCuCtxt->
allEvents[k] == thisEventId) {
1233 CUpti_MetricValue myValue;
1235 (cudevice, thisEventId,
1236 AEIdx *
sizeof(CUpti_EventID),
1238 AEIdx *
sizeof(uint64_t),
1240 durationNs, &myValue),
1244 myValue, myDesc->MV_Kind,
1249 if(currDeviceNum != saveDeviceNum) {
1250 CUDA_CALL((*cudaSetDevicePtr) (saveDeviceNum),
1270 SUBDBG(
"Save current context, then switch to each active device/context and enable eventgroups\n");
1275 SUBDBG(
"Set to device %d cuCtx %p \n", currDeviceNum, currCuCtx);
1276 if(currDeviceNum != saveDeviceNum)
1281 for (ss=0; ss<currEventGroupSets->numSets; ss++) {
1282 CUpti_EventGroupSet groupset = currEventGroupSets->sets[ss];
1283 CUPTI_CALL((*cuptiEventGroupSetDisablePtr) (&groupset),
1287 if(currDeviceNum != saveDeviceNum)
1309 SUBDBG(
"Save current context, then switch to each active device/context and enable eventgroups\n");
1315 if(currDeviceNum != saveDeviceNum)
1320 (*cuptiEventGroupSetsDestroyPtr) (currEventGroupSets);
1324 if(currDeviceNum != saveDeviceNum)
1358 for(deviceNum = 0; deviceNum < gctxt->
deviceCount; deviceNum++) {
1366 if (desc->numMetricEvents > 0) {
1383 #ifdef PAPICUDA_KERNEL_REPLAY_MODE 1410 uint32_t gg, ii, cc, ss;
1413 SUBDBG(
"Reset all active event values\n");
1417 SUBDBG(
"Save current context, then switch to each active device/context and reset\n");
1422 if(currDeviceNum != saveDeviceNum)
1427 for (ss=0; ss<currEventGroupSets->numSets; ss++) {
1428 CUpti_EventGroupSet groupset = currEventGroupSets->sets[ss];
1429 for(gg = 0; gg < groupset.numEventGroups; gg++) {
1430 CUpti_EventGroup group = groupset.eventGroups[gg];
1435 if(currDeviceNum != saveDeviceNum)
1491 if(*EventCode < global_papicuda_context->availEventSize - 1) {
1492 *EventCode = *EventCode + 1;
1512 unsigned int index = EventCode;
1514 if(index < gctxt->availEventSize) {
1532 unsigned int index = EventCode;
1534 if(index < gctxt->availEventSize) {
1548 .short_name =
"cuda",
1550 .description =
"CUDA events and metrics via NVIDIA CuPTI interfaces",
1558 .fast_real_timer = 0,
1559 .fast_virtual_timer = 0,
1561 .attach_must_ptrace = 0,
1607 uint64_t numTotalInstances,
1609 uint32_t numMetrics,
1610 CUpti_MetricID *metricId,
1611 CUpti_MetricValueKind *myKinds,
1613 uint64_t timeDuration)
1615 size_t bufferSizeBytes, numCountersRead;
1616 uint64_t *eventValueArray = NULL;
1617 CUpti_EventID *eventIdArray;
1618 size_t arraySizeBytes = 0;
1619 uint64_t *aggrEventValueArray = NULL;
1620 size_t aggrEventValueArraySize;
1621 uint32_t
i = 0, j = 0;
1623 arraySizeBytes =
sizeof(CUpti_EventID) * numEvents;
1624 bufferSizeBytes =
sizeof(uint64_t) * numEvents * numTotalInstances;
1626 eventValueArray = (uint64_t *) malloc(bufferSizeBytes);
1628 eventIdArray = (CUpti_EventID *) malloc(arraySizeBytes);
1630 aggrEventValueArray = (uint64_t *) calloc(numEvents,
sizeof(uint64_t));
1632 aggrEventValueArraySize =
sizeof(uint64_t) * numEvents;
1635 (eventGroup, CUPTI_EVENT_READ_FLAG_NONE, &bufferSizeBytes,
1636 eventValueArray, &arraySizeBytes, eventIdArray, &numCountersRead),
1646 for (
i = 0;
i < numEvents;
i++) {
1647 for (j = 0; j < numTotalInstances; j++) {
1648 aggrEventValueArray[
i] += eventValueArray[
i + numEvents * j];
1653 for (
i = 0;
i < numMetrics;
i++) {
1654 CUpti_MetricValue metricValue;
1656 (dev, metricId[
i], arraySizeBytes, eventIdArray,
1657 aggrEventValueArraySize, aggrEventValueArray,
1658 timeDuration, &metricValue),
1664 free(eventValueArray);
char disabled_reason[PAPI_MAX_STR_LEN]
CUpti_MetricValueKind MV_Kind
static papicuda_control_t * global_papicuda_control
static int papicuda_start(hwd_context_t *ctx, hwd_control_state_t *ctrl)
static int papicuda_init_thread(hwd_context_t *ctx)
uint64_t allEventValues[PAPICUDA_MAX_COUNTERS]
uint64_t cuptiStartTimestampNs
static papicuda_context_t * global_papicuda_context
static int papicuda_update_control_state(hwd_control_state_t *ctrl, NativeInfo_t *nativeInfo, int nativeCount, hwd_context_t *ctx)
#define PAPICUDA_MAX_COUNTERS
uint32_t * domainIDNumEvents
uint32_t * availEventIDArray
int * availEventDeviceNum
int cudaSetDevice(int devnum, int n1, int n2, int n3, void *ptr1)
#define CU_CALL(call, handleerror)
static int papicuda_set_domain(hwd_control_state_t *ctrl, int domain)
static int papicuda_ntv_code_to_name(unsigned int EventCode, char *name, int len)
#define DLSYM_AND_CHECK(dllib, name)
static int papicuda_read(hwd_context_t *ctx, hwd_control_state_t *ctrl, long long **values, int flags)
PAPI_component_info_t cmp_info
static int papicuda_convert_metric_value_to_long_long(CUpti_MetricValue metricValue, CUpti_MetricValueKind valueKind, long long int *papiValue)
Return codes and api definitions.
#define CHECK_PRINT_EVAL(checkcond, str, evalthis)
struct papicuda_name_desc * availEventDesc
#define PAPI_2MAX_STR_LEN
CUresult CUDAAPI cuInit(unsigned int myInt)
CUpti_EventDomainID * domainIDArray
static int papicuda_stop(hwd_context_t *ctx, hwd_control_state_t *ctrl)
CUpti_EventID allEvents[PAPICUDA_MAX_COUNTERS]
static int papicuda_reset(hwd_context_t *ctx, hwd_control_state_t *ctrl)
static int papicuda_cleanup_eventset(hwd_control_state_t *ctrl)
__attribute__((constructor))
papicuda_active_cucontext_t * arrayOfActiveCUContexts[PAPICUDA_MAX_COUNTERS]
#define SUBDBG(format, args...)
void(* _dl_non_dynamic_init)(void)
#define CUDA_CALL(call, handleerror)
static int papicuda_ntv_enum_events(unsigned int *EventCode, int modifier)
#define DECLARECUFUNC(funcname, funcsig)
char name[PAPI_MAX_STR_LEN]
int papicuda_shutdown_thread(hwd_context_t *ctx)
papi_vector_t _cuda_vector
CUpti_EventGroupSets * eventGroupSets
static int papicuda_shutdown_component(void)
static int papicuda_ntv_code_to_descr(unsigned int EventCode, char *name, int len)
struct papicuda_device_desc * deviceArray
static int papicuda_add_native_events(papicuda_context_t *gctxt)
uint32_t countOfActiveCUContexts
static int papicuda_init_control_state(hwd_control_state_t *ctrl)
CUresult CUDAAPI(* cuInitPtr)(unsigned int)
CUpti_EventID * metricEvents
static int papicuda_ctrl(hwd_context_t *ctx, int code, _papi_int_option_t *option)
long long activeEventValues[PAPICUDA_MAX_COUNTERS]
#define DECLARECUPTIFUNC(funcname, funcsig)
char deviceName[PAPI_MIN_STR_LEN]
static int papicuda_init_component(int cidx)
void readMetricValue(CUpti_EventGroup eventGroup, uint32_t numEvents, uint64_t numTotalInstances, CUdevice dev, uint32_t numMetrics, CUpti_MetricID *metricId, CUpti_MetricValueKind *myKinds, long long int *values, uint64_t timeDuration)
CUpti_ActivityKind * availEventKind
static long long values[NUM_EVENTS]
int activeEventIndex[PAPICUDA_MAX_COUNTERS]
cudaError_t CUDARTAPI cudaGetDevice(int *dest)
uint32_t activeEventCount
uint32_t * availEventIsBeingMeasuredInEventset
uint32_t ctxActiveEvents[PAPICUDA_MAX_COUNTERS]
uint64_t cuptiReadTimestampNs
#define papi_calloc(a, b)
#define CUPTI_CALL(call, handleerror)
cudaError_t(* cudaGetDevicePtr)(int *)
#define DECLARECUDAFUNC(funcname, funcsig)