/* PAPI 5.3.0.0 */
00001 /****************************/ 00002 /* THIS IS OPEN SOURCE CODE */ 00003 /****************************/ 00004 00017 #include <dlfcn.h> 00018 00019 #include "papi.h" 00020 #include "papi_internal.h" 00021 #include "papi_vector.h" 00022 #include "papi_memory.h" 00023 #include "linux-cuda.h" 00024 00025 00026 /******** CHANGE PROTOTYPES TO DECLARE CUDA LIBRARY SYMBOLS AS WEAK ********** 00027 * This is done so that a version of PAPI built with the cuda component can * 00028 * be installed on a system which does not have the cuda libraries installed. * 00029 * * 00030 * If this is done without these prototypes, then all papi services on the * 00031 * system without the cuda libraries installed will fail. The PAPI libraries * 00032 * contain references to the cuda libraries which are not installed. The * 00033 * load of PAPI commands fails because the cuda library references can not be * 00034 * resolved. * 00035 * * 00036 * This also defines pointers to the cuda library functions that we call. * 00037 * These function pointers will be resolved with dlopen/dlsym calls at * 00038 * component initialization time. The component then calls the cuda library * 00039 * functions through these function pointers. 
* 00040 *******************************************************************************/ 00041 void (*_dl_non_dynamic_init)(void) __attribute__((weak)); 00042 #undef CUDAAPI 00043 #define CUDAAPI __attribute__((weak)) 00044 CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); 00045 CUresult CUDAAPI cuCtxDestroy(CUcontext); 00046 CUresult CUDAAPI cuCtxGetCurrent(CUcontext *); 00047 CUresult CUDAAPI cuDeviceGet(CUdevice *, int); 00048 CUresult CUDAAPI cuDeviceGetCount(int *); 00049 CUresult CUDAAPI cuDeviceGetName(char *, int, CUdevice); 00050 CUresult CUDAAPI cuInit(unsigned int); 00051 00052 CUresult (*cuCtxCreatePtr)(CUcontext *pctx, unsigned int flags, CUdevice dev); 00053 CUresult (*cuCtxDestroyPtr)(CUcontext); 00054 CUresult (*cuCtxGetCurrentPtr)(CUcontext *); 00055 CUresult (*cuDeviceGetPtr)(CUdevice *, int); 00056 CUresult (*cuDeviceGetCountPtr)(int *); 00057 CUresult (*cuDeviceGetNamePtr)(char *, int, CUdevice); 00058 CUresult (*cuInitPtr)(unsigned int); 00059 00060 #undef CUDARTAPI 00061 #define CUDARTAPI __attribute__((weak)) 00062 cudaError_t CUDARTAPI cudaFree(void *); 00063 cudaError_t CUDARTAPI cudaGetDevice(int *); 00064 cudaError_t CUDARTAPI cudaRuntimeGetVersion( int *); 00065 cudaError_t CUDARTAPI cudaDriverGetVersion( int *); 00066 00067 cudaError_t (*cudaFreePtr)(void *); 00068 cudaError_t (*cudaGetDevicePtr)(int *); 00069 cudaError_t (*cudaRuntimeGetVersionPtr)(int *); 00070 cudaError_t (*cudaDriverGetVersionPtr)(int *); 00071 00072 #undef CUPTIAPI 00073 #define CUPTIAPI __attribute__((weak)) 00074 CUptiResult CUPTIAPI cuptiDeviceEnumEventDomains(CUdevice, size_t *, CUpti_EventDomainID *); 00075 CUptiResult CUPTIAPI cuptiDeviceGetEventDomainAttribute(CUdevice, CUpti_EventDomainID, CUpti_EventDomainAttribute, size_t *, void *); 00076 CUptiResult CUPTIAPI cuptiDeviceGetNumEventDomains(CUdevice, uint32_t *); 00077 CUptiResult CUPTIAPI cuptiEventDomainEnumEvents(CUpti_EventDomainID, size_t*, CUpti_EventID *); 00078 
CUptiResult CUPTIAPI cuptiEventDomainGetNumEvents(CUpti_EventDomainID, uint32_t *); 00079 CUptiResult CUPTIAPI cuptiEventGetAttribute(CUpti_EventID, CUpti_EventAttribute, size_t *, void *); 00080 CUptiResult CUPTIAPI cuptiEventGroupAddEvent(CUpti_EventGroup, CUpti_EventID); 00081 CUptiResult CUPTIAPI cuptiEventGroupCreate(CUcontext, CUpti_EventGroup *, uint32_t); 00082 CUptiResult CUPTIAPI cuptiEventGroupDestroy(CUpti_EventGroup); 00083 CUptiResult CUPTIAPI cuptiEventGroupDisable(CUpti_EventGroup); 00084 CUptiResult CUPTIAPI cuptiEventGroupEnable(CUpti_EventGroup); 00085 CUptiResult CUPTIAPI cuptiEventGroupReadAllEvents(CUpti_EventGroup, CUpti_ReadEventFlags, size_t *, uint64_t *, size_t *, CUpti_EventID *, size_t *); 00086 CUptiResult CUPTIAPI cuptiEventGroupRemoveAllEvents(CUpti_EventGroup); 00087 CUptiResult CUPTIAPI cuptiEventGroupResetAllEvents(CUpti_EventGroup); 00088 00089 CUptiResult (*cuptiDeviceEnumEventDomainsPtr)(CUdevice, size_t *, CUpti_EventDomainID *); 00090 CUptiResult (*cuptiDeviceGetEventDomainAttributePtr)(CUdevice, CUpti_EventDomainID, CUpti_EventDomainAttribute, size_t *, void *); 00091 CUptiResult (*cuptiDeviceGetNumEventDomainsPtr)(CUdevice, uint32_t *); 00092 CUptiResult (*cuptiEventDomainEnumEventsPtr)(CUpti_EventDomainID, size_t*, CUpti_EventID *); 00093 CUptiResult (*cuptiEventDomainGetNumEventsPtr)(CUpti_EventDomainID, uint32_t *); 00094 CUptiResult (*cuptiEventGetAttributePtr)(CUpti_EventID, CUpti_EventAttribute, size_t *, void *); 00095 CUptiResult (*cuptiEventGroupAddEventPtr)(CUpti_EventGroup, CUpti_EventID); 00096 CUptiResult (*cuptiEventGroupCreatePtr)(CUcontext, CUpti_EventGroup *, uint32_t); 00097 CUptiResult (*cuptiEventGroupDestroyPtr)(CUpti_EventGroup); 00098 CUptiResult (*cuptiEventGroupDisablePtr)(CUpti_EventGroup); 00099 CUptiResult (*cuptiEventGroupEnablePtr)(CUpti_EventGroup); 00100 CUptiResult (*cuptiEventGroupReadAllEventsPtr)(CUpti_EventGroup, CUpti_ReadEventFlags, size_t *, uint64_t *, size_t *, CUpti_EventID *, 
size_t *); 00101 CUptiResult (*cuptiEventGroupRemoveAllEventsPtr)(CUpti_EventGroup); 00102 CUptiResult (*cuptiEventGroupResetAllEventsPtr)(CUpti_EventGroup); 00103 00104 // file handles used to access cuda libraries with dlopen 00105 static void* dl1 = NULL; 00106 static void* dl2 = NULL; 00107 static void* dl3 = NULL; 00108 00109 static int linkCudaLibraries (); 00110 00111 papi_vector_t _cuda_vector; 00112 00113 00114 /****************************************************************************** 00115 ******** BEGIN FUNCTIONS USED INTERNALLY SPECIFIC TO THIS COMPONENT ********* 00116 *****************************************************************************/ 00117 /* 00118 * Specify device(s): Counts number of cuda events available in this system 00119 */ 00120 static int 00121 detectDevice( void ) 00122 { 00123 CUresult err; 00124 int skipDevice = 0; 00125 int id; 00126 char deviceName_tmp[PAPI_MIN_STR_LEN] = "init"; 00127 00128 totalEventCount = 0; 00129 00130 /* CUDA initialization */ 00131 err = (*cuInitPtr)( 0 ); 00132 if ( err != CUDA_SUCCESS ) { 00133 SUBDBG ("Info: Error from cuInit(): %d\n", err); 00134 return ( PAPI_ENOSUPP ); 00135 } 00136 00137 /* How many gpgpu devices do we have? */ 00138 err = (*cuDeviceGetCountPtr)( &deviceCount ); 00139 CHECK_CU_ERROR( err, "cuDeviceGetCount" ); 00140 if ( deviceCount == 0 ) 00141 return ( PAPI_ENOSUPP ); 00142 00143 /* allocate memory for device data table */ 00144 device = ( DeviceData_t * ) malloc( sizeof ( DeviceData_t ) * deviceCount ); 00145 if ( device == NULL ) { 00146 perror( "malloc(): Failed to allocate memory to CUDA device table" ); 00147 return ( PAPI_ENOSUPP ); 00148 } 00149 00150 /* What are the devices? 
Get Name and # of domains per device */ 00151 for ( id = 0; id < deviceCount; id++ ) { 00152 err = (*cuDeviceGetPtr)( &device[id].dev, id ); 00153 CHECK_CU_ERROR( err, "cuDeviceGet" ); 00154 00155 err = (*cuDeviceGetNamePtr)( device[id].name, PAPI_MIN_STR_LEN, device[id].dev ); 00156 CHECK_CU_ERROR( err, "cuDeviceGetName" ); 00157 00158 SUBDBG ("Cuda deviceName: %s\n", device[id].name); 00159 00160 /* Skip device if there are multiple of the same type 00161 and if it has been already added to the list */ 00162 if ( 0 == strcmp( deviceName_tmp, device[id].name ) ) { 00163 skipDevice++; 00164 continue; 00165 } 00166 00167 strcpy( deviceName_tmp, device[id].name ); 00168 00169 /* enumerate the domains on the device */ 00170 if ( 0 != enumEventDomains( device[id].dev, id ) ) 00171 return ( PAPI_ENOSUPP ); 00172 } 00173 00174 deviceCount = deviceCount - skipDevice; 00175 00176 /* return number of events provided via CuPTI */ 00177 return totalEventCount; 00178 } 00179 00180 00181 /* 00182 * Detect supported domains for specified device 00183 */ 00184 static int 00185 enumEventDomains( CUdevice dev, int deviceId ) 00186 { 00187 CUptiResult err = CUPTI_SUCCESS; 00188 CUpti_EventDomainID *domainId = NULL; 00189 uint32_t id = 0; 00190 size_t size = 0; 00191 00192 device[deviceId].domainCount = 0; 00193 00194 /* get number of domains for device dev */ 00195 err = (*cuptiDeviceGetNumEventDomainsPtr)( dev, &device[deviceId].domainCount ); 00196 CHECK_CUPTI_ERROR( err, "cuptiDeviceGetNumEventDomains" ); 00197 00198 if ( device[deviceId].domainCount == 0 ) { 00199 printf( "No domain is exposed by dev = %d\n", dev ); 00200 return -1; 00201 } 00202 00203 /* CuPTI domain struct */ 00204 size = sizeof ( CUpti_EventDomainID ) * device[deviceId].domainCount; 00205 domainId = ( CUpti_EventDomainID * ) malloc( size ); 00206 if ( domainId == NULL ) { 00207 perror( "malloc(): Failed to allocate memory to CuPTI domain ID" ); 00208 return -1; 00209 } 00210 memset( domainId, 0, size ); 00211 
00212 /* PAPI domain struct */ 00213 device[deviceId].domain = 00214 ( DomainData_t * ) malloc( sizeof ( DomainData_t ) * 00215 device[deviceId].domainCount ); 00216 if ( device[deviceId].domain == NULL ) { 00217 perror( "malloc(): Failed to allocate memory to PAPI domain struct" ); 00218 free(domainId); 00219 return -1; 00220 } 00221 00222 /* Enumerates the event domains for a device dev */ 00223 err = (*cuptiDeviceEnumEventDomainsPtr)( dev, &size, domainId ); 00224 CHECK_CUPTI_ERROR( err, "cuptiDeviceEnumEventDomains" ); 00225 00226 /* enum domains */ 00227 for ( id = 0; id < device[deviceId].domainCount; id++ ) { 00228 device[deviceId].domain[id].domainId = domainId[id]; 00229 00230 /* query domain name */ 00231 size = PAPI_MIN_STR_LEN; 00232 #ifdef CUDA_4_0 00233 err = cuptiEventDomainGetAttribute( dev, 00234 device[deviceId].domain[id]. 00235 domainId, 00236 CUPTI_EVENT_DOMAIN_ATTR_NAME, &size, 00237 ( void * ) device[deviceId]. 00238 domain[id].name ); 00239 CHECK_CUPTI_ERROR( err, "cuptiEventDomainGetAttribute" ); 00240 00241 /* query num of events avaialble in the domain */ 00242 size = sizeof ( device[deviceId].domain[id].eventCount ); 00243 err = cuptiEventDomainGetAttribute( dev, 00244 device[deviceId].domain[id]. 00245 domainId, 00246 CUPTI_EVENT_DOMAIN_MAX_EVENTS, 00247 &size, 00248 ( void * ) &device[deviceId]. 
00249 domain[id].eventCount ); 00250 CHECK_CUPTI_ERROR( err, "cuptiEventDomainGetAttribute" ); 00251 00252 /* enumerate the events for the domain[id] on the device dev */ 00253 if ( 0 != enumEvents( dev, deviceId, id ) ) 00254 return -1; 00255 #else 00256 err = (*cuptiDeviceGetEventDomainAttributePtr)( dev, 00257 device[deviceId].domain[id].domainId, 00258 CUPTI_EVENT_DOMAIN_ATTR_NAME, &size, 00259 ( void * ) device[deviceId].domain[id].name ); 00260 CHECK_CUPTI_ERROR( err, "cuptiDeviceGetEventDomainAttribute" ); 00261 00262 /* query num of events avaialble in the domain */ 00263 err = (*cuptiEventDomainGetNumEventsPtr)( device[deviceId].domain[id].domainId, 00264 &device[deviceId].domain[id].eventCount ); 00265 CHECK_CUPTI_ERROR( err, "cuptiEventDomainGetNumEvents" ); 00266 00267 /* enumerate the events for the domain[id] on the device deviceId */ 00268 if ( 0 != enumEvents( deviceId, id ) ) 00269 return -1; 00270 #endif 00271 } 00272 00273 totalDomainCount += device[deviceId].domainCount; 00274 free( domainId ); 00275 return 0; 00276 } 00277 00278 00279 /* 00280 * Detect supported events for specified device domain 00281 */ 00282 #ifdef CUDA_4_0 00283 static int 00284 enumEvents( CUdevice dev, int deviceId, int domainId ) 00285 #else 00286 static int 00287 enumEvents( int deviceId, int domainId ) 00288 #endif 00289 { 00290 CUptiResult err = CUPTI_SUCCESS; 00291 CUpti_EventID *eventId = NULL; 00292 size_t size = 0; 00293 uint32_t id = 0; 00294 00295 /* CuPTI event struct */ 00296 size = 00297 sizeof ( CUpti_EventID ) * device[deviceId].domain[domainId].eventCount; 00298 eventId = ( CUpti_EventID * ) malloc( size ); 00299 if ( eventId == NULL ) { 00300 perror( "malloc(): Failed to allocate memory to CuPTI event ID" ); 00301 return -1; 00302 } 00303 memset( eventId, 0, size ); 00304 00305 /* PAPI event struct */ 00306 device[deviceId].domain[domainId].event = 00307 ( EventData_t * ) malloc( sizeof ( EventData_t ) * 00308 device[deviceId].domain[domainId]. 
00309 eventCount ); 00310 if ( device[deviceId].domain[domainId].event == NULL ) { 00311 perror( "malloc(): Failed to allocate memory to PAPI event struct" ); 00312 free(eventId); 00313 return -1; 00314 } 00315 00316 /* enumerate the events for the domain[domainId] on the device[deviceId] */ 00317 #ifdef CUDA_4_0 00318 err = 00319 (*cuptiEventDomainEnumEventsPtr)( dev, 00320 ( CUpti_EventDomainID ) device[deviceId]. 00321 domain[domainId].domainId, &size, eventId ); 00322 #else 00323 err = 00324 (*cuptiEventDomainEnumEventsPtr)( ( CUpti_EventDomainID ) device[deviceId]. 00325 domain[domainId].domainId, &size, eventId ); 00326 #endif 00327 CHECK_CUPTI_ERROR( err, "cuptiEventDomainEnumEvents" ); 00328 00329 /* query event info */ 00330 for ( id = 0; id < device[deviceId].domain[domainId].eventCount; id++ ) { 00331 device[deviceId].domain[domainId].event[id].eventId = eventId[id]; 00332 00333 /* query event name */ 00334 size = PAPI_MIN_STR_LEN; 00335 #ifdef CUDA_4_0 00336 err = (*cuptiEventGetAttributePtr)( dev, 00337 device[deviceId].domain[domainId]. 00338 event[id].eventId, CUPTI_EVENT_ATTR_NAME, 00339 &size, 00340 ( uint8_t * ) device[deviceId]. 00341 domain[domainId].event[id].name ); 00342 #else 00343 err = (*cuptiEventGetAttributePtr)( device[deviceId].domain[domainId]. 00344 event[id].eventId, CUPTI_EVENT_ATTR_NAME, 00345 &size, 00346 ( uint8_t * ) device[deviceId]. 00347 domain[domainId].event[id].name ); 00348 #endif 00349 CHECK_CUPTI_ERROR( err, "cuptiEventGetAttribute" ); 00350 00351 /* query event description */ 00352 size = PAPI_2MAX_STR_LEN; 00353 #ifdef CUDA_4_0 00354 err = (*cuptiEventGetAttributePtr)( dev, 00355 device[deviceId].domain[domainId]. 00356 event[id].eventId, 00357 CUPTI_EVENT_ATTR_SHORT_DESCRIPTION, &size, 00358 ( uint8_t * ) device[deviceId]. 00359 domain[domainId].event[id].desc ); 00360 #else 00361 err = (*cuptiEventGetAttributePtr)( device[deviceId].domain[domainId]. 
00362 event[id].eventId, 00363 CUPTI_EVENT_ATTR_SHORT_DESCRIPTION, &size, 00364 ( uint8_t * ) device[deviceId]. 00365 domain[domainId].event[id].desc ); 00366 #endif 00367 CHECK_CUPTI_ERROR( err, "cuptiEventGetAttribute" ); 00368 } 00369 00370 totalEventCount += device[deviceId].domain[domainId].eventCount; 00371 free( eventId ); 00372 return 0; 00373 } 00374 00375 00376 /* 00377 * Create the native events for specified domain and device 00378 */ 00379 static int 00380 createNativeEvents( void ) 00381 { 00382 int deviceId, id = 0; 00383 uint32_t domainId, eventId; 00384 int cuptiDomainId; 00385 int i; 00386 int devNameLen; 00387 00388 /* create events for every GPU device and every domain per device */ 00389 for ( deviceId = 0; deviceId < deviceCount; deviceId++ ) { 00390 /* for the event names, replace blanks in the device name with underscores */ 00391 devNameLen = strlen( device[deviceId].name ); 00392 for ( i = 0; i < devNameLen; i++ ) 00393 if ( device[deviceId].name[i] == ' ' ) 00394 device[deviceId].name[i] = '_'; 00395 00396 for ( domainId = 0; domainId < device[deviceId].domainCount; 00397 domainId++ ) { 00398 cuptiDomainId = device[deviceId].domain[domainId].domainId; 00399 00400 for ( eventId = 0; 00401 eventId < device[deviceId].domain[domainId].eventCount; 00402 eventId++ ) { 00403 /* Save native event data */ 00404 sprintf( cuda_native_table[id].name, 00405 "%s:%s:%s", 00406 device[deviceId].name, 00407 device[deviceId].domain[domainId].name, 00408 device[deviceId].domain[domainId].event[eventId]. 00409 name ); 00410 00411 strncpy( cuda_native_table[id].description, 00412 device[deviceId].domain[domainId].event[eventId].desc, 00413 PAPI_2MAX_STR_LEN ); 00414 00415 /* The selector has to be !=0 . 
Starts with 1 */ 00416 cuda_native_table[id].resources.selector = id + 1; 00417 00418 /* store event ID */ 00419 cuda_native_table[id].resources.eventId = 00420 device[deviceId].domain[domainId].event[eventId].eventId; 00421 00422 /* increment the table index counter */ 00423 id++; 00424 } 00425 } 00426 } 00427 00428 /* Return the number of events created */ 00429 return id; 00430 } 00431 00432 00433 /* 00434 * Returns all event values from the CuPTI eventGroup 00435 */ 00436 static int 00437 getEventValue( long long *counts, CUpti_EventGroup eventGroup, AddedEvents_t addedEvents ) 00438 { 00439 CUptiResult cuptiErr = CUPTI_SUCCESS; 00440 size_t events_read, bufferSizeBytes, arraySizeBytes, i; 00441 uint64_t *counterDataBuffer; 00442 CUpti_EventID *eventIDArray; 00443 int j; 00444 00445 bufferSizeBytes = addedEvents.count * sizeof ( uint64_t ); 00446 counterDataBuffer = ( uint64_t * ) malloc( bufferSizeBytes ); 00447 00448 arraySizeBytes = addedEvents.count * sizeof ( CUpti_EventID ); 00449 eventIDArray = ( CUpti_EventID * ) malloc( arraySizeBytes ); 00450 00451 /* read counter data for the specified event from the CuPTI eventGroup */ 00452 cuptiErr = (*cuptiEventGroupReadAllEventsPtr)( eventGroup, 00453 CUPTI_EVENT_READ_FLAG_NONE, 00454 &bufferSizeBytes, 00455 counterDataBuffer, &arraySizeBytes, 00456 eventIDArray, &events_read ); 00457 CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupReadAllEvents" ); 00458 00459 if ( events_read != ( size_t ) addedEvents.count ) 00460 return -1; 00461 00462 /* Since there is no guarantee that returned counter values are in the same 00463 order as the counters in the PAPI addedEvents.list, we need to map the 00464 CUpti_EventID to PAPI event ID values. 
00465 According to CuPTI doc: counter return values of counterDataBuffer 00466 correspond to the return event IDs in eventIDArray */ 00467 for ( i = 0; i < events_read; i++ ) 00468 for ( j = 0; j < addedEvents.count; j++ ) 00469 if ( cuda_native_table[addedEvents.list[j]].resources.eventId == 00470 eventIDArray[i] ) 00471 // since cuptiEventGroupReadAllEvents() resets counter values to 0; 00472 // we have to accumulate ourselves 00473 counts[addedEvents.list[j]] = counts[addedEvents.list[j]] + counterDataBuffer[i]; 00474 00475 free( counterDataBuffer ); 00476 free( eventIDArray ); 00477 return 0; 00478 } 00479 00480 00481 /***************************************************************************** 00482 ******************* BEGIN PAPI's COMPONENT REQUIRED FUNCTIONS ************* 00483 *****************************************************************************/ 00484 00485 /* 00486 * This is called whenever a thread is initialized 00487 */ 00488 int 00489 CUDA_init_thread( hwd_context_t * ctx ) 00490 { 00491 ( void ) ctx; 00492 00493 return PAPI_OK; 00494 } 00495 00496 00497 /* Initialize hardware counters, setup the function vector table 00498 * and get hardware information, this routine is called when the 00499 * PAPI process is initialized (IE PAPI_library_init) 00500 * 00501 * NOTE: only called by main thread (not by every thread) !!! 00502 * 00503 * Starting in CUDA 4.0, multiple CPU threads can access the same CUDA context. 00504 * This is a much easier programming model then pre-4.0 as threads - using the 00505 * same context - can share memory, data, etc. 00506 * It's possible to create a different context for each thread, but then we are 00507 * likely running into a limitation that only one context can be profiled at a time. 00508 * ==> and we don't want this. That's why CUDA context creation is done in 00509 * CUDA_init_component() (called only by main thread) rather than CUDA_init() 00510 * or CUDA_init_control_state() (both called by each thread). 
00511 */ 00512 int 00513 CUDA_init_component( int cidx ) 00514 { 00515 SUBDBG ("Entry: cidx: %d\n", cidx); 00516 CUresult cuErr = CUDA_SUCCESS; 00517 00518 /* link in all the cuda libraries and resolve the symbols we need to use */ 00519 if (linkCudaLibraries() != PAPI_OK) { 00520 SUBDBG ("Dynamic link of CUDA libraries failed, component will be disabled.\n"); 00521 SUBDBG ("See disable reason in papi_component_avail output for more details.\n"); 00522 return (PAPI_ENOSUPP); 00523 } 00524 00525 /* Create dynamic event table */ 00526 NUM_EVENTS = detectDevice( ); 00527 if (NUM_EVENTS < 0) { 00528 strncpy(_cuda_vector.cmp_info.disabled_reason, "Call to detectDevice failed.",PAPI_MAX_STR_LEN); 00529 return (PAPI_ENOSUPP); 00530 } 00531 /* TODO: works only for one device right now; 00532 need to find out if user can use 2 or more devices at same time */ 00533 00534 /* want create a CUDA context for either the default device or 00535 the device specified with cudaSetDevice() in user code */ 00536 if ( CUDA_SUCCESS != (*cudaGetDevicePtr)( ¤tDeviceID ) ) { 00537 strncpy(_cuda_vector.cmp_info.disabled_reason, "No NVIDIA GPU's found.",PAPI_MAX_STR_LEN); 00538 return ( PAPI_ENOSUPP ); 00539 } 00540 00541 if ( getenv( "PAPI_VERBOSE" ) ) { 00542 printf( "DEVICE USED: %s (%d)\n", device[currentDeviceID].name, 00543 currentDeviceID ); 00544 } 00545 00546 /* get the CUDA context from the calling CPU thread */ 00547 cuErr = (*cuCtxGetCurrentPtr)( &cuCtx ); 00548 00549 /* if no CUDA context is bound to the calling CPU thread yet, create one */ 00550 if ( cuErr != CUDA_SUCCESS || cuCtx == NULL ) { 00551 cuErr = (*cuCtxCreatePtr)( &cuCtx, 0, device[currentDeviceID].dev ); 00552 CHECK_CU_ERROR( cuErr, "cuCtxCreate" ); 00553 } 00554 00555 /* cuCtxGetCurrent() can return a non-null context that is not valid 00556 because the context has not yet been initialized. 
00557 Here is a workaround: 00558 cudaFree(NULL) forces the context to be initialized 00559 if cudaFree(NULL) returns success then we are able to use the context in subsequent calls 00560 if cudaFree(NULL) returns an error (or subsequent cupti* calls) then the context is not usable, 00561 and will never be useable */ 00562 if ( CUDA_SUCCESS != (*cudaFreePtr)( NULL ) ) { 00563 strncpy(_cuda_vector.cmp_info.disabled_reason, "Problem initializing CUDA context.",PAPI_MAX_STR_LEN); 00564 return ( PAPI_ENOSUPP ); 00565 } 00566 00567 /* Create dynamic event table */ 00568 cuda_native_table = ( CUDA_native_event_entry_t * ) 00569 malloc( sizeof ( CUDA_native_event_entry_t ) * NUM_EVENTS ); 00570 if ( cuda_native_table == NULL ) { 00571 perror( "malloc(): Failed to allocate memory to events table" ); 00572 strncpy(_cuda_vector.cmp_info.disabled_reason, "Failed to allocate memory to events table.",PAPI_MAX_STR_LEN); 00573 return ( PAPI_ENOSUPP ); 00574 } 00575 00576 if ( NUM_EVENTS != createNativeEvents( ) ) { 00577 strncpy(_cuda_vector.cmp_info.disabled_reason, "Error creating CUDA event list.",PAPI_MAX_STR_LEN); 00578 return ( PAPI_ENOSUPP ); 00579 } 00580 00581 /* Export the component id */ 00582 _cuda_vector.cmp_info.CmpIdx = cidx; 00583 00584 /* Number of events */ 00585 _cuda_vector.cmp_info.num_native_events = NUM_EVENTS; 00586 00587 return ( PAPI_OK ); 00588 } 00589 00590 00591 /* 00592 * Link the necessary CUDA libraries to use the cuda component. If any of them can not be found, then 00593 * the CUDA component will just be disabled. This is done at runtime so that a version of PAPI built 00594 * with the CUDA component can be installed and used on systems which have the CUDA libraries installed 00595 * and on systems where these libraries are not installed. 
00596 */ 00597 static int 00598 linkCudaLibraries () 00599 { 00600 /* Attempt to guess if we were statically linked to libc, if so bail */ 00601 if ( _dl_non_dynamic_init != NULL ) { 00602 strncpy(_cuda_vector.cmp_info.disabled_reason, "The cuda component does not support statically linking to libc.",PAPI_MAX_STR_LEN); 00603 return PAPI_ENOSUPP; 00604 } 00605 /* Need to link in the cuda libraries, if not found disable the component */ 00606 dl1 = dlopen("libcuda.so", RTLD_NOW | RTLD_GLOBAL); 00607 if (!dl1) 00608 { 00609 strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA library libcuda.so not found.",PAPI_MAX_STR_LEN); 00610 return ( PAPI_ENOSUPP ); 00611 } 00612 cuCtxCreatePtr = dlsym(dl1, "cuCtxCreate_v2"); 00613 if (dlerror() != NULL) 00614 { 00615 strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuCtxCreate not found.",PAPI_MAX_STR_LEN); 00616 return ( PAPI_ENOSUPP ); 00617 } 00618 cuCtxDestroyPtr = dlsym(dl1, "cuCtxDestroy_v2"); 00619 if (dlerror() != NULL) 00620 { 00621 strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuCtxDestroy not found.",PAPI_MAX_STR_LEN); 00622 return ( PAPI_ENOSUPP ); 00623 } 00624 cuCtxGetCurrentPtr = dlsym(dl1, "cuCtxGetCurrent"); 00625 if (dlerror() != NULL) 00626 { 00627 strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuCtxGetCurrent not found.",PAPI_MAX_STR_LEN); 00628 return ( PAPI_ENOSUPP ); 00629 } 00630 cuDeviceGetPtr = dlsym(dl1, "cuDeviceGet"); 00631 if (dlerror() != NULL) 00632 { 00633 strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuDeviceGet not found.",PAPI_MAX_STR_LEN); 00634 return ( PAPI_ENOSUPP ); 00635 } 00636 cuDeviceGetCountPtr = dlsym(dl1, "cuDeviceGetCount"); 00637 if (dlerror() != NULL) 00638 { 00639 strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuDeviceGetCount not found.",PAPI_MAX_STR_LEN); 00640 return ( PAPI_ENOSUPP ); 00641 } 00642 cuDeviceGetNamePtr = dlsym(dl1, "cuDeviceGetName"); 00643 if (dlerror() != NULL) 00644 { 00645 
strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuDeviceGetName not found.",PAPI_MAX_STR_LEN); 00646 return ( PAPI_ENOSUPP ); 00647 } 00648 cuInitPtr = dlsym(dl1, "cuInit"); 00649 if (dlerror() != NULL) 00650 { 00651 strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuInit not found.",PAPI_MAX_STR_LEN); 00652 return ( PAPI_ENOSUPP ); 00653 } 00654 00655 dl2 = dlopen("libcudart.so", RTLD_NOW | RTLD_GLOBAL); 00656 if (!dl2) 00657 { 00658 strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA runtime library libcudart.so not found.",PAPI_MAX_STR_LEN); 00659 return ( PAPI_ENOSUPP ); 00660 } 00661 cudaFreePtr = dlsym(dl2, "cudaFree"); 00662 if (dlerror() != NULL) 00663 { 00664 strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDART function cudaFree not found.",PAPI_MAX_STR_LEN); 00665 return ( PAPI_ENOSUPP ); 00666 } 00667 cudaGetDevicePtr = dlsym(dl2, "cudaGetDevice"); 00668 if (dlerror() != NULL) 00669 { 00670 strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDART function cudaGetDevice not found.",PAPI_MAX_STR_LEN); 00671 return ( PAPI_ENOSUPP ); 00672 } 00673 cudaRuntimeGetVersionPtr = dlsym(dl2, "cudaRuntimeGetVersion"); 00674 if (dlerror() != NULL) 00675 { 00676 strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDART function cudaRuntimeGetVersion not found.",PAPI_MAX_STR_LEN); 00677 return ( PAPI_ENOSUPP ); 00678 } 00679 cudaDriverGetVersionPtr = dlsym(dl2, "cudaDriverGetVersion"); 00680 if (dlerror() != NULL) 00681 { 00682 strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDART function cudaDriverGetVersion not found.",PAPI_MAX_STR_LEN); 00683 return ( PAPI_ENOSUPP ); 00684 } 00685 00686 dl3 = dlopen("libcupti.so", RTLD_NOW | RTLD_GLOBAL); 00687 if (!dl3) 00688 { 00689 strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA runtime library libcupti.so not found.",PAPI_MAX_STR_LEN); 00690 return ( PAPI_ENOSUPP ); 00691 } 00692 cuptiDeviceEnumEventDomainsPtr = dlsym(dl3, "cuptiDeviceEnumEventDomains"); 00693 if (dlerror() != NULL) 00694 { 00695 
strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiDeviceEnumEventDomains not found.",PAPI_MAX_STR_LEN); 00696 return ( PAPI_ENOSUPP ); 00697 } 00698 cuptiDeviceGetEventDomainAttributePtr = dlsym(dl3, "cuptiDeviceGetEventDomainAttribute"); 00699 if (dlerror() != NULL) 00700 { 00701 strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiDeviceGetEventDomainAttribute not found.",PAPI_MAX_STR_LEN); 00702 return ( PAPI_ENOSUPP ); 00703 } 00704 cuptiDeviceGetNumEventDomainsPtr = dlsym(dl3, "cuptiDeviceGetNumEventDomains"); 00705 if (dlerror() != NULL) 00706 { 00707 strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiDeviceGetNumEventDomains not found.",PAPI_MAX_STR_LEN); 00708 return ( PAPI_ENOSUPP ); 00709 } 00710 cuptiEventDomainEnumEventsPtr = dlsym(dl3, "cuptiEventDomainEnumEvents"); 00711 if (dlerror() != NULL) 00712 { 00713 strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventDomainEnumEvents not found.",PAPI_MAX_STR_LEN); 00714 return ( PAPI_ENOSUPP ); 00715 } 00716 cuptiEventDomainGetNumEventsPtr = dlsym(dl3, "cuptiEventDomainGetNumEvents"); 00717 if (dlerror() != NULL) 00718 { 00719 strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventDomainGetNumEvents not found.",PAPI_MAX_STR_LEN); 00720 return ( PAPI_ENOSUPP ); 00721 } 00722 cuptiEventGetAttributePtr = dlsym(dl3, "cuptiEventGetAttribute"); 00723 if (dlerror() != NULL) 00724 { 00725 strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGetAttribute not found.",PAPI_MAX_STR_LEN); 00726 return ( PAPI_ENOSUPP ); 00727 } 00728 cuptiEventGroupAddEventPtr = dlsym(dl3, "cuptiEventGroupAddEvent"); 00729 if (dlerror() != NULL) 00730 { 00731 strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupAddEvent not found.",PAPI_MAX_STR_LEN); 00732 return ( PAPI_ENOSUPP ); 00733 } 00734 cuptiEventGroupCreatePtr = dlsym(dl3, "cuptiEventGroupCreate"); 00735 if (dlerror() != NULL) 00736 { 00737 
strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupCreate not found.",PAPI_MAX_STR_LEN); 00738 return ( PAPI_ENOSUPP ); 00739 } 00740 cuptiEventGroupDestroyPtr = dlsym(dl3, "cuptiEventGroupDestroy"); 00741 if (dlerror() != NULL) 00742 { 00743 strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupDestroy not found.",PAPI_MAX_STR_LEN); 00744 return ( PAPI_ENOSUPP ); 00745 } 00746 cuptiEventGroupDisablePtr = dlsym(dl3, "cuptiEventGroupDisable"); 00747 if (dlerror() != NULL) 00748 { 00749 strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupDisable not found.",PAPI_MAX_STR_LEN); 00750 return ( PAPI_ENOSUPP ); 00751 } 00752 cuptiEventGroupEnablePtr = dlsym(dl3, "cuptiEventGroupEnable"); 00753 if (dlerror() != NULL) 00754 { 00755 strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupEnable not found.",PAPI_MAX_STR_LEN); 00756 return ( PAPI_ENOSUPP ); 00757 } 00758 cuptiEventGroupReadAllEventsPtr = dlsym(dl3, "cuptiEventGroupReadAllEvents"); 00759 if (dlerror() != NULL) 00760 { 00761 strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupReadAllEvents not found.",PAPI_MAX_STR_LEN); 00762 return ( PAPI_ENOSUPP ); 00763 } 00764 cuptiEventGroupRemoveAllEventsPtr = dlsym(dl3, "cuptiEventGroupRemoveAllEvents"); 00765 if (dlerror() != NULL) 00766 { 00767 strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupRemoveAllEvents not found.",PAPI_MAX_STR_LEN); 00768 return ( PAPI_ENOSUPP ); 00769 } 00770 cuptiEventGroupResetAllEventsPtr = dlsym(dl3, "cuptiEventGroupResetAllEvents"); 00771 if (dlerror() != NULL) 00772 { 00773 strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupResetAllEvents not found.",PAPI_MAX_STR_LEN); 00774 return ( PAPI_ENOSUPP ); 00775 } 00776 00777 return ( PAPI_OK ); 00778 } 00779 00780 00781 /* 00782 * Control of counters (Reading/Writing/Starting/Stopping/Setup) 00783 * functions 00784 */ 
00785 int 00786 CUDA_init_control_state( hwd_control_state_t * ctrl ) 00787 { 00788 CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl; 00789 CUptiResult cuptiErr = CUPTI_SUCCESS; 00790 int i; 00791 00792 /* allocate memory for the list of events that are added to the CuPTI eventGroup */ 00793 CUDA_ctrl->addedEvents.list = malloc( sizeof ( int ) * NUM_EVENTS ); 00794 if ( CUDA_ctrl->addedEvents.list == NULL ) { 00795 perror 00796 ( "malloc(): Failed to allocate memory to table of events that are added to CuPTI eventGroup" ); 00797 return ( PAPI_ENOSUPP ); 00798 } 00799 00800 /* initialize the event list */ 00801 for ( i = 0; i < NUM_EVENTS; i++ ) 00802 CUDA_ctrl->addedEvents.list[i] = 0; 00803 00804 00805 00806 cuptiErr = (*cuptiEventGroupCreatePtr)( cuCtx, &CUDA_ctrl->eventGroup, 0 ); 00807 CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupCreate" ); 00808 00809 return PAPI_OK; 00810 } 00811 00812 00813 /* 00814 * 00815 */ 00816 int 00817 CUDA_start( hwd_context_t * ctx, hwd_control_state_t * ctrl ) 00818 { 00819 ( void ) ctx; 00820 int i; 00821 CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl; 00822 CUptiResult cuptiErr = CUPTI_SUCCESS; 00823 00824 // reset all event values to 0 00825 for ( i = 0; i < NUM_EVENTS; i++ ) 00826 CUDA_ctrl->counts[i] = 0; 00827 00828 cuptiErr = (*cuptiEventGroupEnablePtr)( CUDA_ctrl->eventGroup ); 00829 CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupEnable" ); 00830 00831 /* Resets all events in the CuPTI eventGroup to zero */ 00832 cuptiErr = (*cuptiEventGroupResetAllEventsPtr)( CUDA_ctrl->eventGroup ); 00833 CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupResetAllEvents" ); 00834 00835 return ( PAPI_OK ); 00836 } 00837 00838 00839 /* 00840 * 00841 */ 00842 int 00843 CUDA_stop( hwd_context_t * ctx, hwd_control_state_t * ctrl ) 00844 { 00845 ( void ) ctx; 00846 ( void ) ctrl; 00847 00848 return ( PAPI_OK ); 00849 } 00850 00851 00852 /* 00853 * 00854 */ 00855 int 00856 CUDA_read( hwd_context_t * ctx, 
hwd_control_state_t * ctrl, 00857 long_long ** events, int flags ) 00858 { 00859 ( void ) ctx; 00860 ( void ) flags; 00861 CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl; 00862 00863 00864 if ( 0 != getEventValue( CUDA_ctrl->counts, CUDA_ctrl->eventGroup, CUDA_ctrl->addedEvents ) ) 00865 return ( PAPI_ENOSUPP ); 00866 00867 *events = CUDA_ctrl->counts; 00868 00869 return ( PAPI_OK ); 00870 } 00871 00872 /* 00873 * 00874 */ 00875 int 00876 CUDA_shutdown_thread( hwd_context_t *ctx ) 00877 { 00878 CUDA_context_t *CUDA_ctx = (CUDA_context_t*)ctx; 00879 free( CUDA_ctx->state.addedEvents.list ); 00880 return (PAPI_OK); 00881 } 00882 00883 /* 00884 * 00885 */ 00886 int 00887 CUDA_shutdown_component( void ) 00888 { 00889 CUresult cuErr = CUDA_SUCCESS; 00890 00891 /* if running a threaded application, we need to make sure that 00892 a thread doesn't free the same memory location(s) more than once */ 00893 if ( CUDA_FREED == 0 ) { 00894 uint32_t j; 00895 int i; 00896 00897 CUDA_FREED = 1; 00898 00899 /* deallocate all the memory */ 00900 for ( i = 0; i < deviceCount; i++ ) { 00901 for ( j = 0; j < device[i].domainCount; j++ ) 00902 free( device[i].domain[j].event ); 00903 00904 free( device[i].domain ); 00905 } 00906 00907 free( device ); 00908 free( cuda_native_table ); 00909 00910 /* destroy floating CUDA context */ 00911 cuErr = (*cuCtxDestroyPtr)( cuCtx ); 00912 if ( cuErr != CUDA_SUCCESS ) 00913 return ( PAPI_ENOSUPP ); // Not supported 00914 } 00915 00916 // close the dynamic libraries needed by this component (opened in the init substrate call) 00917 dlclose(dl1); 00918 dlclose(dl2); 00919 dlclose(dl3); 00920 00921 return ( PAPI_OK ); 00922 } 00923 00924 00925 /* This function sets various options in the component 00926 * The valid codes being passed in are PAPI_SET_DEFDOM, 00927 * PAPI_SET_DOMAIN, PAPI_SETDEFGRN, PAPI_SET_GRANUL * and PAPI_SET_INHERIT 00928 */ 00929 int 00930 CUDA_ctl( hwd_context_t * ctx, int code, _papi_int_option_t * option ) 
00931 { 00932 ( void ) ctx; 00933 ( void ) code; 00934 ( void ) option; 00935 return ( PAPI_OK ); 00936 } 00937 00938 00939 //int CUDA_ntv_code_to_bits ( unsigned int EventCode, hwd_register_t * bits ); 00940 00941 00942 /* 00943 * 00944 */ 00945 int 00946 CUDA_update_control_state( hwd_control_state_t * ptr, 00947 NativeInfo_t * native, int count, 00948 hwd_context_t * ctx ) 00949 { 00950 ( void ) ctx; 00951 CUDA_control_state_t * CUDA_ptr = ( CUDA_control_state_t * ) ptr; 00952 int index, i; 00953 CUptiResult cuptiErr = CUPTI_SUCCESS; 00954 00955 /* Disable the CUDA eventGroup; 00956 it also frees the perfmon hardware on the GPU */ 00957 cuptiErr = (*cuptiEventGroupDisablePtr)( CUDA_ptr->eventGroup ); 00958 CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupDisable" ); 00959 00960 cuptiErr = (*cuptiEventGroupRemoveAllEventsPtr)( CUDA_ptr->eventGroup ); 00961 CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupRemoveAllEvents" ); 00962 00963 // otherwise, add the events to the eventset 00964 for ( i = 0; i < count; i++ ) { 00965 00966 index = native[i].ni_event; 00967 native[i].ni_position = index; 00968 00969 /* store events, that have been added to the CuPTI eveentGroup 00970 in a seperate place (addedEvents). 00971 Needed, so that we can read the values for the added events only */ 00972 CUDA_ptr->addedEvents.count = count; 00973 CUDA_ptr->addedEvents.list[i] = index; 00974 00975 /* if this device name is different from the actual device the code is running on, then exit */ 00976 if ( 0 != strncmp( device[currentDeviceID].name, 00977 cuda_native_table[index].name, 00978 strlen( device[currentDeviceID].name ) ) ) { 00979 fprintf( stderr, "Device %s is used -- BUT event %s is collected. 
\n ---> ERROR: Specify events for the device that is used!\n\n", 00980 device[currentDeviceID].name, cuda_native_table[index].name ); 00981 00982 return ( PAPI_ENOSUPP ); // Not supported 00983 } 00984 00985 /* Add events to the CuPTI eventGroup */ 00986 cuptiErr = 00987 (*cuptiEventGroupAddEventPtr)( CUDA_ptr->eventGroup, 00988 cuda_native_table[index].resources. 00989 eventId ); 00990 CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupAddEvent" ); 00991 } 00992 00993 return ( PAPI_OK ); 00994 } 00995 00996 00997 /* 00998 * This function has to set the bits needed to count different domains 00999 * In particular: PAPI_DOM_USER, PAPI_DOM_KERNEL PAPI_DOM_OTHER 01000 * By default return PAPI_EINVAL if none of those are specified 01001 * and PAPI_OK with success 01002 * PAPI_DOM_USER is only user context is counted 01003 * PAPI_DOM_KERNEL is only the Kernel/OS context is counted 01004 * PAPI_DOM_OTHER is Exception/transient mode (like user TLB misses) 01005 * PAPI_DOM_ALL is all of the domains 01006 */ 01007 int 01008 CUDA_set_domain( hwd_control_state_t * cntrl, int domain ) 01009 { 01010 int found = 0; 01011 ( void ) cntrl; 01012 01013 if ( PAPI_DOM_USER & domain ) 01014 found = 1; 01015 01016 if ( PAPI_DOM_KERNEL & domain ) 01017 found = 1; 01018 01019 if ( PAPI_DOM_OTHER & domain ) 01020 found = 1; 01021 01022 if ( !found ) 01023 return ( PAPI_EINVAL ); 01024 01025 return ( PAPI_OK ); 01026 } 01027 01028 01029 /* 01030 * 01031 */ 01032 int 01033 CUDA_reset( hwd_context_t * ctx, hwd_control_state_t * ctrl ) 01034 { 01035 ( void ) ctx; 01036 CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl; 01037 CUptiResult cuptiErr = CUPTI_SUCCESS; 01038 01039 /* Resets all events in the CuPTI eventGroup to zero */ 01040 cuptiErr = (*cuptiEventGroupResetAllEventsPtr)( CUDA_ctrl->eventGroup ); 01041 CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupResetAllEvents" ); 01042 01043 return ( PAPI_OK ); 01044 } 01045 01046 01047 /* 01048 * Disable and Destoy the CUDA eventGroup 
*/ 01049 int 01050 CUDA_cleanup_eventset( hwd_control_state_t * ctrl ) 01051 { 01052 ( void ) ctrl; 01053 01054 // TODO: after cleanup_eventset() which destroys the eventset, update_control_state() 01055 // is called, which operates on the already destroyed eventset. Bad! 01056 #if 0 01057 CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl; 01058 CUptiResult cuptiErr = CUPTI_SUCCESS; 01059 01060 /* Disable the CUDA eventGroup; 01061 it also frees the perfmon hardware on the GPU */ 01062 cuptiErr = (*cuptiEventGroupDisablePtr)( CUDA_ctrl->eventGroup ); 01063 CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupDisable" ); 01064 01065 /* Call the CuPTI cleaning function before leaving */ 01066 cuptiErr = (*cuptiEventGroupDestroyPtr)( CUDA_ctrl->eventGroup ); 01067 CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupDestroy" ); 01068 #endif 01069 return ( PAPI_OK ); 01070 } 01071 01072 01073 /* 01074 * Native Event functions 01075 */ 01076 int 01077 CUDA_ntv_enum_events( unsigned int *EventCode, int modifier ) 01078 { 01079 01080 switch ( modifier ) { 01081 case PAPI_ENUM_FIRST: 01082 *EventCode = 0; 01083 01084 return ( PAPI_OK ); 01085 break; 01086 01087 case PAPI_ENUM_EVENTS: 01088 { 01089 int index = *EventCode; 01090 01091 if ( index < NUM_EVENTS - 1 ) { 01092 *EventCode = *EventCode + 1; 01093 return ( PAPI_OK ); 01094 } else 01095 return ( PAPI_ENOEVNT ); 01096 01097 break; 01098 } 01099 default: 01100 return ( PAPI_EINVAL ); 01101 } 01102 return ( PAPI_EINVAL ); 01103 } 01104 01105 01106 /* 01107 * 01108 */ 01109 int 01110 CUDA_ntv_code_to_name( unsigned int EventCode, char *name, int len ) 01111 { 01112 int index = EventCode; 01113 01114 strncpy( name, cuda_native_table[index].name, len ); 01115 return ( PAPI_OK ); 01116 } 01117 01118 01119 /* 01120 * 01121 */ 01122 int 01123 CUDA_ntv_code_to_descr( unsigned int EventCode, char *name, int len ) 01124 { 01125 int index = EventCode; 01126 01127 strncpy( name, cuda_native_table[index].description, len ); 01128 
return ( PAPI_OK ); 01129 } 01130 01131 01132 /* 01133 * 01134 */ 01135 int 01136 CUDA_ntv_code_to_bits( unsigned int EventCode, hwd_register_t * bits ) 01137 { 01138 int index = EventCode; 01139 01140 memcpy( ( CUDA_register_t * ) bits, 01141 &( cuda_native_table[index].resources ), 01142 sizeof ( CUDA_register_t ) ); 01143 01144 return ( PAPI_OK ); 01145 } 01146 01147 01148 /* 01149 * 01150 */ 01151 papi_vector_t _cuda_vector = { 01152 .cmp_info = { 01153 /* default component information (unspecified values are initialized to 0) */ 01154 .name = "cuda", 01155 .short_name = "cuda", 01156 .version = "5.0", 01157 .description = "CuPTI provides the API for monitoring NVIDIA GPU hardware events", 01158 .num_mpx_cntrs = CUDA_MAX_COUNTERS, 01159 .num_cntrs = CUDA_MAX_COUNTERS, 01160 .default_domain = PAPI_DOM_USER, 01161 .default_granularity = PAPI_GRN_THR, 01162 .available_granularities = PAPI_GRN_THR, 01163 .hardware_intr_sig = PAPI_INT_SIGNAL, 01164 01165 /* component specific cmp_info initializations */ 01166 .fast_real_timer = 0, 01167 .fast_virtual_timer = 0, 01168 .attach = 0, 01169 .attach_must_ptrace = 0, 01170 .available_domains = PAPI_DOM_USER | PAPI_DOM_KERNEL, 01171 } 01172 , 01173 01174 /* sizes of framework-opaque component-private structures */ 01175 .size = { 01176 .context = sizeof ( CUDA_context_t ), 01177 .control_state = sizeof ( CUDA_control_state_t ), 01178 .reg_value = sizeof ( CUDA_register_t ), 01179 .reg_alloc = sizeof ( CUDA_reg_alloc_t ), 01180 } 01181 , 01182 /* function pointers in this component */ 01183 .init_thread = CUDA_init_thread, 01184 .init_component = CUDA_init_component, 01185 .init_control_state = CUDA_init_control_state, 01186 .start = CUDA_start, 01187 .stop = CUDA_stop, 01188 .read = CUDA_read, 01189 .shutdown_component = CUDA_shutdown_component, 01190 .shutdown_thread = CUDA_shutdown_thread, 01191 .cleanup_eventset = CUDA_cleanup_eventset, 01192 .ctl = CUDA_ctl, 01193 .update_control_state = CUDA_update_control_state, 01194 
.set_domain = CUDA_set_domain, 01195 .reset = CUDA_reset, 01196 01197 .ntv_enum_events = CUDA_ntv_enum_events, 01198 .ntv_code_to_name = CUDA_ntv_code_to_name, 01199 .ntv_code_to_descr = CUDA_ntv_code_to_descr, 01200 .ntv_code_to_bits = CUDA_ntv_code_to_bits, 01201 };