PAPI  5.3.0.0
linux-cuda.c
Go to the documentation of this file.
00001 /****************************/
00002 /* THIS IS OPEN SOURCE CODE */
00003 /****************************/
00004 
00017 #include <dlfcn.h>
00018 
00019 #include "papi.h"
00020 #include "papi_internal.h"
00021 #include "papi_vector.h"
00022 #include "papi_memory.h"
00023 #include "linux-cuda.h"
00024 
00025 
00026 /********  CHANGE PROTOTYPES TO DECLARE CUDA LIBRARY SYMBOLS AS WEAK  **********
00027  *  This is done so that a version of PAPI built with the cuda component can   *
00028  *  be installed on a system which does not have the cuda libraries installed. *
00029  *                                                                             *
00030  *  If this is done without these prototypes, then all papi services on the    *
00031  *  system without the cuda libraries installed will fail.  The PAPI libraries *
00032  *  contain references to the cuda libraries which are not installed.  The     *
00033  *  load of PAPI commands fails because the cuda library references can not be *
00034  *  resolved.                                                                  *
00035  *                                                                             *
00036  *  This also defines pointers to the cuda library functions that we call.     *
00037  *  These function pointers will be resolved with dlopen/dlsym calls at        *
00038  *  component initialization time.  The component then calls the cuda library  *
00039  *  functions through these function pointers.                                 *
00040  *******************************************************************************/
/* Weak reference to a glibc-internal symbol: non-NULL only when libc was
   linked statically, which linkCudaLibraries() checks to bail out early. */
void (*_dl_non_dynamic_init)(void) __attribute__((weak));

/* CUDA driver API symbols, redeclared weak so the PAPI library can be
   loaded on systems without libcuda.so installed. */
#undef CUDAAPI
#define CUDAAPI __attribute__((weak))
CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
CUresult CUDAAPI cuCtxDestroy(CUcontext);
CUresult CUDAAPI cuCtxGetCurrent(CUcontext *);
CUresult CUDAAPI cuDeviceGet(CUdevice *, int);
CUresult CUDAAPI cuDeviceGetCount(int *);
CUresult CUDAAPI cuDeviceGetName(char *, int, CUdevice);
CUresult CUDAAPI cuInit(unsigned int);

/* Function pointers resolved via dlsym() in linkCudaLibraries(); all driver
   API calls in this component go through these. */
CUresult (*cuCtxCreatePtr)(CUcontext *pctx, unsigned int flags, CUdevice dev);
CUresult (*cuCtxDestroyPtr)(CUcontext);
CUresult (*cuCtxGetCurrentPtr)(CUcontext *);
CUresult (*cuDeviceGetPtr)(CUdevice *, int);
CUresult (*cuDeviceGetCountPtr)(int *);
CUresult (*cuDeviceGetNamePtr)(char *, int, CUdevice);
CUresult (*cuInitPtr)(unsigned int);

/* CUDA runtime API symbols, also declared weak (see note above). */
#undef CUDARTAPI
#define CUDARTAPI __attribute__((weak))
cudaError_t CUDARTAPI cudaFree(void *);
cudaError_t CUDARTAPI cudaGetDevice(int *);
cudaError_t CUDARTAPI cudaRuntimeGetVersion( int *);
cudaError_t CUDARTAPI cudaDriverGetVersion( int *);

/* Runtime API function pointers, resolved from libcudart.so. */
cudaError_t (*cudaFreePtr)(void *);
cudaError_t (*cudaGetDevicePtr)(int *);
cudaError_t (*cudaRuntimeGetVersionPtr)(int *);
cudaError_t (*cudaDriverGetVersionPtr)(int *);

/* CUPTI (CUDA Profiling Tools Interface) symbols, declared weak. */
#undef CUPTIAPI
#define CUPTIAPI __attribute__((weak))
CUptiResult CUPTIAPI cuptiDeviceEnumEventDomains(CUdevice, size_t *, CUpti_EventDomainID *);
CUptiResult CUPTIAPI cuptiDeviceGetEventDomainAttribute(CUdevice, CUpti_EventDomainID, CUpti_EventDomainAttribute, size_t *, void *);
CUptiResult CUPTIAPI cuptiDeviceGetNumEventDomains(CUdevice, uint32_t *);
CUptiResult CUPTIAPI cuptiEventDomainEnumEvents(CUpti_EventDomainID, size_t*, CUpti_EventID *);
CUptiResult CUPTIAPI cuptiEventDomainGetNumEvents(CUpti_EventDomainID, uint32_t *);
CUptiResult CUPTIAPI cuptiEventGetAttribute(CUpti_EventID, CUpti_EventAttribute, size_t *, void *);
CUptiResult CUPTIAPI cuptiEventGroupAddEvent(CUpti_EventGroup, CUpti_EventID);
CUptiResult CUPTIAPI cuptiEventGroupCreate(CUcontext, CUpti_EventGroup *, uint32_t);
CUptiResult CUPTIAPI cuptiEventGroupDestroy(CUpti_EventGroup);
CUptiResult CUPTIAPI cuptiEventGroupDisable(CUpti_EventGroup);
CUptiResult CUPTIAPI cuptiEventGroupEnable(CUpti_EventGroup);
CUptiResult CUPTIAPI cuptiEventGroupReadAllEvents(CUpti_EventGroup, CUpti_ReadEventFlags, size_t *, uint64_t *, size_t *, CUpti_EventID *, size_t *);
CUptiResult CUPTIAPI cuptiEventGroupRemoveAllEvents(CUpti_EventGroup);
CUptiResult CUPTIAPI cuptiEventGroupResetAllEvents(CUpti_EventGroup);

/* CUPTI function pointers, resolved from libcupti.so. */
CUptiResult (*cuptiDeviceEnumEventDomainsPtr)(CUdevice, size_t *, CUpti_EventDomainID *);
CUptiResult (*cuptiDeviceGetEventDomainAttributePtr)(CUdevice, CUpti_EventDomainID, CUpti_EventDomainAttribute, size_t *, void *);
CUptiResult (*cuptiDeviceGetNumEventDomainsPtr)(CUdevice, uint32_t *);
CUptiResult (*cuptiEventDomainEnumEventsPtr)(CUpti_EventDomainID, size_t*, CUpti_EventID *);
CUptiResult (*cuptiEventDomainGetNumEventsPtr)(CUpti_EventDomainID, uint32_t *);
CUptiResult (*cuptiEventGetAttributePtr)(CUpti_EventID, CUpti_EventAttribute, size_t *, void *);
CUptiResult (*cuptiEventGroupAddEventPtr)(CUpti_EventGroup, CUpti_EventID);
CUptiResult (*cuptiEventGroupCreatePtr)(CUcontext, CUpti_EventGroup *, uint32_t);
CUptiResult (*cuptiEventGroupDestroyPtr)(CUpti_EventGroup);
CUptiResult (*cuptiEventGroupDisablePtr)(CUpti_EventGroup);
CUptiResult (*cuptiEventGroupEnablePtr)(CUpti_EventGroup);
CUptiResult (*cuptiEventGroupReadAllEventsPtr)(CUpti_EventGroup, CUpti_ReadEventFlags, size_t *, uint64_t *, size_t *, CUpti_EventID *, size_t *);
CUptiResult (*cuptiEventGroupRemoveAllEventsPtr)(CUpti_EventGroup);
CUptiResult (*cuptiEventGroupResetAllEventsPtr)(CUpti_EventGroup);

// file handles used to access cuda libraries with dlopen
static void* dl1 = NULL;
static void* dl2 = NULL;
static void* dl3 = NULL;

/* forward declaration; defined after the component-init functions below */
static int linkCudaLibraries ();

/* the component's PAPI vector table (function entry points + cmp_info) */
papi_vector_t _cuda_vector;
00112 
00113 
00114 /******************************************************************************
00115  ********  BEGIN FUNCTIONS USED INTERNALLY SPECIFIC TO THIS COMPONENT *********
00116  *****************************************************************************/
00117 /*
00118  * Specify device(s): Counts number of cuda events available in this system
00119  */
00120 static int
00121 detectDevice( void )
00122 {
00123     CUresult err;
00124     int skipDevice = 0;
00125     int id;
00126     char deviceName_tmp[PAPI_MIN_STR_LEN] = "init";
00127 
00128     totalEventCount = 0;
00129 
00130 /* CUDA initialization  */
00131     err = (*cuInitPtr)( 0 );
00132     if ( err != CUDA_SUCCESS ) {
00133         SUBDBG ("Info: Error from cuInit(): %d\n", err);
00134         return ( PAPI_ENOSUPP );
00135     }
00136 
00137     /* How many gpgpu devices do we have? */
00138     err = (*cuDeviceGetCountPtr)( &deviceCount );
00139     CHECK_CU_ERROR( err, "cuDeviceGetCount" );
00140     if ( deviceCount == 0 )
00141         return ( PAPI_ENOSUPP );
00142 
00143     /* allocate memory for device data table */
00144     device = ( DeviceData_t * ) malloc( sizeof ( DeviceData_t ) * deviceCount );
00145     if ( device == NULL ) {
00146         perror( "malloc(): Failed to allocate memory to CUDA device table" );
00147         return ( PAPI_ENOSUPP );
00148     }
00149 
00150     /* What are the devices? Get Name and # of domains per device */
00151     for ( id = 0; id < deviceCount; id++ ) {
00152         err = (*cuDeviceGetPtr)( &device[id].dev, id );
00153         CHECK_CU_ERROR( err, "cuDeviceGet" );
00154 
00155         err = (*cuDeviceGetNamePtr)( device[id].name, PAPI_MIN_STR_LEN, device[id].dev );
00156         CHECK_CU_ERROR( err, "cuDeviceGetName" );
00157 
00158         SUBDBG ("Cuda deviceName: %s\n", device[id].name);
00159 
00160         /* Skip device if there are multiple of the same type 
00161            and if it has been already added to the list */
00162         if ( 0 == strcmp( deviceName_tmp, device[id].name ) ) {
00163             skipDevice++;
00164             continue;
00165         }
00166 
00167         strcpy( deviceName_tmp, device[id].name );
00168 
00169         /* enumerate the domains on the device */
00170         if ( 0 != enumEventDomains( device[id].dev, id ) )
00171             return ( PAPI_ENOSUPP );
00172     }
00173 
00174     deviceCount = deviceCount - skipDevice;
00175 
00176     /* return number of events provided via CuPTI */
00177     return totalEventCount;
00178 }
00179 
00180 
00181 /*
00182  * Detect supported domains for specified device
00183  */
00184 static int
00185 enumEventDomains( CUdevice dev, int deviceId )
00186 {
00187     CUptiResult err = CUPTI_SUCCESS;
00188     CUpti_EventDomainID *domainId = NULL;
00189     uint32_t id = 0;
00190     size_t size = 0;
00191 
00192     device[deviceId].domainCount = 0;
00193 
00194     /* get number of domains for device dev */
00195     err = (*cuptiDeviceGetNumEventDomainsPtr)( dev, &device[deviceId].domainCount );
00196     CHECK_CUPTI_ERROR( err, "cuptiDeviceGetNumEventDomains" );
00197 
00198     if ( device[deviceId].domainCount == 0 ) {
00199         printf( "No domain is exposed by dev = %d\n", dev );
00200         return -1;
00201     }
00202 
00203     /* CuPTI domain struct */
00204     size = sizeof ( CUpti_EventDomainID ) * device[deviceId].domainCount;
00205     domainId = ( CUpti_EventDomainID * ) malloc( size );
00206     if ( domainId == NULL ) {
00207         perror( "malloc(): Failed to allocate memory to CuPTI domain ID" );
00208         return -1;
00209     }
00210     memset( domainId, 0, size );
00211 
00212     /* PAPI domain struct */
00213     device[deviceId].domain =
00214         ( DomainData_t * ) malloc( sizeof ( DomainData_t ) *
00215                                    device[deviceId].domainCount );
00216     if ( device[deviceId].domain == NULL ) {
00217         perror( "malloc(): Failed to allocate memory to PAPI domain struct" );
00218         free(domainId);
00219         return -1;
00220     }
00221 
00222     /* Enumerates the event domains for a device dev */
00223     err = (*cuptiDeviceEnumEventDomainsPtr)( dev, &size, domainId );
00224     CHECK_CUPTI_ERROR( err, "cuptiDeviceEnumEventDomains" );
00225 
00226     /* enum domains */
00227     for ( id = 0; id < device[deviceId].domainCount; id++ ) {
00228         device[deviceId].domain[id].domainId = domainId[id];
00229 
00230         /* query domain name */
00231         size = PAPI_MIN_STR_LEN;
00232 #ifdef CUDA_4_0
00233         err = cuptiEventDomainGetAttribute( dev,
00234                                            device[deviceId].domain[id].
00235                                            domainId,
00236                                            CUPTI_EVENT_DOMAIN_ATTR_NAME, &size,
00237                                            ( void * ) device[deviceId].
00238                                            domain[id].name );
00239         CHECK_CUPTI_ERROR( err, "cuptiEventDomainGetAttribute" );
00240         
00241         /* query num of events avaialble in the domain */
00242         size = sizeof ( device[deviceId].domain[id].eventCount );
00243         err = cuptiEventDomainGetAttribute( dev,
00244                                            device[deviceId].domain[id].
00245                                            domainId,
00246                                            CUPTI_EVENT_DOMAIN_MAX_EVENTS,
00247                                            &size,
00248                                            ( void * ) &device[deviceId].
00249                                            domain[id].eventCount );
00250         CHECK_CUPTI_ERROR( err, "cuptiEventDomainGetAttribute" );
00251         
00252         /* enumerate the events for the domain[id] on the device dev */
00253         if ( 0 != enumEvents( dev, deviceId, id ) )
00254             return -1;
00255 #else
00256         err = (*cuptiDeviceGetEventDomainAttributePtr)( dev,
00257                                                   device[deviceId].domain[id].domainId,
00258                                                   CUPTI_EVENT_DOMAIN_ATTR_NAME, &size,
00259                                                   ( void * ) device[deviceId].domain[id].name );
00260         CHECK_CUPTI_ERROR( err, "cuptiDeviceGetEventDomainAttribute" );
00261 
00262         /* query num of events avaialble in the domain */
00263         err = (*cuptiEventDomainGetNumEventsPtr)( device[deviceId].domain[id].domainId,
00264                                             &device[deviceId].domain[id].eventCount );
00265         CHECK_CUPTI_ERROR( err, "cuptiEventDomainGetNumEvents" );
00266 
00267         /* enumerate the events for the domain[id] on the device deviceId */
00268         if ( 0 != enumEvents( deviceId, id ) )
00269             return -1;
00270 #endif
00271     }
00272 
00273     totalDomainCount += device[deviceId].domainCount;
00274     free( domainId );
00275     return 0;
00276 }
00277 
00278 
00279 /*
00280  * Detect supported events for specified device domain
00281  */
00282 #ifdef CUDA_4_0
00283 static int
00284 enumEvents( CUdevice dev, int deviceId, int domainId )
00285 #else
00286 static int
00287 enumEvents( int deviceId, int domainId )
00288 #endif
00289 {
00290     CUptiResult err = CUPTI_SUCCESS;
00291     CUpti_EventID *eventId = NULL;
00292     size_t size = 0;
00293     uint32_t id = 0;
00294 
00295     /* CuPTI event struct */
00296     size =
00297         sizeof ( CUpti_EventID ) * device[deviceId].domain[domainId].eventCount;
00298     eventId = ( CUpti_EventID * ) malloc( size );
00299     if ( eventId == NULL ) {
00300         perror( "malloc(): Failed to allocate memory to CuPTI event ID" );
00301         return -1;
00302     }
00303     memset( eventId, 0, size );
00304 
00305     /* PAPI event struct */
00306     device[deviceId].domain[domainId].event =
00307         ( EventData_t * ) malloc( sizeof ( EventData_t ) *
00308                                   device[deviceId].domain[domainId].
00309                                   eventCount );
00310     if ( device[deviceId].domain[domainId].event == NULL ) {
00311         perror( "malloc(): Failed to allocate memory to PAPI event struct" );
00312         free(eventId);
00313         return -1;
00314     }
00315 
00316     /* enumerate the events for the domain[domainId] on the device[deviceId] */
00317 #ifdef CUDA_4_0
00318     err =
00319         (*cuptiEventDomainEnumEventsPtr)( dev,
00320                                ( CUpti_EventDomainID ) device[deviceId].
00321                                domain[domainId].domainId, &size, eventId );
00322 #else
00323     err =
00324         (*cuptiEventDomainEnumEventsPtr)( ( CUpti_EventDomainID ) device[deviceId].
00325                                     domain[domainId].domainId, &size, eventId );
00326 #endif
00327     CHECK_CUPTI_ERROR( err, "cuptiEventDomainEnumEvents" );
00328 
00329     /* query event info */
00330     for ( id = 0; id < device[deviceId].domain[domainId].eventCount; id++ ) {
00331         device[deviceId].domain[domainId].event[id].eventId = eventId[id];
00332 
00333         /* query event name */
00334         size = PAPI_MIN_STR_LEN;
00335 #ifdef CUDA_4_0
00336         err = (*cuptiEventGetAttributePtr)( dev,
00337                                      device[deviceId].domain[domainId].
00338                                      event[id].eventId, CUPTI_EVENT_ATTR_NAME,
00339                                      &size,
00340                                      ( uint8_t * ) device[deviceId].
00341                                      domain[domainId].event[id].name );     
00342 #else
00343         err = (*cuptiEventGetAttributePtr)( device[deviceId].domain[domainId].
00344                                       event[id].eventId, CUPTI_EVENT_ATTR_NAME,
00345                                       &size,
00346                                       ( uint8_t * ) device[deviceId].
00347                                       domain[domainId].event[id].name );
00348 #endif
00349         CHECK_CUPTI_ERROR( err, "cuptiEventGetAttribute" );
00350 
00351         /* query event description */
00352         size = PAPI_2MAX_STR_LEN;
00353 #ifdef CUDA_4_0
00354         err = (*cuptiEventGetAttributePtr)( dev,
00355                                      device[deviceId].domain[domainId].
00356                                      event[id].eventId,
00357                                      CUPTI_EVENT_ATTR_SHORT_DESCRIPTION, &size,
00358                                      ( uint8_t * ) device[deviceId].
00359                                      domain[domainId].event[id].desc );     
00360 #else
00361         err = (*cuptiEventGetAttributePtr)( device[deviceId].domain[domainId].
00362                                       event[id].eventId,
00363                                       CUPTI_EVENT_ATTR_SHORT_DESCRIPTION, &size,
00364                                       ( uint8_t * ) device[deviceId].
00365                                       domain[domainId].event[id].desc );
00366 #endif
00367         CHECK_CUPTI_ERROR( err, "cuptiEventGetAttribute" );
00368     }
00369 
00370     totalEventCount += device[deviceId].domain[domainId].eventCount;
00371     free( eventId );
00372     return 0;
00373 }
00374 
00375 
00376 /*
00377  * Create the native events for specified domain and device
00378  */
00379 static int
00380 createNativeEvents( void )
00381 {
00382     int deviceId, id = 0;
00383     uint32_t domainId, eventId;
00384     int cuptiDomainId;
00385     int i;
00386     int devNameLen;
00387 
00388     /* create events for every GPU device and every domain per device  */
00389     for ( deviceId = 0; deviceId < deviceCount; deviceId++ ) {
00390         /* for the event names, replace blanks in the device name with underscores */
00391         devNameLen = strlen( device[deviceId].name );
00392         for ( i = 0; i < devNameLen; i++ )
00393             if ( device[deviceId].name[i] == ' ' )
00394                 device[deviceId].name[i] = '_';
00395 
00396         for ( domainId = 0; domainId < device[deviceId].domainCount;
00397               domainId++ ) {
00398             cuptiDomainId = device[deviceId].domain[domainId].domainId;
00399 
00400             for ( eventId = 0;
00401                   eventId < device[deviceId].domain[domainId].eventCount;
00402                   eventId++ ) {
00403                 /* Save native event data */
00404                 sprintf( cuda_native_table[id].name,
00405                          "%s:%s:%s",
00406                          device[deviceId].name,
00407                          device[deviceId].domain[domainId].name,
00408                          device[deviceId].domain[domainId].event[eventId].
00409                          name );
00410 
00411                 strncpy( cuda_native_table[id].description,
00412                          device[deviceId].domain[domainId].event[eventId].desc,
00413                          PAPI_2MAX_STR_LEN );
00414 
00415                 /* The selector has to be !=0 . Starts with 1 */
00416                 cuda_native_table[id].resources.selector = id + 1;
00417 
00418                 /* store event ID */
00419                 cuda_native_table[id].resources.eventId =
00420                     device[deviceId].domain[domainId].event[eventId].eventId;
00421 
00422                 /* increment the table index counter */
00423                 id++;
00424             }
00425         }
00426     }
00427 
00428     /* Return the number of events created */
00429     return id;
00430 }
00431 
00432 
00433 /*
00434  * Returns all event values from the CuPTI eventGroup 
00435  */
00436 static int
00437 getEventValue( long long *counts, CUpti_EventGroup eventGroup, AddedEvents_t addedEvents )
00438 {
00439     CUptiResult cuptiErr = CUPTI_SUCCESS;
00440     size_t events_read, bufferSizeBytes, arraySizeBytes, i;
00441     uint64_t *counterDataBuffer;
00442     CUpti_EventID *eventIDArray;
00443     int j;
00444 
00445     bufferSizeBytes = addedEvents.count * sizeof ( uint64_t );
00446     counterDataBuffer = ( uint64_t * ) malloc( bufferSizeBytes );
00447 
00448     arraySizeBytes = addedEvents.count * sizeof ( CUpti_EventID );
00449     eventIDArray = ( CUpti_EventID * ) malloc( arraySizeBytes );
00450 
00451     /* read counter data for the specified event from the CuPTI eventGroup */
00452     cuptiErr = (*cuptiEventGroupReadAllEventsPtr)( eventGroup,
00453                                              CUPTI_EVENT_READ_FLAG_NONE,
00454                                              &bufferSizeBytes,
00455                                              counterDataBuffer, &arraySizeBytes,
00456                                              eventIDArray, &events_read );
00457     CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupReadAllEvents" );
00458 
00459     if ( events_read != ( size_t ) addedEvents.count )
00460         return -1;
00461 
00462     /* Since there is no guarantee that returned counter values are in the same 
00463        order as the counters in the PAPI addedEvents.list, we need to map the
00464        CUpti_EventID to PAPI event ID values.
00465        According to CuPTI doc: counter return values of counterDataBuffer 
00466        correspond to the return event IDs in eventIDArray */
00467     for ( i = 0; i < events_read; i++ )
00468         for ( j = 0; j < addedEvents.count; j++ )
00469             if ( cuda_native_table[addedEvents.list[j]].resources.eventId ==
00470                  eventIDArray[i] )
00471                 // since cuptiEventGroupReadAllEvents() resets counter values to 0;
00472                 // we have to accumulate ourselves 
00473                 counts[addedEvents.list[j]] = counts[addedEvents.list[j]] + counterDataBuffer[i];
00474 
00475     free( counterDataBuffer );
00476     free( eventIDArray );
00477     return 0;
00478 }
00479 
00480 
00481 /*****************************************************************************
00482  *******************  BEGIN PAPI's COMPONENT REQUIRED FUNCTIONS  *************
00483  *****************************************************************************/
00484 
00485 /*
00486  * This is called whenever a thread is initialized
00487  */
00488 int
00489 CUDA_init_thread( hwd_context_t * ctx )
00490 {
00491     ( void ) ctx;
00492     
00493     return PAPI_OK;
00494 }
00495 
00496 
00497 /* Initialize hardware counters, setup the function vector table
00498  * and get hardware information, this routine is called when the 
00499  * PAPI process is initialized (IE PAPI_library_init)
00500  *
00501  * NOTE: only called by main thread (not by every thread) !!!
00502  *
00503  * Starting in CUDA 4.0, multiple CPU threads can access the same CUDA context.
00504  * This is a much easier programming model then pre-4.0 as threads - using the 
00505  * same context - can share memory, data, etc. 
00506  * It's possible to create a different context for each thread, but then we are
00507  * likely running into a limitation that only one context can be profiled at a time.
00508  * ==> and we don't want this. That's why CUDA context creation is done in 
00509  * CUDA_init_component() (called only by main thread) rather than CUDA_init() 
00510  * or CUDA_init_control_state() (both called by each thread).
00511  */
00512 int
00513 CUDA_init_component( int cidx )
00514 {
00515     SUBDBG ("Entry: cidx: %d\n", cidx);
00516     CUresult cuErr = CUDA_SUCCESS;
00517 
00518     /* link in all the cuda libraries and resolve the symbols we need to use */
00519     if (linkCudaLibraries() != PAPI_OK) {
00520         SUBDBG ("Dynamic link of CUDA libraries failed, component will be disabled.\n");
00521         SUBDBG ("See disable reason in papi_component_avail output for more details.\n");
00522         return (PAPI_ENOSUPP);
00523     }
00524 
00525     /* Create dynamic event table */
00526     NUM_EVENTS = detectDevice(  );
00527     if (NUM_EVENTS < 0) {
00528         strncpy(_cuda_vector.cmp_info.disabled_reason, "Call to detectDevice failed.",PAPI_MAX_STR_LEN);
00529         return (PAPI_ENOSUPP);
00530     }
00531     /* TODO: works only for one device right now;
00532      need to find out if user can use 2 or more devices at same time */
00533 
00534     /* want create a CUDA context for either the default device or
00535      the device specified with cudaSetDevice() in user code */
00536     if ( CUDA_SUCCESS != (*cudaGetDevicePtr)( &currentDeviceID ) ) {
00537         strncpy(_cuda_vector.cmp_info.disabled_reason, "No NVIDIA GPU's found.",PAPI_MAX_STR_LEN);
00538         return ( PAPI_ENOSUPP );
00539     }
00540     
00541     if ( getenv( "PAPI_VERBOSE" ) ) {
00542         printf( "DEVICE USED: %s (%d)\n", device[currentDeviceID].name,
00543                currentDeviceID );
00544     }
00545     
00546     /* get the CUDA context from the calling CPU thread */
00547     cuErr = (*cuCtxGetCurrentPtr)( &cuCtx );
00548 
00549     /* if no CUDA context is bound to the calling CPU thread yet, create one */
00550     if ( cuErr != CUDA_SUCCESS || cuCtx == NULL ) {
00551         cuErr = (*cuCtxCreatePtr)( &cuCtx, 0, device[currentDeviceID].dev );
00552         CHECK_CU_ERROR( cuErr, "cuCtxCreate" );
00553     }
00554 
00555     /* cuCtxGetCurrent() can return a non-null context that is not valid 
00556        because the context has not yet been initialized.
00557        Here is a workaround: 
00558        cudaFree(NULL) forces the context to be initialized
00559        if cudaFree(NULL) returns success then we are able to use the context in subsequent calls
00560        if cudaFree(NULL) returns an error (or subsequent cupti* calls) then the context is not usable,
00561        and will never be useable */
00562     if ( CUDA_SUCCESS != (*cudaFreePtr)( NULL ) ) {
00563         strncpy(_cuda_vector.cmp_info.disabled_reason, "Problem initializing CUDA context.",PAPI_MAX_STR_LEN);
00564         return ( PAPI_ENOSUPP );
00565     }
00566 
00567     /* Create dynamic event table */
00568     cuda_native_table = ( CUDA_native_event_entry_t * )
00569         malloc( sizeof ( CUDA_native_event_entry_t ) * NUM_EVENTS );
00570     if ( cuda_native_table == NULL ) {
00571         perror( "malloc(): Failed to allocate memory to events table" );
00572         strncpy(_cuda_vector.cmp_info.disabled_reason, "Failed to allocate memory to events table.",PAPI_MAX_STR_LEN);
00573         return ( PAPI_ENOSUPP );
00574     }
00575 
00576     if ( NUM_EVENTS != createNativeEvents(  ) ) {
00577         strncpy(_cuda_vector.cmp_info.disabled_reason, "Error creating CUDA event list.",PAPI_MAX_STR_LEN);
00578         return ( PAPI_ENOSUPP );
00579     }
00580     
00581     /* Export the component id */
00582     _cuda_vector.cmp_info.CmpIdx = cidx;
00583 
00584     /* Number of events */
00585     _cuda_vector.cmp_info.num_native_events = NUM_EVENTS;
00586 
00587     return ( PAPI_OK );
00588 }
00589 
00590 
00591 /*
00592  * Link the necessary CUDA libraries to use the cuda component.  If any of them can not be found, then
00593  * the CUDA component will just be disabled.  This is done at runtime so that a version of PAPI built
00594  * with the CUDA component can be installed and used on systems which have the CUDA libraries installed
00595  * and on systems where these libraries are not installed.
00596  */
00597 static int 
00598 linkCudaLibraries ()
00599 {
00600         /* Attempt to guess if we were statically linked to libc, if so bail */
00601         if ( _dl_non_dynamic_init != NULL ) {
00602                 strncpy(_cuda_vector.cmp_info.disabled_reason, "The cuda component does not support statically linking to libc.",PAPI_MAX_STR_LEN);
00603                 return PAPI_ENOSUPP;
00604         }
00605     /* Need to link in the cuda libraries, if not found disable the component */
00606     dl1 = dlopen("libcuda.so", RTLD_NOW | RTLD_GLOBAL);
00607     if (!dl1)
00608     {
00609         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA library libcuda.so not found.",PAPI_MAX_STR_LEN);
00610         return ( PAPI_ENOSUPP );
00611     }
00612     cuCtxCreatePtr = dlsym(dl1, "cuCtxCreate_v2");
00613     if (dlerror() != NULL)
00614     {
00615         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuCtxCreate not found.",PAPI_MAX_STR_LEN);
00616         return ( PAPI_ENOSUPP );
00617     }
00618     cuCtxDestroyPtr = dlsym(dl1, "cuCtxDestroy_v2");
00619     if (dlerror() != NULL)
00620     {
00621         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuCtxDestroy not found.",PAPI_MAX_STR_LEN);
00622         return ( PAPI_ENOSUPP );
00623     }
00624     cuCtxGetCurrentPtr = dlsym(dl1, "cuCtxGetCurrent");
00625     if (dlerror() != NULL)
00626     {
00627         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuCtxGetCurrent not found.",PAPI_MAX_STR_LEN);
00628         return ( PAPI_ENOSUPP );
00629     }
00630     cuDeviceGetPtr = dlsym(dl1, "cuDeviceGet");
00631     if (dlerror() != NULL)
00632     {
00633         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuDeviceGet not found.",PAPI_MAX_STR_LEN);
00634         return ( PAPI_ENOSUPP );
00635     }
00636     cuDeviceGetCountPtr = dlsym(dl1, "cuDeviceGetCount");
00637     if (dlerror() != NULL)
00638     {
00639         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuDeviceGetCount not found.",PAPI_MAX_STR_LEN);
00640         return ( PAPI_ENOSUPP );
00641     }
00642     cuDeviceGetNamePtr = dlsym(dl1, "cuDeviceGetName");
00643     if (dlerror() != NULL)
00644     {
00645         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuDeviceGetName not found.",PAPI_MAX_STR_LEN);
00646         return ( PAPI_ENOSUPP );
00647     }
00648     cuInitPtr = dlsym(dl1, "cuInit");
00649     if (dlerror() != NULL)
00650     {
00651         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA function cuInit not found.",PAPI_MAX_STR_LEN);
00652         return ( PAPI_ENOSUPP );
00653     }
00654 
00655     dl2 = dlopen("libcudart.so", RTLD_NOW | RTLD_GLOBAL);
00656     if (!dl2)
00657     {
00658         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA runtime library libcudart.so not found.",PAPI_MAX_STR_LEN);
00659         return ( PAPI_ENOSUPP );
00660     }
00661     cudaFreePtr = dlsym(dl2, "cudaFree");
00662     if (dlerror() != NULL)
00663     {
00664         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDART function cudaFree not found.",PAPI_MAX_STR_LEN);
00665         return ( PAPI_ENOSUPP );
00666     }
00667     cudaGetDevicePtr = dlsym(dl2, "cudaGetDevice");
00668     if (dlerror() != NULL)
00669     {
00670         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDART function cudaGetDevice not found.",PAPI_MAX_STR_LEN);
00671         return ( PAPI_ENOSUPP );
00672     }
00673     cudaRuntimeGetVersionPtr = dlsym(dl2, "cudaRuntimeGetVersion");
00674     if (dlerror() != NULL)
00675     {
00676         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDART function cudaRuntimeGetVersion not found.",PAPI_MAX_STR_LEN);
00677         return ( PAPI_ENOSUPP );
00678     }
00679     cudaDriverGetVersionPtr = dlsym(dl2, "cudaDriverGetVersion");
00680     if (dlerror() != NULL)
00681     {
00682         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDART function cudaDriverGetVersion not found.",PAPI_MAX_STR_LEN);
00683         return ( PAPI_ENOSUPP );
00684     }
00685 
00686     dl3 = dlopen("libcupti.so", RTLD_NOW | RTLD_GLOBAL);
00687     if (!dl3)
00688     {
00689         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUDA runtime library libcupti.so not found.",PAPI_MAX_STR_LEN);
00690         return ( PAPI_ENOSUPP );
00691     }
00692     cuptiDeviceEnumEventDomainsPtr = dlsym(dl3, "cuptiDeviceEnumEventDomains");
00693     if (dlerror() != NULL)
00694     {
00695         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiDeviceEnumEventDomains not found.",PAPI_MAX_STR_LEN);
00696         return ( PAPI_ENOSUPP );
00697     }
00698     cuptiDeviceGetEventDomainAttributePtr = dlsym(dl3, "cuptiDeviceGetEventDomainAttribute");
00699     if (dlerror() != NULL)
00700     {
00701         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiDeviceGetEventDomainAttribute not found.",PAPI_MAX_STR_LEN);
00702         return ( PAPI_ENOSUPP );
00703     }
00704     cuptiDeviceGetNumEventDomainsPtr = dlsym(dl3, "cuptiDeviceGetNumEventDomains");
00705     if (dlerror() != NULL)
00706     {
00707         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiDeviceGetNumEventDomains not found.",PAPI_MAX_STR_LEN);
00708         return ( PAPI_ENOSUPP );
00709     }
00710     cuptiEventDomainEnumEventsPtr = dlsym(dl3, "cuptiEventDomainEnumEvents");
00711     if (dlerror() != NULL)
00712     {
00713         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventDomainEnumEvents not found.",PAPI_MAX_STR_LEN);
00714         return ( PAPI_ENOSUPP );
00715     }
00716     cuptiEventDomainGetNumEventsPtr = dlsym(dl3, "cuptiEventDomainGetNumEvents");
00717     if (dlerror() != NULL)
00718     {
00719         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventDomainGetNumEvents not found.",PAPI_MAX_STR_LEN);
00720         return ( PAPI_ENOSUPP );
00721     }
00722     cuptiEventGetAttributePtr = dlsym(dl3, "cuptiEventGetAttribute");
00723     if (dlerror() != NULL)
00724     {
00725         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGetAttribute not found.",PAPI_MAX_STR_LEN);
00726         return ( PAPI_ENOSUPP );
00727     }
00728     cuptiEventGroupAddEventPtr = dlsym(dl3, "cuptiEventGroupAddEvent");
00729     if (dlerror() != NULL)
00730     {
00731         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupAddEvent not found.",PAPI_MAX_STR_LEN);
00732         return ( PAPI_ENOSUPP );
00733     }
00734     cuptiEventGroupCreatePtr = dlsym(dl3, "cuptiEventGroupCreate");
00735     if (dlerror() != NULL)
00736     {
00737         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupCreate not found.",PAPI_MAX_STR_LEN);
00738         return ( PAPI_ENOSUPP );
00739     }
00740     cuptiEventGroupDestroyPtr = dlsym(dl3, "cuptiEventGroupDestroy");
00741     if (dlerror() != NULL)
00742     {
00743         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupDestroy not found.",PAPI_MAX_STR_LEN);
00744         return ( PAPI_ENOSUPP );
00745     }
00746     cuptiEventGroupDisablePtr = dlsym(dl3, "cuptiEventGroupDisable");
00747     if (dlerror() != NULL)
00748     {
00749         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupDisable not found.",PAPI_MAX_STR_LEN);
00750         return ( PAPI_ENOSUPP );
00751     }
00752     cuptiEventGroupEnablePtr = dlsym(dl3, "cuptiEventGroupEnable");
00753     if (dlerror() != NULL)
00754     {
00755         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupEnable not found.",PAPI_MAX_STR_LEN);
00756         return ( PAPI_ENOSUPP );
00757     }
00758     cuptiEventGroupReadAllEventsPtr = dlsym(dl3, "cuptiEventGroupReadAllEvents");
00759     if (dlerror() != NULL)
00760     {
00761         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupReadAllEvents not found.",PAPI_MAX_STR_LEN);
00762         return ( PAPI_ENOSUPP );
00763     }
00764     cuptiEventGroupRemoveAllEventsPtr = dlsym(dl3, "cuptiEventGroupRemoveAllEvents");
00765     if (dlerror() != NULL)
00766     {
00767         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupRemoveAllEvents not found.",PAPI_MAX_STR_LEN);
00768         return ( PAPI_ENOSUPP );
00769     }
00770     cuptiEventGroupResetAllEventsPtr = dlsym(dl3, "cuptiEventGroupResetAllEvents");
00771     if (dlerror() != NULL)
00772     {
00773         strncpy(_cuda_vector.cmp_info.disabled_reason, "CUPTI function cuptiEventGroupResetAllEvents not found.",PAPI_MAX_STR_LEN);
00774         return ( PAPI_ENOSUPP );
00775     }
00776 
00777     return ( PAPI_OK );
00778 }
00779 
00780 
00781 /*
00782  * Control of counters (Reading/Writing/Starting/Stopping/Setup)
00783  * functions
00784  */
00785 int
00786 CUDA_init_control_state( hwd_control_state_t * ctrl )
00787 {
00788     CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl;
00789     CUptiResult cuptiErr = CUPTI_SUCCESS;
00790     int i;
00791 
00792     /* allocate memory for the list of events that are added to the CuPTI eventGroup */
00793     CUDA_ctrl->addedEvents.list = malloc( sizeof ( int ) * NUM_EVENTS );
00794     if ( CUDA_ctrl->addedEvents.list == NULL ) {
00795         perror
00796         ( "malloc(): Failed to allocate memory to table of events that are added to CuPTI eventGroup" );
00797         return ( PAPI_ENOSUPP );
00798     }
00799     
00800     /* initialize the event list */
00801     for ( i = 0; i < NUM_EVENTS; i++ )
00802         CUDA_ctrl->addedEvents.list[i] = 0;
00803 
00804     
00805     
00806     cuptiErr = (*cuptiEventGroupCreatePtr)( cuCtx, &CUDA_ctrl->eventGroup, 0 );
00807     CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupCreate" );
00808     
00809     return PAPI_OK;
00810 }
00811 
00812 
00813 /*
00814  *
00815  */
00816 int
00817 CUDA_start( hwd_context_t * ctx, hwd_control_state_t * ctrl )
00818 {
00819     ( void ) ctx;
00820     int i;
00821     CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl;
00822     CUptiResult cuptiErr = CUPTI_SUCCESS;
00823     
00824     // reset all event values to 0
00825     for ( i = 0; i < NUM_EVENTS; i++ )
00826         CUDA_ctrl->counts[i] = 0;
00827 
00828     cuptiErr = (*cuptiEventGroupEnablePtr)( CUDA_ctrl->eventGroup );
00829     CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupEnable" );
00830 
00831     /* Resets all events in the CuPTI eventGroup to zero */
00832     cuptiErr = (*cuptiEventGroupResetAllEventsPtr)( CUDA_ctrl->eventGroup );
00833     CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupResetAllEvents" );
00834 
00835     return ( PAPI_OK );
00836 }
00837 
00838 
00839 /*
00840  *
00841  */
00842 int
00843 CUDA_stop( hwd_context_t * ctx, hwd_control_state_t * ctrl )
00844 {
00845     ( void ) ctx;
00846     ( void ) ctrl;
00847 
00848     return ( PAPI_OK );
00849 }
00850 
00851 
00852 /*
00853  *
00854  */
00855 int
00856 CUDA_read( hwd_context_t * ctx, hwd_control_state_t * ctrl,
00857            long_long ** events, int flags )
00858 {
00859     ( void ) ctx;
00860     ( void ) flags;
00861     CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl;
00862 
00863 
00864     if ( 0 != getEventValue( CUDA_ctrl->counts, CUDA_ctrl->eventGroup, CUDA_ctrl->addedEvents ) )
00865         return ( PAPI_ENOSUPP );
00866 
00867     *events = CUDA_ctrl->counts;
00868 
00869     return ( PAPI_OK );
00870 }
00871 
00872 /* 
00873  *
00874  */
00875 int
00876 CUDA_shutdown_thread( hwd_context_t *ctx )
00877 {
00878     CUDA_context_t *CUDA_ctx = (CUDA_context_t*)ctx;
00879     free( CUDA_ctx->state.addedEvents.list );
00880     return (PAPI_OK);
00881 }
00882 
00883 /*
00884  *
00885  */
/* Component-wide shutdown: free the per-device event tables, destroy the
 * floating CUDA context created at init, and unload the dynamic libraries
 * (dl1/dl2/dl3) that were dlopen'd during component initialization.
 * Returns PAPI_OK, or PAPI_ENOSUPP if the CUDA context cannot be destroyed.
 */
int
CUDA_shutdown_component( void )
{
    CUresult cuErr = CUDA_SUCCESS;
    
    /* if running a threaded application, we need to make sure that 
       a thread doesn't free the same memory location(s) more than once */
    if ( CUDA_FREED == 0 ) {
        uint32_t j;
        int i;
        
        /* flag set before freeing so a racing thread skips the block
           (NOTE(review): not a true atomic guard — assumes shutdown is not
           called concurrently; confirm with framework locking) */
        CUDA_FREED = 1;

        /* deallocate all the memory: per-domain event arrays, then the
           domain array of each device, then the device table itself */
        for ( i = 0; i < deviceCount; i++ ) {
            for ( j = 0; j < device[i].domainCount; j++ )
                free( device[i].domain[j].event );
            
            free( device[i].domain );
        }

        free( device );
        free( cuda_native_table );
        
        /* destroy floating CUDA context */
        cuErr = (*cuCtxDestroyPtr)( cuCtx );
        if ( cuErr != CUDA_SUCCESS )
            return ( PAPI_ENOSUPP );            // Not supported
    }

    // close the dynamic libraries needed by this component (opened in the init substrate call)
    dlclose(dl1);
    dlclose(dl2);
    dlclose(dl3);

    return ( PAPI_OK );
}
00923 
00924 
00925 /* This function sets various options in the component
00926  * The valid codes being passed in are PAPI_SET_DEFDOM,
00927  * PAPI_SET_DOMAIN, PAPI_SETDEFGRN, PAPI_SET_GRANUL * and PAPI_SET_INHERIT
00928  */
00929 int
00930 CUDA_ctl( hwd_context_t * ctx, int code, _papi_int_option_t * option )
00931 {
00932     ( void ) ctx;
00933     ( void ) code;
00934     ( void ) option;
00935     return ( PAPI_OK );
00936 }
00937 
00938 
00939 //int CUDA_ntv_code_to_bits ( unsigned int EventCode, hwd_register_t * bits );
00940 
00941 
00942 /*
00943  *
00944  */
00945 int
00946 CUDA_update_control_state( hwd_control_state_t * ptr,
00947                            NativeInfo_t * native, int count,
00948                            hwd_context_t * ctx )
00949 {
00950     ( void ) ctx;
00951     CUDA_control_state_t * CUDA_ptr = ( CUDA_control_state_t * ) ptr;
00952     int index, i;
00953     CUptiResult cuptiErr = CUPTI_SUCCESS;
00954 
00955     /* Disable the CUDA eventGroup;
00956      it also frees the perfmon hardware on the GPU */
00957     cuptiErr = (*cuptiEventGroupDisablePtr)( CUDA_ptr->eventGroup );
00958     CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupDisable" );
00959 
00960     cuptiErr = (*cuptiEventGroupRemoveAllEventsPtr)( CUDA_ptr->eventGroup );
00961     CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupRemoveAllEvents" );
00962     
00963     // otherwise, add the events to the eventset
00964     for ( i = 0; i < count; i++ ) {
00965        
00966         index = native[i].ni_event;
00967         native[i].ni_position = index;
00968 
00969         /* store events, that have been added to the CuPTI eveentGroup 
00970            in a seperate place (addedEvents).
00971            Needed, so that we can read the values for the added events only */
00972         CUDA_ptr->addedEvents.count = count;
00973         CUDA_ptr->addedEvents.list[i] = index;
00974 
00975         /* if this device name is different from the actual device the code is running on, then exit */
00976         if ( 0 != strncmp( device[currentDeviceID].name,
00977                            cuda_native_table[index].name,
00978                            strlen( device[currentDeviceID].name ) ) ) {
00979             fprintf( stderr, "Device %s is used -- BUT event %s is collected. \n ---> ERROR: Specify events for the device that is used!\n\n",
00980                   device[currentDeviceID].name, cuda_native_table[index].name );
00981             
00982             return ( PAPI_ENOSUPP );    // Not supported 
00983         }
00984 
00985         /* Add events to the CuPTI eventGroup */
00986         cuptiErr =
00987             (*cuptiEventGroupAddEventPtr)( CUDA_ptr->eventGroup,
00988                                      cuda_native_table[index].resources.
00989                                      eventId );
00990         CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupAddEvent" );
00991     }
00992 
00993     return ( PAPI_OK );
00994 }
00995 
00996 
00997 /*
00998  * This function has to set the bits needed to count different domains
00999  * In particular: PAPI_DOM_USER, PAPI_DOM_KERNEL PAPI_DOM_OTHER
01000  * By default return PAPI_EINVAL if none of those are specified
01001  * and PAPI_OK with success
01002  * PAPI_DOM_USER is only user context is counted
01003  * PAPI_DOM_KERNEL is only the Kernel/OS context is counted
01004  * PAPI_DOM_OTHER  is Exception/transient mode (like user TLB misses)
01005  * PAPI_DOM_ALL   is all of the domains
01006  */
01007 int
01008 CUDA_set_domain( hwd_control_state_t * cntrl, int domain )
01009 {
01010     int found = 0;
01011     ( void ) cntrl;
01012 
01013     if ( PAPI_DOM_USER & domain )
01014         found = 1;
01015 
01016     if ( PAPI_DOM_KERNEL & domain )
01017         found = 1;
01018 
01019     if ( PAPI_DOM_OTHER & domain )
01020         found = 1;
01021 
01022     if ( !found )
01023         return ( PAPI_EINVAL );
01024 
01025     return ( PAPI_OK );
01026 }
01027 
01028 
01029 /*
01030  *
01031  */
01032 int
01033 CUDA_reset( hwd_context_t * ctx, hwd_control_state_t * ctrl )
01034 {
01035     ( void ) ctx;
01036     CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl;
01037     CUptiResult cuptiErr = CUPTI_SUCCESS;
01038 
01039     /* Resets all events in the CuPTI eventGroup to zero */
01040     cuptiErr = (*cuptiEventGroupResetAllEventsPtr)( CUDA_ctrl->eventGroup );
01041     CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupResetAllEvents" );
01042 
01043     return ( PAPI_OK );
01044 }
01045 
01046 
01047 /*
01048  * Disable and Destoy the CUDA eventGroup */
01049 int
01050 CUDA_cleanup_eventset( hwd_control_state_t * ctrl )
01051 {
01052     ( void ) ctrl;
01053     
01054     // TODO: after cleanup_eventset() which destroys the eventset, update_control_state()
01055     // is called, which operates on the already destroyed eventset. Bad!
01056 #if 0
01057     CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl;
01058     CUptiResult cuptiErr = CUPTI_SUCCESS;
01059 
01060     /* Disable the CUDA eventGroup;
01061        it also frees the perfmon hardware on the GPU */
01062     cuptiErr = (*cuptiEventGroupDisablePtr)( CUDA_ctrl->eventGroup );
01063     CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupDisable" );
01064 
01065     /* Call the CuPTI cleaning function before leaving */
01066     cuptiErr = (*cuptiEventGroupDestroyPtr)( CUDA_ctrl->eventGroup );
01067     CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupDestroy" );
01068 #endif
01069     return ( PAPI_OK );
01070 }
01071 
01072 
01073 /*
01074  * Native Event functions
01075  */
01076 int
01077 CUDA_ntv_enum_events( unsigned int *EventCode, int modifier )
01078 {
01079 
01080     switch ( modifier ) {
01081     case PAPI_ENUM_FIRST:
01082         *EventCode = 0;
01083 
01084         return ( PAPI_OK );
01085         break;
01086 
01087     case PAPI_ENUM_EVENTS:
01088     {
01089         int index = *EventCode;
01090 
01091         if ( index < NUM_EVENTS - 1 ) {
01092             *EventCode = *EventCode + 1;
01093             return ( PAPI_OK );
01094         } else
01095             return ( PAPI_ENOEVNT );
01096 
01097         break;
01098     }
01099     default:
01100         return ( PAPI_EINVAL );
01101     }
01102     return ( PAPI_EINVAL );
01103 }
01104 
01105 
01106 /*
01107  *
01108  */
01109 int
01110 CUDA_ntv_code_to_name( unsigned int EventCode, char *name, int len )
01111 {
01112     int index = EventCode;
01113 
01114     strncpy( name, cuda_native_table[index].name, len );
01115     return ( PAPI_OK );
01116 }
01117 
01118 
01119 /*
01120  *
01121  */
01122 int
01123 CUDA_ntv_code_to_descr( unsigned int EventCode, char *name, int len )
01124 {
01125     int index = EventCode;
01126 
01127     strncpy( name, cuda_native_table[index].description, len );
01128     return ( PAPI_OK );
01129 }
01130 
01131 
01132 /*
01133  *
01134  */
01135 int
01136 CUDA_ntv_code_to_bits( unsigned int EventCode, hwd_register_t * bits )
01137 {
01138     int index = EventCode;
01139 
01140     memcpy( ( CUDA_register_t * ) bits,
01141             &( cuda_native_table[index].resources ),
01142             sizeof ( CUDA_register_t ) );
01143 
01144     return ( PAPI_OK );
01145 }
01146 
01147 
01148 /*
01149  *
01150  */
/* Component vector: the table of metadata, structure sizes and function
 * pointers through which the PAPI framework drives this CUDA component. */
papi_vector_t _cuda_vector = {
    .cmp_info = {
                 /* default component information (unspecified values are initialized to 0) */
                 .name = "cuda",
                 .short_name = "cuda",
                 .version = "5.0",
                 .description = "CuPTI provides the API for monitoring NVIDIA GPU hardware events",
                 .num_mpx_cntrs = CUDA_MAX_COUNTERS,
                 .num_cntrs = CUDA_MAX_COUNTERS,
                 .default_domain = PAPI_DOM_USER,
                 .default_granularity = PAPI_GRN_THR,
                 .available_granularities = PAPI_GRN_THR,
                 .hardware_intr_sig = PAPI_INT_SIGNAL,

                 /* component specific cmp_info initializations */
                 .fast_real_timer = 0,
                 .fast_virtual_timer = 0,
                 .attach = 0,
                 .attach_must_ptrace = 0,
                 .available_domains = PAPI_DOM_USER | PAPI_DOM_KERNEL,
                 }
    ,

    /* sizes of framework-opaque component-private structures */
    .size = {
             .context = sizeof ( CUDA_context_t ),
             .control_state = sizeof ( CUDA_control_state_t ),
             .reg_value = sizeof ( CUDA_register_t ),
             .reg_alloc = sizeof ( CUDA_reg_alloc_t ),
             }
    ,
    /* function pointers in this component */
    .init_thread = CUDA_init_thread,
    .init_component = CUDA_init_component,
    .init_control_state = CUDA_init_control_state,
    .start = CUDA_start,
    .stop = CUDA_stop,
    .read = CUDA_read,
    .shutdown_component = CUDA_shutdown_component,
    .shutdown_thread = CUDA_shutdown_thread,
    .cleanup_eventset = CUDA_cleanup_eventset,
    .ctl = CUDA_ctl,
    .update_control_state = CUDA_update_control_state,
    .set_domain = CUDA_set_domain,
    .reset = CUDA_reset,

    /* native event introspection */
    .ntv_enum_events = CUDA_ntv_enum_events,
    .ntv_code_to_name = CUDA_ntv_code_to_name,
    .ntv_code_to_descr = CUDA_ntv_code_to_descr,
    .ntv_code_to_bits = CUDA_ntv_code_to_bits,
};
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Defines