PAPI  5.0.1.0
linux-cuda.c
Go to the documentation of this file.
00001 /****************************/
00002 /* THIS IS OPEN SOURCE CODE */
00003 /****************************/
00004 
00018 #include "papi.h"
00019 #include "papi_internal.h"
00020 #include "papi_vector.h"
00021 #include "papi_memory.h"
00022 #include "linux-cuda.h"
00023 
/* Tentative definition of this component's PAPI vector.  The full
   initializer (cmp_info, structure sizes and function pointers) is given at
   the end of this file; the name is introduced here so that functions defined
   in between (e.g. createNativeEvents) can refer to _cuda_vector.cmp_info. */
papi_vector_t _cuda_vector;
00025 
00026 
00027 /******************************************************************************
00028  ********  BEGIN FUNCTIONS USED INTERNALLY SPECIFIC TO THIS COMPONENT *********
00029  *****************************************************************************/
00030 /*
00031  * Specify device(s): Counts number of cuda events available in this system
00032  */
00033 static int
00034 detectDevice( void )
00035 {
00036     CUresult err;
00037     int skipDevice = 0;
00038     int id;
00039     char deviceName_tmp[PAPI_MIN_STR_LEN] = "init";
00040 
00041     totalEventCount = 0;
00042 
00043     /* CUDA initialization  */
00044     err = cuInit( 0 );
00045     if ( err != CUDA_SUCCESS ) 
00046         return ( PAPI_ENOSUPP );
00047 
00048     /* How many gpgpu devices do we have? */
00049     err = cuDeviceGetCount( &deviceCount );
00050     CHECK_CU_ERROR( err, "cuDeviceGetCount" );
00051     if ( deviceCount == 0 )
00052         return ( PAPI_ENOSUPP );
00053 
00054     /* allocate memory for device data table */
00055     device = ( DeviceData_t * ) malloc( sizeof ( DeviceData_t ) * deviceCount );
00056     if ( device == NULL ) {
00057         perror( "malloc(): Failed to allocate memory to CUDA device table" );
00058         return ( PAPI_ENOSUPP );
00059     }
00060 
00061     /* What are the devices? Get Name and # of domains per device */
00062     for ( id = 0; id < deviceCount; id++ ) {
00063         err = cuDeviceGet( &device[id].dev, id );
00064         CHECK_CU_ERROR( err, "cuDeviceGet" );
00065 
00066         err =
00067             cuDeviceGetName( device[id].name, PAPI_MIN_STR_LEN,
00068                              device[id].dev );
00069         CHECK_CU_ERROR( err, "cuDeviceGetName" );
00070 
00071         /* Skip device if there are multiple of the same type 
00072            and if it has been already added to the list */
00073         if ( 0 == strcmp( deviceName_tmp, device[id].name ) ) {
00074             skipDevice++;
00075             continue;
00076         }
00077 
00078         strcpy( deviceName_tmp, device[id].name );
00079 
00080         /* enumerate the domains on the device */
00081         if ( 0 != enumEventDomains( device[id].dev, id ) )
00082             return ( PAPI_ENOSUPP );
00083     }
00084 
00085     deviceCount = deviceCount - skipDevice;
00086 
00087     /* return number of events provided via CuPTI */
00088     return totalEventCount;
00089 }
00090 
00091 
00092 /*
00093  * Detect supported domains for specified device
00094  */
00095 static int
00096 enumEventDomains( CUdevice dev, int deviceId )
00097 {
00098     CUptiResult err = CUPTI_SUCCESS;
00099     CUpti_EventDomainID *domainId = NULL;
00100     uint32_t id = 0;
00101     size_t size = 0;
00102 
00103     device[deviceId].domainCount = 0;
00104 
00105     /* get number of domains for device dev */
00106     err = cuptiDeviceGetNumEventDomains( dev, &device[deviceId].domainCount );
00107     CHECK_CUPTI_ERROR( err, "cuptiDeviceGetNumEventDomains" );
00108 
00109     if ( device[deviceId].domainCount == 0 ) {
00110         printf( "No domain is exposed by dev = %d\n", dev );
00111         return -1;
00112     }
00113 
00114     /* CuPTI domain struct */
00115     size = sizeof ( CUpti_EventDomainID ) * device[deviceId].domainCount;
00116     domainId = ( CUpti_EventDomainID * ) malloc( size );
00117     if ( domainId == NULL ) {
00118         perror( "malloc(): Failed to allocate memory to CuPTI domain ID" );
00119         return -1;
00120     }
00121     memset( domainId, 0, size );
00122 
00123     /* PAPI domain struct */
00124     device[deviceId].domain =
00125         ( DomainData_t * ) malloc( sizeof ( DomainData_t ) *
00126                                    device[deviceId].domainCount );
00127     if ( device[deviceId].domain == NULL ) {
00128         perror( "malloc(): Failed to allocate memory to PAPI domain struct" );
00129         return -1;
00130     }
00131 
00132     /* Enumerates the event domains for a device dev */
00133     err = cuptiDeviceEnumEventDomains( dev, &size, domainId );
00134     CHECK_CUPTI_ERROR( err, "cuptiDeviceEnumEventDomains" );
00135 
00136     /* enum domains */
00137     for ( id = 0; id < device[deviceId].domainCount; id++ ) {
00138         device[deviceId].domain[id].domainId = domainId[id];
00139 
00140         /* query domain name */
00141         size = PAPI_MIN_STR_LEN;
00142 #ifdef CUDA_4_0
00143         err = cuptiEventDomainGetAttribute( dev,
00144                                            device[deviceId].domain[id].
00145                                            domainId,
00146                                            CUPTI_EVENT_DOMAIN_ATTR_NAME, &size,
00147                                            ( void * ) device[deviceId].
00148                                            domain[id].name );
00149         CHECK_CUPTI_ERROR( err, "cuptiEventDomainGetAttribute" );
00150         
00151         /* query num of events avaialble in the domain */
00152         size = sizeof ( device[deviceId].domain[id].eventCount );
00153         err = cuptiEventDomainGetAttribute( dev,
00154                                            device[deviceId].domain[id].
00155                                            domainId,
00156                                            CUPTI_EVENT_DOMAIN_MAX_EVENTS,
00157                                            &size,
00158                                            ( void * ) &device[deviceId].
00159                                            domain[id].eventCount );
00160         CHECK_CUPTI_ERROR( err, "cuptiEventDomainGetAttribute" );
00161         
00162         /* enumerate the events for the domain[id] on the device dev */
00163         if ( 0 != enumEvents( dev, deviceId, id ) )
00164             return -1;
00165 #else
00166         err = cuptiDeviceGetEventDomainAttribute( dev,
00167                                                   device[deviceId].domain[id].domainId,
00168                                                   CUPTI_EVENT_DOMAIN_ATTR_NAME, &size,
00169                                                   ( void * ) device[deviceId].domain[id].name );
00170         CHECK_CUPTI_ERROR( err, "cuptiDeviceGetEventDomainAttribute" );
00171 
00172         /* query num of events avaialble in the domain */
00173         err = cuptiEventDomainGetNumEvents( device[deviceId].domain[id].domainId,
00174                                             &device[deviceId].domain[id].eventCount );
00175         CHECK_CUPTI_ERROR( err, "cuptiEventDomainGetNumEvents" );
00176 
00177         /* enumerate the events for the domain[id] on the device deviceId */
00178         if ( 0 != enumEvents( deviceId, id ) )
00179             return -1;
00180 #endif
00181     }
00182 
00183     totalDomainCount += device[deviceId].domainCount;
00184     free( domainId );
00185     return 0;
00186 }
00187 
00188 
00189 /*
00190  * Detect supported events for specified device domain
00191  */
00192 #ifdef CUDA_4_0
00193 static int
00194 enumEvents( CUdevice dev, int deviceId, int domainId )
00195 #else
00196 static int
00197 enumEvents( int deviceId, int domainId )
00198 #endif
00199 {
00200     CUptiResult err = CUPTI_SUCCESS;
00201     CUpti_EventID *eventId = NULL;
00202     size_t size = 0;
00203     uint32_t id = 0;
00204 
00205     /* CuPTI event struct */
00206     size =
00207         sizeof ( CUpti_EventID ) * device[deviceId].domain[domainId].eventCount;
00208     eventId = ( CUpti_EventID * ) malloc( size );
00209     if ( eventId == NULL ) {
00210         perror( "malloc(): Failed to allocate memory to CuPTI event ID" );
00211         return -1;
00212     }
00213     memset( eventId, 0, size );
00214 
00215     /* PAPI event struct */
00216     device[deviceId].domain[domainId].event =
00217         ( EventData_t * ) malloc( sizeof ( EventData_t ) *
00218                                   device[deviceId].domain[domainId].
00219                                   eventCount );
00220     if ( device[deviceId].domain[domainId].event == NULL ) {
00221         perror( "malloc(): Failed to allocate memory to PAPI event struct" );
00222         return -1;
00223     }
00224 
00225     /* enumerate the events for the domain[domainId] on the device[deviceId] */
00226 #ifdef CUDA_4_0
00227     err =
00228     cuptiEventDomainEnumEvents( dev,
00229                                ( CUpti_EventDomainID ) device[deviceId].
00230                                domain[domainId].domainId, &size, eventId );
00231 #else
00232     err =
00233         cuptiEventDomainEnumEvents( ( CUpti_EventDomainID ) device[deviceId].
00234                                     domain[domainId].domainId, &size, eventId );
00235 #endif
00236     CHECK_CUPTI_ERROR( err, "cuptiEventDomainEnumEvents" );
00237 
00238     /* query event info */
00239     for ( id = 0; id < device[deviceId].domain[domainId].eventCount; id++ ) {
00240         device[deviceId].domain[domainId].event[id].eventId = eventId[id];
00241 
00242         /* query event name */
00243         size = PAPI_MIN_STR_LEN;
00244 #ifdef CUDA_4_0
00245         err = cuptiEventGetAttribute( dev,
00246                                      device[deviceId].domain[domainId].
00247                                      event[id].eventId, CUPTI_EVENT_ATTR_NAME,
00248                                      &size,
00249                                      ( uint8_t * ) device[deviceId].
00250                                      domain[domainId].event[id].name );     
00251 #else
00252         err = cuptiEventGetAttribute( device[deviceId].domain[domainId].
00253                                       event[id].eventId, CUPTI_EVENT_ATTR_NAME,
00254                                       &size,
00255                                       ( uint8_t * ) device[deviceId].
00256                                       domain[domainId].event[id].name );
00257 #endif
00258         CHECK_CUPTI_ERROR( err, "cuptiEventGetAttribute" );
00259 
00260         /* query event description */
00261         size = PAPI_2MAX_STR_LEN;
00262 #ifdef CUDA_4_0
00263         err = cuptiEventGetAttribute( dev,
00264                                      device[deviceId].domain[domainId].
00265                                      event[id].eventId,
00266                                      CUPTI_EVENT_ATTR_SHORT_DESCRIPTION, &size,
00267                                      ( uint8_t * ) device[deviceId].
00268                                      domain[domainId].event[id].desc );     
00269 #else
00270         err = cuptiEventGetAttribute( device[deviceId].domain[domainId].
00271                                       event[id].eventId,
00272                                       CUPTI_EVENT_ATTR_SHORT_DESCRIPTION, &size,
00273                                       ( uint8_t * ) device[deviceId].
00274                                       domain[domainId].event[id].desc );
00275 #endif
00276         CHECK_CUPTI_ERROR( err, "cuptiEventGetAttribute" );
00277     }
00278 
00279     totalEventCount += device[deviceId].domain[domainId].eventCount;
00280     free( eventId );
00281     return 0;
00282 }
00283 
00284 
00285 /*
00286  * Create the native events for specified domain and device
00287  */
00288 static int
00289 createNativeEvents( void )
00290 {
00291     int deviceId, id = 0;
00292     uint32_t domainId, eventId;
00293     int cuptiDomainId;
00294     int i;
00295     int devNameLen;
00296 
00297     /* component name and description */
00298     strcpy( _cuda_vector.cmp_info.short_name, "CUDA" );
00299     strcpy( _cuda_vector.cmp_info.description,
00300             "CuPTI provides the API for monitoring CUDA hardware events" );
00301 
00302     /* create events for every GPU device and every domain per device  */
00303     for ( deviceId = 0; deviceId < deviceCount; deviceId++ ) {
00304         /* for the event names, replace blanks in the device name with underscores */
00305         devNameLen = strlen( device[deviceId].name );
00306         for ( i = 0; i < devNameLen; i++ )
00307             if ( device[deviceId].name[i] == ' ' )
00308                 device[deviceId].name[i] = '_';
00309 
00310         for ( domainId = 0; domainId < device[deviceId].domainCount;
00311               domainId++ ) {
00312             cuptiDomainId = device[deviceId].domain[domainId].domainId;
00313 
00314             for ( eventId = 0;
00315                   eventId < device[deviceId].domain[domainId].eventCount;
00316                   eventId++ ) {
00317                 /* Save native event data */
00318                 sprintf( cuda_native_table[id].name,
00319                          "%s:%s:%s",
00320                          device[deviceId].name,
00321                          device[deviceId].domain[domainId].name,
00322                          device[deviceId].domain[domainId].event[eventId].
00323                          name );
00324 
00325                 strncpy( cuda_native_table[id].description,
00326                          device[deviceId].domain[domainId].event[eventId].desc,
00327                          PAPI_2MAX_STR_LEN );
00328 
00329                 /* The selector has to be !=0 . Starts with 1 */
00330                 cuda_native_table[id].resources.selector = id + 1;
00331 
00332                 /* store event ID */
00333                 cuda_native_table[id].resources.eventId =
00334                     device[deviceId].domain[domainId].event[eventId].eventId;
00335 
00336                 /* increment the table index counter */
00337                 id++;
00338             }
00339         }
00340     }
00341 
00342     /* Return the number of events created */
00343     return id;
00344 }
00345 
00346 
00347 /*
00348  * Returns all event values from the CuPTI eventGroup 
00349  */
00350 static int
00351 getEventValue( long long *counts, CUpti_EventGroup eventGroup, AddedEvents_t addedEvents )
00352 {
00353     CUptiResult cuptiErr = CUPTI_SUCCESS;
00354     size_t events_read, bufferSizeBytes, arraySizeBytes, i;
00355     uint64_t *counterDataBuffer;
00356     CUpti_EventID *eventIDArray;
00357     int j;
00358 
00359     bufferSizeBytes = addedEvents.count * sizeof ( uint64_t );
00360     counterDataBuffer = ( uint64_t * ) malloc( bufferSizeBytes );
00361 
00362     arraySizeBytes = addedEvents.count * sizeof ( CUpti_EventID );
00363     eventIDArray = ( CUpti_EventID * ) malloc( arraySizeBytes );
00364 
00365     /* read counter data for the specified event from the CuPTI eventGroup */
00366     cuptiErr = cuptiEventGroupReadAllEvents( eventGroup,
00367                                              CUPTI_EVENT_READ_FLAG_NONE,
00368                                              &bufferSizeBytes,
00369                                              counterDataBuffer, &arraySizeBytes,
00370                                              eventIDArray, &events_read );
00371     CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupReadAllEvents" );
00372 
00373     if ( events_read != ( size_t ) addedEvents.count )
00374         return -1;
00375 
00376     /* Since there is no guarantee that returned counter values are in the same 
00377        order as the counters in the PAPI addedEvents.list, we need to map the
00378        CUpti_EventID to PAPI event ID values.
00379        According to CuPTI doc: counter return values of counterDataBuffer 
00380        correspond to the return event IDs in eventIDArray */
00381     for ( i = 0; i < events_read; i++ )
00382         for ( j = 0; j < addedEvents.count; j++ )
00383             if ( cuda_native_table[addedEvents.list[j]].resources.eventId ==
00384                  eventIDArray[i] )
00385                 // since cuptiEventGroupReadAllEvents() resets counter values to 0;
00386                 // we have to accumulate ourselves 
00387                 counts[addedEvents.list[j]] = counts[addedEvents.list[j]] + counterDataBuffer[i];
00388 
00389     free( counterDataBuffer );
00390     free( eventIDArray );
00391     return 0;
00392 }
00393 
00394 
00395 /*****************************************************************************
00396  *******************  BEGIN PAPI's COMPONENT REQUIRED FUNCTIONS  *************
00397  *****************************************************************************/
00398 
00399 /*
00400  * This is called whenever a thread is initialized
00401  */
00402 int
00403 CUDA_init_thread( hwd_context_t * ctx )
00404 {
00405     CUDA_context_t * CUDA_ctx = ( CUDA_context_t * ) ctx;
00406     /* Initialize number of events in EventSet for update_control_state() */
00407     CUDA_ctx->state.old_count = 0;
00408     
00409     return PAPI_OK;
00410 }
00411 
00412 
00413 /* Initialize hardware counters, setup the function vector table
00414  * and get hardware information, this routine is called when the 
00415  * PAPI process is initialized (IE PAPI_library_init)
00416  *
00417  * NOTE: only called by main thread (not by every thread) !!!
00418  *
00419  * Starting in CUDA 4.0, multiple CPU threads can access the same CUDA context.
00420  * This is a much easier programming model than pre-4.0 as threads - using the 
00421  * same context - can share memory, data, etc. 
00422  * It's possible to create a different context for each thread, but then we are
00423  * likely running into a limitation that only one context can be profiled at a time.
00424  * ==> and we don't want this. That's why CUDA context creation is done in 
00425  * CUDA_init_component() (called only by main thread) rather than CUDA_init() 
00426  * or CUDA_init_control_state() (both called by each thread).
00427  */
00428 int
00429 CUDA_init_component(  )
00430 {
00431     CUresult cuErr = CUDA_SUCCESS;
00432     
00433     /* Create dynamic event table */
00434     NUM_EVENTS = detectDevice(  );
00435 
00436     /* TODO: works only for one device right now; 
00437      need to find out if user can use 2 or more devices at same time */
00438     
00439     /* want create a CUDA context for either the default device or
00440      the device specified with cudaSetDevice() in user code */
00441     if ( CUDA_SUCCESS != cudaGetDevice( &currentDeviceID ) )
00442         return ( PAPI_ENOSUPP );
00443     
00444     if ( getenv( "PAPI_VERBOSE" ) ) {
00445         printf( "DEVICE USED: %s (%d)\n", device[currentDeviceID].name,
00446                currentDeviceID );
00447     }
00448     
00449     /* get the CUDA context from the calling CPU thread */
00450     cuErr = cuCtxGetCurrent( &cuCtx );
00451     
00452     /* if no CUDA context is bound to the calling CPU thread yet, create one */
00453     if ( cuErr != CUDA_SUCCESS || cuCtx == NULL ) {
00454         cuErr = cuCtxCreate( &cuCtx, 0, device[currentDeviceID].dev );
00455         CHECK_CU_ERROR( cuErr, "cuCtxCreate" );
00456     }
00457     
00458     /* cuCtxGetCurrent() can return a non-null context that is not valid 
00459        because the context has not yet been initialized.
00460        Here is a workaround: 
00461        cudaFree(NULL) forces the context to be initialized
00462        if cudaFree(NULL) returns success then we are able to use the context in subsequent calls
00463        if cudaFree(NULL) returns an error (or subsequent cupti* calls) then the context is not usable,
00464        and will never be useable */
00465     if ( CUDA_SUCCESS != cudaFree( NULL ) )
00466         return ( PAPI_ENOSUPP );
00467         
00468     /* Create dynamic event table */
00469     cuda_native_table = ( CUDA_native_event_entry_t * )
00470         malloc( sizeof ( CUDA_native_event_entry_t ) * NUM_EVENTS );
00471     if ( cuda_native_table == NULL ) {
00472         perror( "malloc(): Failed to allocate memory to events table" );
00473         return ( PAPI_ENOSUPP );
00474     }
00475 
00476     if ( NUM_EVENTS != createNativeEvents(  ) ) 
00477         return ( PAPI_ENOSUPP );
00478     
00479     return ( PAPI_OK );
00480 }
00481 
00482 
00483 /*
00484  * Control of counters (Reading/Writing/Starting/Stopping/Setup)
00485  * functions
00486  */
00487 int
00488 CUDA_init_control_state( hwd_control_state_t * ctrl )
00489 {
00490     CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl;
00491     CUptiResult cuptiErr = CUPTI_SUCCESS;
00492     int i;
00493 
00494     /* allocate memory for the list of events that are added to the CuPTI eventGroup */
00495     CUDA_ctrl->addedEvents.list = malloc( sizeof ( int ) * NUM_EVENTS );
00496     if ( CUDA_ctrl->addedEvents.list == NULL ) {
00497         perror
00498         ( "malloc(): Failed to allocate memory to table of events that are added to CuPTI eventGroup" );
00499         return ( PAPI_ENOSUPP );
00500     }
00501     
00502     /* initialize the event list */
00503     for ( i = 0; i < NUM_EVENTS; i++ )
00504         CUDA_ctrl->addedEvents.list[i] = 0;
00505 
00506     
00507     
00508     cuptiErr = cuptiEventGroupCreate( cuCtx, &CUDA_ctrl->eventGroup, 0 );
00509     CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupCreate" );
00510     
00511     return PAPI_OK;
00512 }
00513 
00514 
00515 /*
00516  *
00517  */
00518 int
00519 CUDA_start( hwd_context_t * ctx, hwd_control_state_t * ctrl )
00520 {
00521     ( void ) ctx;
00522     int i;
00523     CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl;
00524     CUptiResult cuptiErr = CUPTI_SUCCESS;
00525     
00526     // reset all event values to 0
00527     for ( i = 0; i < NUM_EVENTS; i++ )
00528         CUDA_ctrl->counts[i] = 0;
00529 
00530     cuptiErr = cuptiEventGroupEnable( CUDA_ctrl->eventGroup );
00531     CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupEnable" );
00532 
00533     /* Resets all events in the CuPTI eventGroup to zero */
00534     cuptiErr = cuptiEventGroupResetAllEvents( CUDA_ctrl->eventGroup );
00535     CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupResetAllEvents" );
00536 
00537     return ( PAPI_OK );
00538 }
00539 
00540 
00541 /*
00542  *
00543  */
00544 int
00545 CUDA_stop( hwd_context_t * ctx, hwd_control_state_t * ctrl )
00546 {
00547     ( void ) ctx;
00548     ( void ) ctrl;
00549 
00550     return ( PAPI_OK );
00551 }
00552 
00553 
00554 /*
00555  *
00556  */
00557 int
00558 CUDA_read( hwd_context_t * ctx, hwd_control_state_t * ctrl,
00559            long_long ** events, int flags )
00560 {
00561     ( void ) ctx;
00562     ( void ) flags;
00563     CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl;
00564 
00565 
00566     if ( 0 != getEventValue( CUDA_ctrl->counts, CUDA_ctrl->eventGroup, CUDA_ctrl->addedEvents ) )
00567         return ( PAPI_ENOSUPP );
00568 
00569     *events = CUDA_ctrl->counts;
00570 
00571     return ( PAPI_OK );
00572 }
00573 
00574 /* 
00575  *
00576  */
00577 int
00578 CUDA_shutdown_thread( hwd_context_t *ctx )
00579 {
00580     CUDA_context_t *CUDA_ctx = (CUDA_context_t*)ctx;
00581     free( CUDA_ctx->state.addedEvents.list );
00582     return (PAPI_OK);
00583 }
00584 
00585 /*
00586  *
00587  */
00588 int
00589 CUDA_shutdown_component( void )
00590 {
00591     CUresult cuErr = CUDA_SUCCESS;
00592     
00593     /* if running a threaded application, we need to make sure that 
00594        a thread doesn't free the same memory location(s) more than once */
00595     if ( CUDA_FREED == 0 ) {
00596         uint32_t j;
00597         int i;
00598         
00599         CUDA_FREED = 1;
00600 
00601         /* deallocate all the memory */
00602         for ( i = 0; i < deviceCount; i++ ) {
00603             for ( j = 0; j < device[i].domainCount; j++ )
00604                 free( device[i].domain[j].event );
00605             
00606             free( device[i].domain );
00607         }
00608 
00609         free( device );
00610         free( cuda_native_table );
00611         
00612         /* destroy floating CUDA context */
00613         cuErr = cuCtxDestroy( cuCtx );
00614         if ( cuErr != CUDA_SUCCESS )
00615             return ( PAPI_ENOSUPP );            // Not supported
00616     }
00617 
00618     
00619     return ( PAPI_OK );
00620 }
00621 
00622 
00623 /* This function sets various options in the component
00624  * The valid codes being passed in are PAPI_SET_DEFDOM,
00625  * PAPI_SET_DOMAIN, PAPI_SETDEFGRN, PAPI_SET_GRANUL and PAPI_SET_INHERIT
00626  */
00627 int
00628 CUDA_ctl( hwd_context_t * ctx, int code, _papi_int_option_t * option )
00629 {
00630     ( void ) ctx;
00631     ( void ) code;
00632     ( void ) option;
00633     return ( PAPI_OK );
00634 }
00635 
00636 
00637 //int CUDA_ntv_code_to_bits ( unsigned int EventCode, hwd_register_t * bits );
00638 
00639 
00640 /*
00641  *
00642  */
00643 int
00644 CUDA_update_control_state( hwd_control_state_t * ptr,
00645                            NativeInfo_t * native, int count,
00646                            hwd_context_t * ctx )
00647 {
00648     ( void ) ctx;
00649     CUDA_control_state_t * CUDA_ptr = ( CUDA_control_state_t * ) ptr;
00650     int index;
00651     CUptiResult cuptiErr = CUPTI_SUCCESS;
00652 
00653     cuptiErr = cuptiEventGroupDisable( CUDA_ptr->eventGroup );
00654     CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupDisable" );
00655     
00656     /* Remove or Add events */
00657     if ( CUDA_ptr->old_count > count ) {
00658         cuptiErr =
00659             cuptiEventGroupRemoveEvent( CUDA_ptr->eventGroup,
00660                                         cuda_native_table[CUDA_ptr->addedEvents.list[0]].
00661                                         resources.eventId );
00662 
00663         /* Keep track of events in EventGroup if an event is removed */
00664         CUDA_ptr->old_count = count;
00665     } else {
00666         index = native[count - 1].ni_event;
00667         native[count - 1].ni_position = index;
00668 
00669         /* store events, that have been added to the CuPTI eveentGroup 
00670            in a seperate place (addedEvents).
00671            Needed, so that we can read the values for the added events only */
00672         CUDA_ptr->addedEvents.count = count;
00673         CUDA_ptr->addedEvents.list[count - 1] = index;
00674 
00675         /* if this device name is different from the actual device the code is running on, then exit */
00676         if ( 0 != strncmp( device[currentDeviceID].name,
00677                            cuda_native_table[index].name,
00678                            strlen( device[currentDeviceID].name ) ) ) {
00679             fprintf( stderr, "Device %s is used -- BUT event %s is collected. \n ---> ERROR: Specify events for the device that is used!\n\n",
00680                   device[currentDeviceID].name, cuda_native_table[index].name );
00681             
00682             return ( PAPI_ENOSUPP );    // Not supported 
00683         }
00684 
00685         /* Add events to the CuPTI eventGroup */
00686         cuptiErr =
00687             cuptiEventGroupAddEvent( CUDA_ptr->eventGroup,
00688                                      cuda_native_table[index].resources.
00689                                      eventId );
00690         CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupAddEvent" );
00691 
00692         /* Keep track of events in EventGroup if an event is removed */
00693         CUDA_ptr->old_count = count;
00694     }
00695 
00696     return ( PAPI_OK );
00697 }
00698 
00699 
00700 /*
00701  * This function has to set the bits needed to count different domains
00702  * In particular: PAPI_DOM_USER, PAPI_DOM_KERNEL PAPI_DOM_OTHER
00703  * By default return PAPI_EINVAL if none of those are specified
00704  * and PAPI_OK with success
00705  * PAPI_DOM_USER is only user context is counted
00706  * PAPI_DOM_KERNEL is only the Kernel/OS context is counted
00707  * PAPI_DOM_OTHER  is Exception/transient mode (like user TLB misses)
00708  * PAPI_DOM_ALL   is all of the domains
00709  */
00710 int
00711 CUDA_set_domain( hwd_control_state_t * cntrl, int domain )
00712 {
00713     int found = 0;
00714     ( void ) cntrl;
00715 
00716     if ( PAPI_DOM_USER & domain )
00717         found = 1;
00718 
00719     if ( PAPI_DOM_KERNEL & domain )
00720         found = 1;
00721 
00722     if ( PAPI_DOM_OTHER & domain )
00723         found = 1;
00724 
00725     if ( !found )
00726         return ( PAPI_EINVAL );
00727 
00728     return ( PAPI_OK );
00729 }
00730 
00731 
00732 /*
00733  *
00734  */
00735 int
00736 CUDA_reset( hwd_context_t * ctx, hwd_control_state_t * ctrl )
00737 {
00738     ( void ) ctx;
00739     CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl;
00740     CUptiResult cuptiErr = CUPTI_SUCCESS;
00741 
00742     /* Resets all events in the CuPTI eventGroup to zero */
00743     cuptiErr = cuptiEventGroupResetAllEvents( CUDA_ctrl->eventGroup );
00744     CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupResetAllEvents" );
00745 
00746     return ( PAPI_OK );
00747 }
00748 
00749 
00750 /*
00751  * Disable and Destroy the CUDA eventGroup */
00752 int
00753 CUDA_cleanup_eventset( hwd_control_state_t * ctrl )
00754 {
00755     CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl;
00756     CUptiResult cuptiErr = CUPTI_SUCCESS;
00757 
00758     /* Disable the CUDA eventGroup; 
00759        it also frees the perfmon hardware on the GPU */
00760     cuptiErr = cuptiEventGroupDisable( CUDA_ctrl->eventGroup );
00761     CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupDisable" );
00762 
00763     /* Call the CuPTI cleaning function before leaving */
00764     cuptiErr = cuptiEventGroupDestroy( CUDA_ctrl->eventGroup );
00765     CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupDestroy" );
00766 
00767     return ( PAPI_OK );
00768 }
00769 
00770 
00771 /*
00772  * Native Event functions
00773  */
00774 int
00775 CUDA_ntv_enum_events( unsigned int *EventCode, int modifier )
00776 {
00777 
00778     switch ( modifier ) {
00779     case PAPI_ENUM_FIRST:
00780         *EventCode = 0;
00781 
00782         return ( PAPI_OK );
00783         break;
00784 
00785     case PAPI_ENUM_EVENTS:
00786     {
00787         int index = *EventCode;
00788 
00789         if ( index < NUM_EVENTS - 1 ) {
00790             *EventCode = *EventCode + 1;
00791             return ( PAPI_OK );
00792         } else
00793             return ( PAPI_ENOEVNT );
00794 
00795         break;
00796     }
00797     default:
00798         return ( PAPI_EINVAL );
00799     }
00800     return ( PAPI_EINVAL );
00801 }
00802 
00803 
00804 /*
00805  *
00806  */
00807 int
00808 CUDA_ntv_code_to_name( unsigned int EventCode, char *name, int len )
00809 {
00810     int index = EventCode;
00811 
00812     strncpy( name, cuda_native_table[index].name, len );
00813     return ( PAPI_OK );
00814 }
00815 
00816 
00817 /*
00818  *
00819  */
00820 int
00821 CUDA_ntv_code_to_descr( unsigned int EventCode, char *name, int len )
00822 {
00823     int index = EventCode;
00824 
00825     strncpy( name, cuda_native_table[index].description, len );
00826     return ( PAPI_OK );
00827 }
00828 
00829 
00830 /*
00831  *
00832  */
00833 int
00834 CUDA_ntv_code_to_bits( unsigned int EventCode, hwd_register_t * bits )
00835 {
00836     int index = EventCode;
00837 
00838     memcpy( ( CUDA_register_t * ) bits,
00839             &( cuda_native_table[index].resources ),
00840             sizeof ( CUDA_register_t ) );
00841 
00842     return ( PAPI_OK );
00843 }
00844 
00845 
00846 /*
00847  *
00848  */
00849 papi_vector_t _cuda_vector = {
00850     .cmp_info = {
00851                  /* default component information (unspecified values are initialized to 0) */
00852                  .name = "cuda",
00853                  .short_name = "cuda",
00854                  .version = "5.0",
00855                  .num_mpx_cntrs = CUDA_MAX_COUNTERS,
00856                  .num_cntrs = CUDA_MAX_COUNTERS,
00857                  .default_domain = PAPI_DOM_USER,
00858                  .default_granularity = PAPI_GRN_THR,
00859                  .available_granularities = PAPI_GRN_THR,
00860                  .hardware_intr_sig = PAPI_INT_SIGNAL,
00861 
00862                  /* component specific cmp_info initializations */
00863                  .fast_real_timer = 0,
00864                  .fast_virtual_timer = 0,
00865                  .attach = 0,
00866                  .attach_must_ptrace = 0,
00867                  .available_domains = PAPI_DOM_USER | PAPI_DOM_KERNEL,
00868                  }
00869     ,
00870 
00871     /* sizes of framework-opaque component-private structures */
00872     .size = {
00873              .context = sizeof ( CUDA_context_t ),
00874              .control_state = sizeof ( CUDA_control_state_t ),
00875              .reg_value = sizeof ( CUDA_register_t ),
00876              .reg_alloc = sizeof ( CUDA_reg_alloc_t ),
00877              }
00878     ,
00879     /* function pointers in this component */
00880     .init_thread = CUDA_init_thread,
00881     .init_component = CUDA_init_component,
00882     .init_control_state = CUDA_init_control_state,
00883     .start = CUDA_start,
00884     .stop = CUDA_stop,
00885     .read = CUDA_read,
00886     .shutdown_component = CUDA_shutdown_component,
00887     .shutdown_thread = CUDA_shutdown_thread,
00888     .cleanup_eventset = CUDA_cleanup_eventset,
00889     .ctl = CUDA_ctl,
00890     .update_control_state = CUDA_update_control_state,
00891     .set_domain = CUDA_set_domain,
00892     .reset = CUDA_reset,
00893 
00894     .ntv_enum_events = CUDA_ntv_enum_events,
00895     .ntv_code_to_name = CUDA_ntv_code_to_name,
00896     .ntv_code_to_descr = CUDA_ntv_code_to_descr,
00897     .ntv_code_to_bits = CUDA_ntv_code_to_bits,
00898 };
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Defines