/* PAPI 5.0.1.0 — CUDA component */
00001 /****************************/ 00002 /* THIS IS OPEN SOURCE CODE */ 00003 /****************************/ 00004 00018 #include "papi.h" 00019 #include "papi_internal.h" 00020 #include "papi_vector.h" 00021 #include "papi_memory.h" 00022 #include "linux-cuda.h" 00023 00024 papi_vector_t _cuda_vector; 00025 00026 00027 /****************************************************************************** 00028 ******** BEGIN FUNCTIONS USED INTERNALLY SPECIFIC TO THIS COMPONENT ********* 00029 *****************************************************************************/ 00030 /* 00031 * Specify device(s): Counts number of cuda events available in this system 00032 */ 00033 static int 00034 detectDevice( void ) 00035 { 00036 CUresult err; 00037 int skipDevice = 0; 00038 int id; 00039 char deviceName_tmp[PAPI_MIN_STR_LEN] = "init"; 00040 00041 totalEventCount = 0; 00042 00043 /* CUDA initialization */ 00044 err = cuInit( 0 ); 00045 if ( err != CUDA_SUCCESS ) 00046 return ( PAPI_ENOSUPP ); 00047 00048 /* How many gpgpu devices do we have? */ 00049 err = cuDeviceGetCount( &deviceCount ); 00050 CHECK_CU_ERROR( err, "cuDeviceGetCount" ); 00051 if ( deviceCount == 0 ) 00052 return ( PAPI_ENOSUPP ); 00053 00054 /* allocate memory for device data table */ 00055 device = ( DeviceData_t * ) malloc( sizeof ( DeviceData_t ) * deviceCount ); 00056 if ( device == NULL ) { 00057 perror( "malloc(): Failed to allocate memory to CUDA device table" ); 00058 return ( PAPI_ENOSUPP ); 00059 } 00060 00061 /* What are the devices? 
Get Name and # of domains per device */ 00062 for ( id = 0; id < deviceCount; id++ ) { 00063 err = cuDeviceGet( &device[id].dev, id ); 00064 CHECK_CU_ERROR( err, "cuDeviceGet" ); 00065 00066 err = 00067 cuDeviceGetName( device[id].name, PAPI_MIN_STR_LEN, 00068 device[id].dev ); 00069 CHECK_CU_ERROR( err, "cuDeviceGetName" ); 00070 00071 /* Skip device if there are multiple of the same type 00072 and if it has been already added to the list */ 00073 if ( 0 == strcmp( deviceName_tmp, device[id].name ) ) { 00074 skipDevice++; 00075 continue; 00076 } 00077 00078 strcpy( deviceName_tmp, device[id].name ); 00079 00080 /* enumerate the domains on the device */ 00081 if ( 0 != enumEventDomains( device[id].dev, id ) ) 00082 return ( PAPI_ENOSUPP ); 00083 } 00084 00085 deviceCount = deviceCount - skipDevice; 00086 00087 /* return number of events provided via CuPTI */ 00088 return totalEventCount; 00089 } 00090 00091 00092 /* 00093 * Detect supported domains for specified device 00094 */ 00095 static int 00096 enumEventDomains( CUdevice dev, int deviceId ) 00097 { 00098 CUptiResult err = CUPTI_SUCCESS; 00099 CUpti_EventDomainID *domainId = NULL; 00100 uint32_t id = 0; 00101 size_t size = 0; 00102 00103 device[deviceId].domainCount = 0; 00104 00105 /* get number of domains for device dev */ 00106 err = cuptiDeviceGetNumEventDomains( dev, &device[deviceId].domainCount ); 00107 CHECK_CUPTI_ERROR( err, "cuptiDeviceGetNumEventDomains" ); 00108 00109 if ( device[deviceId].domainCount == 0 ) { 00110 printf( "No domain is exposed by dev = %d\n", dev ); 00111 return -1; 00112 } 00113 00114 /* CuPTI domain struct */ 00115 size = sizeof ( CUpti_EventDomainID ) * device[deviceId].domainCount; 00116 domainId = ( CUpti_EventDomainID * ) malloc( size ); 00117 if ( domainId == NULL ) { 00118 perror( "malloc(): Failed to allocate memory to CuPTI domain ID" ); 00119 return -1; 00120 } 00121 memset( domainId, 0, size ); 00122 00123 /* PAPI domain struct */ 00124 device[deviceId].domain = 00125 
( DomainData_t * ) malloc( sizeof ( DomainData_t ) * 00126 device[deviceId].domainCount ); 00127 if ( device[deviceId].domain == NULL ) { 00128 perror( "malloc(): Failed to allocate memory to PAPI domain struct" ); 00129 return -1; 00130 } 00131 00132 /* Enumerates the event domains for a device dev */ 00133 err = cuptiDeviceEnumEventDomains( dev, &size, domainId ); 00134 CHECK_CUPTI_ERROR( err, "cuptiDeviceEnumEventDomains" ); 00135 00136 /* enum domains */ 00137 for ( id = 0; id < device[deviceId].domainCount; id++ ) { 00138 device[deviceId].domain[id].domainId = domainId[id]; 00139 00140 /* query domain name */ 00141 size = PAPI_MIN_STR_LEN; 00142 #ifdef CUDA_4_0 00143 err = cuptiEventDomainGetAttribute( dev, 00144 device[deviceId].domain[id]. 00145 domainId, 00146 CUPTI_EVENT_DOMAIN_ATTR_NAME, &size, 00147 ( void * ) device[deviceId]. 00148 domain[id].name ); 00149 CHECK_CUPTI_ERROR( err, "cuptiEventDomainGetAttribute" ); 00150 00151 /* query num of events avaialble in the domain */ 00152 size = sizeof ( device[deviceId].domain[id].eventCount ); 00153 err = cuptiEventDomainGetAttribute( dev, 00154 device[deviceId].domain[id]. 00155 domainId, 00156 CUPTI_EVENT_DOMAIN_MAX_EVENTS, 00157 &size, 00158 ( void * ) &device[deviceId]. 
00159 domain[id].eventCount ); 00160 CHECK_CUPTI_ERROR( err, "cuptiEventDomainGetAttribute" ); 00161 00162 /* enumerate the events for the domain[id] on the device dev */ 00163 if ( 0 != enumEvents( dev, deviceId, id ) ) 00164 return -1; 00165 #else 00166 err = cuptiDeviceGetEventDomainAttribute( dev, 00167 device[deviceId].domain[id].domainId, 00168 CUPTI_EVENT_DOMAIN_ATTR_NAME, &size, 00169 ( void * ) device[deviceId].domain[id].name ); 00170 CHECK_CUPTI_ERROR( err, "cuptiDeviceGetEventDomainAttribute" ); 00171 00172 /* query num of events avaialble in the domain */ 00173 err = cuptiEventDomainGetNumEvents( device[deviceId].domain[id].domainId, 00174 &device[deviceId].domain[id].eventCount ); 00175 CHECK_CUPTI_ERROR( err, "cuptiEventDomainGetNumEvents" ); 00176 00177 /* enumerate the events for the domain[id] on the device deviceId */ 00178 if ( 0 != enumEvents( deviceId, id ) ) 00179 return -1; 00180 #endif 00181 } 00182 00183 totalDomainCount += device[deviceId].domainCount; 00184 free( domainId ); 00185 return 0; 00186 } 00187 00188 00189 /* 00190 * Detect supported events for specified device domain 00191 */ 00192 #ifdef CUDA_4_0 00193 static int 00194 enumEvents( CUdevice dev, int deviceId, int domainId ) 00195 #else 00196 static int 00197 enumEvents( int deviceId, int domainId ) 00198 #endif 00199 { 00200 CUptiResult err = CUPTI_SUCCESS; 00201 CUpti_EventID *eventId = NULL; 00202 size_t size = 0; 00203 uint32_t id = 0; 00204 00205 /* CuPTI event struct */ 00206 size = 00207 sizeof ( CUpti_EventID ) * device[deviceId].domain[domainId].eventCount; 00208 eventId = ( CUpti_EventID * ) malloc( size ); 00209 if ( eventId == NULL ) { 00210 perror( "malloc(): Failed to allocate memory to CuPTI event ID" ); 00211 return -1; 00212 } 00213 memset( eventId, 0, size ); 00214 00215 /* PAPI event struct */ 00216 device[deviceId].domain[domainId].event = 00217 ( EventData_t * ) malloc( sizeof ( EventData_t ) * 00218 device[deviceId].domain[domainId]. 
00219 eventCount ); 00220 if ( device[deviceId].domain[domainId].event == NULL ) { 00221 perror( "malloc(): Failed to allocate memory to PAPI event struct" ); 00222 return -1; 00223 } 00224 00225 /* enumerate the events for the domain[domainId] on the device[deviceId] */ 00226 #ifdef CUDA_4_0 00227 err = 00228 cuptiEventDomainEnumEvents( dev, 00229 ( CUpti_EventDomainID ) device[deviceId]. 00230 domain[domainId].domainId, &size, eventId ); 00231 #else 00232 err = 00233 cuptiEventDomainEnumEvents( ( CUpti_EventDomainID ) device[deviceId]. 00234 domain[domainId].domainId, &size, eventId ); 00235 #endif 00236 CHECK_CUPTI_ERROR( err, "cuptiEventDomainEnumEvents" ); 00237 00238 /* query event info */ 00239 for ( id = 0; id < device[deviceId].domain[domainId].eventCount; id++ ) { 00240 device[deviceId].domain[domainId].event[id].eventId = eventId[id]; 00241 00242 /* query event name */ 00243 size = PAPI_MIN_STR_LEN; 00244 #ifdef CUDA_4_0 00245 err = cuptiEventGetAttribute( dev, 00246 device[deviceId].domain[domainId]. 00247 event[id].eventId, CUPTI_EVENT_ATTR_NAME, 00248 &size, 00249 ( uint8_t * ) device[deviceId]. 00250 domain[domainId].event[id].name ); 00251 #else 00252 err = cuptiEventGetAttribute( device[deviceId].domain[domainId]. 00253 event[id].eventId, CUPTI_EVENT_ATTR_NAME, 00254 &size, 00255 ( uint8_t * ) device[deviceId]. 00256 domain[domainId].event[id].name ); 00257 #endif 00258 CHECK_CUPTI_ERROR( err, "cuptiEventGetAttribute" ); 00259 00260 /* query event description */ 00261 size = PAPI_2MAX_STR_LEN; 00262 #ifdef CUDA_4_0 00263 err = cuptiEventGetAttribute( dev, 00264 device[deviceId].domain[domainId]. 00265 event[id].eventId, 00266 CUPTI_EVENT_ATTR_SHORT_DESCRIPTION, &size, 00267 ( uint8_t * ) device[deviceId]. 00268 domain[domainId].event[id].desc ); 00269 #else 00270 err = cuptiEventGetAttribute( device[deviceId].domain[domainId]. 00271 event[id].eventId, 00272 CUPTI_EVENT_ATTR_SHORT_DESCRIPTION, &size, 00273 ( uint8_t * ) device[deviceId]. 
00274 domain[domainId].event[id].desc ); 00275 #endif 00276 CHECK_CUPTI_ERROR( err, "cuptiEventGetAttribute" ); 00277 } 00278 00279 totalEventCount += device[deviceId].domain[domainId].eventCount; 00280 free( eventId ); 00281 return 0; 00282 } 00283 00284 00285 /* 00286 * Create the native events for specified domain and device 00287 */ 00288 static int 00289 createNativeEvents( void ) 00290 { 00291 int deviceId, id = 0; 00292 uint32_t domainId, eventId; 00293 int cuptiDomainId; 00294 int i; 00295 int devNameLen; 00296 00297 /* component name and description */ 00298 strcpy( _cuda_vector.cmp_info.short_name, "CUDA" ); 00299 strcpy( _cuda_vector.cmp_info.description, 00300 "CuPTI provides the API for monitoring CUDA hardware events" ); 00301 00302 /* create events for every GPU device and every domain per device */ 00303 for ( deviceId = 0; deviceId < deviceCount; deviceId++ ) { 00304 /* for the event names, replace blanks in the device name with underscores */ 00305 devNameLen = strlen( device[deviceId].name ); 00306 for ( i = 0; i < devNameLen; i++ ) 00307 if ( device[deviceId].name[i] == ' ' ) 00308 device[deviceId].name[i] = '_'; 00309 00310 for ( domainId = 0; domainId < device[deviceId].domainCount; 00311 domainId++ ) { 00312 cuptiDomainId = device[deviceId].domain[domainId].domainId; 00313 00314 for ( eventId = 0; 00315 eventId < device[deviceId].domain[domainId].eventCount; 00316 eventId++ ) { 00317 /* Save native event data */ 00318 sprintf( cuda_native_table[id].name, 00319 "%s:%s:%s", 00320 device[deviceId].name, 00321 device[deviceId].domain[domainId].name, 00322 device[deviceId].domain[domainId].event[eventId]. 00323 name ); 00324 00325 strncpy( cuda_native_table[id].description, 00326 device[deviceId].domain[domainId].event[eventId].desc, 00327 PAPI_2MAX_STR_LEN ); 00328 00329 /* The selector has to be !=0 . 
Starts with 1 */ 00330 cuda_native_table[id].resources.selector = id + 1; 00331 00332 /* store event ID */ 00333 cuda_native_table[id].resources.eventId = 00334 device[deviceId].domain[domainId].event[eventId].eventId; 00335 00336 /* increment the table index counter */ 00337 id++; 00338 } 00339 } 00340 } 00341 00342 /* Return the number of events created */ 00343 return id; 00344 } 00345 00346 00347 /* 00348 * Returns all event values from the CuPTI eventGroup 00349 */ 00350 static int 00351 getEventValue( long long *counts, CUpti_EventGroup eventGroup, AddedEvents_t addedEvents ) 00352 { 00353 CUptiResult cuptiErr = CUPTI_SUCCESS; 00354 size_t events_read, bufferSizeBytes, arraySizeBytes, i; 00355 uint64_t *counterDataBuffer; 00356 CUpti_EventID *eventIDArray; 00357 int j; 00358 00359 bufferSizeBytes = addedEvents.count * sizeof ( uint64_t ); 00360 counterDataBuffer = ( uint64_t * ) malloc( bufferSizeBytes ); 00361 00362 arraySizeBytes = addedEvents.count * sizeof ( CUpti_EventID ); 00363 eventIDArray = ( CUpti_EventID * ) malloc( arraySizeBytes ); 00364 00365 /* read counter data for the specified event from the CuPTI eventGroup */ 00366 cuptiErr = cuptiEventGroupReadAllEvents( eventGroup, 00367 CUPTI_EVENT_READ_FLAG_NONE, 00368 &bufferSizeBytes, 00369 counterDataBuffer, &arraySizeBytes, 00370 eventIDArray, &events_read ); 00371 CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupReadAllEvents" ); 00372 00373 if ( events_read != ( size_t ) addedEvents.count ) 00374 return -1; 00375 00376 /* Since there is no guarantee that returned counter values are in the same 00377 order as the counters in the PAPI addedEvents.list, we need to map the 00378 CUpti_EventID to PAPI event ID values. 
00379 According to CuPTI doc: counter return values of counterDataBuffer 00380 correspond to the return event IDs in eventIDArray */ 00381 for ( i = 0; i < events_read; i++ ) 00382 for ( j = 0; j < addedEvents.count; j++ ) 00383 if ( cuda_native_table[addedEvents.list[j]].resources.eventId == 00384 eventIDArray[i] ) 00385 // since cuptiEventGroupReadAllEvents() resets counter values to 0; 00386 // we have to accumulate ourselves 00387 counts[addedEvents.list[j]] = counts[addedEvents.list[j]] + counterDataBuffer[i]; 00388 00389 free( counterDataBuffer ); 00390 free( eventIDArray ); 00391 return 0; 00392 } 00393 00394 00395 /***************************************************************************** 00396 ******************* BEGIN PAPI's COMPONENT REQUIRED FUNCTIONS ************* 00397 *****************************************************************************/ 00398 00399 /* 00400 * This is called whenever a thread is initialized 00401 */ 00402 int 00403 CUDA_init_thread( hwd_context_t * ctx ) 00404 { 00405 CUDA_context_t * CUDA_ctx = ( CUDA_context_t * ) ctx; 00406 /* Initialize number of events in EventSet for update_control_state() */ 00407 CUDA_ctx->state.old_count = 0; 00408 00409 return PAPI_OK; 00410 } 00411 00412 00413 /* Initialize hardware counters, setup the function vector table 00414 * and get hardware information, this routine is called when the 00415 * PAPI process is initialized (IE PAPI_library_init) 00416 * 00417 * NOTE: only called by main thread (not by every thread) !!! 00418 * 00419 * Starting in CUDA 4.0, multiple CPU threads can access the same CUDA context. 00420 * This is a much easier programming model then pre-4.0 as threads - using the 00421 * same context - can share memory, data, etc. 00422 * It's possible to create a different context for each thread, but then we are 00423 * likely running into a limitation that only one context can be profiled at a time. 00424 * ==> and we don't want this. 
That's why CUDA context creation is done in 00425 * CUDA_init_component() (called only by main thread) rather than CUDA_init() 00426 * or CUDA_init_control_state() (both called by each thread). 00427 */ 00428 int 00429 CUDA_init_component( ) 00430 { 00431 CUresult cuErr = CUDA_SUCCESS; 00432 00433 /* Create dynamic event table */ 00434 NUM_EVENTS = detectDevice( ); 00435 00436 /* TODO: works only for one device right now; 00437 need to find out if user can use 2 or more devices at same time */ 00438 00439 /* want create a CUDA context for either the default device or 00440 the device specified with cudaSetDevice() in user code */ 00441 if ( CUDA_SUCCESS != cudaGetDevice( ¤tDeviceID ) ) 00442 return ( PAPI_ENOSUPP ); 00443 00444 if ( getenv( "PAPI_VERBOSE" ) ) { 00445 printf( "DEVICE USED: %s (%d)\n", device[currentDeviceID].name, 00446 currentDeviceID ); 00447 } 00448 00449 /* get the CUDA context from the calling CPU thread */ 00450 cuErr = cuCtxGetCurrent( &cuCtx ); 00451 00452 /* if no CUDA context is bound to the calling CPU thread yet, create one */ 00453 if ( cuErr != CUDA_SUCCESS || cuCtx == NULL ) { 00454 cuErr = cuCtxCreate( &cuCtx, 0, device[currentDeviceID].dev ); 00455 CHECK_CU_ERROR( cuErr, "cuCtxCreate" ); 00456 } 00457 00458 /* cuCtxGetCurrent() can return a non-null context that is not valid 00459 because the context has not yet been initialized. 
00460 Here is a workaround: 00461 cudaFree(NULL) forces the context to be initialized 00462 if cudaFree(NULL) returns success then we are able to use the context in subsequent calls 00463 if cudaFree(NULL) returns an error (or subsequent cupti* calls) then the context is not usable, 00464 and will never be useable */ 00465 if ( CUDA_SUCCESS != cudaFree( NULL ) ) 00466 return ( PAPI_ENOSUPP ); 00467 00468 /* Create dynamic event table */ 00469 cuda_native_table = ( CUDA_native_event_entry_t * ) 00470 malloc( sizeof ( CUDA_native_event_entry_t ) * NUM_EVENTS ); 00471 if ( cuda_native_table == NULL ) { 00472 perror( "malloc(): Failed to allocate memory to events table" ); 00473 return ( PAPI_ENOSUPP ); 00474 } 00475 00476 if ( NUM_EVENTS != createNativeEvents( ) ) 00477 return ( PAPI_ENOSUPP ); 00478 00479 return ( PAPI_OK ); 00480 } 00481 00482 00483 /* 00484 * Control of counters (Reading/Writing/Starting/Stopping/Setup) 00485 * functions 00486 */ 00487 int 00488 CUDA_init_control_state( hwd_control_state_t * ctrl ) 00489 { 00490 CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl; 00491 CUptiResult cuptiErr = CUPTI_SUCCESS; 00492 int i; 00493 00494 /* allocate memory for the list of events that are added to the CuPTI eventGroup */ 00495 CUDA_ctrl->addedEvents.list = malloc( sizeof ( int ) * NUM_EVENTS ); 00496 if ( CUDA_ctrl->addedEvents.list == NULL ) { 00497 perror 00498 ( "malloc(): Failed to allocate memory to table of events that are added to CuPTI eventGroup" ); 00499 return ( PAPI_ENOSUPP ); 00500 } 00501 00502 /* initialize the event list */ 00503 for ( i = 0; i < NUM_EVENTS; i++ ) 00504 CUDA_ctrl->addedEvents.list[i] = 0; 00505 00506 00507 00508 cuptiErr = cuptiEventGroupCreate( cuCtx, &CUDA_ctrl->eventGroup, 0 ); 00509 CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupCreate" ); 00510 00511 return PAPI_OK; 00512 } 00513 00514 00515 /* 00516 * 00517 */ 00518 int 00519 CUDA_start( hwd_context_t * ctx, hwd_control_state_t * ctrl ) 00520 { 00521 ( 
void ) ctx; 00522 int i; 00523 CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl; 00524 CUptiResult cuptiErr = CUPTI_SUCCESS; 00525 00526 // reset all event values to 0 00527 for ( i = 0; i < NUM_EVENTS; i++ ) 00528 CUDA_ctrl->counts[i] = 0; 00529 00530 cuptiErr = cuptiEventGroupEnable( CUDA_ctrl->eventGroup ); 00531 CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupEnable" ); 00532 00533 /* Resets all events in the CuPTI eventGroup to zero */ 00534 cuptiErr = cuptiEventGroupResetAllEvents( CUDA_ctrl->eventGroup ); 00535 CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupResetAllEvents" ); 00536 00537 return ( PAPI_OK ); 00538 } 00539 00540 00541 /* 00542 * 00543 */ 00544 int 00545 CUDA_stop( hwd_context_t * ctx, hwd_control_state_t * ctrl ) 00546 { 00547 ( void ) ctx; 00548 ( void ) ctrl; 00549 00550 return ( PAPI_OK ); 00551 } 00552 00553 00554 /* 00555 * 00556 */ 00557 int 00558 CUDA_read( hwd_context_t * ctx, hwd_control_state_t * ctrl, 00559 long_long ** events, int flags ) 00560 { 00561 ( void ) ctx; 00562 ( void ) flags; 00563 CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl; 00564 00565 00566 if ( 0 != getEventValue( CUDA_ctrl->counts, CUDA_ctrl->eventGroup, CUDA_ctrl->addedEvents ) ) 00567 return ( PAPI_ENOSUPP ); 00568 00569 *events = CUDA_ctrl->counts; 00570 00571 return ( PAPI_OK ); 00572 } 00573 00574 /* 00575 * 00576 */ 00577 int 00578 CUDA_shutdown_thread( hwd_context_t *ctx ) 00579 { 00580 CUDA_context_t *CUDA_ctx = (CUDA_context_t*)ctx; 00581 free( CUDA_ctx->state.addedEvents.list ); 00582 return (PAPI_OK); 00583 } 00584 00585 /* 00586 * 00587 */ 00588 int 00589 CUDA_shutdown_component( void ) 00590 { 00591 CUresult cuErr = CUDA_SUCCESS; 00592 00593 /* if running a threaded application, we need to make sure that 00594 a thread doesn't free the same memory location(s) more than once */ 00595 if ( CUDA_FREED == 0 ) { 00596 uint32_t j; 00597 int i; 00598 00599 CUDA_FREED = 1; 00600 00601 /* deallocate all the memory */ 00602 
for ( i = 0; i < deviceCount; i++ ) { 00603 for ( j = 0; j < device[i].domainCount; j++ ) 00604 free( device[i].domain[j].event ); 00605 00606 free( device[i].domain ); 00607 } 00608 00609 free( device ); 00610 free( cuda_native_table ); 00611 00612 /* destroy floating CUDA context */ 00613 cuErr = cuCtxDestroy( cuCtx ); 00614 if ( cuErr != CUDA_SUCCESS ) 00615 return ( PAPI_ENOSUPP ); // Not supported 00616 } 00617 00618 00619 return ( PAPI_OK ); 00620 } 00621 00622 00623 /* This function sets various options in the component 00624 * The valid codes being passed in are PAPI_SET_DEFDOM, 00625 * PAPI_SET_DOMAIN, PAPI_SETDEFGRN, PAPI_SET_GRANUL * and PAPI_SET_INHERIT 00626 */ 00627 int 00628 CUDA_ctl( hwd_context_t * ctx, int code, _papi_int_option_t * option ) 00629 { 00630 ( void ) ctx; 00631 ( void ) code; 00632 ( void ) option; 00633 return ( PAPI_OK ); 00634 } 00635 00636 00637 //int CUDA_ntv_code_to_bits ( unsigned int EventCode, hwd_register_t * bits ); 00638 00639 00640 /* 00641 * 00642 */ 00643 int 00644 CUDA_update_control_state( hwd_control_state_t * ptr, 00645 NativeInfo_t * native, int count, 00646 hwd_context_t * ctx ) 00647 { 00648 ( void ) ctx; 00649 CUDA_control_state_t * CUDA_ptr = ( CUDA_control_state_t * ) ptr; 00650 int index; 00651 CUptiResult cuptiErr = CUPTI_SUCCESS; 00652 00653 cuptiErr = cuptiEventGroupDisable( CUDA_ptr->eventGroup ); 00654 CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupDisable" ); 00655 00656 /* Remove or Add events */ 00657 if ( CUDA_ptr->old_count > count ) { 00658 cuptiErr = 00659 cuptiEventGroupRemoveEvent( CUDA_ptr->eventGroup, 00660 cuda_native_table[CUDA_ptr->addedEvents.list[0]]. 
00661 resources.eventId ); 00662 00663 /* Keep track of events in EventGroup if an event is removed */ 00664 CUDA_ptr->old_count = count; 00665 } else { 00666 index = native[count - 1].ni_event; 00667 native[count - 1].ni_position = index; 00668 00669 /* store events, that have been added to the CuPTI eveentGroup 00670 in a seperate place (addedEvents). 00671 Needed, so that we can read the values for the added events only */ 00672 CUDA_ptr->addedEvents.count = count; 00673 CUDA_ptr->addedEvents.list[count - 1] = index; 00674 00675 /* if this device name is different from the actual device the code is running on, then exit */ 00676 if ( 0 != strncmp( device[currentDeviceID].name, 00677 cuda_native_table[index].name, 00678 strlen( device[currentDeviceID].name ) ) ) { 00679 fprintf( stderr, "Device %s is used -- BUT event %s is collected. \n ---> ERROR: Specify events for the device that is used!\n\n", 00680 device[currentDeviceID].name, cuda_native_table[index].name ); 00681 00682 return ( PAPI_ENOSUPP ); // Not supported 00683 } 00684 00685 /* Add events to the CuPTI eventGroup */ 00686 cuptiErr = 00687 cuptiEventGroupAddEvent( CUDA_ptr->eventGroup, 00688 cuda_native_table[index].resources. 
00689 eventId ); 00690 CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupAddEvent" ); 00691 00692 /* Keep track of events in EventGroup if an event is removed */ 00693 CUDA_ptr->old_count = count; 00694 } 00695 00696 return ( PAPI_OK ); 00697 } 00698 00699 00700 /* 00701 * This function has to set the bits needed to count different domains 00702 * In particular: PAPI_DOM_USER, PAPI_DOM_KERNEL PAPI_DOM_OTHER 00703 * By default return PAPI_EINVAL if none of those are specified 00704 * and PAPI_OK with success 00705 * PAPI_DOM_USER is only user context is counted 00706 * PAPI_DOM_KERNEL is only the Kernel/OS context is counted 00707 * PAPI_DOM_OTHER is Exception/transient mode (like user TLB misses) 00708 * PAPI_DOM_ALL is all of the domains 00709 */ 00710 int 00711 CUDA_set_domain( hwd_control_state_t * cntrl, int domain ) 00712 { 00713 int found = 0; 00714 ( void ) cntrl; 00715 00716 if ( PAPI_DOM_USER & domain ) 00717 found = 1; 00718 00719 if ( PAPI_DOM_KERNEL & domain ) 00720 found = 1; 00721 00722 if ( PAPI_DOM_OTHER & domain ) 00723 found = 1; 00724 00725 if ( !found ) 00726 return ( PAPI_EINVAL ); 00727 00728 return ( PAPI_OK ); 00729 } 00730 00731 00732 /* 00733 * 00734 */ 00735 int 00736 CUDA_reset( hwd_context_t * ctx, hwd_control_state_t * ctrl ) 00737 { 00738 ( void ) ctx; 00739 CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl; 00740 CUptiResult cuptiErr = CUPTI_SUCCESS; 00741 00742 /* Resets all events in the CuPTI eventGroup to zero */ 00743 cuptiErr = cuptiEventGroupResetAllEvents( CUDA_ctrl->eventGroup ); 00744 CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupResetAllEvents" ); 00745 00746 return ( PAPI_OK ); 00747 } 00748 00749 00750 /* 00751 * Disable and Destoy the CUDA eventGroup */ 00752 int 00753 CUDA_cleanup_eventset( hwd_control_state_t * ctrl ) 00754 { 00755 CUDA_control_state_t * CUDA_ctrl = ( CUDA_control_state_t * ) ctrl; 00756 CUptiResult cuptiErr = CUPTI_SUCCESS; 00757 00758 /* Disable the CUDA eventGroup; 00759 it also frees 
the perfmon hardware on the GPU */ 00760 cuptiErr = cuptiEventGroupDisable( CUDA_ctrl->eventGroup ); 00761 CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupDisable" ); 00762 00763 /* Call the CuPTI cleaning function before leaving */ 00764 cuptiErr = cuptiEventGroupDestroy( CUDA_ctrl->eventGroup ); 00765 CHECK_CUPTI_ERROR( cuptiErr, "cuptiEventGroupDestroy" ); 00766 00767 return ( PAPI_OK ); 00768 } 00769 00770 00771 /* 00772 * Native Event functions 00773 */ 00774 int 00775 CUDA_ntv_enum_events( unsigned int *EventCode, int modifier ) 00776 { 00777 00778 switch ( modifier ) { 00779 case PAPI_ENUM_FIRST: 00780 *EventCode = 0; 00781 00782 return ( PAPI_OK ); 00783 break; 00784 00785 case PAPI_ENUM_EVENTS: 00786 { 00787 int index = *EventCode; 00788 00789 if ( index < NUM_EVENTS - 1 ) { 00790 *EventCode = *EventCode + 1; 00791 return ( PAPI_OK ); 00792 } else 00793 return ( PAPI_ENOEVNT ); 00794 00795 break; 00796 } 00797 default: 00798 return ( PAPI_EINVAL ); 00799 } 00800 return ( PAPI_EINVAL ); 00801 } 00802 00803 00804 /* 00805 * 00806 */ 00807 int 00808 CUDA_ntv_code_to_name( unsigned int EventCode, char *name, int len ) 00809 { 00810 int index = EventCode; 00811 00812 strncpy( name, cuda_native_table[index].name, len ); 00813 return ( PAPI_OK ); 00814 } 00815 00816 00817 /* 00818 * 00819 */ 00820 int 00821 CUDA_ntv_code_to_descr( unsigned int EventCode, char *name, int len ) 00822 { 00823 int index = EventCode; 00824 00825 strncpy( name, cuda_native_table[index].description, len ); 00826 return ( PAPI_OK ); 00827 } 00828 00829 00830 /* 00831 * 00832 */ 00833 int 00834 CUDA_ntv_code_to_bits( unsigned int EventCode, hwd_register_t * bits ) 00835 { 00836 int index = EventCode; 00837 00838 memcpy( ( CUDA_register_t * ) bits, 00839 &( cuda_native_table[index].resources ), 00840 sizeof ( CUDA_register_t ) ); 00841 00842 return ( PAPI_OK ); 00843 } 00844 00845 00846 /* 00847 * 00848 */ 00849 papi_vector_t _cuda_vector = { 00850 .cmp_info = { 00851 /* default component 
information (unspecified values are initialized to 0) */ 00852 .name = "cuda", 00853 .short_name = "cuda", 00854 .version = "5.0", 00855 .num_mpx_cntrs = CUDA_MAX_COUNTERS, 00856 .num_cntrs = CUDA_MAX_COUNTERS, 00857 .default_domain = PAPI_DOM_USER, 00858 .default_granularity = PAPI_GRN_THR, 00859 .available_granularities = PAPI_GRN_THR, 00860 .hardware_intr_sig = PAPI_INT_SIGNAL, 00861 00862 /* component specific cmp_info initializations */ 00863 .fast_real_timer = 0, 00864 .fast_virtual_timer = 0, 00865 .attach = 0, 00866 .attach_must_ptrace = 0, 00867 .available_domains = PAPI_DOM_USER | PAPI_DOM_KERNEL, 00868 } 00869 , 00870 00871 /* sizes of framework-opaque component-private structures */ 00872 .size = { 00873 .context = sizeof ( CUDA_context_t ), 00874 .control_state = sizeof ( CUDA_control_state_t ), 00875 .reg_value = sizeof ( CUDA_register_t ), 00876 .reg_alloc = sizeof ( CUDA_reg_alloc_t ), 00877 } 00878 , 00879 /* function pointers in this component */ 00880 .init_thread = CUDA_init_thread, 00881 .init_component = CUDA_init_component, 00882 .init_control_state = CUDA_init_control_state, 00883 .start = CUDA_start, 00884 .stop = CUDA_stop, 00885 .read = CUDA_read, 00886 .shutdown_component = CUDA_shutdown_component, 00887 .shutdown_thread = CUDA_shutdown_thread, 00888 .cleanup_eventset = CUDA_cleanup_eventset, 00889 .ctl = CUDA_ctl, 00890 .update_control_state = CUDA_update_control_state, 00891 .set_domain = CUDA_set_domain, 00892 .reset = CUDA_reset, 00893 00894 .ntv_enum_events = CUDA_ntv_enum_events, 00895 .ntv_code_to_name = CUDA_ntv_code_to_name, 00896 .ntv_code_to_descr = CUDA_ntv_code_to_descr, 00897 .ntv_code_to_bits = CUDA_ntv_code_to_bits, 00898 };