|
PAPI
5.3.0.0
|
00001 /* 00002 * File: perf_event_uncore.c 00003 * 00004 * Author: Vince Weaver 00005 * vincent.weaver@maine.edu 00006 */ 00007 00008 #include <fcntl.h> 00009 #include <string.h> 00010 #include <errno.h> 00011 #include <signal.h> 00012 #include <syscall.h> 00013 #include <sys/utsname.h> 00014 #include <sys/mman.h> 00015 #include <sys/ioctl.h> 00016 00017 /* PAPI-specific includes */ 00018 #include "papi.h" 00019 #include "papi_memory.h" 00020 #include "papi_internal.h" 00021 #include "papi_vector.h" 00022 #include "extras.h" 00023 00024 /* libpfm4 includes */ 00025 #include "papi_libpfm4_events.h" 00026 #include "peu_libpfm4_events.h" 00027 #include "perfmon/pfmlib.h" 00028 #include PEINCLUDE 00029 00030 /* Linux-specific includes */ 00031 #include "mb.h" 00032 #include "linux-memory.h" 00033 #include "linux-timer.h" 00034 #include "linux-common.h" 00035 #include "linux-context.h" 00036 00037 #include "components/perf_event/perf_event_lib.h" 00038 00039 /* Forward declaration */ 00040 papi_vector_t _perf_event_uncore_vector; 00041 00042 /* Globals */ 00043 struct native_event_table_t uncore_native_event_table; 00044 static int our_cidx; 00045 00046 /* Defines for ctx->state */ 00047 #define PERF_EVENTS_OPENED 0x01 00048 #define PERF_EVENTS_RUNNING 0x02 00049 00050 00051 /* The read format on perf_event varies based on various flags that */ 00052 /* are passed into it. This helper avoids copying this logic */ 00053 /* multiple places. */ 00054 static unsigned int 00055 get_read_format( unsigned int multiplex, 00056 unsigned int inherit, 00057 int format_group ) 00058 { 00059 unsigned int format = 0; 00060 00061 /* if we need read format options for multiplexing, add them now */ 00062 if (multiplex) { 00063 format |= PERF_FORMAT_TOTAL_TIME_ENABLED; 00064 format |= PERF_FORMAT_TOTAL_TIME_RUNNING; 00065 } 00066 00067 /* If we are not using inherit, add the group read options */ 00068 if (!inherit) { 00069 if (format_group) { 00070 format |= PERF_FORMAT_GROUP; 00071 } 00072 } 00073 00074 SUBDBG("multiplex: %d, inherit: %d, group_leader: %d, format: %#x\n", 00075 multiplex, inherit, format_group, format); 00076 00077 return format; 00078 } 00079 00080 /********************************************************************/ 00081 /* Low-level perf_event calls */ 00082 /********************************************************************/ 00083 00084 /* In case headers aren't new enough to have __NR_perf_event_open */ 00085 #ifndef __NR_perf_event_open 00086 00087 #ifdef __powerpc__ 00088 #define __NR_perf_event_open 319 00089 #elif defined(__x86_64__) 00090 #define __NR_perf_event_open 298 00091 #elif defined(__i386__) 00092 #define __NR_perf_event_open 336 00093 #elif defined(__arm__) 366+0x900000 00094 #define __NR_perf_event_open 00095 #endif 00096 00097 #endif 00098 00099 static long 00100 sys_perf_event_open( struct perf_event_attr *hw_event, pid_t pid, int cpu, 00101 int group_fd, unsigned long flags ) 00102 { 00103 int ret; 00104 00105 SUBDBG("sys_perf_event_open(%p,%d,%d,%d,%lx\n",hw_event,pid,cpu,group_fd,flags); 00106 SUBDBG(" type: %d\n",hw_event->type); 00107 SUBDBG(" size: %d\n",hw_event->size); 00108 SUBDBG(" config: %"PRIx64" (%"PRIu64")\n",hw_event->config, 00109 hw_event->config); 00110 SUBDBG(" sample_period: %"PRIu64"\n",hw_event->sample_period); 00111 SUBDBG(" sample_type: %"PRIu64"\n",hw_event->sample_type); 00112 SUBDBG(" read_format: %"PRIu64"\n",hw_event->read_format); 00113 SUBDBG(" disabled: %d\n",hw_event->disabled); 00114 SUBDBG(" inherit: %d\n",hw_event->inherit); 00115 SUBDBG(" pinned: %d\n",hw_event->pinned); 00116 SUBDBG(" exclusive: %d\n",hw_event->exclusive); 00117 SUBDBG(" exclude_user: %d\n",hw_event->exclude_user); 00118 SUBDBG(" exclude_kernel: %d\n",hw_event->exclude_kernel); 00119 SUBDBG(" exclude_hv: %d\n",hw_event->exclude_hv); 00120 SUBDBG(" exclude_idle: %d\n",hw_event->exclude_idle); 00121 SUBDBG(" mmap: %d\n",hw_event->mmap); 00122 SUBDBG(" comm: %d\n",hw_event->comm); 00123 SUBDBG(" freq: %d\n",hw_event->freq); 00124 SUBDBG(" inherit_stat: %d\n",hw_event->inherit_stat); 00125 SUBDBG(" enable_on_exec: %d\n",hw_event->enable_on_exec); 00126 SUBDBG(" task: %d\n",hw_event->task); 00127 SUBDBG(" watermark: %d\n",hw_event->watermark); 00128 00129 ret = 00130 syscall( __NR_perf_event_open, hw_event, pid, cpu, group_fd, flags ); 00131 SUBDBG("Returned %d %d %s\n",ret, 00132 ret<0?errno:0, 00133 ret<0?strerror(errno):" "); 00134 return ret; 00135 } 00136 00137 00138 static int map_perf_event_errors_to_papi(int perf_event_error) { 00139 00140 int ret; 00141 00142 /* These mappings are approximate. 00143 EINVAL in particular can mean lots of different things */ 00144 switch(perf_event_error) { 00145 case EPERM: 00146 case EACCES: 00147 ret = PAPI_EPERM; 00148 break; 00149 case ENODEV: 00150 case EOPNOTSUPP: 00151 ret = PAPI_ENOSUPP; 00152 break; 00153 case ENOENT: 00154 ret = PAPI_ENOEVNT; 00155 break; 00156 case ENOSYS: 00157 case EAGAIN: 00158 case EBUSY: 00159 case E2BIG: 00160 ret = PAPI_ESYS; 00161 break; 00162 case ENOMEM: 00163 ret = PAPI_ENOMEM; 00164 break; 00165 case EINVAL: 00166 default: 00167 ret = PAPI_EINVAL; 00168 break; 00169 } 00170 return ret; 00171 } 00172 00173 /* Maximum size we ever expect to read from a perf_event fd */ 00174 /* (this is the number of 64-bit values) */ 00175 /* We use this to size the read buffers */ 00176 /* The three is for event count, time_enabled, time_running */ 00177 /* and the counter term is count value and count id for each */ 00178 /* possible counter value. */ 00179 #define READ_BUFFER_SIZE (3 + (2 * PERF_EVENT_MAX_MPX_COUNTERS)) 00180 00181 /* Open all events in the control state */ 00182 static int 00183 open_pe_events( pe_context_t *ctx, pe_control_t *ctl ) 00184 { 00185 00186 int i, ret = PAPI_OK; 00187 long pid; 00188 00189 if (ctl->granularity==PAPI_GRN_SYS) { 00190 pid = -1; 00191 } 00192 else { 00193 pid = ctl->tid; 00194 } 00195 00196 for( i = 0; i < ctl->num_events; i++ ) { 00197 00198 ctl->events[i].event_opened=0; 00199 00200 /* set up the attr structure. We don't set up all fields here */ 00201 /* as some have already been set up previously. */ 00202 00203 /* group leader (event 0) is special */ 00204 /* If we're multiplexed, everyone is a group leader */ 00205 if (( i == 0 ) || (ctl->multiplexed)) { 00206 ctl->events[i].attr.pinned = !ctl->multiplexed; 00207 ctl->events[i].attr.disabled = 1; 00208 ctl->events[i].group_leader_fd=-1; 00209 ctl->events[i].attr.read_format = get_read_format(ctl->multiplexed, 00210 ctl->inherit, 00211 !ctl->multiplexed ); 00212 } else { 00213 ctl->events[i].attr.pinned=0; 00214 ctl->events[i].attr.disabled = 0; 00215 ctl->events[i].group_leader_fd=ctl->events[0].event_fd, 00216 ctl->events[i].attr.read_format = get_read_format(ctl->multiplexed, 00217 ctl->inherit, 00218 0 ); 00219 } 00220 00221 00222 /* try to open */ 00223 ctl->events[i].event_fd = sys_perf_event_open( &ctl->events[i].attr, 00224 pid, 00225 ctl->cpu, 00226 ctl->events[i].group_leader_fd, 00227 0 /* flags */ 00228 ); 00229 00230 /* Try to match Linux errors to PAPI errors */ 00231 if ( ctl->events[i].event_fd == -1 ) { 00232 SUBDBG("sys_perf_event_open returned error on event #%d." 00233 " Error: %s\n", 00234 i, strerror( errno ) ); 00235 ret=map_perf_event_errors_to_papi(errno); 00236 00237 goto open_pe_cleanup; 00238 } 00239 00240 SUBDBG ("sys_perf_event_open: tid: %ld, cpu_num: %d," 00241 " group_leader/fd: %d, event_fd: %d," 00242 " read_format: 0x%"PRIu64"\n", 00243 pid, ctl->cpu, ctl->events[i].group_leader_fd, 00244 ctl->events[i].event_fd, ctl->events[i].attr.read_format); 00245 00246 ctl->events[i].event_opened=1; 00247 } 00248 00249 /* Now that we've successfully opened all of the events, do whatever */ 00250 /* "tune-up" is needed to attach the mmap'd buffers, signal handlers, */ 00251 /* and so on. */ 00252 for ( i = 0; i < ctl->num_events; i++ ) { 00253 00254 /* No sampling if uncore */ 00255 ctl->events[i].mmap_buf = NULL; 00256 } 00257 00258 /* Set num_evts only if completely successful */ 00259 ctx->state |= PERF_EVENTS_OPENED; 00260 00261 return PAPI_OK; 00262 00263 open_pe_cleanup: 00264 /* We encountered an error, close up the fds we successfully opened. */ 00265 /* We go backward in an attempt to close group leaders last, although */ 00266 /* That's probably not strictly necessary. */ 00267 while ( i > 0 ) { 00268 i--; 00269 if (ctl->events[i].event_fd>=0) { 00270 close( ctl->events[i].event_fd ); 00271 ctl->events[i].event_opened=0; 00272 } 00273 } 00274 00275 return ret; 00276 } 00277 00278 /* Close all of the opened events */ 00279 static int 00280 close_pe_events( pe_context_t *ctx, pe_control_t *ctl ) 00281 { 00282 int i; 00283 int num_closed=0; 00284 int events_not_opened=0; 00285 00286 /* should this be a more serious error? */ 00287 if ( ctx->state & PERF_EVENTS_RUNNING ) { 00288 SUBDBG("Closing without stopping first\n"); 00289 } 00290 00291 /* Close child events first */ 00292 for( i=0; i<ctl->num_events; i++ ) { 00293 00294 if (ctl->events[i].event_opened) { 00295 00296 if (ctl->events[i].group_leader_fd!=-1) { 00297 if ( ctl->events[i].mmap_buf ) { 00298 if ( munmap ( ctl->events[i].mmap_buf, 00299 ctl->events[i].nr_mmap_pages * getpagesize() ) ) { 00300 PAPIERROR( "munmap of fd = %d returned error: %s", 00301 ctl->events[i].event_fd, strerror( errno ) ); 00302 return PAPI_ESYS; 00303 } 00304 } 00305 00306 if ( close( ctl->events[i].event_fd ) ) { 00307 PAPIERROR( "close of fd = %d returned error: %s", 00308 ctl->events[i].event_fd, strerror( errno ) ); 00309 return PAPI_ESYS; 00310 } else { 00311 num_closed++; 00312 } 00313 ctl->events[i].event_opened=0; 00314 } 00315 } 00316 else { 00317 events_not_opened++; 00318 } 00319 } 00320 00321 /* Close the group leaders last */ 00322 for( i=0; i<ctl->num_events; i++ ) { 00323 00324 if (ctl->events[i].event_opened) { 00325 00326 if (ctl->events[i].group_leader_fd==-1) { 00327 if ( ctl->events[i].mmap_buf ) { 00328 if ( munmap ( ctl->events[i].mmap_buf, 00329 ctl->events[i].nr_mmap_pages * getpagesize() ) ) { 00330 PAPIERROR( "munmap of fd = %d returned error: %s", 00331 ctl->events[i].event_fd, strerror( errno ) ); 00332 return PAPI_ESYS; 00333 } 00334 } 00335 00336 00337 if ( close( ctl->events[i].event_fd ) ) { 00338 PAPIERROR( "close of fd = %d returned error: %s", 00339 ctl->events[i].event_fd, strerror( errno ) ); 00340 return PAPI_ESYS; 00341 } else { 00342 num_closed++; 00343 } 00344 ctl->events[i].event_opened=0; 00345 } 00346 } 00347 } 00348 00349 00350 if (ctl->num_events!=num_closed) { 00351 if (ctl->num_events!=(num_closed+events_not_opened)) { 00352 PAPIERROR("Didn't close all events: " 00353 "Closed %d Not Opened: %d Expected %d\n", 00354 num_closed,events_not_opened,ctl->num_events); 00355 return PAPI_EBUG; 00356 } 00357 } 00358 00359 ctl->num_events=0; 00360 00361 ctx->state &= ~PERF_EVENTS_OPENED; 00362 00363 return PAPI_OK; 00364 } 00365 00366 00367 00368 00369 /********************************************************************/ 00370 /* Component Interface */ 00371 /********************************************************************/ 00372 00373 00374 00375 /* Initialize a thread */ 00376 int 00377 _peu_init_thread( hwd_context_t *hwd_ctx ) 00378 { 00379 00380 pe_context_t *pe_ctx = ( pe_context_t *) hwd_ctx; 00381 00382 /* clear the context structure and mark as initialized */ 00383 memset( pe_ctx, 0, sizeof ( pe_context_t ) ); 00384 pe_ctx->initialized=1; 00385 00386 pe_ctx->event_table=&uncore_native_event_table; 00387 pe_ctx->cidx=our_cidx; 00388 00389 return PAPI_OK; 00390 } 00391 00392 /* Initialize a new control state */ 00393 int 00394 _peu_init_control_state( hwd_control_state_t *ctl ) 00395 { 00396 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 00397 00398 /* clear the contents */ 00399 memset( pe_ctl, 0, sizeof ( pe_control_t ) ); 00400 00401 /* Set the default domain */ 00402 _pe_set_domain( ctl, _perf_event_uncore_vector.cmp_info.default_domain ); 00403 00404 /* Set the default granularity */ 00405 pe_ctl->granularity=_perf_event_uncore_vector.cmp_info.default_granularity; 00406 00407 pe_ctl->cidx=our_cidx; 00408 00409 /* Set cpu number in the control block to show events */ 00410 /* are not tied to specific cpu */ 00411 pe_ctl->cpu = -1; 00412 return PAPI_OK; 00413 } 00414 00415 00416 00417 /* Initialize the perf_event uncore component */ 00418 int 00419 _peu_init_component( int cidx ) 00420 { 00421 00422 int retval; 00423 int paranoid_level; 00424 00425 FILE *fff; 00426 00427 our_cidx=cidx; 00428 00429 /* The is the official way to detect if perf_event support exists */ 00430 /* The file is called perf_counter_paranoid on 2.6.31 */ 00431 /* currently we are lazy and do not support 2.6.31 kernels */ 00432 00433 fff=fopen("/proc/sys/kernel/perf_event_paranoid","r"); 00434 if (fff==NULL) { 00435 strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason, 00436 "perf_event support not detected",PAPI_MAX_STR_LEN); 00437 return PAPI_ENOCMP; 00438 } 00439 retval=fscanf(fff,"%d",¶noid_level); 00440 if (retval!=1) fprintf(stderr,"Error reading paranoid level\n"); 00441 fclose(fff); 00442 00443 00444 /* Run the libpfm4-specific setup */ 00445 00446 retval = _papi_libpfm4_init(_papi_hwd[cidx]); 00447 if (retval) { 00448 strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason, 00449 "Error initializing libpfm4",PAPI_MAX_STR_LEN); 00450 return PAPI_ENOCMP; 00451 } 00452 00453 00454 /* Run the uncore specific libpfm4 setup */ 00455 00456 retval = _peu_libpfm4_init(_papi_hwd[cidx], 00457 &uncore_native_event_table, 00458 PMU_TYPE_UNCORE); 00459 if (retval) { 00460 strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason, 00461 "Error setting up libpfm4",PAPI_MAX_STR_LEN); 00462 return PAPI_ENOCMP; 00463 } 00464 00465 /* Check if no uncore events found */ 00466 00467 if (_papi_hwd[cidx]->cmp_info.num_native_events==0) { 00468 strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason, 00469 "No uncore PMUs or events found",PAPI_MAX_STR_LEN); 00470 return PAPI_ENOCMP; 00471 } 00472 00473 /* Check if we have enough permissions for uncore */ 00474 00475 /* 2 means no kernel measurements allowed */ 00476 /* 1 means normal counter access */ 00477 /* 0 means you can access CPU-specific data */ 00478 /* -1 means no restrictions */ 00479 00480 if ((paranoid_level>0) && (getuid()!=0)) { 00481 strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason, 00482 "Insufficient permissions for uncore access. Set /proc/sys/kernel/perf_event_paranoid to 0 or run as root.", 00483 PAPI_MAX_STR_LEN); 00484 return PAPI_ENOCMP; 00485 } 00486 00487 return PAPI_OK; 00488 00489 } 00490 00491 /* Shutdown the perf_event component */ 00492 int _peu_shutdown_component( void ) { 00493 00494 /* deallocate our event table */ 00495 _peu_libpfm4_shutdown(&uncore_native_event_table); 00496 00497 /* Shutdown libpfm4 */ 00498 _papi_libpfm4_shutdown(); 00499 00500 return PAPI_OK; 00501 } 00502 00503 /* This function clears the current contents of the control structure and 00504 updates it with whatever resources are allocated for all the native events 00505 in the native info structure array. */ 00506 00507 int 00508 _peu_update_control_state( hwd_control_state_t *ctl, 00509 NativeInfo_t *native, 00510 int count, hwd_context_t *ctx ) 00511 { 00512 int i = 0, ret; 00513 pe_context_t *pe_ctx = ( pe_context_t *) ctx; 00514 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 00515 00516 /* close all of the existing fds and start over again */ 00517 /* In theory we could have finer-grained control and know if */ 00518 /* things were changed, but it's easier to tear things down and rebuild. */ 00519 close_pe_events( pe_ctx, pe_ctl ); 00520 00521 /* Calling with count==0 should be OK, it's how things are deallocated */ 00522 /* when an eventset is destroyed. */ 00523 if ( count == 0 ) { 00524 SUBDBG( "Called with count == 0\n" ); 00525 return PAPI_OK; 00526 } 00527 00528 /* set up all the events */ 00529 for( i = 0; i < count; i++ ) { 00530 if ( native ) { 00531 /* Have libpfm4 set the config values for the event */ 00532 ret=_peu_libpfm4_setup_counters(&pe_ctl->events[i].attr, 00533 native[i].ni_event, 00534 pe_ctx->event_table); 00535 SUBDBG( "pe_ctl->eventss[%d].config=%"PRIx64"\n",i, 00536 pe_ctl->events[i].attr.config); 00537 if (ret!=PAPI_OK) return ret; 00538 00539 } else { 00540 /* I'm not sure how we'd end up in this case */ 00541 /* should it be an error? */ 00542 } 00543 00544 /* Copy the inherit flag into the attribute block that will be */ 00545 /* passed to the kernel */ 00546 pe_ctl->events[i].attr.inherit = pe_ctl->inherit; 00547 00548 /* Set the position in the native structure */ 00549 /* We just set up events linearly */ 00550 if ( native ) { 00551 native[i].ni_position = i; 00552 } 00553 } 00554 00555 pe_ctl->num_events = count; 00556 _pe_set_domain( ctl, pe_ctl->domain ); 00557 00558 /* actuall open the events */ 00559 /* (why is this a separate function?) */ 00560 ret = open_pe_events( pe_ctx, pe_ctl ); 00561 if ( ret != PAPI_OK ) { 00562 SUBDBG("open_pe_events failed\n"); 00563 /* Restore values ? */ 00564 return ret; 00565 } 00566 00567 return PAPI_OK; 00568 } 00569 00570 /********************************************************************/ 00571 /********************************************************************/ 00572 /* Start with functions that are exported via the module interface */ 00573 /********************************************************************/ 00574 /********************************************************************/ 00575 00576 00577 /* set the domain. FIXME: perf_events allows per-event control of this. */ 00578 /* we do not handle that yet. */ 00579 int 00580 _peu_set_domain( hwd_control_state_t *ctl, int domain) 00581 { 00582 00583 int i; 00584 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 00585 00586 SUBDBG("old control domain %d, new domain %d\n", 00587 pe_ctl->domain,domain); 00588 00589 pe_ctl->domain = domain; 00590 00591 /* Force the domain on all events */ 00592 for( i = 0; i < pe_ctl->num_events; i++ ) { 00593 pe_ctl->events[i].attr.exclude_user = 00594 !( pe_ctl->domain & PAPI_DOM_USER ); 00595 pe_ctl->events[i].attr.exclude_kernel = 00596 !( pe_ctl->domain & PAPI_DOM_KERNEL ); 00597 pe_ctl->events[i].attr.exclude_hv = 00598 !( pe_ctl->domain & PAPI_DOM_SUPERVISOR ); 00599 } 00600 return PAPI_OK; 00601 } 00602 00603 /* Shutdown a thread */ 00604 int 00605 _peu_shutdown_thread( hwd_context_t *ctx ) 00606 { 00607 pe_context_t *pe_ctx = ( pe_context_t *) ctx; 00608 00609 pe_ctx->initialized=0; 00610 00611 return PAPI_OK; 00612 } 00613 00614 00615 /* reset the hardware counters */ 00616 /* Note: PAPI_reset() does not necessarily call this */ 00617 /* unless the events are actually running. */ 00618 int 00619 _peu_reset( hwd_context_t *ctx, hwd_control_state_t *ctl ) 00620 { 00621 int i, ret; 00622 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 00623 00624 ( void ) ctx; /*unused */ 00625 00626 /* We need to reset all of the events, not just the group leaders */ 00627 for( i = 0; i < pe_ctl->num_events; i++ ) { 00628 ret = ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_RESET, NULL ); 00629 if ( ret == -1 ) { 00630 PAPIERROR("ioctl(%d, PERF_EVENT_IOC_RESET, NULL) " 00631 "returned error, Linux says: %s", 00632 pe_ctl->events[i].event_fd, strerror( errno ) ); 00633 return PAPI_ESYS; 00634 } 00635 } 00636 00637 return PAPI_OK; 00638 } 00639 00640 00641 /* write (set) the hardware counters */ 00642 /* Current we do not support this. */ 00643 int 00644 _peu_write( hwd_context_t *ctx, hwd_control_state_t *ctl, 00645 long long *from ) 00646 { 00647 ( void ) ctx; /*unused */ 00648 ( void ) ctl; /*unused */ 00649 ( void ) from; /*unused */ 00650 /* 00651 * Counters cannot be written. Do we need to virtualize the 00652 * counters so that they can be written, or perhaps modify code so that 00653 * they can be written? FIXME ? 00654 */ 00655 00656 return PAPI_ENOSUPP; 00657 } 00658 00659 /* 00660 * perf_event provides a complicated read interface. 00661 * the info returned by read() varies depending on whether 00662 * you have PERF_FORMAT_GROUP, PERF_FORMAT_TOTAL_TIME_ENABLED, 00663 * PERF_FORMAT_TOTAL_TIME_RUNNING, or PERF_FORMAT_ID set 00664 * 00665 * To simplify things we just always ask for everything. This might 00666 * lead to overhead when reading more than we need, but it makes the 00667 * read code a lot simpler than the original implementation we had here. 00668 * 00669 * For more info on the layout see include/linux/perf_event.h 00670 * 00671 */ 00672 00673 int 00674 _peu_read( hwd_context_t *ctx, hwd_control_state_t *ctl, 00675 long long **events, int flags ) 00676 { 00677 ( void ) flags; /*unused */ 00678 int i, ret = -1; 00679 /* pe_context_t *pe_ctx = ( pe_context_t *) ctx; */ 00680 (void) ctx; /*unused*/ 00681 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 00682 long long papi_pe_buffer[READ_BUFFER_SIZE]; 00683 long long tot_time_running, tot_time_enabled, scale; 00684 00685 /* Handle case where we are multiplexing */ 00686 if (pe_ctl->multiplexed) { 00687 00688 /* currently we handle multiplexing by having individual events */ 00689 /* so we read from each in turn. */ 00690 00691 for ( i = 0; i < pe_ctl->num_events; i++ ) { 00692 00693 ret = read( pe_ctl->events[i].event_fd, papi_pe_buffer, 00694 sizeof ( papi_pe_buffer ) ); 00695 if ( ret == -1 ) { 00696 PAPIERROR("read returned an error: ", strerror( errno )); 00697 return PAPI_ESYS; 00698 } 00699 00700 /* We should read 3 64-bit values from the counter */ 00701 if (ret<(signed)(3*sizeof(long long))) { 00702 PAPIERROR("Error! short read!\n"); 00703 return PAPI_ESYS; 00704 } 00705 00706 SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", 00707 pe_ctl->events[i].event_fd, 00708 (long)pe_ctl->tid, pe_ctl->cpu, ret); 00709 SUBDBG("read: %lld %lld %lld\n",papi_pe_buffer[0], 00710 papi_pe_buffer[1],papi_pe_buffer[2]); 00711 00712 tot_time_enabled = papi_pe_buffer[1]; 00713 tot_time_running = papi_pe_buffer[2]; 00714 00715 SUBDBG("count[%d] = (papi_pe_buffer[%d] %lld * " 00716 "tot_time_enabled %lld) / tot_time_running %lld\n", 00717 i, 0,papi_pe_buffer[0], 00718 tot_time_enabled,tot_time_running); 00719 00720 if (tot_time_running == tot_time_enabled) { 00721 /* No scaling needed */ 00722 pe_ctl->counts[i] = papi_pe_buffer[0]; 00723 } else if (tot_time_running && tot_time_enabled) { 00724 /* Scale factor of 100 to avoid overflows when computing */ 00725 /*enabled/running */ 00726 00727 scale = (tot_time_enabled * 100LL) / tot_time_running; 00728 scale = scale * papi_pe_buffer[0]; 00729 scale = scale / 100LL; 00730 pe_ctl->counts[i] = scale; 00731 } else { 00732 /* This should not happen, but Phil reports it sometime does. */ 00733 SUBDBG("perf_event kernel bug(?) count, enabled, " 00734 "running: %lld, %lld, %lld\n", 00735 papi_pe_buffer[0],tot_time_enabled, 00736 tot_time_running); 00737 00738 pe_ctl->counts[i] = papi_pe_buffer[0]; 00739 } 00740 } 00741 } 00742 00743 /* Handle cases where we cannot use FORMAT GROUP */ 00744 else if (pe_ctl->inherit) { 00745 00746 /* we must read each counter individually */ 00747 for ( i = 0; i < pe_ctl->num_events; i++ ) { 00748 00749 ret = read( pe_ctl->events[i].event_fd, papi_pe_buffer, 00750 sizeof ( papi_pe_buffer ) ); 00751 if ( ret == -1 ) { 00752 PAPIERROR("read returned an error: ", strerror( errno )); 00753 return PAPI_ESYS; 00754 } 00755 00756 /* we should read one 64-bit value from each counter */ 00757 if (ret!=sizeof(long long)) { 00758 PAPIERROR("Error! short read!\n"); 00759 PAPIERROR("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", 00760 pe_ctl->events[i].event_fd, 00761 (long)pe_ctl->tid, pe_ctl->cpu, ret); 00762 return PAPI_ESYS; 00763 } 00764 00765 SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", 00766 pe_ctl->events[i].event_fd, (long)pe_ctl->tid, 00767 pe_ctl->cpu, ret); 00768 SUBDBG("read: %lld\n",papi_pe_buffer[0]); 00769 00770 pe_ctl->counts[i] = papi_pe_buffer[0]; 00771 } 00772 } 00773 00774 00775 /* Handle cases where we are using FORMAT_GROUP */ 00776 /* We assume only one group leader, in position 0 */ 00777 00778 else { 00779 if (pe_ctl->events[0].group_leader_fd!=-1) { 00780 PAPIERROR("Was expecting group leader!\n"); 00781 } 00782 00783 ret = read( pe_ctl->events[0].event_fd, papi_pe_buffer, 00784 sizeof ( papi_pe_buffer ) ); 00785 00786 if ( ret == -1 ) { 00787 PAPIERROR("read returned an error: ", strerror( errno )); 00788 return PAPI_ESYS; 00789 } 00790 00791 /* we read 1 64-bit value (number of events) then */ 00792 /* num_events more 64-bit values that hold the counts */ 00793 if (ret<(signed)((1+pe_ctl->num_events)*sizeof(long long))) { 00794 PAPIERROR("Error! short read!\n"); 00795 return PAPI_ESYS; 00796 } 00797 00798 SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", 00799 pe_ctl->events[0].event_fd, 00800 (long)pe_ctl->tid, pe_ctl->cpu, ret); 00801 { 00802 int j; 00803 for(j=0;j<ret/8;j++) { 00804 SUBDBG("read %d: %lld\n",j,papi_pe_buffer[j]); 00805 } 00806 } 00807 00808 /* Make sure the kernel agrees with how many events we have */ 00809 if (papi_pe_buffer[0]!=pe_ctl->num_events) { 00810 PAPIERROR("Error! Wrong number of events!\n"); 00811 return PAPI_ESYS; 00812 } 00813 00814 /* put the count values in their proper location */ 00815 for(i=0;i<papi_pe_buffer[0];i++) { 00816 pe_ctl->counts[i] = papi_pe_buffer[1+i]; 00817 } 00818 } 00819 00820 /* point PAPI to the values we read */ 00821 *events = pe_ctl->counts; 00822 00823 return PAPI_OK; 00824 } 00825 00826 /* Start counting events */ 00827 int 00828 _peu_start( hwd_context_t *ctx, hwd_control_state_t *ctl ) 00829 { 00830 int ret; 00831 int i; 00832 int did_something = 0; 00833 pe_context_t *pe_ctx = ( pe_context_t *) ctx; 00834 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 00835 00836 /* Reset the counters first. Is this necessary? */ 00837 ret = _pe_reset( pe_ctx, pe_ctl ); 00838 if ( ret ) { 00839 return ret; 00840 } 00841 00842 /* Enable all of the group leaders */ 00843 /* All group leaders have a group_leader_fd of -1 */ 00844 for( i = 0; i < pe_ctl->num_events; i++ ) { 00845 if (pe_ctl->events[i].group_leader_fd == -1) { 00846 SUBDBG("ioctl(enable): fd: %d\n", pe_ctl->events[i].event_fd); 00847 ret=ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_ENABLE, NULL) ; 00848 00849 /* ioctls always return -1 on failure */ 00850 if (ret == -1) { 00851 PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) failed.\n"); 00852 return PAPI_ESYS; 00853 } 00854 00855 did_something++; 00856 } 00857 } 00858 00859 if (!did_something) { 00860 PAPIERROR("Did not enable any counters.\n"); 00861 return PAPI_EBUG; 00862 } 00863 00864 pe_ctx->state |= PERF_EVENTS_RUNNING; 00865 00866 return PAPI_OK; 00867 00868 } 00869 00870 /* Stop all of the counters */ 00871 int 00872 _peu_stop( hwd_context_t *ctx, hwd_control_state_t *ctl ) 00873 { 00874 00875 int ret; 00876 int i; 00877 pe_context_t *pe_ctx = ( pe_context_t *) ctx; 00878 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 00879 00880 /* Just disable the group leaders */ 00881 for ( i = 0; i < pe_ctl->num_events; i++ ) { 00882 if ( pe_ctl->events[i].group_leader_fd == -1 ) { 00883 ret=ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_DISABLE, NULL); 00884 if ( ret == -1 ) { 00885 PAPIERROR( "ioctl(%d, PERF_EVENT_IOC_DISABLE, NULL) " 00886 "returned error, Linux says: %s", 00887 pe_ctl->events[i].event_fd, strerror( errno ) ); 00888 return PAPI_EBUG; 00889 } 00890 } 00891 } 00892 00893 pe_ctx->state &= ~PERF_EVENTS_RUNNING; 00894 00895 return PAPI_OK; 00896 } 00897 00898 /* Set various options on a control state */ 00899 int 00900 _peu_ctl( hwd_context_t *ctx, int code, _papi_int_option_t *option ) 00901 { 00902 int ret; 00903 pe_context_t *pe_ctx = ( pe_context_t *) ctx; 00904 pe_control_t *pe_ctl = NULL; 00905 00906 switch ( code ) { 00907 case PAPI_MULTIPLEX: 00908 pe_ctl = ( pe_control_t * ) ( option->multiplex.ESI->ctl_state ); 00909 00910 pe_ctl->multiplexed = 1; 00911 ret = _peu_update_control_state( pe_ctl, NULL, 00912 pe_ctl->num_events, pe_ctx ); 00913 if (ret != PAPI_OK) { 00914 pe_ctl->multiplexed = 0; 00915 } 00916 return ret; 00917 00918 case PAPI_ATTACH: 00919 pe_ctl = ( pe_control_t * ) ( option->attach.ESI->ctl_state ); 00920 00921 pe_ctl->tid = option->attach.tid; 00922 00923 /* If events have been already been added, something may */ 00924 /* have been done to the kernel, so update */ 00925 ret =_peu_update_control_state( pe_ctl, NULL, 00926 pe_ctl->num_events, pe_ctx); 00927 00928 return ret; 00929 00930 case PAPI_DETACH: 00931 pe_ctl = ( pe_control_t *) ( option->attach.ESI->ctl_state ); 00932 00933 pe_ctl->tid = 0; 00934 return PAPI_OK; 00935 00936 case PAPI_CPU_ATTACH: 00937 pe_ctl = ( pe_control_t *) ( option->cpu.ESI->ctl_state ); 00938 00939 /* this tells the kernel not to count for a thread */ 00940 /* should we warn if we try to set both? perf_event */ 00941 /* will reject it. */ 00942 pe_ctl->tid = -1; 00943 00944 pe_ctl->cpu = option->cpu.cpu_num; 00945 00946 return PAPI_OK; 00947 00948 case PAPI_DOMAIN: 00949 pe_ctl = ( pe_control_t *) ( option->domain.ESI->ctl_state ); 00950 00951 /* looks like we are allowed, so set counting domain */ 00952 return _pe_set_domain( pe_ctl, option->domain.domain ); 00953 00954 case PAPI_GRANUL: 00955 pe_ctl = (pe_control_t *) ( option->granularity.ESI->ctl_state ); 00956 00957 /* FIXME: we really don't support this yet */ 00958 00959 switch ( option->granularity.granularity ) { 00960 case PAPI_GRN_PROCG: 00961 case PAPI_GRN_SYS_CPU: 00962 case PAPI_GRN_PROC: 00963 return PAPI_ECMP; 00964 00965 /* Currently we only support thread and CPU granularity */ 00966 case PAPI_GRN_SYS: 00967 pe_ctl->granularity=PAPI_GRN_SYS; 00968 break; 00969 00970 case PAPI_GRN_THR: 00971 pe_ctl->granularity=PAPI_GRN_THR; 00972 break; 00973 00974 00975 default: 00976 return PAPI_EINVAL; 00977 } 00978 return PAPI_OK; 00979 00980 case PAPI_INHERIT: 00981 pe_ctl = (pe_control_t *) ( option->inherit.ESI->ctl_state ); 00982 00983 if (option->inherit.inherit) { 00984 /* children will inherit counters */ 00985 pe_ctl->inherit = 1; 00986 } else { 00987 /* children won't inherit counters */ 00988 pe_ctl->inherit = 0; 00989 } 00990 return PAPI_OK; 00991 00992 case PAPI_DATA_ADDRESS: 00993 return PAPI_ENOSUPP; 00994 00995 case PAPI_INSTR_ADDRESS: 00996 return PAPI_ENOSUPP; 00997 00998 case PAPI_DEF_ITIMER: 00999 return PAPI_ENOSUPP; 01000 01001 case PAPI_DEF_MPX_NS: 01002 return PAPI_ENOSUPP; 01003 01004 case PAPI_DEF_ITIMER_NS: 01005 return PAPI_ENOSUPP; 01006 01007 default: 01008 return PAPI_ENOSUPP; 01009 } 01010 } 01011 01012 01013 int 01014 _peu_ntv_enum_events( unsigned int *PapiEventCode, int modifier ) 01015 { 01016 01017 if (_perf_event_uncore_vector.cmp_info.disabled) return PAPI_ENOEVNT; 01018 01019 01020 return _peu_libpfm4_ntv_enum_events(PapiEventCode, modifier, 01021 &uncore_native_event_table); 01022 } 01023 01024 int 01025 _peu_ntv_name_to_code( char *name, unsigned int *event_code) { 01026 01027 if (_perf_event_uncore_vector.cmp_info.disabled) return PAPI_ENOEVNT; 01028 01029 return _peu_libpfm4_ntv_name_to_code(name,event_code, 01030 &uncore_native_event_table); 01031 } 01032 01033 int 01034 _peu_ntv_code_to_name(unsigned int EventCode, 01035 char *ntv_name, int len) { 01036 01037 if (_perf_event_uncore_vector.cmp_info.disabled) return PAPI_ENOEVNT; 01038 01039 return _peu_libpfm4_ntv_code_to_name(EventCode, 01040 ntv_name, len, 01041 &uncore_native_event_table); 01042 } 01043 01044 int 01045 _peu_ntv_code_to_descr( unsigned int EventCode, 01046 char *ntv_descr, int len) { 01047 01048 if (_perf_event_uncore_vector.cmp_info.disabled) return PAPI_ENOEVNT; 01049 01050 return _peu_libpfm4_ntv_code_to_descr(EventCode,ntv_descr,len, 01051 &uncore_native_event_table); 01052 } 01053 01054 int 01055 _peu_ntv_code_to_info(unsigned int EventCode, 01056 PAPI_event_info_t *info) { 01057 01058 if (_perf_event_uncore_vector.cmp_info.disabled) return PAPI_ENOEVNT; 01059 01060 return _peu_libpfm4_ntv_code_to_info(EventCode, info, 01061 &uncore_native_event_table); 01062 } 01063 01064 /* Our component vector */ 01065 01066 papi_vector_t _perf_event_uncore_vector = { 01067 .cmp_info = { 01068 /* component information (unspecified values initialized to 0) */ 01069 .name = "perf_event_uncore", 01070 .short_name = "peu", 01071 .version = "5.0", 01072 .description = "Linux perf_event CPU uncore and northbridge", 01073 01074 .default_domain = PAPI_DOM_ALL, 01075 .available_domains = PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR, 01076 .default_granularity = PAPI_GRN_SYS, 01077 .available_granularities = PAPI_GRN_SYS, 01078 01079 .num_mpx_cntrs = PERF_EVENT_MAX_MPX_COUNTERS, 01080 01081 /* component specific cmp_info initializations */ 01082 .fast_virtual_timer = 0, 01083 .attach = 1, 01084 .attach_must_ptrace = 1, 01085 .cpu = 1, 01086 .inherit = 1, 01087 .cntr_umasks = 1, 01088 01089 }, 01090 01091 /* sizes of framework-opaque component-private structures */ 01092 .size = { 01093 .context = sizeof ( pe_context_t ), 01094 .control_state = sizeof ( pe_control_t ), 01095 .reg_value = sizeof ( int ), 01096 .reg_alloc = sizeof ( int ), 01097 }, 01098 01099 /* function pointers in this component */ 01100 .init_component = _peu_init_component, 01101 .shutdown_component = _peu_shutdown_component, 01102 .init_thread = _peu_init_thread, 01103 .init_control_state = _peu_init_control_state, 01104 .start = _peu_start, 01105 .stop = _peu_stop, 01106 .read = _peu_read, 01107 .shutdown_thread = _peu_shutdown_thread, 01108 .ctl = _peu_ctl, 01109 .update_control_state = _peu_update_control_state, 01110 .set_domain = _peu_set_domain, 01111 .reset = _peu_reset, 01112 .write = _peu_write, 01113 01114 /* from counter name mapper */ 01115 .ntv_enum_events = _peu_ntv_enum_events, 01116 .ntv_name_to_code = _peu_ntv_name_to_code, 01117 .ntv_code_to_name = _peu_ntv_code_to_name, 01118 .ntv_code_to_descr = _peu_ntv_code_to_descr, 01119 .ntv_code_to_info = _peu_ntv_code_to_info, 01120 }; 01121 01122