|
PAPI
5.3.0.0
|
00001 /* 00002 * File: perf_event.c 00003 * 00004 * Author: Corey Ashford 00005 * cjashfor@us.ibm.com 00006 * - based upon perfmon.c written by - 00007 * Philip Mucci 00008 * mucci@cs.utk.edu 00009 * Mods: Gary Mohr 00010 * gary.mohr@bull.com 00011 * Mods: Vince Weaver 00012 * vweaver1@eecs.utk.edu 00013 * Mods: Philip Mucci 00014 * mucci@eecs.utk.edu */ 00015 00016 00017 #include <fcntl.h> 00018 #include <string.h> 00019 #include <errno.h> 00020 #include <signal.h> 00021 #include <syscall.h> 00022 #include <sys/utsname.h> 00023 #include <sys/mman.h> 00024 #include <sys/ioctl.h> 00025 00026 /* PAPI-specific includes */ 00027 #include "papi.h" 00028 #include "papi_memory.h" 00029 #include "papi_internal.h" 00030 #include "papi_vector.h" 00031 #include "extras.h" 00032 00033 /* libpfm4 includes */ 00034 #include "papi_libpfm4_events.h" 00035 #include "pe_libpfm4_events.h" 00036 #include "perfmon/pfmlib.h" 00037 #include PEINCLUDE 00038 00039 /* Linux-specific includes */ 00040 #include "mb.h" 00041 #include "linux-memory.h" 00042 #include "linux-timer.h" 00043 #include "linux-common.h" 00044 #include "linux-context.h" 00045 00046 #include "perf_event_lib.h" 00047 00048 /* Defines for ctx->state */ 00049 #define PERF_EVENTS_OPENED 0x01 00050 #define PERF_EVENTS_RUNNING 0x02 00051 00052 /* Static globals */ 00053 int nmi_watchdog_active; 00054 00055 /* Forward declaration */ 00056 papi_vector_t _perf_event_vector; 00057 00058 /* Globals */ 00059 struct native_event_table_t perf_native_event_table; 00060 int our_cidx; 00061 00062 /* These sentinels tell _pe_set_overflow() how to set the */ 00063 /* wakeup_events field in the event descriptor record. */ 00064 00065 #define WAKEUP_COUNTER_OVERFLOW 0 00066 #define WAKEUP_PROFILING -1 00067 00068 #define WAKEUP_MODE_COUNTER_OVERFLOW 0 00069 #define WAKEUP_MODE_PROFILING 1 00070 00071 /* The kernel developers say to never use a refresh value of 0 */ 00072 /* See https://lkml.org/lkml/2011/5/24/172 */ 00073 /* However, on some platforms (like Power) a value of 1 does not work */ 00074 /* We're still tracking down why this happens. */ 00075 00076 #if defined(__powerpc__) 00077 #define PAPI_REFRESH_VALUE 0 00078 #else 00079 #define PAPI_REFRESH_VALUE 1 00080 #endif 00081 00082 /* Check for processor support */ 00083 /* Can be used for generic checking, though in general we only */ 00084 /* check for pentium4 here because support was broken for multiple */ 00085 /* kernel releases and the usual standard detections did not */ 00086 /* handle this. So we check for pentium 4 explicitly. */ 00087 static int 00088 processor_supported(int vendor, int family) { 00089 00090 /* Error out if kernel too early to support p4 */ 00091 if (( vendor == PAPI_VENDOR_INTEL ) && (family == 15)) { 00092 if (_papi_os_info.os_version < LINUX_VERSION(2,6,35)) { 00093 PAPIERROR("Pentium 4 not supported on kernels before 2.6.35"); 00094 return PAPI_ENOSUPP; 00095 } 00096 } 00097 return PAPI_OK; 00098 } 00099 00100 /* Fix up the config based on what CPU/Vendor we are running on */ 00101 static int 00102 pe_vendor_fixups(papi_vector_t *vector) 00103 { 00104 /* powerpc */ 00105 /* On IBM and Power6 Machines default domain should include supervisor */ 00106 if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_IBM ) { 00107 vector->cmp_info.available_domains |= 00108 PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR; 00109 if (strcmp(_papi_hwi_system_info.hw_info.model_string, "POWER6" ) == 0 ) { 00110 vector->cmp_info.default_domain = 00111 PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR; 00112 } 00113 } 00114 00115 if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_MIPS ) { 00116 vector->cmp_info.available_domains |= PAPI_DOM_KERNEL; 00117 } 00118 00119 if ((_papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_INTEL) || 00120 (_papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_AMD)) { 00121 vector->cmp_info.fast_real_timer = 1; 00122 } 00123 /* ARM */ 00124 if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_ARM) { 00125 /* FIXME: this will change with Cortex A15 */ 00126 vector->cmp_info.available_domains |= 00127 PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR; 00128 vector->cmp_info.default_domain = 00129 PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR; 00130 } 00131 00132 /* CRAY */ 00133 if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_CRAY ) { 00134 vector->cmp_info.available_domains |= PAPI_DOM_OTHER; 00135 } 00136 00137 return PAPI_OK; 00138 } 00139 00140 00141 00142 /******************************************************************/ 00143 /******** Kernel Version Dependent Routines **********************/ 00144 /******************************************************************/ 00145 00146 /* KERNEL_CHECKS_SCHEDUABILITY_UPON_OPEN is a work-around for kernel arch 00147 * implementations (e.g. x86) which don't do a static event scheduability 00148 * check in sys_perf_event_open. 00149 * This was fixed for x86 in the 2.6.33 kernel 00150 * 00151 * Also! Kernels newer than 2.6.34 will fail in a similar way 00152 * if the nmi_watchdog has stolen a performance counter 00153 * and we try to use the maximum number of counters. 00154 * A sys_perf_event_open() will seem to succeed but will fail 00155 * at read time. So re-use this work around code. 00156 */ 00157 static int 00158 bug_check_scheduability(void) { 00159 00160 #if defined(__powerpc__) 00161 /* PowerPC not affected by this bug */ 00162 #elif defined(__mips__) 00163 /* MIPS as of kernel 3.1 does not properly detect schedulability */ 00164 return 1; 00165 #else 00166 if (_papi_os_info.os_version < LINUX_VERSION(2,6,33)) return 1; 00167 #endif 00168 00169 if (nmi_watchdog_active) return 1; 00170 00171 return 0; 00172 } 00173 00174 /* PERF_FORMAT_GROUP allows reading an entire group's counts at once */ 00175 /* before 2.6.34 PERF_FORMAT_GROUP did not work when reading results */ 00176 /* from attached processes. We are lazy and disable it for all cases */ 00177 /* commit was: 050735b08ca8a016bbace4445fa025b88fee770b */ 00178 00179 static int 00180 bug_format_group(void) { 00181 00182 if (_papi_os_info.os_version < LINUX_VERSION(2,6,34)) return 1; 00183 00184 /* MIPS, as of version 3.1, does not support this properly */ 00185 00186 #if defined(__mips__) 00187 return 1; 00188 #endif 00189 00190 return 0; 00191 00192 } 00193 00194 00195 /* There's a bug prior to Linux 2.6.33 where if you are using */ 00196 /* PERF_FORMAT_GROUP, the TOTAL_TIME_ENABLED and */ 00197 /* TOTAL_TIME_RUNNING fields will be zero unless you disable */ 00198 /* the counters first */ 00199 static int 00200 bug_sync_read(void) { 00201 00202 if (_papi_os_info.os_version < LINUX_VERSION(2,6,33)) return 1; 00203 00204 return 0; 00205 00206 } 00207 00208 00209 /* Set the F_SETOWN_EX flag on the fd. */ 00210 /* This affects which thread an overflow signal gets sent to */ 00211 /* Handled in a subroutine to handle the fact that the behavior */ 00212 /* is dependent on kernel version. */ 00213 static int 00214 fcntl_setown_fd(int fd) { 00215 00216 int ret; 00217 struct f_owner_ex fown_ex; 00218 00219 /* F_SETOWN_EX is not available until 2.6.32 */ 00220 if (_papi_os_info.os_version < LINUX_VERSION(2,6,32)) { 00221 00222 /* get ownership of the descriptor */ 00223 ret = fcntl( fd, F_SETOWN, mygettid( ) ); 00224 if ( ret == -1 ) { 00225 PAPIERROR( "cannot fcntl(F_SETOWN) on %d: %s", fd, strerror(errno) ); 00226 return PAPI_ESYS; 00227 } 00228 } 00229 else { 00230 /* set ownership of the descriptor */ 00231 fown_ex.type = F_OWNER_TID; 00232 fown_ex.pid = mygettid(); 00233 ret = fcntl(fd, F_SETOWN_EX, (unsigned long)&fown_ex ); 00234 00235 if ( ret == -1 ) { 00236 PAPIERROR( "cannot fcntl(F_SETOWN_EX) on %d: %s", 00237 fd, strerror( errno ) ); 00238 return PAPI_ESYS; 00239 } 00240 } 00241 return PAPI_OK; 00242 } 00243 00244 /* The read format on perf_event varies based on various flags that */ 00245 /* are passed into it. This helper avoids copying this logic */ 00246 /* multiple places. */ 00247 static unsigned int 00248 get_read_format( unsigned int multiplex, 00249 unsigned int inherit, 00250 int format_group ) 00251 { 00252 unsigned int format = 0; 00253 00254 /* if we need read format options for multiplexing, add them now */ 00255 if (multiplex) { 00256 format |= PERF_FORMAT_TOTAL_TIME_ENABLED; 00257 format |= PERF_FORMAT_TOTAL_TIME_RUNNING; 00258 } 00259 00260 /* if our kernel supports it and we are not using inherit, */ 00261 /* add the group read options */ 00262 if ( (!bug_format_group()) && !inherit) { 00263 if (format_group) { 00264 format |= PERF_FORMAT_GROUP; 00265 } 00266 } 00267 00268 SUBDBG("multiplex: %d, inherit: %d, group_leader: %d, format: %#x\n", 00269 multiplex, inherit, format_group, format); 00270 00271 return format; 00272 } 00273 00274 /*****************************************************************/ 00275 /********* End Kernel-version Dependent Routines ****************/ 00276 /*****************************************************************/ 00277 00278 /*****************************************************************/ 00279 /********* Begin perf_event low-level code ***********************/ 00280 /*****************************************************************/ 00281 00282 /* In case headers aren't new enough to have __NR_perf_event_open */ 00283 #ifndef __NR_perf_event_open 00284 00285 #ifdef __powerpc__ 00286 #define __NR_perf_event_open 319 00287 #elif defined(__x86_64__) 00288 #define __NR_perf_event_open 298 00289 #elif defined(__i386__) 00290 #define __NR_perf_event_open 336 00291 #elif defined(__arm__) 366+0x900000 00292 #define __NR_perf_event_open 00293 #endif 00294 00295 #endif 00296 00297 static long 00298 sys_perf_event_open( struct perf_event_attr *hw_event, pid_t pid, int cpu, 00299 int group_fd, unsigned long flags ) 00300 { 00301 int ret; 00302 00303 SUBDBG("sys_perf_event_open(%p,%d,%d,%d,%lx\n",hw_event,pid,cpu,group_fd,flags); 00304 SUBDBG(" type: %d\n",hw_event->type); 00305 SUBDBG(" size: %d\n",hw_event->size); 00306 SUBDBG(" config: %"PRIx64" (%"PRIu64")\n",hw_event->config, 00307 hw_event->config); 00308 SUBDBG(" sample_period: %"PRIu64"\n",hw_event->sample_period); 00309 SUBDBG(" sample_type: %"PRIu64"\n",hw_event->sample_type); 00310 SUBDBG(" read_format: %"PRIu64"\n",hw_event->read_format); 00311 SUBDBG(" disabled: %d\n",hw_event->disabled); 00312 SUBDBG(" inherit: %d\n",hw_event->inherit); 00313 SUBDBG(" pinned: %d\n",hw_event->pinned); 00314 SUBDBG(" exclusive: %d\n",hw_event->exclusive); 00315 SUBDBG(" exclude_user: %d\n",hw_event->exclude_user); 00316 SUBDBG(" exclude_kernel: %d\n",hw_event->exclude_kernel); 00317 SUBDBG(" exclude_hv: %d\n",hw_event->exclude_hv); 00318 SUBDBG(" exclude_idle: %d\n",hw_event->exclude_idle); 00319 SUBDBG(" mmap: %d\n",hw_event->mmap); 00320 SUBDBG(" comm: %d\n",hw_event->comm); 00321 SUBDBG(" freq: %d\n",hw_event->freq); 00322 SUBDBG(" inherit_stat: %d\n",hw_event->inherit_stat); 00323 SUBDBG(" enable_on_exec: %d\n",hw_event->enable_on_exec); 00324 SUBDBG(" task: %d\n",hw_event->task); 00325 SUBDBG(" watermark: %d\n",hw_event->watermark); 00326 ret = 00327 syscall( __NR_perf_event_open, hw_event, pid, cpu, group_fd, flags ); 00328 SUBDBG("Returned %d %d %s\n",ret, 00329 ret<0?errno:0, 00330 ret<0?strerror(errno):" "); 00331 return ret; 00332 } 00333 00334 00335 static int map_perf_event_errors_to_papi(int perf_event_error) { 00336 00337 int ret; 00338 00339 /* These mappings are approximate. 00340 EINVAL in particular can mean lots of different things */ 00341 switch(perf_event_error) { 00342 case EPERM: 00343 case EACCES: 00344 ret = PAPI_EPERM; 00345 break; 00346 case ENODEV: 00347 case EOPNOTSUPP: 00348 ret = PAPI_ENOSUPP; 00349 break; 00350 case ENOENT: 00351 ret = PAPI_ENOEVNT; 00352 break; 00353 case ENOSYS: 00354 case EAGAIN: 00355 case EBUSY: 00356 case E2BIG: 00357 ret = PAPI_ESYS; 00358 break; 00359 case ENOMEM: 00360 ret = PAPI_ENOMEM; 00361 break; 00362 case EINVAL: 00363 default: 00364 ret = PAPI_EINVAL; 00365 break; 00366 } 00367 return ret; 00368 } 00369 00370 00372 /* perf_events. */ 00373 /* We do this by temporarily opening an event with the */ 00374 /* desired options then closing it again. We use the */ 00375 /* PERF_COUNT_HW_INSTRUCTION event as a dummy event */ 00376 /* on the assumption it is available on all */ 00377 /* platforms. */ 00378 00379 static int 00380 check_permissions( unsigned long tid, 00381 unsigned int cpu_num, 00382 unsigned int domain, 00383 unsigned int granularity, 00384 unsigned int multiplex, 00385 unsigned int inherit ) 00386 { 00387 int ev_fd; 00388 struct perf_event_attr attr; 00389 00390 long pid; 00391 00392 /* clearing this will set a type of hardware and to count all domains */ 00393 memset(&attr, '\0', sizeof(attr)); 00394 attr.read_format = get_read_format(multiplex, inherit, 1); 00395 00396 /* set the event id (config field) to instructios */ 00397 /* (an event that should always exist) */ 00398 /* This was cycles but that is missing on Niagara */ 00399 attr.config = PERF_COUNT_HW_INSTRUCTIONS; 00400 00401 /* now set up domains this event set will be counting */ 00402 if (!(domain & PAPI_DOM_SUPERVISOR)) { 00403 attr.exclude_hv = 1; 00404 } 00405 if (!(domain & PAPI_DOM_USER)) { 00406 attr.exclude_user = 1; 00407 } 00408 if (!(domain & PAPI_DOM_KERNEL)) { 00409 attr.exclude_kernel = 1; 00410 } 00411 00412 if (granularity==PAPI_GRN_SYS) { 00413 pid = -1; 00414 } else { 00415 pid = tid; 00416 } 00417 00418 SUBDBG("Calling sys_perf_event_open() from check_permissions\n"); 00419 00420 ev_fd = sys_perf_event_open( &attr, pid, cpu_num, -1, 0 ); 00421 if ( ev_fd == -1 ) { 00422 SUBDBG("sys_perf_event_open returned error. Linux says, %s", 00423 strerror( errno ) ); 00424 return map_perf_event_errors_to_papi(errno); 00425 } 00426 00427 /* now close it, this was just to make sure we have permissions */ 00428 /* to set these options */ 00429 close(ev_fd); 00430 return PAPI_OK; 00431 } 00432 00433 /* Maximum size we ever expect to read from a perf_event fd */ 00434 /* (this is the number of 64-bit values) */ 00435 /* We use this to size the read buffers */ 00436 /* The three is for event count, time_enabled, time_running */ 00437 /* and the counter term is count value and count id for each */ 00438 /* possible counter value. */ 00439 #define READ_BUFFER_SIZE (3 + (2 * PERF_EVENT_MAX_MPX_COUNTERS)) 00440 00441 00442 00443 /* KERNEL_CHECKS_SCHEDUABILITY_UPON_OPEN is a work-around for kernel arch */ 00444 /* implementations (e.g. x86 before 2.6.33) which don't do a static event */ 00445 /* scheduability check in sys_perf_event_open. It is also needed if the */ 00446 /* kernel is stealing an event, such as when NMI watchdog is enabled. */ 00447 00448 static int 00449 check_scheduability( pe_context_t *ctx, pe_control_t *ctl, int idx ) 00450 { 00451 int retval = 0, cnt = -1; 00452 ( void ) ctx; /*unused */ 00453 long long papi_pe_buffer[READ_BUFFER_SIZE]; 00454 int i,group_leader_fd; 00455 00456 if (bug_check_scheduability()) { 00457 00458 /* If the kernel isn't tracking scheduability right */ 00459 /* Then we need to start/stop/read to force the event */ 00460 /* to be scheduled and see if an error condition happens. */ 00461 00462 /* get the proper fd to start */ 00463 group_leader_fd=ctl->events[idx].group_leader_fd; 00464 if (group_leader_fd==-1) group_leader_fd=ctl->events[idx].event_fd; 00465 00466 /* start the event */ 00467 retval = ioctl( group_leader_fd, PERF_EVENT_IOC_ENABLE, NULL ); 00468 if (retval == -1) { 00469 PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) failed.\n"); 00470 return PAPI_ESYS; 00471 } 00472 00473 /* stop the event */ 00474 retval = ioctl(group_leader_fd, PERF_EVENT_IOC_DISABLE, NULL ); 00475 if (retval == -1) { 00476 PAPIERROR( "ioctl(PERF_EVENT_IOC_DISABLE) failed.\n" ); 00477 return PAPI_ESYS; 00478 } 00479 00480 /* See if a read returns any results */ 00481 cnt = read( group_leader_fd, papi_pe_buffer, sizeof(papi_pe_buffer)); 00482 if ( cnt == -1 ) { 00483 SUBDBG( "read returned an error! Should never happen.\n" ); 00484 return PAPI_ESYS; 00485 } 00486 00487 if ( cnt == 0 ) { 00488 /* We read 0 bytes if we could not schedule the event */ 00489 /* The kernel should have detected this at open */ 00490 /* but various bugs (including NMI watchdog) */ 00491 /* result in this behavior */ 00492 00493 return PAPI_ECNFLCT; 00494 00495 } else { 00496 00497 /* Reset all of the counters (opened so far) back to zero */ 00498 /* from the above brief enable/disable call pair. */ 00499 00500 /* We have to reset all events because reset of group leader */ 00501 /* does not reset all. */ 00502 /* we assume that the events are being added one by one and that */ 00503 /* we do not need to reset higher events (doing so may reset ones */ 00504 /* that have not been initialized yet. */ 00505 00506 /* Note... PERF_EVENT_IOC_RESET does not reset time running */ 00507 /* info if multiplexing, so we should avoid coming here if */ 00508 /* we are multiplexing the event. */ 00509 for( i = 0; i < idx; i++) { 00510 retval=ioctl( ctl->events[i].event_fd, PERF_EVENT_IOC_RESET, NULL ); 00511 if (retval == -1) { 00512 PAPIERROR( "ioctl(PERF_EVENT_IOC_RESET) #%d/%d %d " 00513 "(fd %d)failed.\n", 00514 i,ctl->num_events,idx,ctl->events[i].event_fd); 00515 return PAPI_ESYS; 00516 } 00517 } 00518 } 00519 } 00520 return PAPI_OK; 00521 } 00522 00523 00524 /* Do some extra work on a perf_event fd if we're doing sampling */ 00525 /* This mostly means setting up the mmap buffer. */ 00526 static int 00527 tune_up_fd( pe_control_t *ctl, int evt_idx ) 00528 { 00529 int ret; 00530 void *buf_addr; 00531 int fd = ctl->events[evt_idx].event_fd; 00532 00533 /* Register that we would like a SIGIO notification when a mmap'd page */ 00534 /* becomes full. */ 00535 ret = fcntl( fd, F_SETFL, O_ASYNC | O_NONBLOCK ); 00536 if ( ret ) { 00537 PAPIERROR ( "fcntl(%d, F_SETFL, O_ASYNC | O_NONBLOCK) " 00538 "returned error: %s", fd, strerror( errno ) ); 00539 return PAPI_ESYS; 00540 } 00541 00542 /* Set the F_SETOWN_EX flag on the fd. */ 00543 /* This affects which thread an overflow signal gets sent to. */ 00544 ret=fcntl_setown_fd(fd); 00545 if (ret!=PAPI_OK) return ret; 00546 00547 /* Set FD_CLOEXEC. Otherwise if we do an exec with an overflow */ 00548 /* running, the overflow handler will continue into the exec()'d*/ 00549 /* process and kill it because no signal handler is set up. */ 00550 ret=fcntl(fd, F_SETFD, FD_CLOEXEC); 00551 if (ret) { 00552 return PAPI_ESYS; 00553 } 00554 00555 /* when you explicitely declare that you want a particular signal, */ 00556 /* even with you use the default signal, the kernel will send more */ 00557 /* information concerning the event to the signal handler. */ 00558 /* */ 00559 /* In particular, it will send the file descriptor from which the */ 00560 /* event is originating which can be quite useful when monitoring */ 00561 /* multiple tasks from a single thread. */ 00562 ret = fcntl( fd, F_SETSIG, ctl->overflow_signal ); 00563 if ( ret == -1 ) { 00564 PAPIERROR( "cannot fcntl(F_SETSIG,%d) on %d: %s", 00565 ctl->overflow_signal, fd, 00566 strerror( errno ) ); 00567 return PAPI_ESYS; 00568 } 00569 00570 /* mmap() the sample buffer */ 00571 buf_addr = mmap( NULL, ctl->events[evt_idx].nr_mmap_pages * getpagesize(), 00572 PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0 ); 00573 if ( buf_addr == MAP_FAILED ) { 00574 PAPIERROR( "mmap(NULL,%d,%d,%d,%d,0): %s", 00575 ctl->events[evt_idx].nr_mmap_pages * getpagesize( ), 00576 PROT_READ, MAP_SHARED, fd, strerror( errno ) ); 00577 return ( PAPI_ESYS ); 00578 } 00579 00580 SUBDBG( "Sample buffer for fd %d is located at %p\n", fd, buf_addr ); 00581 00582 /* Set up the mmap buffer and its associated helpers */ 00583 ctl->events[evt_idx].mmap_buf = (struct perf_counter_mmap_page *) buf_addr; 00584 ctl->events[evt_idx].tail = 0; 00585 ctl->events[evt_idx].mask = ( ctl->events[evt_idx].nr_mmap_pages - 1 ) * 00586 getpagesize() - 1; 00587 00588 return PAPI_OK; 00589 } 00590 00591 00592 00593 /* Open all events in the control state */ 00594 static int 00595 open_pe_events( pe_context_t *ctx, pe_control_t *ctl ) 00596 { 00597 00598 int i, ret = PAPI_OK; 00599 long pid; 00600 00601 if (ctl->granularity==PAPI_GRN_SYS) { 00602 pid = -1; 00603 } 00604 else { 00605 pid = ctl->tid; 00606 } 00607 00608 for( i = 0; i < ctl->num_events; i++ ) { 00609 00610 ctl->events[i].event_opened=0; 00611 00612 /* set up the attr structure. We don't set up all fields here */ 00613 /* as some have already been set up previously. */ 00614 00615 /* group leader (event 0) is special */ 00616 /* If we're multiplexed, everyone is a group leader */ 00617 if (( i == 0 ) || (ctl->multiplexed)) { 00618 ctl->events[i].attr.pinned = !ctl->multiplexed; 00619 ctl->events[i].attr.disabled = 1; 00620 ctl->events[i].group_leader_fd=-1; 00621 ctl->events[i].attr.read_format = get_read_format(ctl->multiplexed, 00622 ctl->inherit, 00623 !ctl->multiplexed ); 00624 } else { 00625 ctl->events[i].attr.pinned=0; 00626 ctl->events[i].attr.disabled = 0; 00627 ctl->events[i].group_leader_fd=ctl->events[0].event_fd; 00628 ctl->events[i].attr.read_format = get_read_format(ctl->multiplexed, 00629 ctl->inherit, 00630 0 ); 00631 } 00632 00633 00634 /* try to open */ 00635 ctl->events[i].event_fd = sys_perf_event_open( &ctl->events[i].attr, 00636 pid, 00637 ctl->cpu, 00638 ctl->events[i].group_leader_fd, 00639 0 /* flags */ 00640 ); 00641 00642 /* Try to match Linux errors to PAPI errors */ 00643 if ( ctl->events[i].event_fd == -1 ) { 00644 SUBDBG("sys_perf_event_open returned error on event #%d." 00645 " Error: %s\n", 00646 i, strerror( errno ) ); 00647 ret=map_perf_event_errors_to_papi(errno); 00648 00649 goto open_pe_cleanup; 00650 } 00651 00652 SUBDBG ("sys_perf_event_open: tid: %ld, cpu_num: %d," 00653 " group_leader/fd: %d, event_fd: %d," 00654 " read_format: 0x%"PRIu64"\n", 00655 pid, ctl->cpu, ctl->events[i].group_leader_fd, 00656 ctl->events[i].event_fd, ctl->events[i].attr.read_format); 00657 00658 00659 /* in many situations the kernel will indicate we opened fine */ 00660 /* yet things will fail later. So we need to double check */ 00661 /* we actually can use the events we've set up. */ 00662 00663 /* This is not necessary if we are multiplexing, and in fact */ 00664 /* we cannot do this properly if multiplexed because */ 00665 /* PERF_EVENT_IOC_RESET does not reset the time running info */ 00666 if (!ctl->multiplexed) { 00667 ret = check_scheduability( ctx, ctl, i ); 00668 00669 if ( ret != PAPI_OK ) { 00670 /* the last event did open, so we need to bump the counter */ 00671 /* before doing the cleanup */ 00672 i++; 00673 goto open_pe_cleanup; 00674 } 00675 } 00676 ctl->events[i].event_opened=1; 00677 } 00678 00679 /* Now that we've successfully opened all of the events, do whatever */ 00680 /* "tune-up" is needed to attach the mmap'd buffers, signal handlers, */ 00681 /* and so on. */ 00682 for ( i = 0; i < ctl->num_events; i++ ) { 00683 00684 /* If sampling is enabled, hook up signal handler */ 00685 if ( ctl->events[i].attr.sample_period ) { 00686 ret = tune_up_fd( ctl, i ); 00687 if ( ret != PAPI_OK ) { 00688 /* All of the fds are open, so we need to clean up all of them */ 00689 i = ctl->num_events; 00690 goto open_pe_cleanup; 00691 } 00692 } else { 00693 /* Make sure this is NULL so close_pe_events works right */ 00694 ctl->events[i].mmap_buf = NULL; 00695 } 00696 } 00697 00698 /* Set num_evts only if completely successful */ 00699 ctx->state |= PERF_EVENTS_OPENED; 00700 00701 return PAPI_OK; 00702 00703 open_pe_cleanup: 00704 /* We encountered an error, close up the fds we successfully opened. */ 00705 /* We go backward in an attempt to close group leaders last, although */ 00706 /* That's probably not strictly necessary. */ 00707 while ( i > 0 ) { 00708 i--; 00709 if (ctl->events[i].event_fd>=0) { 00710 close( ctl->events[i].event_fd ); 00711 ctl->events[i].event_opened=0; 00712 } 00713 } 00714 00715 return ret; 00716 } 00717 00718 /* Close all of the opened events */ 00719 static int 00720 close_pe_events( pe_context_t *ctx, pe_control_t *ctl ) 00721 { 00722 int i; 00723 int num_closed=0; 00724 int events_not_opened=0; 00725 00726 /* should this be a more serious error? */ 00727 if ( ctx->state & PERF_EVENTS_RUNNING ) { 00728 SUBDBG("Closing without stopping first\n"); 00729 } 00730 00731 /* Close child events first */ 00732 for( i=0; i<ctl->num_events; i++ ) { 00733 00734 if (ctl->events[i].event_opened) { 00735 00736 if (ctl->events[i].group_leader_fd!=-1) { 00737 if ( ctl->events[i].mmap_buf ) { 00738 if ( munmap ( ctl->events[i].mmap_buf, 00739 ctl->events[i].nr_mmap_pages * getpagesize() ) ) { 00740 PAPIERROR( "munmap of fd = %d returned error: %s", 00741 ctl->events[i].event_fd, strerror( errno ) ); 00742 return PAPI_ESYS; 00743 } 00744 } 00745 00746 if ( close( ctl->events[i].event_fd ) ) { 00747 PAPIERROR( "close of fd = %d returned error: %s", 00748 ctl->events[i].event_fd, strerror( errno ) ); 00749 return PAPI_ESYS; 00750 } else { 00751 num_closed++; 00752 } 00753 ctl->events[i].event_opened=0; 00754 } 00755 } 00756 else { 00757 events_not_opened++; 00758 } 00759 } 00760 00761 /* Close the group leaders last */ 00762 for( i=0; i<ctl->num_events; i++ ) { 00763 00764 if (ctl->events[i].event_opened) { 00765 00766 if (ctl->events[i].group_leader_fd==-1) { 00767 if ( ctl->events[i].mmap_buf ) { 00768 if ( munmap ( ctl->events[i].mmap_buf, 00769 ctl->events[i].nr_mmap_pages * getpagesize() ) ) { 00770 PAPIERROR( "munmap of fd = %d returned error: %s", 00771 ctl->events[i].event_fd, strerror( errno ) ); 00772 return PAPI_ESYS; 00773 } 00774 } 00775 00776 00777 if ( close( ctl->events[i].event_fd ) ) { 00778 PAPIERROR( "close of fd = %d returned error: %s", 00779 ctl->events[i].event_fd, strerror( errno ) ); 00780 return PAPI_ESYS; 00781 } else { 00782 num_closed++; 00783 } 00784 ctl->events[i].event_opened=0; 00785 } 00786 } 00787 } 00788 00789 00790 if (ctl->num_events!=num_closed) { 00791 if (ctl->num_events!=(num_closed+events_not_opened)) { 00792 PAPIERROR("Didn't close all events: " 00793 "Closed %d Not Opened: %d Expected %d\n", 00794 num_closed,events_not_opened,ctl->num_events); 00795 return PAPI_EBUG; 00796 } 00797 } 00798 00799 ctl->num_events=0; 00800 00801 ctx->state &= ~PERF_EVENTS_OPENED; 00802 00803 return PAPI_OK; 00804 } 00805 00806 00807 /********************************************************************/ 00808 /********************************************************************/ 00809 /* Functions that are exported via the component interface */ 00810 /********************************************************************/ 00811 /********************************************************************/ 00812 00813 00814 /* set the domain. FIXME: perf_events allows per-event control of this. */ 00815 /* we do not handle that yet. */ 00816 int 00817 _pe_set_domain( hwd_control_state_t *ctl, int domain) 00818 { 00819 00820 int i; 00821 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 00822 00823 SUBDBG("old control domain %d, new domain %d\n", 00824 pe_ctl->domain,domain); 00825 00826 pe_ctl->domain = domain; 00827 00828 /* Force the domain on all events */ 00829 for( i = 0; i < pe_ctl->num_events; i++ ) { 00830 pe_ctl->events[i].attr.exclude_user = 00831 !( pe_ctl->domain & PAPI_DOM_USER ); 00832 pe_ctl->events[i].attr.exclude_kernel = 00833 !( pe_ctl->domain & PAPI_DOM_KERNEL ); 00834 pe_ctl->events[i].attr.exclude_hv = 00835 !( pe_ctl->domain & PAPI_DOM_SUPERVISOR ); 00836 } 00837 return PAPI_OK; 00838 } 00839 00840 /* Shutdown a thread */ 00841 int 00842 _pe_shutdown_thread( hwd_context_t *ctx ) 00843 { 00844 pe_context_t *pe_ctx = ( pe_context_t *) ctx; 00845 00846 pe_ctx->initialized=0; 00847 00848 return PAPI_OK; 00849 } 00850 00851 00852 /* reset the hardware counters */ 00853 /* Note: PAPI_reset() does not necessarily call this */ 00854 /* unless the events are actually running. */ 00855 int 00856 _pe_reset( hwd_context_t *ctx, hwd_control_state_t *ctl ) 00857 { 00858 int i, ret; 00859 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 00860 00861 ( void ) ctx; /*unused */ 00862 00863 /* We need to reset all of the events, not just the group leaders */ 00864 for( i = 0; i < pe_ctl->num_events; i++ ) { 00865 ret = ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_RESET, NULL ); 00866 if ( ret == -1 ) { 00867 PAPIERROR("ioctl(%d, PERF_EVENT_IOC_RESET, NULL) " 00868 "returned error, Linux says: %s", 00869 pe_ctl->events[i].event_fd, strerror( errno ) ); 00870 return PAPI_ESYS; 00871 } 00872 } 00873 00874 return PAPI_OK; 00875 } 00876 00877 00878 /* write (set) the hardware counters */ 00879 /* Current we do not support this. */ 00880 int 00881 _pe_write( hwd_context_t *ctx, hwd_control_state_t *ctl, 00882 long long *from ) 00883 { 00884 ( void ) ctx; /*unused */ 00885 ( void ) ctl; /*unused */ 00886 ( void ) from; /*unused */ 00887 /* 00888 * Counters cannot be written. Do we need to virtualize the 00889 * counters so that they can be written, or perhaps modify code so that 00890 * they can be written? FIXME ? 00891 */ 00892 00893 return PAPI_ENOSUPP; 00894 } 00895 00896 /* 00897 * perf_event provides a complicated read interface. 00898 * the info returned by read() varies depending on whether 00899 * you have PERF_FORMAT_GROUP, PERF_FORMAT_TOTAL_TIME_ENABLED, 00900 * PERF_FORMAT_TOTAL_TIME_RUNNING, or PERF_FORMAT_ID set 00901 * 00902 * To simplify things we just always ask for everything. This might 00903 * lead to overhead when reading more than we need, but it makes the 00904 * read code a lot simpler than the original implementation we had here. 00905 * 00906 * For more info on the layout see include/linux/perf_event.h 00907 * 00908 */ 00909 00910 int 00911 _pe_read( hwd_context_t *ctx, hwd_control_state_t *ctl, 00912 long long **events, int flags ) 00913 { 00914 ( void ) flags; /*unused */ 00915 int i, ret = -1; 00916 pe_context_t *pe_ctx = ( pe_context_t *) ctx; 00917 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 00918 long long papi_pe_buffer[READ_BUFFER_SIZE]; 00919 long long tot_time_running, tot_time_enabled, scale; 00920 00921 /* On kernels before 2.6.33 the TOTAL_TIME_ENABLED and TOTAL_TIME_RUNNING */ 00922 /* fields are always 0 unless the counter is disabled. So if we are on */ 00923 /* one of these kernels, then we must disable events before reading. */ 00924 00925 /* Elsewhere though we disable multiplexing on kernels before 2.6.34 */ 00926 /* so maybe this isn't even necessary. */ 00927 00928 if (bug_sync_read()) { 00929 if ( pe_ctx->state & PERF_EVENTS_RUNNING ) { 00930 for ( i = 0; i < pe_ctl->num_events; i++ ) { 00931 /* disable only the group leaders */ 00932 if ( pe_ctl->events[i].group_leader_fd == -1 ) { 00933 ret = ioctl( pe_ctl->events[i].event_fd, 00934 PERF_EVENT_IOC_DISABLE, NULL ); 00935 if ( ret == -1 ) { 00936 PAPIERROR("ioctl(PERF_EVENT_IOC_DISABLE) " 00937 "returned an error: ", strerror( errno )); 00938 return PAPI_ESYS; 00939 } 00940 } 00941 } 00942 } 00943 } 00944 00945 00946 /* Handle case where we are multiplexing */ 00947 if (pe_ctl->multiplexed) { 00948 00949 /* currently we handle multiplexing by having individual events */ 00950 /* so we read from each in turn. */ 00951 00952 for ( i = 0; i < pe_ctl->num_events; i++ ) { 00953 00954 ret = read( pe_ctl->events[i].event_fd, papi_pe_buffer, 00955 sizeof ( papi_pe_buffer ) ); 00956 if ( ret == -1 ) { 00957 PAPIERROR("read returned an error: ", strerror( errno )); 00958 return PAPI_ESYS; 00959 } 00960 00961 /* We should read 3 64-bit values from the counter */ 00962 if (ret<(signed)(3*sizeof(long long))) { 00963 PAPIERROR("Error! short read!\n"); 00964 return PAPI_ESYS; 00965 } 00966 00967 SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", 00968 pe_ctl->events[i].event_fd, 00969 (long)pe_ctl->tid, pe_ctl->cpu, ret); 00970 SUBDBG("read: %lld %lld %lld\n",papi_pe_buffer[0], 00971 papi_pe_buffer[1],papi_pe_buffer[2]); 00972 00973 tot_time_enabled = papi_pe_buffer[1]; 00974 tot_time_running = papi_pe_buffer[2]; 00975 00976 SUBDBG("count[%d] = (papi_pe_buffer[%d] %lld * " 00977 "tot_time_enabled %lld) / tot_time_running %lld\n", 00978 i, 0,papi_pe_buffer[0], 00979 tot_time_enabled,tot_time_running); 00980 00981 if (tot_time_running == tot_time_enabled) { 00982 /* No scaling needed */ 00983 pe_ctl->counts[i] = papi_pe_buffer[0]; 00984 } else if (tot_time_running && tot_time_enabled) { 00985 /* Scale factor of 100 to avoid overflows when computing */ 00986 /*enabled/running */ 00987 00988 scale = (tot_time_enabled * 100LL) / tot_time_running; 00989 scale = scale * papi_pe_buffer[0]; 00990 scale = scale / 100LL; 00991 pe_ctl->counts[i] = scale; 00992 } else { 00993 /* This should not happen, but Phil reports it sometime does. */ 00994 SUBDBG("perf_event kernel bug(?) count, enabled, " 00995 "running: %lld, %lld, %lld\n", 00996 papi_pe_buffer[0],tot_time_enabled, 00997 tot_time_running); 00998 00999 pe_ctl->counts[i] = papi_pe_buffer[0]; 01000 } 01001 } 01002 } 01003 01004 /* Handle cases where we cannot use FORMAT GROUP */ 01005 else if (bug_format_group() || pe_ctl->inherit) { 01006 01007 /* we must read each counter individually */ 01008 for ( i = 0; i < pe_ctl->num_events; i++ ) { 01009 01010 ret = read( pe_ctl->events[i].event_fd, papi_pe_buffer, 01011 sizeof ( papi_pe_buffer ) ); 01012 if ( ret == -1 ) { 01013 PAPIERROR("read returned an error: ", strerror( errno )); 01014 return PAPI_ESYS; 01015 } 01016 01017 /* we should read one 64-bit value from each counter */ 01018 if (ret!=sizeof(long long)) { 01019 PAPIERROR("Error! short read!\n"); 01020 PAPIERROR("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", 01021 pe_ctl->events[i].event_fd, 01022 (long)pe_ctl->tid, pe_ctl->cpu, ret); 01023 return PAPI_ESYS; 01024 } 01025 01026 SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", 01027 pe_ctl->events[i].event_fd, (long)pe_ctl->tid, 01028 pe_ctl->cpu, ret); 01029 SUBDBG("read: %lld\n",papi_pe_buffer[0]); 01030 01031 pe_ctl->counts[i] = papi_pe_buffer[0]; 01032 } 01033 } 01034 01035 01036 /* Handle cases where we are using FORMAT_GROUP */ 01037 /* We assume only one group leader, in position 0 */ 01038 01039 else { 01040 if (pe_ctl->events[0].group_leader_fd!=-1) { 01041 PAPIERROR("Was expecting group leader!\n"); 01042 } 01043 01044 ret = read( pe_ctl->events[0].event_fd, papi_pe_buffer, 01045 sizeof ( papi_pe_buffer ) ); 01046 01047 if ( ret == -1 ) { 01048 PAPIERROR("read returned an error: ", strerror( errno )); 01049 return PAPI_ESYS; 01050 } 01051 01052 /* we read 1 64-bit value (number of events) then */ 01053 /* num_events more 64-bit values that hold the counts */ 01054 if (ret<(signed)((1+pe_ctl->num_events)*sizeof(long long))) { 01055 PAPIERROR("Error! short read!\n"); 01056 return PAPI_ESYS; 01057 } 01058 01059 SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", 01060 pe_ctl->events[0].event_fd, 01061 (long)pe_ctl->tid, pe_ctl->cpu, ret); 01062 { 01063 int j; 01064 for(j=0;j<ret/8;j++) { 01065 SUBDBG("read %d: %lld\n",j,papi_pe_buffer[j]); 01066 } 01067 } 01068 01069 /* Make sure the kernel agrees with how many events we have */ 01070 if (papi_pe_buffer[0]!=pe_ctl->num_events) { 01071 PAPIERROR("Error! Wrong number of events!\n"); 01072 return PAPI_ESYS; 01073 } 01074 01075 /* put the count values in their proper location */ 01076 for(i=0;i<papi_pe_buffer[0];i++) { 01077 pe_ctl->counts[i] = papi_pe_buffer[1+i]; 01078 } 01079 } 01080 01081 01082 /* If we disabled the counters due to the sync_read_bug(), */ 01083 /* then we need to re-enable them now. */ 01084 if (bug_sync_read()) { 01085 if ( pe_ctx->state & PERF_EVENTS_RUNNING ) { 01086 for ( i = 0; i < pe_ctl->num_events; i++ ) { 01087 if ( pe_ctl->events[i].group_leader_fd == -1 ) { 01088 /* this should refresh any overflow counters too */ 01089 ret = ioctl( pe_ctl->events[i].event_fd, 01090 PERF_EVENT_IOC_ENABLE, NULL ); 01091 if ( ret == -1 ) { 01092 /* Should never happen */ 01093 PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) returned an error: ", 01094 strerror( errno )); 01095 return PAPI_ESYS; 01096 } 01097 } 01098 } 01099 } 01100 } 01101 01102 /* point PAPI to the values we read */ 01103 *events = pe_ctl->counts; 01104 01105 return PAPI_OK; 01106 } 01107 01108 /* Start counting events */ 01109 int 01110 _pe_start( hwd_context_t *ctx, hwd_control_state_t *ctl ) 01111 { 01112 int ret; 01113 int i; 01114 int did_something = 0; 01115 pe_context_t *pe_ctx = ( pe_context_t *) ctx; 01116 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 01117 01118 /* Reset the counters first. Is this necessary? */ 01119 ret = _pe_reset( pe_ctx, pe_ctl ); 01120 if ( ret ) { 01121 return ret; 01122 } 01123 01124 /* Enable all of the group leaders */ 01125 /* All group leaders have a group_leader_fd of -1 */ 01126 for( i = 0; i < pe_ctl->num_events; i++ ) { 01127 if (pe_ctl->events[i].group_leader_fd == -1) { 01128 SUBDBG("ioctl(enable): fd: %d\n", pe_ctl->events[i].event_fd); 01129 ret=ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_ENABLE, NULL) ; 01130 01131 /* ioctls always return -1 on failure */ 01132 if (ret == -1) { 01133 PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) failed.\n"); 01134 return PAPI_ESYS; 01135 } 01136 01137 did_something++; 01138 } 01139 } 01140 01141 if (!did_something) { 01142 PAPIERROR("Did not enable any counters.\n"); 01143 return PAPI_EBUG; 01144 } 01145 01146 pe_ctx->state |= PERF_EVENTS_RUNNING; 01147 01148 return PAPI_OK; 01149 01150 } 01151 01152 /* Stop all of the counters */ 01153 int 01154 _pe_stop( hwd_context_t *ctx, hwd_control_state_t *ctl ) 01155 { 01156 01157 int ret; 01158 int i; 01159 pe_context_t *pe_ctx = ( pe_context_t *) ctx; 01160 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 01161 01162 /* Just disable the group leaders */ 01163 for ( i = 0; i < pe_ctl->num_events; i++ ) { 01164 if ( pe_ctl->events[i].group_leader_fd == -1 ) { 01165 ret=ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_DISABLE, NULL); 01166 if ( ret == -1 ) { 01167 PAPIERROR( "ioctl(%d, PERF_EVENT_IOC_DISABLE, NULL) " 01168 "returned error, Linux says: %s", 01169 pe_ctl->events[i].event_fd, strerror( errno ) ); 01170 return PAPI_EBUG; 01171 } 01172 } 01173 } 01174 01175 pe_ctx->state &= ~PERF_EVENTS_RUNNING; 01176 01177 return PAPI_OK; 01178 } 01179 01180 /* This function clears the current contents of the control structure and 01181 updates it with whatever resources are allocated for all the native events 01182 in the native info structure array. */ 01183 01184 int 01185 _pe_update_control_state( hwd_control_state_t *ctl, 01186 NativeInfo_t *native, 01187 int count, hwd_context_t *ctx ) 01188 { 01189 int i = 0, ret; 01190 pe_context_t *pe_ctx = ( pe_context_t *) ctx; 01191 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 01192 01193 /* close all of the existing fds and start over again */ 01194 /* In theory we could have finer-grained control and know if */ 01195 /* things were changed, but it's easier to tear things down and rebuild. */ 01196 close_pe_events( pe_ctx, pe_ctl ); 01197 01198 /* Calling with count==0 should be OK, it's how things are deallocated */ 01199 /* when an eventset is destroyed. */ 01200 if ( count == 0 ) { 01201 SUBDBG( "Called with count == 0\n" ); 01202 return PAPI_OK; 01203 } 01204 01205 /* set up all the events */ 01206 for( i = 0; i < count; i++ ) { 01207 if ( native ) { 01208 /* Have libpfm4 set the config values for the event */ 01209 ret=_pe_libpfm4_setup_counters(&pe_ctl->events[i].attr, 01210 native[i].ni_event, 01211 pe_ctx->event_table); 01212 SUBDBG( "pe_ctl->eventss[%d].config=%"PRIx64"\n",i, 01213 pe_ctl->events[i].attr.config); 01214 if (ret!=PAPI_OK) return ret; 01215 01216 } else { 01217 /* I'm not sure how we'd end up in this case */ 01218 /* should it be an error? */ 01219 } 01220 01221 /* Copy the inherit flag into the attribute block that will be */ 01222 /* passed to the kernel */ 01223 pe_ctl->events[i].attr.inherit = pe_ctl->inherit; 01224 01225 /* Set the position in the native structure */ 01226 /* We just set up events linearly */ 01227 if ( native ) { 01228 native[i].ni_position = i; 01229 } 01230 } 01231 01232 pe_ctl->num_events = count; 01233 _pe_set_domain( ctl, pe_ctl->domain ); 01234 01235 /* actuall open the events */ 01236 /* (why is this a separate function?) */ 01237 ret = open_pe_events( pe_ctx, pe_ctl ); 01238 if ( ret != PAPI_OK ) { 01239 SUBDBG("open_pe_events failed\n"); 01240 /* Restore values ? */ 01241 return ret; 01242 } 01243 01244 return PAPI_OK; 01245 } 01246 01247 /* Set various options on a control state */ 01248 int 01249 _pe_ctl( hwd_context_t *ctx, int code, _papi_int_option_t *option ) 01250 { 01251 int ret; 01252 pe_context_t *pe_ctx = ( pe_context_t *) ctx; 01253 pe_control_t *pe_ctl = NULL; 01254 01255 switch ( code ) { 01256 case PAPI_MULTIPLEX: 01257 pe_ctl = ( pe_control_t * ) ( option->multiplex.ESI->ctl_state ); 01258 ret = check_permissions( pe_ctl->tid, pe_ctl->cpu, pe_ctl->domain, 01259 pe_ctl->granularity, 01260 1, pe_ctl->inherit ); 01261 if (ret != PAPI_OK) { 01262 return ret; 01263 } 01264 01265 /* looks like we are allowed, so set multiplexed attribute */ 01266 pe_ctl->multiplexed = 1; 01267 ret = _pe_update_control_state( pe_ctl, NULL, 01268 pe_ctl->num_events, pe_ctx ); 01269 if (ret != PAPI_OK) { 01270 pe_ctl->multiplexed = 0; 01271 } 01272 return ret; 01273 01274 case PAPI_ATTACH: 01275 pe_ctl = ( pe_control_t * ) ( option->attach.ESI->ctl_state ); 01276 ret = check_permissions( option->attach.tid, pe_ctl->cpu, 01277 pe_ctl->domain, pe_ctl->granularity, 01278 pe_ctl->multiplexed, 01279 pe_ctl->inherit ); 01280 if (ret != PAPI_OK) { 01281 return ret; 01282 } 01283 01284 pe_ctl->tid = option->attach.tid; 01285 01286 /* If events have been already been added, something may */ 01287 /* have been done to the kernel, so update */ 01288 ret =_pe_update_control_state( pe_ctl, NULL, 01289 pe_ctl->num_events, pe_ctx); 01290 01291 return ret; 01292 01293 case PAPI_DETACH: 01294 pe_ctl = ( pe_control_t *) ( option->attach.ESI->ctl_state ); 01295 01296 pe_ctl->tid = 0; 01297 return PAPI_OK; 01298 01299 case PAPI_CPU_ATTACH: 01300 pe_ctl = ( pe_control_t *) ( option->cpu.ESI->ctl_state ); 01301 ret = check_permissions( pe_ctl->tid, option->cpu.cpu_num, 01302 pe_ctl->domain, pe_ctl->granularity, 01303 pe_ctl->multiplexed, 01304 pe_ctl->inherit ); 01305 if (ret != PAPI_OK) { 01306 return ret; 01307 } 01308 /* looks like we are allowed so set cpu number */ 01309 01310 /* this tells the kernel not to count for a thread */ 01311 /* should we warn if we try to set both? perf_event */ 01312 /* will reject it. */ 01313 pe_ctl->tid = -1; 01314 01315 pe_ctl->cpu = option->cpu.cpu_num; 01316 01317 return PAPI_OK; 01318 01319 case PAPI_DOMAIN: 01320 pe_ctl = ( pe_control_t *) ( option->domain.ESI->ctl_state ); 01321 ret = check_permissions( pe_ctl->tid, pe_ctl->cpu, 01322 option->domain.domain, 01323 pe_ctl->granularity, 01324 pe_ctl->multiplexed, 01325 pe_ctl->inherit ); 01326 if (ret != PAPI_OK) { 01327 return ret; 01328 } 01329 /* looks like we are allowed, so set counting domain */ 01330 return _pe_set_domain( pe_ctl, option->domain.domain ); 01331 01332 case PAPI_GRANUL: 01333 pe_ctl = (pe_control_t *) ( option->granularity.ESI->ctl_state ); 01334 01335 /* FIXME: we really don't support this yet */ 01336 01337 switch ( option->granularity.granularity ) { 01338 case PAPI_GRN_PROCG: 01339 case PAPI_GRN_SYS_CPU: 01340 case PAPI_GRN_PROC: 01341 return PAPI_ECMP; 01342 01343 /* Currently we only support thread and CPU granularity */ 01344 case PAPI_GRN_SYS: 01345 pe_ctl->granularity=PAPI_GRN_SYS; 01346 break; 01347 01348 case PAPI_GRN_THR: 01349 pe_ctl->granularity=PAPI_GRN_THR; 01350 break; 01351 01352 01353 default: 01354 return PAPI_EINVAL; 01355 } 01356 return PAPI_OK; 01357 01358 case PAPI_INHERIT: 01359 pe_ctl = (pe_control_t *) ( option->inherit.ESI->ctl_state ); 01360 ret = check_permissions( pe_ctl->tid, pe_ctl->cpu, pe_ctl->domain, 01361 pe_ctl->granularity, pe_ctl->multiplexed, 01362 option->inherit.inherit ); 01363 if (ret != PAPI_OK) { 01364 return ret; 01365 } 01366 /* looks like we are allowed, so set the requested inheritance */ 01367 if (option->inherit.inherit) { 01368 /* children will inherit counters */ 01369 pe_ctl->inherit = 1; 01370 } else { 01371 /* children won't inherit counters */ 01372 pe_ctl->inherit = 0; 01373 } 01374 return PAPI_OK; 01375 01376 case PAPI_DATA_ADDRESS: 01377 return PAPI_ENOSUPP; 01378 #if 0 01379 pe_ctl = (pe_control_t *) (option->address_range.ESI->ctl_state); 01380 ret = set_default_domain( pe_ctl, option->address_range.domain ); 01381 if ( ret != PAPI_OK ) { 01382 return ret; 01383 } 01384 set_drange( pe_ctx, pe_ctl, option ); 01385 return PAPI_OK; 01386 #endif 01387 case PAPI_INSTR_ADDRESS: 01388 return PAPI_ENOSUPP; 01389 #if 0 01390 pe_ctl = (pe_control_t *) (option->address_range.ESI->ctl_state); 01391 ret = set_default_domain( pe_ctl, option->address_range.domain ); 01392 if ( ret != PAPI_OK ) { 01393 return ret; 01394 } 01395 set_irange( pe_ctx, pe_ctl, option ); 01396 return PAPI_OK; 01397 #endif 01398 01399 case PAPI_DEF_ITIMER: 01400 /* What should we be checking for here? */ 01401 /* This seems like it should be OS-specific not component */ 01402 /* specific. */ 01403 01404 return PAPI_OK; 01405 01406 case PAPI_DEF_MPX_NS: 01407 /* Defining a given ns per set is not current supported */ 01408 return PAPI_ENOSUPP; 01409 01410 case PAPI_DEF_ITIMER_NS: 01411 /* We don't support this... */ 01412 return PAPI_OK; 01413 01414 default: 01415 return PAPI_ENOSUPP; 01416 } 01417 } 01418 01419 /* Initialize a thread */ 01420 int 01421 _pe_init_thread( hwd_context_t *hwd_ctx ) 01422 { 01423 01424 pe_context_t *pe_ctx = ( pe_context_t *) hwd_ctx; 01425 01426 /* clear the context structure and mark as initialized */ 01427 memset( pe_ctx, 0, sizeof ( pe_context_t ) ); 01428 pe_ctx->initialized=1; 01429 pe_ctx->event_table=&perf_native_event_table; 01430 pe_ctx->cidx=our_cidx; 01431 01432 return PAPI_OK; 01433 } 01434 01435 /* Initialize a new control state */ 01436 int 01437 _pe_init_control_state( hwd_control_state_t *ctl ) 01438 { 01439 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 01440 01441 /* clear the contents */ 01442 memset( pe_ctl, 0, sizeof ( pe_control_t ) ); 01443 01444 /* Set the domain */ 01445 _pe_set_domain( ctl, _perf_event_vector.cmp_info.default_domain ); 01446 01447 /* default granularity */ 01448 pe_ctl->granularity= _perf_event_vector.cmp_info.default_granularity; 01449 01450 /* overflow signal */ 01451 pe_ctl->overflow_signal=_perf_event_vector.cmp_info.hardware_intr_sig; 01452 01453 pe_ctl->cidx=our_cidx; 01454 01455 /* Set cpu number in the control block to show events */ 01456 /* are not tied to specific cpu */ 01457 pe_ctl->cpu = -1; 01458 return PAPI_OK; 01459 } 01460 01461 /* Check the mmap page for rdpmc support */ 01462 static int _pe_detect_rdpmc(int default_domain) { 01463 01464 struct perf_event_attr pe; 01465 int fd,rdpmc_exists=1; 01466 void *addr; 01467 struct perf_event_mmap_page *our_mmap; 01468 01469 /* Create a fake instructions event so we can read a mmap page */ 01470 memset(&pe,0,sizeof(struct perf_event_attr)); 01471 01472 pe.type=PERF_TYPE_HARDWARE; 01473 pe.size=sizeof(struct perf_event_attr); 01474 pe.config=PERF_COUNT_HW_INSTRUCTIONS; 01475 01476 /* There should probably be a helper function to handle this */ 01477 /* we break on some ARM because there is no support for excluding */ 01478 /* kernel. */ 01479 if (default_domain & PAPI_DOM_KERNEL ) { 01480 } 01481 else { 01482 pe.exclude_kernel=1; 01483 } 01484 fd=sys_perf_event_open(&pe,0,-1,-1,0); 01485 if (fd<0) { 01486 return PAPI_ESYS; 01487 } 01488 01489 /* create the mmap page */ 01490 addr=mmap(NULL, 4096, PROT_READ, MAP_SHARED,fd,0); 01491 if (addr == (void *)(-1)) { 01492 close(fd); 01493 return PAPI_ESYS; 01494 } 01495 01496 /* get the rdpmc info */ 01497 our_mmap=(struct perf_event_mmap_page *)addr; 01498 if (our_mmap->cap_usr_rdpmc==0) { 01499 rdpmc_exists=0; 01500 } 01501 01502 /* close the fake event */ 01503 munmap(addr,4096); 01504 close(fd); 01505 01506 return rdpmc_exists; 01507 01508 } 01509 01510 01511 /* Initialize the perf_event component */ 01512 int 01513 _pe_init_component( int cidx ) 01514 { 01515 01516 int retval; 01517 int paranoid_level; 01518 01519 FILE *fff; 01520 01521 our_cidx=cidx; 01522 01523 /* The is the official way to detect if perf_event support exists */ 01524 /* The file is called perf_counter_paranoid on 2.6.31 */ 01525 /* currently we are lazy and do not support 2.6.31 kernels */ 01526 fff=fopen("/proc/sys/kernel/perf_event_paranoid","r"); 01527 if (fff==NULL) { 01528 strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason, 01529 "perf_event support not detected",PAPI_MAX_STR_LEN); 01530 return PAPI_ENOCMP; 01531 } 01532 01533 /* 2 means no kernel measurements allowed */ 01534 /* 1 means normal counter access */ 01535 /* 0 means you can access CPU-specific data */ 01536 /* -1 means no restrictions */ 01537 retval=fscanf(fff,"%d",¶noid_level); 01538 if (retval!=1) fprintf(stderr,"Error reading paranoid level\n"); 01539 fclose(fff); 01540 01541 if ((paranoid_level==2) && (getuid()!=0)) { 01542 SUBDBG("/proc/sys/kernel/perf_event_paranoid prohibits kernel counts"); 01543 _papi_hwd[cidx]->cmp_info.available_domains &=~PAPI_DOM_KERNEL; 01544 } 01545 01546 /* Detect NMI watchdog which can steal counters */ 01547 nmi_watchdog_active=_linux_detect_nmi_watchdog(); 01548 if (nmi_watchdog_active) { 01549 SUBDBG("The Linux nmi_watchdog is using one of the performance " 01550 "counters, reducing the total number available.\n"); 01551 } 01552 /* Kernel multiplexing is broken prior to kernel 2.6.34 */ 01553 /* The fix was probably git commit: */ 01554 /* 45e16a6834b6af098702e5ea6c9a40de42ff77d8 */ 01555 if (_papi_os_info.os_version < LINUX_VERSION(2,6,34)) { 01556 _papi_hwd[cidx]->cmp_info.kernel_multiplex = 0; 01557 _papi_hwd[cidx]->cmp_info.num_mpx_cntrs = PAPI_MAX_SW_MPX_EVENTS; 01558 } 01559 else { 01560 _papi_hwd[cidx]->cmp_info.kernel_multiplex = 1; 01561 _papi_hwd[cidx]->cmp_info.num_mpx_cntrs = PERF_EVENT_MAX_MPX_COUNTERS; 01562 } 01563 01564 /* Check that processor is supported */ 01565 if (processor_supported(_papi_hwi_system_info.hw_info.vendor, 01566 _papi_hwi_system_info.hw_info.cpuid_family)!= 01567 PAPI_OK) { 01568 fprintf(stderr,"warning, your processor is unsupported\n"); 01569 /* should not return error, as software events should still work */ 01570 } 01571 01572 /* Setup mmtimers, if appropriate */ 01573 retval=mmtimer_setup(); 01574 if (retval) { 01575 strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason, 01576 "Error initializing mmtimer",PAPI_MAX_STR_LEN); 01577 return retval; 01578 } 01579 01580 /* Set the overflow signal */ 01581 _papi_hwd[cidx]->cmp_info.hardware_intr_sig = SIGRTMIN + 2; 01582 01583 /* Run Vendor-specific fixups */ 01584 pe_vendor_fixups(_papi_hwd[cidx]); 01585 01586 /* Detect if we can use rdpmc (or equivalent) */ 01587 /* We currently do not use rdpmc as it is slower in tests */ 01588 /* than regular read (as of Linux 3.5) */ 01589 retval=_pe_detect_rdpmc(_papi_hwd[cidx]->cmp_info.default_domain); 01590 if (retval < 0 ) { 01591 strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason, 01592 "sys_perf_event_open() failed, perf_event support for this platform may be broken",PAPI_MAX_STR_LEN); 01593 01594 return retval; 01595 } 01596 _papi_hwd[cidx]->cmp_info.fast_counter_read = retval; 01597 01598 /* Run the libpfm4-specific setup */ 01599 retval = _papi_libpfm4_init(_papi_hwd[cidx]); 01600 if (retval) { 01601 strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason, 01602 "Error initializing libpfm4",PAPI_MAX_STR_LEN); 01603 return retval; 01604 } 01605 01606 retval = _pe_libpfm4_init(_papi_hwd[cidx], cidx, 01607 &perf_native_event_table, 01608 PMU_TYPE_CORE | PMU_TYPE_OS); 01609 if (retval) { 01610 strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason, 01611 "Error initializing libpfm4",PAPI_MAX_STR_LEN); 01612 return retval; 01613 } 01614 01615 return PAPI_OK; 01616 01617 } 01618 01619 /* Shutdown the perf_event component */ 01620 int 01621 _pe_shutdown_component( void ) { 01622 01623 /* deallocate our event table */ 01624 _pe_libpfm4_shutdown(&perf_native_event_table); 01625 01626 /* Shutdown libpfm4 */ 01627 _papi_libpfm4_shutdown(); 01628 01629 return PAPI_OK; 01630 } 01631 01632 01633 01634 01635 int 01636 _pe_ntv_enum_events( unsigned int *PapiEventCode, int modifier ) 01637 { 01638 return _pe_libpfm4_ntv_enum_events(PapiEventCode, modifier, 01639 &perf_native_event_table); 01640 } 01641 01642 int 01643 _pe_ntv_name_to_code( char *name, unsigned int *event_code) { 01644 return _pe_libpfm4_ntv_name_to_code(name,event_code, 01645 &perf_native_event_table); 01646 } 01647 01648 int 01649 _pe_ntv_code_to_name(unsigned int EventCode, 01650 char *ntv_name, int len) { 01651 return _pe_libpfm4_ntv_code_to_name(EventCode, 01652 ntv_name, len, 01653 &perf_native_event_table); 01654 } 01655 01656 int 01657 _pe_ntv_code_to_descr( unsigned int EventCode, 01658 char *ntv_descr, int len) { 01659 01660 return _pe_libpfm4_ntv_code_to_descr(EventCode,ntv_descr,len, 01661 &perf_native_event_table); 01662 } 01663 01664 int 01665 _pe_ntv_code_to_info(unsigned int EventCode, 01666 PAPI_event_info_t *info) { 01667 01668 return _pe_libpfm4_ntv_code_to_info(EventCode, info, 01669 &perf_native_event_table); 01670 } 01671 01672 /* These functions are based on builtin-record.c in the */ 01673 /* kernel's tools/perf directory. */ 01674 01675 static uint64_t 01676 mmap_read_head( pe_event_info_t *pe ) 01677 { 01678 struct perf_event_mmap_page *pc = pe->mmap_buf; 01679 int head; 01680 01681 if ( pc == NULL ) { 01682 PAPIERROR( "perf_event_mmap_page is NULL" ); 01683 return 0; 01684 } 01685 01686 head = pc->data_head; 01687 rmb( ); 01688 01689 return head; 01690 } 01691 01692 static void 01693 mmap_write_tail( pe_event_info_t *pe, uint64_t tail ) 01694 { 01695 struct perf_event_mmap_page *pc = pe->mmap_buf; 01696 01697 /* ensure all reads are done before we write the tail out. */ 01698 pc->data_tail = tail; 01699 } 01700 01701 01702 /* Does the kernel define these somewhere? */ 01703 struct ip_event { 01704 struct perf_event_header header; 01705 uint64_t ip; 01706 }; 01707 struct lost_event { 01708 struct perf_event_header header; 01709 uint64_t id; 01710 uint64_t lost; 01711 }; 01712 typedef union event_union { 01713 struct perf_event_header header; 01714 struct ip_event ip; 01715 struct lost_event lost; 01716 } perf_sample_event_t; 01717 01718 /* Should re-write with comments if we ever figure out what's */ 01719 /* going on here. */ 01720 static void 01721 mmap_read( int cidx, ThreadInfo_t **thr, pe_event_info_t *pe, 01722 int profile_index ) 01723 { 01724 uint64_t head = mmap_read_head( pe ); 01725 uint64_t old = pe->tail; 01726 unsigned char *data = ((unsigned char*)pe->mmap_buf) + getpagesize( ); 01727 int diff; 01728 01729 diff = head - old; 01730 if ( diff < 0 ) { 01731 SUBDBG( "WARNING: failed to keep up with mmap data. head = %" PRIu64 01732 ", tail = %" PRIu64 ". Discarding samples.\n", head, old ); 01733 /* head points to a known good entry, start there. */ 01734 old = head; 01735 } 01736 01737 for( ; old != head; ) { 01738 perf_sample_event_t *event = ( perf_sample_event_t * ) 01739 & data[old & pe->mask]; 01740 perf_sample_event_t event_copy; 01741 size_t size = event->header.size; 01742 01743 /* Event straddles the mmap boundary -- header should always */ 01744 /* be inside due to u64 alignment of output. */ 01745 if ( ( old & pe->mask ) + size != ( ( old + size ) & pe->mask ) ) { 01746 uint64_t offset = old; 01747 uint64_t len = min( sizeof ( *event ), size ), cpy; 01748 void *dst = &event_copy; 01749 01750 do { 01751 cpy = min( pe->mask + 1 - ( offset & pe->mask ), len ); 01752 memcpy( dst, &data[offset & pe->mask], cpy ); 01753 offset += cpy; 01754 dst = ((unsigned char*)dst) + cpy; 01755 len -= cpy; 01756 } while ( len ); 01757 01758 event = &event_copy; 01759 } 01760 old += size; 01761 01762 SUBDBG( "event->type = %08x\n", event->header.type ); 01763 SUBDBG( "event->size = %d\n", event->header.size ); 01764 01765 switch ( event->header.type ) { 01766 case PERF_RECORD_SAMPLE: 01767 _papi_hwi_dispatch_profile( ( *thr )->running_eventset[cidx], 01768 ( caddr_t ) ( unsigned long ) event->ip.ip, 01769 0, profile_index ); 01770 break; 01771 01772 case PERF_RECORD_LOST: 01773 SUBDBG( "Warning: because of a mmap buffer overrun, %" PRId64 01774 " events were lost.\n" 01775 "Loss was recorded when counter id 0x%"PRIx64 01776 " overflowed.\n", event->lost.lost, event->lost.id ); 01777 break; 01778 01779 default: 01780 SUBDBG( "Error: unexpected header type - %d\n", 01781 event->header.type ); 01782 break; 01783 } 01784 } 01785 01786 pe->tail = old; 01787 mmap_write_tail( pe, old ); 01788 } 01789 01790 /* Find a native event specified by a profile index */ 01791 static int 01792 find_profile_index( EventSetInfo_t *ESI, int evt_idx, int *flags, 01793 unsigned int *native_index, int *profile_index ) 01794 { 01795 int pos, esi_index, count; 01796 01797 for ( count = 0; count < ESI->profile.event_counter; count++ ) { 01798 esi_index = ESI->profile.EventIndex[count]; 01799 pos = ESI->EventInfoArray[esi_index].pos[0]; 01800 01801 if ( pos == evt_idx ) { 01802 *profile_index = count; 01803 *native_index = ESI->NativeInfoArray[pos].ni_event & 01804 PAPI_NATIVE_AND_MASK; 01805 *flags = ESI->profile.flags; 01806 SUBDBG( "Native event %d is at profile index %d, flags %d\n", 01807 *native_index, *profile_index, *flags ); 01808 return PAPI_OK; 01809 } 01810 } 01811 PAPIERROR( "wrong count: %d vs. ESI->profile.event_counter %d", count, 01812 ESI->profile.event_counter ); 01813 return PAPI_EBUG; 01814 } 01815 01816 01817 01818 /* What exactly does this do? */ 01819 static int 01820 process_smpl_buf( int evt_idx, ThreadInfo_t **thr, int cidx ) 01821 { 01822 int ret, flags, profile_index; 01823 unsigned native_index; 01824 pe_control_t *ctl; 01825 01826 ret = find_profile_index( ( *thr )->running_eventset[cidx], evt_idx, 01827 &flags, &native_index, &profile_index ); 01828 if ( ret != PAPI_OK ) { 01829 return ret; 01830 } 01831 01832 ctl= (*thr)->running_eventset[cidx]->ctl_state; 01833 01834 mmap_read( cidx, thr, 01835 &(ctl->events[evt_idx]), 01836 profile_index ); 01837 01838 return PAPI_OK; 01839 } 01840 01841 /* 01842 * This function is used when hardware overflows are working or when 01843 * software overflows are forced 01844 */ 01845 01846 void 01847 _pe_dispatch_timer( int n, hwd_siginfo_t *info, void *uc) 01848 { 01849 ( void ) n; /*unused */ 01850 _papi_hwi_context_t hw_context; 01851 int found_evt_idx = -1, fd = info->si_fd; 01852 caddr_t address; 01853 ThreadInfo_t *thread = _papi_hwi_lookup_thread( 0 ); 01854 int i; 01855 pe_control_t *ctl; 01856 int cidx = _perf_event_vector.cmp_info.CmpIdx; 01857 01858 if ( thread == NULL ) { 01859 PAPIERROR( "thread == NULL in _papi_pe_dispatch_timer for fd %d!", fd ); 01860 return; 01861 } 01862 01863 if ( thread->running_eventset[cidx] == NULL ) { 01864 PAPIERROR( "thread->running_eventset == NULL in " 01865 "_papi_pe_dispatch_timer for fd %d!",fd ); 01866 return; 01867 } 01868 01869 if ( thread->running_eventset[cidx]->overflow.flags == 0 ) { 01870 PAPIERROR( "thread->running_eventset->overflow.flags == 0 in " 01871 "_papi_pe_dispatch_timer for fd %d!", fd ); 01872 return; 01873 } 01874 01875 hw_context.si = info; 01876 hw_context.ucontext = ( hwd_ucontext_t * ) uc; 01877 01878 if ( thread->running_eventset[cidx]->overflow.flags & 01879 PAPI_OVERFLOW_FORCE_SW ) { 01880 address = GET_OVERFLOW_ADDRESS( hw_context ); 01881 _papi_hwi_dispatch_overflow_signal( ( void * ) &hw_context, 01882 address, NULL, 0, 01883 0, &thread, cidx ); 01884 return; 01885 } 01886 01887 if ( thread->running_eventset[cidx]->overflow.flags != 01888 PAPI_OVERFLOW_HARDWARE ) { 01889 PAPIERROR( "thread->running_eventset->overflow.flags is set to " 01890 "something other than PAPI_OVERFLOW_HARDWARE or " 01891 "PAPI_OVERFLOW_FORCE_SW for fd %d (%#x)", 01892 fd , thread->running_eventset[cidx]->overflow.flags); 01893 } 01894 01895 /* convoluted way to get ctl */ 01896 ctl= thread->running_eventset[cidx]->ctl_state; 01897 01898 /* See if the fd is one that's part of the this thread's context */ 01899 for( i=0; i < ctl->num_events; i++ ) { 01900 if ( fd == ctl->events[i].event_fd ) { 01901 found_evt_idx = i; 01902 break; 01903 } 01904 } 01905 01906 if ( found_evt_idx == -1 ) { 01907 PAPIERROR( "Unable to find fd %d among the open event fds " 01908 "_papi_hwi_dispatch_timer!", fd ); 01909 return; 01910 } 01911 01912 if (ioctl( fd, PERF_EVENT_IOC_DISABLE, NULL ) == -1 ) { 01913 PAPIERROR("ioctl(PERF_EVENT_IOC_DISABLE) failed.\n"); 01914 } 01915 01916 if ( ( thread->running_eventset[cidx]->state & PAPI_PROFILING ) && 01917 !( thread->running_eventset[cidx]->profile.flags & 01918 PAPI_PROFIL_FORCE_SW ) ) { 01919 process_smpl_buf( found_evt_idx, &thread, cidx ); 01920 } 01921 else { 01922 uint64_t ip; 01923 unsigned int head; 01924 pe_event_info_t *pe = &(ctl->events[found_evt_idx]); 01925 unsigned char *data = ((unsigned char*)pe->mmap_buf) + getpagesize( ); 01926 01927 /* 01928 * Read up the most recent IP from the sample in the mmap buffer. To 01929 * do this, we make the assumption that all of the records in the 01930 * mmap buffer are the same size, and that they all contain the IP as 01931 * their only record element. This means that we can use the 01932 * data_head element from the user page and move backward one record 01933 * from that point and read the data. Since we don't actually need 01934 * to access the header of the record, we can just subtract 8 (size 01935 * of the IP) from data_head and read up that word from the mmap 01936 * buffer. After we subtract 8, we account for mmap buffer wrapping 01937 * by AND'ing this offset with the buffer mask. 01938 */ 01939 head = mmap_read_head( pe ); 01940 01941 if ( head == 0 ) { 01942 PAPIERROR( "Attempting to access memory which may be inaccessable" ); 01943 return; 01944 } 01945 ip = *( uint64_t * ) ( data + ( ( head - 8 ) & pe->mask ) ); 01946 /* 01947 * Update the tail to the current head pointer. 01948 * 01949 * Note: that if we were to read the record at the tail pointer, 01950 * rather than the one at the head (as you might otherwise think 01951 * would be natural), we could run into problems. Signals don't 01952 * stack well on Linux, particularly if not using RT signals, and if 01953 * they come in rapidly enough, we can lose some. Overtime, the head 01954 * could catch up to the tail and monitoring would be stopped, and 01955 * since no more signals are coming in, this problem will never be 01956 * resolved, resulting in a complete loss of overflow notification 01957 * from that point on. So the solution we use here will result in 01958 * only the most recent IP value being read every time there are two 01959 * or more samples in the buffer (for that one overflow signal). But 01960 * the handler will always bring up the tail, so the head should 01961 * never run into the tail. 01962 */ 01963 mmap_write_tail( pe, head ); 01964 01965 /* 01966 * The fourth parameter is supposed to be a vector of bits indicating 01967 * the overflowed hardware counters, but it's not really clear that 01968 * it's useful, because the actual hardware counters used are not 01969 * exposed to the PAPI user. For now, I'm just going to set the bit 01970 * that indicates which event register in the array overflowed. The 01971 * result is that the overflow vector will not be identical to the 01972 * perfmon implementation, and part of that is due to the fact that 01973 * which hardware register is actually being used is opaque at the 01974 * user level (the kernel event dispatcher hides that info). 01975 */ 01976 01977 _papi_hwi_dispatch_overflow_signal( ( void * ) &hw_context, 01978 ( caddr_t ) ( unsigned long ) ip, 01979 NULL, ( 1 << found_evt_idx ), 0, 01980 &thread, cidx ); 01981 01982 } 01983 01984 /* Restart the counters */ 01985 if (ioctl( fd, PERF_EVENT_IOC_REFRESH, PAPI_REFRESH_VALUE ) == -1) { 01986 PAPIERROR( "overflow refresh failed", 0 ); 01987 } 01988 } 01989 01990 /* Stop profiling */ 01991 int 01992 _pe_stop_profiling( ThreadInfo_t *thread, EventSetInfo_t *ESI ) 01993 { 01994 int i, ret = PAPI_OK; 01995 pe_control_t *ctl; 01996 int cidx; 01997 01998 ctl=ESI->ctl_state; 01999 02000 cidx=ctl->cidx; 02001 02002 /* Loop through all of the events and process those which have mmap */ 02003 /* buffers attached. */ 02004 for ( i = 0; i < ctl->num_events; i++ ) { 02005 /* Use the mmap_buf field as an indicator of this fd being used for */ 02006 /* profiling. */ 02007 if ( ctl->events[i].mmap_buf ) { 02008 /* Process any remaining samples in the sample buffer */ 02009 ret = process_smpl_buf( i, &thread, cidx ); 02010 if ( ret ) { 02011 PAPIERROR( "process_smpl_buf returned error %d", ret ); 02012 return ret; 02013 } 02014 } 02015 } 02016 return ret; 02017 } 02018 02019 /* Setup an event to cause overflow */ 02020 int 02021 _pe_set_overflow( EventSetInfo_t *ESI, int EventIndex, int threshold ) 02022 { 02023 02024 pe_context_t *ctx; 02025 pe_control_t *ctl = (pe_control_t *) ( ESI->ctl_state ); 02026 int i, evt_idx, found_non_zero_sample_period = 0, retval = PAPI_OK; 02027 int cidx; 02028 02029 cidx = ctl->cidx; 02030 ctx = ( pe_context_t *) ( ESI->master->context[cidx] ); 02031 02032 evt_idx = ESI->EventInfoArray[EventIndex].pos[0]; 02033 02034 SUBDBG("Attempting to set overflow for index %d (%d) of EventSet %d\n", 02035 evt_idx,EventIndex,ESI->EventSetIndex); 02036 02037 if (evt_idx<0) { 02038 return PAPI_EINVAL; 02039 } 02040 02041 if ( threshold == 0 ) { 02042 /* If this counter isn't set to overflow, it's an error */ 02043 if ( ctl->events[evt_idx].attr.sample_period == 0 ) return PAPI_EINVAL; 02044 } 02045 02046 ctl->events[evt_idx].attr.sample_period = threshold; 02047 02048 /* 02049 * Note that the wakeup_mode field initially will be set to zero 02050 * (WAKEUP_MODE_COUNTER_OVERFLOW) as a result of a call to memset 0 to 02051 * all of the events in the ctl struct. 02052 * 02053 * Is it even set to any other value elsewhere? 02054 */ 02055 switch ( ctl->events[evt_idx].wakeup_mode ) { 02056 case WAKEUP_MODE_PROFILING: 02057 /* Setting wakeup_events to special value zero means issue a */ 02058 /* wakeup (signal) on every mmap page overflow. */ 02059 ctl->events[evt_idx].attr.wakeup_events = 0; 02060 break; 02061 02062 case WAKEUP_MODE_COUNTER_OVERFLOW: 02063 /* Can this code ever be called? */ 02064 02065 /* Setting wakeup_events to one means issue a wakeup on every */ 02066 /* counter overflow (not mmap page overflow). */ 02067 ctl->events[evt_idx].attr.wakeup_events = 1; 02068 /* We need the IP to pass to the overflow handler */ 02069 ctl->events[evt_idx].attr.sample_type = PERF_SAMPLE_IP; 02070 /* one for the user page, and two to take IP samples */ 02071 ctl->events[evt_idx].nr_mmap_pages = 1 + 2; 02072 break; 02073 default: 02074 PAPIERROR( "ctl->wakeup_mode[%d] set to an unknown value - %u", 02075 evt_idx, ctl->events[evt_idx].wakeup_mode); 02076 return PAPI_EBUG; 02077 } 02078 02079 /* Check for non-zero sample period */ 02080 for ( i = 0; i < ctl->num_events; i++ ) { 02081 if ( ctl->events[evt_idx].attr.sample_period ) { 02082 found_non_zero_sample_period = 1; 02083 break; 02084 } 02085 } 02086 02087 if ( found_non_zero_sample_period ) { 02088 /* turn on internal overflow flag for this event set */ 02089 ctl->overflow = 1; 02090 02091 /* Enable the signal handler */ 02092 retval = _papi_hwi_start_signal( 02093 ctl->overflow_signal, 02094 1, ctl->cidx ); 02095 } else { 02096 /* turn off internal overflow flag for this event set */ 02097 ctl->overflow = 0; 02098 02099 /* Remove the signal handler, if there are no remaining non-zero */ 02100 /* sample_periods set */ 02101 retval = _papi_hwi_stop_signal(ctl->overflow_signal); 02102 if ( retval != PAPI_OK ) return retval; 02103 } 02104 02105 retval = _pe_update_control_state( ctl, NULL, 02106 ( (pe_control_t *) (ESI->ctl_state) )->num_events, 02107 ctx ); 02108 02109 return retval; 02110 } 02111 02112 /* Enable profiling */ 02113 int 02114 _pe_set_profile( EventSetInfo_t *ESI, int EventIndex, int threshold ) 02115 { 02116 int ret; 02117 int evt_idx; 02118 pe_control_t *ctl = ( pe_control_t *) ( ESI->ctl_state ); 02119 02120 /* Since you can't profile on a derived event, the event is always the */ 02121 /* first and only event in the native event list. */ 02122 evt_idx = ESI->EventInfoArray[EventIndex].pos[0]; 02123 02124 if ( threshold == 0 ) { 02125 SUBDBG( "MUNMAP(%p,%"PRIu64")\n", ctl->events[evt_idx].mmap_buf, 02126 ( uint64_t ) ctl->events[evt_idx].nr_mmap_pages * 02127 getpagesize( ) ); 02128 02129 if ( ctl->events[evt_idx].mmap_buf ) { 02130 munmap( ctl->events[evt_idx].mmap_buf, 02131 ctl->events[evt_idx].nr_mmap_pages * getpagesize() ); 02132 } 02133 ctl->events[evt_idx].mmap_buf = NULL; 02134 ctl->events[evt_idx].nr_mmap_pages = 0; 02135 ctl->events[evt_idx].attr.sample_type &= ~PERF_SAMPLE_IP; 02136 ret = _pe_set_overflow( ESI, EventIndex, threshold ); 02137 /* ??? #warning "This should be handled somewhere else" */ 02138 ESI->state &= ~( PAPI_OVERFLOWING ); 02139 ESI->overflow.flags &= ~( PAPI_OVERFLOW_HARDWARE ); 02140 02141 return ret; 02142 } 02143 02144 /* Look up the native event code */ 02145 if ( ESI->profile.flags & (PAPI_PROFIL_DATA_EAR | PAPI_PROFIL_INST_EAR)) { 02146 /* Not supported yet... */ 02147 02148 return PAPI_ENOSUPP; 02149 } 02150 if ( ESI->profile.flags & PAPI_PROFIL_RANDOM ) { 02151 /* This requires an ability to randomly alter the sample_period within */ 02152 /* a given range. Kernel does not have this ability. FIXME */ 02153 return PAPI_ENOSUPP; 02154 } 02155 02156 /* Just a guess at how many pages would make this relatively efficient. */ 02157 /* Note that it's "1 +" because of the need for a control page, and the */ 02158 /* number following the "+" must be a power of 2 (1, 4, 8, 16, etc) or */ 02159 /* zero. This is required to optimize dealing with circular buffer */ 02160 /* wrapping of the mapped pages. */ 02161 02162 ctl->events[evt_idx].nr_mmap_pages = (1+8); 02163 ctl->events[evt_idx].attr.sample_type |= PERF_SAMPLE_IP; 02164 02165 ret = _pe_set_overflow( ESI, EventIndex, threshold ); 02166 if ( ret != PAPI_OK ) return ret; 02167 02168 return PAPI_OK; 02169 } 02170 02171 02172 /* Our component vector */ 02173 02174 papi_vector_t _perf_event_vector = { 02175 .cmp_info = { 02176 /* component information (unspecified values initialized to 0) */ 02177 .name = "perf_event", 02178 .short_name = "perf", 02179 .version = "5.0", 02180 .description = "Linux perf_event CPU counters", 02181 02182 .default_domain = PAPI_DOM_USER, 02183 .available_domains = PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR, 02184 .default_granularity = PAPI_GRN_THR, 02185 .available_granularities = PAPI_GRN_THR | PAPI_GRN_SYS, 02186 02187 .hardware_intr = 1, 02188 .kernel_profile = 1, 02189 02190 /* component specific cmp_info initializations */ 02191 .fast_virtual_timer = 0, 02192 .attach = 1, 02193 .attach_must_ptrace = 1, 02194 .cpu = 1, 02195 .inherit = 1, 02196 .cntr_umasks = 1, 02197 02198 }, 02199 02200 /* sizes of framework-opaque component-private structures */ 02201 .size = { 02202 .context = sizeof ( pe_context_t ), 02203 .control_state = sizeof ( pe_control_t ), 02204 .reg_value = sizeof ( int ), 02205 .reg_alloc = sizeof ( int ), 02206 }, 02207 02208 /* function pointers in this component */ 02209 .init_component = _pe_init_component, 02210 .shutdown_component = _pe_shutdown_component, 02211 .init_thread = _pe_init_thread, 02212 .init_control_state = _pe_init_control_state, 02213 .dispatch_timer = _pe_dispatch_timer, 02214 02215 /* function pointers from the shared perf_event lib */ 02216 .start = _pe_start, 02217 .stop = _pe_stop, 02218 .read = _pe_read, 02219 .shutdown_thread = _pe_shutdown_thread, 02220 .ctl = _pe_ctl, 02221 .update_control_state = _pe_update_control_state, 02222 .set_domain = _pe_set_domain, 02223 .reset = _pe_reset, 02224 .set_overflow = _pe_set_overflow, 02225 .set_profile = _pe_set_profile, 02226 .stop_profiling = _pe_stop_profiling, 02227 .write = _pe_write, 02228 02229 02230 /* from counter name mapper */ 02231 .ntv_enum_events = _pe_ntv_enum_events, 02232 .ntv_name_to_code = _pe_ntv_name_to_code, 02233 .ntv_code_to_name = _pe_ntv_code_to_name, 02234 .ntv_code_to_descr = _pe_ntv_code_to_descr, 02235 .ntv_code_to_info = _pe_ntv_code_to_info, 02236 };