/* PAPI 5.1.0.2 */
/*
 * File:    perf_events.c
 *
 * Author:  Corey Ashford
 *          cjashfor@us.ibm.com
 *          - based upon perfmon.c written by -
 *          Philip Mucci
 *          mucci@cs.utk.edu
 * Mods:    Gary Mohr
 *          gary.mohr@bull.com
 * Mods:    Vince Weaver
 *          vweaver1@eecs.utk.edu
 * Mods:    Philip Mucci
 *          mucci@eecs.utk.edu
 */

#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <syscall.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/utsname.h>

/* PAPI-specific includes */
#include "papi.h"
#include "papi_memory.h"
#include "papi_internal.h"
#include "papi_vector.h"
#include "extras.h"

/* libpfm4 includes */
#include "papi_libpfm4_events.h"
#include "perfmon/pfmlib.h"
#include PEINCLUDE

/* Linux-specific includes */
#include "mb.h"
#include "syscalls.h"
#include "linux-memory.h"
#include "linux-timer.h"
#include "linux-common.h"
#include "linux-context.h"

/* Various definitions */

/* This is arbitrary.
Typically you can add up to ~1000 before */ 00049 /* you run out of fds */ 00050 #define PERF_EVENT_MAX_MPX_COUNTERS 64 00051 00052 /* We really don't need fancy definitions for these */ 00053 00054 typedef struct 00055 { 00056 int group_leader_fd; /* fd of group leader */ 00057 int event_fd; /* fd of event */ 00058 int event_opened; /* event successfully opened */ 00059 uint32_t nr_mmap_pages; /* number pages in the mmap buffer */ 00060 void *mmap_buf; /* used for control/profiling */ 00061 uint64_t tail; /* current read location in mmap buffer */ 00062 uint64_t mask; /* mask used for wrapping the pages */ 00063 struct perf_event_attr attr; /* perf_event config structure */ 00064 unsigned int wakeup_mode; /* wakeup mode when sampling */ 00065 } pe_event_info_t; 00066 00067 typedef struct 00068 { 00069 int num_events; /* number of events in control state */ 00070 unsigned int domain; /* control-state wide domain */ 00071 unsigned int granularity; /* granularity */ 00072 unsigned int multiplexed; /* multiplexing enable */ 00073 unsigned int overflow; /* overflow enable */ 00074 unsigned int inherit; /* inherit enable */ 00075 int cpu; /* which cpu to measure */ 00076 pid_t tid; /* thread we are monitoring */ 00077 pe_event_info_t events[PERF_EVENT_MAX_MPX_COUNTERS]; 00078 long long counts[PERF_EVENT_MAX_MPX_COUNTERS]; 00079 } pe_control_t; 00080 00081 typedef struct 00082 { 00083 int initialized; /* are we initialized? */ 00084 int state; /* are we opened and/or running? */ 00085 } pe_context_t; 00086 00087 /* These sentinels tell papi_pe_set_overflow() how to set the */ 00088 /* wakeup_events field in the event descriptor record. 
*/ 00089 00090 #define WAKEUP_COUNTER_OVERFLOW 0 00091 #define WAKEUP_PROFILING -1 00092 00093 #define WAKEUP_MODE_COUNTER_OVERFLOW 0 00094 #define WAKEUP_MODE_PROFILING 1 00095 00096 /* Defines for ctx->state */ 00097 #define PERF_EVENTS_OPENED 0x01 00098 #define PERF_EVENTS_RUNNING 0x02 00099 00100 /* Static globals */ 00101 static int nmi_watchdog_active; 00102 00103 /* Advance declaration */ 00104 papi_vector_t _papi_pe_vector; 00105 00106 00107 /******** Kernel Version Dependent Routines **********************/ 00108 00109 /* KERNEL_CHECKS_SCHEDUABILITY_UPON_OPEN is a work-around for kernel arch 00110 * implementations (e.g. x86) which don't do a static event scheduability 00111 * check in sys_perf_event_open. 00112 * This was fixed for x86 in the 2.6.33 kernel 00113 * 00114 * Also! Kernels newer than 2.6.34 will fail in a similar way 00115 * if the nmi_watchdog has stolen a performance counter 00116 * and we try to use the maximum number of counters. 00117 * A sys_perf_event_open() will seem to succeed but will fail 00118 * at read time. So re-use this work around code. 00119 */ 00120 static int 00121 bug_check_scheduability(void) { 00122 00123 #if defined(__powerpc__) 00124 /* PowerPC not affected by this bug */ 00125 #elif defined(__mips__) 00126 /* MIPS as of kernel 3.1 does not properly detect schedulability */ 00127 return 1; 00128 #else 00129 if (_papi_os_info.os_version < LINUX_VERSION(2,6,33)) return 1; 00130 #endif 00131 00132 if (nmi_watchdog_active) return 1; 00133 00134 return 0; 00135 } 00136 00137 /* PERF_FORMAT_GROUP allows reading an entire group's counts at once */ 00138 /* before 2.6.34 PERF_FORMAT_GROUP did not work when reading results */ 00139 /* from attached processes. 
We are lazy and disable it for all cases */ 00140 /* commit was: 050735b08ca8a016bbace4445fa025b88fee770b */ 00141 00142 static int 00143 bug_format_group(void) { 00144 00145 if (_papi_os_info.os_version < LINUX_VERSION(2,6,34)) return 1; 00146 00147 /* MIPS, as of version 3.1, does not support this properly */ 00148 00149 #if defined(__mips__) 00150 return 1; 00151 #endif 00152 00153 return 0; 00154 00155 } 00156 00157 00158 /* There's a bug prior to Linux 2.6.33 where if you are using */ 00159 /* PERF_FORMAT_GROUP, the TOTAL_TIME_ENABLED and */ 00160 /* TOTAL_TIME_RUNNING fields will be zero unless you disable */ 00161 /* the counters first */ 00162 static int 00163 bug_sync_read(void) { 00164 00165 if (_papi_os_info.os_version < LINUX_VERSION(2,6,33)) return 1; 00166 00167 return 0; 00168 00169 } 00170 00171 00172 /* Set the F_SETOWN_EX flag on the fd. */ 00173 /* This affects which thread an overflow signal gets sent to */ 00174 /* Handled in a subroutine to handle the fact that the behavior */ 00175 /* is dependent on kernel version. 
*/ 00176 static int 00177 fcntl_setown_fd(int fd) { 00178 00179 int ret; 00180 struct f_owner_ex fown_ex; 00181 00182 /* F_SETOWN_EX is not available until 2.6.32 */ 00183 if (_papi_os_info.os_version < LINUX_VERSION(2,6,32)) { 00184 00185 /* get ownership of the descriptor */ 00186 ret = fcntl( fd, F_SETOWN, mygettid( ) ); 00187 if ( ret == -1 ) { 00188 PAPIERROR( "cannot fcntl(F_SETOWN) on %d: %s", fd, strerror(errno) ); 00189 return PAPI_ESYS; 00190 } 00191 } 00192 else { 00193 /* set ownership of the descriptor */ 00194 fown_ex.type = F_OWNER_TID; 00195 fown_ex.pid = mygettid(); 00196 ret = fcntl(fd, F_SETOWN_EX, (unsigned long)&fown_ex ); 00197 00198 if ( ret == -1 ) { 00199 PAPIERROR( "cannot fcntl(F_SETOWN_EX) on %d: %s", 00200 fd, strerror( errno ) ); 00201 return PAPI_ESYS; 00202 } 00203 } 00204 return PAPI_OK; 00205 } 00206 00207 /* Check for processor support */ 00208 /* Can be used for generic checking, though in general we only */ 00209 /* check for pentium4 here because support was broken for multiple */ 00210 /* kernel releases and the usual standard detections did not */ 00211 /* handle this. So we check for pentium 4 explicitly. */ 00212 static int 00213 processor_supported(int vendor, int family) { 00214 00215 /* Error out if kernel too early to support p4 */ 00216 if (( vendor == PAPI_VENDOR_INTEL ) && (family == 15)) { 00217 if (_papi_os_info.os_version < LINUX_VERSION(2,6,35)) { 00218 PAPIERROR("Pentium 4 not supported on kernels before 2.6.35"); 00219 return PAPI_ENOSUPP; 00220 } 00221 } 00222 return PAPI_OK; 00223 } 00224 00225 00226 /* The read format on perf_event varies based on various flags that */ 00227 /* are passed into it. This helper avoids copying this logic */ 00228 /* multiple places. 
*/ 00229 static unsigned int 00230 get_read_format( unsigned int multiplex, 00231 unsigned int inherit, 00232 int format_group ) 00233 { 00234 unsigned int format = 0; 00235 00236 /* if we need read format options for multiplexing, add them now */ 00237 if (multiplex) { 00238 format |= PERF_FORMAT_TOTAL_TIME_ENABLED; 00239 format |= PERF_FORMAT_TOTAL_TIME_RUNNING; 00240 } 00241 00242 /* if our kernel supports it and we are not using inherit, */ 00243 /* add the group read options */ 00244 if ( (!bug_format_group()) && !inherit) { 00245 if (format_group) { 00246 format |= PERF_FORMAT_GROUP; 00247 } 00248 } 00249 00250 SUBDBG("multiplex: %d, inherit: %d, group_leader: %d, format: 0x%x\n", 00251 multiplex, inherit, format_group, format); 00252 00253 return format; 00254 } 00255 00256 /* The kernel developers say to never use a refresh value of 0 */ 00257 /* See https://lkml.org/lkml/2011/5/24/172 */ 00258 /* However, on some platforms (like Power) a value of 1 does not work */ 00259 /* We're still tracking down why this happens. */ 00260 00261 #if defined(__powerpc__) 00262 #define PAPI_REFRESH_VALUE 0 00263 #else 00264 #define PAPI_REFRESH_VALUE 1 00265 #endif 00266 00267 /********* End Kernel-version Dependent Routines ****************/ 00268 00269 00271 /* perf_events. */ 00272 /* We do this by temporarily opening an event with the */ 00273 /* desired options then closing it again. We use the */ 00274 /* PERF_COUNT_HW_INSTRUCTION event as a dummy event */ 00275 /* on the assumption it is available on all */ 00276 /* platforms. 
*/ 00277 00278 static int 00279 check_permissions( unsigned long tid, 00280 unsigned int cpu_num, 00281 unsigned int domain, 00282 unsigned int granularity, 00283 unsigned int multiplex, 00284 unsigned int inherit ) 00285 { 00286 int ev_fd; 00287 struct perf_event_attr attr; 00288 00289 long pid; 00290 00291 /* clearing this will set a type of hardware and to count all domains */ 00292 memset(&attr, '\0', sizeof(attr)); 00293 attr.read_format = get_read_format(multiplex, inherit, 1); 00294 00295 /* set the event id (config field) to instructios */ 00296 /* (an event that should always exist) */ 00297 /* This was cycles but that is missing on Niagara */ 00298 attr.config = PERF_COUNT_HW_INSTRUCTIONS; 00299 00300 /* now set up domains this event set will be counting */ 00301 if (!(domain & PAPI_DOM_SUPERVISOR)) { 00302 attr.exclude_hv = 1; 00303 } 00304 if (!(domain & PAPI_DOM_USER)) { 00305 attr.exclude_user = 1; 00306 } 00307 if (!(domain & PAPI_DOM_KERNEL)) { 00308 attr.exclude_kernel = 1; 00309 } 00310 00311 if (granularity==PAPI_GRN_SYS) { 00312 pid = -1; 00313 } else { 00314 pid = tid; 00315 } 00316 00317 SUBDBG("Calling sys_perf_event_open() from check_permissions\n"); 00318 00319 ev_fd = sys_perf_event_open( &attr, pid, cpu_num, -1, 0 ); 00320 if ( ev_fd == -1 ) { 00321 SUBDBG("sys_perf_event_open returned error. Linux says, %s", 00322 strerror( errno ) ); 00323 return PAPI_EPERM; 00324 } 00325 00326 /* now close it, this was just to make sure we have permissions */ 00327 /* to set these options */ 00328 close(ev_fd); 00329 return PAPI_OK; 00330 } 00331 00332 00333 00334 /* Maximum size we ever expect to read from a perf_event fd */ 00335 /* (this is the number of 64-bit values) */ 00336 /* We use this to size the read buffers */ 00337 /* The three is for event count, time_enabled, time_running */ 00338 /* and the counter term is count value and count id for each */ 00339 /* possible counter value. 
*/ 00340 #define READ_BUFFER_SIZE (3 + (2 * PERF_EVENT_MAX_MPX_COUNTERS)) 00341 00342 00343 00344 /* KERNEL_CHECKS_SCHEDUABILITY_UPON_OPEN is a work-around for kernel arch */ 00345 /* implementations (e.g. x86 before 2.6.33) which don't do a static event */ 00346 /* scheduability check in sys_perf_event_open. It is also needed if the */ 00347 /* kernel is stealing an event, such as when NMI watchdog is enabled. */ 00348 00349 static int 00350 check_scheduability( pe_context_t *ctx, pe_control_t *ctl, int idx ) 00351 { 00352 int retval = 0, cnt = -1; 00353 ( void ) ctx; /*unused */ 00354 long long papi_pe_buffer[READ_BUFFER_SIZE]; 00355 int i,group_leader_fd; 00356 00357 if (bug_check_scheduability()) { 00358 00359 /* If the kernel isn't tracking scheduability right */ 00360 /* Then we need to start/stop/read to force the event */ 00361 /* to be scheduled and see if an error condition happens. */ 00362 00363 /* get the proper fd to start */ 00364 group_leader_fd=ctl->events[idx].group_leader_fd; 00365 if (group_leader_fd==-1) group_leader_fd=ctl->events[idx].event_fd; 00366 00367 /* start the event */ 00368 retval = ioctl( group_leader_fd, PERF_EVENT_IOC_ENABLE, NULL ); 00369 if (retval == -1) { 00370 PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) failed.\n"); 00371 return PAPI_ESYS; 00372 } 00373 00374 /* stop the event */ 00375 retval = ioctl(group_leader_fd, PERF_EVENT_IOC_DISABLE, NULL ); 00376 if (retval == -1) { 00377 PAPIERROR( "ioctl(PERF_EVENT_IOC_DISABLE) failed.\n" ); 00378 return PAPI_ESYS; 00379 } 00380 00381 /* See if a read returns any results */ 00382 cnt = read( group_leader_fd, papi_pe_buffer, sizeof(papi_pe_buffer)); 00383 if ( cnt == -1 ) { 00384 SUBDBG( "read returned an error! 
Should never happen.\n" ); 00385 return PAPI_ESYS; 00386 } 00387 00388 if ( cnt == 0 ) { 00389 /* We read 0 bytes if we could not schedule the event */ 00390 /* The kernel should have detected this at open */ 00391 /* but various bugs (including NMI watchdog) */ 00392 /* result in this behavior */ 00393 00394 return PAPI_ECNFLCT; 00395 00396 } else { 00397 00398 /* Reset all of the counters (opened so far) back to zero */ 00399 /* from the above brief enable/disable call pair. */ 00400 00401 /* We have to reset all events because reset of group leader */ 00402 /* does not reset all. */ 00403 /* we assume that the events are being added one by one and that */ 00404 /* we do not need to reset higher events (doing so may reset ones */ 00405 /* that have not been initialized yet. */ 00406 00407 /* Note... PERF_EVENT_IOC_RESET does not reset time running */ 00408 /* info if multiplexing, so we should avoid coming here if */ 00409 /* we are multiplexing the event. */ 00410 for( i = 0; i < idx; i++) { 00411 retval=ioctl( ctl->events[i].event_fd, PERF_EVENT_IOC_RESET, NULL ); 00412 if (retval == -1) { 00413 PAPIERROR( "ioctl(PERF_EVENT_IOC_RESET) #%d/%d %d " 00414 "(fd %d)failed.\n", 00415 i,ctl->num_events,idx,ctl->events[i].event_fd); 00416 return PAPI_ESYS; 00417 } 00418 } 00419 } 00420 } 00421 return PAPI_OK; 00422 } 00423 00424 00425 /* Do some extrta work on a perf_event fd if we're doing sampling */ 00426 /* This mostly means setting up the mmap buffer. */ 00427 static int 00428 tune_up_fd( pe_control_t *ctl, int evt_idx ) 00429 { 00430 int ret; 00431 void *buf_addr; 00432 int fd = ctl->events[evt_idx].event_fd; 00433 00434 /* Register that we would like a SIGIO notification when a mmap'd page */ 00435 /* becomes full. 
*/ 00436 ret = fcntl( fd, F_SETFL, O_ASYNC | O_NONBLOCK ); 00437 if ( ret ) { 00438 PAPIERROR ( "fcntl(%d, F_SETFL, O_ASYNC | O_NONBLOCK) " 00439 "returned error: %s", fd, strerror( errno ) ); 00440 return PAPI_ESYS; 00441 } 00442 00443 /* Set the F_SETOWN_EX flag on the fd. */ 00444 /* This affects which thread an overflow signal gets sent to. */ 00445 ret=fcntl_setown_fd(fd); 00446 if (ret!=PAPI_OK) return ret; 00447 00448 /* Set FD_CLOEXEC. Otherwise if we do an exec with an overflow */ 00449 /* running, the overflow handler will continue into the exec()'d*/ 00450 /* process and kill it because no signal handler is set up. */ 00451 ret=fcntl(fd, F_SETFD, FD_CLOEXEC); 00452 if (ret) { 00453 return PAPI_ESYS; 00454 } 00455 00456 /* when you explicitely declare that you want a particular signal, */ 00457 /* even with you use the default signal, the kernel will send more */ 00458 /* information concerning the event to the signal handler. */ 00459 /* */ 00460 /* In particular, it will send the file descriptor from which the */ 00461 /* event is originating which can be quite useful when monitoring */ 00462 /* multiple tasks from a single thread. 
*/ 00463 ret = fcntl( fd, F_SETSIG, _papi_pe_vector.cmp_info.hardware_intr_sig ); 00464 if ( ret == -1 ) { 00465 PAPIERROR( "cannot fcntl(F_SETSIG,%d) on %d: %s", 00466 _papi_pe_vector.cmp_info.hardware_intr_sig, fd, 00467 strerror( errno ) ); 00468 return PAPI_ESYS; 00469 } 00470 00471 /* mmap() the sample buffer */ 00472 buf_addr = mmap( NULL, ctl->events[evt_idx].nr_mmap_pages * getpagesize(), 00473 PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0 ); 00474 if ( buf_addr == MAP_FAILED ) { 00475 PAPIERROR( "mmap(NULL,%d,%d,%d,%d,0): %s", 00476 ctl->events[evt_idx].nr_mmap_pages * getpagesize( ), 00477 PROT_READ, MAP_SHARED, fd, strerror( errno ) ); 00478 return ( PAPI_ESYS ); 00479 } 00480 00481 SUBDBG( "Sample buffer for fd %d is located at %p\n", fd, buf_addr ); 00482 00483 /* Set up the mmap buffer and its associated helpers */ 00484 ctl->events[evt_idx].mmap_buf = (struct perf_counter_mmap_page *) buf_addr; 00485 ctl->events[evt_idx].tail = 0; 00486 ctl->events[evt_idx].mask = ( ctl->events[evt_idx].nr_mmap_pages - 1 ) * 00487 getpagesize() - 1; 00488 00489 return PAPI_OK; 00490 } 00491 00492 00493 /* Open all events in the control state */ 00494 static int 00495 open_pe_events( pe_context_t *ctx, pe_control_t *ctl ) 00496 { 00497 00498 int i, ret = PAPI_OK; 00499 long pid; 00500 00501 if (ctl->granularity==PAPI_GRN_SYS) { 00502 pid = -1; 00503 } 00504 else { 00505 pid = ctl->tid; 00506 } 00507 00508 for( i = 0; i < ctl->num_events; i++ ) { 00509 00510 ctl->events[i].event_opened=0; 00511 00512 /* set up the attr structure. We don't set up all fields here */ 00513 /* as some have already been set up previously. 
*/ 00514 00515 /* group leader (event 0) is special */ 00516 /* If we're multiplexed, everyone is a group leader */ 00517 if (( i == 0 ) || (ctl->multiplexed)) { 00518 ctl->events[i].attr.pinned = !ctl->multiplexed; 00519 ctl->events[i].attr.disabled = 1; 00520 ctl->events[i].group_leader_fd=-1; 00521 ctl->events[i].attr.read_format = get_read_format(ctl->multiplexed, 00522 ctl->inherit, 00523 !ctl->multiplexed ); 00524 } else { 00525 ctl->events[i].attr.pinned=0; 00526 ctl->events[i].attr.disabled = 0; 00527 ctl->events[i].group_leader_fd=ctl->events[0].event_fd; 00528 ctl->events[i].attr.read_format = get_read_format(ctl->multiplexed, 00529 ctl->inherit, 00530 0 ); 00531 } 00532 00533 00534 /* try to open */ 00535 ctl->events[i].event_fd = sys_perf_event_open( &ctl->events[i].attr, 00536 pid, 00537 ctl->cpu, 00538 ctl->events[i].group_leader_fd, 00539 0 /* flags */ 00540 ); 00541 00542 if ( ctl->events[i].event_fd == -1 ) { 00543 SUBDBG("sys_perf_event_open returned error on event #%d." 00544 " Error: %s\n", 00545 i, strerror( errno ) ); 00546 if (errno == EPERM) ret = PAPI_EPERM; 00547 else ret = PAPI_ECNFLCT; 00548 goto open_pe_cleanup; 00549 } 00550 00551 SUBDBG ("sys_perf_event_open: tid: %ld, cpu_num: %d," 00552 " group_leader/fd: %d, event_fd: %d," 00553 " read_format: 0x%"PRIu64"\n", 00554 pid, ctl->cpu, ctl->events[i].group_leader_fd, 00555 ctl->events[i].event_fd, ctl->events[i].attr.read_format); 00556 00557 00558 /* in many situations the kernel will indicate we opened fine */ 00559 /* yet things will fail later. So we need to double check */ 00560 /* we actually can use the events we've set up. 
*/ 00561 00562 /* This is not necessary if we are multiplexing, and in fact */ 00563 /* we cannot do this properly if multiplexed because */ 00564 /* PERF_EVENT_IOC_RESET does not reset the time running info */ 00565 if (!ctl->multiplexed) { 00566 ret = check_scheduability( ctx, ctl, i ); 00567 00568 if ( ret != PAPI_OK ) { 00569 /* the last event did open, so we need to bump the counter */ 00570 /* before doing the cleanup */ 00571 i++; 00572 00573 goto open_pe_cleanup; 00574 } 00575 } 00576 ctl->events[i].event_opened=1; 00577 } 00578 00579 /* Now that we've successfully opened all of the events, do whatever */ 00580 /* "tune-up" is needed to attach the mmap'd buffers, signal handlers, */ 00581 /* and so on. */ 00582 for ( i = 0; i < ctl->num_events; i++ ) { 00583 00584 /* If sampling is enabled, hook up signal handler */ 00585 if ( ctl->events[i].attr.sample_period ) { 00586 ret = tune_up_fd( ctl, i ); 00587 if ( ret != PAPI_OK ) { 00588 /* All of the fds are open, so we need to clean up all of them */ 00589 i = ctl->num_events; 00590 goto open_pe_cleanup; 00591 } 00592 } else { 00593 /* Make sure this is NULL so close_pe_events works right */ 00594 ctl->events[i].mmap_buf = NULL; 00595 } 00596 } 00597 00598 /* Set num_evts only if completely successful */ 00599 ctx->state |= PERF_EVENTS_OPENED; 00600 00601 return PAPI_OK; 00602 00603 open_pe_cleanup: 00604 /* We encountered an error, close up the fds we successfully opened. */ 00605 /* We go backward in an attempt to close group leaders last, although */ 00606 /* That's probably not strictly necessary. 
*/ 00607 while ( i > 0 ) { 00608 i--; 00609 if (ctl->events[i].event_fd>=0) { 00610 close( ctl->events[i].event_fd ); 00611 ctl->events[i].event_opened=0; 00612 } 00613 } 00614 00615 return ret; 00616 } 00617 00618 /* Close all of the opened events */ 00619 static int 00620 close_pe_events( pe_context_t *ctx, pe_control_t *ctl ) 00621 { 00622 int i; 00623 int num_closed=0; 00624 int events_not_opened=0; 00625 00626 /* should this be a more serious error? */ 00627 if ( ctx->state & PERF_EVENTS_RUNNING ) { 00628 SUBDBG("Closing without stopping first\n"); 00629 } 00630 00631 /* Close child events first */ 00632 for( i=0; i<ctl->num_events; i++ ) { 00633 00634 if (ctl->events[i].event_opened) { 00635 00636 if (ctl->events[i].group_leader_fd!=-1) { 00637 if ( ctl->events[i].mmap_buf ) { 00638 if ( munmap ( ctl->events[i].mmap_buf, 00639 ctl->events[i].nr_mmap_pages * getpagesize() ) ) { 00640 PAPIERROR( "munmap of fd = %d returned error: %s", 00641 ctl->events[i].event_fd, strerror( errno ) ); 00642 return PAPI_ESYS; 00643 } 00644 } 00645 00646 if ( close( ctl->events[i].event_fd ) ) { 00647 PAPIERROR( "close of fd = %d returned error: %s", 00648 ctl->events[i].event_fd, strerror( errno ) ); 00649 return PAPI_ESYS; 00650 } else { 00651 num_closed++; 00652 } 00653 ctl->events[i].event_opened=0; 00654 } 00655 } 00656 else { 00657 events_not_opened++; 00658 } 00659 } 00660 00661 /* Close the group leaders last */ 00662 for( i=0; i<ctl->num_events; i++ ) { 00663 00664 if (ctl->events[i].event_opened) { 00665 00666 if (ctl->events[i].group_leader_fd==-1) { 00667 if ( ctl->events[i].mmap_buf ) { 00668 if ( munmap ( ctl->events[i].mmap_buf, 00669 ctl->events[i].nr_mmap_pages * getpagesize() ) ) { 00670 PAPIERROR( "munmap of fd = %d returned error: %s", 00671 ctl->events[i].event_fd, strerror( errno ) ); 00672 return PAPI_ESYS; 00673 } 00674 } 00675 00676 00677 if ( close( ctl->events[i].event_fd ) ) { 00678 PAPIERROR( "close of fd = %d returned error: %s", 00679 
ctl->events[i].event_fd, strerror( errno ) ); 00680 return PAPI_ESYS; 00681 } else { 00682 num_closed++; 00683 } 00684 ctl->events[i].event_opened=0; 00685 } 00686 } 00687 } 00688 00689 00690 if (ctl->num_events!=num_closed) { 00691 if (ctl->num_events!=(num_closed+events_not_opened)) { 00692 PAPIERROR("Didn't close all events: " 00693 "Closed %d Not Opened: %d Expected %d\n", 00694 num_closed,events_not_opened,ctl->num_events); 00695 return PAPI_EBUG; 00696 } 00697 } 00698 00699 ctl->num_events=0; 00700 00701 ctx->state &= ~PERF_EVENTS_OPENED; 00702 00703 return PAPI_OK; 00704 } 00705 00706 /* Fix up the config based on what CPU/Vendor we are running on */ 00707 static int 00708 pe_vendor_fixups(void) 00709 { 00710 /* powerpc */ 00711 /* On IBM and Power6 Machines default domain should include supervisor */ 00712 if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_IBM ) { 00713 _papi_pe_vector.cmp_info.available_domains |= 00714 PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR; 00715 if (strcmp(_papi_hwi_system_info.hw_info.model_string, "POWER6" ) == 0 ) { 00716 _papi_pe_vector.cmp_info.default_domain = 00717 PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR; 00718 } 00719 } 00720 00721 if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_MIPS ) { 00722 _papi_pe_vector.cmp_info.available_domains |= PAPI_DOM_KERNEL; 00723 } 00724 00725 if ((_papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_INTEL) || 00726 (_papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_AMD)) { 00727 _papi_pe_vector.cmp_info.fast_real_timer = 1; 00728 } 00729 00730 /* ARM */ 00731 if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_ARM) { 00732 /* FIXME: this will change with Cortex A15 */ 00733 _papi_pe_vector.cmp_info.available_domains |= 00734 PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR; 00735 _papi_pe_vector.cmp_info.default_domain = 00736 PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR; 00737 } 00738 00739 /* CRAY */ 00740 if ( _papi_hwi_system_info.hw_info.vendor 
== PAPI_VENDOR_CRAY ) { 00741 _papi_pe_vector.cmp_info.available_domains |= PAPI_DOM_OTHER; 00742 } 00743 00744 return PAPI_OK; 00745 } 00746 00747 00748 /* Check the mmap page for rdpmc support */ 00749 static int detect_rdpmc(void) { 00750 00751 struct perf_event_attr pe; 00752 int fd,rdpmc_exists=1; 00753 void *addr; 00754 struct perf_event_mmap_page *our_mmap; 00755 00756 /* Create a fake instructions event so we can read a mmap page */ 00757 memset(&pe,0,sizeof(struct perf_event_attr)); 00758 00759 pe.type=PERF_TYPE_HARDWARE; 00760 pe.size=sizeof(struct perf_event_attr); 00761 pe.config=PERF_COUNT_HW_INSTRUCTIONS; 00762 00763 fd=sys_perf_event_open(&pe,0,-1,-1,0); 00764 if (fd<0) { 00765 return PAPI_ESYS; 00766 } 00767 00768 /* create the mmap page */ 00769 addr=mmap(NULL, 4096, PROT_READ, MAP_SHARED,fd,0); 00770 if (addr == (void *)(-1)) { 00771 close(fd); 00772 return PAPI_ESYS; 00773 } 00774 00775 /* get the rdpmc info */ 00776 our_mmap=(struct perf_event_mmap_page *)addr; 00777 if (our_mmap->cap_usr_rdpmc==0) { 00778 rdpmc_exists=0; 00779 } 00780 00781 /* close the fake event */ 00782 munmap(addr,4096); 00783 close(fd); 00784 00785 return rdpmc_exists; 00786 00787 } 00788 00789 /* Find a native event specified by a profile index */ 00790 static int 00791 find_profile_index( EventSetInfo_t *ESI, int evt_idx, int *flags, 00792 unsigned int *native_index, int *profile_index ) 00793 { 00794 int pos, esi_index, count; 00795 00796 for ( count = 0; count < ESI->profile.event_counter; count++ ) { 00797 esi_index = ESI->profile.EventIndex[count]; 00798 pos = ESI->EventInfoArray[esi_index].pos[0]; 00799 00800 if ( pos == evt_idx ) { 00801 *profile_index = count; 00802 *native_index = ESI->NativeInfoArray[pos].ni_event & 00803 PAPI_NATIVE_AND_MASK; 00804 *flags = ESI->profile.flags; 00805 SUBDBG( "Native event %d is at profile index %d, flags %d\n", 00806 *native_index, *profile_index, *flags ); 00807 return PAPI_OK; 00808 } 00809 } 00810 00811 PAPIERROR( "wrong 
count: %d vs. ESI->profile.event_counter %d", count, 00812 ESI->profile.event_counter ); 00813 return PAPI_EBUG; 00814 } 00815 00816 00817 /* These functions are based on builtin-record.c in the */ 00818 /* kernel's tools/perf directory. */ 00819 00820 static uint64_t 00821 mmap_read_head( pe_event_info_t *pe ) 00822 { 00823 struct perf_event_mmap_page *pc = pe->mmap_buf; 00824 int head; 00825 00826 if ( pc == NULL ) { 00827 PAPIERROR( "perf_event_mmap_page is NULL" ); 00828 return 0; 00829 } 00830 00831 head = pc->data_head; 00832 rmb( ); 00833 00834 return head; 00835 } 00836 00837 static void 00838 mmap_write_tail( pe_event_info_t *pe, uint64_t tail ) 00839 { 00840 struct perf_event_mmap_page *pc = pe->mmap_buf; 00841 00842 /* ensure all reads are done before we write the tail out. */ 00843 pc->data_tail = tail; 00844 } 00845 00846 /* Does the kernel define these somewhere? */ 00847 struct ip_event { 00848 struct perf_event_header header; 00849 uint64_t ip; 00850 }; 00851 struct lost_event { 00852 struct perf_event_header header; 00853 uint64_t id; 00854 uint64_t lost; 00855 }; 00856 typedef union event_union { 00857 struct perf_event_header header; 00858 struct ip_event ip; 00859 struct lost_event lost; 00860 } perf_sample_event_t; 00861 00862 00863 /* Should re-write with comments if we ever figure out what's */ 00864 /* going on here. */ 00865 static void 00866 mmap_read( ThreadInfo_t **thr, pe_event_info_t *pe, 00867 int profile_index ) 00868 { 00869 int cidx = _papi_pe_vector.cmp_info.CmpIdx; 00870 uint64_t head = mmap_read_head( pe ); 00871 uint64_t old = pe->tail; 00872 unsigned char *data = ((unsigned char*)pe->mmap_buf) + getpagesize( ); 00873 int diff; 00874 00875 diff = head - old; 00876 if ( diff < 0 ) { 00877 SUBDBG( "WARNING: failed to keep up with mmap data. head = %" PRIu64 00878 ", tail = %" PRIu64 ". Discarding samples.\n", head, old ); 00879 /* head points to a known good entry, start there. 
*/ 00880 old = head; 00881 } 00882 00883 for( ; old != head; ) { 00884 00885 perf_sample_event_t *event = ( perf_sample_event_t * ) 00886 & data[old & pe->mask]; 00887 perf_sample_event_t event_copy; 00888 size_t size = event->header.size; 00889 00890 /* Event straddles the mmap boundary -- header should always */ 00891 /* be inside due to u64 alignment of output. */ 00892 if ( ( old & pe->mask ) + size != ( ( old + size ) & pe->mask ) ) { 00893 uint64_t offset = old; 00894 uint64_t len = min( sizeof ( *event ), size ), cpy; 00895 void *dst = &event_copy; 00896 00897 do { 00898 cpy = min( pe->mask + 1 - ( offset & pe->mask ), len ); 00899 memcpy( dst, &data[offset & pe->mask], cpy ); 00900 offset += cpy; 00901 dst = ((unsigned char*)dst) + cpy; 00902 len -= cpy; 00903 } while ( len ); 00904 00905 event = &event_copy; 00906 } 00907 00908 old += size; 00909 00910 SUBDBG( "event->type = %08x\n", event->header.type ); 00911 SUBDBG( "event->size = %d\n", event->header.size ); 00912 00913 switch ( event->header.type ) { 00914 case PERF_RECORD_SAMPLE: 00915 _papi_hwi_dispatch_profile( ( *thr )->running_eventset[cidx], 00916 ( caddr_t ) ( unsigned long ) event->ip.ip, 00917 0, profile_index ); 00918 break; 00919 00920 case PERF_RECORD_LOST: 00921 SUBDBG( "Warning: because of a mmap buffer overrun, %" PRId64 00922 " events were lost.\n" 00923 "Loss was recorded when counter id 0x%"PRIx64 00924 " overflowed.\n", event->lost.lost, event->lost.id ); 00925 break; 00926 00927 default: 00928 SUBDBG( "Error: unexpected header type - %d\n", 00929 event->header.type ); 00930 break; 00931 } 00932 } 00933 00934 pe->tail = old; 00935 mmap_write_tail( pe, old ); 00936 } 00937 00938 /* What exactly does this do? 
*/ 00939 static int 00940 process_smpl_buf( int evt_idx, ThreadInfo_t **thr ) 00941 { 00942 int ret, flags, profile_index; 00943 unsigned native_index; 00944 int cidx = _papi_pe_vector.cmp_info.CmpIdx; 00945 pe_control_t *ctl; 00946 00947 ret = find_profile_index( ( *thr )->running_eventset[cidx], evt_idx, 00948 &flags, &native_index, &profile_index ); 00949 if ( ret != PAPI_OK ) { 00950 return ret; 00951 } 00952 00953 ctl= (*thr)->running_eventset[cidx]->ctl_state; 00954 00955 mmap_read( thr, 00956 &(ctl->events[evt_idx]), 00957 profile_index ); 00958 00959 return PAPI_OK; 00960 } 00961 00962 00963 00964 00965 /********************************************************************/ 00966 /********************************************************************/ 00967 /* Start with functions that are exported via the module interface */ 00968 /********************************************************************/ 00969 /********************************************************************/ 00970 00971 00972 /* set the domain. FIXME: perf_events allows per-event control of this. */ 00973 /* we do not handle that yet. 
*/ 00974 int 00975 _papi_pe_set_domain( hwd_control_state_t *ctl, int domain) 00976 { 00977 00978 int i; 00979 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 00980 00981 SUBDBG("old control domain %d, new domain %d, default domain %d\n", 00982 pe_ctl->domain,domain,_papi_pe_vector.cmp_info.default_domain); 00983 00984 pe_ctl->domain = domain; 00985 00986 /* Force the domain on all events */ 00987 for( i = 0; i < pe_ctl->num_events; i++ ) { 00988 pe_ctl->events[i].attr.exclude_user = 00989 !( pe_ctl->domain & PAPI_DOM_USER ); 00990 pe_ctl->events[i].attr.exclude_kernel = 00991 !( pe_ctl->domain & PAPI_DOM_KERNEL ); 00992 pe_ctl->events[i].attr.exclude_hv = 00993 !( pe_ctl->domain & PAPI_DOM_SUPERVISOR ); 00994 } 00995 return PAPI_OK; 00996 } 00997 00998 00999 /* Initialize the perf_event component */ 01000 static int 01001 _papi_pe_init_component( int cidx ) 01002 { 01003 01004 int retval; 01005 int paranoid_level; 01006 01007 FILE *fff; 01008 01009 ( void ) cidx; /*unused */ 01010 01011 /* The is the official way to detect if perf_event support exists */ 01012 /* The file is called perf_counter_paranoid on 2.6.31 */ 01013 /* currently we are lazy and do not support 2.6.31 kernels */ 01014 fff=fopen("/proc/sys/kernel/perf_event_paranoid","r"); 01015 if (fff==NULL) { 01016 strncpy(_papi_pe_vector.cmp_info.disabled_reason, 01017 "perf_event support not detected",PAPI_MAX_STR_LEN); 01018 return PAPI_ENOCMP; 01019 } 01020 01021 /* 2 means no measurements allowed */ 01022 /* 1 means normal counter access */ 01023 /* 0 means you can access CPU-specific data */ 01024 /* -1 means no restrictions */ 01025 retval=fscanf(fff,"%d",¶noid_level); 01026 if (retval!=1) fprintf(stderr,"Error reading paranoid level\n"); 01027 fclose(fff); 01028 01029 if (paranoid_level==2) { 01030 strncpy(_papi_pe_vector.cmp_info.disabled_reason, 01031 "/proc/sys/kernel/perf_event_paranoid prohibits using counters", 01032 PAPI_MAX_STR_LEN); 01033 return PAPI_ENOCMP; 01034 } 01035 01036 /* Detect NMI 
watchdog which can steal counters */ 01037 nmi_watchdog_active=_linux_detect_nmi_watchdog(); 01038 if (nmi_watchdog_active) { 01039 SUBDBG("The Linux nmi_watchdog is using one of the performance " 01040 "counters, reducing the total number available.\n"); 01041 } 01042 01043 /* Kernel multiplexing is broken prior to kernel 2.6.34 */ 01044 /* The fix was probably git commit: */ 01045 /* 45e16a6834b6af098702e5ea6c9a40de42ff77d8 */ 01046 if (_papi_os_info.os_version < LINUX_VERSION(2,6,34)) { 01047 _papi_pe_vector.cmp_info.kernel_multiplex = 0; 01048 } 01049 else { 01050 _papi_pe_vector.cmp_info.kernel_multiplex = 1; 01051 } 01052 01053 /* We use the RealTime signal for some reason */ 01054 _papi_pe_vector.cmp_info.hardware_intr_sig = SIGRTMIN + 2; 01055 01056 /* Check that processor is supported */ 01057 if (processor_supported(_papi_hwi_system_info.hw_info.vendor, 01058 _papi_hwi_system_info.hw_info.cpuid_family)!= 01059 PAPI_OK) { 01060 fprintf(stderr,"warning, your processor is unsupported\n"); 01061 /* should not return error, as software events should still work */ 01062 } 01063 01064 /* Setup mmtimers, if appropriate */ 01065 retval=mmtimer_setup(); 01066 if (retval) { 01067 strncpy(_papi_pe_vector.cmp_info.disabled_reason, 01068 "Error initializing mmtimer",PAPI_MAX_STR_LEN); 01069 return retval; 01070 } 01071 01072 /* Detect if we can use rdpmc (or equivalent) */ 01073 /* We currently do not use rdpmc as it is slower in tests */ 01074 /* than regular read (as of Linux 3.5) */ 01075 retval=detect_rdpmc(); 01076 if (retval < 0 ) { 01077 strncpy(_papi_pe_vector.cmp_info.disabled_reason, 01078 "sys_perf_event_open() failed, perf_event support for this platform may be broken",PAPI_MAX_STR_LEN); 01079 return retval; 01080 } 01081 _papi_pe_vector.cmp_info.fast_counter_read = retval; 01082 01083 /* Run Vendor-specific fixups */ 01084 pe_vendor_fixups(); 01085 01086 /* Run the libpfm4-specific setup */ 01087 retval = _papi_libpfm4_init(&_papi_pe_vector, cidx); 01088 
if (retval) { 01089 strncpy(_papi_pe_vector.cmp_info.disabled_reason, 01090 "Error initializing libpfm4",PAPI_MAX_STR_LEN); 01091 return retval; 01092 } 01093 01094 return PAPI_OK; 01095 01096 } 01097 01098 /* Shutdown the perf_event component */ 01099 static int 01100 _papi_pe_shutdown_component( void ) { 01101 01102 /* Shutdown libpfm4 */ 01103 _papi_libpfm4_shutdown(); 01104 01105 return PAPI_OK; 01106 } 01107 01108 01109 /* Initialize a thread */ 01110 static int 01111 _papi_pe_init_thread( hwd_context_t *hwd_ctx ) 01112 { 01113 01114 pe_context_t *pe_ctx = ( pe_context_t *) hwd_ctx; 01115 01116 /* clear the context structure and mark as initialized */ 01117 memset( pe_ctx, 0, sizeof ( pe_context_t ) ); 01118 pe_ctx->initialized=1; 01119 01120 return PAPI_OK; 01121 } 01122 01123 /* Shutdown a thread */ 01124 static int 01125 _papi_pe_shutdown_thread( hwd_context_t *ctx ) 01126 { 01127 pe_context_t *pe_ctx = ( pe_context_t *) ctx; 01128 01129 pe_ctx->initialized=0; 01130 01131 return PAPI_OK; 01132 } 01133 01134 01135 /* reset the hardware counters */ 01136 /* Note: PAPI_reset() does not necessarily call this */ 01137 /* unless the events are actually running. */ 01138 static int 01139 _papi_pe_reset( hwd_context_t *ctx, hwd_control_state_t *ctl ) 01140 { 01141 int i, ret; 01142 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 01143 01144 ( void ) ctx; /*unused */ 01145 01146 /* We need to reset all of the events, not just the group leaders */ 01147 for( i = 0; i < pe_ctl->num_events; i++ ) { 01148 ret = ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_RESET, NULL ); 01149 if ( ret == -1 ) { 01150 PAPIERROR("ioctl(%d, PERF_EVENT_IOC_RESET, NULL) " 01151 "returned error, Linux says: %s", 01152 pe_ctl->events[i].event_fd, strerror( errno ) ); 01153 return PAPI_ESYS; 01154 } 01155 } 01156 01157 return PAPI_OK; 01158 } 01159 01160 01161 /* write (set) the hardware counters */ 01162 /* Current we do not support this. 
*/ 01163 static int 01164 _papi_pe_write( hwd_context_t *ctx, hwd_control_state_t *ctl, 01165 long long *from ) 01166 { 01167 ( void ) ctx; /*unused */ 01168 ( void ) ctl; /*unused */ 01169 ( void ) from; /*unused */ 01170 /* 01171 * Counters cannot be written. Do we need to virtualize the 01172 * counters so that they can be written, or perhaps modify code so that 01173 * they can be written? FIXME ? 01174 */ 01175 01176 return PAPI_ENOSUPP; 01177 } 01178 01179 /* 01180 * perf_event provides a complicated read interface. 01181 * the info returned by read() varies depending on whether 01182 * you have PERF_FORMAT_GROUP, PERF_FORMAT_TOTAL_TIME_ENABLED, 01183 * PERF_FORMAT_TOTAL_TIME_RUNNING, or PERF_FORMAT_ID set 01184 * 01185 * To simplify things we just always ask for everything. This might 01186 * lead to overhead when reading more than we need, but it makes the 01187 * read code a lot simpler than the original implementation we had here. 01188 * 01189 * For more info on the layout see include/linux/perf_event.h 01190 * 01191 */ 01192 01193 static int 01194 _papi_pe_read( hwd_context_t *ctx, hwd_control_state_t *ctl, 01195 long long **events, int flags ) 01196 { 01197 ( void ) flags; /*unused */ 01198 int i, ret = -1; 01199 pe_context_t *pe_ctx = ( pe_context_t *) ctx; 01200 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 01201 long long papi_pe_buffer[READ_BUFFER_SIZE]; 01202 long long tot_time_running, tot_time_enabled, scale; 01203 01204 /* On kernels before 2.6.33 the TOTAL_TIME_ENABLED and TOTAL_TIME_RUNNING */ 01205 /* fields are always 0 unless the counter is disabled. So if we are on */ 01206 /* one of these kernels, then we must disable events before reading. */ 01207 01208 /* Elsewhere though we disable multiplexing on kernels before 2.6.34 */ 01209 /* so maybe this isn't even necessary. 
*/ 01210 01211 if (bug_sync_read()) { 01212 if ( pe_ctx->state & PERF_EVENTS_RUNNING ) { 01213 for ( i = 0; i < pe_ctl->num_events; i++ ) { 01214 /* disable only the group leaders */ 01215 if ( pe_ctl->events[i].group_leader_fd == -1 ) { 01216 ret = ioctl( pe_ctl->events[i].event_fd, 01217 PERF_EVENT_IOC_DISABLE, NULL ); 01218 if ( ret == -1 ) { 01219 PAPIERROR("ioctl(PERF_EVENT_IOC_DISABLE) " 01220 "returned an error: ", strerror( errno )); 01221 return PAPI_ESYS; 01222 } 01223 } 01224 } 01225 } 01226 } 01227 01228 01229 /* Handle case where we are multiplexing */ 01230 if (pe_ctl->multiplexed) { 01231 01232 /* currently we handle multiplexing by having individual events */ 01233 /* so we read from each in turn. */ 01234 01235 for ( i = 0; i < pe_ctl->num_events; i++ ) { 01236 01237 ret = read( pe_ctl->events[i].event_fd, papi_pe_buffer, 01238 sizeof ( papi_pe_buffer ) ); 01239 if ( ret == -1 ) { 01240 PAPIERROR("read returned an error: ", strerror( errno )); 01241 return PAPI_ESYS; 01242 } 01243 01244 /* We should read 3 64-bit values from the counter */ 01245 if (ret<(signed)(3*sizeof(long long))) { 01246 PAPIERROR("Error! 
short read!\n"); 01247 return PAPI_ESYS; 01248 } 01249 01250 SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", 01251 pe_ctl->events[i].event_fd, 01252 (long)pe_ctl->tid, pe_ctl->cpu, ret); 01253 SUBDBG("read: %lld %lld %lld\n",papi_pe_buffer[0], 01254 papi_pe_buffer[1],papi_pe_buffer[2]); 01255 01256 tot_time_enabled = papi_pe_buffer[1]; 01257 tot_time_running = papi_pe_buffer[2]; 01258 01259 SUBDBG("count[%d] = (papi_pe_buffer[%d] %lld * " 01260 "tot_time_enabled %lld) / tot_time_running %lld\n", 01261 i, 0,papi_pe_buffer[0], 01262 tot_time_enabled,tot_time_running); 01263 01264 if (tot_time_running == tot_time_enabled) { 01265 /* No scaling needed */ 01266 pe_ctl->counts[i] = papi_pe_buffer[0]; 01267 } else if (tot_time_running && tot_time_enabled) { 01268 /* Scale factor of 100 to avoid overflows when computing */ 01269 /*enabled/running */ 01270 01271 scale = (tot_time_enabled * 100LL) / tot_time_running; 01272 scale = scale * papi_pe_buffer[0]; 01273 scale = scale / 100LL; 01274 pe_ctl->counts[i] = scale; 01275 } else { 01276 /* This should not happen, but Phil reports it sometime does. */ 01277 SUBDBG("perf_event kernel bug(?) count, enabled, " 01278 "running: %lld, %lld, %lld\n", 01279 papi_pe_buffer[0],tot_time_enabled, 01280 tot_time_running); 01281 01282 pe_ctl->counts[i] = papi_pe_buffer[0]; 01283 } 01284 } 01285 } 01286 01287 /* Handle cases where we cannot use FORMAT GROUP */ 01288 else if (bug_format_group() || pe_ctl->inherit) { 01289 01290 /* we must read each counter individually */ 01291 for ( i = 0; i < pe_ctl->num_events; i++ ) { 01292 01293 ret = read( pe_ctl->events[i].event_fd, papi_pe_buffer, 01294 sizeof ( papi_pe_buffer ) ); 01295 if ( ret == -1 ) { 01296 PAPIERROR("read returned an error: ", strerror( errno )); 01297 return PAPI_ESYS; 01298 } 01299 01300 /* we should read one 64-bit value from each counter */ 01301 if (ret!=sizeof(long long)) { 01302 PAPIERROR("Error! 
short read!\n"); 01303 PAPIERROR("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", 01304 pe_ctl->events[i].event_fd, 01305 (long)pe_ctl->tid, pe_ctl->cpu, ret); 01306 return PAPI_ESYS; 01307 } 01308 01309 SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", 01310 pe_ctl->events[i].event_fd, (long)pe_ctl->tid, 01311 pe_ctl->cpu, ret); 01312 SUBDBG("read: %lld\n",papi_pe_buffer[0]); 01313 01314 pe_ctl->counts[i] = papi_pe_buffer[0]; 01315 } 01316 } 01317 01318 01319 /* Handle cases where we are using FORMAT_GROUP */ 01320 /* We assume only one group leader, in position 0 */ 01321 01322 else { 01323 if (pe_ctl->events[0].group_leader_fd!=-1) { 01324 PAPIERROR("Was expecting group leader!\n"); 01325 } 01326 01327 ret = read( pe_ctl->events[0].event_fd, papi_pe_buffer, 01328 sizeof ( papi_pe_buffer ) ); 01329 01330 if ( ret == -1 ) { 01331 PAPIERROR("read returned an error: ", strerror( errno )); 01332 return PAPI_ESYS; 01333 } 01334 01335 /* we read 1 64-bit value (number of events) then */ 01336 /* num_events more 64-bit values that hold the counts */ 01337 if (ret<(signed)((1+pe_ctl->num_events)*sizeof(long long))) { 01338 PAPIERROR("Error! short read!\n"); 01339 return PAPI_ESYS; 01340 } 01341 01342 SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", 01343 pe_ctl->events[0].event_fd, 01344 (long)pe_ctl->tid, pe_ctl->cpu, ret); 01345 { 01346 int j; 01347 for(j=0;j<ret/8;j++) { 01348 SUBDBG("read %d: %lld\n",j,papi_pe_buffer[j]); 01349 } 01350 } 01351 01352 /* Make sure the kernel agrees with how many events we have */ 01353 if (papi_pe_buffer[0]!=pe_ctl->num_events) { 01354 PAPIERROR("Error! Wrong number of events!\n"); 01355 return PAPI_ESYS; 01356 } 01357 01358 /* put the count values in their proper location */ 01359 for(i=0;i<papi_pe_buffer[0];i++) { 01360 pe_ctl->counts[i] = papi_pe_buffer[1+i]; 01361 } 01362 } 01363 01364 01365 /* If we disabled the counters due to the sync_read_bug(), */ 01366 /* then we need to re-enable them now. 
*/ 01367 if (bug_sync_read()) { 01368 if ( pe_ctx->state & PERF_EVENTS_RUNNING ) { 01369 for ( i = 0; i < pe_ctl->num_events; i++ ) { 01370 if ( pe_ctl->events[i].group_leader_fd == -1 ) { 01371 /* this should refresh any overflow counters too */ 01372 ret = ioctl( pe_ctl->events[i].event_fd, 01373 PERF_EVENT_IOC_ENABLE, NULL ); 01374 if ( ret == -1 ) { 01375 /* Should never happen */ 01376 PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) returned an error: ", 01377 strerror( errno )); 01378 return PAPI_ESYS; 01379 } 01380 } 01381 } 01382 } 01383 } 01384 01385 /* point PAPI to the values we read */ 01386 *events = pe_ctl->counts; 01387 01388 return PAPI_OK; 01389 } 01390 01391 /* Start counting events */ 01392 static int 01393 _papi_pe_start( hwd_context_t *ctx, hwd_control_state_t *ctl ) 01394 { 01395 int ret; 01396 int i; 01397 int did_something = 0; 01398 pe_context_t *pe_ctx = ( pe_context_t *) ctx; 01399 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 01400 01401 /* Reset the counters first. Is this necessary? 
*/ 01402 ret = _papi_pe_reset( pe_ctx, pe_ctl ); 01403 if ( ret ) { 01404 return ret; 01405 } 01406 01407 /* Enable all of the group leaders */ 01408 /* All group leaders have a group_leader_fd of -1 */ 01409 for( i = 0; i < pe_ctl->num_events; i++ ) { 01410 if (pe_ctl->events[i].group_leader_fd == -1) { 01411 SUBDBG("ioctl(enable): fd: %d\n", pe_ctl->events[i].event_fd); 01412 ret=ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_ENABLE, NULL) ; 01413 01414 /* ioctls always return -1 on failure */ 01415 if (ret == -1) { 01416 PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) failed.\n"); 01417 return PAPI_ESYS; 01418 } 01419 01420 did_something++; 01421 } 01422 } 01423 01424 if (!did_something) { 01425 PAPIERROR("Did not enable any counters.\n"); 01426 return PAPI_EBUG; 01427 } 01428 01429 pe_ctx->state |= PERF_EVENTS_RUNNING; 01430 01431 return PAPI_OK; 01432 01433 } 01434 01435 /* Stop all of the counters */ 01436 static int 01437 _papi_pe_stop( hwd_context_t *ctx, hwd_control_state_t *ctl ) 01438 { 01439 01440 int ret; 01441 int i; 01442 pe_context_t *pe_ctx = ( pe_context_t *) ctx; 01443 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 01444 01445 /* Just disable the group leaders */ 01446 for ( i = 0; i < pe_ctl->num_events; i++ ) { 01447 if ( pe_ctl->events[i].group_leader_fd == -1 ) { 01448 ret=ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_DISABLE, NULL); 01449 if ( ret == -1 ) { 01450 PAPIERROR( "ioctl(%d, PERF_EVENT_IOC_DISABLE, NULL) " 01451 "returned error, Linux says: %s", 01452 pe_ctl->events[i].event_fd, strerror( errno ) ); 01453 return PAPI_EBUG; 01454 } 01455 } 01456 } 01457 01458 pe_ctx->state &= ~PERF_EVENTS_RUNNING; 01459 01460 return PAPI_OK; 01461 } 01462 01463 /* Initialize a new control state */ 01464 static int 01465 _papi_pe_init_control_state( hwd_control_state_t *ctl ) 01466 { 01467 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 01468 01469 /* clear the contents */ 01470 memset( pe_ctl, 0, sizeof ( pe_control_t ) ); 01471 _papi_pe_set_domain( 
ctl, _papi_pe_vector.cmp_info.default_domain ); 01472 01473 /* default granularity */ 01474 pe_ctl->granularity=PAPI_GRN_THR; 01475 01476 /* Set cpu number in the control block to show events */ 01477 /* are not tied to specific cpu */ 01478 pe_ctl->cpu = -1; 01479 return PAPI_OK; 01480 } 01481 01482 01483 /* This function clears the current contents of the control structure and 01484 updates it with whatever resources are allocated for all the native events 01485 in the native info structure array. */ 01486 01487 static int 01488 _papi_pe_update_control_state( hwd_control_state_t *ctl, 01489 NativeInfo_t *native, 01490 int count, hwd_context_t *ctx ) 01491 { 01492 int i = 0, ret; 01493 pe_context_t *pe_ctx = ( pe_context_t *) ctx; 01494 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 01495 01496 /* close all of the existing fds and start over again */ 01497 /* In theory we could have finer-grained control and know if */ 01498 /* things were changed, but it's easier to tear things down and rebuild. */ 01499 close_pe_events( pe_ctx, pe_ctl ); 01500 01501 /* Calling with count==0 should be OK, it's how things are deallocated */ 01502 /* when an eventset is destroyed. */ 01503 if ( count == 0 ) { 01504 SUBDBG( "Called with count == 0\n" ); 01505 return PAPI_OK; 01506 } 01507 01508 /* set up all the events */ 01509 for( i = 0; i < count; i++ ) { 01510 if ( native ) { 01511 /* Have libpfm4 set the config values for the event */ 01512 ret=_papi_libpfm4_setup_counters(&pe_ctl->events[i].attr, 01513 native[i].ni_event); 01514 SUBDBG( "pe_ctl->eventss[%d].config=%"PRIx64"\n",i, 01515 pe_ctl->events[i].attr.config); 01516 if (ret!=PAPI_OK) return ret; 01517 01518 } else { 01519 /* I'm not sure how we'd end up in this case */ 01520 /* should it be an error? 
*/ 01521 } 01522 01523 /* Copy the inherit flag into the attribute block that will be */ 01524 /* passed to the kernel */ 01525 pe_ctl->events[i].attr.inherit = pe_ctl->inherit; 01526 01527 /* Set the position in the native structure */ 01528 /* We just set up events linearly */ 01529 if ( native ) { 01530 native[i].ni_position = i; 01531 } 01532 } 01533 01534 pe_ctl->num_events = count; 01535 _papi_pe_set_domain( ctl, pe_ctl->domain ); 01536 01537 /* actuall open the events */ 01538 /* (why is this a separate function?) */ 01539 ret = open_pe_events( pe_ctx, pe_ctl ); 01540 if ( ret != PAPI_OK ) { 01541 SUBDBG("open_pe_events failed\n"); 01542 /* Restore values ? */ 01543 return ret; 01544 } 01545 01546 return PAPI_OK; 01547 } 01548 01549 /* Set various options on a control state */ 01550 static int 01551 _papi_pe_ctl( hwd_context_t *ctx, int code, _papi_int_option_t *option ) 01552 { 01553 int ret; 01554 pe_context_t *pe_ctx = ( pe_context_t *) ctx; 01555 pe_control_t *pe_ctl = NULL; 01556 01557 switch ( code ) { 01558 case PAPI_MULTIPLEX: 01559 pe_ctl = ( pe_control_t * ) ( option->multiplex.ESI->ctl_state ); 01560 if (check_permissions( pe_ctl->tid, pe_ctl->cpu, pe_ctl->domain, 01561 pe_ctl->granularity, 01562 1, pe_ctl->inherit ) != PAPI_OK) { 01563 return PAPI_EPERM; 01564 } 01565 01566 /* looks like we are allowed, so set multiplexed attribute */ 01567 pe_ctl->multiplexed = 1; 01568 ret = _papi_pe_update_control_state( pe_ctl, NULL, 01569 pe_ctl->num_events, pe_ctx ); 01570 if (ret != PAPI_OK) { 01571 pe_ctl->multiplexed = 0; 01572 } 01573 return ret; 01574 01575 case PAPI_ATTACH: 01576 pe_ctl = ( pe_control_t * ) ( option->attach.ESI->ctl_state ); 01577 if (check_permissions( option->attach.tid, pe_ctl->cpu, 01578 pe_ctl->domain, pe_ctl->granularity, 01579 pe_ctl->multiplexed, 01580 pe_ctl->inherit ) != PAPI_OK) { 01581 return PAPI_EPERM; 01582 } 01583 01584 pe_ctl->tid = option->attach.tid; 01585 01586 /* If events have been already been added, something 
may */ 01587 /* have been done to the kernel, so update */ 01588 ret = _papi_pe_update_control_state( pe_ctl, NULL, 01589 pe_ctl->num_events, pe_ctx); 01590 01591 return ret; 01592 01593 case PAPI_DETACH: 01594 pe_ctl = ( pe_control_t *) ( option->attach.ESI->ctl_state ); 01595 01596 pe_ctl->tid = 0; 01597 return PAPI_OK; 01598 01599 case PAPI_CPU_ATTACH: 01600 pe_ctl = ( pe_control_t *) ( option->cpu.ESI->ctl_state ); 01601 if (check_permissions( pe_ctl->tid, option->cpu.cpu_num, 01602 pe_ctl->domain, pe_ctl->granularity, 01603 pe_ctl->multiplexed, 01604 pe_ctl->inherit ) != PAPI_OK) { 01605 return PAPI_EPERM; 01606 } 01607 /* looks like we are allowed so set cpu number */ 01608 01609 /* this tells the kernel not to count for a thread */ 01610 /* should we warn if we try to set both? perf_event */ 01611 /* will reject it. */ 01612 pe_ctl->tid = -1; 01613 01614 pe_ctl->cpu = option->cpu.cpu_num; 01615 01616 return PAPI_OK; 01617 01618 case PAPI_DOMAIN: 01619 pe_ctl = ( pe_control_t *) ( option->domain.ESI->ctl_state ); 01620 if (check_permissions( pe_ctl->tid, pe_ctl->cpu, 01621 option->domain.domain, 01622 pe_ctl->granularity, 01623 pe_ctl->multiplexed, 01624 pe_ctl->inherit ) != PAPI_OK) { 01625 return PAPI_EPERM; 01626 } 01627 /* looks like we are allowed, so set counting domain */ 01628 return _papi_pe_set_domain( pe_ctl, option->domain.domain ); 01629 01630 case PAPI_GRANUL: 01631 pe_ctl = (pe_control_t *) ( option->granularity.ESI->ctl_state ); 01632 01633 /* FIXME: we really don't support this yet */ 01634 01635 switch ( option->granularity.granularity ) { 01636 case PAPI_GRN_PROCG: 01637 case PAPI_GRN_SYS_CPU: 01638 case PAPI_GRN_PROC: 01639 return PAPI_ECMP; 01640 01641 /* Currently we only support thread and CPU granularity */ 01642 case PAPI_GRN_SYS: 01643 pe_ctl->granularity=PAPI_GRN_SYS; 01644 break; 01645 01646 case PAPI_GRN_THR: 01647 pe_ctl->granularity=PAPI_GRN_THR; 01648 break; 01649 01650 01651 default: 01652 return PAPI_EINVAL; 01653 } 01654 
return PAPI_OK; 01655 01656 case PAPI_INHERIT: 01657 pe_ctl = (pe_control_t *) ( option->inherit.ESI->ctl_state ); 01658 if (check_permissions( pe_ctl->tid, pe_ctl->cpu, pe_ctl->domain, 01659 pe_ctl->granularity, pe_ctl->multiplexed, 01660 option->inherit.inherit ) != PAPI_OK) { 01661 return PAPI_EPERM; 01662 } 01663 /* looks like we are allowed, so set the requested inheritance */ 01664 if (option->inherit.inherit) { 01665 /* children will inherit counters */ 01666 pe_ctl->inherit = 1; 01667 } else { 01668 /* children won't inherit counters */ 01669 pe_ctl->inherit = 0; 01670 } 01671 return PAPI_OK; 01672 01673 case PAPI_DATA_ADDRESS: 01674 return PAPI_ENOSUPP; 01675 #if 0 01676 pe_ctl = (pe_control_t *) (option->address_range.ESI->ctl_state); 01677 ret = set_default_domain( pe_ctl, option->address_range.domain ); 01678 if ( ret != PAPI_OK ) { 01679 return ret; 01680 } 01681 set_drange( pe_ctx, pe_ctl, option ); 01682 return PAPI_OK; 01683 #endif 01684 case PAPI_INSTR_ADDRESS: 01685 return PAPI_ENOSUPP; 01686 #if 0 01687 pe_ctl = (pe_control_t *) (option->address_range.ESI->ctl_state); 01688 ret = set_default_domain( pe_ctl, option->address_range.domain ); 01689 if ( ret != PAPI_OK ) { 01690 return ret; 01691 } 01692 set_irange( pe_ctx, pe_ctl, option ); 01693 return PAPI_OK; 01694 #endif 01695 01696 case PAPI_DEF_ITIMER: 01697 /* What should we be checking for here? */ 01698 /* This seems like it should be OS-specific not component */ 01699 /* specific. */ 01700 01701 return PAPI_OK; 01702 01703 case PAPI_DEF_MPX_NS: 01704 /* Defining a given ns per set is not current supported */ 01705 return PAPI_ENOSUPP; 01706 01707 case PAPI_DEF_ITIMER_NS: 01708 /* We don't support this... 
*/ 01709 return PAPI_OK; 01710 01711 default: 01712 return PAPI_ENOSUPP; 01713 } 01714 } 01715 01716 01717 /* 01718 * This function is used when hardware overflows are working or when 01719 * software overflows are forced 01720 */ 01721 01722 static void 01723 _papi_pe_dispatch_timer( int n, hwd_siginfo_t *info, void *uc ) 01724 { 01725 ( void ) n; /*unused */ 01726 _papi_hwi_context_t hw_context; 01727 int found_evt_idx = -1, fd = info->si_fd; 01728 caddr_t address; 01729 ThreadInfo_t *thread = _papi_hwi_lookup_thread( 0 ); 01730 int cidx = _papi_pe_vector.cmp_info.CmpIdx; 01731 int i; 01732 pe_control_t *ctl; 01733 01734 if ( thread == NULL ) { 01735 PAPIERROR( "thread == NULL in _papi_pe_dispatch_timer for fd %d!", fd ); 01736 return; 01737 } 01738 01739 if ( thread->running_eventset[cidx] == NULL ) { 01740 PAPIERROR( "thread->running_eventset == NULL in " 01741 "_papi_pe_dispatch_timer for fd %d!",fd ); 01742 return; 01743 } 01744 01745 if ( thread->running_eventset[cidx]->overflow.flags == 0 ) { 01746 PAPIERROR( "thread->running_eventset->overflow.flags == 0 in " 01747 "_papi_pe_dispatch_timer for fd %d!", fd ); 01748 return; 01749 } 01750 01751 hw_context.si = info; 01752 hw_context.ucontext = ( hwd_ucontext_t * ) uc; 01753 01754 if ( thread->running_eventset[cidx]->overflow.flags & 01755 PAPI_OVERFLOW_FORCE_SW ) { 01756 address = GET_OVERFLOW_ADDRESS( hw_context ); 01757 _papi_hwi_dispatch_overflow_signal( ( void * ) &hw_context, 01758 address, NULL, 0, 01759 0, &thread, cidx ); 01760 return; 01761 } 01762 01763 if ( thread->running_eventset[cidx]->overflow.flags != 01764 PAPI_OVERFLOW_HARDWARE ) { 01765 PAPIERROR( "thread->running_eventset->overflow.flags is set to " 01766 "something other than PAPI_OVERFLOW_HARDWARE or " 01767 "PAPI_OVERFLOW_FORCE_SW for fd %d (%x)", 01768 fd , thread->running_eventset[cidx]->overflow.flags); 01769 } 01770 01771 /* convoluted way to get ctl */ 01772 ctl= thread->running_eventset[cidx]->ctl_state; 01773 01774 /* See if the 
fd is one that's part of the this thread's context */ 01775 for( i=0; i < ctl->num_events; i++ ) { 01776 if ( fd == ctl->events[i].event_fd ) { 01777 found_evt_idx = i; 01778 break; 01779 } 01780 } 01781 01782 if ( found_evt_idx == -1 ) { 01783 PAPIERROR( "Unable to find fd %d among the open event fds " 01784 "_papi_hwi_dispatch_timer!", fd ); 01785 return; 01786 } 01787 01788 ioctl( fd, PERF_EVENT_IOC_DISABLE, NULL ); 01789 01790 if ( ( thread->running_eventset[cidx]->state & PAPI_PROFILING ) && 01791 !( thread->running_eventset[cidx]->profile.flags & 01792 PAPI_PROFIL_FORCE_SW ) ) { 01793 process_smpl_buf( found_evt_idx, &thread ); 01794 } 01795 else { 01796 uint64_t ip; 01797 unsigned int head; 01798 pe_event_info_t *pe = &(ctl->events[found_evt_idx]); 01799 unsigned char *data = ((unsigned char*)pe->mmap_buf) + getpagesize( ); 01800 01801 /* 01802 * Read up the most recent IP from the sample in the mmap buffer. To 01803 * do this, we make the assumption that all of the records in the 01804 * mmap buffer are the same size, and that they all contain the IP as 01805 * their only record element. This means that we can use the 01806 * data_head element from the user page and move backward one record 01807 * from that point and read the data. Since we don't actually need 01808 * to access the header of the record, we can just subtract 8 (size 01809 * of the IP) from data_head and read up that word from the mmap 01810 * buffer. After we subtract 8, we account for mmap buffer wrapping 01811 * by AND'ing this offset with the buffer mask. 01812 */ 01813 head = mmap_read_head( pe ); 01814 01815 if ( head == 0 ) { 01816 PAPIERROR( "Attempting to access memory which may be inaccessable" ); 01817 return; 01818 } 01819 01820 ip = *( uint64_t * ) ( data + ( ( head - 8 ) & pe->mask ) ); 01821 /* 01822 * Update the tail to the current head pointer. 
01823 * 01824 * Note: that if we were to read the record at the tail pointer, 01825 * rather than the one at the head (as you might otherwise think 01826 * would be natural), we could run into problems. Signals don't 01827 * stack well on Linux, particularly if not using RT signals, and if 01828 * they come in rapidly enough, we can lose some. Overtime, the head 01829 * could catch up to the tail and monitoring would be stopped, and 01830 * since no more signals are coming in, this problem will never be 01831 * resolved, resulting in a complete loss of overflow notification 01832 * from that point on. So the solution we use here will result in 01833 * only the most recent IP value being read every time there are two 01834 * or more samples in the buffer (for that one overflow signal). But 01835 * the handler will always bring up the tail, so the head should 01836 * never run into the tail. 01837 */ 01838 mmap_write_tail( pe, head ); 01839 01840 /* 01841 * The fourth parameter is supposed to be a vector of bits indicating 01842 * the overflowed hardware counters, but it's not really clear that 01843 * it's useful, because the actual hardware counters used are not 01844 * exposed to the PAPI user. For now, I'm just going to set the bit 01845 * that indicates which event register in the array overflowed. The 01846 * result is that the overflow vector will not be identical to the 01847 * perfmon implementation, and part of that is due to the fact that 01848 * which hardware register is actually being used is opaque at the 01849 * user level (the kernel event dispatcher hides that info). 
01850 */ 01851 01852 _papi_hwi_dispatch_overflow_signal( ( void * ) &hw_context, 01853 ( caddr_t ) ( unsigned long ) ip, 01854 NULL, ( 1 << found_evt_idx ), 0, 01855 &thread, cidx ); 01856 01857 } 01858 01859 /* Restart the counters */ 01860 if (ioctl( fd, PERF_EVENT_IOC_REFRESH, PAPI_REFRESH_VALUE ) == -1) { 01861 PAPIERROR( "overflow refresh failed", 0 ); 01862 } 01863 } 01864 01865 /* Stop profiling */ 01866 static int 01867 _papi_pe_stop_profiling( ThreadInfo_t *thread, EventSetInfo_t *ESI ) 01868 { 01869 int i, ret = PAPI_OK; 01870 pe_control_t *ctl; 01871 01872 ctl=ESI->ctl_state; 01873 01874 /* Loop through all of the events and process those which have mmap */ 01875 /* buffers attached. */ 01876 for ( i = 0; i < ctl->num_events; i++ ) { 01877 /* Use the mmap_buf field as an indicator of this fd being used for */ 01878 /* profiling. */ 01879 if ( ctl->events[i].mmap_buf ) { 01880 /* Process any remaining samples in the sample buffer */ 01881 ret = process_smpl_buf( i, &thread ); 01882 if ( ret ) { 01883 PAPIERROR( "process_smpl_buf returned error %d", ret ); 01884 return ret; 01885 } 01886 } 01887 } 01888 return ret; 01889 } 01890 01891 01892 /* Setup an event to cause overflow */ 01893 static int 01894 _papi_pe_set_overflow( EventSetInfo_t *ESI, int EventIndex, int threshold ) 01895 { 01896 int cidx = _papi_pe_vector.cmp_info.CmpIdx; 01897 pe_context_t *ctx = ( pe_context_t *) ( ESI->master->context[cidx] ); 01898 pe_control_t *ctl = (pe_control_t *) ( ESI->ctl_state ); 01899 int i, evt_idx, found_non_zero_sample_period = 0, retval = PAPI_OK; 01900 01901 evt_idx = ESI->EventInfoArray[EventIndex].pos[0]; 01902 01903 SUBDBG("Attempting to set overflow for index %d (%d) of EventSet %d\n", 01904 evt_idx,EventIndex,ESI->EventSetIndex); 01905 01906 if (evt_idx<0) { 01907 return PAPI_EINVAL; 01908 } 01909 01910 if ( threshold == 0 ) { 01911 /* If this counter isn't set to overflow, it's an error */ 01912 if ( ctl->events[evt_idx].attr.sample_period == 0 ) return 
PAPI_EINVAL; 01913 } 01914 01915 ctl->events[evt_idx].attr.sample_period = threshold; 01916 01917 /* 01918 * Note that the wakeup_mode field initially will be set to zero 01919 * (WAKEUP_MODE_COUNTER_OVERFLOW) as a result of a call to memset 0 to 01920 * all of the events in the ctl struct. 01921 * 01922 * Is it even set to any other value elsewhere? 01923 */ 01924 switch ( ctl->events[evt_idx].wakeup_mode ) { 01925 case WAKEUP_MODE_PROFILING: 01926 /* Setting wakeup_events to special value zero means issue a */ 01927 /* wakeup (signal) on every mmap page overflow. */ 01928 ctl->events[evt_idx].attr.wakeup_events = 0; 01929 break; 01930 01931 case WAKEUP_MODE_COUNTER_OVERFLOW: 01932 /* Can this code ever be called? */ 01933 01934 /* Setting wakeup_events to one means issue a wakeup on every */ 01935 /* counter overflow (not mmap page overflow). */ 01936 ctl->events[evt_idx].attr.wakeup_events = 1; 01937 /* We need the IP to pass to the overflow handler */ 01938 ctl->events[evt_idx].attr.sample_type = PERF_SAMPLE_IP; 01939 /* one for the user page, and two to take IP samples */ 01940 ctl->events[evt_idx].nr_mmap_pages = 1 + 2; 01941 break; 01942 default: 01943 PAPIERROR( "ctl->wakeup_mode[%d] set to an unknown value - %u", 01944 evt_idx, ctl->events[evt_idx].wakeup_mode); 01945 return PAPI_EBUG; 01946 } 01947 01948 /* Check for non-zero sample period */ 01949 for ( i = 0; i < ctl->num_events; i++ ) { 01950 if ( ctl->events[evt_idx].attr.sample_period ) { 01951 found_non_zero_sample_period = 1; 01952 break; 01953 } 01954 } 01955 01956 if ( found_non_zero_sample_period ) { 01957 /* turn on internal overflow flag for this event set */ 01958 ctl->overflow = 1; 01959 01960 /* Enable the signal handler */ 01961 retval = _papi_hwi_start_signal( 01962 _papi_pe_vector.cmp_info.hardware_intr_sig, 01963 1, _papi_pe_vector.cmp_info.CmpIdx ); 01964 } else { 01965 /* turn off internal overflow flag for this event set */ 01966 ctl->overflow = 0; 01967 01968 /* Remove the signal 
handler, if there are no remaining non-zero */ 01969 /* sample_periods set */ 01970 retval = _papi_hwi_stop_signal( 01971 _papi_pe_vector.cmp_info.hardware_intr_sig ); 01972 if ( retval != PAPI_OK ) return retval; 01973 } 01974 01975 retval = _papi_pe_update_control_state( ctl, NULL, 01976 ( (pe_control_t *) (ESI->ctl_state) )->num_events, 01977 ctx ); 01978 01979 return retval; 01980 } 01981 01982 /* Enable profiling */ 01983 static int 01984 _papi_pe_set_profile( EventSetInfo_t *ESI, int EventIndex, int threshold ) 01985 { 01986 int ret; 01987 int evt_idx; 01988 pe_control_t *ctl = ( pe_control_t *) ( ESI->ctl_state ); 01989 01990 /* Since you can't profile on a derived event, the event is always the */ 01991 /* first and only event in the native event list. */ 01992 evt_idx = ESI->EventInfoArray[EventIndex].pos[0]; 01993 01994 if ( threshold == 0 ) { 01995 SUBDBG( "MUNMAP(%p,%"PRIu64")\n", ctl->events[evt_idx].mmap_buf, 01996 ( uint64_t ) ctl->events[evt_idx].nr_mmap_pages * 01997 getpagesize( ) ); 01998 01999 if ( ctl->events[evt_idx].mmap_buf ) { 02000 munmap( ctl->events[evt_idx].mmap_buf, 02001 ctl->events[evt_idx].nr_mmap_pages * getpagesize() ); 02002 } 02003 02004 ctl->events[evt_idx].mmap_buf = NULL; 02005 ctl->events[evt_idx].nr_mmap_pages = 0; 02006 ctl->events[evt_idx].attr.sample_type &= ~PERF_SAMPLE_IP; 02007 ret = _papi_pe_set_overflow( ESI, EventIndex, threshold ); 02008 /* ??? #warning "This should be handled somewhere else" */ 02009 ESI->state &= ~( PAPI_OVERFLOWING ); 02010 ESI->overflow.flags &= ~( PAPI_OVERFLOW_HARDWARE ); 02011 02012 return ret; 02013 } 02014 02015 /* Look up the native event code */ 02016 if ( ESI->profile.flags & (PAPI_PROFIL_DATA_EAR | PAPI_PROFIL_INST_EAR)) { 02017 /* Not supported yet... */ 02018 02019 return PAPI_ENOSUPP; 02020 } 02021 02022 if ( ESI->profile.flags & PAPI_PROFIL_RANDOM ) { 02023 /* This requires an ability to randomly alter the sample_period within */ 02024 /* a given range. 
Kernel does not have this ability. FIXME */ 02025 return PAPI_ENOSUPP; 02026 } 02027 02028 /* Just a guess at how many pages would make this relatively efficient. */ 02029 /* Note that it's "1 +" because of the need for a control page, and the */ 02030 /* number following the "+" must be a power of 2 (1, 4, 8, 16, etc) or */ 02031 /* zero. This is required to optimize dealing with circular buffer */ 02032 /* wrapping of the mapped pages. */ 02033 02034 ctl->events[evt_idx].nr_mmap_pages = (1+8); 02035 ctl->events[evt_idx].attr.sample_type |= PERF_SAMPLE_IP; 02036 02037 ret = _papi_pe_set_overflow( ESI, EventIndex, threshold ); 02038 if ( ret != PAPI_OK ) return ret; 02039 02040 return PAPI_OK; 02041 } 02042 02043 02044 /* Our component vector */ 02045 02046 papi_vector_t _papi_pe_vector = { 02047 .cmp_info = { 02048 /* component information (unspecified values initialized to 0) */ 02049 .name = "perf_events", 02050 .short_name = "pe", 02051 .version = "5.0", 02052 .description = "Linux perf_event CPU counters", 02053 02054 .default_domain = PAPI_DOM_USER, 02055 .available_domains = PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR, 02056 .default_granularity = PAPI_GRN_THR, 02057 .available_granularities = PAPI_GRN_THR | PAPI_GRN_SYS, 02058 02059 .hardware_intr = 1, 02060 .kernel_profile = 1, 02061 .num_mpx_cntrs = PERF_EVENT_MAX_MPX_COUNTERS, 02062 02063 /* component specific cmp_info initializations */ 02064 .fast_virtual_timer = 0, 02065 .attach = 1, 02066 .attach_must_ptrace = 1, 02067 .cpu = 1, 02068 .inherit = 1, 02069 .cntr_umasks = 1, 02070 02071 }, 02072 02073 /* sizes of framework-opaque component-private structures */ 02074 .size = { 02075 .context = sizeof ( pe_context_t ), 02076 .control_state = sizeof ( pe_control_t ), 02077 .reg_value = sizeof ( int ), 02078 .reg_alloc = sizeof ( int ), 02079 }, 02080 02081 /* function pointers in this component */ 02082 .init_control_state = _papi_pe_init_control_state, 02083 .start = _papi_pe_start, 02084 .stop 
= _papi_pe_stop, 02085 .read = _papi_pe_read, 02086 .shutdown_thread = _papi_pe_shutdown_thread, 02087 .shutdown_component = _papi_pe_shutdown_component, 02088 .ctl = _papi_pe_ctl, 02089 .update_control_state = _papi_pe_update_control_state, 02090 .set_domain = _papi_pe_set_domain, 02091 .reset = _papi_pe_reset, 02092 .set_overflow = _papi_pe_set_overflow, 02093 .set_profile = _papi_pe_set_profile, 02094 .stop_profiling = _papi_pe_stop_profiling, 02095 .init_component = _papi_pe_init_component, 02096 .dispatch_timer = _papi_pe_dispatch_timer, 02097 .write = _papi_pe_write, 02098 .init_thread = _papi_pe_init_thread, 02099 02100 /* from counter name mapper */ 02101 .ntv_enum_events = _papi_libpfm4_ntv_enum_events, 02102 .ntv_name_to_code = _papi_libpfm4_ntv_name_to_code, 02103 .ntv_code_to_name = _papi_libpfm4_ntv_code_to_name, 02104 .ntv_code_to_descr = _papi_libpfm4_ntv_code_to_descr, 02105 .ntv_code_to_info = _papi_libpfm4_ntv_code_to_info, 02106 };