PAPI 5.0.1.0
00001 /* 00002 * File: perf_events.c 00003 * 00004 * Author: Corey Ashford 00005 * cjashfor@us.ibm.com 00006 * - based upon perfmon.c written by - 00007 * Philip Mucci 00008 * mucci@cs.utk.edu 00009 * Mods: Gary Mohr 00010 * gary.mohr@bull.com 00011 * Mods: Vince Weaver 00012 * vweaver1@eecs.utk.edu 00013 * Mods: Philip Mucci 00014 * mucci@eecs.utk.edu */ 00015 00016 00017 #include <fcntl.h> 00018 #include <string.h> 00019 #include <errno.h> 00020 #include <signal.h> 00021 #include <syscall.h> 00022 #include <sys/utsname.h> 00023 #include <sys/mman.h> 00024 #include <sys/ioctl.h> 00025 00026 /* PAPI-specific includes */ 00027 #include "papi.h" 00028 #include "papi_memory.h" 00029 #include "papi_internal.h" 00030 #include "papi_vector.h" 00031 #include "extras.h" 00032 00033 /* libpfm4 includes */ 00034 #include "papi_libpfm4_events.h" 00035 #include "perfmon/pfmlib.h" 00036 #include PEINCLUDE 00037 00038 /* Linux-specific includes */ 00039 #include "mb.h" 00040 #include "syscalls.h" 00041 #include "linux-memory.h" 00042 #include "linux-timer.h" 00043 #include "linux-common.h" 00044 #include "linux-context.h" 00045 00046 /* Various definitions */ 00047 00048 /* This is arbitrary. Typically you can add up to ~1000 before */ 00049 /* you run out of fds */ 00050 #define PERF_EVENT_MAX_MPX_COUNTERS 64 00051 00052 /* We really don't need fancy definitions for these */ 00053 00054 typedef struct 00055 { 00056 int group_leader_fd; /* fd of group leader */ 00057 int event_fd; /* fd of event */ 00058 int event_opened; /* event successfully opened */ 00059 uint32_t nr_mmap_pages; /* number pages in the mmap buffer */ 00060 void *mmap_buf; /* used for control/profiling */ 00061 uint64_t tail; /* current read location in mmap buffer */ 00062 uint64_t mask; /* mask used for wrapping the pages */ 00063 struct perf_event_attr attr; /* perf_event config structure */ 00064 unsigned int wakeup_mode; /* wakeup mode when sampling */ 00065 } pe_event_info_t; 00066 00067 typedef struct 00068 { 00069 int num_events; /* number of events in control state */ 00070 unsigned int domain; /* control-state wide domain */ 00071 unsigned int multiplexed; /* multiplexing enable */ 00072 unsigned int overflow; /* overflow enable */ 00073 unsigned int inherit; /* inherit enable */ 00074 int cpu; /* which cpu to measure */ 00075 pid_t tid; /* thread we are monitoring */ 00076 pe_event_info_t events[PERF_EVENT_MAX_MPX_COUNTERS]; 00077 long long counts[PERF_EVENT_MAX_MPX_COUNTERS]; 00078 } pe_control_t; 00079 00080 typedef struct 00081 { 00082 int initialized; /* are we initialized? */ 00083 int state; /* are we opened and/or running? */ 00084 } pe_context_t; 00085 00086 /* These sentinels tell papi_pe_set_overflow() how to set the */ 00087 /* wakeup_events field in the event descriptor record. */ 00088 00089 #define WAKEUP_COUNTER_OVERFLOW 0 00090 #define WAKEUP_PROFILING -1 00091 00092 #define WAKEUP_MODE_COUNTER_OVERFLOW 0 00093 #define WAKEUP_MODE_PROFILING 1 00094 00095 /* Defines for ctx->state */ 00096 #define PERF_EVENTS_OPENED 0x01 00097 #define PERF_EVENTS_RUNNING 0x02 00098 00099 /* Static globals */ 00100 static int nmi_watchdog_active; 00101 00102 /* Advance declaration */ 00103 papi_vector_t _papi_pe_vector; 00104 00105 00106 /******** Kernel Version Dependent Routines **********************/ 00107 00108 /* KERNEL_CHECKS_SCHEDUABILITY_UPON_OPEN is a work-around for kernel arch 00109 * implementations (e.g. x86) which don't do a static event scheduability 00110 * check in sys_perf_event_open. 
00111 * This was fixed for x86 in the 2.6.33 kernel 00112 * 00113 * Also! Kernels newer than 2.6.34 will fail in a similar way 00114 * if the nmi_watchdog has stolen a performance counter 00115 * and we try to use the maximum number of counters. 00116 * A sys_perf_open() will seem to succeed but will fail 00117 * at read time. So re-use this work around code. 00118 */ 00119 static int 00120 bug_check_scheduability(void) { 00121 00122 #if defined(__powerpc__) 00123 /* PowerPC not affected by this bug */ 00124 #elif defined(__mips__) 00125 /* MIPS as of kernel 3.1 does not properly detect schedulability */ 00126 return 1; 00127 #else 00128 if (_papi_os_info.os_version < LINUX_VERSION(2,6,33)) return 1; 00129 #endif 00130 00131 if (nmi_watchdog_active) return 1; 00132 00133 return 0; 00134 } 00135 00136 /* PERF_FORMAT_GROUP allows reading an entire group's counts at once */ 00137 /* before 2.6.34 PERF_FORMAT_GROUP did not work when reading results */ 00138 /* from attached processes. We are lazy and disable it for all cases */ 00139 /* commit was: 050735b08ca8a016bbace4445fa025b88fee770b */ 00140 00141 static int 00142 bug_format_group(void) { 00143 00144 if (_papi_os_info.os_version < LINUX_VERSION(2,6,34)) return 1; 00145 00146 /* MIPS, as of version 3.1, does not support this properly */ 00147 00148 #if defined(__mips__) 00149 return 1; 00150 #endif 00151 00152 return 0; 00153 00154 } 00155 00156 00157 /* There's a bug prior to Linux 2.6.33 where if you are using */ 00158 /* PERF_FORMAT_GROUP, the TOTAL_TIME_ENABLED and */ 00159 /* TOTAL_TIME_RUNNING fields will be zero unless you disable */ 00160 /* the counters first */ 00161 static int 00162 bug_sync_read(void) { 00163 00164 if (_papi_os_info.os_version < LINUX_VERSION(2,6,33)) return 1; 00165 00166 return 0; 00167 00168 } 00169 00170 00171 /* Set the F_SETOWN_EX flag on the fd. */ 00172 /* This affects which thread an overflow signal gets sent to */ 00173 /* Handled in a subroutine to handle the fact that the behavior */ 00174 /* is dependent on kernel version. */ 00175 static int 00176 fcntl_setown_fd(int fd) { 00177 00178 int ret; 00179 struct f_owner_ex fown_ex; 00180 00181 /* F_SETOWN_EX is not available until 2.6.32 */ 00182 if (_papi_os_info.os_version < LINUX_VERSION(2,6,32)) { 00183 00184 /* get ownership of the descriptor */ 00185 ret = fcntl( fd, F_SETOWN, mygettid( ) ); 00186 if ( ret == -1 ) { 00187 PAPIERROR( "cannot fcntl(F_SETOWN) on %d: %s", fd, strerror(errno) ); 00188 return PAPI_ESYS; 00189 } 00190 } 00191 else { 00192 /* set ownership of the descriptor */ 00193 fown_ex.type = F_OWNER_TID; 00194 fown_ex.pid = mygettid(); 00195 ret = fcntl(fd, F_SETOWN_EX, (unsigned long)&fown_ex ); 00196 00197 if ( ret == -1 ) { 00198 PAPIERROR( "cannot fcntl(F_SETOWN_EX) on %d: %s", 00199 fd, strerror( errno ) ); 00200 return PAPI_ESYS; 00201 } 00202 } 00203 return PAPI_OK; 00204 } 00205 00206 /* Check for processor support */ 00207 /* Can be used for generic checking, though in general we only */ 00208 /* check for pentium4 here because support was broken for multiple */ 00209 /* kernel releases and the usual standard detections did not */ 00210 /* handle this. So we check for pentium 4 explicitly. 
*/ 00211 static int 00212 processor_supported(int vendor, int family) { 00213 00214 /* Error out if kernel too early to support p4 */ 00215 if (( vendor == PAPI_VENDOR_INTEL ) && (family == 15)) { 00216 if (_papi_os_info.os_version < LINUX_VERSION(2,6,35)) { 00217 PAPIERROR("Pentium 4 not supported on kernels before 2.6.35"); 00218 return PAPI_ENOSUPP; 00219 } 00220 } 00221 return PAPI_OK; 00222 } 00223 00224 00225 /* The read format on perf_event varies based on various flags that */ 00226 /* are passed into it. This helper avoids copying this logic */ 00227 /* multiple places. */ 00228 static unsigned int 00229 get_read_format( unsigned int multiplex, 00230 unsigned int inherit, 00231 int format_group ) 00232 { 00233 unsigned int format = 0; 00234 00235 /* if we need read format options for multiplexing, add them now */ 00236 if (multiplex) { 00237 format |= PERF_FORMAT_TOTAL_TIME_ENABLED; 00238 format |= PERF_FORMAT_TOTAL_TIME_RUNNING; 00239 } 00240 00241 /* if our kernel supports it and we are not using inherit, */ 00242 /* add the group read options */ 00243 if ( (!bug_format_group()) && !inherit) { 00244 if (format_group) { 00245 format |= PERF_FORMAT_GROUP; 00246 } 00247 } 00248 00249 SUBDBG("multiplex: %d, inherit: %d, group_leader: %d, format: 0x%x\n", 00250 multiplex, inherit, format_group, format); 00251 00252 return format; 00253 } 00254 00255 00256 /********* End Kernel-version Dependent Routines ****************/ 00257 00258 00260 /* perf_events. */ 00261 /* We do this by temporarily opening an event with the */ 00262 /* desired options then closing it again. We use the */ 00263 /* PERF_COUNT_HW_INSTRUCTION event as a dummy event */ 00264 /* on the assumption it is available on all */ 00265 /* platforms. */ 00266 00267 static int 00268 check_permissions( unsigned long tid, unsigned int cpu_num, 00269 unsigned int domain, unsigned int multiplex, 00270 unsigned int inherit ) 00271 { 00272 int ev_fd; 00273 struct perf_event_attr attr; 00274 00275 /* clearing this will set a type of hardware and to count all domains */ 00276 memset(&attr, '\0', sizeof(attr)); 00277 attr.read_format = get_read_format(multiplex, inherit, 1); 00278 00279 /* set the event id (config field) to instructios */ 00280 /* (an event that should always exist) */ 00281 /* This was cycles but that is missing on Niagara */ 00282 attr.config = PERF_COUNT_HW_INSTRUCTIONS; 00283 00284 /* now set up domains this event set will be counting */ 00285 if (!(domain & PAPI_DOM_SUPERVISOR)) { 00286 attr.exclude_hv = 1; 00287 } 00288 if (!(domain & PAPI_DOM_USER)) { 00289 attr.exclude_user = 1; 00290 } 00291 if (!(domain & PAPI_DOM_KERNEL)) { 00292 attr.exclude_kernel = 1; 00293 } 00294 00295 SUBDBG("Calling sys_perf_event_open() from check_permissions\n"); 00296 00297 ev_fd = sys_perf_event_open( &attr, tid, cpu_num, -1, 0 ); 00298 if ( ev_fd == -1 ) { 00299 SUBDBG("sys_perf_event_open returned error. Linux says, %s", 00300 strerror( errno ) ); 00301 return PAPI_EPERM; 00302 } 00303 00304 /* now close it, this was just to make sure we have permissions */ 00305 /* to set these options */ 00306 close(ev_fd); 00307 return PAPI_OK; 00308 } 00309 00310 00311 00312 /* Maximum size we ever expect to read from a perf_event fd */ 00313 /* (this is the number of 64-bit values) */ 00314 /* We use this to size the read buffers */ 00315 /* The three is for event count, time_enabled, time_running */ 00316 /* and the counter term is count value and count id for each */ 00317 /* possible counter value. 
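 *
 * A worked example of this sizing (a sketch based on the documented
 * perf_event read_format layout; the two-words-per-counter term assumes
 * the worst case where PERF_FORMAT_ID is requested in addition to
 * PERF_FORMAT_GROUP and both time fields, which get_read_format() above
 * does not actually set): a group read of n events returns
 *
 *     nr | time_enabled | time_running | value0 id0 | ... | value(n-1) id(n-1)
 *
 * i.e. 3 + 2*n 64-bit words, so with PERF_EVENT_MAX_MPX_COUNTERS = 64
 * the largest read we can ever see is 3 + 2*64 = 131 words.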
*/ 00318 #define READ_BUFFER_SIZE (3 + (2 * PERF_EVENT_MAX_MPX_COUNTERS)) 00319 00320 00321 00322 /* KERNEL_CHECKS_SCHEDUABILITY_UPON_OPEN is a work-around for kernel arch */ 00323 /* implementations (e.g. x86 before 2.6.33) which don't do a static event */ 00324 /* scheduability check in sys_perf_event_open. It is also needed if the */ 00325 /* kernel is stealing an event, such as when NMI watchdog is enabled. */ 00326 00327 static int 00328 check_scheduability( pe_context_t *ctx, pe_control_t *ctl, int idx ) 00329 { 00330 int retval = 0, cnt = -1; 00331 ( void ) ctx; /*unused */ 00332 long long papi_pe_buffer[READ_BUFFER_SIZE]; 00333 int i,group_leader_fd; 00334 00335 if (bug_check_scheduability()) { 00336 00337 /* If the kernel isn't tracking scheduability right */ 00338 /* Then we need to start/stop/read to force the event */ 00339 /* to be scheduled and see if an error condition happens. */ 00340 00341 /* get the proper fd to start */ 00342 group_leader_fd=ctl->events[idx].group_leader_fd; 00343 if (group_leader_fd==-1) group_leader_fd=ctl->events[idx].event_fd; 00344 00345 /* start the event */ 00346 retval = ioctl( group_leader_fd, PERF_EVENT_IOC_ENABLE, NULL ); 00347 if (retval == -1) { 00348 PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) failed.\n"); 00349 return PAPI_ESYS; 00350 } 00351 00352 /* stop the event */ 00353 retval = ioctl(group_leader_fd, PERF_EVENT_IOC_DISABLE, NULL ); 00354 if (retval == -1) { 00355 PAPIERROR( "ioctl(PERF_EVENT_IOC_DISABLE) failed.\n" ); 00356 return PAPI_ESYS; 00357 } 00358 00359 /* See if a read returns any results */ 00360 cnt = read( group_leader_fd, papi_pe_buffer, sizeof(papi_pe_buffer)); 00361 if ( cnt == -1 ) { 00362 SUBDBG( "read returned an error! Should never happen.\n" ); 00363 return PAPI_ESYS; 00364 } 00365 00366 if ( cnt == 0 ) { 00367 /* We read 0 bytes if we could not schedule the event */ 00368 /* The kernel should have detected this at open */ 00369 /* but various bugs (including NMI watchdog) */ 00370 /* result in this behavior */ 00371 00372 return PAPI_ECNFLCT; 00373 00374 } else { 00375 00376 /* Reset all of the counters (opened so far) back to zero */ 00377 /* from the above brief enable/disable call pair. */ 00378 00379 /* We have to reset all events because reset of group leader */ 00380 /* does not reset all. */ 00381 /* we assume that the events are being added one by one and that */ 00382 /* we do not need to reset higher events (doing so may reset ones */ 00383 /* that have not been initialized yet. */ 00384 00385 /* Note... PERF_EVENT_IOC_RESET does not reset time running */ 00386 /* info if multiplexing, so we should avoid coming here if */ 00387 /* we are multiplexing the event. */ 00388 for( i = 0; i < idx; i++) { 00389 retval=ioctl( ctl->events[i].event_fd, PERF_EVENT_IOC_RESET, NULL ); 00390 if (retval == -1) { 00391 PAPIERROR( "ioctl(PERF_EVENT_IOC_RESET) #%d/%d %d " 00392 "(fd %d)failed.\n", 00393 i,ctl->num_events,idx,ctl->events[i].event_fd); 00394 return PAPI_ESYS; 00395 } 00396 } 00397 } 00398 } 00399 return PAPI_OK; 00400 } 00401 00402 00403 /* Do some extrta work on a perf_event fd if we're doing sampling */ 00404 /* This mostly means setting up the mmap buffer. */ 00405 static int 00406 tune_up_fd( pe_control_t *ctl, int evt_idx ) 00407 { 00408 int ret; 00409 void *buf_addr; 00410 int fd = ctl->events[evt_idx].event_fd; 00411 00412 /* Register that we would like a SIGIO notification when a mmap'd page */ 00413 /* becomes full. 
 */
    ret = fcntl( fd, F_SETFL, O_ASYNC | O_NONBLOCK );
    if ( ret ) {
        PAPIERROR ( "fcntl(%d, F_SETFL, O_ASYNC | O_NONBLOCK) "
                    "returned error: %s", fd, strerror( errno ) );
        return PAPI_ESYS;
    }

    /* Set the F_SETOWN_EX flag on the fd.                        */
    /* This affects which thread an overflow signal gets sent to. */
    ret = fcntl_setown_fd( fd );
    if ( ret != PAPI_OK ) return ret;

    /* Set FD_CLOEXEC.  Otherwise if we do an exec with an overflow   */
    /* running, the overflow handler will continue into the exec()'d  */
    /* process and kill it because no signal handler is set up.       */
    ret = fcntl( fd, F_SETFD, FD_CLOEXEC );
    if ( ret ) {
        return PAPI_ESYS;
    }

    /* When you explicitly declare that you want a particular signal,  */
    /* even when you use the default signal, the kernel will send more */
    /* information concerning the event to the signal handler.         */
    /*                                                                  */
    /* In particular, it will send the file descriptor from which the  */
    /* event is originating, which can be quite useful when monitoring */
    /* multiple tasks from a single thread.                            */
    ret = fcntl( fd, F_SETSIG, _papi_pe_vector.cmp_info.hardware_intr_sig );
    if ( ret == -1 ) {
        PAPIERROR( "cannot fcntl(F_SETSIG,%d) on %d: %s",
                   _papi_pe_vector.cmp_info.hardware_intr_sig, fd,
                   strerror( errno ) );
        return PAPI_ESYS;
    }

    /* mmap() the sample buffer */
    buf_addr = mmap( NULL, ctl->events[evt_idx].nr_mmap_pages * getpagesize(),
                     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0 );
    if ( buf_addr == MAP_FAILED ) {
        PAPIERROR( "mmap(NULL,%d,%d,%d,%d,0): %s",
                   ctl->events[evt_idx].nr_mmap_pages * getpagesize( ),
                   PROT_READ, MAP_SHARED, fd, strerror( errno ) );
        return ( PAPI_ESYS );
    }

    SUBDBG( "Sample buffer for fd %d is located at %p\n", fd, buf_addr );

    /* Set up the mmap buffer and its associated helpers */
    ctl->events[evt_idx].mmap_buf = (struct perf_counter_mmap_page *) buf_addr;
    ctl->events[evt_idx].tail = 0;
    ctl->events[evt_idx].mask = ( ctl->events[evt_idx].nr_mmap_pages - 1 ) *
                                getpagesize() - 1;

    return PAPI_OK;
}


/* Open all events in the control state */
static int
open_pe_events( pe_context_t *ctx, pe_control_t *ctl )
{
    int i, ret = PAPI_OK;

    for( i = 0; i < ctl->num_events; i++ ) {

        ctl->events[i].event_opened = 0;

        /* set up the attr structure.  We don't set up all fields here */
        /* as some have already been set up previously.
*/ 00484 00485 /* group leader (event 0) is special */ 00486 /* If we're multiplexed, everyone is a group leader */ 00487 if (( i == 0 ) || (ctl->multiplexed)) { 00488 ctl->events[i].attr.pinned = !ctl->multiplexed; 00489 ctl->events[i].attr.disabled = 1; 00490 ctl->events[i].group_leader_fd=-1; 00491 ctl->events[i].attr.read_format = get_read_format(ctl->multiplexed, 00492 ctl->inherit, 00493 !ctl->multiplexed ); 00494 } else { 00495 ctl->events[i].attr.pinned=0; 00496 ctl->events[i].attr.disabled = 0; 00497 ctl->events[i].group_leader_fd=ctl->events[0].event_fd; 00498 ctl->events[i].attr.read_format = get_read_format(ctl->multiplexed, 00499 ctl->inherit, 00500 0 ); 00501 } 00502 00503 00504 /* try to open */ 00505 ctl->events[i].event_fd = sys_perf_event_open( &ctl->events[i].attr, 00506 ctl->tid, 00507 ctl->cpu, 00508 ctl->events[i].group_leader_fd, 00509 0 /* flags */ 00510 ); 00511 00512 if ( ctl->events[i].event_fd == -1 ) { 00513 SUBDBG("sys_perf_event_open returned error on event #%d." 00514 " Error: %s\n", 00515 i, strerror( errno ) ); 00516 ret = PAPI_ECNFLCT; 00517 goto open_pe_cleanup; 00518 } 00519 00520 SUBDBG ("sys_perf_event_open: tid: %ld, cpu_num: %d," 00521 " group_leader/fd: %d, event_fd: %d," 00522 " read_format: 0x%"PRIu64"\n", 00523 (long)ctl->tid, ctl->cpu, ctl->events[i].group_leader_fd, 00524 ctl->events[i].event_fd, ctl->events[i].attr.read_format); 00525 00526 00527 /* in many situations the kernel will indicate we opened fine */ 00528 /* yet things will fail later. So we need to double check */ 00529 /* we actually can use the events we've set up. */ 00530 00531 /* This is not necessary if we are multiplexing, and in fact */ 00532 /* we cannot do this properly if multiplexed because */ 00533 /* PERF_EVENT_IOC_RESET does not reset the time running info */ 00534 if (!ctl->multiplexed) { 00535 ret = check_scheduability( ctx, ctl, i ); 00536 00537 if ( ret != PAPI_OK ) { 00538 /* the last event did open, so we need to bump the counter */ 00539 /* before doing the cleanup */ 00540 i++; 00541 00542 goto open_pe_cleanup; 00543 } 00544 } 00545 ctl->events[i].event_opened=1; 00546 } 00547 00548 /* Now that we've successfully opened all of the events, do whatever */ 00549 /* "tune-up" is needed to attach the mmap'd buffers, signal handlers, */ 00550 /* and so on. */ 00551 for ( i = 0; i < ctl->num_events; i++ ) { 00552 00553 /* If sampling is enabled, hook up signal handler */ 00554 if ( ctl->events[i].attr.sample_period ) { 00555 ret = tune_up_fd( ctl, i ); 00556 if ( ret != PAPI_OK ) { 00557 /* All of the fds are open, so we need to clean up all of them */ 00558 i = ctl->num_events; 00559 goto open_pe_cleanup; 00560 } 00561 } else { 00562 /* Make sure this is NULL so close_pe_events works right */ 00563 ctl->events[i].mmap_buf = NULL; 00564 } 00565 } 00566 00567 /* Set num_evts only if completely successful */ 00568 ctx->state |= PERF_EVENTS_OPENED; 00569 00570 return PAPI_OK; 00571 00572 open_pe_cleanup: 00573 /* We encountered an error, close up the fds we successfully opened. */ 00574 /* We go backward in an attempt to close group leaders last, although */ 00575 /* That's probably not strictly necessary. 
*/ 00576 while ( i > 0 ) { 00577 i--; 00578 if (ctl->events[i].event_fd>=0) { 00579 close( ctl->events[i].event_fd ); 00580 ctl->events[i].event_opened=0; 00581 } 00582 } 00583 00584 return ret; 00585 } 00586 00587 /* Close all of the opened events */ 00588 static int 00589 close_pe_events( pe_context_t *ctx, pe_control_t *ctl ) 00590 { 00591 int i; 00592 int num_closed=0; 00593 00594 /* should this be a more serious error? */ 00595 if ( ctx->state & PERF_EVENTS_RUNNING ) { 00596 SUBDBG("Closing without stopping first\n"); 00597 } 00598 00599 /* Close child events first */ 00600 for( i=0; i<ctl->num_events; i++ ) { 00601 00602 if (ctl->events[i].event_opened) { 00603 00604 if (ctl->events[i].group_leader_fd!=-1) { 00605 if ( ctl->events[i].mmap_buf ) { 00606 if ( munmap ( ctl->events[i].mmap_buf, 00607 ctl->events[i].nr_mmap_pages * getpagesize() ) ) { 00608 PAPIERROR( "munmap of fd = %d returned error: %s", 00609 ctl->events[i].event_fd, strerror( errno ) ); 00610 return PAPI_ESYS; 00611 } 00612 } 00613 00614 if ( close( ctl->events[i].event_fd ) ) { 00615 PAPIERROR( "close of fd = %d returned error: %s", 00616 ctl->events[i].event_fd, strerror( errno ) ); 00617 return PAPI_ESYS; 00618 } else { 00619 num_closed++; 00620 } 00621 ctl->events[i].event_opened=0; 00622 } 00623 } 00624 } 00625 00626 /* Close the group leaders last */ 00627 for( i=0; i<ctl->num_events; i++ ) { 00628 00629 if (ctl->events[i].event_opened) { 00630 00631 if (ctl->events[i].group_leader_fd==-1) { 00632 if ( ctl->events[i].mmap_buf ) { 00633 if ( munmap ( ctl->events[i].mmap_buf, 00634 ctl->events[i].nr_mmap_pages * getpagesize() ) ) { 00635 PAPIERROR( "munmap of fd = %d returned error: %s", 00636 ctl->events[i].event_fd, strerror( errno ) ); 00637 return PAPI_ESYS; 00638 } 00639 } 00640 00641 00642 if ( close( ctl->events[i].event_fd ) ) { 00643 PAPIERROR( "close of fd = %d returned error: %s", 00644 ctl->events[i].event_fd, strerror( errno ) ); 00645 return PAPI_ESYS; 00646 } else { 00647 num_closed++; 00648 } 00649 ctl->events[i].event_opened=0; 00650 } 00651 } 00652 } 00653 00654 00655 if (ctl->num_events!=num_closed) { 00656 PAPIERROR("Didn't close all events\n"); 00657 return PAPI_EBUG; 00658 } 00659 00660 ctl->num_events=0; 00661 00662 ctx->state &= ~PERF_EVENTS_OPENED; 00663 00664 return PAPI_OK; 00665 } 00666 00667 /* Fix up the config based on what CPU/Vendor we are running on */ 00668 static int 00669 pe_vendor_fixups(void) 00670 { 00671 /* powerpc */ 00672 /* On IBM and Power6 Machines default domain should include supervisor */ 00673 if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_IBM ) { 00674 _papi_pe_vector.cmp_info.available_domains |= 00675 PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR; 00676 if (strcmp(_papi_hwi_system_info.hw_info.model_string, "POWER6" ) == 0 ) { 00677 _papi_pe_vector.cmp_info.default_domain = 00678 PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR; 00679 } 00680 } 00681 00682 if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_MIPS ) { 00683 _papi_pe_vector.cmp_info.available_domains |= PAPI_DOM_KERNEL; 00684 } 00685 00686 if ((_papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_INTEL) || 00687 (_papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_AMD)) { 00688 _papi_pe_vector.cmp_info.fast_real_timer = 1; 00689 } 00690 00691 /* ARM */ 00692 if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_ARM) { 00693 /* FIXME: this will change with Cortex A15 */ 00694 _papi_pe_vector.cmp_info.available_domains |= 00695 PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR; 
00696 _papi_pe_vector.cmp_info.default_domain = 00697 PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR; 00698 } 00699 00700 /* CRAY */ 00701 if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_CRAY ) { 00702 _papi_pe_vector.cmp_info.available_domains |= PAPI_DOM_OTHER; 00703 } 00704 00705 return PAPI_OK; 00706 } 00707 00708 00709 /* Check the mmap page for rdpmc support */ 00710 static int detect_rdpmc(void) { 00711 00712 struct perf_event_attr pe; 00713 int fd,rdpmc_exists=1; 00714 void *addr; 00715 struct perf_event_mmap_page *our_mmap; 00716 00717 /* Create a fake instructions event so we can read a mmap page */ 00718 memset(&pe,0,sizeof(struct perf_event_attr)); 00719 00720 pe.type=PERF_TYPE_HARDWARE; 00721 pe.size=sizeof(struct perf_event_attr); 00722 pe.config=PERF_COUNT_HW_INSTRUCTIONS; 00723 00724 fd=sys_perf_event_open(&pe,0,-1,-1,0); 00725 if (fd<0) { 00726 return PAPI_ESYS; 00727 } 00728 00729 /* create the mmap page */ 00730 addr=mmap(NULL, 4096, PROT_READ, MAP_SHARED,fd,0); 00731 if (addr == (void *)(-1)) { 00732 close(fd); 00733 return PAPI_ESYS; 00734 } 00735 00736 /* get the rdpmc info */ 00737 our_mmap=(struct perf_event_mmap_page *)addr; 00738 if (our_mmap->cap_usr_rdpmc==0) { 00739 rdpmc_exists=0; 00740 } 00741 00742 /* close the fake event */ 00743 munmap(addr,4096); 00744 close(fd); 00745 00746 return rdpmc_exists; 00747 00748 } 00749 00750 /* Find a native event specified by a profile index */ 00751 static int 00752 find_profile_index( EventSetInfo_t *ESI, int evt_idx, int *flags, 00753 unsigned int *native_index, int *profile_index ) 00754 { 00755 int pos, esi_index, count; 00756 00757 for ( count = 0; count < ESI->profile.event_counter; count++ ) { 00758 esi_index = ESI->profile.EventIndex[count]; 00759 pos = ESI->EventInfoArray[esi_index].pos[0]; 00760 00761 if ( pos == evt_idx ) { 00762 *profile_index = count; 00763 *native_index = ESI->NativeInfoArray[pos].ni_event & 00764 PAPI_NATIVE_AND_MASK; 00765 *flags = ESI->profile.flags; 00766 SUBDBG( "Native event %d is at profile index %d, flags %d\n", 00767 *native_index, *profile_index, *flags ); 00768 return PAPI_OK; 00769 } 00770 } 00771 00772 PAPIERROR( "wrong count: %d vs. ESI->profile.event_counter %d", count, 00773 ESI->profile.event_counter ); 00774 return PAPI_EBUG; 00775 } 00776 00777 00778 /* These functions are based on builtin-record.c in the */ 00779 /* kernel's tools/perf directory. */ 00780 00781 static uint64_t 00782 mmap_read_head( pe_event_info_t *pe ) 00783 { 00784 struct perf_event_mmap_page *pc = pe->mmap_buf; 00785 int head; 00786 00787 if ( pc == NULL ) { 00788 PAPIERROR( "perf_event_mmap_page is NULL" ); 00789 return 0; 00790 } 00791 00792 head = pc->data_head; 00793 rmb( ); 00794 00795 return head; 00796 } 00797 00798 static void 00799 mmap_write_tail( pe_event_info_t *pe, uint64_t tail ) 00800 { 00801 struct perf_event_mmap_page *pc = pe->mmap_buf; 00802 00803 /* ensure all reads are done before we write the tail out. */ 00804 mb( ); 00805 pc->data_tail = tail; 00806 } 00807 00808 /* Does the kernel define these somewhere? 
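 * These mirror the sample-record layouts in linux/perf_event.h: with
 * sample_type == PERF_SAMPLE_IP, a PERF_RECORD_SAMPLE record is a
 * struct perf_event_header followed by a single u64 instruction pointer,
 * and a PERF_RECORD_LOST record is a header followed by a u64 id and a
 * u64 count of lost samples, which is what the structs below describe.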
*/ 00809 struct ip_event { 00810 struct perf_event_header header; 00811 uint64_t ip; 00812 }; 00813 struct lost_event { 00814 struct perf_event_header header; 00815 uint64_t id; 00816 uint64_t lost; 00817 }; 00818 typedef union event_union { 00819 struct perf_event_header header; 00820 struct ip_event ip; 00821 struct lost_event lost; 00822 } perf_sample_event_t; 00823 00824 00825 /* Should re-write with comments if we ever figure out what's */ 00826 /* going on here. */ 00827 static void 00828 mmap_read( ThreadInfo_t **thr, pe_event_info_t *pe, 00829 int profile_index ) 00830 { 00831 int cidx = _papi_pe_vector.cmp_info.CmpIdx; 00832 uint64_t head = mmap_read_head( pe ); 00833 uint64_t old = pe->tail; 00834 unsigned char *data = ((unsigned char*)pe->mmap_buf) + getpagesize( ); 00835 int diff; 00836 00837 diff = head - old; 00838 if ( diff < 0 ) { 00839 SUBDBG( "WARNING: failed to keep up with mmap data. head = %" PRIu64 00840 ", tail = %" PRIu64 ". Discarding samples.\n", head, old ); 00841 /* head points to a known good entry, start there. */ 00842 old = head; 00843 } 00844 00845 for( ; old != head; ) { 00846 00847 perf_sample_event_t *event = ( perf_sample_event_t * ) 00848 & data[old & pe->mask]; 00849 perf_sample_event_t event_copy; 00850 size_t size = event->header.size; 00851 00852 /* Event straddles the mmap boundary -- header should always */ 00853 /* be inside due to u64 alignment of output. */ 00854 if ( ( old & pe->mask ) + size != ( ( old + size ) & pe->mask ) ) { 00855 uint64_t offset = old; 00856 uint64_t len = min( sizeof ( *event ), size ), cpy; 00857 void *dst = &event_copy; 00858 00859 do { 00860 cpy = min( pe->mask + 1 - ( offset & pe->mask ), len ); 00861 memcpy( dst, &data[offset & pe->mask], cpy ); 00862 offset += cpy; 00863 dst = ((unsigned char*)dst) + cpy; 00864 len -= cpy; 00865 } while ( len ); 00866 00867 event = &event_copy; 00868 } 00869 00870 old += size; 00871 00872 SUBDBG( "event->type = %08x\n", event->header.type ); 00873 SUBDBG( "event->size = %d\n", event->header.size ); 00874 00875 switch ( event->header.type ) { 00876 case PERF_RECORD_SAMPLE: 00877 _papi_hwi_dispatch_profile( ( *thr )->running_eventset[cidx], 00878 ( caddr_t ) ( unsigned long ) event->ip.ip, 00879 0, profile_index ); 00880 break; 00881 00882 case PERF_RECORD_LOST: 00883 SUBDBG( "Warning: because of a mmap buffer overrun, %" PRId64 00884 " events were lost.\n" 00885 "Loss was recorded when counter id 0x%"PRIx64 00886 " overflowed.\n", event->lost.lost, event->lost.id ); 00887 break; 00888 00889 default: 00890 SUBDBG( "Error: unexpected header type - %d\n", 00891 event->header.type ); 00892 break; 00893 } 00894 } 00895 00896 pe->tail = old; 00897 mmap_write_tail( pe, old ); 00898 } 00899 00900 /* What exactly does this do? 
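 * In short: it maps the file descriptor that overflowed back to its
 * profiling entry via find_profile_index(), then calls mmap_read() to
 * drain any PERF_RECORD_SAMPLE records from that event's mmap ring
 * buffer into the PAPI profile histogram.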
*/ 00901 static int 00902 process_smpl_buf( int evt_idx, ThreadInfo_t **thr ) 00903 { 00904 int ret, flags, profile_index; 00905 unsigned native_index; 00906 int cidx = _papi_pe_vector.cmp_info.CmpIdx; 00907 pe_control_t *ctl; 00908 00909 ret = find_profile_index( ( *thr )->running_eventset[cidx], evt_idx, 00910 &flags, &native_index, &profile_index ); 00911 if ( ret != PAPI_OK ) { 00912 return ret; 00913 } 00914 00915 ctl= (*thr)->running_eventset[cidx]->ctl_state; 00916 00917 mmap_read( thr, 00918 &(ctl->events[evt_idx]), 00919 profile_index ); 00920 00921 return PAPI_OK; 00922 } 00923 00924 00925 00926 00927 /********************************************************************/ 00928 /********************************************************************/ 00929 /* Start with functions that are exported via the module interface */ 00930 /********************************************************************/ 00931 /********************************************************************/ 00932 00933 00934 /* set the domain. FIXME: perf_events allows per-event control of this. */ 00935 /* we do not handle that yet. */ 00936 int 00937 _papi_pe_set_domain( hwd_control_state_t *ctl, int domain) 00938 { 00939 00940 int i; 00941 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 00942 00943 SUBDBG("old control domain %d, new domain %d, default domain %d\n", 00944 pe_ctl->domain,domain,_papi_pe_vector.cmp_info.default_domain); 00945 00946 pe_ctl->domain = domain; 00947 00948 /* Force the domain on all events */ 00949 for( i = 0; i < pe_ctl->num_events; i++ ) { 00950 pe_ctl->events[i].attr.exclude_user = 00951 !( pe_ctl->domain & PAPI_DOM_USER ); 00952 pe_ctl->events[i].attr.exclude_kernel = 00953 !( pe_ctl->domain & PAPI_DOM_KERNEL ); 00954 pe_ctl->events[i].attr.exclude_hv = 00955 !( pe_ctl->domain & PAPI_DOM_SUPERVISOR ); 00956 } 00957 return PAPI_OK; 00958 } 00959 00960 00961 /* Initialize the perf_event component */ 00962 static int 00963 _papi_pe_init_component( int cidx ) 00964 { 00965 00966 int retval; 00967 int paranoid_level; 00968 00969 FILE *fff; 00970 00971 ( void ) cidx; /*unused */ 00972 00973 /* The is the official way to detect if perf_event support exists */ 00974 /* The file is called perf_counter_paranoid on 2.6.31 */ 00975 /* currently we are lazy and do not support 2.6.31 kernels */ 00976 fff=fopen("/proc/sys/kernel/perf_event_paranoid","r"); 00977 if (fff==NULL) { 00978 strncpy(_papi_pe_vector.cmp_info.disabled_reason, 00979 "perf_event support not detected",PAPI_MAX_STR_LEN); 00980 return PAPI_ENOCMP; 00981 } 00982 00983 /* 2 means no measurements allowed */ 00984 /* 1 means normal counter access */ 00985 /* 0 means you can access CPU-specific data */ 00986 /* -1 means no restrictions */ 00987 retval=fscanf(fff,"%d",¶noid_level); 00988 if (retval!=1) fprintf(stderr,"Error reading paranoid level\n"); 00989 fclose(fff); 00990 00991 if (paranoid_level==2) { 00992 strncpy(_papi_pe_vector.cmp_info.disabled_reason, 00993 "/proc/sys/kernel/perf_event_paranoid prohibits using counters", 00994 PAPI_MAX_STR_LEN); 00995 return PAPI_ENOCMP; 00996 } 00997 00998 /* Detect NMI watchdog which can steal counters */ 00999 nmi_watchdog_active=_linux_detect_nmi_watchdog(); 01000 if (nmi_watchdog_active) { 01001 SUBDBG("The Linux nmi_watchdog is using one of the performance " 01002 "counters, reducing the total number available.\n"); 01003 } 01004 01005 /* Kernel multiplexing is broken prior to kernel 2.6.34 */ 01006 /* The fix was probably git commit: */ 01007 /* 45e16a6834b6af098702e5ea6c9a40de42ff77d8 
*/ 01008 if (_papi_os_info.os_version < LINUX_VERSION(2,6,34)) { 01009 _papi_pe_vector.cmp_info.kernel_multiplex = 0; 01010 } 01011 else { 01012 _papi_pe_vector.cmp_info.kernel_multiplex = 1; 01013 } 01014 01015 /* We use the RealTime signal for some reason */ 01016 _papi_pe_vector.cmp_info.hardware_intr_sig = SIGRTMIN + 2; 01017 01018 /* Check that processor is supported */ 01019 if (processor_supported(_papi_hwi_system_info.hw_info.vendor, 01020 _papi_hwi_system_info.hw_info.cpuid_family)!= 01021 PAPI_OK) { 01022 fprintf(stderr,"warning, your processor is unsupported\n"); 01023 /* should not return error, as software events should still work */ 01024 } 01025 01026 /* Setup mmtimers, if appropriate */ 01027 retval=mmtimer_setup(); 01028 if (retval) { 01029 strncpy(_papi_pe_vector.cmp_info.disabled_reason, 01030 "Error initializing mmtimer",PAPI_MAX_STR_LEN); 01031 return retval; 01032 } 01033 01034 /* Detect if we can use rdpmc (or equivalent) */ 01035 /* We currently do not use rdpmc as it is slower in tests */ 01036 /* than regular read (as of Linux 3.5) */ 01037 retval=detect_rdpmc(); 01038 if (retval < 0 ) { 01039 strncpy(_papi_pe_vector.cmp_info.disabled_reason, 01040 "Error detecting rdpmc",PAPI_MAX_STR_LEN); 01041 return retval; 01042 } 01043 _papi_pe_vector.cmp_info.fast_counter_read = retval; 01044 01045 /* Run Vendor-specific fixups */ 01046 pe_vendor_fixups(); 01047 01048 /* Run the libpfm4-specific setup */ 01049 retval = _papi_libpfm4_init(&_papi_pe_vector, cidx); 01050 if (retval) { 01051 strncpy(_papi_pe_vector.cmp_info.disabled_reason, 01052 "Error initializing libpfm4",PAPI_MAX_STR_LEN); 01053 return retval; 01054 } 01055 01056 return PAPI_OK; 01057 01058 } 01059 01060 /* Shutdown the perf_event component */ 01061 static int 01062 _papi_pe_shutdown_component( void ) { 01063 01064 /* Shutdown libpfm4 */ 01065 _papi_libpfm4_shutdown(); 01066 01067 return PAPI_OK; 01068 } 01069 01070 01071 /* Initialize a thread */ 01072 static int 01073 _papi_pe_init_thread( hwd_context_t *hwd_ctx ) 01074 { 01075 01076 pe_context_t *pe_ctx = ( pe_context_t *) hwd_ctx; 01077 01078 /* clear the context structure and mark as initialized */ 01079 memset( pe_ctx, 0, sizeof ( pe_context_t ) ); 01080 pe_ctx->initialized=1; 01081 01082 return PAPI_OK; 01083 } 01084 01085 /* Shutdown a thread */ 01086 static int 01087 _papi_pe_shutdown_thread( hwd_context_t *ctx ) 01088 { 01089 pe_context_t *pe_ctx = ( pe_context_t *) ctx; 01090 01091 pe_ctx->initialized=0; 01092 01093 return PAPI_OK; 01094 } 01095 01096 01097 /* reset the hardware counters */ 01098 /* Note: PAPI_reset() does not necessarily call this */ 01099 /* unless the events are actually running. */ 01100 static int 01101 _papi_pe_reset( hwd_context_t *ctx, hwd_control_state_t *ctl ) 01102 { 01103 int i, ret; 01104 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 01105 01106 ( void ) ctx; /*unused */ 01107 01108 /* We need to reset all of the events, not just the group leaders */ 01109 for( i = 0; i < pe_ctl->num_events; i++ ) { 01110 ret = ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_RESET, NULL ); 01111 if ( ret == -1 ) { 01112 PAPIERROR("ioctl(%d, PERF_EVENT_IOC_RESET, NULL) " 01113 "returned error, Linux says: %s", 01114 pe_ctl->events[i].event_fd, strerror( errno ) ); 01115 return PAPI_ESYS; 01116 } 01117 } 01118 01119 return PAPI_OK; 01120 } 01121 01122 01123 /* write (set) the hardware counters */ 01124 /* Current we do not support this. 
*/ 01125 static int 01126 _papi_pe_write( hwd_context_t *ctx, hwd_control_state_t *ctl, 01127 long long *from ) 01128 { 01129 ( void ) ctx; /*unused */ 01130 ( void ) ctl; /*unused */ 01131 ( void ) from; /*unused */ 01132 /* 01133 * Counters cannot be written. Do we need to virtualize the 01134 * counters so that they can be written, or perhaps modify code so that 01135 * they can be written? FIXME ? 01136 */ 01137 01138 return PAPI_ENOSUPP; 01139 } 01140 01141 /* 01142 * perf_event provides a complicated read interface. 01143 * the info returned by read() varies depending on whether 01144 * you have PERF_FORMAT_GROUP, PERF_FORMAT_TOTAL_TIME_ENABLED, 01145 * PERF_FORMAT_TOTAL_TIME_RUNNING, or PERF_FORMAT_ID set 01146 * 01147 * To simplify things we just always ask for everything. This might 01148 * lead to overhead when reading more than we need, but it makes the 01149 * read code a lot simpler than the original implementation we had here. 01150 * 01151 * For more info on the layout see include/linux/perf_event.h 01152 * 01153 */ 01154 01155 static int 01156 _papi_pe_read( hwd_context_t *ctx, hwd_control_state_t *ctl, 01157 long long **events, int flags ) 01158 { 01159 ( void ) flags; /*unused */ 01160 int i, ret = -1; 01161 pe_context_t *pe_ctx = ( pe_context_t *) ctx; 01162 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 01163 long long papi_pe_buffer[READ_BUFFER_SIZE]; 01164 long long tot_time_running, tot_time_enabled, scale; 01165 01166 /* On kernels before 2.6.33 the TOTAL_TIME_ENABLED and TOTAL_TIME_RUNNING */ 01167 /* fields are always 0 unless the counter is disabled. So if we are on */ 01168 /* one of these kernels, then we must disable events before reading. */ 01169 01170 /* Elsewhere though we disable multiplexing on kernels before 2.6.34 */ 01171 /* so maybe this isn't even necessary. */ 01172 01173 if (bug_sync_read()) { 01174 if ( pe_ctx->state & PERF_EVENTS_RUNNING ) { 01175 for ( i = 0; i < pe_ctl->num_events; i++ ) { 01176 /* disable only the group leaders */ 01177 if ( pe_ctl->events[i].group_leader_fd == -1 ) { 01178 ret = ioctl( pe_ctl->events[i].event_fd, 01179 PERF_EVENT_IOC_DISABLE, NULL ); 01180 if ( ret == -1 ) { 01181 PAPIERROR("ioctl(PERF_EVENT_IOC_DISABLE) " 01182 "returned an error: ", strerror( errno )); 01183 return PAPI_ESYS; 01184 } 01185 } 01186 } 01187 } 01188 } 01189 01190 01191 /* Handle case where we are multiplexing */ 01192 if (pe_ctl->multiplexed) { 01193 01194 /* currently we handle multiplexing by having individual events */ 01195 /* so we read from each in turn. */ 01196 01197 for ( i = 0; i < pe_ctl->num_events; i++ ) { 01198 01199 ret = read( pe_ctl->events[i].event_fd, papi_pe_buffer, 01200 sizeof ( papi_pe_buffer ) ); 01201 if ( ret == -1 ) { 01202 PAPIERROR("read returned an error: ", strerror( errno )); 01203 return PAPI_ESYS; 01204 } 01205 01206 /* We should read 3 64-bit values from the counter */ 01207 if (ret<(signed)(3*sizeof(long long))) { 01208 PAPIERROR("Error! 
short read!\n"); 01209 return PAPI_ESYS; 01210 } 01211 01212 SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", 01213 pe_ctl->events[i].event_fd, 01214 (long)pe_ctl->tid, pe_ctl->cpu, ret); 01215 SUBDBG("read: %lld %lld %lld\n",papi_pe_buffer[0], 01216 papi_pe_buffer[1],papi_pe_buffer[2]); 01217 01218 tot_time_enabled = papi_pe_buffer[1]; 01219 tot_time_running = papi_pe_buffer[2]; 01220 01221 SUBDBG("count[%d] = (papi_pe_buffer[%d] %lld * " 01222 "tot_time_enabled %lld) / tot_time_running %lld\n", 01223 i, 0,papi_pe_buffer[0], 01224 tot_time_enabled,tot_time_running); 01225 01226 if (tot_time_running == tot_time_enabled) { 01227 /* No scaling needed */ 01228 pe_ctl->counts[i] = papi_pe_buffer[0]; 01229 } else if (tot_time_running && tot_time_enabled) { 01230 /* Scale factor of 100 to avoid overflows when computing */ 01231 /*enabled/running */ 01232 01233 scale = (tot_time_enabled * 100LL) / tot_time_running; 01234 scale = scale * papi_pe_buffer[0]; 01235 scale = scale / 100LL; 01236 pe_ctl->counts[i] = scale; 01237 } else { 01238 /* This should not happen, but Phil reports it sometime does. */ 01239 SUBDBG("perf_event kernel bug(?) count, enabled, " 01240 "running: %lld, %lld, %lld\n", 01241 papi_pe_buffer[0],tot_time_enabled, 01242 tot_time_running); 01243 01244 pe_ctl->counts[i] = papi_pe_buffer[0]; 01245 } 01246 } 01247 } 01248 01249 /* Handle cases where we cannot use FORMAT GROUP */ 01250 else if (bug_format_group() || pe_ctl->inherit) { 01251 01252 /* we must read each counter individually */ 01253 for ( i = 0; i < pe_ctl->num_events; i++ ) { 01254 01255 ret = read( pe_ctl->events[i].event_fd, papi_pe_buffer, 01256 sizeof ( papi_pe_buffer ) ); 01257 if ( ret == -1 ) { 01258 PAPIERROR("read returned an error: ", strerror( errno )); 01259 return PAPI_ESYS; 01260 } 01261 01262 /* we should read one 64-bit value from each counter */ 01263 if (ret!=sizeof(long long)) { 01264 PAPIERROR("Error! short read!\n"); 01265 PAPIERROR("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", 01266 pe_ctl->events[i].event_fd, 01267 (long)pe_ctl->tid, pe_ctl->cpu, ret); 01268 return PAPI_ESYS; 01269 } 01270 01271 SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", 01272 pe_ctl->events[i].event_fd, (long)pe_ctl->tid, 01273 pe_ctl->cpu, ret); 01274 SUBDBG("read: %lld\n",papi_pe_buffer[0]); 01275 01276 pe_ctl->counts[i] = papi_pe_buffer[0]; 01277 } 01278 } 01279 01280 01281 /* Handle cases where we are using FORMAT_GROUP */ 01282 /* We assume only one group leader, in position 0 */ 01283 01284 else { 01285 if (pe_ctl->events[0].group_leader_fd!=-1) { 01286 PAPIERROR("Was expecting group leader!\n"); 01287 } 01288 01289 ret = read( pe_ctl->events[0].event_fd, papi_pe_buffer, 01290 sizeof ( papi_pe_buffer ) ); 01291 01292 if ( ret == -1 ) { 01293 PAPIERROR("read returned an error: ", strerror( errno )); 01294 return PAPI_ESYS; 01295 } 01296 01297 /* we read 1 64-bit value (number of events) then */ 01298 /* num_events more 64-bit values that hold the counts */ 01299 if (ret<(signed)((1+pe_ctl->num_events)*sizeof(long long))) { 01300 PAPIERROR("Error! short read!\n"); 01301 return PAPI_ESYS; 01302 } 01303 01304 SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", 01305 pe_ctl->events[0].event_fd, 01306 (long)pe_ctl->tid, pe_ctl->cpu, ret); 01307 { 01308 int j; 01309 for(j=0;j<ret/8;j++) { 01310 SUBDBG("read %d: %lld\n",j,papi_pe_buffer[j]); 01311 } 01312 } 01313 01314 /* Make sure the kernel agrees with how many events we have */ 01315 if (papi_pe_buffer[0]!=pe_ctl->num_events) { 01316 PAPIERROR("Error! 
Wrong number of events!\n"); 01317 return PAPI_ESYS; 01318 } 01319 01320 /* put the count values in their proper location */ 01321 for(i=0;i<papi_pe_buffer[0];i++) { 01322 pe_ctl->counts[i] = papi_pe_buffer[1+i]; 01323 } 01324 } 01325 01326 01327 /* If we disabled the counters due to the sync_read_bug(), */ 01328 /* then we need to re-enable them now. */ 01329 if (bug_sync_read()) { 01330 if ( pe_ctx->state & PERF_EVENTS_RUNNING ) { 01331 for ( i = 0; i < pe_ctl->num_events; i++ ) { 01332 if ( pe_ctl->events[i].group_leader_fd == -1 ) { 01333 /* this should refresh any overflow counters too */ 01334 ret = ioctl( pe_ctl->events[i].event_fd, 01335 PERF_EVENT_IOC_ENABLE, NULL ); 01336 if ( ret == -1 ) { 01337 /* Should never happen */ 01338 PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) returned an error: ", 01339 strerror( errno )); 01340 return PAPI_ESYS; 01341 } 01342 } 01343 } 01344 } 01345 } 01346 01347 /* point PAPI to the values we read */ 01348 *events = pe_ctl->counts; 01349 01350 return PAPI_OK; 01351 } 01352 01353 /* Start counting events */ 01354 static int 01355 _papi_pe_start( hwd_context_t *ctx, hwd_control_state_t *ctl ) 01356 { 01357 int ret; 01358 int i; 01359 int did_something = 0; 01360 pe_context_t *pe_ctx = ( pe_context_t *) ctx; 01361 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 01362 01363 /* Reset the counters first. Is this necessary? */ 01364 ret = _papi_pe_reset( pe_ctx, pe_ctl ); 01365 if ( ret ) { 01366 return ret; 01367 } 01368 01369 /* Enable all of the group leaders */ 01370 /* All group leaders have a group_leader_fd of -1 */ 01371 for( i = 0; i < pe_ctl->num_events; i++ ) { 01372 if (pe_ctl->events[i].group_leader_fd == -1) { 01373 SUBDBG("ioctl(enable): fd: %d\n", pe_ctl->events[i].event_fd); 01374 ret=ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_ENABLE, NULL) ; 01375 01376 /* ioctls always return -1 on failure */ 01377 if (ret == -1) { 01378 PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) failed.\n"); 01379 return PAPI_ESYS; 01380 } 01381 01382 did_something++; 01383 } 01384 } 01385 01386 if (!did_something) { 01387 PAPIERROR("Did not enable any counters.\n"); 01388 return PAPI_EBUG; 01389 } 01390 01391 pe_ctx->state |= PERF_EVENTS_RUNNING; 01392 01393 return PAPI_OK; 01394 01395 } 01396 01397 /* Stop all of the counters */ 01398 static int 01399 _papi_pe_stop( hwd_context_t *ctx, hwd_control_state_t *ctl ) 01400 { 01401 01402 int ret; 01403 int i; 01404 pe_context_t *pe_ctx = ( pe_context_t *) ctx; 01405 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 01406 01407 /* Just disable the group leaders */ 01408 for ( i = 0; i < pe_ctl->num_events; i++ ) { 01409 if ( pe_ctl->events[i].group_leader_fd == -1 ) { 01410 ret=ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_DISABLE, NULL); 01411 if ( ret == -1 ) { 01412 PAPIERROR( "ioctl(%d, PERF_EVENT_IOC_DISABLE, NULL) " 01413 "returned error, Linux says: %s", 01414 pe_ctl->events[i].event_fd, strerror( errno ) ); 01415 return PAPI_EBUG; 01416 } 01417 } 01418 } 01419 01420 pe_ctx->state &= ~PERF_EVENTS_RUNNING; 01421 01422 return PAPI_OK; 01423 } 01424 01425 /* Initialize a new control state */ 01426 static int 01427 _papi_pe_init_control_state( hwd_control_state_t *ctl ) 01428 { 01429 pe_control_t *pe_ctl = ( pe_control_t *) ctl; 01430 01431 /* clear the contents */ 01432 memset( pe_ctl, 0, sizeof ( pe_control_t ) ); 01433 _papi_pe_set_domain( ctl, _papi_pe_vector.cmp_info.default_domain ); 01434 01435 /* Set cpu number in the control block to show events */ 01436 /* are not tied to specific cpu */ 01437 pe_ctl->cpu = -1; 
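
/*
 * Illustrative sketch (not part of the original source): the multiplexed
 * read path in _papi_pe_read() above estimates a full-speed count by
 * scaling the raw value by time_enabled/time_running, with the same
 * factor of 100 the code uses.  Stand-alone, the arithmetic is roughly:
 *
 *     long long scaled_count( long long value,
 *                             long long enabled, long long running )
 *     {
 *         long long scale;
 *         if ( running == 0 ) return value;
 *         scale = ( enabled * 100LL ) / running;
 *         return ( scale * value ) / 100LL;
 *     }
 *
 * The early return mirrors the fallback above when the running time is
 * zero.  A counter that read 1,000,000 while scheduled for 25 ms out of
 * a 100 ms measurement would thus be reported as roughly 4,000,000.
 */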
    return PAPI_OK;
}


/* This function clears the current contents of the control structure and
   updates it with whatever resources are allocated for all the native events
   in the native info structure array. */

static int
_papi_pe_update_control_state( hwd_control_state_t *ctl,
                               NativeInfo_t *native,
                               int count, hwd_context_t *ctx )
{
    int i = 0, ret;
    pe_context_t *pe_ctx = ( pe_context_t *) ctx;
    pe_control_t *pe_ctl = ( pe_control_t *) ctl;

    /* close all of the existing fds and start over again            */
    /* In theory we could have finer-grained control and know if     */
    /* things were changed, but it's easier to tear down and rebuild. */
    close_pe_events( pe_ctx, pe_ctl );

    /* Calling with count==0 should be OK, it's how things are deallocated */
    /* when an eventset is destroyed.                                      */
    if ( count == 0 ) {
        SUBDBG( "Called with count == 0\n" );
        return PAPI_OK;
    }

    /* set up all the events */
    for( i = 0; i < count; i++ ) {
        if ( native ) {
            /* Have libpfm4 set the config values for the event */
            ret = _papi_libpfm4_setup_counters( &pe_ctl->events[i].attr,
                                                native[i].ni_event );
            SUBDBG( "pe_ctl->events[%d].config=%"PRIx64"\n", i,
                    pe_ctl->events[i].attr.config );
            if ( ret != PAPI_OK ) return ret;

        } else {
            /* I'm not sure how we'd end up in this case */
            /* should it be an error?                    */
        }

        /* Copy the inherit flag into the attribute block that will be */
        /* passed to the kernel                                        */
        pe_ctl->events[i].attr.inherit = pe_ctl->inherit;

        /* Set the position in the native structure */
        /* We just set up events linearly            */
        if ( native ) {
            native[i].ni_position = i;
        }
    }

    pe_ctl->num_events = count;
    _papi_pe_set_domain( ctl, pe_ctl->domain );

    /* actually open the events */
    /* (why is this a separate function?) */
    ret = open_pe_events( pe_ctx, pe_ctl );
    if ( ret != PAPI_OK ) {
        SUBDBG( "open_pe_events failed\n" );
        /* Restore values ?
*/ 01502 return ret; 01503 } 01504 01505 return PAPI_OK; 01506 } 01507 01508 /* Set various options on a control state */ 01509 static int 01510 _papi_pe_ctl( hwd_context_t *ctx, int code, _papi_int_option_t *option ) 01511 { 01512 int ret; 01513 pe_context_t *pe_ctx = ( pe_context_t *) ctx; 01514 pe_control_t *pe_ctl = NULL; 01515 01516 switch ( code ) { 01517 case PAPI_MULTIPLEX: 01518 pe_ctl = ( pe_control_t * ) ( option->multiplex.ESI->ctl_state ); 01519 if (check_permissions( pe_ctl->tid, pe_ctl->cpu, pe_ctl->domain, 01520 1, pe_ctl->inherit ) != PAPI_OK) { 01521 return PAPI_EPERM; 01522 } 01523 01524 /* looks like we are allowed, so set multiplexed attribute */ 01525 pe_ctl->multiplexed = 1; 01526 ret = _papi_pe_update_control_state( pe_ctl, NULL, 01527 pe_ctl->num_events, pe_ctx ); 01528 if (ret != PAPI_OK) { 01529 pe_ctl->multiplexed = 0; 01530 } 01531 return ret; 01532 01533 case PAPI_ATTACH: 01534 pe_ctl = ( pe_control_t * ) ( option->attach.ESI->ctl_state ); 01535 if (check_permissions( option->attach.tid, pe_ctl->cpu, 01536 pe_ctl->domain, pe_ctl->multiplexed, 01537 pe_ctl->inherit ) != PAPI_OK) { 01538 return PAPI_EPERM; 01539 } 01540 01541 pe_ctl->tid = option->attach.tid; 01542 01543 /* If events have been already been added, something may */ 01544 /* have been done to the kernel, so update */ 01545 ret = _papi_pe_update_control_state( pe_ctl, NULL, 01546 pe_ctl->num_events, pe_ctx); 01547 01548 return ret; 01549 01550 case PAPI_DETACH: 01551 pe_ctl = ( pe_control_t *) ( option->attach.ESI->ctl_state ); 01552 01553 pe_ctl->tid = 0; 01554 return PAPI_OK; 01555 01556 case PAPI_CPU_ATTACH: 01557 pe_ctl = ( pe_control_t *) ( option->cpu.ESI->ctl_state ); 01558 if (check_permissions( pe_ctl->tid, option->cpu.cpu_num, 01559 pe_ctl->domain, pe_ctl->multiplexed, 01560 pe_ctl->inherit ) != PAPI_OK) { 01561 return PAPI_EPERM; 01562 } 01563 /* looks like we are allowed so set cpu number */ 01564 01565 /* this tells the kernel not to count for a thread */ 01566 /* should we warn if we try to set both? perf_event */ 01567 /* will reject it. 
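 * (Per perf_event_open(2): pid == -1 with cpu >= 0 measures the chosen
 * CPU system-wide, pid >= 0 with cpu == -1 follows one task on any CPU,
 * and pid == -1 together with cpu == -1 is rejected with EINVAL.)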
*/ 01568 pe_ctl->tid = -1; 01569 01570 pe_ctl->cpu = option->cpu.cpu_num; 01571 01572 return PAPI_OK; 01573 01574 case PAPI_DOMAIN: 01575 pe_ctl = ( pe_control_t *) ( option->domain.ESI->ctl_state ); 01576 if (check_permissions( pe_ctl->tid, pe_ctl->cpu, 01577 option->domain.domain, pe_ctl->multiplexed, 01578 pe_ctl->inherit ) != PAPI_OK) { 01579 return PAPI_EPERM; 01580 } 01581 /* looks like we are allowed, so set counting domain */ 01582 return _papi_pe_set_domain( pe_ctl, option->domain.domain ); 01583 01584 case PAPI_GRANUL: 01585 pe_ctl = (pe_control_t *) ( option->granularity.ESI->ctl_state ); 01586 01587 /* FIXME: we really don't support this yet */ 01588 01589 switch ( option->granularity.granularity ) { 01590 case PAPI_GRN_PROCG: 01591 case PAPI_GRN_SYS: 01592 case PAPI_GRN_SYS_CPU: 01593 case PAPI_GRN_PROC: 01594 return PAPI_ECMP; 01595 01596 /* Currently we only support thread granularity */ 01597 case PAPI_GRN_THR: 01598 break; 01599 01600 default: 01601 return PAPI_EINVAL; 01602 } 01603 return PAPI_OK; 01604 01605 case PAPI_INHERIT: 01606 pe_ctl = (pe_control_t *) ( option->inherit.ESI->ctl_state ); 01607 if (check_permissions( pe_ctl->tid, pe_ctl->cpu, pe_ctl->domain, 01608 pe_ctl->multiplexed, 01609 option->inherit.inherit ) != PAPI_OK) { 01610 return PAPI_EPERM; 01611 } 01612 /* looks like we are allowed, so set the requested inheritance */ 01613 if (option->inherit.inherit) { 01614 /* children will inherit counters */ 01615 pe_ctl->inherit = 1; 01616 } else { 01617 /* children won't inherit counters */ 01618 pe_ctl->inherit = 0; 01619 } 01620 return PAPI_OK; 01621 01622 case PAPI_DATA_ADDRESS: 01623 return PAPI_ENOSUPP; 01624 #if 0 01625 pe_ctl = (pe_control_t *) (option->address_range.ESI->ctl_state); 01626 ret = set_default_domain( pe_ctl, option->address_range.domain ); 01627 if ( ret != PAPI_OK ) { 01628 return ret; 01629 } 01630 set_drange( pe_ctx, pe_ctl, option ); 01631 return PAPI_OK; 01632 #endif 01633 case PAPI_INSTR_ADDRESS: 01634 return PAPI_ENOSUPP; 01635 #if 0 01636 pe_ctl = (pe_control_t *) (option->address_range.ESI->ctl_state); 01637 ret = set_default_domain( pe_ctl, option->address_range.domain ); 01638 if ( ret != PAPI_OK ) { 01639 return ret; 01640 } 01641 set_irange( pe_ctx, pe_ctl, option ); 01642 return PAPI_OK; 01643 #endif 01644 01645 case PAPI_DEF_ITIMER: 01646 /* What should we be checking for here? */ 01647 /* This seems like it should be OS-specific not component */ 01648 /* specific. */ 01649 01650 return PAPI_OK; 01651 01652 case PAPI_DEF_MPX_NS: 01653 /* Defining a given ns per set is not current supported */ 01654 return PAPI_ENOSUPP; 01655 01656 case PAPI_DEF_ITIMER_NS: 01657 /* We don't support this... 
*/ 01658 return PAPI_OK; 01659 01660 default: 01661 return PAPI_ENOSUPP; 01662 } 01663 } 01664 01665 01666 /* 01667 * This function is used when hardware overflows are working or when 01668 * software overflows are forced 01669 */ 01670 01671 static void 01672 _papi_pe_dispatch_timer( int n, hwd_siginfo_t *info, void *uc ) 01673 { 01674 ( void ) n; /*unused */ 01675 _papi_hwi_context_t hw_context; 01676 int found_evt_idx = -1, fd = info->si_fd; 01677 caddr_t address; 01678 ThreadInfo_t *thread = _papi_hwi_lookup_thread( 0 ); 01679 int cidx = _papi_pe_vector.cmp_info.CmpIdx; 01680 int i; 01681 pe_control_t *ctl; 01682 01683 if ( thread == NULL ) { 01684 PAPIERROR( "thread == NULL in _papi_pe_dispatch_timer for fd %d!", fd ); 01685 return; 01686 } 01687 01688 if ( thread->running_eventset[cidx] == NULL ) { 01689 PAPIERROR( "thread->running_eventset == NULL in " 01690 "_papi_pe_dispatch_timer for fd %d!",fd ); 01691 return; 01692 } 01693 01694 if ( thread->running_eventset[cidx]->overflow.flags == 0 ) { 01695 PAPIERROR( "thread->running_eventset->overflow.flags == 0 in " 01696 "_papi_pe_dispatch_timer for fd %d!", fd ); 01697 return; 01698 } 01699 01700 hw_context.si = info; 01701 hw_context.ucontext = ( hwd_ucontext_t * ) uc; 01702 01703 if ( thread->running_eventset[cidx]->overflow.flags & 01704 PAPI_OVERFLOW_FORCE_SW ) { 01705 address = GET_OVERFLOW_ADDRESS( hw_context ); 01706 _papi_hwi_dispatch_overflow_signal( ( void * ) &hw_context, 01707 address, NULL, 0, 01708 0, &thread, cidx ); 01709 return; 01710 } 01711 01712 if ( thread->running_eventset[cidx]->overflow.flags != 01713 PAPI_OVERFLOW_HARDWARE ) { 01714 PAPIERROR( "thread->running_eventset->overflow.flags is set to " 01715 "something other than PAPI_OVERFLOW_HARDWARE or " 01716 "PAPI_OVERFLOW_FORCE_SW for fd %d (%x)", 01717 fd , thread->running_eventset[cidx]->overflow.flags); 01718 } 01719 01720 /* convoluted way to get ctl */ 01721 ctl= thread->running_eventset[cidx]->ctl_state; 01722 01723 /* See if the fd is one that's part of the this thread's context */ 01724 for( i=0; i < ctl->num_events; i++ ) { 01725 if ( fd == ctl->events[i].event_fd ) { 01726 found_evt_idx = i; 01727 break; 01728 } 01729 } 01730 01731 if ( found_evt_idx == -1 ) { 01732 PAPIERROR( "Unable to find fd %d among the open event fds " 01733 "_papi_hwi_dispatch_timer!", fd ); 01734 return; 01735 } 01736 01737 ioctl( fd, PERF_EVENT_IOC_DISABLE, NULL ); 01738 01739 if ( ( thread->running_eventset[cidx]->state & PAPI_PROFILING ) && 01740 !( thread->running_eventset[cidx]->profile.flags & 01741 PAPI_PROFIL_FORCE_SW ) ) { 01742 process_smpl_buf( found_evt_idx, &thread ); 01743 } 01744 else { 01745 uint64_t ip; 01746 unsigned int head; 01747 pe_event_info_t *pe = &(ctl->events[found_evt_idx]); 01748 unsigned char *data = ((unsigned char*)pe->mmap_buf) + getpagesize( ); 01749 01750 /* 01751 * Read up the most recent IP from the sample in the mmap buffer. To 01752 * do this, we make the assumption that all of the records in the 01753 * mmap buffer are the same size, and that they all contain the IP as 01754 * their only record element. This means that we can use the 01755 * data_head element from the user page and move backward one record 01756 * from that point and read the data. Since we don't actually need 01757 * to access the header of the record, we can just subtract 8 (size 01758 * of the IP) from data_head and read up that word from the mmap 01759 * buffer. 
After we subtract 8, we account for mmap buffer wrapping
 * by AND'ing this offset with the buffer mask.
 */
        head = mmap_read_head( pe );

        if ( head == 0 ) {
            PAPIERROR( "Attempting to access memory which may be inaccessible" );
            return;
        }

        ip = *( uint64_t * ) ( data + ( ( head - 8 ) & pe->mask ) );
        /*
         * Update the tail to the current head pointer.
         *
         * Note that if we were to read the record at the tail pointer,
         * rather than the one at the head (as you might otherwise think
         * would be natural), we could run into problems.  Signals don't
         * stack well on Linux, particularly if not using RT signals, and if
         * they come in rapidly enough, we can lose some.  Over time, the head
         * could catch up to the tail and monitoring would be stopped, and
         * since no more signals are coming in, this problem will never be
         * resolved, resulting in a complete loss of overflow notification
         * from that point on.  So the solution we use here will result in
         * only the most recent IP value being read every time there are two
         * or more samples in the buffer (for that one overflow signal).  But
         * the handler will always bring up the tail, so the head should
         * never run into the tail.
         */
        mmap_write_tail( pe, head );

        /*
         * The fourth parameter is supposed to be a vector of bits indicating
         * the overflowed hardware counters, but it's not really clear that
         * it's useful, because the actual hardware counters used are not
         * exposed to the PAPI user.  For now, I'm just going to set the bit
         * that indicates which event register in the array overflowed.  The
         * result is that the overflow vector will not be identical to the
         * perfmon implementation, and part of that is due to the fact that
         * which hardware register is actually being used is opaque at the
         * user level (the kernel event dispatcher hides that info).
         */

        _papi_hwi_dispatch_overflow_signal( ( void * ) &hw_context,
                                            ( caddr_t ) ( unsigned long ) ip,
                                            NULL, ( 1 << found_evt_idx ), 0,
                                            &thread, cidx );

    }

    /* Restart the counters */
    if ( ioctl( fd, PERF_EVENT_IOC_REFRESH, 1 ) == -1 ) {
        PAPIERROR( "overflow refresh failed", 0 );
    }
}

/* Stop profiling */
static int
_papi_pe_stop_profiling( ThreadInfo_t *thread, EventSetInfo_t *ESI )
{
    int i, ret = PAPI_OK;
    pe_control_t *ctl;

    ctl = ESI->ctl_state;

    /* Loop through all of the events and process those which have mmap */
    /* buffers attached.                                                 */
    for ( i = 0; i < ctl->num_events; i++ ) {
        /* Use the mmap_buf field as an indicator of this fd being used for */
        /* profiling.
/* Setup an event to cause overflow */
static int
_papi_pe_set_overflow( EventSetInfo_t *ESI, int EventIndex, int threshold )
{
   int cidx = _papi_pe_vector.cmp_info.CmpIdx;
   pe_context_t *ctx = ( pe_context_t * ) ( ESI->master->context[cidx] );
   pe_control_t *ctl = ( pe_control_t * ) ( ESI->ctl_state );
   int i, evt_idx, found_non_zero_sample_period = 0, retval = PAPI_OK;

   evt_idx = ESI->EventInfoArray[EventIndex].pos[0];

   SUBDBG( "Attempting to set overflow for index %d (%d) of EventSet %d\n",
	   evt_idx, EventIndex, ESI->EventSetIndex );

   if ( evt_idx < 0 ) {
      return PAPI_EINVAL;
   }

   if ( threshold == 0 ) {
      /* If this counter isn't set to overflow, it's an error */
      if ( ctl->events[evt_idx].attr.sample_period == 0 ) return PAPI_EINVAL;
   }

   ctl->events[evt_idx].attr.sample_period = threshold;

   /*
    * Note that the wakeup_mode field initially will be set to zero
    * (WAKEUP_MODE_COUNTER_OVERFLOW) as a result of memset'ing all of the
    * events in the ctl struct to zero.
    *
    * Is it even set to any other value elsewhere?
    */
   switch ( ctl->events[evt_idx].wakeup_mode ) {
   case WAKEUP_MODE_PROFILING:
      /* Setting wakeup_events to the special value zero means issue a */
      /* wakeup (signal) on every mmap page overflow.                  */
      ctl->events[evt_idx].attr.wakeup_events = 0;
      break;

   case WAKEUP_MODE_COUNTER_OVERFLOW:
      /* Can this code ever be called? */

      /* Setting wakeup_events to one means issue a wakeup on every */
      /* counter overflow (not mmap page overflow).                 */
      ctl->events[evt_idx].attr.wakeup_events = 1;
      /* We need the IP to pass to the overflow handler */
      ctl->events[evt_idx].attr.sample_type = PERF_SAMPLE_IP;
      /* one for the user page, and two to take IP samples */
      ctl->events[evt_idx].nr_mmap_pages = 1 + 2;
      break;
   default:
      PAPIERROR( "ctl->wakeup_mode[%d] set to an unknown value - %u",
		 evt_idx, ctl->events[evt_idx].wakeup_mode );
      return PAPI_EBUG;
   }

   /* Check whether any event in the set has a non-zero sample period */
   for ( i = 0; i < ctl->num_events; i++ ) {
      if ( ctl->events[i].attr.sample_period ) {
	 found_non_zero_sample_period = 1;
	 break;
      }
   }

   if ( found_non_zero_sample_period ) {
      /* turn on internal overflow flag for this event set */
      ctl->overflow = 1;

      /* Enable the signal handler */
      retval = _papi_hwi_start_signal(
		  _papi_pe_vector.cmp_info.hardware_intr_sig,
		  1, _papi_pe_vector.cmp_info.CmpIdx );
   } else {
      /* turn off internal overflow flag for this event set */
      ctl->overflow = 0;

      /* Remove the signal handler, if there are no remaining non-zero */
      /* sample_periods set                                            */
      retval = _papi_hwi_stop_signal(
		  _papi_pe_vector.cmp_info.hardware_intr_sig );
      if ( retval != PAPI_OK ) return retval;
   }

   retval = _papi_pe_update_control_state( ctl, NULL,
		( ( pe_control_t * ) ( ESI->ctl_state ) )->num_events,
		ctx );

   return retval;
}
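
/*
 * Illustrative sketch (not part of perf_events.c): the perf_event_attr
 * fields that _papi_pe_set_overflow() manipulates, shown in isolation on a
 * freshly opened event.  The event choice (retired instructions) and the
 * threshold are arbitrary example values.  Note that actually receiving a
 * signal on overflow additionally requires O_ASYNC/F_SETSIG/F_SETOWN
 * fcntl() setup on the fd, which is not shown here.
 */

#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int
example_open_overflowing_counter( long long threshold )
{
   struct perf_event_attr attr;
   int fd;

   memset( &attr, 0, sizeof ( attr ) );
   attr.size = sizeof ( attr );
   attr.type = PERF_TYPE_HARDWARE;
   attr.config = PERF_COUNT_HW_INSTRUCTIONS;

   attr.sample_period = threshold;	/* interrupt every 'threshold' events */
   attr.sample_type = PERF_SAMPLE_IP;	/* record the IP with each sample     */
   attr.wakeup_events = 1;		/* wake up on every counter overflow  */
   attr.disabled = 1;			/* arm it explicitly below            */

   /* measure the calling process on any CPU */
   fd = syscall( __NR_perf_event_open, &attr, 0, -1, -1, 0 );
   if ( fd < 0 ) return -1;

   /* PERF_EVENT_IOC_REFRESH enables the event for one more overflow period;
    * _papi_pe_dispatch_timer() issues the same ioctl to restart counting
    * after handling a signal. */
   ioctl( fd, PERF_EVENT_IOC_REFRESH, 1 );
   return fd;
}
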
/* Enable profiling */
static int
_papi_pe_set_profile( EventSetInfo_t *ESI, int EventIndex, int threshold )
{
   int ret;
   int evt_idx;
   pe_control_t *ctl = ( pe_control_t * ) ( ESI->ctl_state );

   /* Since you can't profile on a derived event, the event is always the */
   /* first and only event in the native event list.                      */
   evt_idx = ESI->EventInfoArray[EventIndex].pos[0];

   if ( threshold == 0 ) {
      SUBDBG( "MUNMAP(%p,%"PRIu64")\n", ctl->events[evt_idx].mmap_buf,
	      ( uint64_t ) ctl->events[evt_idx].nr_mmap_pages *
	      getpagesize( ) );

      if ( ctl->events[evt_idx].mmap_buf ) {
	 munmap( ctl->events[evt_idx].mmap_buf,
		 ctl->events[evt_idx].nr_mmap_pages * getpagesize( ) );
      }

      ctl->events[evt_idx].mmap_buf = NULL;
      ctl->events[evt_idx].nr_mmap_pages = 0;
      ctl->events[evt_idx].attr.sample_type &= ~PERF_SAMPLE_IP;
      ret = _papi_pe_set_overflow( ESI, EventIndex, threshold );
      /* ??? #warning "This should be handled somewhere else" */
      ESI->state &= ~( PAPI_OVERFLOWING );
      ESI->overflow.flags &= ~( PAPI_OVERFLOW_HARDWARE );

      return ret;
   }

   /* Look up the native event code */
   if ( ESI->profile.flags & ( PAPI_PROFIL_DATA_EAR | PAPI_PROFIL_INST_EAR ) ) {
      /* Not supported yet... */

      return PAPI_ENOSUPP;
   }

   if ( ESI->profile.flags & PAPI_PROFIL_RANDOM ) {
      /* This requires an ability to randomly alter the sample_period within */
      /* a given range.  The kernel does not have this ability. FIXME        */
      return PAPI_ENOSUPP;
   }

   /* Just a guess at how many pages would make this relatively efficient.  */
   /* Note that it's "1 +" because of the need for a control page, and the  */
   /* number following the "+" must be a power of 2 (1, 2, 4, 8, 16, etc.)  */
   /* or zero.  This is required to optimize dealing with circular buffer   */
   /* wrapping of the mapped pages.                                         */

   ctl->events[evt_idx].nr_mmap_pages = ( 1 + 8 );
   ctl->events[evt_idx].attr.sample_type |= PERF_SAMPLE_IP;

   ret = _papi_pe_set_overflow( ESI, EventIndex, threshold );
   if ( ret != PAPI_OK ) return ret;

   return PAPI_OK;
}
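
/*
 * Illustrative sketch (not part of perf_events.c): mapping a sample buffer
 * of one control page plus 2^n data pages, as the comment above requires,
 * and deriving the wrap-around mask used when indexing into it.  The helper
 * name and the caller's choice of data-page count are for illustration only.
 */

#include <stdint.h>
#include <unistd.h>
#include <sys/mman.h>

static void *
example_map_sample_buffer( int fd, unsigned int n_data_pages, uint64_t *mask )
{
   /* n_data_pages must be a power of two (or zero) */
   size_t len = ( 1 + n_data_pages ) * getpagesize( );
   void *buf = mmap( NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0 );

   if ( buf == MAP_FAILED ) return NULL;

   /* Offsets into the data area wrap with a simple AND because the data
    * size is a power of two. */
   *mask = ( uint64_t ) n_data_pages * getpagesize( ) - 1;
   return buf;
}
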
/* Our component vector */

papi_vector_t _papi_pe_vector = {
   .cmp_info = {
      /* component information (unspecified values initialized to 0) */
      .name = "perf_events",
      .short_name = "pe",
      .version = "5.0",
      .description = "Linux perf_event CPU counters",

      .default_domain = PAPI_DOM_USER,
      .available_domains = PAPI_DOM_USER | PAPI_DOM_KERNEL,
      .default_granularity = PAPI_GRN_THR,
      .available_granularities = PAPI_GRN_THR,

      .hardware_intr = 1,
      .kernel_profile = 1,
      .num_mpx_cntrs = PERF_EVENT_MAX_MPX_COUNTERS,

      /* component specific cmp_info initializations */
      .fast_virtual_timer = 0,
      .attach = 1,
      .attach_must_ptrace = 1,
      .cpu = 1,
      .inherit = 1,
      .cntr_umasks = 1,
   },

   /* sizes of framework-opaque component-private structures */
   .size = {
      .context = sizeof ( pe_context_t ),
      .control_state = sizeof ( pe_control_t ),
      .reg_value = sizeof ( int ),
      .reg_alloc = sizeof ( int ),
   },

   /* function pointers in this component */
   .init_control_state = _papi_pe_init_control_state,
   .start = _papi_pe_start,
   .stop = _papi_pe_stop,
   .read = _papi_pe_read,
   .shutdown_thread = _papi_pe_shutdown_thread,
   .shutdown_component = _papi_pe_shutdown_component,
   .ctl = _papi_pe_ctl,
   .update_control_state = _papi_pe_update_control_state,
   .set_domain = _papi_pe_set_domain,
   .reset = _papi_pe_reset,
   .set_overflow = _papi_pe_set_overflow,
   .set_profile = _papi_pe_set_profile,
   .stop_profiling = _papi_pe_stop_profiling,
   .init_component = _papi_pe_init_component,
   .dispatch_timer = _papi_pe_dispatch_timer,
   .write = _papi_pe_write,
   .init_thread = _papi_pe_init_thread,

   /* from counter name mapper */
   .ntv_enum_events = _papi_libpfm4_ntv_enum_events,
   .ntv_name_to_code = _papi_libpfm4_ntv_name_to_code,
   .ntv_code_to_name = _papi_libpfm4_ntv_code_to_name,
   .ntv_code_to_descr = _papi_libpfm4_ntv_code_to_descr,
   .ntv_code_to_info = _papi_libpfm4_ntv_code_to_info,
};
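
/*
 * Illustrative sketch (not part of perf_events.c): how an application
 * exercises the overflow path above through the public PAPI API.  The
 * event (PAPI_TOT_INS) and the threshold are arbitrary example values.
 */

#include <stdio.h>
#include <papi.h>

static void
example_overflow_handler( int EventSet, void *address,
			  long long overflow_vector, void *context )
{
   ( void ) EventSet; ( void ) overflow_vector; ( void ) context;
   /* 'address' arrives from _papi_pe_dispatch_timer() via
    * _papi_hwi_dispatch_overflow_signal(). */
   printf( "overflow at %p\n", address );
}

int
example_overflow_usage( void )
{
   int EventSet = PAPI_NULL;
   long long value;

   if ( PAPI_library_init( PAPI_VER_CURRENT ) != PAPI_VER_CURRENT ) return -1;
   if ( PAPI_create_eventset( &EventSet ) != PAPI_OK ) return -1;
   if ( PAPI_add_event( EventSet, PAPI_TOT_INS ) != PAPI_OK ) return -1;

   /* For this component, the call below ends up in _papi_pe_set_overflow(). */
   if ( PAPI_overflow( EventSet, PAPI_TOT_INS, 1000000, 0,
		       example_overflow_handler ) != PAPI_OK ) return -1;

   if ( PAPI_start( EventSet ) != PAPI_OK ) return -1;
   /* ... do work; the handler fires every ~1M retired instructions ... */
   if ( PAPI_stop( EventSet, &value ) != PAPI_OK ) return -1;
   return 0;
}
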