PAPI  5.3.0.0
perf_event.c
00001 /*
00002 * File:    perf_event.c
00003 *
00004 * Author:  Corey Ashford
00005 *          cjashfor@us.ibm.com
00006 *          - based upon perfmon.c written by -
00007 *          Philip Mucci
00008 *          mucci@cs.utk.edu
00009 * Mods:    Gary Mohr
00010 *          gary.mohr@bull.com
00011 * Mods:    Vince Weaver
00012 *          vweaver1@eecs.utk.edu
00013 * Mods:    Philip Mucci
00014 *      mucci@eecs.utk.edu */
00015 
00016 
00017 #include <fcntl.h>
00018 #include <string.h>
00019 #include <errno.h>
00020 #include <signal.h>
00021 #include <syscall.h>
00022 #include <sys/utsname.h>
00023 #include <sys/mman.h>
00024 #include <sys/ioctl.h>
00025 
00026 /* PAPI-specific includes */
00027 #include "papi.h"
00028 #include "papi_memory.h"
00029 #include "papi_internal.h"
00030 #include "papi_vector.h"
00031 #include "extras.h"
00032 
00033 /* libpfm4 includes */
00034 #include "papi_libpfm4_events.h"
00035 #include "pe_libpfm4_events.h"
00036 #include "perfmon/pfmlib.h"
00037 #include PEINCLUDE
00038 
00039 /* Linux-specific includes */
00040 #include "mb.h"
00041 #include "linux-memory.h"
00042 #include "linux-timer.h"
00043 #include "linux-common.h"
00044 #include "linux-context.h"
00045 
00046 #include "perf_event_lib.h"
00047 
00048 /* Defines for ctx->state */
00049 #define PERF_EVENTS_OPENED  0x01
00050 #define PERF_EVENTS_RUNNING 0x02
00051 
00052 /* Static globals */
00053 int nmi_watchdog_active;
00054 
00055 /* Forward declaration */
00056 papi_vector_t _perf_event_vector;
00057 
00058 /* Globals */
00059 struct native_event_table_t perf_native_event_table;
00060 int our_cidx;
00061 
00062 /* These sentinels tell _pe_set_overflow() how to set the */
00063 /* wakeup_events field in the event descriptor record.        */
00064 
00065 #define WAKEUP_COUNTER_OVERFLOW 0
00066 #define WAKEUP_PROFILING -1
00067 
00068 #define WAKEUP_MODE_COUNTER_OVERFLOW 0
00069 #define WAKEUP_MODE_PROFILING 1
00070 
00071 /* The kernel developers say to never use a refresh value of 0        */
00072 /* See https://lkml.org/lkml/2011/5/24/172                            */
00073 /* However, on some platforms (like Power) a value of 1 does not work */
00074 /* We're still tracking down why this happens.                        */
00075 
00076 #if defined(__powerpc__)
00077 #define PAPI_REFRESH_VALUE 0
00078 #else
00079 #define PAPI_REFRESH_VALUE 1
00080 #endif
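
/* Illustrative sketch (not part of the original file): PAPI_REFRESH_VALUE is
   the count handed to the PERF_EVENT_IOC_REFRESH ioctl when an overflowed
   event is re-armed (the real call lives in the overflow-handling code
   outside this excerpt).  A value of n re-enables the event for n more
   overflows.  The helper name below is hypothetical. */
static inline int example_rearm_event( int fd )
{
   /* fd is assumed to be an already-opened perf_event descriptor */
   return ioctl( fd, PERF_EVENT_IOC_REFRESH, PAPI_REFRESH_VALUE );
}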
00081 
00082 /* Check for processor support */
00083 /* Can be used for generic checking, though in general we only     */
00084 /* check for pentium4 here because support was broken for multiple */
00085 /* kernel releases and the usual standard detections did not       */
00086 /* handle this.  So we check for pentium 4 explicitly.             */
00087 static int
00088 processor_supported(int vendor, int family) {
00089 
00090    /* Error out if kernel too early to support p4 */
00091    if (( vendor == PAPI_VENDOR_INTEL ) && (family == 15)) {
00092       if (_papi_os_info.os_version < LINUX_VERSION(2,6,35)) {
00093          PAPIERROR("Pentium 4 not supported on kernels before 2.6.35");
00094          return PAPI_ENOSUPP;
00095       }
00096    }
00097    return PAPI_OK;
00098 }
00099 
00100 /* Fix up the config based on what CPU/Vendor we are running on */
00101 static int
00102 pe_vendor_fixups(papi_vector_t *vector)
00103 {
00104      /* powerpc */
00105      /* On IBM POWER6 machines the default domain should include supervisor */
00106   if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_IBM ) {
00107      vector->cmp_info.available_domains |=
00108                   PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR;
00109      if (strcmp(_papi_hwi_system_info.hw_info.model_string, "POWER6" ) == 0 ) {
00110         vector->cmp_info.default_domain =
00111                   PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR;
00112      }
00113   }
00114 
00115   if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_MIPS ) {
00116      vector->cmp_info.available_domains |= PAPI_DOM_KERNEL;
00117   }
00118 
00119   if ((_papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_INTEL) ||
00120       (_papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_AMD)) {
00121      vector->cmp_info.fast_real_timer = 1;
00122   }
00123      /* ARM */
00124   if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_ARM) {
00125      /* FIXME: this will change with Cortex A15 */
00126      vector->cmp_info.available_domains |=
00127             PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR;
00128      vector->cmp_info.default_domain =
00129             PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR;
00130   }
00131 
00132      /* CRAY */
00133   if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_CRAY ) {
00134     vector->cmp_info.available_domains |= PAPI_DOM_OTHER;
00135   }
00136 
00137   return PAPI_OK;
00138 }
00139 
00140 
00141 
00142 /******************************************************************/
00143 /******** Kernel Version Dependent Routines  **********************/
00144 /******************************************************************/
00145 
00146 /* KERNEL_CHECKS_SCHEDUABILITY_UPON_OPEN is a work-around for kernel arch
00147  * implementations (e.g. x86) which don't do a static event schedulability
00148  * check in sys_perf_event_open.
00149  * This was fixed for x86 in the 2.6.33 kernel
00150  *
00151  * Also! Kernels newer than 2.6.34 will fail in a similar way
00152  *       if the nmi_watchdog has stolen a performance counter
00153  *       and we try to use the maximum number of counters.
00154  *       A sys_perf_event_open() will seem to succeed but will fail
00155  *       at read time.  So re-use this work around code.
00156  */
00157 static int
00158 bug_check_scheduability(void) {
00159 
00160 #if defined(__powerpc__)
00161   /* PowerPC not affected by this bug */
00162 #elif defined(__mips__)
00163   /* MIPS as of kernel 3.1 does not properly detect schedulability */
00164   return 1;
00165 #else
00166   if (_papi_os_info.os_version < LINUX_VERSION(2,6,33)) return 1;
00167 #endif
00168 
00169   if (nmi_watchdog_active) return 1;
00170 
00171   return 0;
00172 }
00173 
00174 /* PERF_FORMAT_GROUP allows reading an entire group's counts at once   */
00175 /* before 2.6.34 PERF_FORMAT_GROUP did not work when reading results   */
00176 /*  from attached processes.  We are lazy and disable it for all cases */
00177 /*  commit was:  050735b08ca8a016bbace4445fa025b88fee770b              */
00178 
00179 static int
00180 bug_format_group(void) {
00181 
00182   if (_papi_os_info.os_version < LINUX_VERSION(2,6,34)) return 1;
00183 
00184   /* MIPS, as of version 3.1, does not support this properly */
00185 
00186 #if defined(__mips__)
00187   return 1;
00188 #endif
00189 
00190   return 0;
00191 
00192 }
00193 
00194 
00195 /* There's a bug prior to Linux 2.6.33 where if you are using */
00196 /* PERF_FORMAT_GROUP, the TOTAL_TIME_ENABLED and              */
00197 /* TOTAL_TIME_RUNNING fields will be zero unless you disable  */
00198 /* the counters first                                         */
00199 static int
00200 bug_sync_read(void) {
00201 
00202   if (_papi_os_info.os_version < LINUX_VERSION(2,6,33)) return 1;
00203 
00204   return 0;
00205 
00206 }
00207 
00208 
00209 /* Set the F_SETOWN_EX flag on the fd.                          */
00210 /* This affects which thread an overflow signal gets sent to    */
00211 /* Handled in a subroutine to handle the fact that the behavior */
00212 /* is dependent on kernel version.                              */
00213 static int
00214 fcntl_setown_fd(int fd) {
00215 
00216    int ret;
00217    struct f_owner_ex fown_ex;
00218 
00219       /* F_SETOWN_EX is not available until 2.6.32 */
00220    if (_papi_os_info.os_version < LINUX_VERSION(2,6,32)) {
00221 
00222       /* get ownership of the descriptor */
00223       ret = fcntl( fd, F_SETOWN, mygettid(  ) );
00224       if ( ret == -1 ) {
00225      PAPIERROR( "cannot fcntl(F_SETOWN) on %d: %s", fd, strerror(errno) );
00226      return PAPI_ESYS;
00227       }
00228    }
00229    else {
00230       /* set ownership of the descriptor */
00231       fown_ex.type = F_OWNER_TID;
00232       fown_ex.pid  = mygettid();
00233       ret = fcntl(fd, F_SETOWN_EX, (unsigned long)&fown_ex );
00234 
00235       if ( ret == -1 ) {
00236      PAPIERROR( "cannot fcntl(F_SETOWN_EX) on %d: %s",
00237             fd, strerror( errno ) );
00238      return PAPI_ESYS;
00239       }
00240    }
00241    return PAPI_OK;
00242 }
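
/* Illustrative sketch (not part of the original file): the receiving side of
   the F_SETOWN_EX setup above.  Together with the F_SETSIG call made in
   tune_up_fd() below, the kernel delivers the overflow signal to the chosen
   thread and fills in siginfo->si_fd, so a single handler can tell which
   event descriptor overflowed.  The names below are hypothetical; PAPI's
   real handler is installed elsewhere. */
static void example_overflow_handler( int signum, siginfo_t *info, void *uc )
{
   int overflowed_fd = info->si_fd;    /* filled in because of F_SETSIG      */
   ( void ) signum;
   ( void ) uc;
   ( void ) overflowed_fd;
   /* ... decode the sample from this fd's mmap buffer, then re-arm the      */
   /* event with PERF_EVENT_IOC_REFRESH as sketched after the defines near   */
   /* the top of this file ...                                               */
}

static int example_install_overflow_handler( int signum )
{
   struct sigaction sa;

   memset( &sa, 0, sizeof( sa ) );
   sa.sa_sigaction = example_overflow_handler;
   sa.sa_flags = SA_SIGINFO;            /* needed to receive si_fd           */
   sigemptyset( &sa.sa_mask );
   return sigaction( signum, &sa, NULL );
}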
00243 
00244 /* The read format on perf_event varies based on various flags that */
00245 /* are passed into it.  This helper avoids copying this logic       */
00246 /* multiple places.                                                 */
00247 static unsigned int
00248 get_read_format( unsigned int multiplex,
00249          unsigned int inherit,
00250          int format_group )
00251 {
00252    unsigned int format = 0;
00253 
00254    /* if we need read format options for multiplexing, add them now */
00255    if (multiplex) {
00256       format |= PERF_FORMAT_TOTAL_TIME_ENABLED;
00257       format |= PERF_FORMAT_TOTAL_TIME_RUNNING;
00258    }
00259 
00260    /* if our kernel supports it and we are not using inherit, */
00261    /* add the group read options                              */
00262    if ( (!bug_format_group()) && !inherit) {
00263       if (format_group) {
00264      format |= PERF_FORMAT_GROUP;
00265       }
00266    }
00267 
00268    SUBDBG("multiplex: %d, inherit: %d, group_leader: %d, format: %#x\n",
00269       multiplex, inherit, format_group, format);
00270 
00271    return format;
00272 }
00273 
00274 /*****************************************************************/
00275 /********* End Kernel-version Dependent Routines  ****************/
00276 /*****************************************************************/
00277 
00278 /*****************************************************************/
00279 /********* Begin perf_event low-level code ***********************/
00280 /*****************************************************************/
00281 
00282 /* In case headers aren't new enough to have __NR_perf_event_open */
00283 #ifndef __NR_perf_event_open
00284 
00285 #ifdef __powerpc__
00286 #define __NR_perf_event_open    319
00287 #elif defined(__x86_64__)
00288 #define __NR_perf_event_open    298
00289 #elif defined(__i386__)
00290 #define __NR_perf_event_open    336
00291 #elif defined(__arm__)
00292 #define __NR_perf_event_open    366+0x900000
00293 #endif
00294 
00295 #endif
00296 
00297 static long
00298 sys_perf_event_open( struct perf_event_attr *hw_event, pid_t pid, int cpu,
00299                        int group_fd, unsigned long flags )
00300 {
00301     int ret;
00302 
00303    SUBDBG("sys_perf_event_open(%p,%d,%d,%d,%lx)\n",hw_event,pid,cpu,group_fd,flags);
00304    SUBDBG("   type: %d\n",hw_event->type);
00305    SUBDBG("   size: %d\n",hw_event->size);
00306    SUBDBG("   config: %"PRIx64" (%"PRIu64")\n",hw_event->config,
00307       hw_event->config);
00308    SUBDBG("   sample_period: %"PRIu64"\n",hw_event->sample_period);
00309    SUBDBG("   sample_type: %"PRIu64"\n",hw_event->sample_type);
00310    SUBDBG("   read_format: %"PRIu64"\n",hw_event->read_format);
00311    SUBDBG("   disabled: %d\n",hw_event->disabled);
00312    SUBDBG("   inherit: %d\n",hw_event->inherit);
00313    SUBDBG("   pinned: %d\n",hw_event->pinned);
00314    SUBDBG("   exclusive: %d\n",hw_event->exclusive);
00315    SUBDBG("   exclude_user: %d\n",hw_event->exclude_user);
00316    SUBDBG("   exclude_kernel: %d\n",hw_event->exclude_kernel);
00317    SUBDBG("   exclude_hv: %d\n",hw_event->exclude_hv);
00318    SUBDBG("   exclude_idle: %d\n",hw_event->exclude_idle);
00319    SUBDBG("   mmap: %d\n",hw_event->mmap);
00320    SUBDBG("   comm: %d\n",hw_event->comm);
00321    SUBDBG("   freq: %d\n",hw_event->freq);
00322    SUBDBG("   inherit_stat: %d\n",hw_event->inherit_stat);
00323    SUBDBG("   enable_on_exec: %d\n",hw_event->enable_on_exec);
00324    SUBDBG("   task: %d\n",hw_event->task);
00325    SUBDBG("   watermark: %d\n",hw_event->watermark);
00326     ret =
00327         syscall( __NR_perf_event_open, hw_event, pid, cpu, group_fd, flags );
00328     SUBDBG("Returned %d %d %s\n",ret,
00329            ret<0?errno:0,
00330            ret<0?strerror(errno):" ");
00331     return ret;
00332 }
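
/* Illustrative sketch (not part of the original file): a minimal use of the
   wrapper above, counting user-space instructions on the calling thread.
   The function name is hypothetical and error handling is abbreviated. */
static long long example_count_own_instructions( void )
{
   struct perf_event_attr attr;
   long long count = -1;
   int fd;

   memset( &attr, 0, sizeof( attr ) );
   attr.type = PERF_TYPE_HARDWARE;
   attr.size = sizeof( attr );
   attr.config = PERF_COUNT_HW_INSTRUCTIONS;
   attr.disabled = 1;                  /* start stopped, enable explicitly   */
   attr.exclude_kernel = 1;            /* PAPI_DOM_USER only                 */

   /* pid=0: calling thread, cpu=-1: any cpu, group_fd=-1, flags=0 */
   fd = sys_perf_event_open( &attr, 0, -1, -1, 0 );
   if ( fd < 0 ) return -1;

   ioctl( fd, PERF_EVENT_IOC_ENABLE, NULL );
   /* ... code to be measured goes here ... */
   ioctl( fd, PERF_EVENT_IOC_DISABLE, NULL );

   if ( read( fd, &count, sizeof( count ) ) != sizeof( count ) ) {
      count = -1;
   }
   close( fd );
   return count;
}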
00333 
00334 
00335 static int map_perf_event_errors_to_papi(int perf_event_error) {
00336 
00337    int ret;
00338 
00339    /* These mappings are approximate.
00340       EINVAL in particular can mean lots of different things */
00341    switch(perf_event_error) {
00342       case EPERM:
00343       case EACCES:
00344            ret = PAPI_EPERM;
00345        break;
00346       case ENODEV:
00347       case EOPNOTSUPP:
00348        ret = PAPI_ENOSUPP;
00349            break;
00350       case ENOENT:
00351        ret = PAPI_ENOEVNT;
00352            break;
00353       case ENOSYS:
00354       case EAGAIN:
00355       case EBUSY:
00356       case E2BIG:
00357        ret = PAPI_ESYS;
00358        break;
00359       case ENOMEM:
00360        ret = PAPI_ENOMEM;
00361        break;
00362       case EINVAL:
00363       default:
00364        ret = PAPI_EINVAL;
00365            break;
00366    }
00367    return ret;
00368 }
00369 
00370 
00371 /*  Check if the current set of options is supported by  */
00372 /*  perf_events.                                         */
00373 /*  We do this by temporarily opening an event with the  */
00374 /*  desired options then closing it again.  We use the   */
00375 /*  PERF_COUNT_HW_INSTRUCTIONS event as a dummy event   */
00376 /*  on the assumption it is available on all             */
00377 /*  platforms.                                           */
00378 
00379 static int
00380 check_permissions( unsigned long tid,
00381            unsigned int cpu_num,
00382            unsigned int domain,
00383            unsigned int granularity,
00384            unsigned int multiplex,
00385            unsigned int inherit )
00386 {
00387    int ev_fd;
00388    struct perf_event_attr attr;
00389 
00390    long pid;
00391 
00392    /* clearing this sets the type to hardware and counts all domains */
00393    memset(&attr, '\0', sizeof(attr));
00394    attr.read_format = get_read_format(multiplex, inherit, 1);
00395 
00396    /* set the event id (config field) to instructions */
00397    /* (an event that should always exist)            */
00398    /* This was cycles but that is missing on Niagara */
00399    attr.config = PERF_COUNT_HW_INSTRUCTIONS;
00400 
00401    /* now set up domains this event set will be counting */
00402    if (!(domain & PAPI_DOM_SUPERVISOR)) {
00403       attr.exclude_hv = 1;
00404    }
00405    if (!(domain & PAPI_DOM_USER)) {
00406       attr.exclude_user = 1;
00407    }
00408    if (!(domain & PAPI_DOM_KERNEL)) {
00409       attr.exclude_kernel = 1;
00410    }
00411 
00412    if (granularity==PAPI_GRN_SYS) {
00413       pid = -1;
00414    } else {
00415       pid = tid;
00416    }
00417 
00418    SUBDBG("Calling sys_perf_event_open() from check_permissions\n");
00419 
00420    ev_fd = sys_perf_event_open( &attr, pid, cpu_num, -1, 0 );
00421    if ( ev_fd == -1 ) {
00422       SUBDBG("sys_perf_event_open returned error.  Linux says, %s", 
00423          strerror( errno ) );
00424       return map_perf_event_errors_to_papi(errno);
00425    }
00426 
00427    /* now close it, this was just to make sure we have permissions */
00428    /* to set these options                                         */
00429    close(ev_fd);
00430    return PAPI_OK;
00431 }
00432 
00433 /* Maximum size we ever expect to read from a perf_event fd   */
00434 /*  (this is the number of 64-bit values)                     */
00435 /* We use this to size the read buffers                       */
00436 /* The three is for event count, time_enabled, time_running   */
00437 /*  and the counter term is count value and count id for each */
00438 /*  possible counter value.                                   */
00439 #define READ_BUFFER_SIZE (3 + (2 * PERF_EVENT_MAX_MPX_COUNTERS))
00440 
00441 
00442 
00443 /* KERNEL_CHECKS_SCHEDUABILITY_UPON_OPEN is a work-around for kernel arch */
00444 /* implementations (e.g. x86 before 2.6.33) which don't do a static event */
00445 /* schedulability check in sys_perf_event_open.  It is also needed if the */
00446 /* kernel is stealing an event, such as when NMI watchdog is enabled.     */
00447 
00448 static int
00449 check_scheduability( pe_context_t *ctx, pe_control_t *ctl, int idx )
00450 {
00451    int retval = 0, cnt = -1;
00452    ( void ) ctx;             /*unused */
00453    long long papi_pe_buffer[READ_BUFFER_SIZE];
00454    int i,group_leader_fd;
00455 
00456    if (bug_check_scheduability()) {
00457 
00458       /* If the kernel isn't tracking schedulability right      */
00459       /* Then we need to start/stop/read to force the event     */
00460       /* to be scheduled and see if an error condition happens. */
00461 
00462       /* get the proper fd to start */
00463       group_leader_fd=ctl->events[idx].group_leader_fd;
00464       if (group_leader_fd==-1) group_leader_fd=ctl->events[idx].event_fd;
00465 
00466       /* start the event */
00467       retval = ioctl( group_leader_fd, PERF_EVENT_IOC_ENABLE, NULL );
00468       if (retval == -1) {
00469      PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) failed.\n");
00470      return PAPI_ESYS;
00471       }
00472 
00473       /* stop the event */
00474       retval = ioctl(group_leader_fd, PERF_EVENT_IOC_DISABLE, NULL );
00475       if (retval == -1) {
00476      PAPIERROR( "ioctl(PERF_EVENT_IOC_DISABLE) failed.\n" );
00477      return PAPI_ESYS;
00478       }
00479 
00480       /* See if a read returns any results */
00481       cnt = read( group_leader_fd, papi_pe_buffer, sizeof(papi_pe_buffer));
00482       if ( cnt == -1 ) {
00483      SUBDBG( "read returned an error!  Should never happen.\n" );
00484      return PAPI_ESYS;
00485       }
00486 
00487       if ( cnt == 0 ) {
00488          /* We read 0 bytes if we could not schedule the event */
00489          /* The kernel should have detected this at open       */
00490          /* but various bugs (including NMI watchdog)          */
00491          /* result in this behavior                            */
00492 
00493      return PAPI_ECNFLCT;
00494 
00495      } else {
00496 
00497     /* Reset all of the counters (opened so far) back to zero      */
00498     /* from the above brief enable/disable call pair.              */
00499 
00500     /* We have to reset all events because reset of group leader      */
00501         /* does not reset all.                                            */
00502     /* we assume that the events are being added one by one and that  */
00503         /* we do not need to reset higher events (doing so may reset ones */
00504         /* that have not been initialized yet.                            */
00505 
00506     /* Note... PERF_EVENT_IOC_RESET does not reset time running       */
00507     /* info if multiplexing, so we should avoid coming here if        */
00508     /* we are multiplexing the event.                                 */
00509         for( i = 0; i < idx; i++) {
00510        retval=ioctl( ctl->events[i].event_fd, PERF_EVENT_IOC_RESET, NULL );
00511        if (retval == -1) {
00512           PAPIERROR( "ioctl(PERF_EVENT_IOC_RESET) #%d/%d %d "
00513              "(fd %d)failed.\n",
00514              i,ctl->num_events,idx,ctl->events[i].event_fd);
00515           return PAPI_ESYS;
00516        }
00517     }
00518       }
00519    }
00520    return PAPI_OK;
00521 }
00522 
00523 
00524 /* Do some extra work on a perf_event fd if we're doing sampling  */
00525 /* This mostly means setting up the mmap buffer.                  */
00526 static int
00527 tune_up_fd( pe_control_t *ctl, int evt_idx )
00528 {
00529    int ret;
00530    void *buf_addr;
00531    int fd = ctl->events[evt_idx].event_fd;
00532 
00533    /* Register that we would like a SIGIO notification when a mmap'd page */
00534    /* becomes full.                                                       */
00535    ret = fcntl( fd, F_SETFL, O_ASYNC | O_NONBLOCK );
00536    if ( ret ) {
00537       PAPIERROR ( "fcntl(%d, F_SETFL, O_ASYNC | O_NONBLOCK) "
00538           "returned error: %s", fd, strerror( errno ) );
00539       return PAPI_ESYS;
00540    }
00541 
00542    /* Set the F_SETOWN_EX flag on the fd.                          */
00543    /* This affects which thread an overflow signal gets sent to.   */
00544    ret=fcntl_setown_fd(fd);
00545    if (ret!=PAPI_OK) return ret;
00546 
00547    /* Set FD_CLOEXEC.  Otherwise if we do an exec with an overflow */
00548    /* running, the overflow handler will continue into the exec()'d*/
00549    /* process and kill it because no signal handler is set up.     */
00550    ret=fcntl(fd, F_SETFD, FD_CLOEXEC);
00551    if (ret) {
00552       return PAPI_ESYS;
00553    }
00554 
00555    /* when you explicitly declare that you want a particular signal,   */
00556    /* even if you use the default signal, the kernel will send more    */
00557    /* information concerning the event to the signal handler.          */
00558    /*                                                                  */
00559    /* In particular, it will send the file descriptor from which the   */
00560    /* event is originating which can be quite useful when monitoring   */
00561    /* multiple tasks from a single thread.                             */
00562    ret = fcntl( fd, F_SETSIG, ctl->overflow_signal );
00563    if ( ret == -1 ) {
00564       PAPIERROR( "cannot fcntl(F_SETSIG,%d) on %d: %s",
00565          ctl->overflow_signal, fd,
00566          strerror( errno ) );
00567       return PAPI_ESYS;
00568    }
00569 
00570    /* mmap() the sample buffer */
00571    buf_addr = mmap( NULL, ctl->events[evt_idx].nr_mmap_pages * getpagesize(),
00572             PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0 );
00573    if ( buf_addr == MAP_FAILED ) {
00574       PAPIERROR( "mmap(NULL,%d,%d,%d,%d,0): %s",
00575          ctl->events[evt_idx].nr_mmap_pages * getpagesize(  ), 
00576          PROT_READ, MAP_SHARED, fd, strerror( errno ) );
00577       return ( PAPI_ESYS );
00578    }
00579 
00580    SUBDBG( "Sample buffer for fd %d is located at %p\n", fd, buf_addr );
00581 
00582    /* Set up the mmap buffer and its associated helpers */
00583    ctl->events[evt_idx].mmap_buf = (struct perf_counter_mmap_page *) buf_addr;
00584    ctl->events[evt_idx].tail = 0;
00585    ctl->events[evt_idx].mask = ( ctl->events[evt_idx].nr_mmap_pages - 1 ) *
00586                                getpagesize() - 1;
00587 
00588    return PAPI_OK;
00589 }
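
/* Illustrative sketch (not part of the original file): consuming the ring
   buffer that tune_up_fd() just mapped.  The first page holds the
   perf_event_mmap_page metadata; sample records follow, each starting with a
   struct perf_event_header.  PAPI's real record parsing lives in the
   profiling/overflow code outside this excerpt; the helper name is
   hypothetical, wrap-around of records at the end of the buffer is ignored,
   and rmb() is assumed to come from the mb.h included above. */
static void example_drain_mmap_buffer( struct perf_event_mmap_page *meta,
                                        long long *sw_tail,
                                        long long mask )
{
   char *data = ( ( char * ) meta ) + getpagesize();  /* data area follows   */
                                                      /* the metadata page   */
   long long head;

   head = meta->data_head;
   rmb();                              /* pairs with the kernel's barrier    */

   while ( *sw_tail < head ) {
      struct perf_event_header *hdr = ( struct perf_event_header * )
                                      ( data + ( *sw_tail & mask ) );
      /* hdr->type is PERF_RECORD_SAMPLE, PERF_RECORD_MMAP, ...;             */
      /* hdr->size bytes of payload would be decoded here.                   */
      *sw_tail += hdr->size;
   }
}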
00590 
00591 
00592 
00593 /* Open all events in the control state */
00594 static int
00595 open_pe_events( pe_context_t *ctx, pe_control_t *ctl )
00596 {
00597 
00598    int i, ret = PAPI_OK;
00599    long pid;
00600 
00601    if (ctl->granularity==PAPI_GRN_SYS) {
00602       pid = -1;
00603    }
00604    else {
00605       pid = ctl->tid;
00606    }
00607 
00608    for( i = 0; i < ctl->num_events; i++ ) {
00609 
00610       ctl->events[i].event_opened=0;
00611 
00612       /* set up the attr structure.  We don't set up all fields here */
00613       /* as some have already been set up previously.                */
00614 
00615       /* group leader (event 0) is special                */
00616       /* If we're multiplexed, everyone is a group leader */
00617       if (( i == 0 ) || (ctl->multiplexed)) {
00618          ctl->events[i].attr.pinned = !ctl->multiplexed;
00619      ctl->events[i].attr.disabled = 1;
00620      ctl->events[i].group_leader_fd=-1;
00621          ctl->events[i].attr.read_format = get_read_format(ctl->multiplexed, 
00622                                ctl->inherit, 
00623                                !ctl->multiplexed );
00624       } else {
00625      ctl->events[i].attr.pinned=0;
00626      ctl->events[i].attr.disabled = 0;
00627      ctl->events[i].group_leader_fd=ctl->events[0].event_fd;
00628          ctl->events[i].attr.read_format = get_read_format(ctl->multiplexed, 
00629                                ctl->inherit, 
00630                                0 );
00631       }
00632 
00633 
00634       /* try to open */
00635       ctl->events[i].event_fd = sys_perf_event_open( &ctl->events[i].attr, 
00636                              pid,
00637                              ctl->cpu,
00638                    ctl->events[i].group_leader_fd,
00639                              0 /* flags */
00640                              );
00641 
00642             /* Try to match Linux errors to PAPI errors */
00643       if ( ctl->events[i].event_fd == -1 ) {
00644      SUBDBG("sys_perf_event_open returned error on event #%d."
00645         "  Error: %s\n",
00646         i, strerror( errno ) );
00647          ret=map_perf_event_errors_to_papi(errno);
00648 
00649      goto open_pe_cleanup;
00650       }
00651 
00652       SUBDBG ("sys_perf_event_open: tid: %ld, cpu_num: %d,"
00653               " group_leader/fd: %d, event_fd: %d,"
00654               " read_format: 0x%"PRIu64"\n",
00655           pid, ctl->cpu, ctl->events[i].group_leader_fd, 
00656           ctl->events[i].event_fd, ctl->events[i].attr.read_format);
00657 
00658 
00659       /* in many situations the kernel will indicate we opened fine */
00660       /* yet things will fail later.  So we need to double check    */
00661       /* we actually can use the events we've set up.               */
00662 
00663       /* This is not necessary if we are multiplexing, and in fact */
00664       /* we cannot do this properly if multiplexed because         */
00665       /* PERF_EVENT_IOC_RESET does not reset the time running info */
00666       if (!ctl->multiplexed) {
00667      ret = check_scheduability( ctx, ctl, i );
00668 
00669          if ( ret != PAPI_OK ) {
00670         /* the last event did open, so we need to bump the counter */
00671         /* before doing the cleanup                                */
00672         i++;
00673             goto open_pe_cleanup;
00674      }
00675       }
00676       ctl->events[i].event_opened=1;
00677    }
00678 
00679    /* Now that we've successfully opened all of the events, do whatever  */
00680    /* "tune-up" is needed to attach the mmap'd buffers, signal handlers, */
00681    /* and so on.                                                         */
00682    for ( i = 0; i < ctl->num_events; i++ ) {
00683 
00684       /* If sampling is enabled, hook up signal handler */
00685       if ( ctl->events[i].attr.sample_period ) {
00686      ret = tune_up_fd( ctl, i );
00687      if ( ret != PAPI_OK ) {
00688         /* All of the fds are open, so we need to clean up all of them */
00689         i = ctl->num_events;
00690         goto open_pe_cleanup;
00691      }
00692       } else {
00693      /* Make sure this is NULL so close_pe_events works right */
00694      ctl->events[i].mmap_buf = NULL;
00695       }
00696    }
00697 
00698    /* Mark the events as opened only if completely successful */
00699    ctx->state |= PERF_EVENTS_OPENED;
00700 
00701    return PAPI_OK;
00702 
00703 open_pe_cleanup:
00704    /* We encountered an error, close up the fds we successfully opened.  */
00705    /* We go backward in an attempt to close group leaders last, although */
00706    /* that's probably not strictly necessary.                            */
00707    while ( i > 0 ) {
00708       i--;
00709       if (ctl->events[i].event_fd>=0) {
00710      close( ctl->events[i].event_fd );
00711      ctl->events[i].event_opened=0;
00712       }
00713    }
00714 
00715    return ret;
00716 }
00717 
00718 /* Close all of the opened events */
00719 static int
00720 close_pe_events( pe_context_t *ctx, pe_control_t *ctl )
00721 {
00722    int i;
00723    int num_closed=0;
00724    int events_not_opened=0;
00725 
00726    /* should this be a more serious error? */
00727    if ( ctx->state & PERF_EVENTS_RUNNING ) {
00728       SUBDBG("Closing without stopping first\n");
00729    }
00730 
00731    /* Close child events first */
00732    for( i=0; i<ctl->num_events; i++ ) {
00733 
00734       if (ctl->events[i].event_opened) {
00735 
00736          if (ctl->events[i].group_leader_fd!=-1) {
00737             if ( ctl->events[i].mmap_buf ) {
00738            if ( munmap ( ctl->events[i].mmap_buf,
00739                      ctl->events[i].nr_mmap_pages * getpagesize() ) ) {
00740               PAPIERROR( "munmap of fd = %d returned error: %s",
00741                  ctl->events[i].event_fd, strerror( errno ) );
00742               return PAPI_ESYS;
00743            }
00744         }
00745 
00746             if ( close( ctl->events[i].event_fd ) ) {
00747            PAPIERROR( "close of fd = %d returned error: %s",
00748                ctl->events[i].event_fd, strerror( errno ) );
00749            return PAPI_ESYS;
00750         } else {
00751            num_closed++;
00752         }
00753         ctl->events[i].event_opened=0;
00754      }
00755       }
00756       else {
00757     events_not_opened++;
00758       }
00759    }
00760 
00761    /* Close the group leaders last */
00762    for( i=0; i<ctl->num_events; i++ ) {
00763 
00764       if (ctl->events[i].event_opened) {
00765 
00766          if (ctl->events[i].group_leader_fd==-1) {
00767             if ( ctl->events[i].mmap_buf ) {
00768            if ( munmap ( ctl->events[i].mmap_buf,
00769                      ctl->events[i].nr_mmap_pages * getpagesize() ) ) {
00770               PAPIERROR( "munmap of fd = %d returned error: %s",
00771                  ctl->events[i].event_fd, strerror( errno ) );
00772               return PAPI_ESYS;
00773            }
00774         }
00775 
00776 
00777             if ( close( ctl->events[i].event_fd ) ) {
00778            PAPIERROR( "close of fd = %d returned error: %s",
00779                ctl->events[i].event_fd, strerror( errno ) );
00780            return PAPI_ESYS;
00781         } else {
00782            num_closed++;
00783         }
00784         ctl->events[i].event_opened=0;
00785      }
00786       }
00787    }
00788 
00789 
00790    if (ctl->num_events!=num_closed) {
00791       if (ctl->num_events!=(num_closed+events_not_opened)) {
00792          PAPIERROR("Didn't close all events: "
00793            "Closed %d Not Opened: %d Expected %d\n",
00794            num_closed,events_not_opened,ctl->num_events);
00795          return PAPI_EBUG;
00796       }
00797    }
00798 
00799    ctl->num_events=0;
00800 
00801    ctx->state &= ~PERF_EVENTS_OPENED;
00802 
00803    return PAPI_OK;
00804 }
00805 
00806 
00807 /********************************************************************/
00808 /********************************************************************/
00809 /*     Functions that are exported via the component interface      */
00810 /********************************************************************/
00811 /********************************************************************/
00812 
00813 
00814 /* set the domain. FIXME: perf_events allows per-event control of this. */
00815 /* we do not handle that yet.                                           */
00816 int
00817 _pe_set_domain( hwd_control_state_t *ctl, int domain)
00818 {
00819 
00820    int i;
00821    pe_control_t *pe_ctl = ( pe_control_t *) ctl;
00822 
00823    SUBDBG("old control domain %d, new domain %d\n",
00824       pe_ctl->domain,domain);
00825 
00826    pe_ctl->domain = domain;
00827 
00828    /* Force the domain on all events */
00829    for( i = 0; i < pe_ctl->num_events; i++ ) {
00830       pe_ctl->events[i].attr.exclude_user =
00831                     !( pe_ctl->domain & PAPI_DOM_USER );
00832       pe_ctl->events[i].attr.exclude_kernel =
00833             !( pe_ctl->domain & PAPI_DOM_KERNEL );
00834       pe_ctl->events[i].attr.exclude_hv =
00835             !( pe_ctl->domain & PAPI_DOM_SUPERVISOR );
00836    }
00837    return PAPI_OK;
00838 }
00839 
00840 /* Shutdown a thread */
00841 int
00842 _pe_shutdown_thread( hwd_context_t *ctx )
00843 {
00844     pe_context_t *pe_ctx = ( pe_context_t *) ctx;
00845 
00846     pe_ctx->initialized=0;
00847 
00848     return PAPI_OK;
00849 }
00850 
00851 
00852 /* reset the hardware counters */
00853 /* Note: PAPI_reset() does not necessarily call this */
00854 /* unless the events are actually running.           */
00855 int
00856 _pe_reset( hwd_context_t *ctx, hwd_control_state_t *ctl )
00857 {
00858    int i, ret;
00859    pe_control_t *pe_ctl = ( pe_control_t *) ctl;
00860 
00861    ( void ) ctx;             /*unused */
00862 
00863    /* We need to reset all of the events, not just the group leaders */
00864    for( i = 0; i < pe_ctl->num_events; i++ ) {
00865       ret = ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_RESET, NULL );
00866       if ( ret == -1 ) {
00867      PAPIERROR("ioctl(%d, PERF_EVENT_IOC_RESET, NULL) "
00868            "returned error, Linux says: %s",
00869            pe_ctl->events[i].event_fd, strerror( errno ) );
00870      return PAPI_ESYS;
00871       }
00872    }
00873 
00874    return PAPI_OK;
00875 }
00876 
00877 
00878 /* write (set) the hardware counters */
00879 /* Currently we do not support this. */
00880 int
00881 _pe_write( hwd_context_t *ctx, hwd_control_state_t *ctl,
00882         long long *from )
00883 {
00884    ( void ) ctx;             /*unused */
00885    ( void ) ctl;             /*unused */
00886    ( void ) from;            /*unused */
00887    /*
00888     * Counters cannot be written.  Do we need to virtualize the
00889     * counters so that they can be written, or perhaps modify code so that
00890     * they can be written? FIXME ?
00891     */
00892 
00893     return PAPI_ENOSUPP;
00894 }
00895 
00896 /*
00897  * perf_event provides a complicated read interface.
00898  *  the info returned by read() varies depending on whether
00899  *  you have PERF_FORMAT_GROUP, PERF_FORMAT_TOTAL_TIME_ENABLED,
00900  *  PERF_FORMAT_TOTAL_TIME_RUNNING, or PERF_FORMAT_ID set
00901  *
00902  * To simplify things we just always ask for everything.  This might
00903  * lead to overhead when reading more than we need, but it makes the
00904  * read code a lot simpler than the original implementation we had here.
00905  *
00906  * For more info on the layout see include/linux/perf_event.h
00907  *
00908  */
00909 
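
/* Illustrative sketch (not part of the original file): the two read() layouts
   that _pe_read() below decodes out of papi_pe_buffer[].  Which one applies
   depends on the read_format selected in get_read_format().  Field order
   follows the perf_event ABI; the struct names are hypothetical. */
#if 0
struct example_read_single {        /* one fd per event (multiplexed case)    */
   uint64_t value;                  /* papi_pe_buffer[0]                       */
   uint64_t time_enabled;           /* papi_pe_buffer[1], TOTAL_TIME_ENABLED   */
   uint64_t time_running;           /* papi_pe_buffer[2], TOTAL_TIME_RUNNING   */
};

struct example_read_group {         /* PERF_FORMAT_GROUP: one read for all     */
   uint64_t nr;                     /* papi_pe_buffer[0], == num_events        */
   uint64_t values[1];              /* papi_pe_buffer[1..nr], one per event    */
};
#endif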
00910 int
00911 _pe_read( hwd_context_t *ctx, hwd_control_state_t *ctl,
00912            long long **events, int flags )
00913 {
00914    ( void ) flags;           /*unused */
00915    int i, ret = -1;
00916    pe_context_t *pe_ctx = ( pe_context_t *) ctx;
00917    pe_control_t *pe_ctl = ( pe_control_t *) ctl;
00918    long long papi_pe_buffer[READ_BUFFER_SIZE];
00919    long long tot_time_running, tot_time_enabled, scale;
00920 
00921    /* On kernels before 2.6.33 the TOTAL_TIME_ENABLED and TOTAL_TIME_RUNNING */
00922    /* fields are always 0 unless the counter is disabled.  So if we are on   */
00923    /* one of these kernels, then we must disable events before reading.      */
00924 
00925    /* Elsewhere though we disable multiplexing on kernels before 2.6.34 */
00926    /* so maybe this isn't even necessary.                               */
00927 
00928    if (bug_sync_read()) {
00929       if ( pe_ctx->state & PERF_EVENTS_RUNNING ) {
00930          for ( i = 0; i < pe_ctl->num_events; i++ ) {
00931         /* disable only the group leaders */
00932         if ( pe_ctl->events[i].group_leader_fd == -1 ) {
00933            ret = ioctl( pe_ctl->events[i].event_fd, 
00934                PERF_EVENT_IOC_DISABLE, NULL );
00935            if ( ret == -1 ) {
00936               PAPIERROR("ioctl(PERF_EVENT_IOC_DISABLE) "
00937                "returned an error: %s", strerror( errno ));
00938               return PAPI_ESYS;
00939            }
00940         }
00941      }
00942       }
00943    }
00944 
00945 
00946    /* Handle case where we are multiplexing */
00947    if (pe_ctl->multiplexed) {
00948 
00949       /* currently we handle multiplexing by having individual events */
00950       /* so we read from each in turn.                                */
00951 
00952       for ( i = 0; i < pe_ctl->num_events; i++ ) {
00953 
00954          ret = read( pe_ctl->events[i].event_fd, papi_pe_buffer,
00955             sizeof ( papi_pe_buffer ) );
00956          if ( ret == -1 ) {
00957         PAPIERROR("read returned an error: %s", strerror( errno ));
00958         return PAPI_ESYS;
00959      }
00960 
00961      /* We should read 3 64-bit values from the counter */
00962      if (ret<(signed)(3*sizeof(long long))) {
00963         PAPIERROR("Error!  short read!\n");
00964         return PAPI_ESYS;
00965      }
00966 
00967          SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n",
00968             pe_ctl->events[i].event_fd,
00969         (long)pe_ctl->tid, pe_ctl->cpu, ret);
00970          SUBDBG("read: %lld %lld %lld\n",papi_pe_buffer[0],
00971             papi_pe_buffer[1],papi_pe_buffer[2]);
00972 
00973          tot_time_enabled = papi_pe_buffer[1];
00974          tot_time_running = papi_pe_buffer[2];
00975 
00976          SUBDBG("count[%d] = (papi_pe_buffer[%d] %lld * "
00977         "tot_time_enabled %lld) / tot_time_running %lld\n",
00978         i, 0,papi_pe_buffer[0],
00979         tot_time_enabled,tot_time_running);
00980 
00981          if (tot_time_running == tot_time_enabled) {
00982         /* No scaling needed */
00983         pe_ctl->counts[i] = papi_pe_buffer[0];
00984          } else if (tot_time_running && tot_time_enabled) {
00985         /* Scale factor of 100 to avoid overflows when computing */
00986         /*enabled/running */
00987 
00988         scale = (tot_time_enabled * 100LL) / tot_time_running;
00989         scale = scale * papi_pe_buffer[0];
00990         scale = scale / 100LL;
00991         pe_ctl->counts[i] = scale;
00992      } else {
00993        /* This should not happen, but Phil reports it sometimes does. */
00994         SUBDBG("perf_event kernel bug(?) count, enabled, "
00995            "running: %lld, %lld, %lld\n",
00996            papi_pe_buffer[0],tot_time_enabled,
00997            tot_time_running);
00998 
00999         pe_ctl->counts[i] = papi_pe_buffer[0];
01000      }
01001       }
01002    }
01003 
01004    /* Handle cases where we cannot use FORMAT GROUP */
01005    else if (bug_format_group() || pe_ctl->inherit) {
01006 
01007       /* we must read each counter individually */
01008       for ( i = 0; i < pe_ctl->num_events; i++ ) {
01009 
01010          ret = read( pe_ctl->events[i].event_fd, papi_pe_buffer,
01011             sizeof ( papi_pe_buffer ) );
01012          if ( ret == -1 ) {
01013         PAPIERROR("read returned an error: %s", strerror( errno ));
01014         return PAPI_ESYS;
01015      }
01016 
01017      /* we should read one 64-bit value from each counter */
01018      if (ret!=sizeof(long long)) {
01019         PAPIERROR("Error!  short read!\n");
01020         PAPIERROR("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n",
01021            pe_ctl->events[i].event_fd,
01022            (long)pe_ctl->tid, pe_ctl->cpu, ret);
01023         return PAPI_ESYS;
01024      }
01025 
01026          SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n",
01027             pe_ctl->events[i].event_fd, (long)pe_ctl->tid,
01028         pe_ctl->cpu, ret);
01029          SUBDBG("read: %lld\n",papi_pe_buffer[0]);
01030 
01031      pe_ctl->counts[i] = papi_pe_buffer[0];
01032       }
01033    }
01034 
01035 
01036    /* Handle cases where we are using FORMAT_GROUP   */
01037    /* We assume only one group leader, in position 0 */
01038 
01039    else {
01040       if (pe_ctl->events[0].group_leader_fd!=-1) {
01041      PAPIERROR("Was expecting group leader!\n");
01042       }
01043 
01044       ret = read( pe_ctl->events[0].event_fd, papi_pe_buffer,
01045           sizeof ( papi_pe_buffer ) );
01046 
01047       if ( ret == -1 ) {
01048      PAPIERROR("read returned an error: %s", strerror( errno ));
01049      return PAPI_ESYS;
01050       }
01051 
01052       /* we read 1 64-bit value (number of events) then     */
01053       /* num_events more 64-bit values that hold the counts */
01054       if (ret<(signed)((1+pe_ctl->num_events)*sizeof(long long))) {
01055      PAPIERROR("Error! short read!\n");
01056      return PAPI_ESYS;
01057       }
01058 
01059       SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n",
01060          pe_ctl->events[0].event_fd,
01061          (long)pe_ctl->tid, pe_ctl->cpu, ret);
01062       {
01063      int j;
01064      for(j=0;j<ret/8;j++) {
01065             SUBDBG("read %d: %lld\n",j,papi_pe_buffer[j]);
01066      }
01067       }
01068 
01069       /* Make sure the kernel agrees with how many events we have */
01070       if (papi_pe_buffer[0]!=pe_ctl->num_events) {
01071      PAPIERROR("Error!  Wrong number of events!\n");
01072      return PAPI_ESYS;
01073       }
01074 
01075       /* put the count values in their proper location */
01076       for(i=0;i<papi_pe_buffer[0];i++) {
01077          pe_ctl->counts[i] = papi_pe_buffer[1+i];
01078       }
01079    }
01080 
01081 
01082    /* If we disabled the counters due to the sync_read_bug(), */
01083    /* then we need to re-enable them now.                     */
01084    if (bug_sync_read()) {
01085       if ( pe_ctx->state & PERF_EVENTS_RUNNING ) {
01086          for ( i = 0; i < pe_ctl->num_events; i++ ) {
01087         if ( pe_ctl->events[i].group_leader_fd == -1 ) {
01088            /* this should refresh any overflow counters too */
01089            ret = ioctl( pe_ctl->events[i].event_fd,
01090                 PERF_EVENT_IOC_ENABLE, NULL );
01091            if ( ret == -1 ) {
01092               /* Should never happen */
01093               PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) returned an error: %s",
01094                 strerror( errno ));
01095               return PAPI_ESYS;
01096            }
01097         }
01098      }
01099       }
01100    }
01101 
01102    /* point PAPI to the values we read */
01103    *events = pe_ctl->counts;
01104 
01105    return PAPI_OK;
01106 }
01107 
01108 /* Start counting events */
01109 int
01110 _pe_start( hwd_context_t *ctx, hwd_control_state_t *ctl )
01111 {
01112    int ret;
01113    int i;
01114    int did_something = 0;
01115    pe_context_t *pe_ctx = ( pe_context_t *) ctx;
01116    pe_control_t *pe_ctl = ( pe_control_t *) ctl;
01117 
01118    /* Reset the counters first.  Is this necessary? */
01119    ret = _pe_reset( pe_ctx, pe_ctl );
01120    if ( ret ) {
01121       return ret;
01122    }
01123 
01124    /* Enable all of the group leaders                */
01125    /* All group leaders have a group_leader_fd of -1 */
01126    for( i = 0; i < pe_ctl->num_events; i++ ) {
01127       if (pe_ctl->events[i].group_leader_fd == -1) {
01128      SUBDBG("ioctl(enable): fd: %d\n", pe_ctl->events[i].event_fd);
01129      ret=ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_ENABLE, NULL) ;
01130 
01131      /* ioctls always return -1 on failure */
01132          if (ret == -1) {
01133             PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) failed.\n");
01134             return PAPI_ESYS;
01135      }
01136 
01137      did_something++;
01138       }
01139    }
01140 
01141    if (!did_something) {
01142       PAPIERROR("Did not enable any counters.\n");
01143       return PAPI_EBUG;
01144    }
01145 
01146    pe_ctx->state |= PERF_EVENTS_RUNNING;
01147 
01148    return PAPI_OK;
01149 
01150 }
01151 
01152 /* Stop all of the counters */
01153 int
01154 _pe_stop( hwd_context_t *ctx, hwd_control_state_t *ctl )
01155 {
01156 
01157    int ret;
01158    int i;
01159    pe_context_t *pe_ctx = ( pe_context_t *) ctx;
01160    pe_control_t *pe_ctl = ( pe_control_t *) ctl;
01161 
01162    /* Just disable the group leaders */
01163    for ( i = 0; i < pe_ctl->num_events; i++ ) {
01164       if ( pe_ctl->events[i].group_leader_fd == -1 ) {
01165      ret=ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_DISABLE, NULL);
01166      if ( ret == -1 ) {
01167         PAPIERROR( "ioctl(%d, PERF_EVENT_IOC_DISABLE, NULL) "
01168                "returned error, Linux says: %s",
01169                pe_ctl->events[i].event_fd, strerror( errno ) );
01170         return PAPI_EBUG;
01171      }
01172       }
01173    }
01174 
01175    pe_ctx->state &= ~PERF_EVENTS_RUNNING;
01176 
01177    return PAPI_OK;
01178 }
01179 
01180 /* This function clears the current contents of the control structure and
01181    updates it with whatever resources are allocated for all the native events
01182    in the native info structure array. */
01183 
01184 int
01185 _pe_update_control_state( hwd_control_state_t *ctl,
01186                    NativeInfo_t *native,
01187                    int count, hwd_context_t *ctx )
01188 {
01189    int i = 0, ret;
01190    pe_context_t *pe_ctx = ( pe_context_t *) ctx;
01191    pe_control_t *pe_ctl = ( pe_control_t *) ctl;
01192 
01193    /* close all of the existing fds and start over again */
01194    /* In theory we could have finer-grained control and know if             */
01195    /* things were changed, but it's easier to tear things down and rebuild. */
01196    close_pe_events( pe_ctx, pe_ctl );
01197 
01198    /* Calling with count==0 should be OK, it's how things are deallocated */
01199    /* when an eventset is destroyed.                                      */
01200    if ( count == 0 ) {
01201       SUBDBG( "Called with count == 0\n" );
01202       return PAPI_OK;
01203    }
01204 
01205    /* set up all the events */
01206    for( i = 0; i < count; i++ ) {
01207       if ( native ) {
01208      /* Have libpfm4 set the config values for the event */
01209      ret=_pe_libpfm4_setup_counters(&pe_ctl->events[i].attr,
01210                     native[i].ni_event,
01211                     pe_ctx->event_table);
01212      SUBDBG( "pe_ctl->events[%d].config=%"PRIx64"\n",i,
01213          pe_ctl->events[i].attr.config);
01214      if (ret!=PAPI_OK) return ret;
01215 
01216       } else {
01217       /* I'm not sure how we'd end up in this case */
01218           /* should it be an error?                    */
01219       }
01220 
01221       /* Copy the inherit flag into the attribute block that will be   */
01222       /* passed to the kernel */
01223       pe_ctl->events[i].attr.inherit = pe_ctl->inherit;
01224 
01225       /* Set the position in the native structure */
01226       /* We just set up events linearly           */
01227       if ( native ) {
01228      native[i].ni_position = i;
01229       }
01230    }
01231 
01232    pe_ctl->num_events = count;
01233    _pe_set_domain( ctl, pe_ctl->domain );
01234 
01235    /* actually open the events */
01236    /* (why is this a separate function?) */
01237    ret = open_pe_events( pe_ctx, pe_ctl );
01238    if ( ret != PAPI_OK ) {
01239       SUBDBG("open_pe_events failed\n");
01240       /* Restore values ? */
01241       return ret;
01242    }
01243 
01244    return PAPI_OK;
01245 }
01246 
01247 /* Set various options on a control state */
01248 int
01249 _pe_ctl( hwd_context_t *ctx, int code, _papi_int_option_t *option )
01250 {
01251    int ret;
01252    pe_context_t *pe_ctx = ( pe_context_t *) ctx;
01253    pe_control_t *pe_ctl = NULL;
01254 
01255    switch ( code ) {
01256       case PAPI_MULTIPLEX:
01257        pe_ctl = ( pe_control_t * ) ( option->multiplex.ESI->ctl_state );
01258        ret = check_permissions( pe_ctl->tid, pe_ctl->cpu, pe_ctl->domain,
01259                     pe_ctl->granularity,
01260                     1, pe_ctl->inherit );
01261            if (ret != PAPI_OK) {
01262           return ret;
01263        }
01264 
01265        /* looks like we are allowed, so set multiplexed attribute */
01266        pe_ctl->multiplexed = 1;
01267        ret = _pe_update_control_state( pe_ctl, NULL,
01268                         pe_ctl->num_events, pe_ctx );
01269        if (ret != PAPI_OK) {
01270           pe_ctl->multiplexed = 0;
01271        }
01272        return ret;
01273 
01274       case PAPI_ATTACH:
01275        pe_ctl = ( pe_control_t * ) ( option->attach.ESI->ctl_state );
01276        ret = check_permissions( option->attach.tid, pe_ctl->cpu,
01277                   pe_ctl->domain, pe_ctl->granularity,
01278                   pe_ctl->multiplexed,
01279                     pe_ctl->inherit );
01280        if (ret != PAPI_OK) {
01281           return ret;
01282        }
01283 
01284        pe_ctl->tid = option->attach.tid;
01285 
01286        /* If events have already been added, something may */
01287        /* have been done to the kernel, so update */
01288        ret =_pe_update_control_state( pe_ctl, NULL,
01289                         pe_ctl->num_events, pe_ctx);
01290 
01291        return ret;
01292 
01293       case PAPI_DETACH:
01294        pe_ctl = ( pe_control_t *) ( option->attach.ESI->ctl_state );
01295 
01296        pe_ctl->tid = 0;
01297        return PAPI_OK;
01298 
01299       case PAPI_CPU_ATTACH:
01300        pe_ctl = ( pe_control_t *) ( option->cpu.ESI->ctl_state );
01301        ret = check_permissions( pe_ctl->tid, option->cpu.cpu_num,
01302                     pe_ctl->domain, pe_ctl->granularity,
01303                     pe_ctl->multiplexed,
01304                     pe_ctl->inherit );
01305            if (ret != PAPI_OK) {
01306            return ret;
01307        }
01308        /* looks like we are allowed so set cpu number */
01309 
01310        /* this tells the kernel not to count for a thread   */
01311        /* should we warn if we try to set both?  perf_event */
01312        /* will reject it.                                   */
01313        pe_ctl->tid = -1;
01314 
01315        pe_ctl->cpu = option->cpu.cpu_num;
01316 
01317        return PAPI_OK;
01318 
01319       case PAPI_DOMAIN:
01320        pe_ctl = ( pe_control_t *) ( option->domain.ESI->ctl_state );
01321        ret = check_permissions( pe_ctl->tid, pe_ctl->cpu,
01322                     option->domain.domain,
01323                     pe_ctl->granularity,
01324                     pe_ctl->multiplexed,
01325                     pe_ctl->inherit );
01326            if (ret != PAPI_OK) {
01327           return ret;
01328        }
01329        /* looks like we are allowed, so set counting domain */
01330        return _pe_set_domain( pe_ctl, option->domain.domain );
01331 
01332       case PAPI_GRANUL:
01333        pe_ctl = (pe_control_t *) ( option->granularity.ESI->ctl_state );
01334 
01335        /* FIXME: we really don't support this yet */
01336 
01337            switch ( option->granularity.granularity  ) {
01338               case PAPI_GRN_PROCG:
01339               case PAPI_GRN_SYS_CPU:
01340               case PAPI_GRN_PROC:
01341            return PAPI_ECMP;
01342 
01343           /* Currently we only support thread and CPU granularity */
01344               case PAPI_GRN_SYS:
01345            pe_ctl->granularity=PAPI_GRN_SYS;
01346            break;
01347 
01348               case PAPI_GRN_THR:
01349            pe_ctl->granularity=PAPI_GRN_THR;
01350            break;
01351 
01352 
01353               default:
01354            return PAPI_EINVAL;
01355        }
01356            return PAPI_OK;
01357 
01358       case PAPI_INHERIT:
01359        pe_ctl = (pe_control_t *) ( option->inherit.ESI->ctl_state );
01360        ret = check_permissions( pe_ctl->tid, pe_ctl->cpu, pe_ctl->domain,
01361                   pe_ctl->granularity, pe_ctl->multiplexed,
01362                     option->inherit.inherit );
01363            if (ret != PAPI_OK) {
01364           return ret;
01365        }
01366        /* looks like we are allowed, so set the requested inheritance */
01367        if (option->inherit.inherit) {
01368           /* children will inherit counters */
01369           pe_ctl->inherit = 1;
01370        } else {
01371           /* children won't inherit counters */
01372           pe_ctl->inherit = 0;
01373        }
01374        return PAPI_OK;
01375 
01376       case PAPI_DATA_ADDRESS:
01377        return PAPI_ENOSUPP;
01378 #if 0
01379        pe_ctl = (pe_control_t *) (option->address_range.ESI->ctl_state);
01380        ret = set_default_domain( pe_ctl, option->address_range.domain );
01381        if ( ret != PAPI_OK ) {
01382           return ret;
01383        }
01384        set_drange( pe_ctx, pe_ctl, option );
01385        return PAPI_OK;
01386 #endif
01387       case PAPI_INSTR_ADDRESS:
01388        return PAPI_ENOSUPP;
01389 #if 0
01390        pe_ctl = (pe_control_t *) (option->address_range.ESI->ctl_state);
01391        ret = set_default_domain( pe_ctl, option->address_range.domain );
01392        if ( ret != PAPI_OK ) {
01393           return ret;
01394        }
01395        set_irange( pe_ctx, pe_ctl, option );
01396        return PAPI_OK;
01397 #endif
01398 
01399       case PAPI_DEF_ITIMER:
01400        /* What should we be checking for here?                   */
01401        /* This seems like it should be OS-specific not component */
01402        /* specific.                                              */
01403 
01404        return PAPI_OK;
01405 
01406       case PAPI_DEF_MPX_NS:
01407        /* Defining a given ns per set is not currently supported */
01408        return PAPI_ENOSUPP;
01409 
01410       case PAPI_DEF_ITIMER_NS:
01411        /* We don't support this... */
01412        return PAPI_OK;
01413 
01414       default:
01415        return PAPI_ENOSUPP;
01416    }
01417 }
01418 
01419 /* Initialize a thread */
01420 int
01421 _pe_init_thread( hwd_context_t *hwd_ctx )
01422 {
01423 
01424   pe_context_t *pe_ctx = ( pe_context_t *) hwd_ctx;
01425 
01426   /* clear the context structure and mark as initialized */
01427   memset( pe_ctx, 0, sizeof ( pe_context_t ) );
01428   pe_ctx->initialized=1;
01429   pe_ctx->event_table=&perf_native_event_table;
01430   pe_ctx->cidx=our_cidx;
01431 
01432   return PAPI_OK;
01433 }
01434 
01435 /* Initialize a new control state */
01436 int
01437 _pe_init_control_state( hwd_control_state_t *ctl )
01438 {
01439   pe_control_t *pe_ctl = ( pe_control_t *) ctl;
01440 
01441   /* clear the contents */
01442   memset( pe_ctl, 0, sizeof ( pe_control_t ) );
01443 
01444   /* Set the domain */
01445   _pe_set_domain( ctl, _perf_event_vector.cmp_info.default_domain );    
01446 
01447   /* default granularity */
01448   pe_ctl->granularity= _perf_event_vector.cmp_info.default_granularity;
01449 
01450   /* overflow signal */
01451   pe_ctl->overflow_signal=_perf_event_vector.cmp_info.hardware_intr_sig;
01452 
01453   pe_ctl->cidx=our_cidx;
01454 
01455   /* Set cpu number in the control block to show events */
01456   /* are not tied to specific cpu                       */
01457   pe_ctl->cpu = -1;
01458   return PAPI_OK;
01459 }
01460 
01461 /* Check the mmap page for rdpmc support */
01462 static int _pe_detect_rdpmc(int default_domain) {
01463 
01464   struct perf_event_attr pe;
01465   int fd,rdpmc_exists=1;
01466   void *addr;
01467   struct perf_event_mmap_page *our_mmap;
01468 
01469   /* Create a fake instructions event so we can read a mmap page */
01470   memset(&pe,0,sizeof(struct perf_event_attr));
01471 
01472   pe.type=PERF_TYPE_HARDWARE;
01473   pe.size=sizeof(struct perf_event_attr);
01474   pe.config=PERF_COUNT_HW_INSTRUCTIONS;
01475 
01476   /* There should probably be a helper function to handle this      */
01477   /* we break on some ARM because there is no support for excluding */
01478   /* kernel.                                                        */
01479   if (default_domain & PAPI_DOM_KERNEL ) {
01480   }
01481   else {
01482     pe.exclude_kernel=1;
01483   }
01484   fd=sys_perf_event_open(&pe,0,-1,-1,0);
01485   if (fd<0) {
01486     return PAPI_ESYS;
01487   }
01488 
01489   /* create the mmap page */
01490   addr=mmap(NULL, 4096, PROT_READ, MAP_SHARED,fd,0);
01491   if (addr == (void *)(-1)) {
01492     close(fd);
01493     return PAPI_ESYS;
01494   }
01495 
01496   /* get the rdpmc info */
01497   our_mmap=(struct perf_event_mmap_page *)addr;
01498   if (our_mmap->cap_usr_rdpmc==0) {
01499     rdpmc_exists=0;
01500   }
01501 
01502   /* close the fake event */
01503   munmap(addr,4096);
01504   close(fd);
01505 
01506   return rdpmc_exists;
01507 
01508 }
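
/* Illustrative sketch (not part of the original file): how a counter could be
   read from user space when cap_usr_rdpmc is set, using the lock/index/offset
   fields of the same mmap page inspected above.  x86-only; the function names
   are hypothetical and the sign-extension of narrow counters to pmc_width is
   omitted for brevity. */
#if defined(__x86_64__) || defined(__i386__)
static inline unsigned long long example_rdpmc( unsigned int counter )
{
   unsigned int low, high;
   __asm__ __volatile__( "rdpmc" : "=a" ( low ), "=d" ( high ) : "c" ( counter ) );
   return ( ( unsigned long long ) high << 32 ) | low;
}

static unsigned long long example_fast_read( struct perf_event_mmap_page *pc )
{
   unsigned int seq, idx;
   unsigned long long count;

   do {
      seq = pc->lock;                /* seqlock: retry if the kernel updates  */
      rmb();
      idx = pc->index;               /* hardware counter number + 1, 0 = none */
      count = pc->offset;            /* kernel-maintained base value          */
      if ( idx ) {
         count += example_rdpmc( idx - 1 );
      }
      rmb();
   } while ( pc->lock != seq );

   return count;
}
#endif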
01509 
01510 
01511 /* Initialize the perf_event component */
01512 int
01513 _pe_init_component( int cidx )
01514 {
01515 
01516   int retval;
01517   int paranoid_level;
01518 
01519   FILE *fff;
01520 
01521   our_cidx=cidx;
01522 
01523   /* This is the official way to detect if perf_event support exists */
01524   /* The file is called perf_counter_paranoid on 2.6.31             */
01525   /* currently we are lazy and do not support 2.6.31 kernels        */
01526   fff=fopen("/proc/sys/kernel/perf_event_paranoid","r");
01527   if (fff==NULL) {
01528     strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
01529         "perf_event support not detected",PAPI_MAX_STR_LEN);
01530     return PAPI_ENOCMP;
01531   }
01532 
01533   /* 2 means no kernel measurements allowed   */
01534   /* 1 means normal counter access            */
01535   /* 0 means you can access CPU-specific data */
01536   /* -1 means no restrictions                 */
01537   retval=fscanf(fff,"%d",&paranoid_level);
01538   if (retval!=1) fprintf(stderr,"Error reading paranoid level\n");
01539   fclose(fff);
01540 
01541   if ((paranoid_level==2) && (getuid()!=0)) {
01542      SUBDBG("/proc/sys/kernel/perf_event_paranoid prohibits kernel counts");
01543      _papi_hwd[cidx]->cmp_info.available_domains &=~PAPI_DOM_KERNEL;
01544   }
01545 
01546   /* Detect NMI watchdog which can steal counters */
01547   nmi_watchdog_active=_linux_detect_nmi_watchdog();
01548   if (nmi_watchdog_active) {
01549     SUBDBG("The Linux nmi_watchdog is using one of the performance "
01550        "counters, reducing the total number available.\n");
01551   }
01552   /* Kernel multiplexing is broken prior to kernel 2.6.34 */
01553   /* The fix was probably git commit:                     */
01554   /*     45e16a6834b6af098702e5ea6c9a40de42ff77d8         */
01555   if (_papi_os_info.os_version < LINUX_VERSION(2,6,34)) {
01556     _papi_hwd[cidx]->cmp_info.kernel_multiplex = 0;
01557     _papi_hwd[cidx]->cmp_info.num_mpx_cntrs = PAPI_MAX_SW_MPX_EVENTS;
01558   }
01559   else {
01560     _papi_hwd[cidx]->cmp_info.kernel_multiplex = 1;
01561     _papi_hwd[cidx]->cmp_info.num_mpx_cntrs = PERF_EVENT_MAX_MPX_COUNTERS;
01562   }
01563 
01564   /* Check that processor is supported */
01565   if (processor_supported(_papi_hwi_system_info.hw_info.vendor,
01566               _papi_hwi_system_info.hw_info.cpuid_family)!=
01567       PAPI_OK) {
01568     fprintf(stderr,"warning, your processor is unsupported\n");
01569     /* should not return error, as software events should still work */
01570   }
01571 
01572   /* Setup mmtimers, if appropriate */
01573   retval=mmtimer_setup();
01574   if (retval) {
01575     strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
01576         "Error initializing mmtimer",PAPI_MAX_STR_LEN);
01577     return retval;
01578   }
01579 
01580    /* Set the overflow signal */
01581    _papi_hwd[cidx]->cmp_info.hardware_intr_sig = SIGRTMIN + 2;
01582 
01583    /* Run Vendor-specific fixups */
01584    pe_vendor_fixups(_papi_hwd[cidx]);
01585 
01586    /* Detect if we can use rdpmc (or equivalent) */
01587    /* We currently do not use rdpmc as it is slower in tests */
01588    /* than regular read (as of Linux 3.5)                    */
01589    retval=_pe_detect_rdpmc(_papi_hwd[cidx]->cmp_info.default_domain);
01590    if (retval < 0 ) {
01591       strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
01592         "sys_perf_event_open() failed, perf_event support for this platform may be broken",PAPI_MAX_STR_LEN);
01593 
01594        return retval;
01595     }
01596    _papi_hwd[cidx]->cmp_info.fast_counter_read = retval;
01597 
01598    /* Run the libpfm4-specific setup */
01599    retval = _papi_libpfm4_init(_papi_hwd[cidx]);
01600    if (retval) {
01601      strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
01602          "Error initializing libpfm4",PAPI_MAX_STR_LEN);
01603      return retval;
01604    }
01605 
01606    retval = _pe_libpfm4_init(_papi_hwd[cidx], cidx,
01607                    &perf_native_event_table,
01608                                PMU_TYPE_CORE | PMU_TYPE_OS);
01609    if (retval) {
01610      strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
01611          "Error initializing libpfm4",PAPI_MAX_STR_LEN);
01612      return retval;
01613    }
01614 
01615    return PAPI_OK;
01616 
01617 }
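
/* For reference, the disabled_reason strings set above surface through    */
/* the public API.  A minimal (hypothetical) caller sketch for checking    */
/* whether the perf_event component initialized; the loop and messages     */
/* are illustrative only:                                                  */
#if 0
#include <stdio.h>
#include <string.h>
#include "papi.h"

static void
example_check_perf_event_component( void )
{
  int i;
  const PAPI_component_info_t *cmp;

  PAPI_library_init( PAPI_VER_CURRENT );
  for( i = 0; i < PAPI_num_components(); i++ ) {
    cmp = PAPI_get_component_info( i );
    if ( strcmp( cmp->name, "perf_event" ) == 0 ) {
      if ( cmp->disabled ) {
        printf( "perf_event disabled: %s\n", cmp->disabled_reason );
      }
      else {
        printf( "perf_event available, %d counters\n", cmp->num_cntrs );
      }
    }
  }
}
#endif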
01618 
01619 /* Shutdown the perf_event component */
01620 int
01621 _pe_shutdown_component( void ) {
01622 
01623   /* deallocate our event table */
01624   _pe_libpfm4_shutdown(&perf_native_event_table);
01625 
01626   /* Shutdown libpfm4 */
01627   _papi_libpfm4_shutdown();
01628 
01629   return PAPI_OK;
01630 }
01631 
01632 
01633 
01634 
01635 int
01636 _pe_ntv_enum_events( unsigned int *PapiEventCode, int modifier )
01637 {
01638   return _pe_libpfm4_ntv_enum_events(PapiEventCode, modifier,
01639                                        &perf_native_event_table);
01640 }
01641 
01642 int
01643 _pe_ntv_name_to_code( char *name, unsigned int *event_code) {
01644   return _pe_libpfm4_ntv_name_to_code(name,event_code,
01645                                         &perf_native_event_table);
01646 }
01647 
01648 int
01649 _pe_ntv_code_to_name(unsigned int EventCode,
01650                           char *ntv_name, int len) {
01651    return _pe_libpfm4_ntv_code_to_name(EventCode,
01652                                          ntv_name, len, 
01653                     &perf_native_event_table);
01654 }
01655 
01656 int
01657 _pe_ntv_code_to_descr( unsigned int EventCode,
01658                             char *ntv_descr, int len) {
01659 
01660    return _pe_libpfm4_ntv_code_to_descr(EventCode,ntv_descr,len,
01661                                           &perf_native_event_table);
01662 }
01663 
01664 int
01665 _pe_ntv_code_to_info(unsigned int EventCode,
01666                           PAPI_event_info_t *info) {
01667 
01668   return _pe_libpfm4_ntv_code_to_info(EventCode, info,
01669                                         &perf_native_event_table);
01670 }
01671 
01672 /* These functions are based on builtin-record.c in the  */
01673 /* kernel's tools/perf directory.                        */
01674 
01675 static uint64_t
01676 mmap_read_head( pe_event_info_t *pe )
01677 {
01678   struct perf_event_mmap_page *pc = pe->mmap_buf;
01679   int head;
01680 
01681   if ( pc == NULL ) {
01682     PAPIERROR( "perf_event_mmap_page is NULL" );
01683     return 0;
01684   }
01685 
01686   head = pc->data_head;
01687   rmb(  );
01688 
01689   return head;
01690 }
01691 
01692 static void
01693 mmap_write_tail( pe_event_info_t *pe, uint64_t tail )
01694 {
01695   struct perf_event_mmap_page *pc = pe->mmap_buf;
01696 
01697   /* ensure all reads are done before we write the tail out. */
01698   pc->data_tail = tail;
01699 }
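
/* For reference, the two helpers above are the consumer half of the       */
/* perf_event ring-buffer protocol: read data_head (with a read barrier),  */
/* consume records from the data pages, then publish data_tail so the      */
/* kernel can reuse the space.  A minimal sketch of that loop, assuming    */
/* no record straddles the end of the buffer (mmap_read() below handles    */
/* the wrapped case):                                                      */
#if 0
static void
example_consume( pe_event_info_t *pe,
                 void (*handle)( struct perf_event_header *hdr ) )
{
  /* data pages start one page past the control page */
  unsigned char *data = ((unsigned char *)pe->mmap_buf) + getpagesize();
  uint64_t head = mmap_read_head( pe );     /* data_head plus rmb()        */
  uint64_t tail = pe->tail;

  while ( tail != head ) {
    struct perf_event_header *hdr =
      ( struct perf_event_header * ) &data[tail & pe->mask];
    handle( hdr );
    tail += hdr->size;
  }

  pe->tail = tail;
  mmap_write_tail( pe, tail );              /* let the kernel reuse space  */
}
#endif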
01700 
01701 
01702 /* Does the kernel define these somewhere? */
01703 struct ip_event {
01704   struct perf_event_header header;
01705   uint64_t ip;
01706 };
01707 struct lost_event {
01708   struct perf_event_header header;
01709   uint64_t id;
01710   uint64_t lost;
01711 };
01712 typedef union event_union {
01713   struct perf_event_header header;
01714   struct ip_event ip;
01715   struct lost_event lost;
01716 } perf_sample_event_t;
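/* These mirror the PERF_RECORD_SAMPLE payload (with sample_type set to  */
/* PERF_SAMPLE_IP) and the PERF_RECORD_LOST payload as described in the  */
/* kernel's perf_event.h UAPI header; only perf_event_header itself is   */
/* exported as a struct, so the payload structs are declared locally.    */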
01717 
01718 /* Walk the ring buffer from our saved tail to the kernel's data_head, */
01719 /* copying wrapped records and dispatching samples to the profiler.    */
01720 static void
01721 mmap_read( int cidx, ThreadInfo_t **thr, pe_event_info_t *pe, 
01722            int profile_index )
01723 {
01724   uint64_t head = mmap_read_head( pe );
01725   uint64_t old = pe->tail;
01726   unsigned char *data = ((unsigned char*)pe->mmap_buf) + getpagesize(  );
01727   int diff;
01728 
01729   diff = head - old;
01730   if ( diff < 0 ) {
01731     SUBDBG( "WARNING: failed to keep up with mmap data. head = %" PRIu64
01732         ",  tail = %" PRIu64 ". Discarding samples.\n", head, old );
01733     /* head points to a known good entry, start there. */
01734     old = head;
01735   }
01736 
01737   for( ; old != head; ) {
01738     perf_sample_event_t *event = ( perf_sample_event_t * ) 
01739       & data[old & pe->mask];
01740     perf_sample_event_t event_copy;
01741     size_t size = event->header.size;
01742 
01743     /* Event straddles the mmap boundary -- header should always */
01744     /* be inside due to u64 alignment of output.                 */
01745     if ( ( old & pe->mask ) + size != ( ( old + size ) & pe->mask ) ) {
01746       uint64_t offset = old;
01747       uint64_t len = min( sizeof ( *event ), size ), cpy;
01748       void *dst = &event_copy;
01749 
01750       do {
01751     cpy = min( pe->mask + 1 - ( offset & pe->mask ), len );
01752     memcpy( dst, &data[offset & pe->mask], cpy );
01753     offset += cpy;
01754     dst = ((unsigned char*)dst) + cpy;
01755     len -= cpy;
01756       } while ( len );
01757 
01758       event = &event_copy;
01759     }
01760     old += size;
01761 
01762     SUBDBG( "event->type = %08x\n", event->header.type );
01763     SUBDBG( "event->size = %d\n", event->header.size );
01764 
01765     switch ( event->header.type ) {
01766     case PERF_RECORD_SAMPLE:
01767       _papi_hwi_dispatch_profile( ( *thr )->running_eventset[cidx],
01768                   ( caddr_t ) ( unsigned long ) event->ip.ip, 
01769                   0, profile_index );
01770       break;
01771 
01772     case PERF_RECORD_LOST:
01773       SUBDBG( "Warning: because of a mmap buffer overrun, %" PRId64
01774                       " events were lost.\n"
01775                       "Loss was recorded when counter id 0x%"PRIx64 
01776           " overflowed.\n", event->lost.lost, event->lost.id );
01777       break;
01778 
01779     default:
01780       SUBDBG( "Error: unexpected header type - %d\n",
01781           event->header.type );
01782       break;
01783     }
01784   }
01785 
01786   pe->tail = old;
01787   mmap_write_tail( pe, old );
01788 }
01789 
01790 /* Find a native event specified by a profile index */
01791 static int
01792 find_profile_index( EventSetInfo_t *ESI, int evt_idx, int *flags,
01793                     unsigned int *native_index, int *profile_index )
01794 {
01795   int pos, esi_index, count;
01796 
01797   for ( count = 0; count < ESI->profile.event_counter; count++ ) {
01798     esi_index = ESI->profile.EventIndex[count];
01799     pos = ESI->EventInfoArray[esi_index].pos[0];
01800                 
01801     if ( pos == evt_idx ) {
01802       *profile_index = count;
01803           *native_index = ESI->NativeInfoArray[pos].ni_event & 
01804         PAPI_NATIVE_AND_MASK;
01805           *flags = ESI->profile.flags;
01806           SUBDBG( "Native event %d is at profile index %d, flags %d\n",
01807                   *native_index, *profile_index, *flags );
01808           return PAPI_OK;
01809     }
01810   }
01811   PAPIERROR( "wrong count: %d vs. ESI->profile.event_counter %d", count,
01812          ESI->profile.event_counter );
01813   return PAPI_EBUG;
01814 }
01815 
01816 
01817 
01818 /* Find the profile index for this event and drain its mmap sample buffer */
01819 static int
01820 process_smpl_buf( int evt_idx, ThreadInfo_t **thr, int cidx )
01821 {
01822   int ret, flags, profile_index;
01823   unsigned native_index;
01824   pe_control_t *ctl;
01825 
01826   ret = find_profile_index( ( *thr )->running_eventset[cidx], evt_idx, 
01827                 &flags, &native_index, &profile_index );
01828   if ( ret != PAPI_OK ) {
01829     return ret;
01830   }
01831 
01832   ctl= (*thr)->running_eventset[cidx]->ctl_state;
01833 
01834   mmap_read( cidx, thr, 
01835          &(ctl->events[evt_idx]),
01836          profile_index );
01837 
01838   return PAPI_OK;
01839 }
01840 
01841 /*
01842  * This function is used when hardware overflows are working or when
01843  * software overflows are forced
01844  */
01845 
01846 void
01847 _pe_dispatch_timer( int n, hwd_siginfo_t *info, void *uc)
01848 {
01849   ( void ) n;                           /*unused */
01850   _papi_hwi_context_t hw_context;
01851   int found_evt_idx = -1, fd = info->si_fd;
01852   caddr_t address;
01853   ThreadInfo_t *thread = _papi_hwi_lookup_thread( 0 );
01854   int i;
01855   pe_control_t *ctl;
01856   int cidx = _perf_event_vector.cmp_info.CmpIdx;
01857 
01858   if ( thread == NULL ) {
01859     PAPIERROR( "thread == NULL in _papi_pe_dispatch_timer for fd %d!", fd );
01860     return;
01861   }
01862 
01863   if ( thread->running_eventset[cidx] == NULL ) {
01864     PAPIERROR( "thread->running_eventset == NULL in "
01865            "_papi_pe_dispatch_timer for fd %d!",fd );
01866     return;
01867   }
01868 
01869   if ( thread->running_eventset[cidx]->overflow.flags == 0 ) {
01870     PAPIERROR( "thread->running_eventset->overflow.flags == 0 in "
01871            "_papi_pe_dispatch_timer for fd %d!", fd );
01872     return;
01873   }
01874 
01875   hw_context.si = info;
01876   hw_context.ucontext = ( hwd_ucontext_t * ) uc;
01877 
01878   if ( thread->running_eventset[cidx]->overflow.flags & 
01879        PAPI_OVERFLOW_FORCE_SW ) {
01880     address = GET_OVERFLOW_ADDRESS( hw_context );
01881     _papi_hwi_dispatch_overflow_signal( ( void * ) &hw_context, 
01882                     address, NULL, 0,
01883                     0, &thread, cidx );
01884     return;
01885   }
01886 
01887   if ( thread->running_eventset[cidx]->overflow.flags !=
01888        PAPI_OVERFLOW_HARDWARE ) {
01889     PAPIERROR( "thread->running_eventset->overflow.flags is set to "
01890                  "something other than PAPI_OVERFLOW_HARDWARE or "
01891            "PAPI_OVERFLOW_FORCE_SW for fd %d (%#x)",
01892            fd , thread->running_eventset[cidx]->overflow.flags);
01893   }
01894 
01895   /* convoluted way to get ctl */
01896   ctl= thread->running_eventset[cidx]->ctl_state;
01897 
01898   /* See if the fd is one that's part of this thread's context */
01899   for( i=0; i < ctl->num_events; i++ ) {
01900     if ( fd == ctl->events[i].event_fd ) {
01901       found_evt_idx = i;
01902       break;
01903     }
01904   }
01905 
01906   if ( found_evt_idx == -1 ) {
01907     PAPIERROR( "Unable to find fd %d among the open event fds in "
01908            "_papi_pe_dispatch_timer!", fd );
01909     return;
01910   }
01911         
01912   if (ioctl( fd, PERF_EVENT_IOC_DISABLE, NULL ) == -1 ) {
01913       PAPIERROR("ioctl(PERF_EVENT_IOC_DISABLE) failed.\n");
01914   }
01915 
01916   if ( ( thread->running_eventset[cidx]->state & PAPI_PROFILING ) && 
01917        !( thread->running_eventset[cidx]->profile.flags & 
01918       PAPI_PROFIL_FORCE_SW ) ) {
01919     process_smpl_buf( found_evt_idx, &thread, cidx );
01920   }
01921   else {
01922     uint64_t ip;
01923     uint64_t head;
01924     pe_event_info_t *pe = &(ctl->events[found_evt_idx]);
01925     unsigned char *data = ((unsigned char*)pe->mmap_buf) + getpagesize(  );
01926 
01927     /*
01928      * Read up the most recent IP from the sample in the mmap buffer.  To
01929      * do this, we make the assumption that all of the records in the
01930      * mmap buffer are the same size, and that they all contain the IP as
01931      * their only record element.  This means that we can use the
01932      * data_head element from the user page and move backward one record
01933      * from that point and read the data.  Since we don't actually need
01934      * to access the header of the record, we can just subtract 8 (size
01935      * of the IP) from data_head and read up that word from the mmap
01936      * buffer.  After we subtract 8, we account for mmap buffer wrapping
01937      * by AND'ing this offset with the buffer mask.
01938      */
01939     head = mmap_read_head( pe );
01940 
01941     if ( head == 0 ) {
01942       PAPIERROR( "Attempting to access memory which may be inaccessable" );
01943       return;
01944     }
01945     ip = *( uint64_t * ) ( data + ( ( head - 8 ) & pe->mask ) );
01946     /*
01947      * Update the tail to the current head pointer. 
01948      *
01949      * Note: that if we were to read the record at the tail pointer,
01950      * rather than the one at the head (as you might otherwise think
01951      * would be natural), we could run into problems.  Signals don't
01952      * stack well on Linux, particularly if not using RT signals, and if
01953      * they come in rapidly enough, we can lose some.  Over time, the head
01954      * could catch up to the tail and monitoring would be stopped, and
01955      * since no more signals are coming in, this problem will never be
01956      * resolved, resulting in a complete loss of overflow notification
01957      * from that point on.  So the solution we use here will result in
01958      * only the most recent IP value being read every time there are two
01959      * or more samples in the buffer (for that one overflow signal).  But
01960      * the handler will always bring up the tail, so the head should
01961      * never run into the tail.
01962      */
01963     mmap_write_tail( pe, head );
01964 
01965     /*
01966      * The fourth parameter is supposed to be a vector of bits indicating
01967      * the overflowed hardware counters, but it's not really clear that
01968      * it's useful, because the actual hardware counters used are not
01969      * exposed to the PAPI user.  For now, I'm just going to set the bit
01970      * that indicates which event register in the array overflowed.  The
01971      * result is that the overflow vector will not be identical to the
01972      * perfmon implementation, and part of that is due to the fact that
01973      * which hardware register is actually being used is opaque at the
01974      * user level (the kernel event dispatcher hides that info).
01975      */
01976 
01977     _papi_hwi_dispatch_overflow_signal( ( void * ) &hw_context,
01978                     ( caddr_t ) ( unsigned long ) ip,
01979                     NULL, ( 1 << found_evt_idx ), 0,
01980                     &thread, cidx );
01981 
01982   }
01983 
01984   /* Restart the counters */
01985   if (ioctl( fd, PERF_EVENT_IOC_REFRESH, PAPI_REFRESH_VALUE ) == -1) {
01986     PAPIERROR( "overflow refresh failed", 0 );
01987   }
01988 }
01989 
01990 /* Stop profiling */
01991 int
01992 _pe_stop_profiling( ThreadInfo_t *thread, EventSetInfo_t *ESI )
01993 {
01994   int i, ret = PAPI_OK;
01995   pe_control_t *ctl;
01996   int cidx;
01997 
01998   ctl=ESI->ctl_state;
01999 
02000   cidx=ctl->cidx;
02001 
02002   /* Loop through all of the events and process those which have mmap */
02003   /* buffers attached.                                                */
02004   for ( i = 0; i < ctl->num_events; i++ ) {
02005     /* Use the mmap_buf field as an indicator of this fd being used for */
02006     /* profiling.                                                       */
02007     if ( ctl->events[i].mmap_buf ) {
02008       /* Process any remaining samples in the sample buffer */
02009       ret = process_smpl_buf( i, &thread, cidx );
02010       if ( ret ) {
02011     PAPIERROR( "process_smpl_buf returned error %d", ret );
02012     return ret;
02013       }
02014     }
02015   }
02016   return ret;
02017 }
02018 
02019 /* Setup an event to cause overflow */
02020 int
02021 _pe_set_overflow( EventSetInfo_t *ESI, int EventIndex, int threshold )
02022 {
02023 
02024   pe_context_t *ctx;
02025   pe_control_t *ctl = (pe_control_t *) ( ESI->ctl_state );
02026   int i, evt_idx, found_non_zero_sample_period = 0, retval = PAPI_OK;
02027   int cidx;
02028 
02029   cidx = ctl->cidx;
02030   ctx = ( pe_context_t *) ( ESI->master->context[cidx] );
02031 
02032   evt_idx = ESI->EventInfoArray[EventIndex].pos[0];
02033 
02034   SUBDBG("Attempting to set overflow for index %d (%d) of EventSet %d\n",
02035      evt_idx,EventIndex,ESI->EventSetIndex);
02036 
02037   if (evt_idx<0) {
02038     return PAPI_EINVAL;
02039   }
02040 
02041   if ( threshold == 0 ) {
02042     /* If this counter isn't set to overflow, it's an error */
02043     if ( ctl->events[evt_idx].attr.sample_period == 0 ) return PAPI_EINVAL;
02044   }
02045 
02046   ctl->events[evt_idx].attr.sample_period = threshold;
02047 
02048   /*
02049    * Note that the wakeup_mode field initially will be set to zero
02050    * (WAKEUP_MODE_COUNTER_OVERFLOW) as a result of a call to memset 0 to
02051    * all of the events in the ctl struct.
02052    *
02053    * Is it even set to any other value elsewhere?
02054    */
02055   switch ( ctl->events[evt_idx].wakeup_mode ) {
02056   case WAKEUP_MODE_PROFILING:
02057     /* Setting wakeup_events to special value zero means issue a */
02058     /* wakeup (signal) on every mmap page overflow.              */
02059     ctl->events[evt_idx].attr.wakeup_events = 0;
02060     break;
02061 
02062   case WAKEUP_MODE_COUNTER_OVERFLOW:
02063     /* Can this code ever be called? */
02064 
02065     /* Setting wakeup_events to one means issue a wakeup on every */
02066     /* counter overflow (not mmap page overflow).                 */
02067     ctl->events[evt_idx].attr.wakeup_events = 1;
02068     /* We need the IP to pass to the overflow handler */
02069     ctl->events[evt_idx].attr.sample_type = PERF_SAMPLE_IP;
02070     /* one for the user page, and two to take IP samples */
02071     ctl->events[evt_idx].nr_mmap_pages = 1 + 2;
02072     break;
02073   default:
02074     PAPIERROR( "ctl->wakeup_mode[%d] set to an unknown value - %u",
02075            evt_idx, ctl->events[evt_idx].wakeup_mode);
02076     return PAPI_EBUG;
02077   }
02078 
02079   /* Check for non-zero sample period */
02080   for ( i = 0; i < ctl->num_events; i++ ) {
02081     if ( ctl->events[i].attr.sample_period ) {
02082       found_non_zero_sample_period = 1;
02083       break;
02084     }
02085   }
02086 
02087   if ( found_non_zero_sample_period ) {
02088     /* turn on internal overflow flag for this event set */
02089     ctl->overflow = 1;
02090                 
02091     /* Enable the signal handler */
02092     retval = _papi_hwi_start_signal( 
02093                     ctl->overflow_signal, 
02094                     1, ctl->cidx );
02095   } else {
02096     /* turn off internal overflow flag for this event set */
02097     ctl->overflow = 0;
02098                 
02099     /* Remove the signal handler, if there are no remaining non-zero */
02100     /* sample_periods set                                            */
02101     retval = _papi_hwi_stop_signal(ctl->overflow_signal);
02102     if ( retval != PAPI_OK ) return retval;
02103   }
02104 
02105   retval = _pe_update_control_state( ctl, NULL,
02106                      ( (pe_control_t *) (ESI->ctl_state) )->num_events,
02107                      ctx );
02108 
02109   return retval;
02110 }
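
/* For reference, this entry point is reached through the public           */
/* PAPI_overflow() call.  A minimal (hypothetical) caller sketch -- the    */
/* event choice, threshold and handler body are illustrative only:         */
#if 0
#include <stdio.h>
#include "papi.h"

static void
example_overflow_handler( int EventSet, void *address,
                          long long overflow_vector, void *context )
{
  ( void ) EventSet; ( void ) overflow_vector; ( void ) context;
  printf( "overflow at ip %p\n", address );
}

static void
example_enable_overflow( void )
{
  int EventSet = PAPI_NULL;

  PAPI_library_init( PAPI_VER_CURRENT );
  PAPI_create_eventset( &EventSet );
  PAPI_add_event( EventSet, PAPI_TOT_INS );
  /* signal after every 1000000 instructions; lands in _pe_set_overflow() */
  PAPI_overflow( EventSet, PAPI_TOT_INS, 1000000, 0, example_overflow_handler );
  PAPI_start( EventSet );
}
#endif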
02111 
02112 /* Enable profiling */
02113 int
02114 _pe_set_profile( EventSetInfo_t *ESI, int EventIndex, int threshold )
02115 {
02116   int ret;
02117   int evt_idx;
02118   pe_control_t *ctl = ( pe_control_t *) ( ESI->ctl_state );
02119 
02120   /* Since you can't profile on a derived event, the event is always the */
02121   /* first and only event in the native event list.                      */
02122   evt_idx = ESI->EventInfoArray[EventIndex].pos[0];
02123 
02124   if ( threshold == 0 ) {
02125     SUBDBG( "MUNMAP(%p,%"PRIu64")\n", ctl->events[evt_idx].mmap_buf,
02126         ( uint64_t ) ctl->events[evt_idx].nr_mmap_pages *
02127         getpagesize(  ) );
02128 
02129     if ( ctl->events[evt_idx].mmap_buf ) {
02130       munmap( ctl->events[evt_idx].mmap_buf,
02131           ctl->events[evt_idx].nr_mmap_pages * getpagesize() );
02132     }
02133     ctl->events[evt_idx].mmap_buf = NULL;
02134     ctl->events[evt_idx].nr_mmap_pages = 0;
02135     ctl->events[evt_idx].attr.sample_type &= ~PERF_SAMPLE_IP;
02136     ret = _pe_set_overflow( ESI, EventIndex, threshold );
02137     /* ??? #warning "This should be handled somewhere else" */
02138     ESI->state &= ~( PAPI_OVERFLOWING );
02139     ESI->overflow.flags &= ~( PAPI_OVERFLOW_HARDWARE );
02140 
02141     return ret;
02142   }
02143 
02144   /* Look up the native event code */
02145   if ( ESI->profile.flags & (PAPI_PROFIL_DATA_EAR | PAPI_PROFIL_INST_EAR)) {
02146     /* Not supported yet... */
02147 
02148     return PAPI_ENOSUPP;
02149   }
02150   if ( ESI->profile.flags & PAPI_PROFIL_RANDOM ) {
02151     /* This requires an ability to randomly alter the sample_period within */
02152     /* a given range.  Kernel does not have this ability. FIXME            */
02153     return PAPI_ENOSUPP;
02154   }
02155 
02156   /* Just a guess at how many pages would make this relatively efficient.  */
02157   /* Note that it's "1 +" because of the need for a control page, and the  */
02158   /* number following the "+" must be a power of 2 (1, 2, 4, 8, 16, etc.) or */
02159   /* zero.  This is required to optimize dealing with circular buffer      */
02160   /* wrapping of the mapped pages.                                         */
02161 
02162   ctl->events[evt_idx].nr_mmap_pages = (1+8);
02163   ctl->events[evt_idx].attr.sample_type |= PERF_SAMPLE_IP;
02164 
02165   ret = _pe_set_overflow( ESI, EventIndex, threshold );
02166   if ( ret != PAPI_OK ) return ret;
02167 
02168   return PAPI_OK;
02169 }
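
/* For reference, profiling is normally enabled from user code with        */
/* PAPI_profil(), which reaches this function.  A minimal (hypothetical)   */
/* sketch; the buffer size, scale and threshold are illustrative values:   */
#if 0
#include "papi.h"

static unsigned short example_profbuf[64*1024];

static void
example_enable_profile( int EventSet )
{
  /* histogram instruction pointers for PAPI_TOT_CYC; deliver a sample     */
  /* every 100000 cycles, mapping addresses into example_profbuf           */
  PAPI_profil( example_profbuf, sizeof( example_profbuf ), 0, 65536,
               EventSet, PAPI_TOT_CYC, 100000, PAPI_PROFIL_POSIX );
}
#endif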
02170 
02171 
02172 /* Our component vector */
02173 
02174 papi_vector_t _perf_event_vector = {
02175    .cmp_info = {
02176        /* component information (unspecified values initialized to 0) */
02177       .name = "perf_event",
02178       .short_name = "perf",
02179       .version = "5.0",
02180       .description = "Linux perf_event CPU counters",
02181   
02182       .default_domain = PAPI_DOM_USER,
02183       .available_domains = PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR,
02184       .default_granularity = PAPI_GRN_THR,
02185       .available_granularities = PAPI_GRN_THR | PAPI_GRN_SYS,
02186 
02187       .hardware_intr = 1,
02188       .kernel_profile = 1,
02189 
02190       /* component specific cmp_info initializations */
02191       .fast_virtual_timer = 0,
02192       .attach = 1,
02193       .attach_must_ptrace = 1,
02194       .cpu = 1,
02195       .inherit = 1,
02196       .cntr_umasks = 1,
02197 
02198   },
02199 
02200   /* sizes of framework-opaque component-private structures */
02201   .size = {
02202       .context = sizeof ( pe_context_t ),
02203       .control_state = sizeof ( pe_control_t ),
02204       .reg_value = sizeof ( int ),
02205       .reg_alloc = sizeof ( int ),
02206   },
02207 
02208   /* function pointers in this component */
02209   .init_component =        _pe_init_component,
02210   .shutdown_component =    _pe_shutdown_component,
02211   .init_thread =           _pe_init_thread,
02212   .init_control_state =    _pe_init_control_state,
02213   .dispatch_timer =        _pe_dispatch_timer,
02214 
02215   /* function pointers from the shared perf_event lib */
02216   .start =                 _pe_start,
02217   .stop =                  _pe_stop,
02218   .read =                  _pe_read,
02219   .shutdown_thread =       _pe_shutdown_thread,
02220   .ctl =                   _pe_ctl,
02221   .update_control_state =  _pe_update_control_state,
02222   .set_domain =            _pe_set_domain,
02223   .reset =                 _pe_reset,
02224   .set_overflow =          _pe_set_overflow,
02225   .set_profile =           _pe_set_profile,
02226   .stop_profiling =        _pe_stop_profiling,
02227   .write =                 _pe_write,
02228 
02229 
02230   /* from counter name mapper */
02231   .ntv_enum_events =   _pe_ntv_enum_events,
02232   .ntv_name_to_code =  _pe_ntv_name_to_code,
02233   .ntv_code_to_name =  _pe_ntv_code_to_name,
02234   .ntv_code_to_descr = _pe_ntv_code_to_descr,
02235   .ntv_code_to_info =  _pe_ntv_code_to_info,
02236 };