PAPI  5.1.0.2
perf_events.c
Go to the documentation of this file.
00001 /*
00002 * File:    perf_events.c
00003 *
00004 * Author:  Corey Ashford
00005 *          cjashfor@us.ibm.com
00006 *          - based upon perfmon.c written by -
00007 *          Philip Mucci
00008 *          mucci@cs.utk.edu
00009 * Mods:    Gary Mohr
00010 *          gary.mohr@bull.com
00011 * Mods:    Vince Weaver
00012 *          vweaver1@eecs.utk.edu
00013 * Mods:    Philip Mucci
00014 *      mucci@eecs.utk.edu */
00015 
00016 
00017 #include <fcntl.h>
00018 #include <string.h>
00019 #include <errno.h>
00020 #include <signal.h>
00021 #include <syscall.h>
00022 #include <sys/utsname.h>
00023 #include <sys/mman.h>
00024 #include <sys/ioctl.h>
00025 
00026 /* PAPI-specific includes */
00027 #include "papi.h"
00028 #include "papi_memory.h"
00029 #include "papi_internal.h"
00030 #include "papi_vector.h"
00031 #include "extras.h"
00032 
00033 /* libpfm4 includes */
00034 #include "papi_libpfm4_events.h"
00035 #include "perfmon/pfmlib.h"
00036 #include PEINCLUDE
00037 
00038 /* Linux-specific includes */
00039 #include "mb.h"
00040 #include "syscalls.h"
00041 #include "linux-memory.h"
00042 #include "linux-timer.h"
00043 #include "linux-common.h"
00044 #include "linux-context.h"
00045 
/* Various definitions */

/* This is arbitrary.  Typically you can add up to ~1000 before */
/* you run out of fds                                           */
/* Sizes the events[] and counts[] arrays in pe_control_t.      */
#define PERF_EVENT_MAX_MPX_COUNTERS 64
00051 
00052 /* We really don't need fancy definitions for these */
00053 
/* Per-event state: one of these for each perf_event fd we open */
typedef struct
{
  int group_leader_fd;            /* fd of group leader; -1 if this event is itself a leader */
  int event_fd;                   /* fd of event                          */
  int event_opened;               /* event successfully opened            */
  uint32_t nr_mmap_pages;         /* number pages in the mmap buffer      */
  void *mmap_buf;                 /* used for control/profiling; NULL when not sampling */
  uint64_t tail;                  /* current read location in mmap buffer */
  uint64_t mask;                  /* mask used for wrapping the pages     */
  struct perf_event_attr attr;    /* perf_event config structure          */
  unsigned int wakeup_mode;       /* wakeup mode when sampling            */
} pe_event_info_t;
00066 
/* Control state for one event set: the options plus the array */
/* of per-event records.                                       */
typedef struct
{
  int num_events;                 /* number of events in control state */
  unsigned int domain;            /* control-state wide domain         */
  unsigned int granularity;       /* granularity                       */
  unsigned int multiplexed;       /* multiplexing enable               */
  unsigned int overflow;          /* overflow enable                   */
  unsigned int inherit;           /* inherit enable                    */
  int cpu;                        /* which cpu to measure              */
  pid_t tid;                      /* thread we are monitoring          */
  pe_event_info_t events[PERF_EVENT_MAX_MPX_COUNTERS];
  /* per-event count storage (filled in by the read path, not shown in this chunk) */
  long long counts[PERF_EVENT_MAX_MPX_COUNTERS];
} pe_control_t;
00080 
/* Per-thread component context */
typedef struct
{
  int initialized;                /* are we initialized?           */
  int state;                      /* bitmask of PERF_EVENTS_OPENED / PERF_EVENTS_RUNNING */
} pe_context_t;
00086 
/* These sentinels tell papi_pe_set_overflow() how to set the */
/* wakeup_events field in the event descriptor record.        */

#define WAKEUP_COUNTER_OVERFLOW 0
#define WAKEUP_PROFILING -1

/* Values recorded in pe_event_info_t.wakeup_mode */
#define WAKEUP_MODE_COUNTER_OVERFLOW 0
#define WAKEUP_MODE_PROFILING 1

/* Defines for ctx->state */
#define PERF_EVENTS_OPENED  0x01
#define PERF_EVENTS_RUNNING 0x02
00099 
/* Static globals */
/* Nonzero if the NMI watchdog has claimed a counter; checked by */
/* bug_check_scheduability().  Set during init (not shown here). */
static int nmi_watchdog_active;

/* Advance declaration */
papi_vector_t _papi_pe_vector;
00105 
00106 
00107 /******** Kernel Version Dependent Routines  **********************/
00108 
00109 /* KERNEL_CHECKS_SCHEDUABILITY_UPON_OPEN is a work-around for kernel arch
00110  * implementations (e.g. x86) which don't do a static event scheduability
00111  * check in sys_perf_event_open.  
00112  * This was fixed for x86 in the 2.6.33 kernel
00113  *
00114  * Also! Kernels newer than 2.6.34 will fail in a similar way
00115  *       if the nmi_watchdog has stolen a performance counter
00116  *       and we try to use the maximum number of counters.
00117  *       A sys_perf_event_open() will seem to succeed but will fail
00118  *       at read time.  So re-use this work around code.
00119  */
/* Returns 1 if we must manually verify scheduability after a  */
/* sys_perf_event_open() (see comment above), 0 otherwise.     */
static int 
bug_check_scheduability(void) {

#if defined(__powerpc__)
  /* PowerPC not affected by this bug */
#elif defined(__mips__)
  /* MIPS as of kernel 3.1 does not properly detect schedulability */
  return 1;
#else
  /* Other arches (e.g. x86): fixed in the 2.6.33 kernel */
  if (_papi_os_info.os_version < LINUX_VERSION(2,6,33)) return 1;
#endif

  /* If the NMI watchdog has stolen a counter, opens can appear */
  /* to succeed yet fail at read time, so force the workaround. */
  if (nmi_watchdog_active) return 1;

  return 0;
}
00136 
00137 /* PERF_FORMAT_GROUP allows reading an entire group's counts at once   */
00138 /* before 2.6.34 PERF_FORMAT_GROUP did not work when reading results   */
00139 /*  from attached processes.  We are lazy and disable it for all cases */
00140 /*  commit was:  050735b08ca8a016bbace4445fa025b88fee770b              */
00141 
00142 static int 
00143 bug_format_group(void) {
00144 
00145   if (_papi_os_info.os_version < LINUX_VERSION(2,6,34)) return 1;
00146 
00147   /* MIPS, as of version 3.1, does not support this properly */
00148 
00149 #if defined(__mips__)
00150   return 1;
00151 #endif
00152 
00153   return 0;
00154 
00155 }
00156 
00157 
00158 /* There's a bug prior to Linux 2.6.33 where if you are using */
00159 /* PERF_FORMAT_GROUP, the TOTAL_TIME_ENABLED and              */
00160 /* TOTAL_TIME_RUNNING fields will be zero unless you disable  */
00161 /* the counters first                                         */
00162 static int 
00163 bug_sync_read(void) {
00164 
00165   if (_papi_os_info.os_version < LINUX_VERSION(2,6,33)) return 1;
00166 
00167   return 0;
00168 
00169 }
00170 
00171 
00172 /* Set the F_SETOWN_EX flag on the fd.                          */
00173 /* This affects which thread an overflow signal gets sent to    */
00174 /* Handled in a subroutine to handle the fact that the behavior */
00175 /* is dependent on kernel version.                              */
00176 static int 
00177 fcntl_setown_fd(int fd) {
00178 
00179    int ret;
00180    struct f_owner_ex fown_ex;
00181 
00182       /* F_SETOWN_EX is not available until 2.6.32 */
00183    if (_papi_os_info.os_version < LINUX_VERSION(2,6,32)) {
00184        
00185       /* get ownership of the descriptor */
00186       ret = fcntl( fd, F_SETOWN, mygettid(  ) );
00187       if ( ret == -1 ) {
00188      PAPIERROR( "cannot fcntl(F_SETOWN) on %d: %s", fd, strerror(errno) );
00189      return PAPI_ESYS;
00190       }
00191    }
00192    else {
00193       /* set ownership of the descriptor */   
00194       fown_ex.type = F_OWNER_TID;
00195       fown_ex.pid  = mygettid();
00196       ret = fcntl(fd, F_SETOWN_EX, (unsigned long)&fown_ex );
00197    
00198       if ( ret == -1 ) {
00199      PAPIERROR( "cannot fcntl(F_SETOWN_EX) on %d: %s", 
00200             fd, strerror( errno ) );
00201      return PAPI_ESYS;
00202       }
00203    }
00204    return PAPI_OK;
00205 }
00206 
00207 /* Check for processor support */
00208 /* Can be used for generic checking, though in general we only     */
00209 /* check for pentium4 here because support was broken for multiple */
00210 /* kernel releases and the usual standard detections did not       */
00211 /* handle this.  So we check for pentium 4 explicitly.             */
00212 static int 
00213 processor_supported(int vendor, int family) {
00214 
00215    /* Error out if kernel too early to support p4 */
00216    if (( vendor == PAPI_VENDOR_INTEL ) && (family == 15)) {   
00217       if (_papi_os_info.os_version < LINUX_VERSION(2,6,35)) {
00218      PAPIERROR("Pentium 4 not supported on kernels before 2.6.35");
00219      return PAPI_ENOSUPP;
00220       }
00221    }
00222    return PAPI_OK;
00223 }
00224 
00225 
00226 /* The read format on perf_event varies based on various flags that */
00227 /* are passed into it.  This helper avoids copying this logic       */
00228 /* multiple places.                                                 */
00229 static unsigned int
00230 get_read_format( unsigned int multiplex, 
00231          unsigned int inherit, 
00232          int format_group )
00233 {
00234    unsigned int format = 0;
00235 
00236    /* if we need read format options for multiplexing, add them now */
00237    if (multiplex) {
00238       format |= PERF_FORMAT_TOTAL_TIME_ENABLED;
00239       format |= PERF_FORMAT_TOTAL_TIME_RUNNING;
00240    }
00241 
00242    /* if our kernel supports it and we are not using inherit, */
00243    /* add the group read options                              */
00244    if ( (!bug_format_group()) && !inherit) {
00245       if (format_group) {
00246      format |= PERF_FORMAT_GROUP;
00247       }
00248    }
00249 
00250    SUBDBG("multiplex: %d, inherit: %d, group_leader: %d, format: 0x%x\n",
00251       multiplex, inherit, format_group, format);
00252 
00253    return format;
00254 }
00255 
/* The kernel developers say to never use a refresh value of 0        */
/* See https://lkml.org/lkml/2011/5/24/172                            */
/* However, on some platforms (like Power) a value of 1 does not work */
/* We're still tracking down why this happens.                        */
/* NOTE(review): presumably passed as the PERF_EVENT_IOC_REFRESH      */
/* count when re-arming overflow events -- confirm at the use site.   */

#if defined(__powerpc__)
#define PAPI_REFRESH_VALUE 0
#else
#define PAPI_REFRESH_VALUE 1
#endif
00266 
00267 /********* End Kernel-version Dependent Routines  ****************/
00268 
00269 
/*  Check if the current set of options is supported by  */
/*  perf_events.                                         */
/*  We do this by temporarily opening an event with the  */
/*  desired options then closing it again.  We use the   */
/*  PERF_COUNT_HW_INSTRUCTION event as a dummy event     */
/*  on the assumption it is available on all             */
/*  platforms.                                           */
00277 
/* Returns PAPI_OK if the options are permitted, PAPI_EPERM if the */
/* kernel refuses the probe open.                                  */
static int
check_permissions( unsigned long tid, 
           unsigned int cpu_num, 
           unsigned int domain, 
           unsigned int granularity,
           unsigned int multiplex, 
           unsigned int inherit )
{
   int ev_fd;
   struct perf_event_attr attr;

   long pid;

   /* clearing this will set a type of hardware and to count all domains */
   /* (PERF_TYPE_HARDWARE is 0 and the exclude_* bits start cleared)     */
   memset(&attr, '\0', sizeof(attr));
   attr.read_format = get_read_format(multiplex, inherit, 1);

   /* set the event id (config field) to instructions */
   /* (an event that should always exist)             */
   /* This was cycles but that is missing on Niagara  */
   attr.config = PERF_COUNT_HW_INSTRUCTIONS;
    
   /* now set up domains this event set will be counting */
   if (!(domain & PAPI_DOM_SUPERVISOR)) {
      attr.exclude_hv = 1;
   }
   if (!(domain & PAPI_DOM_USER)) {
      attr.exclude_user = 1;
   }
   if (!(domain & PAPI_DOM_KERNEL)) {
      attr.exclude_kernel = 1;
   }

   /* pid -1 requests a system-wide (all processes) measurement */
   if (granularity==PAPI_GRN_SYS) {
      pid = -1;
   } else {
      pid = tid;
   }

   SUBDBG("Calling sys_perf_event_open() from check_permissions\n");

   ev_fd = sys_perf_event_open( &attr, pid, cpu_num, -1, 0 );
   if ( ev_fd == -1 ) {
      SUBDBG("sys_perf_event_open returned error.  Linux says, %s", 
         strerror( errno ) );
      return PAPI_EPERM;
   }
    
   /* now close it, this was just to make sure we have permissions */
   /* to set these options                                         */
   close(ev_fd);
   return PAPI_OK;
}
00331 
00332 
00333 
/* Maximum size we ever expect to read from a perf_event fd   */
/*  (this is the number of 64-bit values)                     */
/* We use this to size the read buffers                       */
/* The three is for event count, time_enabled, time_running   */
/*  and the counter term is count value and count id for each */
/*  possible counter value.                                   */
/* NOTE(review): layout corresponds to a PERF_FORMAT_GROUP    */
/* read with times and IDs -- the worst case for any format.  */
#define READ_BUFFER_SIZE (3 + (2 * PERF_EVENT_MAX_MPX_COUNTERS))
00341 
00342 
00343 
00344 /* KERNEL_CHECKS_SCHEDUABILITY_UPON_OPEN is a work-around for kernel arch */
00345 /* implementations (e.g. x86 before 2.6.33) which don't do a static event */
00346 /* scheduability check in sys_perf_event_open.  It is also needed if the  */
00347 /* kernel is stealing an event, such as when NMI watchdog is enabled.     */
00348 
/* Verify that event "idx" (just opened) can actually be scheduled. */
/* Returns PAPI_OK if usable, PAPI_ECNFLCT if the kernel silently   */
/* failed to schedule it, or PAPI_ESYS on ioctl/read failure.       */
/* Side effect: resets counters 0..idx-1 back to zero.              */
static int
check_scheduability( pe_context_t *ctx, pe_control_t *ctl, int idx )
{
   int retval = 0, cnt = -1;
   ( void ) ctx;             /*unused */
   long long papi_pe_buffer[READ_BUFFER_SIZE];
   int i,group_leader_fd;

   if (bug_check_scheduability()) {

      /* If the kernel isn't tracking scheduability right       */
      /* Then we need to start/stop/read to force the event     */
      /* to be scheduled and see if an error condition happens. */

      /* get the proper fd to start */
      /* (an event with no leader enables/disables via its own fd) */
      group_leader_fd=ctl->events[idx].group_leader_fd;
      if (group_leader_fd==-1) group_leader_fd=ctl->events[idx].event_fd;

      /* start the event */
      retval = ioctl( group_leader_fd, PERF_EVENT_IOC_ENABLE, NULL );
      if (retval == -1) {
         PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) failed.\n");
         return PAPI_ESYS;
      }

      /* stop the event */
      retval = ioctl(group_leader_fd, PERF_EVENT_IOC_DISABLE, NULL );
      if (retval == -1) {
         PAPIERROR( "ioctl(PERF_EVENT_IOC_DISABLE) failed.\n" );
         return PAPI_ESYS;
      }

      /* See if a read returns any results */
      cnt = read( group_leader_fd, papi_pe_buffer, sizeof(papi_pe_buffer));
      if ( cnt == -1 ) {
         SUBDBG( "read returned an error!  Should never happen.\n" );
         return PAPI_ESYS;
      }

      if ( cnt == 0 ) {
         /* We read 0 bytes if we could not schedule the event */
         /* The kernel should have detected this at open       */
         /* but various bugs (including NMI watchdog)          */
         /* result in this behavior                            */

         return PAPI_ECNFLCT;

      } else {

         /* Reset all of the counters (opened so far) back to zero      */
         /* from the above brief enable/disable call pair.              */

         /* We have to reset all events because reset of group leader      */
         /* does not reset all.                                            */
         /* we assume that the events are being added one by one and that  */
         /* we do not need to reset higher events (doing so may reset ones */
         /* that have not been initialized yet.                            */

         /* Note... PERF_EVENT_IOC_RESET does not reset time running       */
         /* info if multiplexing, so we should avoid coming here if        */
         /* we are multiplexing the event.                                 */
         for( i = 0; i < idx; i++) {
            retval=ioctl( ctl->events[i].event_fd, PERF_EVENT_IOC_RESET, NULL );
            if (retval == -1) {
               PAPIERROR( "ioctl(PERF_EVENT_IOC_RESET) #%d/%d %d "
                  "(fd %d)failed.\n",
                  i,ctl->num_events,idx,ctl->events[i].event_fd);
               return PAPI_ESYS;
            }
         }
      }
   }
   return PAPI_OK;
}
00423 
00424 
/* Do some extra work on a perf_event fd if we're doing sampling  */
/* This mostly means setting up the mmap buffer.                  */
/* Returns PAPI_OK, or PAPI_ESYS if any fcntl/mmap call fails.    */
static int
tune_up_fd( pe_control_t *ctl, int evt_idx )
{
   int ret;
   void *buf_addr;
   int fd = ctl->events[evt_idx].event_fd;

   /* Register that we would like a SIGIO notification when a mmap'd page */
   /* becomes full.                                                       */
   ret = fcntl( fd, F_SETFL, O_ASYNC | O_NONBLOCK );
   if ( ret ) {
      PAPIERROR ( "fcntl(%d, F_SETFL, O_ASYNC | O_NONBLOCK) "
          "returned error: %s", fd, strerror( errno ) );
      return PAPI_ESYS;
   }

   /* Set the F_SETOWN_EX flag on the fd.                          */
   /* This affects which thread an overflow signal gets sent to.   */
   ret=fcntl_setown_fd(fd);
   if (ret!=PAPI_OK) return ret;
       
   /* Set FD_CLOEXEC.  Otherwise if we do an exec with an overflow */
   /* running, the overflow handler will continue into the exec()'d*/
   /* process and kill it because no signal handler is set up.     */
   ret=fcntl(fd, F_SETFD, FD_CLOEXEC);
   if (ret) {
      return PAPI_ESYS;
   }

   /* when you explicitly declare that you want a particular signal,   */
   /* even when you use the default signal, the kernel will send more  */
   /* information concerning the event to the signal handler.          */
   /*                                                                  */
   /* In particular, it will send the file descriptor from which the   */
   /* event is originating which can be quite useful when monitoring   */
   /* multiple tasks from a single thread.                             */
   ret = fcntl( fd, F_SETSIG, _papi_pe_vector.cmp_info.hardware_intr_sig );
   if ( ret == -1 ) {
      PAPIERROR( "cannot fcntl(F_SETSIG,%d) on %d: %s",
         _papi_pe_vector.cmp_info.hardware_intr_sig, fd,
         strerror( errno ) );
      return PAPI_ESYS;
   }

   /* mmap() the sample buffer */
   buf_addr = mmap( NULL, ctl->events[evt_idx].nr_mmap_pages * getpagesize(),
            PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0 );
   if ( buf_addr == MAP_FAILED ) {
      PAPIERROR( "mmap(NULL,%d,%d,%d,%d,0): %s",
         ctl->events[evt_idx].nr_mmap_pages * getpagesize(  ), 
         PROT_READ, MAP_SHARED, fd, strerror( errno ) );
      return ( PAPI_ESYS );
   }

   SUBDBG( "Sample buffer for fd %d is located at %p\n", fd, buf_addr );

   /* Set up the mmap buffer and its associated helpers */
   /* mask wraps offsets within the data pages (page 0 is the header) */
   ctl->events[evt_idx].mmap_buf = (struct perf_counter_mmap_page *) buf_addr;
   ctl->events[evt_idx].tail = 0;
   ctl->events[evt_idx].mask = ( ctl->events[evt_idx].nr_mmap_pages - 1 ) * 
                               getpagesize() - 1;

   return PAPI_OK;
}
00491 
00492 
/* Open all events in the control state */
/* On success every event fd is open, sampling fds are tuned up, and */
/* ctx->state gains PERF_EVENTS_OPENED.  On any failure all fds      */
/* opened so far are closed and a PAPI error code is returned.       */
static int
open_pe_events( pe_context_t *ctx, pe_control_t *ctl )
{

   int i, ret = PAPI_OK;
   long pid;

   /* pid -1 requests a system-wide (all processes) measurement */
   if (ctl->granularity==PAPI_GRN_SYS) {
      pid = -1;
   }
   else {
      pid = ctl->tid;
   }

   for( i = 0; i < ctl->num_events; i++ ) {

      ctl->events[i].event_opened=0;

      /* set up the attr structure.  We don't set up all fields here */
      /* as some have already been set up previously.                */

      /* group leader (event 0) is special                */
      /* If we're multiplexed, everyone is a group leader */
      if (( i == 0 ) || (ctl->multiplexed)) {
         /* leaders are pinned (unless multiplexing) and start disabled */
         ctl->events[i].attr.pinned = !ctl->multiplexed;
         ctl->events[i].attr.disabled = 1;
         ctl->events[i].group_leader_fd=-1;
         ctl->events[i].attr.read_format = get_read_format(ctl->multiplexed, 
                               ctl->inherit, 
                               !ctl->multiplexed );
      } else {
         /* followers attach to event 0's fd and enable with the leader */
         ctl->events[i].attr.pinned=0;
         ctl->events[i].attr.disabled = 0;
         ctl->events[i].group_leader_fd=ctl->events[0].event_fd;
         ctl->events[i].attr.read_format = get_read_format(ctl->multiplexed, 
                               ctl->inherit, 
                               0 );
      }


      /* try to open */
      ctl->events[i].event_fd = sys_perf_event_open( &ctl->events[i].attr, 
                             pid,
                             ctl->cpu,
                   ctl->events[i].group_leader_fd,
                             0 /* flags */
                             );
      
      if ( ctl->events[i].event_fd == -1 ) {
         SUBDBG("sys_perf_event_open returned error on event #%d."
            "  Error: %s\n",
            i, strerror( errno ) );
         if (errno == EPERM) ret = PAPI_EPERM;
         else ret = PAPI_ECNFLCT;
         goto open_pe_cleanup;
      }

      SUBDBG ("sys_perf_event_open: tid: %ld, cpu_num: %d,"
              " group_leader/fd: %d, event_fd: %d,"
              " read_format: 0x%"PRIu64"\n",
          pid, ctl->cpu, ctl->events[i].group_leader_fd, 
          ctl->events[i].event_fd, ctl->events[i].attr.read_format);


      /* in many situations the kernel will indicate we opened fine */
      /* yet things will fail later.  So we need to double check    */
      /* we actually can use the events we've set up.               */

      /* This is not necessary if we are multiplexing, and in fact */
      /* we cannot do this properly if multiplexed because         */
      /* PERF_EVENT_IOC_RESET does not reset the time running info */
      if (!ctl->multiplexed) {
         ret = check_scheduability( ctx, ctl, i );

         if ( ret != PAPI_OK ) {
            /* the last event did open, so we need to bump the counter */
            /* before doing the cleanup                                */
            i++;
                                  
            goto open_pe_cleanup;
         }
      }
      ctl->events[i].event_opened=1;
   }

   /* Now that we've successfully opened all of the events, do whatever  */
   /* "tune-up" is needed to attach the mmap'd buffers, signal handlers, */
   /* and so on.                                                         */
   for ( i = 0; i < ctl->num_events; i++ ) {

      /* If sampling is enabled, hook up signal handler */
      if ( ctl->events[i].attr.sample_period ) {
         ret = tune_up_fd( ctl, i );
         if ( ret != PAPI_OK ) {
            /* All of the fds are open, so we need to clean up all of them */
            i = ctl->num_events;
            goto open_pe_cleanup;
         }
      } else {
         /* Make sure this is NULL so close_pe_events works right */
         ctl->events[i].mmap_buf = NULL;
      }
   }

   /* Set num_evts only if completely successful */
   ctx->state |= PERF_EVENTS_OPENED;
        
   return PAPI_OK;

open_pe_cleanup:
   /* We encountered an error, close up the fds we successfully opened.  */
   /* We go backward in an attempt to close group leaders last, although */
   /* That's probably not strictly necessary.                            */
   while ( i > 0 ) {
      i--;
      if (ctl->events[i].event_fd>=0) {
         close( ctl->events[i].event_fd );
         ctl->events[i].event_opened=0;
      }
   }

   return ret;
}
00617 
/* Close all of the opened events */
/* Two passes: children (group_leader_fd != -1) first, then leaders. */
/* Unmaps any sample buffers, verifies every opened event was closed */
/* and clears PERF_EVENTS_OPENED from ctx->state.                    */
static int
close_pe_events( pe_context_t *ctx, pe_control_t *ctl )
{
   int i;
   int num_closed=0;
   int events_not_opened=0;

   /* should this be a more serious error? */
   if ( ctx->state & PERF_EVENTS_RUNNING ) {
      SUBDBG("Closing without stopping first\n");
   }

   /* Close child events first */
   for( i=0; i<ctl->num_events; i++ ) {

      if (ctl->events[i].event_opened) {

         if (ctl->events[i].group_leader_fd!=-1) {
            /* release the sample buffer, if any, before the fd */
            if ( ctl->events[i].mmap_buf ) {
               if ( munmap ( ctl->events[i].mmap_buf,
                     ctl->events[i].nr_mmap_pages * getpagesize() ) ) {
                  PAPIERROR( "munmap of fd = %d returned error: %s",
                     ctl->events[i].event_fd, strerror( errno ) );
                  return PAPI_ESYS;
               }
            }

            if ( close( ctl->events[i].event_fd ) ) {
               PAPIERROR( "close of fd = %d returned error: %s",
                   ctl->events[i].event_fd, strerror( errno ) );
               return PAPI_ESYS;
            } else {
               num_closed++;
            }
            ctl->events[i].event_opened=0;
         }
      }
      else {
         events_not_opened++;
      }
   }

   /* Close the group leaders last */
   for( i=0; i<ctl->num_events; i++ ) {

      if (ctl->events[i].event_opened) {

         if (ctl->events[i].group_leader_fd==-1) {
            if ( ctl->events[i].mmap_buf ) {
               if ( munmap ( ctl->events[i].mmap_buf,
                     ctl->events[i].nr_mmap_pages * getpagesize() ) ) {
                  PAPIERROR( "munmap of fd = %d returned error: %s",
                     ctl->events[i].event_fd, strerror( errno ) );
                  return PAPI_ESYS;
               }
            }


            if ( close( ctl->events[i].event_fd ) ) {
               PAPIERROR( "close of fd = %d returned error: %s",
                   ctl->events[i].event_fd, strerror( errno ) );
               return PAPI_ESYS;
            } else {
               num_closed++;
            }
            ctl->events[i].event_opened=0;
         }
      }
   }


   /* sanity check: everything opened should now be closed */
   if (ctl->num_events!=num_closed) {
      if (ctl->num_events!=(num_closed+events_not_opened)) {
         PAPIERROR("Didn't close all events: "
           "Closed %d Not Opened: %d Expected %d\n",
           num_closed,events_not_opened,ctl->num_events);
         return PAPI_EBUG;
      }
   }

   ctl->num_events=0;

   ctx->state &= ~PERF_EVENTS_OPENED;

   return PAPI_OK;
}
00705 
00706 /* Fix up the config based on what CPU/Vendor we are running on */
00707 static int 
00708 pe_vendor_fixups(void) 
00709 {
00710      /* powerpc */
00711      /* On IBM and Power6 Machines default domain should include supervisor */
00712   if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_IBM ) {
00713      _papi_pe_vector.cmp_info.available_domains |=
00714           PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR;
00715      if (strcmp(_papi_hwi_system_info.hw_info.model_string, "POWER6" ) == 0 ) {
00716         _papi_pe_vector.cmp_info.default_domain =
00717           PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR;
00718      }
00719   }
00720 
00721   if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_MIPS ) {
00722      _papi_pe_vector.cmp_info.available_domains |= PAPI_DOM_KERNEL;
00723   }
00724 
00725   if ((_papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_INTEL) ||
00726       (_papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_AMD)) {
00727      _papi_pe_vector.cmp_info.fast_real_timer = 1;
00728   }
00729 
00730      /* ARM */
00731   if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_ARM) {
00732      /* FIXME: this will change with Cortex A15 */
00733      _papi_pe_vector.cmp_info.available_domains |=
00734         PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR;
00735      _papi_pe_vector.cmp_info.default_domain =
00736         PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR;
00737   }
00738 
00739      /* CRAY */
00740   if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_CRAY ) {
00741     _papi_pe_vector.cmp_info.available_domains |= PAPI_DOM_OTHER;
00742   }
00743 
00744   return PAPI_OK;
00745 }
00746 
00747 
00748 /* Check the mmap page for rdpmc support */
00749 static int detect_rdpmc(void) {
00750 
00751    struct perf_event_attr pe;
00752    int fd,rdpmc_exists=1;
00753    void *addr;
00754    struct perf_event_mmap_page *our_mmap;
00755 
00756    /* Create a fake instructions event so we can read a mmap page */
00757    memset(&pe,0,sizeof(struct perf_event_attr));
00758 
00759    pe.type=PERF_TYPE_HARDWARE;
00760    pe.size=sizeof(struct perf_event_attr);
00761    pe.config=PERF_COUNT_HW_INSTRUCTIONS;
00762 
00763    fd=sys_perf_event_open(&pe,0,-1,-1,0);
00764    if (fd<0) {
00765       return PAPI_ESYS;
00766    }
00767 
00768    /* create the mmap page */
00769    addr=mmap(NULL, 4096, PROT_READ, MAP_SHARED,fd,0);
00770    if (addr == (void *)(-1)) {
00771       close(fd);
00772       return PAPI_ESYS;
00773    }
00774 
00775    /* get the rdpmc info */
00776    our_mmap=(struct perf_event_mmap_page *)addr;
00777    if (our_mmap->cap_usr_rdpmc==0) {
00778       rdpmc_exists=0;
00779    }
00780 
00781    /* close the fake event */
00782    munmap(addr,4096);
00783    close(fd);
00784 
00785    return rdpmc_exists;
00786 
00787 } 
00788 
00789 /* Find a native event specified by a profile index */
00790 static int
00791 find_profile_index( EventSetInfo_t *ESI, int evt_idx, int *flags,
00792             unsigned int *native_index, int *profile_index )
00793 {
00794    int pos, esi_index, count;
00795 
00796    for ( count = 0; count < ESI->profile.event_counter; count++ ) {
00797        esi_index = ESI->profile.EventIndex[count];
00798        pos = ESI->EventInfoArray[esi_index].pos[0];
00799         
00800        if ( pos == evt_idx ) {
00801       *profile_index = count;
00802       *native_index = ESI->NativeInfoArray[pos].ni_event & 
00803                       PAPI_NATIVE_AND_MASK;
00804       *flags = ESI->profile.flags;
00805       SUBDBG( "Native event %d is at profile index %d, flags %d\n",
00806           *native_index, *profile_index, *flags );
00807       return PAPI_OK;
00808        }
00809    }
00810 
00811    PAPIERROR( "wrong count: %d vs. ESI->profile.event_counter %d", count,
00812           ESI->profile.event_counter );
00813    return PAPI_EBUG;
00814 }
00815 
00816 
00817 /* These functions are based on builtin-record.c in the  */
00818 /* kernel's tools/perf directory.                        */
00819 
00820 static uint64_t
00821 mmap_read_head( pe_event_info_t *pe )
00822 {
00823    struct perf_event_mmap_page *pc = pe->mmap_buf;
00824    int head;
00825 
00826    if ( pc == NULL ) {
00827       PAPIERROR( "perf_event_mmap_page is NULL" );
00828       return 0;
00829    }
00830 
00831    head = pc->data_head;
00832    rmb(  );
00833 
00834    return head;
00835 }
00836 
/* Publish our consumer offset (data_tail) so the kernel may     */
/* overwrite the part of the ring buffer we have consumed.       */
static void
mmap_write_tail( pe_event_info_t *pe, uint64_t tail )
{
   struct perf_event_mmap_page *pc = pe->mmap_buf;

   /* ensure all reads are done before we write the tail out. */
   /* NOTE(review): no explicit memory barrier is issued before   */
   /* this store even though the comment implies one is needed -- */
   /* confirm whether mb.h provides ordering elsewhere.           */
   pc->data_tail = tail;
}
00845 
/* Does the kernel define these somewhere? */
/* These layouts mirror the records the kernel writes into the     */
/* perf_event mmap ring buffer (see include/linux/perf_event.h);   */
/* do not reorder or resize the fields.                            */
struct ip_event {
   struct perf_event_header header;
   uint64_t ip;          /* sampled instruction pointer (PERF_RECORD_SAMPLE) */
};
struct lost_event {
   struct perf_event_header header;
   uint64_t id;          /* id of the counter whose records were dropped */
   uint64_t lost;        /* number of records lost to buffer overrun */
};
/* One record of any type we handle; header.type selects the view. */
typedef union event_union {
   struct perf_event_header header;
   struct ip_event ip;
   struct lost_event lost;
} perf_sample_event_t;
00861 
00862 
/* Drain all new records from this event's mmap ring buffer,        */
/* forwarding each PERF_RECORD_SAMPLE's instruction pointer to the  */
/* PAPI profiling dispatcher, then advance the tail pointer.        */
/* Modeled on builtin-record.c in the kernel's tools/perf tree.     */
static void
mmap_read( ThreadInfo_t **thr, pe_event_info_t *pe, 
       int profile_index )
{
   int cidx = _papi_pe_vector.cmp_info.CmpIdx;
   uint64_t head = mmap_read_head( pe );
   uint64_t old = pe->tail;
   /* sample records start one page past the mmap control page */
   unsigned char *data = ((unsigned char*)pe->mmap_buf) + getpagesize(  );
   int diff;

   diff = head - old;
   if ( diff < 0 ) {
      SUBDBG( "WARNING: failed to keep up with mmap data. head = %" PRIu64
          ",  tail = %" PRIu64 ". Discarding samples.\n", head, old );
      /* head points to a known good entry, start there. */
      old = head;
   }

   /* walk every record between our saved tail and the kernel's head */
   for( ; old != head; ) {

      perf_sample_event_t *event = ( perf_sample_event_t * ) 
                               & data[old & pe->mask];
      perf_sample_event_t event_copy;
      size_t size = event->header.size;

      /* Event straddles the mmap boundary -- header should always */
      /* be inside due to u64 alignment of output.                 */
      if ( ( old & pe->mask ) + size != ( ( old + size ) & pe->mask ) ) {
     uint64_t offset = old;
     uint64_t len = min( sizeof ( *event ), size ), cpy;
     void *dst = &event_copy;

     /* copy the wrapped record piecewise into a contiguous buffer */
     do {
        cpy = min( pe->mask + 1 - ( offset & pe->mask ), len );
        memcpy( dst, &data[offset & pe->mask], cpy );
        offset += cpy;
        dst = ((unsigned char*)dst) + cpy;
        len -= cpy;
     } while ( len );

     event = &event_copy;
      }

      /* advance past this record regardless of its type */
      old += size;

      SUBDBG( "event->type = %08x\n", event->header.type );
      SUBDBG( "event->size = %d\n", event->header.size );

      switch ( event->header.type ) {
         case PERF_RECORD_SAMPLE:
          /* hand the sampled instruction pointer to PAPI's profiler */
          _papi_hwi_dispatch_profile( ( *thr )->running_eventset[cidx],
                ( caddr_t ) ( unsigned long ) event->ip.ip, 
                      0, profile_index );
          break;

     case PERF_RECORD_LOST:
          SUBDBG( "Warning: because of a mmap buffer overrun, %" PRId64
              " events were lost.\n"
              "Loss was recorded when counter id 0x%"PRIx64 
              " overflowed.\n", event->lost.lost, event->lost.id );
          break;

     default:
          SUBDBG( "Error: unexpected header type - %d\n",
                    event->header.type );
          break;
      }
   }

   /* remember how far we consumed and tell the kernel so it can */
   /* reuse that portion of the ring buffer                      */
   pe->tail = old;
   mmap_write_tail( pe, old );
}
00937 
00938 /* What exactly does this do? */
00939 static int
00940 process_smpl_buf( int evt_idx, ThreadInfo_t **thr )
00941 {
00942    int ret, flags, profile_index;
00943    unsigned native_index;
00944    int cidx = _papi_pe_vector.cmp_info.CmpIdx;
00945    pe_control_t *ctl;
00946 
00947    ret = find_profile_index( ( *thr )->running_eventset[cidx], evt_idx, 
00948                  &flags, &native_index, &profile_index );
00949    if ( ret != PAPI_OK ) {
00950       return ret;
00951    }
00952 
00953    ctl= (*thr)->running_eventset[cidx]->ctl_state;
00954 
00955    mmap_read( thr, 
00956           &(ctl->events[evt_idx]),
00957           profile_index );
00958 
00959    return PAPI_OK;
00960 }
00961 
00962 
00963 
00964 
00965 /********************************************************************/
00966 /********************************************************************/
00967 /* Start with functions that are exported via the module interface  */
00968 /********************************************************************/
00969 /********************************************************************/
00970 
00971 
00972 /* set the domain. FIXME: perf_events allows per-event control of this. */
00973 /* we do not handle that yet.                                           */
00974 int
00975 _papi_pe_set_domain( hwd_control_state_t *ctl, int domain)
00976 {
00977     
00978    int i;
00979    pe_control_t *pe_ctl = ( pe_control_t *) ctl;
00980 
00981    SUBDBG("old control domain %d, new domain %d, default domain %d\n",
00982       pe_ctl->domain,domain,_papi_pe_vector.cmp_info.default_domain);
00983 
00984    pe_ctl->domain = domain;
00985      
00986    /* Force the domain on all events */
00987    for( i = 0; i < pe_ctl->num_events; i++ ) {
00988       pe_ctl->events[i].attr.exclude_user = 
00989                     !( pe_ctl->domain & PAPI_DOM_USER );
00990       pe_ctl->events[i].attr.exclude_kernel =
00991             !( pe_ctl->domain & PAPI_DOM_KERNEL );
00992       pe_ctl->events[i].attr.exclude_hv =
00993             !( pe_ctl->domain & PAPI_DOM_SUPERVISOR );
00994    }
00995    return PAPI_OK;
00996 }
00997 
00998 
00999 /* Initialize the perf_event component */
01000 static int
01001 _papi_pe_init_component( int cidx )
01002 {
01003 
01004    int retval;
01005    int paranoid_level;
01006 
01007    FILE *fff;
01008 
01009    ( void ) cidx;          /*unused */
01010 
01011    /* The is the official way to detect if perf_event support exists */
01012    /* The file is called perf_counter_paranoid on 2.6.31             */
01013    /* currently we are lazy and do not support 2.6.31 kernels        */
01014    fff=fopen("/proc/sys/kernel/perf_event_paranoid","r");
01015    if (fff==NULL) {
01016       strncpy(_papi_pe_vector.cmp_info.disabled_reason,
01017           "perf_event support not detected",PAPI_MAX_STR_LEN);
01018       return PAPI_ENOCMP;
01019    }
01020 
01021    /* 2 means no measurements allowed          */
01022    /* 1 means normal counter access            */
01023    /* 0 means you can access CPU-specific data */
01024    /* -1 means no restrictions                 */
01025    retval=fscanf(fff,"%d",&paranoid_level);
01026    if (retval!=1) fprintf(stderr,"Error reading paranoid level\n");
01027    fclose(fff);
01028 
01029    if (paranoid_level==2) {
01030       strncpy(_papi_pe_vector.cmp_info.disabled_reason,
01031           "/proc/sys/kernel/perf_event_paranoid prohibits using counters",
01032           PAPI_MAX_STR_LEN);
01033       return PAPI_ENOCMP;
01034    }
01035 
01036    /* Detect NMI watchdog which can steal counters */
01037    nmi_watchdog_active=_linux_detect_nmi_watchdog();
01038    if (nmi_watchdog_active) {
01039       SUBDBG("The Linux nmi_watchdog is using one of the performance "
01040              "counters, reducing the total number available.\n");
01041    }
01042 
01043    /* Kernel multiplexing is broken prior to kernel 2.6.34 */
01044    /* The fix was probably git commit:                     */
01045    /*     45e16a6834b6af098702e5ea6c9a40de42ff77d8         */
01046    if (_papi_os_info.os_version < LINUX_VERSION(2,6,34)) {
01047       _papi_pe_vector.cmp_info.kernel_multiplex = 0;
01048    }
01049    else {
01050       _papi_pe_vector.cmp_info.kernel_multiplex = 1;
01051    }
01052 
01053    /* We use the RealTime signal for some reason */
01054    _papi_pe_vector.cmp_info.hardware_intr_sig = SIGRTMIN + 2;
01055 
01056    /* Check that processor is supported */
01057    if (processor_supported(_papi_hwi_system_info.hw_info.vendor,
01058                _papi_hwi_system_info.hw_info.cpuid_family)!=
01059       PAPI_OK) {
01060       fprintf(stderr,"warning, your processor is unsupported\n");
01061       /* should not return error, as software events should still work */
01062    }
01063 
01064    /* Setup mmtimers, if appropriate */
01065    retval=mmtimer_setup();
01066    if (retval) {
01067       strncpy(_papi_pe_vector.cmp_info.disabled_reason,
01068           "Error initializing mmtimer",PAPI_MAX_STR_LEN);
01069       return retval;
01070    }
01071 
01072    /* Detect if we can use rdpmc (or equivalent) */
01073    /* We currently do not use rdpmc as it is slower in tests */
01074    /* than regular read (as of Linux 3.5)                    */
01075    retval=detect_rdpmc();
01076    if (retval < 0 ) {
01077       strncpy(_papi_pe_vector.cmp_info.disabled_reason,
01078          "sys_perf_event_open() failed, perf_event support for this platform may be broken",PAPI_MAX_STR_LEN);
01079       return retval;
01080    }
01081    _papi_pe_vector.cmp_info.fast_counter_read = retval;
01082 
01083    /* Run Vendor-specific fixups */
01084    pe_vendor_fixups();
01085 
01086    /* Run the libpfm4-specific setup */
01087    retval = _papi_libpfm4_init(&_papi_pe_vector, cidx);
01088    if (retval) {
01089       strncpy(_papi_pe_vector.cmp_info.disabled_reason,
01090           "Error initializing libpfm4",PAPI_MAX_STR_LEN);
01091       return retval;
01092    }
01093 
01094    return PAPI_OK;
01095 
01096 }
01097 
01098 /* Shutdown the perf_event component */
01099 static int
01100 _papi_pe_shutdown_component( void ) {
01101 
01102   /* Shutdown libpfm4 */
01103   _papi_libpfm4_shutdown();
01104 
01105   return PAPI_OK;
01106 }
01107 
01108 
01109 /* Initialize a thread */
01110 static int
01111 _papi_pe_init_thread( hwd_context_t *hwd_ctx )
01112 {
01113 
01114   pe_context_t *pe_ctx = ( pe_context_t *) hwd_ctx;
01115 
01116   /* clear the context structure and mark as initialized */
01117   memset( pe_ctx, 0, sizeof ( pe_context_t ) );
01118   pe_ctx->initialized=1;
01119 
01120   return PAPI_OK;
01121 }
01122 
01123 /* Shutdown a thread */
01124 static int
01125 _papi_pe_shutdown_thread( hwd_context_t *ctx )
01126 {
01127     pe_context_t *pe_ctx = ( pe_context_t *) ctx;
01128 
01129     pe_ctx->initialized=0;
01130 
01131     return PAPI_OK;
01132 }
01133 
01134 
01135 /* reset the hardware counters */
01136 /* Note: PAPI_reset() does not necessarily call this */
01137 /* unless the events are actually running.           */
01138 static int
01139 _papi_pe_reset( hwd_context_t *ctx, hwd_control_state_t *ctl )
01140 {
01141    int i, ret;
01142    pe_control_t *pe_ctl = ( pe_control_t *) ctl;
01143 
01144    ( void ) ctx;             /*unused */
01145 
01146    /* We need to reset all of the events, not just the group leaders */
01147    for( i = 0; i < pe_ctl->num_events; i++ ) {
01148       ret = ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_RESET, NULL );
01149       if ( ret == -1 ) {
01150      PAPIERROR("ioctl(%d, PERF_EVENT_IOC_RESET, NULL) "
01151            "returned error, Linux says: %s",
01152            pe_ctl->events[i].event_fd, strerror( errno ) );
01153      return PAPI_ESYS;
01154       }
01155    }
01156 
01157    return PAPI_OK;
01158 }
01159 
01160 
01161 /* write (set) the hardware counters */
01162 /* Current we do not support this.   */
01163 static int
01164 _papi_pe_write( hwd_context_t *ctx, hwd_control_state_t *ctl,
01165         long long *from )
01166 {
01167    ( void ) ctx;             /*unused */
01168    ( void ) ctl;             /*unused */
01169    ( void ) from;            /*unused */
01170    /*
01171     * Counters cannot be written.  Do we need to virtualize the
01172     * counters so that they can be written, or perhaps modify code so that
01173     * they can be written? FIXME ?
01174     */
01175     
01176     return PAPI_ENOSUPP;
01177 }
01178 
01179 /*
01180  * perf_event provides a complicated read interface.
01181  *  the info returned by read() varies depending on whether
01182  *  you have PERF_FORMAT_GROUP, PERF_FORMAT_TOTAL_TIME_ENABLED,
01183  *  PERF_FORMAT_TOTAL_TIME_RUNNING, or PERF_FORMAT_ID set
01184  *
01185  * To simplify things we just always ask for everything.  This might
01186  * lead to overhead when reading more than we need, but it makes the
01187  * read code a lot simpler than the original implementation we had here.
01188  *
01189  * For more info on the layout see include/linux/perf_event.h
01190  *
01191  */
01192 
01193 static int
01194 _papi_pe_read( hwd_context_t *ctx, hwd_control_state_t *ctl,
01195            long long **events, int flags )
01196 {
01197    ( void ) flags;           /*unused */
01198    int i, ret = -1;
01199    pe_context_t *pe_ctx = ( pe_context_t *) ctx;
01200    pe_control_t *pe_ctl = ( pe_control_t *) ctl;
01201    long long papi_pe_buffer[READ_BUFFER_SIZE];
01202    long long tot_time_running, tot_time_enabled, scale;
01203 
01204    /* On kernels before 2.6.33 the TOTAL_TIME_ENABLED and TOTAL_TIME_RUNNING */
01205    /* fields are always 0 unless the counter is disabled.  So if we are on   */
01206    /* one of these kernels, then we must disable events before reading.      */
01207 
01208    /* Elsewhere though we disable multiplexing on kernels before 2.6.34 */
01209    /* so maybe this isn't even necessary.                               */
01210 
01211    if (bug_sync_read()) {
01212       if ( pe_ctx->state & PERF_EVENTS_RUNNING ) {
01213          for ( i = 0; i < pe_ctl->num_events; i++ ) {
01214         /* disable only the group leaders */
01215         if ( pe_ctl->events[i].group_leader_fd == -1 ) {
01216            ret = ioctl( pe_ctl->events[i].event_fd, 
01217                PERF_EVENT_IOC_DISABLE, NULL );
01218            if ( ret == -1 ) {
01219               PAPIERROR("ioctl(PERF_EVENT_IOC_DISABLE) "
01220                "returned an error: ", strerror( errno ));
01221               return PAPI_ESYS;
01222            }
01223         }
01224      }
01225       }
01226    }
01227 
01228 
01229    /* Handle case where we are multiplexing */
01230    if (pe_ctl->multiplexed) {
01231 
01232       /* currently we handle multiplexing by having individual events */
01233       /* so we read from each in turn.                                */
01234 
01235       for ( i = 0; i < pe_ctl->num_events; i++ ) {
01236              
01237          ret = read( pe_ctl->events[i].event_fd, papi_pe_buffer, 
01238             sizeof ( papi_pe_buffer ) );
01239          if ( ret == -1 ) {
01240         PAPIERROR("read returned an error: ", strerror( errno ));
01241         return PAPI_ESYS;
01242      }
01243 
01244      /* We should read 3 64-bit values from the counter */
01245      if (ret<(signed)(3*sizeof(long long))) {
01246         PAPIERROR("Error!  short read!\n");  
01247         return PAPI_ESYS;
01248      }        
01249 
01250          SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", 
01251             pe_ctl->events[i].event_fd, 
01252         (long)pe_ctl->tid, pe_ctl->cpu, ret);
01253          SUBDBG("read: %lld %lld %lld\n",papi_pe_buffer[0],
01254             papi_pe_buffer[1],papi_pe_buffer[2]);
01255 
01256          tot_time_enabled = papi_pe_buffer[1];     
01257          tot_time_running = papi_pe_buffer[2];
01258 
01259          SUBDBG("count[%d] = (papi_pe_buffer[%d] %lld * "
01260         "tot_time_enabled %lld) / tot_time_running %lld\n",
01261         i, 0,papi_pe_buffer[0],
01262         tot_time_enabled,tot_time_running);
01263     
01264          if (tot_time_running == tot_time_enabled) {
01265         /* No scaling needed */
01266         pe_ctl->counts[i] = papi_pe_buffer[0];
01267          } else if (tot_time_running && tot_time_enabled) {
01268         /* Scale factor of 100 to avoid overflows when computing */
01269         /*enabled/running */
01270 
01271         scale = (tot_time_enabled * 100LL) / tot_time_running;
01272         scale = scale * papi_pe_buffer[0];
01273         scale = scale / 100LL;
01274         pe_ctl->counts[i] = scale;
01275      } else {
01276        /* This should not happen, but Phil reports it sometime does. */
01277         SUBDBG("perf_event kernel bug(?) count, enabled, "
01278            "running: %lld, %lld, %lld\n",
01279            papi_pe_buffer[0],tot_time_enabled,
01280            tot_time_running);
01281 
01282         pe_ctl->counts[i] = papi_pe_buffer[0];
01283      }
01284       }
01285    }
01286 
01287    /* Handle cases where we cannot use FORMAT GROUP */
01288    else if (bug_format_group() || pe_ctl->inherit) {
01289 
01290       /* we must read each counter individually */
01291       for ( i = 0; i < pe_ctl->num_events; i++ ) {
01292 
01293          ret = read( pe_ctl->events[i].event_fd, papi_pe_buffer, 
01294             sizeof ( papi_pe_buffer ) );
01295          if ( ret == -1 ) {
01296         PAPIERROR("read returned an error: ", strerror( errno ));
01297         return PAPI_ESYS;
01298      }
01299 
01300      /* we should read one 64-bit value from each counter */
01301      if (ret!=sizeof(long long)) {
01302         PAPIERROR("Error!  short read!\n");
01303         PAPIERROR("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n",
01304            pe_ctl->events[i].event_fd,
01305            (long)pe_ctl->tid, pe_ctl->cpu, ret);
01306         return PAPI_ESYS;
01307      }     
01308 
01309          SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", 
01310             pe_ctl->events[i].event_fd, (long)pe_ctl->tid, 
01311         pe_ctl->cpu, ret);
01312          SUBDBG("read: %lld\n",papi_pe_buffer[0]);
01313      
01314      pe_ctl->counts[i] = papi_pe_buffer[0];
01315       }
01316    }
01317 
01318    
01319    /* Handle cases where we are using FORMAT_GROUP   */
01320    /* We assume only one group leader, in position 0 */
01321 
01322    else {
01323       if (pe_ctl->events[0].group_leader_fd!=-1) {
01324      PAPIERROR("Was expecting group leader!\n");
01325       }
01326 
01327       ret = read( pe_ctl->events[0].event_fd, papi_pe_buffer, 
01328           sizeof ( papi_pe_buffer ) );
01329 
01330       if ( ret == -1 ) {
01331      PAPIERROR("read returned an error: ", strerror( errno ));
01332      return PAPI_ESYS;
01333       }
01334 
01335       /* we read 1 64-bit value (number of events) then     */
01336       /* num_events more 64-bit values that hold the counts */
01337       if (ret<(signed)((1+pe_ctl->num_events)*sizeof(long long))) {
01338      PAPIERROR("Error! short read!\n");
01339      return PAPI_ESYS;
01340       }
01341 
01342       SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", 
01343          pe_ctl->events[0].event_fd, 
01344          (long)pe_ctl->tid, pe_ctl->cpu, ret);
01345       { 
01346      int j;
01347      for(j=0;j<ret/8;j++) {
01348             SUBDBG("read %d: %lld\n",j,papi_pe_buffer[j]);
01349      }
01350       }
01351 
01352       /* Make sure the kernel agrees with how many events we have */
01353       if (papi_pe_buffer[0]!=pe_ctl->num_events) {
01354      PAPIERROR("Error!  Wrong number of events!\n");
01355      return PAPI_ESYS;
01356       }
01357 
01358       /* put the count values in their proper location */
01359       for(i=0;i<papi_pe_buffer[0];i++) {
01360          pe_ctl->counts[i] = papi_pe_buffer[1+i];
01361       }
01362    }
01363 
01364 
01365    /* If we disabled the counters due to the sync_read_bug(), */
01366    /* then we need to re-enable them now.                     */
01367    if (bug_sync_read()) {
01368       if ( pe_ctx->state & PERF_EVENTS_RUNNING ) {
01369          for ( i = 0; i < pe_ctl->num_events; i++ ) {
01370         if ( pe_ctl->events[i].group_leader_fd == -1 ) {
01371            /* this should refresh any overflow counters too */
01372            ret = ioctl( pe_ctl->events[i].event_fd, 
01373                 PERF_EVENT_IOC_ENABLE, NULL );
01374            if ( ret == -1 ) {
01375               /* Should never happen */
01376               PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) returned an error: ",
01377                 strerror( errno ));
01378               return PAPI_ESYS;
01379            }
01380         }
01381      }
01382       }
01383    }
01384 
01385    /* point PAPI to the values we read */
01386    *events = pe_ctl->counts;
01387 
01388    return PAPI_OK;
01389 }
01390 
01391 /* Start counting events */
01392 static int
01393 _papi_pe_start( hwd_context_t *ctx, hwd_control_state_t *ctl )
01394 {
01395    int ret;
01396    int i;
01397    int did_something = 0;
01398    pe_context_t *pe_ctx = ( pe_context_t *) ctx;
01399    pe_control_t *pe_ctl = ( pe_control_t *) ctl;
01400 
01401    /* Reset the counters first.  Is this necessary? */
01402    ret = _papi_pe_reset( pe_ctx, pe_ctl );
01403    if ( ret ) {
01404       return ret;
01405    }
01406 
01407    /* Enable all of the group leaders                */
01408    /* All group leaders have a group_leader_fd of -1 */
01409    for( i = 0; i < pe_ctl->num_events; i++ ) {
01410       if (pe_ctl->events[i].group_leader_fd == -1) {
01411      SUBDBG("ioctl(enable): fd: %d\n", pe_ctl->events[i].event_fd);
01412      ret=ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_ENABLE, NULL) ; 
01413 
01414      /* ioctls always return -1 on failure */
01415          if (ret == -1) {
01416             PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) failed.\n");
01417             return PAPI_ESYS;
01418      }
01419 
01420      did_something++;
01421       } 
01422    }
01423 
01424    if (!did_something) {
01425       PAPIERROR("Did not enable any counters.\n");
01426       return PAPI_EBUG;
01427    }
01428 
01429    pe_ctx->state |= PERF_EVENTS_RUNNING;
01430 
01431    return PAPI_OK;
01432 
01433 }
01434 
01435 /* Stop all of the counters */
01436 static int
01437 _papi_pe_stop( hwd_context_t *ctx, hwd_control_state_t *ctl )
01438 {
01439     
01440    int ret;
01441    int i;
01442    pe_context_t *pe_ctx = ( pe_context_t *) ctx;
01443    pe_control_t *pe_ctl = ( pe_control_t *) ctl;
01444 
01445    /* Just disable the group leaders */
01446    for ( i = 0; i < pe_ctl->num_events; i++ ) {
01447       if ( pe_ctl->events[i].group_leader_fd == -1 ) {
01448      ret=ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_DISABLE, NULL);
01449      if ( ret == -1 ) {
01450         PAPIERROR( "ioctl(%d, PERF_EVENT_IOC_DISABLE, NULL) "
01451                "returned error, Linux says: %s",
01452                pe_ctl->events[i].event_fd, strerror( errno ) );
01453         return PAPI_EBUG;
01454      }
01455       }
01456    }
01457 
01458    pe_ctx->state &= ~PERF_EVENTS_RUNNING;
01459 
01460    return PAPI_OK;
01461 }
01462 
01463 /* Initialize a new control state */
01464 static int
01465 _papi_pe_init_control_state( hwd_control_state_t *ctl )
01466 {
01467    pe_control_t *pe_ctl = ( pe_control_t *) ctl;
01468     
01469    /* clear the contents */
01470    memset( pe_ctl, 0, sizeof ( pe_control_t ) );
01471    _papi_pe_set_domain( ctl, _papi_pe_vector.cmp_info.default_domain );
01472 
01473    /* default granularity */
01474    pe_ctl->granularity=PAPI_GRN_THR;
01475 
01476    /* Set cpu number in the control block to show events */
01477    /* are not tied to specific cpu                       */
01478    pe_ctl->cpu = -1;
01479    return PAPI_OK;
01480 }
01481 
01482 
01483 /* This function clears the current contents of the control structure and
01484    updates it with whatever resources are allocated for all the native events
01485    in the native info structure array. */
01486 
01487 static int
01488 _papi_pe_update_control_state( hwd_control_state_t *ctl, 
01489                    NativeInfo_t *native,
01490                    int count, hwd_context_t *ctx )
01491 {
01492    int i = 0, ret;
01493    pe_context_t *pe_ctx = ( pe_context_t *) ctx;
01494    pe_control_t *pe_ctl = ( pe_control_t *) ctl;
01495 
01496    /* close all of the existing fds and start over again */
01497    /* In theory we could have finer-grained control and know if             */
01498    /* things were changed, but it's easier to tear things down and rebuild. */
01499    close_pe_events( pe_ctx, pe_ctl );
01500 
01501    /* Calling with count==0 should be OK, it's how things are deallocated */
01502    /* when an eventset is destroyed.                                      */
01503    if ( count == 0 ) {
01504       SUBDBG( "Called with count == 0\n" );
01505       return PAPI_OK;
01506    }
01507 
01508    /* set up all the events */
01509    for( i = 0; i < count; i++ ) {
01510       if ( native ) {
01511      /* Have libpfm4 set the config values for the event */
01512      ret=_papi_libpfm4_setup_counters(&pe_ctl->events[i].attr,
01513                      native[i].ni_event);
01514      SUBDBG( "pe_ctl->eventss[%d].config=%"PRIx64"\n",i,
01515          pe_ctl->events[i].attr.config);
01516      if (ret!=PAPI_OK) return ret;
01517 
01518       } else {
01519       /* I'm not sure how we'd end up in this case */
01520           /* should it be an error?                    */
01521       }
01522 
01523       /* Copy the inherit flag into the attribute block that will be   */
01524       /* passed to the kernel */
01525       pe_ctl->events[i].attr.inherit = pe_ctl->inherit;
01526 
01527       /* Set the position in the native structure */
01528       /* We just set up events linearly           */
01529       if ( native ) {
01530      native[i].ni_position = i;
01531       }
01532    }
01533 
01534    pe_ctl->num_events = count;
01535    _papi_pe_set_domain( ctl, pe_ctl->domain );
01536 
01537    /* actuall open the events */
01538    /* (why is this a separate function?) */
01539    ret = open_pe_events( pe_ctx, pe_ctl );
01540    if ( ret != PAPI_OK ) {
01541       SUBDBG("open_pe_events failed\n");
01542       /* Restore values ? */
01543       return ret;
01544    }
01545 
01546    return PAPI_OK;
01547 }
01548 
/* Set various options on a control state */
/* Dispatches on the PAPI option code: multiplexing, thread/cpu    */
/* attach/detach, domain, granularity, and inheritance.  Options   */
/* that change kernel state call check_permissions() first and     */
/* return PAPI_EPERM when the combination is not allowed.          */
static int
_papi_pe_ctl( hwd_context_t *ctx, int code, _papi_int_option_t *option )
{
   int ret;
   pe_context_t *pe_ctx = ( pe_context_t *) ctx;
   pe_control_t *pe_ctl = NULL;

   switch ( code ) {
      case PAPI_MULTIPLEX:
       pe_ctl = ( pe_control_t * ) ( option->multiplex.ESI->ctl_state );
       if (check_permissions( pe_ctl->tid, pe_ctl->cpu, pe_ctl->domain,
                  pe_ctl->granularity,
                  1, pe_ctl->inherit ) != PAPI_OK) {
          return PAPI_EPERM;
       }

       /* looks like we are allowed, so set multiplexed attribute */
       pe_ctl->multiplexed = 1;
       /* rebuild the kernel events under the new setting; roll the */
       /* flag back if the rebuild fails                            */
       ret = _papi_pe_update_control_state( pe_ctl, NULL, 
                        pe_ctl->num_events, pe_ctx );
       if (ret != PAPI_OK) {
          pe_ctl->multiplexed = 0;
       }
       return ret;
    
      case PAPI_ATTACH:
       pe_ctl = ( pe_control_t * ) ( option->attach.ESI->ctl_state );
       if (check_permissions( option->attach.tid, pe_ctl->cpu, 
                  pe_ctl->domain, pe_ctl->granularity,
                  pe_ctl->multiplexed, 
                  pe_ctl->inherit ) != PAPI_OK) {
          return PAPI_EPERM;
       }

       pe_ctl->tid = option->attach.tid;

       /* If events have been already been added, something may */
       /* have been done to the kernel, so update */
       ret = _papi_pe_update_control_state( pe_ctl, NULL, 
                        pe_ctl->num_events, pe_ctx);
       
       return ret;

      case PAPI_DETACH:
       pe_ctl = ( pe_control_t *) ( option->attach.ESI->ctl_state );

       /* tid 0 means measure the calling thread again */
       pe_ctl->tid = 0;
       return PAPI_OK;

      case PAPI_CPU_ATTACH:
       pe_ctl = ( pe_control_t *) ( option->cpu.ESI->ctl_state );
       if (check_permissions( pe_ctl->tid, option->cpu.cpu_num, 
                  pe_ctl->domain, pe_ctl->granularity,
                  pe_ctl->multiplexed, 
                  pe_ctl->inherit ) != PAPI_OK) {
           return PAPI_EPERM;
       }
       /* looks like we are allowed so set cpu number */

       /* this tells the kernel not to count for a thread   */
       /* should we warn if we try to set both?  perf_event */
       /* will reject it.                                   */
       pe_ctl->tid = -1;      

       pe_ctl->cpu = option->cpu.cpu_num;

       return PAPI_OK;

      case PAPI_DOMAIN:
       pe_ctl = ( pe_control_t *) ( option->domain.ESI->ctl_state );
       if (check_permissions( pe_ctl->tid, pe_ctl->cpu, 
                  option->domain.domain,
                  pe_ctl->granularity,
                  pe_ctl->multiplexed,
                  pe_ctl->inherit ) != PAPI_OK) {
          return PAPI_EPERM;
       }
       /* looks like we are allowed, so set counting domain */
       return _papi_pe_set_domain( pe_ctl, option->domain.domain );

      case PAPI_GRANUL:
       pe_ctl = (pe_control_t *) ( option->granularity.ESI->ctl_state );

       /* FIXME: we really don't support this yet */

           switch ( option->granularity.granularity  ) {
              case PAPI_GRN_PROCG:
              case PAPI_GRN_SYS_CPU:
              case PAPI_GRN_PROC:
           return PAPI_ECMP;
     
          /* Currently we only support thread and CPU granularity */
              case PAPI_GRN_SYS:
           pe_ctl->granularity=PAPI_GRN_SYS;
           break;

              case PAPI_GRN_THR:
           pe_ctl->granularity=PAPI_GRN_THR;
           break;


              default:
           return PAPI_EINVAL;
       }
           return PAPI_OK;

      case PAPI_INHERIT:
       pe_ctl = (pe_control_t *) ( option->inherit.ESI->ctl_state );
       if (check_permissions( pe_ctl->tid, pe_ctl->cpu, pe_ctl->domain, 
                  pe_ctl->granularity, pe_ctl->multiplexed, 
                  option->inherit.inherit ) != PAPI_OK) {
          return PAPI_EPERM;
       }
       /* looks like we are allowed, so set the requested inheritance */
       if (option->inherit.inherit) {
          /* children will inherit counters */
          pe_ctl->inherit = 1;
       } else {
          /* children won't inherit counters */
          pe_ctl->inherit = 0;
       }
       return PAPI_OK;

      case PAPI_DATA_ADDRESS:
       /* address-range qualification is not supported by this */
       /* component; the dead code below is kept for reference */
       return PAPI_ENOSUPP;
#if 0
       pe_ctl = (pe_control_t *) (option->address_range.ESI->ctl_state);
       ret = set_default_domain( pe_ctl, option->address_range.domain );
       if ( ret != PAPI_OK ) {
          return ret;
       }
       set_drange( pe_ctx, pe_ctl, option );
       return PAPI_OK;
#endif
      case PAPI_INSTR_ADDRESS:
       return PAPI_ENOSUPP;
#if 0
       pe_ctl = (pe_control_t *) (option->address_range.ESI->ctl_state);
       ret = set_default_domain( pe_ctl, option->address_range.domain );
       if ( ret != PAPI_OK ) {
          return ret;
       }
       set_irange( pe_ctx, pe_ctl, option );
       return PAPI_OK;
#endif

      case PAPI_DEF_ITIMER:
       /* What should we be checking for here?                   */
       /* This seems like it should be OS-specific not component */
       /* specific.                                              */

       return PAPI_OK;
    
      case PAPI_DEF_MPX_NS:
       /* Defining a given ns per set is not current supported */
       return PAPI_ENOSUPP;
    
      case PAPI_DEF_ITIMER_NS:
       /* We don't support this... */
       return PAPI_OK;
    
      default:
       return PAPI_ENOSUPP;
   }
}
01715 
01716 
01717 /*
01718  * This function is used when hardware overflows are working or when
01719  * software overflows are forced
01720  */
01721 
01722 static void
01723 _papi_pe_dispatch_timer( int n, hwd_siginfo_t *info, void *uc )
01724 {
01725    ( void ) n;               /*unused */
01726    _papi_hwi_context_t hw_context;
01727    int found_evt_idx = -1, fd = info->si_fd;
01728    caddr_t address;
01729    ThreadInfo_t *thread = _papi_hwi_lookup_thread( 0 );
01730    int cidx = _papi_pe_vector.cmp_info.CmpIdx;
01731    int i;
01732    pe_control_t *ctl;
01733 
01734    if ( thread == NULL ) {
01735       PAPIERROR( "thread == NULL in _papi_pe_dispatch_timer for fd %d!", fd );
01736       return;
01737    }
01738 
01739    if ( thread->running_eventset[cidx] == NULL ) {
01740       PAPIERROR( "thread->running_eventset == NULL in "
01741          "_papi_pe_dispatch_timer for fd %d!",fd );
01742       return;
01743    }
01744 
01745    if ( thread->running_eventset[cidx]->overflow.flags == 0 ) {
01746       PAPIERROR( "thread->running_eventset->overflow.flags == 0 in "
01747          "_papi_pe_dispatch_timer for fd %d!", fd );
01748       return;
01749    }
01750     
01751    hw_context.si = info;
01752    hw_context.ucontext = ( hwd_ucontext_t * ) uc;
01753 
01754    if ( thread->running_eventset[cidx]->overflow.flags & 
01755     PAPI_OVERFLOW_FORCE_SW ) {
01756       address = GET_OVERFLOW_ADDRESS( hw_context );
01757       _papi_hwi_dispatch_overflow_signal( ( void * ) &hw_context, 
01758                       address, NULL, 0,
01759                       0, &thread, cidx );
01760       return;
01761    }
01762 
01763    if ( thread->running_eventset[cidx]->overflow.flags !=
01764          PAPI_OVERFLOW_HARDWARE ) {
01765       PAPIERROR( "thread->running_eventset->overflow.flags is set to "
01766          "something other than PAPI_OVERFLOW_HARDWARE or "
01767          "PAPI_OVERFLOW_FORCE_SW for fd %d (%x)",
01768          fd , thread->running_eventset[cidx]->overflow.flags);
01769    }
01770 
01771    /* convoluted way to get ctl */
01772    ctl= thread->running_eventset[cidx]->ctl_state;
01773 
01774    /* See if the fd is one that's part of the this thread's context */
01775    for( i=0; i < ctl->num_events; i++ ) {
01776       if ( fd == ctl->events[i].event_fd ) {
01777      found_evt_idx = i;
01778      break;
01779       }
01780    }
01781 
01782    if ( found_evt_idx == -1 ) {
01783       PAPIERROR( "Unable to find fd %d among the open event fds "
01784          "_papi_hwi_dispatch_timer!", fd );
01785       return;
01786    }
01787     
01788    ioctl( fd, PERF_EVENT_IOC_DISABLE, NULL );
01789 
01790    if ( ( thread->running_eventset[cidx]->state & PAPI_PROFILING ) && 
01791     !( thread->running_eventset[cidx]->profile.flags & 
01792        PAPI_PROFIL_FORCE_SW ) ) {
01793       process_smpl_buf( found_evt_idx, &thread );
01794    }
01795    else {
01796       uint64_t ip;
01797       unsigned int head;
01798       pe_event_info_t *pe = &(ctl->events[found_evt_idx]);
01799       unsigned char *data = ((unsigned char*)pe->mmap_buf) + getpagesize(  );
01800 
01801       /*
01802        * Read up the most recent IP from the sample in the mmap buffer.  To
01803        * do this, we make the assumption that all of the records in the
01804        * mmap buffer are the same size, and that they all contain the IP as
01805        * their only record element.  This means that we can use the
01806        * data_head element from the user page and move backward one record
01807        * from that point and read the data.  Since we don't actually need
01808        * to access the header of the record, we can just subtract 8 (size
01809        * of the IP) from data_head and read up that word from the mmap
01810        * buffer.  After we subtract 8, we account for mmap buffer wrapping
01811        * by AND'ing this offset with the buffer mask.
01812        */
01813       head = mmap_read_head( pe );
01814 
01815       if ( head == 0 ) {
01816      PAPIERROR( "Attempting to access memory which may be inaccessable" );
01817      return;
01818       }
01819 
01820       ip = *( uint64_t * ) ( data + ( ( head - 8 ) & pe->mask ) );
01821       /*
01822        * Update the tail to the current head pointer. 
01823        *
01824        * Note: that if we were to read the record at the tail pointer,
01825        * rather than the one at the head (as you might otherwise think
01826        * would be natural), we could run into problems.  Signals don't
01827        * stack well on Linux, particularly if not using RT signals, and if
01828        * they come in rapidly enough, we can lose some.  Overtime, the head
01829        * could catch up to the tail and monitoring would be stopped, and
01830        * since no more signals are coming in, this problem will never be
01831        * resolved, resulting in a complete loss of overflow notification
01832        * from that point on.  So the solution we use here will result in
01833        * only the most recent IP value being read every time there are two
01834        * or more samples in the buffer (for that one overflow signal).  But
01835        * the handler will always bring up the tail, so the head should
01836        * never run into the tail.
01837        */
01838       mmap_write_tail( pe, head );
01839 
01840       /*
01841        * The fourth parameter is supposed to be a vector of bits indicating
01842        * the overflowed hardware counters, but it's not really clear that
01843        * it's useful, because the actual hardware counters used are not
01844        * exposed to the PAPI user.  For now, I'm just going to set the bit
01845        * that indicates which event register in the array overflowed.  The
01846        * result is that the overflow vector will not be identical to the
01847        * perfmon implementation, and part of that is due to the fact that
01848        * which hardware register is actually being used is opaque at the
01849        * user level (the kernel event dispatcher hides that info).
01850        */
01851 
01852       _papi_hwi_dispatch_overflow_signal( ( void * ) &hw_context,
01853                       ( caddr_t ) ( unsigned long ) ip,
01854                       NULL, ( 1 << found_evt_idx ), 0,
01855                       &thread, cidx );
01856 
01857    }
01858 
01859    /* Restart the counters */
01860    if (ioctl( fd, PERF_EVENT_IOC_REFRESH, PAPI_REFRESH_VALUE ) == -1) {
01861       PAPIERROR( "overflow refresh failed", 0 );
01862    }
01863 }
01864 
01865 /* Stop profiling */
01866 static int
01867 _papi_pe_stop_profiling( ThreadInfo_t *thread, EventSetInfo_t *ESI )
01868 {
01869    int i, ret = PAPI_OK;
01870    pe_control_t *ctl;
01871 
01872    ctl=ESI->ctl_state;
01873 
01874    /* Loop through all of the events and process those which have mmap */
01875    /* buffers attached.                                                */
01876    for ( i = 0; i < ctl->num_events; i++ ) {
01877       /* Use the mmap_buf field as an indicator of this fd being used for */
01878       /* profiling.                                                       */
01879       if ( ctl->events[i].mmap_buf ) {
01880      /* Process any remaining samples in the sample buffer */
01881      ret = process_smpl_buf( i, &thread );
01882      if ( ret ) {
01883         PAPIERROR( "process_smpl_buf returned error %d", ret );
01884         return ret;
01885      }
01886       }
01887    }
01888    return ret;
01889 }
01890 
01891 
01892 /* Setup an event to cause overflow */
01893 static int
01894 _papi_pe_set_overflow( EventSetInfo_t *ESI, int EventIndex, int threshold )
01895 {
01896    int cidx = _papi_pe_vector.cmp_info.CmpIdx;
01897    pe_context_t *ctx = ( pe_context_t *) ( ESI->master->context[cidx] );
01898    pe_control_t *ctl = (pe_control_t *) ( ESI->ctl_state );
01899    int i, evt_idx, found_non_zero_sample_period = 0, retval = PAPI_OK;
01900 
01901    evt_idx = ESI->EventInfoArray[EventIndex].pos[0];
01902 
01903    SUBDBG("Attempting to set overflow for index %d (%d) of EventSet %d\n",
01904       evt_idx,EventIndex,ESI->EventSetIndex);
01905 
01906    if (evt_idx<0) {
01907       return PAPI_EINVAL;
01908    }
01909 
01910    if ( threshold == 0 ) {
01911       /* If this counter isn't set to overflow, it's an error */
01912       if ( ctl->events[evt_idx].attr.sample_period == 0 ) return PAPI_EINVAL;
01913    }
01914 
01915    ctl->events[evt_idx].attr.sample_period = threshold;
01916 
01917    /*
01918     * Note that the wakeup_mode field initially will be set to zero
01919     * (WAKEUP_MODE_COUNTER_OVERFLOW) as a result of a call to memset 0 to
01920     * all of the events in the ctl struct.
01921     *
01922     * Is it even set to any other value elsewhere?
01923     */
01924    switch ( ctl->events[evt_idx].wakeup_mode ) {
01925     case WAKEUP_MODE_PROFILING:
01926          /* Setting wakeup_events to special value zero means issue a */
01927          /* wakeup (signal) on every mmap page overflow.              */
01928          ctl->events[evt_idx].attr.wakeup_events = 0;
01929          break;
01930 
01931     case WAKEUP_MODE_COUNTER_OVERFLOW:
01932          /* Can this code ever be called? */
01933 
01934          /* Setting wakeup_events to one means issue a wakeup on every */
01935              /* counter overflow (not mmap page overflow).                 */
01936          ctl->events[evt_idx].attr.wakeup_events = 1;
01937          /* We need the IP to pass to the overflow handler */
01938          ctl->events[evt_idx].attr.sample_type = PERF_SAMPLE_IP;
01939          /* one for the user page, and two to take IP samples */
01940          ctl->events[evt_idx].nr_mmap_pages = 1 + 2;
01941          break;
01942     default:
01943          PAPIERROR( "ctl->wakeup_mode[%d] set to an unknown value - %u",
01944              evt_idx, ctl->events[evt_idx].wakeup_mode);
01945          return PAPI_EBUG;
01946    }
01947 
01948    /* Check for non-zero sample period */
01949    for ( i = 0; i < ctl->num_events; i++ ) {
01950       if ( ctl->events[evt_idx].attr.sample_period ) {
01951      found_non_zero_sample_period = 1;
01952      break;
01953       }
01954    }
01955 
01956    if ( found_non_zero_sample_period ) {
01957       /* turn on internal overflow flag for this event set */
01958       ctl->overflow = 1;
01959         
01960       /* Enable the signal handler */
01961       retval = _papi_hwi_start_signal( 
01962                   _papi_pe_vector.cmp_info.hardware_intr_sig, 
01963                   1, _papi_pe_vector.cmp_info.CmpIdx );
01964    } else {
01965       /* turn off internal overflow flag for this event set */
01966       ctl->overflow = 0;
01967         
01968       /* Remove the signal handler, if there are no remaining non-zero */
01969       /* sample_periods set                                            */
01970       retval = _papi_hwi_stop_signal( 
01971                  _papi_pe_vector.cmp_info.hardware_intr_sig );
01972       if ( retval != PAPI_OK ) return retval;
01973    }
01974     
01975    retval = _papi_pe_update_control_state( ctl, NULL,
01976                 ( (pe_control_t *) (ESI->ctl_state) )->num_events,
01977                        ctx );
01978 
01979    return retval;
01980 }
01981 
01982 /* Enable profiling */
01983 static int
01984 _papi_pe_set_profile( EventSetInfo_t *ESI, int EventIndex, int threshold )
01985 {
01986    int ret;
01987    int evt_idx;
01988    pe_control_t *ctl = ( pe_control_t *) ( ESI->ctl_state );
01989 
01990    /* Since you can't profile on a derived event, the event is always the */
01991    /* first and only event in the native event list.                      */
01992    evt_idx = ESI->EventInfoArray[EventIndex].pos[0];
01993 
01994    if ( threshold == 0 ) {
01995       SUBDBG( "MUNMAP(%p,%"PRIu64")\n", ctl->events[evt_idx].mmap_buf,
01996           ( uint64_t ) ctl->events[evt_idx].nr_mmap_pages *
01997           getpagesize(  ) );
01998 
01999       if ( ctl->events[evt_idx].mmap_buf ) {
02000      munmap( ctl->events[evt_idx].mmap_buf,
02001          ctl->events[evt_idx].nr_mmap_pages * getpagesize() );
02002       }
02003 
02004       ctl->events[evt_idx].mmap_buf = NULL;
02005       ctl->events[evt_idx].nr_mmap_pages = 0;
02006       ctl->events[evt_idx].attr.sample_type &= ~PERF_SAMPLE_IP;
02007       ret = _papi_pe_set_overflow( ESI, EventIndex, threshold );
02008       /* ??? #warning "This should be handled somewhere else" */
02009       ESI->state &= ~( PAPI_OVERFLOWING );
02010       ESI->overflow.flags &= ~( PAPI_OVERFLOW_HARDWARE );
02011 
02012       return ret;
02013    }
02014 
02015    /* Look up the native event code */
02016    if ( ESI->profile.flags & (PAPI_PROFIL_DATA_EAR | PAPI_PROFIL_INST_EAR)) {
02017       /* Not supported yet... */
02018 
02019       return PAPI_ENOSUPP;
02020    }
02021 
02022    if ( ESI->profile.flags & PAPI_PROFIL_RANDOM ) {
02023       /* This requires an ability to randomly alter the sample_period within */
02024       /* a given range.  Kernel does not have this ability. FIXME            */
02025       return PAPI_ENOSUPP;
02026    }
02027 
02028    /* Just a guess at how many pages would make this relatively efficient.  */
02029    /* Note that it's "1 +" because of the need for a control page, and the  */
02030    /* number following the "+" must be a power of 2 (1, 4, 8, 16, etc) or   */
02031    /* zero.  This is required to optimize dealing with circular buffer      */
02032    /* wrapping of the mapped pages.                                         */
02033 
02034    ctl->events[evt_idx].nr_mmap_pages = (1+8);
02035    ctl->events[evt_idx].attr.sample_type |= PERF_SAMPLE_IP;
02036 
02037    ret = _papi_pe_set_overflow( ESI, EventIndex, threshold );
02038    if ( ret != PAPI_OK ) return ret;
02039 
02040    return PAPI_OK;
02041 }
02042 
02043 
/* Our component vector */
/* Registers the perf_event component with the PAPI framework: static */
/* capability info, opaque-structure sizes, and the function pointers  */
/* the framework calls into this file through.                         */

papi_vector_t _papi_pe_vector = {
   .cmp_info = {
	  /* component information (unspecified values initialized to 0) */
      .name = "perf_events",
      .short_name = "pe",
      /* NOTE(review): version string lags the 5.1 release of the */
      /* surrounding tree -- confirm whether this is intentional. */
      .version = "5.0",
      .description = "Linux perf_event CPU counters",

      .default_domain = PAPI_DOM_USER,
      .available_domains = PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR,
      .default_granularity = PAPI_GRN_THR,
      .available_granularities = PAPI_GRN_THR | PAPI_GRN_SYS,

      /* Hardware overflow interrupts and kernel-assisted profiling */
      /* are both implemented (see _papi_pe_set_overflow/_set_profile). */
      .hardware_intr = 1,
      .kernel_profile = 1,
      .num_mpx_cntrs = PERF_EVENT_MAX_MPX_COUNTERS,

      /* component specific cmp_info initializations */
      .fast_virtual_timer = 0,
      .attach = 1,
      .attach_must_ptrace = 1,
      .cpu = 1,
      .inherit = 1,
      .cntr_umasks = 1,

  },

  /* sizes of framework-opaque component-private structures */
  .size = {
      .context = sizeof ( pe_context_t ),
      .control_state = sizeof ( pe_control_t ),
      .reg_value = sizeof ( int ),
      .reg_alloc = sizeof ( int ),
  },

  /* function pointers in this component */
  .init_control_state =    _papi_pe_init_control_state,
  .start =                 _papi_pe_start,
  .stop =                  _papi_pe_stop,
  .read =                  _papi_pe_read,
  .shutdown_thread =       _papi_pe_shutdown_thread,
  .shutdown_component =    _papi_pe_shutdown_component,
  .ctl =                   _papi_pe_ctl,
  .update_control_state =  _papi_pe_update_control_state,
  .set_domain =            _papi_pe_set_domain,
  .reset =                 _papi_pe_reset,
  .set_overflow =          _papi_pe_set_overflow,
  .set_profile =           _papi_pe_set_profile,
  .stop_profiling =        _papi_pe_stop_profiling,
  .init_component =        _papi_pe_init_component,
  .dispatch_timer =        _papi_pe_dispatch_timer,
  .write =                 _papi_pe_write,
  .init_thread =           _papi_pe_init_thread,

  /* from counter name mapper (libpfm4 does event-name resolution) */
  .ntv_enum_events =   _papi_libpfm4_ntv_enum_events,
  .ntv_name_to_code =  _papi_libpfm4_ntv_name_to_code,
  .ntv_code_to_name =  _papi_libpfm4_ntv_code_to_name,
  .ntv_code_to_descr = _papi_libpfm4_ntv_code_to_descr,
  .ntv_code_to_info =  _papi_libpfm4_ntv_code_to_info,
};
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Defines