PAPI  5.0.1.0
perf_events.c
00001 /*
00002 * File:    perf_events.c
00003 *
00004 * Author:  Corey Ashford
00005 *          cjashfor@us.ibm.com
00006 *          - based upon perfmon.c written by -
00007 *          Philip Mucci
00008 *          mucci@cs.utk.edu
00009 * Mods:    Gary Mohr
00010 *          gary.mohr@bull.com
00011 * Mods:    Vince Weaver
00012 *          vweaver1@eecs.utk.edu
00013 * Mods:    Philip Mucci
00014 *      mucci@eecs.utk.edu */
00015 
00016 
00017 #include <fcntl.h>
00018 #include <string.h>
00019 #include <errno.h>
00020 #include <signal.h>
00021 #include <syscall.h>
00022 #include <sys/utsname.h>
00023 #include <sys/mman.h>
00024 #include <sys/ioctl.h>
00025 
00026 /* PAPI-specific includes */
00027 #include "papi.h"
00028 #include "papi_memory.h"
00029 #include "papi_internal.h"
00030 #include "papi_vector.h"
00031 #include "extras.h"
00032 
00033 /* libpfm4 includes */
00034 #include "papi_libpfm4_events.h"
00035 #include "perfmon/pfmlib.h"
00036 #include PEINCLUDE
00037 
00038 /* Linux-specific includes */
00039 #include "mb.h"
00040 #include "syscalls.h"
00041 #include "linux-memory.h"
00042 #include "linux-timer.h"
00043 #include "linux-common.h"
00044 #include "linux-context.h"
00045 
00046 /* Various definitions */
00047 
00048 /* This is arbitrary.  Typically you can add up to ~1000 before */
00049 /* you run out of fds                                           */
00050 #define PERF_EVENT_MAX_MPX_COUNTERS 64
00051 
00052 /* We really don't need fancy definitions for these */
00053 
00054 typedef struct
00055 {
00056   int group_leader_fd;            /* fd of group leader                   */
00057   int event_fd;                   /* fd of event                          */
00058   int event_opened;               /* event successfully opened            */
00059   uint32_t nr_mmap_pages;     /* number pages in the mmap buffer      */
00060   void *mmap_buf;         /* used for control/profiling           */
00061   uint64_t tail;          /* current read location in mmap buffer */
00062   uint64_t mask;          /* mask used for wrapping the pages     */
00063   struct perf_event_attr attr;    /* perf_event config structure          */
00064   unsigned int wakeup_mode;       /* wakeup mode when sampling            */
00065 } pe_event_info_t;
00066 
00067 typedef struct
00068 {
00069   int num_events;                 /* number of events in control state */
00070   unsigned int domain;            /* control-state wide domain         */
00071   unsigned int multiplexed;       /* multiplexing enable               */
00072   unsigned int overflow;          /* overflow enable                   */
00073   unsigned int inherit;           /* inherit enable                    */
00074   int cpu;                        /* which cpu to measure              */
00075   pid_t tid;                      /* thread we are monitoring          */
00076   pe_event_info_t events[PERF_EVENT_MAX_MPX_COUNTERS];
00077   long long counts[PERF_EVENT_MAX_MPX_COUNTERS];
00078 } pe_control_t;
00079 
00080 typedef struct
00081 {
00082   int initialized;                /* are we initialized?           */
00083   int state;                      /* are we opened and/or running? */
00084 } pe_context_t;
00085 
00086 /* These sentinels tell papi_pe_set_overflow() how to set the */
00087 /* wakeup_events field in the event descriptor record.        */
00088 
00089 #define WAKEUP_COUNTER_OVERFLOW 0
00090 #define WAKEUP_PROFILING -1
00091 
00092 #define WAKEUP_MODE_COUNTER_OVERFLOW 0
00093 #define WAKEUP_MODE_PROFILING 1
00094 
00095 /* Defines for ctx->state */
00096 #define PERF_EVENTS_OPENED  0x01
00097 #define PERF_EVENTS_RUNNING 0x02
00098 
00099 /* Static globals */
00100 static int nmi_watchdog_active;
00101 
00102 /* Forward declaration */
00103 papi_vector_t _papi_pe_vector;
00104 
00105 
00106 /******** Kernel Version Dependent Routines  **********************/
00107 
00108 /* KERNEL_CHECKS_SCHEDUABILITY_UPON_OPEN is a work-around for kernel arch
00109  * implementations (e.g. x86) which don't do a static event schedulability
00110  * check in sys_perf_event_open.  
00111  * This was fixed for x86 in the 2.6.33 kernel
00112  *
00113  * Also! Kernels newer than 2.6.34 will fail in a similar way
00114  *       if the nmi_watchdog has stolen a performance counter
00115  *       and we try to use the maximum number of counters.
00116  *       A sys_perf_event_open() will seem to succeed but will fail
00117  *       at read time.  So re-use this work-around code.
00118  */
00119 static int 
00120 bug_check_scheduability(void) {
00121 
00122 #if defined(__powerpc__)
00123   /* PowerPC not affected by this bug */
00124 #elif defined(__mips__)
00125   /* MIPS as of kernel 3.1 does not properly detect schedulability */
00126   return 1;
00127 #else
00128   if (_papi_os_info.os_version < LINUX_VERSION(2,6,33)) return 1;
00129 #endif
00130 
00131   if (nmi_watchdog_active) return 1;
00132 
00133   return 0;
00134 }
00135 
00136 /* PERF_FORMAT_GROUP allows reading an entire group's counts at once   */
00137 /* before 2.6.34 PERF_FORMAT_GROUP did not work when reading results   */
00138 /*  from attached processes.  We are lazy and disable it for all cases */
00139 /*  commit was:  050735b08ca8a016bbace4445fa025b88fee770b              */
00140 
00141 static int 
00142 bug_format_group(void) {
00143 
00144   if (_papi_os_info.os_version < LINUX_VERSION(2,6,34)) return 1;
00145 
00146   /* MIPS, as of version 3.1, does not support this properly */
00147 
00148 #if defined(__mips__)
00149   return 1;
00150 #endif
00151 
00152   return 0;
00153 
00154 }
00155 
00156 
00157 /* There's a bug prior to Linux 2.6.33 where if you are using */
00158 /* PERF_FORMAT_GROUP, the TOTAL_TIME_ENABLED and              */
00159 /* TOTAL_TIME_RUNNING fields will be zero unless you disable  */
00160 /* the counters first                                         */
00161 static int 
00162 bug_sync_read(void) {
00163 
00164   if (_papi_os_info.os_version < LINUX_VERSION(2,6,33)) return 1;
00165 
00166   return 0;
00167 
00168 }
00169 
00170 
00171 /* Set the F_SETOWN_EX flag on the fd.                          */
00172 /* This affects which thread an overflow signal gets sent to    */
00173 /* Handled in a subroutine to handle the fact that the behavior */
00174 /* is dependent on kernel version.                              */
00175 static int 
00176 fcntl_setown_fd(int fd) {
00177 
00178    int ret;
00179    struct f_owner_ex fown_ex;
00180 
00181       /* F_SETOWN_EX is not available until 2.6.32 */
00182    if (_papi_os_info.os_version < LINUX_VERSION(2,6,32)) {
00183        
00184       /* get ownership of the descriptor */
00185       ret = fcntl( fd, F_SETOWN, mygettid(  ) );
00186       if ( ret == -1 ) {
00187      PAPIERROR( "cannot fcntl(F_SETOWN) on %d: %s", fd, strerror(errno) );
00188      return PAPI_ESYS;
00189       }
00190    }
00191    else {
00192       /* set ownership of the descriptor */   
00193       fown_ex.type = F_OWNER_TID;
00194       fown_ex.pid  = mygettid();
00195       ret = fcntl(fd, F_SETOWN_EX, (unsigned long)&fown_ex );
00196    
00197       if ( ret == -1 ) {
00198      PAPIERROR( "cannot fcntl(F_SETOWN_EX) on %d: %s", 
00199             fd, strerror( errno ) );
00200      return PAPI_ESYS;
00201       }
00202    }
00203    return PAPI_OK;
00204 }
00205 
00206 /* Check for processor support */
00207 /* Can be used for generic checking, though in general we only     */
00208 /* check for pentium4 here because support was broken for multiple */
00209 /* kernel releases and the usual standard detections did not       */
00210 /* handle this.  So we check for pentium 4 explicitly.             */
00211 static int 
00212 processor_supported(int vendor, int family) {
00213 
00214    /* Error out if kernel too early to support p4 */
00215    if (( vendor == PAPI_VENDOR_INTEL ) && (family == 15)) {   
00216       if (_papi_os_info.os_version < LINUX_VERSION(2,6,35)) {
00217      PAPIERROR("Pentium 4 not supported on kernels before 2.6.35");
00218      return PAPI_ENOSUPP;
00219       }
00220    }
00221    return PAPI_OK;
00222 }
00223 
00224 
00225 /* The read format on perf_event varies based on various flags that */
00226 /* are passed into it.  This helper avoids copying this logic       */
00227 /* multiple places.                                                 */
00228 static unsigned int
00229 get_read_format( unsigned int multiplex, 
00230          unsigned int inherit, 
00231          int format_group )
00232 {
00233    unsigned int format = 0;
00234 
00235    /* if we need read format options for multiplexing, add them now */
00236    if (multiplex) {
00237       format |= PERF_FORMAT_TOTAL_TIME_ENABLED;
00238       format |= PERF_FORMAT_TOTAL_TIME_RUNNING;
00239    }
00240 
00241    /* if our kernel supports it and we are not using inherit, */
00242    /* add the group read options                              */
00243    if ( (!bug_format_group()) && !inherit) {
00244       if (format_group) {
00245      format |= PERF_FORMAT_GROUP;
00246       }
00247    }
00248 
00249    SUBDBG("multiplex: %d, inherit: %d, group_leader: %d, format: 0x%x\n",
00250       multiplex, inherit, format_group, format);
00251 
00252    return format;
00253 }
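
/* Added annotation, not part of the original file: a rough sketch of
 * what read() returns for the format bits chosen above, following the
 * layout documented in the kernel's include/linux/perf_event.h.
 *
 *   TOTAL_TIME_ENABLED | TOTAL_TIME_RUNNING, single event (the
 *   multiplexed case):
 *       u64 value;  u64 time_enabled;  u64 time_running;
 *
 *   PERF_FORMAT_GROUP, read on the group leader:
 *       u64 nr;  { u64 value; } cnt[nr];
 *
 * With both sets of bits the two time fields sit between nr and the
 * per-counter values.
 */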
00254 
00255 
00256 /********* End Kernel-version Dependent Routines  ****************/
00257 
00258 
00259 /*  Check if the current set of options is supported by */
00260 /*  perf_events.                                         */
00261 /*  We do this by temporarily opening an event with the  */
00262 /*  desired options then closing it again.  We use the   */
00263 /*  PERF_COUNT_HW_INSTRUCTION event as a dummy event     */
00264 /*  on the assumption it is available on all             */
00265 /*  platforms.                                           */
00266 
00267 static int
00268 check_permissions( unsigned long tid, unsigned int cpu_num, 
00269            unsigned int domain, unsigned int multiplex, 
00270            unsigned int inherit )
00271 {
00272    int ev_fd;
00273    struct perf_event_attr attr;
00274 
00275    /* clearing this sets the type to hardware and counts all domains */
00276    memset(&attr, '\0', sizeof(attr));
00277    attr.read_format = get_read_format(multiplex, inherit, 1);
00278 
00279    /* set the event id (config field) to instructions */
00280    /* (an event that should always exist)            */
00281    /* This was cycles but that is missing on Niagara */
00282    attr.config = PERF_COUNT_HW_INSTRUCTIONS;
00283     
00284    /* now set up domains this event set will be counting */
00285    if (!(domain & PAPI_DOM_SUPERVISOR)) {
00286       attr.exclude_hv = 1;
00287    }
00288    if (!(domain & PAPI_DOM_USER)) {
00289       attr.exclude_user = 1;
00290    }
00291    if (!(domain & PAPI_DOM_KERNEL)) {
00292       attr.exclude_kernel = 1;
00293    }
00294 
00295    SUBDBG("Calling sys_perf_event_open() from check_permissions\n");
00296 
00297    ev_fd = sys_perf_event_open( &attr, tid, cpu_num, -1, 0 );
00298    if ( ev_fd == -1 ) {
00299       SUBDBG("sys_perf_event_open returned error.  Linux says, %s", 
00300          strerror( errno ) );
00301       return PAPI_EPERM;
00302    }
00303     
00304    /* now close it, this was just to make sure we have permissions */
00305    /* to set these options                                         */
00306    close(ev_fd);
00307    return PAPI_OK;
00308 }
00309 
00310 
00311 
00312 /* Maximum size we ever expect to read from a perf_event fd   */
00313 /*  (this is the number of 64-bit values)                     */
00314 /* We use this to size the read buffers                       */
00315 /* The 3 is for the event count, time_enabled, and time_running; */
00316 /*  the per-counter term is a count value and a count id for     */
00317 /*  each possible counter.                                       */
00318 #define READ_BUFFER_SIZE (3 + (2 * PERF_EVENT_MAX_MPX_COUNTERS))
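
/* Added annotation, not part of the original file: with
 * PERF_EVENT_MAX_MPX_COUNTERS = 64 this works out to
 * 3 + 2*64 = 131 64-bit slots (1048 bytes), enough for the largest
 * possible group read (nr, time_enabled, time_running, plus a value
 * and an id per counter).                                            */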
00319 
00320 
00321 
00322 /* KERNEL_CHECKS_SCHEDUABILITY_UPON_OPEN is a work-around for kernel arch */
00323 /* implementations (e.g. x86 before 2.6.33) which don't do a static event */
00324 /* schedulability check in sys_perf_event_open.  It is also needed if the */
00325 /* kernel is stealing an event, such as when NMI watchdog is enabled.     */
00326 
00327 static int
00328 check_scheduability( pe_context_t *ctx, pe_control_t *ctl, int idx )
00329 {
00330    int retval = 0, cnt = -1;
00331    ( void ) ctx;             /*unused */
00332    long long papi_pe_buffer[READ_BUFFER_SIZE];
00333    int i,group_leader_fd;
00334 
00335    if (bug_check_scheduability()) {
00336 
00337       /* If the kernel isn't tracking schedulability correctly  */
00338       /* Then we need to start/stop/read to force the event     */
00339       /* to be scheduled and see if an error condition happens. */
00340 
00341       /* get the proper fd to start */
00342       group_leader_fd=ctl->events[idx].group_leader_fd;
00343       if (group_leader_fd==-1) group_leader_fd=ctl->events[idx].event_fd;
00344 
00345       /* start the event */
00346       retval = ioctl( group_leader_fd, PERF_EVENT_IOC_ENABLE, NULL );
00347       if (retval == -1) {
00348      PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) failed.\n");
00349      return PAPI_ESYS;
00350       }
00351 
00352       /* stop the event */
00353       retval = ioctl(group_leader_fd, PERF_EVENT_IOC_DISABLE, NULL );
00354       if (retval == -1) {
00355      PAPIERROR( "ioctl(PERF_EVENT_IOC_DISABLE) failed.\n" );
00356      return PAPI_ESYS;
00357       }
00358 
00359       /* See if a read returns any results */
00360       cnt = read( group_leader_fd, papi_pe_buffer, sizeof(papi_pe_buffer));
00361       if ( cnt == -1 ) {
00362      SUBDBG( "read returned an error!  Should never happen.\n" );
00363      return PAPI_ESYS;
00364       }
00365 
00366       if ( cnt == 0 ) {
00367          /* We read 0 bytes if we could not schedule the event */
00368          /* The kernel should have detected this at open       */
00369          /* but various bugs (including NMI watchdog)          */
00370          /* result in this behavior                            */
00371 
00372      return PAPI_ECNFLCT;
00373 
00374      } else {
00375 
00376     /* Reset all of the counters (opened so far) back to zero      */
00377     /* from the above brief enable/disable call pair.              */
00378 
00379     /* We have to reset all events because reset of group leader      */
00380         /* does not reset all.                                            */
00381     /* we assume that the events are being added one by one and that  */
00382         /* we do not need to reset higher events (doing so may reset ones */
00383         /* that have not been initialized yet).                           */
00384 
00385     /* Note... PERF_EVENT_IOC_RESET does not reset time running       */
00386     /* info if multiplexing, so we should avoid coming here if        */
00387     /* we are multiplexing the event.                                 */
00388         for( i = 0; i < idx; i++) {
00389        retval=ioctl( ctl->events[i].event_fd, PERF_EVENT_IOC_RESET, NULL );
00390        if (retval == -1) {
00391           PAPIERROR( "ioctl(PERF_EVENT_IOC_RESET) #%d/%d %d "
00392              "(fd %d)failed.\n",
00393              i,ctl->num_events,idx,ctl->events[i].event_fd);
00394           return PAPI_ESYS;
00395        }
00396     }
00397       }
00398    }
00399    return PAPI_OK;
00400 }
00401 
00402 
00403 /* Do some extra work on a perf_event fd if we're doing sampling */
00404 /* This mostly means setting up the mmap buffer.                  */
00405 static int
00406 tune_up_fd( pe_control_t *ctl, int evt_idx )
00407 {
00408    int ret;
00409    void *buf_addr;
00410    int fd = ctl->events[evt_idx].event_fd;
00411 
00412    /* Register that we would like a SIGIO notification when a mmap'd page */
00413    /* becomes full.                                                       */
00414    ret = fcntl( fd, F_SETFL, O_ASYNC | O_NONBLOCK );
00415    if ( ret ) {
00416       PAPIERROR ( "fcntl(%d, F_SETFL, O_ASYNC | O_NONBLOCK) "
00417           "returned error: %s", fd, strerror( errno ) );
00418       return PAPI_ESYS;
00419    }
00420 
00421    /* Set the F_SETOWN_EX flag on the fd.                          */
00422    /* This affects which thread an overflow signal gets sent to.   */
00423    ret=fcntl_setown_fd(fd);
00424    if (ret!=PAPI_OK) return ret;
00425        
00426    /* Set FD_CLOEXEC.  Otherwise if we do an exec with an overflow */
00427    /* running, the overflow handler will continue into the exec()'d*/
00428    /* process and kill it because no signal handler is set up.     */
00429    ret=fcntl(fd, F_SETFD, FD_CLOEXEC);
00430    if (ret) {
00431       return PAPI_ESYS;
00432    }
00433 
00434    /* when you explicitly declare that you want a particular signal,   */
00435    /* even if you use the default signal, the kernel will send more    */
00436    /* information concerning the event to the signal handler.          */
00437    /*                                                                  */
00438    /* In particular, it will send the file descriptor from which the   */
00439    /* event is originating which can be quite useful when monitoring   */
00440    /* multiple tasks from a single thread.                             */
00441    ret = fcntl( fd, F_SETSIG, _papi_pe_vector.cmp_info.hardware_intr_sig );
00442    if ( ret == -1 ) {
00443       PAPIERROR( "cannot fcntl(F_SETSIG,%d) on %d: %s",
00444          _papi_pe_vector.cmp_info.hardware_intr_sig, fd,
00445          strerror( errno ) );
00446       return PAPI_ESYS;
00447    }
00448 
00449    /* mmap() the sample buffer */
00450    buf_addr = mmap( NULL, ctl->events[evt_idx].nr_mmap_pages * getpagesize(),
00451             PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0 );
00452    if ( buf_addr == MAP_FAILED ) {
00453       PAPIERROR( "mmap(NULL,%d,%d,%d,%d,0): %s",
00454          ctl->events[evt_idx].nr_mmap_pages * getpagesize(  ), 
00455          PROT_READ, MAP_SHARED, fd, strerror( errno ) );
00456       return ( PAPI_ESYS );
00457    }
00458 
00459    SUBDBG( "Sample buffer for fd %d is located at %p\n", fd, buf_addr );
00460 
00461    /* Set up the mmap buffer and its associated helpers */
00462    ctl->events[evt_idx].mmap_buf = (struct perf_counter_mmap_page *) buf_addr;
00463    ctl->events[evt_idx].tail = 0;
00464    ctl->events[evt_idx].mask = ( ctl->events[evt_idx].nr_mmap_pages - 1 ) * 
00465                                getpagesize() - 1;
00466 
00467    return PAPI_OK;
00468 }
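
/* Added annotation, not part of the original file: page 0 of the
 * mapping holds the perf_event_mmap_page header, so only
 * (nr_mmap_pages - 1) pages of sample data follow it.  Assuming the
 * data area is a power-of-two number of pages, the mask wraps offsets
 * within it; e.g. with nr_mmap_pages = 2 and 4096-byte pages,
 * mask = 1*4096 - 1 = 0x0fff and a record at offset off is read from
 * data[off & mask] (see mmap_read() below).
 */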
00469 
00470 
00471 /* Open all events in the control state */
00472 static int
00473 open_pe_events( pe_context_t *ctx, pe_control_t *ctl )
00474 {
00475 
00476    int i, ret = PAPI_OK;
00477 
00478    for( i = 0; i < ctl->num_events; i++ ) {
00479 
00480       ctl->events[i].event_opened=0;
00481 
00482       /* set up the attr structure.  We don't set up all fields here */
00483       /* as some have already been set up previously.                */
00484 
00485       /* group leader (event 0) is special                */
00486       /* If we're multiplexed, everyone is a group leader */
00487       if (( i == 0 ) || (ctl->multiplexed)) {
00488          ctl->events[i].attr.pinned = !ctl->multiplexed;
00489      ctl->events[i].attr.disabled = 1;
00490      ctl->events[i].group_leader_fd=-1;
00491          ctl->events[i].attr.read_format = get_read_format(ctl->multiplexed, 
00492                                ctl->inherit, 
00493                                !ctl->multiplexed );
00494       } else {
00495      ctl->events[i].attr.pinned=0;
00496      ctl->events[i].attr.disabled = 0;
00497      ctl->events[i].group_leader_fd=ctl->events[0].event_fd;
00498          ctl->events[i].attr.read_format = get_read_format(ctl->multiplexed, 
00499                                ctl->inherit, 
00500                                0 );
00501       }
00502 
00503 
00504       /* try to open */
00505       ctl->events[i].event_fd = sys_perf_event_open( &ctl->events[i].attr, 
00506                              ctl->tid,
00507                              ctl->cpu,
00508                    ctl->events[i].group_leader_fd,
00509                              0 /* flags */
00510                              );
00511       
00512       if ( ctl->events[i].event_fd == -1 ) {
00513      SUBDBG("sys_perf_event_open returned error on event #%d."
00514         "  Error: %s\n",
00515         i, strerror( errno ) );
00516      ret = PAPI_ECNFLCT;
00517      goto open_pe_cleanup;
00518       }
00519 
00520       SUBDBG ("sys_perf_event_open: tid: %ld, cpu_num: %d,"
00521               " group_leader/fd: %d, event_fd: %d,"
00522               " read_format: 0x%"PRIu64"\n",
00523           (long)ctl->tid, ctl->cpu, ctl->events[i].group_leader_fd, 
00524           ctl->events[i].event_fd, ctl->events[i].attr.read_format);
00525 
00526 
00527       /* in many situations the kernel will indicate we opened fine */
00528       /* yet things will fail later.  So we need to double check    */
00529       /* we actually can use the events we've set up.               */
00530 
00531       /* This is not necessary if we are multiplexing, and in fact */
00532       /* we cannot do this properly if multiplexed because         */
00533       /* PERF_EVENT_IOC_RESET does not reset the time running info */
00534       if (!ctl->multiplexed) {
00535      ret = check_scheduability( ctx, ctl, i );
00536 
00537          if ( ret != PAPI_OK ) {
00538         /* the last event did open, so we need to bump the counter */
00539         /* before doing the cleanup                                */
00540         i++;
00541                                   
00542             goto open_pe_cleanup;
00543      }
00544       }
00545       ctl->events[i].event_opened=1;
00546    }
00547 
00548    /* Now that we've successfully opened all of the events, do whatever  */
00549    /* "tune-up" is needed to attach the mmap'd buffers, signal handlers, */
00550    /* and so on.                                                         */
00551    for ( i = 0; i < ctl->num_events; i++ ) {
00552 
00553       /* If sampling is enabled, hook up signal handler */
00554       if ( ctl->events[i].attr.sample_period ) {
00555      ret = tune_up_fd( ctl, i );
00556      if ( ret != PAPI_OK ) {
00557         /* All of the fds are open, so we need to clean up all of them */
00558         i = ctl->num_events;
00559         goto open_pe_cleanup;
00560      }
00561       } else {
00562      /* Make sure this is NULL so close_pe_events works right */
00563      ctl->events[i].mmap_buf = NULL;
00564       }
00565    }
00566 
00567    /* Mark the eventset as opened only if completely successful */
00568    ctx->state |= PERF_EVENTS_OPENED;
00569         
00570    return PAPI_OK;
00571 
00572 open_pe_cleanup:
00573    /* We encountered an error, close up the fds we successfully opened.  */
00574    /* We go backward in an attempt to close group leaders last, although */
00575 /* that's probably not strictly necessary.                            */
00576    while ( i > 0 ) {
00577       i--;
00578       if (ctl->events[i].event_fd>=0) {
00579      close( ctl->events[i].event_fd );
00580      ctl->events[i].event_opened=0;
00581       }
00582    }
00583 
00584    return ret;
00585 }
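
/* Added sketch, not part of the original file: the fd topology the
 * loop above builds, assuming a three-event set.
 *
 *   non-multiplexed: events[0]  pinned, disabled, group_leader_fd = -1
 *                    events[1]  group_leader_fd = events[0].event_fd
 *                    events[2]  group_leader_fd = events[0].event_fd
 *
 *   multiplexed:     every event is its own group leader
 *                    (group_leader_fd = -1) and is scheduled onto the
 *                    hardware independently by the kernel.
 */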
00586 
00587 /* Close all of the opened events */
00588 static int
00589 close_pe_events( pe_context_t *ctx, pe_control_t *ctl )
00590 {
00591    int i;
00592    int num_closed=0;
00593 
00594    /* should this be a more serious error? */
00595    if ( ctx->state & PERF_EVENTS_RUNNING ) {
00596       SUBDBG("Closing without stopping first\n");
00597    }
00598 
00599    /* Close child events first */
00600    for( i=0; i<ctl->num_events; i++ ) {
00601 
00602       if (ctl->events[i].event_opened) {
00603 
00604          if (ctl->events[i].group_leader_fd!=-1) {
00605             if ( ctl->events[i].mmap_buf ) {
00606            if ( munmap ( ctl->events[i].mmap_buf,
00607                      ctl->events[i].nr_mmap_pages * getpagesize() ) ) {
00608               PAPIERROR( "munmap of fd = %d returned error: %s",
00609                  ctl->events[i].event_fd, strerror( errno ) );
00610               return PAPI_ESYS;
00611            }
00612         }
00613 
00614             if ( close( ctl->events[i].event_fd ) ) {
00615            PAPIERROR( "close of fd = %d returned error: %s",
00616                ctl->events[i].event_fd, strerror( errno ) );
00617            return PAPI_ESYS;
00618         } else {
00619            num_closed++;
00620         }
00621         ctl->events[i].event_opened=0;
00622      }
00623       }
00624    }
00625 
00626    /* Close the group leaders last */
00627    for( i=0; i<ctl->num_events; i++ ) {
00628 
00629       if (ctl->events[i].event_opened) {
00630 
00631          if (ctl->events[i].group_leader_fd==-1) {
00632             if ( ctl->events[i].mmap_buf ) {
00633            if ( munmap ( ctl->events[i].mmap_buf,
00634                      ctl->events[i].nr_mmap_pages * getpagesize() ) ) {
00635               PAPIERROR( "munmap of fd = %d returned error: %s",
00636                  ctl->events[i].event_fd, strerror( errno ) );
00637               return PAPI_ESYS;
00638            }
00639         }
00640 
00641 
00642             if ( close( ctl->events[i].event_fd ) ) {
00643            PAPIERROR( "close of fd = %d returned error: %s",
00644                ctl->events[i].event_fd, strerror( errno ) );
00645            return PAPI_ESYS;
00646         } else {
00647            num_closed++;
00648         }
00649         ctl->events[i].event_opened=0;
00650      }
00651       }
00652    }
00653 
00654 
00655    if (ctl->num_events!=num_closed) {
00656       PAPIERROR("Didn't close all events\n");
00657       return PAPI_EBUG;
00658    }
00659 
00660    ctl->num_events=0;
00661 
00662    ctx->state &= ~PERF_EVENTS_OPENED;
00663 
00664    return PAPI_OK;
00665 }
00666 
00667 /* Fix up the config based on what CPU/Vendor we are running on */
00668 static int 
00669 pe_vendor_fixups(void) 
00670 {
00671      /* powerpc */
00672      /* On IBM POWER6 machines the default domain should include supervisor */
00673   if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_IBM ) {
00674      _papi_pe_vector.cmp_info.available_domains |=
00675           PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR;
00676      if (strcmp(_papi_hwi_system_info.hw_info.model_string, "POWER6" ) == 0 ) {
00677         _papi_pe_vector.cmp_info.default_domain =
00678           PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR;
00679      }
00680   }
00681 
00682   if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_MIPS ) {
00683      _papi_pe_vector.cmp_info.available_domains |= PAPI_DOM_KERNEL;
00684   }
00685 
00686   if ((_papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_INTEL) ||
00687       (_papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_AMD)) {
00688      _papi_pe_vector.cmp_info.fast_real_timer = 1;
00689   }
00690 
00691      /* ARM */
00692   if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_ARM) {
00693      /* FIXME: this will change with Cortex A15 */
00694      _papi_pe_vector.cmp_info.available_domains |=
00695         PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR;
00696      _papi_pe_vector.cmp_info.default_domain =
00697         PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR;
00698   }
00699 
00700      /* CRAY */
00701   if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_CRAY ) {
00702     _papi_pe_vector.cmp_info.available_domains |= PAPI_DOM_OTHER;
00703   }
00704 
00705   return PAPI_OK;
00706 }
00707 
00708 
00709 /* Check the mmap page for rdpmc support */
00710 static int detect_rdpmc(void) {
00711 
00712    struct perf_event_attr pe;
00713    int fd,rdpmc_exists=1;
00714    void *addr;
00715    struct perf_event_mmap_page *our_mmap;
00716 
00717    /* Create a fake instructions event so we can read a mmap page */
00718    memset(&pe,0,sizeof(struct perf_event_attr));
00719 
00720    pe.type=PERF_TYPE_HARDWARE;
00721    pe.size=sizeof(struct perf_event_attr);
00722    pe.config=PERF_COUNT_HW_INSTRUCTIONS;
00723 
00724    fd=sys_perf_event_open(&pe,0,-1,-1,0);
00725    if (fd<0) {
00726       return PAPI_ESYS;
00727    }
00728 
00729    /* create the mmap page */
00730    addr=mmap(NULL, 4096, PROT_READ, MAP_SHARED,fd,0);
00731    if (addr == (void *)(-1)) {
00732       close(fd);
00733       return PAPI_ESYS;
00734    }
00735 
00736    /* get the rdpmc info */
00737    our_mmap=(struct perf_event_mmap_page *)addr;
00738    if (our_mmap->cap_usr_rdpmc==0) {
00739       rdpmc_exists=0;
00740    }
00741 
00742    /* close the fake event */
00743    munmap(addr,4096);
00744    close(fd);
00745 
00746    return rdpmc_exists;
00747 
00748 } 
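
/* Added annotation, not part of the original file: cap_usr_rdpmc set
 * in the mmap page means the kernel permits reading the counter
 * directly from user space.  A rough x86 sketch of such a read,
 * following the comments in the kernel's perf_event.h (pc is the
 * mapped perf_event_mmap_page; pmc_width sign-extension omitted):
 *
 *     do {
 *         seq = pc->lock;  barrier();
 *         idx = pc->index;
 *         count = pc->offset;
 *         if (idx)
 *             count += rdpmc(idx - 1);
 *         barrier();
 *     } while (pc->lock != seq);
 *
 * This component does not use that path because, per the note in
 * _papi_pe_init_component(), plain read() tested faster as of 3.5.
 */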
00749 
00750 /* Find a native event specified by a profile index */
00751 static int
00752 find_profile_index( EventSetInfo_t *ESI, int evt_idx, int *flags,
00753             unsigned int *native_index, int *profile_index )
00754 {
00755    int pos, esi_index, count;
00756 
00757    for ( count = 0; count < ESI->profile.event_counter; count++ ) {
00758        esi_index = ESI->profile.EventIndex[count];
00759        pos = ESI->EventInfoArray[esi_index].pos[0];
00760         
00761        if ( pos == evt_idx ) {
00762       *profile_index = count;
00763       *native_index = ESI->NativeInfoArray[pos].ni_event & 
00764                       PAPI_NATIVE_AND_MASK;
00765       *flags = ESI->profile.flags;
00766       SUBDBG( "Native event %d is at profile index %d, flags %d\n",
00767           *native_index, *profile_index, *flags );
00768       return PAPI_OK;
00769        }
00770    }
00771 
00772    PAPIERROR( "wrong count: %d vs. ESI->profile.event_counter %d", count,
00773           ESI->profile.event_counter );
00774    return PAPI_EBUG;
00775 }
00776 
00777 
00778 /* These functions are based on builtin-record.c in the  */
00779 /* kernel's tools/perf directory.                        */
00780 
00781 static uint64_t
00782 mmap_read_head( pe_event_info_t *pe )
00783 {
00784    struct perf_event_mmap_page *pc = pe->mmap_buf;
00785    int head;
00786 
00787    if ( pc == NULL ) {
00788       PAPIERROR( "perf_event_mmap_page is NULL" );
00789       return 0;
00790    }
00791 
00792    head = pc->data_head;
00793    rmb(  );
00794 
00795    return head;
00796 }
00797 
00798 static void
00799 mmap_write_tail( pe_event_info_t *pe, uint64_t tail )
00800 {
00801    struct perf_event_mmap_page *pc = pe->mmap_buf;
00802 
00803    /* ensure all reads are done before we write the tail out. */
00804    mb(  );
00805    pc->data_tail = tail;
00806 }
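
/* Added annotation, not part of the original file: these two helpers
 * are the user-space half of the mmap ring-buffer protocol.  The
 * kernel advances data_head as it writes records; the consumer reads
 * records up to data_head (the rmb() keeps the record loads from
 * being reordered before the head load) and then publishes data_tail
 * so the kernel knows the space may be reused.  mmap_read() below is
 * the consumer loop.
 */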
00807 
00808 /* Does the kernel define these somewhere? */
00809 struct ip_event {
00810    struct perf_event_header header;
00811    uint64_t ip;
00812 };
00813 struct lost_event {
00814    struct perf_event_header header;
00815    uint64_t id;
00816    uint64_t lost;
00817 };
00818 typedef union event_union {
00819    struct perf_event_header header;
00820    struct ip_event ip;
00821    struct lost_event lost;
00822 } perf_sample_event_t;
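
/* Added annotation, not part of the original file: these mirror the
 * variable-length records the kernel writes into the mmap buffer.
 * Every record begins with perf_event_header { u32 type; u16 misc;
 * u16 size; }.  Assuming the component requests only PERF_SAMPLE_IP
 * (which is all these structs can describe), a PERF_RECORD_SAMPLE is
 * the header plus one u64 instruction pointer (ip_event), and a
 * PERF_RECORD_LOST is the header plus the event id and the number of
 * records that were dropped (lost_event).
 */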
00823 
00824 
00825 /* Should re-write with comments if we ever figure out what's */
00826 /* going on here.                                             */
00827 static void
00828 mmap_read( ThreadInfo_t **thr, pe_event_info_t *pe, 
00829        int profile_index )
00830 {
00831    int cidx = _papi_pe_vector.cmp_info.CmpIdx;
00832    uint64_t head = mmap_read_head( pe );
00833    uint64_t old = pe->tail;
00834    unsigned char *data = ((unsigned char*)pe->mmap_buf) + getpagesize(  );
00835    int diff;
00836 
00837    diff = head - old;
00838    if ( diff < 0 ) {
00839       SUBDBG( "WARNING: failed to keep up with mmap data. head = %" PRIu64
00840           ",  tail = %" PRIu64 ". Discarding samples.\n", head, old );
00841       /* head points to a known good entry, start there. */
00842       old = head;
00843    }
00844 
00845    for( ; old != head; ) {
00846 
00847       perf_sample_event_t *event = ( perf_sample_event_t * ) 
00848                                & data[old & pe->mask];
00849       perf_sample_event_t event_copy;
00850       size_t size = event->header.size;
00851 
00852       /* Event straddles the mmap boundary -- header should always */
00853       /* be inside due to u64 alignment of output.                 */
00854       if ( ( old & pe->mask ) + size != ( ( old + size ) & pe->mask ) ) {
00855      uint64_t offset = old;
00856      uint64_t len = min( sizeof ( *event ), size ), cpy;
00857      void *dst = &event_copy;
00858 
00859      do {
00860         cpy = min( pe->mask + 1 - ( offset & pe->mask ), len );
00861         memcpy( dst, &data[offset & pe->mask], cpy );
00862         offset += cpy;
00863         dst = ((unsigned char*)dst) + cpy;
00864         len -= cpy;
00865      } while ( len );
00866 
00867      event = &event_copy;
00868       }
00869 
00870       old += size;
00871 
00872       SUBDBG( "event->type = %08x\n", event->header.type );
00873       SUBDBG( "event->size = %d\n", event->header.size );
00874 
00875       switch ( event->header.type ) {
00876          case PERF_RECORD_SAMPLE:
00877           _papi_hwi_dispatch_profile( ( *thr )->running_eventset[cidx],
00878                 ( caddr_t ) ( unsigned long ) event->ip.ip, 
00879                       0, profile_index );
00880           break;
00881 
00882      case PERF_RECORD_LOST:
00883           SUBDBG( "Warning: because of a mmap buffer overrun, %" PRId64
00884               " events were lost.\n"
00885               "Loss was recorded when counter id 0x%"PRIx64 
00886               " overflowed.\n", event->lost.lost, event->lost.id );
00887           break;
00888 
00889      default:
00890           SUBDBG( "Error: unexpected header type - %d\n",
00891                     event->header.type );
00892           break;
00893       }
00894    }
00895 
00896    pe->tail = old;
00897    mmap_write_tail( pe, old );
00898 }
00899 
00900 /* What exactly does this do? */
00901 static int
00902 process_smpl_buf( int evt_idx, ThreadInfo_t **thr )
00903 {
00904    int ret, flags, profile_index;
00905    unsigned native_index;
00906    int cidx = _papi_pe_vector.cmp_info.CmpIdx;
00907    pe_control_t *ctl;
00908 
00909    ret = find_profile_index( ( *thr )->running_eventset[cidx], evt_idx, 
00910                  &flags, &native_index, &profile_index );
00911    if ( ret != PAPI_OK ) {
00912       return ret;
00913    }
00914 
00915    ctl= (*thr)->running_eventset[cidx]->ctl_state;
00916 
00917    mmap_read( thr, 
00918           &(ctl->events[evt_idx]),
00919           profile_index );
00920 
00921    return PAPI_OK;
00922 }
00923 
00924 
00925 
00926 
00927 /********************************************************************/
00928 /********************************************************************/
00929 /* Start with functions that are exported via the module interface  */
00930 /********************************************************************/
00931 /********************************************************************/
00932 
00933 
00934 /* set the domain. FIXME: perf_events allows per-event control of this. */
00935 /* we do not handle that yet.                                           */
00936 int
00937 _papi_pe_set_domain( hwd_control_state_t *ctl, int domain)
00938 {
00939     
00940    int i;
00941    pe_control_t *pe_ctl = ( pe_control_t *) ctl;
00942 
00943    SUBDBG("old control domain %d, new domain %d, default domain %d\n",
00944       pe_ctl->domain,domain,_papi_pe_vector.cmp_info.default_domain);
00945 
00946    pe_ctl->domain = domain;
00947      
00948    /* Force the domain on all events */
00949    for( i = 0; i < pe_ctl->num_events; i++ ) {
00950       pe_ctl->events[i].attr.exclude_user = 
00951                     !( pe_ctl->domain & PAPI_DOM_USER );
00952       pe_ctl->events[i].attr.exclude_kernel =
00953             !( pe_ctl->domain & PAPI_DOM_KERNEL );
00954       pe_ctl->events[i].attr.exclude_hv =
00955             !( pe_ctl->domain & PAPI_DOM_SUPERVISOR );
00956    }
00957    return PAPI_OK;
00958 }
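
/* Added example, not part of the original file: with
 * domain = PAPI_DOM_USER the loop above leaves, for every event,
 *     exclude_user   = 0
 *     exclude_kernel = 1
 *     exclude_hv     = 1
 * so only user-space execution is counted.
 */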
00959 
00960 
00961 /* Initialize the perf_event component */
00962 static int
00963 _papi_pe_init_component( int cidx )
00964 {
00965 
00966    int retval;
00967    int paranoid_level;
00968 
00969    FILE *fff;
00970 
00971    ( void ) cidx;          /*unused */
00972 
00973    /* This is the official way to detect if perf_event support exists */
00974    /* The file is called perf_counter_paranoid on 2.6.31             */
00975    /* currently we are lazy and do not support 2.6.31 kernels        */
00976    fff=fopen("/proc/sys/kernel/perf_event_paranoid","r");
00977    if (fff==NULL) {
00978       strncpy(_papi_pe_vector.cmp_info.disabled_reason,
00979           "perf_event support not detected",PAPI_MAX_STR_LEN);
00980       return PAPI_ENOCMP;
00981    }
00982 
00983    /* 2 means no measurements allowed          */
00984    /* 1 means normal counter access            */
00985    /* 0 means you can access CPU-specific data */
00986    /* -1 means no restrictions                 */
00987    retval=fscanf(fff,"%d",&paranoid_level);
00988    if (retval!=1) fprintf(stderr,"Error reading paranoid level\n");
00989    fclose(fff);
00990 
00991    if (paranoid_level==2) {
00992       strncpy(_papi_pe_vector.cmp_info.disabled_reason,
00993           "/proc/sys/kernel/perf_event_paranoid prohibits using counters",
00994           PAPI_MAX_STR_LEN);
00995       return PAPI_ENOCMP;
00996    }
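
   /* Added note, not part of the original file: the paranoid level can
    * be inspected and (as root) relaxed from a shell, e.g.
    *     cat /proc/sys/kernel/perf_event_paranoid
    *     echo 1 > /proc/sys/kernel/perf_event_paranoid
    * so failing here with level 2 is a system configuration issue
    * rather than a PAPI bug.
    */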
00997 
00998    /* Detect NMI watchdog which can steal counters */
00999    nmi_watchdog_active=_linux_detect_nmi_watchdog();
01000    if (nmi_watchdog_active) {
01001       SUBDBG("The Linux nmi_watchdog is using one of the performance "
01002              "counters, reducing the total number available.\n");
01003    }
01004 
01005    /* Kernel multiplexing is broken prior to kernel 2.6.34 */
01006    /* The fix was probably git commit:                     */
01007    /*     45e16a6834b6af098702e5ea6c9a40de42ff77d8         */
01008    if (_papi_os_info.os_version < LINUX_VERSION(2,6,34)) {
01009       _papi_pe_vector.cmp_info.kernel_multiplex = 0;
01010    }
01011    else {
01012       _papi_pe_vector.cmp_info.kernel_multiplex = 1;
01013    }
01014 
01015    /* We use the RealTime signal for some reason */
01016    _papi_pe_vector.cmp_info.hardware_intr_sig = SIGRTMIN + 2;
01017 
01018    /* Check that processor is supported */
01019    if (processor_supported(_papi_hwi_system_info.hw_info.vendor,
01020                _papi_hwi_system_info.hw_info.cpuid_family)!=
01021       PAPI_OK) {
01022       fprintf(stderr,"warning, your processor is unsupported\n");
01023       /* should not return error, as software events should still work */
01024    }
01025 
01026    /* Setup mmtimers, if appropriate */
01027    retval=mmtimer_setup();
01028    if (retval) {
01029       strncpy(_papi_pe_vector.cmp_info.disabled_reason,
01030           "Error initializing mmtimer",PAPI_MAX_STR_LEN);
01031       return retval;
01032    }
01033 
01034    /* Detect if we can use rdpmc (or equivalent) */
01035    /* We currently do not use rdpmc as it is slower in tests */
01036    /* than regular read (as of Linux 3.5)                    */
01037    retval=detect_rdpmc();
01038    if (retval < 0 ) {
01039       strncpy(_papi_pe_vector.cmp_info.disabled_reason,
01040          "Error detecting rdpmc",PAPI_MAX_STR_LEN);
01041       return retval;
01042    }
01043    _papi_pe_vector.cmp_info.fast_counter_read = retval;
01044 
01045    /* Run Vendor-specific fixups */
01046    pe_vendor_fixups();
01047 
01048    /* Run the libpfm4-specific setup */
01049    retval = _papi_libpfm4_init(&_papi_pe_vector, cidx);
01050    if (retval) {
01051       strncpy(_papi_pe_vector.cmp_info.disabled_reason,
01052           "Error initializing libpfm4",PAPI_MAX_STR_LEN);
01053       return retval;
01054    }
01055 
01056    return PAPI_OK;
01057 
01058 }
01059 
01060 /* Shutdown the perf_event component */
01061 static int
01062 _papi_pe_shutdown_component( void ) {
01063 
01064   /* Shutdown libpfm4 */
01065   _papi_libpfm4_shutdown();
01066 
01067   return PAPI_OK;
01068 }
01069 
01070 
01071 /* Initialize a thread */
01072 static int
01073 _papi_pe_init_thread( hwd_context_t *hwd_ctx )
01074 {
01075 
01076   pe_context_t *pe_ctx = ( pe_context_t *) hwd_ctx;
01077 
01078   /* clear the context structure and mark as initialized */
01079   memset( pe_ctx, 0, sizeof ( pe_context_t ) );
01080   pe_ctx->initialized=1;
01081 
01082   return PAPI_OK;
01083 }
01084 
01085 /* Shutdown a thread */
01086 static int
01087 _papi_pe_shutdown_thread( hwd_context_t *ctx )
01088 {
01089     pe_context_t *pe_ctx = ( pe_context_t *) ctx;
01090 
01091     pe_ctx->initialized=0;
01092 
01093     return PAPI_OK;
01094 }
01095 
01096 
01097 /* reset the hardware counters */
01098 /* Note: PAPI_reset() does not necessarily call this */
01099 /* unless the events are actually running.           */
01100 static int
01101 _papi_pe_reset( hwd_context_t *ctx, hwd_control_state_t *ctl )
01102 {
01103    int i, ret;
01104    pe_control_t *pe_ctl = ( pe_control_t *) ctl;
01105 
01106    ( void ) ctx;             /*unused */
01107 
01108    /* We need to reset all of the events, not just the group leaders */
01109    for( i = 0; i < pe_ctl->num_events; i++ ) {
01110       ret = ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_RESET, NULL );
01111       if ( ret == -1 ) {
01112      PAPIERROR("ioctl(%d, PERF_EVENT_IOC_RESET, NULL) "
01113            "returned error, Linux says: %s",
01114            pe_ctl->events[i].event_fd, strerror( errno ) );
01115      return PAPI_ESYS;
01116       }
01117    }
01118 
01119    return PAPI_OK;
01120 }
01121 
01122 
01123 /* write (set) the hardware counters */
01124 /* Currently we do not support this. */
01125 static int
01126 _papi_pe_write( hwd_context_t *ctx, hwd_control_state_t *ctl,
01127         long long *from )
01128 {
01129    ( void ) ctx;             /*unused */
01130    ( void ) ctl;             /*unused */
01131    ( void ) from;            /*unused */
01132    /*
01133     * Counters cannot be written.  Do we need to virtualize the
01134     * counters so that they can be written, or perhaps modify code so that
01135     * they can be written? FIXME ?
01136     */
01137     
01138     return PAPI_ENOSUPP;
01139 }
01140 
01141 /*
01142  * perf_event provides a complicated read interface.
01143  *  the info returned by read() varies depending on whether
01144  *  you have PERF_FORMAT_GROUP, PERF_FORMAT_TOTAL_TIME_ENABLED,
01145  *  PERF_FORMAT_TOTAL_TIME_RUNNING, or PERF_FORMAT_ID set
01146  *
01147  * To simplify things we just always ask for everything.  This might
01148  * lead to overhead when reading more than we need, but it makes the
01149  * read code a lot simpler than the original implementation we had here.
01150  *
01151  * For more info on the layout see include/linux/perf_event.h
01152  *
01153  */
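
/* Added sketch, not part of the original file: the three read()
 * layouts handled below, as 64-bit words of papi_pe_buffer[].
 *
 *   multiplexed (per event, TIME_ENABLED|TIME_RUNNING):
 *       [0] value    [1] time_enabled    [2] time_running
 *
 *   no FORMAT_GROUP / inherit (per event):
 *       [0] value
 *
 *   FORMAT_GROUP (one read on the group leader):
 *       [0] nr    [1..nr] one value per event
 */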
01154 
01155 static int
01156 _papi_pe_read( hwd_context_t *ctx, hwd_control_state_t *ctl,
01157            long long **events, int flags )
01158 {
01159    ( void ) flags;           /*unused */
01160    int i, ret = -1;
01161    pe_context_t *pe_ctx = ( pe_context_t *) ctx;
01162    pe_control_t *pe_ctl = ( pe_control_t *) ctl;
01163    long long papi_pe_buffer[READ_BUFFER_SIZE];
01164    long long tot_time_running, tot_time_enabled, scale;
01165 
01166    /* On kernels before 2.6.33 the TOTAL_TIME_ENABLED and TOTAL_TIME_RUNNING */
01167    /* fields are always 0 unless the counter is disabled.  So if we are on   */
01168    /* one of these kernels, then we must disable events before reading.      */
01169 
01170    /* Elsewhere though we disable multiplexing on kernels before 2.6.34 */
01171    /* so maybe this isn't even necessary.                               */
01172 
01173    if (bug_sync_read()) {
01174       if ( pe_ctx->state & PERF_EVENTS_RUNNING ) {
01175          for ( i = 0; i < pe_ctl->num_events; i++ ) {
01176         /* disable only the group leaders */
01177         if ( pe_ctl->events[i].group_leader_fd == -1 ) {
01178            ret = ioctl( pe_ctl->events[i].event_fd, 
01179                PERF_EVENT_IOC_DISABLE, NULL );
01180            if ( ret == -1 ) {
01181               PAPIERROR("ioctl(PERF_EVENT_IOC_DISABLE) "
01182                "returned an error: %s", strerror( errno ));
01183               return PAPI_ESYS;
01184            }
01185         }
01186      }
01187       }
01188    }
01189 
01190 
01191    /* Handle case where we are multiplexing */
01192    if (pe_ctl->multiplexed) {
01193 
01194       /* currently we handle multiplexing by having individual events */
01195       /* so we read from each in turn.                                */
01196 
01197       for ( i = 0; i < pe_ctl->num_events; i++ ) {
01198              
01199          ret = read( pe_ctl->events[i].event_fd, papi_pe_buffer, 
01200             sizeof ( papi_pe_buffer ) );
01201          if ( ret == -1 ) {
01202         PAPIERROR("read returned an error: %s", strerror( errno ));
01203         return PAPI_ESYS;
01204      }
01205 
01206      /* We should read 3 64-bit values from the counter */
01207      if (ret<(signed)(3*sizeof(long long))) {
01208         PAPIERROR("Error!  short read!\n");  
01209         return PAPI_ESYS;
01210      }        
01211 
01212          SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", 
01213             pe_ctl->events[i].event_fd, 
01214         (long)pe_ctl->tid, pe_ctl->cpu, ret);
01215          SUBDBG("read: %lld %lld %lld\n",papi_pe_buffer[0],
01216             papi_pe_buffer[1],papi_pe_buffer[2]);
01217 
01218          tot_time_enabled = papi_pe_buffer[1];     
01219          tot_time_running = papi_pe_buffer[2];
01220 
01221          SUBDBG("count[%d] = (papi_pe_buffer[%d] %lld * "
01222         "tot_time_enabled %lld) / tot_time_running %lld\n",
01223         i, 0,papi_pe_buffer[0],
01224         tot_time_enabled,tot_time_running);
01225     
01226          if (tot_time_running == tot_time_enabled) {
01227         /* No scaling needed */
01228         pe_ctl->counts[i] = papi_pe_buffer[0];
01229          } else if (tot_time_running && tot_time_enabled) {
01230         /* Scale factor of 100 to avoid overflows when computing */
01231         /*enabled/running */
01232 
01233         scale = (tot_time_enabled * 100LL) / tot_time_running;
01234         scale = scale * papi_pe_buffer[0];
01235         scale = scale / 100LL;
01236         pe_ctl->counts[i] = scale;
01237      } else {
01238        /* This should not happen, but Phil reports it sometimes does. */
01239         SUBDBG("perf_event kernel bug(?) count, enabled, "
01240            "running: %lld, %lld, %lld\n",
01241            papi_pe_buffer[0],tot_time_enabled,
01242            tot_time_running);
01243 
01244         pe_ctl->counts[i] = papi_pe_buffer[0];
01245      }
01246       }
01247    }
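
   /* Added worked example, not part of the original file: if an event
    * was enabled for 1,000,000 ns but only scheduled for 250,000 ns
    * and counted 2,000 raw events, scale = 1,000,000*100/250,000 = 400
    * and the reported count is 2,000*400/100 = 8,000.
    */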
01248 
01249    /* Handle cases where we cannot use FORMAT GROUP */
01250    else if (bug_format_group() || pe_ctl->inherit) {
01251 
01252       /* we must read each counter individually */
01253       for ( i = 0; i < pe_ctl->num_events; i++ ) {
01254 
01255          ret = read( pe_ctl->events[i].event_fd, papi_pe_buffer, 
01256             sizeof ( papi_pe_buffer ) );
01257          if ( ret == -1 ) {
01258         PAPIERROR("read returned an error: %s", strerror( errno ));
01259         return PAPI_ESYS;
01260      }
01261 
01262      /* we should read one 64-bit value from each counter */
01263      if (ret!=sizeof(long long)) {
01264         PAPIERROR("Error!  short read!\n");
01265         PAPIERROR("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n",
01266            pe_ctl->events[i].event_fd,
01267            (long)pe_ctl->tid, pe_ctl->cpu, ret);
01268         return PAPI_ESYS;
01269      }     
01270 
01271          SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", 
01272             pe_ctl->events[i].event_fd, (long)pe_ctl->tid, 
01273         pe_ctl->cpu, ret);
01274          SUBDBG("read: %lld\n",papi_pe_buffer[0]);
01275      
01276      pe_ctl->counts[i] = papi_pe_buffer[0];
01277       }
01278    }
01279 
01280    
01281    /* Handle cases where we are using FORMAT_GROUP   */
01282    /* We assume only one group leader, in position 0 */
01283 
01284    else {
01285       if (pe_ctl->events[0].group_leader_fd!=-1) {
01286      PAPIERROR("Was expecting group leader!\n");
01287       }
01288 
01289       ret = read( pe_ctl->events[0].event_fd, papi_pe_buffer, 
01290           sizeof ( papi_pe_buffer ) );
01291 
01292       if ( ret == -1 ) {
01293      PAPIERROR("read returned an error: %s", strerror( errno ));
01294      return PAPI_ESYS;
01295       }
01296 
01297       /* we read 1 64-bit value (number of events) then     */
01298       /* num_events more 64-bit values that hold the counts */
01299       if (ret<(signed)((1+pe_ctl->num_events)*sizeof(long long))) {
01300      PAPIERROR("Error! short read!\n");
01301      return PAPI_ESYS;
01302       }
01303 
01304       SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n", 
01305          pe_ctl->events[0].event_fd, 
01306          (long)pe_ctl->tid, pe_ctl->cpu, ret);
01307       { 
01308      int j;
01309      for(j=0;j<ret/8;j++) {
01310             SUBDBG("read %d: %lld\n",j,papi_pe_buffer[j]);
01311      }
01312       }
01313 
01314       /* Make sure the kernel agrees with how many events we have */
01315       if (papi_pe_buffer[0]!=pe_ctl->num_events) {
01316      PAPIERROR("Error!  Wrong number of events!\n");
01317      return PAPI_ESYS;
01318       }
01319 
01320       /* put the count values in their proper location */
01321       for(i=0;i<papi_pe_buffer[0];i++) {
01322          pe_ctl->counts[i] = papi_pe_buffer[1+i];
01323       }
01324    }
01325 
01326 
01327    /* If we disabled the counters due to the sync_read_bug(), */
01328    /* then we need to re-enable them now.                     */
01329    if (bug_sync_read()) {
01330       if ( pe_ctx->state & PERF_EVENTS_RUNNING ) {
01331          for ( i = 0; i < pe_ctl->num_events; i++ ) {
01332         if ( pe_ctl->events[i].group_leader_fd == -1 ) {
01333            /* this should refresh any overflow counters too */
01334            ret = ioctl( pe_ctl->events[i].event_fd, 
01335                 PERF_EVENT_IOC_ENABLE, NULL );
01336            if ( ret == -1 ) {
01337               /* Should never happen */
01338               PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) returned an error: %s",
01339                 strerror( errno ));
01340               return PAPI_ESYS;
01341            }
01342         }
01343      }
01344       }
01345    }
01346 
01347    /* point PAPI to the values we read */
01348    *events = pe_ctl->counts;
01349 
01350    return PAPI_OK;
01351 }
01352 
01353 /* Start counting events */
01354 static int
01355 _papi_pe_start( hwd_context_t *ctx, hwd_control_state_t *ctl )
01356 {
01357    int ret;
01358    int i;
01359    int did_something = 0;
01360    pe_context_t *pe_ctx = ( pe_context_t *) ctx;
01361    pe_control_t *pe_ctl = ( pe_control_t *) ctl;
01362 
01363    /* Reset the counters first.  Is this necessary? */
01364    ret = _papi_pe_reset( pe_ctx, pe_ctl );
01365    if ( ret ) {
01366       return ret;
01367    }
01368 
01369    /* Enable all of the group leaders                */
01370    /* All group leaders have a group_leader_fd of -1 */
01371    for( i = 0; i < pe_ctl->num_events; i++ ) {
01372       if (pe_ctl->events[i].group_leader_fd == -1) {
01373      SUBDBG("ioctl(enable): fd: %d\n", pe_ctl->events[i].event_fd);
01374      ret=ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_ENABLE, NULL) ; 
01375 
01376      /* ioctls always return -1 on failure */
01377          if (ret == -1) {
01378             PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) failed.\n");
01379             return PAPI_ESYS;
01380      }
01381 
01382      did_something++;
01383       } 
01384    }
01385 
01386    if (!did_something) {
01387       PAPIERROR("Did not enable any counters.\n");
01388       return PAPI_EBUG;
01389    }
01390 
01391    pe_ctx->state |= PERF_EVENTS_RUNNING;
01392 
01393    return PAPI_OK;
01394 
01395 }
01396 
01397 /* Stop all of the counters */
01398 static int
01399 _papi_pe_stop( hwd_context_t *ctx, hwd_control_state_t *ctl )
01400 {
01401     
01402    int ret;
01403    int i;
01404    pe_context_t *pe_ctx = ( pe_context_t *) ctx;
01405    pe_control_t *pe_ctl = ( pe_control_t *) ctl;
01406 
01407    /* Just disable the group leaders */
01408    for ( i = 0; i < pe_ctl->num_events; i++ ) {
01409       if ( pe_ctl->events[i].group_leader_fd == -1 ) {
01410      ret=ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_DISABLE, NULL);
01411      if ( ret == -1 ) {
01412         PAPIERROR( "ioctl(%d, PERF_EVENT_IOC_DISABLE, NULL) "
01413                "returned error, Linux says: %s",
01414                pe_ctl->events[i].event_fd, strerror( errno ) );
01415         return PAPI_EBUG;
01416      }
01417       }
01418    }
01419 
01420    pe_ctx->state &= ~PERF_EVENTS_RUNNING;
01421 
01422    return PAPI_OK;
01423 }
01424 
01425 /* Initialize a new control state */
01426 static int
01427 _papi_pe_init_control_state( hwd_control_state_t *ctl )
01428 {
01429    pe_control_t *pe_ctl = ( pe_control_t *) ctl;
01430     
01431    /* clear the contents */
01432    memset( pe_ctl, 0, sizeof ( pe_control_t ) );
01433    _papi_pe_set_domain( ctl, _papi_pe_vector.cmp_info.default_domain );
01434 
01435    /* Set cpu number in the control block to show events */
01436    /* are not tied to specific cpu                       */
01437    pe_ctl->cpu = -1;
01438    return PAPI_OK;
01439 }
01440 
01441 
01442 /* This function clears the current contents of the control structure and
01443    updates it with whatever resources are allocated for all the native events
01444    in the native info structure array. */
01445 
01446 static int
01447 _papi_pe_update_control_state( hwd_control_state_t *ctl, 
01448                    NativeInfo_t *native,
01449                    int count, hwd_context_t *ctx )
01450 {
01451    int i = 0, ret;
01452    pe_context_t *pe_ctx = ( pe_context_t *) ctx;
01453    pe_control_t *pe_ctl = ( pe_control_t *) ctl;
01454 
01455    /* close all of the existing fds and start over again */
01456    /* In theory we could have finer-grained control and know if             */
01457    /* things were changed, but it's easier to tear things down and rebuild. */
01458    close_pe_events( pe_ctx, pe_ctl );
01459 
01460    /* Calling with count==0 should be OK, it's how things are deallocated */
01461    /* when an eventset is destroyed.                                      */
01462    if ( count == 0 ) {
01463       SUBDBG( "Called with count == 0\n" );
01464       return PAPI_OK;
01465    }
01466 
01467    /* set up all the events */
01468    for( i = 0; i < count; i++ ) {
01469       if ( native ) {
01470      /* Have libpfm4 set the config values for the event */
01471      ret=_papi_libpfm4_setup_counters(&pe_ctl->events[i].attr,
01472                      native[i].ni_event);
01473      SUBDBG( "pe_ctl->events[%d].attr.config=%"PRIx64"\n",i,
01474          pe_ctl->events[i].attr.config);
01475      if (ret!=PAPI_OK) return ret;
01476 
01477       } else {
01478       /* I'm not sure how we'd end up in this case */
01479           /* should it be an error?                    */
01480       }
01481 
01482       /* Copy the inherit flag into the attribute block that will be   */
01483       /* passed to the kernel */
01484       pe_ctl->events[i].attr.inherit = pe_ctl->inherit;
01485 
01486       /* Set the position in the native structure */
01487       /* We just set up events linearly           */
01488       if ( native ) {
01489      native[i].ni_position = i;
01490       }
01491    }
01492 
01493    pe_ctl->num_events = count;
01494    _papi_pe_set_domain( ctl, pe_ctl->domain );
01495 
01496    /* actually open the events */
01497    /* (why is this a separate function?) */
01498    ret = open_pe_events( pe_ctx, pe_ctl );
01499    if ( ret != PAPI_OK ) {
01500       SUBDBG("open_pe_events failed\n");
01501       /* Restore values ? */
01502       return ret;
01503    }
01504 
01505    return PAPI_OK;
01506 }
01507 
01508 /* Set various options on a control state */
01509 static int
01510 _papi_pe_ctl( hwd_context_t *ctx, int code, _papi_int_option_t *option )
01511 {
01512    int ret;
01513    pe_context_t *pe_ctx = ( pe_context_t *) ctx;
01514    pe_control_t *pe_ctl = NULL;
01515 
01516    switch ( code ) {
01517       case PAPI_MULTIPLEX:
01518        pe_ctl = ( pe_control_t * ) ( option->multiplex.ESI->ctl_state );
01519        if (check_permissions( pe_ctl->tid, pe_ctl->cpu, pe_ctl->domain, 
01520                   1, pe_ctl->inherit ) != PAPI_OK) {
01521           return PAPI_EPERM;
01522        }
01523 
01524        /* looks like we are allowed, so set multiplexed attribute */
01525        pe_ctl->multiplexed = 1;
01526        ret = _papi_pe_update_control_state( pe_ctl, NULL, 
01527                         pe_ctl->num_events, pe_ctx );
01528        if (ret != PAPI_OK) {
01529           pe_ctl->multiplexed = 0;
01530        }
01531        return ret;
01532     
01533       case PAPI_ATTACH:
01534        pe_ctl = ( pe_control_t * ) ( option->attach.ESI->ctl_state );
01535        if (check_permissions( option->attach.tid, pe_ctl->cpu, 
01536                   pe_ctl->domain, pe_ctl->multiplexed, 
01537                   pe_ctl->inherit ) != PAPI_OK) {
01538           return PAPI_EPERM;
01539        }
01540 
01541        pe_ctl->tid = option->attach.tid;
01542 
01543        /* If events have already been added, something may */
01544        /* have been done to the kernel, so update */
01545        ret = _papi_pe_update_control_state( pe_ctl, NULL, 
01546                         pe_ctl->num_events, pe_ctx);
01547        
01548        return ret;
01549 
01550       case PAPI_DETACH:
01551        pe_ctl = ( pe_control_t *) ( option->attach.ESI->ctl_state );
01552 
01553        pe_ctl->tid = 0;
01554        return PAPI_OK;
01555 
01556       case PAPI_CPU_ATTACH:
01557        pe_ctl = ( pe_control_t *) ( option->cpu.ESI->ctl_state );
01558        if (check_permissions( pe_ctl->tid, option->cpu.cpu_num, 
01559                   pe_ctl->domain, pe_ctl->multiplexed, 
01560                   pe_ctl->inherit ) != PAPI_OK) {
01561            return PAPI_EPERM;
01562        }
01563        /* looks like we are allowed so set cpu number */
01564 
01565        /* this tells the kernel not to count for a thread   */
01566        /* should we warn if we try to set both?  perf_event */
01567        /* will reject it.                                   */
01568        pe_ctl->tid = -1;      
01569 
01570        pe_ctl->cpu = option->cpu.cpu_num;
01571 
01572        return PAPI_OK;
01573 
01574       case PAPI_DOMAIN:
01575        pe_ctl = ( pe_control_t *) ( option->domain.ESI->ctl_state );
01576        if (check_permissions( pe_ctl->tid, pe_ctl->cpu, 
01577                   option->domain.domain, pe_ctl->multiplexed,
01578                   pe_ctl->inherit ) != PAPI_OK) {
01579           return PAPI_EPERM;
01580        }
01581        /* looks like we are allowed, so set counting domain */
01582        return _papi_pe_set_domain( pe_ctl, option->domain.domain );
01583 
01584       case PAPI_GRANUL:
01585        pe_ctl = (pe_control_t *) ( option->granularity.ESI->ctl_state );
01586 
01587        /* FIXME: we really don't support this yet */
01588 
01589            switch ( option->granularity.granularity  ) {
01590               case PAPI_GRN_PROCG:
01591               case PAPI_GRN_SYS:
01592               case PAPI_GRN_SYS_CPU:
01593               case PAPI_GRN_PROC:
01594            return PAPI_ECMP;
01595      
01596           /* Currently we only support thread granularity */
01597               case PAPI_GRN_THR:
01598            break;
01599 
01600               default:
01601            return PAPI_EINVAL;
01602        }
01603            return PAPI_OK;
01604 
01605       case PAPI_INHERIT:
01606        pe_ctl = (pe_control_t *) ( option->inherit.ESI->ctl_state );
01607        if (check_permissions( pe_ctl->tid, pe_ctl->cpu, pe_ctl->domain, 
01608                   pe_ctl->multiplexed, 
01609                   option->inherit.inherit ) != PAPI_OK) {
01610           return PAPI_EPERM;
01611        }
01612        /* looks like we are allowed, so set the requested inheritance */
01613        if (option->inherit.inherit) {
01614           /* children will inherit counters */
01615           pe_ctl->inherit = 1;
01616        } else {
01617           /* children won't inherit counters */
01618           pe_ctl->inherit = 0;
01619        }
01620        return PAPI_OK;
01621 
01622       case PAPI_DATA_ADDRESS:
01623        return PAPI_ENOSUPP;
01624 #if 0
01625        pe_ctl = (pe_control_t *) (option->address_range.ESI->ctl_state);
01626        ret = set_default_domain( pe_ctl, option->address_range.domain );
01627        if ( ret != PAPI_OK ) {
01628           return ret;
01629        }
01630        set_drange( pe_ctx, pe_ctl, option );
01631        return PAPI_OK;
01632 #endif
01633       case PAPI_INSTR_ADDRESS:
01634        return PAPI_ENOSUPP;
01635 #if 0
01636        pe_ctl = (pe_control_t *) (option->address_range.ESI->ctl_state);
01637        ret = set_default_domain( pe_ctl, option->address_range.domain );
01638        if ( ret != PAPI_OK ) {
01639           return ret;
01640        }
01641        set_irange( pe_ctx, pe_ctl, option );
01642        return PAPI_OK;
01643 #endif
01644 
01645       case PAPI_DEF_ITIMER:
01646        /* What should we be checking for here?                   */
01647        /* This seems like it should be OS-specific not component */
01648        /* specific.                                              */
01649 
01650        return PAPI_OK;
01651     
01652       case PAPI_DEF_MPX_NS:
01653        /* Defining a given ns per set is not currently supported */
01654        return PAPI_ENOSUPP;
01655     
01656       case PAPI_DEF_ITIMER_NS:
01657        /* We don't support this... */
01658        return PAPI_OK;
01659     
01660       default:
01661        return PAPI_ENOSUPP;
01662    }
01663 }
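
/* Hedged usage sketch (not part of this file): how a PAPI user reaches the
 * PAPI_CPU_ATTACH and PAPI_DOMAIN cases handled above.  EventSet is assumed
 * to have been created with PAPI_create_eventset(); error checks omitted.   */
#if 0
static void
example_set_options( int EventSet )
{
   PAPI_option_t opt;

   /* pin the eventset to cpu 0; the component then sets tid = -1 */
   memset( &opt, 0, sizeof ( opt ) );
   opt.cpu.eventset = EventSet;
   opt.cpu.cpu_num = 0;
   PAPI_set_opt( PAPI_CPU_ATTACH, &opt );

   /* count in both user and kernel mode */
   memset( &opt, 0, sizeof ( opt ) );
   opt.domain.eventset = EventSet;
   opt.domain.domain = PAPI_DOM_USER | PAPI_DOM_KERNEL;
   PAPI_set_opt( PAPI_DOMAIN, &opt );
}
#endif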
01664 
01665 
01666 /*
01667  * This function is used when hardware overflows are working or when
01668  * software overflows are forced
01669  */
01670 
01671 static void
01672 _papi_pe_dispatch_timer( int n, hwd_siginfo_t *info, void *uc )
01673 {
01674    ( void ) n;               /*unused */
01675    _papi_hwi_context_t hw_context;
01676    int found_evt_idx = -1, fd = info->si_fd;
01677    caddr_t address;
01678    ThreadInfo_t *thread = _papi_hwi_lookup_thread( 0 );
01679    int cidx = _papi_pe_vector.cmp_info.CmpIdx;
01680    int i;
01681    pe_control_t *ctl;
01682 
01683    if ( thread == NULL ) {
01684       PAPIERROR( "thread == NULL in _papi_pe_dispatch_timer for fd %d!", fd );
01685       return;
01686    }
01687 
01688    if ( thread->running_eventset[cidx] == NULL ) {
01689       PAPIERROR( "thread->running_eventset == NULL in "
01690          "_papi_pe_dispatch_timer for fd %d!",fd );
01691       return;
01692    }
01693 
01694    if ( thread->running_eventset[cidx]->overflow.flags == 0 ) {
01695       PAPIERROR( "thread->running_eventset->overflow.flags == 0 in "
01696          "_papi_pe_dispatch_timer for fd %d!", fd );
01697       return;
01698    }
01699     
01700    hw_context.si = info;
01701    hw_context.ucontext = ( hwd_ucontext_t * ) uc;
01702 
01703    if ( thread->running_eventset[cidx]->overflow.flags & 
01704     PAPI_OVERFLOW_FORCE_SW ) {
01705       address = GET_OVERFLOW_ADDRESS( hw_context );
01706       _papi_hwi_dispatch_overflow_signal( ( void * ) &hw_context, 
01707                       address, NULL, 0,
01708                       0, &thread, cidx );
01709       return;
01710    }
01711 
01712    if ( thread->running_eventset[cidx]->overflow.flags !=
01713          PAPI_OVERFLOW_HARDWARE ) {
01714       PAPIERROR( "thread->running_eventset->overflow.flags is set to "
01715          "something other than PAPI_OVERFLOW_HARDWARE or "
01716          "PAPI_OVERFLOW_FORCE_SW for fd %d (%x)",
01717          fd , thread->running_eventset[cidx]->overflow.flags);
01718    }
01719 
01720    /* convoluted way to get ctl */
01721    ctl= thread->running_eventset[cidx]->ctl_state;
01722 
01723    /* See if the fd is one that's part of this thread's context */
01724    for( i=0; i < ctl->num_events; i++ ) {
01725       if ( fd == ctl->events[i].event_fd ) {
01726      found_evt_idx = i;
01727      break;
01728       }
01729    }
01730 
01731    if ( found_evt_idx == -1 ) {
01732       PAPIERROR( "Unable to find fd %d among the open event fds in "
01733          "_papi_pe_dispatch_timer!", fd );
01734       return;
01735    }
01736     
01737    ioctl( fd, PERF_EVENT_IOC_DISABLE, NULL );
01738 
01739    if ( ( thread->running_eventset[cidx]->state & PAPI_PROFILING ) && 
01740     !( thread->running_eventset[cidx]->profile.flags & 
01741        PAPI_PROFIL_FORCE_SW ) ) {
01742       process_smpl_buf( found_evt_idx, &thread );
01743    }
01744    else {
01745       uint64_t ip;
01746       unsigned int head;
01747       pe_event_info_t *pe = &(ctl->events[found_evt_idx]);
01748       unsigned char *data = ((unsigned char*)pe->mmap_buf) + getpagesize(  );
01749 
01750       /*
01751        * Read up the most recent IP from the sample in the mmap buffer.  To
01752        * do this, we make the assumption that all of the records in the
01753        * mmap buffer are the same size, and that they all contain the IP as
01754        * their only record element.  This means that we can use the
01755        * data_head element from the user page and move backward one record
01756        * from that point and read the data.  Since we don't actually need
01757        * to access the header of the record, we can just subtract 8 (size
01758        * of the IP) from data_head and read up that word from the mmap
01759        * buffer.  After we subtract 8, we account for mmap buffer wrapping
01760        * by AND'ing this offset with the buffer mask.
01761        */
01762       head = mmap_read_head( pe );
01763 
01764       if ( head == 0 ) {
01765      PAPIERROR( "Attempting to access memory which may be inaccessible" );
01766      return;
01767       }
01768 
01769       ip = *( uint64_t * ) ( data + ( ( head - 8 ) & pe->mask ) );
01770       /*
01771        * Update the tail to the current head pointer. 
01772        *
01773        * Note that if we were to read the record at the tail pointer,
01774        * rather than the one at the head (as you might otherwise think
01775        * would be natural), we could run into problems.  Signals don't
01776        * stack well on Linux, particularly if not using RT signals, and if
01777        * they come in rapidly enough, we can lose some.  Over time, the head
01778        * could catch up to the tail and monitoring would be stopped, and
01779        * since no more signals are coming in, this problem will never be
01780        * resolved, resulting in a complete loss of overflow notification
01781        * from that point on.  So the solution we use here will result in
01782        * only the most recent IP value being read every time there are two
01783        * or more samples in the buffer (for that one overflow signal).  But
01784        * the handler will always bring up the tail, so the head should
01785        * never run into the tail.
01786        */
01787       mmap_write_tail( pe, head );
01788 
01789       /*
01790        * The fourth parameter is supposed to be a vector of bits indicating
01791        * the overflowed hardware counters, but it's not really clear that
01792        * it's useful, because the actual hardware counters used are not
01793        * exposed to the PAPI user.  For now, I'm just going to set the bit
01794        * that indicates which event register in the array overflowed.  The
01795        * result is that the overflow vector will not be identical to the
01796        * perfmon implementation, and part of that is due to the fact that
01797        * which hardware register is actually being used is opaque at the
01798        * user level (the kernel event dispatcher hides that info).
01799        */
01800 
01801       _papi_hwi_dispatch_overflow_signal( ( void * ) &hw_context,
01802                       ( caddr_t ) ( unsigned long ) ip,
01803                       NULL, ( 1 << found_evt_idx ), 0,
01804                       &thread, cidx );
01805 
01806    }
01807 
01808    /* Restart the counters */
01809    if (ioctl( fd, PERF_EVENT_IOC_REFRESH, 1 ) == -1) {
01810       PAPIERROR( "overflow refresh failed" );
01811    }
01812 }
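
/* mmap_read_head() and mmap_write_tail(), used in the handler above, are
 * defined earlier in this file and are not shown in this excerpt.  The block
 * below is only a hedged sketch of what such helpers typically do under the
 * perf_event mmap ABI: it assumes mmap_buf points at the perf_event user
 * page (struct perf_event_mmap_page from the header pulled in by PEINCLUDE)
 * and that rmb() is the read barrier from mb.h.                             */
#if 0
static uint64_t
sketch_mmap_read_head( pe_event_info_t *pe )
{
   struct perf_event_mmap_page *pc = pe->mmap_buf;
   uint64_t head = pc->data_head;

   rmb(  );              /* pairs with the kernel's write to data_head */
   return head;
}

static void
sketch_mmap_write_tail( pe_event_info_t *pe, uint64_t tail )
{
   struct perf_event_mmap_page *pc = pe->mmap_buf;

   /* advancing data_tail tells the kernel how far we have consumed, so it
      does not overwrite unread samples in a writable mapping               */
   pc->data_tail = tail;
}
#endif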
01813 
01814 /* Stop profiling */
01815 static int
01816 _papi_pe_stop_profiling( ThreadInfo_t *thread, EventSetInfo_t *ESI )
01817 {
01818    int i, ret = PAPI_OK;
01819    pe_control_t *ctl;
01820 
01821    ctl=ESI->ctl_state;
01822 
01823    /* Loop through all of the events and process those which have mmap */
01824    /* buffers attached.                                                */
01825    for ( i = 0; i < ctl->num_events; i++ ) {
01826       /* Use the mmap_buf field as an indicator of this fd being used for */
01827       /* profiling.                                                       */
01828       if ( ctl->events[i].mmap_buf ) {
01829      /* Process any remaining samples in the sample buffer */
01830      ret = process_smpl_buf( i, &thread );
01831      if ( ret ) {
01832         PAPIERROR( "process_smpl_buf returned error %d", ret );
01833         return ret;
01834      }
01835       }
01836    }
01837    return ret;
01838 }
01839 
01840 
01841 /* Setup an event to cause overflow */
01842 static int
01843 _papi_pe_set_overflow( EventSetInfo_t *ESI, int EventIndex, int threshold )
01844 {
01845    int cidx = _papi_pe_vector.cmp_info.CmpIdx;
01846    pe_context_t *ctx = ( pe_context_t *) ( ESI->master->context[cidx] );
01847    pe_control_t *ctl = (pe_control_t *) ( ESI->ctl_state );
01848    int i, evt_idx, found_non_zero_sample_period = 0, retval = PAPI_OK;
01849 
01850    evt_idx = ESI->EventInfoArray[EventIndex].pos[0];
01851 
01852    SUBDBG("Attempting to set overflow for index %d (%d) of EventSet %d\n",
01853       evt_idx,EventIndex,ESI->EventSetIndex);
01854 
01855    if (evt_idx<0) {
01856       return PAPI_EINVAL;
01857    }
01858 
01859    if ( threshold == 0 ) {
01860       /* If this counter isn't set to overflow, it's an error */
01861       if ( ctl->events[evt_idx].attr.sample_period == 0 ) return PAPI_EINVAL;
01862    }
01863 
01864    ctl->events[evt_idx].attr.sample_period = threshold;
01865 
01866    /*
01867     * Note that the wakeup_mode field will initially be set to zero
01868     * (WAKEUP_MODE_COUNTER_OVERFLOW) because the ctl struct, including
01869     * all of its events, is zeroed with memset() at init time.
01870     *
01871     * Is it even set to any other value elsewhere?
01872     */
01873    switch ( ctl->events[evt_idx].wakeup_mode ) {
01874     case WAKEUP_MODE_PROFILING:
01875          /* Setting wakeup_events to special value zero means issue a */
01876          /* wakeup (signal) on every mmap page overflow.              */
01877          ctl->events[evt_idx].attr.wakeup_events = 0;
01878          break;
01879 
01880     case WAKEUP_MODE_COUNTER_OVERFLOW:
01881          /* Can this code ever be called? */
01882 
01883          /* Setting wakeup_events to one means issue a wakeup on every */
01884              /* counter overflow (not mmap page overflow).                 */
01885          ctl->events[evt_idx].attr.wakeup_events = 1;
01886          /* We need the IP to pass to the overflow handler */
01887          ctl->events[evt_idx].attr.sample_type = PERF_SAMPLE_IP;
01888          /* one for the user page, and two to take IP samples */
01889          ctl->events[evt_idx].nr_mmap_pages = 1 + 2;
01890          break;
01891     default:
01892          PAPIERROR( "ctl->events[%d].wakeup_mode set to an unknown value - %u",
01893              evt_idx, ctl->events[evt_idx].wakeup_mode);
01894          return PAPI_EBUG;
01895    }
01896 
01897    /* Check for non-zero sample period */
01898    for ( i = 0; i < ctl->num_events; i++ ) {
01899       if ( ctl->events[i].attr.sample_period ) {
01900      found_non_zero_sample_period = 1;
01901      break;
01902       }
01903    }
01904 
01905    if ( found_non_zero_sample_period ) {
01906       /* turn on internal overflow flag for this event set */
01907       ctl->overflow = 1;
01908         
01909       /* Enable the signal handler */
01910       retval = _papi_hwi_start_signal( 
01911                   _papi_pe_vector.cmp_info.hardware_intr_sig, 
01912                   1, _papi_pe_vector.cmp_info.CmpIdx );
01913    } else {
01914       /* turn off internal overflow flag for this event set */
01915       ctl->overflow = 0;
01916         
01917       /* Remove the signal handler, if there are no remaining non-zero */
01918       /* sample_periods set                                            */
01919       retval = _papi_hwi_stop_signal( 
01920                  _papi_pe_vector.cmp_info.hardware_intr_sig );
01921       if ( retval != PAPI_OK ) return retval;
01922    }
01923     
01924    retval = _papi_pe_update_control_state( ctl, NULL,
01925                 ( (pe_control_t *) (ESI->ctl_state) )->num_events,
01926                        ctx );
01927 
01928    return retval;
01929 }
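
/* Hedged usage sketch (not part of this file): the user-level call that ends
 * up in _papi_pe_set_overflow().  The handler follows the
 * PAPI_overflow_handler_t prototype; the function names and the threshold of
 * 1000000 are illustrative assumptions only.                                */
#if 0
static void
example_overflow_handler( int EventSet, void *address,
                          long long overflow_vector, void *context )
{
   /* 'address' is the sampled IP pulled out of the mmap buffer above */
   ( void ) EventSet; ( void ) overflow_vector; ( void ) context;
   ( void ) address;
}

static void
example_arm_overflow( int EventSet )
{
   /* EventSet is assumed to already contain PAPI_TOT_CYC */
   PAPI_overflow( EventSet, PAPI_TOT_CYC, 1000000, 0,
                  example_overflow_handler );
}
#endif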
01930 
01931 /* Enable profiling */
01932 static int
01933 _papi_pe_set_profile( EventSetInfo_t *ESI, int EventIndex, int threshold )
01934 {
01935    int ret;
01936    int evt_idx;
01937    pe_control_t *ctl = ( pe_control_t *) ( ESI->ctl_state );
01938 
01939    /* Since you can't profile on a derived event, the event is always the */
01940    /* first and only event in the native event list.                      */
01941    evt_idx = ESI->EventInfoArray[EventIndex].pos[0];
01942 
01943    if ( threshold == 0 ) {
01944       SUBDBG( "MUNMAP(%p,%"PRIu64")\n", ctl->events[evt_idx].mmap_buf,
01945           ( uint64_t ) ctl->events[evt_idx].nr_mmap_pages *
01946           getpagesize(  ) );
01947 
01948       if ( ctl->events[evt_idx].mmap_buf ) {
01949      munmap( ctl->events[evt_idx].mmap_buf,
01950          ctl->events[evt_idx].nr_mmap_pages * getpagesize() );
01951       }
01952 
01953       ctl->events[evt_idx].mmap_buf = NULL;
01954       ctl->events[evt_idx].nr_mmap_pages = 0;
01955       ctl->events[evt_idx].attr.sample_type &= ~PERF_SAMPLE_IP;
01956       ret = _papi_pe_set_overflow( ESI, EventIndex, threshold );
01957       /* ??? #warning "This should be handled somewhere else" */
01958       ESI->state &= ~( PAPI_OVERFLOWING );
01959       ESI->overflow.flags &= ~( PAPI_OVERFLOW_HARDWARE );
01960 
01961       return ret;
01962    }
01963 
01964    /* Look up the native event code */
01965    if ( ESI->profile.flags & (PAPI_PROFIL_DATA_EAR | PAPI_PROFIL_INST_EAR)) {
01966       /* Not supported yet... */
01967 
01968       return PAPI_ENOSUPP;
01969    }
01970 
01971    if ( ESI->profile.flags & PAPI_PROFIL_RANDOM ) {
01972       /* This requires an ability to randomly alter the sample_period within */
01973       /* a given range.  Kernel does not have this ability. FIXME            */
01974       return PAPI_ENOSUPP;
01975    }
01976 
01977    /* Just a guess at how many pages would make this relatively efficient.  */
01978    /* Note that it's "1 +" because of the need for a control page, and the  */
01979    /* number following the "+" must be a power of 2 (1, 4, 8, 16, etc) or   */
01980    /* zero.  This is required to optimize dealing with circular buffer      */
01981    /* wrapping of the mapped pages.                                         */
01982 
01983    ctl->events[evt_idx].nr_mmap_pages = (1+8);
01984    ctl->events[evt_idx].attr.sample_type |= PERF_SAMPLE_IP;
01985 
01986    ret = _papi_pe_set_overflow( ESI, EventIndex, threshold );
01987    if ( ret != PAPI_OK ) return ret;
01988 
01989    return PAPI_OK;
01990 }
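
/* Hedged usage sketch (not part of this file): a PAPI_profil() call is what
 * drives _papi_pe_set_profile().  It assumes the standard PAPI_profil()
 * prototype; the buffer size, the scale factor, and the text_start argument
 * are illustrative assumptions.                                             */
#if 0
static unsigned short profbuf[64 * 1024];

static void
example_start_profiling( int EventSet, caddr_t text_start )
{
   memset( profbuf, 0, sizeof ( profbuf ) );
   PAPI_profil( profbuf, sizeof ( profbuf ), text_start, 65536,
                EventSet, PAPI_TOT_CYC, 1000000, PAPI_PROFIL_POSIX );
}
#endif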
01991 
01992 
01993 /* Our component vector */
01994 
01995 papi_vector_t _papi_pe_vector = {
01996    .cmp_info = {
01997           /* component information (unspecified values initialized to 0) */
01998       .name = "perf_events",
01999       .short_name = "pe",
02000       .version = "5.0",
02001       .description = "Linux perf_event CPU counters",
02002   
02003       .default_domain = PAPI_DOM_USER,
02004       .available_domains = PAPI_DOM_USER | PAPI_DOM_KERNEL,
02005       .default_granularity = PAPI_GRN_THR,
02006       .available_granularities = PAPI_GRN_THR,
02007 
02008       .hardware_intr = 1,
02009       .kernel_profile = 1,
02010       .num_mpx_cntrs = PERF_EVENT_MAX_MPX_COUNTERS,
02011 
02012       /* component specific cmp_info initializations */
02013       .fast_virtual_timer = 0,
02014       .attach = 1,
02015       .attach_must_ptrace = 1,
02016       .cpu = 1,
02017       .inherit = 1,
02018       .cntr_umasks = 1,
02019 
02020   },
02021 
02022   /* sizes of framework-opaque component-private structures */
02023   .size = {
02024       .context = sizeof ( pe_context_t ),
02025       .control_state = sizeof ( pe_control_t ),
02026       .reg_value = sizeof ( int ),
02027       .reg_alloc = sizeof ( int ),
02028   },
02029 
02030   /* function pointers in this component */
02031   .init_control_state =    _papi_pe_init_control_state,
02032   .start =                 _papi_pe_start,
02033   .stop =                  _papi_pe_stop,
02034   .read =                  _papi_pe_read,
02035   .shutdown_thread =       _papi_pe_shutdown_thread,
02036   .shutdown_component =    _papi_pe_shutdown_component,
02037   .ctl =                   _papi_pe_ctl,
02038   .update_control_state =  _papi_pe_update_control_state,
02039   .set_domain =            _papi_pe_set_domain,
02040   .reset =                 _papi_pe_reset,
02041   .set_overflow =          _papi_pe_set_overflow,
02042   .set_profile =           _papi_pe_set_profile,
02043   .stop_profiling =        _papi_pe_stop_profiling,
02044   .init_component =        _papi_pe_init_component,
02045   .dispatch_timer =        _papi_pe_dispatch_timer,
02046   .write =                 _papi_pe_write,
02047   .init_thread =           _papi_pe_init_thread,
02048 
02049   /* from counter name mapper */
02050   .ntv_enum_events =   _papi_libpfm4_ntv_enum_events,
02051   .ntv_name_to_code =  _papi_libpfm4_ntv_name_to_code,
02052   .ntv_code_to_name =  _papi_libpfm4_ntv_code_to_name,
02053   .ntv_code_to_descr = _papi_libpfm4_ntv_code_to_descr,
02054   .ntv_code_to_info =  _papi_libpfm4_ntv_code_to_info,
02055 };