|
PAPI
5.0.1.0
|
00001 /****************************/ 00002 /* THIS IS OPEN SOURCE CODE */ 00003 /****************************/ 00004 00018 #ifndef _PAPI_CUDA_H 00019 #define _PAPI_CUDA_H 00020 00021 /* Headers required by CuPTI */ 00022 #include "cupti_events.h" 00023 #include <cuda_runtime_api.h> 00024 00025 /* Specific errors from CUDA lib */ 00026 #define CHECK_CU_ERROR(err, cufunc) \ 00027 if (err != CUDA_SUCCESS) \ 00028 { \ 00029 printf ("Error %d for CUDA Driver API function '%s'. cuptiQuery failed\n", err, cufunc); \ 00030 return -1; \ 00031 } 00032 00033 /* Specific errors from CuPTI lib */ 00034 #define CHECK_CUPTI_ERROR(err, cuptifunc) \ 00035 if (err != CUPTI_SUCCESS) \ 00036 { \ 00037 printf ("Error %d for CUPTI API function '%s'. cuptiQuery failed\n", err, cuptifunc); \ 00038 return -1; \ 00039 } 00040 00041 00042 00043 /************************* DEFINES SECTION *********************************** 00044 *******************************************************************************/ 00045 00046 /* this number assumes that there will never be more events than indicated */ 00047 #define CUDA_MAX_COUNTERS 512 00048 00049 typedef struct EventData 00050 { 00051 CUpti_EventID eventId; // CuPTI event id 00052 char name[PAPI_MIN_STR_LEN]; // event name 00053 char desc[PAPI_2MAX_STR_LEN]; // short desc of the event 00054 } EventData_t; 00055 00056 00057 typedef struct DomainData 00058 { 00059 CUpti_EventDomainID domainId; // CuPTI domain id 00060 char name[PAPI_MIN_STR_LEN]; // domain name 00061 uint32_t eventCount; // number of events per domain 00062 EventData_t *event; 00063 } DomainData_t; 00064 00065 00066 typedef struct DeviceData 00067 { 00068 CUdevice dev; // CUDA device 00069 char name[PAPI_MIN_STR_LEN]; // device name 00070 uint32_t domainCount; // number of domains per device 00071 DomainData_t *domain; 00072 } DeviceData_t; 00073 00074 00075 typedef struct AddedEvents 00076 { 00077 int count; // number of events that have been added to the CuPTI eventGroup 00078 int *list; // list of the added events 00079 } AddedEvents_t; 00080 00081 00083 typedef struct CUDA_register 00084 { 00085 /* This is used by the framework.It likes it to be !=0 to do somehting */ 00086 unsigned int selector; 00087 /* This is the information needed to locate a CUDA event */ 00088 CUpti_EventID eventId; 00089 } CUDA_register_t; 00090 00091 00093 typedef struct CUDA_native_event_entry 00094 { 00095 CUDA_register_t resources; 00096 char name[PAPI_MAX_STR_LEN]; 00097 char description[PAPI_2MAX_STR_LEN]; 00098 } CUDA_native_event_entry_t; 00099 00100 00101 typedef struct CUDA_reg_alloc 00102 { 00103 CUDA_register_t ra_bits; 00104 } CUDA_reg_alloc_t; 00105 00106 00107 typedef struct CUDA_control_state 00108 { 00109 CUpti_EventGroup eventGroup; 00110 AddedEvents_t addedEvents; 00111 long long counts[CUDA_MAX_COUNTERS]; 00112 int ncounter; 00113 int old_count; 00114 } CUDA_control_state_t; 00115 00116 /* Holds per-thread information */ 00117 typedef struct CUDA_context 00118 { 00119 CUDA_control_state_t state; 00120 } CUDA_context_t; 00121 00122 00123 /************************* GLOBALS SECTION *********************************** 00124 *******************************************************************************/ 00125 00126 static int enumEventDomains( CUdevice dev, int deviceId ); 00127 #ifdef CUDA_4_0 00128 static int enumEvents( CUdevice dev, int domainId, int eventCount ); 00129 #else 00130 static int enumEvents( int domainId, int eventCount ); 00131 #endif 00132 00133 /* This table contains the CUDA native events */ 00134 static CUDA_native_event_entry_t *cuda_native_table; 00135 /* number of events in the table */ 00136 static int NUM_EVENTS = 0; 00137 static int deviceCount = 0; 00138 static int totalDomainCount = 0; 00139 static int totalEventCount = 0; 00140 static int currentDeviceID; /* determine the actual device the user code is running on */ 00141 static int CUDA_FREED = 0; 00142 00143 /* 00144 * Why are device and cuCtx globals? 00145 * 00146 * Starting in CUDA 4.0, multiple CPU threads can access the same CUDA context. 00147 * This is a much easier programming model then pre-4.0 as threads - using the 00148 * same context - can share memory, data, etc. 00149 * It's possible to create a different context for each thread, but then we are 00150 * likely running into a limitation that only one context can be profiled at a time. 00151 * ==> and we don't want this. That's why CUDA context creation is done in 00152 * CUDA_init_component() (called only by main thread) rather than CUDA_init_thread() 00153 * or CUDA_init_control_state() (both called by each thread). 00154 */ 00155 00156 static DeviceData_t *device; 00157 static CUcontext cuCtx; 00158 00159 #endif /* _PAPI_CUDA_H */