/* PAPI 5.0.1.0 */
/*
 * Small x86 driver that executes SSE / SSE2 floating-point add and multiply
 * instructions, in packed or scalar ("unpacked") form, through GCC inline
 * assembly and prints the results.
 *
 * Usage: <prog> <packed|unpacked> <sse|sse2>
 *
 * No run-time CPU feature check is performed: the selected instructions are
 * executed unconditionally.
 */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>            /* POSIX declares strcasecmp() here, not in <string.h> */

/* How many times each instruction sequence is issued. */
#define NUMBER 100

/* movaps / movapd fault unless their memory operand is 16-byte aligned. */
#define SSE_ALIGNED __attribute__(( aligned( 16 ) ))

/*
 * Notes that apply to every helper below:
 *  - "static inline": a plain C99/C11 "inline" definition emits no external
 *    symbol, so a call that is not inlined (e.g. at -O0) would fail to link.
 *  - AT&T operand order: "op %xmm0, %xmm1" computes xmm1 = xmm1 op xmm0.
 *  - The "memory" clobber tells the compiler that the asm reads and writes
 *    memory that is not listed as an operand (the store through cc).
 */

/* cc[0..3] = aa[0..3] + bb[0..3] (single precision). All pointers must be 16-byte aligned. */
static inline void
inline_packed_sse_add( float *aa, float *bb, float *cc )
{
    __asm__ __volatile__( "movaps (%0), %%xmm0;"
                          "movaps (%1), %%xmm1;"
                          "addps %%xmm0, %%xmm1;"
                          "movaps %%xmm1, (%2);"
                          :
                          : "r"( aa ), "r"( bb ), "r"( cc )
                          : "%xmm0", "%xmm1", "memory" );
}

/* cc[0..3] = aa[0..3] * bb[0..3] (single precision). All pointers must be 16-byte aligned. */
static inline void
inline_packed_sse_mul( float *aa, float *bb, float *cc )
{
    __asm__ __volatile__( "movaps (%0), %%xmm0;"
                          "movaps (%1), %%xmm1;"
                          "mulps %%xmm0, %%xmm1;"
                          "movaps %%xmm1, (%2);"
                          :
                          : "r"( aa ), "r"( bb ), "r"( cc )
                          : "%xmm0", "%xmm1", "memory" );
}

/* cc[0..1] = aa[0..1] + bb[0..1] (double precision). All pointers must be 16-byte aligned. */
static inline void
inline_packed_sse2_add( double *aa, double *bb, double *cc )
{
    __asm__ __volatile__( "movapd (%0), %%xmm0;"
                          "movapd (%1), %%xmm1;"
                          "addpd %%xmm0, %%xmm1;"
                          "movapd %%xmm1, (%2);"
                          :
                          : "r"( aa ), "r"( bb ), "r"( cc )
                          : "%xmm0", "%xmm1", "memory" );
}

/* cc[0..1] = aa[0..1] * bb[0..1] (double precision). All pointers must be 16-byte aligned. */
static inline void
inline_packed_sse2_mul( double *aa, double *bb, double *cc )
{
    __asm__ __volatile__( "movapd (%0), %%xmm0;"
                          "movapd (%1), %%xmm1;"
                          "mulpd %%xmm0, %%xmm1;"
                          "movapd %%xmm1, (%2);"
                          :
                          : "r"( aa ), "r"( bb ), "r"( cc )
                          : "%xmm0", "%xmm1", "memory" );
}

/* *cc = *aa + *bb using the scalar single-precision SSE instruction. */
static inline void
inline_unpacked_sse_add( float *aa, float *bb, float *cc )
{
    __asm__ __volatile__( "movss (%0), %%xmm0;"
                          "movss (%1), %%xmm1;"
                          "addss %%xmm0, %%xmm1;"
                          "movss %%xmm1, (%2);"
                          :
                          : "r"( aa ), "r"( bb ), "r"( cc )
                          : "%xmm0", "%xmm1", "memory" );
}

/* *cc = *aa * *bb using the scalar single-precision SSE instruction. */
static inline void
inline_unpacked_sse_mul( float *aa, float *bb, float *cc )
{
    __asm__ __volatile__( "movss (%0), %%xmm0;"
                          "movss (%1), %%xmm1;"
                          "mulss %%xmm0, %%xmm1;"
                          "movss %%xmm1, (%2);"
                          :
                          : "r"( aa ), "r"( bb ), "r"( cc )
                          : "%xmm0", "%xmm1", "memory" );
}

/* *cc = *aa + *bb using the scalar double-precision SSE2 instruction. */
static inline void
inline_unpacked_sse2_add( double *aa, double *bb, double *cc )
{
    __asm__ __volatile__( "movsd (%0), %%xmm0;"
                          "movsd (%1), %%xmm1;"
                          "addsd %%xmm0, %%xmm1;"
                          "movsd %%xmm1, (%2);"
                          :
                          : "r"( aa ), "r"( bb ), "r"( cc )
                          : "%xmm0", "%xmm1", "memory" );
}

/* *cc = *aa * *bb using the scalar double-precision SSE2 instruction. */
static inline void
inline_unpacked_sse2_mul( double *aa, double *bb, double *cc )
{
    __asm__ __volatile__( "movsd (%0), %%xmm0;"
                          "movsd (%1), %%xmm1;"
                          "mulsd %%xmm0, %%xmm1;"
                          "movsd %%xmm1, (%2);"
                          :
                          : "r"( aa ), "r"( bb ), "r"( cc )
                          : "%xmm0", "%xmm1", "memory" );
}

/* Print the usage line to stdout and terminate with exit status 1. */
static void
usage( const char *prog )
{
    /* argv[0] may be NULL when argc is 0; never hand NULL to %s. */
    printf( "Usage %s: <packed|unpacked> <sse|sse2>\n",
            prog != NULL ? prog : "<program>" );
    exit( 1 );
}

/*
 * Parse the two mode arguments (case-insensitively), then run the matching
 * add loop followed by the matching multiply loop, printing the result of
 * each.  Returns 0 on success; exits with status 1 on a usage error.
 */
int
main( int argc, char **argv )
{
    /* Operands and results; aligned because the packed paths use movaps/movapd. */
    SSE_ALIGNED float a[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    SSE_ALIGNED float b[4] = { 2.0f, 3.0f, 4.0f, 5.0f };
    SSE_ALIGNED float c[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
    SSE_ALIGNED double d[4] = { 1.0, 2.0, 3.0, 4.0 };
    SSE_ALIGNED double e[4] = { 2.0, 3.0, 4.0, 5.0 };
    SSE_ALIGNED double f[4] = { 0.0, 0.0, 0.0, 0.0 };
    int i, packed = 0, sse = 0;

    if ( argc != 3 ) {
        usage( argv[0] );
    }

    if ( strcasecmp( argv[1], "packed" ) == 0 ) {
        packed = 1;
    } else if ( strcasecmp( argv[1], "unpacked" ) == 0 ) {
        packed = 0;
    } else {
        usage( argv[0] );
    }

    if ( strcasecmp( argv[2], "sse" ) == 0 ) {
        sse = 1;
    } else if ( strcasecmp( argv[2], "sse2" ) == 0 ) {
        sse = 0;
    } else {
        usage( argv[0] );
    }

    /* d[] and e[] hold the same values as a[] and b[], so this header is
     * accurate for the SSE2 modes as well. */
    printf( "Vector 1: %f %f %f %f\n", a[0], a[1], a[2], a[3] );
    printf( "Vector 2: %f %f %f %f\n\n", b[0], b[1], b[2], b[3] );

    /* Each operation is issued NUMBER times on the same operands; the result
     * is therefore the same as for a single issue.
     * NOTE(review): presumably the repetition exists so an external counter
     * sees a known instruction count - confirm against whatever drives this. */
    if ( sse && !packed ) {
        for ( i = 0; i < NUMBER; i++ ) {
            inline_unpacked_sse_add( &a[0], &b[0], &c[0] );
        }
        printf( "%d SSE Unpacked Adds: Result %f\n", NUMBER, c[0] );

        for ( i = 0; i < NUMBER; i++ ) {
            inline_unpacked_sse_mul( &a[0], &b[0], &c[0] );
        }
        printf( "%d SSE Unpacked Muls: Result %f\n", NUMBER, c[0] );
    } else if ( sse && packed ) {
        for ( i = 0; i < NUMBER; i++ ) {
            inline_packed_sse_add( a, b, c );
        }
        printf( "%d SSE Packed Adds: Result %f %f %f %f\n", NUMBER,
                c[0], c[1], c[2], c[3] );

        for ( i = 0; i < NUMBER; i++ ) {
            inline_packed_sse_mul( a, b, c );
        }
        printf( "%d SSE Packed Muls: Result %f %f %f %f\n", NUMBER,
                c[0], c[1], c[2], c[3] );
    } else if ( !packed ) {
        /* SSE2 results land in f[], not in the float array c[]. */
        for ( i = 0; i < NUMBER; i++ ) {
            inline_unpacked_sse2_add( &d[0], &e[0], &f[0] );
        }
        printf( "%d SSE2 Unpacked Adds: Result %f\n", NUMBER, f[0] );

        for ( i = 0; i < NUMBER; i++ ) {
            inline_unpacked_sse2_mul( &d[0], &e[0], &f[0] );
        }
        printf( "%d SSE2 Unpacked Muls: Result %f\n", NUMBER, f[0] );
    } else {
        /* Packed SSE2 operates on two doubles per instruction, so both
         * result lanes are reported. */
        for ( i = 0; i < NUMBER; i++ ) {
            inline_packed_sse2_add( &d[0], &e[0], &f[0] );
        }
        printf( "%d SSE2 Packed Adds: Result %f %f\n", NUMBER, f[0], f[1] );

        for ( i = 0; i < NUMBER; i++ ) {
            inline_packed_sse2_mul( &d[0], &e[0], &f[0] );
        }
        printf( "%d SSE2 Packed Muls: Result %f %f\n", NUMBER, f[0], f[1] );
    }

    return 0;
}