RTOpPack: Extra C/C++ Code for Vector Reduction/Transformation Operators (Version of the Day)
00001 /* 00002 // @HEADER 00003 // *********************************************************************** 00004 // 00005 // Moocho: Multi-functional Object-Oriented arCHitecture for Optimization 00006 // Copyright (2003) Sandia Corporation 00007 // 00008 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive 00009 // license for use of this work by or on behalf of the U.S. Government. 00010 // 00011 // Redistribution and use in source and binary forms, with or without 00012 // modification, are permitted provided that the following conditions are 00013 // met: 00014 // 00015 // 1. Redistributions of source code must retain the above copyright 00016 // notice, this list of conditions and the following disclaimer. 00017 // 00018 // 2. Redistributions in binary form must reproduce the above copyright 00019 // notice, this list of conditions and the following disclaimer in the 00020 // documentation and/or other materials provided with the distribution. 00021 // 00022 // 3. Neither the name of the Corporation nor the names of the 00023 // contributors may be used to endorse or promote products derived from 00024 // this software without specific prior written permission. 00025 // 00026 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY 00027 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00028 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 00029 // PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE 00030 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 00031 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 00032 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00033 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00034 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00035 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00036 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00037 // 00038 // Questions? Contact Roscoe A. Bartlett (rabartl@sandia.gov) 00039 // 00040 // *********************************************************************** 00041 // @HEADER 00042 */ 00043 00044 #include "RTOpToMPI.h" 00045 00046 #include <stdlib.h> 00047 00048 #ifdef RTOP_TO_MPI_SHOW_TIMES 00049 #include <time.h> 00050 #include <stdio.h> 00051 #endif 00052 00053 void RTOp_MPI_type_signature( 00054 const int num_values 00055 ,const int num_indexes 00056 ,const int num_chars 00057 ,int* num_entries 00058 ,int block_lengths[] 00059 ,MPI_Aint displacements[] 00060 ,MPI_Datatype datatypes[] 00061 ) 00062 { 00063 int k = 0, off = 0; 00064 /* values */ 00065 block_lengths[k] = 3 + num_values; /* must carry size information */ 00066 displacements[k] = 0; 00067 datatypes[k] = RTOpMPI_VALUE_TYPE; 00068 ++k; 00069 off += (3 + num_values) * sizeof(RTOp_value_type); 00070 /* indexes */ 00071 if( num_indexes ) { 00072 block_lengths[k] = num_indexes; 00073 displacements[k] = off; 00074 datatypes[k] = RTOpMPI_INDEX_TYPE; 00075 ++k; 00076 off += num_indexes * sizeof(RTOp_index_type); 00077 } 00078 /* chars */ 00079 if( num_chars ) { 00080 block_lengths[k] = num_chars; 00081 displacements[k] = off; 00082 datatypes[k] = RTOpMPI_CHAR_TYPE; 00083 ++k; 00084 } 00085 *num_entries = k; 00086 } 00087 00088 int RTOp_extract_reduct_obj_ext_state( 00089 const struct RTOp_RTOp* op 00090 ,RTOp_ReductTarget 
reduct_obj 00091 ,int num_values 00092 ,int num_indexes 00093 ,int num_chars 00094 ,void* reduct_obj_ext 00095 ) 00096 { 00097 int err = 0; 00098 int 00099 num_values_off = 0, 00100 num_indexes_off = num_values_off + sizeof(RTOp_value_type), 00101 num_chars_off = num_indexes_off + sizeof(RTOp_value_type), 00102 values_off = num_chars_off + sizeof(RTOp_value_type), 00103 indexes_off = values_off + num_values * sizeof(RTOp_value_type), 00104 chars_off = indexes_off + num_indexes * sizeof(RTOp_index_type); 00105 *(RTOp_value_type*)((char*)reduct_obj_ext + num_values_off) = num_values; 00106 *(RTOp_value_type*)((char*)reduct_obj_ext + num_indexes_off) = num_indexes; 00107 *(RTOp_value_type*)((char*)reduct_obj_ext + num_chars_off) = num_chars; 00108 err = RTOp_extract_reduct_obj_state( 00109 op,reduct_obj 00110 ,num_values, (RTOp_value_type*)((char*)reduct_obj_ext + values_off) 00111 ,num_indexes, (RTOp_index_type*)((char*)reduct_obj_ext + indexes_off) 00112 ,num_chars, (RTOp_char_type*)((char*)reduct_obj_ext + chars_off) 00113 ); 00114 return err; 00115 } 00116 00117 int RTOp_load_reduct_obj_ext_state( 00118 const struct RTOp_RTOp* op 00119 ,const void* reduct_obj_ext 00120 ,RTOp_ReductTarget reduct_obj 00121 ) 00122 { 00123 int err = 0; 00124 int 00125 num_values_off = 0, 00126 num_values = *(RTOp_value_type*)((char*)reduct_obj_ext + num_values_off), 00127 num_indexes_off = num_values_off + sizeof(RTOp_value_type), 00128 num_indexes = *(RTOp_value_type*)((char*)reduct_obj_ext + num_indexes_off), 00129 num_chars_off = num_indexes_off + sizeof(RTOp_value_type), 00130 num_chars = *(RTOp_value_type*)((char*)reduct_obj_ext + num_chars_off), 00131 values_off = num_chars_off + sizeof(RTOp_value_type), 00132 indexes_off = values_off + num_values * sizeof(RTOp_value_type), 00133 chars_off = indexes_off + num_indexes * sizeof(RTOp_index_type); 00134 err = RTOp_load_reduct_obj_state( 00135 op 00136 ,num_values, (RTOp_value_type*)((char*)reduct_obj_ext + values_off) 00137 
,num_indexes, (RTOp_index_type*)((char*)reduct_obj_ext + indexes_off) 00138 ,num_chars, (RTOp_char_type*)((char*) reduct_obj_ext + chars_off) 00139 ,reduct_obj 00140 ); 00141 return err; 00142 } 00143 00144 int RTOp_MPI_apply_op( 00145 MPI_Comm comm, const struct RTOp_RTOp* op, int root_rank 00146 ,const int num_cols 00147 ,const int num_vecs, const struct RTOp_SubVector sub_vecs[] 00148 ,const int num_targ_vecs, const struct RTOp_MutableSubVector sub_targ_vecs[] 00149 ,RTOp_ReductTarget reduct_objs[] 00150 ) 00151 { 00152 int err = 0; 00153 int num_reduct_type_values = 0, num_reduct_type_indexes = 0, 00154 num_reduct_type_chars = 0, num_reduct_type_entries = 0; 00155 RTOp_ReductTarget *i_reduct_objs = NULL; 00156 int target_type_block_lengths[3]; 00157 MPI_Aint target_type_displacements[3]; 00158 RTOp_Datatype target_type_datatypes[3]; 00159 MPI_Datatype mpi_reduct_ext_type = MPI_DATATYPE_NULL; 00160 int reduct_obj_ext_size = 0; 00161 char *i_reduct_objs_ext = NULL; 00162 char *i_reduct_objs_tmp = NULL; 00163 RTOp_reduct_op_func_ptr_t reduct_op_func_ptr = NULL; 00164 MPI_Op mpi_op = MPI_OP_NULL; 00165 int rank = -1; 00166 int k; 00167 int kc; 00168 #ifdef RTOP_TO_MPI_SHOW_TIMES 00169 const double secs_per_tick = ((double)1.0) / CLOCKS_PER_SEC; 00170 clock_t ticks_start_start, ticks_start=0, ticks_end = 0; 00171 #endif 00172 00173 if( comm == MPI_COMM_NULL ) { 00174 /* Just do the reduction on this processor and be done with it! */ 00175 if( sub_vecs || sub_targ_vecs ) { 00176 for( kc = 0; kc < num_cols; ++kc ) { 00177 err = RTOp_apply_op( 00178 op, num_vecs, sub_vecs+kc*num_vecs, num_targ_vecs, sub_targ_vecs+kc*num_targ_vecs 00179 ,reduct_objs ? reduct_objs[kc] : RTOp_REDUCT_OBJ_NULL 00180 ); 00181 if (err) goto ERR_LABEL; 00182 } 00183 } 00184 return 0; /* Success! */ 00185 } 00186 00187 /* 00188 * Get the description of the datatype of the target object. 
00189 * We need this in order to map it to an MPI user defined 00190 * data type so that the reduction operations can be performed 00191 * over all of the of processors. 00192 * 00193 * Get the number of the entries in the array that describes 00194 * target type's datatypes 00195 */ 00196 err = RTOp_get_reduct_type_num_entries( 00197 op, &num_reduct_type_values, &num_reduct_type_indexes, &num_reduct_type_chars ); 00198 if(err) goto ERR_LABEL; 00199 num_reduct_type_entries 00200 = (num_reduct_type_values ? 1 : 0) 00201 + (num_reduct_type_indexes ? 1 : 0) 00202 + (num_reduct_type_chars ? 1 : 0); 00203 00204 if( num_reduct_type_entries ) { 00205 #ifdef RTOP_TO_MPI_SHOW_TIMES 00206 if(RTOp_MPI_apply_op_print_timings) { 00207 printf("RTOp_MPI_apply_op(...) : timing various MPI calls and other activities\n"); 00208 ticks_start_start = clock(); 00209 } 00210 #endif 00211 /* 00212 * There is a non-null reduction target object so we need to reduce 00213 * it across processors 00214 * 00215 * Allocate the intermediate target object and perform the reduction for the 00216 * vector elements on this processor. 
00217 */ 00218 i_reduct_objs = malloc( sizeof(RTOp_ReductTarget) * num_cols ); 00219 for( kc = 0; kc < num_cols; ++kc ) { 00220 i_reduct_objs[kc] = RTOp_REDUCT_OBJ_NULL; 00221 RTOp_reduct_obj_create( op, i_reduct_objs + kc ); 00222 } 00223 #ifdef RTOP_TO_MPI_SHOW_TIMES 00224 if(RTOp_MPI_apply_op_print_timings) { 00225 printf("calling RTOp_apply_op(...)"); 00226 ticks_start = clock(); 00227 } 00228 #endif 00229 if( sub_vecs || sub_targ_vecs ) { 00230 for( kc = 0; kc < num_cols; ++kc ) { 00231 err = RTOp_apply_op( 00232 op, num_vecs, sub_vecs+kc*num_vecs, num_targ_vecs, sub_targ_vecs+kc*num_targ_vecs 00233 ,i_reduct_objs[kc] 00234 ); 00235 if (err) goto ERR_LABEL; 00236 } 00237 } 00238 #ifdef RTOP_TO_MPI_SHOW_TIMES 00239 if(RTOp_MPI_apply_op_print_timings) { 00240 ticks_end = clock(); 00241 printf(" : cpu time = %g s\n", (ticks_end-ticks_start)*secs_per_tick ); 00242 } 00243 #endif 00244 /* Get the arrays that describe the reduction object datatype */ 00245 if( num_reduct_type_values == 0 ) ++num_reduct_type_entries; /* Add the block for the sizes */ 00246 RTOp_MPI_type_signature( 00247 num_reduct_type_values, num_reduct_type_indexes, num_reduct_type_chars 00248 ,&num_reduct_type_entries 00249 ,target_type_block_lengths, target_type_displacements, target_type_datatypes 00250 ); 00251 /* Translate this target datatype description to an MPI datatype */ 00252 #ifdef RTOP_TO_MPI_SHOW_TIMES 00253 if(RTOp_MPI_apply_op_print_timings) { 00254 printf("calling MPI_Type_struct(...)"); 00255 ticks_start = clock(); 00256 } 00257 #endif 00258 err = MPI_Type_struct( num_reduct_type_entries 00259 , target_type_block_lengths, target_type_displacements 00260 , target_type_datatypes, &mpi_reduct_ext_type ); 00261 #ifdef RTOP_TO_MPI_SHOW_TIMES 00262 if(RTOp_MPI_apply_op_print_timings) { 00263 ticks_end = clock(); 00264 printf(" : cpu time = %g s\n", (ticks_end-ticks_start)*secs_per_tick ); 00265 } 00266 #endif 00267 if(err) goto ERR_LABEL; 00268 #ifdef RTOP_TO_MPI_SHOW_TIMES 00269 
if(RTOp_MPI_apply_op_print_timings) { 00270 printf("calling MPI_Type_commit(...)"); 00271 ticks_start = clock(); 00272 } 00273 #endif 00274 err = MPI_Type_commit( &mpi_reduct_ext_type ); 00275 #ifdef RTOP_TO_MPI_SHOW_TIMES 00276 if(RTOp_MPI_apply_op_print_timings) { 00277 ticks_end = clock(); 00278 printf(" : cpu time = %g s\n", (ticks_end-ticks_start)*secs_per_tick ); 00279 } 00280 #endif 00281 if(err) goto ERR_LABEL; 00282 /* */ 00283 /* Create an external contiguous representation for the intermediate reduction object */ 00284 /* that MPI can use. This is very low level but at least the user */ 00285 /* does not have to do it. */ 00286 /* */ 00287 reduct_obj_ext_size = 00288 (3 + num_reduct_type_values) * sizeof(RTOp_value_type) + 00289 num_reduct_type_indexes * sizeof(RTOp_index_type) + 00290 num_reduct_type_chars * sizeof(RTOp_char_type); 00291 i_reduct_objs_ext = malloc( reduct_obj_ext_size * num_cols ); 00292 for( kc = 0; kc < num_cols; ++kc ) { 00293 RTOp_extract_reduct_obj_ext_state( 00294 op,i_reduct_objs[kc],num_reduct_type_values,num_reduct_type_indexes,num_reduct_type_chars 00295 ,i_reduct_objs_ext+kc*reduct_obj_ext_size 00296 ); 00297 } 00298 /* */ 00299 /* Now perform a global reduction over all of the processors. */ 00300 /* */ 00301 /* Get the user defined reduction operation for MPI to use */ 00302 RTOp_get_reduct_op( op, &reduct_op_func_ptr ); 00303 if( reduct_op_func_ptr != NULL ) { 00304 /* */ 00305 /* The operator object has returned an MPI compatible reduction function so */ 00306 /* MPI will be of great help in preforming the global reduction :-). */ 00307 /* */ 00308 /* Create the MPI reduction operator object */ 00309 /* */ 00310 #ifdef RTOP_TO_MPI_SHOW_TIMES 00311 if(RTOp_MPI_apply_op_print_timings) { 00312 printf("calling MPI_Op_create(...)"); 00313 ticks_start = clock(); 00314 } 00315 #endif 00316 err = MPI_Op_create( 00317 reduct_op_func_ptr 00318 ,1 /* The op is communitive? yes == 1! 
*/ 00319 ,&mpi_op 00320 ); 00321 #ifdef RTOP_TO_MPI_SHOW_TIMES 00322 if(RTOp_MPI_apply_op_print_timings) { 00323 ticks_end = clock(); 00324 printf(" : cpu time = %g s\n", (ticks_end-ticks_start)*secs_per_tick ); 00325 } 00326 #endif 00327 if(err) goto ERR_LABEL; 00328 if( root_rank >= 0 ) { 00329 MPI_Comm_rank( comm, &rank ); 00330 /* Apply the reduction operation over all of the processors */ 00331 /* and reduce to one target object only on the root processor */ 00332 i_reduct_objs_tmp = malloc( reduct_obj_ext_size * num_cols ); 00333 err = MPI_Reduce( 00334 i_reduct_objs_ext 00335 ,rank == root_rank ? i_reduct_objs_tmp : NULL /* I don't know if this is MPI legal? */ 00336 ,num_cols, mpi_reduct_ext_type 00337 ,mpi_op, root_rank, comm 00338 ); 00339 if(err) goto ERR_LABEL; 00340 if( rank == root_rank ) { /* bit-by-bit copy */ 00341 for( k = 0; k < reduct_obj_ext_size * num_cols; ++k ) i_reduct_objs_ext[k] = i_reduct_objs_tmp[k]; 00342 } 00343 } 00344 else { 00345 /* Apply the reduction operation over all of the processors */ 00346 /* and reduce to one target object on each processor */ 00347 i_reduct_objs_tmp = malloc( reduct_obj_ext_size * num_cols ); 00348 #ifdef RTOP_TO_MPI_SHOW_TIMES 00349 if(RTOp_MPI_apply_op_print_timings) { 00350 printf("calling MPI_Allreduce(...)"); 00351 ticks_start = clock(); 00352 } 00353 #endif 00354 err = MPI_Allreduce( 00355 i_reduct_objs_ext, i_reduct_objs_tmp, num_cols 00356 ,mpi_reduct_ext_type, mpi_op, comm 00357 ); 00358 #ifdef RTOP_TO_MPI_SHOW_TIMES 00359 if(RTOp_MPI_apply_op_print_timings) { 00360 ticks_end = clock(); 00361 printf(" : cpu time = %g s\n", (ticks_end-ticks_start)*secs_per_tick ); 00362 } 00363 #endif 00364 if(err) goto ERR_LABEL; 00365 for( k = 0; k < reduct_obj_ext_size * num_cols; ++k ) i_reduct_objs_ext[k] = i_reduct_objs_tmp[k]; 00366 } 00367 /* Load the updated state of the reduction target object and reduce. 
*/ 00368 for( kc = 0; kc < num_cols; ++kc ) { 00369 RTOp_load_reduct_obj_ext_state( op, i_reduct_objs_ext+kc*reduct_obj_ext_size, i_reduct_objs[kc] ); 00370 RTOp_reduce_reduct_objs( op, i_reduct_objs[kc], reduct_objs[kc] ); 00371 } 00372 } 00373 else { 00374 /* */ 00375 /* We must do without the MPI compatible reduction function :-( We must */ 00376 /* manually perform the reduction operation ourselves. Note, this will */ 00377 /* not be as efficient as when MPI would do it but it helps take some */ 00378 /* of the burden off of the developers of operator classes. */ 00379 /* */ 00380 /* What we need to do is to gather all of the intermediate reduction */ 00381 /* objects to the root process and then do the reduction pair-wise. */ 00382 /* */ 00383 assert( reduct_op_func_ptr ); /* ToDo: Implement! */ 00384 } 00385 } 00386 else { 00387 /* */ 00388 /* There is a null reduction target object so we don't need to reduce */ 00389 /* it across processors */ 00390 /* */ 00391 if( sub_vecs || sub_targ_vecs ) { 00392 for( kc = 0; kc < num_cols; ++kc ) { 00393 err = RTOp_apply_op( 00394 op, num_vecs, sub_vecs+kc*num_vecs, num_targ_vecs, sub_targ_vecs+kc*num_targ_vecs 00395 ,RTOp_REDUCT_OBJ_NULL 00396 ); 00397 if (err) goto ERR_LABEL; 00398 } 00399 } 00400 } 00401 00402 ERR_LABEL: 00403 00404 /* Clean up dynamically allocated memory! */ 00405 00406 if( i_reduct_objs_tmp != NULL ) 00407 free( i_reduct_objs_tmp ); 00408 if( mpi_op != MPI_OP_NULL ) 00409 MPI_Op_free( &mpi_op ); 00410 if( i_reduct_objs_ext != NULL ) 00411 free( i_reduct_objs_ext ); 00412 if( mpi_reduct_ext_type != MPI_DATATYPE_NULL ) 00413 MPI_Type_free( &mpi_reduct_ext_type ); 00414 if( i_reduct_objs != NULL ) { 00415 for( kc = 0; kc < num_cols; ++kc ) 00416 RTOp_reduct_obj_free( op, &i_reduct_objs[0] ); 00417 free( i_reduct_objs ); 00418 } 00419 00420 return err; 00421 } 00422 00423 #ifdef RTOP_TO_MPI_SHOW_TIMES 00424 int RTOp_MPI_apply_op_print_timings = 0; 00425 #endif
1.7.6.1