Zoltan2
00001 // @HEADER 00002 // 00003 // *********************************************************************** 00004 // 00005 // Zoltan2: A package of combinatorial algorithms for scientific computing 00006 // Copyright 2012 Sandia Corporation 00007 // 00008 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, 00009 // the U.S. Government retains certain rights in this software. 00010 // 00011 // Redistribution and use in source and binary forms, with or without 00012 // modification, are permitted provided that the following conditions are 00013 // met: 00014 // 00015 // 1. Redistributions of source code must retain the above copyright 00016 // notice, this list of conditions and the following disclaimer. 00017 // 00018 // 2. Redistributions in binary form must reproduce the above copyright 00019 // notice, this list of conditions and the following disclaimer in the 00020 // documentation and/or other materials provided with the distribution. 00021 // 00022 // 3. Neither the name of the Corporation nor the names of the 00023 // contributors may be used to endorse or promote products derived from 00024 // this software without specific prior written permission. 00025 // 00026 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY 00027 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00028 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 00029 // PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE 00030 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 00031 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 00032 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00033 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00034 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00035 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00036 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00037 // 00038 // Questions? Contact Karen Devine (kddevin@sandia.gov) 00039 // Erik Boman (egboman@sandia.gov) 00040 // Siva Rajamanickam (srajama@sandia.gov) 00041 // 00042 // *********************************************************************** 00043 // 00044 // @HEADER 00045 00050 #ifndef _ZOLTAN2_ALGMultiJagged_HPP_ 00051 #define _ZOLTAN2_ALGMultiJagged_HPP_ 00052 00053 #include <Zoltan2_MultiJagged_ReductionOps.hpp> 00054 #include <Zoltan2_CoordinateModel.hpp> 00055 #include <Zoltan2_Parameters.hpp> 00056 #include <Zoltan2_Algorithm.hpp> 00057 00058 #include <Tpetra_Distributor.hpp> 00059 #include <Teuchos_ParameterList.hpp> 00060 #include <Zoltan2_CoordinatePartitioningGraph.hpp> 00061 #include <new> // ::operator new[] 00062 #include <algorithm> // std::sort 00063 #include <Zoltan2_Util.hpp> 00064 #include <vector> 00065 00066 #ifdef HAVE_ZOLTAN2_ZOLTAN 00067 #ifdef HAVE_ZOLTAN2_MPI 00068 #define ENABLE_ZOLTAN_MIGRATION 00069 #include "zoltan_comm_cpp.h" 00070 #include "zoltan_types.h" // for error codes 00071 #endif 00072 #endif 00073 00074 00075 #ifdef HAVE_ZOLTAN2_OMP 00076 #include <omp.h> 00077 #endif 00078 00079 #define LEAST_SIGNIFICANCE 0.0001 00080 #define SIGNIFICANCE_MUL 1000 00081 00082 //if the (last dimension reduce all count) x the mpi world size 00083 //estimated to be bigger than this number then migration will be forced 00084 //in earlier iterations. 
#define FUTURE_REDUCEALL_CUTOFF 1500000
//if parts right before last dimension are estimated to have less than
//MIN_WORK_LAST_DIM many coords, migration will be forced in earlier iterations.
#define MIN_WORK_LAST_DIM 1000

#define ABS(x) ((x) >= 0 ? (x) : -(x))
//imbalance calculation. Wreal / Wexpected - 1
#define imbalanceOf(Wachieved, totalW, expectedRatio) \
    (Wachieved) / ((totalW) * (expectedRatio)) - 1
#define imbalanceOf2(Wachieved, wExpected) \
    (Wachieved) / (wExpected) - 1

using std::vector;

namespace Zoltan2{

/*! \brief Allocates an array of \c size elements of type T.
 *  Returns NULL when size == 0, and throws a const char* message on
 *  allocation failure.
 *  NOTE(review): operator new[] already throws std::bad_alloc on failure,
 *  so the NULL check below is defensive only.
 */
template <typename T>
T *allocMemory(size_t size){
    if (size > 0){
        T * a = new T[size];
        if (a == NULL) {
            throw "cannot allocate memory";
        }
        return a;
    }
    else {
        return NULL;
    }
}

/*! \brief Frees an array obtained from allocMemory and resets the pointer
 *  to NULL. Safe to call with a NULL pointer.
 */
template <typename T>
void freeArray(T *&array){
    if(array != NULL){
        delete [] array;
        array = NULL;
    }
}

/*! \brief Converts any streamable object to its std::string representation
 *  (first whitespace-delimited token only, by stream extraction).
 */
template <typename tt>
std::string toString(tt obj){
    std::stringstream ss (std::stringstream::in |std::stringstream::out);
    ss << obj;
    std::string tmp = "";
    ss >> tmp;
    return tmp;
}

/*! \brief Sort item whose ordering is defined by an array of \c count
 *  comparison keys (\c val, not owned by the item). Keys whose difference
 *  is below _EPSILON compare equal; full ties are broken by \c index.
 */
template <typename IT, typename CT, typename WT>
class uMultiSortItem
{
public:
    IT index;       // tie-breaking identifier of the item
    CT count;       // number of comparison keys in val
    WT *val;        // borrowed pointer to 'count' comparison keys
    WT _EPSILON;    // tolerance under which two keys compare equal

    uMultiSortItem(){
        this->index = 0;
        this->count = 0;
        this->val = NULL;
        this->_EPSILON = std::numeric_limits<WT>::epsilon() * 100;
    }

    uMultiSortItem(IT index_ ,CT count_, WT *vals_){
        this->index = index_;
        this->count = count_;
        this->val = vals_;
        this->_EPSILON = std::numeric_limits<WT>::epsilon() * 100;
    }

    uMultiSortItem( const uMultiSortItem<IT,CT,WT>& other ){
        this->index = other.index;
        this->count = other.count;
        this->val = other.val;      // shallow copy: val is not owned
        this->_EPSILON = other._EPSILON;
    }

    ~uMultiSortItem(){
        //freeArray<WT>(this->val);   // val is owned by the caller
    }

    void set(IT index_ ,CT count_, WT *vals_){
        this->index = index_;
        this->count = count_;
        this->val = vals_;
    }

    uMultiSortItem<IT,CT,WT> operator=(const uMultiSortItem<IT,CT,WT>& other){
        this->index = other.index;
        this->count = other.count;
        this->val = other.val;
        return *(this);
    }

    bool operator<(const uMultiSortItem<IT,CT,WT>& other) const{
        assert (this->count == other.count);
        for(CT i = 0; i < this->count; ++i){
            //if the values are equal go to next one.
            if (ABS(this->val[i] - other.val[i]) < this->_EPSILON){
                continue;
            }
            //if next value is smaller return true;
            if(this->val[i] < other.val[i]){
                return true;
            }
            //if next value is bigger return false;
            else {
                return false;
            }
        }
        //if they are totally equal.
        return this->index < other.index;
    }

    bool operator>(const uMultiSortItem<IT,CT,WT>& other) const{
        assert (this->count == other.count);
        for(CT i = 0; i < this->count; ++i){
            //if the values are equal go to next one.
            if (ABS(this->val[i] - other.val[i]) < this->_EPSILON){
                continue;
            }
            //if next value is bigger return true;
            if(this->val[i] > other.val[i]){
                return true;
            }
            //if next value is smaller return false;
            else {
                return false;
            }
        }
        //if they are totally equal.
        return this->index > other.index;
    }
};// uMultiSortItem

/*! \brief Simple id/value pair sorted by uqsort. */
template <class IT, class WT>
struct uSortItem
{
    IT id;
    WT val;
};// uSortItem

/*! \brief Quick sort (Numerical-Recipes style: median-of-three pivot,
 *  explicit stack, insertion sort for short sub-ranges) sorting the first
 *  \c n uSortItems of \c arr in ascending order of val.
 */
template <class IT, class WT>
void uqsort(IT n, uSortItem<IT, WT> * arr)
{
#define SWAP(a,b,temp) temp=(a);(a)=(b);(b)=temp;
    int NSTACK = 50;
    int M = 7;          // sub-arrays this small use insertion sort
    IT i, ir=n, j, k, l=1;
    IT jstack=0, istack[50];
    WT aval;
    uSortItem<IT,WT> a, temp;

    --arr;              // 1-based indexing, as in Numerical Recipes
    for (;;)
    {
        if (ir-l < M)
        {
            // insertion sort on the small remaining range
            for (j=l+1;j<=ir;j++)
            {
                a=arr[j];
                aval = a.val;
                for (i=j-1;i>=1;i--)
                {
                    if (arr[i].val <= aval)
                        break;
                    arr[i+1] = arr[i];
                }
                arr[i+1]=a;
            }
            if (jstack == 0)
                break;
            ir=istack[jstack--];
            l=istack[jstack--];
        }
        else
        {
            // median-of-three pivot selection
            k=(l+ir) >> 1;
            SWAP(arr[k],arr[l+1], temp)
            if (arr[l+1].val > arr[ir].val)
            {
                SWAP(arr[l+1],arr[ir],temp)
            }
            if (arr[l].val > arr[ir].val)
            {
                SWAP(arr[l],arr[ir],temp)
            }
            if (arr[l+1].val > arr[l].val)
            {
                SWAP(arr[l+1],arr[l],temp)
            }
            i=l+1;
            j=ir;
            a=arr[l];
            aval = a.val;
            // partition around the pivot value
            for (;;)
            {
                do i++; while (arr[i].val < aval);
                do j--; while (arr[j].val > aval);
                if (j < i) break;
                SWAP(arr[i],arr[j],temp);
            }
            arr[l]=arr[j];
            arr[j]=a;
            jstack += 2;
            if (jstack > NSTACK){
                std::cout << "uqsort: NSTACK too small in sort."
                          << std::endl;
                exit(1);
            }
            // push the larger partition on the stack, iterate on the smaller
            if (ir-i+1 >= j-l)
            {
                istack[jstack]=ir;
                istack[jstack-1]=i;
                ir=j-1;
            }
            else
            {
                istack[jstack]=j-1;
                istack[jstack-1]=l;
                l=i;
            }
        }
    }
}

} // namespace Zoltan2 (reopened below)
<< std::endl; 00323 exit(1); 00324 } 00325 if (ir-i+1 >= j-l) 00326 { 00327 istack[jstack]=ir; 00328 istack[jstack-1]=i; 00329 ir=j-1; 00330 } 00331 else 00332 { 00333 istack[jstack]=j-1; 00334 istack[jstack-1]=l; 00335 l=i; 00336 } 00337 } 00338 } 00339 } 00340 00341 00342 00346 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t, 00347 typename mj_part_t> 00348 class AlgMJ 00349 { 00350 private: 00351 typedef coordinateModelPartBox<mj_scalar_t, mj_part_t> mj_partBox_t; 00352 typedef std::vector<mj_partBox_t> mj_partBoxVector_t; 00353 00354 RCP<const Environment> mj_env; //the environment object 00355 RCP<Comm<int> > mj_problemComm; //initial comm object 00356 00357 double imbalance_tolerance; //input imbalance tolerance. 00358 mj_part_t *part_no_array; //input part array specifying num part to divide along each dim. 00359 int recursion_depth; //the number of steps that partitioning will be solved in. 00360 int coord_dim, num_weights_per_coord; //coordinate dim and # of weights per coord 00361 00362 size_t initial_num_loc_coords; //initial num local coords. 00363 global_size_t initial_num_glob_coords; //initial num global coords. 00364 00365 mj_lno_t num_local_coords; //number of local coords. 00366 mj_gno_t num_global_coords; //number of global coords. 00367 00368 mj_scalar_t **mj_coordinates; //two dimension coordinate array 00369 mj_scalar_t **mj_weights; //two dimension weight array 00370 bool *mj_uniform_parts; //if the target parts are uniform 00371 mj_scalar_t **mj_part_sizes; //target part weight sizes. 00372 bool *mj_uniform_weights; //if the coordinates have uniform weights. 00373 00374 ArrayView<const mj_gno_t> mj_gnos; //global ids of the coordinates, comes from the input 00375 size_t num_global_parts; //the targeted number of parts 00376 00377 mj_gno_t *initial_mj_gnos; //initial global ids of the coordinates. 00378 mj_gno_t *current_mj_gnos; //current global ids of the coordinates, might change during migration. 
00379 int *owner_of_coordinate; //the actual processor owner of the coordinate, to track after migrations. 00380 00381 mj_lno_t *coordinate_permutations; //permutation of coordinates, for partitioning. 00382 mj_lno_t *new_coordinate_permutations; //permutation work array. 00383 mj_part_t *assigned_part_ids; //the part ids assigned to coordinates. 00384 00385 mj_lno_t *part_xadj; //beginning and end of each part. 00386 mj_lno_t *new_part_xadj; // work array for beginning and end of each part. 00387 00388 //get mj specific parameters. 00389 bool distribute_points_on_cut_lines; //if partitioning can distribute points on same coordiante to different parts. 00390 mj_part_t max_concurrent_part_calculation; // how many parts we can calculate concurrently. 00391 00392 int mj_run_as_rcb; //if this is set, then recursion depth is adjusted to its maximum value. 00393 int mj_user_recursion_depth; //the recursion depth value provided by user. 00394 int mj_keep_part_boxes; //if the boxes need to be kept. 00395 00396 int check_migrate_avoid_migration_option; //whether to migrate=1, avoid migrate=2, or leave decision to MJ=0 00397 mj_scalar_t minimum_migration_imbalance; //when MJ decides whether to migrate, the minimum imbalance for migration. 00398 int num_threads; //num threads 00399 00400 mj_part_t total_num_cut ; //how many cuts will be totally 00401 mj_part_t total_num_part; //how many parts will be totally 00402 00403 mj_part_t max_num_part_along_dim ; //maximum part count along a dimension. 00404 mj_part_t max_num_cut_along_dim; //maximum cut count along a dimension. 00405 size_t max_num_total_part_along_dim; //maximum part+cut count along a dimension. 00406 00407 mj_part_t total_dim_num_reduce_all; //estimate on #reduceAlls can be done. 00408 mj_part_t last_dim_num_part; //max no of parts that might occur 00409 //during the partition before the 00410 //last partitioning dimension. 
00411 00412 RCP<Comm<int> > comm; //comm object than can be altered during execution 00413 float fEpsilon; //epsilon for float 00414 mj_scalar_t sEpsilon; //epsilon for mj_scalar_t 00415 00416 mj_scalar_t maxScalar_t; //max possible scalar 00417 mj_scalar_t minScalar_t; //min scalar 00418 00419 mj_scalar_t *all_cut_coordinates; 00420 mj_scalar_t *max_min_coords; 00421 mj_scalar_t *process_cut_line_weight_to_put_left; //how much weight should a MPI put left side of the each cutline 00422 mj_scalar_t **thread_cut_line_weight_to_put_left; //how much weight percentage should each thread in MPI put left side of the each outline 00423 00424 // work array to manipulate coordinate of cutlines in different iterations. 00425 //necessary because previous cut line information is used for determining 00426 //the next cutline information. therefore, cannot update the cut work array 00427 //until all cutlines are determined. 00428 mj_scalar_t *cut_coordinates_work_array; 00429 00430 //cumulative part weight array. 00431 mj_scalar_t *target_part_weights; 00432 00433 mj_scalar_t *cut_upper_bound_coordinates ; //upper bound coordinate of a cut line 00434 mj_scalar_t *cut_lower_bound_coordinates ; //lower bound coordinate of a cut line 00435 mj_scalar_t *cut_lower_bound_weights ; //lower bound weight of a cut line 00436 mj_scalar_t *cut_upper_bound_weights ; //upper bound weight of a cut line 00437 00438 mj_scalar_t *process_local_min_max_coord_total_weight ; //combined array to exchange the min and max coordinate, and total weight of part. 00439 mj_scalar_t *global_min_max_coord_total_weight ;//global combined array with the results for min, max and total weight. 00440 00441 //isDone is used to determine if a cutline is determined already. 00442 //If a cut line is already determined, the next iterations will skip this cut line. 
00443 bool *is_cut_line_determined; 00444 //my_incomplete_cut_count count holds the number of cutlines that have not been finalized for each part 00445 //when concurrentPartCount>1, using this information, if my_incomplete_cut_count[x]==0, then no work is done for this part. 00446 mj_part_t *my_incomplete_cut_count; 00447 //local part weights of each thread. 00448 double **thread_part_weights; 00449 //the work manupulation array for partweights. 00450 double **thread_part_weight_work; 00451 00452 //thread_cut_left_closest_point to hold the closest coordinate to a cutline from left (for each thread). 00453 mj_scalar_t **thread_cut_left_closest_point; 00454 //thread_cut_right_closest_point to hold the closest coordinate to a cutline from right (for each thread) 00455 mj_scalar_t **thread_cut_right_closest_point; 00456 00457 //to store how many points in each part a thread has. 00458 mj_lno_t **thread_point_counts; 00459 00460 mj_scalar_t *process_rectilinear_cut_weight; 00461 mj_scalar_t *global_rectilinear_cut_weight; 00462 00463 //for faster communication, concatanation of 00464 //totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines 00465 //leftClosest distances sized P-1, since P-1 cut lines 00466 //rightClosest distances size P-1, since P-1 cut lines. 00467 mj_scalar_t *total_part_weight_left_right_closests ; 00468 mj_scalar_t *global_total_part_weight_left_right_closests; 00469 00470 RCP<mj_partBoxVector_t> kept_boxes; 00471 RCP<mj_partBox_t> global_box; 00472 int myRank, myActualRank; //processor rank, and initial rank 00473 00474 /* \brief Either the mj array (part_no_array) or num_global_parts should be provided in 00475 * the input. part_no_array takes 00476 * precedence if both are provided. 00477 * Depending on these parameters, total cut/part number, 00478 * maximum part/cut number along a dimension, estimated number of reduceAlls, 00479 * and the number of parts before the last dimension is calculated. 
00480 * */ 00481 void set_part_specifications(); 00482 00483 /* \brief Tries to determine the part number for current dimension, 00484 * by trying to make the partitioning as square as possible. 00485 * \param num_total_future how many more partitionings are required. 00486 * \param root how many more recursion depth is left. 00487 */ 00488 inline mj_part_t get_part_count( 00489 mj_part_t num_total_future, 00490 double root); 00491 00492 /* \brief Allocates the all required memory for the mj partitioning algorithm. 00493 * 00494 */ 00495 void allocate_set_work_memory(); 00496 00497 /* \brief for part communication we keep track of the box boundaries. 00498 * This is performed when either asked specifically, or when geometric mapping is performed afterwards. 00499 * This function initializes a single box with all global min and max coordinates. 00500 * \param initial_partitioning_boxes the input and output vector for boxes. 00501 */ 00502 void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes); 00503 00504 /* \brief compute global bounding box: min/max coords of global domain */ 00505 void compute_global_box(); 00506 00507 /* \brief Function returns how many parts that will be obtained after this dimension partitioning. 00508 * It sets how many parts each current part will be partitioned into in this dimension to num_partitioning_in_current_dim vector, 00509 * sets how many total future parts each obtained part will be partitioned into in next_future_num_parts_in_parts vector, 00510 * If part boxes are kept, then sets initializes the output_part_boxes as its ancestor. 00511 * 00512 * \param num_partitioning_in_current_dim: output. How many parts each current part will be partitioned into. 00513 * \param future_num_part_in_parts: input, how many future parts each current part will be partitioned into. 00514 * \param next_future_num_parts_in_parts: output, how many future parts each obtained part will be partitioned into. 
00515 * \param future_num_parts: output, max number of future parts that will be obtained from a single 00516 * \param current_num_parts: input, how many parts are there currently. 00517 * \param current_iteration: input, current dimension iteration number. 00518 * \param input_part_boxes: input, if boxes are kept, current boxes. 00519 * \param output_part_boxes: output, if boxes are kept, the initial box boundaries for obtained parts. 00520 */ 00521 mj_part_t update_part_num_arrays( 00522 std::vector<mj_part_t> &num_partitioning_in_current_dim, //assumes this vector is empty. 00523 std::vector<mj_part_t> *future_num_part_in_parts, 00524 std::vector<mj_part_t> *next_future_num_parts_in_parts, //assumes this vector is empty. 00525 mj_part_t &future_num_parts, 00526 mj_part_t current_num_parts, 00527 int current_iteration, 00528 RCP<mj_partBoxVector_t> input_part_boxes, 00529 RCP<mj_partBoxVector_t> output_part_boxes); 00530 00542 void mj_get_local_min_max_coord_totW( 00543 mj_lno_t coordinate_begin_index, 00544 mj_lno_t coordinate_end_index, 00545 mj_lno_t *mj_current_coordinate_permutations, 00546 mj_scalar_t *mj_current_dim_coords, 00547 mj_scalar_t &min_coordinate, 00548 mj_scalar_t &max_coordinate, 00549 mj_scalar_t &total_weight); 00550 00558 void mj_get_global_min_max_coord_totW( 00559 mj_part_t current_concurrent_num_parts, 00560 mj_scalar_t *local_min_max_total, 00561 mj_scalar_t *global_min_max_total); 00562 00581 void mj_get_initial_cut_coords_target_weights( 00582 mj_scalar_t min_coord, 00583 mj_scalar_t max_coord, 00584 mj_part_t num_cuts/*p-1*/ , 00585 mj_scalar_t global_weight, 00586 mj_scalar_t *initial_cut_coords /*p - 1 sized, coordinate of each cut line*/, 00587 mj_scalar_t *target_part_weights /*cumulative weights, at left side of each cut line. 
p-1 sized*/, 00588 00589 std::vector <mj_part_t> *future_num_part_in_parts, //the vecto 00590 std::vector <mj_part_t> *next_future_num_parts_in_parts, 00591 mj_part_t concurrent_current_part, 00592 mj_part_t obtained_part_index); 00593 00606 void set_initial_coordinate_parts( 00607 mj_scalar_t &max_coordinate, 00608 mj_scalar_t &min_coordinate, 00609 mj_part_t &concurrent_current_part_index, 00610 mj_lno_t coordinate_begin_index, 00611 mj_lno_t coordinate_end_index, 00612 mj_lno_t *mj_current_coordinate_permutations, 00613 mj_scalar_t *mj_current_dim_coords, 00614 mj_part_t *mj_part_ids, 00615 mj_part_t &partition_count); 00616 00627 void mj_1D_part( 00628 mj_scalar_t *mj_current_dim_coords, 00629 mj_scalar_t imbalanceTolerance, 00630 mj_part_t current_work_part, 00631 mj_part_t current_concurrent_num_parts, 00632 mj_scalar_t *current_cut_coordinates, 00633 mj_part_t total_incomplete_cut_count, 00634 std::vector <mj_part_t> &num_partitioning_in_current_dim); 00635 00655 void mj_1D_part_get_thread_part_weights( 00656 size_t total_part_count, 00657 mj_part_t num_cuts, 00658 mj_scalar_t max_coord, 00659 mj_scalar_t min_coord, 00660 mj_lno_t coordinate_begin_index, 00661 mj_lno_t coordinate_end_index, 00662 mj_scalar_t *mj_current_dim_coords, 00663 mj_scalar_t *temp_current_cut_coords, 00664 bool *current_cut_status, 00665 double *my_current_part_weights, 00666 mj_scalar_t *my_current_left_closest, 00667 mj_scalar_t *my_current_right_closest); 00668 00676 void mj_accumulate_thread_results( 00677 const std::vector <mj_part_t> &num_partitioning_in_current_dim, 00678 mj_part_t current_work_part, 00679 mj_part_t current_concurrent_num_parts); 00680 00711 void mj_get_new_cut_coordinates( 00712 const size_t &num_total_part, 00713 const mj_part_t &num_cuts, 00714 const mj_scalar_t &max_coordinate, 00715 const mj_scalar_t &min_coordinate, 00716 const mj_scalar_t &global_total_weight, 00717 const mj_scalar_t &used_imbalance_tolerance, 00718 mj_scalar_t * 
current_global_part_weights, 00719 const mj_scalar_t * current_local_part_weights, 00720 const mj_scalar_t *current_part_target_weights, 00721 bool *current_cut_line_determined, 00722 mj_scalar_t *current_cut_coordinates, 00723 mj_scalar_t *current_cut_upper_bounds, 00724 mj_scalar_t *current_cut_lower_bounds, 00725 mj_scalar_t *current_global_left_closest_points, 00726 mj_scalar_t *current_global_right_closest_points, 00727 mj_scalar_t * current_cut_lower_bound_weights, 00728 mj_scalar_t * current_cut_upper_weights, 00729 mj_scalar_t *new_current_cut_coordinates, 00730 mj_scalar_t *current_part_cut_line_weight_to_put_left, 00731 mj_part_t *rectilinear_cut_count, 00732 mj_part_t &my_num_incomplete_cut); 00733 00743 void mj_calculate_new_cut_position ( 00744 mj_scalar_t cut_upper_bound, 00745 mj_scalar_t cut_lower_bound, 00746 mj_scalar_t cut_upper_weight, 00747 mj_scalar_t cut_lower_weight, 00748 mj_scalar_t expected_weight, 00749 mj_scalar_t &new_cut_position); 00750 00761 void mj_create_new_partitions( 00762 mj_part_t num_parts, 00763 mj_scalar_t *mj_current_dim_coords, 00764 mj_scalar_t *current_concurrent_cut_coordinate, 00765 mj_lno_t coordinate_begin, 00766 mj_lno_t coordinate_end, 00767 mj_scalar_t *used_local_cut_line_weight_to_left, 00768 double **used_thread_part_weight_work, 00769 mj_lno_t *out_part_xadj); 00770 00793 bool mj_perform_migration( 00794 mj_part_t in_num_parts, //current umb parts 00795 mj_part_t &out_num_parts, //output umb parts. 
00796 std::vector<mj_part_t> *next_future_num_parts_in_parts, 00797 mj_part_t &output_part_begin_index, 00798 size_t migration_reduce_all_population, 00799 mj_lno_t num_coords_for_last_dim_part, 00800 std::string iteration, 00801 RCP<mj_partBoxVector_t> &input_part_boxes, 00802 RCP<mj_partBoxVector_t> &output_part_boxes); 00803 00813 void get_processor_num_points_in_parts( 00814 mj_part_t num_procs, 00815 mj_part_t num_parts, 00816 mj_gno_t *&num_points_in_all_processor_parts); 00817 00830 bool mj_check_to_migrate( 00831 size_t migration_reduce_all_population, 00832 mj_lno_t num_coords_for_last_dim_part, 00833 mj_part_t num_procs, 00834 mj_part_t num_parts, 00835 mj_gno_t *num_points_in_all_processor_parts); 00836 00837 00855 void mj_migration_part_proc_assignment( 00856 mj_gno_t * num_points_in_all_processor_parts, 00857 mj_part_t num_parts, 00858 mj_part_t num_procs, 00859 mj_lno_t *send_count_to_each_proc, 00860 std::vector<mj_part_t> &processor_ranks_for_subcomm, 00861 std::vector<mj_part_t> *next_future_num_parts_in_parts, 00862 mj_part_t &out_num_part, 00863 std::vector<mj_part_t> &out_part_indices, 00864 mj_part_t &output_part_numbering_begin_index, 00865 int *coordinate_destinations); 00866 00883 void mj_assign_proc_to_parts( 00884 mj_gno_t * num_points_in_all_processor_parts, 00885 mj_part_t num_parts, 00886 mj_part_t num_procs, 00887 mj_lno_t *send_count_to_each_proc, 00888 std::vector<mj_part_t> &processor_ranks_for_subcomm, 00889 std::vector<mj_part_t> *next_future_num_parts_in_parts, 00890 mj_part_t &out_part_index, 00891 mj_part_t &output_part_numbering_begin_index, 00892 int *coordinate_destinations); 00893 00904 void assign_send_destinations( 00905 mj_part_t num_parts, 00906 mj_part_t *part_assignment_proc_begin_indices, 00907 mj_part_t *processor_chains_in_parts, 00908 mj_lno_t *send_count_to_each_proc, 00909 int *coordinate_destinations); 00910 00923 void assign_send_destinations2( 00924 mj_part_t num_parts, 00925 uSortItem<mj_part_t, mj_part_t> * 
sort_item_part_to_proc_assignment, //input sorted wrt processors 00926 int *coordinate_destinations, 00927 mj_part_t &output_part_numbering_begin_index, 00928 std::vector<mj_part_t> *next_future_num_parts_in_parts); 00929 00946 void mj_assign_parts_to_procs( 00947 mj_gno_t * num_points_in_all_processor_parts, 00948 mj_part_t num_parts, 00949 mj_part_t num_procs, 00950 mj_lno_t *send_count_to_each_proc, //output: sized nprocs, show the number of send point counts to each proc. 00951 std::vector<mj_part_t> *next_future_num_parts_in_parts,//input how many more partitions the part will be partitioned into. 00952 mj_part_t &out_num_part, //output, how many parts the processor will have. this is always 1 for this function. 00953 std::vector<mj_part_t> &out_part_indices, //output: the part indices which the processor is assigned to. 00954 mj_part_t &output_part_numbering_begin_index, //output: how much the part number should be shifted when setting the solution 00955 int *coordinate_destinations); 00956 00969 void mj_migrate_coords( 00970 mj_part_t num_procs, 00971 mj_lno_t &num_new_local_points, 00972 std::string iteration, 00973 int *coordinate_destinations, 00974 mj_part_t num_parts); 00975 00982 void create_sub_communicator(vector<mj_part_t> &processor_ranks_for_subcomm); 00983 00984 00990 void fill_permutation_array( 00991 mj_part_t output_num_parts, 00992 mj_part_t num_parts); 00993 01002 void set_final_parts( 01003 mj_part_t current_num_parts, 01004 mj_part_t output_part_begin_index, 01005 RCP<mj_partBoxVector_t> output_part_boxes, 01006 bool is_data_ever_migrated); 01009 void free_work_memory(); 01023 void create_consistent_chunks( 01024 mj_part_t num_parts, 01025 mj_scalar_t *mj_current_dim_coords, 01026 mj_scalar_t *current_concurrent_cut_coordinate, 01027 mj_lno_t coordinate_begin, 01028 mj_lno_t coordinate_end, 01029 mj_scalar_t *used_local_cut_line_weight_to_left, 01030 mj_lno_t *out_part_xadj, 01031 int coordInd); 01032 public: 01033 AlgMJ(); 01034 01063 
  /*! \brief Multi Jagged coordinate partitioning algorithm.
   *  Partitions the given coordinates into num_global_parts parts by
   *  recursive 1D partitioning along the coordinate dimensions.
   *  \param result_assigned_part_ids output: part id assigned to each gno.
   *  \param result_mj_gnos output: gnos corresponding to the assigned parts.
   */
  void multi_jagged_part(
      const RCP<const Environment> &env,
      RCP<Comm<int> > &problemComm,

      double imbalance_tolerance,
      size_t num_global_parts,
      mj_part_t *part_no_array,
      int recursion_depth,

      int coord_dim,
      mj_lno_t num_local_coords,
      mj_gno_t num_global_coords,
      const mj_gno_t *initial_mj_gnos,
      mj_scalar_t **mj_coordinates,

      int num_weights_per_coord,
      bool *mj_uniform_weights,
      mj_scalar_t **mj_weights,
      bool *mj_uniform_parts,
      mj_scalar_t **mj_part_sizes,

      mj_part_t *&result_assigned_part_ids,
      mj_gno_t *&result_mj_gnos

      );
  /*! \brief Sets tuning parameters for the MJ run (cut-line point
   *  distribution, concurrency, and migration behavior). */
  void set_partitioning_parameters(
      bool distribute_points_on_cut_lines_,
      int max_concurrent_part_calculation_,
      int check_migrate_avoid_migration_option_,
      mj_scalar_t minimum_migration_imbalance_);
  /*! \brief Requests that the bounding boxes of the resulting parts be kept. */
  void set_to_keep_part_boxes();
  /*! \brief Returns the kept part boxes; throws if they were not stored. */
  RCP<mj_partBoxVector_t> get_part_boxes() const;
  /*! \brief Returns the bounding box of the global coordinate domain. */
  RCP<mj_partBox_t> get_global_box() const;
  /*! \brief Special in-place partitioning entry used for task mapping;
   *  runs sequentially (serial communicator) — see definition below. */
  void sequential_task_partitioning(
      const RCP<const Environment> &env,
      mj_lno_t num_total_coords,
      mj_lno_t num_selected_coords,
      size_t num_target_part,
      int coord_dim,
      mj_scalar_t **mj_coordinates,
      mj_lno_t *initial_selected_coords_output_permutation,
      mj_lno_t *output_xadj,
      int recursion_depth,
      const mj_part_t *part_no_array);

};

/*! \brief Special function for task mapping.
 *  Partitions num_total_coords points into num_target_part parts,
 *  sequentially (the communicator is replaced by a serial comm and the
 *  thread count is forced to 1 for the duration of the call).
 *
 *  \param env environment (for debugging/timing macros).
 *  \param num_total_coords total number of local coordinates.
 *  \param num_selected_coords number of coordinates actually partitioned;
 *         the initial (single) part ends at this index.
 *  \param num_target_part requested number of output parts.
 *  \param coord_dim_ dimension of the coordinates.
 *  \param mj_coordinates_ coordinate arrays, one pointer per dimension.
 *  \param inital_adjList_output_adjlist in/out: the coordinate permutation;
 *         on return it holds the permutation induced by the partitioning.
 *  \param output_xadj output: part boundaries into the permutation array,
 *         sized num_target_part (CRS-style end offsets).
 *  \param rd recursion depth (number of 1D partitioning sweeps).
 *  \param part_no_array_ optional per-level part counts (may be NULL).
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::sequential_task_partitioning(
    const RCP<const Environment> &env,
    mj_lno_t num_total_coords,
    mj_lno_t num_selected_coords,
    size_t num_target_part,
    int coord_dim_,
    mj_scalar_t **mj_coordinates_,
    mj_lno_t *inital_adjList_output_adjlist,
    mj_lno_t *output_xadj,
    int rd,
    const mj_part_t *part_no_array_
){

  // Run on a serial communicator regardless of the caller's comm.
  this->mj_env = env;
  const RCP<Comm<int> > commN;
  this->comm = this->mj_problemComm = Teuchos::rcp_const_cast<Comm<int> >
      (Teuchos::DefaultComm<int>::getDefaultSerialComm(commN));
  // NOTE(review): ranks are set to 1, not 0, for this serial run — looks
  // intentional (rank is unused in the sequential path), but confirm.
  this->myActualRank = this->myRank = 1;

#ifdef HAVE_ZOLTAN2_OMP
  // Force single-threaded execution; restored at the end of this function.
  int actual_num_threads = omp_get_num_threads();
  omp_set_num_threads(1);
#endif

  // Weights and parts are uniform for task mapping.
  this->imbalance_tolerance = 0;
  this->num_global_parts = num_target_part;
  this->part_no_array = (mj_part_t *)part_no_array_;
  this->recursion_depth = rd;

  this->coord_dim = coord_dim_;
  this->num_local_coords = num_total_coords;
  this->num_global_coords = num_total_coords;
  this->mj_coordinates = mj_coordinates_;  // borrowed; not copied.

  // Allocated only so the generic code paths have a valid array; freed below.
  this->initial_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);

  this->num_weights_per_coord = 0;
  bool *tmp_mj_uniform_weights = new bool[1];
  this->mj_uniform_weights = tmp_mj_uniform_weights ;
  this->mj_uniform_weights[0] = true;

  mj_scalar_t **tmp_mj_weights = new mj_scalar_t *[1];
  this->mj_weights = tmp_mj_weights;

  bool *tmp_mj_uniform_parts = new bool[1];
  this->mj_uniform_parts = tmp_mj_uniform_parts;
  this->mj_uniform_parts[0] = true;

  mj_scalar_t **tmp_mj_part_sizes = new mj_scalar_t * [1];
  this->mj_part_sizes = tmp_mj_part_sizes;
  this->mj_part_sizes[0] = NULL;

  this->num_threads = 1;
  this->set_part_specifications();

  this->allocate_set_work_memory();
  // The initial (single) part ends at num_selected_coords, not at the total
  // coordinate count: only the selected prefix is partitioned.
  this->part_xadj[0] = static_cast<mj_lno_t>(num_selected_coords);
  for(size_t i = 0; i < static_cast<size_t>(num_total_coords); ++i){
    this->coordinate_permutations[i] = inital_adjList_output_adjlist[i];
  }

  mj_part_t current_num_parts = 1;

  mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;

  mj_part_t future_num_parts = this->total_num_part;

  // future_num_part_in_parts: how many more parts each current part must
  // still be split into; next_future_num_parts_in_parts: same for the parts
  // produced this iteration. Swapped each iteration to reuse the storage.
  std::vector<mj_part_t> *future_num_part_in_parts = new std::vector<mj_part_t> ();
  std::vector<mj_part_t> *next_future_num_parts_in_parts = new std::vector<mj_part_t> ();
  next_future_num_parts_in_parts->push_back(this->num_global_parts);
  RCP<mj_partBoxVector_t> t1;  // part boxes unused in the sequential path.
  RCP<mj_partBoxVector_t> t2;

  for (int i = 0; i < this->recursion_depth; ++i){

    // How many parts each current part will be split into along this
    // dimension; filled by update_part_num_arrays below.
    std::vector <mj_part_t> num_partitioning_in_current_dim;

    // Swap the future-part bookkeeping vectors for this iteration.
    std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
    future_num_part_in_parts = next_future_num_parts_in_parts;
    next_future_num_parts_in_parts = tmpPartVect;

    // update_part_num_arrays expects both of these to be empty on entry.
    next_future_num_parts_in_parts->clear();

    // Total number of parts that this sweep will produce.
    mj_part_t output_part_count_in_dimension =
        this->update_part_num_arrays(
            num_partitioning_in_current_dim,
            future_num_part_in_parts,
            next_future_num_parts_in_parts,
            future_num_parts,
            current_num_parts,
            i,
            t1,
            t2);

    // If no part is split this sweep (e.g. a 1 in the part_no_array,
    // P=4,5,1,2), undo the swap and skip the dimension.
    if(output_part_count_in_dimension == current_num_parts) {
      tmpPartVect= future_num_part_in_parts;
      future_num_part_in_parts = next_future_num_parts_in_parts;
      next_future_num_parts_in_parts = tmpPartVect;
      continue;
    }

    // Coordinate axis for this sweep (round-robin over dimensions).
    int coordInd = i % this->coord_dim;
    mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
    // Iteration id as string, used for debugging purposes.
    std::string istring = toString<int>(i);

    // Part end-offsets (into the permutation array) produced by this sweep.
    this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);

    // Next write position in new_part_xadj.
    mj_part_t output_part_index = 0;
    // Offset added to the per-part counts so they become global end offsets.
    mj_part_t output_coordinate_end_index = 0;

    mj_part_t current_work_part = 0;
    mj_part_t current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
        this->max_concurrent_part_calculation);

    mj_part_t obtained_part_index = 0;

    // Process the current parts in groups of at most
    // max_concurrent_part_calculation.
    for (; current_work_part < current_num_parts;
        current_work_part += current_concurrent_num_parts){

      current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
          this->max_concurrent_part_calculation);

      mj_part_t actual_work_part_count = 0;
      // Initialization for 1D partitioning: per-part min/max coordinate
      // and total weight.
      for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
        mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;

        // Parts that are not split further need no work here.
        if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
          continue;
        }
        ++actual_work_part_count;
        mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
        mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];

        this->mj_get_local_min_max_coord_totW(
            coordinate_begin_index,
            coordinate_end_index,
            this->coordinate_permutations,
            mj_current_dim_coords,
            this->process_local_min_max_coord_total_weight[kk], // min coordinate
            this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts], // max coordinate
            this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts] // total weight
        );
      }

      // 1D partitioning of the concurrent group.
      if (actual_work_part_count > 0){
        // Obtain the global min/max of each part (trivial on a serial comm).
        this->mj_get_global_min_max_coord_totW(
            current_concurrent_num_parts,
            this->process_local_min_max_coord_total_weight,
            this->global_min_max_coord_total_weight);

        // Total number of cut lines whose coordinate is still unknown.
        mj_part_t total_incomplete_cut_count = 0;

        // Compute target weight ratios for parts & cuts, laid out as
        // e.g. 0.25 0.25 0.5 0.5 0.75 0.75 1
        //      part0 cut0 part1 cut1 part2 cut2 part3
        mj_part_t concurrent_part_cut_shift = 0;
        mj_part_t concurrent_part_part_shift = 0;
        for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
          mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
          mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
              current_concurrent_num_parts];
          mj_scalar_t global_total_weight =
              this->global_min_max_coord_total_weight[kk +
              2 * current_concurrent_num_parts];

          mj_part_t concurrent_current_part_index = current_work_part + kk;

          mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];

          mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
          mj_scalar_t *current_target_part_weights = this->target_part_weights +
              concurrent_part_part_shift;
          // Advance the cut array by the number of cuts of this part...
          concurrent_part_cut_shift += partition_count - 1;
          // ...and the ratio array by its number of parts.
          concurrent_part_part_shift += partition_count;

          // Work is needed only if the part is non-empty and will be split.
          if(partition_count > 1 && min_coordinate <= max_coordinate){

            // Account for this part's cut lines in the completion counter.
            total_incomplete_cut_count += partition_count - 1;
            this->my_incomplete_cut_count[kk] = partition_count - 1;

            // Compute the target weights of the sub-parts.
            this->mj_get_initial_cut_coords_target_weights(
                min_coordinate,
                max_coordinate,
                partition_count - 1,
                global_total_weight,
                usedCutCoordinate,
                current_target_part_weights,
                future_num_part_in_parts,
                next_future_num_parts_in_parts,
                concurrent_current_part_index,
                obtained_part_index);

            mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
            mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];

            // Initial estimated part assignment of each coordinate.
            this->set_initial_coordinate_parts(
                max_coordinate,
                min_coordinate,
                concurrent_current_part_index,
                coordinate_begin_index, coordinate_end_index,
                this->coordinate_permutations,
                mj_current_dim_coords,
                this->assigned_part_ids,
                partition_count);
          }
          else {
            // e.g. fewer coordinates than parts: nothing to determine.
            this->my_incomplete_cut_count[kk] = 0;
          }
          obtained_part_index += partition_count;
        }

        // Used imbalance is always 0 here; a range is hard to estimate.
        mj_scalar_t used_imbalance = 0;

        // Determine the cut-line coordinates for the concurrent group.
        this->mj_1D_part(
            mj_current_dim_coords,
            used_imbalance,
            current_work_part,
            current_concurrent_num_parts,
            current_cut_coordinates,
            total_incomplete_cut_count,
            num_partitioning_in_current_dim);
      }

      // Create the part chunks from the computed cuts.
      {

        mj_part_t output_array_shift = 0;
        mj_part_t cut_shift = 0;
        size_t tlr_shift = 0;
        size_t partweight_array_shift = 0;

        for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
          mj_part_t current_concurrent_work_part = current_work_part + kk;
          mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];

          // Empty part (min > max): emit zero-sized sub-parts and advance
          // all the shift counters past this part's slots.
          if((num_parts != 1 ) && this->global_min_max_coord_total_weight[kk] >
              this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {

            for(mj_part_t jj = 0; jj < num_parts; ++jj){
              this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
            }
            cut_shift += num_parts - 1;
            tlr_shift += (4 *(num_parts - 1) + 1);
            output_array_shift += num_parts;
            partweight_array_shift += (2 * (num_parts - 1) + 1);
            continue;
          }

          mj_lno_t coordinate_end = this->part_xadj[current_concurrent_work_part];
          mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[current_concurrent_work_part
              -1];
          mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
          mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
              cut_shift;

          for(int ii = 0; ii < this->num_threads; ++ii){
            this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
          }

          if(num_parts > 1){
            // Rewrite the permutation indices based on the computed cuts.
            this->create_consistent_chunks(
                num_parts,
                mj_current_dim_coords,
                current_concurrent_cut_coordinate,
                coordinate_begin,
                coordinate_end,
                used_local_cut_line_weight_to_left,
                this->new_part_xadj + output_part_index + output_array_shift,
                coordInd );
          }
          else {
            // Part kept as a single part: just copy the old permutation.
            mj_lno_t part_size = coordinate_end - coordinate_begin;
            *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
            memcpy(this->new_coordinate_permutations + coordinate_begin,
                this->coordinate_permutations + coordinate_begin,
                part_size * sizeof(mj_lno_t));
          }
          cut_shift += num_parts - 1;
          tlr_shift += (4 *(num_parts - 1) + 1);
          output_array_shift += num_parts;
          partweight_array_shift += (2 * (num_parts - 1) + 1);
        }

        // create_consistent_chunks wrote sizes as if each part were alone;
        // shift them by the running total so they become end offsets.
        for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
          mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
          for (mj_part_t ii = 0;ii < num_parts ; ++ii){
            this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
          }
          // Advance the running total to the end of this part's chunk.
          output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
          output_part_index += num_parts ;
        }
      }
    }
    // End of this partitioning dimension.

    // The parts produced here are the input parts of the next sweep.
    current_num_parts = output_part_count_in_dimension;

    // Swap the permutation buffers for the next dimension.
    mj_lno_t * tmp = this->coordinate_permutations;
    this->coordinate_permutations = this->new_coordinate_permutations;
    this->new_coordinate_permutations = tmp;

    freeArray<mj_lno_t>(this->part_xadj);
    this->part_xadj = this->new_part_xadj;
  }

  // Write the final permutation and part boundaries back to the caller.
  for(mj_lno_t i = 0; i < num_total_coords; ++i){
    inital_adjList_output_adjlist[i] = this->coordinate_permutations[i];
  }

  for(size_t i = 0; i < this->num_global_parts ; ++i){
    output_xadj[i] = this->part_xadj[i];
  }

  delete future_num_part_in_parts;
  delete next_future_num_parts_in_parts;

  // Free the extra memory that was allocated above.
  freeArray<mj_part_t>(this->assigned_part_ids);
  freeArray<mj_gno_t>(this->initial_mj_gnos);
  freeArray<mj_gno_t>(this->current_mj_gnos);
  freeArray<bool>(tmp_mj_uniform_weights);
  freeArray<bool>(tmp_mj_uniform_parts);
  freeArray<mj_scalar_t *>(tmp_mj_weights);
  freeArray<mj_scalar_t *>(tmp_mj_part_sizes);

  this->free_work_memory();

#ifdef HAVE_ZOLTAN2_OMP
  // Restore the caller's OpenMP thread count.
  omp_set_num_threads(actual_num_threads);
#endif
}

/*! \brief Default constructor: zero/NULL-initializes every member, then
 *  sets the numeric-limit-derived epsilons and scalar bounds in the body.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::AlgMJ():
  mj_env(), mj_problemComm(), imbalance_tolerance(0),
  part_no_array(NULL), recursion_depth(0), coord_dim(0),
  num_weights_per_coord(0), initial_num_loc_coords(0),
  initial_num_glob_coords(0),
  num_local_coords(0), num_global_coords(0), mj_coordinates(NULL),
  mj_weights(NULL), mj_uniform_parts(NULL), mj_part_sizes(NULL),
  mj_uniform_weights(NULL), mj_gnos(), num_global_parts(1),
  initial_mj_gnos(NULL), current_mj_gnos(NULL), owner_of_coordinate(NULL),
  coordinate_permutations(NULL), new_coordinate_permutations(NULL),
  assigned_part_ids(NULL), part_xadj(NULL), new_part_xadj(NULL),
  distribute_points_on_cut_lines(true), max_concurrent_part_calculation(1),
  mj_run_as_rcb(0), mj_user_recursion_depth(0), mj_keep_part_boxes(0),
  check_migrate_avoid_migration_option(0), minimum_migration_imbalance(0.30),
  num_threads(1), total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
  max_num_cut_along_dim(0), max_num_total_part_along_dim(0), total_dim_num_reduce_all(0),
  last_dim_num_part(0), comm(), fEpsilon(0), sEpsilon(0), maxScalar_t(0), minScalar_t(0),
  all_cut_coordinates(NULL), max_min_coords(NULL), process_cut_line_weight_to_put_left(NULL),
  thread_cut_line_weight_to_put_left(NULL), cut_coordinates_work_array(NULL),
  target_part_weights(NULL), cut_upper_bound_coordinates(NULL), cut_lower_bound_coordinates(NULL),
  cut_lower_bound_weights(NULL), cut_upper_bound_weights(NULL),
  process_local_min_max_coord_total_weight(NULL), global_min_max_coord_total_weight(NULL),
  is_cut_line_determined(NULL), my_incomplete_cut_count(NULL),
  thread_part_weights(NULL), thread_part_weight_work(NULL),
  thread_cut_left_closest_point(NULL), thread_cut_right_closest_point(NULL),
  thread_point_counts(NULL), process_rectilinear_cut_weight(NULL),
  global_rectilinear_cut_weight(NULL),total_part_weight_left_right_closests(NULL),
  global_total_part_weight_left_right_closests(NULL),
  kept_boxes(),global_box(),
  myRank(0), myActualRank(0)
{
  // Comparison tolerances derived from the floating-point types in use.
  this->fEpsilon = std::numeric_limits<float>::epsilon();
  this->sEpsilon = std::numeric_limits<mj_scalar_t>::epsilon() * 100;

  // Sentinels for min/max coordinate searches. Note: -max() is used for the
  // minimum (lowest finite value), not min(), which would be the smallest
  // positive value for floating-point types.
  this->maxScalar_t = std::numeric_limits<mj_scalar_t>::max();
  this->minScalar_t = -std::numeric_limits<mj_scalar_t>::max();

}

/*! \brief Returns the bounding boxes of the parts.
 *  Valid only if set_to_keep_part_boxes() was called before partitioning;
 *  otherwise throws std::logic_error.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_part_boxes() const
{
  if (this->mj_keep_part_boxes){
    return this->kept_boxes;
  } else {
    throw std::logic_error("Error: part boxes are not stored.");
  }
}

/*! \brief Returns the bounding box of the global coordinate domain. */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBox_t>
AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::get_global_box() const
{
  return this->global_box;
}

/*! \brief Requests that the part bounding boxes be computed and kept. */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_to_keep_part_boxes(){
  this->mj_keep_part_boxes = 1;
}



/* \brief Either the mj array (part_no_array) or num_global_parts should be provided in
 * the input. part_no_array takes
 * precedence if both are provided.
 * Depending on these parameters, total cut/part number,
 * maximum part/cut number along a dimension, estimated number of reduceAlls,
 * and the number of parts before the last dimension is calculated.
 * */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_part_specifications(){

  this->total_num_cut = 0; //how many cuts there will be in total
  this->total_num_part = 1; //how many parts there will be in total
  this->max_num_part_along_dim = 0; //maximum part count along a dimension.
  this->total_dim_num_reduce_all = 0; //estimate on #reduceAlls that can be done.
  this->last_dim_num_part = 1; //max no of parts that might occur
                               //during the partition before the
                               //last partitioning dimension.
  this->max_num_cut_along_dim = 0;
  this->max_num_total_part_along_dim = 0;

  if (this->part_no_array){
    //if user provided a part array, traverse it and set the variables.
    for (int i = 0; i < this->recursion_depth; ++i){
      this->total_dim_num_reduce_all += this->total_num_part;
      this->total_num_part *= this->part_no_array[i];
      if(this->part_no_array[i] > this->max_num_part_along_dim) {
        this->max_num_part_along_dim = this->part_no_array[i];
      }
    }
    // Part count before the last level = total / last level's factor.
    this->last_dim_num_part = this->total_num_part / this->part_no_array[recursion_depth-1];
    this->num_global_parts = this->total_num_part;
  } else {
    mj_part_t future_num_parts = this->num_global_parts;

    //compute the part numbers now, to determine the maximum along the
    //dimensions: at level i the ideal factor is the
    //(recursion_depth - i)-th root of the remaining part count.
    for (int i = 0; i < this->recursion_depth; ++i){

      mj_part_t maxNoPartAlongI = this->get_part_count(
          future_num_parts, 1.0f / (this->recursion_depth - i));

      if (maxNoPartAlongI > this->max_num_part_along_dim){
        this->max_num_part_along_dim = maxNoPartAlongI;
      }

      // Remaining future parts, rounded up.
      mj_part_t nfutureNumParts = future_num_parts / maxNoPartAlongI;
      if (future_num_parts % maxNoPartAlongI){
        ++nfutureNumParts;
      }
      future_num_parts = nfutureNumParts;
    }
    this->total_num_part = this->num_global_parts;

    //estimate the reduceAll count: an upper bound computed as if every
    //level multiplied the part count by max_num_part_along_dim.
    mj_part_t p = 1;
    for (int i = 0; i < this->recursion_depth; ++i){
      this->total_dim_num_reduce_all += p;
      p *= this->max_num_part_along_dim;
    }

    this->last_dim_num_part = p / this->max_num_part_along_dim;
  }

  this->total_num_cut = this->total_num_part - 1;
  this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
  this->max_num_total_part_along_dim = this->max_num_part_along_dim + size_t(this->max_num_cut_along_dim);
  //maxPartNo is P, maxCutNo = P-1, maxTotalPartCount = 2P-1

  //clamp the concurrent part count if it exceeds the maximum usable value.
  if(this->max_concurrent_part_calculation > this->last_dim_num_part){
    if(this->mj_problemComm->getRank() == 0){
      std::cerr << "Warning: Concurrent part count ("<< this->max_concurrent_part_calculation <<
          ") has been set bigger than maximum amount that can be used." <<
          " Setting to:" << this->last_dim_num_part << "." << std::endl;
    }
    this->max_concurrent_part_calculation = this->last_dim_num_part;
  }

}
/* \brief Tries to determine the part number for the current dimension,
 * by trying to make the partitioning as square as possible.
 * \param num_total_future how many more partitionings are required.
 * \param root how many more recursion levels are left (as 1/levels).
 * \return ceil(num_total_future^root), with a small epsilon so that values
 *         a hair above an integer are not rounded up spuriously.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_part_count(
    mj_part_t num_total_future,
    double root)
{
  double fp = pow(num_total_future, root);
  mj_part_t ip = mj_part_t (fp);
  if (fp - ip < this->fEpsilon * 100){
    return ip;
  }
  else {
    return ip + 1;
  }
}

/* \brief Function returns how many parts that will be obtained after this dimension partitioning.
 * It sets how many parts each current part will be partitioned into in this dimension to num_partitioning_in_current_dim vector,
 * sets how many total future parts each obtained part will be partitioned into in next_future_num_parts_in_parts vector,
 * If part boxes are kept, then initializes the output_part_boxes as its ancestor.
 *
 * \param num_partitioning_in_current_dim: output. How many parts each current part will be partitioned into.
 * \param future_num_part_in_parts: input, how many future parts each current part will be partitioned into.
 * \param next_future_num_parts_in_parts: output, how many future parts each obtained part will be partitioned into.
 * \param future_num_parts: output, max number of future parts that will be obtained from a single part.
 * \param current_num_parts: input, how many parts there are currently.
 * \param current_iteration: input, current dimension iteration number.
 * \param input_part_boxes: input, if boxes are kept, current boxes.
 * \param output_part_boxes: output, if boxes are kept, the initial box boundaries for obtained parts.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::update_part_num_arrays(
    std::vector <mj_part_t> &num_partitioning_in_current_dim, //assumes this vector is empty.
    std::vector<mj_part_t> *future_num_part_in_parts,
    std::vector<mj_part_t> *next_future_num_parts_in_parts, //assumes this vector is empty.
    mj_part_t &future_num_parts,
    mj_part_t current_num_parts,
    int current_iteration,
    RCP<mj_partBoxVector_t> input_part_boxes,
    RCP<mj_partBoxVector_t> output_part_boxes
){
  //how many parts will be obtained after this dimension.
  mj_part_t output_num_parts = 0;
  if(this->part_no_array){
    //when the partNo array is provided as input,
    //each current partition will be partitioned into the same number of
    //parts; future_num_part_in_parts is not needed in this case.

    mj_part_t p = this->part_no_array[current_iteration];
    if (p < 1){
      std::cout << "i:" << current_iteration << " p is given as:" << p << std::endl;
      exit(1);
    }
    if (p == 1){
      //factor 1: nothing to split this level, part count is unchanged.
      return current_num_parts;
    }

    for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
      num_partitioning_in_current_dim.push_back(p);

    }

    //set the new value of future_num_parts (all factors are identical,
    //so dividing by the first entry is the same for every part).
    future_num_parts /= num_partitioning_in_current_dim[0];
    output_num_parts = current_num_parts * num_partitioning_in_current_dim[0];

    if (this->mj_keep_part_boxes){
      for (mj_part_t k = 0; k < current_num_parts; ++k){
        //initialize each output box as its ancestor's box.
        for (mj_part_t j = 0; j < num_partitioning_in_current_dim[0]; ++j){
          output_part_boxes->push_back((*input_part_boxes)[k]);
        }
      }
    }

    //set how many more parts each part will be divided into.
    //this is obvious when the partNo array is provided as input;
    //it is filled anyway so that weights are calculated consistently.
    for (mj_part_t ii = 0; ii < output_num_parts; ++ii){
      next_future_num_parts_in_parts->push_back(future_num_parts);
    }
  }
  else {
    //if the partNo array is not provided as input,
    //future_num_part_in_parts holds how many parts each part should be
    //divided into. Initially it holds a single number equal to the total
    //number of global parts.

    //recompute future_num_parts from scratch, since each part might be
    //divided into a different number of parts.
    future_num_parts = 1;

    for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
      //how many parts this part should eventually be divided into.
      mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];

      //the ideal number of parts, close to the
      //(recursion_depth - current_iteration)-th root of
      //future_num_parts_of_part_ii.
      mj_part_t num_partitions_in_current_dim =
          this->get_part_count(
              future_num_parts_of_part_ii,
              1.0 / (this->recursion_depth - current_iteration)
          );

      if (num_partitions_in_current_dim > this->max_num_part_along_dim){
        std::cerr << "ERROR: maxPartNo calculation is wrong." << std::endl;
        exit(1);
      }
      //record the factor for this part.
      num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);


      //increase the output number of parts.
      output_num_parts += num_partitions_in_current_dim;

      //ideal number of future partitions for each obtained part.
      mj_part_t ideal_num_future_parts_in_part = future_num_parts_of_part_ii / num_partitions_in_current_dim;
      for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii){
        mj_part_t num_future_parts_for_part_iii = ideal_num_future_parts_in_part;

        //distribute the remainder: the first (remainder) sub-parts get one
        //extra future part each.
        if (iii < future_num_parts_of_part_ii % num_partitions_in_current_dim){
          //if not uniform, add 1 for the extra parts.
          ++num_future_parts_for_part_iii;
        }
        next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii);

        //if part boxes are stored, initialize each box as its ancestor's.
        if (this->mj_keep_part_boxes){
          output_part_boxes->push_back((*input_part_boxes)[ii]);
        }

        //track the maximum future part count over all obtained parts.
        if (num_future_parts_for_part_iii > future_num_parts) future_num_parts = num_future_parts_for_part_iii;
      }
    }
  }
  return output_num_parts;
}


/* \brief Allocates and initializes the work memory that will be used by MJ.
 *
 * */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::allocate_set_work_memory(){

  //points to the process that initially owns the coordinate.
  this->owner_of_coordinate = NULL;

  //Throughout the partitioning execution,
  //instead of moving the coordinates, hold a permutation array for parts.
  //coordinate_permutations holds the current permutation.
  this->coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);
  //initial configuration: set each pointer-i to i.
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
  for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
    this->coordinate_permutations[i] = i;
  }

  //new_coordinate_permutations receives the next permutation
  //(the two buffers are swapped after each dimension sweep).
  this->new_coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);

  this->assigned_part_ids = NULL;
  if(this->num_local_coords > 0){
    this->assigned_part_ids = allocMemory<mj_part_t>(this->num_local_coords);
  }

  //single partition starts at index-0 and ends at numLocalCoords.
  //part_xadj holds the end points in the coordinate_permutations array
  //for each partition. Initially sized 1, single element = numLocalCoords.
  this->part_xadj = allocMemory<mj_lno_t>(1);
  this->part_xadj[0] = static_cast<mj_lno_t>(this->num_local_coords); //the end of the initial partition is the end of coordinates.
  //the end points of the output; allocated later per sweep.
  this->new_part_xadj = NULL;

  // only store this much if cuts are needed to be stored.
  //this->all_cut_coordinates = allocMemory< mj_scalar_t>(this->total_num_cut);


  //cut storage sized for the maximum concurrent group, not the total.
  this->all_cut_coordinates = allocMemory< mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);

  this->max_min_coords = allocMemory< mj_scalar_t>(this->num_threads * 2);

  this->process_cut_line_weight_to_put_left = NULL; //how much weight percentage should a MPI put left side of each cutline
  this->thread_cut_line_weight_to_put_left = NULL; //how much weight percentage should each thread in MPI put left side of each cutline
  //distribute_points_on_cut_lines = false;
  if(this->distribute_points_on_cut_lines){
    //these arrays are needed only when points sitting on a cut line may be
    //split between the two sides.
    this->process_cut_line_weight_to_put_left = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
    this->thread_cut_line_weight_to_put_left = allocMemory<mj_scalar_t *>(this->num_threads);
    for(int i = 0; i < this->num_threads; ++i){
      this->thread_cut_line_weight_to_put_left[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
    }
this->process_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim); 01962 this->global_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim); 01963 } 01964 01965 01966 // work array to manipulate coordinate of cutlines in different iterations. 01967 //necessary because previous cut line information is used for determining 01968 //the next cutline information. therefore, cannot update the cut work array 01969 //until all cutlines are determined. 01970 this->cut_coordinates_work_array = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * 01971 this->max_concurrent_part_calculation); 01972 01973 01974 //cumulative part weight array. 01975 this->target_part_weights = allocMemory<mj_scalar_t>( 01976 this->max_num_part_along_dim * this->max_concurrent_part_calculation); 01977 // the weight from left to write. 01978 01979 this->cut_upper_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation); //upper bound coordinate of a cut line 01980 this->cut_lower_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation); //lower bound coordinate of a cut line 01981 this->cut_lower_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation); //lower bound weight of a cut line 01982 this->cut_upper_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation); //upper bound weight of a cut line 01983 01984 this->process_local_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation); //combined array to exchange the min and max coordinate, and total weight of part. 01985 this->global_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation);//global combined array with the results for min, max and total weight. 
01986 01987 //is_cut_line_determined is used to determine if a cutline is determined already. 01988 //If a cut line is already determined, the next iterations will skip this cut line. 01989 this->is_cut_line_determined = allocMemory<bool>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation); 01990 //my_incomplete_cut_count count holds the number of cutlines that have not been finalized for each part 01991 //when concurrentPartCount>1, using this information, if my_incomplete_cut_count[x]==0, then no work is done for this part. 01992 this->my_incomplete_cut_count = allocMemory<mj_part_t>(this->max_concurrent_part_calculation); 01993 //local part weights of each thread. 01994 this->thread_part_weights = allocMemory<double *>(this->num_threads); 01995 //the work manupulation array for partweights. 01996 this->thread_part_weight_work = allocMemory<double *>(this->num_threads); 01997 01998 //thread_cut_left_closest_point to hold the closest coordinate to a cutline from left (for each thread). 01999 this->thread_cut_left_closest_point = allocMemory<mj_scalar_t *>(this->num_threads); 02000 //thread_cut_right_closest_point to hold the closest coordinate to a cutline from right (for each thread) 02001 this->thread_cut_right_closest_point = allocMemory<mj_scalar_t *>(this->num_threads); 02002 02003 //to store how many points in each part a thread has. 
02004 this->thread_point_counts = allocMemory<mj_lno_t *>(this->num_threads); 02005 02006 for(int i = 0; i < this->num_threads; ++i){ 02007 //partWeights[i] = allocMemory<mj_scalar_t>(maxTotalPartCount); 02008 this->thread_part_weights[i] = allocMemory < double >(this->max_num_total_part_along_dim * this->max_concurrent_part_calculation); 02009 this->thread_cut_right_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation); 02010 this->thread_cut_left_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation); 02011 this->thread_point_counts[i] = allocMemory<mj_lno_t>(this->max_num_part_along_dim); 02012 } 02013 //for faster communication, concatanation of 02014 //totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines 02015 //leftClosest distances sized P-1, since P-1 cut lines 02016 //rightClosest distances size P-1, since P-1 cut lines. 02017 this->total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation); 02018 this->global_total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation); 02019 02020 02021 mj_scalar_t **coord = allocMemory<mj_scalar_t *>(this->coord_dim); 02022 for (int i=0; i < this->coord_dim; i++){ 02023 coord[i] = allocMemory<mj_scalar_t>(this->num_local_coords); 02024 #ifdef HAVE_ZOLTAN2_OMP 02025 #pragma omp parallel for 02026 #endif 02027 for (mj_lno_t j=0; j < this->num_local_coords; j++) 02028 coord[i][j] = this->mj_coordinates[i][j]; 02029 } 02030 this->mj_coordinates = coord; 02031 02032 02033 int criteria_dim = (this->num_weights_per_coord ? 
this->num_weights_per_coord : 1); 02034 mj_scalar_t **weights = allocMemory<mj_scalar_t *>(criteria_dim); 02035 02036 for (int i=0; i < criteria_dim; i++){ 02037 weights[i] = NULL; 02038 } 02039 for (int i=0; i < this->num_weights_per_coord; i++){ 02040 weights[i] = allocMemory<mj_scalar_t>(this->num_local_coords); 02041 #ifdef HAVE_ZOLTAN2_OMP 02042 #pragma omp parallel for 02043 #endif 02044 for (mj_lno_t j=0; j < this->num_local_coords; j++) 02045 weights[i][j] = this->mj_weights[i][j]; 02046 02047 } 02048 this->mj_weights = weights; 02049 this->current_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords); 02050 #ifdef HAVE_ZOLTAN2_OMP 02051 #pragma omp parallel for 02052 #endif 02053 for (mj_lno_t j=0; j < this->num_local_coords; j++) 02054 this->current_mj_gnos[j] = this->initial_mj_gnos[j]; 02055 02056 this->owner_of_coordinate = allocMemory<int>(this->num_local_coords); 02057 02058 #ifdef HAVE_ZOLTAN2_OMP 02059 #pragma omp parallel for 02060 #endif 02061 for (mj_lno_t j=0; j < this->num_local_coords; j++) 02062 this->owner_of_coordinate[j] = this->myActualRank; 02063 } 02064 02065 /* \brief compute the global bounding box 02066 */ 02067 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t, 02068 typename mj_part_t> 02069 void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::compute_global_box() 02070 { 02071 //local min coords 02072 mj_scalar_t *mins = allocMemory<mj_scalar_t>(this->coord_dim); 02073 //global min coords 02074 mj_scalar_t *gmins = allocMemory<mj_scalar_t>(this->coord_dim); 02075 //local max coords 02076 mj_scalar_t *maxs = allocMemory<mj_scalar_t>(this->coord_dim); 02077 //global max coords 02078 mj_scalar_t *gmaxs = allocMemory<mj_scalar_t>(this->coord_dim); 02079 02080 for (int i = 0; i < this->coord_dim; ++i){ 02081 mj_scalar_t localMin = this->mj_coordinates[i][0]; 02082 mj_scalar_t localMax = this->mj_coordinates[i][0]; 02083 for (mj_lno_t j = 1; j < this->num_local_coords; ++j){ 02084 if (this->mj_coordinates[i][j] < 
localMin){ 02085 localMin = this->mj_coordinates[i][j]; 02086 } 02087 if (this->mj_coordinates[i][j] > localMax){ 02088 localMax = this->mj_coordinates[i][j]; 02089 } 02090 } 02091 //cout << " localMin:" << localMin << endl; 02092 //cout << " localMax:" << localMax << endl; 02093 mins[i] = localMin; 02094 maxs[i] = localMax; 02095 } 02096 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN, 02097 this->coord_dim, mins, gmins 02098 ); 02099 02100 reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX, 02101 this->coord_dim, maxs, gmaxs 02102 ); 02103 02104 //create single box with all areas. 02105 global_box = rcp(new mj_partBox_t(0,this->coord_dim,gmins,gmaxs)); 02106 //coordinateModelPartBox <mj_scalar_t, mj_part_t> tmpBox (0, coordDim); 02107 freeArray<mj_scalar_t>(mins); 02108 freeArray<mj_scalar_t>(gmins); 02109 freeArray<mj_scalar_t>(maxs); 02110 freeArray<mj_scalar_t>(gmaxs); 02111 } 02112 02113 /* \brief for part communication we keep track of the box boundaries. 02114 * This is performed when either asked specifically, or when geometric mapping is performed afterwards. 02115 * This function initializes a single box with all global min and max coordinates. 02116 * \param initial_partitioning_boxes the input and output vector for boxes. 
02117 */ 02118 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t, 02119 typename mj_part_t> 02120 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::init_part_boxes( 02121 RCP<mj_partBoxVector_t> & initial_partitioning_boxes 02122 ) 02123 { 02124 mj_partBox_t tmp_box(*global_box); 02125 initial_partitioning_boxes->push_back(tmp_box); 02126 } 02127 02138 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t, 02139 typename mj_part_t> 02140 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_local_min_max_coord_totW( 02141 mj_lno_t coordinate_begin_index, 02142 mj_lno_t coordinate_end_index, 02143 mj_lno_t *mj_current_coordinate_permutations, 02144 mj_scalar_t *mj_current_dim_coords, 02145 mj_scalar_t &min_coordinate, 02146 mj_scalar_t &max_coordinate, 02147 mj_scalar_t &total_weight){ 02148 02149 //if the part is empty. 02150 //set the min and max coordinates as reverse. 02151 if(coordinate_begin_index >= coordinate_end_index) 02152 { 02153 min_coordinate = this->maxScalar_t; 02154 max_coordinate = this->minScalar_t; 02155 total_weight = 0; 02156 } 02157 else { 02158 mj_scalar_t my_total_weight = 0; 02159 #ifdef HAVE_ZOLTAN2_OMP 02160 #pragma omp parallel 02161 #endif 02162 { 02163 //if uniform weights are used, then weight is equal to count. 02164 if (this->mj_uniform_weights[0]) { 02165 #ifdef HAVE_ZOLTAN2_OMP 02166 #pragma omp single 02167 #endif 02168 { 02169 my_total_weight = coordinate_end_index - coordinate_begin_index; 02170 } 02171 02172 } 02173 else { 02174 //if not uniform, then weights are reducted from threads. 
02175 #ifdef HAVE_ZOLTAN2_OMP 02176 #pragma omp for reduction(+:my_total_weight) 02177 #endif 02178 for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){ 02179 int i = mj_current_coordinate_permutations[ii]; 02180 my_total_weight += this->mj_weights[0][i]; 02181 } 02182 } 02183 02184 int my_thread_id = 0; 02185 #ifdef HAVE_ZOLTAN2_OMP 02186 my_thread_id = omp_get_thread_num(); 02187 #endif 02188 mj_scalar_t my_thread_min_coord, my_thread_max_coord; 02189 my_thread_min_coord=my_thread_max_coord 02190 =mj_current_dim_coords[mj_current_coordinate_permutations[coordinate_begin_index]]; 02191 02192 02193 #ifdef HAVE_ZOLTAN2_OMP 02194 #pragma omp for 02195 #endif 02196 for(mj_lno_t j = coordinate_begin_index + 1; j < coordinate_end_index; ++j){ 02197 int i = mj_current_coordinate_permutations[j]; 02198 if(mj_current_dim_coords[i] > my_thread_max_coord) 02199 my_thread_max_coord = mj_current_dim_coords[i]; 02200 if(mj_current_dim_coords[i] < my_thread_min_coord) 02201 my_thread_min_coord = mj_current_dim_coords[i]; 02202 } 02203 this->max_min_coords[my_thread_id] = my_thread_min_coord; 02204 this->max_min_coords[my_thread_id + this->num_threads] = my_thread_max_coord; 02205 02206 #ifdef HAVE_ZOLTAN2_OMP 02207 //we need a barrier here, because max_min_array might not be filled by some of the threads. 
02208 #pragma omp barrier 02209 #pragma omp single nowait 02210 #endif 02211 { 02212 min_coordinate = this->max_min_coords[0]; 02213 for(int i = 1; i < this->num_threads; ++i){ 02214 if(this->max_min_coords[i] < min_coordinate) 02215 min_coordinate = this->max_min_coords[i]; 02216 } 02217 } 02218 02219 #ifdef HAVE_ZOLTAN2_OMP 02220 #pragma omp single nowait 02221 #endif 02222 { 02223 max_coordinate = this->max_min_coords[this->num_threads]; 02224 for(int i = this->num_threads + 1; i < this->num_threads * 2; ++i){ 02225 if(this->max_min_coords[i] > max_coordinate) 02226 max_coordinate = this->max_min_coords[i]; 02227 } 02228 } 02229 } 02230 total_weight = my_total_weight; 02231 } 02232 } 02233 02234 02242 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t, 02243 typename mj_part_t> 02244 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_global_min_max_coord_totW( 02245 mj_part_t current_concurrent_num_parts, 02246 mj_scalar_t *local_min_max_total, 02247 mj_scalar_t *global_min_max_total){ 02248 02249 //reduce min for first current_concurrent_num_parts elements, reduce max for next 02250 //concurrentPartCount elements, 02251 //reduce sum for the last concurrentPartCount elements. 
02252 if(this->comm->getSize() > 1){ 02253 Teuchos::MultiJaggedCombinedMinMaxTotalReductionOp<int, mj_scalar_t> 02254 reductionOp( 02255 current_concurrent_num_parts, 02256 current_concurrent_num_parts, 02257 current_concurrent_num_parts); 02258 try{ 02259 reduceAll<int, mj_scalar_t>( 02260 *(this->comm), 02261 reductionOp, 02262 3 * current_concurrent_num_parts, 02263 local_min_max_total, 02264 global_min_max_total); 02265 } 02266 Z2_THROW_OUTSIDE_ERROR(*(this->mj_env)) 02267 } 02268 else { 02269 mj_part_t s = 3 * current_concurrent_num_parts; 02270 for (mj_part_t i = 0; i < s; ++i){ 02271 global_min_max_total[i] = local_min_max_total[i]; 02272 } 02273 } 02274 } 02275 02276 02277 02296 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t, 02297 typename mj_part_t> 02298 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_initial_cut_coords_target_weights( 02299 mj_scalar_t min_coord, 02300 mj_scalar_t max_coord, 02301 mj_part_t num_cuts/*p-1*/ , 02302 mj_scalar_t global_weight, 02303 mj_scalar_t *initial_cut_coords /*p - 1 sized, coordinate of each cut line*/, 02304 mj_scalar_t *current_target_part_weights /*cumulative weights, at left side of each cut line. p-1 sized*/, 02305 02306 std::vector <mj_part_t> *future_num_part_in_parts, //the vecto 02307 std::vector <mj_part_t> *next_future_num_parts_in_parts, 02308 mj_part_t concurrent_current_part, 02309 mj_part_t obtained_part_index 02310 ){ 02311 02312 mj_scalar_t coord_range = max_coord - min_coord; 02313 if(this->mj_uniform_parts[0]){ 02314 { 02315 mj_part_t cumulative = 0; 02316 //how many total future parts the part will be partitioned into. 02317 mj_scalar_t total_future_part_count_in_part = mj_scalar_t((*future_num_part_in_parts)[concurrent_current_part]); 02318 02319 02320 //how much each part should weigh in ideal case. 
02321 mj_scalar_t unit_part_weight = global_weight / total_future_part_count_in_part; 02322 /* 02323 cout << "total_future_part_count_in_part:" << total_future_part_count_in_part << endl; 02324 cout << "global_weight:" << global_weight << endl; 02325 cout << "unit_part_weight" << unit_part_weight <<endl; 02326 */ 02327 for(mj_part_t i = 0; i < num_cuts; ++i){ 02328 cumulative += (*next_future_num_parts_in_parts)[i + obtained_part_index]; 02329 02330 /* 02331 cout << "obtained_part_index:" << obtained_part_index << 02332 " (*next_future_num_parts_in_parts)[i + obtained_part_index]:" << (*next_future_num_parts_in_parts)[i + obtained_part_index] << 02333 " cumulative:" << cumulative << endl; 02334 */ 02335 //set target part weight. 02336 current_target_part_weights[i] = cumulative * unit_part_weight; 02337 //cout <<"i:" << i << " current_target_part_weights:" << current_target_part_weights[i] << endl; 02338 //set initial cut coordinate. 02339 initial_cut_coords[i] = min_coord + (coord_range * 02340 cumulative) / total_future_part_count_in_part; 02341 } 02342 current_target_part_weights[num_cuts] = 1; 02343 } 02344 02345 //round the target part weights. 
02346 if (this->mj_uniform_weights[0]){ 02347 for(mj_part_t i = 0; i < num_cuts + 1; ++i){ 02348 current_target_part_weights[i] = long(current_target_part_weights[i] + 0.5); 02349 } 02350 } 02351 } 02352 else { 02353 std::cerr << "MJ does not support non uniform part weights" << std::endl; 02354 exit(1); 02355 } 02356 } 02357 02358 02371 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t, 02372 typename mj_part_t> 02373 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_initial_coordinate_parts( 02374 mj_scalar_t &max_coordinate, 02375 mj_scalar_t &min_coordinate, 02376 mj_part_t &concurrent_current_part_index, 02377 mj_lno_t coordinate_begin_index, 02378 mj_lno_t coordinate_end_index, 02379 mj_lno_t *mj_current_coordinate_permutations, 02380 mj_scalar_t *mj_current_dim_coords, 02381 mj_part_t *mj_part_ids, 02382 mj_part_t &partition_count 02383 ){ 02384 mj_scalar_t coordinate_range = max_coordinate - min_coordinate; 02385 02386 //if there is single point, or if all points are along a line. 02387 //set initial part to 0 for all. 02388 if(ABS(coordinate_range) < this->sEpsilon ){ 02389 #ifdef HAVE_ZOLTAN2_OMP 02390 #pragma omp parallel for 02391 #endif 02392 for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){ 02393 mj_part_ids[mj_current_coordinate_permutations[ii]] = 0; 02394 } 02395 } 02396 else{ 02397 02398 //otherwise estimate an initial part for each coordinate. 02399 //assuming uniform distribution of points. 
        // Width of one uniform slice of the coordinate range.
        mj_scalar_t slice = coordinate_range / partition_count;

#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
        for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){

            mj_lno_t iii = mj_current_coordinate_permutations[ii];
            mj_part_t pp = mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
            // Even ids denote parts (odd ids are reserved for cut lines).
            mj_part_ids[iii] = 2 * pp;
        }
    }
}

/* \brief Performs the 1D partitioning of the given set of concurrent parts
 * along the current dimension: iteratively moves the cut lines until every
 * cut satisfies the imbalance tolerance.
 * \param mj_current_dim_coords coordinate values along the current dimension.
 * \param used_imbalance_tolerance imbalance tolerance for this level.
 * \param current_work_part index of the first part handled in this round.
 * \param current_concurrent_num_parts number of parts partitioned concurrently.
 * \param current_cut_coordinates output array for the resulting cut coordinates.
 * \param total_incomplete_cut_count initial number of cuts not yet determined.
 * \param num_partitioning_in_current_dim number of parts each part is split into.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part(
        mj_scalar_t *mj_current_dim_coords,
        mj_scalar_t used_imbalance_tolerance,
        mj_part_t current_work_part,
        mj_part_t current_concurrent_num_parts,
        mj_scalar_t *current_cut_coordinates,
        mj_part_t total_incomplete_cut_count,
        std::vector <mj_part_t> &num_partitioning_in_current_dim
){

    mj_part_t rectilinear_cut_count = 0;
    mj_scalar_t *temp_cut_coords = current_cut_coordinates;

    // Reduction operator combining part weights and left/right closest points
    // across processes in a single reduceAll.
    Teuchos::MultiJaggedCombinedReductionOp<mj_part_t, mj_scalar_t>
        *reductionOp = NULL;
    reductionOp = new Teuchos::MultiJaggedCombinedReductionOp
        <mj_part_t, mj_scalar_t>(
            &num_partitioning_in_current_dim ,
            current_work_part ,
            current_concurrent_num_parts);

    size_t total_reduction_size = 0;
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel shared(total_incomplete_cut_count, rectilinear_cut_count)
#endif
    {
        int me = 0;
#ifdef HAVE_ZOLTAN2_OMP
        me = omp_get_thread_num();
#endif
        double *my_thread_part_weights = this->thread_part_weights[me];
        mj_scalar_t *my_thread_left_closest = this->thread_cut_left_closest_point[me];
        mj_scalar_t *my_thread_right_closest = this->thread_cut_right_closest_point[me];

#ifdef HAVE_ZOLTAN2_OMP
#pragma omp single
#endif
        {
            //initialize the lower and upper bounds of the cuts.
            mj_part_t next = 0;
            for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){

                mj_part_t num_part_in_dim = num_partitioning_in_current_dim[current_work_part + i];
                mj_part_t num_cut_in_dim = num_part_in_dim - 1;
                // Per part the reduction carries 2p-1 weights + 2(p-1) closest points.
                total_reduction_size += (4 * num_cut_in_dim + 1);

                for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii){
                    this->is_cut_line_determined[next] = false;
                    this->cut_lower_bound_coordinates[next] = global_min_max_coord_total_weight[i]; //min coordinate
                    this->cut_upper_bound_coordinates[next] = global_min_max_coord_total_weight[i + current_concurrent_num_parts]; //max coordinate

                    this->cut_upper_bound_weights[next] = global_min_max_coord_total_weight[i + 2 * current_concurrent_num_parts]; //total weight
                    this->cut_lower_bound_weights[next] = 0;

                    if(this->distribute_points_on_cut_lines){
                        this->process_cut_line_weight_to_put_left[next] = 0;
                    }
                    ++next;
                }
            }
        }

        //no need to have barrier here.
        //pragma omp single have implicit barrier.

        int iteration = 0;
        // Iterate until every cut of every concurrent part is finalized.
        while (total_incomplete_cut_count != 0){
            iteration += 1;
            mj_part_t concurrent_cut_shifts = 0;
            size_t total_part_shift = 0;

            for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
                mj_part_t num_parts = -1;
                num_parts = num_partitioning_in_current_dim[current_work_part + kk];

                mj_part_t num_cuts = num_parts - 1;
                size_t total_part_count = num_parts + size_t (num_cuts) ;
                if (this->my_incomplete_cut_count[kk] > 0){

                    //although isDone shared, currentDone is private and same for all.
                    bool *current_cut_status = this->is_cut_line_determined + concurrent_cut_shifts;
                    double *my_current_part_weights = my_thread_part_weights + total_part_shift;
                    mj_scalar_t *my_current_left_closest = my_thread_left_closest + concurrent_cut_shifts;
                    mj_scalar_t *my_current_right_closest = my_thread_right_closest + concurrent_cut_shifts;

                    mj_part_t conccurent_current_part = current_work_part + kk;
                    mj_lno_t coordinate_begin_index = conccurent_current_part ==0 ? 0: this->part_xadj[conccurent_current_part -1];
                    mj_lno_t coordinate_end_index = this->part_xadj[conccurent_current_part];
                    mj_scalar_t *temp_current_cut_coords = temp_cut_coords + concurrent_cut_shifts;

                    mj_scalar_t min_coord = global_min_max_coord_total_weight[kk];
                    mj_scalar_t max_coord = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];

                    // compute part weights using existing cuts
                    this->mj_1D_part_get_thread_part_weights(
                        total_part_count,
                        num_cuts,
                        max_coord,
                        min_coord,
                        coordinate_begin_index,
                        coordinate_end_index,
                        mj_current_dim_coords,
                        temp_current_cut_coords,
                        current_cut_status,
                        my_current_part_weights,
                        my_current_left_closest,
                        my_current_right_closest);
                }

                concurrent_cut_shifts += num_cuts;
                total_part_shift += total_part_count;
            }

            //sum up the results of threads
            this->mj_accumulate_thread_results(
                num_partitioning_in_current_dim,
                current_work_part,
                current_concurrent_num_parts);

            //now sum up the results of mpi processors.
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp single
#endif
            {
                if(this->comm->getSize() > 1){
                    try{
                        reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
                                total_reduction_size,
                                this->total_part_weight_left_right_closests,
                                this->global_total_part_weight_left_right_closests);
                    }
                    Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
                }
                else {
                    // Single process: local results are the global results.
                    memcpy(
                        this->global_total_part_weight_left_right_closests,
                        this->total_part_weight_left_right_closests,
                        total_reduction_size * sizeof(mj_scalar_t));
                }
            }

            //how much cut will be shifted for the next part in the concurrent part calculation.
            mj_part_t cut_shift = 0;

            //how much the concatenated array will be shifted for the next part in concurrent part calculation.
            size_t tlr_shift = 0;
            for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
                mj_part_t num_parts = num_partitioning_in_current_dim[current_work_part + kk];
                mj_part_t num_cuts = num_parts - 1;
                size_t num_total_part = num_parts + size_t (num_cuts) ;

                //if the cuts of this part have already been completed.
                //nothing to do for this part.
                //just update the shift amount and proceed.
                if (this->my_incomplete_cut_count[kk] == 0) {
                    cut_shift += num_cuts;
                    tlr_shift += (num_total_part + 2 * num_cuts);
                    continue;
                }

                // Layout of the reduced array per part: [2p-1 weights][p-1 left closest][p-1 right closest].
                mj_scalar_t *current_local_part_weights = this->total_part_weight_left_right_closests + tlr_shift ;
                mj_scalar_t *current_global_tlr = this->global_total_part_weight_left_right_closests + tlr_shift;
                mj_scalar_t *current_global_left_closest_points = current_global_tlr + num_total_part; //left closest points
                mj_scalar_t *current_global_right_closest_points = current_global_tlr + num_total_part + num_cuts; //right closest points
                mj_scalar_t *current_global_part_weights = current_global_tlr;
                bool *current_cut_line_determined = this->is_cut_line_determined + cut_shift;

                mj_scalar_t *current_part_target_weights = this->target_part_weights + cut_shift + kk;
                mj_scalar_t *current_part_cut_line_weight_to_put_left = this->process_cut_line_weight_to_put_left + cut_shift;

                mj_scalar_t min_coordinate = global_min_max_coord_total_weight[kk];
                mj_scalar_t max_coordinate = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
                mj_scalar_t global_total_weight = global_min_max_coord_total_weight[kk + current_concurrent_num_parts * 2];
                mj_scalar_t *current_cut_lower_bound_weights = this->cut_lower_bound_weights + cut_shift;
                mj_scalar_t *current_cut_upper_weights = this->cut_upper_bound_weights + cut_shift;
                mj_scalar_t *current_cut_upper_bounds = this->cut_upper_bound_coordinates + cut_shift;
                mj_scalar_t *current_cut_lower_bounds = this->cut_lower_bound_coordinates + cut_shift;

                mj_part_t initial_incomplete_cut_count = this->my_incomplete_cut_count[kk];

                // Now compute the new cut coordinates.
                this->mj_get_new_cut_coordinates(
                    num_total_part,
                    num_cuts,
                    max_coordinate,
                    min_coordinate,
                    global_total_weight,
                    used_imbalance_tolerance,
                    current_global_part_weights,
                    current_local_part_weights,
                    current_part_target_weights,
                    current_cut_line_determined,
                    temp_cut_coords + cut_shift,
                    current_cut_upper_bounds,
                    current_cut_lower_bounds,
                    current_global_left_closest_points,
                    current_global_right_closest_points,
                    current_cut_lower_bound_weights,
                    current_cut_upper_weights,
                    this->cut_coordinates_work_array +cut_shift, //new cut coordinates
                    current_part_cut_line_weight_to_put_left,
                    &rectilinear_cut_count,
                    this->my_incomplete_cut_count[kk]);

                cut_shift += num_cuts;
                tlr_shift += (num_total_part + 2 * num_cuts);
                mj_part_t iteration_complete_cut_count = initial_incomplete_cut_count - this->my_incomplete_cut_count[kk];
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp single
#endif
                {
                    // Only one thread updates the shared incomplete-cut counter.
                    total_incomplete_cut_count -= iteration_complete_cut_count;
                }

            }
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp barrier
#pragma omp single
#endif
            {
                //swap the cut coordinates for next iteration.
                mj_scalar_t *t = temp_cut_coords;
                temp_cut_coords = this->cut_coordinates_work_array;
                this->cut_coordinates_work_array = t;
            }
        }

        // Needed only if keep_cuts; otherwise can simply swap array pointers
        // cutCoordinates and cutCoordinatesWork.
        // (at first iteration, cutCoordinates == cutCoorindates_tmp).
        // computed cuts must be in cutCoordinates.
// (continuation of the enclosing 1-D partitioning driver begun above)
// If the finally-computed cuts ended up in the work array, copy them back into
// current_cut_coordinates (needed only when cuts are kept; see note above).
if (current_cut_coordinates != temp_cut_coords){
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp single
#endif
    {
        mj_part_t next = 0;
        for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
            mj_part_t num_parts = -1;
            num_parts = num_partitioning_in_current_dim[current_work_part + i];
            mj_part_t num_cuts = num_parts - 1;

            // copy the cuts of this concurrent part back to the output array.
            for(mj_part_t ii = 0; ii < num_cuts; ++ii){
                current_cut_coordinates[next + ii] = temp_cut_coords[next + ii];
            }
            next += num_cuts;
        }
    }

#ifdef HAVE_ZOLTAN2_OMP
#pragma omp single
#endif
    {
        // hand the (now stale) buffer back as the work array for the next call.
        this->cut_coordinates_work_array = temp_cut_coords;
    }
}
}
// NOTE(review): raw owning pointer released manually; presumably allocated
// earlier in this function — confirm no early exit can leak it.
delete reductionOp;
}


/*! \brief Computes this thread's part weights and the left/right closest
 *  coordinates to each cut, for the coordinates assigned to this thread in
 *  [coordinate_begin_index, coordinate_end_index).
 *
 *  Weight-array layout (established by the updates below): entry 2*p holds the
 *  weight strictly inside part p; entry 2*c+1 holds the weight sitting exactly
 *  on cut c. At the end, a prefix sum converts these to cumulative weights.
 *
 *  \param total_part_count  number of entries in my_current_part_weights
 *                           (parts plus cut lines).
 *  \param num_cuts          number of cut lines for this part.
 *  \param max_coord,min_coord  global max/min coordinate in this dimension;
 *                           used only to initialize the closest-point arrays.
 *  \param coordinate_begin_index,coordinate_end_index  range in
 *                           coordinate_permutations handled here.
 *  \param mj_current_dim_coords  coordinates in the current dimension.
 *  \param temp_current_cut_coords  current (tentative) cut coordinates.
 *  \param current_cut_status  NOTE(review): unused in this body.
 *  \param my_current_part_weights  output: per-part / per-cut weights.
 *  \param my_current_left_closest,my_current_right_closest  output: closest
 *                           coordinate to each cut from its left/right.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part_get_thread_part_weights(
    size_t total_part_count,
    mj_part_t num_cuts,
    mj_scalar_t max_coord,
    mj_scalar_t min_coord,
    mj_lno_t coordinate_begin_index,
    mj_lno_t coordinate_end_index,
    mj_scalar_t *mj_current_dim_coords,
    mj_scalar_t *temp_current_cut_coords,
    bool *current_cut_status,
    double *my_current_part_weights,
    mj_scalar_t *my_current_left_closest,
    mj_scalar_t *my_current_right_closest){

    // initializations for part weights, left/right closest
    for (size_t i = 0; i < total_part_count; ++i){
        my_current_part_weights[i] = 0;
    }

    //initialize the left and right closest coordinates
    //to their max value.
    // (sentinels strictly outside [min_coord, max_coord], so "not set yet"
    // is detectable later)
    for(mj_part_t i = 0; i < num_cuts; ++i){
        my_current_left_closest[i] = min_coord - 1;
        my_current_right_closest[i] = max_coord + 1;
    }
    //mj_lno_t comparison_count = 0;
    mj_scalar_t minus_EPSILON = -this->sEpsilon;
#ifdef HAVE_ZOLTAN2_OMP
    //no need for the barrier as all threads uses their local memories.
    //dont change the static scheduling here, as it is assumed when the new
    //partitions are created later.
#pragma omp for
#endif
    for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
        // NOTE(review): 'int' narrows mj_lno_t — overflows if the local
        // coordinate count exceeds INT_MAX; consider mj_lno_t. TODO confirm.
        int i = this->coordinate_permutations[ii];

        //the accesses to assigned_part_ids are thread safe
        //since each coordinate is assigned to only a single thread.
        // seed the binary search from the coordinate's previous cut index.
        mj_part_t j = this->assigned_part_ids[i] / 2;

        if(j >= num_cuts){
            j = num_cuts - 1;
        }

        // binary-search bounds over the cut indices.
        mj_part_t lower_cut_index = 0;
        mj_part_t upper_cut_index = num_cuts - 1;

        mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
        bool is_inserted = false;
        bool is_on_left_of_cut = false;
        bool is_on_right_of_cut = false;
        mj_part_t last_compared_part = -1;

        mj_scalar_t coord = mj_current_dim_coords[i];

        // binary search for the cut (or part) this coordinate belongs to.
        while(upper_cut_index >= lower_cut_index)
        {
            //comparison_count++;
            last_compared_part = -1;
            is_on_left_of_cut = false;
            is_on_right_of_cut = false;
            mj_scalar_t cut = temp_current_cut_coords[j];
            mj_scalar_t distance_to_cut = coord - cut;
            mj_scalar_t abs_distance_to_cut = ABS(distance_to_cut);

            //if it is on the line.
            if(abs_distance_to_cut < this->sEpsilon){

                my_current_part_weights[j * 2 + 1] += w;
                this->assigned_part_ids[i] = j * 2 + 1;

                //assign left and right closest point to cut as the point is on the cut.
                my_current_left_closest[j] = coord;
                my_current_right_closest[j] = coord;
                //now we need to check if there are other cuts on the same cut coordinate.
                //if there are, then we add the weight of the cut to all cuts in the same coordinate.
                mj_part_t kk = j + 1;
                while(kk < num_cuts){
                    // Needed when cuts shared the same position
                    distance_to_cut =ABS(temp_current_cut_coords[kk] - cut);
                    if(distance_to_cut < this->sEpsilon){
                        my_current_part_weights[2 * kk + 1] += w;
                        my_current_left_closest[kk] = coord;
                        my_current_right_closest[kk] = coord;
                        kk++;
                    }
                    else{
                        //cut is far away.
                        //just check the left closest point for the next cut.
                        if(coord - my_current_left_closest[kk] > this->sEpsilon){
                            my_current_left_closest[kk] = coord;
                        }
                        break;
                    }
                }


                kk = j - 1;
                //continue checking for the cuts on the left if they share the same coordinate.
                while(kk >= 0){
                    distance_to_cut =ABS(temp_current_cut_coords[kk] - cut);
                    if(distance_to_cut < this->sEpsilon){
                        my_current_part_weights[2 * kk + 1] += w;
                        //try to write the partId as the leftmost cut.
                        this->assigned_part_ids[i] = kk * 2 + 1;
                        my_current_left_closest[kk] = coord;
                        my_current_right_closest[kk] = coord;
                        kk--;
                    }
                    else{
                        //if cut is far away on the left of the point.
                        //then just compare for right closest point.
                        if(my_current_right_closest[kk] - coord > this->sEpsilon){
                            my_current_right_closest[kk] = coord;
                        }
                        break;
                    }
                }

                is_inserted = true;
                break;
            }
            else {
                //if point is on the left of the cut.
                if (distance_to_cut < 0) {
                    bool _break = false;
                    if(j > 0){
                        //check distance to the cut on the left the current cut compared.
                        //if point is on the right, then we find the part of the point.
                        mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j - 1];
                        if(distance_to_next_cut > this->sEpsilon){
                            _break = true;
                        }
                    }
                    //if point is not on the right of the next cut, then
                    //set the upper bound to this cut.
                    upper_cut_index = j - 1;
                    //set the last part, and mark it as on the left of the last part.
                    is_on_left_of_cut = true;
                    last_compared_part = j;
                    if(_break) break;
                }
                else {
                    //if point is on the right of the cut.
                    bool _break = false;
                    if(j < num_cuts - 1){
                        //check distance to the cut on the left the current cut compared.
                        //if point is on the right, then we find the part of the point.
                        mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j + 1];
                        if(distance_to_next_cut < minus_EPSILON){
                            _break = true;
                        }
                    }

                    //if point is not on the left of the next cut, then
                    //set the upper bound to this cut.
                    lower_cut_index = j + 1;
                    //set the last part, and mark it as on the right of the last part.
                    is_on_right_of_cut = true;
                    last_compared_part = j;
                    if(_break) break;
                }
            }

            j = (upper_cut_index + lower_cut_index) / 2;
        }
        // binary search terminated without landing on a cut: attribute the
        // coordinate to the part beside the last compared cut.
        if(!is_inserted){
            if(is_on_right_of_cut){

                //add it to the right of the last compared part.
                my_current_part_weights[2 * last_compared_part + 2] += w;
                this->assigned_part_ids[i] = 2 * last_compared_part + 2;

                //update the right closest point of last compared cut.
                if(my_current_right_closest[last_compared_part] - coord > this->sEpsilon){
                    my_current_right_closest[last_compared_part] = coord;
                }
                //update the left closest point of the cut on the right of the last compared cut.
                if(last_compared_part+1 < num_cuts){

                    if(coord - my_current_left_closest[last_compared_part + 1] > this->sEpsilon){
                        my_current_left_closest[last_compared_part + 1] = coord;
                    }
                }

            }
            else if(is_on_left_of_cut){

                //add it to the left of the last compared part.
                my_current_part_weights[2 * last_compared_part] += w;
                this->assigned_part_ids[i] = 2 * last_compared_part;


                //update the left closest point of last compared cut.
                if(coord - my_current_left_closest[last_compared_part] > this->sEpsilon){
                    my_current_left_closest[last_compared_part] = coord;
                }

                //update the right closest point of the cut on the left of the last compared cut.
                if(last_compared_part-1 >= 0){
                    if(my_current_right_closest[last_compared_part -1] - coord > this->sEpsilon){
                        my_current_right_closest[last_compared_part -1] = coord;
                    }
                }
            }
        }
    }

    // prefix sum computation.
    //we need prefix sum for each part to determine cut positions.
    for (size_t i = 1; i < total_part_count; ++i){
        // check for cuts sharing the same position; all cuts sharing a position
        // have the same weight == total weight for all cuts sharing the position.
        // don't want to accumulate that total weight more than once.
        if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
            ABS(temp_current_cut_coords[i / 2] - temp_current_cut_coords[i /2 - 1])
            < this->sEpsilon){
            //i % 2 = 0 when part i represents the cut coordinate.
            //if it is a cut, and if the next cut also have the same coordinate, then
            //dont addup.
            my_current_part_weights[i] = my_current_part_weights[i-2];
            continue;
        }
        //otherwise do the prefix sum.
        my_current_part_weights[i] += my_current_part_weights[i-1];
    }
}


/*! \brief Reduces the per-thread part weights and per-thread left/right
 *  closest points into the process-wide array
 *  total_part_weight_left_right_closests, for all concurrently processed
 *  parts. Runs entirely inside one omp single (thread 0 does the work; the
 *  single's implicit barrier synchronizes the rest).
 *
 *  \param num_partitioning_in_current_dim  #parts each part is split into.
 *  \param current_work_part  index of the first concurrently processed part.
 *  \param current_concurrent_num_parts  how many parts are processed at once.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_accumulate_thread_results(
    const std::vector <mj_part_t> &num_partitioning_in_current_dim,
    mj_part_t current_work_part,
    mj_part_t current_concurrent_num_parts){

#ifdef HAVE_ZOLTAN2_OMP
    //needs barrier here, as it requires all threads to finish mj_1D_part_get_thread_part_weights
    //using parallel region here reduces the performance because of the cache invalidates.
#pragma omp barrier
#pragma omp single
#endif
    {
        size_t tlr_array_shift = 0;
        mj_part_t cut_shift = 0;

        //iterate for all concurrent parts to find the left and right closest points in the process.
        for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){

            mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i];
            mj_part_t num_cuts_in_part = num_parts_in_part - 1;
            size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;

            //iterate for cuts in a single part.
            for(mj_part_t ii = 0; ii < num_cuts_in_part ; ++ii){
                // NOTE(review): size_t expressions narrowed into mj_part_t
                // here — TODO confirm mj_part_t is wide enough for the shifts.
                mj_part_t next = tlr_array_shift + ii;
                mj_part_t cut_index = cut_shift + ii;
                if(this->is_cut_line_determined[cut_index]) continue;
                mj_scalar_t left_closest_in_process = this->thread_cut_left_closest_point[0][cut_index],
                    right_closest_in_process = this->thread_cut_right_closest_point[0][cut_index];

                //find the closest points from left and right for the cut in the process.
02982 for (int j = 1; j < this->num_threads; ++j){ 02983 if (this->thread_cut_right_closest_point[j][cut_index] < right_closest_in_process ){ 02984 right_closest_in_process = this->thread_cut_right_closest_point[j][cut_index]; 02985 } 02986 if (this->thread_cut_left_closest_point[j][cut_index] > left_closest_in_process ){ 02987 left_closest_in_process = this->thread_cut_left_closest_point[j][cut_index]; 02988 } 02989 } 02990 //store the left and right closes points. 02991 this->total_part_weight_left_right_closests[num_total_part_in_part + 02992 next] = left_closest_in_process; 02993 this->total_part_weight_left_right_closests[num_total_part_in_part + 02994 num_cuts_in_part + next] = right_closest_in_process; 02995 } 02996 //set the shift position in the arrays 02997 tlr_array_shift += (num_total_part_in_part + 2 * num_cuts_in_part); 02998 cut_shift += num_cuts_in_part; 02999 } 03000 03001 tlr_array_shift = 0; 03002 cut_shift = 0; 03003 size_t total_part_array_shift = 0; 03004 03005 //iterate for all concurrent parts to find the total weight in the process. 03006 for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){ 03007 03008 mj_part_t num_parts_in_part = num_partitioning_in_current_dim[current_work_part + i]; 03009 mj_part_t num_cuts_in_part = num_parts_in_part - 1; 03010 size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ; 03011 03012 for(size_t j = 0; j < num_total_part_in_part; ++j){ 03013 03014 mj_part_t cut_ind = j / 2 + cut_shift; 03015 03016 //need to check j != num_total_part_in_part - 1 03017 // which is same as j/2 != num_cuts_in_part. 03018 //we cannot check it using cut_ind, because of the concurrent part concantanetion. 
03019 if(j != num_total_part_in_part - 1 && this->is_cut_line_determined[cut_ind]) continue; 03020 double pwj = 0; 03021 for (int k = 0; k < this->num_threads; ++k){ 03022 pwj += this->thread_part_weights[k][total_part_array_shift + j]; 03023 } 03024 //size_t jshift = j % total_part_count + i * (total_part_count + 2 * noCuts); 03025 this->total_part_weight_left_right_closests[tlr_array_shift + j] = pwj; 03026 } 03027 cut_shift += num_cuts_in_part; 03028 tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part; 03029 total_part_array_shift += num_total_part_in_part; 03030 } 03031 } 03032 //the other threads needs to wait here. 03033 //but we don't need a pragma omp barrier. 03034 //as omp single has already have implicit barrier. 03035 } 03036 03037 03047 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t, 03048 typename mj_part_t> 03049 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_calculate_new_cut_position ( 03050 mj_scalar_t cut_upper_bound, 03051 mj_scalar_t cut_lower_bound, 03052 mj_scalar_t cut_upper_weight, 03053 mj_scalar_t cut_lower_weight, 03054 mj_scalar_t expected_weight, 03055 mj_scalar_t &new_cut_position){ 03056 03057 if(ABS(cut_upper_bound - cut_lower_bound) < this->sEpsilon){ 03058 new_cut_position = cut_upper_bound; //or lower bound does not matter. 
03059 } 03060 03061 03062 if(ABS(cut_upper_weight - cut_lower_weight) < this->sEpsilon){ 03063 new_cut_position = cut_lower_bound; 03064 } 03065 03066 mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound); 03067 mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight); 03068 mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight); 03069 03070 mj_scalar_t required_shift = (my_weight_diff / weight_range); 03071 int scale_constant = 20; 03072 int shiftint= int (required_shift * scale_constant); 03073 if (shiftint == 0) shiftint = 1; 03074 required_shift = mj_scalar_t (shiftint) / scale_constant; 03075 new_cut_position = coordinate_range * required_shift + cut_lower_bound; 03076 } 03077 03078 03089 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t, 03090 typename mj_part_t> 03091 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_create_new_partitions( 03092 mj_part_t num_parts, 03093 mj_scalar_t *mj_current_dim_coords, 03094 mj_scalar_t *current_concurrent_cut_coordinate, 03095 mj_lno_t coordinate_begin, 03096 mj_lno_t coordinate_end, 03097 mj_scalar_t *used_local_cut_line_weight_to_left, 03098 double **used_thread_part_weight_work, 03099 mj_lno_t *out_part_xadj){ 03100 03101 mj_part_t num_cuts = num_parts - 1; 03102 03103 #ifdef HAVE_ZOLTAN2_OMP 03104 #pragma omp parallel 03105 #endif 03106 { 03107 int me = 0; 03108 #ifdef HAVE_ZOLTAN2_OMP 03109 me = omp_get_thread_num(); 03110 #endif 03111 03112 mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me]; 03113 mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL; 03114 03115 //now if the rectilinear partitioning is allowed we decide how 03116 //much weight each thread should put to left and right. 03117 if (this->distribute_points_on_cut_lines){ 03118 my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me]; 03119 // this for assumes the static scheduling in mj_1D_part calculation. 
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp for
#endif
            // (continuation of mj_create_new_partitions) split each cut's
            // left-weight quota across threads, in thread order.
            for (mj_part_t i = 0; i < num_cuts; ++i){
                //the left to be put on the left of the cut.
                mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
                for(int ii = 0; ii < this->num_threads; ++ii){
                    if(left_weight > this->sEpsilon){
                        //the weight of thread ii on cut.
                        mj_scalar_t thread_ii_weight_on_cut = used_thread_part_weight_work[ii][i * 2 + 1] - used_thread_part_weight_work[ii][i * 2 ];
                        if(thread_ii_weight_on_cut < left_weight){
                            //if left weight is bigger than threads weight on cut.
                            this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
                        }
                        else {
                            //if thread's weight is bigger than space, then put only a portion.
                            this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
                        }
                        left_weight -= thread_ii_weight_on_cut;
                    }
                    else {
                        this->thread_cut_line_weight_to_put_left[ii][i] = 0;
                    }
                }
            }

            if(num_cuts > 0){
                //this is a special case. If cutlines share the same coordinate, their weights are equal.
                //we need to adjust the ratio for that.
                for (mj_part_t i = num_cuts - 1; i > 0 ; --i){
                    if(ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
                        my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
                    }
                    // truncate to SIGNIFICANCE_MUL decimal resolution so tiny
                    // floating-point residue cannot flip the space checks below.
                    my_local_thread_cut_weights_to_put_left[i] = int ((my_local_thread_cut_weights_to_put_left[i] + LEAST_SIGNIFICANCE) * SIGNIFICANCE_MUL)
                        / mj_scalar_t(SIGNIFICANCE_MUL);
                }
            }
        }

        // reset this thread's per-part point counters.
        for(mj_part_t ii = 0; ii < num_parts; ++ii){
            thread_num_points_in_parts[ii] = 0;
        }


#ifdef HAVE_ZOLTAN2_OMP
        //dont change static scheduler. the static partitioner used later as well.
#pragma omp for
#endif
        for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){

            mj_lno_t coordinate_index = this->coordinate_permutations[ii];
            mj_scalar_t coordinate_weight = this->mj_uniform_weights[0]? 1:this->mj_weights[0][coordinate_index];
            mj_part_t coordinate_assigned_place = this->assigned_part_ids[coordinate_index];
            // assigned_part_ids encoding: odd = on cut (cut index = value/2),
            // even = inside part (part index = value/2).
            mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
            if(coordinate_assigned_place % 2 == 1){
                //if it is on the cut.
                if(this->distribute_points_on_cut_lines
                    && my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] > this->sEpsilon){
                    //if the rectilinear partitioning is allowed,
                    //and the thread has still space to put on the left of the cut
                    //then thread puts the vertex to left.
                    my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
                    //if putting the vertex to left increased the weight more than expected.
                    //and if the next cut is on the same coordinate,
                    //then we need to adjust how much weight next cut puts to its left as well,
                    //in order to take care of the imbalance.
                    if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0
                        && coordinate_assigned_part < num_cuts - 1
                        && ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
                            current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
                        my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
                    }
                    ++thread_num_points_in_parts[coordinate_assigned_part];
                    this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
                }
                else{
                    //if there is no more space on the left, put the coordinate to the right of the cut.
                    ++coordinate_assigned_part;
                    //this while loop is necessary when a line is partitioned into more than 2 parts.
                    while(this->distribute_points_on_cut_lines &&
                        coordinate_assigned_part < num_cuts){
                        //traverse all the cut lines having the same partitiong
                        if(ABS(current_concurrent_cut_coordinate[coordinate_assigned_part] -
                            current_concurrent_cut_coordinate[coordinate_assigned_part - 1])
                            < this->sEpsilon){
                            //if line has enough space on left, put it there.
                            if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >
                                this->sEpsilon &&
                                my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >=
                                ABS(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] - coordinate_weight)){
                                my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
                                //Again if it put too much on left of the cut,
                                //update how much the next cut sharing the same coordinate will put to its left.
                                if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0 &&
                                    coordinate_assigned_part < num_cuts - 1 &&
                                    ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
                                        current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
                                    my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
                                }
                                break;
                            }
                        }
                        else {
                            break;
                        }
                        ++coordinate_assigned_part;
                    }
                    ++thread_num_points_in_parts[coordinate_assigned_part];
                    this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
                }
            }
            else {
                //if it is already assigned to a part, then just put it to the corresponding part.
                ++thread_num_points_in_parts[coordinate_assigned_part];
                this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
            }
        }



        //now we calculate where each thread will write in new_coordinate_permutations array.
        //first we find the out_part_xadj, by marking the begin and end points of each part found.
        //the below loop find the number of points in each part, and writes it to out_part_xadj
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp for
#endif
        for(mj_part_t j = 0; j < num_parts; ++j){
            mj_lno_t num_points_in_part_j_upto_thread_i = 0;
            for (int i = 0; i < this->num_threads; ++i){
                mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
                //prefix sum to thread point counts, so that each will have private space to write.
                this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
                num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;

            }
            out_part_xadj[j] = num_points_in_part_j_upto_thread_i;// + prev2; //+ coordinateBegin;
        }

        //now we need to do a prefix sum to out_part_xadj[j], to point begin and end of each part.
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp single
#endif
        {
            //perform prefix sum for num_points in parts.
            for(mj_part_t j = 1; j < num_parts; ++j){
                out_part_xadj[j] += out_part_xadj[j - 1];
            }
        }

        //shift the num points in threads thread to obtain the
        //beginning index of each thread's private space.
        for(mj_part_t j = 1; j < num_parts; ++j){
            thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
        }


        //now thread gets the coordinate and writes the index of coordinate to the permutation array
        //using the part index we calculated.
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp for
#endif
        for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
            mj_lno_t i = this->coordinate_permutations[ii];
            mj_part_t p = this->assigned_part_ids[i];
            this->new_coordinate_permutations[coordinate_begin +
                thread_num_points_in_parts[p]++] = i;
        }
    }
}


/*! \brief One iteration of the weighted-bisection search: given the global
 *  part weights and closest points, decides for each cut whether it is
 *  balanced (within used_imbalance_tolerance) and, if not, computes a new
 *  trial coordinate; tracks cuts needing rectilinear weight splitting.
 *  (Body continues below this chunk.)
 *
 *  \param num_total_part  number of parts + cuts for this part.
 *  \param num_cuts  number of cut lines.
 *  \param max_coordinate,min_coordinate  coordinate range of this part.
 *  \param global_total_weight  total weight of the part across processes.
 *  \param used_imbalance_tolerance  allowed imbalance for this level.
 *  \param current_global_part_weights,current_local_part_weights  global and
 *         process-local cumulative weights (even = part, odd = on-cut).
 *  \param current_part_target_weights  target cumulative weight per cut.
 *  \param current_cut_line_determined  in/out: cuts already finalized.
 *  \param current_cut_coordinates  current trial cut coordinates.
 *  \param current_cut_upper_bounds,current_cut_lower_bounds  search bounds.
 *  \param current_global_left_closest_points,current_global_right_closest_points
 *         closest coordinates around each cut (globally reduced).
 *  \param current_cut_lower_bound_weights,current_cut_upper_weights  weights
 *         at the search bounds.
 *  \param new_current_cut_coordinates  output: next trial coordinates.
 *  \param current_part_cut_line_weight_to_put_left  output: weight this
 *         process places left of each shared cut (rectilinear mode).
 *  \param rectilinear_cut_count  in/out: #cuts resolved by weight splitting.
 *  \param my_num_incomplete_cut  in/out: #cuts still being searched.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_new_cut_coordinates(
    const size_t &num_total_part,
    const mj_part_t &num_cuts,
    const mj_scalar_t &max_coordinate,
    const mj_scalar_t &min_coordinate,
    const mj_scalar_t &global_total_weight,
    const mj_scalar_t &used_imbalance_tolerance,
    mj_scalar_t * current_global_part_weights,
    const mj_scalar_t * current_local_part_weights,
    const mj_scalar_t *current_part_target_weights,
    bool *current_cut_line_determined,
    mj_scalar_t *current_cut_coordinates,
    mj_scalar_t *current_cut_upper_bounds,
    mj_scalar_t *current_cut_lower_bounds,
    mj_scalar_t *current_global_left_closest_points,
    mj_scalar_t *current_global_right_closest_points,
    mj_scalar_t * current_cut_lower_bound_weights,
    mj_scalar_t * current_cut_upper_weights,
    mj_scalar_t *new_current_cut_coordinates,
    mj_scalar_t *current_part_cut_line_weight_to_put_left,
    mj_part_t *rectilinear_cut_count,
    mj_part_t &my_num_incomplete_cut){

    //seen weight in the part
    mj_scalar_t seen_weight_in_part = 0;
    //expected weight for part.
    mj_scalar_t expected_weight_in_part = 0;
    //imbalance for the left and right side of the cut.
03350 mj_scalar_t imbalance_on_left = 0, imbalance_on_right = 0; 03351 03352 03353 #ifdef HAVE_ZOLTAN2_OMP 03354 #pragma omp for 03355 #endif 03356 for (mj_part_t i = 0; i < num_cuts; i++){ 03357 //if left and right closest points are not set yet, 03358 //set it to the cut itself. 03359 if(min_coordinate - current_global_left_closest_points[i] > this->sEpsilon) 03360 current_global_left_closest_points[i] = current_cut_coordinates[i]; 03361 if(current_global_right_closest_points[i] - max_coordinate > this->sEpsilon) 03362 current_global_right_closest_points[i] = current_cut_coordinates[i]; 03363 03364 } 03365 #ifdef HAVE_ZOLTAN2_OMP 03366 #pragma omp for 03367 #endif 03368 for (mj_part_t i = 0; i < num_cuts; i++){ 03369 03370 if(this->distribute_points_on_cut_lines){ 03371 //init the weight on the cut. 03372 this->global_rectilinear_cut_weight[i] = 0; 03373 this->process_rectilinear_cut_weight[i] = 0; 03374 } 03375 //if already determined at previous iterations, 03376 //then just write the coordinate to new array, and proceed. 03377 if(current_cut_line_determined[i]) { 03378 new_current_cut_coordinates[i] = current_cut_coordinates[i]; 03379 continue; 03380 } 03381 03382 //current weight of the part at the left of the cut line. 
03383 seen_weight_in_part = current_global_part_weights[i * 2]; 03384 03385 /* 03386 cout << "seen_weight_in_part:" << i << " is "<< seen_weight_in_part << endl; 03387 cout << "\tcut:" << current_cut_coordinates[i] 03388 << " current_cut_lower_bounds:" << current_cut_lower_bounds[i] 03389 << " current_cut_upper_bounds:" << current_cut_upper_bounds[i] << endl; 03390 */ 03391 //expected ratio 03392 expected_weight_in_part = current_part_target_weights[i]; 03393 //leftImbalance = imbalanceOf(seenW, globalTotalWeight, expected); 03394 imbalance_on_left = imbalanceOf2(seen_weight_in_part, expected_weight_in_part); 03395 //rightImbalance = imbalanceOf(globalTotalWeight - seenW, globalTotalWeight, 1 - expected); 03396 imbalance_on_right = imbalanceOf2(global_total_weight - seen_weight_in_part, global_total_weight - expected_weight_in_part); 03397 03398 bool is_left_imbalance_valid = ABS(imbalance_on_left) - used_imbalance_tolerance < this->sEpsilon ; 03399 bool is_right_imbalance_valid = ABS(imbalance_on_right) - used_imbalance_tolerance < this->sEpsilon; 03400 03401 //if the cut line reaches to desired imbalance. 03402 if(is_left_imbalance_valid && is_right_imbalance_valid){ 03403 current_cut_line_determined[i] = true; 03404 #ifdef HAVE_ZOLTAN2_OMP 03405 #pragma omp atomic 03406 #endif 03407 my_num_incomplete_cut -= 1; 03408 new_current_cut_coordinates [i] = current_cut_coordinates[i]; 03409 continue; 03410 } 03411 else if(imbalance_on_left < 0){ 03412 //if left imbalance < 0 then we need to move the cut to right. 03413 03414 if(this->distribute_points_on_cut_lines){ 03415 //if it is okay to distribute the coordinate on 03416 //the same coordinate to left and right. 03417 //then check if we can reach to the target weight by including the 03418 //coordinates in the part. 03419 if (current_global_part_weights[i * 2 + 1] == expected_weight_in_part){ 03420 //if it is we are done. 
03421 current_cut_line_determined[i] = true; 03422 #ifdef HAVE_ZOLTAN2_OMP 03423 #pragma omp atomic 03424 #endif 03425 my_num_incomplete_cut -= 1; 03426 03427 //then assign everything on the cut to the left of the cut. 03428 new_current_cut_coordinates [i] = current_cut_coordinates[i]; 03429 03430 //for this cut all the weight on cut will be put to left. 03431 03432 current_part_cut_line_weight_to_put_left[i] = current_local_part_weights[i * 2 + 1] - current_local_part_weights[i * 2]; 03433 continue; 03434 } 03435 else if (current_global_part_weights[i * 2 + 1] > expected_weight_in_part){ 03436 03437 //if the weight is larger than the expected weight, 03438 //then we need to distribute some points to left, some to right. 03439 current_cut_line_determined[i] = true; 03440 #ifdef HAVE_ZOLTAN2_OMP 03441 #pragma omp atomic 03442 #endif 03443 *rectilinear_cut_count += 1; 03444 //increase the num cuts to be determined with rectilinear partitioning. 03445 03446 #ifdef HAVE_ZOLTAN2_OMP 03447 #pragma omp atomic 03448 #endif 03449 my_num_incomplete_cut -= 1; 03450 new_current_cut_coordinates [i] = current_cut_coordinates[i]; 03451 this->process_rectilinear_cut_weight[i] = current_local_part_weights[i * 2 + 1] - 03452 current_local_part_weights[i * 2]; 03453 continue; 03454 } 03455 } 03456 //we need to move further right,so set lower bound to current line, and shift it to the closes point from right. 03457 current_cut_lower_bounds[i] = current_global_right_closest_points[i]; 03458 //set the lower bound weight to the weight we have seen. 03459 current_cut_lower_bound_weights[i] = seen_weight_in_part; 03460 03461 //compare the upper bound with what has been found in the last iteration. 03462 //we try to make more strict bounds for the cut here. 
03463 for (mj_part_t ii = i + 1; ii < num_cuts ; ++ii){ 03464 mj_scalar_t p_weight = current_global_part_weights[ii * 2]; 03465 mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1]; 03466 03467 if(p_weight >= expected_weight_in_part){ 03468 //if a cut on the right has the expected weight, then we found 03469 //our cut position. Set up and low coordiantes to this new cut coordinate. 03470 //but we need one more iteration to finalize the cut position, 03471 //as wee need to update the part ids. 03472 if(p_weight == expected_weight_in_part){ 03473 current_cut_upper_bounds[i] = current_cut_coordinates[ii]; 03474 current_cut_upper_weights[i] = p_weight; 03475 current_cut_lower_bounds[i] = current_cut_coordinates[ii]; 03476 current_cut_lower_bound_weights[i] = p_weight; 03477 } else if (p_weight < current_cut_upper_weights[i]){ 03478 //if a part weight is larger then my expected weight, 03479 //but lower than my upper bound weight, update upper bound. 03480 current_cut_upper_bounds[i] = current_global_left_closest_points[ii]; 03481 current_cut_upper_weights[i] = p_weight; 03482 } 03483 break; 03484 } 03485 //if comes here then pw < ew 03486 //then compare the weight against line weight. 03487 if(line_weight >= expected_weight_in_part){ 03488 //if the line is larger than the expected weight, 03489 //then we need to reach to the balance by distributing coordinates on this line. 03490 current_cut_upper_bounds[i] = current_cut_coordinates[ii]; 03491 current_cut_upper_weights[i] = line_weight; 03492 current_cut_lower_bounds[i] = current_cut_coordinates[ii]; 03493 current_cut_lower_bound_weights[i] = p_weight; 03494 break; 03495 } 03496 //if a stricter lower bound is found, 03497 //update the lower bound. 
03498 if (p_weight <= expected_weight_in_part && p_weight >= current_cut_lower_bound_weights[i]){ 03499 current_cut_lower_bounds[i] = current_global_right_closest_points[ii] ; 03500 current_cut_lower_bound_weights[i] = p_weight; 03501 } 03502 } 03503 03504 03505 mj_scalar_t new_cut_position = 0; 03506 this->mj_calculate_new_cut_position( 03507 current_cut_upper_bounds[i], 03508 current_cut_lower_bounds[i], 03509 current_cut_upper_weights[i], 03510 current_cut_lower_bound_weights[i], 03511 expected_weight_in_part, new_cut_position); 03512 03513 //if cut line does not move significantly. 03514 //then finalize the search. 03515 if (ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon 03516 /*|| current_cut_lower_bounds[i] - current_cut_upper_bounds[i] > this->sEpsilon*/ 03517 ){ 03518 current_cut_line_determined[i] = true; 03519 #ifdef HAVE_ZOLTAN2_OMP 03520 #pragma omp atomic 03521 #endif 03522 my_num_incomplete_cut -= 1; 03523 03524 //set the cut coordinate and proceed. 03525 new_current_cut_coordinates [i] = current_cut_coordinates[i]; 03526 } else { 03527 new_current_cut_coordinates [i] = new_cut_position; 03528 } 03529 } else { 03530 03531 //need to move the cut line to left. 03532 //set upper bound to current line. 03533 current_cut_upper_bounds[i] = current_global_left_closest_points[i]; 03534 current_cut_upper_weights[i] = seen_weight_in_part; 03535 03536 // compare the current cut line weights with previous upper and lower bounds. 03537 for (int ii = i - 1; ii >= 0; --ii){ 03538 mj_scalar_t p_weight = current_global_part_weights[ii * 2]; 03539 mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1]; 03540 if(p_weight <= expected_weight_in_part){ 03541 if(p_weight == expected_weight_in_part){ 03542 //if the weight of the part is my expected weight 03543 //then we find the solution. 
03544 current_cut_upper_bounds[i] = current_cut_coordinates[ii]; 03545 current_cut_upper_weights[i] = p_weight; 03546 current_cut_lower_bounds[i] = current_cut_coordinates[ii]; 03547 current_cut_lower_bound_weights[i] = p_weight; 03548 } 03549 else if (p_weight > current_cut_lower_bound_weights[i]){ 03550 //if found weight is bigger than the lower bound 03551 //then update the lower bound. 03552 current_cut_lower_bounds[i] = current_global_right_closest_points[ii]; 03553 current_cut_lower_bound_weights[i] = p_weight; 03554 03555 //at the same time, if weight of line is bigger than the 03556 //expected weight, then update the upper bound as well. 03557 //in this case the balance will be obtained by distributing weightss 03558 //on this cut position. 03559 if(line_weight > expected_weight_in_part){ 03560 current_cut_upper_bounds[i] = current_global_right_closest_points[ii]; 03561 current_cut_upper_weights[i] = line_weight; 03562 } 03563 } 03564 break; 03565 } 03566 //if the weight of the cut on the left is still bigger than my weight, 03567 //and also if the weight is smaller than the current upper weight, 03568 //or if the weight is equal to current upper weight, but on the left of 03569 // the upper weight, then update upper bound. 03570 if (p_weight >= expected_weight_in_part && 03571 (p_weight < current_cut_upper_weights[i] || 03572 (p_weight == current_cut_upper_weights[i] && 03573 current_cut_upper_bounds[i] > current_global_left_closest_points[ii] 03574 ) 03575 ) 03576 ){ 03577 current_cut_upper_bounds[i] = current_global_left_closest_points[ii] ; 03578 current_cut_upper_weights[i] = p_weight; 03579 } 03580 } 03581 mj_scalar_t new_cut_position = 0; 03582 this->mj_calculate_new_cut_position( 03583 current_cut_upper_bounds[i], 03584 current_cut_lower_bounds[i], 03585 current_cut_upper_weights[i], 03586 current_cut_lower_bound_weights[i], 03587 expected_weight_in_part, 03588 new_cut_position); 03589 03590 //if cut line does not move significantly. 
03591 if (ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon 03592 /*|| current_cut_lower_bounds[i] - current_cut_upper_bounds[i] > this->sEpsilon*/ ){ 03593 current_cut_line_determined[i] = true; 03594 #ifdef HAVE_ZOLTAN2_OMP 03595 #pragma omp atomic 03596 #endif 03597 my_num_incomplete_cut -= 1; 03598 //set the cut coordinate and proceed. 03599 new_current_cut_coordinates [ i] = current_cut_coordinates[i]; 03600 } else { 03601 new_current_cut_coordinates [ i] = new_cut_position; 03602 } 03603 } 03604 } 03605 03606 //communication to determine the ratios of processors for the distribution 03607 //of coordinates on the cut lines. 03608 #ifdef HAVE_ZOLTAN2_OMP 03609 //no need barrier here as it is implicit. 03610 #pragma omp single 03611 #endif 03612 { 03613 if(*rectilinear_cut_count > 0){ 03614 03615 try{ 03616 Teuchos::scan<int,mj_scalar_t>( 03617 *comm, Teuchos::REDUCE_SUM, 03618 num_cuts, 03619 this->process_rectilinear_cut_weight, 03620 this->global_rectilinear_cut_weight 03621 ); 03622 } 03623 Z2_THROW_OUTSIDE_ERROR(*(this->mj_env)) 03624 03625 for (mj_part_t i = 0; i < num_cuts; ++i){ 03626 //if cut line weight to be distributed. 03627 if(this->global_rectilinear_cut_weight[i] > 0) { 03628 //expected weight to go to left of the cut. 03629 mj_scalar_t expected_part_weight = current_part_target_weights[i]; 03630 //the weight that should be put to left of the cut. 03631 mj_scalar_t necessary_weight_on_line_for_left = expected_part_weight - current_global_part_weights[i * 2]; 03632 //the weight of the cut in the process 03633 mj_scalar_t my_weight_on_line = this->process_rectilinear_cut_weight[i]; 03634 //the sum of the cut weights upto this process, including the weight of this process. 03635 mj_scalar_t weight_on_line_upto_process_inclusive = this->global_rectilinear_cut_weight[i]; 03636 //the space on the left side of the cut after all processes before this process (including this process) 03637 //puts their weights on cut to left. 
03638 mj_scalar_t space_to_put_left = necessary_weight_on_line_for_left - weight_on_line_upto_process_inclusive; 03639 //add my weight to this space to find out how much space is left to me. 03640 mj_scalar_t space_left_to_me = space_to_put_left + my_weight_on_line; 03641 03642 /* 03643 cout << "expected_part_weight:" << expected_part_weight 03644 << " necessary_weight_on_line_for_left:" << necessary_weight_on_line_for_left 03645 << " my_weight_on_line" << my_weight_on_line 03646 << " weight_on_line_upto_process_inclusive:" << weight_on_line_upto_process_inclusive 03647 << " space_to_put_left:" << space_to_put_left 03648 << " space_left_to_me" << space_left_to_me << endl; 03649 */ 03650 if(space_left_to_me < 0){ 03651 //space_left_to_me is negative and i dont need to put anything to left. 03652 current_part_cut_line_weight_to_put_left[i] = 0; 03653 } 03654 else if(space_left_to_me >= my_weight_on_line){ 03655 //space left to me is bigger than the weight of the processor on cut. 03656 //so put everything to left. 03657 current_part_cut_line_weight_to_put_left[i] = my_weight_on_line; 03658 //cout << "setting current_part_cut_line_weight_to_put_left to my_weight_on_line:" << my_weight_on_line << endl; 03659 } 03660 else { 03661 //put only the weight as much as the space. 
03662 current_part_cut_line_weight_to_put_left[i] = space_left_to_me ; 03663 03664 //cout << "setting current_part_cut_line_weight_to_put_left to space_left_to_me:" << space_left_to_me << endl; 03665 } 03666 03667 } 03668 } 03669 *rectilinear_cut_count = 0; 03670 } 03671 } 03672 } 03673 03683 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t, 03684 typename mj_part_t> 03685 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_processor_num_points_in_parts( 03686 mj_part_t num_procs, 03687 mj_part_t num_parts, 03688 mj_gno_t *&num_points_in_all_processor_parts){ 03689 03690 //initially allocation_size is num_parts 03691 size_t allocation_size = num_parts * (num_procs + 1); 03692 03693 //this will be output 03694 //holds how many each processor has in each part. 03695 //last portion is the sum of all processor points in each part. 03696 03697 //allocate memory for the local num coordinates in each part. 03698 mj_gno_t *num_local_points_in_each_part_to_reduce_sum = allocMemory<mj_gno_t>(allocation_size); 03699 03700 03701 //this is the portion of the memory which will be used 03702 //at the summation to obtain total number of processors' points in each part. 03703 mj_gno_t *my_local_points_to_reduce_sum = num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts; 03704 //this is the portion of the memory where each stores its local number. 03705 //this information is needed by other processors. 03706 mj_gno_t *my_local_point_counts_in_each_art = num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts; 03707 03708 //initialize the array with 0's. 03709 memset(num_local_points_in_each_part_to_reduce_sum, 0, sizeof(mj_gno_t)*allocation_size); 03710 03711 //write the number of coordinates in each part. 
03712 for (mj_part_t i = 0; i < num_parts; ++i){ 03713 mj_lno_t part_begin_index = 0; 03714 if (i > 0){ 03715 part_begin_index = this->new_part_xadj[i - 1]; 03716 } 03717 mj_lno_t part_end_index = this->new_part_xadj[i]; 03718 my_local_points_to_reduce_sum[i] = part_end_index - part_begin_index; 03719 } 03720 03721 //copy the local num parts to the last portion of array, 03722 //so that this portion will represent the global num points in each part after the reduction. 03723 memcpy (my_local_point_counts_in_each_art, 03724 my_local_points_to_reduce_sum, 03725 sizeof(mj_gno_t) * (num_parts) ); 03726 03727 03728 //reduceAll operation. 03729 //the portion that belongs to a processor with index p 03730 //will start from myRank * num_parts. 03731 //the global number of points will be held at the index 03732 try{ 03733 reduceAll<int, mj_gno_t>( 03734 *(this->comm), 03735 Teuchos::REDUCE_SUM, 03736 allocation_size, 03737 num_local_points_in_each_part_to_reduce_sum, 03738 num_points_in_all_processor_parts); 03739 } 03740 Z2_THROW_OUTSIDE_ERROR(*(this->mj_env)) 03741 freeArray<mj_gno_t>(num_local_points_in_each_part_to_reduce_sum); 03742 } 03743 03744 03745 03758 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t, 03759 typename mj_part_t> 03760 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_check_to_migrate( 03761 size_t migration_reduce_all_population, 03762 mj_lno_t num_coords_for_last_dim_part, 03763 mj_part_t num_procs, 03764 mj_part_t num_parts, 03765 mj_gno_t *num_points_in_all_processor_parts){ 03766 03767 //if reduce all count and population in the last dim is too high 03768 if (migration_reduce_all_population > FUTURE_REDUCEALL_CUTOFF) return true; 03769 //if the work in a part per processor in the last dim is too low. 
03770 if (num_coords_for_last_dim_part < MIN_WORK_LAST_DIM) return true; 03771 03772 //if migration is to be checked and the imbalance is too high 03773 if (this->check_migrate_avoid_migration_option == 0){ 03774 double global_imbalance = 0; 03775 //global shift to reach the sum of coordiante count in each part. 03776 size_t global_shift = num_procs * num_parts; 03777 03778 for (mj_part_t ii = 0; ii < num_procs; ++ii){ 03779 for (mj_part_t i = 0; i < num_parts; ++i){ 03780 double ideal_num = num_points_in_all_processor_parts[global_shift + i] 03781 / double(num_procs); 03782 03783 global_imbalance += ABS(ideal_num - 03784 num_points_in_all_processor_parts[ii * num_parts + i]) / (ideal_num); 03785 } 03786 } 03787 global_imbalance /= num_parts; 03788 global_imbalance /= num_procs; 03789 03790 /* 03791 if (this->myRank == 0) { 03792 cout << "imbalance for next iteration:" << global_imbalance << endl; 03793 } 03794 */ 03795 03796 if(global_imbalance <= this->minimum_migration_imbalance){ 03797 return false; 03798 } 03799 else { 03800 return true; 03801 } 03802 } 03803 else { 03804 //if migration is forced 03805 return true; 03806 } 03807 } 03808 03809 03819 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t, 03820 typename mj_part_t> 03821 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations( 03822 mj_part_t num_parts, 03823 mj_part_t *part_assignment_proc_begin_indices, 03824 mj_part_t *processor_chains_in_parts, 03825 mj_lno_t *send_count_to_each_proc, 03826 int *coordinate_destinations){ 03827 03828 for (mj_part_t p = 0; p < num_parts; ++p){ 03829 mj_lno_t part_begin = 0; 03830 if (p > 0) part_begin = this->new_part_xadj[p - 1]; 03831 mj_lno_t part_end = this->new_part_xadj[p]; 03832 03833 //get the first part that current processor will send its part-p. 03834 mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p]; 03835 //initialize how many point I sent to this processor. 
03836 mj_lno_t num_total_send = 0; 03837 for (mj_lno_t j=part_begin; j < part_end; j++){ 03838 mj_lno_t local_ind = this->new_coordinate_permutations[j]; 03839 while (num_total_send >= send_count_to_each_proc[proc_to_sent]){ 03840 //then get the next processor to send the points in part p. 03841 num_total_send = 0; 03842 //assign new processor to part_assign_begin[p] 03843 part_assignment_proc_begin_indices[p] = processor_chains_in_parts[proc_to_sent]; 03844 //remove the previous processor 03845 processor_chains_in_parts[proc_to_sent] = -1; 03846 //choose the next processor as the next one to send. 03847 proc_to_sent = part_assignment_proc_begin_indices[p]; 03848 } 03849 //write the gno index to corresponding position in sendBuf. 03850 coordinate_destinations[local_ind] = proc_to_sent; 03851 ++num_total_send; 03852 } 03853 } 03854 } 03855 03870 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t, 03871 typename mj_part_t> 03872 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_proc_to_parts( 03873 mj_gno_t * num_points_in_all_processor_parts, 03874 mj_part_t num_parts, 03875 mj_part_t num_procs, 03876 mj_lno_t *send_count_to_each_proc, 03877 std::vector<mj_part_t> &processor_ranks_for_subcomm, 03878 std::vector<mj_part_t> *next_future_num_parts_in_parts, 03879 mj_part_t &out_part_index, 03880 mj_part_t &output_part_numbering_begin_index, 03881 int *coordinate_destinations){ 03882 03883 03884 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts; 03885 mj_part_t *num_procs_assigned_to_each_part = allocMemory<mj_part_t>(num_parts); 03886 03887 //boolean variable if the process finds its part to be assigned. 
03888 bool did_i_find_my_group = false; 03889 03890 mj_part_t num_free_procs = num_procs; 03891 mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1; 03892 03893 double max_imbalance_difference = 0; 03894 mj_part_t max_differing_part = 0; 03895 03896 //find how many processor each part requires. 03897 for (mj_part_t i=0; i < num_parts; i++){ 03898 03899 //scalar portion of the required processors 03900 double scalar_required_proc = num_procs * 03901 (double (global_num_points_in_parts[i]) / double (this->num_global_coords)); 03902 03903 //round it to closest integer. 03904 mj_part_t required_proc = static_cast<mj_part_t> (0.5 + scalar_required_proc); 03905 03906 //if assigning the required num procs, creates problems for the rest of the parts. 03907 //then only assign {num_free_procs - (minimum_num_procs_required_for_rest_of_parts)} procs to this part. 03908 if (num_free_procs - required_proc < minimum_num_procs_required_for_rest_of_parts){ 03909 required_proc = num_free_procs - (minimum_num_procs_required_for_rest_of_parts); 03910 } 03911 03912 //reduce the free processor count 03913 num_free_procs -= required_proc; 03914 //reduce the free minimum processor count required for the rest of the part by 1. 03915 --minimum_num_procs_required_for_rest_of_parts; 03916 03917 //part (i) is assigned to (required_proc) processors. 03918 num_procs_assigned_to_each_part[i] = required_proc; 03919 03920 //because of the roundings some processors might be left as unassigned. 03921 //we want to assign those processors to the part with most imbalance. 03922 //find the part with the maximum imbalance here. 03923 double imbalance_wrt_ideal = (scalar_required_proc - required_proc) / required_proc; 03924 if (imbalance_wrt_ideal > max_imbalance_difference){ 03925 max_imbalance_difference = imbalance_wrt_ideal; 03926 max_differing_part = i; 03927 } 03928 } 03929 03930 //assign extra processors to the part with maximum imbalance than the ideal. 
03931 if (num_free_procs > 0){ 03932 num_procs_assigned_to_each_part[max_differing_part] += num_free_procs; 03933 } 03934 03935 //now find what are the best processors with least migration for each part. 03936 03937 //part_assignment_proc_begin_indices ([i]) is the array that holds the beginning 03938 //index of a processor that processor sends its data for part - i 03939 mj_part_t *part_assignment_proc_begin_indices = allocMemory<mj_part_t>(num_parts); 03940 //the next processor send is found in processor_chains_in_parts, in linked list manner. 03941 mj_part_t *processor_chains_in_parts = allocMemory<mj_part_t>(num_procs); 03942 mj_part_t *processor_part_assignments = allocMemory<mj_part_t>(num_procs); 03943 03944 //initialize the assignment of each processor. 03945 //this has a linked list implementation. 03946 //the beginning of processors assigned 03947 //to each part is hold at part_assignment_proc_begin_indices[part]. 03948 //then the next processor assigned to that part is located at 03949 //proc_part_assignments[part_assign_begins[part]], this is a chain 03950 //until the value of -1 is reached. 03951 for (int i = 0; i < num_procs; ++i ){ 03952 processor_part_assignments[i] = -1; 03953 processor_chains_in_parts[i] = -1; 03954 } 03955 for (int i = 0; i < num_parts; ++i ){ 03956 part_assignment_proc_begin_indices[i] = -1; 03957 } 03958 03959 03960 //Allocate memory for sorting data structure. 03961 uSortItem<mj_part_t, mj_gno_t> * sort_item_num_part_points_in_procs = allocMemory <uSortItem<mj_part_t, mj_gno_t> > (num_procs); 03962 for(mj_part_t i = 0; i < num_parts; ++i){ 03963 //the algorithm tries to minimize the cost of migration, 03964 //by assigning the processors with highest number of coordinates on that part. 03965 //here we might want to implement a maximum weighted bipartite matching algorithm. 03966 for(mj_part_t ii = 0; ii < num_procs; ++ii){ 03967 sort_item_num_part_points_in_procs[ii].id = ii; 03968 //if processor is not assigned yet. 
03969 //add its num points to the sort data structure. 03970 if (processor_part_assignments[ii] == -1){ 03971 sort_item_num_part_points_in_procs[ii].val = 03972 num_points_in_all_processor_parts[ii * num_parts + i]; 03973 } 03974 else { 03975 //if processor is already assigned, insert -nLocal - 1 so that it won't be selected again. 03976 //would be same if we simply set it to -1, 03977 //but more information with no extra cost (which is used later) is provided. 03978 sort_item_num_part_points_in_procs[ii].val = -num_points_in_all_processor_parts[ii * num_parts + i] - 1; 03979 } 03980 } 03981 //sort the processors in the part. 03982 uqsort<mj_part_t, mj_gno_t>(num_procs, sort_item_num_part_points_in_procs); 03983 03984 mj_part_t required_proc_count = num_procs_assigned_to_each_part[i]; 03985 mj_gno_t total_num_points_in_part = global_num_points_in_parts[i]; 03986 mj_gno_t ideal_num_points_in_a_proc = 03987 Teuchos::as<mj_gno_t>(ceil (total_num_points_in_part / double (required_proc_count))); 03988 03989 //starts sending to least heaviest part. 03990 mj_part_t next_proc_to_send_index = num_procs - required_proc_count; 03991 mj_part_t next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id; 03992 mj_lno_t space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val; 03993 03994 //find the processors that will be assigned to this part, which are the heaviest 03995 //non assigned processors. 03996 for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){ 03997 mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id; 03998 //assign processor to part - i. 03999 processor_part_assignments[proc_id] = i; 04000 } 04001 04002 bool did_change_sign = false; 04003 //if processor has a minus count, reverse it. 
04004 for(mj_part_t ii = 0; ii < num_procs; ++ii){ 04005 // TODO: THE LINE BELOW PRODUCES A WARNING IF gno_t IS UNSIGNED 04006 // TODO: SEE BUG 6194 04007 if (sort_item_num_part_points_in_procs[ii].val < 0){ 04008 did_change_sign = true; 04009 sort_item_num_part_points_in_procs[ii].val = -sort_item_num_part_points_in_procs[ii].val - 1; 04010 } 04011 else { 04012 break; 04013 } 04014 } 04015 if(did_change_sign){ 04016 //resort the processors in the part for the rest of the processors that is not assigned. 04017 uqsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count, sort_item_num_part_points_in_procs); 04018 } 04019 04020 //check if this processors is one of the procs assigned to this part. 04021 //if it is, then get the group. 04022 if (!did_i_find_my_group){ 04023 for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){ 04024 04025 mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id; 04026 //add the proc to the group. 04027 processor_ranks_for_subcomm.push_back(proc_id_to_assign); 04028 04029 if(proc_id_to_assign == this->myRank){ 04030 //if the assigned process is me, then I find my group. 04031 did_i_find_my_group = true; 04032 //set the beginning of part i to my rank. 04033 part_assignment_proc_begin_indices[i] = this->myRank; 04034 processor_chains_in_parts[this->myRank] = -1; 04035 04036 //set send count to myself to the number of points that I have in part i. 04037 send_count_to_each_proc[this->myRank] = sort_item_num_part_points_in_procs[ii].val; 04038 04039 //calculate the shift required for the output_part_numbering_begin_index 04040 for (mj_part_t in = 0; in < i; ++in){ 04041 output_part_numbering_begin_index += (*next_future_num_parts_in_parts)[in]; 04042 } 04043 out_part_index = i; 04044 } 04045 } 04046 //if these was not my group, 04047 //clear the subcomminicator processor array. 
04048 if (!did_i_find_my_group){ 04049 processor_ranks_for_subcomm.clear(); 04050 } 04051 } 04052 04053 //send points of the nonassigned coordinates to the assigned coordinates. 04054 //starts from the heaviest nonassigned processor. 04055 //TODO we might want to play with this part, that allows more computational imbalance 04056 //but having better communication balance. 04057 for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii){ 04058 mj_part_t nonassigned_proc_id = sort_item_num_part_points_in_procs[ii].id; 04059 mj_lno_t num_points_to_sent = sort_item_num_part_points_in_procs[ii].val; 04060 04061 //we set number of points to -to_sent - 1 for the assigned processors. 04062 //we reverse it here. This should not happen, as we have already reversed them above. 04063 #ifdef MJ_DEBUG 04064 if (num_points_to_sent < 0) { 04065 cout << "Migration - processor assignments - for part:" << i << "from proc:" << nonassigned_proc_id << " num_points_to_sent:" << num_points_to_sent << std::endl; 04066 exit(1); 04067 } 04068 #endif 04069 04070 //now sends the points to the assigned processors. 04071 while (num_points_to_sent > 0){ 04072 //if the processor has enough space. 04073 if (num_points_to_sent <= space_left_in_sent_proc){ 04074 //reduce the space left in the processor. 04075 space_left_in_sent_proc -= num_points_to_sent; 04076 //if my rank is the one that is sending the coordinates. 04077 if (this->myRank == nonassigned_proc_id){ 04078 //set my sent count to the sent processor. 04079 send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent; 04080 //save the processor in the list (processor_chains_in_parts and part_assignment_proc_begin_indices) 04081 //that the processor will send its point in part-i. 
04082 mj_part_t prev_begin = part_assignment_proc_begin_indices[i]; 04083 part_assignment_proc_begin_indices[i] = next_proc_to_send_id; 04084 processor_chains_in_parts[next_proc_to_send_id] = prev_begin; 04085 } 04086 num_points_to_sent = 0; 04087 } 04088 else { 04089 //there might be no space left in the processor. 04090 if(space_left_in_sent_proc > 0){ 04091 num_points_to_sent -= space_left_in_sent_proc; 04092 04093 //send as the space left in the processor. 04094 if (this->myRank == nonassigned_proc_id){ 04095 //send as much as the space in this case. 04096 send_count_to_each_proc[next_proc_to_send_id] = space_left_in_sent_proc; 04097 mj_part_t prev_begin = part_assignment_proc_begin_indices[i]; 04098 part_assignment_proc_begin_indices[i] = next_proc_to_send_id; 04099 processor_chains_in_parts[next_proc_to_send_id] = prev_begin; 04100 04101 } 04102 } 04103 //change the sent part 04104 ++next_proc_to_send_index; 04105 04106 #ifdef MJ_DEBUG 04107 if(next_part_to_send_index < nprocs - required_proc_count ){ 04108 cout << "Migration - processor assignments - for part:" 04109 << i 04110 << " next_part_to_send :" << next_part_to_send_index 04111 << " nprocs:" << nprocs 04112 << " required_proc_count:" << required_proc_count 04113 << " Error: next_part_to_send_index < nprocs - required_proc_count" << std::endl; 04114 exit(1)l 04115 04116 } 04117 #endif 04118 //send the new id. 04119 next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id; 04120 //set the new space in the processor. 
04121 space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val; 04122 } 04123 } 04124 } 04125 } 04126 04127 04128 04129 this->assign_send_destinations( 04130 num_parts, 04131 part_assignment_proc_begin_indices, 04132 processor_chains_in_parts, 04133 send_count_to_each_proc, 04134 coordinate_destinations); 04135 04136 freeArray<mj_part_t>(part_assignment_proc_begin_indices); 04137 freeArray<mj_part_t>(processor_chains_in_parts); 04138 freeArray<mj_part_t>(processor_part_assignments); 04139 freeArray<uSortItem<mj_part_t, mj_gno_t> > (sort_item_num_part_points_in_procs); 04140 freeArray<mj_part_t > (num_procs_assigned_to_each_part); 04141 04142 } 04143 04144 04157 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t, 04158 typename mj_part_t> 04159 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations2( 04160 mj_part_t num_parts, 04161 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment, //input sorted wrt processors 04162 int *coordinate_destinations, 04163 mj_part_t &output_part_numbering_begin_index, 04164 std::vector<mj_part_t> *next_future_num_parts_in_parts){ 04165 04166 mj_part_t part_shift_amount = output_part_numbering_begin_index; 04167 mj_part_t previous_processor = -1; 04168 for(mj_part_t i = 0; i < num_parts; ++i){ 04169 mj_part_t p = sort_item_part_to_proc_assignment[i].id; 04170 //assigned processors are sorted. 
04171 mj_lno_t part_begin_index = 0; 04172 if (p > 0) part_begin_index = this->new_part_xadj[p - 1]; 04173 mj_lno_t part_end_index = this->new_part_xadj[p]; 04174 04175 mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val; 04176 if (this->myRank == assigned_proc && previous_processor != assigned_proc){ 04177 output_part_numbering_begin_index = part_shift_amount; 04178 } 04179 previous_processor = assigned_proc; 04180 part_shift_amount += (*next_future_num_parts_in_parts)[p]; 04181 04182 for (mj_lno_t j=part_begin_index; j < part_end_index; j++){ 04183 mj_lno_t localInd = this->new_coordinate_permutations[j]; 04184 coordinate_destinations[localInd] = assigned_proc; 04185 } 04186 } 04187 } 04188 04189 04206 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t, 04207 typename mj_part_t> 04208 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_parts_to_procs( 04209 mj_gno_t * num_points_in_all_processor_parts, 04210 mj_part_t num_parts, 04211 mj_part_t num_procs, 04212 mj_lno_t *send_count_to_each_proc, //output: sized nprocs, show the number of send point counts to each proc. 04213 std::vector<mj_part_t> *next_future_num_parts_in_parts,//input how many more partitions the part will be partitioned into. 04214 mj_part_t &out_num_part, //output, how many parts the processor will have. this is always 1 for this function. 04215 std::vector<mj_part_t> &out_part_indices, //output: the part indices which the processor is assigned to. 04216 mj_part_t &output_part_numbering_begin_index, //output: how much the part number should be shifted when setting the solution 04217 int *coordinate_destinations){ 04218 out_num_part = 0; 04219 04220 mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts; 04221 out_part_indices.clear(); 04222 04223 //to sort the parts that is assigned to the processors. 04224 //id is the part number, sort value is the assigned processor id. 
04225 uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment = allocMemory <uSortItem<mj_part_t, mj_part_t> >(num_parts); 04226 uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_procs); 04227 04228 04229 //calculate the optimal number of coordinates that should be assigned to each processor. 04230 mj_lno_t work_each = mj_lno_t (this->num_global_coords / (double (num_procs)) + 0.5f); 04231 //to hold the left space as the number of coordinates to the optimal number in each proc. 04232 mj_lno_t *space_in_each_processor = allocMemory <mj_lno_t>(num_procs); 04233 //initialize left space in each. 04234 for (mj_part_t i = 0; i < num_procs; ++i){ 04235 space_in_each_processor[i] = work_each; 04236 } 04237 04238 //we keep track of how many parts each processor is assigned to. 04239 //because in some weird inputs, it might be possible that some 04240 //processors is not assigned to any part. Using these variables, 04241 //we force each processor to have at least one part. 04242 mj_part_t *num_parts_proc_assigned = allocMemory <mj_part_t>(num_procs); 04243 memset(num_parts_proc_assigned, 0, sizeof(mj_part_t) * num_procs); 04244 int empty_proc_count = num_procs; 04245 04246 //to sort the parts with decreasing order of their coordiantes. 04247 //id are the part numbers, sort value is the number of points in each. 04248 uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_parts); 04249 04250 //initially we will sort the parts according to the number of coordinates they have. 04251 //so that we will start assigning with the part that has the most number of coordinates. 04252 for (mj_part_t i = 0; i < num_parts; ++i){ 04253 sort_item_point_counts_in_parts[i].id = i; 04254 sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i]; 04255 } 04256 //sort parts with increasing order of loads. 
04257 uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts); 04258 04259 04260 //assigning parts to the processors 04261 //traverse the part win decreasing order of load. 04262 //first assign the heaviest part. 04263 for (mj_part_t j = 0; j < num_parts; ++j){ 04264 //sorted with increasing order, traverse inverse. 04265 mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id; 04266 //load of the part 04267 mj_gno_t load = global_num_points_in_parts[i]; 04268 04269 //assigned processors 04270 mj_part_t assigned_proc = -1; 04271 //if not fit best processor. 04272 mj_part_t best_proc_to_assign = 0; 04273 04274 04275 //sort processors with increasing number of points in this part. 04276 for (mj_part_t ii = 0; ii < num_procs; ++ii){ 04277 sort_item_num_points_of_proc_in_part_i[ii].id = ii; 04278 04279 //if there are still enough parts to fill empty processors, than proceed normally. 04280 //but if empty processor count is equal to the number of part, then 04281 //we force to part assignments only to empty processors. 04282 if (empty_proc_count < num_parts - j || num_parts_proc_assigned[ii] == 0){ 04283 //how many points processor ii has in part i? 04284 sort_item_num_points_of_proc_in_part_i[ii].val = num_points_in_all_processor_parts[ii * num_parts + i]; 04285 } 04286 else { 04287 sort_item_num_points_of_proc_in_part_i[ii].val = -1; 04288 } 04289 } 04290 uqsort<mj_part_t, mj_gno_t>(num_procs, sort_item_num_points_of_proc_in_part_i); 04291 04292 //traverse all processors with decreasing load. 04293 for (mj_part_t iii = num_procs - 1; iii >= 0; --iii){ 04294 mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id; 04295 mj_lno_t left_space = space_in_each_processor[ii] - load; 04296 //if enought space, assign to this part. 04297 if(left_space >= 0 ){ 04298 assigned_proc = ii; 04299 break; 04300 } 04301 //if space is not enough, store the best candidate part. 
04302 if (space_in_each_processor[best_proc_to_assign] < space_in_each_processor[ii]){ 04303 best_proc_to_assign = ii; 04304 } 04305 } 04306 04307 //if none had enough space, then assign it to best part. 04308 if (assigned_proc == -1){ 04309 assigned_proc = best_proc_to_assign; 04310 } 04311 04312 if (num_parts_proc_assigned[assigned_proc]++ == 0){ 04313 --empty_proc_count; 04314 } 04315 space_in_each_processor[assigned_proc] -= load; 04316 //to sort later, part-i is assigned to the proccessor - assignment. 04317 sort_item_part_to_proc_assignment[j].id = i; //part i 04318 sort_item_part_to_proc_assignment[j].val = assigned_proc; //assigned to processor - assignment. 04319 04320 04321 //if assigned processor is me, increase the number. 04322 if (assigned_proc == this->myRank){ 04323 out_num_part++;//assigned_part_count; 04324 out_part_indices.push_back(i); 04325 } 04326 //increase the send to that processor by the number of points in that part. 04327 //as everyone send their coordiantes in this part to the processor assigned to this part. 04328 send_count_to_each_proc[assigned_proc] += num_points_in_all_processor_parts[this->myRank * num_parts + i]; 04329 } 04330 freeArray<mj_part_t>(num_parts_proc_assigned); 04331 freeArray< uSortItem<mj_part_t, mj_gno_t> > (sort_item_num_points_of_proc_in_part_i); 04332 freeArray<uSortItem<mj_part_t, mj_gno_t> >(sort_item_point_counts_in_parts); 04333 freeArray<mj_lno_t >(space_in_each_processor); 04334 04335 04336 //sort assignments with respect to the assigned processors. 04337 uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment); 04338 //fill sendBuf. 
04339 04340 04341 this->assign_send_destinations2( 04342 num_parts, 04343 sort_item_part_to_proc_assignment, 04344 coordinate_destinations, 04345 output_part_numbering_begin_index, 04346 next_future_num_parts_in_parts); 04347 04348 freeArray<uSortItem<mj_part_t, mj_part_t> >(sort_item_part_to_proc_assignment); 04349 } 04350 04351 04369 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t, 04370 typename mj_part_t> 04371 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migration_part_proc_assignment( 04372 mj_gno_t * num_points_in_all_processor_parts, 04373 mj_part_t num_parts, 04374 mj_part_t num_procs, 04375 mj_lno_t *send_count_to_each_proc, 04376 std::vector<mj_part_t> &processor_ranks_for_subcomm, 04377 std::vector<mj_part_t> *next_future_num_parts_in_parts, 04378 mj_part_t &out_num_part, 04379 std::vector<mj_part_t> &out_part_indices, 04380 mj_part_t &output_part_numbering_begin_index, 04381 int *coordinate_destinations){ 04382 04383 04384 04385 processor_ranks_for_subcomm.clear(); 04386 // if (this->num_local_coords > 0) 04387 if (num_procs > num_parts){ 04388 //if there are more processors than the number of current part 04389 //then processors share the existing parts. 04390 //at the end each processor will have a single part, 04391 //but a part will be shared by a group of processors. 04392 mj_part_t out_part_index = 0; 04393 this->mj_assign_proc_to_parts( 04394 num_points_in_all_processor_parts, 04395 num_parts, 04396 num_procs, 04397 send_count_to_each_proc, 04398 processor_ranks_for_subcomm, 04399 next_future_num_parts_in_parts, 04400 out_part_index, 04401 output_part_numbering_begin_index, 04402 coordinate_destinations 04403 ); 04404 04405 out_num_part = 1; 04406 out_part_indices.clear(); 04407 out_part_indices.push_back(out_part_index); 04408 } 04409 else { 04410 04411 //there are more parts than the processors. 
04412 //therefore a processor will be assigned multiple parts, 04413 //the subcommunicators will only have a single processor. 04414 processor_ranks_for_subcomm.push_back(this->myRank); 04415 04416 //since there are more parts then procs, 04417 //assign multiple parts to processors. 04418 this->mj_assign_parts_to_procs( 04419 num_points_in_all_processor_parts, 04420 num_parts, 04421 num_procs, 04422 send_count_to_each_proc, 04423 next_future_num_parts_in_parts, 04424 out_num_part, 04425 out_part_indices, 04426 output_part_numbering_begin_index, 04427 coordinate_destinations); 04428 } 04429 } 04430 04443 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t, 04444 typename mj_part_t> 04445 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migrate_coords( 04446 mj_part_t num_procs, 04447 mj_lno_t &num_new_local_points, 04448 std::string iteration, 04449 int *coordinate_destinations, 04450 mj_part_t num_parts) 04451 { 04452 #ifdef ENABLE_ZOLTAN_MIGRATION 04453 if (sizeof(mj_lno_t) <= sizeof(int)) { 04454 04455 // Cannot use Zoltan_Comm with local ordinals larger than ints. 04456 // In Zoltan_Comm_Create, the cast int(this->num_local_coords) 04457 // may overflow. 04458 04459 ZOLTAN_COMM_OBJ *plan = NULL; 04460 MPI_Comm mpi_comm = Teuchos2MPI (this->comm); 04461 int num_incoming_gnos = 0; 04462 int message_tag = 7859; 04463 04464 this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration Z1PlanCreating-" + iteration); 04465 int ierr = Zoltan_Comm_Create( 04466 &plan, 04467 int(this->num_local_coords), 04468 coordinate_destinations, 04469 mpi_comm, 04470 message_tag, 04471 &num_incoming_gnos); 04472 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK); 04473 this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration Z1PlanCreating-" + iteration); 04474 04475 this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration Z1Migration-" + iteration); 04476 mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(num_incoming_gnos); 04477 04478 //migrate gnos. 
04479 message_tag++; 04480 ierr = Zoltan_Comm_Do( 04481 plan, 04482 message_tag, 04483 (char *) this->current_mj_gnos, 04484 sizeof(mj_gno_t), 04485 (char *) incoming_gnos); 04486 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK); 04487 04488 freeArray<mj_gno_t>(this->current_mj_gnos); 04489 this->current_mj_gnos = incoming_gnos; 04490 04491 04492 //migrate coordinates 04493 for (int i = 0; i < this->coord_dim; ++i){ 04494 message_tag++; 04495 mj_scalar_t *coord = this->mj_coordinates[i]; 04496 04497 this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos); 04498 ierr = Zoltan_Comm_Do( 04499 plan, 04500 message_tag, 04501 (char *) coord, 04502 sizeof(mj_scalar_t), 04503 (char *) this->mj_coordinates[i]); 04504 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK); 04505 freeArray<mj_scalar_t>(coord); 04506 } 04507 04508 //migrate weights. 04509 for (int i = 0; i < this->num_weights_per_coord; ++i){ 04510 message_tag++; 04511 mj_scalar_t *weight = this->mj_weights[i]; 04512 04513 this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos); 04514 ierr = Zoltan_Comm_Do( 04515 plan, 04516 message_tag, 04517 (char *) weight, 04518 sizeof(mj_scalar_t), 04519 (char *) this->mj_weights[i]); 04520 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK); 04521 freeArray<mj_scalar_t>(weight); 04522 } 04523 04524 04525 //migrate owners. 04526 int *coord_own = allocMemory<int>(num_incoming_gnos); 04527 message_tag++; 04528 ierr = Zoltan_Comm_Do( 04529 plan, 04530 message_tag, 04531 (char *) this->owner_of_coordinate, 04532 sizeof(int), (char *) coord_own); 04533 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK); 04534 freeArray<int>(this->owner_of_coordinate); 04535 this->owner_of_coordinate = coord_own; 04536 04537 04538 //if num procs is less than num parts, 04539 //we need the part assigment arrays as well, since 04540 //there will be multiple parts in processor. 
04541 mj_part_t *new_parts = allocMemory<mj_part_t>(num_incoming_gnos); 04542 if(num_procs < num_parts){ 04543 message_tag++; 04544 ierr = Zoltan_Comm_Do( 04545 plan, 04546 message_tag, 04547 (char *) this->assigned_part_ids, 04548 sizeof(mj_part_t), 04549 (char *) new_parts); 04550 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK); 04551 } 04552 freeArray<mj_part_t>(this->assigned_part_ids); 04553 this->assigned_part_ids = new_parts; 04554 04555 ierr = Zoltan_Comm_Destroy(&plan); 04556 Z2_ASSERT_VALUE(ierr, ZOLTAN_OK); 04557 num_new_local_points = num_incoming_gnos; 04558 this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration Z1Migration-" + iteration); 04559 } 04560 04561 else 04562 04563 #endif // ENABLE_ZOLTAN_MIGRATION 04564 { 04565 this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration DistributorPlanCreating-" + iteration); 04566 Tpetra::Distributor distributor(this->comm); 04567 ArrayView<const mj_part_t> destinations( coordinate_destinations, this->num_local_coords); 04568 mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations); 04569 this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration DistributorPlanCreating-" + iteration); 04570 04571 this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration DistributorMigration-" + iteration); 04572 { 04573 //migrate gnos. 
04574 ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos); 04575 ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords); 04576 distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos()); 04577 freeArray<mj_gno_t>(this->current_mj_gnos); 04578 this->current_mj_gnos = allocMemory<mj_gno_t>(num_incoming_gnos); 04579 memcpy( 04580 this->current_mj_gnos, 04581 received_gnos.getRawPtr(), 04582 num_incoming_gnos * sizeof(mj_gno_t)); 04583 } 04584 //migrate coordinates 04585 for (int i = 0; i < this->coord_dim; ++i){ 04586 04587 ArrayView<mj_scalar_t> sent_coord(this->mj_coordinates[i], this->num_local_coords); 04588 ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos); 04589 distributor.doPostsAndWaits<mj_scalar_t>(sent_coord, 1, received_coord()); 04590 freeArray<mj_scalar_t>(this->mj_coordinates[i]); 04591 this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos); 04592 memcpy( 04593 this->mj_coordinates[i], 04594 received_coord.getRawPtr(), 04595 num_incoming_gnos * sizeof(mj_scalar_t)); 04596 } 04597 04598 //migrate weights. 
04599 for (int i = 0; i < this->num_weights_per_coord; ++i){ 04600 04601 ArrayView<mj_scalar_t> sent_weight(this->mj_weights[i], this->num_local_coords); 04602 ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos); 04603 distributor.doPostsAndWaits<mj_scalar_t>(sent_weight, 1, received_weight()); 04604 freeArray<mj_scalar_t>(this->mj_weights[i]); 04605 this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos); 04606 memcpy( 04607 this->mj_weights[i], 04608 received_weight.getRawPtr(), 04609 num_incoming_gnos * sizeof(mj_scalar_t)); 04610 } 04611 04612 { 04613 //migrate the owners of the coordinates 04614 ArrayView<int> sent_owners(this->owner_of_coordinate, this->num_local_coords); 04615 ArrayRCP<int> received_owners(num_incoming_gnos); 04616 distributor.doPostsAndWaits<int>(sent_owners, 1, received_owners()); 04617 freeArray<int>(this->owner_of_coordinate); 04618 this->owner_of_coordinate = allocMemory<int>(num_incoming_gnos); 04619 memcpy( 04620 this->owner_of_coordinate, 04621 received_owners.getRawPtr(), 04622 num_incoming_gnos * sizeof(int)); 04623 } 04624 04625 //if num procs is less than num parts, 04626 //we need the part assigment arrays as well, since 04627 //there will be multiple parts in processor. 
04628 if(num_procs < num_parts){ 04629 ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords); 04630 ArrayRCP<mj_part_t> received_partids(num_incoming_gnos); 04631 distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids()); 04632 freeArray<mj_part_t>(this->assigned_part_ids); 04633 this->assigned_part_ids = allocMemory<mj_part_t>(num_incoming_gnos); 04634 memcpy( 04635 this->assigned_part_ids, 04636 received_partids.getRawPtr(), 04637 num_incoming_gnos * sizeof(mj_part_t)); 04638 } 04639 else { 04640 mj_part_t *new_parts = allocMemory<int>(num_incoming_gnos); 04641 freeArray<mj_part_t>(this->assigned_part_ids); 04642 this->assigned_part_ids = new_parts; 04643 } 04644 this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration DistributorMigration-" + iteration); 04645 num_new_local_points = num_incoming_gnos; 04646 04647 } 04648 } 04649 04656 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t, 04657 typename mj_part_t> 04658 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm){ 04659 mj_part_t group_size = processor_ranks_for_subcomm.size(); 04660 mj_part_t *ids = allocMemory<mj_part_t>(group_size); 04661 for(mj_part_t i = 0; i < group_size; ++i) { 04662 ids[i] = processor_ranks_for_subcomm[i]; 04663 } 04664 ArrayView<const mj_part_t> idView(ids, group_size); 04665 this->comm = this->comm->createSubcommunicator(idView); 04666 freeArray<mj_part_t>(ids); 04667 } 04668 04669 04675 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t, 04676 typename mj_part_t> 04677 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::fill_permutation_array( 04678 mj_part_t output_num_parts, 04679 mj_part_t num_parts){ 04680 //if there is single output part, then simply fill the permutation array. 
/*! \brief Rebuilds new_coordinate_permutations and new_part_xadj after a
 * migration, grouping the local coordinates by their (compacted) part ids.
 *
 * \param output_num_parts number of non-empty parts this rank now owns.
 * \param num_parts number of parts before migration; assigned_part_ids may
 *        still contain ids up to num_parts, so counting arrays use that size.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
    typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::fill_permutation_array(
    mj_part_t output_num_parts,
    mj_part_t num_parts){
    //if there is single output part, then simply fill the permutation array
    //with the identity; every local coordinate belongs to part 0.
    if (output_num_parts == 1){
        for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
            this->new_coordinate_permutations[i] = i;
        }
        this->new_part_xadj[0] = this->num_local_coords;
    }
    else {

        //otherwise we need to count how many points are there in each part.
        //we allocate here as num_parts, because the sent partids are up to num_parts,
        //although there are outout_num_parts different part.
        mj_lno_t *num_points_in_parts = allocMemory<mj_lno_t>(num_parts);
        //part shift holds the which part number an old part number corresponds to.
        mj_part_t *part_shifts = allocMemory<mj_part_t>(num_parts);

        memset(num_points_in_parts, 0, sizeof(mj_lno_t) * num_parts);

        //histogram of local coordinates over the old part ids.
        for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
            mj_part_t ii = this->assigned_part_ids[i];
            ++num_points_in_parts[ii];
        }

        //write the end points of the parts: only non-empty old parts get a
        //compacted new index p, recorded in part_shifts.
        mj_part_t p = 0;
        mj_lno_t prev_index = 0;
        for(mj_part_t i = 0; i < num_parts; ++i){
            if(num_points_in_parts[i] > 0)  {
                this->new_part_xadj[p] =  prev_index + num_points_in_parts[i];
                prev_index += num_points_in_parts[i];
                part_shifts[i] = p++;
            }
        }

        //for the rest of the parts write the end index as end point.
        //NOTE(review): if this rank has zero local coordinates, p stays 0 and
        //assigned_num_parts is -1; the loop below would then read
        //new_part_xadj[-1] -- presumably callers never hit this case; verify.
        mj_part_t assigned_num_parts = p - 1;
        for (;p < num_parts; ++p){
            this->new_part_xadj[p] =  this->new_part_xadj[assigned_num_parts];
        }
        //reuse num_points_in_parts as a cursor array holding the (exclusive)
        //end position of each new part.
        for(mj_part_t i = 0; i < output_num_parts; ++i){
            num_points_in_parts[i] = this->new_part_xadj[i];
        }

        //write the permutation array here.
        //get the part of the coordinate i, shift it to obtain the new part number.
        //assign it to the end of the new part numbers pointer.
        //iterating backwards with a decrementing cursor keeps the original
        //relative order of coordinates within each part (stable placement).
        for(mj_lno_t i = this->num_local_coords - 1; i >= 0; --i){
            mj_part_t part = part_shifts[mj_part_t(this->assigned_part_ids[i])];
            this->new_coordinate_permutations[--num_points_in_parts[part]] = i;
        }

        freeArray<mj_lno_t>(num_points_in_parts);
        freeArray<mj_part_t>(part_shifts);
    }
}
/*! \brief Top-level migration step: decides whether to migrate, computes the
 * part-to-processor plan, moves the data, shrinks the communicator, and
 * rebuilds the permutation arrays.
 *
 * \param input_num_parts current number of parts.
 * \param output_num_parts output: number of parts this rank owns afterwards.
 * \param next_future_num_parts_in_parts in/out: future part counts, rewritten
 *        to cover only the parts this rank keeps.
 * \param output_part_begin_index in/out: global part-numbering offset.
 * \param migration_reduce_all_population threshold input for the migration check.
 * \param num_coords_for_last_dim_part threshold input for the migration check.
 * \param iteration iteration id string used in timer labels.
 * \param input_part_boxes in/out: part boxes; swapped with output on migration.
 * \param output_part_boxes in/out: part boxes; swapped with input on migration.
 * \return true if migration was performed, false if it was skipped.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
    typename mj_part_t>
bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_perform_migration(
    mj_part_t input_num_parts, //current umb parts
    mj_part_t &output_num_parts, //output umb parts.
    std::vector<mj_part_t> *next_future_num_parts_in_parts,
    mj_part_t &output_part_begin_index,
    size_t migration_reduce_all_population,
    mj_lno_t num_coords_for_last_dim_part,
    std::string iteration,
    RCP<mj_partBoxVector_t> &input_part_boxes,
    RCP<mj_partBoxVector_t> &output_part_boxes
    )
{
    mj_part_t num_procs = this->comm->getSize();
    this->myRank = this->comm->getRank();

    //this array holds how many points each processor has in each part.
    //to access how many points processor i has on part j,
    //num_points_in_all_processor_parts[i * num_parts + j]
    //the extra (num_procs+1)-th row holds the global (reduced) totals.
    mj_gno_t *num_points_in_all_processor_parts = allocMemory<mj_gno_t>(input_num_parts * (num_procs + 1));

    //get the number of coordinates in each part in each processor.
    this->get_processor_num_points_in_parts(
            num_procs,
            input_num_parts,
            num_points_in_all_processor_parts);

    //check if migration will be performed or not.
    if (!this->mj_check_to_migrate(
            migration_reduce_all_population,
            num_coords_for_last_dim_part,
            num_procs,
            input_num_parts,
            num_points_in_all_processor_parts)){
        freeArray<mj_gno_t>(num_points_in_all_processor_parts);
        return false;
    }

    mj_lno_t *send_count_to_each_proc = NULL;
    int *coordinate_destinations = allocMemory<int>(this->num_local_coords);
    send_count_to_each_proc = allocMemory<mj_lno_t>(num_procs);
    for (int i = 0; i < num_procs; ++i) send_count_to_each_proc[i] = 0;

    std::vector<mj_part_t> processor_ranks_for_subcomm;
    std::vector<mj_part_t> out_part_indices;

    //determine which processors are assigned to which parts
    this->mj_migration_part_proc_assignment(
            num_points_in_all_processor_parts,
            input_num_parts,
            num_procs,
            send_count_to_each_proc,
            processor_ranks_for_subcomm,
            next_future_num_parts_in_parts,
            output_num_parts,
            out_part_indices,
            output_part_begin_index,
            coordinate_destinations);

    freeArray<mj_lno_t>(send_count_to_each_proc);
    std::vector <mj_part_t> tmpv;

    std::sort (out_part_indices.begin(), out_part_indices.end());
    mj_part_t outP = out_part_indices.size();

    mj_gno_t new_global_num_points = 0;
    //global totals live in the last row of the 2D count array.
    mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * input_num_parts;

    if (this->mj_keep_part_boxes){
        input_part_boxes->clear();
    }

    //now we calculate the new values for next_future_num_parts_in_parts.
    //same for the part boxes.
    for (mj_part_t i = 0; i < outP; ++i){
        mj_part_t ind = out_part_indices[i];
        new_global_num_points += global_num_points_in_parts[ind];
        tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
        if (this->mj_keep_part_boxes){
            input_part_boxes->push_back((*output_part_boxes)[ind]);
        }
    }
    //swap the input and output part boxes.
    if (this->mj_keep_part_boxes){
        RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
        input_part_boxes = output_part_boxes;
        output_part_boxes = tmpPartBoxes;
    }
    //rewrite next_future_num_parts_in_parts to cover only the kept parts.
    next_future_num_parts_in_parts->clear();
    for (mj_part_t i = 0; i < outP; ++i){
        mj_part_t p = tmpv[i];
        next_future_num_parts_in_parts->push_back(p);
    }

    freeArray<mj_gno_t>(num_points_in_all_processor_parts);

    mj_lno_t num_new_local_points = 0;

    //perform the actual migration operation here.
    this->mj_migrate_coords(
            num_procs,
            num_new_local_points,
            iteration,
            coordinate_destinations,
            input_num_parts);

    freeArray<int>(coordinate_destinations);

    //resize the permutation work arrays when the local count changed.
    if(this->num_local_coords != num_new_local_points){
        freeArray<mj_lno_t>(this->new_coordinate_permutations);
        freeArray<mj_lno_t>(this->coordinate_permutations);

        this->new_coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
        this->coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
    }
    this->num_local_coords = num_new_local_points;
    this->num_global_coords = new_global_num_points;

    //create subcommunicator.
    this->create_sub_communicator(processor_ranks_for_subcomm);
    processor_ranks_for_subcomm.clear();

    //fill the new permutation arrays.
    this->fill_permutation_array(
            output_num_parts,
            input_num_parts);
    return true;
}
/*! \brief Sequential (single-thread) variant of chunk creation used by the
 * task mapper: groups coordinates into parts, placing points that sit
 * exactly on cut lines in a deterministic order rather than randomly.
 *
 * \param num_parts number of parts to create in this chunk.
 * \param mj_current_dim_coords coordinates along the current dimension.
 * \param current_concurrent_cut_coordinate positions of the num_parts-1 cuts.
 * \param coordinate_begin begin index (into coordinate_permutations).
 * \param coordinate_end end index (exclusive).
 * \param used_local_cut_line_weight_to_left weight each cut must keep on its left.
 * \param out_part_xadj output: end index of each part (prefix-summed).
 * \param coordInd index of the current dimension; the sort key for on-cut
 *        points cycles through the OTHER dimensions starting at coordInd+1.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
    typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_consistent_chunks(
    mj_part_t num_parts,
    mj_scalar_t *mj_current_dim_coords,
    mj_scalar_t *current_concurrent_cut_coordinate,
    mj_lno_t coordinate_begin,
    mj_lno_t coordinate_end,
    mj_scalar_t *used_local_cut_line_weight_to_left,
    mj_lno_t *out_part_xadj,
    int coordInd){

    //mj_lno_t numCoordsInPart =  coordinateEnd - coordinateBegin;
    mj_part_t no_cuts = num_parts - 1;

    //this routine runs sequentially, so only thread 0's work arrays are used.
    int me = 0;
    mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
    mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;

    //now if the rectilinear partitioning is allowed we decide how
    //much weight each thread should put to left and right.
    if (this->distribute_points_on_cut_lines){

        my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
        for (mj_part_t i = 0; i < no_cuts; ++i){
            //the left to be put on the left of the cut.
            mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
            //distribute the required left weight over the threads, each thread
            //contributing at most the weight it holds on this cut.
            for(int ii = 0; ii < this->num_threads; ++ii){
                if(left_weight > this->sEpsilon){
                    //the weight of thread ii on cut.
                    mj_scalar_t thread_ii_weight_on_cut = this->thread_part_weight_work[ii][i * 2 + 1] - this->thread_part_weight_work[ii][i * 2 ];
                    if(thread_ii_weight_on_cut < left_weight){
                        this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
                    }
                    else {
                        this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
                    }
                    left_weight -= thread_ii_weight_on_cut;
                }
                else {
                    this->thread_cut_line_weight_to_put_left[ii][i] = 0;
                }
            }
        }

        if(no_cuts > 0){
            //this is a special case. If cutlines share the same coordinate, their weights are equal.
            //we need to adjust the ratio for that.
            for (mj_part_t i = no_cuts - 1; i > 0 ; --i){
                if(ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
                    my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
                }
                //truncate to a fixed significance so tiny floating point noise
                //does not change the point distribution.
                my_local_thread_cut_weights_to_put_left[i] = int ((my_local_thread_cut_weights_to_put_left[i] + LEAST_SIGNIFICANCE) * SIGNIFICANCE_MUL)
                                    / mj_scalar_t(SIGNIFICANCE_MUL);
            }
        }
    }

    for(mj_part_t ii = 0; ii < num_parts; ++ii){
        thread_num_points_in_parts[ii] = 0;
    }

    //for this specific case we dont want to distribute the points along the cut position
    //randomly, as we need a specific ordering of them. Instead,
    //we put the coordinates into a sort item, where we sort those
    //using the coordinates of points on other dimensions and the index.

    //some of the cuts might share the same position.
    //in this case, if cut i and cut j share the same position
    //cut_map[i] = cut_map[j] = sort item index.
    mj_part_t *cut_map = allocMemory<mj_part_t> (no_cuts);

    typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
    typedef std::vector< multiSItem > multiSVector;
    typedef std::vector<multiSVector> multiS2Vector;

    //to keep track of the memory allocated.
    std::vector<mj_scalar_t *>allocated_memory;

    //vector for which the coordinates will be sorted.
    multiS2Vector sort_vector_points_on_cut;

    //the number of cuts that have different coordinates.
    mj_part_t different_cut_count = 1;
    cut_map[0] = 0;

    //now we insert 1 sort vector for all cuts on the different
    //positins.if multiple cuts are on the same position, they share sort vectors.
    multiSVector tmpMultiSVector;
    sort_vector_points_on_cut.push_back(tmpMultiSVector);

    for (mj_part_t i = 1; i < no_cuts ; ++i){
        //if cuts share the same cut coordinates
        //set the cutmap accordingly.
        if(ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
            cut_map[i] = cut_map[i-1];
        }
        else {
            cut_map[i] = different_cut_count++;
            multiSVector tmp2MultiSVector;
            sort_vector_points_on_cut.push_back(tmp2MultiSVector);
        }
    }

    //now the actual part assigment.
    //assigned_part_ids encodes 2*p for "strictly inside part p" and 2*p+1
    //for "on the cut bounding part p".
    for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){

        mj_lno_t i = this->coordinate_permutations[ii];

        mj_part_t pp = this->assigned_part_ids[i];
        mj_part_t p = pp / 2;
        //if the coordinate is on a cut.
        if(pp % 2 == 1 ){
            mj_scalar_t *vals = allocMemory<mj_scalar_t>(this->coord_dim -1);
            allocated_memory.push_back(vals);

            //we insert the coordinates to the sort item here.
            //the key is the coordinate values on the other dimensions,
            //starting from the dimension after the current one, wrapping.
            int val_ind = 0;
            for(int dim = coordInd + 1; dim < this->coord_dim; ++dim){
                vals[val_ind++] = this->mj_coordinates[dim][i];
            }
            for(int dim = 0; dim < coordInd; ++dim){
                vals[val_ind++] = this->mj_coordinates[dim][i];
            }
            multiSItem tempSortItem(i, this->coord_dim -1, vals);
            //inser the point to the sort vector pointed by the cut_map[p].
            mj_part_t cmap = cut_map[p];
            sort_vector_points_on_cut[cmap].push_back(tempSortItem);
        }
        else {
            //if it is not on the cut, simple sorting.
            ++thread_num_points_in_parts[p];
            this->assigned_part_ids[i] = p;
        }
    }

    //sort all the sort vectors.
    for (mj_part_t i = 0; i < different_cut_count; ++i){
        std::sort (sort_vector_points_on_cut[i].begin(), sort_vector_points_on_cut[i].end());
    }

    //we do the part assignment for the points on cuts here.
    mj_part_t previous_cut_map = cut_map[0];

    //this is how much previous part owns the weight of the current part.
    //when target part weight is 1.6, and the part on the left is given 2,
    //the left has an extra 0.4, while the right has missing 0.4 from the previous cut.
    //this parameter is used to balance this issues.
    //in the above example weight_stolen_from_previous_part will be 0.4.
    //if the left part target is 2.2 but it is given 2,
    //then weight_stolen_from_previous_part will be -0.2.
    mj_scalar_t weight_stolen_from_previous_part = 0;
    for (mj_part_t p = 0; p < no_cuts; ++p){

        mj_part_t mapped_cut = cut_map[p];

        //if previous cut map is done, and it does not have the same index,
        //then assign all points left on that cut to its right.
        if (previous_cut_map != mapped_cut){
            mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
            for (; sort_vector_end >= 0; --sort_vector_end){
                multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
                mj_lno_t i = t.index;
                ++thread_num_points_in_parts[p];
                this->assigned_part_ids[i] = p;
            }
            sort_vector_points_on_cut[previous_cut_map].clear();
        }
        mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size() - 1;

        //pop points from the back of the sorted vector while part p still has
        //capacity for them.
        for (; sort_vector_end >= 0; --sort_vector_end){
            multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
            mj_lno_t i = t.index;
            mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];

            //part p has enough space for point i, then put it to point i.
            if( my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part> this->sEpsilon &&
                    my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - ABS(my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - w)
                    > this->sEpsilon){

                my_local_thread_cut_weights_to_put_left[p] -= w;
                sort_vector_points_on_cut[mapped_cut].pop_back();
                ++thread_num_points_in_parts[p];
                this->assigned_part_ids[i] = p;
                //if putting this weight to left overweights the left cut, then
                //increase the space for the next cut using weight_stolen_from_previous_part.
                if(p < no_cuts - 1 && my_local_thread_cut_weights_to_put_left[p] < this->sEpsilon){
                    if(mapped_cut == cut_map[p + 1] ){
                        //if the cut before the cut indexed at p was also at the same position
                        //special case, as we handle the weight differently here.
                        if (previous_cut_map != mapped_cut){
                            weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
                        }
                        else {
                            //if the cut before the cut indexed at p was also at the same position
                            //we assign extra weights cumulatively in this case.
                            weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
                        }
                    }
                    else{
                        weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
                    }
                    //end assignment for part p
                    break;
                }
            } else {
                //if part p does not have enough space for this point
                //and if there is another cut sharing the same positon,
                //again increase the space for the next
                if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]){
                    if (previous_cut_map != mapped_cut){
                        weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
                    }
                    else {
                        weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
                    }
                }
                else{
                    weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
                }
                //end assignment for part p
                break;
            }
        }
        previous_cut_map = mapped_cut;
    }
    //put everything left on the last cut to the last part.
    mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
    for (; sort_vector_end >= 0; --sort_vector_end){
        multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
        mj_lno_t i = t.index;
        ++thread_num_points_in_parts[no_cuts];
        this->assigned_part_ids[i] = no_cuts;
    }
    sort_vector_points_on_cut[previous_cut_map].clear();
    freeArray<mj_part_t> (cut_map);

    //free the memory allocated for vertex sort items .
    mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
    for(mj_lno_t i = 0; i < vSize; ++i){
        freeArray<mj_scalar_t> (allocated_memory[i]);
    }

    //creation of part_xadj as in usual case.
    for(mj_part_t j = 0; j < num_parts; ++j){
        mj_lno_t num_points_in_part_j_upto_thread_i = 0;
        for (int i = 0; i < this->num_threads; ++i){
            mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
            this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
            num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;

        }
        out_part_xadj[j] = num_points_in_part_j_upto_thread_i;// + prev2; //+ coordinateBegin;
    }

    //perform prefix sum for num_points in parts.
    for(mj_part_t j = 1; j < num_parts; ++j){
        out_part_xadj[j] += out_part_xadj[j - 1];
    }

    //shift the num points in threads thread to obtain the
    //beginning index of each thread's private space.
    for(mj_part_t j = 1; j < num_parts; ++j){
        thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
    }

    //now thread gets the coordinate and writes the index of coordinate to the permutation array
    //using the part index we calculated.
    for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
        mj_lno_t i = this->coordinate_permutations[ii];
        mj_part_t p =  this->assigned_part_ids[i];
        this->new_coordinate_permutations[coordinate_begin +
                                          thread_num_points_in_parts[p]++] = i;
    }
}
/*! \brief Writes the final global part number of every local coordinate into
 * assigned_part_ids, and — if data was migrated — ships gnos and part ids
 * back to the original owning processors.
 *
 * \param current_num_parts number of parts this rank currently owns.
 * \param output_part_begin_index global numbering offset for this rank's parts.
 * \param output_part_boxes boxes of the final parts (kept when boxes are on).
 * \param is_data_ever_migrated true if any migration happened; triggers the
 *        reverse communication back to the original owners.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
    typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_final_parts(
    mj_part_t current_num_parts,
    mj_part_t output_part_begin_index,
    RCP<mj_partBoxVector_t> output_part_boxes,
    bool is_data_ever_migrated)
{
    this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Part_Assignment");

#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    //each local part i spans [part_xadj[i-1], part_xadj[i]) in the
    //permutation array; stamp every coordinate in it with its global id.
    for(mj_part_t i = 0; i < current_num_parts;++i){

        mj_lno_t begin = 0;
        mj_lno_t end = this->part_xadj[i];

        if(i > 0) begin = this->part_xadj[i -1];
        mj_part_t part_to_set_index = i + output_part_begin_index;
        if (this->mj_keep_part_boxes){
            (*output_part_boxes)[i].setpId(part_to_set_index);
        }
        for (mj_lno_t ii = begin; ii < end; ++ii){
            mj_lno_t k = this->coordinate_permutations[ii];
            this->assigned_part_ids[k] = part_to_set_index;
        }
    }

    //ArrayRCP<const mj_gno_t> gnoList;
    if(!is_data_ever_migrated){
        //nothing to send back: coordinates never left their owners.
        //freeArray<mj_gno_t>(this->current_mj_gnos);
        //if(this->num_local_coords > 0){
        //    gnoList = arcpFromArrayView(this->mj_gnos);
        //}
    }
    else {
#ifdef ENABLE_ZOLTAN_MIGRATION
        if (sizeof(mj_lno_t) <= sizeof(int)) {

            // Cannot use Zoltan_Comm with local ordinals larger than ints.
            // In Zoltan_Comm_Create, the cast int(this->num_local_coords)
            // may overflow.

            //if data is migrated, then send part numbers to the original owners.
            ZOLTAN_COMM_OBJ *plan = NULL;
            MPI_Comm mpi_comm = Teuchos2MPI (this->mj_problemComm);

            int incoming = 0;
            int message_tag = 7856;

            this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final Z1PlanCreating");
            int ierr = Zoltan_Comm_Create( &plan, int(this->num_local_coords),
                    this->owner_of_coordinate, mpi_comm, message_tag,
                    &incoming);
            Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
            this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final Z1PlanCreating" );

            mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(incoming);

            //send back the gnos first, then the part ids, under distinct tags.
            message_tag++;
            this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final Z1PlanComm");
            ierr = Zoltan_Comm_Do( plan, message_tag, (char *) this->current_mj_gnos,
                    sizeof(mj_gno_t), (char *) incoming_gnos);
            Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);

            freeArray<mj_gno_t>(this->current_mj_gnos);
            this->current_mj_gnos = incoming_gnos;

            mj_part_t *incoming_partIds = allocMemory< mj_part_t>(incoming);

            message_tag++;
            ierr = Zoltan_Comm_Do( plan, message_tag, (char *) this->assigned_part_ids,
                    sizeof(mj_part_t), (char *) incoming_partIds);
            Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
            freeArray<mj_part_t>(this->assigned_part_ids);
            this->assigned_part_ids = incoming_partIds;

            this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final Z1PlanComm");
            ierr = Zoltan_Comm_Destroy(&plan);
            Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);

            this->num_local_coords = incoming;
            //gnoList = arcp(this->current_mj_gnos, 0, this->num_local_coords, true);
        }
        else

#endif // !ENABLE_ZOLTAN_MIGRATION
        {
            //if data is migrated, then send part numbers to the original owners.
            this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final DistributorPlanCreating");
            Tpetra::Distributor distributor(this->mj_problemComm);
            ArrayView<const mj_part_t> owners_of_coords(this->owner_of_coordinate, this->num_local_coords);
            mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
            this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final DistributorPlanCreating" );

            this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final DistributorPlanComm");
            //migrate gnos to actual owners.
            ArrayRCP<mj_gno_t> received_gnos(incoming);
            ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
            distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
            freeArray<mj_gno_t>(this->current_mj_gnos);
            this->current_mj_gnos = allocMemory<mj_gno_t>(incoming);
            memcpy(
                    this->current_mj_gnos,
                    received_gnos.getRawPtr(),
                    incoming * sizeof(mj_gno_t));

            //migrate part ids to actual owners.
            ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
            ArrayRCP<mj_part_t> received_partids(incoming);
            distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
            freeArray<mj_part_t>(this->assigned_part_ids);
            this->assigned_part_ids = allocMemory<mj_part_t>(incoming);
            memcpy(
                    this->assigned_part_ids,
                    received_partids.getRawPtr(),
                    incoming * sizeof(mj_part_t));

            this->num_local_coords = incoming;
            this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final DistributorPlanComm");

        }
    }

    this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Part_Assignment");

    this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Solution_Part_Assignment");

    //ArrayRCP<mj_part_t> partId;
    //partId = arcp(this->assigned_part_ids, 0, this->num_local_coords, true);

    if (this->mj_keep_part_boxes){
        this->kept_boxes = output_part_boxes;
    }

    this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Solution_Part_Assignment");
}
// (continuation of the Tpetra::Distributor branch started above:
// exchange the part ids exactly as the gnos were exchanged.)
    ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
    ArrayRCP<mj_part_t> received_partids(incoming);
    distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
    freeArray<mj_part_t>(this->assigned_part_ids);
    this->assigned_part_ids = allocMemory<mj_part_t>(incoming);
    // Copy out of the ArrayRCP so the class keeps raw-pointer ownership.
    memcpy(
        this->assigned_part_ids,
        received_partids.getRawPtr(),
        incoming * sizeof(mj_part_t));

    this->num_local_coords = incoming;
    this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final DistributorPlanComm");

  }
  }

  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Part_Assignment");

  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Solution_Part_Assignment");

  //ArrayRCP<mj_part_t> partId;
  //partId = arcp(this->assigned_part_ids, 0, this->num_local_coords, true);

  // Preserve the computed bounding boxes for later queries (pointAssign).
  if (this->mj_keep_part_boxes){
    this->kept_boxes = output_part_boxes;
  }

  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Solution_Part_Assignment");
}

/*! \brief Frees all scratch memory allocated by allocate_set_work_memory()
 * (and the copies of the input arrays made there).
 *
 * Every freeArray below pairs with an allocMemory performed during setup —
 * presumably in allocate_set_work_memory(); confirm against the full file.
 * After this call the object's work pointers are dangling; the instance is
 * not reusable without re-running setup.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::free_work_memory(){
  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Free");

  // Per-dimension coordinate copies, then the array of pointers itself.
  for (int i=0; i < this->coord_dim; i++){
    freeArray<mj_scalar_t>(this->mj_coordinates[i]);
  }
  freeArray<mj_scalar_t *>(this->mj_coordinates);

  // Per-criterion weight copies.
  for (int i=0; i < this->num_weights_per_coord; i++){
    freeArray<mj_scalar_t>(this->mj_weights[i]);
  }
  freeArray<mj_scalar_t *>(this->mj_weights);

  freeArray<int>(this->owner_of_coordinate);

  // Per-thread scratch counters.
  for(int i = 0; i < this->num_threads; ++i){
    freeArray<mj_lno_t>(this->thread_point_counts[i]);
  }

  freeArray<mj_lno_t *>(this->thread_point_counts);
  freeArray<double *>(this->thread_part_weight_work);

  // These buffers are only allocated when points on a cut line may be
  // split between the two sides, so only free them in that mode.
  if(this->distribute_points_on_cut_lines){
    freeArray<mj_scalar_t>(this->process_cut_line_weight_to_put_left);
    for(int i = 0; i < this->num_threads; ++i){
      freeArray<mj_scalar_t>(this->thread_cut_line_weight_to_put_left[i]);
    }
    freeArray<mj_scalar_t *>(this->thread_cut_line_weight_to_put_left);
    freeArray<mj_scalar_t>(this->process_rectilinear_cut_weight);
    freeArray<mj_scalar_t>(this->global_rectilinear_cut_weight);
  }

  // Cut-search state.
  freeArray<mj_part_t>(this->my_incomplete_cut_count);
  freeArray<mj_scalar_t>(this->max_min_coords);
  freeArray<mj_lno_t>(this->new_part_xadj);
  freeArray<mj_lno_t>(this->coordinate_permutations);
  freeArray<mj_lno_t>(this->new_coordinate_permutations);
  freeArray<mj_scalar_t>(this->all_cut_coordinates);
  freeArray<mj_scalar_t> (this->process_local_min_max_coord_total_weight);
  freeArray<mj_scalar_t> (this->global_min_max_coord_total_weight);
  freeArray<mj_scalar_t>(this->cut_coordinates_work_array);
  freeArray<mj_scalar_t>(this->target_part_weights);
  freeArray<mj_scalar_t>(this->cut_upper_bound_coordinates);
  freeArray<mj_scalar_t>(this->cut_lower_bound_coordinates);
  freeArray<mj_scalar_t>(this->cut_lower_bound_weights);
  freeArray<mj_scalar_t>(this->cut_upper_bound_weights);
  freeArray<bool>(this->is_cut_line_determined);
  freeArray<mj_scalar_t>(this->total_part_weight_left_right_closests);
  freeArray<mj_scalar_t>(this->global_total_part_weight_left_right_closests);

  // Per-thread weight/closest-point scratch.
  for(int i = 0; i < this->num_threads; ++i){
    freeArray<double>(this->thread_part_weights[i]);
    freeArray<mj_scalar_t>(this->thread_cut_right_closest_point[i]);
    freeArray<mj_scalar_t>(this->thread_cut_left_closest_point[i]);
  }

  freeArray<double *>(this->thread_part_weights);
  freeArray<mj_scalar_t *>(this->thread_cut_left_closest_point);
  freeArray<mj_scalar_t *>(this->thread_cut_right_closest_point);

  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Free");
}

/*! \brief Stores the tuning knobs used during partitioning.
 *
 *  \param distribute_points_on_cut_lines_ if true, coordinates lying
 *         exactly on a cut may be split between the two sides.
 *  \param max_concurrent_part_calculation_ number of parts whose cuts
 *         are computed concurrently (batched reductions).
 *  \param check_migrate_avoid_migration_option_ migration policy flag
 *         (see the class member's documentation).
 *  \param minimum_migration_imbalance_ imbalance threshold below which
 *         migration is skipped when MJ decides.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_partitioning_parameters(
    bool distribute_points_on_cut_lines_,
    int max_concurrent_part_calculation_,
    int check_migrate_avoid_migration_option_,
    mj_scalar_t minimum_migration_imbalance_){
  this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
  this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
  this->check_migrate_avoid_migration_option = check_migrate_avoid_migration_option_;
  this->minimum_migration_imbalance = minimum_migration_imbalance_;

}

/*! \brief Top-level entry point of the multi-jagged partitioner.
 *
 *  Copies/borrows the input arrays into the object's members, then runs
 *  recursion_depth_ rounds of 1-D partitioning (body continues below).
 *  On return, result_assigned_part_ids_ and result_mj_gnos_ point to
 *  heap arrays of length this->num_local_coords; ownership passes to
 *  the caller.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::multi_jagged_part(

    const RCP<const Environment> &env,
    RCP<Comm<int> > &problemComm,

    double imbalance_tolerance_,
    size_t num_global_parts_,
    mj_part_t *part_no_array_,
    int recursion_depth_,

    int coord_dim_,
    mj_lno_t num_local_coords_,
    mj_gno_t num_global_coords_,
    const mj_gno_t *initial_mj_gnos_,
    mj_scalar_t **mj_coordinates_,

    int num_weights_per_coord_,
    bool *mj_uniform_weights_,
    mj_scalar_t **mj_weights_,
    bool *mj_uniform_parts_,
    mj_scalar_t **mj_part_sizes_,

    mj_part_t *&result_assigned_part_ids_,
    mj_gno_t *&result_mj_gnos_
    )
{

#ifdef print_debug
  // Debug-only; NOTE(review): 'comm' here is the member that is only
  // assigned further below (this->comm = ...duplicate()) — if this block
  // is ever compiled in, verify the intended communicator.
  if(comm->getRank() == 0){
    std::cout << "size of gno:" << sizeof(mj_gno_t) << std::endl;
    std::cout << "size of lno:" << sizeof(mj_lno_t) << std::endl;
// (continuation of the #ifdef print_debug block opened above)
    std::cout << "size of mj_scalar_t:" << sizeof(mj_scalar_t) << std::endl;
  }
#endif

  this->mj_env = env;
  this->mj_problemComm = problemComm;
  this->myActualRank = this->myRank = this->mj_problemComm->getRank();

  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Total");
  this->mj_env->debug(3, "In MultiJagged Jagged");

  {
    // Stash all inputs in members; the pointer members borrow the
    // caller's arrays here (per the comments, copies are made later,
    // presumably in allocate_set_work_memory — confirm in full file).
    this->imbalance_tolerance = imbalance_tolerance_;
    this->num_global_parts = num_global_parts_;
    this->part_no_array = part_no_array_;
    this->recursion_depth = recursion_depth_;

    this->coord_dim = coord_dim_;
    this->num_local_coords = num_local_coords_;
    this->num_global_coords = num_global_coords_;
    this->mj_coordinates = mj_coordinates_; //will copy the memory to this->mj_coordinates.
    this->initial_mj_gnos = (mj_gno_t *) initial_mj_gnos_; //will copy the memory to this->current_mj_gnos[j].

    this->num_weights_per_coord = num_weights_per_coord_;
    this->mj_uniform_weights = mj_uniform_weights_;
    this->mj_weights = mj_weights_; //will copy the memory to this->mj_weights
    this->mj_uniform_parts = mj_uniform_parts_;
    this->mj_part_sizes = mj_part_sizes_;

    // Discover the OpenMP thread count (1 without OpenMP).
    this->num_threads = 1;
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel

    {
      this->num_threads = omp_get_num_threads();
    }
#endif
  }
  //this->set_input_data();
  this->set_part_specifications();

  this->allocate_set_work_memory();

  //We duplicate the comm as we create subcommunicators during migration.
  //We keep the problemComm as it is, while comm changes after each migration.
  this->comm = this->mj_problemComm->duplicate();

  //initially there is a single partition
  mj_part_t current_num_parts = 1;
  mj_scalar_t *current_cut_coordinates = this->all_cut_coordinates;

  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Partitioning");

  mj_part_t output_part_begin_index = 0;
  mj_part_t future_num_parts = this->total_num_part;
  bool is_data_ever_migrated = false;

  // Double-buffered "how many more parts does each part still need"
  // vectors, swapped every iteration.
  std::vector<mj_part_t> *future_num_part_in_parts = new std::vector<mj_part_t> ();
  std::vector<mj_part_t> *next_future_num_parts_in_parts = new std::vector<mj_part_t> ();
  next_future_num_parts_in_parts->push_back(this->num_global_parts);

  // Double-buffered part bounding boxes (only maintained when requested).
  RCP<mj_partBoxVector_t> input_part_boxes(new mj_partBoxVector_t(), true) ;
  RCP<mj_partBoxVector_t> output_part_boxes(new mj_partBoxVector_t(), true);

  compute_global_box();
  if(this->mj_keep_part_boxes){
    this->init_part_boxes(output_part_boxes);
  }

  // ---- Main recursion: one 1-D partitioning pass per level. ----
  for (int i = 0; i < this->recursion_depth; ++i){

    //partitioning array. size will be as the number of current partitions and this
    //holds how many parts that each part will be in the current dimension partitioning.
    std::vector <mj_part_t> num_partitioning_in_current_dim;

    //future_num_part_in_parts is as the size of current number of parts,
    //and holds how many more parts each should be divided in the further
    //iterations; used to compute num_partitioning_in_current_dim.
    //next_future_num_parts_in_parts will be sized by the output part count
    //and holds how many more parts each output part should be divided;
    //also used to determine the weight ratios of the parts.
    //swap the arrays to use iteratively..
    std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
    future_num_part_in_parts = next_future_num_parts_in_parts;
    next_future_num_parts_in_parts = tmpPartVect;

    //clear next_future_num_parts_in_parts array as
    //getPartitionArrays expects it to be empty.
    //it also expects num_partitioning_in_current_dim to be empty as well.
    next_future_num_parts_in_parts->clear();

    if(this->mj_keep_part_boxes){
      RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
      input_part_boxes = output_part_boxes;
      output_part_boxes = tmpPartBoxes;
      output_part_boxes->clear();
    }

    //returns the total no. of output parts for this dimension partitioning.
    mj_part_t output_part_count_in_dimension =
        this->update_part_num_arrays(
            num_partitioning_in_current_dim,
            future_num_part_in_parts,
            next_future_num_parts_in_parts,
            future_num_parts,
            current_num_parts,
            i,
            input_part_boxes,
            output_part_boxes);

    //if the number of obtained parts equals the current number of parts,
    //skip this dimension. For example, this happens when 1 is given in the
    //input part array: P=4,5,1,2
    if(output_part_count_in_dimension == current_num_parts) {
      //still need to swap the input output arrays.
      tmpPartVect= future_num_part_in_parts;
      future_num_part_in_parts = next_future_num_parts_in_parts;
      next_future_num_parts_in_parts = tmpPartVect;

      if(this->mj_keep_part_boxes){
        RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
        input_part_boxes = output_part_boxes;
        output_part_boxes = tmpPartBoxes;
      }
      continue;
    }

    //get the coordinate axis along which the partitioning will be done;
    //dimensions are cycled round-robin.
    int coordInd = i % this->coord_dim;
    mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];

    //convert i to string to be used for debugging purposes.
    std::string istring = toString<int>(i);
    this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Partitioning_" + istring);

    //alloc memory to point the indices
    //of the parts in the permutation array.
    this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);

    //the index where in the new_part_xadj will be written.
    mj_part_t output_part_index = 0;
    //whatever is written to output_part_index will be added with
    //output_coordinate_end_index so that the points will be shifted.
    mj_part_t output_coordinate_end_index = 0;

    mj_part_t current_work_part = 0;
    mj_part_t current_concurrent_num_parts =
        std::min(current_num_parts - current_work_part, this->max_concurrent_part_calculation);

    mj_part_t obtained_part_index = 0;

    //run for all available parts, processing up to
    //max_concurrent_part_calculation of them per pass.
    for (; current_work_part < current_num_parts;
         current_work_part += current_concurrent_num_parts){

      current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
          this->max_concurrent_part_calculation);

      mj_part_t actual_work_part_count = 0;
      //initialization for 1D partitioning:
      //get the min and max coordinates of each part
      //together with the part weights of each part.
      for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
        mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;

        //if this part wont be partitioned any further
        //dont do any work for this part.
        if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
          continue;
        }
        ++actual_work_part_count;
        // part_xadj is a prefix-sum: part k's coordinates are
        // [part_xadj[k-1], part_xadj[k]) in the permutation array.
        mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
        mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];

        this->mj_get_local_min_max_coord_totW(
            coordinate_begin_index,
            coordinate_end_index,
            this->coordinate_permutations,
            mj_current_dim_coords,
            this->process_local_min_max_coord_total_weight[kk], //min_coordinate
            this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts], //max_coordinate
            this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]); //total_weight

      }

      //1D partitioning
      if (actual_work_part_count > 0){
        //obtain global min/max of the concurrent parts in one reduction.
        this->mj_get_global_min_max_coord_totW(
            current_concurrent_num_parts,
            this->process_local_min_max_coord_total_weight,
            this->global_min_max_coord_total_weight);

        //represents the total number of cutlines
        //whose coordinate should be determined.
        mj_part_t total_incomplete_cut_count = 0;

        //Compute weight ratios for parts & cuts:
        //e.g., 0.25 0.25 0.5 0.5 0.75 0.75 1
        //part0 cut0 part1 cut1 part2 cut2 part3
        mj_part_t concurrent_part_cut_shift = 0;
        mj_part_t concurrent_part_part_shift = 0;
        for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
          mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
          mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
              current_concurrent_num_parts];

          mj_scalar_t global_total_weight = this->global_min_max_coord_total_weight[kk +
              2 * current_concurrent_num_parts];

          mj_part_t concurrent_current_part_index = current_work_part + kk;

          mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];

          mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
          mj_scalar_t *current_target_part_weights = this->target_part_weights +
              concurrent_part_part_shift;
          //shift the usedCutCoordinate array as noCuts.
          concurrent_part_cut_shift += partition_count - 1;
          //shift the partRatio array as noParts.
          concurrent_part_part_shift += partition_count;

          //calculate only if part is not empty,
          //and part will be further partitioned.
          if(partition_count > 1 && min_coordinate <= max_coordinate){

            //increase the number of cuts to be determined by the number of
            //cuts of the current part.
            total_incomplete_cut_count += partition_count - 1;
            //set the number of cut lines that should be determined
            //for this part.
            this->my_incomplete_cut_count[kk] = partition_count - 1;

            //get the target weights of the parts.
            this->mj_get_initial_cut_coords_target_weights(
                min_coordinate,
                max_coordinate,
                partition_count - 1,
                global_total_weight,
                usedCutCoordinate,
                current_target_part_weights,
                future_num_part_in_parts,
                next_future_num_parts_in_parts,
                concurrent_current_part_index,
                obtained_part_index);

            mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
            mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];

            //get the initial estimated part assignments of the
            //coordinates.
            this->set_initial_coordinate_parts(
                max_coordinate,
                min_coordinate,
                concurrent_current_part_index,
                coordinate_begin_index, coordinate_end_index,
                this->coordinate_permutations,
                mj_current_dim_coords,
                this->assigned_part_ids,
                partition_count);
          }
          else {
            // e.g., if there are fewer coordinates than parts, don't need
            // to do the next dimension.
            this->my_incomplete_cut_count[kk] = 0;
          }
          obtained_part_index += partition_count;
        }

        //used imbalance, it is always 0, as it is difficult to
        //estimate a range.
        mj_scalar_t used_imbalance = 0;

        // Determine cut lines for all concurrent parts here.
        this->mj_1D_part(
            mj_current_dim_coords,
            used_imbalance,
            current_work_part,
            current_concurrent_num_parts,
            current_cut_coordinates,
            total_incomplete_cut_count,
            num_partitioning_in_current_dim);
      }

      //create new part chunks
      {
        mj_part_t output_array_shift = 0;
        mj_part_t cut_shift = 0;
        size_t tlr_shift = 0;
        size_t partweight_array_shift = 0;

        for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
          mj_part_t current_concurrent_work_part = current_work_part + kk;
          mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];

          //if the part is empty (global min > global max), skip the part.
          if((num_parts != 1 )
              &&
              this->global_min_max_coord_total_weight[kk] >
              this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {

            //we still need to write the begin and end point of the
            //empty part. simply set it zero, the array indices will be shifted later.
            for(mj_part_t jj = 0; jj < num_parts; ++jj){
              this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
            }
            cut_shift += num_parts - 1;
            tlr_shift += (4 *(num_parts - 1) + 1);
            output_array_shift += num_parts;
            partweight_array_shift += (2 * (num_parts - 1) + 1);
            continue;
          }

          mj_lno_t coordinate_end= this->part_xadj[current_concurrent_work_part];
          mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[
              current_concurrent_work_part -1];
          mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
          mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
              cut_shift;

          //mj_scalar_t *used_tlr_array = this->total_part_weight_left_right_closests + tlr_shift;

          for(int ii = 0; ii < this->num_threads; ++ii){
            this->thread_part_weight_work[ii] = this->thread_part_weights[ii] + partweight_array_shift;
          }

          if(num_parts > 1){
            if(this->mj_keep_part_boxes){
              //if part boxes are to be stored update the boundaries:
              //each interior cut becomes the max of box j and min of box j+1.
              for (mj_part_t j = 0; j < num_parts - 1; ++j){
                (*output_part_boxes)[output_array_shift + output_part_index +
                    j].updateMinMax(current_concurrent_cut_coordinate[j], 1
                    /*update max*/, coordInd);

                (*output_part_boxes)[output_array_shift + output_part_index + j +
                    1].updateMinMax(current_concurrent_cut_coordinate[j], 0
                    /*update min*/, coordInd);
              }
            }

            // Rewrite the permutation indices based on the computed cuts.
            this->mj_create_new_partitions(
                num_parts,
                mj_current_dim_coords,
                current_concurrent_cut_coordinate,
                coordinate_begin,
                coordinate_end,
                used_local_cut_line_weight_to_left,
                this->thread_part_weight_work,
                this->new_part_xadj + output_part_index + output_array_shift
                );

          }
          else {
            //if this part is partitioned into 1 then just copy
            //the old values.
            mj_lno_t part_size = coordinate_end - coordinate_begin;
            *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
            memcpy(
                this->new_coordinate_permutations + coordinate_begin,
                this->coordinate_permutations + coordinate_begin,
                part_size * sizeof(mj_lno_t));
          }
          cut_shift += num_parts - 1;
          tlr_shift += (4 *(num_parts - 1) + 1);
          output_array_shift += num_parts;
          partweight_array_shift += (2 * (num_parts - 1) + 1);
        }

        //shift cut coordinates so that all cut coordinates are stored.
        //no shift now because we dont keep the cuts.
        //current_cut_coordinates += cut_shift;

        //mj_create_new_partitions wrote the indices as if each group were
        //a single part; now shift the beginning indices so new_part_xadj
        //is a global prefix-sum.
        for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
          mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
          for (mj_part_t ii = 0;ii < num_parts ; ++ii){
            //shift it by previousCount
            this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
          }
          //increase the previous count by current end.
          output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
          //increase the current out.
          output_part_index += num_parts ;
        }
      }
    }
    // end of this partitioning dimension

    int current_world_size = this->comm->getSize();
    long migration_reduce_all_population = this->total_dim_num_reduce_all * current_world_size;

    bool is_migrated_in_current_dimension = false;

    //we migrate if there are more partitionings to be done after this step
    //and if the migration is not forced to be avoided,
    //and the operation is not sequential.
    if (future_num_parts > 1 &&
        this->check_migrate_avoid_migration_option >= 0 &&
        current_world_size > 1){

      this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Migration-" + istring);
      mj_part_t num_parts = output_part_count_in_dimension;
      if ( this->mj_perform_migration(
              num_parts,
              current_num_parts, //output
              next_future_num_parts_in_parts, //output
              output_part_begin_index,
              migration_reduce_all_population,
              this->num_local_coords / (future_num_parts * current_num_parts),
              istring,
              input_part_boxes, output_part_boxes) ) {
        is_migrated_in_current_dimension = true;
        is_data_ever_migrated = true;
        this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Migration-" +
            istring);
        //since data is migrated, we reduce the number of reduceAll
        //operations for the last part.
        this->total_dim_num_reduce_all /= num_parts;
      }
      else {
        is_migrated_in_current_dimension = false;
        this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Migration-" + istring);
      }
    }

    //swap the coordinate permutations for the next dimension.
    mj_lno_t * tmp = this->coordinate_permutations;
    this->coordinate_permutations = this->new_coordinate_permutations;
    this->new_coordinate_permutations = tmp;

    if(!is_migrated_in_current_dimension){
      this->total_dim_num_reduce_all -= current_num_parts;
      current_num_parts = output_part_count_in_dimension;
    }
    freeArray<mj_lno_t>(this->part_xadj);
    this->part_xadj = this->new_part_xadj;

    this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Partitioning_" + istring);
  }

  // Partitioning is done
  delete future_num_part_in_parts;
  delete next_future_num_parts_in_parts;

  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Partitioning");

  //get the final parts of each initial coordinate;
  //the results will be written to
  //this->assigned_part_ids for gnos given in this->current_mj_gnos
  this->set_final_parts(
      current_num_parts,
      output_part_begin_index,
      output_part_boxes,
      is_data_ever_migrated);

  // Hand the result arrays to the caller (caller takes ownership;
  // free_work_memory below does not free these two).
  result_assigned_part_ids_ = this->assigned_part_ids;
  result_mj_gnos_ = this->current_mj_gnos;

  this->free_work_memory();
  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Total");
  this->mj_env->debug(3, "Out of MultiJagged");

}

/*! \brief Zoltan2 algorithm wrapper that adapts a CoordinateModel to the
 *  template-level AlgMJ partitioner: extracts raw arrays from the model,
 *  reads parameters from the environment, runs AlgMJ, and writes the
 *  result into a PartitioningSolution.
 */
template <typename Adapter>
class Zoltan2_AlgMJ : public Algorithm<Adapter>
{
private:

#ifndef DOXYGEN_SHOULD_SKIP_THIS

  typedef CoordinateModel<typename Adapter::base_adapter_t> coordinateModel_t;
  typedef typename Adapter::scalar_t mj_scalar_t;
  typedef typename Adapter::gno_t mj_gno_t;
  typedef typename Adapter::lno_t mj_lno_t;
  typedef typename Adapter::node_t mj_node_t;
  typedef typename Adapter::part_t mj_part_t;
  typedef coordinateModelPartBox<mj_scalar_t, mj_part_t> mj_partBox_t;
  typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
#endif
  AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t> mj_partitioner; // the actual partitioner

  RCP<const Environment> mj_env; //the environment object
  RCP<Comm<int> > mj_problemComm; //initial comm object
  RCP<const coordinateModel_t> mj_coords; //coordinate adapter

  //PARAMETERS
  double imbalance_tolerance; //input imbalance tolerance (stored as the excess over 1.0; see set_input_parameters).
  size_t num_global_parts; //the targeted number of parts
  mj_part_t *part_no_array; //input part array specifying num part to divide along each dim.
  int recursion_depth; //the number of steps that partitioning will be solved in.

  int coord_dim; // coordinate dimension.
  mj_lno_t num_local_coords; //number of local coords.
  mj_gno_t num_global_coords; //number of global coords.
  const mj_gno_t *initial_mj_gnos; //initial global ids of the coordinates.
  mj_scalar_t **mj_coordinates; //two dimensional coordinate array

  int num_weights_per_coord; // number of weights per coordinate
  bool *mj_uniform_weights; //if the coordinates have uniform weights.
  mj_scalar_t **mj_weights; //two dimensional weight array
  bool *mj_uniform_parts; //if the target parts are uniform
  mj_scalar_t **mj_part_sizes; //target part weight sizes.

  bool distribute_points_on_cut_lines; //if partitioning can distribute points on same coordinate to different parts.
  mj_part_t max_concurrent_part_calculation; // how many parts we can calculate concurrently.
  int check_migrate_avoid_migration_option; //whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
  mj_scalar_t minimum_migration_imbalance; //when MJ decides whether to migrate, the minimum imbalance for migration.
  int mj_keep_part_boxes; //if the boxes need to be kept.

  int num_threads; // OpenMP thread count (1 without OpenMP).

  int mj_run_as_rcb; //if this is set, then recursion depth is adjusted to its maximum value.

  // Pulls sizes, coordinate/weight pointers and target-part info out of
  // the coordinate model and the solution into the members above.
  void set_up_partitioning_data(
      const RCP<PartitioningSolution<Adapter> >&solution);

  // Reads the MJ-specific entries of the parameter list into the members above.
  void set_input_parameters(const Teuchos::ParameterList &p);

  // Frees the arrays allocated by set_up_partitioning_data.
  void free_work_memory();
public:

  Zoltan2_AlgMJ(const RCP<const Environment> &env,
      RCP<Comm<int> > &problemComm,
      const RCP<const coordinateModel_t> &coords) :
        mj_partitioner(), mj_env(env),
        mj_problemComm(problemComm),
        mj_coords(coords),
        imbalance_tolerance(0),
        num_global_parts(1), part_no_array(NULL),
        recursion_depth(0),
        coord_dim(0),num_local_coords(0), num_global_coords(0),
        initial_mj_gnos(NULL), mj_coordinates(NULL),
        num_weights_per_coord(0),
        mj_uniform_weights(NULL), mj_weights(NULL),
        mj_uniform_parts(NULL),
        mj_part_sizes(NULL),
        distribute_points_on_cut_lines(true),
        max_concurrent_part_calculation(1),
        check_migrate_avoid_migration_option(0),
        minimum_migration_imbalance(0.30),
        mj_keep_part_boxes(0), num_threads(1), mj_run_as_rcb(0)
  {}
  ~Zoltan2_AlgMJ(){}

  /*! \brief Multi Jagged coordinate partitioning algorithm.
   *  \param solution a PartitioningSolution, on input it contains part
   *  information; on output it also contains the part assignments.
   */
  void partition(const RCP<PartitioningSolution<Adapter> > &solution);

  // Maps a point to the part whose kept box contains it
  // (requires mj_keep_part_boxes).
  mj_part_t pointAssign(int dim, mj_scalar_t *point) const;
};

/*! \brief Runs the full MJ pipeline: extract inputs, read parameters,
 *  configure and run the core partitioner, then store gno->part results
 *  (and optionally the part boxes) in the solution.
 */
template <typename Adapter>
void Zoltan2_AlgMJ<Adapter>::partition(
    const RCP<PartitioningSolution<Adapter> > &solution
)
{
  this->set_up_partitioning_data(solution);
  this->set_input_parameters(this->mj_env->getParameters());
  if (this->mj_keep_part_boxes){
    this->mj_partitioner.set_to_keep_part_boxes();
  }
  this->mj_partitioner.set_partitioning_parameters(
      this->distribute_points_on_cut_lines,
      this->max_concurrent_part_calculation,
      this->check_migrate_avoid_migration_option,
      this->minimum_migration_imbalance);

  // Output arrays; ownership is returned by multi_jagged_part and then
  // transferred to the ArrayRCPs below (arcp(..., true)).
  mj_part_t *result_assigned_part_ids = NULL;
  mj_gno_t *result_mj_gnos = NULL;

  this->mj_partitioner.multi_jagged_part(
      this->mj_env,
      this->mj_problemComm,

      this->imbalance_tolerance,
      this->num_global_parts,
      this->part_no_array,
      this->recursion_depth,

      this->coord_dim,
      this->num_local_coords,
      this->num_global_coords,
      this->initial_mj_gnos,
      this->mj_coordinates,

      this->num_weights_per_coord,
      this->mj_uniform_weights,
      this->mj_weights,
      this->mj_uniform_parts,
      this->mj_part_sizes,

      result_assigned_part_ids,
      result_mj_gnos
      );

  ArrayRCP<const mj_gno_t> gnoList = arcp(result_mj_gnos, 0,
      this->num_local_coords, true);
  ArrayRCP<mj_part_t> partId = arcp(result_assigned_part_ids, 0,
      this->num_local_coords, true);
  solution->setParts(gnoList, partId, true);
  if (this->mj_keep_part_boxes){
    RCP<mj_partBoxVector_t> output_part_boxes =
        this->mj_partitioner.get_part_boxes();
    solution->setPartBoxes(output_part_boxes);
  }
  this->free_work_memory();
}

/* \brief Frees the pointer arrays allocated by set_up_partitioning_data.
 * Note: frees only the arrays-of-pointers; the pointed-to coordinate and
 * weight data stay owned by the coordinate model.
 * */
template <typename Adapter>
void Zoltan2_AlgMJ<Adapter>::free_work_memory(){
  freeArray<mj_scalar_t *>(this->mj_coordinates);
  freeArray<mj_scalar_t *>(this->mj_weights);
  freeArray<bool>(this->mj_uniform_parts);
  freeArray<mj_scalar_t *>(this->mj_part_sizes);
  freeArray<bool>(this->mj_uniform_weights);

}

/* \brief Sets the partitioning data for multijagged algorithm.
 * */
template <typename Adapter>
void Zoltan2_AlgMJ<Adapter>::set_up_partitioning_data(
    const RCP<PartitioningSolution<Adapter> > &solution
)
{
  this->coord_dim = this->mj_coords->getCoordinateDim();
  this->num_weights_per_coord = this->mj_coords->getNumWeightsPerCoordinate();
  this->num_local_coords = this->mj_coords->getLocalNumCoordinates();
  this->num_global_coords = this->mj_coords->getGlobalNumCoordinates();
  // Criteria arrays must have at least one slot even with zero weights.
  int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);

  // From the Solution we get part information.
  // If the part sizes for a given criteria are not uniform,
  // then they are values that sum to 1.0.
  this->num_global_parts = solution->getTargetGlobalNumberOfParts();
  //allocate only two dimensional pointer.
  //raw pointer addresses will be obtained from multivector.
  this->mj_coordinates = allocMemory<mj_scalar_t *>(this->coord_dim);
  this->mj_weights = allocMemory<mj_scalar_t *>(criteria_dim);

  //if the partitioning results are to be uniform.
  this->mj_uniform_parts = allocMemory< bool >(criteria_dim);
  //if in a criteria dimension, uniform part is false this shows ratios of
  //the target part weights.
  this->mj_part_sizes = allocMemory<mj_scalar_t *>(criteria_dim);
  //if the weights of coordinates are uniform in a criteria dimension.
  this->mj_uniform_weights = allocMemory< bool >(criteria_dim);

  typedef StridedData<mj_lno_t, mj_scalar_t> input_t;
  ArrayView<const mj_gno_t> gnos;
  ArrayView<input_t> xyz;
  ArrayView<input_t> wgts;

  this->mj_coords->getCoordinates(gnos, xyz, wgts);
  //obtain global ids.
  ArrayView<const mj_gno_t> mj_gnos = gnos;
  // NOTE(review): borrows the model's gno storage — valid only while
  // mj_coords (and the views it returned) are alive.
  this->initial_mj_gnos = mj_gnos.getRawPtr();

  //extract raw coordinate pointers from the multivector (borrowed, not copied).
  for (int dim=0; dim < this->coord_dim; dim++){
    ArrayRCP<const mj_scalar_t> ar;
    xyz[dim].getInputArray(ar);
    //multiJagged coordinate values assignment
    this->mj_coordinates[dim] = (mj_scalar_t *)ar.getRawPtr();
  }

  //if no weights are provided set uniform weight.
  if (this->num_weights_per_coord == 0){
    this->mj_uniform_weights[0] = true;
    this->mj_weights[0] = NULL;
  }
  else{
    //if weights are provided get weights for all weight indices
    for (int wdim = 0; wdim < this->num_weights_per_coord; wdim++){
      ArrayRCP<const mj_scalar_t> ar;
      wgts[wdim].getInputArray(ar);
      this->mj_uniform_weights[wdim] = false;
      this->mj_weights[wdim] = (mj_scalar_t *) ar.getRawPtr();
    }
  }

  for (int wdim = 0; wdim < criteria_dim; wdim++){
    if (solution->criteriaHasUniformPartSizes(wdim)){
      this->mj_uniform_parts[wdim] = true;
      this->mj_part_sizes[wdim] = NULL;
    }
    else{
      // Hard failure: non-uniform target part sizes are unsupported.
      std::cerr << "MJ does not support non uniform target part weights" << std::endl;
      exit(1);
    }
  }
}

/* \brief Sets the partitioning parameters for multijagged algorithm.
 * \param pl: is the parameter list provided to zoltan2 call
 * */
template <typename Adapter>
void Zoltan2_AlgMJ<Adapter>::set_input_parameters(const Teuchos::ParameterList &pl){

  const Teuchos::ParameterEntry *pe = pl.getEntryPtr("imbalance_tolerance");
  if (pe){
    double tol;
    tol = pe->getValue(&tol);
    // The user supplies e.g. 1.1; internally we keep the excess over 1.0
    // (0.1 here).
    this->imbalance_tolerance = tol - 1.0;
  }

  // TODO: May be a more relaxed tolerance is needed. RCB uses 10%
  // NOTE(review): 10e-4 == 1e-3; confirm the intended floor was not 1e-4.
  if (this->imbalance_tolerance <= 0)
    this->imbalance_tolerance= 10e-4;

  //if an input partitioning array is provided.
  this->part_no_array = NULL;
  //the length of the input partitioning array.
this->recursion_depth = 0;

// "mj_parts" optionally gives the number of parts per recursion level;
// its length minus one fixes the recursion depth.
if (pl.getPtr<Array <mj_part_t> >("mj_parts")){
  this->part_no_array = (mj_part_t *) pl.getPtr<Array <mj_part_t> >("mj_parts")->getRawPtr();
  this->recursion_depth = pl.getPtr<Array <mj_part_t> >("mj_parts")->size() - 1;
  this->mj_env->debug(2, "mj_parts provided by user");
}

//get mj specific parameters.
this->distribute_points_on_cut_lines = true;
this->max_concurrent_part_calculation = 1;

this->mj_run_as_rcb = 0;
int mj_user_recursion_depth = -1;
this->mj_keep_part_boxes = 0;
this->check_migrate_avoid_migration_option = 0;
this->minimum_migration_imbalance = 0.35;

pe = pl.getEntryPtr("mj_minimum_migration_imbalance");
if (pe){
  double imb;
  imb = pe->getValue(&imb);
  // Stored as excess over 1.0, same convention as imbalance_tolerance.
  this->minimum_migration_imbalance = imb - 1.0;
}

pe = pl.getEntryPtr("mj_migration_option");
if (pe){
  this->check_migrate_avoid_migration_option = pe->getValue(&this->check_migrate_avoid_migration_option);
}else {
  this->check_migrate_avoid_migration_option = 0;
}
// NOTE(review): option values > 1 are collapsed to the sentinel -1 --
// presumably "force migration"; confirm against multi_jagged_part.
if (this->check_migrate_avoid_migration_option > 1) this->check_migrate_avoid_migration_option = -1;


pe = pl.getEntryPtr("mj_concurrent_part_count");
if (pe){
  this->max_concurrent_part_calculation = pe->getValue(&this->max_concurrent_part_calculation);
}else {
  this->max_concurrent_part_calculation = 1; // Set to 1 if not provided.
}

pe = pl.getEntryPtr("mj_keep_part_boxes");
if (pe){
  this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
}else {
  this->mj_keep_part_boxes = 0; // Set to invalid value
}

// For now, need keep_part_boxes to do pointAssign and boxAssign.
// pe = pl.getEntryPtr("keep_cuts");
// if (pe){
//      int tmp = pe->getValue(&tmp);
//      if (tmp) this->mj_keep_part_boxes = 1;
// }

//need to keep part boxes if mapping type is geometric.
if (this->mj_keep_part_boxes == 0){
  pe = pl.getEntryPtr("mapping_type");
  if (pe){
    int mapping_type = -1;
    mapping_type = pe->getValue(&mapping_type);
    if (mapping_type == 0){
      mj_keep_part_boxes = 1;
    }
  }
}

// "mj_enable_rcb" makes MJ emulate recursive coordinate bisection
// (two parts per recursion level).
pe = pl.getEntryPtr("mj_enable_rcb");
if (pe){
  this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
}else {
  this->mj_run_as_rcb = 0; // Set to invalid value
}

pe = pl.getEntryPtr("mj_recursion_depth");
if (pe){
  mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
}else {
  mj_user_recursion_depth = -1; // Set to invalid value
}

// "rectilinear" == 1 disables spreading coincident points across a cut.
int val = 0;
pe = pl.getEntryPtr("rectilinear");
if (pe) val = pe->getValue(&val);
if (val == 1){
  this->distribute_points_on_cut_lines = false;
} else {
  this->distribute_points_on_cut_lines = true;
}

if (this->mj_run_as_rcb){
  // RCB bisects at every level: depth = ceil(log2(num_global_parts)).
  mj_user_recursion_depth = (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
}
if (this->recursion_depth < 1){
  if (mj_user_recursion_depth > 0){
    this->recursion_depth = mj_user_recursion_depth;
  }
  else {
    // Default: one recursion level per coordinate dimension.
    this->recursion_depth = this->coord_dim;
  }
}

// Detect the number of OpenMP threads available to MJ (1 without OpenMP).
// Every thread writes the same value, so the unsynchronized store is benign.
this->num_threads = 1;
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel
{
  this->num_threads = omp_get_num_threads();
}
#endif

}

/* \brief Returns the id of the part whose kept bounding box contains the
 * given point (requires mj_keep_part_boxes to have been enabled).
 * \param point: coordinates of the point, at least 'dim' entries.
 * */
template <typename Adapter>
typename Adapter::part_t Zoltan2_AlgMJ<Adapter>::pointAssign(
  int dim,
  typename Adapter::scalar_t *point) const
{

  // TODO: Implement with cuts rather
// than boxes to reduce algorithmic
// TODO: complexity.

  if (this->mj_keep_part_boxes) {
    typename Adapter::part_t foundPart = -1;

    // Get vector of part boxes
    RCP<mj_partBoxVector_t> partBoxes;
    try {
      partBoxes = this->mj_partitioner.get_part_boxes();
    }
    Z2_FORWARD_EXCEPTIONS;

    size_t nBoxes = (*partBoxes).size();
    if (nBoxes == 0) {
      throw std::logic_error("no part boxes exist");
    }
    //{
    //int me;
    //MPI_Comm_rank(MPI_COMM_WORLD, &me);
    //for (size_t i = 0; i < nBoxes; i++)
    //  printf("%d KDDKDD BOX %d PART %d (%f %f) x (%f %f)\n", me, i, (*partBoxes)[i].getpId(), (*partBoxes)[i].getlmins()[0], (*partBoxes)[i].getlmins()[1], (*partBoxes)[i].getlmaxs()[0], (*partBoxes)[i].getlmaxs()[1]);
    //}


    // Determine whether the point is within the global domain
    RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();

    if (globalBox->pointInBox(dim, point)) {

      // point is in the global domain; determine in which part it is.
      // Linear scan over all part boxes; first hit wins.
      size_t i;
      for (i = 0; i < nBoxes; i++) {
        try {
          if ((*partBoxes)[i].pointInBox(dim, point)) {
            foundPart = (*partBoxes)[i].getpId();
            // std::cout << "Point (";
            // for (int j = 0; j < dim; j++) std::cout << point[j] << " ";
            // std::cout << ") found in box " << i << " part " << foundPart
            //           << std::endl;
            // (*partBoxes)[i].print();
            break;
          }
        }
        Z2_FORWARD_EXCEPTIONS;
      }

      if (i == nBoxes) {
        // This error should never occur: the part boxes tile the global
        // box, so a point inside the global box must be inside some part.
        std::ostringstream oss;
        oss << "Point (";
        for (int j = 0; j < dim; j++) oss << point[j] << " ";
        oss << ") not found in domain";
        throw std::logic_error(oss.str());
      }
    }

    else {
      // Point is outside the global domain.
      // Determine to which part it is closest.
      // TODO: with cuts, would not need this special case

      // Pick the part whose box centroid has the smallest squared
      // Euclidean distance to the point.
      size_t closestBox = 0;
      mj_scalar_t minDistance = std::numeric_limits<mj_scalar_t>::max();
      mj_scalar_t *centroid = new mj_scalar_t[dim];
      for (size_t i = 0; i < nBoxes; i++) {
        (*partBoxes)[i].computeCentroid(centroid);
        mj_scalar_t sum = 0.;
        mj_scalar_t diff;
        for (int j = 0; j < dim; j++) {
          diff = centroid[j] - point[j];
          sum += diff * diff;
        }
        if (sum < minDistance) {
          minDistance = sum;
          closestBox = i;
        }
      }
      foundPart = (*partBoxes)[closestBox].getpId();
      delete [] centroid;
    }

    return foundPart;
  }
  else {
    // Part boxes were not retained; pointAssign cannot work without them.
    throw std::logic_error("need to use keep_cuts parameter for pointAssign");
  }
}

} // namespace Zoltan2

#endif
1.7.6.1