Zoltan2
Zoltan2_AlgMultiJagged.hpp
Go to the documentation of this file.
00001 // @HEADER
00002 //
00003 // ***********************************************************************
00004 //
00005 //   Zoltan2: A package of combinatorial algorithms for scientific computing
00006 //                  Copyright 2012 Sandia Corporation
00007 //
00008 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00009 // the U.S. Government retains certain rights in this software.
00010 //
00011 // Redistribution and use in source and binary forms, with or without
00012 // modification, are permitted provided that the following conditions are
00013 // met:
00014 //
00015 // 1. Redistributions of source code must retain the above copyright
00016 // notice, this list of conditions and the following disclaimer.
00017 //
00018 // 2. Redistributions in binary form must reproduce the above copyright
00019 // notice, this list of conditions and the following disclaimer in the
00020 // documentation and/or other materials provided with the distribution.
00021 //
00022 // 3. Neither the name of the Corporation nor the names of the
00023 // contributors may be used to endorse or promote products derived from
00024 // this software without specific prior written permission.
00025 //
00026 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00027 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00028 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00029 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00030 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00031 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00032 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00033 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00034 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00035 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00036 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00037 //
00038 // Questions? Contact Karen Devine      (kddevin@sandia.gov)
00039 //                    Erik Boman        (egboman@sandia.gov)
00040 //                    Siva Rajamanickam (srajama@sandia.gov)
00041 //
00042 // ***********************************************************************
00043 //
00044 // @HEADER
00045 
00050 #ifndef _ZOLTAN2_ALGMultiJagged_HPP_
00051 #define _ZOLTAN2_ALGMultiJagged_HPP_
00052 
00053 #include <Zoltan2_MultiJagged_ReductionOps.hpp>
00054 #include <Zoltan2_CoordinateModel.hpp>
00055 #include <Zoltan2_Parameters.hpp>
00056 #include <Zoltan2_Algorithm.hpp>
00057 
00058 #include <Tpetra_Distributor.hpp>
00059 #include <Teuchos_ParameterList.hpp>
00060 #include <Zoltan2_CoordinatePartitioningGraph.hpp>
00061 #include <new>          // ::operator new[]
00062 #include <algorithm>    // std::sort
00063 #include <Zoltan2_Util.hpp>
00064 #include <vector>
00065 
00066 #ifdef HAVE_ZOLTAN2_ZOLTAN
00067 #ifdef HAVE_ZOLTAN2_MPI
00068 #define ENABLE_ZOLTAN_MIGRATION
00069 #include "zoltan_comm_cpp.h"
00070 #include "zoltan_types.h" // for error codes
00071 #endif
00072 #endif
00073 
00074 
00075 #ifdef HAVE_ZOLTAN2_OMP
00076 #include <omp.h>
00077 #endif
00078 
00079 #define LEAST_SIGNIFICANCE 0.0001
00080 #define SIGNIFICANCE_MUL 1000
00081 
00082 //if the (last dimension reduce all count) x the mpi world size
00083 //estimated to be bigger than this number then migration will be forced
00084 //in earlier iterations.
00085 #define FUTURE_REDUCEALL_CUTOFF 1500000
00086 //if parts right before last dimension are estimated to have less than
00087 //MIN_WORK_LAST_DIM many coords, migration will be forced in earlier iterations.
00088 #define MIN_WORK_LAST_DIM 1000
00089 
00090 
00091 
00092 
// Absolute value helper.
#define ABS(x) ((x) >= 0 ? (x) : -(x))
//imbalance calculation. Wreal / Wexpected - 1
//Bodies are fully parenthesized so the macros expand safely when used
//inside a larger expression (e.g. `2 * imbalanceOf2(a, b)`).
#define imbalanceOf(Wachieved, totalW, expectedRatio) \
        ((Wachieved) / ((totalW) * (expectedRatio)) - 1)
#define imbalanceOf2(Wachieved, wExpected) \
        ((Wachieved) / (wExpected) - 1)
00099 
00100 
00101 
00102 using std::vector;
00103 
00104 namespace Zoltan2{
00105 
/*! \brief Allocates an array of T with the given number of elements.
 *  \param size the number of elements to allocate; 0 is allowed.
 *  \return pointer to the new array, or NULL when size == 0.
 *  \throws std::bad_alloc (propagated from operator new[]) on failure.
 *
 *  Note: the previous NULL check on the result of new[] was dead code;
 *  a non-nothrow new[] reports failure by throwing, never by returning
 *  NULL, so the check (and the unreachable `throw "..."`) was removed.
 */
template <typename T>
T *allocMemory(size_t size){
    if (size > 0){
        // new[] throws std::bad_alloc on failure; no NULL check needed.
        return new T[size];
    }
    else {
        return NULL;
    }
}
00122 
/*! \brief Frees the given array and resets the caller's pointer to NULL.
 *  Calling it with a NULL pointer is a safe no-op, so the same pointer
 *  may be freed more than once.
 *  \param array reference to the pointer to be freed and nulled.
 */
template <typename T>
void freeArray(T *&array){
    if (array == NULL) {
        return; // nothing to release
    }
    delete [] array;
    array = NULL;
}
00133 
/*! \brief Converts an object to its string representation by streaming
 *  it into a stringstream and extracting a single whitespace-delimited
 *  token back out (so leading whitespace is skipped and extraction
 *  stops at the first whitespace, as with the original).
 *  \param obj the object to convert; must support operator<<.
 *  \return the extracted token as a std::string.
 */
template <typename tt>
std::string toString(tt obj){
    std::stringstream converter(std::stringstream::in | std::stringstream::out);
    converter << obj;
    std::string token = "";
    converter >> token;
    return token;
}
00145 
/*! \brief Item for sorting with multiple comparison values.
 *  Comparison walks the value array in order: values closer than
 *  _EPSILON are treated as equal and the next value decides; if all
 *  values tie, the index breaks the tie, giving a deterministic total
 *  order.
 *
 *  \tparam IT index type, \tparam CT value-count type, \tparam WT value type.
 *  Note: \c val is a NON-OWNING pointer; the caller manages its lifetime
 *  (the destructor intentionally does not free it).
 *
 *  Changes vs. original: operator= now returns by reference (was by
 *  value, forcing a useless copy) and also copies _EPSILON, for
 *  consistency with the copy constructor; the ABS macro dependency was
 *  replaced with an explicit expression.
 */
template <typename IT, typename CT, typename WT>
class uMultiSortItem
{
public:
    IT index;    // original position of the item; final tie-breaker
    CT count;    // number of comparison values pointed to by val
    //unsigned int val;
    WT *val;     // non-owning pointer to the comparison values
    WT _EPSILON; // tolerance under which two values compare equal

    uMultiSortItem(){
        this->index = 0;
        this->count = 0;
        this->val = NULL;
        this->_EPSILON = std::numeric_limits<WT>::epsilon() * 100;
    }


    uMultiSortItem(IT index_ ,CT count_, WT *vals_){
        this->index = index_;
        this->count = count_;
        this->val = vals_;
        this->_EPSILON = std::numeric_limits<WT>::epsilon() * 100;
    }

    uMultiSortItem( const uMultiSortItem<IT,CT,WT>& other ){
        this->index = other.index;
        this->count = other.count;
        this->val = other.val;
        this->_EPSILON = other._EPSILON;
    }

    ~uMultiSortItem(){
        // val is not owned by this class; intentionally not freed here.
        //freeArray<WT>(this->val);
    }

    void set(IT index_ ,CT count_, WT *vals_){
        this->index = index_;
        this->count = count_;
        this->val = vals_;
    }


    // Returns *this by reference (idiomatic assignment), and copies
    // _EPSILON as well, matching the copy constructor.
    uMultiSortItem<IT,CT,WT>& operator=(const uMultiSortItem<IT,CT,WT>& other){
        this->index = other.index;
        this->count = other.count;
        this->val = other.val;
        this->_EPSILON = other._EPSILON;
        return *(this);
    }

    bool operator<(const uMultiSortItem<IT,CT,WT>& other) const{
        assert (this->count == other.count);
        for(CT i = 0; i < this->count; ++i){
            WT diff = this->val[i] - other.val[i];
            //if the values are equal (within tolerance) go to next one.
            if ((diff >= 0 ? diff : -diff) < this->_EPSILON){
                continue;
            }
            //first differing value decides the order.
            return this->val[i] < other.val[i];
        }
        //if they are totally equal, order by index.
        return this->index < other.index;
    }
    bool operator>(const uMultiSortItem<IT,CT,WT>& other) const{
        assert (this->count == other.count);
        for(CT i = 0; i < this->count; ++i){
            WT diff = this->val[i] - other.val[i];
            //if the values are equal (within tolerance) go to next one.
            if ((diff >= 0 ? diff : -diff) < this->_EPSILON){
                continue;
            }
            //first differing value decides the order.
            return this->val[i] > other.val[i];
        }
        //if they are totally equal, order by index.
        return this->index > other.index;
    }
};// uMultiSortItem;
00243 
/*! \brief Plain (id, value) pair used during sorting of coordinates.
 *  Sorted ascending by \c val by uqsort(); \c id is carried along.
 */
template <class IT, class WT>
struct uSortItem
{
    IT id;   // identifier carried along with the sort key
    //unsigned int val;
    WT val;  // the sort key
};// uSortItem;
00254 
/*! \brief Quick sort: sorts arr[0..n-1] in place, ascending by val.
 *  Iterative quicksort in the Numerical-Recipes style: explicit stack
 *  of subarray bounds instead of recursion, median-of-three style
 *  pivot sampling, and insertion sort for small subarrays.
 *  \param n   number of elements in arr.
 *  \param arr array of uSortItem to be sorted in place.
 *  Exits the program if the bookkeeping stack overflows (NSTACK too small).
 */
template <class IT, class WT>
void uqsort(IT n, uSortItem<IT, WT> * arr)
{
#define SWAP(a,b,temp) temp=(a);(a)=(b);(b)=temp;
    int NSTACK = 50; // capacity of istack below (pairs of subarray bounds)
    int M = 7;       // subarrays with <= M elements are insertion-sorted
    IT         i, ir=n, j, k, l=1;
    IT         jstack=0, istack[50];
    WT aval;
    uSortItem<IT,WT>    a, temp;

    // Shift the pointer so the array is addressed 1-based: arr[1..n].
    --arr;
    for (;;)
    {
        // Small subarray: finish it with insertion sort.
        if (ir-l < M)
        {
            for (j=l+1;j<=ir;j++)
            {
                a=arr[j];
                aval = a.val;
                for (i=j-1;i>=1;i--)
                {
                    if (arr[i].val <= aval)
                        break;
                    arr[i+1] = arr[i];
                }
                arr[i+1]=a;
            }
            if (jstack == 0)
                break; // no pending subarrays left: done
            // Pop the bounds of the next pending subarray.
            ir=istack[jstack--];
            l=istack[jstack--];
        }
        else
        {
            // Sample three elements (middle, l, ir) and order them so the
            // partition loop below has sentinels at both ends.
            k=(l+ir) >> 1;
            SWAP(arr[k],arr[l+1], temp)
            if (arr[l+1].val > arr[ir].val)
            {
                SWAP(arr[l+1],arr[ir],temp)
            }
            if (arr[l].val > arr[ir].val)
            {
                SWAP(arr[l],arr[ir],temp)
            }
            if (arr[l+1].val > arr[l].val)
            {
                SWAP(arr[l+1],arr[l],temp)
            }
            // Partition arr[l+1..ir] around the pivot value aval = arr[l].val.
            i=l+1;
            j=ir;
            a=arr[l];
            aval = a.val;
            for (;;)
            {
                do i++; while (arr[i].val < aval);
                do j--; while (arr[j].val > aval);
                if (j < i) break; // scan pointers crossed: partition done
                SWAP(arr[i],arr[j],temp);
            }
            // Place the pivot into its final position j.
            arr[l]=arr[j];
            arr[j]=a;
            // Push one side's bounds onto the stack, continue with the other.
            jstack += 2;
            if (jstack > NSTACK){
                std::cout << "uqsort: NSTACK too small in sort." << std::endl;
                exit(1);
            }
            if (ir-i+1 >= j-l)
            {
                istack[jstack]=ir;
                istack[jstack-1]=i;
                ir=j-1;
            }
            else
            {
                istack[jstack]=j-1;
                istack[jstack-1]=l;
                l=i;
            }
        }
    }
}
00340 
00341 
00342 
/*! \brief Multi Jagged (MJ) coordinate partitioning algorithm.
 *  Recursively partitions coordinates along one dimension at a time.
 *  This declaration holds the algorithm state (input views, work
 *  arrays, cut bookkeeping) plus the partitioning/migration steps.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
class AlgMJ
{
private:
    typedef coordinateModelPartBox<mj_scalar_t, mj_part_t> mj_partBox_t;
    typedef std::vector<mj_partBox_t> mj_partBoxVector_t;

    RCP<const Environment> mj_env; //the environment object
    RCP<Comm<int> > mj_problemComm; //initial comm object

    double imbalance_tolerance; //input imbalance tolerance.
    mj_part_t *part_no_array; //input part array specifying num part to divide along each dim.
    int recursion_depth; //the number of steps that partitioning will be solved in.
    int coord_dim, num_weights_per_coord; //coordinate dim and # of weights per coord

    size_t initial_num_loc_coords; //initial num local coords.
    global_size_t initial_num_glob_coords; //initial num global coords.

    mj_lno_t num_local_coords; //number of local coords.
    mj_gno_t num_global_coords; //number of global coords.

    mj_scalar_t **mj_coordinates; //two dimension coordinate array
    mj_scalar_t **mj_weights; //two dimension weight array
    bool *mj_uniform_parts; //if the target parts are uniform
    mj_scalar_t **mj_part_sizes; //target part weight sizes.
    bool *mj_uniform_weights; //if the coordinates have uniform weights.

    ArrayView<const mj_gno_t> mj_gnos; //global ids of the coordinates, comes from the input
    size_t num_global_parts; //the targeted number of parts

    mj_gno_t *initial_mj_gnos; //initial global ids of the coordinates.
    mj_gno_t *current_mj_gnos; //current global ids of the coordinates, might change during migration.
    int *owner_of_coordinate; //the actual processor owner of the coordinate, to track after migrations.

    mj_lno_t *coordinate_permutations; //permutation of coordinates, for partitioning.
    mj_lno_t *new_coordinate_permutations; //permutation work array.
    mj_part_t *assigned_part_ids; //the part ids assigned to coordinates.

    mj_lno_t *part_xadj; //beginning and end of each part.
    mj_lno_t *new_part_xadj; // work array for beginning and end of each part.

    //mj specific parameters.
    bool distribute_points_on_cut_lines; //if partitioning can distribute points on same coordinate to different parts.
    mj_part_t max_concurrent_part_calculation; // how many parts we can calculate concurrently.

    int mj_run_as_rcb; //if this is set, then recursion depth is adjusted to its maximum value.
    int mj_user_recursion_depth; //the recursion depth value provided by user.
    int mj_keep_part_boxes; //if the boxes need to be kept.

    int check_migrate_avoid_migration_option; //whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
    mj_scalar_t minimum_migration_imbalance; //when MJ decides whether to migrate, the minimum imbalance for migration.
    int num_threads; //num threads

    mj_part_t total_num_cut ; //how many cuts will be totally
    mj_part_t total_num_part;    //how many parts will be totally

    mj_part_t max_num_part_along_dim ;         //maximum part count along a dimension.
    mj_part_t max_num_cut_along_dim; //maximum cut count along a dimension.
    size_t max_num_total_part_along_dim; //maximum part+cut count along a dimension.

    mj_part_t total_dim_num_reduce_all;    //estimate on #reduceAlls can be done.
    mj_part_t last_dim_num_part; //max no of parts that might occur
                                //during the partition before the
                                //last partitioning dimension.

    RCP<Comm<int> > comm; //comm object that can be altered during execution
    float fEpsilon; //epsilon for float
    mj_scalar_t sEpsilon; //epsilon for mj_scalar_t

    mj_scalar_t maxScalar_t; //max possible scalar
    mj_scalar_t minScalar_t; //min scalar

    mj_scalar_t *all_cut_coordinates; //coordinates of all cut lines.
    mj_scalar_t *max_min_coords; //work array for min/max coordinate exchange.
    mj_scalar_t *process_cut_line_weight_to_put_left; //how much weight should a MPI put left side of the each cutline
    mj_scalar_t **thread_cut_line_weight_to_put_left; //how much weight percentage should each thread in MPI put left side of the each cutline

    // work array to manipulate coordinate of cutlines in different iterations.
    //necessary because previous cut line information is used for determining
    //the next cutline information. therefore, cannot update the cut work array
    //until all cutlines are determined.
    mj_scalar_t *cut_coordinates_work_array;

    //cumulative part weight array.
    mj_scalar_t *target_part_weights;

    mj_scalar_t *cut_upper_bound_coordinates ;  //upper bound coordinate of a cut line
    mj_scalar_t *cut_lower_bound_coordinates ;  //lower bound coordinate of a cut line
    mj_scalar_t *cut_lower_bound_weights ;  //lower bound weight of a cut line
    mj_scalar_t *cut_upper_bound_weights ;  //upper bound weight of a cut line

    mj_scalar_t *process_local_min_max_coord_total_weight ; //combined array to exchange the min and max coordinate, and total weight of part.
    mj_scalar_t *global_min_max_coord_total_weight ;//global combined array with the results for min, max and total weight.

    //isDone is used to determine if a cutline is determined already.
    //If a cut line is already determined, the next iterations will skip this cut line.
    bool *is_cut_line_determined;
    //my_incomplete_cut_count count holds the number of cutlines that have not been finalized for each part
    //when concurrentPartCount>1, using this information, if my_incomplete_cut_count[x]==0, then no work is done for this part.
    mj_part_t *my_incomplete_cut_count;
    //local part weights of each thread.
    double **thread_part_weights;
    //the work manipulation array for partweights.
    double **thread_part_weight_work;

    //thread_cut_left_closest_point to hold the closest coordinate to a cutline from left (for each thread).
    mj_scalar_t **thread_cut_left_closest_point;
    //thread_cut_right_closest_point to hold the closest coordinate to a cutline from right (for each thread)
    mj_scalar_t **thread_cut_right_closest_point;

    //to store how many points in each part a thread has.
    mj_lno_t **thread_point_counts;

    mj_scalar_t *process_rectilinear_cut_weight; //local weight on rectilinear cut lines.
    mj_scalar_t *global_rectilinear_cut_weight; //global weight on rectilinear cut lines.

    //for faster communication, concatenation of
    //totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
    //leftClosest distances sized P-1, since P-1 cut lines
    //rightClosest distances size P-1, since P-1 cut lines.
    mj_scalar_t *total_part_weight_left_right_closests ;
    mj_scalar_t *global_total_part_weight_left_right_closests;

    RCP<mj_partBoxVector_t> kept_boxes; //part boxes kept when requested.
    RCP<mj_partBox_t> global_box; //bounding box of the global domain.
    int myRank, myActualRank; //processor rank, and initial rank

    /* \brief Either the mj array (part_no_array) or num_global_parts should be provided in
     * the input. part_no_array takes
     * precedence if both are provided.
     * Depending on these parameters, total cut/part number,
     * maximum part/cut number along a dimension, estimated number of reduceAlls,
     * and the number of parts before the last dimension is calculated.
     * */
    void set_part_specifications();

    /* \brief Tries to determine the part number for current dimension,
     * by trying to make the partitioning as square as possible.
     * \param num_total_future how many more partitionings are required.
     * \param root how many more recursion depth is left.
     */
    inline mj_part_t get_part_count(
        mj_part_t num_total_future,
        double root);

    /* \brief Allocates the all required memory for the mj partitioning algorithm.
     *
     */
    void allocate_set_work_memory();

    /* \brief for part communication we keep track of the box boundaries.
     * This is performed when either asked specifically, or when geometric mapping is performed afterwards.
     * This function initializes a single box with all global min and max coordinates.
     * \param initial_partitioning_boxes the input and output vector for boxes.
     */
    void init_part_boxes(RCP<mj_partBoxVector_t> & outPartBoxes);

    /* \brief compute global bounding box:  min/max coords of global domain */
    void compute_global_box();

    /* \brief Function returns how many parts that will be obtained after this dimension partitioning.
     * It sets how many parts each current part will be partitioned into in this dimension to num_partitioning_in_current_dim vector,
     * sets how many total future parts each obtained part will be partitioned into in next_future_num_parts_in_parts vector,
     * If part boxes are kept, then sets initializes the output_part_boxes as its ancestor.
     *
     *  \param num_partitioning_in_current_dim: output. How many parts each current part will be partitioned into.
     *  \param future_num_part_in_parts: input, how many future parts each current part will be partitioned into.
     *  \param next_future_num_parts_in_parts: output, how many future parts each obtained part will be partitioned into.
     *  \param future_num_parts: output, max number of future parts that will be obtained from a single
     *  \param current_num_parts: input, how many parts are there currently.
     *  \param current_iteration: input, current dimension iteration number.
     *  \param input_part_boxes: input, if boxes are kept, current boxes.
     *  \param output_part_boxes: output, if boxes are kept, the initial box boundaries for obtained parts.
     */
    mj_part_t update_part_num_arrays(
        std::vector<mj_part_t> &num_partitioning_in_current_dim, //assumes this vector is empty.
        std::vector<mj_part_t> *future_num_part_in_parts,
        std::vector<mj_part_t> *next_future_num_parts_in_parts, //assumes this vector is empty.
        mj_part_t &future_num_parts,
        mj_part_t current_num_parts,
        int current_iteration,
        RCP<mj_partBoxVector_t> input_part_boxes,
        RCP<mj_partBoxVector_t> output_part_boxes);

    /* \brief Computes the local min and max coordinate, and the local total
     * weight, over the permuted coordinate range
     * [coordinate_begin_index, coordinate_end_index) of the current dimension.
     */
    void mj_get_local_min_max_coord_totW(
        mj_lno_t coordinate_begin_index,
        mj_lno_t coordinate_end_index,
        mj_lno_t *mj_current_coordinate_permutations,
        mj_scalar_t *mj_current_dim_coords,
        mj_scalar_t &min_coordinate,
        mj_scalar_t &max_coordinate,
        mj_scalar_t &total_weight);

    /* \brief Reduces the local min/max coordinate and total weight arrays
     * into global results for the concurrently processed parts.
     */
    void mj_get_global_min_max_coord_totW(
        mj_part_t current_concurrent_num_parts,
        mj_scalar_t *local_min_max_total,
        mj_scalar_t *global_min_max_total);

    /* \brief Computes the initial cut line coordinates and the cumulative
     * target part weights for a part spanning [min_coord, max_coord].
     */
    void mj_get_initial_cut_coords_target_weights(
        mj_scalar_t min_coord,
        mj_scalar_t max_coord,
        mj_part_t num_cuts/*p-1*/ ,
        mj_scalar_t global_weight,
        mj_scalar_t *initial_cut_coords /*p - 1 sized, coordinate of each cut line*/,
        mj_scalar_t *target_part_weights /*cumulative weights, at left side of each cut line. p-1 sized*/,

        std::vector <mj_part_t> *future_num_part_in_parts, //the vector
        std::vector <mj_part_t> *next_future_num_parts_in_parts,
        mj_part_t concurrent_current_part,
        mj_part_t obtained_part_index);

    /* \brief Sets the initial part ids for the coordinates in the given
     * range before the 1D partitioning of the current dimension.
     */
    void set_initial_coordinate_parts(
        mj_scalar_t &max_coordinate,
        mj_scalar_t &min_coordinate,
        mj_part_t &concurrent_current_part_index,
        mj_lno_t coordinate_begin_index,
        mj_lno_t coordinate_end_index,
        mj_lno_t *mj_current_coordinate_permutations,
        mj_scalar_t *mj_current_dim_coords,
        mj_part_t *mj_part_ids,
        mj_part_t &partition_count);

    /* \brief Performs the 1D partitioning along the current dimension,
     * iterating until all cut lines for the concurrent parts are
     * determined within the given imbalance tolerance.
     */
    void mj_1D_part(
        mj_scalar_t *mj_current_dim_coords,
        mj_scalar_t imbalanceTolerance,
        mj_part_t current_work_part,
        mj_part_t current_concurrent_num_parts,
        mj_scalar_t *current_cut_coordinates,
        mj_part_t total_incomplete_cut_count,
        std::vector <mj_part_t> &num_partitioning_in_current_dim);

    /* \brief Computes this thread's part weights and the closest
     * coordinates to each cut line (from left and right) for the
     * coordinates in the given range.
     */
    void mj_1D_part_get_thread_part_weights(
        size_t total_part_count,
        mj_part_t num_cuts,
        mj_scalar_t max_coord,
        mj_scalar_t min_coord,
        mj_lno_t coordinate_begin_index,
        mj_lno_t coordinate_end_index,
        mj_scalar_t *mj_current_dim_coords,
        mj_scalar_t *temp_current_cut_coords,
        bool *current_cut_status,
        double *my_current_part_weights,
        mj_scalar_t *my_current_left_closest,
        mj_scalar_t *my_current_right_closest);

    /* \brief Accumulates the per-thread part weights and closest-point
     * results into the process-level arrays.
     */
    void mj_accumulate_thread_results(
        const std::vector <mj_part_t> &num_partitioning_in_current_dim,
        mj_part_t current_work_part,
        mj_part_t current_concurrent_num_parts);

    /* \brief Updates the cut line coordinates using the global part
     * weights and closest points, narrowing each cut's lower/upper
     * bounds until the cut is determined within the tolerance.
     */
    void mj_get_new_cut_coordinates(
        const size_t &num_total_part,
        const mj_part_t &num_cuts,
        const mj_scalar_t &max_coordinate,
        const mj_scalar_t &min_coordinate,
        const mj_scalar_t &global_total_weight,
        const mj_scalar_t &used_imbalance_tolerance,
        mj_scalar_t * current_global_part_weights,
        const mj_scalar_t * current_local_part_weights,
        const mj_scalar_t *current_part_target_weights,
        bool *current_cut_line_determined,
        mj_scalar_t *current_cut_coordinates,
        mj_scalar_t *current_cut_upper_bounds,
        mj_scalar_t *current_cut_lower_bounds,
        mj_scalar_t *current_global_left_closest_points,
        mj_scalar_t *current_global_right_closest_points,
        mj_scalar_t * current_cut_lower_bound_weights,
        mj_scalar_t * current_cut_upper_weights,
        mj_scalar_t *new_current_cut_coordinates,
        mj_scalar_t *current_part_cut_line_weight_to_put_left,
        mj_part_t *rectilinear_cut_count,
        mj_part_t &my_num_incomplete_cut);

    /* \brief Computes a new cut position between the given bounds from
     * the bound weights and the expected weight.
     */
    void mj_calculate_new_cut_position (
      mj_scalar_t cut_upper_bound,
        mj_scalar_t cut_lower_bound,
        mj_scalar_t cut_upper_weight,
        mj_scalar_t cut_lower_weight,
        mj_scalar_t expected_weight,
        mj_scalar_t &new_cut_position);

    /* \brief Creates the new part permutation (out_part_xadj) for the
     * coordinates in [coordinate_begin, coordinate_end) according to the
     * determined cut coordinates.
     */
    void mj_create_new_partitions(
        mj_part_t num_parts,
        mj_scalar_t *mj_current_dim_coords,
        mj_scalar_t *current_concurrent_cut_coordinate,
        mj_lno_t coordinate_begin,
        mj_lno_t coordinate_end,
        mj_scalar_t *used_local_cut_line_weight_to_left,
        double **used_thread_part_weight_work,
        mj_lno_t *out_part_xadj);

    /* \brief Decides on and performs the migration of coordinates among
     * processes; returns whether migration was performed.
     */
    bool mj_perform_migration(
        mj_part_t in_num_parts, //current num parts
        mj_part_t &out_num_parts, //output num parts.
        std::vector<mj_part_t> *next_future_num_parts_in_parts,
        mj_part_t &output_part_begin_index,
        size_t migration_reduce_all_population,
        mj_lno_t num_coords_for_last_dim_part,
        std::string iteration,
        RCP<mj_partBoxVector_t> &input_part_boxes,
        RCP<mj_partBoxVector_t> &output_part_boxes);

    /* \brief Gathers the number of points each processor has in each part
     * into num_points_in_all_processor_parts.
     */
    void get_processor_num_points_in_parts(
        mj_part_t num_procs,
        mj_part_t num_parts,
        mj_gno_t *&num_points_in_all_processor_parts);

    /* \brief Checks whether migration should be performed, based on the
     * reduceAll population estimate and the last-dimension work estimate.
     */
    bool mj_check_to_migrate(
        size_t migration_reduce_all_population,
        mj_lno_t num_coords_for_last_dim_part,
        mj_part_t num_procs,
        mj_part_t num_parts,
        mj_gno_t *num_points_in_all_processor_parts);


    /* \brief Computes the part-to-processor assignment for migration and
     * fills the send counts and coordinate destinations.
     */
    void mj_migration_part_proc_assignment(
        mj_gno_t * num_points_in_all_processor_parts,
        mj_part_t num_parts,
        mj_part_t num_procs,
        mj_lno_t *send_count_to_each_proc,
        std::vector<mj_part_t> &processor_ranks_for_subcomm,
        std::vector<mj_part_t> *next_future_num_parts_in_parts,
        mj_part_t &out_num_part,
        std::vector<mj_part_t> &out_part_indices,
        mj_part_t &output_part_numbering_begin_index,
        int *coordinate_destinations);

    /* \brief Assigns processors to parts (case: more processors than
     * parts) and fills the send counts and coordinate destinations.
     */
    void mj_assign_proc_to_parts(
        mj_gno_t * num_points_in_all_processor_parts,
        mj_part_t num_parts,
        mj_part_t num_procs,
        mj_lno_t *send_count_to_each_proc,
        std::vector<mj_part_t> &processor_ranks_for_subcomm,
        std::vector<mj_part_t> *next_future_num_parts_in_parts,
        mj_part_t &out_part_index,
        mj_part_t &output_part_numbering_begin_index,
        int *coordinate_destinations);

    /* \brief Fills coordinate_destinations from the part-to-processor
     * chain assignment.
     */
    void assign_send_destinations(
        mj_part_t num_parts,
        mj_part_t *part_assignment_proc_begin_indices,
        mj_part_t *processor_chains_in_parts,
        mj_lno_t *send_count_to_each_proc,
        int *coordinate_destinations);

    /* \brief Fills coordinate_destinations from a part-to-processor
     * assignment already sorted with respect to processors.
     */
    void assign_send_destinations2(
        mj_part_t num_parts,
        uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment, //input sorted wrt processors
        int *coordinate_destinations,
        mj_part_t &output_part_numbering_begin_index,
        std::vector<mj_part_t> *next_future_num_parts_in_parts);

    /* \brief Assigns parts to processors (case: more parts than
     * processors) and fills the send counts and coordinate destinations.
     */
    void mj_assign_parts_to_procs(
        mj_gno_t * num_points_in_all_processor_parts,
        mj_part_t num_parts,
        mj_part_t num_procs,
        mj_lno_t *send_count_to_each_proc, //output: sized nprocs, show the number of send point counts to each proc.
        std::vector<mj_part_t> *next_future_num_parts_in_parts,//input how many more partitions the part will be partitioned into.
        mj_part_t &out_num_part, //output, how many parts the processor will have. this is always 1 for this function.
        std::vector<mj_part_t> &out_part_indices, //output: the part indices which the processor is assigned to.
        mj_part_t &output_part_numbering_begin_index, //output: how much the part number should be shifted when setting the solution
        int *coordinate_destinations);

    /* \brief Performs the actual migration of coordinates (and related
     * arrays) to the destination processors.
     */
    void mj_migrate_coords(
        mj_part_t num_procs,
        mj_lno_t &num_new_local_points,
        std::string iteration,
        int *coordinate_destinations,
        mj_part_t num_parts);

    /* \brief Creates a sub communicator from the given processor ranks. */
    void create_sub_communicator(vector<mj_part_t> &processor_ranks_for_subcomm);


    /* \brief Rebuilds the coordinate permutation array after migration. */
    void fill_permutation_array(
        mj_part_t output_num_parts,
        mj_part_t num_parts);

    /* \brief Writes the final part ids (shifted by output_part_begin_index)
     * into the solution arrays, undoing migration if data was migrated.
     */
    void set_final_parts(
        mj_part_t current_num_parts,
        mj_part_t output_part_begin_index,
        RCP<mj_partBoxVector_t> output_part_boxes,
        bool is_data_ever_migrated);
    /* \brief Frees the work memory allocated by allocate_set_work_memory(). */
    void free_work_memory();
    /* \brief Sequential-task variant of mj_create_new_partitions: creates
     * consistent coordinate chunks (out_part_xadj) for the given range.
     */
    void create_consistent_chunks(
        mj_part_t num_parts,
        mj_scalar_t *mj_current_dim_coords,
        mj_scalar_t *current_concurrent_cut_coordinate,
        mj_lno_t coordinate_begin,
        mj_lno_t coordinate_end,
        mj_scalar_t *used_local_cut_line_weight_to_left,
        mj_lno_t *out_part_xadj,
        int coordInd);
public:
    AlgMJ();

    /* \brief Multi Jagged coordinate partitioning algorithm entry point.
     * Partitions the given coordinates into num_global_parts parts (or
     * according to part_no_array) and returns the assigned part ids and
     * the matching global ids through the last two reference parameters.
     */
    void multi_jagged_part(
        const RCP<const Environment> &env,
          RCP<Comm<int> > &problemComm,

          double imbalance_tolerance,
          size_t num_global_parts,
          mj_part_t *part_no_array,
          int recursion_depth,

          int coord_dim,
          mj_lno_t num_local_coords,
          mj_gno_t num_global_coords,
          const mj_gno_t *initial_mj_gnos,
          mj_scalar_t **mj_coordinates,

          int num_weights_per_coord,
          bool *mj_uniform_weights,
          mj_scalar_t **mj_weights,
          bool *mj_uniform_parts,
          mj_scalar_t **mj_part_sizes,

          mj_part_t *&result_assigned_part_ids,
          mj_gno_t *&result_mj_gnos

        );
    /* \brief Sets the MJ-specific tuning parameters before partitioning. */
    void set_partitioning_parameters(
        bool distribute_points_on_cut_lines_,
        int max_concurrent_part_calculation_,
        int check_migrate_avoid_migration_option_,
        mj_scalar_t minimum_migration_imbalance_);
    /* \brief Requests that the part boxes are computed and kept. */
    void set_to_keep_part_boxes();
    /* \brief Returns the kept part boxes. */
    RCP<mj_partBoxVector_t> get_part_boxes() const;

    /* \brief Returns the global bounding box (min/max coords of domain). */
    RCP<mj_partBox_t> get_global_box() const;

    /* \brief Special function for partitioning within a single process:
     * partitions num_selected_coords of the num_total_coords coordinates
     * into num_target_part parts, writing the resulting permutation and
     * part offsets into the output arrays.
     */
    void sequential_task_partitioning(
        const RCP<const Environment> &env,
        mj_lno_t num_total_coords,
        mj_lno_t num_selected_coords,
        size_t num_target_part,
        int coord_dim,
        mj_scalar_t **mj_coordinates,
        mj_lno_t *initial_selected_coords_output_permutation,
        mj_lno_t *output_xadj,
        int recursion_depth,
        const mj_part_t *part_no_array);

};
01150 
/*! \brief Special partitioning entry used for task mapping: runs the whole
 * multi-jagged recursion on this process only, over a self-created serial
 * communicator, and returns the resulting permutation and part boundaries.
 *
 * \param env environment (debugging/timing macros).
 * \param num_total_coords total number of local coordinates.
 * \param num_selected_coords number of coordinates (a prefix of the
 *        permutation array) that are actually partitioned.
 * \param num_target_part requested number of parts.
 * \param coord_dim_ dimension of the coordinates.
 * \param mj_coordinates_ coordinate arrays, one pointer per dimension.
 * \param inital_adjList_output_adjlist in: initial permutation of the
 *        coordinates; out: permutation reordered so each part is contiguous.
 * \param output_xadj out: part i owns permutation entries
 *        [output_xadj[i-1], output_xadj[i]) (output_xadj[-1] taken as 0).
 * \param rd recursion depth (number of partitioning sweeps).
 * \param part_no_array_ optional per-level part counts (may be NULL).
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::sequential_task_partitioning(
    const RCP<const Environment> &env,
    mj_lno_t num_total_coords,
    mj_lno_t num_selected_coords,
    size_t num_target_part,
    int coord_dim_,
    mj_scalar_t **mj_coordinates_,
    mj_lno_t *inital_adjList_output_adjlist,
    mj_lno_t *output_xadj,
    int rd,
    const mj_part_t *part_no_array_
){

  this->mj_env = env;
  // Run entirely on a serial communicator: no communication happens here.
  const RCP<Comm<int> > commN;
  this->comm = this->mj_problemComm =  Teuchos::rcp_const_cast<Comm<int> >
  (Teuchos::DefaultComm<int>::getDefaultSerialComm(commN));
  // NOTE(review): rank is set to 1 although a serial comm's rank is 0 —
  // presumably only used as a marker in this path; confirm intent.
  this->myActualRank = this->myRank = 1;

#ifdef HAVE_ZOLTAN2_OMP
  // Force single-threaded execution for this call; the caller's thread
  // count is restored at the end of the function.
  int actual_num_threads = omp_get_num_threads();
  omp_set_num_threads(1);
#endif

    //weights are uniform for task mapping

    //parts are uniform for task mapping
    //as input indices.

    this->imbalance_tolerance = 0;
    this->num_global_parts = num_target_part;
    // NOTE(review): const is cast away here; the array is only read below,
    // but a const_cast would state that more explicitly.
    this->part_no_array = (mj_part_t *)part_no_array_;
    this->recursion_depth = rd;

    this->coord_dim = coord_dim_;
    this->num_local_coords = num_total_coords;
    this->num_global_coords = num_total_coords;
    this->mj_coordinates = mj_coordinates_;  //will copy the memory to this->mj_coordinates.

    // Allocated only to satisfy the shared work-memory setup; freed below.
    this->initial_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);

    // Uniform weights/part sizes: set up the minimal (size-1) arrays the
    // shared code paths expect. NOTE(review): allocated with new[] but
    // released below via freeArray — assumes freeArray does delete[]; confirm.
    this->num_weights_per_coord = 0;
    bool *tmp_mj_uniform_weights = new bool[1];
    this->mj_uniform_weights = tmp_mj_uniform_weights ;
    this->mj_uniform_weights[0] = true;

    mj_scalar_t **tmp_mj_weights = new mj_scalar_t *[1];
    this->mj_weights = tmp_mj_weights; //will copy the memory to this->mj_weights

    bool *tmp_mj_uniform_parts = new bool[1];
    this->mj_uniform_parts = tmp_mj_uniform_parts;
    this->mj_uniform_parts[0] = true;

    mj_scalar_t **tmp_mj_part_sizes = new mj_scalar_t * [1];
    this->mj_part_sizes = tmp_mj_part_sizes;
    this->mj_part_sizes[0] = NULL;

    this->num_threads = 1;
    this->set_part_specifications();

    this->allocate_set_work_memory();
    //the end of the initial partition is the end of coordinates.
    this->part_xadj[0] = static_cast<mj_lno_t>(num_selected_coords);
    for(size_t i = 0; i < static_cast<size_t>(num_total_coords); ++i){
        this->coordinate_permutations[i] = inital_adjList_output_adjlist[i];
    }

    mj_part_t current_num_parts = 1;

    mj_scalar_t *current_cut_coordinates =  this->all_cut_coordinates;

    mj_part_t future_num_parts = this->total_num_part;

    // Pair of heap-allocated vectors swapped each iteration: the "current"
    // one holds how many future parts each existing part still needs, the
    // "next" one is filled for the parts produced this iteration.
    std::vector<mj_part_t> *future_num_part_in_parts = new std::vector<mj_part_t> ();
    std::vector<mj_part_t> *next_future_num_parts_in_parts = new std::vector<mj_part_t> ();
    next_future_num_parts_in_parts->push_back(this->num_global_parts);
    // Part boxes are not tracked in this path; t1/t2 stay null.
    RCP<mj_partBoxVector_t> t1;
    RCP<mj_partBoxVector_t> t2;

    // One iteration per recursion level; each level partitions along one
    // coordinate dimension (cycling through dimensions).
    for (int i = 0; i < this->recursion_depth; ++i){

        //partitioning array. size will be as the number of current partitions and this
        //holds how many parts that each part will be in the current dimension partitioning.
        std::vector <mj_part_t> num_partitioning_in_current_dim;

        //number of parts that will be obtained at the end of this partitioning.
        //future_num_part_in_parts is as the size of current number of parts.
        //holds how many more parts each should be divided in the further
        //iterations. this will be used to calculate num_partitioning_in_current_dim,
        //as the number of parts that the part will be partitioned
        //in the current dimension partitioning.

        //next_future_num_parts_in_parts will be as the size of outnumParts,
        //and this will hold how many more parts that each output part
        //should be divided. this array will also be used to determine the weight ratios
        //of the parts.
        //swap the arrays to use iteratively..
        std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
        future_num_part_in_parts = next_future_num_parts_in_parts;
        next_future_num_parts_in_parts = tmpPartVect;

        //clear next_future_num_parts_in_parts array as
        //getPartitionArrays expects it to be empty.
        //it also expects num_partitioning_in_current_dim to be empty as well.
        next_future_num_parts_in_parts->clear();


        //returns the total number of output parts for this dimension partitioning.
        mj_part_t output_part_count_in_dimension =
            this->update_part_num_arrays(
                num_partitioning_in_current_dim,
                future_num_part_in_parts,
                next_future_num_parts_in_parts,
                future_num_parts,
                current_num_parts,
                i,
                t1,
                t2);

        //if the number of obtained parts equal to current number of parts,
        //skip this dimension. For example, this happens when 1 appears in
        //the input part_no_array (e.g. P=4,5,1,2). Swap the vectors back so
        //the next iteration sees them in a consistent state.
        if(output_part_count_in_dimension == current_num_parts) {
            tmpPartVect= future_num_part_in_parts;
            future_num_part_in_parts = next_future_num_parts_in_parts;
            next_future_num_parts_in_parts = tmpPartVect;
            continue;
        }

        //get the coordinate axis along which the partitioning will be done.
        int coordInd = i % this->coord_dim;
        mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
        //convert i to string to be used for debugging purposes.
        std::string istring = toString<int>(i);

        //alloc Memory to point the indices
        //of the parts in the permutation array.
        this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);

        //the index where in the outtotalCounts will be written.
        mj_part_t output_part_index = 0;
        //whatever is written to outTotalCounts will be added with previousEnd
        //so that the points will be shifted.
        mj_part_t output_coordinate_end_index = 0;

        mj_part_t current_work_part = 0;
        mj_part_t current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
                                         this->max_concurrent_part_calculation);

        mj_part_t obtained_part_index = 0;

        //run for all available parts.
        for (; current_work_part < current_num_parts;
                     current_work_part += current_concurrent_num_parts){

            current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
            this->max_concurrent_part_calculation);

            mj_part_t actual_work_part_count = 0;
            //initialization for 1D partitioning.
            //get the min and max coordinates of each part
            //together with the part weights of each part.
            for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
                mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;

                //if this part wont be partitioned any further
                //dont do any work for this part.
                if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
                    continue;
                }
                ++actual_work_part_count;
                // part_xadj is a prefix-sum: part p occupies permutation
                // indices [part_xadj[p-1], part_xadj[p]) (0 for p == 0).
                mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
                mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];

                /*
                std::cout << "i:" << i << " j:" << current_work_part + kk
                    << " coordinate_begin_index:" << coordinate_begin_index
                    << " coordinate_end_index:" << coordinate_end_index
                    << " total:" << coordinate_end_index - coordinate_begin_index<< std::endl;
                    */
                this->mj_get_local_min_max_coord_totW(
                    coordinate_begin_index,
                    coordinate_end_index,
                    this->coordinate_permutations,
                    mj_current_dim_coords,
                    this->process_local_min_max_coord_total_weight[kk], //min coordinate
                        this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts], //max coordinate
                        this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts] //total weight
                );
            }

            //1D partitioning
            if (actual_work_part_count > 0){
                //obtain global Min max of the part.
              this->mj_get_global_min_max_coord_totW(
                    current_concurrent_num_parts,
                    this->process_local_min_max_coord_total_weight,
                    this->global_min_max_coord_total_weight);

                //represents the total number of cutlines
                //whose coordinate should be determined.
                mj_part_t total_incomplete_cut_count = 0;

                //Compute weight ratios for parts & cuts:
                //e.g., 0.25  0.25  0.5    0.5  0.75 0.75  1
                //part0  cut0  part1 cut1 part2 cut2 part3
                mj_part_t concurrent_part_cut_shift = 0;
                mj_part_t concurrent_part_part_shift = 0;
                for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
                    mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
                    mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
                                                     current_concurrent_num_parts];
                    mj_scalar_t global_total_weight =
                              this->global_min_max_coord_total_weight[kk +
                                                     2 * current_concurrent_num_parts];

                    mj_part_t concurrent_current_part_index = current_work_part + kk;

                    mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];

                    mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
                    mj_scalar_t *current_target_part_weights = this->target_part_weights +
                                                                     concurrent_part_part_shift;
                    //shift the usedCutCoordinate array as noCuts.
                    concurrent_part_cut_shift += partition_count - 1;
                    //shift the partRatio array as noParts.
                    concurrent_part_part_shift += partition_count;

                    //calculate only if part is not empty,
                    //and part will be further partitioned.
                    if(partition_count > 1 && min_coordinate <= max_coordinate){

                        //increase allDone by the number of cuts of the current
                        //part's cut line number.
                        total_incomplete_cut_count += partition_count - 1;
                        //set the number of cut lines that should be determined
                        //for this part.
                        this->my_incomplete_cut_count[kk] = partition_count - 1;

                        //get the target weights of the parts.
                        this->mj_get_initial_cut_coords_target_weights(
                            min_coordinate,
                            max_coordinate,
                            partition_count - 1,
                            global_total_weight,
                            usedCutCoordinate,
                            current_target_part_weights,
                            future_num_part_in_parts,
                            next_future_num_parts_in_parts,
                            concurrent_current_part_index,
                            obtained_part_index);

                        mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
                        mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];

                        //get the initial estimated part assignments of the coordinates.
                        this->set_initial_coordinate_parts(
                            max_coordinate,
                            min_coordinate,
                            concurrent_current_part_index,
                            coordinate_begin_index, coordinate_end_index,
                            this->coordinate_permutations,
                            mj_current_dim_coords,
                            this->assigned_part_ids,
                            partition_count);
                    }
                    else {
                        // e.g., if have fewer coordinates than parts, don't need to do next dim.
                        this->my_incomplete_cut_count[kk] = 0;
                    }
                    obtained_part_index += partition_count;
                }

                //used imbalance, it is always 0, as it is difficult to estimate a range.
                mj_scalar_t used_imbalance = 0;

                // Determine cut lines for k parts here.
                this->mj_1D_part(
                    mj_current_dim_coords,
                    used_imbalance,
                    current_work_part,
                    current_concurrent_num_parts,
                    current_cut_coordinates,
                    total_incomplete_cut_count,
                    num_partitioning_in_current_dim);
            }

            //create part chunks
            {

                mj_part_t output_array_shift = 0;
                mj_part_t cut_shift = 0;
                size_t tlr_shift = 0;
                size_t partweight_array_shift = 0;

                for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
                    mj_part_t current_concurrent_work_part = current_work_part + kk;
                    mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];

                    //if the part is empty, skip the part.
                    //(min > max signals an empty part from the global reduction)
                    if((num_parts != 1  ) && this->global_min_max_coord_total_weight[kk] >
                             this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {

                        // All resulting pieces are empty; still advance the
                        // shift counters so later parts index correctly.
                        for(mj_part_t jj = 0; jj < num_parts; ++jj){
                            this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
                        }
                        cut_shift += num_parts - 1;
                        tlr_shift += (4 *(num_parts - 1) + 1);
                        output_array_shift += num_parts;
                        partweight_array_shift += (2 * (num_parts - 1) + 1);
                        continue;
                    }

                    mj_lno_t coordinate_end = this->part_xadj[current_concurrent_work_part];
                    mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[current_concurrent_work_part
                                                             -1];
                    mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
                    mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
                                                         cut_shift;

                    for(int ii = 0; ii < this->num_threads; ++ii){
                        this->thread_part_weight_work[ii] = this->thread_part_weights[ii] +  partweight_array_shift;
                    }

                    if(num_parts > 1){
                        // Rewrite the indices based on the computed cuts.
                      this->create_consistent_chunks(
                            num_parts,
                            mj_current_dim_coords,
                            current_concurrent_cut_coordinate,
                            coordinate_begin,
                            coordinate_end,
                            used_local_cut_line_weight_to_left,
                            this->new_part_xadj + output_part_index + output_array_shift,
                            coordInd );
                    }
                    else {
                        //if this part is partitioned into 1 then just copy
                        //the old values.
                        mj_lno_t part_size = coordinate_end - coordinate_begin;
                        *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
                        memcpy(this->new_coordinate_permutations + coordinate_begin,
                        this->coordinate_permutations + coordinate_begin,
                        part_size * sizeof(mj_lno_t));
                    }
                    cut_shift += num_parts - 1;
                    tlr_shift += (4 *(num_parts - 1) + 1);
                    output_array_shift += num_parts;
                    partweight_array_shift += (2 * (num_parts - 1) + 1);
                }

                //shift cut coordinates so that all cut coordinates are stored.
                //current_cut_coordinates += cutShift;

                //getChunks from coordinates partitioned the parts and
                //wrote the indices as if there were a single part.
                //now we need to shift the beginning indices.
                for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
                    mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
                    for (mj_part_t ii = 0;ii < num_parts ; ++ii){
                        //shift it by previousCount
                        this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
                    }
                    //increase the previous count by current end.
                    output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
                    //increase the current out.
                    output_part_index += num_parts ;
                }
            }
        }
        // end of this partitioning dimension

        //set the current num parts for next dim partitioning
        current_num_parts = output_part_count_in_dimension;

        //swap the coordinate permutations for the next dimension.
        mj_lno_t * tmp = this->coordinate_permutations;
        this->coordinate_permutations = this->new_coordinate_permutations;
        this->new_coordinate_permutations = tmp;

        freeArray<mj_lno_t>(this->part_xadj);
        this->part_xadj = this->new_part_xadj;
    }

    // Write the final permutation back into the caller's array.
    for(mj_lno_t i = 0; i < num_total_coords; ++i){
      inital_adjList_output_adjlist[i] = this->coordinate_permutations[i];
    }

    // Export the final part boundaries (prefix-sum form).
    for(size_t i = 0; i < this->num_global_parts ; ++i){
        output_xadj[i] = this->part_xadj[i];
    }

    delete future_num_part_in_parts;
    delete next_future_num_parts_in_parts;

    //free the extra memory that we allocated.
    freeArray<mj_part_t>(this->assigned_part_ids);
    freeArray<mj_gno_t>(this->initial_mj_gnos);
    freeArray<mj_gno_t>(this->current_mj_gnos);
    freeArray<bool>(tmp_mj_uniform_weights);
    freeArray<bool>(tmp_mj_uniform_parts);
    freeArray<mj_scalar_t *>(tmp_mj_weights);
    freeArray<mj_scalar_t *>(tmp_mj_part_sizes);

    this->free_work_memory();

#ifdef HAVE_ZOLTAN2_OMP
    // Restore the caller's OpenMP thread count.
    omp_set_num_threads(actual_num_threads);
#endif
}
01588 
/*! \brief Default constructor: sets every member to a neutral value
 *  (null pointers, zero counters, default tuning knobs) and computes the
 *  floating-point tolerances and scalar extremes used by the algorithm.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::AlgMJ():
  mj_env(), mj_problemComm(), imbalance_tolerance(0),
  part_no_array(NULL), recursion_depth(0), coord_dim(0),
  num_weights_per_coord(0), initial_num_loc_coords(0),
        initial_num_glob_coords(0),
  num_local_coords(0), num_global_coords(0), mj_coordinates(NULL),
  mj_weights(NULL), mj_uniform_parts(NULL), mj_part_sizes(NULL),
  mj_uniform_weights(NULL), mj_gnos(), num_global_parts(1),
  initial_mj_gnos(NULL), current_mj_gnos(NULL), owner_of_coordinate(NULL),
  coordinate_permutations(NULL), new_coordinate_permutations(NULL),
  assigned_part_ids(NULL), part_xadj(NULL), new_part_xadj(NULL),
  distribute_points_on_cut_lines(true), max_concurrent_part_calculation(1),
  mj_run_as_rcb(0), mj_user_recursion_depth(0), mj_keep_part_boxes(0),
  check_migrate_avoid_migration_option(0), minimum_migration_imbalance(0.30),
  num_threads(1), total_num_cut(0), total_num_part(0), max_num_part_along_dim(0),
  max_num_cut_along_dim(0), max_num_total_part_along_dim(0), total_dim_num_reduce_all(0),
  last_dim_num_part(0), comm(), fEpsilon(0), sEpsilon(0), maxScalar_t(0), minScalar_t(0),
  all_cut_coordinates(NULL), max_min_coords(NULL), process_cut_line_weight_to_put_left(NULL),
  thread_cut_line_weight_to_put_left(NULL), cut_coordinates_work_array(NULL),
  target_part_weights(NULL), cut_upper_bound_coordinates(NULL), cut_lower_bound_coordinates(NULL),
  cut_lower_bound_weights(NULL), cut_upper_bound_weights(NULL),
  process_local_min_max_coord_total_weight(NULL), global_min_max_coord_total_weight(NULL),
  is_cut_line_determined(NULL), my_incomplete_cut_count(NULL),
  thread_part_weights(NULL), thread_part_weight_work(NULL),
  thread_cut_left_closest_point(NULL), thread_cut_right_closest_point(NULL),
  thread_point_counts(NULL), process_rectilinear_cut_weight(NULL),
  global_rectilinear_cut_weight(NULL),total_part_weight_left_right_closests(NULL),
  global_total_part_weight_left_right_closests(NULL),
        kept_boxes(),global_box(),
  myRank(0), myActualRank(0)
{
    // fEpsilon: float granularity, used e.g. when rounding part counts.
    this->fEpsilon = std::numeric_limits<float>::epsilon();
    // sEpsilon: coordinate comparison tolerance (100x scalar epsilon).
    this->sEpsilon = std::numeric_limits<mj_scalar_t>::epsilon() * 100;

    // Scalar extremes used to seed min/max reductions.
    this->maxScalar_t = std::numeric_limits<mj_scalar_t>::max();
    this->minScalar_t = -std::numeric_limits<mj_scalar_t>::max();

}
01632 
01636 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
01637           typename mj_part_t>
01638 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBoxVector_t>
01639 AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_part_boxes() const 
01640 {
01641   if (this->mj_keep_part_boxes){
01642     return this->kept_boxes;
01643   } else {
01644                 throw std::logic_error("Error: part boxes are not stored.");
01645   }
01646 }
01647 
01651 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
01652           typename mj_part_t>
01653 RCP<typename AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::mj_partBox_t>
01654 AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::get_global_box() const 
01655 {
01656         return this->global_box;
01657 }
01658 
01662 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
01663           typename mj_part_t>
01664 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_to_keep_part_boxes(){
01665   this->mj_keep_part_boxes = 1;
01666 }
01667 
01668 
01669 
01670 /* \brief Either the mj array (part_no_array) or num_global_parts should be provided in
01671  * the input. part_no_array takes
01672  * precedence if both are provided.
01673  * Depending on these parameters, total cut/part number,
01674  * maximum part/cut number along a dimension, estimated number of reduceAlls,
01675  * and the number of parts before the last dimension is calculated.
01676  * */
01677 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
01678           typename mj_part_t>
01679 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_part_specifications(){
01680 
01681   this->total_num_cut = 0; //how many cuts will be totally
01682   this->total_num_part = 1;    //how many parts will be totally
01683   this->max_num_part_along_dim = 0;         //maximum part count along a dimension.
01684   this->total_dim_num_reduce_all = 0;    //estimate on #reduceAlls can be done.
01685   this->last_dim_num_part = 1; //max no of parts that might occur
01686   //during the partition before the
01687   //last partitioning dimension.
01688   this->max_num_cut_along_dim = 0;
01689   this->max_num_total_part_along_dim = 0;
01690 
01691   if (this->part_no_array){
01692     //if user provided part array, traverse the array and set variables.
01693     for (int i = 0; i < this->recursion_depth; ++i){
01694       this->total_dim_num_reduce_all += this->total_num_part;
01695       this->total_num_part *= this->part_no_array[i];
01696       if(this->part_no_array[i] > this->max_num_part_along_dim) {
01697         this->max_num_part_along_dim = this->part_no_array[i];
01698       }
01699     }
01700     this->last_dim_num_part = this->total_num_part / this->part_no_array[recursion_depth-1];
01701     this->num_global_parts = this->total_num_part;
01702   } else {
01703     mj_part_t future_num_parts = this->num_global_parts;
01704 
01705     //we need to calculate the part numbers now, to determine the maximum along the dimensions.
01706     for (int i = 0; i < this->recursion_depth; ++i){
01707 
01708       mj_part_t maxNoPartAlongI = this->get_part_count(
01709           future_num_parts, 1.0f / (this->recursion_depth - i));
01710 
01711       if (maxNoPartAlongI > this->max_num_part_along_dim){
01712         this->max_num_part_along_dim = maxNoPartAlongI;
01713       }
01714 
01715       mj_part_t nfutureNumParts = future_num_parts / maxNoPartAlongI;
01716       if (future_num_parts % maxNoPartAlongI){
01717         ++nfutureNumParts;
01718       }
01719       future_num_parts = nfutureNumParts;
01720     }
01721     this->total_num_part = this->num_global_parts;
01722     //estimate reduceAll Count here.
01723     //we find the upperbound instead.
01724     mj_part_t p = 1;
01725     for (int i = 0; i < this->recursion_depth; ++i){
01726       this->total_dim_num_reduce_all += p;
01727       p *= this->max_num_part_along_dim;
01728     }
01729 
01730     this->last_dim_num_part  = p / this->max_num_part_along_dim;
01731   }
01732 
01733   this->total_num_cut = this->total_num_part - 1;
01734   this->max_num_cut_along_dim = this->max_num_part_along_dim - 1;
01735   this->max_num_total_part_along_dim = this->max_num_part_along_dim + size_t(this->max_num_cut_along_dim);
01736   //maxPartNo is P, maxCutNo = P-1, matTotalPartcount = 2P-1
01737 
01738   //refine the concurrent part count, if it is given bigger than the maximum possible part count.
01739     if(this->max_concurrent_part_calculation > this->last_dim_num_part){
01740         if(this->mj_problemComm->getRank() == 0){
01741             std::cerr << "Warning: Concurrent part count ("<< this->max_concurrent_part_calculation <<
01742             ") has been set bigger than maximum amount that can be used." <<
01743             " Setting to:" << this->last_dim_num_part << "." << std::endl;
01744         }
01745         this->max_concurrent_part_calculation = this->last_dim_num_part;
01746     }
01747 
01748 }
01749 /* \brief Tries to determine the part number for current dimension,
01750  * by trying to make the partitioning as square as possible.
01751  * \param num_total_future how many more partitionings are required.
01752  * \param root how many more recursion depth is left.
01753  */
01754 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
01755           typename mj_part_t>
01756 inline mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_part_count(
01757     mj_part_t num_total_future,
01758     double root)
01759 {
01760   double fp = pow(num_total_future, root);
01761   mj_part_t ip = mj_part_t (fp);
01762   if (fp - ip < this->fEpsilon * 100){
01763     return ip;
01764   }
01765   else {
01766     return ip  + 1;
01767   }
01768 }
01769 
01770 /* \brief Function returns how many parts that will be obtained after this dimension partitioning.
01771  * It sets how many parts each current part will be partitioned into in this dimension to num_partitioning_in_current_dim vector,
01772  * sets how many total future parts each obtained part will be partitioned into in next_future_num_parts_in_parts vector,
 * If part boxes are kept, then initializes the output_part_boxes as its ancestor.
01774  *
01775  *  \param num_partitioning_in_current_dim: output. How many parts each current part will be partitioned into.
01776  *  \param future_num_part_in_parts: input, how many future parts each current part will be partitioned into.
01777  *  \param next_future_num_parts_in_parts: output, how many future parts each obtained part will be partitioned into.
 *  \param future_num_parts: output, max number of future parts that will be obtained from a single part.
01779  *  \param current_num_parts: input, how many parts are there currently.
01780  *  \param current_iteration: input, current dimension iteration number.
01781  *  \param input_part_boxes: input, if boxes are kept, current boxes.
01782  *  \param output_part_boxes: output, if boxes are kept, the initial box boundaries for obtained parts.
01783  */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
mj_part_t AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::update_part_num_arrays(
  std::vector <mj_part_t> &num_partitioning_in_current_dim, //assumes this vector is empty.
    std::vector<mj_part_t> *future_num_part_in_parts,
    std::vector<mj_part_t> *next_future_num_parts_in_parts, //assumes this vector is empty.
    mj_part_t &future_num_parts,
    mj_part_t current_num_parts,
    int current_iteration,
    RCP<mj_partBoxVector_t> input_part_boxes,
    RCP<mj_partBoxVector_t> output_part_boxes
){
  //how many parts that will be obtained after this dimension.
    mj_part_t output_num_parts = 0;
    if(this->part_no_array){
        //when the partNo array is provided as input,
        //each current partition will be partitioned into the same number of parts.
        //we dont need to use the future_num_part_in_parts vector in this case.

        mj_part_t p = this->part_no_array[current_iteration];
        //a part count below 1 in the user-provided array is a hard input error.
        if (p < 1){
            std::cout << "i:" << current_iteration << " p is given as:" << p << std::endl;
            exit(1);
        }
        //p == 1 means this dimension performs no split at all.
        if (p == 1){
            return current_num_parts;
        }

        //every current part is split into exactly p pieces.
        for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
            num_partitioning_in_current_dim.push_back(p);

        }
        //cout << "me:" << this->myRank << " current_iteration" << current_iteration <<
        //" current_num_parts:" << current_num_parts << std::endl;
        //cout << "num_partitioning_in_current_dim[0]:" << num_partitioning_in_current_dim[0] << std::endl;
        //set the new value of future_num_parts.

        /*
        cout << "\tfuture_num_parts:" << future_num_parts
            << " num_partitioning_in_current_dim[0]:" << num_partitioning_in_current_dim[0]
            << future_num_parts/ num_partitioning_in_current_dim[0] << std::endl;
        */

        //NOTE(review): integer division -- assumes future_num_parts is an exact
        //multiple of p when the part_no_array is consistent; confirm with callers.
        future_num_parts /= num_partitioning_in_current_dim[0];
        output_num_parts = current_num_parts * num_partitioning_in_current_dim[0];

        if (this->mj_keep_part_boxes){
            for (mj_part_t k = 0; k < current_num_parts; ++k){
              //initialize each of the p output boxes as a copy of its ancestor box.
              for (mj_part_t j = 0; j < num_partitioning_in_current_dim[0]; ++j){
                    output_part_boxes->push_back((*input_part_boxes)[k]);
                }
            }
        }

        //set the how many more parts each part will be divided.
        //this is obvious when partNo array is provided as input.
        //however, fill this so that weights will be calculated according to this array.
        for (mj_part_t ii = 0; ii < output_num_parts; ++ii){
            next_future_num_parts_in_parts->push_back(future_num_parts);
        }
    }
    else {
        //if partNo array is not provided as input,
        //future_num_part_in_parts  holds how many parts each part should be divided.
        //initially it holds a single number equal to the total number of global parts.

        //calculate the future_num_parts from beginning,
        //since each part might be divided into different number of parts.
        future_num_parts = 1;

        //cout << "i:" << i << std::endl;

        for (mj_part_t ii = 0; ii < current_num_parts; ++ii){
            //get how many parts a part should be divided.
            mj_part_t future_num_parts_of_part_ii = (*future_num_part_in_parts)[ii];

            //get the ideal number of parts that is close to the
            //(recursion_depth - i) root of the future_num_parts_of_part_ii.
            mj_part_t num_partitions_in_current_dim =
                      this->get_part_count(
                          future_num_parts_of_part_ii,
                          1.0 / (this->recursion_depth - current_iteration)
                                  );

            //sanity check against the bound computed in set_part_specifications.
            if (num_partitions_in_current_dim > this->max_num_part_along_dim){
                std::cerr << "ERROR: maxPartNo calculation is wrong." << std::endl;
                exit(1);
            }
            //add this number to num_partitioning_in_current_dim vector.
            num_partitioning_in_current_dim.push_back(num_partitions_in_current_dim);


            //increase the output number of parts.
            output_num_parts += num_partitions_in_current_dim;

            //ideal number of future partitions for each part.
            mj_part_t ideal_num_future_parts_in_part = future_num_parts_of_part_ii / num_partitions_in_current_dim;
            for (mj_part_t iii = 0; iii < num_partitions_in_current_dim; ++iii){
                mj_part_t num_future_parts_for_part_iii = ideal_num_future_parts_in_part;

                //if there is a remainder in the part increase the part weight.
                if (iii < future_num_parts_of_part_ii % num_partitions_in_current_dim){
                    //if not uniform, add 1 for the extra parts.
                    ++num_future_parts_for_part_iii;
                }
                next_future_num_parts_in_parts->push_back(num_future_parts_for_part_iii);

                //if part boxes are stored, initialize the box of the parts as the ancestor.
                if (this->mj_keep_part_boxes){
                    output_part_boxes->push_back((*input_part_boxes)[ii]);
                }

                //set num future_num_parts to maximum in this part.
                if (num_future_parts_for_part_iii > future_num_parts) future_num_parts = num_future_parts_for_part_iii;
            }
        }
    }
    return output_num_parts;
}
01904 
01905 
01906 /* \brief Allocates and initializes the work memory that will be used by MJ.
01907  *
01908  * */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::allocate_set_work_memory(){

  //points to process that initially owns the coordinate.
  //allocated later in this function; NULL until then.
  this->owner_of_coordinate  = NULL;

  //Throughout the partitioning execution,
  //instead of the moving the coordinates, hold a permutation array for parts.
  //coordinate_permutations holds the current permutation.
  this->coordinate_permutations =  allocMemory< mj_lno_t>(this->num_local_coords);
  //initial configuration, set each pointer-i to i (identity permutation).
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
  for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
    this->coordinate_permutations[i] = i;
  }

  //new_coordinate_permutations holds the next permutation (double buffer).
  this->new_coordinate_permutations = allocMemory< mj_lno_t>(this->num_local_coords);

  this->assigned_part_ids = NULL;
  if(this->num_local_coords > 0){
    this->assigned_part_ids = allocMemory<mj_part_t>(this->num_local_coords);
  }

  //single partition starts at index-0, and ends at numLocalCoords
  //inTotalCounts array holds the end points in coordinate_permutations array
  //for each partition. Initially sized 1, and single element is set to numLocalCoords.
  this->part_xadj = allocMemory<mj_lno_t>(1);
  this->part_xadj[0] = static_cast<mj_lno_t>(this->num_local_coords);//the end of the initial partition is the end of coordinates.
  //the ends points of the output, this is allocated later.
  this->new_part_xadj = NULL;

  // only store this much if cuts are needed to be stored.
  //this->all_cut_coordinates = allocMemory< mj_scalar_t>(this->total_num_cut);

  //cut coordinates for all concurrently processed parts.
  this->all_cut_coordinates  = allocMemory< mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);

  //per-thread slots: first num_threads entries are mins, next num_threads are maxes.
  this->max_min_coords =  allocMemory< mj_scalar_t>(this->num_threads * 2);

  this->process_cut_line_weight_to_put_left = NULL; //how much weight percentage should a MPI put left side of the each cutline
  this->thread_cut_line_weight_to_put_left = NULL; //how much weight percentage should each thread in MPI put left side of the each outline
  //distribute_points_on_cut_lines = false;
  //these arrays are only needed when points lying exactly on a cut line
  //are distributed between the two sides.
  if(this->distribute_points_on_cut_lines){
    this->process_cut_line_weight_to_put_left = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
    this->thread_cut_line_weight_to_put_left = allocMemory<mj_scalar_t *>(this->num_threads);
    for(int i = 0; i < this->num_threads; ++i){
      this->thread_cut_line_weight_to_put_left[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
    }
      this->process_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
      this->global_rectilinear_cut_weight = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim);
  }


  // work array to manipulate coordinate of cutlines in different iterations.
  //necessary because previous cut line information is used for determining
  //the next cutline information. therefore, cannot update the cut work array
  //until all cutlines are determined.
  this->cut_coordinates_work_array = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim *
      this->max_concurrent_part_calculation);


  //cumulative part weight array.
  this->target_part_weights = allocMemory<mj_scalar_t>(
          this->max_num_part_along_dim * this->max_concurrent_part_calculation);
  // the weight from left to write.

    //bisection bounds, one slot per cut per concurrent part.
    this->cut_upper_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);  //upper bound coordinate of a cut line
    this->cut_lower_bound_coordinates = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);  //lower bound coordinate of a cut line
    this->cut_lower_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);  //lower bound weight of a cut line
    this->cut_upper_bound_weights = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim* this->max_concurrent_part_calculation);  //upper bound weight of a cut line

    this->process_local_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation); //combined array to exchange the min and max coordinate, and total weight of part.
    this->global_min_max_coord_total_weight = allocMemory<mj_scalar_t>(3 * this->max_concurrent_part_calculation);//global combined array with the results for min, max and total weight.

    //is_cut_line_determined is used to determine if a cutline is determined already.
    //If a cut line is already determined, the next iterations will skip this cut line.
    this->is_cut_line_determined = allocMemory<bool>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
    //my_incomplete_cut_count count holds the number of cutlines that have not been finalized for each part
    //when concurrentPartCount>1, using this information, if my_incomplete_cut_count[x]==0, then no work is done for this part.
    this->my_incomplete_cut_count =  allocMemory<mj_part_t>(this->max_concurrent_part_calculation);
    //local part weights of each thread.
    this->thread_part_weights = allocMemory<double *>(this->num_threads);
    //the work manupulation array for partweights.
    this->thread_part_weight_work = allocMemory<double *>(this->num_threads);

    //thread_cut_left_closest_point to hold the closest coordinate to a cutline from left (for each thread).
    this->thread_cut_left_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);
    //thread_cut_right_closest_point to hold the closest coordinate to a cutline from right (for each thread)
    this->thread_cut_right_closest_point = allocMemory<mj_scalar_t *>(this->num_threads);

    //to store how many points in each part a thread has.
    this->thread_point_counts = allocMemory<mj_lno_t *>(this->num_threads);

    //per-thread work arrays, sized for the worst case along a dimension.
    for(int i = 0; i < this->num_threads; ++i){
        //partWeights[i] = allocMemory<mj_scalar_t>(maxTotalPartCount);
        this->thread_part_weights[i] = allocMemory < double >(this->max_num_total_part_along_dim * this->max_concurrent_part_calculation);
        this->thread_cut_right_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
        this->thread_cut_left_closest_point[i] = allocMemory<mj_scalar_t>(this->max_num_cut_along_dim * this->max_concurrent_part_calculation);
        this->thread_point_counts[i] =  allocMemory<mj_lno_t>(this->max_num_part_along_dim);
    }
    //for faster communication, concatanation of
    //totalPartWeights sized 2P-1, since there are P parts and P-1 cut lines
    //leftClosest distances sized P-1, since P-1 cut lines
    //rightClosest distances size P-1, since P-1 cut lines.
    this->total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);
    this->global_total_part_weight_left_right_closests = allocMemory<mj_scalar_t>((this->max_num_total_part_along_dim + this->max_num_cut_along_dim * 2) * this->max_concurrent_part_calculation);


    //deep-copy the input coordinates so the algorithm owns (and may permute)
    //its own storage; mj_coordinates is repointed to the copy below.
    mj_scalar_t **coord = allocMemory<mj_scalar_t *>(this->coord_dim);
    for (int i=0; i < this->coord_dim; i++){
      coord[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
      for (mj_lno_t j=0; j < this->num_local_coords; j++)
        coord[i][j] = this->mj_coordinates[i][j];
    }
    this->mj_coordinates = coord;


    //at least one weight array is allocated even if no weights are given.
    int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
    mj_scalar_t **weights = allocMemory<mj_scalar_t *>(criteria_dim);

    for (int i=0; i < criteria_dim; i++){
      weights[i] = NULL;
    }
    //deep-copy the input weights, same ownership reasoning as the coordinates.
    for (int i=0; i < this->num_weights_per_coord; i++){
      weights[i] = allocMemory<mj_scalar_t>(this->num_local_coords);
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
      for (mj_lno_t j=0; j < this->num_local_coords; j++)
        weights[i][j] = this->mj_weights[i][j];

    }
  this->mj_weights = weights;
    //current global ids start out equal to the initial global ids.
    this->current_mj_gnos = allocMemory<mj_gno_t>(this->num_local_coords);
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for (mj_lno_t j=0; j < this->num_local_coords; j++)
      this->current_mj_gnos[j] = this->initial_mj_gnos[j];

    this->owner_of_coordinate = allocMemory<int>(this->num_local_coords);

    //every coordinate is initially owned by this process.
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for (mj_lno_t j=0; j < this->num_local_coords; j++)
      this->owner_of_coordinate[j] = this->myActualRank;
}
02064 
02065 /* \brief compute the global bounding box
02066  */
02067 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
02068           typename mj_part_t>
02069 void AlgMJ<mj_scalar_t,mj_lno_t,mj_gno_t,mj_part_t>::compute_global_box()
02070 {
02071     //local min coords
02072     mj_scalar_t *mins = allocMemory<mj_scalar_t>(this->coord_dim);
02073     //global min coords
02074     mj_scalar_t *gmins = allocMemory<mj_scalar_t>(this->coord_dim);
02075     //local max coords
02076     mj_scalar_t *maxs = allocMemory<mj_scalar_t>(this->coord_dim);
02077     //global max coords
02078     mj_scalar_t *gmaxs = allocMemory<mj_scalar_t>(this->coord_dim);
02079 
02080     for (int i = 0; i < this->coord_dim; ++i){
02081         mj_scalar_t localMin = this->mj_coordinates[i][0];
02082         mj_scalar_t localMax = this->mj_coordinates[i][0];
02083         for (mj_lno_t j = 1; j < this->num_local_coords; ++j){
02084             if (this->mj_coordinates[i][j] < localMin){
02085                 localMin = this->mj_coordinates[i][j];
02086             }
02087             if (this->mj_coordinates[i][j] > localMax){
02088                 localMax = this->mj_coordinates[i][j];
02089             }
02090         }
02091         //cout << " localMin:" << localMin << endl;
02092         //cout << " localMax:" << localMax << endl;
02093         mins[i] = localMin;
02094         maxs[i] = localMax;
02095     }
02096     reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MIN,
02097             this->coord_dim, mins, gmins
02098     );
02099 
02100     reduceAll<int, mj_scalar_t>(*this->comm, Teuchos::REDUCE_MAX,
02101             this->coord_dim, maxs, gmaxs
02102     );
02103 
02104     //create single box with all areas.
02105     global_box = rcp(new mj_partBox_t(0,this->coord_dim,gmins,gmaxs));
02106     //coordinateModelPartBox <mj_scalar_t, mj_part_t> tmpBox (0, coordDim);
02107     freeArray<mj_scalar_t>(mins);
02108     freeArray<mj_scalar_t>(gmins);
02109     freeArray<mj_scalar_t>(maxs);
02110     freeArray<mj_scalar_t>(gmaxs);
02111 }
02112 
02113 /* \brief for part communication we keep track of the box boundaries.
02114  * This is performed when either asked specifically, or when geometric mapping is performed afterwards.
02115  * This function initializes a single box with all global min and max coordinates.
02116  * \param initial_partitioning_boxes the input and output vector for boxes.
02117  */
02118 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
02119           typename mj_part_t>
02120 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::init_part_boxes(
02121     RCP<mj_partBoxVector_t> & initial_partitioning_boxes
02122 )
02123 {
02124     mj_partBox_t tmp_box(*global_box);
02125     initial_partitioning_boxes->push_back(tmp_box);
02126 }
02127 
02138 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
02139           typename mj_part_t>
02140 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_local_min_max_coord_totW(
02141     mj_lno_t coordinate_begin_index,
02142     mj_lno_t coordinate_end_index,
02143     mj_lno_t *mj_current_coordinate_permutations,
02144     mj_scalar_t *mj_current_dim_coords,
02145     mj_scalar_t &min_coordinate,
02146     mj_scalar_t &max_coordinate,
02147     mj_scalar_t &total_weight){
02148 
02149     //if the part is empty.
02150     //set the min and max coordinates as reverse.
02151     if(coordinate_begin_index >= coordinate_end_index)
02152     {
02153         min_coordinate = this->maxScalar_t;
02154         max_coordinate = this->minScalar_t;
02155         total_weight = 0;
02156     }
02157     else {
02158         mj_scalar_t my_total_weight = 0;
02159 #ifdef HAVE_ZOLTAN2_OMP
02160 #pragma omp parallel
02161 #endif
02162         {
02163             //if uniform weights are used, then weight is equal to count.
02164             if (this->mj_uniform_weights[0]) {
02165 #ifdef HAVE_ZOLTAN2_OMP
02166 #pragma omp single
02167 #endif
02168                 {
02169                     my_total_weight = coordinate_end_index - coordinate_begin_index;
02170                 }
02171 
02172             }
02173             else {
02174                 //if not uniform, then weights are reducted from threads.
02175 #ifdef HAVE_ZOLTAN2_OMP
02176 #pragma omp for reduction(+:my_total_weight)
02177 #endif
02178                 for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
02179                     int i = mj_current_coordinate_permutations[ii];
02180                     my_total_weight += this->mj_weights[0][i];
02181                 }
02182             }
02183 
02184             int my_thread_id = 0;
02185 #ifdef HAVE_ZOLTAN2_OMP
02186             my_thread_id = omp_get_thread_num();
02187 #endif
02188             mj_scalar_t my_thread_min_coord, my_thread_max_coord;
02189             my_thread_min_coord=my_thread_max_coord
02190                 =mj_current_dim_coords[mj_current_coordinate_permutations[coordinate_begin_index]];
02191 
02192 
02193 #ifdef HAVE_ZOLTAN2_OMP
02194 #pragma omp for
02195 #endif
02196             for(mj_lno_t j = coordinate_begin_index + 1; j < coordinate_end_index; ++j){
02197                 int i = mj_current_coordinate_permutations[j];
02198                 if(mj_current_dim_coords[i] > my_thread_max_coord)
02199                     my_thread_max_coord = mj_current_dim_coords[i];
02200                 if(mj_current_dim_coords[i] < my_thread_min_coord)
02201                     my_thread_min_coord = mj_current_dim_coords[i];
02202             }
02203             this->max_min_coords[my_thread_id] = my_thread_min_coord;
02204             this->max_min_coords[my_thread_id + this->num_threads] = my_thread_max_coord;
02205 
02206 #ifdef HAVE_ZOLTAN2_OMP
02207 //we need a barrier here, because max_min_array might not be filled by some of the threads.
02208 #pragma omp barrier
02209 #pragma omp single nowait
02210 #endif
02211             {
02212                 min_coordinate = this->max_min_coords[0];
02213                 for(int i = 1; i < this->num_threads; ++i){
02214                     if(this->max_min_coords[i] < min_coordinate)
02215                         min_coordinate = this->max_min_coords[i];
02216                 }
02217             }
02218 
02219 #ifdef HAVE_ZOLTAN2_OMP
02220 #pragma omp single nowait
02221 #endif
02222             {
02223                 max_coordinate = this->max_min_coords[this->num_threads];
02224                 for(int i = this->num_threads + 1; i < this->num_threads * 2; ++i){
02225                     if(this->max_min_coords[i] > max_coordinate)
02226                         max_coordinate = this->max_min_coords[i];
02227                 }
02228             }
02229         }
02230         total_weight = my_total_weight;
02231     }
02232 }
02233 
02234 
02242 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
02243           typename mj_part_t>
02244 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_global_min_max_coord_totW(
02245     mj_part_t current_concurrent_num_parts,
02246     mj_scalar_t *local_min_max_total,
02247     mj_scalar_t *global_min_max_total){
02248 
02249   //reduce min for first current_concurrent_num_parts elements, reduce max for next
02250   //concurrentPartCount elements,
02251   //reduce sum for the last concurrentPartCount elements.
02252   if(this->comm->getSize()  > 1){
02253     Teuchos::MultiJaggedCombinedMinMaxTotalReductionOp<int, mj_scalar_t>
02254       reductionOp(
02255           current_concurrent_num_parts,
02256           current_concurrent_num_parts,
02257           current_concurrent_num_parts);
02258     try{
02259       reduceAll<int, mj_scalar_t>(
02260           *(this->comm),
02261           reductionOp,
02262           3 * current_concurrent_num_parts,
02263           local_min_max_total,
02264           global_min_max_total);
02265     }
02266     Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
02267   }
02268   else {
02269     mj_part_t s = 3 * current_concurrent_num_parts;
02270     for (mj_part_t i = 0; i < s; ++i){
02271       global_min_max_total[i] = local_min_max_total[i];
02272     }
02273   }
02274 }
02275 
02276 
02277 
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_initial_cut_coords_target_weights(
    mj_scalar_t min_coord,
    mj_scalar_t max_coord,
    mj_part_t num_cuts/*p-1*/ ,
    mj_scalar_t global_weight,
    mj_scalar_t *initial_cut_coords /*p - 1 sized, coordinate of each cut line*/,
    mj_scalar_t *current_target_part_weights /*cumulative weights, at left side of each cut line. p-1 sized*/,

    std::vector <mj_part_t> *future_num_part_in_parts, //the vector of future part counts in each current part.
    std::vector <mj_part_t> *next_future_num_parts_in_parts,
    mj_part_t concurrent_current_part,
    mj_part_t obtained_part_index
){

    mj_scalar_t coord_range = max_coord - min_coord;
    if(this->mj_uniform_parts[0]){
        {
            //running sum of future part counts to the left of each cut.
            mj_part_t cumulative = 0;
            //how many total future parts the part will be partitioned into.
            mj_scalar_t total_future_part_count_in_part = mj_scalar_t((*future_num_part_in_parts)[concurrent_current_part]);


            //how much each part should weigh in ideal case.
            mj_scalar_t unit_part_weight = global_weight / total_future_part_count_in_part;
            /*
            cout << "total_future_part_count_in_part:" << total_future_part_count_in_part << endl;
            cout << "global_weight:" << global_weight << endl;
            cout << "unit_part_weight" << unit_part_weight <<endl;
            */
            for(mj_part_t i = 0; i < num_cuts; ++i){
                cumulative += (*next_future_num_parts_in_parts)[i + obtained_part_index];

                /*
                cout << "obtained_part_index:" << obtained_part_index <<
                    " (*next_future_num_parts_in_parts)[i + obtained_part_index]:" << (*next_future_num_parts_in_parts)[i + obtained_part_index] <<
                    " cumulative:" << cumulative << endl;
                */
                //set target part weight: the ideal cumulative weight at the
                //left of cut i, proportional to the future part counts.
                current_target_part_weights[i] = cumulative * unit_part_weight;
                //cout <<"i:" << i << " current_target_part_weights:" << current_target_part_weights[i] << endl;
                //set initial cut coordinate: place cuts proportionally to the
                //cumulative future part count along the coordinate range.
                initial_cut_coords[i] = min_coord + (coord_range *
                                         cumulative) / total_future_part_count_in_part;
            }
            //NOTE(review): the final cumulative target is set to 1 rather than
            //global_weight -- confirm downstream code never reads this entry
            //as an absolute cumulative weight.
            current_target_part_weights[num_cuts] = 1;
        }

        //round the target part weights.
        //with uniform weights the targets are coordinate counts, so rounding
        //to the nearest integer avoids chasing fractional targets.
        if (this->mj_uniform_weights[0]){
          for(mj_part_t i = 0; i < num_cuts + 1; ++i){
                current_target_part_weights[i] = long(current_target_part_weights[i] + 0.5);
            }
        }
    }
    else {
      std::cerr << "MJ does not support non uniform part weights" << std::endl;
      exit(1);
    }
}
02357 
02358 
02371 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
02372           typename mj_part_t>
02373 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_initial_coordinate_parts(
02374     mj_scalar_t &max_coordinate,
02375     mj_scalar_t &min_coordinate,
02376     mj_part_t &concurrent_current_part_index,
02377     mj_lno_t coordinate_begin_index,
02378     mj_lno_t coordinate_end_index,
02379     mj_lno_t *mj_current_coordinate_permutations,
02380     mj_scalar_t *mj_current_dim_coords,
02381     mj_part_t *mj_part_ids,
02382     mj_part_t &partition_count
02383 ){
02384     mj_scalar_t coordinate_range = max_coordinate - min_coordinate;
02385 
02386     //if there is single point, or if all points are along a line.
02387     //set initial part to 0 for all.
02388     if(ABS(coordinate_range) < this->sEpsilon ){
02389 #ifdef HAVE_ZOLTAN2_OMP
02390 #pragma omp parallel for
02391 #endif
02392         for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
02393           mj_part_ids[mj_current_coordinate_permutations[ii]] = 0;
02394         }
02395     }
02396     else{
02397 
02398         //otherwise estimate an initial part for each coordinate.
02399         //assuming uniform distribution of points.
02400         mj_scalar_t slice = coordinate_range / partition_count;
02401 
02402 #ifdef HAVE_ZOLTAN2_OMP
02403 #pragma omp parallel for
02404 #endif
02405         for(mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
02406 
02407             mj_lno_t iii = mj_current_coordinate_permutations[ii];
02408             mj_part_t pp = mj_part_t((mj_current_dim_coords[iii] - min_coordinate) / slice);
02409             mj_part_ids[iii] = 2 * pp;
02410         }
02411     }
02412 }
02413 
02414 
/*! \brief One-dimensional partitioning driver.
 *
 * Iteratively refines the cut coordinates along the current dimension for
 * all concurrently processed parts until every cut is marked determined
 * (total_incomplete_cut_count reaches 0). Each iteration: threads compute
 * per-thread part weights and closest points, the results are accumulated
 * across threads, reduced across MPI ranks, and new cut coordinates are
 * computed; then the working and current cut arrays are swapped.
 *
 * \param mj_current_dim_coords coordinates of points along the current dimension.
 * \param used_imbalance_tolerance imbalance tolerance used to accept a cut.
 * \param current_work_part index of the first part being worked on.
 * \param current_concurrent_num_parts number of parts partitioned concurrently.
 * \param current_cut_coordinates in/out array of cut coordinates; holds the
 *        final cuts on return.
 * \param total_incomplete_cut_count initial number of cuts still to be found
 *        (passed by value; the shared copy is decremented inside the parallel
 *        region via omp single).
 * \param num_partitioning_in_current_dim number of parts requested for each
 *        part in this dimension.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part(
    mj_scalar_t *mj_current_dim_coords,
    mj_scalar_t used_imbalance_tolerance,
    mj_part_t current_work_part,
    mj_part_t current_concurrent_num_parts,
    mj_scalar_t *current_cut_coordinates,
    mj_part_t total_incomplete_cut_count,
    std::vector <mj_part_t> &num_partitioning_in_current_dim
){


    mj_part_t rectilinear_cut_count = 0;
    // temp_cut_coords aliases the caller's array initially; it is swapped
    // with cut_coordinates_work_array after each iteration.
    mj_scalar_t *temp_cut_coords = current_cut_coordinates;

    // Reduction operator combining part weights (sum) and left/right closest
    // points (max/min) in a single MPI reduction.
    Teuchos::MultiJaggedCombinedReductionOp<mj_part_t, mj_scalar_t>
                 *reductionOp = NULL;
    reductionOp = new Teuchos::MultiJaggedCombinedReductionOp
                     <mj_part_t, mj_scalar_t>(
                         &num_partitioning_in_current_dim ,
                         current_work_part ,
                         current_concurrent_num_parts);

    size_t total_reduction_size = 0;
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel shared(total_incomplete_cut_count,  rectilinear_cut_count)
#endif
    {
        int me = 0;
#ifdef HAVE_ZOLTAN2_OMP
        me = omp_get_thread_num();
#endif
        // Per-thread scratch arrays; indexed by thread id, so no locking needed.
        double *my_thread_part_weights = this->thread_part_weights[me];
        mj_scalar_t *my_thread_left_closest = this->thread_cut_left_closest_point[me];
        mj_scalar_t *my_thread_right_closest = this->thread_cut_right_closest_point[me];

#ifdef HAVE_ZOLTAN2_OMP
#pragma omp single
#endif
            {
                //initialize the lower and upper bounds of the cuts.
                mj_part_t next = 0;
                for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){

                    mj_part_t num_part_in_dim =  num_partitioning_in_current_dim[current_work_part + i];
                    mj_part_t num_cut_in_dim = num_part_in_dim - 1;
                    // Per concurrent part the reduction carries:
                    // (2*cuts+1) part weights + cuts left-closest + cuts right-closest.
                    total_reduction_size += (4 * num_cut_in_dim + 1);

                    for(mj_part_t ii = 0; ii < num_cut_in_dim; ++ii){
                        this->is_cut_line_determined[next] = false;
                        this->cut_lower_bound_coordinates[next] = global_min_max_coord_total_weight[i]; //min coordinate
                        this->cut_upper_bound_coordinates[next] = global_min_max_coord_total_weight[i + current_concurrent_num_parts]; //max coordinate

                        this->cut_upper_bound_weights[next] = global_min_max_coord_total_weight[i + 2 * current_concurrent_num_parts]; //total weight
                        this->cut_lower_bound_weights[next] = 0;

                        if(this->distribute_points_on_cut_lines){
                            this->process_cut_line_weight_to_put_left[next] = 0;
                        }
                        ++next;
                    }
                }
            }

        //no need to have barrier here.
        //pragma omp single have implicit barrier.

        // Main bisection loop: repeat until all cuts for all concurrent
        // parts have converged.
        int iteration = 0;
        while (total_incomplete_cut_count != 0){
            iteration += 1;
            //cout << "\niteration:" << iteration  << " ";
            mj_part_t concurrent_cut_shifts = 0;
            size_t total_part_shift = 0;

            // Phase 1: each thread accumulates part weights and closest
            // points for every concurrent part that still has incomplete cuts.
            for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
                mj_part_t num_parts =  -1;
                num_parts =  num_partitioning_in_current_dim[current_work_part + kk];

                mj_part_t num_cuts = num_parts - 1;
                size_t total_part_count = num_parts + size_t (num_cuts) ;
                if (this->my_incomplete_cut_count[kk] > 0){

                    //although isDone shared, currentDone is private and same for all.
                    bool *current_cut_status = this->is_cut_line_determined + concurrent_cut_shifts;
                    double *my_current_part_weights = my_thread_part_weights + total_part_shift;
                    mj_scalar_t *my_current_left_closest = my_thread_left_closest + concurrent_cut_shifts;
                    mj_scalar_t *my_current_right_closest = my_thread_right_closest + concurrent_cut_shifts;

                    mj_part_t conccurent_current_part = current_work_part + kk;
                    // part_xadj holds prefix ends; begin index of part p is end of part p-1.
                    mj_lno_t coordinate_begin_index = conccurent_current_part ==0 ? 0: this->part_xadj[conccurent_current_part -1];
                    mj_lno_t coordinate_end_index = this->part_xadj[conccurent_current_part];
                    mj_scalar_t *temp_current_cut_coords = temp_cut_coords + concurrent_cut_shifts;

                    mj_scalar_t min_coord = global_min_max_coord_total_weight[kk];
                    mj_scalar_t max_coord = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];

                    // compute part weights using existing cuts
                    this->mj_1D_part_get_thread_part_weights(
                        total_part_count,
                        num_cuts,
                        max_coord,//globalMinMaxTotal[kk + concurrentPartCount],//maxScalar,
                        min_coord,//globalMinMaxTotal[kk]//minScalar,
                        coordinate_begin_index,
                        coordinate_end_index,
                        mj_current_dim_coords,
                        temp_current_cut_coords,
                        current_cut_status,
                        my_current_part_weights,
                        my_current_left_closest,
                        my_current_right_closest);

                }

                concurrent_cut_shifts += num_cuts;
                total_part_shift += total_part_count;
            }

            //sum up the results of threads
            this->mj_accumulate_thread_results(
                num_partitioning_in_current_dim,
                current_work_part,
                current_concurrent_num_parts);

            //now sum up the results of mpi processors.
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp single
#endif
            {
                if(this->comm->getSize() > 1){
                    try{
                        reduceAll<int, mj_scalar_t>( *(this->comm), *reductionOp,
                            total_reduction_size,
                            this->total_part_weight_left_right_closests,
                            this->global_total_part_weight_left_right_closests);

                    }
                    Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
                }
                else {
                        // Single-rank case: global result is just the local result.
                        memcpy(
                          this->global_total_part_weight_left_right_closests,
                            this->total_part_weight_left_right_closests,
                            total_reduction_size * sizeof(mj_scalar_t));
                }
            }

            //how much cut will be shifted for the next part in the concurrent part calculation.
            mj_part_t cut_shift = 0;

            //how much the concantaneted array will be shifted for the next part in concurrent part calculation.
            size_t tlr_shift = 0;
            // Phase 2: from the globally reduced weights, compute new cut
            // coordinates for each concurrent part with incomplete cuts.
            for (mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
              mj_part_t num_parts =  num_partitioning_in_current_dim[current_work_part + kk];
              mj_part_t num_cuts = num_parts - 1;
              size_t num_total_part = num_parts + size_t (num_cuts) ;

              //if the cuts of this cut has already been completed.
              //nothing to do for this part.
              //just update the shift amount and proceed.
              if (this->my_incomplete_cut_count[kk] == 0) {
                cut_shift += num_cuts;
                tlr_shift += (num_total_part + 2 * num_cuts);
                continue;
              }

              // Layout of the tlr (total/left/right) array per part:
              // [num_total_part weights][num_cuts left closest][num_cuts right closest].
              mj_scalar_t *current_local_part_weights = this->total_part_weight_left_right_closests  + tlr_shift ;
              mj_scalar_t *current_global_tlr = this->global_total_part_weight_left_right_closests + tlr_shift;
              mj_scalar_t *current_global_left_closest_points = current_global_tlr + num_total_part; //left closest points
              mj_scalar_t *current_global_right_closest_points = current_global_tlr + num_total_part + num_cuts; //right closest points
              mj_scalar_t *current_global_part_weights = current_global_tlr;
              bool *current_cut_line_determined = this->is_cut_line_determined + cut_shift;

              // target_part_weights has (num_cuts + 1) entries per part, hence the +kk.
              mj_scalar_t *current_part_target_weights = this->target_part_weights + cut_shift + kk;
              mj_scalar_t *current_part_cut_line_weight_to_put_left = this->process_cut_line_weight_to_put_left + cut_shift;

              mj_scalar_t min_coordinate = global_min_max_coord_total_weight[kk];
              mj_scalar_t max_coordinate = global_min_max_coord_total_weight[kk + current_concurrent_num_parts];
              mj_scalar_t global_total_weight = global_min_max_coord_total_weight[kk + current_concurrent_num_parts * 2];
              mj_scalar_t *current_cut_lower_bound_weights = this->cut_lower_bound_weights + cut_shift;
              mj_scalar_t *current_cut_upper_weights = this->cut_upper_bound_weights + cut_shift;
              mj_scalar_t *current_cut_upper_bounds = this->cut_upper_bound_coordinates + cut_shift;
              mj_scalar_t *current_cut_lower_bounds = this->cut_lower_bound_coordinates + cut_shift;

              mj_part_t initial_incomplete_cut_count = this->my_incomplete_cut_count[kk];

              // Now compute the new cut coordinates.
              this->mj_get_new_cut_coordinates(
                  num_total_part,
                  num_cuts,
                  max_coordinate,
                  min_coordinate,
                  global_total_weight,
                  used_imbalance_tolerance,
                  current_global_part_weights,
                  current_local_part_weights,
                  current_part_target_weights,
                  current_cut_line_determined,
                  temp_cut_coords + cut_shift,
                  current_cut_upper_bounds,
                  current_cut_lower_bounds,
                  current_global_left_closest_points,
                  current_global_right_closest_points,
                  current_cut_lower_bound_weights,
                  current_cut_upper_weights,
                  this->cut_coordinates_work_array +cut_shift, //new cut coordinates
                  current_part_cut_line_weight_to_put_left,
                  &rectilinear_cut_count,
                  this->my_incomplete_cut_count[kk]);

              cut_shift += num_cuts;
              tlr_shift += (num_total_part + 2 * num_cuts);
              mj_part_t iteration_complete_cut_count = initial_incomplete_cut_count - this->my_incomplete_cut_count[kk];
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp single
#endif
              {
                // Only one thread updates the shared loop-termination counter.
                total_incomplete_cut_count -= iteration_complete_cut_count;
              }

            }
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp barrier
#pragma omp single
#endif
            {
              //swap the cut coordinates for next iteration.
                mj_scalar_t *t = temp_cut_coords;
                temp_cut_coords = this->cut_coordinates_work_array;
                this->cut_coordinates_work_array = t;
            }
        }

        // Needed only if keep_cuts; otherwise can simply swap array pointers
        // cutCoordinates and cutCoordinatesWork.
        // (at first iteration, cutCoordinates == cutCoorindates_tmp).
        // computed cuts must be in cutCoordinates.
        if (current_cut_coordinates != temp_cut_coords){
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp single
#endif
          {
            // Copy the converged cuts back into the caller-visible array.
            mj_part_t next = 0;
            for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){
              mj_part_t num_parts = -1;
              num_parts = num_partitioning_in_current_dim[current_work_part + i];
              mj_part_t num_cuts = num_parts - 1;

              for(mj_part_t ii = 0; ii < num_cuts; ++ii){
                current_cut_coordinates[next + ii] = temp_cut_coords[next + ii];
              }
              next += num_cuts;
            }
          }

#ifdef HAVE_ZOLTAN2_OMP
#pragma omp single
#endif
            {
                // Restore the work-array pointer so the next call sees a
                // consistent pair of buffers.
                this->cut_coordinates_work_array = temp_cut_coords;
            }
        }
    }
    delete reductionOp;
}
02690 
02691 
02711 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
02712           typename mj_part_t>
02713 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_1D_part_get_thread_part_weights(
02714     size_t total_part_count,
02715     mj_part_t num_cuts,
02716     mj_scalar_t max_coord,
02717     mj_scalar_t min_coord,
02718     mj_lno_t coordinate_begin_index,
02719     mj_lno_t coordinate_end_index,
02720     mj_scalar_t *mj_current_dim_coords,
02721     mj_scalar_t *temp_current_cut_coords,
02722     bool *current_cut_status,
02723     double *my_current_part_weights,
02724     mj_scalar_t *my_current_left_closest,
02725     mj_scalar_t *my_current_right_closest){
02726 
02727   // initializations for part weights, left/right closest
02728   for (size_t i = 0; i < total_part_count; ++i){
02729     my_current_part_weights[i] = 0;
02730   }
02731 
02732   //initialize the left and right closest coordinates
02733   //to their max value.
02734   for(mj_part_t i = 0; i < num_cuts; ++i){
02735     my_current_left_closest[i] = min_coord - 1;
02736     my_current_right_closest[i] = max_coord + 1;
02737   }
02738   //mj_lno_t comparison_count = 0;
02739   mj_scalar_t minus_EPSILON = -this->sEpsilon;
02740 #ifdef HAVE_ZOLTAN2_OMP
02741   //no need for the barrier as all threads uses their local memories.
02742   //dont change the static scheduling here, as it is assumed when the new
02743   //partitions are created later.
02744 #pragma omp for
02745 #endif
02746   for (mj_lno_t ii = coordinate_begin_index; ii < coordinate_end_index; ++ii){
02747     int i = this->coordinate_permutations[ii];
02748 
02749     //the accesses to assigned_part_ids are thread safe
02750     //since each coordinate is assigned to only a single thread.
02751     mj_part_t j = this->assigned_part_ids[i] / 2;
02752 
02753     if(j >= num_cuts){
02754       j = num_cuts - 1;
02755     }
02756 
02757     mj_part_t lower_cut_index = 0;
02758     mj_part_t upper_cut_index = num_cuts - 1;
02759 
02760     mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
02761     bool is_inserted = false;
02762     bool is_on_left_of_cut = false;
02763     bool is_on_right_of_cut = false;
02764     mj_part_t last_compared_part = -1;
02765 
02766     mj_scalar_t coord = mj_current_dim_coords[i];
02767 
02768     while(upper_cut_index >= lower_cut_index)
02769     {
02770       //comparison_count++;
02771       last_compared_part = -1;
02772       is_on_left_of_cut = false;
02773       is_on_right_of_cut = false;
02774       mj_scalar_t cut = temp_current_cut_coords[j];
02775       mj_scalar_t distance_to_cut = coord - cut;
02776       mj_scalar_t abs_distance_to_cut = ABS(distance_to_cut);
02777 
02778       //if it is on the line.
02779       if(abs_distance_to_cut < this->sEpsilon){
02780 
02781         my_current_part_weights[j * 2 + 1] += w;
02782         this->assigned_part_ids[i] = j * 2 + 1;
02783 
02784         //assign left and right closest point to cut as the point is on the cut.
02785         my_current_left_closest[j] = coord;
02786         my_current_right_closest[j] = coord;
02787         //now we need to check if there are other cuts on the same cut coordinate.
02788         //if there are, then we add the weight of the cut to all cuts in the same coordinate.
02789         mj_part_t kk = j + 1;
02790         while(kk < num_cuts){
02791           // Needed when cuts shared the same position
02792           distance_to_cut =ABS(temp_current_cut_coords[kk] - cut);
02793           if(distance_to_cut < this->sEpsilon){
02794             my_current_part_weights[2 * kk + 1] += w;
02795             my_current_left_closest[kk] = coord;
02796             my_current_right_closest[kk] = coord;
02797             kk++;
02798           }
02799           else{
02800             //cut is far away.
02801             //just check the left closest point for the next cut.
02802             if(coord - my_current_left_closest[kk] > this->sEpsilon){
02803               my_current_left_closest[kk] = coord;
02804             }
02805             break;
02806           }
02807         }
02808 
02809 
02810         kk = j - 1;
02811         //continue checking for the cuts on the left if they share the same coordinate.
02812         while(kk >= 0){
02813           distance_to_cut =ABS(temp_current_cut_coords[kk] - cut);
02814           if(distance_to_cut < this->sEpsilon){
02815             my_current_part_weights[2 * kk + 1] += w;
02816             //try to write the partId as the leftmost cut.
02817             this->assigned_part_ids[i] = kk * 2 + 1;
02818             my_current_left_closest[kk] = coord;
02819             my_current_right_closest[kk] = coord;
02820             kk--;
02821           }
02822           else{
02823             //if cut is far away on the left of the point.
02824             //then just compare for right closest point.
02825             if(my_current_right_closest[kk] - coord > this->sEpsilon){
02826               my_current_right_closest[kk] = coord;
02827             }
02828             break;
02829           }
02830         }
02831 
02832         is_inserted = true;
02833         break;
02834       }
02835       else {
02836         //if point is on the left of the cut.
02837         if (distance_to_cut < 0) {
02838           bool _break = false;
02839           if(j > 0){
02840             //check distance to the cut on the left the current cut compared.
02841             //if point is on the right, then we find the part of the point.
02842             mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j - 1];
02843             if(distance_to_next_cut > this->sEpsilon){
02844               _break = true;
02845             }
02846           }
02847           //if point is not on the right of the next cut, then
02848           //set the upper bound to this cut.
02849           upper_cut_index = j - 1;
02850           //set the last part, and mark it as on the left of the last part.
02851           is_on_left_of_cut = true;
02852           last_compared_part = j;
02853           if(_break) break;
02854         }
02855         else {
02856           //if point is on the right of the cut.
02857           bool _break = false;
02858           if(j < num_cuts - 1){
02859             //check distance to the cut on the left the current cut compared.
02860             //if point is on the right, then we find the part of the point.
02861             mj_scalar_t distance_to_next_cut = coord - temp_current_cut_coords[j + 1];
02862             if(distance_to_next_cut < minus_EPSILON){
02863                        _break = true;
02864                      }
02865           }
02866 
02867           //if point is not on the left of the next cut, then
02868           //set the upper bound to this cut.
02869           lower_cut_index = j + 1;
02870           //set the last part, and mark it as on the right of the last part.
02871           is_on_right_of_cut = true;
02872           last_compared_part = j;
02873           if(_break) break;
02874         }
02875       }
02876 
02877       j = (upper_cut_index + lower_cut_index) / 2;
02878     }
02879     if(!is_inserted){
02880       if(is_on_right_of_cut){
02881 
02882         //add it to the right of the last compared part.
02883         my_current_part_weights[2 * last_compared_part + 2] += w;
02884         this->assigned_part_ids[i] = 2 * last_compared_part + 2;
02885 
02886         //update the right closest point of last compared cut.
02887         if(my_current_right_closest[last_compared_part] - coord > this->sEpsilon){
02888           my_current_right_closest[last_compared_part] = coord;
02889         }
02890         //update the left closest point of the cut on the right of the last compared cut.
02891         if(last_compared_part+1 < num_cuts){
02892 
02893           if(coord - my_current_left_closest[last_compared_part + 1] > this->sEpsilon){
02894             my_current_left_closest[last_compared_part + 1] = coord;
02895           }
02896         }
02897 
02898       }
02899       else if(is_on_left_of_cut){
02900 
02901         //add it to the left of the last compared part.
02902         my_current_part_weights[2 * last_compared_part] += w;
02903         this->assigned_part_ids[i] = 2 * last_compared_part;
02904 
02905 
02906         //update the left closest point of last compared cut.
02907         if(coord - my_current_left_closest[last_compared_part] > this->sEpsilon){
02908           my_current_left_closest[last_compared_part] = coord;
02909         }
02910 
02911         //update the right closest point of the cut on the left of the last compared cut.
02912         if(last_compared_part-1 >= 0){
02913           if(my_current_right_closest[last_compared_part -1] - coord > this->sEpsilon){
02914             my_current_right_closest[last_compared_part -1] = coord;
02915           }
02916         }
02917       }
02918     }
02919   }
02920 
02921   // prefix sum computation.
02922   //we need prefix sum for each part to determine cut positions.
02923   for (size_t i = 1; i < total_part_count; ++i){
02924     // check for cuts sharing the same position; all cuts sharing a position
02925     // have the same weight == total weight for all cuts sharing the position.
02926     // don't want to accumulate that total weight more than once.
02927     if(i % 2 == 0 && i > 1 && i < total_part_count - 1 &&
02928         ABS(temp_current_cut_coords[i / 2] - temp_current_cut_coords[i /2 - 1])
02929     < this->sEpsilon){
02930       //i % 2 = 0 when part i represents the cut coordinate.
02931       //if it is a cut, and if the next cut also have the same coordinate, then
02932       //dont addup.
02933       my_current_part_weights[i] = my_current_part_weights[i-2];
02934       continue;
02935     }
02936     //otherwise do the prefix sum.
02937     my_current_part_weights[i] += my_current_part_weights[i-1];
02938   }
02939 }
02940 
02941 
/*! \brief Accumulates the per-thread part weights and the closest left/right
 * points into the process-wide arrays used for the MPI reduction.
 *
 * Runs under a single thread (omp single) after a barrier, since it reads
 * every thread's scratch arrays. Results are concatenated per concurrent
 * part in total_part_weight_left_right_closests with the layout:
 * [num_total_part weights][num_cuts left closest][num_cuts right closest].
 *
 * \param num_partitioning_in_current_dim number of parts requested per part.
 * \param current_work_part index of the first concurrently processed part.
 * \param current_concurrent_num_parts number of concurrently processed parts.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_accumulate_thread_results(
    const std::vector <mj_part_t> &num_partitioning_in_current_dim,
    mj_part_t current_work_part,
    mj_part_t current_concurrent_num_parts){

#ifdef HAVE_ZOLTAN2_OMP
  //needs barrier here, as it requires all threads to finish mj_1D_part_get_thread_part_weights
  //using parallel region here reduces the performance because of the cache invalidates.
#pragma omp barrier
#pragma omp single
#endif
  {
    size_t tlr_array_shift = 0;
    mj_part_t cut_shift = 0;

    //iterate for all concurrent parts to find the left and right closest points in the process.
    for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){

      mj_part_t num_parts_in_part =  num_partitioning_in_current_dim[current_work_part + i];
      mj_part_t num_cuts_in_part = num_parts_in_part - 1;
      size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;

      //iterate for cuts in a single part.
      for(mj_part_t ii = 0; ii < num_cuts_in_part ; ++ii){
        mj_part_t next = tlr_array_shift + ii;
        mj_part_t cut_index = cut_shift + ii;
        // Converged cuts keep their previous closest points; skip them.
        if(this->is_cut_line_determined[cut_index]) continue;
        mj_scalar_t left_closest_in_process = this->thread_cut_left_closest_point[0][cut_index],
            right_closest_in_process = this->thread_cut_right_closest_point[0][cut_index];

        //find the closest points from left and right for the cut in the process.
        // Left closest = maximum over threads; right closest = minimum.
        for (int j = 1; j < this->num_threads; ++j){
          if (this->thread_cut_right_closest_point[j][cut_index] < right_closest_in_process ){
            right_closest_in_process = this->thread_cut_right_closest_point[j][cut_index];
          }
          if (this->thread_cut_left_closest_point[j][cut_index] > left_closest_in_process ){
            left_closest_in_process = this->thread_cut_left_closest_point[j][cut_index];
          }
        }
        //store the left and right closes points.
        this->total_part_weight_left_right_closests[num_total_part_in_part +
                                                    next] = left_closest_in_process;
        this->total_part_weight_left_right_closests[num_total_part_in_part +
                                                    num_cuts_in_part + next] = right_closest_in_process;
      }
      //set the shift position in the arrays
      tlr_array_shift += (num_total_part_in_part + 2 * num_cuts_in_part);
      cut_shift += num_cuts_in_part;
    }

    tlr_array_shift = 0;
    cut_shift = 0;
    size_t total_part_array_shift = 0;

    //iterate for all concurrent parts to find the total weight in the process.
    for(mj_part_t i = 0; i < current_concurrent_num_parts; ++i){

      mj_part_t num_parts_in_part =  num_partitioning_in_current_dim[current_work_part + i];
      mj_part_t num_cuts_in_part = num_parts_in_part - 1;
      size_t num_total_part_in_part = num_parts_in_part + size_t (num_cuts_in_part) ;

      for(size_t j = 0; j < num_total_part_in_part; ++j){

        mj_part_t cut_ind = j / 2 + cut_shift;

        //need to check j !=  num_total_part_in_part - 1
            // which is same as j/2 != num_cuts_in_part.
        //we cannot check it using cut_ind, because of the concurrent part concantanetion.
        if(j !=  num_total_part_in_part - 1 && this->is_cut_line_determined[cut_ind]) continue;
        // Sum this weight slot across all threads' scratch arrays.
        double pwj = 0;
        for (int k = 0; k < this->num_threads; ++k){
          pwj += this->thread_part_weights[k][total_part_array_shift + j];
        }
        //size_t jshift = j % total_part_count + i * (total_part_count + 2 * noCuts);
        this->total_part_weight_left_right_closests[tlr_array_shift + j] = pwj;
      }
      cut_shift += num_cuts_in_part;
      tlr_array_shift += num_total_part_in_part + 2 * num_cuts_in_part;
      total_part_array_shift += num_total_part_in_part;
    }
  }
  //the other threads needs to wait here.
  //but we don't need a pragma omp barrier.
  //as omp single has already have implicit barrier.
}
03036 
03037 
03047 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
03048           typename mj_part_t>
03049 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_calculate_new_cut_position (
03050   mj_scalar_t cut_upper_bound,
03051     mj_scalar_t cut_lower_bound,
03052     mj_scalar_t cut_upper_weight,
03053     mj_scalar_t cut_lower_weight,
03054     mj_scalar_t expected_weight,
03055     mj_scalar_t &new_cut_position){
03056 
03057     if(ABS(cut_upper_bound - cut_lower_bound) < this->sEpsilon){
03058       new_cut_position = cut_upper_bound; //or lower bound does not matter.
03059     }
03060 
03061 
03062     if(ABS(cut_upper_weight - cut_lower_weight) < this->sEpsilon){
03063       new_cut_position = cut_lower_bound;
03064     }
03065 
03066     mj_scalar_t coordinate_range = (cut_upper_bound - cut_lower_bound);
03067     mj_scalar_t weight_range = (cut_upper_weight - cut_lower_weight);
03068     mj_scalar_t my_weight_diff = (expected_weight - cut_lower_weight);
03069 
03070     mj_scalar_t required_shift = (my_weight_diff / weight_range);
03071     int scale_constant = 20;
03072     int shiftint= int (required_shift * scale_constant);
03073     if (shiftint == 0) shiftint = 1;
03074     required_shift = mj_scalar_t (shiftint) / scale_constant;
03075     new_cut_position = coordinate_range * required_shift + cut_lower_bound;
03076 }
03077 
03078 
03089 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
03090           typename mj_part_t>
03091 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_create_new_partitions(
03092     mj_part_t num_parts,
03093     mj_scalar_t *mj_current_dim_coords,
03094     mj_scalar_t *current_concurrent_cut_coordinate,
03095     mj_lno_t coordinate_begin,
03096     mj_lno_t coordinate_end,
03097     mj_scalar_t *used_local_cut_line_weight_to_left,
03098     double **used_thread_part_weight_work,
03099     mj_lno_t *out_part_xadj){
03100 
03101   mj_part_t num_cuts = num_parts - 1;
03102 
03103 #ifdef HAVE_ZOLTAN2_OMP
03104 #pragma omp parallel
03105 #endif
03106   {
03107     int me = 0;
03108 #ifdef HAVE_ZOLTAN2_OMP
03109     me = omp_get_thread_num();
03110 #endif
03111 
03112     mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
03113     mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
03114 
03115     //now if the rectilinear partitioning is allowed we decide how
03116     //much weight each thread should put to left and right.
03117     if (this->distribute_points_on_cut_lines){
03118       my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
03119       // this for assumes the static scheduling in mj_1D_part calculation.
03120 #ifdef HAVE_ZOLTAN2_OMP
03121 #pragma omp for
03122 #endif
03123       for (mj_part_t i = 0; i < num_cuts; ++i){
03124         //the left to be put on the left of the cut.
03125         mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
03126         for(int ii = 0; ii < this->num_threads; ++ii){
03127           if(left_weight > this->sEpsilon){
03128             //the weight of thread ii on cut.
03129             mj_scalar_t thread_ii_weight_on_cut = used_thread_part_weight_work[ii][i * 2 + 1] - used_thread_part_weight_work[ii][i * 2 ];
03130             if(thread_ii_weight_on_cut < left_weight){
03131               //if left weight is bigger than threads weight on cut.
03132               this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
03133             }
03134             else {
03135               //if thread's weight is bigger than space, then put only a portion.
03136               this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
03137             }
03138             left_weight -= thread_ii_weight_on_cut;
03139           }
03140           else {
03141             this->thread_cut_line_weight_to_put_left[ii][i] = 0;
03142           }
03143         }
03144       }
03145 
03146       if(num_cuts > 0){
03147         //this is a special case. If cutlines share the same coordinate, their weights are equal.
03148         //we need to adjust the ratio for that.
03149         for (mj_part_t i = num_cuts - 1; i > 0 ; --i){
03150           if(ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
03151             my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
03152           }
03153           my_local_thread_cut_weights_to_put_left[i] = int ((my_local_thread_cut_weights_to_put_left[i] + LEAST_SIGNIFICANCE) * SIGNIFICANCE_MUL)
03154                                 / mj_scalar_t(SIGNIFICANCE_MUL);
03155         }
03156       }
03157     }
03158 
03159     for(mj_part_t ii = 0; ii < num_parts; ++ii){
03160       thread_num_points_in_parts[ii] = 0;
03161     }
03162 
03163 
03164 #ifdef HAVE_ZOLTAN2_OMP
03165     //dont change static scheduler. the static partitioner used later as well.
03166 #pragma omp for
03167 #endif
03168     for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
03169 
03170       mj_lno_t coordinate_index = this->coordinate_permutations[ii];
03171       mj_scalar_t coordinate_weight = this->mj_uniform_weights[0]? 1:this->mj_weights[0][coordinate_index];
03172       mj_part_t coordinate_assigned_place = this->assigned_part_ids[coordinate_index];
03173       mj_part_t coordinate_assigned_part = coordinate_assigned_place / 2;
03174       if(coordinate_assigned_place % 2 == 1){
03175         //if it is on the cut.
03176         if(this->distribute_points_on_cut_lines
03177             && my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] > this->sEpsilon){
03178           //if the rectilinear partitioning is allowed,
03179           //and the thread has still space to put on the left of the cut
03180           //then thread puts the vertex to left.
03181           my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
03182           //if putting the vertex to left increased the weight more than expected.
03183           //and if the next cut is on the same coordinate,
03184           //then we need to adjust how much weight next cut puts to its left as well,
03185           //in order to take care of the imbalance.
03186           if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0
03187               && coordinate_assigned_part < num_cuts - 1
03188               && ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
03189                   current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
03190             my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
03191           }
03192           ++thread_num_points_in_parts[coordinate_assigned_part];
03193           this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
03194         }
03195         else{
03196           //if there is no more space on the left, put the coordinate to the right of the cut.
03197           ++coordinate_assigned_part;
03198           //this while loop is necessary when a line is partitioned into more than 2 parts.
03199           while(this->distribute_points_on_cut_lines &&
03200               coordinate_assigned_part < num_cuts){
03201             //traverse all the cut lines having the same partitiong
03202             if(ABS(current_concurrent_cut_coordinate[coordinate_assigned_part] -
03203                 current_concurrent_cut_coordinate[coordinate_assigned_part - 1])
03204                 < this->sEpsilon){
03205               //if line has enough space on left, put it there.
03206               if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >
03207               this->sEpsilon &&
03208               my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] >=
03209               ABS(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] - coordinate_weight)){
03210                 my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] -= coordinate_weight;
03211                 //Again if it put too much on left of the cut,
03212                 //update how much the next cut sharing the same coordinate will put to its left.
03213                 if(my_local_thread_cut_weights_to_put_left[coordinate_assigned_part] < 0 &&
03214                     coordinate_assigned_part < num_cuts - 1 &&
03215                     ABS(current_concurrent_cut_coordinate[coordinate_assigned_part+1] -
03216                         current_concurrent_cut_coordinate[coordinate_assigned_part]) < this->sEpsilon){
03217                   my_local_thread_cut_weights_to_put_left[coordinate_assigned_part + 1] += my_local_thread_cut_weights_to_put_left[coordinate_assigned_part];
03218                 }
03219                 break;
03220               }
03221             }
03222             else {
03223               break;
03224             }
03225             ++coordinate_assigned_part;
03226           }
03227           ++thread_num_points_in_parts[coordinate_assigned_part];
03228           this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
03229         }
03230       }
03231       else {
03232         //if it is already assigned to a part, then just put it to the corresponding part.
03233         ++thread_num_points_in_parts[coordinate_assigned_part];
03234         this->assigned_part_ids[coordinate_index] = coordinate_assigned_part;
03235       }
03236     }
03237 
03238 
03239 
03240     //now we calculate where each thread will write in new_coordinate_permutations array.
03241     //first we find the out_part_xadj, by marking the begin and end points of each part found.
03242     //the below loop find the number of points in each part, and writes it to out_part_xadj
03243 #ifdef HAVE_ZOLTAN2_OMP
03244 #pragma omp for
03245 #endif
03246     for(mj_part_t j = 0; j < num_parts; ++j){
03247       mj_lno_t num_points_in_part_j_upto_thread_i = 0;
03248       for (int i = 0; i < this->num_threads; ++i){
03249         mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
03250         //prefix sum to thread point counts, so that each will have private space to write.
03251         this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
03252         num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
03253 
03254       }
03255       out_part_xadj[j] = num_points_in_part_j_upto_thread_i;// + prev2; //+ coordinateBegin;
03256     }
03257 
03258     //now we need to do a prefix sum to out_part_xadj[j], to point begin and end of each part.
03259 #ifdef HAVE_ZOLTAN2_OMP
03260 #pragma omp single
03261 #endif
03262     {
03263       //perform prefix sum for num_points in parts.
03264       for(mj_part_t j = 1; j < num_parts; ++j){
03265         out_part_xadj[j] += out_part_xadj[j - 1];
03266       }
03267     }
03268 
03269     //shift the num points in threads thread to obtain the
03270     //beginning index of each thread's private space.
03271     for(mj_part_t j = 1; j < num_parts; ++j){
03272       thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
03273     }
03274 
03275 
03276     //now thread gets the coordinate and writes the index of coordinate to the permutation array
03277     //using the part index we calculated.
03278 #ifdef HAVE_ZOLTAN2_OMP
03279 #pragma omp for
03280 #endif
03281     for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
03282       mj_lno_t i = this->coordinate_permutations[ii];
03283       mj_part_t p =  this->assigned_part_ids[i];
03284       this->new_coordinate_permutations[coordinate_begin +
03285                                         thread_num_points_in_parts[p]++] = i;
03286     }
03287   }
03288 }
03289 
03290 
03291 
/*! \brief Determines the new cut coordinates for the next iteration of the
 * weighted 1D partitioning (a binary search on each cut position).
 *
 * A cut is finalized when (a) both sides of it satisfy the imbalance
 * tolerance, (b) its target weight can be met exactly by splitting the
 * coordinates lying on the cut line (rectilinear case, only when
 * distribute_points_on_cut_lines is enabled), or (c) the newly interpolated
 * position moves less than sEpsilon. Otherwise the cut's lower/upper search
 * bounds and bound weights are tightened using the weights observed at the
 * neighboring cuts, and a new position is computed with
 * mj_calculate_new_cut_position.
 *
 * Called from inside an OpenMP parallel region: the per-cut loops are
 * work-shared (orphaned "omp for"), shared counters are updated with
 * "omp atomic", and the inter-process scan for the rectilinear weight
 * distribution is performed by a single thread ("omp single").
 *
 * Weight-array layout (established by the reads below): for cut i,
 * index [2i] holds the weight strictly to the left of the cut, and
 * [2i+1] holds the weight left of or on the cut, so [2i+1]-[2i] is the
 * weight sitting exactly on the cut line.
 *
 * \param num_total_part number of parts + cuts for the concurrent parts;
 *        not referenced in this body (kept for interface symmetry).
 * \param num_cuts number of cut lines being searched.
 * \param max_coordinate global maximum coordinate in the part.
 * \param min_coordinate global minimum coordinate in the part.
 * \param global_total_weight total weight of the part over all processes.
 * \param used_imbalance_tolerance allowed imbalance for each side of a cut.
 * \param current_global_part_weights global cumulative weights (layout above).
 * \param current_local_part_weights this process's cumulative weights.
 * \param current_part_target_weights desired cumulative weight left of each cut.
 * \param current_cut_line_determined in/out: per-cut "search finished" flag.
 * \param current_cut_coordinates current cut positions.
 * \param current_cut_upper_bounds in/out: upper bound of each cut's interval.
 * \param current_cut_lower_bounds in/out: lower bound of each cut's interval.
 * \param current_global_left_closest_points closest coordinate left of each cut.
 * \param current_global_right_closest_points closest coordinate right of each cut.
 * \param current_cut_lower_bound_weights in/out: weight at the lower bound.
 * \param current_cut_upper_weights in/out: weight at the upper bound.
 * \param new_current_cut_coordinates output: cut positions for next iteration.
 * \param current_part_cut_line_weight_to_put_left output: how much of the
 *        weight on each cut this process should place to the left.
 * \param rectilinear_cut_count in/out (shared, atomic): number of cuts being
 *        resolved by splitting on-cut coordinates this iteration.
 * \param my_num_incomplete_cut in/out (shared, atomic): cuts still searching.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_get_new_cut_coordinates(
    const size_t &num_total_part,
    const mj_part_t &num_cuts,
    const mj_scalar_t &max_coordinate,
    const mj_scalar_t &min_coordinate,
    const mj_scalar_t &global_total_weight,
    const mj_scalar_t &used_imbalance_tolerance,
    mj_scalar_t * current_global_part_weights,
    const mj_scalar_t * current_local_part_weights,
    const mj_scalar_t *current_part_target_weights,
    bool *current_cut_line_determined,
    mj_scalar_t *current_cut_coordinates,
    mj_scalar_t *current_cut_upper_bounds,
    mj_scalar_t *current_cut_lower_bounds,
    mj_scalar_t *current_global_left_closest_points,
    mj_scalar_t *current_global_right_closest_points,
    mj_scalar_t * current_cut_lower_bound_weights,
    mj_scalar_t * current_cut_upper_weights,
    mj_scalar_t *new_current_cut_coordinates,
    mj_scalar_t *current_part_cut_line_weight_to_put_left,
    mj_part_t *rectilinear_cut_count,
    mj_part_t &my_num_incomplete_cut){

  //seen weight in the part (weight currently left of the cut being examined).
  mj_scalar_t seen_weight_in_part = 0;
  //expected (target) weight for the part left of the cut.
  mj_scalar_t expected_weight_in_part = 0;
  //imbalance for the left and right side of the cut.
  mj_scalar_t imbalance_on_left = 0, imbalance_on_right = 0;


#ifdef HAVE_ZOLTAN2_OMP
#pragma omp for
#endif
  for (mj_part_t i = 0; i < num_cuts; i++){
    //if left and right closest points are not set yet
    //(they still hold sentinel values outside [min, max]),
    //set them to the cut itself.
    if(min_coordinate - current_global_left_closest_points[i] > this->sEpsilon)
      current_global_left_closest_points[i] = current_cut_coordinates[i];
    if(current_global_right_closest_points[i] - max_coordinate > this->sEpsilon)
      current_global_right_closest_points[i] = current_cut_coordinates[i];

  }
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp for
#endif
  for (mj_part_t i = 0; i < num_cuts; i++){

    if(this->distribute_points_on_cut_lines){
      //init the weight on the cut.
      this->global_rectilinear_cut_weight[i] = 0;
      this->process_rectilinear_cut_weight[i] = 0;
    }
    //if already determined at previous iterations,
    //then just write the coordinate to new array, and proceed.
    if(current_cut_line_determined[i]) {
      new_current_cut_coordinates[i] = current_cut_coordinates[i];
      continue;
    }

    //current weight of the part at the left of the cut line.
    seen_weight_in_part = current_global_part_weights[i * 2];

    /*
    cout << "seen_weight_in_part:" << i << " is "<< seen_weight_in_part << endl;
    cout << "\tcut:" << current_cut_coordinates[i]
           << " current_cut_lower_bounds:" << current_cut_lower_bounds[i]
               << " current_cut_upper_bounds:" << current_cut_upper_bounds[i] << endl;
               */
    //expected ratio
    expected_weight_in_part = current_part_target_weights[i];
    //leftImbalance = imbalanceOf(seenW, globalTotalWeight, expected);
    imbalance_on_left = imbalanceOf2(seen_weight_in_part, expected_weight_in_part);
    //rightImbalance = imbalanceOf(globalTotalWeight - seenW, globalTotalWeight, 1 - expected);
    imbalance_on_right = imbalanceOf2(global_total_weight - seen_weight_in_part, global_total_weight - expected_weight_in_part);

    //negative imbalance means underweight; a side is valid when its
    //imbalance magnitude is within the tolerance (up to sEpsilon slack).
    bool is_left_imbalance_valid = ABS(imbalance_on_left) - used_imbalance_tolerance < this->sEpsilon ;
    bool is_right_imbalance_valid = ABS(imbalance_on_right) - used_imbalance_tolerance < this->sEpsilon;

    //if the cut line reaches to desired imbalance.
    if(is_left_imbalance_valid && is_right_imbalance_valid){
      current_cut_line_determined[i] = true;
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp atomic
#endif
      my_num_incomplete_cut -= 1;
      new_current_cut_coordinates [i] = current_cut_coordinates[i];
      continue;
    }
    else if(imbalance_on_left < 0){
      //if left imbalance < 0 then we need to move the cut to right.

      if(this->distribute_points_on_cut_lines){
        //if it is okay to distribute the coordinates on
        //the same coordinate to left and right,
        //then check if we can reach the target weight by including the
        //coordinates sitting on the cut line in the left part.
        if (current_global_part_weights[i * 2 + 1] == expected_weight_in_part){
          //if it is we are done.
          current_cut_line_determined[i] = true;
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp atomic
#endif
          my_num_incomplete_cut -= 1;

          //then assign everything on the cut to the left of the cut.
          new_current_cut_coordinates [i] = current_cut_coordinates[i];

          //for this cut all the weight on cut will be put to left;
          //the local on-cut weight is [2i+1]-[2i].

          current_part_cut_line_weight_to_put_left[i] = current_local_part_weights[i * 2 + 1] - current_local_part_weights[i * 2];
          continue;
        }
        else if (current_global_part_weights[i * 2 + 1] > expected_weight_in_part){

          //if the weight is larger than the expected weight,
          //then we need to distribute some points to left, some to right.
          current_cut_line_determined[i] = true;
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp atomic
#endif
          *rectilinear_cut_count += 1;
          //increase the num cuts to be determined with rectilinear partitioning.
          //the actual split ratio is computed after the Teuchos::scan below.

#ifdef HAVE_ZOLTAN2_OMP
#pragma omp atomic
#endif
          my_num_incomplete_cut -= 1;
          new_current_cut_coordinates [i] = current_cut_coordinates[i];
          this->process_rectilinear_cut_weight[i] = current_local_part_weights[i * 2 + 1] -
              current_local_part_weights[i * 2];
          continue;
        }
      }
      //we need to move further right,so set lower bound to current line, and shift it to the closes point from right.
      current_cut_lower_bounds[i] = current_global_right_closest_points[i];
      //set the lower bound weight to the weight we have seen.
      current_cut_lower_bound_weights[i] = seen_weight_in_part;

      //compare the upper bound with what has been found in the last iteration.
      //we try to make more strict bounds for the cut here by scanning the
      //cuts to the right of this one.
      for (mj_part_t ii = i + 1; ii < num_cuts ; ++ii){
        mj_scalar_t p_weight = current_global_part_weights[ii * 2];
        mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];

        if(p_weight >= expected_weight_in_part){
          //if a cut on the right has the expected weight, then we found
          //our cut position. Set up and low coordiantes to this new cut coordinate.
          //but we need one more iteration to finalize the cut position,
          //as wee need to update the part ids.
          if(p_weight == expected_weight_in_part){
            current_cut_upper_bounds[i] = current_cut_coordinates[ii];
            current_cut_upper_weights[i] = p_weight;
            current_cut_lower_bounds[i] = current_cut_coordinates[ii];
            current_cut_lower_bound_weights[i] = p_weight;
          } else if (p_weight < current_cut_upper_weights[i]){
            //if a part weight is larger then my expected weight,
            //but lower than my upper bound weight, update upper bound.
            current_cut_upper_bounds[i] = current_global_left_closest_points[ii];
            current_cut_upper_weights[i] = p_weight;
          }
          break;
        }
        //if comes here then pw < ew
        //then compare the weight against line weight.
        if(line_weight >= expected_weight_in_part){
          //if the line is larger than the expected weight,
          //then we need to reach to the balance by distributing coordinates on this line.
          current_cut_upper_bounds[i] = current_cut_coordinates[ii];
          current_cut_upper_weights[i] = line_weight;
          current_cut_lower_bounds[i] = current_cut_coordinates[ii];
          current_cut_lower_bound_weights[i] = p_weight;
          break;
        }
        //if a stricter lower bound is found,
        //update the lower bound.
        if (p_weight <= expected_weight_in_part && p_weight >= current_cut_lower_bound_weights[i]){
          current_cut_lower_bounds[i] = current_global_right_closest_points[ii] ;
          current_cut_lower_bound_weights[i] = p_weight;
        }
      }


      //interpolate a new cut position inside the (possibly tightened) bounds.
      mj_scalar_t new_cut_position = 0;
      this->mj_calculate_new_cut_position(
          current_cut_upper_bounds[i],
          current_cut_lower_bounds[i],
          current_cut_upper_weights[i],
          current_cut_lower_bound_weights[i],
          expected_weight_in_part, new_cut_position);

      //if cut line does not move significantly.
      //then finalize the search.
      if (ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
        /*|| current_cut_lower_bounds[i] - current_cut_upper_bounds[i] > this->sEpsilon*/
        ){
        current_cut_line_determined[i] = true;
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp atomic
#endif
        my_num_incomplete_cut -= 1;

        //set the cut coordinate and proceed.
        new_current_cut_coordinates [i] = current_cut_coordinates[i];
      } else {
        new_current_cut_coordinates [i] = new_cut_position;
      }
    } else {

      //need to move the cut line to left.
      //set upper bound to current line.
      current_cut_upper_bounds[i] = current_global_left_closest_points[i];
      current_cut_upper_weights[i] = seen_weight_in_part;

      // compare the current cut line weights with previous upper and lower bounds,
      // scanning the cuts to the left of this one.
      for (int ii = i - 1; ii >= 0; --ii){
        mj_scalar_t p_weight = current_global_part_weights[ii * 2];
        mj_scalar_t line_weight = current_global_part_weights[ii * 2 + 1];
        if(p_weight <= expected_weight_in_part){
          if(p_weight == expected_weight_in_part){
            //if the weight of the part is my expected weight
            //then we find the solution.
            current_cut_upper_bounds[i] = current_cut_coordinates[ii];
            current_cut_upper_weights[i] = p_weight;
            current_cut_lower_bounds[i] = current_cut_coordinates[ii];
            current_cut_lower_bound_weights[i] = p_weight;
          }
          else if (p_weight > current_cut_lower_bound_weights[i]){
            //if found weight is bigger than the lower bound
            //then update the lower bound.
            current_cut_lower_bounds[i] = current_global_right_closest_points[ii];
            current_cut_lower_bound_weights[i] = p_weight;

            //at the same time, if weight of line is bigger than the
            //expected weight, then update the upper bound as well.
            //in this case the balance will be obtained by distributing weightss
            //on this cut position.
            if(line_weight > expected_weight_in_part){
              current_cut_upper_bounds[i] = current_global_right_closest_points[ii];
              current_cut_upper_weights[i] = line_weight;
            }
          }
          break;
        }
        //if the weight of the cut on the left is still bigger than my weight,
        //and also if the weight is smaller than the current upper weight,
        //or if the weight is equal to current upper weight, but on the left of
        // the upper weight, then update upper bound.
        if (p_weight >= expected_weight_in_part &&
            (p_weight < current_cut_upper_weights[i] ||
                (p_weight == current_cut_upper_weights[i] &&
                    current_cut_upper_bounds[i] > current_global_left_closest_points[ii]
                )
            )
          ){
          current_cut_upper_bounds[i] = current_global_left_closest_points[ii] ;
          current_cut_upper_weights[i] = p_weight;
        }
      }
      //interpolate a new cut position inside the (possibly tightened) bounds.
      mj_scalar_t new_cut_position = 0;
      this->mj_calculate_new_cut_position(
          current_cut_upper_bounds[i],
          current_cut_lower_bounds[i],
          current_cut_upper_weights[i],
          current_cut_lower_bound_weights[i],
          expected_weight_in_part,
          new_cut_position);

      //if cut line does not move significantly.
      if (ABS(current_cut_coordinates[i] - new_cut_position) < this->sEpsilon
          /*|| current_cut_lower_bounds[i] - current_cut_upper_bounds[i] > this->sEpsilon*/ ){
        current_cut_line_determined[i] = true;
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp atomic
#endif
        my_num_incomplete_cut -= 1;
        //set the cut coordinate and proceed.
        new_current_cut_coordinates [ i] = current_cut_coordinates[i];
      } else {
        new_current_cut_coordinates [ i] = new_cut_position;
      }
    }
  }

  //communication to determine the ratios of processors for the distribution
  //of coordinates on the cut lines.
#ifdef HAVE_ZOLTAN2_OMP
  //no need barrier here as it is implicit at the end of the omp for above.
#pragma omp single
#endif
  {
    if(*rectilinear_cut_count > 0){

      //inclusive prefix sum over ranks: after the scan,
      //global_rectilinear_cut_weight[i] on this rank holds the sum of the
      //on-cut weights of all ranks up to and including this one.
      try{
        Teuchos::scan<int,mj_scalar_t>(
            *comm, Teuchos::REDUCE_SUM,
            num_cuts,
            this->process_rectilinear_cut_weight,
            this->global_rectilinear_cut_weight
        );
      }
      Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))

      for (mj_part_t i = 0; i < num_cuts; ++i){
        //if cut line weight to be distributed.
        if(this->global_rectilinear_cut_weight[i] > 0) {
          //expected weight to go to left of the cut.
          mj_scalar_t expected_part_weight = current_part_target_weights[i];
          //the weight that should be put to left of the cut.
          mj_scalar_t necessary_weight_on_line_for_left = expected_part_weight - current_global_part_weights[i * 2];
          //the weight of the cut in the process
          mj_scalar_t my_weight_on_line = this->process_rectilinear_cut_weight[i];
          //the sum of the cut weights upto this process, including the weight of this process.
          mj_scalar_t weight_on_line_upto_process_inclusive = this->global_rectilinear_cut_weight[i];
          //the space on the left side of the cut after all processes before this process (including this process)
          //puts their weights on cut to left.
          mj_scalar_t space_to_put_left = necessary_weight_on_line_for_left - weight_on_line_upto_process_inclusive;
          //add my weight to this space to find out how much space is left to me.
          mj_scalar_t space_left_to_me = space_to_put_left + my_weight_on_line;

          /*
          cout << "expected_part_weight:" << expected_part_weight
              << " necessary_weight_on_line_for_left:" << necessary_weight_on_line_for_left
              << " my_weight_on_line" << my_weight_on_line
              << " weight_on_line_upto_process_inclusive:" << weight_on_line_upto_process_inclusive
              << " space_to_put_left:" << space_to_put_left
              << " space_left_to_me" << space_left_to_me << endl;
           */
          if(space_left_to_me < 0){
            //space_left_to_me is negative and i dont need to put anything to left.
            current_part_cut_line_weight_to_put_left[i] = 0;
          }
          else if(space_left_to_me >= my_weight_on_line){
            //space left to me is bigger than the weight of the processor on cut.
            //so put everything to left.
            current_part_cut_line_weight_to_put_left[i] = my_weight_on_line;
            //cout << "setting current_part_cut_line_weight_to_put_left to my_weight_on_line:" << my_weight_on_line << endl;
          }
          else {
            //put only the weight as much as the space.
            current_part_cut_line_weight_to_put_left[i] = space_left_to_me ;

            //cout << "setting current_part_cut_line_weight_to_put_left to space_left_to_me:" << space_left_to_me << endl;
          }

        }
      }
      //reset the shared counter for the next iteration.
      *rectilinear_cut_count = 0;
    }
  }
}
03673 
03683 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
03684           typename mj_part_t>
03685 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::get_processor_num_points_in_parts(
03686     mj_part_t num_procs,
03687     mj_part_t num_parts,
03688     mj_gno_t *&num_points_in_all_processor_parts){
03689 
03690   //initially allocation_size is num_parts
03691   size_t allocation_size = num_parts * (num_procs + 1);
03692 
03693   //this will be output
03694   //holds how many each processor has in each part.
03695   //last portion is the sum of all processor points in each part.
03696 
03697   //allocate memory for the local num coordinates in each part.
03698   mj_gno_t *num_local_points_in_each_part_to_reduce_sum = allocMemory<mj_gno_t>(allocation_size);
03699 
03700 
03701   //this is the portion of the memory which will be used
03702   //at the summation to obtain total number of processors' points in each part.
03703   mj_gno_t *my_local_points_to_reduce_sum = num_local_points_in_each_part_to_reduce_sum + num_procs * num_parts;
03704   //this is the portion of the memory where each stores its local number.
03705   //this information is needed by other processors.
03706   mj_gno_t *my_local_point_counts_in_each_art = num_local_points_in_each_part_to_reduce_sum + this->myRank * num_parts;
03707 
03708   //initialize the array with 0's.
03709   memset(num_local_points_in_each_part_to_reduce_sum, 0, sizeof(mj_gno_t)*allocation_size);
03710 
03711   //write the number of coordinates in each part.
03712   for (mj_part_t i = 0; i < num_parts; ++i){
03713     mj_lno_t part_begin_index = 0;
03714     if (i > 0){
03715       part_begin_index = this->new_part_xadj[i - 1];
03716     }
03717     mj_lno_t part_end_index = this->new_part_xadj[i];
03718     my_local_points_to_reduce_sum[i] = part_end_index - part_begin_index;
03719   }
03720 
03721   //copy the local num parts to the last portion of array,
03722   //so that this portion will represent the global num points in each part after the reduction.
03723   memcpy (my_local_point_counts_in_each_art,
03724       my_local_points_to_reduce_sum,
03725       sizeof(mj_gno_t) * (num_parts) );
03726 
03727 
03728   //reduceAll operation.
03729   //the portion that belongs to a processor with index p
03730   //will start from myRank * num_parts.
03731   //the global number of points will be held at the index
03732   try{
03733     reduceAll<int, mj_gno_t>(
03734         *(this->comm),
03735         Teuchos::REDUCE_SUM,
03736         allocation_size,
03737         num_local_points_in_each_part_to_reduce_sum,
03738         num_points_in_all_processor_parts);
03739   }
03740   Z2_THROW_OUTSIDE_ERROR(*(this->mj_env))
03741   freeArray<mj_gno_t>(num_local_points_in_each_part_to_reduce_sum);
03742 }
03743 
03744 
03745 
03758 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
03759           typename mj_part_t>
03760 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_check_to_migrate(
03761     size_t migration_reduce_all_population,
03762     mj_lno_t num_coords_for_last_dim_part,
03763     mj_part_t num_procs,
03764     mj_part_t num_parts,
03765     mj_gno_t *num_points_in_all_processor_parts){
03766 
03767   //if reduce all count and population in the last dim is too high
03768     if (migration_reduce_all_population > FUTURE_REDUCEALL_CUTOFF) return true;
03769     //if the work in a part per processor in the last dim is too low.
03770     if (num_coords_for_last_dim_part < MIN_WORK_LAST_DIM) return true;
03771 
03772   //if migration is to be checked and the imbalance is too high
03773     if (this->check_migrate_avoid_migration_option == 0){
03774       double global_imbalance = 0;
03775       //global shift to reach the sum of coordiante count in each part.
03776       size_t global_shift = num_procs * num_parts;
03777 
03778       for (mj_part_t ii = 0; ii < num_procs; ++ii){
03779         for (mj_part_t i = 0; i < num_parts; ++i){
03780           double ideal_num = num_points_in_all_processor_parts[global_shift + i]
03781                     / double(num_procs);
03782 
03783           global_imbalance += ABS(ideal_num -
03784               num_points_in_all_processor_parts[ii * num_parts + i]) /  (ideal_num);
03785         }
03786       }
03787       global_imbalance /= num_parts;
03788       global_imbalance /= num_procs;
03789 
03790     /*
03791       if (this->myRank == 0) {
03792         cout << "imbalance for next iteration:" << global_imbalance << endl;
03793       }
03794       */
03795 
03796       if(global_imbalance <= this->minimum_migration_imbalance){
03797         return false;
03798       }
03799       else {
03800         return true;
03801       }
03802     }
03803     else {
03804       //if migration is forced
03805         return true;
03806     }
03807 }
03808 
03809 
03819 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
03820           typename mj_part_t>
03821 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations(
03822     mj_part_t num_parts,
03823     mj_part_t *part_assignment_proc_begin_indices,
03824     mj_part_t *processor_chains_in_parts,
03825     mj_lno_t *send_count_to_each_proc,
03826     int *coordinate_destinations){
03827 
03828     for (mj_part_t p = 0; p < num_parts; ++p){
03829         mj_lno_t part_begin = 0;
03830         if (p > 0) part_begin = this->new_part_xadj[p - 1];
03831         mj_lno_t part_end = this->new_part_xadj[p];
03832 
03833         //get the first part that current processor will send its part-p.
03834         mj_part_t proc_to_sent = part_assignment_proc_begin_indices[p];
03835         //initialize how many point I sent to this processor.
03836         mj_lno_t num_total_send = 0;
03837         for (mj_lno_t j=part_begin; j < part_end; j++){
03838             mj_lno_t local_ind = this->new_coordinate_permutations[j];
03839             while (num_total_send >= send_count_to_each_proc[proc_to_sent]){
03840               //then get the next processor to send the points in part p.
03841                 num_total_send = 0;
03842                 //assign new processor to part_assign_begin[p]
03843                 part_assignment_proc_begin_indices[p] = processor_chains_in_parts[proc_to_sent];
03844                 //remove the previous processor
03845                 processor_chains_in_parts[proc_to_sent] = -1;
03846                 //choose the next processor as the next one to send.
03847                 proc_to_sent = part_assignment_proc_begin_indices[p];
03848             }
03849             //write the gno index to corresponding position in sendBuf.
03850             coordinate_destinations[local_ind] = proc_to_sent;
03851             ++num_total_send;
03852         }
03853     }
03854 }
03855 
03870 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
03871           typename mj_part_t>
03872 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_proc_to_parts(
03873     mj_gno_t * num_points_in_all_processor_parts,
03874     mj_part_t num_parts,
03875     mj_part_t num_procs,
03876     mj_lno_t *send_count_to_each_proc,
03877     std::vector<mj_part_t> &processor_ranks_for_subcomm,
03878     std::vector<mj_part_t> *next_future_num_parts_in_parts,
03879     mj_part_t &out_part_index,
03880     mj_part_t &output_part_numbering_begin_index,
03881     int *coordinate_destinations){
03882 
03883 
03884     mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
03885     mj_part_t *num_procs_assigned_to_each_part = allocMemory<mj_part_t>(num_parts);
03886 
03887     //boolean variable if the process finds its part to be assigned.
03888     bool did_i_find_my_group = false;
03889 
03890     mj_part_t num_free_procs = num_procs;
03891     mj_part_t minimum_num_procs_required_for_rest_of_parts = num_parts - 1;
03892 
03893     double max_imbalance_difference = 0;
03894     mj_part_t max_differing_part = 0;
03895 
03896     //find how many processor each part requires.
03897     for (mj_part_t i=0; i < num_parts; i++){
03898 
03899         //scalar portion of the required processors
03900         double scalar_required_proc = num_procs *
03901                 (double (global_num_points_in_parts[i]) / double (this->num_global_coords));
03902 
03903         //round it to closest integer.
03904         mj_part_t required_proc = static_cast<mj_part_t> (0.5 + scalar_required_proc);
03905 
03906         //if assigning the required num procs, creates problems for the rest of the parts.
03907         //then only assign {num_free_procs - (minimum_num_procs_required_for_rest_of_parts)} procs to this part.
03908         if (num_free_procs - required_proc < minimum_num_procs_required_for_rest_of_parts){
03909             required_proc = num_free_procs - (minimum_num_procs_required_for_rest_of_parts);
03910         }
03911 
03912         //reduce the free processor count
03913         num_free_procs -= required_proc;
03914         //reduce the free minimum processor count required for the rest of the part by 1.
03915         --minimum_num_procs_required_for_rest_of_parts;
03916 
03917         //part (i) is assigned to (required_proc) processors.
03918         num_procs_assigned_to_each_part[i] = required_proc;
03919 
03920         //because of the roundings some processors might be left as unassigned.
03921         //we want to assign those processors to the part with most imbalance.
03922         //find the part with the maximum imbalance here.
03923         double imbalance_wrt_ideal = (scalar_required_proc - required_proc) /  required_proc;
03924         if (imbalance_wrt_ideal > max_imbalance_difference){
03925             max_imbalance_difference = imbalance_wrt_ideal;
03926             max_differing_part = i;
03927         }
03928     }
03929 
03930     //assign extra processors to the part with maximum imbalance than the ideal.
03931     if (num_free_procs > 0){
03932         num_procs_assigned_to_each_part[max_differing_part] +=  num_free_procs;
03933     }
03934 
03935     //now find what are the best processors with least migration for each part.
03936 
03937     //part_assignment_proc_begin_indices ([i]) is the array that holds the beginning
03938     //index of a processor that processor sends its data for part - i
03939     mj_part_t *part_assignment_proc_begin_indices = allocMemory<mj_part_t>(num_parts);
03940     //the next processor send is found in processor_chains_in_parts, in linked list manner.
03941     mj_part_t *processor_chains_in_parts = allocMemory<mj_part_t>(num_procs);
03942     mj_part_t *processor_part_assignments = allocMemory<mj_part_t>(num_procs);
03943 
03944     //initialize the assignment of each processor.
03945     //this has a linked list implementation.
03946     //the beginning of processors assigned
03947     //to each part is hold at  part_assignment_proc_begin_indices[part].
03948     //then the next processor assigned to that part is located at
03949     //proc_part_assignments[part_assign_begins[part]], this is a chain
03950     //until the value of -1 is reached.
03951     for (int i = 0; i < num_procs; ++i ){
03952         processor_part_assignments[i] = -1;
03953         processor_chains_in_parts[i] = -1;
03954     }
03955     for (int i = 0; i < num_parts; ++i ){
03956         part_assignment_proc_begin_indices[i] = -1;
03957     }
03958 
03959 
03960     //Allocate memory for sorting data structure.
03961     uSortItem<mj_part_t, mj_gno_t> * sort_item_num_part_points_in_procs = allocMemory <uSortItem<mj_part_t, mj_gno_t> > (num_procs);
03962     for(mj_part_t i = 0; i < num_parts; ++i){
03963         //the algorithm tries to minimize the cost of migration,
03964       //by assigning the processors with highest number of coordinates on that part.
03965       //here we might want to implement a maximum weighted bipartite matching algorithm.
03966       for(mj_part_t ii = 0; ii < num_procs; ++ii){
03967         sort_item_num_part_points_in_procs[ii].id = ii;
03968         //if processor is not assigned yet.
03969         //add its num points to the sort data structure.
03970         if (processor_part_assignments[ii] == -1){
03971           sort_item_num_part_points_in_procs[ii].val =
03972               num_points_in_all_processor_parts[ii * num_parts + i];
03973         }
03974         else {
03975           //if processor is already assigned, insert -nLocal - 1 so that it won't be selected again.
03976           //would be same if we simply set it to -1,
03977           //but more information with no extra cost (which is used later) is provided.
03978           sort_item_num_part_points_in_procs[ii].val = -num_points_in_all_processor_parts[ii * num_parts + i] - 1;
03979         }
03980       }
03981         //sort the processors in the part.
03982         uqsort<mj_part_t, mj_gno_t>(num_procs, sort_item_num_part_points_in_procs);
03983 
03984         mj_part_t required_proc_count =  num_procs_assigned_to_each_part[i];
03985         mj_gno_t total_num_points_in_part = global_num_points_in_parts[i];
03986         mj_gno_t ideal_num_points_in_a_proc =
03987                 Teuchos::as<mj_gno_t>(ceil (total_num_points_in_part / double (required_proc_count)));
03988 
03989         //starts sending to least heaviest part.
03990         mj_part_t next_proc_to_send_index = num_procs - required_proc_count;
03991         mj_part_t next_proc_to_send_id = sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
03992         mj_lno_t space_left_in_sent_proc =  ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
03993 
03994         //find the processors that will be assigned to this part, which are the heaviest
03995         //non assigned processors.
03996         for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
03997             mj_part_t proc_id = sort_item_num_part_points_in_procs[ii].id;
03998             //assign processor to part - i.
03999             processor_part_assignments[proc_id] = i;
04000         }
04001 
04002         bool did_change_sign = false;
04003         //if processor has a minus count, reverse it.
04004         for(mj_part_t ii = 0; ii < num_procs; ++ii){
04005             // TODO:  THE LINE BELOW PRODUCES A WARNING IF gno_t IS UNSIGNED
04006             // TODO:  SEE BUG 6194
04007             if (sort_item_num_part_points_in_procs[ii].val < 0){
04008                 did_change_sign = true;
04009                 sort_item_num_part_points_in_procs[ii].val = -sort_item_num_part_points_in_procs[ii].val - 1;
04010             }
04011             else {
04012                 break;
04013             }
04014         }
04015         if(did_change_sign){
04016             //resort the processors in the part for the rest of the processors that is not assigned.
04017             uqsort<mj_part_t, mj_gno_t>(num_procs - required_proc_count, sort_item_num_part_points_in_procs);
04018         }
04019 
04020         //check if this processors is one of the procs assigned to this part.
04021         //if it is, then get the group.
04022         if (!did_i_find_my_group){
04023             for(mj_part_t ii = num_procs - 1; ii >= num_procs - required_proc_count; --ii){
04024 
04025               mj_part_t proc_id_to_assign = sort_item_num_part_points_in_procs[ii].id;
04026               //add the proc to the group.
04027                 processor_ranks_for_subcomm.push_back(proc_id_to_assign);
04028 
04029                 if(proc_id_to_assign == this->myRank){
04030                   //if the assigned process is me, then I find my group.
04031                     did_i_find_my_group = true;
04032                     //set the beginning of part i to my rank.
04033                     part_assignment_proc_begin_indices[i] = this->myRank;
04034                     processor_chains_in_parts[this->myRank] = -1;
04035 
04036                     //set send count to myself to the number of points that I have in part i.
04037                     send_count_to_each_proc[this->myRank] = sort_item_num_part_points_in_procs[ii].val;
04038 
04039                     //calculate the shift required for the output_part_numbering_begin_index
04040                     for (mj_part_t in = 0; in < i; ++in){
04041                         output_part_numbering_begin_index += (*next_future_num_parts_in_parts)[in];
04042                     }
04043                     out_part_index = i;
04044                 }
04045             }
04046             //if these was not my group,
04047             //clear the subcomminicator processor array.
04048             if (!did_i_find_my_group){
04049                 processor_ranks_for_subcomm.clear();
04050             }
04051         }
04052 
04053         //send points of the nonassigned coordinates to the assigned coordinates.
04054         //starts from the heaviest nonassigned processor.
04055         //TODO we might want to play with this part, that allows more computational imbalance
04056         //but having better communication balance.
04057         for(mj_part_t ii = num_procs - required_proc_count - 1; ii >= 0; --ii){
04058             mj_part_t nonassigned_proc_id = sort_item_num_part_points_in_procs[ii].id;
04059             mj_lno_t num_points_to_sent = sort_item_num_part_points_in_procs[ii].val;
04060 
04061             //we set number of points to -to_sent - 1 for the assigned processors.
04062             //we reverse it here. This should not happen, as we have already reversed them above.
04063 #ifdef MJ_DEBUG
04064             if (num_points_to_sent < 0) {
04065               cout << "Migration - processor assignments - for part:" << i << "from proc:" << nonassigned_proc_id << " num_points_to_sent:" << num_points_to_sent << std::endl;
04066               exit(1);
04067             }
04068 #endif
04069 
04070             //now sends the points to the assigned processors.
04071             while (num_points_to_sent > 0){
04072                 //if the processor has enough space.
04073                 if (num_points_to_sent <= space_left_in_sent_proc){
04074                   //reduce the space left in the processor.
04075                   space_left_in_sent_proc -= num_points_to_sent;
04076                   //if my rank is the one that is sending the coordinates.
04077                     if (this->myRank == nonassigned_proc_id){
04078                         //set my sent count to the sent processor.
04079                         send_count_to_each_proc[next_proc_to_send_id] = num_points_to_sent;
04080                         //save the processor in the list (processor_chains_in_parts and part_assignment_proc_begin_indices)
04081                         //that the processor will send its point in part-i.
04082                         mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
04083                         part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
04084                         processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
04085                     }
04086                     num_points_to_sent = 0;
04087                 }
04088                 else {
04089                     //there might be no space left in the processor.
04090                     if(space_left_in_sent_proc > 0){
04091                         num_points_to_sent -= space_left_in_sent_proc;
04092 
04093                         //send as the space left in the processor.
04094                         if (this->myRank == nonassigned_proc_id){
04095                           //send as much as the space in this case.
04096                             send_count_to_each_proc[next_proc_to_send_id] = space_left_in_sent_proc;
04097                             mj_part_t prev_begin = part_assignment_proc_begin_indices[i];
04098                             part_assignment_proc_begin_indices[i] = next_proc_to_send_id;
04099                             processor_chains_in_parts[next_proc_to_send_id] = prev_begin;
04100 
04101                         }
04102                     }
04103                     //change the sent part
04104                     ++next_proc_to_send_index;
04105 
04106 #ifdef MJ_DEBUG
04107                     if(next_part_to_send_index <  nprocs - required_proc_count ){
04108                       cout << "Migration - processor assignments - for part:"
04109                           << i
04110                           <<  " next_part_to_send :" << next_part_to_send_index
04111                           << " nprocs:" << nprocs
04112                           << " required_proc_count:" << required_proc_count
04113                           << " Error: next_part_to_send_index <  nprocs - required_proc_count" << std::endl;
04114                       exit(1)l
04115 
04116                     }
04117 #endif
04118                     //send the new id.
04119                     next_proc_to_send_id =  sort_item_num_part_points_in_procs[next_proc_to_send_index].id;
04120                     //set the new space in the processor.
04121                     space_left_in_sent_proc = ideal_num_points_in_a_proc - sort_item_num_part_points_in_procs[next_proc_to_send_index].val;
04122                 }
04123             }
04124         }
04125     }
04126 
04127 
04128 
04129     this->assign_send_destinations(
04130         num_parts,
04131             part_assignment_proc_begin_indices,
04132             processor_chains_in_parts,
04133             send_count_to_each_proc,
04134             coordinate_destinations);
04135 
04136     freeArray<mj_part_t>(part_assignment_proc_begin_indices);
04137     freeArray<mj_part_t>(processor_chains_in_parts);
04138     freeArray<mj_part_t>(processor_part_assignments);
04139     freeArray<uSortItem<mj_part_t, mj_gno_t> > (sort_item_num_part_points_in_procs);
04140     freeArray<mj_part_t > (num_procs_assigned_to_each_part);
04141 
04142 }
04143 
04144 
04157 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
04158           typename mj_part_t>
04159 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::assign_send_destinations2(
04160     mj_part_t num_parts,
04161     uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment, //input sorted wrt processors
04162     int *coordinate_destinations,
04163     mj_part_t &output_part_numbering_begin_index,
04164     std::vector<mj_part_t> *next_future_num_parts_in_parts){
04165 
04166     mj_part_t part_shift_amount = output_part_numbering_begin_index;
04167     mj_part_t previous_processor = -1;
04168     for(mj_part_t i = 0; i < num_parts; ++i){
04169         mj_part_t p = sort_item_part_to_proc_assignment[i].id;
04170         //assigned processors are sorted.
04171         mj_lno_t part_begin_index = 0;
04172         if (p > 0) part_begin_index = this->new_part_xadj[p - 1];
04173         mj_lno_t part_end_index = this->new_part_xadj[p];
04174 
04175         mj_part_t assigned_proc = sort_item_part_to_proc_assignment[i].val;
04176         if (this->myRank == assigned_proc && previous_processor != assigned_proc){
04177             output_part_numbering_begin_index =  part_shift_amount;
04178         }
04179         previous_processor = assigned_proc;
04180         part_shift_amount += (*next_future_num_parts_in_parts)[p];
04181 
04182         for (mj_lno_t j=part_begin_index; j < part_end_index; j++){
04183             mj_lno_t localInd = this->new_coordinate_permutations[j];
04184             coordinate_destinations[localInd] = assigned_proc;
04185         }
04186     }
04187 }
04188 
04189 
/*! \brief Called when there are at least as many parts as processors: greedily
 *  assigns each part to a single processor (a processor may receive several
 *  parts), balancing the per-processor coordinate load and forcing every
 *  processor to receive at least one part. Fills the per-processor send
 *  counts and the destination rank of every local coordinate.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_assign_parts_to_procs(
    mj_gno_t * num_points_in_all_processor_parts,
    mj_part_t num_parts,
    mj_part_t num_procs,
    mj_lno_t *send_count_to_each_proc, //output: sized nprocs, show the number of send point counts to each proc.
    std::vector<mj_part_t> *next_future_num_parts_in_parts,//input how many more partitions the part will be partitioned into.
    mj_part_t &out_num_part, //output: how many parts this processor is assigned (can be more than one here).
    std::vector<mj_part_t> &out_part_indices, //output: the part indices which the processor is assigned to.
    mj_part_t &output_part_numbering_begin_index, //output: how much the part number should be shifted when setting the solution
    int *coordinate_destinations){
    out_num_part = 0;

    //global per-part totals are stored after the per-processor counts.
    mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * num_parts;
    out_part_indices.clear();

    //to sort the parts that is assigned to the processors.
    //id is the part number, sort value is the assigned processor id.
    uSortItem<mj_part_t, mj_part_t> * sort_item_part_to_proc_assignment  = allocMemory <uSortItem<mj_part_t, mj_part_t> >(num_parts);
    uSortItem<mj_part_t, mj_gno_t> * sort_item_num_points_of_proc_in_part_i = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_procs);


    //calculate the optimal number of coordinates that should be assigned to each processor.
    mj_lno_t work_each = mj_lno_t (this->num_global_coords / (double (num_procs)) + 0.5f);
    //to hold the left space as the number of coordinates to the optimal number in each proc.
    mj_lno_t *space_in_each_processor = allocMemory <mj_lno_t>(num_procs);
    //initialize left space in each.
    for (mj_part_t i = 0; i < num_procs; ++i){
        space_in_each_processor[i] = work_each;
    }

    //we keep track of how many parts each processor is assigned to.
    //because in some weird inputs, it might be possible that some
    //processors is not assigned to any part. Using these variables,
    //we force each processor to have at least one part.
    mj_part_t *num_parts_proc_assigned = allocMemory <mj_part_t>(num_procs);
    memset(num_parts_proc_assigned, 0, sizeof(mj_part_t) * num_procs);
    int empty_proc_count = num_procs;

    //to sort the parts with decreasing order of their coordiantes.
    //id are the part numbers, sort value is the number of points in each.
    uSortItem<mj_part_t, mj_gno_t> * sort_item_point_counts_in_parts  = allocMemory <uSortItem<mj_part_t, mj_gno_t> >(num_parts);

    //initially we will sort the parts according to the number of coordinates they have.
    //so that we will start assigning with the part that has the most number of coordinates.
    for (mj_part_t i = 0; i < num_parts; ++i){
        sort_item_point_counts_in_parts[i].id = i;
        sort_item_point_counts_in_parts[i].val = global_num_points_in_parts[i];
    }
    //sort parts with increasing order of loads.
    uqsort<mj_part_t, mj_gno_t>(num_parts, sort_item_point_counts_in_parts);


    //assigning parts to the processors
    //traverse the part win decreasing order of load.
    //first assign the heaviest part.
    for (mj_part_t j = 0; j < num_parts; ++j){
        //sorted with increasing order, traverse inverse.
        mj_part_t i = sort_item_point_counts_in_parts[num_parts - 1 - j].id;
        //load of the part
        mj_gno_t load = global_num_points_in_parts[i];

        //assigned processors
        mj_part_t assigned_proc = -1;
        //if not fit best processor.
        mj_part_t best_proc_to_assign = 0;


        //sort processors with increasing number of points in this part.
        for (mj_part_t ii = 0; ii < num_procs; ++ii){
            sort_item_num_points_of_proc_in_part_i[ii].id = ii;

            //if there are still enough parts to fill empty processors, than proceed normally.
            //but if empty processor count is equal to the number of part, then
            //we force to part assignments only to empty processors.
            if (empty_proc_count < num_parts - j || num_parts_proc_assigned[ii] == 0){
              //how many points processor ii has in part i?
              sort_item_num_points_of_proc_in_part_i[ii].val =  num_points_in_all_processor_parts[ii * num_parts + i];
            }
            else {
              //already-assigned processor is excluded: with -1 it sorts below
              //every real count and will never be picked first.
              sort_item_num_points_of_proc_in_part_i[ii].val  = -1;
            }
        }
        uqsort<mj_part_t, mj_gno_t>(num_procs, sort_item_num_points_of_proc_in_part_i);

        //traverse all processors with decreasing load.
        for (mj_part_t iii = num_procs - 1; iii >= 0; --iii){
            mj_part_t ii = sort_item_num_points_of_proc_in_part_i[iii].id;
            mj_lno_t left_space = space_in_each_processor[ii] - load;
            //if enought space, assign to this part.
            if(left_space >= 0 ){
                assigned_proc = ii;
                break;
            }
            //if space is not enough, store the best candidate part.
            if (space_in_each_processor[best_proc_to_assign] < space_in_each_processor[ii]){
                best_proc_to_assign = ii;
            }
        }

        //if none had enough space, then assign it to best part.
        if (assigned_proc == -1){
            assigned_proc = best_proc_to_assign;
        }

        //first part given to this processor: one fewer empty processor remains.
        if (num_parts_proc_assigned[assigned_proc]++ == 0){
          --empty_proc_count;
        }
        space_in_each_processor[assigned_proc] -= load;
        //to sort later, part-i is assigned to the proccessor - assignment.
        sort_item_part_to_proc_assignment[j].id = i; //part i
        sort_item_part_to_proc_assignment[j].val = assigned_proc; //assigned to processor - assignment.


        //if assigned processor is me, increase the number.
        if (assigned_proc == this->myRank){
            out_num_part++;//assigned_part_count;
            out_part_indices.push_back(i);
        }
        //increase the send to that processor by the number of points in that part.
        //as everyone send their coordiantes in this part to the processor assigned to this part.
        send_count_to_each_proc[assigned_proc] += num_points_in_all_processor_parts[this->myRank * num_parts + i];
    }
    freeArray<mj_part_t>(num_parts_proc_assigned);
    freeArray< uSortItem<mj_part_t, mj_gno_t> > (sort_item_num_points_of_proc_in_part_i);
    freeArray<uSortItem<mj_part_t, mj_gno_t> >(sort_item_point_counts_in_parts);
    freeArray<mj_lno_t >(space_in_each_processor);


    //sort assignments with respect to the assigned processors.
    uqsort<mj_part_t, mj_part_t>(num_parts, sort_item_part_to_proc_assignment);
    //fill sendBuf.


    this->assign_send_destinations2(
            num_parts,
            sort_item_part_to_proc_assignment,
            coordinate_destinations,
            output_part_numbering_begin_index,
            next_future_num_parts_in_parts);

    freeArray<uSortItem<mj_part_t, mj_part_t> >(sort_item_part_to_proc_assignment);
}
04350 
04351 
04369 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
04370           typename mj_part_t>
04371 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migration_part_proc_assignment(
04372     mj_gno_t * num_points_in_all_processor_parts,
04373     mj_part_t num_parts,
04374     mj_part_t num_procs,
04375     mj_lno_t *send_count_to_each_proc,
04376     std::vector<mj_part_t> &processor_ranks_for_subcomm,
04377     std::vector<mj_part_t> *next_future_num_parts_in_parts,
04378     mj_part_t &out_num_part,
04379     std::vector<mj_part_t> &out_part_indices,
04380     mj_part_t &output_part_numbering_begin_index,
04381     int *coordinate_destinations){
04382 
04383 
04384 
04385   processor_ranks_for_subcomm.clear();
04386   // if (this->num_local_coords > 0)
04387   if (num_procs > num_parts){
04388     //if there are more processors than the number of current part
04389     //then processors share the existing parts.
04390     //at the end each processor will have a single part,
04391     //but a part will be shared by a group of processors.
04392     mj_part_t out_part_index = 0;
04393     this->mj_assign_proc_to_parts(
04394         num_points_in_all_processor_parts,
04395         num_parts,
04396         num_procs,
04397         send_count_to_each_proc,
04398         processor_ranks_for_subcomm,
04399         next_future_num_parts_in_parts,
04400         out_part_index,
04401         output_part_numbering_begin_index,
04402         coordinate_destinations
04403     );
04404 
04405     out_num_part = 1;
04406     out_part_indices.clear();
04407     out_part_indices.push_back(out_part_index);
04408   }
04409   else {
04410 
04411     //there are more parts than the processors.
04412     //therefore a processor will be assigned multiple parts,
04413     //the subcommunicators will only have a single processor.
04414     processor_ranks_for_subcomm.push_back(this->myRank);
04415 
04416     //since there are more parts then procs,
04417     //assign multiple parts to processors.
04418     this->mj_assign_parts_to_procs(
04419         num_points_in_all_processor_parts,
04420         num_parts,
04421         num_procs,
04422         send_count_to_each_proc,
04423         next_future_num_parts_in_parts,
04424         out_num_part,
04425         out_part_indices,
04426         output_part_numbering_begin_index,
04427         coordinate_destinations);
04428   }
04429 }
04430 
04443 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
04444           typename mj_part_t>
04445 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_migrate_coords(
04446     mj_part_t num_procs,
04447     mj_lno_t &num_new_local_points,
04448     std::string iteration,
04449     int *coordinate_destinations,
04450     mj_part_t num_parts)
04451 {
04452 #ifdef ENABLE_ZOLTAN_MIGRATION
04453     if (sizeof(mj_lno_t) <= sizeof(int)) {
04454 
04455         // Cannot use Zoltan_Comm with local ordinals larger than ints.
04456         // In Zoltan_Comm_Create, the cast int(this->num_local_coords) 
04457         // may overflow.
04458 
04459   ZOLTAN_COMM_OBJ *plan = NULL;
04460   MPI_Comm mpi_comm = Teuchos2MPI (this->comm);
04461   int num_incoming_gnos = 0;
04462   int message_tag = 7859;
04463 
04464   this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration Z1PlanCreating-" + iteration);
04465   int ierr = Zoltan_Comm_Create(
04466       &plan,
04467       int(this->num_local_coords),
04468       coordinate_destinations,
04469       mpi_comm,
04470       message_tag,
04471       &num_incoming_gnos);
04472   Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
04473   this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration Z1PlanCreating-" + iteration);
04474 
04475   this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration Z1Migration-" + iteration);
04476   mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(num_incoming_gnos);
04477 
04478   //migrate gnos.
04479   message_tag++;
04480   ierr = Zoltan_Comm_Do(
04481       plan,
04482       message_tag,
04483       (char *) this->current_mj_gnos,
04484       sizeof(mj_gno_t),
04485       (char *) incoming_gnos);
04486   Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
04487 
04488   freeArray<mj_gno_t>(this->current_mj_gnos);
04489   this->current_mj_gnos = incoming_gnos;
04490 
04491 
04492   //migrate coordinates
04493   for (int i = 0; i < this->coord_dim; ++i){
04494     message_tag++;
04495     mj_scalar_t *coord = this->mj_coordinates[i];
04496 
04497     this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
04498     ierr = Zoltan_Comm_Do(
04499         plan,
04500         message_tag,
04501         (char *) coord,
04502         sizeof(mj_scalar_t),
04503         (char *) this->mj_coordinates[i]);
04504     Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
04505     freeArray<mj_scalar_t>(coord);
04506   }
04507 
04508   //migrate weights.
04509   for (int i = 0; i < this->num_weights_per_coord; ++i){
04510     message_tag++;
04511     mj_scalar_t *weight = this->mj_weights[i];
04512 
04513     this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
04514     ierr = Zoltan_Comm_Do(
04515         plan,
04516         message_tag,
04517         (char *) weight,
04518         sizeof(mj_scalar_t),
04519         (char *) this->mj_weights[i]);
04520     Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
04521     freeArray<mj_scalar_t>(weight);
04522   }
04523 
04524 
04525   //migrate owners.
04526   int *coord_own = allocMemory<int>(num_incoming_gnos);
04527   message_tag++;
04528   ierr = Zoltan_Comm_Do(
04529       plan,
04530       message_tag,
04531       (char *) this->owner_of_coordinate,
04532       sizeof(int), (char *) coord_own);
04533   Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
04534   freeArray<int>(this->owner_of_coordinate);
04535   this->owner_of_coordinate = coord_own;
04536 
04537 
04538   //if num procs is less than num parts,
04539   //we need the part assigment arrays as well, since
04540   //there will be multiple parts in processor.
04541   mj_part_t *new_parts = allocMemory<mj_part_t>(num_incoming_gnos);
04542   if(num_procs < num_parts){
04543     message_tag++;
04544     ierr = Zoltan_Comm_Do(
04545         plan,
04546         message_tag,
04547         (char *) this->assigned_part_ids,
04548         sizeof(mj_part_t),
04549         (char *) new_parts);
04550     Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
04551   }
04552   freeArray<mj_part_t>(this->assigned_part_ids);
04553   this->assigned_part_ids = new_parts;
04554 
04555   ierr = Zoltan_Comm_Destroy(&plan);
04556   Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
04557   num_new_local_points = num_incoming_gnos;
04558   this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration Z1Migration-" + iteration);
04559     }
04560 
04561     else
04562 
04563 #endif  // ENABLE_ZOLTAN_MIGRATION
04564     {
04565   this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration DistributorPlanCreating-" + iteration);
04566   Tpetra::Distributor distributor(this->comm);
04567   ArrayView<const mj_part_t> destinations( coordinate_destinations, this->num_local_coords);
04568   mj_lno_t num_incoming_gnos = distributor.createFromSends(destinations);
04569   this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration DistributorPlanCreating-" + iteration);
04570 
04571   this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Migration DistributorMigration-" + iteration);
04572   {
04573     //migrate gnos.
04574     ArrayRCP<mj_gno_t> received_gnos(num_incoming_gnos);
04575     ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
04576     distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
04577     freeArray<mj_gno_t>(this->current_mj_gnos);
04578     this->current_mj_gnos = allocMemory<mj_gno_t>(num_incoming_gnos);
04579     memcpy(
04580         this->current_mj_gnos,
04581         received_gnos.getRawPtr(),
04582         num_incoming_gnos * sizeof(mj_gno_t));
04583   }
04584   //migrate coordinates
04585   for (int i = 0; i < this->coord_dim; ++i){
04586 
04587     ArrayView<mj_scalar_t> sent_coord(this->mj_coordinates[i], this->num_local_coords);
04588     ArrayRCP<mj_scalar_t> received_coord(num_incoming_gnos);
04589     distributor.doPostsAndWaits<mj_scalar_t>(sent_coord, 1, received_coord());
04590     freeArray<mj_scalar_t>(this->mj_coordinates[i]);
04591     this->mj_coordinates[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
04592     memcpy(
04593         this->mj_coordinates[i],
04594         received_coord.getRawPtr(),
04595         num_incoming_gnos * sizeof(mj_scalar_t));
04596   }
04597 
04598   //migrate weights.
04599   for (int i = 0; i < this->num_weights_per_coord; ++i){
04600 
04601     ArrayView<mj_scalar_t> sent_weight(this->mj_weights[i], this->num_local_coords);
04602     ArrayRCP<mj_scalar_t> received_weight(num_incoming_gnos);
04603     distributor.doPostsAndWaits<mj_scalar_t>(sent_weight, 1, received_weight());
04604     freeArray<mj_scalar_t>(this->mj_weights[i]);
04605     this->mj_weights[i] = allocMemory<mj_scalar_t>(num_incoming_gnos);
04606     memcpy(
04607         this->mj_weights[i],
04608         received_weight.getRawPtr(),
04609         num_incoming_gnos * sizeof(mj_scalar_t));
04610   }
04611 
04612   {
04613     //migrate the owners of the coordinates
04614     ArrayView<int> sent_owners(this->owner_of_coordinate, this->num_local_coords);
04615     ArrayRCP<int> received_owners(num_incoming_gnos);
04616     distributor.doPostsAndWaits<int>(sent_owners, 1, received_owners());
04617     freeArray<int>(this->owner_of_coordinate);
04618     this->owner_of_coordinate = allocMemory<int>(num_incoming_gnos);
04619     memcpy(
04620             this->owner_of_coordinate,
04621             received_owners.getRawPtr(),
04622             num_incoming_gnos * sizeof(int));
04623   }
04624 
04625   //if num procs is less than num parts,
04626   //we need the part assigment arrays as well, since
04627   //there will be multiple parts in processor.
04628   if(num_procs < num_parts){
04629     ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
04630     ArrayRCP<mj_part_t> received_partids(num_incoming_gnos);
04631     distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
04632     freeArray<mj_part_t>(this->assigned_part_ids);
04633     this->assigned_part_ids = allocMemory<mj_part_t>(num_incoming_gnos);
04634     memcpy(
04635         this->assigned_part_ids,
04636         received_partids.getRawPtr(),
04637         num_incoming_gnos * sizeof(mj_part_t));
04638   }
04639   else {
04640     mj_part_t *new_parts = allocMemory<int>(num_incoming_gnos);
04641     freeArray<mj_part_t>(this->assigned_part_ids);
04642     this->assigned_part_ids = new_parts;
04643   }
04644   this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Migration DistributorMigration-" + iteration);
04645   num_new_local_points = num_incoming_gnos;
04646 
04647     }
04648 }
04649 
04656 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
04657           typename mj_part_t>
04658 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_sub_communicator(std::vector<mj_part_t> &processor_ranks_for_subcomm){
04659     mj_part_t group_size = processor_ranks_for_subcomm.size();
04660     mj_part_t *ids = allocMemory<mj_part_t>(group_size);
04661     for(mj_part_t i = 0; i < group_size; ++i) {
04662         ids[i] = processor_ranks_for_subcomm[i];
04663     }
04664     ArrayView<const mj_part_t> idView(ids, group_size);
04665     this->comm = this->comm->createSubcommunicator(idView);
04666     freeArray<mj_part_t>(ids);
04667 }
04668 
04669 
04675 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
04676           typename mj_part_t>
04677 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::fill_permutation_array(
04678     mj_part_t output_num_parts,
04679     mj_part_t num_parts){
04680   //if there is single output part, then simply fill the permutation array.
04681     if (output_num_parts == 1){
04682         for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
04683             this->new_coordinate_permutations[i] = i;
04684         }
04685         this->new_part_xadj[0] = this->num_local_coords;
04686     }
04687     else {
04688 
04689       //otherwise we need to count how many points are there in each part.
04690       //we allocate here as num_parts, because the sent partids are up to num_parts,
04691       //although there are outout_num_parts different part.
04692         mj_lno_t *num_points_in_parts = allocMemory<mj_lno_t>(num_parts);
04693         //part shift holds the which part number an old part number corresponds to.
04694         mj_part_t *part_shifts = allocMemory<mj_part_t>(num_parts);
04695 
04696         memset(num_points_in_parts, 0, sizeof(mj_lno_t) * num_parts);
04697 
04698         for(mj_lno_t i = 0; i < this->num_local_coords; ++i){
04699             mj_part_t ii = this->assigned_part_ids[i];
04700             ++num_points_in_parts[ii];
04701         }
04702 
04703         //write the end points of the parts.
04704         mj_part_t p = 0;
04705         mj_lno_t prev_index = 0;
04706         for(mj_part_t i = 0; i < num_parts; ++i){
04707             if(num_points_in_parts[i] > 0)  {
04708                 this->new_part_xadj[p] =  prev_index + num_points_in_parts[i];
04709                 prev_index += num_points_in_parts[i];
04710                 part_shifts[i] = p++;
04711             }
04712         }
04713 
04714         //for the rest of the parts write the end index as end point.
04715         mj_part_t assigned_num_parts = p - 1;
04716         for (;p < num_parts; ++p){
04717             this->new_part_xadj[p] =  this->new_part_xadj[assigned_num_parts];
04718         }
04719         for(mj_part_t i = 0; i < output_num_parts; ++i){
04720             num_points_in_parts[i] = this->new_part_xadj[i];
04721         }
04722 
04723         //write the permutation array here.
04724         //get the part of the coordinate i, shift it to obtain the new part number.
04725         //assign it to the end of the new part numbers pointer.
04726         for(mj_lno_t i = this->num_local_coords - 1; i >= 0; --i){
04727             mj_part_t part = part_shifts[mj_part_t(this->assigned_part_ids[i])];
04728             this->new_coordinate_permutations[--num_points_in_parts[part]] = i;
04729         }
04730 
04731         freeArray<mj_lno_t>(num_points_in_parts);
04732         freeArray<mj_part_t>(part_shifts);
04733     }
04734 }
04735 
04736 
/*! \brief Decides whether to migrate and, if so, redistributes the local
 *  coordinates so each process owns the points of the parts assigned to
 *  it, then shrinks this->comm to the processes that keep work.
 *
 *  \param input_num_parts  number of parts before migration.
 *  \param output_num_parts output: number of parts this process owns after
 *         the part-to-processor assignment.
 *  \param next_future_num_parts_in_parts in/out: rewritten to hold the
 *         future part counts of only the parts kept by this process.
 *  \param output_part_begin_index output: begin index of the kept parts.
 *  \param migration_reduce_all_population metric fed to the migration check.
 *  \param num_coords_for_last_dim_part    metric fed to the migration check.
 *  \param iteration string tag appended to timer names.
 *  \param input_part_boxes/output_part_boxes in/out: part boxes; swapped on
 *         exit when mj_keep_part_boxes is set.
 *  \return true if migration was performed, false if it was skipped.
 */
04759 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
04760           typename mj_part_t>
04761 bool AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::mj_perform_migration(
04762     mj_part_t input_num_parts, //current umb parts
04763     mj_part_t &output_num_parts, //output umb parts.
04764     std::vector<mj_part_t> *next_future_num_parts_in_parts,
04765     mj_part_t &output_part_begin_index,
04766     size_t migration_reduce_all_population,
04767     mj_lno_t num_coords_for_last_dim_part,
04768     std::string iteration,
04769     RCP<mj_partBoxVector_t> &input_part_boxes,
04770     RCP<mj_partBoxVector_t> &output_part_boxes
04771 )
04772 {
04773   mj_part_t num_procs = this->comm->getSize();
04774   this->myRank = this->comm->getRank();
04775 
04776 
04777   //this array holds how many points each processor has in each part.
04778   //to access how many points processor i has on part j,
04779   //num_points_in_all_processor_parts[i * num_parts + j]
  // The extra (+1)-th "processor" row, at offset num_procs * input_num_parts,
  // holds the global (all-process) per-part totals; see its use below.
04780   mj_gno_t *num_points_in_all_processor_parts = allocMemory<mj_gno_t>(input_num_parts * (num_procs + 1));
04781 
04782   //get the number of coordinates in each part in each processor.
04783   this->get_processor_num_points_in_parts(
04784       num_procs,
04785       input_num_parts,
04786       num_points_in_all_processor_parts);
04787 
04788 
04789   //check if migration will be performed or not.
04790   if (!this->mj_check_to_migrate(
04791       migration_reduce_all_population,
04792       num_coords_for_last_dim_part,
04793       num_procs,
04794       input_num_parts,
04795       num_points_in_all_processor_parts)){
    // Migration deemed not worthwhile; release the scratch array and bail.
04796     freeArray<mj_gno_t>(num_points_in_all_processor_parts);
04797     return false;
04798   }
04799 
04800 
04801   mj_lno_t *send_count_to_each_proc = NULL;
04802   int *coordinate_destinations = allocMemory<int>(this->num_local_coords);
04803   send_count_to_each_proc = allocMemory<mj_lno_t>(num_procs);
04804   for (int i = 0; i < num_procs; ++i) send_count_to_each_proc[i] = 0;
04805 
04806   std::vector<mj_part_t> processor_ranks_for_subcomm;
04807   std::vector<mj_part_t> out_part_indices;
04808 
04809   //determine which processors are assigned to which parts
04810   this->mj_migration_part_proc_assignment(
04811       num_points_in_all_processor_parts,
04812       input_num_parts,
04813       num_procs,
04814       send_count_to_each_proc,
04815       processor_ranks_for_subcomm,
04816       next_future_num_parts_in_parts,
04817       output_num_parts,
04818       out_part_indices,
04819       output_part_begin_index,
04820       coordinate_destinations);
04821 
04822 
04823 
04824 
04825   freeArray<mj_lno_t>(send_count_to_each_proc);
  // tmpv collects the future part counts of the kept parts, in sorted
  // part-index order, before next_future_num_parts_in_parts is rewritten.
04826   std::vector <mj_part_t> tmpv;
04827 
04828   std::sort (out_part_indices.begin(), out_part_indices.end());
04829   mj_part_t outP = out_part_indices.size();
04830 
04831   mj_gno_t new_global_num_points = 0;
  // Pointer into the last row of the combined array: global per-part totals.
04832   mj_gno_t *global_num_points_in_parts = num_points_in_all_processor_parts + num_procs * input_num_parts;
04833 
04834   if (this->mj_keep_part_boxes){
04835     input_part_boxes->clear();
04836   }
04837 
04838   //now we calculate the new values for next_future_num_parts_in_parts.
04839   //same for the part boxes.
04840   for (mj_part_t i = 0; i < outP; ++i){
04841     mj_part_t ind = out_part_indices[i];
04842     new_global_num_points += global_num_points_in_parts[ind];
04843     tmpv.push_back((*next_future_num_parts_in_parts)[ind]);
04844     if (this->mj_keep_part_boxes){
04845       input_part_boxes->push_back((*output_part_boxes)[ind]);
04846     }
04847   }
04848   //swap the input and output part boxes.
04849   if (this->mj_keep_part_boxes){
04850     RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
04851     input_part_boxes = output_part_boxes;
04852     output_part_boxes = tmpPartBoxes;
04853   }
  // Rewrite the future-parts vector with the kept parts' values.
04854   next_future_num_parts_in_parts->clear();
04855   for (mj_part_t i = 0; i < outP; ++i){
04856     mj_part_t p = tmpv[i];
04857     next_future_num_parts_in_parts->push_back(p);
04858   }
04859 
04860   freeArray<mj_gno_t>(num_points_in_all_processor_parts);
04861 
  // Set by mj_migrate_coords to the post-migration local coordinate count.
04862   mj_lno_t num_new_local_points = 0;
04863 
04864 
04865   //perform the actual migration operation here.
04866   this->mj_migrate_coords(
04867       num_procs,
04868       num_new_local_points,
04869       iteration,
04870       coordinate_destinations,
04871       input_num_parts);
04872 
04873 
04874   freeArray<int>(coordinate_destinations);
04875 
  // Resize the permutation work arrays only if the local count changed.
04876   if(this->num_local_coords != num_new_local_points){
04877     freeArray<mj_lno_t>(this->new_coordinate_permutations);
04878     freeArray<mj_lno_t>(this->coordinate_permutations);
04879 
04880     this->new_coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
04881     this->coordinate_permutations = allocMemory<mj_lno_t>(num_new_local_points);
04882   }
04883   this->num_local_coords = num_new_local_points;
04884   this->num_global_coords = new_global_num_points;
04885 
04886 
04887 
04888   //create subcommunicator.
04889   this->create_sub_communicator(processor_ranks_for_subcomm);
04890   processor_ranks_for_subcomm.clear();
04891 
04892   //fill the new permutation arrays.
04893   this->fill_permutation_array(
04894       output_num_parts,
04895       input_num_parts);
04896   return true;
04897 }
04898 
04899 
/*! \brief Like the usual new-partition creation, but points lying exactly on
 *  a cut line are assigned deterministically: they are collected into sort
 *  items keyed by the coordinate values of the OTHER dimensions (starting
 *  from coordInd+1, wrapping around) plus the index, sorted, and then dealt
 *  out to the parts according to the per-cut left-weight budget.
 *
 *  \param num_parts number of parts in this chunk (no_cuts = num_parts - 1).
 *  \param mj_current_dim_coords coordinates along the current dimension.
 *  \param current_concurrent_cut_coordinate the cut positions; cuts closer
 *         than sEpsilon are treated as coincident and share a sort vector.
 *  \param coordinate_begin/coordinate_end range in coordinate_permutations.
 *  \param used_local_cut_line_weight_to_left per-cut weight to place left.
 *  \param out_part_xadj output: end index (prefix-summed) of each part.
 *  \param coordInd index of the current dimension (excluded from sort keys).
 */
04913 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
04914           typename mj_part_t>
04915 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::create_consistent_chunks(
04916     mj_part_t num_parts,
04917     mj_scalar_t *mj_current_dim_coords,
04918     mj_scalar_t *current_concurrent_cut_coordinate,
04919     mj_lno_t coordinate_begin,
04920     mj_lno_t coordinate_end,
04921     mj_scalar_t *used_local_cut_line_weight_to_left,
04922     mj_lno_t *out_part_xadj,
04923     int coordInd){
04924 
04925   //mj_lno_t numCoordsInPart =  coordinateEnd - coordinateBegin;
04926   mj_part_t no_cuts = num_parts - 1;
04927 
04928 
04929 
  // Single-thread view: "me" indexes the per-thread scratch arrays.
04930   int me = 0;
04931   mj_lno_t *thread_num_points_in_parts = this->thread_point_counts[me];
04932   mj_scalar_t *my_local_thread_cut_weights_to_put_left = NULL;
04933 
04934 
04935   //now if the rectilinear partitioning is allowed we decide how
04936   //much weight each thread should put to left and right.
04937   if (this->distribute_points_on_cut_lines){
04938 
04939     my_local_thread_cut_weights_to_put_left = this->thread_cut_line_weight_to_put_left[me];
04940     for (mj_part_t i = 0; i < no_cuts; ++i){
04941       //the left to be put on the left of the cut.
04942       mj_scalar_t left_weight = used_local_cut_line_weight_to_left[i];
04943       //cout << "i:" << i << " left_weight:" << left_weight << endl;
      // Greedily satisfy the left-weight budget thread by thread: each
      // thread contributes at most its own on-cut weight.
04944       for(int ii = 0; ii < this->num_threads; ++ii){
04945         if(left_weight > this->sEpsilon){
04946           //the weight of thread ii on cut.
04947           mj_scalar_t thread_ii_weight_on_cut = this->thread_part_weight_work[ii][i * 2 + 1] - this->thread_part_weight_work[ii][i * 2 ];
04948           if(thread_ii_weight_on_cut < left_weight){
04949             this->thread_cut_line_weight_to_put_left[ii][i] = thread_ii_weight_on_cut;
04950           }
04951           else {
04952             this->thread_cut_line_weight_to_put_left[ii][i] = left_weight ;
04953           }
04954           left_weight -= thread_ii_weight_on_cut;
04955         }
04956         else {
04957           this->thread_cut_line_weight_to_put_left[ii][i] = 0;
04958         }
04959       }
04960     }
04961 
04962     if(no_cuts > 0){
04963       //this is a special case. If cutlines share the same coordinate, their weights are equal.
04964       //we need to adjust the ratio for that.
04965       for (mj_part_t i = no_cuts - 1; i > 0 ; --i){
04966         if(ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
04967           my_local_thread_cut_weights_to_put_left[i] -= my_local_thread_cut_weights_to_put_left[i - 1] ;
04968         }
        // Round to SIGNIFICANCE_MUL precision to avoid floating-point noise.
04969         my_local_thread_cut_weights_to_put_left[i] = int ((my_local_thread_cut_weights_to_put_left[i] + LEAST_SIGNIFICANCE) * SIGNIFICANCE_MUL)
04970                                         / mj_scalar_t(SIGNIFICANCE_MUL);
04971       }
04972     }
04973   }
04974 
  // Reset the per-part point counters for this pass.
04975   for(mj_part_t ii = 0; ii < num_parts; ++ii){
04976     thread_num_points_in_parts[ii] = 0;
04977   }
04978 
04979   //for this specific case we dont want to distribute the points along the cut position
04980   //randomly, as we need a specific ordering of them. Instead,
04981   //we put the coordinates into a sort item, where we sort those
04982   //using the coordinates of points on other dimensions and the index.
04983 
04984 
04985   //some of the cuts might share the same position.
04986   //in this case, if cut i and cut j share the same position
04987   //cut_map[i] = cut_map[j] = sort item index.
04988   mj_part_t *cut_map = allocMemory<mj_part_t> (no_cuts);
04989 
04990 
04991   typedef uMultiSortItem<mj_lno_t, int, mj_scalar_t> multiSItem;
04992   typedef std::vector< multiSItem > multiSVector;
04993   typedef std::vector<multiSVector> multiS2Vector;
04994 
04995   //to keep track of the memory allocated.
04996   std::vector<mj_scalar_t *>allocated_memory;
04997 
04998   //vector for which the coordinates will be sorted.
04999   multiS2Vector sort_vector_points_on_cut;
05000 
05001   //the number of cuts that have different coordinates.
05002   mj_part_t different_cut_count = 1;
05003   cut_map[0] = 0;
05004 
05005   //now we insert 1 sort vector for all cuts on the different
05006   //positins.if multiple cuts are on the same position, they share sort vectors.
05007   multiSVector tmpMultiSVector;
05008   sort_vector_points_on_cut.push_back(tmpMultiSVector);
05009 
05010   for (mj_part_t i = 1; i < no_cuts ; ++i){
05011     //if cuts share the same cut coordinates
05012     //set the cutmap accordingly.
05013     if(ABS(current_concurrent_cut_coordinate[i] - current_concurrent_cut_coordinate[i -1]) < this->sEpsilon){
05014       cut_map[i] = cut_map[i-1];
05015     }
05016     else {
05017       cut_map[i] = different_cut_count++;
05018       multiSVector tmp2MultiSVector;
05019       sort_vector_points_on_cut.push_back(tmp2MultiSVector);
05020     }
05021   }
05022 
05023 
05024   //now the actual part assigment.
05025   for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
05026 
05027     mj_lno_t i = this->coordinate_permutations[ii];
05028 
    // Part ids are encoded as 2*part (clean) or 2*part+1 (on cut "part").
05029     mj_part_t pp = this->assigned_part_ids[i];
05030     mj_part_t p = pp / 2;
05031     //if the coordinate is on a cut.
05032     if(pp % 2 == 1 ){
05033       mj_scalar_t *vals = allocMemory<mj_scalar_t>(this->coord_dim -1);
05034       allocated_memory.push_back(vals);
05035 
05036       //we insert the coordinates to the sort item here.
      // Sort key: the other dimensions starting after coordInd, wrapping.
05037       int val_ind = 0;
05038       for(int dim = coordInd + 1; dim < this->coord_dim; ++dim){
05039         vals[val_ind++] = this->mj_coordinates[dim][i];
05040       }
05041       for(int dim = 0; dim < coordInd; ++dim){
05042         vals[val_ind++] = this->mj_coordinates[dim][i];
05043       }
05044       multiSItem tempSortItem(i, this->coord_dim -1, vals);
05045       //inser the point to the sort vector pointed by the cut_map[p].
05046       mj_part_t cmap = cut_map[p];
05047       sort_vector_points_on_cut[cmap].push_back(tempSortItem);
05048     }
05049     else {
05050       //if it is not on the cut, simple sorting.
05051       ++thread_num_points_in_parts[p];
05052       this->assigned_part_ids[i] = p;
05053     }
05054   }
05055 
05056   //sort all the sort vectors.
05057   for (mj_part_t i = 0; i < different_cut_count; ++i){
05058     std::sort (sort_vector_points_on_cut[i].begin(), sort_vector_points_on_cut[i].end());
05059   }
05060 
05061   //we do the part assignment for the points on cuts here.
05062   mj_part_t previous_cut_map = cut_map[0];
05063 
05064   //this is how much previous part owns the weight of the current part.
05065   //when target part weight is 1.6, and the part on the left is given 2,
05066   //the left has an extra 0.4, while the right has missing 0.4 from the previous cut.
05067   //this parameter is used to balance this issues.
05068   //in the above example weight_stolen_from_previous_part will be 0.4.
05069   //if the left part target is 2.2 but it is given 2,
05070   //then weight_stolen_from_previous_part will be -0.2.
05071   mj_scalar_t weight_stolen_from_previous_part = 0;
05072   for (mj_part_t p = 0; p < no_cuts; ++p){
05073 
05074     mj_part_t mapped_cut = cut_map[p];
05075 
05076     //if previous cut map is done, and it does not have the same index,
05077     //then assign all points left on that cut to its right.
05078     if (previous_cut_map != mapped_cut){
05079       mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
05080       for (; sort_vector_end >= 0; --sort_vector_end){
05081         multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
05082         mj_lno_t i = t.index;
05083         ++thread_num_points_in_parts[p];
05084         this->assigned_part_ids[i] = p;
05085       }
05086       sort_vector_points_on_cut[previous_cut_map].clear();
05087     }
    // Consume the sorted on-cut points from the back of the vector.
05088     mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[mapped_cut].size() - 1;
05089 
05090     for (; sort_vector_end >= 0; --sort_vector_end){
05091       multiSItem t = sort_vector_points_on_cut[mapped_cut][sort_vector_end];
05092       mj_lno_t i = t.index;
05093       mj_scalar_t w = this->mj_uniform_weights[0]? 1:this->mj_weights[0][i];
05094 
05095 
05096       //part p has enough space for point i, then put it to point i.
      // NOTE(review): my_local_thread_cut_weights_to_put_left stays NULL when
      // distribute_points_on_cut_lines is false; this dereference appears to
      // assume on-cut points (pp % 2 == 1) only occur when that flag is set
      // — confirm against the caller that sets assigned_part_ids.
05097       if( my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part> this->sEpsilon &&
05098         my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - ABS(my_local_thread_cut_weights_to_put_left[p] + weight_stolen_from_previous_part - w)
05099           > this->sEpsilon){
05100 
05101         my_local_thread_cut_weights_to_put_left[p] -= w;
05102         sort_vector_points_on_cut[mapped_cut].pop_back();
05103         ++thread_num_points_in_parts[p];
05104         this->assigned_part_ids[i] = p;
05105         //if putting this weight to left overweights the left cut, then
05106         //increase the space for the next cut using weight_stolen_from_previous_part.
05107         if(p < no_cuts - 1 && my_local_thread_cut_weights_to_put_left[p] < this->sEpsilon){
05108           if(mapped_cut == cut_map[p + 1] ){
05109             //if the cut before the cut indexed at p was also at the same position
05110             //special case, as we handle the weight differently here.
05111             if (previous_cut_map != mapped_cut){
05112               weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
05113             }
05114             else {
05115               //if the cut before the cut indexed at p was also at the same position
05116               //we assign extra weights cumulatively in this case.
05117               weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
05118             }
05119           }
05120           else{
05121             weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
05122           }
05123           //end assignment for part p
05124           break;
05125         }
05126       } else {
05127         //if part p does not have enough space for this point
05128         //and if there is another cut sharing the same positon,
05129         //again increase the space for the next
05130         if(p < no_cuts - 1 && mapped_cut == cut_map[p + 1]){
05131           if (previous_cut_map != mapped_cut){
05132             weight_stolen_from_previous_part = my_local_thread_cut_weights_to_put_left[p];
05133           }
05134           else {
05135             weight_stolen_from_previous_part += my_local_thread_cut_weights_to_put_left[p];
05136           }
05137         }
05138         else{
05139           weight_stolen_from_previous_part = -my_local_thread_cut_weights_to_put_left[p];
05140         }
05141         //end assignment for part p
05142         break;
05143       }
05144     }
05145     previous_cut_map = mapped_cut;
05146   }
05147   //put everything left on the last cut to the last part.
05148   mj_lno_t sort_vector_end = (mj_lno_t)sort_vector_points_on_cut[previous_cut_map].size() - 1;
05149   for (; sort_vector_end >= 0; --sort_vector_end){
05150     multiSItem t = sort_vector_points_on_cut[previous_cut_map][sort_vector_end];
05151     mj_lno_t i = t.index;
05152     ++thread_num_points_in_parts[no_cuts];
05153     this->assigned_part_ids[i] = no_cuts;
05154   }
05155   sort_vector_points_on_cut[previous_cut_map].clear();
05156   freeArray<mj_part_t> (cut_map);
05157 
05158   //free the memory allocated for vertex sort items .
05159   mj_lno_t vSize = (mj_lno_t) allocated_memory.size();
05160   for(mj_lno_t i = 0; i < vSize; ++i){
05161     freeArray<mj_scalar_t> (allocated_memory[i]);
05162   }
05163 
05164   //creation of part_xadj as in usual case.
  // Exclusive prefix per part across threads; the thread counters become
  // each thread's write offset within the part.
05165   for(mj_part_t j = 0; j < num_parts; ++j){
05166     mj_lno_t num_points_in_part_j_upto_thread_i = 0;
05167     for (int i = 0; i < this->num_threads; ++i){
05168       mj_lno_t thread_num_points_in_part_j = this->thread_point_counts[i][j];
05169       this->thread_point_counts[i][j] = num_points_in_part_j_upto_thread_i;
05170       num_points_in_part_j_upto_thread_i += thread_num_points_in_part_j;
05171 
05172     }
05173     out_part_xadj[j] = num_points_in_part_j_upto_thread_i;// + prev2; //+ coordinateBegin;
05174   }
05175 
05176   //perform prefix sum for num_points in parts.
05177   for(mj_part_t j = 1; j < num_parts; ++j){
05178     out_part_xadj[j] += out_part_xadj[j - 1];
05179   }
05180 
05181 
05182   //shift the num points in threads thread to obtain the
05183   //beginning index of each thread's private space.
05184   for(mj_part_t j = 1; j < num_parts; ++j){
05185     thread_num_points_in_parts[j] += out_part_xadj[j - 1] ;
05186   }
05187 
05188   //now thread gets the coordinate and writes the index of coordinate to the permutation array
05189   //using the part index we calculated.
05190   for (mj_lno_t ii = coordinate_begin; ii < coordinate_end; ++ii){
05191     mj_lno_t i = this->coordinate_permutations[ii];
05192     mj_part_t p =  this->assigned_part_ids[i];
05193     this->new_coordinate_permutations[coordinate_begin +
05194                                       thread_num_points_in_parts[p]++] = i;
05195   }
05196 }
05197 
05198 
05199 
/*! \brief Writes the final part id of every local coordinate into
 *  assigned_part_ids and, if the data was ever migrated, ships the
 *  (gno, part) pairs back to the original owning processes recorded in
 *  owner_of_coordinate.
 *
 *  \param current_num_parts number of parts on this process.
 *  \param output_part_begin_index offset added to local part indices to
 *         obtain the global part id.
 *  \param output_part_boxes kept in this->kept_boxes when
 *         mj_keep_part_boxes is set; each box is tagged with its part id.
 *  \param is_data_ever_migrated when true, results are communicated back
 *         (Zoltan comm plan or Tpetra::Distributor, depending on build).
 */
05209 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
05210           typename mj_part_t>
05211 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_final_parts(
05212     mj_part_t current_num_parts,
05213     mj_part_t output_part_begin_index,
05214     RCP<mj_partBoxVector_t> output_part_boxes,
05215     bool is_data_ever_migrated)
05216 {
05217     this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Part_Assignment");
05218 
05219 #ifdef HAVE_ZOLTAN2_OMP
05220 #pragma omp parallel for
05221 #endif
    // Each part i owns the permutation slice [part_xadj[i-1], part_xadj[i]);
    // stamp every coordinate in that slice with the global part id.
05222     for(mj_part_t i = 0; i < current_num_parts;++i){
05223 
05224       mj_lno_t begin = 0;
05225       mj_lno_t end = this->part_xadj[i];
05226 
05227       if(i > 0) begin = this->part_xadj[i -1];
05228       mj_part_t part_to_set_index = i + output_part_begin_index;
05229       if (this->mj_keep_part_boxes){
05230         (*output_part_boxes)[i].setpId(part_to_set_index);
05231       }
05232       for (mj_lno_t ii = begin; ii < end; ++ii){
05233         mj_lno_t k = this->coordinate_permutations[ii];
05234         this->assigned_part_ids[k] = part_to_set_index;
05235       }
05236     }
05237 
05238     //ArrayRCP<const mj_gno_t> gnoList;
05239     if(!is_data_ever_migrated){
      // No migration happened: results are already on the right processes.
05240       //freeArray<mj_gno_t>(this->current_mj_gnos);
05241         //if(this->num_local_coords > 0){
05242         //    gnoList = arcpFromArrayView(this->mj_gnos);
05243         //}
05244     }
05245     else {
05246 #ifdef ENABLE_ZOLTAN_MIGRATION
05247       if (sizeof(mj_lno_t) <=  sizeof(int)) {
05248 
05249         // Cannot use Zoltan_Comm with local ordinals larger than ints.
05250         // In Zoltan_Comm_Create, the cast int(this->num_local_coords) 
05251         // may overflow.
05252 
05253       //if data is migrated, then send part numbers to the original owners.
05254       ZOLTAN_COMM_OBJ *plan = NULL;
05255       MPI_Comm mpi_comm = Teuchos2MPI (this->mj_problemComm);
05256 
05257       int incoming = 0;
05258       int message_tag = 7856;
05259 
05260       this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final Z1PlanCreating");
      // Plan sends each coordinate back to the rank in owner_of_coordinate.
05261       int ierr = Zoltan_Comm_Create( &plan, int(this->num_local_coords),
05262           this->owner_of_coordinate, mpi_comm, message_tag,
05263           &incoming);
05264       Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
05265       this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final Z1PlanCreating" );
05266 
05267       mj_gno_t *incoming_gnos = allocMemory< mj_gno_t>(incoming);
05268 
05269       message_tag++;
05270       this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final Z1PlanComm");
05271       ierr = Zoltan_Comm_Do( plan, message_tag, (char *) this->current_mj_gnos,
05272           sizeof(mj_gno_t), (char *) incoming_gnos);
05273       Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
05274 
05275       freeArray<mj_gno_t>(this->current_mj_gnos);
05276       this->current_mj_gnos = incoming_gnos;
05277 
05278       mj_part_t *incoming_partIds = allocMemory< mj_part_t>(incoming);
05279 
05280       message_tag++;
05281       ierr = Zoltan_Comm_Do( plan, message_tag, (char *) this->assigned_part_ids,
05282           sizeof(mj_part_t), (char *) incoming_partIds);
05283       Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
05284       freeArray<mj_part_t>(this->assigned_part_ids);
05285       this->assigned_part_ids = incoming_partIds;
05286 
05287       this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final Z1PlanComm");
05288       ierr = Zoltan_Comm_Destroy(&plan);
05289       Z2_ASSERT_VALUE(ierr, ZOLTAN_OK);
05290 
05291       this->num_local_coords = incoming;
05292       //gnoList = arcp(this->current_mj_gnos, 0, this->num_local_coords, true);
05293       }
05294       else
05295 
05296 #endif  // !ENABLE_ZOLTAN_MIGRATION
05297       {
05298       //if data is migrated, then send part numbers to the original owners.
05299       this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final DistributorPlanCreating");
05300       Tpetra::Distributor distributor(this->mj_problemComm);
05301       ArrayView<const mj_part_t> owners_of_coords(this->owner_of_coordinate, this->num_local_coords);
05302       mj_lno_t incoming = distributor.createFromSends(owners_of_coords);
05303       this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final DistributorPlanCreating" );
05304 
05305       this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Final DistributorPlanComm");
05306     //migrate gnos to actual owners.
05307     ArrayRCP<mj_gno_t> received_gnos(incoming);
05308     ArrayView<mj_gno_t> sent_gnos(this->current_mj_gnos, this->num_local_coords);
05309     distributor.doPostsAndWaits<mj_gno_t>(sent_gnos, 1, received_gnos());
05310     freeArray<mj_gno_t>(this->current_mj_gnos);
05311     this->current_mj_gnos = allocMemory<mj_gno_t>(incoming);
05312     memcpy(
05313         this->current_mj_gnos,
05314         received_gnos.getRawPtr(),
05315         incoming * sizeof(mj_gno_t));
05316 
05317     //migrate part ids to actual owners.
05318     ArrayView<mj_part_t> sent_partids(this->assigned_part_ids, this->num_local_coords);
05319     ArrayRCP<mj_part_t> received_partids(incoming);
05320     distributor.doPostsAndWaits<mj_part_t>(sent_partids, 1, received_partids());
05321     freeArray<mj_part_t>(this->assigned_part_ids);
05322     this->assigned_part_ids = allocMemory<mj_part_t>(incoming);
05323     memcpy(
05324         this->assigned_part_ids,
05325         received_partids.getRawPtr(),
05326         incoming * sizeof(mj_part_t));
05327 
05328       this->num_local_coords = incoming;
05329       this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Final DistributorPlanComm");
05330 
05331       }
05332     }
05333 
05334     this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Part_Assignment");
05335 
05336     this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Solution_Part_Assignment");
05337 
05338     //ArrayRCP<mj_part_t> partId;
05339     //partId = arcp(this->assigned_part_ids, 0, this->num_local_coords, true);
05340 
    // Retain the labeled part boxes for later retrieval by the caller.
05341     if (this->mj_keep_part_boxes){
05342       this->kept_boxes = output_part_boxes;
05343     }
05344 
05345     this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Solution_Part_Assignment");
05346 }
05347 
/*! \brief Frees all scratch/work memory allocated by allocate_set_work_memory()
 *  and the copied input arrays, once partitioning is complete.
 *
 *  Note: this->assigned_part_ids and this->current_mj_gnos are NOT freed
 *  here; multi_jagged_part() hands those arrays back to the caller as the
 *  partitioning result after calling this function.
 */
template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
          typename mj_part_t>
void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::free_work_memory(){
  this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Free");

  // Per-dimension copies of the input coordinates, then the pointer array.
  for (int i=0; i < this->coord_dim; i++){
    freeArray<mj_scalar_t>(this->mj_coordinates[i]);
  }
  freeArray<mj_scalar_t *>(this->mj_coordinates);

  // Per-weight-index copies of the input weights, then the pointer array.
  for (int i=0; i < this->num_weights_per_coord; i++){
    freeArray<mj_scalar_t>(this->mj_weights[i]);
  }
  freeArray<mj_scalar_t *>(this->mj_weights);

  freeArray<int>(this->owner_of_coordinate);

  // Per-thread scratch: point counts per part.
  for(int i = 0; i < this->num_threads; ++i){
    freeArray<mj_lno_t>(this->thread_point_counts[i]);
  }

  freeArray<mj_lno_t *>(this->thread_point_counts);
  // thread_part_weight_work holds borrowed pointers into thread_part_weights;
  // only the pointer array itself is freed here.
  freeArray<double *> (this->thread_part_weight_work);

  // These arrays are only allocated when points on a cut line may be
  // split between the two sides (see allocate_set_work_memory).
  if(this->distribute_points_on_cut_lines){
    freeArray<mj_scalar_t>(this->process_cut_line_weight_to_put_left);
    for(int i = 0; i < this->num_threads; ++i){
      freeArray<mj_scalar_t>(this->thread_cut_line_weight_to_put_left[i]);
    }
    freeArray<mj_scalar_t *>(this->thread_cut_line_weight_to_put_left);
    freeArray<mj_scalar_t>(this->process_rectilinear_cut_weight);
    freeArray<mj_scalar_t>(this->global_rectilinear_cut_weight);
  }

  freeArray<mj_part_t>(this->my_incomplete_cut_count);

  freeArray<mj_scalar_t>(this->max_min_coords);

  // new_part_xadj: note part_xadj was already released inside the
  // partitioning loop; at this point the two aliases point to the same
  // remaining allocation chain ending here.
  freeArray<mj_lno_t>(this->new_part_xadj);

  freeArray<mj_lno_t>(this->coordinate_permutations);

  freeArray<mj_lno_t>(this->new_coordinate_permutations);

  freeArray<mj_scalar_t>(this->all_cut_coordinates);

  freeArray<mj_scalar_t> (this->process_local_min_max_coord_total_weight);

  freeArray<mj_scalar_t> (this->global_min_max_coord_total_weight);

  // Scratch used by the 1D cut-line iteration.
  freeArray<mj_scalar_t>(this->cut_coordinates_work_array);

  freeArray<mj_scalar_t>(this->target_part_weights);

  freeArray<mj_scalar_t>(this->cut_upper_bound_coordinates);

  freeArray<mj_scalar_t>(this->cut_lower_bound_coordinates);

  freeArray<mj_scalar_t>(this->cut_lower_bound_weights);
  freeArray<mj_scalar_t>(this->cut_upper_bound_weights);
  freeArray<bool>(this->is_cut_line_determined);
  freeArray<mj_scalar_t>(this->total_part_weight_left_right_closests);
  freeArray<mj_scalar_t>(this->global_total_part_weight_left_right_closests);

  // Per-thread weight/closest-point scratch, then their pointer arrays.
  for(int i = 0; i < this->num_threads; ++i){
    freeArray<double>(this->thread_part_weights[i]);
    freeArray<mj_scalar_t>(this->thread_cut_right_closest_point[i]);
    freeArray<mj_scalar_t>(this->thread_cut_left_closest_point[i]);
  }

  freeArray<double *>(this->thread_part_weights);
  freeArray<mj_scalar_t *>(this->thread_cut_left_closest_point);
  freeArray<mj_scalar_t *>(this->thread_cut_right_closest_point);

  this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Free");
}
05426 
05434 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
05435           typename mj_part_t>
05436 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::set_partitioning_parameters(
05437     bool distribute_points_on_cut_lines_,
05438     int max_concurrent_part_calculation_,
05439     int check_migrate_avoid_migration_option_,
05440     mj_scalar_t minimum_migration_imbalance_){
05441   this->distribute_points_on_cut_lines = distribute_points_on_cut_lines_;
05442   this->max_concurrent_part_calculation = max_concurrent_part_calculation_;
05443   this->check_migrate_avoid_migration_option = check_migrate_avoid_migration_option_;
05444   this->minimum_migration_imbalance = minimum_migration_imbalance_;
05445 
05446 }
05447 
05448 
05477 template <typename mj_scalar_t, typename mj_lno_t, typename mj_gno_t,
05478           typename mj_part_t>
05479 void AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t>::multi_jagged_part(
05480 
05481   const RCP<const Environment> &env,
05482       RCP<Comm<int> > &problemComm,
05483 
05484       double imbalance_tolerance_,
05485       size_t num_global_parts_,
05486       mj_part_t *part_no_array_,
05487       int recursion_depth_,
05488 
05489       int coord_dim_,
05490       mj_lno_t num_local_coords_,
05491       mj_gno_t num_global_coords_,
05492       const mj_gno_t *initial_mj_gnos_,
05493       mj_scalar_t **mj_coordinates_,
05494 
05495       int num_weights_per_coord_,
05496       bool *mj_uniform_weights_,
05497       mj_scalar_t **mj_weights_,
05498       bool *mj_uniform_parts_,
05499       mj_scalar_t **mj_part_sizes_,
05500 
05501       mj_part_t *&result_assigned_part_ids_,
05502       mj_gno_t *&result_mj_gnos_
05503 )
05504 {
05505 
05506 #ifdef print_debug
05507     if(comm->getRank() == 0){
05508       std::cout << "size of gno:" << sizeof(mj_gno_t) << std::endl;
05509       std::cout << "size of lno:" << sizeof(mj_lno_t) << std::endl;
05510       std::cout << "size of mj_scalar_t:" << sizeof(mj_scalar_t) << std::endl;
05511     }
05512 #endif
05513 
05514     this->mj_env = env;
05515     this->mj_problemComm = problemComm;
05516     this->myActualRank = this->myRank = this->mj_problemComm->getRank();
05517 
05518     this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Total");
05519     this->mj_env->debug(3, "In MultiJagged Jagged");
05520 
05521     {
05522       this->imbalance_tolerance = imbalance_tolerance_;
05523       this->num_global_parts = num_global_parts_;
05524       this->part_no_array =  part_no_array_;
05525       this->recursion_depth = recursion_depth_;
05526 
05527       this->coord_dim = coord_dim_;
05528       this->num_local_coords = num_local_coords_;
05529       this->num_global_coords = num_global_coords_;
05530       this->mj_coordinates = mj_coordinates_; //will copy the memory to this->mj_coordinates.
05531       this->initial_mj_gnos = (mj_gno_t *) initial_mj_gnos_; //will copy the memory to this->current_mj_gnos[j].
05532 
05533       this->num_weights_per_coord = num_weights_per_coord_;
05534       this->mj_uniform_weights = mj_uniform_weights_;
05535       this->mj_weights = mj_weights_; //will copy the memory to this->mj_weights
05536       this->mj_uniform_parts = mj_uniform_parts_;
05537       this->mj_part_sizes = mj_part_sizes_;
05538 
05539       this->num_threads = 1;
05540 #ifdef HAVE_ZOLTAN2_OMP
05541 #pragma omp parallel
05542 
05543       {
05544         this->num_threads = omp_get_num_threads();
05545       }
05546 #endif
05547     }
05548     //this->set_input_data();
05549     this->set_part_specifications();
05550 
05551     this->allocate_set_work_memory();
05552 
05553     //We duplicate the comm as we create subcommunicators during migration.
05554     //We keep the problemComm as it is, while comm changes after each migration.
05555     this->comm = this->mj_problemComm->duplicate();
05556 
05557     //initially there is a single partition
05558     mj_part_t current_num_parts = 1;
05559     mj_scalar_t *current_cut_coordinates =  this->all_cut_coordinates;
05560 
05561     this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Partitioning");
05562 
05563     mj_part_t output_part_begin_index = 0;
05564     mj_part_t future_num_parts = this->total_num_part;
05565     bool is_data_ever_migrated = false;
05566 
05567     std::vector<mj_part_t> *future_num_part_in_parts = new std::vector<mj_part_t> ();
05568     std::vector<mj_part_t> *next_future_num_parts_in_parts = new std::vector<mj_part_t> ();
05569     next_future_num_parts_in_parts->push_back(this->num_global_parts);
05570 
05571     RCP<mj_partBoxVector_t> input_part_boxes(new mj_partBoxVector_t(), true) ;
05572     RCP<mj_partBoxVector_t> output_part_boxes(new mj_partBoxVector_t(), true);
05573 
05574     compute_global_box();
05575     if(this->mj_keep_part_boxes){
05576       this->init_part_boxes(output_part_boxes);
05577     }
05578 
05579     for (int i = 0; i < this->recursion_depth; ++i){
05580         //partitioning array. size will be as the number of current partitions and this
05581         //holds how many parts that each part will be in the current dimension partitioning.
05582         std::vector <mj_part_t> num_partitioning_in_current_dim;
05583 
05584         //number of parts that will be obtained at the end of this partitioning.
05585         //future_num_part_in_parts is as the size of current number of parts.
05586         //holds how many more parts each should be divided in the further
05587         //iterations. this will be used to calculate num_partitioning_in_current_dim,
05588         //as the number of parts that the part will be partitioned
05589         //in the current dimension partitioning.
05590 
05591         //next_future_num_parts_in_parts will be as the size of outnumParts,
05592         //and this will hold how many more parts that each output part
05593         //should be divided. this array will also be used to determine the weight ratios
05594         //of the parts.
05595         //swap the arrays to use iteratively..
05596         std::vector<mj_part_t> *tmpPartVect= future_num_part_in_parts;
05597         future_num_part_in_parts = next_future_num_parts_in_parts;
05598         next_future_num_parts_in_parts = tmpPartVect;
05599 
05600         //clear next_future_num_parts_in_parts array as
05601         //getPartitionArrays expects it to be empty.
05602         //it also expects num_partitioning_in_current_dim to be empty as well.
05603         next_future_num_parts_in_parts->clear();
05604 
05605         if(this->mj_keep_part_boxes){
05606             RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
05607             input_part_boxes = output_part_boxes;
05608             output_part_boxes = tmpPartBoxes;
05609             output_part_boxes->clear();
05610         }
05611 
05612         //returns the total no. of output parts for this dimension partitioning.
05613         mj_part_t output_part_count_in_dimension =
05614             this->update_part_num_arrays(
05615                 num_partitioning_in_current_dim,
05616                 future_num_part_in_parts,
05617                 next_future_num_parts_in_parts,
05618                 future_num_parts,
05619                 current_num_parts,
05620                 i,
05621                 input_part_boxes,
05622                 output_part_boxes);
05623 
05624         //if the number of obtained parts equal to current number of parts,
05625         //skip this dimension. For example, this happens when 1 is given in the input
05626         //part array is given. P=4,5,1,2
05627         if(output_part_count_in_dimension == current_num_parts) {
05628           //still need to swap the input output arrays.
05629             tmpPartVect= future_num_part_in_parts;
05630             future_num_part_in_parts = next_future_num_parts_in_parts;
05631             next_future_num_parts_in_parts = tmpPartVect;
05632 
05633             if(this->mj_keep_part_boxes){
05634                 RCP<mj_partBoxVector_t> tmpPartBoxes = input_part_boxes;
05635                 input_part_boxes = output_part_boxes;
05636                 output_part_boxes = tmpPartBoxes;
05637             }
05638             continue;
05639         }
05640 
05641 
05642         //get the coordinate axis along which the partitioning will be done.
05643         int coordInd = i % this->coord_dim;
05644         mj_scalar_t * mj_current_dim_coords = this->mj_coordinates[coordInd];
05645 
05646         //convert i to string to be used for debugging purposes.
05647         std::string istring = toString<int>(i);
05648         this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Partitioning_" + istring);
05649 
05650         //alloc Memory to point the indices
05651         //of the parts in the permutation array.
05652         this->new_part_xadj = allocMemory<mj_lno_t>(output_part_count_in_dimension);
05653 
05654         //the index where in the new_part_xadj will be written.
05655         mj_part_t output_part_index = 0;
05656         //whatever is written to output_part_index will be added with putput_coordinate_end_index
05657         //so that the points will be shifted.
05658         mj_part_t output_coordinate_end_index = 0;
05659 
05660         mj_part_t current_work_part = 0;
05661         mj_part_t current_concurrent_num_parts =
05662             std::min(current_num_parts - current_work_part, this->max_concurrent_part_calculation);
05663 
05664         mj_part_t obtained_part_index = 0;
05665 
05666         //run for all available parts.
05667         for (; current_work_part < current_num_parts;
05668                  current_work_part += current_concurrent_num_parts){
05669 
05670             current_concurrent_num_parts = std::min(current_num_parts - current_work_part,
05671                                  this->max_concurrent_part_calculation);
05672 
05673             mj_part_t actual_work_part_count = 0;
05674             //initialization for 1D partitioning.
05675             //get the min and max coordinates of each part
05676             //together with the part weights of each part.
05677             for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
05678                 mj_part_t current_work_part_in_concurrent_parts = current_work_part + kk;
05679 
05680                 //if this part wont be partitioned any further
05681                 //dont do any work for this part.
05682                 if (num_partitioning_in_current_dim[current_work_part_in_concurrent_parts] == 1){
05683                     continue;
05684                 }
05685                 ++actual_work_part_count;
05686                 mj_lno_t coordinate_end_index= this->part_xadj[current_work_part_in_concurrent_parts];
05687                 mj_lno_t coordinate_begin_index = current_work_part_in_concurrent_parts==0 ? 0: this->part_xadj[current_work_part_in_concurrent_parts -1];
05688 
05689 /*
05690                 cout << "i:" << i << " j:" << current_work_part + kk
05691                     << " coordinate_begin_index:" << coordinate_begin_index
05692                     << " coordinate_end_index:" << coordinate_end_index
05693                     << " total:" << coordinate_end_index - coordinate_begin_index<< endl;
05694                     */
05695                 this->mj_get_local_min_max_coord_totW(
05696                         coordinate_begin_index,
05697                         coordinate_end_index,
05698                         this->coordinate_permutations,
05699                         mj_current_dim_coords,
05700                             this->process_local_min_max_coord_total_weight[kk], //min_coordinate
05701                             this->process_local_min_max_coord_total_weight[kk + current_concurrent_num_parts], //max_coordinate
05702                             this->process_local_min_max_coord_total_weight[kk + 2*current_concurrent_num_parts]); //total_weight
05703 
05704             }
05705 
05706             //1D partitioning
05707             if (actual_work_part_count > 0){
05708                 //obtain global Min max of the part.
05709                 this->mj_get_global_min_max_coord_totW(
05710                     current_concurrent_num_parts,
05711                     this->process_local_min_max_coord_total_weight,
05712                     this->global_min_max_coord_total_weight);
05713 
05714                 //represents the total number of cutlines
05715                 //whose coordinate should be determined.
05716                 mj_part_t total_incomplete_cut_count = 0;
05717 
05718                 //Compute weight ratios for parts & cuts:
05719                 //e.g., 0.25  0.25  0.5    0.5  0.75 0.75  1
05720                 //part0  cut0  part1 cut1 part2 cut2 part3
05721                 mj_part_t concurrent_part_cut_shift = 0;
05722                 mj_part_t concurrent_part_part_shift = 0;
05723                 for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
05724                     mj_scalar_t min_coordinate = this->global_min_max_coord_total_weight[kk];
05725                     mj_scalar_t max_coordinate = this->global_min_max_coord_total_weight[kk +
05726                                                      current_concurrent_num_parts];
05727 
05728                     mj_scalar_t global_total_weight = this->global_min_max_coord_total_weight[kk +
05729                                                         2 * current_concurrent_num_parts];
05730 
05731                     mj_part_t concurrent_current_part_index = current_work_part + kk;
05732 
05733                     mj_part_t partition_count = num_partitioning_in_current_dim[concurrent_current_part_index];
05734 
05735                     mj_scalar_t *usedCutCoordinate = current_cut_coordinates + concurrent_part_cut_shift;
05736                     mj_scalar_t *current_target_part_weights = this->target_part_weights +
05737                                                         concurrent_part_part_shift;
05738                     //shift the usedCutCoordinate array as noCuts.
05739                     concurrent_part_cut_shift += partition_count - 1;
05740                     //shift the partRatio array as noParts.
05741                     concurrent_part_part_shift += partition_count;
05742 
05743 
05744                     //calculate only if part is not empty,
05745                     //and part will be further partitioned.
05746                     if(partition_count > 1 && min_coordinate <= max_coordinate){
05747 
05748                         //increase num_cuts_do_be_determined by the number of cuts of the current
05749                         //part's cut line number.
05750                         total_incomplete_cut_count += partition_count - 1;
05751                         //set the number of cut lines that should be determined
05752                         //for this part.
05753                         this->my_incomplete_cut_count[kk] = partition_count - 1;
05754 
05755                         //get the target weights of the parts.
05756                         this->mj_get_initial_cut_coords_target_weights(
05757                             min_coordinate,
05758                             max_coordinate,
05759                             partition_count - 1,
05760                             global_total_weight,
05761                             usedCutCoordinate,
05762                             current_target_part_weights,
05763                             future_num_part_in_parts,
05764                             next_future_num_parts_in_parts,
05765                             concurrent_current_part_index,
05766                             obtained_part_index);
05767 
05768                         mj_lno_t coordinate_end_index= this->part_xadj[concurrent_current_part_index];
05769                         mj_lno_t coordinate_begin_index = concurrent_current_part_index==0 ? 0: this->part_xadj[concurrent_current_part_index -1];
05770 
05771                         //get the initial estimated part assignments of the
05772                         //coordinates.
05773                         this->set_initial_coordinate_parts(
05774                             max_coordinate,
05775                             min_coordinate,
05776                             concurrent_current_part_index,
05777                             coordinate_begin_index, coordinate_end_index,
05778                             this->coordinate_permutations,
05779                             mj_current_dim_coords,
05780                             this->assigned_part_ids,
05781                             partition_count);
05782                     }
05783                     else {
05784                         // e.g., if have fewer coordinates than parts, don't need to do next dim.
05785                         this->my_incomplete_cut_count[kk] = 0;
05786                     }
05787                     obtained_part_index += partition_count;
05788                 }
05789 
05790 
05791 
05792                 //used imbalance, it is always 0, as it is difficult to
05793                 //estimate a range.
05794                 mj_scalar_t used_imbalance = 0;
05795 
05796 
05797                 // Determine cut lines for all concurrent parts parts here.
05798                 this->mj_1D_part(
05799                     mj_current_dim_coords,
05800                     used_imbalance,
05801                     current_work_part,
05802                     current_concurrent_num_parts,
05803                     current_cut_coordinates,
05804                     total_incomplete_cut_count,
05805                     num_partitioning_in_current_dim);
05806             }
05807 
05808             //create new part chunks
05809             {
05810                 mj_part_t output_array_shift = 0;
05811                 mj_part_t cut_shift = 0;
05812                 size_t tlr_shift = 0;
05813                 size_t partweight_array_shift = 0;
05814 
05815                 for(int kk = 0; kk < current_concurrent_num_parts; ++kk){
05816                     mj_part_t current_concurrent_work_part = current_work_part + kk;
05817                     mj_part_t num_parts = num_partitioning_in_current_dim[current_concurrent_work_part];
05818 
05819                     //if the part is empty, skip the part.
05820                     if((num_parts != 1  )
05821                         &&
05822                         this->global_min_max_coord_total_weight[kk] >
05823                              this->global_min_max_coord_total_weight[kk + current_concurrent_num_parts]) {
05824 
05825                       //we still need to write the begin and end point of the
05826                       //empty part. simply set it zero, the array indices will be shifted later.
05827                       for(mj_part_t jj = 0; jj < num_parts; ++jj){
05828                         this->new_part_xadj[output_part_index + output_array_shift + jj] = 0;
05829                       }
05830                         cut_shift += num_parts - 1;
05831                         tlr_shift += (4 *(num_parts - 1) + 1);
05832                         output_array_shift += num_parts;
05833                         partweight_array_shift += (2 * (num_parts - 1) + 1);
05834                         continue;
05835                     }
05836 
05837                     mj_lno_t coordinate_end= this->part_xadj[current_concurrent_work_part];
05838                     mj_lno_t coordinate_begin = current_concurrent_work_part==0 ? 0: this->part_xadj[
05839                                                                 current_concurrent_work_part -1];
05840                     mj_scalar_t *current_concurrent_cut_coordinate = current_cut_coordinates + cut_shift;
05841                     mj_scalar_t *used_local_cut_line_weight_to_left = this->process_cut_line_weight_to_put_left +
05842                                                             cut_shift;
05843 
05844                     //mj_scalar_t *used_tlr_array =  this->total_part_weight_left_right_closests + tlr_shift;
05845 
05846                     for(int ii = 0; ii < this->num_threads; ++ii){
05847                         this->thread_part_weight_work[ii] = this->thread_part_weights[ii] +  partweight_array_shift;
05848                     }
05849 
05850                     if(num_parts > 1){
05851                         if(this->mj_keep_part_boxes){
05852                           //if part boxes are to be stored update the boundaries.
05853                             for (mj_part_t j = 0; j < num_parts - 1; ++j){
05854                                 (*output_part_boxes)[output_array_shift + output_part_index +
05855                                  j].updateMinMax(current_concurrent_cut_coordinate[j], 1
05856                                   /*update max*/, coordInd);
05857 
05858                                 (*output_part_boxes)[output_array_shift + output_part_index + j +
05859                                  1].updateMinMax(current_concurrent_cut_coordinate[j], 0
05860                                   /*update min*/, coordInd);
05861                             }
05862                         }
05863 
05864                         // Rewrite the indices based on the computed cuts.
05865                         this->mj_create_new_partitions(
05866                             num_parts,
05867                             mj_current_dim_coords,
05868                             current_concurrent_cut_coordinate,
05869                             coordinate_begin,
05870                             coordinate_end,
05871                             used_local_cut_line_weight_to_left,
05872                             this->thread_part_weight_work,
05873                             this->new_part_xadj + output_part_index + output_array_shift
05874                             );
05875 
05876                     }
05877                     else {
05878                         //if this part is partitioned into 1 then just copy
05879                         //the old values.
05880                         mj_lno_t part_size = coordinate_end - coordinate_begin;
05881                         *(this->new_part_xadj + output_part_index + output_array_shift) = part_size;
05882                         memcpy(
05883                           this->new_coordinate_permutations + coordinate_begin,
05884                             this->coordinate_permutations + coordinate_begin,
05885                             part_size * sizeof(mj_lno_t));
05886                     }
05887                     cut_shift += num_parts - 1;
05888                     tlr_shift += (4 *(num_parts - 1) + 1);
05889                     output_array_shift += num_parts;
05890                     partweight_array_shift += (2 * (num_parts - 1) + 1);
05891                 }
05892 
05893                 //shift cut coordinates so that all cut coordinates are stored.
05894                 //no shift now because we dont keep the cuts.
05895                 //current_cut_coordinates += cut_shift;
05896 
05897                 //mj_create_new_partitions from coordinates partitioned the parts and
05898                 //write the indices as if there were a single part.
05899                 //now we need to shift the beginning indices.
05900                 for(mj_part_t kk = 0; kk < current_concurrent_num_parts; ++kk){
05901                     mj_part_t num_parts = num_partitioning_in_current_dim[ current_work_part + kk];
05902                     for (mj_part_t ii = 0;ii < num_parts ; ++ii){
05903                         //shift it by previousCount
05904                         this->new_part_xadj[output_part_index+ii] += output_coordinate_end_index;
05905                     }
05906                     //increase the previous count by current end.
05907                     output_coordinate_end_index = this->new_part_xadj[output_part_index + num_parts - 1];
05908                     //increase the current out.
05909                     output_part_index += num_parts ;
05910                 }
05911             }
05912         }
05913         // end of this partitioning dimension
05914 
05915 
05916         int current_world_size = this->comm->getSize();
05917         long migration_reduce_all_population = this->total_dim_num_reduce_all * current_world_size;
05918 
05919 
05920         bool is_migrated_in_current_dimension = false;
05921 
05922         //we migrate if there are more partitionings to be done after this step
05923         //and if the migration is not forced to be avoided.
05924         //and the operation is not sequential.
05925         if (future_num_parts > 1 &&
05926             this->check_migrate_avoid_migration_option >= 0 &&
05927             current_world_size > 1){
05928 
05929           this->mj_env->timerStart(MACRO_TIMERS, "MultiJagged - Problem_Migration-" + istring);
05930           mj_part_t num_parts = output_part_count_in_dimension;
05931           if ( this->mj_perform_migration(
05932                   num_parts,
05933                   current_num_parts, //output
05934                   next_future_num_parts_in_parts, //output
05935                   output_part_begin_index,
05936                   migration_reduce_all_population,
05937                   this->num_local_coords / (future_num_parts * current_num_parts),
05938                   istring,
05939                   input_part_boxes, output_part_boxes) ) {
05940             is_migrated_in_current_dimension = true;
05941             is_data_ever_migrated = true;
05942             this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Migration-" +
05943                 istring);
05944             //since data is migrated, we reduce the number of reduceAll operations for the last part.
05945             this->total_dim_num_reduce_all /= num_parts;
05946           }
05947           else {
05948             is_migrated_in_current_dimension = false;
05949             this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Migration-" + istring);
05950           }
05951         }
05952 
05953         //swap the coordinate permutations for the next dimension.
05954         mj_lno_t * tmp = this->coordinate_permutations;
05955         this->coordinate_permutations = this->new_coordinate_permutations;
05956         this->new_coordinate_permutations = tmp;
05957 
05958         if(!is_migrated_in_current_dimension){
05959             this->total_dim_num_reduce_all -= current_num_parts;
05960             current_num_parts = output_part_count_in_dimension;
05961         }
05962         freeArray<mj_lno_t>(this->part_xadj);
05963         this->part_xadj = this->new_part_xadj;
05964 
05965         this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Partitioning_" + istring);
05966     }
05967 
05968     // Partitioning is done
05969     delete future_num_part_in_parts;
05970     delete next_future_num_parts_in_parts;
05971 
05972     this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Problem_Partitioning");
05974 
05975 
05976     //get the final parts of each initial coordinate
05977     //the results will be written to
05978     //this->assigned_part_ids for gnos given in this->current_mj_gnos
05979     this->set_final_parts(
05980         current_num_parts,
05981         output_part_begin_index,
05982         output_part_boxes,
05983         is_data_ever_migrated);
05984 
05985   result_assigned_part_ids_ = this->assigned_part_ids;
05986   result_mj_gnos_ = this->current_mj_gnos;
05987 
05988     this->free_work_memory();
05989     this->mj_env->timerStop(MACRO_TIMERS, "MultiJagged - Total");
05990     this->mj_env->debug(3, "Out of MultiJagged");
05991 
05992 }
05993 
05994 
05998 template <typename Adapter>
05999 class Zoltan2_AlgMJ : public Algorithm<Adapter>
06000 {
06001 private:
06002 
06003 #ifndef DOXYGEN_SHOULD_SKIP_THIS
06004 
06005     typedef CoordinateModel<typename Adapter::base_adapter_t> coordinateModel_t;
06006     typedef typename Adapter::scalar_t mj_scalar_t;
06007     typedef typename Adapter::gno_t mj_gno_t;
06008     typedef typename Adapter::lno_t mj_lno_t;
06009     typedef typename Adapter::node_t mj_node_t;
06010     typedef typename Adapter::part_t mj_part_t;
06011     typedef coordinateModelPartBox<mj_scalar_t, mj_part_t> mj_partBox_t;
06012     typedef std::vector<mj_partBox_t> mj_partBoxVector_t;
06013 #endif
06014     AlgMJ<mj_scalar_t, mj_lno_t, mj_gno_t, mj_part_t> mj_partitioner;
06015 
06016     RCP<const Environment> mj_env; //the environment object
06017     RCP<Comm<int> > mj_problemComm; //initial comm object
06018     RCP<const coordinateModel_t> mj_coords; //coordinate adapter
06019 
06020     //PARAMETERS
06021     double imbalance_tolerance; //input imbalance tolerance.
06022     size_t num_global_parts; //the targeted number of parts
06023     mj_part_t *part_no_array; //input part array specifying num part to divide along each dim.
06024     int recursion_depth; //the number of steps that partitioning will be solved in.
06025 
06026     int coord_dim; // coordinate dimension.
06027     mj_lno_t num_local_coords; //number of local coords.
06028     mj_gno_t num_global_coords; //number of global coords.
06029     const mj_gno_t *initial_mj_gnos; //initial global ids of the coordinates.
06030     mj_scalar_t **mj_coordinates; //two dimension coordinate array
06031 
06032     int num_weights_per_coord; // number of weights per coordinate
06033     bool *mj_uniform_weights; //if the coordinates have uniform weights.
06034     mj_scalar_t **mj_weights; //two dimensional weight array
06035     bool *mj_uniform_parts; //if the target parts are uniform
06036     mj_scalar_t **mj_part_sizes; //target part weight sizes.
06037 
06038     bool distribute_points_on_cut_lines; //if partitioning can distribute points on same coordiante to different parts.
06039     mj_part_t max_concurrent_part_calculation; // how many parts we can calculate concurrently.
06040     int check_migrate_avoid_migration_option; //whether to migrate=1, avoid migrate=2, or leave decision to MJ=0
06041     mj_scalar_t minimum_migration_imbalance; //when MJ decides whether to migrate, the minimum imbalance for migration.
06042     int mj_keep_part_boxes; //if the boxes need to be kept.
06043 
06044     int num_threads;
06045 
06046     int mj_run_as_rcb; //if this is set, then recursion depth is adjusted to its maximum value.
06047 
06048     void set_up_partitioning_data(
06049       const RCP<PartitioningSolution<Adapter> >&solution);
06050 
06051     void set_input_parameters(const Teuchos::ParameterList &p);
06052 
06053     void free_work_memory();
06054 public:
06055 
06056     Zoltan2_AlgMJ(const RCP<const Environment> &env,
06057                   RCP<Comm<int> > &problemComm,
06058                   const RCP<const coordinateModel_t> &coords) :
06059                         mj_partitioner(), mj_env(env),
06060                         mj_problemComm(problemComm),
06061       mj_coords(coords),
06062                         imbalance_tolerance(0),
06063       num_global_parts(1), part_no_array(NULL),
06064                         recursion_depth(0),
06065       coord_dim(0),num_local_coords(0), num_global_coords(0),
06066       initial_mj_gnos(NULL), mj_coordinates(NULL),
06067                         num_weights_per_coord(0),
06068       mj_uniform_weights(NULL), mj_weights(NULL),
06069                         mj_uniform_parts(NULL),
06070       mj_part_sizes(NULL),
06071                         distribute_points_on_cut_lines(true),
06072       max_concurrent_part_calculation(1),
06073                         check_migrate_avoid_migration_option(0),
06074       minimum_migration_imbalance(0.30),
06075                         mj_keep_part_boxes(0), num_threads(1), mj_run_as_rcb(0)
06076     {}
06077     ~Zoltan2_AlgMJ(){}
06078 
06085     void partition(const RCP<PartitioningSolution<Adapter> > &solution);
06086 
06087     mj_part_t pointAssign(int dim, mj_scalar_t *point) const;
06088 };
06089 
06090 
06100 template <typename Adapter>
06101 void Zoltan2_AlgMJ<Adapter>::partition(
06102   const RCP<PartitioningSolution<Adapter> > &solution
06103 )
06104 {
06105     this->set_up_partitioning_data(solution);
06106     this->set_input_parameters(this->mj_env->getParameters());
06107     if (this->mj_keep_part_boxes){
06108       this->mj_partitioner.set_to_keep_part_boxes();
06109     }
06110     this->mj_partitioner.set_partitioning_parameters(
06111         this->distribute_points_on_cut_lines,
06112         this->max_concurrent_part_calculation,
06113         this->check_migrate_avoid_migration_option,
06114         this->minimum_migration_imbalance);
06115 
06116     mj_part_t *result_assigned_part_ids = NULL;
06117     mj_gno_t *result_mj_gnos = NULL;
06118     this->mj_partitioner.multi_jagged_part(
06119         this->mj_env,
06120         this->mj_problemComm,
06121 
06122         this->imbalance_tolerance,
06123         this->num_global_parts,
06124         this->part_no_array,
06125         this->recursion_depth,
06126 
06127         this->coord_dim,
06128         this->num_local_coords,
06129         this->num_global_coords,
06130         this->initial_mj_gnos,
06131         this->mj_coordinates,
06132 
06133         this->num_weights_per_coord,
06134         this->mj_uniform_weights,
06135         this->mj_weights,
06136         this->mj_uniform_parts,
06137         this->mj_part_sizes,
06138 
06139         result_assigned_part_ids,
06140           result_mj_gnos
06141         );
06142 
06143     ArrayRCP<const mj_gno_t> gnoList = arcp(result_mj_gnos, 0,
06144                                             this->num_local_coords, true);
06145     ArrayRCP<mj_part_t> partId = arcp(result_assigned_part_ids, 0,
06146                                       this->num_local_coords, true);
06147     solution->setParts(gnoList, partId, true);
06148     if (this->mj_keep_part_boxes){
06149       RCP<mj_partBoxVector_t> output_part_boxes = 
06150                              this->mj_partitioner.get_part_boxes();
06151         solution->setPartBoxes(output_part_boxes);
06152     }
06153     this->free_work_memory();
06154 }
06155 
06156 /* \brief Freeing the memory allocated.
06157  * */
06158 template <typename Adapter>
06159 void Zoltan2_AlgMJ<Adapter>::free_work_memory(){
06160   freeArray<mj_scalar_t *>(this->mj_coordinates);
06161   freeArray<mj_scalar_t *>(this->mj_weights);
06162   freeArray<bool>(this->mj_uniform_parts);
06163   freeArray<mj_scalar_t *>(this->mj_part_sizes);
06164   freeArray<bool>(this->mj_uniform_weights);
06165 
06166 }
06167 
06168 /* \brief Sets the partitioning data for multijagged algorithm.
06169  * */
06170 template <typename Adapter>
06171 void Zoltan2_AlgMJ<Adapter>::set_up_partitioning_data(
06172   const RCP<PartitioningSolution<Adapter> > &solution
06173 )
06174 {
06175   this->coord_dim = this->mj_coords->getCoordinateDim();
06176   this->num_weights_per_coord = this->mj_coords->getNumWeightsPerCoordinate();
06177   this->num_local_coords = this->mj_coords->getLocalNumCoordinates();
06178   this->num_global_coords = this->mj_coords->getGlobalNumCoordinates();
06179   int criteria_dim = (this->num_weights_per_coord ? this->num_weights_per_coord : 1);
06180 
06181   // From the Solution we get part information.
06182   // If the part sizes for a given criteria are not uniform,
06183   // then they are values that sum to 1.0.
06184   this->num_global_parts = solution->getTargetGlobalNumberOfParts();
06185   //allocate only two dimensional pointer.
06186   //raw pointer addresess will be obtained from multivector.
06187   this->mj_coordinates = allocMemory<mj_scalar_t *>(this->coord_dim);
06188   this->mj_weights = allocMemory<mj_scalar_t *>(criteria_dim);
06189 
06190   //if the partitioning results are to be uniform.
06191   this->mj_uniform_parts = allocMemory< bool >(criteria_dim);
06192   //if in a criteria dimension, uniform part is false this shows ratios of
06193   //the target part weights.
06194   this->mj_part_sizes =  allocMemory<mj_scalar_t *>(criteria_dim);
06195   //if the weights of coordinates are uniform in a criteria dimension.
06196   this->mj_uniform_weights = allocMemory< bool >(criteria_dim);
06197 
06198   typedef StridedData<mj_lno_t, mj_scalar_t> input_t;
06199   ArrayView<const mj_gno_t> gnos;
06200   ArrayView<input_t> xyz;
06201   ArrayView<input_t> wgts;
06202 
06203   this->mj_coords->getCoordinates(gnos, xyz, wgts);
06204   //obtain global ids.
06205   ArrayView<const mj_gno_t> mj_gnos = gnos;
06206   this->initial_mj_gnos = mj_gnos.getRawPtr();
06207 
06208   //extract coordinates from multivector.
06209   for (int dim=0; dim < this->coord_dim; dim++){
06210     ArrayRCP<const mj_scalar_t> ar;
06211     xyz[dim].getInputArray(ar);
06212     //multiJagged coordinate values assignment
06213     this->mj_coordinates[dim] =  (mj_scalar_t *)ar.getRawPtr();
06214   }
06215 
06216   //if no weights are provided set uniform weight.
06217   if (this->num_weights_per_coord == 0){
06218     this->mj_uniform_weights[0] = true;
06219     this->mj_weights[0] = NULL;
06220   }
06221   else{
06222     //if weights are provided get weights for all weight indices
06223     for (int wdim = 0; wdim < this->num_weights_per_coord; wdim++){
06224       ArrayRCP<const mj_scalar_t> ar;
06225       wgts[wdim].getInputArray(ar);
06226       this->mj_uniform_weights[wdim] = false;
06227       this->mj_weights[wdim] = (mj_scalar_t *) ar.getRawPtr();
06228     }
06229   }
06230 
06231   for (int wdim = 0; wdim < criteria_dim; wdim++){
06232     if (solution->criteriaHasUniformPartSizes(wdim)){
06233       this->mj_uniform_parts[wdim] = true;
06234       this->mj_part_sizes[wdim] = NULL;
06235     }
06236     else{
06237       std::cerr << "MJ does not support non uniform target part weights" << std::endl;
06238       exit(1);
06239     }
06240   }
06241 }
06242 
/* \brief Sets the partitioning parameters for multijagged algorithm.
 *
 * Reads MJ-specific options from the parameter list.  The user-facing
 * imbalance values are shifted: the stored value is (input - 1.0),
 * e.g. an input of 1.03 is stored as 0.03.
 *
 * \param pl: is the parameter list provided to zoltan2 call
 * */
template <typename Adapter>
void Zoltan2_AlgMJ<Adapter>::set_input_parameters(const Teuchos::ParameterList &pl){

  const Teuchos::ParameterEntry *pe = pl.getEntryPtr("imbalance_tolerance");
  if (pe){
    double tol;
    tol = pe->getValue(&tol);
    //stored as (input - 1.0); e.g. 1.03 becomes 0.03.
    this->imbalance_tolerance = tol - 1.0;
  }

    // TODO: May be a more relaxed tolerance is needed. RCB uses 10%
  if (this->imbalance_tolerance <= 0)
    this->imbalance_tolerance= 10e-4; //fallback: 10e-4 == 1e-3

  //if an input partitioning array is provided.
  this->part_no_array = NULL;
  //the length of the input partitioning array.
  this->recursion_depth = 0;

  if (pl.getPtr<Array <mj_part_t> >("mj_parts")){
    //the array is borrowed from the parameter list entry, not copied.
    this->part_no_array = (mj_part_t *) pl.getPtr<Array <mj_part_t> >("mj_parts")->getRawPtr();
    //recursion depth is one less than the array length.
    this->recursion_depth = pl.getPtr<Array <mj_part_t> >("mj_parts")->size() - 1;
    this->mj_env->debug(2, "mj_parts provided by user");
  }

  //get mj specific parameters.
  this->distribute_points_on_cut_lines = true;
  this->max_concurrent_part_calculation = 1;

  this->mj_run_as_rcb = 0;
  int mj_user_recursion_depth = -1;
  this->mj_keep_part_boxes = 0;
  this->check_migrate_avoid_migration_option = 0;
  this->minimum_migration_imbalance = 0.35;

  pe = pl.getEntryPtr("mj_minimum_migration_imbalance");
  if (pe){
    double imb;
    imb = pe->getValue(&imb);
    //same shift as imbalance_tolerance: stored as (input - 1.0).
    this->minimum_migration_imbalance = imb - 1.0;
  }

  pe = pl.getEntryPtr("mj_migration_option");
  if (pe){
    this->check_migrate_avoid_migration_option = pe->getValue(&this->check_migrate_avoid_migration_option);
  }else {
    this->check_migrate_avoid_migration_option = 0;
  }
  //NOTE(review): values greater than 1 are collapsed to -1; the intent
  //is not visible here -- confirm against the migration code.
  if (this->check_migrate_avoid_migration_option > 1) this->check_migrate_avoid_migration_option = -1;


  pe = pl.getEntryPtr("mj_concurrent_part_count");
  if (pe){
    this->max_concurrent_part_calculation = pe->getValue(&this->max_concurrent_part_calculation);
  }else {
    this->max_concurrent_part_calculation = 1; // Set to 1 if not provided.
  }

  pe = pl.getEntryPtr("mj_keep_part_boxes");
  if (pe){
    this->mj_keep_part_boxes = pe->getValue(&this->mj_keep_part_boxes);
  }else {
    this->mj_keep_part_boxes = 0; // Default: do not keep part boxes.
  }

        // For now, need keep_part_boxes to do pointAssign and boxAssign.
  // pe = pl.getEntryPtr("keep_cuts");
  // if (pe){
  //  int tmp = pe->getValue(&tmp);
  //  if (tmp) this->mj_keep_part_boxes = 1;
        // }

  //need to keep part boxes if mapping type is geometric.
  if (this->mj_keep_part_boxes == 0){
    pe = pl.getEntryPtr("mapping_type");
    if (pe){
      int mapping_type = -1;
      mapping_type = pe->getValue(&mapping_type);
      if (mapping_type == 0){
        mj_keep_part_boxes  = 1;
      }
    }
  }

  //whether to run MJ as RCB; if set, the recursion depth is
  //maximized below (one bisection per step).
  pe = pl.getEntryPtr("mj_enable_rcb");
  if (pe){
    this->mj_run_as_rcb = pe->getValue(&this->mj_run_as_rcb);
  }else {
    this->mj_run_as_rcb = 0; // Default: do not run as RCB.
  }

  pe = pl.getEntryPtr("mj_recursion_depth");
  if (pe){
    mj_user_recursion_depth = pe->getValue(&mj_user_recursion_depth);
  }else {
    mj_user_recursion_depth = -1; // Set to invalid value
  }

  //rectilinear == 1 disables distributing points that lie exactly on a
  //cut line to different parts.
  int val = 0;
  pe = pl.getEntryPtr("rectilinear");
  if (pe) val = pe->getValue(&val);
  if (val == 1){
    this->distribute_points_on_cut_lines = false;
  } else {
    this->distribute_points_on_cut_lines = true;
  }

  if (this->mj_run_as_rcb){
    //RCB mode: bisect at every step, so depth = ceil(log2(num parts)).
    mj_user_recursion_depth = (int)(ceil(log ((this->num_global_parts)) / log (2.0)));
  }
  if (this->recursion_depth < 1){
    if (mj_user_recursion_depth > 0){
      this->recursion_depth = mj_user_recursion_depth;
    }
    else {
      //default: one partitioning step per coordinate dimension.
      this->recursion_depth = this->coord_dim;
    }
  }

  //query the available OpenMP thread count (1 when OpenMP is disabled).
  this->num_threads = 1;
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel
  {
    this->num_threads = omp_get_num_threads();
  }
#endif

}
06375 
06377 template <typename Adapter>
06378 typename Adapter::part_t Zoltan2_AlgMJ<Adapter>::pointAssign(
06379   int dim, 
06380   typename Adapter::scalar_t *point) const
06381 {
06382 
06383   // TODO:  Implement with cuts rather than boxes to reduce algorithmic
06384   // TODO:  complexity.
06385 
06386   if (this->mj_keep_part_boxes) {
06387     typename Adapter::part_t foundPart = -1;
06388 
06389     // Get vector of part boxes
06390     RCP<mj_partBoxVector_t> partBoxes;
06391     try {
06392       partBoxes = this->mj_partitioner.get_part_boxes();
06393     }
06394     Z2_FORWARD_EXCEPTIONS;
06395 
06396     size_t nBoxes = (*partBoxes).size();
06397     if (nBoxes == 0) {
06398       throw std::logic_error("no part boxes exist");
06399     }
06400 //{
06401 //int me;
06402 //MPI_Comm_rank(MPI_COMM_WORLD, &me);
06403 //for (size_t i = 0; i < nBoxes; i++)
06404 //  printf("%d KDDKDD BOX %d PART %d (%f %f) x (%f %f)\n", me, i, (*partBoxes)[i].getpId(), (*partBoxes)[i].getlmins()[0], (*partBoxes)[i].getlmins()[1], (*partBoxes)[i].getlmaxs()[0], (*partBoxes)[i].getlmaxs()[1]);
06405 //}
06406 
06407 
06408     // Determine whether the point is within the global domain
06409     RCP<mj_partBox_t> globalBox = this->mj_partitioner.get_global_box();
06410 
06411     if (globalBox->pointInBox(dim, point)) {
06412 
06413       // point is in the global domain; determine in which part it is.
06414       size_t i;
06415       for (i = 0; i < nBoxes; i++) {
06416         try {
06417           if ((*partBoxes)[i].pointInBox(dim, point)) {
06418             foundPart = (*partBoxes)[i].getpId();
06419 //          std::cout << "Point (";
06420 //          for (int j = 0; j < dim; j++) std::cout << point[j] << " ";
06421 //          std::cout << ") found in box " << i << " part " << foundPart 
06422 //                    << std::endl;
06423 //          (*partBoxes)[i].print();
06424             break;
06425           }
06426         }
06427         Z2_FORWARD_EXCEPTIONS;
06428       }
06429 
06430       if (i == nBoxes) {
06431         // This error should never occur
06432         std::ostringstream oss;
06433         oss << "Point (";
06434         for (int j = 0; j < dim; j++) oss << point[j] << " ";
06435         oss << ") not found in domain";
06436         throw std::logic_error(oss.str());
06437       }
06438     }
06439     
06440     else {
06441       // Point is outside the global domain.  
06442       // Determine to which part it is closest.
06443       // TODO:  with cuts, would not need this special case
06444 
06445       size_t closestBox = 0;
06446       mj_scalar_t minDistance = std::numeric_limits<mj_scalar_t>::max();
06447       mj_scalar_t *centroid = new mj_scalar_t[dim];
06448       for (size_t i = 0; i < nBoxes; i++) {
06449         (*partBoxes)[i].computeCentroid(centroid);
06450         mj_scalar_t sum = 0.;
06451         mj_scalar_t diff;
06452         for (int j = 0; j < dim; j++) {
06453           diff = centroid[j] - point[j];
06454           sum += diff * diff;
06455         }
06456         if (sum < minDistance) {
06457           minDistance = sum;
06458           closestBox = i;
06459         }
06460       }
06461       foundPart = (*partBoxes)[closestBox].getpId();
06462       delete [] centroid;
06463     }
06464 
06465     return foundPart;
06466   }
06467   else {
06468     throw std::logic_error("need to use keep_cuts parameter for pointAssign");
06469   }
06470 }
06471 
06472 } // namespace Zoltan2
06473 
06474 #endif