Tpetra Matrix/Vector Services  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Defines
TpetraExt_MatrixMatrix_def.hpp
Go to the documentation of this file.
00001 // @HEADER
00002 // ***********************************************************************
00003 //
00004 //          Tpetra: Templated Linear Algebra Services Package
00005 //                 Copyright (2008) Sandia Corporation
00006 //
00007 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00008 // the U.S. Government retains certain rights in this software.
00009 //
00010 // Redistribution and use in source and binary forms, with or without
00011 // modification, are permitted provided that the following conditions are
00012 // met:
00013 //
00014 // 1. Redistributions of source code must retain the above copyright
00015 // notice, this list of conditions and the following disclaimer.
00016 //
00017 // 2. Redistributions in binary form must reproduce the above copyright
00018 // notice, this list of conditions and the following disclaimer in the
00019 // documentation and/or other materials provided with the distribution.
00020 //
00021 // 3. Neither the name of the Corporation nor the names of the
00022 // contributors may be used to endorse or promote products derived from
00023 // this software without specific prior written permission.
00024 //
00025 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00026 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00028 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00029 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00030 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00031 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00032 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00033 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00034 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00035 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00036 //
00037 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
00038 //
00039 // ************************************************************************
00040 // @HEADER
00041 
00042 #ifndef TPETRA_MATRIXMATRIX_DEF_HPP
00043 #define TPETRA_MATRIXMATRIX_DEF_HPP
00044 
00045 #include "TpetraExt_MatrixMatrix_decl.hpp"
00046 #include "Teuchos_VerboseObject.hpp"
00047 #include "Teuchos_Array.hpp"
00048 #include "Tpetra_Util.hpp"
00049 #include "Tpetra_ConfigDefs.hpp"
00050 #include "Tpetra_CrsMatrix.hpp"
00051 #include "TpetraExt_MMHelpers_def.hpp"
00052 #include "Tpetra_RowMatrixTransposer.hpp"
00053 #include "Tpetra_ConfigDefs.hpp"
00054 #include "Tpetra_Map.hpp"
00055 #include "Tpetra_Export.hpp"
00056 #include "Tpetra_Import_Util.hpp"
00057 #include "Tpetra_Import_Util2.hpp"
00058 #include <algorithm>
00059 #include "Teuchos_FancyOStream.hpp"
00060 
00061 //#define COMPUTE_MMM_STATISTICS
00062 
00063 
00069 namespace Tpetra {
00070 
00071 
00072 namespace MatrixMatrix{
00073 
00074 
00075 template <class Scalar,
00076           class LocalOrdinal,
00077           class GlobalOrdinal,
00078           class Node>
00079 void Multiply(
00080   const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
00081   bool transposeA,
00082   const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
00083   bool transposeB,
00084   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
00085   bool call_FillComplete_on_result)
00086 {
00087 #ifdef HAVE_TPETRA_MMM_TIMINGS
00088   using Teuchos::TimeMonitor;
00089   Teuchos::RCP<Teuchos::TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: MMM All Setup")));
00090 #endif
00091 
00092   //TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
00093   typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> Matrix_t;
00094   //
00095   //This method forms the matrix-matrix product C = op(A) * op(B), where
00096   //op(A) == A   if transposeA is false,
00097   //op(A) == A^T if transposeA is true,
00098   //and similarly for op(B).
00099   //
00100 
00101   //A and B should already be Filled.
00102   //(Should we go ahead and call FillComplete() on them if necessary?
00103   // or error out? For now, we choose to error out.)
00104   TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, "MatrixMatrix::Multiply(): Matrix A is not fill complete.");
00105   TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, "MatrixMatrix::Multiply(): Matrix B is not fill complete.");
00106   TEUCHOS_TEST_FOR_EXCEPTION(C.isLocallyIndexed() , std::runtime_error, "MatrixMatrix::Multiply(): Result matrix C must not be locally indexed.");
00107 
00108   //Convience typedefs
00109   typedef CrsMatrixStruct<
00110     Scalar,
00111     LocalOrdinal,
00112     GlobalOrdinal,
00113     Node> CrsMatrixStruct_t;
00114   typedef Map<LocalOrdinal, GlobalOrdinal, Node> Map_t;
00115 
00116   RCP<const Matrix_t > Aprime = null;
00117   RCP<const Matrix_t > Bprime = null;
00118 
00119   // Is this a "clean" matrix
00120   bool NewFlag=!C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
00121 
00122   bool use_optimized_ATB=false;
00123   if(transposeA && !transposeB && call_FillComplete_on_result && NewFlag) {
00124     use_optimized_ATB=true;
00125   }
00126 #ifdef USE_OLD_TRANSPOSE // NOTE: For Grey Ballard's use.  Remove this later.
00127   use_optimized_ATB=false;
00128 #endif
00129 
00130 
00131   if(!use_optimized_ATB && transposeA) {
00132     RowMatrixTransposer<Scalar, LocalOrdinal, GlobalOrdinal, Node> at (Teuchos::rcpFromRef (A));
00133     Aprime = at.createTranspose();
00134   }
00135   else{
00136     Aprime = rcpFromRef(A);
00137   }
00138 
00139   if(transposeB){
00140     RowMatrixTransposer<Scalar, LocalOrdinal, GlobalOrdinal, Node> bt (Teuchos::rcpFromRef (B));
00141     Bprime=bt.createTranspose();
00142   }
00143   else{
00144     Bprime = rcpFromRef(B);
00145   }
00146 
00147 
00148   //now check size compatibility
00149   global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
00150   global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
00151   global_size_t Aouter = transposeA ? numACols : A.getGlobalNumRows();
00152   global_size_t Bouter = transposeB ? B.getGlobalNumRows() : numBCols;
00153   global_size_t Ainner = transposeA ? A.getGlobalNumRows() : numACols;
00154   global_size_t Binner = transposeB ? numBCols : B.getGlobalNumRows();
00155   TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
00156     "MatrixMatrix::Multiply: ERROR, inner dimensions of op(A) and op(B) "
00157     "must match for matrix-matrix product. op(A) is "
00158     <<Aouter<<"x"<<Ainner << ", op(B) is "<<Binner<<"x"<<Bouter<<std::endl);
00159 
00160   //The result matrix C must at least have a row-map that reflects the
00161   //correct row-size. Don't check the number of columns because rectangular
00162   //matrices which were constructed with only one map can still end up
00163   //having the correct capacity and dimensions when filled.
00164   TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
00165     "MatrixMatrix::Multiply: ERROR, dimensions of result C must "
00166     "match dimensions of op(A) * op(B). C has "<<C.getGlobalNumRows()
00167      << " rows, should have at least "<<Aouter << std::endl);
00168 
00169   //It doesn't matter whether C is already Filled or not. If it is already
00170   //Filled, it must have space allocated for the positions that will be
00171   //referenced in forming C = op(A)*op(B). If it doesn't have enough space,
00172   //we'll error out later when trying to store result values.
00173 
00174   // CGB: However, matrix must be in active-fill
00175   TEUCHOS_TEST_FOR_EXCEPT( C.isFillActive() == false );
00176 
00177   //We're going to need to import remotely-owned sections of A and/or B
00178   //if more than 1 processor is performing this run, depending on the scenario.
00179   int numProcs = A.getComm()->getSize();
00180 
00181   //Declare a couple of structs that will be used to hold views of the data
00182   //of A and B, to be used for fast access during the matrix-multiplication.
00183   CrsMatrixStruct_t Aview;
00184   CrsMatrixStruct_t Bview;
00185 
00186   RCP<const Map_t > targetMap_A = Aprime->getRowMap();
00187   RCP<const Map_t > targetMap_B = Bprime->getRowMap();
00188 
00189 #ifdef HAVE_TPETRA_MMM_TIMINGS
00190   MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: MMM All I&X")));
00191 #endif
00192 
00193   //Now import any needed remote rows and populate the Aview struct.
00194   // NOTE: We assert that an import isn't needed --- since we do the transpose above to handle that.
00195   if(!use_optimized_ATB) {
00196     RCP<const Import<LocalOrdinal,GlobalOrdinal, Node> > dummyImporter;
00197     MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview,dummyImporter,true);
00198   }
00199 
00200 
00201   //We will also need local access to all rows of B that correspond to the
00202   //column-map of op(A).
00203   if (numProcs > 1) {
00204     targetMap_B = Aprime->getColMap(); //colmap_op_A;
00205   }
00206 
00207   //Now import any needed remote rows and populate the Bview struct.
00208   if(!use_optimized_ATB)
00209     MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter());
00210 
00211 #ifdef HAVE_TPETRA_MMM_TIMINGS
00212   MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: MMM All Multiply")));
00213 #endif
00214 
00215 
00216   //Now call the appropriate method to perform the actual multiplication.
00217   CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
00218 
00219   if(use_optimized_ATB) {
00220     MMdetails::mult_AT_B_newmatrix(A, B, C);
00221   }
00222   else if(call_FillComplete_on_result && NewFlag ) {
00223     MMdetails::mult_A_B_newmatrix(Aview, Bview, C);
00224   }
00225   else {
00226     MMdetails::mult_A_B(Aview, Bview, crsmat);
00227 #ifdef HAVE_TPETRA_MMM_TIMINGS
00228     MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: MMM All FillComplete")));
00229 #endif
00230     if (call_FillComplete_on_result) {
00231       //We'll call FillComplete on the C matrix before we exit, and give
00232       //it a domain-map and a range-map.
00233       //The domain-map will be the domain-map of B, unless
00234       //op(B)==transpose(B), in which case the range-map of B will be used.
00235       //The range-map will be the range-map of A, unless
00236       //op(A)==transpose(A), in which case the domain-map of A will be used.
00237       if (!C.isFillComplete()) {
00238         C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
00239       }
00240     }
00241   }
00242 
00243 }
00244 
00245 
00246 template <class Scalar,
00247           class LocalOrdinal,
00248           class GlobalOrdinal,
00249           class Node>
00250 void Jacobi(Scalar omega,
00251             const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
00252             const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
00253             const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
00254             CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
00255             bool call_FillComplete_on_result)
00256 {
00257 #ifdef HAVE_TPETRA_MMM_TIMINGS
00258   using Teuchos::TimeMonitor;
00259   Teuchos::RCP<Teuchos::TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: Jacobi All Setup")));
00260 #endif
00261   typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> Matrix_t;
00262 
00263   //A and B should already be Filled.
00264   //(Should we go ahead and call FillComplete() on them if necessary?
00265   // or error out? For now, we choose to error out.)
00266   TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, "MatrixMatrix::Multiply(): Matrix A is not fill complete.");
00267   TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, "MatrixMatrix::Multiply(): Matrix B is not fill complete.");
00268   TEUCHOS_TEST_FOR_EXCEPTION(C.isLocallyIndexed() , std::runtime_error, "MatrixMatrix::Multiply(): Result matrix C must not be locally indexed.");
00269 
00270   //Convience typedefs
00271   typedef CrsMatrixStruct<
00272     Scalar,
00273     LocalOrdinal,
00274     GlobalOrdinal,
00275     Node> CrsMatrixStruct_t;
00276   typedef Map<LocalOrdinal, GlobalOrdinal, Node> Map_t;
00277 
00278   RCP<const Matrix_t > Aprime = rcpFromRef(A);
00279   RCP<const Matrix_t > Bprime = rcpFromRef(B);
00280 
00281   //now check size compatibility
00282   global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
00283   global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
00284   global_size_t Aouter = A.getGlobalNumRows();
00285   global_size_t Bouter = numBCols;
00286   global_size_t Ainner = numACols;
00287   global_size_t Binner = B.getGlobalNumRows();
00288   TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
00289     "MatrixMatrix::Jacobi: ERROR, inner dimensions of op(A) and op(B) "
00290     "must match for matrix-matrix product. op(A) is "
00291     <<Aouter<<"x"<<Ainner << ", op(B) is "<<Binner<<"x"<<Bouter<<std::endl);
00292 
00293   //The result matrix C must at least have a row-map that reflects the
00294   //correct row-size. Don't check the number of columns because rectangular
00295   //matrices which were constructed with only one map can still end up
00296   //having the correct capacity and dimensions when filled.
00297   TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
00298     "MatrixMatrix::Multiply: ERROR, dimensions of result C must "
00299     "match dimensions of op(A) * op(B). C has "<<C.getGlobalNumRows()
00300      << " rows, should have at least "<<Aouter << std::endl);
00301 
00302   //It doesn't matter whether C is already Filled or not. If it is already
00303   //Filled, it must have space allocated for the positions that will be
00304   //referenced in forming C = op(A)*op(B). If it doesn't have enough space,
00305   //we'll error out later when trying to store result values.
00306 
00307   // CGB: However, matrix must be in active-fill
00308   TEUCHOS_TEST_FOR_EXCEPT( C.isFillActive() == false );
00309 
00310   //We're going to need to import remotely-owned sections of A and/or B
00311   //if more than 1 processor is performing this run, depending on the scenario.
00312   int numProcs = A.getComm()->getSize();
00313 
00314   //Declare a couple of structs that will be used to hold views of the data
00315   //of A and B, to be used for fast access during the matrix-multiplication.
00316   CrsMatrixStruct_t Aview;
00317   CrsMatrixStruct_t Bview;
00318 
00319   RCP<const Map_t > targetMap_A = Aprime->getRowMap();
00320   RCP<const Map_t > targetMap_B = Bprime->getRowMap();
00321 
00322 #ifdef HAVE_TPETRA_MMM_TIMINGS
00323   MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: Jacobi All I&X")));
00324 #endif
00325 
00326   //Now import any needed remote rows and populate the Aview struct.
00327   MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview);
00328 
00329   //We will also need local access to all rows of B that correspond to the
00330   //column-map of op(A).
00331   if (numProcs > 1) {
00332     targetMap_B = Aprime->getColMap(); //colmap_op_A;
00333   }
00334 
00335   //Now import any needed remote rows and populate the Bview struct.
00336   MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter());
00337 
00338 #ifdef HAVE_TPETRA_MMM_TIMINGS
00339   MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: Jacobi All Multiply")));
00340 #endif
00341 
00342 
00343   //Now call the appropriate method to perform the actual multiplication.
00344   CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
00345 
00346   // Is this a "clean" matrix
00347   bool NewFlag=!C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
00348 
00349   if(call_FillComplete_on_result && NewFlag ) {
00350     MMdetails::jacobi_A_B_newmatrix(omega,Dinv,Aview, Bview, C);
00351   }
00352   else {
00353     TEUCHOS_TEST_FOR_EXCEPTION(
00354       true, std::runtime_error,
00355       "jacobi_A_B_general not implemented");
00356     // FIXME (mfh 03 Apr 2014) This statement is unreachable, so I'm
00357     // commenting it out.
00358 // #ifdef HAVE_TPETRA_MMM_TIMINGS
00359 //     MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: Jacobi FillComplete")));
00360 // #endif
00361     // FIXME (mfh 03 Apr 2014) This statement is unreachable, so I'm
00362     // commenting it out.
00363     // if (call_FillComplete_on_result) {
00364     //   //We'll call FillComplete on the C matrix before we exit, and give
00365     //   //it a domain-map and a range-map.
00366     //   //The domain-map will be the domain-map of B, unless
00367     //   //op(B)==transpose(B), in which case the range-map of B will be used.
00368     //   //The range-map will be the range-map of A, unless
00369     //   //op(A)==transpose(A), in which case the domain-map of A will be used.
00370     //   if (!C.isFillComplete()) {
00371     //     C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
00372     //   }
00373     // }
00374   }
00375 }
00376 
00377 
00378 
00379 template <class Scalar,
00380           class LocalOrdinal,
00381           class GlobalOrdinal,
00382           class Node>
00383 void Add(
00384   const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
00385   bool transposeA,
00386   Scalar scalarA,
00387   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
00388   Scalar scalarB )
00389 {
00390   TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
00391     "MatrixMatrix::Add ERROR, input matrix A.isFillComplete() is false; it is required to be true. (Result matrix B is not required to be isFillComplete()).");
00392   TEUCHOS_TEST_FOR_EXCEPTION(B.isFillComplete() , std::runtime_error,
00393     "MatrixMatrix::Add ERROR, input matrix B must not be fill complete!");
00394   TEUCHOS_TEST_FOR_EXCEPTION(B.isStaticGraph() , std::runtime_error,
00395     "MatrixMatrix::Add ERROR, input matrix B must not have static graph!");
00396   TEUCHOS_TEST_FOR_EXCEPTION(B.isLocallyIndexed() , std::runtime_error,
00397     "MatrixMatrix::Add ERROR, input matrix B must not be locally indexed!");
00398   TEUCHOS_TEST_FOR_EXCEPTION(B.getProfileType()!=DynamicProfile, std::runtime_error,
00399     "MatrixMatrix::Add ERROR, input matrix B must have a dynamic profile!");
00400   //Convience typedef
00401   typedef CrsMatrix<
00402     Scalar,
00403     LocalOrdinal,
00404     GlobalOrdinal,
00405     Node> CrsMatrix_t;
00406   RCP<const CrsMatrix_t> Aprime = null;
00407   if( transposeA ){
00408           RowMatrixTransposer<Scalar, LocalOrdinal, GlobalOrdinal, Node> theTransposer(Teuchos::rcpFromRef (A));
00409     Aprime = theTransposer.createTranspose();
00410   }
00411   else{
00412     Aprime = rcpFromRef(A);
00413   }
00414   size_t a_numEntries;
00415   Array<GlobalOrdinal> a_inds(A.getNodeMaxNumRowEntries());
00416   Array<Scalar> a_vals(A.getNodeMaxNumRowEntries());
00417   GlobalOrdinal row;
00418 
00419   if(scalarB != ScalarTraits<Scalar>::one()){
00420     B.scale(scalarB);
00421   }
00422 
00423   bool bFilled = B.isFillComplete();
00424   size_t numMyRows = B.getNodeNumRows();
00425   if(scalarA != ScalarTraits<Scalar>::zero()){
00426     for(LocalOrdinal i = 0; (size_t)i < numMyRows; ++i){
00427       row = B.getRowMap()->getGlobalElement(i);
00428       Aprime->getGlobalRowCopy(row, a_inds(), a_vals(), a_numEntries);
00429       if(scalarA != ScalarTraits<Scalar>::one()){
00430         for(size_t j =0; j<a_numEntries; ++j){
00431           a_vals[j] *= scalarA;
00432         }
00433       }
00434       if(bFilled){
00435         B.sumIntoGlobalValues(row, a_inds(0,a_numEntries), a_vals(0,a_numEntries));
00436       }
00437       else{
00438         B.insertGlobalValues(row, a_inds(0,a_numEntries), a_vals(0,a_numEntries));
00439       }
00440 
00441     }
00442   }
00443 }
00444 
00445 
00446 template <class Scalar,
00447           class LocalOrdinal,
00448           class GlobalOrdinal,
00449           class Node>
00450 Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
00451 add (const Scalar& alpha,
00452      const bool transposeA,
00453      const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
00454      const Scalar& beta,
00455      const bool transposeB,
00456      const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
00457      const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
00458      const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
00459      const Teuchos::RCP<Teuchos::ParameterList>& params)
00460 {
00461   using Teuchos::RCP;
00462   using Teuchos::rcp;
00463   using Teuchos::rcpFromRef;
00464   using Teuchos::rcp_dynamic_cast;
00465   using Teuchos::rcp_implicit_cast;
00466   typedef RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> row_matrix_type;
00467   typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crs_matrix_type;
00468   typedef RowMatrixTransposer<Scalar, LocalOrdinal, GlobalOrdinal, Node> transposer_type;
00469 
00470   TEUCHOS_TEST_FOR_EXCEPTION(
00471     ! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
00472     "Tpetra::MatrixMatrix::add: A and B must both be fill complete.");
00473 
00474 #ifdef HAVE_TPETRA_DEBUG
00475   // The matrices don't have domain or range Maps unless they are fill complete.
00476   if (A.isFillComplete () && B.isFillComplete ()) {
00477     const bool domainMapsSame =
00478       (! transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getDomainMap ()))) ||
00479       (! transposeA && transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ()))) ||
00480       (transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ())));
00481     TEUCHOS_TEST_FOR_EXCEPTION(
00482       domainMapsSame, std::invalid_argument,
00483       "Tpetra::MatrixMatrix::add: The domain Maps of Op(A) and Op(B) are not the same.");
00484 
00485     const bool rangeMapsSame =
00486       (! transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getRangeMap ()))) ||
00487       (! transposeA && transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ()))) ||
00488       (transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ())));
00489     TEUCHOS_TEST_FOR_EXCEPTION(
00490       rangeMapsSame, std::invalid_argument,
00491       "Tpetra::MatrixMatrix::add: The range Maps of Op(A) and Op(B) are not the same.");
00492   }
00493 #endif // HAVE_TPETRA_DEBUG
00494 
00495   // Form the explicit transpose of A if necessary.
00496   RCP<const crs_matrix_type> Aprime;
00497   if (transposeA) {
00498     transposer_type theTransposer (rcpFromRef (A));
00499     Aprime = theTransposer.createTranspose ();
00500   } else {
00501     Aprime = rcpFromRef (A);
00502   }
00503 
00504 #ifdef HAVE_TPETRA_DEBUG
00505   TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
00506     "Tpetra::MatrixMatrix::Add: Failed to compute Op(A).  "
00507     "Please report this bug to the Tpetra developers.");
00508 #endif // HAVE_TPETRA_DEBUG
00509 
00510   // Form the explicit transpose of B if necessary.
00511   RCP<const crs_matrix_type> Bprime;
00512   if (transposeB) {
00513     transposer_type theTransposer (rcpFromRef (B));
00514     Bprime = theTransposer.createTranspose ();
00515   } else {
00516     Bprime = rcpFromRef (B);
00517   }
00518 
00519 #ifdef HAVE_TPETRA_DEBUG
00520   TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
00521     "Tpetra::MatrixMatrix::Add: Failed to compute Op(B).  "
00522     "Please report this bug to the Tpetra developers.");
00523 
00524   TEUCHOS_TEST_FOR_EXCEPTION(
00525     ! Aprime->isFillComplete () || ! Bprime->isFillComplete (), std::invalid_argument,
00526     "Tpetra::MatrixMatrix::add: Aprime and Bprime must both be fill complete.  "
00527     "Please report this bug to the Tpetra developers.");
00528 #endif // HAVE_TPETRA_DEBUG
00529 
00530 
00531   RCP<row_matrix_type> C =
00532     Bprime->add (alpha, *rcp_implicit_cast<const row_matrix_type> (Aprime),
00533                  beta, domainMap, rangeMap, params);
00534   return rcp_dynamic_cast<crs_matrix_type> (C);
00535 }
00536 
00537 
00538 
00539 
00540 template <class Scalar,
00541           class LocalOrdinal,
00542           class GlobalOrdinal,
00543           class Node>
00544 void Add(
00545   const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
00546   bool transposeA,
00547   Scalar scalarA,
00548   const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
00549   bool transposeB,
00550   Scalar scalarB,
00551   RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > C)
00552 {
00553   using Teuchos::as;
00554   using Teuchos::Array;
00555   using Teuchos::ArrayRCP;
00556   using Teuchos::ArrayView;
00557   using Teuchos::RCP;
00558   using Teuchos::rcp;
00559   using Teuchos::rcp_dynamic_cast;
00560   using Teuchos::rcpFromRef;
00561   using Teuchos::tuple;
00562   using std::endl;
00563   //  typedef typename ArrayView<const Scalar>::size_type size_type;
00564   typedef Teuchos::ScalarTraits<Scalar> STS;
00565   typedef Map<LocalOrdinal, GlobalOrdinal, Node> map_type;
00566   //  typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
00567   //  typedef RowGraph<LocalOrdinal, GlobalOrdinal, Node> row_graph_type;
00568   //  typedef CrsGraph<LocalOrdinal, GlobalOrdinal, Node> crs_graph_type;
00569   typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crs_matrix_type;
00570   typedef RowMatrixTransposer<Scalar, LocalOrdinal, GlobalOrdinal, Node> transposer_type;
00571 
00572   TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
00573     "Tpetra::MatrixMatrix::Add: The case C == null does not actually work.  "
00574     "Fixing this will require an interface change.");
00575 
00576   TEUCHOS_TEST_FOR_EXCEPTION(
00577     ! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
00578     "Tpetra::MatrixMatrix::Add: Both input matrices must be fill complete "
00579     "before calling this function.");
00580 
00581 #ifdef HAVE_TPETRA_DEBUG
00582   {
00583     const bool domainMapsSame =
00584       (! transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getDomainMap ()))) ||
00585       (! transposeA && transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ()))) ||
00586       (transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ())));
00587     TEUCHOS_TEST_FOR_EXCEPTION(
00588       domainMapsSame, std::invalid_argument,
00589       "Tpetra::MatrixMatrix::Add: The domain Maps of Op(A) and Op(B) are not the same.");
00590 
00591     const bool rangeMapsSame =
00592       (! transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getRangeMap ()))) ||
00593       (! transposeA && transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ()))) ||
00594       (transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ())));
00595     TEUCHOS_TEST_FOR_EXCEPTION(
00596       rangeMapsSame, std::invalid_argument,
00597       "Tpetra::MatrixMatrix::Add: The range Maps of Op(A) and Op(B) are not the same.");
00598   }
00599 #endif // HAVE_TPETRA_DEBUG
00600 
00601   // Form the explicit transpose of A if necessary.
00602   RCP<const crs_matrix_type> Aprime;
00603   if (transposeA) {
00604     transposer_type theTransposer (rcpFromRef (A));
00605     Aprime = theTransposer.createTranspose ();
00606   } else {
00607     Aprime = rcpFromRef (A);
00608   }
00609 
00610 #ifdef HAVE_TPETRA_DEBUG
00611   TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
00612     "Tpetra::MatrixMatrix::Add: Failed to compute Op(A).  "
00613     "Please report this bug to the Tpetra developers.");
00614 #endif // HAVE_TPETRA_DEBUG
00615 
00616   // Form the explicit transpose of B if necessary.
00617   RCP<const crs_matrix_type> Bprime;
00618   if (transposeB) {
00619     transposer_type theTransposer (rcpFromRef (B));
00620     Bprime = theTransposer.createTranspose ();
00621   } else {
00622     Bprime = rcpFromRef (B);
00623   }
00624 
00625 #ifdef HAVE_TPETRA_DEBUG
00626   TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
00627     "Tpetra::MatrixMatrix::Add: Failed to compute Op(B).  "
00628     "Please report this bug to the Tpetra developers.");
00629 #endif // HAVE_TPETRA_DEBUG
00630 
00631   // Allocate or zero the entries of the result matrix.
00632   if (! C.is_null ()) {
00633     C->setAllToScalar (STS::zero ());
00634   } else {
00635 #if 0
00636     // If Aprime and Bprime have the same row Map, and if C is null,
00637     // we can optimize construction and fillComplete of C.  For now,
00638     // we just check pointer equality, to avoid the all-reduce in
00639     // isSameAs.  It may be worth that all-reduce to check, however.
00640     //if (Aprime->getRowMap ().getRawPtr () == Bprime->getRowMap ().getRawPtr ()) {
00641     if (Aprime->getRowMap ()->isSameAs (* (Bprime->getRowMap ())) {
00642       RCP<const map_type> rowMap = Aprime->getRowMap ();
00643 
00644       RCP<const crs_graph_type> A_graph =
00645         rcp_dynamic_cast<const crs_graph_type> (Aprime->getGraph (), true);
00646 #ifdef HAVE_TPETRA_DEBUG
00647       TEUCHOS_TEST_FOR_EXCEPTION(A_graph.is_null (), std::logic_error,
00648         "Tpetra::MatrixMatrix::Add: Graph of Op(A) is null.  "
00649         "Please report this bug to the Tpetra developers.");
00650 #endif // HAVE_TPETRA_DEBUG
00651       RCP<const crs_graph_type> B_graph =
00652         rcp_dynamic_cast<const crs_graph_type> (Bprime->getGraph (), true);
00653 #ifdef HAVE_TPETRA_DEBUG
00654       TEUCHOS_TEST_FOR_EXCEPTION(B_graph.is_null (), std::logic_error,
00655         "Tpetra::MatrixMatrix::Add: Graph of Op(B) is null.  "
00656         "Please report this bug to the Tpetra developers.");
00657 #endif // HAVE_TPETRA_DEBUG
00658       RCP<const map_type> A_colMap = A_graph->getColMap ();
00659 #ifdef HAVE_TPETRA_DEBUG
00660       TEUCHOS_TEST_FOR_EXCEPTION(A_colMap.is_null (), std::logic_error,
00661         "Tpetra::MatrixMatrix::Add: Column Map of Op(A) is null.  "
00662         "Please report this bug to the Tpetra developers.");
00663 #endif // HAVE_TPETRA_DEBUG
00664       RCP<const map_type> B_colMap = B_graph->getColMap ();
00665 #ifdef HAVE_TPETRA_DEBUG
00666       TEUCHOS_TEST_FOR_EXCEPTION(B_colMap.is_null (), std::logic_error,
00667         "Tpetra::MatrixMatrix::Add: Column Map of Op(B) is null.  "
00668         "Please report this bug to the Tpetra developers.");
00669       TEUCHOS_TEST_FOR_EXCEPTION(A_graph->getImporter ().is_null (),
00670         std::logic_error,
00671         "Tpetra::MatrixMatrix::Add: Op(A)'s Import is null.  "
00672         "Please report this bug to the Tpetra developers.");
00673       TEUCHOS_TEST_FOR_EXCEPTION(B_graph->getImporter ().is_null (),
00674         std::logic_error,
00675         "Tpetra::MatrixMatrix::Add: Op(B)'s Import is null.  "
00676         "Please report this bug to the Tpetra developers.");
00677 #endif // HAVE_TPETRA_DEBUG
00678 
00679       // Compute the (column Map and) Import of the matrix sum.
00680       RCP<const import_type> sumImport =
00681         A_graph->getImporter ()->setUnion (* (B_graph->getImporter ()));
00682       RCP<const map_type> C_colMap = sumImport->getTargetMap ();
00683 
00684       // First, count the number of entries in each row.  Then, go
00685       // back over the rows again, and compute the actual sum.
00686       // Remember that C may have a different column Map than Aprime
00687       // or Bprime, so its local indices may be different.  That's why
00688       // we have to convert from local to global indices.
00689 
00690       ArrayView<const LocalOrdinal> A_local_ind;
00691       Array<GlobalOrdinal> A_global_ind;
00692       ArrayView<const LocalOrdinal> B_local_ind;
00693       Array<GlobalOrdinal> B_global_ind;
00694 
00695       const size_t localNumRows = rowMap->getNodeNumElements ();
00696       ArrayRCP<size_t> numEntriesPerRow (localNumRows);
00697       // Compute the max number of entries in any row of A+B on this
00698       // process, so that we won't have to resize temporary arrays.
00699       size_t maxNumEntriesPerRow = 0;
00700       for (size_t localRow = 0; localRow < localNumRows; ++localRow) {
00701         // Get view of current row of A_graph, in its local indices.
00702         A_graph->getLocalRowView (as<LocalOrdinal> (localRow), A_local_ind);
00703         const size_type A_numEnt = A_local_ind.size ();
00704         if (A_numEnt > A_global_ind.size ()) {
00705           A_global_ind.resize (A_numEnt);
00706         }
00707         // Convert A's local indices to global indices.
00708         for (size_type k = 0; k < A_numEnt; ++k) {
00709           A_global_ind[k] = A_colMap->getGlobalElement (A_local_ind[k]);
00710         }
00711 
00712         // Get view of current row of B_graph, in its local indices.
00713         B_graph->getLocalRowView (as<LocalOrdinal> (localRow), B_local_ind);
00714         const size_type B_numEnt = B_local_ind.size ();
00715         if (B_numEnt > B_global_ind.size ()) {
00716           B_global_ind.resize (B_numEnt);
00717         }
00718         // Convert B's local indices to global indices.
00719         for (size_type k = 0; k < B_numEnt; ++k) {
00720           B_global_ind[k] = B_colMap->getGlobalElement (B_local_ind[k]);
00721         }
00722 
00723         // Count the number of entries in the merged row of A + B.
00724         const size_t curNumEntriesPerRow =
00725           keyMergeCount (A_global_ind.begin (), A_global_ind.end (),
00726                          B_global_ind.begin (), B_global_ind.end ());
00727         numEntriesPerRow[localRow] = curNumEntriesPerRow;
00728         maxNumEntriesPerRow = std::max (maxNumEntriesPerRow, curNumEntriesPerRow);
00729       }
00730 
00731       // Create C, using the sum column Map and number of entries per
00732       // row that we computed above.  Having the exact number of
00733       // entries per row lets us use static profile, making it valid
00734       // to call expertStaticFillComplete.
00735       C = rcp (new crs_matrix_type (rowMap, C_colMap, numEntriesPerRow, StaticProfile));
00736 
00737       // Go back through the rows and actually compute the sum.  We
00738       // don't ever have to resize A_global_ind or B_global_ind below,
00739       // since we've already done it above.
00740       ArrayView<const Scalar> A_val;
00741       ArrayView<const Scalar> B_val;
00742 
00743       Array<LocalOrdinal> AplusB_local_ind (maxNumEntriesPerRow);
00744       Array<GlobalOrdinal> AplusB_global_ind (maxNumEntriesPerRow);
00745       Array<Scalar> AplusB_val (maxNumEntriesPerRow);
00746 
00747       for (size_t localRow = 0; localRow < localNumRows; ++localRow) {
00748         // Get view of current row of A, in A's local indices.
00749         Aprime->getLocalRowView (as<LocalOrdinal> (localRow), A_local_ind, A_val);
00750         // Convert A's local indices to global indices.
00751         for (size_type k = 0; k < A_local_ind.size (); ++k) {
00752           A_global_ind[k] = A_colMap->getGlobalElement (A_local_ind[k]);
00753         }
00754 
00755         // Get view of current row of B, in B's local indices.
00756         Bprime->getLocalRowView (as<LocalOrdinal> (localRow), B_local_ind, B_val);
00757         // Convert B's local indices to global indices.
00758         for (size_type k = 0; k < B_local_ind.size (); ++k) {
00759           B_global_ind[k] = B_colMap->getGlobalElement (B_local_ind[k]);
00760         }
00761 
00762         const size_t curNumEntries = numEntriesPerRow[localRow];
00763         ArrayView<LocalOrdinal> C_local_ind = AplusB_local_ind (0, curNumEntries);
00764         ArrayView<GlobalOrdinal> C_global_ind = AplusB_global_ind (0, curNumEntries);
00765         ArrayView<Scalar> C_val = AplusB_val (0, curNumEntries);
00766 
00767         // Sum the entries in the current row of A plus B.
00768         keyValueMerge (A_global_ind.begin (), A_global_ind.end (),
00769                        A_val.begin (), A_val.end (),
00770                        B_global_ind.begin (), B_global_ind.end (),
00771                        B_val.begin (), B_val.end (),
00772                        C_global_ind.begin (), C_val.begin (),
00773                        std::plus<Scalar> ());
00774         // Convert the sum's global indices into C's local indices.
00775         for (size_type k = 0; k < as<size_type> (numEntriesPerRow[localRow]); ++k) {
00776           C_local_ind[k] = C_colMap->getLocalElement (C_global_ind[k]);
00777         }
00778         // Give the current row sum to C.
00779         C->replaceLocalValues (localRow, C_local_ind, C_val);
00780       }
00781 
00782       // Use "expert static fill complete" to bypass construction of
00783       // the Import and Export (if applicable) object(s).
00784       RCP<const map_type> domainMap = A_graph->getDomainMap ();
00785       RCP<const map_type> rangeMap = A_graph->getRangeMap ();
00786       C->expertStaticFillComplete (domainMap, rangeMap, sumImport, A_graph->getExporter ());
00787 
00788       return; // Now we're done!
00789     }
00790     else {
00791       // FIXME (mfh 08 May 2013) When I first looked at this method, I
00792       // noticed that C was being given the row Map of Aprime (the
00793       // possibly transposed version of A).  Is this what we want?
00794       C = rcp (new crs_matrix_type (Aprime->getRowMap (), null));
00795     }
00796 
00797 #else
00798     // FIXME (mfh 08 May 2013) When I first looked at this method, I
00799     // noticed that C was being given the row Map of Aprime (the
00800     // possibly transposed version of A).  Is this what we want?
00801     C = rcp (new crs_matrix_type (Aprime->getRowMap (), 0));
00802 #endif // 0
00803   }
00804 
00805 #ifdef HAVE_TPETRA_DEBUG
00806   TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
00807     "Tpetra::MatrixMatrix::Add: At this point, Aprime is null.  "
00808     "Please report this bug to the Tpetra developers.");
00809   TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
00810     "Tpetra::MatrixMatrix::Add: At this point, Bprime is null.  "
00811     "Please report this bug to the Tpetra developers.");
00812   TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
00813     "Tpetra::MatrixMatrix::Add: At this point, C is null.  "
00814     "Please report this bug to the Tpetra developers.");
00815 #endif // HAVE_TPETRA_DEBUG
00816 
00817   Array<RCP<const crs_matrix_type> > Mat =
00818     tuple<RCP<const crs_matrix_type> > (Aprime, Bprime);
00819   Array<Scalar> scalar = tuple<Scalar> (scalarA, scalarB);
00820 
00821   // do a loop over each matrix to add: A reordering might be more efficient
00822   for (int k = 0; k < 2; ++k) {
00823     Array<GlobalOrdinal> Indices;
00824     Array<Scalar> Values;
00825 
00826     // Loop over each locally owned row of the current matrix (either
00827     // Aprime or Bprime), and sum its entries into the corresponding
00828     // row of C.  This works regardless of whether Aprime or Bprime
00829     // has the same row Map as C, because both sumIntoGlobalValues and
00830     // insertGlobalValues allow summing resp. inserting into nonowned
00831     // rows of C.
00832 #ifdef HAVE_TPETRA_DEBUG
00833     TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null (), std::logic_error,
00834       "Tpetra::MatrixMatrix::Add: At this point, curRowMap is null.  "
00835       "Please report this bug to the Tpetra developers.");
00836 #endif // HAVE_TPETRA_DEBUG
00837     RCP<const map_type> curRowMap = Mat[k]->getRowMap ();
00838 #ifdef HAVE_TPETRA_DEBUG
00839     TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null (), std::logic_error,
00840       "Tpetra::MatrixMatrix::Add: At this point, curRowMap is null.  "
00841       "Please report this bug to the Tpetra developers.");
00842 #endif // HAVE_TPETRA_DEBUG
00843 
00844     const size_t localNumRows = Mat[k]->getNodeNumRows ();
00845     for (size_t i = 0; i < localNumRows; ++i) {
00846       const GlobalOrdinal globalRow = curRowMap->getGlobalElement (i);
00847       size_t numEntries = Mat[k]->getNumEntriesInGlobalRow (globalRow);
00848       if (numEntries > 0) {
00849         Indices.resize (numEntries);
00850         Values.resize (numEntries);
00851         Mat[k]->getGlobalRowCopy (globalRow, Indices (), Values (), numEntries);
00852 
00853         if (scalar[k] != STS::one ()) {
00854           for (size_t j = 0; j < numEntries; ++j) {
00855             Values[j] *= scalar[k];
00856           }
00857         }
00858 
00859         if (C->isFillComplete ()) {
00860           C->sumIntoGlobalValues (globalRow, Indices, Values);
00861         } else {
00862           C->insertGlobalValues (globalRow, Indices, Values);
00863         }
00864       }
00865     }
00866   }
00867 }
00868 
00869 
00870 
00871 } //End namespace MatrixMatrix
00872 
00873 namespace MMdetails{
00874 
00875 // Prints MMM-style statistics on communication done with an Import or Export object
00876 template <class TransferType>
00877 void printMultiplicationStatistics(Teuchos::RCP<TransferType > Transfer, const std::string &label){
00878   if(Transfer.is_null()) return;
00879 
00880   const Distributor & Distor                   = Transfer->getDistributor();
00881   Teuchos::RCP<const Teuchos::Comm<int> > Comm = Transfer->getSourceMap()->getComm();
00882 
00883   size_t rows_send   = Transfer->getNumExportIDs();
00884   size_t rows_recv   = Transfer->getNumRemoteIDs();
00885 
00886   size_t round1_send = Transfer->getNumExportIDs() * sizeof(size_t);
00887   size_t round1_recv = Transfer->getNumRemoteIDs() * sizeof(size_t);
00888   size_t num_send_neighbors = Distor.getNumSends();
00889   size_t num_recv_neighbors = Distor.getNumReceives();
00890   size_t round2_send, round2_recv;
00891   Distor.getLastDoStatistics(round2_send,round2_recv);
00892 
00893   int myPID    = Comm->getRank();
00894   int NumProcs = Comm->getSize();
00895 
00896   // Processor by processor statistics
00897   //    printf("[%d] %s Statistics: neigh[s/r]=%d/%d rows[s/r]=%d/%d r1bytes[s/r]=%d/%d r2bytes[s/r]=%d/%d\n",
00898   //    myPID,label.c_str(),num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv);
00899 
00900   // Global statistics
00901   size_t lstats[8] = {num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv};
00902   size_t gstats_min[8], gstats_max[8];
00903 
00904   double lstats_avg[8], gstats_avg[8];
00905   for(int i=0; i<8; i++)
00906     lstats_avg[i] = ((double)lstats[i])/NumProcs;
00907 
00908   Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MIN,8,lstats,gstats_min);
00909   Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MAX,8,lstats,gstats_max);
00910   Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_SUM,8,lstats_avg,gstats_avg);
00911 
00912   if(!myPID) {
00913     printf("%s Send Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n",label.c_str(),
00914            (int)gstats_min[0],gstats_avg[0],(int)gstats_max[0], (int)gstats_min[2],gstats_avg[2],(int)gstats_max[2],
00915            (int)gstats_min[4],gstats_avg[4],(int)gstats_max[4], (int)gstats_min[6],gstats_avg[6],(int)gstats_max[6]);
00916     printf("%s Recv Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n",label.c_str(),
00917            (int)gstats_min[1],gstats_avg[1],(int)gstats_max[1], (int)gstats_min[3],gstats_avg[3],(int)gstats_max[3],
00918            (int)gstats_min[5],gstats_avg[5],(int)gstats_max[5], (int)gstats_min[7],gstats_avg[7],(int)gstats_max[7]);
00919   }
00920 }
00921 
00922 
00923 //kernel method for computing the local portion of C = A*B
00924 template<class Scalar,
00925          class LocalOrdinal,
00926          class GlobalOrdinal,
00927          class Node>
00928 void mult_AT_B_newmatrix(
00929   const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
00930   const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
00931   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C) {
00932 
00933   // Using &  Typedefs
00934   using Teuchos::RCP;
00935   using Teuchos::rcp;
00936   typedef CrsMatrixStruct<
00937     Scalar,
00938     LocalOrdinal,
00939     GlobalOrdinal,
00940     Node> CrsMatrixStruct_t;
00941 
00942 #ifdef HAVE_TPETRA_MMM_TIMINGS
00943   using Teuchos::TimeMonitor;
00944   Teuchos::RCP<Teuchos::TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: MMM-T Transpose")));
00945 #endif
00946 
00947   /*************************************************************/
00948   /* 1) Local Transpose of A                                   */
00949   /*************************************************************/
00950   RowMatrixTransposer<Scalar, LocalOrdinal, GlobalOrdinal, Node> at (Teuchos::rcpFromRef (A));
00951   RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Atrans = at.createTransposeLocal();
00952 
00953   /*************************************************************/
00954   /* 2/3) Call mult_A_B_newmatrix w/ fillComplete              */
00955   /*************************************************************/
00956 #ifdef HAVE_TPETRA_MMM_TIMINGS
00957   MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: MMM-T I&X")));
00958 #endif
00959 
00960   // Get views, asserting that no import is required to speed up computation
00961   CrsMatrixStruct_t Aview;
00962   CrsMatrixStruct_t Bview;
00963   RCP<const Import<LocalOrdinal,GlobalOrdinal, Node> > dummyImporter;
00964   MMdetails::import_and_extract_views(*Atrans, Atrans->getRowMap(), Aview, dummyImporter,true);
00965   MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter,true);
00966 
00967 #ifdef HAVE_TPETRA_MMM_TIMINGS
00968   MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: MMM-T AB-core")));
00969 #endif
00970 
00971   RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >Ctemp;
00972 
00973   // If Atrans has no Exporter, we can use C instead of having to create a temp matrix
00974   bool needs_final_export = !Atrans->getGraph()->getExporter().is_null();
00975   if(needs_final_export)
00976     Ctemp = rcp(new Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>(Atrans->getRowMap(),0));
00977   else
00978     Ctemp = rcp(&C,false);// don't allow deallocation
00979 
00980   // Multiply
00981   mult_A_B_newmatrix(Aview,Bview,*Ctemp);
00982 
00983   /*************************************************************/
00984   /* 4) exportAndFillComplete matrix                           */
00985   /*************************************************************/
00986 #ifdef HAVE_TPETRA_MMM_TIMINGS
00987   MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: MMM-T exportAndFillComplete")));
00988 #endif
00989 
00990   Teuchos::RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Crcp(&C,false);
00991   if(needs_final_export)
00992     Ctemp->exportAndFillComplete(Crcp,*Ctemp->getGraph()->getExporter(),
00993                                  B.getDomainMap(),A.getDomainMap());
00994 
00995 #ifdef COMPUTE_MMM_STATISTICS
00996   printMultiplicationStatistics(Ctemp->getGraph()->getExporter(),std::string("AT_B MMM"));
00997 #endif
00998 }
00999 
01000 
01001 //kernel method for computing the local portion of C = A*B
01002 template<class Scalar,
01003          class LocalOrdinal,
01004          class GlobalOrdinal,
01005          class Node>
01006 void mult_A_B(
01007   CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
01008   CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
01009   CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C)
01010 {
01011   typedef Teuchos::ScalarTraits<Scalar> STS;
01012   //TEUCHOS_FUNC_TIME_MONITOR_DIFF("mult_A_B", mult_A_B);
01013   LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
01014   LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();
01015 
01016   LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
01017   LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();
01018 
01019   ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getNodeElementList();
01020   ArrayView<const GlobalOrdinal> bcols_import = null;
01021   if (Bview.importColMap != null) {
01022     C_firstCol_import = Bview.importColMap->getMinLocalIndex();
01023     C_lastCol_import = Bview.importColMap->getMaxLocalIndex();
01024 
01025     bcols_import = Bview.importColMap->getNodeElementList();
01026   }
01027 
01028   size_t C_numCols = C_lastCol - C_firstCol +
01029                         OrdinalTraits<LocalOrdinal>::one();
01030   size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
01031                                 OrdinalTraits<LocalOrdinal>::one();
01032 
01033   if (C_numCols_import > C_numCols) C_numCols = C_numCols_import;
01034 
01035   Array<Scalar> dwork = Array<Scalar>(C_numCols);
01036   Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
01037   Array<size_t> iwork2 = Array<size_t>(C_numCols);
01038 
01039   Array<Scalar> C_row_i = dwork;
01040   Array<GlobalOrdinal> C_cols = iwork;
01041   Array<size_t> c_index = iwork2;
01042   Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2*C_numCols);
01043   Array<Scalar> combined_values = Array<Scalar>(2*C_numCols);
01044 
01045   size_t C_row_i_length, j, k, last_index;
01046 
01047   // Run through all the hash table lookups once and for all
01048   LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
01049   Array<LocalOrdinal> Acol2Brow(Aview.colMap->getNodeNumElements(),LO_INVALID);
01050   Array<LocalOrdinal> Acol2Irow(Aview.colMap->getNodeNumElements(),LO_INVALID);
01051   if(Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())){
01052     // Maps are the same: Use local IDs as the hash
01053     for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
01054             Aview.colMap->getMaxLocalIndex(); i++)
01055       Acol2Brow[i]=i;
01056   }
01057   else {
01058     // Maps are not the same:  Use the map's hash
01059     for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
01060           Aview.colMap->getMaxLocalIndex(); i++) {
01061       GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
01062       LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
01063       if(BLID != LO_INVALID) Acol2Brow[i] = BLID;
01064       else Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
01065     }
01066   }
01067 
01068   //To form C = A*B we're going to execute this expression:
01069   //
01070   // C(i,j) = sum_k( A(i,k)*B(k,j) )
01071   //
01072   //Our goal, of course, is to navigate the data in A and B once, without
01073   //performing searches for column-indices, etc.
01074   ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP;
01075   ArrayRCP<const LocalOrdinal> Acolind_RCP, Bcolind_RCP, Icolind_RCP;
01076   ArrayRCP<const Scalar> Avals_RCP, Bvals_RCP, Ivals_RCP;
01077   ArrayView<const size_t> Arowptr, Browptr, Irowptr;
01078   ArrayView<const LocalOrdinal> Acolind, Bcolind, Icolind;
01079   ArrayView<const Scalar> Avals, Bvals, Ivals;
01080   Aview.origMatrix->getAllValues(Arowptr_RCP,Acolind_RCP,Avals_RCP);
01081   Bview.origMatrix->getAllValues(Browptr_RCP,Bcolind_RCP,Bvals_RCP);
01082   Arowptr = Arowptr_RCP();  Acolind = Acolind_RCP();  Avals = Avals_RCP();
01083   Browptr = Browptr_RCP();  Bcolind = Bcolind_RCP();  Bvals = Bvals_RCP();
01084   if(!Bview.importMatrix.is_null()) {
01085     Bview.importMatrix->getAllValues(Irowptr_RCP,Icolind_RCP,Ivals_RCP);
01086     Irowptr = Irowptr_RCP();  Icolind = Icolind_RCP();  Ivals = Ivals_RCP();
01087   }
01088 
01089   bool C_filled = C.isFillComplete();
01090 
01091   for (size_t i = 0; i < C_numCols; i++)
01092       c_index[i] = OrdinalTraits<size_t>::invalid();
01093 
01094   //loop over the rows of A.
01095   size_t Arows = Aview.rowMap->getNodeNumElements();
01096   for(size_t i=0; i<Arows; ++i) {
01097 
01098     //only navigate the local portion of Aview... which is, thankfully, all of A
01099     //since this routine doesn't do transpose modes
01100     GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);
01101 
01102     //loop across the i-th row of A and for each corresponding row
01103     //in B, loop across colums and accumulate product
01104     //A(i,k)*B(k,j) into our partial sum quantities C_row_i. In other words,
01105     //as we stride across B(k,:) we're calculating updates for row i of the
01106     //result matrix C.
01107 
01108 
01109     C_row_i_length = OrdinalTraits<size_t>::zero();
01110 
01111     for(k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
01112       LocalOrdinal Ak = Acol2Brow[Acolind[k]];
01113       Scalar Aval = Avals[k];
01114       if (Aval == STS::zero())
01115         continue;
01116 
01117       if (Ak==LO_INVALID) continue;
01118 
01119       for(j=Browptr[Ak]; j< Browptr[Ak+1]; ++j) {
01120           LocalOrdinal col = Bcolind[j];
01121           //assert(col >= 0 && col < C_numCols);
01122           if (c_index[col] == OrdinalTraits<size_t>::invalid()){
01123           //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
01124             // This has to be a +=  so insertGlobalValue goes out
01125             C_row_i[C_row_i_length] = Aval*Bvals[j];
01126             C_cols[C_row_i_length] = col;
01127             c_index[col] = C_row_i_length;
01128             C_row_i_length++;
01129           }
01130           else {
01131             C_row_i[c_index[col]] += Aval*Bvals[j];
01132           }
01133         }
01134     }
01135 
01136     for (size_t ii = 0; ii < C_row_i_length; ii++) {
01137       c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
01138       C_cols[ii] = bcols[C_cols[ii]];
01139       combined_index[ii] = C_cols[ii];
01140       combined_values[ii] = C_row_i[ii];
01141     }
01142     last_index = C_row_i_length;
01143 
01144     //
01145     //Now put the C_row_i values into C.
01146     //
01147     // We might have to revamp this later.
01148     C_row_i_length = OrdinalTraits<size_t>::zero();
01149 
01150     for(k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
01151       LocalOrdinal Ak = Acol2Brow[Acolind[k]];
01152       Scalar Aval = Avals[k];
01153       if (Aval == STS::zero())
01154         continue;
01155 
01156       if (Ak!=LO_INVALID) continue;
01157 
01158       Ak = Acol2Irow[Acolind[k]];
01159       for(j=Irowptr[Ak]; j< Irowptr[Ak+1]; ++j) {
01160           LocalOrdinal col = Icolind[j];
01161           //assert(col >= 0 && col < C_numCols);
01162           if (c_index[col] == OrdinalTraits<size_t>::invalid()){
01163           //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
01164             // This has to be a +=  so insertGlobalValue goes out
01165             C_row_i[C_row_i_length] = Aval*Ivals[j];
01166             C_cols[C_row_i_length] = col;
01167             c_index[col] = C_row_i_length;
01168             C_row_i_length++;
01169             }
01170             else {
01171               // This has to be a +=  so insertGlobalValue goes out
01172               C_row_i[c_index[col]] += Aval*Ivals[j];
01173             }
01174         }
01175     }
01176 
01177     for (size_t ii = 0; ii < C_row_i_length; ii++) {
01178       c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
01179       C_cols[ii] = bcols_import[C_cols[ii]];
01180       combined_index[last_index] = C_cols[ii];
01181       combined_values[last_index] = C_row_i[ii];
01182       last_index++;
01183     }
01184 
01185       //
01186       //Now put the C_row_i values into C.
01187       //
01188       // We might have to revamp this later.
01189     C_filled ?
01190       C.sumIntoGlobalValues(
01191           global_row,
01192           combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
01193           combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
01194       :
01195       C.insertGlobalValues(
01196           global_row,
01197           combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
01198           combined_values.view(OrdinalTraits<size_t>::zero(), last_index));
01199 
01200   }
01201 
01202 }
01203 
01204 template<class Scalar,
01205          class LocalOrdinal,
01206          class GlobalOrdinal,
01207          class Node>
01208 void setMaxNumEntriesPerRow(
01209   CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview)
01210 {
01211   typedef typename Array<ArrayView<const LocalOrdinal> >::size_type  local_length_size;
01212   Mview.maxNumRowEntries = OrdinalTraits<local_length_size>::zero();
01213   if(Mview.indices.size() > OrdinalTraits<local_length_size>::zero() ){
01214     Mview.maxNumRowEntries = Mview.indices[0].size();
01215     for(local_length_size i = 1; i<Mview.indices.size(); ++i){
01216       if(Mview.indices[i].size() > Mview.maxNumRowEntries){
01217         Mview.maxNumRowEntries = Mview.indices[i].size();
01218       }
01219     }
01220   }
01221 }
01222 
01223 
01224 template<class CrsMatrixType>
01225 size_t C_estimate_nnz(CrsMatrixType & A, CrsMatrixType &B){
01226   // Follows the NZ estimate in ML's ml_matmatmult.c
01227   size_t Aest = 100, Best=100;
01228   if(A.getNodeNumEntries() > 0)
01229     Aest = (A.getNodeNumRows()>0)? A.getNodeNumEntries()/A.getNodeNumEntries():100;
01230   if(B.getNodeNumEntries() > 0)
01231     Best=(B.getNodeNumRows()>0)? B.getNodeNumEntries()/B.getNodeNumEntries():100;
01232 
01233   size_t nnzperrow=(size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
01234   nnzperrow*=nnzperrow;
01235 
01236   return (size_t)(A.getNodeNumRows()*nnzperrow*0.75 + 100);
01237 }
01238 
01239 
01240 
01241 //kernel method for computing the local portion of C = A*B
01242 template<class Scalar,
01243          class LocalOrdinal,
01244          class GlobalOrdinal,
01245          class Node>
01246 void mult_A_B_newmatrix(
01247   CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
01248   CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
01249   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C)
01250 {
01251   using Teuchos::RCP;
01252   using Teuchos::rcp;
01253   using Teuchos::ArrayView;
01254   typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
01255   typedef Map<LocalOrdinal, GlobalOrdinal, Node> map_type;
01256 
01257 #ifdef HAVE_TPETRA_MMM_TIMINGS
01258   using Teuchos::TimeMonitor;
01259   RCP<TimeMonitor> MM =
01260     rcp (new TimeMonitor (* (TimeMonitor::getNewTimer ("TpetraExt: MMM M5 Cmap"))));
01261 #endif
01262   size_t ST_INVALID = Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
01263   LocalOrdinal LO_INVALID = Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
01264 
01265 
01266   // Build the final importer / column map, hash table lookups for C
01267   RCP<const import_type> Cimport;
01268   RCP<const map_type> Ccolmap;
01269   RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
01270   RCP<const import_type> Iimport = Bview.importMatrix.is_null() ? Teuchos::null :  Bview.importMatrix->getGraph()->getImporter();
01271   Array<LocalOrdinal> Bcol2Ccol(Bview.colMap->getNodeNumElements()), Icol2Ccol;
01272 
01273   if(Bview.importMatrix.is_null()) {
01274     Cimport = Bimport;
01275     Ccolmap = Bview.colMap;
01276     // Bcol2Ccol is trivial
01277     for(size_t i=0; i<Bview.colMap->getNodeNumElements(); i++) {
01278       Bcol2Ccol[i] = Teuchos::as<LocalOrdinal>(i);
01279     }
01280   }
01281   else {
01282     // Choose the right variant of setUnion
01283     if(!Bimport.is_null() && !Iimport.is_null()){
01284       Cimport = Bimport->setUnion(*Iimport);
01285       Ccolmap = Cimport->getTargetMap();
01286     }
01287     else if(!Bimport.is_null() && Iimport.is_null()) {
01288       Cimport = Bimport->setUnion();
01289     }
01290     else if(Bimport.is_null() && !Iimport.is_null()) {
01291       Cimport = Iimport->setUnion();
01292     }
01293     else
01294       throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
01295 
01296     Ccolmap = Cimport->getTargetMap();
01297 
01298     if(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()))
01299       throw std::runtime_error("Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
01300 
01301     // NOTE: This is not efficient and should be folded into setUnion
01302     Icol2Ccol.resize(Bview.importMatrix->getColMap()->getNodeNumElements());
01303     ArrayView<const GlobalOrdinal> Bgid = Bview.origMatrix->getColMap()->getNodeElementList();
01304     ArrayView<const GlobalOrdinal> Igid = Bview.importMatrix->getColMap()->getNodeElementList();
01305 
01306     for(size_t i=0; i<Bview.origMatrix->getColMap()->getNodeNumElements(); i++)
01307       Bcol2Ccol[i] = Ccolmap->getLocalElement(Bgid[i]);
01308     for(size_t i=0; i<Bview.importMatrix->getColMap()->getNodeNumElements(); i++)
01309       Icol2Ccol[i] = Ccolmap->getLocalElement(Igid[i]);
01310   }
01311 
01312 #ifdef HAVE_TPETRA_MMM_TIMINGS
01313   MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: MMM Newmatrix SerialCore")));
01314 #endif
01315 
01316   // Sizes
01317   size_t m=Aview.origMatrix->getNodeNumRows();
01318   size_t n=Ccolmap->getNodeNumElements();
01319 
01320   // Get Data Pointers
01321   ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP;
01322   ArrayRCP<size_t> Crowptr_RCP;
01323   ArrayRCP<const LocalOrdinal> Acolind_RCP, Bcolind_RCP, Icolind_RCP;
01324   ArrayRCP<LocalOrdinal> Ccolind_RCP;
01325   ArrayRCP<const Scalar> Avals_RCP, Bvals_RCP, Ivals_RCP;
01326   ArrayRCP<Scalar> Cvals_RCP;
01327 
01328   Aview.origMatrix->getAllValues(Arowptr_RCP,Acolind_RCP,Avals_RCP);
01329   Bview.origMatrix->getAllValues(Browptr_RCP,Bcolind_RCP,Bvals_RCP);
01330   if(!Bview.importMatrix.is_null()) Bview.importMatrix->getAllValues(Irowptr_RCP,Icolind_RCP,Ivals_RCP);
01331 
01332 
01333   // For efficiency
01334   ArrayView<const size_t> Arowptr, Browptr, Irowptr;
01335   ArrayView<const LocalOrdinal> Acolind, Bcolind, Icolind;
01336   ArrayView<const Scalar> Avals, Bvals, Ivals;
01337   ArrayView<size_t> Crowptr;
01338   ArrayView<LocalOrdinal> Ccolind;
01339   ArrayView<Scalar> Cvals;
01340   Arowptr = Arowptr_RCP();  Acolind = Acolind_RCP();  Avals = Avals_RCP();
01341   Browptr = Browptr_RCP();  Bcolind = Bcolind_RCP();  Bvals = Bvals_RCP();
01342   if(!Bview.importMatrix.is_null()) {
01343     Irowptr = Irowptr_RCP();  Icolind = Icolind_RCP();  Ivals = Ivals_RCP();
01344   }
01345 
01346   // The status array will contain the index into colind where this entry was last deposited.
01347   // c_status[i] < CSR_ip - not in the row yet.
01348   // c_status[i] >= CSR_ip, this is the entry where you can find the data
01349   // We start with this filled with INVALID's indicating that there are no entries yet.
01350   // Sadly, this complicates the code due to the fact that size_t's are unsigned.
01351   size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
01352   Array<size_t> c_status(n, ST_INVALID);
01353 
01354   // Classic csr assembly (low memory edition)
01355   size_t CSR_alloc=std::max(C_estimate_nnz(*Aview.origMatrix,*Bview.origMatrix),n);
01356   size_t CSR_ip=0,OLD_ip=0;
01357   Crowptr_RCP.resize(m+1);       Crowptr = Crowptr_RCP();
01358   Ccolind_RCP.resize(CSR_alloc); Ccolind = Ccolind_RCP();
01359   Cvals_RCP.resize(CSR_alloc);   Cvals   = Cvals_RCP();
01360 
01361   // Run through all the hash table lookups once and for all
01362   Array<LocalOrdinal> targetMapToOrigRow(Aview.colMap->getNodeNumElements(),LO_INVALID);
01363   Array<LocalOrdinal> targetMapToImportRow(Aview.colMap->getNodeNumElements(),LO_INVALID);
01364 
01365   if(Aview.colMap->isSameAs(*Bview.rowMap)){
01366     // Maps are the same: Use local IDs as the hash
01367     for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <= Aview.colMap->getMaxLocalIndex(); i++) {
01368       LocalOrdinal B_LID = Bview.origMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
01369       if(B_LID != LO_INVALID) targetMapToOrigRow[i] = B_LID;
01370       else {
01371         LocalOrdinal I_LID = Bview.importMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
01372         targetMapToImportRow[i] = I_LID;
01373       }
01374     }
01375   }
01376   else {
01377     // Maps are not the same:  Use the map's hash
01378     for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <= Aview.colMap->getMaxLocalIndex(); i++) {
01379       LocalOrdinal B_LID = Bview.origMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
01380       if(B_LID != LO_INVALID) targetMapToOrigRow[i] = B_LID;
01381       else {
01382         LocalOrdinal I_LID = Bview.importMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
01383         targetMapToImportRow[i] = I_LID;
01384       }
01385     }
01386   }
01387 
01388   const Scalar SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
01389 
01390   // For each row of A/C
01391   for(size_t i=0; i<m; i++){
01392     Crowptr[i]=CSR_ip;
01393 
01394     for(size_t k=Arowptr[i]; k<Arowptr[i+1]; k++){
01395       LocalOrdinal Ak      = Acolind[k];
01396       Scalar       Aval    = Avals[k];
01397       if(Aval==SC_ZERO) continue;
01398 
01399       if(targetMapToOrigRow[Ak] != LO_INVALID){
01400         // Local matrix
01401         size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Ak]);
01402 
01403         for(size_t j=Browptr[Bk]; j<Browptr[Bk+1]; ++j) {
01404           LocalOrdinal Cj=Bcol2Ccol[Bcolind[j]];
01405 
01406           if(c_status[Cj]==INVALID || c_status[Cj]<OLD_ip){
01407             // New entry
01408             c_status[Cj]      = CSR_ip;
01409             Ccolind[CSR_ip]= Cj;
01410             Cvals[CSR_ip]  = Aval*Bvals[j];
01411             CSR_ip++;
01412           }
01413           else
01414             Cvals[c_status[Cj]]+=Aval*Bvals[j];
01415         }
01416       }
01417       else{
01418         // Remote matrix
01419         size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Ak]);
01420         for(size_t j=Irowptr[Ik]; j<Irowptr[Ik+1]; ++j) {
01421           LocalOrdinal Cj=Icol2Ccol[Icolind[j]];
01422 
01423           if(c_status[Cj]==INVALID || c_status[Cj]<OLD_ip){
01424             // New entry
01425             c_status[Cj]=CSR_ip;
01426             Ccolind[CSR_ip]=Cj;
01427             Cvals[CSR_ip]=Aval*Ivals[j];
01428             CSR_ip++;
01429           }
01430           else
01431             Cvals[c_status[Cj]]+=Aval*Ivals[j];
01432         }
01433       }
01434     }
01435 
01436     // Resize for next pass if needed
01437     if(CSR_ip + n > CSR_alloc){
01438       CSR_alloc*=2;
01439       Ccolind_RCP.resize(CSR_alloc); Ccolind = Ccolind_RCP();
01440       Cvals_RCP.resize(CSR_alloc);   Cvals   = Cvals_RCP();
01441     }
01442     OLD_ip=CSR_ip;
01443   }
01444 
01445   Crowptr[m]=CSR_ip;
01446 
01447   // Downward resize
01448   Cvals_RCP.resize(CSR_ip);
01449   Ccolind_RCP.resize(CSR_ip);
01450 
01451 
01452 #ifdef HAVE_TPETRA_MMM_TIMINGS
01453   MM = rcp (new TimeMonitor (* (TimeMonitor::getNewTimer("TpetraExt: MMM Newmatrix Final Sort"))));
01454 #endif
01455 
01456   // Replace the column map
01457   C.replaceColMap(Ccolmap);
01458 
01459   // Final sort & set of CRS arrays
01460   Import_Util::sortCrsEntries(Crowptr_RCP(),Ccolind_RCP(),Cvals_RCP());
01461   C.setAllValues(Crowptr_RCP,Ccolind_RCP,Cvals_RCP);
01462 
01463 #ifdef HAVE_TPETRA_MMM_TIMINGS
01464   MM = rcp (new TimeMonitor (* (TimeMonitor::getNewTimer("TpetraExt: MMM Newmatrix ESFC"))));
01465 #endif
01466 
01467   // Final FillComplete
01468   C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(),Aview.origMatrix->getRangeMap(),Cimport);
01469 }
01470 
01471 
01472 //kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
01473 template<class Scalar,
01474          class LocalOrdinal,
01475          class GlobalOrdinal,
01476          class Node>
01477 void jacobi_A_B_newmatrix(
01478   Scalar omega,
01479   const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
01480   CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
01481   CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
01482   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C)
01483 {
01484   using Teuchos::RCP;
01485   using Teuchos::rcp;
01486   using Teuchos::ArrayView;
01487   typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
01488   typedef Map<LocalOrdinal, GlobalOrdinal, Node> map_type;
01489 
01490 #ifdef HAVE_TPETRA_MMM_TIMINGS
01491   using Teuchos::TimeMonitor;
01492   RCP<TimeMonitor> MM =
01493     rcp (new TimeMonitor (* (TimeMonitor::getNewTimer ("TpetraExt: Jacobi M5 Cmap"))));
01494 #endif
01495   size_t ST_INVALID = Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
01496   LocalOrdinal LO_INVALID = Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
01497 
01498 
01499   // Build the final importer / column map, hash table lookups for C
01500   RCP<const import_type> Cimport;
01501   RCP<const map_type> Ccolmap;
01502   RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
01503   RCP<const import_type> Iimport = Bview.importMatrix.is_null() ? Teuchos::null :  Bview.importMatrix->getGraph()->getImporter();
01504   Array<LocalOrdinal> Bcol2Ccol(Bview.colMap->getNodeNumElements()), Icol2Ccol;
01505 
01506   if(Bview.importMatrix.is_null()) {
01507     Cimport = Bimport;
01508     Ccolmap = Bview.colMap;
01509     // Bcol2Ccol is trivial
01510     for(size_t i=0; i<Bview.colMap->getNodeNumElements(); i++) {
01511       Bcol2Ccol[i] = Teuchos::as<LocalOrdinal>(i);
01512     }
01513   }
01514   else {
01515     // Choose the right variant of setUnion
01516     if(!Bimport.is_null() && !Iimport.is_null()){
01517       Cimport = Bimport->setUnion(*Iimport);
01518       Ccolmap = Cimport->getTargetMap();
01519     }
01520     else if(!Bimport.is_null() && Iimport.is_null()) {
01521       Cimport = Bimport->setUnion();
01522     }
01523     else if(Bimport.is_null() && !Iimport.is_null()) {
01524       Cimport = Iimport->setUnion();
01525     }
01526     else
01527       throw std::runtime_error("TpetraExt::Jacobi status of matrix importers is nonsensical");
01528 
01529     Ccolmap = Cimport->getTargetMap();
01530 
01531     if(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()))
01532       throw std::runtime_error("Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
01533 
01534     // NOTE: This is not efficient and should be folded into setUnion
01535     Icol2Ccol.resize(Bview.importMatrix->getColMap()->getNodeNumElements());
01536     ArrayView<const GlobalOrdinal> Bgid = Bview.origMatrix->getColMap()->getNodeElementList();
01537     ArrayView<const GlobalOrdinal> Igid = Bview.importMatrix->getColMap()->getNodeElementList();
01538 
01539     for(size_t i=0; i<Bview.origMatrix->getColMap()->getNodeNumElements(); i++)
01540       Bcol2Ccol[i] = Ccolmap->getLocalElement(Bgid[i]);
01541     for(size_t i=0; i<Bview.importMatrix->getColMap()->getNodeNumElements(); i++)
01542       Icol2Ccol[i] = Ccolmap->getLocalElement(Igid[i]);
01543   }
01544 
01545 #ifdef HAVE_TPETRA_MMM_TIMINGS
01546   MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: Jacobi Newmatrix SerialCore")));
01547 #endif
01548 
01549   // Sizes
01550   size_t m=Aview.origMatrix->getNodeNumRows();
01551   size_t n=Ccolmap->getNodeNumElements();
01552 
01553   // Get Data Pointers
01554   ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP;
01555   ArrayRCP<size_t> Crowptr_RCP;
01556   ArrayRCP<const LocalOrdinal> Acolind_RCP, Bcolind_RCP, Icolind_RCP;
01557   ArrayRCP<LocalOrdinal> Ccolind_RCP;
01558   ArrayRCP<const Scalar> Avals_RCP, Bvals_RCP, Ivals_RCP;
01559   ArrayRCP<Scalar> Cvals_RCP;
01560   ArrayRCP<const Scalar> Dvals_RCP;
01561 
01562   Aview.origMatrix->getAllValues(Arowptr_RCP,Acolind_RCP,Avals_RCP);
01563   Bview.origMatrix->getAllValues(Browptr_RCP,Bcolind_RCP,Bvals_RCP);
01564   if(!Bview.importMatrix.is_null()) Bview.importMatrix->getAllValues(Irowptr_RCP,Icolind_RCP,Ivals_RCP);
01565   Dvals_RCP = Dinv.getData();
01566 
01567   // For efficiency
01568   ArrayView<const size_t> Arowptr, Browptr, Irowptr;
01569   ArrayView<const LocalOrdinal> Acolind, Bcolind, Icolind;
01570   ArrayView<const Scalar> Avals, Bvals, Ivals;
01571   ArrayView<size_t> Crowptr;
01572   ArrayView<LocalOrdinal> Ccolind;
01573   ArrayView<Scalar> Cvals;
01574   ArrayView<const Scalar> Dvals;
01575   Arowptr = Arowptr_RCP();  Acolind = Acolind_RCP();  Avals = Avals_RCP();
01576   Browptr = Browptr_RCP();  Bcolind = Bcolind_RCP();  Bvals = Bvals_RCP();
01577   if(!Bview.importMatrix.is_null()) {
01578     Irowptr = Irowptr_RCP();  Icolind = Icolind_RCP();  Ivals = Ivals_RCP();
01579   }
01580   Dvals = Dvals_RCP();
01581 
01582   // The status array will contain the index into colind where this entry was last deposited.
01583   // c_status[i] < CSR_ip - not in the row yet.
01584   // c_status[i] >= CSR_ip, this is the entry where you can find the data
01585   // We start with this filled with INVALID's indicating that there are no entries yet.
01586   // Sadly, this complicates the code due to the fact that size_t's are unsigned.
01587   size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
01588   Array<size_t> c_status(n, ST_INVALID);
01589 
01590   // Classic csr assembly (low memory edition)
01591   size_t CSR_alloc=std::max(C_estimate_nnz(*Aview.origMatrix,*Bview.origMatrix),n);
01592   size_t CSR_ip=0,OLD_ip=0;
01593   Crowptr_RCP.resize(m+1);       Crowptr = Crowptr_RCP();
01594   Ccolind_RCP.resize(CSR_alloc); Ccolind = Ccolind_RCP();
01595   Cvals_RCP.resize(CSR_alloc);   Cvals   = Cvals_RCP();
01596 
01597   // Run through all the hash table lookups once and for all
01598   Array<LocalOrdinal> targetMapToOrigRow(Aview.colMap->getNodeNumElements(),LO_INVALID);
01599   Array<LocalOrdinal> targetMapToImportRow(Aview.colMap->getNodeNumElements(),LO_INVALID);
01600 
01601   if(Aview.colMap->isSameAs(*Bview.rowMap)){
01602     // Maps are the same: Use local IDs as the hash
01603     for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <= Aview.colMap->getMaxLocalIndex(); i++) {
01604       LocalOrdinal B_LID = Bview.origMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
01605       if(B_LID != LO_INVALID) targetMapToOrigRow[i] = B_LID;
01606       else {
01607         LocalOrdinal I_LID = Bview.importMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
01608         targetMapToImportRow[i] = I_LID;
01609       }
01610     }
01611   }
01612   else {
01613     // Maps are not the same:  Use the map's hash
01614     for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <= Aview.colMap->getMaxLocalIndex(); i++) {
01615       LocalOrdinal B_LID = Bview.origMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
01616       if(B_LID != LO_INVALID) targetMapToOrigRow[i] = B_LID;
01617       else {
01618         LocalOrdinal I_LID = Bview.importMatrix->getRowMap()->getLocalElement(Aview.colMap->getGlobalElement(i));
01619         targetMapToImportRow[i] = I_LID;
01620       }
01621     }
01622   }
01623 
01624   const Scalar SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
01625 
01626   // For each row of A/C
01627   for(size_t i=0; i<m; i++){
01628     Crowptr[i]=CSR_ip;
01629     Scalar Dval = Dvals[i];
01630 
01631     // Entries of B
01632     for(size_t k=Browptr[i]; k<Browptr[i+1]; k++){
01633       Scalar Bval = Bvals[k];
01634       if(Bval==SC_ZERO) continue;
01635       LocalOrdinal Ck=Bcol2Ccol[Bcolind[k]];
01636 
01637       // Assume no repeated entries in B
01638       c_status[Ck]    = CSR_ip;
01639       Ccolind[CSR_ip] = Ck;
01640       Cvals[CSR_ip]   = Bvals[k];
01641       CSR_ip++;
01642     }
01643 
01644 
01645     // Entries of -omega * Dinv * A * B
01646     for(size_t k=Arowptr[i]; k<Arowptr[i+1]; k++){
01647       LocalOrdinal Ak      = Acolind[k];
01648       Scalar       Aval    = Avals[k];
01649       if(Aval==SC_ZERO) continue;
01650 
01651       if(targetMapToOrigRow[Ak] != LO_INVALID){
01652         // Local matrix
01653         size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Ak]);
01654 
01655         for(size_t j=Browptr[Bk]; j<Browptr[Bk+1]; ++j) {
01656           LocalOrdinal Cj=Bcol2Ccol[Bcolind[j]];
01657 
01658           if(c_status[Cj]==INVALID || c_status[Cj]<OLD_ip){
01659             // New entry
01660             c_status[Cj]    = CSR_ip;
01661             Ccolind[CSR_ip] = Cj;
01662             Cvals[CSR_ip]   = - omega * Dval* Aval * Bvals[j];
01663             CSR_ip++;
01664           }
01665           else
01666             Cvals[c_status[Cj]] -= omega * Dval* Aval * Bvals[j];
01667         }
01668       }
01669       else{
01670         // Remote matrix
01671         size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Ak]);
01672         for(size_t j=Irowptr[Ik]; j<Irowptr[Ik+1]; ++j) {
01673           LocalOrdinal Cj=Icol2Ccol[Icolind[j]];
01674 
01675           if(c_status[Cj]==INVALID || c_status[Cj]<OLD_ip){
01676             // New entry
01677             c_status[Cj]    = CSR_ip;
01678             Ccolind[CSR_ip] = Cj;
01679             Cvals[CSR_ip]   = - omega * Dval* Aval * Ivals[j];
01680             CSR_ip++;
01681           }
01682           else
01683             Cvals[c_status[Cj]] -= omega * Dval* Aval * Ivals[j];
01684         }
01685       }
01686     }
01687 
01688     // Resize for next pass if needed
01689     if(CSR_ip + n > CSR_alloc){
01690       CSR_alloc*=2;
01691       Ccolind_RCP.resize(CSR_alloc); Ccolind = Ccolind_RCP();
01692       Cvals_RCP.resize(CSR_alloc);   Cvals   = Cvals_RCP();
01693     }
01694     OLD_ip=CSR_ip;
01695   }
01696 
01697   Crowptr[m]=CSR_ip;
01698 
01699   // Downward resize
01700   Cvals_RCP.resize(CSR_ip);
01701   Ccolind_RCP.resize(CSR_ip);
01702 
01703 
01704 #ifdef HAVE_TPETRA_MMM_TIMINGS
01705   MM = rcp (new TimeMonitor (* (TimeMonitor::getNewTimer("TpetraExt: Jacobi Newmatrix Final Sort"))));
01706 #endif
01707 
01708   // Replace the column map
01709   C.replaceColMap(Ccolmap);
01710 
01711   // Final sort & set of CRS arrays
01712   Import_Util::sortCrsEntries(Crowptr_RCP(),Ccolind_RCP(),Cvals_RCP());
01713   C.setAllValues(Crowptr_RCP,Ccolind_RCP,Cvals_RCP);
01714 
01715 #ifdef HAVE_TPETRA_MMM_TIMINGS
01716   MM = rcp (new TimeMonitor (* (TimeMonitor::getNewTimer("TpetraExt: Jacobi Newmatrix ESFC"))));
01717 #endif
01718 
01719   // Final FillComplete
01720   C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(),Aview.origMatrix->getRangeMap(),Cimport);
01721 }
01722 
01723 
01724 template<class Scalar,
01725          class LocalOrdinal,
01726          class GlobalOrdinal,
01727          class Node>
01728 void import_and_extract_views(
01729   const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& M,
01730   RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
01731   CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview,
01732   RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
01733   bool userAssertsThereAreNoRemotes)
01734 {
01735 #ifdef HAVE_TPETRA_MMM_TIMINGS
01736   using Teuchos::TimeMonitor;
01737   Teuchos::RCP<Teuchos::TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: MMM I&X Alloc")));
01738 #endif
01739 
01740   //Convience typedef
01741   typedef Map<LocalOrdinal, GlobalOrdinal, Node> Map_t;
01742   typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> CrsMatrix_t;
01743   // The goal of this method is to populate the 'Mview' struct with views of the
01744   // rows of M, including all rows that correspond to elements in 'targetMap'.
01745   //
01746   // If targetMap includes local elements that correspond to remotely-owned rows
01747   // of M, then those remotely-owned rows will be imported into
01748   // 'Mview.importMatrix', and views of them will be included in 'Mview'.
01749   Mview.deleteContents();
01750 
01751   RCP<const Map_t> Mrowmap = M.getRowMap();
01752   RCP<const Map_t> MremoteRowMap;
01753   const int numProcs = Mrowmap->getComm()->getSize();
01754 
01755   ArrayView<const GlobalOrdinal> Mrows = targetMap->getNodeElementList();
01756 
01757   size_t numRemote = 0;
01758   size_t numRows   = targetMap->getNodeNumElements();
01759   Mview.origMatrix = Teuchos::rcp(&M,false);
01760   Mview.origRowMap = M.getRowMap();
01761   Mview.rowMap = targetMap;
01762   Mview.colMap = M.getColMap();
01763   Mview.domainMap = M.getDomainMap();
01764   Mview.importColMap = null;
01765 
01766   // Short circuit if the user swears there are no remotes
01767   if(userAssertsThereAreNoRemotes) return;
01768 
01769 #ifdef HAVE_TPETRA_MMM_TIMINGS
01770   MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: MMM I&X RemoteMap")));
01771 #endif
01772 
01773   // mark each row in targetMap as local or remote, and go ahead and get a view for the local rows
01774   int mode = 0;
01775   if(!prototypeImporter.is_null() && prototypeImporter->getSourceMap()->isSameAs(*Mrowmap) && prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
01776     // We have a valid prototype importer --- ask it for the remotes
01777     numRemote = prototypeImporter->getNumRemoteIDs();
01778     Array<GlobalOrdinal> MremoteRows(numRemote);
01779     ArrayView<const LocalOrdinal> RemoteLIDs = prototypeImporter->getRemoteLIDs();
01780     for(size_t i=0; i<numRemote; i++) {
01781       MremoteRows[i] = targetMap->getGlobalElement(RemoteLIDs[i]);
01782     }
01783 
01784     MremoteRowMap=rcp(new Map_t(OrdinalTraits<global_size_t>::invalid(), MremoteRows(), Mrowmap->getIndexBase(), Mrowmap->getComm(), Mrowmap->getNode()));
01785     mode=1;
01786   }
01787   else if(prototypeImporter.is_null()) {
01788     // No prototype importer --- count the remotes the hard way
01789     Array<GlobalOrdinal> MremoteRows(numRows);
01790     for(size_t i=0; i < numRows; ++i) {
01791       const LocalOrdinal mlid = Mrowmap->getLocalElement(Mrows[i]);
01792 
01793       if (mlid == OrdinalTraits<LocalOrdinal>::invalid()) {
01794         MremoteRows[numRemote]=Mrows[i];
01795         ++numRemote;
01796       }
01797     }
01798     MremoteRows.resize(numRemote);
01799     MremoteRowMap=rcp(new Map_t(OrdinalTraits<global_size_t>::invalid(), MremoteRows(), Mrowmap->getIndexBase(), Mrowmap->getComm(), Mrowmap->getNode()));
01800     mode=2;
01801   }
01802   else {
01803     // prototypeImporter is bad.  But if we're in serial that's OK.
01804     mode=3;
01805   }
01806 
01807   if (numProcs < 2) {
01808     TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
01809       "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows." <<std::endl);
01810     //If only one processor we don't need to import any remote rows, so return.
01811     return;
01812   }
01813 
01814   //
01815   // Now we will import the needed remote rows of M, if the global maximum
01816   // value of numRemote is greater than 0.
01817   //
01818 #ifdef HAVE_TPETRA_MMM_TIMINGS
01819   MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: MMM I&X Collective-0")));
01820 #endif
01821 
01822   global_size_t globalMaxNumRemote = 0;
01823   Teuchos::reduceAll(*(Mrowmap->getComm()) , Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
01824 
01825 
01826   if (globalMaxNumRemote > 0) {
01827 #ifdef HAVE_TPETRA_MMM_TIMINGS
01828     MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: MMM I&X Import-2")));
01829 #endif
01830     // Create an importer with target-map MremoteRowMap and source-map Mrowmap.
01831     RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > importer;
01832 
01833     if(mode==1)
01834       importer = prototypeImporter->createRemoteOnlyImport(MremoteRowMap);
01835     else if(mode==2)
01836       importer=rcp(new Import<LocalOrdinal, GlobalOrdinal, Node>(Mrowmap, MremoteRowMap));
01837     else
01838       throw std::runtime_error("prototypeImporter->SourceMap() does not match M.getRowMap()!");
01839 
01840 #ifdef HAVE_TPETRA_MMM_TIMINGS
01841     MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: MMM I&X Import-3")));
01842 #endif
01843 
01844     // Now create a new matrix into which we can import the remote rows of M that we need.
01845     Mview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<CrsMatrix_t>(Teuchos::rcp(&M,false),*importer,M.getDomainMap(),M.getRangeMap(),Teuchos::null);
01846 
01847 #ifdef COMPUTE_MMM_STATISTICS
01848     printMultiplicationStatistics(importer,std::string("I&X MMM"));
01849 #endif
01850 
01851 
01852 #ifdef HAVE_TPETRA_MMM_TIMINGS
01853     MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: MMM I&X Import-4")));
01854 #endif
01855 
01856     // Save the column map of the imported matrix, so that we can convert indices back to global for arithmetic later
01857     Mview.importColMap = Mview.importMatrix->getColMap();
01858   }
01859 }
01860 
01861 
01862 
01863 } //End namepsace MMdetails
01864 
01865 } //End namespace Tpetra
01866 //
01867 // Explicit instantiation macro
01868 //
01869 // Must be expanded from within the Tpetra namespace!
01870 //
01871 
01872 #define TPETRA_MATRIXMATRIX_INSTANT(SCALAR,LO,GO,NODE) \
01873   \
01874   template \
01875   void MatrixMatrix::Multiply( \
01876     const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
01877     bool transposeA, \
01878     const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
01879     bool transposeB, \
01880     CrsMatrix< SCALAR , LO , GO , NODE >& C, \
01881     bool call_FillComplete_on_result); \
01882 \
01883 template \
01884   void MatrixMatrix::Jacobi( \
01885     SCALAR omega, \
01886     const Vector< SCALAR, LO, GO, NODE > & Dinv, \
01887     const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
01888     const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
01889     CrsMatrix< SCALAR , LO , GO , NODE >& C, \
01890     bool call_FillComplete_on_result); \
01891 \
01892   template \
01893   void MatrixMatrix::Add( \
01894     const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
01895     bool transposeA, \
01896     SCALAR scalarA, \
01897     const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
01898     bool transposeB, \
01899     SCALAR scalarB, \
01900     RCP<CrsMatrix< SCALAR , LO , GO , NODE > > C); \
01901   \
01902   template \
01903   void MatrixMatrix::Add( \
01904     const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
01905     bool transposeA, \
01906     SCALAR scalarA, \
01907     CrsMatrix<SCALAR, LO, GO, NODE>& B, \
01908     SCALAR scalarB ); \
01909   \
01910   template \
01911   Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > \
01912   MatrixMatrix::add (const SCALAR & alpha, \
01913                      const bool transposeA, \
01914                      const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
01915                      const SCALAR & beta, \
01916                      const bool transposeB, \
01917                      const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
01918                      const Teuchos::RCP<const Map< LO , GO , NODE > >& domainMap, \
01919                      const Teuchos::RCP<const Map< LO , GO , NODE > >& rangeMap, \
01920                      const Teuchos::RCP<Teuchos::ParameterList>& params); \
01921 \
01922 
01923 
01924 #endif // TPETRA_MATRIXMATRIX_DEF_HPP
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Defines