Tpetra Matrix/Vector Services  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Defines
Tpetra_Experimental_BlockCrsMatrix_def.hpp
Go to the documentation of this file.
00001 // @HEADER
00002 // ***********************************************************************
00003 //
00004 //          Tpetra: Templated Linear Algebra Services Package
00005 //                 Copyright (2008) Sandia Corporation
00006 //
00007 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00008 // the U.S. Government retains certain rights in this software.
00009 //
00010 // Redistribution and use in source and binary forms, with or without
00011 // modification, are permitted provided that the following conditions are
00012 // met:
00013 //
00014 // 1. Redistributions of source code must retain the above copyright
00015 // notice, this list of conditions and the following disclaimer.
00016 //
00017 // 2. Redistributions in binary form must reproduce the above copyright
00018 // notice, this list of conditions and the following disclaimer in the
00019 // documentation and/or other materials provided with the distribution.
00020 //
00021 // 3. Neither the name of the Corporation nor the names of the
00022 // contributors may be used to endorse or promote products derived from
00023 // this software without specific prior written permission.
00024 //
00025 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00026 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00028 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00029 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00030 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00031 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00032 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00033 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00034 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00035 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00036 //
00037 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
00038 //
00039 // ************************************************************************
00040 // @HEADER
00041 
00042 #ifndef TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_DEF_HPP
00043 #define TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_DEF_HPP
00044 
00047 
00048 #ifdef DOXYGEN_USE_ONLY
00049 #  include "Tpetra_Experimental_BlockMultiVector_decl.hpp"
00050 #endif
00051 
00052 namespace Tpetra {
00053 namespace Experimental {
00054 
00055   template<class Scalar, class LO, class GO, class Node>
00056   BlockCrsMatrix<Scalar, LO, GO, Node>::
00057   BlockCrsMatrix () :
00058     dist_object_type (Teuchos::rcp (new map_type ())), // nonnull, so DistObject doesn't throw
00059     graph_ (Teuchos::rcp (new map_type ()), 0), // FIXME (mfh 16 May 2014) no empty ctor yet
00060     blockSize_ (static_cast<LO> (0)),
00061     ptr_ (NULL),
00062     ind_ (NULL),
00063     X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
00064     Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
00065     columnPadding_ (0), // no padding by default
00066     rowMajor_ (true), // row major blocks by default
00067     localError_ (new bool (false)),
00068     errs_ (new Teuchos::RCP<std::ostringstream> ()), // ptr to a null ptr
00069     computedDiagonalGraph_(false)
00070   {
00071   }
00072 
00073   template<class Scalar, class LO, class GO, class Node>
00074   BlockCrsMatrix<Scalar, LO, GO, Node>::
00075   BlockCrsMatrix (const crs_graph_type& graph,
00076                   const LO blockSize) :
00077     dist_object_type (graph.getMap ()),
00078     graph_ (graph),
00079     rowMeshMap_ (* (graph.getRowMap ())),
00080     blockSize_ (blockSize),
00081     ptr_ (NULL), // to be initialized below
00082     ind_ (NULL), // to be initialized below
00083     val_ (NULL), // to be initialized below
00084     X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
00085     Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
00086     columnPadding_ (0), // no padding by default
00087     rowMajor_ (true), // row major blocks by default
00088     localError_ (new bool (false)),
00089     errs_ (new Teuchos::RCP<std::ostringstream> ()), // ptr to a null ptr
00090     computedDiagonalGraph_(false)
00091   {
00092     TEUCHOS_TEST_FOR_EXCEPTION(
00093       ! graph_.isSorted (), std::invalid_argument, "Tpetra::Experimental::"
00094       "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
00095       "rows (isSorted() is false).  This class assumes sorted rows.");
00096 
00097     graphRCP_ = Teuchos::rcpFromRef(graph_);
00098 
00099     // Trick to test whether LO is nonpositive, without a compiler
00100     // warning in case LO is unsigned (which is generally a bad idea
00101     // anyway).  I don't promise that the trick works, but it
00102     // generally does with gcc at least, in my experience.
00103     const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
00104     TEUCHOS_TEST_FOR_EXCEPTION(
00105       blockSizeIsNonpositive, std::invalid_argument, "Tpetra::Experimental::"
00106       "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
00107       " <= 0.  The block size must be positive.");
00108 
00109     domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
00110     rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
00111 
00112     ptr_ = graph.getNodeRowPtrs ().getRawPtr ();
00113     ind_ = graph.getNodePackedIndices ().getRawPtr ();
00114     valView_.resize (graph.getNodeNumEntries () * offsetPerBlock ());
00115     val_ = valView_.getRawPtr ();
00116   }
00117 
00118   template<class Scalar, class LO, class GO, class Node>
00119   BlockCrsMatrix<Scalar, LO, GO, Node>::
00120   BlockCrsMatrix (const crs_graph_type& graph,
00121                   const map_type& domainPointMap,
00122                   const map_type& rangePointMap,
00123                   const LO blockSize) :
00124     dist_object_type (graph.getMap ()),
00125     graph_ (graph),
00126     rowMeshMap_ (* (graph.getRowMap ())),
00127     domainPointMap_ (domainPointMap),
00128     rangePointMap_ (rangePointMap),
00129     blockSize_ (blockSize),
00130     ptr_ (NULL), // to be initialized below
00131     ind_ (NULL), // to be initialized below
00132     X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
00133     Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
00134     columnPadding_ (0), // no padding by default
00135     rowMajor_ (true), // row major blocks by default
00136     localError_ (new bool (false)),
00137     errs_ (new Teuchos::RCP<std::ostringstream> ()), // ptr to a null ptr
00138     computedDiagonalGraph_(false)
00139   {
00140     TEUCHOS_TEST_FOR_EXCEPTION(
00141       ! graph_.isSorted (), std::invalid_argument, "Tpetra::Experimental::"
00142       "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
00143       "rows (isSorted() is false).  This class assumes sorted rows.");
00144 
00145     graphRCP_ = Teuchos::rcpFromRef(graph_);
00146 
00147     // Trick to test whether LO is nonpositive, without a compiler
00148     // warning in case LO is unsigned (which is generally a bad idea
00149     // anyway).  I don't promise that the trick works, but it
00150     // generally does with gcc at least, in my experience.
00151     const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
00152     TEUCHOS_TEST_FOR_EXCEPTION(
00153       blockSizeIsNonpositive, std::invalid_argument, "Tpetra::Experimental::"
00154       "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
00155       " <= 0.  The block size must be positive.");
00156 
00157     ptr_ = graph.getNodeRowPtrs ().getRawPtr ();
00158     ind_ = graph.getNodePackedIndices ().getRawPtr ();
00159     valView_.resize (graph.getNodeNumEntries () * offsetPerBlock ());
00160     val_ = valView_.getRawPtr ();
00161   }
00162 
00163   template<class Scalar, class LO, class GO, class Node>
00164   Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
00165   BlockCrsMatrix<Scalar, LO, GO, Node>::
00166   getDomainMap () const
00167   { // Copy constructor of map_type does a shallow copy.
00168     // We're only returning by RCP for backwards compatibility.
00169     return Teuchos::rcp (new map_type (domainPointMap_));
00170   }
00171 
00172   template<class Scalar, class LO, class GO, class Node>
00173   Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
00174   BlockCrsMatrix<Scalar, LO, GO, Node>::
00175   getRangeMap () const
00176   { // Copy constructor of map_type does a shallow copy.
00177     // We're only returning by RCP for backwards compatibility.
00178     return Teuchos::rcp (new map_type (rangePointMap_));
00179   }
00180 
00181   template<class Scalar, class LO, class GO, class Node>
00182   Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
00183   BlockCrsMatrix<Scalar, LO, GO, Node>::
00184   getRowMap () const
00185   {
00186     return graph_.getRowMap();
00187   }
00188 
00189   template<class Scalar, class LO, class GO, class Node>
00190   Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
00191   BlockCrsMatrix<Scalar, LO, GO, Node>::
00192   getColMap () const
00193   {
00194     return graph_.getColMap();
00195   }
00196 
00197   template<class Scalar, class LO, class GO, class Node>
00198   global_size_t
00199   BlockCrsMatrix<Scalar, LO, GO, Node>::
00200   getGlobalNumRows() const
00201   {
00202     return graph_.getGlobalNumRows();
00203   }
00204 
00205   template<class Scalar, class LO, class GO, class Node>
00206   size_t
00207   BlockCrsMatrix<Scalar, LO, GO, Node>::
00208   getNodeNumRows() const
00209   {
00210     return graph_.getNodeNumRows();
00211   }
00212 
00213   template<class Scalar, class LO, class GO, class Node>
00214   size_t
00215   BlockCrsMatrix<Scalar, LO, GO, Node>::
00216   getNodeMaxNumRowEntries() const
00217   {
00218     return graph_.getNodeMaxNumRowEntries();
00219   }
00220 
00221   template<class Scalar, class LO, class GO, class Node>
00222   void
00223   BlockCrsMatrix<Scalar, LO, GO, Node>::
00224   apply (const mv_type& X,
00225          mv_type& Y,
00226          Teuchos::ETransp mode,
00227          scalar_type alpha,
00228          scalar_type beta) const
00229   {
00230     typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
00231     TEUCHOS_TEST_FOR_EXCEPTION(
00232       mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
00233       std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::apply: "
00234       "Invalid 'mode' argument.  Valid values are Teuchos::NO_TRANS, "
00235       "Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
00236 
00237     BMV X_view;
00238     BMV Y_view;
00239     const LO blockSize = getBlockSize ();
00240     try {
00241       X_view = BMV (X, * (graph_.getDomainMap ()), blockSize);
00242       Y_view = BMV (Y, * (graph_.getRangeMap ()), blockSize);
00243     }
00244     catch (std::invalid_argument& e) {
00245       TEUCHOS_TEST_FOR_EXCEPTION(
00246         true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
00247         "apply: Either the input MultiVector X or the output MultiVector Y "
00248         "cannot be viewed as a BlockMultiVector, given this BlockCrsMatrix's "
00249         "graph.  BlockMultiVector's constructor threw the following exception: "
00250         << e.what ());
00251     }
00252 
00253     try {
00254       // mfh 16 May 2014: Casting 'this' to nonconst is icky, but it's
00255       // either that or mark fields of this class as 'mutable'.  The
00256       // problem is that applyBlock wants to do lazy initialization of
00257       // temporary block multivectors.
00258       const_cast<this_type*> (this)->applyBlock (X_view, Y_view, mode, alpha, beta);
00259     } catch (std::invalid_argument& e) {
00260       TEUCHOS_TEST_FOR_EXCEPTION(
00261         true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
00262         "apply: The implementation method applyBlock complained about having "
00263         "an invalid argument.  It reported the following: " << e.what ());
00264     } catch (std::logic_error& e) {
00265       TEUCHOS_TEST_FOR_EXCEPTION(
00266         true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
00267         "apply: The implementation method applyBlock complained about a "
00268         "possible bug in its implementation.  It reported the following: "
00269         << e.what ());
00270     } catch (std::exception& e) {
00271       TEUCHOS_TEST_FOR_EXCEPTION(
00272         true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
00273         "apply: The implementation method applyBlock threw an exception which "
00274         "is neither std::invalid_argument nor std::logic_error, but is a "
00275         "subclass of std::exception.  It reported the following: "
00276         << e.what ());
00277     } catch (...) {
00278       TEUCHOS_TEST_FOR_EXCEPTION(
00279         true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
00280         "apply: The implementation method applyBlock threw an exception which "
00281         "is not an instance of a subclass of std::exception.  This probably "
00282         "indicates a bug.  Please report this to the Tpetra developers.");
00283     }
00284   }
00285 
00286   template<class Scalar, class LO, class GO, class Node>
00287   void
00288   BlockCrsMatrix<Scalar, LO, GO, Node>::
00289   applyBlock (const BlockMultiVector<Scalar, LO, GO, Node>& X,
00290               BlockMultiVector<Scalar, LO, GO, Node>& Y,
00291               Teuchos::ETransp mode,
00292               const Scalar alpha,
00293               const Scalar beta)
00294   {
00295     TEUCHOS_TEST_FOR_EXCEPTION(
00296       X.getBlockSize () != Y.getBlockSize (), std::invalid_argument,
00297       "Tpetra::Experimental::BlockCrsMatrix::applyBlock: "
00298       "X and Y have different block sizes.  X.getBlockSize() = "
00299       << X.getBlockSize () << " != Y.getBlockSize() = "
00300       << Y.getBlockSize () << ".");
00301 
00302     if (mode == Teuchos::NO_TRANS) {
00303       applyBlockNoTrans (X, Y, alpha, beta);
00304     } else if (mode == Teuchos::TRANS || mode == Teuchos::CONJ_TRANS) {
00305       applyBlockTrans (X, Y, mode, alpha, beta);
00306     } else {
00307       TEUCHOS_TEST_FOR_EXCEPTION(
00308         true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
00309         "applyBlock: Invalid 'mode' argument.  Valid values are "
00310         "Teuchos::NO_TRANS, Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
00311     }
00312   }
00313 
00314   template<class Scalar, class LO, class GO, class Node>
00315   void
00316   BlockCrsMatrix<Scalar, LO, GO, Node>::
00317   setAllToScalar (const Scalar& alpha)
00318   {
00319     const LO numLocalMeshRows = static_cast<LO> (rowMeshMap_.getNodeNumElements ());
00320     for (LO lclRow = 0; lclRow < numLocalMeshRows; ++lclRow) {
00321       const size_t meshBeg = ptr_[lclRow];
00322       const size_t meshEnd = ptr_[lclRow+1];
00323       for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
00324         little_block_type A_cur = getNonConstLocalBlockFromAbsOffset (absBlkOff);
00325         A_cur.fill (alpha);
00326       }
00327     }
00328   }
00329 
00330   template<class Scalar, class LO, class GO, class Node>
00331   LO
00332   BlockCrsMatrix<Scalar, LO, GO, Node>::
00333   replaceLocalValues (const LO localRowInd,
00334                       const LO colInds[],
00335                       const Scalar vals[],
00336                       const LO numColInds) const
00337   {
00338     if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
00339       // We modified no values, because the input local row index is
00340       // invalid on the calling process.  That may not be an error, if
00341       // numColInds is zero anyway; it doesn't matter.  This is the
00342       // advantage of returning the number of valid indices.
00343       return static_cast<LO> (0);
00344     }
00345 
00346     const size_t absRowBlockOffset = ptr_[localRowInd];
00347     const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
00348     const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
00349     size_t hint = 0; // Guess for the relative offset into the current row
00350     size_t pointOffset = 0; // Current offset into input values
00351     LO validCount = 0; // number of valid column indices in colInds
00352 
00353     for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
00354       const size_t relBlockOffset =
00355         findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
00356       if (relBlockOffset != STINV) {
00357         const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
00358         little_block_type A_old =
00359           getNonConstLocalBlockFromAbsOffset (absBlockOffset);
00360         const_little_block_type A_new =
00361           getConstLocalBlockFromInput (vals, pointOffset);
00362         A_old.assign (A_new);
00363         hint = relBlockOffset + 1;
00364         ++validCount;
00365       }
00366     }
00367     return validCount;
00368   }
00369 
00370   template <class Scalar, class LO, class GO, class Node>
00371   void
00372   BlockCrsMatrix<Scalar,LO,GO,Node>::
00373   getLocalDiagOffsets (Teuchos::ArrayRCP<size_t>& offsets) const
00374   {
00375 
00376     const map_type& rowMap = * (graph_.getRowMap());
00377     const map_type& colMap = * (graph_.getColMap ());
00378 
00379     const size_t myNumRows = rowMeshMap_.getNodeNumElements();
00380     if (static_cast<size_t> (offsets.size ()) != myNumRows) {
00381       offsets.resize (static_cast<size_t> (myNumRows));
00382     }
00383 
00384 #ifdef HAVE_TPETRA_DEBUG
00385     bool allRowMapDiagEntriesInColMap = true;
00386     bool allDiagEntriesFound = true;
00387 #endif // HAVE_TPETRA_DEBUG
00388 
00389     for (size_t r = 0; r < myNumRows; ++r) {
00390       const GO rgid = rowMap.getGlobalElement (r);
00391       const LO rlid = colMap.getLocalElement (rgid);
00392 
00393 #ifdef HAVE_TPETRA_DEBUG
00394       if (rlid == Teuchos::OrdinalTraits<LO>::invalid ()) {
00395         allRowMapDiagEntriesInColMap = false;
00396       }
00397 #endif // HAVE_TPETRA_DEBUG
00398 
00399       if (rlid != Teuchos::OrdinalTraits<LO>::invalid ()) {
00400         RowInfo rowinfo = graph_.getRowInfo (r);
00401         if (rowinfo.numEntries > 0) {
00402           offsets[r] = graph_.findLocalIndex (rowinfo, rlid);
00403         }
00404         else {
00405           offsets[r] = Teuchos::OrdinalTraits<size_t>::invalid ();
00406 #ifdef HAVE_TPETRA_DEBUG
00407           allDiagEntriesFound = false;
00408 #endif // HAVE_TPETRA_DEBUG
00409         }
00410       }
00411     }
00412 
00413 #ifdef HAVE_TPETRA_DEBUG
00414     using Teuchos::reduceAll;
00415     using std::endl;
00416     const char tfecfFuncName[] = "getLocalDiagOffsets";
00417 
00418     const bool localSuccess =
00419       allRowMapDiagEntriesInColMap && allDiagEntriesFound;
00420     int localResults[3];
00421     localResults[0] = allRowMapDiagEntriesInColMap ? 1 : 0;
00422     localResults[1] = allDiagEntriesFound ? 1 : 0;
00423     // min-all-reduce will compute least rank of all the processes
00424     // that didn't succeed.
00425     localResults[2] =
00426       ! localSuccess ? getComm ()->getRank () : getComm ()->getSize ();
00427     int globalResults[3];
00428     globalResults[0] = 0;
00429     globalResults[1] = 0;
00430     globalResults[2] = 0;
00431     reduceAll<int, int> (* (getComm ()), Teuchos::REDUCE_MIN,
00432                          3, localResults, globalResults);
00433     if (globalResults[0] == 0 || globalResults[1] == 0) {
00434       std::ostringstream os; // build error message
00435       const bool both =
00436         globalResults[0] == 0 && globalResults[1] == 0;
00437       os << ": At least one process (including Process " << globalResults[2]
00438          << ") had the following issue" << (both ? "s" : "") << ":" << endl;
00439       if (globalResults[0] == 0) {
00440         os << "  - The column Map does not contain at least one diagonal entry "
00441           "of the matrix." << endl;
00442       }
00443       if (globalResults[1] == 0) {
00444         os << "  - There is a row on that / those process(es) that does not "
00445           "contain a diagonal entry." << endl;
00446       }
00447       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error, os.str());
00448     }
00449 #endif // HAVE_TPETRA_DEBUG
00450   }
00451 
00452   template <class Scalar, class LO, class GO, class Node>
00453   void
00454   BlockCrsMatrix<Scalar,LO,GO,Node>::
00455   computeDiagonalGraph ()
00456   {
00457     using Teuchos::rcp;
00458 
00459     if (computedDiagonalGraph_) {
00460       // FIXME (mfh 12 Aug 2014) Consider storing the "diagonal graph"
00461       // separately from the matrix.  It should really go in the
00462       // preconditioner, not here.  We could do this by adding a
00463       // method that accepts a nonconst diagonal graph, and updates
00464       // it.  btw it would probably be a better idea to use a
00465       // BlockMultiVector to store the diagonal, not a graph.
00466       return;
00467     }
00468 
00469     const size_t maxDiagEntPerRow = 1;
00470     // NOTE (mfh 12 Aug 2014) We could also pass in the column Map
00471     // here.  However, we still would have to do LID->GID lookups to
00472     // make sure that we are using the correct diagonal column
00473     // indices, so it probably wouldn't help much.
00474     diagonalGraph_ =
00475       rcp (new crs_graph_type (graph_.getRowMap (), maxDiagEntPerRow,
00476                                Tpetra::StaticProfile));
00477     const map_type& meshRowMap = * (graph_.getRowMap ());
00478 
00479     Teuchos::Array<GO> diagGblColInds (maxDiagEntPerRow);
00480 
00481     for (LO lclRowInd = meshRowMap.getMinLocalIndex ();
00482          lclRowInd <= meshRowMap.getMaxLocalIndex (); ++lclRowInd) {
00483       const GO gblRowInd = meshRowMap.getGlobalElement (lclRowInd);
00484       diagGblColInds[0] = gblRowInd;
00485       diagonalGraph_->insertGlobalIndices (gblRowInd, diagGblColInds ());
00486     }
00487     diagonalGraph_->fillComplete (graph_.getDomainMap (),
00488                                   graph_.getRangeMap ());
00489     computedDiagonalGraph_ = true;
00490   }
00491 
00492   template <class Scalar, class LO, class GO, class Node>
00493   Teuchos::RCP<CrsGraph<LO, GO, Node> >
00494   BlockCrsMatrix<Scalar,LO,GO,Node>::
00495   getDiagonalGraph () const
00496   {
00497     TEUCHOS_TEST_FOR_EXCEPTION(
00498       ! computedDiagonalGraph_, std::runtime_error, "Tpetra::Experimental::"
00499       "BlockCrsMatrix::getDiagonalGraph: You must call computeDiagionalGraph() "
00500       "before calling this method.");
00501     return diagonalGraph_;
00502   }
00503 
00504   template <class Scalar, class LO, class GO, class Node>
00505   void
00506   BlockCrsMatrix<Scalar,LO,GO,Node>::
00507   localGaussSeidel (const BlockMultiVector<Scalar, LO, GO, Node>& B,
00508                     BlockMultiVector<Scalar, LO, GO, Node>& X,
00509                     BlockCrsMatrix<Scalar, LO, GO, Node> & factorizedDiagonal,
00510                     const int * factorizationPivots,
00511                     const Scalar omega,
00512                     const ESweepDirection direction) const
00513   {
00514     const LO numLocalMeshRows =
00515       static_cast<LO> (rowMeshMap_.getNodeNumElements ());
00516     const LO numVecs = static_cast<LO> (X.getNumVectors ());
00517 
00518     // If using (new) Kokkos, replace localMem with thread-local
00519     // memory.  Note that for larger block sizes, this will affect the
00520     // two-level parallelization.  Look to Stokhos for best practice
00521     // on making this fast for GPUs.
00522     const LO blockSize = getBlockSize ();
00523     Teuchos::Array<Scalar> localMem (blockSize);
00524     Teuchos::Array<Scalar> localMat (blockSize*blockSize);
00525     little_vec_type X_lcl (localMem.getRawPtr (), blockSize, 1);
00526 
00527     const LO * columnIndices;
00528     Scalar * Dmat;
00529     LO numIndices;
00530 
00531     // FIXME (mfh 12 Aug 2014) This probably won't work if LO is unsigned.
00532     LO rowBegin = 0, rowEnd = 0, rowStride = 0;
00533     if (direction == Forward) {
00534       rowBegin = 1;
00535       rowEnd = numLocalMeshRows+1;
00536       rowStride = 1;
00537     }
00538     else if (direction == Backward) {
00539       rowBegin = numLocalMeshRows;
00540       rowEnd = 0;
00541       rowStride = -1;
00542     }
00543     else if (direction == Symmetric) {
00544       this->localGaussSeidel (B, X, factorizedDiagonal, factorizationPivots, omega, Forward);
00545       this->localGaussSeidel (B, X, factorizedDiagonal, factorizationPivots, omega, Backward);
00546       return;
00547     }
00548 
00549     const Scalar one_minus_omega = Teuchos::ScalarTraits<Scalar>::one()-omega;
00550     const Scalar     minus_omega = -omega;
00551 
00552     if (numVecs == 1) {
00553       for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) {
00554         const LO actlRow = lclRow - 1;
00555 
00556         little_vec_type B_cur = B.getLocalBlock (actlRow, 0);
00557         X_lcl.assign (B_cur);
00558         X_lcl.scale (omega);
00559 
00560         const size_t meshBeg = ptr_[actlRow];
00561         const size_t meshEnd = ptr_[actlRow+1];
00562         for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
00563           const LO meshCol = ind_[absBlkOff];
00564           const_little_block_type A_cur =
00565             getConstLocalBlockFromAbsOffset (absBlkOff);
00566 
00567           little_vec_type X_cur = X.getLocalBlock (meshCol, 0);
00568 
00569           // X_lcl += alpha*A_cur*X_cur
00570           const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega;
00571           X_lcl.matvecUpdate (alpha, A_cur, X_cur);
00572         } // for each entry in the current local row of the matrx
00573 
00574         factorizedDiagonal.getLocalRowView (actlRow, columnIndices,
00575                                             Dmat, numIndices);
00576         little_block_type D_lcl = getNonConstLocalBlockFromInput (Dmat, 0);
00577 
00578         D_lcl.solve (X_lcl, &factorizationPivots[actlRow*blockSize_]);
00579         little_vec_type X_update = X.getLocalBlock (actlRow, 0);
00580         X_update.assign(X_lcl);
00581       } // for each local row of the matrix
00582     }
00583     else {
00584       for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) {
00585         for (LO j = 0; j < numVecs; ++j) {
00586           LO actlRow = lclRow-1;
00587 
00588           little_vec_type B_cur = B.getLocalBlock (actlRow, j);
00589           X_lcl.assign (B_cur);
00590           X_lcl.scale (omega);
00591 
00592           const size_t meshBeg = ptr_[actlRow];
00593           const size_t meshEnd = ptr_[actlRow+1];
00594           for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
00595             const LO meshCol = ind_[absBlkOff];
00596             const_little_block_type A_cur =
00597               getConstLocalBlockFromAbsOffset (absBlkOff);
00598 
00599             little_vec_type X_cur = X.getLocalBlock (meshCol, j);
00600 
00601             // X_lcl += alpha*A_cur*X_cur
00602             const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega;
00603             X_lcl.matvecUpdate (alpha, A_cur, X_cur);
00604           } // for each entry in the current local row of the matrx
00605 
00606           factorizedDiagonal.getLocalRowView (actlRow, columnIndices,
00607                                               Dmat, numIndices);
00608           little_block_type D_lcl = getNonConstLocalBlockFromInput(Dmat, 0);
00609 
00610           D_lcl.solve (X_lcl, &factorizationPivots[actlRow*blockSize_]);
00611 
00612           little_vec_type X_update = X.getLocalBlock (actlRow, j);
00613           X_update.assign(X_lcl);
00614         } // for each entry in the current local row of the matrix
00615       } // for each local row of the matrix
00616     }
00617   }
00618 
00619   template <class Scalar, class LO, class GO, class Node>
00620   void
00621   BlockCrsMatrix<Scalar,LO,GO,Node>::
00622   gaussSeidelCopy (MultiVector<Scalar,LO,GO,Node> &X,
00623                    const MultiVector<Scalar,LO,GO,Node> &B,
00624                    const MultiVector<Scalar,LO,GO,Node> &D,
00625                    const Scalar& dampingFactor,
00626                    const ESweepDirection direction,
00627                    const int numSweeps,
00628                    const bool zeroInitialGuess) const
00629   {
00630     // FIXME (mfh 12 Aug 2014) This method has entirely the wrong
00631     // interface for block Gauss-Seidel.
00632     TEUCHOS_TEST_FOR_EXCEPTION(
00633       true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
00634       "gaussSeidelCopy: Not implemented.");
00635   }
00636 
00637   template <class Scalar, class LO, class GO, class Node>
00638   void
00639   BlockCrsMatrix<Scalar,LO,GO,Node>::
00640   reorderedGaussSeidelCopy (MultiVector<Scalar,LO,GO,Node>& X,
00641                             const MultiVector<Scalar,LO,GO,Node>& B,
00642                             const MultiVector<Scalar,LO,GO,Node>& D,
00643                             const ArrayView<LO>& rowIndices,
00644                             const Scalar& dampingFactor,
00645                             const ESweepDirection direction,
00646                             const int numSweeps,
00647                             const bool zeroInitialGuess) const
00648   {
00649     // FIXME (mfh 12 Aug 2014) This method has entirely the wrong
00650     // interface for block Gauss-Seidel.
00651     TEUCHOS_TEST_FOR_EXCEPTION(
00652       true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
00653       "reorderedGaussSeidelCopy: Not implemented.");
00654   }
00655 
00656   template <class Scalar, class LO, class GO, class Node>
00657   void
00658   BlockCrsMatrix<Scalar,LO,GO,Node>::
00659   getLocalDiagCopy (BlockCrsMatrix<Scalar,LO,GO,Node>& diag,
00660                     const Teuchos::ArrayView<const size_t>& offsets) const
00661   {
00662     using Teuchos::ArrayRCP;
00663     using Teuchos::ArrayView;
00664     const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
00665 
00666     const size_t myNumRows = rowMeshMap_.getNodeNumElements();
00667     const LO* columnIndices;
00668     Scalar* vals;
00669     LO numColumns;
00670     Teuchos::Array<LO> cols(1);
00671 
00672     // FIXME (mfh 12 Aug 2014) Should use a "little block" for this instead.
00673     Teuchos::Array<Scalar> zeroMat (blockSize_*blockSize_, ZERO);
00674     for (size_t i = 0; i < myNumRows; ++i) {
00675       cols[0] = i;
00676       if (offsets[i] == Teuchos::OrdinalTraits<size_t>::invalid ()) {
00677         diag.replaceLocalValues (i, cols.getRawPtr (), zeroMat.getRawPtr (), 1);
00678       }
00679       else {
00680         getLocalRowView (i, columnIndices, vals, numColumns);
00681         diag.replaceLocalValues (i, cols.getRawPtr(), &vals[offsets[i]*blockSize_*blockSize_], 1);
00682       }
00683     }
00684   }
00685 
00686 
00687   template<class Scalar, class LO, class GO, class Node>
00688   LO
00689   BlockCrsMatrix<Scalar, LO, GO, Node>::
00690   absMaxLocalValues (const LO localRowInd,
00691                      const LO colInds[],
00692                      const Scalar vals[],
00693                      const LO numColInds) const
00694   {
00695     if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
00696       // We modified no values, because the input local row index is
00697       // invalid on the calling process.  That may not be an error, if
00698       // numColInds is zero anyway; it doesn't matter.  This is the
00699       // advantage of returning the number of valid indices.
00700       return static_cast<LO> (0);
00701     }
00702 
00703     const size_t absRowBlockOffset = ptr_[localRowInd];
00704     const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
00705     const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
00706     size_t hint = 0; // Guess for the relative offset into the current row
00707     size_t pointOffset = 0; // Current offset into input values
00708     LO validCount = 0; // number of valid column indices in colInds
00709 
00710     for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
00711       const size_t relBlockOffset =
00712         findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
00713       if (relBlockOffset != STINV) {
00714         const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
00715         little_block_type A_old =
00716           getNonConstLocalBlockFromAbsOffset (absBlockOffset);
00717         const_little_block_type A_new =
00718           getConstLocalBlockFromInput (vals, pointOffset);
00719         A_old.absmax (A_new);
00720         hint = relBlockOffset + 1;
00721         ++validCount;
00722       }
00723     }
00724     return validCount;
00725   }
00726 
00727 
00728   template<class Scalar, class LO, class GO, class Node>
00729   LO
00730   BlockCrsMatrix<Scalar, LO, GO, Node>::
00731   sumIntoLocalValues (const LO localRowInd,
00732                       const LO colInds[],
00733                       const Scalar vals[],
00734                       const LO numColInds) const
00735   {
00736     if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
00737       // We modified no values, because the input local row index is
00738       // invalid on the calling process.  That may not be an error, if
00739       // numColInds is zero anyway; it doesn't matter.  This is the
00740       // advantage of returning the number of valid indices.
00741       return static_cast<LO> (0);
00742     }
00743 
00744     const size_t absRowBlockOffset = ptr_[localRowInd];
00745     const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
00746     const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
00747     size_t hint = 0; // Guess for the relative offset into the current row
00748     size_t pointOffset = 0; // Current offset into input values
00749     LO validCount = 0; // number of valid column indices in colInds
00750 
00751     for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
00752       const size_t relBlockOffset =
00753         findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
00754       if (relBlockOffset != STINV) {
00755         const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
00756         little_block_type A_old =
00757           getNonConstLocalBlockFromAbsOffset (absBlockOffset);
00758         const_little_block_type A_new =
00759           getConstLocalBlockFromInput (vals, pointOffset);
00760         A_old.update (STS::one (), A_new);
00761         hint = relBlockOffset + 1;
00762         ++validCount;
00763       }
00764     }
00765     return validCount;
00766   }
00767 
00768   template<class Scalar, class LO, class GO, class Node>
00769   LO
00770   BlockCrsMatrix<Scalar, LO, GO, Node>::
00771   getLocalRowView (const LO localRowInd,
00772                    const LO*& colInds,
00773                    Scalar*& vals,
00774                    LO& numInds) const
00775   {
00776     if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
00777       colInds = NULL;
00778       vals = NULL;
00779       numInds = 0;
00780       return Teuchos::OrdinalTraits<LO>::invalid ();
00781     }
00782     else {
00783       const size_t absBlockOffsetStart = ptr_[localRowInd];
00784       colInds = ind_ + absBlockOffsetStart;
00785       vals = val_ + absBlockOffsetStart * offsetPerBlock ();
00786       numInds = ptr_[localRowInd + 1] - absBlockOffsetStart;
00787       return 0; // indicates no error
00788     }
00789   }
00790 
00791   template<class Scalar, class LO, class GO, class Node>
00792   void
00793   BlockCrsMatrix<Scalar, LO, GO, Node>::
00794   getLocalRowCopy (LO LocalRow,
00795                    const ArrayView<LO> &Indices,
00796                    const ArrayView<Scalar> &Values,
00797                    size_t &NumEntries) const
00798   {
00799     TEUCHOS_TEST_FOR_EXCEPTION(
00800       true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
00801       "getLocalRowCopy: Copying is not implemented. You should use a view.");
00802   }
00803 
00804   template<class Scalar, class LO, class GO, class Node>
00805   LO
00806   BlockCrsMatrix<Scalar, LO, GO, Node>::
00807   getLocalRowOffsets (const LO localRowInd,
00808                       ptrdiff_t offsets[],
00809                       const LO colInds[],
00810                       const LO numColInds) const
00811   {
00812     if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
00813       // We got no offsets, because the input local row index is
00814       // invalid on the calling process.  That may not be an error, if
00815       // numColInds is zero anyway; it doesn't matter.  This is the
00816       // advantage of returning the number of valid indices.
00817       return static_cast<LO> (0);
00818     }
00819 
00820     const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
00821     size_t hint = 0; // Guess for the relative offset into the current row
00822     LO validCount = 0; // number of valid column indices in colInds
00823 
00824     for (LO k = 0; k < numColInds; ++k) {
00825       const size_t relBlockOffset =
00826         findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
00827       if (relBlockOffset != STINV) {
00828         offsets[k] = relBlockOffset;
00829         hint = relBlockOffset + 1;
00830         ++validCount;
00831       }
00832     }
00833     return validCount;
00834   }
00835 
00836 
00837   template<class Scalar, class LO, class GO, class Node>
00838   LO
00839   BlockCrsMatrix<Scalar, LO, GO, Node>::
00840   replaceLocalValuesByOffsets (const LO localRowInd,
00841                                const ptrdiff_t offsets[],
00842                                const Scalar vals[],
00843                                const LO numOffsets) const
00844   {
00845     if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
00846       // We modified no values, because the input local row index is
00847       // invalid on the calling process.  That may not be an error, if
00848       // numColInds is zero anyway; it doesn't matter.  This is the
00849       // advantage of returning the number of valid indices.
00850       return static_cast<LO> (0);
00851     }
00852 
00853     const size_t absRowBlockOffset = ptr_[localRowInd];
00854     const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
00855     const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
00856     size_t pointOffset = 0; // Current offset into input values
00857     LO validCount = 0; // number of valid offsets
00858 
00859     for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
00860       const size_t relBlockOffset = offsets[k];
00861       if (relBlockOffset != STINV) {
00862         const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
00863         little_block_type A_old =
00864           getNonConstLocalBlockFromAbsOffset (absBlockOffset);
00865         const_little_block_type A_new =
00866           getConstLocalBlockFromInput (vals, pointOffset);
00867         A_old.assign (A_new);
00868         ++validCount;
00869       }
00870     }
00871     return validCount;
00872   }
00873 
00874 
00875   template<class Scalar, class LO, class GO, class Node>
00876   LO
00877   BlockCrsMatrix<Scalar, LO, GO, Node>::
00878   absMaxLocalValuesByOffsets (const LO localRowInd,
00879                               const ptrdiff_t offsets[],
00880                               const Scalar vals[],
00881                               const LO numOffsets) const
00882   {
00883     if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
00884       // We modified no values, because the input local row index is
00885       // invalid on the calling process.  That may not be an error, if
00886       // numColInds is zero anyway; it doesn't matter.  This is the
00887       // advantage of returning the number of valid indices.
00888       return static_cast<LO> (0);
00889     }
00890 
00891     const size_t absRowBlockOffset = ptr_[localRowInd];
00892     const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
00893     const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
00894     size_t pointOffset = 0; // Current offset into input values
00895     LO validCount = 0; // number of valid offsets
00896 
00897     for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
00898       const size_t relBlockOffset = offsets[k];
00899       if (relBlockOffset != STINV) {
00900         const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
00901         little_block_type A_old =
00902           getNonConstLocalBlockFromAbsOffset (absBlockOffset);
00903         const_little_block_type A_new =
00904           getConstLocalBlockFromInput (vals, pointOffset);
00905         A_old.absmax (A_new);
00906         ++validCount;
00907       }
00908     }
00909     return validCount;
00910   }
00911 
00912 
00913   template<class Scalar, class LO, class GO, class Node>
00914   LO
00915   BlockCrsMatrix<Scalar, LO, GO, Node>::
00916   sumIntoLocalValuesByOffsets (const LO localRowInd,
00917                                const ptrdiff_t offsets[],
00918                                const Scalar vals[],
00919                                const LO numOffsets) const
00920   {
00921     if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
00922       // We modified no values, because the input local row index is
00923       // invalid on the calling process.  That may not be an error, if
00924       // numColInds is zero anyway; it doesn't matter.  This is the
00925       // advantage of returning the number of valid indices.
00926       return static_cast<LO> (0);
00927     }
00928 
00929     const size_t absRowBlockOffset = ptr_[localRowInd];
00930     const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
00931     const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
00932     size_t pointOffset = 0; // Current offset into input values
00933     LO validCount = 0; // number of valid offsets
00934 
00935     for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
00936       const size_t relBlockOffset = offsets[k];
00937       if (relBlockOffset != STINV) {
00938         const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
00939         little_block_type A_old =
00940           getNonConstLocalBlockFromAbsOffset (absBlockOffset);
00941         const_little_block_type A_new =
00942           getConstLocalBlockFromInput (vals, pointOffset);
00943         A_old.update (STS::one (), A_new);
00944         ++validCount;
00945       }
00946     }
00947     return validCount;
00948   }
00949 
00950 
00951   template<class Scalar, class LO, class GO, class Node>
00952   size_t
00953   BlockCrsMatrix<Scalar, LO, GO, Node>::
00954   getNumEntriesInLocalRow (const LO localRowInd) const
00955   {
00956     const size_t numEntInGraph = graph_.getNumEntriesInLocalRow (localRowInd);
00957     if (numEntInGraph == Teuchos::OrdinalTraits<size_t>::invalid ()) {
00958       return static_cast<LO> (0); // the calling process doesn't have that row
00959     } else {
00960       return static_cast<LO> (numEntInGraph);
00961     }
00962   }
00963 
00964   template<class Scalar, class LO, class GO, class Node>
00965   void
00966   BlockCrsMatrix<Scalar, LO, GO, Node>::
00967   applyBlockTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
00968                    BlockMultiVector<Scalar, LO, GO, Node>& Y,
00969                    const Teuchos::ETransp mode,
00970                    const Scalar alpha,
00971                    const Scalar beta)
00972   {
00973     (void) X;
00974     (void) Y;
00975     (void) mode;
00976     (void) alpha;
00977     (void) beta;
00978 
00979     TEUCHOS_TEST_FOR_EXCEPTION(
00980       true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::apply: "
00981       "transpose and conjugate transpose modes are not yet implemented.");
00982   }
00983 
00984   template<class Scalar, class LO, class GO, class Node>
00985   void
00986   BlockCrsMatrix<Scalar, LO, GO, Node>::
00987   applyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
00988                      BlockMultiVector<Scalar, LO, GO, Node>& Y,
00989                      const Scalar alpha,
00990                      const Scalar beta)
00991   {
00992     using Teuchos::RCP;
00993     using Teuchos::rcp;
00994     typedef Tpetra::Import<LO, GO, Node> import_type;
00995     typedef Tpetra::Export<LO, GO, Node> export_type;
00996     const Scalar zero = STS::zero ();
00997     const Scalar one = STS::one ();
00998     RCP<const import_type> import = graph_.getImporter ();
00999     // "export" is a reserved C++ keyword, so we can't use it.
01000     RCP<const export_type> theExport = graph_.getExporter ();
01001 
01002     // FIXME (mfh 20 May 2014) X.mv_ and Y.mv_ requires a friend
01003     // declaration, which is useful only for debugging.
01004     TEUCHOS_TEST_FOR_EXCEPTION(
01005       X.mv_.getCopyOrView () != Teuchos::View, std::invalid_argument,
01006       "Tpetra::Experimental::BlockCrsMatrix::applyBlockNoTrans: "
01007       "The input BlockMultiVector X has deep copy semantics, "
01008       "not view semantics (as it should).");
01009     TEUCHOS_TEST_FOR_EXCEPTION(
01010       Y.mv_.getCopyOrView () != Teuchos::View, std::invalid_argument,
01011       "Tpetra::Experimental::BlockCrsMatrix::applyBlockNoTrans: "
01012       "The output BlockMultiVector Y has deep copy semantics, "
01013       "not view semantics (as it should).");
01014 
01015     if (alpha == zero) {
01016       if (beta == zero) {
01017         Y.putScalar (zero); // replace Inf or NaN (BLAS rules)
01018       } else if (beta != one) {
01019         Y.scale (beta);
01020       }
01021     } else { // alpha != 0
01022       const BMV* X_colMap = NULL;
01023       if (import.is_null ()) {
01024         try {
01025           X_colMap = &X;
01026         } catch (std::exception& e) {
01027           TEUCHOS_TEST_FOR_EXCEPTION(
01028             true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
01029             "applyBlockNoTrans:" << std::endl << "Tpetra::MultiVector::"
01030             "operator= threw an exception: " << std::endl << e.what ());
01031         }
01032       } else {
01033         // X_colMap_ is a pointer to a pointer to BMV.  Ditto for
01034         // Y_rowMap_ below.  This lets us do lazy initialization
01035         // correctly with view semantics of BlockCrsMatrix.  All views
01036         // of this BlockCrsMatrix have the same outer pointer.  That
01037         // way, we can set the inner pointer in one view, and all
01038         // other views will see it.
01039         if ((*X_colMap_).is_null () ||
01040             (**X_colMap_).getNumVectors () != X.getNumVectors () ||
01041             (**X_colMap_).getBlockSize () != X.getBlockSize ()) {
01042           *X_colMap_ = rcp (new BMV (* (graph_.getColMap ()), getBlockSize (),
01043                                      static_cast<LO> (X.getNumVectors ())));
01044         }
01045         (**X_colMap_).doImport (X, *import, Tpetra::REPLACE);
01046         try {
01047           X_colMap = &(**X_colMap_);
01048         } catch (std::exception& e) {
01049           TEUCHOS_TEST_FOR_EXCEPTION(
01050             true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
01051             "applyBlockNoTrans:" << std::endl << "Tpetra::MultiVector::"
01052             "operator= threw an exception: " << std::endl << e.what ());
01053         }
01054       }
01055 
01056       BMV* Y_rowMap = NULL;
01057       if (theExport.is_null ()) {
01058         Y_rowMap = &Y;
01059       } else if ((*Y_rowMap_).is_null () ||
01060                  (**Y_rowMap_).getNumVectors () != Y.getNumVectors () ||
01061                  (**Y_rowMap_).getBlockSize () != Y.getBlockSize ()) {
01062         *Y_rowMap_ = rcp (new BMV (* (graph_.getRowMap ()), getBlockSize (),
01063                                    static_cast<LO> (X.getNumVectors ())));
01064         try {
01065           Y_rowMap = &(**Y_rowMap_);
01066         } catch (std::exception& e) {
01067           TEUCHOS_TEST_FOR_EXCEPTION(
01068             true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
01069             "applyBlockNoTrans:" << std::endl << "Tpetra::MultiVector::"
01070             "operator= threw an exception: " << std::endl << e.what ());
01071         }
01072       }
01073 
01074       localApplyBlockNoTrans (*X_colMap, *Y_rowMap, alpha, beta);
01075 
01076       if (! theExport.is_null ()) {
01077         Y.doExport (*Y_rowMap, *theExport, Tpetra::REPLACE);
01078       }
01079     }
01080   }
01081 
01082   template<class Scalar, class LO, class GO, class Node>
01083   void
01084   BlockCrsMatrix<Scalar, LO, GO, Node>::
01085   localApplyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
01086                           BlockMultiVector<Scalar, LO, GO, Node>& Y,
01087                           const Scalar alpha,
01088                           const Scalar beta)
01089   {
01090     // If using (new) Kokkos, prefer Kokkos::ArithTraits to
01091     // Teuchos::ScalarTraits.
01092     const Scalar zero = STS::zero ();
01093     const Scalar one = STS::one ();
01094     const LO numLocalMeshRows =
01095       static_cast<LO> (rowMeshMap_.getNodeNumElements ());
01096     const LO numVecs = static_cast<LO> (X.getNumVectors ());
01097 
01098     // If using (new) Kokkos, replace localMem with thread-local
01099     // memory.  Note that for larger block sizes, this will affect the
01100     // two-level parallelization.  Look to Stokhos for best practice
01101     // on making this fast for GPUs.
01102     const LO blockSize = getBlockSize ();
01103     Teuchos::Array<Scalar> localMem (blockSize);
01104     little_vec_type Y_lcl (localMem.getRawPtr (), blockSize, 1);
01105 
01106     if (numVecs == 1) {
01107       for (LO lclRow = 0; lclRow < numLocalMeshRows; ++lclRow) {
01108         little_vec_type Y_cur = Y.getLocalBlock (lclRow, 0);
01109 
01110         if (beta == zero) {
01111           Y_lcl.fill (zero);
01112         } else if (beta == one) {
01113           Y_lcl.assign (Y_cur);
01114         } else {
01115           Y_lcl.assign (Y_cur);
01116           Y_lcl.scale (beta);
01117         }
01118 
01119         const size_t meshBeg = ptr_[lclRow];
01120         const size_t meshEnd = ptr_[lclRow+1];
01121         for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
01122           const LO meshCol = ind_[absBlkOff];
01123           const_little_block_type A_cur =
01124             getConstLocalBlockFromAbsOffset (absBlkOff);
01125           little_vec_type X_cur = X.getLocalBlock (meshCol, 0);
01126           // Y_lcl += alpha*A_cur*X_cur
01127           Y_lcl.matvecUpdate (alpha, A_cur, X_cur);
01128         } // for each entry in the current local row of the matrx
01129 
01130         Y_cur.assign (Y_lcl);
01131       } // for each local row of the matrix
01132     }
01133     else {
01134       for (LO lclRow = 0; lclRow < numLocalMeshRows; ++lclRow) {
01135         for (LO j = 0; j < numVecs; ++j) {
01136           little_vec_type Y_cur = Y.getLocalBlock (lclRow, j);
01137 
01138           if (beta == zero) {
01139             Y_lcl.fill (zero);
01140           } else if (beta == one) {
01141             Y_lcl.assign (Y_cur);
01142           } else {
01143             Y_lcl.assign (Y_cur);
01144             Y_lcl.scale (beta);
01145           }
01146 
01147           const size_t meshBeg = ptr_[lclRow];
01148           const size_t meshEnd = ptr_[lclRow+1];
01149           for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
01150             const LO meshCol = ind_[absBlkOff];
01151             const_little_block_type A_cur =
01152               getConstLocalBlockFromAbsOffset (absBlkOff);
01153             little_vec_type X_cur = X.getLocalBlock (meshCol, j);
01154             // Y_lcl += alpha*A_cur*X_cur
01155             Y_lcl.matvecUpdate (alpha, A_cur, X_cur);
01156           } // for each entry in the current local row of the matrix
01157 
01158           Y_cur.assign (Y_lcl);
01159         } // for each entry in the current row of Y
01160       } // for each local row of the matrix
01161     }
01162   }
01163 
01164   template<class Scalar, class LO, class GO, class Node>
01165   size_t
01166   BlockCrsMatrix<Scalar, LO, GO, Node>::
01167   findRelOffsetOfColumnIndex (const LO localRowIndex,
01168                               const LO colIndexToFind,
01169                               const size_t hint) const
01170   {
01171     const size_t absStartOffset = ptr_[localRowIndex];
01172     const size_t absEndOffset = ptr_[localRowIndex+1];
01173     const size_t numEntriesInRow = absEndOffset - absStartOffset;
01174 
01175     // If the hint was correct, then the hint is the offset to return.
01176     if (hint < numEntriesInRow && ind_[absStartOffset+hint] == colIndexToFind) {
01177       // Always return the offset relative to the current row.
01178       return hint;
01179     }
01180 
01181     // The hint was wrong, so we must search for the given column
01182     // index in the column indices for the given row.
01183     size_t relOffset = Teuchos::OrdinalTraits<size_t>::invalid ();
01184 
01185     // We require that the graph have sorted rows.  However, binary
01186     // search only pays if the current row is longer than a certain
01187     // amount.  We set this to 32, but you might want to tune this.
01188     const size_t maxNumEntriesForLinearSearch = 32;
01189     if (numEntriesInRow > maxNumEntriesForLinearSearch) {
01190       // Use binary search.  It would probably be better for us to
01191       // roll this loop by hand.  If we wrote it right, a smart
01192       // compiler could perhaps use conditional loads and avoid
01193       // branches (according to Jed Brown on May 2014).
01194       const LO* beg = ind_ + absStartOffset;
01195       const LO* end = ind_ + absEndOffset;
01196       std::pair<const LO*, const LO*> p =
01197         std::equal_range (beg, end, colIndexToFind);
01198       if (p.first != p.second) {
01199         // offset is relative to the current row
01200         relOffset = static_cast<size_t> (p.first - beg);
01201       }
01202     }
01203     else { // use linear search
01204       for (size_t k = 0; k < numEntriesInRow; ++k) {
01205         if (colIndexToFind == ind_[absStartOffset + k]) {
01206           relOffset = k; // offset is relative to the current row
01207           break;
01208         }
01209       }
01210     }
01211 
01212     return relOffset;
01213   }
01214 
01215   template<class Scalar, class LO, class GO, class Node>
01216   LO
01217   BlockCrsMatrix<Scalar, LO, GO, Node>::
01218   offsetPerBlock () const
01219   {
01220     const LO numRows = blockSize_;
01221 
01222     LO numCols = blockSize_;
01223     if (columnPadding_ > 0) { // Column padding == 0 means no padding.
01224       const LO numColsRoundedDown = (blockSize_ / columnPadding_) * columnPadding_;
01225       numCols = (numColsRoundedDown < numCols) ?
01226         (numColsRoundedDown + columnPadding_) :
01227         numColsRoundedDown;
01228     }
01229     return numRows * numCols;
01230   }
01231 
01232   template<class Scalar, class LO, class GO, class Node>
01233   typename BlockCrsMatrix<Scalar, LO, GO, Node>::const_little_block_type
01234   BlockCrsMatrix<Scalar, LO, GO, Node>::
01235   getConstLocalBlockFromInput (const Scalar* val, const size_t pointOffset) const
01236   {
01237     if (rowMajor_) {
01238       const size_t rowStride = (columnPadding_ == 0) ?
01239         static_cast<size_t> (blockSize_) : static_cast<size_t> (columnPadding_);
01240       return const_little_block_type (val + pointOffset, blockSize_, rowStride, 1);
01241     } else {
01242       return const_little_block_type (val + pointOffset, blockSize_, 1, blockSize_);
01243     }
01244   }
01245 
01246   template<class Scalar, class LO, class GO, class Node>
01247   typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
01248   BlockCrsMatrix<Scalar, LO, GO, Node>::
01249   getNonConstLocalBlockFromInput (Scalar* val, const size_t pointOffset) const
01250   {
01251     if (rowMajor_) {
01252       const size_t rowStride = (columnPadding_ == 0) ?
01253         static_cast<size_t> (blockSize_) : static_cast<size_t> (columnPadding_);
01254       return little_block_type (val + pointOffset, blockSize_, rowStride, 1);
01255     } else {
01256       return little_block_type (val + pointOffset, blockSize_, 1, blockSize_);
01257     }
01258   }
01259 
01260   template<class Scalar, class LO, class GO, class Node>
01261   typename BlockCrsMatrix<Scalar, LO, GO, Node>::const_little_block_type
01262   BlockCrsMatrix<Scalar, LO, GO, Node>::
01263   getConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const
01264   {
01265     if (absBlockOffset >= ptr_[rowMeshMap_.getNodeNumElements ()]) {
01266       // An empty block signifies an error.  We don't expect to see
01267       // this error in correct code, but it's helpful for avoiding
01268       // memory corruption in case there is a bug.
01269       return const_little_block_type (NULL, 0, 0, 0);
01270     } else {
01271       const size_t absPointOffset = absBlockOffset * offsetPerBlock ();
01272       return getConstLocalBlockFromInput (val_, absPointOffset);
01273     }
01274   }
01275 
01276   template<class Scalar, class LO, class GO, class Node>
01277   typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
01278   BlockCrsMatrix<Scalar, LO, GO, Node>::
01279   getNonConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const
01280   {
01281     if (absBlockOffset >= ptr_[rowMeshMap_.getNodeNumElements ()]) {
01282       // An empty block signifies an error.  We don't expect to see
01283       // this error in correct code, but it's helpful for avoiding
01284       // memory corruption in case there is a bug.
01285       return little_block_type (NULL, 0, 0, 0);
01286     } else {
01287       const size_t absPointOffset = absBlockOffset * offsetPerBlock ();
01288       return getNonConstLocalBlockFromInput (const_cast<Scalar*> (val_),
01289                                              absPointOffset);
01290     }
01291   }
01292 
01293   template<class Scalar, class LO, class GO, class Node>
01294   typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
01295   BlockCrsMatrix<Scalar, LO, GO, Node>::
01296   getLocalBlock (const LO localRowInd, const LO localColInd) const
01297   {
01298     const size_t absRowBlockOffset = ptr_[localRowInd];
01299 
01300     size_t hint = 0;
01301     const size_t relBlockOffset =
01302         findRelOffsetOfColumnIndex (localRowInd, localColInd, hint);
01303 
01304     if (relBlockOffset != Teuchos::OrdinalTraits<size_t>::invalid ()) {
01305       const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
01306       return getNonConstLocalBlockFromAbsOffset (absBlockOffset);
01307     }
01308     else
01309     {
01310       return little_block_type (NULL, 0, 0, 0);
01311     }
01312 
01313   }
01314 
01315 
01316   template<class Scalar, class LO, class GO, class Node>
01317   bool
01318   BlockCrsMatrix<Scalar, LO, GO, Node>::
01319   checkSizes (const Tpetra::SrcDistObject& source)
01320   {
01321     typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
01322     const this_type* src = dynamic_cast<const this_type* > (&source);
01323 
01324     // Clear out the current local error state.
01325     * (const_cast<this_type*> (this)->localError_) = false;
01326     *errs_ = Teuchos::null;
01327 
01328     // We don't allow block sizes to be inconsistent across processes,
01329     // but it's possible due to user error.  It costs an all-reduce to
01330     // check in the constructor; we want to save that.
01331     if (src == NULL ||
01332         src->getBlockSize () != this->getBlockSize () ||
01333         ! src->graph_.isFillComplete () ||
01334         ! this->graph_.isFillComplete () ||
01335         src->graph_.getColMap ().is_null () ||
01336         this->graph_.getColMap ().is_null ()) {
01337       * (const_cast<this_type*> (this)->localError_) = true;
01338       if ((*errs_).is_null ()) {
01339         *errs_ = Teuchos::rcp (new std::ostringstream ());
01340       }
01341     }
01342 
01343     if (src == NULL) {
01344       **errs_ << "checkSizes: The source object of the Import or Export "
01345         "must be a BlockCrsMatrix with the same template parameters as the "
01346         "target object." << std::endl;
01347     }
01348     else {
01349       // Use a string of ifs, not if-elseifs, because we want to know
01350       // all the errors.
01351       if (src->getBlockSize () != this->getBlockSize ()) {
01352         **errs_ << "checkSizes: The source and target objects of the Import or "
01353                << "Export must have the same block sizes.  The source's block "
01354                << "size = " << src->getBlockSize () << " != the target's block "
01355                << "size = " << this->getBlockSize () << "." << std::endl;
01356       }
01357       if (! src->graph_.isFillComplete ()) {
01358         **errs_ << "checkSizes: The source object of the Import or Export is "
01359           "not fill complete.  Both source and target objects must be fill "
01360           "complete." << std::endl;
01361       }
01362       if (! this->graph_.isFillComplete ()) {
01363         **errs_ << "checkSizes: The target object of the Import or Export is "
01364           "not fill complete.  Both source and target objects must be fill "
01365           "complete." << std::endl;
01366       }
01367       if (src->graph_.getColMap ().is_null ()) {
01368         **errs_ << "checkSizes: The source object of the Import or Export does "
01369           "not have a column Map.  Both source and target objects must have "
01370           "column Maps." << std::endl;
01371       }
01372       if (this->graph_.getColMap ().is_null ()) {
01373         **errs_ << "checkSizes: The target object of the Import or Export does "
01374           "not have a column Map.  Both source and target objects must have "
01375           "column Maps." << std::endl;
01376       }
01377     }
01378 
01379     return ! (* (this->localError_));
01380   }
01381 
01382   template<class Scalar, class LO, class GO, class Node>
01383   void
01384   BlockCrsMatrix<Scalar, LO, GO, Node>::
01385   copyAndPermute (const Tpetra::SrcDistObject& source,
01386                   size_t numSameIDs,
01387                   const Teuchos::ArrayView<const LO>& permuteToLIDs,
01388                   const Teuchos::ArrayView<const LO>& permuteFromLIDs)
01389   {
01390     typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
01391     const this_type* src = dynamic_cast<const this_type* > (&source);
01392     if (src == NULL) {
01393       * (this->localError_) = true;
01394       if ((*errs_).is_null ()) {
01395         *errs_ = Teuchos::rcp (new std::ostringstream ());
01396       }
01397       **errs_ << "copyAndPermute: The source object of the Import or Export is "
01398         "either not a BlockCrsMatrix, or does not have the right template "
01399         "parameters.  checkSizes() should have caught this.  "
01400         "Please report this bug to the Tpetra developers." << std::endl;
01401       // There's no communication in this method, so it's safe just to
01402       // return on error.
01403       return;
01404     }
01405 
01406     bool srcInvalidRow = false;
01407     bool dstInvalidCol = false;
01408 
01409     // Copy the initial sequence of rows that are the same.
01410     //
01411     // The two graphs might have different column Maps, so we need to
01412     // do this using global column indices.  This is purely local, so
01413     // we only need to check for local sameness of the two column
01414     // Maps.
01415 
01416     const map_type& srcMap = * (src->graph_.getColMap ());
01417     const map_type& tgtMap = * (this->graph_.getColMap ());
01418     const bool canUseLocalColumnIndices = srcMap.locallySameAs (tgtMap);
01419     const size_t numPermute =
01420       std::min (permuteToLIDs.size (), permuteFromLIDs.size ());
01421 
01422     if (canUseLocalColumnIndices) {
01423       // Copy local rows that are the "same" in both source and target.
01424       for (size_t localRow = 0; localRow < numSameIDs; ++localRow) {
01425         const LO* lclSrcCols;
01426         Scalar* vals;
01427         LO numEntries;
01428         // If this call fails, that means the mesh row local index is
01429         // invalid.  That means the Import or Export is invalid somehow.
01430         LO err = src->getLocalRowView (localRow, lclSrcCols, vals, numEntries);
01431         if (err != 0) {
01432           srcInvalidRow = true;
01433         }
01434         else {
01435           err = this->replaceLocalValues (localRow, lclSrcCols, vals, numEntries);
01436           if (err != numEntries) {
01437             dstInvalidCol = true;
01438           }
01439         }
01440       }
01441 
01442       // Copy the "permute" local rows.
01443       for (size_t k = 0; k < numPermute; ++k) {
01444         const LO* lclSrcCols;
01445         Scalar* vals;
01446         LO numEntries;
01447         LO err = src->getLocalRowView (permuteFromLIDs[k], lclSrcCols, vals, numEntries);
01448         if (err != 0) {
01449           srcInvalidRow = true;
01450         }
01451         else {
01452           err = this->replaceLocalValues (permuteFromLIDs[k], lclSrcCols, vals, numEntries);
01453           if (err != numEntries) {
01454             dstInvalidCol = true;
01455           }
01456         }
01457       }
01458     }
01459     else {
01460       // Reserve space to store the destination matrix's local column indices.
01461       Teuchos::Array<LO> lclDstCols (src->graph_.getNodeMaxNumRowEntries ());
01462 
01463       // Copy local rows that are the "same" in both source and target.
01464       for (size_t localRow = 0; localRow < numSameIDs; ++localRow) {
01465         const LO* lclSrcCols;
01466         Scalar* vals;
01467         LO numEntries;
01468         // If this call fails, that means the mesh row local index is
01469         // invalid.  That means the Import or Export is invalid somehow.
01470         LO err = src->getLocalRowView (localRow, lclSrcCols, vals, numEntries);
01471         if (err != 0) {
01472           srcInvalidRow = true;
01473         }
01474         else {
01475           // Convert the source matrix's local column indices to the
01476           // destination matrix's local column indices.
01477           Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
01478           for (LO k = 0; k < numEntries; ++k) {
01479             lclDstColsView[k] = tgtMap.getLocalElement (srcMap.getGlobalElement (lclSrcCols[k]));
01480           }
01481           err = this->replaceLocalValues (localRow, lclDstColsView.getRawPtr (), vals, numEntries);
01482           if (err != numEntries) {
01483             dstInvalidCol = true;
01484           }
01485         }
01486       }
01487 
01488       // Copy the "permute" local rows.
01489       for (size_t k = 0; k < numPermute; ++k) {
01490         const LO* lclSrcCols;
01491         Scalar* vals;
01492         LO numEntries;
01493         LO err = src->getLocalRowView (permuteFromLIDs[k], lclSrcCols, vals, numEntries);
01494         if (err != 0) {
01495           srcInvalidRow = true;
01496         }
01497         else {
01498           // Convert the source matrix's local column indices to the
01499           // destination matrix's local column indices.
01500           Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
01501           for (LO j = 0; j < numEntries; ++j) {
01502             lclDstColsView[j] = tgtMap.getLocalElement (srcMap.getGlobalElement (lclSrcCols[j]));
01503           }
01504           err = this->replaceLocalValues (permuteFromLIDs[k], lclDstColsView.getRawPtr (), vals, numEntries);
01505           if (err != numEntries) {
01506             dstInvalidCol = true;
01507           }
01508         }
01509       }
01510     }
01511 
01512     // FIXME (mfh 19 May 2014) Would be great to have a way to report
01513     // errors, without throwing an exception.  The latter would be bad
01514     // in the case of differing block sizes, just in case those block
01515     // sizes are inconsistent across processes.  (The latter is not
01516     // allowed, but possible due to user error.)
01517 
01518     if (srcInvalidRow || dstInvalidCol) {
01519       * (this->localError_) = true;
01520       if ((*errs_).is_null ()) {
01521         *errs_ = Teuchos::rcp (new std::ostringstream ());
01522       }
01523       **errs_ << "copyAndPermute: The graph structure of the source of the "
01524         "Import or Export must be a subset of the graph structure of the "
01525         "target." << std::endl;
01526     }
01527   }
01528 
01529   template<class Scalar, class LO, class GO, class Node>
01530   void
01531   BlockCrsMatrix<Scalar, LO, GO, Node>::
01532   packAndPrepare (const Tpetra::SrcDistObject& source,
01533                   const Teuchos::ArrayView<const LO>& exportLIDs,
01534                   Teuchos::Array<packet_type>& exports,
01535                   const Teuchos::ArrayView<size_t>& numPacketsPerLID,
01536                   size_t& constantNumPackets,
01537                   Tpetra::Distributor& /* distor */)
01538   {
01539     typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
01540     const bool debug = true;
01541     const this_type* src = dynamic_cast<const this_type* > (&source);
01542 
01543     // Should have checked for this case in checkSizes().
01544     TEUCHOS_TEST_FOR_EXCEPTION(
01545       src == NULL, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
01546       "packAndPrepare: The source object of the Import or Export is either "
01547       "not a BlockCrsMatrix, or does not have the right template parameters.  "
01548       "checkSizes() should have caught this.  "
01549       "Please report this bug to the Tpetra developers.");
01550 
01551     if (debug) {
01552       const int myRank = graph_.getComm ()->getRank ();
01553       std::ostringstream os;
01554       os << "Proc " << myRank << ": packAndPrepare" << std::endl;
01555       std::cerr << os.str ();
01556     }
01557 
01558     const crs_graph_type& srcGraph = src->graph_;
01559     const size_t blockSize = static_cast<size_t> (src->getBlockSize ());
01560     const size_t bytesPerBlock = blockSize * blockSize * sizeof (Scalar);
01561 
01562     // Graphs and matrices are allowed to have a variable number of
01563     // entries per row.  We could test whether all rows have the same
01564     // number of entries, but DistObject can only use this
01565     // optimization if all rows on _all_ processes have the same
01566     // number of entries.  Rather than do the all-reduce necessary to
01567     // test for this unlikely case, we tell DistObject (by setting
01568     // constantNumPackets to zero) to assume that different rows may
01569     // have different numbers of entries.
01570     constantNumPackets = 0;
01571 
01572     // Count the number of packets per row.  The number of packets is
01573     // the number of entries to send.  However, we don't pack entries
01574     // contiguously.  Instead, we put the column indices first, and
01575     // then all the block values.  This makes it easier to align the
01576     // latter to sizeof (Scalar), and reduces the number of memcpy
01577     // operations necessary to copy in the row's data.  (We have to
01578     // use memcpy because of ANSI C(++) aliasing rules, which
01579     // compilers now enforce.)  We pack global column indices, because
01580     // the source and target matrices are allowed to have different
01581     // column Maps.
01582 
01583     // Byte alignment of the start of the block values in each row's
01584     // data.  We use the max in case sizeof(Scalar) < sizeof(GO),
01585     // e.g., float vs. long long.
01586     const size_t alignment = std::max (sizeof (Scalar), sizeof (GO));
01587 
01588     // While counting the number of packets per row, compute the total
01589     // required send buffer ('exports') size, so we can resize it if
01590     // needed.  Also count the total number of entries to send, as a
01591     // sanity check for the total buffer size.
01592     size_t totalBufSize = 0;
01593     size_t totalNumEnt = 0;
01594 
01595     for (size_t k = 0; k < static_cast<size_t> (exportLIDs.size ()); ++k) {
01596       const LO localRow = exportLIDs[k];
01597       const size_t numEnt = srcGraph.getNumEntriesInLocalRow (localRow);
01598       totalNumEnt += numEnt;
01599 
01600       // If any given LIDs are invalid, the above might return either
01601       // zero or the invalid size_t value.  If the former, we have no
01602       // way to tell, but that's OK; it just means the calling process
01603       // won't pack anything (it has nothing to pack anyway).  If the
01604       // latter, we replace it with zero (that row is not owned by the
01605       // calling process, so it has no entries to pack), and set the
01606       // local error flag.
01607       if (numEnt == Teuchos::OrdinalTraits<size_t>::invalid ()) {
01608         numPacketsPerLID[localRow] = static_cast<size_t> (0);
01609         * (this->localError_) = true;
01610       } else {
01611         numPacketsPerLID[localRow] = numEnt;
01612       }
01613 
01614       // Space for the global column indices in this row.
01615       totalBufSize += numEnt * sizeof (GO);
01616 
01617       // Padding, so that the block values are Scalar aligned.
01618       {
01619         const size_t rem = (numEnt * sizeof (GO)) % alignment;
01620         const size_t padding = (rem == 0) ?
01621           static_cast<size_t> (0) :
01622           alignment - rem;
01623         totalBufSize += padding;
01624       }
01625 
01626       // Space for the block values in this row.  We don't include
01627       // padding for vectorization when packing the blocks.
01628       totalBufSize += numEnt * bytesPerBlock;
01629 
01630       // In case sizeof(Scalar) < sizeof(GO) (e.g., float vs. long
01631       // long), we pad the end of the send buffer so that the next row
01632       // is aligned to sizeof(GO) bytes.
01633       {
01634         const size_t rem = (numEnt * bytesPerBlock) % sizeof (GO);
01635         const size_t padding = (rem == 0) ?
01636           static_cast<size_t> (0) :
01637           sizeof (GO) - rem;
01638         totalBufSize += padding;
01639       }
01640     }
01641 
01642     if (debug) {
01643       const int myRank = graph_.getComm ()->getRank ();
01644       std::ostringstream os;
01645       os << "Proc " << myRank << ": packAndPrepare: totalBufSize = "
01646          << totalBufSize << std::endl;
01647       std::cerr << os.str ();
01648     }
01649 
01650     {
01651       const size_t minSaneBufSize = totalNumEnt * (bytesPerBlock + sizeof (GO));
01652       // If the test triggers, there's no point in setting the error
01653       // flag here, because the above code is completely broken.  This
01654       // probably means that the code below is broken as well.
01655       TEUCHOS_TEST_FOR_EXCEPTION(
01656         minSaneBufSize < totalBufSize, std::logic_error, "Tpetra::Experimental"
01657         "::BlockCrsMatrix: This method must have computed the total send buffer"
01658         " size incorrectly.  The minimum size in bytes without padding for "
01659         "alignment is " << minSaneBufSize << ", but this method computed the "
01660         "size in bytes _with_ padding for alignment as a lesser number " <<
01661         totalBufSize << ".  Please report this bug to the Tpetra developers.");
01662     }
01663 
01664     // packAndPrepare is responsible for resizing the send buffer.
01665     exports.resize (totalBufSize);
01666 
01667     // In the loop below: Current offset in bytes into the 'exports'
01668     // array; where to start putting the current row's data.
01669     size_t exportsOffset = 0;
01670 
01671     // Temporary space for global column indices.
01672     Teuchos::Array<GO> gblColIndsSpace (srcGraph.getNodeMaxNumRowEntries ());
01673 
01674     // Temporary row-major contiguous buffer for a block's entries.
01675     Teuchos::Array<Scalar> tempBlockSpace (blockSize * blockSize);
01676     little_block_type tempBlock (tempBlockSpace.getRawPtr (), blockSize, blockSize, 1);
01677 
01678     // Source matrix's column Map.  We verified in checkSizes() that
01679     // the column Map exists (is not null).
01680     const map_type& srcColMap = * (srcGraph.getColMap ());
01681 
01682     // Pack the data for each row to send, into the 'exports' buffer.
01683     // If any given LIDs are invalid, we pack obviously invalid data
01684     // (e.g., invalid column indices) into the buffer for that LID,
01685     // and set the local error flag.
01686     for (size_t lidInd = 0; lidInd < static_cast<size_t> (exportLIDs.size ()); ++lidInd) {
01687       // Get a view of row exportLIDs[lidInd].
01688       const LO lclRowInd = exportLIDs[lidInd];
01689       const LO* lclColInds;
01690       Scalar* vals;
01691       LO numEnt;
01692       const int err = src->getLocalRowView (lclRowInd, lclColInds, vals, numEnt);
01693       if (err != 0) {
01694         * (localError_) = true;
01695         // TODO (mfh 20 May 2014) Report the local error, without
01696         // printing a line for each bad LID.  It might help to collect
01697         // all the bad LIDs, but don't print them all if there are too
01698         // many.
01699         continue;
01700       }
01701 
01702       // Convert column indices from local to global.
01703       Teuchos::ArrayView<GO> gblColInds = gblColIndsSpace.view (0, numEnt);
01704       for (size_t j = 0; j < static_cast<size_t> (numEnt); ++j) {
01705         gblColInds[j] = srcColMap.getGlobalElement (lclColInds[j]);
01706       }
01707 
01708       // Pack the column indices.  Use memcpy to follow ANSI aliasing rules.
01709       packet_type* exportsStart = &exports[exportsOffset];
01710       memcpy (exportsStart, gblColInds.getRawPtr (), numEnt * sizeof (GO));
01711       exportsOffset += numEnt * sizeof (GO);
01712 
01713       // Compute padding so that block values are Scalar aligned.
01714       {
01715         const size_t rem = (numEnt * sizeof (GO)) % alignment;
01716         const size_t padding = (rem == 0) ?
01717           static_cast<size_t> (0) :
01718           alignment - rem;
01719         exportsOffset += padding;
01720       }
01721 
01722       // Pack the block values in this row.  Always pack row major,
01723       // for consistency, in case the source and target objects order
01724       // their blocks differently.
01725       //
01726       // In order to follow ANSI aliasing rules that forbid certain
01727       // kinds of type punning, we copy each block's entries first
01728       // into a temporary contiguous buffer, and then memcpy into the
01729       // send buffer.  We could just memcpy one entry at a time, but
01730       // it's probably faster to avoid the library call.
01731       const size_t rowStart = src->ptr_[lclRowInd];
01732       for (size_t j = 0; j < static_cast<size_t> (numEnt); ++j) {
01733         const_little_block_type srcBlock =
01734           src->getConstLocalBlockFromAbsOffset (rowStart + j);
01735         if (static_cast<size_t> (srcBlock.getBlockSize ()) == blockSize) {
01736           // A block size of zero is a possible error state.  Of
01737           // course, the actual block size could be zero.  That would
01738           // be silly, but why shouldn't it be legal?  That's why we
01739           // check whether the actual block size is different.
01740           * (localError_) = true;
01741           // Pack a block of zeros.  It might make sense to pack NaNs,
01742           // if Scalar implements a NaN value.
01743           tempBlock.fill (STS::zero ());
01744         } else {
01745           tempBlock.assign (srcBlock);
01746         }
01747         memcpy (&exports[exportsOffset], tempBlock.getRawPtr (), bytesPerBlock);
01748         exportsOffset += bytesPerBlock;
01749       }
01750 
01751       // In case sizeof(Scalar) < sizeof(GO) (e.g., float vs. long
01752       // long), we pad the end of the send buffer so that the next row
01753       // is aligned to sizeof(GO) bytes.
01754       {
01755         const size_t rem = (numEnt * bytesPerBlock) % sizeof (GO);
01756         const size_t padding = (rem == 0) ?
01757           static_cast<size_t> (0) :
01758           sizeof (GO) - rem;
01759         exportsOffset += padding;
01760       }
01761     } // for each LID (of a row) to send
01762 
01763 
01764     {
01765       const int myRank = graph_.getComm ()->getRank ();
01766       std::ostringstream os;
01767       os << "Proc " << myRank << ": packAndPrepare done" << std::endl;
01768       std::cerr << os.str ();
01769     }
01770   }
01771 
01772 
01773   template<class Scalar, class LO, class GO, class Node>
01774   void
01775   BlockCrsMatrix<Scalar, LO, GO, Node>::
01776   unpackAndCombine (const Teuchos::ArrayView<const LO> &importLIDs,
01777                     const Teuchos::ArrayView<const packet_type> &imports,
01778                     const Teuchos::ArrayView<size_t> &numPacketsPerLID,
01779                     size_t /* constantNumPackets */, // not worthwhile to use this
01780                     Tpetra::Distributor& /* distor */,
01781                     Tpetra::CombineMode CM)
01782   {
01783     if (CM != ADD && CM != INSERT && CM != REPLACE && CM != ABSMAX && CM != ZERO) {
01784       * (this->localError_) = true;
01785       if ((*errs_).is_null ()) {
01786         *errs_ = Teuchos::rcp (new std::ostringstream ());
01787         **errs_ << "unpackAndCombine: Invalid CombineMode value " << CM << ".  "
01788                 << "Valid values include ADD, INSERT, REPLACE, ABSMAX, and ZERO."
01789                 << std::endl;
01790         // It won't cause deadlock to return here, since this method
01791         // does not communicate.
01792         return;
01793       }
01794     }
01795 
01796     if (CM == ZERO) {
01797       return; // nothing to do; no need to combine entries
01798     }
01799 
01800     const size_t blockSize = this->getBlockSize ();
01801     const size_t bytesPerBlock = blockSize * blockSize * sizeof (Scalar);
01802     const size_t numImportLids = importLIDs.size ();
01803 
01804     // Byte alignment of the start of the block values in each row's
01805     // data.  We use the max in case sizeof(Scalar) < sizeof(GO),
01806     // e.g., float vs. long long.
01807     const size_t alignment = std::max (sizeof (Scalar), sizeof (GO));
01808 
01809     // Temporary space to cache local and global column indices.
01810     // Column indices come in as global indices, in case the source
01811     // object's column Map differs from the target object's (this's)
01812     // column Map.  We need the latter space in order to follow ANSI
01813     // aliasing rules (we don't want to type pun the buffer).
01814     Teuchos::Array<LO> lclColIndsSpace (graph_.getNodeMaxNumRowEntries ());
01815     Teuchos::Array<GO> gblColIndsSpace (graph_.getNodeMaxNumRowEntries ());
01816 
01817     // Temporary contiguous buffer for a row's block entries.
01818     Teuchos::Array<Scalar> tempVals (graph_.getNodeMaxNumRowEntries ());
01819 
01820     // Target matrix's column Map.  Use to convert the global column
01821     // indices in the receive buffer to local indices.  We verified in
01822     // checkSizes() that the column Map exists (is not null).
01823     const map_type& tgtColMap = * (this->graph_.getColMap ());
01824 
01825     // In the loop below: Current offset (in bytes) into the 'imports'
01826     // array; from whence to unpack data for the current LID (row).
01827     size_t importsOffset = 0;
01828 
01829     for (size_t lidInd = 0; lidInd < numImportLids; ++lidInd) {
01830       const size_t numEnt = numPacketsPerLID[lidInd]; // # entries in row
01831       const LO lclRowInd = importLIDs[lidInd]; // current local row index
01832 
01833       // Lengths and offsets for packed data in the 'imports' array.
01834       //
01835       // We pack all column indices first, then all values.  We pad
01836       // the former so that the latter is aligned to sizeof(Scalar).
01837       // 'padding' gives the size in bytes of the padding.
01838       const size_t numIndexBytes = numEnt * sizeof (GO);
01839       const size_t numBlockBytes = numEnt * bytesPerBlock;
01840 
01841       // Views for storing global and local column indices.
01842       Teuchos::ArrayView<GO> gblColInds = gblColIndsSpace.view (0, numEnt);
01843       Teuchos::ArrayView<LO> lclColInds = lclColIndsSpace.view (0, numEnt);
01844 
01845       // Unpack the global column indices from the receive buffer.
01846       memcpy (gblColInds.getRawPtr (), &imports[importsOffset], numIndexBytes);
01847       // Convert global column indices to local.
01848       for (size_t j = 0; j < numEnt; ++j) {
01849         lclColInds[j] = tgtColMap.getLocalElement (gblColInds[j]);
01850       }
01851 
01852       // Update the byte offset into the receive buffer.
01853       importsOffset += numIndexBytes;
01854       // Block values are aligned to max(sizeof(Scalar), sizeof(GO)).
01855       // Update the byte offset to account for padding to this alignment.
01856       {
01857         const size_t rem = numIndexBytes % alignment;
01858         const size_t padding = (rem == 0) ?
01859           static_cast<size_t> (0) :
01860           alignment - rem;
01861         importsOffset += padding;
01862       }
01863 
01864       // Copy out data for _all_ the blocks.  We memcpy into temp
01865       // storage, in order to follow ANSI aliasing rules that forbid
01866       // certain kinds of type punning.
01867       memcpy (tempVals.getRawPtr (), &imports[importsOffset], numBlockBytes);
01868       // Update the byte offset into the receive buffer.
01869       importsOffset += numBlockBytes;
01870       // In case sizeof(Scalar) < sizeof(GO) (e.g., float vs. long
01871       // long), the receive buffer was padded so that the next row is
01872       // aligned to sizeof(GO) bytes.
01873       const size_t rem = numBlockBytes % sizeof (GO);
01874       const size_t padding = (rem == 0) ?
01875         static_cast<size_t> (0) :
01876         sizeof (GO) - rem;
01877       importsOffset += padding;
01878 
01879       // Combine the incoming data with the matrix's current data.
01880       LO successCount = 0;
01881       if (CM == ADD) {
01882         successCount =
01883           this->sumIntoLocalValues (lclRowInd, lclColInds.getRawPtr (),
01884                                     tempVals.getRawPtr (), numEnt);
01885       } else if (CM == INSERT || CM == REPLACE) {
01886         successCount =
01887           this->replaceLocalValues (lclRowInd, lclColInds.getRawPtr (),
01888                                     tempVals.getRawPtr (), numEnt);
01889       } else if (CM == ABSMAX) {
01890         successCount =
01891           this->absMaxLocalValues (lclRowInd, lclColInds.getRawPtr (),
01892                                    tempVals.getRawPtr (), numEnt);
01893       }
01894       // We've already checked that CM is valid.
01895 
01896       if (static_cast<size_t> (successCount) != numEnt) {
01897         * (localError_) = true;
01898       }
01899     }
01900   }
01901 
01902 
01903   template<class Scalar, class LO, class GO, class Node>
01904   std::string
01905   BlockCrsMatrix<Scalar, LO, GO, Node>::description () const
01906   {
01907     using Teuchos::TypeNameTraits;
01908     std::ostringstream os;
01909     os << "\"Tpetra::BlockCrsMatrix\": { "
01910        << "Template parameters: { "
01911        << "Scalar: " << TypeNameTraits<Scalar>::name ()
01912        << "LO: " << TypeNameTraits<LO>::name ()
01913        << "GO: " << TypeNameTraits<GO>::name ()
01914        << "Node: " << TypeNameTraits<Node>::name ()
01915        << " }"
01916        << ", Label: \"" << this->getObjectLabel () << "\""
01917        << ", Global dimensions: ["
01918        << graph_.getDomainMap ()->getGlobalNumElements () << ", "
01919        << graph_.getRangeMap ()->getGlobalNumElements () << "]"
01920        << ", Block size: " << getBlockSize ()
01921        << " }";
01922     return os.str ();
01923   }
01924 
01925 
01926   template<class Scalar, class LO, class GO, class Node>
01927   void
01928   BlockCrsMatrix<Scalar, LO, GO, Node>::
01929   describe (Teuchos::FancyOStream& out,
01930             const Teuchos::EVerbosityLevel verbLevel) const
01931   {
01932     using Teuchos::ArrayRCP;
01933     using Teuchos::CommRequest;
01934     using Teuchos::FancyOStream;
01935     using Teuchos::getFancyOStream;
01936     using Teuchos::ireceive;
01937     using Teuchos::isend;
01938     using Teuchos::outArg;
01939     using Teuchos::TypeNameTraits;
01940     using Teuchos::VERB_DEFAULT;
01941     using Teuchos::VERB_NONE;
01942     using Teuchos::VERB_LOW;
01943     // using Teuchos::VERB_MEDIUM;
01944     // using Teuchos::VERB_HIGH;
01945     using Teuchos::VERB_EXTREME;
01946     using Teuchos::RCP;
01947     using Teuchos::wait;
01948     using std::endl;
01949 
01950     // Set default verbosity if applicable.
01951     const Teuchos::EVerbosityLevel vl =
01952       (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
01953 
01954     if (vl == VERB_NONE) {
01955       return; // print nothing
01956     }
01957 
01958     // describe() always starts with a tab before it prints anything.
01959     Teuchos::OSTab tab0 (out);
01960 
01961     out << "\"Tpetra::BlockCrsMatrix\":" << endl;
01962     Teuchos::OSTab tab1 (out);
01963 
01964     out << "Template parameters:" << endl;
01965     {
01966       Teuchos::OSTab tab2 (out);
01967       out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
01968           << "LO: " << TypeNameTraits<LO>::name () << endl
01969           << "GO: " << TypeNameTraits<GO>::name () << endl
01970           << "Node: " << TypeNameTraits<Node>::name () << endl;
01971     }
01972     out << "Label: \"" << this->getObjectLabel () << "\"" << endl
01973         << "Global dimensions: ["
01974         << graph_.getDomainMap ()->getGlobalNumElements () << ", "
01975         << graph_.getRangeMap ()->getGlobalNumElements () << "]" << endl;
01976 
01977     const LO blockSize = getBlockSize ();
01978     out << "Block size: " << blockSize << endl;
01979 
01980     if (vl >= VERB_EXTREME) {
01981       const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
01982       const int myRank = comm.getRank ();
01983       const int numProcs = comm.getSize ();
01984 
01985       // Print the calling process' data to the given output stream.
01986       RCP<std::ostringstream> lclOutStrPtr (new std::ostringstream ());
01987       RCP<FancyOStream> osPtr = getFancyOStream (lclOutStrPtr);
01988       FancyOStream& os = *osPtr;
01989       os << "Process " << myRank << ":" << endl;
01990       Teuchos::OSTab tab2 (os);
01991 
01992       const map_type& meshRowMap = * (graph_.getRowMap ());
01993       const map_type& meshColMap = * (graph_.getColMap ());
01994       for (LO meshLclRow = meshRowMap.getMinLocalIndex ();
01995            meshLclRow <= meshRowMap.getMaxLocalIndex ();
01996            ++meshLclRow) {
01997         const GO meshGblRow = meshRowMap.getGlobalElement (meshLclRow);
01998         os << "Row " << meshGblRow << ": {";
01999 
02000         const LO* lclColInds = NULL;
02001         Scalar* vals = NULL;
02002         LO numInds = 0;
02003         this->getLocalRowView (meshLclRow, lclColInds, vals, numInds);
02004 
02005         for (LO k = 0; k < numInds; ++k) {
02006           const GO gblCol = meshColMap.getGlobalElement (lclColInds[k]);
02007 
02008           os << "Col " << gblCol << ": [";
02009           for (LO i = 0; i < blockSize; ++i) {
02010             for (LO j = 0; j < blockSize; ++j) {
02011               os << vals[blockSize*blockSize*k + i*blockSize + j];
02012               if (j + 1 < blockSize) {
02013                 os << ", ";
02014               }
02015             }
02016             if (i + 1 < blockSize) {
02017               os << "; ";
02018             }
02019           }
02020           os << "]";
02021           if (k + 1 < numInds) {
02022             os << ", ";
02023           }
02024         }
02025         os << "}" << endl;
02026       }
02027 
02028       // Print data on Process 0.  This will automatically respect the
02029       // current indentation level.
02030       if (myRank == 0) {
02031         out << lclOutStrPtr->str ();
02032         lclOutStrPtr = Teuchos::null; // clear it to save space
02033       }
02034 
02035       const int sizeTag = 1337;
02036       const int dataTag = 1338;
02037 
02038       ArrayRCP<char> recvDataBuf; // only used on Process 0
02039 
02040       // Send string sizes and data from each process in turn to
02041       // Process 0, and print on that process.
02042       for (int p = 1; p < numProcs; ++p) {
02043         if (myRank == 0) {
02044           // Receive the incoming string's length.
02045           ArrayRCP<size_t> recvSize (1);
02046           recvSize[0] = 0;
02047           RCP<CommRequest<int> > recvSizeReq =
02048             ireceive<int, size_t> (recvSize, p, sizeTag, comm);
02049           wait<int> (comm, outArg (recvSizeReq));
02050           const size_t numCharsToRecv = recvSize[0];
02051 
02052           // Allocate space for the string to receive.  Reuse receive
02053           // buffer space if possible.  We can do this because in the
02054           // current implementation, we only have one receive in
02055           // flight at a time.  Leave space for the '\0' at the end,
02056           // in case the sender doesn't send it.
02057           if (static_cast<size_t>(recvDataBuf.size()) < numCharsToRecv + 1) {
02058             recvDataBuf.resize (numCharsToRecv + 1);
02059           }
02060           ArrayRCP<char> recvData = recvDataBuf.persistingView (0, numCharsToRecv);
02061           // Post the receive of the actual string data.
02062           RCP<CommRequest<int> > recvDataReq =
02063             ireceive<int, char> (recvData, p, dataTag, comm);
02064           wait<int> (comm, outArg (recvDataReq));
02065 
02066           // Print the received data.  This will respect the current
02067           // indentation level.  Make sure that the string is
02068           // null-terminated.
02069           recvDataBuf[numCharsToRecv] = '\0';
02070           out << recvDataBuf.getRawPtr ();
02071         }
02072         else if (myRank == p) { // if I am not Process 0, and my rank is p
02073           // This deep-copies the string at most twice, depending on
02074           // whether std::string reference counts internally (it
02075           // generally does, so this won't deep-copy at all).
02076           const std::string stringToSend = lclOutStrPtr->str ();
02077           lclOutStrPtr = Teuchos::null; // clear original to save space
02078 
02079           // Send the string's length to Process 0.
02080           const size_t numCharsToSend = stringToSend.size ();
02081           ArrayRCP<size_t> sendSize (1);
02082           sendSize[0] = numCharsToSend;
02083           RCP<CommRequest<int> > sendSizeReq =
02084             isend<int, size_t> (sendSize, 0, sizeTag, comm);
02085           wait<int> (comm, outArg (sendSizeReq));
02086 
02087           // Send the actual string to Process 0.  We know that the
02088           // string has length > 0, so it's save to take the address
02089           // of the first entry.  Make a nonowning ArrayRCP to hold
02090           // the string.  Process 0 will add a null termination
02091           // character at the end of the string, after it receives the
02092           // message.
02093           ArrayRCP<const char> sendData (&stringToSend[0], 0, numCharsToSend, false);
02094           RCP<CommRequest<int> > sendDataReq =
02095             isend<int, char> (sendData, 0, dataTag, comm);
02096           wait<int> (comm, outArg (sendDataReq));
02097         }
02098       } // for each process rank p other than 0
02099     } // extreme verbosity level (print the whole matrix)
02100   }
02101 
02102   template<class Scalar, class LO, class GO, class Node>
02103   Teuchos::RCP<const Teuchos::Comm<int> >
02104   BlockCrsMatrix<Scalar, LO, GO, Node>::
02105   getComm() const
02106   {
02107     return graph_.getComm();
02108   }
02109 
02110   template<class Scalar, class LO, class GO, class Node>
02111   Teuchos::RCP<Node>
02112   BlockCrsMatrix<Scalar, LO, GO, Node>::
02113   getNode() const
02114   {
02115     return graph_.getNode();
02116 
02117   }
02118 
02119   template<class Scalar, class LO, class GO, class Node>
02120   global_size_t
02121   BlockCrsMatrix<Scalar, LO, GO, Node>::
02122   getGlobalNumCols() const
02123   {
02124     return graph_.getGlobalNumCols();
02125   }
02126 
02127   template<class Scalar, class LO, class GO, class Node>
02128   size_t
02129   BlockCrsMatrix<Scalar, LO, GO, Node>::
02130   getNodeNumCols() const
02131   {
02132     return graph_.getNodeNumCols();
02133   }
02134 
02135   template<class Scalar, class LO, class GO, class Node>
02136   GO
02137   BlockCrsMatrix<Scalar, LO, GO, Node>::
02138   getIndexBase() const
02139   {
02140     return graph_.getIndexBase();
02141   }
02142 
02143   template<class Scalar, class LO, class GO, class Node>
02144   global_size_t
02145   BlockCrsMatrix<Scalar, LO, GO, Node>::
02146   getGlobalNumEntries() const
02147   {
02148     return graph_.getGlobalNumEntries();
02149   }
02150 
02151   template<class Scalar, class LO, class GO, class Node>
02152   size_t
02153   BlockCrsMatrix<Scalar, LO, GO, Node>::
02154   getNodeNumEntries() const
02155   {
02156     return graph_.getNodeNumEntries();
02157   }
02158 
02159   template<class Scalar, class LO, class GO, class Node>
02160   size_t
02161   BlockCrsMatrix<Scalar, LO, GO, Node>::
02162   getNumEntriesInGlobalRow (GO globalRow) const
02163   {
02164     return graph_.getNumEntriesInGlobalRow(globalRow);
02165   }
02166 
02167   template<class Scalar, class LO, class GO, class Node>
02168   global_size_t
02169   BlockCrsMatrix<Scalar, LO, GO, Node>::
02170   getGlobalNumDiags() const
02171   {
02172     return getGlobalNumDiags();
02173   }
02174 
02175   template<class Scalar, class LO, class GO, class Node>
02176   size_t
02177   BlockCrsMatrix<Scalar, LO, GO, Node>::
02178   getNodeNumDiags() const
02179   {
02180     return getNodeNumDiags();
02181   }
02182 
02183   template<class Scalar, class LO, class GO, class Node>
02184   size_t
02185   BlockCrsMatrix<Scalar, LO, GO, Node>::
02186   getGlobalMaxNumRowEntries() const
02187   {
02188     return graph_.getGlobalMaxNumRowEntries();
02189   }
02190 
02191   template<class Scalar, class LO, class GO, class Node>
02192   bool
02193   BlockCrsMatrix<Scalar, LO, GO, Node>::
02194   hasColMap() const
02195   {
02196     return graph_.hasColMap();
02197   }
02198 
02199   template<class Scalar, class LO, class GO, class Node>
02200   bool
02201   BlockCrsMatrix<Scalar, LO, GO, Node>::
02202   isLowerTriangular() const
02203   {
02204     return graph_.isLowerTriangular();
02205   }
02206 
02207   template<class Scalar, class LO, class GO, class Node>
02208   bool
02209   BlockCrsMatrix<Scalar, LO, GO, Node>::
02210   isUpperTriangular() const
02211   {
02212     return graph_.isUpperTriangular();
02213   }
02214 
02215   template<class Scalar, class LO, class GO, class Node>
02216   bool
02217   BlockCrsMatrix<Scalar, LO, GO, Node>::
02218   isLocallyIndexed() const
02219   {
02220     return graph_.isLocallyIndexed();
02221   }
02222 
02223   template<class Scalar, class LO, class GO, class Node>
02224   bool
02225   BlockCrsMatrix<Scalar, LO, GO, Node>::
02226   isGloballyIndexed() const
02227   {
02228     return graph_.isGloballyIndexed();
02229   }
02230 
02231   template<class Scalar, class LO, class GO, class Node>
02232   bool
02233   BlockCrsMatrix<Scalar, LO, GO, Node>::
02234   isFillComplete() const
02235   {
02236     return true;
02237   }
02238 
02239   template<class Scalar, class LO, class GO, class Node>
02240   bool
02241   BlockCrsMatrix<Scalar, LO, GO, Node>::
02242   supportsRowViews() const
02243   {
02244     return true;
02245   }
02246 
02247 
02248   template<class Scalar, class LO, class GO, class Node>
02249   void
02250   BlockCrsMatrix<Scalar, LO, GO, Node>::
02251   getGlobalRowCopy (GO GlobalRow,
02252                     const Teuchos::ArrayView<GO> &Indices,
02253                     const Teuchos::ArrayView<Scalar> &Values,
02254                     size_t &NumEntries) const
02255   {
02256     TEUCHOS_TEST_FOR_EXCEPTION(
02257       true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getGlobalRowCopy: "
02258       "This class doesn't support global matrix indexing.");
02259 
02260   }
02261 
02262   template<class Scalar, class LO, class GO, class Node>
02263   void
02264   BlockCrsMatrix<Scalar, LO, GO, Node>::
02265   getGlobalRowView (GO GlobalRow,
02266                     ArrayView<const GO> &indices,
02267                     ArrayView<const Scalar> &values) const
02268   {
02269     TEUCHOS_TEST_FOR_EXCEPTION(
02270       true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getGlobalRowView: "
02271       "This class doesn't support global matrix indexing.");
02272 
02273   }
02274 
02275   template<class Scalar, class LO, class GO, class Node>
02276   void
02277   BlockCrsMatrix<Scalar, LO, GO, Node>::
02278   getLocalRowView (LO LocalRow,
02279                    ArrayView<const LO> &indices,
02280                    ArrayView<const Scalar> &values) const
02281   {
02282     TEUCHOS_TEST_FOR_EXCEPTION(
02283       true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getGlobalRowView: "
02284       "This class doesn't support global matrix indexing.");
02285 
02286   }
02287 
02288 
02289   template<class Scalar, class LO, class GO, class Node>
02290   void
02291   BlockCrsMatrix<Scalar, LO, GO, Node>::
02292   getLocalDiagCopy (Tpetra::Vector<Scalar,LO,GO,Node> &diag) const
02293   {
02294     TEUCHOS_TEST_FOR_EXCEPTION(
02295       true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getLocalDiagCopy: "
02296       "not implemented.");
02297 
02298   }
02299 
02300   template<class Scalar, class LO, class GO, class Node>
02301   void
02302   BlockCrsMatrix<Scalar, LO, GO, Node>::
02303   leftScale (const Tpetra::Vector<Scalar, LO, GO, Node>& x)
02304   {
02305     TEUCHOS_TEST_FOR_EXCEPTION(
02306       true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::leftScale: "
02307       "not implemented.");
02308 
02309   }
02310 
02311   template<class Scalar, class LO, class GO, class Node>
02312   void
02313   BlockCrsMatrix<Scalar, LO, GO, Node>::
02314   rightScale (const Tpetra::Vector<Scalar, LO, GO, Node>& x)
02315   {
02316     TEUCHOS_TEST_FOR_EXCEPTION(
02317       true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::rightScale: "
02318       "not implemented.");
02319 
02320   }
02321 
02322   template<class Scalar, class LO, class GO, class Node>
02323   Teuchos::RCP<const Tpetra::RowGraph<LO, GO, Node> >
02324   BlockCrsMatrix<Scalar, LO, GO, Node>::
02325   getGraph() const
02326   {
02327     return graphRCP_;
02328   }
02329 
02330   template<class Scalar, class LO, class GO, class Node>
02331   typename Teuchos::ScalarTraits<Scalar>::magnitudeType
02332   BlockCrsMatrix<Scalar, LO, GO, Node>::
02333   getFrobeniusNorm () const
02334   {
02335     TEUCHOS_TEST_FOR_EXCEPTION(
02336       true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getFrobeniusNorm: "
02337       "not implemented.");
02338   }
02339 
02340 
02341 } // namespace Experimental
02342 } // namespace Tpetra
02343 
02344 //
02345 // Explicit instantiation macro
02346 //
02347 // Must be expanded from within the Tpetra namespace!
02348 //
02349 #define TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_INSTANT(S,LO,GO,NODE) \
02350   template class Experimental::BlockCrsMatrix< S, LO, GO, NODE >;
02351 
02352 #endif // TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_DEF_HPP
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Defines