Tpetra Matrix/Vector Services  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Defines
Tpetra_KokkosRefactor_CrsMatrix_def.hpp
00001 // @HEADER
00002 // ***********************************************************************
00003 //
00004 //          Tpetra: Templated Linear Algebra Services Package
00005 //                 Copyright (2008) Sandia Corporation
00006 //
00007 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00008 // the U.S. Government retains certain rights in this software.
00009 //
00010 // Redistribution and use in source and binary forms, with or without
00011 // modification, are permitted provided that the following conditions are
00012 // met:
00013 //
00014 // 1. Redistributions of source code must retain the above copyright
00015 // notice, this list of conditions and the following disclaimer.
00016 //
00017 // 2. Redistributions in binary form must reproduce the above copyright
00018 // notice, this list of conditions and the following disclaimer in the
00019 // documentation and/or other materials provided with the distribution.
00020 //
00021 // 3. Neither the name of the Corporation nor the names of the
00022 // contributors may be used to endorse or promote products derived from
00023 // this software without specific prior written permission.
00024 //
00025 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00026 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00028 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00029 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00030 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00031 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00032 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00033 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00034 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00035 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00036 //
00037 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
00038 //
00039 // ************************************************************************
00040 // @HEADER
00041 
00042 #ifndef TPETRA_KOKKOSREFACTOR_CRSMATRIX_DEF_HPP
00043 #define TPETRA_KOKKOSREFACTOR_CRSMATRIX_DEF_HPP
00044 
00045 #ifdef DOXYGEN_USE_ONLY
00046 #  include "Tpetra_KokkosRefactor_CrsMatrix_decl.hpp"
00047 #endif
00048 #include <Kokkos_Sequential_SparseKernels.hpp>
00049 
00050 namespace Tpetra {
00051 
00052   template <class Scalar,
00053             class LocalOrdinal,
00054             class GlobalOrdinal,
00055             class DeviceType>
00056   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal,
00057             Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00058   CrsMatrix (const RCP<const map_type> &rowMap,
00059              size_t maxNumEntriesPerRow,
00060              ProfileType pftype,
00061              const RCP<Teuchos::ParameterList>& params) :
00062     DistObject<char, LocalOrdinal, GlobalOrdinal, node_type> (rowMap),
00063     storageStatus_ (pftype == StaticProfile ?
00064                     Details::STORAGE_1D_UNPACKED :
00065                     Details::STORAGE_2D),
00066     fillComplete_ (false),
00067     frobNorm_ (-STM::one ())
00068   {
00069     try {
00070       myGraph_ = rcp (new crs_graph_type (rowMap, maxNumEntriesPerRow, pftype, params));
00071     }
00072     catch (std::exception& e) {
00073       TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
00074         "CrsMatrix constructor: caught exception while allocating CrsGraph "
00075         "object: " << std::endl << e.what ());
00076     }
00077     staticGraph_ = myGraph_;
00078     resumeFill (params);
00079     checkInternalState ();
00080   }
00081 
00082 
00083   template <class Scalar, class LocalOrdinal,
00084             class GlobalOrdinal, class DeviceType>
00085   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal,
00086             Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00087   CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
00088              const Teuchos::ArrayRCP<const size_t>& NumEntriesPerRowToAlloc,
00089              ProfileType pftype,
00090              const Teuchos::RCP<Teuchos::ParameterList>& params) :
00091     DistObject<char, LocalOrdinal, GlobalOrdinal, node_type> (rowMap),
00092     storageStatus_ (pftype == StaticProfile ?
00093                     Details::STORAGE_1D_UNPACKED :
00094                     Details::STORAGE_2D),
00095     fillComplete_ (false),
00096     frobNorm_ (-STM::one ())
00097   {
00098     try {
00099       myGraph_ = rcp (new Graph (rowMap, NumEntriesPerRowToAlloc, pftype, params));
00100     }
00101     catch (std::exception &e) {
00102       TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
00103           typeName(*this) << "::CrsMatrix(): caught exception while allocating CrsGraph object: "
00104           << std::endl << e.what() << std::endl);
00105     }
00106     staticGraph_ = myGraph_;
00107     resumeFill(params);
00108     checkInternalState();
00109   }
00110 
00111 
00112   template <class Scalar,
00113             class LocalOrdinal,
00114             class GlobalOrdinal, class DeviceType>
00115   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal,
00116             Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00117   CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
00118              const Teuchos::RCP<const map_type>& colMap,
00119              size_t maxNumEntriesPerRow,
00120              ProfileType pftype,
00121              const Teuchos::RCP<Teuchos::ParameterList>& params) :
00122     DistObject<char, LocalOrdinal, GlobalOrdinal, node_type> (rowMap),
00123     storageStatus_ (pftype == StaticProfile ?
00124                     Details::STORAGE_1D_UNPACKED :
00125                     Details::STORAGE_2D),
00126     fillComplete_ (false),
00127     frobNorm_ (-STM::one ())
00128   {
00129     TEUCHOS_TEST_FOR_EXCEPTION(! staticGraph_.is_null(), std::logic_error,
00130       "Tpetra::CrsMatrix ctor (row Map, col Map, maxNumEntriesPerRow, ...): "
00131       "staticGraph_ is not null at the beginning of the constructor.  "
00132       "Please report this bug to the Tpetra developers.");
00133     TEUCHOS_TEST_FOR_EXCEPTION(! myGraph_.is_null(), std::logic_error,
00134       "Tpetra::CrsMatrix ctor (row Map, col Map, maxNumEntriesPerRow, ...): "
00135       "myGraph_ is not null at the beginning of the constructor.  "
00136       "Please report this bug to the Tpetra developers.");
00137     try {
00138       myGraph_ = rcp (new Graph (rowMap, colMap, maxNumEntriesPerRow, pftype, params));
00139     }
00140     catch (std::exception &e) {
00141       TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
00142         "CrsMatrix constructor: Caught exception while allocating "
00143         "CrsGraph object: " << std::endl << e.what ());
00144     }
00145     staticGraph_ = myGraph_;
00146     resumeFill(params);
00147     checkInternalState();
00148   }
00149 
00150 
00151   template <class Scalar,
00152             class LocalOrdinal,
00153             class GlobalOrdinal, class DeviceType>
00154   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal,
00155             Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00156   CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
00157              const Teuchos::RCP<const map_type>& colMap,
00158              const Teuchos::ArrayRCP<const size_t>& numEntPerRow,
00159              ProfileType pftype,
00160              const RCP<Teuchos::ParameterList>& params) :
00161     DistObject<char, LocalOrdinal, GlobalOrdinal, node_type> (rowMap),
00162     storageStatus_ (pftype == StaticProfile ?
00163                     Details::STORAGE_1D_UNPACKED :
00164                     Details::STORAGE_2D),
00165     fillComplete_ (false),
00166     frobNorm_ (-STM::one ())
00167   {
00168     try {
00169       myGraph_ = rcp (new Graph (rowMap, colMap, numEntPerRow, pftype, params));
00170     }
00171     catch (std::exception &e) {
00172       TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
00173         "CrsMatrix constructor: caught exception while allocating "
00174         "CrsGraph object: " << std::endl << e.what ());
00175     }
00176     staticGraph_ = myGraph_;
00177     resumeFill (params);
00178     checkInternalState ();
00179   }
00180 
00181 
00182   template<class Scalar,
00183            class LocalOrdinal,
00184            class GlobalOrdinal, class DeviceType>
00185   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00186   CrsMatrix (const Teuchos::RCP<const CrsGraph<LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >& graph,
00187              const Teuchos::RCP<Teuchos::ParameterList>& params) :
00188     DistObject<char, LocalOrdinal,GlobalOrdinal, node_type> (graph->getRowMap ()),
00189     staticGraph_ (graph),
00190     storageStatus_ (Details::STORAGE_1D_PACKED),
00191     fillComplete_ (false),
00192     frobNorm_ (-STM::one ())
00193   {
00194     const char tfecfFuncName[] = "CrsMatrix(graph)";
00195     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(staticGraph_.is_null (),
00196       std::runtime_error, ": When calling the CrsMatrix constructor that "
00197       "accepts a static graph, the pointer to the graph must not be null.");
00198     // We prohibit the case where the graph is not yet filled.
00199     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( ! staticGraph_->isFillComplete (),
00200       std::runtime_error, ": The specified graph is not fill-complete. You "
00201       "must invoke fillComplete() on the graph before using it to construct a "
00202       "CrsMatrix.  Note that calling resumeFill() makes the graph not fill-"
00203       "complete, even if you had previously called fillComplete().  In that "
00204       "case, you must call fillComplete() on the graph again.");
00205     // the graph has entries, and the matrix should have entries as well, set to zero. no need or point in lazy allocating in this case.
00206     // first argument LocalIndices is ignored; the graph is already allocated (local or global, we don't care here)
00207     allocateValues (LocalIndices, GraphAlreadyAllocated);
00208     resumeFill(params);
00209     checkInternalState();
00210   }
00211 
00212   template <class Scalar,
00213             class LocalOrdinal,
00214             class GlobalOrdinal,
00215             class DeviceType>
00216   CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00217   CrsMatrix (const RCP<const map_type>& rowMap,
00218              const RCP<const map_type>& colMap,
00219              const t_RowPtrs & rowPointers,
00220              const t_LocalOrdinal_1D & columnIndices,
00221              const t_ValuesType & values,
00222              const RCP<Teuchos::ParameterList>& params) :
00223     DistObject<char, LocalOrdinal, GlobalOrdinal, node_type> (rowMap),
00224     storageStatus_ (Details::STORAGE_1D_PACKED),
00225     fillComplete_ (false),
00226     frobNorm_ (-STM::one ())
00227   {
00228     try {
00229       myGraph_ = rcp (new Graph (rowMap, colMap, rowPointers, columnIndices, params));
00230     }
00231     catch (std::exception &e) {
00232       TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
00233         "CrsMatrix constructor: caught exception while allocating "
00234         "CrsGraph object: " << std::endl << e.what ());
00235     }
00236     staticGraph_ = myGraph_;
00237     k_values1D_  = values;
00238     values1D_    = Kokkos::Compat::persistingView (k_values1D_);
00239     resumeFill (params);
00240     checkInternalState ();
00241   }
00242 
00243   template <class Scalar,
00244             class LocalOrdinal,
00245             class GlobalOrdinal, class DeviceType>
00246   CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00247   CrsMatrix (const RCP<const map_type>& rowMap,
00248              const RCP<const map_type>& colMap,
00249              const ArrayRCP<size_t> & rowPointers,
00250              const ArrayRCP<LocalOrdinal> & columnIndices,
00251              const ArrayRCP<Scalar> & values,
00252              const RCP<Teuchos::ParameterList>& params) :
00253     DistObject<char, LocalOrdinal, GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > (rowMap),
00254     storageStatus_ (Details::STORAGE_1D_PACKED),
00255     fillComplete_ (false),
00256     frobNorm_ (-STM::one ())
00257   {
00258     try {
00259       myGraph_ = rcp (new Graph (rowMap, colMap, rowPointers,columnIndices,params));
00260     }
00261     catch (std::exception &e) {
00262       TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
00263         typeName(*this) << "::CrsMatrix(): caught exception while allocating "
00264         "CrsGraph object: " << std::endl << e.what ());
00265     }
00266     staticGraph_ = myGraph_;
00267     // FIXME (mfh 05 Aug 2014) It should be possible to convince the
00268     // ArrayRCP to relinquish its allocation, but that might require
00269     // passing the ArrayRCP in by nonconst reference.
00270     k_values1D_ = Kokkos::Compat::getKokkosViewDeepCopy<DeviceType> (values ());
00271     values1D_ = Kokkos::Compat::persistingView (k_values1D_);
00272     resumeFill (params);
00273     checkInternalState ();
00274   }
00275 
00276   template <class Scalar,
00277             class LocalOrdinal,
00278             class GlobalOrdinal, class DeviceType>
00279   CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00280   CrsMatrix (const RCP<const map_type>& rowMap,
00281              const RCP<const map_type>& colMap,
00282              const k_local_matrix_type& lclMatrix,
00283              const RCP<Teuchos::ParameterList>& params) :
00284     DistObject<char, LocalOrdinal, GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > (rowMap),
00285     k_lclMatrix_ (lclMatrix),
00286     storageStatus_ (Details::STORAGE_1D_PACKED),
00287     fillComplete_ (false),
00288     frobNorm_ (-STM::one ())
00289   {
00290     using Teuchos::ArrayRCP;
00291     using Teuchos::arcp;
00292     using Teuchos::rcp;
00293     using Teuchos::RCP;
00294     const char tfecfFuncName[] = "CrsMatrix(rowMap,colMap,lclMatrix,params): ";
00295 
00296     try {
00297       myGraph_ = rcp (new Graph (rowMap, colMap, lclMatrix.graph, params));
00298     }
00299     catch (std::exception &e) {
00300       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
00301         "Caught exception while allocating CrsGraph object: " << e.what ());
00302     }
00303     staticGraph_ = myGraph_;
00304     computeGlobalConstants();
00305 
00306     k_values1D_ = k_lclMatrix_.values;
00307 
00308     {
00309       // For backwards compatibility, set the Kokkos classic pointer
00310       // to the values, values1D_.
00311       ArrayRCP<scalar_type> classicValues =
00312         Kokkos::Compat::persistingView (k_lclMatrix_.values);
00313       values1D_ = classicValues;
00314     }
00315 
00316     // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
00317 
00318     // Now we're fill complete!
00319     fillComplete_ = true;
00320 
00321     // Sanity checks at the end.
00322 #ifdef HAVE_TPETRA_DEBUG
00323     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillActive(), std::logic_error,
00324       "We're at the end of fillComplete(), but isFillActive() is true.  "
00325       "Please report this bug to the Tpetra developers.");
00326     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(! isFillComplete(), std::logic_error,
00327       "We're at the end of fillComplete(), but isFillActive() is true.  "
00328       "Please report this bug to the Tpetra developers.");
00329 #endif // HAVE_TPETRA_DEBUG
00330     checkInternalState ();
00331   }
00332 
00333   template<class Scalar, class LocalOrdinal, class GlobalOrdinal,
00334            class DeviceType>
00335   CrsMatrix<
00336     Scalar, LocalOrdinal, GlobalOrdinal,
00337     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00338   ~CrsMatrix () {}
00339 
00340   template<class Scalar, class LocalOrdinal, class GlobalOrdinal,
00341            class DeviceType>
00342   RCP<const Teuchos::Comm<int> >
00343   CrsMatrix<
00344     Scalar, LocalOrdinal, GlobalOrdinal,
00345     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00346   getComm () const {
00347     return getCrsGraph ()->getComm ();
00348   }
00349 
00350   template<class Scalar, class LocalOrdinal, class GlobalOrdinal,
00351            class DeviceType>
00352   Teuchos::RCP<Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >
00353   CrsMatrix<
00354     Scalar, LocalOrdinal, GlobalOrdinal,
00355     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00356   getNode () const {
00357     return getCrsGraph ()->getNode ();
00358   }
00359 
00360   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00361   ProfileType
00362   CrsMatrix<
00363     Scalar, LocalOrdinal, GlobalOrdinal,
00364     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00365   getProfileType () const {
00366     return getCrsGraph ()->getProfileType ();
00367   }
00368 
00369   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00370   bool
00371   CrsMatrix<
00372     Scalar, LocalOrdinal, GlobalOrdinal,
00373     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00374   isFillComplete () const {
00375     return fillComplete_;
00376   }
00377 
00378   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00379   bool
00380   CrsMatrix<
00381     Scalar, LocalOrdinal, GlobalOrdinal,
00382     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00383   isFillActive () const {
00384     return ! fillComplete_;
00385   }
00386 
00387   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00388   bool CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::isStorageOptimized() const {
00389     return getCrsGraph()->isStorageOptimized();
00390   }
00391 
00392   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00393   bool CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::isLocallyIndexed() const {
00394     return getCrsGraph()->isLocallyIndexed();
00395   }
00396 
00397   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00398   bool CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::isGloballyIndexed() const {
00399     return getCrsGraph()->isGloballyIndexed();
00400   }
00401 
00402   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00403   bool CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::hasColMap() const {
00404     return getCrsGraph()->hasColMap();
00405   }
00406 
00407   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00408   global_size_t CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::getGlobalNumEntries() const {
00409     return getCrsGraph()->getGlobalNumEntries();
00410   }
00411 
00412   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00413   size_t CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::getNodeNumEntries() const {
00414     return getCrsGraph()->getNodeNumEntries();
00415   }
00416 
00417   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00418   global_size_t CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::getGlobalNumRows() const {
00419     return getCrsGraph()->getGlobalNumRows();
00420   }
00421 
00422   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00423   global_size_t CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::getGlobalNumCols() const {
00424     return getCrsGraph()->getGlobalNumCols();
00425   }
00426 
00427   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00428   size_t CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::getNodeNumRows() const {
00429     return getCrsGraph()->getNodeNumRows();
00430   }
00431 
00432   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00433   size_t CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::getNodeNumCols() const {
00434     return getCrsGraph()->getNodeNumCols();
00435   }
00436 
00437   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00438   global_size_t CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::getGlobalNumDiags() const {
00439     return getCrsGraph()->getGlobalNumDiags();
00440   }
00441 
00442   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00443   size_t CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::getNodeNumDiags() const {
00444     return getCrsGraph()->getNodeNumDiags();
00445   }
00446 
00447   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00448   size_t CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::getNumEntriesInGlobalRow(GlobalOrdinal globalRow) const {
00449     return getCrsGraph()->getNumEntriesInGlobalRow(globalRow);
00450   }
00451 
00452   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00453   size_t CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::getNumEntriesInLocalRow(LocalOrdinal localRow) const {
00454     return getCrsGraph()->getNumEntriesInLocalRow(localRow);
00455   }
00456 
00457   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00458   size_t CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::getGlobalMaxNumRowEntries() const {
00459     return getCrsGraph()->getGlobalMaxNumRowEntries();
00460   }
00461 
00462   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00463   size_t CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::getNodeMaxNumRowEntries() const {
00464     return getCrsGraph()->getNodeMaxNumRowEntries();
00465   }
00466 
00467   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00468   GlobalOrdinal CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::getIndexBase() const {
00469     return getRowMap()->getIndexBase();
00470   }
00471 
00472   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00473   RCP<const Map<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >
00474   CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::getRowMap() const {
00475     return getCrsGraph()->getRowMap();
00476   }
00477 
00478 
00479   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00480   RCP<const Map<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >
00481   CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::getColMap() const {
00482     return getCrsGraph()->getColMap();
00483   }
00484 
00485 
00486   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00487   RCP<const Map<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >
00488   CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::getDomainMap() const {
00489     return getCrsGraph()->getDomainMap();
00490   }
00491 
00492 
00493   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00494   RCP<const Map<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >
00495   CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::getRangeMap() const {
00496     return getCrsGraph()->getRangeMap();
00497   }
00498 
00499 
00500   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00501   RCP<const RowGraph<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >
00502   CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::getGraph() const {
00503     if (staticGraph_ != null) return staticGraph_;
00504     return myGraph_;
00505   }
00506 
00507 
00508   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00509   RCP<const CrsGraph<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >
00510   CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::getCrsGraph() const {
00511     if (staticGraph_ != null) return staticGraph_;
00512     return myGraph_;
00513   }
00514 
00515 
00516   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00517   bool CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::isLowerTriangular() const {
00518     return getCrsGraph()->isLowerTriangular();
00519   }
00520 
00521 
00522   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00523   bool CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::isUpperTriangular() const {
00524     return getCrsGraph()->isUpperTriangular();
00525   }
00526 
00527 
00528   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00529   bool CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::isStaticGraph() const {
00530     return (myGraph_ == null);
00531   }
00532 
00533 
00534   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00535   bool CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::hasTransposeApply() const {
00536     return true;
00537   }
00538 
00539   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00540   bool
00541   CrsMatrix<
00542     Scalar, LocalOrdinal, GlobalOrdinal,
00543     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00544   supportsRowViews () const {
00545     return true;
00546   }
00547 
00550   //                                                                         //
00551   //                    Internal utility methods                             //
00552   //                                                                         //
00555 
00556 
00559   template <class Scalar,
00560             class LocalOrdinal,
00561             class GlobalOrdinal,
00562             class DeviceType>
00563   void
00564   CrsMatrix<
00565     Scalar, LocalOrdinal, GlobalOrdinal,
00566     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00567   allocateValues (ELocalGlobal lg, GraphAllocationStatus gas)
00568   {
00569 #ifdef HAVE_TPETRA_DEBUG
00570     // If the graph indices are already allocated, then gas should be
00571     // GraphAlreadyAllocated.  Otherwise, gas should be
00572     // GraphNotYetAllocated.
00573     if ((gas == GraphAlreadyAllocated) != staticGraph_->indicesAreAllocated()) {
00574       const std::string err1 ("allocateValues: The caller has asserted that "
00575                               "the graph is ");
00576       const std::string err2 ("already allocated, but the static graph says "
00577                               "that its indices are ");
00578       const std::string err3 ("already allocated.  Please report this bug to "
00579                               "the Tpetra developers.");
00580       TEUCHOS_TEST_FOR_EXCEPTION(gas == GraphAlreadyAllocated && ! staticGraph_->indicesAreAllocated(),
00581         std::logic_error, err1 << err2 << "not " << err3);
00582       TEUCHOS_TEST_FOR_EXCEPTION(gas != GraphAlreadyAllocated && staticGraph_->indicesAreAllocated(),
00583         std::logic_error, err1 << "not " << err2 << err3);
00584     }
00585 
00586     // If the graph is unallocated, then it had better be a
00587     // matrix-owned graph.  ("Matrix-owned graph" means that the
00588     // matrix gets to define the graph structure.  If the CrsMatrix
00589     // constructor that takes an RCP<const CrsGraph> was used, then
00590     // the matrix does _not_ own the graph.)
00591     TEUCHOS_TEST_FOR_EXCEPTION(
00592       ! staticGraph_->indicesAreAllocated() && myGraph_.is_null(),
00593       std::logic_error,
00594       "allocateValues: The static graph says that its indices are not "
00595       "allocated, but the graph is not owned by the matrix.  Please report "
00596       "this bug to the Tpetra developers.");
00597 #endif // HAVE_TPETRA_DEBUG
00598 
00599     if (gas == GraphNotYetAllocated) {
00600       myGraph_->allocateIndices (lg);
00601     }
00602 
00603     // Allocate matrix values.
00604     if (getProfileType() == StaticProfile) {
00605       // "Static profile" means that the number of matrix entries in
00606       // each row was fixed at the time the CrsMatrix constructor was
00607       // called.  This lets us use 1-D storage for the matrix's
00608       // values.  ("1-D storage" means the same as that used by the
00609       // three arrays in the classic compressed sparse row format.)
00610 
00611       const size_t lclNumRows = staticGraph_->getNodeNumRows ();
00612       typename Graph::t_RowPtrs k_ptrs = staticGraph_->k_rowPtrs_;
00613       TEUCHOS_TEST_FOR_EXCEPTION(
00614         k_ptrs.dimension_0 () != lclNumRows+1, std::logic_error,
00615         "Tpetra::CrsMatrix::allocateValues: With StaticProfile, row offsets "
00616         "array has length " << k_ptrs.dimension_0 () << " != (lclNumRows+1) = "
00617         << (lclNumRows+1) << ".");
00618       // FIXME (mfh 08 Aug 2014) This assumes UVM.  We could fix this
00619       // either by storing the row offsets in the graph as a DualView,
00620       // or by making a device View of that entry, and copying it back
00621       // to host.
00622       const size_t lclTotalNumEntries = k_ptrs(lclNumRows);
00623 
00624       // Allocate array of (packed???) matrix values.
00625       k_values1D_ = t_ValuesType ("Tpetra::CrsMatrix::val", lclTotalNumEntries);
00626       values1D_ = Kokkos::Compat::persistingView (k_values1D_);
00627     }
00628     else {
00629       // "Dynamic profile" means the number of matrix entries in each
00630       // row is not fixed and may expand.  Thus, we store the matrix's
00631       // values in "2-D storage," meaning an array of arrays.  The
00632       // outer array has as many inner arrays as there are rows in the
00633       // matrix, and each inner array stores the values in that row.
00634       values2D_ = staticGraph_->template allocateValues2D<Scalar>();
00635     }
00636   }
00637 
00640   template <class Scalar,
00641             class LocalOrdinal,
00642             class GlobalOrdinal,
00643             class DeviceType>
00644   void
00645   CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00646   getAllValues (ArrayRCP<const size_t>& rowPointers,
00647                 ArrayRCP<const LocalOrdinal>& columnIndices,
00648                 ArrayRCP<const Scalar>& values) const
00649   {
          // Hand back the three arrays of the matrix's 1-D storage.
          //
          // rowPointers   [out] Set to the graph's getNodeRowPtrs().
          // columnIndices [out] Set to the graph's getNodePackedIndices().
          // values        [out] Set to a persisting view of k_values1D_.
          //
          // Throws std::runtime_error if columnIndices and values differ
          // in size on entry, if getCrsGraph() returns null, or if either
          // graph accessor throws a std::exception.
00650     const char tfecfFuncName[] = "getAllValues";
          // NOTE(review): this compares the sizes of the two output
          // arguments as they were passed in, before either has been
          // assigned below.  Confirm that this is the intended check.
00651     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00652       columnIndices.size () != values.size (), std::runtime_error,
00653       " requires that columnIndices and values are the same size.");
00654 
00655     RCP<const crs_graph_type> relevantGraph = getCrsGraph ();
00656     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00657       relevantGraph.is_null (), std::runtime_error,
00658       " requires that getCrsGraph() is not null.");
00659     try {
00660       rowPointers = relevantGraph->getNodeRowPtrs ();
00661       columnIndices = relevantGraph->getNodePackedIndices ();
00662     }
          // Catch by const reference, and forward the original message so
          // that the cause of the failure is not lost when rethrowing as
          // std::runtime_error.
00663     catch (const std::exception& e) {
00664       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00665         true, std::runtime_error, ": Caught exception while getting the "
00666         "graph's row offsets and packed column indices: " << e.what ());
00667     }
00668     values = Kokkos::Compat::persistingView (k_values1D_);
00669   }
00670 
00671   template <class Scalar, class LocalOrdinal, class GlobalOrdinal,
00672             class DeviceType>
00673   void
00674   CrsMatrix<
00675     Scalar, LocalOrdinal, GlobalOrdinal,
00676     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00677   fillLocalGraphAndMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
00678   {
00679     using Kokkos::create_mirror_view;
00680     using Teuchos::arcp_const_cast;
00681     using Teuchos::ArrayRCP;
00682     using Teuchos::null;
00683     using Teuchos::RCP;
00684     using Teuchos::rcp;
00685     typedef ArrayRCP<size_t>::size_type size_type;
00686     typedef typename Graph::t_numRowEntries_ row_entries_type;
00687     typedef typename Graph::t_RowPtrsNC row_offsets_type;
00688     typedef typename Graph::t_LocalOrdinal_1D lclinds_1d_type;
00689     typedef t_ValuesType values_type;
00690 
00691     // fillComplete() only calls fillLocalGraphAndMatrix() if the
00692     // matrix owns the graph, which means myGraph_ is not null.
00693     TEUCHOS_TEST_FOR_EXCEPTION(
00694       myGraph_.is_null (), std::logic_error, "Tpetra::CrsMatrix::"
00695       "fillLocalGraphAndMatrix (called from fillComplete or "
00696       "expertStaticFillComplete): The nonconst graph (myGraph_) is null.  This "
00697       "means that the matrix has a const (a.k.a. \"static\") graph.  This may "
00698       "mean that fillComplete or expertStaticFillComplete has a bug, since it "
00699       "should never call fillLocalGraphAndMatrix in that case.  "
00700       "Please report this bug to the Tpetra developers.");
00701 
00702     const size_t lclNumRows = this->getNodeNumRows ();
00703 
00704     // This method's goal is to fill in the three arrays (compressed
00705     // sparse row format) that define the sparse graph's and matrix's
00706     // structure, and the sparse matrix's values.
00707     //
00708     // Use t_RowPtrs and not
00709     // Graph::LocalStaticCrsGraphType::row_map_type for k_ptrs,
00710     // because the latter is const and we need to modify k_ptrs here.
00711     row_offsets_type k_ptrs;
00712     t_RowPtrs k_ptrs_const;
00713     lclinds_1d_type k_inds;
00714     values_type k_vals;
00715 
00716     // Get references to the data in myGraph_, so we can modify them
00717     // as well.  Note that we only call fillLocalGraphAndMatrix() if
00718     // the matrix owns the graph, which means myGraph_ is not null.
00719     lclinds_1d_type k_lclInds1D_ = myGraph_->k_lclInds1D_;
00720 
00721     // The number of entries in each locally owned row.  This is a
00722     // DualView.  2-D storage lives on host and is currently not
00723     // thread-safe for parallel kernels even on host, so we have to
00724     // work sequentially with host storage in that case.
00725     row_entries_type k_numRowEnt = myGraph_->k_numRowEntries_;
00726     typename row_entries_type::t_host h_numRowEnt = k_numRowEnt.h_view;
00727 
00728     if (getProfileType () == DynamicProfile) {
00729       // Pack 2-D storage (DynamicProfile) into 1-D packed storage.
00730       //
00731       // DynamicProfile means that the matrix's column indices and
00732       // values are currently stored in a 2-D "unpacked" format, in
00733       // the arrays-of-arrays myGraph_->lclInds2D_ (for column
00734       // indices) and values2D_ (for values).  We allocate 1-D storage
00735       // (k_inds resp. k_vals), and then copy from 2-D storage
00736       // (lclInds2D_ resp. values2D_) into 1-D storage (k_inds
00737       // resp. k_vals).
00738       TEUCHOS_TEST_FOR_EXCEPTION(
00739         static_cast<size_t> (k_numRowEnt.dimension_0 ()) != lclNumRows,
00740         std::logic_error, "Tpetra::CrsMatrix::fillLocalGraphAndMatrix (called "
00741         "from fillComplete or expertStaticFillComplete): For the "
00742         "DynamicProfile branch, k_numRowEnt has the wrong length.  "
00743         "k_numRowEnt.dimension_0() = " << k_numRowEnt.dimension_0 ()
00744         << " != getNodeNumRows() = " << lclNumRows << "");
00745 
00746       // Pack the row offsets into k_ptrs, by doing a sum-scan of
00747       // the array of valid entry counts per row (h_numRowEnt).
00748       //
00749       // Total number of entries in the matrix on the calling
00750       // process.  We will compute this in the loop below.  It's
00751       // cheap to compute and useful as a sanity check.
00752       size_t lclTotalNumEntries = 0;
00753       // This will be a host view of packed row offsets.
00754       typename row_offsets_type::HostMirror h_ptrs;
00755       {
00756         // Allocate the packed row offsets array.  We use a nonconst
00757         // temporary (packedRowOffsets) here, because k_ptrs is const.
00758         // We will assign packedRowOffsets to k_ptrs below.
00759         row_offsets_type packedRowOffsets ("Tpetra::CrsGraph::ptr",
00760                                            lclNumRows+1);
00761         //
00762         // FIXME hack until we get parallel_scan in kokkos
00763         //
00764         h_ptrs = create_mirror_view (packedRowOffsets);
00765         h_ptrs(0) = 0;
00766         for (size_type i = 0; i < static_cast<size_type> (lclNumRows); ++i) {
00767           const size_t numEnt = h_numRowEnt(i);
00768           lclTotalNumEntries += numEnt;
00769           h_ptrs(i+1) = h_ptrs(i) + numEnt;
00770         }
00771         Kokkos::deep_copy (packedRowOffsets, h_ptrs);
00772         // packedRowOffsets is modifiable; k_ptrs isn't, so we have to
00773         // use packedRowOffsets in the loop above and assign here.
00774         k_ptrs = packedRowOffsets;
00775         k_ptrs_const = k_ptrs;
00776       }
00777 
00778       TEUCHOS_TEST_FOR_EXCEPTION(
00779         static_cast<size_t> (k_ptrs.dimension_0 ()) != lclNumRows + 1,
00780         std::logic_error, "Tpetra::CrsMatrix::fillLocalGraphAndMatrix: In "
00781         "DynamicProfile branch, after packing k_ptrs, k_ptrs.dimension_0()"
00782         " = " << k_ptrs.dimension_0 () << " != (lclNumRows+1) = "
00783         << (lclNumRows+1) << ".");
00784       TEUCHOS_TEST_FOR_EXCEPTION(
00785         static_cast<size_t> (h_ptrs.dimension_0 ()) != lclNumRows + 1,
00786         std::logic_error, "Tpetra::CrsMatrix::fillLocalGraphAndMatrix: In "
00787         "DynamicProfile branch, after packing h_ptrs, h_ptrs.dimension_0()"
00788         " = " << h_ptrs.dimension_0 () << " != (lclNumRows+1) = "
00789         << (lclNumRows+1) << ".");
00790       // FIXME (mfh 08 Aug 2014) This assumes UVM.
00791       TEUCHOS_TEST_FOR_EXCEPTION(
00792         k_ptrs(lclNumRows) != lclTotalNumEntries, std::logic_error,
00793         "Tpetra::CrsMatrix::fillLocalGraphAndMatrix: In DynamicProfile branch, "
00794         "after packing k_ptrs, k_ptrs(lclNumRows = " << lclNumRows << ") = " <<
00795         k_ptrs(lclNumRows) << " != total number of entries on the calling "
00796         "process = " << lclTotalNumEntries << ".");
00797 
00798       // Allocate the arrays of packed column indices and values.
00799       k_inds = lclinds_1d_type ("Tpetra::CrsGraph::ind", lclTotalNumEntries);
00800       k_vals = t_ValuesType ("Tpetra::CrsMatrix::val", lclTotalNumEntries);
00801 
00802       // We need host views of the above, since 2-D storage lives on host.
00803       typename lclinds_1d_type::HostMirror h_inds = create_mirror_view (k_inds);
00804       typename values_type::HostMirror h_vals = create_mirror_view (k_vals);
00805 
00806       // Pack the column indices and values on the host.
00807       ArrayRCP<Array<LocalOrdinal> > lclInds2D = myGraph_->lclInds2D_;
00808       for (size_t row = 0; row < lclNumRows; ++row) {
00809         const size_t numEnt = h_numRowEnt(row);
00810         std::copy (lclInds2D[row].begin(),
00811                    lclInds2D[row].begin() + numEnt,
00812                    h_inds.ptr_on_device() + h_ptrs(row));
00813         std::copy (values2D_[row].begin(),
00814                    values2D_[row].begin() + numEnt,
00815                    h_vals.ptr_on_device() + h_ptrs(row));
00816       }
00817       // Copy the packed column indices and values to the device.
00818       Kokkos::deep_copy (k_inds, h_inds);
00819       Kokkos::deep_copy (k_vals, h_vals);
00820 
00821       // Sanity check of packed row offsets.
00822       if (k_ptrs.dimension_0 () != 0) {
00823         const size_t numOffsets = static_cast<size_t> (k_ptrs.dimension_0 ());
00824         TEUCHOS_TEST_FOR_EXCEPTION(
00825           static_cast<size_t> (k_ptrs(numOffsets-1)) != k_vals.dimension_0 (),
00826           std::logic_error, "Tpetra::CrsMatrix::fillLocalGraphAndMatrix: "
00827           "In DynamicProfile branch, after packing, k_ptrs(" << (numOffsets-1)
00828           << ") = " << k_ptrs(numOffsets-1) << " != k_vals.dimension_0() = "
00829           << k_vals.dimension_0 () << ".");
00830         TEUCHOS_TEST_FOR_EXCEPTION(
00831           static_cast<size_t> (k_ptrs(numOffsets-1)) != k_inds.dimension_0 (),
00832           std::logic_error, "Tpetra::CrsMatrix::fillLocalGraphAndMatrix: "
00833           "In DynamicProfile branch, after packing, k_ptrs(" << (numOffsets-1)
00834           << ") = " << k_ptrs(numOffsets-1) << " != k_inds.dimension_0() = "
00835           << k_inds.dimension_0 () << ".");
00836       }
00837     }
00838     else if (getProfileType () == StaticProfile) {
00839       // StaticProfile means that the matrix's column indices and
00840       // values are currently stored in a 1-D format, with row offsets
00841       // in k_rowPtrs_ and local column indices in k_lclInds1D_.
00842 
00843       // StaticProfile also means that the graph's array of row
00844       // offsets must already be allocated.
00845       typename Graph::LocalStaticCrsGraphType::row_map_type curRowOffsets =
00846         myGraph_->k_rowPtrs_;
00847       TEUCHOS_TEST_FOR_EXCEPTION(
00848         curRowOffsets.dimension_0 () == 0, std::logic_error,
00849       "curRowOffsets has size zero, but shouldn't");
00850       TEUCHOS_TEST_FOR_EXCEPTION(
00851         curRowOffsets.dimension_0 () != lclNumRows + 1, std::logic_error,
00852         "Tpetra::CrsMatrix::fillLocalGraphAndMatrix: curRowOffsets has size "
00853         << curRowOffsets.dimension_0 () << " != lclNumRows + 1 = "
00854         << (lclNumRows + 1) << ".")
00855       {
00856         const size_t numOffsets = curRowOffsets.dimension_0 ();
00857         // FIXME (mfh 06 Aug 2014) This relies on UVM.
00858         TEUCHOS_TEST_FOR_EXCEPTION(
00859           numOffsets != 0 &&
00860           myGraph_->k_lclInds1D_.dimension_0 () != curRowOffsets(numOffsets - 1),
00861           std::logic_error, "Tpetra::CrsMatrix::fillLocalGraphAndMatrix: "
00862           "numOffsets = " << numOffsets << " != 0 and "
00863           "myGraph_->k_lclInds1D_.dimension_0() = "
00864           << myGraph_->k_lclInds1D_.dimension_0 ()
00865           << " != curRowOffsets(" << numOffsets << ") = "
00866           << curRowOffsets(numOffsets - 1) << ".");
00867       }
00868 
00869       if (myGraph_->nodeNumEntries_ != myGraph_->nodeNumAllocated_) {
00870         // The matrix's current 1-D storage is "unpacked."  This means
00871         // the row offsets may differ from what the final row offsets
00872         // should be.  This could happen, for example, if the user
00873         // specified StaticProfile in the constructor and set an upper
00874         // bound on the number of entries per row, but didn't fill all
00875         // those entries.
00876         TEUCHOS_TEST_FOR_EXCEPTION(
00877           static_cast<size_t> (k_numRowEnt.dimension_0 ()) != lclNumRows,
00878           std::logic_error, "Tpetra::CrsMatrix::fillLocalGraphAndMatrix (called"
00879           " from fillComplete or expertStaticFillComplete): In StaticProfile "
00880           "unpacked branch, k_numRowEnt has the wrong length.  "
00881           "k_numRowEnt.dimension_0() = " << k_numRowEnt.dimension_0 ()
00882           << " != getNodeNumRows() = " << lclNumRows << ".");
00883 
00884         if (curRowOffsets.dimension_0 () != 0) {
00885           const size_t numOffsets =
00886             static_cast<size_t> (curRowOffsets.dimension_0 ());
00887           TEUCHOS_TEST_FOR_EXCEPTION(
00888             curRowOffsets(numOffsets-1) != static_cast<size_t> (k_values1D_.dimension_0 ()),
00889             std::logic_error, "Tpetra::CrsMatrix::fillLocalGraphAndMatrix: "
00890             "In StaticProfile branch, before allocating or packing, "
00891             "curRowOffsets(" << (numOffsets-1) << ") = "
00892             << curRowOffsets(numOffsets - 1)
00893             << " != k_values1D_.dimension_0() = "
00894             << k_values1D_.dimension_0 () << ".");
00895           TEUCHOS_TEST_FOR_EXCEPTION(
00896             static_cast<size_t> (curRowOffsets(numOffsets - 1)) !=
00897             myGraph_->k_lclInds1D_.dimension_0 (),
00898             std::logic_error, "Tpetra::CrsMatrix::fillLocalGraphAndMatrix: "
00899             "In StaticProfile branch, before allocating or packing, "
00900             "curRowOffsets(" << (numOffsets-1) << ") = "
00901             << curRowOffsets(numOffsets - 1)
00902             << " != myGraph_->k_lclInds1D_.dimension_0() = "
00903             << myGraph_->k_lclInds1D_.dimension_0 () << ".");
00904         }
00905 
00906         // Pack the row offsets into k_ptrs, by doing a sum-scan of
00907         // the array of valid entry counts per row (h_numRowEnt).
00908 
00909         // Total number of entries in the matrix on the calling
00910         // process.  We will compute this in the loop below.  It's
00911         // cheap to compute and useful as a sanity check.
00912         size_t lclTotalNumEntries = 0;
00913         // This will be a host view of packed row offsets.
00914         typename row_offsets_type::HostMirror h_ptrs;
00915         {
00916           // Allocate the packed row offsets array.  We use a nonconst
00917           // temporary (packedRowOffsets) here, because k_ptrs is
00918           // const.  We will assign packedRowOffsets to k_ptrs below.
00919           row_offsets_type packedRowOffsets ("Tpetra::CrsGraph::ptr",
00920                                              lclNumRows+1);
00921           //
00922           // FIXME hack until we get parallel_scan in Kokkos
00923           //
00924           // Unlike in the 2-D storage case above, we don't need the
00925           // host view of the packed row offsets array after packing
00926           // the row offsets.
00927           h_ptrs = create_mirror_view (packedRowOffsets);
00928           h_ptrs(0) = 0;
00929           for (size_type i = 0; i < static_cast<size_type> (lclNumRows); ++i) {
00930             const size_t numEnt = h_numRowEnt(i);
00931             lclTotalNumEntries += numEnt;
00932             h_ptrs(i+1) = h_ptrs(i) + numEnt;
00933           }
00934           Kokkos::deep_copy (packedRowOffsets, h_ptrs);
00935           // packedRowOffsets is modifiable; k_ptrs isn't, so we have
00936           // to use packedRowOffsets in the loop above and assign here.
00937           k_ptrs = packedRowOffsets;
00938           k_ptrs_const = k_ptrs;
00939         }
00940 
00941         TEUCHOS_TEST_FOR_EXCEPTION(
00942           static_cast<size_t> (k_ptrs.dimension_0 ()) != lclNumRows + 1,
00943           std::logic_error, "Tpetra::CrsMatrix::fillLocalGraphAndMatrix: For "
00944           "the StaticProfile unpacked-but-pack branch, after packing k_ptrs, "
00945           "k_ptrs.dimension_0() = " << k_ptrs.dimension_0 () << " != "
00946           "lclNumRows+1 = " << (lclNumRows+1) << ".");
00947         // FIXME (mfh 06 Aug 2014) This assumes UVM.
00948         TEUCHOS_TEST_FOR_EXCEPTION(
00949           k_ptrs(lclNumRows) != lclTotalNumEntries, std::logic_error,
00950           "Tpetra::CrsMatrix::fillLocalGraphAndMatrix: In StaticProfile "
00951           "unpacked-but-pack branch, after filling k_ptrs, k_ptrs(lclNumRows="
00952           << lclNumRows << ") = " << k_ptrs(lclNumRows) << " != total number "
00953           "of entries on the calling process = " << lclTotalNumEntries << ".");
00954 
00955         // Allocate the arrays of packed column indices and values.
00956         k_inds = lclinds_1d_type ("Tpetra::CrsGraph::ind", lclTotalNumEntries);
00957         k_vals = t_ValuesType ("Tpetra::CrsMatrix::val", lclTotalNumEntries);
00958 
00959         // curRowOffsets (myGraph_->k_rowPtrs_) (???), k_lclInds1D_,
00960         // and k_values1D_ are currently unpacked.  Pack them, using
00961         // the packed row offsets array k_ptrs that we created above.
00962         //
00963         // FIXME (mfh 06 Aug 2014) If "Optimize Storage" is false, we
00964         // need to keep around the unpacked row offsets, column
00965         // indices, and values arrays.
00966 
00967         // Pack the column indices from unpacked k_lclInds1D_ into
00968         // packed k_inds.  We will replace k_lclInds1D_ below.
00969         typedef pack_functor<typename Graph::t_LocalOrdinal_1D,
00970           typename Graph::LocalStaticCrsGraphType::row_map_type>
00971           inds_packer_type;
00972         inds_packer_type indsPacker (k_inds, myGraph_->k_lclInds1D_,
00973                                      k_ptrs, curRowOffsets);
00974         Kokkos::parallel_for (lclNumRows, indsPacker);
00975 
00976         // Pack the values from unpacked k_values1D_ into packed
00977         // k_vals.  We will replace k_values1D_ below.
00978         typedef pack_functor<t_ValuesType,
00979           typename Graph::LocalStaticCrsGraphType::row_map_type>
00980           vals_packer_type;
00981         vals_packer_type valsPacker (k_vals, this->k_values1D_,
00982                                      k_ptrs, curRowOffsets);
00983         Kokkos::parallel_for (lclNumRows, valsPacker);
00984 
00985         TEUCHOS_TEST_FOR_EXCEPTION(
00986           k_ptrs.dimension_0 () == 0, std::logic_error, "Tpetra::CrsMatrix::"
00987           "fillLocalGraphAndMatrix: In StaticProfile \"Optimize Storage\" = "
00988           "true branch, after packing, k_ptrs.dimension_0() = 0.  This "
00989           "probably means that k_rowPtrs_ was never allocated.");
00990         if (k_ptrs.dimension_0 () != 0) {
00991           const size_t numOffsets = static_cast<size_t> (k_ptrs.dimension_0 ());
00992           TEUCHOS_TEST_FOR_EXCEPTION(
00993             static_cast<size_t> (k_ptrs(numOffsets - 1)) != k_vals.dimension_0 (),
00994             std::logic_error, "Tpetra::CrsMatrix::fillLocalGraphAndMatrix: "
00995             "In StaticProfile \"Optimize Storage\"=true branch, after packing, "
00996             "k_ptrs(" << (numOffsets-1) << ") = " << k_ptrs(numOffsets-1) <<
00997             " != k_vals.dimension_0() = " << k_vals.dimension_0 () << ".");
00998           TEUCHOS_TEST_FOR_EXCEPTION(
00999             static_cast<size_t> (k_ptrs(numOffsets - 1)) != k_inds.dimension_0 (),
01000             std::logic_error, "Tpetra::CrsMatrix::fillLocalGraphAndMatrix: "
01001             "In StaticProfile \"Optimize Storage\"=true branch, after packing, "
01002             "k_ptrs(" << (numOffsets-1) << ") = " << k_ptrs(numOffsets-1) <<
01003             " != k_inds.dimension_0() = " << k_inds.dimension_0 () << ".");
01004         }
01005       }
01006       else { // We don't have to pack, so just set the pointers.
01007         k_ptrs_const = myGraph_->k_rowPtrs_;
01008         k_inds = myGraph_->k_lclInds1D_;
01009         k_vals = this->k_values1D_;
01010 
01011         TEUCHOS_TEST_FOR_EXCEPTION(
01012           k_ptrs_const.dimension_0 () == 0, std::logic_error, "Tpetra::CrsMatrix::"
01013           "fillLocalGraphAndMatrix: In StaticProfile \"Optimize Storage\" = "
01014           "false branch, k_ptrs_const.dimension_0() = 0.  This probably means that "
01015           "k_rowPtrs_ was never allocated.");
01016         if (k_ptrs_const.dimension_0 () != 0) {
01017           const size_t numOffsets = static_cast<size_t> (k_ptrs_const.dimension_0 ());
01018           TEUCHOS_TEST_FOR_EXCEPTION(
01019             static_cast<size_t> (k_ptrs_const(numOffsets - 1)) != k_vals.dimension_0 (),
01020             std::logic_error, "Tpetra::CrsMatrix::fillLocalGraphAndMatrix: "
01021             "In StaticProfile \"Optimize Storage\" = false branch, "
01022             "k_ptrs_const(" << (numOffsets-1) << ") = " << k_ptrs_const(numOffsets - 1)
01023             << " != k_vals.dimension_0() = " << k_vals.dimension_0 () << ".");
01024           TEUCHOS_TEST_FOR_EXCEPTION(
01025             static_cast<size_t> (k_ptrs_const(numOffsets - 1)) != k_inds.dimension_0 (),
01026             std::logic_error, "Tpetra::CrsMatrix::fillLocalGraphAndMatrix: "
01027             "In StaticProfile \"Optimize Storage\" = false branch, "
01028             "k_ptrs_const(" << (numOffsets-1) << ") = " << k_ptrs_const(numOffsets - 1)
01029             << " != k_inds.dimension_0() = " << k_inds.dimension_0 () << ".");
01030         }
01031       }
01032     }
01033 
01034     // Extra sanity checks.
01035     TEUCHOS_TEST_FOR_EXCEPTION(
01036       static_cast<size_t> (k_ptrs_const.dimension_0 ()) != lclNumRows + 1,
01037       std::logic_error, "Tpetra::CrsMatrix::fillLocalGraphAndMatrix: After "
01038       "packing, k_ptrs_const.dimension_0() = " << k_ptrs_const.dimension_0 ()
01039       << " != lclNumRows+1 = " << (lclNumRows+1) << ".");
01040     if (k_ptrs_const.dimension_0 () != 0) {
01041       const size_t numOffsets = static_cast<size_t> (k_ptrs_const.dimension_0 ());
01042       TEUCHOS_TEST_FOR_EXCEPTION(
01043         static_cast<size_t> (k_ptrs_const(numOffsets - 1)) != k_vals.dimension_0 (),
01044         std::logic_error, "Tpetra::CrsMatrix::fillLocalGraphAndMatrix: After "
01045         "packing, k_ptrs_const(" << (numOffsets-1) << ") = " << k_ptrs_const(numOffsets-1)
01046         << " != k_vals.dimension_0() = " << k_vals.dimension_0 () << ".");
01047       TEUCHOS_TEST_FOR_EXCEPTION(
01048         static_cast<size_t> (k_ptrs_const(numOffsets - 1)) != k_inds.dimension_0 (),
01049         std::logic_error, "Tpetra::CrsMatrix::fillLocalGraphAndMatrix: After "
01050         "packing, k_ptrs_const(" << (numOffsets-1) << ") = " << k_ptrs_const(numOffsets-1)
01051         << " != k_inds.dimension_0() = " << k_inds.dimension_0 () << ".");
01052     }
01053 
01054     // May we ditch the old allocations for the packed (and otherwise
01055     // "optimized") allocations, later in this routine?  Optimize
01056     // storage if the graph is not static, or if the graph already has
01057     // optimized storage.
01058     const bool defaultOptStorage =
01059       ! isStaticGraph () || staticGraph_->isStorageOptimized ();
01060     const bool requestOptimizedStorage =
01061       (! params.is_null () && params->get ("Optimize Storage", defaultOptStorage)) ||
01062       (params.is_null () && defaultOptStorage);
01063 
01064     // The graph has optimized storage when indices are allocated,
01065     // myGraph_->k_numRowEntries_ is empty, and there are more than
01066     // zero rows on this process.  It's impossible for the graph to
01067     // have dynamic profile (getProfileType() == DynamicProfile) and
01068     // be optimized (isStorageOptimized()).
01069     if (requestOptimizedStorage) {
01070       // Free the old, unpacked, unoptimized allocations.
01071       // Change the graph from dynamic to static allocation profile
01072 
01073       // Free graph data structures that are only needed for 2-D or
01074       // unpacked 1-D storage.
01075       myGraph_->lclInds2D_ = null; // legacy KokkosClassic 2-D storage
01076       myGraph_->k_numRowEntries_ = row_entries_type ();
01077       myGraph_->numRowEntries_ = null; // legacy KokkosClassic view of above
01078 
01079       // Free the matrix's 2-D storage.
01080       this->values2D_ = null;
01081 
01082       // Keep the new 1-D packed allocations.
01083       myGraph_->k_rowPtrs_ = k_ptrs_const;
01084       myGraph_->k_lclInds1D_ = k_inds;
01085       this->k_values1D_ = k_vals;
01086 
01087       // Set Kokkos classic pointer for backwards compatibility.
01088       this->values1D_ = Kokkos::Compat::persistingView (k_vals);
01089 
01090       // Storage is packed now, so the number of allocated entries is
01091       // the same as the actual number of entries.
01092       myGraph_->nodeNumAllocated_ = myGraph_->nodeNumEntries_;
01093       // The graph is definitely StaticProfile now, whether or not it
01094       // was before.
01095       myGraph_->pftype_ = StaticProfile;
01096       myGraph_->storageStatus_ = Details::STORAGE_1D_PACKED;
01097       this->storageStatus_ = Details::STORAGE_1D_PACKED;
01098     }
01099 
01100     RCP<Teuchos::ParameterList> lclparams;
01101     if (params.is_null ()) {
01102       lclparams = Teuchos::parameterList ();
01103     } else {
01104       lclparams = Teuchos::sublist (params, "Local Graph");
01105     }
01106 
01107     // Make the local graph, using the arrays of row offsets and
01108     // column indices that we built above.  The local graph should be
01109     // null, but we delete it first so that any memory can be freed
01110     // before we allocate the new one.
01111     //
01112     // FIXME (mfh 06,28 Aug 2014) It would make more sense for
01113     // Tpetra::CrsGraph to have a protected method that accepts k_inds
01114     // and k_ptrs, and creates the local graph k_lclGraph_.
01115     myGraph_->k_lclGraph_ =
01116       typename Graph::LocalStaticCrsGraphType (k_inds, k_ptrs_const);
01117 
01118     // Make the local matrix, using the local graph and vals array.
01119 
01120     // FIXME (mfh 28 Aug 2014) "Local Sparse Ops" sublist is now ignored.
01121 
01122     // k_lclMatrix_ = k_local_matrix_type ("Tpetra::CrsMatrix::k_lclMatrix_",
01123     //                                     getNodeNumCols (), k_vals,
01124     //                                     staticGraph_->getLocalGraph_Kokkos ());
01125     k_lclMatrix_ = k_local_matrix_type ("Tpetra::CrsMatrix::k_lclMatrix_",
01126                                         getNodeNumCols (), k_vals,
01127                                         myGraph_->k_lclGraph_);
01128     // FIXME (mfh 28 Aug 2014) "Local Sparse Ops" sublist is now ignored.
01129   }
01130 
01131 
01132   template <class Scalar,
01133             class LocalOrdinal,
01134             class GlobalOrdinal,
01135             class DeviceType>
01136   void
01137   CrsMatrix<
01138     Scalar, LocalOrdinal, GlobalOrdinal,
01139     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
01140   fillLocalMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
01141   {
01142     using Kokkos::create_mirror_view;
01143     using Teuchos::ArrayRCP;
01144     using Teuchos::null;
01145     using Teuchos::RCP;
01146     using Teuchos::rcp;
01147     typedef LocalOrdinal LO;
01148     typedef typename Graph::t_numRowEntries_ row_entries_type;
01149     typedef typename Graph::LocalStaticCrsGraphType::row_map_type row_map_type;
01150     typedef typename Graph::t_RowPtrsNC row_offsets_type;
01151 
01152     const size_t lclNumRows = getNodeNumRows();
01153     const map_type& rowMap = * (getRowMap ());
01154     RCP<node_type> node = rowMap.getNode ();
01155 
01156     // The goals of this routine are first, to allocate and fill
01157     // packed 1-D storage (see below for an explanation) in the vals
01158     // array, and second, to give vals to the local matrix and
01159     // finalize the local matrix.  We only need k_ptrs, the packed 1-D
01160     // row offsets, within the scope of this routine, since we're only
01161     // filling the local matrix here (use fillLocalGraphAndMatrix() to
01162     // fill both the graph and the matrix at the same time).
01163 
01164     // get data from staticGraph_
01165     ArrayRCP<Array<LO> > lclInds2D = staticGraph_->lclInds2D_;
01166     ArrayRCP<size_t> numRowEntries = staticGraph_->numRowEntries_;
01167     size_t nodeNumEntries   = staticGraph_->nodeNumEntries_;
01168     size_t nodeNumAllocated = staticGraph_->nodeNumAllocated_;
01169     row_map_type k_rowPtrs_ = staticGraph_->k_lclGraph_.row_map;
01170 
01171     row_map_type k_ptrs; // "packed" row offsets array
01172     t_ValuesType k_vals; // "packed" values array
01173 
01174     // May we ditch the old allocations for the packed (and otherwise
01175     // "optimized") allocations, later in this routine?  Request
01176     // optimized storage by default.
01177     bool requestOptimizedStorage = true;
01178     const bool default_OptimizeStorage =
01179       ! isStaticGraph () || staticGraph_->isStorageOptimized ();
01180     if (! params.is_null () && ! params->get ("Optimize Storage", default_OptimizeStorage)) {
01181       requestOptimizedStorage = false;
01182     }
01183     // If we're not allowed to change a static graph, then we can't
01184     // change the storage of the matrix, either.  This means that if
01185     // the graph's storage isn't already optimized, we can't optimize
01186     // the matrix's storage either.  Check and give warning, as
01187     // appropriate.
01188     if (! staticGraph_->isStorageOptimized () && requestOptimizedStorage) {
01189       TPETRA_ABUSE_WARNING(true, std::runtime_error,
01190         "::fillLocalMatrix(): You requested optimized storage by setting the"
01191         "\"Optimize Storage\" flag to \"true\" in the parameter list, or by virtue"
01192         "of default behavior. However, the associated CrsGraph was filled separately"
01193         "and requested not to optimize storage. Therefore, the CrsMatrix cannot"
01194         "optimize storage.");
01195       requestOptimizedStorage = false;
01196     }
01197 
01198     // The number of entries in each locally owned row.  This is a
01199     // DualView.  2-D storage lives on host and is currently not
01200     // thread-safe for parallel kernels even on host, so we have to
01201     // work sequentially with host storage in that case.
01202     row_entries_type k_numRowEnt = staticGraph_->k_numRowEntries_;
01203     typename row_entries_type::t_host h_numRowEnt = k_numRowEnt.h_view;
01204 
01205     if (getProfileType() == DynamicProfile) {
01206       // Pack 2-D storage (DynamicProfile) into 1-D packed storage.
01207       //
01208       // DynamicProfile means that the matrix's values are currently
01209       // stored in a 2-D "unpacked" format, in the array-of-arrays
01210       // values2D_.  We allocate 1-D storage and then copy from 2-D
01211       // storage in values2D_ into 1-D storage in k_vals.  Since we're
01212       // only allocating the local matrix here, not the local graph,
01213       // we don't need to keep the row offsets array, but we do need
01214       // it here temporarily in order to convert to 1-D storage.  (The
01215       // allocStorage() function needs it.)  We'll free ptrs later in
01216       // this method.
01217       //
01218       // FIXME (mfh 08 Aug 2014) If we're in this method, then the
01219       // graph should already have packed 1-D storage.  Why can't we
01220       // just use the graph's current row offsets array?
01221 
01222       // Pack the row offsets into k_ptrs, by doing a sum-scan of
01223       // the array of valid entry counts per row (h_numRowEnt).
01224       //
01225       // Total number of entries in the matrix on the calling
01226       // process.  We will compute this in the loop below.  It's
01227       // cheap to compute and useful as a sanity check.
01228       size_t lclTotalNumEntries = 0;
01229       // This will be a host view of packed row offsets.
01230       typename row_offsets_type::HostMirror h_ptrs;
01231       {
01232         row_offsets_type packedRowOffsets ("Tpetra::CrsGraph::ptr", lclNumRows+1);
01233         //
01234         // FIXME hack until we get parallel_scan in Kokkos
01235         //
01236         h_ptrs = create_mirror_view (packedRowOffsets);
01237         h_ptrs(0) = 0;
01238         for (size_t i = 0; i < lclNumRows; ++i) {
01239           const size_t numEnt = h_numRowEnt(i);
01240           lclTotalNumEntries += numEnt;
01241           h_ptrs(i+1) = h_ptrs(i) + numEnt;
01242         }
01243         Kokkos::deep_copy (packedRowOffsets, h_ptrs);
01244         k_ptrs = packedRowOffsets;
01245       }
01246 
01247       TEUCHOS_TEST_FOR_EXCEPTION(
01248         static_cast<size_t> (k_ptrs.dimension_0 ()) != lclNumRows + 1,
01249         std::logic_error, "Tpetra::CrsMatrix::fillLocalMatrix: In "
01250         "DynamicProfile branch, after packing k_ptrs, k_ptrs.dimension_0()"
01251         " = " << k_ptrs.dimension_0 () << " != (lclNumRows+1) = "
01252         << (lclNumRows+1) << ".");
01253       TEUCHOS_TEST_FOR_EXCEPTION(
01254         static_cast<size_t> (h_ptrs.dimension_0 ()) != lclNumRows + 1,
01255         std::logic_error, "Tpetra::CrsMatrix::fillLocalMatrix: In "
01256         "DynamicProfile branch, after packing h_ptrs, h_ptrs.dimension_0()"
01257         " = " << h_ptrs.dimension_0 () << " != (lclNumRows+1) = "
01258         << (lclNumRows+1) << ".");
01259       // FIXME (mfh 08 Aug 2014) This assumes UVM.
01260       TEUCHOS_TEST_FOR_EXCEPTION(
01261         k_ptrs(lclNumRows) != lclTotalNumEntries, std::logic_error,
01262         "Tpetra::CrsMatrix::fillLocalMatrix: In DynamicProfile branch, "
01263         "after packing k_ptrs, k_ptrs(lclNumRows = " << lclNumRows << ") = " <<
01264         k_ptrs(lclNumRows) << " != total number of entries on the calling "
01265         "process = " << lclTotalNumEntries << ".");
01266 
01267       // Allocate the array of packed values.
01268       k_vals = t_ValuesType ("Tpetra::CrsMatrix::val", lclTotalNumEntries);
01269       // We need a host view of the above, since 2-D storage lives on host.
01270       typename t_ValuesType::HostMirror h_vals =
01271         Kokkos::create_mirror_view (k_vals);
01272       // Pack the values on the host.
01273       for (size_t lclRow = 0; lclRow < lclNumRows; ++lclRow) {
01274         const size_t numEnt = h_numRowEnt(lclRow);
01275         std::copy (values2D_[lclRow].begin(),
01276                    values2D_[lclRow].begin() + numEnt,
01277                    h_vals.ptr_on_device() + h_ptrs(lclRow));
01278       }
01279       // Copy the packed values to the device.
01280       Kokkos::deep_copy (k_vals, h_vals);
01281 
01282       // Sanity check of packed row offsets.
01283       if (k_ptrs.dimension_0 () != 0) {
01284         const size_t numOffsets = static_cast<size_t> (k_ptrs.dimension_0 ());
01285         TEUCHOS_TEST_FOR_EXCEPTION(
01286           static_cast<size_t> (k_ptrs(numOffsets-1)) != k_vals.dimension_0 (),
01287           std::logic_error, "Tpetra::CrsMatrix::fillLocalMatrix: "
01288           "In DynamicProfile branch, after packing, k_ptrs(" << (numOffsets-1)
01289           << ") = " << k_ptrs(numOffsets-1) << " != k_vals.dimension_0() = "
01290           << k_vals.dimension_0 () << ".");
01291       }
01292     }
01293     else if (getProfileType () == StaticProfile) {
01294       // StaticProfile means that the matrix's values are currently
01295       // stored in a 1-D format.  However, this format is "unpacked";
01296       // it doesn't necessarily have the same row offsets as indicated
01297       // by the ptrs array returned by allocRowPtrs.  This could
01298       // happen, for example, if the user specified StaticProfile in
01299       // the constructor and fixed the number of matrix entries in
01300       // each row, but didn't fill all those entries.
01301       //
01302       // As above, we don't need to keep the "packed" row offsets
01303       // array ptrs here, but we do need it here temporarily, so we
01304       // have to allocate it.  We'll free ptrs later in this method.
01305       //
01306       // Note that this routine checks whether storage has already
01307       // been packed.  This is a common case for solution of nonlinear
01308       // PDEs using the finite element method, as long as the
01309       // structure of the sparse matrix does not change between linear
01310       // solves.
01311       if (nodeNumEntries != nodeNumAllocated) {
01312         // We have to pack the 1-D storage, since the user didn't fill
01313         // up all requested storage.
01314         typename Graph::t_RowPtrsNC tmpk_ptrs ("Tpetra::CrsGraph::ptr",
01315                                                lclNumRows+1);
01316         // Total number of entries in the matrix on the calling
01317         // process.  We will compute this in the loop below.  It's
01318         // cheap to compute and useful as a sanity check.
01319         size_t lclTotalNumEntries = 0;
01320         k_ptrs = tmpk_ptrs;
01321         {
01322           //
01323           // FIXME hack until we get parallel_scan in Kokkos
01324           //
01325           typename row_offsets_type::HostMirror h_ptrs =
01326             create_mirror_view (tmpk_ptrs);
01327           h_ptrs(0) = 0;
01328           for (size_t i = 0; i < lclNumRows; ++i) {
01329             const size_t numEnt = h_numRowEnt(i);
01330             lclTotalNumEntries += numEnt;
01331             h_ptrs(i+1) = h_ptrs(i) + numEnt;
01332           }
01333           Kokkos::deep_copy (tmpk_ptrs, h_ptrs);
01334         }
01335 
01336         // Allocate the "packed" values array.
01337         // It has exactly the right number of entries.
01338         k_vals = t_ValuesType ("Tpetra::CrsMatrix::val", lclTotalNumEntries);
01339 
01340         // Pack k_values1D_ into k_vals.  We will replace k_values1D_ below.
01341         typedef pack_functor<t_ValuesType,
01342           typename Graph::LocalStaticCrsGraphType::row_map_type>
01343           packer_type;
01344         packer_type valsPacker (k_vals, k_values1D_, tmpk_ptrs, k_rowPtrs_);
01345         Kokkos::parallel_for (lclNumRows, valsPacker);
01346       }
01347       else { // We don't have to pack, so just set the pointer.
01348         k_vals = k_values1D_;
01349       }
01350     }
01351 
01352     // May we ditch the old allocations for the packed one?
01353     if (requestOptimizedStorage) {
01354       // The user requested optimized storage, so we can dump the
01355       // unpacked 2-D and 1-D storage, and keep the packed storage.
01356       values2D_ = null;
01357       k_values1D_ = k_vals;
01358       this->storageStatus_ = Details::STORAGE_1D_PACKED;
01359     }
01360 
01361     // FIXME (mfh 28 Aug 2014) "Local Matrix" sublist is now ignored.
01362 
01363     // Build the local sparse matrix object.
01364     k_lclMatrix_ = k_local_matrix_type ("Tpetra::CrsMatrix::k_lclMatrix_",
01365                                         getDomainMap ()->getNodeNumElements (),
01366                                         k_vals,
01367                                         staticGraph_->getLocalGraph_Kokkos ());
01368 
01369     // Set the legacy values1D_ array.
01370     ArrayRCP<scalar_type> classicValues =
01371       Kokkos::Compat::persistingView (k_lclMatrix_.values);
01372     values1D_ = classicValues;
01373 
01374     // FIXME (mfh 28 Aug 2014) "Local Sparse Ops" sublist is now ignored.
01375   }
01376 
01377   template<class Scalar,
01378            class LocalOrdinal,
01379            class GlobalOrdinal,
01380            class DeviceType>
01381   void
01382   CrsMatrix<
01383     Scalar, LocalOrdinal, GlobalOrdinal,
01384     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
01385   insertLocalValues (const LocalOrdinal localRow,
01386                      const Teuchos::ArrayView<const LocalOrdinal>& indices,
01387                      const Teuchos::ArrayView<const Scalar>& values)
01388   {
01389     using Teuchos::Array;
01390     using Teuchos::ArrayView;
01391     using Teuchos::toString;
01392     using std::endl;
01393     const char tfecfFuncName[] = "insertLocalValues";
01394 
01395     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(! isFillActive (), std::runtime_error,
01396       ": Fill is not active.  After calling fillComplete, you must call "
01397       "resumeFill before you may insert entries into the matrix again.");
01398     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isStaticGraph (),  std::runtime_error,
01399       " cannot insert indices with static graph; use replaceLocalValues() instead.");
01400     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(myGraph_->isGloballyIndexed(),
01401       std::runtime_error, ": graph indices are global; use insertGlobalValues().");
01402     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(! hasColMap (), std::runtime_error,
01403       " cannot insert local indices without a column map.");
01404     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(values.size() != indices.size(),
01405       std::runtime_error, ": values.size() must equal indices.size().");
01406     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01407       ! getRowMap()->isNodeLocalElement(localRow), std::runtime_error,
01408       ": Local row index " << localRow << " does not belong to this process.");
01409 
01410     if (! myGraph_->indicesAreAllocated ()) {
01411       try {
01412         allocateValues (LocalIndices, GraphNotYetAllocated);
01413       }
01414       catch (std::exception& e) {
01415         TEUCHOS_TEST_FOR_EXCEPTION(
01416           true, std::runtime_error, "Tpetra::CrsMatrix::insertLocalValues: "
01417           "allocateValues(LocalIndices,GraphNotYetAllocated) threw an "
01418           "exception: " << e.what ());
01419       }
01420     }
01421 
01422     const size_t numEntriesToAdd = static_cast<size_t> (indices.size ());
01423 #ifdef HAVE_TPETRA_DEBUG
01424     // In a debug build, if the matrix has a column Map, test whether
01425     // any of the given column indices are not in the column Map.
01426     // Keep track of the invalid column indices so we can tell the
01427     // user about them.
01428     if (hasColMap ()) {
01429       const map_type& colMap = * (getColMap ());
01430       Array<LocalOrdinal> badColInds;
01431       bool allInColMap = true;
01432       for (size_t k = 0; k < numEntriesToAdd; ++k) {
01433         if (! colMap.isNodeLocalElement (indices[k])) {
01434           allInColMap = false;
01435           badColInds.push_back (indices[k]);
01436         }
01437       }
01438       if (! allInColMap) {
01439         std::ostringstream os;
01440         os << "Tpetra::CrsMatrix::insertLocalValues: You attempted to insert "
01441           "entries in owned row " << localRow << ", at the following column "
01442           "indices: " << toString (indices) << "." << endl;
01443         os << "Of those, the following indices are not in the column Map on "
01444           "this process: " << toString (badColInds) << "." << endl << "Since "
01445           "the matrix has a column Map already, it is invalid to insert "
01446           "entries at those locations.";
01447         TEUCHOS_TEST_FOR_EXCEPTION(! allInColMap, std::invalid_argument, os.str ());
01448       }
01449     }
01450 #endif // HAVE_TPETRA_DEBUG
01451 
01452 #ifdef HAVE_TPETRA_DEBUG
01453     RowInfo rowInfo;
01454     try {
01455       rowInfo = myGraph_->getRowInfo (localRow);
01456     } catch (std::exception& e) {
01457       TEUCHOS_TEST_FOR_EXCEPTION(
01458         true, std::runtime_error, "Tpetra::CrsMatrix::insertLocalValues: "
01459         "myGraph_->getRowInfo threw an exception: " << e.what ());
01460     }
01461 #else
01462     RowInfo rowInfo = myGraph_->getRowInfo (localRow);
01463 #endif // HAVE_TPETRA_DEBUG
01464 
01465     const size_t curNumEntries = rowInfo.numEntries;
01466     const size_t newNumEntries = curNumEntries + numEntriesToAdd;
01467     if (newNumEntries > rowInfo.allocSize) {
01468       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01469         getProfileType() == StaticProfile, std::runtime_error,
01470         ": new indices exceed statically allocated graph structure.");
01471 
01472       // Make space for the new matrix entries.
01473       try {
01474         rowInfo = myGraph_->template updateLocalAllocAndValues<Scalar> (rowInfo,
01475                                                                         newNumEntries,
01476                                                                         values2D_[localRow]);
01477       } catch (std::exception& e) {
01478         TEUCHOS_TEST_FOR_EXCEPTION(
01479           true, std::runtime_error, "Tpetra::CrsMatrix::insertLocalValues: "
01480           "myGraph_->updateGlobalAllocAndValues threw an exception: "
01481           << e.what ());
01482       }
01483     }
01484     typename Graph::SLocalGlobalViews indsView;
01485     indsView.linds = indices;
01486 
01487 #ifdef HAVE_TPETRA_DEBUG
01488     ArrayView<Scalar> valsView;
01489     try {
01490       valsView = this->getViewNonConst (rowInfo);
01491     } catch (std::exception& e) {
01492       TEUCHOS_TEST_FOR_EXCEPTION(
01493         true, std::runtime_error, "Tpetra::CrsMatrix::insertLocalValues: "
01494         "getViewNonConst threw an exception: " << e.what ());
01495     }
01496 #else
01497     ArrayView<Scalar> valsView = this->getViewNonConst (rowInfo);
01498 #endif // HAVE_TPETRA_DEBUG
01499 
01500     try {
01501       myGraph_->template insertIndicesAndValues<Scalar> (rowInfo, indsView,
01502                                                          valsView, values,
01503                                                          LocalIndices,
01504                                                          LocalIndices);
01505     } catch (std::exception& e) {
01506       TEUCHOS_TEST_FOR_EXCEPTION(
01507         true, std::runtime_error, "Tpetra::CrsMatrix::insertLocalValues: "
01508         "myGraph_->insertIndicesAndValues threw an exception: "
01509         << e.what ());
01510     }
01511 
01512 #ifdef HAVE_TPETRA_DEBUG
01513     const size_t chkNewNumEntries = myGraph_->getNumEntriesInLocalRow (localRow);
01514     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01515       chkNewNumEntries != newNumEntries, std::logic_error,
01516       ": The row should have " << newNumEntries << " entries after insert, but "
01517       "instead has " << chkNewNumEntries << ".  Please report this bug to the "
01518       "Tpetra developers.");
01519     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(! isLocallyIndexed(), std::logic_error,
01520       ": At end of insertLocalValues(), this CrsMatrix is not locally indexed.  "
01521       "Please report this bug to the Tpetra developers.");
01522 #endif // HAVE_TPETRA_DEBUG
01523   }
01524 
01525   template<class Scalar, class LocalOrdinal, class GlobalOrdinal,
01526            class DeviceType>
01527   void
01528   CrsMatrix<
01529     Scalar, LocalOrdinal, GlobalOrdinal,
01530     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
01531   insertLocalValuesFiltered (const LocalOrdinal localRow,
01532                              const Teuchos::ArrayView<const LocalOrdinal>& indices,
01533                              const Teuchos::ArrayView<const Scalar>& values)
01534   {
01535     const char tfecfFuncName[] = "insertLocalValues";
01536     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(! isFillActive (), std::runtime_error,
01537       " requires that fill is active.");
01538     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isStaticGraph (),  std::runtime_error,
01539       " cannot insert indices with static graph; use replaceLocalValues() instead.");
01540     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(myGraph_->isGloballyIndexed(),
01541       std::runtime_error, ": graph indices are global; use insertGlobalValues().");
01542     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(! hasColMap (), std::runtime_error,
01543       " cannot insert local indices without a column map.");
01544     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(values.size() != indices.size(),
01545       std::runtime_error, ": values.size() must equal indices.size().");
01546     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01547       ! getRowMap()->isNodeLocalElement (localRow), std::runtime_error,
01548       ": Local row index " << localRow << " does not belong to this process.");
01549     if (! myGraph_->indicesAreAllocated ()) {
01550       allocateValues (LocalIndices, GraphNotYetAllocated);
01551     }
01552     // Use the graph to filter incoming entries whose column indices
01553     // aren't in the column Map.
01554     Teuchos::Array<LocalOrdinal> f_inds (indices);
01555     Teuchos::Array<Scalar> f_vals (values);
01556     const size_t numFilteredEntries =
01557       myGraph_->template filterLocalIndicesAndValues<Scalar> (f_inds (),
01558                                                               f_vals ());
01559     if (numFilteredEntries > 0) {
01560       RowInfo rowInfo = myGraph_->getRowInfo (localRow);
01561       const size_t curNumEntries = rowInfo.numEntries;
01562       const size_t newNumEntries = curNumEntries + numFilteredEntries;
01563       if (newNumEntries > rowInfo.allocSize) {
01564         TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01565           getProfileType () == StaticProfile, std::runtime_error,
01566           ": new indices exceed statically allocated graph structure.  "
01567           "newNumEntries (" << newNumEntries << " > rowInfo.allocSize ("
01568           << rowInfo.allocSize << ").");
01569         // Make space for the new matrix entries.
01570         rowInfo =
01571           myGraph_->template updateLocalAllocAndValues<Scalar> (rowInfo,
01572                                                                 newNumEntries,
01573                                                                 values2D_[localRow]);
01574       }
01575       typename Graph::SLocalGlobalViews inds_view;
01576       inds_view.linds = f_inds (0, numFilteredEntries);
01577       myGraph_->template insertIndicesAndValues<Scalar> (rowInfo, inds_view,
01578                                                          this->getViewNonConst (rowInfo),
01579                                                          f_vals, LocalIndices,
01580                                                          LocalIndices);
01581 #ifdef HAVE_TPETRA_DEBUG
01582       const size_t chkNewNumEntries = myGraph_->getNumEntriesInLocalRow (localRow);
01583       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(chkNewNumEntries != newNumEntries,
01584         std::logic_error, ": Internal logic error. Please contact Tpetra team.");
01585 #endif // HAVE_TPETRA_DEBUG
01586     }
01587 #ifdef HAVE_TPETRA_DEBUG
01588     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(! isLocallyIndexed(), std::logic_error,
01589       ": At end of insertLocalValues(), this CrsMatrix is not locally indexed.  "
01590       "Please report this bug to the Tpetra developers.");
01591 #endif // HAVE_TPETRA_DEBUG
01592   }
01593 
01594 
01595   template<class Scalar, class LocalOrdinal, class GlobalOrdinal,
01596            class DeviceType>
01597   void
01598   CrsMatrix<
01599     Scalar, LocalOrdinal, GlobalOrdinal,
01600     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
01601   insertGlobalValues (const GlobalOrdinal globalRow,
01602                       const Teuchos::ArrayView<const GlobalOrdinal>& indices,
01603                       const Teuchos::ArrayView<const Scalar>& values)
01604   {
01605     using Teuchos::Array;
01606     using Teuchos::ArrayView;
01607     using Teuchos::toString;
01608     using std::endl;
01609     typedef LocalOrdinal LO;
01610     typedef GlobalOrdinal GO;
01611     typedef typename Teuchos::ArrayView<const GO>::size_type size_type;
01612     const char tfecfFuncName[] = "insertGlobalValues: ";
01613 
01614 #ifdef HAVE_TPETRA_DEBUG
01615     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01616       values.size() != indices.size(), std::runtime_error,
01617       "values.size() must equal indices.size().  values.size() = "
01618       << values.size() << ", but indices.size() = " << indices.size() << ".");
01619 #endif // HAVE_TPETRA_DEBUG
01620 
01621     const LO localRow = getRowMap ()->getLocalElement (globalRow);
01622 
01623     if (localRow == OTL::invalid ()) { // globalRow _not_ owned by calling process
01624       insertNonownedGlobalValues (globalRow, indices, values);
01625     }
01626     else { // globalRow _is_ owned by calling process
01627       if (this->isStaticGraph ()) {
01628         // Uh oh!  Not allowed to insert into owned rows in that case.
01629         std::ostringstream err;
01630         const int myRank = getRowMap ()->getComm ()->getRank ();
01631         const int numProcs = getRowMap ()->getComm ()->getSize ();
01632 
01633         err << "The matrix was constructed with a constant (\"static\") graph, "
01634           "yet the given global row index " << globalRow << " is in the row "
01635           "Map on the calling process (with rank " << myRank << ", of " <<
01636           numProcs << " process(es)).  In this case, you may not insert new "
01637           "entries into rows owned by the calling process.";
01638 
01639         if (! getRowMap ()->isNodeGlobalElement (globalRow)) {
01640           err << "  Furthermore, GID->LID conversion with the row Map claims that "
01641             "the global row index is owned on the calling process, yet "
01642             "getRowMap()->isNodeGlobalElement(globalRow) returns false.  That's"
01643             " weird!  This might indicate a Map bug.  Please report this to the"
01644             " Tpetra developers.";
01645         }
01646         TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01647           this->isStaticGraph (), std::runtime_error, err.str ());
01648       }
01649 
01650       if (! myGraph_->indicesAreAllocated ()) {
01651         try {
01652           allocateValues (GlobalIndices, GraphNotYetAllocated);
01653         }
01654         catch (std::exception& e) {
01655           TEUCHOS_TEST_FOR_EXCEPTION(
01656             true, std::runtime_error, "Tpetra::CrsMatrix::insertGlobalValues: "
01657             "allocateValues(GlobalIndices,GraphNotYetAllocated) threw an "
01658             "exception: " << e.what ());
01659         }
01660       }
01661 
01662       const size_type numEntriesToInsert = indices.size ();
01663       // If the matrix has a column Map, check at this point whether
01664       // the column indices belong to the column Map.
01665       //
01666       // FIXME (mfh 16 May 2013) We may want to consider deferring the
01667       // test to the CrsGraph method, since it may have to do this
01668       // anyway.
01669       if (hasColMap ()) {
01670         const map_type& colMap = * (getColMap ());
01671         // In a debug build, keep track of the nonowned ("bad") column
01672         // indices, so that we can display them in the exception
01673         // message.  In a release build, just ditch the loop early if
01674         // we encounter a nonowned column index.
01675 #ifdef HAVE_TPETRA_DEBUG
01676         Array<GO> badColInds;
01677 #endif // HAVE_TPETRA_DEBUG
01678         bool allInColMap = true;
01679         for (size_type k = 0; k < numEntriesToInsert; ++k) {
01680           if (! colMap.isNodeGlobalElement (indices[k])) {
01681             allInColMap = false;
01682 #ifdef HAVE_TPETRA_DEBUG
01683             badColInds.push_back (indices[k]);
01684 #else
01685             break;
01686 #endif // HAVE_TPETRA_DEBUG
01687           }
01688         }
01689         if (! allInColMap) {
01690           std::ostringstream os;
01691           os << "You attempted to insert entries in owned row " << globalRow
01692              << ", at the following column indices: " << toString (indices)
01693              << "." << endl;
01694 #ifdef HAVE_TPETRA_DEBUG
01695           os << "Of those, the following indices are not in the column Map on "
01696             "this process: " << toString (badColInds) << "." << endl << "Since "
01697             "the matrix has a column Map already, it is invalid to insert "
01698             "entries at those locations.";
01699 #else
01700           os << "At least one of those indices is not in the column Map on this "
01701             "process." << endl << "It is invalid to insert into columns not in "
01702             "the column Map on the process that owns the row.";
01703 #endif // HAVE_TPETRA_DEBUG
01704           TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01705             ! allInColMap, std::invalid_argument, os.str ());
01706         }
01707       }
01708 
01709       typename Graph::SLocalGlobalViews inds_view;
01710       ArrayView<const Scalar> vals_view;
01711 
01712       inds_view.ginds = indices;
01713       vals_view       = values;
01714 
01715 #ifdef HAVE_TPETRA_DEBUG
01716       RowInfo rowInfo;
01717       try {
01718         rowInfo = myGraph_->getRowInfo (localRow);
01719       } catch (std::exception& e) {
01720         TEUCHOS_TEST_FOR_EXCEPTION(
01721           true, std::runtime_error, "myGraph_->getRowInfo(localRow=" << localRow
01722           << ") threw an exception: " << e.what ());
01723       }
01724 #else
01725       RowInfo rowInfo = myGraph_->getRowInfo (localRow);
01726 #endif // HAVE_TPETRA_DEBUG
01727 
01728       const size_t curNumEntries = rowInfo.numEntries;
01729       const size_t newNumEntries =
01730         curNumEntries + static_cast<size_t> (numEntriesToInsert);
01731       if (newNumEntries > rowInfo.allocSize) {
01732         TEUCHOS_TEST_FOR_EXCEPTION(
01733           getProfileType () == StaticProfile && newNumEntries > rowInfo.allocSize,
01734           std::runtime_error, "Tpetra::CrsMatrix::insertGlobalValues: new "
01735           "indices exceed statically allocated graph structure.  curNumEntries"
01736           " (" << curNumEntries << ") + numEntriesToInsert (" <<
01737           numEntriesToInsert << ") > allocSize (" << rowInfo.allocSize << ").");
01738 
01739         // Update allocation only as much as necessary
01740         try {
01741           rowInfo =
01742             myGraph_->template updateGlobalAllocAndValues<Scalar> (rowInfo,
01743                                                                    newNumEntries,
01744                                                                    values2D_[localRow]);
01745         } catch (std::exception& e) {
01746           TEUCHOS_TEST_FOR_EXCEPTION(
01747             true, std::runtime_error, "myGraph_->updateGlobalAllocAndValues"
01748             "(...) threw an exception: " << e.what ());
01749         }
01750       }
01751       try {
01752         if (isGloballyIndexed ()) {
01753           // lg=GlobalIndices, I=GlobalIndices means the method calls
01754           // getGlobalViewNonConst() and does direct copying, which
01755           // should be reasonably fast.
01756           myGraph_->template insertIndicesAndValues<Scalar> (rowInfo, inds_view,
01757                                                              this->getViewNonConst (rowInfo),
01758                                                              values,
01759                                                              GlobalIndices, GlobalIndices);
01760         }
01761         else {
01762           // lg=GlobalIndices, I=LocalIndices means the method calls
01763           // the Map's getLocalElement() method once per entry to
01764           // insert.  This may be slow.
01765           myGraph_->template insertIndicesAndValues<Scalar> (rowInfo, inds_view,
01766                                                              this->getViewNonConst (rowInfo),
01767                                                              values,
01768                                                              GlobalIndices, LocalIndices);
01769         }
01770       }
01771       catch (std::exception& e) {
01772         TEUCHOS_TEST_FOR_EXCEPTION(
01773           true, std::runtime_error, "myGraph_->insertIndicesAndValues(...) "
01774           "threw an exception: " << e.what ());
01775       }
01776 
01777 #ifdef HAVE_TPETRA_DEBUG
01778       const size_t chkNewNumEntries = myGraph_->getNumEntriesInLocalRow (localRow);
01779       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(chkNewNumEntries != newNumEntries,
01780         std::logic_error, ": There should be a total of " << newNumEntries
01781         << " entries in the row, but the graph now reports " << chkNewNumEntries
01782         << " entries.  Please report this bug to the Tpetra developers.");
01783 #endif // HAVE_TPETRA_DEBUG
01784     }
01785   }
01786 
01787 
01788   template<class Scalar,
01789            class LocalOrdinal,
01790            class GlobalOrdinal, class DeviceType>
01791   void
01792   CrsMatrix<
01793     Scalar, LocalOrdinal, GlobalOrdinal,
01794     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
01795   insertGlobalValuesFiltered (const GlobalOrdinal globalRow,
01796                               const ArrayView<const GlobalOrdinal>& indices,
01797                               const ArrayView<const Scalar>& values)
01798   {
01799     typedef LocalOrdinal LO;
01800     typedef GlobalOrdinal GO;
01801     const char tfecfFuncName[] = "insertGlobalValuesFiltered";
01802 
01803     // mfh 14 Dec 2012: Defer test for static graph until we know that
01804     // globalRow is in the row Map.  If it's not in the row Map, it
01805     // doesn't matter whether or not the graph is static; the data
01806     // just get stashed for later use by globalAssemble().
01807     //
01808     // TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01809     //   isStaticGraph(), std::runtime_error,
01810     //   ": matrix was constructed with static graph. Cannot insert new entries.");
01811 #ifdef HAVE_TPETRA_DEBUG
01812     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01813       values.size() != indices.size(), std::runtime_error,
01814       ": values.size() must equal indices.size().  values.size() = "
01815       << values.size() << ", but indices.size() = " << indices.size() << ".");
01816 #endif // HAVE_TPETRA_DEBUG
01817 
01818     const LO lrow = getRowMap ()->getLocalElement (globalRow);
01819 
01820     if (lrow != Teuchos::OrdinalTraits<LO>::invalid ()) { // globalRow is in our row Map.
01821       // If the matrix has a static graph, this process is not allowed
01822       // to insert into rows it owns.
01823       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01824         this->isStaticGraph(), std::runtime_error,
01825         ": The CrsMatrix was constructed with a static graph.  In that case, "
01826         "it's forbidded to insert new entries into rows owned by the calling process.");
01827       if (! myGraph_->indicesAreAllocated ()) {
01828         allocateValues (GlobalIndices, GraphNotYetAllocated);
01829       }
01830       typename Graph::SLocalGlobalViews inds_view;
01831       ArrayView<const Scalar> vals_view;
01832 
01833       // We have to declare these Arrays here rather than in the
01834       // hasColMap() if branch, so that views to them will remain
01835       // valid for the whole scope.
01836       Array<GO> filtered_indices;
01837       Array<Scalar> filtered_values;
01838       if (hasColMap ()) { // We have a column Map.
01839         // Use column Map to filter the indices and corresponding
01840         // values, so that we only insert entries into columns we own.
01841         filtered_indices.assign (indices.begin (), indices.end ());
01842         filtered_values.assign (values.begin (), values.end ());
01843         const size_t numFilteredEntries =
01844           myGraph_->template filterGlobalIndicesAndValues<Scalar> (filtered_indices (),
01845                                                                    filtered_values ());
01846         inds_view.ginds = filtered_indices (0, numFilteredEntries);
01847         vals_view       = filtered_values (0, numFilteredEntries);
01848       }
01849       else { // we don't have a column Map.
01850         inds_view.ginds = indices;
01851         vals_view       = values;
01852       }
01853       const size_t numFilteredEntries = vals_view.size ();
01854       // add the new indices and values
01855       if (numFilteredEntries > 0) {
01856         RowInfo rowInfo = myGraph_->getRowInfo(lrow);
01857         const size_t curNumEntries = rowInfo.numEntries;
01858         const size_t newNumEntries = curNumEntries + numFilteredEntries;
01859         if (newNumEntries > rowInfo.allocSize) {
01860           TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01861             getProfileType() == StaticProfile, std::runtime_error,
01862             ": new indices exceed statically allocated graph structure.");
01863 
01864           // Update allocation only as much as necessary
01865           rowInfo = myGraph_->template updateGlobalAllocAndValues<Scalar> (rowInfo, newNumEntries,
01866                                                                            values2D_[lrow]);
01867         }
01868         if (isGloballyIndexed ()) {
01869           // lg=GlobalIndices, I=GlobalIndices means the method calls
01870           // getGlobalViewNonConst() and does direct copying, which
01871           // should be reasonably fast.
01872           myGraph_->template insertIndicesAndValues<Scalar> (rowInfo, inds_view,
01873                                                              this->getViewNonConst (rowInfo),
01874                                                              vals_view,
01875                                                              GlobalIndices, GlobalIndices);
01876         }
01877         else {
01878           // lg=GlobalIndices, I=LocalIndices means the method calls
01879           // the Map's getLocalElement() method once per entry to
01880           // insert.  This may be slow.
01881           myGraph_->template insertIndicesAndValues<Scalar> (rowInfo, inds_view,
01882                                                              this->getViewNonConst (rowInfo),
01883                                                              vals_view,
01884                                                              GlobalIndices, LocalIndices);
01885         }
01886 #ifdef HAVE_TPETRA_DEBUG
01887         {
01888           const size_t chkNewNumEntries = myGraph_->getNumEntriesInLocalRow(lrow);
01889           TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(chkNewNumEntries != newNumEntries,
01890             std::logic_error, ": There should be a total of " << newNumEntries
01891             << " entries in the row, but the graph now reports " << chkNewNumEntries
01892             << " entries.  Please report this bug to the Tpetra developers.");
01893         }
01894 #endif // HAVE_TPETRA_DEBUG
01895       }
01896     }
01897     else { // The calling process doesn't own the given row.
01898       insertNonownedGlobalValues (globalRow, indices, values);
01899     }
01900   }
01901 
01902 
01903   template<class Scalar,
01904            class LocalOrdinal,
01905            class GlobalOrdinal,
01906            class DeviceType>
01907   LocalOrdinal
01908   CrsMatrix<
01909     Scalar, LocalOrdinal, GlobalOrdinal,
01910     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
01911   replaceLocalValues (const LocalOrdinal localRow,
01912                       const ArrayView<const LocalOrdinal> &indices,
01913                       const ArrayView<const Scalar>& values)
01914   {
01915     using Teuchos::Array;
01916     using Teuchos::ArrayView;
01917     // project2nd is a binary function that returns its second
01918     // argument.  This replaces entries in the given row with their
01919     // corresponding entry of values.
01920     typedef Tpetra::project2nd<Scalar, Scalar> f_type;
01921     typedef LocalOrdinal LO;
01922     typedef GlobalOrdinal GO;
01923     typedef typename ArrayView<GO>::size_type size_type;
01924 
01925     if (! isFillActive ()) {
01926       // Fill must be active in order to call this method.
01927       return Teuchos::OrdinalTraits<LO>::invalid ();
01928     }
01929     else if (! this->hasColMap ()) {
01930       // There is no such thing as local column indices without a column Map.
01931       return Teuchos::OrdinalTraits<LO>::invalid ();
01932     }
01933     else if (values.size () != indices.size ()) {
01934       // The sizes of values and indices must match.
01935       return Teuchos::OrdinalTraits<LO>::invalid ();
01936     }
01937     const bool isLocalRow = getRowMap ()->isNodeLocalElement (localRow);
01938     if (! isLocalRow) {
01939       // The calling process doesn't own the local row, so we can't
01940       // insert into it.
01941       return static_cast<LO> (0);
01942     }
01943 
01944     if (indices.size () == 0) {
01945       return static_cast<LO> (0);
01946     }
01947     else {
01948       RowInfo rowInfo = staticGraph_->getRowInfo (localRow);
01949       ArrayView<Scalar> curVals = this->getViewNonConst (rowInfo);
01950       if (isLocallyIndexed ()) {
01951         return staticGraph_->template transformLocalValues<Scalar, f_type> (rowInfo, curVals,
01952                                                                             indices, values,
01953                                                                             f_type ());
01954       }
01955       else if (isGloballyIndexed ()) {
01956         // Convert the given local indices to global indices.
01957         //
01958         // FIXME (mfh 27 Jun 2014) Why can't we ask the graph to do
01959         // that?  It could do the conversions in place, so that we
01960         // wouldn't need temporary storage.
01961         const map_type& colMap = * (this->getColMap ());
01962         const size_type numInds = indices.size ();
01963 
01964         // mfh 27 Jun 2014: Some of the given local indices might be
01965         // invalid.  That's OK, though, since the graph ignores them
01966         // and their corresponding values in transformGlobalValues.
01967         // Thus, we don't have to count how many indices are valid.
01968         // We do so just as a sanity check.
01969         Array<GO> gblInds (numInds);
01970         size_type numValid = 0; // sanity check count of # valid indices
01971         for (size_type k = 0; k < numInds; ++k) {
01972           const GO gid = colMap.getGlobalElement (indices[k]);
01973           gblInds[k] = gid;
01974           if (gid != Teuchos::OrdinalTraits<GO>::invalid ()) {
01975             ++numValid; // sanity check count of # valid indices
01976           }
01977         }
01978         const LO numXformed =
01979           staticGraph_->template transformGlobalValues<Scalar, f_type> (rowInfo,
01980                                                                         curVals, // target
01981                                                                         gblInds,
01982                                                                         values, // source
01983                                                                         f_type ());
01984         if (static_cast<size_type> (numXformed) != numValid) {
01985           return Teuchos::OrdinalTraits<LO>::invalid ();
01986         } else {
01987           return numXformed;
01988         }
01989       }
01990       // NOTE (mfh 26 Jun 2014) In the current version of CrsMatrix,
01991       // it's possible for a matrix (or graph) to be neither locally
01992       // nor globally indexed on a process.  This means that the graph
01993       // or matrix has no entries on that process.  Epetra also works
01994       // like this.  It's related to lazy allocation (on first
01995       // insertion, not at graph / matrix construction).  Lazy
01996       // allocation will go away because it is not thread scalable.
01997       return static_cast<LO> (0);
01998     }
01999   }
02000 
02001 
02002   template<class Scalar,
02003            class LocalOrdinal,
02004            class GlobalOrdinal, class DeviceType>
02005   LocalOrdinal
02006   CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02007   replaceGlobalValues (GlobalOrdinal globalRow,
02008                        const ArrayView<const GlobalOrdinal> &indices,
02009                        const ArrayView<const Scalar>        &values)
02010   {
02011     typedef LocalOrdinal LO;
02012     typedef GlobalOrdinal GO;
02013     using Teuchos::Array;
02014     using Teuchos::ArrayView;
02015     typedef typename ArrayView<const GO>::size_type size_type;
02016     // project2nd is a binary function that returns its second
02017     // argument.  This replaces entries in the given row with their
02018     // corresponding entry of values.
02019     typedef Tpetra::project2nd<Scalar, Scalar> f_type;
02020 
02021     if (! isFillActive ()) {
02022       // Fill must be active in order to call this method.
02023       return Teuchos::OrdinalTraits<LO>::invalid ();
02024     }
02025     else if (values.size () != indices.size ()) {
02026       // The sizes of values and indices must match.
02027       return Teuchos::OrdinalTraits<LO>::invalid ();
02028     }
02029 
02030     const LO lrow = this->getRowMap()->getLocalElement (globalRow);
02031     if (lrow == Teuchos::OrdinalTraits<LO>::invalid ()) {
02032       // We don't own the row, so we're not allowed to modify its values.
02033       return Teuchos::OrdinalTraits<LO>::invalid ();
02034     }
02035 
02036     if (staticGraph_.is_null ()) {
02037       return Teuchos::OrdinalTraits<LO>::invalid ();
02038     }
02039     const crs_graph_type& graph = *staticGraph_;
02040     RowInfo rowInfo = graph.getRowInfo (lrow);
02041     if (indices.size () == 0) {
02042       return static_cast<LO> (0);
02043     }
02044     else {
02045       ArrayView<Scalar> curVals = this->getViewNonConst (rowInfo);
02046       if (isLocallyIndexed ()) {
02047         // Convert the given global indices to local indices.
02048         //
02049         // FIXME (mfh 08 Jul 2014) Why can't we ask the graph to do
02050         // that?  It could do the conversions in place, so that we
02051         // wouldn't need temporary storage.
02052         const map_type& colMap = * (this->getColMap ());
02053         const size_type numInds = indices.size ();
02054         Array<LO> lclInds (numInds);
02055         for (size_type k = 0; k < numInds; ++k) {
02056           // There is no need to filter out indices not in the
02057           // column Map.  Those that aren't will be mapped to
02058           // invalid(), which the graph's transformGlobalValues()
02059           // will filter out (but not count in its return value).
02060           lclInds[k] = colMap.getLocalElement (indices[k]);
02061         }
02062         return graph.template transformLocalValues<Scalar, f_type> (rowInfo,
02063                                                                     curVals,
02064                                                                     lclInds (),
02065                                                                     values,
02066                                                                     f_type ());
02067       }
02068       else if (isGloballyIndexed ()) {
02069         return graph.template transformGlobalValues<Scalar, f_type> (rowInfo,
02070                                                                      curVals,
02071                                                                      indices,
02072                                                                      values,
02073                                                                      f_type ());
02074       }
02075       else {
02076         // If the graph is neither locally nor globally indexed on
02077         // the calling process, that means that the calling process
02078         // can't possibly have any entries in the owned row.  Thus,
02079         // there are no entries to transform, so we return zero.
02080         return static_cast<LO> (0);
02081       }
02082     }
02083   }
02084 
02085 
02086   template<class Scalar,
02087            class LocalOrdinal,
02088            class GlobalOrdinal, class DeviceType>
02089   LocalOrdinal
02090   CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02091   sumIntoGlobalValues (const GlobalOrdinal globalRow,
02092                        const ArrayView<const GlobalOrdinal> &indices,
02093                        const ArrayView<const Scalar>        &values)
02094 
02095   {
02096     typedef LocalOrdinal LO;
02097     typedef GlobalOrdinal GO;
02098     using Teuchos::Array;
02099     using Teuchos::ArrayView;
02100     typedef typename ArrayView<const GO>::size_type size_type;
02101     typedef std::plus<Scalar> f_type;
02102 
02103     if (! isFillActive ()) {
02104       // Fill must be active in order to call this method.
02105       return Teuchos::OrdinalTraits<LO>::invalid ();
02106     }
02107     else if (values.size () != indices.size ()) {
02108       // The sizes of values and indices must match.
02109       return Teuchos::OrdinalTraits<LO>::invalid ();
02110     }
02111 
02112     const LO lrow = this->getRowMap()->getLocalElement (globalRow);
02113     if (lrow == Teuchos::OrdinalTraits<LO>::invalid ()) {
02114       // globalRow is not in the row Map, so stash the given entries
02115       // away in a separate data structure.  globalAssemble() (called
02116       // during fillComplete()) will exchange that data and sum it in
02117       // using sumIntoGlobalValues().
02118       this->insertNonownedGlobalValues (globalRow, indices, values);
02119       // FIXME (mfh 08 Jul 2014) It's not clear what to return here,
02120       // since we won't know whether the given indices were valid
02121       // until globalAssemble (called in fillComplete) is called.
02122       // That's why insertNonownedGlobalValues doesn't return
02123       // anything.  Just for consistency, I'll return the number of
02124       // entries that the user gave us.
02125       return static_cast<LO> (indices.size ());
02126     }
02127 
02128     if (staticGraph_.is_null ()) {
02129       return Teuchos::OrdinalTraits<LO>::invalid ();
02130     }
02131     const crs_graph_type& graph = *staticGraph_;
02132     RowInfo rowInfo = graph.getRowInfo (lrow);
02133     if (indices.size () == 0) {
02134       return static_cast<LO> (0);
02135     }
02136     else {
02137       ArrayView<Scalar> curVals = this->getViewNonConst (rowInfo);
02138       if (isLocallyIndexed ()) {
02139         // Convert the given global indices to local indices.
02140         //
02141         // FIXME (mfh 08 Jul 2014) Why can't we ask the graph to do
02142         // that?  It could do the conversions in place, so that we
02143         // wouldn't need temporary storage.
02144         const map_type& colMap = * (this->getColMap ());
02145         const size_type numInds = indices.size ();
02146         Array<LO> lclInds (numInds);
02147         for (size_type k = 0; k < numInds; ++k) {
02148           // There is no need to filter out indices not in the
02149           // column Map.  Those that aren't will be mapped to
02150           // invalid(), which the graph's transformGlobalValues()
02151           // will filter out (but not count in its return value).
02152           lclInds[k] = colMap.getLocalElement (indices[k]);
02153         }
02154         return graph.template transformLocalValues<Scalar, f_type> (rowInfo,
02155                                                                     curVals,
02156                                                                     lclInds (),
02157                                                                     values,
02158                                                                     f_type ());
02159       }
02160       else if (isGloballyIndexed ()) {
02161         return graph.template transformGlobalValues<Scalar, f_type> (rowInfo,
02162                                                                      curVals,
02163                                                                      indices,
02164                                                                      values,
02165                                                                      f_type ());
02166       }
02167       else {
02168         // If the graph is neither locally nor globally indexed on
02169         // the calling process, that means that the calling process
02170         // can't possibly have any entries in the owned row.  Thus,
02171         // there are no entries to transform, so we return zero.
02172         return static_cast<LO> (0);
02173       }
02174     }
02175   }
02176 
02177 
02178   template <class Scalar,
02179             class LocalOrdinal,
02180             class GlobalOrdinal, class DeviceType>
02181   LocalOrdinal
02182   CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02183   sumIntoLocalValues (const LocalOrdinal localRow,
02184                       const ArrayView<const LocalOrdinal>& indices,
02185                       const ArrayView<const Scalar>& values)
02186   {
02187     using Teuchos::Array;
02188     using Teuchos::ArrayView;
02189     typedef std::plus<Scalar> f_type;
02190     typedef LocalOrdinal LO;
02191     typedef GlobalOrdinal GO;
02192     typedef typename ArrayView<GO>::size_type size_type;
02193 
02194     if (! isFillActive ()) {
02195       // Fill must be active in order to call this method.
02196       return Teuchos::OrdinalTraits<LO>::invalid ();
02197     }
02198     else if (! this->hasColMap ()) {
02199       // There is no such thing as local column indices without a column Map.
02200       return Teuchos::OrdinalTraits<LO>::invalid ();
02201     }
02202     else if (values.size () != indices.size ()) {
02203       // The sizes of values and indices must match.
02204       return Teuchos::OrdinalTraits<LO>::invalid ();
02205     }
02206     const bool isLocalRow = getRowMap ()->isNodeLocalElement (localRow);
02207     if (! isLocalRow) {
02208       // The calling process doesn't own the local row, so we can't
02209       // insert into it.
02210       return static_cast<LO> (0);
02211     }
02212 
02213     if (indices.size () == 0) {
02214       return static_cast<LO> (0);
02215     }
02216     else {
02217       RowInfo rowInfo = staticGraph_->getRowInfo (localRow);
02218       ArrayView<Scalar> curVals = this->getViewNonConst (rowInfo);
02219       if (isLocallyIndexed ()) {
02220         return staticGraph_->template transformLocalValues<Scalar, f_type> (rowInfo, curVals,
02221                                                                             indices, values,
02222                                                                             f_type ());
02223       }
02224       else if (isGloballyIndexed ()) {
02225         // Convert the given local indices to global indices.
02226         //
02227         // FIXME (mfh 27 Jun 2014) Why can't we ask the graph to do
02228         // that?  It could do the conversions in place, so that we
02229         // wouldn't need temporary storage.
02230         const map_type& colMap = * (this->getColMap ());
02231         const size_type numInds = indices.size ();
02232 
02233         // mfh 27 Jun 2014: Some of the given local indices might be
02234         // invalid.  That's OK, though, since the graph ignores them
02235         // and their corresponding values in transformGlobalValues.
02236         // Thus, we don't have to count how many indices are valid.
02237         // We do so just as a sanity check.
02238         Array<GO> gblInds (numInds);
02239         size_type numValid = 0; // sanity check count of # valid indices
02240         for (size_type k = 0; k < numInds; ++k) {
02241           const GO gid = colMap.getGlobalElement (indices[k]);
02242           gblInds[k] = gid;
02243           if (gid != Teuchos::OrdinalTraits<GO>::invalid ()) {
02244             ++numValid; // sanity check count of # valid indices
02245           }
02246         }
02247         const LO numXformed =
02248           staticGraph_->template transformGlobalValues<Scalar, f_type> (rowInfo,
02249                                                                         curVals, // target
02250                                                                         gblInds,
02251                                                                         values, // source
02252                                                                         f_type ());
02253         if (static_cast<size_type> (numXformed) != numValid) {
02254           return Teuchos::OrdinalTraits<LO>::invalid ();
02255         } else {
02256           return numXformed;
02257         }
02258       }
02259       // NOTE (mfh 26 Jun 2014) In the current version of CrsMatrix,
02260       // it's possible for a matrix (or graph) to be neither locally
02261       // nor globally indexed on a process.  This means that the graph
02262       // or matrix has no entries on that process.  Epetra also works
02263       // like this.  It's related to lazy allocation (on first
02264       // insertion, not at graph / matrix construction).  Lazy
02265       // allocation will go away because it is not thread scalable.
02266       return static_cast<LO> (0);
02267     }
02268   }
02269 
02270 
02271   template<class Scalar,
02272            class LocalOrdinal,
02273            class GlobalOrdinal,
02274            class DeviceType>
02275   Teuchos::ArrayView<const Scalar>
02276   CrsMatrix<
02277     Scalar, LocalOrdinal, GlobalOrdinal,
02278     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02279   getView (RowInfo rowinfo) const
02280   {
02281     if (values1D_ != null && rowinfo.allocSize > 0) {
02282 #ifdef HAVE_TPETRA_DEBUG
02283       TEUCHOS_TEST_FOR_EXCEPTION(
02284         rowinfo.offset1D + rowinfo.allocSize > values1D_.size (),
02285         std::range_error, "Tpetra::CrsMatrix::getView: Invalid access "
02286         "to 1-D storage of values." << std::endl << "rowinfo.offset1D (" <<
02287         rowinfo.offset1D << ") + rowinfo.allocSize (" << rowinfo.allocSize <<
02288         ") > values1D_.size() (" << values1D_.size () << ").");
02289 #endif // HAVE_TPETRA_DEBUG
02290       return values1D_ (rowinfo.offset1D, rowinfo.allocSize);
02291     }
02292     else if (values2D_ != null) {
02293       return values2D_[rowinfo.localRow] ();
02294     }
02295     else {
02296       return Teuchos::ArrayView<Scalar> ();
02297     }
02298   }
02299 
02300 
02301   template<class Scalar,
02302            class LocalOrdinal,
02303            class GlobalOrdinal,
02304            class DeviceType>
02305   Teuchos::ArrayView<Scalar>
02306   CrsMatrix<
02307     Scalar,
02308     LocalOrdinal,
02309     GlobalOrdinal,
02310     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02311   getViewNonConst (RowInfo rowinfo)
02312   {
02313     if (values1D_ != null && rowinfo.allocSize > 0) {
02314 #ifdef HAVE_TPETRA_DEBUG
02315       TEUCHOS_TEST_FOR_EXCEPTION(
02316         rowinfo.offset1D + rowinfo.allocSize > values1D_.size (),
02317         std::range_error, "Tpetra::CrsMatrix::getViewNonConst: Invalid access "
02318         "to 1-D storage of values." << std::endl << "rowinfo.offset1D (" <<
02319         rowinfo.offset1D << ") + rowinfo.allocSize (" << rowinfo.allocSize <<
02320         ") > values1D_.size() (" << values1D_.size () << ").");
02321 #endif // HAVE_TPETRA_DEBUG
02322       return values1D_ (rowinfo.offset1D, rowinfo.allocSize);
02323     }
02324     else if (values2D_ != null) {
02325       return values2D_[rowinfo.localRow] ();
02326     }
02327     else {
02328       return Teuchos::ArrayView<Scalar> ();
02329     }
02330   }
02331 
02332 
02333   template<class Scalar,
02334            class LocalOrdinal,
02335            class GlobalOrdinal,
02336            class DeviceType>
02337   void
02338   CrsMatrix<
02339     Scalar,
02340     LocalOrdinal,
02341     GlobalOrdinal,
02342     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02343   getLocalRowCopy (LocalOrdinal localRow,
02344                    const Teuchos::ArrayView<LocalOrdinal>& indices,
02345                    const Teuchos::ArrayView<Scalar>& values,
02346                    size_t& numEntries) const
02347   {
02348     using Teuchos::ArrayView;
02349     typedef LocalOrdinal LO;
02350     typedef GlobalOrdinal GO;
02351 
02352     TEUCHOS_TEST_FOR_EXCEPTION(
02353       isGloballyIndexed () && ! hasColMap (), std::runtime_error,
02354       "Tpetra::CrsMatrix::getLocalRowCopy: The matrix is globally indexed and "
02355       "does not have a column Map yet.  That means we don't have local indices "
02356       "for columns yet, so it doesn't make sense to call this method.  If the "
02357       "matrix doesn't have a column Map yet, you should call fillComplete on "
02358       "it first.");
02359     TEUCHOS_TEST_FOR_EXCEPTION(
02360       ! staticGraph_->hasRowInfo (), std::runtime_error,
02361       "Tpetra::CrsMatrix::getLocalRowCopy: The graph's row information was "
02362       "deleted at fillComplete().");
02363 
02364     if (! this->getRowMap ()->isNodeLocalElement (localRow)) {
02365       numEntries = 0;
02366       return;
02367     }
02368 
02369     const RowInfo rowinfo = staticGraph_->getRowInfo(localRow);
02370     const size_t theNumEntries = rowinfo.numEntries;
02371 
02372     TEUCHOS_TEST_FOR_EXCEPTION(
02373       static_cast<size_t> (indices.size ()) < theNumEntries ||
02374       static_cast<size_t> (values.size ()) < theNumEntries,
02375       std::runtime_error,
02376       "Tpetra::CrsMatrix::getLocalRowCopy: The given row " << localRow
02377       << " has " << theNumEntries << " entries.  One or both of the given "
02378       "ArrayViews are not long enough to store that many entries.  indices "
02379       "can store " << indices.size() << " entries and values can store "
02380       << values.size() << " entries.");
02381 
02382     numEntries = theNumEntries;
02383 
02384     if (staticGraph_->isLocallyIndexed ()) {
02385       ArrayView<const LO> indrowview = staticGraph_->getLocalView (rowinfo);
02386       ArrayView<const Scalar> valrowview = getView (rowinfo);
02387       std::copy (indrowview.begin (), indrowview.begin () + numEntries, indices.begin ());
02388       std::copy (valrowview.begin (), valrowview.begin () + numEntries,  values.begin ());
02389     }
02390     else if (staticGraph_->isGloballyIndexed ()) {
02391       ArrayView<const GO> indrowview = staticGraph_->getGlobalView (rowinfo);
02392       ArrayView<const Scalar>        valrowview = getView (rowinfo);
02393       std::copy (valrowview.begin (), valrowview.begin () + numEntries, values.begin ());
02394 
02395       const map_type& colMap = * (this->getColMap ());
02396       for (size_t j=0; j < numEntries; ++j) {
02397         indices[j] = colMap.getLocalElement (indrowview[j]);
02398       }
02399     }
02400     else {
02401       numEntries = 0;
02402     }
02403   }
02404 
02405   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02406   void
02407   CrsMatrix<
02408     Scalar, LocalOrdinal, GlobalOrdinal,
02409     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02410   getGlobalRowCopy (GlobalOrdinal globalRow,
02411                     const Teuchos::ArrayView<GlobalOrdinal>& indices,
02412                     const Teuchos::ArrayView<Scalar>& values,
02413                     size_t& numEntries) const
02414   {
02415     // Only locally owned rows can be queried, otherwise complain
02416     const char tfecfFuncName[] = "getGlobalRowCopy";
02417     const LocalOrdinal lrow = getRowMap ()->getLocalElement (globalRow);
02418     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02419       lrow == OTL::invalid(), std::runtime_error,
02420       ": globalRow=" << globalRow << " does not belong to the calling process "
02421       << getComm()->getRank() << ".");
02422 
02423     const RowInfo rowinfo = staticGraph_->getRowInfo (lrow);
02424     numEntries = rowinfo.numEntries;
02425     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02426       static_cast<size_t> (indices.size ()) < numEntries || static_cast<size_t> (values.size ()) < numEntries,
02427       std::runtime_error,
02428       ": size of indices,values must be sufficient to store the specified row.");
02429 
02430     if (staticGraph_->isGloballyIndexed ()) {
02431       ArrayView<const GlobalOrdinal> indrowview = staticGraph_->getGlobalView(rowinfo);
02432       ArrayView<const Scalar>        valrowview = getView(rowinfo);
02433       std::copy( indrowview.begin(), indrowview.begin() + numEntries, indices.begin() );
02434       std::copy( valrowview.begin(), valrowview.begin() + numEntries,  values.begin() );
02435     }
02436     else if (staticGraph_->isLocallyIndexed ()) {
02437       ArrayView<const LocalOrdinal> indrowview = staticGraph_->getLocalView(rowinfo);
02438       ArrayView<const Scalar>       valrowview = getView(rowinfo);
02439       std::copy( valrowview.begin(), valrowview.begin() + numEntries, values.begin() );
02440       for (size_t j=0; j < numEntries; ++j) {
02441         indices[j] = getColMap ()->getGlobalElement (indrowview[j]);
02442       }
02443     }
02444     else {
02445 #ifdef HAVE_TPETRA_DEBUG
02446       // should have fallen in one of the above if indices are allocated
02447       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02448         staticGraph_->indicesAreAllocated (), std::logic_error,
02449         ": Internal logic error. Please contact Tpetra team.");
02450 #endif // HAVE_TPETRA_DEBUG
02451       numEntries = 0;
02452     }
02453   }
02454 
02455 
02458   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02459   void
02460   CrsMatrix<
02461     Scalar, LocalOrdinal, GlobalOrdinal,
02462     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02463   getLocalRowView (LocalOrdinal localRow,
02464                    Teuchos::ArrayView<const LocalOrdinal>& indices,
02465                    Teuchos::ArrayView<const Scalar>& values) const
02466   {
02467     const char tfecfFuncName[] = "getLocalRowView: ";
02468     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02469       isGloballyIndexed (), std::runtime_error, "The matrix currently stores "
02470       "its indices as global indices, so you cannot get a view with local "
02471       "column indices.  If the matrix has a column Map, you may call "
02472       "getLocalRowCopy() to get local column indices; otherwise, you may get "
02473       "a view with global column indices by calling getGlobalRowCopy().");
02474     indices = null;
02475     values  = null;
02476     if (getRowMap ()->isNodeLocalElement (localRow)) {
02477       const RowInfo rowinfo = staticGraph_->getRowInfo (localRow);
02478       if (rowinfo.numEntries > 0) {
02479         indices = staticGraph_->getLocalView(rowinfo);
02480         indices = indices(0,rowinfo.numEntries);
02481         values  = getView(rowinfo);
02482         values  = values(0,rowinfo.numEntries);
02483       }
02484     }
02485 #ifdef HAVE_TPETRA_DEBUG
02486     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02487       static_cast<size_t> (indices.size ()) != this->getNumEntriesInLocalRow (localRow) ||
02488       indices.size () != values.size (), std::logic_error,
02489       "Violated stated post-conditions. Please contact Tpetra team.");
02490 #endif // HAVE_TPETRA_DEBUG
02491   }
02492 
02493 
02496   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02497   void
02498   CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02499   getGlobalRowView (GlobalOrdinal globalRow,
02500                     Teuchos::ArrayView<const GlobalOrdinal>& indices,
02501                     Teuchos::ArrayView<const Scalar>& values) const
02502   {
02503     const char tfecfFuncName[] = "getGlobalRowView: ";
02504     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02505       isLocallyIndexed (), std::runtime_error,
02506       "The matrix is locally indexed, so we cannot return a view of the row "
02507       "with global column indices.  Use getGlobalRowCopy() instead.");
02508     indices = Teuchos::null;
02509     values  = Teuchos::null;
02510     const LocalOrdinal lrow = getRowMap ()->getLocalElement (globalRow);
02511     if (lrow != Teuchos::OrdinalTraits<LocalOrdinal>::invalid ()) {
02512       // getRowInfo() requires a local row index, whether or not
02513       // storage has been optimized.
02514       const RowInfo rowinfo = staticGraph_->getRowInfo(lrow);
02515       if (rowinfo.numEntries > 0) {
02516         indices = staticGraph_->getGlobalView (rowinfo);
02517         indices = indices (0, rowinfo.numEntries);
02518         values  = getView (rowinfo);
02519         values  = values (0, rowinfo.numEntries);
02520       }
02521     }
02522 #ifdef HAVE_TPETRA_DEBUG
02523     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02524       static_cast<size_t> (indices.size ()) != this->getNumEntriesInGlobalRow (globalRow) ||
02525       indices.size () != values.size (),
02526       std::logic_error,
02527       "Violated stated post-conditions. Please contact Tpetra team.");
02528 #endif // HAVE_TPETRA_DEBUG
02529   }
02530 
02531 
02534   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02535   void
02536   CrsMatrix<
02537     Scalar, LocalOrdinal, GlobalOrdinal,
02538     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02539   scale (const Scalar& alpha)
02540   {
02541     typedef LocalOrdinal LO;
02542     typedef Kokkos::SparseRowView<k_local_matrix_type> row_view_type;
02543     typedef typename Teuchos::Array<Scalar>::size_type size_type;
02544     const char tfecfFuncName[] = "scale: ";
02545 
02546     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02547       ! isFillActive (), std::runtime_error,
02548       "Fill must be active before you may call this method.  "
02549       "Please call resumeFill() to make fill active.");
02550 
02551     const size_t nlrs = staticGraph_->getNodeNumRows ();
02552     const size_t numAlloc = staticGraph_->getNodeAllocationSize ();
02553     const size_t numEntries = staticGraph_->getNodeNumEntries ();
02554     if (! staticGraph_->indicesAreAllocated () || nlrs == 0 ||
02555         numAlloc == 0 || numEntries == 0) {
02556       // do nothing
02557     }
02558     else {
02559       if (staticGraph_->getProfileType () == StaticProfile) {
02560         const LO lclNumRows = k_lclMatrix_.numRows ();
02561         for (LO lclRow = 0; lclRow < lclNumRows; ++lclRow) {
02562           row_view_type row_i = k_lclMatrix_.row (lclRow);
02563           for (LO k = 0; k < row_i.length; ++k) {
02564             row_i.value (k) *= alpha;
02565           }
02566         }
02567       }
02568       else if (staticGraph_->getProfileType () == DynamicProfile) {
02569         for (size_t row = 0; row < nlrs; ++row) {
02570           const size_type numEnt = getNumEntriesInLocalRow (row);
02571           Teuchos::ArrayView<Scalar> rowVals = values2D_[row] ();
02572           for (size_type k = 0; k < numEnt; ++k) {
02573             rowVals[k] *= alpha;
02574           }
02575         }
02576       }
02577     }
02578   }
02579 
02580 
02583   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02584   void CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::setAllToScalar(const Scalar &alpha)
02585   {
02586     const char tfecfFuncName[] = "setAllToScalar: ";
02587     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02588       ! isFillActive (), std::runtime_error,
02589       "Fill must be active before you may call this method.  "
02590       "Please call resumeFill() to make fill active.");
02591 
02592     // replace all values in the matrix
02593     // it is easiest to replace all allocated values, instead of replacing only the ones with valid entries
02594     // however, if there are no valid entries, we can short-circuit
02595     // furthermore, if the values aren't allocated, we can short-circuit (no entry have been inserted so far)
02596     const size_t     nlrs = staticGraph_->getNodeNumRows(),
02597                  numAlloc = staticGraph_->getNodeAllocationSize(),
02598                numEntries = staticGraph_->getNodeNumEntries();
02599     if (staticGraph_->indicesAreAllocated() == false || numAlloc == 0 || numEntries == 0) {
02600       // do nothing
02601     }
02602     else {
02603       if (staticGraph_->getProfileType() == StaticProfile) {
02604         std::fill( values1D_.begin(), values1D_.end(), alpha );
02605       }
02606       else if (staticGraph_->getProfileType() == DynamicProfile) {
02607         for (size_t row=0; row < nlrs; ++row) {
02608           std::fill( values2D_[row].begin(), values2D_[row].end(), alpha );
02609         }
02610       }
02611     }
02612   }
02613 
02614   template <class Scalar, class LocalOrdinal, class GlobalOrdinal,
02615             class DeviceType>
02616   void
02617   CrsMatrix<
02618     Scalar, LocalOrdinal, GlobalOrdinal,
02619     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02620   setAllValues (const t_RowPtrs& rowPointers,
02621                 const t_LocalOrdinal_1D& columnIndices,
02622                 const t_ValuesType& values)
02623   {
02624     const char tfecfFuncName[] = "setAllValues";
02625     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02626       columnIndices.size () != values.size (), std::runtime_error,
02627       ": columnIndices and values must have the same size.  columnIndices.size() = "
02628       << columnIndices.size () << " != values.size() = " << values.size () << ".");
02629     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02630       myGraph_.is_null (), std::runtime_error, ": myGraph_ must not be null.");
02631 
02632     try {
02633       myGraph_->setAllIndices (rowPointers, columnIndices);
02634     }
02635     catch (std::exception &e) {
02636       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02637         true, std::runtime_error, ": Caught exception while calling myGraph_->"
02638         "setAllIndices(): " << e.what ());
02639     }
02640     k_values1D_ = values;
02641     values1D_ = Kokkos::Compat::persistingView (k_values1D_);
02642     checkInternalState();
02643   }
02644 
02645   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02646   void
02647   CrsMatrix<
02648     Scalar, LocalOrdinal, GlobalOrdinal,
02649     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02650   setAllValues (const Teuchos::ArrayRCP<size_t>& rowPointers,
02651                 const Teuchos::ArrayRCP<LocalOrdinal>& columnIndices,
02652                 const Teuchos::ArrayRCP<Scalar>& values)
02653   {
02654     const char tfecfFuncName[] = "setAllValues";
02655     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02656       columnIndices.size () != values.size (), std::runtime_error,
02657       ": columnIndices and values must have the same size.  columnIndices.size() = "
02658       << columnIndices.size () << " != values.size() = " << values.size () << ".");
02659     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02660       myGraph_.is_null (), std::runtime_error, ": myGraph_ must not be null.");
02661 
02662     try {
02663       myGraph_->setAllIndices (rowPointers, columnIndices);
02664     }
02665     catch (std::exception &e) {
02666       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02667         true, std::runtime_error, ": Caught exception while calling myGraph_->"
02668         "setAllIndices(): " << e.what ());
02669     }
02670     k_values1D_ = Kokkos::Compat::getKokkosViewDeepCopy<DeviceType> (values ());
02671     values1D_ = Kokkos::Compat::persistingView (k_values1D_);
02672     checkInternalState();
02673   }
02674 
02675   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02676   void
02677   CrsMatrix<
02678     Scalar, LocalOrdinal, GlobalOrdinal,
02679     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02680   getLocalDiagOffsets (Teuchos::ArrayRCP<size_t>& offsets) const
02681   {
02682     using Teuchos::ArrayRCP;
02683     using Teuchos::ArrayView;
02684     const char tfecfFuncName[] = "getLocalDiagOffsets";
02685 
02686     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02687       ! hasColMap (), std::runtime_error,
02688       ": This method requires that the matrix have a column Map.");
02689     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02690       staticGraph_.is_null (), std::runtime_error,
02691       ": This method requires that the matrix have a graph.");
02692 
02693     const map_type& rowMap = * (this->getRowMap ());
02694     const map_type& colMap = * (this->getColMap ());
02695 
02696     const size_t myNumRows = getNodeNumRows ();
02697     if (static_cast<size_t> (offsets.size ()) != myNumRows) {
02698       offsets.resize (static_cast<size_t> (myNumRows));
02699     }
02700 
02701 #ifdef HAVE_TPETRA_DEBUG
02702     bool allRowMapDiagEntriesInColMap = true;
02703     bool allDiagEntriesFound = true;
02704 #endif // HAVE_TPETRA_DEBUG
02705 
02706     for (size_t r = 0; r < myNumRows; ++r) {
02707       const GlobalOrdinal rgid = rowMap.getGlobalElement (r);
02708       const LocalOrdinal rlid = colMap.getLocalElement (rgid);
02709 
02710 #ifdef HAVE_TPETRA_DEBUG
02711       if (rlid == Teuchos::OrdinalTraits<LocalOrdinal>::invalid ()) {
02712         allRowMapDiagEntriesInColMap = false;
02713       }
02714 #endif // HAVE_TPETRA_DEBUG
02715 
02716       if (rlid != Teuchos::OrdinalTraits<LocalOrdinal>::invalid ()) {
02717         RowInfo rowinfo = staticGraph_->getRowInfo (r);
02718         if (rowinfo.numEntries > 0) {
02719           offsets[r] = staticGraph_->findLocalIndex (rowinfo, rlid);
02720         }
02721         else {
02722           offsets[r] = Teuchos::OrdinalTraits<size_t>::invalid ();
02723 #ifdef HAVE_TPETRA_DEBUG
02724           allDiagEntriesFound = false;
02725 #endif // HAVE_TPETRA_DEBUG
02726         }
02727       }
02728     }
02729 
02730 #ifdef HAVE_TPETRA_DEBUG
02731     using Teuchos::reduceAll;
02732     using std::endl;
02733 
02734     const bool localSuccess =
02735       allRowMapDiagEntriesInColMap && allDiagEntriesFound;
02736     int localResults[3];
02737     localResults[0] = allRowMapDiagEntriesInColMap ? 1 : 0;
02738     localResults[1] = allDiagEntriesFound ? 1 : 0;
02739     // min-all-reduce will compute least rank of all the processes
02740     // that didn't succeed.
02741     localResults[2] =
02742       ! localSuccess ? getComm ()->getRank () : getComm ()->getSize ();
02743     int globalResults[3];
02744     globalResults[0] = 0;
02745     globalResults[1] = 0;
02746     globalResults[2] = 0;
02747     reduceAll<int, int> (* (getComm ()), Teuchos::REDUCE_MIN,
02748                          3, localResults, globalResults);
02749     if (globalResults[0] == 0 || globalResults[1] == 0) {
02750       std::ostringstream os; // build error message
02751       const bool both =
02752         globalResults[0] == 0 && globalResults[1] == 0;
02753       os << ": At least one process (including Process " << globalResults[2]
02754          << ") had the following issue" << (both ? "s" : "") << ":" << endl;
02755       if (globalResults[0] == 0) {
02756         os << "  - The column Map does not contain at least one diagonal entry "
02757           "of the matrix." << endl;
02758       }
02759       if (globalResults[1] == 0) {
02760         os << "  - There is a row on that / those process(es) that does not "
02761           "contain a diagonal entry." << endl;
02762       }
02763       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error, os.str());
02764     }
02765 #endif // HAVE_TPETRA_DEBUG
02766   }
02767 
02768   template<class Scalar, class LocalOrdinal, class GlobalOrdinal,
02769            class DeviceType>
02770   void
02771   CrsMatrix<
02772     Scalar, LocalOrdinal, GlobalOrdinal,
02773     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02774   getLocalDiagCopy (Vector<Scalar, LocalOrdinal, GlobalOrdinal, node_type>& dvec) const
02775   {
02776     using Teuchos::ArrayRCP;
02777     using Teuchos::ArrayView;
02778     const char tfecfFuncName[] = "getLocalDiagCopy";
02779     typedef Vector<Scalar, LocalOrdinal, GlobalOrdinal, node_type> vec_type;
02780     typedef typename vec_type::dual_view_type dual_view_type;
02781     typedef typename device_type::host_mirror_device_type host_device_type;
02782 
02783     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02784       ! hasColMap (), std::runtime_error,
02785       ": This method requires that the matrix have a column Map.");
02786     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02787       staticGraph_.is_null (), std::runtime_error,
02788       ": This method requires that the matrix have a graph.");
02789 
02790     const map_type& rowMap = * (this->getRowMap ());
02791     const map_type& colMap = * (this->getColMap ());
02792 
02793 #ifdef HAVE_TPETRA_DEBUG
02794     // isCompatible() requires an all-reduce, and thus this check
02795     // should only be done in debug mode.
02796     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02797       ! dvec.getMap ()->isCompatible (rowMap), std::runtime_error,
02798       ": The input Vector's Map must be compatible with the CrsMatrix's row "
02799       "Map.  You may check this by using Map's isCompatible method: "
02800       "dvec.getMap ()->isCompatible (A.getRowMap ());");
02801 #endif // HAVE_TPETRA_DEBUG
02802 
02803     // For now, we fill the Vector on the host and sync to device.
02804     // Later, we may write a parallel kernel that works entirely on
02805     // device.
02806     dual_view_type lclVec = dvec.getDualView ();
02807     typedef typename dual_view_type::t_host host_view_type;
02808     host_view_type lclVecHost = lclVec.h_view;
02809     lclVec.template modify<host_device_type> ();
02810 
02811     // 1-D subview of lclVecHost.  All the "typename" stuff ensures
02812     // that we get the same layout and memory traits as the original
02813     // 2-D view.
02814     typedef typename Kokkos::View<scalar_type*,
02815       typename host_view_type::array_layout, typename host_view_type::device_type,
02816       typename host_view_type::memory_traits>
02817       host_view_1d_type;
02818     host_view_1d_type lclVecHost1d =
02819       Kokkos::subview<host_view_1d_type> (lclVecHost, Kokkos::ALL (), 0);
02820 
02821     // Find the diagonal entries and put them in lclVecHost1d.
02822     const size_t myNumRows = getNodeNumRows ();
02823     for (size_t r = 0; r < myNumRows; ++r) {
02824       lclVecHost1d(r) = STS::zero (); // default value if no diag entry
02825       const GlobalOrdinal rgid = rowMap.getGlobalElement (r);
02826       const LocalOrdinal rlid = colMap.getLocalElement (rgid);
02827 
02828       if (rlid != Teuchos::OrdinalTraits<LocalOrdinal>::invalid ()) {
02829         RowInfo rowinfo = staticGraph_->getRowInfo (r);
02830         if (rowinfo.numEntries > 0) {
02831           const size_t j = staticGraph_->findLocalIndex (rowinfo, rlid);
02832           if (j != Teuchos::OrdinalTraits<size_t>::invalid ()) {
02833             ArrayView<const Scalar> view = this->getView (rowinfo);
02834             lclVecHost1d(r) = view[j];
02835           }
02836         }
02837       }
02838     }
02839     lclVec.template sync<device_type> (); // sync changes back to device
02840   }
02841 
02842 
02845   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02846   void
02847   CrsMatrix<
02848     Scalar, LocalOrdinal, GlobalOrdinal,
02849     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02850   getLocalDiagCopy (Vector<Scalar, LocalOrdinal, GlobalOrdinal, node_type>& diag,
02851                     const Teuchos::ArrayView<const size_t>& offsets) const
02852   {
02853     using Teuchos::ArrayRCP;
02854     using Teuchos::ArrayView;
02855     typedef Vector<Scalar, LocalOrdinal, GlobalOrdinal, node_type> vec_type;
02856     typedef typename vec_type::dual_view_type dual_view_type;
02857     typedef typename device_type::host_mirror_device_type host_device_type;
02858 
02859 #ifdef HAVE_TPETRA_DEBUG
02860     const char tfecfFuncName[] = "getLocalDiagCopy";
02861     const map_type& rowMap = * (this->getRowMap ());
02862     // isCompatible() requires an all-reduce, and thus this check
02863     // should only be done in debug mode.
02864     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02865       ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
02866       ": The input Vector's Map must be compatible with (in the sense of Map::"
02867       "isCompatible) the CrsMatrix's row Map.");
02868 #endif // HAVE_TPETRA_DEBUG
02869 
02870     // For now, we fill the Vector on the host and sync to device.
02871     // Later, we may write a parallel kernel that works entirely on
02872     // device.
02873     dual_view_type lclVec = diag.getDualView ();
02874     typedef typename dual_view_type::t_host host_view_type;
02875     host_view_type lclVecHost = lclVec.h_view;
02876     lclVec.template modify<host_device_type> ();
02877 
02878     // 1-D subview of lclVecHost.  All the "typename" stuff ensures
02879     // that we get the same layout and memory traits as the original
02880     // 2-D view.
02881     typedef typename Kokkos::View<scalar_type*,
02882       typename host_view_type::array_layout, typename host_view_type::device_type,
02883       typename host_view_type::memory_traits>
02884       host_view_1d_type;
02885     host_view_1d_type lclVecHost1d =
02886       Kokkos::subview<host_view_1d_type> (lclVecHost, Kokkos::ALL (), 0);
02887 
02888     // Find the diagonal entries and put them in lclVecHost1d.
02889     const size_t myNumRows = getNodeNumRows ();
02890     for (size_t i = 0; i < myNumRows; ++i) {
02891       lclVecHost1d(i) = STS::zero (); // default value if no diag entry
02892       if (offsets[i] != Teuchos::OrdinalTraits<size_t>::invalid ()) {
02893         ArrayView<const LocalOrdinal> ind;
02894         ArrayView<const Scalar> val;
02895         this->getLocalRowView (i, ind, val);
02896         lclVecHost1d(i) = val[offsets[i]];
02897       }
02898     }
02899     lclVec.template sync<device_type> (); // sync changes back to device
02900   }
02901 
02902 
02903   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02904   void CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::leftScale(
02905     const Vector<Scalar, LocalOrdinal, GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >& x)
02906   {
02907     using Teuchos::ArrayRCP;
02908     using Teuchos::ArrayView;
02909     using Teuchos::null;
02910     using Teuchos::RCP;
02911     using Teuchos::rcp;
02912     using Teuchos::rcpFromRef;
02913     typedef Vector<Scalar, LocalOrdinal, GlobalOrdinal, node_type> vec_type;
02914     const char tfecfFuncName[] = "leftScale";
02915 
02916     // FIXME (mfh 06 Aug 2014) This doesn't make sense.  The matrix
02917     // should only be modified when it is not fill complete.
02918     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02919       ! isFillComplete (), std::runtime_error,
02920       ": matrix must be fill complete.");
02921     RCP<const vec_type> xp;
02922 
02923     if (getRangeMap ()->isSameAs (* (x.getMap ()))){
02924       // Take from Epetra: If we have a non-trivial exporter, we must
02925       // import elements that are permuted or are on other processors.
02926       // (We will use the exporter to perform the import ("reverse
02927       // mode").)
02928       if (getCrsGraph ()->getExporter () != null) {
02929         RCP<vec_type> tempVec = rcp (new vec_type (getRowMap ()));
02930         tempVec->doImport (x, * (getCrsGraph ()->getExporter ()), INSERT);
02931         xp = tempVec;
02932       }
02933       else {
02934         xp = rcpFromRef (x);
02935       }
02936     }
02937     else if (getRowMap ()->isSameAs (* (x.getMap ()))) {
02938       xp = rcpFromRef (x);
02939     }
02940     else {
02941       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::invalid_argument, ": The "
02942         "input scaling vector x's Map must be the same as either the row Map or "
02943         "the range Map of the CrsMatrix.");
02944     }
02945     ArrayRCP<const Scalar> vectorVals = xp->getData(0);
02946     ArrayView<Scalar> rowValues = null;
02947 
02948     const size_t lclNumRows = this->getNodeNumRows ();
02949     for (size_t i = 0; i < lclNumRows; ++i) {
02950       const RowInfo rowinfo = staticGraph_->getRowInfo (static_cast<LocalOrdinal> (i));
02951       rowValues = this->getViewNonConst (rowinfo);
02952       const Scalar scaleValue = vectorVals[i];
02953       for (size_t j = 0; j < rowinfo.numEntries; ++j) {
02954         rowValues[j] *= scaleValue;
02955       }
02956       rowValues = null;
02957     }
02958   }
02959 
02960   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02961   void CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::rightScale(
02962     const Vector<Scalar, LocalOrdinal, GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >& x)
02963   {
02964     using Teuchos::ArrayRCP;
02965     using Teuchos::ArrayView;
02966     using Teuchos::null;
02967     using Teuchos::RCP;
02968     using Teuchos::rcp;
02969     using Teuchos::rcpFromRef;
02970     typedef Vector<Scalar, LocalOrdinal, GlobalOrdinal, node_type> vec_type;
02971     const char tfecfFuncName[] = "rightScale";
02972 
02973     // FIXME (mfh 06 Aug 2014) This doesn't make sense.  The matrix
02974     // should only be modified when it is not fill complete.
02975     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02976       ! isFillComplete (), std::runtime_error, ": matrix must be fill complete.");
02977     RCP<const vec_type> xp;
02978     if (getDomainMap ()->isSameAs (* (x.getMap ()))) {
02979       // Take from Epetra: If we have a non-trivial exporter, we must
02980       // import elements that are permuted or are on other processors.
02981       // (We will use the exporter to perform the import.)
02982       if (getCrsGraph ()->getImporter () != null) {
02983         RCP<vec_type> tempVec = rcp (new vec_type (getColMap ()));
02984         tempVec->doImport (x, * (getCrsGraph ()->getImporter ()), INSERT);
02985         xp = tempVec;
02986       }
02987       else {
02988         xp = rcpFromRef (x);
02989       }
02990     }
02991     else if (getRowMap ()->isSameAs (* (x.getMap ()))) {
02992       xp = rcpFromRef (x);
02993     }
02994     else {
02995       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02996         true, std::runtime_error, ": The vector x must be the same as either "
02997         "the row map or the range map");
02998     }
02999 
03000     ArrayRCP<const Scalar> vectorVals = xp->getData(0);
03001     ArrayView<Scalar> rowValues = null;
03002 
03003     const size_t lclNumRows = this->getNodeNumRows ();
03004     for (size_t i = 0; i < lclNumRows; ++i) {
03005       const RowInfo rowinfo = staticGraph_->getRowInfo (static_cast<LocalOrdinal> (i));
03006       rowValues = this->getViewNonConst (rowinfo);
03007       ArrayView<const LocalOrdinal> colInds;
03008       getCrsGraph ()->getLocalRowView (static_cast<LocalOrdinal> (i), colInds);
03009       for (size_t j = 0; j < rowinfo.numEntries; ++j) {
03010         rowValues[j] *= vectorVals[colInds[j]];
03011       }
03012     }
03013   }
03014 
03015   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03016   typename ScalarTraits<Scalar>::magnitudeType
03017   CrsMatrix<
03018     Scalar, LocalOrdinal, GlobalOrdinal,
03019     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03020   getFrobeniusNorm () const
03021   {
03022     using Teuchos::outArg;
03023     using Teuchos::REDUCE_SUM;
03024     using Teuchos::reduceAll;
03025     typedef typename ArrayRCP<const Scalar>::size_type size_type;
03026 
03027     // FIXME (mfh 05 Aug 2014) Write a thread-parallel kernel for the
03028     // local part of this computation.  It could make sense to put
03029     // this operation in the Kokkos::CrsMatrix.
03030 
03031     // check the cache first
03032     Magnitude frobNorm = frobNorm_;
03033     if (frobNorm == -STM::one ()) {
03034       Magnitude mySum = STM::zero ();
03035       if (getNodeNumEntries() > 0) {
03036         if (isStorageOptimized ()) {
03037           // "Optimized" storage is packed storage.  That means we can
03038           // iterate in one pass through the 1-D values array.
03039           const size_type numEntries =
03040             static_cast<size_type> (getNodeNumEntries ());
03041           for (size_type k = 0; k < numEntries; ++k) {
03042             // FIXME (mfh 05 Aug 2014) This assumes UVM.
03043             const Scalar val = k_values1D_(k);
03044             mySum += STS::real (val) * STS::real (val) +
03045               STS::imag (val) * STS::imag (val);
03046           }
03047         }
03048         else {
03049           const size_t numRows = getNodeNumRows ();
03050           for (size_t r = 0; r < numRows; ++r) {
03051             RowInfo rowInfo = myGraph_->getRowInfo (r);
03052             const size_type numEntries =
03053               static_cast<size_type> (rowInfo.numEntries);
03054             ArrayView<const Scalar> A_r =
03055               this->getView (rowInfo).view (0, numEntries);
03056             for (size_type k = 0; k < numEntries; ++k) {
03057               const Scalar val = A_r[k];
03058               mySum += STS::real (val) * STS::real (val) +
03059                 STS::imag (val) * STS::imag (val);
03060             }
03061           }
03062         }
03063       }
03064       Magnitude totalSum;
03065       reduceAll<int, Magnitude> (* (getComm ()), REDUCE_SUM,
03066                                  mySum, outArg (totalSum));
03067       frobNorm = STM::squareroot (totalSum);
03068     }
03069     if (isFillComplete ()) {
03070       // Only cache the result if the matrix is fill complete.
03071       // Otherwise, the values might still change.  resumeFill clears
03072       // the cache.
03073       frobNorm_ = frobNorm;
03074     }
03075     return frobNorm;
03076   }
03077 
03078   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03079   void
03080   CrsMatrix<
03081     Scalar,
03082     LocalOrdinal,
03083     GlobalOrdinal,
03084     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03085   replaceColMap (const Teuchos::RCP<const map_type>& newColMap)
03086   {
03087     const char tfecfFuncName[] = "replaceColMap";
03088     // FIXME (mfh 06 Aug 2014) What if the graph is locally indexed?
03089     // Then replacing the column Map might mean that we need to
03090     // reindex the column indices.
03091     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
03092       myGraph_.is_null (), std::runtime_error,
03093       ": This method does not work if the matrix has a const graph.  The whole "
03094       "idea of a const graph is that you are not allowed to change it, but this"
03095       " method necessarily must modify the graph, since the graph owns the "
03096       "matrix's column Map.");
03097     myGraph_->replaceColMap (newColMap);
03098   }
03099 
03100   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03101   void
03102   CrsMatrix<
03103     Scalar,
03104     LocalOrdinal,
03105     GlobalOrdinal,
03106     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03107   reindexColumns (crs_graph_type* const graph,
03108                   const Teuchos::RCP<const map_type>& newColMap,
03109                   const Teuchos::RCP<const import_type>& newImport,
03110                   const bool sortEachRow)
03111   {
03112     const char tfecfFuncName[] = "reindexColumns: ";
03113     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
03114       graph == NULL && myGraph_.is_null (), std::invalid_argument,
03115       "The input graph is NULL, but the matrix does not own its graph.");
03116 
03117     crs_graph_type& theGraph = (graph == NULL) ? *myGraph_ : *graph;
03118     const bool sortGraph = false; // we'll sort graph & matrix together below
03119     theGraph.reindexColumns (newColMap, newImport, sortGraph);
03120     if (sortEachRow && theGraph.isLocallyIndexed () && ! theGraph.isSorted ()) {
03121       // We can't just call sortEntries() here, because that fails if
03122       // the matrix has a const graph.  We want to use the given graph
03123       // in that case.
03124       const size_t lclNumRows = theGraph.getNodeNumRows ();
03125       for (size_t row = 0; row < lclNumRows; ++row) {
03126         RowInfo rowInfo = theGraph.getRowInfo (row);
03127         theGraph.template sortRowIndicesAndValues<Scalar> (rowInfo, this->getViewNonConst (rowInfo));
03128       }
03129       theGraph.indicesAreSorted_ = true;
03130     }
03131   }
03132 
03133   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03134   void
03135   CrsMatrix<
03136     Scalar,
03137     LocalOrdinal,
03138     GlobalOrdinal,
03139     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03140   replaceDomainMapAndImporter (const Teuchos::RCP<const map_type>& newDomainMap,
03141                                Teuchos::RCP<const import_type>& newImporter)
03142   {
03143     const char tfecfFuncName[] = "replaceDomainMapAndImporter: ";
03144     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
03145       myGraph_.is_null (), std::runtime_error,
03146       "This method does not work if the matrix has a const graph.  The whole "
03147       "idea of a const graph is that you are not allowed to change it, but this"
03148       " method necessarily must modify the graph, since the graph owns the "
03149       "matrix's domain Map and Import objects.");
03150     myGraph_->replaceDomainMapAndImporter (newDomainMap, newImporter);
03151   }
03152 
03153   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03154   void
03155   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal,
03156             Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03157   insertNonownedGlobalValues (const GlobalOrdinal globalRow,
03158                               const Teuchos::ArrayView<const GlobalOrdinal>& indices,
03159                               const Teuchos::ArrayView<const Scalar>& values)
03160   {
03161     typedef Teuchos::Array<GlobalOrdinal> index_array_type;
03162     typedef Teuchos::Array<Scalar> value_array_type;
03163     typedef typename index_array_type::size_type size_type;
03164 
03165     // Fetch the pending (indices, values) pair stored in nonlocals_ for
03166     // this row.  The lookup creates an empty pair if none exists yet.
03167     std::pair<index_array_type, value_array_type>& pending = nonlocals_[globalRow];
03168     index_array_type& pendingInds = pending.first;
03169     value_array_type& pendingVals = pending.second;
03170 
03171     // Reserve room for the old plus the new entries, then append in order.
03172     const size_type numNew = indices.size ();
03173     const size_type numTotal = pendingInds.size () + numNew;
03174     pendingInds.reserve (numTotal);
03175     pendingVals.reserve (numTotal);
03176     for (size_type pos = 0; pos != numNew; ++pos) {
03177       pendingInds.push_back (indices[pos]);
03178       pendingVals.push_back (values[pos]);
03179     }
03180   }
03181 
03182   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03183   void
03184   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal,
03185             Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03186   globalAssemble ()
03187   {
          // Ship the matrix entries that this process buffered in nonlocals_
          // (entries for rows it does not own) to the processes that the row
          // Map reports as owners of those rows, and add them to the matrix
          // there: sumIntoGlobalValues() for a static graph,
          // insertGlobalValues() otherwise.
          //
          // This method calls collective operations (reduceAll, gatherAll,
          // barrier) on the matrix's communicator.  It returns right after
          // the first all-reduce if no process has buffered entries.
          //
          // Throws std::runtime_error if fill is not active, or if the row
          // Map cannot find the owner of a buffered row on some process.
03188     using Teuchos::arcp;
03189     using Teuchos::Array;
03190     using Teuchos::ArrayRCP;
03191     using Teuchos::ArrayView;
03192     using Teuchos::CommRequest;
03193     using Teuchos::gatherAll;
03194     using Teuchos::isend;
03195     using Teuchos::ireceive;
03196     using Teuchos::null;
03197     using Teuchos::outArg;
03198     using Teuchos::RCP;
03199     using Teuchos::rcpFromRef;
03200     using Teuchos::REDUCE_MAX;
03201     using Teuchos::reduceAll;
03202     using Teuchos::SerialDenseMatrix;
03203     using Teuchos::tuple;
03204     using Teuchos::waitAll;
03205     using std::make_pair;
03206     using std::pair;
03207     typedef GlobalOrdinal GO;
03208     typedef typename Array<GO>::size_type size_type;
03209     // nonlocals_ contains the entries stored by previous calls to
03210     // insertGlobalValues() for nonowned rows.
03211     typedef std::map<GO, pair<Array<GO>, Array<Scalar> > > nonlocals_map_type;
03212     typedef typename nonlocals_map_type::const_iterator nonlocals_iter_type;
03213 
          // NOTE(review): tfecfFuncName is presumably consumed by name inside
          // the TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro -- confirm.
03214     const char tfecfFuncName[] = "globalAssemble";
03215     const Teuchos::Comm<int>& comm = * (getComm ());
03216     const int numImages = comm.getSize ();
03217     const int myImageID = comm.getRank ();
03218 
03219     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
03220       ! isFillActive (), std::runtime_error, ": requires that fill is active.");
03221 
03222     // Determine (via a global all-reduce) if any processes have
03223     // nonlocal entries to share.  This is necessary even if the
03224     // matrix has a static graph, because insertGlobalValues allows
03225     // nonlocal entries in that case.
03226     size_t MyNonlocals = nonlocals_.size(),
03227            MaxGlobalNonlocals;
03228     reduceAll<int, size_t> (comm, REDUCE_MAX, MyNonlocals,
03229                             outArg (MaxGlobalNonlocals));
03230     if (MaxGlobalNonlocals == 0) {
03231       return;  // no entries to share
03232     }
03233 
03234     // FIXME (mfh 14 Dec 2012) The code below reimplements an Export
03235     // operation.  It would be better just to use an Export.  See
03236     // Comment #34 in discussion of Bug 5782.
03237     //
03238     // mfh 24 Feb 2014: On the other hand, this is not technically an
03239     // Export, since the row Map might not necessarily be one-to-one.
03240 
03241     // compute a list of NLRs from nonlocals_ and use it to compute:
03242     //      IdsAndRows: a vector of (id,row) pairs
03243     //          NLR2Id: a map from NLR to the Id that owns it
03244     // globalNeighbors: a global graph of connectivity between images:
03245     //                  globalNeighbors(i,j) indicates that j sends to i
03246     //         sendIDs: a list of all images I send to
03247     //         recvIDs: a list of all images I receive from (constructed later)
03248     Array<pair<int,GlobalOrdinal> > IdsAndRows;
03249     std::map<GlobalOrdinal,int> NLR2Id;
03250     SerialDenseMatrix<int,char> globalNeighbors;
03251     Array<int> sendIDs, recvIDs;
03252     {
03253       // Construct the set of all nonowned rows encountered by this
03254       // process in insertGlobalValues() or sumIntoGlobalValues().
03255       std::set<GlobalOrdinal> setOfRows;
03256       for (nonlocals_iter_type iter = nonlocals_.begin ();
03257            iter != nonlocals_.end (); ++iter) {
03258         setOfRows.insert (iter->first);
03259       }
03260       // Copy the resulting set of nonowned rows into an Array.
03261       Array<GlobalOrdinal> NLRs (setOfRows.size ());
03262       std::copy (setOfRows.begin (), setOfRows.end (), NLRs.begin ());
03263 
03264       // get a list of ImageIDs for the non-local rows (NLRs)
03265       Array<int> NLRIds (NLRs.size ());
03266       {
03267         const LookupStatus stat =
03268           getRowMap ()->getRemoteIndexList (NLRs (), NLRIds ());
              // Reduce the local error flag with MAX so that every process
              // sees the same gblerr and takes the same branch below.
03269         const int lclerr = (stat == IDNotPresent ? 1 : 0);
03270         int gblerr;
03271         reduceAll<int, int> (comm, REDUCE_MAX, lclerr, outArg (gblerr));
03272         TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
03273           gblerr, std::runtime_error, ": non-local entries correspond to "
03274           "invalid rows.");
03275       }
03276 
03277       // build up a list of neighbors, as well as a map between NLRs and Ids
03278       // localNeighbors[i] != 0 iff I have data to send to image i
03279       // put NLRs,Ids into an array of pairs
03280       IdsAndRows.reserve (NLRs.size ());
03281       Array<char> localNeighbors (numImages, 0);
03282       typename Array<GO>::const_iterator nlr;
03283       typename Array<int>::const_iterator id;
03284       for (nlr = NLRs.begin (), id = NLRIds.begin ();
03285            nlr != NLRs.end (); ++nlr, ++id) {
03286         NLR2Id[*nlr] = *id;
03287         localNeighbors[*id] = 1;
03288         IdsAndRows.push_back (make_pair (*id, *nlr));
03289       }
03290       for (int j = 0; j < numImages; ++j) {
03291         if (localNeighbors[j]) {
03292           sendIDs.push_back (j);
03293         }
03294       }
03295       // sort IdsAndRows, by Ids first, then rows
03296       std::sort (IdsAndRows.begin (), IdsAndRows.end ());
03297       // gather from other nodes to form the full graph
03298       //
03299       // FIXME (mfh 24 Feb 2014) Ugh, this is awful!!!  It's making a
03300       // P x P matrix which is the full graph of process connectivity.
03301       // Neither Export nor Import does this!  It would probably be
03302       // more efficient to do the following:
03303       //
03304       //   1. Form the one-to-one version of the row Map, tgtMap
03305       //   2. Form the (possibly overlapping) Map srcMap, with the
03306       //      global row indices which are the keys of nonlocals_ on
03307       //      each process
03308       //   3. Construct an Export from srcMap to tgtMap
03309       //   4. Execute the Export with Tpetra::ADD
03310       globalNeighbors.shapeUninitialized (numImages, numImages);
03311       gatherAll (comm, numImages, localNeighbors.getRawPtr (),
03312                  numImages*numImages, globalNeighbors.values ());
03313       // globalNeighbors at this point contains (on all images) the
03314       // connectivity between the images.
03315       // globalNeighbors(i,j) != 0 means that j sends to i/that i receives from j
03316     }
03317 
03319     // FIGURE OUT WHO IS SENDING TO WHOM AND HOW MUCH
03320     // DO THIS IN THE PROCESS OF PACKING ALL OUTGOING DATA ACCORDING TO DESTINATION ID
03322 
03323     // loop over all columns to know from which images I can expect to receive something
03324     for (int j=0; j<numImages; ++j) {
03325       if (globalNeighbors (myImageID, j)) {
03326         recvIDs.push_back (j);
03327       }
03328     }
03329     const size_t numRecvs = recvIDs.size ();
03330 
03331     // we know how many we're sending to already
03332     // form a contiguous list of all data to be sent
03333     // track the number of entries for each ID
03334     Array<Details::CrsIJV<GlobalOrdinal,Scalar> > IJVSendBuffer;
03335     Array<size_t> sendSizes(sendIDs.size(), 0);
          // numSends is used as an index into sendIDs / sendSizes during the
          // loop below and is turned into a count after the loop.
03336     size_t numSends = 0;
03337     for (typename Array<pair<int,GlobalOrdinal> >::const_iterator IdAndRow = IdsAndRows.begin();
03338          IdAndRow != IdsAndRows.end(); ++IdAndRow)
03339     {
03340       const int id = IdAndRow->first;
03341       const GO row = IdAndRow->second;
03342 
03343       // have we advanced to a new send?
            // (IdsAndRows is sorted by id and sendIDs is in increasing
            // order, so the next id must be the next entry of sendIDs.)
03344       if (sendIDs[numSends] != id) {
03345         numSends++;
03346         TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
03347           sendIDs[numSends] != id, std::logic_error,
03348           ": internal logic error. Contact Tpetra team.");
03349       }
03350 
03351       // copy data for row into contiguous storage
03352       pair<Array<GO>, Array<Scalar> >& nonlocalsRow = nonlocals_[row];
03353       ArrayView<const GO> nonlocalsRow_colInds = nonlocalsRow.first ();
03354       ArrayView<const Scalar> nonlocalsRow_values = nonlocalsRow.second ();
03355       const size_type numNonlocalsRow = nonlocalsRow_colInds.size ();
03356 
03357       for (size_type k = 0; k < numNonlocalsRow; ++k) {
03358         const Scalar val = nonlocalsRow_values[k];
03359         const GO col = nonlocalsRow_colInds[k];
03360         IJVSendBuffer.push_back (Details::CrsIJV<GO, Scalar> (row, col, val));
03361         sendSizes[numSends]++;
03362       }
03363     }
03364     if (IdsAndRows.size () > 0) {
03365       numSends++; // one last increment, to make it a count instead of an index
03366     }
03367     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
03368       static_cast<size_type> (numSends) != sendIDs.size(),
03369       std::logic_error, ": internal logic error. Contact Tpetra team.");
03370 
03371     // don't need this data anymore
03372     // clear it before we start allocating a bunch of new memory
03373     nonlocals_.clear ();
03374 
03376     // TRANSMIT SIZE INFO BETWEEN SENDERS AND RECEIVERS
03378     // perform non-blocking sends: send sizes to our recipients
03379     Array<RCP<CommRequest<int> > > sendRequests;
03380     for (size_t s = 0; s < numSends ; ++s) {
03381       // we'll fake the memory management, because all communication will be local to this method and the scope of our data
03382       sendRequests.push_back (isend<int, size_t> (comm, rcpFromRef (sendSizes[s]), sendIDs[s]));
03383     }
03384     // perform non-blocking receives: receive sizes from our senders
03385     Array<RCP<CommRequest<int> > > recvRequests;
03386     Array<size_t> recvSizes (numRecvs);
03387     for (size_t r = 0; r < numRecvs; ++r) {
03388       // we'll fake the memory management, because all communication
03389       // will be local to this method and the scope of our data
03390       recvRequests.push_back (ireceive<int, size_t> (comm, rcpFromRef (recvSizes[r]), recvIDs[r]));
03391     }
03392     // wait on all
03393     if (! sendRequests.empty ()) {
03394       waitAll (comm, sendRequests ());
03395     }
03396     if (! recvRequests.empty ()) {
03397       waitAll (comm, recvRequests ());
03398     }
03399     comm.barrier ();
03400     sendRequests.clear ();
03401     recvRequests.clear ();
03402 
03404     // NOW SEND/RECEIVE ALL ROW DATA
03406     // from the size info, build the ArrayViews into IJVSendBuffer
03407     Array<ArrayView<Details::CrsIJV<GO, Scalar> > > sendBuffers (numSends, null);
03408     {
03409       size_t cur = 0;
03410       for (size_t s=0; s<numSends; ++s) {
03411         sendBuffers[s] = IJVSendBuffer (cur, sendSizes[s]);
03412         cur += sendSizes[s];
03413       }
03414     }
03415     // perform non-blocking sends
03416     for (size_t s = 0; s < numSends; ++s) {
03417       // we'll fake the memory management, because all communication
03418       // will be local to this method and the scope of our data
03419       ArrayRCP<Details::CrsIJV<GO, Scalar> > tmparcp =
03420         arcp (sendBuffers[s].getRawPtr (), 0, sendBuffers[s].size (), false);
03421       sendRequests.push_back (isend<int, Details::CrsIJV<GlobalOrdinal,Scalar> > (comm, tmparcp, sendIDs[s]));
03422     }
03423     // calculate amount of storage needed for receives
03424     // setup pointers for the receives as well
          // The initial value fixes the accumulator type of std::accumulate,
          // so pass a size_t zero; a plain 0 would sum the sizes in int and
          // could truncate a large total.
03425     size_t totalRecvSize = std::accumulate (recvSizes.begin (), recvSizes.end (), static_cast<size_t> (0));
03426     Array<Details::CrsIJV<GO, Scalar> > IJVRecvBuffer (totalRecvSize);
03427     // from the size info, build the ArrayViews into IJVRecvBuffer
03428     Array<ArrayView<Details::CrsIJV<GO, Scalar> > > recvBuffers (numRecvs, null);
03429     {
03430       size_t cur = 0;
03431       for (size_t r = 0; r < numRecvs; ++r) {
03432         recvBuffers[r] = IJVRecvBuffer (cur, recvSizes[r]);
03433         cur += recvSizes[r];
03434       }
03435     }
03436     // perform non-blocking recvs
03437     for (size_t r = 0; r < numRecvs ; ++r) {
03438       // we'll fake the memory management, because all communication
03439       // will be local to this method and the scope of our data
03440       ArrayRCP<Details::CrsIJV<GO, Scalar> > tmparcp =
03441         arcp (recvBuffers[r].getRawPtr (), 0, recvBuffers[r].size (), false);
03442       recvRequests.push_back (ireceive (comm, tmparcp, recvIDs[r]));
03443     }
03444     // perform waits
03445     if (! sendRequests.empty ()) {
03446       waitAll (comm, sendRequests ());
03447     }
03448     if (! recvRequests.empty ()) {
03449       waitAll (comm, recvRequests ());
03450     }
03451     comm.barrier ();
03452     sendRequests.clear ();
03453     recvRequests.clear ();
03454 
03456     // NOW PROCESS THE RECEIVED ROW DATA
03458     // TODO: instead of adding one entry at a time, add one row at a time.
03459     //       this requires resorting; they arrived sorted by sending node, so that entries could be non-contiguous if we received
03460     //       multiple entries for a particular row from different processors.
03461     //       it also requires restoring the data, which may make it not worth the trouble.
03462 
03463     typedef typename Array<Details::CrsIJV<GO, Scalar> >::const_iterator ijv_iter_type;
03464     if (this->isStaticGraph ()) {
03465       for (ijv_iter_type ijv = IJVRecvBuffer.begin ();
03466            ijv != IJVRecvBuffer.end (); ++ijv) {
03467         sumIntoGlobalValues (ijv->i, tuple (ijv->j), tuple (ijv->v));
03468       }
03469     }
03470     else { // Dynamic graph; can use insertGlobalValues ()
03471       for (ijv_iter_type ijv = IJVRecvBuffer.begin ();
03472            ijv != IJVRecvBuffer.end (); ++ijv) {
03473         try {
03474           insertGlobalValues (ijv->i, tuple (ijv->j), tuple (ijv->v));
03475         }
              // Rethrow with the location of this call appended to the
              // original message.
03476         catch (const std::runtime_error& e) {
03477           std::ostringstream outmsg;
03478           outmsg << e.what() << std::endl
03479                  << "caught in globalAssemble() in " << __FILE__ << ":" << __LINE__
03480                  << std::endl ;
03481           TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, outmsg.str());
03482         }
03483       }
03484     }
03485 
03486     // WHEW! THAT WAS TIRING!
03487   }
03488 
03489 
03492   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03493   void
03494   CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03495   resumeFill (const RCP<ParameterList> &params)
03496   {
          // Mark the matrix as no longer fill complete.  If the graph is not
          // static, first resume fill on the graph that the matrix owns,
          // forwarding params to it unchanged.
03497     if (! isStaticGraph ()) { // Don't resume fill of a nonowned graph.
03498       myGraph_->resumeFill (params);
03499     }
          // Invalidate cached global quantities before the values can change.
03500     clearGlobalConstants ();
03501     fillComplete_ = false;
03502   }
03503 
03504   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03505   void
03506   CrsMatrix<
03507     Scalar, LocalOrdinal, GlobalOrdinal,
03508     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03509   computeGlobalConstants ()
03510   {
03511     // This method is intentionally empty.  The analogous method in
03512     // CrsGraph does actually compute something.
03513     //
03514     // Oddly enough, clearGlobalConstants() clears frobNorm_ (by
03515     // setting it to -1), but computeGlobalConstants() does _not_
03516     // compute the Frobenius norm; this is done on demand in
03517     // getFrobeniusNorm(), and the result is cached there.
03518   }
03519 
03520   template<class Scalar, class LocalOrdinal, class GlobalOrdinal,
03521            class DeviceType>
03522   void
03523   CrsMatrix<
03524     Scalar, LocalOrdinal, GlobalOrdinal,
03525     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03526   clearGlobalConstants () {
          // Invalidate the cached Frobenius norm (frobNorm_).
          //
03527     // We use -1 to indicate that the Frobenius norm needs to be
03528     // recomputed, since the values might change between now and the
03529     // next fillComplete call.
03530     //
03531     // Oddly enough, clearGlobalConstants() clears frobNorm_, but
03532     // computeGlobalConstants() does _not_ compute the Frobenius norm;
03533     // this is done on demand in getFrobeniusNorm(), and the result is
03534     // cached there.
03535     frobNorm_ = -STM::one ();
03536   }
03537 
03538   template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03539   void
03540   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal,
03541             Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03542   fillComplete (const RCP<ParameterList>& params)
03543   {
03544     // Convenience overload: choose the domain and range Maps, then
03545     // delegate to the three-argument fillComplete.
03546     TEUCHOS_TEST_FOR_EXCEPTION(
03547       getCrsGraph ().is_null (), std::logic_error, "Tpetra::CrsMatrix::"
03548       "fillComplete(params): getCrsGraph() returns null.  "
03549       "This should not happen at this point.  "
03550       "Please report this bug to the Tpetra developers.");
03551     if (! isStaticGraph () || ! getCrsGraph ()->isFillComplete ()) {
03552       fillComplete (getRowMap (), getRowMap (), params); // row Map serves as both
03553       return;
03554     }
03555     // Static graph that is already fill complete: reuse the graph's Maps.
03556     fillComplete (getCrsGraph ()->getDomainMap (), getCrsGraph ()->getRangeMap (), params);
03557   }
03558 
03559   template<class Scalar, class LocalOrdinal, class GlobalOrdinal,
03560            class DeviceType>
03561   void
03562   CrsMatrix<
03563     Scalar, LocalOrdinal, GlobalOrdinal,
03564     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03565   fillComplete (const RCP<const map_type>& domainMap,
03566                 const RCP<const map_type>& rangeMap,
03567                 const RCP<ParameterList>& params)
03568   {
          // Finish the fill phase of the matrix.  In order, this method:
          //   1. reads options from params,
          //   2. allocates values if the graph's indices are not allocated,
          //   3. calls globalAssemble() if needed,
          //   4. checks the Maps (static graph) or finalizes the graph that
          //      the matrix owns,
          //   5. fills the local matrix (and local graph, if owned),
          //   6. sets fillComplete_ and checks the internal state.
          //
          // Parameters read from params, if params is nonnull:
          //   "No Nonlocal Changes" (bool, default false)
          //   "sort column map ghost gids" or, if that is absent,
          //   "Sort column Map ghost GIDs" (bool, default true)
          //
          // Throws std::runtime_error if fill is not active or the matrix
          // is already fill complete; if nonlocal entries exist in a
          // single-process run; or, for a static graph, if domainMap or
          // rangeMap does not match the graph's corresponding Map.
03569     using Teuchos::ArrayRCP;
03570     using Teuchos::RCP;
03571     using Teuchos::rcp;
03572     const char tfecfFuncName[] = "fillComplete";
03573 
03574     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
03575       ! isFillActive () || isFillComplete (),
03576       std::runtime_error, ": Matrix fill state must be active (isFillActive() "
03577       "must be true) before you may call fillComplete().");
03578     const int numProcs = getComm ()->getSize ();
03579 
03580     //
03581     // Read parameters from the input ParameterList.
03582     //
03583 
03584     // If true, the caller promises that no process did nonlocal
03585     // changes since the last call to fillComplete.
03586     bool assertNoNonlocalInserts = false;
03587     // If true, makeColMap sorts remote GIDs (within each remote
03588     // process' group).
03589     bool sortGhosts = true;
03590 
03591     if (! params.is_null ()) {
03592       assertNoNonlocalInserts = params->get ("No Nonlocal Changes",
03593                                              assertNoNonlocalInserts);
            // Two spellings of the same option are accepted; the all
            // lower-case spelling wins if both are present.
03594       if (params->isParameter ("sort column map ghost gids")) {
03595         sortGhosts = params->get ("sort column map ghost gids", sortGhosts);
03596       }
03597       else if (params->isParameter ("Sort column Map ghost GIDs")) {
03598         sortGhosts = params->get ("Sort column Map ghost GIDs", sortGhosts);
03599       }
03600     }
03601     // We also don't need to do global assembly if there is only one
03602     // process in the communicator.
03603     const bool needGlobalAssemble = ! assertNoNonlocalInserts && numProcs > 1;
03604     // This parameter only matters if this matrix owns its graph.
03605     if (! myGraph_.is_null ()) {
03606       myGraph_->sortGhostsAssociatedWithEachProcessor_ = sortGhosts;
03607     }
03608 
          // Allocate storage now if nothing has been allocated yet.
03609     if (! getCrsGraph()->indicesAreAllocated()) {
03610       if (hasColMap ()) {
03611         // We have a column Map, so use local indices.
03612         allocateValues (LocalIndices, GraphNotYetAllocated);
03613       } else {
03614         // We don't have a column Map, so use global indices.
03615         allocateValues (GlobalIndices, GraphNotYetAllocated);
03616       }
03617     }
03618     // Global assemble, if we need to.  This call only costs a single
03619     // all-reduce if we didn't need global assembly after all.
03620     if (needGlobalAssemble) {
03621       globalAssemble ();
03622     }
03623     else {
            // Without global assembly, a single-process run has nowhere to
            // send buffered nonlocal entries, so their presence is an error.
03624       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
03625         numProcs == 1 && nonlocals_.size() > 0,
03626         std::runtime_error, ": cannot have nonlocal entries on a serial run.  "
03627         "An invalid entry (i.e., with row index not in the row Map) must have "
03628         "been submitted to the CrsMatrix.");
03629     }
03630 
03631     if (isStaticGraph ()) {
03632       // FIXME (mfh 18 Jun 2014) This check for correctness of the
03633       // input Maps incurs a penalty of two all-reduces for the
03634       // otherwise optimal const graph case.
03635       //
03636       // We could turn these (max) 2 all-reduces into (max) 1, by
03637       // fusing them.  We could do this by adding a "locallySameAs"
03638       // method to Map, which would return one of four states:
03639       //
03640       //   a. Certainly globally the same
03641       //   b. Certainly globally not the same
03642       //   c. Locally the same
03643       //   d. Locally not the same
03644       //
03645       // The first two states don't require further communication.
03646       // The latter two states require an all-reduce to communicate
03647       // globally, but we only need one all-reduce, since we only need
03648       // to check whether at least one of the Maps is wrong.
03649       const bool domainMapsMatch = staticGraph_->getDomainMap ()->isSameAs (*domainMap);
03650       const bool rangeMapsMatch = staticGraph_->getRangeMap ()->isSameAs (*rangeMap);
03651 
03652       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
03653         ! domainMapsMatch, std::runtime_error,
03654         ": The CrsMatrix's domain Map does not match the graph's domain Map.  "
03655         "The graph cannot be changed because it was given to the CrsMatrix "
03656         "constructor as const.  You can fix this by passing in the graph's "
03657         "domain Map and range Map to the matrix's fillComplete call.");
03658 
03659       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
03660         ! rangeMapsMatch, std::runtime_error,
03661         ": The CrsMatrix's range Map does not match the graph's range Map.  "
03662         "The graph cannot be changed because it was given to the CrsMatrix "
03663         "constructor as const.  You can fix this by passing in the graph's "
03664         "domain Map and range Map to the matrix's fillComplete call.");
03665     }
03666     else {
03667       // Set the graph's domain and range Maps.  This will clear the
03668       // Import if the domain Map has changed (is a different
03669       // pointer), and the Export if the range Map has changed (is a
03670       // different pointer).
03671       myGraph_->setDomainRangeMaps (domainMap, rangeMap);
03672 
03673       // Make the graph's column Map, if necessary.
03674       if (! myGraph_->hasColMap ()) {
03675         myGraph_->makeColMap ();
03676       }
03677 
03678       // Make indices local, if necessary.  The method won't do
03679       // anything if the graph is already locally indexed.
03680       myGraph_->makeIndicesLocal ();
03681 
            // Sort, then merge, the entries of each row (indices together
            // with their values) unless the graph reports this is done.
03682       if (! myGraph_->isSorted ()) {
03683         sortEntries ();
03684       }
03685       if (! myGraph_->isMerged ()) {
03686         mergeRedundantEntries ();
03687       }
03688       // Make the Import and Export, if they haven't been made already.
03689       myGraph_->makeImportExport ();
03690       myGraph_->computeGlobalConstants ();
03691       myGraph_->fillComplete_ = true;
03692       myGraph_->checkInternalState ();
03693     }
03694     computeGlobalConstants ();
03695     // fill local objects; will fill and finalize local graph if appropriate
03696     if (myGraph_.is_null ()) {
03697       // The matrix does _not_ own the graph, and the graph's
03698       // structure is already fixed, so just fill the local matrix.
03699       fillLocalMatrix (params);
03700     } else {
03701       // The matrix _does_ own the graph, so fill the local graph at
03702       // the same time as the local matrix.
03703       fillLocalGraphAndMatrix (params);
03704     }
03705 
03706     // Once we've initialized the sparse kernels, we're done with the
03707     // local objects.  We may now release them and their memory, since
03708     // they will persist in the local sparse ops if necessary.  We
03709     // keep the local graph if the parameters tell us to do so.
03710 
03711     // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
03712 
03713     fillComplete_ = true; // Now we're fill complete!
03714     checkInternalState ();
03715   }
03716 
03717   template <class Scalar, class LocalOrdinal, class GlobalOrdinal,
03718             class DeviceType>
03719   void
03720   CrsMatrix<
03721     Scalar, LocalOrdinal, GlobalOrdinal,
03722     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03723   expertStaticFillComplete (const Teuchos::RCP<const map_type> & domainMap,
03724                             const Teuchos::RCP<const map_type> & rangeMap,
03725                             const Teuchos::RCP<const import_type>& importer,
03726                             const Teuchos::RCP<const export_type>& exporter,
03727                             const Teuchos::RCP<Teuchos::ParameterList> &params)
03728   {
          // Fill-complete variant in which the caller supplies the domain
          // Map, range Map, Import and Export.  All four are handed to the
          // owned graph's expertStaticFillComplete; params is passed to
          // fillLocalGraphAndMatrix.  No global assembly is performed.
          //
          // Throws std::runtime_error if fill is not active or the matrix
          // is already fill complete, and std::logic_error if the matrix
          // does not own its graph (myGraph_ is null).
03729     const char tfecfFuncName[] = "expertStaticFillComplete";
03730     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( ! isFillActive() || isFillComplete(),
03731       std::runtime_error, ": Matrix fill state must be active (isFillActive() "
03732       "must be true) before calling fillComplete().");
03733     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(myGraph_==Teuchos::null, std::logic_error,": myGraph_ is null.  This is not allowed.");
03734 
03735     // We will presume globalAssemble is not needed, so we do the ESFC on the graph
03736     myGraph_->expertStaticFillComplete (domainMap, rangeMap, importer, exporter);
03737 
03738     computeGlobalConstants();
03739 
03740     // Fill the local graph and matrix
03741     fillLocalGraphAndMatrix (params);
03742 
03743     // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
03744 
03745     // Now we're fill complete!
03746     fillComplete_ = true;
03747 
03748     // Sanity checks at the end.
03749 #ifdef HAVE_TPETRA_DEBUG
03750     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillActive(), std::logic_error,
03751       ": We're at the end of fillComplete(), but isFillActive() is true.  "
03752       "Please report this bug to the Tpetra developers.");
          // This message names the condition actually tested here
          // (isFillComplete() being false), not the one tested above.
03753     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(! isFillComplete(), std::logic_error,
03754       ": We're at the end of fillComplete(), but isFillComplete() is false.  "
03755       "Please report this bug to the Tpetra developers.");
03756 #endif // HAVE_TPETRA_DEBUG
03757     checkInternalState();
03758 
03759   }
03760 
03763   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03764   void CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::sortEntries()
03765   {
03766     // Sort every local row (indices together with values); throws std::runtime_error on a static graph.
03767     TEUCHOS_TEST_FOR_EXCEPTION(isStaticGraph() == true, std::runtime_error,
03768         typeName(*this) << "::sortEntries(): cannot sort with static graph.");
03769     if (myGraph_->isSorted ()) { return; } // nothing to do
03770     const size_t lclNumRows = getNodeNumRows (); // loop-invariant, so fetch it once
03771     for (size_t row = 0; row < lclNumRows; ++row) {
03772       RowInfo rowInfo = myGraph_->getRowInfo (row);
03773       myGraph_->template sortRowIndicesAndValues<Scalar> (rowInfo, this->getViewNonConst (rowInfo));
03774     }
03775     myGraph_->indicesAreSorted_ = true; // we just sorted every row
03776   }
03777 
03778 
03781   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03782   void
03783   CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03784   mergeRedundantEntries ()
03785   {
03786     // Merge redundant entries in every local row via the owned graph; throws on a static graph.
03787     TEUCHOS_TEST_FOR_EXCEPTION(isStaticGraph() == true, std::runtime_error,
03788       typeName(*this) << "::mergeRedundantEntries: Cannot merge with static graph.");
03789     if (myGraph_->isMerged ()) { return; } // the graph reports nothing left to merge
03790     const size_t numLclRows = getNodeNumRows ();
03791     for (size_t lclRow = 0; lclRow != numLclRows; ++lclRow) {
03792       RowInfo info = myGraph_->getRowInfo (lclRow);
03793       Teuchos::ArrayView<Scalar> vals = (this->getViewNonConst (info)) ();
03794       myGraph_->template mergeRowIndicesAndValues<Scalar> (info, vals);
03795     }
03796     myGraph_->noRedundancies_ = true; // every row has now been merged
03797   }
03798 
03801   template <class Scalar,
03802             class LocalOrdinal,
03803             class GlobalOrdinal, class DeviceType>
03804   void
03805   CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03806   applyNonTranspose (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal,node_type> & X_in,
03807                      MultiVector<Scalar, LocalOrdinal, GlobalOrdinal,node_type> & Y_in,
03808                      Scalar alpha,
03809                      Scalar beta) const
03810   {
03811     using Teuchos::null;
03812     using Teuchos::RCP;
03813     using Teuchos::rcp;
03814     using Teuchos::rcp_const_cast;
03815     using Teuchos::rcpFromRef;
03816 
03817     // mfh 05 Jun 2014: Special case for alpha == 0.  I added this to
03818     // fix an Ifpack2 test (RILUKSingleProcessUnitTests), which was
03819     // failing only for the Kokkos refactor version of Tpetra.  It's a
03820     // good idea regardless to have the bypass.
03821     if (alpha == STS::zero ()) {
03822       if (beta == STS::zero ()) {
03823         Y_in.putScalar (STS::zero ());
03824       } else if (beta != STS::one ()) {
03825         Y_in.scale (beta);
03826       }
03827       return;
03828     }
03829 
03830     // It's possible that X is a view of Y or vice versa.  We don't
03831     // allow this (apply() requires that X and Y not alias one
03832     // another), but it's helpful to detect and work around this case.
03833     // We don't try to to detect the more subtle cases (e.g., one is a
03834     // subview of the other, but their initial pointers differ).  We
03835     // only need to do this if this matrix's Import is trivial;
03836     // otherwise, we don't actually apply the operator from X into Y.
03837 
03838     RCP<const import_type> importer = this->getGraph ()->getImporter ();
03839     RCP<const export_type> exporter = this->getGraph ()->getExporter ();
03840 
03841     // If beta == 0, then the output MV will be overwritten; none of
03842     // its entries should be read.  (Sparse BLAS semantics say that we
03843     // must ignore any Inf or NaN entries in Y_in, if beta is zero.)
03844     // This matters if we need to do an Export operation; see below.
03845     const bool Y_is_overwritten = (beta == STS::zero());
03846 
03847     // We treat the case of a replicated MV output specially.
03848     const bool Y_is_replicated = ! Y_in.isDistributed ();
03849 
03850     // This is part of the special case for replicated MV output.
03851     // We'll let each process do its thing, but do an all-reduce at
03852     // the end to sum up the results.  Setting beta=0 on all processes
03853     // but Proc 0 makes the math work out for the all-reduce.  (This
03854     // assumes that the replicated data is correctly replicated, so
03855     // that the data are the same on all processes.)
03856     if (Y_is_replicated && this->getComm ()->getRank () > 0) {
03857       beta = STS::zero ();
03858     }
03859 
03860     // Temporary MV for Import operation.  After the block of code
03861     // below, this will be an (Imported if necessary) column Map MV
03862     // ready to give to localMultiply().
03863     RCP<const MV> X_colMap;
03864     if (importer.is_null ()) {
03865       if (! X_in.isConstantStride ()) {
03866         // Not all sparse mat-vec kernels can handle an input MV with
03867         // nonconstant stride correctly, so we have to copy it in that
03868         // case into a constant stride MV.  To make a constant stride
03869         // copy of X_in, we force creation of the column (== domain)
03870         // Map MV (if it hasn't already been created, else fetch the
03871         // cached copy).  This avoids creating a new MV each time.
03872         RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in, true);
03873         Tpetra::deep_copy (*X_colMapNonConst, X_in);
03874         X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
03875       }
03876       else {
03877         // The domain and column Maps are the same, so do the local
03878         // multiply using the domain Map input MV X_in.
03879         X_colMap = rcpFromRef (X_in);
03880       }
03881     }
03882     else {
03883       // We're doing an Import anyway, which will copy the relevant
03884       // elements of the domain Map MV X_in into a separate column Map
03885       // MV.  Thus, we don't have to worry whether X_in is constant
03886       // stride.
03887       RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in);
03888 
03889       // Import from the domain Map MV to the column Map MV.
03890       X_colMapNonConst->doImport (X_in, *importer, INSERT);
03891       X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
03892     }
03893 
03894     // Temporary MV for Export operation, or for copying a nonconstant
03895     // stride output MV into a constant stride MV.
03896     RCP<MV> Y_rowMap = getRowMapMultiVector (Y_in);
03897 
03898     // If we have a nontrivial Export object, we must perform an
03899     // Export.  In that case, the local multiply result will go into
03900     // the row Map multivector.  We don't have to make a
03901     // constant-stride version of Y_in in this case, because we had to
03902     // make a constant stride Y_rowMap MV and do an Export anyway.
03903     if (! exporter.is_null ()) {
03904       this->template localMultiply<Scalar, Scalar> (*X_colMap, *Y_rowMap,
03905                                                     Teuchos::NO_TRANS,
03906                                                     alpha, STS::zero ());
03907       // If we're overwriting the output MV Y_in completely (beta ==
03908       // 0), then make sure that it is filled with zeros before we do
03909       // the Export.  Otherwise, the ADD combine mode will use data in
03910       // Y_in, which is supposed to be zero.
03911       if (Y_is_overwritten) {
03912         Y_in.putScalar (STS::zero ());
03913       }
03914       else {
03915         // Scale the output MV by beta, so that the Export sums in the
03916         // mat-vec contribution: Y_in = beta*Y_in + alpha*A*X_in.
03917         Y_in.scale (beta);
03918       }
03919       // Do the Export operation.
03920       Y_in.doExport (*Y_rowMap, *exporter, ADD);
03921     }
03922     else { // Don't do an Export: row Map and range Map are the same.
03923       //
03924       // If Y_in does not have constant stride, or if the column Map
03925       // MV aliases Y_in, then we can't let the kernel write directly
03926       // to Y_in.  Instead, we have to use the cached row (== range)
03927       // Map MV as temporary storage.
03928       //
03929       // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
03930       // the user passed in the same MultiVector for both X and Y.  It
03931       // won't detect whether one MultiVector views the other.  We
03932       // should also check the MultiVectors' raw data pointers.
03933       if (! Y_in.isConstantStride () || X_colMap.getRawPtr () == &Y_in) {
03934         // Force creating the MV if it hasn't been created already.
03935         // This will reuse a previously created cached MV.
03936         Y_rowMap = getRowMapMultiVector (Y_in, true);
03937 
03938         // If beta == 0, we don't need to copy Y_in into Y_rowMap,
03939         // since we're overwriting it anyway.
03940         if (beta != STS::zero ()) {
03941           Tpetra::deep_copy (*Y_rowMap, Y_in);
03942         }
03943         this->template localMultiply<Scalar, Scalar> (*X_colMap,
03944                                                       *Y_rowMap,
03945                                                       Teuchos::NO_TRANS,
03946                                                       alpha, beta);
03947         Tpetra::deep_copy (Y_in, *Y_rowMap);
03948       }
03949       else {
03950         this->template localMultiply<Scalar, Scalar> (*X_colMap, Y_in,
03951                                                       Teuchos::NO_TRANS,
03952                                                       alpha, beta);
03953       }
03954     }
03955 
03956     // If the range Map is a locally replicated Map, sum up
03957     // contributions from each process.  We set beta = 0 on all
03958     // processes but Proc 0 initially, so this will handle the scaling
03959     // factor beta correctly.
03960     if (Y_is_replicated) {
03961       Y_in.reduce ();
03962     }
03963   }
03964 
03965   template <class Scalar,
03966             class LocalOrdinal,
03967             class GlobalOrdinal, class DeviceType>
03968   void
03969   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03970   applyTranspose (const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,node_type>& X_in,
03971                   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,node_type>& Y_in,
03972                   const Teuchos::ETransp mode,
03973                   Scalar alpha,
03974                   Scalar beta) const
03975   {
03976     using Teuchos::null;
03977     using Teuchos::RCP;
03978     using Teuchos::rcp;
03979     using Teuchos::rcp_const_cast;
03980     using Teuchos::rcpFromRef;
03981 
03982     // Take shortcuts for alpha == 0.
03983     if (alpha == STS::zero ()) {
03984       // Follow the Sparse BLAS convention by ignoring both the matrix
03985       // and X_in, in this case.
03986       if (beta == STS::zero ()) {
03987         // Follow the Sparse BLAS convention by overwriting any Inf or
03988         // NaN values in Y_in, in this case.
03989         Y_in.putScalar (STS::zero ());
03990       }
03991       else {
03992         Y_in.scale (beta);
03993       }
03994       return;
03995     }
03996 
03997     const size_t numVectors = X_in.getNumVectors ();
03998 
03999     // We don't allow X_in and Y_in to alias one another.  It's hard
04000     // to check this, because advanced users could create views from
04001     // raw pointers.  However, if X_in and Y_in reference the same
04002     // object, we will do the user a favor by copying X into new
04003     // storage (with a warning).  We only need to do this if we have
04004     // trivial importers; otherwise, we don't actually apply the
04005     // operator from X into Y.
04006     RCP<const import_type> importer = this->getGraph ()->getImporter ();
04007     RCP<const export_type> exporter = this->getGraph ()->getExporter ();
04008     // access X indirectly, in case we need to create temporary storage
04009     RCP<const MV> X;
04010 
04011     // some parameters for below
04012     const bool Y_is_replicated = ! Y_in.isDistributed ();
04013     const bool Y_is_overwritten = (beta == STS::zero ());
04014     if (Y_is_replicated && this->getComm ()->getRank () > 0) {
04015       beta = STS::zero ();
04016     }
04017 
04018     // The kernels do not allow input or output with nonconstant stride.
04019     if (! X_in.isConstantStride () && importer.is_null ()) {
04020       X = rcp (new MV (X_in)); // Constant-stride copy of X_in
04021     } else {
04022       X = rcpFromRef (X_in); // Reference to X_in
04023     }
04024 
04025     // Set up temporary multivectors for Import and/or Export.
04026     if (importer != null) {
04027       if (importMV_ != null && importMV_->getNumVectors() != numVectors) {
04028         importMV_ = null;
04029       }
04030       if (importMV_ == null) {
04031         importMV_ = rcp (new MV (this->getColMap (), numVectors));
04032       }
04033     }
04034     if (exporter != null) {
04035       if (exportMV_ != null && exportMV_->getNumVectors() != numVectors) {
04036         exportMV_ = null;
04037       }
04038       if (exportMV_ == null) {
04039         exportMV_ = rcp (new MV (this->getRowMap (), numVectors));
04040       }
04041     }
04042 
04043     // If we have a non-trivial exporter, we must import elements that
04044     // are permuted or are on other processors.
04045     if (! exporter.is_null ()) {
04046       exportMV_->doImport (X_in, *exporter, INSERT);
04047       X = exportMV_; // multiply out of exportMV_
04048     }
04049 
04050     // If we have a non-trivial importer, we must export elements that
04051     // are permuted or belong to other processors.  We will compute
04052     // solution into the to-be-exported MV; get a view.
04053     if (importer != null) {
04054       // Do the local computation.
04055       this->template localMultiply<Scalar, Scalar> (*X, *importMV_, mode, alpha, STS::zero ());
04056       if (Y_is_overwritten) {
04057         Y_in.putScalar (STS::zero ());
04058       } else {
04059         Y_in.scale (beta);
04060       }
04061       Y_in.doExport(*importMV_,*importer,ADD);
04062     }
04063     // otherwise, multiply into Y
04064     else {
04065       // can't multiply in-situ; can't multiply into non-strided multivector
04066       //
04067       // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
04068       // the user passed in the same MultiVector for both X and Y.  It
04069       // won't detect whether one MultiVector views the other.  We
04070       // should also check the MultiVectors' raw data pointers.
04071       if (! Y_in.isConstantStride () || X.getRawPtr () == &Y_in) {
04072         // Make a deep copy of Y_in, into which to write the multiply result.
04073         MV Y (Y_in, Teuchos::Copy);
04074         this->template localMultiply<Scalar, Scalar> (*X, Y, mode, alpha, beta);
04075         Tpetra::deep_copy (Y_in, Y);
04076       } else {
04077         this->template localMultiply<Scalar, Scalar> (*X, Y_in, mode, alpha, beta);
04078       }
04079     }
04080 
04081     // If the range Map is a locally replicated map, sum the
04082     // contributions from each process.  (That's why we set beta=0
04083     // above for all processes but Proc 0.)
04084     if (Y_is_replicated) {
04085       Y_in.reduce ();
04086     }
04087   }
04088 
04091   template <class Scalar,
04092             class LocalOrdinal,
04093             class GlobalOrdinal, class DeviceType>
04094   void
04095   CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
04096   apply (const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,node_type> &X,
04097          MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,node_type> &Y,
04098          Teuchos::ETransp mode,
04099          Scalar alpha,
04100          Scalar beta) const
04101   {
04102     TEUCHOS_TEST_FOR_EXCEPTION(
04103       ! isFillComplete (), std::runtime_error,
04104       "Tpetra::CrsMatrix::apply(): Cannot call apply() until fillComplete() "
04105       "has been called.");
04106     if (mode == Teuchos::NO_TRANS) {
04107       applyNonTranspose (X, Y, alpha, beta);
04108     } else {
04109       applyTranspose (X, Y, mode, alpha, beta);
04110     }
04111   }
04112 
04115   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
04116   void
04117   CrsMatrix<
04118     Scalar, LocalOrdinal, GlobalOrdinal,
04119     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
04120   gaussSeidel (const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,node_type>& B,
04121                MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,node_type>& X,
04122                const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,node_type>& D,
04123                const Scalar& dampingFactor,
04124                const ESweepDirection direction,
04125                const int numSweeps) const
04126   {
04127     reorderedGaussSeidel(B,X,D,Teuchos::null,dampingFactor,direction,numSweeps);
04128   }
04129 
04130   template<class Scalar, class LocalOrdinal, class GlobalOrdinal,
04131            class DeviceType>
04132   void
04133   CrsMatrix<
04134     Scalar, LocalOrdinal, GlobalOrdinal,
04135     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
04136   reorderedGaussSeidel (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, node_type>& B,
04137                         MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, node_type>& X,
04138                         const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, node_type>& D,
04139                         const Teuchos::ArrayView<LocalOrdinal>& rowIndices,
04140                         const Scalar& dampingFactor,
04141                         const ESweepDirection direction,
04142                         const int numSweeps) const
04143   {
04144     using Teuchos::null;
04145     using Teuchos::RCP;
04146     using Teuchos::rcp;
04147     using Teuchos::rcpFromRef;
04148     using Teuchos::rcp_const_cast;
04149     typedef Scalar ST;
04150 
04151     TEUCHOS_TEST_FOR_EXCEPTION(
04152       isFillComplete() == false, std::runtime_error,
04153       "Tpetra::CrsMatrix::gaussSeidel: cannot call this method until "
04154       "fillComplete() has been called.");
04155     TEUCHOS_TEST_FOR_EXCEPTION(
04156       numSweeps < 0,
04157       std::invalid_argument,
04158       "Tpetra::CrsMatrix::gaussSeidel: The number of sweeps must be , "
04159       "nonnegative but you provided numSweeps = " << numSweeps << " < 0.");
04160 
04161     // Translate from global to local sweep direction.
04162     // While doing this, validate the input.
04163     KokkosClassic::ESweepDirection localDirection;
04164     if (direction == Forward) {
04165       localDirection = KokkosClassic::Forward;
04166     }
04167     else if (direction == Backward) {
04168       localDirection = KokkosClassic::Backward;
04169     }
04170     else if (direction == Symmetric) {
04171       // We'll control local sweep direction manually.
04172       localDirection = KokkosClassic::Forward;
04173     }
04174     else {
04175       TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument,
04176         "Tpetra::CrsMatrix::gaussSeidel: The 'direction' enum does not have "
04177         "any of its valid values: Forward, Backward, or Symmetric.");
04178     }
04179 
04180     if (numSweeps == 0) {
04181       return; // Nothing to do.
04182     }
04183 
04184     // We don't need the Export object because this method assumes
04185     // that the row, domain, and range Maps are the same.  We do need
04186     // the Import object, if there is one, though.
04187     RCP<const import_type> importer = this->getGraph()->getImporter();
04188     RCP<const export_type> exporter = this->getGraph()->getExporter();
04189     TEUCHOS_TEST_FOR_EXCEPTION(
04190       ! exporter.is_null (), std::runtime_error,
04191       "Tpetra's gaussSeidel implementation requires that the row, domain, "
04192       "and range Maps be the same.  This cannot be the case, because the "
04193       "matrix has a nontrivial Export object.");
04194 
04195     RCP<const map_type> domainMap = this->getDomainMap ();
04196     RCP<const map_type> rangeMap = this->getRangeMap ();
04197     RCP<const map_type> rowMap = this->getGraph ()->getRowMap ();
04198     RCP<const map_type> colMap = this->getGraph ()->getColMap ();
04199 
04200 #ifdef HAVE_TEUCHOS_DEBUG
04201     {
04202       // The relation 'isSameAs' is transitive.  It's also a
04203       // collective, so we don't have to do a "shared" test for
04204       // exception (i.e., a global reduction on the test value).
04205       TEUCHOS_TEST_FOR_EXCEPTION(
04206         ! X.getMap ()->isSameAs (*domainMap),
04207         std::runtime_error,
04208         "Tpetra::CrsMatrix::gaussSeidel requires that the input "
04209         "multivector X be in the domain Map of the matrix.");
04210       TEUCHOS_TEST_FOR_EXCEPTION(
04211         ! B.getMap ()->isSameAs (*rangeMap),
04212         std::runtime_error,
04213         "Tpetra::CrsMatrix::gaussSeidel requires that the input "
04214         "B be in the range Map of the matrix.");
04215       TEUCHOS_TEST_FOR_EXCEPTION(
04216         ! D.getMap ()->isSameAs (*rowMap),
04217         std::runtime_error,
04218         "Tpetra::CrsMatrix::gaussSeidel requires that the input "
04219         "D be in the row Map of the matrix.");
04220       TEUCHOS_TEST_FOR_EXCEPTION(
04221         ! rowMap->isSameAs (*rangeMap),
04222         std::runtime_error,
04223         "Tpetra::CrsMatrix::gaussSeidel requires that the row Map and the "
04224         "range Map be the same (in the sense of Tpetra::Map::isSameAs).");
04225       TEUCHOS_TEST_FOR_EXCEPTION(
04226         ! domainMap->isSameAs (*rangeMap),
04227         std::runtime_error,
04228         "Tpetra::CrsMatrix::gaussSeidel requires that the domain Map and "
04229         "the range Map of the matrix be the same.");
04230     }
04231 #else
04232     // Forestall any compiler warnings for unused variables.
04233     (void) rangeMap;
04234     (void) rowMap;
04235 #endif // HAVE_TEUCHOS_DEBUG
04236 
04237     // If B is not constant stride, copy it into a constant stride
04238     // multivector.  We'l handle the right-hand side B first and deal
04239     // with X right before the sweeps, to improve locality of the
04240     // first sweep.  (If the problem is small enough, then that will
04241     // hopefully keep more of the entries of X in cache.  This
04242     // optimizes for the typical case of a small number of sweeps.)
04243     RCP<const MV> B_in;
04244     if (B.isConstantStride()) {
04245       B_in = rcpFromRef (B);
04246     }
04247     else {
04248       // The range Map and row Map are the same in this case, so we
04249       // can use the (possibly cached) row Map multivector to store a
04250       // constant stride copy of B.  We don't have to copy back, since
04251       // Gauss-Seidel won't modify B.
04252       RCP<MV> B_in_nonconst = getRowMapMultiVector (B, true);
04253       *B_in_nonconst = B; // Copy from B into B_in(_nonconst).
04254       B_in = rcp_const_cast<const MV> (B_in_nonconst);
04255 
04256       TPETRA_EFFICIENCY_WARNING(
04257         ! B.isConstantStride (),
04258         std::runtime_error,
04259         "gaussSeidel: The current implementation of the Gauss-Seidel kernel "
04260         "requires that X and B both have constant stride.  Since B does not "
04261         "have constant stride, we had to make a copy.  This is a limitation of "
04262         "the current implementation and not your fault, but we still report it "
04263         "as an efficiency warning for your information.");
04264     }
04265 
04266     // If X is not constant stride, copy it into a constant stride
04267     // multivector.  Also, make the column Map multivector X_colMap,
04268     // and its domain Map view X_domainMap.  (X actually must be a
04269     // domain Map view of a column Map multivector; exploit this, if X
04270     // has constant stride.)
04271 
04272     RCP<MV> X_domainMap;
04273     RCP<MV> X_colMap;
04274     bool copiedInput = false;
04275 
04276     if (importer.is_null ()) { // Domain and column Maps are the same.
04277       if (X.isConstantStride ()) {
04278         X_domainMap = rcpFromRef (X);
04279         X_colMap = X_domainMap;
04280         copiedInput = false;
04281       }
04282       else {
04283         // Get a temporary column Map multivector, make a domain Map
04284         // view of it, and copy X into the domain Map view.  We have
04285         // to copy here because we won't be doing Import operations.
04286         X_colMap = getColumnMapMultiVector (X, true);
04287         X_domainMap = X_colMap; // Domain and column Maps are the same.
04288         deep_copy(*X_domainMap, X); // Copy X into the domain Map view.
04289         copiedInput = true;
04290         TPETRA_EFFICIENCY_WARNING(
04291           ! X.isConstantStride (), std::runtime_error,
04292           "Tpetra::CrsMatrix::gaussSeidel: The current implementation of the "
04293           "Gauss-Seidel kernel requires that X and B both have constant "
04294           "stride.  Since X does not have constant stride, we had to make a "
04295           "copy.  This is a limitation of the current implementation and not "
04296           "your fault, but we still report it as an efficiency warning for "
04297           "your information.");
04298       }
04299     }
04300     else { // We will be doing Import operations in the sweeps.
04301       if (X.isConstantStride ()) {
04302         X_domainMap = rcpFromRef (X);
04303         // This kernel assumes that X is a domain Map view of a column
04304         // Map multivector.  We will only check if this is valid if
04305         // the CMake configure Teuchos_ENABLE_DEBUG is ON.
04306         X_colMap = X_domainMap->offsetViewNonConst (colMap, 0);
04307 
04308         // FIXME (mfh 19 Mar 2013) Do we need to fill the remote
04309         // entries of X_colMap with zeros?  Do we need to fill all of
04310         // X_domainMap initially with zeros?  Ifpack
04311         // (Ifpack_PointRelaxation.cpp, line 906) creates an entirely
04312         // new MultiVector each time.
04313 
04314         // Do the first Import for the first sweep.  This simplifies
04315         // the logic in the sweeps.
04316         X_colMap->doImport (X, *importer, INSERT);
04317         copiedInput = false;
04318       }
04319       else {
04320         // Get a temporary column Map multivector X_colMap, and make a
04321         // domain Map view X_domainMap of it.  Instead of copying, we
04322         // do an Import from X into X_domainMap.  This saves us a
04323         // copy, since the Import has to copy the data anyway.
04324         X_colMap = getColumnMapMultiVector (X, true);
04325         X_domainMap = X_colMap->offsetViewNonConst (domainMap, 0);
04326         X_colMap->doImport (X, *importer, INSERT);
04327         copiedInput = true;
04328         TPETRA_EFFICIENCY_WARNING(
04329           ! X.isConstantStride (), std::runtime_error,
04330           "Tpetra::CrsMatrix::gaussSeidel: The current implementation of the "
04331           "Gauss-Seidel kernel requires that X and B both have constant stride.  "
04332           "Since X does not have constant stride, we had to make a copy.  "
04333           "This is a limitation of the current implementation and not your fault, "
04334           "but we still report it as an efficiency warning for your information.");
04335       }
04336     }
04337 
04338     for (int sweep = 0; sweep < numSweeps; ++sweep) {
04339       if (! importer.is_null () && sweep > 0) {
04340         // We already did the first Import for the zeroth sweep.
04341         X_colMap->doImport (*X_domainMap, *importer, INSERT);
04342       }
04343 
04344       // Do local Gauss-Seidel.
04345       if (direction != Symmetric) {
04346         if(rowIndices.is_null())
04347           this->template localGaussSeidel<ST, ST> (*B_in, *X_colMap, D,
04348                                                    dampingFactor,
04349                                                    localDirection);
04350         else
04351           this->template reorderedLocalGaussSeidel<ST, ST> (*B_in, *X_colMap, D, rowIndices,
04352                                                             dampingFactor,
04353                                                             localDirection);
04354       } else { // direction == Symmetri
04355         const bool doImportBetweenDirections = false;
04356         if(rowIndices.is_null()) {
04357           this->template localGaussSeidel<ST, ST> (*B_in, *X_colMap, D,
04358                                                    dampingFactor,
04359                                                    KokkosClassic::Forward);
04360           // mfh 18 Mar 2013: Aztec's implementation of "symmetric
04361           // Gauss-Seidel" does _not_ do an Import between the forward
04362           // and backward sweeps.  This makes sense, because Aztec
04363           // considers "symmetric Gauss-Seidel" a subdomain solver.
04364           if (doImportBetweenDirections) {
04365             // Communicate again before the Backward sweep.
04366             if (! importer.is_null ()) {
04367               X_colMap->doImport (*X_domainMap, *importer, INSERT);
04368             }
04369           }
04370           this->template localGaussSeidel<ST, ST> (*B_in, *X_colMap, D,
04371                                                    dampingFactor,
04372                                                    KokkosClassic::Backward);
04373         }
04374         else {
04375           this->template reorderedLocalGaussSeidel<ST, ST> (*B_in, *X_colMap, D, rowIndices,
04376                                                    dampingFactor,
04377                                                    KokkosClassic::Forward);
04378           if (doImportBetweenDirections) {
04379             // Communicate again before the Backward sweep.
04380             if (! importer.is_null ()) {
04381               X_colMap->doImport (*X_domainMap, *importer, INSERT);
04382             }
04383           }
04384           this->template reorderedLocalGaussSeidel<ST, ST> (*B_in, *X_colMap, D, rowIndices,
04385                                                             dampingFactor,
04386                                                             KokkosClassic::Backward);
04387         }
04388       }
04389     }
04390 
04391     if (copiedInput) {
04392       deep_copy(X, *X_domainMap); // Copy back from X_domainMap to X.
04393     }
04394   }
04395 
04396   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
04397   void
04398   CrsMatrix<
04399     Scalar, LocalOrdinal, GlobalOrdinal,
04400     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
04401   gaussSeidelCopy (MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,node_type>& X,
04402                    const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,node_type>& B,
04403                    const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,node_type>& D,
04404                    const Scalar& dampingFactor,
04405                    const ESweepDirection direction,
04406                    const int numSweeps,
04407                    const bool zeroInitialGuess) const
04408   {
04409     reorderedGaussSeidelCopy(X,B,D,Teuchos::null,dampingFactor,direction,numSweeps,zeroInitialGuess);
04410   }
04411 
04412   template<class Scalar, class LocalOrdinal, class GlobalOrdinal,
04413            class DeviceType>
04414   void
04415   CrsMatrix<
04416     Scalar, LocalOrdinal, GlobalOrdinal,
04417     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
04418   reorderedGaussSeidelCopy (MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,node_type>& X,
04419                             const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,node_type>& B,
04420                             const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,node_type>& D,
04421                             const Teuchos::ArrayView<LocalOrdinal>& rowIndices,
04422                             const Scalar& dampingFactor,
04423                             const ESweepDirection direction,
04424                             const int numSweeps,
04425                             const bool zeroInitialGuess) const
04426   {
04427     using Teuchos::null;
04428     using Teuchos::RCP;
04429     using Teuchos::rcp;
04430     using Teuchos::rcpFromRef;
04431     using Teuchos::rcp_const_cast;
04432     typedef Scalar ST;
04433     TEUCHOS_TEST_FOR_EXCEPTION(
04434       isFillComplete() == false, std::runtime_error,
04435       "Tpetra::CrsMatrix::gaussSeidelCopy: cannot call this method until "
04436       "fillComplete() has been called.");
04437     TEUCHOS_TEST_FOR_EXCEPTION(
04438       numSweeps < 0,
04439       std::invalid_argument,
04440       "gaussSeidelCopy: The number of sweeps must be nonnegative, "
04441       "but you provided numSweeps = " << numSweeps << " < 0.");
04442 
04443     // Translate from global to local sweep direction.
04444     // While doing this, validate the input.
04445     KokkosClassic::ESweepDirection localDirection;
04446     if (direction == Forward) {
04447       localDirection = KokkosClassic::Forward;
04448     }
04449     else if (direction == Backward) {
04450       localDirection = KokkosClassic::Backward;
04451     }
04452     else if (direction == Symmetric) {
04453       // We'll control local sweep direction manually.
04454       localDirection = KokkosClassic::Forward;
04455     }
04456     else {
04457       TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument,
04458         "gaussSeidelCopy: The 'direction' enum does not have any of its "
04459         "valid values: Forward, Backward, or Symmetric.");
04460     }
04461 
04462     if (numSweeps == 0) {
04463       return;
04464     }
04465 
04466     RCP<const import_type> importer = this->getGraph()->getImporter();
04467     RCP<const export_type> exporter = this->getGraph()->getExporter();
04468     TEUCHOS_TEST_FOR_EXCEPTION(
04469       ! exporter.is_null (),
04470       std::runtime_error,
04471       "Tpetra's gaussSeidelCopy implementation requires that the row, domain, "
04472       "and range Maps be the same.  This cannot be the case, because the "
04473       "matrix has a nontrivial Export object.");
04474 
04475     RCP<const map_type> domainMap = this->getDomainMap ();
04476     RCP<const map_type> rangeMap = this->getRangeMap ();
04477     RCP<const map_type> rowMap = this->getGraph ()->getRowMap ();
04478     RCP<const map_type> colMap = this->getGraph ()->getColMap ();
04479 
04480 #ifdef HAVE_TEUCHOS_DEBUG
04481     {
04482       // The relation 'isSameAs' is transitive.  It's also a
04483       // collective, so we don't have to do a "shared" test for
04484       // exception (i.e., a global reduction on the test value).
04485       TEUCHOS_TEST_FOR_EXCEPTION(
04486         ! X.getMap ()->isSameAs (*domainMap),
04487         std::runtime_error,
04488         "Tpetra::CrsMatrix::gaussSeidelCopy requires that the input "
04489         "multivector X be in the domain Map of the matrix.");
04490       TEUCHOS_TEST_FOR_EXCEPTION(
04491         ! B.getMap ()->isSameAs (*rangeMap),
04492         std::runtime_error,
04493         "Tpetra::CrsMatrix::gaussSeidelCopy requires that the input "
04494         "B be in the range Map of the matrix.");
04495       TEUCHOS_TEST_FOR_EXCEPTION(
04496         ! D.getMap ()->isSameAs (*rowMap),
04497         std::runtime_error,
04498         "Tpetra::CrsMatrix::gaussSeidelCopy requires that the input "
04499         "D be in the row Map of the matrix.");
04500       TEUCHOS_TEST_FOR_EXCEPTION(
04501         ! rowMap->isSameAs (*rangeMap),
04502         std::runtime_error,
04503         "Tpetra::CrsMatrix::gaussSeidelCopy requires that the row Map and the "
04504         "range Map be the same (in the sense of Tpetra::Map::isSameAs).");
04505       TEUCHOS_TEST_FOR_EXCEPTION(
04506         ! domainMap->isSameAs (*rangeMap),
04507         std::runtime_error,
04508         "Tpetra::CrsMatrix::gaussSeidelCopy requires that the domain Map and "
04509         "the range Map of the matrix be the same.");
04510     }
04511 #else
04512     // Forestall any compiler warnings for unused variables.
04513     (void) rangeMap;
04514     (void) rowMap;
04515 #endif // HAVE_TEUCHOS_DEBUG
04516 
04517     // Fetch a (possibly cached) temporary column Map multivector
04518     // X_colMap, and a domain Map view X_domainMap of it.  Both have
04519     // constant stride by construction.  We know that the domain Map
04520     // must include the column Map, because our Gauss-Seidel kernel
04521     // requires that the row Map, domain Map, and range Map are all
04522     // the same, and that each process owns all of its own diagonal
04523     // entries of the matrix.
04524 
04525     RCP<MV> X_colMap;
04526     RCP<MV> X_domainMap;
04527     bool copyBackOutput = false;
04528     if (importer.is_null ()) {
04529       if (X.isConstantStride ()) {
04530         X_colMap = rcpFromRef (X);
04531         X_domainMap = rcpFromRef (X);
04532         // Column Map and domain Map are the same, so there are no
04533         // remote entries.  Thus, if we are not setting the initial
04534         // guess to zero, we don't have to worry about setting remote
04535         // entries to zero, even though we are not doing an Import in
04536         // this case.
04537         if (zeroInitialGuess) {
04538           X_colMap->putScalar (STS::zero ());
04539         }
04540         // No need to copy back to X at end.
04541       }
04542       else { // We must copy X into a constant stride multivector.
04543         // Just use the cached column Map multivector for that.
04544         // force=true means fill with zeros, so no need to fill
04545         // remote entries (not in domain Map) with zeros.
04546         X_colMap = getColumnMapMultiVector (X, true);
04547         // X_domainMap is always a domain Map view of the column Map
04548         // multivector.  In this case, the domain and column Maps are
04549         // the same, so X_domainMap _is_ X_colMap.
04550         X_domainMap = X_colMap;
04551         if (! zeroInitialGuess) { // Don't copy if zero initial guess
04552 
04553           try {
04554             deep_copy(*X_domainMap , X); // Copy X into constant stride multivector
04555           } catch (std::exception& e) {
04556             std::ostringstream os;
04557             os << "Tpetra::CrsMatrix::reorderedGaussSeidelCopy: "
04558               "deep_copy(*X_domainMap, X) threw an exception: "
04559                << e.what () << ".";
04560             TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, e.what ());
04561           }
04562         }
04563         copyBackOutput = true; // Don't forget to copy back at end.
04564         TPETRA_EFFICIENCY_WARNING(
04565           ! X.isConstantStride (),
04566           std::runtime_error,
04567           "gaussSeidelCopy: The current implementation of the Gauss-Seidel "
04568           "kernel requires that X and B both have constant stride.  Since X "
04569           "does not have constant stride, we had to make a copy.  This is a "
04570           "limitation of the current implementation and not your fault, but we "
04571           "still report it as an efficiency warning for your information.");
04572       }
04573     }
04574     else { // Column Map and domain Map are _not_ the same.
04575       X_colMap = getColumnMapMultiVector (X);
04576       X_domainMap = X_colMap->offsetViewNonConst (domainMap, 0);
04577 
04578 #ifdef HAVE_TPETRA_DEBUG
04579       typename MV::dual_view_type X_colMap_view = X_colMap->getDualView ();
04580       typename MV::dual_view_type X_domainMap_view = X_domainMap->getDualView ();
04581 
04582       if (X_colMap->getLocalLength () != 0 && X_domainMap->getLocalLength ()) {
04583         TEUCHOS_TEST_FOR_EXCEPTION(
04584           X_colMap_view.h_view.ptr_on_device () != X_domainMap_view.h_view.ptr_on_device (),
04585           std::logic_error, "Tpetra::CrsMatrix::gaussSeidelCopy: "
04586           "Pointer to start of column Map view of X is not equal to pointer to "
04587           "start of (domain Map view of) X.  This may mean that "
04588           "Tpetra::MultiVector::offsetViewNonConst is broken.  "
04589           "Please report this bug to the Tpetra developers.");
04590       }
04591 
04592       TEUCHOS_TEST_FOR_EXCEPTION(
04593         X_colMap_view.dimension_0 () < X_domainMap_view.dimension_0 () ||
04594         X_colMap->getLocalLength () < X_domainMap->getLocalLength (),
04595         std::logic_error, "Tpetra::CrsMatrix::gaussSeidelCopy: "
04596         "X_colMap has fewer local rows than X_domainMap.  "
04597         "X_colMap_view.dimension_0() = " << X_colMap_view.dimension_0 ()
04598         << ", X_domainMap_view.dimension_0() = "
04599         << X_domainMap_view.dimension_0 ()
04600         << ", X_colMap->getLocalLength() = " << X_colMap->getLocalLength ()
04601         << ", and X_domainMap->getLocalLength() = "
04602         << X_domainMap->getLocalLength ()
04603         << ".  This means that Tpetra::MultiVector::offsetViewNonConst "
04604         "is broken.  Please report this bug to the Tpetra developers.");
04605 
04606       TEUCHOS_TEST_FOR_EXCEPTION(
04607         X_colMap->getNumVectors () != X_domainMap->getNumVectors (),
04608         std::logic_error, "Tpetra::CrsMatrix::gaussSeidelCopy: "
04609         "X_colMap has a different number of columns than X_domainMap.  "
04610         "X_colMap->getNumVectors() = " << X_colMap->getNumVectors ()
04611         << " != X_domainMap->getNumVectors() = "
04612         << X_domainMap->getNumVectors ()
04613         << ".  This means that Tpetra::MultiVector::offsetViewNonConst "
04614         "is broken.  Please report this bug to the Tpetra developers.");
04615 
04616       // TEUCHOS_TEST_FOR_EXCEPTION(
04617       //   X_colMap->getLocalMV ().getStride () !=
04618       //   X_domainMap->getLocalMV ().getStride (),
04619       //   std::logic_error,
04620       //   "Tpetra::CrsMatrix::gaussSeidelCopy: "
04621       //   "X_colMap has local stride " << X_colMap->getLocalMV ().getStride ()
04622       //   << ", which does not equal the local stride "
04623       //   << X_domainMap->getLocalMV ().getStride () << " of X_domainMap.  "
04624       //   "This means that Tpetra::MultiVector::offsetViewNonConst is broken.  "
04625       //   "Please report this bug to the Tpetra developers.");
04626 #endif // HAVE_TPETRA_DEBUG
04627 
04628       if (zeroInitialGuess) {
04629         // No need for an Import, since we're filling with zeros.
04630         X_colMap->putScalar (STS::zero ());
04631       } else {
04632         // We could just copy X into X_domainMap.  However, that
04633         // wastes a copy, because the Import also does a copy (plus
04634         // communication).  Since the typical use case for
04635         // Gauss-Seidel is a small number of sweeps (2 is typical), we
04636         // don't want to waste that copy.  Thus, we do the Import
04637         // here, and skip the first Import in the first sweep.
04638         // Importing directly from X effects the copy into X_domainMap
04639         // (which is a view of X_colMap).
04640         X_colMap->doImport (X, *importer, INSERT);
04641       }
04642       copyBackOutput = true; // Don't forget to copy back at end.
04643     } // if column and domain Maps are (not) the same
04644 
04645     // The Gauss-Seidel / SOR kernel expects multivectors of constant
04646     // stride.  X_colMap is by construction, but B might not be.  If
04647     // it's not, we have to make a copy.
04648     RCP<const MV> B_in;
04649     if (B.isConstantStride ()) {
04650       B_in = rcpFromRef (B);
04651     }
04652     else {
04653       // Range Map and row Map are the same in this case, so we can
04654       // use the cached row Map multivector to store a constant stride
04655       // copy of B.
04656       RCP<MV> B_in_nonconst = getRowMapMultiVector (B, true);
04657 
04658       try {
04659         deep_copy(*B_in_nonconst, B);
04660       } catch (std::exception& e) {
04661         std::ostringstream os;
04662         os << "Tpetra::CrsMatrix::reorderedGaussSeidelCopy: "
04663           "deep_copy(*B_in_nonconst, B) threw an exception: "
04664            << e.what () << ".";
04665         TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, e.what ());
04666       }
04667       B_in = rcp_const_cast<const MV> (B_in_nonconst);
04668 
04669       TPETRA_EFFICIENCY_WARNING(
04670         ! B.isConstantStride (),
04671         std::runtime_error,
04672         "gaussSeidelCopy: The current implementation requires that B have "
04673         "constant stride.  Since B does not have constant stride, we had to "
04674         "copy it into a separate constant-stride multivector.  This is a "
04675         "limitation of the current implementation and not your fault, but we "
04676         "still report it as an efficiency warning for your information.");
04677     }
04678 
04679     for (int sweep = 0; sweep < numSweeps; ++sweep) {
04680       if (! importer.is_null () && sweep > 0) {
04681         // We already did the first Import for the zeroth sweep above,
04682         // if it was necessary.
04683         X_colMap->doImport (*X_domainMap, *importer, INSERT);
04684       }
04685 
04686       // Do local Gauss-Seidel.
04687       if (direction != Symmetric) {
04688         if(rowIndices.is_null())
04689           this->template localGaussSeidel<ST, ST> (*B_in, *X_colMap, D,
04690                                                    dampingFactor,
04691                                                    localDirection);
04692         else
04693           this->template reorderedLocalGaussSeidel<ST, ST> (*B_in, *X_colMap, D, rowIndices,
04694                                                             dampingFactor,
04695                                                             localDirection);
04696       } else { // direction == Symmetric
04697         if(rowIndices.is_null()) {
04698           this->template localGaussSeidel<ST, ST> (*B_in, *X_colMap, D,
04699                                                    dampingFactor,
04700                                                    KokkosClassic::Forward);
04701           // mfh 18 Mar 2013: Aztec's implementation of "symmetric
04702           // Gauss-Seidel" does _not_ do an Import between the forward
04703           // and backward sweeps.  This makes symmetric Gauss-Seidel a
04704           // symmetric preconditioner if the matrix A is symmetric.  We
04705           // imitate Aztec's behavior here.
04706           this->template localGaussSeidel<ST, ST> (*B_in, *X_colMap, D,
04707                                                    dampingFactor,
04708                                                    KokkosClassic::Backward);
04709         }
04710         else {
04711           this->template reorderedLocalGaussSeidel<ST, ST> (*B_in, *X_colMap, D, rowIndices,
04712                                                             dampingFactor,
04713                                                             KokkosClassic::Forward);
04714           this->template reorderedLocalGaussSeidel<ST, ST> (*B_in, *X_colMap, D, rowIndices,
04715                                                             dampingFactor,
04716                                                             KokkosClassic::Backward);
04717 
04718         }
04719       }
04720     }
04721 
04722     if (copyBackOutput) {
04723       try {
04724         deep_copy(X , *X_domainMap); // Copy result back into X.
04725       } catch (std::exception& e) {
04726         std::ostringstream os;
04727         os << "Tpetra::CrsMatrix::reorderedGaussSeidelCopy: "
04728           "deep_copy(X, *X_domainMap) threw an exception: "
04729            << e.what () << ".";
04730         TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, e.what ());
04731       }
04732     }
04733   }
04734 
04737   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
04738   template <class DomainScalar, class RangeScalar>
04739   void
04740   CrsMatrix<
04741     Scalar, LocalOrdinal, GlobalOrdinal,
04742     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
04743   localMultiply (const MultiVector<DomainScalar,LocalOrdinal,GlobalOrdinal,node_type>& X,
04744                  MultiVector<RangeScalar,LocalOrdinal,GlobalOrdinal,node_type>& Y,
04745                  Teuchos::ETransp mode,
04746                  RangeScalar alpha,
04747                  RangeScalar beta) const
04748   {
04749     using Teuchos::NO_TRANS;
04750 #ifdef HAVE_TPETRA_DEBUG
04751     const char tfecfFuncName[] = "localMultiply: ";
04752 #endif // HAVE_TPETRA_DEBUG
04753     typedef Teuchos::ScalarTraits<RangeScalar> RST;
04754 #ifdef HAVE_TPETRA_DEBUG
04755 
04756     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
04757       X.getNumVectors() != Y.getNumVectors(), std::runtime_error,
04758       ": X and Y must have the same number of columns (vectors).  ");
04759 
04760     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
04761       mode == NO_TRANS && X.getLocalLength () != getColMap ()->getNodeNumElements (),
04762       std::runtime_error, "NO_TRANS case: X has the wrong number of local rows.  "
04763       "X.getLocalLength() = " << X.getLocalLength () << " != getColMap()->"
04764       "getNodeNumElements() = " << getColMap ()->getNodeNumElements () << ".");
04765     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
04766       mode == NO_TRANS && Y.getLocalLength () != getRowMap ()->getNodeNumElements (),
04767       std::runtime_error, "NO_TRANS case: Y has the wrong number of local rows.  "
04768       "Y.getLocalLength() = " << Y.getLocalLength () << " != getRowMap()->"
04769       "getNodeNumElements() = " << getRowMap ()->getNodeNumElements () << ".");
04770 
04771     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
04772       mode != NO_TRANS && X.getLocalLength () != getRowMap ()->getNodeNumElements (),
04773       std::runtime_error, "TRANS or CONJ_TRANS case: X has the wrong number of "
04774       "local rows.  X.getLocalLength() = " << X.getLocalLength () << " != "
04775       "getRowMap()->getNodeNumElements() = "
04776       << getRowMap ()->getNodeNumElements () << ".");
04777     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
04778       mode != NO_TRANS && Y.getLocalLength () != getColMap ()->getNodeNumElements (),
04779       std::runtime_error, "TRANS or CONJ_TRANS case: X has the wrong number of "
04780       "local rows.  Y.getLocalLength() = " << Y.getLocalLength () << " != "
04781       "getColMap()->getNodeNumElements() = "
04782       << getColMap ()->getNodeNumElements () << ".");
04783 
04784     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
04785       ! isFillComplete (), std::runtime_error, ": It is incorrect to call this "
04786       "method unless the matrix is fill complete.");
04787     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
04788       X.isConstantStride() == false || Y.isConstantStride() == false,
04789       std::runtime_error, ": X and Y must be constant stride.");
04790     // If the two pointers are NULL, then they don't alias one
04791     // another, even though they are equal.
04792     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
04793       X.getDualView ().d_view.ptr_on_device () == Y.getDualView ().d_view.ptr_on_device () &&
04794       X.getDualView ().d_view.ptr_on_device () != NULL,
04795       std::runtime_error, ": X and Y may not alias one another.");
04796 #endif
04797     //
04798     // Call the matvec
04799     if (beta == RST::zero()) {
04800       // Y = alpha*op(M)*X with overwrite semantics
04801 
04802       // FIXME (mfh 27 Mar 2014) What about CONJ_TRANS???
04803       if (mode != NO_TRANS) {
04804         Kokkos::MV_MultiplyTranspose (RST::zero (),
04805                                       Y.template getLocalView<DeviceType> (),
04806                                       alpha,
04807                                       k_lclMatrix_,
04808                                       X.template getLocalView<DeviceType> ());
04809       }
04810       else { // mode == NO_TRANS
04811         Kokkos::MV_Multiply (Y.template getLocalView<DeviceType> (),
04812                              alpha,
04813                              k_lclMatrix_,
04814                              X.template getLocalView<DeviceType> ());
04815       }
04816     }
04817     else {
04818       // Y = alpha*op(M) + beta*Y
04819 
04820       // FIXME (mfh 27 Mar 2014) What about CONJ_TRANS???
04821       if(mode != NO_TRANS) {
04822         Kokkos::MV_MultiplyTranspose (beta,
04823                                       Y.template getLocalView<DeviceType> (),
04824                                       alpha,
04825                                       k_lclMatrix_,
04826                                       X.template getLocalView<DeviceType> ());
04827       }
04828       else {
04829         Kokkos::MV_Multiply (beta,
04830                              Y.template getLocalView<DeviceType> (),
04831                              alpha,
04832                              k_lclMatrix_,
04833                              X.template getLocalView<DeviceType> ());
04834       }
04835     }
04836   }
04837 
04838   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
04839   template <class DomainScalar, class RangeScalar>
04840   void
04841   CrsMatrix<
04842     Scalar, LocalOrdinal, GlobalOrdinal,
04843     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
04844   localGaussSeidel (const MultiVector<DomainScalar,LocalOrdinal,GlobalOrdinal,node_type>& B,
04845                     MultiVector<RangeScalar,LocalOrdinal,GlobalOrdinal,node_type>& X,
04846                     const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,node_type>& D,
04847                     const RangeScalar& dampingFactor,
04848                     const KokkosClassic::ESweepDirection direction) const
04849   {
04850     typedef LocalOrdinal LO;
04851     typedef GlobalOrdinal GO;
04852     typedef Tpetra::MultiVector<DomainScalar, LO, GO, node_type> DMV;
04853     typedef Tpetra::MultiVector<RangeScalar, LO, GO, node_type> RMV;
04854     typedef Tpetra::MultiVector<Scalar, LO, GO, node_type> MMV;
04855     typedef typename device_type::host_mirror_device_type HMDT;
04856     typedef typename Graph::LocalStaticCrsGraphType k_local_graph_type;
04857     typedef typename k_local_graph_type::size_type offset_type;
04858     const char prefix[] = "Tpetra::CrsMatrix::localGaussSeidel: ";
04859 
04860     TEUCHOS_TEST_FOR_EXCEPTION(
04861       ! this->isFillComplete (), std::runtime_error,
04862       prefix << "The matrix is not fill complete.");
04863     const size_t lclNumRows = this->getNodeNumRows ();
04864     const size_t numVecs = B.getNumVectors ();
04865     TEUCHOS_TEST_FOR_EXCEPTION(
04866       X.getNumVectors () != numVecs, std::invalid_argument,
04867       prefix << "B.getNumVectors() = " << numVecs << " != "
04868       "X.getNumVectors() = " << X.getNumVectors () << ".");
04869     TEUCHOS_TEST_FOR_EXCEPTION(
04870       B.getLocalLength () != lclNumRows, std::invalid_argument,
04871       prefix << "B.getLocalLength() = " << B.getLocalLength ()
04872       << " != this->getNodeNumRows() = " << lclNumRows << ".");
04873 
04874     typename DMV::dual_view_type::t_host B_lcl = B.template getLocalView<HMDT> ();
04875     typename RMV::dual_view_type::t_host X_lcl = X.template getLocalView<HMDT> ();
04876     typename MMV::dual_view_type::t_host D_lcl = D.template getLocalView<HMDT> ();
04877 
04878     offset_type B_stride[8], X_stride[8], D_stride[8];
04879     B_lcl.stride (B_stride);
04880     X_lcl.stride (X_stride);
04881     D_lcl.stride (D_stride);
04882 
04883     k_local_matrix_type lclMatrix = this->getLocalMatrix ();
04884     k_local_graph_type lclGraph = lclMatrix.graph;
04885     typename k_local_matrix_type::row_map_type ptr = lclGraph.row_map;
04886     typename k_local_matrix_type::index_type ind = lclGraph.entries;
04887     typename k_local_matrix_type::values_type val = lclMatrix.values;
04888     const offset_type* const ptrRaw = ptr.ptr_on_device ();
04889     const LO* const indRaw = ind.ptr_on_device ();
04890     const Scalar* const valRaw = val.ptr_on_device ();
04891 
04892     Kokkos::Sequential::gaussSeidel (static_cast<LO> (lclNumRows),
04893                                      static_cast<LO> (numVecs),
04894                                      ptrRaw, indRaw, valRaw,
04895                                      B_lcl.ptr_on_device (), B_stride[1],
04896                                      X_lcl.ptr_on_device (), X_stride[1],
04897                                      D_lcl.ptr_on_device (), dampingFactor,
04898                                      direction);
04899   }
04900 
04901 
04902   template<class Scalar,
04903            class LocalOrdinal,
04904            class GlobalOrdinal,
04905            class DeviceType>
04906   template<class DomainScalar,
04907            class RangeScalar>
04908   void
04909   CrsMatrix<
04910     Scalar, LocalOrdinal, GlobalOrdinal,
04911     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
04912   reorderedLocalGaussSeidel (const MultiVector<DomainScalar,LocalOrdinal,GlobalOrdinal,node_type>& B,
04913                              MultiVector<RangeScalar,LocalOrdinal,GlobalOrdinal,node_type>& X,
04914                              const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,node_type>& D,
04915                              const Teuchos::ArrayView<LocalOrdinal>& rowIndices,
04916                              const RangeScalar& dampingFactor,
04917                              const KokkosClassic::ESweepDirection direction) const
04918   {
04919     using Kokkos::Sequential::reorderedGaussSeidel;
04920     typedef LocalOrdinal LO;
04921     typedef GlobalOrdinal GO;
04922     typedef Tpetra::MultiVector<DomainScalar, LO, GO, node_type> DMV;
04923     typedef Tpetra::MultiVector<RangeScalar, LO, GO, node_type> RMV;
04924     typedef Tpetra::MultiVector<Scalar, LO, GO, node_type> MMV;
04925     typedef typename device_type::host_mirror_device_type HMDT;
04926     typedef typename Graph::LocalStaticCrsGraphType k_local_graph_type;
04927     typedef typename k_local_graph_type::size_type offset_type;
04928     const char prefix[] = "Tpetra::CrsMatrix::reorderedLocalGaussSeidel: ";
04929 
04930     TEUCHOS_TEST_FOR_EXCEPTION(
04931       ! this->isFillComplete (), std::runtime_error,
04932       prefix << "The matrix is not fill complete.");
04933     const size_t lclNumRows = this->getNodeNumRows ();
04934     const size_t numVecs = B.getNumVectors ();
04935     TEUCHOS_TEST_FOR_EXCEPTION(
04936       X.getNumVectors () != numVecs, std::invalid_argument,
04937       prefix << "B.getNumVectors() = " << numVecs << " != "
04938       "X.getNumVectors() = " << X.getNumVectors () << ".");
04939     TEUCHOS_TEST_FOR_EXCEPTION(
04940       B.getLocalLength () != lclNumRows, std::invalid_argument,
04941       prefix << "B.getLocalLength() = " << B.getLocalLength ()
04942       << " != this->getNodeNumRows() = " << lclNumRows << ".");
04943     TEUCHOS_TEST_FOR_EXCEPTION(
04944       static_cast<size_t> (rowIndices.size ()) < lclNumRows,
04945       std::invalid_argument, prefix << "rowIndices.size() = "
04946       << rowIndices.size () << " < this->getNodeNumRows() = "
04947       << lclNumRows << ".");
04948 
04949     typename DMV::dual_view_type::t_host B_lcl = B.template getLocalView<HMDT> ();
04950     typename RMV::dual_view_type::t_host X_lcl = X.template getLocalView<HMDT> ();
04951     typename MMV::dual_view_type::t_host D_lcl = D.template getLocalView<HMDT> ();
04952 
04953     offset_type B_stride[8], X_stride[8], D_stride[8];
04954     B_lcl.stride (B_stride);
04955     X_lcl.stride (X_stride);
04956     D_lcl.stride (D_stride);
04957 
04958     k_local_matrix_type lclMatrix = this->getLocalMatrix ();
04959     typename Graph::LocalStaticCrsGraphType lclGraph = lclMatrix.graph;
04960     typename k_local_matrix_type::index_type ind = lclGraph.entries;
04961     typename k_local_matrix_type::row_map_type ptr = lclGraph.row_map;
04962     typename k_local_matrix_type::values_type val = lclMatrix.values;
04963     const offset_type* const ptrRaw = ptr.ptr_on_device ();
04964     const LO* const indRaw = ind.ptr_on_device ();
04965     const Scalar* const valRaw = val.ptr_on_device ();
04966 
04967     reorderedGaussSeidel (static_cast<LO> (lclNumRows),
04968                           static_cast<LO> (numVecs), ptrRaw, indRaw, valRaw,
04969                           B_lcl.ptr_on_device (), B_stride[1],
04970                           X_lcl.ptr_on_device (), X_stride[1],
04971                           D_lcl.ptr_on_device (), rowIndices.getRawPtr (),
04972                           static_cast<LO> (lclNumRows),
04973                           dampingFactor, direction);
04974   }
04975 
04976 
04977   template<class Scalar,
04978            class LocalOrdinal,
04979            class GlobalOrdinal,
04980            class DeviceType>
04981   template<class DomainScalar,
04982            class RangeScalar>
04983   void
04984   CrsMatrix<
04985     Scalar, LocalOrdinal, GlobalOrdinal,
04986     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
04987   localSolve (const MultiVector<RangeScalar,LocalOrdinal,GlobalOrdinal,node_type>& Y,
04988               MultiVector<DomainScalar,LocalOrdinal,GlobalOrdinal,node_type>& X,
04989               Teuchos::ETransp mode) const
04990   {
04991     using Kokkos::Sequential::triSolveKokkos;
04992     using Teuchos::CONJ_TRANS;
04993     using Teuchos::NO_TRANS;
04994     using Teuchos::TRANS;
04995     typedef LocalOrdinal LO;
04996     typedef GlobalOrdinal GO;
04997     typedef Tpetra::MultiVector<DomainScalar, LO, GO, node_type> DMV;
04998     typedef Tpetra::MultiVector<RangeScalar, LO, GO, node_type> RMV;
04999     typedef typename device_type::host_mirror_device_type HMDT;
05000 
05001     const char tfecfFuncName[] = "localSolve: ";
05002 
05003     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
05004       ! isFillComplete (), std::runtime_error,
05005       "The matrix is not fill complete.");
05006     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
05007       ! X.isConstantStride () || ! Y.isConstantStride (), std::invalid_argument,
05008       "X and Y must be constant stride.");
05009     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
05010       ! isUpperTriangular () && ! isLowerTriangular (), std::runtime_error,
05011       "The matrix is neither upper triangular or lower triangular.  "
05012       "You may only call this method if the matrix is triangular.  "
05013       "Remember that this is a local (per MPI process) property, and that "
05014       "Tpetra only knows how to do a local (per process) triangular solve.");
05015     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
05016       STS::isComplex && mode == TRANS, std::logic_error, "This method does "
05017       "not currently support non-conjugated transposed solve (mode == "
05018       "Teuchos::TRANS) for complex scalar types.");
05019 
05020     // FIXME (mfh 27 Aug 2014) Tpetra has always made the odd decision
05021     // that if _some_ diagonal entries are missing locally, then it
05022     // assumes that the matrix has an implicitly stored unit diagonal.
05023     // Whether the matrix has an implicit unit diagonal or not should
05024     // be up to the user to decide.  What if the graph has no diagonal
05025     // entries, and the user wants it that way?  The only reason this
05026     // matters, though, is for the triangular solve, and in that case,
05027     // missing diagonal entries will cause trouble anyway.  However,
05028     // it would make sense to warn the user if they ask for a
05029     // triangular solve with an incomplete diagonal.  Furthermore,
05030     // this code should only assume an implicitly stored unit diagonal
05031     // if the matrix has _no_ explicitly stored diagonal entries.
05032     const Teuchos::EDiag diag = getNodeNumDiags () < getNodeNumRows () ?
05033       Teuchos::UNIT_DIAG : Teuchos::NON_UNIT_DIAG;
05034     Teuchos::EUplo uplo = Teuchos::UNDEF_TRI;
05035     if (isUpperTriangular ()) {
05036       uplo = Teuchos::UPPER_TRI;
05037     } else if (isLowerTriangular ()) {
05038       uplo = Teuchos::LOWER_TRI;
05039     }
05040 
05041     k_local_matrix_type A_lcl = this->getLocalMatrix ();
05042     typename DMV::dual_view_type::t_host X_lcl = X.template getLocalView<HMDT> ();
05043     typename RMV::dual_view_type::t_host Y_lcl = Y.template getLocalView<HMDT> ();
05044     triSolveKokkos (X_lcl, A_lcl, Y_lcl, uplo, diag, mode);
05045   }
05046 
05047 
05048   template<class Scalar,
05049            class LocalOrdinal,
05050            class GlobalOrdinal,
05051            class DeviceType>
05052   template<class T>
05053   Teuchos::RCP<CrsMatrix<
05054                  T, LocalOrdinal, GlobalOrdinal,
05055                  Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >
05056   CrsMatrix<
05057     Scalar, LocalOrdinal, GlobalOrdinal,
05058     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
05059   convert () const
05060   {
05061     using Teuchos::ArrayRCP;
05062     using Teuchos::RCP;
05063     using Teuchos::rcp;
05064     typedef CrsMatrix<T, LocalOrdinal, GlobalOrdinal, node_type> out_mat_type;
05065     typedef typename out_mat_type::t_ValuesType out_vals_type;
05066     typedef typename out_mat_type::k_local_matrix_type out_lcl_mat_type;
05067     typedef ArrayRCP<size_t>::size_type size_type;
05068     const char tfecfFuncName[] = "convert";
05069 
05070     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
05071       isFillComplete () == false, std::runtime_error,
05072       ": fill must be complete.");
05073 
05074     // mfh 27 Feb 2014: It seems reasonable that if this matrix has a
05075     // const graph, then the returned matrix should also.  However, if
05076     // this matrix does not have a const graph, then neither should
05077     // the returned matrix.  The code below implements this strategy.
05078 
05079     RCP<out_mat_type> newmat; // the matrix to return
05080 
05081     if (this->isStaticGraph ()) {
05082       // This matrix has a const graph, so the returned matrix should too.
05083       newmat = rcp (new out_mat_type (this->getCrsGraph ()));
05084 
05085       // Convert the values from Scalar to T, and stuff them directly
05086       // into the matrix to return.
05087       const size_type numVals =
05088         static_cast<size_type> (this->k_lclMatrix_.values.dimension_0 ());
05089 
05090       // FIXME (mfh 05 Aug 2014) Write a copy kernel (scalar_type and
05091       // T differ, so we can't use Kokkos::deep_copy).
05092       //
05093       // FIXME (mfh 05 Aug 2014) This assumes UVM.
05094       out_vals_type newVals1D ("Tpetra::CrsMatrix::val", numVals);
05095       for (size_type k = 0; k < numVals; ++k) {
05096         newVals1D(k) = static_cast<T> (this->k_values1D_(k));
05097       }
05098       newmat->k_lclMatrix_ =
05099         out_lcl_mat_type ("Tpetra::CrsMatrix::k_lclMatrix_",
05100                           this->k_lclMatrix_.numCols (), newVals1D,
05101                           this->k_lclMatrix_.graph);
05102       newmat->k_values1D_ = newVals1D;
05103       newmat->values1D_ = Kokkos::Compat::persistingView (newVals1D);
05104       // Since newmat has a static (const) graph, the graph already
05105       // has a column Map, and Import and Export objects already exist
05106       // (if applicable).  Thus, calling fillComplete is cheap.
05107       newmat->fillComplete (this->getDomainMap (), this->getRangeMap ());
05108     }
05109     else {
05110       // This matrix has a nonconst graph, so the returned matrix
05111       // should also have a nonconst graph.  However, it's fine for
05112       // the returned matrix to have static profile.  This will
05113       // certainly speed up its fillComplete.
05114 
05115       //
05116       // FIXME (mfh 05 Aug 2014) Instead of the slow stuff below, we
05117       // should copy the values and existing graph into a new local
05118       // matrix (lclMatrix), and then use the Tpetra::CrsMatrix
05119       // constructor that takes (rowMap, colMap, lclMatrix, params).
05120       //
05121 
05122       // Get this matrix's local data.
05123       ArrayRCP<const size_t> ptr;
05124       ArrayRCP<const LocalOrdinal> ind;
05125       ArrayRCP<const Scalar> oldVal;
05126       this->getAllValues (ptr, ind, oldVal);
05127 
05128       RCP<const map_type> rowMap = this->getRowMap ();
05129       RCP<const map_type> colMap = this->getColMap ();
05130 
05131       // Get an array of the number of entries in each (locally owned)
05132       // row, so that we can make the new matrix with static profile.
05133       const size_type numLocalRows =
05134         static_cast<size_type> (rowMap->getNodeNumElements ());
05135       ArrayRCP<size_t> numEntriesPerRow (numLocalRows);
05136       for (size_type localRow = 0; localRow < numLocalRows; ++localRow) {
05137         numEntriesPerRow[localRow] =
05138           static_cast<size_type> (getNumEntriesInLocalRow (localRow));
05139       }
05140 
05141       newmat = rcp (new out_mat_type (rowMap, colMap, numEntriesPerRow,
05142                                       StaticProfile));
05143 
05144       // Convert this matrix's values from Scalar to T.
05145       const size_type numVals = this->k_lclMatrix_.values.dimension_0 ();
05146       ArrayRCP<T> newVals1D (numVals);
05147       // FIXME (mfh 05 Aug 2014) This assumes UVM.
05148       for (size_type k = 0; k < numVals; ++k) {
05149         newVals1D[k] = static_cast<T> (this->k_values1D_(k));
05150       }
05151 
05152       // Give this matrix all of its local data.  We can all this
05153       // method because newmat was _not_ created with a const graph.
05154       // The data must be passed in as nonconst, so we have to copy it
05155       // first.
05156       ArrayRCP<size_t> newPtr (ptr.size ());
05157       std::copy (ptr.begin (), ptr.end (), newPtr.begin ());
05158       ArrayRCP<LocalOrdinal> newInd (ind.size ());
05159       std::copy (ind.begin (), ind.end (), newInd.begin ());
05160       newmat->setAllValues (newPtr, newInd, newVals1D);
05161 
05162       // We already have the Import and Export (if applicable) objects
05163       // from the graph, so we can save a lot of time by passing them
05164       // in to expertStaticFillComplete.
05165       RCP<const map_type> domainMap = this->getDomainMap ();
05166       RCP<const map_type> rangeMap = this->getRangeMap ();
05167       RCP<const import_type> importer = this->getCrsGraph ()->getImporter ();
05168       RCP<const export_type> exporter = this->getCrsGraph ()->getExporter ();
05169       newmat->expertStaticFillComplete (domainMap, rangeMap, importer, exporter);
05170     }
05171 
05172     return newmat;
05173   }
05174 
05175 
05176   template<class Scalar,
05177            class LocalOrdinal,
05178            class GlobalOrdinal,
05179            class DeviceType>
05180   void
05181   CrsMatrix<
05182     Scalar, LocalOrdinal, GlobalOrdinal,
05183     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
05184   checkInternalState () const
05185   {
05186 #ifdef HAVE_TPETRA_DEBUG
05187     const char tfecfFuncName[] = "checkInternalState: ";
05188     const char err[] = "Internal state is not consistent.  "
05189       "Please report this bug to the Tpetra developers.";
05190     // check the internal state of this data structure
05191     // this is called by numerous state-changing methods, in a debug build, to ensure that the object
05192     // always remains in a valid state
05193 
05194     // we must have a static graph
05195     //
05196     // a dynamic graph, depending on which constructor was used.
05197     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
05198       staticGraph_.is_null (),
05199       std::logic_error, err);
05200     // myGraph == null means that the matrix has a static graph.
05201     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
05202       ! myGraph_.is_null () && myGraph_ != staticGraph_,
05203       std::logic_error, err);
05204     // if matrix is fill complete, then graph must be fill complete
05205     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
05206       isFillComplete () && ! staticGraph_->isFillComplete (),
05207       std::logic_error, err << "  Specifically, the matrix is fill complete, "
05208       "but its graph is NOT fill complete.");
05209     // if matrix is storage optimized, it should have a 1D allocation
05210     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
05211       isStorageOptimized () && ! values2D_.is_null (),
05212       std::logic_error, err);
05213     // if matrix/graph are static profile, then 2D allocation should not be present
05214     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
05215       getProfileType() == StaticProfile && values2D_ != null,
05216       std::logic_error, err);
05217     // if matrix/graph are dynamic profile, then 1D allocation should not be present
05218     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
05219       getProfileType() == DynamicProfile && k_values1D_.dimension_0 () > 0,
05220       std::logic_error, err);
05221     // if values are allocated and they are non-zero in number, then
05222     // one of the allocations should be present
05223     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
05224       staticGraph_->indicesAreAllocated () &&
05225       staticGraph_->getNodeAllocationSize() > 0 &&
05226       staticGraph_->getNodeNumRows() > 0
05227       && values2D_.is_null () &&
05228       k_values1D_.dimension_0 () == 0,
05229       std::logic_error, err);
05230     // we cannot have both a 1D and 2D allocation
05231     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
05232       k_values1D_.dimension_0 () > 0 && values2D_ != null,
05233       std::logic_error, err << "  Specifically, k_values1D_ is allocated (has "
05234       "size " << k_values1D_.dimension_0 () << " > 0) and values2D_ is also "
05235       "allocated.  CrsMatrix is not suppose to have both a 1-D and a 2-D "
05236       "allocation at the same time.");
05237 #endif
05238   }
05239 
05240   template<class Scalar,
05241            class LocalOrdinal,
05242            class GlobalOrdinal,
05243            class DeviceType>
05244   std::string
05245   CrsMatrix<
05246     Scalar, LocalOrdinal, GlobalOrdinal,
05247     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
05248   description () const
05249   {
05250     std::ostringstream os;
05251 
05252     os << "Tpetra::CrsMatrix (Kokkos refactor): {";
05253     if (this->getObjectLabel () != "") {
05254       os << "Label: \"" << this->getObjectLabel () << "\", ";
05255     }
05256     if (isFillComplete()) {
05257       os << "isFillComplete: true"
05258          << ", global dimensions: [" << getGlobalNumRows () << ", "
05259          << getGlobalNumCols () << "]"
05260          << ", global number of entries: " << getGlobalNumEntries ()
05261          << "}";
05262     }
05263     else {
05264       os << "isFillComplete: false"
05265          << ", global dimensions: [" << getGlobalNumRows () << ", "
05266          << getGlobalNumCols () << "]}";
05267     }
05268     return os.str ();
05269   }
05270 
05271   template<class Scalar,
05272            class LocalOrdinal,
05273            class GlobalOrdinal,
05274            class DeviceType>
05275   void
05276   CrsMatrix<
05277     Scalar, LocalOrdinal, GlobalOrdinal,
05278     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
05279   describe (Teuchos::FancyOStream &out,
05280             const Teuchos::EVerbosityLevel verbLevel) const
05281   {
05282     using std::endl;
05283     using std::setw;
05284     using Teuchos::Comm;
05285     using Teuchos::RCP;
05286     using Teuchos::TypeNameTraits;
05287     using Teuchos::VERB_DEFAULT;
05288     using Teuchos::VERB_NONE;
05289     using Teuchos::VERB_LOW;
05290     using Teuchos::VERB_MEDIUM;
05291     using Teuchos::VERB_HIGH;
05292     using Teuchos::VERB_EXTREME;
05293 
05294     const Teuchos::EVerbosityLevel vl = (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
05295 
05296     if (vl == VERB_NONE) {
05297       return; // Don't print anything at all
05298     }
05299     // By convention, describe() always begins with a tab.
05300     Teuchos::OSTab tab0 (out);
05301 
05302     RCP<const Comm<int> > comm = this->getComm();
05303     const int myRank = comm->getRank();
05304     const int numProcs = comm->getSize();
05305     size_t width = 1;
05306     for (size_t dec=10; dec<getGlobalNumRows(); dec *= 10) {
05307       ++width;
05308     }
05309     width = std::max<size_t> (width, static_cast<size_t> (11)) + 2;
05310 
05311     //    none: print nothing
05312     //     low: print O(1) info from node 0
05313     //  medium: print O(P) info, num entries per process
05314     //    high: print O(N) info, num entries per row
05315     // extreme: print O(NNZ) info: print indices and values
05316     //
05317     // for medium and higher, print constituent objects at specified verbLevel
05318     if (myRank == 0) {
05319       out << "Tpetra::CrsMatrix (Kokkos refactor):" << endl;
05320     }
05321     Teuchos::OSTab tab1 (out);
05322 
05323     if (myRank == 0) {
05324       if (this->getObjectLabel () != "") {
05325         out << "Label: \"" << this->getObjectLabel () << "\", ";
05326       }
05327       {
05328         out << "Template parameters:" << endl;
05329         Teuchos::OSTab tab2 (out);
05330         out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
05331             << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
05332             << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
05333             << "Node: " << Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>::name () << endl;
05334       }
05335       if (isFillComplete()) {
05336         out << "isFillComplete: true" << endl
05337             << "Global dimensions: [" << getGlobalNumRows () << ", "
05338             << getGlobalNumCols () << "]" << endl
05339             << "Global number of entries: " << getGlobalNumEntries () << endl
05340             << "Global number of diagonal entries: " << getGlobalNumDiags ()
05341             << endl << "Global max number of entries in a row: "
05342             << getGlobalMaxNumRowEntries () << endl;
05343       }
05344       else {
05345         out << "isFillComplete: false" << endl
05346             << "Global dimensions: [" << getGlobalNumRows () << ", "
05347             << getGlobalNumCols () << "]" << endl;
05348       }
05349     }
05350 
05351     if (vl < VERB_MEDIUM) {
05352       return; // all done!
05353     }
05354 
05355     // Describe the Map Map.
05356     if (myRank == 0) {
05357       out << endl << "Row Map:" << endl;
05358     }
05359     getRowMap ()->describe (out, vl);
05360 
05361     // Describe the column Map.
05362     if (myRank == 0) {
05363       out << "Column Map: ";
05364     }
05365     if (getColMap ().is_null ()) {
05366       if (myRank == 0) {
05367         out << "null" << endl;
05368       }
05369     } else if (getColMap () == getRowMap ()) {
05370       if (myRank == 0) {
05371         out << "same as row Map" << endl;
05372       }
05373     } else {
05374       if (myRank == 0) {
05375         out << endl;
05376       }
05377       getColMap ()->describe (out, vl);
05378     }
05379 
05380     // Describe the domain Map.
05381     if (myRank == 0) {
05382       out << "Domain Map: ";
05383     }
05384     if (getDomainMap ().is_null ()) {
05385       if (myRank == 0) {
05386         out << "null" << endl;
05387       }
05388     } else if (getDomainMap () == getRowMap ()) {
05389       if (myRank == 0) {
05390         out << "same as row Map" << endl;
05391       }
05392     } else if (getDomainMap () == getColMap ()) {
05393       if (myRank == 0) {
05394         out << "same as column Map" << endl;
05395       }
05396     } else {
05397       if (myRank == 0) {
05398         out << endl;
05399       }
05400       getColMap ()->describe (out, vl);
05401     }
05402 
05403     // Describe the range Map.
05404     if (myRank == 0) {
05405       out << "Range Map: ";
05406     }
05407     if (getRangeMap ().is_null ()) {
05408       if (myRank == 0) {
05409         out << "null" << endl;
05410       }
05411     } else if (getRangeMap () == getDomainMap ()) {
05412       if (myRank == 0) {
05413         out << "same as domain Map" << endl;
05414       }
05415     } else if (getRangeMap () == getRowMap ()) {
05416       if (myRank == 0) {
05417         out << "same as row Map" << endl;
05418       }
05419     } else {
05420       if (myRank == 0) {
05421         out << endl;
05422       }
05423       getColMap ()->describe (out, vl);
05424     }
05425 
05426     // O(P) data
05427     for (int curRank = 0; curRank < numProcs; ++curRank) {
05428       if (myRank == curRank) {
05429         out << "Process rank: " << curRank << endl;
05430         Teuchos::OSTab tab2 (out);
05431         if (! staticGraph_->indicesAreAllocated ()) {
05432           out << "Graph indices not allocated" << endl;
05433         }
05434         else {
05435           out << "Number of allocated entries: "
05436               << staticGraph_->getNodeAllocationSize () << endl;
05437         }
05438         out << "Number of entries: " << getNodeNumEntries () << endl;
05439         if (isFillComplete ()) {
05440           out << "Number of diagonal entries: " << getNodeNumDiags () << endl;
05441         }
05442         out << "Max number of entries per row: " << getNodeMaxNumRowEntries ()
05443             << endl;
05444       }
05445       // Give output time to complete by executing some barriers.
05446       comm->barrier ();
05447       comm->barrier ();
05448       comm->barrier ();
05449     }
05450 
05451     if (vl < VERB_HIGH) {
05452       return; // all done!
05453     }
05454 
05455     // O(N) and O(NNZ) data
05456     for (int curRank = 0; curRank < numProcs; ++curRank) {
05457       if (myRank == curRank) {
05458         out << std::setw(width) << "Proc Rank"
05459             << std::setw(width) << "Global Row"
05460             << std::setw(width) << "Num Entries";
05461         if (vl == VERB_EXTREME) {
05462           out << std::setw(width) << "(Index,Value)";
05463         }
05464         out << endl;
05465         for (size_t r = 0; r < getNodeNumRows (); ++r) {
05466           const size_t nE = getNumEntriesInLocalRow(r);
05467           GlobalOrdinal gid = getRowMap()->getGlobalElement(r);
05468           out << std::setw(width) << myRank
05469               << std::setw(width) << gid
05470               << std::setw(width) << nE;
05471           if (vl == VERB_EXTREME) {
05472             if (isGloballyIndexed()) {
05473               ArrayView<const GlobalOrdinal> rowinds;
05474               ArrayView<const Scalar> rowvals;
05475               getGlobalRowView (gid, rowinds, rowvals);
05476               for (size_t j = 0; j < nE; ++j) {
05477                 out << " (" << rowinds[j]
05478                     << ", " << rowvals[j]
05479                     << ") ";
05480               }
05481             }
05482             else if (isLocallyIndexed()) {
05483               ArrayView<const LocalOrdinal> rowinds;
05484               ArrayView<const Scalar> rowvals;
05485               getLocalRowView (r, rowinds, rowvals);
05486               for (size_t j=0; j < nE; ++j) {
05487                 out << " (" << getColMap()->getGlobalElement(rowinds[j])
05488                     << ", " << rowvals[j]
05489                     << ") ";
05490               }
05491             } // globally or locally indexed
05492           } // vl == VERB_EXTREME
05493           out << endl;
05494         } // for each row r on this process
05495       } // if (myRank == curRank)
05496 
05497       // Give output time to complete
05498       comm->barrier ();
05499       comm->barrier ();
05500       comm->barrier ();
05501     } // for each process p
05502   }
05503 
05504 
05505   template<class Scalar,
05506            class LocalOrdinal,
05507            class GlobalOrdinal,
05508            class DeviceType>
05509   bool
05510   CrsMatrix<
05511     Scalar, LocalOrdinal, GlobalOrdinal,
05512     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
05513   checkSizes (const SrcDistObject& source)
05514   {
05515     // It's not clear what kind of compatibility checks on sizes can
05516     // be performed here.  Epetra_CrsGraph doesn't check any sizes for
05517     // compatibility.
05518 
05519     // Currently, the source object must be a RowMatrix with the same
05520     // four template parameters as the target CrsMatrix.  We might
05521     // relax this requirement later.
05522     typedef RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal,  node_type> row_matrix_type;
05523     const row_matrix_type* srcRowMat =
05524       dynamic_cast<const row_matrix_type*> (&source);
05525     return (srcRowMat != NULL);
05526   }
05527 
05528 
05529   template<class Scalar,
05530            class LocalOrdinal,
05531            class GlobalOrdinal,
05532            class DeviceType>
05533   void
05534   CrsMatrix<
05535     Scalar, LocalOrdinal, GlobalOrdinal,
05536     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
05537   copyAndPermute (const SrcDistObject& source,
05538                   size_t numSameIDs,
05539                   const ArrayView<const LocalOrdinal> &permuteToLIDs,
05540                   const ArrayView<const LocalOrdinal> &permuteFromLIDs)
05541   {
05542     using Teuchos::Array;
05543     using Teuchos::ArrayView;
05544     typedef LocalOrdinal LO;
05545     typedef GlobalOrdinal GO;
05546     typedef node_type NT;
05547     // Method name string for TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC.
05548     const char tfecfFuncName[] = "copyAndPermute";
05549 
05550     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
05551       permuteToLIDs.size() != permuteFromLIDs.size(),
05552       std::invalid_argument, ": permuteToLIDs.size() = " << permuteToLIDs.size()
05553       << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
05554 
05555     // This dynamic cast should succeed, because we've already tested
05556     // it in checkSizes().
05557     typedef RowMatrix<Scalar, LO, GO, NT> row_matrix_type;
05558     const row_matrix_type& srcMat = dynamic_cast<const row_matrix_type&> (source);
05559 
05560     const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed ();
05561     //
05562     // Copy the first numSame row from source to target (this matrix).
05563     // This involves copying rows corresponding to LIDs [0, numSame-1].
05564     //
05565     const map_type& srcRowMap = * (srcMat.getRowMap ());
05566     Array<GO> rowInds;
05567     Array<Scalar> rowVals;
05568     const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
05569     for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
05570       // Global ID for the current row index in the source matrix.
05571       // The first numSameIDs GIDs in the two input lists are the
05572       // same, so sourceGID == targetGID in this case.
05573       const GO sourceGID = srcRowMap.getGlobalElement (sourceLID);
05574       const GO targetGID = sourceGID;
05575 
05576       // Input views for the combineGlobalValues() call below.
05577       ArrayView<const GO> rowIndsConstView;
05578       ArrayView<const Scalar> rowValsConstView;
05579 
05580       if (sourceIsLocallyIndexed) {
05581         const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
05582         if (rowLength > static_cast<size_t> (rowInds.size())) {
05583           rowInds.resize (rowLength);
05584           rowVals.resize (rowLength);
05585         }
05586         // Resizing invalidates an Array's views, so we must make new
05587         // ones, even if rowLength hasn't changed.
05588         ArrayView<GO> rowIndsView = rowInds.view (0, rowLength);
05589         ArrayView<Scalar> rowValsView = rowVals.view (0, rowLength);
05590 
05591         // The source matrix is locally indexed, so we have to get a
05592         // copy.  Really it's the GIDs that have to be copied (because
05593         // they have to be converted from LIDs).
05594         size_t checkRowLength = 0;
05595         srcMat.getGlobalRowCopy (sourceGID, rowIndsView, rowValsView, checkRowLength);
05596 
05597 #ifdef HAVE_TPETRA_DEBUG
05598         TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(rowLength != checkRowLength,
05599           std::logic_error, ": For global row index " << sourceGID << ", the source"
05600           " matrix's getNumEntriesInGlobalRow() method returns a row length of "
05601           << rowLength << ", but the getGlobalRowCopy() method reports that "
05602           "the row length is " << checkRowLength << ".  Please report this bug "
05603           "to the Tpetra developers.");
05604 #endif // HAVE_TPETRA_DEBUG
05605 
05606         rowIndsConstView = rowIndsView.view (0, rowLength);
05607         rowValsConstView = rowValsView.view (0, rowLength);
05608       }
05609       else { // source matrix is globally indexed.
05610         srcMat.getGlobalRowView (sourceGID, rowIndsConstView, rowValsConstView);
05611       }
05612 
05613       // Combine the data into the target matrix.
05614       if (isStaticGraph()) {
05615         // Applying a permutation to a matrix with a static graph
05616         // means REPLACE-ing entries.
05617         combineGlobalValues (targetGID, rowIndsConstView, rowValsConstView, REPLACE);
05618       }
05619       else {
05620         // Applying a permutation to a matrix with a dynamic graph
05621         // means INSERT-ing entries.  This has the same effect as
05622         // ADD, if the target graph already has an entry there.
05623         combineGlobalValues (targetGID, rowIndsConstView, rowValsConstView, INSERT);
05624       }
05625     } // For each of the consecutive source and target IDs that are the same
05626 
05627     //
05628     // Permute the remaining rows.
05629     //
05630     const map_type& tgtRowMap = * (this->getRowMap ());
05631     const size_t numPermuteToLIDs = static_cast<size_t> (permuteToLIDs.size ());
05632     for (size_t p = 0; p < numPermuteToLIDs; ++p) {
05633       const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]);
05634       const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]);
05635 
05636       // Input views for the combineGlobalValues() call below.
05637       ArrayView<const GO> rowIndsConstView;
05638       ArrayView<const Scalar> rowValsConstView;
05639 
05640       if (sourceIsLocallyIndexed) {
05641         const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
05642         if (rowLength > static_cast<size_t> (rowInds.size ())) {
05643           rowInds.resize (rowLength);
05644           rowVals.resize (rowLength);
05645         }
05646         // Resizing invalidates an Array's views, so we must make new
05647         // ones, even if rowLength hasn't changed.
05648         ArrayView<GO> rowIndsView = rowInds.view (0, rowLength);
05649         ArrayView<Scalar> rowValsView = rowVals.view (0, rowLength);
05650 
05651         // The source matrix is locally indexed, so we have to get a
05652         // copy.  Really it's the GIDs that have to be copied (because
05653         // they have to be converted from LIDs).
05654         size_t checkRowLength = 0;
05655         srcMat.getGlobalRowCopy (sourceGID, rowIndsView, rowValsView, checkRowLength);
05656 
05657 #ifdef HAVE_TPETRA_DEBUG
05658         TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(rowLength != checkRowLength,
05659           std::logic_error, ": For the source matrix's global row index "
05660           << sourceGID << ", the source matrix's getNumEntriesInGlobalRow() method "
05661           "returns a row length of " << rowLength << ", but the "
05662           "getGlobalRowCopy() method reports that the row length is "
05663           << checkRowLength << ".  Please report this bug to the Tpetra "
05664           "developers.");
05665 #endif // HAVE_TPETRA_DEBUG
05666 
05667         rowIndsConstView = rowIndsView.view (0, rowLength);
05668         rowValsConstView = rowValsView.view (0, rowLength);
05669       }
05670       else {
05671         srcMat.getGlobalRowView (sourceGID, rowIndsConstView, rowValsConstView);
05672       }
05673 
05674       // Combine the data into the target matrix.
05675       if (isStaticGraph()) {
05676         this->combineGlobalValues (targetGID, rowIndsConstView,
05677                                    rowValsConstView, REPLACE);
05678       }
05679       else {
05680         this->combineGlobalValues (targetGID, rowIndsConstView,
05681                                    rowValsConstView, INSERT);
05682       }
05683     } // For each ID to permute
05684   }
05685 
05686 
05687   template<class Scalar,
05688            class LocalOrdinal,
05689            class GlobalOrdinal,
05690            class DeviceType>
05691   void
05692   CrsMatrix<
05693     Scalar, LocalOrdinal, GlobalOrdinal,
05694     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
05695   packAndPrepare (const SrcDistObject& source,
05696                   const Teuchos::ArrayView<const LocalOrdinal>& exportLIDs,
05697                   Teuchos::Array<char>& exports,
05698                   const Teuchos::ArrayView<size_t>& numPacketsPerLID,
05699                   size_t& constantNumPackets,
05700                   Distributor& distor)
05701   {
05702     using Teuchos::Array;
05703     using Teuchos::ArrayView;
05704     using Teuchos::av_reinterpret_cast;
05705     typedef LocalOrdinal LO;
05706     typedef GlobalOrdinal GO;
05707     //typedef typename ArrayView<const LO>::size_type size_type; // unused
05708     const char tfecfFuncName[] = "packAndPrepare";
05709 
05710     // Attempt to cast the source object to RowMatrix.  If the cast
05711     // succeeds, use the source object's pack method to pack its data
05712     // for communication.  If the source object is really a CrsMatrix,
05713     // this will pick up the CrsMatrix's more efficient override.  If
05714     // the RowMatrix cast fails, then the source object doesn't have
05715     // the right type.
05716     //
05717     // FIXME (mfh 30 Jun 2013) We don't even need the RowMatrix to
05718     // have the same Node type.  Unfortunately, we don't have a way to
05719     // ask if the RowMatrix is "a RowMatrix with any Node type," since
05720     // RowMatrix doesn't have a base class.  A hypothetical
05721     // RowMatrixBase<Scalar, LO, GO> class, which does not currently
05722     // exist, would satisfy this requirement.
05723     //
05724     // Why RowMatrixBase<Scalar, LO, GO>?  The source object's Scalar
05725     // type doesn't technically need to match the target object's
05726     // Scalar type, so we could just have RowMatrixBase<LO, GO>.  LO
05727     // and GO need not be the same, as long as there is no overflow of
05728     // the indices.  However, checking for index overflow is global
05729     // and therefore undesirable.
05730     typedef RowMatrix<Scalar, LO, GO, node_type> row_matrix_type;
05731     const row_matrix_type* srcRowMat =
05732       dynamic_cast<const row_matrix_type*> (&source);
05733     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
05734       srcRowMat == NULL, std::invalid_argument,
05735       ": The source object of the Import or Export operation is neither a "
05736       "CrsMatrix (with the same template parameters as the target object), "
05737       "nor a RowMatrix (with the same first four template parameters as the "
05738       "target object).");
05739     srcRowMat->pack (exportLIDs, exports, numPacketsPerLID,
05740                      constantNumPackets, distor);
05741   }
05742 
05743 
05744   template<class Scalar,
05745            class LocalOrdinal,
05746            class GlobalOrdinal,
05747            class DeviceType>
05748   void
05749   CrsMatrix<
05750     Scalar, LocalOrdinal, GlobalOrdinal,
05751     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
05752   pack (const Teuchos::ArrayView<const LocalOrdinal>& exportLIDs,
05753         Teuchos::Array<char>& exports,
05754         const Teuchos::ArrayView<size_t>& numPacketsPerLID,
05755         size_t& constantNumPackets,
05756         Distributor &distor) const
05757   {
05758     using Teuchos::Array;
05759     using Teuchos::ArrayView;
05760     using Teuchos::av_reinterpret_cast;
05761     using Teuchos::RCP;
05762     typedef LocalOrdinal LO;
05763     typedef GlobalOrdinal GO;
05764     typedef typename ArrayView<const LO>::size_type size_type;
05765     const char tfecfFuncName[] = "pack";
05766 
05767     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
05768       exportLIDs.size() != numPacketsPerLID.size(),
05769       std::invalid_argument, ": exportLIDs.size() = " << exportLIDs.size()
05770       << "!= numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
05771 
05772     // Get a reference to the matrix's row Map.
05773     const map_type& rowMap = * (this->getRowMap ());
05774 
05775     const bool locallyIndexed = this->isLocallyIndexed ();
05776     constantNumPackets = 0;
05777 
05778     // Get the GIDs of the rows we want to pack.
05779     Array<GO> exportGIDs (exportLIDs.size ());
05780     const size_type numExportGIDs = exportGIDs.size ();
05781     for (size_type i = 0; i < numExportGIDs; ++i) {
05782       exportGIDs[i] = rowMap.getGlobalElement (exportLIDs[i]);
05783     }
05784 
05785     // We say "Packet" is char (really a "byte"), but the actual unit
05786     // of packing is a (GID, value) pair.  The GID is the column index
05787     // in that row of the sparse matrix, and the value is the value at
05788     // that entry of the sparse matrix.  Thus, we have to scale
05789     // numPacketsPerLID by the number of bytes in a _packed_ (GID,
05790     // value) pair.  (We pack the GID and value in each pair
05791     // separately, so the number of bytes in a packed pair is actually
05792     // sizeof(GO) + sizeof(Scalar).)
05793     //
05794     // FIXME (mfh 24 Feb 2013) This code is only correct if
05795     // sizeof(Scalar) is a meaningful representation of the amount of
05796     // data in a Scalar instance.  (GO is always a built-in integer
05797     // type.)
05798     //
05799     // Compute the number of packets per export LID, and accumulate
05800     // the total number of packages.  While doing so, find the max
05801     // number of entries in each row owned by this process; we will
05802     // use that to size temporary arrays below.
05803     const size_t sizeOfOrdValPair = sizeof (GO) + sizeof (Scalar);
05804     size_t totalNumEntries = 0;
05805     size_t maxRowLength = 0;
05806     for (size_type i = 0; i < exportGIDs.size(); ++i) {
05807       const size_t curNumEntries =
05808         this->getNumEntriesInGlobalRow (exportGIDs[i]);
05809       numPacketsPerLID[i] = curNumEntries * sizeOfOrdValPair;
05810       totalNumEntries += curNumEntries;
05811       maxRowLength = std::max (curNumEntries, maxRowLength);
05812     }
05813 
05814     // Pack export data by interleaving rows' indices and values in
05815     // the following way:
05816     //
05817     // [inds_row0 vals_row0 inds_row1 vals_row1 ... ]
05818     if (totalNumEntries > 0) {
05819       // exports is an array of char (bytes), so scale the total
05820       // number of entries by the number of bytes per entry (where
05821       // "entry" includes both the column index and the value).
05822       const size_t totalNumBytes = totalNumEntries * sizeOfOrdValPair;
05823       exports.resize (totalNumBytes);
05824 
05825       // Current position in the 'exports' output array.
05826       size_t curOffsetInBytes = 0;
05827 
05828       // For each row of the matrix owned by the calling process, pack
05829       // that row's column indices and values into the exports array.
05830       // If the matrix is globally indexed, we can use view semantics
05831       // (getGlobalRowView), which should be faster than copy
05832       // semantics (getGlobalRowCopy).  Otherwise, we'll have to use
05833       // copy semantics.
05834       //
05835       // FIXME (mfh 28 Jun 2013) This could be made a (shared-memory)
05836       // parallel kernel, by using the CSR data layout to calculate
05837       // positions in the output buffer.
05838       if (locallyIndexed) {
05839         // Locally indexed matrices always have a column Map.
05840         const map_type& colMap = * (this->getColMap ());
05841 
05842         // Views of the column LIDs and values in each row.  It's
05843         // worth creating empty views here, because they aren't
05844         // returned by getLocalRowView; that method will modify (set)
05845         // them in place.
05846         ArrayView<const LO> lidsView;
05847         ArrayView<const Scalar> valsView;
05848 
05849         // Temporary buffer for a copy of the column indices (as GIDs)
05850         // in each row.  Import and Export operations to a CrsMatrix
05851         // target currently expect GIDs, not LIDs.
05852         //
05853         // FIXME (mfh 30 Jun 2013) If the source and target have the
05854         // same column Maps, it would make sense to pack column
05855         // indices as LIDs instead of GIDs.  Packing them as GIDs is
05856         // correct, but it's inefficient to convert LIDs to GIDs and
05857         // then back again on receipt.  Furthermore, GIDs might be
05858         // larger than LIDs, thus costing more bandwidth.
05859         Array<GO> gids (static_cast<size_type> (maxRowLength));
05860 
05861         const size_type numExportLIDs = exportLIDs.size ();
05862         for (size_type i = 0; i < numExportLIDs; ++i) {
05863           // Get a (locally indexed) view of the current row's data.
05864           this->getLocalRowView (exportLIDs[i], lidsView, valsView);
05865 
05866           // Convert column indices as LIDs to column indices as GIDs.
05867           const size_type curNumEntries = lidsView.size ();
05868           ArrayView<GO> gidsView = gids (0, curNumEntries);
05869           for (size_type k = 0; k < curNumEntries; ++k) {
05870             gidsView[k] = colMap.getGlobalElement (lidsView[k]);
05871           }
05872 
05873           // Get views of the spots in the exports array in which to
05874           // put the indices resp. values.  The type cast makes the
05875           // views look like GO resp. Scalar, when the array they are
05876           // viewing is really an array of char.
05877           ArrayView<char> gidsViewOutChar =
05878             exports (curOffsetInBytes,
05879                      static_cast<size_t> (curNumEntries) * sizeof (GO));
05880           ArrayView<char> valsViewOutChar =
05881             exports (curOffsetInBytes + static_cast<size_t> (curNumEntries) * sizeof (GO),
05882                      static_cast<size_t> (curNumEntries) * sizeof (Scalar));
05883           ArrayView<GO> gidsViewOut = av_reinterpret_cast<GO> (gidsViewOutChar);
05884           ArrayView<Scalar> valsViewOut = av_reinterpret_cast<Scalar> (valsViewOutChar);
05885 
05886           // Copy the row's data into the views of the exports array.
05887           std::copy (gidsView.begin (),
05888                      gidsView.begin () + static_cast<size_type> (curNumEntries),
05889                      gidsViewOut.begin ());
05890           std::copy (valsView.begin (),
05891                      valsView.begin () + static_cast<size_type> (curNumEntries),
05892                      valsViewOut.begin ());
05893           // Keep track of how many bytes we packed.
05894           curOffsetInBytes += sizeOfOrdValPair * curNumEntries;
05895         }
05896       }
05897       else { // the matrix is globally indexed
05898         ArrayView<const GO> gidsView;
05899         ArrayView<const Scalar> valsView;
05900 
05901         const size_type numExportLIDs = exportLIDs.size ();
05902         for (size_type i = 0; i < numExportLIDs; ++i) {
05903           // Get a view of the current row's data.
05904           this->getGlobalRowView (exportGIDs[i], gidsView, valsView);
05905           const size_t curNumEntries = static_cast<size_t> (gidsView.size ());
05906           // Get views of the spots in the exports array in which to
05907           // put the indices resp. values.  See notes and FIXME above.
05908 
05909           ArrayView<char> gidsViewOutChar =
05910             exports (curOffsetInBytes, curNumEntries * sizeof (GO));
05911           ArrayView<char> valsViewOutChar =
05912             exports (curOffsetInBytes + curNumEntries * sizeof (GO),
05913                      curNumEntries * sizeof (Scalar));
05914           ArrayView<GO> gidsViewOut = av_reinterpret_cast<GO> (gidsViewOutChar);
05915           ArrayView<Scalar> valsViewOut = av_reinterpret_cast<Scalar> (valsViewOutChar);
05916 
05917           // Copy the row's data into the views of the exports array.
05918           std::copy (gidsView.begin (), gidsView.end (), gidsViewOut.begin ());
05919           std::copy (valsView.begin (), valsView.end (), valsViewOut.begin ());
05920           // Keep track of how many bytes we packed.
05921           curOffsetInBytes += sizeOfOrdValPair * curNumEntries;
05922         }
05923       }
05924 
05925 #ifdef HAVE_TPETRA_DEBUG
05926       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(curOffsetInBytes != totalNumBytes,
05927         std::logic_error, ": At end of method, the final offset bytes count "
05928         "curOffsetInBytes=" << curOffsetInBytes << " does not equal the total "
05929         "number of bytes packed totalNumBytes=" << totalNumBytes << ".  Please "
05930         "report this bug to the Tpetra developers.");
05931 #endif //  HAVE_TPETRA_DEBUG
05932     }
05933   }
05934 
05935 
05936   template<class Scalar,
05937            class LocalOrdinal,
05938            class GlobalOrdinal,
05939            class DeviceType>
05940   void
05941   CrsMatrix<
05942     Scalar, LocalOrdinal, GlobalOrdinal,
05943     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
05944   combineGlobalValues (const GlobalOrdinal globalRowIndex,
05945                        const ArrayView<const GlobalOrdinal> columnIndices,
05946                        const ArrayView<const Scalar> values,
05947                        const Tpetra::CombineMode combineMode)
05948   {
05949     if (isStaticGraph()) {
05950       // INSERT doesn't make sense for a static graph, since you
05951       // aren't allowed to change the structure of the graph.
05952       // However, all the other combine modes work.
05953       if (combineMode == ADD) {
05954         sumIntoGlobalValues (globalRowIndex, columnIndices, values);
05955       }
05956       else if (combineMode == REPLACE) {
05957         replaceGlobalValues (globalRowIndex, columnIndices, values);
05958       }
05959       else if (combineMode == ABSMAX) {
05960         using Details::AbsMax;
05961         AbsMax<Scalar> f;
05962         this->template transformGlobalValues<AbsMax<Scalar> > (globalRowIndex,
05963                                                                columnIndices(),
05964                                                                values(), f);
05965       }
05966       else if (combineMode == INSERT) {
05967         TEUCHOS_TEST_FOR_EXCEPTION(isStaticGraph() && combineMode == INSERT,
05968           std::invalid_argument, "combineGlobalValues: INSERT combine mode "
05969           "is not allowed if the matrix has a static graph (i.e., was "
05970           "constructed with the CrsMatrix constructor that takes a const "
05971           "CrsGraph pointer).");
05972       }
05973       else {
05974         TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
05975           "combineGlobalValues: Invalid combine mode; should never get here!  "
05976           "Please report this bug to the Tpetra developers.");
05977       }
05978     }
05979     else { // The matrix has a dynamic graph.
05980       if (combineMode == ADD || combineMode == INSERT) {
05981         // For a dynamic graph, all incoming column indices are
05982         // inserted into the target graph.  Duplicate indices will
05983         // have their values summed.  In this context, ADD and INSERT
05984         // are equivalent.  We need to call insertGlobalValues()
05985         // anyway if the column indices don't yet exist in this row,
05986         // so we just call insertGlobalValues() for both cases.
05987         insertGlobalValuesFiltered (globalRowIndex, columnIndices, values);
05988       }
05989       // FIXME (mfh 14 Mar 2012):
05990       //
05991       // Implementing ABSMAX or REPLACE for a dynamic graph would
05992       // require modifying assembly to attach a possibly different
05993       // combine mode to each inserted (i, j, A_ij) entry.  For
05994       // example, consider two different Export operations to the same
05995       // target CrsMatrix, the first with ABSMAX combine mode and the
05996       // second with REPLACE.  This isn't a common use case, so we
05997       // won't mess with it for now.
05998       else if (combineMode == ABSMAX) {
05999         TEUCHOS_TEST_FOR_EXCEPTION(! isStaticGraph() && combineMode == ABSMAX,
06000           std::logic_error, "combineGlobalValues: ABSMAX combine mode when "
06001           "the matrix has a dynamic graph is not yet implemented.");
06002       }
06003       else if (combineMode == REPLACE) {
06004         TEUCHOS_TEST_FOR_EXCEPTION(! isStaticGraph() && combineMode == REPLACE,
06005           std::logic_error, "combineGlobalValues: REPLACE combine mode when "
06006           "the matrix has a dynamic graph is not yet implemented.");
06007       }
06008       else {
06009         TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
06010           "combineGlobalValues: Should never get here!  Please report this bug"
06011           "to the Tpetra developers.");
06012       }
06013     }
06014   }
06015 
06016 
06017   template<class Scalar,
06018            class LocalOrdinal,
06019            class GlobalOrdinal,
06020            class DeviceType>
06021   void
06022   CrsMatrix<
06023     Scalar, LocalOrdinal, GlobalOrdinal,
06024     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
06025   unpackAndCombine (const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
06026                     const Teuchos::ArrayView<const char>& imports,
06027                     const Teuchos::ArrayView<size_t>& numPacketsPerLID,
06028                     size_t constantNumPackets,
06029                     Distributor & /* distor */,
06030                     CombineMode combineMode)
06031   {
06032     using Teuchos::ArrayView;
06033     using Teuchos::av_reinterpret_cast;
06034     typedef LocalOrdinal LO;
06035     typedef GlobalOrdinal GO;
06036     typedef typename ArrayView<const LO>::size_type size_type;
06037     const char tfecfFuncName[] = "unpackAndCombine";
06038 
06039 #ifdef HAVE_TPETRA_DEBUG
06040     const CombineMode validModes[4] = {ADD, REPLACE, ABSMAX, INSERT};
06041     const char* validModeNames[4] = {"ADD", "REPLACE", "ABSMAX", "INSERT"};
06042     const int numValidModes = 4;
06043 
06044     if (std::find (validModes, validModes+numValidModes, combineMode) ==
06045         validModes+numValidModes) {
06046       std::ostringstream os;
06047       os << "unpackAndCombine: Invalid combine mode.  Valid modes are {";
06048       for (int k = 0; k < numValidModes; ++k) {
06049         os << validModeNames[k];
06050         if (k < numValidModes - 1) {
06051           os << ", ";
06052         }
06053       }
06054       os << "}.";
06055       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::invalid_argument, os.str());
06056     }
06057 #endif // HAVE_TPETRA_DEBUG
06058     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
06059       importLIDs.size() != numPacketsPerLID.size(),
06060       std::invalid_argument, "importLIDs.size() = " << importLIDs.size()
06061       << "!= numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
06062 
06063     // FIXME (mfh 05 Dec 2012) Here are all the assumptions encoded in
06064     // the following line of code:
06065     //
06066     // 1. The data (index,value) for each element are packed tightly,
06067     //    with no extra space in between.
06068     //
06069     // 2. sizeof(Scalar) says how much data were used to represent a
06070     //    Scalar in its packed form.
06071     //
06072     // 3. All processes and all instances of Scalar use the same
06073     //    amount of data to represent a Scalar.  (GlobalOrdinal is
06074     //    typically a built-in integer type, so this is generally true
06075     //    for GlobalOrdinal.)
06076     //
06077     const size_t SizeOfOrdValPair = sizeof (GO) + sizeof (Scalar);
06078     const size_t totalNumBytes = imports.size (); // * sizeof(char), i.e., 1.
06079     const size_t totalNumEntries = totalNumBytes / SizeOfOrdValPair;
06080 
06081     if (totalNumEntries > 0) {
06082       const map_type& rowMap = * (this->getMap ());
06083 
06084       // data packed as follows:
06085       // [inds_row0 vals_row0 inds_row1 vals_row1 ...]
06086       ArrayView<const char> avIndsC, avValsC;
06087       ArrayView<const GO> avInds;
06088       ArrayView<const Scalar> avVals;
06089 
06090       size_t curOffsetInBytes = 0;
06091       for (size_type i = 0; i < importLIDs.size (); ++i) {
06092         const size_t rowSize = numPacketsPerLID[i] / SizeOfOrdValPair;
06093         // Needs to be in here in case of zero length rows.  If not,
06094         // the lines following the if statement error out if the row
06095         // length is zero. KLN 13/06/2011
06096         //
06097         // mfh 05 Dec 2012: The problem to which Kurtis refers in the
06098         // above comment may no longer be an issue, since
06099         // ArrayView::view() (which implements ArrayView::operator())
06100         // now allows views of length zero.
06101         if (rowSize == 0) {
06102           continue;
06103         }
06104         const LO LID = importLIDs[i];
06105         const GO myGID = rowMap.getGlobalElement (LID);
06106 
06107         // Get views of the import (incoming data) buffers.  Again,
06108         // this code assumes that sizeof(Scalar) is the number of
06109         // bytes used by each Scalar.  It also assumes that
06110         // Teuchos::Comm has correctly deserialized Scalar in place in
06111         // avValsC.
06112         avIndsC = imports (curOffsetInBytes, rowSize * sizeof (GO));
06113         avValsC = imports (curOffsetInBytes + rowSize * sizeof (GO),
06114                            rowSize * sizeof (Scalar));
06115         avInds = av_reinterpret_cast<const GO> (avIndsC);
06116         avVals = av_reinterpret_cast<const Scalar> (avValsC);
06117 
06118         combineGlobalValues (myGID, avInds (), avVals (), combineMode);
06119         curOffsetInBytes += rowSize * SizeOfOrdValPair;
06120       }
06121 #ifdef HAVE_TPETRA_DEBUG
06122       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(curOffsetInBytes != totalNumBytes,
06123         std::logic_error, "After unpacking and combining all the imports, the "
06124         "final offset in bytes curOffsetInBytes=" << curOffsetInBytes << " != "
06125         "total number of bytes totalNumBytes=" << totalNumBytes << ".  Please "
06126         "report this bug to the Tpetra developers.");
06127 #endif // HAVE_TPETRA_DEBUG
06128     }
06129   }
06130 
06131   template<class Scalar,
06132            class LocalOrdinal,
06133            class GlobalOrdinal,
06134            class DeviceType>
06135   Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal,
06136                            Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >
06137   CrsMatrix<
06138     Scalar, LocalOrdinal, GlobalOrdinal,
06139     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
06140   getColumnMapMultiVector (const MV& X_domainMap,
06141                            const bool force) const
06142   {
06143     using Teuchos::null;
06144     using Teuchos::RCP;
06145     using Teuchos::rcp;
06146 
06147     TEUCHOS_TEST_FOR_EXCEPTION(
06148       ! this->hasColMap (), std::runtime_error, "Tpetra::CrsMatrix::getColumn"
06149       "MapMultiVector: You may only call this method if the matrix has a "
06150       "column Map.  If the matrix does not yet have a column Map, you should "
06151       "first call fillComplete (with domain and range Map if necessary).");
06152 
06153     // If the graph is not fill complete, then the Import object (if
06154     // one should exist) hasn't been constructed yet.
06155     TEUCHOS_TEST_FOR_EXCEPTION(
06156       ! this->getGraph ()->isFillComplete (), std::runtime_error, "Tpetra::"
06157       "CrsMatrix::getColumnMapMultiVector: You may only call this method if "
06158       "this matrix's graph is fill complete.");
06159 
06160     const size_t numVecs = X_domainMap.getNumVectors ();
06161     RCP<const import_type> importer = this->getGraph ()->getImporter ();
06162     RCP<const map_type> colMap = this->getColMap ();
06163 
06164     RCP<MV> X_colMap; // null by default
06165 
06166     // If the Import object is trivial (null), then we don't need a
06167     // separate column Map multivector.  Just return null in that
06168     // case.  The caller is responsible for knowing not to use the
06169     // returned null pointer.
06170     //
06171     // If the Import is nontrivial, then we do need a separate
06172     // column Map multivector for the Import operation.  Check in
06173     // that case if we have to (re)create the column Map
06174     // multivector.
06175     if (! importer.is_null () || force) {
06176       if (importMV_.is_null () || importMV_->getNumVectors () != numVecs) {
06177         X_colMap = rcp (new MV (colMap, numVecs));
06178 
06179         // Cache the newly created multivector for later reuse.
06180         importMV_ = X_colMap;
06181       }
06182       else { // Yay, we can reuse the cached multivector!
06183         X_colMap = importMV_;
06184         // mfh 09 Jan 2013: We don't have to fill with zeros first,
06185         // because the Import uses INSERT combine mode, which overwrites
06186         // existing entries.
06187         //
06188         //X_colMap->putScalar (STS::zero ());
06189       }
06190     }
06191     return X_colMap;
06192   }
06193 
06194 
06195   template <class Scalar,
06196             class LocalOrdinal,
06197             class GlobalOrdinal,
06198             class DeviceType>
06199   Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal,
06200                            Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >
06201   CrsMatrix<
06202     Scalar, LocalOrdinal, GlobalOrdinal,
06203     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
06204   getRowMapMultiVector (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >& Y_rangeMap,
06205                         const bool force) const
06206   {
06207     using Teuchos::null;
06208     using Teuchos::RCP;
06209     using Teuchos::rcp;
06210 
06211     // If the graph is not fill complete, then the Export object (if
06212     // one should exist) hasn't been constructed yet.
06213     TEUCHOS_TEST_FOR_EXCEPTION(
06214       ! this->getGraph ()->isFillComplete (), std::runtime_error, "Tpetra::"
06215       "CrsMatrix::getRowMapMultiVector: You may only call this method if this "
06216       "matrix's graph is fill complete.");
06217 
06218     const size_t numVecs = Y_rangeMap.getNumVectors ();
06219     RCP<const export_type> exporter = this->getGraph ()->getExporter ();
06220     // Every version of the constructor takes either a row Map, or a
06221     // graph (all of whose constructors take a row Map).  Thus, the
06222     // matrix always has a row Map.
06223     RCP<const map_type> rowMap = this->getRowMap ();
06224 
06225     RCP<MV> Y_rowMap; // null by default
06226 
06227     // If the Export object is trivial (null), then we don't need a
06228     // separate row Map multivector.  Just return null in that case.
06229     // The caller is responsible for knowing not to use the returned
06230     // null pointer.
06231     //
06232     // If the Export is nontrivial, then we do need a separate row
06233     // Map multivector for the Export operation.  Check in that case
06234     // if we have to (re)create the row Map multivector.
06235     if (! exporter.is_null () || force) {
06236       if (exportMV_.is_null () || exportMV_->getNumVectors () != numVecs) {
06237         Y_rowMap = rcp (new MV (rowMap, numVecs));
06238 
06239         // Cache the newly created multivector for later reuse.
06240         exportMV_ = Y_rowMap;
06241       }
06242       else { // Yay, we can reuse the cached multivector!
06243         Y_rowMap = exportMV_;
06244       }
06245     }
06246     return Y_rowMap;
06247   }
06248 
06249 
06250   template <class Scalar,
06251             class LocalOrdinal,
06252             class GlobalOrdinal,
06253             class DeviceType>
06254   void
06255   CrsMatrix<
06256     Scalar, LocalOrdinal, GlobalOrdinal,
06257     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
06258   removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
06259   {
06260     TEUCHOS_TEST_FOR_EXCEPTION(
06261       myGraph_.is_null (), std::logic_error, "Tpetra::CrsMatrix::"
06262       "removeEmptyProcessesInPlace: This method does not work when the matrix "
06263       "was created with a constant graph (that is, when it was created using "
06264       "the version of its constructor that takes an RCP<const CrsGraph>).  "
06265       "This is because the matrix is not allowed to modify the graph in that "
06266       "case, but removing empty processes requires modifying the graph.");
06267     myGraph_->removeEmptyProcessesInPlace (newMap);
06268     // Even though CrsMatrix's row Map (as returned by getRowMap())
06269     // comes from its CrsGraph, CrsMatrix still implements DistObject,
06270     // so we also have to change the DistObject's Map.
06271     this->map_ = this->getRowMap ();
06272     // In the nonconst graph case, staticGraph_ is just a const
06273     // pointer to myGraph_.  This assignment is probably redundant,
06274     // but it doesn't hurt.
06275     staticGraph_ = Teuchos::rcp_const_cast<const Graph> (myGraph_);
06276   }
06277 
06278 
06279   template <class Scalar,
06280             class LocalOrdinal,
06281             class GlobalOrdinal,
06282             class DeviceType>
06283   Teuchos::RCP<RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal,
06284                          Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >
06285   CrsMatrix<
06286     Scalar, LocalOrdinal, GlobalOrdinal,
06287     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
06288   add (const Scalar& alpha,
06289        const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, node_type>& A,
06290        const Scalar& beta,
06291        const Teuchos::RCP<const map_type>& domainMap,
06292        const Teuchos::RCP<const map_type>& rangeMap,
06293        const Teuchos::RCP<Teuchos::ParameterList>& params) const
06294   {
06295     using Teuchos::Array;
06296     using Teuchos::ArrayRCP;
06297     using Teuchos::as;
06298     using Teuchos::ParameterList;
06299     using Teuchos::RCP;
06300     using Teuchos::rcp;
06301     using Teuchos::rcp_implicit_cast;
06302     using Teuchos::sublist;
06303     typedef LocalOrdinal LO;
06304     typedef GlobalOrdinal GO;
06305     typedef RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, node_type> row_matrix_type;
06306     typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, node_type> crs_matrix_type;
06307 
06308     const crs_matrix_type& B = *this; // a convenient abbreviation
06309 
06310     // If the user didn't supply a domain or range Map, then try to
06311     // get one from B first (if it has them), then from A (if it has
06312     // them).  If we don't have any domain or range Maps, scold the
06313     // user.
06314     RCP<const map_type> A_domainMap = A.getDomainMap ();
06315     RCP<const map_type> A_rangeMap = A.getRangeMap ();
06316     RCP<const map_type> B_domainMap = B.getDomainMap ();
06317     RCP<const map_type> B_rangeMap = B.getRangeMap ();
06318 
06319     RCP<const map_type> theDomainMap = domainMap;
06320     RCP<const map_type> theRangeMap = rangeMap;
06321 
06322     if (domainMap.is_null ()) {
06323       if (B_domainMap.is_null ()) {
06324         TEUCHOS_TEST_FOR_EXCEPTION(
06325           A_domainMap.is_null (), std::invalid_argument,
06326           "Tpetra::CrsMatrix::add: If neither A nor B have a domain Map, "
06327           "then you must supply a nonnull domain Map to this method.");
06328         theDomainMap = A_domainMap;
06329       } else {
06330         theDomainMap = B_domainMap;
06331       }
06332     }
06333     if (rangeMap.is_null ()) {
06334       if (B_rangeMap.is_null ()) {
06335         TEUCHOS_TEST_FOR_EXCEPTION(
06336           A_rangeMap.is_null (), std::invalid_argument,
06337           "Tpetra::CrsMatrix::add: If neither A nor B have a range Map, "
06338           "then you must supply a nonnull range Map to this method.");
06339         theRangeMap = A_rangeMap;
06340       } else {
06341         theRangeMap = B_rangeMap;
06342       }
06343     }
06344 
06345 #ifdef HAVE_TPETRA_DEBUG
06346     // In a debug build, check that A and B have matching domain and
06347     // range Maps, if they have domain and range Maps at all.  (If
06348     // they aren't fill complete, then they may not yet have them.)
06349     if (! A_domainMap.is_null () && ! A_rangeMap.is_null ()) {
06350       if (! B_domainMap.is_null () && ! B_rangeMap.is_null ()) {
06351         TEUCHOS_TEST_FOR_EXCEPTION(
06352           ! B_domainMap->isSameAs (*A_domainMap), std::invalid_argument,
06353           "Tpetra::CrsMatrix::add: The input RowMatrix A must have a domain Map "
06354           "which is the same as (isSameAs) this RowMatrix's domain Map.");
06355         TEUCHOS_TEST_FOR_EXCEPTION(
06356           ! B_rangeMap->isSameAs (*A_rangeMap), std::invalid_argument,
06357           "Tpetra::CrsMatrix::add: The input RowMatrix A must have a range Map "
06358           "which is the same as (isSameAs) this RowMatrix's range Map.");
06359         TEUCHOS_TEST_FOR_EXCEPTION(
06360           ! domainMap.is_null () && ! domainMap->isSameAs (*B_domainMap),
06361           std::invalid_argument,
06362           "Tpetra::CrsMatrix::add: The input domain Map must be the same as "
06363           "(isSameAs) this RowMatrix's domain Map.");
06364         TEUCHOS_TEST_FOR_EXCEPTION(
06365           ! rangeMap.is_null () && ! rangeMap->isSameAs (*B_rangeMap),
06366           std::invalid_argument,
06367           "Tpetra::CrsMatrix::add: The input range Map must be the same as "
06368           "(isSameAs) this RowMatrix's range Map.");
06369       }
06370     }
06371     else if (! B_domainMap.is_null () && ! B_rangeMap.is_null ()) {
06372       TEUCHOS_TEST_FOR_EXCEPTION(
06373         ! domainMap.is_null () && ! domainMap->isSameAs (*B_domainMap),
06374         std::invalid_argument,
06375         "Tpetra::CrsMatrix::add: The input domain Map must be the same as "
06376         "(isSameAs) this RowMatrix's domain Map.");
06377       TEUCHOS_TEST_FOR_EXCEPTION(
06378         ! rangeMap.is_null () && ! rangeMap->isSameAs (*B_rangeMap),
06379         std::invalid_argument,
06380         "Tpetra::CrsMatrix::add: The input range Map must be the same as "
06381         "(isSameAs) this RowMatrix's range Map.");
06382     }
06383     else {
06384       TEUCHOS_TEST_FOR_EXCEPTION(
06385         domainMap.is_null () || rangeMap.is_null (), std::invalid_argument,
06386         "Tpetra::CrsMatrix::add: If neither A nor B have a domain and range "
06387         "Map, then you must supply a nonnull domain and range Map to this "
06388         "method.");
06389     }
06390 #endif // HAVE_TPETRA_DEBUG
06391 
06392     // What parameters do we pass to C's constructor?  Do we call
06393     // fillComplete on C after filling it?  And if so, what parameters
06394     // do we pass to C's fillComplete call?
06395     bool callFillComplete = true;
06396     RCP<ParameterList> constructorSublist;
06397     RCP<ParameterList> fillCompleteSublist;
06398     if (! params.is_null ()) {
06399       callFillComplete = params->get ("Call fillComplete", callFillComplete);
06400       constructorSublist = sublist (params, "Constructor parameters");
06401       fillCompleteSublist = sublist (params, "fillComplete parameters");
06402     }
06403 
06404     RCP<const map_type> A_rowMap = A.getRowMap ();
06405     RCP<const map_type> B_rowMap = B.getRowMap ();
06406     RCP<const map_type> C_rowMap = B_rowMap; // see discussion in documentation
06407     RCP<crs_matrix_type> C; // The result matrix.
06408 
06409     // If A and B's row Maps are the same, we can compute an upper
06410     // bound on the number of entries in each row of C, before
06411     // actually computing the sum.  A reasonable upper bound is the
06412     // sum of the two entry counts in each row.  If we choose this as
06413     // the actual per-row upper bound, we can use static profile.
06414     if (A_rowMap->isSameAs (*B_rowMap)) {
06415       const LO localNumRows = static_cast<LO> (A_rowMap->getNodeNumElements ());
06416       ArrayRCP<size_t> C_maxNumEntriesPerRow (localNumRows, 0);
06417 
06418       // Get the number of entries in each row of A.
06419       if (alpha != STS::zero ()) {
06420         for (LO localRow = 0; localRow < localNumRows; ++localRow) {
06421           const size_t A_numEntries = A.getNumEntriesInLocalRow (localRow);
06422           C_maxNumEntriesPerRow[localRow] += A_numEntries;
06423         }
06424       }
06425       // Get the number of entries in each row of B.
06426       if (beta != STS::zero ()) {
06427         for (LO localRow = 0; localRow < localNumRows; ++localRow) {
06428           const size_t B_numEntries = B.getNumEntriesInLocalRow (localRow);
06429           C_maxNumEntriesPerRow[localRow] += B_numEntries;
06430         }
06431       }
06432       // Construct the result matrix C.
06433       if (constructorSublist.is_null ()) {
06434         C = rcp (new crs_matrix_type (C_rowMap, C_maxNumEntriesPerRow,
06435                                       StaticProfile));
06436       } else {
06437         C = rcp (new crs_matrix_type (C_rowMap, C_maxNumEntriesPerRow,
06438                                       StaticProfile, constructorSublist));
06439       }
06440       // Since A and B have the same row Maps, we could add them
06441       // together all at once and merge values before we call
06442       // insertGlobalValues.  However, we don't really need to, since
06443       // we've already allocated enough space in each row of C for C
06444       // to do the merge itself.
06445     }
06446     else { // the row Maps of A and B are not the same
06447       // Construct the result matrix C.
06448       if (constructorSublist.is_null ()) {
06449         C = rcp (new crs_matrix_type (C_rowMap, 0, DynamicProfile));
06450       } else {
06451         C = rcp (new crs_matrix_type (C_rowMap, 0, DynamicProfile,
06452                                       constructorSublist));
06453       }
06454     }
06455 
06456 #ifdef HAVE_TPETRA_DEBUG
06457     TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
06458       "Tpetra::RowMatrix::add: C should not be null at this point.  "
06459       "Please report this bug to the Tpetra developers.");
06460 #endif // HAVE_TPETRA_DEBUG
06461     //
06462     // Compute C = alpha*A + beta*B.
06463     //
06464     Array<GO> ind;
06465     Array<Scalar> val;
06466 
06467     if (alpha != STS::zero ()) {
06468       const LO A_localNumRows = static_cast<LO> (A_rowMap->getNodeNumElements ());
06469       for (LO localRow = 0; localRow < A_localNumRows; ++localRow) {
06470         size_t A_numEntries = A.getNumEntriesInLocalRow (localRow);
06471         const GO globalRow = A_rowMap->getGlobalElement (localRow);
06472         if (A_numEntries > static_cast<size_t> (ind.size ())) {
06473           ind.resize (A_numEntries);
06474           val.resize (A_numEntries);
06475         }
06476         ArrayView<GO> indView = ind (0, A_numEntries);
06477         ArrayView<Scalar> valView = val (0, A_numEntries);
06478         A.getGlobalRowCopy (globalRow, indView, valView, A_numEntries);
06479 
06480         if (alpha != STS::one ()) {
06481           for (size_t k = 0; k < A_numEntries; ++k) {
06482             valView[k] *= alpha;
06483           }
06484         }
06485         C->insertGlobalValues (globalRow, indView, valView);
06486       }
06487     }
06488 
06489     if (beta != STS::zero ()) {
06490       const LO B_localNumRows = static_cast<LO> (B_rowMap->getNodeNumElements ());
06491       for (LO localRow = 0; localRow < B_localNumRows; ++localRow) {
06492         size_t B_numEntries = B.getNumEntriesInLocalRow (localRow);
06493         const GO globalRow = B_rowMap->getGlobalElement (localRow);
06494         if (B_numEntries > static_cast<size_t> (ind.size ())) {
06495           ind.resize (B_numEntries);
06496           val.resize (B_numEntries);
06497         }
06498         ArrayView<GO> indView = ind (0, B_numEntries);
06499         ArrayView<Scalar> valView = val (0, B_numEntries);
06500         B.getGlobalRowCopy (globalRow, indView, valView, B_numEntries);
06501 
06502         if (beta != STS::one ()) {
06503           for (size_t k = 0; k < B_numEntries; ++k) {
06504             valView[k] *= beta;
06505           }
06506         }
06507         C->insertGlobalValues (globalRow, indView, valView);
06508       }
06509     }
06510 
06511     if (callFillComplete) {
06512       if (fillCompleteSublist.is_null ()) {
06513         C->fillComplete (theDomainMap, theRangeMap);
06514       } else {
06515         C->fillComplete (theDomainMap, theRangeMap, fillCompleteSublist);
06516       }
06517     }
06518 
06519     return rcp_implicit_cast<row_matrix_type> (C);
06520   }
06521 
06522 
06523   template <class Scalar,
06524             class LocalOrdinal,
06525             class GlobalOrdinal,
06526             class DeviceType>
06527   void
06528   CrsMatrix<
06529     Scalar, LocalOrdinal, GlobalOrdinal,
06530     Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
06531   transferAndFillComplete (Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, node_type> > & destMat,
06532                            const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, node_type>& rowTransfer,
06533                            const Teuchos::RCP<const map_type>& domainMap,
06534                            const Teuchos::RCP<const map_type>& rangeMap,
06535                            const Teuchos::RCP<Teuchos::ParameterList>& params) const
06536   {
          // Redistribute the entries of *this into destMat according to
          // rowTransfer, then finish destMat with expertStaticFillComplete.
          //
          // destMat     [in/out] Destination matrix.  If null on input, a
          //             new matrix is created on the transfer's target Map
          //             (source Map in reverse mode).  If nonnull, its
          //             graph must be neither locally nor globally indexed,
          //             its row Map must match that Map, and checkSizes()
          //             must accept *this.
          // rowTransfer [in] Must be an Import or an Export.
          // domainMap   [in] Domain Map of the result; if null,
          //             getDomainMap() is used.
          // rangeMap    [in] Range Map of the result; if null,
          //             getRangeMap() is used.
          // params      [in/out] May be null.  Otherwise read for the bool
          //             options "Reverse Mode" and "Restrict Communicator",
          //             and for the sublist "CrsMatrix", which is handed to
          //             the constructor when destMat is created here.
          //
          // Throws std::invalid_argument if any of the checks above fail, if
          // the row Map of *this does not match the transfer's source Map
          // (target Map in reverse mode), or if the domain Map combination
          // is not one of the supported cases.
06537     using Teuchos::ArrayView;
06538     using Teuchos::ParameterList;
06539     using Teuchos::RCP;
06540     typedef LocalOrdinal LO;
06541     typedef GlobalOrdinal GO;
06542     typedef node_type NT;
06543     typedef CrsMatrix<Scalar, LO, GO, NT> this_type;
06544     typedef Vector<int, LO, GO, NT> IntVectorType;
06545 
06546     // Make sure that the input argument rowTransfer is either an
06547     // Import or an Export.  Import and Export are the only two
06548     // subclasses of Transfer that we defined, but users might
06549     // (unwisely, for now at least) decide to implement their own
06550     // subclasses.  Exclude this possibility.
06551     const import_type* xferAsImport = dynamic_cast<const import_type*> (&rowTransfer);
06552     const export_type* xferAsExport = dynamic_cast<const export_type*> (&rowTransfer);
06553     TEUCHOS_TEST_FOR_EXCEPTION(
06554       xferAsImport == NULL && xferAsExport == NULL, std::invalid_argument,
06555       "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' input "
06556       "argument must be either an Import or an Export, and its template "
06557       "parameters must match the corresponding template parameters of the "
06558       "CrsMatrix.");
06559 
06560     // FIXME (mfh 15 May 2014) Wouldn't communication still be needed,
06561     // if the source Map is not distributed but the target Map is?
06562     const bool communication_needed = rowTransfer.getSourceMap ()->isDistributed ();
06563 
06564     //
06565     // Get the caller's parameters
06566     //
06567 
06568     bool reverseMode = false; // Are we in reverse mode?
06569     bool restrictComm = false; // Do we need to restrict the communicator?
06570     RCP<ParameterList> matrixparams; // parameters for the destination matrix
06571     if (! params.is_null ()) {
06572       reverseMode = params->get ("Reverse Mode", reverseMode);
06573       restrictComm = params->get ("Restrict Communicator", restrictComm);
06574       matrixparams = sublist (params, "CrsMatrix");
06575     }
06576 
06577     // Get the new domain and range Maps.  We need some of them for
06578     // error checking, now that we have the reverseMode parameter.
06579     RCP<const map_type> MyRowMap = reverseMode ?
06580       rowTransfer.getSourceMap () : rowTransfer.getTargetMap ();
06581     RCP<const map_type> MyColMap; // create this below
06582     RCP<const map_type> MyDomainMap = ! domainMap.is_null () ?
06583       domainMap : getDomainMap ();
06584     RCP<const map_type> MyRangeMap = ! rangeMap.is_null () ?
06585       rangeMap : getRangeMap ();
06586     RCP<const map_type> BaseRowMap = MyRowMap;
06587     RCP<const map_type> BaseDomainMap = MyDomainMap;
06588 
06589     // If the user gave us a nonnull destMat, then check whether it's
06590     // "pristine."  That means that it has no entries.
06591     //
06592     // FIXME (mfh 15 May 2014) If this is not true on all processes,
06593     // then this exception test may hang.  It would be better to
06594     // forward an error flag to the next communication phase.
06595     if (! destMat.is_null ()) {
06596       // FIXME (mfh 15 May 2014): The classic Petra idiom for checking
06597       // whether a graph or matrix has no entries on the calling
06598       // process, is that it is neither locally nor globally indexed.
06599       // This may change eventually with the Kokkos refactor version
06600       // of Tpetra, so it would be better just to check the quantity
06601       // of interest directly.  Note that with the Kokkos refactor
06602       // version of Tpetra, asking for the total number of entries in
06603       // a graph or matrix that is not fill complete might require
06604       // computation (kernel launch), since it is not thread scalable
06605       // to update a count every time an entry is inserted.
06606       const bool NewFlag = ! destMat->getGraph ()->isLocallyIndexed () &&
06607         ! destMat->getGraph ()->isGloballyIndexed ();
06608       TEUCHOS_TEST_FOR_EXCEPTION(
06609         ! NewFlag, std::invalid_argument, "Tpetra::CrsMatrix::"
06610         "transferAndFillComplete: The input argument 'destMat' is only allowed "
06611         "to be nonnull, if its graph is empty (neither locally nor globally "
06612         "indexed).");
06613       // FIXME (mfh 15 May 2014) At some point, we want to change
06614       // graphs and matrices so that their DistObject Map
06615       // (this->getMap()) may differ from their row Map.  This will
06616       // make redistribution for 2-D distributions more efficient.  I
06617       // hesitate to change this check, because I'm not sure how much
06618       // the code here depends on getMap() and getRowMap() being the
06619       // same.
06620       TEUCHOS_TEST_FOR_EXCEPTION(
06621         ! destMat->getRowMap ()->isSameAs (*MyRowMap), std::invalid_argument,
06622         "Tpetra::CrsMatrix::transferAndFillComplete: The (row) Map of the "
06623         "input argument 'destMat' is not the same as the (row) Map specified "
06624         "by the input argument 'rowTransfer'.");
06625       TEUCHOS_TEST_FOR_EXCEPTION(
06626         ! destMat->checkSizes (*this), std::invalid_argument,
06627         "Tpetra::CrsMatrix::transferAndFillComplete: You provided a nonnull "
06628         "destination matrix, but checkSizes() indicates that it is not a legal "
06629         "target for redistribution from the source matrix (*this).  This "
06630         "may mean that they do not have the same dimensions.");
06631     }
06632 
06633     // If forward mode (the default), then *this's (row) Map must be
06634     // the same as the source Map of the Transfer.  If reverse mode,
06635     // then *this's (row) Map must be the same as the target Map of
06636     // the Transfer.
06637     //
06638     // FIXME (mfh 15 May 2014) At some point, we want to change graphs
06639     // and matrices so that their DistObject Map (this->getMap()) may
06640     // differ from their row Map.  This will make redistribution for
06641     // 2-D distributions more efficient.  I hesitate to change this
06642     // check, because I'm not sure how much the code here depends on
06643     // getMap() and getRowMap() being the same.
06644     TEUCHOS_TEST_FOR_EXCEPTION(
06645       ! (reverseMode || getRowMap ()->isSameAs (*rowTransfer.getSourceMap ())),
06646       std::invalid_argument, "Tpetra::CrsMatrix::transferAndFillComplete: "
06647       "rowTransfer->getSourceMap() must match this->getRowMap() in forward mode.");
06648     TEUCHOS_TEST_FOR_EXCEPTION(
06649       ! (! reverseMode || getRowMap ()->isSameAs (*rowTransfer.getTargetMap ())),
06650       std::invalid_argument, "Tpetra::CrsMatrix::transferAndFillComplete: "
06651       "rowTransfer->getTargetMap() must match this->getRowMap() in reverse mode.");
06652 
06653     // The basic algorithm here is:
06654     //
06655     // 1. Call the moral equivalent of "distor.do" to handle the import.
06656     // 2. Copy all the Imported and Copy/Permuted data into the raw
06657     //    CrsMatrix / CrsGraphData pointers, still using GIDs.
06658     // 3. Call an optimized version of MakeColMap that avoids the
06659     //    Directory lookups (since the importer knows who owns all the
06660     //    GIDs) AND reindexes to LIDs.
06661     // 4. Call expertStaticFillComplete()
06662 
06663     // Get information from the Importer
06664     const size_t NumSameIDs = rowTransfer.getNumSameIDs();
06665     ArrayView<const LO> ExportLIDs = reverseMode ?
06666       rowTransfer.getRemoteLIDs () : rowTransfer.getExportLIDs ();
06667     ArrayView<const LO> RemoteLIDs = reverseMode ?
06668       rowTransfer.getExportLIDs () : rowTransfer.getRemoteLIDs ();
06669     ArrayView<const LO> PermuteToLIDs = reverseMode ?
06670       rowTransfer.getPermuteFromLIDs () : rowTransfer.getPermuteToLIDs ();
06671     ArrayView<const LO> PermuteFromLIDs = reverseMode ?
06672       rowTransfer.getPermuteToLIDs () : rowTransfer.getPermuteFromLIDs ();
06673     Distributor& Distor = rowTransfer.getDistributor ();
06674 
06675     // Owning PIDs
06676     Teuchos::Array<int> SourcePids;
06677     Teuchos::Array<int> TargetPids;
06678     int MyPID = getComm ()->getRank ();
06679 
06680     // Temp variables for sub-communicators
06681     RCP<const map_type> ReducedRowMap, ReducedColMap,
06682       ReducedDomainMap, ReducedRangeMap;
06683     RCP<const Comm<int> > ReducedComm;
06684 
06685     // If the user gave us a null destMat, then construct the new
06686     // destination matrix.  We will replace its column Map later.
06687     if (destMat.is_null ()) {
06688       destMat = rcp (new this_type (MyRowMap, 0, StaticProfile, matrixparams));
06689     }
06690 
06691     /***************************************************/
06692     /***** 1) First communicator restriction phase ****/
06693     /***************************************************/
06694     if (restrictComm) {
06695       ReducedRowMap = MyRowMap->removeEmptyProcesses ();
06696       ReducedComm = ReducedRowMap.is_null () ? Teuchos::null : ReducedRowMap->getComm ();
06697       destMat->removeEmptyProcessesInPlace (ReducedRowMap);
06698 
06699       ReducedDomainMap = MyRowMap.getRawPtr () == MyDomainMap.getRawPtr () ?
06700         ReducedRowMap :
06701         MyDomainMap->replaceCommWithSubset (ReducedComm);
06702       ReducedRangeMap  = MyRowMap.getRawPtr () == MyRangeMap.getRawPtr () ?
06703         ReducedRowMap :
06704         MyRangeMap->replaceCommWithSubset (ReducedComm);
06705 
06706       // Reset the "my" maps
06707       MyRowMap    = ReducedRowMap;
06708       MyDomainMap = ReducedDomainMap;
06709       MyRangeMap  = ReducedRangeMap;
06710 
06711       // Update my PID, if we've restricted the communicator
06712       if (! ReducedComm.is_null ()) {
06713         MyPID = ReducedComm->getRank ();
06714       }
06715       else {
06716         MyPID = -2; // For debugging
06717       }
06718     }
06719     else {
06720       ReducedComm = MyRowMap->getComm ();
06721     }
06722 
06723     /***************************************************/
06724     /***** 2) From Tpetra::DistObject::doTransfer() ***/
06725     /***************************************************/
06726 
06727     // Get the owning PIDs
06728     RCP<const import_type> MyImporter = getGraph ()->getImporter ();
06729 
06730     if (! restrictComm && ! MyImporter.is_null () &&
06731         BaseDomainMap->isSameAs (*getDomainMap ())) {
06732       // Same domain map as source matrix
06733       //
06734       // NOTE: This won't work for restrictComm (because the Import
06735       // doesn't know the restricted PIDs), though writing an
06736       // optimized version for that case would be easy (Import an
06737       // IntVector of the new PIDs).  Might want to add this later.
06738       Import_Util::getPids (*MyImporter, SourcePids, false);
06739     }
06740     else if (MyImporter.is_null () && BaseDomainMap->isSameAs (*getDomainMap ())) {
06741       // Matrix has no off-process entries
06742       SourcePids.resize (getColMap ()->getNodeNumElements ());
06743       SourcePids.assign (getColMap ()->getNodeNumElements (), MyPID);
06744     }
06745     else if (BaseDomainMap->isSameAs (*BaseRowMap) &&
06746              getDomainMap ()->isSameAs (*getRowMap ())) {
06747       // We can use the rowTransfer + SourceMatrix's Import to find out who owns what.
            //
            // Build the target-side vector on BaseDomainMap, not on the raw
            // 'domainMap' argument.  BaseDomainMap is the same object as
            // 'domainMap' whenever the caller passed a nonnull one, but the
            // argument itself may be null (meaning "use getDomainMap()"), so
            // passing it on would hand the Vector constructor a null Map.
06748       IntVectorType TargetRow_pids (BaseDomainMap);
06749       IntVectorType SourceRow_pids (getRowMap ());
06750       IntVectorType SourceCol_pids (getColMap ());
06751 
06752       TargetRow_pids.putScalar (MyPID);
06753       if (! reverseMode && xferAsImport != NULL) {
06754         SourceRow_pids.doExport (TargetRow_pids, *xferAsImport, INSERT);
06755       }
06756       else if (reverseMode && xferAsExport != NULL) {
06757         SourceRow_pids.doExport (TargetRow_pids, *xferAsExport, INSERT);
06758       }
06759       else if (! reverseMode && xferAsExport != NULL) {
06760         SourceRow_pids.doImport (TargetRow_pids, *xferAsExport, INSERT);
06761       }
06762       else if (reverseMode && xferAsImport != NULL) {
06763         SourceRow_pids.doImport (TargetRow_pids, *xferAsImport, INSERT);
06764       }
06765       else {
06766         TEUCHOS_TEST_FOR_EXCEPTION(
06767           true, std::logic_error, "Tpetra::CrsMatrix::"
06768           "transferAndFillComplete: Should never get here!  "
06769           "Please report this bug to a Tpetra developer.");
06770       }
06771       SourceCol_pids.doImport (SourceRow_pids, *MyImporter, INSERT);
06772       SourcePids.resize (getColMap ()->getNodeNumElements ());
06773       SourceCol_pids.get1dCopy (SourcePids ());
06774     }
06775     else {
06776       TEUCHOS_TEST_FOR_EXCEPTION(
06777         true, std::invalid_argument, "Tpetra::CrsMatrix::"
06778         "transferAndFillComplete: This method only allows either domainMap == "
06779         "getDomainMap (), or (domainMap == rowTransfer.getTargetMap () and "
06780         "getDomainMap () == getRowMap ()).");
06781     }
06782 
06783     // Tpetra-specific stuff
06784     //
06785     // FIXME (mfh 15 May 2014) This should work fine if CrsMatrix
06786     // inherits from DistObject (in which case all arrays that get
06787     // resized here are Teuchos::Array), but it won't work if
06788     // CrsMatrix inherits from DistObjectKA (in which case all arrays
06789     // that get resized here are Kokkos::View).  In the latter case,
06790     // imports_ and numExportPacketsPerLID_ each have only a device
06791     // view, but numImportPacketsPerLID_ has a device view and a host
06792     // view (host_numImportPacketsPerLID_).
06793     //
06794     // Currently, CrsMatrix inherits from DistObject, not
06795     // DistObjectKA, so the code below should be fine for the Kokkos
06796     // refactor version of CrsMatrix.
06797     //
06798     // For this and for all other cases in this function that want to
06799     // resize the DistObject's communication arrays, it would make
06800     // sense to give DistObject (and DistObjectKA) methods for
06801     // resizing that don't expose the details of whether these are
06802     // Teuchos::Array or Kokkos::View.
06803     size_t constantNumPackets = destMat->constantNumberOfPackets ();
06804     if (constantNumPackets == 0) {
06805       destMat->numExportPacketsPerLID_old_.resize (ExportLIDs.size ());
06806       destMat->numImportPacketsPerLID_old_.resize (RemoteLIDs.size ());
06807     }
06808     else {
06809       // There are a constant number of packets per element.  We
06810       // already know (from the number of "remote" (incoming)
06811       // elements) how many incoming elements we expect, so we can
06812       // resize the buffer accordingly.
06813       const size_t rbufLen = RemoteLIDs.size() * constantNumPackets;
06814       if (static_cast<size_t> (destMat->imports_old_.size ()) != rbufLen) {
06815         destMat->imports_old_.resize (rbufLen);
06816       }
06817     }
06818 
06819     // Pack & Prepare w/ owning PIDs
06820     //
06821     // FIXME (mfh 15 May 2014) This should work fine if CrsMatrix
06822     // inherits from DistObject (in which case all arrays that get
06823     // passed in here are Teuchos::Array), but it won't work if
06824     // CrsMatrix inherits from DistObjectKA (in which case all arrays
06825     // that get passed in here are Kokkos::View).  In the latter case,
06826     // exports_ and numExportPacketsPerLID_ each have only a device
06827     // view.
06828     //
06829     // Currently, CrsMatrix inherits from DistObject, not
06830     // DistObjectKA, so the code below should be fine for the Kokkos
06831     // refactor version of CrsMatrix.
06832     Import_Util::packAndPrepareWithOwningPIDs (*this, ExportLIDs,
06833                                                destMat->exports_old_,
06834                                                destMat->numExportPacketsPerLID_old_ (),
06835                                                constantNumPackets, Distor,
06836                                                SourcePids);
06837 
06838     // Do the exchange of remote data.
06839     //
06840     // FIXME (mfh 15 May 2014) This should work fine if CrsMatrix
06841     // inherits from DistObject (in which case all arrays that get
06842     // passed in here are Teuchos::Array), but it won't work if
06843     // CrsMatrix inherits from DistObjectKA (in which case all arrays
06844     // that get passed in here are Kokkos::View).
06845     //
06846     // In the latter case, imports_, exports_, and
06847     // numExportPacketsPerLID_ each have only a device view.
06848     // numImportPacketsPerLIDs_ is a device view, and also has a host
06849     // view (host_numImportPacketsPerLID_).
06850     if (communication_needed) {
06851       if (reverseMode) {
06852         if (constantNumPackets == 0) { // variable number of packets per LID
06853           Distor.doReversePostsAndWaits (destMat->numExportPacketsPerLID_old_ ().getConst (), 1,
06854                                          destMat->numImportPacketsPerLID_old_ ());
06855           size_t totalImportPackets = 0;
06856           for (Array_size_type i = 0; i < destMat->numImportPacketsPerLID_old_.size (); ++i) {
06857             totalImportPackets += destMat->numImportPacketsPerLID_old_[i];
06858           }
06859           destMat->imports_old_.resize (totalImportPackets);
06860           Distor.doReversePostsAndWaits (destMat->exports_old_ ().getConst (),
06861                                          destMat->numExportPacketsPerLID_old_ (),
06862                                          destMat->imports_old_ (),
06863                                          destMat->numImportPacketsPerLID_old_ ());
06864         }
06865         else { // constant number of packets per LID
06866           Distor.doReversePostsAndWaits (destMat->exports_old_ ().getConst (),
06867                                          constantNumPackets,
06868                                          destMat->imports_old_ ());
06869         }
06870       }
06871       else { // forward mode (the default)
06872         if (constantNumPackets == 0) { // variable number of packets per LID
06873           Distor.doPostsAndWaits (destMat->numExportPacketsPerLID_old_ ().getConst (), 1,
06874                                   destMat->numImportPacketsPerLID_old_ ());
06875           size_t totalImportPackets = 0;
06876           for (Array_size_type i = 0; i < destMat->numImportPacketsPerLID_old_.size (); ++i) {
06877             totalImportPackets += destMat->numImportPacketsPerLID_old_[i];
06878           }
06879           destMat->imports_old_.resize (totalImportPackets);
06880           Distor.doPostsAndWaits (destMat->exports_old_ ().getConst (),
06881                                   destMat->numExportPacketsPerLID_old_ (),
06882                                   destMat->imports_old_ (),
06883                                   destMat->numImportPacketsPerLID_old_ ());
06884         }
06885         else { // constant number of packets per LID
06886           Distor.doPostsAndWaits (destMat->exports_old_ ().getConst (),
06887                                   constantNumPackets,
06888                                   destMat->imports_old_ ());
06889         }
06890       }
06891     }
06892 
06893     /*********************************************************************/
06894     /**** 3) Copy all of the Same/Permute/Remote data into CSR_arrays ****/
06895     /*********************************************************************/
06896 
06897     // FIXME (mfh 15 May 2014) This should work fine if CrsMatrix
06898     // inherits from DistObject (in which case all arrays that get
06899     // passed in here are Teuchos::Array), but it won't work if
06900     // CrsMatrix inherits from DistObjectKA (in which case all arrays
06901     // that get passed in here are Kokkos::View).
06902     //
06903     // In the latter case, imports_ only has a device view.
06904     // numImportPacketsPerLIDs_ is a device view, and also has a host
06905     // view (host_numImportPacketsPerLID_).
06906     size_t mynnz =
06907       Import_Util::unpackAndCombineWithOwningPIDsCount (*this, RemoteLIDs,
06908                                                         destMat->imports_old_ (),
06909                                                         destMat->numImportPacketsPerLID_old_ (),
06910                                                         constantNumPackets, Distor, INSERT,
06911                                                         NumSameIDs, PermuteToLIDs,
06912                                                         PermuteFromLIDs);
06913     size_t N = BaseRowMap->getNodeNumElements ();
06914 
06915     // Allocations
06916     ArrayRCP<size_t> CSR_rowptr(N+1);
06917     ArrayRCP<GO> CSR_colind_GID;
06918     ArrayRCP<LO> CSR_colind_LID;
06919     ArrayRCP<Scalar> CSR_vals;
06920     CSR_colind_GID.resize (mynnz);
06921     CSR_vals.resize (mynnz);
06922 
06923     // If LO and GO are the same, we can reuse memory when
06924     // converting the column indices from global to local indices.
06925     if (typeid (LO) == typeid (GO)) {
06926       CSR_colind_LID = Teuchos::arcp_reinterpret_cast<LO> (CSR_colind_GID);
06927     }
06928     else {
06929       CSR_colind_LID.resize (mynnz);
06930     }
06931 
06932     // FIXME (mfh 15 May 2014) This should work fine if CrsMatrix
06933     // inherits from DistObject (in which case all arrays that get
06934     // passed in here are Teuchos::Array), but it won't work if
06935     // CrsMatrix inherits from DistObjectKA (in which case all arrays
06936     // that get passed in here are Kokkos::View).
06937     //
06938     // In the latter case, imports_ only has a device view.
06939     // numImportPacketsPerLIDs_ is a device view, and also has a host
06940     // view (host_numImportPacketsPerLID_).
06941     //
06942     // FIXME (mfh 15 May 2014) Why can't we abstract this out as an
06943     // unpackAndCombine method on a "CrsArrays" object?  This passing
06944     // in a huge list of arrays is icky.  Can't we have a bit of an
06945     // abstraction?  Implementing a concrete DistObject subclass only
06946     // takes five methods.
06947     Import_Util::unpackAndCombineIntoCrsArrays (*this, RemoteLIDs, destMat->imports_old_ (),
06948                                                 destMat->numImportPacketsPerLID_old_ (),
06949                                                 constantNumPackets, Distor, INSERT, NumSameIDs,
06950                                                 PermuteToLIDs, PermuteFromLIDs, N, mynnz, MyPID,
06951                                                 CSR_rowptr (), CSR_colind_GID (), CSR_vals (),
06952                                                 SourcePids (), TargetPids);
06953 
06954     /**************************************************************/
06955     /**** 4) Call Optimized MakeColMap w/ no Directory Lookups ****/
06956     /**************************************************************/
06957 
06958     // Call an optimized version of makeColMap that avoids the
06959     // Directory lookups (since the Import object knows who owns all
06960     // the GIDs).
06961     Teuchos::Array<int> RemotePids;
06962     Import_Util::lowCommunicationMakeColMapAndReindex (CSR_rowptr (),
06963                                                        CSR_colind_LID (),
06964                                                        CSR_colind_GID (),
06965                                                        BaseDomainMap,
06966                                                        TargetPids, RemotePids,
06967                                                        MyColMap);
06968 
06969     /*******************************************************/
06970     /**** 5) Second communicator restriction phase      ****/
06971     /*******************************************************/
06972     if (restrictComm) {
06973       ReducedColMap = (MyRowMap.getRawPtr () == MyColMap.getRawPtr ()) ?
06974         ReducedRowMap :
06975         MyColMap->replaceCommWithSubset (ReducedComm);
06976       MyColMap = ReducedColMap; // Reset the "my" maps
06977     }
06978 
06979     // Replace the col map
06980     destMat->replaceColMap (MyColMap);
06981 
06982     // Short circuit if the processor is no longer in the communicator
06983     //
06984     // NOTE: Epetra modifies all "removed" processes so they
06985     // have a dummy (serial) Map that doesn't touch the original
06986     // communicator.  Duplicating that here might be a good idea.
06987     if (ReducedComm.is_null ()) {
06988       return;
06989     }
06990 
06991     /***************************************************/
06992     /**** 6) Sort                                   ****/
06993     /***************************************************/
          // Each reachable branch below sorts the rows itself (plain sort on
          // the Import-like path, sort-and-merge on the Export-like path), so
          // no separate unconditional sort pass is done beforehand.
06997     if ((! reverseMode && xferAsImport != NULL) ||
06998         (reverseMode && xferAsExport != NULL)) {
06999       Import_Util::sortCrsEntries (CSR_rowptr (),
07000                                    CSR_colind_LID (),
07001                                    CSR_vals ());
07002     }
07003     else if ((! reverseMode && xferAsExport != NULL) ||
07004              (reverseMode && xferAsImport != NULL)) {
07005       Import_Util::sortAndMergeCrsEntries (CSR_rowptr (),
07006                                            CSR_colind_LID (),
07007                                            CSR_vals ());
            // Merging may have reduced the number of stored entries; shrink
            // the index and value arrays to the new total in CSR_rowptr[N].
07008       if (CSR_rowptr[N] != mynnz) {
07009         CSR_colind_LID.resize (CSR_rowptr[N]);
07010         CSR_vals.resize (CSR_rowptr[N]);
07011       }
07012     }
07013     else {
07014       TEUCHOS_TEST_FOR_EXCEPTION(
07015         true, std::logic_error, "Tpetra::CrsMatrix::"
07016         "transferAndFillComplete: Should never get here!  "
07017         "Please report this bug to a Tpetra developer.");
07018     }
07019     /***************************************************/
07020     /**** 7) Reset the colmap and the arrays        ****/
07021     /***************************************************/
07022 
07023     // Call constructor for the new matrix (restricted as needed)
07024     //
07025     // NOTE (mfh 15 May 2014) This should work fine for the Kokkos
07026     // refactor version of CrsMatrix, though it reserves the right to
07027     // make a deep copy of the arrays.
07028     destMat->setAllValues (CSR_rowptr, CSR_colind_LID, CSR_vals);
07029 
07030     /***************************************************/
07031     /**** 8) Build Importer & Call ESFC             ****/
07032     /***************************************************/
07033     // Pre-build the importer using the existing PIDs
07034     RCP<import_type> MyImport = rcp (new import_type (MyDomainMap, MyColMap, RemotePids));
07035     destMat->expertStaticFillComplete (MyDomainMap, MyRangeMap, MyImport);
07036   }
07037 
07038 
07039   template <class Scalar, class LocalOrdinal,
07041             class GlobalOrdinal, class DeviceType>
07043   void
07044   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal,
07046             Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
07047   importAndFillComplete (
07048     Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, node_type> >& destMatrix,
07049     const import_type& importer,
07050     const Teuchos::RCP<const map_type>& domainMap,
07051     const Teuchos::RCP<const map_type>& rangeMap,
07052     const Teuchos::RCP<Teuchos::ParameterList>& params) const
          {
            // Import flavor of transferAndFillComplete: the Import object
            // serves as the row transfer, and every other argument is
            // forwarded unchanged.
07053     this->transferAndFillComplete (destMatrix, importer,
                                         domainMap, rangeMap, params);
07054   }
07055 
07056 
07057   template <class Scalar, class LocalOrdinal,
07059             class GlobalOrdinal, class DeviceType>
07061   void
07062   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal,
07064             Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
07065   exportAndFillComplete (
07066     Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, node_type> >& destMatrix,
07067     const export_type& exporter,
07068     const Teuchos::RCP<const map_type>& domainMap,
07069     const Teuchos::RCP<const map_type>& rangeMap,
07070     const Teuchos::RCP<Teuchos::ParameterList>& params) const
          {
            // Export flavor of transferAndFillComplete: the Export object
            // serves as the row transfer, and every other argument is
            // forwarded unchanged.
07071     this->transferAndFillComplete (destMatrix, exporter,
                                         domainMap, rangeMap, params);
07072   }
07073 
07074 } // namespace Tpetra
07075 
07076 #endif
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Defines