Tpetra Matrix/Vector Services  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Defines
Tpetra_KokkosRefactor_MultiVector_def.hpp
00001 // @HEADER
00002 // ***********************************************************************
00003 //
00004 //          Tpetra: Templated Linear Algebra Services Package
00005 //                 Copyright (2008) Sandia Corporation
00006 //
00007 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00008 // the U.S. Government retains certain rights in this software.
00009 //
00010 // Redistribution and use in source and binary forms, with or without
00011 // modification, are permitted provided that the following conditions are
00012 // met:
00013 //
00014 // 1. Redistributions of source code must retain the above copyright
00015 // notice, this list of conditions and the following disclaimer.
00016 //
00017 // 2. Redistributions in binary form must reproduce the above copyright
00018 // notice, this list of conditions and the following disclaimer in the
00019 // documentation and/or other materials provided with the distribution.
00020 //
00021 // 3. Neither the name of the Corporation nor the names of the
00022 // contributors may be used to endorse or promote products derived from
00023 // this software without specific prior written permission.
00024 //
00025 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00026 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00028 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00029 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00030 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00031 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00032 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00033 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00034 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00035 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00036 //
00037 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
00038 //
00039 // ************************************************************************
00040 // @HEADER
00041 
00042 #ifndef TPETRA_KOKKOS_REFACTOR_MULTIVECTOR_DEF_HPP
00043 #define TPETRA_KOKKOS_REFACTOR_MULTIVECTOR_DEF_HPP
00044 
00045 #include <Tpetra_KokkosRefactor_Details_MultiVectorDistObjectKernels.hpp>
00046 
00047 #ifdef DOXYGEN_USE_ONLY
00048 #  include "Tpetra_KokkosRefactor_MultiVector_decl.hpp"
00049 #endif
00050 
00051 #include <KokkosCompat_View.hpp>
00052 #include <Kokkos_MV.hpp>
00053 #include <Kokkos_MV_GEMM.hpp>
00054 #include <Kokkos_Random.hpp>
00055 
00056 namespace Tpetra {
00057 
00058   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00059   bool
00060   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00061   vectorIndexOutOfRange(size_t VectorIndex) const {
00062     return (VectorIndex < 1 && VectorIndex != 0) || VectorIndex >= getNumVectors();
00063   }
00064 
00065   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00066   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00067   MultiVector () :
00068     base_type (Teuchos::rcp (new Map<LocalOrdinal, GlobalOrdinal, node_type> ()))
00069   {}
00070 
00071   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00072   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00073   MultiVector (const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& map,
00074                size_t NumVectors,
00075                bool zeroOut) : /* default is true */
00076     base_type (map)
00077   {
00078     using Teuchos::ArrayRCP;
00079     using Teuchos::RCP;
00080 
00081     (void) zeroOut; // View allocation does first touch automatically.
00082 
00083     TEUCHOS_TEST_FOR_EXCEPTION(NumVectors < 1, std::invalid_argument,
00084       "Tpetra::MultiVector::MultiVector(): NumVectors must be strictly positive.");
00085     const size_t myLen = getLocalLength();
00086     if (myLen > 0) {
00087       RCP<Node> node = map->getNode();
00088       // On host-type Kokkos Nodes, allocBuffer() just calls the
00089       // one-argument version of arcp to allocate memory.  This should
00090       // not fill the memory by default, otherwise we would lose the
00091       // first-touch allocation optimization.
00092 
00093       // Allocate a DualView from new Kokkos, wrap its device data into an ArrayRCP
00094       view_ = dual_view_type("MV::dual_view",myLen,NumVectors);
00095     }
00096     else {
00097       view_ = dual_view_type("MV::dual_view",0,NumVectors);
00098     }
00099 
00100     origView_ = view_;
00101   }
00102 
00103   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00104   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00105   MultiVector (const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >& source) :
00106     base_type (source),
00107     view_ (source.view_),
00108     origView_ (source.origView_),
00109     whichVectors_ (source.whichVectors_)
00110   {}
00111 
00112 
00113   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00114   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00115   MultiVector (const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >& source,
00116                const Teuchos::DataAccess copyOrView) :
00117     base_type (source),
00118     view_ (source.view_),
00119     origView_ (source.origView_),
00120     whichVectors_ (source.whichVectors_)
00121   {
00122     if (copyOrView == Teuchos::Copy) {
00123       // Reuse the conveniently already existing function that creates
00124       // a deep copy.
00125       MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,node_type> cpy =
00126         createCopy (source);
00127       this->view_ = cpy.view_;
00128       this->origView_ = cpy.origView_;
00129       this->whichVectors_ = cpy.whichVectors_;
00130     }
00131     else if (copyOrView == Teuchos::View) {
00132     }
00133     else {
00134       TEUCHOS_TEST_FOR_EXCEPTION(
00135         true, std::invalid_argument, "Tpetra::MultiVector copy constructor: "
00136         "The second argument 'copyOrView' has an invalid value " << copyOrView
00137         << ".  Valid values include Teuchos::Copy = " << Teuchos::Copy <<
00138         " and Teuchos::View = " << Teuchos::View << ".");
00139     }
00140   }
00141 
00142   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00143   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00144   MultiVector (const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& map,
00145                const dual_view_type& view) :
00146     base_type (map),
00147     view_ (view),
00148     origView_ (view)
00149   {
00150     using Teuchos::ArrayRCP;
00151     using Teuchos::RCP;
00152     const char tfecfFuncName[] = "Tpetra::MultiVector(map,view)";
00153 
00154     // Get stride of view: if second dimension is 0, the
00155     // stride might be 0, so take view_dimension instead.
00156     size_t stride[8];
00157     origView_.stride (stride);
00158     const size_t LDA = (origView_.dimension_1 () > 1) ? stride[1] : origView_.dimension_0 ();
00159     const size_t numVecs = view_.dimension_1 ();
00160 
00161     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numVecs < 1, std::invalid_argument,
00162       ": numVecs must be strictly positive, but you specified numVecs = "
00163       << numVecs << ".");
00164     const size_t myLen = getLocalLength ();
00165     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(LDA < myLen, std::invalid_argument,
00166       ": LDA must be large enough to accomodate the local entries.");
00167   }
00168 
00169 
00170   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00171   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00172   MultiVector (const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& map,
00173                const dual_view_type& view,
00174                const dual_view_type& origView) :
00175     base_type (map),
00176     view_ (view),
00177     origView_ (origView)
00178   {
00179     using Teuchos::ArrayRCP;
00180     using Teuchos::RCP;
00181     const char tfecfFuncName[] = "Tpetra::MultiVector(map,view,origView)";
00182 
00183     // Get stride of view: if second dimension is 0, the
00184     // stride might be 0, so take view_dimension instead.
00185     size_t stride[8];
00186     origView_.stride (stride);
00187     const size_t LDA = (origView_.dimension_1 () > 1) ? stride[1] : origView_.dimension_0 ();
00188     const size_t numVecs = view_.dimension_1 ();
00189 
00190     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numVecs < 1, std::invalid_argument,
00191       ": numVecs must be strictly positive, but you specified numVecs = "
00192       << numVecs << ".");
00193     const size_t myLen = getLocalLength ();
00194     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(LDA < myLen, std::invalid_argument,
00195       ": LDA must be large enough to accomodate the local entries.");
00196   }
00197 
00198 
00199   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00200   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00201   MultiVector (const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& map,
00202                const dual_view_type& view,
00203                const Teuchos::ArrayView<const size_t>& whichVectors) :
00204     base_type (map),
00205     view_ (view),
00206     origView_ (view),
00207     whichVectors_ (whichVectors.begin (), whichVectors.end ())
00208   {
00209     using Kokkos::ALL;
00210     using Kokkos::subview;
00211     using Teuchos::ArrayRCP;
00212     using Teuchos::RCP;
00213     const char tfecfFuncName[] = "MultiVector(map,view,whichVectors)";
00214 
00215     // Get stride of view: if second dimension is 0, the
00216     // stride might be 0, so take view_dimension instead.
00217     size_t stride[8];
00218     origView_.stride (stride);
00219     const size_t LDA = (origView_.dimension_1 () > 1) ? stride[1] : origView_.dimension_0 ();
00220     size_t numVecs = view_.dimension_1 ();
00221 
00222     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numVecs < 1, std::invalid_argument,
00223       ": numVecs must be strictly positive, but you specified numVecs = "
00224       << numVecs << ".");
00225     const size_t myLen = getLocalLength();
00226     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(LDA < myLen, std::invalid_argument,
00227       ": LDA must be large enough to accomodate the local entries.");
00228 
00229     if (whichVectors.size () == 1) {
00230       // If whichVectors has only one entry, we don't need to bother
00231       // with nonconstant stride.  Just shift the view over so it
00232       // points to the desired column.
00233       //
00234       // NOTE (mfh 10 May 2014) This is a special case where we set
00235       // origView_ just to view that one column, not all of the
00236       // original columns.  This ensures that the use of origView_ in
00237       // offsetView works correctly.
00238       view_ = subview<dual_view_type> (view_, ALL (), whichVectors[0]);
00239       origView_ = subview<dual_view_type> (origView_, ALL (), whichVectors[0]);
00240       numVecs = 1;
00241       // whichVectors_.size() == 0 means "constant stride."
00242       whichVectors_.clear ();
00243     }
00244   }
00245 
00246   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00247   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00248   MultiVector (const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& map,
00249                const dual_view_type& view,
00250                const dual_view_type& origView,
00251                const Teuchos::ArrayView<const size_t>& whichVectors) :
00252     base_type (map),
00253     view_ (view),
00254     origView_ (origView),
00255     whichVectors_ (whichVectors.begin (), whichVectors.end ())
00256   {
00257     using Kokkos::ALL;
00258     using Kokkos::subview;
00259     using Teuchos::ArrayRCP;
00260     using Teuchos::RCP;
00261     const char tfecfFuncName[] = "MultiVector(map,view,whichVectors)";
00262 
00263     // Get stride of view: if second dimension is 0, the
00264     // stride might be 0, so take view_dimension instead.
00265     size_t stride[8];
00266     origView_.stride (stride);
00267     const size_t LDA = (origView_.dimension_1 () > 1) ? stride[1] : origView_.dimension_0 ();
00268     size_t numVecs = view_.dimension_1 ();
00269 
00270     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(numVecs < 1, std::invalid_argument,
00271       ": numVecs must be strictly positive, but you specified numVecs = "
00272       << numVecs << ".");
00273     const size_t myLen = getLocalLength();
00274     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(LDA < myLen, std::invalid_argument,
00275       ": LDA must be large enough to accomodate the local entries.");
00276 
00277     if (whichVectors.size () == 1) {
00278       // If whichVectors has only one entry, we don't need to bother
00279       // with nonconstant stride.  Just shift the view over so it
00280       // points to the desired column.
00281       //
00282       // NOTE (mfh 10 May 2014) This is a special case where we set
00283       // origView_ just to view that one column, not all of the
00284       // original columns.  This ensures that the use of origView_ in
00285       // offsetView works correctly.
00286       view_ = subview<dual_view_type> (view_, ALL (), whichVectors[0]);
00287       origView_ = subview<dual_view_type> (origView_, ALL (), whichVectors[0]);
00288       numVecs = 1;
00289       // whichVectors_.size() == 0 means "constant stride."
00290       whichVectors_.clear ();
00291     }
00292   }
00293 
00294   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00295   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00296   MultiVector (const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& map,
00297                const Teuchos::ArrayView<const Scalar>& data,
00298                const size_t LDA,
00299                const size_t numVecs) :
00300     base_type (map)
00301   {
00302     // Deep copy constructor, constant stride (NO whichVectors_).
00303     // There is no need for a deep copy constructor with nonconstant stride.
00304 
00305     const char tfecfFuncName[] = "MultiVector(map,data,LDA,numVecs)";
00306     const size_t numRows = this->getLocalLength ();
00307     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(LDA < numRows, std::runtime_error,
00308       ": LDA = " << LDA << " < numRows = " << numRows << ".");
00309 
00310     view_ = dual_view_type ("MV::view_", numRows, numVecs);
00311     for (size_t i = 0; i < numRows; ++i) {
00312       for (size_t j = 0; j < numVecs; ++j) {
00313         view_.h_view(i,j) = data[j*LDA+i];
00314       }
00315     }
00316     view_.template modify<typename dual_view_type::host_mirror_device_type> ();
00317     origView_ = view_;
00318   }
00319 
00320   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00321   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00322   MultiVector (const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& map,
00323                const Teuchos::ArrayView<const ArrayView<const Scalar> >& ArrayOfPtrs,
00324                const size_t NumVectors) :
00325     base_type (map)
00326   {
00327     using Teuchos::ArrayRCP;
00328     using Teuchos::ArrayView;
00329     using Teuchos::RCP;
00330     const char tfecfFuncName[] = "MultiVector(map,ArrayOfPtrs,NumVectors)";
00331 
00332     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00333       NumVectors < 1 || NumVectors != static_cast<size_t>(ArrayOfPtrs.size()),
00334       std::runtime_error,
00335       ": ArrayOfPtrs.size() must be strictly positive and as large as ArrayOfPtrs.");
00336     const size_t myLen = getLocalLength ();
00337     view_ = dual_view_type ("MV::view_", myLen, NumVectors);
00338 
00339     // TODO: write a functor and use parallel_for.
00340 
00341     for (size_t i = 0; i < myLen; ++i) {
00342       for (size_t j = 0; j < NumVectors; ++j) {
00343         view_.h_view(i,j) = ArrayOfPtrs[j][i];
00344       }
00345     }
00346     view_.template modify<typename dual_view_type::t_host::device_type> ();
00347 
00348     origView_ = view_;
00349   }
00350 
00351   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00352   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00353   ~MultiVector () {}
00354 
00355   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00356   bool MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00357   isConstantStride () const {
00358     return whichVectors_.empty ();
00359   }
00360 
00361   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00362   size_t
00363   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00364   getLocalLength () const
00365   {
00366     if (this->getMap ().is_null ()) { // possible, due to replaceMap().
00367       return static_cast<size_t> (0);
00368     } else {
00369       return this->getMap ()->getNodeNumElements ();
00370     }
00371   }
00372 
00373   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00374   global_size_t
00375   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00376   getGlobalLength () const
00377   {
00378     if (this->getMap ().is_null ()) { // possible, due to replaceMap().
00379       return static_cast<size_t> (0);
00380     } else {
00381       return this->getMap ()->getGlobalNumElements ();
00382     }
00383   }
00384 
00385   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00386   size_t
00387   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00388   getStride () const
00389   {
00390     if (isConstantStride ()) {
00391       // Get stride of view: if second dimension is 0, the
00392       // stride might be 0, so take view_dimension instead.
00393       size_t stride[8];
00394       origView_.stride (stride);
00395       const size_t LDA = (origView_.dimension_1 () > 1) ? stride[1] : origView_.dimension_0 ();
00396       return LDA;
00397     }
00398     else {
00399       return static_cast<size_t> (0);
00400     }
00401   }
00402 
00403 
00404   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00405   bool
00406   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00407   checkSizes (const SrcDistObject& sourceObj)
00408   {
00409     // Check whether the source object is a MultiVector.  If not, then
00410     // we can't even compare sizes, so it's definitely not OK to
00411     // Import or Export from it.
00412     typedef MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> this_type;
00413     const this_type* src = dynamic_cast<const this_type*> (&sourceObj);
00414     if (src == NULL) {
00415       return false;
00416     } else {
00417       // The target of the Import or Export calls checkSizes() in
00418       // DistObject::doTransfer().  By that point, we've already
00419       // constructed an Import or Export object using the two
00420       // multivectors' Maps, which means that (hopefully) we've
00421       // already checked other attributes of the multivectors.  Thus,
00422       // all we need to do here is check the number of columns.
00423       return src->getNumVectors () == this->getNumVectors ();
00424     }
00425   }
00426 
00427   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00428   size_t
00429   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00430   constantNumberOfPackets () const {
00431     return this->getNumVectors ();
00432   }
00433 
00434   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00435   void
00436   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00437   copyAndPermuteNew (
00438     const SrcDistObject& sourceObj,
00439     size_t numSameIDs,
00440     const Kokkos::View<const LocalOrdinal*, device_type> &permuteToLIDs,
00441     const Kokkos::View<const LocalOrdinal*, device_type> &permuteFromLIDs)
00442   {
00443     using Teuchos::ArrayRCP;
00444     using Teuchos::ArrayView;
00445     using Teuchos::RCP;
00446     using Kokkos::Compat::getKokkosViewDeepCopy;
00447     typedef MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > MV;
00448     //typedef typename ArrayView<const LocalOrdinal>::size_type size_type; // unused
00449     const char tfecfFuncName[] = "copyAndPermute";
00450 
00451     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00452       permuteToLIDs.size() != permuteFromLIDs.size(), std::runtime_error,
00453       ": permuteToLIDs and permuteFromLIDs must have the same size."
00454       << std::endl << "permuteToLIDs.size() = " << permuteToLIDs.size ()
00455       << " != permuteFromLIDs.size() = " << permuteFromLIDs.size () << ".");
00456 
00457     // We've already called checkSizes(), so this cast must succeed.
00458     const MV& sourceMV = dynamic_cast<const MV&> (sourceObj);
00459 
00460     const size_t numCols = this->getNumVectors ();
00461 
00462     // TODO (mfh 15 Sep 2013) When we replace
00463     // KokkosClassic::MultiVector with a Kokkos::View, there are two
00464     // ways to copy the data:
00465     //
00466     // 1. Get a (sub)view of each column and call deep_copy on that.
00467     // 2. Write a custom kernel to copy the data.
00468     //
00469     // The first is easier, but the second might be more performant in
00470     // case we decide to use layouts other than LayoutLeft.  It might
00471     // even make sense to hide whichVectors_ in an entirely new layout
00472     // for Kokkos Views.
00473 
00474     // Copy rows [0, numSameIDs-1] of the local multivectors.
00475     //
00476     // For GPU Nodes: All of this happens using device pointers; this
00477     // does not require host views of either source or destination.
00478     //
00479     // Note (ETP 2 Jul 2014)  We need to always copy one column at a
00480     // time, even when both multivectors are constant-stride, since
00481     // deep_copy between strided subviews with more than one column
00482     // doesn't currently work.
00483     if (numSameIDs > 0) {
00484       const std::pair<size_t, size_t> rows( 0, numSameIDs );
00485       for (size_t j = 0; j < numCols; ++j) {
00486         const size_t dstCol =
00487           isConstantStride() ? j : whichVectors_[j];
00488         const size_t srcCol =
00489           sourceMV.isConstantStride() ? j : sourceMV.whichVectors_[j];
00490         dual_view_type dst_j =
00491           Kokkos::subview<dual_view_type>( view_, rows, dstCol );
00492         dual_view_type src_j =
00493           Kokkos::subview<dual_view_type>( sourceMV.view_, rows, srcCol );
00494         Kokkos::deep_copy( dst_j, src_j ); // Copy src_j into dest_j
00495       }
00496     }
00497 
00498     // For the remaining GIDs, execute the permutations.  This may
00499     // involve noncontiguous access of both source and destination
00500     // vectors, depending on the LID lists.
00501     //
00502     // FIXME (mfh 20 June 2012) For an Export with duplicate GIDs on
00503     // the same process, this merges their values by replacement of
00504     // the last encountered GID, not by the specified merge rule
00505     // (such as ADD).
00506 
00507     // If there are no permutations, we are done
00508     if (permuteFromLIDs.size() == 0 || permuteToLIDs.size() == 0)
00509       return;
00510 
00511     if (this->isConstantStride ()) {
00512       KokkosRefactor::Details::permute_array_multi_column(
00513         getKokkosView(),
00514         sourceMV.getKokkosView(),
00515         permuteToLIDs,
00516         permuteFromLIDs,
00517         numCols);
00518     }
00519     else {
00520       KokkosRefactor::Details::permute_array_multi_column_variable_stride(
00521         getKokkosView(),
00522         sourceMV.getKokkosView(),
00523         permuteToLIDs,
00524         permuteFromLIDs,
00525         getKokkosViewDeepCopy<device_type> (whichVectors_ ()),
00526         getKokkosViewDeepCopy<device_type> (sourceMV.whichVectors_ ()),
00527         numCols);
00528     }
00529   }
00530 
00531   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00532   void
00533   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00534   packAndPrepareNew (
00535     const SrcDistObject& sourceObj,
00536     const Kokkos::View<const LocalOrdinal*, device_type> &exportLIDs,
00537     Kokkos::View<Scalar*, device_type> &exports,
00538     const Kokkos::View<size_t*, device_type> &numExportPacketsPerLID,
00539     size_t& constantNumPackets,
00540     Distributor & /* distor */ )
00541   {
00542     using Teuchos::Array;
00543     using Teuchos::ArrayView;
00544     using Kokkos::Compat::getKokkosViewDeepCopy;
00545     typedef MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > MV;
00546     //typedef Array<size_t>::size_type size_type; // unused
00547 
00548     // If we have no exports, there is nothing to do
00549     if (exportLIDs.size () == 0) {
00550       return;
00551     }
00552 
00553     // We've already called checkSizes(), so this cast must succeed.
00554     const MV& sourceMV = dynamic_cast<const MV&> (sourceObj);
00555 
00556     // We don't need numExportPacketsPerLID; forestall "unused
00557     // variable" compile warnings.
00558     (void) numExportPacketsPerLID;
00559 
00560     /* The layout in the export for MultiVectors is as follows:
00561        exports = { all of the data from row exportLIDs.front() ;
00562                    ....
00563                    all of the data from row exportLIDs.back() }
00564       This doesn't have the best locality, but is necessary because
00565       the data for a Packet (all data associated with an LID) is
00566       required to be contiguous. */
00567 
00568     // FIXME (mfh 15 Sep 2013) Would it make sense to rethink the
00569     // packing scheme in the above comment?  The data going to a
00570     // particular process must be contiguous, of course, but those
00571     // data could include entries from multiple LIDs.  DistObject just
00572     // needs to know how to index into that data.  Kokkos is good at
00573     // decoupling storage intent from data layout choice.
00574 
00575     const size_t numCols = sourceMV.getNumVectors ();
00576 
00577     // This spares us from needing to fill numExportPacketsPerLID.
00578     // Setting constantNumPackets to a nonzero value signals that
00579     // all packets have the same number of entries.
00580     constantNumPackets = numCols;
00581 
00582     const size_t numExportLIDs = exportLIDs.size ();
00583     const size_t newExportsSize = numCols * numExportLIDs;
00584     if (exports.size () != newExportsSize) {
00585       Kokkos::Compat::realloc (exports, newExportsSize);
00586     }
00587 
00588     if (numCols == 1) { // special case for one column only
00589       // MultiVector always represents a single column with constant
00590       // stride, but it doesn't hurt to implement both cases anyway.
00591       //
00592       // ETP:  I'm not sure I agree with the above statement.  Can't a single-
00593       // column multivector be a subview of another multi-vector, in which case
00594       // sourceMV.whichVectors_[0] != 0 ?  I think we have to handle that case
00595       // separately.
00596       if (sourceMV.isConstantStride ()) {
00597         KokkosRefactor::Details::pack_array_single_column(
00598           exports,
00599           sourceMV.getKokkosView (),
00600           exportLIDs,
00601           0);
00602       }
00603       else {
00604         KokkosRefactor::Details::pack_array_single_column(
00605           exports,
00606           sourceMV.getKokkosView (),
00607           exportLIDs,
00608           sourceMV.whichVectors_[0]);
00609       }
00610     }
00611     else { // the source MultiVector has multiple columns
00612       if (sourceMV.isConstantStride ()) {
00613         KokkosRefactor::Details::pack_array_multi_column(
00614           exports,
00615           sourceMV.getKokkosView (),
00616           exportLIDs,
00617           numCols);
00618       }
00619       else {
00620         KokkosRefactor::Details::pack_array_multi_column_variable_stride(
00621           exports,
00622           sourceMV.getKokkosView (),
00623           exportLIDs,
00624           getKokkosViewDeepCopy<device_type> (sourceMV.whichVectors_ ()),
00625           numCols);
00626       }
00627     }
00628   }
00629 
00630 
00631   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00632   void
00633   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00634   unpackAndCombineNew (
00635     const Kokkos::View<const LocalOrdinal*, device_type> &importLIDs,
00636     const Kokkos::View<const Scalar*, device_type> &imports,
00637     const Kokkos::View<size_t*, device_type> &numPacketsPerLID,
00638     size_t constantNumPackets,
00639     Distributor & /* distor */,
00640     CombineMode CM)
00641   {
00642     using Teuchos::ArrayView;
00643     using Kokkos::Compat::getKokkosViewDeepCopy;
00644     const char tfecfFuncName[] = "unpackAndCombine";
00645 
00646     // If we have no imports, there is nothing to do
00647     if (importLIDs.size () == 0) {
00648       return;
00649     }
00650 
00651     // We don't need numPacketsPerLID; forestall "unused variable"
00652     // compile warnings.
00653     (void) numPacketsPerLID;
00654 
00655     /* The layout in the export for MultiVectors is as follows:
00656        imports = { all of the data from row exportLIDs.front() ;
00657                    ....
00658                    all of the data from row exportLIDs.back() }
00659       This doesn't have the best locality, but is necessary because
00660       the data for a Packet (all data associated with an LID) is
00661       required to be contiguous. */
00662 
00663     const size_t numVecs = getNumVectors ();
00664 
00665 #ifdef HAVE_TPETRA_DEBUG
00666     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00667       static_cast<size_t> (imports.size()) != getNumVectors()*importLIDs.size(),
00668       std::runtime_error,
00669       ": 'imports' buffer size must be consistent with the amount of data to "
00670       "be sent.  " << std::endl << "imports.size() = " << imports.size()
00671       << " != getNumVectors()*importLIDs.size() = " << getNumVectors() << "*"
00672       << importLIDs.size() << " = " << getNumVectors() * importLIDs.size()
00673       << ".");
00674 
00675     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00676       constantNumPackets == static_cast<size_t> (0), std::runtime_error,
00677       ": constantNumPackets input argument must be nonzero.");
00678 
00679     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00680       static_cast<size_t> (numVecs) != static_cast<size_t> (constantNumPackets),
00681       std::runtime_error, ": constantNumPackets must equal numVecs.");
00682 #endif // HAVE_TPETRA_DEBUG
00683 
00684     if (numVecs > 0 && importLIDs.size () > 0) {
00685 
00686       // NOTE (mfh 10 Mar 2012, 24 Mar 2014) If you want to implement
00687       // custom combine modes, start editing here.  Also, if you trust
00688       // inlining, it would be nice to condense this code by using a
00689       // binary function object f in the pack functors.
00690       if (CM == INSERT || CM == REPLACE) {
00691         if (isConstantStride()) {
00692           KokkosRefactor::Details::unpack_array_multi_column(
00693             getKokkosView(),
00694             imports,
00695             importLIDs,
00696             KokkosRefactor::Details::InsertOp(),
00697             numVecs);
00698         }
00699         else {
00700           KokkosRefactor::Details::unpack_array_multi_column_variable_stride(
00701             getKokkosView(),
00702             imports,
00703             importLIDs,
00704             getKokkosViewDeepCopy<device_type>(whichVectors_ ()),
00705             KokkosRefactor::Details::InsertOp(),
00706             numVecs);
00707         }
00708       }
00709       else if (CM == ADD) {
00710         if (isConstantStride()) {
00711           KokkosRefactor::Details::unpack_array_multi_column(
00712             getKokkosView(),
00713             imports,
00714             importLIDs,
00715             KokkosRefactor::Details::AddOp(),
00716             numVecs);
00717         }
00718         else {
00719           KokkosRefactor::Details::unpack_array_multi_column_variable_stride(
00720             getKokkosView(),
00721             imports,
00722             importLIDs,
00723             getKokkosViewDeepCopy<device_type>(whichVectors_ ()),
00724             KokkosRefactor::Details::AddOp(),
00725             numVecs);
00726         }
00727       }
00728       else if (CM == ABSMAX) {
00729         if (isConstantStride()) {
00730           KokkosRefactor::Details::unpack_array_multi_column(
00731             getKokkosView(),
00732             imports,
00733             importLIDs,
00734             KokkosRefactor::Details::AbsMaxOp(),
00735             numVecs);
00736         }
00737         else {
00738           KokkosRefactor::Details::unpack_array_multi_column_variable_stride(
00739             getKokkosView(),
00740             imports,
00741             importLIDs,
00742             getKokkosViewDeepCopy<device_type>(whichVectors_ ()),
00743             KokkosRefactor::Details::AbsMaxOp(),
00744             numVecs);
00745         }
00746       }
00747       else {
00748         TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00749           CM != ADD && CM != REPLACE && CM != INSERT && CM != ABSMAX,
00750           std::invalid_argument, ": Invalid CombineMode: " << CM << ".  Valid "
00751           "CombineMode values are ADD, REPLACE, INSERT, and ABSMAX.");
00752       }
00753     }
00754   }
00755 
00756   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00757   inline size_t
00758   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00759   getNumVectors () const
00760   {
00761     if (isConstantStride ()) {
00762       return static_cast<size_t> (view_.dimension_1 ());
00763     }
00764     else {
00765       return static_cast<size_t> (whichVectors_.size ());
00766     }
00767   }
00768 
00769   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00770   void
00771   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00772   dot (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, node_type>& A,
00773        const Teuchos::ArrayView<dot_type>& dots) const
00774   {
00775     using Kokkos::ALL;
00776     using Kokkos::subview;
00777     using Teuchos::REDUCE_SUM;
00778     using Teuchos::reduceAll;
00779     // View of a MultiVector's local data (all columns).
00780     typedef typename dual_view_type::t_dev mv_view_type;
00781     // View of a single column of a MultiVector's local data.
00782     //
00783     // FIXME (mfh 14 Jul 2014) It would be better to get this typedef
00784     // from mv_view_type itself, in case the layout changes.
00785     typedef Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type> vec_view_type;
00786     typedef typename dual_view_type::host_mirror_device_type host_mirror_device_type;
00787     // View of all the dot product results.
00788     typedef Kokkos::View<dot_type*, Kokkos::LayoutLeft,
00789       host_mirror_device_type, Kokkos::MemoryUnmanaged> host_dots_view_type;
00790     typedef Kokkos::View<dot_type*, Kokkos::LayoutLeft,
00791       host_mirror_device_type> host_dots_managed_view_type;
00792     const char tfecfFuncName[] = "Tpetra::MultiVector::dot";
00793 
00794 #ifdef HAVE_TPETRA_DEBUG
00795     {
00796       const bool compat = this->getMap ()->isCompatible (* (A.getMap ()));
00797       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00798         ! compat, std::invalid_argument, "Tpetra::MultiVector::dot: *this is "
00799         "not compatible with the input MultiVector A.  We only test for this "
00800         "in a debug build.");
00801     }
00802 #endif //  HAVE_TPETRA_DEBUG
00803 
00804     // FIXME (mfh 11 Jul 2014) These exception tests may not
00805     // necessarily be thrown on all processes consistently.  We should
00806     // instead pass along error state with the inner product.  We
00807     // could do this by setting an extra slot to
00808     // Kokkos::Details::ArithTraits<dot_type>::one() on error.  The
00809     // final sum should be
00810     // Kokkos::Details::ArithTraits<dot_type>::zero() if not error.
00811     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00812       getLocalLength () != A.getLocalLength (), std::runtime_error,
00813       ": MultiVectors do not have the same local length.  "
00814       "this->getLocalLength() = " << getLocalLength () << " != "
00815       "A.getLocalLength() = " << A.getLocalLength () << ".");
00816     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00817       getNumVectors () != A.getNumVectors (), std::runtime_error,
00818       ": MultiVectors must have the same number of columns (vectors).  "
00819       "this->getNumVectors() = " << getNumVectors () << " != "
00820       "A.getNumVectors() = " << A.getNumVectors () << ".");
00821     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00822         static_cast<size_t> (dots.size () ) != getNumVectors (), std::runtime_error, ": The output "
00823       "array 'dots' must have the same number of entries as the number of "
00824       "columns (vectors) in *this and A.  dots.size() = " << dots.size ()
00825       << " != this->getNumVectors() = " << getNumVectors () << ".");
00826 
00827     // const size_t numVecs = getNumVectors (); // not used
00828     const size_t lclNumRows = getLocalLength ();
00829     const size_t numDots = static_cast<size_t> (dots.size ());
00830 
00831     // We're computing using the device's data, so we need to make
00832     // sure first that the device is in sync with the host.
00833     A.view_.template sync<DeviceType> ();
00834     view_.template sync<DeviceType> ();
00835 
00836     // In case the input dimensions don't match, make sure that we
00837     // don't overwrite memory that doesn't belong to us, by using
00838     // subset views with the minimum dimensions over all input.
00839     const std::pair<size_t, size_t> rowRng (0, lclNumRows);
00840     const std::pair<size_t, size_t> colRng (0, numDots);
00841     mv_view_type X = subview<mv_view_type> (view_.d_view, rowRng, colRng);
00842     mv_view_type Y = subview<mv_view_type> (A.view_.d_view, rowRng, colRng);
00843 
00844     if (numDots == 1) {
00845       // Special case 1: Both MultiVectors only have a single column.
00846       // The single-vector dot product kernel may be more efficient.
00847       const size_t ZERO = static_cast<size_t> (0);
00848       vec_view_type X_k = subview<vec_view_type> (X, ALL (), ZERO);
00849       vec_view_type Y_k = subview<vec_view_type> (Y, ALL (), ZERO);
00850 
00851       TEUCHOS_TEST_FOR_EXCEPTION(
00852         X_k.dimension_0 () != lclNumRows, std::logic_error, "Tpetra::Multi"
00853         "Vector::dot: In the special single-vector case, X_k.dimension_0() = "
00854         << X_k.dimension_0 () << " != lclNumRows = " << lclNumRows
00855         << ".  Please report this bug to the Tpetra developers.");
00856       TEUCHOS_TEST_FOR_EXCEPTION(
00857         Y_k.dimension_0 () != lclNumRows, std::logic_error, "Tpetra::Multi"
00858         "Vector::dot: In the special single-vector case, Y_k.dimension_0() = "
00859         << Y_k.dimension_0 () << " != lclNumRows = " << lclNumRows
00860         << ".  Please report this bug to the Tpetra developers.");
00861 
00862       dots[0] = Kokkos::V_Dot (X_k, Y_k, -1);
00863     }
00864     else if (isConstantStride () && A.isConstantStride ()) {
00865       // Special case 2: Both MultiVectors have constant stride.
00866       (void) Kokkos::MV_Dot (dots.getRawPtr (), X, Y, -1);
00867     }
00868     else {
00869       // FIXME (mfh 14 Jul 2014) This does a kernel launch for every
00870       // column.  It might be better to have a kernel that does the
00871       // work all at once.  On the other hand, we don't prioritize
00872       // performance of MultiVector views of noncontiguous columns.
00873       for (size_t k = 0; k < numDots; ++k) {
00874         const size_t X_col = isConstantStride () ? k : whichVectors_[k];
00875         const size_t Y_col = A.isConstantStride () ? k : A.whichVectors_[k];
00876         vec_view_type X_k = subview<vec_view_type> (X, ALL (), X_col);
00877         vec_view_type Y_k = subview<vec_view_type> (Y, ALL (), Y_col);
00878         dots[k] = Kokkos::V_Dot (X_k, Y_k, -1);
00879       }
00880     }
00881 
00882     // If the MultiVectors are distributed over multiple processes,
00883     // sum the results across processes.  We assume that the MPI
00884     // implementation can read from and write to device memory.
00885     //
00886     // replaceMap() may have removed some processes.  Those processes
00887     // have a null Map.  They must not participate in any collective
00888     // operations.  We ask first whether the Map is null, because
00889     // isDistributed() defers that question to the Map.  We still
00890     // compute and return local dot products for processes not
00891     // participating in collective operations; those probably don't
00892     // make any sense, but it doesn't hurt to do them, since it's
00893     // illegal to call dot() on those processes anyway.
00894     if (! this->getMap ().is_null () && this->isDistributed ()) {
00895       host_dots_view_type theDots (dots.getRawPtr (), numDots);
00896       // MPI doesn't allow aliasing of arguments, so we have to make a
00897       // copy of the local sum.
00898       host_dots_managed_view_type lclDots ("MV::dot lcl", numDots);
00899 
00900       TEUCHOS_TEST_FOR_EXCEPTION(
00901         lclDots.dimension_0 () != theDots.dimension_0 (), std::logic_error,
00902         "Tpetra::MultiVector::dot: lclDots and theDots have different sizes.  "
00903         "lclDots.dimension_0 () = " << lclDots.dimension_0 () << " != "
00904         "theDots.dimension_0 () = " << theDots.dimension_0 () << ".  "
00905         "Please report this bug to the Tpetra developers.");
00906 
00907       Kokkos::deep_copy (lclDots, theDots);
00908       const Teuchos::Comm<int>& comm = * (this->getMap ()->getComm ());
00909       const dot_type* const lclSum = lclDots.ptr_on_device ();
00910       dot_type* const gblSum = theDots.ptr_on_device ();
00911       reduceAll<int, dot_type> (comm, REDUCE_SUM, static_cast<int> (numDots),
00912                                 lclSum, gblSum);
00913     }
00914   }
00915 
00916   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
00917   void
00918   MultiVector<Scalar, LocalOrdinal, GlobalOrdinal,
00919               Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
00920   dot (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, node_type>& A,
00921        const Kokkos::View<dot_type*, device_type>& dots) const
00922   {
00923     using Kokkos::ALL;
00924     using Kokkos::subview;
00925     using Teuchos::REDUCE_SUM;
00926     using Teuchos::reduceAll;
00927     // View of a MultiVector's local data (all columns).
00928     typedef typename dual_view_type::t_dev mv_view_type;
00929     // View of a single column of a MultiVector's local data.
00930     //
00931     // FIXME (mfh 14 Jul 2014) It would be better to get this typedef
00932     // from mv_view_type itself, in case the layout changes.
00933     typedef Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type> vec_view_type;
00934     // View of all the dot product results.
00935     typedef Kokkos::View<dot_type*, device_type> dots_view_type;
00936     // Scalar view; view of a single dot product result.
00937     typedef Kokkos::View<dot_type, device_type> dot_view_type;
00938     const char tfecfFuncName[] = "Tpetra::MultiVector::dot";
00939 
00940 #ifdef HAVE_TPETRA_DEBUG
00941     {
00942       const bool compat = this->getMap ()->isCompatible (* (A.getMap ()));
00943       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00944         ! compat, std::invalid_argument, "Tpetra::MultiVector::dot: *this is "
00945         "not compatible with the input MultiVector A.  We only test for this "
00946         "in a debug build.");
00947     }
00948 #endif //  HAVE_TPETRA_DEBUG
00949 
00950     // FIXME (mfh 11 Jul 2014) These exception tests may not
00951     // necessarily be thrown on all processes consistently.  We should
00952     // instead pass along error state with the inner product.  We
00953     // could do this by setting an extra slot to
00954     // Kokkos::Details::ArithTraits<dot_type>::one() on error.  The
00955     // final sum should be
00956     // Kokkos::Details::ArithTraits<dot_type>::zero() if not error.
00957     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00958       getLocalLength () != A.getLocalLength (), std::runtime_error,
00959       ": MultiVectors do not have the same local length.  "
00960       "this->getLocalLength() = " << getLocalLength () << " != "
00961       "A.getLocalLength() = " << A.getLocalLength () << ".");
00962     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00963       getNumVectors () != A.getNumVectors (), std::runtime_error,
00964       ": MultiVectors must have the same number of columns (vectors).  "
00965       "this->getNumVectors() = " << getNumVectors () << " != "
00966       "A.getNumVectors() = " << A.getNumVectors () << ".");
00967     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
00968       dots.dimension_0 () != getNumVectors (), std::runtime_error,
00969       ": The output array 'dots' must have the same number of entries as the "
00970       "number of columns (vectors) in *this and A.  dots.dimension_0() = " <<
00971       dots.dimension_0 () << " != this->getNumVectors() = " << getNumVectors ()
00972       << ".");
00973 
00974     // We're computing using the device's data, so we need to make
00975     // sure first that the device is in sync with the host.
00976     A.view_.template sync<DeviceType> ();
00977     view_.template sync<DeviceType> ();
00978 
00979     // const size_t numVecs = getNumVectors (); // not used
00980     const size_t lclNumRows = getLocalLength ();
00981     const size_t numDots = dots.dimension_0 ();
00982 
00983     // In case the input dimensions don't match, make sure that we
00984     // don't overwrite memory that doesn't belong to us, by using
00985     // subset views with the minimum dimensions over all input.
00986     const std::pair<size_t, size_t> rowRng (0, lclNumRows);
00987     const std::pair<size_t, size_t> colRng (0, numDots);
00988     dots_view_type theDots = subview<dots_view_type> (dots, colRng);
00989     mv_view_type X = subview<mv_view_type> (view_.d_view, rowRng, colRng);
00990     mv_view_type Y = subview<mv_view_type> (A.view_.d_view, rowRng, colRng);
00991 
00992     // FIXME (mfh 14 Jul 2014) How come ALL() works as the first
00993     // argument, but not a row range?  The first line below doesn't
00994     // compile, but the second line does.  See
00995     // kokkos/core/unit_test/TestViewAPI.hpp, in particular
00996     // run_test_vector(), for an example of allowed subview arguments.
00997     //
00998     //vec_view_type X_0 = subview<vec_view_type> (X, rowRng, static_cast<size_t> (0));
00999     //vec_view_type X_0 = subview<vec_view_type> (X, ALL (), static_cast<size_t> (0));
01000 
01001     if (numDots == 1) {
01002       // Special case 1: Both MultiVectors only have a single column.
01003       // The single-vector dot product kernel may be more efficient.
01004       const size_t ZERO = static_cast<size_t> (0);
01005       vec_view_type X_k = subview<vec_view_type> (X, ALL (), ZERO);
01006       vec_view_type Y_k = subview<vec_view_type> (Y, ALL (), ZERO);
01007       dot_view_type dot_k = subview<dot_view_type> (theDots, ZERO);
01008 
01009       TEUCHOS_TEST_FOR_EXCEPTION(
01010         X_k.dimension_0 () != lclNumRows, std::logic_error, "Tpetra::Multi"
01011         "Vector::dot: In the special single-vector case, X_k.dimension_0() = "
01012         << X_k.dimension_0 () << " != lclNumRows = " << lclNumRows
01013         << ".  Please report this bug to the Tpetra developers.");
01014       TEUCHOS_TEST_FOR_EXCEPTION(
01015         Y_k.dimension_0 () != lclNumRows, std::logic_error, "Tpetra::Multi"
01016         "Vector::dot: In the special single-vector case, Y_k.dimension_0() = "
01017         << Y_k.dimension_0 () << " != lclNumRows = " << lclNumRows
01018         << ".  Please report this bug to the Tpetra developers.");
01019 
01020       Kokkos::VecDotFunctor<vec_view_type> f (X_k, Y_k, dot_k);
01021       Kokkos::parallel_reduce (lclNumRows, f);
01022     }
01023     else if (isConstantStride () && A.isConstantStride ()) {
01024       // Special case 2: Both MultiVectors have constant stride.
01025       Kokkos::MultiVecDotFunctor<mv_view_type> f (X, Y, theDots);
01026       Kokkos::parallel_reduce (lclNumRows, f);
01027     }
01028     else {
01029       // FIXME (mfh 14 Jul 2014) This does a kernel launch for every
01030       // column.  It might be better to have a kernel that does the
01031       // work all at once.  On the other hand, we don't prioritize
01032       // performance of MultiVector views of noncontiguous columns.
01033       for (size_t k = 0; k < numDots; ++k) {
01034         const size_t X_col = isConstantStride () ? k : whichVectors_[k];
01035         const size_t Y_col = A.isConstantStride () ? k : A.whichVectors_[k];
01036         vec_view_type X_k = subview<vec_view_type> (X, ALL (), X_col);
01037         vec_view_type Y_k = subview<vec_view_type> (Y, ALL (), Y_col);
01038         dot_view_type dot_k = subview<dot_view_type> (theDots, k);
01039         Kokkos::VecDotFunctor<vec_view_type> f (X_k, Y_k, dot_k);
01040         Kokkos::parallel_reduce (lclNumRows, f);
01041       }
01042     }
01043 
01044     // If the MultiVectors are distributed over multiple processes,
01045     // sum the results across processes.  We assume that the MPI
01046     // implementation can read from and write to device memory.
01047     //
01048     // replaceMap() may have removed some processes.  Those processes
01049     // have a null Map.  They must not participate in any collective
01050     // operations.  We ask first whether the Map is null, because
01051     // isDistributed() defers that question to the Map.  We still
01052     // compute and return local dot products for processes not
01053     // participating in collective operations; those probably don't
01054     // make any sense, but it doesn't hurt to do them, since it's
01055     // illegal to call dot() on those processes anyway.
01056     if (! this->getMap ().is_null () && this->isDistributed ()) {
01057       // MPI doesn't allow aliasing of arguments, so we have to make a
01058       // copy of the local sum.
01059       dots_view_type lclDots ("MV::dot lcl", numDots);
01060 
01061       TEUCHOS_TEST_FOR_EXCEPTION(
01062         lclDots.dimension_0 () != theDots.dimension_0 (), std::logic_error,
01063         "Tpetra::MultiVector::dot: lclDots and theDots have different sizes.  "
01064         "lclDots.dimension_0 () = " << lclDots.dimension_0 () << " != "
01065         "theDots.dimension_0 () = " << theDots.dimension_0 () << ".  "
01066         "Please report this bug to the Tpetra developers.");
01067 
01068       Kokkos::deep_copy (lclDots, theDots);
01069       const Teuchos::Comm<int>& comm = * (this->getMap ()->getComm ());
01070       const dot_type* const lclSum = lclDots.ptr_on_device ();
01071       dot_type* const gblSum = theDots.ptr_on_device ();
01072       reduceAll<int, dot_type> (comm, REDUCE_SUM, static_cast<int> (numDots),
01073                                 lclSum, gblSum);
01074     }
01075   }
01076 
01077 
01078   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
01079   void
01080   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
01081   norm2 (const Teuchos::ArrayView<mag_type>& norms) const
01082   {
01083     typedef typename dual_view_type::host_mirror_device_type host_mirror_device_type;
01084     typedef Kokkos::View<mag_type*, device_type> dev_norms_view_type;
01085     typedef Kokkos::View<mag_type*, typename dev_norms_view_type::array_layout,
01086       host_mirror_device_type, Kokkos::MemoryUnmanaged> host_norms_view_type;
01087 
01088     const size_t numNorms = static_cast<size_t> (norms.size ());
01089     host_norms_view_type normsHostView (norms.getRawPtr (), numNorms);
01090     dev_norms_view_type normsDevView ("MV::norm2 tmp", numNorms);
01091     this->norm2 (normsDevView); // Do the computation on the device.
01092     Kokkos::deep_copy (normsHostView, normsDevView); // Bring back result to host
01093   }
01094 
01095 
01096   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
01097   void
01098   MultiVector<Scalar, LocalOrdinal, GlobalOrdinal,
01099               Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
01100   norm2 (const Kokkos::View<mag_type*, device_type>& norms) const
01101   {
01102     using Kokkos::ALL;
01103     using Kokkos::subview;
01104     using Teuchos::Comm;
01105     using Teuchos::null;
01106     using Teuchos::RCP;
01107     using Teuchos::REDUCE_SUM;
01108     using Teuchos::reduceAll;
01109     // View of a MultiVector's local data (all columns).
01110     typedef typename dual_view_type::t_dev mv_view_type;
01111     // View of a single column of a MultiVector's local data.
01112     //
01113     // FIXME (mfh 14 Jul 2014) It would be better to get this typedef
01114     // from mv_view_type itself, in case the layout changes.
01115     typedef Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type> vec_view_type;
01116     // View of all the norm results.
01117     typedef Kokkos::View<mag_type*, device_type> norms_view_type;
01118     // Scalar view; view of a single norm result.
01119     typedef Kokkos::View<mag_type, device_type> norm_view_type;
01120     const char tfecfFuncName[] = "Tpetra::MultiVector::norm2";
01121 
01122     const size_t numVecs = getNumVectors ();
01123     const size_t lclNumRows = getLocalLength ();
01124     const size_t numNorms = static_cast<size_t> (norms.dimension_0 ());
01125 
01126     // FIXME (mfh 11 Jul 2014) These exception tests may not
01127     // necessarily be thrown on all processes consistently.  We should
01128     // instead pass along error state with the inner product.  We
01129     // could do this by setting an extra slot to
01130     // Kokkos::Details::ArithTraits<mag_type>::one() on error.  The
01131     // final sum should be
01132     // Kokkos::Details::ArithTraits<mag_type>::zero() if not error.
01133     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01134       numNorms < numVecs, std::runtime_error, "Tpetra::MultiVector::norm2: "
01135       "'norms' must have at least as many entries as the number of vectors in "
01136       "*this.  norms.dimension_0() = " << numVecs << " < this->getNumVectors()"
01137       " = " << numVecs << ".");
01138 
01139     // We're computing using the device's data, so we need to make
01140     // sure first that the device is in sync with the host.
01141     view_.template sync<DeviceType> ();
01142 
01143     // In case the input dimensions don't match, make sure that we
01144     // don't overwrite memory that doesn't belong to us, by using
01145     // subset views with the minimum dimensions over all input.
01146     const std::pair<size_t, size_t> rowRng (0, lclNumRows);
01147     const std::pair<size_t, size_t> colRng (0, numVecs);
01148     norms_view_type theNorms = subview<norms_view_type> (norms, colRng);
01149     mv_view_type X = subview<mv_view_type> (view_.d_view, rowRng, colRng);
01150 
01151     if (numVecs == 1) {
01152       // Special case 1: The MultiVector only has a single column.
01153       // The single-vector norm kernel may be more efficient.
01154       const size_t ZERO = static_cast<size_t> (0);
01155       vec_view_type X_k = subview<vec_view_type> (X, ALL (), ZERO);
01156       norm_view_type norm_k = subview<norm_view_type> (theNorms, ZERO);
01157       Kokkos::VecNorm2SquaredFunctor<vec_view_type> f (X_k, norm_k);
01158       Kokkos::parallel_reduce (lclNumRows, f);
01159     }
01160     else if (isConstantStride ()) {
01161       // Special case 2: The MultiVector has constant stride.
01162       Kokkos::MultiVecNorm2SquaredFunctor<mv_view_type> f (X, theNorms);
01163       Kokkos::parallel_reduce (lclNumRows, f);
01164     }
01165     else {
01166       // FIXME (mfh 14 Jul 2014) This does a kernel launch for every
01167       // column.  It might be better to have a kernel that does the
01168       // work all at once.  On the other hand, we don't prioritize
01169       // performance of MultiVector views of noncontiguous columns.
01170       for (size_t k = 0; k < numVecs; ++k) {
01171         const size_t X_col = isConstantStride () ? k : whichVectors_[k];
01172         vec_view_type X_k = subview<vec_view_type> (X, ALL (), X_col);
01173         norm_view_type norm_k = subview<norm_view_type> (theNorms, k);
01174         Kokkos::VecNorm2SquaredFunctor<vec_view_type> f (X_k, norm_k);
01175         Kokkos::parallel_reduce (lclNumRows, f);
01176       }
01177     }
01178 
01179     // If the MultiVectors are distributed over multiple processes,
01180     // sum the results across processes.  We assume that the MPI
01181     // implementation can read from and write to device memory.
01182     //
01183     // replaceMap() may have removed some processes.  Those processes
01184     // have a null Map.  They must not participate in any collective
01185     // operations.  We ask first whether the Map is null, because
01186     // isDistributed() defers that question to the Map.  We still
01187     // compute and return local norms for processes not participating
01188     // in collective operations; those probably don't make any sense,
01189     // but it doesn't hurt to do them, since it's illegal to call
01190     // norm2() on those processes anyway.
01191     if (this->isDistributed ()) {
01192       RCP<const Comm<int> > comm = this->getMap ().is_null () ? null :
01193         this->getMap ()->getComm ();
01194       // The calling process only participates in the collective if
01195       // both the Map and its Comm on that process are nonnull.
01196       if (! comm.is_null ()) {
01197         // MPI doesn't allow aliasing of arguments, so we have to make
01198         // a copy of the local sum.
01199         norms_view_type lclNorms ("MV::norm2 lcl", numNorms);
01200         Kokkos::deep_copy (lclNorms, theNorms);
01201         const mag_type* const lclSum = lclNorms.ptr_on_device ();
01202         mag_type* const gblSum = theNorms.ptr_on_device ();
01203         reduceAll<int, mag_type> (*comm, REDUCE_SUM, static_cast<int> (numVecs),
01204                                   lclSum, gblSum);
01205       }
01206     }
01207 
01208     // Replace the norm-squared results with their square roots in
01209     // place, to get the final output.  If the device memory and the
01210     // host memory are the same, it probably doesn't pay to launch a
01211     // parallel kernel for that, since there isn't enough
01212     // parallelism for the typical MultiVector case.
01213     typedef typename device_type::host_mirror_device_type host_mirror_device_type;
01214     const bool inHostMemory = Kokkos::Impl::is_same<typename device_type::memory_space,
01215       typename host_mirror_device_type::memory_space>::value;
01216     if (inHostMemory) {
01217       for (size_t j = 0; j < numVecs; ++j) {
01218         theNorms(j) = Kokkos::Details::ArithTraits<mag_type>::sqrt (theNorms(j));
01219       }
01220     }
01221     else {
01222       // There's not as much parallelism now, but that's OK.  The
01223       // point of doing parallel dispatch here is to keep the norm
01224       // results on the device, thus avoiding a copy to the host and
01225       // back again.
01226       Kokkos::SquareRootFunctor<norms_view_type> f (theNorms);
01227       Kokkos::parallel_for (numVecs, f);
01228     }
01229   }
01230 
01231 
01232   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
01233   void
01234   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
01235   normWeighted (const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >& weights,
01236                 const Teuchos::ArrayView<mag_type> &norms) const
01237   {
01238     // KR FIXME MVT::WeightedNorm (might already exist).
01239     //
01240     // KR FIXME Overload this method to take a View.
01241 
01242     using Kokkos::ALL;
01243     using Kokkos::subview;
01244     using Teuchos::arcp_const_cast;
01245     using Teuchos::Array;
01246     using Teuchos::ArrayRCP;
01247     using Teuchos::ArrayView;
01248     using Teuchos::reduceAll;
01249     using Teuchos::REDUCE_SUM;
01250     using Teuchos::ScalarTraits;
01251     typedef mag_type Mag;
01252     const char tfecfFuncName[] = "normWeighted";
01253 
01254     const Mag OneOverN = Teuchos::ScalarTraits<Mag>::one () / static_cast<Mag> (getGlobalLength ());
01255     bool OneW = false;
01256     const size_t numVecs = this->getNumVectors();
01257     if (weights.getNumVectors() == 1) {
01258       OneW = true;
01259     }
01260     else {
01261       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(weights.getNumVectors() != numVecs, std::runtime_error,
01262         ": MultiVector of weights must contain either one vector or the same number of vectors as this.");
01263     }
01264 #ifdef HAVE_TPETRA_DEBUG
01265     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( !this->getMap()->isCompatible(*weights.getMap()), std::runtime_error,
01266       ": MultiVectors do not have compatible Maps:" << std::endl
01267       << "this->getMap(): " << std::endl << *this->getMap()
01268       << "weights.getMap(): " << std::endl << *weights.getMap() << std::endl);
01269 #else
01270     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( getLocalLength() != weights.getLocalLength(), std::runtime_error,
01271       ": MultiVectors do not have the same local length.");
01272 #endif
01273 
01274     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01275       static_cast<size_t> (norms.size ()) != numVecs, std::runtime_error,
01276       ": norms.size() must be as large as the number of vectors in *this.");
01277     typedef Kokkos::View<Scalar*,DeviceType> view_type;
01278     view_.template sync<DeviceType>();
01279     weights.view_.template sync<DeviceType>();
01280     if (isConstantStride ()) {
01281       if(OneW) {
01282         view_type weights_0 = subview<view_type> (weights.view_.d_view, ALL (), 0);
01283         Kokkos::MV_DotWeighted (&norms[0], weights_0, view_.d_view, getLocalLength ());
01284       } else
01285         Kokkos::MV_DotWeighted (&norms[0], weights.view_.d_view , view_.d_view, getLocalLength ());
01286     }
01287     else {
01288       // FIXME (mfh 11 Mar 2014) Once we have strided Views, we won't
01289       // have to write the explicit for loop over columns any more.
01290       if(OneW) {
01291         view_type weights_0 = subview<view_type> (weights.view_.d_view, ALL (), 0);
01292         for (size_t k = 0; k < numVecs; ++k) {
01293           const size_t curCol = whichVectors_[k];
01294           view_type vector_k = subview<view_type> (view_.d_view, ALL (), curCol);
01295           norms[k] = Kokkos::V_DotWeighted (weights_0, vector_k,getLocalLength());
01296         }
01297       } else {
01298         for (size_t k = 0; k < numVecs; ++k) {
01299           const size_t curCol = whichVectors_[k];
01300           view_type weights_k = subview<view_type> (weights.view_.d_view, ALL (), curCol);
01301           view_type vector_k = subview<view_type> (view_.d_view, ALL (), curCol);
01302           norms[k] = Kokkos::V_DotWeighted (weights_k, vector_k,getLocalLength());
01303         }
01304       }
01305     }
01306     if (this->isDistributed ()) {
01307       Array<Mag> lnorms(norms);
01308       reduceAll (*this->getMap ()->getComm (), REDUCE_SUM,
01309                  static_cast<int> (numVecs), lnorms.getRawPtr (),
01310                  norms.getRawPtr ());
01311     }
01312     for (typename ArrayView<Mag>::iterator n = norms.begin(); n != norms.begin()+numVecs; ++n) {
01313       *n = ScalarTraits<Mag>::squareroot(*n * OneOverN);
01314     }
01315   }
01316 
01317 
01318   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
01319   void
01320   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
01321   norm1 (const Teuchos::ArrayView<mag_type>& norms) const
01322   {
01323     typedef typename dual_view_type::host_mirror_device_type host_mirror_device_type;
01324     typedef Kokkos::View<mag_type*, device_type> dev_norms_view_type;
01325     typedef Kokkos::View<mag_type*, typename dev_norms_view_type::array_layout,
01326       host_mirror_device_type, Kokkos::MemoryUnmanaged> host_norms_view_type;
01327 
01328     const size_t numNorms = static_cast<size_t> (norms.size ());
01329     host_norms_view_type normsHostView (norms.getRawPtr (), numNorms);
01330     dev_norms_view_type normsDevView ("MV::norm1 tmp", numNorms);
01331     this->norm1 (normsDevView); // Do the computation on the device.
01332     Kokkos::deep_copy (normsHostView, normsDevView); // Bring back result to host
01333   }
01334 
01335 
01336   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
01337   void
01338   MultiVector<Scalar, LocalOrdinal, GlobalOrdinal,
01339               Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
01340   norm1 (const Kokkos::View<mag_type*, device_type>& norms) const
01341   {
01342     using Kokkos::ALL;
01343     using Kokkos::subview;
01344     using Teuchos::Comm;
01345     using Teuchos::null;
01346     using Teuchos::RCP;
01347     using Teuchos::REDUCE_SUM;
01348     using Teuchos::reduceAll;
01349 
01350     // View of a MultiVector's local data (all columns).
01351     typedef typename dual_view_type::t_dev mv_view_type;
01352     // View of a single column of a MultiVector's local data.
01353     //
01354     // FIXME (mfh 14 Jul 2014) It would be better to get this typedef
01355     // from mv_view_type itself, in case the layout changes.
01356     typedef Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type> vec_view_type;
01357     // View of all the norm results.
01358     typedef Kokkos::View<mag_type*, device_type> norms_view_type;
01359     // Scalar view; view of a single norm result.
01360     typedef Kokkos::View<mag_type, device_type> norm_view_type;
01361 
01362     const size_t numVecs = this->getNumVectors ();
01363     const size_t lclNumRows = this->getLocalLength ();
01364     const size_t numNorms = static_cast<size_t> (norms.dimension_0 ());
01365 
01366     // FIXME (mfh 11 Jul 2014) These exception tests may not
01367     // necessarily be thrown on all processes consistently.  We should
01368     // instead pass along error state with the inner product.  We
01369     // could do this by setting an extra slot to
01370     // Kokkos::Details::ArithTraits<mag_type>::one() on error.  The
01371     // final sum should be
01372     // Kokkos::Details::ArithTraits<mag_type>::zero() if not error.
01373     TEUCHOS_TEST_FOR_EXCEPTION(
01374       numNorms < numVecs, std::runtime_error, "Tpetra::MultiVector::norm1: "
01375       "'norms' must have at least as many entries as the number of vectors in "
01376       "*this.  norms.dimension_0() = " << numVecs << " < this->getNumVectors()"
01377       " = " << numVecs << ".");
01378 
01379     // We're computing using the device's data, so we need to make
01380     // sure first that the device is in sync with the host.
01381     view_.template sync<DeviceType> ();
01382 
01383     // In case the input dimensions don't match, make sure that we
01384     // don't overwrite memory that doesn't belong to us, by using
01385     // subset views with the minimum dimensions over all input.
01386     const std::pair<size_t, size_t> rowRng (0, lclNumRows);
01387     const std::pair<size_t, size_t> colRng (0, numVecs);
01388     norms_view_type theNorms = subview<norms_view_type> (norms, colRng);
01389     mv_view_type X = subview<mv_view_type> (view_.d_view, rowRng, colRng);
01390 
01391     if (numVecs == 1) {
01392       // Special case 1: The MultiVector only has a single column.
01393       // The single-vector norm kernel may be more efficient.
01394       const size_t ZERO = static_cast<size_t> (0);
01395       vec_view_type X_k = subview<vec_view_type> (X, ALL (), ZERO);
01396       norm_view_type norm_k = subview<norm_view_type> (theNorms, ZERO);
01397       Kokkos::VecNorm1Functor<vec_view_type> f (X_k, norm_k);
01398       Kokkos::parallel_reduce (lclNumRows, f);
01399     }
01400     else if (isConstantStride ()) {
01401       // Special case 2: The MultiVector has constant stride.
01402       Kokkos::MultiVecNorm1Functor<mv_view_type> f (X, theNorms);
01403       Kokkos::parallel_reduce (lclNumRows, f);
01404     }
01405     else {
01406       // FIXME (mfh 14 Jul 2014) This does a kernel launch for every
01407       // column.  It might be better to have a kernel that does the
01408       // work all at once.  On the other hand, we don't prioritize
01409       // performance of MultiVector views of noncontiguous columns.
01410       for (size_t k = 0; k < numVecs; ++k) {
01411         const size_t X_col = isConstantStride () ? k : whichVectors_[k];
01412         vec_view_type X_k = subview<vec_view_type> (X, ALL (), X_col);
01413         norm_view_type norm_k = subview<norm_view_type> (theNorms, k);
01414         Kokkos::VecNorm1Functor<vec_view_type> f (X_k, norm_k);
01415         Kokkos::parallel_reduce (lclNumRows, f);
01416       }
01417     }
01418 
01419     // If the MultiVectors are distributed over multiple processes,
01420     // sum the results across processes.  We assume that the MPI
01421     // implementation can read from and write to device memory.
01422     //
01423     // replaceMap() may have removed some processes.  Those processes
01424     // have a null Map.  They must not participate in any collective
01425     // operations.  We ask first whether the Map is null, because
01426     // isDistributed() defers that question to the Map.  We still
01427     // compute and return local norms for processes not participating
01428     // in collective operations; those probably don't make any sense,
01429     // but it doesn't hurt to do them, since it's illegal to call
01430     // norm1() on those processes anyway.
01431     if (this->isDistributed ()) {
01432       RCP<const Comm<int> > comm = this->getMap ().is_null () ? null :
01433         this->getMap ()->getComm ();
01434       // The calling process only participates in the collective if
01435       // both the Map and its Comm on that process are nonnull.
01436       if (! comm.is_null ()) {
01437         // MPI doesn't allow aliasing of arguments, so we have to make
01438         // a copy of the local sum.
01439         norms_view_type lclNorms ("MV::norm1 lcl", numNorms);
01440         Kokkos::deep_copy (lclNorms, theNorms);
01441         const mag_type* const lclSum = lclNorms.ptr_on_device ();
01442         mag_type* const gblSum = theNorms.ptr_on_device ();
01443         reduceAll<int, mag_type> (*comm, REDUCE_SUM, static_cast<int> (numVecs),
01444                                   lclSum, gblSum);
01445       }
01446     }
01447   }
01448 
01449   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
01450   void
01451   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
01452   normInf (const Teuchos::ArrayView<mag_type>& norms) const
01453   {
01454     typedef typename dual_view_type::host_mirror_device_type host_mirror_device_type;
01455     typedef Kokkos::View<mag_type*, device_type> dev_norms_view_type;
01456     typedef Kokkos::View<mag_type*, typename dev_norms_view_type::array_layout, host_mirror_device_type, Kokkos::MemoryUnmanaged> host_norms_view_type;
01457 
01458     const size_t numNorms = static_cast<size_t> (norms.size ());
01459     host_norms_view_type normsHostView (norms.getRawPtr (), numNorms);
01460     dev_norms_view_type normsDevView ("MV::normInf tmp", numNorms);
01461     this->normInf (normsDevView); // Do the computation on the device.
01462     Kokkos::deep_copy (normsHostView, normsDevView); // Bring back result to host
01463   }
01464 
01465 
01466   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
01467   void
01468   MultiVector<Scalar, LocalOrdinal, GlobalOrdinal,
01469               Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
01470   normInf (const Kokkos::View<mag_type*, device_type>& norms) const
01471   {
01472     using Kokkos::ALL;
01473     using Kokkos::subview;
01474     using Teuchos::Comm;
01475     using Teuchos::null;
01476     using Teuchos::RCP;
01477     using Teuchos::REDUCE_MAX;
01478     using Teuchos::reduceAll;
01479     // View of a MultiVector's local data (all columns).
01480     typedef typename dual_view_type::t_dev mv_view_type;
01481     // View of a single column of a MultiVector's local data.
01482     //
01483     // FIXME (mfh 14 Jul 2014) It would be better to get this typedef
01484     // from mv_view_type itself, in case the layout changes.
01485     typedef Kokkos::View<scalar_type*, Kokkos::LayoutLeft, device_type> vec_view_type;
01486     // View of all the norm results.
01487     typedef Kokkos::View<mag_type*, device_type> norms_view_type;
01488     // Scalar view; view of a single norm result.
01489     typedef Kokkos::View<mag_type, device_type> norm_view_type;
01490 
01491     const size_t numVecs = this->getNumVectors ();
01492     const size_t lclNumRows = this->getLocalLength ();
01493     const size_t numNorms = static_cast<size_t> (norms.dimension_0 ());
01494 
01495     // FIXME (mfh 11 Jul 2014) These exception tests may not
01496     // necessarily be thrown on all processes consistently.  We should
01497     // instead pass along error state with the inner product.  We
01498     // could do this by setting an extra slot to
01499     // Kokkos::Details::ArithTraits<mag_type>::one() on error.  The
01500     // final sum should be
01501     // Kokkos::Details::ArithTraits<mag_type>::zero() if not error.
01502     TEUCHOS_TEST_FOR_EXCEPTION(
01503       numNorms < numVecs, std::runtime_error, "Tpetra::MultiVector::normInf: "
01504       "'norms' must have at least as many entries as the number of vectors in "
01505       "*this.  norms.dimension_0() = " << numVecs << " < this->getNumVectors()"
01506       " = " << numVecs << ".");
01507 
01508     // We're computing using the device's data, so we need to make
01509     // sure first that the device is in sync with the host.
01510     view_.template sync<DeviceType> ();
01511 
01512     // In case the input dimensions don't match, make sure that we
01513     // don't overwrite memory that doesn't belong to us, by using
01514     // subset views with the minimum dimensions over all input.
01515     const std::pair<size_t, size_t> rowRng (0, lclNumRows);
01516     const std::pair<size_t, size_t> colRng (0, numVecs);
01517     norms_view_type theNorms = subview<norms_view_type> (norms, colRng);
01518     mv_view_type X = subview<mv_view_type> (view_.d_view, rowRng, colRng);
01519 
01520     if (numVecs == 1) {
01521       // Special case 1: The MultiVector only has a single column.
01522       // The single-vector norm kernel may be more efficient.
01523       const size_t ZERO = static_cast<size_t> (0);
01524       vec_view_type X_k = subview<vec_view_type> (X, ALL (), ZERO);
01525       norm_view_type norm_k = subview<norm_view_type> (theNorms, ZERO);
01526       Kokkos::VecNormInfFunctor<vec_view_type> f (X_k, norm_k);
01527       Kokkos::parallel_reduce (lclNumRows, f);
01528     }
01529     else if (isConstantStride ()) {
01530       // Special case 2: The MultiVector has constant stride.
01531       Kokkos::MultiVecNormInfFunctor<mv_view_type> f (X, theNorms);
01532       Kokkos::parallel_reduce (lclNumRows, f);
01533     }
01534     else {
01535       // FIXME (mfh 15 Jul 2014) This does a kernel launch for every
01536       // column.  It might be better to have a kernel that does the
01537       // work all at once.  On the other hand, we don't prioritize
01538       // performance of MultiVector views of noncontiguous columns.
01539       for (size_t k = 0; k < numVecs; ++k) {
01540         const size_t X_col = isConstantStride () ? k : whichVectors_[k];
01541         vec_view_type X_k = subview<vec_view_type> (X, ALL (), X_col);
01542         norm_view_type norm_k = subview<norm_view_type> (theNorms, k);
01543         Kokkos::VecNormInfFunctor<vec_view_type> f (X_k, norm_k);
01544         Kokkos::parallel_reduce (lclNumRows, f);
01545       }
01546     }
01547 
01548     // If the MultiVectors are distributed over multiple processes,
01549     // compute the max of the results across processes.  We assume
01550     // that the MPI implementation can read from and write to device
01551     // memory.
01552     //
01553     // replaceMap() may have removed some processes.  Those processes
01554     // have a null Map.  They must not participate in any collective
01555     // operations.  We ask first whether the Map is null, because
01556     // isDistributed() defers that question to the Map.  We still
01557     // compute and return local norms for processes not participating
01558     // in collective operations; those probably don't make any sense,
01559     // but it doesn't hurt to do them, since it's illegal to call
01560     // normInf() on those processes anyway.
01561     if (this->isDistributed ()) {
01562       RCP<const Comm<int> > comm = this->getMap ().is_null () ? null :
01563         this->getMap ()->getComm ();
01564       // The calling process only participates in the collective if
01565       // both the Map and its Comm on that process are nonnull.
01566       if (! comm.is_null ()) {
01567         // MPI doesn't allow aliasing of arguments, so we have to make
01568         // a copy of the local sum.
01569         norms_view_type lclNorms ("MV::normInf lcl", numNorms);
01570         Kokkos::deep_copy (lclNorms, theNorms);
01571         const mag_type* const lclSum = lclNorms.ptr_on_device ();
01572         mag_type* const gblSum = theNorms.ptr_on_device ();
01573         reduceAll<int, mag_type> (*comm, REDUCE_MAX, static_cast<int> (numVecs),
01574                                   lclSum, gblSum);
01575       }
01576     }
01577   }
01578 
01579 
01580   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
01581   void
01582   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
01583   meanValue (const Teuchos::ArrayView<Scalar> &means) const
01584   {
01585     // KR FIXME MVT::Sum (might already exist).
01586     //
01587     // KR FIXME Overload this method to take a View.
01588 
01589     using Teuchos::Array;
01590     using Teuchos::ArrayRCP;
01591     using Teuchos::arcp_const_cast;
01592     using Teuchos::reduceAll;
01593     using Teuchos::REDUCE_SUM;
01594 
01595     using Kokkos::ALL;
01596     using Kokkos::subview;
01597 
01598     typedef Teuchos::ScalarTraits<Scalar> SCT;
01599 
01600     const size_t numVecs = getNumVectors();
01601 
01602     TEUCHOS_TEST_FOR_EXCEPTION(
01603       static_cast<size_t> (means.size ()) != numVecs, std::runtime_error,
01604       "Tpetra::MultiVector::meanValue: means.size() must be as large as "
01605       "the number of vectors in *this.");
01606     // compute local components of the means
01607     // sum these across all nodes
01608     view_.template sync<DeviceType>();
01609     if (isConstantStride ()) {
01610       Kokkos::MV_Sum (&means[0], view_.d_view, getLocalLength ());
01611     }
01612     else {
01613       // FIXME (mfh 11 Mar 2014) Once we have strided Views, we won't
01614       // have to write the explicit for loop over columns any more.
01615       for (size_t k = 0; k < numVecs; ++k) {
01616         typedef Kokkos::View<Scalar*,DeviceType> view_type;
01617         const size_t curCol = whichVectors_[k];
01618         view_type vector_k = subview<view_type> (view_.d_view, ALL (), curCol);
01619         means[k] = Kokkos::V_Sum (vector_k);
01620       }
01621     }
01622     if (this->isDistributed()) {
01623       Array<Scalar> lmeans(means);
01624       // only combine if we are a distributed MV
01625       reduceAll (*this->getMap ()->getComm (), REDUCE_SUM, static_cast<int> (numVecs),
01626                  lmeans.getRawPtr (), means.getRawPtr ());
01627     }
01628     // mfh 12 Apr 2012: Don't take out the cast from the ordinal type
01629     // to the magnitude type, since operator/ (std::complex<T>, int)
01630     // isn't necessarily defined.
01631     const Scalar OneOverN =
01632       SCT::one() / static_cast<typename SCT::magnitudeType> (getGlobalLength ());
01633     for (typename ArrayView<Scalar>::iterator i = means.begin();
01634          i != means.begin() + numVecs;
01635          ++i)
01636     {
01637       (*i) = (*i) * OneOverN;
01638     }
01639   }
01640 
01641 
01642   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
01643   void
01644   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
01645   randomize ()
01646   {
01647     const uint64_t myRank =
01648       static_cast<uint64_t> (this->getMap ()->getComm ()->getRank ());
01649     uint64_t seed64 = static_cast<uint64_t> (std::rand ()) + myRank + 17311uLL;
01650     unsigned int seed = static_cast<unsigned int> (seed64&0xffffffff);
01651 
01652     Kokkos::Random_XorShift64_Pool<DeviceType> rand_pool(seed);
01653 
01654     Scalar max = Kokkos::rand<typename Kokkos::Random_XorShift64_Pool<DeviceType>::generator_type, Scalar>::max ();
01655     Scalar min = Kokkos::Details::ArithTraits<Scalar>::is_signed ? Scalar(-max) : Kokkos::Details::ArithTraits<Scalar>::zero ();
01656 
01657 
01658     if (isConstantStride ()) {
01659       Kokkos::fill_random(view_.d_view,rand_pool,min,max);
01660       view_.template modify<DeviceType>();
01661     }
01662     else {
01663       const size_t numVecs = getNumVectors();
01664       view_.template sync<DeviceType>();
01665       typedef Kokkos::View<Scalar*, DeviceType> view_type;
01666       for (size_t k = 0; k < numVecs; ++k) {
01667         view_type vector_k = Kokkos::subview<view_type> (view_.d_view, Kokkos::ALL (), whichVectors_[k]);
01668         Kokkos::fill_random(vector_k,rand_pool,min,max);
01669       }
01670       view_.template modify<DeviceType>();
01671     }
01672   }
01673 
01674 
01675   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
01676   void
01677   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
01678   putScalar (const Scalar& alpha)
01679   {
01680     const size_t numVecs = getNumVectors();
01681     if (isConstantStride ()) {
01682       Kokkos::Impl::ViewFill<typename dual_view_type::t_dev> (view_.d_view, alpha);
01683     }
01684     else {
01685       typedef Kokkos::View<Scalar*, DeviceType> view_type;
01686       for (size_t k = 0; k < numVecs; ++k) {
01687         view_type vector_k = Kokkos::subview<view_type> (view_.d_view, Kokkos::ALL (), whichVectors_[k]);
01688         Kokkos::Impl::ViewFill<view_type> (vector_k, alpha);
01689       }
01690     }
01691   }
01692 
01693 
01694   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
01695   void
01696   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
01697   replaceMap (const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& newMap)
01698   {
01699     using Teuchos::ArrayRCP;
01700     using Teuchos::Comm;
01701     using Teuchos::RCP;
01702 
01703     // mfh 28 Mar 2013: This method doesn't forget whichVectors_, so
01704     // it might work if the MV is a column view of another MV.
01705     // However, things might go wrong when restoring the original
01706     // Map, so we don't allow this case for now.
01707     TEUCHOS_TEST_FOR_EXCEPTION(
01708       ! this->isConstantStride (), std::logic_error,
01709       "Tpetra::MultiVector::replaceMap: This method does not currently work "
01710       "if the MultiVector is a column view of another MultiVector (that is, if "
01711       "isConstantStride() == false).");
01712 
01713     // Case 1: current Map and new Map are both nonnull on this process.
01714     // Case 2: current Map is nonnull, new Map is null.
01715     // Case 3: current Map is null, new Map is nonnull.
01716     // Case 4: both Maps are null: forbidden.
01717     //
01718     // Case 1 means that we don't have to do anything on this process,
01719     // other than assign the new Map.  (We always have to do that.)
01720     // It's an error for the user to supply a Map that requires
01721     // resizing in this case.
01722     //
01723     // Case 2 means that the calling process is in the current Map's
01724     // communicator, but will be excluded from the new Map's
01725     // communicator.  We don't have to do anything on the calling
01726     // process; just leave whatever data it may have alone.
01727     //
01728     // Case 3 means that the calling process is excluded from the
01729     // current Map's communicator, but will be included in the new
01730     // Map's communicator.  This means we need to (re)allocate the
01731     // local (KokkosClassic::)MultiVector if it does not have the right
01732     // number of rows.  If the new number of rows is nonzero, we'll
01733     // fill the newly allocated local data with zeros, as befits a
01734     // projection operation.
01735     //
01736     // The typical use case for Case 3 is that the MultiVector was
01737     // first created with the Map with more processes, then that Map
01738     // was replaced with a Map with fewer processes, and finally the
01739     // original Map was restored on this call to replaceMap.
01740 
01741 #ifdef HAVE_TEUCHOS_DEBUG
01742     // mfh 28 Mar 2013: We can't check for compatibility across the
01743     // whole communicator, unless we know that the current and new
01744     // Maps are nonnull on _all_ participating processes.
01745     // TEUCHOS_TEST_FOR_EXCEPTION(
01746     //   origNumProcs == newNumProcs && ! this->getMap ()->isCompatible (*map),
01747     //   std::invalid_argument, "Tpetra::MultiVector::project: "
01748     //   "If the input Map's communicator is compatible (has the same number of "
01749     //   "processes as) the current Map's communicator, then the two Maps must be "
01750     //   "compatible.  The replaceMap() method is not for data redistribution; "
01751     //   "use Import or Export for that purpose.");
01752 
01753     // TODO (mfh 28 Mar 2013) Add compatibility checks for projections
01754     // of the Map, in case the process counts don't match.
01755 #endif // HAVE_TEUCHOS_DEBUG
01756 
01757     if (this->getMap ().is_null ()) { // current Map is null
01758       // If this->getMap() is null, that means that this MultiVector
01759       // has already had replaceMap happen to it.  In that case, just
01760       // reallocate the DualView with the right size.
01761 
01762       TEUCHOS_TEST_FOR_EXCEPTION(
01763         newMap.is_null (), std::invalid_argument,
01764         "Tpetra::MultiVector::replaceMap: both current and new Maps are null.  "
01765         "This probably means that the input Map is incorrect.");
01766 
01767       // Case 3: current Map is null, new Map is nonnull.
01768       // Reallocate the DualView with the right dimensions.
01769       const size_t newNumRows = newMap->getNodeNumElements ();
01770       const size_t origNumRows = view_.dimension_0 ();
01771       const size_t numCols = this->getNumVectors ();
01772 
01773       if (origNumRows != newNumRows || view_.dimension_1 () != numCols) {
01774         view_ = dual_view_type ("MV::dual_view", newNumRows, numCols);
01775       }
01776     }
01777     else if (newMap.is_null ()) { // Case 2: current Map is nonnull, new Map is null
01778       // I am an excluded process.  Reinitialize my data so that I
01779       // have 0 rows.  Keep the number of columns as before.
01780       const size_t newNumRows = static_cast<size_t> (0);
01781       const size_t numCols = this->getNumVectors ();
01782       view_ = dual_view_type ("MV::dual_view", newNumRows, numCols);
01783     }
01784 
01785     this->map_ = newMap;
01786   }
01787 
01788 
01789   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
01790   void
01791   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
01792   scale (const Scalar &alpha)
01793   {
01794     using Kokkos::ALL;
01795     using Teuchos::arcp_const_cast;
01796     using Teuchos::ArrayRCP;
01797 
01798     // NOTE: can't substitute putScalar(0.0) for scale(0.0), because
01799     //       the former will overwrite NaNs present in the MultiVector, while the
01800     //       semantics of this call require multiplying them by 0, which IEEE requires to be NaN
01801     const size_t numVecs = getNumVectors();
01802     if (alpha == Teuchos::ScalarTraits<Scalar>::one()) {
01803       // do nothing
01804     }
01805     else if (isConstantStride ()) {
01806       view_.template sync<DeviceType>();
01807       view_.template modify<DeviceType>();
01808       Kokkos::MV_MulScalar (view_.d_view, alpha, view_.d_view);
01809     }
01810     else {
01811       typedef Kokkos::View<Scalar*, DeviceType> view_type;
01812 
01813       for (size_t k = 0; k < numVecs; ++k) {
01814         const size_t this_col = isConstantStride () ? k : whichVectors_[k];
01815         view_type vector_k = Kokkos::subview<view_type> (view_.d_view, ALL (), this_col);
01816         Kokkos::V_MulScalar (vector_k, alpha, vector_k);
01817       }
01818     }
01819   }
01820 
01821 
01822   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
01823   void
01824   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
01825   scale (Teuchos::ArrayView<const Scalar> alphas)
01826   {
01827     using Kokkos::ALL;
01828     using Teuchos::arcp_const_cast;
01829     using Teuchos::ArrayRCP;
01830 
01831     const size_t numVecs = this->getNumVectors();
01832     TEUCHOS_TEST_FOR_EXCEPTION(
01833       static_cast<size_t> (alphas.size ()) != numVecs, std::invalid_argument,
01834       "Tpetra::MultiVector::scale(alphas): alphas.size() must be as large as "
01835       "the number of vectors in *this.");
01836 
01837     const size_t myLen = view_.dimension_0();
01838     if (myLen == 0) {
01839       return;
01840     }
01841 
01842     if (isConstantStride ()) {
01843       Kokkos::DualView<Scalar*,device_type> k_alphas("Alphas::tmp",alphas.size());
01844       for(int i=0; i<alphas.size(); i++)
01845          k_alphas.h_view(i) = alphas[i];
01846       k_alphas.template modify<host_mirror_device_type>();
01847       k_alphas.template sync<device_type>();
01848       view_.template sync<DeviceType>();
01849       view_.template modify<DeviceType>();
01850       Kokkos::MV_MulScalar (view_.d_view, k_alphas.d_view, view_.d_view);
01851     }
01852     else {
01853       typedef Kokkos::View<Scalar*, DeviceType> view_type;
01854 
01855       for (size_t k = 0; k < numVecs; ++k) {
01856         const size_t this_col = isConstantStride () ? k : whichVectors_[k];
01857         view_type vector_k = Kokkos::subview<view_type> (view_.d_view, ALL (), this_col);
01858         Kokkos::V_MulScalar (vector_k, alphas[k], vector_k);
01859       }
01860     }
01861   }
01862 
01863   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
01864   void
01865   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
01866   scale (const Kokkos::View<const Scalar*, device_type> alphas)
01867   {
01868     using Kokkos::ALL;
01869     using Teuchos::arcp_const_cast;
01870     using Teuchos::ArrayRCP;
01871 
01872     const size_t numVecs = this->getNumVectors();
01873     TEUCHOS_TEST_FOR_EXCEPTION(
01874       static_cast<size_t> (alphas.dimension_0()) != numVecs, std::invalid_argument,
01875       "Tpetra::MultiVector::scale(alphas): alphas.size() must be as large as "
01876       "the number of vectors in *this.");
01877 
01878     const size_t myLen = view_.dimension_0 ();
01879     if (myLen == 0) {
01880       return;
01881     }
01882 
01883     if (isConstantStride ()) {
01884       view_.template sync<DeviceType>();
01885       view_.template modify<DeviceType>();
01886       Kokkos::MV_MulScalar (view_.d_view, alphas, view_.d_view);
01887     }
01888     else {
01889       typedef Kokkos::View<Scalar*, DeviceType> view_type;
01890       typename Kokkos::View<const Scalar*, device_type>::HostMirror h_alphas =
01891         Kokkos::create_mirror_view (alphas);
01892       Kokkos::deep_copy (h_alphas, alphas);
01893       for (size_t k = 0; k < numVecs; ++k) {
01894         const size_t this_col = isConstantStride () ? k : whichVectors_[k];
01895         view_type vector_k = Kokkos::subview<view_type> (view_.d_view, ALL (), this_col);
01896         Kokkos::V_MulScalar (vector_k, alphas(k), vector_k);
01897       }
01898     }
01899   }
01900 
01901 
01902   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
01903   void
01904   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
01905   scale (const Scalar &alpha, const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > &A)
01906   {
01907     const char tfecfFuncName[] = "scale(alpha,A)";
01908     const size_t numVecs = getNumVectors ();
01909 
01910     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01911        getLocalLength () != A.getLocalLength (), std::runtime_error,
01912        ": MultiVectors do not have the same local length.  "
01913        "this->getLocalLength() = " << getLocalLength ()
01914        << " != A.getLocalLength() = " << A.getLocalLength () << ".");
01915     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01916       A.getNumVectors () != numVecs, std::runtime_error,
01917       ": MultiVectors do not have the same number of columns (vectors).  "
01918        "this->getNumVectors() = " << getNumVectors ()
01919        << " != A.getNumVectors() = " << A.getNumVectors () << ".");
01920 
01921     if (isConstantStride () && A.isConstantStride ()) {
01922       view_.template sync<DeviceType>();
01923       view_.template modify<DeviceType>();
01924       Kokkos::MV_MulScalar (view_.d_view, alpha, A.view_.d_view);
01925     }
01926     else {
01927       using Kokkos::ALL;
01928       using Kokkos::subview;
01929       typedef Kokkos::View<Scalar*, DeviceType> view_type;
01930 
01931       view_.template sync<DeviceType>();
01932       view_.template modify<DeviceType>();
01933       A.view_.template sync<DeviceType>();
01934       A.view_.template modify<DeviceType>();
01935       for (size_t k = 0; k < numVecs; ++k) {
01936         const size_t this_col = isConstantStride () ? k : whichVectors_[k];
01937         view_type vector_k = subview<view_type> (view_.d_view, ALL (), this_col);
01938         const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
01939         view_type vector_Ak = subview<view_type> (A.view_.d_view, ALL (), A_col);
01940         Kokkos::V_MulScalar (vector_k, alpha, vector_Ak);
01941       }
01942     }
01943   }
01944 
01945 
01946   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
01947   void
01948   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
01949   reciprocal (const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > &A)
01950   {
01951     const char tfecfFuncName[] = "reciprocal";
01952 
01953     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01954        getLocalLength () != A.getLocalLength (), std::runtime_error,
01955        ": MultiVectors do not have the same local length.  "
01956        "this->getLocalLength() = " << getLocalLength ()
01957        << " != A.getLocalLength() = " << A.getLocalLength () << ".");
01958     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
01959       A.getNumVectors () != this->getNumVectors (), std::runtime_error,
01960       ": MultiVectors do not have the same number of columns (vectors).  "
01961        "this->getNumVectors() = " << getNumVectors ()
01962        << " != A.getNumVectors() = " << A.getNumVectors () << ".");
01963 
01964     const size_t numVecs = getNumVectors ();
01965     try {
01966       if (isConstantStride () && A.isConstantStride ()) {
01967         view_.template sync<DeviceType> ();
01968         view_.template modify<DeviceType> ();
01969         Kokkos::MV_Reciprocal (view_.d_view, A.view_.d_view);
01970       }
01971       else {
01972         using Kokkos::ALL;
01973         using Kokkos::subview;
01974         typedef Kokkos::View<Scalar*, DeviceType> view_type;
01975 
01976         view_.template sync<DeviceType> ();
01977         view_.template modify<DeviceType> ();
01978 
01979         // FIXME (mfh 23 Jul 2014) I'm not sure if it should be our
01980         // responsibility to sync A.
01981         A.view_.template sync<DeviceType> ();
01982         A.view_.template modify<DeviceType> ();
01983 
01984         for (size_t k = 0; k < numVecs; ++k) {
01985           const size_t this_col = isConstantStride () ? k : whichVectors_[k];
01986           view_type vector_k = subview<view_type> (view_.d_view, ALL (), this_col);
01987           const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
01988           view_type vector_Ak = subview<view_type> (A.view_.d_view, ALL (), A_col);
01989           Kokkos::V_Reciprocal(vector_k, vector_Ak);
01990         }
01991       }
01992     }
01993     catch (std::runtime_error &e) {
01994       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(true, std::runtime_error,
01995         ": Caught exception from Kokkos: " << e.what () << std::endl);
01996     }
01997   }
01998 
01999 
02000   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02001   void
02002   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02003   abs (const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >& A)
02004   {
02005     const char tfecfFuncName[] = "abs";
02006     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02007        getLocalLength () != A.getLocalLength (), std::runtime_error,
02008        ": MultiVectors do not have the same local length.  "
02009        "this->getLocalLength() = " << getLocalLength ()
02010        << " != A.getLocalLength() = " << A.getLocalLength () << ".");
02011     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02012       A.getNumVectors () != this->getNumVectors (), std::runtime_error,
02013       ": MultiVectors do not have the same number of columns (vectors).  "
02014        "this->getNumVectors() = " << getNumVectors ()
02015        << " != A.getNumVectors() = " << A.getNumVectors () << ".");
02016 
02017     const size_t numVecs = getNumVectors ();
02018     if (isConstantStride () && A.isConstantStride ()) {
02019       view_.template sync<DeviceType>();
02020       view_.template modify<DeviceType>();
02021       Kokkos::MV_Abs(view_.d_view,A.view_.d_view);
02022     }
02023     else {
02024       using Kokkos::ALL;
02025       using Kokkos::subview;
02026       typedef Kokkos::View<Scalar*, DeviceType> view_type;
02027 
02028       view_.template sync<DeviceType> ();
02029       view_.template modify<DeviceType> ();
02030 
02031       // FIXME (mfh 23 Jul 2014) I'm not sure if it should be our
02032       // responsibility to sync A.
02033       A.view_.template sync<DeviceType> ();
02034       A.view_.template modify<DeviceType> ();
02035 
02036       for (size_t k=0; k < numVecs; ++k) {
02037         const size_t this_col = isConstantStride () ? k : whichVectors_[k];
02038         view_type vector_k = subview<view_type> (view_.d_view, ALL (), this_col);
02039         const size_t A_col = isConstantStride () ? k : A.whichVectors_[k];
02040         view_type vector_Ak = subview<view_type> (A.view_.d_view, ALL (), A_col);
02041         Kokkos::V_Abs(vector_k, vector_Ak);
02042       }
02043     }
02044   }
02045 
02046 
02047   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02048   void
02049   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02050   update (const Scalar& alpha,
02051           const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >& A,
02052           const Scalar& beta)
02053   {
02054     using Kokkos::ALL;
02055     using Kokkos::subview;
02056     typedef Kokkos::View<Scalar*, DeviceType> view_type;
02057     const char tfecfFuncName[] = "update";
02058 
02059     // this = beta*this + alpha*A
02060     // must support case where &this == &A
02061     // can't short circuit on alpha==0.0 or beta==0.0, because 0.0*NaN != 0.0
02062 
02063     // FIXME (mfh 23 Jul 2014) Actually, the BLAS' AXPY allows short
02064     // circuiting for alpha == 0.  This is the convention for both the
02065     // dense and sparse BLAS.
02066 
02067     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02068       getLocalLength () != A.getLocalLength (), std::invalid_argument,
02069       ": The input MultiVector A has " << A.getLocalLength () << " local "
02070       "row(s), but this MultiVector has " << getLocalLength () << " local "
02071       "row(s).");
02072 
02073     const size_t numVecs = getNumVectors ();
02074     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02075       A.getNumVectors () != numVecs, std::invalid_argument,
02076       ": The input MultiVector A has " << A.getNumVectors () << " column(s), "
02077       "but this MultiVector has " << numVecs << " column(s).");
02078 
02079     const size_t lclNumRows = getLocalLength ();
02080     if (isConstantStride () && A.isConstantStride ()) {
02081       Kokkos::MV_Add (view_.d_view, alpha, A.view_.d_view, beta,
02082                       view_.d_view, lclNumRows);
02083     }
02084     else {
02085       // Make sure that Kokkos only uses the local length for add.
02086       const std::pair<size_t, size_t> rowRng (0, lclNumRows);
02087       for (size_t k = 0; k < numVecs; ++k) {
02088         const size_t this_col = isConstantStride () ? k : whichVectors_[k];
02089         const size_t A_col = A.isConstantStride () ? k : A.whichVectors_[k];
02090         view_type this_colView =
02091           subview<view_type> (view_.d_view, rowRng, this_col);
02092         view_type A_colView =
02093           subview<view_type> (A.view_.d_view, rowRng, A_col);
02094         Kokkos::V_Add (this_colView, alpha, A_colView, beta, this_colView);
02095       }
02096     }
02097   }
02098 
02099   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02100   void
02101   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02102   update (const Scalar& alpha,
02103           const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >& A,
02104           const Scalar& beta,
02105           const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >& B,
02106           const Scalar& gamma)
02107   {
02108     using Kokkos::ALL;
02109     using Kokkos::V_Add;
02110     using Kokkos::subview;
02111     typedef Kokkos::View<Scalar*, DeviceType> view_type;
02112     const char tfecfFuncName[] = "update";
02113 
02114     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02115       getLocalLength () != A.getLocalLength (), std::invalid_argument,
02116       ": The input MultiVector A has " << A.getLocalLength () << " local "
02117       "row(s), but this MultiVector has " << getLocalLength () << " local "
02118       "row(s).");
02119     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02120       getLocalLength () != B.getLocalLength (), std::invalid_argument,
02121       ": The input MultiVector B has " << B.getLocalLength () << " local "
02122       "row(s), but this MultiVector has " << getLocalLength () << " local "
02123       "row(s).");
02124     const size_t numVecs = getNumVectors ();
02125     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02126       A.getNumVectors () != numVecs, std::invalid_argument,
02127       ": The input MultiVector A has " << A.getNumVectors () << " column(s), "
02128       "but this MultiVector has " << numVecs << " column(s).");
02129     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02130       B.getNumVectors () != numVecs, std::invalid_argument,
02131       ": The input MultiVector B has " << B.getNumVectors () << " column(s), "
02132       "but this MultiVector has " << numVecs << " column(s).");
02133 
02134     const Scalar zero = Teuchos::ScalarTraits<Scalar>::zero ();
02135     const Scalar one = Teuchos::ScalarTraits<Scalar>::one ();
02136 
02137     if (isConstantStride() && A.isConstantStride () && B.isConstantStride ()) {
02138       if (gamma == zero) {
02139         Kokkos::MV_Add (view_.d_view, alpha, A.view_.d_view, beta,
02140                         B.view_.d_view);
02141       } else {
02142         Kokkos::MV_Add (view_.d_view, alpha, A.view_.d_view, gamma, view_.d_view);
02143         Kokkos::MV_Add (view_.d_view, beta, B.view_.d_view, one, view_.d_view);
02144       }
02145     } else { // some input (or *this) is not constant stride
02146 
02147       // Make sure that Kokkos only uses the local length for add.
02148       const std::pair<size_t, size_t> rowRng (0, getLocalLength ());
02149 
02150       for (size_t k = 0; k < numVecs; ++k) {
02151         const size_t this_col = isConstantStride () ? k : whichVectors_[k];
02152         const size_t A_col = A.isConstantStride () ? k : A.whichVectors_[k];
02153         const size_t B_col = B.isConstantStride () ? k : B.whichVectors_[k];
02154         if (gamma == zero) {
02155           // TODO: make sure it only uses LocalLength for add.
02156           V_Add (subview<view_type> (view_.d_view, rowRng, this_col),
02157                  alpha,
02158                  subview<view_type> (A.view_.d_view, rowRng, A_col),
02159                  beta,
02160                  subview<view_type> (B.view_.d_view, rowRng, B_col));
02161         } else {
02162           V_Add (subview<view_type> (view_.d_view, rowRng, this_col),
02163                  alpha,
02164                  subview<view_type> (A.view_.d_view, rowRng, A_col),
02165                  gamma,
02166                  subview<view_type> (view_.d_view, rowRng, this_col));
02167           V_Add (subview<view_type> (view_.d_view, rowRng, this_col),
02168                  beta,
02169                  subview<view_type> (B.view_.d_view, rowRng, B_col),
02170                  one,
02171                  subview<view_type> (view_.d_view, rowRng, this_col));
02172         }
02173       }
02174     }
02175   }
02176 
02177   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02178   Teuchos::ArrayRCP<const Scalar>
02179   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02180   getData (size_t j) const
02181   {
02182     using Kokkos::ALL;
02183     using Kokkos::subview;
02184     typedef typename dual_view_type::host_mirror_device_type host_type;
02185     typedef typename dual_view_type::t_host host_view_type;
02186 
02187     // Any MultiVector method that called the (classic) Kokkos Node's
02188     // viewBuffer or viewBufferNonConst methods always implied a
02189     // device->host synchronization.  Thus, we synchronize here as
02190     // well.
02191     view_.template sync<host_type> ();
02192 
02193     // Get a host view of the entire MultiVector's data.
02194     host_view_type hostView = view_.template view<host_type> ();
02195     // Get a subview of column j.
02196     host_view_type hostView_j;
02197     if (isConstantStride ()) {
02198       hostView_j = subview<host_view_type> (hostView, ALL (), j);
02199     } else {
02200       hostView_j = subview<host_view_type> (hostView, ALL (), whichVectors_[j]);
02201     }
02202 
02203     // Wrap up the subview of column j in an ArrayRCP<const scalar_type>.
02204     Teuchos::ArrayRCP<scalar_type> dataAsArcp =
02205       Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
02206 
02207 #ifdef HAVE_TPETRA_DEBUG
02208     TEUCHOS_TEST_FOR_EXCEPTION(
02209       hostView_j.dimension_0 () < dataAsArcp.size (), std::logic_error,
02210       "Tpetra::MultiVector::getData: hostView_j.dimension_0() = "
02211       << hostView_j.dimension_0 () << " < dataAsArcp.size() = "
02212       << dataAsArcp.size () << ".  "
02213       "Please report this bug to the Tpetra developers.");
02214     TEUCHOS_TEST_FOR_EXCEPTION(
02215       getLocalLength () != 0 && getNumVectors () != 0 &&
02216       hostView_j(0,0) != dataAsArcp[0], std::logic_error, "Tpetra::MultiVector"
02217       "::getData: hostView_j(0,0) = " << hostView_j(0,0) << " != dataAsArcp[0]"
02218       " = " << dataAsArcp[0] << ".  Please report this bug to the Tpetra "
02219       "developers.");
02220 #endif // HAVE_TPETRA_DEBUG
02221 
02222     return Teuchos::arcp_const_cast<const scalar_type> (dataAsArcp);
02223   }
02224 
02225   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02226   Teuchos::ArrayRCP<Scalar>
02227   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02228   getDataNonConst (size_t j)
02229   {
02230     using Kokkos::ALL;
02231     using Kokkos::subview;
02232     typedef typename dual_view_type::host_mirror_device_type host_type;
02233     typedef typename dual_view_type::t_host host_view_type;
02234 
02235     // Any MultiVector method that called the (classic) Kokkos Node's
02236     // viewBuffer or viewBufferNonConst methods always implied a
02237     // device->host synchronization.  Thus, we synchronize here as
02238     // well.
02239     view_.template sync<host_type> ();
02240 
02241     // Get a host view of the entire MultiVector's data.
02242     host_view_type hostView = view_.template view<host_type> ();
02243     // Get a subview of column j.
02244     host_view_type hostView_j;
02245     if (isConstantStride ()) {
02246       hostView_j = subview<host_view_type> (hostView, ALL (), j);
02247     } else {
02248       hostView_j = subview<host_view_type> (hostView, ALL (), whichVectors_[j]);
02249     }
02250 
02251     // Calling getDataNonConst() implies that the user plans to modify
02252     // the values in the MultiVector, so we call modify on the view
02253     // here.
02254     view_.template modify<host_type> ();
02255 
02256     // Wrap up the subview of column j in an ArrayRCP<const scalar_type>.
02257     Teuchos::ArrayRCP<scalar_type> dataAsArcp =
02258       Kokkos::Compat::persistingView (hostView_j, 0, getLocalLength ());
02259 
02260 #ifdef HAVE_TPETRA_DEBUG
02261     TEUCHOS_TEST_FOR_EXCEPTION(
02262       hostView_j.dimension_0 () < dataAsArcp.size (), std::logic_error,
02263       "Tpetra::MultiVector::getDataNonConst: hostView_j.dimension_0() = "
02264       << hostView_j.dimension_0 () << " < dataAsArcp.size() = "
02265       << dataAsArcp.size () << ".  "
02266       "Please report this bug to the Tpetra developers.");
02267     TEUCHOS_TEST_FOR_EXCEPTION(
02268       getLocalLength () != 0 && getNumVectors () != 0 &&
02269       hostView_j(0,0) != dataAsArcp[0], std::logic_error, "Tpetra::MultiVector"
02270       "::getDataNonConst: hostView_j(0,0) = " << hostView_j(0,0) << " != "
02271       "dataAsArcp[0] = " << dataAsArcp[0] << ".  Please report this bug to the "
02272       "Tpetra developers.");
02273 #endif // HAVE_TPETRA_DEBUG
02274 
02275     return dataAsArcp;
02276   }
02277 
02278   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02279   MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >&
02280   MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02281   operator= (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >& source)
02282   {
02283     // KR FIXME This is just the assignment operator.
02284 
02285     if (this != &source) {
02286       base_type::operator= (source);
02287       //
02288       // operator= implements view semantics (shallow copy).
02289       //
02290 
02291       // OK: Kokkos::View operator= also implements view semantics.
02292       view_ = source.view_;
02293       origView_ = source.origView_;
02294 
02295       // NOTE (mfh 24 Mar 2014) Christian wrote here that assigning
02296       // whichVectors_ is "probably not ok" (probably constitutes deep
02297       // copy).  I would say that it's OK, because whichVectors_ is
02298       // immutable (from the user's perspective); it's analogous to
02299       // the dimensions or stride.  Once we make whichVectors_ a
02300       // Kokkos::View instead of a Teuchos::Array, all debate will go
02301       // away and we will unquestionably have view semantics.
02302       whichVectors_ = source.whichVectors_;
02303     }
02304     return *this;
02305   }
02306 
02307 
02308   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02309   Teuchos::RCP<MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >
02310   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02311   subCopy (const Teuchos::ArrayView<const size_t>& cols) const
02312   {
02313     using Teuchos::RCP;
02314     using Teuchos::rcp;
02315     typedef typename dual_view_type::host_mirror_device_type
02316       host_mirror_device_type;
02317     typedef typename dual_view_type::t_host host_view_type;
02318     typedef MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, node_type> MV;
02319 
02320     // Check whether the index set in cols is contiguous.  If it is,
02321     // use the more efficient Range1D version of subCopy.
02322     {
02323       bool contiguous = true;
02324       const size_t numCopyVecs = static_cast<size_t> (cols.size ());
02325       for (size_t j = 1; j < numCopyVecs; ++j) {
02326         if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
02327           contiguous = false;
02328           break;
02329         }
02330       }
02331       if (contiguous && numCopyVecs > 0) {
02332         return this->subCopy (Teuchos::Range1D (cols[0], cols[numCopyVecs-1]));
02333       }
02334     }
02335 
02336     // Sync the source MultiVector (*this) to host first.  Copy it to
02337     // the output View on host, then sync the output View (only) to
02338     // device.  Doing copies on host saves us the trouble of copying
02339     // whichVecsSrc and whichVecsDst over to the device.
02340     view_.template sync<host_mirror_device_type> ();
02341 
02342     const size_t numRows = this->getLocalLength ();
02343     const size_t numCols = this->getNumVectors ();
02344     const std::pair<size_t, size_t> rowRange (0, numRows);
02345     const std::pair<size_t, size_t> colRange (0, numCols);
02346     const size_t numColsToCopy = static_cast<size_t> (cols.size ());
02347 
02348     // Create a DualView which will be a contiguously stored deep copy of this MV's view.
02349     dual_view_type dstView ("MV::dual_view", numRows, numColsToCopy);
02350     Kokkos::View<LocalOrdinal*, host_mirror_device_type> whichVecsDst ("whichVecsDst", numColsToCopy);
02351     Kokkos::View<LocalOrdinal*, host_mirror_device_type> whichVecsSrc ("whichVecsSrc", numColsToCopy);
02352 
02353     if (! this->isConstantStride ()) {
02354       for (size_t j = 0; j < numColsToCopy; ++j) {
02355         whichVecsSrc(j) = static_cast<LocalOrdinal> (this->whichVectors_[cols[j]]);
02356       }
02357     }
02358     else {
02359       for (size_t j = 0; j < numColsToCopy; ++j) {
02360         whichVecsSrc(j) = static_cast<LocalOrdinal> (cols[j]);
02361       }
02362     }
02363     for (size_t j = 0; j < numColsToCopy; ++j) {
02364       whichVecsDst(j) = static_cast<LocalOrdinal> (j);
02365     }
02366 
02367     //
02368     // Do the copy on host first.
02369     //
02370     host_view_type srcView =
02371       Kokkos::subview<host_view_type> (view_.h_view, rowRange, colRange);
02372     DeepCopySelectedVectors<host_view_type, host_view_type, LocalOrdinal,
02373       host_mirror_device_type, false, false> f (dstView.h_view, srcView,
02374                                                 whichVecsDst, whichVecsSrc);
02375     Kokkos::parallel_for (numRows, f);
02376 
02377     // Sync the output DualView (only) back to device.
02378     dstView.template sync<device_type> ();
02379 
02380     // Create and return a MultiVector using the new DualView.
02381     return rcp (new MV (this->getMap (), dstView));
02382   }
02383 
02384 
02385   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02386   Teuchos::RCP<MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >
02387   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02388   subCopy (const Teuchos::Range1D &colRng) const
02389   {
02390     using Teuchos::RCP;
02391     using Teuchos::rcp;
02392     typedef typename dual_view_type::host_mirror_device_type
02393       host_mirror_device_type;
02394     typedef typename dual_view_type::t_host host_view_type;
02395     typedef MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, node_type> MV;
02396 
02397     // Sync the source MultiVector (*this) to host first.  Copy it to
02398     // the output View on host, then sync the output View (only) to
02399     // device.  Doing copies on host saves us the trouble of copying
02400     // whichVecsSrc and whichVecsDst over to the device.
02401     view_.template sync<host_mirror_device_type> ();
02402 
02403     const size_t numRows = this->getLocalLength ();
02404     const size_t numCols = this->getNumVectors ();
02405     const std::pair<size_t, size_t> rowRange (0, numRows);
02406     const std::pair<size_t, size_t> colRange (0, numCols);
02407 
02408     // Range1D is an inclusive range.  If the upper bound is less then
02409     // the lower bound, that signifies an invalid range, which we
02410     // interpret as "copy zero columns."
02411     const size_t numColsToCopy = (colRng.ubound () >= colRng.lbound ()) ?
02412       static_cast<size_t> (colRng.size ()) :
02413       static_cast<size_t> (0);
02414 
02415     // Create a DualView which will be a contiguously stored deep copy of this MV's view.
02416     dual_view_type dstView ("MV::dual_view", numRows, numColsToCopy);
02417     Kokkos::View<LocalOrdinal*, host_mirror_device_type> whichVecsDst ("whichVecsDst", numColsToCopy);
02418     Kokkos::View<LocalOrdinal*, host_mirror_device_type> whichVecsSrc ("whichVecsSrc", numColsToCopy);
02419 
02420     if (! this->isConstantStride ()) {
02421       for (size_t j = 0; j < numColsToCopy; ++j) {
02422         const size_t col = static_cast<size_t> (colRng.lbound ()) + j;
02423         whichVecsSrc(j) = static_cast<LocalOrdinal> (this->whichVectors_[col]);
02424       }
02425     }
02426     else {
02427       for (size_t j = 0; j < numColsToCopy; ++j) {
02428         const size_t col = static_cast<size_t> (colRng.lbound ()) + j;
02429         whichVecsSrc(j) = static_cast<LocalOrdinal> (col);
02430       }
02431     }
02432     for (size_t j = 0; j < numColsToCopy; ++j) {
02433       whichVecsDst(j) = static_cast<LocalOrdinal> (j);
02434     }
02435 
02436     //
02437     // Do the copy on host first.
02438     //
02439     // FIXME (mfh 10 Jul 2014) Exploit contiguity of the desired columns.
02440     host_view_type srcView =
02441       Kokkos::subview<host_view_type> (view_.h_view, rowRange, colRange);
02442     DeepCopySelectedVectors<host_view_type, host_view_type, LocalOrdinal,
02443       host_mirror_device_type, false, false> f (dstView.h_view, srcView,
02444                                                 whichVecsDst, whichVecsSrc);
02445     Kokkos::parallel_for (numRows, f);
02446 
02447     // Sync the output DualView (only) back to device.
02448     dstView.template sync<device_type> ();
02449 
02450     // Create and return a MultiVector using the new DualView.
02451     return rcp (new MV (this->getMap (), dstView));
02452   }
02453 
02454   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02455   size_t
02456   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02457   getOrigNumLocalRows () const {
02458     return origView_.dimension_0 ();
02459   }
02460 
02461   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02462   size_t
02463   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02464   getOrigNumLocalCols () const {
02465     return origView_.dimension_1 ();
02466   }
02467 
02468   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02469   Teuchos::RCP<const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >
02470   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02471   offsetView (const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& subMap,
02472               size_t offset) const
02473   {
02474     using Kokkos::ALL;
02475     using Kokkos::subview;
02476     using Teuchos::RCP;
02477     using Teuchos::rcp;
02478     typedef MultiVector<Scalar, LocalOrdinal, GlobalOrdinal,
02479       Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > MV;
02480 
02481     const size_t newNumRows = subMap->getNodeNumElements ();
02482     const bool tooManyElts = newNumRows + offset > this->getOrigNumLocalRows ();
02483     if (tooManyElts) {
02484       const int myRank = this->getMap ()->getComm ()->getRank ();
02485       TEUCHOS_TEST_FOR_EXCEPTION(
02486         newNumRows + offset > this->getLocalLength (), std::runtime_error,
02487         "Tpetra::MultiVector::offsetView(NonConst): Invalid input Map.  The "
02488         "input Map owns " << newNumRows << " entries on process " << myRank <<
02489         ".  offset = " << offset << ".  Yet, the MultiVector contains only "
02490         << this->getOrigNumLocalRows () << " rows on this process.");
02491     }
02492 
02493 #ifdef HAVE_TPETRA_DEBUG
02494     // Ensure that the returned MultiVector has the same stride.
02495     const size_t strideBefore = isConstantStride () ? getStride () : static_cast<size_t> (0);
02496 #endif // HAVE_TPETRA_DEBUG
02497 
02498     const std::pair<size_t, size_t> offsetPair (offset, offset + newNumRows);
02499     // FIXME (mfh 10 May 2014) Use of origView_ instead of view_ for
02500     // the second argument may be wrong, if view_ resulted from a
02501     // previous call to offsetView with offset != 0.
02502     dual_view_type newView =
02503       subview<dual_view_type> (origView_, offsetPair, ALL ());
02504     RCP<const MV> subViewMV;
02505     if (isConstantStride ()) {
02506       subViewMV = rcp (new MV (subMap, newView, origView_));
02507 
02508     }
02509     else {
02510       subViewMV = rcp (new MV (subMap, newView, origView_, whichVectors_ ()));
02511     }
02512 
02513 #ifdef HAVE_TPETRA_DEBUG
02514     const size_t strideAfter = subViewMV->isConstantStride () ?
02515       subViewMV->getStride () : static_cast<size_t> (0);
02516 
02517     KokkosClassic::MultiVector<Scalar, node_type> X_lcl = this->getLocalMV ();
02518     KokkosClassic::MultiVector<Scalar, node_type> Y_lcl = subViewMV->getLocalMV ();
02519 
02520     if (strideBefore != strideAfter ||
02521         (offset == 0 && X_lcl.getValues ().getRawPtr () != Y_lcl.getValues ().getRawPtr ()) ||
02522         X_lcl.getNumCols () != Y_lcl.getNumCols () ||
02523         X_lcl.getStride () != Y_lcl.getStride () ||
02524         X_lcl.getNumRows () != this->getMap ()->getNodeNumElements () ||
02525         Y_lcl.getNumRows () != subMap->getNodeNumElements ()) {
02526       using std::endl;
02527       std::ostringstream os;
02528       os << "Tpetra::MultiVector::offsetView: Something is wrong." << endl
02529          << "strideBefore = " << strideBefore
02530          << ", strideAfter = " << strideAfter
02531          << endl
02532          << "Raw pointers of KokkosClassic::MultiVector objects equal? "
02533          << ((X_lcl.getValues ().getRawPtr () != Y_lcl.getValues ().getRawPtr ()) ? "true" : "false")
02534          << endl
02535          << "Dimensions of KokkosClassic::MultiVector objects: "
02536          << "[" << X_lcl.getNumRows () << ", " << X_lcl.getNumCols () << "], "
02537          << "[" << Y_lcl.getNumRows () << ", " << Y_lcl.getNumCols () << "]"
02538          << endl
02539          << "Strides of KokkosClassic::MultiVector objects: "
02540          << X_lcl.getStride () << ", " << Y_lcl.getStride () << endl
02541          << "this->getMap()->getNodeNumElements() = "
02542          << this->getMap ()->getNodeNumElements () << endl
02543          << "subMap->getNodeNumElements() = "
02544          << subMap->getNodeNumElements () << endl
02545          << "Please report this bug to the Tpetra developers.";
02546       TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, os.str ());
02547     }
02548 #endif // HAVE_TPETRA_DEBUG
02549 
02550     return subViewMV;
02551   }
02552 
02553 
02554   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02555   Teuchos::RCP<MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >
02556   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02557   offsetViewNonConst (const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Node> >& subMap,
02558                       size_t offset)
02559   {
02560     typedef MultiVector<Scalar, LocalOrdinal, GlobalOrdinal,
02561       Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > MV;
02562     return Teuchos::rcp_const_cast<MV> (this->offsetView (subMap, offset));
02563   }
02564 
02565 
02566   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02567   Teuchos::RCP<const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >
02568   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02569   subView (const ArrayView<const size_t>& cols) const
02570   {
02571     using Teuchos::Array;
02572     using Teuchos::rcp;
02573     typedef MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, node_type> MV;
02574 
02575     const size_t numViewCols = static_cast<size_t> (cols.size ());
02576     TEUCHOS_TEST_FOR_EXCEPTION(
02577       numViewCols < 1, std::runtime_error, "Tpetra::MultiVector::subView"
02578       "(const Teuchos::ArrayView<const size_t>&): The input array cols must "
02579       "contain at least one entry, but cols.size() = " << cols.size ()
02580       << " == 0.");
02581 
02582     // Check whether the index set in cols is contiguous.  If it is,
02583     // use the more efficient Range1D version of subView.
02584     bool contiguous = true;
02585     for (size_t j = 1; j < numViewCols; ++j) {
02586       if (cols[j] != cols[j-1] + static_cast<size_t> (1)) {
02587         contiguous = false;
02588         break;
02589       }
02590     }
02591     if (contiguous) {
02592       if (numViewCols == 0) {
02593         // The output MV has no columns, so there is nothing to view.
02594         return rcp (new MV (this->getMap (), numViewCols));
02595       } else {
02596         // Use the more efficient contiguous-index-range version.
02597         return this->subView (Teuchos::Range1D (cols[0], cols[numViewCols-1]));
02598       }
02599     }
02600 
02601     if (isConstantStride ()) {
02602       return rcp (new MV (this->getMap (), view_, origView_, cols));
02603     }
02604     else {
02605       Array<size_t> newcols (cols.size ());
02606       for (size_t j = 0; j < numViewCols; ++j) {
02607         newcols[j] = whichVectors_[cols[j]];
02608       }
02609       return rcp (new MV (this->getMap (), view_, origView_, newcols ()));
02610     }
02611   }
02612 
02613 
02614   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02615   Teuchos::RCP<const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >
02616   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02617   subView (const Teuchos::Range1D& colRng) const
02618   {
02619     using Kokkos::ALL;
02620     using Kokkos::subview;
02621     using Teuchos::Array;
02622     using Teuchos::rcp;
02623     typedef MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, node_type> MV;
02624     const char prefix[] = "Tpetra::MultiVector::subView(Range1D): ";
02625 
02626     TEUCHOS_TEST_FOR_EXCEPTION(
02627       colRng.size() == 0, std::runtime_error, prefix << "Range must include "
02628       "at least one vector.");
02629     TEUCHOS_TEST_FOR_EXCEPTION(
02630       static_cast<size_t> (colRng.size ()) > this->getNumVectors (),
02631       std::runtime_error, prefix << "colRng.size() = " << colRng.size ()
02632       << " > this->getNumVectors() = " << this->getNumVectors () << ".");
02633 
02634     // resulting MultiVector is constant stride only if *this is
02635     if (isConstantStride ()) {
02636       // view goes from first entry of first vector to last entry of last vector
02637       std::pair<size_t, size_t> cols (colRng.lbound (), colRng.ubound () + 1);
02638       return rcp (new MV (this->getMap (),
02639                           subview<dual_view_type> (view_, ALL (), cols),
02640                           origView_));
02641     }
02642     else {
02643       // otherwise, use a subset of this whichVectors_ to construct new multivector
02644       Array<size_t> which (whichVectors_.begin () + colRng.lbound (),
02645                            whichVectors_.begin () + colRng.ubound () + 1);
02646       return rcp (new MV (this->getMap (), view_, origView_, which));
02647     }
02648   }
02649 
02650 
02651   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02652   Teuchos::RCP<MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >
02653   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02654   subViewNonConst (const ArrayView<const size_t> &cols)
02655   {
02656     typedef MultiVector<Scalar, LocalOrdinal, GlobalOrdinal,
02657       Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > MV;
02658     return Teuchos::rcp_const_cast<MV> (this->subView (cols));
02659   }
02660 
02661 
02662   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02663   Teuchos::RCP<MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >
02664   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02665   subViewNonConst (const Teuchos::Range1D &colRng)
02666   {
02667     typedef MultiVector<Scalar, LocalOrdinal, GlobalOrdinal,
02668       Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > MV;
02669     return Teuchos::rcp_const_cast<MV> (this->subView (colRng));
02670   }
02671 
02672 
02673   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02674   Teuchos::RCP<const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >
02675   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02676   getVector (size_t j) const
02677   {
02678     using Kokkos::ALL;
02679     using Kokkos::subview;
02680     using Teuchos::rcp;
02681     // using Teuchos::rcp_const_cast;
02682     typedef Vector<Scalar, LocalOrdinal, GlobalOrdinal,
02683       Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > V;
02684 
02685 #ifdef HAVE_TPETRA_DEBUG
02686     TEUCHOS_TEST_FOR_EXCEPTION(
02687       vectorIndexOutOfRange(j), std::runtime_error, "Tpetra::MultiVector::"
02688       "getVector(NonConst): index j (== " << j << ") exceeds valid column "
02689       "range for this multivector.");
02690 #endif // HAVE_TPETRA_DEBUG
02691 
02692     // FIXME (mfh 10 May 2014) Why can't Kokkos take size_t instead of
02693     // unsigned int?
02694     const unsigned int jj = isConstantStride () ?
02695       static_cast<unsigned int> (j) :
02696       static_cast<unsigned int> (whichVectors_[j]);
02697     // FIXME (mfh 10 May 2014) The same issue shows up here that shows
02698     // up for offsetView.
02699     return rcp (new V (this->getMap (),
02700                        subview<dual_view_type> (view_, ALL (), jj),
02701                        origView_));
02702   }
02703 
02704 
02705   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02706   Teuchos::RCP<Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >
02707   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02708   getVectorNonConst (size_t j)
02709   {
02710     typedef Vector<Scalar, LocalOrdinal, GlobalOrdinal,
02711       Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > V;
02712     return Teuchos::rcp_const_cast<V> (this->getVector (j));
02713   }
02714 
02715 
02716   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02717   void
02718   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02719   get1dCopy (Teuchos::ArrayView<Scalar> A, size_t LDA) const
02720   {
02721     using Kokkos::ALL;
02722     using Kokkos::subview;
02723     typedef typename dual_view_type::host_mirror_device_type host_mirror_device_type;
02724     // The user's array is column major ("LayoutLeft").
02725     // typedef Kokkos::View<Scalar**, Kokkos::LayoutLeft,
02726     //   host_mirror_device_type, Kokkos::MemoryUnmanaged> input_view_type;
02727     typedef Kokkos::View<Scalar*, Kokkos::LayoutLeft,
02728       host_mirror_device_type, Kokkos::MemoryUnmanaged> input_col_type;
02729     typedef typename dual_view_type::t_host host_view_type;
02730     typedef Kokkos::View< Scalar*
02731                         , typename host_view_type::array_layout
02732                         , typename host_view_type::device_type
02733                         , Kokkos::MemoryUnmanaged > host_col_type ;
02734     const char tfecfFuncName[] = "get1dCopy";
02735 
02736     const size_t numRows = this->getLocalLength ();
02737     const size_t numCols = this->getNumVectors ();
02738     const std::pair<size_t, size_t> rowRange (0, numRows);
02739     const std::pair<size_t, size_t> colRange (0, numCols);
02740 
02741     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02742       LDA < numRows, std::runtime_error,
02743       ": LDA = " << LDA << " < numRows = " << numRows << ".");
02744     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02745       numRows > static_cast<size_t> (0) &&
02746       numCols > static_cast<size_t> (0) &&
02747       static_cast<size_t> (A.size ()) < LDA * (numCols - 1) + numRows,
02748       std::runtime_error,
02749       ": A.size() = " << A.size () << ", but its size must be at least "
02750       << LDA * (numCols - 1) * numRows << " to hold all the entries.");
02751 
02752     // This does a deep copy into A, which is column major with
02753     // leading dimension LDA.  We assume A is big enough to hold
02754     // everything.  Copy directly from host mirror of the data, if it
02755     // exists.
02756 
02757     // Start by sync'ing to host.
02758     view_.template sync<host_mirror_device_type> ();
02759 
02760     // FIXME (mfh 22 Jul 2014) These actually should be strided views.
02761     // This causes a run-time error with deep copy.  The temporary fix
02762     // is to copy one column at a time.
02763 
02764     //input_view_type dstWholeView (A.getRawPtr (), LDA, numCols);
02765     // input_view_type dstView =
02766     //   Kokkos::subview<input_view_type> (dstWholeView, rowRange, Kokkos::ALL ());
02767     // host_view_type srcView =
02768     //   Kokkos::subview<host_view_type> (view_.h_view, rowRange, Kokkos::ALL ());
02769 
02770     // if (this->isConstantStride ()) {
02771     //   Kokkos::deep_copy (dstView, srcView);
02772     // }
02773     // else {
02774     //   Kokkos::View<LocalOrdinal*, host_mirror_device_type> whichVecsDst ("whichVecsDst", numCols);
02775     //   Kokkos::View<LocalOrdinal*, host_mirror_device_type> whichVecsSrc ("whichVecsSrc", numCols);
02776     //   for (size_t j = 0; j < numCols; ++j) {
02777     //     whichVecsSrc(j) = static_cast<LocalOrdinal> (this->whichVectors_[j]);
02778     //     whichVecsDst(j) = static_cast<LocalOrdinal> (j);
02779     //   }
02780     //   DeepCopySelectedVectors<input_view_type, host_view_type, LocalOrdinal,
02781     //     host_mirror_device_type, false, false> f (dstView, srcView, whichVecsDst, whichVecsSrc);
02782     //   Kokkos::parallel_for (numRows, f);
02783     // }
02784 
02785     for (size_t j = 0; j < numCols; ++j) {
02786       const size_t srcCol = this->isConstantStride () ? j : this->whichVectors_[j];
02787       const size_t dstCol = j;
02788       Scalar* const dstColRaw = A.getRawPtr () + LDA * dstCol;
02789 
02790       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02791         static_cast<size_t> (A.size ()) < LDA * dstCol + numRows,
02792         std::logic_error, ": The input ArrayView A is not big enough to hold "
02793         "all the data.  Please report this bug to the Tpetra developers.");
02794 
02795       input_col_type dstColView (dstColRaw, numRows);
02796       host_col_type srcColView = subview<host_col_type> (view_.h_view, ALL (), srcCol);
02797 
02798       TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02799         dstColView.dimension_0 () != srcColView.dimension_0 (), std::logic_error,
02800         ": srcColView and dstColView have different dimensions.  "
02801         "Please report this bug to the Tpetra developers.");
02802 
02803       Kokkos::deep_copy (dstColView, srcColView);
02804     }
02805   }
02806 
02807 
02808   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02809   void
02810   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02811   get2dCopy (Teuchos::ArrayView<const Teuchos::ArrayView<Scalar> > ArrayOfPtrs) const
02812   {
02813     typedef typename dual_view_type::host_mirror_device_type
02814       host_mirror_device_type;
02815     typedef typename dual_view_type::t_host host_view_type;
02816     typedef Kokkos::View<Scalar**,
02817       typename host_view_type::array_layout,
02818       typename dual_view_type::host_mirror_device_type,
02819       Kokkos::MemoryUnmanaged> unmanaged_host_view_type;
02820 
02821     const char tfecfFuncName[] = "get2dCopy";
02822     const size_t numRows = this->getLocalLength ();
02823     const size_t numCols = this->getNumVectors ();
02824 
02825     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02826       static_cast<size_t> (ArrayOfPtrs.size ()) != numCols,
02827       std::runtime_error, ": Input array of pointers must contain as many "
02828       "entries (arrays) as the MultiVector has columns.  ArrayOfPtrs.size() = "
02829       << ArrayOfPtrs.size () << " != getNumVectors() = " << numCols << ".");
02830 
02831     if (numRows != 0 && numCols != 0) {
02832       // Start by sync'ing to host.
02833       view_.template sync<host_mirror_device_type> ();
02834 
02835       // No side effects until we've validated the input.
02836       for (size_t j = 0; j < numCols; ++j) {
02837         const size_t dstLen = static_cast<size_t> (ArrayOfPtrs[j].size ());
02838         TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
02839           dstLen < numRows, std::invalid_argument, ": Array j = " << j << " of "
02840           "the input array of arrays is not long enough to fit all entries in "
02841           "that column of the MultiVector.  ArrayOfPtrs[j].size() = " << dstLen
02842           << " < getLocalLength() = " << numRows << ".");
02843       }
02844 
02845       // We've validated the input, so it's safe to start copying.
02846       for (size_t j = 0; j < numCols; ++j) {
02847         const size_t col = isConstantStride () ? j : whichVectors_[j];
02848         host_view_type src =
02849           Kokkos::subview<host_view_type> (view_.h_view, Kokkos::ALL (), col);
02850         unmanaged_host_view_type dst (ArrayOfPtrs[j].getRawPtr (),
02851                                       ArrayOfPtrs[j].size (),
02852                                       static_cast<size_t> (1));
02853         Kokkos::deep_copy (dst, src);
02854       }
02855     }
02856   }
02857 
02858 
02859   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02860   Teuchos::ArrayRCP<const Scalar>
02861   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02862   get1dView () const
02863   {
02864     if (getLocalLength () == 0 || getNumVectors () == 0) {
02865       return Teuchos::null;
02866     } else {
02867       TEUCHOS_TEST_FOR_EXCEPTION(
02868         ! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
02869         "get1dView: This MultiVector does not have constant stride, so it is "
02870         "not possible to view its data as a single array.  You may check "
02871         "whether a MultiVector has constant stride by calling "
02872         "isConstantStride().");
02873       // NOTE (mfh 09 2014) get1dView() and get1dViewNonConst() have
02874       // always been device->host synchronization points.  We might
02875       // want to change this in the future.
02876       typedef typename dual_view_type::host_mirror_device_type host_type;
02877       view_.template sync<host_type> ();
02878       // Both get1dView() and get1dViewNonConst() return a host view
02879       // of the data.
02880       Teuchos::ArrayRCP<scalar_type> dataAsArcp =
02881         Kokkos::Compat::persistingView (view_.template view<host_type> ());
02882       return Teuchos::arcp_const_cast<const scalar_type> (dataAsArcp);
02883     }
02884   }
02885 
02886 
02887   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02888   Teuchos::ArrayRCP<Scalar>
02889   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02890   get1dViewNonConst ()
02891   {
02892     if (getLocalLength () == 0 || getNumVectors () == 0) {
02893       return Teuchos::null;
02894     } else {
02895       TEUCHOS_TEST_FOR_EXCEPTION(
02896         ! isConstantStride (), std::runtime_error, "Tpetra::MultiVector::"
02897         "get1dViewNonConst: This MultiVector does not have constant stride, so "
02898         "it is not possible to view its data as a single array.  You may check "
02899         "whether a MultiVector has constant stride by calling "
02900         "isConstantStride().");
02901       // NOTE (mfh 09 May 2014) get1dView() and get1dViewNonConst()
02902       // have always been device->host synchronization points.  We
02903       // might want to change this in the future.
02904       typedef typename dual_view_type::host_mirror_device_type host_type;
02905       view_.template sync<host_type> ();
02906       // Both get1dView() and get1dViewNonConst() return a host view
02907       // of the data.
02908       Teuchos::ArrayRCP<scalar_type> dataAsArcp =
02909         Kokkos::Compat::persistingView (view_.template view<host_type> ());
02910       return dataAsArcp;
02911     }
02912   }
02913 
02914 
02915   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02916   Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> >
02917   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02918   get2dViewNonConst ()
02919   {
02920     const size_t numCols = getNumVectors ();
02921     Teuchos::ArrayRCP<Teuchos::ArrayRCP<Scalar> > views (numCols);
02922     for (size_t j = 0; j < numCols; ++j) {
02923       const size_t col = isConstantStride () ? j : whichVectors_[j];
02924       dual_view_type X_col = Kokkos::subview<dual_view_type> (view_, Kokkos::ALL (), col);
02925       views[j] = Kokkos::Compat::persistingView (X_col.d_view);
02926     }
02927     return views;
02928   }
02929 
02930 
02931   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02932   Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> >
02933   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02934   get2dView () const
02935   {
02936     const size_t numCols = getNumVectors ();
02937     Teuchos::ArrayRCP<Teuchos::ArrayRCP<const Scalar> > views (numCols);
02938     for (size_t j = 0; j < numCols; ++j) {
02939       const size_t col = isConstantStride () ? j : whichVectors_[j];
02940       dual_view_type X_col = Kokkos::subview<dual_view_type> (view_, Kokkos::ALL (), col);
02941       views[j] = Kokkos::Compat::persistingView (X_col.d_view);
02942     }
02943     return views;
02944   }
02945 
02946 
02947   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
02948   void
02949   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
02950   multiply (Teuchos::ETransp transA,
02951             Teuchos::ETransp transB,
02952             const Scalar &alpha,
02953             const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >& A,
02954             const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >& B,
02955             const Scalar &beta)
02956   {
02957     using Teuchos::CONJ_TRANS;
02958     using Teuchos::NO_TRANS;
02959     using Teuchos::TRANS;
02960     using Teuchos::RCP;
02961     using Teuchos::rcp;
02962     using Teuchos::rcpFromRef;
02963     typedef Teuchos::ScalarTraits<Scalar> STS;
02964     typedef MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, node_type> MV;
02965     const char errPrefix[] = "Tpetra::MultiVector::multiply: ";
02966 
02967     // This routine performs a variety of matrix-matrix multiply
02968     // operations, interpreting the MultiVector (this-aka C , A and B)
02969     // as 2D matrices.  Variations are due to the fact that A, B and C
02970     // can be local replicated or global distributed MultiVectors and
02971     // that we may or may not operate with the transpose of A and B.
02972     // Possible cases are:
02973     //
02974     //     Operations                          # Cases  Notes
02975     //  1) C(local) = A^X(local) * B^X(local)  4        X=Trans or Not, no comm needed
02976     //  2) C(local) = A^T(distr) * B  (distr)  1        2-D dot product, replicate C
02977     //  3) C(distr) = A  (distr) * B^X(local)  2        2-D vector update, no comm needed
02978     //
02979     // The following operations are not meaningful for 1-D
02980     // distributions:
02981     //
02982     // u1) C(local) = A^T(distr) * B^T(distr)  1
02983     // u2) C(local) = A  (distr) * B^X(distr)  2
02984     // u3) C(distr) = A^X(local) * B^X(local)  4
02985     // u4) C(distr) = A^X(local) * B^X(distr)  4
02986     // u5) C(distr) = A^T(distr) * B^X(local)  2
02987     // u6) C(local) = A^X(distr) * B^X(local)  4
02988     // u7) C(distr) = A^X(distr) * B^X(local)  4
02989     // u8) C(local) = A^X(local) * B^X(distr)  4
02990     //
02991     // Total number of cases: 32 (= 2^5).
02992 
02993     TEUCHOS_TEST_FOR_EXCEPTION(
02994       STS::isComplex && (transA == TRANS || transB == TRANS),
02995       std::invalid_argument, errPrefix << "Transpose without conjugation "
02996       "(transA == TRANS || transB == TRANS) not supported for complex Scalar "
02997       "types.");
02998 
02999     transA = (transA == NO_TRANS ? NO_TRANS : CONJ_TRANS);
03000     transB = (transB == NO_TRANS ? NO_TRANS : CONJ_TRANS);
03001 
03002     // Compute effective dimensions, w.r.t. transpose operations on
03003     size_t A_nrows = (transA==CONJ_TRANS) ? A.getNumVectors() : A.getLocalLength();
03004     size_t A_ncols = (transA==CONJ_TRANS) ? A.getLocalLength() : A.getNumVectors();
03005     size_t B_nrows = (transB==CONJ_TRANS) ? B.getNumVectors() : B.getLocalLength();
03006     size_t B_ncols = (transB==CONJ_TRANS) ? B.getLocalLength() : B.getNumVectors();
03007 
03008     Scalar beta_local = beta; // local copy of beta; might be reassigned below
03009 
03010     TEUCHOS_TEST_FOR_EXCEPTION(
03011       getLocalLength () != A_nrows || getNumVectors () != B_ncols ||
03012       A_ncols != B_nrows, std::runtime_error, errPrefix << "Dimensions of "
03013       "*this, op(A), and op(B) must be consistent.  Local part of *this is "
03014       << getLocalLength() << " x " << getNumVectors()
03015       << ", A is " << A_nrows << " x " << A_ncols
03016       << ", and B is " << B_nrows << " x " << B_ncols << ".");
03017 
03018     const bool A_is_local = ! A.isDistributed ();
03019     const bool B_is_local = ! B.isDistributed ();
03020     const bool C_is_local = ! this->isDistributed ();
03021     // Case 1: C(local) = A^X(local) * B^X(local)
03022     const bool Case1 = C_is_local && A_is_local && B_is_local;
03023     // Case 2: C(local) = A^T(distr) * B  (distr)
03024     const bool Case2 = C_is_local && ! A_is_local && ! B_is_local &&
03025       transA == CONJ_TRANS && transB == NO_TRANS;
03026     // Case 3: C(distr) = A  (distr) * B^X(local)
03027     const bool Case3 = ! C_is_local && ! A_is_local && B_is_local &&
03028       transA == NO_TRANS;
03029 
03030     // Test that we are considering a meaningful case
03031     TEUCHOS_TEST_FOR_EXCEPTION(
03032       ! Case1 && ! Case2 && ! Case3, std::runtime_error, errPrefix
03033       << "Multiplication of op(A) and op(B) into *this is not a "
03034       "supported use case.");
03035 
03036     if (beta != STS::zero () && Case2) {
03037       // If Case2, then C is local and contributions must be summed
03038       // across all processes.  However, if beta != 0, then accumulate
03039       // beta*C into the sum.  When summing across all processes, we
03040       // only want to accumulate this once, so set beta == 0 on all
03041       // processes except Process 0.
03042       const int myRank = this->getMap ()->getComm ()->getRank ();
03043       if (myRank != 0) {
03044         beta_local = STS::zero ();
03045       }
03046     }
03047 
03048     // We only know how to do matrix-matrix multiplies if all the
03049     // MultiVectors have constant stride.  If not, we have to make
03050     // temporary copies of those MultiVectors (including possibly
03051     // *this) that don't have constant stride.
03052     RCP<MV> C_tmp;
03053     if (! isConstantStride ()) {
03054       C_tmp = rcp (new MV (*this, Teuchos::Copy)); // deep copy
03055     } else {
03056       C_tmp = rcp (this, false);
03057     }
03058 
03059     RCP<const MV> A_tmp;
03060     if (! A.isConstantStride ()) {
03061       A_tmp = rcp (new MV (A, Teuchos::Copy)); // deep copy
03062     } else {
03063       A_tmp = rcpFromRef (A);
03064     }
03065 
03066     RCP<const MV> B_tmp;
03067     if (! B.isConstantStride ()) {
03068       B_tmp = rcp (new MV (B, Teuchos::Copy)); // deep copy
03069     } else {
03070       B_tmp = rcpFromRef (B);
03071     }
03072 
03073     TEUCHOS_TEST_FOR_EXCEPTION(
03074       ! C_tmp->isConstantStride () || ! B_tmp->isConstantStride () ||
03075       ! A_tmp->isConstantStride (), std::logic_error, errPrefix
03076       << "Failed to make temporary constant-stride copies of MultiVectors.");
03077 
03078     typedef Kokkos::DeviceGEMM<Scalar,DeviceType> gemm_type;
03079 
03080     gemm_type::GEMM (transA, transB, alpha,
03081                      A_tmp->getDualView ().d_view, B_tmp->getDualView ().d_view,
03082                      beta_local, C_tmp->getDualView ().d_view);
03083     if (! isConstantStride ()) {
03084       deep_copy (*this, *C_tmp); // Copy the result back into *this.
03085     }
03086 
03087     // Dispose of (possibly) extra copies of A and B.
03088     A_tmp = Teuchos::null;
03089     B_tmp = Teuchos::null;
03090 
03091     // If Case 2 then sum up *this and distribute it to all processes.
03092     if (Case2) {
03093       this->reduce ();
03094     }
03095   }
03096 
03097   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03098   void
03099   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03100   elementWiseMultiply (Scalar scalarAB,
03101                        const Vector<Scalar,LocalOrdinal,GlobalOrdinal, node_type>& A,
03102                        const MultiVector<Scalar,LocalOrdinal,GlobalOrdinal, node_type>& B,
03103                        Scalar scalarThis)
03104   {
03105     using Kokkos::ALL;
03106     using Kokkos::subview;
03107     using Teuchos::arcp_const_cast;
03108     typedef Kokkos::View<Scalar*, Kokkos::LayoutLeft, DeviceType> view_type;
03109     const char tfecfFuncName[] = "elementWiseMultiply: ";
03110 
03111 #ifdef HAVE_TPETRA_DEBUG
03112     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
03113       getLocalLength() != A.getLocalLength() ||
03114       getLocalLength() != B.getLocalLength(), std::runtime_error,
03115       "MultiVectors do not have the same local length.");
03116 #endif // HAVE_TPETRA_DEBUG
03117     TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
03118       B.getNumVectors() != this->getNumVectors(), std::runtime_error,
03119       "MultiVectors 'this' and B must have the same number of vectors.");
03120 
03121     const size_t numVecs = this->getNumVectors ();
03122     if (isConstantStride () && A.isConstantStride ()) {
03123       // FIXME (mfh 02 Oct 2014) Shouldn't it be asking if B has
03124       // constant stride?  A is just a Vector; it only has one column,
03125       // so it always has constant stride.
03126       //
03127       // If both *this and A have constant stride, we can do an
03128       // element-wise multiply on all columns at once.
03129       view_.template sync<DeviceType>();
03130       view_.template modify<DeviceType>();
03131       A.view_.template sync<DeviceType>();
03132       A.view_.template modify<DeviceType>();
03133       B.view_.template sync<DeviceType>();
03134       B.view_.template modify<DeviceType>();
03135       view_type vector_A = subview<view_type> (A.view_.d_view, ALL (), 0);
03136       Kokkos::MV_ElementWiseMultiply (scalarThis, view_.d_view,
03137                                       scalarAB, vector_A, B.view_.d_view);
03138     }
03139     else {
03140       view_.template sync<DeviceType>();
03141       view_.template modify<DeviceType>();
03142       A.view_.template sync<DeviceType>();
03143       A.view_.template modify<DeviceType>();
03144       B.view_.template sync<DeviceType>();
03145       B.view_.template modify<DeviceType>();
03146       view_type vector_A = subview<view_type> (A.view_.d_view, ALL (), 0);
03147       for (size_t k = 0; k < numVecs; ++k) {
03148         const size_t this_col = isConstantStride () ? k : whichVectors_[k];
03149         view_type vector_k = subview<view_type> (view_.d_view, ALL (), this_col);
03150         const size_t B_col = isConstantStride () ? k : B.whichVectors_[k];
03151         view_type vector_Bk = subview<view_type> (B.view_.d_view, ALL (), B_col);
03152         Kokkos::V_ElementWiseMultiply (scalarThis, vector_k, scalarAB, vector_A, vector_Bk);
03153       }
03154     }
03155   }
03156 
03157   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03158   void
03159   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::reduce()
03160   {
03161     using Kokkos::ALL;
03162     using Kokkos::subview;
03163     using Teuchos::reduceAll;
03164     using Teuchos::REDUCE_SUM;
03165     typedef typename dual_view_type::t_dev device_view_type;
03166     typedef typename device_type::host_mirror_device_type host_mirror_device_type;
03167 
03168     TEUCHOS_TEST_FOR_EXCEPTION(
03169       this->isDistributed (), std::runtime_error,
03170       "Tpetra::MultiVector::reduce() should only be called with locally "
03171       "replicated or otherwise not distributed MultiVector objects.");
03172     const Teuchos::Comm<int>& comm = * (this->getMap ()->getComm ());
03173     if (comm.getSize () == 1) {
03174       return;
03175     }
03176 
03177     const size_t numLclRows = getLocalLength ();
03178     const size_t numCols = getNumVectors ();
03179 
03180     // FIXME (mfh 16 June 2014) This exception will cause deadlock if
03181     // it triggers on only some processes.  We don't have a good way
03182     // to pack this result into the all-reduce below, but this would
03183     // be a good reason to set a "local error flag" and find other
03184     // opportunities to let it propagate.
03185     TEUCHOS_TEST_FOR_EXCEPTION(
03186       numLclRows > static_cast<size_t> (std::numeric_limits<int>::max ()),
03187       std::runtime_error, "Tpetra::MultiVector::reduce: On Process " <<
03188       comm.getRank () << ", the number of local rows " << numLclRows <<
03189       " does not fit in int.");
03190 
03191     //
03192     // Use MPI to sum the entries across all local blocks.
03193     //
03194     // If this MultiVector's local data are stored contiguously, we
03195     // can use the local View as the source buffer in the
03196     // MPI_Allreduce.  Otherwise, we have to allocate a temporary
03197     // source buffer and pack.
03198     const bool contig = isConstantStride () && getStride () == numLclRows;
03199     device_view_type srcBuf;
03200     if (contig) {
03201       srcBuf = view_.d_view;
03202     }
03203     else {
03204       srcBuf = device_view_type ("srcBuf", numLclRows, numCols);
03205       Kokkos::deep_copy (srcBuf, view_.d_view);
03206     }
03207 
03208     // MPI requires that the send and receive buffers don't alias one
03209     // another, so we have to copy temporary storage for the result.
03210     //
03211     // We expect that MPI implementations will know how to read device
03212     // pointers.
03213     device_view_type tgtBuf ("tgtBuf", numLclRows, numCols);
03214 
03215     const int reduceCount = static_cast<int> (numLclRows * numCols);
03216     reduceAll<int, Scalar> (comm, REDUCE_SUM, reduceCount,
03217                             srcBuf.ptr_on_device (), tgtBuf.ptr_on_device ());
03218 
03219     // Tell the DualView that we plan to modify the device data.
03220     view_.template modify<device_type> ();
03221 
03222     const std::pair<size_t, size_t> lclRowRange (0, numLclRows);
03223     device_view_type d_view =
03224       subview<device_view_type> (view_.d_view, lclRowRange, ALL ());
03225 
03226     if (contig || isConstantStride ()) {
03227       Kokkos::deep_copy (d_view, tgtBuf);
03228     }
03229     else {
03230       for (size_t j = 0; j < numCols; ++j) {
03231         device_view_type d_view_j =
03232           subview<device_view_type> (d_view, ALL (), j);
03233         device_view_type tgtBuf_j =
03234           subview<device_view_type> (tgtBuf, ALL (), j);
03235         Kokkos::deep_copy (d_view_j, tgtBuf_j);
03236       }
03237     }
03238 
03239     // Synchronize the host with changes on the device.
03240     //
03241     // FIXME (mfh 16 June 2014) This raises the question of whether we
03242     // want to synchronize always.  Users will find it reassuring if
03243     // MultiVector methods always leave the MultiVector in a
03244     // synchronized state, but it seems silly to synchronize to host
03245     // if they hardly ever need host data.
03246     view_.template sync<host_mirror_device_type> ();
03247   }
03248 
03249 
03250   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03251   void
03252   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03253   replaceLocalValue (LocalOrdinal MyRow,
03254                      size_t VectorIndex,
03255                      const Scalar &ScalarValue)
03256   {
03257 #ifdef HAVE_TPETRA_DEBUG
03258     const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
03259     const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
03260     TEUCHOS_TEST_FOR_EXCEPTION(
03261       MyRow < minLocalIndex || MyRow > maxLocalIndex,
03262       std::runtime_error,
03263       "Tpetra::MultiVector::replaceLocalValue: row index " << MyRow
03264       << " is invalid.  The range of valid row indices on this process "
03265       << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
03266       << ", " << maxLocalIndex << "].");
03267     TEUCHOS_TEST_FOR_EXCEPTION(
03268       vectorIndexOutOfRange(VectorIndex),
03269       std::runtime_error,
03270       "Tpetra::MultiVector::replaceLocalValue: vector index " << VectorIndex
03271       << " of the multivector is invalid.");
03272 #endif
03273     const size_t colInd = isConstantStride () ?
03274       VectorIndex : whichVectors_[VectorIndex];
03275     view_.d_view (static_cast<size_t> (MyRow), colInd) = ScalarValue;
03276   }
03277 
03278 
03279   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03280   void
03281   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03282   sumIntoLocalValue (LocalOrdinal MyRow,
03283                      size_t VectorIndex,
03284                      const Scalar &ScalarValue)
03285   {
03286 #ifdef HAVE_TPETRA_DEBUG
03287     const LocalOrdinal minLocalIndex = this->getMap()->getMinLocalIndex();
03288     const LocalOrdinal maxLocalIndex = this->getMap()->getMaxLocalIndex();
03289     TEUCHOS_TEST_FOR_EXCEPTION(
03290       MyRow < minLocalIndex || MyRow > maxLocalIndex,
03291       std::runtime_error,
03292       "Tpetra::MultiVector::sumIntoLocalValue: row index " << MyRow
03293       << " is invalid.  The range of valid row indices on this process "
03294       << this->getMap()->getComm()->getRank() << " is [" << minLocalIndex
03295       << ", " << maxLocalIndex << "].");
03296     TEUCHOS_TEST_FOR_EXCEPTION(
03297       vectorIndexOutOfRange(VectorIndex),
03298       std::runtime_error,
03299       "Tpetra::MultiVector::sumIntoLocalValue: vector index " << VectorIndex
03300       << " of the multivector is invalid.");
03301 #endif
03302     const size_t colInd = isConstantStride () ?
03303       VectorIndex : whichVectors_[VectorIndex];
03304     view_.d_view (static_cast<size_t> (MyRow), colInd) += ScalarValue;
03305   }
03306 
03307 
03308   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03309   void
03310   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03311   replaceGlobalValue (GlobalOrdinal GlobalRow,
03312                       size_t VectorIndex,
03313                       const Scalar &ScalarValue)
03314   {
03315     LocalOrdinal MyRow = this->getMap()->getLocalElement(GlobalRow);
03316 #ifdef HAVE_TPETRA_DEBUG
03317     TEUCHOS_TEST_FOR_EXCEPTION(
03318       MyRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid(),
03319       std::runtime_error,
03320       "Tpetra::MultiVector::replaceGlobalValue: global row index " << GlobalRow
03321       << "is not present on this process " << this->getMap()->getComm()->getRank()
03322       << ".");
03323     TEUCHOS_TEST_FOR_EXCEPTION(
03324       vectorIndexOutOfRange(VectorIndex),
03325       std::runtime_error,
03326       "Tpetra::MultiVector::replaceGlobalValue: vector index " << VectorIndex
03327       << " of the multivector is invalid.");
03328 #endif
03329     replaceLocalValue (MyRow, VectorIndex, ScalarValue);
03330   }
03331 
03332   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03333   void
03334   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03335   sumIntoGlobalValue (GlobalOrdinal GlobalRow,
03336                       size_t VectorIndex,
03337                       const Scalar &ScalarValue)
03338   {
03339     LocalOrdinal MyRow = this->getMap()->getLocalElement(GlobalRow);
03340 #ifdef HAVE_TEUCHOS_DEBUG
03341     TEUCHOS_TEST_FOR_EXCEPTION(
03342       MyRow == Teuchos::OrdinalTraits<LocalOrdinal>::invalid(),
03343       std::runtime_error,
03344       "Tpetra::MultiVector::sumIntoGlobalValue: global row index " << GlobalRow
03345       << "is not present on this process " << this->getMap()->getComm()->getRank()
03346       << ".");
03347     TEUCHOS_TEST_FOR_EXCEPTION(
03348       vectorIndexOutOfRange(VectorIndex),
03349       std::runtime_error,
03350       "Tpetra::MultiVector::sumIntoGlobalValue: vector index " << VectorIndex
03351       << " of the multivector is invalid.");
03352 #endif
03353     sumIntoLocalValue (MyRow, VectorIndex, ScalarValue);
03354   }
03355 
03356   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03357   template <class T>
03358   Teuchos::ArrayRCP<T>
03359   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03360   getSubArrayRCP (Teuchos::ArrayRCP<T> arr,
03361                   size_t j) const
03362   {
03363     const size_t col = isConstantStride () ? j : whichVectors_[j];
03364     dual_view_type X_col = Kokkos::subview<dual_view_type> (view_, Kokkos::ALL (), col);
03365     return Kokkos::Compat::persistingView (X_col.d_view); // ????? host or device???
03366   }
03367 
03368   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03369   KokkosClassic::MultiVector<Scalar,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >
03370   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::getLocalMV () const
03371   {
03372     // KR Creates the KokkosClassic thing on the fly.
03373     // OK to leave this in place as backwards compat.
03374 
03375     KMV kmv (this->getMap ()->getNode ());
03376     ArrayRCP<Scalar> data = (getLocalLength() > 0) ?
03377       Kokkos::Compat::persistingView (view_.d_view) :
03378       Teuchos::null;
03379 
03380     size_t stride[8];
03381     origView_.stride (stride);
03382     const size_t LDA = origView_.dimension_1 () > 1 ? stride[1] : origView_.dimension_0 ();
03383     MVT::initializeValues (kmv, getLocalLength (), getNumVectors (),
03384                            data,
03385                            LDA,
03386                            getOrigNumLocalRows (),
03387                            getOrigNumLocalCols ());
03388     return kmv;
03389   }
03390 
03391   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03392   typename MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::dual_view_type
03393   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::getDualView() const {
03394     return view_;
03395   }
03396 
03397   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03398   TEUCHOS_DEPRECATED
03399   KokkosClassic::MultiVector<Scalar,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >
03400   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::getLocalMVNonConst() {
03401 
03402     KMV kmv (this->getMap ()->getNode ());
03403     ArrayRCP<Scalar> data = (getLocalLength() > 0) ?
03404       Kokkos::Compat::persistingView (view_.d_view) :
03405       Teuchos::null;
03406 
03407     size_t stride[8];
03408     origView_.stride (stride);
03409     const size_t LDA = origView_.dimension_1 () > 1 ? stride[1] : origView_.dimension_0 ();
03410     MVT::initializeValues (kmv, getLocalLength (), getNumVectors (),
03411                            data,
03412                            LDA,
03413                            getOrigNumLocalRows (),
03414                            getOrigNumLocalCols ());
03415     return kmv;
03416   }
03417 
03418   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03419   std::string
03420   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::description() const
03421   {
03422     using std::endl;
03423     std::ostringstream oss;
03424     oss << Teuchos::typeName (*this) << " {"
03425         << "label: \"" << this->getObjectLabel () << "\""
03426         << ", numRows: " << getGlobalLength ()
03427         << ", numCols: " << getNumVectors ()
03428         << ", isConstantStride: " << isConstantStride ();
03429     if (isConstantStride ()) {
03430       oss << ", columnStride: " << getStride ();
03431     }
03432     oss << "}";
03433     return oss.str();
03434   }
03435 
03436   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03437   void
03438   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03439   describe (Teuchos::FancyOStream &out,
03440             const Teuchos::EVerbosityLevel verbLevel) const
03441   {
03442     using Teuchos::ArrayRCP;
03443     using Teuchos::RCP;
03444     using Teuchos::VERB_DEFAULT;
03445     using Teuchos::VERB_NONE;
03446     using Teuchos::VERB_LOW;
03447     using Teuchos::VERB_MEDIUM;
03448     using Teuchos::VERB_HIGH;
03449     using Teuchos::VERB_EXTREME;
03450     using std::endl;
03451     using std::setw;
03452 
03453     // Set default verbosity if applicable.
03454     const Teuchos::EVerbosityLevel vl =
03455       (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
03456 
03457     RCP<const Teuchos::Comm<int> > comm = this->getMap()->getComm();
03458     const int myImageID = comm->getRank();
03459     const int numImages = comm->getSize();
03460 
03461     if (vl != VERB_NONE) {
03462       // Don't set the tab level unless we're printing something.
03463       Teuchos::OSTab tab0 (out);
03464 
03465       if (myImageID == 0) { // >= VERB_LOW prints description()
03466         out << "Tpetra::MultiVector:" << endl;
03467         Teuchos::OSTab tab1 (out);
03468         out << "Template parameters:" << endl;
03469         {
03470           Teuchos::OSTab tab2 (out);
03471           out << "Scalar: " << Teuchos::TypeNameTraits<Scalar>::name () << endl
03472               << "LocalOrdinal: " << Teuchos::TypeNameTraits<LocalOrdinal>::name () << endl
03473               << "GlobalOrdinal: " << Teuchos::TypeNameTraits<GlobalOrdinal>::name () << endl
03474               << "Node: " << Teuchos::TypeNameTraits<Node>::name () << endl;
03475         }
03476         out << "label: \"" << this->getObjectLabel () << "\"" << endl
03477             << "numRows: " << getGlobalLength () << endl
03478             << "numCols: " << getNumVectors () << endl
03479             << "isConstantStride: " << isConstantStride () << endl;
03480         if (isConstantStride ()) {
03481           out << "columnStride: " << getStride () << endl;
03482         }
03483       }
03484       for (int imageCtr = 0; imageCtr < numImages; ++imageCtr) {
03485         if (myImageID == imageCtr) {
03486           if (vl != VERB_LOW) {
03487             // At verbosity > VERB_LOW, each process prints something.
03488             out << "Process " << myImageID << ":" << endl;
03489             Teuchos::OSTab tab2 (out);
03490 
03491             // >= VERB_MEDIUM: print the local vector length.
03492             out << "local length: " << getLocalLength();
03493             if (vl != VERB_MEDIUM) {
03494               // >= VERB_HIGH: print isConstantStride() and getStride()
03495               if (isConstantStride()) {
03496                 out << "constant stride: " << getStride() << endl;
03497               }
03498               else {
03499                 out << "not constant stride" << endl;
03500               }
03501               if (vl == VERB_EXTREME) {
03502                 // VERB_EXTREME: print all the values in the multivector.
03503                 out << "values: " << endl;
03504                 ArrayRCP<ArrayRCP<const Scalar> > X = this->get2dView();
03505                 out << "[";
03506                 for (size_t i = 0; i < getLocalLength(); ++i) {
03507                   for (size_t j = 0; j < getNumVectors(); ++j) {
03508                     out << X[j][i];
03509                     if (j + 1 < getNumVectors()) {
03510                       out << ", ";
03511                     }
03512                   } // for each column
03513                   if (i + 1 < getLocalLength ()) {
03514                     out << "; ";
03515                   } else {
03516                     out << endl;
03517                   }
03518                 } // for each row
03519                 out << "]" << endl;
03520               } // if vl == VERB_EXTREME
03521             } // if (vl != VERB_MEDIUM)
03522             else { // vl == VERB_LOW
03523               out << endl;
03524             }
03525           } // if vl != VERB_LOW
03526         } // if it is my process' turn to print
03527         comm->barrier();
03528       } // for each process in the communicator
03529     } // if vl != VERB_NONE
03530   }
03531 
03532 #if TPETRA_USE_KOKKOS_DISTOBJECT
03533   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03534   void
03535   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03536   createViews() const
03537   {
03538     // Do nothing in Kokkos::View implementation
03539   }
03540 
03541   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03542   void
03543   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03544   createViewsNonConst (KokkosClassic::ReadWriteOption rwo)
03545   {
03546     // Do nothing in Kokkos::View implementation
03547   }
03548 
03549   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03550   void
03551   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::releaseViews () const
03552   {
03553     // Do nothing in Kokkos::View implementation
03554   }
03555 
03556 #else // NOT TPETRA_USE_KOKKOS_DISTOBJECT
03557 
03558   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03559   void
03560   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03561   createViews() const
03562   {}
03563 
03564   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03565   void
03566   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03567   createViewsNonConst (KokkosClassic::ReadWriteOption /* rwo */ )
03568   {}
03569 
03570   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03571   void
03572   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03573   releaseViews () const
03574   {}
03575 
03576 #endif // TPETRA_USE_KOKKOS_DISTOBJECT
03577 
03578   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03579   void
03580   MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03581   removeEmptyProcessesInPlace (const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& newMap)
03582   {
03583     replaceMap (newMap);
03584   }
03585 
03586   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03587   void
03588   MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >::
03589   assign (const MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >& src)
03590   {
03591     using Kokkos::parallel_for;
03592     typedef LocalOrdinal LO;
03593     typedef DeviceType DT;
03594     typedef typename DT::host_mirror_device_type HMDT;
03595     typedef typename dual_view_type::t_host host_view_type;
03596     typedef typename dual_view_type::t_dev dev_view_type;
03597 
03598     TEUCHOS_TEST_FOR_EXCEPTION(
03599       this->getGlobalLength () != src.getGlobalLength () ||
03600       this->getNumVectors () != src.getNumVectors (), std::invalid_argument,
03601       "Tpetra::deep_copy: Global dimensions of the two Tpetra::MultiVector "
03602       "objects do not match.  src has dimensions [" << src.getGlobalLength ()
03603       << "," << src.getNumVectors () << "], and *this has dimensions ["
03604       << this->getGlobalLength () << "," << this->getNumVectors () << "].");
03605     // FIXME (mfh 28 Jul 2014) Don't throw; just set a local error flag.
03606     TEUCHOS_TEST_FOR_EXCEPTION(
03607       this->getLocalLength () != src.getLocalLength (), std::invalid_argument,
03608       "Tpetra::deep_copy: The local row counts of the two Tpetra::MultiVector "
03609       "objects do not match.  src has " << src.getLocalLength () << " row(s) "
03610       << " and *this has " << this->getLocalLength () << " row(s).");
03611 
03612     if (src.isConstantStride () && this->isConstantStride ()) {
03613       Kokkos::deep_copy (this->getDualView (), src.getDualView ());
03614     }
03615     else {
03616       if (this->isConstantStride ()) {
03617         const LO numWhichVecs = static_cast<LO> (src.whichVectors_.size ());
03618         const std::string whichVecsLabel ("MV::deep_copy::whichVecs");
03619 
03620         // We can't sync src, since it is only an input argument.
03621         // Thus, we have to use the most recently modified version of
03622         // src, device or host.
03623         if (src.getDualView ().modified_device >= src.getDualView ().modified_host) {
03624           // Copy from the device version of src.
03625           //
03626           // whichVecs tells the kernel which vectors (columns) of src
03627           // to copy.  Fill whichVecs on the host, and sync to device.
03628           typedef Kokkos::DualView<LO*, DT> whichvecs_type;
03629           whichvecs_type whichVecs (whichVecsLabel, numWhichVecs);
03630           whichVecs.template modify<HMDT> ();
03631           for (LO i = 0; i < numWhichVecs; ++i) {
03632             whichVecs.h_view(i) = static_cast<LO> (src.whichVectors_[i]);
03633           }
03634           // Sync the host version of whichVecs to the device.
03635           whichVecs.template sync<DT> ();
03636 
03637           // Mark the device version of dst's DualView as modified.
03638           this->template modify<DT> ();
03639           // Copy from the selected vectors of src to dst, on the
03640           // device.  The functor ignores its 3rd arg in this case.
03641           typedef DeepCopySelectedVectors<dev_view_type, dev_view_type,
03642             LO, DT, true, false> functor_type;
03643           functor_type f (this->getDualView ().template view<DT> (),
03644                           src.getDualView ().template view<DT> (),
03645                           whichVecs.d_view, whichVecs.d_view);
03646           Kokkos::parallel_for (src.getLocalLength (), f);
03647           // Sync *this' DualView to the host.  This is cheaper than
03648           // repeating the above copy from src to *this on the host.
03649           this->template sync<HMDT> ();
03650         }
03651         else { // host version of src was the most recently modified
03652           // Copy from the host version of src.
03653           //
03654           // whichVecs tells the kernel which vectors (columns) of src
03655           // to copy.  Fill whichVecs on the host, and use it there.
03656           typedef Kokkos::View<LO*, HMDT> whichvecs_type;
03657           whichvecs_type whichVecs (whichVecsLabel, numWhichVecs);
03658           for (LO i = 0; i < numWhichVecs; ++i) {
03659             whichVecs(i) = static_cast<LO> (src.whichVectors_[i]);
03660           }
03661           // Copy from the selected vectors of src to dst, on the host.
03662           // The functor ignores its 3rd arg in this case.
03663           typedef DeepCopySelectedVectors<host_view_type, host_view_type,
03664             LO, HMDT, true, false> functor_type;
03665           functor_type f (this->getDualView ().template view<HMDT> (),
03666                           src.getDualView ().template view<HMDT> (),
03667                           whichVecs, whichVecs);
03668           Kokkos::parallel_for (src.getLocalLength (), f);
03669           // Sync dst back to the device, since we only copied on the host.
03670           this->template sync<DT> ();
03671         }
03672       }
03673       else { // dst is NOT constant stride
03674         if (src.isConstantStride ()) {
03675           if (src.getDualView ().modified_device >= src.getDualView ().modified_host) {
03676             // Copy from the device version of src.
03677             //
03678             // whichVecs tells the kernel which vectors (columns) of dst
03679             // to copy.  Fill whichVecs on the host, and sync to device.
03680             typedef Kokkos::DualView<LO*, DT> whichvecs_type;
03681             const std::string whichVecsLabel ("MV::deep_copy::whichVecs");
03682             const LO numWhichVecs = static_cast<LO> (this->whichVectors_.size ());
03683             whichvecs_type whichVecs (whichVecsLabel, numWhichVecs);
03684             whichVecs.template modify<HMDT> ();
03685             for (LO i = 0; i < numWhichVecs; ++i) {
03686               whichVecs.h_view(i) = this->whichVectors_[i];
03687             }
03688             // Sync the host version of whichVecs to the device.
03689             whichVecs.template sync<DT> ();
03690 
03691             // Copy src to the selected vectors of dst, on the device.
03692             // The functor ignores its 4th arg in this case.
03693             typedef DeepCopySelectedVectors<dev_view_type, dev_view_type,
03694               LO, DT, false, true> functor_type;
03695             functor_type f (this->getDualView ().template view<DT> (),
03696                             src.getDualView ().template view<DT> (),
03697                             whichVecs.d_view, whichVecs.d_view);
03698             Kokkos::parallel_for (src.getLocalLength (), f);
03699             // We can't sync src and repeat the above copy on the
03700             // host, so sync dst back to the host.
03701             //
03702             // FIXME (mfh 29 Jul 2014) This may overwrite columns that
03703             // don't actually belong to dst's view.
03704             this->template sync<HMDT> ();
03705           }
03706           else { // host version of src was the most recently modified
03707             // Copy from the host version of src.
03708             //
03709             // whichVecs tells the kernel which vectors (columns) of src
03710             // to copy.  Fill whichVecs on the host, and use it there.
03711             typedef Kokkos::View<LO*, HMDT> whichvecs_type;
03712             const LO numWhichVecs = static_cast<LO> (this->whichVectors_.size ());
03713             whichvecs_type whichVecs ("MV::deep_copy::whichVecs", numWhichVecs);
03714             for (LO i = 0; i < numWhichVecs; ++i) {
03715               whichVecs(i) = static_cast<LO> (this->whichVectors_[i]);
03716             }
03717             // Copy from src to the selected vectors of dst, on the
03718             // host.  The functor ignores its 4th arg in this case.
03719             typedef DeepCopySelectedVectors<host_view_type, host_view_type,
03720               LO, HMDT, false, true> functor_type;
03721             functor_type f (this->getDualView ().template view<HMDT> (),
03722                             src.getDualView ().template view<HMDT> (),
03723                             whichVecs, whichVecs);
03724             Kokkos::parallel_for (src.getLocalLength (), f);
03725             // Sync dst back to the device, since we only copied on the host.
03726             //
03727             // FIXME (mfh 29 Jul 2014) This may overwrite columns that
03728             // don't actually belong to dst's view.
03729             this->template sync<DT> ();
03730           }
03731         }
03732         else { // neither src nor dst have constant stride
03733           if (src.getDualView ().modified_device >= src.getDualView ().modified_host) {
03734             // Copy from the device version of src.
03735             //
03736             // whichVectorsDst tells the kernel which vectors
03737             // (columns) of dst to copy.  Fill it on the host, and
03738             // sync to device.
03739             const LO dstNumWhichVecs = static_cast<LO> (this->whichVectors_.size ());
03740             Kokkos::DualView<LO*, DT> whichVecsDst ("MV::deep_copy::whichVecsDst",
03741                                                     dstNumWhichVecs);
03742             whichVecsDst.template modify<HMDT> ();
03743             for (LO i = 0; i < dstNumWhichVecs; ++i) {
03744               whichVecsDst.h_view(i) = static_cast<LO> (this->whichVectors_[i]);
03745             }
03746             // Sync the host version of whichVecsDst to the device.
03747             whichVecsDst.template sync<DT> ();
03748 
03749             // whichVectorsSrc tells the kernel which vectors
03750             // (columns) of src to copy.  Fill it on the host, and
03751             // sync to device.  Use the destination MultiVector's
03752             // LocalOrdinal type here.
03753             const LO srcNumWhichVecs = static_cast<LO> (src.whichVectors_.size ());
03754             Kokkos::DualView<LO*, DT> whichVecsSrc ("MV::deep_copy::whichVecsSrc",
03755                                                     srcNumWhichVecs);
03756             whichVecsSrc.template modify<HMDT> ();
03757             for (LO i = 0; i < srcNumWhichVecs; ++i) {
03758               whichVecsSrc.h_view(i) = static_cast<LO> (src.whichVectors_[i]);
03759             }
03760             // Sync the host version of whichVecsSrc to the device.
03761             whichVecsSrc.template sync<DT> ();
03762 
03763             // Copy from the selected vectors of src to the selected
03764             // vectors of dst, on the device.
03765             typedef DeepCopySelectedVectors<dev_view_type, dev_view_type,
03766               LO, DT, false, false> functor_type;
03767             functor_type f (this->getDualView ().template view<DT> (),
03768                             src.getDualView ().template view<DT> (),
03769                             whichVecsDst.d_view, whichVecsSrc.d_view);
03770             Kokkos::parallel_for (src.getLocalLength (), f);
03771           }
03772           else {
03773             const LO dstNumWhichVecs = static_cast<LO> (this->whichVectors_.size ());
03774             Kokkos::View<LO*, HMDT> whichVectorsDst ("dstWhichVecs", dstNumWhichVecs);
03775             for (LO i = 0; i < dstNumWhichVecs; ++i) {
03776               whichVectorsDst(i) = this->whichVectors_[i];
03777             }
03778 
03779             // Use the destination MultiVector's LocalOrdinal type here.
03780             const LO srcNumWhichVecs = static_cast<LO> (src.whichVectors_.size ());
03781             Kokkos::View<LO*, HMDT> whichVectorsSrc ("srcWhichVecs", srcNumWhichVecs);
03782             for (LO i = 0; i < srcNumWhichVecs; ++i) {
03783               whichVectorsSrc(i) = src.whichVectors_[i];
03784             }
03785 
03786             typedef DeepCopySelectedVectors<host_view_type, host_view_type,
03787               LO, HMDT, false, false> functor_type;
03788             functor_type f (this->getDualView ().template view<HMDT> (),
03789                             src.getDualView ().template view<HMDT> (),
03790                             whichVectorsDst, whichVectorsSrc);
03791             Kokkos::parallel_for (src.getLocalLength (), f);
03792 
03793             // We can't sync src and repeat the above copy on the
03794             // host, so sync dst back to the host.
03795             //
03796             // FIXME (mfh 29 Jul 2014) This may overwrite columns that
03797             // don't actually belong to dst's view.
03798             this->template sync<HMDT> ();
03799           }
03800         }
03801       }
03802     }
03803   }
03804 
03805 
03806   template <class Scalar, class LO, class GO, class DeviceType>
03807   Teuchos::RCP<MultiVector<Scalar, LO, GO, Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >
03808   createMultiVector (const Teuchos::RCP<const Map<LO, GO, Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >& map,
03809                      size_t numVectors)
03810   {
03811     typedef Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> node_type;
03812     typedef MultiVector<Scalar, LO, GO, node_type> MV;
03813     return Teuchos::rcp (new MV (map, numVectors));
03814   }
03815 
03841   template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
03842   Teuchos::RCP<MultiVector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >
03843   createMultiVectorFromView (const Teuchos::RCP<const Map<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> > >& map,
03844                              const Teuchos::ArrayRCP<Scalar>& view,
03845                              const size_t LDA,
03846                              const size_t numVectors)
03847   {
03848     (void) map;
03849     (void) view;
03850     (void) LDA;
03851     (void) numVectors;
03852 
03853     TEUCHOS_TEST_FOR_EXCEPTION(
03854       true, std::logic_error, "Tpetra::createMultiVectorFromView: "
03855       "Not implemented for Node = KokkosDeviceWrapperNode.");
03856   }
03857 
03858   template <class ST, class LO, class GO, class DeviceType>
03859   MultiVector<ST, LO, GO, Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >
03860   createCopy (const MultiVector<ST, LO, GO, Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> >& src)
03861   {
03862     typedef Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType> node_type;
03863     typedef MultiVector<ST, LO, GO, node_type> MV;
03864 
03865     MV cpy (src.getMap (), src.getNumVectors ());
03866     if (src.isConstantStride ()) {
03867       Kokkos::deep_copy (cpy.getDualView (), src.getDualView ());
03868     }
03869     else {
03870       const char viewName[] = "MV::createCopy::whichVecs";
03871       const LO numWhichVecs = static_cast<LO> (src.whichVectors_.size ());
03872 
03873       if (src.getDualView ().modified_device >= src.getDualView ().modified_host) {
03874         typedef typename MV::dual_view_type::t_dev dev_view_type;
03875         typedef DeepCopySelectedVectors<dev_view_type, dev_view_type,
03876           LO, DeviceType, true, false> functor_type;
03877 
03878         Kokkos::View<LO*, DeviceType> whichVectors (viewName, numWhichVecs);
03879         // FIXME (mfh 23 Jul 2014) The following loop assumes CUDA UVM.
03880         for (LO i = 0; i < numWhichVecs; ++i) {
03881           whichVectors(i) = static_cast<LO> (src.whichVectors_[i]);
03882         }
03883         functor_type f (cpy.getDualView ().template view<DeviceType> (),
03884                         src.getDualView ().template view<DeviceType> (),
03885                         whichVectors, whichVectors);
03886         Kokkos::parallel_for (src.getLocalLength (), f);
03887       }
03888       else {
03889         typedef typename DeviceType::host_mirror_device_type host_dev_type;
03890         typedef typename MV::dual_view_type::t_host host_view_type;
03891         typedef DeepCopySelectedVectors<host_view_type, host_view_type,
03892           LO, host_dev_type, true, false> functor_type;
03893         Kokkos::View<LO*, host_dev_type> whichVectors (viewName, numWhichVecs);
03894         for (LO i = 0; i < numWhichVecs; ++i) {
03895           whichVectors(i) = static_cast<LO> (src.whichVectors_[i]);
03896         }
03897         functor_type f (cpy.getDualView ().template view<host_dev_type> (),
03898                         src.getDualView ().template view<host_dev_type> (),
03899                         whichVectors, whichVectors);
03900         Kokkos::parallel_for (src.getLocalLength (), f);
03901       }
03902     }
03903     return cpy;
03904   }
03905 
03906 } // namespace Tpetra
03907 
03908 
03909 #endif // TPETRA_KOKKOS_REFACTOR_MULTIVECTOR_DEF_HPP
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Defines