Kokkos Core Kernels Package  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Friends
Kokkos_CrsMatrix.hpp
Go to the documentation of this file.
00001 /*
00002 //@HEADER
00003 // ************************************************************************
00004 //
00005 //          Kokkos: Node API and Parallel Node Kernels
00006 //              Copyright (2008) Sandia Corporation
00007 //
00008 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00009 // the U.S. Government retains certain rights in this software.
00010 //
00011 // Redistribution and use in source and binary forms, with or without
00012 // modification, are permitted provided that the following conditions are
00013 // met:
00014 //
00015 // 1. Redistributions of source code must retain the above copyright
00016 // notice, this list of conditions and the following disclaimer.
00017 //
00018 // 2. Redistributions in binary form must reproduce the above copyright
00019 // notice, this list of conditions and the following disclaimer in the
00020 // documentation and/or other materials provided with the distribution.
00021 //
00022 // 3. Neither the name of the Corporation nor the names of the
00023 // contributors may be used to endorse or promote products derived from
00024 // this software without specific prior written permission.
00025 //
00026 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00027 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00028 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00029 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00030 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00031 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00032 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00033 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00034 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00035 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00036 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00037 //
00038 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
00039 //
00040 // ************************************************************************
00041 //@HEADER
00042 */
00043 #ifndef KOKKOS_CRSMATRIX_H_
00044 #define KOKKOS_CRSMATRIX_H_
00045 
00048 
00049 // FIXME (mfh 29 Sep 2013) There should never be a reason for using
00050 // assert() in these routines.  Routines that _could_ fail should
00051 // either return an error code (if they are device functions) or throw
00052 // an exception (if they are not device functions).
#include <assert.h>
#include <algorithm>
#include <climits>   // INT_MAX (used by CrsMatrix::generate)
#include <cstdlib>   // srand(), rand() (used by CrsMatrix::generate)
00055 
00056 #include <Kokkos_Core.hpp>
00057 #include <Kokkos_StaticCrsGraph.hpp>
00058 #include <Kokkos_MV.hpp>
00059 
00060 #ifdef KOKKOS_USE_CUSPARSE
00061 #  include <cusparse.h>
00062 #  include <Kokkos_CrsMatrix_CuSparse.hpp>
00063 #endif // KOKKOS_USE_CUSPARSE
00064 
00065 #ifdef KOKKOS_USE_MKL
00066 #  include <mkl.h>
00067 #  include <mkl_spblas.h>
00068 #  include <Kokkos_CrsMatrix_MKL.hpp>
00069 #endif // KOKKOS_USE_MKL
00070 
00071 //#include <Kokkos_Vectorization.hpp>
00072 #include <impl/Kokkos_Error.hpp>
00073 
00074 namespace Kokkos {
00075 
00113 template<class MatrixType>
00114 struct SparseRowView {
00116   typedef typename MatrixType::value_type value_type;
00118   typedef typename MatrixType::ordinal_type ordinal_type;
00119 
00120 private:
00122   value_type* values_;
00124   ordinal_type* colidx_;
00126   const int stride_;
00127 
00128 public:
00136   KOKKOS_INLINE_FUNCTION
00137   SparseRowView (value_type* const values,
00138                  ordinal_type* const colidx__,
00139                  const int stride,
00140                  const int count) :
00141     values_ (values), colidx_ (colidx__), stride_ (stride), length (count)
00142   {}
00150   KOKKOS_INLINE_FUNCTION
00151   SparseRowView (const typename MatrixType::values_type& values,
00152       const typename MatrixType::index_type& colidx__,
00153                  const int& stride,
00154                  const int& count,
00155                  const int& idx) :
00156     values_ (&values(idx)), colidx_ (&colidx__(idx)), stride_ (stride), length (count)
00157   {}
00158 
00164   const int length;
00165 
00170   KOKKOS_INLINE_FUNCTION
00171   value_type& value (const int& i) const {
00172     return values_[i*stride_];
00173   }
00174 
00179   KOKKOS_INLINE_FUNCTION
00180   ordinal_type& colidx (const int& i) const {
00181     return colidx_[i*stride_];
00182   }
00183 };
00184 
00185 
00193 template<class MatrixType>
00194 struct SparseRowViewConst {
00196   typedef const typename MatrixType::non_const_value_type value_type;
00198   typedef const typename MatrixType::non_const_ordinal_type ordinal_type;
00199 
00200 private:
00202   value_type* values_;
00204   ordinal_type* colidx_;
00206   const int stride_;
00207 
00208 public:
00216   KOKKOS_INLINE_FUNCTION
00217   SparseRowViewConst (value_type* const values,
00218                       ordinal_type* const colidx__,
00219                       const int stride,
00220                       const int count) :
00221     values_ (values), colidx_ (colidx__), stride_ (stride), length (count)
00222   {}
00230   KOKKOS_INLINE_FUNCTION
00231   SparseRowViewConst (const typename MatrixType::values_type& values,
00232       const typename MatrixType::index_type& colidx__,
00233                  const int& stride,
00234                  const int& count,
00235                  const int& idx) :
00236     values_ (&values(idx)), colidx_ (&colidx__(idx)), stride_ (stride), length (count)
00237   {}
00238 
00244   const int length;
00245 
00250   KOKKOS_INLINE_FUNCTION
00251   value_type& value (const int& i) const {
00252     return values_[i*stride_];
00253   }
00254 
00259   KOKKOS_INLINE_FUNCTION
00260   ordinal_type& colidx (const int& i) const {
00261     return colidx_[i*stride_];
00262   }
00263 };
00264 
// A simple struct for storing a kernel launch configuration.
// CrsMatrix carries one of these so users can steer how kernels are
// launched; currently only Stokhos exercises it.  It is a stopgap for
// per-object "state" needed by TPLs, until a general, extensible state
// mechanism exists.
struct DeviceConfig {
  // CUDA-style 3-D extent.  y and z default to 1.
  struct Dim3 {
    size_t x, y, z;
    Dim3 (const size_t x_dim, const size_t y_dim = 1, const size_t z_dim = 1)
      : x (x_dim), y (y_dim), z (z_dim)
    {}
  };

  Dim3 block_dim;                // Requested thread-block dimensions.
  size_t num_blocks;             // Requested number of blocks.
  size_t num_threads_per_block;  // Product of the block dimensions.

  // All-defaulted constructor; note that y defaults to 0 (not 1), so a
  // default-constructed config reports zero threads per block.
  DeviceConfig (const size_t num_blocks_ = 0,
                const size_t threads_per_block_x_ = 0,
                const size_t threads_per_block_y_ = 0,
                const size_t threads_per_block_z_ = 1)
    : block_dim (threads_per_block_x_,
                 threads_per_block_y_,
                 threads_per_block_z_),
      num_blocks (num_blocks_),
      num_threads_per_block (block_dim.x * block_dim.y * block_dim.z)
  {}
};
00291 
00292 
/// \class CrsMatrix
/// \brief Compressed-sparse-row (CSR) sparse matrix over a Kokkos device.
///
/// \tparam ScalarType   Type of the matrix entries.
/// \tparam OrdinalType  Type of the column indices.
/// \tparam Device       Kokkos device type on which the matrix lives.
/// \tparam MemoryTraits Memory traits applied to the values View.
/// \tparam SizeType     Type used for row offsets in the graph.
///
/// Storage is split between \c graph (row offsets + column indices, a
/// StaticCrsGraph) and \c values (a 1-D View of entries, one per graph
/// entry).  Both members are public and directly assignable.
template<typename ScalarType,
         typename OrdinalType,
         class Device,
         class MemoryTraits = void,
         typename SizeType = size_t>
class CrsMatrix {
public:
  typedef Device        device_type;
  typedef ScalarType    value_type;
  typedef OrdinalType   ordinal_type;
  typedef MemoryTraits  memory_traits;
  typedef SizeType      size_type;

  // FIXME (mfh 28 Sep 2013) Cuda::host_mirror_device_type is Threads.
  // Shouldn't CrsMatrix::host_device_type always be the same as its
  // Device's host_mirror_device_type?
  //
  // OpenMP is the default host type if you turned on OpenMP when
  // building.  OpenMP is not on by default, so if you specified in
  // the build that you wanted OpenMP, then we say that the default
  // host type is OpenMP instead of Threads.
  typedef typename device_type::host_mirror_device_type host_device_type;

  //! Type of a host-resident copy of this matrix.
  typedef CrsMatrix<ScalarType, OrdinalType, host_device_type, MemoryTraits> HostMirror;

  //! Type of the graph (row offsets + column indices) of the matrix.
  typedef Kokkos::StaticCrsGraph<OrdinalType, Kokkos::LayoutLeft, Device,SizeType> StaticCrsGraphType;

  //! Type of the graph's column-index array.
  typedef typename StaticCrsGraphType::entries_type index_type;
  //! Type of the graph's row-offset array.
  typedef typename StaticCrsGraphType::row_map_type row_map_type;
  //! Type of the 1-D array of matrix entries.
  typedef Kokkos::View<value_type*, Kokkos::LayoutRight, device_type, MemoryTraits> values_type;
  //! Const version of the entry type.
  typedef typename values_type::const_value_type  const_value_type;
  //! Non-const version of the entry type.
  typedef typename values_type::non_const_value_type  non_const_value_type;
  //! Non-const version of the column-index type.
  typedef typename index_type ::non_const_value_type  non_const_ordinal_type;

#ifdef KOKKOS_USE_CUSPARSE
  //! cuSPARSE library handle (see FIXME in the raw-pointer constructor).
  cusparseHandle_t cusparse_handle;
  //! cuSPARSE per-matrix descriptor.
  cusparseMatDescr_t cusparse_descr;
#endif // KOKKOS_USE_CUSPARSE
  //! The sparsity pattern: row offsets and column indices.
  StaticCrsGraphType graph;
  //! The matrix entries, one per entry of the graph.
  values_type values;

  // Launch configuration that can be used by overloads/specializations of
  // MV_multiply().  This is a hack and needs to be replaced by a general
  // state mechanism.
  DeviceConfig dev_config;

  /// \brief Default constructor: an empty 0 x 0 matrix with no entries.
  CrsMatrix()
    : graph(), values(), _numRows (0), _numCols (0), _nnz (0)
    {}

  //------------------------------------
  /// \brief "Copy" constructor from a matrix with possibly different
  ///   template parameters; shallow-copies the graph and values Views.
  ///   Only compiles when the underlying View assignments are legal.
  template<typename SType,
           typename OType,
           class DType,
           class MTType,
           typename IType>
  CrsMatrix(const CrsMatrix<SType,OType,DType,MTType,IType> & B) {
    graph = B.graph;
    values = B.values;
    _numRows = B.numRows();
    _numCols = B.numCols();
    _nnz = B.nnz();
  }

  //------------------------------------
  /// \brief Construct from an existing graph, allocating a values array
  ///   (labeled "<arg_label>") of matching length.
  ///   The column count is deduced as (maximum column index) + 1.
  CrsMatrix( const std::string        & arg_label ,
             const StaticCrsGraphType & arg_graph )
    : graph( arg_graph )
    , values( arg_label , arg_graph.entries.dimension_0() )
    , _numRows( arg_graph.row_map.dimension_0() - 1 )
    , _numCols( maximum_entry( arg_graph ) + 1 )
    , _nnz( arg_graph.entries.dimension_0() )
    {}

  //------------------------------------

  /// \brief Construct by copying from raw host CSR arrays.
  ///
  /// \param label [in] Base label for the allocated Views.
  /// \param nrows [in] Number of rows.
  /// \param ncols [in] Number of columns.
  /// \param annz  [in] Number of stored entries.
  /// \param val   [in] Entry values (length annz); may be NULL to copy
  ///   structure only (see import()).
  /// \param rows  [in] Row offsets (length nrows+1).
  /// \param cols  [in] Column indices (length annz).
  /// \param pad   [in] NOTE(review): currently unused by this
  ///   constructor — TODO confirm intended behavior.
  CrsMatrix (const std::string &label,
             OrdinalType nrows,
             OrdinalType ncols,
             OrdinalType annz,
             ScalarType* val,
             OrdinalType* rows,
             OrdinalType* cols,
             bool pad = false)
  {
    import (label, nrows, ncols, annz, val, rows, cols);

    // FIXME (mfh 09 Aug 2013) Specialize this on the Device type.
    // Only use cuSPARSE for the Cuda Device.
#ifdef KOKKOS_USE_CUSPARSE
    // FIXME (mfh 09 Aug 2013) This is actually static initialization
    // of the library; you should do it once for the whole program,
    // not once per matrix.  We need to protect this somehow.
    cusparseCreate (&cusparse_handle);

    // This is a per-matrix attribute.  It encapsulates things like
    // whether the matrix is lower or upper triangular, etc.  Ditto
    // for other TPLs like MKL.
    cusparseCreateMatDescr (&cusparse_descr);
#endif // KOKKOS_USE_CUSPARSE
  }

  /// \brief Construct by wrapping existing device Views (no copy).
  ///   The label is not used; the Views keep their own labels.
  CrsMatrix (const std::string &label,
             OrdinalType nrows,
             OrdinalType ncols,
             OrdinalType annz,
             values_type vals,
             row_map_type rows,
             index_type cols) :
    _numRows (nrows),
    _numCols (ncols),
    _nnz (annz)
  {
    graph.row_map = rows;
    graph.entries = cols;
    values = vals;
#ifdef KOKKOS_USE_CUSPARSE
    cusparseCreate(&cusparse_handle);
    cusparseCreateMatDescr(&cusparse_descr);
#endif // KOKKOS_USE_CUSPARSE
  }

  /// \brief Construct by wrapping an existing graph and values View
  ///   (no copy).  Row and entry counts are taken from the graph.
  CrsMatrix (const std::string &label,
             OrdinalType ncols,
             values_type vals,
             StaticCrsGraphType graph_) :
    graph(graph_),
    values(vals),
    _numRows (graph_.row_map.dimension_0()-1),
    _numCols (ncols),
    _nnz (graph_.entries.dimension_0())
  {
#ifdef KOKKOS_USE_CUSPARSE
    cusparseCreate(&cusparse_handle);
    cusparseCreateMatDescr(&cusparse_descr);
#endif // KOKKOS_USE_CUSPARSE
  }

  //! Copy raw host CSR arrays into this matrix (defined below).
  void
  import (const std::string &label,
          OrdinalType nrows,
          OrdinalType ncols,
          OrdinalType annz,
          ScalarType* val,
          OrdinalType* rows,
          OrdinalType* cols);

  //! Fill this matrix with pseudorandom structure (defined below).
  void
  generate (const std::string &label,
            OrdinalType nrows,
            OrdinalType ncols,
            OrdinalType target_nnz,
            OrdinalType varianz_nel_row,
            OrdinalType width_row);

  //! Fill this matrix with a uniform cols_per_row structure, zero values.
  void
  generate (const std::string &label,
      OrdinalType nrows,
      OrdinalType ncols,
      OrdinalType cols_per_row);

  //! Finalize the host-side graph (h_entries_/rows_) into this matrix.
  void
  generate (const std::string &label);

  //! Allocate the host-side insertion buffers h_entries_ / rows_.
  void
  generateHostGraph (OrdinalType nrows,
      OrdinalType ncols,
      OrdinalType cols_per_row);

  // FIXME (mfh 29 Sep 2013) See notes on the three-argument version
  // of this method below.
  void
  insertInGraph(OrdinalType rowi, OrdinalType col)
  {
    insertInGraph(rowi, &col, 1);
  }

  // FIXME (mfh 29 Sep 2013) We need a way to disable atomic updates
  // for ScalarType types that do not support them.  We're pretty much
  // limited to ScalarType = float, double, and {u}int{32,64}_t.  It
  // could make sense to do atomic add updates elementwise for complex
  // numbers, but that's about it unless we have transactional memory
  // extensions.  Dan Sunderland explained to me that the "array of
  // atomic int 'locks'" approach (for ScalarType that don't directly
  // support atomic updates) won't work on GPUs.

  /// \brief Add vals[i] into the entry at (rowi, cols[i]) for each i
  ///   whose column is already present in the row; columns not found
  ///   in the row's structure are silently skipped.
  ///
  /// \param rowi  [in] Row index.
  /// \param cols  [in] Column indices to update (length ncol).
  /// \param ncol  [in] Number of columns/values.
  /// \param vals  [in] Values to add (length ncol).
  /// \param force_atomic [in] If true, use atomic_add for each update.
  KOKKOS_INLINE_FUNCTION
  void
  sumIntoValues (const OrdinalType rowi,
                 const OrdinalType cols[],
                 const size_t ncol,
                 ScalarType vals[],
                 const bool force_atomic = false) const
  {
    SparseRowView<CrsMatrix> row_view = this->row (rowi);
    const int length = row_view.length;
    // O(ncol * row length) linear scan over the row per input column.
    for (size_t i = 0; i < ncol; ++i) {
      for (int j = 0; j < length; ++j) {
        if (row_view.colidx(j) == cols[i]) {
          if (force_atomic) {
            atomic_add(&row_view.value(j), vals[i]);
          } else {
            row_view.value(j) += vals[i];
          }
        }
      }
    }
  }

  // FIXME (mfh 29 Sep 2013) See above notes on sumIntoValues.

  /// \brief Overwrite the entry at (rowi, cols[i]) with vals[i] for
  ///   each i whose column is present in the row; columns not found
  ///   are silently skipped.  Same parameters as sumIntoValues().
  KOKKOS_INLINE_FUNCTION
  void
  replaceValues (const OrdinalType rowi,
                 const OrdinalType cols[],
                 const size_t ncol,
                 ScalarType vals[],
                 const bool force_atomic = false) const
  {
    SparseRowView<CrsMatrix> row_view = this->row (rowi);
    const int length = row_view.length;
    for (size_t i = 0; i < ncol; ++i) {
      for (int j = 0; j < length; ++j) {
        if (row_view.colidx(j) == cols[i]) {
          if (force_atomic) {
            atomic_assign(&row_view.value(j), vals[i]);
          } else {
            row_view.value(j) = vals[i];
          }
        }
      }
    }
  }

  // FIXME (mfh 29 Sep 2013) It doesn't really make sense to template
  // on the scalar or ordinal types of the input, since direct
  // assignment of the underlying Views (which is how this operator
  // works) won't work if the types aren't compatible.  It would make
  // more sense to template on things like the Device and
  // MemoryTraits.
  //
  // COMMENT: (CRT 1 Oct 2013) the alternative it to template on the incoming type,
  // But that way it still matches this function even if you try to assign a vector
  // to a matrix. Using the same scalar type and ordinal type as the 'this' matrix does
  // not necessaryily work because of things like const / non-const

  /// \brief Shallow-assignment from a matrix with possibly different
  ///   template parameters (compiles only if the View assignments do).
  template<typename aScalarType, typename aOrdinalType, class aDevice, class aMemoryTraits,typename aSizeType>
  CrsMatrix&
  operator= (const CrsMatrix<aScalarType,aOrdinalType,aDevice,aMemoryTraits, aSizeType>& mtx)
  {
    _numRows = mtx.numRows();
    _numCols = mtx.numCols();
    _nnz = mtx.nnz();
    graph = mtx.graph;
    values = mtx.values;
    dev_config = mtx.dev_config;
    return *this;
  }

  //! Number of rows in the matrix.
  KOKKOS_INLINE_FUNCTION
  ordinal_type numRows() const {
    return _numRows;
  }
  //! Number of columns in the matrix.
  KOKKOS_INLINE_FUNCTION
  ordinal_type numCols() const {
    return _numCols;
  }
  //! Number of stored entries in the matrix.
  KOKKOS_INLINE_FUNCTION
  ordinal_type nnz() const {
    return _nnz;
  }

  friend struct SparseRowView<CrsMatrix>;

  //! Non-const view of row i (stride 1 into values/entries).
  KOKKOS_INLINE_FUNCTION
  SparseRowView<CrsMatrix> row (int i) const {
    const int start = graph.row_map(i);
    const int count = graph.row_map(i+1) - start;


    // If the last row in a matrix has zero entries the SparseRowView constructor
    // taking views will not pass the bounds check. The violation is only done
    // in an address calculation [ &values(start) ] and is not used since count is
    // zero, but the bounds check will fail. This will most likely also trigger a
    // invalid read message with valgrind.
#if defined ( KOKKOS_EXPRESSION_CHECK )
    if(count==0) return SparseRowView<CrsMatrix> (NULL,NULL,1,0);
#endif
    return SparseRowView<CrsMatrix> (values, graph.entries, 1, count,start);
  }

  //! Const view of row i; see row() for the zero-length special case.
  KOKKOS_INLINE_FUNCTION
  SparseRowViewConst<CrsMatrix> rowConst (int i) const {
    const int start = graph.row_map(i);
    const int count = graph.row_map(i+1) - start;

#if defined ( KOKKOS_EXPRESSION_CHECK )
    if(count==0) return SparseRowViewConst<CrsMatrix> (NULL,NULL,1,0);
#endif
    return SparseRowViewConst<CrsMatrix> (values, graph.entries, 1, count,start);
  }

private:

  // Cached dimensions; kept in sync with graph/values by the
  // constructors, import(), generate(), and operator=.
  ordinal_type _numRows;
  ordinal_type _numCols;
  ordinal_type _nnz;

public:

  // FIXME: [HCE 2013-12-03] The following members will be removed soon.

  // FIXME (mfh 28 Sep 2013) std::vector should never appear in this
  // class, except perhaps as an input format for compatibility.

  // Host-side insertion buffers used by generateHostGraph(),
  // insertInGraph(), and the one-argument generate(): h_entries_
  // holds column indices (-1 marks an empty slot), rows_ holds the
  // per-row offsets into h_entries_.
  std::vector<OrdinalType> h_entries_;
  std::vector<OrdinalType> rows_;

  // FIXME (mfh 29 Sep 2013) There should not be an "insertInGraph"
  // method.  If you want to insert into the graph, you should get the
  // graph and insert into it.  If you want to change the structure of
  // the matrix, you should be required to specify a value to put in
  // the new spot.
  //
  // Furthermore, this should be a device function, by analogy with
  // UnorderedMap.

  /// \brief Insert columns cols[0..ncol) into row rowi of the
  ///   host-side graph, skipping duplicates already present; asserts
  ///   (debug builds only) if the row's slots are exhausted.
  void
  insertInGraph (const OrdinalType rowi, OrdinalType *cols, const size_t ncol)
  {
    OrdinalType* const start = &h_entries_[rows_[rowi]];
    OrdinalType* const end   = &h_entries_[rows_[rowi+1]];
    for (size_t i = 0; i < ncol; ++i) {
      // Scan until an empty slot (-1), a duplicate of cols[i], or the
      // end of the row.
      OrdinalType *iter = start;
      while (iter < end && *iter != -1 && *iter != cols[i]) {
        ++iter;
      }

      // FIXME (mfh 29 Sep 2013) Use of assert() statements is only
      // acceptable for debugging.  It's legitimate for insertions to
      // fail.  We should use the techniques that Dan Sunderland uses
      // in UnorderedMap; for example:
      //
      // 1. Insertion should return an indication of success or failure
      // 2. The graph should keep track of the number of failed insertions

      assert (iter != end );
      *iter = cols[i];
    }
  }

};
00760 
00761 //----------------------------------------------------------------------------
00762 //----------------------------------------------------------------------------
00763 
/// \brief Copy raw host CSR arrays into this matrix.
///
/// Allocates new Views labeled "<label>.values" and "<label>.graph",
/// copies the input arrays into host mirrors, then deep-copies them to
/// the device.
///
/// \param label [in] Base label for the allocated Views.
/// \param nrows [in] Number of rows.
/// \param ncols [in] Number of columns.
/// \param annz  [in] Number of stored entries.
/// \param val   [in] Entry values (length annz); if NULL, only the
///   structure is copied and values keep their initial contents.
/// \param rows  [in] CSR row offsets (length nrows+1).
/// \param cols  [in] Column indices (length annz).
template< typename ScalarType , typename OrdinalType, class Device, class MemoryTraits, typename SizeType >
void
CrsMatrix<ScalarType , OrdinalType, Device, MemoryTraits, SizeType >::
import (const std::string &label,
        OrdinalType nrows,
        OrdinalType ncols,
        OrdinalType annz,
        ScalarType* val,
        OrdinalType* rows,
        OrdinalType* cols)
{
  std::string str = label;
  values = values_type (str.append (".values"), annz);

  _numRows = nrows;
  _numCols = ncols;
  _nnz = annz;

  // FIXME (09 Aug 2013) CrsArray only takes std::vector for now.
  // We'll need to fix that.
  // Convert CSR row offsets into per-row entry counts, which is the
  // input format create_staticcrsgraph expects.
  std::vector<int> row_lengths (_numRows, 0);

  // FIXME (mfh 21 Jun 2013) This calls for a parallel_for kernel.
  for (int i = 0; i < _numRows; ++i) {
    row_lengths[i] = rows[i + 1] - rows[i];
  }

  str = label;
  graph = Kokkos::create_staticcrsgraph<StaticCrsGraphType> (str.append (".graph"), row_lengths);
  typename values_type::HostMirror h_values = Kokkos::create_mirror_view (values);
  typename index_type::HostMirror h_entries = Kokkos::create_mirror_view (graph.entries);

  // FIXME (mfh 21 Jun 2013) This needs to be a parallel copy.
  // Furthermore, why are the arrays copied twice? -- once here, to a
  // host view, and once below, in the deep copy?
  for (OrdinalType i = 0; i < _nnz; ++i) {
    // A NULL val means "structure only": leave h_values untouched.
    if (val) {
      h_values(i) = val[i];
    }
    h_entries(i) = cols[i];
  }

  Kokkos::deep_copy (values, h_values);
  Kokkos::deep_copy (graph.entries, h_entries);
}
00809 
00810 template<typename ScalarType, typename OrdinalType, class Device, class MemoryTraits, typename SizeType>
00811 void
00812 CrsMatrix<ScalarType, OrdinalType, Device, MemoryTraits, SizeType >::
00813 generate (const std::string &label,
00814           OrdinalType nrows,
00815           OrdinalType ncols,
00816           OrdinalType target_nnz,
00817           OrdinalType varianz_nel_row,
00818           OrdinalType width_row)
00819 {
00820   _numRows = nrows;
00821   _numCols = ncols;
00822 
00823   graph.row_map = row_map_type ("CrsMatrix::rowPtr", nrows + 1);
00824   typename row_map_type::HostMirror h_row_map = Kokkos::create_mirror_view (graph.row_map);
00825 
00826   // FIXME (mfh 21 Jun 2013) What is this method actualy doing?  It
00827   // looks like it's not setting the structure or values of the matrix
00828   // at all.
00829 
00830   OrdinalType elements_per_row = target_nnz / nrows;
00831   srand(13721);
00832   h_row_map(0) = 0;
00833 
00834   for (int rowi = 0; rowi < nrows; ++rowi) {
00835    // int varianz = (1.0 * rand() / INT_MAX - 0.5) * varianz_nel_row;
00836    // h_row_map(row + 1) = h_row_map(row) + elements_per_row + varianz;
00837   }
00838 
00839   _nnz = h_row_map(nrows);
00840   values = values_type("CrsMatrix::values", _nnz);
00841   graph.entries = index_type("CrsMatrix::colInd", _nnz);
00842   typename values_type::HostMirror h_values = Kokkos::create_mirror_view(values);
00843   typename index_type::HostMirror h_entries = Kokkos::create_mirror_view(graph.entries);
00844 
00845   for(int rowi = 0; rowi < nrows; rowi++) {
00846     for(int k = h_row_map(rowi); k < h_row_map(rowi + 1); k++) {
00847       //int pos = (1.0 * rand() / INT_MAX - 0.5) * width_row;
00848 
00849       //if(pos < 0) pos += ncols;
00850 
00851      // if(pos >= ncols) pos -= ncols;
00852 
00853      // h_entries(k) = pos;
00854      // h_values(k) = 100.0 * rand() / INT_MAX - 50.0;
00855     }
00856   }
00857 
00858   Kokkos::deep_copy(values, h_values);
00859   Kokkos::deep_copy(graph.entries, h_entries);
00860   Kokkos::deep_copy(graph.row_map, h_row_map);
00861 
00862 }
00863 
00864 template<typename ScalarType, typename OrdinalType, class Device, class MemoryTraits, typename SizeType>
00865 void
00866 CrsMatrix<ScalarType, OrdinalType, Device, MemoryTraits, SizeType >::
00867 generate (const std::string &label,
00868     OrdinalType nrows,
00869     OrdinalType ncols,
00870     OrdinalType cols_per_row)
00871 {
00872   _numRows = nrows;
00873   _numCols = ncols;
00874   _nnz = nrows*cols_per_row;
00875 
00876   std::string str = label;
00877   values = values_type (str.append (".values"), _nnz);
00878 
00879 
00880   std::vector<int> row_lengths (_numRows, 0);
00881 
00882   // FIXME (mfh 21 Jun 2013) This calls for a parallel_for kernel.
00883   for (int i = 0; i < _numRows; ++i) {
00884     row_lengths[i] = cols_per_row;
00885   }
00886 
00887   str = label;
00888   graph = Kokkos::create_staticcrsgraph<StaticCrsGraphType> (str.append (".graph"), row_lengths);
00889   typename values_type::HostMirror h_values = Kokkos::create_mirror_view (values);
00890   typename index_type::HostMirror h_entries = Kokkos::create_mirror_view (graph.entries);
00891 
00892   // FIXME (mfh 21 Jun 2013) Why is this copy not a parallel copy?
00893   // Furthermore, why are the arrays copied twice? -- once here, to a
00894   // host view, and once below, in the deep copy?
00895   for (OrdinalType i = 0; i < _nnz; ++i) {
00896     h_values(i) = ScalarType();
00897     h_entries(i) = OrdinalType();
00898   }
00899 
00900   Kokkos::deep_copy (values, h_values);
00901   Kokkos::deep_copy (graph.entries, h_entries);
00902 }
00903 template<typename ScalarType, typename OrdinalType, class Device, class MemoryTraits, typename SizeType>
00904 void
00905 CrsMatrix<ScalarType, OrdinalType, Device, MemoryTraits, SizeType >::
00906 generate (const std::string &label)
00907 {
00908   // Compress the entries
00909   size_t ptr_from= 0, ptr_to = 0;
00910   int cur_row = 0;
00911   while ( ptr_from < h_entries_.size()) {
00912     size_t  row_stop = rows_[cur_row+1];
00913     while (ptr_from < row_stop) {
00914       if ( h_entries_[ptr_from] == OrdinalType(-1)) {
00915         ptr_from = row_stop;
00916       } else {
00917         h_entries_[ptr_to++] = h_entries_[ptr_from++];
00918       }
00919     }
00920     rows_[++cur_row] = ptr_to;
00921   }
00922   OrdinalType nrows = rows_.size()-1;
00923   OrdinalType nnz_ = ptr_to;
00924 
00925   h_entries_.resize(nnz_);
00926 
00927   //sort the rows
00928   for (OrdinalType i=0; i<nrows; ++i )
00929     std::sort(&h_entries_[i], &h_entries_[i+1]);
00930 
00931   // generate the matrix
00932   import(label, nrows, nrows, nnz_, NULL, &rows_[0], &h_entries_[0]);
00933 
00934 }
00935 
00936 
00937 template<typename ScalarType, typename OrdinalType, class Device, class MemoryTraits, typename SizeType>
00938 void
00939 CrsMatrix<ScalarType, OrdinalType, Device, MemoryTraits, SizeType >::
00940 generateHostGraph ( OrdinalType nrows,
00941     OrdinalType ncols,
00942     OrdinalType cols_per_row)
00943 {
00944   _numRows = nrows;
00945   _numCols = ncols;
00946   _nnz = nrows*cols_per_row;
00947 
00948   h_entries_.resize(_nnz, OrdinalType(-1));
00949   rows_.resize(_numRows+1);
00950   rows_[0] = 0;
00951   for (OrdinalType i = 0; i < _numRows; ++i)
00952     rows_[i+1] = rows_[i]+cols_per_row;
00953 
00954 }
00955 
/// \brief Heuristic number of rows each thread should process, given
///   the average number of nonzeros per row.
///
/// Returns the largest power of two r (>= 1) such that
/// r * NNZPerRow <= 2048, i.e. roughly 2048 matrix entries per thread.
/// A zero NNZPerRow yields 1.
template<class DeviceType>
inline int RowsPerThread(const int NNZPerRow) {
  if (NNZPerRow == 0) {
    return 1;
  }
  int rows = 1;
  while ((2 * rows) * NNZPerRow <= 2048) {
    rows *= 2;
  }
  return rows;
}
#ifdef KOKKOS_HAVE_CUDA
//! On CUDA, each thread always handles exactly one row.
template<>
inline int RowsPerThread<Kokkos::Cuda>(const int NNZPerRow) {
  return 1;
}
#endif
00971 
00972 //----------------------------------------------------------------------------
00973 
/// \brief Compile-time/run-time choice of threads-per-row for the
///   shuffle-based multiply kernel.
///
/// \tparam DeviceType Kokkos device the kernel runs on.
/// \tparam ScalarType Matrix entry type (const is stripped).
/// \tparam NNZPerRow  Expected nonzeros per row (default 27).
///
/// Warp-shuffle reductions are only usable on Cuda with 32-bit/64-bit
/// arithmetic types and compute capability >= 3.0; otherwise one
/// thread per row is used.  device_value is resolved at compile time
/// inside device code; host_value() queries the device architecture
/// at run time.
template< class DeviceType , typename ScalarType , int NNZPerRow=27>
struct MV_MultiplyShflThreadsPerRow {
private:

  typedef typename Kokkos::Impl::remove_const< ScalarType >::type value_type ;

#ifdef KOKKOS_HAVE_CUDA
  // Shuffle requires the device to be Cuda and the scalar to be one
  // of the 32-bit or 64-bit types __shfl_* can move between lanes.
  enum { shfl_possible =
    Kokkos::Impl::is_same< DeviceType , Kokkos::Cuda >::value &&
    (
      Kokkos::Impl::is_same< value_type , unsigned int >::value ||
      Kokkos::Impl::is_same< value_type , int >::value ||
      Kokkos::Impl::is_same< value_type , float >::value ||
      Kokkos::Impl::is_same< value_type , double >::value
    )};
#else // NOT KOKKOS_HAVE_CUDA
  enum { shfl_possible = 0 };
#endif // KOKKOS_HAVE_CUDA

public:

#if defined( __CUDA_ARCH__ )
  // Step the thread count through powers of two with the row length:
  // <8 -> 2, <16 -> 4, <32 -> 8, <64 -> 16, else 32 (a full warp).
  enum { device_value = shfl_possible && ( 300 <= __CUDA_ARCH__ ) ? 
         (NNZPerRow<8?2:
         (NNZPerRow<16?4:
         (NNZPerRow<32?8:
         (NNZPerRow<64?16:
         32))))
         :1 };
#else
  enum { device_value = 1 };
#endif

#ifdef KOKKOS_HAVE_CUDA
  // Same ladder as device_value, but decided at run time from the
  // actual device architecture.
  inline static int host_value()
    { return shfl_possible && ( 300 <= Kokkos::Cuda::device_arch() ) ? 
         (NNZPerRow<8?2:
         (NNZPerRow<16?4:
         (NNZPerRow<32?8:
         (NNZPerRow<64?16:
         32))))
         :1; }
#else // NOT KOKKOS_HAVE_CUDA
  inline static int host_value() { return 1; }
#endif // KOKKOS_HAVE_CUDA
};
01020 
01021 //----------------------------------------------------------------------------
01022 
/// \brief Team functor for sparse matrix-multivector multiply:
///   y(:,k) = beta(k)*y(:,k) + alpha(k)*A*x(:,k) for all columns k < n.
///
/// The coefficient codes doalpha and dobeta are compile-time constants:
/// -1, 0, and 1 mean the literal scalar; any other value (2 by the
/// callers' convention) means "read the coefficient from the alpha/beta
/// vectors, one entry per column of x".  Columns are processed in
/// strip-mined chunks (see operator()) so the inner loops can be unrolled.
template<class RangeVector,
         class CrsMatrix,
         class DomainVector,
         class CoeffVector1,
         class CoeffVector2,
         int doalpha,
         int dobeta,
         int ExplicitVectorLength = 8>
struct MV_MultiplyFunctor {
  typedef typename CrsMatrix::device_type                  device_type ;
  typedef typename CrsMatrix::ordinal_type                 size_type ;
  typedef typename CrsMatrix::non_const_value_type         value_type ;
  typedef typename Kokkos::View<value_type*, device_type>  range_values;
  typedef typename Kokkos::TeamPolicy< device_type >       team_policy ;
  typedef typename team_policy::member_type                team_member ;

  typedef Vectorization<device_type,ExplicitVectorLength> vectorization;

  CoeffVector1 beta;   // per-column beta coefficients (read when dobeta is not -1/0/1)
  CoeffVector2 alpha;  // per-column alpha coefficients (read when doalpha is not -1/0/1)
  CrsMatrix  m_A ;     // the sparse matrix operand
  DomainVector  m_x ;  // input multivector (rank 2)
  RangeVector  m_y ;   // output multivector (rank 2)
  size_type n;         // number of columns (vectors) in x and y
  int rows_per_thread; // consecutive rows handled by each global thread rank

  MV_MultiplyFunctor(const  CoeffVector1 beta_,
      const CoeffVector2 alpha_,
      const CrsMatrix  m_A_,
      const DomainVector  m_x_,
      const RangeVector  m_y_,
      const size_type n_,
      const int rows_per_thread_):
      beta(beta_), alpha(alpha_), m_A(m_A_), m_x(m_x_), m_y(m_y_), n(n_), rows_per_thread(rows_per_thread_) {}

  //--------------------------------------------------------------------------

  /// \brief Accumulate row iRow of A*x into UNROLL consecutive columns of y,
  ///   starting at column kk.
  ///
  /// Each vector lane strides through the row's entries; per-lane partial
  /// sums are combined with vectorization::reduce, and lane 0 alone applies
  /// alpha/beta and writes the UNROLL results back to y.
  template<int UNROLL>
  KOKKOS_INLINE_FUNCTION
  void strip_mine (const team_member & dev, const size_type & iRow, const size_type& kk) const {

    value_type sum[UNROLL];

#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
#ifdef KOKKOS_HAVE_PRAGMA_UNROLL
#pragma unroll
#endif
    for (size_type k = 0 ; k < UNROLL ; ++k) {
      // NOTE (mfh 09 Aug 2013) This requires that assignment from int
      // (in this case, 0) to value_type be defined.  It's not for
      // types like arprec and dd_real.
      //
      // mfh 29 Sep 2013: On the other hand, arprec and dd_real won't
      // work on CUDA devices anyway, since their methods aren't
      // device functions.  arprec has other issues (e.g., dynamic
      // memory allocation, and the array-of-structs memory layout
      // which is unfavorable to GPUs), but could be dealt with in the
      // same way as Sacado's AD types.
      sum[k] = 0;
    }

    const SparseRowView<CrsMatrix> row = m_A.row(iRow);

#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
#ifdef KOKKOS_HAVE_PRAGMA_UNROLL
#pragma unroll
#endif
#ifdef KOKKOS_HAVE_PRAGMA_LOOPCOUNT
#pragma loop count (15)
#endif
    // Lanes cooperatively stride through the row's entries.
    for (size_type iEntry = vectorization::begin(); iEntry < row.length; iEntry += vectorization::increment ) {
      const value_type val = row.value(iEntry);
      const size_type ind = row.colidx(iEntry);

#ifdef KOKKOS_HAVE_PRAGMA_UNROLL
#pragma unroll
#endif
      for (size_type k = 0; k < UNROLL; ++k) {
        sum[k] +=  val * m_x(ind, kk + k);
      }
    }

    // Fold alpha == -1 into the cross-lane reduction; any other non-unit
    // alpha is applied below by lane 0 only.
    if(doalpha == -1)
      for (int ii=0; ii < UNROLL; ++ii) {
        sum[ii] = -vectorization::reduce(sum[ii]);
      }
    else
      for (int ii=0; ii < UNROLL; ++ii) {
        sum[ii] = vectorization::reduce(sum[ii]);
      }

    // Only the first lane writes results for this row.
    if (vectorization::is_lane_0(dev)) {
      // doalpha*doalpha != 1 <=> doalpha is neither 1 nor -1, i.e. the
      // coefficient comes from the alpha vector.
      if(doalpha * doalpha != 1) {
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
#ifdef KOKKOS_HAVE_PRAGMA_UNROLL
#pragma unroll
#endif
        for (size_type k = 0; k < UNROLL; ++k) {
          sum[k] *= alpha(kk + k);
        }
      }

      if (dobeta == 0) {
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
#ifdef KOKKOS_HAVE_PRAGMA_UNROLL
#pragma unroll
#endif
        for (size_type k = 0; k < UNROLL; ++k) {
          m_y(iRow, kk + k) = sum[k];
        }
      } else if(dobeta == 1) {
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
#ifdef KOKKOS_HAVE_PRAGMA_UNROLL
#pragma unroll
#endif
        for (size_type k = 0; k < UNROLL; ++k) {
          m_y(iRow, kk + k) += sum[k];
        }
      } else if (dobeta == -1) {
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
#ifdef KOKKOS_HAVE_PRAGMA_UNROLL
#pragma unroll
#endif
        for (size_type k = 0; k < UNROLL; ++k) {
          m_y(iRow, kk + k) = -m_y(iRow, kk + k) +  sum[k];
        }
      } else {
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
#ifdef KOKKOS_HAVE_PRAGMA_UNROLL
#pragma unroll
#endif
        for (size_type k = 0; k < UNROLL; ++k) {
          m_y(iRow, kk + k) = beta(kk + k) * m_y(iRow, kk + k) + sum[k] ;
        }
      }
    }
  }

  /// \brief Single-column variant of strip_mine: always reads x(:,0) and
  ///   writes y(:,0).  It takes no column offset, so it is only correct
  ///   as the tail case when the remaining column really is column 0
  ///   (n == 1); operator() arranges for that (see the peeling comments).
  KOKKOS_INLINE_FUNCTION
  void strip_mine_1 (const team_member & dev, const size_type& iRow) const {
    value_type sum = 0;

      const SparseRowViewConst<CrsMatrix> row = m_A.rowConst(iRow);

#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
#ifdef KOKKOS_HAVE_PRAGMA_UNROLL
#pragma unroll
#endif
#ifdef KOKKOS_HAVE_PRAGMA_LOOPCOUNT
#pragma loop count (15)
#endif
      for(size_type iEntry = vectorization::begin(); iEntry < row.length; iEntry += vectorization::increment) {
        sum += row.value(iEntry) * m_x(row.colidx(iEntry),0);
      }

    sum = vectorization::reduce(sum);

    if (vectorization::is_lane_0(dev)) {

      if(doalpha == -1)
        sum *= value_type(-1);
      else if(doalpha * doalpha != 1) {
        sum *= alpha(0);
      }

      if (dobeta == 0) {
        m_y(iRow, 0) = sum ;
      } else if (dobeta == 1) {
        m_y(iRow, 0) += sum ;
      } else if (dobeta == -1) {
        m_y(iRow, 0) = -m_y(iRow, 0) +  sum;
      } else {
        m_y(iRow, 0) = beta(0) * m_y(iRow, 0) + sum;
      }
    }
  }


  /// \brief Process rows_per_thread consecutive rows, strip-mining the n
  ///   columns: 4-wide under KOKKOS_FAST_COMPILE, otherwise 8-wide on CUDA
  ///   devices and 16-wide on host, with switch-dispatched tail cases.
  KOKKOS_INLINE_FUNCTION
  void operator()(const team_member & dev) const {
    for(int loop = 0; loop < rows_per_thread; loop++) {
      const size_type iRow = vectorization::global_thread_rank(dev) * rows_per_thread + loop;
    if(iRow>=m_A.numRows()) return;

    size_type kk = 0;

#ifdef KOKKOS_FAST_COMPILE
    for (; kk + 4 <= n; kk += 4) {
      strip_mine<4>(dev, iRow, kk);
    }
    for( ; kk < n; ++kk) {
      strip_mine<1>(dev, iRow, kk);
    }
#else
#  ifdef __CUDA_ARCH__
    // Peel 9 columns first when n > 8 and n % 8 == 1, so the remainder
    // after the 8-wide strips is never exactly 1; the case-1 tail
    // (strip_mine_1) only handles column 0 and would be wrong for kk > 0.
    if ((n > 8) && (n % 8 == 1)) {
      strip_mine<9>(dev, iRow, kk);
      kk += 9;
    }
    for(; kk + 8 <= n; kk += 8)
      strip_mine<8>(dev, iRow, kk);
    if(kk < n)
      switch(n - kk) {
#  else // NOT a CUDA device
        // Same peeling idea as the CUDA branch, but with 16-wide strips:
        // peel 17 columns when n % 16 == 1 so the tail is never 1.
        if ((n > 16) && (n % 16 == 1)) {
          strip_mine<17>(dev, iRow, kk);
          kk += 17;
        }

        for (; kk + 16 <= n; kk += 16) {
          strip_mine<16>(dev, iRow, kk);
        }

        if(kk < n)
          switch(n - kk) {
          case 15:
            strip_mine<15>(dev, iRow, kk);
            break;

          case 14:
            strip_mine<14>(dev, iRow, kk);
            break;

          case 13:
            strip_mine<13>(dev, iRow, kk);
            break;

          case 12:
            strip_mine<12>(dev, iRow, kk);
            break;

          case 11:
            strip_mine<11>(dev, iRow, kk);
            break;

          case 10:
            strip_mine<10>(dev, iRow, kk);
            break;

          case 9:
            strip_mine<9>(dev, iRow, kk);
            break;

          case 8:
            strip_mine<8>(dev, iRow, kk);
            break;
#  endif // __CUDA_ARCH__
          case 7:
            strip_mine<7>(dev, iRow, kk);
            break;

          case 6:
            strip_mine<6>(dev, iRow, kk);
            break;

          case 5:
            strip_mine<5>(dev, iRow, kk);
            break;

          case 4:
            strip_mine<4>(dev, iRow, kk);
            break;

          case 3:
            strip_mine<3>(dev, iRow, kk);
            break;

          case 2:
            strip_mine<2>(dev, iRow, kk);
            break;

          case 1:
            // Tail of exactly one column: only reachable with kk == 0
            // (guaranteed by the peeling above), where strip_mine_1's
            // hard-coded column 0 is correct.
            strip_mine_1(dev, iRow);
            break;
          }
#endif // KOKKOS_FAST_COMPILE
      }
    }
  };
01318 
01319   template<class RangeVector,
01320            class CrsMatrix,
01321            class DomainVector,
01322            class CoeffVector1,
01323            class CoeffVector2,
01324            int doalpha,
01325            int dobeta,
01326            int ExplicitVectorLength = 8>
01327   struct MV_MultiplySingleFunctor {
01328     typedef typename CrsMatrix::device_type                   device_type ;
01329     typedef typename CrsMatrix::ordinal_type                    size_type ;
01330     typedef typename CrsMatrix::non_const_value_type         value_type ;
01331     typedef typename Kokkos::View<value_type*, typename CrsMatrix::device_type> range_values;
01332     typedef typename Kokkos::TeamPolicy< device_type >       team_policy ;
01333     typedef typename team_policy::member_type                team_member ;
01334 
01335     //typedef MV_MultiplyShflThreadsPerRow< device_type , value_type , NNZPerRow > ShflThreadsPerRow ;
01336     typedef Vectorization<device_type,ExplicitVectorLength> vectorization;
01337 
01338     CoeffVector1 beta;
01339     CoeffVector2 alpha;
01340     CrsMatrix  m_A ;
01341     DomainVector  m_x ;
01342     RangeVector  m_y ;
01343     int rows_per_thread;
01344 
01345     MV_MultiplySingleFunctor(
01346         const  CoeffVector1 beta_,
01347         const CoeffVector2 alpha_,
01348         const CrsMatrix  m_A_,
01349         const DomainVector  m_x_,
01350         const RangeVector  m_y_,
01351         const int rows_per_thread_):
01352         beta(beta_), alpha(alpha_),
01353         m_A(m_A_), m_x(m_x_), m_y(m_y_),
01354         rows_per_thread(rows_per_thread_) {}
01355 
01356     KOKKOS_INLINE_FUNCTION
01357     void operator()(const team_member & dev) const {
01358 
01359       for(int loop = 0; loop < rows_per_thread; loop++) {
01360         const size_type iRow = vectorization::global_thread_rank(dev)*rows_per_thread + loop;
01361         if(iRow>=m_A.numRows()) return;
01362         const SparseRowViewConst<CrsMatrix> row = m_A.rowConst(iRow);
01363         const size_type row_length = row.length ;
01364         value_type sum = 0;
01365 
01366         #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
01367         #pragma ivdep
01368         #endif
01369         #ifdef KOKKOS_HAVE_PRAGMA_UNROLL
01370         #pragma unroll
01371         #endif
01372         #ifdef KOKKOS_HAVE_PRAGMA_LOOPCOUNT
01373         #pragma loop count (15)
01374         #endif
01375         for (size_type iEntry = vectorization::begin(); iEntry < row_length; iEntry += vectorization::increment) {
01376           sum += row.value(iEntry) * m_x(row.colidx(iEntry));
01377         }
01378 
01379         sum = vectorization::reduce(sum);
01380 
01381 
01382         if (vectorization::is_lane_0(dev)) {
01383           if (doalpha == -1) sum *= value_type(-1);
01384           else if (doalpha * doalpha != 1) {
01385             sum *= alpha(0);
01386           }
01387 
01388           if (dobeta == 0) {
01389             m_y(iRow) = sum ;
01390           } else if (dobeta == 1) {
01391             m_y(iRow) += sum ;
01392           } else if (dobeta == -1) {
01393             m_y(iRow) = -m_y(iRow) +  sum;
01394           } else {
01395             m_y(iRow) = beta(0) * m_y(iRow) + sum;
01396           }
01397         }
01398       }
01399     }
01400   };
01401 
01402   namespace Impl {
01403 
01404     template <class RangeVector,
01405               class CrsMatrix,
01406               class DomainVector,
01407               class CoeffVector1,
01408               class CoeffVector2>
01409     void
01410     MV_Multiply_Check_Compatibility (const CoeffVector1 &betav,
01411                                      const RangeVector &y,
01412                                      const CoeffVector2 &alphav,
01413                                      const CrsMatrix &A,
01414                                      const DomainVector &x,
01415                                      const int& doalpha,
01416                                      const int& dobeta)
01417     {
01418       typename DomainVector::size_type numVecs = x.dimension_1();
01419       typename DomainVector::size_type numRows = A.numRows();
01420       typename DomainVector::size_type numCols = A.numCols();
01421 
01422       if (y.dimension_1() != numVecs) {
01423         std::ostringstream msg;
01424         msg << "Error in CRSMatrix - Vector Multiply (y = by + aAx): 2nd dimensions of y and x do not match\n";
01425         msg << "\t Labels are: y(" << RangeVector::memory_space::query_label(y.ptr_on_device()) << ") b("
01426             << CoeffVector1::memory_space::query_label(betav.ptr_on_device()) << ") a("
01427             << CoeffVector2::memory_space::query_label(alphav.ptr_on_device()) << ") x("
01428             << CrsMatrix::values_type::memory_space::query_label(A.values.ptr_on_device()) << ") x("
01429             << DomainVector::memory_space::query_label(x.ptr_on_device()) << ")\n";
01430         msg << "\t Dimensions are: y(" << y.dimension_0() << "," << y.dimension_1() << ") x(" << x.dimension_0() << "," << x.dimension_1() << ")\n";
01431         Impl::throw_runtime_exception( msg.str() );
01432       }
01433       if (numRows > y.dimension_0()) {
01434         std::ostringstream msg;
01435         msg << "Error in CRSMatrix - Vector Multiply (y = by + aAx): dimensions of y and A do not match\n";
01436         msg << "\t Labels are: y(" << RangeVector::memory_space::query_label(y.ptr_on_device()) << ") b("
01437             << CoeffVector1::memory_space::query_label(betav.ptr_on_device()) << ") a("
01438             << CoeffVector2::memory_space::query_label(alphav.ptr_on_device()) << ") x("
01439             << CrsMatrix::values_type::memory_space::query_label(A.values.ptr_on_device()) << ") x("
01440             << DomainVector::memory_space::query_label(x.ptr_on_device()) << ")\n";
01441         msg << "\t Dimensions are: y(" << y.dimension_0() << "," << y.dimension_1() << ") A(" << A.numRows() << "," << A.numCols() << ")\n";
01442         Impl::throw_runtime_exception( msg.str() );
01443       }
01444       if (numCols > x.dimension_0()) {
01445         std::ostringstream msg;
01446         msg << "Error in CRSMatrix - Vector Multiply (y = by + aAx): dimensions of x and A do not match\n";
01447         msg << "\t Labels are: y(" << RangeVector::memory_space::query_label(y.ptr_on_device()) << ") b("
01448             << CoeffVector1::memory_space::query_label(betav.ptr_on_device()) << ") a("
01449             << CoeffVector2::memory_space::query_label(alphav.ptr_on_device()) << ") x("
01450             << CrsMatrix::values_type::memory_space::query_label(A.values.ptr_on_device()) << ") x("
01451             << DomainVector::memory_space::query_label(x.ptr_on_device()) << ")\n";
01452         msg << "\t Dimensions are: x(" << x.dimension_0() << "," << x.dimension_1() << ") A(" << A.numRows() << "," << A.numCols() << ")\n";
01453         Impl::throw_runtime_exception( msg.str() );
01454       }
01455       if (dobeta==2) {
01456         if (betav.dimension_0()!=numVecs) {
01457           std::ostringstream msg;
01458           msg << "Error in CRSMatrix - Vector Multiply (y = by + aAx): 2nd dimensions of y and b do not match\n";
01459           msg << "\t Labels are: y(" << RangeVector::memory_space::query_label(y.ptr_on_device()) << ") b("
01460               << CoeffVector1::memory_space::query_label(betav.ptr_on_device()) << ") a("
01461               << CoeffVector2::memory_space::query_label(alphav.ptr_on_device()) << ") x("
01462               << CrsMatrix::values_type::memory_space::query_label(A.values.ptr_on_device()) << ") x("
01463               << DomainVector::memory_space::query_label(x.ptr_on_device()) << ")\n";
01464           msg << "\t Dimensions are: y(" << y.dimension_0() << "," << y.dimension_1() << ") b(" << betav.dimension_0() << ")\n";
01465           Impl::throw_runtime_exception( msg.str() );
01466         }
01467       }
01468       if(doalpha==2) {
01469         if(alphav.dimension_0()!=numVecs) {
01470           std::ostringstream msg;
01471           msg << "Error in CRSMatrix - Vector Multiply (y = by + aAx): 2nd dimensions of x and b do not match\n";
01472           msg << "\t Labels are: y(" << RangeVector::memory_space::query_label(y.ptr_on_device()) << ") b("
01473               << CoeffVector1::memory_space::query_label(betav.ptr_on_device()) << ") a("
01474               << CoeffVector2::memory_space::query_label(alphav.ptr_on_device()) << ") x("
01475               << CrsMatrix::values_type::memory_space::query_label(A.values.ptr_on_device()) << ") x("
01476               << DomainVector::memory_space::query_label(x.ptr_on_device()) << ")\n";
01477           msg << "\t Dimensions are: x(" << x.dimension_0() << "," << x.dimension_1() << ") b(" << betav.dimension_0() << ")\n";
01478           Impl::throw_runtime_exception( msg.str() );
01479         }
01480       }
01481     }
01482   } // namespace Impl
01483 
  // This TransposeFunctor is functional, but not necessarily performant.
  /// \brief Functor for the transpose sparse matrix-multivector multiply:
  ///   accumulates y(:,k) += alpha(k) * A^T * x(:,k) via atomic adds.
  ///
  /// Work item i covers lane (i % ShflThreadsPerRow::device_value) of row
  /// (i / ShflThreadsPerRow::device_value) of A; because different rows of
  /// A scatter into the same rows of y, every update uses atomic_add.
  ///
  /// NOTE: beta and dobeta are carried but not used in operator(); the
  /// caller (MV_MultiplyTranspose) scales y with MV_MulScalar before
  /// launching this functor.
  template<class RangeVector,
           class CrsMatrix,
           class DomainVector,
           class CoeffVector1,
           class CoeffVector2,
           int doalpha,
           int dobeta,
           int NNZPerRow = 27 >
  struct MV_MultiplyTransposeFunctor {
    typedef typename CrsMatrix::device_type                   device_type ;
    typedef typename CrsMatrix::ordinal_type                    size_type ;
    typedef typename CrsMatrix::non_const_value_type         value_type ;
    typedef typename Kokkos::View<value_type*, device_type>  range_values;

    typedef MV_MultiplyShflThreadsPerRow< device_type , value_type , NNZPerRow > ShflThreadsPerRow ;

    CoeffVector1 beta;   // unused here; y is pre-scaled by the caller
    CoeffVector2 alpha;  // per-column alpha coefficients
    CrsMatrix  m_A ;     // sparse matrix operand (multiplied transposed)
    DomainVector  m_x ;  // input multivector
    RangeVector  m_y ;   // output multivector (updated atomically)
    size_type n;         // number of columns of x and y

    KOKKOS_INLINE_FUNCTION
    void operator()(int i) const {
      // Map the flat index onto (row, lane-within-row).
      const size_type iRow = i/ShflThreadsPerRow::device_value;
      const int lane = i%ShflThreadsPerRow::device_value;

      const SparseRowViewConst<CrsMatrix> row = m_A.rowConst(iRow);

      // Lanes stride cooperatively through the row's entries.
      for (size_type iEntry = lane; iEntry < row.length; iEntry += ShflThreadsPerRow::device_value ) {
        const value_type val = row.value(iEntry);
        const size_type ind = row.colidx(iEntry);

        // doalpha != 1: multiply by the per-column alpha coefficient.
        if(doalpha!=1) {
#ifdef KOKKOS_HAVE_PRAGMA_UNROLL
#pragma unroll
#endif
          for (size_type k = 0; k < n; ++k) {
            atomic_add(&m_y(ind,k), value_type(alpha(k) * val * m_x(iRow, k)));
          }
        } else {
#ifdef KOKKOS_HAVE_PRAGMA_UNROLL
#pragma unroll
#endif
          for (size_type k = 0; k < n; ++k) {
            atomic_add(&m_y(ind,k), value_type(val * m_x(iRow, k)));
          }
        }
      }
    }
  };
01537 
  // This TransposeFunctor is functional, but not necessarily performant.
01539   template<class RangeVector,
01540            class CrsMatrix,
01541            class DomainVector,
01542            class CoeffVector1,
01543            class CoeffVector2,
01544            int doalpha,
01545            int dobeta,
01546            int NNZPerRow = 27 >
01547   struct MV_MultiplyTransposeSingleFunctor {
01548     typedef typename CrsMatrix::device_type                   device_type ;
01549     typedef typename CrsMatrix::ordinal_type                    size_type ;
01550     typedef typename CrsMatrix::non_const_value_type         value_type ;
01551     typedef typename Kokkos::View<value_type*, device_type>  range_values;
01552 
01553     typedef MV_MultiplyShflThreadsPerRow< device_type , value_type , NNZPerRow > ShflThreadsPerRow ;
01554 
01555     CoeffVector1 beta;
01556     CoeffVector2 alpha;
01557     CrsMatrix  m_A ;
01558     DomainVector  m_x ;
01559     RangeVector  m_y ;
01560     size_type n;
01561 
01562     KOKKOS_INLINE_FUNCTION
01563     void operator()(int i) const {
01564       const size_type iRow = i/ShflThreadsPerRow::device_value;
01565       const int lane = i%ShflThreadsPerRow::device_value;
01566 
01567       const SparseRowViewConst<CrsMatrix> row = m_A.rowConst(iRow);
01568 
01569       for (size_type iEntry = lane; iEntry < row.length; iEntry += ShflThreadsPerRow::device_value ) {
01570         const value_type val = row.value(iEntry);
01571         const size_type ind = row.colidx(iEntry);
01572 
01573         if(doalpha!=1) {
01574           atomic_add(&m_y(ind), value_type(alpha(0) * val * m_x(iRow)));
01575         } else {
01576           atomic_add(&m_y(ind), value_type(val * m_x(iRow)));
01577         }
01578       }
01579     }
01580   };
01581 
/// \brief Compute y = b*y + a*A^T*x for rank-2 (multivector) x and y.
///
/// doalpha/dobeta encode the coefficients at compile time: -1, 0, 1 mean
/// the literal scalar; anything else (2 by convention) means "use the
/// betav/alphav coefficient vectors".  The transpose product itself is
/// performed by MV_MultiplyTransposeFunctor, which only accumulates
/// alpha*A^T*x into y with atomic adds; the beta scaling of y happens
/// here, via MV_MulScalar, before the parallel_for is launched.
template <class RangeVector,
          class TCrsMatrix,
          class DomainVector,
          class CoeffVector1,
          class CoeffVector2,
          int doalpha,
          int dobeta>
void
MV_MultiplyTranspose (typename Kokkos::Impl::enable_if<DomainVector::Rank == 2, const CoeffVector1>::type& betav,
             const RangeVector &y,
             const CoeffVector2 &alphav,
             const TCrsMatrix &A,
             const DomainVector &x)
{
  //Special case for zero Rows RowMap
  // NOTE(review): the comment above says "zero rows" but the sentinel
  // tested is -1 -- confirm numRows() returns -1 for an empty row map.
  if(A.numRows() == -1) return;

  // a == 0: no matrix product at all; just scale y by b and return.
  if (doalpha == 0) {
    if (dobeta==2) {
            MV_MulScalar(y,betav,y);
    } else {
            MV_MulScalar(y,static_cast<typename RangeVector::const_value_type> (dobeta),y);
    }
    return;
  } else {
    // Canonical view types for the kernel: y is non-const; x, alpha, and
    // beta become const views with random-access (texture-cache) traits.
    typedef View< typename RangeVector::non_const_data_type ,
                  typename RangeVector::array_layout ,
                  typename RangeVector::device_type ,
                  typename RangeVector::memory_traits >
    RangeVectorType;

    typedef View< typename DomainVector::const_data_type ,
                  typename DomainVector::array_layout ,
                  typename DomainVector::device_type ,
                  Kokkos::MemoryRandomAccess >
    DomainVectorType;

    typedef View< typename CoeffVector1::const_data_type ,
                  typename CoeffVector1::array_layout ,
                  typename CoeffVector1::device_type ,
                  Kokkos::MemoryRandomAccess >
    CoeffVector1Type;

    typedef View< typename CoeffVector2::const_data_type ,
                  typename CoeffVector2::array_layout ,
                  typename CoeffVector2::device_type ,
                  Kokkos::MemoryRandomAccess >
    CoeffVector2Type;

    typedef CrsMatrix<typename TCrsMatrix::const_value_type,
                      typename TCrsMatrix::ordinal_type,
                      typename TCrsMatrix::device_type,
                      typename TCrsMatrix::memory_traits,
                      typename TCrsMatrix::size_type> CrsMatrixType;

    //Impl::MV_Multiply_Check_Compatibility(betav,y,alphav,A,x,doalpha,dobeta);
/*
#ifndef KOKKOS_FAST_COMPILE

    if(x.dimension_1()==1) {
      typedef View<typename DomainVectorType::const_value_type*,typename DomainVector::array_layout ,typename DomainVectorType::device_type,Kokkos::MemoryRandomAccess> DomainVector1D;
      typedef View<typename DomainVectorType::const_value_type*,typename DomainVector::array_layout ,typename DomainVectorType::device_type> DomainVector1DPlain;
      typedef View<typename RangeVectorType::value_type*,typename RangeVector::array_layout ,typename RangeVectorType::device_type,typename RangeVector::memory_traits> RangeVector1D;

       typedef MV_MultiplySingleFunctor<RangeVector1D, CrsMatrixType, DomainVector1D,
                                CoeffVector1Type, CoeffVector2Type, doalpha, dobeta> OpType ;

       Kokkos::subview< RangeVector1D >( y , ALL(),0 );

       OpType op ;
       const typename CrsMatrixType::ordinal_type nrow = A.numRows();
       op.m_A = A ;
       op.m_x = Kokkos::subview< DomainVector1DPlain >( x , ALL(),0 ) ;
       op.m_y = Kokkos::subview< RangeVector1D >( y , ALL(),0 ) ;
       op.beta = betav;
       op.alpha = alphav;
       op.n = x.dimension(1);
       Kokkos::parallel_for (nrow * OpType::ShflThreadsPerRow::host_value(), op);

    } else {
      typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
                       CoeffVector1Type, CoeffVector2Type, doalpha, dobeta> OpType ;
      OpType op ;
      const typename CrsMatrixType::ordinal_type nrow = A.numRows();
      op.m_A = A ;
      op.m_x = x ;
      op.m_y = y ;
      op.beta = betav;
      op.alpha = alphav;
      op.n = x.dimension(1);
      Kokkos::parallel_for(nrow*OpType::ShflThreadsPerRow::host_value() , op);
    }

#else // NOT KOKKOS_FAST_COMPILE
*/
    // The functor is always instantiated with coefficient code 2 (vector
    // coefficients); scalar doalpha/dobeta codes are converted into
    // coefficient vectors below.
    typedef MV_MultiplyTransposeFunctor<RangeVectorType, CrsMatrixType, DomainVectorType, CoeffVector1Type, CoeffVector2Type, 2, 2 >
      OpType ;

    OpType op ;

    int numVecs = x.dimension_1();
    CoeffVector1 beta = betav;
    CoeffVector2 alpha = alphav;

    // Scalar alpha code: materialize a vector filled with that scalar.
    if (doalpha != 2) {
            alpha = CoeffVector2("CrsMatrix::auto_a", numVecs);
            typename CoeffVector2::HostMirror h_a = Kokkos::create_mirror_view(alpha);
            typename CoeffVector2::value_type s_a = (typename CoeffVector2::value_type) doalpha;

            for (int i = 0; i < numVecs; ++i)
              h_a(i) = s_a;

            Kokkos::deep_copy(alpha, h_a);
    }

    // Scalar beta code: same materialization (though the kernel itself
    // does not read op.beta -- y is pre-scaled just below).
    if (dobeta != 2) {
            beta = CoeffVector1("CrsMatrix::auto_b", numVecs);
            typename CoeffVector1::HostMirror h_b = Kokkos::create_mirror_view(beta);
            typename CoeffVector1::value_type s_b = (typename CoeffVector1::value_type) dobeta;

            for(int i = 0; i < numVecs; i++)
              h_b(i) = s_b;

            Kokkos::deep_copy(beta, h_b);
    }

    // Apply the beta scaling to y up front; the transpose kernel only
    // accumulates into y (dobeta == 1 needs no scaling at all).
    if (dobeta==2) {
       MV_MulScalar(y,betav,y);
    } else {
      if (dobeta!=1)
       MV_MulScalar(y,static_cast<typename RangeVector::const_value_type> (dobeta),y);
    }

    // One work item per (row, shuffle-lane) pair.
    const typename CrsMatrixType::ordinal_type nrow = A.numRows();
    op.m_A = A;
    op.m_x = x;
    op.m_y = y;
    op.beta = beta;
    op.alpha = alpha;
    op.n = x.dimension_1();
    Kokkos::parallel_for (nrow * OpType::ShflThreadsPerRow::host_value() , op );

//#endif // KOKKOS_FAST_COMPILE
  }
}
01727 
/// \brief Run-time dispatch wrapper for the transpose multiply.
///
/// Translates the integer coefficient codes beta and alpha -- each of
/// which is -1, 0, 1, or "anything else" (mapped to code 2, meaning the
/// betav/alphav coefficient vectors are used) -- onto the matching
/// compile-time instantiation of MV_MultiplyTranspose.
template<class RangeVector, class CrsMatrix, class DomainVector, class CoeffVector1, class CoeffVector2>
void
MV_MultiplyTranspose (const CoeffVector1& betav,
             const RangeVector& y,
             const CoeffVector2& alphav,
             const CrsMatrix& A,
             const DomainVector& x,
             int beta,
             int alpha)
{
  if (beta == 0) {
    if (alpha == 0) {
      MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2,  0, 0> (betav, y, alphav, A, x);
    } else if (alpha == 1) {
      MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2,  1, 0> (betav, y, alphav, A, x);
    } else if (alpha == -1) {
      MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, 0> (betav, y, alphav, A, x);
    } else {
      MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2,  2, 0> (betav, y, alphav, A, x);
    }
  } else if (beta == 1) {
    if (alpha == 0) {
      return; // y = 1*y + 0*A^T*x leaves y unchanged.
    } else if (alpha == 1) {
      MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2,  1, 1> (betav, y, alphav, A, x);
    } else if (alpha == -1) {
      MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, 1> (betav, y, alphav, A, x);
    } else {
      MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2,  2, 1> (betav, y, alphav, A, x);
    }
  } else if (beta == -1) {
    if (alpha == 0) {
      MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2,  0, -1> (betav, y, alphav, A, x);
    } else if (alpha == 1) {
      MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2,  1, -1> (betav, y, alphav, A, x);
    } else if (alpha == -1) {
      MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, -1> (betav, y, alphav, A, x);
    } else {
      MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2,  2, -1> (betav, y, alphav, A, x);
    }
  } else {
    if (alpha == 0) {
      MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2,  0, 2> (betav, y, alphav, A, x);
    } else if (alpha == 1) {
      MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2,  1, 2> (betav, y, alphav, A, x);
    } else if (alpha == -1) {
      MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, 2> (betav, y, alphav, A, x);
    } else {
      MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2,  2, 2> (betav, y, alphav, A, x);
    }
  }
}
01776 
01777 template<class RangeVector, class CrsMatrix, class DomainVector>
01778 void
01779 MV_MultiplyTranspose (typename RangeVector::const_value_type s_b,
01780              const RangeVector& y,
01781              typename DomainVector::const_value_type s_a,
01782              const CrsMatrix& A,
01783              const DomainVector& x)
01784 {
01785 /*#ifdef KOKKOS_USE_CUSPARSE
01786   if (MV_Multiply_Try_CuSparse (s_b, y, s_a, A, x)) {
01787     return;
01788   }
01789 #endif // KOKKOSE_USE_CUSPARSE
01790 #ifdef KOKKOS_USE_MKL
01791   if (MV_Multiply_Try_MKL (s_b, y, s_a, A, x)) {
01792     return;
01793   }
01794 #endif // KOKKOS_USE_MKL*/
01795   typedef Kokkos::View<typename RangeVector::value_type*, typename RangeVector::device_type> aVector;
01796   aVector a;
01797   aVector b;
01798   int numVecs = x.dimension_1();
01799 
01800   if (s_b == 0) {
01801     if (s_a == 0)
01802       return MV_MultiplyTranspose (a, y, a, A, x, 0, 0);
01803     else if (s_a == 1)
01804       return MV_MultiplyTranspose (a, y, a, A, x, 0, 1);
01805     else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
01806       return MV_MultiplyTranspose (a, y, a, A, x, 0, -1);
01807     else {
01808       a = aVector("a", numVecs);
01809       typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
01810       for (int i = 0; i < numVecs; ++i) {
01811         h_a(i) = s_a;
01812       }
01813       Kokkos::deep_copy (a, h_a);
01814       return MV_MultiplyTranspose (a, y, a, A, x, 0, 2);
01815     }
01816   } else if (s_b == 1) {
01817     if (s_a == 0)
01818       return MV_MultiplyTranspose (a, y, a, A, x, 1, 0);
01819     else if (s_a == 1)
01820       return MV_MultiplyTranspose (a, y, a, A, x, 1, 1);
01821     else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
01822       return MV_MultiplyTranspose (a, y, a, A, x, 1, -1);
01823     else {
01824       a = aVector("a", numVecs);
01825       typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
01826       for (int i = 0; i < numVecs; ++i) {
01827         h_a(i) = s_a;
01828       }
01829       Kokkos::deep_copy (a, h_a);
01830       return MV_MultiplyTranspose (a, y, a, A, x, 1, 2);
01831     }
01832   } else if (s_b == static_cast<typename RangeVector::const_value_type> (-1)) {
01833     if (s_a == 0)
01834       return MV_MultiplyTranspose (a, y, a, A, x, -1, 0);
01835     else if (s_a == 1)
01836       return MV_MultiplyTranspose (a, y, a, A, x, -1, 1);
01837     else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
01838       return MV_MultiplyTranspose (a, y, a, A, x, -1, -1);
01839     else {
01840       a = aVector("a", numVecs);
01841       typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
01842       for (int i = 0; i < numVecs; ++i) {
01843         h_a(i) = s_a;
01844       }
01845       Kokkos::deep_copy (a, h_a);
01846       return MV_MultiplyTranspose (a, y, a, A, x, -1, 2);
01847     }
01848   } else {
01849     b = aVector("b", numVecs);
01850     typename aVector::HostMirror h_b = Kokkos::create_mirror_view(b);
01851     for (int i = 0; i < numVecs; ++i) {
01852       h_b(i) = s_b;
01853     }
01854     Kokkos::deep_copy(b, h_b);
01855 
01856     if (s_a == 0)
01857       return MV_MultiplyTranspose (b, y, a, A, x, 2, 0);
01858     else if (s_a == 1)
01859       return MV_MultiplyTranspose (b, y, a, A, x, 2, 1);
01860     else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
01861       return MV_MultiplyTranspose (b, y, a, A, x, 2, -1);
01862     else {
01863       a = aVector("a", numVecs);
01864       typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
01865       for (int i = 0; i < numVecs; ++i) {
01866         h_a(i) = s_a;
01867       }
01868       Kokkos::deep_copy (a, h_a);
01869       return MV_MultiplyTranspose (b, y, a, A, x, 2, 2);
01870     }
01871   }
01872 }
01873 
01874 template< class DeviceType >
01875 Kokkos::TeamPolicy< DeviceType >
01876 inline
01877 mv_multiply_team_policy( const int nrow , const int rows_per_thread, const int increment )
01878 {
01879 #ifdef KOKKOS_HAVE_CUDA
01880   const int teamsize = Impl::is_same< DeviceType , Kokkos::Cuda>::value ? 256 : 1;//hwloc::get_available_threads_per_core() ;
01881 #else
01882   const int teamsize = 1;//hwloc::get_available_threads_per_core();
01883 #endif
01884   const int nteams = (((nrow+rows_per_thread-1)/rows_per_thread)
01885                       *increment+teamsize-1)/teamsize;
01886   return Kokkos::TeamPolicy< DeviceType >( nteams , teamsize );
01887 }
01888 
01889 
01890   template<class RangeVector,
01891            class TCrsMatrix,
01892            class DomainVector,
01893            class CoeffVector1,
01894            class CoeffVector2,
01895            int doalpha,
01896            int dobeta>
01897   void
01898   MV_MultiplySingle (typename Kokkos::Impl::enable_if<DomainVector::Rank == 1, const CoeffVector1>::type& betav,
01899                const RangeVector &y,
01900                const CoeffVector2 &alphav,
01901                const TCrsMatrix& A,
01902                const DomainVector& x)
01903   {
01904     if(A.numRows()<=0) return;
01905     if (doalpha == 0) {
01906       if (dobeta==2) {
01907               V_MulScalar(y,betav,y);
01908       }
01909       else {
01910               V_MulScalar(y,typename RangeVector::value_type(dobeta),y);
01911       }
01912       return;
01913     } else {
01914       typedef View< typename RangeVector::non_const_data_type ,
01915                     typename RangeVector::array_layout ,
01916                     typename RangeVector::device_type ,
01917                     typename RangeVector::memory_traits >
01918       RangeVectorType;
01919 
01920       typedef View< typename DomainVector::const_data_type ,
01921                     typename DomainVector::array_layout ,
01922                     typename DomainVector::device_type ,
01923                     //typename DomainVector::memory_traits >
01924                     Kokkos::MemoryRandomAccess >
01925       DomainVectorType;
01926 
01927       typedef View< typename CoeffVector1::const_data_type ,
01928                     typename CoeffVector1::array_layout ,
01929                     typename CoeffVector1::device_type ,
01930                     Kokkos::MemoryRandomAccess >
01931       CoeffVector1Type;
01932 
01933       typedef View< typename CoeffVector2::const_data_type ,
01934                     typename CoeffVector2::array_layout ,
01935                     typename CoeffVector2::device_type ,
01936                     Kokkos::MemoryRandomAccess >
01937       CoeffVector2Type;
01938 
01939       typedef CrsMatrix<typename TCrsMatrix::const_value_type,
01940                         typename TCrsMatrix::ordinal_type,
01941                         typename TCrsMatrix::device_type,
01942                         typename TCrsMatrix::memory_traits,
01943                         typename TCrsMatrix::size_type>
01944       CrsMatrixType;
01945 
01946       Impl::MV_Multiply_Check_Compatibility(betav,y,alphav,A,x,doalpha,dobeta);
01947 
01948       const int NNZPerRow = A.nnz()/A.numRows();
01949 
01950 #ifndef KOKKOS_FAST_COMPILE
01951 
01952       if(NNZPerRow>=96) {
01953         typedef Vectorization<typename RangeVector::device_type,32> vec_type;
01954         typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
01955                                  CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment > OpType ;
01956 
01957         const typename CrsMatrixType::ordinal_type nrow = A.numRows();
01958 
01959         OpType op(betav,alphav,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
01960         Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
01961              ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
01962 
01963       } else if(NNZPerRow>=48) {
01964         typedef Vectorization<typename RangeVector::device_type,16> vec_type;
01965         typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
01966                                  CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment > OpType ;
01967 
01968         const typename CrsMatrixType::ordinal_type nrow = A.numRows();
01969 
01970         OpType op(betav,alphav,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
01971         Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
01972              ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
01973 
01974       } else if(NNZPerRow>=24) {
01975         typedef Vectorization<typename RangeVector::device_type,8> vec_type;
01976         typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
01977                                  CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment > OpType ;
01978 
01979         const typename CrsMatrixType::ordinal_type nrow = A.numRows();
01980 
01981         OpType op(betav,alphav,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
01982         Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
01983              ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
01984 
01985       } else if(NNZPerRow>=12) {
01986         typedef Vectorization<typename RangeVector::device_type,4> vec_type;
01987         typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
01988                                  CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment > OpType ;
01989 
01990         const typename CrsMatrixType::ordinal_type nrow = A.numRows();
01991 
01992         OpType op(betav,alphav,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
01993         Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
01994              ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
01995 
01996       } else if(NNZPerRow>=4) {
01997         typedef Vectorization<typename RangeVector::device_type,2> vec_type;
01998         typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
01999                                  CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment > OpType ;
02000 
02001         const typename CrsMatrixType::ordinal_type nrow = A.numRows();
02002 
02003         OpType op(betav,alphav,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
02004         Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
02005              ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
02006 
02007       } else {
02008         typedef Vectorization<typename RangeVector::device_type,1> vec_type;
02009         typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
02010                                  CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment > OpType ;
02011 
02012         const typename CrsMatrixType::ordinal_type nrow = A.numRows();
02013 
02014         OpType op(betav,alphav,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
02015         Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
02016              ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
02017 
02018       }
02019 #else // NOT KOKKOS_FAST_COMPILE
02020       typedef Vectorization<typename RangeVector::device_type,8> vec_type;
02021       typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
02022                                CoeffVector1Type, CoeffVector2Type, 2, 2, vec_type::increment > OpType ;
02023 
02024       int numVecs = x.dimension_1(); // == 1
02025       CoeffVector1 beta = betav;
02026       CoeffVector2 alpha = alphav;
02027 
02028       if(doalpha!=2) {
02029               alpha = CoeffVector2("CrsMatrix::auto_a", numVecs);
02030               typename CoeffVector2::HostMirror h_a = Kokkos::create_mirror_view(alpha);
02031               typename CoeffVector2::value_type s_a = (typename CoeffVector2::value_type) doalpha;
02032 
02033               for(int i = 0; i < numVecs; i++)
02034                 h_a(i) = s_a;
02035 
02036               Kokkos::deep_copy(alpha, h_a);
02037       }
02038       if(dobeta!=2) {
02039               beta = CoeffVector1("CrsMatrix::auto_b", numVecs);
02040               typename CoeffVector1::HostMirror h_b = Kokkos::create_mirror_view(beta);
02041               typename CoeffVector1::value_type s_b = (typename CoeffVector1::value_type) dobeta;
02042 
02043               for(int i = 0; i < numVecs; i++)
02044                 h_b(i) = s_b;
02045 
02046               Kokkos::deep_copy(beta, h_b);
02047       }
02048 
02049       const typename CrsMatrixType::ordinal_type nrow = A.numRows();
02050 
02051       OpType op(beta,alpha,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
02052       Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
02053            ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
02054 
02055 
02056 #endif // KOKKOS_FAST_COMPILE
02057     }
02058   }
02059 
02060 template <class RangeVector,
02061             class TCrsMatrix,
02062             class DomainVector,
02063             class CoeffVector1,
02064             class CoeffVector2,
02065             int doalpha,
02066             int dobeta>
02067   void
02068   MV_Multiply (typename Kokkos::Impl::enable_if<DomainVector::Rank == 2, const CoeffVector1>::type& betav,
02069                const RangeVector &y,
02070                const CoeffVector2 &alphav,
02071                const TCrsMatrix &A,
02072                const DomainVector &x)
02073   {
02074     //Special case for zero Rows RowMap
02075     if(A.numRows() <= 0) return;
02076 
02077     if (doalpha == 0) {
02078       if (dobeta==2) {
02079               MV_MulScalar(y,betav,y);
02080       } else {
02081               MV_MulScalar(y,static_cast<typename RangeVector::const_value_type> (dobeta),y);
02082       }
02083       return;
02084     } else {
02085       typedef View< typename RangeVector::non_const_data_type ,
02086                     typename RangeVector::array_layout ,
02087                     typename RangeVector::device_type ,
02088                     typename RangeVector::memory_traits >
02089       RangeVectorType;
02090 
02091       typedef View< typename DomainVector::const_data_type ,
02092                     typename DomainVector::array_layout ,
02093                     typename DomainVector::device_type ,
02094                     Kokkos::MemoryRandomAccess >
02095       DomainVectorType;
02096 
02097       typedef View< typename CoeffVector1::const_data_type ,
02098                     typename CoeffVector1::array_layout ,
02099                     typename CoeffVector1::device_type ,
02100                     Kokkos::MemoryRandomAccess >
02101       CoeffVector1Type;
02102 
02103       typedef View< typename CoeffVector2::const_data_type ,
02104                     typename CoeffVector2::array_layout ,
02105                     typename CoeffVector2::device_type ,
02106                     Kokkos::MemoryRandomAccess >
02107       CoeffVector2Type;
02108 
02109       typedef CrsMatrix<typename TCrsMatrix::const_value_type,
02110                         typename TCrsMatrix::ordinal_type,
02111                         typename TCrsMatrix::device_type,
02112                         typename TCrsMatrix::memory_traits,
02113                         typename TCrsMatrix::size_type> CrsMatrixType;
02114 
02115       Impl::MV_Multiply_Check_Compatibility(betav,y,alphav,A,x,doalpha,dobeta);
02116 
02117       const int NNZPerRow = A.nnz()/A.numRows();
02118 
02119 #ifndef KOKKOS_FAST_COMPILE
02120 
02121       if(x.dimension_1()==1) {
02122         typedef View<typename DomainVectorType::const_value_type*,typename DomainVector::array_layout ,typename DomainVectorType::device_type> DomainVector1D;
02123         typedef View<typename RangeVectorType::value_type*,typename RangeVector::array_layout ,typename RangeVectorType::device_type,typename RangeVector::memory_traits> RangeVector1D;
02124         RangeVector1D y_sub = RangeVector1D(Kokkos::ViewWithoutManaging(),y.ptr_on_device(),y.dimension_0());
02125         DomainVector1D x_sub = DomainVector1D(Kokkos::ViewWithoutManaging(),x.ptr_on_device(),x.dimension_0());
02126 
02127         return MV_MultiplySingle<RangeVector1D,TCrsMatrix,DomainVector1D,CoeffVector1,CoeffVector2,doalpha,dobeta>
02128           (betav,y_sub,alphav,A,x_sub);
02129 
02130       } else {
02131 
02132         //Currently for multiple right hand sides its not worth it to use more than 8 threads per row on GPUs
02133         if(NNZPerRow>=96) {
02134           typedef Vectorization<typename RangeVector::device_type,8> vec_type;
02135           typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
02136                                    CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment> OpType ;
02137 
02138           const typename CrsMatrixType::ordinal_type nrow = A.numRows();
02139 
02140           OpType op(betav,alphav,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
02141           Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
02142                ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
02143 
02144         } else if(NNZPerRow>=48) {
02145           typedef Vectorization<typename RangeVector::device_type,8> vec_type;
02146           typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
02147                                    CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment> OpType ;
02148 
02149           const typename CrsMatrixType::ordinal_type nrow = A.numRows();
02150 
02151           OpType op(betav,alphav,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
02152           Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
02153                ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
02154 
02155         } else if(NNZPerRow>=16) {
02156           typedef Vectorization<typename RangeVector::device_type,8> vec_type;
02157           typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
02158                                    CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment> OpType ;
02159 
02160           const typename CrsMatrixType::ordinal_type nrow = A.numRows();
02161 
02162           OpType op(betav,alphav,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
02163           Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
02164                ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
02165 
02166         } else if(NNZPerRow>=12) {
02167           typedef Vectorization<typename RangeVector::device_type,4> vec_type;
02168           typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
02169                                    CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment> OpType ;
02170 
02171           const typename CrsMatrixType::ordinal_type nrow = A.numRows();
02172 
02173           OpType op(betav,alphav,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
02174           Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
02175                ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
02176 
02177         } else if(NNZPerRow>=4) {
02178           typedef Vectorization<typename RangeVector::device_type,2> vec_type;
02179           typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
02180                                    CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment> OpType ;
02181 
02182           const typename CrsMatrixType::ordinal_type nrow = A.numRows();
02183 
02184           OpType op(betav,alphav,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
02185           Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
02186                ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
02187 
02188         } else {
02189           typedef Vectorization<typename RangeVector::device_type,1> vec_type;
02190           typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
02191                                    CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment> OpType ;
02192 
02193           const typename CrsMatrixType::ordinal_type nrow = A.numRows();
02194 
02195           OpType op(betav,alphav,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
02196           Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
02197                ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
02198 
02199         }
02200       }
02201 
02202 #else // NOT KOKKOS_FAST_COMPILE
02203 
02204       typedef Vectorization<typename RangeVector::device_type,8> vec_type;
02205       typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
02206                                  CoeffVector1Type, CoeffVector2Type, 2, 2, vec_type::increment >
02207         OpType ;
02208 
02209       int numVecs = x.dimension_1();
02210       CoeffVector1 beta = betav;
02211       CoeffVector2 alpha = alphav;
02212 
02213       if (doalpha != 2) {
02214               alpha = CoeffVector2("CrsMatrix::auto_a", numVecs);
02215               typename CoeffVector2::HostMirror h_a = Kokkos::create_mirror_view(alpha);
02216               typename CoeffVector2::value_type s_a = (typename CoeffVector2::value_type) doalpha;
02217 
02218               for (int i = 0; i < numVecs; ++i)
02219                 h_a(i) = s_a;
02220 
02221               Kokkos::deep_copy(alpha, h_a);
02222       }
02223 
02224       if (dobeta != 2) {
02225               beta = CoeffVector1("CrsMatrix::auto_b", numVecs);
02226               typename CoeffVector1::HostMirror h_b = Kokkos::create_mirror_view(beta);
02227               typename CoeffVector1::value_type s_b = (typename CoeffVector1::value_type) dobeta;
02228 
02229               for(int i = 0; i < numVecs; i++)
02230                 h_b(i) = s_b;
02231 
02232               Kokkos::deep_copy(beta, h_b);
02233       }
02234 
02235       const typename CrsMatrixType::ordinal_type nrow = A.numRows();
02236 
02237       OpType op(beta,alpha,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
02238 
02239       Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
02240            ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );
02241 
02242 
02243 #endif // KOKKOS_FAST_COMPILE
02244     }
02245   }
02246 
02247   template<class RangeVector,
02248            class TCrsMatrix,
02249            class DomainVector,
02250            class CoeffVector1,
02251            class CoeffVector2,
02252            int doalpha,
02253            int dobeta>
02254   void
02255   MV_Multiply (typename Kokkos::Impl::enable_if<DomainVector::Rank == 1, const CoeffVector1>::type& betav,
02256                const RangeVector &y,
02257                const CoeffVector2 &alphav,
02258                const TCrsMatrix& A,
02259                const DomainVector& x) {
02260     return MV_MultiplySingle<RangeVector,TCrsMatrix,DomainVector,CoeffVector1,CoeffVector2,doalpha,dobeta>
02261            (betav,y,alphav,A,x);
02262   }
02263 
02264     template<class RangeVector, class CrsMatrix, class DomainVector, class CoeffVector1, class CoeffVector2>
02265   void
02266   MV_Multiply (const CoeffVector1& betav,
02267                const RangeVector& y,
02268                const CoeffVector2& alphav,
02269                const CrsMatrix& A,
02270                const DomainVector& x,
02271                int beta,
02272                int alpha)
02273   {
02274     if (beta == 0) {
02275       if(alpha == 0)
02276         MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 0, 0>(betav, y, alphav, A ,  x);
02277       else if(alpha == 1)
02278         MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, 0>(betav, y, alphav, A ,  x);
02279       else if(alpha == -1)
02280         MV_Multiply < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, 0 > (betav, y, alphav, A ,  x);
02281       else
02282         MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, 0>(betav, y, alphav, A ,  x);
02283     } else if(beta == 1) {
02284       if(alpha == 0)
02285         return;
02286       else if(alpha == 1)
02287         MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, 1>(betav, y, alphav, A ,  x);
02288       else if(alpha == -1)
02289         MV_Multiply < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, 1 > (betav, y, alphav, A ,  x);
02290       else
02291         MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, 1>(betav, y, alphav, A ,  x);
02292     } else if(beta == -1) {
02293       if(alpha == 0)
02294         MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 0, -1>(betav, y, alphav, A ,  x);
02295       else if(alpha == 1)
02296         MV_Multiply < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, -1 > (betav, y, alphav, A ,  x);
02297       else if(alpha == -1)
02298         MV_Multiply < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, -1 > (betav, y, alphav, A ,  x);
02299       else
02300         MV_Multiply < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, -1 > (betav, y, alphav, A ,  x);
02301     } else {
02302       if(alpha == 0)
02303         MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 0, 2>(betav, y, alphav, A ,  x);
02304       else if(alpha == 1)
02305         MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, 2>(betav, y, alphav, A ,  x);
02306       else if(alpha == -1)
02307         MV_Multiply < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, 2 > (betav, y, alphav, A ,  x);
02308       else
02309         MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, 2>(betav, y, alphav, A ,  x);
02310     }
02311   }
02312 
02313   template <class RangeVector, class CrsMatrix, class DomainVector,
02314             class Value1, class Layout1, class Device1, class MemoryManagement1,
02315             class Value2, class Layout2, class Device2, class MemoryManagement2>
02316   void
02317   MV_Multiply (const Kokkos::View<Value1, Layout1, Device1, MemoryManagement1>& betav,
02318                const RangeVector& y,
02319                const Kokkos::View<Value2, Layout2, Device2, MemoryManagement2>& alphav,
02320                const CrsMatrix& A,
02321                const DomainVector& x)
02322   {
02323     return MV_Multiply (betav, y, alphav, A, x, 2, 2);
02324   }
02325 
02326   template <class RangeVector, class CrsMatrix, class DomainVector,
02327             class Value1, class Layout1, class Device1, class MemoryManagement1>
02328   void
02329   MV_Multiply (const RangeVector& y,
02330                const Kokkos::View<Value1, Layout1, Device1, MemoryManagement1>& alphav,
02331                const CrsMatrix& A,
02332                const DomainVector& x)
02333   {
02334     return MV_Multiply (alphav, y, alphav, A, x, 0, 2);
02335   }
02336 
02337   template<class RangeVector, class CrsMatrix, class DomainVector>
02338   void
02339   MV_Multiply (const RangeVector& y,
02340                const CrsMatrix& A,
02341                const DomainVector& x)
02342   {
02343     // FIXME (mfh 21 Jun 2013) The way this code is supposed to work, is
02344     // that it tests at run time for each TPL in turn.  Shouldn't it
02345     // rather dispatch on the Device type?  But I suppose the "try"
02346     // functions do that.
02347     //
02348     // We want to condense this a bit: "Try TPLs" function that tests
02349     // all the suitable TPLs at run time.  This would be a run-time test
02350     // that compares the Scalar and Device types to those accepted by
02351     // the TPL(s).
02352 #ifdef KOKKOS_USE_CUSPARSE
02353     if (CuSparse::MV_Multiply_Try_CuSparse (0.0, y, 1.0, A, x)) {
02354       return;
02355     }
02356 #endif // KOKKOS_USE_CUSPARSE
02357 #ifdef KOKKOS_USE_MKL
02358     if (MV_Multiply_Try_MKL (0.0, y, 1.0, A, x)) {
02359       return;
02360     }
02361 #endif // KOKKOS_USE_MKL
02362     typedef Kokkos::View<typename DomainVector::value_type*, typename DomainVector::device_type> aVector;
02363     aVector a;
02364 
02365     return MV_Multiply (a, y, a, A, x, 0, 1);
02366   }
02367 
02368   template<class RangeVector, class CrsMatrix, class DomainVector>
02369   void
02370   MV_Multiply (const RangeVector& y,
02371                typename DomainVector::const_value_type s_a,
02372                const CrsMatrix& A,
02373                const DomainVector& x)
02374   {
02375 #ifdef KOKKOS_USE_CUSPARSE
02376     if (CuSparse::MV_Multiply_Try_CuSparse (0.0, y, s_a, A, x)) {
02377       return;
02378     }
02379 #endif // KOKKOS_USE_CUSPARSE
02380 #ifdef KOKKOS_USE_MKL
02381     if (MV_Multiply_Try_MKL (0.0, y, s_a, A, x)) {
02382       return;
02383     }
02384 #endif // KOKKOS_USE_MKL
02385     typedef Kokkos::View<typename RangeVector::value_type*, typename RangeVector::device_type> aVector;
02386     aVector a;
02387     const int numVecs = x.dimension_1();
02388 
02389     if ((s_a < 1) && (s_a != 0)) {
02390       return MV_Multiply (a, y, a, A, x, 0, -1);
02391     } else if (s_a == 1) {
02392       return MV_Multiply (a, y, a, A, x, 0, 1);
02393     }
02394 
02395     if (s_a != 0) {
02396       a = aVector("a", numVecs);
02397       typename aVector::HostMirror h_a = Kokkos::create_mirror_view (a);
02398       for (int i = 0; i < numVecs; ++i) {
02399               h_a(i) = s_a;
02400       }
02401       Kokkos::deep_copy(a, h_a);
02402       return MV_Multiply (a, y, a, A, x, 0, 2);
02403     }
02404   }
02405 
02406   template<class RangeVector, class CrsMatrix, class DomainVector>
02407   void
02408   MV_Multiply (typename RangeVector::const_value_type s_b,
02409                const RangeVector& y,
02410                typename DomainVector::const_value_type s_a,
02411                const CrsMatrix& A,
02412                const DomainVector& x)
02413   {
02414 #ifdef KOKKOS_USE_CUSPARSE
02415     if (CuSparse::MV_Multiply_Try_CuSparse (s_b, y, s_a, A, x)) {
02416       return;
02417     }
02418 #endif // KOKKOSE_USE_CUSPARSE
02419 #ifdef KOKKOS_USE_MKL
02420     if (MV_Multiply_Try_MKL (s_b, y, s_a, A, x)) {
02421       return;
02422     }
02423 #endif // KOKKOS_USE_MKL
02424     typedef Kokkos::View<typename RangeVector::value_type*, typename RangeVector::device_type> aVector;
02425     aVector a;
02426     aVector b;
02427     int numVecs = x.dimension_1();
02428 
02429     // [HCE 2013/12/09] Following 'if' appears to be a mistake and has been commented out
02430     // if(numVecs==1)
02431     if (s_b == 0) {
02432       if (s_a == 0)
02433         return MV_Multiply (a, y, a, A, x, 0, 0);
02434       else if (s_a == 1)
02435         return MV_Multiply (a, y, a, A, x, 0, 1);
02436       else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
02437         return MV_Multiply (a, y, a, A, x, 0, -1);
02438       else {
02439         a = aVector("a", numVecs);
02440         typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
02441         for (int i = 0; i < numVecs; ++i) {
02442           h_a(i) = s_a;
02443         }
02444         Kokkos::deep_copy (a, h_a);
02445         return MV_Multiply (a, y, a, A, x, 0, 2);
02446       }
02447     } else if (s_b == 1) {
02448       if (s_a == 0)
02449         return MV_Multiply (a, y, a, A, x, 1, 0);
02450       else if (s_a == 1)
02451         return MV_Multiply (a, y, a, A, x, 1, 1);
02452       else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
02453         return MV_Multiply (a, y, a, A, x, 1, -1);
02454       else {
02455         a = aVector("a", numVecs);
02456         typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
02457         for (int i = 0; i < numVecs; ++i) {
02458           h_a(i) = s_a;
02459         }
02460         Kokkos::deep_copy (a, h_a);
02461         return MV_Multiply (a, y, a, A, x, 1, 2);
02462       }
02463     } else if (s_b == static_cast<typename RangeVector::const_value_type> (-1)) {
02464       if (s_a == 0)
02465         return MV_Multiply (a, y, a, A, x, -1, 0);
02466       else if (s_a == 1)
02467         return MV_Multiply (a, y, a, A, x, -1, 1);
02468       else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
02469         return MV_Multiply (a, y, a, A, x, -1, -1);
02470       else {
02471         a = aVector("a", numVecs);
02472         typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
02473         for (int i = 0; i < numVecs; ++i) {
02474           h_a(i) = s_a;
02475         }
02476         Kokkos::deep_copy (a, h_a);
02477         return MV_Multiply (a, y, a, A, x, -1, 2);
02478       }
02479     } else {
02480       b = aVector("b", numVecs);
02481       typename aVector::HostMirror h_b = Kokkos::create_mirror_view(b);
02482       for (int i = 0; i < numVecs; ++i) {
02483         h_b(i) = s_b;
02484       }
02485       Kokkos::deep_copy(b, h_b);
02486 
02487       if (s_a == 0)
02488         return MV_Multiply (b, y, a, A, x, 2, 0);
02489       else if (s_a == 1)
02490         return MV_Multiply (b, y, a, A, x, 2, 1);
02491       else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
02492         return MV_Multiply (b, y, a, A, x, 2, -1);
02493       else {
02494         a = aVector("a", numVecs);
02495         typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
02496         for (int i = 0; i < numVecs; ++i) {
02497           h_a(i) = s_a;
02498         }
02499         Kokkos::deep_copy (a, h_a);
02500         return MV_Multiply (b, y, a, A, x, 2, 2);
02501       }
02502     }
02503   }
02504 
02505   namespace KokkosCrsMatrix {
02519     template <class CrsMatrixDst, class CrsMatrixSrc>
02520     void deep_copy (CrsMatrixDst A, CrsMatrixSrc B) {
02521       Kokkos::deep_copy(A.graph.entries, B.graph.entries);
02522       // FIXME (mfh 09 Aug 2013) This _should_ copy the row map.  We
02523       // couldn't do it before because the row map was const, forbidding
02524       // deep_copy.
02525       //
02526       //Kokkos::deep_copy(A.graph.row_map,B.graph.row_map);
02527       Kokkos::deep_copy(A.values, B.values);
02528 
02529       // FIXME (mfh 09 Aug 2013) Be sure to copy numRows, numCols, and
02530       // nnz as well.
02531       // (CRT 25 Sep 2013) don't copy rather check that they match.
02532       // Deep_copy in Kokkos is intended for copy between compatible objects.
02533     }
02534   } // namespace KokkosCrsMatrix
02535 
02536 } // namespace Kokkos
02537 
02538 #endif /* KOKKOS_CRSMATRIX_H_ */
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Friends