Kokkos Core Kernels Package — Version of the Day
00001 /* 00002 //@HEADER 00003 // ************************************************************************ 00004 // 00005 // Kokkos: Node API and Parallel Node Kernels 00006 // Copyright (2008) Sandia Corporation 00007 // 00008 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, 00009 // the U.S. Government retains certain rights in this software. 00010 // 00011 // Redistribution and use in source and binary forms, with or without 00012 // modification, are permitted provided that the following conditions are 00013 // met: 00014 // 00015 // 1. Redistributions of source code must retain the above copyright 00016 // notice, this list of conditions and the following disclaimer. 00017 // 00018 // 2. Redistributions in binary form must reproduce the above copyright 00019 // notice, this list of conditions and the following disclaimer in the 00020 // documentation and/or other materials provided with the distribution. 00021 // 00022 // 3. Neither the name of the Corporation nor the names of the 00023 // contributors may be used to endorse or promote products derived from 00024 // this software without specific prior written permission. 00025 // 00026 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY 00027 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00028 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 00029 // PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE 00030 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 00031 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 00032 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00033 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00034 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00035 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00036 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00037 // 00038 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 00039 // 00040 // ************************************************************************ 00041 //@HEADER 00042 */ 00043 #ifndef KOKKOS_CRSMATRIX_H_ 00044 #define KOKKOS_CRSMATRIX_H_ 00045 00048 00049 // FIXME (mfh 29 Sep 2013) There should never be a reason for using 00050 // assert() in these routines. Routines that _could_ fail should 00051 // either return an error code (if they are device functions) or throw 00052 // an exception (if they are not device functions). 
00053 #include <assert.h> 00054 #include <algorithm> 00055 00056 #include <Kokkos_Core.hpp> 00057 #include <Kokkos_StaticCrsGraph.hpp> 00058 #include <Kokkos_MV.hpp> 00059 00060 #ifdef KOKKOS_USE_CUSPARSE 00061 # include <cusparse.h> 00062 # include <Kokkos_CrsMatrix_CuSparse.hpp> 00063 #endif // KOKKOS_USE_CUSPARSE 00064 00065 #ifdef KOKKOS_USE_MKL 00066 # include <mkl.h> 00067 # include <mkl_spblas.h> 00068 # include <Kokkos_CrsMatrix_MKL.hpp> 00069 #endif // KOKKOS_USE_MKL 00070 00071 //#include <Kokkos_Vectorization.hpp> 00072 #include <impl/Kokkos_Error.hpp> 00073 00074 namespace Kokkos { 00075 00113 template<class MatrixType> 00114 struct SparseRowView { 00116 typedef typename MatrixType::value_type value_type; 00118 typedef typename MatrixType::ordinal_type ordinal_type; 00119 00120 private: 00122 value_type* values_; 00124 ordinal_type* colidx_; 00126 const int stride_; 00127 00128 public: 00136 KOKKOS_INLINE_FUNCTION 00137 SparseRowView (value_type* const values, 00138 ordinal_type* const colidx__, 00139 const int stride, 00140 const int count) : 00141 values_ (values), colidx_ (colidx__), stride_ (stride), length (count) 00142 {} 00150 KOKKOS_INLINE_FUNCTION 00151 SparseRowView (const typename MatrixType::values_type& values, 00152 const typename MatrixType::index_type& colidx__, 00153 const int& stride, 00154 const int& count, 00155 const int& idx) : 00156 values_ (&values(idx)), colidx_ (&colidx__(idx)), stride_ (stride), length (count) 00157 {} 00158 00164 const int length; 00165 00170 KOKKOS_INLINE_FUNCTION 00171 value_type& value (const int& i) const { 00172 return values_[i*stride_]; 00173 } 00174 00179 KOKKOS_INLINE_FUNCTION 00180 ordinal_type& colidx (const int& i) const { 00181 return colidx_[i*stride_]; 00182 } 00183 }; 00184 00185 00193 template<class MatrixType> 00194 struct SparseRowViewConst { 00196 typedef const typename MatrixType::non_const_value_type value_type; 00198 typedef const typename MatrixType::non_const_ordinal_type 
ordinal_type; 00199 00200 private: 00202 value_type* values_; 00204 ordinal_type* colidx_; 00206 const int stride_; 00207 00208 public: 00216 KOKKOS_INLINE_FUNCTION 00217 SparseRowViewConst (value_type* const values, 00218 ordinal_type* const colidx__, 00219 const int stride, 00220 const int count) : 00221 values_ (values), colidx_ (colidx__), stride_ (stride), length (count) 00222 {} 00230 KOKKOS_INLINE_FUNCTION 00231 SparseRowViewConst (const typename MatrixType::values_type& values, 00232 const typename MatrixType::index_type& colidx__, 00233 const int& stride, 00234 const int& count, 00235 const int& idx) : 00236 values_ (&values(idx)), colidx_ (&colidx__(idx)), stride_ (stride), length (count) 00237 {} 00238 00244 const int length; 00245 00250 KOKKOS_INLINE_FUNCTION 00251 value_type& value (const int& i) const { 00252 return values_[i*stride_]; 00253 } 00254 00259 KOKKOS_INLINE_FUNCTION 00260 ordinal_type& colidx (const int& i) const { 00261 return colidx_[i*stride_]; 00262 } 00263 }; 00264 00265 // A simple struct for storing a kernel launch configuration. 00266 // This is currently used by CrsMatrix to allow the user to have some control 00267 // over how kernels are launched, however it is currently only exercised by 00268 // Stokhos. This is a simpler case of "state" needed by TPLs, and at this point 00269 // is just a hack until we figure out how to support state in a general, 00270 // extensible way. 
00271 struct DeviceConfig { 00272 struct Dim3 { 00273 size_t x, y, z; 00274 Dim3(const size_t x_, const size_t y_ = 1, const size_t z_ = 1) : 00275 x(x_), y(y_), z(z_) {} 00276 }; 00277 00278 Dim3 block_dim; 00279 size_t num_blocks; 00280 size_t num_threads_per_block; 00281 00282 DeviceConfig(const size_t num_blocks_ = 0, 00283 const size_t threads_per_block_x_ = 0, 00284 const size_t threads_per_block_y_ = 0, 00285 const size_t threads_per_block_z_ = 1) : 00286 block_dim(threads_per_block_x_,threads_per_block_y_,threads_per_block_z_), 00287 num_blocks(num_blocks_), 00288 num_threads_per_block(block_dim.x * block_dim.y * block_dim.z) 00289 {} 00290 }; 00291 00292 00305 template<typename ScalarType, 00306 typename OrdinalType, 00307 class Device, 00308 class MemoryTraits = void, 00309 typename SizeType = size_t> 00310 class CrsMatrix { 00311 public: 00312 typedef Device device_type; 00313 typedef ScalarType value_type; 00314 typedef OrdinalType ordinal_type; 00315 typedef MemoryTraits memory_traits; 00316 typedef SizeType size_type; 00317 00318 // FIXME (mfh 28 Sep 2013) Cuda::host_mirror_device_type is Threads. 00319 // Shouldn't CrsMatrix::host_device_type always be the same as its 00320 // Device's host_mirror_device_type? 00321 // 00322 // OpenMP is the default host type if you turned on OpenMP when 00323 // building. OpenMP is not on by default, so if you specified in 00324 // the build that you wanted OpenMP, then we say that the default 00325 // host type is OpenMP instead of Threads. 
00326 typedef typename device_type::host_mirror_device_type host_device_type; 00327 00329 typedef CrsMatrix<ScalarType, OrdinalType, host_device_type, MemoryTraits> HostMirror; 00330 00363 00365 typedef Kokkos::StaticCrsGraph<OrdinalType, Kokkos::LayoutLeft, Device,SizeType> StaticCrsGraphType; 00366 00368 typedef typename StaticCrsGraphType::entries_type index_type; 00370 typedef typename StaticCrsGraphType::row_map_type row_map_type; 00372 typedef Kokkos::View<value_type*, Kokkos::LayoutRight, device_type, MemoryTraits> values_type; 00374 typedef typename values_type::const_value_type const_value_type; 00376 typedef typename values_type::non_const_value_type non_const_value_type; 00377 typedef typename index_type ::non_const_value_type non_const_ordinal_type; 00378 00379 #ifdef KOKKOS_USE_CUSPARSE 00380 cusparseHandle_t cusparse_handle; 00381 cusparseMatDescr_t cusparse_descr; 00382 #endif // KOKKOS_USE_CUSPARSE 00383 StaticCrsGraphType graph; 00384 values_type values; 00385 00386 // Launch configuration that can be used by overloads/specializations of 00387 // MV_multiply(). This is a hack and needs to be replaced by a general 00388 // state mechanism. 
00389 DeviceConfig dev_config; 00390 00396 CrsMatrix() 00397 : graph(), values(), _numRows (0), _numCols (0), _nnz (0) 00398 {} 00399 00400 //------------------------------------ 00403 template<typename SType, 00404 typename OType, 00405 class DType, 00406 class MTType, 00407 typename IType> 00408 CrsMatrix(const CrsMatrix<SType,OType,DType,MTType,IType> & B) { 00409 graph = B.graph; 00410 values = B.values; 00411 _numRows = B.numRows(); 00412 _numCols = B.numCols(); 00413 _nnz = B.nnz(); 00414 } 00415 00416 //------------------------------------ 00420 CrsMatrix( const std::string & arg_label , 00421 const StaticCrsGraphType & arg_graph ) 00422 : graph( arg_graph ) 00423 , values( arg_label , arg_graph.entries.dimension_0() ) 00424 , _numRows( arg_graph.row_map.dimension_0() - 1 ) 00425 , _numCols( maximum_entry( arg_graph ) + 1 ) 00426 , _nnz( arg_graph.entries.dimension_0() ) 00427 {} 00428 00429 //------------------------------------ 00430 00455 CrsMatrix (const std::string &label, 00456 OrdinalType nrows, 00457 OrdinalType ncols, 00458 OrdinalType annz, 00459 ScalarType* val, 00460 OrdinalType* rows, 00461 OrdinalType* cols, 00462 bool pad = false) 00463 { 00464 import (label, nrows, ncols, annz, val, rows, cols); 00465 00466 // FIXME (mfh 09 Aug 2013) Specialize this on the Device type. 00467 // Only use cuSPARSE for the Cuda Device. 00468 #ifdef KOKKOS_USE_CUSPARSE 00469 // FIXME (mfh 09 Aug 2013) This is actually static initialization 00470 // of the library; you should do it once for the whole program, 00471 // not once per matrix. We need to protect this somehow. 00472 cusparseCreate (&cusparse_handle); 00473 00474 // This is a per-matrix attribute. It encapsulates things like 00475 // whether the matrix is lower or upper triangular, etc. Ditto 00476 // for other TPLs like MKL. 
00477 cusparseCreateMatDescr (&cusparse_descr); 00478 #endif // KOKKOS_USE_CUSPARSE 00479 } 00480 00494 CrsMatrix (const std::string &label, 00495 OrdinalType nrows, 00496 OrdinalType ncols, 00497 OrdinalType annz, 00498 values_type vals, 00499 row_map_type rows, 00500 index_type cols) : 00501 _numRows (nrows), 00502 _numCols (ncols), 00503 _nnz (annz) 00504 { 00505 graph.row_map = rows; 00506 graph.entries = cols; 00507 values = vals; 00508 #ifdef KOKKOS_USE_CUSPARSE 00509 cusparseCreate(&cusparse_handle); 00510 cusparseCreateMatDescr(&cusparse_descr); 00511 #endif // KOKKOS_USE_CUSPARSE 00512 } 00513 00527 CrsMatrix (const std::string &label, 00528 OrdinalType ncols, 00529 values_type vals, 00530 StaticCrsGraphType graph_) : 00531 graph(graph_), 00532 values(vals), 00533 _numRows (graph_.row_map.dimension_0()-1), 00534 _numCols (ncols), 00535 _nnz (graph_.entries.dimension_0()) 00536 { 00537 #ifdef KOKKOS_USE_CUSPARSE 00538 cusparseCreate(&cusparse_handle); 00539 cusparseCreateMatDescr(&cusparse_descr); 00540 #endif // KOKKOS_USE_CUSPARSE 00541 } 00542 00543 void 00544 import (const std::string &label, 00545 OrdinalType nrows, 00546 OrdinalType ncols, 00547 OrdinalType annz, 00548 ScalarType* val, 00549 OrdinalType* rows, 00550 OrdinalType* cols); 00551 00553 void 00554 generate (const std::string &label, 00555 OrdinalType nrows, 00556 OrdinalType ncols, 00557 OrdinalType target_nnz, 00558 OrdinalType varianz_nel_row, 00559 OrdinalType width_row); 00560 00561 void 00562 generate (const std::string &label, 00563 OrdinalType nrows, 00564 OrdinalType ncols, 00565 OrdinalType cols_per_row); 00566 00567 void 00568 generate (const std::string &label); 00569 00570 void 00571 generateHostGraph (OrdinalType nrows, 00572 OrdinalType ncols, 00573 OrdinalType cols_per_row); 00574 00575 // FIXME (mfh 29 Sep 2013) See notes on the three-argument version 00576 // of this method below. 
00577 void 00578 insertInGraph(OrdinalType rowi, OrdinalType col) 00579 { 00580 insertInGraph(rowi, &col, 1); 00581 } 00582 00583 // FIXME (mfh 29 Sep 2013) We need a way to disable atomic updates 00584 // for ScalarType types that do not support them. We're pretty much 00585 // limited to ScalarType = float, double, and {u}int{32,64}_t. It 00586 // could make sense to do atomic add updates elementwise for complex 00587 // numbers, but that's about it unless we have transactional memory 00588 // extensions. Dan Sunderland explained to me that the "array of 00589 // atomic int 'locks'" approach (for ScalarType that don't directly 00590 // support atomic updates) won't work on GPUs. 00591 KOKKOS_INLINE_FUNCTION 00592 void 00593 sumIntoValues (const OrdinalType rowi, 00594 const OrdinalType cols[], 00595 const size_t ncol, 00596 ScalarType vals[], 00597 const bool force_atomic = false) const 00598 { 00599 SparseRowView<CrsMatrix> row_view = this->row (rowi); 00600 const int length = row_view.length; 00601 for (size_t i = 0; i < ncol; ++i) { 00602 for (int j = 0; j < length; ++j) { 00603 if (row_view.colidx(j) == cols[i]) { 00604 if (force_atomic) { 00605 atomic_add(&row_view.value(j), vals[i]); 00606 } else { 00607 row_view.value(j) += vals[i]; 00608 } 00609 } 00610 } 00611 } 00612 } 00613 00614 // FIXME (mfh 29 Sep 2013) See above notes on sumIntoValues. 
00615 KOKKOS_INLINE_FUNCTION 00616 void 00617 replaceValues (const OrdinalType rowi, 00618 const OrdinalType cols[], 00619 const size_t ncol, 00620 ScalarType vals[], 00621 const bool force_atomic = false) const 00622 { 00623 SparseRowView<CrsMatrix> row_view = this->row (rowi); 00624 const int length = row_view.length; 00625 for (size_t i = 0; i < ncol; ++i) { 00626 for (int j = 0; j < length; ++j) { 00627 if (row_view.colidx(j) == cols[i]) { 00628 if (force_atomic) { 00629 atomic_assign(&row_view.value(j), vals[i]); 00630 } else { 00631 row_view.value(j) = vals[i]; 00632 } 00633 } 00634 } 00635 } 00636 } 00637 00638 // FIXME (mfh 29 Sep 2013) It doesn't really make sense to template 00639 // on the scalar or ordinal types of the input, since direct 00640 // assignment of the underlying Views (which is how this operator 00641 // works) won't work if the types aren't compatible. It would make 00642 // more sense to template on things like the Device and 00643 // MemoryTraits. 00644 // 00645 // COMMENT: (CRT 1 Oct 2013) the alternative it to template on the incoming type, 00646 // But that way it still matches this function even if you try to assign a vector 00647 // to a matrix. 
Using the same scalar type and ordinal type as the 'this' matrix does 00648 // not necessaryily work because of things like const / non-const 00649 00650 template<typename aScalarType, typename aOrdinalType, class aDevice, class aMemoryTraits,typename aSizeType> 00651 CrsMatrix& 00652 operator= (const CrsMatrix<aScalarType,aOrdinalType,aDevice,aMemoryTraits, aSizeType>& mtx) 00653 { 00654 _numRows = mtx.numRows(); 00655 _numCols = mtx.numCols(); 00656 _nnz = mtx.nnz(); 00657 graph = mtx.graph; 00658 values = mtx.values; 00659 dev_config = mtx.dev_config; 00660 return *this; 00661 } 00662 00664 KOKKOS_INLINE_FUNCTION 00665 ordinal_type numRows() const { 00666 return _numRows; 00667 } 00669 KOKKOS_INLINE_FUNCTION 00670 ordinal_type numCols() const { 00671 return _numCols; 00672 } 00674 KOKKOS_INLINE_FUNCTION 00675 ordinal_type nnz() const { 00676 return _nnz; 00677 } 00678 00679 friend struct SparseRowView<CrsMatrix>; 00680 00682 KOKKOS_INLINE_FUNCTION 00683 SparseRowView<CrsMatrix> row (int i) const { 00684 const int start = graph.row_map(i); 00685 const int count = graph.row_map(i+1) - start; 00686 00687 00688 // If the last row in a matrix has zero entries the SparseRowView constructor 00689 // taking views will not pass the bounds check. The violation is only done 00690 // in an address calculation [ &values(start) ] and is not used since count is 00691 // zero, but the bounds check will fail. This will most likely also trigger a 00692 // invalid read message with valgrind. 
00693 #if defined ( KOKKOS_EXPRESSION_CHECK ) 00694 if(count==0) return SparseRowView<CrsMatrix> (NULL,NULL,1,0); 00695 #endif 00696 return SparseRowView<CrsMatrix> (values, graph.entries, 1, count,start); 00697 } 00698 00700 KOKKOS_INLINE_FUNCTION 00701 SparseRowViewConst<CrsMatrix> rowConst (int i) const { 00702 const int start = graph.row_map(i); 00703 const int count = graph.row_map(i+1) - start; 00704 00705 #if defined ( KOKKOS_EXPRESSION_CHECK ) 00706 if(count==0) return SparseRowViewConst<CrsMatrix> (NULL,NULL,1,0); 00707 #endif 00708 return SparseRowViewConst<CrsMatrix> (values, graph.entries, 1, count,start); 00709 } 00710 00711 private: 00712 00713 ordinal_type _numRows; 00714 ordinal_type _numCols; 00715 ordinal_type _nnz; 00716 00717 public: 00718 00719 // FIXME: [HCE 2013-12-03] The following members will be removed soon. 00720 00721 // FIXME (mfh 28 Sep 2013) std::vector should never appear in this 00722 // class, except perhaps as an input format for compatibility. 00723 00724 std::vector<OrdinalType> h_entries_; 00725 std::vector<OrdinalType> rows_; 00726 00727 // FIXME (mfh 29 Sep 2013) There should not be an "insertInGraph" 00728 // method. If you want to insert into the graph, you should get the 00729 // graph and insert into it. If you want to change the structure of 00730 // the matrix, you should be required to specify a value to put in 00731 // the new spot. 00732 // 00733 // Furthermore, this should be a device function, by analogy with 00734 // UnorderedMap. 00735 void 00736 insertInGraph (const OrdinalType rowi, OrdinalType *cols, const size_t ncol) 00737 { 00738 OrdinalType* const start = &h_entries_[rows_[rowi]]; 00739 OrdinalType* const end = &h_entries_[rows_[rowi+1]]; 00740 for (size_t i = 0; i < ncol; ++i) { 00741 OrdinalType *iter = start; 00742 while (iter < end && *iter != -1 && *iter != cols[i]) { 00743 ++iter; 00744 } 00745 00746 // FIXME (mfh 29 Sep 2013) Use of assert() statements is only 00747 // acceptable for debugging. 
It's legitimate for insertions to 00748 // fail. We should use the techniques that Dan Sunderland uses 00749 // in UnorderedMap; for example: 00750 // 00751 // 1. Insertion should return an indication of success or failure 00752 // 2. The graph should keep track of the number of failed insertions 00753 00754 assert (iter != end ); 00755 *iter = cols[i]; 00756 } 00757 } 00758 00759 }; 00760 00761 //---------------------------------------------------------------------------- 00762 //---------------------------------------------------------------------------- 00763 00764 template< typename ScalarType , typename OrdinalType, class Device, class MemoryTraits, typename SizeType > 00765 void 00766 CrsMatrix<ScalarType , OrdinalType, Device, MemoryTraits, SizeType >:: 00767 import (const std::string &label, 00768 OrdinalType nrows, 00769 OrdinalType ncols, 00770 OrdinalType annz, 00771 ScalarType* val, 00772 OrdinalType* rows, 00773 OrdinalType* cols) 00774 { 00775 std::string str = label; 00776 values = values_type (str.append (".values"), annz); 00777 00778 _numRows = nrows; 00779 _numCols = ncols; 00780 _nnz = annz; 00781 00782 // FIXME (09 Aug 2013) CrsArray only takes std::vector for now. 00783 // We'll need to fix that. 00784 std::vector<int> row_lengths (_numRows, 0); 00785 00786 // FIXME (mfh 21 Jun 2013) This calls for a parallel_for kernel. 00787 for (int i = 0; i < _numRows; ++i) { 00788 row_lengths[i] = rows[i + 1] - rows[i]; 00789 } 00790 00791 str = label; 00792 graph = Kokkos::create_staticcrsgraph<StaticCrsGraphType> (str.append (".graph"), row_lengths); 00793 typename values_type::HostMirror h_values = Kokkos::create_mirror_view (values); 00794 typename index_type::HostMirror h_entries = Kokkos::create_mirror_view (graph.entries); 00795 00796 // FIXME (mfh 21 Jun 2013) This needs to be a parallel copy. 00797 // Furthermore, why are the arrays copied twice? -- once here, to a 00798 // host view, and once below, in the deep copy? 
00799 for (OrdinalType i = 0; i < _nnz; ++i) { 00800 if (val) { 00801 h_values(i) = val[i]; 00802 } 00803 h_entries(i) = cols[i]; 00804 } 00805 00806 Kokkos::deep_copy (values, h_values); 00807 Kokkos::deep_copy (graph.entries, h_entries); 00808 } 00809 00810 template<typename ScalarType, typename OrdinalType, class Device, class MemoryTraits, typename SizeType> 00811 void 00812 CrsMatrix<ScalarType, OrdinalType, Device, MemoryTraits, SizeType >:: 00813 generate (const std::string &label, 00814 OrdinalType nrows, 00815 OrdinalType ncols, 00816 OrdinalType target_nnz, 00817 OrdinalType varianz_nel_row, 00818 OrdinalType width_row) 00819 { 00820 _numRows = nrows; 00821 _numCols = ncols; 00822 00823 graph.row_map = row_map_type ("CrsMatrix::rowPtr", nrows + 1); 00824 typename row_map_type::HostMirror h_row_map = Kokkos::create_mirror_view (graph.row_map); 00825 00826 // FIXME (mfh 21 Jun 2013) What is this method actualy doing? It 00827 // looks like it's not setting the structure or values of the matrix 00828 // at all. 
00829 00830 OrdinalType elements_per_row = target_nnz / nrows; 00831 srand(13721); 00832 h_row_map(0) = 0; 00833 00834 for (int rowi = 0; rowi < nrows; ++rowi) { 00835 // int varianz = (1.0 * rand() / INT_MAX - 0.5) * varianz_nel_row; 00836 // h_row_map(row + 1) = h_row_map(row) + elements_per_row + varianz; 00837 } 00838 00839 _nnz = h_row_map(nrows); 00840 values = values_type("CrsMatrix::values", _nnz); 00841 graph.entries = index_type("CrsMatrix::colInd", _nnz); 00842 typename values_type::HostMirror h_values = Kokkos::create_mirror_view(values); 00843 typename index_type::HostMirror h_entries = Kokkos::create_mirror_view(graph.entries); 00844 00845 for(int rowi = 0; rowi < nrows; rowi++) { 00846 for(int k = h_row_map(rowi); k < h_row_map(rowi + 1); k++) { 00847 //int pos = (1.0 * rand() / INT_MAX - 0.5) * width_row; 00848 00849 //if(pos < 0) pos += ncols; 00850 00851 // if(pos >= ncols) pos -= ncols; 00852 00853 // h_entries(k) = pos; 00854 // h_values(k) = 100.0 * rand() / INT_MAX - 50.0; 00855 } 00856 } 00857 00858 Kokkos::deep_copy(values, h_values); 00859 Kokkos::deep_copy(graph.entries, h_entries); 00860 Kokkos::deep_copy(graph.row_map, h_row_map); 00861 00862 } 00863 00864 template<typename ScalarType, typename OrdinalType, class Device, class MemoryTraits, typename SizeType> 00865 void 00866 CrsMatrix<ScalarType, OrdinalType, Device, MemoryTraits, SizeType >:: 00867 generate (const std::string &label, 00868 OrdinalType nrows, 00869 OrdinalType ncols, 00870 OrdinalType cols_per_row) 00871 { 00872 _numRows = nrows; 00873 _numCols = ncols; 00874 _nnz = nrows*cols_per_row; 00875 00876 std::string str = label; 00877 values = values_type (str.append (".values"), _nnz); 00878 00879 00880 std::vector<int> row_lengths (_numRows, 0); 00881 00882 // FIXME (mfh 21 Jun 2013) This calls for a parallel_for kernel. 
00883 for (int i = 0; i < _numRows; ++i) { 00884 row_lengths[i] = cols_per_row; 00885 } 00886 00887 str = label; 00888 graph = Kokkos::create_staticcrsgraph<StaticCrsGraphType> (str.append (".graph"), row_lengths); 00889 typename values_type::HostMirror h_values = Kokkos::create_mirror_view (values); 00890 typename index_type::HostMirror h_entries = Kokkos::create_mirror_view (graph.entries); 00891 00892 // FIXME (mfh 21 Jun 2013) Why is this copy not a parallel copy? 00893 // Furthermore, why are the arrays copied twice? -- once here, to a 00894 // host view, and once below, in the deep copy? 00895 for (OrdinalType i = 0; i < _nnz; ++i) { 00896 h_values(i) = ScalarType(); 00897 h_entries(i) = OrdinalType(); 00898 } 00899 00900 Kokkos::deep_copy (values, h_values); 00901 Kokkos::deep_copy (graph.entries, h_entries); 00902 } 00903 template<typename ScalarType, typename OrdinalType, class Device, class MemoryTraits, typename SizeType> 00904 void 00905 CrsMatrix<ScalarType, OrdinalType, Device, MemoryTraits, SizeType >:: 00906 generate (const std::string &label) 00907 { 00908 // Compress the entries 00909 size_t ptr_from= 0, ptr_to = 0; 00910 int cur_row = 0; 00911 while ( ptr_from < h_entries_.size()) { 00912 size_t row_stop = rows_[cur_row+1]; 00913 while (ptr_from < row_stop) { 00914 if ( h_entries_[ptr_from] == OrdinalType(-1)) { 00915 ptr_from = row_stop; 00916 } else { 00917 h_entries_[ptr_to++] = h_entries_[ptr_from++]; 00918 } 00919 } 00920 rows_[++cur_row] = ptr_to; 00921 } 00922 OrdinalType nrows = rows_.size()-1; 00923 OrdinalType nnz_ = ptr_to; 00924 00925 h_entries_.resize(nnz_); 00926 00927 //sort the rows 00928 for (OrdinalType i=0; i<nrows; ++i ) 00929 std::sort(&h_entries_[i], &h_entries_[i+1]); 00930 00931 // generate the matrix 00932 import(label, nrows, nrows, nnz_, NULL, &rows_[0], &h_entries_[0]); 00933 00934 } 00935 00936 00937 template<typename ScalarType, typename OrdinalType, class Device, class MemoryTraits, typename SizeType> 00938 void 
00939 CrsMatrix<ScalarType, OrdinalType, Device, MemoryTraits, SizeType >:: 00940 generateHostGraph ( OrdinalType nrows, 00941 OrdinalType ncols, 00942 OrdinalType cols_per_row) 00943 { 00944 _numRows = nrows; 00945 _numCols = ncols; 00946 _nnz = nrows*cols_per_row; 00947 00948 h_entries_.resize(_nnz, OrdinalType(-1)); 00949 rows_.resize(_numRows+1); 00950 rows_[0] = 0; 00951 for (OrdinalType i = 0; i < _numRows; ++i) 00952 rows_[i+1] = rows_[i]+cols_per_row; 00953 00954 } 00955 00956 template<class DeviceType> 00957 inline int RowsPerThread(const int NNZPerRow) { 00958 if(NNZPerRow == 0) return 1; 00959 int result = 2; 00960 while(result*NNZPerRow <= 2048) { 00961 result*=2; 00962 } 00963 return result/2; 00964 } 00965 #ifdef KOKKOS_HAVE_CUDA 00966 template<> 00967 inline int RowsPerThread<Kokkos::Cuda>(const int NNZPerRow) { 00968 return 1; 00969 } 00970 #endif 00971 00972 //---------------------------------------------------------------------------- 00973 00974 template< class DeviceType , typename ScalarType , int NNZPerRow=27> 00975 struct MV_MultiplyShflThreadsPerRow { 00976 private: 00977 00978 typedef typename Kokkos::Impl::remove_const< ScalarType >::type value_type ; 00979 00980 #ifdef KOKKOS_HAVE_CUDA 00981 enum { shfl_possible = 00982 Kokkos::Impl::is_same< DeviceType , Kokkos::Cuda >::value && 00983 ( 00984 Kokkos::Impl::is_same< value_type , unsigned int >::value || 00985 Kokkos::Impl::is_same< value_type , int >::value || 00986 Kokkos::Impl::is_same< value_type , float >::value || 00987 Kokkos::Impl::is_same< value_type , double >::value 00988 )}; 00989 #else // NOT KOKKOS_HAVE_CUDA 00990 enum { shfl_possible = 0 }; 00991 #endif // KOKKOS_HAVE_CUDA 00992 00993 public: 00994 00995 #if defined( __CUDA_ARCH__ ) 00996 enum { device_value = shfl_possible && ( 300 <= __CUDA_ARCH__ ) ? 
00997 (NNZPerRow<8?2: 00998 (NNZPerRow<16?4: 00999 (NNZPerRow<32?8: 01000 (NNZPerRow<64?16: 01001 32)))) 01002 :1 }; 01003 #else 01004 enum { device_value = 1 }; 01005 #endif 01006 01007 #ifdef KOKKOS_HAVE_CUDA 01008 inline static int host_value() 01009 { return shfl_possible && ( 300 <= Kokkos::Cuda::device_arch() ) ? 01010 (NNZPerRow<8?2: 01011 (NNZPerRow<16?4: 01012 (NNZPerRow<32?8: 01013 (NNZPerRow<64?16: 01014 32)))) 01015 :1; } 01016 #else // NOT KOKKOS_HAVE_CUDA 01017 inline static int host_value() { return 1; } 01018 #endif // KOKKOS_HAVE_CUDA 01019 }; 01020 01021 //---------------------------------------------------------------------------- 01022 01023 template<class RangeVector, 01024 class CrsMatrix, 01025 class DomainVector, 01026 class CoeffVector1, 01027 class CoeffVector2, 01028 int doalpha, 01029 int dobeta, 01030 int ExplicitVectorLength = 8> 01031 struct MV_MultiplyFunctor { 01032 typedef typename CrsMatrix::device_type device_type ; 01033 typedef typename CrsMatrix::ordinal_type size_type ; 01034 typedef typename CrsMatrix::non_const_value_type value_type ; 01035 typedef typename Kokkos::View<value_type*, device_type> range_values; 01036 typedef typename Kokkos::TeamPolicy< device_type > team_policy ; 01037 typedef typename team_policy::member_type team_member ; 01038 01039 typedef Vectorization<device_type,ExplicitVectorLength> vectorization; 01040 01041 CoeffVector1 beta; 01042 CoeffVector2 alpha; 01043 CrsMatrix m_A ; 01044 DomainVector m_x ; 01045 RangeVector m_y ; 01046 size_type n; 01047 int rows_per_thread; 01048 01049 MV_MultiplyFunctor(const CoeffVector1 beta_, 01050 const CoeffVector2 alpha_, 01051 const CrsMatrix m_A_, 01052 const DomainVector m_x_, 01053 const RangeVector m_y_, 01054 const size_type n_, 01055 const int rows_per_thread_): 01056 beta(beta_), alpha(alpha_), m_A(m_A_), m_x(m_x_), m_y(m_y_), n(n_), rows_per_thread(rows_per_thread_) {} 01057 01058 //-------------------------------------------------------------------------- 
// Strip-mined kernel: accumulate UNROLL consecutive output columns
// [kk, kk+UNROLL) of row iRow in one pass over the sparse row, so each
// matrix entry is loaded once per UNROLL columns instead of once per column.
// Lane 0 of the vectorization writes the result into m_y according to the
// compile-time doalpha / dobeta cases (0, +1, -1, or runtime coefficients).
template<int UNROLL>
KOKKOS_INLINE_FUNCTION
void strip_mine (const team_member & dev, const size_type & iRow, const size_type& kk) const {

  // Per-thread partial sums, one per unrolled output column.
  value_type sum[UNROLL];

#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
#ifdef KOKKOS_HAVE_PRAGMA_UNROLL
#pragma unroll
#endif
  for (size_type k = 0 ; k < UNROLL ; ++k) {
    // NOTE (mfh 09 Aug 2013) This requires that assignment from int
    // (in this case, 0) to value_type be defined.  It's not for
    // types like arprec and dd_real.
    //
    // mfh 29 Sep 2013: On the other hand, arprec and dd_real won't
    // work on CUDA devices anyway, since their methods aren't
    // device functions.  arprec has other issues (e.g., dynamic
    // memory allocation, and the array-of-structs memory layout
    // which is unfavorable to GPUs), but could be dealt with in the
    // same way as Sacado's AD types.
    sum[k] = 0;
  }

  const SparseRowView<CrsMatrix> row = m_A.row(iRow);

#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
#ifdef KOKKOS_HAVE_PRAGMA_UNROLL
#pragma unroll
#endif
#ifdef KOKKOS_HAVE_PRAGMA_LOOPCOUNT
#pragma loop count (15)
#endif
  // Each vector lane walks a strided subset of the row's entries; the
  // per-lane partials are combined below with vectorization::reduce.
  for (size_type iEntry = vectorization::begin(); iEntry < row.length; iEntry += vectorization::increment ) {
    const value_type val = row.value(iEntry);
    const size_type ind = row.colidx(iEntry);

#ifdef KOKKOS_HAVE_PRAGMA_UNROLL
#pragma unroll
#endif
    for (size_type k = 0; k < UNROLL; ++k) {
      sum[k] += val * m_x(ind, kk + k);
    }
  }

  // Fold the alpha == -1 case into the cross-lane reduction.
  if(doalpha == -1)
    for (int ii=0; ii < UNROLL; ++ii) {
      sum[ii] = -vectorization::reduce(sum[ii]);
    }
  else
    for (int ii=0; ii < UNROLL; ++ii) {
      sum[ii] = vectorization::reduce(sum[ii]);
    }

  // Only one lane per row writes the results.
  if (vectorization::is_lane_0(dev)) {
    // doalpha is neither +1 nor -1: scale by the runtime coefficients.
    if(doalpha * doalpha != 1) {
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
#ifdef KOKKOS_HAVE_PRAGMA_UNROLL
#pragma unroll
#endif
      for (size_type k = 0; k < UNROLL; ++k) {
        sum[k] *= alpha(kk + k);
      }
    }

    if (dobeta == 0) {
      // Overwrite y; never reads the (possibly uninitialized) old value.
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
#ifdef KOKKOS_HAVE_PRAGMA_UNROLL
#pragma unroll
#endif
      for (size_type k = 0; k < UNROLL; ++k) {
        m_y(iRow, kk + k) = sum[k];
      }
    } else if(dobeta == 1) {
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
#ifdef KOKKOS_HAVE_PRAGMA_UNROLL
#pragma unroll
#endif
      for (size_type k = 0; k < UNROLL; ++k) {
        m_y(iRow, kk + k) += sum[k];
      }
    } else if (dobeta == -1) {
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
#ifdef KOKKOS_HAVE_PRAGMA_UNROLL
#pragma unroll
#endif
      for (size_type k = 0; k < UNROLL; ++k) {
        m_y(iRow, kk + k) = -m_y(iRow, kk + k) + sum[k];
      }
    } else {
      // General beta: y = beta(k)*y + sum.
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
#ifdef KOKKOS_HAVE_PRAGMA_UNROLL
#pragma unroll
#endif
      for (size_type k = 0; k < UNROLL; ++k) {
        m_y(iRow, kk + k) = beta(kk + k) * m_y(iRow, kk + k) + sum[k] ;
      }
    }
  }
}

// Single-column specialization of strip_mine: computes column 0 only,
// with a scalar accumulator instead of a register array.
KOKKOS_INLINE_FUNCTION
void strip_mine_1 (const team_member & dev, const size_type& iRow) const {
  value_type sum = 0;

  const SparseRowViewConst<CrsMatrix> row = m_A.rowConst(iRow);

#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
#ifdef KOKKOS_HAVE_PRAGMA_UNROLL
#pragma unroll
#endif
#ifdef KOKKOS_HAVE_PRAGMA_LOOPCOUNT
#pragma loop count (15)
#endif
  for(size_type iEntry = vectorization::begin(); iEntry < row.length; iEntry += vectorization::increment) {
    sum += row.value(iEntry) * m_x(row.colidx(iEntry),0);
  }

  sum = vectorization::reduce(sum);

  if (vectorization::is_lane_0(dev)) {

    if(doalpha == -1)
      sum *= value_type(-1);
    else if(doalpha * doalpha != 1) {
      sum *= alpha(0);
    }

    if (dobeta == 0) {
      m_y(iRow, 0) = sum ;
    } else if (dobeta == 1) {
      m_y(iRow, 0) += sum ;
    } else if (dobeta == -1) {
      m_y(iRow, 0) = -m_y(iRow, 0) + sum;
    } else {
      m_y(iRow, 0) = beta(0) * m_y(iRow, 0) + sum;
    }
  }
}
// Team operator: each virtual thread processes rows_per_thread consecutive
// rows, strip-mining across the n multivector columns with the widest
// available unroll so each sparse row is traversed as few times as possible.
KOKKOS_INLINE_FUNCTION
void operator()(const team_member & dev) const {
  for(int loop = 0; loop < rows_per_thread; loop++) {
    // Rows are assigned in contiguous chunks of rows_per_thread per thread,
    // so once iRow passes the end every later iteration would as well --
    // returning (rather than continuing) is safe here.
    const size_type iRow = vectorization::global_thread_rank(dev) * rows_per_thread + loop;
    if(iRow>=m_A.numRows()) return;

    // Next multivector column to process.
    size_type kk = 0;

#ifdef KOKKOS_FAST_COMPILE
    // Fast-compile build: only two strip_mine instantiations (4 and 1).
    for (; kk + 4 <= n; kk += 4) {
      strip_mine<4>(dev, iRow, kk);
    }
    for( ; kk < n; ++kk) {
      strip_mine<1>(dev, iRow, kk);
    }
#else
#  ifdef __CUDA_ARCH__
    // CUDA: unroll in chunks of 8 (register pressure limits the width);
    // the n % 8 == 1 special case avoids a final width-1 pass.
    if ((n > 8) && (n % 8 == 1)) {
      strip_mine<9>(dev, iRow, kk);
      kk += 9;
    }
    for(; kk + 8 <= n; kk += 8)
      strip_mine<8>(dev, iRow, kk);
    if(kk < n)
      switch(n - kk) {
#  else // NOT a CUDA device
    // Host: unroll in chunks of 16, with the same remainder trick.
    if ((n > 16) && (n % 16 == 1)) {
      strip_mine<17>(dev, iRow, kk);
      kk += 17;
    }

    for (; kk + 16 <= n; kk += 16) {
      strip_mine<16>(dev, iRow, kk);
    }

    if(kk < n)
      switch(n - kk) {
      // Cases 8..15 are reachable only on non-CUDA builds; the switch body
      // is shared with the CUDA branch through the #ifdef above.
      case 15:
        strip_mine<15>(dev, iRow, kk);
        break;

      case 14:
        strip_mine<14>(dev, iRow, kk);
        break;

      case 13:
        strip_mine<13>(dev, iRow, kk);
        break;

      case 12:
        strip_mine<12>(dev, iRow, kk);
        break;

      case 11:
        strip_mine<11>(dev, iRow, kk);
        break;

      case 10:
        strip_mine<10>(dev, iRow, kk);
        break;

      case 9:
        strip_mine<9>(dev, iRow, kk);
        break;

      case 8:
        strip_mine<8>(dev, iRow, kk);
        break;
#  endif // __CUDA_ARCH__
      case 7:
        strip_mine<7>(dev, iRow, kk);
        break;

      case 6:
        strip_mine<6>(dev, iRow, kk);
        break;

      case 5:
        strip_mine<5>(dev, iRow, kk);
        break;

      case 4:
        strip_mine<4>(dev, iRow, kk);
        break;

      case 3:
        strip_mine<3>(dev, iRow, kk);
        break;

      case 2:
        strip_mine<2>(dev, iRow, kk);
        break;

      case 1:
        strip_mine_1(dev, iRow);
        break;
      }
#endif // KOKKOS_FAST_COMPILE
  }
}
}; // end of enclosing multivector SpMV functor (declared before this chunk)
rows_per_thread(rows_per_thread_) {} 01355 01356 KOKKOS_INLINE_FUNCTION 01357 void operator()(const team_member & dev) const { 01358 01359 for(int loop = 0; loop < rows_per_thread; loop++) { 01360 const size_type iRow = vectorization::global_thread_rank(dev)*rows_per_thread + loop; 01361 if(iRow>=m_A.numRows()) return; 01362 const SparseRowViewConst<CrsMatrix> row = m_A.rowConst(iRow); 01363 const size_type row_length = row.length ; 01364 value_type sum = 0; 01365 01366 #ifdef KOKKOS_HAVE_PRAGMA_IVDEP 01367 #pragma ivdep 01368 #endif 01369 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL 01370 #pragma unroll 01371 #endif 01372 #ifdef KOKKOS_HAVE_PRAGMA_LOOPCOUNT 01373 #pragma loop count (15) 01374 #endif 01375 for (size_type iEntry = vectorization::begin(); iEntry < row_length; iEntry += vectorization::increment) { 01376 sum += row.value(iEntry) * m_x(row.colidx(iEntry)); 01377 } 01378 01379 sum = vectorization::reduce(sum); 01380 01381 01382 if (vectorization::is_lane_0(dev)) { 01383 if (doalpha == -1) sum *= value_type(-1); 01384 else if (doalpha * doalpha != 1) { 01385 sum *= alpha(0); 01386 } 01387 01388 if (dobeta == 0) { 01389 m_y(iRow) = sum ; 01390 } else if (dobeta == 1) { 01391 m_y(iRow) += sum ; 01392 } else if (dobeta == -1) { 01393 m_y(iRow) = -m_y(iRow) + sum; 01394 } else { 01395 m_y(iRow) = beta(0) * m_y(iRow) + sum; 01396 } 01397 } 01398 } 01399 } 01400 }; 01401 01402 namespace Impl { 01403 01404 template <class RangeVector, 01405 class CrsMatrix, 01406 class DomainVector, 01407 class CoeffVector1, 01408 class CoeffVector2> 01409 void 01410 MV_Multiply_Check_Compatibility (const CoeffVector1 &betav, 01411 const RangeVector &y, 01412 const CoeffVector2 &alphav, 01413 const CrsMatrix &A, 01414 const DomainVector &x, 01415 const int& doalpha, 01416 const int& dobeta) 01417 { 01418 typename DomainVector::size_type numVecs = x.dimension_1(); 01419 typename DomainVector::size_type numRows = A.numRows(); 01420 typename DomainVector::size_type numCols = A.numCols(); 01421 
01422 if (y.dimension_1() != numVecs) { 01423 std::ostringstream msg; 01424 msg << "Error in CRSMatrix - Vector Multiply (y = by + aAx): 2nd dimensions of y and x do not match\n"; 01425 msg << "\t Labels are: y(" << RangeVector::memory_space::query_label(y.ptr_on_device()) << ") b(" 01426 << CoeffVector1::memory_space::query_label(betav.ptr_on_device()) << ") a(" 01427 << CoeffVector2::memory_space::query_label(alphav.ptr_on_device()) << ") x(" 01428 << CrsMatrix::values_type::memory_space::query_label(A.values.ptr_on_device()) << ") x(" 01429 << DomainVector::memory_space::query_label(x.ptr_on_device()) << ")\n"; 01430 msg << "\t Dimensions are: y(" << y.dimension_0() << "," << y.dimension_1() << ") x(" << x.dimension_0() << "," << x.dimension_1() << ")\n"; 01431 Impl::throw_runtime_exception( msg.str() ); 01432 } 01433 if (numRows > y.dimension_0()) { 01434 std::ostringstream msg; 01435 msg << "Error in CRSMatrix - Vector Multiply (y = by + aAx): dimensions of y and A do not match\n"; 01436 msg << "\t Labels are: y(" << RangeVector::memory_space::query_label(y.ptr_on_device()) << ") b(" 01437 << CoeffVector1::memory_space::query_label(betav.ptr_on_device()) << ") a(" 01438 << CoeffVector2::memory_space::query_label(alphav.ptr_on_device()) << ") x(" 01439 << CrsMatrix::values_type::memory_space::query_label(A.values.ptr_on_device()) << ") x(" 01440 << DomainVector::memory_space::query_label(x.ptr_on_device()) << ")\n"; 01441 msg << "\t Dimensions are: y(" << y.dimension_0() << "," << y.dimension_1() << ") A(" << A.numRows() << "," << A.numCols() << ")\n"; 01442 Impl::throw_runtime_exception( msg.str() ); 01443 } 01444 if (numCols > x.dimension_0()) { 01445 std::ostringstream msg; 01446 msg << "Error in CRSMatrix - Vector Multiply (y = by + aAx): dimensions of x and A do not match\n"; 01447 msg << "\t Labels are: y(" << RangeVector::memory_space::query_label(y.ptr_on_device()) << ") b(" 01448 << CoeffVector1::memory_space::query_label(betav.ptr_on_device()) << ") 
a(" 01449 << CoeffVector2::memory_space::query_label(alphav.ptr_on_device()) << ") x(" 01450 << CrsMatrix::values_type::memory_space::query_label(A.values.ptr_on_device()) << ") x(" 01451 << DomainVector::memory_space::query_label(x.ptr_on_device()) << ")\n"; 01452 msg << "\t Dimensions are: x(" << x.dimension_0() << "," << x.dimension_1() << ") A(" << A.numRows() << "," << A.numCols() << ")\n"; 01453 Impl::throw_runtime_exception( msg.str() ); 01454 } 01455 if (dobeta==2) { 01456 if (betav.dimension_0()!=numVecs) { 01457 std::ostringstream msg; 01458 msg << "Error in CRSMatrix - Vector Multiply (y = by + aAx): 2nd dimensions of y and b do not match\n"; 01459 msg << "\t Labels are: y(" << RangeVector::memory_space::query_label(y.ptr_on_device()) << ") b(" 01460 << CoeffVector1::memory_space::query_label(betav.ptr_on_device()) << ") a(" 01461 << CoeffVector2::memory_space::query_label(alphav.ptr_on_device()) << ") x(" 01462 << CrsMatrix::values_type::memory_space::query_label(A.values.ptr_on_device()) << ") x(" 01463 << DomainVector::memory_space::query_label(x.ptr_on_device()) << ")\n"; 01464 msg << "\t Dimensions are: y(" << y.dimension_0() << "," << y.dimension_1() << ") b(" << betav.dimension_0() << ")\n"; 01465 Impl::throw_runtime_exception( msg.str() ); 01466 } 01467 } 01468 if(doalpha==2) { 01469 if(alphav.dimension_0()!=numVecs) { 01470 std::ostringstream msg; 01471 msg << "Error in CRSMatrix - Vector Multiply (y = by + aAx): 2nd dimensions of x and b do not match\n"; 01472 msg << "\t Labels are: y(" << RangeVector::memory_space::query_label(y.ptr_on_device()) << ") b(" 01473 << CoeffVector1::memory_space::query_label(betav.ptr_on_device()) << ") a(" 01474 << CoeffVector2::memory_space::query_label(alphav.ptr_on_device()) << ") x(" 01475 << CrsMatrix::values_type::memory_space::query_label(A.values.ptr_on_device()) << ") x(" 01476 << DomainVector::memory_space::query_label(x.ptr_on_device()) << ")\n"; 01477 msg << "\t Dimensions are: x(" << x.dimension_0() 
<< "," << x.dimension_1() << ") b(" << betav.dimension_0() << ")\n"; 01478 Impl::throw_runtime_exception( msg.str() ); 01479 } 01480 } 01481 } 01482 } // namespace Impl 01483 01484 // This TansposeFunctor is functional, but not necessarily performant. 01485 template<class RangeVector, 01486 class CrsMatrix, 01487 class DomainVector, 01488 class CoeffVector1, 01489 class CoeffVector2, 01490 int doalpha, 01491 int dobeta, 01492 int NNZPerRow = 27 > 01493 struct MV_MultiplyTransposeFunctor { 01494 typedef typename CrsMatrix::device_type device_type ; 01495 typedef typename CrsMatrix::ordinal_type size_type ; 01496 typedef typename CrsMatrix::non_const_value_type value_type ; 01497 typedef typename Kokkos::View<value_type*, device_type> range_values; 01498 01499 typedef MV_MultiplyShflThreadsPerRow< device_type , value_type , NNZPerRow > ShflThreadsPerRow ; 01500 01501 CoeffVector1 beta; 01502 CoeffVector2 alpha; 01503 CrsMatrix m_A ; 01504 DomainVector m_x ; 01505 RangeVector m_y ; 01506 size_type n; 01507 01508 KOKKOS_INLINE_FUNCTION 01509 void operator()(int i) const { 01510 const size_type iRow = i/ShflThreadsPerRow::device_value; 01511 const int lane = i%ShflThreadsPerRow::device_value; 01512 01513 const SparseRowViewConst<CrsMatrix> row = m_A.rowConst(iRow); 01514 01515 for (size_type iEntry = lane; iEntry < row.length; iEntry += ShflThreadsPerRow::device_value ) { 01516 const value_type val = row.value(iEntry); 01517 const size_type ind = row.colidx(iEntry); 01518 01519 if(doalpha!=1) { 01520 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL 01521 #pragma unroll 01522 #endif 01523 for (size_type k = 0; k < n; ++k) { 01524 atomic_add(&m_y(ind,k), value_type(alpha(k) * val * m_x(iRow, k))); 01525 } 01526 } else { 01527 #ifdef KOKKOS_HAVE_PRAGMA_UNROLL 01528 #pragma unroll 01529 #endif 01530 for (size_type k = 0; k < n; ++k) { 01531 atomic_add(&m_y(ind,k), value_type(val * m_x(iRow, k))); 01532 } 01533 } 01534 } 01535 } 01536 }; 01537 01538 // This TansposeFunctor is functional, 
but not necessarily performant. 01539 template<class RangeVector, 01540 class CrsMatrix, 01541 class DomainVector, 01542 class CoeffVector1, 01543 class CoeffVector2, 01544 int doalpha, 01545 int dobeta, 01546 int NNZPerRow = 27 > 01547 struct MV_MultiplyTransposeSingleFunctor { 01548 typedef typename CrsMatrix::device_type device_type ; 01549 typedef typename CrsMatrix::ordinal_type size_type ; 01550 typedef typename CrsMatrix::non_const_value_type value_type ; 01551 typedef typename Kokkos::View<value_type*, device_type> range_values; 01552 01553 typedef MV_MultiplyShflThreadsPerRow< device_type , value_type , NNZPerRow > ShflThreadsPerRow ; 01554 01555 CoeffVector1 beta; 01556 CoeffVector2 alpha; 01557 CrsMatrix m_A ; 01558 DomainVector m_x ; 01559 RangeVector m_y ; 01560 size_type n; 01561 01562 KOKKOS_INLINE_FUNCTION 01563 void operator()(int i) const { 01564 const size_type iRow = i/ShflThreadsPerRow::device_value; 01565 const int lane = i%ShflThreadsPerRow::device_value; 01566 01567 const SparseRowViewConst<CrsMatrix> row = m_A.rowConst(iRow); 01568 01569 for (size_type iEntry = lane; iEntry < row.length; iEntry += ShflThreadsPerRow::device_value ) { 01570 const value_type val = row.value(iEntry); 01571 const size_type ind = row.colidx(iEntry); 01572 01573 if(doalpha!=1) { 01574 atomic_add(&m_y(ind), value_type(alpha(0) * val * m_x(iRow))); 01575 } else { 01576 atomic_add(&m_y(ind), value_type(val * m_x(iRow))); 01577 } 01578 } 01579 } 01580 }; 01581 01582 template <class RangeVector, 01583 class TCrsMatrix, 01584 class DomainVector, 01585 class CoeffVector1, 01586 class CoeffVector2, 01587 int doalpha, 01588 int dobeta> 01589 void 01590 MV_MultiplyTranspose (typename Kokkos::Impl::enable_if<DomainVector::Rank == 2, const CoeffVector1>::type& betav, 01591 const RangeVector &y, 01592 const CoeffVector2 &alphav, 01593 const TCrsMatrix &A, 01594 const DomainVector &x) 01595 { 01596 //Special case for zero Rows RowMap 01597 if(A.numRows() == -1) return; 01598 
// y = beta*y + alpha*A^T*x for rank-2 (multivector) x and y.
// Scales y by beta first, then scatters the transpose product into it
// with the atomic-add functor (always instantiated with runtime
// coefficients, i.e. doalpha = dobeta = 2).
template <class RangeVector,
          class TCrsMatrix,
          class DomainVector,
          class CoeffVector1,
          class CoeffVector2,
          int doalpha,
          int dobeta>
void
MV_MultiplyTranspose (typename Kokkos::Impl::enable_if<DomainVector::Rank == 2, const CoeffVector1>::type& betav,
                      const RangeVector &y,
                      const CoeffVector2 &alphav,
                      const TCrsMatrix &A,
                      const DomainVector &x)
{
  //Special case for zero Rows RowMap
  // NOTE(review): this tests numRows() == -1, while MV_MultiplySingle and
  // MV_Multiply use <= 0.  Confirm which sentinel an empty row map
  // actually produces before unifying.
  if(A.numRows() == -1) return;

  if (doalpha == 0) {
    // alpha == 0: the product contributes nothing; only scale y by beta.
    if (dobeta==2) {
      MV_MulScalar(y,betav,y);
    } else {
      MV_MulScalar(y,static_cast<typename RangeVector::const_value_type> (dobeta),y);
    }
    return;
  } else {
    // Const / random-access views of the operands for the device kernel.
    typedef View< typename RangeVector::non_const_data_type ,
                  typename RangeVector::array_layout ,
                  typename RangeVector::device_type ,
                  typename RangeVector::memory_traits >
    RangeVectorType;

    typedef View< typename DomainVector::const_data_type ,
                  typename DomainVector::array_layout ,
                  typename DomainVector::device_type ,
                  Kokkos::MemoryRandomAccess >
    DomainVectorType;

    typedef View< typename CoeffVector1::const_data_type ,
                  typename CoeffVector1::array_layout ,
                  typename CoeffVector1::device_type ,
                  Kokkos::MemoryRandomAccess >
    CoeffVector1Type;

    typedef View< typename CoeffVector2::const_data_type ,
                  typename CoeffVector2::array_layout ,
                  typename CoeffVector2::device_type ,
                  Kokkos::MemoryRandomAccess >
    CoeffVector2Type;

    typedef CrsMatrix<typename TCrsMatrix::const_value_type,
                      typename TCrsMatrix::ordinal_type,
                      typename TCrsMatrix::device_type,
                      typename TCrsMatrix::memory_traits,
                      typename TCrsMatrix::size_type> CrsMatrixType;

    //Impl::MV_Multiply_Check_Compatibility(betav,y,alphav,A,x,doalpha,dobeta);
    // (A non-FAST_COMPILE specialization of this path -- dispatching on
    // x.dimension_1() == 1 to 1-D subviews and MV_MultiplyFunctor -- is
    // commented out in the original source; the functor below is always
    // used, with doalpha = dobeta = 2 and runtime coefficient vectors.)
    typedef MV_MultiplyTransposeFunctor<RangeVectorType, CrsMatrixType, DomainVectorType, CoeffVector1Type, CoeffVector2Type, 2, 2 >
      OpType ;

    OpType op ;

    int numVecs = x.dimension_1();
    CoeffVector1 beta = betav;
    CoeffVector2 alpha = alphav;

    // doalpha is a literal +/-1 or other constant: materialize it as a
    // device coefficient vector so the (2,2) functor can be reused.
    if (doalpha != 2) {
      alpha = CoeffVector2("CrsMatrix::auto_a", numVecs);
      typename CoeffVector2::HostMirror h_a = Kokkos::create_mirror_view(alpha);
      typename CoeffVector2::value_type s_a = (typename CoeffVector2::value_type) doalpha;

      for (int i = 0; i < numVecs; ++i)
        h_a(i) = s_a;

      Kokkos::deep_copy(alpha, h_a);
    }

    // Likewise for beta.
    if (dobeta != 2) {
      beta = CoeffVector1("CrsMatrix::auto_b", numVecs);
      typename CoeffVector1::HostMirror h_b = Kokkos::create_mirror_view(beta);
      typename CoeffVector1::value_type s_b = (typename CoeffVector1::value_type) dobeta;

      for(int i = 0; i < numVecs; i++)
        h_b(i) = s_b;

      Kokkos::deep_copy(beta, h_b);
    }

    // Pre-scale y by beta; the scatter kernel then only accumulates.
    if (dobeta==2) {
      MV_MulScalar(y,betav,y);
    } else {
      if (dobeta!=1)
        MV_MulScalar(y,static_cast<typename RangeVector::const_value_type> (dobeta),y);
    }

    const typename CrsMatrixType::ordinal_type nrow = A.numRows();
    op.m_A = A;
    op.m_x = x;
    op.m_y = y;
    op.beta = beta;
    op.alpha = alpha;
    op.n = x.dimension_1();
    Kokkos::parallel_for (nrow * OpType::ShflThreadsPerRow::host_value() , op );
  }
}
// Runtime-to-compile-time dispatch for the transpose multiply: maps the
// integer alpha/beta modes (0, +1, -1, or "other" -> 2) onto the
// corresponding template instantiation.  beta == 1 with alpha == 0 is a
// no-op and returns immediately.
template<class RangeVector, class CrsMatrix, class DomainVector, class CoeffVector1, class CoeffVector2>
void
MV_MultiplyTranspose (const CoeffVector1& betav,
                      const RangeVector& y,
                      const CoeffVector2& alphav,
                      const CrsMatrix& A,
                      const DomainVector& x,
                      int beta,
                      int alpha)
{
  if (beta == 0) {
    if(alpha == 0)
      MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 0, 0>(betav, y, alphav, A , x);
    else if(alpha == 1)
      MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, 0>(betav, y, alphav, A , x);
    else if(alpha == -1)
      MV_MultiplyTranspose < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, 0 > (betav, y, alphav, A , x);
    else
      MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, 0>(betav, y, alphav, A , x);
  } else if(beta == 1) {
    if(alpha == 0)
      return; // y = 1*y + 0*A^T*x leaves y unchanged.
    else if(alpha == 1)
      MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, 1>(betav, y, alphav, A , x);
    else if(alpha == -1)
      MV_MultiplyTranspose < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, 1 > (betav, y, alphav, A , x);
    else
      MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, 1>(betav, y, alphav, A , x);
  } else if(beta == -1) {
    if(alpha == 0)
      MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 0, -1>(betav, y, alphav, A , x);
    else if(alpha == 1)
      MV_MultiplyTranspose < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, -1 > (betav, y, alphav, A , x);
    else if(alpha == -1)
      MV_MultiplyTranspose < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, -1 > (betav, y, alphav, A , x);
    else
      MV_MultiplyTranspose < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, -1 > (betav, y, alphav, A , x);
  } else {
    if(alpha == 0)
      MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 0, 2>(betav, y, alphav, A , x);
    else if(alpha == 1)
      MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, 2>(betav, y, alphav, A , x);
    else if(alpha == -1)
      MV_MultiplyTranspose < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, 2 > (betav, y, alphav, A , x);
    else
      MV_MultiplyTranspose<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, 2>(betav, y, alphav, A , x);
  }
}
// Scalar-coefficient entry point: y = s_b*y + s_a*A^T*x.  Recognizes the
// special scalar values 0, +1, -1 directly; any other scalar is broadcast
// into a device coefficient vector and dispatched as mode 2.
template<class RangeVector, class CrsMatrix, class DomainVector>
void
MV_MultiplyTranspose (typename RangeVector::const_value_type s_b,
                      const RangeVector& y,
                      typename DomainVector::const_value_type s_a,
                      const CrsMatrix& A,
                      const DomainVector& x)
{
  /*#ifdef KOKKOS_USE_CUSPARSE
  if (MV_Multiply_Try_CuSparse (s_b, y, s_a, A, x)) {
    return;
  }
  #endif // KOKKOSE_USE_CUSPARSE
  #ifdef KOKKOS_USE_MKL
  if (MV_Multiply_Try_MKL (s_b, y, s_a, A, x)) {
    return;
  }
  #endif // KOKKOS_USE_MKL*/
  typedef Kokkos::View<typename RangeVector::value_type*, typename RangeVector::device_type> aVector;
  aVector a; // broadcast alpha coefficients (filled only for the general case)
  aVector b; // broadcast beta coefficients (filled only for the general case)
  int numVecs = x.dimension_1();

  if (s_b == 0) {
    if (s_a == 0)
      return MV_MultiplyTranspose (a, y, a, A, x, 0, 0);
    else if (s_a == 1)
      return MV_MultiplyTranspose (a, y, a, A, x, 0, 1);
    else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
      return MV_MultiplyTranspose (a, y, a, A, x, 0, -1);
    else {
      // General alpha: replicate s_a across one entry per vector.
      a = aVector("a", numVecs);
      typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
      for (int i = 0; i < numVecs; ++i) {
        h_a(i) = s_a;
      }
      Kokkos::deep_copy (a, h_a);
      return MV_MultiplyTranspose (a, y, a, A, x, 0, 2);
    }
  } else if (s_b == 1) {
    if (s_a == 0)
      return MV_MultiplyTranspose (a, y, a, A, x, 1, 0);
    else if (s_a == 1)
      return MV_MultiplyTranspose (a, y, a, A, x, 1, 1);
    else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
      return MV_MultiplyTranspose (a, y, a, A, x, 1, -1);
    else {
      a = aVector("a", numVecs);
      typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
      for (int i = 0; i < numVecs; ++i) {
        h_a(i) = s_a;
      }
      Kokkos::deep_copy (a, h_a);
      return MV_MultiplyTranspose (a, y, a, A, x, 1, 2);
    }
  } else if (s_b == static_cast<typename RangeVector::const_value_type> (-1)) {
    if (s_a == 0)
      return MV_MultiplyTranspose (a, y, a, A, x, -1, 0);
    else if (s_a == 1)
      return MV_MultiplyTranspose (a, y, a, A, x, -1, 1);
    else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
      return MV_MultiplyTranspose (a, y, a, A, x, -1, -1);
    else {
      a = aVector("a", numVecs);
      typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
      for (int i = 0; i < numVecs; ++i) {
        h_a(i) = s_a;
      }
      Kokkos::deep_copy (a, h_a);
      return MV_MultiplyTranspose (a, y, a, A, x, -1, 2);
    }
  } else {
    // General beta: replicate s_b across one entry per vector.
    b = aVector("b", numVecs);
    typename aVector::HostMirror h_b = Kokkos::create_mirror_view(b);
    for (int i = 0; i < numVecs; ++i) {
      h_b(i) = s_b;
    }
    Kokkos::deep_copy(b, h_b);

    if (s_a == 0)
      return MV_MultiplyTranspose (b, y, a, A, x, 2, 0);
    else if (s_a == 1)
      return MV_MultiplyTranspose (b, y, a, A, x, 2, 1);
    else if (s_a == static_cast<typename DomainVector::const_value_type> (-1))
      return MV_MultiplyTranspose (b, y, a, A, x, 2, -1);
    else {
      a = aVector("a", numVecs);
      typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a);
      for (int i = 0; i < numVecs; ++i) {
        h_a(i) = s_a;
      }
      Kokkos::deep_copy (a, h_a);
      return MV_MultiplyTranspose (b, y, a, A, x, 2, 2);
    }
  }
}
Kokkos::create_mirror_view(a); 01842 for (int i = 0; i < numVecs; ++i) { 01843 h_a(i) = s_a; 01844 } 01845 Kokkos::deep_copy (a, h_a); 01846 return MV_MultiplyTranspose (a, y, a, A, x, -1, 2); 01847 } 01848 } else { 01849 b = aVector("b", numVecs); 01850 typename aVector::HostMirror h_b = Kokkos::create_mirror_view(b); 01851 for (int i = 0; i < numVecs; ++i) { 01852 h_b(i) = s_b; 01853 } 01854 Kokkos::deep_copy(b, h_b); 01855 01856 if (s_a == 0) 01857 return MV_MultiplyTranspose (b, y, a, A, x, 2, 0); 01858 else if (s_a == 1) 01859 return MV_MultiplyTranspose (b, y, a, A, x, 2, 1); 01860 else if (s_a == static_cast<typename DomainVector::const_value_type> (-1)) 01861 return MV_MultiplyTranspose (b, y, a, A, x, 2, -1); 01862 else { 01863 a = aVector("a", numVecs); 01864 typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a); 01865 for (int i = 0; i < numVecs; ++i) { 01866 h_a(i) = s_a; 01867 } 01868 Kokkos::deep_copy (a, h_a); 01869 return MV_MultiplyTranspose (b, y, a, A, x, 2, 2); 01870 } 01871 } 01872 } 01873 01874 template< class DeviceType > 01875 Kokkos::TeamPolicy< DeviceType > 01876 inline 01877 mv_multiply_team_policy( const int nrow , const int rows_per_thread, const int increment ) 01878 { 01879 #ifdef KOKKOS_HAVE_CUDA 01880 const int teamsize = Impl::is_same< DeviceType , Kokkos::Cuda>::value ? 
// y = beta*y + alpha*A*x for rank-1 x and y.  Chooses a vectorization
// width from the average number of nonzeros per row and launches
// MV_MultiplySingleFunctor with a matching team policy.
template<class RangeVector,
         class TCrsMatrix,
         class DomainVector,
         class CoeffVector1,
         class CoeffVector2,
         int doalpha,
         int dobeta>
void
MV_MultiplySingle (typename Kokkos::Impl::enable_if<DomainVector::Rank == 1, const CoeffVector1>::type& betav,
                   const RangeVector &y,
                   const CoeffVector2 &alphav,
                   const TCrsMatrix& A,
                   const DomainVector& x)
{
  // Empty matrix: nothing to do (also guards the NNZPerRow division below).
  if(A.numRows()<=0) return;
  if (doalpha == 0) {
    // alpha == 0: just scale y by beta.
    if (dobeta==2) {
      V_MulScalar(y,betav,y);
    }
    else {
      V_MulScalar(y,typename RangeVector::value_type(dobeta),y);
    }
    return;
  } else {
    // Const / random-access views of the operands for the device kernel.
    typedef View< typename RangeVector::non_const_data_type ,
                  typename RangeVector::array_layout ,
                  typename RangeVector::device_type ,
                  typename RangeVector::memory_traits >
    RangeVectorType;

    typedef View< typename DomainVector::const_data_type ,
                  typename DomainVector::array_layout ,
                  typename DomainVector::device_type ,
                  //typename DomainVector::memory_traits >
                  Kokkos::MemoryRandomAccess >
    DomainVectorType;

    typedef View< typename CoeffVector1::const_data_type ,
                  typename CoeffVector1::array_layout ,
                  typename CoeffVector1::device_type ,
                  Kokkos::MemoryRandomAccess >
    CoeffVector1Type;

    typedef View< typename CoeffVector2::const_data_type ,
                  typename CoeffVector2::array_layout ,
                  typename CoeffVector2::device_type ,
                  Kokkos::MemoryRandomAccess >
    CoeffVector2Type;

    typedef CrsMatrix<typename TCrsMatrix::const_value_type,
                      typename TCrsMatrix::ordinal_type,
                      typename TCrsMatrix::device_type,
                      typename TCrsMatrix::memory_traits,
                      typename TCrsMatrix::size_type>
    CrsMatrixType;

    Impl::MV_Multiply_Check_Compatibility(betav,y,alphav,A,x,doalpha,dobeta);

    // Average nonzeros per row drives the vector-lane width below.
    const int NNZPerRow = A.nnz()/A.numRows();

#ifndef KOKKOS_FAST_COMPILE

    if(NNZPerRow>=96) {
      typedef Vectorization<typename RangeVector::device_type,32> vec_type;
      typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
                                       CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment > OpType ;

      const typename CrsMatrixType::ordinal_type nrow = A.numRows();

      OpType op(betav,alphav,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
      Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
                            ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );

    } else if(NNZPerRow>=48) {
      typedef Vectorization<typename RangeVector::device_type,16> vec_type;
      typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
                                       CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment > OpType ;

      const typename CrsMatrixType::ordinal_type nrow = A.numRows();

      OpType op(betav,alphav,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
      Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
                            ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );

    } else if(NNZPerRow>=24) {
      typedef Vectorization<typename RangeVector::device_type,8> vec_type;
      typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
                                       CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment > OpType ;

      const typename CrsMatrixType::ordinal_type nrow = A.numRows();

      OpType op(betav,alphav,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
      Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
                            ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );

    } else if(NNZPerRow>=12) {
      typedef Vectorization<typename RangeVector::device_type,4> vec_type;
      typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
                                       CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment > OpType ;

      const typename CrsMatrixType::ordinal_type nrow = A.numRows();

      OpType op(betav,alphav,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
      Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
                            ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );

    } else if(NNZPerRow>=4) {
      typedef Vectorization<typename RangeVector::device_type,2> vec_type;
      typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
                                       CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment > OpType ;

      const typename CrsMatrixType::ordinal_type nrow = A.numRows();

      OpType op(betav,alphav,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
      Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
                            ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );

    } else {
      typedef Vectorization<typename RangeVector::device_type,1> vec_type;
      typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
                                       CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment > OpType ;

      const typename CrsMatrixType::ordinal_type nrow = A.numRows();

      OpType op(betav,alphav,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
      Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
                            ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );

    }
#else // NOT KOKKOS_FAST_COMPILE
    // Fast-compile build: fixed width 8 and runtime coefficients (2,2);
    // literal doalpha/dobeta are materialized as coefficient vectors.
    typedef Vectorization<typename RangeVector::device_type,8> vec_type;
    typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType,
                                     CoeffVector1Type, CoeffVector2Type, 2, 2, vec_type::increment > OpType ;

    int numVecs = x.dimension_1(); // == 1
    CoeffVector1 beta = betav;
    CoeffVector2 alpha = alphav;

    if(doalpha!=2) {
      alpha = CoeffVector2("CrsMatrix::auto_a", numVecs);
      typename CoeffVector2::HostMirror h_a = Kokkos::create_mirror_view(alpha);
      typename CoeffVector2::value_type s_a = (typename CoeffVector2::value_type) doalpha;

      for(int i = 0; i < numVecs; i++)
        h_a(i) = s_a;

      Kokkos::deep_copy(alpha, h_a);
    }
    if(dobeta!=2) {
      beta = CoeffVector1("CrsMatrix::auto_b", numVecs);
      typename CoeffVector1::HostMirror h_b = Kokkos::create_mirror_view(beta);
      typename CoeffVector1::value_type s_b = (typename CoeffVector1::value_type) dobeta;

      for(int i = 0; i < numVecs; i++)
        h_b(i) = s_b;

      Kokkos::deep_copy(beta, h_b);
    }

    const typename CrsMatrixType::ordinal_type nrow = A.numRows();

    OpType op(beta,alpha,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ;
    Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type >
                          ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op );

#endif // KOKKOS_FAST_COMPILE
  }
}
CrsMatrixType::ordinal_type nrow = A.numRows(); 02013 02014 OpType op(betav,alphav,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ; 02015 Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type > 02016 ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op ); 02017 02018 } 02019 #else // NOT KOKKOS_FAST_COMPILE 02020 typedef Vectorization<typename RangeVector::device_type,8> vec_type; 02021 typedef MV_MultiplySingleFunctor<RangeVectorType, CrsMatrixType, DomainVectorType, 02022 CoeffVector1Type, CoeffVector2Type, 2, 2, vec_type::increment > OpType ; 02023 02024 int numVecs = x.dimension_1(); // == 1 02025 CoeffVector1 beta = betav; 02026 CoeffVector2 alpha = alphav; 02027 02028 if(doalpha!=2) { 02029 alpha = CoeffVector2("CrsMatrix::auto_a", numVecs); 02030 typename CoeffVector2::HostMirror h_a = Kokkos::create_mirror_view(alpha); 02031 typename CoeffVector2::value_type s_a = (typename CoeffVector2::value_type) doalpha; 02032 02033 for(int i = 0; i < numVecs; i++) 02034 h_a(i) = s_a; 02035 02036 Kokkos::deep_copy(alpha, h_a); 02037 } 02038 if(dobeta!=2) { 02039 beta = CoeffVector1("CrsMatrix::auto_b", numVecs); 02040 typename CoeffVector1::HostMirror h_b = Kokkos::create_mirror_view(beta); 02041 typename CoeffVector1::value_type s_b = (typename CoeffVector1::value_type) dobeta; 02042 02043 for(int i = 0; i < numVecs; i++) 02044 h_b(i) = s_b; 02045 02046 Kokkos::deep_copy(beta, h_b); 02047 } 02048 02049 const typename CrsMatrixType::ordinal_type nrow = A.numRows(); 02050 02051 OpType op(beta,alpha,A,x,y,RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ; 02052 Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type > 02053 ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op ); 02054 02055 02056 #endif // KOKKOS_FAST_COMPILE 02057 } 02058 } 02059 02060 template <class RangeVector, 02061 class 
TCrsMatrix, 02062 class DomainVector, 02063 class CoeffVector1, 02064 class CoeffVector2, 02065 int doalpha, 02066 int dobeta> 02067 void 02068 MV_Multiply (typename Kokkos::Impl::enable_if<DomainVector::Rank == 2, const CoeffVector1>::type& betav, 02069 const RangeVector &y, 02070 const CoeffVector2 &alphav, 02071 const TCrsMatrix &A, 02072 const DomainVector &x) 02073 { 02074 //Special case for zero Rows RowMap 02075 if(A.numRows() <= 0) return; 02076 02077 if (doalpha == 0) { 02078 if (dobeta==2) { 02079 MV_MulScalar(y,betav,y); 02080 } else { 02081 MV_MulScalar(y,static_cast<typename RangeVector::const_value_type> (dobeta),y); 02082 } 02083 return; 02084 } else { 02085 typedef View< typename RangeVector::non_const_data_type , 02086 typename RangeVector::array_layout , 02087 typename RangeVector::device_type , 02088 typename RangeVector::memory_traits > 02089 RangeVectorType; 02090 02091 typedef View< typename DomainVector::const_data_type , 02092 typename DomainVector::array_layout , 02093 typename DomainVector::device_type , 02094 Kokkos::MemoryRandomAccess > 02095 DomainVectorType; 02096 02097 typedef View< typename CoeffVector1::const_data_type , 02098 typename CoeffVector1::array_layout , 02099 typename CoeffVector1::device_type , 02100 Kokkos::MemoryRandomAccess > 02101 CoeffVector1Type; 02102 02103 typedef View< typename CoeffVector2::const_data_type , 02104 typename CoeffVector2::array_layout , 02105 typename CoeffVector2::device_type , 02106 Kokkos::MemoryRandomAccess > 02107 CoeffVector2Type; 02108 02109 typedef CrsMatrix<typename TCrsMatrix::const_value_type, 02110 typename TCrsMatrix::ordinal_type, 02111 typename TCrsMatrix::device_type, 02112 typename TCrsMatrix::memory_traits, 02113 typename TCrsMatrix::size_type> CrsMatrixType; 02114 02115 Impl::MV_Multiply_Check_Compatibility(betav,y,alphav,A,x,doalpha,dobeta); 02116 02117 const int NNZPerRow = A.nnz()/A.numRows(); 02118 02119 #ifndef KOKKOS_FAST_COMPILE 02120 02121 if(x.dimension_1()==1) { 02122 
typedef View<typename DomainVectorType::const_value_type*,typename DomainVector::array_layout ,typename DomainVectorType::device_type> DomainVector1D; 02123 typedef View<typename RangeVectorType::value_type*,typename RangeVector::array_layout ,typename RangeVectorType::device_type,typename RangeVector::memory_traits> RangeVector1D; 02124 RangeVector1D y_sub = RangeVector1D(Kokkos::ViewWithoutManaging(),y.ptr_on_device(),y.dimension_0()); 02125 DomainVector1D x_sub = DomainVector1D(Kokkos::ViewWithoutManaging(),x.ptr_on_device(),x.dimension_0()); 02126 02127 return MV_MultiplySingle<RangeVector1D,TCrsMatrix,DomainVector1D,CoeffVector1,CoeffVector2,doalpha,dobeta> 02128 (betav,y_sub,alphav,A,x_sub); 02129 02130 } else { 02131 02132 //Currently for multiple right hand sides its not worth it to use more than 8 threads per row on GPUs 02133 if(NNZPerRow>=96) { 02134 typedef Vectorization<typename RangeVector::device_type,8> vec_type; 02135 typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType, 02136 CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment> OpType ; 02137 02138 const typename CrsMatrixType::ordinal_type nrow = A.numRows(); 02139 02140 OpType op(betav,alphav,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ; 02141 Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type > 02142 ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op ); 02143 02144 } else if(NNZPerRow>=48) { 02145 typedef Vectorization<typename RangeVector::device_type,8> vec_type; 02146 typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType, 02147 CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment> OpType ; 02148 02149 const typename CrsMatrixType::ordinal_type nrow = A.numRows(); 02150 02151 OpType op(betav,alphav,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ; 02152 
Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type > 02153 ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op ); 02154 02155 } else if(NNZPerRow>=16) { 02156 typedef Vectorization<typename RangeVector::device_type,8> vec_type; 02157 typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType, 02158 CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment> OpType ; 02159 02160 const typename CrsMatrixType::ordinal_type nrow = A.numRows(); 02161 02162 OpType op(betav,alphav,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ; 02163 Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type > 02164 ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op ); 02165 02166 } else if(NNZPerRow>=12) { 02167 typedef Vectorization<typename RangeVector::device_type,4> vec_type; 02168 typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType, 02169 CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment> OpType ; 02170 02171 const typename CrsMatrixType::ordinal_type nrow = A.numRows(); 02172 02173 OpType op(betav,alphav,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ; 02174 Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type > 02175 ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op ); 02176 02177 } else if(NNZPerRow>=4) { 02178 typedef Vectorization<typename RangeVector::device_type,2> vec_type; 02179 typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType, 02180 CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment> OpType ; 02181 02182 const typename CrsMatrixType::ordinal_type nrow = A.numRows(); 02183 02184 OpType op(betav,alphav,A,x,y,x.dimension_1(),RowsPerThread<typename 
RangeVector::device_type >(NNZPerRow)) ; 02185 Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type > 02186 ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op ); 02187 02188 } else { 02189 typedef Vectorization<typename RangeVector::device_type,1> vec_type; 02190 typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType, 02191 CoeffVector1Type, CoeffVector2Type, doalpha, dobeta,vec_type::increment> OpType ; 02192 02193 const typename CrsMatrixType::ordinal_type nrow = A.numRows(); 02194 02195 OpType op(betav,alphav,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ; 02196 Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type > 02197 ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op ); 02198 02199 } 02200 } 02201 02202 #else // NOT KOKKOS_FAST_COMPILE 02203 02204 typedef Vectorization<typename RangeVector::device_type,8> vec_type; 02205 typedef MV_MultiplyFunctor<RangeVectorType, CrsMatrixType, DomainVectorType, 02206 CoeffVector1Type, CoeffVector2Type, 2, 2, vec_type::increment > 02207 OpType ; 02208 02209 int numVecs = x.dimension_1(); 02210 CoeffVector1 beta = betav; 02211 CoeffVector2 alpha = alphav; 02212 02213 if (doalpha != 2) { 02214 alpha = CoeffVector2("CrsMatrix::auto_a", numVecs); 02215 typename CoeffVector2::HostMirror h_a = Kokkos::create_mirror_view(alpha); 02216 typename CoeffVector2::value_type s_a = (typename CoeffVector2::value_type) doalpha; 02217 02218 for (int i = 0; i < numVecs; ++i) 02219 h_a(i) = s_a; 02220 02221 Kokkos::deep_copy(alpha, h_a); 02222 } 02223 02224 if (dobeta != 2) { 02225 beta = CoeffVector1("CrsMatrix::auto_b", numVecs); 02226 typename CoeffVector1::HostMirror h_b = Kokkos::create_mirror_view(beta); 02227 typename CoeffVector1::value_type s_b = (typename CoeffVector1::value_type) dobeta; 02228 02229 for(int i = 0; i < numVecs; 
i++) 02230 h_b(i) = s_b; 02231 02232 Kokkos::deep_copy(beta, h_b); 02233 } 02234 02235 const typename CrsMatrixType::ordinal_type nrow = A.numRows(); 02236 02237 OpType op(beta,alpha,A,x,y,x.dimension_1(),RowsPerThread<typename RangeVector::device_type >(NNZPerRow)) ; 02238 02239 Kokkos::parallel_for( mv_multiply_team_policy< typename RangeVector::device_type > 02240 ( nrow ,RowsPerThread<typename RangeVector::device_type >(NNZPerRow), vec_type::increment ) , op ); 02241 02242 02243 #endif // KOKKOS_FAST_COMPILE 02244 } 02245 } 02246 02247 template<class RangeVector, 02248 class TCrsMatrix, 02249 class DomainVector, 02250 class CoeffVector1, 02251 class CoeffVector2, 02252 int doalpha, 02253 int dobeta> 02254 void 02255 MV_Multiply (typename Kokkos::Impl::enable_if<DomainVector::Rank == 1, const CoeffVector1>::type& betav, 02256 const RangeVector &y, 02257 const CoeffVector2 &alphav, 02258 const TCrsMatrix& A, 02259 const DomainVector& x) { 02260 return MV_MultiplySingle<RangeVector,TCrsMatrix,DomainVector,CoeffVector1,CoeffVector2,doalpha,dobeta> 02261 (betav,y,alphav,A,x); 02262 } 02263 02264 template<class RangeVector, class CrsMatrix, class DomainVector, class CoeffVector1, class CoeffVector2> 02265 void 02266 MV_Multiply (const CoeffVector1& betav, 02267 const RangeVector& y, 02268 const CoeffVector2& alphav, 02269 const CrsMatrix& A, 02270 const DomainVector& x, 02271 int beta, 02272 int alpha) 02273 { 02274 if (beta == 0) { 02275 if(alpha == 0) 02276 MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 0, 0>(betav, y, alphav, A , x); 02277 else if(alpha == 1) 02278 MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, 0>(betav, y, alphav, A , x); 02279 else if(alpha == -1) 02280 MV_Multiply < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, 0 > (betav, y, alphav, A , x); 02281 else 02282 MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, 0>(betav, y, 
alphav, A , x); 02283 } else if(beta == 1) { 02284 if(alpha == 0) 02285 return; 02286 else if(alpha == 1) 02287 MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, 1>(betav, y, alphav, A , x); 02288 else if(alpha == -1) 02289 MV_Multiply < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, 1 > (betav, y, alphav, A , x); 02290 else 02291 MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, 1>(betav, y, alphav, A , x); 02292 } else if(beta == -1) { 02293 if(alpha == 0) 02294 MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 0, -1>(betav, y, alphav, A , x); 02295 else if(alpha == 1) 02296 MV_Multiply < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, -1 > (betav, y, alphav, A , x); 02297 else if(alpha == -1) 02298 MV_Multiply < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, -1 > (betav, y, alphav, A , x); 02299 else 02300 MV_Multiply < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, -1 > (betav, y, alphav, A , x); 02301 } else { 02302 if(alpha == 0) 02303 MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 0, 2>(betav, y, alphav, A , x); 02304 else if(alpha == 1) 02305 MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 1, 2>(betav, y, alphav, A , x); 02306 else if(alpha == -1) 02307 MV_Multiply < RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, -1, 2 > (betav, y, alphav, A , x); 02308 else 02309 MV_Multiply<RangeVector, CrsMatrix, DomainVector, CoeffVector1, CoeffVector2, 2, 2>(betav, y, alphav, A , x); 02310 } 02311 } 02312 02313 template <class RangeVector, class CrsMatrix, class DomainVector, 02314 class Value1, class Layout1, class Device1, class MemoryManagement1, 02315 class Value2, class Layout2, class Device2, class MemoryManagement2> 02316 void 02317 MV_Multiply (const Kokkos::View<Value1, Layout1, Device1, 
MemoryManagement1>& betav, 02318 const RangeVector& y, 02319 const Kokkos::View<Value2, Layout2, Device2, MemoryManagement2>& alphav, 02320 const CrsMatrix& A, 02321 const DomainVector& x) 02322 { 02323 return MV_Multiply (betav, y, alphav, A, x, 2, 2); 02324 } 02325 02326 template <class RangeVector, class CrsMatrix, class DomainVector, 02327 class Value1, class Layout1, class Device1, class MemoryManagement1> 02328 void 02329 MV_Multiply (const RangeVector& y, 02330 const Kokkos::View<Value1, Layout1, Device1, MemoryManagement1>& alphav, 02331 const CrsMatrix& A, 02332 const DomainVector& x) 02333 { 02334 return MV_Multiply (alphav, y, alphav, A, x, 0, 2); 02335 } 02336 02337 template<class RangeVector, class CrsMatrix, class DomainVector> 02338 void 02339 MV_Multiply (const RangeVector& y, 02340 const CrsMatrix& A, 02341 const DomainVector& x) 02342 { 02343 // FIXME (mfh 21 Jun 2013) The way this code is supposed to work, is 02344 // that it tests at run time for each TPL in turn. Shouldn't it 02345 // rather dispatch on the Device type? But I suppose the "try" 02346 // functions do that. 02347 // 02348 // We want to condense this a bit: "Try TPLs" function that tests 02349 // all the suitable TPLs at run time. This would be a run-time test 02350 // that compares the Scalar and Device types to those accepted by 02351 // the TPL(s). 
02352 #ifdef KOKKOS_USE_CUSPARSE 02353 if (CuSparse::MV_Multiply_Try_CuSparse (0.0, y, 1.0, A, x)) { 02354 return; 02355 } 02356 #endif // KOKKOS_USE_CUSPARSE 02357 #ifdef KOKKOS_USE_MKL 02358 if (MV_Multiply_Try_MKL (0.0, y, 1.0, A, x)) { 02359 return; 02360 } 02361 #endif // KOKKOS_USE_MKL 02362 typedef Kokkos::View<typename DomainVector::value_type*, typename DomainVector::device_type> aVector; 02363 aVector a; 02364 02365 return MV_Multiply (a, y, a, A, x, 0, 1); 02366 } 02367 02368 template<class RangeVector, class CrsMatrix, class DomainVector> 02369 void 02370 MV_Multiply (const RangeVector& y, 02371 typename DomainVector::const_value_type s_a, 02372 const CrsMatrix& A, 02373 const DomainVector& x) 02374 { 02375 #ifdef KOKKOS_USE_CUSPARSE 02376 if (CuSparse::MV_Multiply_Try_CuSparse (0.0, y, s_a, A, x)) { 02377 return; 02378 } 02379 #endif // KOKKOS_USE_CUSPARSE 02380 #ifdef KOKKOS_USE_MKL 02381 if (MV_Multiply_Try_MKL (0.0, y, s_a, A, x)) { 02382 return; 02383 } 02384 #endif // KOKKOS_USE_MKL 02385 typedef Kokkos::View<typename RangeVector::value_type*, typename RangeVector::device_type> aVector; 02386 aVector a; 02387 const int numVecs = x.dimension_1(); 02388 02389 if ((s_a < 1) && (s_a != 0)) { 02390 return MV_Multiply (a, y, a, A, x, 0, -1); 02391 } else if (s_a == 1) { 02392 return MV_Multiply (a, y, a, A, x, 0, 1); 02393 } 02394 02395 if (s_a != 0) { 02396 a = aVector("a", numVecs); 02397 typename aVector::HostMirror h_a = Kokkos::create_mirror_view (a); 02398 for (int i = 0; i < numVecs; ++i) { 02399 h_a(i) = s_a; 02400 } 02401 Kokkos::deep_copy(a, h_a); 02402 return MV_Multiply (a, y, a, A, x, 0, 2); 02403 } 02404 } 02405 02406 template<class RangeVector, class CrsMatrix, class DomainVector> 02407 void 02408 MV_Multiply (typename RangeVector::const_value_type s_b, 02409 const RangeVector& y, 02410 typename DomainVector::const_value_type s_a, 02411 const CrsMatrix& A, 02412 const DomainVector& x) 02413 { 02414 #ifdef KOKKOS_USE_CUSPARSE 02415 if 
(CuSparse::MV_Multiply_Try_CuSparse (s_b, y, s_a, A, x)) { 02416 return; 02417 } 02418 #endif // KOKKOSE_USE_CUSPARSE 02419 #ifdef KOKKOS_USE_MKL 02420 if (MV_Multiply_Try_MKL (s_b, y, s_a, A, x)) { 02421 return; 02422 } 02423 #endif // KOKKOS_USE_MKL 02424 typedef Kokkos::View<typename RangeVector::value_type*, typename RangeVector::device_type> aVector; 02425 aVector a; 02426 aVector b; 02427 int numVecs = x.dimension_1(); 02428 02429 // [HCE 2013/12/09] Following 'if' appears to be a mistake and has been commented out 02430 // if(numVecs==1) 02431 if (s_b == 0) { 02432 if (s_a == 0) 02433 return MV_Multiply (a, y, a, A, x, 0, 0); 02434 else if (s_a == 1) 02435 return MV_Multiply (a, y, a, A, x, 0, 1); 02436 else if (s_a == static_cast<typename DomainVector::const_value_type> (-1)) 02437 return MV_Multiply (a, y, a, A, x, 0, -1); 02438 else { 02439 a = aVector("a", numVecs); 02440 typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a); 02441 for (int i = 0; i < numVecs; ++i) { 02442 h_a(i) = s_a; 02443 } 02444 Kokkos::deep_copy (a, h_a); 02445 return MV_Multiply (a, y, a, A, x, 0, 2); 02446 } 02447 } else if (s_b == 1) { 02448 if (s_a == 0) 02449 return MV_Multiply (a, y, a, A, x, 1, 0); 02450 else if (s_a == 1) 02451 return MV_Multiply (a, y, a, A, x, 1, 1); 02452 else if (s_a == static_cast<typename DomainVector::const_value_type> (-1)) 02453 return MV_Multiply (a, y, a, A, x, 1, -1); 02454 else { 02455 a = aVector("a", numVecs); 02456 typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a); 02457 for (int i = 0; i < numVecs; ++i) { 02458 h_a(i) = s_a; 02459 } 02460 Kokkos::deep_copy (a, h_a); 02461 return MV_Multiply (a, y, a, A, x, 1, 2); 02462 } 02463 } else if (s_b == static_cast<typename RangeVector::const_value_type> (-1)) { 02464 if (s_a == 0) 02465 return MV_Multiply (a, y, a, A, x, -1, 0); 02466 else if (s_a == 1) 02467 return MV_Multiply (a, y, a, A, x, -1, 1); 02468 else if (s_a == static_cast<typename 
DomainVector::const_value_type> (-1)) 02469 return MV_Multiply (a, y, a, A, x, -1, -1); 02470 else { 02471 a = aVector("a", numVecs); 02472 typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a); 02473 for (int i = 0; i < numVecs; ++i) { 02474 h_a(i) = s_a; 02475 } 02476 Kokkos::deep_copy (a, h_a); 02477 return MV_Multiply (a, y, a, A, x, -1, 2); 02478 } 02479 } else { 02480 b = aVector("b", numVecs); 02481 typename aVector::HostMirror h_b = Kokkos::create_mirror_view(b); 02482 for (int i = 0; i < numVecs; ++i) { 02483 h_b(i) = s_b; 02484 } 02485 Kokkos::deep_copy(b, h_b); 02486 02487 if (s_a == 0) 02488 return MV_Multiply (b, y, a, A, x, 2, 0); 02489 else if (s_a == 1) 02490 return MV_Multiply (b, y, a, A, x, 2, 1); 02491 else if (s_a == static_cast<typename DomainVector::const_value_type> (-1)) 02492 return MV_Multiply (b, y, a, A, x, 2, -1); 02493 else { 02494 a = aVector("a", numVecs); 02495 typename aVector::HostMirror h_a = Kokkos::create_mirror_view(a); 02496 for (int i = 0; i < numVecs; ++i) { 02497 h_a(i) = s_a; 02498 } 02499 Kokkos::deep_copy (a, h_a); 02500 return MV_Multiply (b, y, a, A, x, 2, 2); 02501 } 02502 } 02503 } 02504 02505 namespace KokkosCrsMatrix { 02519 template <class CrsMatrixDst, class CrsMatrixSrc> 02520 void deep_copy (CrsMatrixDst A, CrsMatrixSrc B) { 02521 Kokkos::deep_copy(A.graph.entries, B.graph.entries); 02522 // FIXME (mfh 09 Aug 2013) This _should_ copy the row map. We 02523 // couldn't do it before because the row map was const, forbidding 02524 // deep_copy. 02525 // 02526 //Kokkos::deep_copy(A.graph.row_map,B.graph.row_map); 02527 Kokkos::deep_copy(A.values, B.values); 02528 02529 // FIXME (mfh 09 Aug 2013) Be sure to copy numRows, numCols, and 02530 // nnz as well. 02531 // (CRT 25 Sep 2013) don't copy rather check that they match. 02532 // Deep_copy in Kokkos is intended for copy between compatible objects. 
02533 } 02534 } // namespace KokkosCrsMatrix 02535 02536 } // namespace Kokkos 02537 02538 #endif /* KOKKOS_CRSMATRIX_H_ */
// Generated by Doxygen 1.7.6.1