Tpetra Matrix/Vector Services — Version of the Day
00001 // @HEADER 00002 // *********************************************************************** 00003 // 00004 // Tpetra: Templated Linear Algebra Services Package 00005 // Copyright (2008) Sandia Corporation 00006 // 00007 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, 00008 // the U.S. Government retains certain rights in this software. 00009 // 00010 // Redistribution and use in source and binary forms, with or without 00011 // modification, are permitted provided that the following conditions are 00012 // met: 00013 // 00014 // 1. Redistributions of source code must retain the above copyright 00015 // notice, this list of conditions and the following disclaimer. 00016 // 00017 // 2. Redistributions in binary form must reproduce the above copyright 00018 // notice, this list of conditions and the following disclaimer in the 00019 // documentation and/or other materials provided with the distribution. 00020 // 00021 // 3. Neither the name of the Corporation nor the names of the 00022 // contributors may be used to endorse or promote products derived from 00023 // this software without specific prior written permission. 00024 // 00025 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY 00026 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00027 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 00028 // PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE 00029 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 00030 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 00031 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 00032 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 00033 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 00034 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 00035 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00036 // 00037 // Questions? Contact Michael A. Heroux (maherou@sandia.gov) 00038 // 00039 // ************************************************************************ 00040 // @HEADER 00041 00042 #ifndef TPETRA_DISTRIBUTOR_HPP 00043 #define TPETRA_DISTRIBUTOR_HPP 00044 00045 #include "Tpetra_Util.hpp" 00046 #include <Teuchos_as.hpp> 00047 #include <Teuchos_Describable.hpp> 00048 #include <Teuchos_ParameterListAcceptorDefaultBase.hpp> 00049 #include <Teuchos_VerboseObject.hpp> 00050 00051 // If TPETRA_DISTRIBUTOR_TIMERS is defined, Distributor will time 00052 // doPosts (both versions) and doWaits, and register those timers with 00053 // Teuchos::TimeMonitor so that summarize() or report() will show 00054 // results. 
00055 00056 // #ifndef TPETRA_DISTRIBUTOR_TIMERS 00057 // # define TPETRA_DISTRIBUTOR_TIMERS 1 00058 // #endif // TPETRA_DISTRIBUTOR_TIMERS 00059 00060 #ifdef TPETRA_DISTRIBUTOR_TIMERS 00061 # undef TPETRA_DISTRIBUTOR_TIMERS 00062 #endif // TPETRA_DISTRIBUTOR_TIMERS 00063 00064 #if TPETRA_USE_KOKKOS_DISTOBJECT || defined(TPETRA_HAVE_KOKKOS_REFACTOR) 00065 #include "KokkosCompat_View.hpp" 00066 #include "Kokkos_Core.hpp" 00067 #include "Kokkos_TeuchosCommAdapters.hpp" 00068 #endif 00069 00070 00071 namespace Tpetra { 00072 00073 namespace Details { 00078 enum EDistributorSendType { 00079 DISTRIBUTOR_ISEND, // Use MPI_Isend (Teuchos::isend) 00080 DISTRIBUTOR_RSEND, // Use MPI_Rsend (Teuchos::readySend) 00081 DISTRIBUTOR_SEND, // Use MPI_Send (Teuchos::send) 00082 DISTRIBUTOR_SSEND // Use MPI_Ssend (Teuchos::ssend) 00083 }; 00084 00089 std::string 00090 DistributorSendTypeEnumToString (EDistributorSendType sendType); 00091 00096 enum EDistributorHowInitialized { 00097 DISTRIBUTOR_NOT_INITIALIZED, // Not initialized yet 00098 DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS, // By createFromSends 00099 DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS, // By createFromRecvs 00100 DISTRIBUTOR_INITIALIZED_BY_REVERSE, // By createReverseDistributor 00101 DISTRIBUTOR_INITIALIZED_BY_COPY // By copy constructor 00102 }; 00103 00108 std::string 00109 DistributorHowInitializedEnumToString (EDistributorHowInitialized how); 00110 00111 } // namespace Details 00112 00119 Array<std::string> distributorSendTypes (); 00120 00188 class Distributor : 00189 public Teuchos::Describable, 00190 public Teuchos::ParameterListAcceptorDefaultBase, 00191 public Teuchos::VerboseObject<Distributor> { 00192 public: 00194 00195 00203 explicit Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm); 00204 00214 Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm, 00215 const Teuchos::RCP<Teuchos::FancyOStream>& out); 00216 00229 Distributor (const Teuchos::RCP<const Teuchos::Comm<int> 
>& comm, 00230 const Teuchos::RCP<Teuchos::ParameterList>& plist); 00231 00246 Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm, 00247 const Teuchos::RCP<Teuchos::FancyOStream>& out, 00248 const Teuchos::RCP<Teuchos::ParameterList>& plist); 00249 00251 Distributor (const Distributor &distributor); 00252 00254 virtual ~Distributor (); 00255 00261 void swap (Distributor& rhs); 00262 00264 00265 00266 00271 void setParameterList (const Teuchos::RCP<Teuchos::ParameterList>& plist); 00272 00277 Teuchos::RCP<const Teuchos::ParameterList> getValidParameters () const; 00278 00280 00281 00282 00302 size_t createFromSends (const ArrayView<const int>& exportNodeIDs); 00303 00337 template <class Ordinal> 00338 void 00339 createFromRecvs (const ArrayView<const Ordinal>& remoteIDs, 00340 const ArrayView<const int>& remoteNodeIDs, 00341 Array<Ordinal>& exportIDs, 00342 Array<int>& exportNodeIDs); 00343 00345 00346 00347 00351 size_t getNumReceives() const; 00352 00356 size_t getNumSends() const; 00357 00359 bool hasSelfMessage() const; 00360 00362 size_t getMaxSendLength() const; 00363 00365 size_t getTotalReceiveLength() const; 00366 00371 ArrayView<const int> getImagesFrom() const; 00372 00377 ArrayView<const int> getImagesTo() const; 00378 00386 ArrayView<const size_t> getLengthsFrom() const; 00387 00395 ArrayView<const size_t> getLengthsTo() const; 00396 00401 Details::EDistributorHowInitialized howInitialized () const { 00402 return howInitialized_; 00403 } 00404 00406 00407 00408 00419 RCP<Distributor> getReverse() const; 00420 00422 00423 00424 00445 template <class Packet> 00446 void 00447 doPostsAndWaits (const ArrayView<const Packet> &exports, 00448 size_t numPackets, 00449 const ArrayView<Packet> &imports); 00450 00472 template <class Packet> 00473 void 00474 doPostsAndWaits (const ArrayView<const Packet> &exports, 00475 const ArrayView<size_t> &numExportPacketsPerLID, 00476 const ArrayView<Packet> &imports, 00477 const ArrayView<size_t> 
&numImportPacketsPerLID); 00478 00503 template <class Packet> 00504 void 00505 doPosts (const ArrayRCP<const Packet> &exports, 00506 size_t numPackets, 00507 const ArrayRCP<Packet> &imports); 00508 00527 template <class Packet> 00528 void 00529 doPosts (const ArrayRCP<const Packet> &exports, 00530 const ArrayView<size_t> &numExportPacketsPerLID, 00531 const ArrayRCP<Packet> &imports, 00532 const ArrayView<size_t> &numImportPacketsPerLID); 00533 00540 void doWaits (); 00541 00546 template <class Packet> 00547 void 00548 doReversePostsAndWaits (const ArrayView<const Packet> &exports, 00549 size_t numPackets, 00550 const ArrayView<Packet> &imports); 00551 00556 template <class Packet> 00557 void 00558 doReversePostsAndWaits (const ArrayView<const Packet> &exports, 00559 const ArrayView<size_t> &numExportPacketsPerLID, 00560 const ArrayView<Packet> &imports, 00561 const ArrayView<size_t> &numImportPacketsPerLID); 00562 00567 template <class Packet> 00568 void 00569 doReversePosts (const ArrayRCP<const Packet> &exports, 00570 size_t numPackets, 00571 const ArrayRCP<Packet> &imports); 00572 00577 template <class Packet> 00578 void 00579 doReversePosts (const ArrayRCP<const Packet> &exports, 00580 const ArrayView<size_t> &numExportPacketsPerLID, 00581 const ArrayRCP<Packet> &imports, 00582 const ArrayView<size_t> &numImportPacketsPerLID); 00583 00590 void doReverseWaits (); 00591 00592 #if TPETRA_USE_KOKKOS_DISTOBJECT || defined(TPETRA_HAVE_KOKKOS_REFACTOR) 00593 00614 template <class Packet, class Layout, class Device, class Mem> 00615 void 00616 doPostsAndWaits ( 00617 const Kokkos::View<const Packet*, Layout, Device, Mem> &exports, 00618 size_t numPackets, 00619 const Kokkos::View<Packet*, Layout, Device, Mem> &imports); 00620 00642 template <class Packet, class Layout, class Device, class Mem> 00643 void 00644 doPostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports, 00645 const ArrayView<size_t> &numExportPacketsPerLID, 00646 const 
Kokkos::View<Packet*, Layout, Device, Mem> &imports, 00647 const ArrayView<size_t> &numImportPacketsPerLID); 00648 00673 template <class Packet, class Layout, class Device, class Mem> 00674 void 00675 doPosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports, 00676 size_t numPackets, 00677 const Kokkos::View<Packet*, Layout, Device, Mem> &imports); 00678 00697 template <class Packet, class Layout, class Device, class Mem> 00698 void 00699 doPosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports, 00700 const ArrayView<size_t> &numExportPacketsPerLID, 00701 const Kokkos::View<Packet*, Layout, Device, Mem> &imports, 00702 const ArrayView<size_t> &numImportPacketsPerLID); 00703 00708 template <class Packet, class Layout, class Device, class Mem> 00709 void 00710 doReversePostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports, 00711 size_t numPackets, 00712 const Kokkos::View<Packet*, Layout, Device, Mem> &imports); 00713 00718 template <class Packet, class Layout, class Device, class Mem> 00719 void 00720 doReversePostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports, 00721 const ArrayView<size_t> &numExportPacketsPerLID, 00722 const Kokkos::View<Packet*, Layout, Device, Mem> &imports, 00723 const ArrayView<size_t> &numImportPacketsPerLID); 00724 00729 template <class Packet, class Layout, class Device, class Mem> 00730 void 00731 doReversePosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports, 00732 size_t numPackets, 00733 const Kokkos::View<Packet*, Layout, Device, Mem> &imports); 00734 00739 template <class Packet, class Layout, class Device, class Mem> 00740 void 00741 doReversePosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports, 00742 const ArrayView<size_t> &numExportPacketsPerLID, 00743 const Kokkos::View<Packet*, Layout, Device, Mem> &imports, 00744 const ArrayView<size_t> &numImportPacketsPerLID); 00745 #endif 00746 00750 void 
getLastDoStatistics(size_t & bytes_sent, size_t & bytes_recvd) const{ 00751 bytes_sent = lastRoundBytesSend_; 00752 bytes_recvd = lastRoundBytesRecv_; 00753 } 00754 00755 00757 00758 00759 00761 std::string description() const; 00762 00764 void describe (Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const; 00765 00767 00768 private: 00770 RCP<const Comm<int> > comm_; 00771 00773 Teuchos::RCP<Teuchos::FancyOStream> out_; 00774 00776 Details::EDistributorHowInitialized howInitialized_; 00777 00779 00780 00782 Details::EDistributorSendType sendType_; 00783 00785 bool barrierBetween_; 00786 00788 bool debug_; 00789 00791 00800 bool enable_cuda_rdma_; 00801 00803 00812 size_t numExports_; 00813 00817 bool selfMessage_; 00818 00828 size_t numSends_; 00829 00834 Array<int> imagesTo_; 00835 00844 Array<size_t> startsTo_; 00845 00851 Array<size_t> lengthsTo_; 00852 00856 size_t maxSendLength_; 00857 00873 Array<size_t> indicesTo_; 00874 00884 size_t numReceives_; 00885 00892 size_t totalReceiveLength_; 00893 00899 Array<size_t> lengthsFrom_; 00900 00906 Array<int> imagesFrom_; 00907 00913 Array<size_t> startsFrom_; 00914 00921 Array<size_t> indicesFrom_; 00922 00929 Array<RCP<Teuchos::CommRequest<int> > > requests_; 00930 00935 mutable RCP<Distributor> reverseDistributor_; 00936 00937 00939 size_t lastRoundBytesSend_; 00940 00942 size_t lastRoundBytesRecv_; 00943 00944 #ifdef TPETRA_DISTRIBUTOR_TIMERS 00945 Teuchos::RCP<Teuchos::Time> timer_doPosts3_; 00946 Teuchos::RCP<Teuchos::Time> timer_doPosts4_; 00947 Teuchos::RCP<Teuchos::Time> timer_doWaits_; 00948 Teuchos::RCP<Teuchos::Time> timer_doPosts3_recvs_; 00949 Teuchos::RCP<Teuchos::Time> timer_doPosts4_recvs_; 00950 Teuchos::RCP<Teuchos::Time> timer_doPosts3_barrier_; 00951 Teuchos::RCP<Teuchos::Time> timer_doPosts4_barrier_; 00952 Teuchos::RCP<Teuchos::Time> timer_doPosts3_sends_; 00953 Teuchos::RCP<Teuchos::Time> timer_doPosts4_sends_; 00954 00956 void 
makeTimers (); 00957 #endif // TPETRA_DISTRIBUTOR_TIMERS 00958 00970 bool useDistinctTags_; 00971 00976 int getTag (const int pathTag) const; 00977 00991 void 00992 init (const Teuchos::RCP<const Teuchos::Comm<int> >& comm, 00993 const Teuchos::RCP<Teuchos::ParameterList>& plist); 00994 01005 void computeReceives (); 01006 01019 template <class Ordinal> 01020 void computeSends (const ArrayView<const Ordinal> &importIDs, 01021 const ArrayView<const int> &importNodeIDs, 01022 Array<Ordinal> &exportIDs, 01023 Array<int> &exportNodeIDs); 01024 01026 void createReverseDistributor() const; 01027 01028 }; // class Distributor 01029 01030 01031 template <class Packet> 01032 void Distributor:: 01033 doPostsAndWaits (const ArrayView<const Packet>& exports, 01034 size_t numPackets, 01035 const ArrayView<Packet>& imports) 01036 { 01037 using Teuchos::arcp; 01038 using Teuchos::ArrayRCP; 01039 typedef typename ArrayRCP<const Packet>::size_type size_type; 01040 01041 TEUCHOS_TEST_FOR_EXCEPTION( 01042 requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::" 01043 "doPostsAndWaits(3 args): There are " << requests_.size () << 01044 " outstanding nonblocking messages pending. It is incorrect to call " 01045 "this method with posts outstanding."); 01046 01047 // doPosts() accepts the exports and imports arrays as ArrayRCPs, 01048 // requiring that the memory location is persisting (as is 01049 // necessary for nonblocking receives). However, it need only 01050 // persist until doWaits() completes, so it is safe for us to use 01051 // a nonpersisting reference in this case. The use of a 01052 // nonpersisting reference is purely a performance optimization. 
01053 01054 //const Packet* exportsPtr = exports.getRawPtr(); 01055 //ArrayRCP<const Packet> exportsArcp (exportsPtr, static_cast<size_type> (0), 01056 // exports.size(), false); 01057 ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (), 01058 static_cast<size_type> (0), 01059 exports.size(), false); 01060 01061 // For some reason, neither of the options below (that use arcp) 01062 // compile for Packet=std::complex<double> with GCC 4.5.1. The 01063 // issue only arises with the exports array. This is why we 01064 // construct a separate nonowning ArrayRCP. 01065 01066 // doPosts (arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false), 01067 // numPackets, 01068 // arcp<Packet> (imports.getRawPtr(), 0, imports.size(), false)); 01069 // doPosts (arcp<const Packet> (exportsPtr, 0, exports.size(), false), 01070 // numPackets, 01071 // arcp<Packet> (imports.getRawPtr(), 0, imports.size(), false)); 01072 doPosts (exportsArcp, 01073 numPackets, 01074 arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false)); 01075 doWaits (); 01076 01077 lastRoundBytesSend_ = exports.size () * sizeof (Packet); 01078 lastRoundBytesRecv_ = imports.size () * sizeof (Packet); 01079 } 01080 01081 template <class Packet> 01082 void Distributor:: 01083 doPostsAndWaits (const ArrayView<const Packet>& exports, 01084 const ArrayView<size_t> &numExportPacketsPerLID, 01085 const ArrayView<Packet> &imports, 01086 const ArrayView<size_t> &numImportPacketsPerLID) 01087 { 01088 using Teuchos::arcp; 01089 using Teuchos::ArrayRCP; 01090 01091 TEUCHOS_TEST_FOR_EXCEPTION( 01092 requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::" 01093 "doPostsAndWaits: There are " << requests_.size () << " outstanding " 01094 "nonblocking messages pending. 
It is incorrect to call doPostsAndWaits " 01095 "with posts outstanding."); 01096 01097 // doPosts() accepts the exports and imports arrays as ArrayRCPs, 01098 // requiring that the memory location is persisting (as is 01099 // necessary for nonblocking receives). However, it need only 01100 // persist until doWaits() completes, so it is safe for us to use 01101 // a nonpersisting reference in this case. 01102 01103 // mfh 04 Apr 2012: For some reason, calling arcp<const Packet> 01104 // for Packet=std::complex<T> (e.g., T=float) fails to compile 01105 // with some versions of GCC. The issue only arises with the 01106 // exports array. This is why we construct a separate nonowning 01107 // ArrayRCP. 01108 typedef typename ArrayRCP<const Packet>::size_type size_type; 01109 ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (), 01110 static_cast<size_type> (0), 01111 exports.size (), false); 01112 // mfh 04 Apr 2012: This is the offending code. This statement 01113 // would normally be in place of "exportsArcp" in the 01114 // doPosts() call below. 
01115 //arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false), 01116 doPosts (exportsArcp, 01117 numExportPacketsPerLID, 01118 arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false), 01119 numImportPacketsPerLID); 01120 doWaits (); 01121 01122 lastRoundBytesSend_ = exports.size () * sizeof (Packet); 01123 lastRoundBytesRecv_ = imports.size () * sizeof (Packet); 01124 } 01125 01126 01127 template <class Packet> 01128 void Distributor:: 01129 doPosts (const ArrayRCP<const Packet>& exports, 01130 size_t numPackets, 01131 const ArrayRCP<Packet>& imports) 01132 { 01133 using Teuchos::Array; 01134 using Teuchos::as; 01135 using Teuchos::FancyOStream; 01136 using Teuchos::includesVerbLevel; 01137 using Teuchos::ireceive; 01138 using Teuchos::isend; 01139 using Teuchos::OSTab; 01140 using Teuchos::readySend; 01141 using Teuchos::send; 01142 using Teuchos::ssend; 01143 using Teuchos::TypeNameTraits; 01144 using Teuchos::typeName; 01145 using std::endl; 01146 typedef Array<size_t>::size_type size_type; 01147 01148 Teuchos::OSTab tab (out_); 01149 01150 #ifdef TPETRA_DISTRIBUTOR_TIMERS 01151 Teuchos::TimeMonitor timeMon (*timer_doPosts3_); 01152 #endif // TPETRA_DISTRIBUTOR_TIMERS 01153 01154 // Run-time configurable parameters that come from the input 01155 // ParameterList set by setParameterList(). 01156 const Details::EDistributorSendType sendType = sendType_; 01157 const bool doBarrier = barrierBetween_; 01158 01159 // #ifdef HAVE_TEUCHOS_DEBUG 01160 // // Prepare for verbose output, if applicable. 01161 // Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel (); 01162 // (void) verbLevel; // Silence "unused variable" compiler warning. 
01163 // RCP<FancyOStream> out = this->getOStream (); 01164 // // const bool doPrint = out.get () && (comm_->getRank () == 0) && 01165 // // includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true); 01166 // const bool doPrint = out.get () && (comm_->getRank () == 0); 01167 01168 // if (doPrint) { 01169 // // Only need one process to print out parameters. 01170 // *out << "Distributor::doPosts (3 args)" << endl; 01171 // } 01172 // // Add one tab level. We declare this outside the doPrint scopes 01173 // // so that the tab persists until the end of this method. 01174 // OSTab tab = this->getOSTab (); 01175 // if (doPrint) { 01176 // *out << "Parameters:" << endl; 01177 // { 01178 // OSTab tab2 (out); 01179 // *out << "sendType: " << DistributorSendTypeEnumToString (sendType) 01180 // << endl << "barrierBetween: " << doBarrier << endl; 01181 // } 01182 // } 01183 // #endif // HAVE_TEUCHOS_DEBUG 01184 01185 TEUCHOS_TEST_FOR_EXCEPTION( 01186 sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier, std::logic_error, 01187 "Tpetra::Distributor::doPosts(3 args): Ready-send version requires a " 01188 "barrier between posting receives and posting ready sends. This should " 01189 "have been checked before. " 01190 "Please report this bug to the Tpetra developers."); 01191 01192 const int myImageID = comm_->getRank (); 01193 size_t selfReceiveOffset = 0; 01194 01195 // Each message has the same number of packets. 01196 // 01197 // FIXME (mfh 18 Jul 2014): Relaxing this test from strict 01198 // inequality to a less-than seems to have fixed Bug 6170. It's 01199 // OK for the 'imports' array to be longer than it needs to be; 01200 // I'm just curious why it would be. 
01201 const size_t totalNumImportPackets = totalReceiveLength_ * numPackets; 01202 TEUCHOS_TEST_FOR_EXCEPTION( 01203 static_cast<size_t> (imports.size ()) < totalNumImportPackets, 01204 std::invalid_argument, "Tpetra::Distributor::doPosts(3 args): The " 01205 "'imports' array must have enough entries to hold the expected number " 01206 "of import packets. imports.size() = " << imports.size () << " < " 01207 "totalNumImportPackets = " << totalNumImportPackets << "."); 01208 01209 // MPI tag for nonblocking receives and blocking sends in this 01210 // method. Some processes might take the "fast" path 01211 // (indicesTo_.empty()) and others might take the "slow" path for 01212 // the same doPosts() call, so the path tag must be the same for 01213 // both. 01214 const int pathTag = 0; 01215 const int tag = this->getTag (pathTag); 01216 01217 if (debug_) { 01218 TEUCHOS_TEST_FOR_EXCEPTION( 01219 requests_.size () != 0, std::logic_error, "Tpetra::Distributor::" 01220 "doPosts(3 args): Process " << myImageID << ": requests_.size() = " 01221 << requests_.size () << " != 0."); 01222 std::ostringstream os; 01223 os << myImageID << ": doPosts(3," 01224 << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl; 01225 *out_ << os.str (); 01226 } 01227 01228 // Distributor uses requests_.size() as the number of outstanding 01229 // nonblocking message requests, so we resize to zero to maintain 01230 // this invariant. 01231 // 01232 // numReceives_ does _not_ include the self message, if there is 01233 // one. Here, we do actually send a message to ourselves, so we 01234 // include any self message in the "actual" number of receives to 01235 // post. 01236 // 01237 // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts() 01238 // doesn't (re)allocate its array of requests. That happens in 01239 // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on 01240 // demand), or Resize_(). 
01241 const size_type actualNumReceives = as<size_type> (numReceives_) + 01242 as<size_type> (selfMessage_ ? 1 : 0); 01243 requests_.resize (0); 01244 01245 // Post the nonblocking receives. It's common MPI wisdom to post 01246 // receives before sends. In MPI terms, this means favoring 01247 // adding to the "posted queue" (of receive requests) over adding 01248 // to the "unexpected queue" (of arrived messages not yet matched 01249 // with a receive). 01250 { 01251 #ifdef TPETRA_DISTRIBUTOR_TIMERS 01252 Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts3_recvs_); 01253 #endif // TPETRA_DISTRIBUTOR_TIMERS 01254 01255 size_t curBufOffset = 0; 01256 for (size_type i = 0; i < actualNumReceives; ++i) { 01257 const size_t curBufLen = lengthsFrom_[i] * numPackets; 01258 if (imagesFrom_[i] != myImageID) { 01259 // If my process is receiving these packet(s) from another 01260 // process (not a self-receive): 01261 // 01262 // 1. Set up the persisting view (recvBuf) of the imports 01263 // array, given the offset and size (total number of 01264 // packets from process imagesFrom_[i]). 01265 // 2. Start the Irecv and save the resulting request. 01266 TEUCHOS_TEST_FOR_EXCEPTION( 01267 curBufOffset + curBufLen > static_cast<size_t> (imports.size ()), 01268 std::logic_error, "Tpetra::Distributor::doPosts(3 args): Exceeded " 01269 "size of 'imports' array in packing loop on Process " << myImageID 01270 << ". imports.size() = " << imports.size () << " < offset + length" 01271 " = " << (curBufOffset + curBufLen) << "."); 01272 01273 ArrayRCP<Packet> recvBuf = 01274 imports.persistingView (curBufOffset, curBufLen); 01275 requests_.push_back (ireceive<int, Packet> (recvBuf, imagesFrom_[i], 01276 tag, *comm_)); 01277 if (debug_) { 01278 std::ostringstream os; 01279 os << myImageID << ": doPosts(3," 01280 << (indicesTo_.empty () ? 
"fast" : "slow") << "): " 01281 << "Posted irecv from Proc " << imagesFrom_[i] << " with " 01282 "specified tag " << tag << endl; 01283 *out_ << os.str (); 01284 } 01285 } 01286 else { // Receiving from myself 01287 selfReceiveOffset = curBufOffset; // Remember the self-recv offset 01288 } 01289 curBufOffset += curBufLen; 01290 } 01291 } 01292 01293 if (doBarrier) { 01294 #ifdef TPETRA_DISTRIBUTOR_TIMERS 01295 Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts3_barrier_); 01296 #endif // TPETRA_DISTRIBUTOR_TIMERS 01297 // If we are using ready sends (MPI_Rsend) below, we need to do 01298 // a barrier before we post the ready sends. This is because a 01299 // ready send requires that its matching receive has already 01300 // been posted before the send has been posted. The only way to 01301 // guarantee that in this case is to use a barrier. 01302 comm_->barrier (); 01303 } 01304 01305 #ifdef TPETRA_DISTRIBUTOR_TIMERS 01306 Teuchos::TimeMonitor timeMonSends (*timer_doPosts3_sends_); 01307 #endif // TPETRA_DISTRIBUTOR_TIMERS 01308 01309 // setup scan through imagesTo_ list starting with higher numbered images 01310 // (should help balance message traffic) 01311 // 01312 // FIXME (mfh 20 Feb 2013) Why haven't we precomputed this? 01313 // It doesn't depend on the input at all. 01314 size_t numBlocks = numSends_ + selfMessage_; 01315 size_t imageIndex = 0; 01316 while ((imageIndex < numBlocks) && (imagesTo_[imageIndex] < myImageID)) { 01317 ++imageIndex; 01318 } 01319 if (imageIndex == numBlocks) { 01320 imageIndex = 0; 01321 } 01322 01323 size_t selfNum = 0; 01324 size_t selfIndex = 0; 01325 01326 if (indicesTo_.empty()) { 01327 if (debug_) { 01328 std::ostringstream os; 01329 os << myImageID << ": doPosts(3,fast): posting sends" << endl; 01330 *out_ << os.str (); 01331 } 01332 01333 // Data are already blocked (laid out) by process, so we don't 01334 // need a separate send buffer (besides the exports array). 
01335 for (size_t i = 0; i < numBlocks; ++i) { 01336 size_t p = i + imageIndex; 01337 if (p > (numBlocks - 1)) { 01338 p -= numBlocks; 01339 } 01340 01341 if (imagesTo_[p] != myImageID) { 01342 ArrayView<const Packet> tmpSend = 01343 exports.view (startsTo_[p]*numPackets, lengthsTo_[p]*numPackets); 01344 01345 if (sendType == Details::DISTRIBUTOR_SEND) { 01346 send<int, Packet> (tmpSend.getRawPtr (), 01347 as<int> (tmpSend.size ()), 01348 imagesTo_[p], tag, *comm_); 01349 } 01350 else if (sendType == Details::DISTRIBUTOR_ISEND) { 01351 ArrayRCP<const Packet> tmpSendBuf = 01352 exports.persistingView (startsTo_[p] * numPackets, 01353 lengthsTo_[p] * numPackets); 01354 requests_.push_back (isend<int, Packet> (tmpSendBuf, imagesTo_[p], 01355 tag, *comm_)); 01356 } 01357 else if (sendType == Details::DISTRIBUTOR_RSEND) { 01358 readySend<int, Packet> (tmpSend.getRawPtr (), 01359 as<int> (tmpSend.size ()), 01360 imagesTo_[p], tag, *comm_); 01361 } 01362 else if (sendType == Details::DISTRIBUTOR_SSEND) { 01363 ssend<int, Packet> (tmpSend.getRawPtr (), 01364 as<int> (tmpSend.size ()), 01365 imagesTo_[p], tag, *comm_); 01366 } else { 01367 TEUCHOS_TEST_FOR_EXCEPTION( 01368 true, std::logic_error, "Tpetra::Distributor::doPosts(3 args): " 01369 "Invalid send type. We should never get here. " 01370 "Please report this bug to the Tpetra developers."); 01371 } 01372 01373 if (debug_) { 01374 std::ostringstream os; 01375 os << myImageID << ": doPosts(3,fast): " 01376 << "Posted send to Proc " << imagesTo_[i] 01377 << " w/ specified tag " << tag << endl; 01378 *out_ << os.str (); 01379 } 01380 } 01381 else { // "Sending" the message to myself 01382 selfNum = p; 01383 } 01384 } 01385 01386 if (selfMessage_) { 01387 // This is how we "send a message to ourself": we copy from 01388 // the export buffer to the import buffer. 
That saves 01389 // Teuchos::Comm implementations other than MpiComm (in 01390 // particular, SerialComm) the trouble of implementing self 01391 // messages correctly. (To do this right, SerialComm would 01392 // need internal buffer space for messages, keyed on the 01393 // message's tag.) 01394 std::copy (exports.begin()+startsTo_[selfNum]*numPackets, 01395 exports.begin()+startsTo_[selfNum]*numPackets+lengthsTo_[selfNum]*numPackets, 01396 imports.begin()+selfReceiveOffset); 01397 } 01398 if (debug_) { 01399 std::ostringstream os; 01400 os << myImageID << ": doPosts(3,fast) done" << endl; 01401 *out_ << os.str (); 01402 } 01403 } 01404 else { // data are not blocked by image, use send buffer 01405 if (debug_) { 01406 std::ostringstream os; 01407 os << myImageID << ": doPosts(3,slow): posting sends" << endl; 01408 *out_ << os.str (); 01409 } 01410 01411 // FIXME (mfh 05 Mar 2013) This is broken for Isend (nonblocking 01412 // sends), because the buffer is only long enough for one send. 
01413 ArrayRCP<Packet> sendArray (maxSendLength_ * numPackets); // send buffer 01414 01415 TEUCHOS_TEST_FOR_EXCEPTION( 01416 sendType == Details::DISTRIBUTOR_ISEND, std::logic_error, 01417 "Tpetra::Distributor::doPosts(3 args): The \"send buffer\" code path " 01418 << "doesn't currently work with nonblocking sends."); 01419 01420 for (size_t i = 0; i < numBlocks; ++i) { 01421 size_t p = i + imageIndex; 01422 if (p > (numBlocks - 1)) { 01423 p -= numBlocks; 01424 } 01425 01426 if (imagesTo_[p] != myImageID) { 01427 typename ArrayView<const Packet>::iterator srcBegin, srcEnd; 01428 size_t sendArrayOffset = 0; 01429 size_t j = startsTo_[p]; 01430 for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) { 01431 srcBegin = exports.begin() + indicesTo_[j]*numPackets; 01432 srcEnd = srcBegin + numPackets; 01433 std::copy (srcBegin, srcEnd, sendArray.begin()+sendArrayOffset); 01434 sendArrayOffset += numPackets; 01435 } 01436 ArrayView<const Packet> tmpSend = 01437 sendArray.view (0, lengthsTo_[p]*numPackets); 01438 01439 if (sendType == Details::DISTRIBUTOR_SEND) { 01440 send<int, Packet> (tmpSend.getRawPtr (), 01441 as<int> (tmpSend.size ()), 01442 imagesTo_[p], tag, *comm_); 01443 } 01444 else if (sendType == Details::DISTRIBUTOR_ISEND) { 01445 ArrayRCP<const Packet> tmpSendBuf = 01446 sendArray.persistingView (0, lengthsTo_[p] * numPackets); 01447 requests_.push_back (isend<int, Packet> (tmpSendBuf, imagesTo_[p], 01448 tag, *comm_)); 01449 } 01450 else if (sendType == Details::DISTRIBUTOR_RSEND) { 01451 readySend<int, Packet> (tmpSend.getRawPtr (), 01452 as<int> (tmpSend.size ()), 01453 imagesTo_[p], tag, *comm_); 01454 } 01455 else if (sendType == Details::DISTRIBUTOR_SSEND) { 01456 ssend<int, Packet> (tmpSend.getRawPtr (), 01457 as<int> (tmpSend.size ()), 01458 imagesTo_[p], tag, *comm_); 01459 } 01460 else { 01461 TEUCHOS_TEST_FOR_EXCEPTION( 01462 true, std::logic_error, "Tpetra::Distributor::doPosts(3 args): " 01463 "Invalid send type. We should never get here. 
" 01464 "Please report this bug to the Tpetra developers."); 01465 } 01466 01467 if (debug_) { 01468 std::ostringstream os; 01469 os << myImageID << ": doPosts(3,slow): " 01470 << "Posted send to Proc " << imagesTo_[i] 01471 << " w/ specified tag " << tag << endl; 01472 *out_ << os.str (); 01473 } 01474 } 01475 else { // "Sending" the message to myself 01476 selfNum = p; 01477 selfIndex = startsTo_[p]; 01478 } 01479 } 01480 01481 if (selfMessage_) { 01482 for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) { 01483 std::copy (exports.begin()+indicesTo_[selfIndex]*numPackets, 01484 exports.begin()+indicesTo_[selfIndex]*numPackets + numPackets, 01485 imports.begin() + selfReceiveOffset); 01486 ++selfIndex; 01487 selfReceiveOffset += numPackets; 01488 } 01489 } 01490 if (debug_) { 01491 std::ostringstream os; 01492 os << myImageID << ": doPosts(3,slow) done" << endl; 01493 *out_ << os.str (); 01494 } 01495 } 01496 } 01497 01498 template <class Packet> 01499 void Distributor:: 01500 doPosts (const ArrayRCP<const Packet>& exports, 01501 const ArrayView<size_t>& numExportPacketsPerLID, 01502 const ArrayRCP<Packet>& imports, 01503 const ArrayView<size_t>& numImportPacketsPerLID) 01504 { 01505 using Teuchos::Array; 01506 using Teuchos::as; 01507 using Teuchos::ireceive; 01508 using Teuchos::isend; 01509 using Teuchos::readySend; 01510 using Teuchos::send; 01511 using Teuchos::ssend; 01512 using Teuchos::TypeNameTraits; 01513 #ifdef HAVE_TEUCHOS_DEBUG 01514 using Teuchos::OSTab; 01515 #endif // HAVE_TEUCHOS_DEBUG 01516 using std::endl; 01517 typedef Array<size_t>::size_type size_type; 01518 01519 Teuchos::OSTab tab (out_); 01520 01521 #ifdef TPETRA_DISTRIBUTOR_TIMERS 01522 Teuchos::TimeMonitor timeMon (*timer_doPosts4_); 01523 #endif // TPETRA_DISTRIBUTOR_TIMERS 01524 01525 // Run-time configurable parameters that come from the input 01526 // ParameterList set by setParameterList(). 
    const Details::EDistributorSendType sendType = sendType_;
    const bool doBarrier = barrierBetween_;

// #ifdef HAVE_TEUCHOS_DEBUG
//     // Prepare for verbose output, if applicable.
//     Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
//     RCP<Teuchos::FancyOStream> out = this->getOStream ();
//     const bool doPrint = out.get () && (comm_->getRank () == 0) &&
//       includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);

//     if (doPrint) {
//       // Only need one process to print out parameters.
//       *out << "Distributor::doPosts (4 args)" << endl;
//     }
//     // Add one tab level. We declare this outside the doPrint scopes
//     // so that the tab persists until the end of this method.
//     Teuchos::OSTab tab = this->getOSTab ();
//     if (doPrint) {
//       *out << "Parameters:" << endl;
//       {
//         OSTab tab2 (out);
//         *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
//              << endl << "barrierBetween: " << doBarrier << endl;
//       }
//     }
// #endif // HAVE_TEUCHOS_DEBUG

    // A ready send (MPI_Rsend) requires that its matching receive has
    // already been posted, and the only way this code can guarantee
    // that is the barrier below.  setParameterList() enforces this
    // combination, so reaching this exception indicates an internal bug.
    TEUCHOS_TEST_FOR_EXCEPTION(
      sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier, std::logic_error,
      "Tpetra::Distributor::doPosts(4 args): Ready-send version requires a "
      "barrier between posting receives and posting ready sends. This should "
      "have been checked before. "
      "Please report this bug to the Tpetra developers.");

    const int myImageID = comm_->getRank ();
    size_t selfReceiveOffset = 0;

#ifdef HAVE_TEUCHOS_DEBUG
    // Different messages may have different numbers of packets.
    size_t totalNumImportPackets = 0;
    for (int ii = 0; ii < numImportPacketsPerLID.size(); ++ii) {
      totalNumImportPackets += numImportPacketsPerLID[ii];
    }
    TEUCHOS_TEST_FOR_EXCEPTION(
      static_cast<size_t> (imports.size ()) < totalNumImportPackets,
      std::runtime_error, "Tpetra::Distributor::doPosts(4 args): The 'imports' "
      "array must have enough entries to hold the expected number of import "
      "packets. imports.size() = " << imports.size() << " < "
      "totalNumImportPackets = " << totalNumImportPackets << ".");
#endif // HAVE_TEUCHOS_DEBUG

    // MPI tag for nonblocking receives and blocking sends in this
    // method. Some processes might take the "fast" path
    // (indicesTo_.empty()) and others might take the "slow" path for
    // the same doPosts() call, so the path tag must be the same for
    // both.
    const int pathTag = 1;
    const int tag = this->getTag (pathTag);

    if (debug_) {
      TEUCHOS_TEST_FOR_EXCEPTION(
        requests_.size () != 0, std::logic_error, "Tpetra::Distributor::"
        "doPosts(4 args): Process " << myImageID << ": requests_.size() = "
        << requests_.size () << " != 0.");
      std::ostringstream os;
      os << myImageID << ": doPosts(4,"
         << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl;
      *out_ << os.str ();
    }

    // Distributor uses requests_.size() as the number of outstanding
    // nonblocking message requests, so we resize to zero to maintain
    // this invariant.
    //
    // numReceives_ does _not_ include the self message, if there is
    // one. Here, we do actually send a message to ourselves, so we
    // include any self message in the "actual" number of receives to
    // post.
    //
    // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
    // doesn't (re)allocate its array of requests. That happens in
    // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
    // demand), or Resize_().
    const size_type actualNumReceives = as<size_type> (numReceives_) +
      as<size_type> (selfMessage_ ? 1 : 0);
    requests_.resize (0);

    // Post the nonblocking receives. It's common MPI wisdom to post
    // receives before sends. In MPI terms, this means favoring
    // adding to the "posted queue" (of receive requests) over adding
    // to the "unexpected queue" (of arrived messages not yet matched
    // with a receive).
    {
#ifdef TPETRA_DISTRIBUTOR_TIMERS
      Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts4_recvs_);
#endif // TPETRA_DISTRIBUTOR_TIMERS

      size_t curBufferOffset = 0;
      size_t curLIDoffset = 0;
      for (size_type i = 0; i < actualNumReceives; ++i) {
        // Each message may carry a different number of packets per
        // LID, so sum over the LIDs in this message to get its size.
        size_t totalPacketsFrom_i = 0;
        for (size_t j = 0; j < lengthsFrom_[i]; ++j) {
          totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset+j];
        }
        curLIDoffset += lengthsFrom_[i];
        if (imagesFrom_[i] != myImageID && totalPacketsFrom_i) {
          // If my process is receiving these packet(s) from another
          // process (not a self-receive), and if there is at least
          // one packet to receive:
          //
          // 1. Set up the persisting view (recvBuf) into the imports
          //    array, given the offset and size (total number of
          //    packets from process imagesFrom_[i]).
          // 2. Start the Irecv and save the resulting request.
          ArrayRCP<Packet> recvBuf =
            imports.persistingView (curBufferOffset, totalPacketsFrom_i);
          requests_.push_back (ireceive<int, Packet> (recvBuf, imagesFrom_[i],
                                                      tag, *comm_));
        }
        else { // Receiving these packet(s) from myself
          selfReceiveOffset = curBufferOffset; // Remember the offset
        }
        curBufferOffset += totalPacketsFrom_i;
      }
    }

    if (doBarrier) {
#ifdef TPETRA_DISTRIBUTOR_TIMERS
      Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts4_barrier_);
#endif // TPETRA_DISTRIBUTOR_TIMERS
      // If we are using ready sends (MPI_Rsend) below, we need to do
      // a barrier before we post the ready sends. This is because a
      // ready send requires that its matching receive has already
      // been posted before the send has been posted. The only way to
      // guarantee that in this case is to use a barrier.
      comm_->barrier ();
    }

#ifdef TPETRA_DISTRIBUTOR_TIMERS
    Teuchos::TimeMonitor timeMonSends (*timer_doPosts4_sends_);
#endif // TPETRA_DISTRIBUTOR_TIMERS

    // setup arrays containing starting-offsets into exports for each send,
    // and num-packets-to-send for each send.
01671 Array<size_t> sendPacketOffsets(numSends_,0), packetsPerSend(numSends_,0); 01672 size_t maxNumPackets = 0; 01673 size_t curPKToffset = 0; 01674 for (size_t pp=0; pp<numSends_; ++pp) { 01675 sendPacketOffsets[pp] = curPKToffset; 01676 size_t numPackets = 0; 01677 for (size_t j=startsTo_[pp]; j<startsTo_[pp]+lengthsTo_[pp]; ++j) { 01678 numPackets += numExportPacketsPerLID[j]; 01679 } 01680 if (numPackets > maxNumPackets) maxNumPackets = numPackets; 01681 packetsPerSend[pp] = numPackets; 01682 curPKToffset += numPackets; 01683 } 01684 01685 // setup scan through imagesTo_ list starting with higher numbered images 01686 // (should help balance message traffic) 01687 size_t numBlocks = numSends_+ selfMessage_; 01688 size_t imageIndex = 0; 01689 while ((imageIndex < numBlocks) && (imagesTo_[imageIndex] < myImageID)) { 01690 ++imageIndex; 01691 } 01692 if (imageIndex == numBlocks) { 01693 imageIndex = 0; 01694 } 01695 01696 size_t selfNum = 0; 01697 size_t selfIndex = 0; 01698 01699 if (indicesTo_.empty()) { 01700 if (debug_) { 01701 std::ostringstream os; 01702 os << myImageID << ": doPosts(4,fast): posting sends" << endl; 01703 *out_ << os.str (); 01704 } 01705 01706 // Data are already blocked (laid out) by process, so we don't 01707 // need a separate send buffer (besides the exports array). 
01708 for (size_t i = 0; i < numBlocks; ++i) { 01709 size_t p = i + imageIndex; 01710 if (p > (numBlocks - 1)) { 01711 p -= numBlocks; 01712 } 01713 01714 if (imagesTo_[p] != myImageID && packetsPerSend[p] > 0) { 01715 ArrayView<const Packet> tmpSend = 01716 exports.view (sendPacketOffsets[p], packetsPerSend[p]); 01717 01718 if (sendType == Details::DISTRIBUTOR_SEND) { // the default, so put it first 01719 send<int, Packet> (tmpSend.getRawPtr (), 01720 as<int> (tmpSend.size ()), 01721 imagesTo_[p], tag, *comm_); 01722 } 01723 else if (sendType == Details::DISTRIBUTOR_RSEND) { 01724 readySend<int, Packet> (tmpSend.getRawPtr (), 01725 as<int> (tmpSend.size ()), 01726 imagesTo_[p], tag, *comm_); 01727 } 01728 else if (sendType == Details::DISTRIBUTOR_ISEND) { 01729 ArrayRCP<const Packet> tmpSendBuf = 01730 exports.persistingView (sendPacketOffsets[p], packetsPerSend[p]); 01731 requests_.push_back (isend<int, Packet> (tmpSendBuf, imagesTo_[p], 01732 tag, *comm_)); 01733 } 01734 else if (sendType == Details::DISTRIBUTOR_SSEND) { 01735 ssend<int, Packet> (tmpSend.getRawPtr (), 01736 as<int> (tmpSend.size ()), 01737 imagesTo_[p], tag, *comm_); 01738 } 01739 else { 01740 TEUCHOS_TEST_FOR_EXCEPTION( 01741 true, std::logic_error, "Tpetra::Distributor::doPosts(4 args): " 01742 "Invalid send type. We should never get here. 
Please report " 01743 "this bug to the Tpetra developers."); 01744 } 01745 } 01746 else { // "Sending" the message to myself 01747 selfNum = p; 01748 } 01749 } 01750 01751 if (selfMessage_) { 01752 std::copy (exports.begin()+sendPacketOffsets[selfNum], 01753 exports.begin()+sendPacketOffsets[selfNum]+packetsPerSend[selfNum], 01754 imports.begin()+selfReceiveOffset); 01755 } 01756 if (debug_) { 01757 std::ostringstream os; 01758 os << myImageID << ": doPosts(4,fast) done" << endl; 01759 *out_ << os.str (); 01760 } 01761 } 01762 else { // data are not blocked by image, use send buffer 01763 if (debug_) { 01764 std::ostringstream os; 01765 os << myImageID << ": doPosts(4,slow): posting sends" << endl; 01766 *out_ << os.str (); 01767 } 01768 01769 // FIXME (mfh 05 Mar 2013) This may be broken for Isend. 01770 ArrayRCP<Packet> sendArray (maxNumPackets); // send buffer 01771 01772 TEUCHOS_TEST_FOR_EXCEPTION( 01773 sendType == Details::DISTRIBUTOR_ISEND, std::logic_error, 01774 "Tpetra::Distributor::doPosts(3 args): The \"send buffer\" " 01775 "code path may not necessarily work with nonblocking sends."); 01776 01777 Array<size_t> indicesOffsets (numExportPacketsPerLID.size(), 0); 01778 size_t ioffset = 0; 01779 for (int j=0; j<numExportPacketsPerLID.size(); ++j) { 01780 indicesOffsets[j] = ioffset; 01781 ioffset += numExportPacketsPerLID[j]; 01782 } 01783 01784 for (size_t i = 0; i < numBlocks; ++i) { 01785 size_t p = i + imageIndex; 01786 if (p > (numBlocks - 1)) { 01787 p -= numBlocks; 01788 } 01789 01790 if (imagesTo_[p] != myImageID) { 01791 typename ArrayView<const Packet>::iterator srcBegin, srcEnd; 01792 size_t sendArrayOffset = 0; 01793 size_t j = startsTo_[p]; 01794 size_t numPacketsTo_p = 0; 01795 for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) { 01796 srcBegin = exports.begin() + indicesOffsets[j]; 01797 srcEnd = srcBegin + numExportPacketsPerLID[j]; 01798 numPacketsTo_p += numExportPacketsPerLID[j]; 01799 std::copy (srcBegin, srcEnd, 
sendArray.begin()+sendArrayOffset); 01800 sendArrayOffset += numExportPacketsPerLID[j]; 01801 } 01802 if (numPacketsTo_p > 0) { 01803 ArrayView<const Packet> tmpSend = 01804 sendArray.view (0, numPacketsTo_p); 01805 01806 if (sendType == Details::DISTRIBUTOR_RSEND) { 01807 readySend<int, Packet> (tmpSend.getRawPtr (), 01808 as<int> (tmpSend.size ()), 01809 imagesTo_[p], tag, *comm_); 01810 } 01811 else if (sendType == Details::DISTRIBUTOR_ISEND) { 01812 ArrayRCP<const Packet> tmpSendBuf = 01813 sendArray.persistingView (0, numPacketsTo_p); 01814 requests_.push_back (isend<int, Packet> (tmpSendBuf, imagesTo_[p], 01815 tag, *comm_)); 01816 } 01817 else if (sendType == Details::DISTRIBUTOR_SSEND) { 01818 ssend<int, Packet> (tmpSend.getRawPtr (), 01819 as<int> (tmpSend.size ()), 01820 imagesTo_[p], tag, *comm_); 01821 } 01822 else { // if (sendType == Details::DISTRIBUTOR_SSEND) 01823 send<int, Packet> (tmpSend.getRawPtr (), 01824 as<int> (tmpSend.size ()), 01825 imagesTo_[p], tag, *comm_); 01826 } 01827 } 01828 } 01829 else { // "Sending" the message to myself 01830 selfNum = p; 01831 selfIndex = startsTo_[p]; 01832 } 01833 } 01834 01835 if (selfMessage_) { 01836 for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) { 01837 std::copy (exports.begin()+indicesOffsets[selfIndex], 01838 exports.begin()+indicesOffsets[selfIndex]+numExportPacketsPerLID[selfIndex], 01839 imports.begin() + selfReceiveOffset); 01840 selfReceiveOffset += numExportPacketsPerLID[selfIndex]; 01841 ++selfIndex; 01842 } 01843 } 01844 if (debug_) { 01845 std::ostringstream os; 01846 os << myImageID << ": doPosts(4,slow) done" << endl; 01847 *out_ << os.str (); 01848 } 01849 } 01850 } 01851 01852 template <class Packet> 01853 void Distributor:: 01854 doReversePostsAndWaits (const ArrayView<const Packet>& exports, 01855 size_t numPackets, 01856 const ArrayView<Packet>& imports) 01857 { 01858 using Teuchos::as; 01859 01860 // doReversePosts() takes exports and imports as ArrayRCPs, 01861 // requiring that 
the memory locations are persisting. However, 01862 // they need only persist within the scope of that routine, so it 01863 // is safe for us to use nonpersisting references in this case. 01864 01865 // mfh 04 Apr 2012: For some reason, calling arcp<const Packet> 01866 // for Packet=std::complex<T> (e.g., T=float) fails to compile 01867 // with some versions of GCC. The issue only arises with the 01868 // exports array. This is why we construct a separate nonowning 01869 // ArrayRCP. 01870 typedef typename ArrayRCP<const Packet>::size_type size_type; 01871 ArrayRCP<const Packet> exportsArcp (exports.getRawPtr(), as<size_type> (0), 01872 exports.size(), false); 01873 // mfh 04 Apr 2012: This is the offending code. This statement 01874 // would normally be in place of "exportsArcp" in the 01875 // doReversePosts() call below. 01876 //arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false) 01877 doReversePosts (exportsArcp, 01878 numPackets, 01879 arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false)); 01880 doReverseWaits (); 01881 01882 lastRoundBytesSend_ = exports.size() * sizeof(Packet); 01883 lastRoundBytesRecv_ = imports.size() * sizeof(Packet); 01884 } 01885 01886 template <class Packet> 01887 void Distributor:: 01888 doReversePostsAndWaits (const ArrayView<const Packet>& exports, 01889 const ArrayView<size_t> &numExportPacketsPerLID, 01890 const ArrayView<Packet> &imports, 01891 const ArrayView<size_t> &numImportPacketsPerLID) 01892 { 01893 using Teuchos::as; 01894 using Teuchos::arcp; 01895 using Teuchos::ArrayRCP; 01896 01897 TEUCHOS_TEST_FOR_EXCEPTION( 01898 requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::" 01899 "doReversePostsAndWaits(4 args): There are " << requests_.size () 01900 << " outstanding nonblocking messages pending. 
It is incorrect to call " 01901 "this method with posts outstanding."); 01902 01903 // doReversePosts() accepts the exports and imports arrays as 01904 // ArrayRCPs, requiring that the memory location is persisting (as 01905 // is necessary for nonblocking receives). However, it need only 01906 // persist until doReverseWaits() completes, so it is safe for us 01907 // to use a nonpersisting reference in this case. The use of a 01908 // nonpersisting reference is purely a performance optimization. 01909 01910 // mfh 02 Apr 2012: For some reason, calling arcp<const Packet> 01911 // for Packet=std::complex<double> fails to compile with some 01912 // versions of GCC. The issue only arises with the exports array. 01913 // This is why we construct a separate nonowning ArrayRCP. 01914 typedef typename ArrayRCP<const Packet>::size_type size_type; 01915 ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (), as<size_type> (0), 01916 exports.size (), false); 01917 doReversePosts (exportsArcp, 01918 numExportPacketsPerLID, 01919 arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false), 01920 numImportPacketsPerLID); 01921 doReverseWaits (); 01922 01923 lastRoundBytesSend_ = exports.size() * sizeof(Packet); 01924 lastRoundBytesRecv_ = imports.size() * sizeof(Packet); 01925 } 01926 01927 template <class Packet> 01928 void Distributor:: 01929 doReversePosts (const ArrayRCP<const Packet>& exports, 01930 size_t numPackets, 01931 const ArrayRCP<Packet>& imports) 01932 { 01933 // FIXME (mfh 29 Mar 2012) WHY? 01934 TEUCHOS_TEST_FOR_EXCEPTION( 01935 ! 
indicesTo_.empty (), std::runtime_error, 01936 "Tpetra::Distributor::doReversePosts(3 args): Can only do reverse " 01937 "communication when original data are blocked by process."); 01938 if (reverseDistributor_.is_null ()) { 01939 createReverseDistributor (); 01940 } 01941 reverseDistributor_->doPosts (exports, numPackets, imports); 01942 } 01943 01944 template <class Packet> 01945 void Distributor:: 01946 doReversePosts (const ArrayRCP<const Packet>& exports, 01947 const ArrayView<size_t>& numExportPacketsPerLID, 01948 const ArrayRCP<Packet>& imports, 01949 const ArrayView<size_t>& numImportPacketsPerLID) 01950 { 01951 // FIXME (mfh 29 Mar 2012) WHY? 01952 TEUCHOS_TEST_FOR_EXCEPTION( 01953 ! indicesTo_.empty (), std::runtime_error, 01954 "Tpetra::Distributor::doReversePosts(3 args): Can only do reverse " 01955 "communication when original data are blocked by process."); 01956 if (reverseDistributor_.is_null ()) { 01957 createReverseDistributor (); 01958 } 01959 reverseDistributor_->doPosts (exports, numExportPacketsPerLID, 01960 imports, numImportPacketsPerLID); 01961 } 01962 01963 #if TPETRA_USE_KOKKOS_DISTOBJECT || defined(TPETRA_HAVE_KOKKOS_REFACTOR) 01964 01965 template <class Packet, class Layout, class Device, class Mem> 01966 void Distributor:: 01967 doPostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports, 01968 size_t numPackets, 01969 const Kokkos::View<Packet*, Layout, Device, Mem> &imports) 01970 { 01971 // If the MPI library doesn't support RDMA for communication 01972 // directly to or from the GPU's memory, we must transfer exports 01973 // to the host, do the communication, then transfer imports back 01974 // to the GPU. 01975 // 01976 // We need to do this here instead of doPosts() because the copy 01977 // back to the GPU must occur after the waits. 01978 typedef Kokkos::View<const Packet*, Layout, Device, Mem> exports_view; 01979 typedef Kokkos::View<Packet*, Layout, Device, Mem> imports_view; 01980 if (! 
enable_cuda_rdma_ && ! exports_view::is_hostspace) { 01981 typename exports_view::HostMirror host_exports = 01982 Kokkos::create_mirror_view (exports); 01983 typename imports_view::HostMirror host_imports = 01984 Kokkos::create_mirror_view (imports); 01985 Kokkos::deep_copy (host_exports, exports); 01986 doPostsAndWaits (Kokkos::Compat::create_const_view (host_exports), 01987 numPackets, 01988 host_imports); 01989 Kokkos::deep_copy (imports, host_imports); 01990 return; 01991 } 01992 01993 TEUCHOS_TEST_FOR_EXCEPTION( 01994 requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::" 01995 "doPostsAndWaits(3 args): There are " << requests_.size () << 01996 " outstanding nonblocking messages pending. It is incorrect to call " 01997 "this method with posts outstanding."); 01998 01999 doPosts (exports, numPackets, imports); 02000 doWaits (); 02001 } 02002 02003 template <class Packet, class Layout, class Device, class Mem> 02004 void Distributor:: 02005 doPostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports, 02006 const ArrayView<size_t> &numExportPacketsPerLID, 02007 const Kokkos::View<Packet*, Layout, Device, Mem> &imports, 02008 const ArrayView<size_t> &numImportPacketsPerLID) 02009 { 02010 // If MPI library doesn't support RDMA for communication directly 02011 // to/from GPU, transfer exports to the host, do the communication, then 02012 // transfer imports back to the GPU 02013 // 02014 // We need to do this here instead of doPosts() because the copy back 02015 // to the GPU must occur after the waits. 
    typedef Kokkos::View<const Packet*, Layout, Device, Mem> exports_view;
    typedef Kokkos::View<Packet*, Layout, Device, Mem> imports_view;
    if (!enable_cuda_rdma_ && !exports_view::is_hostspace) {
      // Stage the exchange through host mirrors of the device views.
      typename exports_view::HostMirror host_exports =
        Kokkos::create_mirror_view(exports);
      typename imports_view::HostMirror host_imports =
        Kokkos::create_mirror_view(imports);
      Kokkos::deep_copy (host_exports, exports);
      doPostsAndWaits(Kokkos::Compat::create_const_view(host_exports),
                      numExportPacketsPerLID,
                      host_imports,
                      numImportPacketsPerLID);
      Kokkos::deep_copy (imports, host_imports);
      return;
    }

    TEUCHOS_TEST_FOR_EXCEPTION(
      requests_.size () != 0, std::runtime_error,
      "Tpetra::Distributor::doPostsAndWaits(4 args): There are "
      << requests_.size () << " outstanding nonblocking messages pending. "
      "It is incorrect to call this method with posts outstanding.");

    doPosts (exports, numExportPacketsPerLID, imports, numImportPacketsPerLID);
    doWaits ();
  }


  // Kokkos::View variant of the three-argument doPosts(): every LID
  // contributes exactly numPackets packets.  Posts the messages only;
  // call doWaits() to complete them.
  template <class Packet, class Layout, class Device, class Mem>
  void Distributor::
  doPosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports,
           size_t numPackets,
           const Kokkos::View<Packet*, Layout, Device, Mem> &imports)
  {
    using Teuchos::Array;
    using Teuchos::as;
    using Teuchos::FancyOStream;
    using Teuchos::includesVerbLevel;
    using Teuchos::ireceive;
    using Teuchos::isend;
    using Teuchos::OSTab;
    using Teuchos::readySend;
    using Teuchos::send;
    using Teuchos::ssend;
    using Teuchos::TypeNameTraits;
    using Teuchos::typeName;
    using std::endl;
    using Kokkos::Compat::create_const_view;
    using Kokkos::Compat::create_view;
    using Kokkos::Compat::subview_offset;
    using Kokkos::Compat::deep_copy_offset;
    typedef Array<size_t>::size_type size_type;
    typedef Kokkos::View<const Packet*, Layout, Device, Mem> exports_view_type;
    typedef Kokkos::View<Packet*, Layout, Device, Mem> imports_view_type;

    Teuchos::OSTab tab (out_);

#ifdef TPETRA_DISTRIBUTOR_TIMERS
    Teuchos::TimeMonitor timeMon (*timer_doPosts3_);
#endif // TPETRA_DISTRIBUTOR_TIMERS

    // Run-time configurable parameters that come from the input
    // ParameterList set by setParameterList().
    const Details::EDistributorSendType sendType = sendType_;
    const bool doBarrier = barrierBetween_;

// #ifdef HAVE_TEUCHOS_DEBUG
//     // Prepare for verbose output, if applicable.
//     Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
//     (void) verbLevel; // Silence "unused variable" compiler warning.
//     RCP<FancyOStream> out = this->getOStream ();
//     // const bool doPrint = out.get () && (comm_->getRank () == 0) &&
//     //   includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
//     const bool doPrint = out.get () && (comm_->getRank () == 0);

//     if (doPrint) {
//       // Only need one process to print out parameters.
//       *out << "Distributor::doPosts (3 args)" << endl;
//     }
//     // Add one tab level. We declare this outside the doPrint scopes
//     // so that the tab persists until the end of this method.
//     OSTab tab = this->getOSTab ();
//     if (doPrint) {
//       *out << "Parameters:" << endl;
//       {
//         OSTab tab2 (out);
//         *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
//              << endl << "barrierBetween: " << doBarrier << endl;
//       }
//     }
// #endif // HAVE_TEUCHOS_DEBUG

    // A ready send (MPI_Rsend) requires its matching receive to be
    // posted first; the barrier below is what guarantees that.
    TEUCHOS_TEST_FOR_EXCEPTION(
      sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier, std::logic_error,
      "Tpetra::Distributor::doPosts(3 args): Ready-send version requires a "
      "barrier between posting receives and posting ready sends. This should "
      "have been checked before. "
      "Please report this bug to the Tpetra developers.");

    const int myImageID = comm_->getRank ();
    size_t selfReceiveOffset = 0;

    // Each message has the same number of packets.
    const size_t totalNumImportPackets = totalReceiveLength_ * numPackets;
    TEUCHOS_TEST_FOR_EXCEPTION(
      static_cast<size_t> (imports.dimension_0 ()) < totalNumImportPackets,
      std::runtime_error, "Tpetra::Distributor::doPosts(3 args): The 'imports' "
      "array must have enough entries to hold the expected number of import "
      "packets. imports.dimension_0() = " << imports.dimension_0 () << " < "
      "totalNumImportPackets = " << totalNumImportPackets << ".");

    // MPI tag for nonblocking receives and blocking sends in this
    // method. Some processes might take the "fast" path
    // (indicesTo_.empty()) and others might take the "slow" path for
    // the same doPosts() call, so the path tag must be the same for
    // both.
    const int pathTag = 0;
    const int tag = this->getTag (pathTag);

    if (debug_) {
      TEUCHOS_TEST_FOR_EXCEPTION(
        requests_.size () != 0, std::logic_error, "Tpetra::Distributor::"
        "doPosts(3 args): Process " << myImageID << ": requests_.size() = "
        << requests_.size () << " != 0.");
      std::ostringstream os;
      os << myImageID << ": doPosts(3,"
         << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl;
      *out_ << os.str ();
    }

    // Distributor uses requests_.size() as the number of outstanding
    // nonblocking message requests, so we resize to zero to maintain
    // this invariant.
    //
    // numReceives_ does _not_ include the self message, if there is
    // one. Here, we do actually send a message to ourselves, so we
    // include any self message in the "actual" number of receives to
    // post.
02153 // 02154 // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts() 02155 // doesn't (re)allocate its array of requests. That happens in 02156 // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on 02157 // demand), or Resize_(). 02158 const size_type actualNumReceives = as<size_type> (numReceives_) + 02159 as<size_type> (selfMessage_ ? 1 : 0); 02160 requests_.resize (0); 02161 02162 // Post the nonblocking receives. It's common MPI wisdom to post 02163 // receives before sends. In MPI terms, this means favoring 02164 // adding to the "posted queue" (of receive requests) over adding 02165 // to the "unexpected queue" (of arrived messages not yet matched 02166 // with a receive). 02167 { 02168 #ifdef TPETRA_DISTRIBUTOR_TIMERS 02169 Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts3_recvs_); 02170 #endif // TPETRA_DISTRIBUTOR_TIMERS 02171 02172 size_t curBufferOffset = 0; 02173 for (size_type i = 0; i < actualNumReceives; ++i) { 02174 if (imagesFrom_[i] != myImageID) { 02175 // If my process is receiving these packet(s) from another 02176 // process (not a self-receive): 02177 // 02178 // 1. Set up the persisting view (recvBuf) of the imports 02179 // array, given the offset and size (total number of 02180 // packets from process imagesFrom_[i]). 02181 // 2. Start the Irecv and save the resulting request. 02182 imports_view_type recvBuf = 02183 subview_offset (imports, curBufferOffset, lengthsFrom_[i]*numPackets); 02184 requests_.push_back (ireceive<int> (recvBuf, imagesFrom_[i], 02185 tag, *comm_)); 02186 if (debug_) { 02187 std::ostringstream os; 02188 os << myImageID << ": doPosts(3," 02189 << (indicesTo_.empty () ? 
"fast" : "slow") << "): " 02190 << "Posted irecv from Proc " << imagesFrom_[i] << " with " 02191 "specified tag " << tag << endl; 02192 *out_ << os.str (); 02193 } 02194 } 02195 else { // Receiving from myself 02196 selfReceiveOffset = curBufferOffset; // Remember the self-recv offset 02197 } 02198 curBufferOffset += lengthsFrom_[i]*numPackets; 02199 } 02200 } 02201 02202 if (doBarrier) { 02203 #ifdef TPETRA_DISTRIBUTOR_TIMERS 02204 Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts3_barrier_); 02205 #endif // TPETRA_DISTRIBUTOR_TIMERS 02206 // If we are using ready sends (MPI_Rsend) below, we need to do 02207 // a barrier before we post the ready sends. This is because a 02208 // ready send requires that its matching receive has already 02209 // been posted before the send has been posted. The only way to 02210 // guarantee that in this case is to use a barrier. 02211 comm_->barrier (); 02212 } 02213 02214 #ifdef TPETRA_DISTRIBUTOR_TIMERS 02215 Teuchos::TimeMonitor timeMonSends (*timer_doPosts3_sends_); 02216 #endif // TPETRA_DISTRIBUTOR_TIMERS 02217 02218 // setup scan through imagesTo_ list starting with higher numbered images 02219 // (should help balance message traffic) 02220 // 02221 // FIXME (mfh 20 Feb 2013) Why haven't we precomputed this? 02222 // It doesn't depend on the input at all. 02223 size_t numBlocks = numSends_ + selfMessage_; 02224 size_t imageIndex = 0; 02225 while ((imageIndex < numBlocks) && (imagesTo_[imageIndex] < myImageID)) { 02226 ++imageIndex; 02227 } 02228 if (imageIndex == numBlocks) { 02229 imageIndex = 0; 02230 } 02231 02232 size_t selfNum = 0; 02233 size_t selfIndex = 0; 02234 02235 if (indicesTo_.empty()) { 02236 if (debug_) { 02237 std::ostringstream os; 02238 os << myImageID << ": doPosts(3,fast): posting sends" << endl; 02239 *out_ << os.str (); 02240 } 02241 02242 // Data are already blocked (laid out) by process, so we don't 02243 // need a separate send buffer (besides the exports array). 
02244 for (size_t i = 0; i < numBlocks; ++i) { 02245 size_t p = i + imageIndex; 02246 if (p > (numBlocks - 1)) { 02247 p -= numBlocks; 02248 } 02249 02250 if (imagesTo_[p] != myImageID) { 02251 exports_view_type tmpSend = subview_offset( 02252 exports, startsTo_[p]*numPackets, lengthsTo_[p]*numPackets); 02253 02254 if (sendType == Details::DISTRIBUTOR_SEND) { 02255 send<int> (tmpSend, 02256 as<int> (tmpSend.size ()), 02257 imagesTo_[p], tag, *comm_); 02258 } 02259 else if (sendType == Details::DISTRIBUTOR_ISEND) { 02260 exports_view_type tmpSendBuf = 02261 subview_offset (exports, startsTo_[p] * numPackets, 02262 lengthsTo_[p] * numPackets); 02263 requests_.push_back (isend<int> (tmpSendBuf, imagesTo_[p], 02264 tag, *comm_)); 02265 } 02266 else if (sendType == Details::DISTRIBUTOR_RSEND) { 02267 readySend<int> (tmpSend, 02268 as<int> (tmpSend.size ()), 02269 imagesTo_[p], tag, *comm_); 02270 } 02271 else if (sendType == Details::DISTRIBUTOR_SSEND) { 02272 ssend<int> (tmpSend, 02273 as<int> (tmpSend.size ()), 02274 imagesTo_[p], tag, *comm_); 02275 } else { 02276 TEUCHOS_TEST_FOR_EXCEPTION( 02277 true, std::logic_error, "Tpetra::Distributor::doPosts(3 args): " 02278 "Invalid send type. We should never get here. " 02279 "Please report this bug to the Tpetra developers."); 02280 } 02281 02282 if (debug_) { 02283 std::ostringstream os; 02284 os << myImageID << ": doPosts(3,fast): " 02285 << "Posted send to Proc " << imagesTo_[i] 02286 << " w/ specified tag " << tag << endl; 02287 *out_ << os.str (); 02288 } 02289 } 02290 else { // "Sending" the message to myself 02291 selfNum = p; 02292 } 02293 } 02294 02295 if (selfMessage_) { 02296 // This is how we "send a message to ourself": we copy from 02297 // the export buffer to the import buffer. That saves 02298 // Teuchos::Comm implementations other than MpiComm (in 02299 // particular, SerialComm) the trouble of implementing self 02300 // messages correctly. 
(To do this right, SerialComm would 02301 // need internal buffer space for messages, keyed on the 02302 // message's tag.) 02303 deep_copy_offset(imports, exports, selfReceiveOffset, 02304 startsTo_[selfNum]*numPackets, 02305 lengthsTo_[selfNum]*numPackets); 02306 } 02307 if (debug_) { 02308 std::ostringstream os; 02309 os << myImageID << ": doPosts(3,fast) done" << endl; 02310 *out_ << os.str (); 02311 } 02312 } 02313 else { // data are not blocked by image, use send buffer 02314 if (debug_) { 02315 std::ostringstream os; 02316 os << myImageID << ": doPosts(3,slow): posting sends" << endl; 02317 *out_ << os.str (); 02318 } 02319 02320 // FIXME (mfh 05 Mar 2013) This is broken for Isend (nonblocking 02321 // sends), because the buffer is only long enough for one send. 02322 Kokkos::View<Packet*,Layout,Device,Mem> sendArray = 02323 create_view<Packet,Layout,Device,Mem> ("sendArray", 02324 maxSendLength_ * numPackets); 02325 02326 TEUCHOS_TEST_FOR_EXCEPTION( 02327 sendType == Details::DISTRIBUTOR_ISEND, std::logic_error, 02328 "Tpetra::Distributor::doPosts(3 args): The \"send buffer\" code path " 02329 "doesn't currently work with nonblocking sends."); 02330 02331 for (size_t i = 0; i < numBlocks; ++i) { 02332 size_t p = i + imageIndex; 02333 if (p > (numBlocks - 1)) { 02334 p -= numBlocks; 02335 } 02336 02337 if (imagesTo_[p] != myImageID) { 02338 size_t sendArrayOffset = 0; 02339 size_t j = startsTo_[p]; 02340 for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) { 02341 deep_copy_offset(sendArray, exports, sendArrayOffset, 02342 indicesTo_[j]*numPackets, numPackets); 02343 sendArrayOffset += numPackets; 02344 } 02345 Kokkos::View<Packet*, Layout, Device, Mem> tmpSend = 02346 subview_offset(sendArray, size_t(0), lengthsTo_[p]*numPackets); 02347 02348 if (sendType == Details::DISTRIBUTOR_SEND) { 02349 send<int> (tmpSend, 02350 as<int> (tmpSend.size ()), 02351 imagesTo_[p], tag, *comm_); 02352 } 02353 else if (sendType == Details::DISTRIBUTOR_ISEND) { 02354 
exports_view_type tmpSendBuf = 02355 subview_offset (sendArray, size_t(0), lengthsTo_[p] * numPackets); 02356 requests_.push_back (isend<int> (tmpSendBuf, imagesTo_[p], 02357 tag, *comm_)); 02358 } 02359 else if (sendType == Details::DISTRIBUTOR_RSEND) { 02360 readySend<int> (tmpSend, 02361 as<int> (tmpSend.size ()), 02362 imagesTo_[p], tag, *comm_); 02363 } 02364 else if (sendType == Details::DISTRIBUTOR_SSEND) { 02365 ssend<int> (tmpSend, 02366 as<int> (tmpSend.size ()), 02367 imagesTo_[p], tag, *comm_); 02368 } 02369 else { 02370 TEUCHOS_TEST_FOR_EXCEPTION( 02371 true, std::logic_error, "Tpetra::Distributor::doPosts(3 args): " 02372 "Invalid send type. We should never get here. " 02373 "Please report this bug to the Tpetra developers."); 02374 } 02375 02376 if (debug_) { 02377 std::ostringstream os; 02378 os << myImageID << ": doPosts(3,slow): " 02379 << "Posted send to Proc " << imagesTo_[i] 02380 << " w/ specified tag " << tag << endl; 02381 *out_ << os.str (); 02382 } 02383 } 02384 else { // "Sending" the message to myself 02385 selfNum = p; 02386 selfIndex = startsTo_[p]; 02387 } 02388 } 02389 02390 if (selfMessage_) { 02391 for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) { 02392 deep_copy_offset(imports, exports, selfReceiveOffset, 02393 indicesTo_[selfIndex]*numPackets, numPackets); 02394 ++selfIndex; 02395 selfReceiveOffset += numPackets; 02396 } 02397 } 02398 if (debug_) { 02399 std::ostringstream os; 02400 os << myImageID << ": doPosts(3,slow) done" << endl; 02401 *out_ << os.str (); 02402 } 02403 } 02404 } 02405 02406 template <class Packet, class Layout, class Device, class Mem> 02407 void Distributor:: 02408 doPosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports, 02409 const ArrayView<size_t> &numExportPacketsPerLID, 02410 const Kokkos::View<Packet*, Layout, Device, Mem> &imports, 02411 const ArrayView<size_t> &numImportPacketsPerLID) 02412 { 02413 using Teuchos::Array; 02414 using Teuchos::as; 02415 using Teuchos::ireceive; 02416 
using Teuchos::isend; 02417 using Teuchos::readySend; 02418 using Teuchos::send; 02419 using Teuchos::ssend; 02420 using Teuchos::TypeNameTraits; 02421 #ifdef HAVE_TEUCHOS_DEBUG 02422 using Teuchos::OSTab; 02423 #endif // HAVE_TEUCHOS_DEBUG 02424 using std::endl; 02425 using Kokkos::Compat::create_const_view; 02426 using Kokkos::Compat::create_view; 02427 using Kokkos::Compat::subview_offset; 02428 using Kokkos::Compat::deep_copy_offset; 02429 typedef Array<size_t>::size_type size_type; 02430 typedef Kokkos::View<const Packet*, Layout, Device, Mem> exports_view_type; 02431 typedef Kokkos::View<Packet*, Layout, Device, Mem> imports_view_type; 02432 02433 Teuchos::OSTab tab (out_); 02434 02435 #ifdef TPETRA_DISTRIBUTOR_TIMERS 02436 Teuchos::TimeMonitor timeMon (*timer_doPosts4_); 02437 #endif // TPETRA_DISTRIBUTOR_TIMERS 02438 02439 // Run-time configurable parameters that come from the input 02440 // ParameterList set by setParameterList(). 02441 const Details::EDistributorSendType sendType = sendType_; 02442 const bool doBarrier = barrierBetween_; 02443 02444 // #ifdef HAVE_TEUCHOS_DEBUG 02445 // // Prepare for verbose output, if applicable. 02446 // Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel (); 02447 // RCP<Teuchos::FancyOStream> out = this->getOStream (); 02448 // const bool doPrint = out.get () && (comm_->getRank () == 0) && 02449 // includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true); 02450 02451 // if (doPrint) { 02452 // // Only need one process to print out parameters. 02453 // *out << "Distributor::doPosts (4 args)" << endl; 02454 // } 02455 // // Add one tab level. We declare this outside the doPrint scopes 02456 // // so that the tab persists until the end of this method. 
02457 // Teuchos::OSTab tab = this->getOSTab (); 02458 // if (doPrint) { 02459 // *out << "Parameters:" << endl; 02460 // { 02461 // OSTab tab2 (out); 02462 // *out << "sendType: " << DistributorSendTypeEnumToString (sendType) 02463 // << endl << "barrierBetween: " << doBarrier << endl; 02464 // } 02465 // } 02466 // #endif // HAVE_TEUCHOS_DEBUG 02467 02468 TEUCHOS_TEST_FOR_EXCEPTION( 02469 sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier, 02470 std::logic_error, "Tpetra::Distributor::doPosts(4 args): Ready-send " 02471 "version requires a barrier between posting receives and posting ready " 02472 "sends. This should have been checked before. " 02473 "Please report this bug to the Tpetra developers."); 02474 02475 const int myImageID = comm_->getRank (); 02476 size_t selfReceiveOffset = 0; 02477 02478 #ifdef HAVE_TEUCHOS_DEBUG 02479 // Different messages may have different numbers of packets. 02480 size_t totalNumImportPackets = 0; 02481 for (size_type ii = 0; ii < numImportPacketsPerLID.size (); ++ii) { 02482 totalNumImportPackets += numImportPacketsPerLID[ii]; 02483 } 02484 TEUCHOS_TEST_FOR_EXCEPTION( 02485 imports.dimension_0 () < totalNumImportPackets, std::runtime_error, 02486 "Tpetra::Distributor::doPosts(4 args): The 'imports' array must have " 02487 "enough entries to hold the expected number of import packets. " 02488 "imports.dimension_0() = " << imports.dimension_0 () << " < " 02489 "totalNumImportPackets = " << totalNumImportPackets << "."); 02490 #endif // HAVE_TEUCHOS_DEBUG 02491 02492 // MPI tag for nonblocking receives and blocking sends in this 02493 // method. Some processes might take the "fast" path 02494 // (indicesTo_.empty()) and others might take the "slow" path for 02495 // the same doPosts() call, so the path tag must be the same for 02496 // both. 
02497 const int pathTag = 1; 02498 const int tag = this->getTag (pathTag); 02499 02500 if (debug_) { 02501 TEUCHOS_TEST_FOR_EXCEPTION( 02502 requests_.size () != 0, std::logic_error, "Tpetra::Distributor::" 02503 "doPosts(4 args): Process " << myImageID << ": requests_.size () = " 02504 << requests_.size () << " != 0."); 02505 std::ostringstream os; 02506 os << myImageID << ": doPosts(4," 02507 << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl; 02508 *out_ << os.str (); 02509 } 02510 02511 // Distributor uses requests_.size() as the number of outstanding 02512 // nonblocking message requests, so we resize to zero to maintain 02513 // this invariant. 02514 // 02515 // numReceives_ does _not_ include the self message, if there is 02516 // one. Here, we do actually send a message to ourselves, so we 02517 // include any self message in the "actual" number of receives to 02518 // post. 02519 // 02520 // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts() 02521 // doesn't (re)allocate its array of requests. That happens in 02522 // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on 02523 // demand), or Resize_(). 02524 const size_type actualNumReceives = as<size_type> (numReceives_) + 02525 as<size_type> (selfMessage_ ? 1 : 0); 02526 requests_.resize (0); 02527 02528 // Post the nonblocking receives. It's common MPI wisdom to post 02529 // receives before sends. In MPI terms, this means favoring 02530 // adding to the "posted queue" (of receive requests) over adding 02531 // to the "unexpected queue" (of arrived messages not yet matched 02532 // with a receive). 
02533 { 02534 #ifdef TPETRA_DISTRIBUTOR_TIMERS 02535 Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts4_recvs_); 02536 #endif // TPETRA_DISTRIBUTOR_TIMERS 02537 02538 size_t curBufferOffset = 0; 02539 size_t curLIDoffset = 0; 02540 for (size_type i = 0; i < actualNumReceives; ++i) { 02541 size_t totalPacketsFrom_i = 0; 02542 for (size_t j = 0; j < lengthsFrom_[i]; ++j) { 02543 totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset+j]; 02544 } 02545 curLIDoffset += lengthsFrom_[i]; 02546 if (imagesFrom_[i] != myImageID && totalPacketsFrom_i) { 02547 // If my process is receiving these packet(s) from another 02548 // process (not a self-receive), and if there is at least 02549 // one packet to receive: 02550 // 02551 // 1. Set up the persisting view (recvBuf) into the imports 02552 // array, given the offset and size (total number of 02553 // packets from process imagesFrom_[i]). 02554 // 2. Start the Irecv and save the resulting request. 02555 imports_view_type recvBuf = 02556 subview_offset (imports, curBufferOffset, totalPacketsFrom_i); 02557 requests_.push_back (ireceive<int> (recvBuf, imagesFrom_[i], 02558 tag, *comm_)); 02559 } 02560 else { // Receiving these packet(s) from myself 02561 selfReceiveOffset = curBufferOffset; // Remember the offset 02562 } 02563 curBufferOffset += totalPacketsFrom_i; 02564 } 02565 } 02566 02567 if (doBarrier) { 02568 #ifdef TPETRA_DISTRIBUTOR_TIMERS 02569 Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts4_barrier_); 02570 #endif // TPETRA_DISTRIBUTOR_TIMERS 02571 // If we are using ready sends (MPI_Rsend) below, we need to do 02572 // a barrier before we post the ready sends. This is because a 02573 // ready send requires that its matching receive has already 02574 // been posted before the send has been posted. The only way to 02575 // guarantee that in this case is to use a barrier. 
02576 comm_->barrier (); 02577 } 02578 02579 #ifdef TPETRA_DISTRIBUTOR_TIMERS 02580 Teuchos::TimeMonitor timeMonSends (*timer_doPosts4_sends_); 02581 #endif // TPETRA_DISTRIBUTOR_TIMERS 02582 02583 // setup arrays containing starting-offsets into exports for each send, 02584 // and num-packets-to-send for each send. 02585 Array<size_t> sendPacketOffsets(numSends_,0), packetsPerSend(numSends_,0); 02586 size_t maxNumPackets = 0; 02587 size_t curPKToffset = 0; 02588 for (size_t pp=0; pp<numSends_; ++pp) { 02589 sendPacketOffsets[pp] = curPKToffset; 02590 size_t numPackets = 0; 02591 for (size_t j=startsTo_[pp]; j<startsTo_[pp]+lengthsTo_[pp]; ++j) { 02592 numPackets += numExportPacketsPerLID[j]; 02593 } 02594 if (numPackets > maxNumPackets) maxNumPackets = numPackets; 02595 packetsPerSend[pp] = numPackets; 02596 curPKToffset += numPackets; 02597 } 02598 02599 // setup scan through imagesTo_ list starting with higher numbered images 02600 // (should help balance message traffic) 02601 size_t numBlocks = numSends_+ selfMessage_; 02602 size_t imageIndex = 0; 02603 while ((imageIndex < numBlocks) && (imagesTo_[imageIndex] < myImageID)) { 02604 ++imageIndex; 02605 } 02606 if (imageIndex == numBlocks) { 02607 imageIndex = 0; 02608 } 02609 02610 size_t selfNum = 0; 02611 size_t selfIndex = 0; 02612 02613 if (indicesTo_.empty()) { 02614 if (debug_) { 02615 std::ostringstream os; 02616 os << myImageID << ": doPosts(4,fast): posting sends" << endl; 02617 *out_ << os.str (); 02618 } 02619 02620 // Data are already blocked (laid out) by process, so we don't 02621 // need a separate send buffer (besides the exports array). 
02622 for (size_t i = 0; i < numBlocks; ++i) { 02623 size_t p = i + imageIndex; 02624 if (p > (numBlocks - 1)) { 02625 p -= numBlocks; 02626 } 02627 02628 if (imagesTo_[p] != myImageID && packetsPerSend[p] > 0) { 02629 exports_view_type tmpSend = 02630 subview_offset(exports, sendPacketOffsets[p], packetsPerSend[p]); 02631 02632 if (sendType == Details::DISTRIBUTOR_SEND) { // the default, so put it first 02633 send<int> (tmpSend, 02634 as<int> (tmpSend.size ()), 02635 imagesTo_[p], tag, *comm_); 02636 } 02637 else if (sendType == Details::DISTRIBUTOR_RSEND) { 02638 readySend<int> (tmpSend, 02639 as<int> (tmpSend.size ()), 02640 imagesTo_[p], tag, *comm_); 02641 } 02642 else if (sendType == Details::DISTRIBUTOR_ISEND) { 02643 exports_view_type tmpSendBuf = 02644 subview_offset (exports, sendPacketOffsets[p], packetsPerSend[p]); 02645 requests_.push_back (isend<int> (tmpSendBuf, imagesTo_[p], 02646 tag, *comm_)); 02647 } 02648 else if (sendType == Details::DISTRIBUTOR_SSEND) { 02649 ssend<int> (tmpSend, 02650 as<int> (tmpSend.size ()), 02651 imagesTo_[p], tag, *comm_); 02652 } 02653 else { 02654 TEUCHOS_TEST_FOR_EXCEPTION( 02655 true, std::logic_error, "Tpetra::Distributor::doPosts(4 args): " 02656 "Invalid send type. We should never get here. 
" 02657 "Please report this bug to the Tpetra developers."); 02658 } 02659 } 02660 else { // "Sending" the message to myself 02661 selfNum = p; 02662 } 02663 } 02664 02665 if (selfMessage_) { 02666 deep_copy_offset(imports, exports, selfReceiveOffset, 02667 sendPacketOffsets[selfNum], packetsPerSend[selfNum]); 02668 } 02669 if (debug_) { 02670 std::ostringstream os; 02671 os << myImageID << ": doPosts(4,fast) done" << endl; 02672 *out_ << os.str (); 02673 } 02674 } 02675 else { // data are not blocked by image, use send buffer 02676 if (debug_) { 02677 std::ostringstream os; 02678 os << myImageID << ": doPosts(4,slow): posting sends" << endl; 02679 *out_ << os.str (); 02680 } 02681 02682 // FIXME (mfh 05 Mar 2013) This may be broken for Isend. 02683 Kokkos::View<Packet*,Layout,Device,Mem> sendArray = 02684 create_view<Packet,Layout,Device,Mem>("sendArray", maxNumPackets); // send buffer 02685 02686 TEUCHOS_TEST_FOR_EXCEPTION( 02687 sendType == Details::DISTRIBUTOR_ISEND, std::logic_error, 02688 "Tpetra::Distributor::doPosts(3 args): The \"send buffer\" code path " 02689 "may not necessarily work with nonblocking sends."); 02690 02691 Array<size_t> indicesOffsets (numExportPacketsPerLID.size(), 0); 02692 size_t ioffset = 0; 02693 for (int j=0; j<numExportPacketsPerLID.size(); ++j) { 02694 indicesOffsets[j] = ioffset; 02695 ioffset += numExportPacketsPerLID[j]; 02696 } 02697 02698 for (size_t i = 0; i < numBlocks; ++i) { 02699 size_t p = i + imageIndex; 02700 if (p > (numBlocks - 1)) { 02701 p -= numBlocks; 02702 } 02703 02704 if (imagesTo_[p] != myImageID) { 02705 size_t sendArrayOffset = 0; 02706 size_t j = startsTo_[p]; 02707 size_t numPacketsTo_p = 0; 02708 for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) { 02709 deep_copy_offset(sendArray, exports, sendArrayOffset, 02710 indicesOffsets[j], numExportPacketsPerLID[j]); 02711 sendArrayOffset += numExportPacketsPerLID[j]; 02712 } 02713 if (numPacketsTo_p > 0) { 02714 Kokkos::View<Packet*, Layout, Device, Mem> tmpSend 
= 02715 subview_offset(sendArray, size_t(0), numPacketsTo_p); 02716 02717 if (sendType == Details::DISTRIBUTOR_RSEND) { 02718 readySend<int> (tmpSend, 02719 as<int> (tmpSend.size ()), 02720 imagesTo_[p], tag, *comm_); 02721 } 02722 else if (sendType == Details::DISTRIBUTOR_ISEND) { 02723 exports_view_type tmpSendBuf = 02724 subview_offset (sendArray, size_t(0), numPacketsTo_p); 02725 requests_.push_back (isend<int> (tmpSendBuf, imagesTo_[p], 02726 tag, *comm_)); 02727 } 02728 else if (sendType == Details::DISTRIBUTOR_SSEND) { 02729 ssend<int> (tmpSend, 02730 as<int> (tmpSend.size ()), 02731 imagesTo_[p], tag, *comm_); 02732 } 02733 else { // if (sendType == Details::DISTRIBUTOR_SSEND) 02734 send<int> (tmpSend, 02735 as<int> (tmpSend.size ()), 02736 imagesTo_[p], tag, *comm_); 02737 } 02738 } 02739 } 02740 else { // "Sending" the message to myself 02741 selfNum = p; 02742 selfIndex = startsTo_[p]; 02743 } 02744 } 02745 02746 if (selfMessage_) { 02747 for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) { 02748 deep_copy_offset(imports, exports, selfReceiveOffset, 02749 indicesOffsets[selfIndex], 02750 numExportPacketsPerLID[selfIndex]); 02751 selfReceiveOffset += numExportPacketsPerLID[selfIndex]; 02752 ++selfIndex; 02753 } 02754 } 02755 if (debug_) { 02756 std::ostringstream os; 02757 os << myImageID << ": doPosts(4,slow) done" << endl; 02758 *out_ << os.str (); 02759 } 02760 } 02761 } 02762 02763 template <class Packet, class Layout, class Device, class Mem> 02764 void Distributor:: 02765 doReversePostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports, 02766 size_t numPackets, 02767 const Kokkos::View<Packet*, Layout, Device, Mem> &imports) 02768 { 02769 // If MPI library doesn't support RDMA for communication directly 02770 // to/from GPU, transfer exports to the host, do the communication, then 02771 // transfer imports back to the GPU 02772 // 02773 // We need to do this here instead of doPosts() because the copy back 02774 // to the GPU 
must occur after the waits. 02775 typedef Kokkos::View<const Packet*, Layout, Device, Mem> exports_view; 02776 typedef Kokkos::View<Packet*, Layout, Device, Mem> imports_view; 02777 if (!enable_cuda_rdma_ && !exports_view::is_hostspace) { 02778 typename exports_view::HostMirror host_exports = 02779 Kokkos::create_mirror_view(exports); 02780 typename imports_view::HostMirror host_imports = 02781 Kokkos::create_mirror_view(imports); 02782 Kokkos::deep_copy (host_exports, exports); 02783 doPostsAndWaits(Kokkos::Compat::create_const_view(host_exports), 02784 numPackets, 02785 host_imports); 02786 Kokkos::deep_copy (imports, host_imports); 02787 return; 02788 } 02789 02790 doReversePosts (exports, numPackets, imports); 02791 doReverseWaits (); 02792 } 02793 02794 template <class Packet, class Layout, class Device, class Mem> 02795 void Distributor:: 02796 doReversePostsAndWaits (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports, 02797 const ArrayView<size_t> &numExportPacketsPerLID, 02798 const Kokkos::View<Packet*, Layout, Device, Mem> &imports, 02799 const ArrayView<size_t> &numImportPacketsPerLID) 02800 { 02801 // If MPI library doesn't support RDMA for communication directly 02802 // to/from GPU, transfer exports to the host, do the communication, then 02803 // transfer imports back to the GPU 02804 // 02805 // We need to do this here instead of doPosts() because the copy back 02806 // to the GPU must occur after the waits. 
02807 typedef Kokkos::View<const Packet*, Layout, Device, Mem> exports_view; 02808 typedef Kokkos::View<Packet*, Layout, Device, Mem> imports_view; 02809 if (!enable_cuda_rdma_ && !exports_view::is_hostspace) { 02810 typename exports_view::HostMirror host_exports = 02811 Kokkos::create_mirror_view(exports); 02812 typename imports_view::HostMirror host_imports = 02813 Kokkos::create_mirror_view(imports); 02814 Kokkos::deep_copy (host_exports, exports); 02815 doPostsAndWaits(Kokkos::Compat::create_const_view(host_exports), 02816 numExportPacketsPerLID, 02817 host_imports, 02818 numImportPacketsPerLID); 02819 Kokkos::deep_copy (imports, host_imports); 02820 return; 02821 } 02822 02823 TEUCHOS_TEST_FOR_EXCEPTION(requests_.size() != 0, std::runtime_error, 02824 "Tpetra::Distributor::doReversePostsAndWaits(4 args): There are " 02825 << requests_.size() << " outstanding nonblocking messages pending. It " 02826 "is incorrect to call this method with posts outstanding."); 02827 02828 doReversePosts (exports, numExportPacketsPerLID, imports, 02829 numImportPacketsPerLID); 02830 doReverseWaits (); 02831 } 02832 02833 template <class Packet, class Layout, class Device, class Mem> 02834 void Distributor:: 02835 doReversePosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports, 02836 size_t numPackets, 02837 const Kokkos::View<Packet*, Layout, Device, Mem> &imports) 02838 { 02839 // FIXME (mfh 29 Mar 2012) WHY? 02840 TEUCHOS_TEST_FOR_EXCEPTION( 02841 ! 
indicesTo_.empty (), std::runtime_error, 02842 "Tpetra::Distributor::doReversePosts(3 args): Can only do " 02843 "reverse communication when original data are blocked by process."); 02844 if (reverseDistributor_.is_null ()) { 02845 createReverseDistributor (); 02846 } 02847 reverseDistributor_->doPosts (exports, numPackets, imports); 02848 } 02849 02850 template <class Packet, class Layout, class Device, class Mem> 02851 void Distributor:: 02852 doReversePosts (const Kokkos::View<const Packet*, Layout, Device, Mem> &exports, 02853 const ArrayView<size_t> &numExportPacketsPerLID, 02854 const Kokkos::View<Packet*, Layout, Device, Mem> &imports, 02855 const ArrayView<size_t> &numImportPacketsPerLID) 02856 { 02857 // FIXME (mfh 29 Mar 2012) WHY? 02858 TEUCHOS_TEST_FOR_EXCEPTION( 02859 ! indicesTo_.empty (), std::runtime_error, 02860 "Tpetra::Distributor::doReversePosts(3 args): Can only do " 02861 "reverse communication when original data are blocked by process."); 02862 if (reverseDistributor_.is_null ()) { 02863 createReverseDistributor (); 02864 } 02865 reverseDistributor_->doPosts (exports, numExportPacketsPerLID, 02866 imports, numImportPacketsPerLID); 02867 } 02868 02869 #endif 02870 02871 template <class OrdinalType> 02872 void Distributor:: 02873 computeSends (const ArrayView<const OrdinalType> & importIDs, 02874 const ArrayView<const int> & importNodeIDs, 02875 Array<OrdinalType> & exportIDs, 02876 Array<int> & exportNodeIDs) 02877 { 02878 // NOTE (mfh 19 Apr 2012): There was a note on this code saying: 02879 // "assumes that size_t >= Ordinal". The code certainly does 02880 // assume that sizeof(size_t) >= sizeof(OrdinalType) as well as 02881 // sizeof(size_t) >= sizeof(int). This is because it casts the 02882 // OrdinalType elements of importIDs (along with their 02883 // corresponding process IDs, as int) to size_t, and does a 02884 // doPostsAndWaits<size_t>() to send the packed data. 
  // (continuation of Distributor::computeSends; signature and leading
  // comment appear above this chunk)
  using std::endl;
  typedef typename ArrayView<const OrdinalType>::size_type size_type;

  Teuchos::OSTab tab (out_);
  const int myRank = comm_->getRank ();
  if (debug_) {
    std::ostringstream os;
    os << myRank << ": computeSends" << endl;
    *out_ << os.str ();
  }

  // Sanity check: each import ID must have a matching owner process ID.
  TEUCHOS_TEST_FOR_EXCEPTION(
    importIDs.size () != importNodeIDs.size (), std::invalid_argument,
    "Tpetra::Distributor::computeSends: On Process " << myRank << ": "
    "importNodeIDs.size() = " << importNodeIDs.size ()
    << " != importIDs.size() = " << importIDs.size () << ".");

  const size_type numImports = importNodeIDs.size ();
  Array<size_t> importObjs (2*numImports);
  // Pack pairs (importIDs[i], my process ID) to send into importObjs.
  for (size_type i = 0; i < numImports; ++i) {
    importObjs[2*i] = static_cast<size_t> (importIDs[i]);
    importObjs[2*i+1] = static_cast<size_t> (myRank);
  }
  //
  // Use a temporary Distributor to send the (importIDs[i], myRank)
  // pairs to importNodeIDs[i].
  //
  Distributor tempPlan (comm_, out_);
  if (debug_) {
    std::ostringstream os;
    os << myRank << ": computeSends: tempPlan.createFromSends" << endl;
    *out_ << os.str ();
  }

  // mfh 20 Mar 2014: An extra-cautious cast from unsigned to
  // signed, in order to forestall any possible causes for Bug 6069.
  const size_t numExportsAsSizeT = tempPlan.createFromSends (importNodeIDs);
  const size_type numExports = static_cast<size_type> (numExportsAsSizeT);
  // Detect overflow of the unsigned -> signed conversion above.
  TEUCHOS_TEST_FOR_EXCEPTION(
    numExports < 0, std::logic_error, "Tpetra::Distributor::computeSends: "
    "tempPlan.createFromSends() returned numExports = " << numExportsAsSizeT
    << " as a size_t, which overflows to " << numExports << " when cast to "
    << Teuchos::TypeNameTraits<size_type>::name () << ". "
    "Please report this bug to the Tpetra developers.");
  // The temporary plan's total receive length is exactly the number of
  // (GID, PID) pairs this process will be told to export.
  TEUCHOS_TEST_FOR_EXCEPTION(
    static_cast<size_type> (tempPlan.getTotalReceiveLength ()) != numExports,
    std::logic_error, "Tpetra::Distributor::computeSends: tempPlan.getTotal"
    "ReceiveLength() = " << tempPlan.getTotalReceiveLength () << " != num"
    "Exports = " << numExports << ". Please report this bug to the "
    "Tpetra developers.");

  if (numExports > 0) {
    exportIDs.resize (numExports);
    exportNodeIDs.resize (numExports);
  }

  // exportObjs: Packed receive buffer. (exportObjs[2*i],
  // exportObjs[2*i+1]) will give the (GID, PID) pair for export i,
  // after tempPlan.doPostsAndWaits(...) finishes below.
  //
  // FIXME (mfh 19 Mar 2014) This only works if OrdinalType fits in
  // size_t. This issue might come up, for example, on a 32-bit
  // machine using 64-bit global indices. I will add a check here
  // for that case.
  TEUCHOS_TEST_FOR_EXCEPTION(
    sizeof (size_t) < sizeof (OrdinalType), std::logic_error,
    "Tpetra::Distributor::computeSends: sizeof(size_t) = " << sizeof(size_t)
    << " < sizeof(" << Teuchos::TypeNameTraits<OrdinalType>::name () << ") = "
    << sizeof (OrdinalType) << ". This violates an assumption of the "
    "method. It's not hard to work around (just use Array<OrdinalType> as "
    "the export buffer, not Array<size_t>), but we haven't done that yet. "
    "Please report this bug to the Tpetra developers.");

  // NOTE(review): this check is subsumed by the equality check above;
  // kept for defense in depth.
  TEUCHOS_TEST_FOR_EXCEPTION(
    tempPlan.getTotalReceiveLength () < static_cast<size_t> (numExports),
    std::logic_error,
    "Tpetra::Distributor::computeSends: tempPlan.getTotalReceiveLength() = "
    << tempPlan.getTotalReceiveLength() << " < numExports = " << numExports
    << ". Please report this bug to the Tpetra developers.");

  Array<size_t> exportObjs (tempPlan.getTotalReceiveLength () * 2);
  if (debug_) {
    std::ostringstream os;
    os << myRank << ": computeSends: tempPlan.doPostsAndWaits" << endl;
    *out_ << os.str ();
  }
  // Exchange the packed pairs: 2 packets (GID, PID) per LID.
  tempPlan.doPostsAndWaits<size_t> (importObjs (), 2, exportObjs ());

  // Unpack received (GID, PID) pairs into exportIDs resp. exportNodeIDs.
  for (size_type i = 0; i < numExports; ++i) {
    exportIDs[i] = static_cast<OrdinalType> (exportObjs[2*i]);
    exportNodeIDs[i] = static_cast<int> (exportObjs[2*i+1]);
  }

  if (debug_) {
    std::ostringstream os;
    os << myRank << ": computeSends done" << endl;
    *out_ << os.str ();
  }
}

/// \brief Initialize this Distributor from the receive side: given the
///   remote IDs this process wants and the processes that own them,
///   compute the corresponding export lists and build the forward plan.
template <class OrdinalType>
void Distributor::
createFromRecvs (const ArrayView<const OrdinalType> &remoteIDs,
                 const ArrayView<const int> &remoteImageIDs,
                 Array<OrdinalType> &exportGIDs,
                 Array<int> &exportNodeIDs)
{
  using std::endl;

  Teuchos::OSTab tab (out_);
  const int myRank = comm_->getRank();

  if (debug_) {
    *out_ << myRank << ": createFromRecvs" << endl;
  }

#ifdef HAVE_TPETRA_DEBUG
  using Teuchos::outArg;
  using Teuchos::reduceAll;

  // In debug mode, first test locally, then do an all-reduce to
  // make sure that all processes passed.
  const int errProc =
    (remoteIDs.size () != remoteImageIDs.size ()) ? myRank : -1;
  int maxErrProc = -1;
  reduceAll<int, int> (*comm_, Teuchos::REDUCE_MAX, errProc, outArg (maxErrProc));
  TEUCHOS_TEST_FOR_EXCEPTION(maxErrProc != -1, std::runtime_error,
    Teuchos::typeName (*this) << "::createFromRecvs(): lists of remote IDs "
    "and remote process IDs must have the same size on all participating "
    "processes. Maximum process ID with error: " << maxErrProc << ".");
#else // NOT HAVE_TPETRA_DEBUG

  // In non-debug mode, just test locally.
  TEUCHOS_TEST_FOR_EXCEPTION(
    remoteIDs.size () != remoteImageIDs.size (), std::invalid_argument,
    Teuchos::typeName (*this) << "::createFromRecvs<" <<
    Teuchos::TypeNameTraits<OrdinalType>::name () << ">(): On Process " <<
    myRank << ": remoteIDs.size() = " << remoteIDs.size () << " != "
    "remoteImageIDs.size() = " << remoteImageIDs.size () << ".");
#endif // HAVE_TPETRA_DEBUG

  // Turn the import lists into export lists...
  computeSends (remoteIDs, remoteImageIDs, exportGIDs, exportNodeIDs);

  // ...and build the forward plan from them.
  const size_t numProcsSendingToMe = createFromSends (exportNodeIDs ());

  if (debug_) {
    // NOTE (mfh 20 Mar 2014) If remoteImageIDs could contain
    // duplicates, then its length might not be the right check here,
    // even if we account for selfMessage_.  selfMessage_ is set in
    // createFromSends.
    std::ostringstream os;
    os << "Proc " << myRank << ": {numProcsSendingToMe: "
       << numProcsSendingToMe << ", remoteImageIDs.size(): "
       << remoteImageIDs.size () << ", selfMessage_: "
       << (selfMessage_ ? "true" : "false") << "}" << std::endl;
    std::cerr << os.str ();
  }

  if (debug_) {
    *out_ << myRank << ": createFromRecvs done" << endl;
  }

  howInitialized_ = Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS;
}

} // namespace Tpetra

#endif // TPETRA_DISTRIBUTOR_HPP
1.7.6.1