PlayaErrorPolling.hpp
Go to the documentation of this file.
00001 /* @HEADER@ */
00002 // ************************************************************************
00003 // 
00004 //                 Playa: Programmable Linear Algebra
00005 //                 Copyright 2012 Sandia Corporation
00006 // 
00007 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00008 // the U.S. Government retains certain rights in this software.
00009 //
00010 // Redistribution and use in source and binary forms, with or without
00011 // modification, are permitted provided that the following conditions are
00012 // met:
00013 //
00014 // 1. Redistributions of source code must retain the above copyright
00015 // notice, this list of conditions and the following disclaimer.
00016 //
00017 // 2. Redistributions in binary form must reproduce the above copyright
00018 // notice, this list of conditions and the following disclaimer in the
00019 // documentation and/or other materials provided with the distribution.
00020 //
00021 // 3. Neither the name of the Corporation nor the names of the
00022 // contributors may be used to endorse or promote products derived from
00023 // this software without specific prior written permission.
00024 //
00025 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
00026 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00027 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00028 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
00029 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00030 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00031 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
00032 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
00033 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
00034 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
00035 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00036 //
00037 // Questions? Contact Kevin Long (kevin.long@ttu.edu)
00038 // 
00039 
00040 /* @HEADER@ */
00041 
00042 #ifndef Playa_ERRORPOLLING_H
00043 #define Playa_ERRORPOLLING_H
00044 
00045 #include "Teuchos_ConfigDefs.hpp"
00046 #include "Teuchos_Assert.hpp"
00047 
00048 /*! \defgroup ErrorPolling_grp Utility code for synchronizing std::exception detection across processors. 
00049 */
00050 //@{
00051 
namespace Playa
{
  class MPIComm;

  /** \brief ErrorPolling provides utilities for establishing agreement
   * between processors on whether an std::exception has been detected on
   * any one processor.
   *
   * The two functions reportFailure() and pollForFailures() must be used
   * in a coordinated way. The simplest use case is to call reportFailure()
   * whenever an std::exception is caught at the top-level try/catch block,
   * and to call pollForFailures() whenever it is desired to check for
   * off-processor errors before proceeding. The macro

    \code
    TEUCHOS_POLL_FOR_FAILURES(comm);
    \endcode

   * calls pollForFailures() and throws an std::runtime_error if the return
   * value is true.
   *
   * Polling is a collective operation (a reduction across the
   * communicator) and so incurs some performance overhead. It can be
   * disabled with a call to
   * \code
   * Playa::ErrorPolling::disable();
   * \endcode
   * IMPORTANT: all processors must agree on whether collective error
   * checking is enabled or disabled. If there are inconsistent states, the
   * reduction operations in pollForFailures() will hang because some
   * processors cannot be contacted.
   */
  class ErrorPolling
  {
  public:
    /** Call this function upon catching an std::exception in order to
     * inform other processors of the error. This function will do an
     * AllReduce in conjunction with calls to either this function or
     * its partner, pollForFailures(), on the other processors. This
     * procedure has the effect of communicating to the other
     * processors that an std::exception has been detected on this one.
     *
     * \param comm [in] The communicator on which the failure is reported
     */
    static void reportFailure(const MPIComm& comm);

    /** Call this function after std::exception-free completion of a
     * try/catch block. This function will do an AllReduce in
     * conjunction with calls to either this function or its partner,
     * reportFailure(), on the other processors.
     *
     * \param comm [in] The communicator on which polling will be done
     * \return true if a failure has been reported by another processor,
     * in which case an std::exception can be thrown by the caller.
     */
    static bool pollForFailures(const MPIComm& comm);

    /** Activate error polling. Polling is active by default. */
    static void enable() {isActive()=true;}

    /** Disable error polling */
    static void disable() {isActive()=false;}

  private:
    /** Set or check whether error polling is active. The flag is a
     * function-local static (initially true) returned by reference, so
     * enable() and disable() write through the returned reference. */
    static bool& isActive() {static bool rtn = true; return rtn;}
  };

  /**
   * This macro polls all processors in the given communicator to find
   * out whether an error has been reported by a call to
   * ErrorPolling::reportFailure(comm). If so, it raises an
   * std::runtime_error through TEUCHOS_TEST_FOR_EXCEPTION, with a message
   * naming the rank of the processor that detected the off-processor
   * error.
   *
   * The definition deliberately has no trailing semicolon: the caller
   * supplies it, as in
   * \code
   * TEUCHOS_POLL_FOR_FAILURES(comm);
   * \endcode
   * so the expansion does not leave behind an extra empty statement.
   *
   * @param comm [in] The communicator on which polling will be done
   */
#define TEUCHOS_POLL_FOR_FAILURES(comm)                                  \
  TEUCHOS_TEST_FOR_EXCEPTION(Playa::ErrorPolling::pollForFailures(comm), \
                     std::runtime_error,                                     \
                     "off-processor error detected by proc=" << (comm).getRank())
}
00125 
00126 //@}
00127 
00128 #endif

Site Contact