|
Sierra Toolkit
Version of the Day
|
00001 /*------------------------------------------------------------------------*/ 00002 /* Copyright 2010 Sandia Corporation. */ 00003 /* Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive */ 00004 /* license for use of this work by or on behalf of the U.S. Government. */ 00005 /* Export of this program may require a license from the */ 00006 /* United States Government. */ 00007 /*------------------------------------------------------------------------*/ 00008 00009 // Portions of this source code file are: 00010 // Copyright (c) 2010, Victor J. Duvanenko. 00011 // All rights reserved. 00012 // Used with permission. 00013 // 00014 // Redistribution and use in source and binary forms, with or without 00015 // modification, are permitted provided that the following conditions 00016 // are met: 00017 // * Redistributions of source code must retain the above copyright 00018 // notice, this list of conditions and the following disclaimer. 00019 // * Redistributions in binary form must reproduce the above copyright 00020 // notice, this list of conditions and the following disclaimer in the 00021 // documentation and/or other materials provided with the distribution. 00022 // * The name of Victor J. Duvanenko may not be used to endorse or 00023 // promote products derived from this software without specific prior 00024 // written permission. 00025 // 00026 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER "AS IS" AND ANY 00027 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00028 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 00029 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE 00030 // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 00031 // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 00032 // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 00033 // BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 00034 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 00035 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 00036 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00037 00038 #ifndef STK_UTIL_UTIL_RADIXSORT_HPP 00039 #define STK_UTIL_UTIL_RADIXSORT_HPP 00040 00041 // 00042 // Internal implementation 00043 // 00044 namespace { // anonymous namespace 00045 00046 // Swap that does not check for self-assignment. 00047 template< class T > 00048 inline void swap_impl( T& a, T& b ) 00049 { 00050 T tmp = a; 00051 a = b; 00052 b = tmp; 00053 } 00054 00055 // insertion sort similar to STL with no self-assignment 00056 template <class T> 00057 inline void insertion_sort_impl( T* a, size_t a_size ) 00058 { 00059 for ( size_t i = 1; i < a_size; ++i ) 00060 { 00061 if ( a[ i ] < a[ i - 1 ] ) // no need to do (j > 0) compare for the first iteration 00062 { 00063 T currentElement = a[ i ]; 00064 a[ i ] = a[ i - 1 ]; 00065 size_t j; 00066 for ( j = i - 1; j > 0 && currentElement < a[ j - 1 ]; --j ) 00067 { 00068 a[ j ] = a[ j - 1 ]; 00069 } 00070 a[ j ] = currentElement; // always necessary work/write 00071 } 00072 // Perform no work at all if the first comparison fails - i.e. never assign an element to itself! 00073 } 00074 } 00075 00076 // Recursive implementation of radix sort for unsigned integer types only. 00077 // 00078 // PowerOfTwoRadix - must be a power of 2, 256 is suggested for current hardware as of 03/20/2011. 00079 // Log2ofPowerOfTwoRadix - for example log( 256 ) = 8 00080 // Threshold - length below which array sections are sorted with an insertion sort 00081 // a - pointer to start of an array of integer type 00082 // last - one less than the length of the a array 00083 // bitMask - controls how many and which bits we process at a time 00084 // shiftRightAmount - sizeof(T) * 8 - Log2ofPowerOfTwoRadix 00085 template< class T, unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold > 00086 inline void radix_sort_unsigned_impl( T* a, long last, T bitMask, unsigned long shiftRightAmount ) 00087 { 00088 unsigned long count[ PowerOfTwoRadix ]; 00089 for( unsigned long i = 0; i < PowerOfTwoRadix; ++i ) count[ i ] = 0; 00090 for ( long _current = 0; _current <= last; ++_current ) count[ (unsigned long)(( a[ _current ] & bitMask ) >> shiftRightAmount ) ]++; // Scan the array and count the number of times each value appears 00091 00092 long startOfBin[ PowerOfTwoRadix + 1 ], endOfBin[ PowerOfTwoRadix ], nextBin = 1; 00093 startOfBin[ 0 ] = endOfBin[ 0 ] = 0; startOfBin[ PowerOfTwoRadix ] = -1; // sentinel 00094 for( unsigned long i = 1; i < PowerOfTwoRadix; ++i ) 00095 startOfBin[ i ] = endOfBin[ i ] = startOfBin[ i - 1 ] + count[ i - 1 ]; 00096 00097 for ( long _current = 0; _current <= last; ) 00098 { 00099 unsigned long digit; 00100 T _current_element = a[ _current ]; // get the compiler to recognize that a register can be used for the loop instead of a[_current] memory location 00101 while( endOfBin[ digit = (unsigned long)(( _current_element & bitMask ) >> shiftRightAmount )] != _current ) swap_impl( _current_element, a[ endOfBin[ digit ]++ ] ); 00102 a[ _current ] = _current_element; 00103 00104 endOfBin[ digit ]++; 00105 while( endOfBin[ nextBin - 1 ] == startOfBin[ nextBin ] ) ++nextBin; // skip over empty and full bins, when the end of the current bin reaches the start of the next bin 00106 _current = endOfBin[ nextBin - 1 ]; 00107 } 00108 bitMask >>= Log2ofPowerOfTwoRadix; 00109 if ( bitMask != 0 ) // end recursion when all the bits have been processes 00110 { 00111 if ( shiftRightAmount >= Log2ofPowerOfTwoRadix ) shiftRightAmount -= Log2ofPowerOfTwoRadix; 00112 else shiftRightAmount = 0; 00113 00114 for( unsigned long i = 0; i < PowerOfTwoRadix; ++i ) 00115 { 00116 long numberOfElements = endOfBin[ i ] - startOfBin[ i ]; 00117 if ( numberOfElements >= Threshold ) // endOfBin actually points to one beyond the bin 00118 radix_sort_unsigned_impl< T, PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >( &a[ startOfBin[ i ]], numberOfElements - 1, bitMask, shiftRightAmount ); 00119 else if ( numberOfElements >= 2 ) 00120 insertion_sort_impl( &a[ startOfBin[ i ]], numberOfElements ); 00121 } 00122 } 00123 } 00124 00125 } // end anonymous namespace 00126 00127 // 00128 // Public API 00129 // 00130 namespace stk_classic { 00131 namespace util { 00132 00153 template< class T > 00154 void radix_sort_unsigned( T* a, size_t a_size ) 00155 { 00156 const long Threshold = 25; // smaller array sections sorted by insertion sort 00157 const unsigned long PowerOfTwoRadix = 256; // Radix - must be a power of 2 00158 const unsigned long Log2ofPowerOfTwoRadix = 8; // log( 256 ) = 8 00159 00160 if ( a_size >= (size_t)Threshold ) { 00161 if (a_size > (size_t)std::numeric_limits<long>::max()) { 00162 std::ostringstream msg ; 00163 msg << "stk_classic::utility::radix_sort() exceeded allowable array size ("; 00164 msg << a_size << " < " << std::numeric_limits<long>::max() << ")"; 00165 throw std::runtime_error( msg.str() ); 00166 } 00167 unsigned long shiftRightAmount = sizeof(T) * 8 - Log2ofPowerOfTwoRadix; 00168 T bitMask = (T)( ((T)( PowerOfTwoRadix - 1 )) << shiftRightAmount ); // bitMask controls/selects how many and which bits we process at a time 00169 radix_sort_unsigned_impl< T, PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >( a, a_size - 1, bitMask, shiftRightAmount ); 00170 } 00171 else 00172 insertion_sort_impl( a, a_size ); 00173 } 00174 00175 } // namespace util 00176 } // namespace stk_classic 00177 00178 00179 //-------------------------------------- 00180 // PARALLEL RADIX SORT using TBB library 00181 //-------------------------------------- 00182 00183 /* TODO: this code works with TBB, it just needs minor modifications and some renaming 00184 template< unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold, class T > 00185 class RadixInPlaceOperation_3 00186 { 00187 T* my_a; // a local copy to the input array to provide a pointer to each parallel task 00188 long* my_startOfBin; 00189 long* my_endOfBin; 00190 T my_bitMask; // a local copy of the bitMask 00191 unsigned long my_shiftRightAmount; 00192 static const unsigned long Threshold_P = 10000; // threshold when to switch between parallel and non-parallel implementations 00193 public: 00194 static const unsigned long numberOfBins = PowerOfTwoRadix; 00195 unsigned long my_count[ numberOfBins ]; // the count for this task 00196 00197 RadixInPlaceOperation_3( T a[], long* startOfBin, long* endOfBin, T bitMask, unsigned long shiftRightAmount ) : 00198 my_a( a ), my_startOfBin( startOfBin ), my_endOfBin( endOfBin ), 00199 my_bitMask( bitMask ), my_shiftRightAmount( shiftRightAmount ) 00200 {} 00201 void operator()( const blocked_range< long >& r ) const 00202 { 00203 for( long i = r.begin(); i != r.end(); ++i ) 00204 { 00205 long numOfElements = my_endOfBin[ i ] - my_startOfBin[ i ]; 00206 if ( numOfElements >= Threshold_P ) 00207 _RadixSort_Unsigned_PowerOf2Radix_3< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >( &my_a[ my_startOfBin[ i ]], numOfElements - 1, my_bitMask, my_shiftRightAmount ); 00208 else if ( numOfElements >= Threshold ) 00209 _RadixSort_Unsigned_PowerOf2Radix_1< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >( &my_a[ my_startOfBin[ i ]], numOfElements - 1, my_bitMask, my_shiftRightAmount ); 00210 else if ( numOfElements >= 2 ) 00211 insertion_sort_impl( &my_a[ my_startOfBin[ i ]], numOfElements ); 00212 } 00213 } 00214 }; 00215 00216 template< unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold, class T > 00217 inline void _RadixSort_Unsigned_PowerOf2Radix_3( T* a, long last, T bitMask, unsigned long shiftRightAmount ) 00218 { 00219 const unsigned long numberOfBins = PowerOfTwoRadix; 00220 00221 CountingType_1< PowerOfTwoRadix, T > count( a, bitMask, shiftRightAmount ); // contains the count array, which is initialized to all zeros 00222 // Scan the array and count the number of times each value appears 00223 parallel_reduce( blocked_range< unsigned long >( 0, last + 1, 1000 ), count ); 00224 00225 long startOfBin[ numberOfBins ], endOfBin[ numberOfBins ], nextBin; 00226 startOfBin[ 0 ] = endOfBin[ 0 ] = nextBin = 0; 00227 for( unsigned long i = 1; i < numberOfBins; i++ ) 00228 startOfBin[ i ] = endOfBin[ i ] = startOfBin[ i - 1 ] + count.my_count[ i - 1 ]; 00229 00230 for ( long _current = 0; _current <= last; ) 00231 { 00232 unsigned long digit; 00233 T tmp = a[ _current ]; // get the compiler to recognize that a register can be used for the loop instead of a[_current] memory location 00234 while ( true ) { 00235 digit = (unsigned long)(( tmp & bitMask ) >> shiftRightAmount ); // extract the digit we are sorting based on 00236 if ( endOfBin[ digit ] == _current ) 00237 break; 00238 swap_impl( tmp, a[ endOfBin[ digit ] ] ); 00239 endOfBin[ digit ]++; 00240 } 00241 a[ _current ] = tmp; 00242 00243 endOfBin[ digit ]++; // leave the element at its location and grow the bin 00244 _current++; // advance the current pointer to the next element 00245 while( _current >= startOfBin[ nextBin ] && nextBin < numberOfBins ) 00246 nextBin++; 00247 while( endOfBin[ nextBin - 1 ] == startOfBin[ nextBin ] && nextBin < numberOfBins ) 00248 nextBin++; 00249 if ( _current < endOfBin[ nextBin - 1 ] ) 00250 _current = endOfBin[ nextBin - 1 ]; 00251 } 00252 bitMask >>= Log2ofPowerOfTwoRadix; 00253 if ( bitMask != 0 ) // end recursion when all the bits have been processes 00254 { 00255 if ( shiftRightAmount >= Log2ofPowerOfTwoRadix ) shiftRightAmount -= Log2ofPowerOfTwoRadix; 00256 else shiftRightAmount = 0; 00257 00258 RadixInPlaceOperation_3< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold, T > radixOp( a, startOfBin, endOfBin, bitMask, shiftRightAmount ); 00259 parallel_for( blocked_range< long >( 0, numberOfBins ), radixOp ); 00260 } 00261 } 00262 00263 template< unsigned long PowerOfTwoRadix, unsigned long Log2ofPowerOfTwoRadix, long Threshold, class T > 00264 inline void _RadixSort_Unsigned_PowerOf2Radix_1( T* a, long last, T bitMask, unsigned long shiftRightAmount ) 00265 { 00266 const unsigned long numberOfBins = PowerOfTwoRadix; 00267 unsigned long count[ numberOfBins ]; 00268 // Counting occurrence of digits within the array (related to Counting Sort) 00269 for( unsigned long i = 0; i < numberOfBins; i++ ) 00270 count[ i ] = 0; 00271 00272 for ( long _current = 0; _current <= last; _current++ ) // Scan the array and count the number of times each value appears 00273 { 00274 unsigned long digit = (unsigned long)(( a[ _current ] & bitMask ) >> shiftRightAmount ); // extract the digit we are sorting based on 00275 count[ digit ]++; 00276 } 00277 // Moving array elements into their bins 00278 long startOfBin[ numberOfBins ], endOfBin[ numberOfBins ], nextBin; 00279 startOfBin[ 0 ] = endOfBin[ 0 ] = nextBin = 0; 00280 for( unsigned long i = 1; i < numberOfBins; i++ ) 00281 startOfBin[ i ] = endOfBin[ i ] = startOfBin[ i - 1 ] + count[ i - 1 ]; 00282 00283 for ( long _current = 0; _current <= last; ) 00284 { 00285 unsigned long digit; 00286 T tmp = a[ _current ]; // get the compiler to recognize that a register can be used for the loop instead of a[_current] memory location 00287 while ( true ) { 00288 digit = (unsigned long)(( tmp & bitMask ) >> shiftRightAmount ); // extract the digit we are sorting based on 00289 if ( endOfBin[ digit ] == _current ) 00290 break; 00291 swap_impl( tmp, a[ endOfBin[ digit ] ] ); 00292 endOfBin[ digit ]++; 00293 } 00294 a[ _current ] = tmp; 00295 00296 endOfBin[ digit ]++; // leave the element at its location and grow the bin 00297 _current++; // advance the current pointer to the next element 00298 while( _current >= startOfBin[ nextBin ] && nextBin < numberOfBins ) 00299 nextBin++; 00300 while( endOfBin[ nextBin - 1 ] == startOfBin[ nextBin ] && nextBin < numberOfBins ) 00301 nextBin++; 00302 if ( _current < endOfBin[ nextBin - 1 ] ) 00303 _current = endOfBin[ nextBin - 1 ]; 00304 } 00305 // Recursion for each bin 00306 bitMask >>= Log2ofPowerOfTwoRadix; 00307 if ( bitMask != 0 ) // end recursion when all the bits have been processes 00308 { 00309 if ( shiftRightAmount >= Log2ofPowerOfTwoRadix ) shiftRightAmount -= Log2ofPowerOfTwoRadix; 00310 else shiftRightAmount = 0; 00311 00312 for( unsigned long i = 0; i < numberOfBins; i++ ) 00313 { 00314 long numberOfElements = endOfBin[ i ] - startOfBin[ i ]; 00315 if ( numberOfElements >= Threshold ) { // endOfBin actually points to one beyond the bin 00316 _RadixSort_Unsigned_PowerOf2Radix_1< T, PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >( &a[ startOfBin[ i ]], numberOfElements - 1, bitMask, shiftRightAmount ); 00317 } 00318 else if ( numberOfElements >= 2 ) { 00319 insertion_sort_impl( &a[ startOfBin[ i ]], numberOfElements ); 00320 } 00321 } 00322 } 00323 } 00324 00325 // Top-level template function 00326 template< class T > 00327 inline void RadixSortParallel( T* a, unsigned long a_size ) 00328 { 00329 if ( a_size < 2 ) return; 00330 00331 const unsigned long Threshold = 32; // Threshold of when to switch to using Insertion Sort 00332 const unsigned long PowerOfTwoRadix = 256; // Radix - must be a power of 2 00333 const unsigned long Log2ofPowerOfTwoRadix = 8; 00334 unsigned long shiftRightAmount = sizeof( T ) * 8 - Log2ofPowerOfTwoRadix; // Create bit-mask and shift right amount 00335 T bitMask = (T)( ((T)( PowerOfTwoRadix - 1 )) << shiftRightAmount ); // bitMask controls/selects how many and which bits we process at a time 00336 00337 if ( a_size >= Threshold ) 00338 _RadixSort_Unsigned_PowerOf2Radix_3< PowerOfTwoRadix, Log2ofPowerOfTwoRadix, Threshold >( a, a_size - 1, bitMask, shiftRightAmount ); 00339 else 00340 insertion_sort_impl( a, a_size ); 00341 } 00342 00343 */ 00344 00345 00346 00347 #endif /* STK_UTIL_UTIL_RADIXSORT_HPP */