
bmsse2.h

/*
Copyright(c) 2002-2005 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of the Software,
and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information please visit:  http://bmagic.sourceforge.net

*/


#ifndef BMSSE2__H__INCLUDED__
#define BMSSE2__H__INCLUDED__


//    Header implements processor-specific intrinsic declarations for the SSE2
//    instruction set
#include <emmintrin.h>



namespace bm
{

/** @defgroup SSE2 Processor-specific optimizations for SSE2 instructions
 *  @ingroup bmagic
 */


/*! 
  @brief SSE2 reinitialization guard class

  _mm_empty() must be called when MMX integer instructions are intermixed
  with floating point arithmetic. This class guards critical code fragments
  where SSE2/MMX integer code is used.

  @ingroup SSE2

*/
class sse2_empty_guard
{
public:
    BMFORCEINLINE sse2_empty_guard() 
    {
        _mm_empty();
    }

    BMFORCEINLINE ~sse2_empty_guard() 
    {
        _mm_empty();
    }
};
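
/*
    Illustrative usage sketch (not part of the original header): the guard
    clears MMX state with _mm_empty() both when it is constructed and when
    it goes out of scope.

        {
            bm::sse2_empty_guard guard;  // _mm_empty() on entry
            // ... integer SIMD code mixed with floating point ...
        }                                // _mm_empty() again on scope exit
*/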

/*
# ifndef BM_SET_MMX_GUARD
#  define BM_SET_MMX_GUARD  sse2_empty_guard  bm_mmx_guard_;
# endif
*/

/*! 
    @brief XOR array elements with the specified mask
    *dst = *src ^ mask

    @ingroup SSE2
*/
BMFORCEINLINE 
void sse2_xor_arr_2_mask(__m128i* BMRESTRICT dst, 
                         const __m128i* BMRESTRICT src, 
                         const __m128i* BMRESTRICT src_end,
                         bm::word_t mask)
{
     __m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask);
     do
     {
        __m128i xmm1 = _mm_load_si128(src);

        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst, xmm1);
        ++dst;
        ++src;

     } while (src < src_end);
}

/*! 
    @brief Invert array elements and AND them with the specified mask
    *dst = ~*src & mask

    @ingroup SSE2
*/
BMFORCEINLINE 
void sse2_andnot_arr_2_mask(__m128i* BMRESTRICT dst, 
                            const __m128i* BMRESTRICT src, 
                            const __m128i* BMRESTRICT src_end,
                            bm::word_t mask)
{
     __m128i xmm2 = _mm_set_epi32(mask, mask, mask, mask);
     do
     {
        //_mm_prefetch((const char*)(src)+1024, _MM_HINT_NTA);
        //_mm_prefetch((const char*)(src)+1088, _MM_HINT_NTA);

        __m128i xmm1 = _mm_load_si128(src);

        xmm1 = _mm_andnot_si128(xmm1, xmm2); // xmm1 = (~xmm1) & xmm2 
        _mm_store_si128(dst, xmm1);
        ++dst;
        ++src;

     } while (src < src_end);
}
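
/*
    Reference scalar equivalent (illustrative only, not part of the original
    header) of sse2_xor_arr_2_mask; sse2_andnot_arr_2_mask is analogous with
    dst[i] = ~src[i] & mask:

        void xor_arr_2_mask_scalar(bm::word_t* dst, const bm::word_t* src,
                                   const bm::word_t* src_end, bm::word_t mask)
        {
            do { *dst++ = *src++ ^ mask; } while (src < src_end);
        }

    The SSE2 versions require 16-byte aligned pointers and a range that is a
    whole number of __m128i values (_mm_load_si128/_mm_store_si128).
*/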

/*! 
    @brief AND array elements against another array
    *dst &= *src

    @ingroup SSE2
*/
BMFORCEINLINE 
void sse2_and_arr(__m128i* BMRESTRICT dst, 
                  const __m128i* BMRESTRICT src, 
                  const __m128i* BMRESTRICT src_end)
{
    __m128i xmm1, xmm2;
    do
    {
        _mm_prefetch((const char*)(src)+512,  _MM_HINT_NTA);
    
        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_and_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);
        
        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_and_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_and_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_and_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

    } while (src < src_end);

}
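
/*
    Note on sse2_and_arr above and the or/xor/sub variants that follow: each
    loop iteration consumes four __m128i values, so the distance from src to
    src_end is assumed to be a multiple of four 128-bit words (the bit-block
    sizes used by the library satisfy this). A reference scalar equivalent of
    sse2_and_arr (illustrative only):

        void and_arr_scalar(bm::word_t* dst, const bm::word_t* src,
                            const bm::word_t* src_end)
        {
            do { *dst++ &= *src++; } while (src < src_end);
        }
*/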


/*! 
    @brief OR array elements against another array
    *dst |= *src

    @ingroup SSE2
*/
BMFORCEINLINE 
void sse2_or_arr(__m128i* BMRESTRICT dst, 
                 const __m128i* BMRESTRICT src, 
                 const __m128i* BMRESTRICT src_end)
{
    __m128i xmm1, xmm2;
    do
    {
        _mm_prefetch((const char*)(src)+512,  _MM_HINT_NTA);
    
        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_or_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);
        
        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_or_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_or_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_or_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

    } while (src < src_end);
}

/*! 
    @brief XOR array elements against another array
    *dst ^= *src

    @ingroup SSE2
*/
BMFORCEINLINE 
void sse2_xor_arr(__m128i* BMRESTRICT dst, 
                  const __m128i* BMRESTRICT src, 
                  const __m128i* BMRESTRICT src_end)
{
    __m128i xmm1, xmm2;
    do
    {
        _mm_prefetch((const char*)(src)+512,  _MM_HINT_NTA);
    
        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);
        
        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_xor_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

    } while (src < src_end);
}


/*! 
    @brief AND-NOT (SUB) array elements against another array
    *dst &= ~*src

    @ingroup SSE2
*/
BMFORCEINLINE 
void sse2_sub_arr(__m128i* BMRESTRICT dst, 
                 const __m128i* BMRESTRICT src, 
                 const __m128i* BMRESTRICT src_end)
{
    __m128i xmm1, xmm2;
    do
    {
        _mm_prefetch((const char*)(src)+512,  _MM_HINT_NTA);
    
        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);
        
        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

        xmm1 = _mm_load_si128(src++);
        xmm2 = _mm_load_si128(dst);
        xmm1 = _mm_andnot_si128(xmm1, xmm2);
        _mm_store_si128(dst++, xmm1);

    } while (src < src_end);    
}

/*! 
    @brief SSE2 block memset
    *dst = value

    @ingroup SSE2
*/
BMFORCEINLINE 
void sse2_set_block(__m128i* BMRESTRICT dst, 
                    __m128i* BMRESTRICT dst_end, 
                    bm::word_t value)
{
    __m128i xmm0 = _mm_set_epi32 (value, value, value, value);
    do
    {            
        _mm_store_si128(dst, xmm0);
/*        
        _mm_store_si128(dst+1, xmm0);
        _mm_store_si128(dst+2, xmm0);
        _mm_store_si128(dst+3, xmm0);

        _mm_store_si128(dst+4, xmm0);
        _mm_store_si128(dst+5, xmm0);
        _mm_store_si128(dst+6, xmm0);
        _mm_store_si128(dst+7, xmm0);

        dst += 8;
*/        
    } while (++dst < dst_end);
    
    _mm_sfence();
}
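
/*
    Illustrative usage sketch (not part of the original header): fill a block
    with all-one bits. blk is assumed to be a 16-byte aligned array of
    bm::word_t; the 2048-word size is only an example, and the range must be
    a whole number of __m128i values.

        sse2_set_block((__m128i*)blk, (__m128i*)(blk + 2048), ~0u);
*/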

/*! 
    @brief SSE2 block copy
    *dst = *src

    @ingroup SSE2
*/
BMFORCEINLINE 
void sse2_copy_block(__m128i* BMRESTRICT dst, 
                     const __m128i* BMRESTRICT src, 
                     const __m128i* BMRESTRICT src_end)
{
    __m128i xmm0, xmm1, xmm2, xmm3;
    do
    {
        _mm_prefetch((const char*)(src)+512,  _MM_HINT_NTA);
    
        xmm0 = _mm_load_si128(src+0);
        xmm1 = _mm_load_si128(src+1);
        xmm2 = _mm_load_si128(src+2);
        xmm3 = _mm_load_si128(src+3);
        
        _mm_store_si128(dst+0, xmm0);
        _mm_store_si128(dst+1, xmm1);
        _mm_store_si128(dst+2, xmm2);
        _mm_store_si128(dst+3, xmm3);
        
        xmm0 = _mm_load_si128(src+4);
        xmm1 = _mm_load_si128(src+5);
        xmm2 = _mm_load_si128(src+6);
        xmm3 = _mm_load_si128(src+7);
        
        _mm_store_si128(dst+4, xmm0);
        _mm_store_si128(dst+5, xmm1);
        _mm_store_si128(dst+6, xmm2);
        _mm_store_si128(dst+7, xmm3);
        
        src += 8;
        dst += 8;
        
    } while (src < src_end);    
}


/*! 
    @brief Invert array elements
    *dst = ~*dst

    @ingroup SSE2
*/
BMFORCEINLINE 
void sse2_invert_arr(bm::word_t* first, bm::word_t* last)
{
    __m128i xmm1 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 
                                 0xFFFFFFFF, 0xFFFFFFFF);
    __m128i* wrd_ptr = (__m128i*)first;

    do 
    {
        _mm_prefetch((const char*)(wrd_ptr)+512,  _MM_HINT_NTA);
        
        __m128i xmm0 = _mm_load_si128(wrd_ptr);
        xmm0 = _mm_xor_si128(xmm0, xmm1);
        _mm_store_si128(wrd_ptr, xmm0);
        ++wrd_ptr;
    } while (wrd_ptr < (__m128i*)last);
}


/*!
    SSE2-optimized bit counting function. Implements the parallel bit counting
    algorithm below using SSE2 integer instructions.

<pre>
unsigned CalcBitCount32(unsigned b)
{
    b = (b & 0x55555555) + (b >> 1 & 0x55555555);
    b = (b & 0x33333333) + (b >> 2 & 0x33333333);
    b = (b + (b >> 4)) & 0x0F0F0F0F;
    b = b + (b >> 8);
    b = (b + (b >> 16)) & 0x0000003F;
    return b;
}
</pre>

    @ingroup SSE2

*/
inline 
bm::id_t sse2_bit_count(const __m128i* block, const __m128i* block_end)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Loading masks
    __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // cnt = 0

    __m128i tmp1, tmp2;
    do
    {        
        __m128i b = _mm_load_si128(block);
        ++block;

        // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
        tmp1 = _mm_srli_epi32(b, 1);                    // tmp1 = (b >> 1 & 0x55555555)
        tmp1 = _mm_and_si128(tmp1, m1); 
        tmp2 = _mm_and_si128(b, m1);                    // tmp2 = (b & 0x55555555)
        b    = _mm_add_epi32(tmp1, tmp2);               // b = tmp1 + tmp2

        // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
        tmp1 = _mm_srli_epi32(b, 2);                    // (b >> 2 & 0x33333333)
        tmp1 = _mm_and_si128(tmp1, m2); 
        tmp2 = _mm_and_si128(b, m2);                    // (b & 0x33333333)
        b    = _mm_add_epi32(tmp1, tmp2);               // b = tmp1 + tmp2

        // b = (b + (b >> 4)) & 0x0F0F0F0F;
        tmp1 = _mm_srli_epi32(b, 4);                    // tmp1 = b >> 4
        b = _mm_add_epi32(b, tmp1);                     // b = b + (b >> 4)
        b = _mm_and_si128(b, m3);                       //           & 0x0F0F0F0F

        // b = b + (b >> 8);
        tmp1 = _mm_srli_epi32 (b, 8);                   // tmp1 = b >> 8
        b = _mm_add_epi32(b, tmp1);                     // b = b + (b >> 8)

        // b = (b + (b >> 16)) & 0x0000003F;
        tmp1 = _mm_srli_epi32 (b, 16);                  // b >> 16
        b = _mm_add_epi32(b, tmp1);                     // b + (b >> 16)
        b = _mm_and_si128(b, m4);                       // (b + (b >> 16)) & 0x0000003F

        mcnt = _mm_add_epi32(mcnt, b);                  // mcnt += b

    } while (block < block_end);

    // Note: __declspec(align(16)) is MSVC-specific; GCC/Clang builds would
    // need __attribute__((aligned(16))) or an equivalent alignment macro.
    __declspec(align(16)) bm::id_t tcnt[4];
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}
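
/*
    Illustrative usage sketch (not part of the original header): count all set
    bits in a 16-byte aligned array of bm::word_t; the 2048-word size is just
    an example and must map onto whole __m128i values.

        bm::id_t cnt = sse2_bit_count((const __m128i*)blk,
                                      (const __m128i*)(blk + 2048));
*/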

// Operation functors passed to sse2_bit_count_op (see below)
BMFORCEINLINE 
__m128i sse2_and(__m128i a, __m128i b)
{
    return _mm_and_si128(a, b);
}

BMFORCEINLINE 
__m128i sse2_or(__m128i a, __m128i b)
{
    return _mm_or_si128(a, b);
}


BMFORCEINLINE 
__m128i sse2_xor(__m128i a, __m128i b)
{
    return _mm_xor_si128(a, b);
}

BMFORCEINLINE 
__m128i sse2_sub(__m128i a, __m128i b)
{
    return _mm_andnot_si128(b, a); // a & ~b
}


template<class Func>
bm::id_t sse2_bit_count_op(const __m128i* BMRESTRICT block, 
                           const __m128i* BMRESTRICT block_end,
                           const __m128i* BMRESTRICT mask_block,
                           Func sse2_func)
{
    const unsigned mu1 = 0x55555555;
    const unsigned mu2 = 0x33333333;
    const unsigned mu3 = 0x0F0F0F0F;
    const unsigned mu4 = 0x0000003F;

    // Loading masks
    __m128i m1 = _mm_set_epi32 (mu1, mu1, mu1, mu1);
    __m128i m2 = _mm_set_epi32 (mu2, mu2, mu2, mu2);
    __m128i m3 = _mm_set_epi32 (mu3, mu3, mu3, mu3);
    __m128i m4 = _mm_set_epi32 (mu4, mu4, mu4, mu4);
    __m128i mcnt;
    mcnt = _mm_xor_si128(m1, m1); // cnt = 0
    do
    {
        __m128i tmp1, tmp2;
        __m128i b = _mm_load_si128(block++);

        tmp1 = _mm_load_si128(mask_block++);
        
        b = sse2_func(b, tmp1);
                        
        // b = (b & 0x55555555) + (b >> 1 & 0x55555555);
        tmp1 = _mm_srli_epi32(b, 1);                    // tmp1 = (b >> 1 & 0x55555555)
        tmp1 = _mm_and_si128(tmp1, m1); 
        tmp2 = _mm_and_si128(b, m1);                    // tmp2 = (b & 0x55555555)
        b    = _mm_add_epi32(tmp1, tmp2);               // b = tmp1 + tmp2

        // b = (b & 0x33333333) + (b >> 2 & 0x33333333);
        tmp1 = _mm_srli_epi32(b, 2);                    // (b >> 2 & 0x33333333)
        tmp1 = _mm_and_si128(tmp1, m2); 
        tmp2 = _mm_and_si128(b, m2);                    // (b & 0x33333333)
        b    = _mm_add_epi32(tmp1, tmp2);               // b = tmp1 + tmp2

        // b = (b + (b >> 4)) & 0x0F0F0F0F;
        tmp1 = _mm_srli_epi32(b, 4);                    // tmp1 = b >> 4
        b = _mm_add_epi32(b, tmp1);                     // b = b + (b >> 4)
        b = _mm_and_si128(b, m3);                       //           & 0x0F0F0F0F

        // b = b + (b >> 8);
        tmp1 = _mm_srli_epi32 (b, 8);                   // tmp1 = b >> 8
        b = _mm_add_epi32(b, tmp1);                     // b = b + (b >> 8)
        
        // b = (b + (b >> 16)) & 0x0000003F;
        tmp1 = _mm_srli_epi32 (b, 16);                  // b >> 16
        b = _mm_add_epi32(b, tmp1);                     // b + (b >> 16)
        b = _mm_and_si128(b, m4);                       // (b + (b >> 16)) & 0x0000003F

        mcnt = _mm_add_epi32(mcnt, b);                  // mcnt += b

    } while (block < block_end);

    // Note: __declspec(align(16)) is MSVC-specific (see sse2_bit_count above).
    __declspec(align(16)) bm::id_t tcnt[4];
    _mm_store_si128((__m128i*)tcnt, mcnt);

    return tcnt[0] + tcnt[1] + tcnt[2] + tcnt[3];
}
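
/*
    Illustrative usage sketch (not part of the original header): the functors
    above let sse2_bit_count_op compute the population count of a combined
    block without writing the intermediate result, e.g. popcount(blk1 & blk2);
    blk1/blk2 are assumed 16-byte aligned and the 2048-word size is only an
    example:

        bm::id_t cnt = sse2_bit_count_op((const __m128i*)blk1,
                                         (const __m128i*)(blk1 + 2048),
                                         (const __m128i*)blk2,
                                         sse2_and);

    This is the same pattern the VECT_BITCOUNT_* macros below expand to.
*/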



#define VECT_XOR_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_xor_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), mask)

#define VECT_ANDNOT_ARR_2_MASK(dst, src, src_end, mask)\
    sse2_andnot_arr_2_mask((__m128i*)(dst), (__m128i*)(src), (__m128i*)(src_end), mask)

#define VECT_BITCOUNT(first, last) \
    sse2_bit_count((__m128i*) (first), (__m128i*) (last)) 

#define VECT_BITCOUNT_AND(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_and) 

#define VECT_BITCOUNT_OR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_or) 

#define VECT_BITCOUNT_XOR(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_xor) 

#define VECT_BITCOUNT_SUB(first, last, mask) \
    sse2_bit_count_op((__m128i*) (first), (__m128i*) (last), (__m128i*) (mask), sse2_sub) 

#define VECT_INVERT_ARR(first, last) \
    sse2_invert_arr(first, last);

#define VECT_AND_ARR(dst, src, src_end) \
    sse2_and_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))

#define VECT_OR_ARR(dst, src, src_end) \
    sse2_or_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))

#define VECT_SUB_ARR(dst, src, src_end) \
    sse2_sub_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))

#define VECT_XOR_ARR(dst, src, src_end) \
    sse2_xor_arr((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))

#define VECT_COPY_BLOCK(dst, src, src_end) \
    sse2_copy_block((__m128i*) dst, (__m128i*) (src), (__m128i*) (src_end))

#define VECT_SET_BLOCK(dst, dst_end, value) \
    sse2_set_block((__m128i*) dst, (__m128i*) (dst_end), (value))

} // namespace

#endif
