33 #include "m4ri_config.h" 36 #include <emmintrin.h> 54 __m128i *__c = (__m128i*)c;
55 __m128i *__t1 = (__m128i*)t1;
56 __m128i *__t2 = (__m128i*)t2;
57 __m128i *__t3 = (__m128i*)t3;
58 __m128i *__t4 = (__m128i*)t4;
59 __m128i *__t5 = (__m128i*)t5;
60 __m128i *__t6 = (__m128i*)t6;
61 __m128i *__t7 = (__m128i*)t7;
62 __m128i *__t8 = (__m128i*)t8;
63 const __m128i *eof = (__m128i*)((
unsigned long)(c + wide) & ~0xFUL);
67 xmm1 = _mm_xor_si128(*__c, *__t1++);
68 xmm1 = _mm_xor_si128(xmm1, *__t2++);
69 xmm1 = _mm_xor_si128(xmm1, *__t3++);
70 xmm1 = _mm_xor_si128(xmm1, *__t4++);
71 xmm1 = _mm_xor_si128(xmm1, *__t5++);
72 xmm1 = _mm_xor_si128(xmm1, *__t6++);
73 xmm1 = _mm_xor_si128(xmm1, *__t7++);
74 xmm1 = _mm_xor_si128(xmm1, *__t8++);
86 wide = ((
sizeof(
word) * wide) % 16) /
sizeof(
word);
89 for(
wi_t i = 0; i < wide; ++i) {
90 c[i] ^= t1[i] ^ t2[i] ^ t3[i] ^ t4[i] ^ t5[i] ^ t6[i] ^ t7[i] ^ t8[i];
93 __M4RI_DD_RAWROW(c, wide_in);
106 __m128i *__c = (__m128i*)c;
107 __m128i *__t1 = (__m128i*)t1;
108 __m128i *__t2 = (__m128i*)t2;
109 __m128i *__t3 = (__m128i*)t3;
110 __m128i *__t4 = (__m128i*)t4;
111 const __m128i *eof = (__m128i*)((
unsigned long)(c + wide) & ~0xFUL);
115 xmm1 = _mm_xor_si128(*__c, *__t1++);
116 xmm1 = _mm_xor_si128(xmm1, *__t2++);
117 xmm1 = _mm_xor_si128(xmm1, *__t3++);
118 xmm1 = _mm_xor_si128(xmm1, *__t4++);
126 wide = ((
sizeof(
word) * wide) % 16) /
sizeof(
word);
129 __M4RI_DD_RAWROW(c, wide_in);
132 #endif // __M4RI_HAVE_SSE2 133 wi_t n = (wide + 7) / 8;
135 case 0:
do { *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
136 case 7: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
137 case 6: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
138 case 5: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
139 case 4: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
140 case 3: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
141 case 2: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
142 case 1: *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
145 __M4RI_DD_RAWROW(c, wide_in);
158 __m128i *__c = (__m128i*)c;
159 __m128i *__t1 = (__m128i*)t1;
160 __m128i *__t2 = (__m128i*)t2;
161 __m128i *__t3 = (__m128i*)t3;
162 const __m128i *eof = (__m128i*)((
unsigned long)(c + wide) & ~0xFUL);
166 xmm1 = _mm_xor_si128(*__c, *__t1++);
167 xmm1 = _mm_xor_si128(xmm1, *__t2++);
168 xmm1 = _mm_xor_si128(xmm1, *__t3++);
175 wide = ((
sizeof(
word) * wide) % 16) /
sizeof(
word);
178 __M4RI_DD_RAWROW(c, wide_in);
181 #endif // __M4RI_HAVE_SSE2 182 wi_t n = (wide + 7) / 8;
184 case 0:
do { *c++ ^= *t1++ ^ *t2++ ^ *t3++;
185 case 7: *c++ ^= *t1++ ^ *t2++ ^ *t3++;
186 case 6: *c++ ^= *t1++ ^ *t2++ ^ *t3++;
187 case 5: *c++ ^= *t1++ ^ *t2++ ^ *t3++;
188 case 4: *c++ ^= *t1++ ^ *t2++ ^ *t3++;
189 case 3: *c++ ^= *t1++ ^ *t2++ ^ *t3++;
190 case 2: *c++ ^= *t1++ ^ *t2++ ^ *t3++;
191 case 1: *c++ ^= *t1++ ^ *t2++ ^ *t3++;
194 __M4RI_DD_RAWROW(c, wide_in);
208 __m128i *__c = (__m128i*)c;
209 __m128i *__t1 = (__m128i*)t1;
210 __m128i *__t2 = (__m128i*)t2;
211 const __m128i *eof = (__m128i*)((
unsigned long)(c + wide) & ~0xFUL);
215 xmm1 = _mm_xor_si128(*__c, *__t1++);
216 xmm1 = _mm_xor_si128(xmm1, *__t2++);
222 wide = ((
sizeof(
word) * wide) % 16) /
sizeof(
word);
225 __M4RI_DD_RAWROW(c, wide_in);
228 #endif // __M4RI_HAVE_SSE2 229 wi_t n = (wide + 7) / 8;
231 case 0:
do { *c++ ^= *t1++ ^ *t2++;
232 case 7: *c++ ^= *t1++ ^ *t2++;
233 case 6: *c++ ^= *t1++ ^ *t2++;
234 case 5: *c++ ^= *t1++ ^ *t2++;
235 case 4: *c++ ^= *t1++ ^ *t2++;
236 case 3: *c++ ^= *t1++ ^ *t2++;
237 case 2: *c++ ^= *t1++ ^ *t2++;
238 case 1: *c++ ^= *t1++ ^ *t2++;
241 __M4RI_DD_RAWROW(c, wide_in);
259 __m128i *__c = (__m128i*)c;
260 __m128i *__t1 = (__m128i*)t1;
261 const __m128i *eof = (__m128i*)((
unsigned long)(c + wide) & ~0xFUL);
266 xmm1 = _mm_xor_si128(*__c, *__t1++);
268 xmm1 = _mm_xor_si128(*__c, *__t1++);
273 xmm1 = _mm_xor_si128(*__c, *__t1++);
279 wide = ((
sizeof(
word) * wide) % 16) /
sizeof(
word);
282 __M4RI_DD_RAWROW(c, wide_in);
285 #endif // __M4RI_HAVE_SSE2 287 wi_t n = (wide + 7) / 8;
289 case 0:
do { *c++ ^= *t1++;
290 case 7: *c++ ^= *t1++;
291 case 6: *c++ ^= *t1++;
292 case 5: *c++ ^= *t1++;
293 case 4: *c++ ^= *t1++;
294 case 3: *c++ ^= *t1++;
295 case 2: *c++ ^= *t1++;
296 case 1: *c++ ^= *t1++;
299 __M4RI_DD_RAWROW(c, wide_in);
303 #ifdef __M4RI_M4RM_GRAY8 304 #define _MZD_COMBINE _mzd_combine8(c, t1, t2, t3, t4, t5, t6, t7, t8, wide) 305 #else // __M4RI_M4RM_GRAY8 306 #define _MZD_COMBINE _mzd_combine4(c, t1, t2, t3, t4, wide) 307 #endif // __M4RI_M4RM_GRAY8
static void _mzd_combine2(word *c, word const *t1, word const *t2, wi_t wide_in)
Definition: xor.h:203
#define __M4RI_ALIGNMENT(addr, n)
Return the alignment of addr w.r.t. n, i.e. the remainder of addr modulo n. For example, the address 17 is 1-aligned w.r.t. 16.
Definition: misc.h:421
static void _mzd_combine(word *c, word const *t1, wi_t wide_in)
Definition: xor.h:249
static void _mzd_combine4(word *c, word const *t1, word const *t2, word const *t3, word const *t4, wi_t wide_in)
Definition: xor.h:101
static void _mzd_combine3(word *c, word const *t1, word const *t2, word const *t3, wi_t wide_in)
Definition: xor.h:153
static void _mzd_combine8(word *c, word const *t1, word const *t2, word const *t3, word const *t4, word const *t5, word const *t6, word const *t7, word const *t8, wi_t wide_in)
Definition: xor.h:48
uint64_t word
A word is the basic data structure used to store packed bits.
Definition: misc.h:85
int wi_t
Type of word indexes.
Definition: misc.h:74