M4RI 1.0.1
xor.h
/**
 * \file xor.h
 *
 * \brief Functions for adding (XORing) rows of words.
 */

#ifndef M4RI_XOR_H
#define M4RI_XOR_H

/*******************************************************************
 *
 * M4RI: Linear Algebra over GF(2)
 *
 * Copyright (C) 2008-2010 Martin Albrecht <martinralbrecht@googlemail.com>
 *
 * Distributed under the terms of the GNU General Public License (GPL)
 * version 2 or higher.
 *
 * This code is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * The full text of the GPL is available at:
 *
 * http://www.gnu.org/licenses/
 *
 ********************************************************************/

#include <m4ri/m4ri_config.h>

#if __M4RI_HAVE_SSE2
#include <emmintrin.h>
#endif

#include <m4ri/misc.h>

/**
 * Compute c[i] ^= t1[i] ^ t2[i] ^ t3[i] ^ t4[i] ^ t5[i] ^ t6[i] ^ t7[i] ^ t8[i] for 0 <= i < wide_in.
 */
static inline void _mzd_combine8(word *c, word const *t1, word const *t2, word const *t3, word const *t4,
                                 word const *t5, word const *t6, word const *t7, word const *t8, wi_t wide_in) {
  wi_t wide = wide_in;
#if __M4RI_HAVE_SSE2
  /* assuming t1 ... t8 are aligned, but c might not be */
  if (__M4RI_ALIGNMENT(c,16)==0) {
    __m128i *__c  = (__m128i*)c;
    __m128i *__t1 = (__m128i*)t1;
    __m128i *__t2 = (__m128i*)t2;
    __m128i *__t3 = (__m128i*)t3;
    __m128i *__t4 = (__m128i*)t4;
    __m128i *__t5 = (__m128i*)t5;
    __m128i *__t6 = (__m128i*)t6;
    __m128i *__t7 = (__m128i*)t7;
    __m128i *__t8 = (__m128i*)t8;
    const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL);
    __m128i xmm1;

    while(__c < eof) {
      xmm1 = _mm_xor_si128(*__c, *__t1++);
      xmm1 = _mm_xor_si128(xmm1, *__t2++);
      xmm1 = _mm_xor_si128(xmm1, *__t3++);
      xmm1 = _mm_xor_si128(xmm1, *__t4++);
      xmm1 = _mm_xor_si128(xmm1, *__t5++);
      xmm1 = _mm_xor_si128(xmm1, *__t6++);
      xmm1 = _mm_xor_si128(xmm1, *__t7++);
      xmm1 = _mm_xor_si128(xmm1, *__t8++);
      *__c++ = xmm1;
    }
    c  = (word*)__c;
    t1 = (word*)__t1;
    t2 = (word*)__t2;
    t3 = (word*)__t3;
    t4 = (word*)__t4;
    t5 = (word*)__t5;
    t6 = (word*)__t6;
    t7 = (word*)__t7;
    t8 = (word*)__t8;
    /* number of words left over after the 128-bit loop */
    wide = ((sizeof(word) * wide) % 16) / sizeof(word);
  }
#endif
  for(wi_t i = 0; i < wide; ++i) {
    c[i] ^= t1[i] ^ t2[i] ^ t3[i] ^ t4[i] ^ t5[i] ^ t6[i] ^ t7[i] ^ t8[i];
  }

  __M4RI_DD_RAWROW(c, wide_in);
}

/**
 * Compute c[i] ^= t1[i] ^ t2[i] ^ t3[i] ^ t4[i] ^ t5[i] ^ t6[i] for 0 <= i < wide_in.
 */
static inline void _mzd_combine6(word *c, word const *t1, word const *t2, word const *t3, word const *t4, word const *t5, word const *t6, wi_t wide_in) {
  wi_t wide = wide_in;
#if __M4RI_HAVE_SSE2
  /* assuming t1 ... t6 are aligned, but c might not be */
  assert(__M4RI_ALIGNMENT(c,16) == 8 || __M4RI_ALIGNMENT(c,16) == 0);
  assert(__M4RI_ALIGNMENT(c,16) == __M4RI_ALIGNMENT(t1,16));

  if (__M4RI_ALIGNMENT(c,16) == 8) {
    *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++;
    wide--;
  }

  __m128i *__c  = (__m128i*)c;
  __m128i *__t1 = (__m128i*)t1;
  __m128i *__t2 = (__m128i*)t2;
  __m128i *__t3 = (__m128i*)t3;
  __m128i *__t4 = (__m128i*)t4;
  __m128i *__t5 = (__m128i*)t5;
  __m128i *__t6 = (__m128i*)t6;
  const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL);
  __m128i xmm1;

  while(__c < eof) {
    xmm1 = _mm_xor_si128(*__c, *__t1++);
    xmm1 = _mm_xor_si128(xmm1, *__t2++);
    xmm1 = _mm_xor_si128(xmm1, *__t3++);
    xmm1 = _mm_xor_si128(xmm1, *__t4++);
    xmm1 = _mm_xor_si128(xmm1, *__t5++);
    xmm1 = _mm_xor_si128(xmm1, *__t6++);
    *__c++ = xmm1;
  }
  c  = (word*)__c;
  t1 = (word*)__t1;
  t2 = (word*)__t2;
  t3 = (word*)__t3;
  t4 = (word*)__t4;
  t5 = (word*)__t5;
  t6 = (word*)__t6;
  wide = ((sizeof(word) * wide) % 16) / sizeof(word);

  if(wide) {
    *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++;
  }
  __M4RI_DD_RAWROW(c, wide_in);
  return;
#else
  /* Duff's device: unrolled word-wise XOR loop for the plain C fallback */
  wi_t n = (wide + 7) / 8;
  switch (wide % 8) {
  case 0: do { *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++;
  case 7:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++;
  case 6:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++;
  case 5:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++;
  case 4:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++;
  case 3:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++;
  case 2:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++;
  case 1:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++ ^ *t6++;
    } while (--n > 0);
  }
  __M4RI_DD_RAWROW(c, wide_in);
  return;
#endif // __M4RI_HAVE_SSE2
}

/**
 * Compute c[i] ^= t1[i] ^ t2[i] ^ t3[i] ^ t4[i] ^ t5[i] for 0 <= i < wide_in.
 */
static inline void _mzd_combine5(word *c, word const *t1, word const *t2, word const *t3, word const *t4, word const *t5, wi_t wide_in) {
  wi_t wide = wide_in;
#if __M4RI_HAVE_SSE2
  /* assuming t1 ... t5 are aligned, but c might not be */
  assert(__M4RI_ALIGNMENT(c,16) == 8 || __M4RI_ALIGNMENT(c,16) == 0);
  assert(__M4RI_ALIGNMENT(c,16) == __M4RI_ALIGNMENT(t1,16));

  if (__M4RI_ALIGNMENT(c,16) == 8) {
    *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
    wide--;
  }

  __m128i *__c  = (__m128i*)c;
  __m128i *__t1 = (__m128i*)t1;
  __m128i *__t2 = (__m128i*)t2;
  __m128i *__t3 = (__m128i*)t3;
  __m128i *__t4 = (__m128i*)t4;
  __m128i *__t5 = (__m128i*)t5;
  const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL);
  __m128i xmm1;

  while(__c < eof) {
    xmm1 = _mm_xor_si128(*__c, *__t1++);
    xmm1 = _mm_xor_si128(xmm1, *__t2++);
    xmm1 = _mm_xor_si128(xmm1, *__t3++);
    xmm1 = _mm_xor_si128(xmm1, *__t4++);
    xmm1 = _mm_xor_si128(xmm1, *__t5++);
    *__c++ = xmm1;
  }
  c  = (word*)__c;
  t1 = (word*)__t1;
  t2 = (word*)__t2;
  t3 = (word*)__t3;
  t4 = (word*)__t4;
  t5 = (word*)__t5;
  wide = ((sizeof(word) * wide) % 16) / sizeof(word);

  if(wide) {
    *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
  }
  __M4RI_DD_RAWROW(c, wide_in);
  return;
#else
  wi_t n = (wide + 7) / 8;
  switch (wide % 8) {
  case 0: do { *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
  case 7:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
  case 6:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
  case 5:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
  case 4:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
  case 3:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
  case 2:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
  case 1:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++ ^ *t5++;
    } while (--n > 0);
  }
  __M4RI_DD_RAWROW(c, wide_in);
  return;
#endif // __M4RI_HAVE_SSE2
}

/**
 * Compute c[i] ^= t1[i] ^ t2[i] ^ t3[i] ^ t4[i] for 0 <= i < wide_in.
 */
static inline void _mzd_combine4(word *c, word const *t1, word const *t2, word const *t3, word const *t4, wi_t wide_in) {
  wi_t wide = wide_in;
#if __M4RI_HAVE_SSE2
  /* assuming t1 ... t4 are aligned, but c might not be */
  assert(__M4RI_ALIGNMENT(c,16) == 8 || __M4RI_ALIGNMENT(c,16) == 0);
  assert(__M4RI_ALIGNMENT(c,16) == __M4RI_ALIGNMENT(t1,16));

  if (__M4RI_ALIGNMENT(c,16) == 8) {
    *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
    wide--;
  }

  __m128i *__c  = (__m128i*)c;
  __m128i *__t1 = (__m128i*)t1;
  __m128i *__t2 = (__m128i*)t2;
  __m128i *__t3 = (__m128i*)t3;
  __m128i *__t4 = (__m128i*)t4;
  const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL);
  __m128i xmm1;

  while(__c < eof) {
    xmm1 = _mm_xor_si128(*__c, *__t1++);
    xmm1 = _mm_xor_si128(xmm1, *__t2++);
    xmm1 = _mm_xor_si128(xmm1, *__t3++);
    xmm1 = _mm_xor_si128(xmm1, *__t4++);
    *__c++ = xmm1;
  }
  c  = (word*)__c;
  t1 = (word*)__t1;
  t2 = (word*)__t2;
  t3 = (word*)__t3;
  t4 = (word*)__t4;
  wide = ((sizeof(word) * wide) % 16) / sizeof(word);

  if(wide) {
    *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
  }
  __M4RI_DD_RAWROW(c, wide_in);
  return;
#else
  wi_t n = (wide + 7) / 8;
  switch (wide % 8) {
  case 0: do { *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
  case 7:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
  case 6:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
  case 5:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
  case 4:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
  case 3:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
  case 2:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
  case 1:      *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
    } while (--n > 0);
  }
  __M4RI_DD_RAWROW(c, wide_in);
  return;
#endif // __M4RI_HAVE_SSE2
}

/**
 * Compute c[i] ^= t1[i] ^ t2[i] ^ t3[i] for 0 <= i < wide_in.
 */
static inline void _mzd_combine3(word *c, word const *t1, word const *t2, word const *t3, wi_t wide_in) {
  wi_t wide = wide_in;
#if __M4RI_HAVE_SSE2
  /* assuming t1 ... t3 are aligned, but c might not be */
  if (__M4RI_ALIGNMENT(c,16)==0) {
    __m128i *__c  = (__m128i*)c;
    __m128i *__t1 = (__m128i*)t1;
    __m128i *__t2 = (__m128i*)t2;
    __m128i *__t3 = (__m128i*)t3;
    const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL);
    __m128i xmm1;

    while(__c < eof) {
      xmm1 = _mm_xor_si128(*__c, *__t1++);
      xmm1 = _mm_xor_si128(xmm1, *__t2++);
      xmm1 = _mm_xor_si128(xmm1, *__t3++);
      *__c++ = xmm1;
    }
    c  = (word*)__c;
    t1 = (word*)__t1;
    t2 = (word*)__t2;
    t3 = (word*)__t3;
    wide = ((sizeof(word) * wide) % 16) / sizeof(word);
  }
  if(!wide) {
    __M4RI_DD_RAWROW(c, wide_in);
    return;
  }
#endif // __M4RI_HAVE_SSE2
  wi_t n = (wide + 7) / 8;
  switch (wide % 8) {
  case 0: do { *c++ ^= *t1++ ^ *t2++ ^ *t3++;
  case 7:      *c++ ^= *t1++ ^ *t2++ ^ *t3++;
  case 6:      *c++ ^= *t1++ ^ *t2++ ^ *t3++;
  case 5:      *c++ ^= *t1++ ^ *t2++ ^ *t3++;
  case 4:      *c++ ^= *t1++ ^ *t2++ ^ *t3++;
  case 3:      *c++ ^= *t1++ ^ *t2++ ^ *t3++;
  case 2:      *c++ ^= *t1++ ^ *t2++ ^ *t3++;
  case 1:      *c++ ^= *t1++ ^ *t2++ ^ *t3++;
    } while (--n > 0);
  }
  __M4RI_DD_RAWROW(c, wide_in);
}

/**
 * Compute c[i] ^= t1[i] ^ t2[i] for 0 <= i < wide_in.
 */
static inline void _mzd_combine2(word *c, word const *t1, word const *t2, wi_t wide_in) {
  wi_t wide = wide_in;
#if __M4RI_HAVE_SSE2
  /* assuming t1 and t2 are aligned, but c might not be */
  if (__M4RI_ALIGNMENT(c,16)==0) {
    __m128i *__c  = (__m128i*)c;
    __m128i *__t1 = (__m128i*)t1;
    __m128i *__t2 = (__m128i*)t2;
    const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL);
    __m128i xmm1;

    while(__c < eof) {
      xmm1 = _mm_xor_si128(*__c, *__t1++);
      xmm1 = _mm_xor_si128(xmm1, *__t2++);
      *__c++ = xmm1;
    }
    c  = (word*)__c;
    t1 = (word*)__t1;
    t2 = (word*)__t2;
    wide = ((sizeof(word) * wide) % 16) / sizeof(word);
  }
  if(!wide) {
    __M4RI_DD_RAWROW(c, wide_in);
    return;
  }
#endif // __M4RI_HAVE_SSE2
  wi_t n = (wide + 7) / 8;
  switch (wide % 8) {
  case 0: do { *c++ ^= *t1++ ^ *t2++;
  case 7:      *c++ ^= *t1++ ^ *t2++;
  case 6:      *c++ ^= *t1++ ^ *t2++;
  case 5:      *c++ ^= *t1++ ^ *t2++;
  case 4:      *c++ ^= *t1++ ^ *t2++;
  case 3:      *c++ ^= *t1++ ^ *t2++;
  case 2:      *c++ ^= *t1++ ^ *t2++;
  case 1:      *c++ ^= *t1++ ^ *t2++;
    } while (--n > 0);
  }
  __M4RI_DD_RAWROW(c, wide_in);
}

/**
 * Compute c[i] ^= t1[i] for 0 <= i < wide_in.
 */
static inline void _mzd_combine(word *c, word const *t1, wi_t wide_in) {
  wi_t wide = wide_in;
#if __M4RI_HAVE_SSE2
  /* assuming c and t1 are aligned the same way */

  if (__M4RI_ALIGNMENT(c,16)==8 && wide) {
    *c++ ^= *t1++;
    wide--;
  }

  __m128i *__c  = (__m128i*)c;
  __m128i *__t1 = (__m128i*)t1;
  const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL);
  __m128i xmm1;

  /* main loop unrolled by two 128-bit XORs per iteration */
  while(__c < eof-1) {
    xmm1 = _mm_xor_si128(*__c, *__t1++);
    *__c++ = xmm1;
    xmm1 = _mm_xor_si128(*__c, *__t1++);
    *__c++ = xmm1;
  }

  if(__c < eof) {
    xmm1 = _mm_xor_si128(*__c, *__t1++);
    *__c++ = xmm1;
  }

  c  = (word*)__c;
  t1 = (word*)__t1;
  wide = ((sizeof(word) * wide) % 16) / sizeof(word);

  if(!wide) {
    __M4RI_DD_RAWROW(c, wide_in);
    return;
  }
#endif // __M4RI_HAVE_SSE2

  wi_t n = (wide + 7) / 8;
  switch (wide % 8) {
  case 0: do { *c++ ^= *t1++;
  case 7:      *c++ ^= *t1++;
  case 6:      *c++ ^= *t1++;
  case 5:      *c++ ^= *t1++;
  case 4:      *c++ ^= *t1++;
  case 3:      *c++ ^= *t1++;
  case 2:      *c++ ^= *t1++;
  case 1:      *c++ ^= *t1++;
    } while (--n > 0);
  }
  __M4RI_DD_RAWROW(c, wide_in);
}

#endif // M4RI_XOR_H
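
All of the routines above follow the same pattern: XOR wide_in machine words from one or more source rows into the destination row c, taking the 128-bit SSE2 path when it is available and the operands are suitably aligned, and a word-wise loop otherwise. The program below is a minimal usage sketch, not part of the header: it assumes xor.h is installed alongside the other m4ri headers and that the compiler supports the GCC/Clang aligned attribute, so the SSE2 branch's aligned 128-bit loads are safe; the build command is illustrative only (e.g. cc demo.c -lm4ri).

/* demo.c -- usage sketch for _mzd_combine2(): c[i] ^= t1[i] ^ t2[i]. */
#include <m4ri/xor.h>
#include <stdio.h>

int main(void) {
  /* 16-byte alignment so the SSE2 branch, which uses aligned loads, is valid. */
  static word c[4]  __attribute__((aligned(16))) = {0, 0, 0, 0};
  static word t1[4] __attribute__((aligned(16))) = {0x1, 0x2, 0x3, 0x4};
  static word t2[4] __attribute__((aligned(16))) = {0xF, 0xF, 0xF, 0xF};

  _mzd_combine2(c, t1, t2, 4);   /* c[i] ^= t1[i] ^ t2[i] for i = 0..3 */

  for (int i = 0; i < 4; ++i)
    printf("c[%d] = %llx\n", i, (unsigned long long)c[i]);
  return 0;
}

With four 64-bit words per row the data spans exactly two 128-bit registers, so the SSE2 loop handles everything and the scalar tail does no work; the expected result is c[i] = t1[i] ^ t2[i], i.e. 0xE, 0xD, 0xC, 0xB.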