xref: /freebsd/sys/contrib/libb2/blake2b-load-sse41.h (revision e4c66ddabdb470bab319705c1834a4867c508a43)
1 /*
2    BLAKE2 reference source code package - optimized C implementations
3 
4    Written in 2012 by Samuel Neves <sneves@dei.uc.pt>
5 
6    To the extent possible under law, the author(s) have dedicated all copyright
7    and related and neighboring rights to this software to the public domain
8    worldwide. This software is distributed without any warranty.
9 
10    You should have received a copy of the CC0 Public Domain Dedication along with
11    this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
12 */
13 #pragma once
14 #ifndef __BLAKE2B_LOAD_SSE41_H__
15 #define __BLAKE2B_LOAD_SSE41_H__
16 
/*
 * LOAD_MSG_0_1: b0 = { m0.lo, m1.lo }, b1 = { m2.lo, m3.lo }.
 * Pairs read { low 64 bits, high 64 bits }. m0..m3 are not parameters;
 * they are resolved where the macro is expanded.
 */
#define LOAD_MSG_0_1(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m0, m1); \
        (b1) = _mm_unpacklo_epi64(m2, m3); \
    } while (0)
23 
24 
/*
 * LOAD_MSG_0_2: b0 = { m0.hi, m1.hi }, b1 = { m2.hi, m3.hi }.
 * Pairs read { low 64 bits, high 64 bits }. m0..m3 are not parameters;
 * they are resolved where the macro is expanded.
 */
#define LOAD_MSG_0_2(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m0, m1); \
        (b1) = _mm_unpackhi_epi64(m2, m3); \
    } while (0)
31 
32 
/*
 * LOAD_MSG_0_3: b0 = { m4.lo, m5.lo }, b1 = { m6.lo, m7.lo }.
 * Pairs read { low 64 bits, high 64 bits }. m4..m7 are not parameters;
 * they are resolved where the macro is expanded.
 */
#define LOAD_MSG_0_3(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m4, m5); \
        (b1) = _mm_unpacklo_epi64(m6, m7); \
    } while (0)
39 
40 
/*
 * LOAD_MSG_0_4: b0 = { m4.hi, m5.hi }, b1 = { m6.hi, m7.hi }.
 * Pairs read { low 64 bits, high 64 bits }. m4..m7 are not parameters;
 * they are resolved where the macro is expanded.
 */
#define LOAD_MSG_0_4(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m4, m5); \
        (b1) = _mm_unpackhi_epi64(m6, m7); \
    } while (0)
47 
48 
/*
 * LOAD_MSG_1_1: b0 = { m7.lo, m2.lo }, b1 = { m4.hi, m6.hi }.
 * Pairs read { low 64 bits, high 64 bits }. m2, m4, m6 and m7 are not
 * parameters; they are resolved where the macro is expanded.
 */
#define LOAD_MSG_1_1(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m7, m2); \
        (b1) = _mm_unpackhi_epi64(m4, m6); \
    } while (0)
55 
56 
/*
 * LOAD_MSG_1_2: b0 = { m5.lo, m4.lo }, b1 = { m7.hi, m3.lo }.
 * Pairs read { low 64 bits, high 64 bits }; the 8-byte alignr yields the
 * high half of its second operand followed by the low half of its first.
 * m3, m4, m5 and m7 are resolved where the macro is expanded.
 */
#define LOAD_MSG_1_2(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m5, m4); \
        (b1) = _mm_alignr_epi8(m3, m7, 8); \
    } while (0)
63 
64 
/*
 * LOAD_MSG_1_3: b0 = { m0.hi, m0.lo }, b1 = { m5.hi, m2.hi }.
 * Pairs read { low 64 bits, high 64 bits }; the (1,0,3,2) dword shuffle
 * swaps the two 64-bit halves of m0.
 * m0, m2 and m5 are resolved where the macro is expanded.
 */
#define LOAD_MSG_1_3(b0, b1) \
    do { \
        (b0) = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
        (b1) = _mm_unpackhi_epi64(m5, m2); \
    } while (0)
71 
72 
/*
 * LOAD_MSG_1_4: b0 = { m6.lo, m1.lo }, b1 = { m3.hi, m1.hi }.
 * Pairs read { low 64 bits, high 64 bits }. m1, m3 and m6 are not
 * parameters; they are resolved where the macro is expanded.
 */
#define LOAD_MSG_1_4(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m6, m1); \
        (b1) = _mm_unpackhi_epi64(m3, m1); \
    } while (0)
79 
80 
/*
 * LOAD_MSG_2_1: b0 = { m5.hi, m6.lo }, b1 = { m2.hi, m7.hi }.
 * Pairs read { low 64 bits, high 64 bits }; the 8-byte alignr yields the
 * high half of its second operand followed by the low half of its first.
 * m2, m5, m6 and m7 are resolved where the macro is expanded.
 */
#define LOAD_MSG_2_1(b0, b1) \
    do { \
        (b0) = _mm_alignr_epi8(m6, m5, 8); \
        (b1) = _mm_unpackhi_epi64(m2, m7); \
    } while (0)
87 
88 
/*
 * LOAD_MSG_2_2: b0 = { m4.lo, m0.lo }, b1 = { m1.lo, m6.hi }.
 * Pairs read { low 64 bits, high 64 bits }; blend mask 0xF0 keeps the low
 * half of the first operand and takes the high half of the second.
 * m0, m1, m4 and m6 are resolved where the macro is expanded.
 */
#define LOAD_MSG_2_2(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m4, m0); \
        (b1) = _mm_blend_epi16(m1, m6, 0xF0); \
    } while (0)
95 
96 
/*
 * LOAD_MSG_2_3: b0 = { m5.lo, m1.hi }, b1 = { m3.hi, m4.hi }.
 * Pairs read { low 64 bits, high 64 bits }; blend mask 0xF0 keeps the low
 * half of the first operand and takes the high half of the second.
 * m1, m3, m4 and m5 are resolved where the macro is expanded.
 */
#define LOAD_MSG_2_3(b0, b1) \
    do { \
        (b0) = _mm_blend_epi16(m5, m1, 0xF0); \
        (b1) = _mm_unpackhi_epi64(m3, m4); \
    } while (0)
103 
104 
/*
 * LOAD_MSG_2_4: b0 = { m7.lo, m3.lo }, b1 = { m0.hi, m2.lo }.
 * Pairs read { low 64 bits, high 64 bits }; the 8-byte alignr yields the
 * high half of its second operand followed by the low half of its first.
 * m0, m2, m3 and m7 are resolved where the macro is expanded.
 */
#define LOAD_MSG_2_4(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m7, m3); \
        (b1) = _mm_alignr_epi8(m2, m0, 8); \
    } while (0)
111 
112 
/*
 * LOAD_MSG_3_1: b0 = { m3.hi, m1.hi }, b1 = { m6.hi, m5.hi }.
 * Pairs read { low 64 bits, high 64 bits }. m1, m3, m5 and m6 are not
 * parameters; they are resolved where the macro is expanded.
 */
#define LOAD_MSG_3_1(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m3, m1); \
        (b1) = _mm_unpackhi_epi64(m6, m5); \
    } while (0)
119 
120 
/*
 * LOAD_MSG_3_2: b0 = { m4.hi, m0.hi }, b1 = { m6.lo, m7.lo }.
 * Pairs read { low 64 bits, high 64 bits }. m0, m4, m6 and m7 are not
 * parameters; they are resolved where the macro is expanded.
 */
#define LOAD_MSG_3_2(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m4, m0); \
        (b1) = _mm_unpacklo_epi64(m6, m7); \
    } while (0)
127 
128 
/*
 * LOAD_MSG_3_3: b0 = { m1.lo, m2.hi }, b1 = { m2.lo, m7.hi }.
 * Pairs read { low 64 bits, high 64 bits }; blend mask 0xF0 keeps the low
 * half of the first operand and takes the high half of the second.
 * m1, m2 and m7 are resolved where the macro is expanded.
 */
#define LOAD_MSG_3_3(b0, b1) \
    do { \
        (b0) = _mm_blend_epi16(m1, m2, 0xF0); \
        (b1) = _mm_blend_epi16(m2, m7, 0xF0); \
    } while (0)
135 
136 
/*
 * LOAD_MSG_3_4: b0 = { m3.lo, m5.lo }, b1 = { m0.lo, m4.lo }.
 * Pairs read { low 64 bits, high 64 bits }. m0, m3, m4 and m5 are not
 * parameters; they are resolved where the macro is expanded.
 */
#define LOAD_MSG_3_4(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m3, m5); \
        (b1) = _mm_unpacklo_epi64(m0, m4); \
    } while (0)
143 
144 
/*
 * LOAD_MSG_4_1: b0 = { m4.hi, m2.hi }, b1 = { m1.lo, m5.lo }.
 * Pairs read { low 64 bits, high 64 bits }. m1, m2, m4 and m5 are not
 * parameters; they are resolved where the macro is expanded.
 */
#define LOAD_MSG_4_1(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m4, m2); \
        (b1) = _mm_unpacklo_epi64(m1, m5); \
    } while (0)
151 
152 
/*
 * LOAD_MSG_4_2: b0 = { m0.lo, m3.hi }, b1 = { m2.lo, m7.hi }.
 * Pairs read { low 64 bits, high 64 bits }; blend mask 0xF0 keeps the low
 * half of the first operand and takes the high half of the second.
 * m0, m2, m3 and m7 are resolved where the macro is expanded.
 */
#define LOAD_MSG_4_2(b0, b1) \
    do { \
        (b0) = _mm_blend_epi16(m0, m3, 0xF0); \
        (b1) = _mm_blend_epi16(m2, m7, 0xF0); \
    } while (0)
159 
160 
/*
 * LOAD_MSG_4_3: b0 = { m7.lo, m5.hi }, b1 = { m3.lo, m1.hi }.
 * Pairs read { low 64 bits, high 64 bits }; blend mask 0xF0 keeps the low
 * half of the first operand and takes the high half of the second.
 * m1, m3, m5 and m7 are resolved where the macro is expanded.
 */
#define LOAD_MSG_4_3(b0, b1) \
    do { \
        (b0) = _mm_blend_epi16(m7, m5, 0xF0); \
        (b1) = _mm_blend_epi16(m3, m1, 0xF0); \
    } while (0)
167 
168 
/*
 * LOAD_MSG_4_4: b0 = { m0.hi, m6.lo }, b1 = { m4.lo, m6.hi }.
 * Pairs read { low 64 bits, high 64 bits }. The 8-byte alignr yields the
 * high half of its second operand then the low half of its first; blend
 * mask 0xF0 keeps the first operand's low half and the second's high half.
 * m0, m4 and m6 are resolved where the macro is expanded.
 */
#define LOAD_MSG_4_4(b0, b1) \
    do { \
        (b0) = _mm_alignr_epi8(m6, m0, 8); \
        (b1) = _mm_blend_epi16(m4, m6, 0xF0); \
    } while (0)
175 
176 
/*
 * LOAD_MSG_5_1: b0 = { m1.lo, m3.lo }, b1 = { m0.lo, m4.lo }.
 * Pairs read { low 64 bits, high 64 bits }. m0, m1, m3 and m4 are not
 * parameters; they are resolved where the macro is expanded.
 */
#define LOAD_MSG_5_1(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m1, m3); \
        (b1) = _mm_unpacklo_epi64(m0, m4); \
    } while (0)
183 
184 
/*
 * LOAD_MSG_5_2: b0 = { m6.lo, m5.lo }, b1 = { m5.hi, m1.hi }.
 * Pairs read { low 64 bits, high 64 bits }. m1, m5 and m6 are not
 * parameters; they are resolved where the macro is expanded.
 */
#define LOAD_MSG_5_2(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m6, m5); \
        (b1) = _mm_unpackhi_epi64(m5, m1); \
    } while (0)
191 
192 
/*
 * LOAD_MSG_5_3: b0 = { m2.lo, m3.hi }, b1 = { m7.hi, m0.hi }.
 * Pairs read { low 64 bits, high 64 bits }; blend mask 0xF0 keeps the low
 * half of the first operand and takes the high half of the second.
 * m0, m2, m3 and m7 are resolved where the macro is expanded.
 */
#define LOAD_MSG_5_3(b0, b1) \
    do { \
        (b0) = _mm_blend_epi16(m2, m3, 0xF0); \
        (b1) = _mm_unpackhi_epi64(m7, m0); \
    } while (0)
199 
200 
/*
 * LOAD_MSG_5_4: b0 = { m6.hi, m2.hi }, b1 = { m7.lo, m4.hi }.
 * Pairs read { low 64 bits, high 64 bits }; blend mask 0xF0 keeps the low
 * half of the first operand and takes the high half of the second.
 * m2, m4, m6 and m7 are resolved where the macro is expanded.
 */
#define LOAD_MSG_5_4(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m6, m2); \
        (b1) = _mm_blend_epi16(m7, m4, 0xF0); \
    } while (0)
207 
208 
/*
 * LOAD_MSG_6_1: b0 = { m6.lo, m0.hi }, b1 = { m7.lo, m2.lo }.
 * Pairs read { low 64 bits, high 64 bits }; blend mask 0xF0 keeps the low
 * half of the first operand and takes the high half of the second.
 * m0, m2, m6 and m7 are resolved where the macro is expanded.
 */
#define LOAD_MSG_6_1(b0, b1) \
    do { \
        (b0) = _mm_blend_epi16(m6, m0, 0xF0); \
        (b1) = _mm_unpacklo_epi64(m7, m2); \
    } while (0)
215 
216 
/*
 * LOAD_MSG_6_2: b0 = { m2.hi, m7.hi }, b1 = { m6.hi, m5.lo }.
 * Pairs read { low 64 bits, high 64 bits }; the 8-byte alignr yields the
 * high half of its second operand followed by the low half of its first.
 * m2, m5, m6 and m7 are resolved where the macro is expanded.
 */
#define LOAD_MSG_6_2(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m2, m7); \
        (b1) = _mm_alignr_epi8(m5, m6, 8); \
    } while (0)
223 
224 
/*
 * LOAD_MSG_6_3: b0 = { m0.lo, m3.lo }, b1 = { m4.hi, m4.lo }.
 * Pairs read { low 64 bits, high 64 bits }; the (1,0,3,2) dword shuffle
 * swaps the two 64-bit halves of m4.
 * m0, m3 and m4 are resolved where the macro is expanded.
 */
#define LOAD_MSG_6_3(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m0, m3); \
        (b1) = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1, 0, 3, 2)); \
    } while (0)
231 
232 
/*
 * LOAD_MSG_6_4: b0 = { m3.hi, m1.hi }, b1 = { m1.lo, m5.hi }.
 * Pairs read { low 64 bits, high 64 bits }; blend mask 0xF0 keeps the low
 * half of the first operand and takes the high half of the second.
 * m1, m3 and m5 are resolved where the macro is expanded.
 */
#define LOAD_MSG_6_4(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m3, m1); \
        (b1) = _mm_blend_epi16(m1, m5, 0xF0); \
    } while (0)
239 
240 
/*
 * LOAD_MSG_7_1: b0 = { m6.hi, m3.hi }, b1 = { m6.lo, m1.hi }.
 * Pairs read { low 64 bits, high 64 bits }; blend mask 0xF0 keeps the low
 * half of the first operand and takes the high half of the second.
 * m1, m3 and m6 are resolved where the macro is expanded.
 */
#define LOAD_MSG_7_1(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m6, m3); \
        (b1) = _mm_blend_epi16(m6, m1, 0xF0); \
    } while (0)
247 
248 
/*
 * LOAD_MSG_7_2: b0 = { m5.hi, m7.lo }, b1 = { m0.hi, m4.hi }.
 * Pairs read { low 64 bits, high 64 bits }; the 8-byte alignr yields the
 * high half of its second operand followed by the low half of its first.
 * m0, m4, m5 and m7 are resolved where the macro is expanded.
 */
#define LOAD_MSG_7_2(b0, b1) \
    do { \
        (b0) = _mm_alignr_epi8(m7, m5, 8); \
        (b1) = _mm_unpackhi_epi64(m0, m4); \
    } while (0)
255 
256 
/*
 * LOAD_MSG_7_3: b0 = { m2.hi, m7.hi }, b1 = { m4.lo, m1.lo }.
 * Pairs read { low 64 bits, high 64 bits }. m1, m2, m4 and m7 are not
 * parameters; they are resolved where the macro is expanded.
 */
#define LOAD_MSG_7_3(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m2, m7); \
        (b1) = _mm_unpacklo_epi64(m4, m1); \
    } while (0)
263 
264 
/*
 * LOAD_MSG_7_4: b0 = { m0.lo, m2.lo }, b1 = { m3.lo, m5.lo }.
 * Pairs read { low 64 bits, high 64 bits }. m0, m2, m3 and m5 are not
 * parameters; they are resolved where the macro is expanded.
 */
#define LOAD_MSG_7_4(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m0, m2); \
        (b1) = _mm_unpacklo_epi64(m3, m5); \
    } while (0)
271 
272 
/*
 * LOAD_MSG_8_1: b0 = { m3.lo, m7.lo }, b1 = { m5.hi, m0.lo }.
 * Pairs read { low 64 bits, high 64 bits }; the 8-byte alignr yields the
 * high half of its second operand followed by the low half of its first.
 * m0, m3, m5 and m7 are resolved where the macro is expanded.
 */
#define LOAD_MSG_8_1(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m3, m7); \
        (b1) = _mm_alignr_epi8(m0, m5, 8); \
    } while (0)
279 
280 
/*
 * LOAD_MSG_8_2: b0 = { m7.hi, m4.hi }, b1 = { m1.hi, m4.lo }.
 * Pairs read { low 64 bits, high 64 bits }; the 8-byte alignr yields the
 * high half of its second operand followed by the low half of its first.
 * m1, m4 and m7 are resolved where the macro is expanded.
 */
#define LOAD_MSG_8_2(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m7, m4); \
        (b1) = _mm_alignr_epi8(m4, m1, 8); \
    } while (0)
287 
288 
/*
 * LOAD_MSG_8_3: b0 = m6 (plain copy), b1 = { m0.hi, m5.lo }.
 * Pairs read { low 64 bits, high 64 bits }; the 8-byte alignr yields the
 * high half of its second operand followed by the low half of its first.
 * m0, m5 and m6 are resolved where the macro is expanded.
 */
#define LOAD_MSG_8_3(b0, b1) \
    do { \
        (b0) = m6; \
        (b1) = _mm_alignr_epi8(m5, m0, 8); \
    } while (0)
295 
296 
/*
 * LOAD_MSG_8_4: b0 = { m1.lo, m3.hi }, b1 = m2 (plain copy).
 * Pairs read { low 64 bits, high 64 bits }; blend mask 0xF0 keeps the low
 * half of the first operand and takes the high half of the second.
 * m1, m2 and m3 are resolved where the macro is expanded.
 */
#define LOAD_MSG_8_4(b0, b1) \
    do { \
        (b0) = _mm_blend_epi16(m1, m3, 0xF0); \
        (b1) = m2; \
    } while (0)
303 
304 
/*
 * LOAD_MSG_9_1: b0 = { m5.lo, m4.lo }, b1 = { m3.hi, m0.hi }.
 * Pairs read { low 64 bits, high 64 bits }. m0, m3, m4 and m5 are not
 * parameters; they are resolved where the macro is expanded.
 */
#define LOAD_MSG_9_1(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m5, m4); \
        (b1) = _mm_unpackhi_epi64(m3, m0); \
    } while (0)
311 
312 
/*
 * LOAD_MSG_9_2: b0 = { m1.lo, m2.lo }, b1 = { m3.lo, m2.hi }.
 * Pairs read { low 64 bits, high 64 bits }; blend mask 0xF0 keeps the low
 * half of the first operand and takes the high half of the second.
 * m1, m2 and m3 are resolved where the macro is expanded.
 */
#define LOAD_MSG_9_2(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m1, m2); \
        (b1) = _mm_blend_epi16(m3, m2, 0xF0); \
    } while (0)
319 
320 
/*
 * LOAD_MSG_9_3: b0 = { m7.hi, m4.hi }, b1 = { m1.hi, m6.hi }.
 * Pairs read { low 64 bits, high 64 bits }. m1, m4, m6 and m7 are not
 * parameters; they are resolved where the macro is expanded.
 */
#define LOAD_MSG_9_3(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m7, m4); \
        (b1) = _mm_unpackhi_epi64(m1, m6); \
    } while (0)
327 
328 
/*
 * LOAD_MSG_9_4: b0 = { m5.hi, m7.lo }, b1 = { m6.lo, m0.lo }.
 * Pairs read { low 64 bits, high 64 bits }; the 8-byte alignr yields the
 * high half of its second operand followed by the low half of its first.
 * m0, m5, m6 and m7 are resolved where the macro is expanded.
 */
#define LOAD_MSG_9_4(b0, b1) \
    do { \
        (b0) = _mm_alignr_epi8(m7, m5, 8); \
        (b1) = _mm_unpacklo_epi64(m6, m0); \
    } while (0)
335 
336 
/*
 * LOAD_MSG_10_1: b0 = { m0.lo, m1.lo }, b1 = { m2.lo, m3.lo }
 * (the same selection LOAD_MSG_0_1 makes).
 * Pairs read { low 64 bits, high 64 bits }. m0..m3 are not parameters;
 * they are resolved where the macro is expanded.
 */
#define LOAD_MSG_10_1(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m0, m1); \
        (b1) = _mm_unpacklo_epi64(m2, m3); \
    } while (0)
343 
344 
/*
 * LOAD_MSG_10_2: b0 = { m0.hi, m1.hi }, b1 = { m2.hi, m3.hi }
 * (the same selection LOAD_MSG_0_2 makes).
 * Pairs read { low 64 bits, high 64 bits }. m0..m3 are not parameters;
 * they are resolved where the macro is expanded.
 */
#define LOAD_MSG_10_2(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m0, m1); \
        (b1) = _mm_unpackhi_epi64(m2, m3); \
    } while (0)
351 
352 
/*
 * LOAD_MSG_10_3: b0 = { m4.lo, m5.lo }, b1 = { m6.lo, m7.lo }
 * (the same selection LOAD_MSG_0_3 makes).
 * Pairs read { low 64 bits, high 64 bits }. m4..m7 are not parameters;
 * they are resolved where the macro is expanded.
 */
#define LOAD_MSG_10_3(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m4, m5); \
        (b1) = _mm_unpacklo_epi64(m6, m7); \
    } while (0)
359 
360 
/*
 * LOAD_MSG_10_4: b0 = { m4.hi, m5.hi }, b1 = { m6.hi, m7.hi }
 * (the same selection LOAD_MSG_0_4 makes).
 * Pairs read { low 64 bits, high 64 bits }. m4..m7 are not parameters;
 * they are resolved where the macro is expanded.
 */
#define LOAD_MSG_10_4(b0, b1) \
    do { \
        (b0) = _mm_unpackhi_epi64(m4, m5); \
        (b1) = _mm_unpackhi_epi64(m6, m7); \
    } while (0)
367 
368 
/*
 * LOAD_MSG_11_1: b0 = { m7.lo, m2.lo }, b1 = { m4.hi, m6.hi }
 * (the same selection LOAD_MSG_1_1 makes).
 * Pairs read { low 64 bits, high 64 bits }. m2, m4, m6 and m7 are not
 * parameters; they are resolved where the macro is expanded.
 */
#define LOAD_MSG_11_1(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m7, m2); \
        (b1) = _mm_unpackhi_epi64(m4, m6); \
    } while (0)
375 
376 
/*
 * LOAD_MSG_11_2: b0 = { m5.lo, m4.lo }, b1 = { m7.hi, m3.lo }
 * (the same selection LOAD_MSG_1_2 makes).
 * Pairs read { low 64 bits, high 64 bits }; the 8-byte alignr yields the
 * high half of its second operand followed by the low half of its first.
 * m3, m4, m5 and m7 are resolved where the macro is expanded.
 */
#define LOAD_MSG_11_2(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m5, m4); \
        (b1) = _mm_alignr_epi8(m3, m7, 8); \
    } while (0)
383 
384 
/*
 * LOAD_MSG_11_3: b0 = { m0.hi, m0.lo }, b1 = { m5.hi, m2.hi }
 * (the same selection LOAD_MSG_1_3 makes).
 * Pairs read { low 64 bits, high 64 bits }; the (1,0,3,2) dword shuffle
 * swaps the two 64-bit halves of m0.
 * m0, m2 and m5 are resolved where the macro is expanded.
 */
#define LOAD_MSG_11_3(b0, b1) \
    do { \
        (b0) = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1, 0, 3, 2)); \
        (b1) = _mm_unpackhi_epi64(m5, m2); \
    } while (0)
391 
392 
/*
 * LOAD_MSG_11_4: b0 = { m6.lo, m1.lo }, b1 = { m3.hi, m1.hi }
 * (the same selection LOAD_MSG_1_4 makes).
 * Pairs read { low 64 bits, high 64 bits }. m1, m3 and m6 are not
 * parameters; they are resolved where the macro is expanded.
 */
#define LOAD_MSG_11_4(b0, b1) \
    do { \
        (b0) = _mm_unpacklo_epi64(m6, m1); \
        (b1) = _mm_unpackhi_epi64(m3, m1); \
    } while (0)
399 
400 
401 #endif
402 
403