/* Origin: freebsd/sys/contrib/libb2/blake2s-load-sse41.h (revision 0e33efe4e4b5d24e2d416938af8bc6e6e4160ec8) */
/*
   BLAKE2 reference source code package - optimized C implementations

   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>

   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.

   You should have received a copy of the CC0 Public Domain Dedication along with
   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
#pragma once
#ifndef __BLAKE2S_LOAD_SSE41_H__
#define __BLAKE2S_LOAD_SSE41_H__

/*
 * buf <- { m0[0], m0[2], m1[0], m1[2] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 0, 2, 4, 6.
 * TOF/TOI are supplied by the including file (presumably the
 * __m128i <-> __m128 casts that _mm_shuffle_ps needs -- confirm).
 */
#define LOAD_MSG_0_1(buf) \
  buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(2, 0, 2, 0)));

/*
 * buf <- { m0[1], m0[3], m1[1], m1[3] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 1, 3, 5, 7.
 * TOF/TOI are supplied by the including file (presumably the
 * __m128i <-> __m128 casts that _mm_shuffle_ps needs -- confirm).
 */
#define LOAD_MSG_0_2(buf) \
  buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(3, 1, 3, 1)));

/*
 * buf <- { m2[0], m2[2], m3[0], m3[2] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 8, 10, 12, 14.
 * TOF/TOI are supplied by the including file (presumably the
 * __m128i <-> __m128 casts that _mm_shuffle_ps needs -- confirm).
 */
#define LOAD_MSG_0_3(buf) \
  buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(2, 0, 2, 0)));

/*
 * buf <- { m2[1], m2[3], m3[1], m3[3] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 9, 11, 13, 15.
 * TOF/TOI are supplied by the including file (presumably the
 * __m128i <-> __m128 casts that _mm_shuffle_ps needs -- confirm).
 */
#define LOAD_MSG_0_4(buf) \
  buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(3, 1, 3, 1)));

/*
 * buf <- { m3[2], m1[0], m2[1], m3[1] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 14, 4, 9, 13.
 * Overwrites the scratch variables t0..t2 of the expansion site.
 */
#define LOAD_MSG_1_1(buf) \
  t0 = _mm_blend_epi16(m1, m2, 0x0C); \
  t1 = _mm_slli_si128(m3, 4); \
  t2 = _mm_blend_epi16(t0, t1, 0xF0); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));

/*
 * buf <- { m2[2], m2[0], m3[3], m1[2] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 10, 8, 15, 6.
 * Overwrites the scratch variables t0..t2 of the expansion site.
 */
#define LOAD_MSG_1_2(buf) \
  t0 = _mm_shuffle_epi32(m2, _MM_SHUFFLE(0, 0, 2, 0)); \
  t1 = _mm_blend_epi16(m1, m3, 0xC0); \
  t2 = _mm_blend_epi16(t0, t1, 0xF0); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 3, 0, 1));

/*
 * buf <- { m0[1], m0[0], m2[3], m1[1] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 1, 0, 11, 5.
 * Overwrites the scratch variables t0..t2 of the expansion site.
 */
#define LOAD_MSG_1_3(buf) \
  t0 = _mm_slli_si128(m1, 4); \
  t1 = _mm_blend_epi16(m2, t0, 0x30); \
  t2 = _mm_blend_epi16(m0, t1, 0xF0); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 3, 0, 1));

/*
 * buf <- { m3[0], m0[2], m1[3], m0[3] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 12, 2, 7, 3.
 * Overwrites the scratch variables t0..t2 of the expansion site.
 */
#define LOAD_MSG_1_4(buf) \
  t0 = _mm_unpackhi_epi32(m0, m1); \
  t1 = _mm_slli_si128(m3, 4); \
  t2 = _mm_blend_epi16(t0, t1, 0x0C); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 3, 0, 1));

/*
 * buf <- { m2[3], m3[0], m1[1], m3[3] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 11, 12, 5, 15.
 * Overwrites the scratch variables t0..t2 of the expansion site.
 */
#define LOAD_MSG_2_1(buf) \
  t0 = _mm_unpackhi_epi32(m2, m3); \
  t1 = _mm_blend_epi16(m3, m1, 0x0C); \
  t2 = _mm_blend_epi16(t0, t1, 0x0F); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 1, 0, 2));

/*
 * buf <- { m2[0], m0[0], m0[2], m3[1] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 8, 0, 2, 13.
 * Overwrites the scratch variables t0..t2 of the expansion site.
 */
#define LOAD_MSG_2_2(buf) \
  t0 = _mm_unpacklo_epi32(m2, m0); \
  t1 = _mm_blend_epi16(t0, m0, 0xF0); \
  t2 = _mm_slli_si128(m3, 8); \
  buf = _mm_blend_epi16(t1, t2, 0xC0);

/*
 * buf <- { m2[2], m0[3], m1[3], m2[1] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 10, 3, 7, 9.
 * Overwrites the scratch variables t0..t2 of the expansion site.
 */
#define LOAD_MSG_2_3(buf) \
  t0 = _mm_blend_epi16(m0, m2, 0x3C); \
  t1 = _mm_srli_si128(m1, 12); \
  t2 = _mm_blend_epi16(t0, t1, 0x03); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 0, 3, 2));

/*
 * buf <- { m3[2], m1[2], m0[1], m1[0] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 14, 6, 1, 4.
 * Overwrites the scratch variables t0..t2 of the expansion site.
 */
#define LOAD_MSG_2_4(buf) \
  t0 = _mm_slli_si128(m3, 4); \
  t1 = _mm_blend_epi16(m0, m1, 0x33); \
  t2 = _mm_blend_epi16(t1, t0, 0xC0); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0, 1, 2, 3));

/*
 * buf <- { m1[3], m0[3], m3[1], m2[3] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 7, 3, 13, 11.
 * Overwrites the scratch variables t0..t2 of the expansion site.
 */
#define LOAD_MSG_3_1(buf) \
  t0 = _mm_unpackhi_epi32(m0, m1); \
  t1 = _mm_unpackhi_epi32(t0, m2); \
  t2 = _mm_blend_epi16(t1, m3, 0x0C); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 1, 0, 2));

/*
 * buf <- { m2[1], m0[1], m3[0], m3[2] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 9, 1, 12, 14.
 * Overwrites the scratch variables t0..t2 of the expansion site.
 */
#define LOAD_MSG_3_2(buf) \
  t0 = _mm_slli_si128(m2, 8); \
  t1 = _mm_blend_epi16(m3, m0, 0x0C); \
  t2 = _mm_blend_epi16(t1, t0, 0xC0); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 0, 1, 3));

/*
 * buf <- { m0[2], m1[1], m1[0], m3[3] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 2, 5, 4, 15.
 * Overwrites the scratch variables t0 and t1 of the expansion site.
 */
#define LOAD_MSG_3_3(buf) \
  t0 = _mm_blend_epi16(m0, m1, 0x0F); \
  t1 = _mm_blend_epi16(t0, m3, 0xC0); \
  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3, 0, 1, 2));

/*
 * buf <- { m1[2], m2[2], m0[0], m2[0] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 6, 10, 0, 8.
 * Overwrites the scratch variables t0 and t1 of the expansion site.
 */
#define LOAD_MSG_3_4(buf) \
  t0 = _mm_unpacklo_epi32(m0, m2); \
  t1 = _mm_unpackhi_epi32(m1, m2); \
  buf = _mm_unpacklo_epi64(t1, t0);

/*
 * buf <- { m2[1], m1[1], m0[2], m2[2] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 9, 5, 2, 10.
 * Overwrites the scratch variables t0..t2 of the expansion site.
 */
#define LOAD_MSG_4_1(buf) \
  t0 = _mm_unpacklo_epi64(m1, m2); \
  t1 = _mm_unpackhi_epi64(m0, m2); \
  t2 = _mm_blend_epi16(t0, t1, 0x33); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 0, 1, 3));

/*
 * buf <- { m0[0], m1[3], m1[0], m3[3] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 0, 7, 4, 15.
 * Overwrites the scratch variables t0 and t1 of the expansion site.
 */
#define LOAD_MSG_4_2(buf) \
  t0 = _mm_unpackhi_epi64(m1, m3); \
  t1 = _mm_unpacklo_epi64(m0, m1); \
  buf = _mm_blend_epi16(t0, t1, 0x33);

/*
 * buf <- { m3[2], m2[3], m1[2], m0[3] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 14, 11, 6, 3.
 * Overwrites the scratch variables t0 and t1 of the expansion site.
 */
#define LOAD_MSG_4_3(buf) \
  t0 = _mm_unpackhi_epi64(m3, m1); \
  t1 = _mm_unpackhi_epi64(m2, m0); \
  buf = _mm_blend_epi16(t1, t0, 0x33);

/*
 * buf <- { m0[1], m3[0], m2[0], m3[1] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 1, 12, 8, 13.
 * Overwrites the scratch variables t0..t2 of the expansion site.
 */
#define LOAD_MSG_4_4(buf) \
  t0 = _mm_blend_epi16(m0, m2, 0x03); \
  t1 = _mm_slli_si128(t0, 8); \
  t2 = _mm_blend_epi16(t1, m3, 0x0F); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 2, 0, 3));

/*
 * buf <- { m0[2], m1[2], m0[0], m2[0] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 2, 6, 0, 8.
 * Overwrites the scratch variables t0 and t1 of the expansion site.
 */
#define LOAD_MSG_5_1(buf) \
  t0 = _mm_unpackhi_epi32(m0, m1); \
  t1 = _mm_unpacklo_epi32(m0, m2); \
  buf = _mm_unpacklo_epi64(t0, t1);

/*
 * buf <- { m3[0], m2[2], m2[3], m0[3] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 12, 10, 11, 3.
 * Overwrites the scratch variables t0 and t1 of the expansion site.
 */
#define LOAD_MSG_5_2(buf) \
  t0 = _mm_srli_si128(m2, 4); \
  t1 = _mm_blend_epi16(m0, m3, 0x03); \
  buf = _mm_blend_epi16(t1, t0, 0x3C);

/*
 * buf <- { m1[0], m1[3], m3[3], m0[1] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 4, 7, 15, 1.
 * Overwrites the scratch variables t0..t2 of the expansion site.
 */
#define LOAD_MSG_5_3(buf) \
  t0 = _mm_blend_epi16(m1, m0, 0x0C); \
  t1 = _mm_srli_si128(m3, 4); \
  t2 = _mm_blend_epi16(t0, t1, 0x30); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 2, 3, 0));

/*
 * buf <- { m3[1], m1[1], m3[2], m2[1] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 13, 5, 14, 9.
 * Overwrites the scratch variables t0 and t1 of the expansion site.
 */
#define LOAD_MSG_5_4(buf) \
  t0 = _mm_unpacklo_epi64(m1, m2); \
  t1 = _mm_shuffle_epi32(m3, _MM_SHUFFLE(0, 2, 0, 1)); \
  buf = _mm_blend_epi16(t0, t1, 0x33);

/*
 * buf <- { m3[0], m0[1], m3[2], m1[0] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 12, 1, 14, 4.
 * Overwrites the scratch variables t0 and t1 of the expansion site.
 */
#define LOAD_MSG_6_1(buf) \
  t0 = _mm_slli_si128(m1, 12); \
  t1 = _mm_blend_epi16(m0, m3, 0x33); \
  buf = _mm_blend_epi16(t1, t0, 0xC0);

/*
 * buf <- { m1[1], m3[3], m3[1], m2[2] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 5, 15, 13, 10.
 * Overwrites the scratch variables t0..t2 of the expansion site.
 */
#define LOAD_MSG_6_2(buf) \
  t0 = _mm_blend_epi16(m3, m2, 0x30); \
  t1 = _mm_srli_si128(m1, 4); \
  t2 = _mm_blend_epi16(t0, t1, 0x03); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 3, 0));

/*
 * buf <- { m0[0], m1[2], m2[1], m2[0] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 0, 6, 9, 8.
 * Overwrites the scratch variables t0 and t1 of the expansion site; the
 * final blend is fed straight into the shuffle without a third temporary.
 */
#define LOAD_MSG_6_3(buf) \
  t0 = _mm_unpacklo_epi64(m0, m2); \
  t1 = _mm_srli_si128(m1, 4); \
  buf = _mm_shuffle_epi32(_mm_blend_epi16(t0, t1, 0x0C), _MM_SHUFFLE(2, 3, 1, 0));

/*
 * buf <- { m1[3], m0[3], m0[2], m2[3] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 7, 3, 2, 11.
 * Overwrites the scratch variables t0 and t1 of the expansion site.
 */
#define LOAD_MSG_6_4(buf) \
  t0 = _mm_unpackhi_epi32(m1, m2); \
  t1 = _mm_unpackhi_epi64(m0, t0); \
  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3, 0, 1, 2));

/*
 * buf <- { m3[1], m1[3], m3[0], m0[3] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 13, 7, 12, 3.
 * Overwrites the scratch variables t0 and t1 of the expansion site.
 */
#define LOAD_MSG_7_1(buf) \
  t0 = _mm_unpackhi_epi32(m0, m1); \
  t1 = _mm_blend_epi16(t0, m3, 0x0F); \
  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(2, 0, 3, 1));

/*
 * buf <- { m2[3], m3[2], m0[1], m2[1] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 11, 14, 1, 9.
 * Overwrites the scratch variables t0..t2 of the expansion site.
 */
#define LOAD_MSG_7_2(buf) \
  t0 = _mm_blend_epi16(m2, m3, 0x30); \
  t1 = _mm_srli_si128(m0, 4); \
  t2 = _mm_blend_epi16(t0, t1, 0x03); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 0, 2, 3));

/*
 * buf <- { m1[1], m3[3], m2[0], m0[2] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 5, 15, 8, 2.
 * Overwrites the scratch variables t0..t2 of the expansion site.
 */
#define LOAD_MSG_7_3(buf) \
  t0 = _mm_unpackhi_epi64(m0, m3); \
  t1 = _mm_unpacklo_epi64(m1, m2); \
  t2 = _mm_blend_epi16(t0, t1, 0x3C); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0, 2, 3, 1));

/*
 * buf <- { m0[0], m1[0], m1[2], m2[2] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 0, 4, 6, 10.
 * Overwrites the scratch variables t0 and t1 of the expansion site.
 */
#define LOAD_MSG_7_4(buf) \
  t0 = _mm_unpacklo_epi32(m0, m1); \
  t1 = _mm_unpackhi_epi32(m1, m2); \
  buf = _mm_unpacklo_epi64(t0, t1);

/*
 * buf <- { m1[2], m3[2], m2[3], m0[0] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 6, 14, 11, 0.
 * Overwrites the scratch variables t0..t2 of the expansion site.
 * The closing _mm_shufflehi_epi16 moves 16-bit halves in pairs, which
 * amounts to swapping the two upper 32-bit lanes of t2.
 */
#define LOAD_MSG_8_1(buf) \
  t0 = _mm_unpackhi_epi32(m1, m3); \
  t1 = _mm_unpacklo_epi64(t0, m0); \
  t2 = _mm_blend_epi16(t1, m2, 0xC0); \
  buf = _mm_shufflehi_epi16(t2, _MM_SHUFFLE(1, 0, 3, 2));

/*
 * buf <- { m3[3], m2[1], m0[3], m2[0] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 15, 9, 3, 8.
 * Overwrites the scratch variables t0 and t1 of the expansion site.
 */
#define LOAD_MSG_8_2(buf) \
  t0 = _mm_unpackhi_epi32(m0, m3); \
  t1 = _mm_blend_epi16(m2, t0, 0xF0); \
  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(0, 2, 1, 3));

/*
 * buf <- { m3[0], m3[1], m0[1], m2[2] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 12, 13, 1, 10.
 * Overwrites the scratch variables t0 and t1 of the expansion site.
 */
#define LOAD_MSG_8_3(buf) \
  t0 = _mm_blend_epi16(m2, m0, 0x0C); \
  t1 = _mm_slli_si128(t0, 4); \
  buf = _mm_blend_epi16(t1, m3, 0x0F);

/*
 * buf <- { m0[2], m1[3], m1[0], m1[1] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 2, 7, 4, 5.
 * Overwrites the scratch variable t0 of the expansion site.
 */
#define LOAD_MSG_8_4(buf) \
  t0 = _mm_blend_epi16(m1, m0, 0x30); \
  buf = _mm_shuffle_epi32(t0, _MM_SHUFFLE(1, 0, 3, 2));

/*
 * buf <- { m2[2], m2[0], m1[3], m0[1] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 10, 8, 7, 1.
 * Overwrites the scratch variables t0..t2 of the expansion site.
 */
#define LOAD_MSG_9_1(buf) \
  t0 = _mm_blend_epi16(m0, m2, 0x03); \
  t1 = _mm_blend_epi16(m1, m2, 0x30); \
  t2 = _mm_blend_epi16(t1, t0, 0x0F); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1, 3, 0, 2));

/*
 * buf <- { m0[2], m1[0], m1[2], m1[1] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 2, 4, 6, 5.
 * Overwrites the scratch variables t0 and t1 of the expansion site.
 */
#define LOAD_MSG_9_2(buf) \
  t0 = _mm_slli_si128(m0, 4); \
  t1 = _mm_blend_epi16(m1, t0, 0xC0); \
  buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(1, 2, 0, 3));

/*
 * buf <- { m3[3], m2[1], m0[3], m3[1] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 15, 9, 3, 13.
 * Overwrites the scratch variables t0..t2 of the expansion site.
 */
#define LOAD_MSG_9_3(buf) \
  t0 = _mm_unpackhi_epi32(m0, m3); \
  t1 = _mm_unpacklo_epi32(m2, m3); \
  t2 = _mm_unpackhi_epi64(t0, t1); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 0, 2, 1));

/*
 * buf <- { m2[3], m3[2], m3[0], m0[0] } (32-bit lanes, lowest first).
 * If m0..m3 hold message words 0..15 in order (assumed; they come from the
 * expansion site), these are words 11, 14, 12, 0.
 * Overwrites the scratch variables t0..t2 of the expansion site.
 */
#define LOAD_MSG_9_4(buf) \
  t0 = _mm_blend_epi16(m3, m2, 0xC0); \
  t1 = _mm_unpacklo_epi32(m0, m3); \
  t2 = _mm_blend_epi16(t0, t1, 0x0F); \
  buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0, 1, 2, 3));

#endif
