xref: /freebsd/sys/contrib/libb2/blake2s-load-xop.h (revision 0e33efe4e4b5d24e2d416938af8bc6e6e4160ec8)
/*
   BLAKE2 reference source code package - optimized C implementations

   Written in 2012 by Samuel Neves <sneves@dei.uc.pt>

   To the extent possible under law, the author(s) have dedicated all copyright
   and related and neighboring rights to this software to the public domain
   worldwide. This software is distributed without any warranty.

   You should have received a copy of the CC0 Public Domain Dedication along with
   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
*/
13*0e33efe4SConrad Meyer #pragma once
14*0e33efe4SConrad Meyer #ifndef __BLAKE2S_LOAD_XOP_H__
15*0e33efe4SConrad Meyer #define __BLAKE2S_LOAD_XOP_H__
16*0e33efe4SConrad Meyer 
/*
 * TOB(x): build the 32-bit byte selector for 32-bit word number x.
 * The four bytes of the result, least significant first, are
 * 4x, 4x+1, 4x+2 and 4x+3, i.e. the byte indices of word x.
 */
#define TOB(x) (0x03020100 + 0x01010101 * 4 * (x))
18*0e33efe4SConrad Meyer 
/* Basic emulation of VPPERM (_mm_perm_epi8), for testing purposes; kept commented out. */
/*static __m128i _mm_perm_epi8(const __m128i src1, const __m128i src2, const __m128i sel)
{
   const __m128i sixteen = _mm_set1_epi8(16);
   const __m128i t0 = _mm_shuffle_epi8(src1, sel);
   const __m128i s1 = _mm_shuffle_epi8(src2, _mm_sub_epi8(sel, sixteen));
   const __m128i mask = _mm_or_si128(_mm_cmpeq_epi8(sel, sixteen),
                                     _mm_cmpgt_epi8(sel, sixteen)); // (>=16) = 0xff : 00
   return _mm_blendv_epi8(t0, s1, mask);
}*/
29*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words taken from the m0/m1 pair with a single
 * _mm_perm_epi8. TOB(i) selects word i (0-3: first operand, 4-7: second).
 * Expects m0 and m1 to be in scope at the point of expansion.
 */
#define LOAD_MSG_0_1(buf) \
  (buf) = _mm_perm_epi8(m0, m1, \
                        _mm_set_epi32(TOB(6), TOB(4), TOB(2), TOB(0)));
32*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words taken from the m0/m1 pair with a single
 * _mm_perm_epi8. TOB(i) selects word i (0-3: first operand, 4-7: second).
 * Expects m0 and m1 to be in scope at the point of expansion.
 */
#define LOAD_MSG_0_2(buf) \
  (buf) = _mm_perm_epi8(m0, m1, \
                        _mm_set_epi32(TOB(7), TOB(5), TOB(3), TOB(1)));
35*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words taken from the m2/m3 pair with a single
 * _mm_perm_epi8. TOB(i) selects word i (0-3: first operand, 4-7: second).
 * Expects m2 and m3 to be in scope at the point of expansion.
 */
#define LOAD_MSG_0_3(buf) \
  (buf) = _mm_perm_epi8(m2, m3, \
                        _mm_set_epi32(TOB(6), TOB(4), TOB(2), TOB(0)));
38*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words taken from the m2/m3 pair with a single
 * _mm_perm_epi8. TOB(i) selects word i (0-3: first operand, 4-7: second).
 * Expects m2 and m3 to be in scope at the point of expansion.
 */
#define LOAD_MSG_0_4(buf) \
  (buf) = _mm_perm_epi8(m2, m3, \
                        _mm_set_epi32(TOB(7), TOB(5), TOB(3), TOB(1)));
41*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m1, m2 and m3 in two steps.
 * t0 is overwritten with the intermediate m1/m2 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_1_1(buf) \
  t0 = _mm_perm_epi8(m1, m2, \
                     _mm_set_epi32(TOB(0), TOB(5), TOB(0), TOB(0))); \
  (buf) = _mm_perm_epi8(t0, m3, \
                        _mm_set_epi32(TOB(5), TOB(2), TOB(1), TOB(6)));
45*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m1, m2 and m3 in two steps.
 * t1 is overwritten with the intermediate m1/m2 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_1_2(buf) \
  t1 = _mm_perm_epi8(m1, m2, \
                     _mm_set_epi32(TOB(2), TOB(0), TOB(4), TOB(6))); \
  (buf) = _mm_perm_epi8(t1, m3, \
                        _mm_set_epi32(TOB(3), TOB(7), TOB(1), TOB(0)));
49*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m1 and m2 in two steps.
 * t0 is overwritten with the intermediate m0/m1 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_1_3(buf) \
  t0 = _mm_perm_epi8(m0, m1, \
                     _mm_set_epi32(TOB(5), TOB(0), TOB(0), TOB(1))); \
  (buf) = _mm_perm_epi8(t0, m2, \
                        _mm_set_epi32(TOB(3), TOB(7), TOB(1), TOB(0)));
53*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m1 and m3 in two steps.
 * t1 is overwritten with the intermediate m0/m1 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_1_4(buf) \
  t1 = _mm_perm_epi8(m0, m1, \
                     _mm_set_epi32(TOB(3), TOB(7), TOB(2), TOB(0))); \
  (buf) = _mm_perm_epi8(t1, m3, \
                        _mm_set_epi32(TOB(3), TOB(2), TOB(1), TOB(4)));
57*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m1, m2 and m3 in two steps.
 * t0 is overwritten with the intermediate m1/m2 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_2_1(buf) \
  t0 = _mm_perm_epi8(m1, m2, \
                     _mm_set_epi32(TOB(0), TOB(1), TOB(0), TOB(7))); \
  (buf) = _mm_perm_epi8(t0, m3, \
                        _mm_set_epi32(TOB(7), TOB(2), TOB(4), TOB(0)));
61*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m2 and m3 in two steps.
 * t1 is overwritten with the intermediate m0/m2 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_2_2(buf) \
  t1 = _mm_perm_epi8(m0, m2, \
                     _mm_set_epi32(TOB(0), TOB(2), TOB(0), TOB(4))); \
  (buf) = _mm_perm_epi8(t1, m3, \
                        _mm_set_epi32(TOB(5), TOB(2), TOB(1), TOB(0)));
65*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m1 and m2 in two steps.
 * t0 is overwritten with the intermediate m0/m1 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_2_3(buf) \
  t0 = _mm_perm_epi8(m0, m1, \
                     _mm_set_epi32(TOB(0), TOB(7), TOB(3), TOB(0))); \
  (buf) = _mm_perm_epi8(t0, m2, \
                        _mm_set_epi32(TOB(5), TOB(2), TOB(1), TOB(6)));
69*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m1 and m3 in two steps.
 * t1 is overwritten with the intermediate m0/m1 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_2_4(buf) \
  t1 = _mm_perm_epi8(m0, m1, \
                     _mm_set_epi32(TOB(4), TOB(1), TOB(6), TOB(0))); \
  (buf) = _mm_perm_epi8(t1, m3, \
                        _mm_set_epi32(TOB(3), TOB(2), TOB(1), TOB(6)));
73*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m1, m2 and m3 in three steps.
 * t0 is overwritten twice: first with the m0/m1 selection, then with that
 * result combined with m2.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_3_1(buf) \
  t0 = _mm_perm_epi8(m0, m1, \
                     _mm_set_epi32(TOB(0), TOB(0), TOB(3), TOB(7))); \
  t0 = _mm_perm_epi8(t0, m2, \
                     _mm_set_epi32(TOB(7), TOB(2), TOB(1), TOB(0))); \
  (buf) = _mm_perm_epi8(t0, m3, \
                        _mm_set_epi32(TOB(3), TOB(5), TOB(1), TOB(0)));
78*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m2 and m3 in two steps.
 * t1 is overwritten with the intermediate m0/m2 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_3_2(buf) \
  t1 = _mm_perm_epi8(m0, m2, \
                     _mm_set_epi32(TOB(0), TOB(0), TOB(1), TOB(5))); \
  (buf) = _mm_perm_epi8(t1, m3, \
                        _mm_set_epi32(TOB(6), TOB(4), TOB(1), TOB(0)));
82*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m1 and m3 in two steps.
 * t0 is overwritten with the intermediate m0/m1 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_3_3(buf) \
  t0 = _mm_perm_epi8(m0, m1, \
                     _mm_set_epi32(TOB(0), TOB(4), TOB(5), TOB(2))); \
  (buf) = _mm_perm_epi8(t0, m3, \
                        _mm_set_epi32(TOB(7), TOB(2), TOB(1), TOB(0)));
86*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m1 and m2 in two steps.
 * t1 is overwritten with the intermediate m0/m1 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_3_4(buf) \
  t1 = _mm_perm_epi8(m0, m1, \
                     _mm_set_epi32(TOB(0), TOB(0), TOB(0), TOB(6))); \
  (buf) = _mm_perm_epi8(t1, m2, \
                        _mm_set_epi32(TOB(4), TOB(2), TOB(6), TOB(0)));
90*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m1 and m2 in two steps.
 * t0 is overwritten with the intermediate m0/m1 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_4_1(buf) \
  t0 = _mm_perm_epi8(m0, m1, \
                     _mm_set_epi32(TOB(0), TOB(2), TOB(5), TOB(0))); \
  (buf) = _mm_perm_epi8(t0, m2, \
                        _mm_set_epi32(TOB(6), TOB(2), TOB(1), TOB(5)));
94*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m1 and m3 in two steps.
 * t1 is overwritten with the intermediate m0/m1 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_4_2(buf) \
  t1 = _mm_perm_epi8(m0, m1, \
                     _mm_set_epi32(TOB(0), TOB(4), TOB(7), TOB(0))); \
  (buf) = _mm_perm_epi8(t1, m3, \
                        _mm_set_epi32(TOB(7), TOB(2), TOB(1), TOB(0)));
98*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m1, m2 and m3 in three steps.
 * t0 is overwritten twice: first with the m0/m1 selection, then with that
 * result combined with m2.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_4_3(buf) \
  t0 = _mm_perm_epi8(m0, m1, \
                     _mm_set_epi32(TOB(3), TOB(6), TOB(0), TOB(0))); \
  t0 = _mm_perm_epi8(t0, m2, \
                     _mm_set_epi32(TOB(3), TOB(2), TOB(7), TOB(0))); \
  (buf) = _mm_perm_epi8(t0, m3, \
                        _mm_set_epi32(TOB(3), TOB(2), TOB(1), TOB(6)));
103*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m2 and m3 in two steps.
 * t1 is overwritten with the intermediate m0/m2 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_4_4(buf) \
  t1 = _mm_perm_epi8(m0, m2, \
                     _mm_set_epi32(TOB(0), TOB(4), TOB(0), TOB(1))); \
  (buf) = _mm_perm_epi8(t1, m3, \
                        _mm_set_epi32(TOB(5), TOB(2), TOB(4), TOB(0)));
107*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m1 and m2 in two steps.
 * t0 is overwritten with the intermediate m0/m1 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_5_1(buf) \
  t0 = _mm_perm_epi8(m0, m1, \
                     _mm_set_epi32(TOB(0), TOB(0), TOB(6), TOB(2))); \
  (buf) = _mm_perm_epi8(t0, m2, \
                        _mm_set_epi32(TOB(4), TOB(2), TOB(1), TOB(0)));
111*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m2 and m3 in two steps.
 * t1 is overwritten with the intermediate m0/m2 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_5_2(buf) \
  t1 = _mm_perm_epi8(m0, m2, \
                     _mm_set_epi32(TOB(3), TOB(7), TOB(6), TOB(0))); \
  (buf) = _mm_perm_epi8(t1, m3, \
                        _mm_set_epi32(TOB(3), TOB(2), TOB(1), TOB(4)));
115*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m1 and m3 in two steps.
 * t0 is overwritten with the intermediate m0/m1 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_5_3(buf) \
  t0 = _mm_perm_epi8(m0, m1, \
                     _mm_set_epi32(TOB(1), TOB(0), TOB(7), TOB(4))); \
  (buf) = _mm_perm_epi8(t0, m3, \
                        _mm_set_epi32(TOB(3), TOB(7), TOB(1), TOB(0)));
119*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m1, m2 and m3 in two steps.
 * t1 is overwritten with the intermediate m1/m2 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_5_4(buf) \
  t1 = _mm_perm_epi8(m1, m2, \
                     _mm_set_epi32(TOB(5), TOB(0), TOB(1), TOB(0))); \
  (buf) = _mm_perm_epi8(t1, m3, \
                        _mm_set_epi32(TOB(3), TOB(6), TOB(1), TOB(5)));
123*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m1 and m3 in two steps.
 * t0 is overwritten with the intermediate m0/m1 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_6_1(buf) \
  t0 = _mm_perm_epi8(m0, m1, \
                     _mm_set_epi32(TOB(4), TOB(0), TOB(1), TOB(0))); \
  (buf) = _mm_perm_epi8(t0, m3, \
                        _mm_set_epi32(TOB(3), TOB(6), TOB(1), TOB(4)));
127*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m1, m2 and m3 in two steps.
 * t1 is overwritten with the intermediate m1/m2 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_6_2(buf) \
  t1 = _mm_perm_epi8(m1, m2, \
                     _mm_set_epi32(TOB(6), TOB(0), TOB(0), TOB(1))); \
  (buf) = _mm_perm_epi8(t1, m3, \
                        _mm_set_epi32(TOB(3), TOB(5), TOB(7), TOB(0)));
131*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m1 and m2 in two steps.
 * t0 is overwritten with the intermediate m0/m1 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_6_3(buf) \
  t0 = _mm_perm_epi8(m0, m1, \
                     _mm_set_epi32(TOB(0), TOB(0), TOB(6), TOB(0))); \
  (buf) = _mm_perm_epi8(t0, m2, \
                        _mm_set_epi32(TOB(4), TOB(5), TOB(1), TOB(0)));
135*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m1 and m2 in two steps.
 * t1 is overwritten with the intermediate m0/m1 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_6_4(buf) \
  t1 = _mm_perm_epi8(m0, m1, \
                     _mm_set_epi32(TOB(0), TOB(2), TOB(3), TOB(7))); \
  (buf) = _mm_perm_epi8(t1, m2, \
                        _mm_set_epi32(TOB(7), TOB(2), TOB(1), TOB(0)));
139*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m1 and m3 in two steps.
 * t0 is overwritten with the intermediate m0/m1 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_7_1(buf) \
  t0 = _mm_perm_epi8(m0, m1, \
                     _mm_set_epi32(TOB(3), TOB(0), TOB(7), TOB(0))); \
  (buf) = _mm_perm_epi8(t0, m3, \
                        _mm_set_epi32(TOB(3), TOB(4), TOB(1), TOB(5)));
143*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m2 and m3 in two steps.
 * t1 is overwritten with the intermediate m0/m2 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_7_2(buf) \
  t1 = _mm_perm_epi8(m0, m2, \
                     _mm_set_epi32(TOB(5), TOB(1), TOB(0), TOB(7))); \
  (buf) = _mm_perm_epi8(t1, m3, \
                        _mm_set_epi32(TOB(3), TOB(2), TOB(6), TOB(0)));
147*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m1, m2 and m3 in three steps.
 * t0 is overwritten twice: first with the m0/m1 selection, then with that
 * result combined with m2.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_7_3(buf) \
  t0 = _mm_perm_epi8(m0, m1, \
                     _mm_set_epi32(TOB(2), TOB(0), TOB(0), TOB(5))); \
  t0 = _mm_perm_epi8(t0, m2, \
                     _mm_set_epi32(TOB(3), TOB(4), TOB(1), TOB(0))); \
  (buf) = _mm_perm_epi8(t0, m3, \
                        _mm_set_epi32(TOB(3), TOB(2), TOB(7), TOB(0)));
152*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m1 and m2 in two steps.
 * t1 is overwritten with the intermediate m0/m1 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_7_4(buf) \
  t1 = _mm_perm_epi8(m0, m1, \
                     _mm_set_epi32(TOB(0), TOB(6), TOB(4), TOB(0))); \
  (buf) = _mm_perm_epi8(t1, m2, \
                        _mm_set_epi32(TOB(6), TOB(2), TOB(1), TOB(0)));
156*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m1, m2 and m3 in three steps.
 * t0 is overwritten twice: first with the m0/m1 selection, then with that
 * result combined with m2.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_8_1(buf) \
  t0 = _mm_perm_epi8(m0, m1, \
                     _mm_set_epi32(TOB(0), TOB(0), TOB(0), TOB(6))); \
  t0 = _mm_perm_epi8(t0, m2, \
                     _mm_set_epi32(TOB(3), TOB(7), TOB(1), TOB(0))); \
  (buf) = _mm_perm_epi8(t0, m3, \
                        _mm_set_epi32(TOB(3), TOB(2), TOB(6), TOB(0)));
161*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m2 and m3 in two steps.
 * t1 is overwritten with the intermediate m0/m2 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_8_2(buf) \
  t1 = _mm_perm_epi8(m0, m2, \
                     _mm_set_epi32(TOB(4), TOB(3), TOB(5), TOB(0))); \
  (buf) = _mm_perm_epi8(t1, m3, \
                        _mm_set_epi32(TOB(3), TOB(2), TOB(1), TOB(7)));
165*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m2 and m3 in two steps.
 * t0 is overwritten with the intermediate m0/m2 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 *
 * The last line deliberately has no trailing backslash: a continuation
 * there would splice whatever follows (currently a blank line) into this
 * macro, and would swallow the next #define if that blank line were removed.
 */
#define LOAD_MSG_8_3(buf) \
  t0 = _mm_perm_epi8(m0, m2, \
                     _mm_set_epi32(TOB(6), TOB(1), TOB(0), TOB(0))); \
  (buf) = _mm_perm_epi8(t0, m3, \
                        _mm_set_epi32(TOB(3), TOB(2), TOB(5), TOB(4)));
169*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words taken from the m0/m1 pair with a single
 * _mm_perm_epi8. TOB(i) selects word i (0-3: first operand, 4-7: second).
 * Expects m0 and m1 to be in scope at the point of expansion.
 */
#define LOAD_MSG_8_4(buf) \
  (buf) = _mm_perm_epi8(m0, m1, \
                        _mm_set_epi32(TOB(5), TOB(4), TOB(7), TOB(2)));
172*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m1 and m2 in two steps.
 * t0 is overwritten with the intermediate m0/m1 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_9_1(buf) \
  t0 = _mm_perm_epi8(m0, m1, \
                     _mm_set_epi32(TOB(1), TOB(7), TOB(0), TOB(0))); \
  (buf) = _mm_perm_epi8(t0, m2, \
                        _mm_set_epi32(TOB(3), TOB(2), TOB(4), TOB(6)));
176*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words taken from the m0/m1 pair with a single
 * _mm_perm_epi8. TOB(i) selects word i (0-3: first operand, 4-7: second).
 * Expects m0 and m1 to be in scope at the point of expansion.
 */
#define LOAD_MSG_9_2(buf) \
  (buf) = _mm_perm_epi8(m0, m1, \
                        _mm_set_epi32(TOB(5), TOB(6), TOB(4), TOB(2)));
179*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m2 and m3 in two steps.
 * t0 is overwritten with the intermediate m0/m2 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_9_3(buf) \
  t0 = _mm_perm_epi8(m0, m2, \
                     _mm_set_epi32(TOB(0), TOB(3), TOB(5), TOB(0))); \
  (buf) = _mm_perm_epi8(t0, m3, \
                        _mm_set_epi32(TOB(5), TOB(2), TOB(1), TOB(7)));
183*0e33efe4SConrad Meyer 
/*
 * Sets buf to four 32-bit words drawn from m0, m2 and m3 in two steps.
 * t1 is overwritten with the intermediate m0/m2 selection.
 * TOB(i) selects word i (0-3: first operand, 4-7: second operand).
 */
#define LOAD_MSG_9_4(buf) \
  t1 = _mm_perm_epi8(m0, m2, \
                     _mm_set_epi32(TOB(0), TOB(0), TOB(0), TOB(7))); \
  (buf) = _mm_perm_epi8(t1, m3, \
                        _mm_set_epi32(TOB(3), TOB(4), TOB(6), TOB(0)));
187*0e33efe4SConrad Meyer 
188*0e33efe4SConrad Meyer #endif
189*0e33efe4SConrad Meyer 
190