/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_POWER_ASM_MACROS   1
#include "inner.h"
/*
 * This is the GHASH implementation that leverages the POWER8 opcodes.
 */
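
/*
 * For reference: GHASH absorbs 16-byte blocks into a 128-bit state y
 * with the recurrence y <- (y XOR block)*h, the multiplication being
 * carried out in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1, with the
 * reversed ("little-endian") bit order mandated by GCM. The disabled
 * sketch below (a hypothetical helper, not part of the library, and
 * not constant-time) shows the per-block operation bit by bit; the
 * assembly further down performs the same computation with vpmsumd.
 */
#if 0
static void
ghash_block_ref(unsigned char y[16], const unsigned char h[16],
	const unsigned char block[16])
{
	unsigned char z[16], v[16];
	int i, j, k;

	for (i = 0; i < 16; i ++) {
		y[i] ^= block[i];
	}
	memset(z, 0, sizeof z);
	memcpy(v, y, sizeof v);
	/*
	 * Scan the bits of h from most to least significant; z
	 * accumulates the product, while v holds y times successive
	 * powers of x (a right shift, in GCM bit order).
	 */
	for (i = 0; i < 16; i ++) {
		for (j = 7; j >= 0; j --) {
			unsigned carry;

			if ((h[i] >> j) & 1) {
				for (k = 0; k < 16; k ++) {
					z[k] ^= v[k];
				}
			}
			carry = v[15] & 1;
			for (k = 15; k > 0; k --) {
				v[k] = (v[k] >> 1) | (v[k - 1] << 7);
			}
			v[0] >>= 1;
			if (carry) {
				/* reduction: x^128 = x^7 + x^2 + x + 1 */
				v[0] ^= 0xE1;
			}
		}
	}
	memcpy(y, z, sizeof z);
}
#endif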

#if BR_POWER8

/*
 * Some symbolic names for registers.
 *   HB0 = 16 bytes of value 0
 *   HB1 = 16 bytes of value 1
 *   HB2 = 16 bytes of value 2
 *   HB6 = 16 bytes of value 6
 *   HB7 = 16 bytes of value 7
 *   TT0, TT1 and TT2 are temporaries
 *
 * BSW holds the pattern for byteswapping 32-bit words; this is set only
 * on little-endian systems. XBSW is the same register with the +32 offset
 * for access with the VSX opcodes.
 */
#define HB0     0
#define HB1     1
#define HB2     2
#define HB6     3
#define HB7     4
#define TT0     5
#define TT1     6
#define TT2     7

#define BSW     8
#define XBSW   40
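
/*
 * Note: vector registers v0..v31 alias VSX registers 32..63, so adding
 * 32 to a VR index yields the VSX index of the same register; hence
 * XBSW = BSW + 32, and the raw numbers used with the VSX opcodes below
 * (e.g. 41 for v9, 60 for v28).
 */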

/*
 * Macro to initialise the constants.
 */
#define INIT \
	vxor(HB0, HB0, HB0) \
	vspltisb(HB1, 1) \
	vspltisb(HB2, 2) \
	vspltisb(HB6, 6) \
	vspltisb(HB7, 7) \
	INIT_BSW

/*
 * Fix endianness of a value after reading it or before writing it, if
 * necessary.
 */
#if BR_POWER8_LE
#define INIT_BSW   lxvw4x(XBSW, 0, %[idx2be])
#define FIX_ENDIAN(xx)   vperm(xx, xx, xx, BSW)
#else
#define INIT_BSW
#define FIX_ENDIAN(xx)
#endif

/*
 * Shift x0:x1 (a 256-bit value) left by one bit. This corrective step is
 * needed because GHASH is specified with a fully little-endian bit
 * convention, while the opcodes use a big-endian convention, so the
 * 255-bit product ends up one bit to the right.
 */
#define SL_256(x0, x1) \
	vsldoi(TT0, HB0, x1, 1) \
	vsl(x0, x0, HB1) \
	vsr(TT0, TT0, HB7) \
	vsl(x1, x1, HB1) \
	vxor(x0, x0, TT0)
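
/*
 * In terms of the two 128-bit halves, SL_256 computes:
 *   x0 <- (x0 << 1) | (top bit of x1)
 *   x1 <- x1 << 1
 * The vsldoi/vsr pair isolates the top bit of x1 (as bit 0 of TT0)
 * so that it can be XORed into the shifted x0.
 */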

/*
 * Reduce x0:x1 in GF(2^128), result in xd (register xd may be the same as
 * x0 or x1, or a different register). x0 and x1 are modified.
 */
#define REDUCE_F128(xd, x0, x1) \
	vxor(x0, x0, x1) \
	vsr(TT0, x1, HB1) \
	vsr(TT1, x1, HB2) \
	vsr(TT2, x1, HB7) \
	vxor(x0, x0, TT0) \
	vxor(TT1, TT1, TT2) \
	vxor(x0, x0, TT1) \
	vsldoi(x1, x1, HB0, 15) \
	vsl(TT1, x1, HB6) \
	vsl(TT2, x1, HB1) \
	vxor(x1, TT1, TT2) \
	vsr(TT0, x1, HB1) \
	vsr(TT1, x1, HB2) \
	vsr(TT2, x1, HB7) \
	vxor(x0, x0, x1) \
	vxor(x0, x0, TT0) \
	vxor(TT1, TT1, TT2) \
	vxor(xd, x0, TT1)
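
/*
 * The reduction is modulo the GHASH pentanomial
 * x^128 + x^7 + x^2 + x + 1. In the reflected bit order used here,
 * folding the upper half x1 into x0 amounts to
 * x0 ^= x1 ^ (x1 >> 1) ^ (x1 >> 2) ^ (x1 >> 7); the shifts push a few
 * bits out of the low end of x1, so the second part of the macro
 * gathers those bits (the vsldoi/vsl steps) and performs one more
 * fold, after which the result fits in 128 bits.
 */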

/* see bearssl_hash.h */
void
br_ghash_pwr8(void *y, const void *h, const void *data, size_t len)
{
	const unsigned char *buf1, *buf2;
	size_t num4, num1;
	unsigned char tmp[64];
	long cc0, cc1, cc2, cc3;

#if BR_POWER8_LE
	static const uint32_t idx2be[] = {
		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
	};
#endif

	buf1 = data;

	/*
	 * The assembly code processes the data in two chunks; the
	 * first chunk must contain a number of blocks which is a
	 * multiple of 4. Since the processing of the first chunk is
	 * faster, we want to make it as big as possible.
	 *
	 * For the remainder, there are two possibilities:
	 * -- if the remainder size is a multiple of 16, then it is
	 *    used in place;
	 * -- otherwise, it is copied into the tmp[] array and padded
	 *    with zeros.
	 */
	num4 = len >> 6;
	buf2 = buf1 + (num4 << 6);
	len &= 63;
	num1 = (len + 15) >> 4;
	if ((len & 15) != 0) {
		memcpy(tmp, buf2, len);
		memset(tmp + len, 0, (num1 << 4) - len);
		buf2 = tmp;
	}
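
	/*
	 * Example: for len = 100, num4 = 1 (the first 64 bytes are
	 * processed four blocks at a time), and the remaining 36
	 * bytes are copied into tmp[] and zero-padded to num1 = 3
	 * blocks (48 bytes).
	 */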

	cc0 = 0;
	cc1 = 16;
	cc2 = 32;
	cc3 = 48;
	asm volatile (
		INIT

		/*
		 * Load current h (denoted hereafter h1) in v9.
		 */
		lxvw4x(41, 0, %[h])
		FIX_ENDIAN(9)

		/*
		 * Load current y into v28.
		 */
		lxvw4x(60, 0, %[y])
		FIX_ENDIAN(28)

		/*
		 * Split h1 into three registers:
		 *   v17 = h1_1:h1_0
		 *   v18 = 0:h1_0
		 *   v19 = h1_1:0
		 */
		xxpermdi(49, 41, 41, 2)
		vsldoi(18, HB0, 9, 8)
		vsldoi(19, 9, HB0, 8)
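
		/*
		 * Note: vpmsumd computes the XOR of the two 64x64
		 * carryless products of corresponding doublewords:
		 * with va = a0:a1 and vb = b0:b1, the result is
		 * (a0*b0) ^ (a1*b1) in GF(2)[X]. Splitting a value
		 * against a zero half, as above, thus lets a single
		 * vpmsumd isolate one 64x64 product.
		 */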

		/*
		 * If num4 is 0, skip directly to the second chunk.
		 */
		cmpldi(%[num4], 0)
		beq(chunk1)

		/*
		 * Compute h2 = h*h in v10.
		 */
		vpmsumd(10, 18, 18)
		vpmsumd(11, 19, 19)
		SL_256(10, 11)
		REDUCE_F128(10, 10, 11)

		/*
		 * Compute h3 = h*h*h in v11.
		 * We first split h2 into:
		 *   v10 = h2_0:h2_1
		 *   v11 = 0:h2_0
		 *   v12 = h2_1:0
		 * Then we do the product with h1, and reduce into v11.
		 */
		vsldoi(11, HB0, 10, 8)
		vsldoi(12, 10, HB0, 8)
		vpmsumd(13, 10, 17)
		vpmsumd(11, 11, 18)
		vpmsumd(12, 12, 19)
		vsldoi(14, HB0, 13, 8)
		vsldoi(15, 13, HB0, 8)
		vxor(11, 11, 14)
		vxor(12, 12, 15)
		SL_256(11, 12)
		REDUCE_F128(11, 11, 12)

		/*
		 * Compute h4 = h*h*h*h in v12. This is done by squaring h2.
		 */
		vsldoi(12, HB0, 10, 8)
		vsldoi(13, 10, HB0, 8)
		vpmsumd(12, 12, 12)
		vpmsumd(13, 13, 13)
		SL_256(12, 13)
		REDUCE_F128(12, 12, 13)

		/*
		 * Repack h1, h2, h3 and h4:
		 *   v13 = h4_0:h3_0
		 *   v14 = h4_1:h3_1
		 *   v15 = h2_0:h1_0
		 *   v16 = h2_1:h1_1
		 */
		xxpermdi(45, 44, 43, 0)
		xxpermdi(46, 44, 43, 3)
		xxpermdi(47, 42, 41, 0)
		xxpermdi(48, 42, 41, 3)
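
		/*
		 * With h^1..h^4 at hand, each group of four blocks
		 * b0..b3 is absorbed as:
		 *   y' = ((y ^ b0)*h^4) ^ (b1*h^3) ^ (b2*h^2) ^ (b3*h)
		 * which is the per-block recurrence y <- (y ^ b)*h
		 * unrolled four times; only one reduction is then
		 * needed per group.
		 */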

		/*
		 * Loop for each group of four blocks.
		 */
		mtctr(%[num4])
	label(loop4)
		/*
		 * Read the next four blocks.
		 *   v20 = y + a0 = b0
		 *   v21 = a1 = b1
		 *   v22 = a2 = b2
		 *   v23 = a3 = b3
		 */
		lxvw4x(52, %[cc0], %[buf1])
		lxvw4x(53, %[cc1], %[buf1])
		lxvw4x(54, %[cc2], %[buf1])
		lxvw4x(55, %[cc3], %[buf1])
		FIX_ENDIAN(20)
		FIX_ENDIAN(21)
		FIX_ENDIAN(22)
		FIX_ENDIAN(23)
		addi(%[buf1], %[buf1], 64)
		vxor(20, 20, 28)

		/*
		 * Repack the blocks into v9, v10, v11 and v12.
		 *   v9  = b0_0:b1_0
		 *   v10 = b0_1:b1_1
		 *   v11 = b2_0:b3_0
		 *   v12 = b2_1:b3_1
		 */
		xxpermdi(41, 52, 53, 0)
		xxpermdi(42, 52, 53, 3)
		xxpermdi(43, 54, 55, 0)
		xxpermdi(44, 54, 55, 3)

		/*
		 * Compute the products.
		 *   v20 = b0_0*h4_0 + b1_0*h3_0
		 *   v21 = b0_1*h4_0 + b1_1*h3_0
		 *   v22 = b0_0*h4_1 + b1_0*h3_1
		 *   v23 = b0_1*h4_1 + b1_1*h3_1
		 *   v24 = b2_0*h2_0 + b3_0*h1_0
		 *   v25 = b2_1*h2_0 + b3_1*h1_0
		 *   v26 = b2_0*h2_1 + b3_0*h1_1
		 *   v27 = b2_1*h2_1 + b3_1*h1_1
		 */
		vpmsumd(20, 13, 9)
		vpmsumd(21, 13, 10)
		vpmsumd(22, 14, 9)
		vpmsumd(23, 14, 10)
		vpmsumd(24, 15, 11)
		vpmsumd(25, 15, 12)
		vpmsumd(26, 16, 11)
		vpmsumd(27, 16, 12)

		/*
		 * Sum products into a single 256-bit result in v11:v12.
		 */
		vxor(11, 20, 24)
		vxor(12, 23, 27)
		vxor( 9, 21, 22)
		vxor(10, 25, 26)
		vxor(20, 9, 10)
		vsldoi( 9, HB0, 20, 8)
		vsldoi(10, 20, HB0, 8)
		vxor(11, 11, 9)
		vxor(12, 12, 10)

		/*
		 * Fix and reduce in GF(2^128); this is the new y (in v28).
		 */
		SL_256(11, 12)
		REDUCE_F128(28, 11, 12)

		/*
		 * Loop for next group of four blocks.
		 */
		bdnz(loop4)

		/*
		 * Process second chunk, one block at a time.
		 */
	label(chunk1)
		cmpldi(%[num1], 0)
		beq(done)

		mtctr(%[num1])
	label(loop1)
		/*
		 * Load next data block and XOR it into y.
		 */
		lxvw4x(41, 0, %[buf2])
#if BR_POWER8_LE
		FIX_ENDIAN(9)
#endif
		addi(%[buf2], %[buf2], 16)
		vxor(9, 28, 9)

		/*
		 * Split y into doublewords:
		 *   v9  = y_0:y_1
		 *   v10 = 0:y_0
		 *   v11 = y_1:0
		 */
		vsldoi(10, HB0, 9, 8)
		vsldoi(11, 9, HB0, 8)

		/*
		 * Compute products with h:
		 *   v12 = y_0 * h_0
		 *   v13 = y_1 * h_1
		 *   v14 = y_1 * h_0 + y_0 * h_1
		 */
		vpmsumd(14, 9, 17)
		vpmsumd(12, 10, 18)
		vpmsumd(13, 11, 19)

		/*
		 * Propagate v14 into v12:v13 to finalise the product.
		 */
		vsldoi(10, HB0, 14, 8)
		vsldoi(11, 14, HB0, 8)
		vxor(12, 12, 10)
		vxor(13, 13, 11)

		/*
		 * Fix result and reduce into v28 (next value for y).
		 */
		SL_256(12, 13)
		REDUCE_F128(28, 12, 13)
		bdnz(loop1)

	label(done)
		/*
		 * Write back the new y.
		 */
		FIX_ENDIAN(28)
		stxvw4x(60, 0, %[y])

		: [buf1] "+b" (buf1), [buf2] "+b" (buf2)
		: [y] "b" (y), [h] "b" (h), [num4] "b" (num4), [num1] "b" (num1),
		  [cc0] "b" (cc0), [cc1] "b" (cc1), [cc2] "b" (cc2), [cc3] "b" (cc3)
#if BR_POWER8_LE
		, [idx2be] "b" (idx2be)
#endif
		: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
		  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
		  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
		  "ctr", "memory"
	);
}

/* see bearssl_hash.h */
br_ghash
br_ghash_pwr8_get(void)
{
	return &br_ghash_pwr8;
}
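
/*
 * Typical use (illustrative sketch): callers probe for the accelerated
 * implementation and fall back to a portable one when this function
 * returns 0, e.g.:
 *
 *   br_ghash gh = br_ghash_pwr8_get();
 *   if (gh == 0) {
 *           gh = &br_ghash_ctmul;
 *   }
 *   gh(y, h, data, len);
 */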

#else

/* see bearssl_hash.h */
br_ghash
br_ghash_pwr8_get(void)
{
	return 0;
}

#endif