/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24*0957b409SSimon J. Gerraty
25*0957b409SSimon J. Gerraty #define BR_POWER_ASM_MACROS 1
26*0957b409SSimon J. Gerraty #include "inner.h"
27*0957b409SSimon J. Gerraty
28*0957b409SSimon J. Gerraty /*
29*0957b409SSimon J. Gerraty * This code contains the AES key schedule implementation using the
30*0957b409SSimon J. Gerraty * POWER8 opcodes.
31*0957b409SSimon J. Gerraty */
32*0957b409SSimon J. Gerraty
33*0957b409SSimon J. Gerraty #if BR_POWER8
34*0957b409SSimon J. Gerraty
35*0957b409SSimon J. Gerraty static void
key_schedule_128(unsigned char * sk,const unsigned char * key)36*0957b409SSimon J. Gerraty key_schedule_128(unsigned char *sk, const unsigned char *key)
37*0957b409SSimon J. Gerraty {
38*0957b409SSimon J. Gerraty long cc;
39*0957b409SSimon J. Gerraty
40*0957b409SSimon J. Gerraty static const uint32_t fmod[] = { 0x11B, 0x11B, 0x11B, 0x11B };
41*0957b409SSimon J. Gerraty #if BR_POWER8_LE
42*0957b409SSimon J. Gerraty static const uint32_t idx2be[] = {
43*0957b409SSimon J. Gerraty 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
44*0957b409SSimon J. Gerraty };
45*0957b409SSimon J. Gerraty #endif
46*0957b409SSimon J. Gerraty
47*0957b409SSimon J. Gerraty cc = 0;
48*0957b409SSimon J. Gerraty
49*0957b409SSimon J. Gerraty /*
50*0957b409SSimon J. Gerraty * We use the VSX instructions for loading and storing the
51*0957b409SSimon J. Gerraty * key/subkeys, since they support unaligned accesses. The rest
52*0957b409SSimon J. Gerraty * of the computation is VMX only. VMX register 0 is VSX
53*0957b409SSimon J. Gerraty * register 32.
54*0957b409SSimon J. Gerraty */
55*0957b409SSimon J. Gerraty asm volatile (
56*0957b409SSimon J. Gerraty
57*0957b409SSimon J. Gerraty /*
58*0957b409SSimon J. Gerraty * v0 = all-zero word
59*0957b409SSimon J. Gerraty * v1 = constant -8 / +8, copied into four words
60*0957b409SSimon J. Gerraty * v2 = current subkey
61*0957b409SSimon J. Gerraty * v3 = Rcon (x4 words)
62*0957b409SSimon J. Gerraty * v6 = constant 8, copied into four words
63*0957b409SSimon J. Gerraty * v7 = constant 0x11B, copied into four words
64*0957b409SSimon J. Gerraty * v8 = constant for byteswapping words
65*0957b409SSimon J. Gerraty */
66*0957b409SSimon J. Gerraty vspltisw(0, 0)
67*0957b409SSimon J. Gerraty #if BR_POWER8_LE
68*0957b409SSimon J. Gerraty vspltisw(1, -8)
69*0957b409SSimon J. Gerraty #else
70*0957b409SSimon J. Gerraty vspltisw(1, 8)
71*0957b409SSimon J. Gerraty #endif
72*0957b409SSimon J. Gerraty lxvw4x(34, 0, %[key])
73*0957b409SSimon J. Gerraty vspltisw(3, 1)
74*0957b409SSimon J. Gerraty vspltisw(6, 8)
75*0957b409SSimon J. Gerraty lxvw4x(39, 0, %[fmod])
76*0957b409SSimon J. Gerraty #if BR_POWER8_LE
77*0957b409SSimon J. Gerraty lxvw4x(40, 0, %[idx2be])
78*0957b409SSimon J. Gerraty #endif
79*0957b409SSimon J. Gerraty
80*0957b409SSimon J. Gerraty /*
81*0957b409SSimon J. Gerraty * First subkey is a copy of the key itself.
82*0957b409SSimon J. Gerraty */
83*0957b409SSimon J. Gerraty #if BR_POWER8_LE
84*0957b409SSimon J. Gerraty vperm(4, 2, 2, 8)
85*0957b409SSimon J. Gerraty stxvw4x(36, 0, %[sk])
86*0957b409SSimon J. Gerraty #else
87*0957b409SSimon J. Gerraty stxvw4x(34, 0, %[sk])
88*0957b409SSimon J. Gerraty #endif
89*0957b409SSimon J. Gerraty
90*0957b409SSimon J. Gerraty /*
91*0957b409SSimon J. Gerraty * Loop must run 10 times.
92*0957b409SSimon J. Gerraty */
93*0957b409SSimon J. Gerraty li(%[cc], 10)
94*0957b409SSimon J. Gerraty mtctr(%[cc])
95*0957b409SSimon J. Gerraty label(loop)
96*0957b409SSimon J. Gerraty /* Increment subkey address */
97*0957b409SSimon J. Gerraty addi(%[sk], %[sk], 16)
98*0957b409SSimon J. Gerraty
99*0957b409SSimon J. Gerraty /* Compute SubWord(RotWord(temp)) xor Rcon (into v4, splat) */
100*0957b409SSimon J. Gerraty vrlw(4, 2, 1)
101*0957b409SSimon J. Gerraty vsbox(4, 4)
102*0957b409SSimon J. Gerraty #if BR_POWER8_LE
103*0957b409SSimon J. Gerraty vxor(4, 4, 3)
104*0957b409SSimon J. Gerraty #else
105*0957b409SSimon J. Gerraty vsldoi(5, 3, 0, 3)
106*0957b409SSimon J. Gerraty vxor(4, 4, 5)
107*0957b409SSimon J. Gerraty #endif
108*0957b409SSimon J. Gerraty vspltw(4, 4, 3)
109*0957b409SSimon J. Gerraty
110*0957b409SSimon J. Gerraty /* XOR words for next subkey */
111*0957b409SSimon J. Gerraty vsldoi(5, 0, 2, 12)
112*0957b409SSimon J. Gerraty vxor(2, 2, 5)
113*0957b409SSimon J. Gerraty vsldoi(5, 0, 2, 12)
114*0957b409SSimon J. Gerraty vxor(2, 2, 5)
115*0957b409SSimon J. Gerraty vsldoi(5, 0, 2, 12)
116*0957b409SSimon J. Gerraty vxor(2, 2, 5)
117*0957b409SSimon J. Gerraty vxor(2, 2, 4)
118*0957b409SSimon J. Gerraty
119*0957b409SSimon J. Gerraty /* Store next subkey */
120*0957b409SSimon J. Gerraty #if BR_POWER8_LE
121*0957b409SSimon J. Gerraty vperm(4, 2, 2, 8)
122*0957b409SSimon J. Gerraty stxvw4x(36, 0, %[sk])
123*0957b409SSimon J. Gerraty #else
124*0957b409SSimon J. Gerraty stxvw4x(34, 0, %[sk])
125*0957b409SSimon J. Gerraty #endif
126*0957b409SSimon J. Gerraty
127*0957b409SSimon J. Gerraty /* Update Rcon */
128*0957b409SSimon J. Gerraty vadduwm(3, 3, 3)
129*0957b409SSimon J. Gerraty vsrw(4, 3, 6)
130*0957b409SSimon J. Gerraty vsubuwm(4, 0, 4)
131*0957b409SSimon J. Gerraty vand(4, 4, 7)
132*0957b409SSimon J. Gerraty vxor(3, 3, 4)
133*0957b409SSimon J. Gerraty
134*0957b409SSimon J. Gerraty bdnz(loop)
135*0957b409SSimon J. Gerraty
136*0957b409SSimon J. Gerraty : [sk] "+b" (sk), [cc] "+b" (cc)
137*0957b409SSimon J. Gerraty : [key] "b" (key), [fmod] "b" (fmod)
138*0957b409SSimon J. Gerraty #if BR_POWER8_LE
139*0957b409SSimon J. Gerraty , [idx2be] "b" (idx2be)
140*0957b409SSimon J. Gerraty #endif
141*0957b409SSimon J. Gerraty : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "ctr", "memory"
142*0957b409SSimon J. Gerraty );
143*0957b409SSimon J. Gerraty }
144*0957b409SSimon J. Gerraty
145*0957b409SSimon J. Gerraty static void
key_schedule_192(unsigned char * sk,const unsigned char * key)146*0957b409SSimon J. Gerraty key_schedule_192(unsigned char *sk, const unsigned char *key)
147*0957b409SSimon J. Gerraty {
148*0957b409SSimon J. Gerraty long cc;
149*0957b409SSimon J. Gerraty
150*0957b409SSimon J. Gerraty #if BR_POWER8_LE
151*0957b409SSimon J. Gerraty static const uint32_t idx2be[] = {
152*0957b409SSimon J. Gerraty 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
153*0957b409SSimon J. Gerraty };
154*0957b409SSimon J. Gerraty #endif
155*0957b409SSimon J. Gerraty
156*0957b409SSimon J. Gerraty cc = 0;
157*0957b409SSimon J. Gerraty
158*0957b409SSimon J. Gerraty /*
159*0957b409SSimon J. Gerraty * We use the VSX instructions for loading and storing the
160*0957b409SSimon J. Gerraty * key/subkeys, since they support unaligned accesses. The rest
161*0957b409SSimon J. Gerraty * of the computation is VMX only. VMX register 0 is VSX
162*0957b409SSimon J. Gerraty * register 32.
163*0957b409SSimon J. Gerraty */
164*0957b409SSimon J. Gerraty asm volatile (
165*0957b409SSimon J. Gerraty
166*0957b409SSimon J. Gerraty /*
167*0957b409SSimon J. Gerraty * v0 = all-zero word
168*0957b409SSimon J. Gerraty * v1 = constant -8 / +8, copied into four words
169*0957b409SSimon J. Gerraty * v2, v3 = current subkey
170*0957b409SSimon J. Gerraty * v5 = Rcon (x4 words) (already shifted on big-endian)
171*0957b409SSimon J. Gerraty * v6 = constant 8, copied into four words
172*0957b409SSimon J. Gerraty * v8 = constant for byteswapping words
173*0957b409SSimon J. Gerraty *
174*0957b409SSimon J. Gerraty * The left two words of v3 are ignored.
175*0957b409SSimon J. Gerraty */
176*0957b409SSimon J. Gerraty vspltisw(0, 0)
177*0957b409SSimon J. Gerraty #if BR_POWER8_LE
178*0957b409SSimon J. Gerraty vspltisw(1, -8)
179*0957b409SSimon J. Gerraty #else
180*0957b409SSimon J. Gerraty vspltisw(1, 8)
181*0957b409SSimon J. Gerraty #endif
182*0957b409SSimon J. Gerraty li(%[cc], 8)
183*0957b409SSimon J. Gerraty lxvw4x(34, 0, %[key])
184*0957b409SSimon J. Gerraty lxvw4x(35, %[cc], %[key])
185*0957b409SSimon J. Gerraty vsldoi(3, 3, 0, 8)
186*0957b409SSimon J. Gerraty vspltisw(5, 1)
187*0957b409SSimon J. Gerraty #if !BR_POWER8_LE
188*0957b409SSimon J. Gerraty vsldoi(5, 5, 0, 3)
189*0957b409SSimon J. Gerraty #endif
190*0957b409SSimon J. Gerraty vspltisw(6, 8)
191*0957b409SSimon J. Gerraty #if BR_POWER8_LE
192*0957b409SSimon J. Gerraty lxvw4x(40, 0, %[idx2be])
193*0957b409SSimon J. Gerraty #endif
194*0957b409SSimon J. Gerraty
195*0957b409SSimon J. Gerraty /*
196*0957b409SSimon J. Gerraty * Loop must run 8 times. Each iteration produces 256
197*0957b409SSimon J. Gerraty * bits of subkeys, with a 64-bit overlap.
198*0957b409SSimon J. Gerraty */
199*0957b409SSimon J. Gerraty li(%[cc], 8)
200*0957b409SSimon J. Gerraty mtctr(%[cc])
201*0957b409SSimon J. Gerraty li(%[cc], 16)
202*0957b409SSimon J. Gerraty label(loop)
203*0957b409SSimon J. Gerraty
204*0957b409SSimon J. Gerraty /*
205*0957b409SSimon J. Gerraty * Last 6 words in v2:v3l. Compute next 6 words into
206*0957b409SSimon J. Gerraty * v3r:v4.
207*0957b409SSimon J. Gerraty */
208*0957b409SSimon J. Gerraty vrlw(10, 3, 1)
209*0957b409SSimon J. Gerraty vsbox(10, 10)
210*0957b409SSimon J. Gerraty vxor(10, 10, 5)
211*0957b409SSimon J. Gerraty vspltw(10, 10, 1)
212*0957b409SSimon J. Gerraty vsldoi(11, 0, 10, 8)
213*0957b409SSimon J. Gerraty
214*0957b409SSimon J. Gerraty vsldoi(12, 0, 2, 12)
215*0957b409SSimon J. Gerraty vxor(12, 2, 12)
216*0957b409SSimon J. Gerraty vsldoi(13, 0, 12, 12)
217*0957b409SSimon J. Gerraty vxor(12, 12, 13)
218*0957b409SSimon J. Gerraty vsldoi(13, 0, 12, 12)
219*0957b409SSimon J. Gerraty vxor(12, 12, 13)
220*0957b409SSimon J. Gerraty
221*0957b409SSimon J. Gerraty vspltw(13, 12, 3)
222*0957b409SSimon J. Gerraty vxor(13, 13, 3)
223*0957b409SSimon J. Gerraty vsldoi(14, 0, 3, 12)
224*0957b409SSimon J. Gerraty vxor(13, 13, 14)
225*0957b409SSimon J. Gerraty
226*0957b409SSimon J. Gerraty vsldoi(4, 12, 13, 8)
227*0957b409SSimon J. Gerraty vsldoi(14, 0, 3, 8)
228*0957b409SSimon J. Gerraty vsldoi(3, 14, 12, 8)
229*0957b409SSimon J. Gerraty
230*0957b409SSimon J. Gerraty vxor(3, 3, 11)
231*0957b409SSimon J. Gerraty vxor(4, 4, 10)
232*0957b409SSimon J. Gerraty
233*0957b409SSimon J. Gerraty /*
234*0957b409SSimon J. Gerraty * Update Rcon. Since for a 192-bit key, we use only 8
235*0957b409SSimon J. Gerraty * such constants, we will not hit the field modulus,
236*0957b409SSimon J. Gerraty * so a simple shift (addition) works well.
237*0957b409SSimon J. Gerraty */
238*0957b409SSimon J. Gerraty vadduwm(5, 5, 5)
239*0957b409SSimon J. Gerraty
240*0957b409SSimon J. Gerraty /*
241*0957b409SSimon J. Gerraty * Write out the two left 128-bit words
242*0957b409SSimon J. Gerraty */
243*0957b409SSimon J. Gerraty #if BR_POWER8_LE
244*0957b409SSimon J. Gerraty vperm(10, 2, 2, 8)
245*0957b409SSimon J. Gerraty vperm(11, 3, 3, 8)
246*0957b409SSimon J. Gerraty stxvw4x(42, 0, %[sk])
247*0957b409SSimon J. Gerraty stxvw4x(43, %[cc], %[sk])
248*0957b409SSimon J. Gerraty #else
249*0957b409SSimon J. Gerraty stxvw4x(34, 0, %[sk])
250*0957b409SSimon J. Gerraty stxvw4x(35, %[cc], %[sk])
251*0957b409SSimon J. Gerraty #endif
252*0957b409SSimon J. Gerraty addi(%[sk], %[sk], 24)
253*0957b409SSimon J. Gerraty
254*0957b409SSimon J. Gerraty /*
255*0957b409SSimon J. Gerraty * Shift words for next iteration.
256*0957b409SSimon J. Gerraty */
257*0957b409SSimon J. Gerraty vsldoi(2, 3, 4, 8)
258*0957b409SSimon J. Gerraty vsldoi(3, 4, 0, 8)
259*0957b409SSimon J. Gerraty
260*0957b409SSimon J. Gerraty bdnz(loop)
261*0957b409SSimon J. Gerraty
262*0957b409SSimon J. Gerraty /*
263*0957b409SSimon J. Gerraty * The loop wrote the first 50 subkey words, but we need
264*0957b409SSimon J. Gerraty * to produce 52, so we must do one last write.
265*0957b409SSimon J. Gerraty */
266*0957b409SSimon J. Gerraty #if BR_POWER8_LE
267*0957b409SSimon J. Gerraty vperm(10, 2, 2, 8)
268*0957b409SSimon J. Gerraty stxvw4x(42, 0, %[sk])
269*0957b409SSimon J. Gerraty #else
270*0957b409SSimon J. Gerraty stxvw4x(34, 0, %[sk])
271*0957b409SSimon J. Gerraty #endif
272*0957b409SSimon J. Gerraty
273*0957b409SSimon J. Gerraty : [sk] "+b" (sk), [cc] "+b" (cc)
274*0957b409SSimon J. Gerraty : [key] "b" (key)
275*0957b409SSimon J. Gerraty #if BR_POWER8_LE
276*0957b409SSimon J. Gerraty , [idx2be] "b" (idx2be)
277*0957b409SSimon J. Gerraty #endif
278*0957b409SSimon J. Gerraty : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
279*0957b409SSimon J. Gerraty "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory"
280*0957b409SSimon J. Gerraty );
281*0957b409SSimon J. Gerraty }
282*0957b409SSimon J. Gerraty
283*0957b409SSimon J. Gerraty static void
key_schedule_256(unsigned char * sk,const unsigned char * key)284*0957b409SSimon J. Gerraty key_schedule_256(unsigned char *sk, const unsigned char *key)
285*0957b409SSimon J. Gerraty {
286*0957b409SSimon J. Gerraty long cc;
287*0957b409SSimon J. Gerraty
288*0957b409SSimon J. Gerraty #if BR_POWER8_LE
289*0957b409SSimon J. Gerraty static const uint32_t idx2be[] = {
290*0957b409SSimon J. Gerraty 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
291*0957b409SSimon J. Gerraty };
292*0957b409SSimon J. Gerraty #endif
293*0957b409SSimon J. Gerraty
294*0957b409SSimon J. Gerraty cc = 0;
295*0957b409SSimon J. Gerraty
296*0957b409SSimon J. Gerraty /*
297*0957b409SSimon J. Gerraty * We use the VSX instructions for loading and storing the
298*0957b409SSimon J. Gerraty * key/subkeys, since they support unaligned accesses. The rest
299*0957b409SSimon J. Gerraty * of the computation is VMX only. VMX register 0 is VSX
300*0957b409SSimon J. Gerraty * register 32.
301*0957b409SSimon J. Gerraty */
302*0957b409SSimon J. Gerraty asm volatile (
303*0957b409SSimon J. Gerraty
304*0957b409SSimon J. Gerraty /*
305*0957b409SSimon J. Gerraty * v0 = all-zero word
306*0957b409SSimon J. Gerraty * v1 = constant -8 / +8, copied into four words
307*0957b409SSimon J. Gerraty * v2, v3 = current subkey
308*0957b409SSimon J. Gerraty * v6 = Rcon (x4 words) (already shifted on big-endian)
309*0957b409SSimon J. Gerraty * v7 = constant 8, copied into four words
310*0957b409SSimon J. Gerraty * v8 = constant for byteswapping words
311*0957b409SSimon J. Gerraty *
312*0957b409SSimon J. Gerraty * The left two words of v3 are ignored.
313*0957b409SSimon J. Gerraty */
314*0957b409SSimon J. Gerraty vspltisw(0, 0)
315*0957b409SSimon J. Gerraty #if BR_POWER8_LE
316*0957b409SSimon J. Gerraty vspltisw(1, -8)
317*0957b409SSimon J. Gerraty #else
318*0957b409SSimon J. Gerraty vspltisw(1, 8)
319*0957b409SSimon J. Gerraty #endif
320*0957b409SSimon J. Gerraty li(%[cc], 16)
321*0957b409SSimon J. Gerraty lxvw4x(34, 0, %[key])
322*0957b409SSimon J. Gerraty lxvw4x(35, %[cc], %[key])
323*0957b409SSimon J. Gerraty vspltisw(6, 1)
324*0957b409SSimon J. Gerraty #if !BR_POWER8_LE
325*0957b409SSimon J. Gerraty vsldoi(6, 6, 0, 3)
326*0957b409SSimon J. Gerraty #endif
327*0957b409SSimon J. Gerraty vspltisw(7, 8)
328*0957b409SSimon J. Gerraty #if BR_POWER8_LE
329*0957b409SSimon J. Gerraty lxvw4x(40, 0, %[idx2be])
330*0957b409SSimon J. Gerraty #endif
331*0957b409SSimon J. Gerraty
332*0957b409SSimon J. Gerraty /*
333*0957b409SSimon J. Gerraty * Loop must run 7 times. Each iteration produces two
334*0957b409SSimon J. Gerraty * subkeys.
335*0957b409SSimon J. Gerraty */
336*0957b409SSimon J. Gerraty li(%[cc], 7)
337*0957b409SSimon J. Gerraty mtctr(%[cc])
338*0957b409SSimon J. Gerraty li(%[cc], 16)
339*0957b409SSimon J. Gerraty label(loop)
340*0957b409SSimon J. Gerraty
341*0957b409SSimon J. Gerraty /*
342*0957b409SSimon J. Gerraty * Current words are in v2:v3. Compute next word in v4.
343*0957b409SSimon J. Gerraty */
344*0957b409SSimon J. Gerraty vrlw(10, 3, 1)
345*0957b409SSimon J. Gerraty vsbox(10, 10)
346*0957b409SSimon J. Gerraty vxor(10, 10, 6)
347*0957b409SSimon J. Gerraty vspltw(10, 10, 3)
348*0957b409SSimon J. Gerraty
349*0957b409SSimon J. Gerraty vsldoi(4, 0, 2, 12)
350*0957b409SSimon J. Gerraty vxor(4, 2, 4)
351*0957b409SSimon J. Gerraty vsldoi(5, 0, 4, 12)
352*0957b409SSimon J. Gerraty vxor(4, 4, 5)
353*0957b409SSimon J. Gerraty vsldoi(5, 0, 4, 12)
354*0957b409SSimon J. Gerraty vxor(4, 4, 5)
355*0957b409SSimon J. Gerraty vxor(4, 4, 10)
356*0957b409SSimon J. Gerraty
357*0957b409SSimon J. Gerraty /*
358*0957b409SSimon J. Gerraty * Then other word in v5.
359*0957b409SSimon J. Gerraty */
360*0957b409SSimon J. Gerraty vsbox(10, 4)
361*0957b409SSimon J. Gerraty vspltw(10, 10, 3)
362*0957b409SSimon J. Gerraty
363*0957b409SSimon J. Gerraty vsldoi(5, 0, 3, 12)
364*0957b409SSimon J. Gerraty vxor(5, 3, 5)
365*0957b409SSimon J. Gerraty vsldoi(11, 0, 5, 12)
366*0957b409SSimon J. Gerraty vxor(5, 5, 11)
367*0957b409SSimon J. Gerraty vsldoi(11, 0, 5, 12)
368*0957b409SSimon J. Gerraty vxor(5, 5, 11)
369*0957b409SSimon J. Gerraty vxor(5, 5, 10)
370*0957b409SSimon J. Gerraty
371*0957b409SSimon J. Gerraty /*
372*0957b409SSimon J. Gerraty * Update Rcon. Since for a 256-bit key, we use only 7
373*0957b409SSimon J. Gerraty * such constants, we will not hit the field modulus,
374*0957b409SSimon J. Gerraty * so a simple shift (addition) works well.
375*0957b409SSimon J. Gerraty */
376*0957b409SSimon J. Gerraty vadduwm(6, 6, 6)
377*0957b409SSimon J. Gerraty
378*0957b409SSimon J. Gerraty /*
379*0957b409SSimon J. Gerraty * Write out the two left 128-bit words
380*0957b409SSimon J. Gerraty */
381*0957b409SSimon J. Gerraty #if BR_POWER8_LE
382*0957b409SSimon J. Gerraty vperm(10, 2, 2, 8)
383*0957b409SSimon J. Gerraty vperm(11, 3, 3, 8)
384*0957b409SSimon J. Gerraty stxvw4x(42, 0, %[sk])
385*0957b409SSimon J. Gerraty stxvw4x(43, %[cc], %[sk])
386*0957b409SSimon J. Gerraty #else
387*0957b409SSimon J. Gerraty stxvw4x(34, 0, %[sk])
388*0957b409SSimon J. Gerraty stxvw4x(35, %[cc], %[sk])
389*0957b409SSimon J. Gerraty #endif
390*0957b409SSimon J. Gerraty addi(%[sk], %[sk], 32)
391*0957b409SSimon J. Gerraty
392*0957b409SSimon J. Gerraty /*
393*0957b409SSimon J. Gerraty * Replace v2:v3 with v4:v5.
394*0957b409SSimon J. Gerraty */
395*0957b409SSimon J. Gerraty vxor(2, 0, 4)
396*0957b409SSimon J. Gerraty vxor(3, 0, 5)
397*0957b409SSimon J. Gerraty
398*0957b409SSimon J. Gerraty bdnz(loop)
399*0957b409SSimon J. Gerraty
400*0957b409SSimon J. Gerraty /*
401*0957b409SSimon J. Gerraty * The loop wrote the first 14 subkeys, but we need 15,
402*0957b409SSimon J. Gerraty * so we must do an extra write.
403*0957b409SSimon J. Gerraty */
404*0957b409SSimon J. Gerraty #if BR_POWER8_LE
405*0957b409SSimon J. Gerraty vperm(10, 2, 2, 8)
406*0957b409SSimon J. Gerraty stxvw4x(42, 0, %[sk])
407*0957b409SSimon J. Gerraty #else
408*0957b409SSimon J. Gerraty stxvw4x(34, 0, %[sk])
409*0957b409SSimon J. Gerraty #endif
410*0957b409SSimon J. Gerraty
411*0957b409SSimon J. Gerraty : [sk] "+b" (sk), [cc] "+b" (cc)
412*0957b409SSimon J. Gerraty : [key] "b" (key)
413*0957b409SSimon J. Gerraty #if BR_POWER8_LE
414*0957b409SSimon J. Gerraty , [idx2be] "b" (idx2be)
415*0957b409SSimon J. Gerraty #endif
416*0957b409SSimon J. Gerraty : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
417*0957b409SSimon J. Gerraty "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory"
418*0957b409SSimon J. Gerraty );
419*0957b409SSimon J. Gerraty }
420*0957b409SSimon J. Gerraty
/*
 * see inner.h
 *
 * Always returns 1. This definition sits inside the file's
 * "#if BR_POWER8" section, so it exists only in builds where that
 * macro is non-zero.
 */
int
br_aes_pwr8_supported(void)
{
	return 1;
}
427*0957b409SSimon J. Gerraty
/*
 * see inner.h
 *
 * Run the key schedule matching the key length and report the value
 * associated with that key size.
 *
 *   sk    destination buffer for the subkeys; the selected schedule
 *         writes 176 (len 16), 208 (len 24) or 240 (otherwise) bytes
 *   key   source key
 *   len   key length in bytes
 *
 * Returns 10 for a 16-byte key, 12 for a 24-byte key and 14 otherwise.
 *
 * Note: len is not validated. Any value other than 16 or 24 selects
 * the 256-bit schedule, which reads 32 bytes from key; callers must
 * pass only 16, 24 or 32.
 */
unsigned
br_aes_pwr8_keysched(unsigned char *sk, const void *key, size_t len)
{
	switch (len) {
	case 16:
		key_schedule_128(sk, key);
		return 10;
	case 24:
		key_schedule_192(sk, key);
		return 12;
	default:
		key_schedule_256(sk, key);
		return 14;
	}
}
444*0957b409SSimon J. Gerraty
445*0957b409SSimon J. Gerraty #endif
446