1 /*
2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #define BR_POWER_ASM_MACROS 1
26 #include "inner.h"
27
28 /*
29 * This code contains the AES key schedule implementation using the
30 * POWER8 opcodes.
31 */
32
33 #if BR_POWER8
34
35 static void
key_schedule_128(unsigned char * sk,const unsigned char * key)36 key_schedule_128(unsigned char *sk, const unsigned char *key)
37 {
38 long cc;
39
40 static const uint32_t fmod[] = { 0x11B, 0x11B, 0x11B, 0x11B };
41 #if BR_POWER8_LE
42 static const uint32_t idx2be[] = {
43 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
44 };
45 #endif
46
47 cc = 0;
48
49 /*
50 * We use the VSX instructions for loading and storing the
51 * key/subkeys, since they support unaligned accesses. The rest
52 * of the computation is VMX only. VMX register 0 is VSX
53 * register 32.
54 */
55 asm volatile (
56
57 /*
58 * v0 = all-zero word
59 * v1 = constant -8 / +8, copied into four words
60 * v2 = current subkey
61 * v3 = Rcon (x4 words)
62 * v6 = constant 8, copied into four words
63 * v7 = constant 0x11B, copied into four words
64 * v8 = constant for byteswapping words
65 */
66 vspltisw(0, 0)
67 #if BR_POWER8_LE
68 vspltisw(1, -8)
69 #else
70 vspltisw(1, 8)
71 #endif
72 lxvw4x(34, 0, %[key])
73 vspltisw(3, 1)
74 vspltisw(6, 8)
75 lxvw4x(39, 0, %[fmod])
76 #if BR_POWER8_LE
77 lxvw4x(40, 0, %[idx2be])
78 #endif
79
80 /*
81 * First subkey is a copy of the key itself.
82 */
83 #if BR_POWER8_LE
84 vperm(4, 2, 2, 8)
85 stxvw4x(36, 0, %[sk])
86 #else
87 stxvw4x(34, 0, %[sk])
88 #endif
89
90 /*
91 * Loop must run 10 times.
92 */
93 li(%[cc], 10)
94 mtctr(%[cc])
95 label(loop)
96 /* Increment subkey address */
97 addi(%[sk], %[sk], 16)
98
99 /* Compute SubWord(RotWord(temp)) xor Rcon (into v4, splat) */
100 vrlw(4, 2, 1)
101 vsbox(4, 4)
102 #if BR_POWER8_LE
103 vxor(4, 4, 3)
104 #else
105 vsldoi(5, 3, 0, 3)
106 vxor(4, 4, 5)
107 #endif
108 vspltw(4, 4, 3)
109
110 /* XOR words for next subkey */
111 vsldoi(5, 0, 2, 12)
112 vxor(2, 2, 5)
113 vsldoi(5, 0, 2, 12)
114 vxor(2, 2, 5)
115 vsldoi(5, 0, 2, 12)
116 vxor(2, 2, 5)
117 vxor(2, 2, 4)
118
119 /* Store next subkey */
120 #if BR_POWER8_LE
121 vperm(4, 2, 2, 8)
122 stxvw4x(36, 0, %[sk])
123 #else
124 stxvw4x(34, 0, %[sk])
125 #endif
126
127 /* Update Rcon */
128 vadduwm(3, 3, 3)
129 vsrw(4, 3, 6)
130 vsubuwm(4, 0, 4)
131 vand(4, 4, 7)
132 vxor(3, 3, 4)
133
134 bdnz(loop)
135
136 : [sk] "+b" (sk), [cc] "+b" (cc)
137 : [key] "b" (key), [fmod] "b" (fmod)
138 #if BR_POWER8_LE
139 , [idx2be] "b" (idx2be)
140 #endif
141 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "ctr", "memory"
142 );
143 }
144
145 static void
key_schedule_192(unsigned char * sk,const unsigned char * key)146 key_schedule_192(unsigned char *sk, const unsigned char *key)
147 {
148 long cc;
149
150 #if BR_POWER8_LE
151 static const uint32_t idx2be[] = {
152 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
153 };
154 #endif
155
156 cc = 0;
157
158 /*
159 * We use the VSX instructions for loading and storing the
160 * key/subkeys, since they support unaligned accesses. The rest
161 * of the computation is VMX only. VMX register 0 is VSX
162 * register 32.
163 */
164 asm volatile (
165
166 /*
167 * v0 = all-zero word
168 * v1 = constant -8 / +8, copied into four words
169 * v2, v3 = current subkey
170 * v5 = Rcon (x4 words) (already shifted on big-endian)
171 * v6 = constant 8, copied into four words
172 * v8 = constant for byteswapping words
173 *
174 * The left two words of v3 are ignored.
175 */
176 vspltisw(0, 0)
177 #if BR_POWER8_LE
178 vspltisw(1, -8)
179 #else
180 vspltisw(1, 8)
181 #endif
182 li(%[cc], 8)
183 lxvw4x(34, 0, %[key])
184 lxvw4x(35, %[cc], %[key])
185 vsldoi(3, 3, 0, 8)
186 vspltisw(5, 1)
187 #if !BR_POWER8_LE
188 vsldoi(5, 5, 0, 3)
189 #endif
190 vspltisw(6, 8)
191 #if BR_POWER8_LE
192 lxvw4x(40, 0, %[idx2be])
193 #endif
194
195 /*
196 * Loop must run 8 times. Each iteration produces 256
197 * bits of subkeys, with a 64-bit overlap.
198 */
199 li(%[cc], 8)
200 mtctr(%[cc])
201 li(%[cc], 16)
202 label(loop)
203
204 /*
205 * Last 6 words in v2:v3l. Compute next 6 words into
206 * v3r:v4.
207 */
208 vrlw(10, 3, 1)
209 vsbox(10, 10)
210 vxor(10, 10, 5)
211 vspltw(10, 10, 1)
212 vsldoi(11, 0, 10, 8)
213
214 vsldoi(12, 0, 2, 12)
215 vxor(12, 2, 12)
216 vsldoi(13, 0, 12, 12)
217 vxor(12, 12, 13)
218 vsldoi(13, 0, 12, 12)
219 vxor(12, 12, 13)
220
221 vspltw(13, 12, 3)
222 vxor(13, 13, 3)
223 vsldoi(14, 0, 3, 12)
224 vxor(13, 13, 14)
225
226 vsldoi(4, 12, 13, 8)
227 vsldoi(14, 0, 3, 8)
228 vsldoi(3, 14, 12, 8)
229
230 vxor(3, 3, 11)
231 vxor(4, 4, 10)
232
233 /*
234 * Update Rcon. Since for a 192-bit key, we use only 8
235 * such constants, we will not hit the field modulus,
236 * so a simple shift (addition) works well.
237 */
238 vadduwm(5, 5, 5)
239
240 /*
241 * Write out the two left 128-bit words
242 */
243 #if BR_POWER8_LE
244 vperm(10, 2, 2, 8)
245 vperm(11, 3, 3, 8)
246 stxvw4x(42, 0, %[sk])
247 stxvw4x(43, %[cc], %[sk])
248 #else
249 stxvw4x(34, 0, %[sk])
250 stxvw4x(35, %[cc], %[sk])
251 #endif
252 addi(%[sk], %[sk], 24)
253
254 /*
255 * Shift words for next iteration.
256 */
257 vsldoi(2, 3, 4, 8)
258 vsldoi(3, 4, 0, 8)
259
260 bdnz(loop)
261
262 /*
263 * The loop wrote the first 50 subkey words, but we need
264 * to produce 52, so we must do one last write.
265 */
266 #if BR_POWER8_LE
267 vperm(10, 2, 2, 8)
268 stxvw4x(42, 0, %[sk])
269 #else
270 stxvw4x(34, 0, %[sk])
271 #endif
272
273 : [sk] "+b" (sk), [cc] "+b" (cc)
274 : [key] "b" (key)
275 #if BR_POWER8_LE
276 , [idx2be] "b" (idx2be)
277 #endif
278 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
279 "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory"
280 );
281 }
282
283 static void
key_schedule_256(unsigned char * sk,const unsigned char * key)284 key_schedule_256(unsigned char *sk, const unsigned char *key)
285 {
286 long cc;
287
288 #if BR_POWER8_LE
289 static const uint32_t idx2be[] = {
290 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
291 };
292 #endif
293
294 cc = 0;
295
296 /*
297 * We use the VSX instructions for loading and storing the
298 * key/subkeys, since they support unaligned accesses. The rest
299 * of the computation is VMX only. VMX register 0 is VSX
300 * register 32.
301 */
302 asm volatile (
303
304 /*
305 * v0 = all-zero word
306 * v1 = constant -8 / +8, copied into four words
307 * v2, v3 = current subkey
308 * v6 = Rcon (x4 words) (already shifted on big-endian)
309 * v7 = constant 8, copied into four words
310 * v8 = constant for byteswapping words
311 *
312 * The left two words of v3 are ignored.
313 */
314 vspltisw(0, 0)
315 #if BR_POWER8_LE
316 vspltisw(1, -8)
317 #else
318 vspltisw(1, 8)
319 #endif
320 li(%[cc], 16)
321 lxvw4x(34, 0, %[key])
322 lxvw4x(35, %[cc], %[key])
323 vspltisw(6, 1)
324 #if !BR_POWER8_LE
325 vsldoi(6, 6, 0, 3)
326 #endif
327 vspltisw(7, 8)
328 #if BR_POWER8_LE
329 lxvw4x(40, 0, %[idx2be])
330 #endif
331
332 /*
333 * Loop must run 7 times. Each iteration produces two
334 * subkeys.
335 */
336 li(%[cc], 7)
337 mtctr(%[cc])
338 li(%[cc], 16)
339 label(loop)
340
341 /*
342 * Current words are in v2:v3. Compute next word in v4.
343 */
344 vrlw(10, 3, 1)
345 vsbox(10, 10)
346 vxor(10, 10, 6)
347 vspltw(10, 10, 3)
348
349 vsldoi(4, 0, 2, 12)
350 vxor(4, 2, 4)
351 vsldoi(5, 0, 4, 12)
352 vxor(4, 4, 5)
353 vsldoi(5, 0, 4, 12)
354 vxor(4, 4, 5)
355 vxor(4, 4, 10)
356
357 /*
358 * Then other word in v5.
359 */
360 vsbox(10, 4)
361 vspltw(10, 10, 3)
362
363 vsldoi(5, 0, 3, 12)
364 vxor(5, 3, 5)
365 vsldoi(11, 0, 5, 12)
366 vxor(5, 5, 11)
367 vsldoi(11, 0, 5, 12)
368 vxor(5, 5, 11)
369 vxor(5, 5, 10)
370
371 /*
372 * Update Rcon. Since for a 256-bit key, we use only 7
373 * such constants, we will not hit the field modulus,
374 * so a simple shift (addition) works well.
375 */
376 vadduwm(6, 6, 6)
377
378 /*
379 * Write out the two left 128-bit words
380 */
381 #if BR_POWER8_LE
382 vperm(10, 2, 2, 8)
383 vperm(11, 3, 3, 8)
384 stxvw4x(42, 0, %[sk])
385 stxvw4x(43, %[cc], %[sk])
386 #else
387 stxvw4x(34, 0, %[sk])
388 stxvw4x(35, %[cc], %[sk])
389 #endif
390 addi(%[sk], %[sk], 32)
391
392 /*
393 * Replace v2:v3 with v4:v5.
394 */
395 vxor(2, 0, 4)
396 vxor(3, 0, 5)
397
398 bdnz(loop)
399
400 /*
401 * The loop wrote the first 14 subkeys, but we need 15,
402 * so we must do an extra write.
403 */
404 #if BR_POWER8_LE
405 vperm(10, 2, 2, 8)
406 stxvw4x(42, 0, %[sk])
407 #else
408 stxvw4x(34, 0, %[sk])
409 #endif
410
411 : [sk] "+b" (sk), [cc] "+b" (cc)
412 : [key] "b" (key)
413 #if BR_POWER8_LE
414 , [idx2be] "b" (idx2be)
415 #endif
416 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
417 "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory"
418 );
419 }
420
/*
 * see inner.h
 *
 * Always returns 1: this function is only compiled when BR_POWER8 is
 * enabled, and it performs no runtime detection.
 */
int
br_aes_pwr8_supported(void)
{
	return 1;
}
427
/*
 * see inner.h
 *
 * Run the key schedule matching the key length and write the subkeys
 * into sk[].
 *
 * sk    destination for the expanded subkeys (176, 208 or 240 bytes
 *       are written for 16-, 24- and 32-byte keys respectively).
 * key   the raw key bytes.
 * len   key length in bytes.
 *
 * Returns 10, 12 or 14 (the AES round count for the key size).
 *
 * Any len other than 16 or 24 is handled as a 256-bit key, so 32 bytes
 * are read from key[] in that case; the length is not validated here.
 */
unsigned
br_aes_pwr8_keysched(unsigned char *sk, const void *key, size_t len)
{
	switch (len) {
	case 16:
		key_schedule_128(sk, key);
		return 10;
	case 24:
		key_schedule_192(sk, key);
		return 12;
	default:
		key_schedule_256(sk, key);
		return 14;
	}
}
444
445 #endif
446