xref: /freebsd/contrib/bearssl/src/symcipher/aes_pwr8.c (revision 0e8011faf58b743cc652e3b2ad0f7671227610df)
1 /*
2  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining
5  * a copy of this software and associated documentation files (the
6  * "Software"), to deal in the Software without restriction, including
7  * without limitation the rights to use, copy, modify, merge, publish,
8  * distribute, sublicense, and/or sell copies of the Software, and to
9  * permit persons to whom the Software is furnished to do so, subject to
10  * the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be
13  * included in all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #define BR_POWER_ASM_MACROS   1
26 #include "inner.h"
27 
/*
 * This file contains the AES key schedule implementation using the
 * POWER8 opcodes.
 */
32 
33 #if BR_POWER8
34 
35 static void
36 key_schedule_128(unsigned char *sk, const unsigned char *key)
37 {
38 	long cc;
39 
40 	static const uint32_t fmod[] = { 0x11B, 0x11B, 0x11B, 0x11B };
41 #if BR_POWER8_LE
42 	static const uint32_t idx2be[] = {
43 		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
44 	};
45 #endif
46 
47 	cc = 0;
48 
49 	/*
50 	 * We use the VSX instructions for loading and storing the
51 	 * key/subkeys, since they support unaligned accesses. The rest
52 	 * of the computation is VMX only. VMX register 0 is VSX
53 	 * register 32.
54 	 */
55 	asm volatile (
56 
57 		/*
58 		 * v0 = all-zero word
59 		 * v1 = constant -8 / +8, copied into four words
60 		 * v2 = current subkey
61 		 * v3 = Rcon (x4 words)
62 		 * v6 = constant 8, copied into four words
63 		 * v7 = constant 0x11B, copied into four words
64 		 * v8 = constant for byteswapping words
65 		 */
66 		vspltisw(0, 0)
67 #if BR_POWER8_LE
68 		vspltisw(1, -8)
69 #else
70 		vspltisw(1, 8)
71 #endif
72 		lxvw4x(34, 0, %[key])
73 		vspltisw(3, 1)
74 		vspltisw(6, 8)
75 		lxvw4x(39, 0, %[fmod])
76 #if BR_POWER8_LE
77 		lxvw4x(40, 0, %[idx2be])
78 #endif
79 
80 		/*
81 		 * First subkey is a copy of the key itself.
82 		 */
83 #if BR_POWER8_LE
84 		vperm(4, 2, 2, 8)
85 		stxvw4x(36, 0, %[sk])
86 #else
87 		stxvw4x(34, 0, %[sk])
88 #endif
89 
90 		/*
91 		 * Loop must run 10 times.
92 		 */
93 		li(%[cc], 10)
94 		mtctr(%[cc])
95 	label(loop)
96 		/* Increment subkey address */
97 		addi(%[sk], %[sk], 16)
98 
99 		/* Compute SubWord(RotWord(temp)) xor Rcon  (into v4, splat) */
100 		vrlw(4, 2, 1)
101 		vsbox(4, 4)
102 #if BR_POWER8_LE
103 		vxor(4, 4, 3)
104 #else
105 		vsldoi(5, 3, 0, 3)
106 		vxor(4, 4, 5)
107 #endif
108 		vspltw(4, 4, 3)
109 
110 		/* XOR words for next subkey */
111 		vsldoi(5, 0, 2, 12)
112 		vxor(2, 2, 5)
113 		vsldoi(5, 0, 2, 12)
114 		vxor(2, 2, 5)
115 		vsldoi(5, 0, 2, 12)
116 		vxor(2, 2, 5)
117 		vxor(2, 2, 4)
118 
119 		/* Store next subkey */
120 #if BR_POWER8_LE
121 		vperm(4, 2, 2, 8)
122 		stxvw4x(36, 0, %[sk])
123 #else
124 		stxvw4x(34, 0, %[sk])
125 #endif
126 
127 		/* Update Rcon */
128 		vadduwm(3, 3, 3)
129 		vsrw(4, 3, 6)
130 		vsubuwm(4, 0, 4)
131 		vand(4, 4, 7)
132 		vxor(3, 3, 4)
133 
134 		bdnz(loop)
135 
136 : [sk] "+b" (sk), [cc] "+b" (cc)
137 : [key] "b" (key), [fmod] "b" (fmod)
138 #if BR_POWER8_LE
139 	, [idx2be] "b" (idx2be)
140 #endif
141 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "ctr", "memory"
142 	);
143 }
144 
145 static void
146 key_schedule_192(unsigned char *sk, const unsigned char *key)
147 {
148 	long cc;
149 
150 #if BR_POWER8_LE
151 	static const uint32_t idx2be[] = {
152 		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
153 	};
154 #endif
155 
156 	cc = 0;
157 
158 	/*
159 	 * We use the VSX instructions for loading and storing the
160 	 * key/subkeys, since they support unaligned accesses. The rest
161 	 * of the computation is VMX only. VMX register 0 is VSX
162 	 * register 32.
163 	 */
164 	asm volatile (
165 
166 		/*
167 		 * v0 = all-zero word
168 		 * v1 = constant -8 / +8, copied into four words
169 		 * v2, v3 = current subkey
170 		 * v5 = Rcon (x4 words) (already shifted on big-endian)
171 		 * v6 = constant 8, copied into four words
172 		 * v8 = constant for byteswapping words
173 		 *
174 		 * The left two words of v3 are ignored.
175 		 */
176 		vspltisw(0, 0)
177 #if BR_POWER8_LE
178 		vspltisw(1, -8)
179 #else
180 		vspltisw(1, 8)
181 #endif
182 		li(%[cc], 8)
183 		lxvw4x(34, 0, %[key])
184 		lxvw4x(35, %[cc], %[key])
185 		vsldoi(3, 3, 0, 8)
186 		vspltisw(5, 1)
187 #if !BR_POWER8_LE
188 		vsldoi(5, 5, 0, 3)
189 #endif
190 		vspltisw(6, 8)
191 #if BR_POWER8_LE
192 		lxvw4x(40, 0, %[idx2be])
193 #endif
194 
195 		/*
196 		 * Loop must run 8 times. Each iteration produces 256
197 		 * bits of subkeys, with a 64-bit overlap.
198 		 */
199 		li(%[cc], 8)
200 		mtctr(%[cc])
201 		li(%[cc], 16)
202 	label(loop)
203 
204 		/*
205 		 * Last 6 words in v2:v3l. Compute next 6 words into
206 		 * v3r:v4.
207 		 */
208 		vrlw(10, 3, 1)
209 		vsbox(10, 10)
210 		vxor(10, 10, 5)
211 		vspltw(10, 10, 1)
212 		vsldoi(11, 0, 10, 8)
213 
214 		vsldoi(12, 0, 2, 12)
215 		vxor(12, 2, 12)
216 		vsldoi(13, 0, 12, 12)
217 		vxor(12, 12, 13)
218 		vsldoi(13, 0, 12, 12)
219 		vxor(12, 12, 13)
220 
221 		vspltw(13, 12, 3)
222 		vxor(13, 13, 3)
223 		vsldoi(14, 0, 3, 12)
224 		vxor(13, 13, 14)
225 
226 		vsldoi(4, 12, 13, 8)
227 		vsldoi(14, 0, 3, 8)
228 		vsldoi(3, 14, 12, 8)
229 
230 		vxor(3, 3, 11)
231 		vxor(4, 4, 10)
232 
233 		/*
234 		 * Update Rcon. Since for a 192-bit key, we use only 8
235 		 * such constants, we will not hit the field modulus,
236 		 * so a simple shift (addition) works well.
237 		 */
238 		vadduwm(5, 5, 5)
239 
240 		/*
241 		 * Write out the two left 128-bit words
242 		 */
243 #if BR_POWER8_LE
244 		vperm(10, 2, 2, 8)
245 		vperm(11, 3, 3, 8)
246 		stxvw4x(42, 0, %[sk])
247 		stxvw4x(43, %[cc], %[sk])
248 #else
249 		stxvw4x(34, 0, %[sk])
250 		stxvw4x(35, %[cc], %[sk])
251 #endif
252 		addi(%[sk], %[sk], 24)
253 
254 		/*
255 		 * Shift words for next iteration.
256 		 */
257 		vsldoi(2, 3, 4, 8)
258 		vsldoi(3, 4, 0, 8)
259 
260 		bdnz(loop)
261 
262 		/*
263 		 * The loop wrote the first 50 subkey words, but we need
264 		 * to produce 52, so we must do one last write.
265 		 */
266 #if BR_POWER8_LE
267 		vperm(10, 2, 2, 8)
268 		stxvw4x(42, 0, %[sk])
269 #else
270 		stxvw4x(34, 0, %[sk])
271 #endif
272 
273 : [sk] "+b" (sk), [cc] "+b" (cc)
274 : [key] "b" (key)
275 #if BR_POWER8_LE
276 	, [idx2be] "b" (idx2be)
277 #endif
278 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
279   "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory"
280 	);
281 }
282 
283 static void
284 key_schedule_256(unsigned char *sk, const unsigned char *key)
285 {
286 	long cc;
287 
288 #if BR_POWER8_LE
289 	static const uint32_t idx2be[] = {
290 		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
291 	};
292 #endif
293 
294 	cc = 0;
295 
296 	/*
297 	 * We use the VSX instructions for loading and storing the
298 	 * key/subkeys, since they support unaligned accesses. The rest
299 	 * of the computation is VMX only. VMX register 0 is VSX
300 	 * register 32.
301 	 */
302 	asm volatile (
303 
304 		/*
305 		 * v0 = all-zero word
306 		 * v1 = constant -8 / +8, copied into four words
307 		 * v2, v3 = current subkey
308 		 * v6 = Rcon (x4 words) (already shifted on big-endian)
309 		 * v7 = constant 8, copied into four words
310 		 * v8 = constant for byteswapping words
311 		 *
312 		 * The left two words of v3 are ignored.
313 		 */
314 		vspltisw(0, 0)
315 #if BR_POWER8_LE
316 		vspltisw(1, -8)
317 #else
318 		vspltisw(1, 8)
319 #endif
320 		li(%[cc], 16)
321 		lxvw4x(34, 0, %[key])
322 		lxvw4x(35, %[cc], %[key])
323 		vspltisw(6, 1)
324 #if !BR_POWER8_LE
325 		vsldoi(6, 6, 0, 3)
326 #endif
327 		vspltisw(7, 8)
328 #if BR_POWER8_LE
329 		lxvw4x(40, 0, %[idx2be])
330 #endif
331 
332 		/*
333 		 * Loop must run 7 times. Each iteration produces two
334 		 * subkeys.
335 		 */
336 		li(%[cc], 7)
337 		mtctr(%[cc])
338 		li(%[cc], 16)
339 	label(loop)
340 
341 		/*
342 		 * Current words are in v2:v3. Compute next word in v4.
343 		 */
344 		vrlw(10, 3, 1)
345 		vsbox(10, 10)
346 		vxor(10, 10, 6)
347 		vspltw(10, 10, 3)
348 
349 		vsldoi(4, 0, 2, 12)
350 		vxor(4, 2, 4)
351 		vsldoi(5, 0, 4, 12)
352 		vxor(4, 4, 5)
353 		vsldoi(5, 0, 4, 12)
354 		vxor(4, 4, 5)
355 		vxor(4, 4, 10)
356 
357 		/*
358 		 * Then other word in v5.
359 		 */
360 		vsbox(10, 4)
361 		vspltw(10, 10, 3)
362 
363 		vsldoi(5, 0, 3, 12)
364 		vxor(5, 3, 5)
365 		vsldoi(11, 0, 5, 12)
366 		vxor(5, 5, 11)
367 		vsldoi(11, 0, 5, 12)
368 		vxor(5, 5, 11)
369 		vxor(5, 5, 10)
370 
371 		/*
372 		 * Update Rcon. Since for a 256-bit key, we use only 7
373 		 * such constants, we will not hit the field modulus,
374 		 * so a simple shift (addition) works well.
375 		 */
376 		vadduwm(6, 6, 6)
377 
378 		/*
379 		 * Write out the two left 128-bit words
380 		 */
381 #if BR_POWER8_LE
382 		vperm(10, 2, 2, 8)
383 		vperm(11, 3, 3, 8)
384 		stxvw4x(42, 0, %[sk])
385 		stxvw4x(43, %[cc], %[sk])
386 #else
387 		stxvw4x(34, 0, %[sk])
388 		stxvw4x(35, %[cc], %[sk])
389 #endif
390 		addi(%[sk], %[sk], 32)
391 
392 		/*
393 		 * Replace v2:v3 with v4:v5.
394 		 */
395 		vxor(2, 0, 4)
396 		vxor(3, 0, 5)
397 
398 		bdnz(loop)
399 
400 		/*
401 		 * The loop wrote the first 14 subkeys, but we need 15,
402 		 * so we must do an extra write.
403 		 */
404 #if BR_POWER8_LE
405 		vperm(10, 2, 2, 8)
406 		stxvw4x(42, 0, %[sk])
407 #else
408 		stxvw4x(34, 0, %[sk])
409 #endif
410 
411 : [sk] "+b" (sk), [cc] "+b" (cc)
412 : [key] "b" (key)
413 #if BR_POWER8_LE
414 	, [idx2be] "b" (idx2be)
415 #endif
416 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
417   "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory"
418 	);
419 }
420 
/* see inner.h */
int
br_aes_pwr8_supported(void)
{
	/*
	 * Always reports 1: no runtime detection is performed here.
	 */
	return 1;
}
427 
/* see inner.h */
/*
 * Expand an AES key into subkeys, written at sk.
 *
 * Parameters:
 *   sk    output buffer for the subkeys
 *   key   input key
 *   len   key length in bytes: 16, 24 or 32
 *
 * Returned value is the number of AES rounds (10, 12 or 14). For any
 * other key length, 0 is returned and neither key nor sk is accessed;
 * the 256-bit schedule reads 32 key bytes, so it must not be used as
 * a catch-all for shorter lengths.
 */
unsigned
br_aes_pwr8_keysched(unsigned char *sk, const void *key, size_t len)
{
	switch (len) {
	case 16:
		key_schedule_128(sk, key);
		return 10;
	case 24:
		key_schedule_192(sk, key);
		return 12;
	case 32:
		key_schedule_256(sk, key);
		return 14;
	default:
		/* Unsupported key length. */
		return 0;
	}
}
444 
445 #endif
446