1 /*
2 * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining
5 * a copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sublicense, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
11 *
12 * The above copyright notice and this permission notice shall be
13 * included in all copies or substantial portions of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #define BR_POWER_ASM_MACROS 1
26 #include "inner.h"
27
28 #if BR_POWER8
29
30 /* see bearssl_block.h */
31 const br_block_ctrcbc_class *
br_aes_pwr8_ctrcbc_get_vtable(void)32 br_aes_pwr8_ctrcbc_get_vtable(void)
33 {
34 return br_aes_pwr8_supported() ? &br_aes_pwr8_ctrcbc_vtable : NULL;
35 }
36
37 /* see bearssl_block.h */
38 void
br_aes_pwr8_ctrcbc_init(br_aes_pwr8_ctrcbc_keys * ctx,const void * key,size_t len)39 br_aes_pwr8_ctrcbc_init(br_aes_pwr8_ctrcbc_keys *ctx,
40 const void *key, size_t len)
41 {
42 ctx->vtable = &br_aes_pwr8_ctrcbc_vtable;
43 ctx->num_rounds = br_aes_pwr8_keysched(ctx->skey.skni, key, len);
44 }
45
/*
 * Register conventions for CTR + CBC-MAC:
 *
 *   AES subkeys are in registers v0 to v10/v12/v14 (depending on key size)
 *   Register v15 contains the byteswap index register (little-endian only)
 *   Register v16 contains the CTR counter value
 *   Register v17 contains the CBC-MAC current value
 *   Registers v18 to v27 are scratch
 *   Counter increment uses v28, v29 and v30
 *
 * For CTR alone:
 *
 *   AES subkeys are in registers v0 to v10/v12/v14 (depending on key size)
 *   Register v15 contains the byteswap index register (little-endian only)
 *   Registers v16 to v19 contain the CTR counter values (four blocks)
 *   Registers v20 to v27 are scratch
 *   Counter increment uses v28, v29 and v30
 */
64
/*
 * Load the 11 AES-128 subkeys into v0..v10 (VSX registers 32..42).
 * Each subkey is read with index %[cc] and base %[sk]; %[cc] is advanced
 * by 16 between two loads, so it ends up 160 above its value on entry.
 * Users of this macro reset %[cc] afterwards.
 */
#define LOAD_SUBKEYS_128 \
		lxvw4x(32, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(33, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(34, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(35, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(36, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(37, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(38, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(39, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(40, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(41, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(42, %[cc], %[sk])
87
/*
 * Load the 13 AES-192 subkeys into v0..v12: the 11 subkeys handled by
 * LOAD_SUBKEYS_128, then two more into VSX registers 43 and 44,
 * continuing to advance %[cc] by 16 before each extra load.
 */
#define LOAD_SUBKEYS_192 \
		LOAD_SUBKEYS_128 \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(43, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(44, %[cc], %[sk])
94
/*
 * Load the 15 AES-256 subkeys into v0..v14: the 13 subkeys handled by
 * LOAD_SUBKEYS_192, then two more into VSX registers 45 and 46,
 * continuing to advance %[cc] by 16 before each extra load.
 */
#define LOAD_SUBKEYS_256 \
		LOAD_SUBKEYS_192 \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(45, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(46, %[cc], %[sk])
101
/*
 * Encrypt, in place, the block held in vector register x with the
 * AES-128 subkeys in v0..v10: XOR with v0, nine vcipher steps with
 * v1..v9, then vcipherlast with v10.
 */
#define BLOCK_ENCRYPT_128(x) \
		vxor(x, x, 0)          \
		vcipher(x, x, 1)       \
		vcipher(x, x, 2)       \
		vcipher(x, x, 3)       \
		vcipher(x, x, 4)       \
		vcipher(x, x, 5)       \
		vcipher(x, x, 6)       \
		vcipher(x, x, 7)       \
		vcipher(x, x, 8)       \
		vcipher(x, x, 9)       \
		vcipherlast(x, x, 10)
114
/*
 * Encrypt, in place, the block held in vector register x with the
 * AES-192 subkeys in v0..v12: XOR with v0, eleven vcipher steps with
 * v1..v11, then vcipherlast with v12.
 */
#define BLOCK_ENCRYPT_192(x) \
		vxor(x, x, 0)          \
		vcipher(x, x, 1)       \
		vcipher(x, x, 2)       \
		vcipher(x, x, 3)       \
		vcipher(x, x, 4)       \
		vcipher(x, x, 5)       \
		vcipher(x, x, 6)       \
		vcipher(x, x, 7)       \
		vcipher(x, x, 8)       \
		vcipher(x, x, 9)       \
		vcipher(x, x, 10)      \
		vcipher(x, x, 11)      \
		vcipherlast(x, x, 12)
129
/*
 * Encrypt, in place, the block held in vector register x with the
 * AES-256 subkeys in v0..v14: XOR with v0, thirteen vcipher steps with
 * v1..v13, then vcipherlast with v14.
 */
#define BLOCK_ENCRYPT_256(x) \
		vxor(x, x, 0)          \
		vcipher(x, x, 1)       \
		vcipher(x, x, 2)       \
		vcipher(x, x, 3)       \
		vcipher(x, x, 4)       \
		vcipher(x, x, 5)       \
		vcipher(x, x, 6)       \
		vcipher(x, x, 7)       \
		vcipher(x, x, 8)       \
		vcipher(x, x, 9)       \
		vcipher(x, x, 10)      \
		vcipher(x, x, 11)      \
		vcipher(x, x, 12)      \
		vcipher(x, x, 13)      \
		vcipherlast(x, x, 14)
146
/*
 * Two-block variant of BLOCK_ENCRYPT_128: encrypts the blocks in vector
 * registers x and y in place, with the same subkeys v0..v10. The
 * instructions for the two independent blocks are interleaved step by
 * step.
 */
#define BLOCK_ENCRYPT_X2_128(x, y) \
		vxor(x, x, 0)          \
		vxor(y, y, 0)          \
		vcipher(x, x, 1)       \
		vcipher(y, y, 1)       \
		vcipher(x, x, 2)       \
		vcipher(y, y, 2)       \
		vcipher(x, x, 3)       \
		vcipher(y, y, 3)       \
		vcipher(x, x, 4)       \
		vcipher(y, y, 4)       \
		vcipher(x, x, 5)       \
		vcipher(y, y, 5)       \
		vcipher(x, x, 6)       \
		vcipher(y, y, 6)       \
		vcipher(x, x, 7)       \
		vcipher(y, y, 7)       \
		vcipher(x, x, 8)       \
		vcipher(y, y, 8)       \
		vcipher(x, x, 9)       \
		vcipher(y, y, 9)       \
		vcipherlast(x, x, 10)  \
		vcipherlast(y, y, 10)
170
/*
 * Two-block variant of BLOCK_ENCRYPT_192: encrypts the blocks in vector
 * registers x and y in place, with the same subkeys v0..v12. The
 * instructions for the two independent blocks are interleaved step by
 * step.
 */
#define BLOCK_ENCRYPT_X2_192(x, y) \
		vxor(x, x, 0)          \
		vxor(y, y, 0)          \
		vcipher(x, x, 1)       \
		vcipher(y, y, 1)       \
		vcipher(x, x, 2)       \
		vcipher(y, y, 2)       \
		vcipher(x, x, 3)       \
		vcipher(y, y, 3)       \
		vcipher(x, x, 4)       \
		vcipher(y, y, 4)       \
		vcipher(x, x, 5)       \
		vcipher(y, y, 5)       \
		vcipher(x, x, 6)       \
		vcipher(y, y, 6)       \
		vcipher(x, x, 7)       \
		vcipher(y, y, 7)       \
		vcipher(x, x, 8)       \
		vcipher(y, y, 8)       \
		vcipher(x, x, 9)       \
		vcipher(y, y, 9)       \
		vcipher(x, x, 10)      \
		vcipher(y, y, 10)      \
		vcipher(x, x, 11)      \
		vcipher(y, y, 11)      \
		vcipherlast(x, x, 12)  \
		vcipherlast(y, y, 12)
198
/*
 * Two-block variant of BLOCK_ENCRYPT_256: encrypts the blocks in vector
 * registers x and y in place, with the same subkeys v0..v14. The
 * instructions for the two independent blocks are interleaved step by
 * step.
 */
#define BLOCK_ENCRYPT_X2_256(x, y) \
		vxor(x, x, 0)          \
		vxor(y, y, 0)          \
		vcipher(x, x, 1)       \
		vcipher(y, y, 1)       \
		vcipher(x, x, 2)       \
		vcipher(y, y, 2)       \
		vcipher(x, x, 3)       \
		vcipher(y, y, 3)       \
		vcipher(x, x, 4)       \
		vcipher(y, y, 4)       \
		vcipher(x, x, 5)       \
		vcipher(y, y, 5)       \
		vcipher(x, x, 6)       \
		vcipher(y, y, 6)       \
		vcipher(x, x, 7)       \
		vcipher(y, y, 7)       \
		vcipher(x, x, 8)       \
		vcipher(y, y, 8)       \
		vcipher(x, x, 9)       \
		vcipher(y, y, 9)       \
		vcipher(x, x, 10)      \
		vcipher(y, y, 10)      \
		vcipher(x, x, 11)      \
		vcipher(y, y, 11)      \
		vcipher(x, x, 12)      \
		vcipher(y, y, 12)      \
		vcipher(x, x, 13)      \
		vcipher(y, y, 13)      \
		vcipherlast(x, x, 14)  \
		vcipherlast(y, y, 14)
230
/*
 * Four-block variant of BLOCK_ENCRYPT_128: encrypts the blocks in vector
 * registers x0..x3 in place, with the same subkeys v0..v10. The
 * instructions for the four independent blocks are interleaved step by
 * step.
 */
#define BLOCK_ENCRYPT_X4_128(x0, x1, x2, x3) \
		vxor(x0, x0, 0)          \
		vxor(x1, x1, 0)          \
		vxor(x2, x2, 0)          \
		vxor(x3, x3, 0)          \
		vcipher(x0, x0, 1)       \
		vcipher(x1, x1, 1)       \
		vcipher(x2, x2, 1)       \
		vcipher(x3, x3, 1)       \
		vcipher(x0, x0, 2)       \
		vcipher(x1, x1, 2)       \
		vcipher(x2, x2, 2)       \
		vcipher(x3, x3, 2)       \
		vcipher(x0, x0, 3)       \
		vcipher(x1, x1, 3)       \
		vcipher(x2, x2, 3)       \
		vcipher(x3, x3, 3)       \
		vcipher(x0, x0, 4)       \
		vcipher(x1, x1, 4)       \
		vcipher(x2, x2, 4)       \
		vcipher(x3, x3, 4)       \
		vcipher(x0, x0, 5)       \
		vcipher(x1, x1, 5)       \
		vcipher(x2, x2, 5)       \
		vcipher(x3, x3, 5)       \
		vcipher(x0, x0, 6)       \
		vcipher(x1, x1, 6)       \
		vcipher(x2, x2, 6)       \
		vcipher(x3, x3, 6)       \
		vcipher(x0, x0, 7)       \
		vcipher(x1, x1, 7)       \
		vcipher(x2, x2, 7)       \
		vcipher(x3, x3, 7)       \
		vcipher(x0, x0, 8)       \
		vcipher(x1, x1, 8)       \
		vcipher(x2, x2, 8)       \
		vcipher(x3, x3, 8)       \
		vcipher(x0, x0, 9)       \
		vcipher(x1, x1, 9)       \
		vcipher(x2, x2, 9)       \
		vcipher(x3, x3, 9)       \
		vcipherlast(x0, x0, 10)  \
		vcipherlast(x1, x1, 10)  \
		vcipherlast(x2, x2, 10)  \
		vcipherlast(x3, x3, 10)
276
/*
 * Four-block variant of BLOCK_ENCRYPT_192: encrypts the blocks in vector
 * registers x0..x3 in place, with the same subkeys v0..v12. The
 * instructions for the four independent blocks are interleaved step by
 * step.
 */
#define BLOCK_ENCRYPT_X4_192(x0, x1, x2, x3) \
		vxor(x0, x0, 0)          \
		vxor(x1, x1, 0)          \
		vxor(x2, x2, 0)          \
		vxor(x3, x3, 0)          \
		vcipher(x0, x0, 1)       \
		vcipher(x1, x1, 1)       \
		vcipher(x2, x2, 1)       \
		vcipher(x3, x3, 1)       \
		vcipher(x0, x0, 2)       \
		vcipher(x1, x1, 2)       \
		vcipher(x2, x2, 2)       \
		vcipher(x3, x3, 2)       \
		vcipher(x0, x0, 3)       \
		vcipher(x1, x1, 3)       \
		vcipher(x2, x2, 3)       \
		vcipher(x3, x3, 3)       \
		vcipher(x0, x0, 4)       \
		vcipher(x1, x1, 4)       \
		vcipher(x2, x2, 4)       \
		vcipher(x3, x3, 4)       \
		vcipher(x0, x0, 5)       \
		vcipher(x1, x1, 5)       \
		vcipher(x2, x2, 5)       \
		vcipher(x3, x3, 5)       \
		vcipher(x0, x0, 6)       \
		vcipher(x1, x1, 6)       \
		vcipher(x2, x2, 6)       \
		vcipher(x3, x3, 6)       \
		vcipher(x0, x0, 7)       \
		vcipher(x1, x1, 7)       \
		vcipher(x2, x2, 7)       \
		vcipher(x3, x3, 7)       \
		vcipher(x0, x0, 8)       \
		vcipher(x1, x1, 8)       \
		vcipher(x2, x2, 8)       \
		vcipher(x3, x3, 8)       \
		vcipher(x0, x0, 9)       \
		vcipher(x1, x1, 9)       \
		vcipher(x2, x2, 9)       \
		vcipher(x3, x3, 9)       \
		vcipher(x0, x0, 10)      \
		vcipher(x1, x1, 10)      \
		vcipher(x2, x2, 10)      \
		vcipher(x3, x3, 10)      \
		vcipher(x0, x0, 11)      \
		vcipher(x1, x1, 11)      \
		vcipher(x2, x2, 11)      \
		vcipher(x3, x3, 11)      \
		vcipherlast(x0, x0, 12)  \
		vcipherlast(x1, x1, 12)  \
		vcipherlast(x2, x2, 12)  \
		vcipherlast(x3, x3, 12)
330
/*
 * Four-block variant of BLOCK_ENCRYPT_256: encrypts the blocks in vector
 * registers x0..x3 in place, with the same subkeys v0..v14. The
 * instructions for the four independent blocks are interleaved step by
 * step.
 */
#define BLOCK_ENCRYPT_X4_256(x0, x1, x2, x3) \
		vxor(x0, x0, 0)          \
		vxor(x1, x1, 0)          \
		vxor(x2, x2, 0)          \
		vxor(x3, x3, 0)          \
		vcipher(x0, x0, 1)       \
		vcipher(x1, x1, 1)       \
		vcipher(x2, x2, 1)       \
		vcipher(x3, x3, 1)       \
		vcipher(x0, x0, 2)       \
		vcipher(x1, x1, 2)       \
		vcipher(x2, x2, 2)       \
		vcipher(x3, x3, 2)       \
		vcipher(x0, x0, 3)       \
		vcipher(x1, x1, 3)       \
		vcipher(x2, x2, 3)       \
		vcipher(x3, x3, 3)       \
		vcipher(x0, x0, 4)       \
		vcipher(x1, x1, 4)       \
		vcipher(x2, x2, 4)       \
		vcipher(x3, x3, 4)       \
		vcipher(x0, x0, 5)       \
		vcipher(x1, x1, 5)       \
		vcipher(x2, x2, 5)       \
		vcipher(x3, x3, 5)       \
		vcipher(x0, x0, 6)       \
		vcipher(x1, x1, 6)       \
		vcipher(x2, x2, 6)       \
		vcipher(x3, x3, 6)       \
		vcipher(x0, x0, 7)       \
		vcipher(x1, x1, 7)       \
		vcipher(x2, x2, 7)       \
		vcipher(x3, x3, 7)       \
		vcipher(x0, x0, 8)       \
		vcipher(x1, x1, 8)       \
		vcipher(x2, x2, 8)       \
		vcipher(x3, x3, 8)       \
		vcipher(x0, x0, 9)       \
		vcipher(x1, x1, 9)       \
		vcipher(x2, x2, 9)       \
		vcipher(x3, x3, 9)       \
		vcipher(x0, x0, 10)      \
		vcipher(x1, x1, 10)      \
		vcipher(x2, x2, 10)      \
		vcipher(x3, x3, 10)      \
		vcipher(x0, x0, 11)      \
		vcipher(x1, x1, 11)      \
		vcipher(x2, x2, 11)      \
		vcipher(x3, x3, 11)      \
		vcipher(x0, x0, 12)      \
		vcipher(x1, x1, 12)      \
		vcipher(x2, x2, 12)      \
		vcipher(x3, x3, 12)      \
		vcipher(x0, x0, 13)      \
		vcipher(x1, x1, 13)      \
		vcipher(x2, x2, 13)      \
		vcipher(x3, x3, 13)      \
		vcipherlast(x0, x0, 14)  \
		vcipherlast(x1, x1, 14)  \
		vcipherlast(x2, x2, 14)  \
		vcipherlast(x3, x3, 14)
392
/*
 * Byte-order handling for data moved with lxvw4x/stxvw4x.
 *
 * Little-endian builds (BR_POWER8_LE):
 *   BYTESWAP_INIT    loads the idx2be permutation index into v15 (VSX 47)
 *   BYTESWAP(x)      permutes x in place with vperm and v15
 *   BYTESWAPX(d, s)  same permutation, from s into d (s is preserved)
 *   BYTESWAP_REG     adds idx2be to the input operands of the asm block
 *
 * Other builds: no permutation is applied; BYTESWAP_INIT, BYTESWAP and
 * BYTESWAP_REG expand to nothing, and BYTESWAPX(d, s) is a plain register
 * copy (vand of s with itself).
 */
#if BR_POWER8_LE
static const uint32_t idx2be[] = {
	0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
};
#define BYTESWAP_INIT     lxvw4x(47, 0, %[idx2be])
#define BYTESWAP(x)       vperm(x, x, x, 15)
#define BYTESWAPX(d, s)   vperm(d, s, s, 15)
#define BYTESWAP_REG      , [idx2be] "b" (idx2be)
#else
#define BYTESWAP_INIT
#define BYTESWAP(x)
#define BYTESWAPX(d, s)   vand(d, s, s)
#define BYTESWAP_REG
#endif
407
/*
 * Counter increment constants, as four 32-bit words: +1 for the
 * single-counter routines, +4 for the four-counter CTR routine (each of
 * its four counters advances by 4 per iteration).
 */
static const uint32_t ctrinc[] = {
	0, 0, 0, 1
};
static const uint32_t ctrinc_x4[] = {
	0, 0, 0, 4
};

/*
 * Load the increment into v28 (VSX 60). Exactly one of these is used per
 * asm block; the matching array must be in the block's input operands.
 */
#define INCR_128_INIT      lxvw4x(60, 0, %[ctrinc])
#define INCR_128_X4_INIT   lxvw4x(60, 0, %[ctrinc_x4])

/*
 * Set d to s plus the increment in v28, treating the register as four
 * 32-bit words with carry propagation between words: each vaddcuw
 * captures the per-word carry-out in v29, vsldoi shifts those carries by
 * one word into v30, and vadduwm adds them in. Three propagation steps
 * cover the four words. v29 and v30 are overwritten; s is not modified
 * unless d and s are the same register.
 */
#define INCR_128(d, s) \
		vaddcuw(29, s, 28)      \
		vadduwm(d, s, 28)       \
		vsldoi(30, 29, 29, 4)   \
		vaddcuw(29, d, 30)      \
		vadduwm(d, d, 30)       \
		vsldoi(30, 29, 29, 4)   \
		vaddcuw(29, d, 30)      \
		vadduwm(d, d, 30)       \
		vsldoi(30, 29, 29, 4)   \
		vadduwm(d, d, 30)
427
/*
 * MKCTR(size) defines ctr_<size>(), the four-blocks-at-a-time CTR routine
 * for the given key size (128, 192 or 256).
 *
 *   sk              subkeys, as consumed by LOAD_SUBKEYS_<size>
 *   ctrbuf          64 bytes: four 16-byte counter values, read on entry
 *                   and overwritten with the updated counters on exit
 *   buf             data, processed in place, 64 bytes per iteration
 *   num_blocks_x4   number of 64-byte groups to process
 *
 * Each iteration encrypts the four counters, XORs the result into the
 * next 64 bytes of buf, and advances every counter by 4 (ctrinc_x4).
 *
 * The loop is a bdnz countdown that is entered unconditionally, so
 * num_blocks_x4 is expected to be nonzero; the caller in this file only
 * passes len >> 6 when len >= 64, or 1.
 */
#define MKCTR(size) \
static void \
ctr_ ## size(const unsigned char *sk, \
	unsigned char *ctrbuf, unsigned char *buf, size_t num_blocks_x4) \
{ \
	long cc, cc0, cc1, cc2, cc3; \
 \
	cc = 0; \
	cc0 = 0; \
	cc1 = 16; \
	cc2 = 32; \
	cc3 = 48; \
	asm volatile ( \
 \
		/* \
		 * Load subkeys into v0..v10 (up to v12/v14 for the \
		 * 192-bit/256-bit variants). \
		 */ \
		LOAD_SUBKEYS_ ## size \
		li(%[cc], 0) \
 \
		BYTESWAP_INIT \
		INCR_128_X4_INIT \
 \
		/* \
		 * Load current CTR counters into v16 to v19. \
		 */ \
		lxvw4x(48, %[cc0], %[ctrbuf]) \
		lxvw4x(49, %[cc1], %[ctrbuf]) \
		lxvw4x(50, %[cc2], %[ctrbuf]) \
		lxvw4x(51, %[cc3], %[ctrbuf]) \
		BYTESWAP(16) \
		BYTESWAP(17) \
		BYTESWAP(18) \
		BYTESWAP(19) \
 \
		mtctr(%[num_blocks_x4]) \
 \
	label(loop) \
		/* \
		 * Compute next counter values into v20..v23. \
		 */ \
		INCR_128(20, 16) \
		INCR_128(21, 17) \
		INCR_128(22, 18) \
		INCR_128(23, 19) \
 \
		/* \
		 * Encrypt counter values and XOR into next data blocks \
		 * (loaded into v24..v27). \
		 */ \
		lxvw4x(56, %[cc0], %[buf]) \
		lxvw4x(57, %[cc1], %[buf]) \
		lxvw4x(58, %[cc2], %[buf]) \
		lxvw4x(59, %[cc3], %[buf]) \
		BYTESWAP(24) \
		BYTESWAP(25) \
		BYTESWAP(26) \
		BYTESWAP(27) \
		BLOCK_ENCRYPT_X4_ ## size(16, 17, 18, 19) \
		vxor(16, 16, 24) \
		vxor(17, 17, 25) \
		vxor(18, 18, 26) \
		vxor(19, 19, 27) \
		BYTESWAP(16) \
		BYTESWAP(17) \
		BYTESWAP(18) \
		BYTESWAP(19) \
		stxvw4x(48, %[cc0], %[buf]) \
		stxvw4x(49, %[cc1], %[buf]) \
		stxvw4x(50, %[cc2], %[buf]) \
		stxvw4x(51, %[cc3], %[buf]) \
 \
		/* \
		 * Update counters (copy v20..v23 into v16..v19) and \
		 * data pointer. \
		 */ \
		vand(16, 20, 20) \
		vand(17, 21, 21) \
		vand(18, 22, 22) \
		vand(19, 23, 23) \
		addi(%[buf], %[buf], 64) \
 \
		bdnz(loop) \
 \
		/* \
		 * Write back new counter values. \
		 */ \
		BYTESWAP(16) \
		BYTESWAP(17) \
		BYTESWAP(18) \
		BYTESWAP(19) \
		stxvw4x(48, %[cc0], %[ctrbuf]) \
		stxvw4x(49, %[cc1], %[ctrbuf]) \
		stxvw4x(50, %[cc2], %[ctrbuf]) \
		stxvw4x(51, %[cc3], %[ctrbuf]) \
 \
: [cc] "+b" (cc), [buf] "+b" (buf), \
	[cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3) \
: [sk] "b" (sk), [ctrbuf] "b" (ctrbuf), \
	[num_blocks_x4] "b" (num_blocks_x4), [ctrinc_x4] "b" (ctrinc_x4) \
	BYTESWAP_REG \
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \
  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \
  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \
  "v30", "ctr", "memory" \
	); \
}

/* Instantiate ctr_128(), ctr_192() and ctr_256(). */
MKCTR(128)
MKCTR(192)
MKCTR(256)
537
/*
 * MKCBCMAC(size) defines cbcmac_<size>(), the CBC-MAC routine for the
 * given key size (128, 192 or 256).
 *
 *   sk           subkeys, as consumed by LOAD_SUBKEYS_<size>
 *   cbcmac       16-byte running CBC-MAC value, read on entry and
 *                overwritten with the new value on exit
 *   buf          input data (read only), 16 bytes per iteration
 *   num_blocks   number of 16-byte blocks to process
 *
 * For each block: XOR it into the running value, then encrypt the
 * running value.
 *
 * The loop is a bdnz countdown that is entered unconditionally, so
 * num_blocks is expected to be nonzero.
 */
#define MKCBCMAC(size) \
static void \
cbcmac_ ## size(const unsigned char *sk, \
	unsigned char *cbcmac, const unsigned char *buf, size_t num_blocks) \
{ \
	long cc; \
 \
	cc = 0; \
	asm volatile ( \
 \
		/* \
		 * Load subkeys into v0..v10 (up to v12/v14 for the \
		 * 192-bit/256-bit variants). \
		 */ \
		LOAD_SUBKEYS_ ## size \
		li(%[cc], 0) \
 \
		BYTESWAP_INIT \
 \
		/* \
		 * Load current CBC-MAC value into v16. \
		 */ \
		lxvw4x(48, %[cc], %[cbcmac]) \
		BYTESWAP(16) \
 \
		mtctr(%[num_blocks]) \
 \
	label(loop) \
		/* \
		 * Load next block, XOR into current CBC-MAC value, \
		 * and then encrypt it. \
		 */ \
		lxvw4x(49, %[cc], %[buf]) \
		BYTESWAP(17) \
		vxor(16, 16, 17) \
		BLOCK_ENCRYPT_ ## size(16) \
		addi(%[buf], %[buf], 16) \
 \
		bdnz(loop) \
 \
		/* \
		 * Write back new CBC-MAC value. \
		 */ \
		BYTESWAP(16) \
		stxvw4x(48, %[cc], %[cbcmac]) \
 \
: [cc] "+b" (cc), [buf] "+b" (buf) \
: [sk] "b" (sk), [cbcmac] "b" (cbcmac), [num_blocks] "b" (num_blocks) \
	BYTESWAP_REG \
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \
  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \
  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \
  "v30", "ctr", "memory" \
	); \
}

/* Instantiate cbcmac_128(), cbcmac_192() and cbcmac_256(). */
MKCBCMAC(128)
MKCBCMAC(192)
MKCBCMAC(256)
596
/*
 * MKENCRYPT(size) defines ctrcbc_<size>_encrypt(), the combined CTR
 * encryption + CBC-MAC routine for the given key size (128, 192 or 256).
 * The CBC-MAC is computed over the encrypted blocks.
 *
 *   sk           subkeys, as consumed by LOAD_SUBKEYS_<size>
 *   ctr          16-byte counter, read on entry, updated on exit
 *   cbcmac       16-byte running CBC-MAC value, read on entry, updated
 *                on exit
 *   buf          data, encrypted in place, 16 bytes per iteration
 *   num_blocks   number of 16-byte blocks to process
 *
 * The first block is processed before the loop counter is examined, so
 * num_blocks must be at least 1.
 */
#define MKENCRYPT(size) \
static void \
ctrcbc_ ## size ## _encrypt(const unsigned char *sk, \
	unsigned char *ctr, unsigned char *cbcmac, unsigned char *buf, \
	size_t num_blocks) \
{ \
	long cc; \
 \
	cc = 0; \
	asm volatile ( \
 \
		/* \
		 * Load subkeys into v0..v10 (up to v12/v14 for the \
		 * 192-bit/256-bit variants). \
		 */ \
		LOAD_SUBKEYS_ ## size \
		li(%[cc], 0) \
 \
		BYTESWAP_INIT \
		INCR_128_INIT \
 \
		/* \
		 * Load current CTR counter into v16, and current \
		 * CBC-MAC IV into v17. \
		 */ \
		lxvw4x(48, %[cc], %[ctr]) \
		lxvw4x(49, %[cc], %[cbcmac]) \
		BYTESWAP(16) \
		BYTESWAP(17) \
 \
		/* \
		 * At each iteration, we do two parallel encryptions: \
		 *  - new counter value for encryption of the next block; \
		 *  - CBC-MAC over the previous encrypted block. \
		 * Thus, each plaintext block implies two AES instances, \
		 * over two successive iterations. This requires a single \
		 * counter encryption before the loop, and a single \
		 * CBC-MAC encryption after the loop. \
		 */ \
 \
		/* \
		 * Encrypt first block (into v20). v21 receives the \
		 * byte-swapped copy that is stored back, so that v20 \
		 * stays available for the CBC-MAC; v22 holds the \
		 * incremented counter until it is copied into v16. \
		 */ \
		lxvw4x(52, %[cc], %[buf]) \
		BYTESWAP(20) \
		INCR_128(22, 16) \
		BLOCK_ENCRYPT_ ## size(16) \
		vxor(20, 20, 16) \
		BYTESWAPX(21, 20) \
		stxvw4x(53, %[cc], %[buf]) \
		vand(16, 22, 22) \
		addi(%[buf], %[buf], 16) \
 \
		/* \
		 * Load loop counter; skip the loop if there is only \
		 * one block in total (already handled by the boundary \
		 * conditions). \
		 */ \
		mtctr(%[num_blocks]) \
		bdz(fastexit) \
 \
	label(loop) \
		/* \
		 * Upon loop entry: \
		 *    v16   counter value for next block \
		 *    v17   current CBC-MAC value \
		 *    v20   encrypted previous block \
		 */ \
		vxor(17, 17, 20) \
		INCR_128(22, 16) \
		lxvw4x(52, %[cc], %[buf]) \
		BYTESWAP(20) \
		BLOCK_ENCRYPT_X2_ ## size(16, 17) \
		vxor(20, 20, 16) \
		BYTESWAPX(21, 20) \
		stxvw4x(53, %[cc], %[buf]) \
		addi(%[buf], %[buf], 16) \
		vand(16, 22, 22) \
 \
		bdnz(loop) \
 \
	label(fastexit) \
		/* \
		 * Final CBC-MAC step over the last encrypted block, \
		 * then store back counter and CBC-MAC value. \
		 */ \
		vxor(17, 17, 20) \
		BLOCK_ENCRYPT_ ## size(17) \
		BYTESWAP(16) \
		BYTESWAP(17) \
		stxvw4x(48, %[cc], %[ctr]) \
		stxvw4x(49, %[cc], %[cbcmac]) \
 \
: [cc] "+b" (cc), [buf] "+b" (buf) \
: [sk] "b" (sk), [ctr] "b" (ctr), [cbcmac] "b" (cbcmac), \
	[num_blocks] "b" (num_blocks), [ctrinc] "b" (ctrinc) \
	BYTESWAP_REG \
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \
  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \
  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \
  "v30", "ctr", "memory" \
	); \
}

/* Instantiate ctrcbc_128_encrypt(), ctrcbc_192_encrypt(), ctrcbc_256_encrypt(). */
MKENCRYPT(128)
MKENCRYPT(192)
MKENCRYPT(256)
699
/*
 * MKDECRYPT(size) defines ctrcbc_<size>_decrypt(), the combined CTR
 * decryption + CBC-MAC routine for the given key size (128, 192 or 256).
 * The CBC-MAC is computed over the blocks as loaded, i.e. before the CTR
 * keystream is XORed in.
 *
 *   sk           subkeys, as consumed by LOAD_SUBKEYS_<size>
 *   ctr          16-byte counter, read on entry, updated on exit
 *   cbcmac       16-byte running CBC-MAC value, read on entry, updated
 *                on exit
 *   buf          data, decrypted in place, 16 bytes per iteration
 *   num_blocks   number of 16-byte blocks to process
 *
 * The loop is a bdnz countdown that is entered unconditionally, so
 * num_blocks is expected to be nonzero.
 */
#define MKDECRYPT(size) \
static void \
ctrcbc_ ## size ## _decrypt(const unsigned char *sk, \
	unsigned char *ctr, unsigned char *cbcmac, unsigned char *buf, \
	size_t num_blocks) \
{ \
	long cc; \
 \
	cc = 0; \
	asm volatile ( \
 \
		/* \
		 * Load subkeys into v0..v10 (up to v12/v14 for the \
		 * 192-bit/256-bit variants). \
		 */ \
		LOAD_SUBKEYS_ ## size \
		li(%[cc], 0) \
 \
		BYTESWAP_INIT \
		INCR_128_INIT \
 \
		/* \
		 * Load current CTR counter into v16, and current \
		 * CBC-MAC IV into v17. \
		 */ \
		lxvw4x(48, %[cc], %[ctr]) \
		lxvw4x(49, %[cc], %[cbcmac]) \
		BYTESWAP(16) \
		BYTESWAP(17) \
 \
		/* \
		 * At each iteration, we do two parallel encryptions: \
		 *  - new counter value for decryption of the next block; \
		 *  - CBC-MAC over the next encrypted block. \
		 * Each iteration performs the two AES instances related \
		 * to the current block; there is thus no need for some \
		 * extra pre-loop and post-loop work as in encryption. \
		 */ \
 \
		mtctr(%[num_blocks]) \
 \
	label(loop) \
		/* \
		 * Upon loop entry: \
		 *    v16   counter value for next block \
		 *    v17   current CBC-MAC value \
		 * Within the iteration, v20 is the current block, v21 \
		 * its byte-swapped decrypted copy for the store, and \
		 * v22 the incremented counter. \
		 */ \
		lxvw4x(52, %[cc], %[buf]) \
		BYTESWAP(20) \
		vxor(17, 17, 20) \
		INCR_128(22, 16) \
		BLOCK_ENCRYPT_X2_ ## size(16, 17) \
		vxor(20, 20, 16) \
		BYTESWAPX(21, 20) \
		stxvw4x(53, %[cc], %[buf]) \
		addi(%[buf], %[buf], 16) \
		vand(16, 22, 22) \
 \
		bdnz(loop) \
 \
		/* \
		 * Store back counter and CBC-MAC value. \
		 */ \
		BYTESWAP(16) \
		BYTESWAP(17) \
		stxvw4x(48, %[cc], %[ctr]) \
		stxvw4x(49, %[cc], %[cbcmac]) \
 \
: [cc] "+b" (cc), [buf] "+b" (buf) \
: [sk] "b" (sk), [ctr] "b" (ctr), [cbcmac] "b" (cbcmac), \
	[num_blocks] "b" (num_blocks), [ctrinc] "b" (ctrinc) \
	BYTESWAP_REG \
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \
  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \
  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \
  "v30", "ctr", "memory" \
	); \
}

/* Instantiate ctrcbc_128_decrypt(), ctrcbc_192_decrypt(), ctrcbc_256_decrypt(). */
MKDECRYPT(128)
MKDECRYPT(192)
MKDECRYPT(256)
781
782 /* see bearssl_block.h */
783 void
br_aes_pwr8_ctrcbc_encrypt(const br_aes_pwr8_ctrcbc_keys * ctx,void * ctr,void * cbcmac,void * data,size_t len)784 br_aes_pwr8_ctrcbc_encrypt(const br_aes_pwr8_ctrcbc_keys *ctx,
785 void *ctr, void *cbcmac, void *data, size_t len)
786 {
787 if (len == 0) {
788 return;
789 }
790 switch (ctx->num_rounds) {
791 case 10:
792 ctrcbc_128_encrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
793 break;
794 case 12:
795 ctrcbc_192_encrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
796 break;
797 default:
798 ctrcbc_256_encrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
799 break;
800 }
801 }
802
803 /* see bearssl_block.h */
804 void
br_aes_pwr8_ctrcbc_decrypt(const br_aes_pwr8_ctrcbc_keys * ctx,void * ctr,void * cbcmac,void * data,size_t len)805 br_aes_pwr8_ctrcbc_decrypt(const br_aes_pwr8_ctrcbc_keys *ctx,
806 void *ctr, void *cbcmac, void *data, size_t len)
807 {
808 if (len == 0) {
809 return;
810 }
811 switch (ctx->num_rounds) {
812 case 10:
813 ctrcbc_128_decrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
814 break;
815 case 12:
816 ctrcbc_192_decrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
817 break;
818 default:
819 ctrcbc_256_decrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
820 break;
821 }
822 }
823
/*
 * Write into dst (16 bytes) the value of src (16 bytes) plus one, both
 * being handled as two big-endian 64-bit halves. The carry from the low
 * half into the high half is computed without any conditional branch.
 */
static inline void
incr_ctr(void *dst, const void *src)
{
	const unsigned char *in = src;
	unsigned char *out = dst;
	uint64_t upper, lower, low_nonzero;

	upper = br_dec64be(in);
	lower = br_dec64be(in + 8);
	lower += 1;

	/*
	 * (lower | -lower) has its top bit set exactly when lower != 0;
	 * the low half wrapped around (carry needed) when it is now zero.
	 */
	low_nonzero = (lower | -lower) >> 63;
	upper += low_nonzero ^ (uint64_t)1;

	br_enc64be(out, upper);
	br_enc64be(out + 8, lower);
}
836
837 /* see bearssl_block.h */
838 void
br_aes_pwr8_ctrcbc_ctr(const br_aes_pwr8_ctrcbc_keys * ctx,void * ctr,void * data,size_t len)839 br_aes_pwr8_ctrcbc_ctr(const br_aes_pwr8_ctrcbc_keys *ctx,
840 void *ctr, void *data, size_t len)
841 {
842 unsigned char ctrbuf[64];
843
844 memcpy(ctrbuf, ctr, 16);
845 incr_ctr(ctrbuf + 16, ctrbuf);
846 incr_ctr(ctrbuf + 32, ctrbuf + 16);
847 incr_ctr(ctrbuf + 48, ctrbuf + 32);
848 if (len >= 64) {
849 switch (ctx->num_rounds) {
850 case 10:
851 ctr_128(ctx->skey.skni, ctrbuf, data, len >> 6);
852 break;
853 case 12:
854 ctr_192(ctx->skey.skni, ctrbuf, data, len >> 6);
855 break;
856 default:
857 ctr_256(ctx->skey.skni, ctrbuf, data, len >> 6);
858 break;
859 }
860 data = (unsigned char *)data + (len & ~(size_t)63);
861 len &= 63;
862 }
863 if (len > 0) {
864 unsigned char tmp[64];
865
866 if (len >= 32) {
867 if (len >= 48) {
868 memcpy(ctr, ctrbuf + 48, 16);
869 } else {
870 memcpy(ctr, ctrbuf + 32, 16);
871 }
872 } else {
873 if (len >= 16) {
874 memcpy(ctr, ctrbuf + 16, 16);
875 }
876 }
877 memcpy(tmp, data, len);
878 memset(tmp + len, 0, (sizeof tmp) - len);
879 switch (ctx->num_rounds) {
880 case 10:
881 ctr_128(ctx->skey.skni, ctrbuf, tmp, 1);
882 break;
883 case 12:
884 ctr_192(ctx->skey.skni, ctrbuf, tmp, 1);
885 break;
886 default:
887 ctr_256(ctx->skey.skni, ctrbuf, tmp, 1);
888 break;
889 }
890 memcpy(data, tmp, len);
891 } else {
892 memcpy(ctr, ctrbuf, 16);
893 }
894 }
895
896 /* see bearssl_block.h */
897 void
br_aes_pwr8_ctrcbc_mac(const br_aes_pwr8_ctrcbc_keys * ctx,void * cbcmac,const void * data,size_t len)898 br_aes_pwr8_ctrcbc_mac(const br_aes_pwr8_ctrcbc_keys *ctx,
899 void *cbcmac, const void *data, size_t len)
900 {
901 if (len > 0) {
902 switch (ctx->num_rounds) {
903 case 10:
904 cbcmac_128(ctx->skey.skni, cbcmac, data, len >> 4);
905 break;
906 case 12:
907 cbcmac_192(ctx->skey.skni, cbcmac, data, len >> 4);
908 break;
909 default:
910 cbcmac_256(ctx->skey.skni, cbcmac, data, len >> 4);
911 break;
912 }
913 }
914 }
915
/*
 * see bearssl_block.h
 *
 * Vtable for the POWER8 CTR + CBC-MAC implementation. The initializer is
 * positional: context size, two integer constants, then the init,
 * encrypt, decrypt, ctr and mac entry points. Each function is cast from
 * its concrete signature (taking a br_aes_pwr8_ctrcbc_keys pointer) to
 * the generic one (taking a pointer to the vtable pointer).
 */
const br_block_ctrcbc_class br_aes_pwr8_ctrcbc_vtable = {
	/* Size of the context structure. */
	sizeof(br_aes_pwr8_ctrcbc_keys),
	/*
	 * Presumably the block size in bytes and its base-2 logarithm --
	 * confirm against the br_block_ctrcbc_class declaration.
	 */
	16,
	4,
	(void (*)(const br_block_ctrcbc_class **, const void *, size_t))
		&br_aes_pwr8_ctrcbc_init,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, void *, void *, size_t))
		&br_aes_pwr8_ctrcbc_encrypt,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, void *, void *, size_t))
		&br_aes_pwr8_ctrcbc_decrypt,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, void *, size_t))
		&br_aes_pwr8_ctrcbc_ctr,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, const void *, size_t))
		&br_aes_pwr8_ctrcbc_mac
};
936
937 #else
938
/*
 * see bearssl_block.h
 *
 * Fallback used when BR_POWER8 is not enabled at compile time: this
 * implementation is not compiled in, so no vtable is ever returned.
 */
const br_block_ctrcbc_class *
br_aes_pwr8_ctrcbc_get_vtable(void)
{
	return NULL;
}
945
946 #endif
947