xref: /freebsd/contrib/bearssl/src/symcipher/aes_pwr8_ctrcbc.c (revision 99282790b7d01ec3c4072621d46a0d7302517ad4)
1 /*
2  * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining
5  * a copy of this software and associated documentation files (the
6  * "Software"), to deal in the Software without restriction, including
7  * without limitation the rights to use, copy, modify, merge, publish,
8  * distribute, sublicense, and/or sell copies of the Software, and to
9  * permit persons to whom the Software is furnished to do so, subject to
10  * the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be
13  * included in all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #define BR_POWER_ASM_MACROS   1
26 #include "inner.h"
27 
28 #if BR_POWER8
29 
30 /* see bearssl_block.h */
31 const br_block_ctrcbc_class *
32 br_aes_pwr8_ctrcbc_get_vtable(void)
33 {
34 	return br_aes_pwr8_supported() ? &br_aes_pwr8_ctrcbc_vtable : NULL;
35 }
36 
37 /* see bearssl_block.h */
38 void
39 br_aes_pwr8_ctrcbc_init(br_aes_pwr8_ctrcbc_keys *ctx,
40 	const void *key, size_t len)
41 {
42 	ctx->vtable = &br_aes_pwr8_ctrcbc_vtable;
43 	ctx->num_rounds = br_aes_pwr8_keysched(ctx->skey.skni, key, len);
44 }
45 
46 /*
47  * Register conventions for CTR + CBC-MAC:
48  *
 *   AES subkeys are in registers 0 to 10/12/14 (depending on key size)
50  *   Register v15 contains the byteswap index register (little-endian only)
51  *   Register v16 contains the CTR counter value
52  *   Register v17 contains the CBC-MAC current value
53  *   Registers v18 to v27 are scratch
54  *   Counter increment uses v28, v29 and v30
55  *
56  * For CTR alone:
57  *
 *   AES subkeys are in registers 0 to 10/12/14 (depending on key size)
59  *   Register v15 contains the byteswap index register (little-endian only)
60  *   Registers v16 to v19 contain the CTR counter values (four blocks)
61  *   Registers v20 to v27 are scratch
62  *   Counter increment uses v28, v29 and v30
63  */
64 
/*
 * Load the 11 AES-128 subkeys from the key schedule at %[sk] into
 * v0..v10 (VSX registers 32..42). %[cc] serves as the byte offset and
 * is advanced by 16 between loads. It is left pointing at the last
 * subkey loaded, which is why the users of this macro follow it with
 * li(%[cc], 0).
 */
#define LOAD_SUBKEYS_128 \
		lxvw4x(32, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(33, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(34, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(35, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(36, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(37, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(38, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(39, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(40, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(41, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(42, %[cc], %[sk])
87 
/*
 * AES-192 needs 13 subkeys: everything LOAD_SUBKEYS_128 loads, plus two
 * more into v11 and v12 (VSX registers 43 and 44). %[cc] keeps
 * advancing by 16 per extra subkey.
 */
#define LOAD_SUBKEYS_192 \
		LOAD_SUBKEYS_128 \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(43, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(44, %[cc], %[sk])
94 
/*
 * AES-256 needs 15 subkeys: everything LOAD_SUBKEYS_192 loads, plus two
 * more into v13 and v14 (VSX registers 45 and 46). %[cc] keeps
 * advancing by 16 per extra subkey.
 */
#define LOAD_SUBKEYS_256 \
		LOAD_SUBKEYS_192 \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(45, %[cc], %[sk])   \
		addi(%[cc], %[cc], 16)     \
		lxvw4x(46, %[cc], %[sk])
101 
/*
 * AES-128 encryption of the block held in vector register x, in place:
 * XOR with subkey v0, nine vcipher rounds with v1..v9, then the final
 * vcipherlast round with v10. Subkeys must already be in v0..v10.
 */
#define BLOCK_ENCRYPT_128(x) \
		vxor(x, x, 0) \
		vcipher(x, x, 1) \
		vcipher(x, x, 2) \
		vcipher(x, x, 3) \
		vcipher(x, x, 4) \
		vcipher(x, x, 5) \
		vcipher(x, x, 6) \
		vcipher(x, x, 7) \
		vcipher(x, x, 8) \
		vcipher(x, x, 9) \
		vcipherlast(x, x, 10)
114 
/*
 * AES-192 encryption of the block held in vector register x, in place:
 * XOR with subkey v0, eleven vcipher rounds with v1..v11, then the
 * final vcipherlast round with v12. Subkeys must already be in v0..v12.
 */
#define BLOCK_ENCRYPT_192(x) \
		vxor(x, x, 0) \
		vcipher(x, x, 1) \
		vcipher(x, x, 2) \
		vcipher(x, x, 3) \
		vcipher(x, x, 4) \
		vcipher(x, x, 5) \
		vcipher(x, x, 6) \
		vcipher(x, x, 7) \
		vcipher(x, x, 8) \
		vcipher(x, x, 9) \
		vcipher(x, x, 10) \
		vcipher(x, x, 11) \
		vcipherlast(x, x, 12)
129 
/*
 * AES-256 encryption of the block held in vector register x, in place:
 * XOR with subkey v0, thirteen vcipher rounds with v1..v13, then the
 * final vcipherlast round with v14. Subkeys must already be in v0..v14.
 */
#define BLOCK_ENCRYPT_256(x) \
		vxor(x, x, 0) \
		vcipher(x, x, 1) \
		vcipher(x, x, 2) \
		vcipher(x, x, 3) \
		vcipher(x, x, 4) \
		vcipher(x, x, 5) \
		vcipher(x, x, 6) \
		vcipher(x, x, 7) \
		vcipher(x, x, 8) \
		vcipher(x, x, 9) \
		vcipher(x, x, 10) \
		vcipher(x, x, 11) \
		vcipher(x, x, 12) \
		vcipher(x, x, 13) \
		vcipherlast(x, x, 14)
146 
/*
 * AES-128 encryption of two independent blocks (registers x and y), in
 * place. Same round sequence as BLOCK_ENCRYPT_128, with the instructions
 * for x and y interleaved round by round (presumably so that the two
 * dependency chains can overlap in the pipeline).
 */
#define BLOCK_ENCRYPT_X2_128(x, y) \
		vxor(x, x, 0) \
		vxor(y, y, 0) \
		vcipher(x, x, 1) \
		vcipher(y, y, 1) \
		vcipher(x, x, 2) \
		vcipher(y, y, 2) \
		vcipher(x, x, 3) \
		vcipher(y, y, 3) \
		vcipher(x, x, 4) \
		vcipher(y, y, 4) \
		vcipher(x, x, 5) \
		vcipher(y, y, 5) \
		vcipher(x, x, 6) \
		vcipher(y, y, 6) \
		vcipher(x, x, 7) \
		vcipher(y, y, 7) \
		vcipher(x, x, 8) \
		vcipher(y, y, 8) \
		vcipher(x, x, 9) \
		vcipher(y, y, 9) \
		vcipherlast(x, x, 10) \
		vcipherlast(y, y, 10)
170 
/*
 * AES-192 encryption of two independent blocks (registers x and y), in
 * place. Same round sequence as BLOCK_ENCRYPT_192, with the instructions
 * for x and y interleaved round by round.
 */
#define BLOCK_ENCRYPT_X2_192(x, y) \
		vxor(x, x, 0) \
		vxor(y, y, 0) \
		vcipher(x, x, 1) \
		vcipher(y, y, 1) \
		vcipher(x, x, 2) \
		vcipher(y, y, 2) \
		vcipher(x, x, 3) \
		vcipher(y, y, 3) \
		vcipher(x, x, 4) \
		vcipher(y, y, 4) \
		vcipher(x, x, 5) \
		vcipher(y, y, 5) \
		vcipher(x, x, 6) \
		vcipher(y, y, 6) \
		vcipher(x, x, 7) \
		vcipher(y, y, 7) \
		vcipher(x, x, 8) \
		vcipher(y, y, 8) \
		vcipher(x, x, 9) \
		vcipher(y, y, 9) \
		vcipher(x, x, 10) \
		vcipher(y, y, 10) \
		vcipher(x, x, 11) \
		vcipher(y, y, 11) \
		vcipherlast(x, x, 12) \
		vcipherlast(y, y, 12)
198 
/*
 * AES-256 encryption of two independent blocks (registers x and y), in
 * place. Same round sequence as BLOCK_ENCRYPT_256, with the instructions
 * for x and y interleaved round by round.
 */
#define BLOCK_ENCRYPT_X2_256(x, y) \
		vxor(x, x, 0) \
		vxor(y, y, 0) \
		vcipher(x, x, 1) \
		vcipher(y, y, 1) \
		vcipher(x, x, 2) \
		vcipher(y, y, 2) \
		vcipher(x, x, 3) \
		vcipher(y, y, 3) \
		vcipher(x, x, 4) \
		vcipher(y, y, 4) \
		vcipher(x, x, 5) \
		vcipher(y, y, 5) \
		vcipher(x, x, 6) \
		vcipher(y, y, 6) \
		vcipher(x, x, 7) \
		vcipher(y, y, 7) \
		vcipher(x, x, 8) \
		vcipher(y, y, 8) \
		vcipher(x, x, 9) \
		vcipher(y, y, 9) \
		vcipher(x, x, 10) \
		vcipher(y, y, 10) \
		vcipher(x, x, 11) \
		vcipher(y, y, 11) \
		vcipher(x, x, 12) \
		vcipher(y, y, 12) \
		vcipher(x, x, 13) \
		vcipher(y, y, 13) \
		vcipherlast(x, x, 14) \
		vcipherlast(y, y, 14)
230 
/*
 * AES-128 encryption of four independent blocks (registers x0..x3), in
 * place. Same round sequence as BLOCK_ENCRYPT_128, with the instructions
 * for the four blocks interleaved round by round.
 */
#define BLOCK_ENCRYPT_X4_128(x0, x1, x2, x3) \
		vxor(x0, x0, 0) \
		vxor(x1, x1, 0) \
		vxor(x2, x2, 0) \
		vxor(x3, x3, 0) \
		vcipher(x0, x0, 1) \
		vcipher(x1, x1, 1) \
		vcipher(x2, x2, 1) \
		vcipher(x3, x3, 1) \
		vcipher(x0, x0, 2) \
		vcipher(x1, x1, 2) \
		vcipher(x2, x2, 2) \
		vcipher(x3, x3, 2) \
		vcipher(x0, x0, 3) \
		vcipher(x1, x1, 3) \
		vcipher(x2, x2, 3) \
		vcipher(x3, x3, 3) \
		vcipher(x0, x0, 4) \
		vcipher(x1, x1, 4) \
		vcipher(x2, x2, 4) \
		vcipher(x3, x3, 4) \
		vcipher(x0, x0, 5) \
		vcipher(x1, x1, 5) \
		vcipher(x2, x2, 5) \
		vcipher(x3, x3, 5) \
		vcipher(x0, x0, 6) \
		vcipher(x1, x1, 6) \
		vcipher(x2, x2, 6) \
		vcipher(x3, x3, 6) \
		vcipher(x0, x0, 7) \
		vcipher(x1, x1, 7) \
		vcipher(x2, x2, 7) \
		vcipher(x3, x3, 7) \
		vcipher(x0, x0, 8) \
		vcipher(x1, x1, 8) \
		vcipher(x2, x2, 8) \
		vcipher(x3, x3, 8) \
		vcipher(x0, x0, 9) \
		vcipher(x1, x1, 9) \
		vcipher(x2, x2, 9) \
		vcipher(x3, x3, 9) \
		vcipherlast(x0, x0, 10) \
		vcipherlast(x1, x1, 10) \
		vcipherlast(x2, x2, 10) \
		vcipherlast(x3, x3, 10)
276 
/*
 * AES-192 encryption of four independent blocks (registers x0..x3), in
 * place. Same round sequence as BLOCK_ENCRYPT_192, with the instructions
 * for the four blocks interleaved round by round.
 */
#define BLOCK_ENCRYPT_X4_192(x0, x1, x2, x3) \
		vxor(x0, x0, 0) \
		vxor(x1, x1, 0) \
		vxor(x2, x2, 0) \
		vxor(x3, x3, 0) \
		vcipher(x0, x0, 1) \
		vcipher(x1, x1, 1) \
		vcipher(x2, x2, 1) \
		vcipher(x3, x3, 1) \
		vcipher(x0, x0, 2) \
		vcipher(x1, x1, 2) \
		vcipher(x2, x2, 2) \
		vcipher(x3, x3, 2) \
		vcipher(x0, x0, 3) \
		vcipher(x1, x1, 3) \
		vcipher(x2, x2, 3) \
		vcipher(x3, x3, 3) \
		vcipher(x0, x0, 4) \
		vcipher(x1, x1, 4) \
		vcipher(x2, x2, 4) \
		vcipher(x3, x3, 4) \
		vcipher(x0, x0, 5) \
		vcipher(x1, x1, 5) \
		vcipher(x2, x2, 5) \
		vcipher(x3, x3, 5) \
		vcipher(x0, x0, 6) \
		vcipher(x1, x1, 6) \
		vcipher(x2, x2, 6) \
		vcipher(x3, x3, 6) \
		vcipher(x0, x0, 7) \
		vcipher(x1, x1, 7) \
		vcipher(x2, x2, 7) \
		vcipher(x3, x3, 7) \
		vcipher(x0, x0, 8) \
		vcipher(x1, x1, 8) \
		vcipher(x2, x2, 8) \
		vcipher(x3, x3, 8) \
		vcipher(x0, x0, 9) \
		vcipher(x1, x1, 9) \
		vcipher(x2, x2, 9) \
		vcipher(x3, x3, 9) \
		vcipher(x0, x0, 10) \
		vcipher(x1, x1, 10) \
		vcipher(x2, x2, 10) \
		vcipher(x3, x3, 10) \
		vcipher(x0, x0, 11) \
		vcipher(x1, x1, 11) \
		vcipher(x2, x2, 11) \
		vcipher(x3, x3, 11) \
		vcipherlast(x0, x0, 12) \
		vcipherlast(x1, x1, 12) \
		vcipherlast(x2, x2, 12) \
		vcipherlast(x3, x3, 12)
330 
/*
 * AES-256 encryption of four independent blocks (registers x0..x3), in
 * place. Same round sequence as BLOCK_ENCRYPT_256, with the instructions
 * for the four blocks interleaved round by round.
 */
#define BLOCK_ENCRYPT_X4_256(x0, x1, x2, x3) \
		vxor(x0, x0, 0) \
		vxor(x1, x1, 0) \
		vxor(x2, x2, 0) \
		vxor(x3, x3, 0) \
		vcipher(x0, x0, 1) \
		vcipher(x1, x1, 1) \
		vcipher(x2, x2, 1) \
		vcipher(x3, x3, 1) \
		vcipher(x0, x0, 2) \
		vcipher(x1, x1, 2) \
		vcipher(x2, x2, 2) \
		vcipher(x3, x3, 2) \
		vcipher(x0, x0, 3) \
		vcipher(x1, x1, 3) \
		vcipher(x2, x2, 3) \
		vcipher(x3, x3, 3) \
		vcipher(x0, x0, 4) \
		vcipher(x1, x1, 4) \
		vcipher(x2, x2, 4) \
		vcipher(x3, x3, 4) \
		vcipher(x0, x0, 5) \
		vcipher(x1, x1, 5) \
		vcipher(x2, x2, 5) \
		vcipher(x3, x3, 5) \
		vcipher(x0, x0, 6) \
		vcipher(x1, x1, 6) \
		vcipher(x2, x2, 6) \
		vcipher(x3, x3, 6) \
		vcipher(x0, x0, 7) \
		vcipher(x1, x1, 7) \
		vcipher(x2, x2, 7) \
		vcipher(x3, x3, 7) \
		vcipher(x0, x0, 8) \
		vcipher(x1, x1, 8) \
		vcipher(x2, x2, 8) \
		vcipher(x3, x3, 8) \
		vcipher(x0, x0, 9) \
		vcipher(x1, x1, 9) \
		vcipher(x2, x2, 9) \
		vcipher(x3, x3, 9) \
		vcipher(x0, x0, 10) \
		vcipher(x1, x1, 10) \
		vcipher(x2, x2, 10) \
		vcipher(x3, x3, 10) \
		vcipher(x0, x0, 11) \
		vcipher(x1, x1, 11) \
		vcipher(x2, x2, 11) \
		vcipher(x3, x3, 11) \
		vcipher(x0, x0, 12) \
		vcipher(x1, x1, 12) \
		vcipher(x2, x2, 12) \
		vcipher(x3, x3, 12) \
		vcipher(x0, x0, 13) \
		vcipher(x1, x1, 13) \
		vcipher(x2, x2, 13) \
		vcipher(x3, x3, 13) \
		vcipherlast(x0, x0, 14) \
		vcipherlast(x1, x1, 14) \
		vcipherlast(x2, x2, 14) \
		vcipherlast(x3, x3, 14)
392 
/*
 * Byte-order fix-up macros.
 *
 * With BR_POWER8_LE:
 *   BYTESWAP_INIT    loads idx2be into VSX register 47 (v15);
 *   BYTESWAP(x)      permutes x in place with vperm, using v15 as the
 *                    control vector;
 *   BYTESWAPX(d, s)  writes the permuted value of s into d;
 *   BYTESWAP_REG     appends idx2be as an extra input operand to an asm
 *                    statement (note the leading comma).
 * NOTE(review): per its name, idx2be is presumably the permutation that
 * converts loaded data to big-endian byte order; confirm against the
 * lxvw4x/vperm semantics on little-endian.
 *
 * Otherwise, BYTESWAP_INIT, BYTESWAP and BYTESWAP_REG expand to nothing
 * and BYTESWAPX is a plain register copy (vand of s with itself).
 */
#if BR_POWER8_LE
static const uint32_t idx2be[] = {
	0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
};
#define BYTESWAP_INIT     lxvw4x(47, 0, %[idx2be])
#define BYTESWAP(x)       vperm(x, x, x, 15)
#define BYTESWAPX(d, s)   vperm(d, s, s, 15)
#define BYTESWAP_REG      , [idx2be] "b" (idx2be)
#else
#define BYTESWAP_INIT
#define BYTESWAP(x)
#define BYTESWAPX(d, s)   vand(d, s, s)
#define BYTESWAP_REG
#endif
407 
/*
 * Counter increments, as four 32-bit words of which only the last one
 * is non-zero: +1 for the one-block-at-a-time code, +4 for the
 * four-blocks-at-a-time CTR code. The INIT macros load the selected
 * increment into VSX register 60 (v28), which INCR_128 adds.
 */
static const uint32_t ctrinc[] = {
	0, 0, 0, 1
};
static const uint32_t ctrinc_x4[] = {
	0, 0, 0, 4
};
#define INCR_128_INIT      lxvw4x(60, 0, %[ctrinc])
#define INCR_128_X4_INIT   lxvw4x(60, 0, %[ctrinc_x4])
/*
 * d = s + v28, computed as one 128-bit addition out of 32-bit lanes.
 *
 * vadduwm adds the four words independently; vaddcuw produces the carry
 * out of each word; vsldoi with the same source twice and a shift of 4
 * rotates the carry vector by one word, so each carry lands on the next
 * word up. This is done three times, so a carry out of the last word
 * can ripple all the way to the first one.
 *
 * This relies on v28 being non-zero in its last word only (see ctrinc /
 * ctrinc_x4): at each step at most one word can produce a carry, so the
 * rotation never wraps a carry around.
 *
 * v29 and v30 are scratch; d and s must not be v28, v29 or v30.
 */
#define INCR_128(d, s) \
		vaddcuw(29, s, 28) \
		vadduwm(d, s, 28) \
		vsldoi(30, 29, 29, 4) \
		vaddcuw(29, d, 30) \
		vadduwm(d, d, 30) \
		vsldoi(30, 29, 29, 4) \
		vaddcuw(29, d, 30) \
		vadduwm(d, d, 30) \
		vsldoi(30, 29, 29, 4) \
		vadduwm(d, d, 30)
427 
/*
 * MKCTR(size) defines ctr_<size>(): CTR encryption/decryption, four
 * blocks (64 bytes) per loop iteration.
 *
 *   sk             subkeys, as loaded by LOAD_SUBKEYS_<size>
 *   ctrbuf         four 16-byte counter values (64 bytes); each one is
 *                  advanced by 4 per iteration (ctrinc_x4) and the
 *                  final values are written back
 *   buf            data, processed in place
 *   num_blocks_x4  number of 64-byte groups to process
 *
 * num_blocks_x4 is moved into CTR and only tested by bdnz at the end of
 * the loop body, so it must be at least 1: a zero count would not skip
 * the loop.
 */
#define MKCTR(size) \
static void \
ctr_ ## size(const unsigned char *sk, \
	unsigned char *ctrbuf, unsigned char *buf, size_t num_blocks_x4) \
{ \
	long cc, cc0, cc1, cc2, cc3; \
 \
	cc = 0; \
	cc0 = 0; \
	cc1 = 16; \
	cc2 = 32; \
	cc3 = 48; \
	asm volatile ( \
 \
		/* \
		 * Load subkeys into v0..v10/v12/v14 (depending on key \
		 * size); this leaves cc advanced, hence the reset. \
		 */ \
		LOAD_SUBKEYS_ ## size \
		li(%[cc], 0) \
 \
		BYTESWAP_INIT \
		INCR_128_X4_INIT \
 \
		/* \
		 * Load current CTR counters into v16 to v19. \
		 */ \
		lxvw4x(48, %[cc0], %[ctrbuf]) \
		lxvw4x(49, %[cc1], %[ctrbuf]) \
		lxvw4x(50, %[cc2], %[ctrbuf]) \
		lxvw4x(51, %[cc3], %[ctrbuf]) \
		BYTESWAP(16) \
		BYTESWAP(17) \
		BYTESWAP(18) \
		BYTESWAP(19) \
 \
		mtctr(%[num_blocks_x4]) \
 \
	label(loop) \
		/* \
		 * Compute next counter values into v20..v23. \
		 */ \
		INCR_128(20, 16) \
		INCR_128(21, 17) \
		INCR_128(22, 18) \
		INCR_128(23, 19) \
 \
		/* \
		 * Encrypt counter values and XOR into next data blocks. \
		 */ \
		lxvw4x(56, %[cc0], %[buf]) \
		lxvw4x(57, %[cc1], %[buf]) \
		lxvw4x(58, %[cc2], %[buf]) \
		lxvw4x(59, %[cc3], %[buf]) \
		BYTESWAP(24) \
		BYTESWAP(25) \
		BYTESWAP(26) \
		BYTESWAP(27) \
		BLOCK_ENCRYPT_X4_ ## size(16, 17, 18, 19) \
		vxor(16, 16, 24) \
		vxor(17, 17, 25) \
		vxor(18, 18, 26) \
		vxor(19, 19, 27) \
		BYTESWAP(16) \
		BYTESWAP(17) \
		BYTESWAP(18) \
		BYTESWAP(19) \
		stxvw4x(48, %[cc0], %[buf]) \
		stxvw4x(49, %[cc1], %[buf]) \
		stxvw4x(50, %[cc2], %[buf]) \
		stxvw4x(51, %[cc3], %[buf]) \
 \
		/* \
		 * Update counters and data pointer. \
		 */ \
		vand(16, 20, 20) \
		vand(17, 21, 21) \
		vand(18, 22, 22) \
		vand(19, 23, 23) \
		addi(%[buf], %[buf], 64) \
 \
		bdnz(loop) \
 \
		/* \
		 * Write back new counter values. \
		 */ \
		BYTESWAP(16) \
		BYTESWAP(17) \
		BYTESWAP(18) \
		BYTESWAP(19) \
		stxvw4x(48, %[cc0], %[ctrbuf]) \
		stxvw4x(49, %[cc1], %[ctrbuf]) \
		stxvw4x(50, %[cc2], %[ctrbuf]) \
		stxvw4x(51, %[cc3], %[ctrbuf]) \
 \
: [cc] "+b" (cc), [buf] "+b" (buf), \
	[cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3) \
: [sk] "b" (sk), [ctrbuf] "b" (ctrbuf), \
	[num_blocks_x4] "b" (num_blocks_x4), [ctrinc_x4] "b" (ctrinc_x4) \
	BYTESWAP_REG \
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \
  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \
  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \
  "v30", "ctr", "memory" \
	); \
}

MKCTR(128)
MKCTR(192)
MKCTR(256)
537 
/*
 * MKCBCMAC(size) defines cbcmac_<size>(): CBC-MAC update over
 * num_blocks 16-byte blocks.
 *
 *   sk          subkeys, as loaded by LOAD_SUBKEYS_<size>
 *   cbcmac      current 16-byte CBC-MAC value; read at entry, updated
 *               value written back at exit
 *   buf         input data (read only)
 *   num_blocks  number of 16-byte blocks to process
 *
 * num_blocks is moved into CTR and only tested by bdnz at the end of
 * the loop body, so it must be at least 1: a zero count would not skip
 * the loop.
 */
#define MKCBCMAC(size) \
static void \
cbcmac_ ## size(const unsigned char *sk, \
	unsigned char *cbcmac, const unsigned char *buf, size_t num_blocks) \
{ \
	long cc; \
 \
	cc = 0; \
	asm volatile ( \
 \
		/* \
		 * Load subkeys into v0..v10/v12/v14 (depending on key \
		 * size); this leaves cc advanced, hence the reset. \
		 */ \
		LOAD_SUBKEYS_ ## size \
		li(%[cc], 0) \
 \
		BYTESWAP_INIT \
 \
		/* \
		 * Load current CBC-MAC value into v16. \
		 */ \
		lxvw4x(48, %[cc], %[cbcmac]) \
		BYTESWAP(16) \
 \
		mtctr(%[num_blocks]) \
 \
	label(loop) \
		/* \
		 * Load next block, XOR into current CBC-MAC value, \
		 * and then encrypt it. \
		 */ \
		lxvw4x(49, %[cc], %[buf]) \
		BYTESWAP(17) \
		vxor(16, 16, 17) \
		BLOCK_ENCRYPT_ ## size(16) \
		addi(%[buf], %[buf], 16) \
 \
		bdnz(loop) \
 \
		/* \
		 * Write back new CBC-MAC value. \
		 */ \
		BYTESWAP(16) \
		stxvw4x(48, %[cc], %[cbcmac]) \
 \
: [cc] "+b" (cc), [buf] "+b" (buf) \
: [sk] "b" (sk), [cbcmac] "b" (cbcmac), [num_blocks] "b" (num_blocks) \
	BYTESWAP_REG \
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \
  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \
  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \
  "v30", "ctr", "memory" \
	); \
}

MKCBCMAC(128)
MKCBCMAC(192)
MKCBCMAC(256)
596 
/*
 * MKENCRYPT(size) defines ctrcbc_<size>_encrypt(): CTR encryption of
 * buf in place, combined with a CBC-MAC computed over the encrypted
 * blocks.
 *
 *   sk          subkeys, as loaded by LOAD_SUBKEYS_<size>
 *   ctr         16-byte counter; read at entry, advanced by 1 per
 *               block (ctrinc), written back at exit
 *   cbcmac      16-byte CBC-MAC value; read at entry, written back at
 *               exit
 *   buf         data, processed in place
 *   num_blocks  number of 16-byte blocks
 *
 * The first block is encrypted before the loop counter is even loaded,
 * and bdz only skips the loop when the count is exactly 1, so
 * num_blocks must be at least 1.
 */
#define MKENCRYPT(size) \
static void \
ctrcbc_ ## size ## _encrypt(const unsigned char *sk, \
	unsigned char *ctr, unsigned char *cbcmac, unsigned char *buf, \
	size_t num_blocks) \
{ \
	long cc; \
 \
	cc = 0; \
	asm volatile ( \
 \
		/* \
		 * Load subkeys into v0..v10/v12/v14 (depending on key \
		 * size); this leaves cc advanced, hence the reset. \
		 */ \
		LOAD_SUBKEYS_ ## size \
		li(%[cc], 0) \
 \
		BYTESWAP_INIT \
		INCR_128_INIT \
 \
		/* \
		 * Load current CTR counter into v16, and current \
		 * CBC-MAC IV into v17. \
		 */ \
		lxvw4x(48, %[cc], %[ctr]) \
		lxvw4x(49, %[cc], %[cbcmac]) \
		BYTESWAP(16) \
		BYTESWAP(17) \
 \
		/* \
		 * At each iteration, we do two parallel encryptions: \
		 *  - new counter value for encryption of the next block; \
		 *  - CBC-MAC over the previous encrypted block. \
		 * Thus, each plaintext block implies two AES instances, \
		 * over two successive iterations. This requires a single \
		 * counter encryption before the loop, and a single \
		 * CBC-MAC encryption after the loop. \
		 */ \
 \
		/* \
		 * Encrypt first block (into v20). v21 receives the \
		 * byte-swapped copy that is stored; v20 keeps the value \
		 * that goes into the CBC-MAC. \
		 */ \
		lxvw4x(52, %[cc], %[buf]) \
		BYTESWAP(20) \
		INCR_128(22, 16) \
		BLOCK_ENCRYPT_ ## size(16) \
		vxor(20, 20, 16) \
		BYTESWAPX(21, 20) \
		stxvw4x(53, %[cc], %[buf]) \
		vand(16, 22, 22) \
		addi(%[buf], %[buf], 16) \
 \
		/* \
		 * Load loop counter; skip the loop if there is only \
		 * one block in total (already handled by the boundary \
		 * conditions). \
		 */ \
		mtctr(%[num_blocks]) \
		bdz(fastexit) \
 \
	label(loop) \
		/* \
		 * Upon loop entry: \
		 *    v16   counter value for next block \
		 *    v17   current CBC-MAC value \
		 *    v20   encrypted previous block \
		 */ \
		vxor(17, 17, 20) \
		INCR_128(22, 16) \
		lxvw4x(52, %[cc], %[buf]) \
		BYTESWAP(20) \
		BLOCK_ENCRYPT_X2_ ## size(16, 17) \
		vxor(20, 20, 16) \
		BYTESWAPX(21, 20) \
		stxvw4x(53, %[cc], %[buf]) \
		addi(%[buf], %[buf], 16) \
		vand(16, 22, 22) \
 \
		bdnz(loop) \
 \
	label(fastexit) \
		/* \
		 * Fold the last encrypted block into the CBC-MAC, then \
		 * store back counter and CBC-MAC value. \
		 */ \
		vxor(17, 17, 20) \
		BLOCK_ENCRYPT_ ## size(17) \
		BYTESWAP(16) \
		BYTESWAP(17) \
		stxvw4x(48, %[cc], %[ctr]) \
		stxvw4x(49, %[cc], %[cbcmac]) \
 \
: [cc] "+b" (cc), [buf] "+b" (buf) \
: [sk] "b" (sk), [ctr] "b" (ctr), [cbcmac] "b" (cbcmac), \
	[num_blocks] "b" (num_blocks), [ctrinc] "b" (ctrinc) \
	BYTESWAP_REG \
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \
  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \
  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \
  "v30", "ctr", "memory" \
	); \
}

MKENCRYPT(128)
MKENCRYPT(192)
MKENCRYPT(256)
699 
/*
 * MKDECRYPT(size) defines ctrcbc_<size>_decrypt(): CBC-MAC over the
 * encrypted blocks of buf, combined with their CTR decryption in place.
 *
 *   sk          subkeys, as loaded by LOAD_SUBKEYS_<size>
 *   ctr         16-byte counter; read at entry, advanced by 1 per
 *               block (ctrinc), written back at exit
 *   cbcmac      16-byte CBC-MAC value; read at entry, written back at
 *               exit
 *   buf         data, processed in place
 *   num_blocks  number of 16-byte blocks
 *
 * num_blocks is moved into CTR and only tested by bdnz at the end of
 * the loop body, so it must be at least 1: a zero count would not skip
 * the loop.
 */
#define MKDECRYPT(size) \
static void \
ctrcbc_ ## size ## _decrypt(const unsigned char *sk, \
	unsigned char *ctr, unsigned char *cbcmac, unsigned char *buf, \
	size_t num_blocks) \
{ \
	long cc; \
 \
	cc = 0; \
	asm volatile ( \
 \
		/* \
		 * Load subkeys into v0..v10/v12/v14 (depending on key \
		 * size); this leaves cc advanced, hence the reset. \
		 */ \
		LOAD_SUBKEYS_ ## size \
		li(%[cc], 0) \
 \
		BYTESWAP_INIT \
		INCR_128_INIT \
 \
		/* \
		 * Load current CTR counter into v16, and current \
		 * CBC-MAC IV into v17. \
		 */ \
		lxvw4x(48, %[cc], %[ctr]) \
		lxvw4x(49, %[cc], %[cbcmac]) \
		BYTESWAP(16) \
		BYTESWAP(17) \
 \
		/* \
		 * At each iteration, we do two parallel encryptions: \
		 *  - new counter value for decryption of the next block; \
		 *  - CBC-MAC over the next encrypted block. \
		 * Each iteration performs the two AES instances related \
		 * to the current block; there is thus no need for some \
		 * extra pre-loop and post-loop work as in encryption. \
		 */ \
 \
		mtctr(%[num_blocks]) \
 \
	label(loop) \
		/* \
		 * Upon loop entry: \
		 *    v16   counter value for next block \
		 *    v17   current CBC-MAC value \
		 * The encrypted block is XORed into the CBC-MAC value \
		 * before it is decrypted. \
		 */ \
		lxvw4x(52, %[cc], %[buf]) \
		BYTESWAP(20) \
		vxor(17, 17, 20) \
		INCR_128(22, 16) \
		BLOCK_ENCRYPT_X2_ ## size(16, 17) \
		vxor(20, 20, 16) \
		BYTESWAPX(21, 20) \
		stxvw4x(53, %[cc], %[buf]) \
		addi(%[buf], %[buf], 16) \
		vand(16, 22, 22) \
 \
		bdnz(loop) \
 \
		/* \
		 * Store back counter and CBC-MAC value. \
		 */ \
		BYTESWAP(16) \
		BYTESWAP(17) \
		stxvw4x(48, %[cc], %[ctr]) \
		stxvw4x(49, %[cc], %[cbcmac]) \
 \
: [cc] "+b" (cc), [buf] "+b" (buf) \
: [sk] "b" (sk), [ctr] "b" (ctr), [cbcmac] "b" (cbcmac), \
	[num_blocks] "b" (num_blocks), [ctrinc] "b" (ctrinc) \
	BYTESWAP_REG \
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \
  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \
  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \
  "v30", "ctr", "memory" \
	); \
}

MKDECRYPT(128)
MKDECRYPT(192)
MKDECRYPT(256)
781 
782 /* see bearssl_block.h */
783 void
784 br_aes_pwr8_ctrcbc_encrypt(const br_aes_pwr8_ctrcbc_keys *ctx,
785 	void *ctr, void *cbcmac, void *data, size_t len)
786 {
787 	if (len == 0) {
788 		return;
789 	}
790 	switch (ctx->num_rounds) {
791 	case 10:
792 		ctrcbc_128_encrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
793 		break;
794 	case 12:
795 		ctrcbc_192_encrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
796 		break;
797 	default:
798 		ctrcbc_256_encrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
799 		break;
800 	}
801 }
802 
803 /* see bearssl_block.h */
804 void
805 br_aes_pwr8_ctrcbc_decrypt(const br_aes_pwr8_ctrcbc_keys *ctx,
806 	void *ctr, void *cbcmac, void *data, size_t len)
807 {
808 	if (len == 0) {
809 		return;
810 	}
811 	switch (ctx->num_rounds) {
812 	case 10:
813 		ctrcbc_128_decrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
814 		break;
815 	case 12:
816 		ctrcbc_192_decrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
817 		break;
818 	default:
819 		ctrcbc_256_decrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
820 		break;
821 	}
822 }
823 
/*
 * Write src + 1 to dst, both being 16-byte values handled as two
 * big-endian 64-bit halves. The carry from the low half into the high
 * half is derived with bit operations only (no comparison or branch):
 * (x | -x) has its top bit set exactly when x is non-zero, so the
 * expression below is 1 when the incremented low half wrapped to zero
 * and 0 otherwise.
 *
 * src is fully read before dst is written.
 */
static inline void
incr_ctr(void *dst, const void *src)
{
	const unsigned char *in;
	unsigned char *out;
	uint64_t upper, lower, carry;

	in = src;
	out = dst;
	upper = br_dec64be(in);
	lower = br_dec64be(in + 8);
	lower ++;
	carry = ((lower | -lower) >> 63) ^ (uint64_t)1;
	upper += carry;
	br_enc64be(out, upper);
	br_enc64be(out + 8, lower);
}
836 
837 /* see bearssl_block.h */
838 void
839 br_aes_pwr8_ctrcbc_ctr(const br_aes_pwr8_ctrcbc_keys *ctx,
840 	void *ctr, void *data, size_t len)
841 {
842 	unsigned char ctrbuf[64];
843 
844 	memcpy(ctrbuf, ctr, 16);
845 	incr_ctr(ctrbuf + 16, ctrbuf);
846 	incr_ctr(ctrbuf + 32, ctrbuf + 16);
847 	incr_ctr(ctrbuf + 48, ctrbuf + 32);
848 	if (len >= 64) {
849 		switch (ctx->num_rounds) {
850 		case 10:
851 			ctr_128(ctx->skey.skni, ctrbuf, data, len >> 6);
852 			break;
853 		case 12:
854 			ctr_192(ctx->skey.skni, ctrbuf, data, len >> 6);
855 			break;
856 		default:
857 			ctr_256(ctx->skey.skni, ctrbuf, data, len >> 6);
858 			break;
859 		}
860 		data = (unsigned char *)data + (len & ~(size_t)63);
861 		len &= 63;
862 	}
863 	if (len > 0) {
864 		unsigned char tmp[64];
865 
866 		if (len >= 32) {
867 			if (len >= 48) {
868 				memcpy(ctr, ctrbuf + 48, 16);
869 			} else {
870 				memcpy(ctr, ctrbuf + 32, 16);
871 			}
872 		} else {
873 			if (len >= 16) {
874 				memcpy(ctr, ctrbuf + 16, 16);
875 			}
876 		}
877 		memcpy(tmp, data, len);
878 		memset(tmp + len, 0, (sizeof tmp) - len);
879 		switch (ctx->num_rounds) {
880 		case 10:
881 			ctr_128(ctx->skey.skni, ctrbuf, tmp, 1);
882 			break;
883 		case 12:
884 			ctr_192(ctx->skey.skni, ctrbuf, tmp, 1);
885 			break;
886 		default:
887 			ctr_256(ctx->skey.skni, ctrbuf, tmp, 1);
888 			break;
889 		}
890 		memcpy(data, tmp, len);
891 	} else {
892 		memcpy(ctr, ctrbuf, 16);
893 	}
894 }
895 
896 /* see bearssl_block.h */
897 void
898 br_aes_pwr8_ctrcbc_mac(const br_aes_pwr8_ctrcbc_keys *ctx,
899 	void *cbcmac, const void *data, size_t len)
900 {
901 	if (len > 0) {
902 		switch (ctx->num_rounds) {
903 		case 10:
904 			cbcmac_128(ctx->skey.skni, cbcmac, data, len >> 4);
905 			break;
906 		case 12:
907 			cbcmac_192(ctx->skey.skni, cbcmac, data, len >> 4);
908 			break;
909 		default:
910 			cbcmac_256(ctx->skey.skni, cbcmac, data, len >> 4);
911 			break;
912 		}
913 	}
914 }
915 
/*
 * see bearssl_block.h
 *
 * Vtable binding the generic br_block_ctrcbc_class interface to this
 * implementation. The function pointers are cast because the functions
 * above take a typed context pointer (br_aes_pwr8_ctrcbc_keys *) while
 * the class declares them on a pointer to the vtable pointer.
 */
const br_block_ctrcbc_class br_aes_pwr8_ctrcbc_vtable = {
	/* Size of the context structure. */
	sizeof(br_aes_pwr8_ctrcbc_keys),
	/*
	 * NOTE(review): presumably the block size in bytes (16) and its
	 * base-2 logarithm (4); confirm against the field order of
	 * br_block_ctrcbc_class in bearssl_block.h.
	 */
	16,
	4,
	/* init */
	(void (*)(const br_block_ctrcbc_class **, const void *, size_t))
		&br_aes_pwr8_ctrcbc_init,
	/* encrypt (CTR + CBC-MAC) */
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, void *, void *, size_t))
		&br_aes_pwr8_ctrcbc_encrypt,
	/* decrypt (CBC-MAC + CTR) */
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, void *, void *, size_t))
		&br_aes_pwr8_ctrcbc_decrypt,
	/* ctr (CTR only) */
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, void *, size_t))
		&br_aes_pwr8_ctrcbc_ctr,
	/* mac (CBC-MAC only) */
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, const void *, size_t))
		&br_aes_pwr8_ctrcbc_mac
};
936 
937 #else
938 
/*
 * see bearssl_block.h
 *
 * Fallback used when the file is compiled without BR_POWER8: the
 * POWER8 implementation does not exist in this build, so there is never
 * a vtable to return.
 */
const br_block_ctrcbc_class *
br_aes_pwr8_ctrcbc_get_vtable(void)
{
	return NULL;
}
945 
946 #endif
947