/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_POWER_ASM_MACROS   1
#include "inner.h"

#if BR_POWER8

/* see bearssl_block.h */
void
br_aes_pwr8_ctr_init(br_aes_pwr8_ctr_keys *ctx,
	const void *key, size_t len)
{
	ctx->vtable = &br_aes_pwr8_ctr_vtable;
	/*
	 * The key schedule also yields the round count: 10, 12 or 14
	 * rounds for 16-, 24- or 32-byte keys, respectively.
	 */
	ctx->num_rounds = br_aes_pwr8_keysched(ctx->skey.skni, key, len);
}

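/*
 * The three ctr_*() functions below share the same structure and
 * differ only in the number of AES rounds (10, 12 or 14). Each loop
 * iteration encrypts four counter blocks in parallel: ivbuf holds
 * four 16-byte blocks whose last 32-bit word is a big-endian block
 * counter, and v28 adds 4 to each counter per iteration. num_blocks
 * counts 16-byte blocks; callers always pass a nonzero multiple of 4.
 */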
static void
ctr_128(const unsigned char *sk, const unsigned char *ivbuf,
	unsigned char *buf, size_t num_blocks)
{
	long cc0, cc1, cc2, cc3;

#if BR_POWER8_LE
	static const uint32_t idx2be[] = {
		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
	};
#endif
	static const uint32_t ctrinc[] = {
		0, 0, 0, 4
	};

	cc0 = 0;
	cc1 = 16;
	cc2 = 32;
	cc3 = 48;
	asm volatile (

		/*
		 * Load subkeys into v0..v10
		 */
		lxvw4x(32, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(33, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(34, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(35, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(36, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(37, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(38, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(39, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(40, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(41, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(42, %[cc0], %[sk])
		li(%[cc0], 0)

#if BR_POWER8_LE
		/*
		 * v15 = constant for byteswapping words
		 */
		lxvw4x(47, 0, %[idx2be])
#endif
		/*
		 * v28 = increment for IV counter.
		 */
		lxvw4x(60, 0, %[ctrinc])

		/*
		 * Load IV into v16..v19
		 */
		lxvw4x(48, %[cc0], %[ivbuf])
		lxvw4x(49, %[cc1], %[ivbuf])
		lxvw4x(50, %[cc2], %[ivbuf])
		lxvw4x(51, %[cc3], %[ivbuf])
#if BR_POWER8_LE
		vperm(16, 16, 16, 15)
		vperm(17, 17, 17, 15)
		vperm(18, 18, 18, 15)
		vperm(19, 19, 19, 15)
#endif

		mtctr(%[num_blocks])
	label(loop)
		/*
		 * Compute next IV into v24..v27
		 */
		vadduwm(24, 16, 28)
		vadduwm(25, 17, 28)
		vadduwm(26, 18, 28)
		vadduwm(27, 19, 28)

		/*
		 * Load the next data blocks into v20..v23. We do this
		 * early, although they are not needed until the IV
		 * encryption is done.
		 */
		lxvw4x(52, %[cc0], %[buf])
		lxvw4x(53, %[cc1], %[buf])
		lxvw4x(54, %[cc2], %[buf])
		lxvw4x(55, %[cc3], %[buf])

		/*
		 * Encrypt the current IV.
		 */
		vxor(16, 16, 0)
		vxor(17, 17, 0)
		vxor(18, 18, 0)
		vxor(19, 19, 0)
		vcipher(16, 16, 1)
		vcipher(17, 17, 1)
		vcipher(18, 18, 1)
		vcipher(19, 19, 1)
		vcipher(16, 16, 2)
		vcipher(17, 17, 2)
		vcipher(18, 18, 2)
		vcipher(19, 19, 2)
		vcipher(16, 16, 3)
		vcipher(17, 17, 3)
		vcipher(18, 18, 3)
		vcipher(19, 19, 3)
		vcipher(16, 16, 4)
		vcipher(17, 17, 4)
		vcipher(18, 18, 4)
		vcipher(19, 19, 4)
		vcipher(16, 16, 5)
		vcipher(17, 17, 5)
		vcipher(18, 18, 5)
		vcipher(19, 19, 5)
		vcipher(16, 16, 6)
		vcipher(17, 17, 6)
		vcipher(18, 18, 6)
		vcipher(19, 19, 6)
		vcipher(16, 16, 7)
		vcipher(17, 17, 7)
		vcipher(18, 18, 7)
		vcipher(19, 19, 7)
		vcipher(16, 16, 8)
		vcipher(17, 17, 8)
		vcipher(18, 18, 8)
		vcipher(19, 19, 8)
		vcipher(16, 16, 9)
		vcipher(17, 17, 9)
		vcipher(18, 18, 9)
		vcipher(19, 19, 9)
		vcipherlast(16, 16, 10)
		vcipherlast(17, 17, 10)
		vcipherlast(18, 18, 10)
		vcipherlast(19, 19, 10)

#if BR_POWER8_LE
		vperm(16, 16, 16, 15)
		vperm(17, 17, 17, 15)
		vperm(18, 18, 18, 15)
		vperm(19, 19, 19, 15)
#endif

		/*
		 * XOR the data blocks (loaded into v20..v23 above)
		 * with the encrypted IV, and store the result.
		 */
		vxor(16, 20, 16)
		vxor(17, 21, 17)
		vxor(18, 22, 18)
		vxor(19, 23, 19)
		stxvw4x(48, %[cc0], %[buf])
		stxvw4x(49, %[cc1], %[buf])
		stxvw4x(50, %[cc2], %[buf])
		stxvw4x(51, %[cc3], %[buf])

		addi(%[buf], %[buf], 64)

		/*
		 * Update IV: vand with identical operands copies
		 * v24..v27 into v16..v19.
		 */
		vand(16, 24, 24)
		vand(17, 25, 25)
		vand(18, 26, 26)
		vand(19, 27, 27)

		bdnz(loop)

: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3),
  [buf] "+b" (buf)
: [sk] "b" (sk), [ivbuf] "b" (ivbuf), [num_blocks] "b" (num_blocks >> 2),
  [ctrinc] "b" (ctrinc)
#if BR_POWER8_LE
	, [idx2be] "b" (idx2be)
#endif
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
  "ctr", "memory"
	);
}

static void
ctr_192(const unsigned char *sk, const unsigned char *ivbuf,
	unsigned char *buf, size_t num_blocks)
{
	long cc0, cc1, cc2, cc3;

#if BR_POWER8_LE
	static const uint32_t idx2be[] = {
		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
	};
#endif
	static const uint32_t ctrinc[] = {
		0, 0, 0, 4
	};

	cc0 = 0;
	cc1 = 16;
	cc2 = 32;
	cc3 = 48;
	asm volatile (

		/*
		 * Load subkeys into v0..v12
		 */
		lxvw4x(32, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(33, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(34, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(35, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(36, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(37, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(38, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(39, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(40, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(41, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(42, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(43, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(44, %[cc0], %[sk])
		li(%[cc0], 0)

#if BR_POWER8_LE
		/*
		 * v15 = constant for byteswapping words
		 */
		lxvw4x(47, 0, %[idx2be])
#endif
		/*
		 * v28 = increment for IV counter.
		 */
		lxvw4x(60, 0, %[ctrinc])

		/*
		 * Load IV into v16..v19
		 */
		lxvw4x(48, %[cc0], %[ivbuf])
		lxvw4x(49, %[cc1], %[ivbuf])
		lxvw4x(50, %[cc2], %[ivbuf])
		lxvw4x(51, %[cc3], %[ivbuf])
#if BR_POWER8_LE
		vperm(16, 16, 16, 15)
		vperm(17, 17, 17, 15)
		vperm(18, 18, 18, 15)
		vperm(19, 19, 19, 15)
#endif

		mtctr(%[num_blocks])
	label(loop)
		/*
		 * Compute next IV into v24..v27
		 */
		vadduwm(24, 16, 28)
		vadduwm(25, 17, 28)
		vadduwm(26, 18, 28)
		vadduwm(27, 19, 28)

		/*
		 * Load the next data blocks into v20..v23. We do this
		 * early, although they are not needed until the IV
		 * encryption is done.
		 */
		lxvw4x(52, %[cc0], %[buf])
		lxvw4x(53, %[cc1], %[buf])
		lxvw4x(54, %[cc2], %[buf])
		lxvw4x(55, %[cc3], %[buf])

		/*
		 * Encrypt the current IV.
		 */
		vxor(16, 16, 0)
		vxor(17, 17, 0)
		vxor(18, 18, 0)
		vxor(19, 19, 0)
		vcipher(16, 16, 1)
		vcipher(17, 17, 1)
		vcipher(18, 18, 1)
		vcipher(19, 19, 1)
		vcipher(16, 16, 2)
		vcipher(17, 17, 2)
		vcipher(18, 18, 2)
		vcipher(19, 19, 2)
		vcipher(16, 16, 3)
		vcipher(17, 17, 3)
		vcipher(18, 18, 3)
		vcipher(19, 19, 3)
		vcipher(16, 16, 4)
		vcipher(17, 17, 4)
		vcipher(18, 18, 4)
		vcipher(19, 19, 4)
		vcipher(16, 16, 5)
		vcipher(17, 17, 5)
		vcipher(18, 18, 5)
		vcipher(19, 19, 5)
		vcipher(16, 16, 6)
		vcipher(17, 17, 6)
		vcipher(18, 18, 6)
		vcipher(19, 19, 6)
		vcipher(16, 16, 7)
		vcipher(17, 17, 7)
		vcipher(18, 18, 7)
		vcipher(19, 19, 7)
		vcipher(16, 16, 8)
		vcipher(17, 17, 8)
		vcipher(18, 18, 8)
		vcipher(19, 19, 8)
		vcipher(16, 16, 9)
		vcipher(17, 17, 9)
		vcipher(18, 18, 9)
		vcipher(19, 19, 9)
		vcipher(16, 16, 10)
		vcipher(17, 17, 10)
		vcipher(18, 18, 10)
		vcipher(19, 19, 10)
		vcipher(16, 16, 11)
		vcipher(17, 17, 11)
		vcipher(18, 18, 11)
		vcipher(19, 19, 11)
		vcipherlast(16, 16, 12)
		vcipherlast(17, 17, 12)
		vcipherlast(18, 18, 12)
		vcipherlast(19, 19, 12)

#if BR_POWER8_LE
		vperm(16, 16, 16, 15)
		vperm(17, 17, 17, 15)
		vperm(18, 18, 18, 15)
		vperm(19, 19, 19, 15)
#endif

		/*
		 * XOR the data blocks (loaded into v20..v23 above)
		 * with the encrypted IV, and store the result.
		 */
		vxor(16, 20, 16)
		vxor(17, 21, 17)
		vxor(18, 22, 18)
		vxor(19, 23, 19)
		stxvw4x(48, %[cc0], %[buf])
		stxvw4x(49, %[cc1], %[buf])
		stxvw4x(50, %[cc2], %[buf])
		stxvw4x(51, %[cc3], %[buf])

		addi(%[buf], %[buf], 64)

		/*
		 * Update IV: vand with identical operands copies
		 * v24..v27 into v16..v19.
		 */
		vand(16, 24, 24)
		vand(17, 25, 25)
		vand(18, 26, 26)
		vand(19, 27, 27)

		bdnz(loop)

: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3),
  [buf] "+b" (buf)
: [sk] "b" (sk), [ivbuf] "b" (ivbuf), [num_blocks] "b" (num_blocks >> 2),
  [ctrinc] "b" (ctrinc)
#if BR_POWER8_LE
	, [idx2be] "b" (idx2be)
#endif
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
  "ctr", "memory"
	);
}

static void
ctr_256(const unsigned char *sk, const unsigned char *ivbuf,
	unsigned char *buf, size_t num_blocks)
{
	long cc0, cc1, cc2, cc3;

#if BR_POWER8_LE
	static const uint32_t idx2be[] = {
		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
	};
#endif
	static const uint32_t ctrinc[] = {
		0, 0, 0, 4
	};

	cc0 = 0;
	cc1 = 16;
	cc2 = 32;
	cc3 = 48;
	asm volatile (

		/*
		 * Load subkeys into v0..v14
		 */
		lxvw4x(32, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(33, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(34, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(35, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(36, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(37, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(38, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(39, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(40, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(41, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(42, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(43, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(44, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(45, %[cc0], %[sk])
		addi(%[cc0], %[cc0], 16)
		lxvw4x(46, %[cc0], %[sk])
		li(%[cc0], 0)

#if BR_POWER8_LE
		/*
		 * v15 = constant for byteswapping words
		 */
		lxvw4x(47, 0, %[idx2be])
#endif
		/*
		 * v28 = increment for IV counter.
		 */
		lxvw4x(60, 0, %[ctrinc])

		/*
		 * Load IV into v16..v19
		 */
		lxvw4x(48, %[cc0], %[ivbuf])
		lxvw4x(49, %[cc1], %[ivbuf])
		lxvw4x(50, %[cc2], %[ivbuf])
		lxvw4x(51, %[cc3], %[ivbuf])
#if BR_POWER8_LE
		vperm(16, 16, 16, 15)
		vperm(17, 17, 17, 15)
		vperm(18, 18, 18, 15)
		vperm(19, 19, 19, 15)
#endif

		mtctr(%[num_blocks])
	label(loop)
		/*
		 * Compute next IV into v24..v27
		 */
		vadduwm(24, 16, 28)
		vadduwm(25, 17, 28)
		vadduwm(26, 18, 28)
		vadduwm(27, 19, 28)

		/*
		 * Load the next data blocks into v20..v23. We do this
		 * early, although they are not needed until the IV
		 * encryption is done.
		 */
		lxvw4x(52, %[cc0], %[buf])
		lxvw4x(53, %[cc1], %[buf])
		lxvw4x(54, %[cc2], %[buf])
		lxvw4x(55, %[cc3], %[buf])

		/*
		 * Encrypt the current IV.
		 */
		vxor(16, 16, 0)
		vxor(17, 17, 0)
		vxor(18, 18, 0)
		vxor(19, 19, 0)
		vcipher(16, 16, 1)
		vcipher(17, 17, 1)
		vcipher(18, 18, 1)
		vcipher(19, 19, 1)
		vcipher(16, 16, 2)
		vcipher(17, 17, 2)
		vcipher(18, 18, 2)
		vcipher(19, 19, 2)
		vcipher(16, 16, 3)
		vcipher(17, 17, 3)
		vcipher(18, 18, 3)
		vcipher(19, 19, 3)
		vcipher(16, 16, 4)
		vcipher(17, 17, 4)
		vcipher(18, 18, 4)
		vcipher(19, 19, 4)
		vcipher(16, 16, 5)
		vcipher(17, 17, 5)
		vcipher(18, 18, 5)
		vcipher(19, 19, 5)
		vcipher(16, 16, 6)
		vcipher(17, 17, 6)
		vcipher(18, 18, 6)
		vcipher(19, 19, 6)
		vcipher(16, 16, 7)
		vcipher(17, 17, 7)
		vcipher(18, 18, 7)
		vcipher(19, 19, 7)
		vcipher(16, 16, 8)
		vcipher(17, 17, 8)
		vcipher(18, 18, 8)
		vcipher(19, 19, 8)
		vcipher(16, 16, 9)
		vcipher(17, 17, 9)
		vcipher(18, 18, 9)
		vcipher(19, 19, 9)
		vcipher(16, 16, 10)
		vcipher(17, 17, 10)
		vcipher(18, 18, 10)
		vcipher(19, 19, 10)
		vcipher(16, 16, 11)
		vcipher(17, 17, 11)
		vcipher(18, 18, 11)
		vcipher(19, 19, 11)
		vcipher(16, 16, 12)
		vcipher(17, 17, 12)
		vcipher(18, 18, 12)
		vcipher(19, 19, 12)
		vcipher(16, 16, 13)
		vcipher(17, 17, 13)
		vcipher(18, 18, 13)
		vcipher(19, 19, 13)
		vcipherlast(16, 16, 14)
		vcipherlast(17, 17, 14)
		vcipherlast(18, 18, 14)
		vcipherlast(19, 19, 14)

#if BR_POWER8_LE
		vperm(16, 16, 16, 15)
		vperm(17, 17, 17, 15)
		vperm(18, 18, 18, 15)
		vperm(19, 19, 19, 15)
#endif

		/*
		 * XOR the data blocks (loaded into v20..v23 above)
		 * with the encrypted IV, and store the result.
		 */
		vxor(16, 20, 16)
		vxor(17, 21, 17)
		vxor(18, 22, 18)
		vxor(19, 23, 19)
		stxvw4x(48, %[cc0], %[buf])
		stxvw4x(49, %[cc1], %[buf])
		stxvw4x(50, %[cc2], %[buf])
		stxvw4x(51, %[cc3], %[buf])

		addi(%[buf], %[buf], 64)

		/*
		 * Update IV: vand with identical operands copies
		 * v24..v27 into v16..v19.
		 */
		vand(16, 24, 24)
		vand(17, 25, 25)
		vand(18, 26, 26)
		vand(19, 27, 27)

		bdnz(loop)

: [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), [cc3] "+b" (cc3),
  [buf] "+b" (buf)
: [sk] "b" (sk), [ivbuf] "b" (ivbuf), [num_blocks] "b" (num_blocks >> 2),
  [ctrinc] "b" (ctrinc)
#if BR_POWER8_LE
	, [idx2be] "b" (idx2be)
#endif
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
  "ctr", "memory"
	);
}

/* see bearssl_block.h */
uint32_t
br_aes_pwr8_ctr_run(const br_aes_pwr8_ctr_keys *ctx,
	const void *iv, uint32_t cc, void *data, size_t len)
{
	unsigned char *buf;
	unsigned char ivbuf[64];

	buf = data;
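	/*
	 * Replicate the 12-byte IV into four 16-byte counter blocks;
	 * the last word of each block receives the successive
	 * big-endian counter values below.
	 */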
	memcpy(ivbuf +  0, iv, 12);
	memcpy(ivbuf + 16, iv, 12);
	memcpy(ivbuf + 32, iv, 12);
	memcpy(ivbuf + 48, iv, 12);
	if (len >= 64) {
		br_enc32be(ivbuf + 12, cc + 0);
		br_enc32be(ivbuf + 28, cc + 1);
		br_enc32be(ivbuf + 44, cc + 2);
		br_enc32be(ivbuf + 60, cc + 3);
		switch (ctx->num_rounds) {
		case 10:
			ctr_128(ctx->skey.skni, ivbuf, buf,
				(len >> 4) & ~(size_t)3);
			break;
		case 12:
			ctr_192(ctx->skey.skni, ivbuf, buf,
				(len >> 4) & ~(size_t)3);
			break;
		default:
			ctr_256(ctx->skey.skni, ivbuf, buf,
				(len >> 4) & ~(size_t)3);
			break;
		}
		cc += (len >> 4) & ~(size_t)3;
		buf += len & ~(size_t)63;
		len &= 63;
	}
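	/*
	 * Remaining data (at most 63 bytes) goes through a zero-padded
	 * 64-byte stack buffer; only the meaningful bytes are copied
	 * back into the output.
	 */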
	if (len > 0) {
		unsigned char tmp[64];

		memcpy(tmp, buf, len);
		memset(tmp + len, 0, (sizeof tmp) - len);
		br_enc32be(ivbuf + 12, cc + 0);
		br_enc32be(ivbuf + 28, cc + 1);
		br_enc32be(ivbuf + 44, cc + 2);
		br_enc32be(ivbuf + 60, cc + 3);
		switch (ctx->num_rounds) {
		case 10:
			ctr_128(ctx->skey.skni, ivbuf, tmp, 4);
			break;
		case 12:
			ctr_192(ctx->skey.skni, ivbuf, tmp, 4);
			break;
		default:
			ctr_256(ctx->skey.skni, ivbuf, tmp, 4);
			break;
		}
		memcpy(buf, tmp, len);
		cc += (len + 15) >> 4;
	}
	return cc;
}

/* see bearssl_block.h */
const br_block_ctr_class br_aes_pwr8_ctr_vtable = {
	sizeof(br_aes_pwr8_ctr_keys),
	16,    /* block size, in bytes */
	4,     /* log2(block size) */
	(void (*)(const br_block_ctr_class **, const void *, size_t))
		&br_aes_pwr8_ctr_init,
	(uint32_t (*)(const br_block_ctr_class *const *,
		const void *, uint32_t, void *, size_t))
		&br_aes_pwr8_ctr_run
};

/* see bearssl_block.h */
const br_block_ctr_class *
br_aes_pwr8_ctr_get_vtable(void)
{
	return br_aes_pwr8_supported() ? &br_aes_pwr8_ctr_vtable : NULL;
}
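
/*
 * Minimal usage sketch (illustrative only; key, iv, cc, data and
 * data_len are placeholders). The vtable is fetched at runtime so
 * that this implementation is used only on CPUs that support it:
 *
 *	const br_block_ctr_class *vt;
 *	br_aes_pwr8_ctr_keys ctx;
 *
 *	vt = br_aes_pwr8_ctr_get_vtable();
 *	if (vt != NULL) {
 *		vt->init(&ctx.vtable, key, sizeof key);
 *		cc = vt->run(&ctx.vtable, iv, cc, data, data_len);
 *	}
 */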

#else

/* see bearssl_block.h */
const br_block_ctr_class *
br_aes_pwr8_ctr_get_vtable(void)
{
	return NULL;
}

#endif