/*
 * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_POWER_ASM_MACROS   1
#include "inner.h"

#if BR_POWER8

/* see bearssl_block.h */
const br_block_ctrcbc_class *
br_aes_pwr8_ctrcbc_get_vtable(void)
{
	return br_aes_pwr8_supported() ? &br_aes_pwr8_ctrcbc_vtable : NULL;
}

/* see bearssl_block.h */
void
br_aes_pwr8_ctrcbc_init(br_aes_pwr8_ctrcbc_keys *ctx,
	const void *key, size_t len)
{
	ctx->vtable = &br_aes_pwr8_ctrcbc_vtable;
	ctx->num_rounds = br_aes_pwr8_keysched(ctx->skey.skni, key, len);
}

/*
 * Register conventions for CTR + CBC-MAC:
 *
 *    AES subkeys are in registers 0 to 10/12/14 (depending on key size)
 *    Register v15 contains the byteswap index register (little-endian only)
 *    Register v16 contains the CTR counter value
 *    Register v17 contains the CBC-MAC current value
 *    Registers v18 to v27 are scratch
 *    Counter increment uses v28, v29 and v30
 *
 * For CTR alone:
 *
 *    AES subkeys are in registers 0 to 10/12/14 (depending on key size)
 *    Register v15 contains the byteswap index register (little-endian only)
 *    Registers v16 to v19 contain the CTR counter values (four blocks)
 *    Registers v20 to v27 are scratch
 *    Counter increment uses v28, v29 and v30
 */

#define LOAD_SUBKEYS_128 \
		lxvw4x(32, %[cc], %[sk]) \
		addi(%[cc], %[cc], 16) \
		lxvw4x(33, %[cc], %[sk]) \
		addi(%[cc], %[cc], 16) \
		lxvw4x(34, %[cc], %[sk]) \
		addi(%[cc], %[cc], 16) \
		lxvw4x(35, %[cc], %[sk]) \
		addi(%[cc], %[cc], 16) \
		lxvw4x(36, %[cc], %[sk]) \
		addi(%[cc], %[cc], 16) \
		lxvw4x(37, %[cc], %[sk]) \
		addi(%[cc], %[cc], 16) \
		lxvw4x(38, %[cc], %[sk]) \
		addi(%[cc], %[cc], 16) \
		lxvw4x(39, %[cc], %[sk]) \
		addi(%[cc], %[cc], 16) \
		lxvw4x(40, %[cc], %[sk]) \
		addi(%[cc], %[cc], 16) \
		lxvw4x(41, %[cc], %[sk]) \
		addi(%[cc], %[cc], 16) \
		lxvw4x(42, %[cc], %[sk])

#define LOAD_SUBKEYS_192 \
	LOAD_SUBKEYS_128 \
		addi(%[cc], %[cc], 16) \
		lxvw4x(43, %[cc], %[sk]) \
		addi(%[cc], %[cc], 16) \
		lxvw4x(44, %[cc], %[sk])

#define LOAD_SUBKEYS_256 \
	LOAD_SUBKEYS_192 \
		addi(%[cc], %[cc], 16) \
		lxvw4x(45, %[cc], %[sk]) \
		addi(%[cc], %[cc], 16) \
		lxvw4x(46, %[cc], %[sk])
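/*
 * Note (added commentary): lxvw4x targets VSX registers, and VSX
 * registers 32 to 63 alias the vector (AltiVec) registers v0 to v31.
 * The loads into VSX registers 32..46 above therefore fill v0..v14
 * with the round subkeys, matching the register conventions described
 * in the comment before these macros.
 */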
#define BLOCK_ENCRYPT_128(x) \
		vxor(x, x, 0) \
		vcipher(x, x, 1) \
		vcipher(x, x, 2) \
		vcipher(x, x, 3) \
		vcipher(x, x, 4) \
		vcipher(x, x, 5) \
		vcipher(x, x, 6) \
		vcipher(x, x, 7) \
		vcipher(x, x, 8) \
		vcipher(x, x, 9) \
		vcipherlast(x, x, 10)

#define BLOCK_ENCRYPT_192(x) \
		vxor(x, x, 0) \
		vcipher(x, x, 1) \
		vcipher(x, x, 2) \
		vcipher(x, x, 3) \
		vcipher(x, x, 4) \
		vcipher(x, x, 5) \
		vcipher(x, x, 6) \
		vcipher(x, x, 7) \
		vcipher(x, x, 8) \
		vcipher(x, x, 9) \
		vcipher(x, x, 10) \
		vcipher(x, x, 11) \
		vcipherlast(x, x, 12)

#define BLOCK_ENCRYPT_256(x) \
		vxor(x, x, 0) \
		vcipher(x, x, 1) \
		vcipher(x, x, 2) \
		vcipher(x, x, 3) \
		vcipher(x, x, 4) \
		vcipher(x, x, 5) \
		vcipher(x, x, 6) \
		vcipher(x, x, 7) \
		vcipher(x, x, 8) \
		vcipher(x, x, 9) \
		vcipher(x, x, 10) \
		vcipher(x, x, 11) \
		vcipher(x, x, 12) \
		vcipher(x, x, 13) \
		vcipherlast(x, x, 14)

#define BLOCK_ENCRYPT_X2_128(x, y) \
		vxor(x, x, 0) \
		vxor(y, y, 0) \
		vcipher(x, x, 1) \
		vcipher(y, y, 1) \
		vcipher(x, x, 2) \
		vcipher(y, y, 2) \
		vcipher(x, x, 3) \
		vcipher(y, y, 3) \
		vcipher(x, x, 4) \
		vcipher(y, y, 4) \
		vcipher(x, x, 5) \
		vcipher(y, y, 5) \
		vcipher(x, x, 6) \
		vcipher(y, y, 6) \
		vcipher(x, x, 7) \
		vcipher(y, y, 7) \
		vcipher(x, x, 8) \
		vcipher(y, y, 8) \
		vcipher(x, x, 9) \
		vcipher(y, y, 9) \
		vcipherlast(x, x, 10) \
		vcipherlast(y, y, 10)

#define BLOCK_ENCRYPT_X2_192(x, y) \
		vxor(x, x, 0) \
		vxor(y, y, 0) \
		vcipher(x, x, 1) \
		vcipher(y, y, 1) \
		vcipher(x, x, 2) \
		vcipher(y, y, 2) \
		vcipher(x, x, 3) \
		vcipher(y, y, 3) \
		vcipher(x, x, 4) \
		vcipher(y, y, 4) \
		vcipher(x, x, 5) \
		vcipher(y, y, 5) \
		vcipher(x, x, 6) \
		vcipher(y, y, 6) \
		vcipher(x, x, 7) \
		vcipher(y, y, 7) \
		vcipher(x, x, 8) \
		vcipher(y, y, 8) \
		vcipher(x, x, 9) \
		vcipher(y, y, 9) \
		vcipher(x, x, 10) \
		vcipher(y, y, 10) \
		vcipher(x, x, 11) \
		vcipher(y, y, 11) \
		vcipherlast(x, x, 12) \
		vcipherlast(y, y, 12)

#define BLOCK_ENCRYPT_X2_256(x, y) \
		vxor(x, x, 0) \
		vxor(y, y, 0) \
		vcipher(x, x, 1) \
		vcipher(y, y, 1) \
		vcipher(x, x, 2) \
		vcipher(y, y, 2) \
		vcipher(x, x, 3) \
		vcipher(y, y, 3) \
		vcipher(x, x, 4) \
		vcipher(y, y, 4) \
		vcipher(x, x, 5) \
		vcipher(y, y, 5) \
		vcipher(x, x, 6) \
		vcipher(y, y, 6) \
		vcipher(x, x, 7) \
		vcipher(y, y, 7) \
		vcipher(x, x, 8) \
		vcipher(y, y, 8) \
		vcipher(x, x, 9) \
		vcipher(y, y, 9) \
		vcipher(x, x, 10) \
		vcipher(y, y, 10) \
		vcipher(x, x, 11) \
		vcipher(y, y, 11) \
		vcipher(x, x, 12) \
		vcipher(y, y, 12) \
		vcipher(x, x, 13) \
		vcipher(y, y, 13) \
		vcipherlast(x, x, 14) \
		vcipherlast(y, y, 14)
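/*
 * Note (added commentary): the X2 and X4 variants below interleave two
 * or four independent AES instances round by round. Each vcipher
 * depends only on the previous round of the same instance, so the
 * interleaving fills the latency of the vcipher pipeline with work
 * from the other instances instead of stalling on a single dependency
 * chain.
 */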
#define BLOCK_ENCRYPT_X4_128(x0, x1, x2, x3) \
		vxor(x0, x0, 0) \
		vxor(x1, x1, 0) \
		vxor(x2, x2, 0) \
		vxor(x3, x3, 0) \
		vcipher(x0, x0, 1) \
		vcipher(x1, x1, 1) \
		vcipher(x2, x2, 1) \
		vcipher(x3, x3, 1) \
		vcipher(x0, x0, 2) \
		vcipher(x1, x1, 2) \
		vcipher(x2, x2, 2) \
		vcipher(x3, x3, 2) \
		vcipher(x0, x0, 3) \
		vcipher(x1, x1, 3) \
		vcipher(x2, x2, 3) \
		vcipher(x3, x3, 3) \
		vcipher(x0, x0, 4) \
		vcipher(x1, x1, 4) \
		vcipher(x2, x2, 4) \
		vcipher(x3, x3, 4) \
		vcipher(x0, x0, 5) \
		vcipher(x1, x1, 5) \
		vcipher(x2, x2, 5) \
		vcipher(x3, x3, 5) \
		vcipher(x0, x0, 6) \
		vcipher(x1, x1, 6) \
		vcipher(x2, x2, 6) \
		vcipher(x3, x3, 6) \
		vcipher(x0, x0, 7) \
		vcipher(x1, x1, 7) \
		vcipher(x2, x2, 7) \
		vcipher(x3, x3, 7) \
		vcipher(x0, x0, 8) \
		vcipher(x1, x1, 8) \
		vcipher(x2, x2, 8) \
		vcipher(x3, x3, 8) \
		vcipher(x0, x0, 9) \
		vcipher(x1, x1, 9) \
		vcipher(x2, x2, 9) \
		vcipher(x3, x3, 9) \
		vcipherlast(x0, x0, 10) \
		vcipherlast(x1, x1, 10) \
		vcipherlast(x2, x2, 10) \
		vcipherlast(x3, x3, 10)

#define BLOCK_ENCRYPT_X4_192(x0, x1, x2, x3) \
		vxor(x0, x0, 0) \
		vxor(x1, x1, 0) \
		vxor(x2, x2, 0) \
		vxor(x3, x3, 0) \
		vcipher(x0, x0, 1) \
		vcipher(x1, x1, 1) \
		vcipher(x2, x2, 1) \
		vcipher(x3, x3, 1) \
		vcipher(x0, x0, 2) \
		vcipher(x1, x1, 2) \
		vcipher(x2, x2, 2) \
		vcipher(x3, x3, 2) \
		vcipher(x0, x0, 3) \
		vcipher(x1, x1, 3) \
		vcipher(x2, x2, 3) \
		vcipher(x3, x3, 3) \
		vcipher(x0, x0, 4) \
		vcipher(x1, x1, 4) \
		vcipher(x2, x2, 4) \
		vcipher(x3, x3, 4) \
		vcipher(x0, x0, 5) \
		vcipher(x1, x1, 5) \
		vcipher(x2, x2, 5) \
		vcipher(x3, x3, 5) \
		vcipher(x0, x0, 6) \
		vcipher(x1, x1, 6) \
		vcipher(x2, x2, 6) \
		vcipher(x3, x3, 6) \
		vcipher(x0, x0, 7) \
		vcipher(x1, x1, 7) \
		vcipher(x2, x2, 7) \
		vcipher(x3, x3, 7) \
		vcipher(x0, x0, 8) \
		vcipher(x1, x1, 8) \
		vcipher(x2, x2, 8) \
		vcipher(x3, x3, 8) \
		vcipher(x0, x0, 9) \
		vcipher(x1, x1, 9) \
		vcipher(x2, x2, 9) \
		vcipher(x3, x3, 9) \
		vcipher(x0, x0, 10) \
		vcipher(x1, x1, 10) \
		vcipher(x2, x2, 10) \
		vcipher(x3, x3, 10) \
		vcipher(x0, x0, 11) \
		vcipher(x1, x1, 11) \
		vcipher(x2, x2, 11) \
		vcipher(x3, x3, 11) \
		vcipherlast(x0, x0, 12) \
		vcipherlast(x1, x1, 12) \
		vcipherlast(x2, x2, 12) \
		vcipherlast(x3, x3, 12)

#define BLOCK_ENCRYPT_X4_256(x0, x1, x2, x3) \
		vxor(x0, x0, 0) \
		vxor(x1, x1, 0) \
		vxor(x2, x2, 0) \
		vxor(x3, x3, 0) \
		vcipher(x0, x0, 1) \
		vcipher(x1, x1, 1) \
		vcipher(x2, x2, 1) \
		vcipher(x3, x3, 1) \
		vcipher(x0, x0, 2) \
		vcipher(x1, x1, 2) \
		vcipher(x2, x2, 2) \
		vcipher(x3, x3, 2) \
		vcipher(x0, x0, 3) \
		vcipher(x1, x1, 3) \
		vcipher(x2, x2, 3) \
		vcipher(x3, x3, 3) \
		vcipher(x0, x0, 4) \
		vcipher(x1, x1, 4) \
		vcipher(x2, x2, 4) \
		vcipher(x3, x3, 4) \
		vcipher(x0, x0, 5) \
		vcipher(x1, x1, 5) \
		vcipher(x2, x2, 5) \
		vcipher(x3, x3, 5) \
		vcipher(x0, x0, 6) \
		vcipher(x1, x1, 6) \
		vcipher(x2, x2, 6) \
		vcipher(x3, x3, 6) \
		vcipher(x0, x0, 7) \
		vcipher(x1, x1, 7) \
		vcipher(x2, x2, 7) \
		vcipher(x3, x3, 7) \
		vcipher(x0, x0, 8) \
		vcipher(x1, x1, 8) \
		vcipher(x2, x2, 8) \
		vcipher(x3, x3, 8) \
		vcipher(x0, x0, 9) \
		vcipher(x1, x1, 9) \
		vcipher(x2, x2, 9) \
		vcipher(x3, x3, 9) \
		vcipher(x0, x0, 10) \
		vcipher(x1, x1, 10) \
		vcipher(x2, x2, 10) \
		vcipher(x3, x3, 10) \
		vcipher(x0, x0, 11) \
		vcipher(x1, x1, 11) \
		vcipher(x2, x2, 11) \
		vcipher(x3, x3, 11) \
		vcipher(x0, x0, 12) \
		vcipher(x1, x1, 12) \
		vcipher(x2, x2, 12) \
		vcipher(x3, x3, 12) \
		vcipher(x0, x0, 13) \
		vcipher(x1, x1, 13) \
		vcipher(x2, x2, 13) \
		vcipher(x3, x3, 13) \
		vcipherlast(x0, x0, 14) \
		vcipherlast(x1, x1, 14) \
		vcipherlast(x2, x2, 14) \
		vcipherlast(x3, x3, 14)

#if BR_POWER8_LE
static const uint32_t idx2be[] = {
	0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
};
#define BYTESWAP_INIT     lxvw4x(47, 0, %[idx2be])
#define BYTESWAP(x)       vperm(x, x, x, 15)
#define BYTESWAPX(d, s)   vperm(d, s, s, 15)
#define BYTESWAP_REG      , [idx2be] "b" (idx2be)
#else
#define BYTESWAP_INIT
#define BYTESWAP(x)
#define BYTESWAPX(d, s)   vand(d, s, s)
#define BYTESWAP_REG
#endif
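/*
 * Note (added commentary): on little-endian POWER8, lxvw4x preserves
 * word order but yields the bytes of each 32-bit word in little-endian
 * order; the vperm through idx2be (loaded into v15 by BYTESWAP_INIT)
 * restores big-endian byte order so that the AES opcodes and the
 * counter arithmetic see the canonical block layout. On big-endian
 * targets no swap is needed, and BYTESWAPX degenerates to a plain
 * register copy (vand of a register with itself).
 */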
static const uint32_t ctrinc[] = {
	0, 0, 0, 1
};
static const uint32_t ctrinc_x4[] = {
	0, 0, 0, 4
};
#define INCR_128_INIT      lxvw4x(60, 0, %[ctrinc])
#define INCR_128_X4_INIT   lxvw4x(60, 0, %[ctrinc_x4])
#define INCR_128(d, s) \
		vaddcuw(29, s, 28) \
		vadduwm(d, s, 28) \
		vsldoi(30, 29, 29, 4) \
		vaddcuw(29, d, 30) \
		vadduwm(d, d, 30) \
		vsldoi(30, 29, 29, 4) \
		vaddcuw(29, d, 30) \
		vadduwm(d, d, 30) \
		vsldoi(30, 29, 29, 4) \
		vadduwm(d, d, 30)
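/*
 * Note (added commentary): INCR_128 builds a 128-bit big-endian
 * increment out of 32-bit lanes. vaddcuw produces the per-word carries
 * of the addition, vsldoi rotates those carries one word toward the
 * more significant lane, and the add/rotate pair is repeated so a
 * carry out of the least significant word can ripple across all four
 * words. A rough portable sketch of the same idea (illustrative only,
 * not used by this file):
 *
 *	static void
 *	incr128_be(uint32_t w[4])   // w[0] is the most significant word
 *	{
 *		uint32_t carry = 1;
 *		int i;
 *
 *		for (i = 3; i >= 0; i --) {
 *			w[i] += carry;
 *			carry = (w[i] < carry);   // carry out of this lane
 *		}
 *	}
 */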
#define MKCTR(size) \
static void \
ctr_ ## size(const unsigned char *sk, \
	unsigned char *ctrbuf, unsigned char *buf, size_t num_blocks_x4) \
{ \
	long cc, cc0, cc1, cc2, cc3; \
\
	cc = 0; \
	cc0 = 0; \
	cc1 = 16; \
	cc2 = 32; \
	cc3 = 48; \
	asm volatile ( \
\
		/* \
		 * Load subkeys into v0..v10/v12/v14 (depending on key size). \
		 */ \
		LOAD_SUBKEYS_ ## size \
		li(%[cc], 0) \
\
		BYTESWAP_INIT \
		INCR_128_X4_INIT \
\
		/* \
		 * Load current CTR counters into v16 to v19. \
		 */ \
		lxvw4x(48, %[cc0], %[ctrbuf]) \
		lxvw4x(49, %[cc1], %[ctrbuf]) \
		lxvw4x(50, %[cc2], %[ctrbuf]) \
		lxvw4x(51, %[cc3], %[ctrbuf]) \
		BYTESWAP(16) \
		BYTESWAP(17) \
		BYTESWAP(18) \
		BYTESWAP(19) \
\
		mtctr(%[num_blocks_x4]) \
\
	label(loop) \
		/* \
		 * Compute next counter values into v20..v23. \
		 */ \
		INCR_128(20, 16) \
		INCR_128(21, 17) \
		INCR_128(22, 18) \
		INCR_128(23, 19) \
\
		/* \
		 * Encrypt counter values and XOR into next data blocks. \
		 */ \
		lxvw4x(56, %[cc0], %[buf]) \
		lxvw4x(57, %[cc1], %[buf]) \
		lxvw4x(58, %[cc2], %[buf]) \
		lxvw4x(59, %[cc3], %[buf]) \
		BYTESWAP(24) \
		BYTESWAP(25) \
		BYTESWAP(26) \
		BYTESWAP(27) \
		BLOCK_ENCRYPT_X4_ ## size(16, 17, 18, 19) \
		vxor(16, 16, 24) \
		vxor(17, 17, 25) \
		vxor(18, 18, 26) \
		vxor(19, 19, 27) \
		BYTESWAP(16) \
		BYTESWAP(17) \
		BYTESWAP(18) \
		BYTESWAP(19) \
		stxvw4x(48, %[cc0], %[buf]) \
		stxvw4x(49, %[cc1], %[buf]) \
		stxvw4x(50, %[cc2], %[buf]) \
		stxvw4x(51, %[cc3], %[buf]) \
\
		/* \
		 * Update counters and data pointer. \
		 */ \
		vand(16, 20, 20) \
		vand(17, 21, 21) \
		vand(18, 22, 22) \
		vand(19, 23, 23) \
		addi(%[buf], %[buf], 64) \
\
		bdnz(loop) \
\
		/* \
		 * Write back new counter values. \
		 */ \
		BYTESWAP(16) \
		BYTESWAP(17) \
		BYTESWAP(18) \
		BYTESWAP(19) \
		stxvw4x(48, %[cc0], %[ctrbuf]) \
		stxvw4x(49, %[cc1], %[ctrbuf]) \
		stxvw4x(50, %[cc2], %[ctrbuf]) \
		stxvw4x(51, %[cc3], %[ctrbuf]) \
\
		: [cc] "+b" (cc), [buf] "+b" (buf), \
		  [cc0] "+b" (cc0), [cc1] "+b" (cc1), [cc2] "+b" (cc2), \
		  [cc3] "+b" (cc3) \
		: [sk] "b" (sk), [ctrbuf] "b" (ctrbuf), \
		  [num_blocks_x4] "b" (num_blocks_x4), [ctrinc_x4] "b" (ctrinc_x4) \
		  BYTESWAP_REG \
		: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \
		  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \
		  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \
		  "v30", "ctr", "memory" \
	); \
}

MKCTR(128)
MKCTR(192)
MKCTR(256)

#define MKCBCMAC(size) \
static void \
cbcmac_ ## size(const unsigned char *sk, \
	unsigned char *cbcmac, const unsigned char *buf, size_t num_blocks) \
{ \
	long cc; \
\
	cc = 0; \
	asm volatile ( \
\
		/* \
		 * Load subkeys into v0..v10/v12/v14 (depending on key size). \
		 */ \
		LOAD_SUBKEYS_ ## size \
		li(%[cc], 0) \
\
		BYTESWAP_INIT \
\
		/* \
		 * Load current CBC-MAC value into v16. \
		 */ \
		lxvw4x(48, %[cc], %[cbcmac]) \
		BYTESWAP(16) \
\
		mtctr(%[num_blocks]) \
\
	label(loop) \
		/* \
		 * Load next block, XOR into current CBC-MAC value, \
		 * and then encrypt it. \
		 */ \
		lxvw4x(49, %[cc], %[buf]) \
		BYTESWAP(17) \
		vxor(16, 16, 17) \
		BLOCK_ENCRYPT_ ## size(16) \
		addi(%[buf], %[buf], 16) \
\
		bdnz(loop) \
\
		/* \
		 * Write back new CBC-MAC value. \
		 */ \
		BYTESWAP(16) \
		stxvw4x(48, %[cc], %[cbcmac]) \
\
		: [cc] "+b" (cc), [buf] "+b" (buf) \
		: [sk] "b" (sk), [cbcmac] "b" (cbcmac), [num_blocks] "b" (num_blocks) \
		  BYTESWAP_REG \
		: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \
		  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \
		  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \
		  "v30", "ctr", "memory" \
	); \
}

MKCBCMAC(128)
MKCBCMAC(192)
MKCBCMAC(256)
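/*
 * Note (added commentary): ctr_NNN() consumes exactly num_blocks_x4
 * groups of four 16-byte blocks and expects ctrbuf to hold four
 * successive counter values; cbcmac_NNN() consumes exactly num_blocks
 * 16-byte blocks. The public wrapper functions further below are
 * responsible for rounding lengths and for handling any partial final
 * chunk.
 */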
#define MKENCRYPT(size) \
static void \
ctrcbc_ ## size ## _encrypt(const unsigned char *sk, \
	unsigned char *ctr, unsigned char *cbcmac, unsigned char *buf, \
	size_t num_blocks) \
{ \
	long cc; \
\
	cc = 0; \
	asm volatile ( \
\
		/* \
		 * Load subkeys into v0..v10/v12/v14 (depending on key size). \
		 */ \
		LOAD_SUBKEYS_ ## size \
		li(%[cc], 0) \
\
		BYTESWAP_INIT \
		INCR_128_INIT \
\
		/* \
		 * Load current CTR counter into v16, and current \
		 * CBC-MAC IV into v17. \
		 */ \
		lxvw4x(48, %[cc], %[ctr]) \
		lxvw4x(49, %[cc], %[cbcmac]) \
		BYTESWAP(16) \
		BYTESWAP(17) \
\
		/* \
		 * At each iteration, we do two parallel encryptions: \
		 * - new counter value for encryption of the next block; \
		 * - CBC-MAC over the previous encrypted block. \
		 * Thus, each plaintext block implies two AES instances, \
		 * over two successive iterations. This requires a single \
		 * counter encryption before the loop, and a single \
		 * CBC-MAC encryption after the loop. \
		 */ \
\
		/* \
		 * Encrypt first block (into v20). \
		 */ \
		lxvw4x(52, %[cc], %[buf]) \
		BYTESWAP(20) \
		INCR_128(22, 16) \
		BLOCK_ENCRYPT_ ## size(16) \
		vxor(20, 20, 16) \
		BYTESWAPX(21, 20) \
		stxvw4x(53, %[cc], %[buf]) \
		vand(16, 22, 22) \
		addi(%[buf], %[buf], 16) \
\
		/* \
		 * Load loop counter; skip the loop if there is only \
		 * one block in total (already handled by the boundary \
		 * conditions). \
		 */ \
		mtctr(%[num_blocks]) \
		bdz(fastexit) \
\
	label(loop) \
		/* \
		 * Upon loop entry: \
		 *    v16   counter value for next block \
		 *    v17   current CBC-MAC value \
		 *    v20   encrypted previous block \
		 */ \
		vxor(17, 17, 20) \
		INCR_128(22, 16) \
		lxvw4x(52, %[cc], %[buf]) \
		BYTESWAP(20) \
		BLOCK_ENCRYPT_X2_ ## size(16, 17) \
		vxor(20, 20, 16) \
		BYTESWAPX(21, 20) \
		stxvw4x(53, %[cc], %[buf]) \
		addi(%[buf], %[buf], 16) \
		vand(16, 22, 22) \
\
		bdnz(loop) \
\
	label(fastexit) \
		vxor(17, 17, 20) \
		BLOCK_ENCRYPT_ ## size(17) \
		BYTESWAP(16) \
		BYTESWAP(17) \
		stxvw4x(48, %[cc], %[ctr]) \
		stxvw4x(49, %[cc], %[cbcmac]) \
\
		: [cc] "+b" (cc), [buf] "+b" (buf) \
		: [sk] "b" (sk), [ctr] "b" (ctr), [cbcmac] "b" (cbcmac), \
		  [num_blocks] "b" (num_blocks), [ctrinc] "b" (ctrinc) \
		  BYTESWAP_REG \
		: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \
		  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \
		  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \
		  "v30", "ctr", "memory" \
	); \
}

MKENCRYPT(128)
MKENCRYPT(192)
MKENCRYPT(256)
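/*
 * Note (added commentary): in scalar pseudo-C, the software pipeline
 * implemented by ctrcbc_NNN_encrypt() above is (illustrative only):
 *
 *	e = AES(ctr); ctr++;
 *	c[0] = p[0] ^ e;
 *	for (i = 1; i < n; i ++) {
 *		// the two AES calls below run as one interleaved X2 body
 *		e = AES(ctr); ctr++;
 *		mac = AES(mac ^ c[i - 1]);
 *		c[i] = p[i] ^ e;
 *	}
 *	mac = AES(mac ^ c[n - 1]);
 *
 * The CBC-MAC lags the CTR encryption by one block, which is what
 * allows the two AES instances to run in parallel.
 */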
#define MKDECRYPT(size) \
static void \
ctrcbc_ ## size ## _decrypt(const unsigned char *sk, \
	unsigned char *ctr, unsigned char *cbcmac, unsigned char *buf, \
	size_t num_blocks) \
{ \
	long cc; \
\
	cc = 0; \
	asm volatile ( \
\
		/* \
		 * Load subkeys into v0..v10/v12/v14 (depending on key size). \
		 */ \
		LOAD_SUBKEYS_ ## size \
		li(%[cc], 0) \
\
		BYTESWAP_INIT \
		INCR_128_INIT \
\
		/* \
		 * Load current CTR counter into v16, and current \
		 * CBC-MAC IV into v17. \
		 */ \
		lxvw4x(48, %[cc], %[ctr]) \
		lxvw4x(49, %[cc], %[cbcmac]) \
		BYTESWAP(16) \
		BYTESWAP(17) \
\
		/* \
		 * At each iteration, we do two parallel encryptions: \
		 * - new counter value for decryption of the next block; \
		 * - CBC-MAC over the next encrypted block. \
		 * Each iteration performs the two AES instances related \
		 * to the current block; there is thus no need for the \
		 * extra pre-loop and post-loop work used by encryption. \
		 */ \
\
		mtctr(%[num_blocks]) \
\
	label(loop) \
		/* \
		 * Upon loop entry: \
		 *    v16   counter value for next block \
		 *    v17   current CBC-MAC value \
		 */ \
		lxvw4x(52, %[cc], %[buf]) \
		BYTESWAP(20) \
		vxor(17, 17, 20) \
		INCR_128(22, 16) \
		BLOCK_ENCRYPT_X2_ ## size(16, 17) \
		vxor(20, 20, 16) \
		BYTESWAPX(21, 20) \
		stxvw4x(53, %[cc], %[buf]) \
		addi(%[buf], %[buf], 16) \
		vand(16, 22, 22) \
\
		bdnz(loop) \
\
		/* \
		 * Store back counter and CBC-MAC value. \
		 */ \
		BYTESWAP(16) \
		BYTESWAP(17) \
		stxvw4x(48, %[cc], %[ctr]) \
		stxvw4x(49, %[cc], %[cbcmac]) \
\
		: [cc] "+b" (cc), [buf] "+b" (buf) \
		: [sk] "b" (sk), [ctr] "b" (ctr), [cbcmac] "b" (cbcmac), \
		  [num_blocks] "b" (num_blocks), [ctrinc] "b" (ctrinc) \
		  BYTESWAP_REG \
		: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", \
		  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", \
		  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", \
		  "v30", "ctr", "memory" \
	); \
}

MKDECRYPT(128)
MKDECRYPT(192)
MKDECRYPT(256)

/* see bearssl_block.h */
void
br_aes_pwr8_ctrcbc_encrypt(const br_aes_pwr8_ctrcbc_keys *ctx,
	void *ctr, void *cbcmac, void *data, size_t len)
{
	if (len == 0) {
		return;
	}
	switch (ctx->num_rounds) {
	case 10:
		ctrcbc_128_encrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
		break;
	case 12:
		ctrcbc_192_encrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
		break;
	default:
		ctrcbc_256_encrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
		break;
	}
}

/* see bearssl_block.h */
void
br_aes_pwr8_ctrcbc_decrypt(const br_aes_pwr8_ctrcbc_keys *ctx,
	void *ctr, void *cbcmac, void *data, size_t len)
{
	if (len == 0) {
		return;
	}
	switch (ctx->num_rounds) {
	case 10:
		ctrcbc_128_decrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
		break;
	case 12:
		ctrcbc_192_decrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
		break;
	default:
		ctrcbc_256_decrypt(ctx->skey.skni, ctr, cbcmac, data, len >> 4);
		break;
	}
}

static inline void
incr_ctr(void *dst, const void *src)
{
	uint64_t hi, lo;

	hi = br_dec64be(src);
	lo = br_dec64be((const unsigned char *)src + 8);
	lo ++;
	hi += ((lo | -lo) >> 63) ^ (uint64_t)1;
	br_enc64be(dst, hi);
	br_enc64be((unsigned char *)dst + 8, lo);
}
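/*
 * Note (added commentary): in incr_ctr(), the expression
 * ((lo | -lo) >> 63) evaluates to 1 when lo != 0 and to 0 when
 * lo == 0; XORing with 1 inverts that, so hi is incremented exactly
 * when the low 64-bit half wrapped around to zero. This propagates
 * the carry without a data-dependent branch.
 */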
/* see bearssl_block.h */
void
br_aes_pwr8_ctrcbc_ctr(const br_aes_pwr8_ctrcbc_keys *ctx,
	void *ctr, void *data, size_t len)
{
	unsigned char ctrbuf[64];

	memcpy(ctrbuf, ctr, 16);
	incr_ctr(ctrbuf + 16, ctrbuf);
	incr_ctr(ctrbuf + 32, ctrbuf + 16);
	incr_ctr(ctrbuf + 48, ctrbuf + 32);
	if (len >= 64) {
		switch (ctx->num_rounds) {
		case 10:
			ctr_128(ctx->skey.skni, ctrbuf, data, len >> 6);
			break;
		case 12:
			ctr_192(ctx->skey.skni, ctrbuf, data, len >> 6);
			break;
		default:
			ctr_256(ctx->skey.skni, ctrbuf, data, len >> 6);
			break;
		}
		data = (unsigned char *)data + (len & ~(size_t)63);
		len &= 63;
	}
	if (len > 0) {
		unsigned char tmp[64];

		if (len >= 32) {
			if (len >= 48) {
				memcpy(ctr, ctrbuf + 48, 16);
			} else {
				memcpy(ctr, ctrbuf + 32, 16);
			}
		} else {
			if (len >= 16) {
				memcpy(ctr, ctrbuf + 16, 16);
			}
		}
		memcpy(tmp, data, len);
		memset(tmp + len, 0, (sizeof tmp) - len);
		switch (ctx->num_rounds) {
		case 10:
			ctr_128(ctx->skey.skni, ctrbuf, tmp, 1);
			break;
		case 12:
			ctr_192(ctx->skey.skni, ctrbuf, tmp, 1);
			break;
		default:
			ctr_256(ctx->skey.skni, ctrbuf, tmp, 1);
			break;
		}
		memcpy(data, tmp, len);
	} else {
		memcpy(ctr, ctrbuf, 16);
	}
}

/* see bearssl_block.h */
void
br_aes_pwr8_ctrcbc_mac(const br_aes_pwr8_ctrcbc_keys *ctx,
	void *cbcmac, const void *data, size_t len)
{
	if (len > 0) {
		switch (ctx->num_rounds) {
		case 10:
			cbcmac_128(ctx->skey.skni, cbcmac, data, len >> 4);
			break;
		case 12:
			cbcmac_192(ctx->skey.skni, cbcmac, data, len >> 4);
			break;
		default:
			cbcmac_256(ctx->skey.skni, cbcmac, data, len >> 4);
			break;
		}
	}
}

/* see bearssl_block.h */
const br_block_ctrcbc_class br_aes_pwr8_ctrcbc_vtable = {
	sizeof(br_aes_pwr8_ctrcbc_keys),
	16,
	4,
	(void (*)(const br_block_ctrcbc_class **, const void *, size_t))
		&br_aes_pwr8_ctrcbc_init,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, void *, void *, size_t))
		&br_aes_pwr8_ctrcbc_encrypt,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, void *, void *, size_t))
		&br_aes_pwr8_ctrcbc_decrypt,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, void *, size_t))
		&br_aes_pwr8_ctrcbc_ctr,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, const void *, size_t))
		&br_aes_pwr8_ctrcbc_mac
};

#else

/* see bearssl_block.h */
const br_block_ctrcbc_class *
br_aes_pwr8_ctrcbc_get_vtable(void)
{
	return NULL;
}

#endif
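/*
 * Note (added commentary): a minimal usage sketch through the generic
 * br_block_ctrcbc_class interface (illustrative only; key material,
 * counter/CBC-MAC buffers and error handling are the caller's
 * responsibility, and data length must be a multiple of 16):
 *
 *	br_aes_pwr8_ctrcbc_keys kc;
 *	const br_block_ctrcbc_class *vt;
 *	unsigned char ctr[16], cbcmac[16];
 *
 *	vt = br_aes_pwr8_ctrcbc_get_vtable();
 *	if (vt == NULL) {
 *		// POWER8 AES opcodes not available; fall back to
 *		// another AES CTR+CBC-MAC implementation
 *	}
 *	vt->init(&kc.vtable, key, sizeof key);
 *	vt->encrypt(&kc.vtable, ctr, cbcmac, data, len);
 */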