1 /* 2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining 5 * a copy of this software and associated documentation files (the 6 * "Software"), to deal in the Software without restriction, including 7 * without limitation the rights to use, copy, modify, merge, publish, 8 * distribute, sublicense, and/or sell copies of the Software, and to 9 * permit persons to whom the Software is furnished to do so, subject to 10 * the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be 13 * included in all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24 25 #define BR_POWER_ASM_MACROS 1 26 #include "inner.h" 27 28 /* 29 * This code contains the AES key schedule implementation using the 30 * POWER8 opcodes. 31 */ 32 33 #if BR_POWER8 34 35 static void 36 key_schedule_128(unsigned char *sk, const unsigned char *key) 37 { 38 long cc; 39 40 static const uint32_t fmod[] = { 0x11B, 0x11B, 0x11B, 0x11B }; 41 #if BR_POWER8_LE 42 static const uint32_t idx2be[] = { 43 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C 44 }; 45 #endif 46 47 cc = 0; 48 49 /* 50 * We use the VSX instructions for loading and storing the 51 * key/subkeys, since they support unaligned accesses. The rest 52 * of the computation is VMX only. VMX register 0 is VSX 53 * register 32. 54 */ 55 asm volatile ( 56 57 /* 58 * v0 = all-zero word 59 * v1 = constant -8 / +8, copied into four words 60 * v2 = current subkey 61 * v3 = Rcon (x4 words) 62 * v6 = constant 8, copied into four words 63 * v7 = constant 0x11B, copied into four words 64 * v8 = constant for byteswapping words 65 */ 66 vspltisw(0, 0) 67 #if BR_POWER8_LE 68 vspltisw(1, -8) 69 #else 70 vspltisw(1, 8) 71 #endif 72 lxvw4x(34, 0, %[key]) 73 vspltisw(3, 1) 74 vspltisw(6, 8) 75 lxvw4x(39, 0, %[fmod]) 76 #if BR_POWER8_LE 77 lxvw4x(40, 0, %[idx2be]) 78 #endif 79 80 /* 81 * First subkey is a copy of the key itself. 82 */ 83 #if BR_POWER8_LE 84 vperm(4, 2, 2, 8) 85 stxvw4x(36, 0, %[sk]) 86 #else 87 stxvw4x(34, 0, %[sk]) 88 #endif 89 90 /* 91 * Loop must run 10 times. 92 */ 93 li(%[cc], 10) 94 mtctr(%[cc]) 95 label(loop) 96 /* Increment subkey address */ 97 addi(%[sk], %[sk], 16) 98 99 /* Compute SubWord(RotWord(temp)) xor Rcon (into v4, splat) */ 100 vrlw(4, 2, 1) 101 vsbox(4, 4) 102 #if BR_POWER8_LE 103 vxor(4, 4, 3) 104 #else 105 vsldoi(5, 3, 0, 3) 106 vxor(4, 4, 5) 107 #endif 108 vspltw(4, 4, 3) 109 110 /* XOR words for next subkey */ 111 vsldoi(5, 0, 2, 12) 112 vxor(2, 2, 5) 113 vsldoi(5, 0, 2, 12) 114 vxor(2, 2, 5) 115 vsldoi(5, 0, 2, 12) 116 vxor(2, 2, 5) 117 vxor(2, 2, 4) 118 119 /* Store next subkey */ 120 #if BR_POWER8_LE 121 vperm(4, 2, 2, 8) 122 stxvw4x(36, 0, %[sk]) 123 #else 124 stxvw4x(34, 0, %[sk]) 125 #endif 126 127 /* Update Rcon */ 128 vadduwm(3, 3, 3) 129 vsrw(4, 3, 6) 130 vsubuwm(4, 0, 4) 131 vand(4, 4, 7) 132 vxor(3, 3, 4) 133 134 bdnz(loop) 135 136 : [sk] "+b" (sk), [cc] "+b" (cc) 137 : [key] "b" (key), [fmod] "b" (fmod) 138 #if BR_POWER8_LE 139 , [idx2be] "b" (idx2be) 140 #endif 141 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "ctr", "memory" 142 ); 143 } 144 145 static void 146 key_schedule_192(unsigned char *sk, const unsigned char *key) 147 { 148 long cc; 149 150 #if BR_POWER8_LE 151 static const uint32_t idx2be[] = { 152 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C 153 }; 154 #endif 155 156 cc = 0; 157 158 /* 159 * We use the VSX instructions for loading and storing the 160 * key/subkeys, since they support unaligned accesses. The rest 161 * of the computation is VMX only. VMX register 0 is VSX 162 * register 32. 163 */ 164 asm volatile ( 165 166 /* 167 * v0 = all-zero word 168 * v1 = constant -8 / +8, copied into four words 169 * v2, v3 = current subkey 170 * v5 = Rcon (x4 words) (already shifted on big-endian) 171 * v6 = constant 8, copied into four words 172 * v8 = constant for byteswapping words 173 * 174 * The left two words of v3 are ignored. 175 */ 176 vspltisw(0, 0) 177 #if BR_POWER8_LE 178 vspltisw(1, -8) 179 #else 180 vspltisw(1, 8) 181 #endif 182 li(%[cc], 8) 183 lxvw4x(34, 0, %[key]) 184 lxvw4x(35, %[cc], %[key]) 185 vsldoi(3, 3, 0, 8) 186 vspltisw(5, 1) 187 #if !BR_POWER8_LE 188 vsldoi(5, 5, 0, 3) 189 #endif 190 vspltisw(6, 8) 191 #if BR_POWER8_LE 192 lxvw4x(40, 0, %[idx2be]) 193 #endif 194 195 /* 196 * Loop must run 8 times. Each iteration produces 256 197 * bits of subkeys, with a 64-bit overlap. 198 */ 199 li(%[cc], 8) 200 mtctr(%[cc]) 201 li(%[cc], 16) 202 label(loop) 203 204 /* 205 * Last 6 words in v2:v3l. Compute next 6 words into 206 * v3r:v4. 207 */ 208 vrlw(10, 3, 1) 209 vsbox(10, 10) 210 vxor(10, 10, 5) 211 vspltw(10, 10, 1) 212 vsldoi(11, 0, 10, 8) 213 214 vsldoi(12, 0, 2, 12) 215 vxor(12, 2, 12) 216 vsldoi(13, 0, 12, 12) 217 vxor(12, 12, 13) 218 vsldoi(13, 0, 12, 12) 219 vxor(12, 12, 13) 220 221 vspltw(13, 12, 3) 222 vxor(13, 13, 3) 223 vsldoi(14, 0, 3, 12) 224 vxor(13, 13, 14) 225 226 vsldoi(4, 12, 13, 8) 227 vsldoi(14, 0, 3, 8) 228 vsldoi(3, 14, 12, 8) 229 230 vxor(3, 3, 11) 231 vxor(4, 4, 10) 232 233 /* 234 * Update Rcon. Since for a 192-bit key, we use only 8 235 * such constants, we will not hit the field modulus, 236 * so a simple shift (addition) works well. 237 */ 238 vadduwm(5, 5, 5) 239 240 /* 241 * Write out the two left 128-bit words 242 */ 243 #if BR_POWER8_LE 244 vperm(10, 2, 2, 8) 245 vperm(11, 3, 3, 8) 246 stxvw4x(42, 0, %[sk]) 247 stxvw4x(43, %[cc], %[sk]) 248 #else 249 stxvw4x(34, 0, %[sk]) 250 stxvw4x(35, %[cc], %[sk]) 251 #endif 252 addi(%[sk], %[sk], 24) 253 254 /* 255 * Shift words for next iteration. 256 */ 257 vsldoi(2, 3, 4, 8) 258 vsldoi(3, 4, 0, 8) 259 260 bdnz(loop) 261 262 /* 263 * The loop wrote the first 50 subkey words, but we need 264 * to produce 52, so we must do one last write. 265 */ 266 #if BR_POWER8_LE 267 vperm(10, 2, 2, 8) 268 stxvw4x(42, 0, %[sk]) 269 #else 270 stxvw4x(34, 0, %[sk]) 271 #endif 272 273 : [sk] "+b" (sk), [cc] "+b" (cc) 274 : [key] "b" (key) 275 #if BR_POWER8_LE 276 , [idx2be] "b" (idx2be) 277 #endif 278 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 279 "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory" 280 ); 281 } 282 283 static void 284 key_schedule_256(unsigned char *sk, const unsigned char *key) 285 { 286 long cc; 287 288 #if BR_POWER8_LE 289 static const uint32_t idx2be[] = { 290 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C 291 }; 292 #endif 293 294 cc = 0; 295 296 /* 297 * We use the VSX instructions for loading and storing the 298 * key/subkeys, since they support unaligned accesses. The rest 299 * of the computation is VMX only. VMX register 0 is VSX 300 * register 32. 301 */ 302 asm volatile ( 303 304 /* 305 * v0 = all-zero word 306 * v1 = constant -8 / +8, copied into four words 307 * v2, v3 = current subkey 308 * v6 = Rcon (x4 words) (already shifted on big-endian) 309 * v7 = constant 8, copied into four words 310 * v8 = constant for byteswapping words 311 * 312 * The left two words of v3 are ignored. 313 */ 314 vspltisw(0, 0) 315 #if BR_POWER8_LE 316 vspltisw(1, -8) 317 #else 318 vspltisw(1, 8) 319 #endif 320 li(%[cc], 16) 321 lxvw4x(34, 0, %[key]) 322 lxvw4x(35, %[cc], %[key]) 323 vspltisw(6, 1) 324 #if !BR_POWER8_LE 325 vsldoi(6, 6, 0, 3) 326 #endif 327 vspltisw(7, 8) 328 #if BR_POWER8_LE 329 lxvw4x(40, 0, %[idx2be]) 330 #endif 331 332 /* 333 * Loop must run 7 times. Each iteration produces two 334 * subkeys. 335 */ 336 li(%[cc], 7) 337 mtctr(%[cc]) 338 li(%[cc], 16) 339 label(loop) 340 341 /* 342 * Current words are in v2:v3. Compute next word in v4. 343 */ 344 vrlw(10, 3, 1) 345 vsbox(10, 10) 346 vxor(10, 10, 6) 347 vspltw(10, 10, 3) 348 349 vsldoi(4, 0, 2, 12) 350 vxor(4, 2, 4) 351 vsldoi(5, 0, 4, 12) 352 vxor(4, 4, 5) 353 vsldoi(5, 0, 4, 12) 354 vxor(4, 4, 5) 355 vxor(4, 4, 10) 356 357 /* 358 * Then other word in v5. 359 */ 360 vsbox(10, 4) 361 vspltw(10, 10, 3) 362 363 vsldoi(5, 0, 3, 12) 364 vxor(5, 3, 5) 365 vsldoi(11, 0, 5, 12) 366 vxor(5, 5, 11) 367 vsldoi(11, 0, 5, 12) 368 vxor(5, 5, 11) 369 vxor(5, 5, 10) 370 371 /* 372 * Update Rcon. Since for a 256-bit key, we use only 7 373 * such constants, we will not hit the field modulus, 374 * so a simple shift (addition) works well. 375 */ 376 vadduwm(6, 6, 6) 377 378 /* 379 * Write out the two left 128-bit words 380 */ 381 #if BR_POWER8_LE 382 vperm(10, 2, 2, 8) 383 vperm(11, 3, 3, 8) 384 stxvw4x(42, 0, %[sk]) 385 stxvw4x(43, %[cc], %[sk]) 386 #else 387 stxvw4x(34, 0, %[sk]) 388 stxvw4x(35, %[cc], %[sk]) 389 #endif 390 addi(%[sk], %[sk], 32) 391 392 /* 393 * Replace v2:v3 with v4:v5. 394 */ 395 vxor(2, 0, 4) 396 vxor(3, 0, 5) 397 398 bdnz(loop) 399 400 /* 401 * The loop wrote the first 14 subkeys, but we need 15, 402 * so we must do an extra write. 403 */ 404 #if BR_POWER8_LE 405 vperm(10, 2, 2, 8) 406 stxvw4x(42, 0, %[sk]) 407 #else 408 stxvw4x(34, 0, %[sk]) 409 #endif 410 411 : [sk] "+b" (sk), [cc] "+b" (cc) 412 : [key] "b" (key) 413 #if BR_POWER8_LE 414 , [idx2be] "b" (idx2be) 415 #endif 416 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 417 "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory" 418 ); 419 } 420 421 /* see inner.h */ 422 int 423 br_aes_pwr8_supported(void) 424 { 425 return 1; 426 } 427 428 /* see inner.h */ 429 unsigned 430 br_aes_pwr8_keysched(unsigned char *sk, const void *key, size_t len) 431 { 432 switch (len) { 433 case 16: 434 key_schedule_128(sk, key); 435 return 10; 436 case 24: 437 key_schedule_192(sk, key); 438 return 12; 439 default: 440 key_schedule_256(sk, key); 441 return 14; 442 } 443 } 444 445 #endif 446