1*0957b409SSimon J. Gerraty /* 2*0957b409SSimon J. Gerraty * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> 3*0957b409SSimon J. Gerraty * 4*0957b409SSimon J. Gerraty * Permission is hereby granted, free of charge, to any person obtaining 5*0957b409SSimon J. Gerraty * a copy of this software and associated documentation files (the 6*0957b409SSimon J. Gerraty * "Software"), to deal in the Software without restriction, including 7*0957b409SSimon J. Gerraty * without limitation the rights to use, copy, modify, merge, publish, 8*0957b409SSimon J. Gerraty * distribute, sublicense, and/or sell copies of the Software, and to 9*0957b409SSimon J. Gerraty * permit persons to whom the Software is furnished to do so, subject to 10*0957b409SSimon J. Gerraty * the following conditions: 11*0957b409SSimon J. Gerraty * 12*0957b409SSimon J. Gerraty * The above copyright notice and this permission notice shall be 13*0957b409SSimon J. Gerraty * included in all copies or substantial portions of the Software. 14*0957b409SSimon J. Gerraty * 15*0957b409SSimon J. Gerraty * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16*0957b409SSimon J. Gerraty * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17*0957b409SSimon J. Gerraty * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18*0957b409SSimon J. Gerraty * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 19*0957b409SSimon J. Gerraty * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 20*0957b409SSimon J. Gerraty * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 21*0957b409SSimon J. Gerraty * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22*0957b409SSimon J. Gerraty * SOFTWARE. 23*0957b409SSimon J. Gerraty */ 24*0957b409SSimon J. Gerraty 25*0957b409SSimon J. Gerraty #define BR_POWER_ASM_MACROS 1 26*0957b409SSimon J. Gerraty #include "inner.h" 27*0957b409SSimon J. Gerraty 28*0957b409SSimon J. Gerraty /* 29*0957b409SSimon J. Gerraty * This code contains the AES key schedule implementation using the 30*0957b409SSimon J. Gerraty * POWER8 opcodes. 31*0957b409SSimon J. Gerraty */ 32*0957b409SSimon J. Gerraty 33*0957b409SSimon J. Gerraty #if BR_POWER8 34*0957b409SSimon J. Gerraty 35*0957b409SSimon J. Gerraty static void 36*0957b409SSimon J. Gerraty key_schedule_128(unsigned char *sk, const unsigned char *key) 37*0957b409SSimon J. Gerraty { 38*0957b409SSimon J. Gerraty long cc; 39*0957b409SSimon J. Gerraty 40*0957b409SSimon J. Gerraty static const uint32_t fmod[] = { 0x11B, 0x11B, 0x11B, 0x11B }; 41*0957b409SSimon J. Gerraty #if BR_POWER8_LE 42*0957b409SSimon J. Gerraty static const uint32_t idx2be[] = { 43*0957b409SSimon J. Gerraty 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C 44*0957b409SSimon J. Gerraty }; 45*0957b409SSimon J. Gerraty #endif 46*0957b409SSimon J. Gerraty 47*0957b409SSimon J. Gerraty cc = 0; 48*0957b409SSimon J. Gerraty 49*0957b409SSimon J. Gerraty /* 50*0957b409SSimon J. Gerraty * We use the VSX instructions for loading and storing the 51*0957b409SSimon J. Gerraty * key/subkeys, since they support unaligned accesses. The rest 52*0957b409SSimon J. Gerraty * of the computation is VMX only. VMX register 0 is VSX 53*0957b409SSimon J. Gerraty * register 32. 54*0957b409SSimon J. Gerraty */ 55*0957b409SSimon J. Gerraty asm volatile ( 56*0957b409SSimon J. Gerraty 57*0957b409SSimon J. Gerraty /* 58*0957b409SSimon J. Gerraty * v0 = all-zero word 59*0957b409SSimon J. Gerraty * v1 = constant -8 / +8, copied into four words 60*0957b409SSimon J. Gerraty * v2 = current subkey 61*0957b409SSimon J. Gerraty * v3 = Rcon (x4 words) 62*0957b409SSimon J. Gerraty * v6 = constant 8, copied into four words 63*0957b409SSimon J. Gerraty * v7 = constant 0x11B, copied into four words 64*0957b409SSimon J. Gerraty * v8 = constant for byteswapping words 65*0957b409SSimon J. Gerraty */ 66*0957b409SSimon J. Gerraty vspltisw(0, 0) 67*0957b409SSimon J. Gerraty #if BR_POWER8_LE 68*0957b409SSimon J. Gerraty vspltisw(1, -8) 69*0957b409SSimon J. Gerraty #else 70*0957b409SSimon J. Gerraty vspltisw(1, 8) 71*0957b409SSimon J. Gerraty #endif 72*0957b409SSimon J. Gerraty lxvw4x(34, 0, %[key]) 73*0957b409SSimon J. Gerraty vspltisw(3, 1) 74*0957b409SSimon J. Gerraty vspltisw(6, 8) 75*0957b409SSimon J. Gerraty lxvw4x(39, 0, %[fmod]) 76*0957b409SSimon J. Gerraty #if BR_POWER8_LE 77*0957b409SSimon J. Gerraty lxvw4x(40, 0, %[idx2be]) 78*0957b409SSimon J. Gerraty #endif 79*0957b409SSimon J. Gerraty 80*0957b409SSimon J. Gerraty /* 81*0957b409SSimon J. Gerraty * First subkey is a copy of the key itself. 82*0957b409SSimon J. Gerraty */ 83*0957b409SSimon J. Gerraty #if BR_POWER8_LE 84*0957b409SSimon J. Gerraty vperm(4, 2, 2, 8) 85*0957b409SSimon J. Gerraty stxvw4x(36, 0, %[sk]) 86*0957b409SSimon J. Gerraty #else 87*0957b409SSimon J. Gerraty stxvw4x(34, 0, %[sk]) 88*0957b409SSimon J. Gerraty #endif 89*0957b409SSimon J. Gerraty 90*0957b409SSimon J. Gerraty /* 91*0957b409SSimon J. Gerraty * Loop must run 10 times. 92*0957b409SSimon J. Gerraty */ 93*0957b409SSimon J. Gerraty li(%[cc], 10) 94*0957b409SSimon J. Gerraty mtctr(%[cc]) 95*0957b409SSimon J. Gerraty label(loop) 96*0957b409SSimon J. Gerraty /* Increment subkey address */ 97*0957b409SSimon J. Gerraty addi(%[sk], %[sk], 16) 98*0957b409SSimon J. Gerraty 99*0957b409SSimon J. Gerraty /* Compute SubWord(RotWord(temp)) xor Rcon (into v4, splat) */ 100*0957b409SSimon J. Gerraty vrlw(4, 2, 1) 101*0957b409SSimon J. Gerraty vsbox(4, 4) 102*0957b409SSimon J. Gerraty #if BR_POWER8_LE 103*0957b409SSimon J. Gerraty vxor(4, 4, 3) 104*0957b409SSimon J. Gerraty #else 105*0957b409SSimon J. Gerraty vsldoi(5, 3, 0, 3) 106*0957b409SSimon J. Gerraty vxor(4, 4, 5) 107*0957b409SSimon J. Gerraty #endif 108*0957b409SSimon J. Gerraty vspltw(4, 4, 3) 109*0957b409SSimon J. Gerraty 110*0957b409SSimon J. Gerraty /* XOR words for next subkey */ 111*0957b409SSimon J. Gerraty vsldoi(5, 0, 2, 12) 112*0957b409SSimon J. Gerraty vxor(2, 2, 5) 113*0957b409SSimon J. Gerraty vsldoi(5, 0, 2, 12) 114*0957b409SSimon J. Gerraty vxor(2, 2, 5) 115*0957b409SSimon J. Gerraty vsldoi(5, 0, 2, 12) 116*0957b409SSimon J. Gerraty vxor(2, 2, 5) 117*0957b409SSimon J. Gerraty vxor(2, 2, 4) 118*0957b409SSimon J. Gerraty 119*0957b409SSimon J. Gerraty /* Store next subkey */ 120*0957b409SSimon J. Gerraty #if BR_POWER8_LE 121*0957b409SSimon J. Gerraty vperm(4, 2, 2, 8) 122*0957b409SSimon J. Gerraty stxvw4x(36, 0, %[sk]) 123*0957b409SSimon J. Gerraty #else 124*0957b409SSimon J. Gerraty stxvw4x(34, 0, %[sk]) 125*0957b409SSimon J. Gerraty #endif 126*0957b409SSimon J. Gerraty 127*0957b409SSimon J. Gerraty /* Update Rcon */ 128*0957b409SSimon J. Gerraty vadduwm(3, 3, 3) 129*0957b409SSimon J. Gerraty vsrw(4, 3, 6) 130*0957b409SSimon J. Gerraty vsubuwm(4, 0, 4) 131*0957b409SSimon J. Gerraty vand(4, 4, 7) 132*0957b409SSimon J. Gerraty vxor(3, 3, 4) 133*0957b409SSimon J. Gerraty 134*0957b409SSimon J. Gerraty bdnz(loop) 135*0957b409SSimon J. Gerraty 136*0957b409SSimon J. Gerraty : [sk] "+b" (sk), [cc] "+b" (cc) 137*0957b409SSimon J. Gerraty : [key] "b" (key), [fmod] "b" (fmod) 138*0957b409SSimon J. Gerraty #if BR_POWER8_LE 139*0957b409SSimon J. Gerraty , [idx2be] "b" (idx2be) 140*0957b409SSimon J. Gerraty #endif 141*0957b409SSimon J. Gerraty : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "ctr", "memory" 142*0957b409SSimon J. Gerraty ); 143*0957b409SSimon J. Gerraty } 144*0957b409SSimon J. Gerraty 145*0957b409SSimon J. Gerraty static void 146*0957b409SSimon J. Gerraty key_schedule_192(unsigned char *sk, const unsigned char *key) 147*0957b409SSimon J. Gerraty { 148*0957b409SSimon J. Gerraty long cc; 149*0957b409SSimon J. Gerraty 150*0957b409SSimon J. Gerraty #if BR_POWER8_LE 151*0957b409SSimon J. Gerraty static const uint32_t idx2be[] = { 152*0957b409SSimon J. Gerraty 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C 153*0957b409SSimon J. Gerraty }; 154*0957b409SSimon J. Gerraty #endif 155*0957b409SSimon J. Gerraty 156*0957b409SSimon J. Gerraty cc = 0; 157*0957b409SSimon J. Gerraty 158*0957b409SSimon J. Gerraty /* 159*0957b409SSimon J. Gerraty * We use the VSX instructions for loading and storing the 160*0957b409SSimon J. Gerraty * key/subkeys, since they support unaligned accesses. The rest 161*0957b409SSimon J. Gerraty * of the computation is VMX only. VMX register 0 is VSX 162*0957b409SSimon J. Gerraty * register 32. 163*0957b409SSimon J. Gerraty */ 164*0957b409SSimon J. Gerraty asm volatile ( 165*0957b409SSimon J. Gerraty 166*0957b409SSimon J. Gerraty /* 167*0957b409SSimon J. Gerraty * v0 = all-zero word 168*0957b409SSimon J. Gerraty * v1 = constant -8 / +8, copied into four words 169*0957b409SSimon J. Gerraty * v2, v3 = current subkey 170*0957b409SSimon J. Gerraty * v5 = Rcon (x4 words) (already shifted on big-endian) 171*0957b409SSimon J. Gerraty * v6 = constant 8, copied into four words 172*0957b409SSimon J. Gerraty * v8 = constant for byteswapping words 173*0957b409SSimon J. Gerraty * 174*0957b409SSimon J. Gerraty * The left two words of v3 are ignored. 175*0957b409SSimon J. Gerraty */ 176*0957b409SSimon J. Gerraty vspltisw(0, 0) 177*0957b409SSimon J. Gerraty #if BR_POWER8_LE 178*0957b409SSimon J. Gerraty vspltisw(1, -8) 179*0957b409SSimon J. Gerraty #else 180*0957b409SSimon J. Gerraty vspltisw(1, 8) 181*0957b409SSimon J. Gerraty #endif 182*0957b409SSimon J. Gerraty li(%[cc], 8) 183*0957b409SSimon J. Gerraty lxvw4x(34, 0, %[key]) 184*0957b409SSimon J. Gerraty lxvw4x(35, %[cc], %[key]) 185*0957b409SSimon J. Gerraty vsldoi(3, 3, 0, 8) 186*0957b409SSimon J. Gerraty vspltisw(5, 1) 187*0957b409SSimon J. Gerraty #if !BR_POWER8_LE 188*0957b409SSimon J. Gerraty vsldoi(5, 5, 0, 3) 189*0957b409SSimon J. Gerraty #endif 190*0957b409SSimon J. Gerraty vspltisw(6, 8) 191*0957b409SSimon J. Gerraty #if BR_POWER8_LE 192*0957b409SSimon J. Gerraty lxvw4x(40, 0, %[idx2be]) 193*0957b409SSimon J. Gerraty #endif 194*0957b409SSimon J. Gerraty 195*0957b409SSimon J. Gerraty /* 196*0957b409SSimon J. Gerraty * Loop must run 8 times. Each iteration produces 256 197*0957b409SSimon J. Gerraty * bits of subkeys, with a 64-bit overlap. 198*0957b409SSimon J. Gerraty */ 199*0957b409SSimon J. Gerraty li(%[cc], 8) 200*0957b409SSimon J. Gerraty mtctr(%[cc]) 201*0957b409SSimon J. Gerraty li(%[cc], 16) 202*0957b409SSimon J. Gerraty label(loop) 203*0957b409SSimon J. Gerraty 204*0957b409SSimon J. Gerraty /* 205*0957b409SSimon J. Gerraty * Last 6 words in v2:v3l. Compute next 6 words into 206*0957b409SSimon J. Gerraty * v3r:v4. 207*0957b409SSimon J. Gerraty */ 208*0957b409SSimon J. Gerraty vrlw(10, 3, 1) 209*0957b409SSimon J. Gerraty vsbox(10, 10) 210*0957b409SSimon J. Gerraty vxor(10, 10, 5) 211*0957b409SSimon J. Gerraty vspltw(10, 10, 1) 212*0957b409SSimon J. Gerraty vsldoi(11, 0, 10, 8) 213*0957b409SSimon J. Gerraty 214*0957b409SSimon J. Gerraty vsldoi(12, 0, 2, 12) 215*0957b409SSimon J. Gerraty vxor(12, 2, 12) 216*0957b409SSimon J. Gerraty vsldoi(13, 0, 12, 12) 217*0957b409SSimon J. Gerraty vxor(12, 12, 13) 218*0957b409SSimon J. Gerraty vsldoi(13, 0, 12, 12) 219*0957b409SSimon J. Gerraty vxor(12, 12, 13) 220*0957b409SSimon J. Gerraty 221*0957b409SSimon J. Gerraty vspltw(13, 12, 3) 222*0957b409SSimon J. Gerraty vxor(13, 13, 3) 223*0957b409SSimon J. Gerraty vsldoi(14, 0, 3, 12) 224*0957b409SSimon J. Gerraty vxor(13, 13, 14) 225*0957b409SSimon J. Gerraty 226*0957b409SSimon J. Gerraty vsldoi(4, 12, 13, 8) 227*0957b409SSimon J. Gerraty vsldoi(14, 0, 3, 8) 228*0957b409SSimon J. Gerraty vsldoi(3, 14, 12, 8) 229*0957b409SSimon J. Gerraty 230*0957b409SSimon J. Gerraty vxor(3, 3, 11) 231*0957b409SSimon J. Gerraty vxor(4, 4, 10) 232*0957b409SSimon J. Gerraty 233*0957b409SSimon J. Gerraty /* 234*0957b409SSimon J. Gerraty * Update Rcon. Since for a 192-bit key, we use only 8 235*0957b409SSimon J. Gerraty * such constants, we will not hit the field modulus, 236*0957b409SSimon J. Gerraty * so a simple shift (addition) works well. 237*0957b409SSimon J. Gerraty */ 238*0957b409SSimon J. Gerraty vadduwm(5, 5, 5) 239*0957b409SSimon J. Gerraty 240*0957b409SSimon J. Gerraty /* 241*0957b409SSimon J. Gerraty * Write out the two left 128-bit words 242*0957b409SSimon J. Gerraty */ 243*0957b409SSimon J. Gerraty #if BR_POWER8_LE 244*0957b409SSimon J. Gerraty vperm(10, 2, 2, 8) 245*0957b409SSimon J. Gerraty vperm(11, 3, 3, 8) 246*0957b409SSimon J. Gerraty stxvw4x(42, 0, %[sk]) 247*0957b409SSimon J. Gerraty stxvw4x(43, %[cc], %[sk]) 248*0957b409SSimon J. Gerraty #else 249*0957b409SSimon J. Gerraty stxvw4x(34, 0, %[sk]) 250*0957b409SSimon J. Gerraty stxvw4x(35, %[cc], %[sk]) 251*0957b409SSimon J. Gerraty #endif 252*0957b409SSimon J. Gerraty addi(%[sk], %[sk], 24) 253*0957b409SSimon J. Gerraty 254*0957b409SSimon J. Gerraty /* 255*0957b409SSimon J. Gerraty * Shift words for next iteration. 256*0957b409SSimon J. Gerraty */ 257*0957b409SSimon J. Gerraty vsldoi(2, 3, 4, 8) 258*0957b409SSimon J. Gerraty vsldoi(3, 4, 0, 8) 259*0957b409SSimon J. Gerraty 260*0957b409SSimon J. Gerraty bdnz(loop) 261*0957b409SSimon J. Gerraty 262*0957b409SSimon J. Gerraty /* 263*0957b409SSimon J. Gerraty * The loop wrote the first 50 subkey words, but we need 264*0957b409SSimon J. Gerraty * to produce 52, so we must do one last write. 265*0957b409SSimon J. Gerraty */ 266*0957b409SSimon J. Gerraty #if BR_POWER8_LE 267*0957b409SSimon J. Gerraty vperm(10, 2, 2, 8) 268*0957b409SSimon J. Gerraty stxvw4x(42, 0, %[sk]) 269*0957b409SSimon J. Gerraty #else 270*0957b409SSimon J. Gerraty stxvw4x(34, 0, %[sk]) 271*0957b409SSimon J. Gerraty #endif 272*0957b409SSimon J. Gerraty 273*0957b409SSimon J. Gerraty : [sk] "+b" (sk), [cc] "+b" (cc) 274*0957b409SSimon J. Gerraty : [key] "b" (key) 275*0957b409SSimon J. Gerraty #if BR_POWER8_LE 276*0957b409SSimon J. Gerraty , [idx2be] "b" (idx2be) 277*0957b409SSimon J. Gerraty #endif 278*0957b409SSimon J. Gerraty : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 279*0957b409SSimon J. Gerraty "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory" 280*0957b409SSimon J. Gerraty ); 281*0957b409SSimon J. Gerraty } 282*0957b409SSimon J. Gerraty 283*0957b409SSimon J. Gerraty static void 284*0957b409SSimon J. Gerraty key_schedule_256(unsigned char *sk, const unsigned char *key) 285*0957b409SSimon J. Gerraty { 286*0957b409SSimon J. Gerraty long cc; 287*0957b409SSimon J. Gerraty 288*0957b409SSimon J. Gerraty #if BR_POWER8_LE 289*0957b409SSimon J. Gerraty static const uint32_t idx2be[] = { 290*0957b409SSimon J. Gerraty 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C 291*0957b409SSimon J. Gerraty }; 292*0957b409SSimon J. Gerraty #endif 293*0957b409SSimon J. Gerraty 294*0957b409SSimon J. Gerraty cc = 0; 295*0957b409SSimon J. Gerraty 296*0957b409SSimon J. Gerraty /* 297*0957b409SSimon J. Gerraty * We use the VSX instructions for loading and storing the 298*0957b409SSimon J. Gerraty * key/subkeys, since they support unaligned accesses. The rest 299*0957b409SSimon J. Gerraty * of the computation is VMX only. VMX register 0 is VSX 300*0957b409SSimon J. Gerraty * register 32. 301*0957b409SSimon J. Gerraty */ 302*0957b409SSimon J. Gerraty asm volatile ( 303*0957b409SSimon J. Gerraty 304*0957b409SSimon J. Gerraty /* 305*0957b409SSimon J. Gerraty * v0 = all-zero word 306*0957b409SSimon J. Gerraty * v1 = constant -8 / +8, copied into four words 307*0957b409SSimon J. Gerraty * v2, v3 = current subkey 308*0957b409SSimon J. Gerraty * v6 = Rcon (x4 words) (already shifted on big-endian) 309*0957b409SSimon J. Gerraty * v7 = constant 8, copied into four words 310*0957b409SSimon J. Gerraty * v8 = constant for byteswapping words 311*0957b409SSimon J. Gerraty * 312*0957b409SSimon J. Gerraty * The left two words of v3 are ignored. 313*0957b409SSimon J. Gerraty */ 314*0957b409SSimon J. Gerraty vspltisw(0, 0) 315*0957b409SSimon J. Gerraty #if BR_POWER8_LE 316*0957b409SSimon J. Gerraty vspltisw(1, -8) 317*0957b409SSimon J. Gerraty #else 318*0957b409SSimon J. Gerraty vspltisw(1, 8) 319*0957b409SSimon J. Gerraty #endif 320*0957b409SSimon J. Gerraty li(%[cc], 16) 321*0957b409SSimon J. Gerraty lxvw4x(34, 0, %[key]) 322*0957b409SSimon J. Gerraty lxvw4x(35, %[cc], %[key]) 323*0957b409SSimon J. Gerraty vspltisw(6, 1) 324*0957b409SSimon J. Gerraty #if !BR_POWER8_LE 325*0957b409SSimon J. Gerraty vsldoi(6, 6, 0, 3) 326*0957b409SSimon J. Gerraty #endif 327*0957b409SSimon J. Gerraty vspltisw(7, 8) 328*0957b409SSimon J. Gerraty #if BR_POWER8_LE 329*0957b409SSimon J. Gerraty lxvw4x(40, 0, %[idx2be]) 330*0957b409SSimon J. Gerraty #endif 331*0957b409SSimon J. Gerraty 332*0957b409SSimon J. Gerraty /* 333*0957b409SSimon J. Gerraty * Loop must run 7 times. Each iteration produces two 334*0957b409SSimon J. Gerraty * subkeys. 335*0957b409SSimon J. Gerraty */ 336*0957b409SSimon J. Gerraty li(%[cc], 7) 337*0957b409SSimon J. Gerraty mtctr(%[cc]) 338*0957b409SSimon J. Gerraty li(%[cc], 16) 339*0957b409SSimon J. Gerraty label(loop) 340*0957b409SSimon J. Gerraty 341*0957b409SSimon J. Gerraty /* 342*0957b409SSimon J. Gerraty * Current words are in v2:v3. Compute next word in v4. 343*0957b409SSimon J. Gerraty */ 344*0957b409SSimon J. Gerraty vrlw(10, 3, 1) 345*0957b409SSimon J. Gerraty vsbox(10, 10) 346*0957b409SSimon J. Gerraty vxor(10, 10, 6) 347*0957b409SSimon J. Gerraty vspltw(10, 10, 3) 348*0957b409SSimon J. Gerraty 349*0957b409SSimon J. Gerraty vsldoi(4, 0, 2, 12) 350*0957b409SSimon J. Gerraty vxor(4, 2, 4) 351*0957b409SSimon J. Gerraty vsldoi(5, 0, 4, 12) 352*0957b409SSimon J. Gerraty vxor(4, 4, 5) 353*0957b409SSimon J. Gerraty vsldoi(5, 0, 4, 12) 354*0957b409SSimon J. Gerraty vxor(4, 4, 5) 355*0957b409SSimon J. Gerraty vxor(4, 4, 10) 356*0957b409SSimon J. Gerraty 357*0957b409SSimon J. Gerraty /* 358*0957b409SSimon J. Gerraty * Then other word in v5. 359*0957b409SSimon J. Gerraty */ 360*0957b409SSimon J. Gerraty vsbox(10, 4) 361*0957b409SSimon J. Gerraty vspltw(10, 10, 3) 362*0957b409SSimon J. Gerraty 363*0957b409SSimon J. Gerraty vsldoi(5, 0, 3, 12) 364*0957b409SSimon J. Gerraty vxor(5, 3, 5) 365*0957b409SSimon J. Gerraty vsldoi(11, 0, 5, 12) 366*0957b409SSimon J. Gerraty vxor(5, 5, 11) 367*0957b409SSimon J. Gerraty vsldoi(11, 0, 5, 12) 368*0957b409SSimon J. Gerraty vxor(5, 5, 11) 369*0957b409SSimon J. Gerraty vxor(5, 5, 10) 370*0957b409SSimon J. Gerraty 371*0957b409SSimon J. Gerraty /* 372*0957b409SSimon J. Gerraty * Update Rcon. Since for a 256-bit key, we use only 7 373*0957b409SSimon J. Gerraty * such constants, we will not hit the field modulus, 374*0957b409SSimon J. Gerraty * so a simple shift (addition) works well. 375*0957b409SSimon J. Gerraty */ 376*0957b409SSimon J. Gerraty vadduwm(6, 6, 6) 377*0957b409SSimon J. Gerraty 378*0957b409SSimon J. Gerraty /* 379*0957b409SSimon J. Gerraty * Write out the two left 128-bit words 380*0957b409SSimon J. Gerraty */ 381*0957b409SSimon J. Gerraty #if BR_POWER8_LE 382*0957b409SSimon J. Gerraty vperm(10, 2, 2, 8) 383*0957b409SSimon J. Gerraty vperm(11, 3, 3, 8) 384*0957b409SSimon J. Gerraty stxvw4x(42, 0, %[sk]) 385*0957b409SSimon J. Gerraty stxvw4x(43, %[cc], %[sk]) 386*0957b409SSimon J. Gerraty #else 387*0957b409SSimon J. Gerraty stxvw4x(34, 0, %[sk]) 388*0957b409SSimon J. Gerraty stxvw4x(35, %[cc], %[sk]) 389*0957b409SSimon J. Gerraty #endif 390*0957b409SSimon J. Gerraty addi(%[sk], %[sk], 32) 391*0957b409SSimon J. Gerraty 392*0957b409SSimon J. Gerraty /* 393*0957b409SSimon J. Gerraty * Replace v2:v3 with v4:v5. 394*0957b409SSimon J. Gerraty */ 395*0957b409SSimon J. Gerraty vxor(2, 0, 4) 396*0957b409SSimon J. Gerraty vxor(3, 0, 5) 397*0957b409SSimon J. Gerraty 398*0957b409SSimon J. Gerraty bdnz(loop) 399*0957b409SSimon J. Gerraty 400*0957b409SSimon J. Gerraty /* 401*0957b409SSimon J. Gerraty * The loop wrote the first 14 subkeys, but we need 15, 402*0957b409SSimon J. Gerraty * so we must do an extra write. 403*0957b409SSimon J. Gerraty */ 404*0957b409SSimon J. Gerraty #if BR_POWER8_LE 405*0957b409SSimon J. Gerraty vperm(10, 2, 2, 8) 406*0957b409SSimon J. Gerraty stxvw4x(42, 0, %[sk]) 407*0957b409SSimon J. Gerraty #else 408*0957b409SSimon J. Gerraty stxvw4x(34, 0, %[sk]) 409*0957b409SSimon J. Gerraty #endif 410*0957b409SSimon J. Gerraty 411*0957b409SSimon J. Gerraty : [sk] "+b" (sk), [cc] "+b" (cc) 412*0957b409SSimon J. Gerraty : [key] "b" (key) 413*0957b409SSimon J. Gerraty #if BR_POWER8_LE 414*0957b409SSimon J. Gerraty , [idx2be] "b" (idx2be) 415*0957b409SSimon J. Gerraty #endif 416*0957b409SSimon J. Gerraty : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", 417*0957b409SSimon J. Gerraty "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory" 418*0957b409SSimon J. Gerraty ); 419*0957b409SSimon J. Gerraty } 420*0957b409SSimon J. Gerraty 421*0957b409SSimon J. Gerraty /* see inner.h */ 422*0957b409SSimon J. Gerraty int 423*0957b409SSimon J. Gerraty br_aes_pwr8_supported(void) 424*0957b409SSimon J. Gerraty { 425*0957b409SSimon J. Gerraty return 1; 426*0957b409SSimon J. Gerraty } 427*0957b409SSimon J. Gerraty 428*0957b409SSimon J. Gerraty /* see inner.h */ 429*0957b409SSimon J. Gerraty unsigned 430*0957b409SSimon J. Gerraty br_aes_pwr8_keysched(unsigned char *sk, const void *key, size_t len) 431*0957b409SSimon J. Gerraty { 432*0957b409SSimon J. Gerraty switch (len) { 433*0957b409SSimon J. Gerraty case 16: 434*0957b409SSimon J. Gerraty key_schedule_128(sk, key); 435*0957b409SSimon J. Gerraty return 10; 436*0957b409SSimon J. Gerraty case 24: 437*0957b409SSimon J. Gerraty key_schedule_192(sk, key); 438*0957b409SSimon J. Gerraty return 12; 439*0957b409SSimon J. Gerraty default: 440*0957b409SSimon J. Gerraty key_schedule_256(sk, key); 441*0957b409SSimon J. Gerraty return 14; 442*0957b409SSimon J. Gerraty } 443*0957b409SSimon J. Gerraty } 444*0957b409SSimon J. Gerraty 445*0957b409SSimon J. Gerraty #endif 446