/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_POWER_ASM_MACROS   1
#include "inner.h"

/*
 * This is the GHASH implementation that leverages the POWER8 opcodes.
 */

#if BR_POWER8

/*
 * Some symbolic names for registers.
 *   HB0 = 16 bytes of value 0
 *   HB1 = 16 bytes of value 1
 *   HB2 = 16 bytes of value 2
 *   HB6 = 16 bytes of value 6
 *   HB7 = 16 bytes of value 7
 *   TT0, TT1 and TT2 are temporaries
 *
 * BSW holds the pattern for byteswapping 32-bit words; this is set only
 * on little-endian systems. XBSW is the same register with the +32 offset
 * for access with the VSX opcodes.
 */
#define HB0       0
#define HB1       1
#define HB2       2
#define HB6       3
#define HB7       4
#define TT0       5
#define TT1       6
#define TT2       7

#define BSW       8
#define XBSW     40

/*
 * Macro to initialise the constants.
 */
#define INIT \
		vxor(HB0, HB0, HB0) \
		vspltisb(HB1, 1) \
		vspltisb(HB2, 2) \
		vspltisb(HB6, 6) \
		vspltisb(HB7, 7) \
		INIT_BSW

/*
 * Fix endianness of a value after reading it or before writing it, if
 * necessary.
 */
#if BR_POWER8_LE
#define INIT_BSW         lxvw4x(XBSW, 0, %[idx2be])
#define FIX_ENDIAN(xx)   vperm(xx, xx, xx, BSW)
#else
#define INIT_BSW
#define FIX_ENDIAN(xx)
#endif

/*
 * Shift x0:x1 left by one bit. This is a corrective action needed
 * because GHASH is defined in a fully little-endian convention, while
 * the opcodes use a fully big-endian convention, so the 255-bit product
 * ends up one bit to the right.
 */
#define SL_256(x0, x1) \
		vsldoi(TT0, HB0, x1, 1) \
		vsl(x0, x0, HB1) \
		vsr(TT0, TT0, HB7) \
		vsl(x1, x1, HB1) \
		vxor(x0, x0, TT0)
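
/*
 * For reference: GHASH works in GF(2^128) defined modulo the polynomial
 * X^128 + X^7 + X^2 + X + 1. The REDUCE_F128 macro below folds the upper
 * half of a 256-bit carry-less product back into the lower half modulo
 * that polynomial; the bit shifts by 1, 2 and 7 mirror its low-degree
 * terms X, X^2 and X^7, applied in the reversed bit order discussed
 * above for SL_256.
 */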

/*
 * Reduce x0:x1 in GF(2^128), result in xd (register xd may be the same as
 * x0 or x1, or a different register). x0 and x1 are modified.
 */
#define REDUCE_F128(xd, x0, x1) \
		vxor(x0, x0, x1) \
		vsr(TT0, x1, HB1) \
		vsr(TT1, x1, HB2) \
		vsr(TT2, x1, HB7) \
		vxor(x0, x0, TT0) \
		vxor(TT1, TT1, TT2) \
		vxor(x0, x0, TT1) \
		vsldoi(x1, x1, HB0, 15) \
		vsl(TT1, x1, HB6) \
		vsl(TT2, x1, HB1) \
		vxor(x1, TT1, TT2) \
		vsr(TT0, x1, HB1) \
		vsr(TT1, x1, HB2) \
		vsr(TT2, x1, HB7) \
		vxor(x0, x0, x1) \
		vxor(x0, x0, TT0) \
		vxor(TT1, TT1, TT2) \
		vxor(xd, x0, TT1)

/* see bearssl_hash.h */
void
br_ghash_pwr8(void *y, const void *h, const void *data, size_t len)
{
	const unsigned char *buf1, *buf2;
	size_t num4, num1;
	unsigned char tmp[64];
	long cc0, cc1, cc2, cc3;

#if BR_POWER8_LE
	static const uint32_t idx2be[] = {
		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
	};
#endif

	buf1 = data;

	/*
	 * The assembly code requires the data to be split into two
	 * chunks; the first chunk must contain a number of blocks which
	 * is a multiple of 4. Since the processing of the first chunk is
	 * faster, we want to make it as big as possible.
	 *
	 * For the remainder, there are two possibilities:
	 *  -- if the remainder size is a multiple of 16, then use it
	 *     in place;
	 *  -- otherwise, copy it to the tmp[] array and pad it with
	 *     zeros.
	 */
	num4 = len >> 6;
	buf2 = buf1 + (num4 << 6);
	len &= 63;
	num1 = (len + 15) >> 4;
	if ((len & 15) != 0) {
		memcpy(tmp, buf2, len);
		memset(tmp + len, 0, (num1 << 4) - len);
		buf2 = tmp;
	}

	cc0 = 0;
	cc1 = 16;
	cc2 = 32;
	cc3 = 48;
	asm volatile (
		INIT

		/*
		 * Load current h (denoted hereafter h1) in v9.
		 */
		lxvw4x(41, 0, %[h])
		FIX_ENDIAN(9)

		/*
		 * Load current y into v28.
		 */
		lxvw4x(60, 0, %[y])
		FIX_ENDIAN(28)

		/*
		 * Split h1 into three registers:
		 *   v17 = h1_1:h1_0
		 *   v18 = 0:h1_0
		 *   v19 = h1_1:0
		 */
		xxpermdi(49, 41, 41, 2)
		vsldoi(18, HB0, 9, 8)
		vsldoi(19, 9, HB0, 8)

		/*
		 * If num4 is 0, skip directly to the second chunk.
		 */
		cmpldi(%[num4], 0)
		beq(chunk1)

		/*
		 * Compute h2 = h*h in v10.
		 */
		vpmsumd(10, 18, 18)
		vpmsumd(11, 19, 19)
		SL_256(10, 11)
		REDUCE_F128(10, 10, 11)

		/*
		 * Compute h3 = h*h*h in v11.
		 * We first split h2 into:
		 *   v10 = h2_0:h2_1
		 *   v11 = 0:h2_0
		 *   v12 = h2_1:0
		 * Then we do the product with h1, and reduce into v11.
		 */
		vsldoi(11, HB0, 10, 8)
		vsldoi(12, 10, HB0, 8)
		vpmsumd(13, 10, 17)
		vpmsumd(11, 11, 18)
		vpmsumd(12, 12, 19)
		vsldoi(14, HB0, 13, 8)
		vsldoi(15, 13, HB0, 8)
		vxor(11, 11, 14)
		vxor(12, 12, 15)
		SL_256(11, 12)
		REDUCE_F128(11, 11, 12)

		/*
		 * Compute h4 = h*h*h*h in v12. This is done by squaring h2.
		 */
		vsldoi(12, HB0, 10, 8)
		vsldoi(13, 10, HB0, 8)
		vpmsumd(12, 12, 12)
		vpmsumd(13, 13, 13)
		SL_256(12, 13)
		REDUCE_F128(12, 12, 13)

		/*
		 * Repack h1, h2, h3 and h4:
		 *   v13 = h4_0:h3_0
		 *   v14 = h4_1:h3_1
		 *   v15 = h2_0:h1_0
		 *   v16 = h2_1:h1_1
		 */
		xxpermdi(45, 44, 43, 0)
		xxpermdi(46, 44, 43, 3)
		xxpermdi(47, 42, 41, 0)
		xxpermdi(48, 42, 41, 3)
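
		/*
		 * The four-block loop below relies on the usual GHASH
		 * aggregation identity: with b0 = y + a0 and b1..b3 equal
		 * to a1..a3 (see the register comments in the loop), the
		 * state after absorbing blocks a0..a3 is
		 *   y' = b0*h4 + b1*h3 + b2*h2 + b3*h1
		 * so a single shift/reduction is needed per group of four
		 * blocks, at the cost of the h2, h3 and h4 values
		 * precomputed above.
		 */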

		/*
		 * Loop for each group of four blocks.
		 */
		mtctr(%[num4])
	label(loop4)
		/*
		 * Read the four next blocks.
		 *   v20 = y + a0 = b0
		 *   v21 = a1 = b1
		 *   v22 = a2 = b2
		 *   v23 = a3 = b3
		 */
		lxvw4x(52, %[cc0], %[buf1])
		lxvw4x(53, %[cc1], %[buf1])
		lxvw4x(54, %[cc2], %[buf1])
		lxvw4x(55, %[cc3], %[buf1])
		FIX_ENDIAN(20)
		FIX_ENDIAN(21)
		FIX_ENDIAN(22)
		FIX_ENDIAN(23)
		addi(%[buf1], %[buf1], 64)
		vxor(20, 20, 28)

		/*
		 * Repack the blocks into v9, v10, v11 and v12.
		 *   v9  = b0_0:b1_0
		 *   v10 = b0_1:b1_1
		 *   v11 = b2_0:b3_0
		 *   v12 = b2_1:b3_1
		 */
		xxpermdi(41, 52, 53, 0)
		xxpermdi(42, 52, 53, 3)
		xxpermdi(43, 54, 55, 0)
		xxpermdi(44, 54, 55, 3)

		/*
		 * Compute the products.
		 *   v20 = b0_0*h4_0 + b1_0*h3_0
		 *   v21 = b0_1*h4_0 + b1_1*h3_0
		 *   v22 = b0_0*h4_1 + b1_0*h3_1
		 *   v23 = b0_1*h4_1 + b1_1*h3_1
		 *   v24 = b2_0*h2_0 + b3_0*h1_0
		 *   v25 = b2_1*h2_0 + b3_1*h1_0
		 *   v26 = b2_0*h2_1 + b3_0*h1_1
		 *   v27 = b2_1*h2_1 + b3_1*h1_1
		 */
		vpmsumd(20, 13, 9)
		vpmsumd(21, 13, 10)
		vpmsumd(22, 14, 9)
		vpmsumd(23, 14, 10)
		vpmsumd(24, 15, 11)
		vpmsumd(25, 15, 12)
		vpmsumd(26, 16, 11)
		vpmsumd(27, 16, 12)

		/*
		 * Sum products into a single 256-bit result in v11:v12.
		 */
		vxor(11, 20, 24)
		vxor(12, 23, 27)
		vxor( 9, 21, 22)
		vxor(10, 25, 26)
		vxor(20, 9, 10)
		vsldoi( 9, HB0, 20, 8)
		vsldoi(10, 20, HB0, 8)
		vxor(11, 11, 9)
		vxor(12, 12, 10)

		/*
		 * Fix and reduce in GF(2^128); this is the new y (in v28).
		 */
		SL_256(11, 12)
		REDUCE_F128(28, 11, 12)

		/*
		 * Loop for next group of four blocks.
		 */
		bdnz(loop4)

		/*
		 * Process second chunk, one block at a time.
		 */
	label(chunk1)
		cmpldi(%[num1], 0)
		beq(done)

		mtctr(%[num1])
	label(loop1)
		/*
		 * Load next data block and XOR it into y.
		 */
		lxvw4x(41, 0, %[buf2])
#if BR_POWER8_LE
		FIX_ENDIAN(9)
#endif
		addi(%[buf2], %[buf2], 16)
		vxor(9, 28, 9)

		/*
		 * Split y into doublewords:
		 *   v9  = y_0:y_1
		 *   v10 = 0:y_0
		 *   v11 = y_1:0
		 */
		vsldoi(10, HB0, 9, 8)
		vsldoi(11, 9, HB0, 8)

		/*
		 * Compute products with h:
		 *   v12 = y_0 * h_0
		 *   v13 = y_1 * h_1
		 *   v14 = y_1 * h_0 + y_0 * h_1
		 */
		vpmsumd(14, 9, 17)
		vpmsumd(12, 10, 18)
		vpmsumd(13, 11, 19)

		/*
		 * Propagate v14 into v12:v13 to finalise product.
		 */
		vsldoi(10, HB0, 14, 8)
		vsldoi(11, 14, HB0, 8)
		vxor(12, 12, 10)
		vxor(13, 13, 11)

		/*
		 * Fix result and reduce into v28 (next value for y).
		 */
		SL_256(12, 13)
		REDUCE_F128(28, 12, 13)
		bdnz(loop1)

	label(done)
		/*
		 * Write back the new y.
		 */
		FIX_ENDIAN(28)
		stxvw4x(60, 0, %[y])

	: [buf1] "+b" (buf1), [buf2] "+b" (buf2)
	: [y] "b" (y), [h] "b" (h), [num4] "b" (num4), [num1] "b" (num1),
	  [cc0] "b" (cc0), [cc1] "b" (cc1), [cc2] "b" (cc2), [cc3] "b" (cc3)
#if BR_POWER8_LE
	, [idx2be] "b" (idx2be)
#endif
	: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
	  "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
	  "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29",
	  "ctr", "memory"
	);
}

/* see bearssl_hash.h */
br_ghash
br_ghash_pwr8_get(void)
{
	return &br_ghash_pwr8;
}

#else

/* see bearssl_hash.h */
br_ghash
br_ghash_pwr8_get(void)
{
	return 0;
}

#endif
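
/*
 * Usage sketch (illustration only): callers typically obtain a br_ghash
 * function pointer through br_ghash_pwr8_get(), which returns 0 when
 * POWER8 support was not compiled in, and fall back to a portable
 * constant-time implementation such as br_ghash_ctmul64 in that case.
 * The 16-byte state y is updated in place; a trailing partial block is
 * zero-padded by the implementation itself. The helper name
 * example_ghash_update() below is hypothetical.
 */
#if 0
static void
example_ghash_update(unsigned char y[16], const unsigned char h[16],
	const void *data, size_t len)
{
	br_ghash gh;

	gh = br_ghash_pwr8_get();
	if (gh == 0) {
		/* POWER8 opcodes not available; use a portable fallback. */
		gh = &br_ghash_ctmul64;
	}
	gh(y, h, data, len);
}
#endif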