/* Do not modify. This file is auto-generated from aes-gcm-armv8_64.pl. */
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=8
.arch	armv8-a+crypto
.text
//----------------------------------------------------------------------
// aes_gcm_enc_128_kernel
//
// CTR-mode encryption with ten AES rounds (rk0..rk9 through aese, rk10
// folded in by XOR), fused with GHASH accumulation over the ciphertext.
// Four blocks are handled per main-loop pass.
//
// In:   x0 = input (plaintext) pointer
//       x1 = input length in BITS; 0 returns 0 at once, nothing touched
//       x2 = output pointer
//       x3 = GHASH data: 16-byte running tag at [x3] (read, then
//            rewritten on exit); key powers read from [x3,#32] (h1),
//            [x3,#64] (h2), [x3,#80] (h3) and [x3,#112] (h4)
//       x4 = 16-byte counter block; the word at offset 12 is rewritten
//            with the updated counter on exit
//       x5 = round keys, 16 bytes each, rk10 at offset 160
// Out:  x0 = x1 >> 3 (byte length)
// Frame: 112 bytes; saves and restores x19-x24 and d8-d15.
//
// Register roles inside the routine:
//   x16 counter-block pointer       x8  round-key cursor
//   x15 return value                x4  end of input
//   x5  end of main-loop input, later bytes left for the tail
//   x10 counter block bits 0..63    x11 low word of its upper half
//   w12 byte-reversed 32-bit counter, x9 scratch for the next block
//   x13:x14 rk10 low:high. It is XORed into the plaintext in general
//       registers, so ciphertext = (plaintext ^ rk10) ^ round-9 state.
//   x6/x7, x19-x24 plaintext halves
//   v0-v3 counter blocks / cipher state     v4-v7 data blocks
//   v18-v27 rk0..rk9                        v12-v15 h1..h4
//   v16 = h2k|h1k, v17 = h4k|h3k (XOR of the halves of each power)
//   v11/v10/v9 GHASH low/mid/high accumulators
//   v8 reduction constant (0xc2 << 56); tag feed-in while in the tail
//   v28-v31 temporaries; the tail also reuses v18 and v20-v22
//
// Comment legend: "AES bN rR" = round R on counter block N,
// "CTR bN" = building counter block N, "GHASH N part" = one partial
// product for data block N.
//
// The instruction order interleaves AES, GHASH and integer work on
// purpose; keep it as is.
//
// NOTE(review): the loop bounds compare addresses (cmp x0, x5) with
// the signed conditions b.ge/b.lt. That matches an unsigned compare
// only while both addresses sit in the same half of the address
// space - confirm this is acceptable for every caller.
//----------------------------------------------------------------------
.globl	aes_gcm_enc_128_kernel
.type	aes_gcm_enc_128_kernel,%function
.align	4
aes_gcm_enc_128_kernel:
	cbz	x1, .L128_enc_ret	// zero bits: nothing to do
	stp	x19, x20, [sp, #-112]!	// open frame, save callee-saved regs
	mov	x16, x4
	mov	x8, x5
	stp	x21, x22, [sp, #16]
	stp	x23, x24, [sp, #32]
	stp	d8, d9, [sp, #48]
	stp	d10, d11, [sp, #64]
	stp	d12, d13, [sp, #80]
	stp	d14, d15, [sp, #96]

	// ---- setup: counter, rk10, tag, key powers, first four blocks ----
	ldp	x10, x11, [x16]	// counter block: low 64 bits, high 64 bits
#ifdef __AARCH64EB__
	rev	x10, x10
	rev	x11, x11
#endif
	ldp	x13, x14, [x8, #160]	// rk10
#ifdef __AARCH64EB__
	ror	x13, x13, #32
	ror	x14, x14, #32
#endif
	ld1	{v11.16b}, [x3]	// running tag
	ext	v11.16b, v11.16b, v11.16b, #8
	rev64	v11.16b, v11.16b
	lsr	x5, x1, #3	// byte_len
	mov	x15, x5	// kept as the return value

	ld1	{v18.4s}, [x8], #16	// rk0
	add	x4, x0, x1, lsr #3	// end of input
	sub	x5, x5, #1	// byte_len - 1

	lsr	x12, x11, #32	// 32-bit counter field
	ldr	q15, [x3, #112]	// h4
#ifndef __AARCH64EB__
	ext	v15.16b, v15.16b, v15.16b, #8
#endif
	fmov	d1, x10	// CTR b1
	rev	w12, w12	// byte-reversed counter

	add	w12, w12, #1	// counter for block 1
	orr	w11, w11, w11	// keep only the low word of x11
	ld1	{v19.4s}, [x8], #16	// rk1

	rev	w9, w12	// CTR b1
	add	w12, w12, #1	// CTR b1
	fmov	d3, x10	// CTR b3

	orr	x9, x11, x9, lsl #32	// CTR b1
	ld1	{v0.16b}, [x16]	// block 0 is the counter block as stored

	fmov	v1.d[1], x9	// CTR b1
	rev	w9, w12	// CTR b2

	fmov	d2, x10	// CTR b2
	orr	x9, x11, x9, lsl #32	// CTR b2
	add	w12, w12, #1	// CTR b2

	fmov	v2.d[1], x9	// CTR b2
	rev	w9, w12	// CTR b3

	orr	x9, x11, x9, lsl #32	// CTR b3
	ld1	{v20.4s}, [x8], #16	// rk2

	add	w12, w12, #1	// CTR b3
	fmov	v3.d[1], x9	// CTR b3

	ldr	q14, [x3, #80]	// h3
#ifndef __AARCH64EB__
	ext	v14.16b, v14.16b, v14.16b, #8
#endif
	aese	v1.16b, v18.16b	// AES b1 r0
	aesmc	v1.16b, v1.16b
	ld1	{v21.4s}, [x8], #16	// rk3

	aese	v2.16b, v18.16b	// AES b2 r0
	aesmc	v2.16b, v2.16b
	ldr	q12, [x3, #32]	// h1
#ifndef __AARCH64EB__
	ext	v12.16b, v12.16b, v12.16b, #8
#endif

	aese	v0.16b, v18.16b	// AES b0 r0
	aesmc	v0.16b, v0.16b
	ld1	{v22.4s}, [x8], #16	// rk4

	aese	v3.16b, v18.16b	// AES b3 r0
	aesmc	v3.16b, v3.16b
	ld1	{v23.4s}, [x8], #16	// rk5

	aese	v2.16b, v19.16b	// AES b2 r1
	aesmc	v2.16b, v2.16b
	trn2	v17.2d, v14.2d, v15.2d	// h4l | h3l

	aese	v0.16b, v19.16b	// AES b0 r1
	aesmc	v0.16b, v0.16b
	ld1	{v24.4s}, [x8], #16	// rk6

	aese	v1.16b, v19.16b	// AES b1 r1
	aesmc	v1.16b, v1.16b
	ld1	{v25.4s}, [x8], #16	// rk7

	aese	v3.16b, v19.16b	// AES b3 r1
	aesmc	v3.16b, v3.16b
	trn1	v9.2d, v14.2d, v15.2d	// h4h | h3h

	aese	v0.16b, v20.16b	// AES b0 r2
	aesmc	v0.16b, v0.16b
	ld1	{v26.4s}, [x8], #16	// rk8

	aese	v1.16b, v20.16b	// AES b1 r2
	aesmc	v1.16b, v1.16b
	ldr	q13, [x3, #64]	// h2
#ifndef __AARCH64EB__
	ext	v13.16b, v13.16b, v13.16b, #8
#endif

	aese	v3.16b, v20.16b	// AES b3 r2
	aesmc	v3.16b, v3.16b

	aese	v2.16b, v20.16b	// AES b2 r2
	aesmc	v2.16b, v2.16b
	eor	v17.16b, v17.16b, v9.16b	// h4k | h3k

	aese	v0.16b, v21.16b	// AES b0 r3
	aesmc	v0.16b, v0.16b

	aese	v1.16b, v21.16b	// AES b1 r3
	aesmc	v1.16b, v1.16b

	aese	v2.16b, v21.16b	// AES b2 r3
	aesmc	v2.16b, v2.16b
	ld1	{v27.4s}, [x8], #16	// rk9

	aese	v3.16b, v21.16b	// AES b3 r3
	aesmc	v3.16b, v3.16b

	and	x5, x5, #0xffffffffffffffc0	// whole 64-byte groups in byte_len - 1, so the tail always gets >= 1 byte
	trn2	v16.2d, v12.2d, v13.2d	// h2l | h1l

	aese	v3.16b, v22.16b	// AES b3 r4
	aesmc	v3.16b, v3.16b
	add	x5, x5, x0	// end of main-loop input

	aese	v2.16b, v22.16b	// AES b2 r4
	aesmc	v2.16b, v2.16b
	cmp	x0, x5	// flags consumed by the b.ge after round 9

	aese	v0.16b, v22.16b	// AES b0 r4
	aesmc	v0.16b, v0.16b

	aese	v3.16b, v23.16b	// AES b3 r5
	aesmc	v3.16b, v3.16b

	aese	v2.16b, v23.16b	// AES b2 r5
	aesmc	v2.16b, v2.16b

	aese	v0.16b, v23.16b	// AES b0 r5
	aesmc	v0.16b, v0.16b

	aese	v3.16b, v24.16b	// AES b3 r6
	aesmc	v3.16b, v3.16b

	aese	v1.16b, v22.16b	// AES b1 r4
	aesmc	v1.16b, v1.16b

	aese	v2.16b, v24.16b	// AES b2 r6
	aesmc	v2.16b, v2.16b
	trn1	v8.2d, v12.2d, v13.2d	// h2h | h1h

	aese	v0.16b, v24.16b	// AES b0 r6
	aesmc	v0.16b, v0.16b

	aese	v1.16b, v23.16b	// AES b1 r5
	aesmc	v1.16b, v1.16b

	aese	v3.16b, v25.16b	// AES b3 r7
	aesmc	v3.16b, v3.16b

	aese	v0.16b, v25.16b	// AES b0 r7
	aesmc	v0.16b, v0.16b

	aese	v1.16b, v24.16b	// AES b1 r6
	aesmc	v1.16b, v1.16b

	aese	v2.16b, v25.16b	// AES b2 r7
	aesmc	v2.16b, v2.16b

	aese	v0.16b, v26.16b	// AES b0 r8
	aesmc	v0.16b, v0.16b

	aese	v1.16b, v25.16b	// AES b1 r7
	aesmc	v1.16b, v1.16b

	aese	v2.16b, v26.16b	// AES b2 r8
	aesmc	v2.16b, v2.16b

	aese	v3.16b, v26.16b	// AES b3 r8
	aesmc	v3.16b, v3.16b

	aese	v1.16b, v26.16b	// AES b1 r8
	aesmc	v1.16b, v1.16b

	aese	v2.16b, v27.16b	// AES b2 r9 (no aesmc in the last aese round)

	aese	v0.16b, v27.16b	// AES b0 r9

	eor	v16.16b, v16.16b, v8.16b	// h2k | h1k

	aese	v1.16b, v27.16b	// AES b1 r9

	aese	v3.16b, v27.16b	// AES b3 r9
	b.ge	.L128_enc_tail	// no full 64-byte group ahead of the tail

	// ---- first four blocks: XOR with plaintext, store, next counters ----
	ldp	x6, x7, [x0, #0]	// plaintext b0
#ifdef __AARCH64EB__
	rev	x6, x6
	rev	x7, x7
#endif
	ldp	x21, x22, [x0, #32]	// plaintext b2
#ifdef __AARCH64EB__
	rev	x21, x21
	rev	x22, x22
#endif
	ldp	x19, x20, [x0, #16]	// plaintext b1
#ifdef __AARCH64EB__
	rev	x19, x19
	rev	x20, x20
#endif
	ldp	x23, x24, [x0, #48]	// plaintext b3
#ifdef __AARCH64EB__
	rev	x23, x23
	rev	x24, x24
#endif
	eor	x6, x6, x13	// b0 ^ rk10 low
	eor	x7, x7, x14	// b0 ^ rk10 high

	eor	x21, x21, x13	// b2 ^ rk10 low
	fmov	d4, x6	// b0 low -> v4

	eor	x19, x19, x13	// b1 ^ rk10 low
	eor	x22, x22, x14	// b2 ^ rk10 high
	fmov	v4.d[1], x7	// b0 high -> v4

	fmov	d5, x19	// b1 low -> v5
	eor	x20, x20, x14	// b1 ^ rk10 high

	eor	x23, x23, x13	// b3 ^ rk10 low
	fmov	v5.d[1], x20	// b1 high -> v5

	fmov	d6, x21	// b2 low -> v6
	eor	x24, x24, x14	// b3 ^ rk10 high
	rev	w9, w12	// CTR b4

	fmov	v6.d[1], x22	// b2 high -> v6
	orr	x9, x11, x9, lsl #32	// CTR b4

	eor	v4.16b, v4.16b, v0.16b	// ciphertext b0
	fmov	d0, x10	// CTR b4
	add	w12, w12, #1	// CTR b4

	fmov	v0.d[1], x9	// CTR b4
	rev	w9, w12	// CTR b5

	eor	v5.16b, v5.16b, v1.16b	// ciphertext b1
	fmov	d1, x10	// CTR b5
	orr	x9, x11, x9, lsl #32	// CTR b5

	add	w12, w12, #1	// CTR b5
	add	x0, x0, #64	// input pointer += 64
	fmov	v1.d[1], x9	// CTR b5

	fmov	d7, x23	// b3 low -> v7
	rev	w9, w12	// CTR b6
	st1	{v4.16b}, [x2], #16	// store b0

	fmov	v7.d[1], x24	// b3 high -> v7
	orr	x9, x11, x9, lsl #32	// CTR b6

	add	w12, w12, #1	// CTR b6
	eor	v6.16b, v6.16b, v2.16b	// ciphertext b2
	st1	{v5.16b}, [x2], #16	// store b1

	fmov	d2, x10	// CTR b6
	cmp	x0, x5	// is another full group left for the loop?

	fmov	v2.d[1], x9	// CTR b6
	rev	w9, w12	// CTR b7
	st1	{v6.16b}, [x2], #16	// store b2

	orr	x9, x11, x9, lsl #32	// CTR b7

	eor	v7.16b, v7.16b, v3.16b	// ciphertext b3
	st1	{v7.16b}, [x2], #16	// store b3
	b.ge	.L128_enc_prepretail

	// ---- main loop ----
	// Per pass: GHASH the four ciphertext blocks left in v4-v7 by the
	// previous pass while running AES rounds on the next four counter
	// blocks, then XOR/store 64 more bytes. Block labels follow the
	// generator's numbering.
.L128_enc_main_loop:
	ldp	x23, x24, [x0, #48]	// plaintext 4k+3
#ifdef __AARCH64EB__
	rev	x23, x23
	rev	x24, x24
#endif
	rev64	v4.16b, v4.16b	// GHASH 4k byte order
	rev64	v6.16b, v6.16b	// GHASH 4k+2 byte order

	aese	v2.16b, v18.16b	// AES 4k+6 r0
	aesmc	v2.16b, v2.16b
	fmov	d3, x10	// CTR 4k+3

	ext	v11.16b, v11.16b, v11.16b, #8	// swap halves of running tag
	rev64	v5.16b, v5.16b	// GHASH 4k+1 byte order

	aese	v1.16b, v18.16b	// AES 4k+5 r0
	aesmc	v1.16b, v1.16b
	add	w12, w12, #1	// CTR 4k+3
	fmov	v3.d[1], x9	// CTR 4k+3

	aese	v0.16b, v18.16b	// AES 4k+4 r0
	aesmc	v0.16b, v0.16b
	mov	d31, v6.d[1]	// GHASH 4k+2 mid

	aese	v2.16b, v19.16b	// AES 4k+6 r1
	aesmc	v2.16b, v2.16b
	mov	d30, v5.d[1]	// GHASH 4k+1 mid

	aese	v1.16b, v19.16b	// AES 4k+5 r1
	aesmc	v1.16b, v1.16b
	eor	v4.16b, v4.16b, v11.16b	// fold running tag into block 4k

	aese	v3.16b, v18.16b	// AES 4k+7 r0
	aesmc	v3.16b, v3.16b
	eor	x24, x24, x14	// 4k+3 ^ rk10 high

	pmull2	v28.1q, v5.2d, v14.2d	// GHASH 4k+1 high
	eor	v31.8b, v31.8b, v6.8b	// GHASH 4k+2 mid
	ldp	x6, x7, [x0, #0]	// plaintext 4k+4
#ifdef __AARCH64EB__
	rev	x6, x6
	rev	x7, x7
#endif
	aese	v0.16b, v19.16b	// AES 4k+4 r1
	aesmc	v0.16b, v0.16b
	rev	w9, w12	// CTR 4k+8

	eor	v30.8b, v30.8b, v5.8b	// GHASH 4k+1 mid
	mov	d8, v4.d[1]	// GHASH 4k mid
	orr	x9, x11, x9, lsl #32	// CTR 4k+8

	pmull2	v9.1q, v4.2d, v15.2d	// GHASH 4k high
	add	w12, w12, #1	// CTR 4k+8
	mov	d10, v17.d[1]	// GHASH 4k mid

	aese	v0.16b, v20.16b	// AES 4k+4 r2
	aesmc	v0.16b, v0.16b

	pmull	v11.1q, v4.1d, v15.1d	// GHASH 4k low
	eor	v8.8b, v8.8b, v4.8b	// GHASH 4k mid

	aese	v1.16b, v20.16b	// AES 4k+5 r2
	aesmc	v1.16b, v1.16b

	aese	v0.16b, v21.16b	// AES 4k+4 r3
	aesmc	v0.16b, v0.16b
	eor	v9.16b, v9.16b, v28.16b	// GHASH 4k+1 high

	pmull	v28.1q, v6.1d, v13.1d	// GHASH 4k+2 low

	pmull	v10.1q, v8.1d, v10.1d	// GHASH 4k mid
	rev64	v7.16b, v7.16b	// GHASH 4k+3 byte order

	pmull	v30.1q, v30.1d, v17.1d	// GHASH 4k+1 mid

	pmull	v29.1q, v5.1d, v14.1d	// GHASH 4k+1 low
	ins	v31.d[1], v31.d[0]	// GHASH 4k+2 mid

	pmull2	v8.1q, v6.2d, v13.2d	// GHASH 4k+2 high
	eor	x7, x7, x14	// 4k+4 ^ rk10 high

	eor	v10.16b, v10.16b, v30.16b	// GHASH 4k+1 mid
	mov	d30, v7.d[1]	// GHASH 4k+3 mid

	aese	v3.16b, v19.16b	// AES 4k+7 r1
	aesmc	v3.16b, v3.16b
	eor	v11.16b, v11.16b, v29.16b	// GHASH 4k+1 low

	aese	v2.16b, v20.16b	// AES 4k+6 r2
	aesmc	v2.16b, v2.16b
	eor	x6, x6, x13	// 4k+4 ^ rk10 low

	aese	v1.16b, v21.16b	// AES 4k+5 r3
	aesmc	v1.16b, v1.16b
	eor	v30.8b, v30.8b, v7.8b	// GHASH 4k+3 mid

	pmull2	v4.1q, v7.2d, v12.2d	// GHASH 4k+3 high

	aese	v2.16b, v21.16b	// AES 4k+6 r3
	aesmc	v2.16b, v2.16b
	eor	v9.16b, v9.16b, v8.16b	// GHASH 4k+2 high

	pmull2	v31.1q, v31.2d, v16.2d	// GHASH 4k+2 mid

	pmull	v29.1q, v7.1d, v12.1d	// GHASH 4k+3 low
	movi	v8.8b, #0xc2

	pmull	v30.1q, v30.1d, v16.1d	// GHASH 4k+3 mid
	eor	v11.16b, v11.16b, v28.16b	// GHASH 4k+2 low

	aese	v1.16b, v22.16b	// AES 4k+5 r4
	aesmc	v1.16b, v1.16b

	aese	v3.16b, v20.16b	// AES 4k+7 r2
	aesmc	v3.16b, v3.16b
	shl	d8, d8, #56	// reduction constant

	aese	v0.16b, v22.16b	// AES 4k+4 r4
	aesmc	v0.16b, v0.16b
	eor	v9.16b, v9.16b, v4.16b	// GHASH 4k+3 high

	aese	v1.16b, v23.16b	// AES 4k+5 r5
	aesmc	v1.16b, v1.16b
	ldp	x19, x20, [x0, #16]	// plaintext 4k+5
#ifdef __AARCH64EB__
	rev	x19, x19
	rev	x20, x20
#endif
	aese	v3.16b, v21.16b	// AES 4k+7 r3
	aesmc	v3.16b, v3.16b
	eor	v10.16b, v10.16b, v31.16b	// GHASH 4k+2 mid

	aese	v0.16b, v23.16b	// AES 4k+4 r5
	aesmc	v0.16b, v0.16b
	ldp	x21, x22, [x0, #32]	// plaintext 4k+6
#ifdef __AARCH64EB__
	rev	x21, x21
	rev	x22, x22
#endif
	pmull	v31.1q, v9.1d, v8.1d	// reduce: top 64b aligned with mid
	eor	v11.16b, v11.16b, v29.16b	// GHASH 4k+3 low

	aese	v2.16b, v22.16b	// AES 4k+6 r4
	aesmc	v2.16b, v2.16b
	eor	x19, x19, x13	// 4k+5 ^ rk10 low

	aese	v3.16b, v22.16b	// AES 4k+7 r4
	aesmc	v3.16b, v3.16b
	eor	v10.16b, v10.16b, v30.16b	// GHASH 4k+3 mid

	aese	v1.16b, v24.16b	// AES 4k+5 r6
	aesmc	v1.16b, v1.16b
	eor	x23, x23, x13	// 4k+3 ^ rk10 low

	aese	v2.16b, v23.16b	// AES 4k+6 r5
	aesmc	v2.16b, v2.16b
	eor	v30.16b, v11.16b, v9.16b	// reduce: Karatsuba low ^ high

	fmov	d4, x6	// 4k+4 low -> v4
	aese	v0.16b, v24.16b	// AES 4k+4 r6
	aesmc	v0.16b, v0.16b
	fmov	v4.d[1], x7	// 4k+4 high -> v4

	add	x0, x0, #64	// input pointer += 64
	fmov	d7, x23	// 4k+3 low -> v7
	ext	v9.16b, v9.16b, v9.16b, #8	// reduce: swap halves of high

	aese	v3.16b, v23.16b	// AES 4k+7 r5
	aesmc	v3.16b, v3.16b
	fmov	d5, x19	// 4k+5 low -> v5

	aese	v0.16b, v25.16b	// AES 4k+4 r7
	aesmc	v0.16b, v0.16b
	eor	v10.16b, v10.16b, v30.16b	// reduce: Karatsuba into mid

	aese	v2.16b, v24.16b	// AES 4k+6 r6
	aesmc	v2.16b, v2.16b
	eor	x20, x20, x14	// 4k+5 ^ rk10 high

	aese	v1.16b, v25.16b	// AES 4k+5 r7
	aesmc	v1.16b, v1.16b
	fmov	v5.d[1], x20	// 4k+5 high -> v5

	aese	v0.16b, v26.16b	// AES 4k+4 r8
	aesmc	v0.16b, v0.16b
	fmov	v7.d[1], x24	// 4k+3 high -> v7

	aese	v3.16b, v24.16b	// AES 4k+7 r6
	aesmc	v3.16b, v3.16b
	cmp	x0, x5	// loop control; flags used by the closing b.lt

	aese	v1.16b, v26.16b	// AES 4k+5 r8
	aesmc	v1.16b, v1.16b
	eor	v10.16b, v10.16b, v31.16b	// reduce: fold into mid

	aese	v0.16b, v27.16b	// AES 4k+4 r9
	eor	x21, x21, x13	// 4k+6 ^ rk10 low
	eor	x22, x22, x14	// 4k+6 ^ rk10 high

	aese	v3.16b, v25.16b	// AES 4k+7 r7
	aesmc	v3.16b, v3.16b
	fmov	d6, x21	// 4k+6 low -> v6

	aese	v1.16b, v27.16b	// AES 4k+5 r9
	fmov	v6.d[1], x22	// 4k+6 high -> v6

	aese	v2.16b, v25.16b	// AES 4k+6 r7
	aesmc	v2.16b, v2.16b
	eor	v4.16b, v4.16b, v0.16b	// ciphertext 4k+4

	fmov	d0, x10	// CTR 4k+8
	aese	v3.16b, v26.16b	// AES 4k+7 r8
	aesmc	v3.16b, v3.16b

	fmov	v0.d[1], x9	// CTR 4k+8
	rev	w9, w12	// CTR 4k+9
	eor	v10.16b, v10.16b, v9.16b	// reduce: fold into mid

	aese	v2.16b, v26.16b	// AES 4k+6 r8
	aesmc	v2.16b, v2.16b
	eor	v5.16b, v5.16b, v1.16b	// ciphertext 4k+5

	add	w12, w12, #1	// CTR 4k+9
	orr	x9, x11, x9, lsl #32	// CTR 4k+9
	fmov	d1, x10	// CTR 4k+9

	pmull	v9.1q, v10.1d, v8.1d	// reduce: mid 64b aligned with low
	fmov	v1.d[1], x9	// CTR 4k+9
	rev	w9, w12	// CTR 4k+10

	aese	v2.16b, v27.16b	// AES 4k+6 r9
	st1	{v4.16b}, [x2], #16	// store 4k+4
	eor	v6.16b, v6.16b, v2.16b	// ciphertext 4k+6
	orr	x9, x11, x9, lsl #32	// CTR 4k+10

	aese	v3.16b, v27.16b	// AES 4k+7 r9
	add	w12, w12, #1	// CTR 4k+10
	ext	v10.16b, v10.16b, v10.16b, #8	// reduce: swap halves of mid
	fmov	d2, x10	// CTR 4k+10

	eor	v11.16b, v11.16b, v9.16b	// reduce: fold into low
	st1	{v5.16b}, [x2], #16	// store 4k+5

	fmov	v2.d[1], x9	// CTR 4k+10
	st1	{v6.16b}, [x2], #16	// store 4k+6
	rev	w9, w12	// CTR 4k+11

	orr	x9, x11, x9, lsl #32	// CTR 4k+11
	eor	v7.16b, v7.16b, v3.16b	// ciphertext 4k+3

	eor	v11.16b, v11.16b, v10.16b	// reduce: fold into low
	st1	{v7.16b}, [x2], #16	// store 4k+3
	b.lt	.L128_enc_main_loop

	// ---- prepretail ----
	// GHASH the last four stored ciphertext blocks and run rounds 0-9
	// on the four pending counter blocks; no loads or stores here.
.L128_enc_prepretail:
	rev64	v4.16b, v4.16b	// GHASH 4k byte order
	fmov	d3, x10	// CTR 4k+3
	rev64	v5.16b, v5.16b	// GHASH 4k+1 byte order

	ext	v11.16b, v11.16b, v11.16b, #8	// swap halves of running tag
	add	w12, w12, #1	// CTR 4k+3
	fmov	v3.d[1], x9	// CTR 4k+3

	aese	v1.16b, v18.16b	// AES 4k+5 r0
	aesmc	v1.16b, v1.16b
	rev64	v6.16b, v6.16b	// GHASH 4k+2 byte order

	pmull	v29.1q, v5.1d, v14.1d	// GHASH 4k+1 low

	rev64	v7.16b, v7.16b	// GHASH 4k+3 byte order
	eor	v4.16b, v4.16b, v11.16b	// fold running tag into block 4k

	pmull2	v28.1q, v5.2d, v14.2d	// GHASH 4k+1 high

	aese	v3.16b, v18.16b	// AES 4k+7 r0
	aesmc	v3.16b, v3.16b
	mov	d30, v5.d[1]	// GHASH 4k+1 mid

	pmull	v11.1q, v4.1d, v15.1d	// GHASH 4k low
	mov	d8, v4.d[1]	// GHASH 4k mid

	mov	d31, v6.d[1]	// GHASH 4k+2 mid
	mov	d10, v17.d[1]	// GHASH 4k mid

	aese	v1.16b, v19.16b	// AES 4k+5 r1
	aesmc	v1.16b, v1.16b
	eor	v30.8b, v30.8b, v5.8b	// GHASH 4k+1 mid

	eor	v8.8b, v8.8b, v4.8b	// GHASH 4k mid

	pmull2	v9.1q, v4.2d, v15.2d	// GHASH 4k high
	eor	v31.8b, v31.8b, v6.8b	// GHASH 4k+2 mid

	aese	v3.16b, v19.16b	// AES 4k+7 r1
	aesmc	v3.16b, v3.16b

	pmull	v30.1q, v30.1d, v17.1d	// GHASH 4k+1 mid
	eor	v11.16b, v11.16b, v29.16b	// GHASH 4k+1 low

	pmull	v10.1q, v8.1d, v10.1d	// GHASH 4k mid

	aese	v0.16b, v18.16b	// AES 4k+4 r0
	aesmc	v0.16b, v0.16b
	ins	v31.d[1], v31.d[0]	// GHASH 4k+2 mid

	aese	v2.16b, v18.16b	// AES 4k+6 r0
	aesmc	v2.16b, v2.16b

	eor	v10.16b, v10.16b, v30.16b	// GHASH 4k+1 mid
	mov	d30, v7.d[1]	// GHASH 4k+3 mid

	aese	v0.16b, v19.16b	// AES 4k+4 r1
	aesmc	v0.16b, v0.16b
	eor	v9.16b, v9.16b, v28.16b	// GHASH 4k+1 high

	pmull2	v31.1q, v31.2d, v16.2d	// GHASH 4k+2 mid

	pmull2	v8.1q, v6.2d, v13.2d	// GHASH 4k+2 high
	eor	v30.8b, v30.8b, v7.8b	// GHASH 4k+3 mid

	pmull2	v4.1q, v7.2d, v12.2d	// GHASH 4k+3 high

	pmull	v28.1q, v6.1d, v13.1d	// GHASH 4k+2 low

	aese	v2.16b, v19.16b	// AES 4k+6 r1
	aesmc	v2.16b, v2.16b
	eor	v9.16b, v9.16b, v8.16b	// GHASH 4k+2 high

	aese	v0.16b, v20.16b	// AES 4k+4 r2
	aesmc	v0.16b, v0.16b

	pmull	v29.1q, v7.1d, v12.1d	// GHASH 4k+3 low
	movi	v8.8b, #0xc2

	aese	v2.16b, v20.16b	// AES 4k+6 r2
	aesmc	v2.16b, v2.16b
	eor	v11.16b, v11.16b, v28.16b	// GHASH 4k+2 low

	aese	v3.16b, v20.16b	// AES 4k+7 r2
	aesmc	v3.16b, v3.16b

	pmull	v30.1q, v30.1d, v16.1d	// GHASH 4k+3 mid
	eor	v10.16b, v10.16b, v31.16b	// GHASH 4k+2 mid

	aese	v2.16b, v21.16b	// AES 4k+6 r3
	aesmc	v2.16b, v2.16b

	aese	v1.16b, v20.16b	// AES 4k+5 r2
	aesmc	v1.16b, v1.16b
	eor	v9.16b, v9.16b, v4.16b	// GHASH 4k+3 high

	aese	v0.16b, v21.16b	// AES 4k+4 r3
	aesmc	v0.16b, v0.16b

	eor	v10.16b, v10.16b, v30.16b	// GHASH 4k+3 mid
	shl	d8, d8, #56	// reduction constant

	aese	v1.16b, v21.16b	// AES 4k+5 r3
	aesmc	v1.16b, v1.16b
	eor	v11.16b, v11.16b, v29.16b	// GHASH 4k+3 low

	aese	v0.16b, v22.16b	// AES 4k+4 r4
	aesmc	v0.16b, v0.16b

	pmull	v28.1q, v9.1d, v8.1d	// reduce: high * constant
	eor	v10.16b, v10.16b, v9.16b	// reduce: Karatsuba, high into mid

	aese	v1.16b, v22.16b	// AES 4k+5 r4
	aesmc	v1.16b, v1.16b

	aese	v0.16b, v23.16b	// AES 4k+4 r5
	aesmc	v0.16b, v0.16b
	ext	v9.16b, v9.16b, v9.16b, #8	// reduce: swap halves of high

	aese	v3.16b, v21.16b	// AES 4k+7 r3
	aesmc	v3.16b, v3.16b

	aese	v2.16b, v22.16b	// AES 4k+6 r4
	aesmc	v2.16b, v2.16b
	eor	v10.16b, v10.16b, v11.16b	// reduce: Karatsuba, low into mid

	aese	v0.16b, v24.16b	// AES 4k+4 r6
	aesmc	v0.16b, v0.16b

	aese	v3.16b, v22.16b	// AES 4k+7 r4
	aesmc	v3.16b, v3.16b

	aese	v1.16b, v23.16b	// AES 4k+5 r5
	aesmc	v1.16b, v1.16b

	aese	v2.16b, v23.16b	// AES 4k+6 r5
	aesmc	v2.16b, v2.16b
	eor	v10.16b, v10.16b, v28.16b	// reduce: fold into mid

	aese	v3.16b, v23.16b	// AES 4k+7 r5
	aesmc	v3.16b, v3.16b

	aese	v1.16b, v24.16b	// AES 4k+5 r6
	aesmc	v1.16b, v1.16b

	aese	v2.16b, v24.16b	// AES 4k+6 r6
	aesmc	v2.16b, v2.16b

	aese	v3.16b, v24.16b	// AES 4k+7 r6
	aesmc	v3.16b, v3.16b
	eor	v10.16b, v10.16b, v9.16b	// reduce: fold into mid

	aese	v0.16b, v25.16b	// AES 4k+4 r7
	aesmc	v0.16b, v0.16b

	aese	v2.16b, v25.16b	// AES 4k+6 r7
	aesmc	v2.16b, v2.16b

	aese	v3.16b, v25.16b	// AES 4k+7 r7
	aesmc	v3.16b, v3.16b

	pmull	v28.1q, v10.1d, v8.1d	// reduce: mid * constant

	aese	v1.16b, v25.16b	// AES 4k+5 r7
	aesmc	v1.16b, v1.16b
	ext	v10.16b, v10.16b, v10.16b, #8	// reduce: swap halves of mid

	aese	v3.16b, v26.16b	// AES 4k+7 r8
	aesmc	v3.16b, v3.16b

	aese	v0.16b, v26.16b	// AES 4k+4 r8
	aesmc	v0.16b, v0.16b
	eor	v11.16b, v11.16b, v28.16b	// reduce: fold into low

	aese	v1.16b, v26.16b	// AES 4k+5 r8
	aesmc	v1.16b, v1.16b

	aese	v3.16b, v27.16b	// AES 4k+7 r9

	aese	v2.16b, v26.16b	// AES 4k+6 r8
	aesmc	v2.16b, v2.16b

	aese	v0.16b, v27.16b	// AES 4k+4 r9

	aese	v1.16b, v27.16b	// AES 4k+5 r9
	eor	v11.16b, v11.16b, v10.16b	// reduce: fold into low

	aese	v2.16b, v27.16b	// AES 4k+6 r9

	// ---- tail ----
	// x5 = bytes left (x4 - x0). v0-v3 hold round-9 state for up to
	// four blocks. Each unused block takes one off w12 so that the
	// counter stored on exit counts only blocks actually consumed, and
	// the keystream registers are shifted so the last block uses v3.
	//
	// NOTE(review): every block here, including a possibly partial
	// last one, is read with a 16-byte ldp and written with a 16-byte
	// st1 (existing output bytes are merged back with bif). This
	// presumably relies on callers making those 16 bytes addressable
	// - confirm.
.L128_enc_tail:

	sub	x5, x4, x0	// bytes left to process
	ldp	x6, x7, [x0], #16	// plaintext, first tail block
#ifdef __AARCH64EB__
	rev	x6, x6
	rev	x7, x7
#endif
	cmp	x5, #48

	ext	v8.16b, v11.16b, v11.16b, #8	// running tag, halves swapped, for feed-in
	eor	x6, x6, x13	// ^ rk10 low
	eor	x7, x7, x14	// ^ rk10 high

	fmov	d4, x6

	fmov	v4.d[1], x7

	eor	v5.16b, v4.16b, v0.16b	// ciphertext, first tail block

	b.gt	.L128_enc_blocks_more_than_3

	sub	w12, w12, #1	// one block fewer than prepared
	movi	v11.8b, #0	// accumulators start empty on this path
	mov	v3.16b, v2.16b

	cmp	x5, #32
	mov	v2.16b, v1.16b
	movi	v9.8b, #0

	movi	v10.8b, #0
	b.gt	.L128_enc_blocks_more_than_2

	mov	v3.16b, v1.16b
	cmp	x5, #16

	sub	w12, w12, #1
	b.gt	.L128_enc_blocks_more_than_1

	sub	w12, w12, #1
	b	.L128_enc_blocks_less_than_1
.L128_enc_blocks_more_than_3:	// more than 3 blocks left
	st1	{v5.16b}, [x2], #16	// store final-3 block

	ldp	x6, x7, [x0], #16	// plaintext, final-2 block
#ifdef __AARCH64EB__
	rev	x6, x6
	rev	x7, x7
#endif
	rev64	v4.16b, v5.16b	// GHASH final-3 byte order

	eor	v4.16b, v4.16b, v8.16b	// feed in running tag
	eor	x7, x7, x14	// final-2 ^ rk10 high
	eor	x6, x6, x13	// final-2 ^ rk10 low

	fmov	d5, x6

	movi	v8.8b, #0	// tag is fed in once only
	fmov	v5.d[1], x7

	pmull	v11.1q, v4.1d, v15.1d	// GHASH final-3 low
	mov	d22, v4.d[1]	// GHASH final-3 mid

	pmull2	v9.1q, v4.2d, v15.2d	// GHASH final-3 high

	mov	d10, v17.d[1]	// GHASH final-3 mid

	eor	v5.16b, v5.16b, v1.16b	// ciphertext, final-2 block
	eor	v22.8b, v22.8b, v4.8b	// GHASH final-3 mid

	pmull	v10.1q, v22.1d, v10.1d	// GHASH final-3 mid
.L128_enc_blocks_more_than_2:	// more than 2 blocks left

	st1	{v5.16b}, [x2], #16	// store final-2 block

	rev64	v4.16b, v5.16b	// GHASH final-2 byte order
	ldp	x6, x7, [x0], #16	// plaintext, final-1 block
#ifdef __AARCH64EB__
	rev	x6, x6
	rev	x7, x7
#endif
	eor	v4.16b, v4.16b, v8.16b	// feed in running tag (zero if already used)

	eor	x6, x6, x13	// final-1 ^ rk10 low

	fmov	d5, x6
	eor	x7, x7, x14	// final-1 ^ rk10 high

	pmull2	v20.1q, v4.2d, v14.2d	// GHASH final-2 high
	fmov	v5.d[1], x7

	mov	d22, v4.d[1]	// GHASH final-2 mid

	pmull	v21.1q, v4.1d, v14.1d	// GHASH final-2 low

	eor	v9.16b, v9.16b, v20.16b	// GHASH final-2 high

	eor	v22.8b, v22.8b, v4.8b	// GHASH final-2 mid

	eor	v5.16b, v5.16b, v2.16b	// ciphertext, final-1 block

	eor	v11.16b, v11.16b, v21.16b	// GHASH final-2 low

	pmull	v22.1q, v22.1d, v17.1d	// GHASH final-2 mid

	movi	v8.8b, #0	// tag is fed in once only

	eor	v10.16b, v10.16b, v22.16b	// GHASH final-2 mid
.L128_enc_blocks_more_than_1:	// more than 1 block left

	st1	{v5.16b}, [x2], #16	// store final-1 block

	rev64	v4.16b, v5.16b	// GHASH final-1 byte order
	ldp	x6, x7, [x0], #16	// plaintext, final block
#ifdef __AARCH64EB__
	rev	x6, x6
	rev	x7, x7
#endif
	eor	v4.16b, v4.16b, v8.16b	// feed in running tag (zero if already used)

	eor	x7, x7, x14	// final ^ rk10 high
	eor	x6, x6, x13	// final ^ rk10 low

	fmov	d5, x6

	pmull2	v20.1q, v4.2d, v13.2d	// GHASH final-1 high
	fmov	v5.d[1], x7

	mov	d22, v4.d[1]	// GHASH final-1 mid

	pmull	v21.1q, v4.1d, v13.1d	// GHASH final-1 low

	eor	v22.8b, v22.8b, v4.8b	// GHASH final-1 mid

	eor	v5.16b, v5.16b, v3.16b	// ciphertext, final block

	ins	v22.d[1], v22.d[0]	// GHASH final-1 mid

	pmull2	v22.1q, v22.2d, v16.2d	// GHASH final-1 mid

	eor	v11.16b, v11.16b, v21.16b	// GHASH final-1 low

	eor	v9.16b, v9.16b, v20.16b	// GHASH final-1 high

	eor	v10.16b, v10.16b, v22.16b	// GHASH final-1 mid
	movi	v8.8b, #0	// tag is fed in once only
.L128_enc_blocks_less_than_1:	// last block, possibly partial

	// Build a 128-bit mask in x6:x7 from the bit length. rk10 is no
	// longer needed, so x13/x14 are reused as all-ones sources.
	and	x1, x1, #127	// bit_length mod 128
	mvn	x13, xzr	// all ones

	mvn	x14, xzr	// all ones
	sub	x1, x1, #128

	neg	x1, x1	// 128 - (bit_length mod 128)

	and	x1, x1, #127	// number of unused bits in the last block

	lsr	x14, x14, x1	// register shift uses the amount mod 64
	cmp	x1, #64

	csel	x6, x13, x14, lt	// low half: all ones if < 64 unused bits
	csel	x7, x14, xzr, lt	// high half: zero if >= 64 unused bits

	fmov	d0, x6	// v0 = mask for the last block

	fmov	v0.d[1], x7

	and	v5.16b, v5.16b, v0.16b	// clear bytes past the message end

	rev64	v4.16b, v5.16b	// GHASH final byte order

	eor	v4.16b, v4.16b, v8.16b	// feed in running tag (zero if already used)

	mov	d8, v4.d[1]	// GHASH final mid

	pmull	v21.1q, v4.1d, v12.1d	// GHASH final low
	ld1	{v18.16b}, [x2]	// bytes already at the destination

	eor	v8.8b, v8.8b, v4.8b	// GHASH final mid
#ifndef __AARCH64EB__
	rev	w9, w12	// counter back to stored byte order
#else
	mov	w9, w12
#endif
	pmull2	v20.1q, v4.2d, v12.2d	// GHASH final high

	pmull	v8.1q, v8.1d, v16.1d	// GHASH final mid

	eor	v11.16b, v11.16b, v21.16b	// GHASH final low

	eor	v9.16b, v9.16b, v20.16b	// GHASH final high

	eor	v10.16b, v10.16b, v8.16b	// GHASH final mid
	movi	v8.8b, #0xc2

	eor	v30.16b, v11.16b, v9.16b	// reduce: Karatsuba low ^ high

	shl	d8, d8, #56	// reduction constant

	eor	v10.16b, v10.16b, v30.16b	// reduce: Karatsuba into mid

	pmull	v31.1q, v9.1d, v8.1d	// reduce: top 64b aligned with mid

	ext	v9.16b, v9.16b, v9.16b, #8	// reduce: swap halves of high

	eor	v10.16b, v10.16b, v31.16b	// reduce: fold into mid

	eor	v10.16b, v10.16b, v9.16b	// reduce: fold into mid

	pmull	v9.1q, v10.1d, v8.1d	// reduce: mid 64b aligned with low

	ext	v10.16b, v10.16b, v10.16b, #8	// reduce: swap halves of mid

	bif	v5.16b, v18.16b, v0.16b	// keep destination bytes where mask is 0

	eor	v11.16b, v11.16b, v9.16b	// reduce: fold into low
	st1	{v5.16b}, [x2]	// store the whole 16 bytes

	str	w9, [x16, #12]	// write back the updated counter word

	eor	v11.16b, v11.16b, v10.16b	// reduce: fold into low
	ext	v11.16b, v11.16b, v11.16b, #8	// undo the load-time layout
	rev64	v11.16b, v11.16b
	mov	x0, x15	// return byte_len
	st1	{v11.16b}, [x3]	// write back the running tag
	ldp	x21, x22, [sp, #16]
	ldp	x23, x24, [sp, #32]
	ldp	d8, d9, [sp, #48]
	ldp	d10, d11, [sp, #64]
	ldp	d12, d13, [sp, #80]
	ldp	d14, d15, [sp, #96]
	ldp	x19, x20, [sp], #112	// close frame
	ret

.L128_enc_ret:
	mov	w0, #0x0	// zero-length input: return 0
	ret
.size	aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
.globl	aes_gcm_dec_128_kernel
.type	aes_gcm_dec_128_kernel,%function
.align	4
aes_gcm_dec_128_kernel:
	cbz	x1, .L128_dec_ret
	stp	x19, x20, [sp, #-112]!
	mov	x16, x4
	mov	x8, x5
	stp	x21, x22, [sp, #16]
	stp	x23, x24, [sp, #32]
	stp	d8, d9, [sp, #48]
	stp	d10, d11, [sp, #64]
	stp	d12, d13, [sp, #80]
	stp	d14, d15, [sp, #96]

	lsr	x5, x1, #3	//byte_len
	mov	x15, x5
	ldp	x10, x11, [x16]	//ctr96_b64, ctr96_t32
#ifdef __AARCH64EB__
	rev	x10, x10
	rev	x11, x11
#endif
	ldp	x13, x14, [x8, #160]	//load rk10
#ifdef __AARCH64EB__
	ror	x14, x14, 32
	ror	x13, x13, 32
#endif
	sub	x5, x5, #1	//byte_len - 1
	ld1	{v18.4s}, [x8], #16	//load rk0

	and	x5, x5, #0xffffffffffffffc0	//number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
	ld1	{ v0.16b}, [x16]	//special case vector load initial counter so we can start first AES block as quickly as possible

	ldr	q13, [x3, #64]	//load h2l | h2h
#ifndef __AARCH64EB__
	ext	v13.16b, v13.16b, v13.16b, #8
#endif
	lsr	x12, x11, #32
	fmov	d2, x10	//CTR block 2

	ld1	{v19.4s}, [x8], #16	//load rk1
	orr	w11, w11, w11
	rev	w12, w12	//rev_ctr32

	fmov	d1, x10	//CTR block 1
	add	w12, w12, #1	//increment rev_ctr32

	aese	v0.16b, v18.16b
	aesmc	v0.16b, v0.16b	//AES block 0 - round 0
	rev	w9, w12	//CTR block 1

	orr	x9, x11, x9, lsl #32	//CTR block 1
	ld1	{v20.4s}, [x8], #16	//load rk2
	add	w12, w12, #1	//CTR block 1

	fmov	v1.d[1], x9	//CTR block 1
	rev	w9, w12	//CTR block 2
	add	w12, w12, #1
//CTR block 2 1047 1048 aese v0.16b, v19.16b 1049 aesmc v0.16b, v0.16b //AES block 0 - round 1 1050 orr x9, x11, x9, lsl #32 //CTR block 2 1051 1052 fmov v2.d[1], x9 //CTR block 2 1053 rev w9, w12 //CTR block 3 1054 1055 fmov d3, x10 //CTR block 3 1056 orr x9, x11, x9, lsl #32 //CTR block 3 1057 add w12, w12, #1 //CTR block 3 1058 1059 fmov v3.d[1], x9 //CTR block 3 1060 add x4, x0, x1, lsr #3 //end_input_ptr 1061 1062 aese v1.16b, v18.16b 1063 aesmc v1.16b, v1.16b //AES block 1 - round 0 1064 ld1 {v21.4s}, [x8], #16 //load rk3 1065 1066 aese v0.16b, v20.16b 1067 aesmc v0.16b, v0.16b //AES block 0 - round 2 1068 ld1 {v22.4s}, [x8], #16 //load rk4 1069 1070 aese v2.16b, v18.16b 1071 aesmc v2.16b, v2.16b //AES block 2 - round 0 1072 ld1 {v23.4s}, [x8], #16 //load rk5 1073 1074 aese v1.16b, v19.16b 1075 aesmc v1.16b, v1.16b //AES block 1 - round 1 1076 ld1 {v24.4s}, [x8], #16 //load rk6 1077 1078 aese v3.16b, v18.16b 1079 aesmc v3.16b, v3.16b //AES block 3 - round 0 1080 1081 aese v2.16b, v19.16b 1082 aesmc v2.16b, v2.16b //AES block 2 - round 1 1083 1084 aese v1.16b, v20.16b 1085 aesmc v1.16b, v1.16b //AES block 1 - round 2 1086 1087 aese v3.16b, v19.16b 1088 aesmc v3.16b, v3.16b //AES block 3 - round 1 1089 ld1 { v11.16b}, [x3] 1090 ext v11.16b, v11.16b, v11.16b, #8 1091 rev64 v11.16b, v11.16b 1092 1093 aese v0.16b, v21.16b 1094 aesmc v0.16b, v0.16b //AES block 0 - round 3 1095 ld1 {v25.4s}, [x8], #16 //load rk7 1096 1097 aese v1.16b, v21.16b 1098 aesmc v1.16b, v1.16b //AES block 1 - round 3 1099 1100 aese v3.16b, v20.16b 1101 aesmc v3.16b, v3.16b //AES block 3 - round 2 1102 1103 aese v2.16b, v20.16b 1104 aesmc v2.16b, v2.16b //AES block 2 - round 2 1105 ld1 {v26.4s}, [x8], #16 //load rk8 1106 1107 aese v1.16b, v22.16b 1108 aesmc v1.16b, v1.16b //AES block 1 - round 4 1109 1110 aese v3.16b, v21.16b 1111 aesmc v3.16b, v3.16b //AES block 3 - round 3 1112 1113 aese v2.16b, v21.16b 1114 aesmc v2.16b, v2.16b //AES block 2 - round 3 1115 ldr q14, [x3, #80] //load h3l | 
h3h 1116#ifndef __AARCH64EB__ 1117 ext v14.16b, v14.16b, v14.16b, #8 1118#endif 1119 aese v0.16b, v22.16b 1120 aesmc v0.16b, v0.16b //AES block 0 - round 4 1121 ld1 {v27.4s}, [x8], #16 //load rk9 1122 1123 aese v1.16b, v23.16b 1124 aesmc v1.16b, v1.16b //AES block 1 - round 5 1125 1126 aese v2.16b, v22.16b 1127 aesmc v2.16b, v2.16b //AES block 2 - round 4 1128 1129 aese v3.16b, v22.16b 1130 aesmc v3.16b, v3.16b //AES block 3 - round 4 1131 1132 aese v0.16b, v23.16b 1133 aesmc v0.16b, v0.16b //AES block 0 - round 5 1134 1135 aese v2.16b, v23.16b 1136 aesmc v2.16b, v2.16b //AES block 2 - round 5 1137 ldr q12, [x3, #32] //load h1l | h1h 1138#ifndef __AARCH64EB__ 1139 ext v12.16b, v12.16b, v12.16b, #8 1140#endif 1141 aese v3.16b, v23.16b 1142 aesmc v3.16b, v3.16b //AES block 3 - round 5 1143 1144 aese v0.16b, v24.16b 1145 aesmc v0.16b, v0.16b //AES block 0 - round 6 1146 1147 aese v1.16b, v24.16b 1148 aesmc v1.16b, v1.16b //AES block 1 - round 6 1149 1150 aese v3.16b, v24.16b 1151 aesmc v3.16b, v3.16b //AES block 3 - round 6 1152 1153 aese v2.16b, v24.16b 1154 aesmc v2.16b, v2.16b //AES block 2 - round 6 1155 trn1 v8.2d, v12.2d, v13.2d //h2h | h1h 1156 1157 ldr q15, [x3, #112] //load h4l | h4h 1158#ifndef __AARCH64EB__ 1159 ext v15.16b, v15.16b, v15.16b, #8 1160#endif 1161 trn2 v16.2d, v12.2d, v13.2d //h2l | h1l 1162 add x5, x5, x0 1163 1164 aese v1.16b, v25.16b 1165 aesmc v1.16b, v1.16b //AES block 1 - round 7 1166 1167 aese v2.16b, v25.16b 1168 aesmc v2.16b, v2.16b //AES block 2 - round 7 1169 1170 aese v0.16b, v25.16b 1171 aesmc v0.16b, v0.16b //AES block 0 - round 7 1172 eor v16.16b, v16.16b, v8.16b //h2k | h1k 1173 1174 aese v3.16b, v25.16b 1175 aesmc v3.16b, v3.16b //AES block 3 - round 7 1176 1177 aese v1.16b, v26.16b 1178 aesmc v1.16b, v1.16b //AES block 1 - round 8 1179 trn2 v17.2d, v14.2d, v15.2d //h4l | h3l 1180 1181 aese v2.16b, v26.16b 1182 aesmc v2.16b, v2.16b //AES block 2 - round 8 1183 1184 aese v3.16b, v26.16b 1185 aesmc v3.16b, v3.16b //AES block 3 - 
round 8 1186 1187 aese v0.16b, v26.16b 1188 aesmc v0.16b, v0.16b //AES block 0 - round 8 1189 trn1 v9.2d, v14.2d, v15.2d //h4h | h3h 1190 1191 aese v2.16b, v27.16b //AES block 2 - round 9 1192 1193 aese v3.16b, v27.16b //AES block 3 - round 9 1194 1195 aese v0.16b, v27.16b //AES block 0 - round 9 1196 cmp x0, x5 //check if we have <= 4 blocks 1197 1198 aese v1.16b, v27.16b //AES block 1 - round 9 1199 eor v17.16b, v17.16b, v9.16b //h4k | h3k 1200 b.ge .L128_dec_tail //handle tail 1201 1202 ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0 - load ciphertext; AES block 1 - load ciphertext 1203 1204 eor v1.16b, v5.16b, v1.16b //AES block 1 - result 1205 ld1 {v6.16b}, [x0], #16 //AES block 2 - load ciphertext 1206 1207 eor v0.16b, v4.16b, v0.16b //AES block 0 - result 1208 rev64 v4.16b, v4.16b //GHASH block 0 1209 rev w9, w12 //CTR block 4 1210 1211 orr x9, x11, x9, lsl #32 //CTR block 4 1212 add w12, w12, #1 //CTR block 4 1213 ld1 {v7.16b}, [x0], #16 //AES block 3 - load ciphertext 1214 1215 rev64 v5.16b, v5.16b //GHASH block 1 1216 mov x19, v1.d[0] //AES block 1 - mov low 1217 1218 mov x20, v1.d[1] //AES block 1 - mov high 1219 1220 mov x6, v0.d[0] //AES block 0 - mov low 1221 cmp x0, x5 //check if we have <= 8 blocks 1222 1223 mov x7, v0.d[1] //AES block 0 - mov high 1224 1225 fmov d0, x10 //CTR block 4 1226 1227 fmov v0.d[1], x9 //CTR block 4 1228 rev w9, w12 //CTR block 5 1229 eor x19, x19, x13 //AES block 1 - round 10 low 1230#ifdef __AARCH64EB__ 1231 rev x19, x19 1232#endif 1233 fmov d1, x10 //CTR block 5 1234 add w12, w12, #1 //CTR block 5 1235 orr x9, x11, x9, lsl #32 //CTR block 5 1236 1237 fmov v1.d[1], x9 //CTR block 5 1238 rev w9, w12 //CTR block 6 1239 add w12, w12, #1 //CTR block 6 1240 1241 orr x9, x11, x9, lsl #32 //CTR block 6 1242 1243 eor x20, x20, x14 //AES block 1 - round 10 high 1244#ifdef __AARCH64EB__ 1245 rev x20, x20 1246#endif 1247 eor x6, x6, x13 //AES block 0 - round 10 low 1248#ifdef __AARCH64EB__ 1249 rev x6, x6 1250#endif 1251 eor v2.16b, 
v6.16b, v2.16b //AES block 2 - result 1252 1253 eor x7, x7, x14 //AES block 0 - round 10 high 1254#ifdef __AARCH64EB__ 1255 rev x7, x7 1256#endif 1257 stp x6, x7, [x2], #16 //AES block 0 - store result 1258 1259 stp x19, x20, [x2], #16 //AES block 1 - store result 1260 b.ge .L128_dec_prepretail //do prepretail 1261 1262.L128_dec_main_loop: //main loop start 1263 eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result 1264 ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 1265 mov x21, v2.d[0] //AES block 4k+2 - mov low 1266 1267 pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 1268 mov x22, v2.d[1] //AES block 4k+2 - mov high 1269 1270 aese v1.16b, v18.16b 1271 aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 1272 fmov d2, x10 //CTR block 4k+6 1273 1274 rev64 v6.16b, v6.16b //GHASH block 4k+2 1275 fmov v2.d[1], x9 //CTR block 4k+6 1276 rev w9, w12 //CTR block 4k+7 1277 1278 mov x23, v3.d[0] //AES block 4k+3 - mov low 1279 eor v4.16b, v4.16b, v11.16b //PRE 1 1280 mov d30, v5.d[1] //GHASH block 4k+1 - mid 1281 1282 aese v1.16b, v19.16b 1283 aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 1284 rev64 v7.16b, v7.16b //GHASH block 4k+3 1285 1286 pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 1287 mov x24, v3.d[1] //AES block 4k+3 - mov high 1288 orr x9, x11, x9, lsl #32 //CTR block 4k+7 1289 1290 pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 1291 fmov d3, x10 //CTR block 4k+7 1292 eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid 1293 1294 aese v1.16b, v20.16b 1295 aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 1296 fmov v3.d[1], x9 //CTR block 4k+7 1297 1298 aese v2.16b, v18.16b 1299 aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 1300 mov d10, v17.d[1] //GHASH block 4k - mid 1301 1302 pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 1303 eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low 1304 1305 pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 1306 1307 aese v1.16b, v21.16b 1308 aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 1309 mov d8, v4.d[1] 
//GHASH block 4k - mid 1310 1311 aese v3.16b, v18.16b 1312 aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 1313 eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high 1314 1315 aese v0.16b, v18.16b 1316 aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 1317 1318 pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 1319 eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 1320 1321 aese v3.16b, v19.16b 1322 aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 1323 eor x23, x23, x13 //AES block 4k+3 - round 10 low 1324#ifdef __AARCH64EB__ 1325 rev x23, x23 1326#endif 1327 pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid 1328 eor x22, x22, x14 //AES block 4k+2 - round 10 high 1329#ifdef __AARCH64EB__ 1330 rev x22, x22 1331#endif 1332 mov d31, v6.d[1] //GHASH block 4k+2 - mid 1333 1334 aese v0.16b, v19.16b 1335 aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 1336 eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low 1337 1338 pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 1339 1340 aese v3.16b, v20.16b 1341 aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 1342 eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid 1343 1344 aese v0.16b, v20.16b 1345 aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 1346 1347 aese v1.16b, v22.16b 1348 aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 1349 eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid 1350 1351 pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 1352 1353 aese v0.16b, v21.16b 1354 aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 1355 ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid 1356 1357 pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 1358 1359 aese v2.16b, v19.16b 1360 aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 1361 mov d30, v7.d[1] //GHASH block 4k+3 - mid 1362 1363 aese v0.16b, v22.16b 1364 aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 1365 eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high 1366 1367 pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid 1368 eor x24, x24, x14 //AES block 4k+3 - round 
10 high 1369#ifdef __AARCH64EB__ 1370 rev x24, x24 1371#endif 1372 aese v2.16b, v20.16b 1373 aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 1374 eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid 1375 1376 aese v1.16b, v23.16b 1377 aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 1378 eor x21, x21, x13 //AES block 4k+2 - round 10 low 1379#ifdef __AARCH64EB__ 1380 rev x21, x21 1381#endif 1382 aese v0.16b, v23.16b 1383 aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 1384 movi v8.8b, #0xc2 1385 1386 aese v2.16b, v21.16b 1387 aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 1388 eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low 1389 1390 aese v1.16b, v24.16b 1391 aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 1392 1393 aese v0.16b, v24.16b 1394 aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 1395 eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid 1396 1397 aese v2.16b, v22.16b 1398 aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 1399 stp x21, x22, [x2], #16 //AES block 4k+2 - store result 1400 1401 pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid 1402 eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high 1403 ld1 {v4.16b}, [x0], #16 //AES block 4k+3 - load ciphertext 1404 1405 aese v1.16b, v25.16b 1406 aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 1407 add w12, w12, #1 //CTR block 4k+7 1408 1409 aese v0.16b, v25.16b 1410 aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 1411 shl d8, d8, #56 //mod_constant 1412 1413 aese v2.16b, v23.16b 1414 aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 1415 eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid 1416 1417 aese v1.16b, v26.16b 1418 aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 1419 stp x23, x24, [x2], #16 //AES block 4k+3 - store result 1420 1421 aese v0.16b, v26.16b 1422 aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 1423 eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 1424 1425 aese v3.16b, v21.16b 1426 aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 1427 rev w9, w12 //CTR block 4k+8 1428 
1429 pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 1430 ld1 {v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext 1431 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 1432 1433 aese v0.16b, v27.16b //AES block 4k+4 - round 9 1434 orr x9, x11, x9, lsl #32 //CTR block 4k+8 1435 1436 aese v3.16b, v22.16b 1437 aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 1438 eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 1439 1440 aese v1.16b, v27.16b //AES block 4k+5 - round 9 1441 1442 aese v2.16b, v24.16b 1443 aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 1444 eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result 1445 1446 aese v3.16b, v23.16b 1447 aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 1448 ld1 {v6.16b}, [x0], #16 //AES block 4k+5 - load ciphertext 1449 1450 add w12, w12, #1 //CTR block 4k+8 1451 eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 1452 eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result 1453 1454 aese v2.16b, v25.16b 1455 aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 1456 ld1 {v7.16b}, [x0], #16 //AES block 4k+6 - load ciphertext 1457 1458 aese v3.16b, v24.16b 1459 aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 1460 1461 rev64 v5.16b, v5.16b //GHASH block 4k+5 1462 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 1463 mov x7, v0.d[1] //AES block 4k+4 - mov high 1464 1465 aese v2.16b, v26.16b 1466 aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 1467 mov x6, v0.d[0] //AES block 4k+4 - mov low 1468 1469 aese v3.16b, v25.16b 1470 aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 1471 fmov d0, x10 //CTR block 4k+8 1472 1473 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 1474 fmov v0.d[1], x9 //CTR block 4k+8 1475 rev w9, w12 //CTR block 4k+9 1476 1477 aese v2.16b, v27.16b //AES block 4k+6 - round 9 1478 orr x9, x11, x9, lsl #32 //CTR block 4k+9 1479 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 1480 1481 aese v3.16b, v26.16b 1482 aesmc v3.16b, v3.16b //AES block 4k+7 - 
round 8 1483 eor x7, x7, x14 //AES block 4k+4 - round 10 high 1484#ifdef __AARCH64EB__ 1485 rev x7, x7 1486#endif 1487 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 1488 mov x20, v1.d[1] //AES block 4k+5 - mov high 1489 eor x6, x6, x13 //AES block 4k+4 - round 10 low 1490#ifdef __AARCH64EB__ 1491 rev x6, x6 1492#endif 1493 eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result 1494 mov x19, v1.d[0] //AES block 4k+5 - mov low 1495 add w12, w12, #1 //CTR block 4k+9 1496 1497 aese v3.16b, v27.16b //AES block 4k+7 - round 9 1498 fmov d1, x10 //CTR block 4k+9 1499 cmp x0, x5 //.LOOP CONTROL 1500 1501 rev64 v4.16b, v4.16b //GHASH block 4k+4 1502 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 1503 fmov v1.d[1], x9 //CTR block 4k+9 1504 1505 rev w9, w12 //CTR block 4k+10 1506 add w12, w12, #1 //CTR block 4k+10 1507 1508 eor x20, x20, x14 //AES block 4k+5 - round 10 high 1509#ifdef __AARCH64EB__ 1510 rev x20, x20 1511#endif 1512 stp x6, x7, [x2], #16 //AES block 4k+4 - store result 1513 1514 eor x19, x19, x13 //AES block 4k+5 - round 10 low 1515#ifdef __AARCH64EB__ 1516 rev x19, x19 1517#endif 1518 stp x19, x20, [x2], #16 //AES block 4k+5 - store result 1519 1520 orr x9, x11, x9, lsl #32 //CTR block 4k+10 1521 b.lt .L128_dec_main_loop 1522 1523.L128_dec_prepretail: //PREPRETAIL 1524 ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 1525 mov x21, v2.d[0] //AES block 4k+2 - mov low 1526 mov d30, v5.d[1] //GHASH block 4k+1 - mid 1527 1528 aese v0.16b, v18.16b 1529 aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 1530 eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result 1531 1532 aese v1.16b, v18.16b 1533 aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 1534 mov x22, v2.d[1] //AES block 4k+2 - mov high 1535 1536 eor v4.16b, v4.16b, v11.16b //PRE 1 1537 fmov d2, x10 //CTR block 4k+6 1538 rev64 v6.16b, v6.16b //GHASH block 4k+2 1539 1540 aese v0.16b, v19.16b 1541 aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 1542 fmov v2.d[1], x9 //CTR block 4k+6 1543 1544 rev w9, w12 //CTR 
block 4k+7 1545 mov x23, v3.d[0] //AES block 4k+3 - mov low 1546 eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid 1547 1548 pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 1549 mov d10, v17.d[1] //GHASH block 4k - mid 1550 mov x24, v3.d[1] //AES block 4k+3 - mov high 1551 1552 aese v1.16b, v19.16b 1553 aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 1554 mov d31, v6.d[1] //GHASH block 4k+2 - mid 1555 1556 aese v0.16b, v20.16b 1557 aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 1558 orr x9, x11, x9, lsl #32 //CTR block 4k+7 1559 1560 pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 1561 mov d8, v4.d[1] //GHASH block 4k - mid 1562 fmov d3, x10 //CTR block 4k+7 1563 1564 aese v2.16b, v18.16b 1565 aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 1566 fmov v3.d[1], x9 //CTR block 4k+7 1567 1568 pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid 1569 eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid 1570 1571 rev64 v7.16b, v7.16b //GHASH block 4k+3 1572 1573 aese v2.16b, v19.16b 1574 aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 1575 eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 1576 1577 pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 1578 1579 aese v3.16b, v18.16b 1580 aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 1581 ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid 1582 1583 pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 1584 1585 pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 1586 eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low 1587 1588 pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 1589 1590 pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid 1591 eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high 1592 1593 eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid 1594 1595 pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 1596 1597 pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 1598 mov d30, v7.d[1] //GHASH block 4k+3 - mid 1599 1600 aese v1.16b, v20.16b 1601 aesmc v1.16b, v1.16b //AES block 4k+5 - round 
2 1602 eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid 1603 1604 pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 1605 1606 eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high 1607 movi v8.8b, #0xc2 1608 1609 aese v3.16b, v19.16b 1610 aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 1611 eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid 1612 1613 eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low 1614 1615 aese v2.16b, v20.16b 1616 aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 1617 eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high 1618 1619 aese v3.16b, v20.16b 1620 aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 1621 eor x23, x23, x13 //AES block 4k+3 - round 10 low 1622#ifdef __AARCH64EB__ 1623 rev x23, x23 1624#endif 1625 pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid 1626 eor x21, x21, x13 //AES block 4k+2 - round 10 low 1627#ifdef __AARCH64EB__ 1628 rev x21, x21 1629#endif 1630 eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low 1631 1632 aese v2.16b, v21.16b 1633 aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 1634 1635 aese v1.16b, v21.16b 1636 aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 1637 shl d8, d8, #56 //mod_constant 1638 1639 aese v0.16b, v21.16b 1640 aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 1641 1642 aese v2.16b, v22.16b 1643 aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 1644 eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid 1645 1646 aese v1.16b, v22.16b 1647 aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 1648 1649 aese v3.16b, v21.16b 1650 aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 1651 eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 1652 1653 aese v2.16b, v23.16b 1654 aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 1655 1656 aese v1.16b, v23.16b 1657 aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 1658 1659 aese v3.16b, v22.16b 1660 aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 1661 1662 aese v0.16b, v22.16b 1663 aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 1664 eor v10.16b, 
v10.16b, v30.16b //MODULO - karatsuba tidy up 1665 1666 pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 1667 1668 aese v1.16b, v24.16b 1669 aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 1670 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 1671 1672 aese v3.16b, v23.16b 1673 aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 1674 1675 aese v0.16b, v23.16b 1676 aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 1677 eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 1678 1679 aese v1.16b, v25.16b 1680 aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 1681 1682 aese v2.16b, v24.16b 1683 aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 1684 1685 aese v0.16b, v24.16b 1686 aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 1687 1688 aese v1.16b, v26.16b 1689 aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 1690 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 1691 1692 aese v3.16b, v24.16b 1693 aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 1694 1695 aese v0.16b, v25.16b 1696 aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 1697 1698 aese v1.16b, v27.16b //AES block 4k+5 - round 9 1699 1700 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 1701 eor x24, x24, x14 //AES block 4k+3 - round 10 high 1702#ifdef __AARCH64EB__ 1703 rev x24, x24 1704#endif 1705 aese v2.16b, v25.16b 1706 aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 1707 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 1708 1709 aese v3.16b, v25.16b 1710 aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 1711 1712 aese v0.16b, v26.16b 1713 aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 1714 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 1715 1716 aese v2.16b, v26.16b 1717 aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 1718 1719 aese v3.16b, v26.16b 1720 aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 1721 eor x22, x22, x14 //AES block 4k+2 - round 10 high 1722#ifdef __AARCH64EB__ 1723 rev x22, x22 1724#endif 1725 aese v0.16b, v27.16b //AES block 4k+4 - 
round 9 1726 stp x21, x22, [x2], #16 //AES block 4k+2 - store result 1727 1728 aese v2.16b, v27.16b //AES block 4k+6 - round 9 1729 add w12, w12, #1 //CTR block 4k+7 1730 stp x23, x24, [x2], #16 //AES block 4k+3 - store result 1731 1732 aese v3.16b, v27.16b //AES block 4k+7 - round 9 1733 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 1734.L128_dec_tail: //TAIL 1735 1736 sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process 1737 ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext 1738 1739 eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result 1740 1741 mov x7, v0.d[1] //AES block 4k+4 - mov high 1742 1743 mov x6, v0.d[0] //AES block 4k+4 - mov low 1744 1745 cmp x5, #48 1746 1747 eor x7, x7, x14 //AES block 4k+4 - round 10 high 1748#ifdef __AARCH64EB__ 1749 rev x7, x7 1750#endif 1751 ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag 1752 eor x6, x6, x13 //AES block 4k+4 - round 10 low 1753#ifdef __AARCH64EB__ 1754 rev x6, x6 1755#endif 1756 b.gt .L128_dec_blocks_more_than_3 1757 1758 mov v3.16b, v2.16b 1759 sub w12, w12, #1 1760 movi v11.8b, #0 1761 1762 movi v9.8b, #0 1763 mov v2.16b, v1.16b 1764 1765 movi v10.8b, #0 1766 cmp x5, #32 1767 b.gt .L128_dec_blocks_more_than_2 1768 1769 cmp x5, #16 1770 1771 mov v3.16b, v1.16b 1772 sub w12, w12, #1 1773 b.gt .L128_dec_blocks_more_than_1 1774 1775 sub w12, w12, #1 1776 b .L128_dec_blocks_less_than_1 1777.L128_dec_blocks_more_than_3: //blocks left > 3 1778 rev64 v4.16b, v5.16b //GHASH final-3 block 1779 ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext 1780 1781 eor v4.16b, v4.16b, v8.16b //feed in partial tag 1782 1783 mov d10, v17.d[1] //GHASH final-3 block - mid 1784 stp x6, x7, [x2], #16 //AES final-3 block - store result 1785 eor v0.16b, v5.16b, v1.16b //AES final-2 block - result 1786 1787 mov d22, v4.d[1] //GHASH final-3 block - mid 1788 mov x7, v0.d[1] //AES final-2 block - mov high 1789 1790 pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low 1791 mov 
x6, v0.d[0] //AES final-2 block - mov low 1792 1793 pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high 1794 1795 eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid 1796 1797 movi v8.8b, #0 //suppress further partial tag feed in 1798 eor x7, x7, x14 //AES final-2 block - round 10 high 1799#ifdef __AARCH64EB__ 1800 rev x7, x7 1801#endif 1802 pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid 1803 eor x6, x6, x13 //AES final-2 block - round 10 low 1804#ifdef __AARCH64EB__ 1805 rev x6, x6 1806#endif 1807.L128_dec_blocks_more_than_2: //blocks left > 2 1808 1809 rev64 v4.16b, v5.16b //GHASH final-2 block 1810 ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext 1811 1812 eor v4.16b, v4.16b, v8.16b //feed in partial tag 1813 1814 eor v0.16b, v5.16b, v2.16b //AES final-1 block - result 1815 stp x6, x7, [x2], #16 //AES final-2 block - store result 1816 1817 mov d22, v4.d[1] //GHASH final-2 block - mid 1818 1819 pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low 1820 1821 pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high 1822 mov x6, v0.d[0] //AES final-1 block - mov low 1823 1824 mov x7, v0.d[1] //AES final-1 block - mov high 1825 eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid 1826 1827 movi v8.8b, #0 //suppress further partial tag feed in 1828 1829 pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid 1830 1831 eor x6, x6, x13 //AES final-1 block - round 10 low 1832#ifdef __AARCH64EB__ 1833 rev x6, x6 1834#endif 1835 eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low 1836 1837 eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high 1838 1839 eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid 1840 eor x7, x7, x14 //AES final-1 block - round 10 high 1841#ifdef __AARCH64EB__ 1842 rev x7, x7 1843#endif 1844.L128_dec_blocks_more_than_1: //blocks left > 1 1845 1846 rev64 v4.16b, v5.16b //GHASH final-1 block 1847 1848 ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext 1849 eor v4.16b, v4.16b, v8.16b //feed 
in partial tag 1850 1851 mov d22, v4.d[1] //GHASH final-1 block - mid 1852 1853 eor v0.16b, v5.16b, v3.16b //AES final block - result 1854 1855 eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid 1856 1857 stp x6, x7, [x2], #16 //AES final-1 block - store result 1858 mov x6, v0.d[0] //AES final block - mov low 1859 1860 mov x7, v0.d[1] //AES final block - mov high 1861 ins v22.d[1], v22.d[0] //GHASH final-1 block - mid 1862 1863 pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low 1864 1865 pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high 1866 1867 pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid 1868 movi v8.8b, #0 //suppress further partial tag feed in 1869 1870 eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low 1871 1872 eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high 1873 eor x7, x7, x14 //AES final block - round 10 high 1874#ifdef __AARCH64EB__ 1875 rev x7, x7 1876#endif 1877 eor x6, x6, x13 //AES final block - round 10 low 1878#ifdef __AARCH64EB__ 1879 rev x6, x6 1880#endif 1881 eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid 1882.L128_dec_blocks_less_than_1: //blocks left <= 1 1883 1884 mvn x14, xzr //rk10_h = 0xffffffffffffffff 1885 and x1, x1, #127 //bit_length %= 128 1886 1887 mvn x13, xzr //rk10_l = 0xffffffffffffffff 1888 sub x1, x1, #128 //bit_length -= 128 1889 1890 neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) 1891 1892 and x1, x1, #127 //bit_length %= 128 1893 1894 lsr x14, x14, x1 //rk10_h is mask for top 64b of last block 1895 cmp x1, #64 1896 1897 csel x10, x14, xzr, lt 1898 csel x9, x13, x14, lt 1899 1900 fmov d0, x9 //ctr0b is mask for last block 1901 1902 mov v0.d[1], x10 1903 1904 and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits 1905 1906 rev64 v4.16b, v5.16b //GHASH final block 1907 1908 eor v4.16b, v4.16b, v8.16b //feed in partial tag 1909 1910 ldp x4, x5, [x2] //load existing bytes we need to not overwrite 1911 1912 and x7, x7, x10 1913 
1914 pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high 1915 mov d8, v4.d[1] //GHASH final block - mid 1916 1917 eor v8.8b, v8.8b, v4.8b //GHASH final block - mid 1918 eor v9.16b, v9.16b, v20.16b //GHASH final block - high 1919 1920 pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid 1921 1922 pmull v21.1q, v4.1d, v12.1d //GHASH final block - low 1923 bic x4, x4, x9 //mask out low existing bytes 1924 and x6, x6, x9 1925 1926#ifndef __AARCH64EB__ 1927 rev w9, w12 1928#else 1929 mov w9, w12 1930#endif 1931 1932 eor v10.16b, v10.16b, v8.16b //GHASH final block - mid 1933 movi v8.8b, #0xc2 1934 1935 eor v11.16b, v11.16b, v21.16b //GHASH final block - low 1936 1937 bic x5, x5, x10 //mask out high existing bytes 1938 shl d8, d8, #56 //mod_constant 1939 1940 eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 1941 1942 pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 1943 1944 eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 1945 1946 orr x6, x6, x4 1947 str w9, [x16, #12] //store the updated counter 1948 1949 orr x7, x7, x5 1950 stp x6, x7, [x2] 1951 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 1952 1953 eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 1954 1955 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 1956 1957 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 1958 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 1959 1960 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 1961 1962 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 1963 ext v11.16b, v11.16b, v11.16b, #8 1964 rev64 v11.16b, v11.16b 1965 mov x0, x15 1966 st1 { v11.16b }, [x3] 1967 1968 ldp x21, x22, [sp, #16] 1969 ldp x23, x24, [sp, #32] 1970 ldp d8, d9, [sp, #48] 1971 ldp d10, d11, [sp, #64] 1972 ldp d12, d13, [sp, #80] 1973 ldp d14, d15, [sp, #96] 1974 ldp x19, x20, [sp], #112 1975 ret 1976 1977.L128_dec_ret: 1978 mov w0, #0x0 1979 ret 1980.size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel 
//-----------------------------------------------------------------------
// aes_gcm_enc_192_kernel
//
// CTR-mode encryption using 12 AESE rounds (rk0..rk11) followed by an XOR
// with rk12, interleaved with a GHASH update over the ciphertext that is
// produced. The main loop handles four 16-byte blocks (64 bytes) per
// iteration; a tail handles the last one to four blocks.
//
// Registers on entry, as read by the code below:
//   x0  input pointer (plaintext is loaded from here)
//   x1  length in bits (byte_len = x1 >> 3); if zero, returns 0 at once
//   x2  output pointer (results are stored here)
//   x3  GHASH data: 16-byte running tag at [x3] (loaded on entry, stored
//       back on exit); hash-key values are read from offsets #32, #64,
//       #80 and #112
//   x4  16-byte counter block; the 32-bit word at offset #12 is
//       overwritten with the updated counter before returning
//   x5  round keys: rk0..rk11 are loaded as consecutive 16-byte vectors,
//       rk12 is loaded from offset #192
//
// Returns: x0 = byte_len (0 when x1 == 0).
// Saves/restores x19-x24 and d8-d15 in a 112-byte stack frame.
//
// Long-lived registers inside the function:
//   x16 = counter block pointer, x8 = round-key cursor
//   x10 = low 64 bits of the counter block, x11 = next 32 bits (zero-extended)
//   w12 = byte-reversed 32-bit counter (rev_ctr32), x9 = scratch for the
//         upper 64 bits of the next counter block
//   x13/x14 = rk12 low/high, x15 = byte_len (return value)
//   x4  = end_input_ptr, x5 = main-loop end pointer (later: bytes left)
//   v18-v29 = rk0..rk11, v12-v15 = h1..h4, v16/v17 = h2k|h1k / h4k|h3k
//   v9/v10/v11 = GHASH high/mid/low accumulators, v8 = reduction constant
//
// NOTE: the tail always moves whole 16-byte blocks. The last input block is
// read with a 16-byte ldp, and the last output block is a 16-byte load and
// 16-byte store at [x2]; bytes beyond the requested bit length are kept
// from the existing output contents (see .L192_enc_blocks_less_than_1).
//-----------------------------------------------------------------------
.globl aes_gcm_enc_192_kernel
.type aes_gcm_enc_192_kernel,%function
.align 4
aes_gcm_enc_192_kernel:
	cbz x1, .L192_enc_ret
	stp x19, x20, [sp, #-112]!
	// Keep the counter-block and round-key pointers in x16/x8; x4 and x5
	// are reused further down for end_input_ptr and the loop bound.
	mov x16, x4
	mov x8, x5
	stp x21, x22, [sp, #16]
	stp x23, x24, [sp, #32]
	stp d8, d9, [sp, #48]
	stp d10, d11, [sp, #64]
	stp d12, d13, [sp, #80]
	stp d14, d15, [sp, #96]

	ldp x10, x11, [x16] //ctr96_b64, ctr96_t32
#ifdef __AARCH64EB__
	rev x10, x10
	rev x11, x11
#endif
	// rk12 is read relative to the still-unadvanced key pointer; x8 is only
	// post-incremented by the ld1 loads that follow.
	ldp x13, x14, [x8, #192] //load rk12
#ifdef __AARCH64EB__
	ror x13, x13, #32
	ror x14, x14, #32
#endif
	ld1 {v18.4s}, [x8], #16 //load rk0

	ld1 {v19.4s}, [x8], #16 //load rk1

	ld1 {v20.4s}, [x8], #16 //load rk2

	// x12 = upper 32 bits of x11 (the 32-bit counter word)
	lsr x12, x11, #32
	ld1 {v21.4s}, [x8], #16 //load rk3
	// writing w11 clears the upper 32 bits of x11, leaving only its low word
	orr w11, w11, w11

	ld1 {v22.4s}, [x8], #16 //load rk4
	rev w12, w12 //rev_ctr32

	add w12, w12, #1 //increment rev_ctr32
	fmov d3, x10 //CTR block 3

	// Counter blocks 1..3 are assembled as: low half = x10,
	// high half = x11 | (rev(w12) << 32).
	rev w9, w12 //CTR block 1
	add w12, w12, #1 //CTR block 1
	fmov d1, x10 //CTR block 1

	orr x9, x11, x9, lsl #32 //CTR block 1
	ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible

	fmov v1.d[1], x9 //CTR block 1
	rev w9, w12 //CTR block 2
	add w12, w12, #1 //CTR block 2

	fmov d2, x10 //CTR block 2
	orr x9, x11, x9, lsl #32 //CTR block 2

	fmov v2.d[1], x9 //CTR block 2
	rev w9, w12 //CTR block 3

	orr x9, x11, x9, lsl #32 //CTR block 3
	ld1 {v23.4s}, [x8], #16 //load rk5

	fmov v3.d[1], x9 //CTR block 3

	ld1 {v24.4s}, [x8], #16 //load rk6

	ld1 {v25.4s}, [x8], #16 //load rk7

	// AES rounds 0..11 for counter blocks 0..3 (v0-v3), interleaved with
	// loading the remaining round keys, the running tag and the hash keys.
	aese v0.16b, v18.16b
	aesmc v0.16b, v0.16b //AES block 0 - round 0
	// load the running tag from [x3] and rearrange it (swap 64-bit halves,
	// then byte-reverse each half) into the form used by the GHASH code
	ld1 { v11.16b}, [x3]
	ext v11.16b, v11.16b, v11.16b, #8
	rev64 v11.16b, v11.16b

	aese v3.16b, v18.16b
	aesmc v3.16b, v3.16b //AES block 3 - round 0
	ld1 {v26.4s}, [x8], #16 //load rk8

	aese v1.16b, v18.16b
	aesmc v1.16b, v1.16b //AES block 1 - round 0
	ldr q15, [x3, #112] //load h4l | h4h
#ifndef __AARCH64EB__
	ext v15.16b, v15.16b, v15.16b, #8
#endif
	aese v2.16b, v18.16b
	aesmc v2.16b, v2.16b //AES block 2 - round 0
	ld1 {v27.4s}, [x8], #16 //load rk9

	aese v0.16b, v19.16b
	aesmc v0.16b, v0.16b //AES block 0 - round 1
	ld1 {v28.4s}, [x8], #16 //load rk10

	aese v1.16b, v19.16b
	aesmc v1.16b, v1.16b //AES block 1 - round 1
	ldr q12, [x3, #32] //load h1l | h1h
#ifndef __AARCH64EB__
	ext v12.16b, v12.16b, v12.16b, #8
#endif
	aese v2.16b, v19.16b
	aesmc v2.16b, v2.16b //AES block 2 - round 1
	ld1 {v29.4s}, [x8], #16 //load rk11

	aese v3.16b, v19.16b
	aesmc v3.16b, v3.16b //AES block 3 - round 1
	ldr q14, [x3, #80] //load h3l | h3h
#ifndef __AARCH64EB__
	ext v14.16b, v14.16b, v14.16b, #8
#endif
	aese v0.16b, v20.16b
	aesmc v0.16b, v0.16b //AES block 0 - round 2

	aese v2.16b, v20.16b
	aesmc v2.16b, v2.16b //AES block 2 - round 2

	aese v3.16b, v20.16b
	aesmc v3.16b, v3.16b //AES block 3 - round 2

	aese v0.16b, v21.16b
	aesmc v0.16b, v0.16b //AES block 0 - round 3
	trn1 v9.2d, v14.2d, v15.2d //h4h | h3h

	aese v2.16b, v21.16b
	aesmc v2.16b, v2.16b //AES block 2 - round 3

	aese v1.16b, v20.16b
	aesmc v1.16b, v1.16b //AES block 1 - round 2
	trn2 v17.2d, v14.2d, v15.2d //h4l | h3l

	aese v0.16b, v22.16b
	aesmc v0.16b, v0.16b //AES block 0 - round 4

	aese v3.16b, v21.16b
	aesmc v3.16b, v3.16b //AES block 3 - round 3

	aese v1.16b, v21.16b
	aesmc v1.16b, v1.16b //AES block 1 - round 3

	aese v0.16b, v23.16b
	aesmc v0.16b, v0.16b //AES block 0 - round 5

	aese v2.16b, v22.16b
	aesmc v2.16b, v2.16b //AES block 2 - round 4

	aese v1.16b, v22.16b
	aesmc v1.16b, v1.16b //AES block 1 - round 4

	aese v0.16b, v24.16b
	aesmc v0.16b, v0.16b //AES block 0 - round 6

	aese v3.16b, v22.16b
	aesmc v3.16b, v3.16b //AES block 3 - round 4

	aese v2.16b, v23.16b
	aesmc v2.16b, v2.16b //AES block 2 - round 5

	aese v1.16b, v23.16b
	aesmc v1.16b, v1.16b //AES block 1 - round 5

	aese v3.16b, v23.16b
	aesmc v3.16b, v3.16b //AES block 3 - round 5

	aese v2.16b, v24.16b
	aesmc v2.16b, v2.16b //AES block 2 - round 6
	ldr q13, [x3, #64] //load h2l | h2h
#ifndef __AARCH64EB__
	ext v13.16b, v13.16b, v13.16b, #8
#endif
	aese v1.16b, v24.16b
	aesmc v1.16b, v1.16b //AES block 1 - round 6

	aese v3.16b, v24.16b
	aesmc v3.16b, v3.16b //AES block 3 - round 6

	aese v0.16b, v25.16b
	aesmc v0.16b, v0.16b //AES block 0 - round 7

	aese v1.16b, v25.16b
	aesmc v1.16b, v1.16b //AES block 1 - round 7
	trn2 v16.2d, v12.2d, v13.2d //h2l | h1l

	aese v3.16b, v25.16b
	aesmc v3.16b, v3.16b //AES block 3 - round 7

	aese v0.16b, v26.16b
	aesmc v0.16b, v0.16b //AES block 0 - round 8

	aese v2.16b, v25.16b
	aesmc v2.16b, v2.16b //AES block 2 - round 7
	trn1 v8.2d, v12.2d, v13.2d //h2h | h1h

	aese v1.16b, v26.16b
	aesmc v1.16b, v1.16b //AES block 1 - round 8

	aese v3.16b, v26.16b
	aesmc v3.16b, v3.16b //AES block 3 - round 8

	aese v2.16b, v26.16b
	aesmc v2.16b, v2.16b //AES block 2 - round 8

	aese v0.16b, v27.16b
	aesmc v0.16b, v0.16b //AES block 0 - round 9

	aese v3.16b, v27.16b
	aesmc v3.16b, v3.16b //AES block 3 - round 9

	aese v2.16b, v27.16b
	aesmc v2.16b, v2.16b //AES block 2 - round 9

	aese v1.16b, v27.16b
	aesmc v1.16b, v1.16b //AES block 1 - round 9

	aese v0.16b, v28.16b
	aesmc v0.16b, v0.16b //AES block 0 - round 10

	aese v2.16b, v28.16b
	aesmc v2.16b, v2.16b //AES block 2 - round 10

	aese v1.16b, v28.16b
	aesmc v1.16b, v1.16b //AES block 1 - round 10
	lsr x5, x1, #3 //byte_len
	// x15 keeps byte_len for the return value
	mov x15, x5

	aese v3.16b, v28.16b
	aesmc v3.16b, v3.16b //AES block 3 - round 10
	sub x5, x5, #1 //byte_len - 1

	eor v16.16b, v16.16b, v8.16b //h2k | h1k
	and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)

	eor v17.16b, v17.16b, v9.16b //h4k | h3k

	aese v2.16b, v29.16b //AES block 2 - round 11
	add x4, x0, x1, lsr #3 //end_input_ptr
	// x5 = input_ptr + main-loop byte count, i.e. where the main loop stops
	add x5, x5, x0

	aese v1.16b, v29.16b //AES block 1 - round 11
	cmp x0, x5 //check if we have <= 4 blocks

	aese v0.16b, v29.16b //AES block 0 - round 11
	add w12, w12, #1 //CTR block 3

	aese v3.16b, v29.16b //AES block 3 - round 11
	b.ge .L192_enc_tail //handle tail

	// First 64 bytes: each plaintext half is XORed with rk12 in general
	// registers, moved into v4-v7, XORed with the AES state in v0-v3 and
	// stored. Counter blocks 4..6 are rebuilt in v0-v2 along the way and the
	// upper half for block 7 is left in x9.
	rev w9, w12 //CTR block 4
	ldp x6, x7, [x0, #0] //AES block 0 - load plaintext
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	orr x9, x11, x9, lsl #32 //CTR block 4
	ldp x21, x22, [x0, #32] //AES block 2 - load plaintext
#ifdef __AARCH64EB__
	rev x21, x21
	rev x22, x22
#endif
	ldp x23, x24, [x0, #48] //AES block 3 - load plaintext
#ifdef __AARCH64EB__
	rev x23, x23
	rev x24, x24
#endif
	ldp x19, x20, [x0, #16] //AES block 1 - load plaintext
#ifdef __AARCH64EB__
	rev x19, x19
	rev x20, x20
#endif
	add x0, x0, #64 //AES input_ptr update
	// the flags set here are consumed by the b.ge at the end of this section;
	// nothing in between writes the condition flags
	cmp x0, x5 //check if we have <= 8 blocks

	eor x6, x6, x13 //AES block 0 - round 12 low

	eor x7, x7, x14 //AES block 0 - round 12 high
	eor x22, x22, x14 //AES block 2 - round 12 high
	fmov d4, x6 //AES block 0 - mov low

	eor x24, x24, x14 //AES block 3 - round 12 high
	fmov v4.d[1], x7 //AES block 0 - mov high

	eor x21, x21, x13 //AES block 2 - round 12 low
	eor x19, x19, x13 //AES block 1 - round 12 low

	fmov d5, x19 //AES block 1 - mov low
	eor x20, x20, x14 //AES block 1 - round 12 high

	fmov v5.d[1], x20 //AES block 1 - mov high

	eor x23, x23, x13 //AES block 3 - round 12 low
	fmov d6, x21 //AES block 2 - mov low

	add w12, w12, #1 //CTR block 4
	eor v4.16b, v4.16b, v0.16b //AES block 0 - result
	fmov d0, x10 //CTR block 4

	fmov v0.d[1], x9 //CTR block 4
	rev w9, w12 //CTR block 5

	orr x9, x11, x9, lsl #32 //CTR block 5
	add w12, w12, #1 //CTR block 5

	fmov d7, x23 //AES block 3 - mov low
	st1 { v4.16b}, [x2], #16 //AES block 0 - store result

	fmov v6.d[1], x22 //AES block 2 - mov high

	eor v5.16b, v5.16b, v1.16b //AES block 1 - result
	fmov d1, x10 //CTR block 5
	st1 { v5.16b}, [x2], #16 //AES block 1 - store result

	fmov v7.d[1], x24 //AES block 3 - mov high

	fmov v1.d[1], x9 //CTR block 5
	rev w9, w12 //CTR block 6

	orr x9, x11, x9, lsl #32 //CTR block 6

	add w12, w12, #1 //CTR block 6
	eor v6.16b, v6.16b, v2.16b //AES block 2 - result
	fmov d2, x10 //CTR block 6

	fmov v2.d[1], x9 //CTR block 6
	rev w9, w12 //CTR block 7

	orr x9, x11, x9, lsl #32 //CTR block 7
	st1 { v6.16b}, [x2], #16 //AES block 2 - store result

	eor v7.16b, v7.16b, v3.16b //AES block 3 - result
	st1 { v7.16b}, [x2], #16 //AES block 3 - store result
	b.ge .L192_enc_prepretail //do prepretail

	// Main loop, one pass per 64 bytes. On entry v4-v7 hold the four
	// ciphertext blocks stored by the previous pass and v0-v2 (plus x9 for
	// the fourth) hold the next counter blocks. Each pass:
	//   - runs AES rounds 0..11 on the counter blocks,
	//   - multiplies the previous ciphertext blocks by h4..h1 and folds them
	//     into v9/v10/v11, then reduces (the MODULO steps),
	//   - loads 64 bytes of plaintext, XORs with rk12 and the AES state, and
	//     stores the result,
	//   - prepares the counter blocks for the next pass.
.L192_enc_main_loop: //main loop start
	aese v2.16b, v18.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 0
	rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free)

	aese v1.16b, v18.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 0
	ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext
#ifdef __AARCH64EB__
	rev x19, x19
	rev x20, x20
#endif
	ext v11.16b, v11.16b, v11.16b, #8 //PRE 0
	fmov d3, x10 //CTR block 4k+3
	rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free)

	aese v2.16b, v19.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
	fmov v3.d[1], x9 //CTR block 4k+3

	pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high
	rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free)
	ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext
#ifdef __AARCH64EB__
	rev x21, x21
	rev x22, x22
#endif
	aese v0.16b, v18.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 0
	ldp x23, x24, [x0, #48] //AES block 4k+3 - load plaintext
#ifdef __AARCH64EB__
	rev x23, x23
	rev x24, x24
#endif
	pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
	eor v4.16b, v4.16b, v11.16b //PRE 1

	aese v1.16b, v19.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 1

	aese v0.16b, v19.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 1
	rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free)

	aese v3.16b, v18.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 0
	eor x24, x24, x14 //AES block 4k+3 - round 12 high

	pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low
	mov d8, v4.d[1] //GHASH block 4k - mid

	aese v0.16b, v20.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 2

	aese v3.16b, v19.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 1
	eor x21, x21, x13 //AES block 4k+6 - round 12 low

	eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid
	eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low

	aese v0.16b, v21.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 3
	eor x19, x19, x13 //AES block 4k+5 - round 12 low

	aese v1.16b, v20.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 2
	mov d31, v6.d[1] //GHASH block 4k+2 - mid

	pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high
	mov d4, v5.d[1] //GHASH block 4k+1 - mid

	aese v2.16b, v20.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 2

	aese v1.16b, v21.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 3

	mov d10, v17.d[1] //GHASH block 4k - mid
	eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high

	aese v3.16b, v20.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 2
	eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid

	pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high

	aese v0.16b, v22.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 4
	eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid

	aese v3.16b, v21.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 3

	pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high
	eor x20, x20, x14 //AES block 4k+5 - round 12 high
	ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid

	aese v0.16b, v23.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 5
	add w12, w12, #1 //CTR block 4k+3

	aese v3.16b, v22.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 4
	eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high

	pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid
	eor x22, x22, x14 //AES block 4k+6 - round 12 high

	pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid
	eor x23, x23, x13 //AES block 4k+3 - round 12 low
	mov d30, v7.d[1] //GHASH block 4k+3 - mid

	pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid
	rev w9, w12 //CTR block 4k+8

	pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low
	orr x9, x11, x9, lsl #32 //CTR block 4k+8

	aese v2.16b, v21.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 3
	eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid

	aese v1.16b, v22.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
	ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	aese v0.16b, v24.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 6
	eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low

	aese v2.16b, v22.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 4
	add x0, x0, #64 //AES input_ptr update

	aese v1.16b, v23.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
	movi v8.8b, #0xc2

	pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low
	eor x7, x7, x14 //AES block 4k+4 - round 12 high
	eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid

	aese v2.16b, v23.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 5
	eor x6, x6, x13 //AES block 4k+4 - round 12 low

	aese v1.16b, v24.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 6
	shl d8, d8, #56 //mod_constant

	aese v3.16b, v23.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 5
	eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high

	aese v0.16b, v25.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 7
	fmov d5, x19 //AES block 4k+5 - mov low

	aese v1.16b, v25.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 7
	eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid

	aese v3.16b, v24.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
	fmov v5.d[1], x20 //AES block 4k+5 - mov high

	aese v0.16b, v26.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 8
	eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low

	pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid
	// flags from this cmp are consumed by the b.lt at the bottom of the loop;
	// no instruction in between writes the condition flags
	cmp x0, x5 //.LOOP CONTROL
	fmov d4, x6 //AES block 4k+4 - mov low

	aese v2.16b, v24.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 6
	fmov v4.d[1], x7 //AES block 4k+4 - mov high

	aese v1.16b, v26.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 8
	fmov d7, x23 //AES block 4k+3 - mov low

	eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid
	eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up
	add w12, w12, #1 //CTR block 4k+8

	aese v2.16b, v25.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 7
	fmov v7.d[1], x24 //AES block 4k+3 - mov high

	pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid
	ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment
	fmov d6, x21 //AES block 4k+6 - mov low

	aese v3.16b, v25.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 7

	aese v0.16b, v27.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 9
	eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up

	aese v2.16b, v26.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 8

	aese v3.16b, v26.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 8

	aese v1.16b, v27.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 9

	aese v0.16b, v28.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 10
	eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid

	aese v3.16b, v27.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 9

	aese v2.16b, v27.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 9

	aese v0.16b, v29.16b //AES block 4k+4 - round 11

	aese v1.16b, v28.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 10
	eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid

	aese v2.16b, v28.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 10

	eor v4.16b, v4.16b, v0.16b //AES block 4k+4 - result
	fmov d0, x10 //CTR block 4k+8

	aese v1.16b, v29.16b //AES block 4k+5 - round 11
	fmov v0.d[1], x9 //CTR block 4k+8
	rev w9, w12 //CTR block 4k+9

	pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low
	fmov v6.d[1], x22 //AES block 4k+6 - mov high
	st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store result

	aese v3.16b, v28.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 10
	orr x9, x11, x9, lsl #32 //CTR block 4k+9

	eor v5.16b, v5.16b, v1.16b //AES block 4k+5 - result
	add w12, w12, #1 //CTR block 4k+9
	fmov d1, x10 //CTR block 4k+9

	aese v2.16b, v29.16b //AES block 4k+6 - round 11
	fmov v1.d[1], x9 //CTR block 4k+9
	rev w9, w12 //CTR block 4k+10

	add w12, w12, #1 //CTR block 4k+10
	ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment
	orr x9, x11, x9, lsl #32 //CTR block 4k+10

	st1 { v5.16b}, [x2], #16 //AES block 4k+5 - store result
	eor v11.16b, v11.16b, v9.16b //MODULO - fold into low

	aese v3.16b, v29.16b //AES block 4k+7 - round 11
	eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result
	fmov d2, x10 //CTR block 4k+10

	st1 { v6.16b}, [x2], #16 //AES block 4k+6 - store result
	fmov v2.d[1], x9 //CTR block 4k+10
	rev w9, w12 //CTR block 4k+11

	eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
	orr x9, x11, x9, lsl #32 //CTR block 4k+11

	eor v7.16b, v7.16b, v3.16b //AES block 4k+3 - result
	st1 { v7.16b}, [x2], #16 //AES block 4k+3 - store result
	b.lt .L192_enc_main_loop

	// Prepretail: folds the last four stored ciphertext blocks (v4-v7) into
	// the GHASH accumulators and reduces, while running AES rounds 0..11 on
	// the counter blocks in v0-v3. No plaintext is loaded or stored here;
	// that happens in the tail that follows.
.L192_enc_prepretail: //PREPRETAIL
	aese v0.16b, v18.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 0
	rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free)

	fmov d3, x10 //CTR block 4k+3
	ext v11.16b, v11.16b, v11.16b, #8 //PRE 0
	add w12, w12, #1 //CTR block 4k+3

	aese v1.16b, v18.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 0
	rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free)

	aese v2.16b, v18.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 0

	fmov v3.d[1], x9 //CTR block 4k+3
	eor v4.16b, v4.16b, v11.16b //PRE 1
	mov d10, v17.d[1] //GHASH block 4k - mid

	aese v1.16b, v19.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 1
	rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free)

	pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high

	pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low
	mov d8, v4.d[1] //GHASH block 4k - mid

	pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low
	rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free)

	pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high

	eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid
	mov d4, v5.d[1] //GHASH block 4k+1 - mid

	eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low
	mov d31, v6.d[1] //GHASH block 4k+2 - mid

	aese v3.16b, v18.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 0
	eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high

	pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high

	eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid
	eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid

	aese v3.16b, v19.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 1

	aese v2.16b, v19.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 1
	eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high

	aese v0.16b, v19.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 1

	aese v1.16b, v20.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 2
	mov d30, v7.d[1] //GHASH block 4k+3 - mid

	pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high
	ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid

	aese v0.16b, v20.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 2

	pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid
	eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid

	aese v1.16b, v21.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 3

	pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid

	pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid

	pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid
	eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high

	pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low

	aese v0.16b, v21.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 3
	eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid

	aese v3.16b, v20.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 2

	aese v2.16b, v20.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 2
	eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low

	aese v0.16b, v22.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 4

	aese v3.16b, v21.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 3
	eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid

	aese v2.16b, v21.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 3

	pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low
	movi v8.8b, #0xc2

	aese v3.16b, v22.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 4

	aese v2.16b, v22.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 4

	aese v1.16b, v22.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 4
	eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid

	aese v3.16b, v23.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 5

	aese v2.16b, v23.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 5

	aese v1.16b, v23.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 5
	eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low

	aese v0.16b, v23.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 5

	aese v3.16b, v24.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 6
	eor v10.16b, v10.16b, v9.16b //karatsuba tidy up

	aese v1.16b, v24.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 6

	aese v0.16b, v24.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 6
	shl d8, d8, #56 //mod_constant

	aese v3.16b, v25.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 7

	aese v1.16b, v25.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 7
	// The uncommented eor/pmull/ext instructions from here to the end of
	// this section are the same reduction sequence that the main loop labels
	// "MODULO": fold high (v9) into mid (v10), then mid into low (v11).
	eor v10.16b, v10.16b, v11.16b

	aese v0.16b, v25.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 7

	pmull v30.1q, v9.1d, v8.1d

	aese v2.16b, v24.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 6
	ext v9.16b, v9.16b, v9.16b, #8

	aese v0.16b, v26.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 8

	aese v1.16b, v26.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 8
	eor v10.16b, v10.16b, v30.16b

	aese v2.16b, v25.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 7

	aese v3.16b, v26.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 8

	aese v0.16b, v27.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 9

	aese v2.16b, v26.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 8
	eor v10.16b, v10.16b, v9.16b

	aese v3.16b, v27.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 9

	aese v1.16b, v27.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 9

	aese v2.16b, v27.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 9

	pmull v30.1q, v10.1d, v8.1d

	ext v10.16b, v10.16b, v10.16b, #8

	aese v3.16b, v28.16b
	aesmc v3.16b, v3.16b //AES block 4k+7 - round 10

	aese v0.16b, v28.16b
	aesmc v0.16b, v0.16b //AES block 4k+4 - round 10

	aese v2.16b, v28.16b
	aesmc v2.16b, v2.16b //AES block 4k+6 - round 10

	aese v1.16b, v28.16b
	aesmc v1.16b, v1.16b //AES block 4k+5 - round 10
	eor v11.16b, v11.16b, v30.16b

	aese v0.16b, v29.16b //AES block 4k+4 - round 11

	aese v3.16b, v29.16b //AES block 4k+7 - round 11

	aese v2.16b, v29.16b //AES block 4k+6 - round 11

	aese v1.16b, v29.16b //AES block 4k+5 - round 11
	eor v11.16b, v11.16b, v10.16b

	// Tail: between 1 and 64 bytes remain (x4 - x0). v0-v3 hold the AES
	// state after round 11 for the remaining counter blocks. The first block
	// is always encrypted here; the branches below pick how many further
	// blocks follow, based on the remaining byte count.
	// NOTE(review): each "sub w12, w12, #1" on the short paths appears to
	// back out the counter increment of a prepared counter block that the
	// path does not consume - confirm against the generator script.
.L192_enc_tail: //TAIL

	sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process
	ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	eor x6, x6, x13 //AES block 4k+4 - round 12 low
	eor x7, x7, x14 //AES block 4k+4 - round 12 high

	fmov d4, x6 //AES block 4k+4 - mov low

	fmov v4.d[1], x7 //AES block 4k+4 - mov high
	cmp x5, #48

	eor v5.16b, v4.16b, v0.16b //AES block 4k+4 - result

	ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag
	b.gt .L192_enc_blocks_more_than_3

	// <= 48 bytes left: clear the GHASH accumulators (v8 already carries the
	// running tag, which is fed into the first block hashed) and shift the
	// keystream registers down by one.
	sub w12, w12, #1
	movi v10.8b, #0

	mov v3.16b, v2.16b
	movi v9.8b, #0
	cmp x5, #32

	mov v2.16b, v1.16b
	movi v11.8b, #0
	b.gt .L192_enc_blocks_more_than_2

	sub w12, w12, #1

	mov v3.16b, v1.16b
	cmp x5, #16
	b.gt .L192_enc_blocks_more_than_1

	sub w12, w12, #1
	b .L192_enc_blocks_less_than_1
	// Each of the next three sections stores the block encrypted just before
	// it, hashes that ciphertext block (multiplying by h4, h3 and h2
	// respectively), and encrypts the following input block into v5.
.L192_enc_blocks_more_than_3: //blocks left > 3
	st1 { v5.16b}, [x2], #16 //AES final-3 block - store result

	ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	rev64 v4.16b, v5.16b //GHASH final-3 block

	eor x6, x6, x13 //AES final-2 block - round 12 low
	eor v4.16b, v4.16b, v8.16b //feed in partial tag

	eor x7, x7, x14 //AES final-2 block - round 12 high
	fmov d5, x6 //AES final-2 block - mov low

	fmov v5.d[1], x7 //AES final-2 block - mov high

	// v22 (rk4 earlier) is reused as a GHASH temporary from here on
	mov d22, v4.d[1] //GHASH final-3 block - mid

	pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low

	mov d10, v17.d[1] //GHASH final-3 block - mid

	eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid

	movi v8.8b, #0 //suppress further partial tag feed in

	pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high

	pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid
	eor v5.16b, v5.16b, v1.16b //AES final-2 block - result
.L192_enc_blocks_more_than_2: //blocks left > 2

	st1 { v5.16b}, [x2], #16 //AES final-2 block - store result

	rev64 v4.16b, v5.16b //GHASH final-2 block
	ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	eor v4.16b, v4.16b, v8.16b //feed in partial tag

	eor x7, x7, x14 //AES final-1 block - round 12 high

	// v20/v21 (rk2/rk3 earlier) are reused as GHASH temporaries
	pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high
	mov d22, v4.d[1] //GHASH final-2 block - mid

	pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low
	eor x6, x6, x13 //AES final-1 block - round 12 low

	fmov d5, x6 //AES final-1 block - mov low

	fmov v5.d[1], x7 //AES final-1 block - mov high
	eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high
	eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid

	eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low

	pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid

	movi v8.8b, #0 //suppress further partial tag feed in

	eor v5.16b, v5.16b, v2.16b //AES final-1 block - result

	eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid
.L192_enc_blocks_more_than_1: //blocks left > 1

	st1 { v5.16b}, [x2], #16 //AES final-1 block - store result

	ldp x6, x7, [x0], #16 //AES final block - load input low & high
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	rev64 v4.16b, v5.16b //GHASH final-1 block

	eor x6, x6, x13 //AES final block - round 12 low
	eor v4.16b, v4.16b, v8.16b //feed in partial tag
	movi v8.8b, #0 //suppress further partial tag feed in

	mov d22, v4.d[1] //GHASH final-1 block - mid

	eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid
	eor x7, x7, x14 //AES final block - round 12 high
	fmov d5, x6 //AES final block - mov low

	pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high
	fmov v5.d[1], x7 //AES final block - mov high

	ins v22.d[1], v22.d[0] //GHASH final-1 block - mid

	eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high

	pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low

	pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid

	eor v5.16b, v5.16b, v3.16b //AES final block - result

	eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low

	eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid
	// Final (possibly partial) block. A byte mask is built from the bit
	// length: x13/x14 (no longer needed as rk12) become all-ones and are
	// shifted/selected into v0. The masked ciphertext is hashed, and bytes
	// outside the mask are taken from the 16 bytes already at [x2] (held in
	// v18, which no longer needs to hold rk0) before all 16 bytes are stored.
.L192_enc_blocks_less_than_1: //blocks left <= 1

	ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored
	// w9 = counter word in the byte order in which it is stored back at
	// [x16, #12] further down
#ifndef __AARCH64EB__
	rev w9, w12
#else
	mov w9, w12
#endif
	and x1, x1, #127 //bit_length %= 128

	sub x1, x1, #128 //bit_length -= 128
	mvn x14, xzr //rk12_h = 0xffffffffffffffff

	neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128])
	mvn x13, xzr //rk12_l = 0xffffffffffffffff

	and x1, x1, #127 //bit_length %= 128

	lsr x14, x14, x1 //rk12_h is mask for top 64b of last block
	cmp x1, #64

	// fewer than 64 bits to drop: low half fully kept, high half partly kept;
	// otherwise: low half partly kept, high half dropped entirely
	csel x6, x13, x14, lt
	csel x7, x14, xzr, lt

	fmov d0, x6 //ctr0b is mask for last block

	fmov v0.d[1], x7

	and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits

	rev64 v4.16b, v5.16b //GHASH final block

	eor v4.16b, v4.16b, v8.16b //feed in partial tag

	mov d8, v4.d[1] //GHASH final block - mid

	pmull v21.1q, v4.1d, v12.1d //GHASH final block - low

	pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high

	eor v8.8b, v8.8b, v4.8b //GHASH final block - mid

	eor v11.16b, v11.16b, v21.16b //GHASH final block - low

	eor v9.16b, v9.16b, v20.16b //GHASH final block - high

	pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid

	eor v10.16b, v10.16b, v8.16b //GHASH final block - mid
	movi v8.8b, #0xc2

	eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up

	shl d8, d8, #56 //mod_constant

	bif v5.16b, v18.16b, v0.16b //insert existing bytes in top end of result before storing

	eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up

	pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid

	ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment

	eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid

	eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid

	pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low

	ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment

	eor v11.16b, v11.16b, v9.16b //MODULO - fold into low
	str w9, [x16, #12] //store the updated counter

	st1 { v5.16b}, [x2] //store all 16B

	eor v11.16b, v11.16b, v10.16b //MODULO - fold into low
	// undo the entry-time rearrangement of the tag and write it back to [x3]
	ext v11.16b, v11.16b, v11.16b, #8
	rev64 v11.16b, v11.16b
	// return value: byte_len saved in x15 at the start
	mov x0, x15
	st1 { v11.16b }, [x3]

	// restore callee-saved registers and release the 112-byte frame
	ldp x21, x22, [sp, #16]
	ldp x23, x24, [sp, #32]
	ldp d8, d9, [sp, #48]
	ldp d10, d11, [sp, #64]
	ldp d12, d13, [sp, #80]
	ldp d14, d15, [sp, #96]
	ldp x19, x20, [sp], #112
	ret

	// zero-length input: nothing was pushed, so return 0 directly
.L192_enc_ret:
	mov w0, #0x0
	ret
.size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
.globl aes_gcm_dec_192_kernel
.type aes_gcm_dec_192_kernel,%function
.align 4
aes_gcm_dec_192_kernel:
	cbz x1, .L192_dec_ret
	stp x19, x20, [sp, #-112]!
3044 mov x16, x4 3045 mov x8, x5 3046 stp x21, x22, [sp, #16] 3047 stp x23, x24, [sp, #32] 3048 stp d8, d9, [sp, #48] 3049 stp d10, d11, [sp, #64] 3050 stp d12, d13, [sp, #80] 3051 stp d14, d15, [sp, #96] 3052 3053 add x4, x0, x1, lsr #3 //end_input_ptr 3054 ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 3055#ifdef __AARCH64EB__ 3056 rev x10, x10 3057 rev x11, x11 3058#endif 3059 ldp x13, x14, [x8, #192] //load rk12 3060#ifdef __AARCH64EB__ 3061 ror x13, x13, #32 3062 ror x14, x14, #32 3063#endif 3064 ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible 3065 3066 ld1 {v18.4s}, [x8], #16 //load rk0 3067 3068 lsr x5, x1, #3 //byte_len 3069 mov x15, x5 3070 ld1 {v19.4s}, [x8], #16 //load rk1 3071 3072 lsr x12, x11, #32 3073 orr w11, w11, w11 3074 fmov d3, x10 //CTR block 3 3075 3076 rev w12, w12 //rev_ctr32 3077 fmov d1, x10 //CTR block 1 3078 3079 add w12, w12, #1 //increment rev_ctr32 3080 ld1 {v20.4s}, [x8], #16 //load rk2 3081 3082 aese v0.16b, v18.16b 3083 aesmc v0.16b, v0.16b //AES block 0 - round 0 3084 rev w9, w12 //CTR block 1 3085 3086 add w12, w12, #1 //CTR block 1 3087 orr x9, x11, x9, lsl #32 //CTR block 1 3088 ld1 {v21.4s}, [x8], #16 //load rk3 3089 3090 fmov v1.d[1], x9 //CTR block 1 3091 rev w9, w12 //CTR block 2 3092 add w12, w12, #1 //CTR block 2 3093 3094 fmov d2, x10 //CTR block 2 3095 orr x9, x11, x9, lsl #32 //CTR block 2 3096 3097 fmov v2.d[1], x9 //CTR block 2 3098 rev w9, w12 //CTR block 3 3099 3100 aese v0.16b, v19.16b 3101 aesmc v0.16b, v0.16b //AES block 0 - round 1 3102 orr x9, x11, x9, lsl #32 //CTR block 3 3103 3104 fmov v3.d[1], x9 //CTR block 3 3105 3106 ld1 {v22.4s}, [x8], #16 //load rk4 3107 3108 aese v0.16b, v20.16b 3109 aesmc v0.16b, v0.16b //AES block 0 - round 2 3110 3111 aese v2.16b, v18.16b 3112 aesmc v2.16b, v2.16b //AES block 2 - round 0 3113 ld1 {v23.4s}, [x8], #16 //load rk5 3114 3115 aese v1.16b, v18.16b 3116 aesmc v1.16b, v1.16b //AES block 1 - round 0 3117 ldr 
q15, [x3, #112] //load h4l | h4h 3118#ifndef __AARCH64EB__ 3119 ext v15.16b, v15.16b, v15.16b, #8 3120#endif 3121 aese v3.16b, v18.16b 3122 aesmc v3.16b, v3.16b //AES block 3 - round 0 3123 ldr q13, [x3, #64] //load h2l | h2h 3124#ifndef __AARCH64EB__ 3125 ext v13.16b, v13.16b, v13.16b, #8 3126#endif 3127 aese v2.16b, v19.16b 3128 aesmc v2.16b, v2.16b //AES block 2 - round 1 3129 ldr q14, [x3, #80] //load h3l | h3h 3130#ifndef __AARCH64EB__ 3131 ext v14.16b, v14.16b, v14.16b, #8 3132#endif 3133 aese v1.16b, v19.16b 3134 aesmc v1.16b, v1.16b //AES block 1 - round 1 3135 3136 aese v3.16b, v19.16b 3137 aesmc v3.16b, v3.16b //AES block 3 - round 1 3138 ldr q12, [x3, #32] //load h1l | h1h 3139#ifndef __AARCH64EB__ 3140 ext v12.16b, v12.16b, v12.16b, #8 3141#endif 3142 aese v2.16b, v20.16b 3143 aesmc v2.16b, v2.16b //AES block 2 - round 2 3144 ld1 {v24.4s}, [x8], #16 //load rk6 3145 3146 aese v0.16b, v21.16b 3147 aesmc v0.16b, v0.16b //AES block 0 - round 3 3148 ld1 {v25.4s}, [x8], #16 //load rk7 3149 3150 aese v1.16b, v20.16b 3151 aesmc v1.16b, v1.16b //AES block 1 - round 2 3152 ld1 {v26.4s}, [x8], #16 //load rk8 3153 3154 aese v3.16b, v20.16b 3155 aesmc v3.16b, v3.16b //AES block 3 - round 2 3156 ld1 {v27.4s}, [x8], #16 //load rk9 3157 3158 aese v2.16b, v21.16b 3159 aesmc v2.16b, v2.16b //AES block 2 - round 3 3160 ld1 { v11.16b}, [x3] 3161 ext v11.16b, v11.16b, v11.16b, #8 3162 rev64 v11.16b, v11.16b 3163 3164 aese v1.16b, v21.16b 3165 aesmc v1.16b, v1.16b //AES block 1 - round 3 3166 add w12, w12, #1 //CTR block 3 3167 3168 aese v3.16b, v21.16b 3169 aesmc v3.16b, v3.16b //AES block 3 - round 3 3170 trn1 v9.2d, v14.2d, v15.2d //h4h | h3h 3171 3172 aese v0.16b, v22.16b 3173 aesmc v0.16b, v0.16b //AES block 0 - round 4 3174 ld1 {v28.4s}, [x8], #16 //load rk10 3175 3176 aese v1.16b, v22.16b 3177 aesmc v1.16b, v1.16b //AES block 1 - round 4 3178 trn2 v17.2d, v14.2d, v15.2d //h4l | h3l 3179 3180 aese v2.16b, v22.16b 3181 aesmc v2.16b, v2.16b //AES block 2 - round 4 3182 
3183 aese v3.16b, v22.16b 3184 aesmc v3.16b, v3.16b //AES block 3 - round 4 3185 trn2 v16.2d, v12.2d, v13.2d //h2l | h1l 3186 3187 aese v0.16b, v23.16b 3188 aesmc v0.16b, v0.16b //AES block 0 - round 5 3189 ld1 {v29.4s}, [x8], #16 //load rk11 3190 3191 aese v1.16b, v23.16b 3192 aesmc v1.16b, v1.16b //AES block 1 - round 5 3193 3194 aese v2.16b, v23.16b 3195 aesmc v2.16b, v2.16b //AES block 2 - round 5 3196 3197 aese v3.16b, v23.16b 3198 aesmc v3.16b, v3.16b //AES block 3 - round 5 3199 3200 aese v0.16b, v24.16b 3201 aesmc v0.16b, v0.16b //AES block 0 - round 6 3202 3203 aese v2.16b, v24.16b 3204 aesmc v2.16b, v2.16b //AES block 2 - round 6 3205 3206 aese v3.16b, v24.16b 3207 aesmc v3.16b, v3.16b //AES block 3 - round 6 3208 3209 aese v0.16b, v25.16b 3210 aesmc v0.16b, v0.16b //AES block 0 - round 7 3211 3212 aese v2.16b, v25.16b 3213 aesmc v2.16b, v2.16b //AES block 2 - round 7 3214 3215 aese v3.16b, v25.16b 3216 aesmc v3.16b, v3.16b //AES block 3 - round 7 3217 3218 aese v1.16b, v24.16b 3219 aesmc v1.16b, v1.16b //AES block 1 - round 6 3220 3221 aese v2.16b, v26.16b 3222 aesmc v2.16b, v2.16b //AES block 2 - round 8 3223 3224 aese v3.16b, v26.16b 3225 aesmc v3.16b, v3.16b //AES block 3 - round 8 3226 3227 aese v1.16b, v25.16b 3228 aesmc v1.16b, v1.16b //AES block 1 - round 7 3229 3230 aese v2.16b, v27.16b 3231 aesmc v2.16b, v2.16b //AES block 2 - round 9 3232 3233 aese v3.16b, v27.16b 3234 aesmc v3.16b, v3.16b //AES block 3 - round 9 3235 3236 aese v1.16b, v26.16b 3237 aesmc v1.16b, v1.16b //AES block 1 - round 8 3238 sub x5, x5, #1 //byte_len - 1 3239 3240 aese v0.16b, v26.16b 3241 aesmc v0.16b, v0.16b //AES block 0 - round 8 3242 and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 3243 3244 aese v3.16b, v28.16b 3245 aesmc v3.16b, v3.16b //AES block 3 - round 10 3246 add x5, x5, x0 3247 3248 aese v1.16b, v27.16b 3249 aesmc v1.16b, v1.16b //AES block 1 - round 9 3250 cmp x0, x5 //check if we have 
<= 4 blocks 3251 3252 aese v0.16b, v27.16b 3253 aesmc v0.16b, v0.16b //AES block 0 - round 9 3254 trn1 v8.2d, v12.2d, v13.2d //h2h | h1h 3255 3256 aese v3.16b, v29.16b //AES block 3 - round 11 3257 3258 aese v2.16b, v28.16b 3259 aesmc v2.16b, v2.16b //AES block 2 - round 10 3260 3261 aese v1.16b, v28.16b 3262 aesmc v1.16b, v1.16b //AES block 1 - round 10 3263 3264 aese v0.16b, v28.16b 3265 aesmc v0.16b, v0.16b //AES block 0 - round 10 3266 eor v16.16b, v16.16b, v8.16b //h2k | h1k 3267 3268 aese v2.16b, v29.16b //AES block 2 - round 11 3269 3270 aese v1.16b, v29.16b //AES block 1 - round 11 3271 eor v17.16b, v17.16b, v9.16b //h4k | h3k 3272 3273 aese v0.16b, v29.16b //AES block 0 - round 11 3274 b.ge .L192_dec_tail //handle tail 3275 3276 ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0,1 - load ciphertext 3277 3278 eor v1.16b, v5.16b, v1.16b //AES block 1 - result 3279 3280 eor v0.16b, v4.16b, v0.16b //AES block 0 - result 3281 rev w9, w12 //CTR block 4 3282 ld1 {v6.16b, v7.16b}, [x0], #32 //AES block 2,3 - load ciphertext 3283 3284 mov x19, v1.d[0] //AES block 1 - mov low 3285 3286 mov x20, v1.d[1] //AES block 1 - mov high 3287 3288 mov x6, v0.d[0] //AES block 0 - mov low 3289 orr x9, x11, x9, lsl #32 //CTR block 4 3290 add w12, w12, #1 //CTR block 4 3291 3292 mov x7, v0.d[1] //AES block 0 - mov high 3293 rev64 v4.16b, v4.16b //GHASH block 0 3294 3295 fmov d0, x10 //CTR block 4 3296 rev64 v5.16b, v5.16b //GHASH block 1 3297 cmp x0, x5 //check if we have <= 8 blocks 3298 3299 eor x19, x19, x13 //AES block 1 - round 12 low 3300#ifdef __AARCH64EB__ 3301 rev x19, x19 3302#endif 3303 fmov v0.d[1], x9 //CTR block 4 3304 rev w9, w12 //CTR block 5 3305 3306 orr x9, x11, x9, lsl #32 //CTR block 5 3307 fmov d1, x10 //CTR block 5 3308 eor x20, x20, x14 //AES block 1 - round 12 high 3309#ifdef __AARCH64EB__ 3310 rev x20, x20 3311#endif 3312 add w12, w12, #1 //CTR block 5 3313 fmov v1.d[1], x9 //CTR block 5 3314 eor x6, x6, x13 //AES block 0 - round 12 low 3315#ifdef 
__AARCH64EB__ 3316 rev x6, x6 3317#endif 3318 rev w9, w12 //CTR block 6 3319 eor x7, x7, x14 //AES block 0 - round 12 high 3320#ifdef __AARCH64EB__ 3321 rev x7, x7 3322#endif 3323 stp x6, x7, [x2], #16 //AES block 0 - store result 3324 orr x9, x11, x9, lsl #32 //CTR block 6 3325 3326 stp x19, x20, [x2], #16 //AES block 1 - store result 3327 3328 add w12, w12, #1 //CTR block 6 3329 eor v2.16b, v6.16b, v2.16b //AES block 2 - result 3330 b.ge .L192_dec_prepretail //do prepretail 3331 3332.L192_dec_main_loop: //main loop start 3333 aese v1.16b, v18.16b 3334 aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 3335 ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 3336 3337 pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 3338 mov x21, v2.d[0] //AES block 4k+2 - mov low 3339 3340 mov x22, v2.d[1] //AES block 4k+2 - mov high 3341 eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result 3342 rev64 v7.16b, v7.16b //GHASH block 4k+3 3343 3344 aese v1.16b, v19.16b 3345 aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 3346 fmov d2, x10 //CTR block 4k+6 3347 3348 aese v0.16b, v18.16b 3349 aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 3350 eor v4.16b, v4.16b, v11.16b //PRE 1 3351 3352 pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 3353 fmov v2.d[1], x9 //CTR block 4k+6 3354 3355 aese v1.16b, v20.16b 3356 aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 3357 mov x24, v3.d[1] //AES block 4k+3 - mov high 3358 3359 aese v0.16b, v19.16b 3360 aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 3361 mov x23, v3.d[0] //AES block 4k+3 - mov low 3362 3363 pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 3364 fmov d3, x10 //CTR block 4k+7 3365 mov d8, v4.d[1] //GHASH block 4k - mid 3366 3367 pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 3368 mov d10, v17.d[1] //GHASH block 4k - mid 3369 rev w9, w12 //CTR block 4k+7 3370 3371 aese v2.16b, v18.16b 3372 aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 3373 orr x9, x11, x9, lsl #32 //CTR block 4k+7 3374 3375 fmov v3.d[1], x9 //CTR block 4k+7 
3376 eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 3377 mov d4, v5.d[1] //GHASH block 4k+1 - mid 3378 3379 aese v1.16b, v21.16b 3380 aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 3381 3382 aese v0.16b, v20.16b 3383 aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 3384 eor x22, x22, x14 //AES block 4k+2 - round 12 high 3385#ifdef __AARCH64EB__ 3386 rev x22, x22 3387#endif 3388 aese v2.16b, v19.16b 3389 aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 3390 eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 3391 3392 pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 3393 3394 aese v3.16b, v18.16b 3395 aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 3396 rev64 v6.16b, v6.16b //GHASH block 4k+2 3397 3398 aese v2.16b, v20.16b 3399 aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 3400 3401 pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 3402 eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low 3403 eor x21, x21, x13 //AES block 4k+2 - round 12 low 3404#ifdef __AARCH64EB__ 3405 rev x21, x21 3406#endif 3407 aese v1.16b, v22.16b 3408 aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 3409 3410 aese v0.16b, v21.16b 3411 aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 3412 3413 eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 3414 mov d31, v6.d[1] //GHASH block 4k+2 - mid 3415 3416 aese v3.16b, v19.16b 3417 aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 3418 eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high 3419 3420 aese v0.16b, v22.16b 3421 aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 3422 3423 pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 3424 eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid 3425 3426 pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 3427 3428 aese v0.16b, v23.16b 3429 aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 3430 3431 eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high 3432 mov d30, v7.d[1] //GHASH block 4k+3 - mid 3433 3434 aese v1.16b, v23.16b 3435 aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 3436 3437 pmull2 
v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 3438 3439 aese v3.16b, v20.16b 3440 aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 3441 eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid 3442 3443 aese v1.16b, v24.16b 3444 aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 3445 3446 aese v0.16b, v24.16b 3447 aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 3448 ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid 3449 3450 aese v3.16b, v21.16b 3451 aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 3452 3453 pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid 3454 eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low 3455 3456 aese v0.16b, v25.16b 3457 aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 3458 3459 pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid 3460 eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 3461 3462 aese v1.16b, v25.16b 3463 aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 3464 3465 aese v0.16b, v26.16b 3466 aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 3467 movi v8.8b, #0xc2 3468 3469 pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 3470 3471 aese v1.16b, v26.16b 3472 aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 3473 eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid 3474 3475 aese v2.16b, v21.16b 3476 aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 3477 3478 aese v0.16b, v27.16b 3479 aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 3480 eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low 3481 3482 aese v3.16b, v22.16b 3483 aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 3484 3485 aese v2.16b, v22.16b 3486 aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 3487 eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid 3488 3489 aese v0.16b, v28.16b 3490 aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 3491 3492 aese v1.16b, v27.16b 3493 aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 3494 eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 3495 3496 aese v2.16b, v23.16b 3497 aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 3498 3499 
aese v3.16b, v23.16b 3500 aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 3501 shl d8, d8, #56 //mod_constant 3502 3503 aese v1.16b, v28.16b 3504 aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 3505 3506 aese v2.16b, v24.16b 3507 aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 3508 ld1 {v4.16b}, [x0], #16 //AES block 4k+4 - load ciphertext 3509 3510 aese v3.16b, v24.16b 3511 aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 3512 eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 3513 3514 pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 3515 ld1 {v5.16b}, [x0], #16 //AES block 4k+5 - load ciphertext 3516 eor x23, x23, x13 //AES block 4k+3 - round 12 low 3517#ifdef __AARCH64EB__ 3518 rev x23, x23 3519#endif 3520 aese v2.16b, v25.16b 3521 aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 3522 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 3523 3524 aese v0.16b, v29.16b //AES block 4k+4 - round 11 3525 add w12, w12, #1 //CTR block 4k+7 3526 3527 aese v3.16b, v25.16b 3528 aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 3529 eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 3530 3531 aese v2.16b, v26.16b 3532 aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 3533 ld1 {v6.16b}, [x0], #16 //AES block 4k+6 - load ciphertext 3534 3535 aese v1.16b, v29.16b //AES block 4k+5 - round 11 3536 ld1 {v7.16b}, [x0], #16 //AES block 4k+7 - load ciphertext 3537 rev w9, w12 //CTR block 4k+8 3538 3539 aese v3.16b, v26.16b 3540 aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 3541 stp x21, x22, [x2], #16 //AES block 4k+2 - store result 3542 3543 aese v2.16b, v27.16b 3544 aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 3545 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 3546 3547 cmp x0, x5 //.LOOP CONTROL 3548 3549 eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result 3550 eor x24, x24, x14 //AES block 4k+3 - round 12 high 3551#ifdef __AARCH64EB__ 3552 rev x24, x24 3553#endif 3554 eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result 3555 3556 aese 
v2.16b, v28.16b 3557 aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 3558 orr x9, x11, x9, lsl #32 //CTR block 4k+8 3559 3560 aese v3.16b, v27.16b 3561 aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 3562 3563 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 3564 mov x19, v1.d[0] //AES block 4k+5 - mov low 3565 3566 mov x6, v0.d[0] //AES block 4k+4 - mov low 3567 stp x23, x24, [x2], #16 //AES block 4k+3 - store result 3568 rev64 v5.16b, v5.16b //GHASH block 4k+5 3569 3570 aese v2.16b, v29.16b //AES block 4k+6 - round 11 3571 mov x7, v0.d[1] //AES block 4k+4 - mov high 3572 3573 aese v3.16b, v28.16b 3574 aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 3575 mov x20, v1.d[1] //AES block 4k+5 - mov high 3576 3577 fmov d0, x10 //CTR block 4k+8 3578 add w12, w12, #1 //CTR block 4k+8 3579 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 3580 3581 eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result 3582 fmov v0.d[1], x9 //CTR block 4k+8 3583 rev w9, w12 //CTR block 4k+9 3584 3585 eor x6, x6, x13 //AES block 4k+4 - round 12 low 3586#ifdef __AARCH64EB__ 3587 rev x6, x6 3588#endif 3589 orr x9, x11, x9, lsl #32 //CTR block 4k+9 3590 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 3591 3592 fmov d1, x10 //CTR block 4k+9 3593 add w12, w12, #1 //CTR block 4k+9 3594 eor x19, x19, x13 //AES block 4k+5 - round 12 low 3595#ifdef __AARCH64EB__ 3596 rev x19, x19 3597#endif 3598 fmov v1.d[1], x9 //CTR block 4k+9 3599 rev w9, w12 //CTR block 4k+10 3600 eor x20, x20, x14 //AES block 4k+5 - round 12 high 3601#ifdef __AARCH64EB__ 3602 rev x20, x20 3603#endif 3604 eor x7, x7, x14 //AES block 4k+4 - round 12 high 3605#ifdef __AARCH64EB__ 3606 rev x7, x7 3607#endif 3608 stp x6, x7, [x2], #16 //AES block 4k+4 - store result 3609 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 3610 3611 add w12, w12, #1 //CTR block 4k+10 3612 rev64 v4.16b, v4.16b //GHASH block 4k+4 3613 orr x9, x11, x9, lsl #32 //CTR block 4k+10 3614 3615 aese v3.16b, v29.16b //AES 
block 4k+7 - round 11 3616 stp x19, x20, [x2], #16 //AES block 4k+5 - store result 3617 b.lt .L192_dec_main_loop 3618 3619.L192_dec_prepretail: //PREPRETAIL 3620 mov x22, v2.d[1] //AES block 4k+2 - mov high 3621 ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 3622 eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result 3623 3624 aese v1.16b, v18.16b 3625 aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 3626 mov x21, v2.d[0] //AES block 4k+2 - mov low 3627 3628 aese v0.16b, v18.16b 3629 aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 3630 mov d10, v17.d[1] //GHASH block 4k - mid 3631 3632 eor v4.16b, v4.16b, v11.16b //PRE 1 3633 fmov d2, x10 //CTR block 4k+6 3634 3635 aese v1.16b, v19.16b 3636 aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 3637 mov x23, v3.d[0] //AES block 4k+3 - mov low 3638 3639 aese v0.16b, v19.16b 3640 aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 3641 mov x24, v3.d[1] //AES block 4k+3 - mov high 3642 3643 pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 3644 mov d8, v4.d[1] //GHASH block 4k - mid 3645 fmov d3, x10 //CTR block 4k+7 3646 3647 aese v1.16b, v20.16b 3648 aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 3649 rev64 v6.16b, v6.16b //GHASH block 4k+2 3650 3651 pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 3652 fmov v2.d[1], x9 //CTR block 4k+6 3653 rev w9, w12 //CTR block 4k+7 3654 3655 orr x9, x11, x9, lsl #32 //CTR block 4k+7 3656 eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 3657 mov d4, v5.d[1] //GHASH block 4k+1 - mid 3658 3659 pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 3660 eor x24, x24, x14 //AES block 4k+3 - round 12 high 3661#ifdef __AARCH64EB__ 3662 rev x24, x24 3663#endif 3664 fmov v3.d[1], x9 //CTR block 4k+7 3665 3666 aese v0.16b, v20.16b 3667 aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 3668 eor x21, x21, x13 //AES block 4k+2 - round 12 low 3669#ifdef __AARCH64EB__ 3670 rev x21, x21 3671#endif 3672 pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 3673 eor x22, x22, x14 //AES block 4k+2 - round 12 high 
3674#ifdef __AARCH64EB__ 3675 rev x22, x22 3676#endif 3677 eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 3678 3679 pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 3680 eor x23, x23, x13 //AES block 4k+3 - round 12 low 3681#ifdef __AARCH64EB__ 3682 rev x23, x23 3683#endif 3684 stp x21, x22, [x2], #16 //AES block 4k+2 - store result 3685 3686 rev64 v7.16b, v7.16b //GHASH block 4k+3 3687 stp x23, x24, [x2], #16 //AES block 4k+3 - store result 3688 3689 aese v3.16b, v18.16b 3690 aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 3691 eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high 3692 3693 pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 3694 add w12, w12, #1 //CTR block 4k+7 3695 3696 pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 3697 eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low 3698 3699 aese v2.16b, v18.16b 3700 aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 3701 3702 eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 3703 mov d31, v6.d[1] //GHASH block 4k+2 - mid 3704 3705 aese v3.16b, v19.16b 3706 aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 3707 3708 aese v2.16b, v19.16b 3709 aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 3710 eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high 3711 3712 eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid 3713 3714 pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 3715 3716 aese v2.16b, v20.16b 3717 aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 3718 mov d30, v7.d[1] //GHASH block 4k+3 - mid 3719 3720 aese v3.16b, v20.16b 3721 aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 3722 ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid 3723 3724 pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 3725 3726 aese v0.16b, v21.16b 3727 aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 3728 eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid 3729 3730 aese v1.16b, v21.16b 3731 aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 3732 3733 pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid 3734 eor 
v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low 3735 3736 aese v0.16b, v22.16b 3737 aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 3738 3739 pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 3740 movi v8.8b, #0xc2 3741 3742 pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid 3743 3744 aese v2.16b, v21.16b 3745 aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 3746 3747 shl d8, d8, #56 //mod_constant 3748 eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 3749 3750 aese v0.16b, v23.16b 3751 aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 3752 eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid 3753 3754 aese v2.16b, v22.16b 3755 aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 3756 3757 pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 3758 eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low 3759 3760 aese v0.16b, v24.16b 3761 aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 3762 3763 aese v3.16b, v21.16b 3764 aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 3765 eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid 3766 3767 aese v2.16b, v23.16b 3768 aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 3769 3770 aese v0.16b, v25.16b 3771 aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 3772 eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 3773 3774 aese v3.16b, v22.16b 3775 aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 3776 3777 aese v2.16b, v24.16b 3778 aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 3779 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 3780 3781 aese v0.16b, v26.16b 3782 aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 3783 3784 aese v3.16b, v23.16b 3785 aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 3786 eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 3787 3788 aese v1.16b, v22.16b 3789 aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 3790 3791 aese v2.16b, v25.16b 3792 aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 3793 3794 aese v0.16b, v27.16b 3795 aesmc v0.16b, v0.16b //AES block 4k+4 - 
round 9 3796 3797 aese v1.16b, v23.16b 3798 aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 3799 3800 aese v3.16b, v24.16b 3801 aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 3802 eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 3803 3804 aese v0.16b, v28.16b 3805 aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 3806 3807 aese v1.16b, v24.16b 3808 aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 3809 3810 aese v3.16b, v25.16b 3811 aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 3812 3813 aese v2.16b, v26.16b 3814 aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 3815 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 3816 3817 aese v1.16b, v25.16b 3818 aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 3819 3820 aese v3.16b, v26.16b 3821 aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 3822 3823 aese v2.16b, v27.16b 3824 aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 3825 3826 aese v1.16b, v26.16b 3827 aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 3828 3829 aese v3.16b, v27.16b 3830 aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 3831 3832 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 3833 3834 aese v1.16b, v27.16b 3835 aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 3836 3837 aese v2.16b, v28.16b 3838 aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 3839 3840 aese v3.16b, v28.16b 3841 aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 3842 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 3843 3844 aese v1.16b, v28.16b 3845 aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 3846 3847 aese v0.16b, v29.16b 3848 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 3849 3850 aese v2.16b, v29.16b 3851 3852 aese v1.16b, v29.16b 3853 3854 aese v3.16b, v29.16b 3855 3856 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 3857.L192_dec_tail: //TAIL 3858 3859 sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process 3860 ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext 3861 3862 eor v0.16b, v5.16b, v0.16b //AES 
block 4k+4 - result 3863 3864 mov x7, v0.d[1] //AES block 4k+4 - mov high 3865 3866 mov x6, v0.d[0] //AES block 4k+4 - mov low 3867 3868 ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag 3869 3870 cmp x5, #48 3871 3872 eor x7, x7, x14 //AES block 4k+4 - round 12 high 3873#ifdef __AARCH64EB__ 3874 rev x7, x7 3875#endif 3876 eor x6, x6, x13 //AES block 4k+4 - round 12 low 3877#ifdef __AARCH64EB__ 3878 rev x6, x6 3879#endif 3880 b.gt .L192_dec_blocks_more_than_3 3881 3882 movi v11.8b, #0 3883 movi v9.8b, #0 3884 3885 mov v3.16b, v2.16b 3886 mov v2.16b, v1.16b 3887 sub w12, w12, #1 3888 3889 movi v10.8b, #0 3890 cmp x5, #32 3891 b.gt .L192_dec_blocks_more_than_2 3892 3893 mov v3.16b, v1.16b 3894 cmp x5, #16 3895 sub w12, w12, #1 3896 3897 b.gt .L192_dec_blocks_more_than_1 3898 3899 sub w12, w12, #1 3900 b .L192_dec_blocks_less_than_1 3901.L192_dec_blocks_more_than_3: //blocks left > 3 3902 rev64 v4.16b, v5.16b //GHASH final-3 block 3903 ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext 3904 3905 stp x6, x7, [x2], #16 //AES final-3 block - store result 3906 3907 eor v4.16b, v4.16b, v8.16b //feed in partial tag 3908 3909 eor v0.16b, v5.16b, v1.16b //AES final-2 block - result 3910 3911 pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low 3912 mov x6, v0.d[0] //AES final-2 block - mov low 3913 mov d22, v4.d[1] //GHASH final-3 block - mid 3914 3915 mov x7, v0.d[1] //AES final-2 block - mov high 3916 3917 mov d10, v17.d[1] //GHASH final-3 block - mid 3918 eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid 3919 3920 pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high 3921 3922 eor x6, x6, x13 //AES final-2 block - round 12 low 3923#ifdef __AARCH64EB__ 3924 rev x6, x6 3925#endif 3926 movi v8.8b, #0 //suppress further partial tag feed in 3927 3928 pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid 3929 eor x7, x7, x14 //AES final-2 block - round 12 high 3930#ifdef __AARCH64EB__ 3931 rev x7, x7 3932#endif 
3933.L192_dec_blocks_more_than_2: //blocks left > 2 3934 3935 rev64 v4.16b, v5.16b //GHASH final-2 block 3936 ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext 3937 3938 eor v4.16b, v4.16b, v8.16b //feed in partial tag 3939 3940 movi v8.8b, #0 //suppress further partial tag feed in 3941 3942 eor v0.16b, v5.16b, v2.16b //AES final-1 block - result 3943 3944 mov d22, v4.d[1] //GHASH final-2 block - mid 3945 3946 pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low 3947 3948 stp x6, x7, [x2], #16 //AES final-2 block - store result 3949 3950 eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid 3951 mov x7, v0.d[1] //AES final-1 block - mov high 3952 3953 eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low 3954 mov x6, v0.d[0] //AES final-1 block - mov low 3955 3956 pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high 3957 3958 pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid 3959 3960 eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high 3961 eor x7, x7, x14 //AES final-1 block - round 12 high 3962#ifdef __AARCH64EB__ 3963 rev x7, x7 3964#endif 3965 eor x6, x6, x13 //AES final-1 block - round 12 low 3966#ifdef __AARCH64EB__ 3967 rev x6, x6 3968#endif 3969 eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid 3970.L192_dec_blocks_more_than_1: //blocks left > 1 3971 3972 rev64 v4.16b, v5.16b //GHASH final-1 block 3973 3974 eor v4.16b, v4.16b, v8.16b //feed in partial tag 3975 ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext 3976 3977 mov d22, v4.d[1] //GHASH final-1 block - mid 3978 3979 pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high 3980 3981 eor v0.16b, v5.16b, v3.16b //AES final block - result 3982 stp x6, x7, [x2], #16 //AES final-1 block - store result 3983 3984 eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid 3985 3986 eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high 3987 3988 pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low 3989 mov x7, v0.d[1] //AES final block - mov high 3990 
3991 ins v22.d[1], v22.d[0] //GHASH final-1 block - mid 3992 mov x6, v0.d[0] //AES final block - mov low 3993 3994 pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid 3995 3996 movi v8.8b, #0 //suppress further partial tag feed in 3997 eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low 3998 eor x7, x7, x14 //AES final block - round 12 high 3999#ifdef __AARCH64EB__ 4000 rev x7, x7 4001#endif 4002 eor x6, x6, x13 //AES final block - round 12 low 4003#ifdef __AARCH64EB__ 4004 rev x6, x6 4005#endif 4006 eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid 4007.L192_dec_blocks_less_than_1: //blocks left <= 1 4008 4009 mvn x13, xzr //rk12_l = 0xffffffffffffffff 4010 ldp x4, x5, [x2] //load existing bytes we need to not overwrite 4011 and x1, x1, #127 //bit_length %= 128 4012 4013 sub x1, x1, #128 //bit_length -= 128 4014 4015 neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) 4016 4017 and x1, x1, #127 //bit_length %= 128 4018 mvn x14, xzr //rk12_h = 0xffffffffffffffff 4019 4020 lsr x14, x14, x1 //rk12_h is mask for top 64b of last block 4021 cmp x1, #64 4022 4023 csel x9, x13, x14, lt 4024 csel x10, x14, xzr, lt 4025 4026 fmov d0, x9 //ctr0b is mask for last block 4027 and x6, x6, x9 4028 bic x4, x4, x9 //mask out low existing bytes 4029 4030 orr x6, x6, x4 4031 mov v0.d[1], x10 4032#ifndef __AARCH64EB__ 4033 rev w9, w12 4034#else 4035 mov w9, w12 4036#endif 4037 4038 and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits 4039 str w9, [x16, #12] //store the updated counter 4040 4041 rev64 v4.16b, v5.16b //GHASH final block 4042 4043 eor v4.16b, v4.16b, v8.16b //feed in partial tag 4044 bic x5, x5, x10 //mask out high existing bytes 4045 4046 and x7, x7, x10 4047 4048 pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high 4049 mov d8, v4.d[1] //GHASH final block - mid 4050 4051 pmull v21.1q, v4.1d, v12.1d //GHASH final block - low 4052 4053 eor v8.8b, v8.8b, v4.8b //GHASH final block - mid 4054 4055 eor 
v9.16b, v9.16b, v20.16b	//GHASH final block - high

	pmull	v8.1q, v8.1d, v16.1d	//GHASH final block - mid

	eor	v11.16b, v11.16b, v21.16b	//GHASH final block - low

	eor	v10.16b, v10.16b, v8.16b	//GHASH final block - mid
	movi	v8.8b, #0xc2

	eor	v30.16b, v11.16b, v9.16b	//MODULO - karatsuba tidy up

	shl	d8, d8, #56	//mod_constant

	eor	v10.16b, v10.16b, v30.16b	//MODULO - karatsuba tidy up

	pmull	v31.1q, v9.1d, v8.1d	//MODULO - top 64b align with mid
	orr	x7, x7, x5
	stp	x6, x7, [x2]

	ext	v9.16b, v9.16b, v9.16b, #8	//MODULO - other top alignment

	eor	v10.16b, v10.16b, v31.16b	//MODULO - fold into mid

	eor	v10.16b, v10.16b, v9.16b	//MODULO - fold into mid

	pmull	v8.1q, v10.1d, v8.1d	//MODULO - mid 64b align with low

	eor	v11.16b, v11.16b, v8.16b	//MODULO - fold into low

	ext	v10.16b, v10.16b, v10.16b, #8	//MODULO - other mid alignment

	eor	v11.16b, v11.16b, v10.16b	//MODULO - fold into low
	ext	v11.16b, v11.16b, v11.16b, #8
	rev64	v11.16b, v11.16b
	mov	x0, x15
	st1	{ v11.16b }, [x3]

	ldp	x21, x22, [sp, #16]
	ldp	x23, x24, [sp, #32]
	ldp	d8, d9, [sp, #48]
	ldp	d10, d11, [sp, #64]
	ldp	d12, d13, [sp, #80]
	ldp	d14, d15, [sp, #96]
	ldp	x19, x20, [sp], #112
	ret

.L192_dec_ret:
	mov	w0, #0x0
	ret
.size	aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
//-----------------------------------------------------------------------
// aes_gcm_enc_256_kernel: AES-256 counter-mode encryption interleaved
// with GHASH, processing four blocks per main-loop pass.
//
// Register roles on entry, as used by the code below:
//   x0 = input pointer (plaintext is read from here)
//   x1 = length in bits; when zero the function returns 0 immediately
//   x2 = output pointer (ciphertext is written here)
//   x3 = pointer to the GHASH data: the current tag at offset 0 (read at
//        the start, written back at the end) and the hash key values
//        loaded from offsets 32 (h1), 64 (h2), 80 (h3) and 112 (h4)
//   x4 = pointer to the 16-byte counter block; its last 32-bit word is
//        rewritten with the updated counter before returning
//   x5 = pointer to the round keys: rk0..rk13 are loaded into v18..v31,
//        rk14 (offset 224) is loaded into x13/x14
// Returns: x0 = x1 >> 3 (byte length), or w0 = 0 when x1 == 0.
// Saves and restores x19-x24 and d8-d15 in a 112-byte stack frame.
//
// The last round key is applied outside the vector unit: rk14 is XORed
// into the plaintext in general-purpose registers and that value is then
// XORed with the AES state after round 13 (aese without aesmc).
//-----------------------------------------------------------------------
.globl	aes_gcm_enc_256_kernel
.type	aes_gcm_enc_256_kernel,%function
.align	4
aes_gcm_enc_256_kernel:
	cbz	x1, .L256_enc_ret	//zero bit length: return before touching the stack
	stp	x19, x20, [sp, #-112]!
	mov	x16, x4	//x16 = counter block pointer (x4 is reused below)
	mov	x8, x5	//x8 = round key pointer (x5 is reused below)
	stp	x21, x22, [sp, #16]
	stp	x23, x24, [sp, #32]
	stp	d8, d9, [sp, #48]
	stp	d10, d11, [sp, #64]
	stp	d12, d13, [sp, #80]
	stp	d14, d15, [sp, #96]

	add	x4, x0, x1, lsr #3	//end_input_ptr
	lsr	x5, x1, #3	//byte_len
	mov	x15, x5	//keep byte_len for the return value
	ldp	x10, x11, [x16]	//ctr96_b64, ctr96_t32
#ifdef __AARCH64EB__
	rev	x10, x10
	rev	x11, x11
#endif
	ldp	x13, x14, [x8, #224]	//load rk14
#ifdef __AARCH64EB__
	ror	x13, x13, #32
	ror	x14, x14, #32
#endif
	ld1	{ v0.16b}, [x16]	//special case vector load initial counter so we can start first AES block as quickly as possible
	sub	x5, x5, #1	//byte_len - 1

	ld1	{v18.4s}, [x8], #16	//load rk0
	and	x5, x5, #0xffffffffffffffc0	//number of bytes to be processed in main loop (at least 1 byte must be handled by tail)

	ld1	{v19.4s}, [x8], #16	//load rk1
	add	x5, x5, x0	//x5 = input address at which the main loop must stop

	lsr	x12, x11, #32	//x12 = upper word of the second counter half (the 32-bit counter)
	fmov	d2, x10	//CTR block 2
	orr	w11, w11, w11	//writing w11 clears the upper 32 bits of x11

	rev	w12, w12	//rev_ctr32
	// The flags set by this cmp are consumed by "b.ge .L256_enc_tail"
	// further down; nothing in between modifies the condition flags.
	cmp	x0, x5	//check if we have <= 4 blocks
	fmov	d1, x10	//CTR block 1

	aese	v0.16b, v18.16b
	aesmc	v0.16b, v0.16b	//AES block 0 - round 0
	add	w12, w12, #1	//increment rev_ctr32

	rev	w9, w12	//CTR block 1
	fmov	d3, x10	//CTR block 3

	orr	x9, x11, x9, lsl #32	//CTR block 1
	add	w12, w12, #1	//CTR block 1
	ld1	{v20.4s}, [x8], #16	//load rk2

	fmov	v1.d[1], x9	//CTR block 1
	rev	w9, w12	//CTR block 2
	add	w12, w12, #1	//CTR block 2

	orr	x9, x11, x9, lsl #32	//CTR block 2
	ld1	{v21.4s}, [x8], #16	//load rk3

	fmov	v2.d[1], x9	//CTR block 2
	rev	w9, w12	//CTR block 3

	aese	v0.16b, v19.16b
	aesmc	v0.16b, v0.16b	//AES block 0 - round 1
	orr	x9, x11, x9, lsl #32	//CTR block 3

	fmov	v3.d[1], x9	//CTR block 3

	aese	v1.16b, v18.16b
	aesmc	v1.16b, v1.16b	//AES block 1 - round 0
	ld1	{v22.4s}, [x8], #16	//load rk4

	aese	v0.16b, v20.16b
	aesmc	v0.16b, v0.16b	//AES block 0 - round 2
	ld1	{v23.4s}, [x8], #16	//load rk5

	aese	v2.16b, v18.16b
	aesmc	v2.16b, v2.16b	//AES block 2 - round 0
	ld1	{v24.4s}, [x8], #16	//load rk6

	aese	v1.16b, v19.16b
	aesmc	v1.16b, v1.16b	//AES block 1 - round 1
	ldr	q14, [x3, #80]	//load h3l | h3h
#ifndef __AARCH64EB__
	ext	v14.16b, v14.16b, v14.16b, #8
#endif
	aese	v3.16b, v18.16b
	aesmc	v3.16b, v3.16b	//AES block 3 - round 0
	ld1	{v25.4s}, [x8], #16	//load rk7

	aese	v2.16b, v19.16b
	aesmc	v2.16b, v2.16b	//AES block 2 - round 1
	ld1	{v26.4s}, [x8], #16	//load rk8

	aese	v1.16b, v20.16b
	aesmc	v1.16b, v1.16b	//AES block 1 - round 2
	ldr	q13, [x3, #64]	//load h2l | h2h
#ifndef __AARCH64EB__
	ext	v13.16b, v13.16b, v13.16b, #8
#endif
	aese	v3.16b, v19.16b
	aesmc	v3.16b, v3.16b	//AES block 3 - round 1
	ld1	{v27.4s}, [x8], #16	//load rk9

	aese	v2.16b, v20.16b
	aesmc	v2.16b, v2.16b	//AES block 2 - round 2
	ldr	q15, [x3, #112]	//load h4l | h4h
#ifndef __AARCH64EB__
	ext	v15.16b, v15.16b, v15.16b, #8
#endif
	aese	v1.16b, v21.16b
	aesmc	v1.16b, v1.16b	//AES block 1 - round 3
	ld1	{v28.4s}, [x8], #16	//load rk10

	aese	v3.16b, v20.16b
	aesmc	v3.16b, v3.16b	//AES block 3 - round 2
	ld1	{v29.4s}, [x8], #16	//load rk11

	aese	v2.16b, v21.16b
	aesmc	v2.16b, v2.16b	//AES block 2 - round 3
	add	w12, w12, #1	//CTR block 3

	aese	v0.16b, v21.16b
	aesmc	v0.16b, v0.16b	//AES block 0 - round 3

	aese	v3.16b, v21.16b
	aesmc	v3.16b, v3.16b	//AES block 3 - round 3
	ld1	{ v11.16b}, [x3]	//load the current tag; the next two instructions put it in the byte order used for GHASH here
	ext	v11.16b, v11.16b, v11.16b, #8
	rev64	v11.16b, v11.16b

	aese	v2.16b, v22.16b
	aesmc	v2.16b, v2.16b	//AES block 2 - round 4

	aese	v0.16b, v22.16b
	aesmc	v0.16b, v0.16b	//AES block 0 - round 4

	aese	v1.16b, v22.16b
	aesmc	v1.16b, v1.16b	//AES block 1 - round 4

	aese	v3.16b, v22.16b
	aesmc	v3.16b, v3.16b	//AES block 3 - round 4

	aese	v0.16b, v23.16b
	aesmc	v0.16b, v0.16b	//AES block 0 - round 5

	aese	v1.16b, v23.16b
	aesmc	v1.16b, v1.16b	//AES block 1 - round 5

	aese	v3.16b, v23.16b
	aesmc	v3.16b, v3.16b	//AES block 3 - round 5

	aese	v2.16b, v23.16b
	aesmc	v2.16b, v2.16b	//AES block 2 - round 5

	aese	v1.16b, v24.16b
	aesmc	v1.16b, v1.16b	//AES block 1 - round 6
	trn2	v17.2d, v14.2d, v15.2d	//h4l | h3l

	aese	v3.16b, v24.16b
	aesmc	v3.16b, v3.16b	//AES block 3 - round 6
	ld1	{v30.4s}, [x8], #16	//load rk12

	aese	v0.16b, v24.16b
	aesmc	v0.16b, v0.16b	//AES block 0 - round 6
	ldr	q12, [x3, #32]	//load h1l | h1h
#ifndef __AARCH64EB__
	ext	v12.16b, v12.16b, v12.16b, #8
#endif
	aese	v2.16b, v24.16b
	aesmc	v2.16b, v2.16b	//AES block 2 - round 6
	ld1	{v31.4s}, [x8], #16	//load rk13

	aese	v1.16b, v25.16b
	aesmc	v1.16b, v1.16b	//AES block 1 - round 7
	trn1	v9.2d, v14.2d, v15.2d	//h4h | h3h

	aese	v0.16b, v25.16b
	aesmc	v0.16b, v0.16b	//AES block 0 - round 7

	aese	v2.16b, v25.16b
	aesmc	v2.16b, v2.16b	//AES block 2 - round 7

	aese	v3.16b, v25.16b
	aesmc	v3.16b, v3.16b	//AES block 3 - round 7
	trn2	v16.2d, v12.2d, v13.2d	//h2l | h1l

	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b	//AES block 1 - round 8

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b	//AES block 2 - round 8

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b	//AES block 3 - round 8

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b	//AES block 1 - round 9

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b	//AES block 2 - round 9

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b	//AES block 0 - round 8

	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b	//AES block 1 - round 10

	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b	//AES block 3 - round 9

	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b	//AES block 0 - round 9

	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b	//AES block 2 - round 10

	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b	//AES block 3 - round 10

	aese	v1.16b, v29.16b
	aesmc	v1.16b, v1.16b	//AES block 1 - round 11

	aese	v2.16b, v29.16b
	aesmc	v2.16b, v2.16b	//AES block 2 - round 11

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b	//AES block 0 - round 10

	aese	v1.16b, v30.16b
	aesmc	v1.16b, v1.16b	//AES block 1 - round 12

	aese	v2.16b, v30.16b
	aesmc	v2.16b, v2.16b	//AES block 2 - round 12

	aese	v0.16b, v29.16b
	aesmc	v0.16b, v0.16b	//AES block 0 - round 11
	eor	v17.16b, v17.16b, v9.16b	//h4k | h3k

	aese	v3.16b, v29.16b
	aesmc	v3.16b, v3.16b	//AES block 3 - round 11

	aese	v2.16b, v31.16b	//AES block 2 - round 13
	trn1	v8.2d, v12.2d, v13.2d	//h2h | h1h

	aese	v0.16b, v30.16b
	aesmc	v0.16b, v0.16b	//AES block 0 - round 12

	aese	v3.16b, v30.16b
	aesmc	v3.16b, v3.16b	//AES block 3 - round 12

	aese	v1.16b, v31.16b	//AES block 1 - round 13

	aese	v0.16b, v31.16b	//AES block 0 - round 13

	aese	v3.16b, v31.16b	//AES block 3 - round 13
	eor	v16.16b, v16.16b, v8.16b	//h2k | h1k
	b.ge	.L256_enc_tail	//handle tail

	// First four blocks: load 64 bytes of plaintext into general-purpose
	// registers, XOR in rk14 there, then XOR with the round-13 AES state.
	ldp	x19, x20, [x0, #16]	//AES block 1 - load plaintext
#ifdef __AARCH64EB__
	rev	x19, x19
	rev	x20, x20
#endif
	rev	w9, w12	//CTR block 4
	ldp	x6, x7, [x0, #0]	//AES block 0 - load plaintext
#ifdef __AARCH64EB__
	rev	x6, x6
	rev	x7, x7
#endif
	ldp	x23, x24, [x0, #48]	//AES block 3 - load plaintext
#ifdef __AARCH64EB__
	rev	x23, x23
	rev	x24, x24
#endif
	ldp	x21, x22, [x0, #32]	//AES block 2 - load plaintext
#ifdef __AARCH64EB__
	rev	x21, x21
	rev	x22, x22
#endif
	add	x0, x0, #64	//AES input_ptr update

	eor	x19, x19, x13	//AES block 1 - round 14 low
	eor	x20, x20, x14	//AES block 1 - round 14 high

	fmov	d5, x19	//AES block 1 - mov low
	eor	x6, x6, x13	//AES block 0 - round 14 low

	eor	x7, x7, x14	//AES block 0 - round 14 high
	eor	x24, x24, x14	//AES block 3 - round 14 high
	fmov	d4, x6	//AES block 0 - mov low

	// Flags from this cmp are consumed by "b.ge .L256_enc_prepretail".
	cmp	x0, x5	//check if we have <= 8 blocks
	fmov	v4.d[1], x7	//AES block 0 - mov high
	eor	x23, x23, x13	//AES block 3 - round 14 low

	eor	x21, x21, x13	//AES block 2 - round 14 low
	fmov	v5.d[1], x20	//AES block 1 - mov high

	fmov	d6, x21	//AES block 2 - mov low
	add	w12, w12, #1	//CTR block 4

	orr	x9, x11, x9, lsl #32	//CTR block 4
	fmov	d7, x23	//AES block 3 - mov low
	eor	x22, x22, x14	//AES block 2 - round 14 high

	fmov	v6.d[1], x22	//AES block 2 - mov high

	eor	v4.16b, v4.16b, v0.16b	//AES block 0 - result
	fmov	d0, x10	//CTR block 4

	fmov	v0.d[1], x9	//CTR block 4
	rev	w9, w12	//CTR block 5
	add	w12, w12, #1	//CTR block 5

	eor	v5.16b, v5.16b, v1.16b	//AES block 1 - result
	fmov	d1, x10	//CTR block 5
	orr	x9, x11, x9, lsl #32	//CTR block 5

	fmov	v1.d[1], x9	//CTR block 5
	rev	w9, w12	//CTR block 6
	st1	{ v4.16b}, [x2], #16	//AES block 0 - store result

	fmov	v7.d[1], x24	//AES block 3 - mov high
	orr	x9, x11, x9, lsl #32	//CTR block 6
	eor	v6.16b, v6.16b, v2.16b	//AES block 2 - result

	st1	{ v5.16b}, [x2], #16	//AES block 1 - store result

	add	w12, w12, #1	//CTR block 6
	fmov	d2, x10	//CTR block 6

	fmov	v2.d[1], x9	//CTR block 6
	st1	{ v6.16b}, [x2], #16	//AES block 2 - store result
	rev	w9, w12	//CTR block 7

	orr	x9, x11, x9, lsl #32	//CTR block 7

	eor	v7.16b, v7.16b, v3.16b	//AES block 3 - result
	st1	{ v7.16b}, [x2], #16	//AES block 3 - store result
	b.ge	.L256_enc_prepretail	//do prepretail

	// Main loop: each pass runs the AES rounds on counter blocks v0-v3,
	// XORs them with the next 64 bytes of plaintext and stores the result,
	// while the four ciphertext blocks of the previous pass (still held in
	// v4-v7) are folded into the GHASH accumulator v11 and reduced.
.L256_enc_main_loop:	//main loop start
	aese	v0.16b, v18.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 0
	rev64	v4.16b, v4.16b	//GHASH block 4k (only t0 is free)

	aese	v1.16b, v18.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 0
	fmov	d3, x10	//CTR block 4k+3

	aese	v2.16b, v18.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 0
	ext	v11.16b, v11.16b, v11.16b, #8	//PRE 0

	aese	v0.16b, v19.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 1
	fmov	v3.d[1], x9	//CTR block 4k+3

	aese	v1.16b, v19.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 1
	ldp	x23, x24, [x0, #48]	//AES block 4k+7 - load plaintext
#ifdef __AARCH64EB__
	rev	x23, x23
	rev	x24, x24
#endif
	aese	v2.16b, v19.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 1
	ldp	x21, x22, [x0, #32]	//AES block 4k+6 - load plaintext
#ifdef __AARCH64EB__
	rev	x21, x21
	rev	x22, x22
#endif
	aese	v0.16b, v20.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 2
	eor	v4.16b, v4.16b, v11.16b	//PRE 1

	aese	v1.16b, v20.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 2

	aese	v3.16b, v18.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 0
	eor	x23, x23, x13	//AES block 4k+7 - round 14 low

	aese	v0.16b, v21.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 3
	mov	d10, v17.d[1]	//GHASH block 4k - mid

	pmull2	v9.1q, v4.2d, v15.2d	//GHASH block 4k - high
	eor	x22, x22, x14	//AES block 4k+6 - round 14 high
	mov	d8, v4.d[1]	//GHASH block 4k - mid

	aese	v3.16b, v19.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 1
	rev64	v5.16b, v5.16b	//GHASH block 4k+1 (t0 and t1 free)

	aese	v0.16b, v22.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 4

	pmull	v11.1q, v4.1d, v15.1d	//GHASH block 4k - low
	eor	v8.8b, v8.8b, v4.8b	//GHASH block 4k - mid

	aese	v2.16b, v20.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 2

	aese	v0.16b, v23.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 5
	rev64	v7.16b, v7.16b	//GHASH block 4k+3 (t0, t1, t2 and t3 free)

	pmull2	v4.1q, v5.2d, v14.2d	//GHASH block 4k+1 - high

	pmull	v10.1q, v8.1d, v10.1d	//GHASH block 4k - mid
	rev64	v6.16b, v6.16b	//GHASH block 4k+2 (t0, t1, and t2 free)

	pmull	v8.1q, v5.1d, v14.1d	//GHASH block 4k+1 - low

	eor	v9.16b, v9.16b, v4.16b	//GHASH block 4k+1 - high
	mov	d4, v5.d[1]	//GHASH block 4k+1 - mid

	aese	v1.16b, v21.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 3

	aese	v3.16b, v20.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 2
	eor	v11.16b, v11.16b, v8.16b	//GHASH block 4k+1 - low

	aese	v2.16b, v21.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 3

	aese	v1.16b, v22.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 4
	mov	d8, v6.d[1]	//GHASH block 4k+2 - mid

	aese	v3.16b, v21.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 3
	eor	v4.8b, v4.8b, v5.8b	//GHASH block 4k+1 - mid

	aese	v2.16b, v22.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 4

	aese	v0.16b, v24.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 6
	eor	v8.8b, v8.8b, v6.8b	//GHASH block 4k+2 - mid

	aese	v3.16b, v22.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 4

	pmull	v4.1q, v4.1d, v17.1d	//GHASH block 4k+1 - mid

	aese	v0.16b, v25.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 7

	aese	v3.16b, v23.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 5
	ins	v8.d[1], v8.d[0]	//GHASH block 4k+2 - mid

	aese	v1.16b, v23.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 5

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 8

	aese	v2.16b, v23.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 5

	aese	v1.16b, v24.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 6
	eor	v10.16b, v10.16b, v4.16b	//GHASH block 4k+1 - mid

	pmull2	v4.1q, v6.2d, v13.2d	//GHASH block 4k+2 - high

	pmull	v5.1q, v6.1d, v13.1d	//GHASH block 4k+2 - low

	aese	v1.16b, v25.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 7

	pmull	v6.1q, v7.1d, v12.1d	//GHASH block 4k+3 - low
	eor	v9.16b, v9.16b, v4.16b	//GHASH block 4k+2 - high

	aese	v3.16b, v24.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 6
	ldp	x19, x20, [x0, #16]	//AES block 4k+5 - load plaintext
#ifdef __AARCH64EB__
	rev	x19, x19
	rev	x20, x20
#endif
	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 8
	mov	d4, v7.d[1]	//GHASH block 4k+3 - mid

	aese	v2.16b, v24.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 6
	eor	v11.16b, v11.16b, v5.16b	//GHASH block 4k+2 - low

	pmull2	v8.1q, v8.2d, v16.2d	//GHASH block 4k+2 - mid

	pmull2	v5.1q, v7.2d, v12.2d	//GHASH block 4k+3 - high
	eor	v4.8b, v4.8b, v7.8b	//GHASH block 4k+3 - mid

	aese	v2.16b, v25.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 7
	eor	x19, x19, x13	//AES block 4k+5 - round 14 low

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 9
	eor	v10.16b, v10.16b, v8.16b	//GHASH block 4k+2 - mid

	aese	v3.16b, v25.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 7
	eor	x21, x21, x13	//AES block 4k+6 - round 14 low

	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 9
	movi	v8.8b, #0xc2

	pmull	v4.1q, v4.1d, v16.1d	//GHASH block 4k+3 - mid
	eor	v9.16b, v9.16b, v5.16b	//GHASH block 4k+3 - high
	fmov	d5, x19	//AES block 4k+5 - mov low

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 8
	ldp	x6, x7, [x0, #0]	//AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
	rev	x6, x6
	rev	x7, x7
#endif
	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 10
	shl	d8, d8, #56	//mod_constant

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 8
	eor	v11.16b, v11.16b, v6.16b	//GHASH block 4k+3 - low

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 9

	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 10
	eor	v10.16b, v10.16b, v4.16b	//GHASH block 4k+3 - mid

	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 9
	add	w12, w12, #1	//CTR block 4k+3

	aese	v0.16b, v29.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 11
	eor	v4.16b, v11.16b, v9.16b	//MODULO - karatsuba tidy up

	aese	v1.16b, v29.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 11
	add	x0, x0, #64	//AES input_ptr update

	pmull	v7.1q, v9.1d, v8.1d	//MODULO - top 64b align with mid
	rev	w9, w12	//CTR block 4k+8
	ext	v9.16b, v9.16b, v9.16b, #8	//MODULO - other top alignment

	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 10
	eor	x6, x6, x13	//AES block 4k+4 - round 14 low

	aese	v1.16b, v30.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 12
	eor	v10.16b, v10.16b, v4.16b	//MODULO - karatsuba tidy up

	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 10
	eor	x7, x7, x14	//AES block 4k+4 - round 14 high

	fmov	d4, x6	//AES block 4k+4 - mov low
	orr	x9, x11, x9, lsl #32	//CTR block 4k+8
	eor	v7.16b, v9.16b, v7.16b	//MODULO - fold into mid

	aese	v0.16b, v30.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 12
	eor	x20, x20, x14	//AES block 4k+5 - round 14 high

	aese	v2.16b, v29.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 11
	eor	x24, x24, x14	//AES block 4k+7 - round 14 high

	aese	v3.16b, v29.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 11
	add	w12, w12, #1	//CTR block 4k+8

	aese	v0.16b, v31.16b	//AES block 4k+4 - round 13
	fmov	v4.d[1], x7	//AES block 4k+4 - mov high
	eor	v10.16b, v10.16b, v7.16b	//MODULO - fold into mid

	aese	v2.16b, v30.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 12
	fmov	d7, x23	//AES block 4k+7 - mov low

	aese	v1.16b, v31.16b	//AES block 4k+5 - round 13
	fmov	v5.d[1], x20	//AES block 4k+5 - mov high

	fmov	d6, x21	//AES block 4k+6 - mov low
	// Flags from this cmp are consumed by the b.lt at the bottom of the loop.
	cmp	x0, x5	//.LOOP CONTROL

	fmov	v6.d[1], x22	//AES block 4k+6 - mov high

	pmull	v9.1q, v10.1d, v8.1d	//MODULO - mid 64b align with low
	eor	v4.16b, v4.16b, v0.16b	//AES block 4k+4 - result
	fmov	d0, x10	//CTR block 4k+8

	fmov	v0.d[1], x9	//CTR block 4k+8
	rev	w9, w12	//CTR block 4k+9
	add	w12, w12, #1	//CTR block 4k+9

	eor	v5.16b, v5.16b, v1.16b	//AES block 4k+5 - result
	fmov	d1, x10	//CTR block 4k+9
	orr	x9, x11, x9, lsl #32	//CTR block 4k+9

	aese	v3.16b, v30.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 12
	fmov	v1.d[1], x9	//CTR block 4k+9

	aese	v2.16b, v31.16b	//AES block 4k+6 - round 13
	rev	w9, w12	//CTR block 4k+10
	st1	{ v4.16b}, [x2], #16	//AES block 4k+4 - store result

	orr	x9, x11, x9, lsl #32	//CTR block 4k+10
	eor	v11.16b, v11.16b, v9.16b	//MODULO - fold into low
	fmov	v7.d[1], x24	//AES block 4k+7 - mov high

	ext	v10.16b, v10.16b, v10.16b, #8	//MODULO - other mid alignment
	st1	{ v5.16b}, [x2], #16	//AES block 4k+5 - store result
	add	w12, w12, #1	//CTR block 4k+10

	aese	v3.16b, v31.16b	//AES block 4k+7 - round 13
	eor	v6.16b, v6.16b, v2.16b	//AES block 4k+6 - result
	fmov	d2, x10	//CTR block 4k+10

	st1	{ v6.16b}, [x2], #16	//AES block 4k+6 - store result
	fmov	v2.d[1], x9	//CTR block 4k+10
	rev	w9, w12	//CTR block 4k+11

	eor	v11.16b, v11.16b, v10.16b	//MODULO - fold into low
	orr	x9, x11, x9, lsl #32	//CTR block 4k+11

	eor	v7.16b, v7.16b, v3.16b	//AES block 4k+7 - result
	st1	{ v7.16b}, [x2], #16	//AES block 4k+7 - store result
	b.lt	.L256_enc_main_loop

	// Prepretail: GHASH the last four ciphertext blocks produced above
	// (v4-v7) and run AES rounds 0-13 on the next four counter blocks
	// (v0-v3). No plaintext is loaded here; that is left to the tail.
.L256_enc_prepretail:	//PREPRETAIL
	aese	v1.16b, v18.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 0
	rev64	v6.16b, v6.16b	//GHASH block 4k+2 (t0, t1, and t2 free)

	aese	v2.16b, v18.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 0
	fmov	d3, x10	//CTR block 4k+3

	aese	v0.16b, v18.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 0
	rev64	v4.16b, v4.16b	//GHASH block 4k (only t0 is free)

	fmov	v3.d[1], x9	//CTR block 4k+3
	ext	v11.16b, v11.16b, v11.16b, #8	//PRE 0

	aese	v2.16b, v19.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 1

	aese	v0.16b, v19.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 1

	eor	v4.16b, v4.16b, v11.16b	//PRE 1
	rev64	v5.16b, v5.16b	//GHASH block 4k+1 (t0 and t1 free)

	aese	v2.16b, v20.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 2

	aese	v3.16b, v18.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 0
	mov	d10, v17.d[1]	//GHASH block 4k - mid

	aese	v1.16b, v19.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 1

	pmull	v11.1q, v4.1d, v15.1d	//GHASH block 4k - low
	mov	d8, v4.d[1]	//GHASH block 4k - mid

	pmull2	v9.1q, v4.2d, v15.2d	//GHASH block 4k - high

	aese	v2.16b, v21.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 3

	aese	v1.16b, v20.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 2
	eor	v8.8b, v8.8b, v4.8b	//GHASH block 4k - mid

	aese	v0.16b, v20.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 2

	aese	v3.16b, v19.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 1

	aese	v1.16b, v21.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 3

	pmull	v10.1q, v8.1d, v10.1d	//GHASH block 4k - mid

	pmull2	v4.1q, v5.2d, v14.2d	//GHASH block 4k+1 - high

	pmull	v8.1q, v5.1d, v14.1d	//GHASH block 4k+1 - low

	aese	v3.16b, v20.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 2

	eor	v9.16b, v9.16b, v4.16b	//GHASH block 4k+1 - high
	mov	d4, v5.d[1]	//GHASH block 4k+1 - mid

	aese	v0.16b, v21.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 3
	eor	v11.16b, v11.16b, v8.16b	//GHASH block 4k+1 - low

	aese	v3.16b, v21.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 3

	eor	v4.8b, v4.8b, v5.8b	//GHASH block 4k+1 - mid
	mov	d8, v6.d[1]	//GHASH block 4k+2 - mid

	aese	v0.16b, v22.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 4
	rev64	v7.16b, v7.16b	//GHASH block 4k+3 (t0, t1, t2 and t3 free)

	aese	v3.16b, v22.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 4

	pmull	v4.1q, v4.1d, v17.1d	//GHASH block 4k+1 - mid
	eor	v8.8b, v8.8b, v6.8b	//GHASH block 4k+2 - mid
	add	w12, w12, #1	//CTR block 4k+3

	pmull	v5.1q, v6.1d, v13.1d	//GHASH block 4k+2 - low

	aese	v3.16b, v23.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 5

	aese	v2.16b, v22.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 4
	eor	v10.16b, v10.16b, v4.16b	//GHASH block 4k+1 - mid

	pmull2	v4.1q, v6.2d, v13.2d	//GHASH block 4k+2 - high

	eor	v11.16b, v11.16b, v5.16b	//GHASH block 4k+2 - low
	ins	v8.d[1], v8.d[0]	//GHASH block 4k+2 - mid

	aese	v2.16b, v23.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 5

	eor	v9.16b, v9.16b, v4.16b	//GHASH block 4k+2 - high
	mov	d4, v7.d[1]	//GHASH block 4k+3 - mid

	aese	v1.16b, v22.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 4

	pmull2	v8.1q, v8.2d, v16.2d	//GHASH block 4k+2 - mid

	eor	v4.8b, v4.8b, v7.8b	//GHASH block 4k+3 - mid

	pmull2	v5.1q, v7.2d, v12.2d	//GHASH block 4k+3 - high

	aese	v1.16b, v23.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 5

	pmull	v4.1q, v4.1d, v16.1d	//GHASH block 4k+3 - mid
	eor	v10.16b, v10.16b, v8.16b	//GHASH block 4k+2 - mid

	aese	v0.16b, v23.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 5

	aese	v1.16b, v24.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 6

	aese	v2.16b, v24.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 6

	aese	v0.16b, v24.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 6
	movi	v8.8b, #0xc2

	aese	v3.16b, v24.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 6

	aese	v1.16b, v25.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 7
	eor	v9.16b, v9.16b, v5.16b	//GHASH block 4k+3 - high

	aese	v0.16b, v25.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 7

	aese	v3.16b, v25.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 7
	shl	d8, d8, #56	//mod_constant

	aese	v1.16b, v26.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 8
	eor	v10.16b, v10.16b, v4.16b	//GHASH block 4k+3 - mid

	pmull	v6.1q, v7.1d, v12.1d	//GHASH block 4k+3 - low

	aese	v3.16b, v26.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 8

	aese	v1.16b, v27.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 9

	aese	v0.16b, v26.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 8
	eor	v11.16b, v11.16b, v6.16b	//GHASH block 4k+3 - low

	aese	v3.16b, v27.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 9

	eor	v10.16b, v10.16b, v9.16b	//karatsuba tidy up

	pmull	v4.1q, v9.1d, v8.1d	//MODULO - high half times mod_constant, folded into mid below
	ext	v9.16b, v9.16b, v9.16b, #8	//MODULO - swap the 64-bit halves of high

	aese	v3.16b, v28.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 10

	aese	v2.16b, v25.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 7
	eor	v10.16b, v10.16b, v11.16b	//karatsuba tidy up (low term)

	aese	v1.16b, v28.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 10

	aese	v0.16b, v27.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 9

	aese	v2.16b, v26.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 8

	aese	v1.16b, v29.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 11
	eor	v10.16b, v10.16b, v4.16b	//MODULO - fold into mid

	aese	v0.16b, v28.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 10

	aese	v2.16b, v27.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 9

	aese	v1.16b, v30.16b
	aesmc	v1.16b, v1.16b	//AES block 4k+5 - round 12

	aese	v0.16b, v29.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 11
	eor	v10.16b, v10.16b, v9.16b	//MODULO - fold into mid

	aese	v3.16b, v29.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 11

	aese	v2.16b, v28.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 10

	aese	v0.16b, v30.16b
	aesmc	v0.16b, v0.16b	//AES block 4k+4 - round 12

	pmull	v4.1q, v10.1d, v8.1d	//MODULO - mid half times mod_constant, folded into low below

	aese	v2.16b, v29.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 11
	ext	v10.16b, v10.16b, v10.16b, #8	//MODULO - swap the 64-bit halves of mid

	aese	v3.16b, v30.16b
	aesmc	v3.16b, v3.16b	//AES block 4k+7 - round 12

	aese	v1.16b, v31.16b	//AES block 4k+5 - round 13
	eor	v11.16b, v11.16b, v4.16b	//MODULO - fold into low

	aese	v2.16b, v30.16b
	aesmc	v2.16b, v2.16b	//AES block 4k+6 - round 12

	aese	v3.16b, v31.16b	//AES block 4k+7 - round 13

	aese	v0.16b, v31.16b	//AES block 4k+4 - round 13

	aese	v2.16b, v31.16b	//AES block 4k+6 - round 13
	eor	v11.16b, v11.16b, v10.16b	//MODULO - fold into low
	// Tail: x5 becomes the number of bytes still to process (x4 - x0).
	// v0-v3 hold counter blocks after AES round 13. When fewer than four
	// blocks remain, v1/v2 are moved up so the labelled code below always
	// takes its keystream from v1 (final-2), v2 (final-1) and v3 (final),
	// the GHASH accumulators v9/v10/v11 are cleared (the running tag has
	// already been copied to v8), and w12 is decremented once for every
	// counter block that ends up unused.
.L256_enc_tail:	//TAIL

	ext	v8.16b, v11.16b, v11.16b, #8	//prepare final partial tag
	sub	x5, x4, x0	//main_end_input_ptr is number of bytes left to process
	ldp	x6, x7, [x0], #16	//AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
	rev	x6, x6
	rev	x7, x7
#endif
	eor	x6, x6, x13	//AES block 4k+4 - round 14 low
	eor	x7, x7, x14	//AES block 4k+4 - round 14 high

	cmp	x5, #48
	fmov	d4, x6	//AES block 4k+4 - mov low

	fmov	v4.d[1], x7	//AES block 4k+4 - mov high

	eor	v5.16b, v4.16b, v0.16b	//AES block 4k+4 - result
	b.gt	.L256_enc_blocks_more_than_3

	cmp	x5, #32
	mov	v3.16b, v2.16b
	movi	v11.8b, #0

	movi	v9.8b, #0
	sub	w12, w12, #1

	mov	v2.16b, v1.16b
	movi	v10.8b, #0
	b.gt	.L256_enc_blocks_more_than_2

	mov	v3.16b, v1.16b
	sub	w12, w12, #1
	cmp	x5, #16

	b.gt	.L256_enc_blocks_more_than_1

	sub	w12, w12, #1
	b	.L256_enc_blocks_less_than_1
.L256_enc_blocks_more_than_3:	//blocks left > 3
	st1	{ v5.16b}, [x2], #16	//AES final-3 block - store result

	ldp	x6, x7, [x0], #16	//AES final-2 block - load input low & high
#ifdef __AARCH64EB__
	rev	x6, x6
	rev	x7, x7
#endif
	rev64	v4.16b, v5.16b	//GHASH final-3 block

	eor	x6, x6, x13	//AES final-2 block - round 14 low
	eor	v4.16b, v4.16b, v8.16b	//feed in partial tag

	eor	x7, x7, x14	//AES final-2 block - round 14 high

	mov	d22, v4.d[1]	//GHASH final-3 block - mid
	fmov	d5, x6	//AES final-2 block - mov low

	fmov	v5.d[1], x7	//AES final-2 block - mov high

	eor	v22.8b, v22.8b, v4.8b	//GHASH final-3 block - mid
	movi	v8.8b, #0	//suppress further partial tag feed in

	mov	d10, v17.d[1]	//GHASH final-3 block - mid

	pmull	v11.1q, v4.1d, v15.1d	//GHASH final-3 block - low

	pmull2	v9.1q, v4.2d, v15.2d	//GHASH final-3 block - high

	pmull	v10.1q, v22.1d, v10.1d	//GHASH final-3 block - mid
	eor	v5.16b, v5.16b, v1.16b	//AES final-2 block - result
.L256_enc_blocks_more_than_2:	//blocks left > 2

	st1	{ v5.16b}, [x2], #16	//AES final-2 block - store result

	ldp	x6, x7, [x0], #16	//AES final-1 block - load input low & high
#ifdef __AARCH64EB__
	rev	x6, x6
	rev	x7, x7
#endif
	rev64	v4.16b, v5.16b	//GHASH final-2 block

	eor	x6, x6, x13	//AES final-1 block - round 14 low
	eor	v4.16b, v4.16b, v8.16b	//feed in partial tag

	fmov	d5, x6	//AES final-1 block - mov low
	eor	x7, x7, x14	//AES final-1 block - round 14 high

	fmov	v5.d[1], x7	//AES final-1 block - mov high

	movi	v8.8b, #0	//suppress further partial tag feed in

	pmull2	v20.1q, v4.2d, v14.2d	//GHASH final-2 block - high
	mov	d22, v4.d[1]	//GHASH final-2 block - mid

	pmull	v21.1q, v4.1d, v14.1d	//GHASH final-2 block - low

	eor	v22.8b, v22.8b, v4.8b	//GHASH final-2 block - mid

	eor	v5.16b, v5.16b, v2.16b	//AES final-1 block - result

	eor	v9.16b, v9.16b, v20.16b	//GHASH final-2 block - high

	pmull	v22.1q, v22.1d, v17.1d	//GHASH final-2 block - mid

	eor	v11.16b, v11.16b, v21.16b	//GHASH final-2 block - low

	eor	v10.16b, v10.16b, v22.16b	//GHASH final-2 block - mid
.L256_enc_blocks_more_than_1:	//blocks left > 1

	st1	{ v5.16b}, [x2], #16	//AES final-1 block - store result

	rev64	v4.16b, v5.16b	//GHASH final-1 block

	ldp	x6, x7, [x0], #16	//AES final block - load input low & high
#ifdef __AARCH64EB__
	rev	x6, x6
	rev	x7, x7
#endif
	eor	v4.16b, v4.16b, v8.16b	//feed in partial tag

	movi	v8.8b, #0	//suppress further partial tag feed in

	eor	x6, x6, x13	//AES final block - round 14 low
	mov	d22, v4.d[1]	//GHASH final-1 block - mid

	pmull2	v20.1q, v4.2d, v13.2d	//GHASH final-1 block - high
	eor	x7, x7, x14	//AES final block - round 14 high

	eor	v22.8b, v22.8b, v4.8b	//GHASH final-1 block - mid

	eor	v9.16b, v9.16b, v20.16b	//GHASH final-1 block - high

	ins	v22.d[1], v22.d[0]	//GHASH final-1 block - mid
	fmov	d5, x6	//AES final block - mov low

	fmov	v5.d[1], x7	//AES final block - mov high

	pmull2	v22.1q, v22.2d, v16.2d	//GHASH final-1 block - mid

	pmull	v21.1q, v4.1d, v13.1d	//GHASH final-1 block - low

	eor	v5.16b, v5.16b, v3.16b	//AES final block - result
	eor	v10.16b, v10.16b, v22.16b	//GHASH final-1 block - mid

	eor	v11.16b, v11.16b, v21.16b	//GHASH final-1 block - low
	// Final (possibly partial) block: x13/x14 no longer hold rk14 and are
	// reused to build a mask from the bit length modulo 128. The mask
	// clears the unused part of the result before it is hashed, and bif
	// merges the bytes already present at the output location back in
	// ahead of the full 16-byte store.
.L256_enc_blocks_less_than_1:	//blocks left <= 1

	and	x1, x1, #127	//bit_length %= 128

	mvn	x13, xzr	//rk14_l = 0xffffffffffffffff
	sub	x1, x1, #128	//bit_length -= 128

	neg	x1, x1	//bit_length = 128 - #bits in input (in range [1,128])
	ld1	{ v18.16b}, [x2]	//load existing bytes where the possibly partial last block is to be stored

	mvn	x14, xzr	//rk14_h = 0xffffffffffffffff
	and	x1, x1, #127	//bit_length %= 128

	lsr	x14, x14, x1	//rk14_h is mask for top 64b of last block
	cmp	x1, #64

	csel	x6, x13, x14, lt
	csel	x7, x14, xzr, lt

	fmov	d0, x6	//ctr0b is mask for last block

	fmov	v0.d[1], x7

	and	v5.16b, v5.16b, v0.16b	//possibly partial last block has zeroes in highest bits

	rev64	v4.16b, v5.16b	//GHASH final block

	eor	v4.16b, v4.16b, v8.16b	//feed in partial tag

	bif	v5.16b, v18.16b, v0.16b	//insert existing bytes in top end of result before storing

	pmull2	v20.1q, v4.2d, v12.2d	//GHASH final block - high
	mov	d8, v4.d[1]	//GHASH final block - mid
#ifndef __AARCH64EB__
	rev	w9, w12
#else
	mov	w9, w12
#endif

	pmull	v21.1q, v4.1d, v12.1d	//GHASH final block - low

	eor	v9.16b, v9.16b, v20.16b	//GHASH final block - high
	eor	v8.8b, v8.8b, v4.8b	//GHASH final block - mid

	pmull	v8.1q, v8.1d, v16.1d	//GHASH final block - mid

	eor	v11.16b, v11.16b, v21.16b	//GHASH final block - low

	eor	v10.16b, v10.16b, v8.16b	//GHASH final block - mid
	movi	v8.8b, #0xc2

	eor	v4.16b, v11.16b, v9.16b	//MODULO - karatsuba tidy up

	shl	d8, d8, #56	//mod_constant

	eor	v10.16b, v10.16b, v4.16b	//MODULO - karatsuba tidy up

	pmull	v7.1q, v9.1d, v8.1d	//MODULO - top 64b align with mid

	ext	v9.16b, v9.16b, v9.16b, #8	//MODULO - other top alignment

	eor	v10.16b, v10.16b, v7.16b	//MODULO - fold into mid

	eor	v10.16b, v10.16b, v9.16b	//MODULO - fold into mid

	pmull	v9.1q, v10.1d, v8.1d	//MODULO - mid 64b align with low

	ext	v10.16b, v10.16b, v10.16b, #8	//MODULO - other mid alignment

	str	w9, [x16, #12]	//store the updated counter

	st1	{ v5.16b}, [x2]	//store all 16B
	eor	v11.16b, v11.16b, v9.16b	//MODULO - fold into low

	eor	v11.16b, v11.16b, v10.16b	//MODULO - fold into low
	ext	v11.16b, v11.16b, v11.16b, #8	//undo the ext/rev64 applied when the tag was loaded
	rev64	v11.16b, v11.16b
	mov	x0, x15	//return value: byte length saved at entry
	st1	{ v11.16b }, [x3]	//write the updated tag back

	// Restore callee-saved registers and release the 112-byte frame.
	ldp	x21, x22, [sp, #16]
	ldp	x23, x24, [sp, #32]
	ldp	d8, d9, [sp, #48]
	ldp	d10, d11, [sp, #64]
	ldp	d12, d13, [sp, #80]
	ldp	d14, d15, [sp, #96]
	ldp	x19, x20, [sp], #112
	ret

	// Zero-length input: reached from the cbz at entry, before any
	// register was saved, so there is nothing to restore.
.L256_enc_ret:
	mov	w0, #0x0
	ret
.size	aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
.globl	aes_gcm_dec_256_kernel
.type	aes_gcm_dec_256_kernel,%function
.align	4
aes_gcm_dec_256_kernel:
	cbz	x1, .L256_dec_ret
	stp	x19, x20, [sp, #-112]!
5235 mov x16, x4 5236 mov x8, x5 5237 stp x21, x22, [sp, #16] 5238 stp x23, x24, [sp, #32] 5239 stp d8, d9, [sp, #48] 5240 stp d10, d11, [sp, #64] 5241 stp d12, d13, [sp, #80] 5242 stp d14, d15, [sp, #96] 5243 5244 lsr x5, x1, #3 //byte_len 5245 mov x15, x5 5246 ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 5247#ifdef __AARCH64EB__ 5248 rev x10, x10 5249 rev x11, x11 5250#endif 5251 ldp x13, x14, [x8, #224] //load rk14 5252#ifdef __AARCH64EB__ 5253 ror x14, x14, #32 5254 ror x13, x13, #32 5255#endif 5256 ld1 {v18.4s}, [x8], #16 //load rk0 5257 sub x5, x5, #1 //byte_len - 1 5258 5259 ld1 {v19.4s}, [x8], #16 //load rk1 5260 and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 5261 5262 add x4, x0, x1, lsr #3 //end_input_ptr 5263 ld1 {v20.4s}, [x8], #16 //load rk2 5264 5265 lsr x12, x11, #32 5266 ld1 {v21.4s}, [x8], #16 //load rk3 5267 orr w11, w11, w11 5268 5269 ld1 {v22.4s}, [x8], #16 //load rk4 5270 add x5, x5, x0 5271 rev w12, w12 //rev_ctr32 5272 5273 add w12, w12, #1 //increment rev_ctr32 5274 fmov d3, x10 //CTR block 3 5275 5276 rev w9, w12 //CTR block 1 5277 add w12, w12, #1 //CTR block 1 5278 fmov d1, x10 //CTR block 1 5279 5280 orr x9, x11, x9, lsl #32 //CTR block 1 5281 ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible 5282 5283 fmov v1.d[1], x9 //CTR block 1 5284 rev w9, w12 //CTR block 2 5285 add w12, w12, #1 //CTR block 2 5286 5287 fmov d2, x10 //CTR block 2 5288 orr x9, x11, x9, lsl #32 //CTR block 2 5289 5290 fmov v2.d[1], x9 //CTR block 2 5291 rev w9, w12 //CTR block 3 5292 5293 orr x9, x11, x9, lsl #32 //CTR block 3 5294 ld1 {v23.4s}, [x8], #16 //load rk5 5295 5296 fmov v3.d[1], x9 //CTR block 3 5297 add w12, w12, #1 //CTR block 3 5298 5299 ld1 {v24.4s}, [x8], #16 //load rk6 5300 5301 ld1 {v25.4s}, [x8], #16 //load rk7 5302 5303 ld1 {v26.4s}, [x8], #16 //load rk8 5304 5305 aese v0.16b, v18.16b 5306 aesmc v0.16b, v0.16b 
//AES block 0 - round 0 5307 ldr q14, [x3, #80] //load h3l | h3h 5308#ifndef __AARCH64EB__ 5309 ext v14.16b, v14.16b, v14.16b, #8 5310#endif 5311 5312 aese v3.16b, v18.16b 5313 aesmc v3.16b, v3.16b //AES block 3 - round 0 5314 ldr q15, [x3, #112] //load h4l | h4h 5315#ifndef __AARCH64EB__ 5316 ext v15.16b, v15.16b, v15.16b, #8 5317#endif 5318 5319 aese v1.16b, v18.16b 5320 aesmc v1.16b, v1.16b //AES block 1 - round 0 5321 ldr q13, [x3, #64] //load h2l | h2h 5322#ifndef __AARCH64EB__ 5323 ext v13.16b, v13.16b, v13.16b, #8 5324#endif 5325 5326 aese v2.16b, v18.16b 5327 aesmc v2.16b, v2.16b //AES block 2 - round 0 5328 ld1 {v27.4s}, [x8], #16 //load rk9 5329 5330 aese v0.16b, v19.16b 5331 aesmc v0.16b, v0.16b //AES block 0 - round 1 5332 5333 aese v1.16b, v19.16b 5334 aesmc v1.16b, v1.16b //AES block 1 - round 1 5335 ld1 { v11.16b}, [x3] 5336 ext v11.16b, v11.16b, v11.16b, #8 5337 rev64 v11.16b, v11.16b 5338 5339 aese v2.16b, v19.16b 5340 aesmc v2.16b, v2.16b //AES block 2 - round 1 5341 ld1 {v28.4s}, [x8], #16 //load rk10 5342 5343 aese v3.16b, v19.16b 5344 aesmc v3.16b, v3.16b //AES block 3 - round 1 5345 ld1 {v29.4s}, [x8], #16 //load rk11 5346 5347 aese v0.16b, v20.16b 5348 aesmc v0.16b, v0.16b //AES block 0 - round 2 5349 ldr q12, [x3, #32] //load h1l | h1h 5350#ifndef __AARCH64EB__ 5351 ext v12.16b, v12.16b, v12.16b, #8 5352#endif 5353 aese v2.16b, v20.16b 5354 aesmc v2.16b, v2.16b //AES block 2 - round 2 5355 ld1 {v30.4s}, [x8], #16 //load rk12 5356 5357 aese v3.16b, v20.16b 5358 aesmc v3.16b, v3.16b //AES block 3 - round 2 5359 5360 aese v0.16b, v21.16b 5361 aesmc v0.16b, v0.16b //AES block 0 - round 3 5362 5363 aese v1.16b, v20.16b 5364 aesmc v1.16b, v1.16b //AES block 1 - round 2 5365 5366 aese v3.16b, v21.16b 5367 aesmc v3.16b, v3.16b //AES block 3 - round 3 5368 5369 aese v0.16b, v22.16b 5370 aesmc v0.16b, v0.16b //AES block 0 - round 4 5371 cmp x0, x5 //check if we have <= 4 blocks 5372 5373 aese v2.16b, v21.16b 5374 aesmc v2.16b, v2.16b //AES block 2 - 
round 3 5375 5376 aese v1.16b, v21.16b 5377 aesmc v1.16b, v1.16b //AES block 1 - round 3 5378 5379 aese v3.16b, v22.16b 5380 aesmc v3.16b, v3.16b //AES block 3 - round 4 5381 5382 aese v2.16b, v22.16b 5383 aesmc v2.16b, v2.16b //AES block 2 - round 4 5384 5385 aese v1.16b, v22.16b 5386 aesmc v1.16b, v1.16b //AES block 1 - round 4 5387 5388 aese v3.16b, v23.16b 5389 aesmc v3.16b, v3.16b //AES block 3 - round 5 5390 5391 aese v0.16b, v23.16b 5392 aesmc v0.16b, v0.16b //AES block 0 - round 5 5393 5394 aese v1.16b, v23.16b 5395 aesmc v1.16b, v1.16b //AES block 1 - round 5 5396 5397 aese v2.16b, v23.16b 5398 aesmc v2.16b, v2.16b //AES block 2 - round 5 5399 5400 aese v0.16b, v24.16b 5401 aesmc v0.16b, v0.16b //AES block 0 - round 6 5402 5403 aese v3.16b, v24.16b 5404 aesmc v3.16b, v3.16b //AES block 3 - round 6 5405 5406 aese v1.16b, v24.16b 5407 aesmc v1.16b, v1.16b //AES block 1 - round 6 5408 5409 aese v2.16b, v24.16b 5410 aesmc v2.16b, v2.16b //AES block 2 - round 6 5411 5412 aese v0.16b, v25.16b 5413 aesmc v0.16b, v0.16b //AES block 0 - round 7 5414 5415 aese v1.16b, v25.16b 5416 aesmc v1.16b, v1.16b //AES block 1 - round 7 5417 5418 aese v3.16b, v25.16b 5419 aesmc v3.16b, v3.16b //AES block 3 - round 7 5420 5421 aese v0.16b, v26.16b 5422 aesmc v0.16b, v0.16b //AES block 0 - round 8 5423 5424 aese v2.16b, v25.16b 5425 aesmc v2.16b, v2.16b //AES block 2 - round 7 5426 5427 aese v3.16b, v26.16b 5428 aesmc v3.16b, v3.16b //AES block 3 - round 8 5429 5430 aese v1.16b, v26.16b 5431 aesmc v1.16b, v1.16b //AES block 1 - round 8 5432 5433 aese v0.16b, v27.16b 5434 aesmc v0.16b, v0.16b //AES block 0 - round 9 5435 5436 aese v2.16b, v26.16b 5437 aesmc v2.16b, v2.16b //AES block 2 - round 8 5438 ld1 {v31.4s}, [x8], #16 //load rk13 5439 5440 aese v1.16b, v27.16b 5441 aesmc v1.16b, v1.16b //AES block 1 - round 9 5442 5443 aese v0.16b, v28.16b 5444 aesmc v0.16b, v0.16b //AES block 0 - round 10 5445 5446 aese v3.16b, v27.16b 5447 aesmc v3.16b, v3.16b //AES block 3 - round 9 5448 
5449 aese v1.16b, v28.16b 5450 aesmc v1.16b, v1.16b //AES block 1 - round 10 5451 5452 aese v2.16b, v27.16b 5453 aesmc v2.16b, v2.16b //AES block 2 - round 9 5454 5455 aese v3.16b, v28.16b 5456 aesmc v3.16b, v3.16b //AES block 3 - round 10 5457 5458 aese v0.16b, v29.16b 5459 aesmc v0.16b, v0.16b //AES block 0 - round 11 5460 5461 aese v2.16b, v28.16b 5462 aesmc v2.16b, v2.16b //AES block 2 - round 10 5463 5464 aese v3.16b, v29.16b 5465 aesmc v3.16b, v3.16b //AES block 3 - round 11 5466 5467 aese v1.16b, v29.16b 5468 aesmc v1.16b, v1.16b //AES block 1 - round 11 5469 5470 aese v2.16b, v29.16b 5471 aesmc v2.16b, v2.16b //AES block 2 - round 11 5472 5473 trn1 v9.2d, v14.2d, v15.2d //h4h | h3h 5474 5475 trn2 v17.2d, v14.2d, v15.2d //h4l | h3l 5476 5477 trn1 v8.2d, v12.2d, v13.2d //h2h | h1h 5478 trn2 v16.2d, v12.2d, v13.2d //h2l | h1l 5479 5480 aese v1.16b, v30.16b 5481 aesmc v1.16b, v1.16b //AES block 1 - round 12 5482 5483 aese v0.16b, v30.16b 5484 aesmc v0.16b, v0.16b //AES block 0 - round 12 5485 5486 aese v2.16b, v30.16b 5487 aesmc v2.16b, v2.16b //AES block 2 - round 12 5488 5489 aese v3.16b, v30.16b 5490 aesmc v3.16b, v3.16b //AES block 3 - round 12 5491 eor v17.16b, v17.16b, v9.16b //h4k | h3k 5492 5493 aese v1.16b, v31.16b //AES block 1 - round 13 5494 5495 aese v2.16b, v31.16b //AES block 2 - round 13 5496 eor v16.16b, v16.16b, v8.16b //h2k | h1k 5497 5498 aese v3.16b, v31.16b //AES block 3 - round 13 5499 5500 aese v0.16b, v31.16b //AES block 0 - round 13 5501 b.ge .L256_dec_tail //handle tail 5502 5503 ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0,1 - load ciphertext 5504 5505 rev w9, w12 //CTR block 4 5506 5507 eor v0.16b, v4.16b, v0.16b //AES block 0 - result 5508 5509 eor v1.16b, v5.16b, v1.16b //AES block 1 - result 5510 rev64 v5.16b, v5.16b //GHASH block 1 5511 ld1 {v6.16b}, [x0], #16 //AES block 2 - load ciphertext 5512 5513 mov x7, v0.d[1] //AES block 0 - mov high 5514 5515 mov x6, v0.d[0] //AES block 0 - mov low 5516 rev64 v4.16b, v4.16b //GHASH 
block 0 5517 add w12, w12, #1 //CTR block 4 5518 5519 fmov d0, x10 //CTR block 4 5520 orr x9, x11, x9, lsl #32 //CTR block 4 5521 5522 fmov v0.d[1], x9 //CTR block 4 5523 rev w9, w12 //CTR block 5 5524 add w12, w12, #1 //CTR block 5 5525 5526 mov x19, v1.d[0] //AES block 1 - mov low 5527 5528 orr x9, x11, x9, lsl #32 //CTR block 5 5529 mov x20, v1.d[1] //AES block 1 - mov high 5530 eor x7, x7, x14 //AES block 0 - round 14 high 5531#ifdef __AARCH64EB__ 5532 rev x7, x7 5533#endif 5534 eor x6, x6, x13 //AES block 0 - round 14 low 5535#ifdef __AARCH64EB__ 5536 rev x6, x6 5537#endif 5538 stp x6, x7, [x2], #16 //AES block 0 - store result 5539 fmov d1, x10 //CTR block 5 5540 5541 ld1 {v7.16b}, [x0], #16 //AES block 3 - load ciphertext 5542 5543 fmov v1.d[1], x9 //CTR block 5 5544 rev w9, w12 //CTR block 6 5545 add w12, w12, #1 //CTR block 6 5546 5547 eor x19, x19, x13 //AES block 1 - round 14 low 5548#ifdef __AARCH64EB__ 5549 rev x19, x19 5550#endif 5551 orr x9, x11, x9, lsl #32 //CTR block 6 5552 5553 eor x20, x20, x14 //AES block 1 - round 14 high 5554#ifdef __AARCH64EB__ 5555 rev x20, x20 5556#endif 5557 stp x19, x20, [x2], #16 //AES block 1 - store result 5558 5559 eor v2.16b, v6.16b, v2.16b //AES block 2 - result 5560 cmp x0, x5 //check if we have <= 8 blocks 5561 b.ge .L256_dec_prepretail //do prepretail 5562 5563.L256_dec_main_loop: //main loop start 5564 mov x21, v2.d[0] //AES block 4k+2 - mov low 5565 ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 5566 eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result 5567 5568 aese v0.16b, v18.16b 5569 aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 5570 mov x22, v2.d[1] //AES block 4k+2 - mov high 5571 5572 aese v1.16b, v18.16b 5573 aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 5574 fmov d2, x10 //CTR block 4k+6 5575 5576 fmov v2.d[1], x9 //CTR block 4k+6 5577 eor v4.16b, v4.16b, v11.16b //PRE 1 5578 rev w9, w12 //CTR block 4k+7 5579 5580 aese v0.16b, v19.16b 5581 aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 5582 mov x24, 
v3.d[1] //AES block 4k+3 - mov high 5583 5584 aese v1.16b, v19.16b 5585 aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 5586 mov x23, v3.d[0] //AES block 4k+3 - mov low 5587 5588 pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 5589 mov d8, v4.d[1] //GHASH block 4k - mid 5590 fmov d3, x10 //CTR block 4k+7 5591 5592 aese v0.16b, v20.16b 5593 aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 5594 orr x9, x11, x9, lsl #32 //CTR block 4k+7 5595 5596 aese v2.16b, v18.16b 5597 aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 5598 fmov v3.d[1], x9 //CTR block 4k+7 5599 5600 aese v1.16b, v20.16b 5601 aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 5602 eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 5603 5604 aese v0.16b, v21.16b 5605 aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 5606 eor x22, x22, x14 //AES block 4k+2 - round 14 high 5607#ifdef __AARCH64EB__ 5608 rev x22, x22 5609#endif 5610 aese v2.16b, v19.16b 5611 aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 5612 mov d10, v17.d[1] //GHASH block 4k - mid 5613 5614 aese v1.16b, v21.16b 5615 aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 5616 rev64 v6.16b, v6.16b //GHASH block 4k+2 5617 5618 aese v3.16b, v18.16b 5619 aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 5620 eor x21, x21, x13 //AES block 4k+2 - round 14 low 5621#ifdef __AARCH64EB__ 5622 rev x21, x21 5623#endif 5624 aese v2.16b, v20.16b 5625 aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 5626 stp x21, x22, [x2], #16 //AES block 4k+2 - store result 5627 5628 pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 5629 5630 pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 5631 5632 aese v2.16b, v21.16b 5633 aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 5634 rev64 v7.16b, v7.16b //GHASH block 4k+3 5635 5636 pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 5637 eor x23, x23, x13 //AES block 4k+3 - round 14 low 5638#ifdef __AARCH64EB__ 5639 rev x23, x23 5640#endif 5641 pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 5642 eor x24, x24, x14 //AES block 
4k+3 - round 14 high 5643#ifdef __AARCH64EB__ 5644 rev x24, x24 5645#endif 5646 eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high 5647 5648 aese v2.16b, v22.16b 5649 aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 5650 5651 aese v3.16b, v19.16b 5652 aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 5653 mov d4, v5.d[1] //GHASH block 4k+1 - mid 5654 5655 aese v0.16b, v22.16b 5656 aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 5657 eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low 5658 5659 aese v2.16b, v23.16b 5660 aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 5661 add w12, w12, #1 //CTR block 4k+7 5662 5663 aese v3.16b, v20.16b 5664 aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 5665 mov d8, v6.d[1] //GHASH block 4k+2 - mid 5666 5667 aese v1.16b, v22.16b 5668 aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 5669 eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 5670 5671 pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 5672 5673 aese v3.16b, v21.16b 5674 aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 5675 eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid 5676 5677 aese v1.16b, v23.16b 5678 aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 5679 5680 aese v0.16b, v23.16b 5681 aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 5682 eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low 5683 5684 pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 5685 rev w9, w12 //CTR block 4k+8 5686 5687 aese v1.16b, v24.16b 5688 aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 5689 ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid 5690 5691 aese v0.16b, v24.16b 5692 aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 5693 add w12, w12, #1 //CTR block 4k+8 5694 5695 aese v3.16b, v22.16b 5696 aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 5697 5698 aese v1.16b, v25.16b 5699 aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 5700 eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 5701 5702 aese v0.16b, v25.16b 5703 aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 5704 5705 pmull2 v4.1q, 
v6.2d, v13.2d //GHASH block 4k+2 - high 5706 mov d6, v7.d[1] //GHASH block 4k+3 - mid 5707 5708 aese v3.16b, v23.16b 5709 aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 5710 5711 pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid 5712 5713 aese v0.16b, v26.16b 5714 aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 5715 eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high 5716 5717 aese v3.16b, v24.16b 5718 aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 5719 5720 pmull v4.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 5721 orr x9, x11, x9, lsl #32 //CTR block 4k+8 5722 eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid 5723 5724 pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 5725 5726 aese v0.16b, v27.16b 5727 aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 5728 eor v6.8b, v6.8b, v7.8b //GHASH block 4k+3 - mid 5729 5730 aese v1.16b, v26.16b 5731 aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 5732 5733 aese v2.16b, v24.16b 5734 aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 5735 eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 5736 5737 aese v0.16b, v28.16b 5738 aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 5739 5740 pmull v6.1q, v6.1d, v16.1d //GHASH block 4k+3 - mid 5741 movi v8.8b, #0xc2 5742 5743 aese v2.16b, v25.16b 5744 aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 5745 eor v11.16b, v11.16b, v4.16b //GHASH block 4k+3 - low 5746 5747 aese v0.16b, v29.16b 5748 aesmc v0.16b, v0.16b //AES block 4k+4 - round 11 5749 5750 aese v3.16b, v25.16b 5751 aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 5752 shl d8, d8, #56 //mod_constant 5753 5754 aese v2.16b, v26.16b 5755 aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 5756 eor v10.16b, v10.16b, v6.16b //GHASH block 4k+3 - mid 5757 5758 aese v0.16b, v30.16b 5759 aesmc v0.16b, v0.16b //AES block 4k+4 - round 12 5760 5761 pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 5762 eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 5763 5764 aese v1.16b, v27.16b 5765 aesmc v1.16b, v1.16b //AES 
block 4k+5 - round 9 5766 ld1 {v4.16b}, [x0], #16 //AES block 4k+4 - load ciphertext 5767 5768 aese v0.16b, v31.16b //AES block 4k+4 - round 13 5769 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 5770 5771 aese v1.16b, v28.16b 5772 aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 5773 eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up 5774 5775 aese v2.16b, v27.16b 5776 aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 5777 ld1 {v5.16b}, [x0], #16 //AES block 4k+5 - load ciphertext 5778 5779 aese v3.16b, v26.16b 5780 aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 5781 eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result 5782 5783 aese v1.16b, v29.16b 5784 aesmc v1.16b, v1.16b //AES block 4k+5 - round 11 5785 stp x23, x24, [x2], #16 //AES block 4k+3 - store result 5786 5787 aese v2.16b, v28.16b 5788 aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 5789 eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid 5790 5791 aese v3.16b, v27.16b 5792 aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 5793 ld1 {v6.16b}, [x0], #16 //AES block 4k+6 - load ciphertext 5794 5795 aese v1.16b, v30.16b 5796 aesmc v1.16b, v1.16b //AES block 4k+5 - round 12 5797 ld1 {v7.16b}, [x0], #16 //AES block 4k+7 - load ciphertext 5798 5799 aese v2.16b, v29.16b 5800 aesmc v2.16b, v2.16b //AES block 4k+6 - round 11 5801 mov x7, v0.d[1] //AES block 4k+4 - mov high 5802 5803 aese v3.16b, v28.16b 5804 aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 5805 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 5806 5807 aese v1.16b, v31.16b //AES block 4k+5 - round 13 5808 mov x6, v0.d[0] //AES block 4k+4 - mov low 5809 5810 aese v2.16b, v30.16b 5811 aesmc v2.16b, v2.16b //AES block 4k+6 - round 12 5812 fmov d0, x10 //CTR block 4k+8 5813 5814 aese v3.16b, v29.16b 5815 aesmc v3.16b, v3.16b //AES block 4k+7 - round 11 5816 fmov v0.d[1], x9 //CTR block 4k+8 5817 5818 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 5819 eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result 5820 
rev w9, w12 //CTR block 4k+9 5821 5822 aese v2.16b, v31.16b //AES block 4k+6 - round 13 5823 orr x9, x11, x9, lsl #32 //CTR block 4k+9 5824 cmp x0, x5 //.LOOP CONTROL 5825 5826 add w12, w12, #1 //CTR block 4k+9 5827 5828 eor x6, x6, x13 //AES block 4k+4 - round 14 low 5829#ifdef __AARCH64EB__ 5830 rev x6, x6 5831#endif 5832 eor x7, x7, x14 //AES block 4k+4 - round 14 high 5833#ifdef __AARCH64EB__ 5834 rev x7, x7 5835#endif 5836 mov x20, v1.d[1] //AES block 4k+5 - mov high 5837 eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result 5838 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 5839 5840 aese v3.16b, v30.16b 5841 aesmc v3.16b, v3.16b //AES block 4k+7 - round 12 5842 mov x19, v1.d[0] //AES block 4k+5 - mov low 5843 5844 fmov d1, x10 //CTR block 4k+9 5845 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 5846 5847 fmov v1.d[1], x9 //CTR block 4k+9 5848 rev w9, w12 //CTR block 4k+10 5849 add w12, w12, #1 //CTR block 4k+10 5850 5851 aese v3.16b, v31.16b //AES block 4k+7 - round 13 5852 orr x9, x11, x9, lsl #32 //CTR block 4k+10 5853 5854 rev64 v5.16b, v5.16b //GHASH block 4k+5 5855 eor x20, x20, x14 //AES block 4k+5 - round 14 high 5856#ifdef __AARCH64EB__ 5857 rev x20, x20 5858#endif 5859 stp x6, x7, [x2], #16 //AES block 4k+4 - store result 5860 5861 eor x19, x19, x13 //AES block 4k+5 - round 14 low 5862#ifdef __AARCH64EB__ 5863 rev x19, x19 5864#endif 5865 stp x19, x20, [x2], #16 //AES block 4k+5 - store result 5866 5867 rev64 v4.16b, v4.16b //GHASH block 4k+4 5868 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 5869 b.lt .L256_dec_main_loop 5870 5871 5872.L256_dec_prepretail: //PREPRETAIL 5873 ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 5874 mov x21, v2.d[0] //AES block 4k+2 - mov low 5875 eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result 5876 5877 aese v0.16b, v18.16b 5878 aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 5879 mov x22, v2.d[1] //AES block 4k+2 - mov high 5880 5881 aese v1.16b, v18.16b 5882 aesmc v1.16b, v1.16b //AES 
block 4k+5 - round 0 5883 fmov d2, x10 //CTR block 4k+6 5884 5885 fmov v2.d[1], x9 //CTR block 4k+6 5886 rev w9, w12 //CTR block 4k+7 5887 eor v4.16b, v4.16b, v11.16b //PRE 1 5888 5889 rev64 v6.16b, v6.16b //GHASH block 4k+2 5890 orr x9, x11, x9, lsl #32 //CTR block 4k+7 5891 mov x23, v3.d[0] //AES block 4k+3 - mov low 5892 5893 aese v1.16b, v19.16b 5894 aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 5895 mov x24, v3.d[1] //AES block 4k+3 - mov high 5896 5897 pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 5898 mov d8, v4.d[1] //GHASH block 4k - mid 5899 fmov d3, x10 //CTR block 4k+7 5900 5901 pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 5902 fmov v3.d[1], x9 //CTR block 4k+7 5903 5904 aese v2.16b, v18.16b 5905 aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 5906 mov d10, v17.d[1] //GHASH block 4k - mid 5907 5908 aese v0.16b, v19.16b 5909 aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 5910 eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 5911 5912 pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 5913 5914 aese v2.16b, v19.16b 5915 aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 5916 rev64 v7.16b, v7.16b //GHASH block 4k+3 5917 5918 aese v3.16b, v18.16b 5919 aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 5920 5921 pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 5922 eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high 5923 5924 pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 5925 5926 aese v3.16b, v19.16b 5927 aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 5928 mov d4, v5.d[1] //GHASH block 4k+1 - mid 5929 5930 aese v0.16b, v20.16b 5931 aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 5932 5933 aese v1.16b, v20.16b 5934 aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 5935 eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low 5936 5937 aese v2.16b, v20.16b 5938 aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 5939 5940 aese v0.16b, v21.16b 5941 aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 5942 mov d8, v6.d[1] //GHASH block 4k+2 - mid 5943 
5944 aese v3.16b, v20.16b 5945 aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 5946 eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 5947 5948 pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 5949 5950 aese v0.16b, v22.16b 5951 aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 5952 5953 aese v3.16b, v21.16b 5954 aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 5955 eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid 5956 5957 pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 5958 5959 aese v0.16b, v23.16b 5960 aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 5961 eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low 5962 5963 aese v3.16b, v22.16b 5964 aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 5965 5966 pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 5967 eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 5968 5969 pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 5970 5971 aese v3.16b, v23.16b 5972 aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 5973 ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid 5974 5975 aese v2.16b, v21.16b 5976 aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 5977 5978 aese v1.16b, v21.16b 5979 aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 5980 eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high 5981 5982 pmull v4.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 5983 5984 aese v2.16b, v22.16b 5985 aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 5986 mov d6, v7.d[1] //GHASH block 4k+3 - mid 5987 5988 aese v1.16b, v22.16b 5989 aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 5990 5991 pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid 5992 5993 aese v2.16b, v23.16b 5994 aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 5995 eor v6.8b, v6.8b, v7.8b //GHASH block 4k+3 - mid 5996 5997 aese v1.16b, v23.16b 5998 aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 5999 6000 aese v3.16b, v24.16b 6001 aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 6002 eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid 6003 6004 aese v2.16b, v24.16b 6005 aesmc 
v2.16b, v2.16b //AES block 4k+6 - round 6 6006 6007 aese v0.16b, v24.16b 6008 aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 6009 movi v8.8b, #0xc2 6010 6011 aese v1.16b, v24.16b 6012 aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 6013 eor v11.16b, v11.16b, v4.16b //GHASH block 4k+3 - low 6014 6015 pmull v6.1q, v6.1d, v16.1d //GHASH block 4k+3 - mid 6016 6017 aese v3.16b, v25.16b 6018 aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 6019 eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 6020 6021 aese v1.16b, v25.16b 6022 aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 6023 6024 aese v0.16b, v25.16b 6025 aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 6026 eor v10.16b, v10.16b, v6.16b //GHASH block 4k+3 - mid 6027 6028 aese v3.16b, v26.16b 6029 aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 6030 6031 aese v2.16b, v25.16b 6032 aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 6033 eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 6034 6035 aese v1.16b, v26.16b 6036 aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 6037 6038 aese v0.16b, v26.16b 6039 aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 6040 shl d8, d8, #56 //mod_constant 6041 6042 aese v2.16b, v26.16b 6043 aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 6044 6045 aese v1.16b, v27.16b 6046 aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 6047 eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up 6048 6049 pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 6050 6051 aese v2.16b, v27.16b 6052 aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 6053 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 6054 6055 aese v3.16b, v27.16b 6056 aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 6057 6058 aese v0.16b, v27.16b 6059 aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 6060 eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid 6061 6062 aese v2.16b, v28.16b 6063 aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 6064 6065 aese v3.16b, v28.16b 6066 aesmc v3.16b, v3.16b //AES block 4k+7 - 
round 10 6067 6068 aese v0.16b, v28.16b 6069 aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 6070 eor x22, x22, x14 //AES block 4k+2 - round 14 high 6071#ifdef __AARCH64EB__ 6072 rev x22, x22 6073#endif 6074 aese v1.16b, v28.16b 6075 aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 6076 eor x23, x23, x13 //AES block 4k+3 - round 14 low 6077#ifdef __AARCH64EB__ 6078 rev x23, x23 6079#endif 6080 aese v2.16b, v29.16b 6081 aesmc v2.16b, v2.16b //AES block 4k+6 - round 11 6082 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 6083 6084 aese v0.16b, v29.16b 6085 aesmc v0.16b, v0.16b //AES block 4k+4 - round 11 6086 add w12, w12, #1 //CTR block 4k+7 6087 6088 aese v1.16b, v29.16b 6089 aesmc v1.16b, v1.16b //AES block 4k+5 - round 11 6090 eor x21, x21, x13 //AES block 4k+2 - round 14 low 6091#ifdef __AARCH64EB__ 6092 rev x21, x21 6093#endif 6094 6095 aese v2.16b, v30.16b 6096 aesmc v2.16b, v2.16b //AES block 4k+6 - round 12 6097 6098 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 6099 eor x24, x24, x14 //AES block 4k+3 - round 14 high 6100#ifdef __AARCH64EB__ 6101 rev x24, x24 6102#endif 6103 6104 aese v3.16b, v29.16b 6105 aesmc v3.16b, v3.16b //AES block 4k+7 - round 11 6106 stp x21, x22, [x2], #16 //AES block 4k+2 - store result 6107 6108 aese v1.16b, v30.16b 6109 aesmc v1.16b, v1.16b //AES block 4k+5 - round 12 6110 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 6111 6112 aese v0.16b, v30.16b 6113 aesmc v0.16b, v0.16b //AES block 4k+4 - round 12 6114 stp x23, x24, [x2], #16 //AES block 4k+3 - store result 6115 6116 aese v3.16b, v30.16b 6117 aesmc v3.16b, v3.16b //AES block 4k+7 - round 12 6118 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 6119 6120 aese v1.16b, v31.16b //AES block 4k+5 - round 13 6121 6122 aese v0.16b, v31.16b //AES block 4k+4 - round 13 6123 6124 aese v3.16b, v31.16b //AES block 4k+7 - round 13 6125 6126 aese v2.16b, v31.16b //AES block 4k+6 - round 13 6127 eor v11.16b, v11.16b, v10.16b //MODULO - fold into 
low 6128.L256_dec_tail: //TAIL 6129 6130 sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process 6131 ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext 6132 6133 eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result 6134 6135 mov x6, v0.d[0] //AES block 4k+4 - mov low 6136 6137 mov x7, v0.d[1] //AES block 4k+4 - mov high 6138 ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag 6139 6140 cmp x5, #48 6141 6142 eor x6, x6, x13 //AES block 4k+4 - round 14 low 6143#ifdef __AARCH64EB__ 6144 rev x6, x6 6145#endif 6146 6147 eor x7, x7, x14 //AES block 4k+4 - round 14 high 6148#ifdef __AARCH64EB__ 6149 rev x7, x7 6150#endif 6151 b.gt .L256_dec_blocks_more_than_3 6152 6153 sub w12, w12, #1 6154 mov v3.16b, v2.16b 6155 movi v10.8b, #0 6156 6157 movi v11.8b, #0 6158 cmp x5, #32 6159 6160 movi v9.8b, #0 6161 mov v2.16b, v1.16b 6162 b.gt .L256_dec_blocks_more_than_2 6163 6164 sub w12, w12, #1 6165 6166 mov v3.16b, v1.16b 6167 cmp x5, #16 6168 b.gt .L256_dec_blocks_more_than_1 6169 6170 sub w12, w12, #1 6171 b .L256_dec_blocks_less_than_1 6172.L256_dec_blocks_more_than_3: //blocks left > 3 6173 rev64 v4.16b, v5.16b //GHASH final-3 block 6174 ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext 6175 6176 stp x6, x7, [x2], #16 //AES final-3 block - store result 6177 6178 mov d10, v17.d[1] //GHASH final-3 block - mid 6179 6180 eor v4.16b, v4.16b, v8.16b //feed in partial tag 6181 6182 eor v0.16b, v5.16b, v1.16b //AES final-2 block - result 6183 6184 mov d22, v4.d[1] //GHASH final-3 block - mid 6185 6186 mov x6, v0.d[0] //AES final-2 block - mov low 6187 6188 mov x7, v0.d[1] //AES final-2 block - mov high 6189 6190 eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid 6191 6192 movi v8.8b, #0 //suppress further partial tag feed in 6193 6194 pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high 6195 6196 pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid 6197 eor x6, x6, x13 //AES final-2 block - round 14 low 6198#ifdef __AARCH64EB__ 
6199 rev x6, x6 6200#endif 6201 6202 pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low 6203 eor x7, x7, x14 //AES final-2 block - round 14 high 6204#ifdef __AARCH64EB__ 6205 rev x7, x7 6206#endif 6207.L256_dec_blocks_more_than_2: //blocks left > 2 6208 6209 rev64 v4.16b, v5.16b //GHASH final-2 block 6210 ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext 6211 6212 eor v4.16b, v4.16b, v8.16b //feed in partial tag 6213 stp x6, x7, [x2], #16 //AES final-2 block - store result 6214 6215 eor v0.16b, v5.16b, v2.16b //AES final-1 block - result 6216 6217 mov d22, v4.d[1] //GHASH final-2 block - mid 6218 6219 pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low 6220 6221 pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high 6222 6223 eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid 6224 mov x6, v0.d[0] //AES final-1 block - mov low 6225 6226 mov x7, v0.d[1] //AES final-1 block - mov high 6227 eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low 6228 movi v8.8b, #0 //suppress further partial tag feed in 6229 6230 pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid 6231 6232 eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high 6233 eor x6, x6, x13 //AES final-1 block - round 14 low 6234#ifdef __AARCH64EB__ 6235 rev x6, x6 6236#endif 6237 6238 eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid 6239 eor x7, x7, x14 //AES final-1 block - round 14 high 6240#ifdef __AARCH64EB__ 6241 rev x7, x7 6242#endif 6243.L256_dec_blocks_more_than_1: //blocks left > 1 6244 6245 stp x6, x7, [x2], #16 //AES final-1 block - store result 6246 rev64 v4.16b, v5.16b //GHASH final-1 block 6247 6248 ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext 6249 6250 eor v4.16b, v4.16b, v8.16b //feed in partial tag 6251 movi v8.8b, #0 //suppress further partial tag feed in 6252 6253 mov d22, v4.d[1] //GHASH final-1 block - mid 6254 6255 eor v0.16b, v5.16b, v3.16b //AES final block - result 6256 6257 pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block 
- high 6258 6259 eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid 6260 6261 pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low 6262 mov x6, v0.d[0] //AES final block - mov low 6263 6264 ins v22.d[1], v22.d[0] //GHASH final-1 block - mid 6265 6266 mov x7, v0.d[1] //AES final block - mov high 6267 6268 pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid 6269 eor x6, x6, x13 //AES final block - round 14 low 6270#ifdef __AARCH64EB__ 6271 rev x6, x6 6272#endif 6273 eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low 6274 6275 eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high 6276 6277 eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid 6278 eor x7, x7, x14 //AES final block - round 14 high 6279#ifdef __AARCH64EB__ 6280 rev x7, x7 6281#endif 6282.L256_dec_blocks_less_than_1: //blocks left <= 1 6283 6284 and x1, x1, #127 //bit_length %= 128 6285 mvn x14, xzr //rk14_h = 0xffffffffffffffff 6286 6287 sub x1, x1, #128 //bit_length -= 128 6288 mvn x13, xzr //rk14_l = 0xffffffffffffffff 6289 6290 ldp x4, x5, [x2] //load existing bytes we need to not overwrite 6291 neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) 6292 6293 and x1, x1, #127 //bit_length %= 128 6294 6295 lsr x14, x14, x1 //rk14_h is mask for top 64b of last block 6296 cmp x1, #64 6297 6298 csel x9, x13, x14, lt 6299 csel x10, x14, xzr, lt 6300 6301 fmov d0, x9 //ctr0b is mask for last block 6302 and x6, x6, x9 6303 6304 mov v0.d[1], x10 6305 bic x4, x4, x9 //mask out low existing bytes 6306 6307#ifndef __AARCH64EB__ 6308 rev w9, w12 6309#else 6310 mov w9, w12 6311#endif 6312 6313 bic x5, x5, x10 //mask out high existing bytes 6314 6315 orr x6, x6, x4 6316 6317 and x7, x7, x10 6318 6319 orr x7, x7, x5 6320 6321 and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits 6322 6323 rev64 v4.16b, v5.16b //GHASH final block 6324 6325 eor v4.16b, v4.16b, v8.16b //feed in partial tag 6326 6327 pmull v21.1q, v4.1d, v12.1d //GHASH final block - low 
6328 6329 mov d8, v4.d[1] //GHASH final block - mid 6330 6331 eor v8.8b, v8.8b, v4.8b //GHASH final block - mid 6332 6333 pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high 6334 6335 pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid 6336 6337 eor v9.16b, v9.16b, v20.16b //GHASH final block - high 6338 6339 eor v11.16b, v11.16b, v21.16b //GHASH final block - low 6340 6341 eor v10.16b, v10.16b, v8.16b //GHASH final block - mid 6342 movi v8.8b, #0xc2 6343 6344 eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 6345 6346 shl d8, d8, #56 //mod_constant 6347 6348 eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up 6349 6350 pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 6351 6352 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 6353 6354 eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid 6355 6356 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 6357 6358 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 6359 6360 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 6361 6362 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 6363 6364 stp x6, x7, [x2] 6365 6366 str w9, [x16, #12] //store the updated counter 6367 6368 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 6369 ext v11.16b, v11.16b, v11.16b, #8 6370 rev64 v11.16b, v11.16b 6371 mov x0, x15 6372 st1 { v11.16b }, [x3] 6373 6374 ldp x21, x22, [sp, #16] 6375 ldp x23, x24, [sp, #32] 6376 ldp d8, d9, [sp, #48] 6377 ldp d10, d11, [sp, #64] 6378 ldp d12, d13, [sp, #80] 6379 ldp d14, d15, [sp, #96] 6380 ldp x19, x20, [sp], #112 6381 ret 6382 6383.L256_dec_ret: 6384 mov w0, #0x0 6385 ret 6386.size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel 6387.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 6388.align 2 6389.align 2 6390#endif 6391