/* Do not modify. This file is auto-generated from aes-gcm-armv8_64.pl. */
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=8
.arch armv8-a+crypto
.text
//-----------------------------------------------------------------------
// aes_gcm_enc_128_kernel
//
// AES-128 counter-mode encryption interleaved with GHASH accumulation.
// Register roles, as read off the instructions in this routine:
//   x0  in   input pointer; plaintext is loaded from here
//   x1  in   input length in bits (byte_len = x1 >> 3); 0 => early return
//   x2  in   output pointer; ciphertext is stored here
//   x3  in   pointer to the GHASH state: the current tag is loaded from
//            [x3] and the updated tag is stored back there; hash-key
//            values are loaded from offsets #32 (h1), #64 (h2), #80 (h3)
//            and #112 (h4)
//   x4  in   pointer to the 16-byte counter block (copied to x16); the
//            updated 32-bit counter word is stored to [x16, #12] on exit.
//            x4 itself is then reused as end_input_ptr
//   x5  in   pointer to the round keys (copied to x8): rk0..rk9 are loaded
//            sequentially into v18..v27, rk10 is loaded from offset #160
//            into x13/x14. x5 itself is then reused as a byte count / end
//            pointer for the main loop
//   x0  out  byte_len (x1 >> 3), or 0 when x1 == 0
//
// Stack: one 112-byte frame holding x19-x24 and d8-d15, which are
// restored before returning. No calls are made from this routine.
//
// NOTE(review): the tail always loads a full 16 bytes from the input and
// loads/stores a full 16 bytes at the output, even when the final block is
// partial (existing output bytes are merged back with bif). Confirm the
// callers guarantee those 16 bytes are accessible.
// NOTE(review): the pointer comparisons (cmp x0, x5) are consumed by the
// signed conditions b.ge / b.lt; presumably harmless because both values
// derive from x0 - confirm if tagged or high addresses are possible.
//-----------------------------------------------------------------------
.globl aes_gcm_enc_128_kernel
.type aes_gcm_enc_128_kernel,%function
.align 4
aes_gcm_enc_128_kernel:
	AARCH64_VALID_CALL_TARGET
	cbz x1, .L128_enc_ret
	// Prologue: allocate the frame and save the callee-saved registers used below.
	stp x19, x20, [sp, #-112]!
	mov x16, x4
	mov x8, x5
	stp x21, x22, [sp, #16]
	stp x23, x24, [sp, #32]
	stp d8, d9, [sp, #48]
	stp d10, d11, [sp, #64]
	stp d12, d13, [sp, #80]
	stp d14, d15, [sp, #96]

	// Load the counter block, the last round key (kept in x13/x14) and the current tag.
	ldp x10, x11, [x16]  //ctr96_b64, ctr96_t32
#ifdef __AARCH64EB__
	rev x10, x10
	rev x11, x11
#endif
	ldp x13, x14, [x8, #160]  //load rk10
#ifdef __AARCH64EB__
	ror x13, x13, #32
	ror x14, x14, #32
#endif
	ld1 {v11.16b}, [x3]
	ext v11.16b, v11.16b, v11.16b, #8
	rev64 v11.16b, v11.16b
	lsr x5, x1, #3  //byte_len
	mov x15, x5  // x15 keeps byte_len for the return value

	ld1 {v18.4s}, [x8], #16  //load rk0
	add x4, x0, x1, lsr #3  //end_input_ptr
	sub x5, x5, #1  //byte_len - 1

	// w12 = byte-reversed 32-bit counter word taken from the top half of x11.
	lsr x12, x11, #32
	ldr q15, [x3, #112]  //load h4l | h4h
#ifndef __AARCH64EB__
	ext v15.16b, v15.16b, v15.16b, #8
#endif
	fmov d1, x10  //CTR block 1
	rev w12, w12  //rev_ctr32

	add w12, w12, #1  //increment rev_ctr32
	orr w11, w11, w11  // 32-bit write clears the upper half of x11
	ld1 {v19.4s}, [x8], #16  //load rk1

	rev w9, w12  //CTR block 1
	add w12, w12, #1  //CTR block 1
	fmov d3, x10  //CTR block 3

	orr x9, x11, x9, lsl #32  //CTR block 1
	ld1 { v0.16b}, [x16]  //special case vector load initial counter so we can start first AES block as quickly as possible

	fmov v1.d[1], x9  //CTR block 1
	rev w9, w12  //CTR block 2

	fmov d2, x10  //CTR block 2
	orr x9, x11, x9, lsl #32  //CTR block 2
	add w12, w12, #1  //CTR block 2

	fmov v2.d[1], x9  //CTR block 2
	rev w9, w12  //CTR block 3

	orr x9, x11, x9, lsl #32  //CTR block 3
	ld1 {v20.4s}, [x8], #16  //load rk2

	add w12, w12, #1  //CTR block 3
	fmov v3.d[1], x9  //CTR block 3

	// AES rounds 0-9 for counter blocks 0-3, interleaved with the remaining
	// round-key loads and the hash-key loads/combinations (v16, v17).
	ldr q14, [x3, #80]  //load h3l | h3h
#ifndef __AARCH64EB__
	ext v14.16b, v14.16b, v14.16b, #8
#endif
	aese v1.16b, v18.16b
	aesmc v1.16b, v1.16b  //AES block 1 - round 0
	ld1 {v21.4s}, [x8], #16  //load rk3

	aese v2.16b, v18.16b
	aesmc v2.16b, v2.16b  //AES block 2 - round 0
	ldr q12, [x3, #32]  //load h1l | h1h
#ifndef __AARCH64EB__
	ext v12.16b, v12.16b, v12.16b, #8
#endif

	aese v0.16b, v18.16b
	aesmc v0.16b, v0.16b  //AES block 0 - round 0
	ld1 {v22.4s}, [x8], #16  //load rk4

	aese v3.16b, v18.16b
	aesmc v3.16b, v3.16b  //AES block 3 - round 0
	ld1 {v23.4s}, [x8], #16  //load rk5

	aese v2.16b, v19.16b
	aesmc v2.16b, v2.16b  //AES block 2 - round 1
	trn2 v17.2d, v14.2d, v15.2d  //h4l | h3l

	aese v0.16b, v19.16b
	aesmc v0.16b, v0.16b  //AES block 0 - round 1
	ld1 {v24.4s}, [x8], #16  //load rk6

	aese v1.16b, v19.16b
	aesmc v1.16b, v1.16b  //AES block 1 - round 1
	ld1 {v25.4s}, [x8], #16  //load rk7

	aese v3.16b, v19.16b
	aesmc v3.16b, v3.16b  //AES block 3 - round 1
	trn1 v9.2d, v14.2d, v15.2d  //h4h | h3h

	aese v0.16b, v20.16b
	aesmc v0.16b, v0.16b  //AES block 0 - round 2
	ld1 {v26.4s}, [x8], #16  //load rk8

	aese v1.16b, v20.16b
	aesmc v1.16b, v1.16b  //AES block 1 - round 2
	ldr q13, [x3, #64]  //load h2l | h2h
#ifndef __AARCH64EB__
	ext v13.16b, v13.16b, v13.16b, #8
#endif

	aese v3.16b, v20.16b
	aesmc v3.16b, v3.16b  //AES block 3 - round 2

	aese v2.16b, v20.16b
	aesmc v2.16b, v2.16b  //AES block 2 - round 2
	eor v17.16b, v17.16b, v9.16b  //h4k | h3k

	aese v0.16b, v21.16b
	aesmc v0.16b, v0.16b  //AES block 0 - round 3

	aese v1.16b, v21.16b
	aesmc v1.16b, v1.16b  //AES block 1 - round 3

	aese v2.16b, v21.16b
	aesmc v2.16b, v2.16b  //AES block 2 - round 3
	ld1 {v27.4s}, [x8], #16  //load rk9

	aese v3.16b, v21.16b
	aesmc v3.16b, v3.16b  //AES block 3 - round 3

	and x5, x5, #0xffffffffffffffc0  //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
	trn2 v16.2d, v12.2d, v13.2d  //h2l | h1l

	aese v3.16b, v22.16b
	aesmc v3.16b, v3.16b  //AES block 3 - round 4
	add x5, x5, x0  // x5 = input pointer at which the main loop must stop

	aese v2.16b, v22.16b
	aesmc v2.16b, v2.16b  //AES block 2 - round 4
	// The flags set here are consumed by the b.ge below; the instructions in
	// between are SIMD/crypto operations.
	cmp x0, x5  //check if we have <= 4 blocks

	aese v0.16b, v22.16b
	aesmc v0.16b, v0.16b  //AES block 0 - round 4

	aese v3.16b, v23.16b
	aesmc v3.16b, v3.16b  //AES block 3 - round 5

	aese v2.16b, v23.16b
	aesmc v2.16b, v2.16b  //AES block 2 - round 5

	aese v0.16b, v23.16b
	aesmc v0.16b, v0.16b  //AES block 0 - round 5

	aese v3.16b, v24.16b
	aesmc v3.16b, v3.16b  //AES block 3 - round 6

	aese v1.16b, v22.16b
	aesmc v1.16b, v1.16b  //AES block 1 - round 4

	aese v2.16b, v24.16b
	aesmc v2.16b, v2.16b  //AES block 2 - round 6
	trn1 v8.2d, v12.2d, v13.2d  //h2h | h1h

	aese v0.16b, v24.16b
	aesmc v0.16b, v0.16b  //AES block 0 - round 6

	aese v1.16b, v23.16b
	aesmc v1.16b, v1.16b  //AES block 1 - round 5

	aese v3.16b, v25.16b
	aesmc v3.16b, v3.16b  //AES block 3 - round 7

	aese v0.16b, v25.16b
	aesmc v0.16b, v0.16b  //AES block 0 - round 7

	aese v1.16b, v24.16b
	aesmc v1.16b, v1.16b  //AES block 1 - round 6

	aese v2.16b, v25.16b
	aesmc v2.16b, v2.16b  //AES block 2 - round 7

	aese v0.16b, v26.16b
	aesmc v0.16b, v0.16b  //AES block 0 - round 8

	aese v1.16b, v25.16b
	aesmc v1.16b, v1.16b  //AES block 1 - round 7

	aese v2.16b, v26.16b
	aesmc v2.16b, v2.16b  //AES block 2 - round 8

	aese v3.16b, v26.16b
	aesmc v3.16b, v3.16b  //AES block 3 - round 8

	aese v1.16b, v26.16b
	aesmc v1.16b, v1.16b  //AES block 1 - round 8

	aese v2.16b, v27.16b  //AES block 2 - round 9

	aese v0.16b, v27.16b  //AES block 0 - round 9

	eor v16.16b, v16.16b, v8.16b  //h2k | h1k

	aese v1.16b, v27.16b  //AES block 1 - round 9

	aese v3.16b, v27.16b  //AES block 3 - round 9
	b.ge .L128_enc_tail  //handle tail

	// First 64 bytes: plaintext is loaded into general registers, XORed with
	// rk10 (x13/x14) there, moved into v4-v7 and XORed with the round-9 AES
	// state to form the ciphertext. Counter blocks 4-7 are prepared meanwhile.
	ldp x6, x7, [x0, #0]  //AES block 0 - load plaintext
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	ldp x21, x22, [x0, #32]  //AES block 2 - load plaintext
#ifdef __AARCH64EB__
	rev x21, x21
	rev x22, x22
#endif
	ldp x19, x20, [x0, #16]  //AES block 1 - load plaintext
#ifdef __AARCH64EB__
	rev x19, x19
	rev x20, x20
#endif
	ldp x23, x24, [x0, #48]  //AES block 3 - load plaintext
#ifdef __AARCH64EB__
	rev x23, x23
	rev x24, x24
#endif
	eor x6, x6, x13  //AES block 0 - round 10 low
	eor x7, x7, x14  //AES block 0 - round 10 high

	eor x21, x21, x13  //AES block 2 - round 10 low
	fmov d4, x6  //AES block 0 - mov low

	eor x19, x19, x13  //AES block 1 - round 10 low
	eor x22, x22, x14  //AES block 2 - round 10 high
	fmov v4.d[1], x7  //AES block 0 - mov high

	fmov d5, x19  //AES block 1 - mov low
	eor x20, x20, x14  //AES block 1 - round 10 high

	eor x23, x23, x13  //AES block 3 - round 10 low
	fmov v5.d[1], x20  //AES block 1 - mov high

	fmov d6, x21  //AES block 2 - mov low
	eor x24, x24, x14  //AES block 3 - round 10 high
	rev w9, w12  //CTR block 4

	fmov v6.d[1], x22  //AES block 2 - mov high
	orr x9, x11, x9, lsl #32  //CTR block 4

	eor v4.16b, v4.16b, v0.16b  //AES block 0 - result
	fmov d0, x10  //CTR block 4
	add w12, w12, #1  //CTR block 4

	fmov v0.d[1], x9  //CTR block 4
	rev w9, w12  //CTR block 5

	eor v5.16b, v5.16b, v1.16b  //AES block 1 - result
	fmov d1, x10  //CTR block 5
	orr x9, x11, x9, lsl #32  //CTR block 5

	add w12, w12, #1  //CTR block 5
	add x0, x0, #64  //AES input_ptr update
	fmov v1.d[1], x9  //CTR block 5

	fmov d7, x23  //AES block 3 - mov low
	rev w9, w12  //CTR block 6
	st1 { v4.16b}, [x2], #16  //AES block 0 - store result

	fmov v7.d[1], x24  //AES block 3 - mov high
	orr x9, x11, x9, lsl #32  //CTR block 6

	add w12, w12, #1  //CTR block 6
	eor v6.16b, v6.16b, v2.16b  //AES block 2 - result
	st1 { v5.16b}, [x2], #16  //AES block 1 - store result

	fmov d2, x10  //CTR block 6
	cmp x0, x5  //check if we have <= 8 blocks

	fmov v2.d[1], x9  //CTR block 6
	rev w9, w12  //CTR block 7
	st1 { v6.16b}, [x2], #16  //AES block 2 - store result

	orr x9, x11, x9, lsl #32  //CTR block 7

	eor v7.16b, v7.16b, v3.16b  //AES block 3 - result
	st1 { v7.16b}, [x2], #16  //AES block 3 - store result
	b.ge .L128_enc_prepretail  //do prepretail

	// Main loop, 64 bytes per iteration: the four ciphertext blocks left in
	// v4-v7 by the previous step are folded into the GHASH accumulator (v11)
	// while the next four counter blocks (v0-v3) are encrypted and the next
	// 64 bytes of plaintext are loaded, XORed and stored.
.L128_enc_main_loop:  //main loop start
	ldp x23, x24, [x0, #48]  //AES block 4k+3 - load plaintext
#ifdef __AARCH64EB__
	rev x23, x23
	rev x24, x24
#endif
	rev64 v4.16b, v4.16b  //GHASH block 4k (only t0 is free)
	rev64 v6.16b, v6.16b  //GHASH block 4k+2 (t0, t1, and t2 free)

	aese v2.16b, v18.16b
	aesmc v2.16b, v2.16b  //AES block 4k+6 - round 0
	fmov d3, x10  //CTR block 4k+3

	ext v11.16b, v11.16b, v11.16b, #8  //PRE 0
	rev64 v5.16b, v5.16b  //GHASH block 4k+1 (t0 and t1 free)

	aese v1.16b, v18.16b
	aesmc v1.16b, v1.16b  //AES block 4k+5 - round 0
	add w12, w12, #1  //CTR block 4k+3
	fmov v3.d[1], x9  //CTR block 4k+3

	aese v0.16b, v18.16b
	aesmc v0.16b, v0.16b  //AES block 4k+4 - round 0
	mov d31, v6.d[1]  //GHASH block 4k+2 - mid

	aese v2.16b, v19.16b
	aesmc v2.16b, v2.16b  //AES block 4k+6 - round 1
	mov d30, v5.d[1]  //GHASH block 4k+1 - mid

	aese v1.16b, v19.16b
	aesmc v1.16b, v1.16b  //AES block 4k+5 - round 1
	eor v4.16b, v4.16b, v11.16b  //PRE 1

	aese v3.16b, v18.16b
	aesmc v3.16b, v3.16b  //AES block 4k+7 - round 0
	eor x24, x24, x14  //AES block 4k+3 - round 10 high

	pmull2 v28.1q, v5.2d, v14.2d  //GHASH block 4k+1 - high
	eor v31.8b, v31.8b, v6.8b  //GHASH block 4k+2 - mid
	ldp x6, x7, [x0, #0]  //AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	aese v0.16b, v19.16b
	aesmc v0.16b, v0.16b  //AES block 4k+4 - round 1
	rev w9, w12  //CTR block 4k+8

	eor v30.8b, v30.8b, v5.8b  //GHASH block 4k+1 - mid
	mov d8, v4.d[1]  //GHASH block 4k - mid
	orr x9, x11, x9, lsl #32  //CTR block 4k+8

	pmull2 v9.1q, v4.2d, v15.2d  //GHASH block 4k - high
	add w12, w12, #1  //CTR block 4k+8
	mov d10, v17.d[1]  //GHASH block 4k - mid

	aese v0.16b, v20.16b
	aesmc v0.16b, v0.16b  //AES block 4k+4 - round 2

	pmull v11.1q, v4.1d, v15.1d  //GHASH block 4k - low
	eor v8.8b, v8.8b, v4.8b  //GHASH block 4k - mid

	aese v1.16b, v20.16b
	aesmc v1.16b, v1.16b  //AES block 4k+5 - round 2

	aese v0.16b, v21.16b
	aesmc v0.16b, v0.16b  //AES block 4k+4 - round 3
	eor v9.16b, v9.16b, v28.16b  //GHASH block 4k+1 - high

	pmull v28.1q, v6.1d, v13.1d  //GHASH block 4k+2 - low

	pmull v10.1q, v8.1d, v10.1d  //GHASH block 4k - mid
	rev64 v7.16b, v7.16b  //GHASH block 4k+3 (t0, t1, t2 and t3 free)

	pmull v30.1q, v30.1d, v17.1d  //GHASH block 4k+1 - mid

	pmull v29.1q, v5.1d, v14.1d  //GHASH block 4k+1 - low
	ins v31.d[1], v31.d[0]  //GHASH block 4k+2 - mid

	pmull2 v8.1q, v6.2d, v13.2d  //GHASH block 4k+2 - high
	eor x7, x7, x14  //AES block 4k+4 - round 10 high

	eor v10.16b, v10.16b, v30.16b  //GHASH block 4k+1 - mid
	mov d30, v7.d[1]  //GHASH block 4k+3 - mid

	aese v3.16b, v19.16b
	aesmc v3.16b, v3.16b  //AES block 4k+7 - round 1
	eor v11.16b, v11.16b, v29.16b  //GHASH block 4k+1 - low

	aese v2.16b, v20.16b
	aesmc v2.16b, v2.16b  //AES block 4k+6 - round 2
	eor x6, x6, x13  //AES block 4k+4 - round 10 low

	aese v1.16b, v21.16b
	aesmc v1.16b, v1.16b  //AES block 4k+5 - round 3
	eor v30.8b, v30.8b, v7.8b  //GHASH block 4k+3 - mid

	pmull2 v4.1q, v7.2d, v12.2d  //GHASH block 4k+3 - high

	aese v2.16b, v21.16b
	aesmc v2.16b, v2.16b  //AES block 4k+6 - round 3
	eor v9.16b, v9.16b, v8.16b  //GHASH block 4k+2 - high

	pmull2 v31.1q, v31.2d, v16.2d  //GHASH block 4k+2 - mid

	pmull v29.1q, v7.1d, v12.1d  //GHASH block 4k+3 - low
	movi v8.8b, #0xc2

	pmull v30.1q, v30.1d, v16.1d  //GHASH block 4k+3 - mid
	eor v11.16b, v11.16b, v28.16b  //GHASH block 4k+2 - low

	aese v1.16b, v22.16b
	aesmc v1.16b, v1.16b  //AES block 4k+5 - round 4

	aese v3.16b, v20.16b
	aesmc v3.16b, v3.16b  //AES block 4k+7 - round 2
	shl d8, d8, #56  //mod_constant

	aese v0.16b, v22.16b
	aesmc v0.16b, v0.16b  //AES block 4k+4 - round 4
	eor v9.16b, v9.16b, v4.16b  //GHASH block 4k+3 - high

	aese v1.16b, v23.16b
	aesmc v1.16b, v1.16b  //AES block 4k+5 - round 5
	ldp x19, x20, [x0, #16]  //AES block 4k+5 - load plaintext
#ifdef __AARCH64EB__
	rev x19, x19
	rev x20, x20
#endif
	aese v3.16b, v21.16b
	aesmc v3.16b, v3.16b  //AES block 4k+7 - round 3
	eor v10.16b, v10.16b, v31.16b  //GHASH block 4k+2 - mid

	aese v0.16b, v23.16b
	aesmc v0.16b, v0.16b  //AES block 4k+4 - round 5
	ldp x21, x22, [x0, #32]  //AES block 4k+6 - load plaintext
#ifdef __AARCH64EB__
	rev x21, x21
	rev x22, x22
#endif
	pmull v31.1q, v9.1d, v8.1d  //MODULO - top 64b align with mid
	eor v11.16b, v11.16b, v29.16b  //GHASH block 4k+3 - low

	aese v2.16b, v22.16b
	aesmc v2.16b, v2.16b  //AES block 4k+6 - round 4
	eor x19, x19, x13  //AES block 4k+5 - round 10 low

	aese v3.16b, v22.16b
	aesmc v3.16b, v3.16b  //AES block 4k+7 - round 4
	eor v10.16b, v10.16b, v30.16b  //GHASH block 4k+3 - mid

	aese v1.16b, v24.16b
	aesmc v1.16b, v1.16b  //AES block 4k+5 - round 6
	eor x23, x23, x13  //AES block 4k+3 - round 10 low

	aese v2.16b, v23.16b
	aesmc v2.16b, v2.16b  //AES block 4k+6 - round 5
	eor v30.16b, v11.16b, v9.16b  //MODULO - karatsuba tidy up

	fmov d4, x6  //AES block 4k+4 - mov low
	aese v0.16b, v24.16b
	aesmc v0.16b, v0.16b  //AES block 4k+4 - round 6
	fmov v4.d[1], x7  //AES block 4k+4 - mov high

	add x0, x0, #64  //AES input_ptr update
	fmov d7, x23  //AES block 4k+3 - mov low
	ext v9.16b, v9.16b, v9.16b, #8  //MODULO - other top alignment

	aese v3.16b, v23.16b
	aesmc v3.16b, v3.16b  //AES block 4k+7 - round 5
	fmov d5, x19  //AES block 4k+5 - mov low

	aese v0.16b, v25.16b
	aesmc v0.16b, v0.16b  //AES block 4k+4 - round 7
	eor v10.16b, v10.16b, v30.16b  //MODULO - karatsuba tidy up

	aese v2.16b, v24.16b
	aesmc v2.16b, v2.16b  //AES block 4k+6 - round 6
	eor x20, x20, x14  //AES block 4k+5 - round 10 high

	aese v1.16b, v25.16b
	aesmc v1.16b, v1.16b  //AES block 4k+5 - round 7
	fmov v5.d[1], x20  //AES block 4k+5 - mov high

	aese v0.16b, v26.16b
	aesmc v0.16b, v0.16b  //AES block 4k+4 - round 8
	fmov v7.d[1], x24  //AES block 4k+3 - mov high

	aese v3.16b, v24.16b
	aesmc v3.16b, v3.16b  //AES block 4k+7 - round 6
	// Flags from this compare are consumed by the b.lt at the bottom of the loop.
	cmp x0, x5  //.LOOP CONTROL

	aese v1.16b, v26.16b
	aesmc v1.16b, v1.16b  //AES block 4k+5 - round 8
	eor v10.16b, v10.16b, v31.16b  //MODULO - fold into mid

	aese v0.16b, v27.16b  //AES block 4k+4 - round 9
	eor x21, x21, x13  //AES block 4k+6 - round 10 low
	eor x22, x22, x14  //AES block 4k+6 - round 10 high

	aese v3.16b, v25.16b
	aesmc v3.16b, v3.16b  //AES block 4k+7 - round 7
	fmov d6, x21  //AES block 4k+6 - mov low

	aese v1.16b, v27.16b  //AES block 4k+5 - round 9
	fmov v6.d[1], x22  //AES block 4k+6 - mov high

	aese v2.16b, v25.16b
	aesmc v2.16b, v2.16b  //AES block 4k+6 - round 7
	eor v4.16b, v4.16b, v0.16b  //AES block 4k+4 - result

	fmov d0, x10  //CTR block 4k+8
	aese v3.16b, v26.16b
	aesmc v3.16b, v3.16b  //AES block 4k+7 - round 8

	fmov v0.d[1], x9  //CTR block 4k+8
	rev w9, w12  //CTR block 4k+9
	eor v10.16b, v10.16b, v9.16b  //MODULO - fold into mid

	aese v2.16b, v26.16b
	aesmc v2.16b, v2.16b  //AES block 4k+6 - round 8
	eor v5.16b, v5.16b, v1.16b  //AES block 4k+5 - result

	add w12, w12, #1  //CTR block 4k+9
	orr x9, x11, x9, lsl #32  //CTR block 4k+9
	fmov d1, x10  //CTR block 4k+9

	pmull v9.1q, v10.1d, v8.1d  //MODULO - mid 64b align with low
	fmov v1.d[1], x9  //CTR block 4k+9
	rev w9, w12  //CTR block 4k+10

	aese v2.16b, v27.16b  //AES block 4k+6 - round 9
	st1 { v4.16b}, [x2], #16  //AES block 4k+4 - store result
	eor v6.16b, v6.16b, v2.16b  //AES block 4k+6 - result
	orr x9, x11, x9, lsl #32  //CTR block 4k+10

	aese v3.16b, v27.16b  //AES block 4k+7 - round 9
	add w12, w12, #1  //CTR block 4k+10
	ext v10.16b, v10.16b, v10.16b, #8  //MODULO - other mid alignment
	fmov d2, x10  //CTR block 4k+10

	eor v11.16b, v11.16b, v9.16b  //MODULO - fold into low
	st1 { v5.16b}, [x2], #16  //AES block 4k+5 - store result

	fmov v2.d[1], x9  //CTR block 4k+10
	st1 { v6.16b}, [x2], #16  //AES block 4k+6 - store result
	rev w9, w12  //CTR block 4k+11

	orr x9, x11, x9, lsl #32  //CTR block 4k+11
	eor v7.16b, v7.16b, v3.16b  //AES block 4k+3 - result

	eor v11.16b, v11.16b, v10.16b  //MODULO - fold into low
	st1 { v7.16b}, [x2], #16  //AES block 4k+3 - store result
	b.lt .L128_enc_main_loop

	// Prepretail: GHASH the last four stored ciphertext blocks (v4-v7) and run
	// AES rounds 0-9 on the next four counter blocks. This section performs no
	// memory loads or stores.
.L128_enc_prepretail:  //PREPRETAIL
	rev64 v4.16b, v4.16b  //GHASH block 4k (only t0 is free)
	fmov d3, x10  //CTR block 4k+3
	rev64 v5.16b, v5.16b  //GHASH block 4k+1 (t0 and t1 free)

	ext v11.16b, v11.16b, v11.16b, #8  //PRE 0
	add w12, w12, #1  //CTR block 4k+3
	fmov v3.d[1], x9  //CTR block 4k+3

	aese v1.16b, v18.16b
	aesmc v1.16b, v1.16b  //AES block 4k+5 - round 0
	rev64 v6.16b, v6.16b  //GHASH block 4k+2 (t0, t1, and t2 free)

	pmull v29.1q, v5.1d, v14.1d  //GHASH block 4k+1 - low

	rev64 v7.16b, v7.16b  //GHASH block 4k+3 (t0, t1, t2 and t3 free)
	eor v4.16b, v4.16b, v11.16b  //PRE 1

	pmull2 v28.1q, v5.2d, v14.2d  //GHASH block 4k+1 - high

	aese v3.16b, v18.16b
	aesmc v3.16b, v3.16b  //AES block 4k+7 - round 0
	mov d30, v5.d[1]  //GHASH block 4k+1 - mid

	pmull v11.1q, v4.1d, v15.1d  //GHASH block 4k - low
	mov d8, v4.d[1]  //GHASH block 4k - mid

	mov d31, v6.d[1]  //GHASH block 4k+2 - mid
	mov d10, v17.d[1]  //GHASH block 4k - mid

	aese v1.16b, v19.16b
	aesmc v1.16b, v1.16b  //AES block 4k+5 - round 1
	eor v30.8b, v30.8b, v5.8b  //GHASH block 4k+1 - mid

	eor v8.8b, v8.8b, v4.8b  //GHASH block 4k - mid

	pmull2 v9.1q, v4.2d, v15.2d  //GHASH block 4k - high
	eor v31.8b, v31.8b, v6.8b  //GHASH block 4k+2 - mid

	aese v3.16b, v19.16b
	aesmc v3.16b, v3.16b  //AES block 4k+7 - round 1

	pmull v30.1q, v30.1d, v17.1d  //GHASH block 4k+1 - mid
	eor v11.16b, v11.16b, v29.16b  //GHASH block 4k+1 - low

	pmull v10.1q, v8.1d, v10.1d  //GHASH block 4k - mid

	aese v0.16b, v18.16b
	aesmc v0.16b, v0.16b  //AES block 4k+4 - round 0
	ins v31.d[1], v31.d[0]  //GHASH block 4k+2 - mid

	aese v2.16b, v18.16b
	aesmc v2.16b, v2.16b  //AES block 4k+6 - round 0

	eor v10.16b, v10.16b, v30.16b  //GHASH block 4k+1 - mid
	mov d30, v7.d[1]  //GHASH block 4k+3 - mid

	aese v0.16b, v19.16b
	aesmc v0.16b, v0.16b  //AES block 4k+4 - round 1
	eor v9.16b, v9.16b, v28.16b  //GHASH block 4k+1 - high

	pmull2 v31.1q, v31.2d, v16.2d  //GHASH block 4k+2 - mid

	pmull2 v8.1q, v6.2d, v13.2d  //GHASH block 4k+2 - high
	eor v30.8b, v30.8b, v7.8b  //GHASH block 4k+3 - mid

	pmull2 v4.1q, v7.2d, v12.2d  //GHASH block 4k+3 - high

	pmull v28.1q, v6.1d, v13.1d  //GHASH block 4k+2 - low

	aese v2.16b, v19.16b
	aesmc v2.16b, v2.16b  //AES block 4k+6 - round 1
	eor v9.16b, v9.16b, v8.16b  //GHASH block 4k+2 - high

	aese v0.16b, v20.16b
	aesmc v0.16b, v0.16b  //AES block 4k+4 - round 2

	pmull v29.1q, v7.1d, v12.1d  //GHASH block 4k+3 - low
	movi v8.8b, #0xc2

	aese v2.16b, v20.16b
	aesmc v2.16b, v2.16b  //AES block 4k+6 - round 2
	eor v11.16b, v11.16b, v28.16b  //GHASH block 4k+2 - low

	aese v3.16b, v20.16b
	aesmc v3.16b, v3.16b  //AES block 4k+7 - round 2

	pmull v30.1q, v30.1d, v16.1d  //GHASH block 4k+3 - mid
	eor v10.16b, v10.16b, v31.16b  //GHASH block 4k+2 - mid

	aese v2.16b, v21.16b
	aesmc v2.16b, v2.16b  //AES block 4k+6 - round 3

	aese v1.16b, v20.16b
	aesmc v1.16b, v1.16b  //AES block 4k+5 - round 2
	eor v9.16b, v9.16b, v4.16b  //GHASH block 4k+3 - high

	aese v0.16b, v21.16b
	aesmc v0.16b, v0.16b  //AES block 4k+4 - round 3

	eor v10.16b, v10.16b, v30.16b  //GHASH block 4k+3 - mid
	shl d8, d8, #56  //mod_constant

	aese v1.16b, v21.16b
	aesmc v1.16b, v1.16b  //AES block 4k+5 - round 3
	eor v11.16b, v11.16b, v29.16b  //GHASH block 4k+3 - low

	aese v0.16b, v22.16b
	aesmc v0.16b, v0.16b  //AES block 4k+4 - round 4

	pmull v28.1q, v9.1d, v8.1d
	eor v10.16b, v10.16b, v9.16b  //karatsuba tidy up

	aese v1.16b, v22.16b
	aesmc v1.16b, v1.16b  //AES block 4k+5 - round 4

	aese v0.16b, v23.16b
	aesmc v0.16b, v0.16b  //AES block 4k+4 - round 5
	ext v9.16b, v9.16b, v9.16b, #8

	aese v3.16b, v21.16b
	aesmc v3.16b, v3.16b  //AES block 4k+7 - round 3

	aese v2.16b, v22.16b
	aesmc v2.16b, v2.16b  //AES block 4k+6 - round 4
	eor v10.16b, v10.16b, v11.16b

	aese v0.16b, v24.16b
	aesmc v0.16b, v0.16b  //AES block 4k+4 - round 6

	aese v3.16b, v22.16b
	aesmc v3.16b, v3.16b  //AES block 4k+7 - round 4

	aese v1.16b, v23.16b
	aesmc v1.16b, v1.16b  //AES block 4k+5 - round 5

	aese v2.16b, v23.16b
	aesmc v2.16b, v2.16b  //AES block 4k+6 - round 5
	eor v10.16b, v10.16b, v28.16b

	aese v3.16b, v23.16b
	aesmc v3.16b, v3.16b  //AES block 4k+7 - round 5

	aese v1.16b, v24.16b
	aesmc v1.16b, v1.16b  //AES block 4k+5 - round 6

	aese v2.16b, v24.16b
	aesmc v2.16b, v2.16b  //AES block 4k+6 - round 6

	aese v3.16b, v24.16b
	aesmc v3.16b, v3.16b  //AES block 4k+7 - round 6
	eor v10.16b, v10.16b, v9.16b

	aese v0.16b, v25.16b
	aesmc v0.16b, v0.16b  //AES block 4k+4 - round 7

	aese v2.16b, v25.16b
	aesmc v2.16b, v2.16b  //AES block 4k+6 - round 7

	aese v3.16b, v25.16b
	aesmc v3.16b, v3.16b  //AES block 4k+7 - round 7

	pmull v28.1q, v10.1d, v8.1d

	aese v1.16b, v25.16b
	aesmc v1.16b, v1.16b  //AES block 4k+5 - round 7
	ext v10.16b, v10.16b, v10.16b, #8

	aese v3.16b, v26.16b
	aesmc v3.16b, v3.16b  //AES block 4k+7 - round 8

	aese v0.16b, v26.16b
	aesmc v0.16b, v0.16b  //AES block 4k+4 - round 8
	eor v11.16b, v11.16b, v28.16b

	aese v1.16b, v26.16b
	aesmc v1.16b, v1.16b  //AES block 4k+5 - round 8

	aese v3.16b, v27.16b  //AES block 4k+7 - round 9

	aese v2.16b, v26.16b
	aesmc v2.16b, v2.16b  //AES block 4k+6 - round 8

	aese v0.16b, v27.16b  //AES block 4k+4 - round 9

	aese v1.16b, v27.16b  //AES block 4k+5 - round 9
	eor v11.16b, v11.16b, v10.16b

	aese v2.16b, v27.16b  //AES block 4k+6 - round 9
	// Tail: reached with v0-v3 holding the round-9 state of the next four
	// counter blocks. x5 becomes the number of bytes still to process and is
	// compared against 48/32/16 to choose how many full-block steps to run
	// before the final (possibly partial) block.
.L128_enc_tail:  //TAIL

	sub x5, x4, x0  //main_end_input_ptr is number of bytes left to process
	ldp x6, x7, [x0], #16  //AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	cmp x5, #48

	ext v8.16b, v11.16b, v11.16b, #8  //prepare final partial tag
	eor x6, x6, x13  //AES block 4k+4 - round 10 low
	eor x7, x7, x14  //AES block 4k+4 - round 10 high

	fmov d4, x6  //AES block 4k+4 - mov low

	fmov v4.d[1], x7  //AES block 4k+4 - mov high

	eor v5.16b, v4.16b, v0.16b  //AES block 4k+4 - result

	b.gt .L128_enc_blocks_more_than_3

	// 48 bytes or fewer remain: w12 is stepped back once per prepared counter
	// block that will not be consumed, the key-stream registers are shifted
	// down (v3 <- v2 <- v1), and the GHASH partial sums are cleared because
	// the skipped steps would otherwise have initialised them.
	sub w12, w12, #1
	movi v11.8b, #0
	mov v3.16b, v2.16b

	cmp x5, #32
	mov v2.16b, v1.16b
	movi v9.8b, #0

	movi v10.8b, #0
	b.gt .L128_enc_blocks_more_than_2

	mov v3.16b, v1.16b
	cmp x5, #16

	sub w12, w12, #1
	b.gt .L128_enc_blocks_more_than_1

	sub w12, w12, #1
	b .L128_enc_blocks_less_than_1
.L128_enc_blocks_more_than_3:  //blocks left > 3
	st1 { v5.16b}, [x2], #16  //AES final-3 block - store result

	ldp x6, x7, [x0], #16  //AES final-2 block - load input low & high
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	rev64 v4.16b, v5.16b  //GHASH final-3 block

	eor v4.16b, v4.16b, v8.16b  //feed in partial tag
	eor x7, x7, x14  //AES final-2 block - round 10 high
	eor x6, x6, x13  //AES final-2 block - round 10 low

	fmov d5, x6  //AES final-2 block - mov low

	movi v8.8b, #0  //suppress further partial tag feed in
	fmov v5.d[1], x7  //AES final-2 block - mov high

	pmull v11.1q, v4.1d, v15.1d  //GHASH final-3 block - low
	mov d22, v4.d[1]  //GHASH final-3 block - mid

	pmull2 v9.1q, v4.2d, v15.2d  //GHASH final-3 block - high

	mov d10, v17.d[1]  //GHASH final-3 block - mid

	eor v5.16b, v5.16b, v1.16b  //AES final-2 block - result
	eor v22.8b, v22.8b, v4.8b  //GHASH final-3 block - mid

	pmull v10.1q, v22.1d, v10.1d  //GHASH final-3 block - mid
.L128_enc_blocks_more_than_2:  //blocks left > 2

	st1 { v5.16b}, [x2], #16  //AES final-2 block - store result

	rev64 v4.16b, v5.16b  //GHASH final-2 block
	ldp x6, x7, [x0], #16  //AES final-1 block - load input low & high
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	eor v4.16b, v4.16b, v8.16b  //feed in partial tag

	eor x6, x6, x13  //AES final-1 block - round 10 low

	fmov d5, x6  //AES final-1 block - mov low
	eor x7, x7, x14  //AES final-1 block - round 10 high

	pmull2 v20.1q, v4.2d, v14.2d  //GHASH final-2 block - high
	fmov v5.d[1], x7  //AES final-1 block - mov high

	mov d22, v4.d[1]  //GHASH final-2 block - mid

	pmull v21.1q, v4.1d, v14.1d  //GHASH final-2 block - low

	eor v9.16b, v9.16b, v20.16b  //GHASH final-2 block - high

	eor v22.8b, v22.8b, v4.8b  //GHASH final-2 block - mid

	eor v5.16b, v5.16b, v2.16b  //AES final-1 block - result

	eor v11.16b, v11.16b, v21.16b  //GHASH final-2 block - low

	pmull v22.1q, v22.1d, v17.1d  //GHASH final-2 block - mid

	movi v8.8b, #0  //suppress further partial tag feed in

	eor v10.16b, v10.16b, v22.16b  //GHASH final-2 block - mid
.L128_enc_blocks_more_than_1:  //blocks left > 1

	st1 { v5.16b}, [x2], #16  //AES final-1 block - store result

	rev64 v4.16b, v5.16b  //GHASH final-1 block
	ldp x6, x7, [x0], #16  //AES final block - load input low & high
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	eor v4.16b, v4.16b, v8.16b  //feed in partial tag

	eor x7, x7, x14  //AES final block - round 10 high
	eor x6, x6, x13  //AES final block - round 10 low

	fmov d5, x6  //AES final block - mov low

	pmull2 v20.1q, v4.2d, v13.2d  //GHASH final-1 block - high
	fmov v5.d[1], x7  //AES final block - mov high

	mov d22, v4.d[1]  //GHASH final-1 block - mid

	pmull v21.1q, v4.1d, v13.1d  //GHASH final-1 block - low

	eor v22.8b, v22.8b, v4.8b  //GHASH final-1 block - mid

	eor v5.16b, v5.16b, v3.16b  //AES final block - result

	ins v22.d[1], v22.d[0]  //GHASH final-1 block - mid

	pmull2 v22.1q, v22.2d, v16.2d  //GHASH final-1 block - mid

	eor v11.16b, v11.16b, v21.16b  //GHASH final-1 block - low

	eor v9.16b, v9.16b, v20.16b  //GHASH final-1 block - high

	eor v10.16b, v10.16b, v22.16b  //GHASH final-1 block - mid
	movi v8.8b, #0  //suppress further partial tag feed in
	// Final block: x13/x14 (no longer needed as rk10) are reused to build a
	// 128-bit mask in v0 from the bit length modulo 128. The mask zeroes the
	// unused part of the last ciphertext block before it is hashed, and bif
	// later fills that part with the bytes already present at [x2] so the
	// 16-byte store leaves them unchanged.
.L128_enc_blocks_less_than_1:  //blocks left <= 1

	and x1, x1, #127  //bit_length %= 128
	mvn x13, xzr  //rk10_l = 0xffffffffffffffff

	mvn x14, xzr  //rk10_h = 0xffffffffffffffff
	sub x1, x1, #128  //bit_length -= 128

	neg x1, x1  //bit_length = 128 - #bits in input (in range [1,128])

	and x1, x1, #127  //bit_length %= 128

	lsr x14, x14, x1  //rk10_h is mask for top 64b of last block
	cmp x1, #64

	csel x6, x13, x14, lt
	csel x7, x14, xzr, lt

	fmov d0, x6  //ctr0b is mask for last block

	fmov v0.d[1], x7

	and v5.16b, v5.16b, v0.16b  //possibly partial last block has zeroes in highest bits

	rev64 v4.16b, v5.16b  //GHASH final block

	eor v4.16b, v4.16b, v8.16b  //feed in partial tag

	mov d8, v4.d[1]  //GHASH final block - mid

	pmull v21.1q, v4.1d, v12.1d  //GHASH final block - low
	ld1 { v18.16b}, [x2]  //load existing bytes where the possibly partial last block is to be stored

	eor v8.8b, v8.8b, v4.8b  //GHASH final block - mid
	// w9 = counter word in the byte order in which it is stored to [x16, #12] below.
#ifndef __AARCH64EB__
	rev w9, w12
#else
	mov w9, w12
#endif
	pmull2 v20.1q, v4.2d, v12.2d  //GHASH final block - high

	pmull v8.1q, v8.1d, v16.1d  //GHASH final block - mid

	eor v11.16b, v11.16b, v21.16b  //GHASH final block - low

	eor v9.16b, v9.16b, v20.16b  //GHASH final block - high

	eor v10.16b, v10.16b, v8.16b  //GHASH final block - mid
	movi v8.8b, #0xc2

	eor v30.16b, v11.16b, v9.16b  //MODULO - karatsuba tidy up

	shl d8, d8, #56  //mod_constant

	eor v10.16b, v10.16b, v30.16b  //MODULO - karatsuba tidy up

	pmull v31.1q, v9.1d, v8.1d  //MODULO - top 64b align with mid

	ext v9.16b, v9.16b, v9.16b, #8  //MODULO - other top alignment

	eor v10.16b, v10.16b, v31.16b  //MODULO - fold into mid

	eor v10.16b, v10.16b, v9.16b  //MODULO - fold into mid

	pmull v9.1q, v10.1d, v8.1d  //MODULO - mid 64b align with low

	ext v10.16b, v10.16b, v10.16b, #8  //MODULO - other mid alignment

	bif v5.16b, v18.16b, v0.16b  //insert existing bytes in top end of result before storing

	eor v11.16b, v11.16b, v9.16b  //MODULO - fold into low
	st1 { v5.16b}, [x2]  //store all 16B

	str w9, [x16, #12]  //store the updated counter

	// Undo the ext/rev64 applied to the tag on entry, store it back to [x3],
	// return byte_len and restore the saved registers.
	eor v11.16b, v11.16b, v10.16b  //MODULO - fold into low
	ext v11.16b, v11.16b, v11.16b, #8
	rev64 v11.16b, v11.16b
	mov x0, x15
	st1 { v11.16b }, [x3]
	ldp x21, x22, [sp, #16]
	ldp x23, x24, [sp, #32]
	ldp d8, d9, [sp, #48]
	ldp d10, d11, [sp, #64]
	ldp d12, d13, [sp, #80]
	ldp d14, d15, [sp, #96]
	ldp x19, x20, [sp], #112
	ret

	// Zero-length input: nothing was pushed, so return 0 directly.
.L128_enc_ret:
	mov w0, #0x0
	ret
.size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
.globl aes_gcm_dec_128_kernel
.type aes_gcm_dec_128_kernel,%function
.align 4
aes_gcm_dec_128_kernel:
	AARCH64_VALID_CALL_TARGET
	cbz x1, .L128_dec_ret
	stp x19, x20, [sp, #-112]!
	mov x16, x4
	mov x8, x5
	stp x21, x22, [sp, #16]
	stp x23, x24, [sp, #32]
	stp d8, d9, [sp, #48]
	stp d10, d11, [sp, #64]
	stp d12, d13, [sp, #80]
	stp d14, d15, [sp, #96]

	lsr x5, x1, #3  //byte_len
	mov x15, x5
	ldp x10, x11, [x16]  //ctr96_b64, ctr96_t32
#ifdef __AARCH64EB__
	rev x10, x10
	rev x11, x11
#endif
	ldp x13, x14, [x8, #160]  //load rk10
#ifdef __AARCH64EB__
	ror x14, x14, 32
	ror x13, x13, 32
#endif
	sub x5, x5, #1  //byte_len - 1
	ld1 {v18.4s}, [x8], #16  //load rk0

	and x5, x5, #0xffffffffffffffc0  //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
	ld1 { v0.16b}, [x16]  //special case vector load initial counter so we can start first AES block as quickly as possible

	ldr q13, [x3, #64]  //load h2l | h2h
#ifndef __AARCH64EB__
	ext v13.16b, v13.16b, v13.16b, #8
#endif
	lsr x12, x11, #32
	fmov d2, x10  //CTR block 2

	ld1 {v19.4s}, [x8], #16  //load rk1
	orr w11, w11, w11
	rev w12, w12  //rev_ctr32

	fmov d1, x10  //CTR block 1
	add w12, w12, #1  //increment rev_ctr32

	aese v0.16b, v18.16b
	aesmc v0.16b, v0.16b  //AES block 0 - round 0
	rev w9, w12  //CTR block 1

	orr x9, x11, x9, lsl #32  //CTR block 1
	ld1 {v20.4s}, [x8], #16  //load rk2
	add w12, w12, #1  //CTR block 1

fmov v1.d[1], x9 //CTR block 1 1047 rev w9, w12 //CTR block 2 1048 add w12, w12, #1 //CTR block 2 1049 1050 aese v0.16b, v19.16b 1051 aesmc v0.16b, v0.16b //AES block 0 - round 1 1052 orr x9, x11, x9, lsl #32 //CTR block 2 1053 1054 fmov v2.d[1], x9 //CTR block 2 1055 rev w9, w12 //CTR block 3 1056 1057 fmov d3, x10 //CTR block 3 1058 orr x9, x11, x9, lsl #32 //CTR block 3 1059 add w12, w12, #1 //CTR block 3 1060 1061 fmov v3.d[1], x9 //CTR block 3 1062 add x4, x0, x1, lsr #3 //end_input_ptr 1063 1064 aese v1.16b, v18.16b 1065 aesmc v1.16b, v1.16b //AES block 1 - round 0 1066 ld1 {v21.4s}, [x8], #16 //load rk3 1067 1068 aese v0.16b, v20.16b 1069 aesmc v0.16b, v0.16b //AES block 0 - round 2 1070 ld1 {v22.4s}, [x8], #16 //load rk4 1071 1072 aese v2.16b, v18.16b 1073 aesmc v2.16b, v2.16b //AES block 2 - round 0 1074 ld1 {v23.4s}, [x8], #16 //load rk5 1075 1076 aese v1.16b, v19.16b 1077 aesmc v1.16b, v1.16b //AES block 1 - round 1 1078 ld1 {v24.4s}, [x8], #16 //load rk6 1079 1080 aese v3.16b, v18.16b 1081 aesmc v3.16b, v3.16b //AES block 3 - round 0 1082 1083 aese v2.16b, v19.16b 1084 aesmc v2.16b, v2.16b //AES block 2 - round 1 1085 1086 aese v1.16b, v20.16b 1087 aesmc v1.16b, v1.16b //AES block 1 - round 2 1088 1089 aese v3.16b, v19.16b 1090 aesmc v3.16b, v3.16b //AES block 3 - round 1 1091 ld1 { v11.16b}, [x3] 1092 ext v11.16b, v11.16b, v11.16b, #8 1093 rev64 v11.16b, v11.16b 1094 1095 aese v0.16b, v21.16b 1096 aesmc v0.16b, v0.16b //AES block 0 - round 3 1097 ld1 {v25.4s}, [x8], #16 //load rk7 1098 1099 aese v1.16b, v21.16b 1100 aesmc v1.16b, v1.16b //AES block 1 - round 3 1101 1102 aese v3.16b, v20.16b 1103 aesmc v3.16b, v3.16b //AES block 3 - round 2 1104 1105 aese v2.16b, v20.16b 1106 aesmc v2.16b, v2.16b //AES block 2 - round 2 1107 ld1 {v26.4s}, [x8], #16 //load rk8 1108 1109 aese v1.16b, v22.16b 1110 aesmc v1.16b, v1.16b //AES block 1 - round 4 1111 1112 aese v3.16b, v21.16b 1113 aesmc v3.16b, v3.16b //AES block 3 - round 3 1114 1115 aese v2.16b, v21.16b 1116 
aesmc v2.16b, v2.16b //AES block 2 - round 3 1117 ldr q14, [x3, #80] //load h3l | h3h 1118#ifndef __AARCH64EB__ 1119 ext v14.16b, v14.16b, v14.16b, #8 1120#endif 1121 aese v0.16b, v22.16b 1122 aesmc v0.16b, v0.16b //AES block 0 - round 4 1123 ld1 {v27.4s}, [x8], #16 //load rk9 1124 1125 aese v1.16b, v23.16b 1126 aesmc v1.16b, v1.16b //AES block 1 - round 5 1127 1128 aese v2.16b, v22.16b 1129 aesmc v2.16b, v2.16b //AES block 2 - round 4 1130 1131 aese v3.16b, v22.16b 1132 aesmc v3.16b, v3.16b //AES block 3 - round 4 1133 1134 aese v0.16b, v23.16b 1135 aesmc v0.16b, v0.16b //AES block 0 - round 5 1136 1137 aese v2.16b, v23.16b 1138 aesmc v2.16b, v2.16b //AES block 2 - round 5 1139 ldr q12, [x3, #32] //load h1l | h1h 1140#ifndef __AARCH64EB__ 1141 ext v12.16b, v12.16b, v12.16b, #8 1142#endif 1143 aese v3.16b, v23.16b 1144 aesmc v3.16b, v3.16b //AES block 3 - round 5 1145 1146 aese v0.16b, v24.16b 1147 aesmc v0.16b, v0.16b //AES block 0 - round 6 1148 1149 aese v1.16b, v24.16b 1150 aesmc v1.16b, v1.16b //AES block 1 - round 6 1151 1152 aese v3.16b, v24.16b 1153 aesmc v3.16b, v3.16b //AES block 3 - round 6 1154 1155 aese v2.16b, v24.16b 1156 aesmc v2.16b, v2.16b //AES block 2 - round 6 1157 trn1 v8.2d, v12.2d, v13.2d //h2h | h1h 1158 1159 ldr q15, [x3, #112] //load h4l | h4h 1160#ifndef __AARCH64EB__ 1161 ext v15.16b, v15.16b, v15.16b, #8 1162#endif 1163 trn2 v16.2d, v12.2d, v13.2d //h2l | h1l 1164 add x5, x5, x0 1165 1166 aese v1.16b, v25.16b 1167 aesmc v1.16b, v1.16b //AES block 1 - round 7 1168 1169 aese v2.16b, v25.16b 1170 aesmc v2.16b, v2.16b //AES block 2 - round 7 1171 1172 aese v0.16b, v25.16b 1173 aesmc v0.16b, v0.16b //AES block 0 - round 7 1174 eor v16.16b, v16.16b, v8.16b //h2k | h1k 1175 1176 aese v3.16b, v25.16b 1177 aesmc v3.16b, v3.16b //AES block 3 - round 7 1178 1179 aese v1.16b, v26.16b 1180 aesmc v1.16b, v1.16b //AES block 1 - round 8 1181 trn2 v17.2d, v14.2d, v15.2d //h4l | h3l 1182 1183 aese v2.16b, v26.16b 1184 aesmc v2.16b, v2.16b //AES block 2 
- round 8 1185 1186 aese v3.16b, v26.16b 1187 aesmc v3.16b, v3.16b //AES block 3 - round 8 1188 1189 aese v0.16b, v26.16b 1190 aesmc v0.16b, v0.16b //AES block 0 - round 8 1191 trn1 v9.2d, v14.2d, v15.2d //h4h | h3h 1192 1193 aese v2.16b, v27.16b //AES block 2 - round 9 1194 1195 aese v3.16b, v27.16b //AES block 3 - round 9 1196 1197 aese v0.16b, v27.16b //AES block 0 - round 9 1198 cmp x0, x5 //check if we have <= 4 blocks 1199 1200 aese v1.16b, v27.16b //AES block 1 - round 9 1201 eor v17.16b, v17.16b, v9.16b //h4k | h3k 1202 b.ge .L128_dec_tail //handle tail 1203 1204 ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0 - load ciphertext; AES block 1 - load ciphertext 1205 1206 eor v1.16b, v5.16b, v1.16b //AES block 1 - result 1207 ld1 {v6.16b}, [x0], #16 //AES block 2 - load ciphertext 1208 1209 eor v0.16b, v4.16b, v0.16b //AES block 0 - result 1210 rev64 v4.16b, v4.16b //GHASH block 0 1211 rev w9, w12 //CTR block 4 1212 1213 orr x9, x11, x9, lsl #32 //CTR block 4 1214 add w12, w12, #1 //CTR block 4 1215 ld1 {v7.16b}, [x0], #16 //AES block 3 - load ciphertext 1216 1217 rev64 v5.16b, v5.16b //GHASH block 1 1218 mov x19, v1.d[0] //AES block 1 - mov low 1219 1220 mov x20, v1.d[1] //AES block 1 - mov high 1221 1222 mov x6, v0.d[0] //AES block 0 - mov low 1223 cmp x0, x5 //check if we have <= 8 blocks 1224 1225 mov x7, v0.d[1] //AES block 0 - mov high 1226 1227 fmov d0, x10 //CTR block 4 1228 1229 fmov v0.d[1], x9 //CTR block 4 1230 rev w9, w12 //CTR block 5 1231 eor x19, x19, x13 //AES block 1 - round 10 low 1232#ifdef __AARCH64EB__ 1233 rev x19, x19 1234#endif 1235 fmov d1, x10 //CTR block 5 1236 add w12, w12, #1 //CTR block 5 1237 orr x9, x11, x9, lsl #32 //CTR block 5 1238 1239 fmov v1.d[1], x9 //CTR block 5 1240 rev w9, w12 //CTR block 6 1241 add w12, w12, #1 //CTR block 6 1242 1243 orr x9, x11, x9, lsl #32 //CTR block 6 1244 1245 eor x20, x20, x14 //AES block 1 - round 10 high 1246#ifdef __AARCH64EB__ 1247 rev x20, x20 1248#endif 1249 eor x6, x6, x13 //AES block 0 - 
round 10 low 1250#ifdef __AARCH64EB__ 1251 rev x6, x6 1252#endif 1253 eor v2.16b, v6.16b, v2.16b //AES block 2 - result 1254 1255 eor x7, x7, x14 //AES block 0 - round 10 high 1256#ifdef __AARCH64EB__ 1257 rev x7, x7 1258#endif 1259 stp x6, x7, [x2], #16 //AES block 0 - store result 1260 1261 stp x19, x20, [x2], #16 //AES block 1 - store result 1262 b.ge .L128_dec_prepretail //do prepretail 1263 1264.L128_dec_main_loop: //main loop start 1265 eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result 1266 ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 1267 mov x21, v2.d[0] //AES block 4k+2 - mov low 1268 1269 pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 1270 mov x22, v2.d[1] //AES block 4k+2 - mov high 1271 1272 aese v1.16b, v18.16b 1273 aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 1274 fmov d2, x10 //CTR block 4k+6 1275 1276 rev64 v6.16b, v6.16b //GHASH block 4k+2 1277 fmov v2.d[1], x9 //CTR block 4k+6 1278 rev w9, w12 //CTR block 4k+7 1279 1280 mov x23, v3.d[0] //AES block 4k+3 - mov low 1281 eor v4.16b, v4.16b, v11.16b //PRE 1 1282 mov d30, v5.d[1] //GHASH block 4k+1 - mid 1283 1284 aese v1.16b, v19.16b 1285 aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 1286 rev64 v7.16b, v7.16b //GHASH block 4k+3 1287 1288 pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 1289 mov x24, v3.d[1] //AES block 4k+3 - mov high 1290 orr x9, x11, x9, lsl #32 //CTR block 4k+7 1291 1292 pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 1293 fmov d3, x10 //CTR block 4k+7 1294 eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid 1295 1296 aese v1.16b, v20.16b 1297 aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 1298 fmov v3.d[1], x9 //CTR block 4k+7 1299 1300 aese v2.16b, v18.16b 1301 aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 1302 mov d10, v17.d[1] //GHASH block 4k - mid 1303 1304 pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 1305 eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low 1306 1307 pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 1308 1309 aese v1.16b, 
v21.16b 1310 aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 1311 mov d8, v4.d[1] //GHASH block 4k - mid 1312 1313 aese v3.16b, v18.16b 1314 aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 1315 eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high 1316 1317 aese v0.16b, v18.16b 1318 aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 1319 1320 pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 1321 eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 1322 1323 aese v3.16b, v19.16b 1324 aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 1325 eor x23, x23, x13 //AES block 4k+3 - round 10 low 1326#ifdef __AARCH64EB__ 1327 rev x23, x23 1328#endif 1329 pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid 1330 eor x22, x22, x14 //AES block 4k+2 - round 10 high 1331#ifdef __AARCH64EB__ 1332 rev x22, x22 1333#endif 1334 mov d31, v6.d[1] //GHASH block 4k+2 - mid 1335 1336 aese v0.16b, v19.16b 1337 aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 1338 eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low 1339 1340 pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 1341 1342 aese v3.16b, v20.16b 1343 aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 1344 eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid 1345 1346 aese v0.16b, v20.16b 1347 aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 1348 1349 aese v1.16b, v22.16b 1350 aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 1351 eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid 1352 1353 pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 1354 1355 aese v0.16b, v21.16b 1356 aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 1357 ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid 1358 1359 pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 1360 1361 aese v2.16b, v19.16b 1362 aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 1363 mov d30, v7.d[1] //GHASH block 4k+3 - mid 1364 1365 aese v0.16b, v22.16b 1366 aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 1367 eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high 1368 1369 pmull2 v31.1q, v31.2d, 
v16.2d //GHASH block 4k+2 - mid 1370 eor x24, x24, x14 //AES block 4k+3 - round 10 high 1371#ifdef __AARCH64EB__ 1372 rev x24, x24 1373#endif 1374 aese v2.16b, v20.16b 1375 aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 1376 eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid 1377 1378 aese v1.16b, v23.16b 1379 aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 1380 eor x21, x21, x13 //AES block 4k+2 - round 10 low 1381#ifdef __AARCH64EB__ 1382 rev x21, x21 1383#endif 1384 aese v0.16b, v23.16b 1385 aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 1386 movi v8.8b, #0xc2 1387 1388 aese v2.16b, v21.16b 1389 aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 1390 eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low 1391 1392 aese v1.16b, v24.16b 1393 aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 1394 1395 aese v0.16b, v24.16b 1396 aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 1397 eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid 1398 1399 aese v2.16b, v22.16b 1400 aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 1401 stp x21, x22, [x2], #16 //AES block 4k+2 - store result 1402 1403 pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid 1404 eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high 1405 ld1 {v4.16b}, [x0], #16 //AES block 4k+3 - load ciphertext 1406 1407 aese v1.16b, v25.16b 1408 aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 1409 add w12, w12, #1 //CTR block 4k+7 1410 1411 aese v0.16b, v25.16b 1412 aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 1413 shl d8, d8, #56 //mod_constant 1414 1415 aese v2.16b, v23.16b 1416 aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 1417 eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid 1418 1419 aese v1.16b, v26.16b 1420 aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 1421 stp x23, x24, [x2], #16 //AES block 4k+3 - store result 1422 1423 aese v0.16b, v26.16b 1424 aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 1425 eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 1426 1427 aese v3.16b, v21.16b 1428 aesmc 
v3.16b, v3.16b //AES block 4k+7 - round 3 1429 rev w9, w12 //CTR block 4k+8 1430 1431 pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 1432 ld1 {v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext 1433 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 1434 1435 aese v0.16b, v27.16b //AES block 4k+4 - round 9 1436 orr x9, x11, x9, lsl #32 //CTR block 4k+8 1437 1438 aese v3.16b, v22.16b 1439 aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 1440 eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 1441 1442 aese v1.16b, v27.16b //AES block 4k+5 - round 9 1443 1444 aese v2.16b, v24.16b 1445 aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 1446 eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result 1447 1448 aese v3.16b, v23.16b 1449 aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 1450 ld1 {v6.16b}, [x0], #16 //AES block 4k+5 - load ciphertext 1451 1452 add w12, w12, #1 //CTR block 4k+8 1453 eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 1454 eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result 1455 1456 aese v2.16b, v25.16b 1457 aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 1458 ld1 {v7.16b}, [x0], #16 //AES block 4k+6 - load ciphertext 1459 1460 aese v3.16b, v24.16b 1461 aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 1462 1463 rev64 v5.16b, v5.16b //GHASH block 4k+5 1464 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 1465 mov x7, v0.d[1] //AES block 4k+4 - mov high 1466 1467 aese v2.16b, v26.16b 1468 aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 1469 mov x6, v0.d[0] //AES block 4k+4 - mov low 1470 1471 aese v3.16b, v25.16b 1472 aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 1473 fmov d0, x10 //CTR block 4k+8 1474 1475 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 1476 fmov v0.d[1], x9 //CTR block 4k+8 1477 rev w9, w12 //CTR block 4k+9 1478 1479 aese v2.16b, v27.16b //AES block 4k+6 - round 9 1480 orr x9, x11, x9, lsl #32 //CTR block 4k+9 1481 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid 
alignment 1482 1483 aese v3.16b, v26.16b 1484 aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 1485 eor x7, x7, x14 //AES block 4k+4 - round 10 high 1486#ifdef __AARCH64EB__ 1487 rev x7, x7 1488#endif 1489 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 1490 mov x20, v1.d[1] //AES block 4k+5 - mov high 1491 eor x6, x6, x13 //AES block 4k+4 - round 10 low 1492#ifdef __AARCH64EB__ 1493 rev x6, x6 1494#endif 1495 eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result 1496 mov x19, v1.d[0] //AES block 4k+5 - mov low 1497 add w12, w12, #1 //CTR block 4k+9 1498 1499 aese v3.16b, v27.16b //AES block 4k+7 - round 9 1500 fmov d1, x10 //CTR block 4k+9 1501 cmp x0, x5 //.LOOP CONTROL 1502 1503 rev64 v4.16b, v4.16b //GHASH block 4k+4 1504 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 1505 fmov v1.d[1], x9 //CTR block 4k+9 1506 1507 rev w9, w12 //CTR block 4k+10 1508 add w12, w12, #1 //CTR block 4k+10 1509 1510 eor x20, x20, x14 //AES block 4k+5 - round 10 high 1511#ifdef __AARCH64EB__ 1512 rev x20, x20 1513#endif 1514 stp x6, x7, [x2], #16 //AES block 4k+4 - store result 1515 1516 eor x19, x19, x13 //AES block 4k+5 - round 10 low 1517#ifdef __AARCH64EB__ 1518 rev x19, x19 1519#endif 1520 stp x19, x20, [x2], #16 //AES block 4k+5 - store result 1521 1522 orr x9, x11, x9, lsl #32 //CTR block 4k+10 1523 b.lt .L128_dec_main_loop 1524 1525.L128_dec_prepretail: //PREPRETAIL 1526 ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 1527 mov x21, v2.d[0] //AES block 4k+2 - mov low 1528 mov d30, v5.d[1] //GHASH block 4k+1 - mid 1529 1530 aese v0.16b, v18.16b 1531 aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 1532 eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result 1533 1534 aese v1.16b, v18.16b 1535 aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 1536 mov x22, v2.d[1] //AES block 4k+2 - mov high 1537 1538 eor v4.16b, v4.16b, v11.16b //PRE 1 1539 fmov d2, x10 //CTR block 4k+6 1540 rev64 v6.16b, v6.16b //GHASH block 4k+2 1541 1542 aese v0.16b, v19.16b 1543 aesmc v0.16b, v0.16b //AES 
block 4k+4 - round 1 1544 fmov v2.d[1], x9 //CTR block 4k+6 1545 1546 rev w9, w12 //CTR block 4k+7 1547 mov x23, v3.d[0] //AES block 4k+3 - mov low 1548 eor v30.8b, v30.8b, v5.8b //GHASH block 4k+1 - mid 1549 1550 pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 1551 mov d10, v17.d[1] //GHASH block 4k - mid 1552 mov x24, v3.d[1] //AES block 4k+3 - mov high 1553 1554 aese v1.16b, v19.16b 1555 aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 1556 mov d31, v6.d[1] //GHASH block 4k+2 - mid 1557 1558 aese v0.16b, v20.16b 1559 aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 1560 orr x9, x11, x9, lsl #32 //CTR block 4k+7 1561 1562 pmull v29.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 1563 mov d8, v4.d[1] //GHASH block 4k - mid 1564 fmov d3, x10 //CTR block 4k+7 1565 1566 aese v2.16b, v18.16b 1567 aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 1568 fmov v3.d[1], x9 //CTR block 4k+7 1569 1570 pmull v30.1q, v30.1d, v17.1d //GHASH block 4k+1 - mid 1571 eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid 1572 1573 rev64 v7.16b, v7.16b //GHASH block 4k+3 1574 1575 aese v2.16b, v19.16b 1576 aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 1577 eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 1578 1579 pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 1580 1581 aese v3.16b, v18.16b 1582 aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 1583 ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid 1584 1585 pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 1586 1587 pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 1588 eor v11.16b, v11.16b, v29.16b //GHASH block 4k+1 - low 1589 1590 pmull v29.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 1591 1592 pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid 1593 eor v9.16b, v9.16b, v28.16b //GHASH block 4k+1 - high 1594 1595 eor v10.16b, v10.16b, v30.16b //GHASH block 4k+1 - mid 1596 1597 pmull2 v4.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 1598 1599 pmull2 v8.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 1600 mov d30, v7.d[1] //GHASH block 4k+3 
- mid 1601 1602 aese v1.16b, v20.16b 1603 aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 1604 eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid 1605 1606 pmull v28.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 1607 1608 eor v9.16b, v9.16b, v8.16b //GHASH block 4k+2 - high 1609 movi v8.8b, #0xc2 1610 1611 aese v3.16b, v19.16b 1612 aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 1613 eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid 1614 1615 eor v11.16b, v11.16b, v28.16b //GHASH block 4k+2 - low 1616 1617 aese v2.16b, v20.16b 1618 aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 1619 eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high 1620 1621 aese v3.16b, v20.16b 1622 aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 1623 eor x23, x23, x13 //AES block 4k+3 - round 10 low 1624#ifdef __AARCH64EB__ 1625 rev x23, x23 1626#endif 1627 pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid 1628 eor x21, x21, x13 //AES block 4k+2 - round 10 low 1629#ifdef __AARCH64EB__ 1630 rev x21, x21 1631#endif 1632 eor v11.16b, v11.16b, v29.16b //GHASH block 4k+3 - low 1633 1634 aese v2.16b, v21.16b 1635 aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 1636 1637 aese v1.16b, v21.16b 1638 aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 1639 shl d8, d8, #56 //mod_constant 1640 1641 aese v0.16b, v21.16b 1642 aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 1643 1644 aese v2.16b, v22.16b 1645 aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 1646 eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid 1647 1648 aese v1.16b, v22.16b 1649 aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 1650 1651 aese v3.16b, v21.16b 1652 aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 1653 eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 1654 1655 aese v2.16b, v23.16b 1656 aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 1657 1658 aese v1.16b, v23.16b 1659 aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 1660 1661 aese v3.16b, v22.16b 1662 aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 1663 1664 aese 
v0.16b, v22.16b 1665 aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 1666 eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 1667 1668 pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 1669 1670 aese v1.16b, v24.16b 1671 aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 1672 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 1673 1674 aese v3.16b, v23.16b 1675 aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 1676 1677 aese v0.16b, v23.16b 1678 aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 1679 eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 1680 1681 aese v1.16b, v25.16b 1682 aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 1683 1684 aese v2.16b, v24.16b 1685 aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 1686 1687 aese v0.16b, v24.16b 1688 aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 1689 1690 aese v1.16b, v26.16b 1691 aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 1692 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 1693 1694 aese v3.16b, v24.16b 1695 aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 1696 1697 aese v0.16b, v25.16b 1698 aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 1699 1700 aese v1.16b, v27.16b //AES block 4k+5 - round 9 1701 1702 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 1703 eor x24, x24, x14 //AES block 4k+3 - round 10 high 1704#ifdef __AARCH64EB__ 1705 rev x24, x24 1706#endif 1707 aese v2.16b, v25.16b 1708 aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 1709 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 1710 1711 aese v3.16b, v25.16b 1712 aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 1713 1714 aese v0.16b, v26.16b 1715 aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 1716 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 1717 1718 aese v2.16b, v26.16b 1719 aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 1720 1721 aese v3.16b, v26.16b 1722 aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 1723 eor x22, x22, x14 //AES block 4k+2 - round 10 high 1724#ifdef 
__AARCH64EB__ 1725 rev x22, x22 1726#endif 1727 aese v0.16b, v27.16b //AES block 4k+4 - round 9 1728 stp x21, x22, [x2], #16 //AES block 4k+2 - store result 1729 1730 aese v2.16b, v27.16b //AES block 4k+6 - round 9 1731 add w12, w12, #1 //CTR block 4k+7 1732 stp x23, x24, [x2], #16 //AES block 4k+3 - store result 1733 1734 aese v3.16b, v27.16b //AES block 4k+7 - round 9 1735 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 1736.L128_dec_tail: //TAIL 1737 1738 sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process 1739 ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext 1740 1741 eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result 1742 1743 mov x7, v0.d[1] //AES block 4k+4 - mov high 1744 1745 mov x6, v0.d[0] //AES block 4k+4 - mov low 1746 1747 cmp x5, #48 1748 1749 eor x7, x7, x14 //AES block 4k+4 - round 10 high 1750#ifdef __AARCH64EB__ 1751 rev x7, x7 1752#endif 1753 ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag 1754 eor x6, x6, x13 //AES block 4k+4 - round 10 low 1755#ifdef __AARCH64EB__ 1756 rev x6, x6 1757#endif 1758 b.gt .L128_dec_blocks_more_than_3 1759 1760 mov v3.16b, v2.16b 1761 sub w12, w12, #1 1762 movi v11.8b, #0 1763 1764 movi v9.8b, #0 1765 mov v2.16b, v1.16b 1766 1767 movi v10.8b, #0 1768 cmp x5, #32 1769 b.gt .L128_dec_blocks_more_than_2 1770 1771 cmp x5, #16 1772 1773 mov v3.16b, v1.16b 1774 sub w12, w12, #1 1775 b.gt .L128_dec_blocks_more_than_1 1776 1777 sub w12, w12, #1 1778 b .L128_dec_blocks_less_than_1 1779.L128_dec_blocks_more_than_3: //blocks left > 3 1780 rev64 v4.16b, v5.16b //GHASH final-3 block 1781 ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext 1782 1783 eor v4.16b, v4.16b, v8.16b //feed in partial tag 1784 1785 mov d10, v17.d[1] //GHASH final-3 block - mid 1786 stp x6, x7, [x2], #16 //AES final-3 block - store result 1787 eor v0.16b, v5.16b, v1.16b //AES final-2 block - result 1788 1789 mov d22, v4.d[1] //GHASH final-3 block - mid 1790 mov x7, v0.d[1] //AES final-2 block 
- mov high 1791 1792 pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low 1793 mov x6, v0.d[0] //AES final-2 block - mov low 1794 1795 pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high 1796 1797 eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid 1798 1799 movi v8.8b, #0 //suppress further partial tag feed in 1800 eor x7, x7, x14 //AES final-2 block - round 10 high 1801#ifdef __AARCH64EB__ 1802 rev x7, x7 1803#endif 1804 pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid 1805 eor x6, x6, x13 //AES final-2 block - round 10 low 1806#ifdef __AARCH64EB__ 1807 rev x6, x6 1808#endif 1809.L128_dec_blocks_more_than_2: //blocks left > 2 1810 1811 rev64 v4.16b, v5.16b //GHASH final-2 block 1812 ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext 1813 1814 eor v4.16b, v4.16b, v8.16b //feed in partial tag 1815 1816 eor v0.16b, v5.16b, v2.16b //AES final-1 block - result 1817 stp x6, x7, [x2], #16 //AES final-2 block - store result 1818 1819 mov d22, v4.d[1] //GHASH final-2 block - mid 1820 1821 pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low 1822 1823 pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high 1824 mov x6, v0.d[0] //AES final-1 block - mov low 1825 1826 mov x7, v0.d[1] //AES final-1 block - mov high 1827 eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid 1828 1829 movi v8.8b, #0 //suppress further partial tag feed in 1830 1831 pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid 1832 1833 eor x6, x6, x13 //AES final-1 block - round 10 low 1834#ifdef __AARCH64EB__ 1835 rev x6, x6 1836#endif 1837 eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low 1838 1839 eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high 1840 1841 eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid 1842 eor x7, x7, x14 //AES final-1 block - round 10 high 1843#ifdef __AARCH64EB__ 1844 rev x7, x7 1845#endif 1846.L128_dec_blocks_more_than_1: //blocks left > 1 1847 1848 rev64 v4.16b, v5.16b //GHASH final-1 block 1849 1850 ld1 { v5.16b}, 
[x0], #16 //AES final block - load ciphertext 1851 eor v4.16b, v4.16b, v8.16b //feed in partial tag 1852 1853 mov d22, v4.d[1] //GHASH final-1 block - mid 1854 1855 eor v0.16b, v5.16b, v3.16b //AES final block - result 1856 1857 eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid 1858 1859 stp x6, x7, [x2], #16 //AES final-1 block - store result 1860 mov x6, v0.d[0] //AES final block - mov low 1861 1862 mov x7, v0.d[1] //AES final block - mov high 1863 ins v22.d[1], v22.d[0] //GHASH final-1 block - mid 1864 1865 pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low 1866 1867 pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high 1868 1869 pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid 1870 movi v8.8b, #0 //suppress further partial tag feed in 1871 1872 eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low 1873 1874 eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high 1875 eor x7, x7, x14 //AES final block - round 10 high 1876#ifdef __AARCH64EB__ 1877 rev x7, x7 1878#endif 1879 eor x6, x6, x13 //AES final block - round 10 low 1880#ifdef __AARCH64EB__ 1881 rev x6, x6 1882#endif 1883 eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid 1884.L128_dec_blocks_less_than_1: //blocks left <= 1 1885 1886 mvn x14, xzr //rk10_h = 0xffffffffffffffff 1887 and x1, x1, #127 //bit_length %= 128 1888 1889 mvn x13, xzr //rk10_l = 0xffffffffffffffff 1890 sub x1, x1, #128 //bit_length -= 128 1891 1892 neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) 1893 1894 and x1, x1, #127 //bit_length %= 128 1895 1896 lsr x14, x14, x1 //rk10_h is mask for top 64b of last block 1897 cmp x1, #64 1898 1899 csel x10, x14, xzr, lt 1900 csel x9, x13, x14, lt 1901 1902 fmov d0, x9 //ctr0b is mask for last block 1903 1904 mov v0.d[1], x10 1905 1906 and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits 1907 1908 rev64 v4.16b, v5.16b //GHASH final block 1909 1910 eor v4.16b, v4.16b, v8.16b //feed in partial tag 1911 1912 ldp x4, 
x5, [x2] //load existing bytes we need to not overwrite 1913 1914 and x7, x7, x10 1915 1916 pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high 1917 mov d8, v4.d[1] //GHASH final block - mid 1918 1919 eor v8.8b, v8.8b, v4.8b //GHASH final block - mid 1920 eor v9.16b, v9.16b, v20.16b //GHASH final block - high 1921 1922 pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid 1923 1924 pmull v21.1q, v4.1d, v12.1d //GHASH final block - low 1925 bic x4, x4, x9 //mask out low existing bytes 1926 and x6, x6, x9 1927 1928#ifndef __AARCH64EB__ 1929 rev w9, w12 1930#else 1931 mov w9, w12 1932#endif 1933 1934 eor v10.16b, v10.16b, v8.16b //GHASH final block - mid 1935 movi v8.8b, #0xc2 1936 1937 eor v11.16b, v11.16b, v21.16b //GHASH final block - low 1938 1939 bic x5, x5, x10 //mask out high existing bytes 1940 shl d8, d8, #56 //mod_constant 1941 1942 eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 1943 1944 pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 1945 1946 eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 1947 1948 orr x6, x6, x4 1949 str w9, [x16, #12] //store the updated counter 1950 1951 orr x7, x7, x5 1952 stp x6, x7, [x2] 1953 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 1954 1955 eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 1956 1957 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 1958 1959 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 1960 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 1961 1962 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 1963 1964 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 1965 ext v11.16b, v11.16b, v11.16b, #8 1966 rev64 v11.16b, v11.16b 1967 mov x0, x15 1968 st1 { v11.16b }, [x3] 1969 1970 ldp x21, x22, [sp, #16] 1971 ldp x23, x24, [sp, #32] 1972 ldp d8, d9, [sp, #48] 1973 ldp d10, d11, [sp, #64] 1974 ldp d12, d13, [sp, #80] 1975 ldp d14, d15, [sp, #96] 1976 ldp x19, x20, [sp], #112 1977 ret 1978 1979.L128_dec_ret: 1980 
mov w0, #0x0 1981 ret 1982.size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel 1983.globl aes_gcm_enc_192_kernel 1984.type aes_gcm_enc_192_kernel,%function 1985.align 4 1986aes_gcm_enc_192_kernel: 1987 AARCH64_VALID_CALL_TARGET 1988 cbz x1, .L192_enc_ret 1989 stp x19, x20, [sp, #-112]! 1990 mov x16, x4 1991 mov x8, x5 1992 stp x21, x22, [sp, #16] 1993 stp x23, x24, [sp, #32] 1994 stp d8, d9, [sp, #48] 1995 stp d10, d11, [sp, #64] 1996 stp d12, d13, [sp, #80] 1997 stp d14, d15, [sp, #96] 1998 1999 ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 2000#ifdef __AARCH64EB__ 2001 rev x10, x10 2002 rev x11, x11 2003#endif 2004 ldp x13, x14, [x8, #192] //load rk12 2005#ifdef __AARCH64EB__ 2006 ror x13, x13, #32 2007 ror x14, x14, #32 2008#endif 2009 ld1 {v18.4s}, [x8], #16 //load rk0 2010 2011 ld1 {v19.4s}, [x8], #16 //load rk1 2012 2013 ld1 {v20.4s}, [x8], #16 //load rk2 2014 2015 lsr x12, x11, #32 2016 ld1 {v21.4s}, [x8], #16 //load rk3 2017 orr w11, w11, w11 2018 2019 ld1 {v22.4s}, [x8], #16 //load rk4 2020 rev w12, w12 //rev_ctr32 2021 2022 add w12, w12, #1 //increment rev_ctr32 2023 fmov d3, x10 //CTR block 3 2024 2025 rev w9, w12 //CTR block 1 2026 add w12, w12, #1 //CTR block 1 2027 fmov d1, x10 //CTR block 1 2028 2029 orr x9, x11, x9, lsl #32 //CTR block 1 2030 ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible 2031 2032 fmov v1.d[1], x9 //CTR block 1 2033 rev w9, w12 //CTR block 2 2034 add w12, w12, #1 //CTR block 2 2035 2036 fmov d2, x10 //CTR block 2 2037 orr x9, x11, x9, lsl #32 //CTR block 2 2038 2039 fmov v2.d[1], x9 //CTR block 2 2040 rev w9, w12 //CTR block 3 2041 2042 orr x9, x11, x9, lsl #32 //CTR block 3 2043 ld1 {v23.4s}, [x8], #16 //load rk5 2044 2045 fmov v3.d[1], x9 //CTR block 3 2046 2047 ld1 {v24.4s}, [x8], #16 //load rk6 2048 2049 ld1 {v25.4s}, [x8], #16 //load rk7 2050 2051 aese v0.16b, v18.16b 2052 aesmc v0.16b, v0.16b //AES block 0 - round 0 2053 ld1 { v11.16b}, [x3] 2054 ext 
v11.16b, v11.16b, v11.16b, #8 2055 rev64 v11.16b, v11.16b 2056 2057 aese v3.16b, v18.16b 2058 aesmc v3.16b, v3.16b //AES block 3 - round 0 2059 ld1 {v26.4s}, [x8], #16 //load rk8 2060 2061 aese v1.16b, v18.16b 2062 aesmc v1.16b, v1.16b //AES block 1 - round 0 2063 ldr q15, [x3, #112] //load h4l | h4h 2064#ifndef __AARCH64EB__ 2065 ext v15.16b, v15.16b, v15.16b, #8 2066#endif 2067 aese v2.16b, v18.16b 2068 aesmc v2.16b, v2.16b //AES block 2 - round 0 2069 ld1 {v27.4s}, [x8], #16 //load rk9 2070 2071 aese v0.16b, v19.16b 2072 aesmc v0.16b, v0.16b //AES block 0 - round 1 2073 ld1 {v28.4s}, [x8], #16 //load rk10 2074 2075 aese v1.16b, v19.16b 2076 aesmc v1.16b, v1.16b //AES block 1 - round 1 2077 ldr q12, [x3, #32] //load h1l | h1h 2078#ifndef __AARCH64EB__ 2079 ext v12.16b, v12.16b, v12.16b, #8 2080#endif 2081 aese v2.16b, v19.16b 2082 aesmc v2.16b, v2.16b //AES block 2 - round 1 2083 ld1 {v29.4s}, [x8], #16 //load rk11 2084 2085 aese v3.16b, v19.16b 2086 aesmc v3.16b, v3.16b //AES block 3 - round 1 2087 ldr q14, [x3, #80] //load h3l | h3h 2088#ifndef __AARCH64EB__ 2089 ext v14.16b, v14.16b, v14.16b, #8 2090#endif 2091 aese v0.16b, v20.16b 2092 aesmc v0.16b, v0.16b //AES block 0 - round 2 2093 2094 aese v2.16b, v20.16b 2095 aesmc v2.16b, v2.16b //AES block 2 - round 2 2096 2097 aese v3.16b, v20.16b 2098 aesmc v3.16b, v3.16b //AES block 3 - round 2 2099 2100 aese v0.16b, v21.16b 2101 aesmc v0.16b, v0.16b //AES block 0 - round 3 2102 trn1 v9.2d, v14.2d, v15.2d //h4h | h3h 2103 2104 aese v2.16b, v21.16b 2105 aesmc v2.16b, v2.16b //AES block 2 - round 3 2106 2107 aese v1.16b, v20.16b 2108 aesmc v1.16b, v1.16b //AES block 1 - round 2 2109 trn2 v17.2d, v14.2d, v15.2d //h4l | h3l 2110 2111 aese v0.16b, v22.16b 2112 aesmc v0.16b, v0.16b //AES block 0 - round 4 2113 2114 aese v3.16b, v21.16b 2115 aesmc v3.16b, v3.16b //AES block 3 - round 3 2116 2117 aese v1.16b, v21.16b 2118 aesmc v1.16b, v1.16b //AES block 1 - round 3 2119 2120 aese v0.16b, v23.16b 2121 aesmc v0.16b, v0.16b 
//AES block 0 - round 5 2122 2123 aese v2.16b, v22.16b 2124 aesmc v2.16b, v2.16b //AES block 2 - round 4 2125 2126 aese v1.16b, v22.16b 2127 aesmc v1.16b, v1.16b //AES block 1 - round 4 2128 2129 aese v0.16b, v24.16b 2130 aesmc v0.16b, v0.16b //AES block 0 - round 6 2131 2132 aese v3.16b, v22.16b 2133 aesmc v3.16b, v3.16b //AES block 3 - round 4 2134 2135 aese v2.16b, v23.16b 2136 aesmc v2.16b, v2.16b //AES block 2 - round 5 2137 2138 aese v1.16b, v23.16b 2139 aesmc v1.16b, v1.16b //AES block 1 - round 5 2140 2141 aese v3.16b, v23.16b 2142 aesmc v3.16b, v3.16b //AES block 3 - round 5 2143 2144 aese v2.16b, v24.16b 2145 aesmc v2.16b, v2.16b //AES block 2 - round 6 2146 ldr q13, [x3, #64] //load h2l | h2h 2147#ifndef __AARCH64EB__ 2148 ext v13.16b, v13.16b, v13.16b, #8 2149#endif 2150 aese v1.16b, v24.16b 2151 aesmc v1.16b, v1.16b //AES block 1 - round 6 2152 2153 aese v3.16b, v24.16b 2154 aesmc v3.16b, v3.16b //AES block 3 - round 6 2155 2156 aese v0.16b, v25.16b 2157 aesmc v0.16b, v0.16b //AES block 0 - round 7 2158 2159 aese v1.16b, v25.16b 2160 aesmc v1.16b, v1.16b //AES block 1 - round 7 2161 trn2 v16.2d, v12.2d, v13.2d //h2l | h1l 2162 2163 aese v3.16b, v25.16b 2164 aesmc v3.16b, v3.16b //AES block 3 - round 7 2165 2166 aese v0.16b, v26.16b 2167 aesmc v0.16b, v0.16b //AES block 0 - round 8 2168 2169 aese v2.16b, v25.16b 2170 aesmc v2.16b, v2.16b //AES block 2 - round 7 2171 trn1 v8.2d, v12.2d, v13.2d //h2h | h1h 2172 2173 aese v1.16b, v26.16b 2174 aesmc v1.16b, v1.16b //AES block 1 - round 8 2175 2176 aese v3.16b, v26.16b 2177 aesmc v3.16b, v3.16b //AES block 3 - round 8 2178 2179 aese v2.16b, v26.16b 2180 aesmc v2.16b, v2.16b //AES block 2 - round 8 2181 2182 aese v0.16b, v27.16b 2183 aesmc v0.16b, v0.16b //AES block 0 - round 9 2184 2185 aese v3.16b, v27.16b 2186 aesmc v3.16b, v3.16b //AES block 3 - round 9 2187 2188 aese v2.16b, v27.16b 2189 aesmc v2.16b, v2.16b //AES block 2 - round 9 2190 2191 aese v1.16b, v27.16b 2192 aesmc v1.16b, v1.16b //AES block 1 - 
round 9 2193 2194 aese v0.16b, v28.16b 2195 aesmc v0.16b, v0.16b //AES block 0 - round 10 2196 2197 aese v2.16b, v28.16b 2198 aesmc v2.16b, v2.16b //AES block 2 - round 10 2199 2200 aese v1.16b, v28.16b 2201 aesmc v1.16b, v1.16b //AES block 1 - round 10 2202 lsr x5, x1, #3 //byte_len 2203 mov x15, x5 2204 2205 aese v3.16b, v28.16b 2206 aesmc v3.16b, v3.16b //AES block 3 - round 10 2207 sub x5, x5, #1 //byte_len - 1 2208 2209 eor v16.16b, v16.16b, v8.16b //h2k | h1k 2210 and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 2211 2212 eor v17.16b, v17.16b, v9.16b //h4k | h3k 2213 2214 aese v2.16b, v29.16b //AES block 2 - round 11 2215 add x4, x0, x1, lsr #3 //end_input_ptr 2216 add x5, x5, x0 2217 2218 aese v1.16b, v29.16b //AES block 1 - round 11 2219 cmp x0, x5 //check if we have <= 4 blocks 2220 2221 aese v0.16b, v29.16b //AES block 0 - round 11 2222 add w12, w12, #1 //CTR block 3 2223 2224 aese v3.16b, v29.16b //AES block 3 - round 11 2225 b.ge .L192_enc_tail //handle tail 2226 2227 rev w9, w12 //CTR block 4 2228 ldp x6, x7, [x0, #0] //AES block 0 - load plaintext 2229#ifdef __AARCH64EB__ 2230 rev x6, x6 2231 rev x7, x7 2232#endif 2233 orr x9, x11, x9, lsl #32 //CTR block 4 2234 ldp x21, x22, [x0, #32] //AES block 2 - load plaintext 2235#ifdef __AARCH64EB__ 2236 rev x21, x21 2237 rev x22, x22 2238#endif 2239 ldp x23, x24, [x0, #48] //AES block 3 - load plaintext 2240#ifdef __AARCH64EB__ 2241 rev x23, x23 2242 rev x24, x24 2243#endif 2244 ldp x19, x20, [x0, #16] //AES block 1 - load plaintext 2245#ifdef __AARCH64EB__ 2246 rev x19, x19 2247 rev x20, x20 2248#endif 2249 add x0, x0, #64 //AES input_ptr update 2250 cmp x0, x5 //check if we have <= 8 blocks 2251 2252 eor x6, x6, x13 //AES block 0 - round 12 low 2253 2254 eor x7, x7, x14 //AES block 0 - round 12 high 2255 eor x22, x22, x14 //AES block 2 - round 12 high 2256 fmov d4, x6 //AES block 0 - mov low 2257 2258 eor x24, x24, x14 //AES block 3 - 
round 12 high 2259 fmov v4.d[1], x7 //AES block 0 - mov high 2260 2261 eor x21, x21, x13 //AES block 2 - round 12 low 2262 eor x19, x19, x13 //AES block 1 - round 12 low 2263 2264 fmov d5, x19 //AES block 1 - mov low 2265 eor x20, x20, x14 //AES block 1 - round 12 high 2266 2267 fmov v5.d[1], x20 //AES block 1 - mov high 2268 2269 eor x23, x23, x13 //AES block 3 - round 12 low 2270 fmov d6, x21 //AES block 2 - mov low 2271 2272 add w12, w12, #1 //CTR block 4 2273 eor v4.16b, v4.16b, v0.16b //AES block 0 - result 2274 fmov d0, x10 //CTR block 4 2275 2276 fmov v0.d[1], x9 //CTR block 4 2277 rev w9, w12 //CTR block 5 2278 2279 orr x9, x11, x9, lsl #32 //CTR block 5 2280 add w12, w12, #1 //CTR block 5 2281 2282 fmov d7, x23 //AES block 3 - mov low 2283 st1 { v4.16b}, [x2], #16 //AES block 0 - store result 2284 2285 fmov v6.d[1], x22 //AES block 2 - mov high 2286 2287 eor v5.16b, v5.16b, v1.16b //AES block 1 - result 2288 fmov d1, x10 //CTR block 5 2289 st1 { v5.16b}, [x2], #16 //AES block 1 - store result 2290 2291 fmov v7.d[1], x24 //AES block 3 - mov high 2292 2293 fmov v1.d[1], x9 //CTR block 5 2294 rev w9, w12 //CTR block 6 2295 2296 orr x9, x11, x9, lsl #32 //CTR block 6 2297 2298 add w12, w12, #1 //CTR block 6 2299 eor v6.16b, v6.16b, v2.16b //AES block 2 - result 2300 fmov d2, x10 //CTR block 6 2301 2302 fmov v2.d[1], x9 //CTR block 6 2303 rev w9, w12 //CTR block 7 2304 2305 orr x9, x11, x9, lsl #32 //CTR block 7 2306 st1 { v6.16b}, [x2], #16 //AES block 2 - store result 2307 2308 eor v7.16b, v7.16b, v3.16b //AES block 3 - result 2309 st1 { v7.16b}, [x2], #16 //AES block 3 - store result 2310 b.ge .L192_enc_prepretail //do prepretail 2311 2312.L192_enc_main_loop: //main loop start 2313 aese v2.16b, v18.16b 2314 aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 2315 rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) 2316 2317 aese v1.16b, v18.16b 2318 aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 2319 ldp x19, x20, [x0, #16] //AES block 4k+5 - load 
plaintext 2320#ifdef __AARCH64EB__ 2321 rev x19, x19 2322 rev x20, x20 2323#endif 2324 ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 2325 fmov d3, x10 //CTR block 4k+3 2326 rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) 2327 2328 aese v2.16b, v19.16b 2329 aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 2330 fmov v3.d[1], x9 //CTR block 4k+3 2331 2332 pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 2333 rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) 2334 ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext 2335#ifdef __AARCH64EB__ 2336 rev x21, x21 2337 rev x22, x22 2338#endif 2339 aese v0.16b, v18.16b 2340 aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 2341 ldp x23, x24, [x0, #48] //AES block 4k+3 - load plaintext 2342#ifdef __AARCH64EB__ 2343 rev x23, x23 2344 rev x24, x24 2345#endif 2346 pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 2347 eor v4.16b, v4.16b, v11.16b //PRE 1 2348 2349 aese v1.16b, v19.16b 2350 aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 2351 2352 aese v0.16b, v19.16b 2353 aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 2354 rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) 2355 2356 aese v3.16b, v18.16b 2357 aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 2358 eor x24, x24, x14 //AES block 4k+3 - round 12 high 2359 2360 pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 2361 mov d8, v4.d[1] //GHASH block 4k - mid 2362 2363 aese v0.16b, v20.16b 2364 aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 2365 2366 aese v3.16b, v19.16b 2367 aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 2368 eor x21, x21, x13 //AES block 4k+6 - round 12 low 2369 2370 eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 2371 eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low 2372 2373 aese v0.16b, v21.16b 2374 aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 2375 eor x19, x19, x13 //AES block 4k+5 - round 12 low 2376 2377 aese v1.16b, v20.16b 2378 aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 2379 mov d31, v6.d[1] 
//GHASH block 4k+2 - mid 2380 2381 pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 2382 mov d4, v5.d[1] //GHASH block 4k+1 - mid 2383 2384 aese v2.16b, v20.16b 2385 aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 2386 2387 aese v1.16b, v21.16b 2388 aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 2389 2390 mov d10, v17.d[1] //GHASH block 4k - mid 2391 eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high 2392 2393 aese v3.16b, v20.16b 2394 aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 2395 eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid 2396 2397 pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 2398 2399 aese v0.16b, v22.16b 2400 aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 2401 eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 2402 2403 aese v3.16b, v21.16b 2404 aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 2405 2406 pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 2407 eor x20, x20, x14 //AES block 4k+5 - round 12 high 2408 ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid 2409 2410 aese v0.16b, v23.16b 2411 aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 2412 add w12, w12, #1 //CTR block 4k+3 2413 2414 aese v3.16b, v22.16b 2415 aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 2416 eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high 2417 2418 pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 2419 eor x22, x22, x14 //AES block 4k+6 - round 12 high 2420 2421 pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid 2422 eor x23, x23, x13 //AES block 4k+3 - round 12 low 2423 mov d30, v7.d[1] //GHASH block 4k+3 - mid 2424 2425 pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 2426 rev w9, w12 //CTR block 4k+8 2427 2428 pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 2429 orr x9, x11, x9, lsl #32 //CTR block 4k+8 2430 2431 aese v2.16b, v21.16b 2432 aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 2433 eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid 2434 2435 aese v1.16b, v22.16b 2436 aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 2437 ldp 
x6, x7, [x0, #0] //AES block 4k+4 - load plaintext 2438#ifdef __AARCH64EB__ 2439 rev x6, x6 2440 rev x7, x7 2441#endif 2442 aese v0.16b, v24.16b 2443 aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 2444 eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low 2445 2446 aese v2.16b, v22.16b 2447 aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 2448 add x0, x0, #64 //AES input_ptr update 2449 2450 aese v1.16b, v23.16b 2451 aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 2452 movi v8.8b, #0xc2 2453 2454 pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 2455 eor x7, x7, x14 //AES block 4k+4 - round 12 high 2456 eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 2457 2458 aese v2.16b, v23.16b 2459 aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 2460 eor x6, x6, x13 //AES block 4k+4 - round 12 low 2461 2462 aese v1.16b, v24.16b 2463 aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 2464 shl d8, d8, #56 //mod_constant 2465 2466 aese v3.16b, v23.16b 2467 aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 2468 eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 2469 2470 aese v0.16b, v25.16b 2471 aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 2472 fmov d5, x19 //AES block 4k+5 - mov low 2473 2474 aese v1.16b, v25.16b 2475 aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 2476 eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid 2477 2478 aese v3.16b, v24.16b 2479 aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 2480 fmov v5.d[1], x20 //AES block 4k+5 - mov high 2481 2482 aese v0.16b, v26.16b 2483 aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 2484 eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low 2485 2486 pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid 2487 cmp x0, x5 //.LOOP CONTROL 2488 fmov d4, x6 //AES block 4k+4 - mov low 2489 2490 aese v2.16b, v24.16b 2491 aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 2492 fmov v4.d[1], x7 //AES block 4k+4 - mov high 2493 2494 aese v1.16b, v26.16b 2495 aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 2496 fmov d7, x23 //AES 
block 4k+3 - mov low 2497 2498 eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid 2499 eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 2500 add w12, w12, #1 //CTR block 4k+8 2501 2502 aese v2.16b, v25.16b 2503 aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 2504 fmov v7.d[1], x24 //AES block 4k+3 - mov high 2505 2506 pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 2507 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 2508 fmov d6, x21 //AES block 4k+6 - mov low 2509 2510 aese v3.16b, v25.16b 2511 aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 2512 2513 aese v0.16b, v27.16b 2514 aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 2515 eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 2516 2517 aese v2.16b, v26.16b 2518 aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 2519 2520 aese v3.16b, v26.16b 2521 aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 2522 2523 aese v1.16b, v27.16b 2524 aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 2525 2526 aese v0.16b, v28.16b 2527 aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 2528 eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 2529 2530 aese v3.16b, v27.16b 2531 aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 2532 2533 aese v2.16b, v27.16b 2534 aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 2535 2536 aese v0.16b, v29.16b //AES block 4k+4 - round 11 2537 2538 aese v1.16b, v28.16b 2539 aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 2540 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 2541 2542 aese v2.16b, v28.16b 2543 aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 2544 2545 eor v4.16b, v4.16b, v0.16b //AES block 4k+4 - result 2546 fmov d0, x10 //CTR block 4k+8 2547 2548 aese v1.16b, v29.16b //AES block 4k+5 - round 11 2549 fmov v0.d[1], x9 //CTR block 4k+8 2550 rev w9, w12 //CTR block 4k+9 2551 2552 pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 2553 fmov v6.d[1], x22 //AES block 4k+6 - mov high 2554 st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store 
result 2555 2556 aese v3.16b, v28.16b 2557 aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 2558 orr x9, x11, x9, lsl #32 //CTR block 4k+9 2559 2560 eor v5.16b, v5.16b, v1.16b //AES block 4k+5 - result 2561 add w12, w12, #1 //CTR block 4k+9 2562 fmov d1, x10 //CTR block 4k+9 2563 2564 aese v2.16b, v29.16b //AES block 4k+6 - round 11 2565 fmov v1.d[1], x9 //CTR block 4k+9 2566 rev w9, w12 //CTR block 4k+10 2567 2568 add w12, w12, #1 //CTR block 4k+10 2569 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 2570 orr x9, x11, x9, lsl #32 //CTR block 4k+10 2571 2572 st1 { v5.16b}, [x2], #16 //AES block 4k+5 - store result 2573 eor v11.16b, v11.16b, v9.16b //MODULO - fold into low 2574 2575 aese v3.16b, v29.16b //AES block 4k+7 - round 11 2576 eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result 2577 fmov d2, x10 //CTR block 4k+10 2578 2579 st1 { v6.16b}, [x2], #16 //AES block 4k+6 - store result 2580 fmov v2.d[1], x9 //CTR block 4k+10 2581 rev w9, w12 //CTR block 4k+11 2582 2583 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 2584 orr x9, x11, x9, lsl #32 //CTR block 4k+11 2585 2586 eor v7.16b, v7.16b, v3.16b //AES block 4k+3 - result 2587 st1 { v7.16b}, [x2], #16 //AES block 4k+3 - store result 2588 b.lt .L192_enc_main_loop 2589 2590.L192_enc_prepretail: //PREPRETAIL 2591 aese v0.16b, v18.16b 2592 aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 2593 rev64 v4.16b, v4.16b //GHASH block 4k (only t0 is free) 2594 2595 fmov d3, x10 //CTR block 4k+3 2596 ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 2597 add w12, w12, #1 //CTR block 4k+3 2598 2599 aese v1.16b, v18.16b 2600 aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 2601 rev64 v5.16b, v5.16b //GHASH block 4k+1 (t0 and t1 free) 2602 2603 aese v2.16b, v18.16b 2604 aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 2605 2606 fmov v3.d[1], x9 //CTR block 4k+3 2607 eor v4.16b, v4.16b, v11.16b //PRE 1 2608 mov d10, v17.d[1] //GHASH block 4k - mid 2609 2610 aese v1.16b, v19.16b 2611 aesmc v1.16b, v1.16b //AES 
block 4k+5 - round 1 2612 rev64 v6.16b, v6.16b //GHASH block 4k+2 (t0, t1, and t2 free) 2613 2614 pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 2615 2616 pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 2617 mov d8, v4.d[1] //GHASH block 4k - mid 2618 2619 pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 2620 rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) 2621 2622 pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 2623 2624 eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 2625 mov d4, v5.d[1] //GHASH block 4k+1 - mid 2626 2627 eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low 2628 mov d31, v6.d[1] //GHASH block 4k+2 - mid 2629 2630 aese v3.16b, v18.16b 2631 aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 2632 eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high 2633 2634 pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 2635 2636 eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 2637 eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid 2638 2639 aese v3.16b, v19.16b 2640 aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 2641 2642 aese v2.16b, v19.16b 2643 aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 2644 eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high 2645 2646 aese v0.16b, v19.16b 2647 aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 2648 2649 aese v1.16b, v20.16b 2650 aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 2651 mov d30, v7.d[1] //GHASH block 4k+3 - mid 2652 2653 pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 2654 ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid 2655 2656 aese v0.16b, v20.16b 2657 aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 2658 2659 pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 2660 eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid 2661 2662 aese v1.16b, v21.16b 2663 aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 2664 2665 pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid 2666 2667 pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 2668 2669 pmull v30.1q, v30.1d, 
v16.1d //GHASH block 4k+3 - mid 2670 eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 2671 2672 pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 2673 2674 aese v0.16b, v21.16b 2675 aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 2676 eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 2677 2678 aese v3.16b, v20.16b 2679 aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 2680 2681 aese v2.16b, v20.16b 2682 aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 2683 eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low 2684 2685 aese v0.16b, v22.16b 2686 aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 2687 2688 aese v3.16b, v21.16b 2689 aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 2690 eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid 2691 2692 aese v2.16b, v21.16b 2693 aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 2694 2695 pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 2696 movi v8.8b, #0xc2 2697 2698 aese v3.16b, v22.16b 2699 aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 2700 2701 aese v2.16b, v22.16b 2702 aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 2703 2704 aese v1.16b, v22.16b 2705 aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 2706 eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid 2707 2708 aese v3.16b, v23.16b 2709 aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 2710 2711 aese v2.16b, v23.16b 2712 aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 2713 2714 aese v1.16b, v23.16b 2715 aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 2716 eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low 2717 2718 aese v0.16b, v23.16b 2719 aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 2720 2721 aese v3.16b, v24.16b 2722 aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 2723 eor v10.16b, v10.16b, v9.16b //karatsuba tidy up 2724 2725 aese v1.16b, v24.16b 2726 aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 2727 2728 aese v0.16b, v24.16b 2729 aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 2730 shl d8, d8, #56 //mod_constant 2731 2732 aese v3.16b, v25.16b 
2733 aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 2734 2735 aese v1.16b, v25.16b 2736 aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 2737 eor v10.16b, v10.16b, v11.16b 2738 2739 aese v0.16b, v25.16b 2740 aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 2741 2742 pmull v30.1q, v9.1d, v8.1d 2743 2744 aese v2.16b, v24.16b 2745 aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 2746 ext v9.16b, v9.16b, v9.16b, #8 2747 2748 aese v0.16b, v26.16b 2749 aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 2750 2751 aese v1.16b, v26.16b 2752 aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 2753 eor v10.16b, v10.16b, v30.16b 2754 2755 aese v2.16b, v25.16b 2756 aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 2757 2758 aese v3.16b, v26.16b 2759 aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 2760 2761 aese v0.16b, v27.16b 2762 aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 2763 2764 aese v2.16b, v26.16b 2765 aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 2766 eor v10.16b, v10.16b, v9.16b 2767 2768 aese v3.16b, v27.16b 2769 aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 2770 2771 aese v1.16b, v27.16b 2772 aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 2773 2774 aese v2.16b, v27.16b 2775 aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 2776 2777 pmull v30.1q, v10.1d, v8.1d 2778 2779 ext v10.16b, v10.16b, v10.16b, #8 2780 2781 aese v3.16b, v28.16b 2782 aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 2783 2784 aese v0.16b, v28.16b 2785 aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 2786 2787 aese v2.16b, v28.16b 2788 aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 2789 2790 aese v1.16b, v28.16b 2791 aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 2792 eor v11.16b, v11.16b, v30.16b 2793 2794 aese v0.16b, v29.16b //AES block 4k+4 - round 11 2795 2796 aese v3.16b, v29.16b //AES block 4k+7 - round 11 2797 2798 aese v2.16b, v29.16b //AES block 4k+6 - round 11 2799 2800 aese v1.16b, v29.16b //AES block 4k+5 - round 11 2801 eor v11.16b, v11.16b, v10.16b 2802.L192_enc_tail: //TAIL 2803 
2804 sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process 2805 ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext 2806#ifdef __AARCH64EB__ 2807 rev x6, x6 2808 rev x7, x7 2809#endif 2810 eor x6, x6, x13 //AES block 4k+4 - round 12 low 2811 eor x7, x7, x14 //AES block 4k+4 - round 12 high 2812 2813 fmov d4, x6 //AES block 4k+4 - mov low 2814 2815 fmov v4.d[1], x7 //AES block 4k+4 - mov high 2816 cmp x5, #48 2817 2818 eor v5.16b, v4.16b, v0.16b //AES block 4k+4 - result 2819 2820 ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag 2821 b.gt .L192_enc_blocks_more_than_3 2822 2823 sub w12, w12, #1 2824 movi v10.8b, #0 2825 2826 mov v3.16b, v2.16b 2827 movi v9.8b, #0 2828 cmp x5, #32 2829 2830 mov v2.16b, v1.16b 2831 movi v11.8b, #0 2832 b.gt .L192_enc_blocks_more_than_2 2833 2834 sub w12, w12, #1 2835 2836 mov v3.16b, v1.16b 2837 cmp x5, #16 2838 b.gt .L192_enc_blocks_more_than_1 2839 2840 sub w12, w12, #1 2841 b .L192_enc_blocks_less_than_1 2842.L192_enc_blocks_more_than_3: //blocks left > 3 2843 st1 { v5.16b}, [x2], #16 //AES final-3 block - store result 2844 2845 ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high 2846#ifdef __AARCH64EB__ 2847 rev x6, x6 2848 rev x7, x7 2849#endif 2850 rev64 v4.16b, v5.16b //GHASH final-3 block 2851 2852 eor x6, x6, x13 //AES final-2 block - round 12 low 2853 eor v4.16b, v4.16b, v8.16b //feed in partial tag 2854 2855 eor x7, x7, x14 //AES final-2 block - round 12 high 2856 fmov d5, x6 //AES final-2 block - mov low 2857 2858 fmov v5.d[1], x7 //AES final-2 block - mov high 2859 2860 mov d22, v4.d[1] //GHASH final-3 block - mid 2861 2862 pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low 2863 2864 mov d10, v17.d[1] //GHASH final-3 block - mid 2865 2866 eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid 2867 2868 movi v8.8b, #0 //suppress further partial tag feed in 2869 2870 pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high 2871 2872 pmull v10.1q, v22.1d, v10.1d //GHASH 
final-3 block - mid 2873 eor v5.16b, v5.16b, v1.16b //AES final-2 block - result 2874.L192_enc_blocks_more_than_2: //blocks left > 2 2875 2876 st1 { v5.16b}, [x2], #16 //AES final-2 block - store result 2877 2878 rev64 v4.16b, v5.16b //GHASH final-2 block 2879 ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high 2880#ifdef __AARCH64EB__ 2881 rev x6, x6 2882 rev x7, x7 2883#endif 2884 eor v4.16b, v4.16b, v8.16b //feed in partial tag 2885 2886 eor x7, x7, x14 //AES final-1 block - round 12 high 2887 2888 pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high 2889 mov d22, v4.d[1] //GHASH final-2 block - mid 2890 2891 pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low 2892 eor x6, x6, x13 //AES final-1 block - round 12 low 2893 2894 fmov d5, x6 //AES final-1 block - mov low 2895 2896 fmov v5.d[1], x7 //AES final-1 block - mov high 2897 eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high 2898 eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid 2899 2900 eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low 2901 2902 pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid 2903 2904 movi v8.8b, #0 //suppress further partial tag feed in 2905 2906 eor v5.16b, v5.16b, v2.16b //AES final-1 block - result 2907 2908 eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid 2909.L192_enc_blocks_more_than_1: //blocks left > 1 2910 2911 st1 { v5.16b}, [x2], #16 //AES final-1 block - store result 2912 2913 ldp x6, x7, [x0], #16 //AES final block - load input low & high 2914#ifdef __AARCH64EB__ 2915 rev x6, x6 2916 rev x7, x7 2917#endif 2918 rev64 v4.16b, v5.16b //GHASH final-1 block 2919 2920 eor x6, x6, x13 //AES final block - round 12 low 2921 eor v4.16b, v4.16b, v8.16b //feed in partial tag 2922 movi v8.8b, #0 //suppress further partial tag feed in 2923 2924 mov d22, v4.d[1] //GHASH final-1 block - mid 2925 2926 eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid 2927 eor x7, x7, x14 //AES final block - round 12 high 2928 fmov d5, x6 //AES 
final block - mov low 2929 2930 pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high 2931 fmov v5.d[1], x7 //AES final block - mov high 2932 2933 ins v22.d[1], v22.d[0] //GHASH final-1 block - mid 2934 2935 eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high 2936 2937 pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low 2938 2939 pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid 2940 2941 eor v5.16b, v5.16b, v3.16b //AES final block - result 2942 2943 eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low 2944 2945 eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid 2946.L192_enc_blocks_less_than_1: //blocks left <= 1 2947 2948 ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored 2949#ifndef __AARCH64EB__ 2950 rev w9, w12 2951#else 2952 mov w9, w12 2953#endif 2954 and x1, x1, #127 //bit_length %= 128 2955 2956 sub x1, x1, #128 //bit_length -= 128 2957 mvn x14, xzr //rk12_h = 0xffffffffffffffff 2958 2959 neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) 2960 mvn x13, xzr //rk12_l = 0xffffffffffffffff 2961 2962 and x1, x1, #127 //bit_length %= 128 2963 2964 lsr x14, x14, x1 //rk12_h is mask for top 64b of last block 2965 cmp x1, #64 2966 2967 csel x6, x13, x14, lt 2968 csel x7, x14, xzr, lt 2969 2970 fmov d0, x6 //ctr0b is mask for last block 2971 2972 fmov v0.d[1], x7 2973 2974 and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits 2975 2976 rev64 v4.16b, v5.16b //GHASH final block 2977 2978 eor v4.16b, v4.16b, v8.16b //feed in partial tag 2979 2980 mov d8, v4.d[1] //GHASH final block - mid 2981 2982 pmull v21.1q, v4.1d, v12.1d //GHASH final block - low 2983 2984 pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high 2985 2986 eor v8.8b, v8.8b, v4.8b //GHASH final block - mid 2987 2988 eor v11.16b, v11.16b, v21.16b //GHASH final block - low 2989 2990 eor v9.16b, v9.16b, v20.16b //GHASH final block - high 2991 2992 pmull v8.1q, v8.1d, v16.1d //GHASH 
final block - mid 2993 2994 eor v10.16b, v10.16b, v8.16b //GHASH final block - mid 2995 movi v8.8b, #0xc2 2996 2997 eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 2998 2999 shl d8, d8, #56 //mod_constant 3000 3001 bif v5.16b, v18.16b, v0.16b //insert existing bytes in top end of result before storing 3002 3003 eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 3004 3005 pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 3006 3007 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 3008 3009 eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 3010 3011 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 3012 3013 pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 3014 3015 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 3016 3017 eor v11.16b, v11.16b, v9.16b //MODULO - fold into low 3018 str w9, [x16, #12] //store the updated counter 3019 3020 st1 { v5.16b}, [x2] //store all 16B 3021 3022 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 3023 ext v11.16b, v11.16b, v11.16b, #8 3024 rev64 v11.16b, v11.16b 3025 mov x0, x15 3026 st1 { v11.16b }, [x3] 3027 3028 ldp x21, x22, [sp, #16] 3029 ldp x23, x24, [sp, #32] 3030 ldp d8, d9, [sp, #48] 3031 ldp d10, d11, [sp, #64] 3032 ldp d12, d13, [sp, #80] 3033 ldp d14, d15, [sp, #96] 3034 ldp x19, x20, [sp], #112 3035 ret 3036 3037.L192_enc_ret: 3038 mov w0, #0x0 3039 ret 3040.size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel 3041.globl aes_gcm_dec_192_kernel 3042.type aes_gcm_dec_192_kernel,%function 3043.align 4 3044aes_gcm_dec_192_kernel: 3045 AARCH64_VALID_CALL_TARGET 3046 cbz x1, .L192_dec_ret 3047 stp x19, x20, [sp, #-112]! 
3048 mov x16, x4 3049 mov x8, x5 3050 stp x21, x22, [sp, #16] 3051 stp x23, x24, [sp, #32] 3052 stp d8, d9, [sp, #48] 3053 stp d10, d11, [sp, #64] 3054 stp d12, d13, [sp, #80] 3055 stp d14, d15, [sp, #96] 3056 3057 add x4, x0, x1, lsr #3 //end_input_ptr 3058 ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 3059#ifdef __AARCH64EB__ 3060 rev x10, x10 3061 rev x11, x11 3062#endif 3063 ldp x13, x14, [x8, #192] //load rk12 3064#ifdef __AARCH64EB__ 3065 ror x13, x13, #32 3066 ror x14, x14, #32 3067#endif 3068 ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible 3069 3070 ld1 {v18.4s}, [x8], #16 //load rk0 3071 3072 lsr x5, x1, #3 //byte_len 3073 mov x15, x5 3074 ld1 {v19.4s}, [x8], #16 //load rk1 3075 3076 lsr x12, x11, #32 3077 orr w11, w11, w11 3078 fmov d3, x10 //CTR block 3 3079 3080 rev w12, w12 //rev_ctr32 3081 fmov d1, x10 //CTR block 1 3082 3083 add w12, w12, #1 //increment rev_ctr32 3084 ld1 {v20.4s}, [x8], #16 //load rk2 3085 3086 aese v0.16b, v18.16b 3087 aesmc v0.16b, v0.16b //AES block 0 - round 0 3088 rev w9, w12 //CTR block 1 3089 3090 add w12, w12, #1 //CTR block 1 3091 orr x9, x11, x9, lsl #32 //CTR block 1 3092 ld1 {v21.4s}, [x8], #16 //load rk3 3093 3094 fmov v1.d[1], x9 //CTR block 1 3095 rev w9, w12 //CTR block 2 3096 add w12, w12, #1 //CTR block 2 3097 3098 fmov d2, x10 //CTR block 2 3099 orr x9, x11, x9, lsl #32 //CTR block 2 3100 3101 fmov v2.d[1], x9 //CTR block 2 3102 rev w9, w12 //CTR block 3 3103 3104 aese v0.16b, v19.16b 3105 aesmc v0.16b, v0.16b //AES block 0 - round 1 3106 orr x9, x11, x9, lsl #32 //CTR block 3 3107 3108 fmov v3.d[1], x9 //CTR block 3 3109 3110 ld1 {v22.4s}, [x8], #16 //load rk4 3111 3112 aese v0.16b, v20.16b 3113 aesmc v0.16b, v0.16b //AES block 0 - round 2 3114 3115 aese v2.16b, v18.16b 3116 aesmc v2.16b, v2.16b //AES block 2 - round 0 3117 ld1 {v23.4s}, [x8], #16 //load rk5 3118 3119 aese v1.16b, v18.16b 3120 aesmc v1.16b, v1.16b //AES block 1 - round 0 3121 ldr 
q15, [x3, #112] //load h4l | h4h 3122#ifndef __AARCH64EB__ 3123 ext v15.16b, v15.16b, v15.16b, #8 3124#endif 3125 aese v3.16b, v18.16b 3126 aesmc v3.16b, v3.16b //AES block 3 - round 0 3127 ldr q13, [x3, #64] //load h2l | h2h 3128#ifndef __AARCH64EB__ 3129 ext v13.16b, v13.16b, v13.16b, #8 3130#endif 3131 aese v2.16b, v19.16b 3132 aesmc v2.16b, v2.16b //AES block 2 - round 1 3133 ldr q14, [x3, #80] //load h3l | h3h 3134#ifndef __AARCH64EB__ 3135 ext v14.16b, v14.16b, v14.16b, #8 3136#endif 3137 aese v1.16b, v19.16b 3138 aesmc v1.16b, v1.16b //AES block 1 - round 1 3139 3140 aese v3.16b, v19.16b 3141 aesmc v3.16b, v3.16b //AES block 3 - round 1 3142 ldr q12, [x3, #32] //load h1l | h1h 3143#ifndef __AARCH64EB__ 3144 ext v12.16b, v12.16b, v12.16b, #8 3145#endif 3146 aese v2.16b, v20.16b 3147 aesmc v2.16b, v2.16b //AES block 2 - round 2 3148 ld1 {v24.4s}, [x8], #16 //load rk6 3149 3150 aese v0.16b, v21.16b 3151 aesmc v0.16b, v0.16b //AES block 0 - round 3 3152 ld1 {v25.4s}, [x8], #16 //load rk7 3153 3154 aese v1.16b, v20.16b 3155 aesmc v1.16b, v1.16b //AES block 1 - round 2 3156 ld1 {v26.4s}, [x8], #16 //load rk8 3157 3158 aese v3.16b, v20.16b 3159 aesmc v3.16b, v3.16b //AES block 3 - round 2 3160 ld1 {v27.4s}, [x8], #16 //load rk9 3161 3162 aese v2.16b, v21.16b 3163 aesmc v2.16b, v2.16b //AES block 2 - round 3 3164 ld1 { v11.16b}, [x3] 3165 ext v11.16b, v11.16b, v11.16b, #8 3166 rev64 v11.16b, v11.16b 3167 3168 aese v1.16b, v21.16b 3169 aesmc v1.16b, v1.16b //AES block 1 - round 3 3170 add w12, w12, #1 //CTR block 3 3171 3172 aese v3.16b, v21.16b 3173 aesmc v3.16b, v3.16b //AES block 3 - round 3 3174 trn1 v9.2d, v14.2d, v15.2d //h4h | h3h 3175 3176 aese v0.16b, v22.16b 3177 aesmc v0.16b, v0.16b //AES block 0 - round 4 3178 ld1 {v28.4s}, [x8], #16 //load rk10 3179 3180 aese v1.16b, v22.16b 3181 aesmc v1.16b, v1.16b //AES block 1 - round 4 3182 trn2 v17.2d, v14.2d, v15.2d //h4l | h3l 3183 3184 aese v2.16b, v22.16b 3185 aesmc v2.16b, v2.16b //AES block 2 - round 4 3186 
3187 aese v3.16b, v22.16b 3188 aesmc v3.16b, v3.16b //AES block 3 - round 4 3189 trn2 v16.2d, v12.2d, v13.2d //h2l | h1l 3190 3191 aese v0.16b, v23.16b 3192 aesmc v0.16b, v0.16b //AES block 0 - round 5 3193 ld1 {v29.4s}, [x8], #16 //load rk11 3194 3195 aese v1.16b, v23.16b 3196 aesmc v1.16b, v1.16b //AES block 1 - round 5 3197 3198 aese v2.16b, v23.16b 3199 aesmc v2.16b, v2.16b //AES block 2 - round 5 3200 3201 aese v3.16b, v23.16b 3202 aesmc v3.16b, v3.16b //AES block 3 - round 5 3203 3204 aese v0.16b, v24.16b 3205 aesmc v0.16b, v0.16b //AES block 0 - round 6 3206 3207 aese v2.16b, v24.16b 3208 aesmc v2.16b, v2.16b //AES block 2 - round 6 3209 3210 aese v3.16b, v24.16b 3211 aesmc v3.16b, v3.16b //AES block 3 - round 6 3212 3213 aese v0.16b, v25.16b 3214 aesmc v0.16b, v0.16b //AES block 0 - round 7 3215 3216 aese v2.16b, v25.16b 3217 aesmc v2.16b, v2.16b //AES block 2 - round 7 3218 3219 aese v3.16b, v25.16b 3220 aesmc v3.16b, v3.16b //AES block 3 - round 7 3221 3222 aese v1.16b, v24.16b 3223 aesmc v1.16b, v1.16b //AES block 1 - round 6 3224 3225 aese v2.16b, v26.16b 3226 aesmc v2.16b, v2.16b //AES block 2 - round 8 3227 3228 aese v3.16b, v26.16b 3229 aesmc v3.16b, v3.16b //AES block 3 - round 8 3230 3231 aese v1.16b, v25.16b 3232 aesmc v1.16b, v1.16b //AES block 1 - round 7 3233 3234 aese v2.16b, v27.16b 3235 aesmc v2.16b, v2.16b //AES block 2 - round 9 3236 3237 aese v3.16b, v27.16b 3238 aesmc v3.16b, v3.16b //AES block 3 - round 9 3239 3240 aese v1.16b, v26.16b 3241 aesmc v1.16b, v1.16b //AES block 1 - round 8 3242 sub x5, x5, #1 //byte_len - 1 3243 3244 aese v0.16b, v26.16b 3245 aesmc v0.16b, v0.16b //AES block 0 - round 8 3246 and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 3247 3248 aese v3.16b, v28.16b 3249 aesmc v3.16b, v3.16b //AES block 3 - round 10 3250 add x5, x5, x0 3251 3252 aese v1.16b, v27.16b 3253 aesmc v1.16b, v1.16b //AES block 1 - round 9 3254 cmp x0, x5 //check if we have 
<= 4 blocks 3255 3256 aese v0.16b, v27.16b 3257 aesmc v0.16b, v0.16b //AES block 0 - round 9 3258 trn1 v8.2d, v12.2d, v13.2d //h2h | h1h 3259 3260 aese v3.16b, v29.16b //AES block 3 - round 11 3261 3262 aese v2.16b, v28.16b 3263 aesmc v2.16b, v2.16b //AES block 2 - round 10 3264 3265 aese v1.16b, v28.16b 3266 aesmc v1.16b, v1.16b //AES block 1 - round 10 3267 3268 aese v0.16b, v28.16b 3269 aesmc v0.16b, v0.16b //AES block 0 - round 10 3270 eor v16.16b, v16.16b, v8.16b //h2k | h1k 3271 3272 aese v2.16b, v29.16b //AES block 2 - round 11 3273 3274 aese v1.16b, v29.16b //AES block 1 - round 11 3275 eor v17.16b, v17.16b, v9.16b //h4k | h3k 3276 3277 aese v0.16b, v29.16b //AES block 0 - round 11 3278 b.ge .L192_dec_tail //handle tail 3279 3280 ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0,1 - load ciphertext 3281 3282 eor v1.16b, v5.16b, v1.16b //AES block 1 - result 3283 3284 eor v0.16b, v4.16b, v0.16b //AES block 0 - result 3285 rev w9, w12 //CTR block 4 3286 ld1 {v6.16b, v7.16b}, [x0], #32 //AES block 2,3 - load ciphertext 3287 3288 mov x19, v1.d[0] //AES block 1 - mov low 3289 3290 mov x20, v1.d[1] //AES block 1 - mov high 3291 3292 mov x6, v0.d[0] //AES block 0 - mov low 3293 orr x9, x11, x9, lsl #32 //CTR block 4 3294 add w12, w12, #1 //CTR block 4 3295 3296 mov x7, v0.d[1] //AES block 0 - mov high 3297 rev64 v4.16b, v4.16b //GHASH block 0 3298 3299 fmov d0, x10 //CTR block 4 3300 rev64 v5.16b, v5.16b //GHASH block 1 3301 cmp x0, x5 //check if we have <= 8 blocks 3302 3303 eor x19, x19, x13 //AES block 1 - round 12 low 3304#ifdef __AARCH64EB__ 3305 rev x19, x19 3306#endif 3307 fmov v0.d[1], x9 //CTR block 4 3308 rev w9, w12 //CTR block 5 3309 3310 orr x9, x11, x9, lsl #32 //CTR block 5 3311 fmov d1, x10 //CTR block 5 3312 eor x20, x20, x14 //AES block 1 - round 12 high 3313#ifdef __AARCH64EB__ 3314 rev x20, x20 3315#endif 3316 add w12, w12, #1 //CTR block 5 3317 fmov v1.d[1], x9 //CTR block 5 3318 eor x6, x6, x13 //AES block 0 - round 12 low 3319#ifdef 
__AARCH64EB__ 3320 rev x6, x6 3321#endif 3322 rev w9, w12 //CTR block 6 3323 eor x7, x7, x14 //AES block 0 - round 12 high 3324#ifdef __AARCH64EB__ 3325 rev x7, x7 3326#endif 3327 stp x6, x7, [x2], #16 //AES block 0 - store result 3328 orr x9, x11, x9, lsl #32 //CTR block 6 3329 3330 stp x19, x20, [x2], #16 //AES block 1 - store result 3331 3332 add w12, w12, #1 //CTR block 6 3333 eor v2.16b, v6.16b, v2.16b //AES block 2 - result 3334 b.ge .L192_dec_prepretail //do prepretail 3335 3336.L192_dec_main_loop: //main loop start 3337 aese v1.16b, v18.16b 3338 aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 3339 ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 3340 3341 pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 3342 mov x21, v2.d[0] //AES block 4k+2 - mov low 3343 3344 mov x22, v2.d[1] //AES block 4k+2 - mov high 3345 eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result 3346 rev64 v7.16b, v7.16b //GHASH block 4k+3 3347 3348 aese v1.16b, v19.16b 3349 aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 3350 fmov d2, x10 //CTR block 4k+6 3351 3352 aese v0.16b, v18.16b 3353 aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 3354 eor v4.16b, v4.16b, v11.16b //PRE 1 3355 3356 pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 3357 fmov v2.d[1], x9 //CTR block 4k+6 3358 3359 aese v1.16b, v20.16b 3360 aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 3361 mov x24, v3.d[1] //AES block 4k+3 - mov high 3362 3363 aese v0.16b, v19.16b 3364 aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 3365 mov x23, v3.d[0] //AES block 4k+3 - mov low 3366 3367 pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 3368 fmov d3, x10 //CTR block 4k+7 3369 mov d8, v4.d[1] //GHASH block 4k - mid 3370 3371 pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 3372 mov d10, v17.d[1] //GHASH block 4k - mid 3373 rev w9, w12 //CTR block 4k+7 3374 3375 aese v2.16b, v18.16b 3376 aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 3377 orr x9, x11, x9, lsl #32 //CTR block 4k+7 3378 3379 fmov v3.d[1], x9 //CTR block 4k+7 
3380 eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 3381 mov d4, v5.d[1] //GHASH block 4k+1 - mid 3382 3383 aese v1.16b, v21.16b 3384 aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 3385 3386 aese v0.16b, v20.16b 3387 aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 3388 eor x22, x22, x14 //AES block 4k+2 - round 12 high 3389#ifdef __AARCH64EB__ 3390 rev x22, x22 3391#endif 3392 aese v2.16b, v19.16b 3393 aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 3394 eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 3395 3396 pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 3397 3398 aese v3.16b, v18.16b 3399 aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 3400 rev64 v6.16b, v6.16b //GHASH block 4k+2 3401 3402 aese v2.16b, v20.16b 3403 aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 3404 3405 pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 3406 eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low 3407 eor x21, x21, x13 //AES block 4k+2 - round 12 low 3408#ifdef __AARCH64EB__ 3409 rev x21, x21 3410#endif 3411 aese v1.16b, v22.16b 3412 aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 3413 3414 aese v0.16b, v21.16b 3415 aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 3416 3417 eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 3418 mov d31, v6.d[1] //GHASH block 4k+2 - mid 3419 3420 aese v3.16b, v19.16b 3421 aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 3422 eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high 3423 3424 aese v0.16b, v22.16b 3425 aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 3426 3427 pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 3428 eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid 3429 3430 pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 3431 3432 aese v0.16b, v23.16b 3433 aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 3434 3435 eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high 3436 mov d30, v7.d[1] //GHASH block 4k+3 - mid 3437 3438 aese v1.16b, v23.16b 3439 aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 3440 3441 pmull2 
v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 3442 3443 aese v3.16b, v20.16b 3444 aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 3445 eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid 3446 3447 aese v1.16b, v24.16b 3448 aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 3449 3450 aese v0.16b, v24.16b 3451 aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 3452 ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid 3453 3454 aese v3.16b, v21.16b 3455 aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 3456 3457 pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid 3458 eor v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low 3459 3460 aese v0.16b, v25.16b 3461 aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 3462 3463 pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid 3464 eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 3465 3466 aese v1.16b, v25.16b 3467 aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 3468 3469 aese v0.16b, v26.16b 3470 aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 3471 movi v8.8b, #0xc2 3472 3473 pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 3474 3475 aese v1.16b, v26.16b 3476 aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 3477 eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid 3478 3479 aese v2.16b, v21.16b 3480 aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 3481 3482 aese v0.16b, v27.16b 3483 aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 3484 eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low 3485 3486 aese v3.16b, v22.16b 3487 aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 3488 3489 aese v2.16b, v22.16b 3490 aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 3491 eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid 3492 3493 aese v0.16b, v28.16b 3494 aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 3495 3496 aese v1.16b, v27.16b 3497 aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 3498 eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 3499 3500 aese v2.16b, v23.16b 3501 aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 3502 3503 
aese v3.16b, v23.16b 3504 aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 3505 shl d8, d8, #56 //mod_constant 3506 3507 aese v1.16b, v28.16b 3508 aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 3509 3510 aese v2.16b, v24.16b 3511 aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 3512 ld1 {v4.16b}, [x0], #16 //AES block 4k+4 - load ciphertext 3513 3514 aese v3.16b, v24.16b 3515 aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 3516 eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 3517 3518 pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 3519 ld1 {v5.16b}, [x0], #16 //AES block 4k+5 - load ciphertext 3520 eor x23, x23, x13 //AES block 4k+3 - round 12 low 3521#ifdef __AARCH64EB__ 3522 rev x23, x23 3523#endif 3524 aese v2.16b, v25.16b 3525 aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 3526 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 3527 3528 aese v0.16b, v29.16b //AES block 4k+4 - round 11 3529 add w12, w12, #1 //CTR block 4k+7 3530 3531 aese v3.16b, v25.16b 3532 aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 3533 eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 3534 3535 aese v2.16b, v26.16b 3536 aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 3537 ld1 {v6.16b}, [x0], #16 //AES block 4k+6 - load ciphertext 3538 3539 aese v1.16b, v29.16b //AES block 4k+5 - round 11 3540 ld1 {v7.16b}, [x0], #16 //AES block 4k+7 - load ciphertext 3541 rev w9, w12 //CTR block 4k+8 3542 3543 aese v3.16b, v26.16b 3544 aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 3545 stp x21, x22, [x2], #16 //AES block 4k+2 - store result 3546 3547 aese v2.16b, v27.16b 3548 aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 3549 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 3550 3551 cmp x0, x5 //.LOOP CONTROL 3552 3553 eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result 3554 eor x24, x24, x14 //AES block 4k+3 - round 12 high 3555#ifdef __AARCH64EB__ 3556 rev x24, x24 3557#endif 3558 eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result 3559 3560 aese 
v2.16b, v28.16b 3561 aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 3562 orr x9, x11, x9, lsl #32 //CTR block 4k+8 3563 3564 aese v3.16b, v27.16b 3565 aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 3566 3567 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 3568 mov x19, v1.d[0] //AES block 4k+5 - mov low 3569 3570 mov x6, v0.d[0] //AES block 4k+4 - mov low 3571 stp x23, x24, [x2], #16 //AES block 4k+3 - store result 3572 rev64 v5.16b, v5.16b //GHASH block 4k+5 3573 3574 aese v2.16b, v29.16b //AES block 4k+6 - round 11 3575 mov x7, v0.d[1] //AES block 4k+4 - mov high 3576 3577 aese v3.16b, v28.16b 3578 aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 3579 mov x20, v1.d[1] //AES block 4k+5 - mov high 3580 3581 fmov d0, x10 //CTR block 4k+8 3582 add w12, w12, #1 //CTR block 4k+8 3583 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 3584 3585 eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result 3586 fmov v0.d[1], x9 //CTR block 4k+8 3587 rev w9, w12 //CTR block 4k+9 3588 3589 eor x6, x6, x13 //AES block 4k+4 - round 12 low 3590#ifdef __AARCH64EB__ 3591 rev x6, x6 3592#endif 3593 orr x9, x11, x9, lsl #32 //CTR block 4k+9 3594 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 3595 3596 fmov d1, x10 //CTR block 4k+9 3597 add w12, w12, #1 //CTR block 4k+9 3598 eor x19, x19, x13 //AES block 4k+5 - round 12 low 3599#ifdef __AARCH64EB__ 3600 rev x19, x19 3601#endif 3602 fmov v1.d[1], x9 //CTR block 4k+9 3603 rev w9, w12 //CTR block 4k+10 3604 eor x20, x20, x14 //AES block 4k+5 - round 12 high 3605#ifdef __AARCH64EB__ 3606 rev x20, x20 3607#endif 3608 eor x7, x7, x14 //AES block 4k+4 - round 12 high 3609#ifdef __AARCH64EB__ 3610 rev x7, x7 3611#endif 3612 stp x6, x7, [x2], #16 //AES block 4k+4 - store result 3613 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 3614 3615 add w12, w12, #1 //CTR block 4k+10 3616 rev64 v4.16b, v4.16b //GHASH block 4k+4 3617 orr x9, x11, x9, lsl #32 //CTR block 4k+10 3618 3619 aese v3.16b, v29.16b //AES 
block 4k+7 - round 11 3620 stp x19, x20, [x2], #16 //AES block 4k+5 - store result 3621 b.lt .L192_dec_main_loop 3622 3623.L192_dec_prepretail: //PREPRETAIL 3624 mov x22, v2.d[1] //AES block 4k+2 - mov high 3625 ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 3626 eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result 3627 3628 aese v1.16b, v18.16b 3629 aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 3630 mov x21, v2.d[0] //AES block 4k+2 - mov low 3631 3632 aese v0.16b, v18.16b 3633 aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 3634 mov d10, v17.d[1] //GHASH block 4k - mid 3635 3636 eor v4.16b, v4.16b, v11.16b //PRE 1 3637 fmov d2, x10 //CTR block 4k+6 3638 3639 aese v1.16b, v19.16b 3640 aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 3641 mov x23, v3.d[0] //AES block 4k+3 - mov low 3642 3643 aese v0.16b, v19.16b 3644 aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 3645 mov x24, v3.d[1] //AES block 4k+3 - mov high 3646 3647 pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 3648 mov d8, v4.d[1] //GHASH block 4k - mid 3649 fmov d3, x10 //CTR block 4k+7 3650 3651 aese v1.16b, v20.16b 3652 aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 3653 rev64 v6.16b, v6.16b //GHASH block 4k+2 3654 3655 pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 3656 fmov v2.d[1], x9 //CTR block 4k+6 3657 rev w9, w12 //CTR block 4k+7 3658 3659 orr x9, x11, x9, lsl #32 //CTR block 4k+7 3660 eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 3661 mov d4, v5.d[1] //GHASH block 4k+1 - mid 3662 3663 pmull v31.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 3664 eor x24, x24, x14 //AES block 4k+3 - round 12 high 3665#ifdef __AARCH64EB__ 3666 rev x24, x24 3667#endif 3668 fmov v3.d[1], x9 //CTR block 4k+7 3669 3670 aese v0.16b, v20.16b 3671 aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 3672 eor x21, x21, x13 //AES block 4k+2 - round 12 low 3673#ifdef __AARCH64EB__ 3674 rev x21, x21 3675#endif 3676 pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 3677 eor x22, x22, x14 //AES block 4k+2 - round 12 high 
3678#ifdef __AARCH64EB__ 3679 rev x22, x22 3680#endif 3681 eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 3682 3683 pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 3684 eor x23, x23, x13 //AES block 4k+3 - round 12 low 3685#ifdef __AARCH64EB__ 3686 rev x23, x23 3687#endif 3688 stp x21, x22, [x2], #16 //AES block 4k+2 - store result 3689 3690 rev64 v7.16b, v7.16b //GHASH block 4k+3 3691 stp x23, x24, [x2], #16 //AES block 4k+3 - store result 3692 3693 aese v3.16b, v18.16b 3694 aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 3695 eor v9.16b, v9.16b, v30.16b //GHASH block 4k+1 - high 3696 3697 pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 3698 add w12, w12, #1 //CTR block 4k+7 3699 3700 pmull2 v30.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 3701 eor v11.16b, v11.16b, v31.16b //GHASH block 4k+1 - low 3702 3703 aese v2.16b, v18.16b 3704 aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 3705 3706 eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 3707 mov d31, v6.d[1] //GHASH block 4k+2 - mid 3708 3709 aese v3.16b, v19.16b 3710 aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 3711 3712 aese v2.16b, v19.16b 3713 aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 3714 eor v9.16b, v9.16b, v30.16b //GHASH block 4k+2 - high 3715 3716 eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid 3717 3718 pmull v8.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 3719 3720 aese v2.16b, v20.16b 3721 aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 3722 mov d30, v7.d[1] //GHASH block 4k+3 - mid 3723 3724 aese v3.16b, v20.16b 3725 aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 3726 ins v31.d[1], v31.d[0] //GHASH block 4k+2 - mid 3727 3728 pmull v6.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 3729 3730 aese v0.16b, v21.16b 3731 aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 3732 eor v30.8b, v30.8b, v7.8b //GHASH block 4k+3 - mid 3733 3734 aese v1.16b, v21.16b 3735 aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 3736 3737 pmull2 v31.1q, v31.2d, v16.2d //GHASH block 4k+2 - mid 3738 eor 
v11.16b, v11.16b, v8.16b //GHASH block 4k+2 - low 3739 3740 aese v0.16b, v22.16b 3741 aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 3742 3743 pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 3744 movi v8.8b, #0xc2 3745 3746 pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid 3747 3748 aese v2.16b, v21.16b 3749 aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 3750 3751 shl d8, d8, #56 //mod_constant 3752 eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 3753 3754 aese v0.16b, v23.16b 3755 aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 3756 eor v10.16b, v10.16b, v31.16b //GHASH block 4k+2 - mid 3757 3758 aese v2.16b, v22.16b 3759 aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 3760 3761 pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 3762 eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low 3763 3764 aese v0.16b, v24.16b 3765 aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 3766 3767 aese v3.16b, v21.16b 3768 aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 3769 eor v10.16b, v10.16b, v30.16b //GHASH block 4k+3 - mid 3770 3771 aese v2.16b, v23.16b 3772 aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 3773 3774 aese v0.16b, v25.16b 3775 aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 3776 eor v30.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 3777 3778 aese v3.16b, v22.16b 3779 aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 3780 3781 aese v2.16b, v24.16b 3782 aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 3783 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 3784 3785 aese v0.16b, v26.16b 3786 aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 3787 3788 aese v3.16b, v23.16b 3789 aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 3790 eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up 3791 3792 aese v1.16b, v22.16b 3793 aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 3794 3795 aese v2.16b, v25.16b 3796 aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 3797 3798 aese v0.16b, v27.16b 3799 aesmc v0.16b, v0.16b //AES block 4k+4 - 
round 9 3800 3801 aese v1.16b, v23.16b 3802 aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 3803 3804 aese v3.16b, v24.16b 3805 aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 3806 eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid 3807 3808 aese v0.16b, v28.16b 3809 aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 3810 3811 aese v1.16b, v24.16b 3812 aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 3813 3814 aese v3.16b, v25.16b 3815 aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 3816 3817 aese v2.16b, v26.16b 3818 aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 3819 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 3820 3821 aese v1.16b, v25.16b 3822 aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 3823 3824 aese v3.16b, v26.16b 3825 aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 3826 3827 aese v2.16b, v27.16b 3828 aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 3829 3830 aese v1.16b, v26.16b 3831 aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 3832 3833 aese v3.16b, v27.16b 3834 aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 3835 3836 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 3837 3838 aese v1.16b, v27.16b 3839 aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 3840 3841 aese v2.16b, v28.16b 3842 aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 3843 3844 aese v3.16b, v28.16b 3845 aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 3846 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 3847 3848 aese v1.16b, v28.16b 3849 aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 3850 3851 aese v0.16b, v29.16b 3852 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 3853 3854 aese v2.16b, v29.16b 3855 3856 aese v1.16b, v29.16b 3857 3858 aese v3.16b, v29.16b 3859 3860 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 3861.L192_dec_tail: //TAIL 3862 3863 sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process 3864 ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext 3865 3866 eor v0.16b, v5.16b, v0.16b //AES 
block 4k+4 - result 3867 3868 mov x7, v0.d[1] //AES block 4k+4 - mov high 3869 3870 mov x6, v0.d[0] //AES block 4k+4 - mov low 3871 3872 ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag 3873 3874 cmp x5, #48 3875 3876 eor x7, x7, x14 //AES block 4k+4 - round 12 high 3877#ifdef __AARCH64EB__ 3878 rev x7, x7 3879#endif 3880 eor x6, x6, x13 //AES block 4k+4 - round 12 low 3881#ifdef __AARCH64EB__ 3882 rev x6, x6 3883#endif 3884 b.gt .L192_dec_blocks_more_than_3 3885 3886 movi v11.8b, #0 3887 movi v9.8b, #0 3888 3889 mov v3.16b, v2.16b 3890 mov v2.16b, v1.16b 3891 sub w12, w12, #1 3892 3893 movi v10.8b, #0 3894 cmp x5, #32 3895 b.gt .L192_dec_blocks_more_than_2 3896 3897 mov v3.16b, v1.16b 3898 cmp x5, #16 3899 sub w12, w12, #1 3900 3901 b.gt .L192_dec_blocks_more_than_1 3902 3903 sub w12, w12, #1 3904 b .L192_dec_blocks_less_than_1 3905.L192_dec_blocks_more_than_3: //blocks left > 3 3906 rev64 v4.16b, v5.16b //GHASH final-3 block 3907 ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext 3908 3909 stp x6, x7, [x2], #16 //AES final-3 block - store result 3910 3911 eor v4.16b, v4.16b, v8.16b //feed in partial tag 3912 3913 eor v0.16b, v5.16b, v1.16b //AES final-2 block - result 3914 3915 pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low 3916 mov x6, v0.d[0] //AES final-2 block - mov low 3917 mov d22, v4.d[1] //GHASH final-3 block - mid 3918 3919 mov x7, v0.d[1] //AES final-2 block - mov high 3920 3921 mov d10, v17.d[1] //GHASH final-3 block - mid 3922 eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid 3923 3924 pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high 3925 3926 eor x6, x6, x13 //AES final-2 block - round 12 low 3927#ifdef __AARCH64EB__ 3928 rev x6, x6 3929#endif 3930 movi v8.8b, #0 //suppress further partial tag feed in 3931 3932 pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid 3933 eor x7, x7, x14 //AES final-2 block - round 12 high 3934#ifdef __AARCH64EB__ 3935 rev x7, x7 3936#endif 
3937.L192_dec_blocks_more_than_2: //blocks left > 2 3938 3939 rev64 v4.16b, v5.16b //GHASH final-2 block 3940 ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext 3941 3942 eor v4.16b, v4.16b, v8.16b //feed in partial tag 3943 3944 movi v8.8b, #0 //suppress further partial tag feed in 3945 3946 eor v0.16b, v5.16b, v2.16b //AES final-1 block - result 3947 3948 mov d22, v4.d[1] //GHASH final-2 block - mid 3949 3950 pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low 3951 3952 stp x6, x7, [x2], #16 //AES final-2 block - store result 3953 3954 eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid 3955 mov x7, v0.d[1] //AES final-1 block - mov high 3956 3957 eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low 3958 mov x6, v0.d[0] //AES final-1 block - mov low 3959 3960 pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high 3961 3962 pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid 3963 3964 eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high 3965 eor x7, x7, x14 //AES final-1 block - round 12 high 3966#ifdef __AARCH64EB__ 3967 rev x7, x7 3968#endif 3969 eor x6, x6, x13 //AES final-1 block - round 12 low 3970#ifdef __AARCH64EB__ 3971 rev x6, x6 3972#endif 3973 eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid 3974.L192_dec_blocks_more_than_1: //blocks left > 1 3975 3976 rev64 v4.16b, v5.16b //GHASH final-1 block 3977 3978 eor v4.16b, v4.16b, v8.16b //feed in partial tag 3979 ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext 3980 3981 mov d22, v4.d[1] //GHASH final-1 block - mid 3982 3983 pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block - high 3984 3985 eor v0.16b, v5.16b, v3.16b //AES final block - result 3986 stp x6, x7, [x2], #16 //AES final-1 block - store result 3987 3988 eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid 3989 3990 eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high 3991 3992 pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low 3993 mov x7, v0.d[1] //AES final block - mov high 3994 
3995 ins v22.d[1], v22.d[0] //GHASH final-1 block - mid 3996 mov x6, v0.d[0] //AES final block - mov low 3997 3998 pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid 3999 4000 movi v8.8b, #0 //suppress further partial tag feed in 4001 eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low 4002 eor x7, x7, x14 //AES final block - round 12 high 4003#ifdef __AARCH64EB__ 4004 rev x7, x7 4005#endif 4006 eor x6, x6, x13 //AES final block - round 12 low 4007#ifdef __AARCH64EB__ 4008 rev x6, x6 4009#endif 4010 eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid 4011.L192_dec_blocks_less_than_1: //blocks left <= 1 4012 4013 mvn x13, xzr //rk12_l = 0xffffffffffffffff 4014 ldp x4, x5, [x2] //load existing bytes we need to not overwrite 4015 and x1, x1, #127 //bit_length %= 128 4016 4017 sub x1, x1, #128 //bit_length -= 128 4018 4019 neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) 4020 4021 and x1, x1, #127 //bit_length %= 128 4022 mvn x14, xzr //rk12_h = 0xffffffffffffffff 4023 4024 lsr x14, x14, x1 //rk12_h is mask for top 64b of last block 4025 cmp x1, #64 4026 4027 csel x9, x13, x14, lt 4028 csel x10, x14, xzr, lt 4029 4030 fmov d0, x9 //ctr0b is mask for last block 4031 and x6, x6, x9 4032 bic x4, x4, x9 //mask out low existing bytes 4033 4034 orr x6, x6, x4 4035 mov v0.d[1], x10 4036#ifndef __AARCH64EB__ 4037 rev w9, w12 4038#else 4039 mov w9, w12 4040#endif 4041 4042 and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits 4043 str w9, [x16, #12] //store the updated counter 4044 4045 rev64 v4.16b, v5.16b //GHASH final block 4046 4047 eor v4.16b, v4.16b, v8.16b //feed in partial tag 4048 bic x5, x5, x10 //mask out high existing bytes 4049 4050 and x7, x7, x10 4051 4052 pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high 4053 mov d8, v4.d[1] //GHASH final block - mid 4054 4055 pmull v21.1q, v4.1d, v12.1d //GHASH final block - low 4056 4057 eor v8.8b, v8.8b, v4.8b //GHASH final block - mid 4058 4059 eor 
v9.16b, v9.16b, v20.16b	//GHASH final block - high

	pmull v8.1q, v8.1d, v16.1d	//GHASH final block - mid

	eor v11.16b, v11.16b, v21.16b	//GHASH final block - low

	eor v10.16b, v10.16b, v8.16b	//GHASH final block - mid
	movi v8.8b, #0xc2

	eor v30.16b, v11.16b, v9.16b	//MODULO - karatsuba tidy up

	shl d8, d8, #56	//mod_constant

	eor v10.16b, v10.16b, v30.16b	//MODULO - karatsuba tidy up

	pmull v31.1q, v9.1d, v8.1d	//MODULO - top 64b align with mid
	orr x7, x7, x5
	stp x6, x7, [x2]

	ext v9.16b, v9.16b, v9.16b, #8	//MODULO - other top alignment

	eor v10.16b, v10.16b, v31.16b	//MODULO - fold into mid

	eor v10.16b, v10.16b, v9.16b	//MODULO - fold into mid

	pmull v8.1q, v10.1d, v8.1d	//MODULO - mid 64b align with low

	eor v11.16b, v11.16b, v8.16b	//MODULO - fold into low

	ext v10.16b, v10.16b, v10.16b, #8	//MODULO - other mid alignment

	eor v11.16b, v11.16b, v10.16b	//MODULO - fold into low
	ext v11.16b, v11.16b, v11.16b, #8
	rev64 v11.16b, v11.16b
	mov x0, x15
	st1 { v11.16b }, [x3]

	ldp x21, x22, [sp, #16]
	ldp x23, x24, [sp, #32]
	ldp d8, d9, [sp, #48]
	ldp d10, d11, [sp, #64]
	ldp d12, d13, [sp, #80]
	ldp d14, d15, [sp, #96]
	ldp x19, x20, [sp], #112
	ret

.L192_dec_ret:
	mov w0, #0x0
	ret
.size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
//-----------------------------------------------------------------------
// aes_gcm_enc_256_kernel
//
// Encrypts the input with a 14-round AES key schedule in counter mode
// (rounds 0-13 via aese/aesmc on v18-v31, round 14 applied as an XOR with
// rk14 held in x13:x14) and folds every produced output block into the
// GHASH accumulator. Four blocks are processed per main-loop iteration
// with the AES rounds, GHASH multiplies and counter setup interleaved.
//
// This file is generated from aes-gcm-armv8_64.pl; comment changes made
// here should be mirrored in the generator.
//
// In (as used by the code below):
//   x0 = input pointer (plaintext is loaded from here)
//   x1 = input length in BITS (byte_len = x1 >> 3)
//   x2 = output pointer (results are stored here)
//   x3 = pointer to the hash state: 16-byte accumulator at [x3], hash key
//        values loaded from [x3,#32] (h1), [x3,#64] (h2), [x3,#80] (h3)
//        and [x3,#112] (h4)
//   x4 = pointer to the 16-byte counter block (kept in x16)
//   x5 = pointer to the round keys rk0..rk14, 16 bytes each (kept in x8)
// Out:
//   x0 = byte_len (x1 >> 3), or 0 when x1 == 0
//   [x3]      updated hash accumulator
//   [x16,#12] updated 32-bit counter word
// Saved/restored: x19-x24, d8-d15 (112-byte frame)
// Clobbered: x0-x2, x4-x16, v0-v7, v16-v31, condition flags
//
// NOTE(review): loop bounds compare pointers (cmp x0, x5) with signed
// conditions (b.ge / b.lt) - confirm buffers can never straddle the
// signed 64-bit boundary.
// NOTE(review): the tail always loads 16 input bytes and loads/stores 16
// output bytes for the last block, even when the bit length is not a
// multiple of 128 - presumably callers guarantee this is addressable;
// confirm.
//-----------------------------------------------------------------------
.globl aes_gcm_enc_256_kernel
.type aes_gcm_enc_256_kernel,%function
.align 4
aes_gcm_enc_256_kernel:
	AARCH64_VALID_CALL_TARGET
	cbz x1, .L256_enc_ret	//zero-length input: return 0 without touching the stack
	stp x19, x20, [sp, #-112]!
	mov x16, x4	//x16 = counter block pointer (x4 is reused below)
	mov x8, x5	//x8 = round key pointer (x5 is reused below)
	stp x21, x22, [sp, #16]
	stp x23, x24, [sp, #32]
	stp d8, d9, [sp, #48]
	stp d10, d11, [sp, #64]
	stp d12, d13, [sp, #80]
	stp d14, d15, [sp, #96]

	add x4, x0, x1, lsr #3	//end_input_ptr
	lsr x5, x1, #3	//byte_len
	mov x15, x5	//keep byte_len for the return value
	ldp x10, x11, [x16]	//ctr96_b64, ctr96_t32
#ifdef __AARCH64EB__
	rev x10, x10
	rev x11, x11
#endif
	ldp x13, x14, [x8, #224]	//load rk14
#ifdef __AARCH64EB__
	ror x13, x13, #32
	ror x14, x14, #32
#endif
	ld1 { v0.16b}, [x16]	//special case vector load initial counter so we can start first AES block as quickly as possible
	sub x5, x5, #1	//byte_len - 1

	ld1 {v18.4s}, [x8], #16	//load rk0
	and x5, x5, #0xffffffffffffffc0	//number of bytes to be processed in main loop (at least 1 byte must be handled by tail)

	ld1 {v19.4s}, [x8], #16	//load rk1
	add x5, x5, x0	//x5 = main_end_input_ptr

	lsr x12, x11, #32	//x12 = upper 32 bits of x11 (the 32-bit counter word)
	fmov d2, x10	//CTR block 2
	orr w11, w11, w11	//32-bit write clears the upper half of x11, leaving room to OR in each new counter word

	rev w12, w12	//rev_ctr32
	// The flags set by this cmp are consumed by "b.ge .L256_enc_tail" far
	// below; no instruction in between writes the condition flags.
	cmp x0, x5	//check if we have <= 4 blocks
	fmov d1, x10	//CTR block 1

	aese v0.16b, v18.16b
	aesmc v0.16b, v0.16b	//AES block 0 - round 0
	add w12, w12, #1	//increment rev_ctr32

	rev w9, w12	//CTR block 1
	fmov d3, x10	//CTR block 3

	orr x9, x11, x9, lsl #32	//CTR block 1
	add w12, w12, #1	//CTR block 1
	ld1 {v20.4s}, [x8], #16	//load rk2

	fmov v1.d[1], x9	//CTR block 1
	rev w9, w12	//CTR block 2
	add w12, w12, #1	//CTR block 2

	orr x9, x11, x9, lsl #32	//CTR block 2
	ld1 {v21.4s}, [x8], #16	//load rk3

	fmov v2.d[1], x9	//CTR block 2
	rev w9, w12	//CTR block 3

	aese v0.16b, v19.16b
	aesmc v0.16b, v0.16b	//AES block 0 - round 1
	orr x9, x11, x9, lsl #32	//CTR block 3

	fmov v3.d[1], x9	//CTR block 3

	aese v1.16b, v18.16b
	aesmc v1.16b, v1.16b	//AES block 1 - round 0
	ld1 {v22.4s}, [x8], #16	//load rk4

	aese v0.16b, v20.16b
	aesmc v0.16b, v0.16b	//AES block 0 - round 2
	ld1 {v23.4s}, [x8], #16	//load rk5

	aese v2.16b, v18.16b
	aesmc v2.16b, v2.16b	//AES block 2 - round 0
	ld1 {v24.4s}, [x8], #16	//load rk6

	aese v1.16b, v19.16b
	aesmc v1.16b, v1.16b	//AES block 1 - round 1
	ldr q14, [x3, #80]	//load h3l | h3h
#ifndef __AARCH64EB__
	ext v14.16b, v14.16b, v14.16b, #8
#endif
	aese v3.16b, v18.16b
	aesmc v3.16b, v3.16b	//AES block 3 - round 0
	ld1 {v25.4s}, [x8], #16	//load rk7

	aese v2.16b, v19.16b
	aesmc v2.16b, v2.16b	//AES block 2 - round 1
	ld1 {v26.4s}, [x8], #16	//load rk8

	aese v1.16b, v20.16b
	aesmc v1.16b, v1.16b	//AES block 1 - round 2
	ldr q13, [x3, #64]	//load h2l | h2h
#ifndef __AARCH64EB__
	ext v13.16b, v13.16b, v13.16b, #8
#endif
	aese v3.16b, v19.16b
	aesmc v3.16b, v3.16b	//AES block 3 - round 1
	ld1 {v27.4s}, [x8], #16	//load rk9

	aese v2.16b, v20.16b
	aesmc v2.16b, v2.16b	//AES block 2 - round 2
	ldr q15, [x3, #112]	//load h4l | h4h
#ifndef __AARCH64EB__
	ext v15.16b, v15.16b, v15.16b, #8
#endif
	aese v1.16b, v21.16b
	aesmc v1.16b, v1.16b	//AES block 1 - round 3
	ld1 {v28.4s}, [x8], #16	//load rk10

	aese v3.16b, v20.16b
	aesmc v3.16b, v3.16b	//AES block 3 - round 2
	ld1 {v29.4s}, [x8], #16	//load rk11

	aese v2.16b, v21.16b
	aesmc v2.16b, v2.16b	//AES block 2 - round 3
	add w12, w12, #1	//CTR block 3

	aese v0.16b, v21.16b
	aesmc v0.16b, v0.16b	//AES block 0 - round 3

	aese v3.16b, v21.16b
	aesmc v3.16b, v3.16b	//AES block 3 - round 3
	// Load the current hash accumulator and convert it to the internal
	// layout (halves swapped, bytes reversed within each half); the inverse
	// transform is applied before it is stored back at the end.
	ld1 { v11.16b}, [x3]
	ext v11.16b, v11.16b, v11.16b, #8
	rev64 v11.16b, v11.16b

	aese v2.16b, v22.16b
	aesmc v2.16b, v2.16b	//AES block 2 - round 4

	aese v0.16b, v22.16b
	aesmc v0.16b, v0.16b	//AES block 0 - round 4

	aese v1.16b, v22.16b
	aesmc v1.16b, v1.16b	//AES block 1 - round 4

	aese v3.16b, v22.16b
	aesmc v3.16b, v3.16b	//AES block 3 - round 4

	aese v0.16b, v23.16b
	aesmc v0.16b, v0.16b	//AES block 0 - round 5

	aese v1.16b, v23.16b
	aesmc v1.16b, v1.16b	//AES block 1 - round 5

	aese v3.16b, v23.16b
	aesmc v3.16b, v3.16b	//AES block 3 - round 5

	aese v2.16b, v23.16b
	aesmc v2.16b, v2.16b	//AES block 2 - round 5

	aese v1.16b, v24.16b
	aesmc v1.16b, v1.16b	//AES block 1 - round 6
	trn2 v17.2d, v14.2d, v15.2d	//h4l | h3l

	aese v3.16b, v24.16b
	aesmc v3.16b, v3.16b	//AES block 3 - round 6
	ld1 {v30.4s}, [x8], #16	//load rk12

	aese v0.16b, v24.16b
	aesmc v0.16b, v0.16b	//AES block 0 - round 6
	ldr q12, [x3, #32]	//load h1l | h1h
#ifndef __AARCH64EB__
	ext v12.16b, v12.16b, v12.16b, #8
#endif
	aese v2.16b, v24.16b
	aesmc v2.16b, v2.16b	//AES block 2 - round 6
	ld1 {v31.4s}, [x8], #16	//load rk13

	aese v1.16b, v25.16b
	aesmc v1.16b, v1.16b	//AES block 1 - round 7
	trn1 v9.2d, v14.2d, v15.2d	//h4h | h3h

	aese v0.16b, v25.16b
	aesmc v0.16b, v0.16b	//AES block 0 - round 7

	aese v2.16b, v25.16b
	aesmc v2.16b, v2.16b	//AES block 2 - round 7

	aese v3.16b, v25.16b
	aesmc v3.16b, v3.16b	//AES block 3 - round 7
	trn2 v16.2d, v12.2d, v13.2d	//h2l | h1l

	aese v1.16b, v26.16b
	aesmc v1.16b, v1.16b	//AES block 1 - round 8

	aese v2.16b, v26.16b
	aesmc v2.16b, v2.16b	//AES block 2 - round 8

	aese v3.16b, v26.16b
	aesmc v3.16b, v3.16b	//AES block 3 - round 8

	aese v1.16b, v27.16b
	aesmc v1.16b, v1.16b	//AES block 1 - round 9

	aese v2.16b, v27.16b
	aesmc v2.16b, v2.16b	//AES block 2 - round 9

	aese v0.16b, v26.16b
	aesmc v0.16b, v0.16b	//AES block 0 - round 8

	aese v1.16b, v28.16b
	aesmc v1.16b, v1.16b	//AES block 1 - round 10

	aese v3.16b, v27.16b
	aesmc v3.16b, v3.16b	//AES block 3 - round 9

	aese v0.16b, v27.16b
	aesmc v0.16b, v0.16b	//AES block 0 - round 9

	aese v2.16b, v28.16b
	aesmc v2.16b, v2.16b	//AES block 2 - round 10

	aese v3.16b, v28.16b
	aesmc v3.16b, v3.16b	//AES block 3 - round 10

	aese v1.16b, v29.16b
	aesmc v1.16b, v1.16b	//AES block 1 - round 11

	aese v2.16b, v29.16b
	aesmc v2.16b, v2.16b	//AES block 2 - round 11

	aese v0.16b, v28.16b
	aesmc v0.16b, v0.16b	//AES block 0 - round 10

	aese v1.16b, v30.16b
	aesmc v1.16b, v1.16b	//AES block 1 - round 12

	aese v2.16b, v30.16b
	aesmc v2.16b, v2.16b	//AES block 2 - round 12

	aese v0.16b, v29.16b
	aesmc v0.16b, v0.16b	//AES block 0 - round 11
	eor v17.16b, v17.16b, v9.16b	//h4k | h3k

	aese v3.16b, v29.16b
	aesmc v3.16b, v3.16b	//AES block 3 - round 11

	aese v2.16b, v31.16b	//AES block 2 - round 13
	trn1 v8.2d, v12.2d, v13.2d	//h2h | h1h

	aese v0.16b, v30.16b
	aesmc v0.16b, v0.16b	//AES block 0 - round 12

	aese v3.16b, v30.16b
	aesmc v3.16b, v3.16b	//AES block 3 - round 12

	aese v1.16b, v31.16b	//AES block 1 - round 13

	aese v0.16b, v31.16b	//AES block 0 - round 13

	aese v3.16b, v31.16b	//AES block 3 - round 13
	eor v16.16b, v16.16b, v8.16b	//h2k | h1k
	b.ge .L256_enc_tail	//handle tail (uses flags from "cmp x0, x5" above)

	// More than 4 blocks: finish blocks 0-3 (XOR plaintext with rk14 in
	// general registers, then with the round-13 state) and set up the
	// counters for blocks 4-7.
	ldp x19, x20, [x0, #16]	//AES block 1 - load plaintext
#ifdef __AARCH64EB__
	rev x19, x19
	rev x20, x20
#endif
	rev w9, w12	//CTR block 4
	ldp x6, x7, [x0, #0]	//AES block 0 - load plaintext
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	ldp x23, x24, [x0, #48]	//AES block 3 - load plaintext
#ifdef __AARCH64EB__
	rev x23, x23
	rev x24, x24
#endif
	ldp x21, x22, [x0, #32]	//AES block 2 - load plaintext
#ifdef __AARCH64EB__
	rev x21, x21
	rev x22, x22
#endif
	add x0, x0, #64	//AES input_ptr update

	eor x19, x19, x13	//AES block 1 - round 14 low
	eor x20, x20, x14	//AES block 1 - round 14 high

	fmov d5, x19	//AES block 1 - mov low
	eor x6, x6, x13	//AES block 0 - round 14 low

	eor x7, x7, x14	//AES block 0 - round 14 high
	eor x24, x24, x14	//AES block 3 - round 14 high
	fmov d4, x6	//AES block 0 - mov low

	// Flags from this cmp are consumed by "b.ge .L256_enc_prepretail" below.
	cmp x0, x5	//check if we have <= 8 blocks
	fmov v4.d[1], x7	//AES block 0 - mov high
	eor x23, x23, x13	//AES block 3 - round 14 low

	eor x21, x21, x13	//AES block 2 - round 14 low
	fmov v5.d[1], x20	//AES block 1 - mov high

	fmov d6, x21	//AES block 2 - mov low
	add w12, w12, #1	//CTR block 4

	orr x9, x11, x9, lsl #32	//CTR block 4
	fmov d7, x23	//AES block 3 - mov low
	eor x22, x22, x14	//AES block 2 - round 14 high

	fmov v6.d[1], x22	//AES block 2 - mov high

	eor v4.16b, v4.16b, v0.16b	//AES block 0 - result
	fmov d0, x10	//CTR block 4

	fmov v0.d[1], x9	//CTR block 4
	rev w9, w12	//CTR block 5
	add w12, w12, #1	//CTR block 5

	eor v5.16b, v5.16b, v1.16b	//AES block 1 - result
	fmov d1, x10	//CTR block 5
	orr x9, x11, x9, lsl #32	//CTR block 5

	fmov v1.d[1], x9	//CTR block 5
	rev w9, w12	//CTR block 6
	st1 { v4.16b}, [x2], #16	//AES block 0 - store result

	fmov v7.d[1], x24	//AES block 3 - mov high
	orr x9, x11, x9, lsl #32	//CTR block 6
	eor v6.16b, v6.16b, v2.16b	//AES block 2 - result

	st1 { v5.16b}, [x2], #16	//AES block 1 - store result

	add w12, w12, #1	//CTR block 6
	fmov d2, x10	//CTR block 6

	fmov v2.d[1], x9	//CTR block 6
	st1 { v6.16b}, [x2], #16	//AES block 2 - store result
	rev w9, w12	//CTR block 7

	orr x9, x11, x9, lsl #32	//CTR block 7

	eor v7.16b, v7.16b, v3.16b	//AES block 3 - result
	st1 { v7.16b}, [x2], #16	//AES block 3 - store result
	b.ge .L256_enc_prepretail	//do prepretail

	// Main loop: each iteration encrypts blocks 4k+4..4k+7 (v0-v3) while
	// hashing the four result blocks of the previous group (v4-v7) and
	// reducing the hash into v11.
.L256_enc_main_loop:	//main loop start
	aese v0.16b, v18.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 0
	rev64 v4.16b, v4.16b	//GHASH block 4k (only t0 is free)

	aese v1.16b, v18.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 0
	fmov d3, x10	//CTR block 4k+7

	aese v2.16b, v18.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 0
	ext v11.16b, v11.16b, v11.16b, #8	//PRE 0

	aese v0.16b, v19.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 1
	fmov v3.d[1], x9	//CTR block 4k+7

	aese v1.16b, v19.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 1
	ldp x23, x24, [x0, #48]	//AES block 4k+7 - load plaintext
#ifdef __AARCH64EB__
	rev x23, x23
	rev x24, x24
#endif
	aese v2.16b, v19.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 1
	ldp x21, x22, [x0, #32]	//AES block 4k+6 - load plaintext
#ifdef __AARCH64EB__
	rev x21, x21
	rev x22, x22
#endif
	aese v0.16b, v20.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 2
	eor v4.16b, v4.16b, v11.16b	//PRE 1

	aese v1.16b, v20.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 2

	aese v3.16b, v18.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 0
	eor x23, x23, x13	//AES block 4k+7 - round 14 low

	aese v0.16b, v21.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 3
	mov d10, v17.d[1]	//GHASH block 4k - mid

	pmull2 v9.1q, v4.2d, v15.2d	//GHASH block 4k - high
	eor x22, x22, x14	//AES block 4k+6 - round 14 high
	mov d8, v4.d[1]	//GHASH block 4k - mid

	aese v3.16b, v19.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 1
	rev64 v5.16b, v5.16b	//GHASH block 4k+1 (t0 and t1 free)

	aese v0.16b, v22.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 4

	pmull v11.1q, v4.1d, v15.1d	//GHASH block 4k - low
	eor v8.8b, v8.8b, v4.8b	//GHASH block 4k - mid

	aese v2.16b, v20.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 2

	aese v0.16b, v23.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 5
	rev64 v7.16b, v7.16b	//GHASH block 4k+3 (t0, t1, t2 and t3 free)

	pmull2 v4.1q, v5.2d, v14.2d	//GHASH block 4k+1 - high

	pmull v10.1q, v8.1d, v10.1d	//GHASH block 4k - mid
	rev64 v6.16b, v6.16b	//GHASH block 4k+2 (t0, t1, and t2 free)

	pmull v8.1q, v5.1d, v14.1d	//GHASH block 4k+1 - low

	eor v9.16b, v9.16b, v4.16b	//GHASH block 4k+1 - high
	mov d4, v5.d[1]	//GHASH block 4k+1 - mid

	aese v1.16b, v21.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 3

	aese v3.16b, v20.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 2
	eor v11.16b, v11.16b, v8.16b	//GHASH block 4k+1 - low

	aese v2.16b, v21.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 3

	aese v1.16b, v22.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 4
	mov d8, v6.d[1]	//GHASH block 4k+2 - mid

	aese v3.16b, v21.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 3
	eor v4.8b, v4.8b, v5.8b	//GHASH block 4k+1 - mid

	aese v2.16b, v22.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 4

	aese v0.16b, v24.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 6
	eor v8.8b, v8.8b, v6.8b	//GHASH block 4k+2 - mid

	aese v3.16b, v22.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 4

	pmull v4.1q, v4.1d, v17.1d	//GHASH block 4k+1 - mid

	aese v0.16b, v25.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 7

	aese v3.16b, v23.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 5
	ins v8.d[1], v8.d[0]	//GHASH block 4k+2 - mid

	aese v1.16b, v23.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 5

	aese v0.16b, v26.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 8

	aese v2.16b, v23.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 5

	aese v1.16b, v24.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 6
	eor v10.16b, v10.16b, v4.16b	//GHASH block 4k+1 - mid

	pmull2 v4.1q, v6.2d, v13.2d	//GHASH block 4k+2 - high

	pmull v5.1q, v6.1d, v13.1d	//GHASH block 4k+2 - low

	aese v1.16b, v25.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 7

	pmull v6.1q, v7.1d, v12.1d	//GHASH block 4k+3 - low
	eor v9.16b, v9.16b, v4.16b	//GHASH block 4k+2 - high

	aese v3.16b, v24.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 6
	ldp x19, x20, [x0, #16]	//AES block 4k+5 - load plaintext
#ifdef __AARCH64EB__
	rev x19, x19
	rev x20, x20
#endif
	aese v1.16b, v26.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 8
	mov d4, v7.d[1]	//GHASH block 4k+3 - mid

	aese v2.16b, v24.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 6
	eor v11.16b, v11.16b, v5.16b	//GHASH block 4k+2 - low

	pmull2 v8.1q, v8.2d, v16.2d	//GHASH block 4k+2 - mid

	pmull2 v5.1q, v7.2d, v12.2d	//GHASH block 4k+3 - high
	eor v4.8b, v4.8b, v7.8b	//GHASH block 4k+3 - mid

	aese v2.16b, v25.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 7
	eor x19, x19, x13	//AES block 4k+5 - round 14 low

	aese v1.16b, v27.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 9
	eor v10.16b, v10.16b, v8.16b	//GHASH block 4k+2 - mid

	aese v3.16b, v25.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 7
	eor x21, x21, x13	//AES block 4k+6 - round 14 low

	aese v0.16b, v27.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 9
	movi v8.8b, #0xc2

	pmull v4.1q, v4.1d, v16.1d	//GHASH block 4k+3 - mid
	eor v9.16b, v9.16b, v5.16b	//GHASH block 4k+3 - high
	fmov d5, x19	//AES block 4k+5 - mov low

	aese v2.16b, v26.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 8
	ldp x6, x7, [x0, #0]	//AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	aese v0.16b, v28.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 10
	shl d8, d8, #56	//mod_constant

	aese v3.16b, v26.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 8
	eor v11.16b, v11.16b, v6.16b	//GHASH block 4k+3 - low

	aese v2.16b, v27.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 9

	aese v1.16b, v28.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 10
	eor v10.16b, v10.16b, v4.16b	//GHASH block 4k+3 - mid

	aese v3.16b, v27.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 9
	add w12, w12, #1	//CTR block 4k+7

	aese v0.16b, v29.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 11
	eor v4.16b, v11.16b, v9.16b	//MODULO - karatsuba tidy up

	aese v1.16b, v29.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 11
	add x0, x0, #64	//AES input_ptr update

	pmull v7.1q, v9.1d, v8.1d	//MODULO - top 64b align with mid
	rev w9, w12	//CTR block 4k+8
	ext v9.16b, v9.16b, v9.16b, #8	//MODULO - other top alignment

	aese v2.16b, v28.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 10
	eor x6, x6, x13	//AES block 4k+4 - round 14 low

	aese v1.16b, v30.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 12
	eor v10.16b, v10.16b, v4.16b	//MODULO - karatsuba tidy up

	aese v3.16b, v28.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 10
	eor x7, x7, x14	//AES block 4k+4 - round 14 high

	fmov d4, x6	//AES block 4k+4 - mov low
	orr x9, x11, x9, lsl #32	//CTR block 4k+8
	eor v7.16b, v9.16b, v7.16b	//MODULO - fold into mid

	aese v0.16b, v30.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 12
	eor x20, x20, x14	//AES block 4k+5 - round 14 high

	aese v2.16b, v29.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 11
	eor x24, x24, x14	//AES block 4k+7 - round 14 high

	aese v3.16b, v29.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 11
	add w12, w12, #1	//CTR block 4k+8

	aese v0.16b, v31.16b	//AES block 4k+4 - round 13
	fmov v4.d[1], x7	//AES block 4k+4 - mov high
	eor v10.16b, v10.16b, v7.16b	//MODULO - fold into mid

	aese v2.16b, v30.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 12
	fmov d7, x23	//AES block 4k+7 - mov low

	aese v1.16b, v31.16b	//AES block 4k+5 - round 13
	fmov v5.d[1], x20	//AES block 4k+5 - mov high

	fmov d6, x21	//AES block 4k+6 - mov low
	// Flags from this cmp are consumed by "b.lt .L256_enc_main_loop" at the
	// bottom of the loop; nothing in between writes the condition flags.
	cmp x0, x5	//LOOP CONTROL

	fmov v6.d[1], x22	//AES block 4k+6 - mov high

	pmull v9.1q, v10.1d, v8.1d	//MODULO - mid 64b align with low
	eor v4.16b, v4.16b, v0.16b	//AES block 4k+4 - result
	fmov d0, x10	//CTR block 4k+8

	fmov v0.d[1], x9	//CTR block 4k+8
	rev w9, w12	//CTR block 4k+9
	add w12, w12, #1	//CTR block 4k+9

	eor v5.16b, v5.16b, v1.16b	//AES block 4k+5 - result
	fmov d1, x10	//CTR block 4k+9
	orr x9, x11, x9, lsl #32	//CTR block 4k+9

	aese v3.16b, v30.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 12
	fmov v1.d[1], x9	//CTR block 4k+9

	aese v2.16b, v31.16b	//AES block 4k+6 - round 13
	rev w9, w12	//CTR block 4k+10
	st1 { v4.16b}, [x2], #16	//AES block 4k+4 - store result

	orr x9, x11, x9, lsl #32	//CTR block 4k+10
	eor v11.16b, v11.16b, v9.16b	//MODULO - fold into low
	fmov v7.d[1], x24	//AES block 4k+7 - mov high

	ext v10.16b, v10.16b, v10.16b, #8	//MODULO - other mid alignment
	st1 { v5.16b}, [x2], #16	//AES block 4k+5 - store result
	add w12, w12, #1	//CTR block 4k+10

	aese v3.16b, v31.16b	//AES block 4k+7 - round 13
	eor v6.16b, v6.16b, v2.16b	//AES block 4k+6 - result
	fmov d2, x10	//CTR block 4k+10

	st1 { v6.16b}, [x2], #16	//AES block 4k+6 - store result
	fmov v2.d[1], x9	//CTR block 4k+10
	rev w9, w12	//CTR block 4k+11

	eor v11.16b, v11.16b, v10.16b	//MODULO - fold into low
	orr x9, x11, x9, lsl #32	//CTR block 4k+11

	eor v7.16b, v7.16b, v3.16b	//AES block 4k+7 - result
	st1 { v7.16b}, [x2], #16	//AES block 4k+7 - store result
	b.lt .L256_enc_main_loop

	// Prepretail: hash the last four stored result blocks (v4-v7) and run
	// AES rounds 0-13 on the next four counter blocks, without loading any
	// more input; the tail consumes the resulting keystream.
.L256_enc_prepretail:	//PREPRETAIL
	aese v1.16b, v18.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 0
	rev64 v6.16b, v6.16b	//GHASH block 4k+2 (t0, t1, and t2 free)

	aese v2.16b, v18.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 0
	fmov d3, x10	//CTR block 4k+7

	aese v0.16b, v18.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 0
	rev64 v4.16b, v4.16b	//GHASH block 4k (only t0 is free)

	fmov v3.d[1], x9	//CTR block 4k+7
	ext v11.16b, v11.16b, v11.16b, #8	//PRE 0

	aese v2.16b, v19.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 1

	aese v0.16b, v19.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 1

	eor v4.16b, v4.16b, v11.16b	//PRE 1
	rev64 v5.16b, v5.16b	//GHASH block 4k+1 (t0 and t1 free)

	aese v2.16b, v20.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 2

	aese v3.16b, v18.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 0
	mov d10, v17.d[1]	//GHASH block 4k - mid

	aese v1.16b, v19.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 1

	pmull v11.1q, v4.1d, v15.1d	//GHASH block 4k - low
	mov d8, v4.d[1]	//GHASH block 4k - mid

	pmull2 v9.1q, v4.2d, v15.2d	//GHASH block 4k - high

	aese v2.16b, v21.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 3

	aese v1.16b, v20.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 2
	eor v8.8b, v8.8b, v4.8b	//GHASH block 4k - mid

	aese v0.16b, v20.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 2

	aese v3.16b, v19.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 1

	aese v1.16b, v21.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 3

	pmull v10.1q, v8.1d, v10.1d	//GHASH block 4k - mid

	pmull2 v4.1q, v5.2d, v14.2d	//GHASH block 4k+1 - high

	pmull v8.1q, v5.1d, v14.1d	//GHASH block 4k+1 - low

	aese v3.16b, v20.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 2

	eor v9.16b, v9.16b, v4.16b	//GHASH block 4k+1 - high
	mov d4, v5.d[1]	//GHASH block 4k+1 - mid

	aese v0.16b, v21.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 3
	eor v11.16b, v11.16b, v8.16b	//GHASH block 4k+1 - low

	aese v3.16b, v21.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 3

	eor v4.8b, v4.8b, v5.8b	//GHASH block 4k+1 - mid
	mov d8, v6.d[1]	//GHASH block 4k+2 - mid

	aese v0.16b, v22.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 4
	rev64 v7.16b, v7.16b	//GHASH block 4k+3 (t0, t1, t2 and t3 free)

	aese v3.16b, v22.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 4

	pmull v4.1q, v4.1d, v17.1d	//GHASH block 4k+1 - mid
	eor v8.8b, v8.8b, v6.8b	//GHASH block 4k+2 - mid
	add w12, w12, #1	//CTR block 4k+7

	pmull v5.1q, v6.1d, v13.1d	//GHASH block 4k+2 - low

	aese v3.16b, v23.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 5

	aese v2.16b, v22.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 4
	eor v10.16b, v10.16b, v4.16b	//GHASH block 4k+1 - mid

	pmull2 v4.1q, v6.2d, v13.2d	//GHASH block 4k+2 - high

	eor v11.16b, v11.16b, v5.16b	//GHASH block 4k+2 - low
	ins v8.d[1], v8.d[0]	//GHASH block 4k+2 - mid

	aese v2.16b, v23.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 5

	eor v9.16b, v9.16b, v4.16b	//GHASH block 4k+2 - high
	mov d4, v7.d[1]	//GHASH block 4k+3 - mid

	aese v1.16b, v22.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 4

	pmull2 v8.1q, v8.2d, v16.2d	//GHASH block 4k+2 - mid

	eor v4.8b, v4.8b, v7.8b	//GHASH block 4k+3 - mid

	pmull2 v5.1q, v7.2d, v12.2d	//GHASH block 4k+3 - high

	aese v1.16b, v23.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 5

	pmull v4.1q, v4.1d, v16.1d	//GHASH block 4k+3 - mid
	eor v10.16b, v10.16b, v8.16b	//GHASH block 4k+2 - mid

	aese v0.16b, v23.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 5

	aese v1.16b, v24.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 6

	aese v2.16b, v24.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 6

	aese v0.16b, v24.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 6
	movi v8.8b, #0xc2

	aese v3.16b, v24.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 6

	aese v1.16b, v25.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 7
	eor v9.16b, v9.16b, v5.16b	//GHASH block 4k+3 - high

	aese v0.16b, v25.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 7

	aese v3.16b, v25.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 7
	shl d8, d8, #56	//mod_constant

	aese v1.16b, v26.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 8
	eor v10.16b, v10.16b, v4.16b	//GHASH block 4k+3 - mid

	pmull v6.1q, v7.1d, v12.1d	//GHASH block 4k+3 - low

	aese v3.16b, v26.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 8

	aese v1.16b, v27.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 9

	aese v0.16b, v26.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 8
	eor v11.16b, v11.16b, v6.16b	//GHASH block 4k+3 - low

	aese v3.16b, v27.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 9

	eor v10.16b, v10.16b, v9.16b	//karatsuba tidy up

	pmull v4.1q, v9.1d, v8.1d	//MODULO - top 64b align with mid
	ext v9.16b, v9.16b, v9.16b, #8	//MODULO - other top alignment

	aese v3.16b, v28.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 10

	aese v2.16b, v25.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 7
	eor v10.16b, v10.16b, v11.16b	//MODULO - karatsuba tidy up

	aese v1.16b, v28.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 10

	aese v0.16b, v27.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 9

	aese v2.16b, v26.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 8

	aese v1.16b, v29.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 11
	eor v10.16b, v10.16b, v4.16b	//MODULO - fold into mid

	aese v0.16b, v28.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 10

	aese v2.16b, v27.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 9

	aese v1.16b, v30.16b
	aesmc v1.16b, v1.16b	//AES block 4k+5 - round 12

	aese v0.16b, v29.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 11
	eor v10.16b, v10.16b, v9.16b	//MODULO - fold into mid

	aese v3.16b, v29.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 11

	aese v2.16b, v28.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 10

	aese v0.16b, v30.16b
	aesmc v0.16b, v0.16b	//AES block 4k+4 - round 12

	pmull v4.1q, v10.1d, v8.1d	//MODULO - mid 64b align with low

	aese v2.16b, v29.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 11
	ext v10.16b, v10.16b, v10.16b, #8	//MODULO - other mid alignment

	aese v3.16b, v30.16b
	aesmc v3.16b, v3.16b	//AES block 4k+7 - round 12

	aese v1.16b, v31.16b	//AES block 4k+5 - round 13
	eor v11.16b, v11.16b, v4.16b	//MODULO - fold into low

	aese v2.16b, v30.16b
	aesmc v2.16b, v2.16b	//AES block 4k+6 - round 12

	aese v3.16b, v31.16b	//AES block 4k+7 - round 13

	aese v0.16b, v31.16b	//AES block 4k+4 - round 13

	aese v2.16b, v31.16b	//AES block 4k+6 - round 13
	eor v11.16b, v11.16b, v10.16b	//MODULO - fold into low
	// Tail: 1 to 4 blocks remain (the last one possibly partial). v0-v3
	// hold keystream after round 13 for up to four blocks; w12 has already
	// been advanced as if all four were used.
.L256_enc_tail:	//TAIL

	ext v8.16b, v11.16b, v11.16b, #8	//prepare final partial tag
	sub x5, x4, x0	//x5 (was main_end_input_ptr) = end_input_ptr - input_ptr = number of bytes left to process
	ldp x6, x7, [x0], #16	//AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	eor x6, x6, x13	//AES block 4k+4 - round 14 low
	eor x7, x7, x14	//AES block 4k+4 - round 14 high

	cmp x5, #48
	fmov d4, x6	//AES block 4k+4 - mov low

	fmov v4.d[1], x7	//AES block 4k+4 - mov high

	eor v5.16b, v4.16b, v0.16b	//AES block 4k+4 - result
	b.gt .L256_enc_blocks_more_than_3

	// Fewer than four blocks left: shift the keystream registers down so the
	// later stages still find their keystream in v2/v3, zero the GHASH
	// partial products that the skipped stage would have initialised, and
	// step the counter back once per unused block.
	cmp x5, #32
	mov v3.16b, v2.16b
	movi v11.8b, #0

	movi v9.8b, #0
	sub w12, w12, #1

	mov v2.16b, v1.16b
	movi v10.8b, #0
	b.gt .L256_enc_blocks_more_than_2

	mov v3.16b, v1.16b
	sub w12, w12, #1
	cmp x5, #16

	b.gt .L256_enc_blocks_more_than_1

	sub w12, w12, #1
	b .L256_enc_blocks_less_than_1
.L256_enc_blocks_more_than_3:	//blocks left > 3
	st1 { v5.16b}, [x2], #16	//AES final-3 block - store result

	ldp x6, x7, [x0], #16	//AES final-2 block - load input low & high
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	rev64 v4.16b, v5.16b	//GHASH final-3 block

	eor x6, x6, x13	//AES final-2 block - round 14 low
	eor v4.16b, v4.16b, v8.16b	//feed in partial tag

	eor x7, x7, x14	//AES final-2 block - round 14 high

	mov d22, v4.d[1]	//GHASH final-3 block - mid
	fmov d5, x6	//AES final-2 block - mov low

	fmov v5.d[1], x7	//AES final-2 block - mov high

	eor v22.8b, v22.8b, v4.8b	//GHASH final-3 block - mid
	movi v8.8b, #0	//suppress further partial tag feed in

	mov d10, v17.d[1]	//GHASH final-3 block - mid

	pmull v11.1q, v4.1d, v15.1d	//GHASH final-3 block - low

	pmull2 v9.1q, v4.2d, v15.2d	//GHASH final-3 block - high

	pmull v10.1q, v22.1d, v10.1d	//GHASH final-3 block - mid
	eor v5.16b, v5.16b, v1.16b	//AES final-2 block - result
.L256_enc_blocks_more_than_2:	//blocks left > 2

	st1 { v5.16b}, [x2], #16	//AES final-2 block - store result

	ldp x6, x7, [x0], #16	//AES final-1 block - load input low & high
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	rev64 v4.16b, v5.16b	//GHASH final-2 block

	eor x6, x6, x13	//AES final-1 block - round 14 low
	eor v4.16b, v4.16b, v8.16b	//feed in partial tag

	fmov d5, x6	//AES final-1 block - mov low
	eor x7, x7, x14	//AES final-1 block - round 14 high

	fmov v5.d[1], x7	//AES final-1 block - mov high

	movi v8.8b, #0	//suppress further partial tag feed in

	pmull2 v20.1q, v4.2d, v14.2d	//GHASH final-2 block - high
	mov d22, v4.d[1]	//GHASH final-2 block - mid

	pmull v21.1q, v4.1d, v14.1d	//GHASH final-2 block - low

	eor v22.8b, v22.8b, v4.8b	//GHASH final-2 block - mid

	eor v5.16b, v5.16b, v2.16b	//AES final-1 block - result

	eor v9.16b, v9.16b, v20.16b	//GHASH final-2 block - high

	pmull v22.1q, v22.1d, v17.1d	//GHASH final-2 block - mid

	eor v11.16b, v11.16b, v21.16b	//GHASH final-2 block - low

	eor v10.16b, v10.16b, v22.16b	//GHASH final-2 block - mid
.L256_enc_blocks_more_than_1:	//blocks left > 1

	st1 { v5.16b}, [x2], #16	//AES final-1 block - store result

	rev64 v4.16b, v5.16b	//GHASH final-1 block

	ldp x6, x7, [x0], #16	//AES final block - load input low & high
#ifdef __AARCH64EB__
	rev x6, x6
	rev x7, x7
#endif
	eor v4.16b, v4.16b, v8.16b	//feed in partial tag

	movi v8.8b, #0	//suppress further partial tag feed in

	eor x6, x6, x13	//AES final block - round 14 low
	mov d22, v4.d[1]	//GHASH final-1 block - mid

	pmull2 v20.1q, v4.2d, v13.2d	//GHASH final-1 block - high
	eor x7, x7, x14	//AES final block - round 14 high

	eor v22.8b, v22.8b, v4.8b	//GHASH final-1 block - mid

	eor v9.16b, v9.16b, v20.16b	//GHASH final-1 block - high

	ins v22.d[1], v22.d[0]	//GHASH final-1 block - mid
	fmov d5, x6	//AES final block - mov low

	fmov v5.d[1], x7	//AES final block - mov high

	pmull2 v22.1q, v22.2d, v16.2d	//GHASH final-1 block - mid

	pmull v21.1q, v4.1d, v13.1d	//GHASH final-1 block - low

	eor v5.16b, v5.16b, v3.16b	//AES final block - result
	eor v10.16b, v10.16b, v22.16b	//GHASH final-1 block - mid

	eor v11.16b, v11.16b, v21.16b	//GHASH final-1 block - low
	// Final block: build a 128-bit mask in v0 covering only the valid bits,
	// apply it to the result before hashing, then merge the existing output
	// bytes back in so bytes past the end of the message are preserved.
	// x13/x14 (rk14) are no longer needed and are reused as mask scratch.
.L256_enc_blocks_less_than_1:	//blocks left <= 1

	and x1, x1, #127	//bit_length %= 128

	mvn x13, xzr	//rk14_l = 0xffffffffffffffff
	sub x1, x1, #128	//bit_length -= 128

	neg x1, x1	//bit_length = 128 - #bits in input (in range [1,128])
	ld1 { v18.16b}, [x2]	//load existing bytes where the possibly partial last block is to be stored

	mvn x14, xzr	//rk14_h = 0xffffffffffffffff
	and x1, x1, #127	//bit_length %= 128

	lsr x14, x14, x1	//rk14_h is mask for top 64b of last block
	cmp x1, #64

	// Fewer than 64 bits to drop: low half fully valid, high half = shifted mask.
	// Otherwise: low half = shifted mask, high half entirely masked off.
	csel x6, x13, x14, lt
	csel x7, x14, xzr, lt

	fmov d0, x6	//ctr0b is mask for last block

	fmov v0.d[1], x7

	and v5.16b, v5.16b, v0.16b	//possibly partial last block has zeroes in highest bits

	rev64 v4.16b, v5.16b	//GHASH final block

	eor v4.16b, v4.16b, v8.16b	//feed in partial tag

	bif v5.16b, v18.16b, v0.16b	//insert existing bytes in top end of result before storing

	pmull2 v20.1q, v4.2d, v12.2d	//GHASH final block - high
	mov d8, v4.d[1]	//GHASH final block - mid
	// w9 = counter word in the byte order expected in memory (stored below).
#ifndef __AARCH64EB__
	rev w9, w12
#else
	mov w9, w12
#endif

	pmull v21.1q, v4.1d, v12.1d	//GHASH final block - low

	eor v9.16b, v9.16b, v20.16b	//GHASH final block - high
	eor v8.8b, v8.8b, v4.8b	//GHASH final block - mid

	pmull v8.1q, v8.1d, v16.1d	//GHASH final block - mid

	eor v11.16b, v11.16b, v21.16b	//GHASH final block - low

	eor v10.16b, v10.16b, v8.16b	//GHASH final block - mid
	movi v8.8b, #0xc2

	eor v4.16b, v11.16b, v9.16b	//MODULO - karatsuba tidy up

	shl d8, d8, #56	//mod_constant

	eor v10.16b, v10.16b, v4.16b	//MODULO - karatsuba tidy up

	pmull v7.1q, v9.1d, v8.1d	//MODULO - top 64b align with mid

	ext v9.16b, v9.16b, v9.16b, #8	//MODULO - other top alignment

	eor v10.16b, v10.16b, v7.16b	//MODULO - fold into mid

	eor v10.16b, v10.16b, v9.16b	//MODULO - fold into mid

	pmull v9.1q, v10.1d, v8.1d	//MODULO - mid 64b align with low

	ext v10.16b, v10.16b, v10.16b, #8	//MODULO - other mid alignment

	str w9, [x16, #12]	//store the updated counter

	st1 { v5.16b}, [x2]	//store all 16B
	eor v11.16b, v11.16b, v9.16b	//MODULO - fold into low

	eor v11.16b, v11.16b, v10.16b	//MODULO - fold into low
	// Convert the accumulator back to its in-memory layout and store it.
	ext v11.16b, v11.16b, v11.16b, #8
	rev64 v11.16b, v11.16b
	mov x0, x15	//return byte_len
	st1 { v11.16b }, [x3]

	ldp x21, x22, [sp, #16]
	ldp x23, x24, [sp, #32]
	ldp d8, d9, [sp, #48]
	ldp d10, d11, [sp, #64]
	ldp d12, d13, [sp, #80]
	ldp d14, d15, [sp, #96]
	ldp x19, x20, [sp], #112
	ret

	// Zero-length input: nothing was pushed, return 0.
.L256_enc_ret:
	mov w0, #0x0
	ret
.size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
.globl aes_gcm_dec_256_kernel
.type aes_gcm_dec_256_kernel,%function
.align 4
aes_gcm_dec_256_kernel:
	AARCH64_VALID_CALL_TARGET
	cbz x1, .L256_dec_ret
	stp x19, x20, [sp, #-112]!
5241 mov x16, x4 5242 mov x8, x5 5243 stp x21, x22, [sp, #16] 5244 stp x23, x24, [sp, #32] 5245 stp d8, d9, [sp, #48] 5246 stp d10, d11, [sp, #64] 5247 stp d12, d13, [sp, #80] 5248 stp d14, d15, [sp, #96] 5249 5250 lsr x5, x1, #3 //byte_len 5251 mov x15, x5 5252 ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 5253#ifdef __AARCH64EB__ 5254 rev x10, x10 5255 rev x11, x11 5256#endif 5257 ldp x13, x14, [x8, #224] //load rk14 5258#ifdef __AARCH64EB__ 5259 ror x14, x14, #32 5260 ror x13, x13, #32 5261#endif 5262 ld1 {v18.4s}, [x8], #16 //load rk0 5263 sub x5, x5, #1 //byte_len - 1 5264 5265 ld1 {v19.4s}, [x8], #16 //load rk1 5266 and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 5267 5268 add x4, x0, x1, lsr #3 //end_input_ptr 5269 ld1 {v20.4s}, [x8], #16 //load rk2 5270 5271 lsr x12, x11, #32 5272 ld1 {v21.4s}, [x8], #16 //load rk3 5273 orr w11, w11, w11 5274 5275 ld1 {v22.4s}, [x8], #16 //load rk4 5276 add x5, x5, x0 5277 rev w12, w12 //rev_ctr32 5278 5279 add w12, w12, #1 //increment rev_ctr32 5280 fmov d3, x10 //CTR block 3 5281 5282 rev w9, w12 //CTR block 1 5283 add w12, w12, #1 //CTR block 1 5284 fmov d1, x10 //CTR block 1 5285 5286 orr x9, x11, x9, lsl #32 //CTR block 1 5287 ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible 5288 5289 fmov v1.d[1], x9 //CTR block 1 5290 rev w9, w12 //CTR block 2 5291 add w12, w12, #1 //CTR block 2 5292 5293 fmov d2, x10 //CTR block 2 5294 orr x9, x11, x9, lsl #32 //CTR block 2 5295 5296 fmov v2.d[1], x9 //CTR block 2 5297 rev w9, w12 //CTR block 3 5298 5299 orr x9, x11, x9, lsl #32 //CTR block 3 5300 ld1 {v23.4s}, [x8], #16 //load rk5 5301 5302 fmov v3.d[1], x9 //CTR block 3 5303 add w12, w12, #1 //CTR block 3 5304 5305 ld1 {v24.4s}, [x8], #16 //load rk6 5306 5307 ld1 {v25.4s}, [x8], #16 //load rk7 5308 5309 ld1 {v26.4s}, [x8], #16 //load rk8 5310 5311 aese v0.16b, v18.16b 5312 aesmc v0.16b, v0.16b 
//AES block 0 - round 0 5313 ldr q14, [x3, #80] //load h3l | h3h 5314#ifndef __AARCH64EB__ 5315 ext v14.16b, v14.16b, v14.16b, #8 5316#endif 5317 5318 aese v3.16b, v18.16b 5319 aesmc v3.16b, v3.16b //AES block 3 - round 0 5320 ldr q15, [x3, #112] //load h4l | h4h 5321#ifndef __AARCH64EB__ 5322 ext v15.16b, v15.16b, v15.16b, #8 5323#endif 5324 5325 aese v1.16b, v18.16b 5326 aesmc v1.16b, v1.16b //AES block 1 - round 0 5327 ldr q13, [x3, #64] //load h2l | h2h 5328#ifndef __AARCH64EB__ 5329 ext v13.16b, v13.16b, v13.16b, #8 5330#endif 5331 5332 aese v2.16b, v18.16b 5333 aesmc v2.16b, v2.16b //AES block 2 - round 0 5334 ld1 {v27.4s}, [x8], #16 //load rk9 5335 5336 aese v0.16b, v19.16b 5337 aesmc v0.16b, v0.16b //AES block 0 - round 1 5338 5339 aese v1.16b, v19.16b 5340 aesmc v1.16b, v1.16b //AES block 1 - round 1 5341 ld1 { v11.16b}, [x3] 5342 ext v11.16b, v11.16b, v11.16b, #8 5343 rev64 v11.16b, v11.16b 5344 5345 aese v2.16b, v19.16b 5346 aesmc v2.16b, v2.16b //AES block 2 - round 1 5347 ld1 {v28.4s}, [x8], #16 //load rk10 5348 5349 aese v3.16b, v19.16b 5350 aesmc v3.16b, v3.16b //AES block 3 - round 1 5351 ld1 {v29.4s}, [x8], #16 //load rk11 5352 5353 aese v0.16b, v20.16b 5354 aesmc v0.16b, v0.16b //AES block 0 - round 2 5355 ldr q12, [x3, #32] //load h1l | h1h 5356#ifndef __AARCH64EB__ 5357 ext v12.16b, v12.16b, v12.16b, #8 5358#endif 5359 aese v2.16b, v20.16b 5360 aesmc v2.16b, v2.16b //AES block 2 - round 2 5361 ld1 {v30.4s}, [x8], #16 //load rk12 5362 5363 aese v3.16b, v20.16b 5364 aesmc v3.16b, v3.16b //AES block 3 - round 2 5365 5366 aese v0.16b, v21.16b 5367 aesmc v0.16b, v0.16b //AES block 0 - round 3 5368 5369 aese v1.16b, v20.16b 5370 aesmc v1.16b, v1.16b //AES block 1 - round 2 5371 5372 aese v3.16b, v21.16b 5373 aesmc v3.16b, v3.16b //AES block 3 - round 3 5374 5375 aese v0.16b, v22.16b 5376 aesmc v0.16b, v0.16b //AES block 0 - round 4 5377 cmp x0, x5 //check if we have <= 4 blocks 5378 5379 aese v2.16b, v21.16b 5380 aesmc v2.16b, v2.16b //AES block 2 - 
round 3 5381 5382 aese v1.16b, v21.16b 5383 aesmc v1.16b, v1.16b //AES block 1 - round 3 5384 5385 aese v3.16b, v22.16b 5386 aesmc v3.16b, v3.16b //AES block 3 - round 4 5387 5388 aese v2.16b, v22.16b 5389 aesmc v2.16b, v2.16b //AES block 2 - round 4 5390 5391 aese v1.16b, v22.16b 5392 aesmc v1.16b, v1.16b //AES block 1 - round 4 5393 5394 aese v3.16b, v23.16b 5395 aesmc v3.16b, v3.16b //AES block 3 - round 5 5396 5397 aese v0.16b, v23.16b 5398 aesmc v0.16b, v0.16b //AES block 0 - round 5 5399 5400 aese v1.16b, v23.16b 5401 aesmc v1.16b, v1.16b //AES block 1 - round 5 5402 5403 aese v2.16b, v23.16b 5404 aesmc v2.16b, v2.16b //AES block 2 - round 5 5405 5406 aese v0.16b, v24.16b 5407 aesmc v0.16b, v0.16b //AES block 0 - round 6 5408 5409 aese v3.16b, v24.16b 5410 aesmc v3.16b, v3.16b //AES block 3 - round 6 5411 5412 aese v1.16b, v24.16b 5413 aesmc v1.16b, v1.16b //AES block 1 - round 6 5414 5415 aese v2.16b, v24.16b 5416 aesmc v2.16b, v2.16b //AES block 2 - round 6 5417 5418 aese v0.16b, v25.16b 5419 aesmc v0.16b, v0.16b //AES block 0 - round 7 5420 5421 aese v1.16b, v25.16b 5422 aesmc v1.16b, v1.16b //AES block 1 - round 7 5423 5424 aese v3.16b, v25.16b 5425 aesmc v3.16b, v3.16b //AES block 3 - round 7 5426 5427 aese v0.16b, v26.16b 5428 aesmc v0.16b, v0.16b //AES block 0 - round 8 5429 5430 aese v2.16b, v25.16b 5431 aesmc v2.16b, v2.16b //AES block 2 - round 7 5432 5433 aese v3.16b, v26.16b 5434 aesmc v3.16b, v3.16b //AES block 3 - round 8 5435 5436 aese v1.16b, v26.16b 5437 aesmc v1.16b, v1.16b //AES block 1 - round 8 5438 5439 aese v0.16b, v27.16b 5440 aesmc v0.16b, v0.16b //AES block 0 - round 9 5441 5442 aese v2.16b, v26.16b 5443 aesmc v2.16b, v2.16b //AES block 2 - round 8 5444 ld1 {v31.4s}, [x8], #16 //load rk13 5445 5446 aese v1.16b, v27.16b 5447 aesmc v1.16b, v1.16b //AES block 1 - round 9 5448 5449 aese v0.16b, v28.16b 5450 aesmc v0.16b, v0.16b //AES block 0 - round 10 5451 5452 aese v3.16b, v27.16b 5453 aesmc v3.16b, v3.16b //AES block 3 - round 9 5454 
5455 aese v1.16b, v28.16b 5456 aesmc v1.16b, v1.16b //AES block 1 - round 10 5457 5458 aese v2.16b, v27.16b 5459 aesmc v2.16b, v2.16b //AES block 2 - round 9 5460 5461 aese v3.16b, v28.16b 5462 aesmc v3.16b, v3.16b //AES block 3 - round 10 5463 5464 aese v0.16b, v29.16b 5465 aesmc v0.16b, v0.16b //AES block 0 - round 11 5466 5467 aese v2.16b, v28.16b 5468 aesmc v2.16b, v2.16b //AES block 2 - round 10 5469 5470 aese v3.16b, v29.16b 5471 aesmc v3.16b, v3.16b //AES block 3 - round 11 5472 5473 aese v1.16b, v29.16b 5474 aesmc v1.16b, v1.16b //AES block 1 - round 11 5475 5476 aese v2.16b, v29.16b 5477 aesmc v2.16b, v2.16b //AES block 2 - round 11 5478 5479 trn1 v9.2d, v14.2d, v15.2d //h4h | h3h 5480 5481 trn2 v17.2d, v14.2d, v15.2d //h4l | h3l 5482 5483 trn1 v8.2d, v12.2d, v13.2d //h2h | h1h 5484 trn2 v16.2d, v12.2d, v13.2d //h2l | h1l 5485 5486 aese v1.16b, v30.16b 5487 aesmc v1.16b, v1.16b //AES block 1 - round 12 5488 5489 aese v0.16b, v30.16b 5490 aesmc v0.16b, v0.16b //AES block 0 - round 12 5491 5492 aese v2.16b, v30.16b 5493 aesmc v2.16b, v2.16b //AES block 2 - round 12 5494 5495 aese v3.16b, v30.16b 5496 aesmc v3.16b, v3.16b //AES block 3 - round 12 5497 eor v17.16b, v17.16b, v9.16b //h4k | h3k 5498 5499 aese v1.16b, v31.16b //AES block 1 - round 13 5500 5501 aese v2.16b, v31.16b //AES block 2 - round 13 5502 eor v16.16b, v16.16b, v8.16b //h2k | h1k 5503 5504 aese v3.16b, v31.16b //AES block 3 - round 13 5505 5506 aese v0.16b, v31.16b //AES block 0 - round 13 5507 b.ge .L256_dec_tail //handle tail 5508 5509 ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0,1 - load ciphertext 5510 5511 rev w9, w12 //CTR block 4 5512 5513 eor v0.16b, v4.16b, v0.16b //AES block 0 - result 5514 5515 eor v1.16b, v5.16b, v1.16b //AES block 1 - result 5516 rev64 v5.16b, v5.16b //GHASH block 1 5517 ld1 {v6.16b}, [x0], #16 //AES block 2 - load ciphertext 5518 5519 mov x7, v0.d[1] //AES block 0 - mov high 5520 5521 mov x6, v0.d[0] //AES block 0 - mov low 5522 rev64 v4.16b, v4.16b //GHASH 
block 0 5523 add w12, w12, #1 //CTR block 4 5524 5525 fmov d0, x10 //CTR block 4 5526 orr x9, x11, x9, lsl #32 //CTR block 4 5527 5528 fmov v0.d[1], x9 //CTR block 4 5529 rev w9, w12 //CTR block 5 5530 add w12, w12, #1 //CTR block 5 5531 5532 mov x19, v1.d[0] //AES block 1 - mov low 5533 5534 orr x9, x11, x9, lsl #32 //CTR block 5 5535 mov x20, v1.d[1] //AES block 1 - mov high 5536 eor x7, x7, x14 //AES block 0 - round 14 high 5537#ifdef __AARCH64EB__ 5538 rev x7, x7 5539#endif 5540 eor x6, x6, x13 //AES block 0 - round 14 low 5541#ifdef __AARCH64EB__ 5542 rev x6, x6 5543#endif 5544 stp x6, x7, [x2], #16 //AES block 0 - store result 5545 fmov d1, x10 //CTR block 5 5546 5547 ld1 {v7.16b}, [x0], #16 //AES block 3 - load ciphertext 5548 5549 fmov v1.d[1], x9 //CTR block 5 5550 rev w9, w12 //CTR block 6 5551 add w12, w12, #1 //CTR block 6 5552 5553 eor x19, x19, x13 //AES block 1 - round 14 low 5554#ifdef __AARCH64EB__ 5555 rev x19, x19 5556#endif 5557 orr x9, x11, x9, lsl #32 //CTR block 6 5558 5559 eor x20, x20, x14 //AES block 1 - round 14 high 5560#ifdef __AARCH64EB__ 5561 rev x20, x20 5562#endif 5563 stp x19, x20, [x2], #16 //AES block 1 - store result 5564 5565 eor v2.16b, v6.16b, v2.16b //AES block 2 - result 5566 cmp x0, x5 //check if we have <= 8 blocks 5567 b.ge .L256_dec_prepretail //do prepretail 5568 5569.L256_dec_main_loop: //main loop start 5570 mov x21, v2.d[0] //AES block 4k+2 - mov low 5571 ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 5572 eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result 5573 5574 aese v0.16b, v18.16b 5575 aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 5576 mov x22, v2.d[1] //AES block 4k+2 - mov high 5577 5578 aese v1.16b, v18.16b 5579 aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 5580 fmov d2, x10 //CTR block 4k+6 5581 5582 fmov v2.d[1], x9 //CTR block 4k+6 5583 eor v4.16b, v4.16b, v11.16b //PRE 1 5584 rev w9, w12 //CTR block 4k+7 5585 5586 aese v0.16b, v19.16b 5587 aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 5588 mov x24, 
v3.d[1] //AES block 4k+3 - mov high 5589 5590 aese v1.16b, v19.16b 5591 aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 5592 mov x23, v3.d[0] //AES block 4k+3 - mov low 5593 5594 pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 5595 mov d8, v4.d[1] //GHASH block 4k - mid 5596 fmov d3, x10 //CTR block 4k+7 5597 5598 aese v0.16b, v20.16b 5599 aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 5600 orr x9, x11, x9, lsl #32 //CTR block 4k+7 5601 5602 aese v2.16b, v18.16b 5603 aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 5604 fmov v3.d[1], x9 //CTR block 4k+7 5605 5606 aese v1.16b, v20.16b 5607 aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 5608 eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 5609 5610 aese v0.16b, v21.16b 5611 aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 5612 eor x22, x22, x14 //AES block 4k+2 - round 14 high 5613#ifdef __AARCH64EB__ 5614 rev x22, x22 5615#endif 5616 aese v2.16b, v19.16b 5617 aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 5618 mov d10, v17.d[1] //GHASH block 4k - mid 5619 5620 aese v1.16b, v21.16b 5621 aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 5622 rev64 v6.16b, v6.16b //GHASH block 4k+2 5623 5624 aese v3.16b, v18.16b 5625 aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 5626 eor x21, x21, x13 //AES block 4k+2 - round 14 low 5627#ifdef __AARCH64EB__ 5628 rev x21, x21 5629#endif 5630 aese v2.16b, v20.16b 5631 aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 5632 stp x21, x22, [x2], #16 //AES block 4k+2 - store result 5633 5634 pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 5635 5636 pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 5637 5638 aese v2.16b, v21.16b 5639 aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 5640 rev64 v7.16b, v7.16b //GHASH block 4k+3 5641 5642 pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 5643 eor x23, x23, x13 //AES block 4k+3 - round 14 low 5644#ifdef __AARCH64EB__ 5645 rev x23, x23 5646#endif 5647 pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 5648 eor x24, x24, x14 //AES block 
4k+3 - round 14 high 5649#ifdef __AARCH64EB__ 5650 rev x24, x24 5651#endif 5652 eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high 5653 5654 aese v2.16b, v22.16b 5655 aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 5656 5657 aese v3.16b, v19.16b 5658 aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 5659 mov d4, v5.d[1] //GHASH block 4k+1 - mid 5660 5661 aese v0.16b, v22.16b 5662 aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 5663 eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low 5664 5665 aese v2.16b, v23.16b 5666 aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 5667 add w12, w12, #1 //CTR block 4k+7 5668 5669 aese v3.16b, v20.16b 5670 aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 5671 mov d8, v6.d[1] //GHASH block 4k+2 - mid 5672 5673 aese v1.16b, v22.16b 5674 aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 5675 eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 5676 5677 pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 5678 5679 aese v3.16b, v21.16b 5680 aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 5681 eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid 5682 5683 aese v1.16b, v23.16b 5684 aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 5685 5686 aese v0.16b, v23.16b 5687 aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 5688 eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low 5689 5690 pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 5691 rev w9, w12 //CTR block 4k+8 5692 5693 aese v1.16b, v24.16b 5694 aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 5695 ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid 5696 5697 aese v0.16b, v24.16b 5698 aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 5699 add w12, w12, #1 //CTR block 4k+8 5700 5701 aese v3.16b, v22.16b 5702 aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 5703 5704 aese v1.16b, v25.16b 5705 aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 5706 eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 5707 5708 aese v0.16b, v25.16b 5709 aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 5710 5711 pmull2 v4.1q, 
v6.2d, v13.2d //GHASH block 4k+2 - high 5712 mov d6, v7.d[1] //GHASH block 4k+3 - mid 5713 5714 aese v3.16b, v23.16b 5715 aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 5716 5717 pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid 5718 5719 aese v0.16b, v26.16b 5720 aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 5721 eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high 5722 5723 aese v3.16b, v24.16b 5724 aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 5725 5726 pmull v4.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 5727 orr x9, x11, x9, lsl #32 //CTR block 4k+8 5728 eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid 5729 5730 pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 5731 5732 aese v0.16b, v27.16b 5733 aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 5734 eor v6.8b, v6.8b, v7.8b //GHASH block 4k+3 - mid 5735 5736 aese v1.16b, v26.16b 5737 aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 5738 5739 aese v2.16b, v24.16b 5740 aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 5741 eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 5742 5743 aese v0.16b, v28.16b 5744 aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 5745 5746 pmull v6.1q, v6.1d, v16.1d //GHASH block 4k+3 - mid 5747 movi v8.8b, #0xc2 5748 5749 aese v2.16b, v25.16b 5750 aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 5751 eor v11.16b, v11.16b, v4.16b //GHASH block 4k+3 - low 5752 5753 aese v0.16b, v29.16b 5754 aesmc v0.16b, v0.16b //AES block 4k+4 - round 11 5755 5756 aese v3.16b, v25.16b 5757 aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 5758 shl d8, d8, #56 //mod_constant 5759 5760 aese v2.16b, v26.16b 5761 aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 5762 eor v10.16b, v10.16b, v6.16b //GHASH block 4k+3 - mid 5763 5764 aese v0.16b, v30.16b 5765 aesmc v0.16b, v0.16b //AES block 4k+4 - round 12 5766 5767 pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 5768 eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 5769 5770 aese v1.16b, v27.16b 5771 aesmc v1.16b, v1.16b //AES 
block 4k+5 - round 9 5772 ld1 {v4.16b}, [x0], #16 //AES block 4k+4 - load ciphertext 5773 5774 aese v0.16b, v31.16b //AES block 4k+4 - round 13 5775 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 5776 5777 aese v1.16b, v28.16b 5778 aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 5779 eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up 5780 5781 aese v2.16b, v27.16b 5782 aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 5783 ld1 {v5.16b}, [x0], #16 //AES block 4k+5 - load ciphertext 5784 5785 aese v3.16b, v26.16b 5786 aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 5787 eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result 5788 5789 aese v1.16b, v29.16b 5790 aesmc v1.16b, v1.16b //AES block 4k+5 - round 11 5791 stp x23, x24, [x2], #16 //AES block 4k+3 - store result 5792 5793 aese v2.16b, v28.16b 5794 aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 5795 eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid 5796 5797 aese v3.16b, v27.16b 5798 aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 5799 ld1 {v6.16b}, [x0], #16 //AES block 4k+6 - load ciphertext 5800 5801 aese v1.16b, v30.16b 5802 aesmc v1.16b, v1.16b //AES block 4k+5 - round 12 5803 ld1 {v7.16b}, [x0], #16 //AES block 4k+7 - load ciphertext 5804 5805 aese v2.16b, v29.16b 5806 aesmc v2.16b, v2.16b //AES block 4k+6 - round 11 5807 mov x7, v0.d[1] //AES block 4k+4 - mov high 5808 5809 aese v3.16b, v28.16b 5810 aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 5811 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 5812 5813 aese v1.16b, v31.16b //AES block 4k+5 - round 13 5814 mov x6, v0.d[0] //AES block 4k+4 - mov low 5815 5816 aese v2.16b, v30.16b 5817 aesmc v2.16b, v2.16b //AES block 4k+6 - round 12 5818 fmov d0, x10 //CTR block 4k+8 5819 5820 aese v3.16b, v29.16b 5821 aesmc v3.16b, v3.16b //AES block 4k+7 - round 11 5822 fmov v0.d[1], x9 //CTR block 4k+8 5823 5824 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 5825 eor v1.16b, v5.16b, v1.16b //AES block 4k+5 - result 5826 
rev w9, w12 //CTR block 4k+9 5827 5828 aese v2.16b, v31.16b //AES block 4k+6 - round 13 5829 orr x9, x11, x9, lsl #32 //CTR block 4k+9 5830 cmp x0, x5 //.LOOP CONTROL 5831 5832 add w12, w12, #1 //CTR block 4k+9 5833 5834 eor x6, x6, x13 //AES block 4k+4 - round 14 low 5835#ifdef __AARCH64EB__ 5836 rev x6, x6 5837#endif 5838 eor x7, x7, x14 //AES block 4k+4 - round 14 high 5839#ifdef __AARCH64EB__ 5840 rev x7, x7 5841#endif 5842 mov x20, v1.d[1] //AES block 4k+5 - mov high 5843 eor v2.16b, v6.16b, v2.16b //AES block 4k+6 - result 5844 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 5845 5846 aese v3.16b, v30.16b 5847 aesmc v3.16b, v3.16b //AES block 4k+7 - round 12 5848 mov x19, v1.d[0] //AES block 4k+5 - mov low 5849 5850 fmov d1, x10 //CTR block 4k+9 5851 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 5852 5853 fmov v1.d[1], x9 //CTR block 4k+9 5854 rev w9, w12 //CTR block 4k+10 5855 add w12, w12, #1 //CTR block 4k+10 5856 5857 aese v3.16b, v31.16b //AES block 4k+7 - round 13 5858 orr x9, x11, x9, lsl #32 //CTR block 4k+10 5859 5860 rev64 v5.16b, v5.16b //GHASH block 4k+5 5861 eor x20, x20, x14 //AES block 4k+5 - round 14 high 5862#ifdef __AARCH64EB__ 5863 rev x20, x20 5864#endif 5865 stp x6, x7, [x2], #16 //AES block 4k+4 - store result 5866 5867 eor x19, x19, x13 //AES block 4k+5 - round 14 low 5868#ifdef __AARCH64EB__ 5869 rev x19, x19 5870#endif 5871 stp x19, x20, [x2], #16 //AES block 4k+5 - store result 5872 5873 rev64 v4.16b, v4.16b //GHASH block 4k+4 5874 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 5875 b.lt .L256_dec_main_loop 5876 5877 5878.L256_dec_prepretail: //PREPRETAIL 5879 ext v11.16b, v11.16b, v11.16b, #8 //PRE 0 5880 mov x21, v2.d[0] //AES block 4k+2 - mov low 5881 eor v3.16b, v7.16b, v3.16b //AES block 4k+3 - result 5882 5883 aese v0.16b, v18.16b 5884 aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 5885 mov x22, v2.d[1] //AES block 4k+2 - mov high 5886 5887 aese v1.16b, v18.16b 5888 aesmc v1.16b, v1.16b //AES 
block 4k+5 - round 0 5889 fmov d2, x10 //CTR block 4k+6 5890 5891 fmov v2.d[1], x9 //CTR block 4k+6 5892 rev w9, w12 //CTR block 4k+7 5893 eor v4.16b, v4.16b, v11.16b //PRE 1 5894 5895 rev64 v6.16b, v6.16b //GHASH block 4k+2 5896 orr x9, x11, x9, lsl #32 //CTR block 4k+7 5897 mov x23, v3.d[0] //AES block 4k+3 - mov low 5898 5899 aese v1.16b, v19.16b 5900 aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 5901 mov x24, v3.d[1] //AES block 4k+3 - mov high 5902 5903 pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low 5904 mov d8, v4.d[1] //GHASH block 4k - mid 5905 fmov d3, x10 //CTR block 4k+7 5906 5907 pmull2 v9.1q, v4.2d, v15.2d //GHASH block 4k - high 5908 fmov v3.d[1], x9 //CTR block 4k+7 5909 5910 aese v2.16b, v18.16b 5911 aesmc v2.16b, v2.16b //AES block 4k+6 - round 0 5912 mov d10, v17.d[1] //GHASH block 4k - mid 5913 5914 aese v0.16b, v19.16b 5915 aesmc v0.16b, v0.16b //AES block 4k+4 - round 1 5916 eor v8.8b, v8.8b, v4.8b //GHASH block 4k - mid 5917 5918 pmull2 v4.1q, v5.2d, v14.2d //GHASH block 4k+1 - high 5919 5920 aese v2.16b, v19.16b 5921 aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 5922 rev64 v7.16b, v7.16b //GHASH block 4k+3 5923 5924 aese v3.16b, v18.16b 5925 aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 5926 5927 pmull v10.1q, v8.1d, v10.1d //GHASH block 4k - mid 5928 eor v9.16b, v9.16b, v4.16b //GHASH block 4k+1 - high 5929 5930 pmull v8.1q, v5.1d, v14.1d //GHASH block 4k+1 - low 5931 5932 aese v3.16b, v19.16b 5933 aesmc v3.16b, v3.16b //AES block 4k+7 - round 1 5934 mov d4, v5.d[1] //GHASH block 4k+1 - mid 5935 5936 aese v0.16b, v20.16b 5937 aesmc v0.16b, v0.16b //AES block 4k+4 - round 2 5938 5939 aese v1.16b, v20.16b 5940 aesmc v1.16b, v1.16b //AES block 4k+5 - round 2 5941 eor v11.16b, v11.16b, v8.16b //GHASH block 4k+1 - low 5942 5943 aese v2.16b, v20.16b 5944 aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 5945 5946 aese v0.16b, v21.16b 5947 aesmc v0.16b, v0.16b //AES block 4k+4 - round 3 5948 mov d8, v6.d[1] //GHASH block 4k+2 - mid 5949 
5950 aese v3.16b, v20.16b 5951 aesmc v3.16b, v3.16b //AES block 4k+7 - round 2 5952 eor v4.8b, v4.8b, v5.8b //GHASH block 4k+1 - mid 5953 5954 pmull v5.1q, v6.1d, v13.1d //GHASH block 4k+2 - low 5955 5956 aese v0.16b, v22.16b 5957 aesmc v0.16b, v0.16b //AES block 4k+4 - round 4 5958 5959 aese v3.16b, v21.16b 5960 aesmc v3.16b, v3.16b //AES block 4k+7 - round 3 5961 eor v8.8b, v8.8b, v6.8b //GHASH block 4k+2 - mid 5962 5963 pmull v4.1q, v4.1d, v17.1d //GHASH block 4k+1 - mid 5964 5965 aese v0.16b, v23.16b 5966 aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 5967 eor v11.16b, v11.16b, v5.16b //GHASH block 4k+2 - low 5968 5969 aese v3.16b, v22.16b 5970 aesmc v3.16b, v3.16b //AES block 4k+7 - round 4 5971 5972 pmull2 v5.1q, v7.2d, v12.2d //GHASH block 4k+3 - high 5973 eor v10.16b, v10.16b, v4.16b //GHASH block 4k+1 - mid 5974 5975 pmull2 v4.1q, v6.2d, v13.2d //GHASH block 4k+2 - high 5976 5977 aese v3.16b, v23.16b 5978 aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 5979 ins v8.d[1], v8.d[0] //GHASH block 4k+2 - mid 5980 5981 aese v2.16b, v21.16b 5982 aesmc v2.16b, v2.16b //AES block 4k+6 - round 3 5983 5984 aese v1.16b, v21.16b 5985 aesmc v1.16b, v1.16b //AES block 4k+5 - round 3 5986 eor v9.16b, v9.16b, v4.16b //GHASH block 4k+2 - high 5987 5988 pmull v4.1q, v7.1d, v12.1d //GHASH block 4k+3 - low 5989 5990 aese v2.16b, v22.16b 5991 aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 5992 mov d6, v7.d[1] //GHASH block 4k+3 - mid 5993 5994 aese v1.16b, v22.16b 5995 aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 5996 5997 pmull2 v8.1q, v8.2d, v16.2d //GHASH block 4k+2 - mid 5998 5999 aese v2.16b, v23.16b 6000 aesmc v2.16b, v2.16b //AES block 4k+6 - round 5 6001 eor v6.8b, v6.8b, v7.8b //GHASH block 4k+3 - mid 6002 6003 aese v1.16b, v23.16b 6004 aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 6005 6006 aese v3.16b, v24.16b 6007 aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 6008 eor v10.16b, v10.16b, v8.16b //GHASH block 4k+2 - mid 6009 6010 aese v2.16b, v24.16b 6011 aesmc 
v2.16b, v2.16b //AES block 4k+6 - round 6 6012 6013 aese v0.16b, v24.16b 6014 aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 6015 movi v8.8b, #0xc2 6016 6017 aese v1.16b, v24.16b 6018 aesmc v1.16b, v1.16b //AES block 4k+5 - round 6 6019 eor v11.16b, v11.16b, v4.16b //GHASH block 4k+3 - low 6020 6021 pmull v6.1q, v6.1d, v16.1d //GHASH block 4k+3 - mid 6022 6023 aese v3.16b, v25.16b 6024 aesmc v3.16b, v3.16b //AES block 4k+7 - round 7 6025 eor v9.16b, v9.16b, v5.16b //GHASH block 4k+3 - high 6026 6027 aese v1.16b, v25.16b 6028 aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 6029 6030 aese v0.16b, v25.16b 6031 aesmc v0.16b, v0.16b //AES block 4k+4 - round 7 6032 eor v10.16b, v10.16b, v6.16b //GHASH block 4k+3 - mid 6033 6034 aese v3.16b, v26.16b 6035 aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 6036 6037 aese v2.16b, v25.16b 6038 aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 6039 eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 6040 6041 aese v1.16b, v26.16b 6042 aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 6043 6044 aese v0.16b, v26.16b 6045 aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 6046 shl d8, d8, #56 //mod_constant 6047 6048 aese v2.16b, v26.16b 6049 aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 6050 6051 aese v1.16b, v27.16b 6052 aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 6053 eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up 6054 6055 pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 6056 6057 aese v2.16b, v27.16b 6058 aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 6059 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 6060 6061 aese v3.16b, v27.16b 6062 aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 6063 6064 aese v0.16b, v27.16b 6065 aesmc v0.16b, v0.16b //AES block 4k+4 - round 9 6066 eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid 6067 6068 aese v2.16b, v28.16b 6069 aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 6070 6071 aese v3.16b, v28.16b 6072 aesmc v3.16b, v3.16b //AES block 4k+7 - 
round 10 6073 6074 aese v0.16b, v28.16b 6075 aesmc v0.16b, v0.16b //AES block 4k+4 - round 10 6076 eor x22, x22, x14 //AES block 4k+2 - round 14 high 6077#ifdef __AARCH64EB__ 6078 rev x22, x22 6079#endif 6080 aese v1.16b, v28.16b 6081 aesmc v1.16b, v1.16b //AES block 4k+5 - round 10 6082 eor x23, x23, x13 //AES block 4k+3 - round 14 low 6083#ifdef __AARCH64EB__ 6084 rev x23, x23 6085#endif 6086 aese v2.16b, v29.16b 6087 aesmc v2.16b, v2.16b //AES block 4k+6 - round 11 6088 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 6089 6090 aese v0.16b, v29.16b 6091 aesmc v0.16b, v0.16b //AES block 4k+4 - round 11 6092 add w12, w12, #1 //CTR block 4k+7 6093 6094 aese v1.16b, v29.16b 6095 aesmc v1.16b, v1.16b //AES block 4k+5 - round 11 6096 eor x21, x21, x13 //AES block 4k+2 - round 14 low 6097#ifdef __AARCH64EB__ 6098 rev x21, x21 6099#endif 6100 6101 aese v2.16b, v30.16b 6102 aesmc v2.16b, v2.16b //AES block 4k+6 - round 12 6103 6104 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 6105 eor x24, x24, x14 //AES block 4k+3 - round 14 high 6106#ifdef __AARCH64EB__ 6107 rev x24, x24 6108#endif 6109 6110 aese v3.16b, v29.16b 6111 aesmc v3.16b, v3.16b //AES block 4k+7 - round 11 6112 stp x21, x22, [x2], #16 //AES block 4k+2 - store result 6113 6114 aese v1.16b, v30.16b 6115 aesmc v1.16b, v1.16b //AES block 4k+5 - round 12 6116 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 6117 6118 aese v0.16b, v30.16b 6119 aesmc v0.16b, v0.16b //AES block 4k+4 - round 12 6120 stp x23, x24, [x2], #16 //AES block 4k+3 - store result 6121 6122 aese v3.16b, v30.16b 6123 aesmc v3.16b, v3.16b //AES block 4k+7 - round 12 6124 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 6125 6126 aese v1.16b, v31.16b //AES block 4k+5 - round 13 6127 6128 aese v0.16b, v31.16b //AES block 4k+4 - round 13 6129 6130 aese v3.16b, v31.16b //AES block 4k+7 - round 13 6131 6132 aese v2.16b, v31.16b //AES block 4k+6 - round 13 6133 eor v11.16b, v11.16b, v10.16b //MODULO - fold into 
low 6134.L256_dec_tail: //TAIL 6135 6136 sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process 6137 ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext 6138 6139 eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result 6140 6141 mov x6, v0.d[0] //AES block 4k+4 - mov low 6142 6143 mov x7, v0.d[1] //AES block 4k+4 - mov high 6144 ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag 6145 6146 cmp x5, #48 6147 6148 eor x6, x6, x13 //AES block 4k+4 - round 14 low 6149#ifdef __AARCH64EB__ 6150 rev x6, x6 6151#endif 6152 6153 eor x7, x7, x14 //AES block 4k+4 - round 14 high 6154#ifdef __AARCH64EB__ 6155 rev x7, x7 6156#endif 6157 b.gt .L256_dec_blocks_more_than_3 6158 6159 sub w12, w12, #1 6160 mov v3.16b, v2.16b 6161 movi v10.8b, #0 6162 6163 movi v11.8b, #0 6164 cmp x5, #32 6165 6166 movi v9.8b, #0 6167 mov v2.16b, v1.16b 6168 b.gt .L256_dec_blocks_more_than_2 6169 6170 sub w12, w12, #1 6171 6172 mov v3.16b, v1.16b 6173 cmp x5, #16 6174 b.gt .L256_dec_blocks_more_than_1 6175 6176 sub w12, w12, #1 6177 b .L256_dec_blocks_less_than_1 6178.L256_dec_blocks_more_than_3: //blocks left > 3 6179 rev64 v4.16b, v5.16b //GHASH final-3 block 6180 ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext 6181 6182 stp x6, x7, [x2], #16 //AES final-3 block - store result 6183 6184 mov d10, v17.d[1] //GHASH final-3 block - mid 6185 6186 eor v4.16b, v4.16b, v8.16b //feed in partial tag 6187 6188 eor v0.16b, v5.16b, v1.16b //AES final-2 block - result 6189 6190 mov d22, v4.d[1] //GHASH final-3 block - mid 6191 6192 mov x6, v0.d[0] //AES final-2 block - mov low 6193 6194 mov x7, v0.d[1] //AES final-2 block - mov high 6195 6196 eor v22.8b, v22.8b, v4.8b //GHASH final-3 block - mid 6197 6198 movi v8.8b, #0 //suppress further partial tag feed in 6199 6200 pmull2 v9.1q, v4.2d, v15.2d //GHASH final-3 block - high 6201 6202 pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid 6203 eor x6, x6, x13 //AES final-2 block - round 14 low 6204#ifdef __AARCH64EB__ 
6205 rev x6, x6 6206#endif 6207 6208 pmull v11.1q, v4.1d, v15.1d //GHASH final-3 block - low 6209 eor x7, x7, x14 //AES final-2 block - round 14 high 6210#ifdef __AARCH64EB__ 6211 rev x7, x7 6212#endif 6213.L256_dec_blocks_more_than_2: //blocks left > 2 6214 6215 rev64 v4.16b, v5.16b //GHASH final-2 block 6216 ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext 6217 6218 eor v4.16b, v4.16b, v8.16b //feed in partial tag 6219 stp x6, x7, [x2], #16 //AES final-2 block - store result 6220 6221 eor v0.16b, v5.16b, v2.16b //AES final-1 block - result 6222 6223 mov d22, v4.d[1] //GHASH final-2 block - mid 6224 6225 pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low 6226 6227 pmull2 v20.1q, v4.2d, v14.2d //GHASH final-2 block - high 6228 6229 eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid 6230 mov x6, v0.d[0] //AES final-1 block - mov low 6231 6232 mov x7, v0.d[1] //AES final-1 block - mov high 6233 eor v11.16b, v11.16b, v21.16b //GHASH final-2 block - low 6234 movi v8.8b, #0 //suppress further partial tag feed in 6235 6236 pmull v22.1q, v22.1d, v17.1d //GHASH final-2 block - mid 6237 6238 eor v9.16b, v9.16b, v20.16b //GHASH final-2 block - high 6239 eor x6, x6, x13 //AES final-1 block - round 14 low 6240#ifdef __AARCH64EB__ 6241 rev x6, x6 6242#endif 6243 6244 eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid 6245 eor x7, x7, x14 //AES final-1 block - round 14 high 6246#ifdef __AARCH64EB__ 6247 rev x7, x7 6248#endif 6249.L256_dec_blocks_more_than_1: //blocks left > 1 6250 6251 stp x6, x7, [x2], #16 //AES final-1 block - store result 6252 rev64 v4.16b, v5.16b //GHASH final-1 block 6253 6254 ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext 6255 6256 eor v4.16b, v4.16b, v8.16b //feed in partial tag 6257 movi v8.8b, #0 //suppress further partial tag feed in 6258 6259 mov d22, v4.d[1] //GHASH final-1 block - mid 6260 6261 eor v0.16b, v5.16b, v3.16b //AES final block - result 6262 6263 pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block 
- high 6264 6265 eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid 6266 6267 pmull v21.1q, v4.1d, v13.1d //GHASH final-1 block - low 6268 mov x6, v0.d[0] //AES final block - mov low 6269 6270 ins v22.d[1], v22.d[0] //GHASH final-1 block - mid 6271 6272 mov x7, v0.d[1] //AES final block - mov high 6273 6274 pmull2 v22.1q, v22.2d, v16.2d //GHASH final-1 block - mid 6275 eor x6, x6, x13 //AES final block - round 14 low 6276#ifdef __AARCH64EB__ 6277 rev x6, x6 6278#endif 6279 eor v11.16b, v11.16b, v21.16b //GHASH final-1 block - low 6280 6281 eor v9.16b, v9.16b, v20.16b //GHASH final-1 block - high 6282 6283 eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid 6284 eor x7, x7, x14 //AES final block - round 14 high 6285#ifdef __AARCH64EB__ 6286 rev x7, x7 6287#endif 6288.L256_dec_blocks_less_than_1: //blocks left <= 1 6289 6290 and x1, x1, #127 //bit_length %= 128 6291 mvn x14, xzr //rk14_h = 0xffffffffffffffff 6292 6293 sub x1, x1, #128 //bit_length -= 128 6294 mvn x13, xzr //rk14_l = 0xffffffffffffffff 6295 6296 ldp x4, x5, [x2] //load existing bytes we need to not overwrite 6297 neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) 6298 6299 and x1, x1, #127 //bit_length %= 128 6300 6301 lsr x14, x14, x1 //rk14_h is mask for top 64b of last block 6302 cmp x1, #64 6303 6304 csel x9, x13, x14, lt 6305 csel x10, x14, xzr, lt 6306 6307 fmov d0, x9 //ctr0b is mask for last block 6308 and x6, x6, x9 6309 6310 mov v0.d[1], x10 6311 bic x4, x4, x9 //mask out low existing bytes 6312 6313#ifndef __AARCH64EB__ 6314 rev w9, w12 6315#else 6316 mov w9, w12 6317#endif 6318 6319 bic x5, x5, x10 //mask out high existing bytes 6320 6321 orr x6, x6, x4 6322 6323 and x7, x7, x10 6324 6325 orr x7, x7, x5 6326 6327 and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits 6328 6329 rev64 v4.16b, v5.16b //GHASH final block 6330 6331 eor v4.16b, v4.16b, v8.16b //feed in partial tag 6332 6333 pmull v21.1q, v4.1d, v12.1d //GHASH final block - low 
6334 6335 mov d8, v4.d[1] //GHASH final block - mid 6336 6337 eor v8.8b, v8.8b, v4.8b //GHASH final block - mid 6338 6339 pmull2 v20.1q, v4.2d, v12.2d //GHASH final block - high 6340 6341 pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid 6342 6343 eor v9.16b, v9.16b, v20.16b //GHASH final block - high 6344 6345 eor v11.16b, v11.16b, v21.16b //GHASH final block - low 6346 6347 eor v10.16b, v10.16b, v8.16b //GHASH final block - mid 6348 movi v8.8b, #0xc2 6349 6350 eor v6.16b, v11.16b, v9.16b //MODULO - karatsuba tidy up 6351 6352 shl d8, d8, #56 //mod_constant 6353 6354 eor v10.16b, v10.16b, v6.16b //MODULO - karatsuba tidy up 6355 6356 pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid 6357 6358 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment 6359 6360 eor v10.16b, v10.16b, v7.16b //MODULO - fold into mid 6361 6362 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid 6363 6364 pmull v8.1q, v10.1d, v8.1d //MODULO - mid 64b align with low 6365 6366 ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment 6367 6368 eor v11.16b, v11.16b, v8.16b //MODULO - fold into low 6369 6370 stp x6, x7, [x2] 6371 6372 str w9, [x16, #12] //store the updated counter 6373 6374 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low 6375 ext v11.16b, v11.16b, v11.16b, #8 6376 rev64 v11.16b, v11.16b 6377 mov x0, x15 6378 st1 { v11.16b }, [x3] 6379 6380 ldp x21, x22, [sp, #16] 6381 ldp x23, x24, [sp, #32] 6382 ldp d8, d9, [sp, #48] 6383 ldp d10, d11, [sp, #64] 6384 ldp d12, d13, [sp, #80] 6385 ldp d14, d15, [sp, #96] 6386 ldp x19, x20, [sp], #112 6387 ret 6388 6389.L256_dec_ret: 6390 mov w0, #0x0 6391 ret 6392.size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel 6393.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 6394.align 2 6395.align 2 6396#endif 6397