/* Do not modify. This file is auto-generated from ghashv8-armx.pl. */
// NOTE(review): because this file is generated, comment changes made here
// would presumably be lost on regeneration; mirror them in ghashv8-armx.pl.
//
// GHASH for ARMv8 (see the ident string at the end of the file), built on
// the PMULL/PMULL2 polynomial multiply instructions enabled by the
// armv8-a+crypto .arch directive below.
//
// Entry points defined here:
//   gcm_init_v8      x0 = table to fill, x1 = input H
//   gcm_gmult_v8     x0 = Xi (updated in place), x1 = table
//   gcm_ghash_v8     x0 = Xi (updated in place), x1 = table,
//                    x2 = input, x3 = input length in bytes
//   gcm_ghash_v8_4x  not exported; reached only by a direct branch from
//                    gcm_ghash_v8 when x3 is 64 or larger
//
// Register roles shared by all routines:
//   v19      0xc200000000000000 in each 64-bit lane (movi #0xe1, then
//            shl #57), the constant used by both reduction phases
//   v20      twisted H                        (table entry 0)
//   v21      d[0] = H.lo^H.hi, d[1] = H^2.lo^H^2.hi   (table entry 1)
//   v22      H^2                              (table entry 2)
//   v26      H^3                              (table entry 3, 4x path only)
//   v27      d[0] = H^3.lo^H^3.hi, d[1] = H^4.lo^H^4.hi (entry 4, 4x path)
//   v28      H^4                              (table entry 5, 4x path only)
//   v0/v1/v2 low / middle / high Karatsuba products of the running value
//
// Only x0-x3, x12, v0-v7 and v16-v31 are written. None of the routines
// references sp or makes a call.
//
// On builds without __AARCH64EB__ every 16-byte value that is loaded from or
// stored to Xi or the input is passed through rev64 (byte reversal inside
// each 64-bit lane); ext ...,#8 swaps the two 64-bit halves.
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.arch	armv8-a+crypto
.text
//----------------------------------------------------------------------
// gcm_init_v8
//   In:   x0 = destination table; 96 bytes (six 16-byte entries) written
//         x1 = 16-byte input H
//   Out:  entries in store order:
//           [0] twisted H
//           [1] d[0] = H.lo^H.hi,     d[1] = H^2.lo^H^2.hi
//           [2] H^2
//           [3] H^3
//           [4] d[0] = H^3.lo^H^3.hi, d[1] = H^4.lo^H^4.hi
//           [5] H^4
//   Uses: x0 (left advanced by 48), v0-v7, v16-v22
//
// Twisted H is formed from H with its 64-bit halves swapped: the 128-bit
// value is shifted left by one bit, and when the bit shifted out was set
// (sign bit of 32-bit element 1 of the loaded H, broadcast by sshr) the
// constant t0 = 0xc2....01 is XORed in.
//----------------------------------------------------------------------
.globl	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x1]		//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57		//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8	//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b	//H<<<=1
	eor	v20.16b,v3.16b,v16.16b	//twisted H
	st1	{v20.2d},[x0],#16		//store Htable[0]

	//calculate H^2
	//(three multiplies: lo*lo -> v0, hi*hi -> v2, (lo^hi)*(lo^hi) -> v1)
	ext	v16.16b,v20.16b,v20.16b,#8	//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b

	//v16 holds H.lo^H.hi in both halves and v17 gets H^2.lo^H^2.hi in
	//both halves, so the ext below packs one copy of each into v21
	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
	//calculate H^3 and H^4
	//(two independent multiply+reduce chains interleaved:
	// H*H^2 in v0/v1/v2 and H^2*H^2 in v5/v6/v7)
	pmull	v0.1q,v20.1d, v22.1d
	pmull	v5.1q,v22.1d,v22.1d
	pmull2	v2.1q,v20.2d, v22.2d
	pmull2	v7.1q,v22.2d,v22.2d
	pmull	v1.1q,v16.1d,v17.1d
	pmull	v6.1q,v17.1d,v17.1d

	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	ext	v17.16b,v5.16b,v7.16b,#8
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v16.16b
	eor	v4.16b,v5.16b,v7.16b
	eor	v6.16b,v6.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase
	eor	v6.16b,v6.16b,v4.16b
	pmull	v4.1q,v5.1d,v19.1d

	ins	v2.d[0],v1.d[1]
	ins	v7.d[0],v6.d[1]
	ins	v1.d[1],v0.d[0]
	ins	v6.d[1],v5.d[0]
	eor	v0.16b,v1.16b,v18.16b
	eor	v5.16b,v6.16b,v4.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
	ext	v4.16b,v5.16b,v5.16b,#8
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v5.1q,v5.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v4.16b,v4.16b,v7.16b
	eor	v20.16b, v0.16b,v18.16b		//H^3
	eor	v22.16b,v5.16b,v4.16b		//H^4

	ext	v16.16b,v20.16b, v20.16b,#8		//Karatsuba pre-processing
	ext	v17.16b,v22.16b,v22.16b,#8
	eor	v16.16b,v16.16b,v20.16b
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v20.2d,v21.2d,v22.2d},[x0]		//store Htable[3..5]
	ret
.size	gcm_init_v8,.-gcm_init_v8
//----------------------------------------------------------------------
// gcm_gmult_v8
//   In:   x0 = 16-byte Xi, overwritten with the product
//         x1 = table in the layout written by gcm_init_v8; only the
//              first two entries (32 bytes) are read
//   Uses: v0-v3, v17-v21
//
// One multiplication of Xi by twisted H followed by the two-phase
// reduction with v19.
//----------------------------------------------------------------------
.globl	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x0]		//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
	shl	v19.2d,v19.2d,#57
#ifndef __AARCH64EB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8
//----------------------------------------------------------------------
// gcm_ghash_v8
//   In:   x0 = 16-byte Xi, overwritten with the updated value
//         x1 = table in the layout written by gcm_init_v8 (entries 0..2
//              are read here, entries 0..5 on the 4x path)
//         x2 = input data
//         x3 = input length in bytes
//   Uses: x1, x2, x3 (modified), x12, v0-v7, v16-v22
//
// Lengths of 64 or more are handed to .Lgcm_ghash_v8_4x. Otherwise the
// input is consumed two 16-byte blocks per iteration of .Loop_mod2x_v8,
// with a single leftover block handled at .Lodd_tail_v8.
//
// NOTE(review): I[0] is loaded, and on the b.lo path processed, without
// x3 first being tested for zero, and blocks are always read 16 bytes at
// a time; callers are presumably required to pass a non-zero multiple of
// 16 -- confirm against the C callers.
//----------------------------------------------------------------------
.globl	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
	AARCH64_VALID_CALL_TARGET
	cmp	x3,#64
	b.hs	.Lgcm_ghash_v8_4x
	ld1	{v0.2d},[x0]		//load [rotated] Xi
						//"[rotated]" means that
						//loaded value would have
						//to be rotated in order to
						//make it appear as in
						//algorithm specification
	//the flags set by this subs stay live through the csel and the
	//b.lo below; none of the instructions in between writes flags
	subs	x3,x3,#32		//see if x3 is 32 or larger
	mov	x12,#16		//x12 is used as post-
						//increment for input pointer;
						//as loop is modulo-scheduled
						//x12 is zeroed just in time
						//to preclude overstepping
						//inp[len], which means that
						//last block[s] are actually
						//loaded twice, but last
						//copy is not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
#ifndef __AARCH64EB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
	b.lo	.Lodd_tail_v8		//x3 was less than 32
	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
#ifndef __AARCH64EB__
	rev64	v17.16b,v17.16b
#endif
	//loop prologue: start the H*I[1] products (v4 low, v6 high) so that
	//the loop body can finish them while it multiplies Xi^I[0] by H^2
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	.Loop_mod2x_v8

.align	4
	//Each iteration folds two blocks: (Xi^I[i])*H^2 + I[i+1]*H, then
	//reduces, while already loading and pre-multiplying the next pair.
	//The flags from the subs at the top are consumed by the csel
	//instructions and by the b.hs at the bottom.
.Loop_mod2x_v8:
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32		//is there more data?
	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo			//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b		//accumulate
	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
#ifndef __AARCH64EB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

#ifndef __AARCH64EB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	.Loop_mod2x_v8		//there was at least 32 more bytes

	//loop exit: undo the speculative work done for a next pair that
	//does not exist, then either finish or fall into the one-block tail
	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
	adds	x3,x3,#32		//re-construct x3
	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
	b.eq	.Ldone_v8		//is x3 zero?
	//single remaining block: Xi = (Xi^inp)*H, then reduce
.Lodd_tail_v8:
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

.Ldone_v8:
#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8
//----------------------------------------------------------------------
// gcm_ghash_v8_4x
//   Same register arguments as gcm_ghash_v8. There is no .globl for this
//   symbol; it is entered through the local label .Lgcm_ghash_v8_4x by
//   the b.hs at the top of gcm_ghash_v8, i.e. with x3 of 64 or larger.
//   Uses: x1, x2, x3 (modified), v0-v7, v16-v31
//
// Four blocks are combined per reduction:
//   (Xi^I[i])*H^4 + I[i+1]*H^3 + I[i+2]*H^2 + I[i+3]*H
// v29 / v30 / v31 carry the low / middle / high partial sums of the three
// products that do not depend on Xi; they are computed one group ahead
// and added into v0 / v1 / v2 once the H^4 product is available.
//
// After the last full group, 0, 16, 32 or 48 bytes may remain; these are
// handled at .Ldone4x, .Lone, .Ltwo and .Lthree respectively.
//----------------------------------------------------------------------
.type	gcm_ghash_v8_4x,%function
.align	4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
	ld1	{v0.2d},[x0]		//load [rotated] Xi
	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant

	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v7.16b,v7.16b
	rev64	v4.16b,v4.16b
#endif
	ext	v25.16b,v7.16b,v7.16b,#8
	ext	v24.16b,v6.16b,v6.16b,#8
	ext	v23.16b,v5.16b,v5.16b,#8

	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
	eor	v7.16b,v7.16b,v25.16b
	pmull2	v31.1q,v20.2d,v25.2d
	pmull	v30.1q,v21.1d,v7.1d

	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
	eor	v6.16b,v6.16b,v24.16b
	pmull2	v24.1q,v22.2d,v24.2d
	pmull2	v6.1q,v21.2d,v6.2d

	eor	v29.16b,v29.16b,v16.16b
	eor	v31.16b,v31.16b,v24.16b
	eor	v30.16b,v30.16b,v6.16b

	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	pmull2	v23.1q,v26.2d,v23.2d
	pmull	v5.1q,v27.1d,v5.1d

	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	eor	v30.16b,v30.16b,v5.16b

	//64 bytes are already loaded; the loop body loads 64 more, so it
	//is entered only when the original length was 128 or larger
	subs	x3,x3,#128
	b.lo	.Ltail4x

	b	.Loop4x

.align	4
	//Per iteration: finish the pending group (H^4 product of block 0,
	//add v29/v30/v31, reduce) while loading the next 64 bytes and
	//computing their H, H^2 and H^3 partial sums.
.Loop4x:
	eor	v16.16b,v4.16b,v0.16b
	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
	ext	v3.16b,v16.16b,v16.16b,#8
#ifndef __AARCH64EB__
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v7.16b,v7.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v28.2d,v3.2d
	ext	v25.16b,v7.16b,v7.16b,#8
	pmull2	v1.1q,v27.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	ext	v24.16b,v6.16b,v6.16b,#8
	eor	v1.16b,v1.16b,v30.16b
	ext	v23.16b,v5.16b,v5.16b,#8

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
	eor	v7.16b,v7.16b,v25.16b
	eor	v1.16b,v1.16b,v17.16b
	pmull2	v31.1q,v20.2d,v25.2d
	eor	v1.16b,v1.16b,v18.16b
	pmull	v30.1q,v21.1d,v7.1d

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
	eor	v6.16b,v6.16b,v24.16b
	pmull2	v24.1q,v22.2d,v24.2d
	eor	v0.16b,v1.16b,v18.16b
	pmull2	v6.1q,v21.2d,v6.2d

	eor	v29.16b,v29.16b,v16.16b
	eor	v31.16b,v31.16b,v24.16b
	eor	v30.16b,v30.16b,v6.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	eor	v18.16b,v18.16b,v2.16b
	pmull2	v23.1q,v26.2d,v23.2d
	pmull	v5.1q,v27.1d,v5.1d

	eor	v0.16b,v0.16b,v18.16b
	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	ext	v0.16b,v0.16b,v0.16b,#8
	eor	v30.16b,v30.16b,v5.16b

	subs	x3,x3,#64
	b.hs	.Loop4x

	//last full group: form its unreduced sum in v0/v1/v2, then decide
	//how many trailing blocks remain
.Ltail4x:
	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v28.2d,v3.2d
	pmull2	v1.1q,v27.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b

	//x3 becomes the number of bytes still unread
	adds	x3,x3,#64
	b.eq	.Ldone4x

	cmp	x3,#32
	b.lo	.Lone
	b.eq	.Ltwo
	//three trailing blocks: reduce the pending sum into Xi, then
	//(Xi^I[0])*H^3 + I[1]*H^2 + I[2]*H, left unreduced for .Ldone4x
.Lthree:
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d,v5.2d,v6.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v24.16b,v6.16b,v6.16b,#8
	ext	v23.16b,v5.16b,v5.16b,#8
	eor	v0.16b,v1.16b,v18.16b

	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
	eor	v6.16b,v6.16b,v24.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	pmull2	v31.1q,v20.2d,v24.2d
	pmull	v30.1q,v21.1d,v6.1d
	eor	v0.16b,v0.16b,v18.16b
	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	pmull2	v23.1q,v22.2d,v23.2d
	eor	v16.16b,v4.16b,v0.16b
	pmull2	v5.1q,v21.2d,v5.2d
	ext	v3.16b,v16.16b,v16.16b,#8

	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	eor	v30.16b,v30.16b,v5.16b

	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v26.2d,v3.2d
	pmull	v1.1q,v27.1d,v16.1d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b
	b	.Ldone4x

.align	4
	//two trailing blocks: reduce the pending sum into Xi, then
	//(Xi^I[0])*H^2 + I[1]*H, left unreduced for .Ldone4x
.Ltwo:
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d,v5.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
	rev64	v5.16b,v5.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v23.16b,v5.16b,v5.16b,#8
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
	eor	v5.16b,v5.16b,v23.16b

	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull2	v31.1q,v20.2d,v23.2d
	pmull	v30.1q,v21.1d,v5.1d

	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v22.2d,v3.2d
	pmull2	v1.1q,v21.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b
	b	.Ldone4x

.align	4
	//one trailing block: reduce the pending sum into Xi, then
	//(Xi^I[0])*H, left unreduced; falls through into .Ldone4x
.Lone:
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull	v0.1q,v20.1d,v3.1d
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v20.2d,v3.2d
	pmull	v1.1q,v21.1d,v16.1d

	//common exit: final Karatsuba post-processing and reduction of
	//v0/v1/v2, then write Xi back
.Ldone4x:
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
#endif
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
//NUL-terminated ASCII ident: GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif