/* Do not modify. This file is auto-generated from ghashv8-armx.pl. */
#include "arm_arch.h"

// Syntax : GNU as for AArch64, run through the C preprocessor.
// Content: GHASH primitives built on PMULL/PMULL2 (64x64 -> 128 bit
//          polynomial multiply): gcm_init_v8, gcm_gmult_v8, gcm_ghash_v8
//          and the four-blocks-per-iteration path gcm_ghash_v8_4x.
// Common : every routine is a leaf, uses no stack, and touches only
//          v0-v7 and v16-v31 (plus x12 in gcm_ghash_v8); v8-v15 and
//          x19-x28 are never written.
//          v19 always holds the reduction constant that the comments
//          call 0xc2.0, composed as movi #0xe1 followed by shl #57.
//          Unless __ARMEB__ is defined, data loaded from Xi and from the
//          input is byte-reversed within each 64-bit half (rev64) before
//          use, and Xi is reversed again before being stored.

#if __ARM_MAX_ARCH__>=7
.arch	armv8-a+crypto
.text

//----------------------------------------------------------------------
// gcm_init_v8
// In   : x0 = destination table, 96 bytes are written
//        x1 = hash key H, 16 bytes are read
// Out  : [x0+ 0] twisted H
//        [x0+16] packed Karatsuba terms for H and H^2
//        [x0+32] H^2
//        [x0+48] H^3
//        [x0+64] packed Karatsuba terms for H^3 and H^4
//        [x0+80] H^4
//        x0 is advanced by 48 on return.
// Clobb: v0-v7, v16-v22
// NOTE(review): the C prototype is not visible in this file; confirm the
// argument types against the caller.
//----------------------------------------------------------------------
.globl	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	ld1	{v17.2d},[x1]			//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57		//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8	//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
	eor	v20.16b,v3.16b,v16.16b		//twisted H
	st1	{v20.2d},[x0],#16		//store Htable[0]

	//calculate H^2
	ext	v16.16b,v20.16b,v20.16b,#8	//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b

	ext	v17.16b,v22.16b,v22.16b,#8	//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8	//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
	//calculate H^3 and H^4
	//two independent products are interleaved below:
	//v0/v1/v2 carry H*H^2 and v5/v6/v7 carry H^2*H^2
	pmull	v0.1q,v20.1d, v22.1d
	pmull	v5.1q,v22.1d,v22.1d
	pmull2	v2.1q,v20.2d, v22.2d
	pmull2	v7.1q,v22.2d,v22.2d
	pmull	v1.1q,v16.1d,v17.1d
	pmull	v6.1q,v17.1d,v17.1d

	ext	v16.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	ext	v17.16b,v5.16b,v7.16b,#8
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v16.16b
	eor	v4.16b,v5.16b,v7.16b
	eor	v6.16b,v6.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase
	eor	v6.16b,v6.16b,v4.16b
	pmull	v4.1q,v5.1d,v19.1d

	ins	v2.d[0],v1.d[1]
	ins	v7.d[0],v6.d[1]
	ins	v1.d[1],v0.d[0]
	ins	v6.d[1],v5.d[0]
	eor	v0.16b,v1.16b,v18.16b
	eor	v5.16b,v6.16b,v4.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase
	ext	v4.16b,v5.16b,v5.16b,#8
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v5.1q,v5.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v4.16b,v4.16b,v7.16b
	eor	v20.16b, v0.16b,v18.16b		//H^3
	eor	v22.16b,v5.16b,v4.16b		//H^4

	ext	v16.16b,v20.16b, v20.16b,#8	//Karatsuba pre-processing
	ext	v17.16b,v22.16b,v22.16b,#8
	eor	v16.16b,v16.16b,v20.16b
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8	//pack Karatsuba pre-processed
	st1	{v20.2d,v21.2d,v22.2d},[x0]	//store Htable[3..5]
	ret
.size	gcm_init_v8,.-gcm_init_v8

//----------------------------------------------------------------------
// gcm_gmult_v8
// Multiplies Xi by H in place: one Karatsuba multiplication (three
// PMULLs) followed by the two-phase reduction.
// In   : x0 = Xi, 16 bytes, read and then overwritten with the product
//        x1 = key table; the first 32 bytes are read (twisted H and the
//             packed Karatsuba term), matching what gcm_init_v8 stores
//             at offsets 0 and 16
// Clobb: v0-v3, v17-v21
//----------------------------------------------------------------------
.globl	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	ld1	{v17.2d},[x0]			//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]		//load twisted H, ...
	shl	v19.2d,v19.2d,#57
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]			//write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8

//----------------------------------------------------------------------
// gcm_ghash_v8
// Folds input blocks into Xi.
// In   : x0 = Xi, 16 bytes, read and then overwritten with the result
//        x1 = key table as written by gcm_init_v8
//        x2 = input pointer
//        x3 = input length in bytes
// Flow : x3 >= 64 branches to .Lgcm_ghash_v8_4x (four blocks per
//        iteration). Otherwise .Loop_mod2x_v8 consumes two blocks per
//        iteration using H and H^2, and .Lodd_tail_v8 handles a single
//        remaining block.
// Clobb: x1, x2, x3, x12, flags, v0-v7, v16-v22
// NOTE(review): the code only distinguishes whole 16-byte blocks and
// always processes at least one block; presumably callers pass a
// non-zero multiple of 16 in x3 - confirm against the callers.
//----------------------------------------------------------------------
.globl	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
	cmp	x3,#64
	b.hs	.Lgcm_ghash_v8_4x
	ld1	{v0.2d},[x0]			//load [rotated] Xi
						//"[rotated]" means that
						//loaded value would have
						//to be rotated in order to
						//make it appear as in
						//algorithm specification
	subs	x3,x3,#32			//see if x3 is 32 or larger
	mov	x12,#16				//x12 is used as post-
						//increment for input pointer;
						//as loop is modulo-scheduled
						//x12 is zeroed just in time
						//to preclude overstepping
						//inp[len], which means that
						//last block[s] are actually
						//loaded twice, but last
						//copy is not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	//the flags tested below still come from the subs above; none of
	//the intervening instructions writes NZCV
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
	ld1	{v16.2d},[x2],#16		//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8	//rotate I[0]
	b.lo	.Lodd_tail_v8			//x3 was less than 32
	ld1	{v17.2d},[x2],x12		//load [rotated] I[1]
#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32			//is there more data?
	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo			//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b		//accumulate
	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12		//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12		//load [rotated] I[i+3]
#ifndef __ARMEB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

#ifndef __ARMEB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	//flags are still those of the subs at the top of the loop
	b.hs	.Loop_mod2x_v8			//there was at least 32 more bytes

	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8	//re-construct v3.16b
	adds	x3,x3,#32			//re-construct x3
	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
	b.eq	.Ldone_v8			//is x3 zero?
.Lodd_tail_v8:
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

.Ldone_v8:
#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]			//write out Xi

	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8

//----------------------------------------------------------------------
// gcm_ghash_v8_4x (file-local symbol; there is no .globl for it)
// Entered from gcm_ghash_v8 through .Lgcm_ghash_v8_4x when x3 >= 64.
// In   : x0, x1, x2, x3 as for gcm_ghash_v8
// Flow : 96 bytes of key table are read (H .. H^4). The first 64 input
//        bytes are loaded up front. .Loop4x runs while at least 64 more
//        bytes follow. .Ltail4x folds the last full group of four and
//        then dispatches on the 0, 16, 32 or 48 bytes that remain
//        (.Ldone4x, .Lone, .Ltwo, .Lthree).
// Regs : v29/v30/v31 accumulate the low/middle/high partial products of
//        the three trailing blocks of each group; v0/v1/v2 hold those of
//        the leading block combined with the running Xi.
// Clobb: x1, x2, x3, flags, v0-v7, v16-v31
//----------------------------------------------------------------------
.type	gcm_ghash_v8_4x,%function
.align	4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
	ld1	{v0.2d},[x0]			//load [rotated] Xi
	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant

	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v7.16b,v7.16b
	rev64	v4.16b,v4.16b
#endif
	ext	v25.16b,v7.16b,v7.16b,#8
	ext	v24.16b,v6.16b,v6.16b,#8
	ext	v23.16b,v5.16b,v5.16b,#8

	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
	eor	v7.16b,v7.16b,v25.16b
	pmull2	v31.1q,v20.2d,v25.2d
	pmull	v30.1q,v21.1d,v7.1d

	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
	eor	v6.16b,v6.16b,v24.16b
	pmull2	v24.1q,v22.2d,v24.2d
	pmull2	v6.1q,v21.2d,v6.2d

	eor	v29.16b,v29.16b,v16.16b
	eor	v31.16b,v31.16b,v24.16b
	eor	v30.16b,v30.16b,v6.16b

	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	pmull2	v23.1q,v26.2d,v23.2d
	pmull	v5.1q,v27.1d,v5.1d

	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	eor	v30.16b,v30.16b,v5.16b

	//64 bytes are already loaded; stay in the loop only while another
	//full 64-byte group follows them (original x3 >= 128)
	subs	x3,x3,#128
	b.lo	.Ltail4x

	b	.Loop4x

.align	4
.Loop4x:
	eor	v16.16b,v4.16b,v0.16b
	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
	ext	v3.16b,v16.16b,v16.16b,#8
#ifndef __ARMEB__
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v7.16b,v7.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v28.2d,v3.2d
	ext	v25.16b,v7.16b,v7.16b,#8
	pmull2	v1.1q,v27.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	ext	v24.16b,v6.16b,v6.16b,#8
	eor	v1.16b,v1.16b,v30.16b
	ext	v23.16b,v5.16b,v5.16b,#8

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
	eor	v7.16b,v7.16b,v25.16b
	eor	v1.16b,v1.16b,v17.16b
	pmull2	v31.1q,v20.2d,v25.2d
	eor	v1.16b,v1.16b,v18.16b
	pmull	v30.1q,v21.1d,v7.1d

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
	eor	v6.16b,v6.16b,v24.16b
	pmull2	v24.1q,v22.2d,v24.2d
	eor	v0.16b,v1.16b,v18.16b
	pmull2	v6.1q,v21.2d,v6.2d

	eor	v29.16b,v29.16b,v16.16b
	eor	v31.16b,v31.16b,v24.16b
	eor	v30.16b,v30.16b,v6.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	eor	v18.16b,v18.16b,v2.16b
	pmull2	v23.1q,v26.2d,v23.2d
	pmull	v5.1q,v27.1d,v5.1d

	eor	v0.16b,v0.16b,v18.16b
	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	ext	v0.16b,v0.16b,v0.16b,#8
	eor	v30.16b,v30.16b,v5.16b

	subs	x3,x3,#64
	b.hs	.Loop4x

.Ltail4x:
	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v28.2d,v3.2d
	pmull2	v1.1q,v27.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b

	//x3 becomes the number of bytes left after the last full group
	adds	x3,x3,#64
	b.eq	.Ldone4x

	cmp	x3,#32
	b.lo	.Lone
	b.eq	.Ltwo
.Lthree:
	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d,v5.2d,v6.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v24.16b,v6.16b,v6.16b,#8
	ext	v23.16b,v5.16b,v5.16b,#8
	eor	v0.16b,v1.16b,v18.16b

	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
	eor	v6.16b,v6.16b,v24.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	pmull2	v31.1q,v20.2d,v24.2d
	pmull	v30.1q,v21.1d,v6.1d
	eor	v0.16b,v0.16b,v18.16b
	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	pmull2	v23.1q,v22.2d,v23.2d
	eor	v16.16b,v4.16b,v0.16b
	pmull2	v5.1q,v21.2d,v5.2d
	ext	v3.16b,v16.16b,v16.16b,#8

	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	eor	v30.16b,v30.16b,v5.16b

	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v26.2d,v3.2d
	pmull	v1.1q,v27.1d,v16.1d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b
	b	.Ldone4x

.align	4
.Ltwo:
	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d,v5.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
	rev64	v5.16b,v5.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v23.16b,v5.16b,v5.16b,#8
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
	eor	v5.16b,v5.16b,v23.16b

	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull2	v31.1q,v20.2d,v23.2d
	pmull	v30.1q,v21.1d,v5.1d

	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v22.2d,v3.2d
	pmull2	v1.1q,v21.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b
	b	.Ldone4x

.align	4
.Lone:
	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __ARMEB__
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	//last block: multiply (Xi+Ii) by H itself
	pmull	v0.1q,v20.1d,v3.1d
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v20.2d,v3.2d
	pmull	v1.1q,v21.1d,v16.1d

.Ldone4x:
	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

#ifndef __ARMEB__
	rev64	v0.16b,v0.16b
#endif
	st1	{v0.2d},[x0]			//write out Xi

	ret
.size	gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
// NUL-terminated ASCII: "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif