/* Do not modify. This file is auto-generated from ghashv8-armx.pl. */
#include "arm_arch.h"

// GHASH for ARMv8 (AArch64, GNU as syntax, run through the C preprocessor).
//
// Symbols defined here:
//   gcm_init_v8      - builds the 12-entry table of hash-key powers
//   gcm_gmult_v8     - multiplies Xi by H in place
//   gcm_ghash_v8     - folds a run of 16-byte input blocks into Xi
//   gcm_ghash_v8_4x  - local (no .globl) four-blocks-per-iteration path,
//                      reached only by a branch from gcm_ghash_v8
//
// Every 128x128 carry-less product is built from three PMULL/PMULL2
// 64x64 products (Karatsuba: lo, hi and a middle term computed from
// lo^hi of each operand) and is then folded back to 128 bits by a
// two-phase reduction against the 0xc2.0 constant held in v19.
//
// Register usage across the whole file: only v0-v7 and v16-v31 are
// written, v8-v15 are never touched, and the stack is not used.
//
// NOTE(review): the banner above says this file is generated from
// ghashv8-armx.pl, so lasting changes presumably belong in that script.

#if __ARM_MAX_ARCH__>=7
.arch	armv8-a+crypto
.text

//----------------------------------------------------------------------
// gcm_init_v8
//
// In:    x0 = Htable, destination; 12 entries of 16 bytes are stored
//        x1 = H, 16-byte hash key
// Out:   Htable[0]   twisted H
//        Htable[1]   packed Karatsuba terms (H low half, H^2 high half)
//        Htable[2]   H^2
//        Htable[3]   H^3
//        Htable[4]   packed Karatsuba terms (H^3 low half, H^4 high half)
//        Htable[5]   H^4
//        Htable[6]   H^5
//        Htable[7]   packed Karatsuba terms (H^5 low half, H^6 high half)
//        Htable[8]   H^6
//        Htable[9]   H^7
//        Htable[10]  packed Karatsuba terms (H^7 low half, H^8 high half)
//        Htable[11]  H^8
//        A "Karatsuba term" is lo^hi of the 128-bit value (ext #8, eor).
// Clobb: x0 (advanced by 144 by the post-incrementing stores),
//        v0-v7, v16-v31
//----------------------------------------------------------------------
.globl	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	AARCH64_VALID_CALL_TARGET
	// Twist H: swap the 64-bit halves, shift the 128-bit value left by
	// one with the bit carried between halves, and fold in the
	// 0xc2....01 constant when the broadcast carry mask is all ones.
	// v19 keeps the 0xc2.0 constant for every reduction below.
	ld1	{v17.2d},[x1]		//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57		//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8		//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
	eor	v20.16b,v3.16b,v16.16b		//twisted H
	st1	{v20.2d},[x0],#16		//store Htable[0]

	//calculate H^2
	// v0 = lo, v2 = hi, v1 = middle product of H*H.
	ext	v16.16b,v20.16b,v20.16b,#8		//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b

	ext	v17.16b,v22.16b,v22.16b,#8		//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
	//calculate H^3 and H^4
	// Two products are interleaved: v0/v1/v2 = lo/mid/hi of H*H^2 and
	// v5/v6/v7 = lo/mid/hi of H^2*H^2.
	pmull	v0.1q,v20.1d, v22.1d
	pmull	v5.1q,v22.1d,v22.1d
	pmull2	v2.1q,v20.2d, v22.2d
	pmull2	v7.1q,v22.2d,v22.2d
	pmull	v1.1q,v16.1d,v17.1d
	pmull	v6.1q,v17.1d,v17.1d

	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	ext	v17.16b,v5.16b,v7.16b,#8
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v16.16b
	eor	v4.16b,v5.16b,v7.16b
	eor	v6.16b,v6.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase
	eor	v6.16b,v6.16b,v4.16b
	pmull	v4.1q,v5.1d,v19.1d

	ins	v2.d[0],v1.d[1]
	ins	v7.d[0],v6.d[1]
	ins	v1.d[1],v0.d[0]
	ins	v6.d[1],v5.d[0]
	eor	v0.16b,v1.16b,v18.16b
	eor	v5.16b,v6.16b,v4.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
	ext	v4.16b,v5.16b,v5.16b,#8
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v5.1q,v5.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v4.16b,v4.16b,v7.16b
	eor	v23.16b, v0.16b,v18.16b		//H^3
	eor	v25.16b,v5.16b,v4.16b		//H^4

	// v18 is rebuilt as the Karatsuba term of H^2 because the next
	// stage multiplies by H^2 again.
	ext	v16.16b,v23.16b, v23.16b,#8		//Karatsuba pre-processing
	ext	v17.16b,v25.16b,v25.16b,#8
	ext	v18.16b,v22.16b,v22.16b,#8
	eor	v16.16b,v16.16b,v23.16b
	eor	v17.16b,v17.16b,v25.16b
	eor	v18.16b,v18.16b,v22.16b
	ext	v24.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v23.2d,v24.2d,v25.2d},[x0],#48		//store Htable[3..5]

	//calculate H^5 and H^6
	// v0/v1/v2 = lo/mid/hi of H^2*H^3, v5/v6/v7 = lo/mid/hi of H^3*H^3.
	pmull	v0.1q,v22.1d, v23.1d
	pmull	v5.1q,v23.1d,v23.1d
	pmull2	v2.1q,v22.2d, v23.2d
	pmull2	v7.1q,v23.2d,v23.2d
	pmull	v1.1q,v16.1d,v18.1d
	pmull	v6.1q,v16.1d,v16.1d

	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	ext	v17.16b,v5.16b,v7.16b,#8
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v16.16b
	eor	v4.16b,v5.16b,v7.16b
	eor	v6.16b,v6.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase
	eor	v6.16b,v6.16b,v4.16b
	pmull	v4.1q,v5.1d,v19.1d

	ins	v2.d[0],v1.d[1]
	ins	v7.d[0],v6.d[1]
	ins	v1.d[1],v0.d[0]
	ins	v6.d[1],v5.d[0]
	eor	v0.16b,v1.16b,v18.16b
	eor	v5.16b,v6.16b,v4.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
	ext	v4.16b,v5.16b,v5.16b,#8
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v5.1q,v5.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v4.16b,v4.16b,v7.16b
	eor	v26.16b,v0.16b,v18.16b		//H^5
	eor	v28.16b,v5.16b,v4.16b		//H^6

	ext	v16.16b,v26.16b, v26.16b,#8		//Karatsuba pre-processing
	ext	v17.16b,v28.16b,v28.16b,#8
	ext	v18.16b,v22.16b,v22.16b,#8
	eor	v16.16b,v16.16b,v26.16b
	eor	v17.16b,v17.16b,v28.16b
	eor	v18.16b,v18.16b,v22.16b
	ext	v27.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v26.2d,v27.2d,v28.2d},[x0],#48		//store Htable[6..8]

	//calculate H^7 and H^8
	// v0/v1/v2 = lo/mid/hi of H^2*H^5, v5/v6/v7 = lo/mid/hi of H^2*H^6.
	pmull	v0.1q,v22.1d,v26.1d
	pmull	v5.1q,v22.1d,v28.1d
	pmull2	v2.1q,v22.2d,v26.2d
	pmull2	v7.1q,v22.2d,v28.2d
	pmull	v1.1q,v16.1d,v18.1d
	pmull	v6.1q,v17.1d,v18.1d

	ext	v16.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	ext	v17.16b,v5.16b,v7.16b,#8
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v16.16b
	eor	v4.16b,v5.16b,v7.16b
	eor	v6.16b,v6.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase
	eor	v6.16b,v6.16b,v4.16b
	pmull	v4.1q,v5.1d,v19.1d

	ins	v2.d[0],v1.d[1]
	ins	v7.d[0],v6.d[1]
	ins	v1.d[1],v0.d[0]
	ins	v6.d[1],v5.d[0]
	eor	v0.16b,v1.16b,v18.16b
	eor	v5.16b,v6.16b,v4.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase
	ext	v4.16b,v5.16b,v5.16b,#8
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v5.1q,v5.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v4.16b,v4.16b,v7.16b
	eor	v29.16b,v0.16b,v18.16b		//H^7
	eor	v31.16b,v5.16b,v4.16b		//H^8

	ext	v16.16b,v29.16b,v29.16b,#8		//Karatsuba pre-processing
	ext	v17.16b,v31.16b,v31.16b,#8
	eor	v16.16b,v16.16b,v29.16b
	eor	v17.16b,v17.16b,v31.16b
	ext	v30.16b,v16.16b,v17.16b,#8		//pack Karatsuba pre-processed
	st1	{v29.2d,v30.2d,v31.2d},[x0]	//store Htable[9..11]
	ret
.size	gcm_init_v8,.-gcm_init_v8

//----------------------------------------------------------------------
// gcm_gmult_v8
//
// Multiplies Xi by the twisted H and writes the reduced result back.
//
// In:    x0 = Xi, 16 bytes, read and then overwritten
//        x1 = Htable; entries 0 and 1 are read (twisted H and the packed
//             Karatsuba entry, in the layout gcm_init_v8 stores)
// Clobb: v0-v3, v17-v21
// Byte order: unless __AARCH64EB__ is defined, rev64 byte-reverses each
// 64-bit half on load and before the store; ext #8 swaps the halves.
//----------------------------------------------------------------------
.globl	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x0]		//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
	shl	v19.2d,v19.2d,#57
#ifndef __AARCH64EB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8

//----------------------------------------------------------------------
// gcm_ghash_v8
//
// Folds input blocks into Xi: each 16-byte block is XORed into the
// running value, which is then multiplied by H and reduced.
//
// In:    x0 = Xi, 16 bytes, read and then overwritten
//        x1 = Htable (entries 0..2 are read here, 0..5 on the 4x path)
//        x2 = inp, input data
//        x3 = len, byte count
// Flow:  len >= 64  -> branch to .Lgcm_ghash_v8_4x
//        otherwise  -> .Loop_mod2x_v8 handles two blocks per iteration
//                      using H^2 and H; .Lodd_tail_v8 handles a single
//                      remaining block using H
// Clobb: x1, x2, x3, x12, condition flags, v0-v7, v16-v22
//        (v0-v7 and v16-v31 when the 4x path is taken)
//
// NOTE(review): len is presumably a non-zero multiple of 16 - confirm
// against the callers. Tracing len == 0: the first subs leaves the "lo"
// condition set, so .Lodd_tail_v8 still loads and hashes one block.
//----------------------------------------------------------------------
.globl	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
	AARCH64_VALID_CALL_TARGET
	cmp	x3,#64
	b.hs	.Lgcm_ghash_v8_4x
	ld1	{v0.2d},[x0]		//load [rotated] Xi
						//"[rotated]" means that
						//loaded value would have
						//to be rotated in order to
						//make it appear as in
						//algorithm specification
	subs	x3,x3,#32		//see if x3 is 32 or larger
	mov	x12,#16		//x12 is used as post-
						//increment for input pointer;
						//as loop is modulo-scheduled
						//x12 is zeroed just in time
						//to preclude overstepping
						//inp[len], which means that
						//last block[s] are actually
						//loaded twice, but last
						//copy is not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
#ifndef __AARCH64EB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8		//rotate I[0]
	// The flags tested here are still those of the subs above: none of
	// the instructions in between writes the condition flags.
	b.lo	.Lodd_tail_v8		//x3 was less than 32
	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
#ifndef __AARCH64EB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	// Branch straight to the loop head so that whatever padding the
	// .align below inserts is not executed.
	b	.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
	// Per iteration: (Xi ^ I[i]) * H^2 is combined with I[i+1] * H (its
	// lo/hi parts were started one iteration earlier in v4/v6), the sum
	// is reduced, and the loads and partial products for the next pair
	// of blocks are interleaved with that reduction.
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32		//is there more data?
	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo			//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b		//accumulate
	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
#ifndef __AARCH64EB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

#ifndef __AARCH64EB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	.Loop_mod2x_v8		//there was at least 32 more bytes

	// Loop exit: undo the early accumulation so that v0 holds the
	// reduced Xi and v3 the rotated last-loaded block again.
	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8		//re-construct v3.16b
	adds	x3,x3,#32		//re-construct x3
	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
	b.eq	.Ldone_v8		//is x3 zero?
.Lodd_tail_v8:
	// Single remaining block: (Xi ^ inp) * H, then reduce.
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

.Ldone_v8:
#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8

//----------------------------------------------------------------------
// gcm_ghash_v8_4x  (local symbol: it has a .type but no .globl)
//
// Four-blocks-per-iteration path, entered by the b.hs at the top of
// gcm_ghash_v8 when len >= 64, with x0-x3 still holding that function's
// arguments (Xi, Htable, inp, len).
//
// Held for the whole path:
//        v20 = twisted H, v21 = packed Karatsuba terms (H, H^2),
//        v22 = H^2, v26 = H^3, v27 = packed Karatsuba terms (H^3, H^4),
//        v28 = H^4, v19 = 0xc2.0 reduction constant
//        v29/v30/v31 = lo/mid/hi sum of the products for blocks 1..3 of
//        the group in flight; block 0 is XORed with Xi and multiplied by
//        H^4 once the previous reduction has produced Xi.
// Length bookkeeping: x3 is lowered by 128 before the loop and by 64 per
// iteration; "adds x3,x3,#64" in the tail then leaves the number of
// bytes not yet loaded (0, 16, 32 or 48 for a multiple-of-16 len), which
// selects .Ldone4x, .Lone, .Ltwo or .Lthree.
// Clobb: x1, x2, x3, condition flags, v0-v7, v16-v31
//----------------------------------------------------------------------
.type	gcm_ghash_v8_4x,%function
.align	4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
	ld1	{v0.2d},[x0]		//load [rotated] Xi
	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant

	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v7.16b,v7.16b
	rev64	v4.16b,v4.16b
#endif
	ext	v25.16b,v7.16b,v7.16b,#8
	ext	v24.16b,v6.16b,v6.16b,#8
	ext	v23.16b,v5.16b,v5.16b,#8

	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
	eor	v7.16b,v7.16b,v25.16b
	pmull2	v31.1q,v20.2d,v25.2d
	pmull	v30.1q,v21.1d,v7.1d

	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
	eor	v6.16b,v6.16b,v24.16b
	pmull2	v24.1q,v22.2d,v24.2d
	pmull2	v6.1q,v21.2d,v6.2d

	eor	v29.16b,v29.16b,v16.16b
	eor	v31.16b,v31.16b,v24.16b
	eor	v30.16b,v30.16b,v6.16b

	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	pmull2	v23.1q,v26.2d,v23.2d
	pmull	v5.1q,v27.1d,v5.1d

	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	eor	v30.16b,v30.16b,v5.16b

	subs	x3,x3,#128
	b.lo	.Ltail4x

	// Branch straight to the loop head so that whatever padding the
	// .align below inserts is not executed.
	b	.Loop4x

.align	4
.Loop4x:
	// Finish the group in flight (block 0 ^ Xi times H^4, plus the
	// v29/v30/v31 sums), reduce it to the new Xi, and meanwhile load the
	// next four blocks and start their three partial products.
	eor	v16.16b,v4.16b,v0.16b
	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
	ext	v3.16b,v16.16b,v16.16b,#8
#ifndef __AARCH64EB__
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v7.16b,v7.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v28.2d,v3.2d
	ext	v25.16b,v7.16b,v7.16b,#8
	pmull2	v1.1q,v27.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	ext	v24.16b,v6.16b,v6.16b,#8
	eor	v1.16b,v1.16b,v30.16b
	ext	v23.16b,v5.16b,v5.16b,#8

	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
	eor	v7.16b,v7.16b,v25.16b
	eor	v1.16b,v1.16b,v17.16b
	pmull2	v31.1q,v20.2d,v25.2d
	eor	v1.16b,v1.16b,v18.16b
	pmull	v30.1q,v21.1d,v7.1d

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
	eor	v6.16b,v6.16b,v24.16b
	pmull2	v24.1q,v22.2d,v24.2d
	eor	v0.16b,v1.16b,v18.16b
	pmull2	v6.1q,v21.2d,v6.2d

	eor	v29.16b,v29.16b,v16.16b
	eor	v31.16b,v31.16b,v24.16b
	eor	v30.16b,v30.16b,v6.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	eor	v18.16b,v18.16b,v2.16b
	pmull2	v23.1q,v26.2d,v23.2d
	pmull	v5.1q,v27.1d,v5.1d

	eor	v0.16b,v0.16b,v18.16b
	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	ext	v0.16b,v0.16b,v0.16b,#8
	eor	v30.16b,v30.16b,v5.16b

	subs	x3,x3,#64
	b.hs	.Loop4x

.Ltail4x:
	// Complete the unreduced product of the last full group of four;
	// v0/v1/v2 = lo/mid/hi on exit from this fragment.
	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v28.2d,v3.2d
	pmull2	v1.1q,v27.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b

	adds	x3,x3,#64
	b.eq	.Ldone4x

	cmp	x3,#32
	b.lo	.Lone
	b.eq	.Ltwo
.Lthree:
	// Three blocks remain: reduce the pending product to Xi, then form
	// (Xi^I0)*H^3 + I1*H^2 + I2*H and leave it unreduced for .Ldone4x.
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d,v5.2d,v6.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef	__AARCH64EB__
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v24.16b,v6.16b,v6.16b,#8
	ext	v23.16b,v5.16b,v5.16b,#8
	eor	v0.16b,v1.16b,v18.16b

	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
	eor	v6.16b,v6.16b,v24.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	pmull2	v31.1q,v20.2d,v24.2d
	pmull	v30.1q,v21.1d,v6.1d
	eor	v0.16b,v0.16b,v18.16b
	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	pmull2	v23.1q,v22.2d,v23.2d
	eor	v16.16b,v4.16b,v0.16b
	pmull2	v5.1q,v21.2d,v5.2d
	ext	v3.16b,v16.16b,v16.16b,#8

	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	eor	v30.16b,v30.16b,v5.16b

	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v26.2d,v3.2d
	pmull	v1.1q,v27.1d,v16.1d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b
	b	.Ldone4x

.align	4
.Ltwo:
	// Two blocks remain: reduce the pending product to Xi, then form
	// (Xi^I0)*H^2 + I1*H and leave it unreduced for .Ldone4x.
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d,v5.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef	__AARCH64EB__
	rev64	v5.16b,v5.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v23.16b,v5.16b,v5.16b,#8
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
	eor	v5.16b,v5.16b,v23.16b

	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull2	v31.1q,v20.2d,v23.2d
	pmull	v30.1q,v21.1d,v5.1d

	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v22.2d,v3.2d
	pmull2	v1.1q,v21.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b
	b	.Ldone4x

.align	4
.Lone:
	// One block remains: reduce the pending product to Xi, then form
	// (Xi^I0)*H and fall through to the final reduction.
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef	__AARCH64EB__
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull	v0.1q,v20.1d,v3.1d
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v20.2d,v3.2d
	pmull	v1.1q,v21.1d,v16.1d

.Ldone4x:
	// Final reduction of v0/v1/v2 (lo/mid/hi), then store Xi.
	ext	v17.16b,v0.16b,v2.16b,#8		//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8		//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
#endif
	st1	{v0.2d},[x0]		//write out Xi

	ret
.size	gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
.section	.rodata
// NUL-terminated ASCII identification string:
// "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif