Lines matching "16" (full-text search):

106 movi v17.16b, #0x0f
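// the #0x0f constant kept in v17 is the nibble mask; every and/ushr pair below uses it
// to split each state byte into its low and high four bits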
133 adr x11, .Lk_mc_forward+16
135 ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
136 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
137 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
138 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
139 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
140 tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
141 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
142 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
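The hits at 135-142 are the input transform: each byte is split into nibbles, each nibble indexes a 16-entry table via tbl (pshufb in the carried-over x86 comments), and the two lookups are xored together with the round-0 key. A minimal scalar sketch of that primitive, assuming placeholder table contents and a made-up helper name:

    #include <stdint.h>

    /* Scalar model of the lo/hi nibble lookup used by the input transform above:
     * tbl_lo/tbl_hi stand in for the two .Lk_ipt halves and rk for the round-0 key.
     * The table contents here are placeholders, not the real constants. */
    static void nibble_lookup_xor(uint8_t out[16], const uint8_t in[16],
                                  const uint8_t tbl_lo[16],
                                  const uint8_t tbl_hi[16],
                                  const uint8_t rk[16])
    {
        for (int i = 0; i < 16; i++) {
            uint8_t lo = in[i] & 0x0f;                 /* and  v1.16b, v7.16b, v17.16b */
            uint8_t hi = in[i] >> 4;                   /* ushr v0.16b, v7.16b, #4      */
            out[i] = tbl_lo[lo] ^ tbl_hi[hi] ^ rk[i];  /* two tbl lookups + two eor    */
        }
    }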
149 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
150 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
151 tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
152 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
153 tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
154 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
155 tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
157 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
158 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
159 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
160 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
161 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
162 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
164 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
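// the tbl/eor ladder above rotates copies of the state (using .Lk_mc_forward and a second
// rotation constant loaded between the matched lines) and folds them together, accumulating
// the MixColumns result ("2A+3B+C+D" in the carried-over comments)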
169 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
170 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
171 tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
172 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
173 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
174 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
175 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
176 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
177 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
178 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
179 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
180 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
181 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
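// per-round entry: the and/ushr split plus the chained tbl/eor lookups above evaluate the
// S-box inversion four bits at a time (the 1/i, 1/j, iak, jak terms in the comments); the
// trailing ld1 pulls the next round key into v16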
187 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
188 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
190 tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
191 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
192 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
193 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
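// last-round tail: the sbou/sbot lookups are keyed with the final round key, and the closing
// tbl applies one more byte permutation (loaded between the matched lines) to undo the
// accumulated row rotations, leaving the ciphertext block in v0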
202 stp x29,x30,[sp,#-16]!
205 ld1 {v7.16b}, [x0]
208 st1 {v0.16b}, [x1]
210 ldp x29,x30,[sp],#16
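The prologue/epilogue and load/store hits at 202-210 belong to the one-block wrapper that callers use directly. A hedged usage sketch, assuming the usual OpenSSL-style prototypes for these entry points (the declarations are not part of this listing):

    #include <openssl/aes.h>

    /* Assumed prototypes for the assembly entry points; the real declarations live in
     * the library's internal headers, not in this listing. */
    int  vpaes_set_encrypt_key(const unsigned char *userKey, int bits, AES_KEY *key);
    void vpaes_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key);

    static void encrypt_one_block(const unsigned char key_bytes[16],
                                  const unsigned char in[16], unsigned char out[16])
    {
        AES_KEY key;
        vpaes_set_encrypt_key(key_bytes, 128, &key); /* expand the 128-bit key    */
        vpaes_encrypt(in, out, &key);                /* encrypt one 16-byte block */
    }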
220 adr x11, .Lk_mc_forward+16
222 ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
223 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
224 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
225 and v9.16b, v15.16b, v17.16b
226 ushr v8.16b, v15.16b, #4
227 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
228 tbl v9.16b, {v20.16b}, v9.16b
229 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
230 tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
231 tbl v10.16b, {v21.16b}, v8.16b
232 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
233 eor v8.16b, v9.16b, v16.16b
234 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
235 eor v8.16b, v8.16b, v10.16b
242 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
243 tbl v12.16b, {v25.16b}, v10.16b
244 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
245 tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
246 tbl v8.16b, {v24.16b}, v11.16b
247 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
248 eor v12.16b, v12.16b, v16.16b
249 tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
250 tbl v13.16b, {v27.16b}, v10.16b
251 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
252 eor v8.16b, v8.16b, v12.16b
253 tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
254 tbl v10.16b, {v26.16b}, v11.16b
256 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
257 tbl v11.16b, {v8.16b}, v1.16b
258 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
259 eor v10.16b, v10.16b, v13.16b
260 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
261 tbl v8.16b, {v8.16b}, v4.16b
262 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
263 eor v11.16b, v11.16b, v10.16b
264 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
265 tbl v12.16b, {v11.16b},v1.16b
266 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
267 eor v8.16b, v8.16b, v11.16b
269 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
270 eor v8.16b, v8.16b, v12.16b
275 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
276 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
277 and v9.16b, v8.16b, v17.16b
278 ushr v8.16b, v8.16b, #4
279 tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
280 tbl v13.16b, {v19.16b},v9.16b
281 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
282 eor v9.16b, v9.16b, v8.16b
283 tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
284 tbl v11.16b, {v18.16b},v8.16b
285 tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
286 tbl v12.16b, {v18.16b},v9.16b
287 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
288 eor v11.16b, v11.16b, v13.16b
289 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
290 eor v12.16b, v12.16b, v13.16b
291 tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
292 tbl v10.16b, {v18.16b},v11.16b
293 tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
294 tbl v11.16b, {v18.16b},v12.16b
295 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
296 eor v10.16b, v10.16b, v9.16b
297 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
298 eor v11.16b, v11.16b, v8.16b
299 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
305 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
306 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
307 tbl v12.16b, {v22.16b}, v10.16b
309 tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
310 tbl v8.16b, {v23.16b}, v11.16b
311 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
312 eor v12.16b, v12.16b, v16.16b
313 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
314 eor v8.16b, v8.16b, v12.16b
315 tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
316 tbl v1.16b, {v8.16b},v1.16b
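// two-block variant: each step on the v0-v7 working set is mirrored on v8-v15, so two blocks
// move through the rounds in lockstep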
324 movi v17.16b, #0x0f
352 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
353 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
354 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
355 tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
357 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
358 tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
359 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
360 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
370 tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
371 tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
372 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
374 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
377 tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
378 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
379 tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
380 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
382 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
385 tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
386 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
387 tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
388 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
390 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
393 tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
394 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
395 tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
396 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
397 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
398 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
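// each decryption round folds a fresh pair of lookups (sb9, sbd, sbb, sbe per the comments)
// into the running "ch" value, re-applying MixColumns through the shuffle in v5, which the
// ext #12 rotates before the next round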
403 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
404 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
405 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
406 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
407 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
408 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
409 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
410 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
411 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
412 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
413 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
414 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
415 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
420 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
423 tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
424 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
425 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
426 tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
435 stp x29,x30,[sp,#-16]!
438 ld1 {v7.16b}, [x0]
441 st1 {v0.16b}, [x1]
443 ldp x29,x30,[sp],#16
463 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
464 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
465 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
466 and v9.16b, v15.16b, v17.16b
467 ushr v8.16b, v15.16b, #4
468 tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2
469 tbl v10.16b, {v20.16b},v9.16b
471 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
472 tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0
473 tbl v8.16b, {v21.16b},v8.16b
474 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
475 eor v10.16b, v10.16b, v16.16b
476 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
477 eor v8.16b, v8.16b, v10.16b
487 tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
488 tbl v12.16b, {v24.16b}, v10.16b
489 tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
490 tbl v9.16b, {v25.16b}, v11.16b
491 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
492 eor v8.16b, v12.16b, v16.16b
494 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
495 eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
498 tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
499 tbl v12.16b, {v26.16b}, v10.16b
500 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
501 tbl v8.16b, {v8.16b},v5.16b
502 tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
503 tbl v9.16b, {v27.16b}, v11.16b
504 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
505 eor v8.16b, v8.16b, v12.16b
507 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
508 eor v8.16b, v8.16b, v9.16b
511 tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
512 tbl v12.16b, {v28.16b}, v10.16b
513 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
514 tbl v8.16b, {v8.16b},v5.16b
515 tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
516 tbl v9.16b, {v29.16b}, v11.16b
517 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
518 eor v8.16b, v8.16b, v12.16b
520 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
521 eor v8.16b, v8.16b, v9.16b
524 tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
525 tbl v12.16b, {v30.16b}, v10.16b
526 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
527 tbl v8.16b, {v8.16b},v5.16b
528 tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
529 tbl v9.16b, {v31.16b}, v11.16b
530 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
531 eor v8.16b, v8.16b, v12.16b
532 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
533 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
534 eor v8.16b, v8.16b, v9.16b
539 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
540 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
541 and v9.16b, v8.16b, v17.16b
542 ushr v8.16b, v8.16b, #4
543 tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
544 tbl v10.16b, {v19.16b},v9.16b
545 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
546 eor v9.16b, v9.16b, v8.16b
547 tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
548 tbl v11.16b, {v18.16b},v8.16b
549 tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
550 tbl v12.16b, {v18.16b},v9.16b
551 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
552 eor v11.16b, v11.16b, v10.16b
553 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
554 eor v12.16b, v12.16b, v10.16b
555 tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
556 tbl v10.16b, {v18.16b},v11.16b
557 tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
558 tbl v11.16b, {v18.16b},v12.16b
559 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
560 eor v10.16b, v10.16b, v9.16b
561 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
562 eor v11.16b, v11.16b, v8.16b
563 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
568 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
569 tbl v12.16b, {v22.16b}, v10.16b
571 tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
572 tbl v9.16b, {v23.16b}, v11.16b
574 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
575 eor v12.16b, v12.16b, v16.16b
576 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
577 eor v8.16b, v9.16b, v12.16b
578 tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0
579 tbl v1.16b, {v8.16b},v2.16b
591 movi v16.16b, #0x5b // .Lk_s63
593 movi v17.16b, #0x0f // .Lk_s0F
609 stp x29, x30, [sp,#-16]!
614 ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)
617 mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
619 mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
632 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
678 ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
680 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
681 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
688 ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0
710 ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
717 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
726 movi v4.16b, #0
727 mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
728 mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
730 mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
754 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute
758 sub x2, x2, #16 // add $-16, %rdx
759 eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0
764 eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
765 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
766 eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
767 eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
768 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
769 eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
770 eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
771 eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
772 ldp x29, x30, [sp],#16
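// the run of self-xors before the epilogue scrubs key material from v0-v7 before the
// key-schedule routine returns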
794 movi v1.16b, #0
798 eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
799 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
800 eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
801 mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
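// 192-bit smear: the xors above propagate the partial schedule words so the register ends up
// holding "b+c+d b+c b a" (per the carried-over comments) for the next schedule round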
828 movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
829 ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1
830 ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8
831 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
835 ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0
842 ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1
843 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
844 ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4
847 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
848 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
849 eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
850 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
851 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
852 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
853 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
854 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
855 eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7
856 tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
857 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
858 tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
859 eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
860 eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
861 tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
862 tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
863 eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
866 eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
867 eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
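// schedule round: the ext #15 pair rotates the round constant out of v8 into the accumulated
// key in v7, the ext #1 byte-rotates the word to be substituted, the ext #12/#8 pairs smear
// v7, the rotated word is pushed through the same nibble-wise S-box lookups as the cipher
// rounds, and the S-box output is xored into v7 (which also picks up the .Lk_s63 bias)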
883 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
884 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0
886 tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
887 // vmovdqa 16(%r11), %xmm1 # hi
888 tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
889 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
919 mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
924 eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4
925 add x2, x2, #16 // add $16, %rdx
926 tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
927 tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
928 tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
929 eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
931 eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3
938 ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi
939 and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo
942 tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
944 tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
945 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
946 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
949 tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
950 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
952 tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
953 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
954 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
957 tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
958 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
960 tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
961 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
964 tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
965 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
967 tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4
969 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
970 eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3
972 sub x2, x2, #16 // add $-16, %rdx
975 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
976 add x8, x8, #64-16 // add $-16, %r8
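// schedule mangle: the encrypting path (add x2, #16) biases the key with .Lk_s63 and smears
// it with repeated shuffles, while the decrypting path walks the table pairs above (the
// decryption key-schedule transform) and steps x2 backwards 16 bytes per round so that
// schedule is written in reverse; a shared tail permutes the result and stores it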
987 stp x29,x30,[sp,#-16]!
989 stp d8,d9,[sp,#-16]! // ABI spec says so
1000 ldp d8,d9,[sp],#16
1001 ldp x29,x30,[sp],#16
1011 stp x29,x30,[sp,#-16]!
1013 stp d8,d9,[sp,#-16]! // ABI spec says so
1019 add x2, x2, #16 // lea 16(%rdx,%rax),%rdx
1028 ldp d8,d9,[sp],#16
1029 ldp x29,x30,[sp],#16
1042 stp x29,x30,[sp,#-16]!
1048 ld1 {v0.16b}, [x4] // load ivec
1054 ld1 {v7.16b}, [x0],#16 // load input
1055 eor v7.16b, v7.16b, v0.16b // xor with ivec
1057 st1 {v0.16b}, [x1],#16 // save output
1058 subs x17, x17, #16
1061 st1 {v0.16b}, [x4] // write ivec
1063 ldp x29,x30,[sp],#16
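The CBC-encrypt hits at 1042-1063 show the usual chaining: the ivec is loaded once, each plaintext block is xored into it, the core encrypts, the result is stored and becomes the next chaining value, and the final block is written back as the new ivec. A scalar sketch of that loop, with the hypothetical block_encrypt() standing in for the cipher core called between the matched lines:

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    /* Sketch of the CBC-encrypt chaining above; block_encrypt() is a stand-in for the
     * one-block cipher core, which is invoked between the matched lines. */
    static void cbc_encrypt_model(uint8_t *out, const uint8_t *in, size_t len,
                                  uint8_t ivec[16],
                                  void (*block_encrypt)(uint8_t blk[16]))
    {
        uint8_t chain[16];
        memcpy(chain, ivec, 16);                   /* ld1 {v0.16b}, [x4]         */
        for (size_t off = 0; off + 16 <= len; off += 16) {
            for (int i = 0; i < 16; i++)
                chain[i] ^= in[off + i];           /* eor v7.16b, v7.16b, v0.16b */
            block_encrypt(chain);                  /* cipher core (not matched)  */
            memcpy(out + off, chain, 16);          /* st1 {v0.16b}, [x1],#16     */
        }
        memcpy(ivec, chain, 16);                   /* st1 {v0.16b}, [x4]         */
    }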
1074 stp x29,x30,[sp,#-16]!
1076 stp d8,d9,[sp,#-16]! // ABI spec says so
1077 stp d10,d11,[sp,#-16]!
1078 stp d12,d13,[sp,#-16]!
1079 stp d14,d15,[sp,#-16]!
1083 ld1 {v6.16b}, [x4] // load ivec
1085 tst x17, #16
1088 ld1 {v7.16b}, [x0], #16 // load input
1090 eor v0.16b, v0.16b, v6.16b // xor with ivec
1091 orr v6.16b, v7.16b, v7.16b // next ivec value
1092 st1 {v0.16b}, [x1], #16
1093 subs x17, x17, #16
1098 ld1 {v14.16b,v15.16b}, [x0], #32
1100 eor v0.16b, v0.16b, v6.16b // xor with ivec
1101 eor v1.16b, v1.16b, v14.16b
1102 orr v6.16b, v15.16b, v15.16b
1103 st1 {v0.16b,v1.16b}, [x1], #32
1108 st1 {v6.16b}, [x4]
1110 ldp d14,d15,[sp],#16
1111 ldp d12,d13,[sp],#16
1112 ldp d10,d11,[sp],#16
1113 ldp d8,d9,[sp],#16
1114 ldp x29,x30,[sp],#16
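// CBC decrypt: an odd leading block goes through the one-block core, then the loop pulls
// ciphertext two blocks at a time, decrypts with the two-block core, xors against the
// previous ciphertext (carried in v6), and writes the last ciphertext back as the ivec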
1123 stp x29,x30,[sp,#-16]!
1125 stp d8,d9,[sp,#-16]! // ABI spec says so
1126 stp d10,d11,[sp,#-16]!
1127 stp d12,d13,[sp,#-16]!
1128 stp d14,d15,[sp,#-16]!
1133 tst x17, #16
1136 ld1 {v7.16b}, [x0],#16
1138 st1 {v0.16b}, [x1],#16
1139 subs x17, x17, #16
1144 ld1 {v14.16b,v15.16b}, [x0], #32
1146 st1 {v0.16b,v1.16b}, [x1], #32
1151 ldp d14,d15,[sp],#16
1152 ldp d12,d13,[sp],#16
1153 ldp d10,d11,[sp],#16
1154 ldp d8,d9,[sp],#16
1155 ldp x29,x30,[sp],#16
1165 stp x29,x30,[sp,#-16]!
1167 stp d8,d9,[sp,#-16]! // ABI spec says so
1168 stp d10,d11,[sp,#-16]!
1169 stp d12,d13,[sp,#-16]!
1170 stp d14,d15,[sp,#-16]!
1175 tst x17, #16
1178 ld1 {v7.16b}, [x0],#16
1180 st1 {v0.16b}, [x1],#16
1181 subs x17, x17, #16
1186 ld1 {v14.16b,v15.16b}, [x0], #32
1188 st1 {v0.16b,v1.16b}, [x1], #32
1193 ldp d14,d15,[sp],#16
1194 ldp d12,d13,[sp],#16
1195 ldp d10,d11,[sp],#16
1196 ldp d8,d9,[sp],#16
1197 ldp x29,x30,[sp],#16
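// the ECB entry points at the end reuse the same one-block and two-block cores, with only
// the odd-block test (tst x17, #16) and the 32-byte load/store loop around them; there is
// no chaining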