Lines matching the search query +full:1 +full:- +full:16

1 /* Do not modify. This file is auto-generated from vpaes-armv8.pl. */
94 .size _vpaes_consts,.-_vpaes_consts
102 // Fills register %r10 -> .aes_consts (so you can -fPIC)
103 // and %xmm9-%xmm15 as specified below.
110 movi v17.16b, #0x0f
115 .size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
120 // AES-encrypt %xmm0.
124 // %xmm9-%xmm15 as in _vpaes_preheat
128 // Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
129 // Preserves %xmm6 - %xmm8 so you get some local vectors
137 adrp x11, .Lk_mc_forward+16
138 add x11, x11, #:lo12:.Lk_mc_forward+16
140 ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
141 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
142 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
143 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
144 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
145 tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
146 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
147 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
154 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
155 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
156 tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
157 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
158 tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
159 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
160 tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
162 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
163 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
164 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
165 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
166 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
167 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
168 and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
169 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
170 sub w8, w8, #1 // nr--
174 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
175 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
176 tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
177 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
178 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
179 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
180 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
181 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
182 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
183 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
184 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
185 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
186 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
191 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
192 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
193 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
195 tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
196 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
197 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
198 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
200 .size _vpaes_encrypt_core,.-_vpaes_encrypt_core
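
The and/ushr/tbl/eor pattern above is the heart of the vector-permutation approach: tbl can only index a 16-entry table, so every byte is split into its low and high nibble, each nibble indexes its own table, and the two results are XORed together along with the round key. A minimal scalar sketch of what the input-transform lines compute per byte; ipt_lo, ipt_hi and rk are placeholder names for the .Lk_ipt constants (v20/v21) and the round-0 key (v16), not identifiers from the file:

#include <stdint.h>

/* Scalar model of the and/ushr/tbl/tbl/eor/eor input transform above. */
static void ipt_model(const uint8_t in[16], const uint8_t rk[16],
                      const uint8_t ipt_lo[16], const uint8_t ipt_hi[16],
                      uint8_t out[16]) {
    for (int i = 0; i < 16; i++) {
        uint8_t lo = in[i] & 0x0F;           /* and  v1.16b, v7.16b, v17.16b */
        uint8_t hi = in[i] >> 4;             /* ushr v0.16b, v7.16b, #4      */
        out[i] = ipt_lo[lo] ^ ipt_hi[hi]     /* the two tbl lookups          */
               ^ rk[i];                      /* eor with the round key       */
    }
}
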
207 stp x29,x30,[sp,#-16]!
210 ld1 {v7.16b}, [x0]
213 st1 {v0.16b}, [x1]
215 ldp x29,x30,[sp],#16
218 .size vpaes_encrypt,.-vpaes_encrypt
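
From the prologue above, vpaes_encrypt reads the input block through x0 and stores the result through x1, with the expanded key as the remaining argument. The declaration below is an assumption drawn from how OpenSSL calls this symbol, not a line quoted from the file:

#include <openssl/aes.h>

/* Assumed C-level view of the entry point above (x0 = in, x1 = out). */
void vpaes_encrypt(const unsigned char *in, unsigned char *out,
                   const AES_KEY *key);

void encrypt_one_block(const unsigned char pt[16], unsigned char ct[16],
                       const AES_KEY *ks) {
    vpaes_encrypt(pt, ct, ks);   /* one 16-byte block */
}
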
225 adrp x11, .Lk_mc_forward+16
226 add x11, x11, #:lo12:.Lk_mc_forward+16
228 ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
229 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
230 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
231 and v9.16b, v15.16b, v17.16b
232 ushr v8.16b, v15.16b, #4
233 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
234 tbl v9.16b, {v20.16b}, v9.16b
235 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
236 tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
237 tbl v10.16b, {v21.16b}, v8.16b
238 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
239 eor v8.16b, v9.16b, v16.16b
240 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
241 eor v8.16b, v8.16b, v10.16b
248 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
249 tbl v12.16b, {v25.16b}, v10.16b
250 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
251 tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
252 tbl v8.16b, {v24.16b}, v11.16b
253 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
254 eor v12.16b, v12.16b, v16.16b
255 tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
256 tbl v13.16b, {v27.16b}, v10.16b
257 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
258 eor v8.16b, v8.16b, v12.16b
259 tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
260 tbl v10.16b, {v26.16b}, v11.16b
262 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
263 tbl v11.16b, {v8.16b}, v1.16b
264 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
265 eor v10.16b, v10.16b, v13.16b
266 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
267 tbl v8.16b, {v8.16b}, v4.16b
268 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
269 eor v11.16b, v11.16b, v10.16b
270 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
271 tbl v12.16b, {v11.16b},v1.16b
272 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
273 eor v8.16b, v8.16b, v11.16b
274 and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4
275 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
276 eor v8.16b, v8.16b, v12.16b
277 sub w8, w8, #1 // nr--
281 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
282 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
283 and v9.16b, v8.16b, v17.16b
284 ushr v8.16b, v8.16b, #4
285 tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
286 tbl v13.16b, {v19.16b},v9.16b
287 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
288 eor v9.16b, v9.16b, v8.16b
289 tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
290 tbl v11.16b, {v18.16b},v8.16b
291 tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
292 tbl v12.16b, {v18.16b},v9.16b
293 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
294 eor v11.16b, v11.16b, v13.16b
295 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
296 eor v12.16b, v12.16b, v13.16b
297 tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
298 tbl v10.16b, {v18.16b},v11.16b
299 tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
300 tbl v11.16b, {v18.16b},v12.16b
301 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
302 eor v10.16b, v10.16b, v9.16b
303 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
304 eor v11.16b, v11.16b, v8.16b
305 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
310 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
311 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
312 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
313 tbl v12.16b, {v22.16b}, v10.16b
315 tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
316 tbl v8.16b, {v23.16b}, v11.16b
317 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
318 eor v12.16b, v12.16b, v16.16b
319 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
320 eor v8.16b, v8.16b, v12.16b
321 tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
322 tbl v1.16b, {v8.16b},v1.16b
324 .size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x
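
_vpaes_encrypt_2x is the same round function with every vector operation duplicated: v8-v15 shadow v0-v7 so a second, independent block moves through the rounds in lockstep, letting the tbl/eor chains of the two blocks overlap. A scalar sketch of the idea; round_step() is a hypothetical stand-in for one round of the core, not a symbol from the file:

#include <stdint.h>

extern void round_step(uint8_t state[16], const uint8_t rk[16]); /* hypothetical */

static void encrypt_2x_model(uint8_t a[16], uint8_t b[16],
                             const uint8_t rk[][16], int nrounds) {
    for (int r = 0; r < nrounds; r++) {
        round_step(a, rk[r]);   /* block A (v0..v7)  */
        round_step(b, rk[r]);   /* block B (v8..v15), independent of A */
    }
}
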
331 movi v17.16b, #0x0f
339 .size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
362 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
363 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
364 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
365 tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
367 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
368 tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
369 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
370 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
378 // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
379 // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
380 tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
381 tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
382 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
384 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
387 tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
388 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
389 tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
390 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
392 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
395 tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
396 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
397 tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
398 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
400 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
403 tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
404 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
405 tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
406 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
407 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
408 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
409 sub w8, w8, #1 // sub $1,%rax # nr--
413 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
414 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
415 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
416 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
417 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
418 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
419 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
420 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
421 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
422 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
423 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
424 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
425 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
430 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
432 ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
433 tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
434 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
435 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
436 tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
438 .size _vpaes_decrypt_core,.-_vpaes_decrypt_core
445 stp x29,x30,[sp,#-16]!
448 ld1 {v7.16b}, [x0]
451 st1 {v0.16b}, [x1]
453 ldp x29,x30,[sp],#16
456 .size vpaes_decrypt,.-vpaes_decrypt
458 // v14-v15 input, v0-v1 output
475 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
476 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
477 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
478 and v9.16b, v15.16b, v17.16b
479 ushr v8.16b, v15.16b, #4
480 tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2
481 tbl v10.16b, {v20.16b},v9.16b
483 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
484 tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0
485 tbl v8.16b, {v21.16b},v8.16b
486 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
487 eor v10.16b, v10.16b, v16.16b
488 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
489 eor v8.16b, v8.16b, v10.16b
497 // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
498 // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
499 tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
500 tbl v12.16b, {v24.16b}, v10.16b
501 tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
502 tbl v9.16b, {v25.16b}, v11.16b
503 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
504 eor v8.16b, v12.16b, v16.16b
506 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
507 eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
510 tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
511 tbl v12.16b, {v26.16b}, v10.16b
512 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
513 tbl v8.16b, {v8.16b},v5.16b
514 tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
515 tbl v9.16b, {v27.16b}, v11.16b
516 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
517 eor v8.16b, v8.16b, v12.16b
519 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
520 eor v8.16b, v8.16b, v9.16b
523 tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
524 tbl v12.16b, {v28.16b}, v10.16b
525 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
526 tbl v8.16b, {v8.16b},v5.16b
527 tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
528 tbl v9.16b, {v29.16b}, v11.16b
529 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
530 eor v8.16b, v8.16b, v12.16b
532 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
533 eor v8.16b, v8.16b, v9.16b
536 tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
537 tbl v12.16b, {v30.16b}, v10.16b
538 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
539 tbl v8.16b, {v8.16b},v5.16b
540 tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
541 tbl v9.16b, {v31.16b}, v11.16b
542 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
543 eor v8.16b, v8.16b, v12.16b
544 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
545 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
546 eor v8.16b, v8.16b, v9.16b
547 sub w8, w8, #1 // sub $1,%rax # nr--
551 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
552 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
553 and v9.16b, v8.16b, v17.16b
554 ushr v8.16b, v8.16b, #4
555 tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
556 tbl v10.16b, {v19.16b},v9.16b
557 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
558 eor v9.16b, v9.16b, v8.16b
559 tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
560 tbl v11.16b, {v18.16b},v8.16b
561 tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
562 tbl v12.16b, {v18.16b},v9.16b
563 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
564 eor v11.16b, v11.16b, v10.16b
565 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
566 eor v12.16b, v12.16b, v10.16b
567 tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
568 tbl v10.16b, {v18.16b},v11.16b
569 tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
570 tbl v11.16b, {v18.16b},v12.16b
571 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
572 eor v10.16b, v10.16b, v9.16b
573 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
574 eor v11.16b, v11.16b, v8.16b
575 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
580 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
581 tbl v12.16b, {v22.16b}, v10.16b
583 tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
584 tbl v9.16b, {v23.16b}, v11.16b
585 ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
586 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
587 eor v12.16b, v12.16b, v16.16b
588 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
589 eor v8.16b, v9.16b, v12.16b
590 tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0
591 tbl v1.16b, {v8.16b},v2.16b
593 .size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x
604 movi v16.16b, #0x5b // .Lk_s63
607 movi v17.16b, #0x0f // .Lk_s0F
619 .size _vpaes_key_preheat,.-_vpaes_key_preheat
625 stp x29, x30, [sp,#-16]!
630 ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)
633 mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
635 mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
649 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
662 // 128-bit specific part of key schedule.
671 sub x0, x0, #1 // dec %esi
680 // 192-bit specific part of key schedule.
682 // The main body of this schedule is the same as the 128-bit
695 ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
697 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
698 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
703 sub x0, x0, #1 // dec %esi
705 ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0
708 bl _vpaes_schedule_mangle // save key n+1
718 // 256-bit specific part of key schedule.
720 // The structure here is very similar to the 128-bit
727 ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
732 sub x0, x0, #1 // dec %esi
734 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
743 movi v4.16b, #0
744 mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
745 mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
747 mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
773 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute
777 sub x2, x2, #16 // add $-16, %rdx
778 eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0
783 eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
784 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
785 eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
786 eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
787 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
788 eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
789 eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
790 eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
791 ldp x29, x30, [sp],#16
794 .size _vpaes_schedule_core,.-_vpaes_schedule_core
799 // Smear the short, low side in the 192-bit key schedule.
813 movi v1.16b, #0
815 ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
816 ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
817 eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
818 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
819 eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
820 mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
823 .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
841 // Clobbers %xmm1-%xmm4, %r11.
847 movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
848 ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1
849 ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8
850 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
854 ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0
861 ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1
862 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
863 ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4
866 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
867 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
868 eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
869 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
870 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
871 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
872 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
873 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
874 eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7
875 tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
876 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
877 tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
878 eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
879 eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
880 tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
881 tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
882 eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
885 eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
886 eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
888 .size _vpaes_schedule_round,.-_vpaes_schedule_round
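
Loosely, _vpaes_schedule_round is one textbook AES key-expansion step: the ext #1 rotation plays the role of RotWord, the nibble-lookup block computes SubWord in vpaes' transformed basis (with the 0x5b constant folded in), and the vpslldq $4/$8 XOR lines are the running smear across the other words. A plain-C version of the textbook step for the 128-bit case; sub_byte() is a hypothetical S-box helper, not something the assembly exposes:

#include <stdint.h>

extern uint8_t sub_byte(uint8_t b);   /* hypothetical AES S-box lookup */

/* One AES-128 key-expansion round: rk[] holds the previous round key on
 * entry and the next round key on return. */
static void key_expand_round(uint8_t rk[16], uint8_t rcon) {
    uint8_t t[4];
    t[0] = sub_byte(rk[13]) ^ rcon;   /* RotWord + SubWord + Rcon */
    t[1] = sub_byte(rk[14]);
    t[2] = sub_byte(rk[15]);
    t[3] = sub_byte(rk[12]);
    for (int w = 0; w < 4; w++)       /* the "smear": each word XORs in its predecessor */
        for (int i = 0; i < 4; i++)
            rk[4*w + i] ^= (w == 0) ? t[i] : rk[4*(w-1) + i];
}
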
893 // Linear-transform %xmm0 according to tables at (%r11)
902 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
903 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0
905 tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
906 // vmovdqa 16(%r11), %xmm1 # hi
907 tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
908 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
910 .size _vpaes_schedule_transform,.-_vpaes_schedule_transform
915 // Mangle xmm0 from (basis-transformed) standard version
920 // multiply by circulant 0,1,1,1
933 // Clobbers xmm1-xmm5
938 mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
943 eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4
944 add x2, x2, #16 // add $16, %rdx
945 tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
946 tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
947 tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
948 eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
950 eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3
957 ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi
958 and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo
961 tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
963 tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
964 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
965 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
968 tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
969 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
971 tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
972 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
973 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
976 tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
977 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
979 tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
980 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
983 tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
984 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
986 tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4
988 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
989 eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3
991 sub x2, x2, #16 // add $-16, %rdx
994 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
995 add x8, x8, #64-16 // add $-16, %r8
996 and x8, x8, #~(1<<6) // and $0x30, %r8
999 .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
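
The "multiply by circulant 0,1,1,1" in _vpaes_schedule_mangle is the three tbl-by-.Lk_mc_forward lines folded together: the key (already XORed with the 0x5b .Lk_s63 constant) is rotated within each 4-byte column by 1, 2 and 3 bytes, and the three rotations are XORed. A scalar model of that step, assuming .Lk_mc_forward is the rotate-by-one-within-column permutation:

#include <stdint.h>

/* y = rot1(x) ^ rot2(x) ^ rot3(x), rotations taken within each 4-byte column. */
static void circulant_0111(const uint8_t x[16], uint8_t y[16]) {
    for (int c = 0; c < 4; c++)
        for (int i = 0; i < 4; i++)
            y[4*c + i] = x[4*c + ((i + 1) & 3)]
                       ^ x[4*c + ((i + 2) & 3)]
                       ^ x[4*c + ((i + 3) & 3)];
}
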
1006 stp x29,x30,[sp,#-16]!
1008 stp d8,d9,[sp,#-16]! // ABI spec says so
1012 str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
1019 ldp d8,d9,[sp],#16
1020 ldp x29,x30,[sp],#16
1023 .size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
1030 stp x29,x30,[sp,#-16]!
1032 stp d8,d9,[sp,#-16]! // ABI spec says so
1036 str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
1038 add x2, x2, #16 // lea 16(%rdx,%rax),%rdx
1041 mov w3, #1 // mov $1,%ecx
1042 lsr w8, w1, #1 // shr $1,%r8d
1047 ldp d8,d9,[sp],#16
1048 ldp x29,x30,[sp],#16
1051 .size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
1061 stp x29,x30,[sp,#-16]!
1067 ld1 {v0.16b}, [x4] // load ivec
1073 ld1 {v7.16b}, [x0],#16 // load input
1074 eor v7.16b, v7.16b, v0.16b // xor with ivec
1076 st1 {v0.16b}, [x1],#16 // save output
1077 subs x17, x17, #16
1080 st1 {v0.16b}, [x4] // write ivec
1082 ldp x29,x30,[sp],#16
1086 .size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
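
The matched lines of vpaes_cbc_encrypt show standard CBC chaining: each plaintext block is XORed with the running IV before the block cipher, the resulting ciphertext becomes the next IV, and the final IV is written back through x4. A C model of that loop; encrypt_block() is a hypothetical stand-in for the single-block core, with the key-schedule argument omitted for brevity:

#include <stddef.h>
#include <stdint.h>

extern void encrypt_block(const uint8_t in[16], uint8_t out[16]); /* hypothetical */

static void cbc_encrypt_model(const uint8_t *in, uint8_t *out, size_t len,
                              uint8_t ivec[16]) {
    uint8_t x[16];
    while (len >= 16) {
        for (int i = 0; i < 16; i++)
            x[i] = in[i] ^ ivec[i];       /* eor v7, v7, v0: xor with ivec   */
        encrypt_block(x, out);            /* _vpaes_encrypt_core             */
        for (int i = 0; i < 16; i++)
            ivec[i] = out[i];             /* ciphertext becomes the next IV  */
        in += 16; out += 16; len -= 16;   /* subs x17, x17, #16              */
    }
    /* st1 {v0.16b}, [x4]: the final IV is stored back for the caller. */
}
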
1093 stp x29,x30,[sp,#-16]!
1095 stp d8,d9,[sp,#-16]! // ABI spec says so
1096 stp d10,d11,[sp,#-16]!
1097 stp d12,d13,[sp,#-16]!
1098 stp d14,d15,[sp,#-16]!
1102 ld1 {v6.16b}, [x4] // load ivec
1104 tst x17, #16
1107 ld1 {v7.16b}, [x0], #16 // load input
1109 eor v0.16b, v0.16b, v6.16b // xor with ivec
1110 orr v6.16b, v7.16b, v7.16b // next ivec value
1111 st1 {v0.16b}, [x1], #16
1112 subs x17, x17, #16
1117 ld1 {v14.16b,v15.16b}, [x0], #32
1119 eor v0.16b, v0.16b, v6.16b // xor with ivec
1120 eor v1.16b, v1.16b, v14.16b
1121 orr v6.16b, v15.16b, v15.16b
1122 st1 {v0.16b,v1.16b}, [x1], #32
1127 st1 {v6.16b}, [x4]
1129 ldp d14,d15,[sp],#16
1130 ldp d12,d13,[sp],#16
1131 ldp d10,d11,[sp],#16
1132 ldp d8,d9,[sp],#16
1133 ldp x29,x30,[sp],#16
1136 .size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
1142 stp x29,x30,[sp,#-16]!
1144 stp d8,d9,[sp,#-16]! // ABI spec says so
1145 stp d10,d11,[sp,#-16]!
1146 stp d12,d13,[sp,#-16]!
1147 stp d14,d15,[sp,#-16]!
1152 tst x17, #16
1155 ld1 {v7.16b}, [x0],#16
1157 st1 {v0.16b}, [x1],#16
1158 subs x17, x17, #16
1163 ld1 {v14.16b,v15.16b}, [x0], #32
1165 st1 {v0.16b,v1.16b}, [x1], #32
1170 ldp d14,d15,[sp],#16
1171 ldp d12,d13,[sp],#16
1172 ldp d10,d11,[sp],#16
1173 ldp d8,d9,[sp],#16
1174 ldp x29,x30,[sp],#16
1177 .size vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
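
The ECB driver above (and the CBC-decrypt driver before it) has the same shape: tst x17, #16 peels off one odd 16-byte block through the single-block core, after which the remaining length is a multiple of 32 bytes and is processed two blocks at a time through the _2x core. A C model of that dispatch; encrypt_block() and encrypt_2blocks() are hypothetical stand-ins, with the key-schedule argument omitted:

#include <stddef.h>
#include <stdint.h>

extern void encrypt_block(const uint8_t *in, uint8_t *out);    /* hypothetical 1x core */
extern void encrypt_2blocks(const uint8_t *in, uint8_t *out);  /* hypothetical 2x core */

static void ecb_encrypt_model(const uint8_t *in, uint8_t *out, size_t len) {
    if (len & 16) {                   /* tst x17, #16: peel the odd block    */
        encrypt_block(in, out);
        in += 16; out += 16; len -= 16;
    }
    while (len) {                     /* remainder is a multiple of 32 bytes */
        encrypt_2blocks(in, out);     /* _vpaes_encrypt_2x                   */
        in += 32; out += 32; len -= 32;
    }
}
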
1184 stp x29,x30,[sp,#-16]!
1186 stp d8,d9,[sp,#-16]! // ABI spec says so
1187 stp d10,d11,[sp,#-16]!
1188 stp d12,d13,[sp,#-16]!
1189 stp d14,d15,[sp,#-16]!
1194 tst x17, #16
1197 ld1 {v7.16b}, [x0],#16
1199 st1 {v0.16b}, [x1],#16
1200 subs x17, x17, #16
1205 ld1 {v14.16b,v15.16b}, [x0], #32
1207 st1 {v0.16b,v1.16b}, [x1], #32
1212 ldp d14,d15,[sp],#16
1213 ldp d12,d13,[sp],#16
1214 ldp d10,d11,[sp],#16
1215 ldp d8,d9,[sp],#16
1216 ldp x29,x30,[sp],#16
1219 .size vpaes_ecb_decrypt,.-vpaes_ecb_decrypt