; xref: /freebsd/crypto/krb5/src/lib/crypto/builtin/aes/iaesx64.s (revision 7f2fe78b9dd5f51c821d771b63d2e096f6fd49e9)
[bits 64]			; assemble as x86-64 code
[CPU intelnop]			; permit Intel long-NOP encodings for ALIGN padding

; Copyright (c) 2010, Intel Corporation
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
;     * Redistributions of source code must retain the above copyright notice,
;       this list of conditions and the following disclaimer.
;     * Redistributions in binary form must reproduce the above copyright notice,
;       this list of conditions and the following disclaimer in the documentation
;       and/or other materials provided with the distribution.
;     * Neither the name of Intel Corporation nor the names of its contributors
;       may be used to endorse or promote products derived from this software
;       without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
; IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
; INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
; BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
; OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
; ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

; Prefix every exported symbol with "k5_" so the assembled object stays
; inside the krb5 namespace and cannot clash with other AES providers.
%define iEncExpandKey128 k5_iEncExpandKey128
%define iEncExpandKey256 k5_iEncExpandKey256
%define iDecExpandKey128 k5_iDecExpandKey128
%define iDecExpandKey256 k5_iDecExpandKey256
%define iEnc128_CBC      k5_iEnc128_CBC
%define iEnc256_CBC      k5_iEnc256_CBC
%define iDec128_CBC      k5_iDec128_CBC
%define iDec256_CBC      k5_iDec256_CBC

; The function bodies below use the Windows x64 convention (rcx = arg 1,
; rdx = arg 2).  On SysV (Linux) targets, remap rdi/rsi accordingly;
; elsewhere this expands to nothing.
%macro linux_setup 0
%ifdef __linux__
	mov rcx, rdi
	mov rdx, rsi
%endif
%endmacro

; Apply InvMixColumns (aesimc) to one 16-byte round key in memory,
; converting an encryption-schedule entry for use with aesdec.
; Clobbers xmm0/xmm1.
%macro inversekey 1
	movdqu  xmm1,%1
	aesimc	xmm0,xmm1
	movdqu	%1,xmm0
%endmacro

; Final AES decryption round of xmm0 with an aligned round key operand.
%macro aesdeclast1 1
	aesdeclast	xmm0,%1
%endmacro

; Final AES encryption round of xmm0 with an aligned round key operand.
%macro aesenclast1 1
	aesenclast	xmm0,%1
%endmacro

; One AES decryption round of xmm0 with an aligned round key operand.
%macro aesdec1 1
	aesdec	xmm0,%1
%endmacro

; One AES encryption round of xmm0 with an aligned round key operand.
%macro aesenc1 1
	aesenc	xmm0,%1
%endmacro


; Final AES decryption round of xmm0; loads the (possibly unaligned)
; round key through xmm4 first.  Clobbers xmm4.
%macro aesdeclast1_u 1
	movdqu xmm4,%1
	aesdeclast	xmm0,xmm4
%endmacro

; Final AES encryption round of xmm0; loads the (possibly unaligned)
; round key through xmm4 first.  Clobbers xmm4.
%macro aesenclast1_u 1
	movdqu xmm4,%1
	aesenclast	xmm0,xmm4
%endmacro

; One AES decryption round of xmm0; loads the (possibly unaligned)
; round key through xmm4 first.  Clobbers xmm4.
%macro aesdec1_u 1
	movdqu xmm4,%1
	aesdec	xmm0,xmm4
%endmacro

; One AES encryption round of xmm0; loads the (possibly unaligned)
; round key through xmm4 first.  Clobbers xmm4.
%macro aesenc1_u 1
	movdqu xmm4,%1
	aesenc	xmm0,xmm4
%endmacro

; One AES decryption round applied to four blocks (xmm0-xmm3) with the
; same aligned round key.  Clobbers xmm4.
%macro aesdec4 1
	movdqa	xmm4,%1

	aesdec	xmm0,xmm4
	aesdec	xmm1,xmm4
	aesdec	xmm2,xmm4
	aesdec	xmm3,xmm4

%endmacro

; Final AES decryption round applied to four blocks (xmm0-xmm3) with the
; same aligned round key.  Clobbers xmm4.
%macro aesdeclast4 1
	movdqa	xmm4,%1

	aesdeclast	xmm0,xmm4
	aesdeclast	xmm1,xmm4
	aesdeclast	xmm2,xmm4
	aesdeclast	xmm3,xmm4

%endmacro


; One AES encryption round applied to four blocks (xmm0-xmm3) with the
; same aligned round key.  Clobbers xmm4.
%macro aesenc4 1
	movdqa	xmm4,%1

	aesenc	xmm0,xmm4
	aesenc	xmm1,xmm4
	aesenc	xmm2,xmm4
	aesenc	xmm3,xmm4

%endmacro

; Final AES encryption round applied to four blocks (xmm0-xmm3) with the
; same aligned round key.  Clobbers xmm4.
%macro aesenclast4 1
	movdqa	xmm4,%1

	aesenclast	xmm0,xmm4
	aesenclast	xmm1,xmm4
	aesenclast	xmm2,xmm4
	aesenclast	xmm3,xmm4

%endmacro


; XOR the four blocks in xmm0-xmm3 with the four 16-byte blocks at
; memory address %1 (unaligned loads).  Clobbers xmm4.
%macro xor_with_input4 1
	movdqu xmm4,[%1]
	pxor xmm0,xmm4
	movdqu xmm4,[%1+16]
	pxor xmm1,xmm4
	movdqu xmm4,[%1+32]
	pxor xmm2,xmm4
	movdqu xmm4,[%1+48]
	pxor xmm3,xmm4
%endmacro



; Load four 16-byte blocks from address %1 into xmm0-xmm3 and XOR each
; with the aligned round key %2 (whitening step).  Clobbers xmm4.
%macro load_and_xor4 2
	movdqa	xmm4,%2
	movdqu	xmm0,[%1 + 0*16]
	pxor	xmm0,xmm4
	movdqu	xmm1,[%1 + 1*16]
	pxor	xmm1,xmm4
	movdqu	xmm2,[%1 + 2*16]
	pxor	xmm2,xmm4
	movdqu	xmm3,[%1 + 3*16]
	pxor	xmm3,xmm4
%endmacro

; Store the four blocks in xmm0-xmm3 to address %1 (unaligned stores).
%macro store4 1
	movdqu [%1 + 0*16],xmm0
	movdqu [%1 + 1*16],xmm1
	movdqu [%1 + 2*16],xmm2
	movdqu [%1 + 3*16],xmm3
%endmacro

; Copy round key number %3 from a possibly-unaligned schedule at %2 to
; the 16-byte-aligned buffer at %1.  Clobbers xmm4.
%macro copy_round_keys 3
	movdqu xmm4,[%2 + ((%3)*16)]
	movdqa [%1 + ((%3)*16)],xmm4
%endmacro


; One step of AES-192 key expansion; %1 is the byte offset into the
; schedule at rdx where the new round key is stored.
; NOTE(review): not referenced elsewhere in this file — presumably kept
; from the original Intel sources for completeness.
%macro key_expansion_1_192 1
		;; Assumes the xmm3 includes all zeros at this point.
        pshufd xmm2, xmm2, 11111111b
        shufps xmm3, xmm1, 00010000b
        pxor xmm1, xmm3
        shufps xmm3, xmm1, 10001100b
        pxor xmm1, xmm3
		pxor xmm1, xmm2
		movdqu [rdx+%1], xmm1
%endmacro

; Calculate w10 and w11 using calculated w9 and known w4-w5; %1 is the
; byte offset into the schedule at rdx.
; NOTE(review): not referenced elsewhere in this file — presumably kept
; from the original Intel sources for completeness.
%macro key_expansion_2_192 1
		movdqa xmm5, xmm4
		pslldq xmm5, 4
		shufps xmm6, xmm1, 11110000b
		pxor xmm6, xmm5
		pxor xmm4, xmm6
		pshufd xmm7, xmm4, 00001110b
		movdqu [rdx+%1], xmm7
%endmacro


section .rodata
align 16
; pshufb control mask: byte 0xFF (high bit set) zeroes the destination
; byte, so this mask produces { 0, src dword0, src dword1, src dword2 }
; — a 4-byte left shift.  Used by the key-expansion routines below to
; fold a round key into itself.
shuffle_mask:
DD 0FFFFFFFFh
DD 03020100h
DD 07060504h
DD 0B0A0908h



section .text

;-----------------------------------------------------------------------
; key_expansion256 (internal helper, custom register convention)
; Generates the next two 128-bit round keys of an AES-256 schedule.
; In:    xmm1/xmm3 = previous even/odd round keys
;        xmm2 = broadcast source from aeskeygenassist
;        xmm5 = shuffle_mask, rdx = schedule write pointer
; Out:   two round keys stored at [rdx]; rdx advanced by 32;
;        xmm1/xmm3 updated to the new even/odd keys
; Clobb: xmm2, xmm4, flags
;-----------------------------------------------------------------------
align 16
key_expansion256:

    pshufd xmm2, xmm2, 011111111b   ; broadcast SubWord(RotWord(w))^RCON

    movdqa xmm4, xmm1               ; fold previous key into itself:
    pshufb xmm4, xmm5               ; each pshufb shifts xmm4 left 4 more
    pxor xmm1, xmm4                 ; bytes, so xmm1 ^= k<<4 ^ k<<8 ^ k<<12
    pshufb xmm4, xmm5
    pxor xmm1, xmm4
    pshufb xmm4, xmm5
    pxor xmm1, xmm4
    pxor xmm1, xmm2                 ; add the round-constant word

    movdqu [rdx], xmm1              ; store the even round key
    add rdx, 0x10

    aeskeygenassist xmm4, xmm1, 0   ; odd step uses SubWord only (RCON=0)
    pshufd xmm2, xmm4, 010101010b

    movdqa xmm4, xmm3               ; same folding for the odd key
    pshufb xmm4, xmm5
    pxor xmm3, xmm4
    pshufb xmm4, xmm5
    pxor xmm3, xmm4
    pshufb xmm4, xmm5
    pxor xmm3, xmm4
    pxor xmm3, xmm2

    movdqu [rdx], xmm3              ; store the odd round key
    add rdx, 0x10

    ret



;-----------------------------------------------------------------------
; key_expansion128 (internal helper, custom register convention)
; Generates the next 128-bit round key of an AES-128 schedule.
; In:    xmm1 = previous round key
;        xmm2 = aeskeygenassist result, xmm5 = shuffle_mask
;        rdx = schedule write pointer
; Out:   round key stored at [rdx]; xmm1 = new key; rdx advanced by 16
; Clobb: xmm2, xmm3, flags
;-----------------------------------------------------------------------
align 16
key_expansion128:
    pshufd xmm2, xmm2, 0xFF         ; broadcast SubWord(RotWord(w3))^RCON
    movdqa xmm3, xmm1               ; fold previous key into itself:
    pshufb xmm3, xmm5               ; each pshufb shifts xmm3 left 4 more
    pxor xmm1, xmm3                 ; bytes, so xmm1 ^= k<<4 ^ k<<8 ^ k<<12
    pshufb xmm3, xmm5
    pxor xmm1, xmm3
    pshufb xmm3, xmm5
    pxor xmm1, xmm3
    pxor xmm1, xmm2                 ; add the round-constant word

    ; storing the result in the key schedule array
    movdqu [rdx], xmm1
    add rdx, 0x10
    ret




;-----------------------------------------------------------------------
; iEncExpandKey128(const key, expanded_key_schedule)
; Expands a 16-byte AES-128 key into the 11 round keys (0-10) of an
; encryption schedule at the second argument.
; Args: rcx, rdx (Win64), or rdi, rsi remapped by linux_setup (SysV).
;-----------------------------------------------------------------------
align 16
global iEncExpandKey128
iEncExpandKey128:

		linux_setup

        movdqu xmm1, [rcx]    ; loading the key

        movdqu [rdx], xmm1    ; round key 0 is the user key itself

        movdqa xmm5, [shuffle_mask wrt rip]

        add rdx,16

        aeskeygenassist xmm2, xmm1, 0x1     ; Generating round key 1
        call key_expansion128
        aeskeygenassist xmm2, xmm1, 0x2     ; Generating round key 2
        call key_expansion128
        aeskeygenassist xmm2, xmm1, 0x4     ; Generating round key 3
        call key_expansion128
        aeskeygenassist xmm2, xmm1, 0x8     ; Generating round key 4
        call key_expansion128
        aeskeygenassist xmm2, xmm1, 0x10    ; Generating round key 5
        call key_expansion128
        aeskeygenassist xmm2, xmm1, 0x20    ; Generating round key 6
        call key_expansion128
        aeskeygenassist xmm2, xmm1, 0x40    ; Generating round key 7
        call key_expansion128
        aeskeygenassist xmm2, xmm1, 0x80    ; Generating round key 8
        call key_expansion128
        aeskeygenassist xmm2, xmm1, 0x1b    ; Generating round key 9
        call key_expansion128
        aeskeygenassist xmm2, xmm1, 0x36    ; Generating round key 10
        call key_expansion128

		ret



;-----------------------------------------------------------------------
; iDecExpandKey128(const key, expanded_key_schedule)
; Builds an AES-128 decryption schedule: run the encryption expansion,
; then apply InvMixColumns (aesimc) to round keys 1-9 so they can be
; used with aesdec.  Round keys 0 and 10 are used unmodified.
; Args: rcx, rdx (Win64), or rdi, rsi remapped by linux_setup (SysV).
;-----------------------------------------------------------------------
align 16
global iDecExpandKey128
iDecExpandKey128:

	linux_setup
	push rcx
	push rdx
	sub rsp,16+8		; scratch; keeps rsp 16-aligned at the call

	call iEncExpandKey128

	add rsp,16+8
	pop rdx
	pop rcx

	inversekey [rdx + 1*16]
	inversekey [rdx + 2*16]
	inversekey [rdx + 3*16]
	inversekey [rdx + 4*16]
	inversekey [rdx + 5*16]
	inversekey [rdx + 6*16]
	inversekey [rdx + 7*16]
	inversekey [rdx + 8*16]
	inversekey [rdx + 9*16]

	ret



;-----------------------------------------------------------------------
; iDecExpandKey256(const key, expanded_key_schedule)
; Builds an AES-256 decryption schedule: run the encryption expansion,
; then apply InvMixColumns (aesimc) to round keys 1-13 so they can be
; used with aesdec.  Round keys 0 and 14 are used unmodified.
; Args: rcx, rdx (Win64), or rdi, rsi remapped by linux_setup (SysV).
;-----------------------------------------------------------------------
align 16
global iDecExpandKey256
iDecExpandKey256:

	linux_setup
	push rcx
	push rdx
	sub rsp,16+8		; scratch; keeps rsp 16-aligned at the call

	call iEncExpandKey256

	add rsp,16+8
	pop rdx
	pop rcx

	inversekey [rdx + 1*16]
	inversekey [rdx + 2*16]
	inversekey [rdx + 3*16]
	inversekey [rdx + 4*16]
	inversekey [rdx + 5*16]
	inversekey [rdx + 6*16]
	inversekey [rdx + 7*16]
	inversekey [rdx + 8*16]
	inversekey [rdx + 9*16]
	inversekey [rdx + 10*16]
	inversekey [rdx + 11*16]
	inversekey [rdx + 12*16]
	inversekey [rdx + 13*16]

	ret




;-----------------------------------------------------------------------
; iEncExpandKey256(const key, expanded_key_schedule)
; Expands a 32-byte AES-256 key into the 15 round keys (0-14) of an
; encryption schedule at the second argument.
; Args: rcx, rdx (Win64), or rdi, rsi remapped by linux_setup (SysV).
;-----------------------------------------------------------------------
align 16
global iEncExpandKey256
iEncExpandKey256:

	linux_setup

    movdqu xmm1, [rcx]    ; loading the key
    movdqu xmm3, [rcx+16]
    movdqu [rdx], xmm1    ; round keys 0 and 1 are the user key itself
    movdqu [rdx+16], xmm3

    add rdx,32

    movdqa xmm5, [shuffle_mask wrt rip]  ; this mask is used by key_expansion

    aeskeygenassist xmm2, xmm3, 0x1     ; round keys 2,3
    call key_expansion256
    aeskeygenassist xmm2, xmm3, 0x2     ; round keys 4,5
    call key_expansion256
    aeskeygenassist xmm2, xmm3, 0x4     ; round keys 6,7
    call key_expansion256
    aeskeygenassist xmm2, xmm3, 0x8     ; round keys 8,9
    call key_expansion256
    aeskeygenassist xmm2, xmm3, 0x10    ; round keys 10,11
    call key_expansion256
    aeskeygenassist xmm2, xmm3, 0x20    ; round keys 12,13
    call key_expansion256
    aeskeygenassist xmm2, xmm3, 0x40    ; round key 14
;    call key_expansion256

    ; Only one round key remains, so perform just the even half of
    ; key_expansion256 inline (a full call would also emit a 15th key).
    pshufd xmm2, xmm2, 011111111b

    movdqa xmm4, xmm1
    pshufb xmm4, xmm5
    pxor xmm1, xmm4
    pshufb xmm4, xmm5
    pxor xmm1, xmm4
    pshufb xmm4, xmm5
    pxor xmm1, xmm4
    pxor xmm1, xmm2

    movdqu [rdx], xmm1


	ret



;-----------------------------------------------------------------------
; iDec128_CBC(data)
; AES-128 CBC decryption.  The single argument (rcx, or rdi on Linux)
; points to a parameter block laid out as (inferred from the loads):
;   [+0]  input (ciphertext) pointer     [+8]  output (plaintext) pointer
;   [+16] expanded decryption schedule   [+24] IV pointer (updated on exit)
;   [+32] number of 16-byte blocks (dword)
; Processes four blocks per iteration, falling back to single blocks for
; the tail.  If the schedule is not 16-byte aligned it is first copied
; to an aligned stack buffer so movdqa round-key loads are safe.
;-----------------------------------------------------------------------
align 16
global iDec128_CBC
iDec128_CBC:

	linux_setup
	sub rsp,16*16+8		; aligned stack buffer for key-schedule copy

	mov r9,rcx		; keep parameter block for the final IV store
	mov rax,[rcx+24]
	movdqu	xmm5,[rax]	; xmm5 = IV (CBC chaining value)

	mov eax,[rcx+32] ; numblocks
	mov rdx,[rcx]		; rdx = input pointer
	mov r8,[rcx+8]
	mov rcx,[rcx+16]	; rcx = key schedule


	sub r8,rdx		; r8 = out - in, so output is [r8+rdx]


	test eax,eax
	jz end_dec128_CBC

	cmp eax,4
	jl	lp128decsingle_CBC

	test	rcx,0xf		; schedule already aligned?  use it in place
	jz		lp128decfour_CBC

	copy_round_keys rsp,rcx,0
	copy_round_keys rsp,rcx,1
	copy_round_keys rsp,rcx,2
	copy_round_keys rsp,rcx,3
	copy_round_keys rsp,rcx,4
	copy_round_keys rsp,rcx,5
	copy_round_keys rsp,rcx,6
	copy_round_keys rsp,rcx,7
	copy_round_keys rsp,rcx,8
	copy_round_keys rsp,rcx,9
	copy_round_keys rsp,rcx,10
	mov rcx,rsp		; use the aligned copy from here on


; Main loop: decrypt four blocks at a time (keys applied in reverse).
align 16
lp128decfour_CBC:

	test eax,eax
	jz end_dec128_CBC

	cmp eax,4
	jl	lp128decsingle_CBC

	load_and_xor4 rdx, [rcx+10*16]	; whiten with the last round key
	add rdx,16*4
	aesdec4 [rcx+9*16]
	aesdec4 [rcx+8*16]
	aesdec4 [rcx+7*16]
	aesdec4 [rcx+6*16]
	aesdec4 [rcx+5*16]
	aesdec4 [rcx+4*16]
	aesdec4 [rcx+3*16]
	aesdec4 [rcx+2*16]
	aesdec4 [rcx+1*16]
	aesdeclast4 [rcx+0*16]

	; CBC chaining: XOR each decrypted block with the previous
	; ciphertext block (xmm5 holds the block before this group).
	pxor	xmm0,xmm5
	movdqu	xmm4,[rdx - 16*4 + 0*16]
	pxor	xmm1,xmm4
	movdqu	xmm4,[rdx - 16*4 + 1*16]
	pxor	xmm2,xmm4
	movdqu	xmm4,[rdx - 16*4 + 2*16]
	pxor	xmm3,xmm4
	movdqu	xmm5,[rdx - 16*4 + 3*16]	; new chaining value

	sub eax,4
	store4 r8+rdx-(16*4)
	jmp lp128decfour_CBC


; Tail loop: decrypt one block at a time.
	align 16
lp128decsingle_CBC:

	movdqu xmm0, [rdx]
	movdqa	xmm1,xmm0	; save ciphertext for the next chaining step
	movdqu xmm4,[rcx+10*16]
	pxor xmm0, xmm4
	aesdec1_u [rcx+9*16]
	aesdec1_u [rcx+8*16]
	aesdec1_u [rcx+7*16]
	aesdec1_u [rcx+6*16]
	aesdec1_u [rcx+5*16]
	aesdec1_u [rcx+4*16]
	aesdec1_u [rcx+3*16]
	aesdec1_u [rcx+2*16]
	aesdec1_u [rcx+1*16]
	aesdeclast1_u [rcx+0*16]

	pxor	xmm0,xmm5	; chain with previous ciphertext / IV
	movdqa	xmm5,xmm1
	add rdx, 16
	movdqu  [r8 + rdx - 16], xmm0
	dec eax
	jnz lp128decsingle_CBC

end_dec128_CBC:

	mov	   r9,[r9+24]
	movdqu [r9],xmm5	; write back the updated IV
	add rsp,16*16+8
	ret



;-----------------------------------------------------------------------
; iDec256_CBC(data)
; AES-256 CBC decryption.  Same parameter block as iDec128_CBC:
;   [+0]  input pointer   [+8]  output pointer   [+16] key schedule
;   [+24] IV pointer (updated on exit)   [+32] num blocks (dword)
; Four blocks per iteration with a single-block tail; copies the key
; schedule (15 round keys) to an aligned stack buffer if needed.
;-----------------------------------------------------------------------
align 16
global iDec256_CBC
iDec256_CBC:

	linux_setup
	sub rsp,16*16+8		; aligned stack buffer for key-schedule copy

	mov r9,rcx		; keep parameter block for the final IV store
	mov rax,[rcx+24]
	movdqu	xmm5,[rax]	; xmm5 = IV (CBC chaining value)

	mov eax,[rcx+32] ; numblocks
	mov rdx,[rcx]		; rdx = input pointer
	mov r8,[rcx+8]
	mov rcx,[rcx+16]	; rcx = key schedule


	sub r8,rdx		; r8 = out - in, so output is [r8+rdx]

	test eax,eax
	jz end_dec256_CBC

	cmp eax,4
	jl	lp256decsingle_CBC

	test	rcx,0xf		; schedule already aligned?  use it in place
	jz		lp256decfour_CBC

	copy_round_keys rsp,rcx,0
	copy_round_keys rsp,rcx,1
	copy_round_keys rsp,rcx,2
	copy_round_keys rsp,rcx,3
	copy_round_keys rsp,rcx,4
	copy_round_keys rsp,rcx,5
	copy_round_keys rsp,rcx,6
	copy_round_keys rsp,rcx,7
	copy_round_keys rsp,rcx,8
	copy_round_keys rsp,rcx,9
	copy_round_keys rsp,rcx,10
	copy_round_keys rsp,rcx,11
	copy_round_keys rsp,rcx,12
	copy_round_keys rsp,rcx,13
	copy_round_keys rsp,rcx,14
	mov rcx,rsp		; use the aligned copy from here on

; Main loop: decrypt four blocks at a time (keys applied in reverse).
align 16
lp256decfour_CBC:

	test eax,eax
	jz end_dec256_CBC

	cmp eax,4
	jl	lp256decsingle_CBC

	load_and_xor4 rdx, [rcx+14*16]	; whiten with the last round key
	add rdx,16*4
	aesdec4 [rcx+13*16]
	aesdec4 [rcx+12*16]
	aesdec4 [rcx+11*16]
	aesdec4 [rcx+10*16]
	aesdec4 [rcx+9*16]
	aesdec4 [rcx+8*16]
	aesdec4 [rcx+7*16]
	aesdec4 [rcx+6*16]
	aesdec4 [rcx+5*16]
	aesdec4 [rcx+4*16]
	aesdec4 [rcx+3*16]
	aesdec4 [rcx+2*16]
	aesdec4 [rcx+1*16]
	aesdeclast4 [rcx+0*16]

	; CBC chaining: XOR each decrypted block with the previous
	; ciphertext block (xmm5 holds the block before this group).
	pxor	xmm0,xmm5
	movdqu	xmm4,[rdx - 16*4 + 0*16]
	pxor	xmm1,xmm4
	movdqu	xmm4,[rdx - 16*4 + 1*16]
	pxor	xmm2,xmm4
	movdqu	xmm4,[rdx - 16*4 + 2*16]
	pxor	xmm3,xmm4
	movdqu	xmm5,[rdx - 16*4 + 3*16]	; new chaining value

	sub eax,4
	store4 r8+rdx-(16*4)
	jmp lp256decfour_CBC


; Tail loop: decrypt one block at a time.
	align 16
lp256decsingle_CBC:

	movdqu xmm0, [rdx]
	movdqu xmm4,[rcx+14*16]
	movdqa	xmm1,xmm0	; save ciphertext for the next chaining step
	pxor xmm0, xmm4
	aesdec1_u [rcx+13*16]
	aesdec1_u [rcx+12*16]
	aesdec1_u [rcx+11*16]
	aesdec1_u [rcx+10*16]
	aesdec1_u [rcx+9*16]
	aesdec1_u [rcx+8*16]
	aesdec1_u [rcx+7*16]
	aesdec1_u [rcx+6*16]
	aesdec1_u [rcx+5*16]
	aesdec1_u [rcx+4*16]
	aesdec1_u [rcx+3*16]
	aesdec1_u [rcx+2*16]
	aesdec1_u [rcx+1*16]
	aesdeclast1_u [rcx+0*16]

	pxor	xmm0,xmm5	; chain with previous ciphertext / IV
	movdqa	xmm5,xmm1
	add rdx, 16
	movdqu  [r8 + rdx - 16], xmm0
	dec eax
	jnz lp256decsingle_CBC

end_dec256_CBC:

	mov	   r9,[r9+24]
	movdqu [r9],xmm5	; write back the updated IV
	add rsp,16*16+8
	ret



;-----------------------------------------------------------------------
; iEnc128_CBC(data)
; AES-128 CBC encryption.  Same parameter block as the decrypt routines:
;   [+0]  input pointer   [+8]  output pointer   [+16] key schedule
;   [+24] IV pointer (updated on exit)   [+32] num blocks (dword)
; CBC encryption is inherently serial (each block depends on the
; previous ciphertext), so this processes one block per iteration.
; NOTE(review): unlike the decrypt routines, there is no early-out for
; numblocks == 0 — the loop body runs at least once; confirm callers
; never pass zero blocks.
;-----------------------------------------------------------------------
align 16
global iEnc128_CBC
iEnc128_CBC:

	linux_setup
	sub rsp,16*16+8		; aligned stack buffer for key-schedule copy

	mov r9,rcx		; keep parameter block for the final IV store
	mov rax,[rcx+24]
	movdqu xmm1,[rax]	; xmm1 = IV (CBC chaining value)

	mov eax,[rcx+32] ; numblocks
	mov rdx,[rcx]		; rdx = input pointer
	mov r8,[rcx+8]
	mov rcx,[rcx+16]	; rcx = key schedule

	sub r8,rdx		; r8 = out - in, so output is [r8+rdx]


	test	rcx,0xf		; schedule already aligned?  use it in place
	jz		lp128encsingle_CBC

	copy_round_keys rsp,rcx,0
	copy_round_keys rsp,rcx,1
	copy_round_keys rsp,rcx,2
	copy_round_keys rsp,rcx,3
	copy_round_keys rsp,rcx,4
	copy_round_keys rsp,rcx,5
	copy_round_keys rsp,rcx,6
	copy_round_keys rsp,rcx,7
	copy_round_keys rsp,rcx,8
	copy_round_keys rsp,rcx,9
	copy_round_keys rsp,rcx,10
	mov rcx,rsp		; use the aligned copy from here on


	align 16

lp128encsingle_CBC:

	movdqu xmm0, [rdx]
	movdqu xmm4,[rcx+0*16]
	add rdx, 16
	pxor xmm0, xmm1		; chain with previous ciphertext / IV
	pxor xmm0, xmm4		; whiten with round key 0
	aesenc1 [rcx+1*16]
	aesenc1 [rcx+2*16]
	aesenc1 [rcx+3*16]
	aesenc1 [rcx+4*16]
	aesenc1 [rcx+5*16]
	aesenc1 [rcx+6*16]
	aesenc1 [rcx+7*16]
	aesenc1 [rcx+8*16]
	aesenc1 [rcx+9*16]
	aesenclast1 [rcx+10*16]
	movdqa xmm1,xmm0	; ciphertext becomes the next chaining value

		; Store output encrypted data into CIPHERTEXT array
	movdqu  [r8+rdx-16], xmm0
	dec eax
	jnz lp128encsingle_CBC

	mov	   r9,[r9+24]
	movdqu [r9],xmm1	; write back the updated IV
	add rsp,16*16+8
	ret



;-----------------------------------------------------------------------
; iEnc256_CBC(data)
; AES-256 CBC encryption.  Same parameter block as the other routines:
;   [+0]  input pointer   [+8]  output pointer   [+16] key schedule
;   [+24] IV pointer (updated on exit)   [+32] num blocks (dword)
; One block per iteration (CBC encryption is serial); 14 rounds.
; NOTE(review): as in iEnc128_CBC there is no early-out for
; numblocks == 0 — confirm callers never pass zero blocks.
;-----------------------------------------------------------------------
align 16
global iEnc256_CBC
iEnc256_CBC:

	linux_setup
	sub rsp,16*16+8		; aligned stack buffer for key-schedule copy

	mov r9,rcx		; keep parameter block for the final IV store
	mov rax,[rcx+24]
	movdqu xmm1,[rax]	; xmm1 = IV (CBC chaining value)

	mov eax,[rcx+32] ; numblocks
	mov rdx,[rcx]		; rdx = input pointer
	mov r8,[rcx+8]
	mov rcx,[rcx+16]	; rcx = key schedule

	sub r8,rdx		; r8 = out - in, so output is [r8+rdx]

	test	rcx,0xf		; schedule already aligned?  use it in place
	jz		lp256encsingle_CBC

	copy_round_keys rsp,rcx,0
	copy_round_keys rsp,rcx,1
	copy_round_keys rsp,rcx,2
	copy_round_keys rsp,rcx,3
	copy_round_keys rsp,rcx,4
	copy_round_keys rsp,rcx,5
	copy_round_keys rsp,rcx,6
	copy_round_keys rsp,rcx,7
	copy_round_keys rsp,rcx,8
	copy_round_keys rsp,rcx,9
	copy_round_keys rsp,rcx,10
	copy_round_keys rsp,rcx,11
	copy_round_keys rsp,rcx,12
	copy_round_keys rsp,rcx,13
	copy_round_keys rsp,rcx,14
	mov rcx,rsp		; use the aligned copy from here on

	align 16

lp256encsingle_CBC:

	movdqu xmm0, [rdx]
	movdqu xmm4, [rcx+0*16]
	add rdx, 16
	pxor xmm0, xmm1		; chain with previous ciphertext / IV
	pxor xmm0, xmm4		; whiten with round key 0
	aesenc1 [rcx+1*16]
	aesenc1 [rcx+2*16]
	aesenc1 [rcx+3*16]
	aesenc1 [rcx+4*16]
	aesenc1 [rcx+5*16]
	aesenc1 [rcx+6*16]
	aesenc1 [rcx+7*16]
	aesenc1 [rcx+8*16]
	aesenc1 [rcx+9*16]
	aesenc1 [rcx+10*16]
	aesenc1 [rcx+11*16]
	aesenc1 [rcx+12*16]
	aesenc1 [rcx+13*16]
	aesenclast1 [rcx+14*16]
	movdqa xmm1,xmm0	; ciphertext becomes the next chaining value

		; Store output encrypted data into CIPHERTEXT array
	movdqu  [r8+rdx-16], xmm0
	dec eax
	jnz lp256encsingle_CBC

	mov	   r9,[r9+24]
	movdqu [r9],xmm1	; write back the updated IV
	add rsp,16*16+8
	ret

; Mark this file as not needing an executable stack.
%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf64
section .note.GNU-stack noalloc noexec nowrite progbits
%endif