; xref: /freebsd/crypto/krb5/src/lib/crypto/builtin/aes/iaesx86.s (revision 7f2fe78b9dd5f51c821d771b63d2e096f6fd49e9)
; Assembler setup: NASM/Yasm (Intel operand order) source for 32-bit x86.
; The exported routines in this file read their arguments from the stack
; and return with a plain `ret`.
bits 32
cpu intelnop
3
4; Copyright (c) 2010, Intel Corporation
5; All rights reserved.
6;
7; Redistribution and use in source and binary forms, with or without
8; modification, are permitted provided that the following conditions are met:
9;
10;     * Redistributions of source code must retain the above copyright notice,
11;       this list of conditions and the following disclaimer.
12;     * Redistributions in binary form must reproduce the above copyright notice,
13;       this list of conditions and the following disclaimer in the documentation
14;       and/or other materials provided with the distribution.
15;     * Neither the name of Intel Corporation nor the names of its contributors
16;       may be used to endorse or promote products derived from this software
17;       without specific prior written permission.
18;
19; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22; IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
23; INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24; BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
27; OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28; ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
; Symbol aliases: the code below is written against the underscore-prefixed
; names; each one expands to the k5_-prefixed symbol that is actually
; declared global and emitted.
%define _iEncExpandKey128	k5_iEncExpandKey128
%define _iEncExpandKey256	k5_iEncExpandKey256
%define _iDecExpandKey128	k5_iDecExpandKey128
%define _iDecExpandKey256	k5_iDecExpandKey256
%define _iEnc128_CBC		k5_iEnc128_CBC
%define _iEnc256_CBC		k5_iEnc256_CBC
%define _iDec128_CBC		k5_iDec128_CBC
%define _iDec256_CBC		k5_iDec256_CBC
38
; inversekey <m128>
; Replace the 16-byte value at %1 with AESIMC of itself.
; Clobbers xmm1 (the original value) and xmm0 (the transformed value).
%macro inversekey 1
	movdqu	xmm1, %1
	aesimc	xmm0, xmm1
	movdqu	%1, xmm0
%endmacro
44
45
; aesdec4 <round key>
; One AESDEC round on the four blocks held in xmm0..xmm3.
; The key is fetched with movdqa, so a memory operand must be 16-byte
; aligned. Clobbers xmm4.
%macro aesdec4 1
	movdqa	xmm4, %1
	aesdec	xmm0, xmm4
	aesdec	xmm1, xmm4
	aesdec	xmm2, xmm4
	aesdec	xmm3, xmm4
%endmacro
55
56
; aesdeclast4 <round key>
; Final AESDECLAST round on the four blocks held in xmm0..xmm3.
; The key is fetched with movdqa, so a memory operand must be 16-byte
; aligned. Clobbers xmm4.
%macro aesdeclast4 1
	movdqa		xmm4, %1
	aesdeclast	xmm0, xmm4
	aesdeclast	xmm1, xmm4
	aesdeclast	xmm2, xmm4
	aesdeclast	xmm3, xmm4
%endmacro
66
67
; aesenc4 <round key>
; One AESENC round on the four blocks held in xmm0..xmm3.
; The key is fetched with movdqa, so a memory operand must be 16-byte
; aligned. Clobbers xmm4.
%macro aesenc4 1
	movdqa	xmm4, %1
	aesenc	xmm0, xmm4
	aesenc	xmm1, xmm4
	aesenc	xmm2, xmm4
	aesenc	xmm3, xmm4
%endmacro
77
; aesenclast4 <round key>
; Final AESENCLAST round on the four blocks held in xmm0..xmm3.
; The key is fetched with movdqa, so a memory operand must be 16-byte
; aligned. Clobbers xmm4.
%macro aesenclast4 1
	movdqa		xmm4, %1
	aesenclast	xmm0, xmm4
	aesenclast	xmm1, xmm4
	aesenclast	xmm2, xmm4
	aesenclast	xmm3, xmm4
%endmacro
87
88
; aesdeclast1 <round key>
; Final decryption round on the single block in xmm0. %1 is handed to
; AESDECLAST as-is, so a memory operand must be 16-byte aligned.
%macro aesdeclast1 1
	aesdeclast	xmm0, %1
%endmacro
92
; aesenclast1 <round key>
; Final encryption round on the single block in xmm0. %1 is handed to
; AESENCLAST as-is, so a memory operand must be 16-byte aligned.
%macro aesenclast1 1
	aesenclast	xmm0, %1
%endmacro
96
; aesdec1 <round key>
; One decryption round on the single block in xmm0. %1 is handed to
; AESDEC as-is, so a memory operand must be 16-byte aligned.
%macro aesdec1 1
	aesdec	xmm0, %1
%endmacro
100
; aesenc1 <round key>
; One encryption round on the single block in xmm0. %1 is handed to
; AESENC as-is, so a memory operand must be 16-byte aligned.
%macro aesenc1 1
	aesenc	xmm0, %1
%endmacro
105
106
; aesdeclast1_u <round key>
; Final decryption round on xmm0 with a key that may be unaligned: the
; key is first fetched with movdqu. Clobbers xmm4.
%macro aesdeclast1_u 1
	movdqu		xmm4, %1
	aesdeclast	xmm0, xmm4
%endmacro
111
; aesenclast1_u <round key>
; Final encryption round on xmm0 with a key that may be unaligned: the
; key is first fetched with movdqu. Clobbers xmm4.
%macro aesenclast1_u 1
	movdqu		xmm4, %1
	aesenclast	xmm0, xmm4
%endmacro
116
; aesdec1_u <round key>
; One decryption round on xmm0 with a key that may be unaligned: the key
; is first fetched with movdqu. Clobbers xmm4.
%macro aesdec1_u 1
	movdqu	xmm4, %1
	aesdec	xmm0, xmm4
%endmacro
121
; aesenc1_u <round key>
; One encryption round on xmm0 with a key that may be unaligned: the key
; is first fetched with movdqu. Clobbers xmm4.
%macro aesenc1_u 1
	movdqu	xmm4, %1
	aesenc	xmm0, xmm4
%endmacro
126
127
; load_and_xor4 <base reg/expr>, <round key>
; Load four consecutive 16-byte blocks from [%1] into xmm0..xmm3 (unaligned
; loads) and XOR each with the round key %2. The key is fetched with
; movdqa, so a memory operand must be 16-byte aligned. Leaves the key in xmm4.
%macro load_and_xor4 2
	movdqa	xmm4, %2
	movdqu	xmm0, [%1 + 0*16]
	movdqu	xmm1, [%1 + 1*16]
	movdqu	xmm2, [%1 + 2*16]
	movdqu	xmm3, [%1 + 3*16]
	pxor	xmm0, xmm4
	pxor	xmm1, xmm4
	pxor	xmm2, xmm4
	pxor	xmm3, xmm4
%endmacro
139
140
; xor_with_input4 <base reg/expr>
; XOR xmm0..xmm3 with the four consecutive 16-byte blocks at [%1]
; (unaligned loads). xmm4 is the scratch register and ends up holding
; the block at [%1+48].
%macro xor_with_input4 1
	movdqu	xmm4, [%1]
	pxor	xmm0, xmm4
	movdqu	xmm4, [%1+16]
	pxor	xmm1, xmm4
	movdqu	xmm4, [%1+32]
	pxor	xmm2, xmm4
	movdqu	xmm4, [%1+48]
	pxor	xmm3, xmm4
%endmacro
151
; store4 <base reg/expr>
; Write xmm0..xmm3 to four consecutive 16-byte slots starting at [%1]
; using unaligned stores.
%macro store4 1
	movdqu	[%1 + 0*16], xmm0
	movdqu	[%1 + 1*16], xmm1
	movdqu	[%1 + 2*16], xmm2
	movdqu	[%1 + 3*16], xmm3
%endmacro
158
159
; copy_round_keys <dst base>, <src base>, <index>
; Copy round key number %3 from the (possibly unaligned) array at %2 to
; the array at %1. The store uses movdqa, so %1 must be 16-byte aligned.
; Clobbers xmm4.
%macro copy_round_keys 3
	movdqu	xmm4, [%2 + ((%3)*16)]
	movdqa	[%1 + ((%3)*16)], xmm4
%endmacro
164
; copy_round_keyx <dst operand>, <src base>, <index>
; Load round key number %3 from the (possibly unaligned) array at %2 and
; move it to the operand %1 with movdqa (an xmm register or a 16-byte
; aligned memory operand). Clobbers xmm4.
%macro copy_round_keyx 3
	movdqu	xmm4, [%2 + ((%3)*16)]
	movdqa	%1, xmm4
%endmacro
170
171
172
; key_expansion_1_192 <byte offset>
; AES-192 key-schedule step: updates xmm1 in place and stores it at
; [edx+%1].
; In:  xmm1 = words being extended, xmm2 = AESKEYGENASSIST output,
;      xmm3 = all zeros (precondition stated by the original author).
; Clobbers xmm2 and xmm3.
%macro key_expansion_1_192 1
	pshufd	xmm2, xmm2, 0xFF		; broadcast dword 3 of xmm2
	shufps	xmm3, xmm1, 0x10
	pxor	xmm1, xmm3
	shufps	xmm3, xmm1, 0x8C
	pxor	xmm1, xmm3
	pxor	xmm1, xmm2
	movdqu	[edx+%1], xmm1
%endmacro
183
; key_expansion_2_192 <byte offset>
; Original author's note: calculate w10 and w11 using calculated w9 and
; known w4-w5.
; Updates xmm4 and xmm6 in place, then stores a shuffle of xmm4
; (dwords 2,3,0,0) at [edx+%1]. Clobbers xmm5 and xmm7.
%macro key_expansion_2_192 1
	movdqa	xmm5, xmm4
	pslldq	xmm5, 4				; byte shift left by one dword
	shufps	xmm6, xmm1, 0xF0
	pxor	xmm6, xmm5
	pxor	xmm4, xmm6
	pshufd	xmm7, xmm4, 0x0E
	movdqu	[edx+%1], xmm7
%endmacro
194
195
196
197
198
section .rodata
align 16
; PSHUFB control used by the key-expansion helpers. Bytes 0..3 have the
; high bit set (PSHUFB writes zero there); bytes 4..15 select source bytes
; 0..11. The net effect is a left shift of the register by four bytes.
shuffle_mask:
	db	0xFF, 0xFF, 0xFF, 0xFF
	db	0x00, 0x01, 0x02, 0x03
	db	0x04, 0x05, 0x06, 0x07
	db	0x08, 0x09, 0x0A, 0x0B


section .text
209
210
211
;-----------------------------------------------------------------------
; key_expansion256 -- file-internal helper (not declared global): produce
; the next two round keys of a 256-bit key schedule.
; In:   xmm1 = round key two positions back
;       xmm3 = previous round key
;       xmm2 = AESKEYGENASSIST result computed by the caller
;       xmm5 = shuffle_mask (4-byte left shift control for PSHUFB)
;       edx  = address where the next round key is stored
; Out:  xmm1, xmm3 = the two new round keys, also stored at [edx], [edx+16]
;       edx advanced by 32
; Clobbers: xmm2, xmm4
;-----------------------------------------------------------------------
align 16
key_expansion256:
	pshufd	xmm2, xmm2, 0xFF		; broadcast dword 3 of the assist value

	; xmm1 ^= (xmm1 << 32) ^ (xmm1 << 64) ^ (xmm1 << 96), then mix in xmm2
	movdqa	xmm4, xmm1
	pshufb	xmm4, xmm5
	pxor	xmm1, xmm4
	pshufb	xmm4, xmm5
	pxor	xmm1, xmm4
	pshufb	xmm4, xmm5
	pxor	xmm1, xmm4
	pxor	xmm1, xmm2

	movdqu	[edx], xmm1
	add	edx, 0x10

	; second half: assist value derived from the key just produced
	aeskeygenassist xmm4, xmm1, 0
	pshufd	xmm2, xmm4, 0xAA		; broadcast dword 2 of the assist value

	movdqa	xmm4, xmm3
	pshufb	xmm4, xmm5
	pxor	xmm3, xmm4
	pshufb	xmm4, xmm5
	pxor	xmm3, xmm4
	pshufb	xmm4, xmm5
	pxor	xmm3, xmm4
	pxor	xmm3, xmm2

	movdqu	[edx], xmm3
	add	edx, 0x10

	ret
245
246
247
;-----------------------------------------------------------------------
; key_expansion128 -- file-internal helper (not declared global): produce
; the next round key of a 128-bit key schedule.
; In:   xmm1 = previous round key
;       xmm2 = AESKEYGENASSIST result computed by the caller
;       xmm5 = shuffle_mask (4-byte left shift control for PSHUFB)
;       edx  = address where the new round key is stored
; Out:  xmm1 = new round key, also stored at [edx]; edx advanced by 16
; Clobbers: xmm2, xmm3
;-----------------------------------------------------------------------
align 16
key_expansion128:
	pshufd	xmm2, xmm2, 0xFF		; broadcast dword 3 of the assist value

	; xmm1 ^= (xmm1 << 32) ^ (xmm1 << 64) ^ (xmm1 << 96), then mix in xmm2
	movdqa	xmm3, xmm1
	pshufb	xmm3, xmm5
	pxor	xmm1, xmm3
	pshufb	xmm3, xmm5
	pxor	xmm1, xmm3
	pshufb	xmm3, xmm5
	pxor	xmm1, xmm3
	pxor	xmm1, xmm2

	movdqu	[edx], xmm1			; append to the key schedule
	add	edx, 0x10
	ret
264
265
266
;-----------------------------------------------------------------------
; _iEncExpandKey128 -- expand a 128-bit key into 11 round keys.
; In:   [esp+4] = pointer to the 16-byte key (may be unaligned)
;       [esp+8] = pointer to the output schedule; 11*16 bytes are written
;                 with unaligned stores
; Clobbers: ecx, edx, xmm1, xmm2, xmm3, xmm5, flags
;-----------------------------------------------------------------------
align 16
global _iEncExpandKey128
_iEncExpandKey128:
	mov	ecx, [esp+4]			; ecx = input key
	mov	edx, [esp+8]			; edx = output schedule

	movdqu	xmm1, [ecx]
	movdqu	[edx], xmm1			; round key 0 is the key itself

	; call/pop yields this code's run-time address, so shuffle_mask can be
	; addressed relative to it without an absolute reference.
	call	.pc
.pc:
	pop	ecx
	movdqa	xmm5, [ecx-.pc+shuffle_mask]

	add	edx, 16

	aeskeygenassist xmm2, xmm1, 0x1		; round key 1
	call	key_expansion128
	aeskeygenassist xmm2, xmm1, 0x2		; round key 2
	call	key_expansion128
	aeskeygenassist xmm2, xmm1, 0x4		; round key 3
	call	key_expansion128
	aeskeygenassist xmm2, xmm1, 0x8		; round key 4
	call	key_expansion128
	aeskeygenassist xmm2, xmm1, 0x10	; round key 5
	call	key_expansion128
	aeskeygenassist xmm2, xmm1, 0x20	; round key 6
	call	key_expansion128
	aeskeygenassist xmm2, xmm1, 0x40	; round key 7
	call	key_expansion128
	aeskeygenassist xmm2, xmm1, 0x80	; round key 8
	call	key_expansion128
	aeskeygenassist xmm2, xmm1, 0x1b	; round key 9
	call	key_expansion128
	aeskeygenassist xmm2, xmm1, 0x36	; round key 10
	call	key_expansion128

	ret
307
308
309
;-----------------------------------------------------------------------
; _iDecExpandKey128 -- build the 128-bit decryption key schedule.
; In:   [esp+4] = pointer to the 16-byte key
;       [esp+8] = pointer to the output schedule (11 round keys)
; Runs _iEncExpandKey128 with the same two arguments, then passes round
; keys 1..9 through AESIMC in place. Round keys 0 and 10 stay as the
; encryption schedule produced them.
; Clobbers: everything _iEncExpandKey128 clobbers, plus xmm0.
;-----------------------------------------------------------------------
align 16
global _iDecExpandKey128
_iDecExpandKey128:
	push	dword [esp+8]			; second argument (schedule)
	push	dword [esp+8]			; first argument (esp moved by the push above)

	call	_iEncExpandKey128
	add	esp, 8				; drop the two forwarded arguments

	mov	edx, [esp+8]			; edx = schedule

	inversekey [edx + 1*16]
	inversekey [edx + 2*16]
	inversekey [edx + 3*16]
	inversekey [edx + 4*16]
	inversekey [edx + 5*16]
	inversekey [edx + 6*16]
	inversekey [edx + 7*16]
	inversekey [edx + 8*16]
	inversekey [edx + 9*16]

	ret
332
333
334
;-----------------------------------------------------------------------
; _iDecExpandKey256 -- build the 256-bit decryption key schedule.
; In:   [esp+4] = pointer to the 32-byte key
;       [esp+8] = pointer to the output schedule (15 round keys)
; Runs _iEncExpandKey256 with the same two arguments, then passes round
; keys 1..13 through AESIMC in place. Round keys 0 and 14 stay as the
; encryption schedule produced them.
; Clobbers: everything _iEncExpandKey256 clobbers, plus xmm0.
;-----------------------------------------------------------------------
align 16
global _iDecExpandKey256
_iDecExpandKey256:
	push	dword [esp+8]			; second argument (schedule)
	push	dword [esp+8]			; first argument (esp moved by the push above)

	call	_iEncExpandKey256
	add	esp, 8				; drop the two forwarded arguments

	mov	edx, [esp+8]			; edx = schedule

	inversekey [edx + 1*16]
	inversekey [edx + 2*16]
	inversekey [edx + 3*16]
	inversekey [edx + 4*16]
	inversekey [edx + 5*16]
	inversekey [edx + 6*16]
	inversekey [edx + 7*16]
	inversekey [edx + 8*16]
	inversekey [edx + 9*16]
	inversekey [edx + 10*16]
	inversekey [edx + 11*16]
	inversekey [edx + 12*16]
	inversekey [edx + 13*16]

	ret
361
362
363
364
;-----------------------------------------------------------------------
; _iEncExpandKey256 -- expand a 256-bit key into 15 round keys.
; In:   [esp+4] = pointer to the 32-byte key (may be unaligned)
;       [esp+8] = pointer to the output schedule; 15*16 bytes are written
;                 with unaligned stores
; Clobbers: ecx, edx, xmm1, xmm2, xmm3, xmm4, xmm5, flags
;-----------------------------------------------------------------------
align 16
global _iEncExpandKey256
_iEncExpandKey256:
	mov	ecx, [esp+4]			; ecx = input key
	mov	edx, [esp+8]			; edx = output schedule

	movdqu	xmm1, [ecx]			; low half of the key
	movdqu	xmm3, [ecx+16]			; high half of the key
	movdqu	[edx], xmm1			; round keys 0 and 1 are the key itself
	movdqu	[edx+16], xmm3

	add	edx, 32

	; call/pop yields this code's run-time address, so shuffle_mask can be
	; addressed relative to it without an absolute reference.
	call	.pc
.pc:
	pop	ecx
	movdqa	xmm5, [ecx-.pc+shuffle_mask]	; mask consumed by key_expansion256

	; each call appends two round keys (2..13)
	aeskeygenassist xmm2, xmm3, 0x1
	call	key_expansion256
	aeskeygenassist xmm2, xmm3, 0x2
	call	key_expansion256
	aeskeygenassist xmm2, xmm3, 0x4
	call	key_expansion256
	aeskeygenassist xmm2, xmm3, 0x8
	call	key_expansion256
	aeskeygenassist xmm2, xmm3, 0x10
	call	key_expansion256
	aeskeygenassist xmm2, xmm3, 0x20
	call	key_expansion256

	; Round key 14: only one more key is needed, so the first half of
	; key_expansion256 is repeated inline instead of calling it.
	aeskeygenassist xmm2, xmm3, 0x40
	pshufd	xmm2, xmm2, 0xFF

	movdqa	xmm4, xmm1
	pshufb	xmm4, xmm5
	pxor	xmm1, xmm4
	pshufb	xmm4, xmm5
	pxor	xmm1, xmm4
	pshufb	xmm4, xmm5
	pxor	xmm1, xmm4
	pxor	xmm1, xmm2

	movdqu	[edx], xmm1

	ret
414
415
416
;-----------------------------------------------------------------------
; _iDec128_CBC -- CBC decryption with an 11-round-key (128-bit) schedule.
; In:   [esp+4] = pointer to an argument block, read as:
;         +0   source pointer (input blocks)
;         +4   destination pointer (output blocks)
;         +8   pointer to the decryption round keys
;         +12  pointer to the 16-byte chaining value (IV)
;         +16  number of 16-byte blocks
; Out:  destination written; the chaining value at +12 is overwritten with
;       the last input block (left unchanged when the count is zero).
; Saves/restores esi, edi, ebp. Clobbers eax, ecx, xmm0-xmm5, flags.
;-----------------------------------------------------------------------
align 16
global _iDec128_CBC
_iDec128_CBC:
	mov	ecx, [esp+4]			; ecx = argument block

	push	esi
	push	edi
	push	ebp
	mov	ebp, esp			; remember esp for the epilogue
	sub	esp, 16*16			; room for an aligned copy of the round keys
	and	esp, 0xfffffff0			; 16-byte align that area

	mov	eax, [ecx+12]
	movdqu	xmm5, [eax]			; xmm5 = chaining value

	mov	eax, [ecx+16]			; eax = blocks remaining
	mov	esi, [ecx]			; esi = source cursor
	mov	edi, [ecx+4]
	mov	ecx, [ecx+8]			; ecx = round keys

	sub	edi, esi			; edi = dst - src, so output lives at [esi+edi]

	test	eax, eax
	jz	.done

	cmp	eax, 4
	jl	.single				; short input: the unaligned-safe path suffices

	test	ecx, 0xf
	jz	.four				; keys already aligned, use them in place

	; The 4-wide macros load keys with movdqa, so take an aligned copy.
	copy_round_keys esp, ecx, 0
	copy_round_keys esp, ecx, 1
	copy_round_keys esp, ecx, 2
	copy_round_keys esp, ecx, 3
	copy_round_keys esp, ecx, 4
	copy_round_keys esp, ecx, 5
	copy_round_keys esp, ecx, 6
	copy_round_keys esp, ecx, 7
	copy_round_keys esp, ecx, 8
	copy_round_keys esp, ecx, 9
	copy_round_keys esp, ecx, 10
	mov	ecx, esp


align 16
.four:						; four blocks per iteration

	test	eax, eax
	jz	.done

	cmp	eax, 4
	jl	.single

	load_and_xor4 esi, [ecx+10*16]
	add	esi, 16*4
	aesdec4 [ecx+9*16]
	aesdec4 [ecx+8*16]
	aesdec4 [ecx+7*16]
	aesdec4 [ecx+6*16]
	aesdec4 [ecx+5*16]
	aesdec4 [ecx+4*16]
	aesdec4 [ecx+3*16]
	aesdec4 [ecx+2*16]
	aesdec4 [ecx+1*16]
	aesdeclast4 [ecx+0*16]

	; CBC chaining: each block is XORed with the input block before it.
	; The inputs are re-read before store4, so dst may equal src.
	pxor	xmm0, xmm5
	movdqu	xmm4, [esi- 16*4 + 0*16]
	pxor	xmm1, xmm4
	movdqu	xmm4, [esi- 16*4 + 1*16]
	pxor	xmm2, xmm4
	movdqu	xmm4, [esi- 16*4 + 2*16]
	pxor	xmm3, xmm4
	movdqu	xmm5, [esi- 16*4 + 3*16]	; next chaining value

	sub	eax, 4
	store4 esi+edi-(16*4)
	jmp	.four


	align 16
.single:					; one block per iteration, unaligned key loads

	movdqu	xmm0, [esi]
	movdqa	xmm1, xmm0			; keep the input block for chaining
	movdqu	xmm4, [ecx+10*16]
	pxor	xmm0, xmm4
	aesdec1_u [ecx+9*16]
	aesdec1_u [ecx+8*16]
	aesdec1_u [ecx+7*16]
	aesdec1_u [ecx+6*16]
	aesdec1_u [ecx+5*16]
	aesdec1_u [ecx+4*16]
	aesdec1_u [ecx+3*16]
	aesdec1_u [ecx+2*16]
	aesdec1_u [ecx+1*16]
	aesdeclast1_u [ecx+0*16]

	pxor	xmm0, xmm5
	movdqa	xmm5, xmm1

	add	esi, 16
	movdqu	[edi+esi - 16], xmm0
	dec	eax
	jnz	.single

.done:

	mov	esp, ebp
	pop	ebp
	pop	edi
	pop	esi

	mov	ecx, [esp+4]			; argument block again
	mov	ecx, [ecx+12]
	movdqu	[ecx], xmm5			; hand the chaining value back to the caller

	ret
536
537
538
;-----------------------------------------------------------------------
; _iDec256_CBC -- CBC decryption with a 15-round-key (256-bit) schedule.
; In:   [esp+4] = pointer to an argument block, read as:
;         +0   source pointer (input blocks)
;         +4   destination pointer (output blocks)
;         +8   pointer to the decryption round keys
;         +12  pointer to the 16-byte chaining value (IV)
;         +16  number of 16-byte blocks
; Out:  destination written; the chaining value at +12 is overwritten with
;       the last input block (left unchanged when the count is zero).
; Saves/restores esi, edi, ebp. Clobbers eax, ecx, xmm0-xmm5, flags.
;-----------------------------------------------------------------------
align 16
global _iDec256_CBC
_iDec256_CBC:
	mov	ecx, [esp+4]			; ecx = argument block

	push	esi
	push	edi
	push	ebp
	mov	ebp, esp			; remember esp for the epilogue

	sub	esp, 16*16			; room for an aligned copy of the round keys
	and	esp, 0xfffffff0			; 16-byte align that area

	mov	eax, [ecx+12]
	movdqu	xmm5, [eax]			; xmm5 = chaining value

	mov	eax, [ecx+16]			; eax = blocks remaining
	mov	esi, [ecx]			; esi = source cursor
	mov	edi, [ecx+4]
	mov	ecx, [ecx+8]			; ecx = round keys

	sub	edi, esi			; edi = dst - src, so output lives at [esi+edi]

	test	eax, eax
	jz	.done

	cmp	eax, 4
	jl	.single				; short input: the unaligned-safe path suffices

	test	ecx, 0xf
	jz	.four				; keys already aligned, use them in place

	; The 4-wide macros load keys with movdqa, so take an aligned copy.
	copy_round_keys esp, ecx, 0
	copy_round_keys esp, ecx, 1
	copy_round_keys esp, ecx, 2
	copy_round_keys esp, ecx, 3
	copy_round_keys esp, ecx, 4
	copy_round_keys esp, ecx, 5
	copy_round_keys esp, ecx, 6
	copy_round_keys esp, ecx, 7
	copy_round_keys esp, ecx, 8
	copy_round_keys esp, ecx, 9
	copy_round_keys esp, ecx, 10
	copy_round_keys esp, ecx, 11
	copy_round_keys esp, ecx, 12
	copy_round_keys esp, ecx, 13
	copy_round_keys esp, ecx, 14
	mov	ecx, esp

align 16
.four:						; four blocks per iteration

	test	eax, eax
	jz	.done

	cmp	eax, 4
	jl	.single

	load_and_xor4 esi, [ecx+14*16]
	add	esi, 16*4
	aesdec4 [ecx+13*16]
	aesdec4 [ecx+12*16]
	aesdec4 [ecx+11*16]
	aesdec4 [ecx+10*16]
	aesdec4 [ecx+9*16]
	aesdec4 [ecx+8*16]
	aesdec4 [ecx+7*16]
	aesdec4 [ecx+6*16]
	aesdec4 [ecx+5*16]
	aesdec4 [ecx+4*16]
	aesdec4 [ecx+3*16]
	aesdec4 [ecx+2*16]
	aesdec4 [ecx+1*16]
	aesdeclast4 [ecx+0*16]

	; CBC chaining: each block is XORed with the input block before it.
	; The inputs are re-read before store4, so dst may equal src.
	pxor	xmm0, xmm5
	movdqu	xmm4, [esi- 16*4 + 0*16]
	pxor	xmm1, xmm4
	movdqu	xmm4, [esi- 16*4 + 1*16]
	pxor	xmm2, xmm4
	movdqu	xmm4, [esi- 16*4 + 2*16]
	pxor	xmm3, xmm4
	movdqu	xmm5, [esi- 16*4 + 3*16]	; next chaining value

	sub	eax, 4
	store4 esi+edi-(16*4)
	jmp	.four


	align 16
.single:					; one block per iteration, unaligned key loads

	movdqu	xmm0, [esi]
	movdqa	xmm1, xmm0			; keep the input block for chaining
	movdqu	xmm4, [ecx+14*16]
	pxor	xmm0, xmm4
	aesdec1_u [ecx+13*16]
	aesdec1_u [ecx+12*16]
	aesdec1_u [ecx+11*16]
	aesdec1_u [ecx+10*16]
	aesdec1_u [ecx+9*16]
	aesdec1_u [ecx+8*16]
	aesdec1_u [ecx+7*16]
	aesdec1_u [ecx+6*16]
	aesdec1_u [ecx+5*16]
	aesdec1_u [ecx+4*16]
	aesdec1_u [ecx+3*16]
	aesdec1_u [ecx+2*16]
	aesdec1_u [ecx+1*16]
	aesdeclast1_u [ecx+0*16]

	pxor	xmm0, xmm5
	movdqa	xmm5, xmm1

	add	esi, 16
	movdqu	[edi+esi - 16], xmm0
	dec	eax
	jnz	.single

.done:

	mov	esp, ebp
	pop	ebp
	pop	edi
	pop	esi

	mov	ecx, [esp+4]			; argument block again
	mov	ecx, [ecx+12]
	movdqu	[ecx], xmm5			; hand the chaining value back to the caller

	ret
671
672
673
;-----------------------------------------------------------------------
; _iEnc128_CBC -- CBC encryption with an 11-round-key (128-bit) schedule.
; In:   [esp+4] = pointer to an argument block, read as:
;         +0   source pointer (input blocks)
;         +4   destination pointer (output blocks)
;         +8   pointer to the encryption round keys
;         +12  pointer to the 16-byte chaining value (IV)
;         +16  number of 16-byte blocks
; Out:  destination written; the chaining value at +12 is overwritten with
;       the last output block (left unchanged when the count is zero).
; Saves/restores esi, edi, ebp. Clobbers eax, ecx, xmm0, xmm1, xmm4, flags.
;
; A block count of zero returns immediately. Without that guard the
; do-while loop below would process one block, wrap eax to 0xFFFFFFFF and
; run far beyond both buffers. The decryption routines have the same guard.
;-----------------------------------------------------------------------
align 16
global _iEnc128_CBC
_iEnc128_CBC:
	mov	ecx, [esp+4]			; ecx = argument block

	push	esi
	push	edi
	push	ebp
	mov	ebp, esp			; remember esp for the epilogue

	sub	esp, 16*16			; room for an aligned copy of the round keys
	and	esp, 0xfffffff0			; 16-byte align that area

	mov	eax, [ecx+12]
	movdqu	xmm1, [eax]			; xmm1 = chaining value

	mov	eax, [ecx+16]			; eax = blocks remaining
	mov	esi, [ecx]			; esi = source cursor
	mov	edi, [ecx+4]
	mov	ecx, [ecx+8]			; ecx = round keys
	sub	edi, esi			; edi = dst - src, so output lives at [esi+edi]

	test	eax, eax
	jz	.done				; nothing to encrypt

	test	ecx, 0xf
	jz	.loop				; keys already aligned, use them in place

	; aesenc1/aesenclast1 take the key as a memory operand, which must be
	; 16-byte aligned, so work from an aligned copy on the stack.
	copy_round_keys esp, ecx, 0
	copy_round_keys esp, ecx, 1
	copy_round_keys esp, ecx, 2
	copy_round_keys esp, ecx, 3
	copy_round_keys esp, ecx, 4
	copy_round_keys esp, ecx, 5
	copy_round_keys esp, ecx, 6
	copy_round_keys esp, ecx, 7
	copy_round_keys esp, ecx, 8
	copy_round_keys esp, ecx, 9
	copy_round_keys esp, ecx, 10
	mov	ecx, esp

	align 16

.loop:

	movdqu	xmm0, [esi]
	add	esi, 16
	pxor	xmm0, xmm1			; CBC: mix in the previous output block
	movdqu	xmm4, [ecx+0*16]
	pxor	xmm0, xmm4
	aesenc1 [ecx+1*16]
	aesenc1 [ecx+2*16]
	aesenc1 [ecx+3*16]
	aesenc1 [ecx+4*16]
	aesenc1 [ecx+5*16]
	aesenc1 [ecx+6*16]
	aesenc1 [ecx+7*16]
	aesenc1 [ecx+8*16]
	aesenc1 [ecx+9*16]
	aesenclast1 [ecx+10*16]
	movdqu	[esi+edi-16], xmm0		; write the output block
	movdqa	xmm1, xmm0			; it becomes the next chaining value
	dec	eax
	jnz	.loop

.done:
	mov	esp, ebp
	pop	ebp
	pop	edi
	pop	esi
	mov	ecx, [esp+4]			; argument block again
	mov	ecx, [ecx+12]
	movdqu	[ecx], xmm1			; hand the chaining value back to the caller

	ret
747
748
749
;-----------------------------------------------------------------------
; _iEnc256_CBC -- CBC encryption with a 15-round-key (256-bit) schedule.
; In:   [esp+4] = pointer to an argument block, read as:
;         +0   source pointer (input blocks)
;         +4   destination pointer (output blocks)
;         +8   pointer to the encryption round keys
;         +12  pointer to the 16-byte chaining value (IV)
;         +16  number of 16-byte blocks
; Out:  destination written; the chaining value at +12 is overwritten with
;       the last output block (left unchanged when the count is zero).
; Saves/restores esi, edi, ebp. Clobbers eax, ecx, xmm0, xmm1, xmm4, flags.
;
; A block count of zero returns immediately. Without that guard the
; do-while loop below would process one block, wrap eax to 0xFFFFFFFF and
; run far beyond both buffers. The decryption routines have the same guard.
;-----------------------------------------------------------------------
align 16
global _iEnc256_CBC
_iEnc256_CBC:
	mov	ecx, [esp+4]			; ecx = argument block

	push	esi
	push	edi
	push	ebp
	mov	ebp, esp			; remember esp for the epilogue

	sub	esp, 16*16			; room for an aligned copy of the round keys
	and	esp, 0xfffffff0			; 16-byte align that area

	mov	eax, [ecx+12]
	movdqu	xmm1, [eax]			; xmm1 = chaining value

	mov	eax, [ecx+16]			; eax = blocks remaining
	mov	esi, [ecx]			; esi = source cursor
	mov	edi, [ecx+4]
	mov	ecx, [ecx+8]			; ecx = round keys
	sub	edi, esi			; edi = dst - src, so output lives at [esi+edi]

	test	eax, eax
	jz	.done				; nothing to encrypt

	test	ecx, 0xf
	jz	.loop				; keys already aligned, use them in place

	; aesenc1/aesenclast1 take the key as a memory operand, which must be
	; 16-byte aligned, so work from an aligned copy on the stack.
	copy_round_keys esp, ecx, 0
	copy_round_keys esp, ecx, 1
	copy_round_keys esp, ecx, 2
	copy_round_keys esp, ecx, 3
	copy_round_keys esp, ecx, 4
	copy_round_keys esp, ecx, 5
	copy_round_keys esp, ecx, 6
	copy_round_keys esp, ecx, 7
	copy_round_keys esp, ecx, 8
	copy_round_keys esp, ecx, 9
	copy_round_keys esp, ecx, 10
	copy_round_keys esp, ecx, 11
	copy_round_keys esp, ecx, 12
	copy_round_keys esp, ecx, 13
	copy_round_keys esp, ecx, 14
	mov	ecx, esp

	align 16

.loop:

	movdqu	xmm0, [esi]
	add	esi, 16
	pxor	xmm0, xmm1			; CBC: mix in the previous output block
	movdqu	xmm4, [ecx+0*16]
	pxor	xmm0, xmm4
	aesenc1 [ecx+1*16]
	aesenc1 [ecx+2*16]
	aesenc1 [ecx+3*16]
	aesenc1 [ecx+4*16]
	aesenc1 [ecx+5*16]
	aesenc1 [ecx+6*16]
	aesenc1 [ecx+7*16]
	aesenc1 [ecx+8*16]
	aesenc1 [ecx+9*16]
	aesenc1 [ecx+10*16]
	aesenc1 [ecx+11*16]
	aesenc1 [ecx+12*16]
	aesenc1 [ecx+13*16]
	aesenclast1 [ecx+14*16]
	movdqu	[esi+edi-16], xmm0		; write the output block
	movdqa	xmm1, xmm0			; it becomes the next chaining value
	dec	eax
	jnz	.loop

.done:
	mov	esp, ebp
	pop	ebp
	pop	edi
	pop	esi
	mov	ecx, [esp+4]			; argument block again
	mov	ecx, [ecx+12]
	movdqu	[ecx], xmm1			; hand the chaining value back to the caller

	ret
832
; Mark this file as not needing an executable stack. The note section is
; emitted for whichever ELF output format is in use and omitted otherwise.
%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%elifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%elifidn __OUTPUT_FORMAT__,elf64
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
843