xref: /linux/arch/x86/lib/csum-copy_64.S (revision cfda8617e22a8bf217a613d0b3ba3a38778443ba)
1/*
2 * Copyright 2002, 2003 Andi Kleen, SuSE Labs.
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License.  See the file COPYING in the main directory of this archive
6 * for more details. No warranty for anything given at all.
7 */
8#include <linux/linkage.h>
9#include <asm/errno.h>
10#include <asm/asm.h>
11
/*
 * Checksum copy with exception handling.
 * On exceptions -EFAULT is stored through src_err_ptr or dst_err_ptr
 * (when that pointer is not NULL). The destination is not zeroed here.
 *
 * Input
 * rdi  source
 * rsi  destination
 * edx  len (32bit)
 * ecx  sum (32bit)
 * r8   src_err_ptr (int)
 * r9   dst_err_ptr (int)
 *
 * Output
 * eax  32bit sum. undefined in case of exception.
 *
 * Wrappers need to take care of valid exception sum and zeroing.
 * They also should align source or destination to 8 bytes.
 */
31
32	.macro source
3310:
34	_ASM_EXTABLE_UA(10b, .Lbad_source)
35	.endm
36
37	.macro dest
3820:
39	_ASM_EXTABLE_UA(20b, .Lbad_dest)
40	.endm
41
42	/*
43	 * No _ASM_EXTABLE_UA; this is used for intentional prefetch on a
44	 * potentially unmapped kernel address.
45	 */
46	.macro ignore L=.Lignore
4730:
48	_ASM_EXTABLE(30b, \L)
49	.endm
50
51
/*
 * csum_partial_copy_generic: copy len bytes from source to destination
 * while adding every copied word into a running sum with end-around carry.
 *
 * In:   rdi source, rsi destination, edx len, ecx initial sum,
 *       r8 src_err_ptr, r9 dst_err_ptr (a NULL error pointer is skipped)
 * Out:  eax 32bit sum; not meaningful if a fault was taken
 * Uses: rcx, rdx, rdi, rsi, r8-r11 as scratch; rbx, r12-r15 are saved
 *       in the frame below and restored at .Lende
 *
 * Stack frame (7 quadwords):
 *   0*8 src_err_ptr   1*8 dst_err_ptr
 *   2*8 rbx   3*8 r12   4*8 r14   5*8 r13   6*8 r15
 *
 * Carry discipline: inside each loop the carry flag links one adc to the
 * next, including across iterations.  Only instructions that leave CF
 * alone (mov, lea, dec) may sit between the adc instructions and the
 * loop branch; the final carry is folded in with "adc zero" after the
 * loop.  Do not reorder these sequences.
 */
SYM_FUNC_START(csum_partial_copy_generic)
	subq  $7*8, %rsp
	movq  %rbx, 2*8(%rsp)
	movq  %r12, 3*8(%rsp)
	movq  %r14, 4*8(%rsp)
	movq  %r13, 5*8(%rsp)
	movq  %r15, 6*8(%rsp)

	/* keep the error pointers where the fault handlers can find them */
	movq  %r8, (%rsp)
	movq  %r9, 1*8(%rsp)

	movl  %ecx, %eax		/* rax = initial sum, zero-extended */
	movl  %edx, %ecx		/* rcx = len, zero-extended */

	xorl  %r9d, %r9d		/* r9 = 0, the "adc zero" operand */
	movq  %rcx, %r12

	shrq  $6, %r12			/* r12 = number of 64 byte blocks */
	jz	.Lhandle_tail       /* < 64 */

	clc				/* shrq left a stray bit in CF */

	/* main loop. copy and sum in 64 byte blocks */
	/* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
	/* r11:	temp3, rdx: temp4, r12 loopcnt */
	/* r10:	temp5, r15: temp6, r14 temp7, r13 temp8 */
	.p2align 4
.Lloop:
	source
	movq  (%rdi), %rbx
	source
	movq  8(%rdi), %r8
	source
	movq  16(%rdi), %r11
	source
	movq  24(%rdi), %rdx

	source
	movq  32(%rdi), %r10
	source
	movq  40(%rdi), %r15
	source
	movq  48(%rdi), %r14
	source
	movq  56(%rdi), %r13

	/* prefetch five blocks ahead; a fault here just resumes at 2: */
	ignore 2f
	prefetcht0 5*64(%rdi)
2:
	adcq  %rbx, %rax
	adcq  %r8, %rax
	adcq  %r11, %rax
	adcq  %rdx, %rax
	adcq  %r10, %rax
	adcq  %r15, %rax
	adcq  %r14, %rax
	adcq  %r13, %rax

	decl %r12d			/* sets ZF for the jnz below, keeps CF */

	dest
	movq %rbx, (%rsi)
	dest
	movq %r8, 8(%rsi)
	dest
	movq %r11, 16(%rsi)
	dest
	movq %rdx, 24(%rsi)

	dest
	movq %r10, 32(%rsi)
	dest
	movq %r15, 40(%rsi)
	dest
	movq %r14, 48(%rsi)
	dest
	movq %r13, 56(%rsi)

	/* lea, not add: the flags from decl and the last adcq must survive */
	leaq 64(%rdi), %rdi
	leaq 64(%rsi), %rsi

	jnz	.Lloop

	adcq  %r9, %rax			/* add in carry */

	/* do last up to 56 bytes */
.Lhandle_tail:
	/* ecx:	count */
	movl %ecx, %r10d		/* r10d = len, kept for the tails below */
	andl $63, %ecx
	shrl $3, %ecx			/* ecx = remaining quadwords (0..7) */
	jz	.Lfold
	clc
	.p2align 4
.Lloop_8:
	source
	movq (%rdi), %rbx
	adcq %rbx, %rax
	decl %ecx
	dest
	movq %rbx, (%rsi)
	leaq 8(%rsi), %rsi /* preserve carry */
	leaq 8(%rdi), %rdi
	jnz	.Lloop_8
	adcq %r9, %rax	/* add in carry */

.Lfold:
	/* reduce checksum to 32bits */
	movl %eax, %ebx
	shrq $32, %rax
	addl %ebx, %eax
	adcl %r9d, %eax

	/* do last up to 6 bytes */
.Lhandle_7:
	movl %r10d, %ecx
	andl $7, %ecx
	shrl $1, %ecx			/* ecx = remaining 16bit words (0..3) */
	jz   .Lhandle_1
	xorl %ebx, %ebx			/* upper bits of ebx stay 0 under movw */
	clc
	.p2align 4
.Lloop_1:
	source
	movw (%rdi), %bx
	adcl %ebx, %eax
	decl %ecx
	dest
	movw %bx, (%rsi)
	leaq 2(%rdi), %rdi
	leaq 2(%rsi), %rsi
	jnz .Lloop_1
	adcl %r9d, %eax	/* add in carry */

	/* handle last odd byte */
.Lhandle_1:
	testb $1, %r10b
	jz    .Lende
	xorl  %ebx, %ebx
	source
	movb (%rdi), %bl
	dest
	movb %bl, (%rsi)
	addl %ebx, %eax
	adcl %r9d, %eax		/* carry */

	/* common exit: restore callee-saved registers and drop the frame */
.Lende:
	movq 2*8(%rsp), %rbx
	movq 3*8(%rsp), %r12
	movq 4*8(%rsp), %r14
	movq 5*8(%rsp), %r13
	movq 6*8(%rsp), %r15
	addq $7*8, %rsp
	ret

	/*
	 * Exception handlers. Very simple, zeroing is done in the wrappers.
	 * Each one stores -EFAULT through its saved error pointer unless
	 * that pointer is NULL, then leaves through the common exit.  rax
	 * is used as the scratch pointer, so the sum is lost on these paths.
	 */
.Lbad_source:
	movq (%rsp), %rax
	testq %rax, %rax
	jz   .Lende
	movl $-EFAULT, (%rax)
	jmp  .Lende

.Lbad_dest:
	movq 8(%rsp), %rax
	testq %rax, %rax
	jz   .Lende
	movl $-EFAULT, (%rax)
	jmp .Lende
SYM_FUNC_END(csum_partial_copy_generic)
229