xref: /linux/arch/x86/lib/checksum_32.S (revision 08ec212c0f92cbf30e3ecc7349f18151714041d6)
1/*
2 * INET		An implementation of the TCP/IP protocol suite for the LINUX
3 *		operating system.  INET is implemented using the  BSD Socket
4 *		interface as the means of communication with the user level.
5 *
6 *		IP/TCP/UDP checksumming routines
7 *
8 * Authors:	Jorge Cwik, <jorge@laser.satlink.net>
9 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
10 *		Tom May, <ftom@netcom.com>
11 *              Pentium Pro/II routines:
12 *              Alexander Kjeldaas <astor@guardian.no>
13 *              Finn Arne Gangstad <finnag@guardian.no>
14 *		Lots of code moved from tcp.c and ip.c; see those files
15 *		for more names.
16 *
17 * Changes:     Ingo Molnar, converted csum_partial_copy() to 2.1 exception
18 *			     handling.
19 *		Andi Kleen,  add zeroing on error
20 *                   converted to pure assembler
21 *
22 *		This program is free software; you can redistribute it and/or
23 *		modify it under the terms of the GNU General Public License
24 *		as published by the Free Software Foundation; either version
25 *		2 of the License, or (at your option) any later version.
26 */
27
28#include <linux/linkage.h>
29#include <asm/dwarf2.h>
30#include <asm/errno.h>
31#include <asm/asm.h>
32
33/*
34 * computes a partial checksum, e.g. for TCP/UDP fragments
35 */
36
37/*
38unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
39 */
40
41.text
42
43#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
44
45	  /*
46	   * Experiments with Ethernet and SLIP connections show that buff
47	   * is aligned on either a 2-byte or 4-byte boundary.  We get at
48	   * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
49	   * Fortunately, it is easy to convert 2-byte alignment to 4-byte
50	   * alignment for the unrolled loop.
51	   */
/*
 * unsigned int csum_partial(const unsigned char *buff, int len,
 *			     unsigned int sum)
 *
 * Version built when CONFIG_X86_USE_PPRO_CHECKSUM is not set.
 *
 * Adds the bytes buff[0..len) into the 32-bit running sum with
 * add-with-carry arithmetic, folding every carry out of bit 31 back
 * into the sum, and returns the new sum in %eax.
 *
 * In:	  12(%esp) = buff, 16(%esp) = len, 20(%esp) = sum
 *	  (offsets as seen after the two register pushes below)
 * Out:	  %eax = updated partial checksum
 * Saved: %esi, %ebx (pushed on entry, popped on exit)
 * Scratch: %ecx, %edx, flags
 *
 * An odd buff is handled by consuming one byte and rotating the sum by
 * 8 bits so that the rest of the data can be summed as aligned words;
 * the rotation is undone at label 7.  The leading byte is added with
 * addl + adcl $0 (as the PPro version of this routine does) so that a
 * carry out of bit 31 is folded back in before roll overwrites CF.
 */
ENTRY(csum_partial)
	CFI_STARTPROC
	pushl_cfi %esi
	CFI_REL_OFFSET esi, 0
	pushl_cfi %ebx
	CFI_REL_OFFSET ebx, 0
	movl 20(%esp),%eax	# Function arg: unsigned int sum
	movl 16(%esp),%ecx	# Function arg: int len
	movl 12(%esp),%esi	# Function arg: unsigned char *buff
	testl $3, %esi		# Check alignment.
	jz 2f			# Jump if alignment is ok.
	testl $1, %esi		# Check alignment.
	jz 10f			# Jump if alignment is boundary of 2bytes.

	# buf is odd
	dec %ecx
	jl 8f			# no bytes to sum: return sum unchanged
	movzbl (%esi), %ebx
	addl %ebx, %eax		# add the leading byte ...
	adcl $0, %eax		# ... and fold its carry back in now,
	roll $8, %eax		# because roll overwrites CF
	inc %esi
	testl $2, %esi
	jz 2f			# now 4-byte aligned
10:
	subl $2, %ecx		# Alignment uses up two bytes.
	jae 1f			# Jump if we had at least two bytes.
	addl $2, %ecx		# ecx was < 2.  Deal with it.
	jmp 4f
1:	movw (%esi), %bx
	addl $2, %esi
	addw %bx, %ax
	adcl $0, %eax
2:
	movl %ecx, %edx		# edx = bytes left, kept for the tail code
	shrl $5, %ecx		# ecx = number of 32-byte blocks
	jz 2f
	testl %esi, %esi	# clears CF before the adcl chain
1:	movl (%esi), %ebx
	adcl %ebx, %eax
	movl 4(%esi), %ebx
	adcl %ebx, %eax
	movl 8(%esi), %ebx
	adcl %ebx, %eax
	movl 12(%esi), %ebx
	adcl %ebx, %eax
	movl 16(%esi), %ebx
	adcl %ebx, %eax
	movl 20(%esi), %ebx
	adcl %ebx, %eax
	movl 24(%esi), %ebx
	adcl %ebx, %eax
	movl 28(%esi), %ebx
	adcl %ebx, %eax
	lea 32(%esi), %esi	# lea and dec leave CF untouched, so the
	dec %ecx		# carry survives into the next iteration
	jne 1b
	adcl $0, %eax		# fold the final carry
2:	movl %edx, %ecx
	andl $0x1c, %edx	# bytes in the remaining whole dwords (0..28)
	je 4f
	shrl $2, %edx		# This clears CF
3:	adcl (%esi), %eax
	lea 4(%esi), %esi
	dec %edx
	jne 3b
	adcl $0, %eax
4:	andl $3, %ecx		# 0..3 trailing bytes
	jz 7f
	cmpl $2, %ecx
	jb 5f			# exactly one byte left
	movw (%esi),%cx
	leal 2(%esi),%esi	# movw/leal keep the flags set by cmpl
	je 6f			# exactly two bytes left
	shll $16,%ecx		# three bytes: word moves to the upper half
5:	movb (%esi),%cl
6:	addl %ecx,%eax
	adcl $0, %eax
7:
	testl $1, 12(%esp)	# was the original buff odd?
	jz 8f
	roll $8, %eax		# undo the rotation applied on entry
8:
	popl_cfi %ebx
	CFI_RESTORE ebx
	popl_cfi %esi
	CFI_RESTORE esi
	ret
	CFI_ENDPROC
ENDPROC(csum_partial)
141
142#else
143
144/* Version for PentiumII/PPro */
145
/*
 * csum_partial, Pentium II / Pentium Pro version.
 *
 * Arguments are read from the stack as shown by the loads below: buf at
 * 12(%esp), len at 16(%esp) and sum at 20(%esp) once %esi and %ebx have
 * been pushed.  The updated sum is left in %eax.  %esi and %ebx are
 * restored before returning; %ecx and %edx are used as scratch.
 *
 * Most of the data is summed by jumping into the middle of a 32-entry
 * add-with-carry table (labels 40..45).  The entry address is computed
 * from the instruction count, so the size of the instructions in that
 * table must not change.
 */
146ENTRY(csum_partial)
147	CFI_STARTPROC
148	pushl_cfi %esi
149	CFI_REL_OFFSET esi, 0
150	pushl_cfi %ebx
151	CFI_REL_OFFSET ebx, 0
152	movl 20(%esp),%eax	# Function arg: unsigned int sum
153	movl 16(%esp),%ecx	# Function arg: int len
154	movl 12(%esp),%esi	# Function arg:	const unsigned char *buf
155
156	testl $3, %esi
157	jnz 25f			# not 4-byte aligned: fix that up first
	/*
	 * Aligned main path.
	 *   edx = byte count, kept for the 1-3 byte tail at label 50
	 *   ebx = len & 0x7c, bytes summed by the first (partial) pass
	 *   ecx = len >> 7, number of further full 128-byte passes
	 * esi is advanced past the partial pass up front because the table
	 * addresses its data with negative offsets from esi.
	 */
15810:
159	movl %ecx, %edx
160	movl %ecx, %ebx
161	andl $0x7c, %ebx
162	shrl $7, %ecx
163	addl %ebx,%esi
164	shrl $2, %ebx		# ebx = dwords in the partial pass
165	negl %ebx
	/*
	 * Entry point = label 45 minus 3 bytes per dword; this relies on
	 * every add/adc in the table below assembling to 3 bytes.
	 */
166	lea 45f(%ebx,%ebx,2), %ebx
167	testl %esi, %esi	# clears CF before entering the adcl chain
168	jmp *%ebx
169
170	# Handle 2-byte-aligned regions
17120:	addw (%esi), %ax
172	lea 2(%esi), %esi
173	adcl $0, %eax
174	jmp 10b
17525:
176	testl $1, %esi
177	jz 30f
178	# buf is odd
179	dec %ecx
180	jl 90f			# no bytes to sum: return sum unchanged
181	movzbl (%esi), %ebx
182	addl %ebx, %eax
183	adcl $0, %eax		# fold the carry before roll overwrites CF
184	roll $8, %eax		# rotate so the rest sums as aligned words
185	inc %esi
186	testl $2, %esi
187	jz 10b			# now 4-byte aligned
188
	/* 2-byte aligned: ecx > 2 -> sum one word and retry at 10;
	 * ecx == 2 -> label 32; ecx < 2 -> zero or one byte left. */
18930:	subl $2, %ecx
190	ja 20b
191	je 32f
192	addl $2, %ecx
193	jz 80f
194	movzbl (%esi),%ebx	# csumming 1 byte, 2-aligned
195	addl %ebx, %eax
196	adcl $0, %eax
197	jmp 80f
19832:
199	addw (%esi), %ax	# csumming 2 bytes, 2-aligned
200	adcl $0, %eax
201	jmp 80f
202
	/*
	 * 32-entry add-with-carry table covering 128 bytes ending at esi.
	 * Entered at label 40 for a full pass, or part-way through via the
	 * computed jump above.
	 */
20340:
204	addl -128(%esi), %eax
205	adcl -124(%esi), %eax
206	adcl -120(%esi), %eax
207	adcl -116(%esi), %eax
208	adcl -112(%esi), %eax
209	adcl -108(%esi), %eax
210	adcl -104(%esi), %eax
211	adcl -100(%esi), %eax
212	adcl -96(%esi), %eax
213	adcl -92(%esi), %eax
214	adcl -88(%esi), %eax
215	adcl -84(%esi), %eax
216	adcl -80(%esi), %eax
217	adcl -76(%esi), %eax
218	adcl -72(%esi), %eax
219	adcl -68(%esi), %eax
220	adcl -64(%esi), %eax
221	adcl -60(%esi), %eax
222	adcl -56(%esi), %eax
223	adcl -52(%esi), %eax
224	adcl -48(%esi), %eax
225	adcl -44(%esi), %eax
226	adcl -40(%esi), %eax
227	adcl -36(%esi), %eax
228	adcl -32(%esi), %eax
229	adcl -28(%esi), %eax
230	adcl -24(%esi), %eax
231	adcl -20(%esi), %eax
232	adcl -16(%esi), %eax
233	adcl -12(%esi), %eax
234	adcl -8(%esi), %eax
235	adcl -4(%esi), %eax
23645:
237	lea 128(%esi), %esi	# lea leaves CF intact for the adcl below
238	adcl $0, %eax		# fold the carry out of the chain
239	dec %ecx
240	jge 40b			# another full 128-byte pass remains
241	movl %edx, %ecx		# recover the byte count saved at label 10
24250:	andl $3, %ecx
243	jz 80f
244
245	# Handle the last 1-3 bytes without jumping
246	notl %ecx		# 1->2, 2->1, 3->0, higher bits are masked
247	movl $0xffffff,%ebx	# by the shll and shrl instructions
248	shll $3,%ecx
249	shrl %cl,%ebx
	/* -128 compensates for the lea at label 45; this reads a whole
	 * dword and masks off the bytes beyond the end of the buffer. */
250	andl -128(%esi),%ebx	# esi is 4-aligned so should be ok
251	addl %ebx,%eax
252	adcl $0,%eax
25380:
254	testl $1, 12(%esp)	# was the original buf odd?
255	jz 90f
256	roll $8, %eax		# undo the rotation from the odd-byte path
25790:
258	popl_cfi %ebx
259	CFI_RESTORE ebx
260	popl_cfi %esi
261	CFI_RESTORE esi
262	ret
263	CFI_ENDPROC
264ENDPROC(csum_partial)
265
266#endif
267
268/*
269unsigned int csum_partial_copy_generic (const char *src, char *dst,
270				  int len, int sum, int *src_err_ptr, int *dst_err_ptr)
271 */
272
273/*
274 * Copy from ds while checksumming, otherwise like csum_partial
275 *
276 * The macros SRC and DST specify the type of access for the instruction.
277 * thus we can call a custom exception handler for all access types.
278 *
279 * FIXME: could someone double-check whether I haven't mixed up some SRC and
280 *	  DST definitions? It's damn hard to trigger all cases.  I hope I got
281 *	  them all but there's no guarantee.
282 */
283
/*
 * SRC(insn): emit insn under the local label 9999 and pass that label
 * to _ASM_EXTABLE together with the next 6001 label, which is where the
 * routines below place their source-fault fixup code.
 */
284#define SRC(y...)			\
285	9999: y;			\
286	_ASM_EXTABLE(9999b, 6001f)
287
/*
 * DST(insn): as SRC(), but paired with the next 6002 label, which is
 * where the routines below place their destination-fault fixup code.
 */
288#define DST(y...)			\
289	9999: y;			\
290	_ASM_EXTABLE(9999b, 6002f)
291
292#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
293
/*
 * csum_partial_copy_generic, version built when
 * CONFIG_X86_USE_PPRO_CHECKSUM is not set.
 *
 * Copies len bytes from src to dst while adding them into sum with
 * add-with-carry arithmetic; the result is returned in %eax.
 *
 * Stack layout after the prologue (4-byte local plus three pushes):
 *   ARGBASE(%esp)    = return address
 *   ARGBASE+4 .. +24 = src, dst, len, sum, src_err_ptr, dst_err_ptr
 *   FP(%esp)         = scratch slot holding the remaining byte count
 *
 * Fault handling (see the .fixup section below):
 *   - fault on a SRC() access: *src_err_ptr = -EFAULT, all len bytes of
 *     dst are zeroed and %eax is set to 0;
 *   - fault on a DST() access: *dst_err_ptr = -EFAULT, %eax is left as
 *     it was at the time of the fault.
 * Both paths then continue at label 5000, i.e. the normal epilogue.
 *
 * %edi, %esi and %ebx are saved and restored; %ecx and %edx are scratch.
 */
294#define ARGBASE 16
295#define FP		12
296
297ENTRY(csum_partial_copy_generic)
298	CFI_STARTPROC
299	subl  $4,%esp			# room for the FP scratch slot
300	CFI_ADJUST_CFA_OFFSET 4
301	pushl_cfi %edi
302	CFI_REL_OFFSET edi, 0
303	pushl_cfi %esi
304	CFI_REL_OFFSET esi, 0
305	pushl_cfi %ebx
306	CFI_REL_OFFSET ebx, 0
307	movl ARGBASE+16(%esp),%eax	# sum
308	movl ARGBASE+12(%esp),%ecx	# len
309	movl ARGBASE+4(%esp),%esi	# src
310	movl ARGBASE+8(%esp),%edi	# dst
311
	/* Only bit 1 of the destination address is examined here. */
312	testl $2, %edi			# Check alignment.
313	jz 2f				# Jump if alignment is ok.
314	subl $2, %ecx			# Alignment uses up two bytes.
315	jae 1f				# Jump if we had at least two bytes.
316	addl $2, %ecx			# ecx was < 2.  Deal with it.
317	jmp 4f
318SRC(1:	movw (%esi), %bx	)
319	addl $2, %esi
320DST(	movw %bx, (%edi)	)
321	addl $2, %edi
322	addw %bx, %ax
323	adcl $0, %eax
3242:
325	movl %ecx, FP(%esp)		# keep the byte count for the tail code
326	shrl $5, %ecx			# ecx = number of 32-byte blocks
327	jz 2f
328	testl %esi, %esi		# clears CF before the adcl chain
	/*
	 * Main loop: 32 bytes per iteration, two dwords at a time through
	 * ebx/edx.  Only adcl touches CF inside the loop (mov, lea and dec
	 * leave it alone), so the carry runs through every iteration and
	 * is folded once after the loop.
	 */
329SRC(1:	movl (%esi), %ebx	)
330SRC(	movl 4(%esi), %edx	)
331	adcl %ebx, %eax
332DST(	movl %ebx, (%edi)	)
333	adcl %edx, %eax
334DST(	movl %edx, 4(%edi)	)
335
336SRC(	movl 8(%esi), %ebx	)
337SRC(	movl 12(%esi), %edx	)
338	adcl %ebx, %eax
339DST(	movl %ebx, 8(%edi)	)
340	adcl %edx, %eax
341DST(	movl %edx, 12(%edi)	)
342
343SRC(	movl 16(%esi), %ebx 	)
344SRC(	movl 20(%esi), %edx	)
345	adcl %ebx, %eax
346DST(	movl %ebx, 16(%edi)	)
347	adcl %edx, %eax
348DST(	movl %edx, 20(%edi)	)
349
350SRC(	movl 24(%esi), %ebx	)
351SRC(	movl 28(%esi), %edx	)
352	adcl %ebx, %eax
353DST(	movl %ebx, 24(%edi)	)
354	adcl %edx, %eax
355DST(	movl %edx, 28(%edi)	)
356
357	lea 32(%esi), %esi
358	lea 32(%edi), %edi
359	dec %ecx
360	jne 1b
361	adcl $0, %eax			# fold the final carry
3622:	movl FP(%esp), %edx
363	movl %edx, %ecx
364	andl $0x1c, %edx		# bytes in the remaining whole dwords
365	je 4f
366	shrl $2, %edx			# This clears CF
367SRC(3:	movl (%esi), %ebx	)
368	adcl %ebx, %eax
369DST(	movl %ebx, (%edi)	)
370	lea 4(%esi), %esi
371	lea 4(%edi), %edi
372	dec %edx
373	jne 3b
374	adcl $0, %eax
3754:	andl $3, %ecx			# 0..3 trailing bytes
376	jz 7f
377	cmpl $2, %ecx
378	jb 5f				# exactly one byte left
379SRC(	movw (%esi), %cx	)
380	leal 2(%esi), %esi
381DST(	movw %cx, (%edi)	)
382	leal 2(%edi), %edi
383	je 6f				# flags still from cmpl: two bytes left
384	shll $16,%ecx			# three bytes: word moves to upper half
385SRC(5:	movb (%esi), %cl	)
386DST(	movb %cl, (%edi)	)
3876:	addl %ecx, %eax
388	adcl $0, %eax
3897:
	/* Common exit; the fixup handlers below jump back to this label. */
3905000:
391
392# Exception handler:
393.section .fixup, "ax"
394
	/* Fault on a SRC() access. */
3956001:
396	movl ARGBASE+20(%esp), %ebx	# src_err_ptr
397	movl $-EFAULT, (%ebx)
398
399	# zero the complete destination - computing the rest
400	# is too much work
401	movl ARGBASE+8(%esp), %edi	# dst
402	movl ARGBASE+12(%esp), %ecx	# len
403	xorl %eax,%eax			# fill value; also the returned sum
404	rep ; stosb
405
406	jmp 5000b
407
	/* Fault on a DST() access. */
4086002:
409	movl ARGBASE+24(%esp), %ebx	# dst_err_ptr
410	movl $-EFAULT,(%ebx)
411	jmp 5000b
412
413.previous
414
	/* Epilogue: execution falls through from label 5000 to here. */
415	popl_cfi %ebx
416	CFI_RESTORE ebx
417	popl_cfi %esi
418	CFI_RESTORE esi
419	popl_cfi %edi
420	CFI_RESTORE edi
421	popl_cfi %ecx			# equivalent to addl $4,%esp
422	ret
423	CFI_ENDPROC
424ENDPROC(csum_partial_copy_generic)
425
426#else
427
428/* Version for PentiumII/PPro */
429
/*
 * ROUND1(x) / ROUND(x): load the dword at x(%esi) into %ebx, add it to
 * the sum in %eax and store it to x(%edi).  ROUND1 uses addl (no
 * carry-in) and starts a chain; ROUND uses adcl and continues it.
 * The load and the store are wrapped in SRC() and DST() respectively.
 */
430#define ROUND1(x) \
431	SRC(movl x(%esi), %ebx	)	;	\
432	addl %ebx, %eax			;	\
433	DST(movl %ebx, x(%edi)	)	;
434
435#define ROUND(x) \
436	SRC(movl x(%esi), %ebx	)	;	\
437	adcl %ebx, %eax			;	\
438	DST(movl %ebx, x(%edi)	)	;
439
/* Offset of the return address from %esp after the three pushes below. */
440#define ARGBASE 12
441
/*
 * csum_partial_copy_generic, Pentium II / Pentium Pro version.
 *
 * Copies len bytes from src to dst while adding them into sum; the
 * result is returned in %eax.  Arguments are read from the stack at
 * ARGBASE+4 .. ARGBASE+24: src, dst, len, sum, src_err_ptr, dst_err_ptr.
 *
 * The whole-dword part is handled by jumping into the 16-entry ROUND
 * table at label 1; the entry address is computed from the byte count,
 * so the code size of each ROUND must not change.
 *
 * Fault handling (see the .fixup section below):
 *   - fault on a SRC() access: *src_err_ptr = -EFAULT, all len bytes of
 *     dst are zeroed and %eax is set to 0;
 *   - fault on a DST() access: *dst_err_ptr = -EFAULT, %eax is left as
 *     it was at the time of the fault.
 * Both paths then continue at label 7, i.e. the normal epilogue.
 *
 * %ebx, %edi and %esi are saved and restored; %ecx and %edx are scratch.
 */
442ENTRY(csum_partial_copy_generic)
443	CFI_STARTPROC
444	pushl_cfi %ebx
445	CFI_REL_OFFSET ebx, 0
446	pushl_cfi %edi
447	CFI_REL_OFFSET edi, 0
448	pushl_cfi %esi
449	CFI_REL_OFFSET esi, 0
450	movl ARGBASE+4(%esp),%esi	#src
451	movl ARGBASE+8(%esp),%edi	#dst
452	movl ARGBASE+12(%esp),%ecx	#len
453	movl ARGBASE+16(%esp),%eax	#sum
454#	movl %ecx, %edx
455	movl %ecx, %ebx
	/* NOTE(review): this value of edx looks unused; edx is rewritten by
	 * the lea further down before anything reads it. */
456	movl %esi, %edx
457	shrl $6, %ecx			# ecx = number of full 64-byte passes
458	andl $0x3c, %ebx		# ebx = bytes in the partial pass
459	negl %ebx
460	subl %ebx, %esi			# advance src past the partial pass
461	subl %ebx, %edi			# advance dst past the partial pass
462	lea  -1(%esi),%edx
463	andl $-32,%edx			# edx = (esi - 1) rounded down to 32
	/*
	 * Entry point = label 3 minus 2 code bytes per data byte, which
	 * relies on each ROUND (4 data bytes) assembling to 8 bytes.
	 */
464	lea 3f(%ebx,%ebx), %ebx
465	testl %esi, %esi		# clears CF before entering the chain
466	jmp *%ebx
4671:	addl $64,%esi
468	addl $64,%edi
	/* NOTE(review): these two byte loads look like cache-line touches
	 * ahead of the copy; their result in %bl is overwritten by ROUND1.
	 * Confirm intent. */
469	SRC(movb -32(%edx),%bl)	; SRC(movb (%edx),%bl)
	/* ROUND1 starts a fresh chain because the addl above changed CF. */
470	ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
471	ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
472	ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
473	ROUND (-16) ROUND(-12) ROUND(-8)  ROUND(-4)
4743:	adcl $0,%eax			# fold the carry out of the chain
475	addl $64, %edx
476	dec %ecx
477	jge 1b				# another full 64-byte pass remains
4784:	movl ARGBASE+12(%esp),%edx	#len
479	andl $3, %edx			# 0..3 trailing bytes
480	jz 7f
481	cmpl $2, %edx
482	jb 5f				# exactly one byte left
483SRC(	movw (%esi), %dx         )
484	leal 2(%esi), %esi
485DST(	movw %dx, (%edi)         )
486	leal 2(%edi), %edi
487	je 6f				# flags still from cmpl: two bytes left
488	shll $16,%edx			# three bytes: word moves to upper half
4895:
490SRC(	movb (%esi), %dl         )
491DST(	movb %dl, (%edi)         )
4926:	addl %edx, %eax
493	adcl $0, %eax
	/* Common exit; the fixup handlers below jump back to this label. */
4947:
495.section .fixup, "ax"
	/* 6001: fault on a SRC() access. */
4966001:	movl	ARGBASE+20(%esp), %ebx	# src_err_ptr
497	movl $-EFAULT, (%ebx)
498	# zero the complete destination (computing the rest is too much work)
499	movl ARGBASE+8(%esp),%edi	# dst
500	movl ARGBASE+12(%esp),%ecx	# len
501	xorl %eax,%eax			# fill value; also the returned sum
502	rep; stosb
503	jmp 7b
	/* 6002: fault on a DST() access. */
5046002:	movl ARGBASE+24(%esp), %ebx	# dst_err_ptr
505	movl $-EFAULT, (%ebx)
506	jmp  7b
507.previous
508
	/* Epilogue: execution falls through from label 7 to here. */
509	popl_cfi %esi
510	CFI_RESTORE esi
511	popl_cfi %edi
512	CFI_RESTORE edi
513	popl_cfi %ebx
514	CFI_RESTORE ebx
515	ret
516	CFI_ENDPROC
517ENDPROC(csum_partial_copy_generic)
518
519#undef ROUND
520#undef ROUND1
521
522#endif
523