/* xref: /linux/arch/powerpc/lib/checksum_64.S (revision 42249094f79422fbf5ed4b54eeb48ff096809b8f) */
/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version
 *  2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */
14
15#include <linux/sys.h>
16#include <asm/processor.h>
17#include <asm/errno.h>
18#include <asm/ppc_asm.h>
19
/*
 * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
 * len is in 32-bit words and is always >= 5.
 *
 * In practice len == 5, but this is not guaranteed.  So this code does not
 * attempt to use doubleword instructions.
 *
 * Returns the folded and complemented 16-bit checksum in r3.
 * Registers written: r0, r3, r4, r5, CTR, CR0 and the carry bit.
 */
_GLOBAL(ip_fast_csum)
	lwz	r0,0(r3)	/* r0 = word 0 */
	lwzu	r5,4(r3)	/* r5 = word 1, r3 advances to it */
	addic.	r4,r4,-2	/* r4 = words still to add, CR0 = r4 vs 0 */
	addc	r0,r0,r5	/* start the sum; this sets the carry afresh */
	mtctr	r4
	blelr-			/* len <= 2: return early, r3 is NOT a checksum
				 * here (cannot happen if len >= 5 holds) */
1:	lwzu	r4,4(r3)	/* accumulate the remaining words */
	adde	r0,r0,r4
	bdnz	1b
	addze	r0,r0		/* add in final carry */
	rldicl	r4,r0,32,0	/* fold two 32-bit halves together */
	add	r0,r0,r4
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two 16-bit halves together */
	add	r3,r0,r3
	not	r3,r3		/* 1's complement of the folded sum */
	srwi	r3,r3,16	/* result is the upper half of the low word */
	blr
46
/*
 * Compute checksum of TCP or UDP pseudo-header:
 *   csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
 * No real gain trying to do this specially for 64 bit, but
 * the 32 bit addition may spill into the upper bits of
 * the doubleword so we still must fold it down from 64.
 *
 * Returns the folded and complemented 16-bit checksum in r3.
 * Registers written: r0, r3, r4, r5 and the carry bit.
 *
 * NOTE(review): there is no addze after the adde chain; this looks like
 * it relies on every argument arriving zero-extended to 64 bits, so that
 * four 32-bit values can never carry out of the doubleword -- confirm
 * against the callers/ABI.
 */
_GLOBAL(csum_tcpudp_magic)
	rlwimi	r5,r6,16,0,15	/* put proto in upper half of len */
	addc	r0,r3,r4	/* add 4 32-bit words together */
	adde	r0,r0,r5
	adde	r0,r0,r7
	rldicl	r4,r0,32,0	/* fold 64 bit value */
	add	r0,r4,r0
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two 16-bit halves together */
	add	r3,r0,r3
	not	r3,r3		/* 1's complement of the folded sum */
	srwi	r3,r3,16	/* result is the upper half of the low word */
	blr
67
/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * csum_partial(r3=buff, r4=len, r5=sum)
 *
 * Returns the 32-bit partial sum in r3 (folded down from the 64-bit
 * accumulator, not complemented).
 *
 * r0 is the 64-bit accumulator for the whole routine.  The additions form
 * one long adde chain, so the carry bit has to survive from one adde to
 * the next: everything placed between them (loads, addi/subi, srdi.,
 * andi., mtctr, the stack save/restore) leaves the carry bit alone.
 * Keep it that way when touching this code.
 *
 * Registers written: r0, r3, r4, r6, r7, r9-r12, CTR, CR0 and the carry
 * bit.  r14-r16 are used by the unrolled loop but are saved and restored
 * around it.
 */
_GLOBAL(csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6		/* halfwords up to the next doubleword */
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1			/* the exit limb does the last 64 bytes */
	mtctr	r6

	/* r14-r16 are needed as extra load targets; save them first */
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	/* entry limb: preload the first half of the first 64-byte chunk */
	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)		/* preload first half of the next chunk */
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b

	/* exit limb: same as the loop body, minus the preload */
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63		/* bytes left after the 64-byte chunks */

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
227
228
229	.macro srcnr
230100:
231	.section __ex_table,"a"
232	.align 3
233	.llong 100b,.Lsrc_error_nr
234	.previous
235	.endm
236
237	.macro source
238150:
239	.section __ex_table,"a"
240	.align 3
241	.llong 150b,.Lsrc_error
242	.previous
243	.endm
244
245	.macro dstnr
246200:
247	.section __ex_table,"a"
248	.align 3
249	.llong 200b,.Ldest_error_nr
250	.previous
251	.endm
252
253	.macro dest
254250:
255	.section __ex_table,"a"
256	.align 3
257	.llong 250b,.Ldest_error
258	.previous
259	.endm
260
/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 *
 * On success the 32-bit partial sum (folded from the 64-bit accumulator,
 * not complemented) is returned in r3.  The error exits return without
 * loading a checksum into r3, and skip the -EFAULT store when the
 * corresponding error pointer is NULL.
 *
 * r0 is the 64-bit accumulator.  As all additions form one adde chain,
 * nothing between two addes may disturb the carry bit.  r7 and r8 must
 * stay intact until the end because the fixup code dereferences them.
 */
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4			/* r9, not r7: r7 holds src_err */
	sub	r6,r9,r6		/* halfwords up to the next doubleword */
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1			/* the exit limb does the last 64 bytes */
	mtctr	r6

	/*
	 * r14-r16 are needed as extra load targets; save them first.
	 * From here until the frame is popped again every access uses the
	 * source/dest markers so that a fault unwinds the frame.
	 */
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	/* entry limb: preload the first half of the first 64-byte chunk */
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)		/* preload first half of the next chunk */
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b

	/* exit limb: same as the loop body, minus the preload */
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63		/* bytes left after the 64-byte chunks */

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dstnr;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

	/*
	 * Fixup for a faulting source access inside the unrolled loop:
	 * put r14-r16 back and pop the frame, then report the error.
	 */
.Lsrc_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
	/* Fixup for a faulting source access with no frame to unwind. */
.Lsrc_error_nr:
	cmpdi	0,r7,0
	beqlr				/* src_err == NULL: nothing to store */
	li	r6,-EFAULT
	stw	r6,0(r7)		/* *src_err = -EFAULT */
	blr

	/*
	 * Fixup for a faulting destination access inside the unrolled
	 * loop: put r14-r16 back and pop the frame, then report the error.
	 */
.Ldest_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
	/* Fixup for a faulting destination access with no frame to unwind. */
.Ldest_error_nr:
	cmpdi	0,r8,0
	beqlr				/* dst_err == NULL: nothing to store */
	li	r6,-EFAULT
	stw	r6,0(r8)		/* *dst_err = -EFAULT */
	blr
481