xref: /linux/arch/powerpc/lib/checksum_64.S (revision 31527da5d673ed16255869b6d0f209285b8b0981)
1/*
2 * This file contains assembly-language implementations
3 * of IP-style 1's complement checksum routines.
4 *
5 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
6 *
7 *  This program is free software; you can redistribute it and/or
8 *  modify it under the terms of the GNU General Public License
9 *  as published by the Free Software Foundation; either version
10 *  2 of the License, or (at your option) any later version.
11 *
12 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
13 */
14
15#include <linux/sys.h>
16#include <asm/processor.h>
17#include <asm/errno.h>
18#include <asm/ppc_asm.h>
19#include <asm/export.h>
20
21/*
22 * Computes the checksum of a memory block at buff, length len,
23 * and adds in "sum" (32-bit).
24 *
25 * __csum_partial(r3=buff, r4=len, r5=sum)
 *
 * The result is returned in r3 as a 32-bit value (the 64-bit running sum
 * folded once with end-around carry; it is not complemented and not
 * reduced to 16 bits here).
 *
 * Register use, as visible in the code below:
 *   r0      - 64-bit running sum; carries are chained from one adde to
 *             the next through XER[CA] and added in at .Lcsum_finish
 *   r3      - current read pointer; overwritten with the result on return
 *   r4      - bytes still to be summed
 *   r6, r7  - scratch (counts and loaded data)
 *   r9-r12  - data registers for the unrolled loop (r9 is also scratch
 *             for the trailing byte on big-endian)
 *   r14-r16 - further data registers for the unrolled loop; saved to and
 *             restored from a temporary stack frame around it
 *
 * NOTE(review): the carry chain relies on none of the instructions placed
 * between two adde's writing XER[CA]. Per the Power ISA that holds for the
 * loads, addi/subi, sub, li, mtctr, bdnz and the record-form srdi./andi.
 * used here - keep it in mind when reordering or adding instructions.
26 */
27_GLOBAL(__csum_partial)
28	addic	r0,r5,0			/* r0 = sum; adding 0 also clears carry */
29
30	srdi.	r6,r4,3			/* less than 8 bytes? */
31	beq	.Lcsum_tail_word
32
33	/*
34	 * If only halfword aligned, align to a double word. Since odd
35	 * aligned addresses should be rare and they would require more
36	 * work to calculate the correct checksum, we ignore that case
37	 * and take the potential slowdown of unaligned loads.
38	 */
39	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
40	beq	.Lcsum_aligned
41
	/* ctr = 4 - r6 = number of halfwords up to the next 8-byte boundary */
42	li	r7,4
43	sub	r6,r7,r6
44	mtctr	r6
45
461:
47	lhz	r6,0(r3)		/* align to doubleword */
48	subi	r4,r4,2
49	addi	r3,r3,2
50	adde	r0,r0,r6
51	bdnz	1b
52
53.Lcsum_aligned:
54	/*
55	 * We unroll the loop such that each iteration is 64 bytes with an
56	 * entry and exit limb of 64 bytes, meaning a minimum size of
57	 * 128 bytes.
58	 */
59	srdi.	r6,r4,7
60	beq	.Lcsum_tail_doublewords		/* len < 128 */
61
	/*
	 * ctr = (len / 64) - 1 loop iterations; the last 64-byte chunk is
	 * handled by the exit limb after the loop.
	 */
62	srdi	r6,r4,6
63	subi	r6,r6,1
64	mtctr	r6
65
	/* r14-r16 are used as extra data registers; keep the caller's values */
66	stdu	r1,-STACKFRAMESIZE(r1)
67	std	r14,STK_REG(R14)(r1)
68	std	r15,STK_REG(R15)(r1)
69	std	r16,STK_REG(R16)(r1)
70
	/* Entry limb: preload the first 32 bytes of the first chunk */
71	ld	r6,0(r3)
72	ld	r9,8(r3)
73
74	ld	r10,16(r3)
75	ld	r11,24(r3)
76
77	/*
78	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
79	 * because of the XER dependency. This means the fastest this loop can
80	 * go is 16 cycles per iteration. The scheduling of the loop below has
81	 * been shown to hit this on both POWER6 and POWER7.
82	 */
	/*
	 * Each pass sums the 32 bytes already held in r6/r9/r10/r11, loads
	 * and sums the other 32 bytes of the current chunk (r12/r14/r15/r16),
	 * and, once r3 has been advanced by 64, preloads the first 32 bytes
	 * of the next chunk.
	 */
83	.align 5
842:
85	adde	r0,r0,r6
86	ld	r12,32(r3)
87	ld	r14,40(r3)
88
89	adde	r0,r0,r9
90	ld	r15,48(r3)
91	ld	r16,56(r3)
92	addi	r3,r3,64
93
94	adde	r0,r0,r10
95
96	adde	r0,r0,r11
97
98	adde	r0,r0,r12
99
100	adde	r0,r0,r14
101
102	adde	r0,r0,r15
103	ld	r6,0(r3)
104	ld	r9,8(r3)
105
106	adde	r0,r0,r16
107	ld	r10,16(r3)
108	ld	r11,24(r3)
109	bdnz	2b
110
111
	/* Exit limb: same as one loop pass, minus the preload of a next chunk */
112	adde	r0,r0,r6
113	ld	r12,32(r3)
114	ld	r14,40(r3)
115
116	adde	r0,r0,r9
117	ld	r15,48(r3)
118	ld	r16,56(r3)
119	addi	r3,r3,64
120
121	adde	r0,r0,r10
122	adde	r0,r0,r11
123	adde	r0,r0,r12
124	adde	r0,r0,r14
125	adde	r0,r0,r15
126	adde	r0,r0,r16
127
	/* Restore the saved registers and pop the temporary frame */
128	ld	r14,STK_REG(R14)(r1)
129	ld	r15,STK_REG(R15)(r1)
130	ld	r16,STK_REG(R16)(r1)
131	addi	r1,r1,STACKFRAMESIZE
132
133	andi.	r4,r4,63		/* bytes left after the 64-byte chunks */
134
135.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
136	srdi.	r6,r4,3
137	beq	.Lcsum_tail_word
138
139	mtctr	r6			/* ctr = remaining whole doublewords */
1403:
141	ld	r6,0(r3)
142	addi	r3,r3,8
143	adde	r0,r0,r6
144	bdnz	3b
145
146	andi.	r4,r4,7			/* bytes left after the doublewords */
147
148.Lcsum_tail_word:			/* Up to 7 bytes to go */
149	srdi.	r6,r4,2
150	beq	.Lcsum_tail_halfword
151
152	lwz	r6,0(r3)
153	addi	r3,r3,4
154	adde	r0,r0,r6
155	subi	r4,r4,4
156
157.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
158	srdi.	r6,r4,1
159	beq	.Lcsum_tail_byte
160
161	lhz	r6,0(r3)
162	addi	r3,r3,2
163	adde	r0,r0,r6
164	subi	r4,r4,2
165
166.Lcsum_tail_byte:			/* Up to 1 byte to go */
167	andi.	r6,r4,1
168	beq	.Lcsum_finish
169
170	lbz	r6,0(r3)
	/*
	 * The odd trailing byte is summed as the first byte of a 16-bit word
	 * whose second byte is zero: on big-endian that means shifting it
	 * into the upper half of the halfword, on little-endian the byte is
	 * added as loaded.
	 */
171#ifdef __BIG_ENDIAN__
172	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
173	adde	r0,r0,r9
174#else
175	adde	r0,r0,r6
176#endif
177
178.Lcsum_finish:
179	addze	r0,r0			/* add in final carry */
180	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	/*
	 * r4 is r0 rotated by 32 bits, so the upper word of r0 + r4 is
	 * hi + lo plus the carry out of the lower-word addition; shifting
	 * right by 32 leaves that 32-bit sum in r3.
	 */
181	add	r3,r4,r0
182	srdi	r3,r3,32
183	blr
184EXPORT_SYMBOL(__csum_partial)
185
186
/*
 * Exception-table helpers for csum_partial_copy_generic.
 *
 * Each macro places a numeric local label at the point of use and emits
 * an EX_TABLE() entry pairing that label with one of the fault-handler
 * labels defined at the end of csum_partial_copy_generic. They are written
 * as a prefix on the same line as a load or store (e.g.
 * "source; ld r6,0(r3)"), so the label is the address of that instruction.
 *
 *   srcnr  -> .Lsrc_error_nr	source -> .Lsrc_error
 *   dstnr  -> .Ldest_error_nr	dest   -> .Ldest_error
 *
 * .Lsrc_error/.Ldest_error first restore r14-r16 and pop the temporary
 * stack frame; the "_nr" entry points skip that step. The plain variants
 * are therefore the ones used while the frame is live, the "nr" variants
 * everywhere else.
 *
 * EX_TABLE is not defined in this file (presumably it comes from one of
 * the included headers).
 */
187	.macro srcnr
188100:
189	EX_TABLE(100b,.Lsrc_error_nr)
190	.endm
191
192	.macro source
193150:
194	EX_TABLE(150b,.Lsrc_error)
195	.endm
196
197	.macro dstnr
198200:
199	EX_TABLE(200b,.Ldest_error_nr)
200	.endm
201
202	.macro dest
203250:
204	EX_TABLE(250b,.Ldest_error)
205	.endm
206
207/*
208 * Computes the checksum of a memory block at src, length len,
209 * and adds in "sum" (32-bit), while copying the block to dst.
210 * If an access exception occurs on src or dst, it stores -EFAULT
211 * to *src_err or *dst_err respectively. The caller must take any action
212 * required in this case (zeroing memory, recalculating partial checksum etc).
213 *
214 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 *
 * Register use, as visible in the code below:
 *   r0      - 64-bit running sum; carries are chained from one adde to
 *             the next through XER[CA] and added in at .Lcopy_finish
 *   r3, r4  - current source / destination pointers; r3 is overwritten
 *             with the 32-bit result on the normal return path
 *   r5      - bytes still to be processed
 *   r6      - incoming sum on entry, scratch afterwards
 *   r7, r8  - src_err / dst_err; only read by the fault handlers at the
 *             end of the function, and either may be NULL
 *   r9-r12  - scratch / data registers for the unrolled loop
 *   r14-r16 - further data registers for the unrolled loop; saved to and
 *             restored from a temporary stack frame around it
 *
 * The fault handlers return without writing r3, so the value returned
 * after a fault is not a checksum.
 *
 * NOTE(review): as in __csum_partial, the carry chain relies on none of
 * the instructions placed between two adde's writing XER[CA] (loads,
 * stores, addi/subi, sub, li, mtctr, bdnz, srdi./andi. do not, per the
 * Power ISA).
215 */
216_GLOBAL(csum_partial_copy_generic)
217	addic	r0,r6,0			/* r0 = sum; adding 0 also clears carry */
218
219	srdi.	r6,r5,3			/* less than 8 bytes? */
220	beq	.Lcopy_tail_word
221
222	/*
223	 * If only halfword aligned, align to a double word. Since odd
224	 * aligned addresses should be rare and they would require more
225	 * work to calculate the correct checksum, we ignore that case
226	 * and take the potential slowdown of unaligned loads.
227	 *
228	 * If the source and destination are relatively unaligned we only
229	 * align the source. This keeps things simple.
230	 */
231	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
232	beq	.Lcopy_aligned
233
	/*
	 * ctr = 4 - r6 = number of halfwords up to the next 8-byte boundary
	 * of the source. r9 is the scratch register here because r7 holds
	 * src_err.
	 */
234	li	r9,4
235	sub	r6,r9,r6
236	mtctr	r6
237
2381:
239srcnr;	lhz	r6,0(r3)		/* align to doubleword */
240	subi	r5,r5,2
241	addi	r3,r3,2
242	adde	r0,r0,r6
243dstnr;	sth	r6,0(r4)
244	addi	r4,r4,2
245	bdnz	1b
246
247.Lcopy_aligned:
248	/*
249	 * We unroll the loop such that each iteration is 64 bytes with an
250	 * entry and exit limb of 64 bytes, meaning a minimum size of
251	 * 128 bytes.
252	 */
253	srdi.	r6,r5,7
254	beq	.Lcopy_tail_doublewords		/* len < 128 */
255
	/*
	 * ctr = (len / 64) - 1 loop iterations; the last 64-byte chunk is
	 * handled by the exit limb after the loop.
	 */
256	srdi	r6,r5,6
257	subi	r6,r6,1
258	mtctr	r6
259
	/*
	 * r14-r16 are used as extra data registers; keep the caller's values.
	 * From here until the frame is popped, accesses are tagged with
	 * source/dest so that a fault restores these registers and the stack
	 * pointer before returning.
	 */
260	stdu	r1,-STACKFRAMESIZE(r1)
261	std	r14,STK_REG(R14)(r1)
262	std	r15,STK_REG(R15)(r1)
263	std	r16,STK_REG(R16)(r1)
264
	/* Entry limb: preload the first 32 bytes of the first chunk */
265source;	ld	r6,0(r3)
266source;	ld	r9,8(r3)
267
268source;	ld	r10,16(r3)
269source;	ld	r11,24(r3)
270
271	/*
272	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
273	 * because of the XER dependency. This means the fastest this loop can
274	 * go is 16 cycles per iteration. The scheduling of the loop below has
275	 * been shown to hit this on both POWER6 and POWER7.
276	 */
	/*
	 * Each pass sums and stores the current 64-byte chunk: the first
	 * 32 bytes are already in r6/r9/r10/r11, the other 32 are loaded into
	 * r12/r14/r15/r16. All eight doublewords are stored before r6/r9 are
	 * reloaded with the start of the next chunk.
	 */
277	.align 5
2782:
279	adde	r0,r0,r6
280source;	ld	r12,32(r3)
281source;	ld	r14,40(r3)
282
283	adde	r0,r0,r9
284source;	ld	r15,48(r3)
285source;	ld	r16,56(r3)
286	addi	r3,r3,64
287
288	adde	r0,r0,r10
289dest;	std	r6,0(r4)
290dest;	std	r9,8(r4)
291
292	adde	r0,r0,r11
293dest;	std	r10,16(r4)
294dest;	std	r11,24(r4)
295
296	adde	r0,r0,r12
297dest;	std	r12,32(r4)
298dest;	std	r14,40(r4)
299
300	adde	r0,r0,r14
301dest;	std	r15,48(r4)
302dest;	std	r16,56(r4)
303	addi	r4,r4,64
304
305	adde	r0,r0,r15
306source;	ld	r6,0(r3)
307source;	ld	r9,8(r3)
308
309	adde	r0,r0,r16
310source;	ld	r10,16(r3)
311source;	ld	r11,24(r3)
312	bdnz	2b
313
314
	/* Exit limb: same as one loop pass, minus the preload of a next chunk */
315	adde	r0,r0,r6
316source;	ld	r12,32(r3)
317source;	ld	r14,40(r3)
318
319	adde	r0,r0,r9
320source;	ld	r15,48(r3)
321source;	ld	r16,56(r3)
322	addi	r3,r3,64
323
324	adde	r0,r0,r10
325dest;	std	r6,0(r4)
326dest;	std	r9,8(r4)
327
328	adde	r0,r0,r11
329dest;	std	r10,16(r4)
330dest;	std	r11,24(r4)
331
332	adde	r0,r0,r12
333dest;	std	r12,32(r4)
334dest;	std	r14,40(r4)
335
336	adde	r0,r0,r14
337dest;	std	r15,48(r4)
338dest;	std	r16,56(r4)
339	addi	r4,r4,64
340
341	adde	r0,r0,r15
342	adde	r0,r0,r16
343
	/*
	 * Restore the saved registers and pop the temporary frame. Accesses
	 * after this point use the srcnr/dstnr annotations again.
	 */
344	ld	r14,STK_REG(R14)(r1)
345	ld	r15,STK_REG(R15)(r1)
346	ld	r16,STK_REG(R16)(r1)
347	addi	r1,r1,STACKFRAMESIZE
348
349	andi.	r5,r5,63		/* bytes left after the 64-byte chunks */
350
351.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
352	srdi.	r6,r5,3
353	beq	.Lcopy_tail_word
354
355	mtctr	r6			/* ctr = remaining whole doublewords */
3563:
357srcnr;	ld	r6,0(r3)
358	addi	r3,r3,8
359	adde	r0,r0,r6
360dstnr;	std	r6,0(r4)
361	addi	r4,r4,8
362	bdnz	3b
363
364	andi.	r5,r5,7			/* bytes left after the doublewords */
365
366.Lcopy_tail_word:			/* Up to 7 bytes to go */
367	srdi.	r6,r5,2
368	beq	.Lcopy_tail_halfword
369
370srcnr;	lwz	r6,0(r3)
371	addi	r3,r3,4
372	adde	r0,r0,r6
373dstnr;	stw	r6,0(r4)
374	addi	r4,r4,4
375	subi	r5,r5,4
376
377.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
378	srdi.	r6,r5,1
379	beq	.Lcopy_tail_byte
380
381srcnr;	lhz	r6,0(r3)
382	addi	r3,r3,2
383	adde	r0,r0,r6
384dstnr;	sth	r6,0(r4)
385	addi	r4,r4,2
386	subi	r5,r5,2
387
388.Lcopy_tail_byte:			/* Up to 1 byte to go */
389	andi.	r6,r5,1
390	beq	.Lcopy_finish
391
392srcnr;	lbz	r6,0(r3)
	/*
	 * The odd trailing byte is summed as the first byte of a 16-bit word
	 * whose second byte is zero: shifted into the upper half of the
	 * halfword on big-endian, added as loaded on little-endian. The byte
	 * stored to dst is the unshifted r6 in both cases.
	 */
393#ifdef __BIG_ENDIAN__
394	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
395	adde	r0,r0,r9
396#else
397	adde	r0,r0,r6
398#endif
399dstnr;	stb	r6,0(r4)
400
401.Lcopy_finish:
402	addze	r0,r0			/* add in final carry */
403	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	/*
	 * r4 is r0 rotated by 32 bits, so the upper word of r0 + r4 is
	 * hi + lo plus the carry out of the lower-word addition; shifting
	 * right by 32 leaves that 32-bit sum in r3.
	 */
404	add	r3,r4,r0
405	srdi	r3,r3,32
406	blr
407
	/*
	 * Fault on a source access made while the temporary frame was live:
	 * restore r14-r16, pop the frame, then fall through.
	 */
408.Lsrc_error:
409	ld	r14,STK_REG(R14)(r1)
410	ld	r15,STK_REG(R15)(r1)
411	ld	r16,STK_REG(R16)(r1)
412	addi	r1,r1,STACKFRAMESIZE
	/* Fault on a source access with no frame to undo */
413.Lsrc_error_nr:
414	cmpdi	0,r7,0
415	beqlr				/* src_err == NULL: nothing to report */
416	li	r6,-EFAULT
417	stw	r6,0(r7)		/* *src_err = -EFAULT (32-bit store) */
418	blr
419
	/*
	 * Fault on a destination access made while the temporary frame was
	 * live: restore r14-r16, pop the frame, then fall through.
	 */
420.Ldest_error:
421	ld	r14,STK_REG(R14)(r1)
422	ld	r15,STK_REG(R15)(r1)
423	ld	r16,STK_REG(R16)(r1)
424	addi	r1,r1,STACKFRAMESIZE
	/* Fault on a destination access with no frame to undo */
425.Ldest_error_nr:
426	cmpdi	0,r8,0
427	beqlr				/* dst_err == NULL: nothing to report */
428	li	r6,-EFAULT
429	stw	r6,0(r8)		/* *dst_err = -EFAULT (32-bit store) */
430	blr
431EXPORT_SYMBOL(csum_partial_copy_generic)
432
/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *			   const struct in6_addr *daddr,
 *			   __u32 len, __u8 proto, __wsum sum)
 *
 * Sums the 16 bytes at saddr, the 16 bytes at daddr, len, proto and the
 * incoming partial checksum "sum" in ones' complement arithmetic, folds
 * the result to 16 bits and returns its complement (the IPv6
 * pseudo-header checksum).
 *
 * In:	 r3 = saddr, r4 = daddr, r5 = len, r6 = proto, r7 = sum
 * Out:	 r3 = folded, complemented 16-bit checksum
 * Uses: r0, r5, r8-r11 and XER[CA] as scratch; no stack frame.
 *
 * Byte order: the address doublewords are summed exactly as loaded from
 * memory, and "sum" is expected in that same form. len and proto, however,
 * are passed as plain CPU-order integers. On big-endian the two forms
 * coincide. On little-endian every 16-bit network-order word of the loaded
 * data appears byte-swapped, so len + proto has to be brought into the
 * same swapped form before it is added. Because a ones' complement sum
 * only cares which byte lane (even or odd) each byte sits in, rotating the
 * 64-bit value by 8 bits is sufficient and no full byte swap is needed.
 */

_GLOBAL(csum_ipv6_magic)
	ld	r8, 0(r3)		/* saddr, first doubleword */
	ld	r9, 8(r3)		/* saddr, second doubleword */
	add	r5, r5, r6		/* r5 = len + proto (CPU byte order) */
	addc	r0, r8, r9		/* start the carry chain */
	ld	r10, 0(r4)		/* daddr, first doubleword */
	ld	r11, 8(r4)		/* daddr, second doubleword */
#ifndef __BIG_ENDIAN__
	rotldi	r5, r5, 8		/* move len + proto into the byte lanes
					 * used by the loaded data; a rotate
					 * loses no bits and leaves CA alone */
#endif
	adde	r0, r0, r10
	add	r5, r5, r7		/* add "sum" only after the rotate: it
					 * is already in memory byte order */
	adde	r0, r0, r11
	adde	r0, r0, r5
	addze	r0, r0			/* add in final carry */
	rotldi  r3, r0, 32		/* fold two 32 bit halves together */
	add	r3, r0, r3
	srdi	r0, r3, 32		/* r0 = 32-bit sum incl. end-around carry */
	rotlwi	r3, r0, 16		/* fold two 16 bit halves together */
	add	r3, r0, r3		/* 16-bit sum is now in bits 16..31 of the low word */
	not	r3, r3			/* ones' complement of the sum */
	rlwinm	r3, r3, 16, 16, 31	/* move it down and keep only 16 bits */
	blr
EXPORT_SYMBOL(csum_ipv6_magic)
460