xref: /linux/arch/powerpc/lib/checksum_32.S (revision 02680c23d7b3febe45ea3d4f9818c2b2dc89020a)
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

	.text

/*
 * __csum_partial(buff, len, sum)
 *
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * In:	r3 = buff, r4 = len, r5 = sum
 * Out:	r3 = 32-bit sum with the final carry folded back in
 * Uses:	r0, r6, r7, r8, CTR, CR0 and the carry bit (XER[CA])
 *
 * r3 is biased by -4 on entry so that the next unread data always sits
 * at 4(r3) and the update-form loads (lwzu) step the pointer as they go.
 * The sum is accumulated in r5 with adde, so the carry bit is live from
 * the initial srawi. until the closing addze; none of the instructions
 * in between, other than adde itself, modify it.
 */
_GLOBAL(__csum_partial)
	subi	r3,r3,4		/* bias pointer: data now starts at 4(r3) */
	srawi.	r6,r4,2		/* Divide len by 4 and also clear carry */
				/* (carry ends up clear for non-negative len) */
	beq	3f		/* if we're doing < 4 bytes */
	andi.	r0,r3,2		/* Align buffer to longword boundary */
	beq+	1f
	lhz	r0,4(r3)	/* do 2 bytes to get aligned */
	subi	r4,r4,2
	addi	r3,r3,2
	srwi.	r6,r4,2		/* # words to do */
	adde	r5,r5,r0
	beq	3f
1:	andi.	r6,r6,3		/* Prepare to handle words 4 by 4 */
	beq	21f
	mtctr	r6		/* 1..3 odd words, summed one at a time */
2:	lwzu	r0,4(r3)
	adde	r5,r5,r0
	bdnz	2b
21:	srwi.	r6,r4,4		/* # blocks of 4 words to do */
	beq	3f
	/*
	 * 4-words-per-iteration loop.  The fourth word of each group (r8)
	 * is loaded with lwzu, which also advances r3 by 16, and is only
	 * added at the top of the next iteration, or at 23: after the
	 * last group.
	 */
	lwz	r0,4(r3)
	mtctr	r6
	lwz	r6,8(r3)
	adde	r5,r5,r0
	lwz	r7,12(r3)
	adde	r5,r5,r6
	lwzu	r8,16(r3)
	adde	r5,r5,r7
	bdz	23f		/* only one group: skip the loop body */
22:	lwz	r0,4(r3)
	adde	r5,r5,r8	/* fourth word of the previous group */
	lwz	r6,8(r3)
	adde	r5,r5,r0
	lwz	r7,12(r3)
	adde	r5,r5,r6
	lwzu	r8,16(r3)
	adde	r5,r5,r7
	bdnz	22b
23:	adde	r5,r5,r8	/* fourth word of the last group */
3:	andi.	r0,r4,2		/* trailing halfword? */
	beq+	4f
	lhz	r0,4(r3)
	addi	r3,r3,2
	adde	r5,r5,r0
4:	andi.	r0,r4,1		/* trailing byte? */
	beq+	5f
	lbz	r0,4(r3)
	slwi	r0,r0,8		/* place it as the high byte of a halfword */
	adde	r5,r5,r0
5:	addze	r3,r5		/* add in final carry */
	blr
EXPORT_SYMBOL(__csum_partial)

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in 0xffffffff, while copying the block to dst.
 * If an access exception occurs it returns zero.
 *
 * csum_partial_copy_generic(src, dst, len)
 */
/*
 * CSUM_COPY_16_BYTES_WITHEX(n)
 *
 * Copies four words (16 bytes) from 4(r4).. to 4(r6).. and adds each
 * word into r12 with adde.  The last load and the last store are update
 * forms, so both r4 and r6 end up advanced by 16.  r7-r10 are used as
 * temporaries.
 *
 * Each load and store is tagged with a numeric local label 8<n>0 .. 8<n>7
 * so that CSUM_COPY_16_BYTES_EXCODE(n) can attach an exception table
 * entry to it.
 */
#define CSUM_COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
	adde	r12,r12,r7;	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
	adde	r12,r12,r8;	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
	adde	r12,r12,r9;	\
8 ## n ## 7:			\
	stwu	r10,16(r6);	\
	adde	r12,r12,r10

/*
 * CSUM_COPY_16_BYTES_EXCODE(n)
 *
 * Emits one EX_TABLE entry, pointing at fault, for each of the eight
 * labelled loads/stores 8<n>0 .. 8<n>7 of CSUM_COPY_16_BYTES_WITHEX(n).
 * The labels are referenced with the "b" (backward) suffix, so this must
 * be expanded after the matching CSUM_COPY_16_BYTES_WITHEX(n).
 */
#define CSUM_COPY_16_BYTES_EXCODE(n)		\
	EX_TABLE(8 ## n ## 0b, fault);	\
	EX_TABLE(8 ## n ## 1b, fault);	\
	EX_TABLE(8 ## n ## 2b, fault);	\
	EX_TABLE(8 ## n ## 3b, fault);	\
	EX_TABLE(8 ## n ## 4b, fault);	\
	EX_TABLE(8 ## n ## 5b, fault);	\
	EX_TABLE(8 ## n ## 6b, fault);	\
	EX_TABLE(8 ## n ## 7b, fault);

118	.text
119	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
120	.stabs	"checksum_32.S",N_SO,0,0,0f
1210:
122
123CACHELINE_BYTES = L1_CACHE_BYTES
124LG_CACHELINE_BYTES = L1_CACHE_SHIFT
125CACHELINE_MASK = (L1_CACHE_BYTES-1)
126
/*
 * csum_partial_copy_generic(src, dst, len)
 *
 * Copies len bytes from src to dst while summing them, starting from
 * 0xffffffff.  Returns the sum in r3, or 0 if one of the accesses listed
 * in the exception table below faults.
 *
 * In:	r3 = src, r4 = dst, r5 = len
 *
 * Register use after the entry sequence:
 *	r4	source pointer, biased by -4
 *	r6	destination pointer, biased by -4
 *	r5	bytes still to do
 *	r12	running sum, accumulated with adde; the carry bit is live
 *		until the final addze
 *	r11	constant 4, the offset used by dcbz
 *	r3	scratch: byte accumulator in the alignment prologue, then
 *		the dcbt prefetch offset in the cacheline loop
 *	r0, r7-r10	scratch
 *	cr7[eq]	cleared when the alignment prologue sees an odd
 *		destination address; the result is then rotated by one
 *		byte before returning
 */
_GLOBAL(csum_partial_copy_generic)
	li	r12,-1			/* sum starts at 0xffffffff */
	addic	r0,r0,0			/* clear carry */
	addi	r6,r4,-4		/* r6 = dst - 4 */
	neg	r0,r4
	addi	r4,r3,-4		/* r4 = src - 4 */
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	crset	4*cr7+eq		/* default: no final rotate */
	beq	58f			/* dst already cacheline aligned */

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	rlwinm	r7,r6,3,0x8		/* r7 = 8 if dst is odd, else 0 */
	rlwnm	r12,r12,r7,0,31	/* odd destination address: rotate one byte */
	cmplwi	cr7,r7,0	/* is destination address even ? */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
	li	r3,0			/* r3 gathers the leading bytes */
70:	lbz	r9,4(r4)		/* do some bytes */
	addi	r4,r4,1
	slwi	r3,r3,8			/* r3 = (r3 << 8) | byte */
	rlwimi	r3,r9,0,24,31
71:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	70b
	adde	r12,r12,r3
61:	subf	r5,r0,r5		/* len -= bytes up to the cache line */
	srwi.	r0,r0,2			/* # whole words up to the cache line */
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	adde	r12,r12,r9
73:	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES	/* r5 = bytes left after them */
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0			/* r7 = # cachelines prefetched ahead */
	ble	114f			/* single cacheline: none */
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

	/*
	 * First pass: copy (r0 - r7) cachelines while prefetching r7 lines
	 * ahead.  r0 is then set to r7, the number of cachelines left for
	 * the second pass.
	 */
114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6			/* zero the destination cacheline */
/* the main body of the cacheline loop */
	CSUM_COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_WITHEX(2)
	CSUM_COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_WITHEX(4)
	CSUM_COPY_16_BYTES_WITHEX(5)
	CSUM_COPY_16_BYTES_WITHEX(6)
	CSUM_COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	/*
	 * Second pass, if r0 != 0: go round again for the remaining r0
	 * cachelines with r3 = 4 and r7 = 0, i.e. dcbt only touches the
	 * line about to be read and r0 becomes 0 at 114.
	 */
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2			/* # remaining whole words */
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	adde	r12,r12,r0
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,2			/* trailing halfword? */
	beq+	65f
40:	lhz	r0,4(r4)
	addi	r4,r4,2
41:	sth	r0,4(r6)
	adde	r12,r12,r0
	addi	r6,r6,2
65:	andi.	r0,r5,1			/* trailing byte? */
	beq+	66f
50:	lbz	r0,4(r4)
51:	stb	r0,4(r6)
	slwi	r0,r0,8			/* place it as the high byte of a halfword */
	adde	r12,r12,r0
66:	addze	r3,r12			/* add in final carry */
	beqlr+	cr7			/* done unless dst was seen to be odd */
	rlwinm	r3,r3,8,0,31	/* odd destination address: rotate one byte */
	blr

/* Common landing point for every exception table entry below */
fault:
	li	r3,0
	blr

	/* alignment prologue (bytes, then words) and the dcbz */
	EX_TABLE(70b, fault);
	EX_TABLE(71b, fault);
	EX_TABLE(72b, fault);
	EX_TABLE(73b, fault);
	EX_TABLE(54b, fault);

/*
 * Exception table entries for the cacheline loop: a fault on any of its
 * loads or stores branches to fault.
 */
	CSUM_COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	CSUM_COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	CSUM_COPY_16_BYTES_EXCODE(2)
	CSUM_COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	CSUM_COPY_16_BYTES_EXCODE(4)
	CSUM_COPY_16_BYTES_EXCODE(5)
	CSUM_COPY_16_BYTES_EXCODE(6)
	CSUM_COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

	/* trailing words, halfword and byte */
	EX_TABLE(30b, fault);
	EX_TABLE(31b, fault);
	EX_TABLE(40b, fault);
	EX_TABLE(41b, fault);
	EX_TABLE(50b, fault);
	EX_TABLE(51b, fault);

EXPORT_SYMBOL(csum_partial_copy_generic)

/*
 * __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
 *			   const struct in6_addr *daddr,
 *			   __u32 len, __u8 proto, __wsum sum)
 *
 * In:	r3 = saddr, r4 = daddr, r5 = len, r6 = proto, r7 = sum
 * Out:	r3 = complemented 16-bit fold of the sum, in the low 16 bits
 * Uses:	r0, r8-r11 and the carry bit (XER[CA])
 *
 * Four words are read from each address and added, together with
 * len + proto, into sum using a single addc/adde carry chain; loads
 * are interleaved with the adds.
 */

_GLOBAL(csum_ipv6_magic)
	lwz	r8, 0(r3)
	lwz	r9, 4(r3)
	addc	r0, r7, r8	/* starts the carry chain: sum + saddr[0] */
	lwz	r10, 8(r3)
	adde	r0, r0, r9
	lwz	r11, 12(r3)
	adde	r0, r0, r10
	lwz	r8, 0(r4)
	adde	r0, r0, r11
	lwz	r9, 4(r4)
	adde	r0, r0, r8
	lwz	r10, 8(r4)
	adde	r0, r0, r9
	lwz	r11, 12(r4)
	adde	r0, r0, r10
	add	r5, r5, r6	/* assumption: len + proto doesn't carry */
	adde	r0, r0, r11
	adde	r0, r0, r5
	addze	r0, r0		/* add in final carry */
	rotlwi	r3, r0, 16	/* swap the two halfwords ... */
	add	r3, r0, r3	/* ... so the upper half becomes hi + lo (+ carry) */
	not	r3, r3		/* one's complement */
	rlwinm	r3, r3, 16, 16, 31	/* move the upper half down, clear the rest */
	blr
EXPORT_SYMBOL(csum_ipv6_magic)
313