/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version
 *  2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>

/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
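/*
 * Roughly equivalent C (a sketch only, ignoring the alignment handling
 * and unrolling below; buff/len/sum mirror the register arguments):
 *
 *	u64 csum = sum;
 *	while (len >= 8) {
 *		u64 v = *(u64 *)buff;
 *		csum += v;
 *		if (csum < v)		// 64-bit overflow:
 *			csum++;		// end-around carry
 *		buff += 8;
 *		len -= 8;
 *	}
 *	// 4-, 2- and 1-byte tails follow, then the two 32-bit
 *	// halves are folded together; see .Lcsum_finish.
 */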
_GLOBAL(__csum_partial)
	addic	r0,r5,0			/* clear carry */
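	/*
	 * addic with immediate 0 copies sum into r0 and leaves XER[CA]
	 * clear, so the adde chain below starts without a stale carry.
	 * Every subsequent add is carry-extended; the accumulated carry
	 * is folded back in by the addze in .Lcsum_finish.
	 */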

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6
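	/*
	 * ctr = 4 - ((buff >> 1) & 0x3), the number of halfwords from
	 * buff up to the next doubleword boundary; the loop below
	 * checksums them two bytes at a time.
	 */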

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
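	/*
	 * A worked case (not from the original source): for len = 200,
	 * ctr is set to (200 >> 6) - 1 = 2 loop iterations; those plus
	 * the 64-byte exit limb cover 192 bytes, and the remaining
	 * 200 & 63 = 8 bytes fall through to the tail code.
	 */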
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords		/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
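	/*
	 * Eight adde instructions at 2 cycles each give that 16-cycle
	 * floor; at 64 bytes per iteration this works out to 4 bytes per
	 * cycle. The loads are interleaved between the addes so each
	 * value is available well before the adde that consumes it.
	 */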
	.align 5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9

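	/*
	 * Fold the 64-bit running sum down to the 32-bit result the
	 * caller expects. In C terms (a sketch; rotl64 is a hypothetical
	 * 64-bit rotate helper):
	 *
	 *	sum += ca;				// addze
	 *	return (sum + rotl64(sum, 32)) >> 32;	// hi + lo + carry
	 */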
.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
EXPORT_SYMBOL(__csum_partial)


	.macro srcnr
100:
	EX_TABLE(100b,.Lsrc_error_nr)
	.endm

	.macro source
150:
	EX_TABLE(150b,.Lsrc_error)
	.endm

	.macro dstnr
200:
	EX_TABLE(200b,.Ldest_error_nr)
	.endm

	.macro dest
250:
	EX_TABLE(250b,.Ldest_error)
	.endm
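
/*
 * Each macro plants a local label on the instruction that follows it on
 * the same line and adds an exception-table entry sending a fault at
 * that instruction to the named fixup. The "source"/"dest" variants are
 * used inside the unrolled loop, where the fixup must restore r14-r16
 * and pop the stack frame first; the "srcnr"/"dstnr" ("no restore")
 * variants are used where no frame is active.
 */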

/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any action
 * required in this case (zeroing memory, recalculating partial checksum etc.).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
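/*
 * On the success path this behaves like a memcpy fused with
 * __csum_partial above, performed in a single pass over the data
 * (a sketch, not the exact instruction sequence):
 *
 *	memcpy(dst, src, len);
 *	return __csum_partial(src, len, sum);
 */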
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a double word. Since odd
	 * aligned addresses should be rare and they would require more
	 * work to calculate the correct checksum, we ignore that case
	 * and take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are relatively unaligned we only
	 * align the source. This keeps things simple.
	 */
	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r9,4
	sub	r6,r9,r6
	mtctr	r6

1:
srcnr;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords		/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back adde instructions take 2 cycles
	 * because of the XER dependency. This means the fastest this loop can
	 * go is 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align 5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
srcnr;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dstnr;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

srcnr;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dstnr;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

srcnr;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dstnr;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

srcnr;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dstnr;	stb	r6,0(r4)

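	/* Same 64-bit to 32-bit fold as in __csum_partial above. */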
.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

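/*
 * Fixup targets for the exception-table entries planted by the macros
 * above. The plain variants run when a fault hits inside the unrolled
 * loop, so they restore r14-r16 and pop the stack frame before
 * reporting; the _nr variants run when no frame is active. Both store
 * -EFAULT through the corresponding error pointer if it is non-NULL.
 */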
.Lsrc_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Lsrc_error_nr:
	cmpdi	0,r7,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	addi	r1,r1,STACKFRAMESIZE
.Ldest_error_nr:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr
EXPORT_SYMBOL(csum_partial_copy_generic)