xref: /linux/arch/parisc/lib/lusercopy.S (revision 8ce936c2f1a68c3a4f46578eed016ff92a67fbc6)
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*
3 *    User Space Access Routines
4 *
5 *    Copyright (C) 2000-2002 Hewlett-Packard (John Marvin)
6 *    Copyright (C) 2000 Richard Hirst <rhirst with parisc-linux.org>
7 *    Copyright (C) 2001 Matthieu Delahaye <delahaym at esiee.fr>
8 *    Copyright (C) 2003 Randolph Chung <tausq with parisc-linux.org>
9 *    Copyright (C) 2017 Helge Deller <deller@gmx.de>
10 *    Copyright (C) 2017 John David Anglin <dave.anglin@bell.net>
11 */
12
13/*
14 * These routines still have plenty of room for optimization
15 * (word & doubleword load/store, dual issue, store hints, etc.).
16 */
17
18/*
19 * The following routines assume that space register 3 (sr3) contains
20 * the space id associated with the current user's address space.
21 */
22
23
24	.text
25
26#include <asm/assembly.h>
27#include <asm/errno.h>
28#include <linux/linkage.h>
29
30	/*
31	 * unsigned long lclear_user(void *to, unsigned long n)
32	 *
33	 * Returns 0 for success.
34	 * otherwise, returns number of bytes not transferred.
35	 */
36
37ENTRY_CFI(lclear_user)
	/*
	 * Register usage (PA-RISC calling convention):
	 *   %r26 = to - user address, advanced by one for every byte stored
	 *   %r25 = n  - bytes still to clear; copied to %r28 as the result
	 *   %r2       - return address
	 * All stores go through space register %sr3.
	 */
	/* n == 0: nothing to do.  ",n" nullifies the delay slot. */
38	comib,=,n   0,%r25,$lclu_done
39$lclu_loop:
	/*
	 * Count down first, then store the zero byte from the delay slot.
	 * The slot is not nullified, so the store also runs on the last
	 * pass, when the count reaches zero and the branch falls through.
	 */
40	addib,<>    -1,%r25,$lclu_loop
411:	stbs,ma     %r0,1(%sr3,%r26)
42
43$lclu_done:
	/* Return to the caller; the delay slot copies %r25 to the result. */
44	bv          %r0(%r2)
45	copy        %r25,%r28
46
	/*
	 * Fixup for a fault on the store at label 1: the count was already
	 * decremented for the byte that could not be written, so add it
	 * back (in the delay slot) and return the number of bytes left.
	 */
472:	b           $lclu_done
48	ldo         1(%r25),%r25
49
	/*
	 * Exception table entry pairing the store at label 1 with the
	 * fixup at label 2 (macro presumably from asm/assembly.h).
	 */
50	ASM_EXCEPTIONTABLE_ENTRY(1b,2b)
51ENDPROC_CFI(lclear_user)
52
53
54/*
55 * unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
56 *
57 * Inputs:
58 * - sr1 already contains space of source region
59 * - sr2 already contains space of destination region
60 *
61 * Returns:
62 * - number of bytes that could not be copied.
63 *   On success, this will be zero.
64 *
65 * This code is based on a C-implementation of a copy routine written by
66 * Randolph Chung, which in turn was derived from the glibc.
67 *
68 * Several strategies are used to get the best performance under various
69 * conditions. In the optimal case, we copy by loops that copy 32 or 16 bytes
70 * at a time using general registers.  Unaligned copies are handled either by
71 * aligning the destination and then using shift-and-write method, or in a few
72 * cases by falling back to a byte-at-a-time copy.
73 *
74 * Testing with various alignments and buffer sizes shows that this code is
75 * often >10x faster than a simple byte-at-a-time copy, even for strangely
76 * aligned operands. It is interesting to note that the glibc version of memcpy
77 * (written in C) is actually quite fast already. This routine is able to beat
78 * it by 30-40% for aligned copies because of the loop unrolling, but in some
79 * cases the glibc version is still slightly faster. This lends more
80 * credibility that gcc can generate very good code as long as we are careful.
81 *
82 * Possible optimizations:
83 * - add cache prefetching
84 * - try not to use the post-increment address modifiers; they may create
85 *   additional interlocks. Assumption is that those were only efficient on old
86 *   machines (pre PA8000 processors)
87 */
88
	/*
	 * Symbolic register names used by pa_memcpy.
	 *
	 * dst, src and len are the three arguments.  end is set to
	 * dst + len on entry and is only read again when the return
	 * value (end - dst) is computed.
	 */
89	dst = arg0
90	src = arg1
91	len = arg2
92	end = arg3
	/* General scratch registers. */
93	t1  = r19
94	t2  = r20
95	t3  = r21
96	t4  = r22
	/* Space registers used for every source / destination access. */
97	srcspc = sr1
98	dstspc = sr2
99
	/*
	 * t0 is an extra scratch register.  a0-a3 are second names for
	 * t4, t1, t2 and t3; the shift-and-store loop uses them as a
	 * rotating window of source words.
	 */
100	t0 = r1
101	a1 = t1
102	a2 = t2
103	a3 = t3
104	a0 = t4
105
	/*
	 * Snapshot of src, dst and len taken at .Lcopy_dstaligned.  It is
	 * used at .Lcda_finish to recompute src and len from the progress
	 * of dst.  save_src shares ret0, which is otherwise only written
	 * with the return value at .Lcopy_done.
	 */
106	save_src = ret0
107	save_dst = ret1
108	save_len = r31
109
110ENTRY_CFI(pa_memcpy)
	/*
	 * Overview of the code paths below:
	 *  - len < 16: byte loop only.
	 *  - src and dst share the same alignment: align dst with byte
	 *    copies, run an unrolled doubleword loop (64-bit kernels) or
	 *    word loop, then finish with the word and/or byte loops.
	 *  - otherwise: word-align dst, then assemble every destination
	 *    word from two neighbouring source words with shrpw.
	 *
	 * Every load and store has an exception table entry.  All fault
	 * paths eventually reach .Lcopy_done, which returns end - dst,
	 * i.e. the number of bytes that were not stored.
	 */
111	/* Last destination address */
	/* end = dst + len, i.e. the address just past the last byte. */
112	add	dst,len,end
113
114	/* short copy with less than 16 bytes? */
	/* Taken when 15 >= len (unsigned); ",n" nullifies the delay slot. */
115	cmpib,COND(>>=),n 15,len,.Lbyte_loop
116
117	/* same alignment? */
	/*
	 * t1 = low two bits of (src ^ dst).  If they differ, src and dst
	 * can never be word-aligned at the same time.
	 */
118	xor	src,dst,t0
119	extru	t0,31,2,t1
120	cmpib,<>,n  0,t1,.Lunaligned_copy
121
122#ifdef CONFIG_64BIT
123	/* only do 64-bit copies if we can get aligned. */
	/* Same test on the low three bits of (src ^ dst), still in t0. */
124	extru	t0,31,3,t1
125	cmpib,<>,n  0,t1,.Lalign_loop32
126
127	/* loop until we are 64-bit aligned */
128.Lalign_loop64:
	/* Done aligning once the low three bits of dst are zero. */
129	extru	dst,31,3,t1
130	cmpib,=,n	0,t1,.Lcopy_loop_16_start
13120:	ldb,ma	1(srcspc,src),t1
13221:	stb,ma	t1,1(dstspc,dst)
	/* len is decremented in the delay slot of the loop branch. */
133	b	.Lalign_loop64
134	ldo	-1(len),len
135
136	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
137	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)
138
139.Lcopy_loop_16_start:
	/* t0 = 31 is the loop bound: keep going while len > 31. */
140	ldi	31,t0
141.Lcopy_loop_16:
142	cmpb,COND(>>=),n t0,len,.Lword_loop
143
	/*
	 * One pass moves 32 bytes as two groups of two doublewords.  src
	 * is advanced by 16 after each pair of loads; dst is advanced by
	 * the post-modifying stores.
	 */
14410:	ldd	0(srcspc,src),t1
14511:	ldd	8(srcspc,src),t2
146	ldo	16(src),src
14712:	std,ma	t1,8(dstspc,dst)
14813:	std,ma	t2,8(dstspc,dst)
14914:	ldd	0(srcspc,src),t1
15015:	ldd	8(srcspc,src),t2
151	ldo	16(src),src
15216:	std,ma	t1,8(dstspc,dst)
15317:	std,ma	t2,8(dstspc,dst)
154
	/*
	 * A fault on the second load of a pair (11, 15) goes to
	 * .Lcopy16_fault, which still stores the doubleword already held
	 * in t1.  Every other fault goes straight to .Lcopy_done.
	 */
155	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
156	ASM_EXCEPTIONTABLE_ENTRY(11b,.Lcopy16_fault)
157	ASM_EXCEPTIONTABLE_ENTRY(12b,.Lcopy_done)
158	ASM_EXCEPTIONTABLE_ENTRY(13b,.Lcopy_done)
159	ASM_EXCEPTIONTABLE_ENTRY(14b,.Lcopy_done)
160	ASM_EXCEPTIONTABLE_ENTRY(15b,.Lcopy16_fault)
161	ASM_EXCEPTIONTABLE_ENTRY(16b,.Lcopy_done)
162	ASM_EXCEPTIONTABLE_ENTRY(17b,.Lcopy_done)
163
	/* len -= 32 in the delay slot of the loop branch. */
164	b	.Lcopy_loop_16
165	ldo	-32(len),len
166
167.Lword_loop:
	/* Copy whole words while len > 3, then hand over to the byte loop. */
168	cmpib,COND(>>=),n 3,len,.Lbyte_loop
16920:	ldw,ma	4(srcspc,src),t1
17021:	stw,ma	t1,4(dstspc,dst)
171	b	.Lword_loop
172	ldo	-4(len),len
173
174	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
175	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)
176
177#endif /* CONFIG_64BIT */
178
179	/* loop until we are 32-bit aligned */
180.Lalign_loop32:
	/* Done aligning once the low two bits of dst are zero. */
181	extru	dst,31,2,t1
182	cmpib,=,n	0,t1,.Lcopy_loop_8
18320:	ldb,ma	1(srcspc,src),t1
18421:	stb,ma	t1,1(dstspc,dst)
185	b	.Lalign_loop32
186	ldo	-1(len),len
187
188	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
189	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)
190
191
192.Lcopy_loop_8:
	/* Runs while len > 15; shorter remainders go to the byte loop. */
193	cmpib,COND(>>=),n 15,len,.Lbyte_loop
194
	/*
	 * One pass moves 16 bytes as four words.  All four loads use
	 * fixed offsets from src, which is advanced by 16 only once,
	 * after the last load.
	 */
19510:	ldw	0(srcspc,src),t1
19611:	ldw	4(srcspc,src),t2
19712:	stw,ma	t1,4(dstspc,dst)
19813:	stw,ma	t2,4(dstspc,dst)
19914:	ldw	8(srcspc,src),t1
20015:	ldw	12(srcspc,src),t2
201	ldo	16(src),src
20216:	stw,ma	t1,4(dstspc,dst)
20317:	stw,ma	t2,4(dstspc,dst)
204
	/*
	 * A fault on the second load of a pair (11, 15) goes to
	 * .Lcopy8_fault, which still stores the word already held in t1.
	 * Every other fault goes straight to .Lcopy_done.
	 */
205	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
206	ASM_EXCEPTIONTABLE_ENTRY(11b,.Lcopy8_fault)
207	ASM_EXCEPTIONTABLE_ENTRY(12b,.Lcopy_done)
208	ASM_EXCEPTIONTABLE_ENTRY(13b,.Lcopy_done)
209	ASM_EXCEPTIONTABLE_ENTRY(14b,.Lcopy_done)
210	ASM_EXCEPTIONTABLE_ENTRY(15b,.Lcopy8_fault)
211	ASM_EXCEPTIONTABLE_ENTRY(16b,.Lcopy_done)
212	ASM_EXCEPTIONTABLE_ENTRY(17b,.Lcopy_done)
213
	/* len -= 16 in the delay slot of the loop branch. */
214	b	.Lcopy_loop_8
215	ldo	-16(len),len
216
217.Lbyte_loop:
	/*
	 * cmpclr nullifies the following branch while len != 0, so the
	 * branch to .Lcopy_done is taken only once len has reached zero.
	 */
218	cmpclr,COND(<>) len,%r0,%r0
219	b,n	.Lcopy_done
22020:	ldb	0(srcspc,src),t1
221	ldo	1(src),src
22221:	stb,ma	t1,1(dstspc,dst)
223	b	.Lbyte_loop
224	ldo	-1(len),len
225
226	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
227	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)
228
229.Lcopy_done:
	/*
	 * Common exit for success and faults.  The delay slot computes
	 * the return value ret0 = end - dst, the bytes not stored.
	 */
230	bv	%r0(%r2)
231	sub	end,dst,ret0
232
233
234	/* src and dst are not aligned the same way. */
235	/* need to go the hard way */
236.Lunaligned_copy:
237	/* align until dst is 32bit-word-aligned */
238	extru	dst,31,2,t1
239	cmpib,=,n	0,t1,.Lcopy_dstaligned
24020:	ldb	0(srcspc,src),t1
241	ldo	1(src),src
24221:	stb,ma	t1,1(dstspc,dst)
243	b	.Lunaligned_copy
244	ldo	-1(len),len
245
246	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
247	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)
248
249.Lcopy_dstaligned:
250
251	/* store src, dst and len in safe place */
252	copy	src,save_src
253	copy	dst,save_dst
254	copy	len,save_len
255
256	/* len now needs to give the number of words to copy */
	/* SHRREG (presumably from asm/assembly.h) shifts len right by two. */
257	SHRREG	len,2,len
258
259	/*
260	 * Copy from a not-aligned src to an aligned dst using shifts.
261	 * Handles 4 words per loop.
262	 */
263
	/*
	 * Shift amount: t0 = (src & 3) * 8, then %sar = 32 - t0, so that
	 * shrpw on two consecutive aligned source words yields the word
	 * that starts at the original, unaligned src position.
	 */
264	depw,z src,28,2,t0
265	subi 32,t0,t0
266	mtsar t0
	/*
	 * t0 = word count modulo 4 selects the entry point into the
	 * unrolled loop.  Each .LcaseN preloads two source words and
	 * adjusts len so that the "len -= 4" at the end of .Ldo1 reaches
	 * zero exactly when one last store (.Ldo0) remains.
	 */
267	extru len,31,2,t0
	/* This branch is not nullified, so the depi below always runs. */
268	cmpib,= 2,t0,.Lcase2
269	/* Make src aligned by rounding it down.  */
270	depi 0,31,2,src
271
	/* cmpiclr nullifies the next branch unless t0 equals the immediate. */
272	cmpiclr,<> 3,t0,%r0
273	b,n .Lcase3
274	cmpiclr,<> 1,t0,%r0
275	b,n .Lcase1
276.Lcase0:
	/* Word count is a multiple of four; zero words means only bytes left. */
277	cmpb,COND(=) %r0,len,.Lcda_finish
278	nop
279
2801:	ldw,ma 4(srcspc,src), a3
281	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
2821:	ldw,ma 4(srcspc,src), a0
283	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
284	b,n .Ldo3
285.Lcase1:
2861:	ldw,ma 4(srcspc,src), a2
287	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
2881:	ldw,ma 4(srcspc,src), a3
289	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
290	ldo -1(len),len
	/* Exactly one word to store: go straight to the final store. */
291	cmpb,COND(=),n %r0,len,.Ldo0
	/*
	 * Unrolled body: each step loads the next source word, merges the
	 * two previously loaded words with shrpw and stores the result.
	 * Read faults go to .Lcda_rdfault, store faults to .Lcopy_done.
	 */
292.Ldo4:
2931:	ldw,ma 4(srcspc,src), a0
294	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
295	shrpw a2, a3, %sar, t0
2961:	stw,ma t0, 4(dstspc,dst)
297	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
298.Ldo3:
2991:	ldw,ma 4(srcspc,src), a1
300	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
301	shrpw a3, a0, %sar, t0
3021:	stw,ma t0, 4(dstspc,dst)
303	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
304.Ldo2:
3051:	ldw,ma 4(srcspc,src), a2
306	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
307	shrpw a0, a1, %sar, t0
3081:	stw,ma t0, 4(dstspc,dst)
309	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
310.Ldo1:
3111:	ldw,ma 4(srcspc,src), a3
312	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
313	shrpw a1, a2, %sar, t0
3141:	stw,ma t0, 4(dstspc,dst)
315	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
316	ldo -4(len),len
317	cmpb,COND(<>) %r0,len,.Ldo4
318	nop
319.Ldo0:
	/* Final store: uses the last two words loaded, no further load. */
320	shrpw a2, a3, %sar, t0
3211:	stw,ma t0, 4(dstspc,dst)
322	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
323
	/*
	 * Reached by falling through after the last store, from .Lcase0
	 * with no words to copy, and from any read fault in the shift
	 * loop.  The byte loop then copies whatever is left.
	 */
324.Lcda_rdfault:
325.Lcda_finish:
326	/* calculate new src, dst and len and jump to byte-copy loop */
	/*
	 * t0 = bytes stored since .Lcopy_dstaligned; src and len are
	 * rebuilt from the saved values (len in the branch delay slot).
	 */
327	sub	dst,save_dst,t0
328	add	save_src,t0,src
329	b	.Lbyte_loop
330	sub	save_len,t0,len
331
332.Lcase3:
3331:	ldw,ma 4(srcspc,src), a0
334	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
3351:	ldw,ma 4(srcspc,src), a1
336	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	/* Enter at .Ldo2; len += 1 in the delay slot. */
337	b .Ldo2
338	ldo 1(len),len
339.Lcase2:
3401:	ldw,ma 4(srcspc,src), a1
341	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
3421:	ldw,ma 4(srcspc,src), a2
343	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
	/* Enter at .Ldo1; len += 2 in the delay slot. */
344	b .Ldo1
345	ldo 2(len),len
346
347
348	/* fault exception fixup handlers: */
	/*
	 * Both handlers are entered when the second load of a pair
	 * faulted.  The first load succeeded, so its value (t1) is still
	 * stored from the branch delay slot before returning.  A fault on
	 * that store goes to .Lcopy_done as well.
	 */
349#ifdef CONFIG_64BIT
350.Lcopy16_fault:
351	b	.Lcopy_done
35210:	std,ma	t1,8(dstspc,dst)
353	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
354#endif
355
356.Lcopy8_fault:
357	b	.Lcopy_done
35810:	stw,ma	t1,4(dstspc,dst)
359	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
360ENDPROC_CFI(pa_memcpy)
361
362	.end
363