xref: /linux/arch/parisc/lib/lusercopy.S (revision 680e6ffa15103ab610c0fc1241d2f98c801b13e2)
1/*
2 *    User Space Access Routines
3 *
4 *    Copyright (C) 2000-2002 Hewlett-Packard (John Marvin)
5 *    Copyright (C) 2000 Richard Hirst <rhirst with parisc-linux.org>
6 *    Copyright (C) 2001 Matthieu Delahaye <delahaym at esiee.fr>
7 *    Copyright (C) 2003 Randolph Chung <tausq with parisc-linux.org>
8 *    Copyright (C) 2017 Helge Deller <deller@gmx.de>
9 *    Copyright (C) 2017 John David Anglin <dave.anglin@bell.net>
10 *
11 *
12 *    This program is free software; you can redistribute it and/or modify
13 *    it under the terms of the GNU General Public License as published by
14 *    the Free Software Foundation; either version 2, or (at your option)
15 *    any later version.
16 *
17 *    This program is distributed in the hope that it will be useful,
18 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
19 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20 *    GNU General Public License for more details.
21 *
22 *    You should have received a copy of the GNU General Public License
23 *    along with this program; if not, write to the Free Software
24 *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
25 */
26
27/*
28 * These routines still have plenty of room for optimization
29 * (word & doubleword load/store, dual issue, store hints, etc.).
30 */
31
/*
 * The following routines assume that space register 3 (sr3) contains
 * the space id associated with the current user's address space.
 */
36
37
38	.text
39
40#include <asm/assembly.h>
41#include <asm/errno.h>
42#include <linux/linkage.h>
43
44	/*
45	 * get_sr gets the appropriate space value into
46	 * sr1 for kernel/user space access, depending
47	 * on the flag stored in the task structure.
48	 */
49
50	.macro  get_sr
51	mfctl       %cr30,%r1
52	ldw         TI_SEGMENT(%r1),%r22
53	mfsp        %sr3,%r1
54	or,<>       %r22,%r0,%r0
55	copy        %r0,%r1
56	mtsp        %r1,%sr1
57	.endm
58
59	/*
60	 * unsigned long lclear_user(void *to, unsigned long n)
61	 *
62	 * Returns 0 for success.
63	 * otherwise, returns number of bytes not transferred.
64	 */
65
66ENTRY_CFI(lclear_user)
67	comib,=,n   0,%r25,$lclu_done
68	get_sr
69$lclu_loop:
70	addib,<>    -1,%r25,$lclu_loop
711:      stbs,ma     %r0,1(%sr1,%r26)
72
73$lclu_done:
74	bv          %r0(%r2)
75	copy        %r25,%r28
76
772:	b           $lclu_done
78	ldo         1(%r25),%r25
79
80	ASM_EXCEPTIONTABLE_ENTRY(1b,2b)
81ENDPROC_CFI(lclear_user)
82
83
84	/*
85	 * long lstrnlen_user(char *s, long n)
86	 *
87	 * Returns 0 if exception before zero byte or reaching N,
88	 *         N+1 if N would be exceeded,
89	 *         else strlen + 1 (i.e. includes zero byte).
90	 */
91
92ENTRY_CFI(lstrnlen_user)
93	comib,=     0,%r25,$lslen_nzero
94	copy	    %r26,%r24
95	get_sr
961:      ldbs,ma     1(%sr1,%r26),%r1
97$lslen_loop:
98	comib,=,n   0,%r1,$lslen_done
99	addib,<>    -1,%r25,$lslen_loop
1002:      ldbs,ma     1(%sr1,%r26),%r1
101$lslen_done:
102	bv          %r0(%r2)
103	sub	    %r26,%r24,%r28
104
105$lslen_nzero:
106	b           $lslen_done
107	ldo         1(%r26),%r26 /* special case for N == 0 */
108
1093:      b	    $lslen_done
110	copy        %r24,%r26    /* reset r26 so 0 is returned on fault */
111
112	ASM_EXCEPTIONTABLE_ENTRY(1b,3b)
113	ASM_EXCEPTIONTABLE_ENTRY(2b,3b)
114
115ENDPROC_CFI(lstrnlen_user)
116
117
/*
 * unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
 *
 * Inputs:
 * - sr1 already contains space of source region
 * - sr2 already contains space of destination region
 *
 * Returns:
 * - number of bytes that could not be copied.
 *   On success, this will be zero.
 *
 * This code is based on a C implementation of a copy routine written by
 * Randolph Chung, which in turn was derived from glibc.
 *
 * Several strategies are tried in order to get the best performance for
 * various conditions. In the optimal case, we copy by loops that copy 32 or
 * 16 bytes at a time using general registers.  Unaligned copies are handled
 * either by aligning the destination and then using a shift-and-write method,
 * or in a few cases by falling back to a byte-at-a-time copy.
 *
 * Testing with various alignments and buffer sizes shows that this code is
 * often >10x faster than a simple byte-at-a-time copy, even for strangely
 * aligned operands. It is interesting to note that the glibc version of memcpy
 * (written in C) is actually quite fast already. This routine is able to beat
 * it by 30-40% for aligned copies because of the loop unrolling, but in some
 * cases the glibc version is still slightly faster. This lends more
 * credibility to the claim that gcc can generate very good code as long as
 * we are careful.
 *
 * Possible optimizations:
 * - add cache prefetching
 * - try not to use the post-increment address modifiers; they may create
 *   additional interlocks. The assumption is that those were only efficient
 *   on old machines (pre-PA8000 processors)
 */
152
153	dst = arg0
154	src = arg1
155	len = arg2
156	end = arg3
157	t1  = r19
158	t2  = r20
159	t3  = r21
160	t4  = r22
161	srcspc = sr1
162	dstspc = sr2
163
164	t0 = r1
165	a1 = t1
166	a2 = t2
167	a3 = t3
168	a0 = t4
169
170	save_src = ret0
171	save_dst = ret1
172	save_len = r31
173
174ENTRY_CFI(pa_memcpy)
175	/* Last destination address */
176	add	dst,len,end
177
178	/* short copy with less than 16 bytes? */
179	cmpib,COND(>>=),n 15,len,.Lbyte_loop
180
181	/* same alignment? */
182	xor	src,dst,t0
183	extru	t0,31,2,t1
184	cmpib,<>,n  0,t1,.Lunaligned_copy
185
186#ifdef CONFIG_64BIT
187	/* only do 64-bit copies if we can get aligned. */
188	extru	t0,31,3,t1
189	cmpib,<>,n  0,t1,.Lalign_loop32
190
191	/* loop until we are 64-bit aligned */
192.Lalign_loop64:
193	extru	dst,31,3,t1
194	cmpib,=,n	0,t1,.Lcopy_loop_16_start
19520:	ldb,ma	1(srcspc,src),t1
19621:	stb,ma	t1,1(dstspc,dst)
197	b	.Lalign_loop64
198	ldo	-1(len),len
199
200	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
201	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)
202
203.Lcopy_loop_16_start:
204	ldi	31,t0
205.Lcopy_loop_16:
206	cmpb,COND(>>=),n t0,len,.Lword_loop
207
20810:	ldd	0(srcspc,src),t1
20911:	ldd	8(srcspc,src),t2
210	ldo	16(src),src
21112:	std,ma	t1,8(dstspc,dst)
21213:	std,ma	t2,8(dstspc,dst)
21314:	ldd	0(srcspc,src),t1
21415:	ldd	8(srcspc,src),t2
215	ldo	16(src),src
21616:	std,ma	t1,8(dstspc,dst)
21717:	std,ma	t2,8(dstspc,dst)
218
219	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
220	ASM_EXCEPTIONTABLE_ENTRY(11b,.Lcopy16_fault)
221	ASM_EXCEPTIONTABLE_ENTRY(12b,.Lcopy_done)
222	ASM_EXCEPTIONTABLE_ENTRY(13b,.Lcopy_done)
223	ASM_EXCEPTIONTABLE_ENTRY(14b,.Lcopy_done)
224	ASM_EXCEPTIONTABLE_ENTRY(15b,.Lcopy16_fault)
225	ASM_EXCEPTIONTABLE_ENTRY(16b,.Lcopy_done)
226	ASM_EXCEPTIONTABLE_ENTRY(17b,.Lcopy_done)
227
228	b	.Lcopy_loop_16
229	ldo	-32(len),len
230
231.Lword_loop:
232	cmpib,COND(>>=),n 3,len,.Lbyte_loop
23320:	ldw,ma	4(srcspc,src),t1
23421:	stw,ma	t1,4(dstspc,dst)
235	b	.Lword_loop
236	ldo	-4(len),len
237
238	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
239	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)
240
241#endif /* CONFIG_64BIT */
242
243	/* loop until we are 32-bit aligned */
244.Lalign_loop32:
245	extru	dst,31,2,t1
246	cmpib,=,n	0,t1,.Lcopy_loop_8
24720:	ldb,ma	1(srcspc,src),t1
24821:	stb,ma	t1,1(dstspc,dst)
249	b	.Lalign_loop32
250	ldo	-1(len),len
251
252	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
253	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)
254
255
256.Lcopy_loop_8:
257	cmpib,COND(>>=),n 15,len,.Lbyte_loop
258
25910:	ldw	0(srcspc,src),t1
26011:	ldw	4(srcspc,src),t2
26112:	stw,ma	t1,4(dstspc,dst)
26213:	stw,ma	t2,4(dstspc,dst)
26314:	ldw	8(srcspc,src),t1
26415:	ldw	12(srcspc,src),t2
265	ldo	16(src),src
26616:	stw,ma	t1,4(dstspc,dst)
26717:	stw,ma	t2,4(dstspc,dst)
268
269	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
270	ASM_EXCEPTIONTABLE_ENTRY(11b,.Lcopy8_fault)
271	ASM_EXCEPTIONTABLE_ENTRY(12b,.Lcopy_done)
272	ASM_EXCEPTIONTABLE_ENTRY(13b,.Lcopy_done)
273	ASM_EXCEPTIONTABLE_ENTRY(14b,.Lcopy_done)
274	ASM_EXCEPTIONTABLE_ENTRY(15b,.Lcopy8_fault)
275	ASM_EXCEPTIONTABLE_ENTRY(16b,.Lcopy_done)
276	ASM_EXCEPTIONTABLE_ENTRY(17b,.Lcopy_done)
277
278	b	.Lcopy_loop_8
279	ldo	-16(len),len
280
281.Lbyte_loop:
282	cmpclr,COND(<>) len,%r0,%r0
283	b,n	.Lcopy_done
28420:	ldb	0(srcspc,src),t1
285	ldo	1(src),src
28621:	stb,ma	t1,1(dstspc,dst)
287	b	.Lbyte_loop
288	ldo	-1(len),len
289
290	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
291	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)
292
293.Lcopy_done:
294	bv	%r0(%r2)
295	sub	end,dst,ret0
296
297
298	/* src and dst are not aligned the same way. */
299	/* need to go the hard way */
300.Lunaligned_copy:
301	/* align until dst is 32bit-word-aligned */
302	extru	dst,31,2,t1
303	cmpib,=,n	0,t1,.Lcopy_dstaligned
30420:	ldb	0(srcspc,src),t1
305	ldo	1(src),src
30621:	stb,ma	t1,1(dstspc,dst)
307	b	.Lunaligned_copy
308	ldo	-1(len),len
309
310	ASM_EXCEPTIONTABLE_ENTRY(20b,.Lcopy_done)
311	ASM_EXCEPTIONTABLE_ENTRY(21b,.Lcopy_done)
312
313.Lcopy_dstaligned:
314
315	/* store src, dst and len in safe place */
316	copy	src,save_src
317	copy	dst,save_dst
318	copy	len,save_len
319
320	/* len now needs give number of words to copy */
321	SHRREG	len,2,len
322
323	/*
324	 * Copy from a not-aligned src to an aligned dst using shifts.
325	 * Handles 4 words per loop.
326	 */
327
328	depw,z src,28,2,t0
329	subi 32,t0,t0
330	mtsar t0
331	extru len,31,2,t0
332	cmpib,= 2,t0,.Lcase2
333	/* Make src aligned by rounding it down.  */
334	depi 0,31,2,src
335
336	cmpiclr,<> 3,t0,%r0
337	b,n .Lcase3
338	cmpiclr,<> 1,t0,%r0
339	b,n .Lcase1
340.Lcase0:
341	cmpb,COND(=) %r0,len,.Lcda_finish
342	nop
343
3441:	ldw,ma 4(srcspc,src), a3
345	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
3461:	ldw,ma 4(srcspc,src), a0
347	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
348	b,n .Ldo3
349.Lcase1:
3501:	ldw,ma 4(srcspc,src), a2
351	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
3521:	ldw,ma 4(srcspc,src), a3
353	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
354	ldo -1(len),len
355	cmpb,COND(=),n %r0,len,.Ldo0
356.Ldo4:
3571:	ldw,ma 4(srcspc,src), a0
358	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
359	shrpw a2, a3, %sar, t0
3601:	stw,ma t0, 4(dstspc,dst)
361	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
362.Ldo3:
3631:	ldw,ma 4(srcspc,src), a1
364	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
365	shrpw a3, a0, %sar, t0
3661:	stw,ma t0, 4(dstspc,dst)
367	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
368.Ldo2:
3691:	ldw,ma 4(srcspc,src), a2
370	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
371	shrpw a0, a1, %sar, t0
3721:	stw,ma t0, 4(dstspc,dst)
373	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
374.Ldo1:
3751:	ldw,ma 4(srcspc,src), a3
376	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
377	shrpw a1, a2, %sar, t0
3781:	stw,ma t0, 4(dstspc,dst)
379	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
380	ldo -4(len),len
381	cmpb,COND(<>) %r0,len,.Ldo4
382	nop
383.Ldo0:
384	shrpw a2, a3, %sar, t0
3851:	stw,ma t0, 4(dstspc,dst)
386	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcopy_done)
387
388.Lcda_rdfault:
389.Lcda_finish:
390	/* calculate new src, dst and len and jump to byte-copy loop */
391	sub	dst,save_dst,t0
392	add	save_src,t0,src
393	b	.Lbyte_loop
394	sub	save_len,t0,len
395
396.Lcase3:
3971:	ldw,ma 4(srcspc,src), a0
398	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
3991:	ldw,ma 4(srcspc,src), a1
400	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
401	b .Ldo2
402	ldo 1(len),len
403.Lcase2:
4041:	ldw,ma 4(srcspc,src), a1
405	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
4061:	ldw,ma 4(srcspc,src), a2
407	ASM_EXCEPTIONTABLE_ENTRY(1b,.Lcda_rdfault)
408	b .Ldo1
409	ldo 2(len),len
410
411
412	/* fault exception fixup handlers: */
413#ifdef CONFIG_64BIT
414.Lcopy16_fault:
415	b	.Lcopy_done
41610:	std,ma	t1,8(dstspc,dst)
417	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
418#endif
419
420.Lcopy8_fault:
421	b	.Lcopy_done
42210:	stw,ma	t1,4(dstspc,dst)
423	ASM_EXCEPTIONTABLE_ENTRY(10b,.Lcopy_done)
424ENDPROC_CFI(pa_memcpy)
425
426	.end
427