xref: /linux/arch/parisc/lib/memcpy.c (revision ff5599816711d2e67da2d7561fd36ac48debd433)
1 /*
2  *    Optimized memory copy routines.
3  *
4  *    Copyright (C) 2004 Randolph Chung <tausq@debian.org>
5  *    Copyright (C) 2013 Helge Deller <deller@gmx.de>
6  *
7  *    This program is free software; you can redistribute it and/or modify
8  *    it under the terms of the GNU General Public License as published by
9  *    the Free Software Foundation; either version 2, or (at your option)
10  *    any later version.
11  *
12  *    This program is distributed in the hope that it will be useful,
13  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *    GNU General Public License for more details.
16  *
17  *    You should have received a copy of the GNU General Public License
18  *    along with this program; if not, write to the Free Software
19  *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20  *
21  *    Portions derived from the GNU C Library
22  *    Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
23  *
24  * Several strategies are used to get the best performance for various
25  * conditions. The 64-byte unrolled loop using fp regs is currently compiled
26  * out (#if 0), so aligned data is copied 32 or 16 bytes at a time using
27  * general registers.  Unaligned copies are handled either by aligning the
28  * destination and then using shift-and-write method, or in a few cases by
29  * falling back to a byte-at-a-time copy.
30  *
31  * I chose to implement this in C because it is easier to maintain and debug,
32  * and in my experiments it appears that the C code generated by gcc (3.3/3.4
33  * at the time of writing) is fairly optimal. Unfortunately some of the
34  * semantics of the copy routine (exception handling) is difficult to express
35  * in C, so we have to play some tricks to get it to work.
36  *
37  * All the loads and stores are done via explicit asm() code in order to use
38  * the right space registers.
39  *
40  * Testing with various alignments and buffer sizes shows that this code is
41  * often >10x faster than a simple byte-at-a-time copy, even for strangely
42  * aligned operands. It is interesting to note that the glibc version
43  * of memcpy (written in C) is actually quite fast already. This routine is
44  * able to beat it by 30-40% for aligned copies because of the loop unrolling,
45  * but in some cases the glibc version is still slightly faster. This lends
46  * more credibility that gcc can generate very good code as long as we are
47  * careful.
48  *
49  * TODO:
50  * - cache prefetching needs more experimentation to get optimal settings
51  * - try not to use the post-increment address modifiers; they create additional
52  *   interlocks
53  * - replace byte-copy loops with stybs sequences
54  */
55 
56 #ifdef __KERNEL__
57 #include <linux/module.h>
58 #include <linux/compiler.h>
59 #include <asm/uaccess.h>
/*
 * Space registers named in the load (s_space) and store (d_space) asm
 * templates below.  The exported wrappers at the end of this file load
 * space registers 1 and 2 via mtsp() before calling pa_memcpy().
 */
60 #define s_space "%%sr1"
61 #define d_space "%%sr2"
62 #else
63 #include "memcpy.h"
/* Non-kernel build: loads and stores both go through %sr0. */
64 #define s_space "%%sr0"
65 #define d_space "%%sr0"
/* Non-kernel build: the pa_memcpy() defined below is emitted as new2_copy. */
66 #define pa_memcpy new2_copy
67 #endif
68 
/*
 * Per-CPU exception record.  pa_memcpy() reads its fault_addr member after
 * pa_memcpy_internal() reports a load or store error.
 */
69 DECLARE_PER_CPU(struct exception_data, exception_data);
70 
/*
 * preserve_branch(label) - keep `label` reachable from C code.
 *
 * Expands to a conditional goto whose condition compares a volatile variable
 * with itself.  The branch is never taken at run time; it only exists so the
 * compiler does not discard the error-return code behind `label`, which is
 * otherwise entered only through the asm exception labels.
 */
71 #define preserve_branch(label)	do {					\
72 	volatile int dummy = 0;						\
73 	/* The following branch is never taken, it's just here to  */	\
74 	/* prevent gcc from optimizing away our exception code. */ 	\
75 	if (unlikely(dummy != dummy))					\
76 		goto label;						\
77 } while (0)
78 
/*
 * Space id to use for "user" accesses: 0 while get_fs() equals KERNEL_DS,
 * otherwise the value returned by mfsp(3) (presumably the current contents
 * of space register 3 -- confirm against the mfsp() definition).
 */
79 #define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
/* Space id to use for kernel accesses: always 0. */
80 #define get_kernel_space() (0)
81 
/*
 * MERGE(w0, sh_1, w1, sh_2) - combine two source words into one output word.
 *
 * Loads sh_2 into the shift amount register (mtsar) and then executes shrpw
 * on w0 and w1; the result register is the value of the expression.
 * sh_1 is accepted for symmetry with the glibc macro of the same name but is
 * not referenced by the expansion.
 *
 * Presumably equivalent to (w0 << sh_1) | (w1 >> sh_2) with
 * sh_1 + sh_2 == 32 -- confirm against the PA-RISC shrpw description.
 * NOTE(review): the asm writes %sar without declaring a clobber -- confirm
 * this is safe with the compilers in use.
 */
82 #define MERGE(w0, sh_1, w1, sh_2)  ({					\
83 	unsigned int _r;						\
84 	asm volatile (							\
85 	"mtsar %3\n"							\
86 	"shrpw %1, %2, %%sar, %0\n"					\
87 	: "=r"(_r)							\
88 	: "r"(w0), "r"(w1), "r"(sh_2)					\
89 	);								\
90 	_r;								\
91 })
/* Copies shorter than this many bytes go straight to the byte-copy loop. */
92 #define THRESHOLD	16
93 
/*
 * DPRINTF(fmt, ...) - debug output helper.
 *
 * With DEBUG_MEMCPY defined it issues two printk(KERN_DEBUG ...) calls: a
 * "file:line:function " prefix followed by the formatted message.  Without
 * DEBUG_MEMCPY it expands to nothing, so its arguments are not evaluated.
 */
94 #ifdef DEBUG_MEMCPY
95 #define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __func__ ); printk(KERN_DEBUG fmt, ##args ); } while (0)
96 #else
97 #define DPRINTF(fmt, args...)
98 #endif
99 
/*
 * Load/store templates that use the ",ma" completer, i.e. the address
 * operand is modified by the instruction (it is a "+r" read-write operand).
 * Presumably the address is post-incremented by _sz bytes -- confirm against
 * the PA-RISC instruction reference.
 *
 *   _insn  instruction mnemonic
 *   _sz    displacement used with the ",ma" completer
 *   _tt    asm constraint string for the data operand
 *   _s     space register string (s_space or d_space)
 *   _a     address variable, updated in place
 *   _t     data variable (load target / store source)
 *   _e     asm label registered as the exception table target for this
 *          instruction
 *
 * r8 is listed as clobbered; presumably it is used by the exception fixup
 * path -- confirm.
 */
100 #define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)	\
101 	__asm__ __volatile__ (				\
102 	"1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n\t"	\
103 	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)			\
104 	: _tt(_t), "+r"(_a)				\
105 	: 						\
106 	: "r8")
107 
108 #define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) 	\
109 	__asm__ __volatile__ (				\
110 	"1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n\t"	\
111 	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)			\
112 	: "+r"(_a) 					\
113 	: _tt(_t)					\
114 	: "r8")
115 
/*
 * Concrete byte (1), word (4) and floating-point double (8) variants.
 * Loads take (space, address, target, exc_label); stores take
 * (space, value, address, exc_label).
 */
116 #define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e)
117 #define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e)
118 #define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e)
119 #define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e)
120 #define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e)
121 #define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e)
122 
/*
 * Load/store templates with a fixed displacement _o from the address _a.
 * Unlike the ",ma" variants the address is an input-only operand, so it is
 * left unchanged.  _e is the asm label registered as the exception table
 * target for the instruction; the remaining parameters have the same meaning
 * as in the ",ma" templates.
 */
123 #define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e) 	\
124 	__asm__ __volatile__ (				\
125 	"1:\t" #_insn " " #_o "(" _s ",%1), %0\n\t"	\
126 	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)			\
127 	: _tt(_t) 					\
128 	: "r"(_a)					\
129 	: "r8")
130 
131 #define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e) 	\
132 	__asm__ __volatile__ (				\
133 	"1:\t" #_insn " %0, " #_o "(" _s ",%1)\n\t" 	\
134 	ASM_EXCEPTIONTABLE_ENTRY(1b,_e)			\
135 	: 						\
136 	: _tt(_t), "r"(_a)				\
137 	: "r8")
138 
/*
 * Word load: (space, offset, address, target, exc_label).
 * Word store: (space, value, offset, address, exc_label).
 */
139 #define ldw(_s,_o,_a,_t,_e)	def_load_insn(ldw,"=r",_s,_o,_a,_t,_e)
140 #define stw(_s,_t,_o,_a,_e) 	def_store_insn(stw,"r",_s,_t,_o,_a,_e)
141 
/*
 * Prefetch helpers.  With CONFIG_PREFETCH they issue a load whose target is
 * %r0 from the source (ldw via s_space) or destination (ldd via d_space)
 * address; otherwise they expand to empty statements.
 * All call sites visible in this file are currently commented out.
 */
142 #ifdef  CONFIG_PREFETCH
143 static inline void prefetch_src(const void *addr)
144 {
145 	__asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
146 }
147 
148 static inline void prefetch_dst(const void *addr)
149 {
150 	__asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
151 }
152 #else
153 #define prefetch_src(addr) do { } while(0)
154 #define prefetch_dst(addr) do { } while(0)
155 #endif
156 
/*
 * Result codes of copy_dstaligned() and pa_memcpy_internal():
 * success, fault while loading from the source, fault while storing to the
 * destination.
 */
157 #define PA_MEMCPY_OK		0
158 #define PA_MEMCPY_LOAD_ERROR	1
159 #define PA_MEMCPY_STORE_ERROR	2
160 
161 /* Copy from a not-aligned src to an aligned dst, using shifts. Handles 4 words
162  * per loop.  This code is derived from glibc.
 *
 * dst: destination address; the caller aligns it to a word boundary first.
 * src: source address; it is rounded down to a word boundary here and its
 *      byte offset within the word determines the shift amounts.
 * len: number of 32-bit words to copy (NOT bytes); the caller passes
 *      len / sizeof(unsigned int).
 *
 * Returns PA_MEMCPY_OK, or PA_MEMCPY_LOAD_ERROR / PA_MEMCPY_STORE_ERROR when
 * execution arrives at the cda_ldw_exc / cda_stw_exc asm labels, which are
 * the exception table targets of every ldw() / stw() used below.
163  */
164 static inline unsigned long copy_dstaligned(unsigned long dst,
165 					unsigned long src, unsigned long len)
166 {
167 	/* gcc complains that a2 and a3 may be uninitialized, but actually
168 	 * they cannot be.  Initialize a2/a3 to shut gcc up.
169 	 */
170 	register unsigned int a0, a1, a2 = 0, a3 = 0;
171 	int sh_1, sh_2;
172 
173 	/* prefetch_src((const void *)src); */
174 
175 	/* Calculate how to shift a word read at the memory operation
176 	   aligned srcp to make it aligned for copy.  */
177 	sh_1 = 8 * (src % sizeof(unsigned int));
178 	sh_2 = 8 * sizeof(unsigned int) - sh_1;
179 
180 	/* Make src aligned by rounding it down.  */
181 	src &= -sizeof(unsigned int);
182 
	/* Pick the entry point into the 4-way unrolled loop from len % 4.
	 * Each case preloads two source words and biases src/dst/len so that
	 * the fixed offsets (0, 4, 8, 12) used inside the loop body address
	 * the right words and len becomes a multiple of 4.
	 */
183 	switch (len % 4)
184 	{
185 		case 2:
186 			/* a1 = ((unsigned int *) src)[0];
187 			   a2 = ((unsigned int *) src)[1]; */
188 			ldw(s_space, 0, src, a1, cda_ldw_exc);
189 			ldw(s_space, 4, src, a2, cda_ldw_exc);
190 			src -= 1 * sizeof(unsigned int);
191 			dst -= 3 * sizeof(unsigned int);
192 			len += 2;
193 			goto do1;
194 		case 3:
195 			/* a0 = ((unsigned int *) src)[0];
196 			   a1 = ((unsigned int *) src)[1]; */
197 			ldw(s_space, 0, src, a0, cda_ldw_exc);
198 			ldw(s_space, 4, src, a1, cda_ldw_exc);
199 			src -= 0 * sizeof(unsigned int);
200 			dst -= 2 * sizeof(unsigned int);
201 			len += 1;
202 			goto do2;
203 		case 0:
			/* Nothing to copy at all. */
204 			if (len == 0)
205 				return PA_MEMCPY_OK;
206 			/* a3 = ((unsigned int *) src)[0];
207 			   a0 = ((unsigned int *) src)[1]; */
208 			ldw(s_space, 0, src, a3, cda_ldw_exc);
209 			ldw(s_space, 4, src, a0, cda_ldw_exc);
210 			src -=-1 * sizeof(unsigned int);
211 			dst -= 1 * sizeof(unsigned int);
212 			len += 0;
213 			goto do3;
214 		case 1:
215 			/* a2 = ((unsigned int *) src)[0];
216 			   a3 = ((unsigned int *) src)[1]; */
217 			ldw(s_space, 0, src, a2, cda_ldw_exc);
218 			ldw(s_space, 4, src, a3, cda_ldw_exc);
219 			src -=-2 * sizeof(unsigned int);
220 			dst -= 0 * sizeof(unsigned int);
221 			len -= 1;
			/* A single word: only the final store at do0 is needed. */
222 			if (len == 0)
223 				goto do0;
224 			goto do4;			/* No-op.  */
225 	}
226 
	/* Main loop: every step loads the next source word and stores one
	 * destination word merged from the two previously loaded words.
	 */
227 	do
228 	{
229 		/* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */
230 do4:
231 		/* a0 = ((unsigned int *) src)[0]; */
232 		ldw(s_space, 0, src, a0, cda_ldw_exc);
233 		/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
234 		stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
235 do3:
236 		/* a1 = ((unsigned int *) src)[1]; */
237 		ldw(s_space, 4, src, a1, cda_ldw_exc);
238 		/* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */
239 		stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc);
240 do2:
241 		/* a2 = ((unsigned int *) src)[2]; */
242 		ldw(s_space, 8, src, a2, cda_ldw_exc);
243 		/* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */
244 		stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc);
245 do1:
246 		/* a3 = ((unsigned int *) src)[3]; */
247 		ldw(s_space, 12, src, a3, cda_ldw_exc);
248 		/* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */
249 		stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc);
250 
251 		src += 4 * sizeof(unsigned int);
252 		dst += 4 * sizeof(unsigned int);
253 		len -= 4;
254 	}
255 	while (len != 0);
256 
	/* Flush the last pending word, built from the final two loads. */
257 do0:
258 	/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
259 	stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
260 
	/* Never-taken branches that keep the two handlers below alive. */
261 	preserve_branch(handle_load_error);
262 	preserve_branch(handle_store_error);
263 
264 	return PA_MEMCPY_OK;
265 
	/* The asm labels below are the exception table targets named in the
	 * ldw()/stw() invocations above; presumably the fault handler resumes
	 * execution here after a faulting access -- confirm against the
	 * ASM_EXCEPTIONTABLE_ENTRY fixup code.
	 */
266 handle_load_error:
267 	__asm__ __volatile__ ("cda_ldw_exc:\n");
268 	return PA_MEMCPY_LOAD_ERROR;
269 
270 handle_store_error:
271 	__asm__ __volatile__ ("cda_stw_exc:\n");
272 	return PA_MEMCPY_STORE_ERROR;
273 }
274 
275 
276 /* Returns PA_MEMCPY_OK, PA_MEMCPY_LOAD_ERROR or PA_MEMCPY_STORE_ERROR.
277  * In case of an access fault the faulty address can be read from the per_cpu
278  * exception data struct. */
/*
 * Copies len bytes from srcp (read through s_space) to dstp (written through
 * d_space).  Strategy:
 *  - len < THRESHOLD: plain byte loop.
 *  - src and dst equally aligned modulo sizeof(double): byte-copy up to that
 *    boundary, then the 8-word and 4-word loops, then the byte loop for the
 *    tail.
 *  - equally aligned modulo sizeof(unsigned int) only: byte-copy up to a
 *    word boundary, then the same word loops.
 *  - otherwise: byte-copy until dst is word aligned, let copy_dstaligned()
 *    move all whole words, and finish with the byte loop.
 * Every load/store names pmc_load_exc / pmc_store_exc as its exception table
 * target; those labels sit in the two error-return paths at the end.
 */
279 static unsigned long pa_memcpy_internal(void *dstp, const void *srcp,
280 					unsigned long len)
281 {
282 	register unsigned long src, dst, t1, t2, t3;
283 	register unsigned char *pcs, *pcd;
284 	register unsigned int *pws, *pwd;
285 	register double *pds, *pdd;
286 	unsigned long ret;
287 
288 	src = (unsigned long)srcp;
289 	dst = (unsigned long)dstp;
290 	pcs = (unsigned char *)srcp;
291 	pcd = (unsigned char *)dstp;
292 
293 	/* prefetch_src((const void *)srcp); */
294 
	/* Short copies are not worth any alignment work. */
295 	if (len < THRESHOLD)
296 		goto byte_copy;
297 
298 	/* Check alignment */
	/* t1 holds the address bits in which src and dst differ. */
299 	t1 = (src ^ dst);
300 	if (unlikely(t1 & (sizeof(double)-1)))
301 		goto unaligned_copy;
302 
303 	/* src and dst have same alignment. */
304 
305 	/* Copy bytes till we are double-aligned. */
306 	t2 = src & (sizeof(double) - 1);
307 	if (unlikely(t2 != 0)) {
308 		t2 = sizeof(double) - t2;
309 		while (t2 && len) {
310 			/* *pcd++ = *pcs++; */
311 			ldbma(s_space, pcs, t3, pmc_load_exc);
312 			len--;
313 			stbma(d_space, t3, pcd, pmc_store_exc);
314 			t2--;
315 		}
316 	}
317 
318 	pds = (double *)pcs;
319 	pdd = (double *)pcd;
320 
	/* The fp-register loop below is compiled out. */
321 #if 0
322 	/* Copy 8 doubles at a time */
323 	while (len >= 8*sizeof(double)) {
324 		register double r1, r2, r3, r4, r5, r6, r7, r8;
325 		/* prefetch_src((char *)pds + L1_CACHE_BYTES); */
326 		flddma(s_space, pds, r1, pmc_load_exc);
327 		flddma(s_space, pds, r2, pmc_load_exc);
328 		flddma(s_space, pds, r3, pmc_load_exc);
329 		flddma(s_space, pds, r4, pmc_load_exc);
330 		fstdma(d_space, r1, pdd, pmc_store_exc);
331 		fstdma(d_space, r2, pdd, pmc_store_exc);
332 		fstdma(d_space, r3, pdd, pmc_store_exc);
333 		fstdma(d_space, r4, pdd, pmc_store_exc);
334 
335 #if 0
336 		if (L1_CACHE_BYTES <= 32)
337 			prefetch_src((char *)pds + L1_CACHE_BYTES);
338 #endif
339 		flddma(s_space, pds, r5, pmc_load_exc);
340 		flddma(s_space, pds, r6, pmc_load_exc);
341 		flddma(s_space, pds, r7, pmc_load_exc);
342 		flddma(s_space, pds, r8, pmc_load_exc);
343 		fstdma(d_space, r5, pdd, pmc_store_exc);
344 		fstdma(d_space, r6, pdd, pmc_store_exc);
345 		fstdma(d_space, r7, pdd, pmc_store_exc);
346 		fstdma(d_space, r8, pdd, pmc_store_exc);
347 		len -= 8*sizeof(double);
348 	}
349 #endif
350 
351 	pws = (unsigned int *)pds;
352 	pwd = (unsigned int *)pdd;
353 
	/* Entered by fall-through or from the word-aligned branch of
	 * unaligned_copy with pws/pwd set up.
	 */
354 word_copy:
	/* 8 words per iteration. */
355 	while (len >= 8*sizeof(unsigned int)) {
356 		register unsigned int r1,r2,r3,r4,r5,r6,r7,r8;
357 		/* prefetch_src((char *)pws + L1_CACHE_BYTES); */
358 		ldwma(s_space, pws, r1, pmc_load_exc);
359 		ldwma(s_space, pws, r2, pmc_load_exc);
360 		ldwma(s_space, pws, r3, pmc_load_exc);
361 		ldwma(s_space, pws, r4, pmc_load_exc);
362 		stwma(d_space, r1, pwd, pmc_store_exc);
363 		stwma(d_space, r2, pwd, pmc_store_exc);
364 		stwma(d_space, r3, pwd, pmc_store_exc);
365 		stwma(d_space, r4, pwd, pmc_store_exc);
366 
367 		ldwma(s_space, pws, r5, pmc_load_exc);
368 		ldwma(s_space, pws, r6, pmc_load_exc);
369 		ldwma(s_space, pws, r7, pmc_load_exc);
370 		ldwma(s_space, pws, r8, pmc_load_exc);
371 		stwma(d_space, r5, pwd, pmc_store_exc);
372 		stwma(d_space, r6, pwd, pmc_store_exc);
373 		stwma(d_space, r7, pwd, pmc_store_exc);
374 		stwma(d_space, r8, pwd, pmc_store_exc);
375 		len -= 8*sizeof(unsigned int);
376 	}
377 
	/* 4 words per iteration for what is left. */
378 	while (len >= 4*sizeof(unsigned int)) {
379 		register unsigned int r1,r2,r3,r4;
380 		ldwma(s_space, pws, r1, pmc_load_exc);
381 		ldwma(s_space, pws, r2, pmc_load_exc);
382 		ldwma(s_space, pws, r3, pmc_load_exc);
383 		ldwma(s_space, pws, r4, pmc_load_exc);
384 		stwma(d_space, r1, pwd, pmc_store_exc);
385 		stwma(d_space, r2, pwd, pmc_store_exc);
386 		stwma(d_space, r3, pwd, pmc_store_exc);
387 		stwma(d_space, r4, pwd, pmc_store_exc);
388 		len -= 4*sizeof(unsigned int);
389 	}
390 
391 	pcs = (unsigned char *)pws;
392 	pcd = (unsigned char *)pwd;
393 
	/* Tail (and short-copy) loop: one byte per iteration. */
394 byte_copy:
395 	while (len) {
396 		/* *pcd++ = *pcs++; */
397 		ldbma(s_space, pcs, t3, pmc_load_exc);
398 		stbma(d_space, t3, pcd, pmc_store_exc);
399 		len--;
400 	}
401 
402 	return PA_MEMCPY_OK;
403 
404 unaligned_copy:
405 	/* possibly we are aligned on a word, but not on a double... */
406 	if (likely((t1 & (sizeof(unsigned int)-1)) == 0)) {
407 		t2 = src & (sizeof(unsigned int) - 1);
408 
409 		if (unlikely(t2 != 0)) {
410 			t2 = sizeof(unsigned int) - t2;
			/* No len check needed here: this path is only reached
			 * with len >= THRESHOLD and t2 is smaller than
			 * sizeof(unsigned int).
			 */
411 			while (t2) {
412 				/* *pcd++ = *pcs++; */
413 				ldbma(s_space, pcs, t3, pmc_load_exc);
414 				stbma(d_space, t3, pcd, pmc_store_exc);
415 				len--;
416 				t2--;
417 			}
418 		}
419 
420 		pws = (unsigned int *)pcs;
421 		pwd = (unsigned int *)pcd;
422 		goto word_copy;
423 	}
424 
425 	/* Align the destination.  */
426 	if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) {
427 		t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1));
428 		while (t2) {
429 			/* *pcd++ = *pcs++; */
430 			ldbma(s_space, pcs, t3, pmc_load_exc);
431 			stbma(d_space, t3, pcd, pmc_store_exc);
432 			len--;
433 			t2--;
434 		}
		/* Re-sync the integer addresses with the advanced pointers. */
435 		dst = (unsigned long)pcd;
436 		src = (unsigned long)pcs;
437 	}
438 
	/* Copy all whole words with shift-and-merge; the count is in words. */
439 	ret = copy_dstaligned(dst, src, len / sizeof(unsigned int));
440 	if (ret)
441 		return ret;
442 
	/* Skip past the words just copied; at most 3 bytes remain. */
443 	pcs += (len & -sizeof(unsigned int));
444 	pcd += (len & -sizeof(unsigned int));
445 	len %= sizeof(unsigned int);
446 
	/* Never-taken branches that keep the two handlers below alive. */
447 	preserve_branch(handle_load_error);
448 	preserve_branch(handle_store_error);
449 
450 	goto byte_copy;
451 
	/* The asm labels below are the exception table targets named by all
	 * load/store macros used in this function.
	 */
452 handle_load_error:
453 	__asm__ __volatile__ ("pmc_load_exc:\n");
454 	return PA_MEMCPY_LOAD_ERROR;
455 
456 handle_store_error:
457 	__asm__ __volatile__ ("pmc_store_exc:\n");
458 	return PA_MEMCPY_STORE_ERROR;
459 }
460 
461 
462 /* Returns 0 for success, otherwise, returns number of bytes not transferred. */
463 static unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
464 {
465 	unsigned long ret, fault_addr, reference;
466 	struct exception_data *d;
467 
468 	ret = pa_memcpy_internal(dstp, srcp, len);
469 	if (likely(ret == PA_MEMCPY_OK))
470 		return 0;
471 
472 	/* if a load or store fault occured we can get the faulty addr */
473 	d = &__get_cpu_var(exception_data);
474 	fault_addr = d->fault_addr;
475 
476 	/* error in load or store? */
477 	if (ret == PA_MEMCPY_LOAD_ERROR)
478 		reference = (unsigned long) srcp;
479 	else
480 		reference = (unsigned long) dstp;
481 
482 	DPRINTF("pa_memcpy: fault type = %lu, len=%lu fault_addr=%lu ref=%lu\n",
483 		ret, len, fault_addr, reference);
484 
485 	if (fault_addr >= reference)
486 		return len - (fault_addr - reference);
487 	else
488 		return len;
489 }
490 
491 #ifdef __KERNEL__
492 unsigned long copy_to_user(void __user *dst, const void *src, unsigned long len)
493 {
494 	mtsp(get_kernel_space(), 1);
495 	mtsp(get_user_space(), 2);
496 	return pa_memcpy((void __force *)dst, src, len);
497 }
498 
499 EXPORT_SYMBOL(__copy_from_user);
500 unsigned long __copy_from_user(void *dst, const void __user *src, unsigned long len)
501 {
502 	mtsp(get_user_space(), 1);
503 	mtsp(get_kernel_space(), 2);
504 	return pa_memcpy(dst, (void __force *)src, len);
505 }
506 
507 unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned long len)
508 {
509 	mtsp(get_user_space(), 1);
510 	mtsp(get_user_space(), 2);
511 	return pa_memcpy((void __force *)dst, (void __force *)src, len);
512 }
513 
514 
/*
 * memcpy() - kernel-to-kernel copy of count bytes from src to dst.
 *
 * Both space registers used by pa_memcpy() are loaded with the kernel space.
 * The "bytes not transferred" result of pa_memcpy() is deliberately dropped;
 * dst is always returned.
 */
void *memcpy(void *dst, const void *src, size_t count)
{
	mtsp(get_kernel_space(), 1);
	mtsp(get_kernel_space(), 2);

	(void)pa_memcpy(dst, src, count);

	return dst;
}
522 
/*
 * Symbol exports for the copy routines above.
 * NOTE(review): copy_from_user is exported here although this file only
 * defines __copy_from_user (exported separately next to its definition);
 * presumably copy_from_user is declared or mapped in <asm/uaccess.h> --
 * confirm.
 */
523 EXPORT_SYMBOL(copy_to_user);
524 EXPORT_SYMBOL(copy_from_user);
525 EXPORT_SYMBOL(copy_in_user);
526 EXPORT_SYMBOL(memcpy);
527 #endif
528