xref: /linux/arch/parisc/lib/memcpy.c (revision 14b42963f64b98ab61fa9723c03d71aa5ef4f862)
/*
 *    Optimized memory copy routines.
 *
 *    Copyright (C) 2004 Randolph Chung <tausq@debian.org>
 *
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2, or (at your option)
 *    any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *    Portions derived from the GNU C Library
 *    Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
 *
 * Several strategies are used to get the best performance under various
 * conditions. In the optimal case, we copy 64 bytes in an unrolled loop using
 * fp regs. This is followed by loops that copy 32 or 16 bytes at a time using
 * general registers.  Unaligned copies are handled either by aligning the
 * destination and then using a shift-and-write method, or in a few cases by
 * falling back to a byte-at-a-time copy.
 *
 * I chose to implement this in C because it is easier to maintain and debug,
 * and in my experiments it appears that the C code generated by gcc (3.3/3.4
 * at the time of writing) is fairly optimal. Unfortunately some of the
 * semantics of the copy routine (exception handling) are difficult to express
 * in C, so we have to play some tricks to get it to work.
 *
 * All the loads and stores are done via explicit asm() code in order to use
 * the right space registers.
 *
 * Testing with various alignments and buffer sizes shows that this code is
 * often >10x faster than a simple byte-at-a-time copy, even for strangely
 * aligned operands. It is interesting to note that the glibc version
 * of memcpy (written in C) is actually quite fast already. This routine is
 * able to beat it by 30-40% for aligned copies because of the loop unrolling,
 * but in some cases the glibc version is still slightly faster. This lends
 * more credibility to the idea that gcc can generate very good code as long
 * as we are careful.
 *
 * TODO:
 * - cache prefetching needs more experimentation to get optimal settings
 * - try not to use the post-increment address modifiers; they create
 *   additional interlocks
 * - replace byte-copy loops with stbys sequences
 */
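
/*
 * For illustration only: a rough plain-C sketch of the strategy described
 * above (threshold check, bulk word copies for mutually aligned operands,
 * then a trailing byte loop).  This is a simplification -- it ignores the
 * space registers, the fp-register and unrolled variants, and the exception
 * handling the real routine needs -- and is kept out of the build, like the
 * other #if 0 blocks in this file.
 */
#if 0
static void sketch_copy(unsigned char *d, const unsigned char *s,
			unsigned long len)
{
	if (len >= 16 &&
	    (((unsigned long)d ^ (unsigned long)s) % sizeof(unsigned int)) == 0) {
		/* Mutually aligned: align both pointers, then copy words. */
		while (((unsigned long)s % sizeof(unsigned int)) != 0) {
			*d++ = *s++;
			len--;
		}
		while (len >= sizeof(unsigned int)) {
			*(unsigned int *)d = *(const unsigned int *)s;
			d += sizeof(unsigned int);
			s += sizeof(unsigned int);
			len -= sizeof(unsigned int);
		}
	}
	/* Short or mutually misaligned copies end up here; the real code
	 * instead aligns dst and merges shifted source words (see MERGE). */
	while (len--)
		*d++ = *s++;
}
#endif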

#ifdef __KERNEL__
#include <linux/module.h>
#include <linux/compiler.h>
#include <asm/uaccess.h>
#define s_space "%%sr1"
#define d_space "%%sr2"
#else
#include "memcpy.h"
#define s_space "%%sr0"
#define d_space "%%sr0"
#define pa_memcpy new2_copy
#endif

DECLARE_PER_CPU(struct exception_data, exception_data);

#define preserve_branch(label)	do {					\
	volatile int dummy;						\
	/* The following branch is never taken, it's just here to  */	\
	/* prevent gcc from optimizing away our exception code. */	\
	if (unlikely(dummy != dummy))					\
		goto label;						\
} while (0)

#define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
#define get_kernel_space() (0)

#define MERGE(w0, sh_1, w1, sh_2)  ({					\
	unsigned int _r;						\
	asm volatile (							\
	"mtsar %3\n"							\
	"shrpw %1, %2, %%sar, %0\n"					\
	: "=r"(_r)							\
	: "r"(w0), "r"(w1), "r"(sh_2)					\
	);								\
	_r;								\
})
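
/*
 * Note on MERGE: shrpw shifts the 64-bit concatenation w0:w1 right by %sar
 * bits and keeps the low word.  With %sar = sh_2 and sh_1 + sh_2 == 32
 * (and 0 < sh_1 < 32, the only way it is used below), the result is roughly
 * the plain-C expression sketched here.  Illustration only, not built.
 */
#if 0
static inline unsigned int merge_sketch(unsigned int w0, int sh_1,
					unsigned int w1, int sh_2)
{
	return (w0 << sh_1) | (w1 >> sh_2);	/* big-endian word merge */
}
#endif
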
#define THRESHOLD	16

#ifdef DEBUG_MEMCPY
#define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __FUNCTION__ ); printk(KERN_DEBUG fmt, ##args ); } while (0)
#else
#define DPRINTF(fmt, args...)
#endif

#ifndef __LP64__
#define EXC_WORD ".word"
#else
#define EXC_WORD ".dword"
#endif

#define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n"	\
	"\t.section __ex_table,\"aw\"\n"		\
	"\t" EXC_WORD "\t1b\n"				\
	"\t" EXC_WORD "\t" #_e "\n"			\
	"\t.previous\n"					\
	: _tt(_t), "+r"(_a)				\
	:						\
	: "r8")

#define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n"	\
	"\t.section __ex_table,\"aw\"\n"		\
	"\t" EXC_WORD "\t1b\n"				\
	"\t" EXC_WORD "\t" #_e "\n"			\
	"\t.previous\n"					\
	: "+r"(_a)					\
	: _tt(_t)					\
	: "r8")

#define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e)
#define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e)
#define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e)
#define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e)
#define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e)
#define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e)
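
/*
 * For example, ldwma(s_space, pws, r1, pmc_load_exc) expands to roughly the
 * following (a sketch, ignoring the exact constraint strings): a post-modify
 * load through the source space register plus an __ex_table entry that sends
 * faults to the pmc_load_exc fixup label:
 *
 *	1:	ldw,ma	4(%sr1,pws), r1
 *		.section __ex_table,"aw"
 *		EXC_WORD 1b
 *		EXC_WORD pmc_load_exc
 *		.previous
 */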

#define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn " " #_o "(" _s ",%1), %0\n"	\
	"\t.section __ex_table,\"aw\"\n"		\
	"\t" EXC_WORD "\t1b\n"				\
	"\t" EXC_WORD "\t" #_e "\n"			\
	"\t.previous\n"					\
	: _tt(_t)					\
	: "r"(_a)					\
	: "r8")

#define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn " %0, " #_o "(" _s ",%1)\n"	\
	"\t.section __ex_table,\"aw\"\n"		\
	"\t" EXC_WORD "\t1b\n"				\
	"\t" EXC_WORD "\t" #_e "\n"			\
	"\t.previous\n"					\
	:						\
	: _tt(_t), "r"(_a)				\
	: "r8")

#define ldw(_s,_o,_a,_t,_e)	def_load_insn(ldw,"=r",_s,_o,_a,_t,_e)
#define stw(_s,_t,_o,_a,_e)	def_store_insn(stw,"r",_s,_t,_o,_a,_e)

#ifdef CONFIG_PREFETCH
extern inline void prefetch_src(const void *addr)
{
	__asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
}

extern inline void prefetch_dst(const void *addr)
{
	__asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
}
#else
#define prefetch_src(addr)
#define prefetch_dst(addr)
#endif
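
/*
 * (Assumption about the prefetch helpers above: on PA-RISC a load whose
 * target is %r0 acts as a cache prefetch hint and is not expected to raise
 * an access fault, which is why no __ex_table entry is attached to them.)
 */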

/* Copy from a not-aligned src to an aligned dst, using shifts. Handles 4 words
 * per loop.  This code is derived from glibc.
 */
static inline unsigned long copy_dstaligned(unsigned long dst, unsigned long src,
					unsigned long len, unsigned long o_dst,
					unsigned long o_src, unsigned long o_len)
{
	/* gcc complains that a2 and a3 may be uninitialized, but actually
	 * they cannot be.  Initialize a2/a3 to shut gcc up.
	 */
	register unsigned int a0, a1, a2 = 0, a3 = 0;
	int sh_1, sh_2;
	struct exception_data *d;

	/* prefetch_src((const void *)src); */

	/* Calculate how to shift a word read at the memory operation
	   aligned srcp to make it aligned for copy.  */
	sh_1 = 8 * (src % sizeof(unsigned int));
	sh_2 = 8 * sizeof(unsigned int) - sh_1;

	/* Make src aligned by rounding it down.  */
	src &= -sizeof(unsigned int);

	switch (len % 4)
	{
		case 2:
			/* a1 = ((unsigned int *) src)[0];
			   a2 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a1, cda_ldw_exc);
			ldw(s_space, 4, src, a2, cda_ldw_exc);
			src -= 1 * sizeof(unsigned int);
			dst -= 3 * sizeof(unsigned int);
			len += 2;
			goto do1;
		case 3:
			/* a0 = ((unsigned int *) src)[0];
			   a1 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a0, cda_ldw_exc);
			ldw(s_space, 4, src, a1, cda_ldw_exc);
			src -= 0 * sizeof(unsigned int);
			dst -= 2 * sizeof(unsigned int);
			len += 1;
			goto do2;
		case 0:
			if (len == 0)
				return 0;
			/* a3 = ((unsigned int *) src)[0];
			   a0 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a3, cda_ldw_exc);
			ldw(s_space, 4, src, a0, cda_ldw_exc);
			src -= -1 * sizeof(unsigned int);
			dst -= 1 * sizeof(unsigned int);
			len += 0;
			goto do3;
		case 1:
			/* a2 = ((unsigned int *) src)[0];
			   a3 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a2, cda_ldw_exc);
			ldw(s_space, 4, src, a3, cda_ldw_exc);
			src -= -2 * sizeof(unsigned int);
			dst -= 0 * sizeof(unsigned int);
			len -= 1;
			if (len == 0)
				goto do0;
			goto do4;			/* No-op.  */
	}

	do
	{
		/* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */
do4:
		/* a0 = ((unsigned int *) src)[0]; */
		ldw(s_space, 0, src, a0, cda_ldw_exc);
		/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
		stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
do3:
		/* a1 = ((unsigned int *) src)[1]; */
		ldw(s_space, 4, src, a1, cda_ldw_exc);
		/* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */
		stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc);
do2:
		/* a2 = ((unsigned int *) src)[2]; */
		ldw(s_space, 8, src, a2, cda_ldw_exc);
		/* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */
		stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc);
do1:
		/* a3 = ((unsigned int *) src)[3]; */
		ldw(s_space, 12, src, a3, cda_ldw_exc);
		/* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */
		stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc);

		src += 4 * sizeof(unsigned int);
		dst += 4 * sizeof(unsigned int);
		len -= 4;
	}
	while (len != 0);

do0:
	/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
	stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);

	preserve_branch(handle_load_error);
	preserve_branch(handle_store_error);

	return 0;

handle_load_error:
	__asm__ __volatile__ ("cda_ldw_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("cda_ldw_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
		o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
	return o_len - d->fault_addr + o_src;

handle_store_error:
	__asm__ __volatile__ ("cda_stw_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("cda_stw_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
		o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
	return o_len - d->fault_addr + o_dst;
}


/* Returns 0 for success; otherwise returns the number of bytes not transferred. */
unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len)
{
	register unsigned long src, dst, t1, t2, t3;
	register unsigned char *pcs, *pcd;
	register unsigned int *pws, *pwd;
	register double *pds, *pdd;
	unsigned long ret = 0;
	unsigned long o_dst, o_src, o_len;
	struct exception_data *d;

	src = (unsigned long)srcp;
	dst = (unsigned long)dstp;
	pcs = (unsigned char *)srcp;
	pcd = (unsigned char *)dstp;

	o_dst = dst; o_src = src; o_len = len;

	/* prefetch_src((const void *)srcp); */

	if (len < THRESHOLD)
		goto byte_copy;

	/* Check alignment */
	t1 = (src ^ dst);
	if (unlikely(t1 & (sizeof(double)-1)))
		goto unaligned_copy;

	/* src and dst have same alignment. */

	/* Copy bytes till we are double-aligned. */
	t2 = src & (sizeof(double) - 1);
	if (unlikely(t2 != 0)) {
		t2 = sizeof(double) - t2;
		while (t2 && len) {
			/* *pcd++ = *pcs++; */
			ldbma(s_space, pcs, t3, pmc_load_exc);
			len--;
			stbma(d_space, t3, pcd, pmc_store_exc);
			t2--;
		}
	}

	pds = (double *)pcs;
	pdd = (double *)pcd;

#if 0
	/* Copy 8 doubles at a time */
	while (len >= 8*sizeof(double)) {
		register double r1, r2, r3, r4, r5, r6, r7, r8;
		/* prefetch_src((char *)pds + L1_CACHE_BYTES); */
		flddma(s_space, pds, r1, pmc_load_exc);
		flddma(s_space, pds, r2, pmc_load_exc);
		flddma(s_space, pds, r3, pmc_load_exc);
		flddma(s_space, pds, r4, pmc_load_exc);
		fstdma(d_space, r1, pdd, pmc_store_exc);
		fstdma(d_space, r2, pdd, pmc_store_exc);
		fstdma(d_space, r3, pdd, pmc_store_exc);
		fstdma(d_space, r4, pdd, pmc_store_exc);

#if 0
		if (L1_CACHE_BYTES <= 32)
			prefetch_src((char *)pds + L1_CACHE_BYTES);
#endif
		flddma(s_space, pds, r5, pmc_load_exc);
		flddma(s_space, pds, r6, pmc_load_exc);
		flddma(s_space, pds, r7, pmc_load_exc);
		flddma(s_space, pds, r8, pmc_load_exc);
		fstdma(d_space, r5, pdd, pmc_store_exc);
		fstdma(d_space, r6, pdd, pmc_store_exc);
		fstdma(d_space, r7, pdd, pmc_store_exc);
		fstdma(d_space, r8, pdd, pmc_store_exc);
		len -= 8*sizeof(double);
	}
#endif

	pws = (unsigned int *)pds;
	pwd = (unsigned int *)pdd;

word_copy:
	while (len >= 8*sizeof(unsigned int)) {
		register unsigned int r1,r2,r3,r4,r5,r6,r7,r8;
		/* prefetch_src((char *)pws + L1_CACHE_BYTES); */
		ldwma(s_space, pws, r1, pmc_load_exc);
		ldwma(s_space, pws, r2, pmc_load_exc);
		ldwma(s_space, pws, r3, pmc_load_exc);
		ldwma(s_space, pws, r4, pmc_load_exc);
		stwma(d_space, r1, pwd, pmc_store_exc);
		stwma(d_space, r2, pwd, pmc_store_exc);
		stwma(d_space, r3, pwd, pmc_store_exc);
		stwma(d_space, r4, pwd, pmc_store_exc);

		ldwma(s_space, pws, r5, pmc_load_exc);
		ldwma(s_space, pws, r6, pmc_load_exc);
		ldwma(s_space, pws, r7, pmc_load_exc);
		ldwma(s_space, pws, r8, pmc_load_exc);
		stwma(d_space, r5, pwd, pmc_store_exc);
		stwma(d_space, r6, pwd, pmc_store_exc);
		stwma(d_space, r7, pwd, pmc_store_exc);
		stwma(d_space, r8, pwd, pmc_store_exc);
		len -= 8*sizeof(unsigned int);
	}

	while (len >= 4*sizeof(unsigned int)) {
		register unsigned int r1,r2,r3,r4;
		ldwma(s_space, pws, r1, pmc_load_exc);
		ldwma(s_space, pws, r2, pmc_load_exc);
		ldwma(s_space, pws, r3, pmc_load_exc);
		ldwma(s_space, pws, r4, pmc_load_exc);
		stwma(d_space, r1, pwd, pmc_store_exc);
		stwma(d_space, r2, pwd, pmc_store_exc);
		stwma(d_space, r3, pwd, pmc_store_exc);
		stwma(d_space, r4, pwd, pmc_store_exc);
		len -= 4*sizeof(unsigned int);
	}

	pcs = (unsigned char *)pws;
	pcd = (unsigned char *)pwd;

byte_copy:
	while (len) {
		/* *pcd++ = *pcs++; */
		ldbma(s_space, pcs, t3, pmc_load_exc);
		stbma(d_space, t3, pcd, pmc_store_exc);
		len--;
	}

	return 0;

unaligned_copy:
	/* possibly we are aligned on a word, but not on a double... */
	if (likely((t1 & (sizeof(unsigned int)-1)) == 0)) {
		t2 = src & (sizeof(unsigned int) - 1);

		if (unlikely(t2 != 0)) {
			t2 = sizeof(unsigned int) - t2;
			while (t2) {
				/* *pcd++ = *pcs++; */
				ldbma(s_space, pcs, t3, pmc_load_exc);
				stbma(d_space, t3, pcd, pmc_store_exc);
				len--;
				t2--;
			}
		}

		pws = (unsigned int *)pcs;
		pwd = (unsigned int *)pcd;
		goto word_copy;
	}

	/* Align the destination.  */
	if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) {
		t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1));
		while (t2) {
			/* *pcd++ = *pcs++; */
			ldbma(s_space, pcs, t3, pmc_load_exc);
			stbma(d_space, t3, pcd, pmc_store_exc);
			len--;
			t2--;
		}
		dst = (unsigned long)pcd;
		src = (unsigned long)pcs;
	}

	ret = copy_dstaligned(dst, src, len / sizeof(unsigned int),
		o_dst, o_src, o_len);
	if (ret)
		return ret;

	pcs += (len & -sizeof(unsigned int));
	pcd += (len & -sizeof(unsigned int));
	len %= sizeof(unsigned int);

	preserve_branch(handle_load_error);
	preserve_branch(handle_store_error);

	goto byte_copy;

handle_load_error:
	__asm__ __volatile__ ("pmc_load_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("pmc_load_exc: o_len=%lu fault_addr=%lu o_src=%lu ret=%lu\n",
		o_len, d->fault_addr, o_src, o_len - d->fault_addr + o_src);
	return o_len - d->fault_addr + o_src;

handle_store_error:
	__asm__ __volatile__ ("pmc_store_exc:\n");
	d = &__get_cpu_var(exception_data);
	DPRINTF("pmc_store_exc: o_len=%lu fault_addr=%lu o_dst=%lu ret=%lu\n",
		o_len, d->fault_addr, o_dst, o_len - d->fault_addr + o_dst);
	return o_len - d->fault_addr + o_dst;
}

#ifdef __KERNEL__
unsigned long copy_to_user(void __user *dst, const void *src, unsigned long len)
{
	mtsp(get_kernel_space(), 1);
	mtsp(get_user_space(), 2);
	return pa_memcpy((void __force *)dst, src, len);
}

unsigned long copy_from_user(void *dst, const void __user *src, unsigned long len)
{
	mtsp(get_user_space(), 1);
	mtsp(get_kernel_space(), 2);
	return pa_memcpy(dst, (void __force *)src, len);
}

unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned long len)
{
	mtsp(get_user_space(), 1);
	mtsp(get_user_space(), 2);
	return pa_memcpy((void __force *)dst, (void __force *)src, len);
}
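
/*
 * Usage sketch (not part of this file): as with the generic kernel helpers,
 * a non-zero return value is the number of bytes that could not be copied,
 * so callers typically do something like:
 *
 *	if (copy_to_user(ubuf, kbuf, count))
 *		return -EFAULT;
 */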

void *memcpy(void *dst, const void *src, size_t count)
{
	mtsp(get_kernel_space(), 1);
	mtsp(get_kernel_space(), 2);
	pa_memcpy(dst, src, count);
	return dst;
}

EXPORT_SYMBOL(copy_to_user);
EXPORT_SYMBOL(copy_from_user);
EXPORT_SYMBOL(copy_in_user);
EXPORT_SYMBOL(memcpy);
#endif