1 /* 2 * Optimized memory copy routines. 3 * 4 * Copyright (C) 2004 Randolph Chung <tausq@debian.org> 5 * Copyright (C) 2013 Helge Deller <deller@gmx.de> 6 * 7 * This program is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation; either version 2, or (at your option) 10 * any later version. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 * GNU General Public License for more details. 16 * 17 * You should have received a copy of the GNU General Public License 18 * along with this program; if not, write to the Free Software 19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 20 * 21 * Portions derived from the GNU C Library 22 * Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc. 23 * 24 * Several strategies are tried to try to get the best performance for various 25 * conditions. In the optimal case, we copy 64-bytes in an unrolled loop using 26 * fp regs. This is followed by loops that copy 32- or 16-bytes at a time using 27 * general registers. Unaligned copies are handled either by aligning the 28 * destination and then using shift-and-write method, or in a few cases by 29 * falling back to a byte-at-a-time copy. 30 * 31 * I chose to implement this in C because it is easier to maintain and debug, 32 * and in my experiments it appears that the C code generated by gcc (3.3/3.4 33 * at the time of writing) is fairly optimal. Unfortunately some of the 34 * semantics of the copy routine (exception handling) is difficult to express 35 * in C, so we have to play some tricks to get it to work. 36 * 37 * All the loads and stores are done via explicit asm() code in order to use 38 * the right space registers. 39 * 40 * Testing with various alignments and buffer sizes shows that this code is 41 * often >10x faster than a simple byte-at-a-time copy, even for strangely 42 * aligned operands. It is interesting to note that the glibc version 43 * of memcpy (written in C) is actually quite fast already. This routine is 44 * able to beat it by 30-40% for aligned copies because of the loop unrolling, 45 * but in some cases the glibc version is still slightly faster. This lends 46 * more credibility that gcc can generate very good code as long as we are 47 * careful. 48 * 49 * TODO: 50 * - cache prefetching needs more experimentation to get optimal settings 51 * - try not to use the post-increment address modifiers; they create additional 52 * interlocks 53 * - replace byte-copy loops with stybs sequences 54 */ 55 56 #ifdef __KERNEL__ 57 #include <linux/module.h> 58 #include <linux/compiler.h> 59 #include <asm/uaccess.h> 60 #define s_space "%%sr1" 61 #define d_space "%%sr2" 62 #else 63 #include "memcpy.h" 64 #define s_space "%%sr0" 65 #define d_space "%%sr0" 66 #define pa_memcpy new2_copy 67 #endif 68 69 DECLARE_PER_CPU(struct exception_data, exception_data); 70 71 #define preserve_branch(label) do { \ 72 volatile int dummy = 0; \ 73 /* The following branch is never taken, it's just here to */ \ 74 /* prevent gcc from optimizing away our exception code. */ \ 75 if (unlikely(dummy != dummy)) \ 76 goto label; \ 77 } while (0) 78 79 #define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3)) 80 #define get_kernel_space() (0) 81 82 #define MERGE(w0, sh_1, w1, sh_2) ({ \ 83 unsigned int _r; \ 84 asm volatile ( \ 85 "mtsar %3\n" \ 86 "shrpw %1, %2, %%sar, %0\n" \ 87 : "=r"(_r) \ 88 : "r"(w0), "r"(w1), "r"(sh_2) \ 89 ); \ 90 _r; \ 91 }) 92 #define THRESHOLD 16 93 94 #ifdef DEBUG_MEMCPY 95 #define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __func__ ); printk(KERN_DEBUG fmt, ##args ); } while (0) 96 #else 97 #define DPRINTF(fmt, args...) 98 #endif 99 100 #define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) \ 101 __asm__ __volatile__ ( \ 102 "1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n\t" \ 103 ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \ 104 : _tt(_t), "+r"(_a) \ 105 : \ 106 : "r8") 107 108 #define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) \ 109 __asm__ __volatile__ ( \ 110 "1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n\t" \ 111 ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \ 112 : "+r"(_a) \ 113 : _tt(_t) \ 114 : "r8") 115 116 #define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e) 117 #define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e) 118 #define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e) 119 #define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e) 120 #define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e) 121 #define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e) 122 123 #define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e) \ 124 __asm__ __volatile__ ( \ 125 "1:\t" #_insn " " #_o "(" _s ",%1), %0\n\t" \ 126 ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \ 127 : _tt(_t) \ 128 : "r"(_a) \ 129 : "r8") 130 131 #define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e) \ 132 __asm__ __volatile__ ( \ 133 "1:\t" #_insn " %0, " #_o "(" _s ",%1)\n\t" \ 134 ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \ 135 : \ 136 : _tt(_t), "r"(_a) \ 137 : "r8") 138 139 #define ldw(_s,_o,_a,_t,_e) def_load_insn(ldw,"=r",_s,_o,_a,_t,_e) 140 #define stw(_s,_t,_o,_a,_e) def_store_insn(stw,"r",_s,_t,_o,_a,_e) 141 142 #ifdef CONFIG_PREFETCH 143 static inline void prefetch_src(const void *addr) 144 { 145 __asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr)); 146 } 147 148 static inline void prefetch_dst(const void *addr) 149 { 150 __asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr)); 151 } 152 #else 153 #define prefetch_src(addr) do { } while(0) 154 #define prefetch_dst(addr) do { } while(0) 155 #endif 156 157 #define PA_MEMCPY_OK 0 158 #define PA_MEMCPY_LOAD_ERROR 1 159 #define PA_MEMCPY_STORE_ERROR 2 160 161 /* Copy from a not-aligned src to an aligned dst, using shifts. Handles 4 words 162 * per loop. This code is derived from glibc. 163 */ 164 static inline unsigned long copy_dstaligned(unsigned long dst, 165 unsigned long src, unsigned long len) 166 { 167 /* gcc complains that a2 and a3 may be uninitialized, but actually 168 * they cannot be. Initialize a2/a3 to shut gcc up. 169 */ 170 register unsigned int a0, a1, a2 = 0, a3 = 0; 171 int sh_1, sh_2; 172 173 /* prefetch_src((const void *)src); */ 174 175 /* Calculate how to shift a word read at the memory operation 176 aligned srcp to make it aligned for copy. */ 177 sh_1 = 8 * (src % sizeof(unsigned int)); 178 sh_2 = 8 * sizeof(unsigned int) - sh_1; 179 180 /* Make src aligned by rounding it down. */ 181 src &= -sizeof(unsigned int); 182 183 switch (len % 4) 184 { 185 case 2: 186 /* a1 = ((unsigned int *) src)[0]; 187 a2 = ((unsigned int *) src)[1]; */ 188 ldw(s_space, 0, src, a1, cda_ldw_exc); 189 ldw(s_space, 4, src, a2, cda_ldw_exc); 190 src -= 1 * sizeof(unsigned int); 191 dst -= 3 * sizeof(unsigned int); 192 len += 2; 193 goto do1; 194 case 3: 195 /* a0 = ((unsigned int *) src)[0]; 196 a1 = ((unsigned int *) src)[1]; */ 197 ldw(s_space, 0, src, a0, cda_ldw_exc); 198 ldw(s_space, 4, src, a1, cda_ldw_exc); 199 src -= 0 * sizeof(unsigned int); 200 dst -= 2 * sizeof(unsigned int); 201 len += 1; 202 goto do2; 203 case 0: 204 if (len == 0) 205 return PA_MEMCPY_OK; 206 /* a3 = ((unsigned int *) src)[0]; 207 a0 = ((unsigned int *) src)[1]; */ 208 ldw(s_space, 0, src, a3, cda_ldw_exc); 209 ldw(s_space, 4, src, a0, cda_ldw_exc); 210 src -=-1 * sizeof(unsigned int); 211 dst -= 1 * sizeof(unsigned int); 212 len += 0; 213 goto do3; 214 case 1: 215 /* a2 = ((unsigned int *) src)[0]; 216 a3 = ((unsigned int *) src)[1]; */ 217 ldw(s_space, 0, src, a2, cda_ldw_exc); 218 ldw(s_space, 4, src, a3, cda_ldw_exc); 219 src -=-2 * sizeof(unsigned int); 220 dst -= 0 * sizeof(unsigned int); 221 len -= 1; 222 if (len == 0) 223 goto do0; 224 goto do4; /* No-op. */ 225 } 226 227 do 228 { 229 /* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */ 230 do4: 231 /* a0 = ((unsigned int *) src)[0]; */ 232 ldw(s_space, 0, src, a0, cda_ldw_exc); 233 /* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */ 234 stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc); 235 do3: 236 /* a1 = ((unsigned int *) src)[1]; */ 237 ldw(s_space, 4, src, a1, cda_ldw_exc); 238 /* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */ 239 stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc); 240 do2: 241 /* a2 = ((unsigned int *) src)[2]; */ 242 ldw(s_space, 8, src, a2, cda_ldw_exc); 243 /* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */ 244 stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc); 245 do1: 246 /* a3 = ((unsigned int *) src)[3]; */ 247 ldw(s_space, 12, src, a3, cda_ldw_exc); 248 /* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */ 249 stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc); 250 251 src += 4 * sizeof(unsigned int); 252 dst += 4 * sizeof(unsigned int); 253 len -= 4; 254 } 255 while (len != 0); 256 257 do0: 258 /* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */ 259 stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc); 260 261 preserve_branch(handle_load_error); 262 preserve_branch(handle_store_error); 263 264 return PA_MEMCPY_OK; 265 266 handle_load_error: 267 __asm__ __volatile__ ("cda_ldw_exc:\n"); 268 return PA_MEMCPY_LOAD_ERROR; 269 270 handle_store_error: 271 __asm__ __volatile__ ("cda_stw_exc:\n"); 272 return PA_MEMCPY_STORE_ERROR; 273 } 274 275 276 /* Returns PA_MEMCPY_OK, PA_MEMCPY_LOAD_ERROR or PA_MEMCPY_STORE_ERROR. 277 * In case of an access fault the faulty address can be read from the per_cpu 278 * exception data struct. */ 279 static unsigned long pa_memcpy_internal(void *dstp, const void *srcp, 280 unsigned long len) 281 { 282 register unsigned long src, dst, t1, t2, t3; 283 register unsigned char *pcs, *pcd; 284 register unsigned int *pws, *pwd; 285 register double *pds, *pdd; 286 unsigned long ret; 287 288 src = (unsigned long)srcp; 289 dst = (unsigned long)dstp; 290 pcs = (unsigned char *)srcp; 291 pcd = (unsigned char *)dstp; 292 293 /* prefetch_src((const void *)srcp); */ 294 295 if (len < THRESHOLD) 296 goto byte_copy; 297 298 /* Check alignment */ 299 t1 = (src ^ dst); 300 if (unlikely(t1 & (sizeof(double)-1))) 301 goto unaligned_copy; 302 303 /* src and dst have same alignment. */ 304 305 /* Copy bytes till we are double-aligned. */ 306 t2 = src & (sizeof(double) - 1); 307 if (unlikely(t2 != 0)) { 308 t2 = sizeof(double) - t2; 309 while (t2 && len) { 310 /* *pcd++ = *pcs++; */ 311 ldbma(s_space, pcs, t3, pmc_load_exc); 312 len--; 313 stbma(d_space, t3, pcd, pmc_store_exc); 314 t2--; 315 } 316 } 317 318 pds = (double *)pcs; 319 pdd = (double *)pcd; 320 321 #if 0 322 /* Copy 8 doubles at a time */ 323 while (len >= 8*sizeof(double)) { 324 register double r1, r2, r3, r4, r5, r6, r7, r8; 325 /* prefetch_src((char *)pds + L1_CACHE_BYTES); */ 326 flddma(s_space, pds, r1, pmc_load_exc); 327 flddma(s_space, pds, r2, pmc_load_exc); 328 flddma(s_space, pds, r3, pmc_load_exc); 329 flddma(s_space, pds, r4, pmc_load_exc); 330 fstdma(d_space, r1, pdd, pmc_store_exc); 331 fstdma(d_space, r2, pdd, pmc_store_exc); 332 fstdma(d_space, r3, pdd, pmc_store_exc); 333 fstdma(d_space, r4, pdd, pmc_store_exc); 334 335 #if 0 336 if (L1_CACHE_BYTES <= 32) 337 prefetch_src((char *)pds + L1_CACHE_BYTES); 338 #endif 339 flddma(s_space, pds, r5, pmc_load_exc); 340 flddma(s_space, pds, r6, pmc_load_exc); 341 flddma(s_space, pds, r7, pmc_load_exc); 342 flddma(s_space, pds, r8, pmc_load_exc); 343 fstdma(d_space, r5, pdd, pmc_store_exc); 344 fstdma(d_space, r6, pdd, pmc_store_exc); 345 fstdma(d_space, r7, pdd, pmc_store_exc); 346 fstdma(d_space, r8, pdd, pmc_store_exc); 347 len -= 8*sizeof(double); 348 } 349 #endif 350 351 pws = (unsigned int *)pds; 352 pwd = (unsigned int *)pdd; 353 354 word_copy: 355 while (len >= 8*sizeof(unsigned int)) { 356 register unsigned int r1,r2,r3,r4,r5,r6,r7,r8; 357 /* prefetch_src((char *)pws + L1_CACHE_BYTES); */ 358 ldwma(s_space, pws, r1, pmc_load_exc); 359 ldwma(s_space, pws, r2, pmc_load_exc); 360 ldwma(s_space, pws, r3, pmc_load_exc); 361 ldwma(s_space, pws, r4, pmc_load_exc); 362 stwma(d_space, r1, pwd, pmc_store_exc); 363 stwma(d_space, r2, pwd, pmc_store_exc); 364 stwma(d_space, r3, pwd, pmc_store_exc); 365 stwma(d_space, r4, pwd, pmc_store_exc); 366 367 ldwma(s_space, pws, r5, pmc_load_exc); 368 ldwma(s_space, pws, r6, pmc_load_exc); 369 ldwma(s_space, pws, r7, pmc_load_exc); 370 ldwma(s_space, pws, r8, pmc_load_exc); 371 stwma(d_space, r5, pwd, pmc_store_exc); 372 stwma(d_space, r6, pwd, pmc_store_exc); 373 stwma(d_space, r7, pwd, pmc_store_exc); 374 stwma(d_space, r8, pwd, pmc_store_exc); 375 len -= 8*sizeof(unsigned int); 376 } 377 378 while (len >= 4*sizeof(unsigned int)) { 379 register unsigned int r1,r2,r3,r4; 380 ldwma(s_space, pws, r1, pmc_load_exc); 381 ldwma(s_space, pws, r2, pmc_load_exc); 382 ldwma(s_space, pws, r3, pmc_load_exc); 383 ldwma(s_space, pws, r4, pmc_load_exc); 384 stwma(d_space, r1, pwd, pmc_store_exc); 385 stwma(d_space, r2, pwd, pmc_store_exc); 386 stwma(d_space, r3, pwd, pmc_store_exc); 387 stwma(d_space, r4, pwd, pmc_store_exc); 388 len -= 4*sizeof(unsigned int); 389 } 390 391 pcs = (unsigned char *)pws; 392 pcd = (unsigned char *)pwd; 393 394 byte_copy: 395 while (len) { 396 /* *pcd++ = *pcs++; */ 397 ldbma(s_space, pcs, t3, pmc_load_exc); 398 stbma(d_space, t3, pcd, pmc_store_exc); 399 len--; 400 } 401 402 return PA_MEMCPY_OK; 403 404 unaligned_copy: 405 /* possibly we are aligned on a word, but not on a double... */ 406 if (likely((t1 & (sizeof(unsigned int)-1)) == 0)) { 407 t2 = src & (sizeof(unsigned int) - 1); 408 409 if (unlikely(t2 != 0)) { 410 t2 = sizeof(unsigned int) - t2; 411 while (t2) { 412 /* *pcd++ = *pcs++; */ 413 ldbma(s_space, pcs, t3, pmc_load_exc); 414 stbma(d_space, t3, pcd, pmc_store_exc); 415 len--; 416 t2--; 417 } 418 } 419 420 pws = (unsigned int *)pcs; 421 pwd = (unsigned int *)pcd; 422 goto word_copy; 423 } 424 425 /* Align the destination. */ 426 if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) { 427 t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1)); 428 while (t2) { 429 /* *pcd++ = *pcs++; */ 430 ldbma(s_space, pcs, t3, pmc_load_exc); 431 stbma(d_space, t3, pcd, pmc_store_exc); 432 len--; 433 t2--; 434 } 435 dst = (unsigned long)pcd; 436 src = (unsigned long)pcs; 437 } 438 439 ret = copy_dstaligned(dst, src, len / sizeof(unsigned int)); 440 if (ret) 441 return ret; 442 443 pcs += (len & -sizeof(unsigned int)); 444 pcd += (len & -sizeof(unsigned int)); 445 len %= sizeof(unsigned int); 446 447 preserve_branch(handle_load_error); 448 preserve_branch(handle_store_error); 449 450 goto byte_copy; 451 452 handle_load_error: 453 __asm__ __volatile__ ("pmc_load_exc:\n"); 454 return PA_MEMCPY_LOAD_ERROR; 455 456 handle_store_error: 457 __asm__ __volatile__ ("pmc_store_exc:\n"); 458 return PA_MEMCPY_STORE_ERROR; 459 } 460 461 462 /* Returns 0 for success, otherwise, returns number of bytes not transferred. */ 463 static unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len) 464 { 465 unsigned long ret, fault_addr, reference; 466 struct exception_data *d; 467 468 ret = pa_memcpy_internal(dstp, srcp, len); 469 if (likely(ret == PA_MEMCPY_OK)) 470 return 0; 471 472 /* if a load or store fault occured we can get the faulty addr */ 473 d = &__get_cpu_var(exception_data); 474 fault_addr = d->fault_addr; 475 476 /* error in load or store? */ 477 if (ret == PA_MEMCPY_LOAD_ERROR) 478 reference = (unsigned long) srcp; 479 else 480 reference = (unsigned long) dstp; 481 482 DPRINTF("pa_memcpy: fault type = %lu, len=%lu fault_addr=%lu ref=%lu\n", 483 ret, len, fault_addr, reference); 484 485 if (fault_addr >= reference) 486 return len - (fault_addr - reference); 487 else 488 return len; 489 } 490 491 #ifdef __KERNEL__ 492 unsigned long copy_to_user(void __user *dst, const void *src, unsigned long len) 493 { 494 mtsp(get_kernel_space(), 1); 495 mtsp(get_user_space(), 2); 496 return pa_memcpy((void __force *)dst, src, len); 497 } 498 499 EXPORT_SYMBOL(__copy_from_user); 500 unsigned long __copy_from_user(void *dst, const void __user *src, unsigned long len) 501 { 502 mtsp(get_user_space(), 1); 503 mtsp(get_kernel_space(), 2); 504 return pa_memcpy(dst, (void __force *)src, len); 505 } 506 507 unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned long len) 508 { 509 mtsp(get_user_space(), 1); 510 mtsp(get_user_space(), 2); 511 return pa_memcpy((void __force *)dst, (void __force *)src, len); 512 } 513 514 515 void * memcpy(void * dst,const void *src, size_t count) 516 { 517 mtsp(get_kernel_space(), 1); 518 mtsp(get_kernel_space(), 2); 519 pa_memcpy(dst, src, count); 520 return dst; 521 } 522 523 EXPORT_SYMBOL(copy_to_user); 524 EXPORT_SYMBOL(copy_from_user); 525 EXPORT_SYMBOL(copy_in_user); 526 EXPORT_SYMBOL(memcpy); 527 #endif 528