/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)

#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

#define COPY_16_BYTES_EXCODE(n)		\
9 ## n ## 0:				\
	addi	r5,r5,-(16 * n);	\
	b	104f;			\
9 ## n ## 1:				\
	addi	r5,r5,-(16 * n);	\
	b	105f;			\
.section __ex_table,"a";		\
	.align	2;			\
	.long	8 ## n ## 0b,9 ## n ## 0b;	\
	.long	8 ## n ## 1b,9 ## n ## 0b;	\
	.long	8 ## n ## 2b,9 ## n ## 0b;	\
	.long	8 ## n ## 3b,9 ## n ## 0b;	\
	.long	8 ## n ## 4b,9 ## n ## 1b;	\
	.long	8 ## n ## 5b,9 ## n ## 1b;	\
	.long	8 ## n ## 6b,9 ## n ## 1b;	\
	.long	8 ## n ## 7b,9 ## n ## 1b;	\
	.text

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy_32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_BYTES
LG_CACHELINE_BYTES = L1_CACHE_SHIFT
CACHELINE_MASK = (L1_CACHE_BYTES-1)

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 */
_GLOBAL(cacheable_memzero)
	mr	r5,r4
	li	r4,0
	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4
10:	dcbz	r7,r6
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4
2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

_GLOBAL(memset)
	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15
	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr

/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 */
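/*
 * The arguments are assumed to follow the usual PPC32 ELF ABI, the same
 * convention memcpy below uses: dest in r3, src in r4, byte count in r5,
 * roughly  void *cacheable_memcpy(void *dst, const void *src, unsigned long len)
 * (illustrative prototype; the exact declaration lives in the C headers).
 */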
_GLOBAL(cacheable_memcpy)
	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	memcpy			/* if regions overlap */

	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
	dcbz	r11,r6
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	blr

_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL(memcpy)
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b

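/*
 * Copy a block between kernel and user space, with exception fixups for
 * faults on the source or the destination.  Arguments follow the same
 * convention as above (dest in r3, src in r4, byte count in r5); the C
 * declaration is roughly
 *	unsigned long __copy_tofrom_user(void __user *to,
 *			const void __user *from, unsigned long size);
 * and the value returned in r3 is the number of bytes that could NOT be
 * copied, i.e. 0 on complete success.
 */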
_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	.section __ex_table,"a"
	.align	2
	.long	70b,100f
	.long	71b,101f
	.long	72b,102f
	.long	73b,103f
	.text

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	.section __ex_table,"a"
	.align	2
	.long	54b,105f
	.text
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr

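/*
 * Exception fixup code for __copy_tofrom_user.  The __ex_table entries
 * above send faulting loads/stores here: 100/101 for the initial byte
 * loop, 102/103 for the initial word loop, 104/105 for the cacheline
 * loop (reached via the COPY_16_BYTES_EXCODE stubs, and directly for a
 * fault on dcbz), 108/109 for the final word loop and 110/111 for the
 * final byte loop.  Even-numbered labels handle read faults, odd ones
 * write faults.
 */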
/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_BYTES >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_BYTES >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_BYTES >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f			/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:	mfctr	r3
	srwi.	r0,r3,2
	li	r9,0
	mtctr	r0
	beq	113f
112:	stwu	r9,4(r6)
	bdnz	112b
113:	andi.	r0,r3,3
	mtctr	r0
	beq	120f
114:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	114b
120:	blr

	.section __ex_table,"a"
	.align	2
	.long	30b,108b
	.long	31b,109b
	.long	40b,110b
	.long	41b,111b
	.long	130b,132b
	.long	131b,120b
	.long	112b,120b
	.long	114b,120b
	.text
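/*
 * Each pair emitted into __ex_table above associates the address of an
 * instruction that may fault on a user address (the marked loads/stores
 * and the dcbz) with the fixup code to branch to if it does; the
 * kernel's fault handler searches these tables and resumes execution at
 * the fixup instead of treating the fault as fatal.
 */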