/* memcpy.S: Sparc optimized memcpy and memmove code
 * Hand optimized from GNU libc's memcpy and memmove
 * Copyright (C) 1991,1996 Free Software Foundation
 * Copyright (C) 1995 Linus Torvalds (Linus.Torvalds@helsinki.fi)
 * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
 * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be)
 * Copyright (C) 1996 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
 */

#include <asm/export.h>
#define FUNC(x) 		\
	.globl	x;		\
	.type	x,@function;	\
	.align	4;		\
x:

/* Both these macros have to start with exactly the same insn */
#define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd	[%src + (offset) + 0x00], %t0; \
	ldd	[%src + (offset) + 0x08], %t2; \
	ldd	[%src + (offset) + 0x10], %t4; \
	ldd	[%src + (offset) + 0x18], %t6; \
	st	%t0, [%dst + (offset) + 0x00]; \
	st	%t1, [%dst + (offset) + 0x04]; \
	st	%t2, [%dst + (offset) + 0x08]; \
	st	%t3, [%dst + (offset) + 0x0c]; \
	st	%t4, [%dst + (offset) + 0x10]; \
	st	%t5, [%dst + (offset) + 0x14]; \
	st	%t6, [%dst + (offset) + 0x18]; \
	st	%t7, [%dst + (offset) + 0x1c];

#define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd	[%src + (offset) + 0x00], %t0; \
	ldd	[%src + (offset) + 0x08], %t2; \
	ldd	[%src + (offset) + 0x10], %t4; \
	ldd	[%src + (offset) + 0x18], %t6; \
	std	%t0, [%dst + (offset) + 0x00]; \
	std	%t2, [%dst + (offset) + 0x08]; \
	std	%t4, [%dst + (offset) + 0x10]; \
	std	%t6, [%dst + (offset) + 0x18];

#define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldd	[%src - (offset) - 0x10], %t0; \
	ldd	[%src - (offset) - 0x08], %t2; \
	st	%t0, [%dst - (offset) - 0x10]; \
	st	%t1, [%dst - (offset) - 0x0c]; \
	st	%t2, [%dst - (offset) - 0x08]; \
	st	%t3, [%dst - (offset) - 0x04];

#define MOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldd	[%src - (offset) - 0x10], %t0; \
	ldd	[%src - (offset) - 0x08], %t2; \
	std	%t0, [%dst - (offset) - 0x10]; \
	std	%t2, [%dst - (offset) - 0x08];

#define MOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
	ldub	[%src - (offset) - 0x02], %t0; \
	ldub	[%src - (offset) - 0x01], %t1; \
	stb	%t0, [%dst - (offset) - 0x02]; \
	stb	%t1, [%dst - (offset) - 0x01];

/* Both these macros have to start with exactly the same insn */
#define RMOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd	[%src - (offset) - 0x20], %t0; \
	ldd	[%src - (offset) - 0x18], %t2; \
	ldd	[%src - (offset) - 0x10], %t4; \
	ldd	[%src - (offset) - 0x08], %t6; \
	st	%t0, [%dst - (offset) - 0x20]; \
	st	%t1, [%dst - (offset) - 0x1c]; \
	st	%t2, [%dst - (offset) - 0x18]; \
	st	%t3, [%dst - (offset) - 0x14]; \
	st	%t4, [%dst - (offset) - 0x10]; \
	st	%t5, [%dst - (offset) - 0x0c]; \
	st	%t6, [%dst - (offset) - 0x08]; \
	st	%t7, [%dst - (offset) - 0x04];

#define RMOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, t7) \
	ldd	[%src - (offset) - 0x20], %t0; \
	ldd	[%src - (offset) - 0x18], %t2; \
	ldd	[%src - (offset) - 0x10], %t4; \
	ldd	[%src - (offset) - 0x08], %t6; \
	std	%t0, [%dst - (offset) - 0x20]; \
	std	%t2, [%dst - (offset) - 0x18]; \
	std	%t4, [%dst - (offset) - 0x10]; \
	std	%t6, [%dst - (offset) - 0x08];

#define RMOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3) \
	ldd	[%src + (offset) + 0x00], %t0; \
	ldd	[%src + (offset) + 0x08], %t2; \
	st	%t0, [%dst + (offset) + 0x00]; \
	st	%t1, [%dst + (offset) + 0x04]; \
	st	%t2, [%dst + (offset) + 0x08]; \
	st	%t3, [%dst + (offset) + 0x0c];

#define RMOVE_SHORTCHUNK(src, dst, offset, t0, t1) \
	ldub	[%src + (offset) + 0x00], %t0; \
	ldub	[%src + (offset) + 0x01], %t1; \
	stb	%t0, [%dst + (offset) + 0x00]; \
	stb	%t1, [%dst + (offset) + 0x01];

#define SMOVE_CHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, prev, shil, shir, offset2) \
	ldd	[%src + (offset) + 0x00], %t0; \
	ldd	[%src + (offset) + 0x08], %t2; \
	srl	%t0, shir, %t5; \
	srl	%t1, shir, %t6; \
	sll	%t0, shil, %t0; \
	or	%t5, %prev, %t5; \
	sll	%t1, shil, %prev; \
	or	%t6, %t0, %t0; \
	srl	%t2, shir, %t1; \
	srl	%t3, shir, %t6; \
	sll	%t2, shil, %t2; \
	or	%t1, %prev, %t1; \
	std	%t4, [%dst + (offset) + (offset2) - 0x04]; \
	std	%t0, [%dst + (offset) + (offset2) + 0x04]; \
	sll	%t3, shil, %prev; \
	or	%t6, %t2, %t4;

#define SMOVE_ALIGNCHUNK(src, dst, offset, t0, t1, t2, t3, t4, t5, t6, prev, shil, shir, offset2) \
	ldd	[%src + (offset) + 0x00], %t0; \
	ldd	[%src + (offset) + 0x08], %t2; \
	srl	%t0, shir, %t4; \
	srl	%t1, shir, %t5; \
	sll	%t0, shil, %t6; \
	or	%t4, %prev, %t0; \
	sll	%t1, shil, %prev; \
	or	%t5, %t6, %t1; \
	srl	%t2, shir, %t4; \
	srl	%t3, shir, %t5; \
	sll	%t2, shil, %t6; \
	or	%t4, %prev, %t2; \
	sll	%t3, shil, %prev; \
	or	%t5, %t6, %t3; \
	std	%t0, [%dst + (offset) + (offset2) + 0x00]; \
	std	%t2, [%dst + (offset) + (offset2) + 0x08];
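
/* Note on the macros above: ldd/std move an aligned doubleword through an
 * even/odd register pair, so each tN parameter names the even register of
 * a pair and its odd partner (t1, t3, ...) carries the second word of the
 * load.  Each *_BIGCHUNK expansion copies 0x20 bytes and each *_LASTCHUNK
 * expansion 0x10 bytes; the computed jumps further down rely on every
 * expansion having a fixed code size.
 */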

	.text
	.align	4

0:
	retl
	 nop		! Only bcopy returns here and it returns void...

#ifdef __KERNEL__
FUNC(amemmove)
FUNC(__memmove)
EXPORT_SYMBOL(__memmove)
#endif
FUNC(memmove)
EXPORT_SYMBOL(memmove)
	cmp	%o0, %o1
	mov	%o0, %g7
	bleu	9f
	 sub	%o0, %o1, %o4

	add	%o1, %o2, %o3
	cmp	%o3, %o0
	bleu	0f
	 andcc	%o4, 3, %o5

	add	%o1, %o2, %o1
	add	%o0, %o2, %o0
	sub	%o1, 1, %o1
	sub	%o0, 1, %o0

1:	/* reverse_bytes */

	ldub	[%o1], %o4
	subcc	%o2, 1, %o2
	stb	%o4, [%o0]
	sub	%o1, 1, %o1
	bne	1b
	 sub	%o0, 1, %o0

	retl
	 mov	%g7, %o0

/* NOTE: This code is executed just for the cases
         where %src (= %o1) & 3 != 0.
         We need to align it to 4.  So, depending on (%src & 3):
	 1	we need to do ldub, lduh
	 2	lduh
	 3	just ldub
         so even if it looks weird, the branches
         are correct here. -jj
 */
78:	/* dword_align */

	andcc	%o1, 1, %g0
	be	4f
	 andcc	%o1, 2, %g0

	ldub	[%o1], %g2
	add	%o1, 1, %o1
	stb	%g2, [%o0]
	sub	%o2, 1, %o2
	bne	3f
	 add	%o0, 1, %o0
4:
	lduh	[%o1], %g2
	add	%o1, 2, %o1
	sth	%g2, [%o0]
	sub	%o2, 2, %o2
	b	3f
	 add	%o0, 2, %o0

FUNC(memcpy)	/* %o0=dst %o1=src %o2=len */
EXPORT_SYMBOL(memcpy)

	sub	%o0, %o1, %o4
	mov	%o0, %g7
9:
	andcc	%o4, 3, %o5
0:
	bne	86f
	 cmp	%o2, 15

	bleu	90f
	 andcc	%o1, 3, %g0

	bne	78b
3:
	 andcc	%o1, 4, %g0

	be	2f
	 mov	%o2, %g1

	ld	[%o1], %o4
	sub	%g1, 4, %g1
	st	%o4, [%o0]
	add	%o1, 4, %o1
	add	%o0, 4, %o0
2:
	andcc	%g1, 0xffffff80, %g0
	be	3f
	 andcc	%o0, 4, %g0

	be	82f + 4
5:
	MOVE_BIGCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
	sub	%g1, 128, %g1
	add	%o1, 128, %o1
	cmp	%g1, 128
	bge	5b
	 add	%o0, 128, %o0
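
/* Copy the remaining 16-byte chunks (%g1 & 0x70) by jumping into the
 * unrolled MOVE_LASTCHUNK sequence at 79f.  Each expansion there is
 * 6 instructions (24 bytes of code) per 16 bytes of data, hence the
 * entry offset of %g4 + %g4/2 subtracted from 80f below; %o0 and %o1
 * are advanced past the chunks first because the table entries use
 * negative offsets.
 */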
3:
	andcc	%g1, 0x70, %g4
	be	80f
	 andcc	%g1, 8, %g0

	sethi	%hi(80f), %o5
	srl	%g4, 1, %o4
	add	%g4, %o4, %o4
	add	%o1, %g4, %o1
	sub	%o5, %o4, %o5
	jmpl	%o5 + %lo(80f), %g0
	 add	%o0, %g4, %o0

79:	/* memcpy_table */

	MOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
	MOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

80:	/* memcpy_table_end */
	be	81f
	 andcc	%g1, 4, %g0

	ldd	[%o1], %g2
	add	%o0, 8, %o0
	st	%g2, [%o0 - 0x08]
	add	%o1, 8, %o1
	st	%g3, [%o0 - 0x04]

81:	/* memcpy_last7 */

	be	1f
	 andcc	%g1, 2, %g0

	ld	[%o1], %g2
	add	%o1, 4, %o1
	st	%g2, [%o0]
	add	%o0, 4, %o0
1:
	be	1f
	 andcc	%g1, 1, %g0

	lduh	[%o1], %g2
	add	%o1, 2, %o1
	sth	%g2, [%o0]
	add	%o0, 2, %o0
1:
	be	1f
	 nop

	ldub	[%o1], %g2
	stb	%g2, [%o0]
1:
	retl
	 mov	%g7, %o0

82:	/* ldd_std */
	MOVE_BIGALIGNCHUNK(o1, o0, 0x00, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGALIGNCHUNK(o1, o0, 0x20, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGALIGNCHUNK(o1, o0, 0x40, o2, o3, o4, o5, g2, g3, g4, g5)
	MOVE_BIGALIGNCHUNK(o1, o0, 0x60, o2, o3, o4, o5, g2, g3, g4, g5)
	subcc	%g1, 128, %g1
	add	%o1, 128, %o1
	cmp	%g1, 128
	bge	82b
	 add	%o0, 128, %o0

	andcc	%g1, 0x70, %g4
	be	84f
	 andcc	%g1, 8, %g0

	sethi	%hi(84f), %o5
	add	%o1, %g4, %o1
	sub	%o5, %g4, %o5
	jmpl	%o5 + %lo(84f), %g0
	 add	%o0, %g4, %o0

83:	/* amemcpy_table */

	MOVE_LASTALIGNCHUNK(o1, o0, 0x60, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x50, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x40, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x30, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x20, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x10, g2, g3, g4, g5)
	MOVE_LASTALIGNCHUNK(o1, o0, 0x00, g2, g3, g4, g5)

84:	/* amemcpy_table_end */
	be	85f
	 andcc	%g1, 4, %g0

	ldd	[%o1], %g2
	add	%o0, 8, %o0
	std	%g2, [%o0 - 0x08]
	add	%o1, 8, %o1
85:	/* amemcpy_last7 */
	be	1f
	 andcc	%g1, 2, %g0

	ld	[%o1], %g2
	add	%o1, 4, %o1
	st	%g2, [%o0]
	add	%o0, 4, %o0
1:
	be	1f
	 andcc	%g1, 1, %g0

	lduh	[%o1], %g2
	add	%o1, 2, %o1
	sth	%g2, [%o0]
	add	%o0, 2, %o0
1:
	be	1f
	 nop

	ldub	[%o1], %g2
	stb	%g2, [%o0]
1:
	retl
	 mov	%g7, %o0
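
/* non_aligned: %o0 and %o1 differ in their low two address bits, so the
 * two can never be word-aligned at the same time.  Up to three leading
 * bytes are copied to word-align the destination; the loop at label 5
 * below then reads aligned words from the rounded-down source and builds
 * each output word from two neighbouring input words with sll/srl by
 * (src & 3) * 8 and 32 - (src & 3) * 8 bits (%g4 and %l0).
 */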
86:	/* non_aligned */
	cmp	%o2, 6
	bleu	88f
	 nop

	save	%sp, -96, %sp
	andcc	%i0, 3, %g0
	be	61f
	 andcc	%i0, 1, %g0
	be	60f
	 andcc	%i0, 2, %g0

	ldub	[%i1], %g5
	add	%i1, 1, %i1
	stb	%g5, [%i0]
	sub	%i2, 1, %i2
	bne	61f
	 add	%i0, 1, %i0
60:
	ldub	[%i1], %g3
	add	%i1, 2, %i1
	stb	%g3, [%i0]
	sub	%i2, 2, %i2
	ldub	[%i1 - 1], %g3
	add	%i0, 2, %i0
	stb	%g3, [%i0 - 1]
61:
	and	%i1, 3, %g2
	and	%i2, 0xc, %g3
	and	%i1, -4, %i1
	cmp	%g3, 4
	sll	%g2, 3, %g4
	mov	32, %g2
	be	4f
	 sub	%g2, %g4, %l0

	blu	3f
	 cmp	%g3, 0x8

	be	2f
	 srl	%i2, 2, %g3

	ld	[%i1], %i3
	add	%i0, -8, %i0
	ld	[%i1 + 4], %i4
	b	8f
	 add	%g3, 1, %g3
2:
	ld	[%i1], %i4
	add	%i0, -12, %i0
	ld	[%i1 + 4], %i5
	add	%g3, 2, %g3
	b	9f
	 add	%i1, -4, %i1
3:
	ld	[%i1], %g1
	add	%i0, -4, %i0
	ld	[%i1 + 4], %i3
	srl	%i2, 2, %g3
	b	7f
	 add	%i1, 4, %i1
4:
	ld	[%i1], %i5
	cmp	%i2, 7
	ld	[%i1 + 4], %g1
	srl	%i2, 2, %g3
	bleu	10f
	 add	%i1, 8, %i1

	ld	[%i1], %i3
	add	%g3, -1, %g3
5:
	sll	%i5, %g4, %g2
	srl	%g1, %l0, %g5
	or	%g2, %g5, %g2
	st	%g2, [%i0]
7:
	ld	[%i1 + 4], %i4
	sll	%g1, %g4, %g2
	srl	%i3, %l0, %g5
	or	%g2, %g5, %g2
	st	%g2, [%i0 + 4]
8:
	ld	[%i1 + 8], %i5
	sll	%i3, %g4, %g2
	srl	%i4, %l0, %g5
	or	%g2, %g5, %g2
	st	%g2, [%i0 + 8]
9:
	ld	[%i1 + 12], %g1
	sll	%i4, %g4, %g2
	srl	%i5, %l0, %g5
	addcc	%g3, -4, %g3
	or	%g2, %g5, %g2
	add	%i1, 16, %i1
	st	%g2, [%i0 + 12]
	add	%i0, 16, %i0
	bne,a	5b
	 ld	[%i1], %i3
10:
	sll	%i5, %g4, %g2
	srl	%g1, %l0, %g5
	srl	%l0, 3, %g3
	or	%g2, %g5, %g2
	sub	%i1, %g3, %i1
	andcc	%i2, 2, %g0
	st	%g2, [%i0]
	be	1f
	 andcc	%i2, 1, %g0

	ldub	[%i1], %g2
	add	%i1, 2, %i1
	stb	%g2, [%i0 + 4]
	add	%i0, 2, %i0
	ldub	[%i1 - 1], %g2
	stb	%g2, [%i0 + 3]
1:
	be	1f
	 nop
	ldub	[%i1], %g2
	stb	%g2, [%i0 + 4]
1:
	ret
	 restore %g7, %g0, %o0

88:	/* short_end */

	and	%o2, 0xe, %o3
20:
	sethi	%hi(89f), %o5
	sll	%o3, 3, %o4
	add	%o0, %o3, %o0
	sub	%o5, %o4, %o5
	add	%o1, %o3, %o1
	jmpl	%o5 + %lo(89f), %g0
	 andcc	%o2, 1, %g0

	MOVE_SHORTCHUNK(o1, o0, 0x0c, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x0a, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x08, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x06, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x04, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x02, g2, g3)
	MOVE_SHORTCHUNK(o1, o0, 0x00, g2, g3)

89:	/* short_table_end */

	be	1f
	 nop

	ldub	[%o1], %g2
	stb	%g2, [%o0]
1:
	retl
	 mov	%g7, %o0

90:	/* short_aligned_end */
	bne	88b
	 andcc	%o2, 8, %g0

	be	1f
	 andcc	%o2, 4, %g0

	ld	[%o1 + 0x00], %g2
	ld	[%o1 + 0x04], %g3
	add	%o1, 8, %o1
	st	%g2, [%o0 + 0x00]
	st	%g3, [%o0 + 0x04]
	add	%o0, 8, %o0
1:
	b	81b
	 mov	%o2, %g1