/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
 */

	.file	"memcpy.s"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *       Memmove() and bcopy() do.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *			char *s1 = s;
 *			const char *s2 = s0;
 *			do {
 *				*s1++ = *s2++;
 *			} while (--n != 0);
 *		}
 *		return (s);
 *	}
 *
 * Register usage (both entry points, SPARC V9 ABI):
 *	%o0 = destination, %o1 = source, %o2 = byte count
 *	%g1 = saved destination for the return value
 *	%o3-%o5 = scratch; %d0-%d30 / %f0-%f46 = VIS data staging
 */

#include <sys/asm_linkage.h>
#include <sys/sun4asi.h>
#include <sys/trap.h>

#define	ICACHE_LINE_SIZE	64
#define	BLOCK_SIZE		64
#define	FPRS_FEF		0x4	/* fprs.fef: FP-enable bit */

/* Byte-count thresholds selecting the FP/VIS copy paths */
#define	ALIGNED8_FPCOPY_THRESHOLD	1024
#define	ALIGNED4_FPCOPY_THRESHOLD	1024
#define	BST_THRESHOLD			65536	/* switch to block-store (.xlarge) */

#define	SHORTCOPY	3
#define	SMALL_MAX	64
#define	MEDIUM_MAX	255
#define	MED_WMAX	256	/* max copy for medium word-aligned case */

/* prefetch function-code values (strength hints) */
#define	N_READS_STRONG	20
#define	N_WRITES_STRONG	22


	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

/*
 * memmove(dst, src, len): overlap-safe copy.
 * If src >= dst, or the regions do not overlap in the dangerous
 * direction (len <= dst - src), a plain forward memcpy is safe and we
 * jump into the memcpy body at .forcpy.  Otherwise fall through to the
 * backward (high-address to low-address) copy at .ovbc.
 */
	ENTRY(memmove)
	prefetch [%o1], N_READS_STRONG
	prefetch [%o0], N_WRITES_STRONG
	cmp	%o1, %o0	! if from address is >= to use forward copy
	bgeu	%ncc, .forcpy	! else use backward if ...
	sub	%o0, %o1, %o4	! get difference of two addresses
	cmp	%o2, %o4	! compare size and difference of addresses
	bleu	%ncc, .forcpy	! if size is bigger, do overlapped copy
	nop

	!
	! an overlapped copy that must be done "backwards"
	!
.ovbc:
	mov	%o0, %g1		! save dest address for return val
	add	%o1, %o2, %o1		! get to end of source space
	add	%o0, %o2, %o0		! get to end of destination space

	cmp	%o2, 64
	bgeu,pn	%ncc, .dbalign		! 64+ bytes: go align DST and use wider moves
	nop
	cmp	%o2, 4
	blt,pn	%ncc, .byte		! under 4 bytes: pure byte loop
	sub	%o2, 3, %o2		! bias count so bgu tests "more than 3 left"
.byte4loop:
	ldub	[%o1-1], %o3		! load last byte
	stb	%o3, [%o0-1]		! store last byte
	sub	%o1, 4, %o1
	ldub	[%o1+2], %o3		! load 2nd from last byte
	stb	%o3, [%o0-2]		! store 2nd from last byte
	sub	%o0, 4, %o0
	ldub	[%o1+1], %o3		! load 3rd from last byte
	stb	%o3, [%o0+1]		! store 3rd from last byte
	subcc	%o2, 4, %o2
	ldub	[%o1], %o3		! load 4th from last byte
	bgu,pt	%ncc, .byte4loop
	stb	%o3, [%o0]		! store 4th from last byte (delay slot)
.byte:
	addcc	%o2, 3, %o2		! undo the bias; sets Z when no bytes remain
	bz,pt	%ncc, .exit
.byteloop:				! first insn doubles as the delay slot above
	dec	%o1			! decrement src address
	ldub	[%o1], %o3		! read a byte
	dec	%o0			! decrement dst address
	deccc	%o2			! decrement count
	bgu,pt	%ncc, .byteloop		! loop until done
	stb	%o3, [%o0]		! write byte (delay slot)
.exit:
	retl
	mov	%g1, %o0		! return original dst

	.align	16
.dbalign:
	! Backward copy of 64+ bytes: first byte-copy until DST is
	! 8-byte aligned, then choose an aligned or misaligned path.
	prefetch [%o1 - (4 * BLOCK_SIZE)], #one_read
	prefetch [%o0 - (4 * BLOCK_SIZE)], #one_write
	andcc	%o0, 7, %o5		! bytes till DST 8 byte aligned
	bz,pt	%ncc, .dbmed
	sub	%o2, %o5, %o2		! update count (delay slot)
.dbalign1:
	dec	%o1			! decrement src address
	ldub	[%o1], %o3		! read a byte
	dec	%o0			! decrement dst address
	deccc	%o5			! decrement count
	bgu,pt	%ncc, .dbalign1		! loop until done
	stb	%o3, [%o0]		! store a byte (delay slot)

! check for src long word alignment
.dbmed:
	andcc	%o1, 7, %g0		! chk src long word alignment
	bnz,pn	%ncc, .dbbck		! misaligned src: use VIS faligndata path
	nop
!
! Following code is for overlapping copies where src and dest
! are long word aligned
!
!
! For SPARC64-VI, prefetch is effective for both integer and fp register
! operations. There are no benefits in using the fp registers for
! aligned data copying.

.dbmedl32enter:
	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%ncc, .dbmedl31		! skip big loop if less than 32
	nop
.dbmedl32:
	! backward copy, 32 bytes per iteration, long words
	ldx	[%o1-8], %o4		! load
	prefetch [%o1 - (8 * BLOCK_SIZE)], #one_read
	subcc	%o2, 32, %o2		! decrement length count
	stx	%o4, [%o0-8]		! and store
	prefetch [%o0 - (8 * BLOCK_SIZE)], #one_write
	ldx	[%o1-16], %o3		! a block of 32 bytes
	sub	%o1, 32, %o1		! decrease src ptr by 32
	stx	%o3, [%o0-16]
	ldx	[%o1+8], %o4
	sub	%o0, 32, %o0		! decrease dst ptr by 32
	stx	%o4, [%o0+8]
	ldx	[%o1], %o3
	bgu,pt	%ncc, .dbmedl32		! repeat if at least 32 bytes left
	stx	%o3, [%o0]		! (delay slot)
.dbmedl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%ncc, .dbmedl15		! skip if 15 or fewer bytes left
	nop
	ldx	[%o1-8], %o4		! load and store 16 bytes
	sub	%o1, 16, %o1		! decrease src ptr by 16
	stx	%o4, [%o0-8]
	sub	%o2, 16, %o2		! decrease count by 16
	ldx	[%o1], %o3
	sub	%o0, 16, %o0		! decrease dst ptr by 16
	stx	%o3, [%o0]
.dbmedl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .dbexit		! exit if finished
	nop
	cmp	%o2, 8
	blt,pt	%ncc, .dbremain		! skip if 7 or fewer bytes left
	nop
	ldx	[%o1-8], %o4		! load 8 bytes
	sub	%o1, 8, %o1		! decrease src ptr by 8
	stx	%o4, [%o0-8]		! and store 8 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	bnz	%ncc, .dbremain		! branch if bytes still remain
	sub	%o0, 8, %o0		! decrease dst ptr by 8 (delay slot)
	retl
	mov	%g1, %o0		! return original dst

!
! Following code is for overlapping copies where src and dest
! are not long word aligned
!
	.align	16
.dbbck:
	rd	%fprs, %o3		! o3 = fprs

	! if fprs.fef == 0, set it. Checking it, requires 2 instructions.
	! So set it anyway, without checking.
	wr	%g0, 0x4, %fprs		! fprs.fef = 1

	alignaddr %o1, %g0, %o5		! align src; also sets GSR.ALIGN for faligndata
	ldd	[%o5], %d0		! get first 8 byte block
	andn	%o2, 7, %o4		! prepare src ptr for finishup code
	cmp	%o2, 32
	blt,pn	%ncc, .dbmv8
	sub	%o1, %o4, %o1		! (delay slot)
	cmp	%o2, 4095		! check for short memmoves
	blt,pn	%ncc, .dbmv32enter	! go to no prefetch code
.dbmv64:
	! backward copy, 64 bytes per iteration via faligndata
	ldd	[%o5-8], %d2		! load 8 bytes
	ldd	[%o5-16], %d4		! load 8 bytes
	sub	%o5, 64, %o5		!
	ldd	[%o5+40], %d6		! load 8 bytes
	sub	%o0, 64, %o0		!
	ldd	[%o5+32], %d8		! load 8 bytes
	sub	%o2, 64, %o2		! 64 less bytes to copy
	ldd	[%o5+24], %d18		! load 8 bytes
	cmp	%o2, 64			! do we have < 64 bytes remaining
	ldd	[%o5+16], %d28		! load 8 bytes
	ldd	[%o5+8], %d30		! load 8 bytes
	faligndata %d2, %d0, %d10	! extract 8 bytes out
	prefetch [%o5 - (5 * BLOCK_SIZE)], #one_read
	ldd	[%o5], %d0		! load 8 bytes
	std	%d10, [%o0+56]		! store the current 8 bytes
	faligndata %d4, %d2, %d12	! extract 8 bytes out
	prefetch [%o0 - (5 * BLOCK_SIZE)], #one_write
	std	%d12, [%o0+48]		! store the current 8 bytes
	faligndata %d6, %d4, %d14	! extract 8 bytes out
	std	%d14, [%o0+40]		! store the current 8 bytes
	faligndata %d8, %d6, %d16	! extract 8 bytes out
	std	%d16, [%o0+32]		! store the current 8 bytes
	faligndata %d18, %d8, %d20	! extract 8 bytes out
	std	%d20, [%o0+24]		! store the current 8 bytes
	faligndata %d28, %d18, %d22	! extract 8 bytes out
	std	%d22, [%o0+16]		! store the current 8 bytes
	faligndata %d30, %d28, %d24	! extract 8 bytes out
	std	%d24, [%o0+8]		! store the current 8 bytes
	faligndata %d0, %d30, %d26	! extract 8 bytes out
	bgeu,pt	%ncc, .dbmv64
	std	%d26, [%o0]		! store the current 8 bytes (delay slot)

	cmp	%o2, 32
	blt,pn	%ncc, .dbmvx
	nop
.dbmv32:
	ldd	[%o5-8], %d2		! load 8 bytes
.dbmv32enter:
	ldd	[%o5-16], %d4		! load 8 bytes
	sub	%o5, 32, %o5		!
	ldd	[%o5+8], %d6		! load 8 bytes
	sub	%o0, 32, %o0		!
	faligndata %d2, %d0, %d10	! extract 8 bytes out
	ldd	[%o5], %d0		! load 8 bytes
	sub	%o2,32, %o2		! 32 less bytes to copy
	std	%d10, [%o0+24]		! store the current 8 bytes
	cmp	%o2, 32			! do we have < 32 bytes remaining
	faligndata %d4, %d2, %d12	! extract 8 bytes out
	std	%d12, [%o0+16]		! store the current 8 bytes
	faligndata %d6, %d4, %d14	! extract 8 bytes out
	std	%d14, [%o0+8]		! store the current 8 bytes
	faligndata %d0, %d6, %d16	! extract 8 bytes out
	bgeu,pt	%ncc, .dbmv32
	std	%d16, [%o0]		! store the current 8 bytes (delay slot)
.dbmvx:
	cmp	%o2, 8			! do we have < 8 bytes remaining
	blt,pt	%ncc, .dbmvfinish	! if yes, skip to finish up code
	nop
.dbmv8:
	ldd	[%o5-8], %d2
	sub	%o0, 8, %o0		! since we are at the end
					! when we first enter the loop
	sub	%o2, 8, %o2		! 8 less bytes to copy
	sub	%o5, 8, %o5
	cmp	%o2, 8			! do we have < 8 bytes remaining
	faligndata %d2, %d0, %d8	! extract 8 bytes out
	std	%d8, [%o0]		! store the current 8 bytes
	bgeu,pt	%ncc, .dbmv8
	fmovd	%d2, %d0		! carry block forward (delay slot)
.dbmvfinish:
	and	%o3, 0x4, %o3		! fprs.du = fprs.dl = 0
	tst	%o2
	bz,pt	%ncc, .dbexit
	wr	%o3, %g0, %fprs		! fprs = o3 restore fprs (delay slot)

.dbremain:
	! 1-7 leftover bytes, copied backward with integer loads
	cmp	%o2, 4
	blt,pn	%ncc, .dbbyte
	nop
	ldub	[%o1-1], %o3		! load last byte
	stb	%o3, [%o0-1]		! store last byte
	sub	%o1, 4, %o1
	ldub	[%o1+2], %o3		! load 2nd from last byte
	stb	%o3, [%o0-2]		! store 2nd from last byte
	sub	%o0, 4, %o0
	ldub	[%o1+1], %o3		! load 3rd from last byte
	stb	%o3, [%o0+1]		! store 3rd from last byte
	subcc	%o2, 4, %o2
	ldub	[%o1], %o3		! load 4th from last byte
	stb	%o3, [%o0]		! store 4th from last byte
	bz,pt	%ncc, .dbexit
.dbbyte:				! first insn doubles as the delay slot above
	dec	%o1			! decrement src address
	ldub	[%o1], %o3		! read a byte
	dec	%o0			! decrement dst address
	deccc	%o2			! decrement count
	bgu,pt	%ncc, .dbbyte		! loop until done
	stb	%o3, [%o0]		! write byte (delay slot)
.dbexit:
	retl
	mov	%g1, %o0		! return original dst
	SET_SIZE(memmove)


	.align ICACHE_LINE_SIZE
	ENTRY(memcpy)
	! adjust instruction alignment
	nop				! Do not remove, these nops affect
	nop				! icache alignment and performance
.forcpy:
	! Forward copy entry, shared by memcpy and non-overlapping memmove.
	prefetch [%o1], N_READS_STRONG
	prefetch [%o0], N_WRITES_STRONG
	cmp	%o2, SMALL_MAX		! check for not small case
	bgu,pn	%ncc, .medium		! go to larger cases
	mov	%o0, %g1		! save %o0 (delay slot)
	cmp	%o2, SHORTCOPY		! check for really short case
	ble,pt	%ncc, .smallleft	!
	or	%o0, %o1, %o3		! prepare alignment check (delay slot)
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .smallword	! branch to word aligned case
	sub	%o2, 3, %o2		! adjust count to allow cc zero test
.smallnotalign4:
	ldub	[%o1], %o3		! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	stb	%o3, [%o0]		! write byte
	ldub	[%o1+1], %o3		! repeat for a total of 4 bytes
	add	%o1, 4, %o1		! advance SRC by 4
	stb	%o3, [%o0+1]
	ldub	[%o1-2], %o3
	add	%o0, 4, %o0		! advance DST by 4
	stb	%o3, [%o0-2]
	ldub	[%o1-1], %o3
	bgu,pt	%ncc, .smallnotalign4	! loop til 3 or fewer bytes remain
	stb	%o3, [%o0-1]		! (delay slot)
	add	%o2, 3, %o2		! restore count
.smallleft:
	tst	%o2
	bz,pt	%ncc, .smallexit
	nop
.smallleft3:				! 1, 2, or 3 bytes remain
	ldub	[%o1], %o3		! load one byte
	deccc	%o2			! reduce count for cc test
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0]		! store one byte (delay slot)
	ldub	[%o1+1], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+1]		! store second byte (delay slot)
	ldub	[%o1+2], %o3		! load third byte
	stb	%o3, [%o0+2]		! store third byte
	retl
	mov	%g1, %o0		! restore %o0

	.align	16
	nop				! affects loop icache alignment
.smallwords:
	lduw	[%o1], %o3		! read word
.smallwordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o0]		! write word
	add	%o1, 8, %o1		! update SRC
	lduw	[%o1-4], %o3		! read word
	add	%o0, 8, %o0		! update DST
	bgu,pt	%ncc, .smallwords	! loop until done
	stw	%o3, [%o0-4]		! write word (delay slot)
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .smallexit	! check for completion
	nop
	cmp	%o2, 4			! check for 4 or more bytes left
	blt	.smallleft3		! if not, go to finish up
	nop
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	stw	%o3, [%o0]
	add	%o0, 4, %o0
	bnz,pt	%ncc, .smallleft3
	nop
	retl
	mov	%g1, %o0		! restore %o0

.smallword:
	! word-aligned copy of at most SMALL_MAX bytes
	subcc	%o2, 4, %o2		! update count
	bgu,pt	%ncc, .smallwordx
	lduw	[%o1], %o3		! read word (delay slot)
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .smallexit
	stw	%o3, [%o0]		! write word (delay slot)
	deccc	%o2			! reduce count for cc test
	ldub	[%o1+4], %o3		! load one byte
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+4]		! store one byte (delay slot)
	ldub	[%o1+5], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+5]		! store second byte (delay slot)
	ldub	[%o1+6], %o3		! load third byte
	stb	%o3, [%o0+6]		! store third byte
.smallexit:
	retl
	mov	%g1, %o0		! restore %o0

	.align 16
.medium:
	! More than SMALL_MAX bytes. Align DST to 8 bytes, then pick a
	! path based on the resulting SRC alignment.
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	prefetch [%o0 + (4 * BLOCK_SIZE)], #one_write
	neg	%o0, %o5
	neg	%o1, %o3
	andcc	%o5, 7, %o5		! bytes till DST 8 byte aligned
	and	%o3, 7, %o3		! bytes till SRC 8 byte aligned

	bz	%ncc, 2f
	sub	%o5, %o3, %o3		! -(bytes till SRC aligned after DST aligned)
					! o3={-7, -6, ... 7} o3>0 => SRC overaligned

	sub	%o2, %o5, %o2		! update count

1:
	ldub	[%o1], %o4
	deccc	%o5
	inc	%o1
	stb	%o4, [%o0]
	bgu,pt	%ncc, 1b
	inc	%o0			! (delay slot)

	! Now DST is 8-byte aligned.  o0, o1, o2 are current.

2:
	andcc	%o1, 0x3, %g0		! test alignment
	prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read
	bnz,pt	%ncc, .mediumsetup	! branch to skip aligned cases
					! if src, dst not aligned
	prefetch [%o0 + (1 * BLOCK_SIZE)], #one_write

/*
 * Handle all cases where src and dest are aligned on word
 * or long word boundaries.  Use unrolled loops for better
 * performance.  This option wins over standard large data
 * move when source and destination is in cache for medium
 * to short data moves.
 */
	andcc	%o1, 0x7, %g0		! test word alignment
	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
	bz,pt	%ncc, .medlword		! branch to long word aligned case
	prefetch [%o0 + (2 * BLOCK_SIZE)], #one_write
	cmp	%o2, ALIGNED4_FPCOPY_THRESHOLD	! limit to store buffer size
	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
	subcc	%o2, 15, %o2		! adjust length to allow cc test
	prefetch [%o0 + (3 * BLOCK_SIZE)], #one_write
					! for end of loop
	ble,pt	%ncc, .medw15		! skip big loop if less than 16
	.empty
.medw16:
	! word-aligned copy, 16 bytes per iteration
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ld	[%o1], %o4		! load
	subcc	%o2, 16, %o2		! decrement length count
	prefetch [%o0 + (4 * BLOCK_SIZE)], #one_write
	stw	%o4, [%o0]		! and store
	ld	[%o1+4], %o3		! a block of 16 bytes
	add	%o1, 16, %o1		! increase src ptr by 16
	stw	%o3, [%o0+4]
	ld	[%o1-8], %o4
	add	%o0, 16, %o0		! increase dst ptr by 16
	stw	%o4, [%o0-8]
	ld	[%o1-4], %o3
	bgu,pt	%ncc, .medw16		! repeat if at least 16 bytes left
	stw	%o3, [%o0-4]		! (delay slot)
.medw15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .medwexit		! exit if finished
	nop
	cmp	%o2, 8
	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
	nop
	ld	[%o1], %o4		! load 4 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	stw	%o4, [%o0]		! and store 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	ld	[%o1-4], %o3		! load 4 bytes
	add	%o0, 8, %o0		! increase dst ptr by 8
	stw	%o3, [%o0-4]		! and store 4 bytes
	bz	%ncc, .medwexit		! exit if finished
	nop
.medw7:					! count is ge 1, less than 8
	cmp	%o2, 3			! check for 4 bytes left
	ble,pt	%ncc, .medw3		! skip if 3 or fewer bytes left
	nop
	ld	[%o1], %o4		! load 4 bytes
	sub	%o2, 4, %o2		! decrease count by 4
	add	%o1, 4, %o1		! increase src ptr by 4
	stw	%o4, [%o0]		! and store 4 bytes
	add	%o0, 4, %o0		! increase dst ptr by 4
	tst	%o2			! check for zero bytes left
	bz	%ncc, .medwexit		! exit if finished
	nop
.medw3:					! count is known to be 1, 2, or 3
	deccc	%o2			! reduce count by one
	ldub	[%o1], %o3		! load one byte
	bz,pt	%ncc, .medwexit		! exit if last byte
	stb	%o3, [%o0]		! store one byte (delay slot)
	ldub	[%o1+1], %o3		! load second byte
	deccc	%o2			! reduce count by one
	bz,pt	%ncc, .medwexit		! exit if last byte
	stb	%o3, [%o0+1]		! store second byte (delay slot)
	ldub	[%o1+2], %o3		! load third byte
	stb	%o3, [%o0+2]		! store third byte
.medwexit:
	retl
	mov	%g1, %o0		! restore %o0

/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is between SMALL_MAX and ALIGNED8_FPCOPY_THRESHOLD
 * bytes.
 */

	.align	16
	nop
.medlword:				! long word aligned
					! if length > ALIGNED8_FPCOPY_THRESHOLD,
					! use the FP main loop instead
	cmp	%o2, ALIGNED8_FPCOPY_THRESHOLD
	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
	prefetch [%o0 + (3 * BLOCK_SIZE)], #one_write
	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%ncc, .medl31		! skip big loop if less than 32
	.empty
.medl32:
	! long-word-aligned copy, 32 bytes per iteration
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ldx	[%o1], %o4		! load
	subcc	%o2, 32, %o2		! decrement length count
	prefetch [%o0 + (4 * BLOCK_SIZE)], #one_read
	stx	%o4, [%o0]		! and store
	ldx	[%o1+8], %o3		! a block of 32 bytes
	add	%o1, 32, %o1		! increase src ptr by 32
	stx	%o3, [%o0+8]
	ldx	[%o1-16], %o4
	add	%o0, 32, %o0		! increase dst ptr by 32
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	bgu,pt	%ncc, .medl32		! repeat if at least 32 bytes left
	stx	%o3, [%o0-8]		! (delay slot)
.medl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%ncc, .medl15		! skip if 15 or fewer bytes left
	nop
	ldx	[%o1], %o4		! load and store 16 bytes
	add	%o1, 16, %o1		! increase src ptr by 16
	stx	%o4, [%o0]
	sub	%o2, 16, %o2		! decrease count by 16
	ldx	[%o1-8], %o3
	add	%o0, 16, %o0		! increase dst ptr by 16
	stx	%o3, [%o0-8]
.medl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .medwexit		! exit if finished
	nop
	cmp	%o2, 8
	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
	nop
	ldx	[%o1], %o4		! load 8 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	stx	%o4, [%o0]		! and store 8 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	bz	%ncc, .medwexit		! exit if finished
	add	%o0, 8, %o0		! increase dst ptr by 8 (delay slot)
	ba	.medw7
	nop

	.align	16
	nop
	nop
	nop
.mediumsetup:
	! SRC not word aligned relative to DST: use VIS faligndata.
	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
.mediumrejoin:
	rd	%fprs, %o4		! check for unused FPU

	add	%o1, 8, %o1		! prepare to round SRC upward

	sethi	%hi(0x1234567f), %o5	! For GSR.MASK
	or	%o5, 0x67f, %o5

	andcc	%o4, FPRS_FEF, %o4	! test FEF, fprs.du = fprs.dl = 0
	bz,a	%ncc, 3f		! annulled: wr only runs if FEF was clear
	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
3:
	cmp	%o2, MEDIUM_MAX
	bmask	%o5, %g0, %g0		! load GSR.MASK (for bshuffle below)

	! Compute o5 (number of bytes that need copying using the main loop).
	! First, compute for the medium case.
	! Then, if large case, o5 is replaced by count for block alignment.
	! Be careful not to read past end of SRC
	! Currently, o2 is the actual count remaining
	!            o3 is how much sooner we'll cross the alignment boundary
	!               in SRC compared to in DST
	!
	! Examples:  Let # denote bytes that should not be accessed
	!            Let x denote a byte already copied to align DST
	!            Let . and - denote bytes not yet copied
	!            Let | denote double alignment boundaries
	!
	!            DST:  ######xx|........|--------|..######   o2 = 18
	!                          o0
	!
	!  o3 = -3:  SRC:  ###xx...|.....---|-----..#|########   o5 = 8
	!                      o1
	!
	!  o3 =  0:  SRC:  ######xx|........|--------|..######   o5 = 16-8 = 8
	!                          o1
	!
	!  o3 = +1:  SRC:  #######x|x.......|.-------|-..#####   o5 = 16-8 = 8
	!                          o1

	or	%g0, -8, %o5
	alignaddr %o1, %g0, %o1		! set GSR.ALIGN and align o1

	movrlz	%o3, %g0, %o5		! subtract 8 from o2+o3 only if o3>=0
	add	%o5, %o2, %o5
	add	%o5, %o3, %o5

	bleu	%ncc, 4f		! cc still from the MEDIUM_MAX compare above
	andn	%o5, 7, %o5		! 8 byte aligned count (delay slot)
	neg	%o0, %o5		! 'large' case
	and	%o5, BLOCK_SIZE-1, %o5	! bytes till DST block aligned
4:
	brgez,a	%o3, .beginmedloop	! annulled: ldd only if SRC not behind DST
	ldd	[%o1-8], %d0

	add	%o1, %o3, %o1		! back up o1
5:
	ldda	[%o1]ASI_FL8_P, %d2	! load one byte into FP reg
	inc	%o1
	andcc	%o1, 7, %g0
	bnz	%ncc, 5b
	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2

.beginmedloop:
	tst	%o5
	bz	%ncc, .endmedloop
	sub	%o2, %o5, %o2		! update count for later (delay slot)

	! Main loop to write out doubles.  Note: o5 & 7 == 0

	ldd	[%o1], %d2
	subcc	%o5, 8, %o5		! update local count
	bz,pn	%ncc, 1f
	add	%o1, 8, %o1		! update SRC (delay slot)

.medloop:
	faligndata %d0, %d2, %d4
	ldd	[%o1], %d0
	subcc	%o5, 8, %o5		! update local count
	add	%o1, 16, %o1		! update SRC
	std	%d4, [%o0]
	bz,pn	%ncc, 2f
	faligndata %d2, %d0, %d6	! (delay slot)
	ldd	[%o1 - 8], %d2
	subcc	%o5, 8, %o5		! update local count
	std	%d6, [%o0 + 8]
	bnz,pt	%ncc, .medloop
	add	%o0, 16, %o0		! update DST (delay slot)

1:
	faligndata %d0, %d2, %d4
	fmovd	%d2, %d0
	std	%d4, [%o0]
	ba	.endmedloop
	add	%o0, 8, %o0		! (delay slot)

2:
	std	%d6, [%o0 + 8]
	sub	%o1, 8, %o1
	add	%o0, 16, %o0


.endmedloop:
	! Currently, o1 is pointing to the next double-aligned byte in SRC
	! The 8 bytes starting at [o1-8] are available in d0
	! At least one, and possibly all, of these need to be written.

	cmp	%o2, BLOCK_SIZE
	bgu	%ncc, .large		! otherwise, less than 16 bytes left

#if 0

	/* This code will use partial stores. */

	mov	%g0, %o5
	and	%o3, 7, %o3		! Number of bytes needed to completely
					! fill %d0 with good (unwritten) data.

	subcc	%o2, 8, %o2		! update count (maybe too much)
	movl	%ncc, %o2, %o5
	addcc	%o3, %o5, %o5		! extra bytes we can stuff into %d0
	sub	%o3, %o5, %o3		! update o3 (# bad bytes in %d0)

	bz	%ncc, 2f
	alignaddr %o3, %g0, %g0		! set GSR.ALIGN (delay slot)

1:
	deccc	%o5
	ldda	[%o1]ASI_FL8_P, %d2
	inc	%o1
	bgu	%ncc, 1b
	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2

2:
	not	%o3
	faligndata %d0, %d0, %d0	! shift bytes to the left
	and	%o3, 7, %o3		! last byte to be stored in [%o0+%o3]
	edge8n	%g0, %o3, %o5
	stda	%d0, [%o0]%o5, ASI_PST8_P
	brlez	%o2, .mediumexit
	add	%o0, %o3, %o0		! update DST to last stored byte
3:
	inc	%o0
	deccc	%o2
	ldub	[%o1], %o3
	stb	%o3, [%o0]
	bgu	%ncc, 3b
	inc	%o1

#else

	andcc	%o3, 7, %o5		! Number of bytes needed to completely
					! fill %d0 with good (unwritten) data.
	bz	%ncc, 2f
	sub	%o5, 8, %o3		! -(number of good bytes in %d0)
	cmp	%o2, 8
	bl,a	%ncc, 3f		! Not enough bytes to fill %d0
	add	%o1, %o3, %o1		! Back up %o1 (annulled delay slot)

1:
	deccc	%o5
	ldda	[%o1]ASI_FL8_P, %d2
	inc	%o1
	bgu	%ncc, 1b
	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2

2:
	subcc	%o2, 8, %o2
	std	%d0, [%o0]
	bz	%ncc, .mediumexit
	add	%o0, 8, %o0		! (delay slot)
3:
	ldub	[%o1], %o3
	deccc	%o2
	inc	%o1
	stb	%o3, [%o0]
	bgu	%ncc, 3b
	inc	%o0			! (delay slot)
#endif

.mediumexit:
	wr	%o4, %g0, %fprs		! fprs = o4 restore fprs
	retl
	mov	%g1, %o0


	.align ICACHE_LINE_SIZE
.large:

	! %o0 I/O DST is 64-byte aligned
	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
	! %d0 I/O already loaded with SRC data from [%o1-8]
	! %o2 I/O count (number of bytes that need to be written)
	! %o3 I   Not written.  If zero, then SRC is double aligned.
	! %o4 I   Not written.  Holds fprs.
	! %o5 O   The number of doubles that remain to be written.

	! Load the rest of the current block
	! Recall that %o1 is further into SRC than %o0 is into DST

	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	prefetch [%o1 + (8 * BLOCK_SIZE)], #one_read

	set	BST_THRESHOLD, %o5
	cmp	%o2, %o5
	bgu,pn	%icc, .xlarge		! very large copies bypass the cache
	prefetch [%o1 + (12 * BLOCK_SIZE)], #one_read

	ldd	[%o1], %f2
	ldd	[%o1 + 0x8], %f4
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x10], %f6
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x18], %f8
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x20], %f10
	or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
	faligndata %f6, %f8, %f38
	prefetch [%o1 + (16 * BLOCK_SIZE)], #one_read
	ldd	[%o1 + 0x28], %f12
	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0 (needed later)
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x30], %f14
	faligndata %f10, %f12, %f42
	ldd	[%o1 + 0x38], %f0
	sub	%o2, BLOCK_SIZE, %o2	! update count
	add	%o1, BLOCK_SIZE, %o1	! update SRC

	! Main loop.  Write previous block.  Load rest of current block.
	! Some bytes will be loaded that won't yet be written.
1:
	ldd	[%o1], %f2
	faligndata %f12, %f14, %f44
	ldd	[%o1 + 0x8], %f4
	faligndata %f14, %f0, %f46
	std	%f32, [%o0]
	std	%f34, [%o0+8]
	std	%f36, [%o0+16]
	std	%f38, [%o0+24]
	std	%f40, [%o0+32]
	std	%f42, [%o0+40]
	std	%f44, [%o0+48]
	std	%f46, [%o0+56]
	sub	%o2, BLOCK_SIZE, %o2	! update count
	prefetch [%o1 + (24 * BLOCK_SIZE) + BLOCK_SIZE], #one_read
	add	%o0, BLOCK_SIZE, %o0	! update DST
	ldd	[%o1 + 0x10], %f6
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x18], %f8
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x20], %f10
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x28], %f12
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x30], %f14
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x38], %f0
	faligndata %f10, %f12, %f42
	prefetch [%o1 + (18 * BLOCK_SIZE)], #one_read
	cmp	%o2, BLOCK_SIZE + 8
	prefetch [%o0 + (18 * BLOCK_SIZE)], #one_write
	bgu,pt	%ncc, 1b
	add	%o1, BLOCK_SIZE, %o1	! update SRC (delay slot)
	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P	! store 64 bytes, bypass cache
	cmp	%o2, BLOCK_SIZE
	bne	%ncc, 2f		! exactly 1 block remaining?
	add	%o0, BLOCK_SIZE, %o0	! update DST (delay slot)
	brz,a	%o3, 3f			! is SRC double aligned?
	ldd	[%o1], %f2		! (annulled delay slot)

2:
	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8
	add	%o5, %o3, %o5

	membar	#StoreLoad|#StoreStore	! order block stores before medloop stores

	ba	.beginmedloop
	andn	%o5, 7, %o5		! 8 byte aligned count (delay slot)


	! This is when there is exactly 1 block remaining and SRC is aligned
3:
	ldd	[%o1 + 0x8], %f4
	ldd	[%o1 + 0x10], %f6
	fsrc1	%f0, %f32
	ldd	[%o1 + 0x18], %f8
	fsrc1	%f2, %f34
	ldd	[%o1 + 0x20], %f10
	fsrc1	%f4, %f36
	ldd	[%o1 + 0x28], %f12
	fsrc1	%f6, %f38
	ldd	[%o1 + 0x30], %f14
	fsrc1	%f8, %f40
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [%o0]ASI_BLK_P
	membar	#StoreLoad|#StoreStore
	wr	%o4, 0, %fprs		! restore caller's fprs
	retl
	mov	%g1, %o0		! return original dst


	.align	16
	! two nops here causes loop starting at 1f below to be
	! on a cache line boundary, improving performance
	nop
	nop
.xlarge:
	! %o0 I/O DST is 64-byte aligned
	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
	! %d0 I/O already loaded with SRC data from [%o1-8]
	! %o2 I/O count (number of bytes that need to be written)
	! %o3 I   Not written.  If zero, then SRC is double aligned.
	! %o4 I   Not written.  Holds fprs.
	! %o5 O   The number of doubles that remain to be written.

	! Load the rest of the current block
	! Recall that %o1 is further into SRC than %o0 is into DST

	ldd	[%o1], %f2
	ldd	[%o1 + 0x8], %f4
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x10], %f6
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x18], %f8
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x20], %f10
	or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x28], %f12
	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0 (needed later)
	prefetch [%o1 + (16 * BLOCK_SIZE)], #one_read
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x30], %f14
	faligndata %f10, %f12, %f42
	ldd	[%o1 + 0x38], %f0
	prefetch [%o1 + (17 * BLOCK_SIZE)], #one_read
	sub	%o2, BLOCK_SIZE, %o2	! update count
	add	%o1, BLOCK_SIZE, %o1	! update SRC

	! This point is 32-byte aligned since 24 instructions appear since
	! the previous alignment directive.


	! Main loop.  Write previous block.  Load rest of current block.
	! Some bytes will be loaded that won't yet be written.
1:
	ldd	[%o1], %f2
	faligndata %f12, %f14, %f44
	ldd	[%o1 + 0x8], %f4
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P	! 64-byte block store, bypass cache
	sub	%o2, BLOCK_SIZE, %o2	! update count
	ldd	[%o1 + 0x10], %f6
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x18], %f8
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x20], %f10
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x28], %f12
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x30], %f14
	prefetch [%o1 + (2 * BLOCK_SIZE)], #n_reads
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x38], %f0
	faligndata %f10, %f12, %f42
	prefetch [%o1 + (25 * BLOCK_SIZE)], #one_read
	add	%o0, BLOCK_SIZE, %o0	! update DST
	cmp	%o2, BLOCK_SIZE + 8
	! second prefetch important to correct for occasional dropped
	prefetch [%o1 + (18 * BLOCK_SIZE)], #one_read
	bgu,pt	%ncc, 1b
	add	%o1, BLOCK_SIZE, %o1	! update SRC (delay slot)

	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P	! store 64 bytes, bypass cache
	cmp	%o2, BLOCK_SIZE
	bne	%ncc, 2f		! exactly 1 block remaining?
	add	%o0, BLOCK_SIZE, %o0	! update DST (delay slot)
	brz,a	%o3, 3f			! is SRC double aligned?
	ldd	[%o1], %f2		! (annulled delay slot)

2:
	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8
	add	%o5, %o3, %o5

	membar	#StoreLoad|#StoreStore	! order block stores before medloop stores

	ba	.beginmedloop
	andn	%o5, 7, %o5		! 8 byte aligned count (delay slot)


	! This is when there is exactly 1 block remaining and SRC is aligned
3:
	ldd	[%o1 + 0x8], %f4
	ldd	[%o1 + 0x10], %f6
	fsrc1	%f0, %f32
	ldd	[%o1 + 0x18], %f8
	fsrc1	%f2, %f34
	ldd	[%o1 + 0x20], %f10
	fsrc1	%f4, %f36
	ldd	[%o1 + 0x28], %f12
	fsrc1	%f6, %f38
	ldd	[%o1 + 0x30], %f14
	fsrc1	%f8, %f40
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [%o0]ASI_BLK_P
	membar	#StoreLoad|#StoreStore
	wr	%o4, 0, %fprs		! restore caller's fprs
	retl
	mov	%g1, %o0		! return original dst

	SET_SIZE(memcpy)