/* U3memcpy.S: UltraSparc-III optimized memcpy.
 *
 * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
 */

#ifdef __KERNEL__
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE	%g7
#else
#define ASI_BLK_P 0xf0
#define FPRS_FEF  0x04
#ifdef MEMCPY_DEBUG
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
		     clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif
#define GLOBAL_SPARE	%g5
#endif

#ifndef EX_LD
#define EX_LD(x)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x)	x
#endif

#ifndef EX_ST
#define EX_ST(x)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

#ifndef STORE_BLK
#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	U3memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	/* Special/non-trivial issues of this code:
	 *
	 * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
	 * 2) Only low 32 FPU registers are used so that only the
	 *    lower half of the FPU register set is dirtied by this
	 *    code.  This is especially important in the kernel.
	 * 3) This code never prefetches cachelines past the end
	 *    of the source buffer.
	 */

	.text
	.align	64

	/* The cheetah's flexible spine, oversized liver, enlarged heart,
	 * slender muscular body, and claws make it the swiftest hunter
	 * in Africa and the fastest animal on land.  Can reach speeds
	 * of up to 2.4GB per second.
	 */

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
	PREAMBLE
	mov		%o0, %o4
	cmp		%o2, 0
	be,pn		%XCC, 85f
	 or		%o0, %o1, %o3
	cmp		%o2, 16
	blu,a,pn	%XCC, 80f
	 or		%o3, %o2, %o3

	cmp		%o2, (3 * 64)
	blu,pt		%XCC, 70f
	 andcc		%o3, 0x7, %g0

	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  We must preserve
	 * o5 from here until we hit VISExitHalf.
	 */
	VISEntryHalf

	/* Is 'dst' already aligned on a 64-byte boundary? */
	andcc		%o0, 0x3f, %g2
	be,pt		%XCC, 2f

	/* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
	 * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
	 * subtract this from 'len'.
	 */
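	/* Worked example (illustrative, not part of the original): if
	 * dst & 0x3f == 0x0b, then %g2 = -(0x0b - 0x40) = 0x35 and
	 * 'len' shrinks by 0x35.  The low three bits (%g1 = 5) are
	 * copied one byte at a time below; the remaining
	 * %g2 & 0x38 = 0x30 bytes go eight at a time via faligndata.
	 */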
	 sub		%o0, %o1, GLOBAL_SPARE
	sub		%g2, 0x40, %g2
	sub		%g0, %g2, %g2
	sub		%o2, %g2, %o2
	andcc		%g2, 0x7, %g1
	be,pt		%icc, 2f
	 and		%g2, 0x38, %g2

1:	subcc		%g1, 0x1, %g1
	EX_LD_FP(LOAD(ldub, %o1 + 0x00, %o3))
	EX_ST_FP(STORE(stb, %o3, %o1 + GLOBAL_SPARE))
	bgu,pt		%XCC, 1b
	 add		%o1, 0x1, %o1

	add		%o1, GLOBAL_SPARE, %o0

2:	cmp		%g2, 0x0
	and		%o1, 0x7, %g1
	be,pt		%icc, 3f
	 alignaddr	%o1, %g0, %o1

	EX_LD_FP(LOAD(ldd, %o1, %f4))
1:	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f6))
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f4, %f6, %f0
	EX_ST_FP(STORE(std, %f0, %o0))
	be,pn		%icc, 3f
	 add		%o0, 0x8, %o0

	EX_LD_FP(LOAD(ldd, %o1 + 0x8, %f4))
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f6, %f4, %f2
	EX_ST_FP(STORE(std, %f2, %o0))
	bne,pt		%icc, 1b
	 add		%o0, 0x8, %o0

3:	LOAD(prefetch, %o1 + 0x000, #one_read)
	LOAD(prefetch, %o1 + 0x040, #one_read)
	andn		%o2, (0x40 - 1), GLOBAL_SPARE
	LOAD(prefetch, %o1 + 0x080, #one_read)
	LOAD(prefetch, %o1 + 0x0c0, #one_read)
	LOAD(prefetch, %o1 + 0x100, #one_read)
	EX_LD_FP(LOAD(ldd, %o1 + 0x000, %f0))
	LOAD(prefetch, %o1 + 0x140, #one_read)
	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2))
	LOAD(prefetch, %o1 + 0x180, #one_read)
	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4))
	LOAD(prefetch, %o1 + 0x1c0, #one_read)
	faligndata	%f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6))
	faligndata	%f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8))
	faligndata	%f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10))
	faligndata	%f6, %f8, %f22

	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12))
	faligndata	%f8, %f10, %f24
	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14))
	faligndata	%f10, %f12, %f26
	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0))

	subcc		GLOBAL_SPARE, 0x80, GLOBAL_SPARE
	add		%o1, 0x40, %o1
	bgu,pt		%XCC, 1f
	 srl		GLOBAL_SPARE, 6, %o3
	ba,pt		%xcc, 2f
	 nop

	.align	64
1:
	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2))
	faligndata	%f12, %f14, %f28
	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4))
	faligndata	%f14, %f0, %f30
	EX_ST_FP(STORE_BLK(%f16, %o0))
	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6))
	faligndata	%f0, %f2, %f16
	add		%o0, 0x40, %o0

	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8))
	faligndata	%f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10))
	faligndata	%f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12))
	subcc		%o3, 0x01, %o3
	faligndata	%f6, %f8, %f22
	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14))

	faligndata	%f8, %f10, %f24
	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0))
	LOAD(prefetch, %o1 + 0x1c0, #one_read)
	faligndata	%f10, %f12, %f26
	bg,pt		%XCC, 1b
	 add		%o1, 0x40, %o1
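	/* Rough sketch (illustrative, not part of the original) of the
	 * software-pipelined loop above, one iteration per 64-byte block:
	 *
	 *	while (blocks-- > 0) {
	 *		load the next 64 source bytes into %f2-%f14, %f0;
	 *		finish realigning the previous block (%f28, %f30);
	 *		block-store the previous 64 bytes from %f16-%f30;
	 *		start realigning the new data into %f16-%f26;
	 *		prefetch 0x1c0 bytes ahead;
	 *	}
	 */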
	/* Finally we copy the last full 64-byte block. */
2:
	EX_LD_FP(LOAD(ldd, %o1 + 0x008, %f2))
	faligndata	%f12, %f14, %f28
	EX_LD_FP(LOAD(ldd, %o1 + 0x010, %f4))
	faligndata	%f14, %f0, %f30
	EX_ST_FP(STORE_BLK(%f16, %o0))
	EX_LD_FP(LOAD(ldd, %o1 + 0x018, %f6))
	faligndata	%f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %o1 + 0x020, %f8))
	faligndata	%f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o1 + 0x028, %f10))
	faligndata	%f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o1 + 0x030, %f12))
	faligndata	%f6, %f8, %f22
	EX_LD_FP(LOAD(ldd, %o1 + 0x038, %f14))
	faligndata	%f8, %f10, %f24
	cmp		%g1, 0
	be,pt		%XCC, 1f
	 add		%o0, 0x40, %o0
	EX_LD_FP(LOAD(ldd, %o1 + 0x040, %f0))
1:	faligndata	%f10, %f12, %f26
	faligndata	%f12, %f14, %f28
	faligndata	%f14, %f0, %f30
	EX_ST_FP(STORE_BLK(%f16, %o0))
	add		%o0, 0x40, %o0
	add		%o1, 0x40, %o1
	membar		#Sync

	/* Now we copy the (len modulo 64) bytes at the end.
	 * Note how we borrow the %f0 loaded above.
	 *
	 * Also notice how this code is careful not to perform a
	 * load past the end of the src buffer.
	 */
	and		%o2, 0x3f, %o2
	andcc		%o2, 0x38, %g2
	be,pn		%XCC, 2f
	 subcc		%g2, 0x8, %g2
	be,pn		%XCC, 2f
	 cmp		%g1, 0

	sub		%o2, %g2, %o2
	be,a,pt		%XCC, 1f
	 EX_LD_FP(LOAD(ldd, %o1 + 0x00, %f0))

1:	EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f2))
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f0, %f2, %f8
	EX_ST_FP(STORE(std, %f8, %o0))
	be,pn		%XCC, 2f
	 add		%o0, 0x8, %o0
	EX_LD_FP(LOAD(ldd, %o1 + 0x08, %f0))
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f2, %f0, %f8
	EX_ST_FP(STORE(std, %f8, %o0))
	bne,pn		%XCC, 1b
	 add		%o0, 0x8, %o0
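	/* Illustrative trace (not part of the original): if
	 * len % 64 == 0x1d, %g2 starts at 0x18 and is pre-decremented
	 * to 0x10, so the loop above performs two 8-byte faligndata
	 * copies and leaves %o2 = 0xd bytes for the tail at 2: below
	 * (an ldx, an lduw and an ldub when the source is 8-byte
	 * aligned; a byte-at-a-time loop otherwise).
	 */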
282 */ 2832: 284 cmp %o2, 0 285 add %o1, %g1, %o1 286 VISExitHalf 287 be,pn %XCC, 85f 288 sub %o0, %o1, %o3 289 290 andcc %g1, 0x7, %g0 291 bne,pn %icc, 90f 292 andcc %o2, 0x8, %g0 293 be,pt %icc, 1f 294 nop 295 EX_LD(LOAD(ldx, %o1, %o5)) 296 EX_ST(STORE(stx, %o5, %o1 + %o3)) 297 add %o1, 0x8, %o1 298 2991: andcc %o2, 0x4, %g0 300 be,pt %icc, 1f 301 nop 302 EX_LD(LOAD(lduw, %o1, %o5)) 303 EX_ST(STORE(stw, %o5, %o1 + %o3)) 304 add %o1, 0x4, %o1 305 3061: andcc %o2, 0x2, %g0 307 be,pt %icc, 1f 308 nop 309 EX_LD(LOAD(lduh, %o1, %o5)) 310 EX_ST(STORE(sth, %o5, %o1 + %o3)) 311 add %o1, 0x2, %o1 312 3131: andcc %o2, 0x1, %g0 314 be,pt %icc, 85f 315 nop 316 EX_LD(LOAD(ldub, %o1, %o5)) 317 ba,pt %xcc, 85f 318 EX_ST(STORE(stb, %o5, %o1 + %o3)) 319 320 .align 64 32170: /* 16 < len <= 64 */ 322 bne,pn %XCC, 75f 323 sub %o0, %o1, %o3 324 32572: 326 andn %o2, 0xf, GLOBAL_SPARE 327 and %o2, 0xf, %o2 3281: subcc GLOBAL_SPARE, 0x10, GLOBAL_SPARE 329 EX_LD(LOAD(ldx, %o1 + 0x00, %o5)) 330 EX_LD(LOAD(ldx, %o1 + 0x08, %g1)) 331 EX_ST(STORE(stx, %o5, %o1 + %o3)) 332 add %o1, 0x8, %o1 333 EX_ST(STORE(stx, %g1, %o1 + %o3)) 334 bgu,pt %XCC, 1b 335 add %o1, 0x8, %o1 33673: andcc %o2, 0x8, %g0 337 be,pt %XCC, 1f 338 nop 339 sub %o2, 0x8, %o2 340 EX_LD(LOAD(ldx, %o1, %o5)) 341 EX_ST(STORE(stx, %o5, %o1 + %o3)) 342 add %o1, 0x8, %o1 3431: andcc %o2, 0x4, %g0 344 be,pt %XCC, 1f 345 nop 346 sub %o2, 0x4, %o2 347 EX_LD(LOAD(lduw, %o1, %o5)) 348 EX_ST(STORE(stw, %o5, %o1 + %o3)) 349 add %o1, 0x4, %o1 3501: cmp %o2, 0 351 be,pt %XCC, 85f 352 nop 353 ba,pt %xcc, 90f 354 nop 355 35675: 357 andcc %o0, 0x7, %g1 358 sub %g1, 0x8, %g1 359 be,pn %icc, 2f 360 sub %g0, %g1, %g1 361 sub %o2, %g1, %o2 362 3631: subcc %g1, 1, %g1 364 EX_LD(LOAD(ldub, %o1, %o5)) 365 EX_ST(STORE(stb, %o5, %o1 + %o3)) 366 bgu,pt %icc, 1b 367 add %o1, 1, %o1 368 3692: add %o1, %o3, %o0 370 andcc %o1, 0x7, %g1 371 bne,pt %icc, 8f 372 sll %g1, 3, %g1 373 374 cmp %o2, 16 375 bgeu,pt %icc, 72b 376 nop 377 ba,a,pt %xcc, 73b 378 3798: mov 64, %o3 380 andn %o1, 0x7, %o1 381 EX_LD(LOAD(ldx, %o1, %g2)) 382 sub %o3, %g1, %o3 383 andn %o2, 0x7, GLOBAL_SPARE 384 sllx %g2, %g1, %g2 3851: EX_LD(LOAD(ldx, %o1 + 0x8, %g3)) 386 subcc GLOBAL_SPARE, 0x8, GLOBAL_SPARE 387 add %o1, 0x8, %o1 388 srlx %g3, %o3, %o5 389 or %o5, %g2, %o5 390 EX_ST(STORE(stx, %o5, %o0)) 391 add %o0, 0x8, %o0 392 bgu,pt %icc, 1b 393 sllx %g3, %g1, %g2 394 395 srl %g1, 3, %g1 396 andcc %o2, 0x7, %o2 397 be,pn %icc, 85f 398 add %o1, %g1, %o1 399 ba,pt %xcc, 90f 400 sub %o0, %o1, %o3 401 402 .align 64 40380: /* 0 < len <= 16 */ 404 andcc %o3, 0x3, %g0 405 bne,pn %XCC, 90f 406 sub %o0, %o1, %o3 407 4081: 409 subcc %o2, 4, %o2 410 EX_LD(LOAD(lduw, %o1, %g1)) 411 EX_ST(STORE(stw, %g1, %o1 + %o3)) 412 bgu,pt %XCC, 1b 413 add %o1, 4, %o1 414 41585: retl 416 mov EX_RETVAL(%o4), %o0 417 418 .align 32 41990: 420 subcc %o2, 1, %o2 421 EX_LD(LOAD(ldub, %o1, %g1)) 422 EX_ST(STORE(stb, %g1, %o1 + %o3)) 423 bgu,pt %XCC, 90b 424 add %o1, 1, %o1 425 retl 426 mov EX_RETVAL(%o4), %o0 427 428 .size FUNC_NAME, .-FUNC_NAME 429