/* U3memcpy.S: UltraSparc-III optimized memcpy.
 *
 * Copyright (C) 1999, 2000, 2004 David S. Miller (davem@redhat.com)
 */

#ifdef __KERNEL__
#include <asm/visasm.h>
#include <asm/asi.h>
/* In-kernel build: %g7 is the free scratch global for this code. */
#define GLOBAL_SPARE	%g7
#else
/* Userspace build: provide the kernel definitions locally. */
#define ASI_BLK_P 0xf0
#define FPRS_FEF  0x04
#ifdef MEMCPY_DEBUG
/* Debug variant also clears the scratch globals and the condition
 * codes so stale values cannot be silently relied upon.
 */
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
		     clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
/* Enable the FPU (FPRS.FEF) for the VIS block-move section; the old
 * %fprs value is kept in %o5 and restored by VISExitHalf.
 */
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif
#define GLOBAL_SPARE	%g5
#endif

/* EX_LD/EX_ST wrap every load/store; the identity defaults below may be
 * overridden by an including wrapper (presumably the fault-handling
 * user-copy variants — they are not visible in this file).
 */
#ifndef EX_LD
#define EX_LD(x)	x
#endif

#ifndef EX_ST
#define EX_ST(x)	x
#endif

/* Transform applied to the return value; identity by default. */
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

/* Overridable load/store primitives (type = opcode mnemonic). */
#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

/* 64-byte block store through the block-transfer ASI. */
#ifndef STORE_BLK
#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	U3memcpy
#endif

/* Hook for wrappers to insert code at function entry; empty by default. */
#ifndef PREAMBLE
#define PREAMBLE
#endif

/* 64-bit condition-code register used for branches; overridable. */
#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	/* Special/non-trivial issues of this code:
	 *
	 * 1) %o5 is preserved from VISEntryHalf to VISExitHalf
	 * 2) Only low 32 FPU registers are used so that only the
	 *    lower half of the FPU register set is dirtied by this
	 *    code.  This is especially important in the kernel.
	 * 3) This code never prefetches cachelines past the end
	 *    of the source buffer.
	 */

	.text
	.align	64

	/* The cheetah's flexible spine, oversized liver, enlarged heart,
	 * slender muscular body, and claws make it the swiftest hunter
	 * in Africa and the fastest animal on land.  Can reach speeds
	 * of up to 2.4GB per second.
	 */

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len; returns original dst in %o0 */
	/* Sanity check: trap (software trap 5) if len >= 2^31 —
	 * such a length can only be a caller bug.
	 */
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
	PREAMBLE
	mov		%o0, %o4	/* save dst for the return value */
	cmp		%o2, 0
	be,pn		%XCC, 85f	/* len == 0: nothing to do */
	 or		%o0, %o1, %o3	/* %o3 = dst | src (alignment probe) */
	cmp		%o2, 16
	blu,a,pn	%XCC, 80f	/* tiny copy: 0 < len < 16 */
	 or		%o3, %o2, %o3

	cmp		%o2, (3 * 64)
	blu,pt		%XCC, 70f	/* medium copy: 16 <= len < 192 */
	 andcc		%o3, 0x7, %g0

	/* Large copy (len >= 192): use the VIS block-move path.
	 *
	 * Clobbers o5/g1/g2/g3/g7/icc/xcc.  We must preserve
	 * o5 from here until we hit VISExitHalf.
	 */
	VISEntryHalf

	/* Is 'dst' already aligned on an 64-byte boundary? */
	andcc		%o0, 0x3f, %g2
	be,pt		%XCC, 2f

	/* Compute abs((dst & 0x3f) - 0x40) into %g2.  This is the number
	 * of bytes to copy to make 'dst' 64-byte aligned.  We pre-
	 * subtract this from 'len'.
	 */
	sub		%o0, %o1, GLOBAL_SPARE	/* GLOBAL_SPARE = dst - src */
	sub		%g2, 0x40, %g2
	sub		%g0, %g2, %g2
	sub		%o2, %g2, %o2
	andcc		%g2, 0x7, %g1	/* %g1 = sub-doubleword head bytes */
	be,pt		%icc, 2f
	 and		%g2, 0x38, %g2	/* %g2 = whole doublewords in head */

	/* Byte loop: bring src up to an 8-byte boundary.  Stores go
	 * through src + (dst - src), so only %o1 needs advancing.
	 */
1:	subcc		%g1, 0x1, %g1
	EX_LD(LOAD(ldub, %o1 + 0x00, %o3))
	EX_ST(STORE(stb, %o3, %o1 + GLOBAL_SPARE))
	bgu,pt		%XCC, 1b
	 add		%o1, 0x1, %o1

	add		%o1, GLOBAL_SPARE, %o0	/* resync dst with advanced src */

	/* Doubleword loop: finish aligning dst to 64 bytes.  %g1 is
	 * loaded with (src & 0x7) BEFORE alignaddr rounds %o1 down;
	 * it is needed again for the tail fixup at the end.
	 */
2:	cmp		%g2, 0x0
	and		%o1, 0x7, %g1
	be,pt		%icc, 3f
	 alignaddr	%o1, %g0, %o1

	/* Software-pipelined pair of 8-byte copies using faligndata to
	 * merge across the source misalignment.
	 */
	EX_LD(LOAD(ldd, %o1, %f4))
1:	EX_LD(LOAD(ldd, %o1 + 0x8, %f6))
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f4, %f6, %f0
	EX_ST(STORE(std, %f0, %o0))
	be,pn		%icc, 3f
	 add		%o0, 0x8, %o0

	EX_LD(LOAD(ldd, %o1 + 0x8, %f4))
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f6, %f4, %f2
	EX_ST(STORE(std, %f2, %o0))
	bne,pt		%icc, 1b
	 add		%o0, 0x8, %o0

	/* Main block loop setup: dst is now 64-byte aligned.
	 * GLOBAL_SPARE = len rounded down to a 64-byte multiple.
	 * Prefetch ahead, then prime %f0-%f14 with the first 64+8
	 * source bytes and fold the first results into %f16-%f26.
	 */
3:	LOAD(prefetch, %o1 + 0x000, #one_read)
	LOAD(prefetch, %o1 + 0x040, #one_read)
	andn		%o2, (0x40 - 1), GLOBAL_SPARE
	LOAD(prefetch, %o1 + 0x080, #one_read)
	LOAD(prefetch, %o1 + 0x0c0, #one_read)
	LOAD(prefetch, %o1 + 0x100, #one_read)
	EX_LD(LOAD(ldd, %o1 + 0x000, %f0))
	LOAD(prefetch, %o1 + 0x140, #one_read)
	EX_LD(LOAD(ldd, %o1 + 0x008, %f2))
	LOAD(prefetch, %o1 + 0x180, #one_read)
	EX_LD(LOAD(ldd, %o1 + 0x010, %f4))
	LOAD(prefetch, %o1 + 0x1c0, #one_read)
	faligndata	%f0, %f2, %f16
	EX_LD(LOAD(ldd, %o1 + 0x018, %f6))
	faligndata	%f2, %f4, %f18
	EX_LD(LOAD(ldd, %o1 + 0x020, %f8))
	faligndata	%f4, %f6, %f20
	EX_LD(LOAD(ldd, %o1 + 0x028, %f10))
	faligndata	%f6, %f8, %f22

	EX_LD(LOAD(ldd, %o1 + 0x030, %f12))
	faligndata	%f8, %f10, %f24
	EX_LD(LOAD(ldd, %o1 + 0x038, %f14))
	faligndata	%f10, %f12, %f26
	EX_LD(LOAD(ldd, %o1 + 0x040, %f0))

	/* Reserve two 64-byte blocks (0x80): one in flight in the
	 * pipeline, one copied by the epilogue at 2f.  %o3 = number
	 * of remaining loop iterations.
	 */
	subcc		GLOBAL_SPARE, 0x80, GLOBAL_SPARE
	add		%o1, 0x40, %o1
	bgu,pt		%XCC, 1f
	 srl		GLOBAL_SPARE, 6, %o3
	ba,pt		%xcc, 2f
	 nop

	.align		64
	/* Steady-state loop: each pass loads the next 64 source bytes,
	 * aligns the previous block into %f16-%f30, and block-stores it.
	 */
1:
	EX_LD(LOAD(ldd, %o1 + 0x008, %f2))
	faligndata	%f12, %f14, %f28
	EX_LD(LOAD(ldd, %o1 + 0x010, %f4))
	faligndata	%f14, %f0, %f30
	EX_ST(STORE_BLK(%f16, %o0))
	EX_LD(LOAD(ldd, %o1 + 0x018, %f6))
	faligndata	%f0, %f2, %f16
	add		%o0, 0x40, %o0

	EX_LD(LOAD(ldd, %o1 + 0x020, %f8))
	faligndata	%f2, %f4, %f18
	EX_LD(LOAD(ldd, %o1 + 0x028, %f10))
	faligndata	%f4, %f6, %f20
	EX_LD(LOAD(ldd, %o1 + 0x030, %f12))
	subcc		%o3, 0x01, %o3
	faligndata	%f6, %f8, %f22
	EX_LD(LOAD(ldd, %o1 + 0x038, %f14))

	faligndata	%f8, %f10, %f24
	EX_LD(LOAD(ldd, %o1 + 0x040, %f0))
	LOAD(prefetch, %o1 + 0x1c0, #one_read)
	faligndata	%f10, %f12, %f26
	bg,pt		%XCC, 1b
	 add		%o1, 0x40, %o1

	/* Finally we copy the last full 64-byte block.  The extra
	 * 8-byte load at +0x40 is skipped when src was 8-byte aligned
	 * (%g1 == 0) so we never read past the end of the buffer.
	 */
2:
	EX_LD(LOAD(ldd, %o1 + 0x008, %f2))
	faligndata	%f12, %f14, %f28
	EX_LD(LOAD(ldd, %o1 + 0x010, %f4))
	faligndata	%f14, %f0, %f30
	EX_ST(STORE_BLK(%f16, %o0))
	EX_LD(LOAD(ldd, %o1 + 0x018, %f6))
	faligndata	%f0, %f2, %f16
	EX_LD(LOAD(ldd, %o1 + 0x020, %f8))
	faligndata	%f2, %f4, %f18
	EX_LD(LOAD(ldd, %o1 + 0x028, %f10))
	faligndata	%f4, %f6, %f20
	EX_LD(LOAD(ldd, %o1 + 0x030, %f12))
	faligndata	%f6, %f8, %f22
	EX_LD(LOAD(ldd, %o1 + 0x038, %f14))
	faligndata	%f8, %f10, %f24
	cmp		%g1, 0
	be,pt		%XCC, 1f
	 add		%o0, 0x40, %o0
	EX_LD(LOAD(ldd, %o1 + 0x040, %f0))
1:	faligndata	%f10, %f12, %f26
	faligndata	%f12, %f14, %f28
	faligndata	%f14, %f0, %f30
	EX_ST(STORE_BLK(%f16, %o0))
	add		%o0, 0x40, %o0
	add		%o1, 0x40, %o1
	membar		#Sync	/* drain block stores before normal stores */

	/* Now we copy the (len modulo 64) bytes at the end.
	 * Note how we borrow the %f0 loaded above.
	 *
	 * Also notice how this code is careful not to perform a
	 * load past the end of the src buffer.
	 */
	and		%o2, 0x3f, %o2		/* %o2 = tail length (< 64) */
	andcc		%o2, 0x38, %g2		/* %g2 = whole 8-byte words in tail */
	be,pn		%XCC, 2f
	 subcc		%g2, 0x8, %g2
	be,pn		%XCC, 2f
	 cmp		%g1, 0

	sub		%o2, %g2, %o2
	be,a,pt		%XCC, 1f
	 EX_LD(LOAD(ldd, %o1 + 0x00, %f0))	/* only when src 8-byte aligned */

1:	EX_LD(LOAD(ldd, %o1 + 0x08, %f2))
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f0, %f2, %f8
	EX_ST(STORE(std, %f8, %o0))
	be,pn		%XCC, 2f
	 add		%o0, 0x8, %o0
	EX_LD(LOAD(ldd, %o1 + 0x08, %f0))
	add		%o1, 0x8, %o1
	subcc		%g2, 0x8, %g2
	faligndata	%f2, %f0, %f8
	EX_ST(STORE(std, %f8, %o0))
	bne,pn		%XCC, 1b
	 add		%o0, 0x8, %o0

	/* If anything is left, we copy it one byte at a time.
	 * Note that %g1 is (src & 0x7) saved above before the
	 * alignaddr was performed; adding it back restores the true
	 * (unaligned) source address.
	 */
2:
	cmp		%o2, 0
	add		%o1, %g1, %o1
	VISExitHalf
	be,pn		%XCC, 85f
	 sub		%o0, %o1, %o3	/* %o3 = dst - src again */

	/* Sub-8-byte tail: use the widest naturally-aligned accesses
	 * possible; fall back to the byte loop at 90 if src is not
	 * 8-byte aligned here.
	 */
	andcc		%g1, 0x7, %g0
	bne,pn		%icc, 90f
	 andcc		%o2, 0x8, %g0
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(ldx, %o1, %o5))
	EX_ST(STORE(stx, %o5, %o1 + %o3))
	add		%o1, 0x8, %o1

1:	andcc		%o2, 0x4, %g0
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(lduw, %o1, %o5))
	EX_ST(STORE(stw, %o5, %o1 + %o3))
	add		%o1, 0x4, %o1

1:	andcc		%o2, 0x2, %g0
	be,pt		%icc, 1f
	 nop
	EX_LD(LOAD(lduh, %o1, %o5))
	EX_ST(STORE(sth, %o5, %o1 + %o3))
	add		%o1, 0x2, %o1

1:	andcc		%o2, 0x1, %g0
	be,pt		%icc, 85f
	 nop
	EX_LD(LOAD(ldub, %o1, %o5))
	ba,pt		%xcc, 85f
	 EX_ST(STORE(stb, %o5, %o1 + %o3))

	.align		64
	/* Medium copy: 16 <= len < 192 (reached when len >= 16 and
	 * len < 3*64).  Condition codes on entry reflect
	 * ((dst|src|len) & 0x7): zero means everything is 8-byte
	 * aligned and we can copy in 16-byte chunks at 72.
	 */
70:
	bne,pn		%XCC, 75f
	 sub		%o0, %o1, %o3

	/* Fully aligned: copy 16 bytes per iteration. */
72:
	andn		%o2, 0xf, GLOBAL_SPARE
	and		%o2, 0xf, %o2
1:	subcc		GLOBAL_SPARE, 0x10, GLOBAL_SPARE
	EX_LD(LOAD(ldx, %o1 + 0x00, %o5))
	EX_LD(LOAD(ldx, %o1 + 0x08, %g1))
	EX_ST(STORE(stx, %o5, %o1 + %o3))
	add		%o1, 0x8, %o1
	EX_ST(STORE(stx, %g1, %o1 + %o3))
	bgu,pt		%XCC, 1b
	 add		%o1, 0x8, %o1
	/* Remaining < 16 bytes: 8, then 4, then bytes via 90. */
73:	andcc		%o2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x8, %o2
	EX_LD(LOAD(ldx, %o1, %o5))
	EX_ST(STORE(stx, %o5, %o1 + %o3))
	add		%o1, 0x8, %o1
1:	andcc		%o2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x4, %o2
	EX_LD(LOAD(lduw, %o1, %o5))
	EX_ST(STORE(stw, %o5, %o1 + %o3))
	add		%o1, 0x4, %o1
1:	cmp		%o2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f
	 nop

	/* Medium copy, misaligned: first copy bytes until dst is
	 * 8-byte aligned (%g1 = 8 - (dst & 7) byte count).
	 */
75:
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%o2, %g1, %o2

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %o1, %o5))
	EX_ST(STORE(stb, %o5, %o1 + %o3))
	bgu,pt		%icc, 1b
	 add		%o1, 1, %o1

	/* dst now 8-byte aligned.  If src is too, reuse the aligned
	 * loops above; otherwise shift-and-merge at 8f.
	 * %g1 = (src & 7) * 8 = left-shift amount in bits.
	 */
2:	add		%o1, %o3, %o0
	andcc		%o1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1

	cmp		%o2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

	/* Shift-and-merge loop: read aligned 8-byte words from the
	 * rounded-down src, combine adjacent words with sllx/srlx to
	 * synthesize each unaligned doubleword.  %o3 = 64 - shift.
	 */
8:	mov		64, %o3
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1, %g2))
	sub		%o3, %g1, %o3
	andn		%o2, 0x7, GLOBAL_SPARE
	sllx		%g2, %g1, %g2
1:	EX_LD(LOAD(ldx, %o1 + 0x8, %g3))
	subcc		GLOBAL_SPARE, 0x8, GLOBAL_SPARE
	add		%o1, 0x8, %o1
	srlx		%g3, %o3, %o5
	or		%o5, %g2, %o5
	EX_ST(STORE(stx, %o5, %o0))
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1	/* back from bit count to byte offset */
	andcc		%o2, 0x7, %o2
	be,pn		%icc, 85f
	 add		%o1, %g1, %o1	/* restore true (unaligned) src */
	ba,pt		%xcc, 90f
	 sub		%o0, %o1, %o3

	.align		64
	/* Tiny copy: 0 < len < 16.  %o3 = dst|src|len on entry; if any
	 * of them is not 4-byte aligned, fall through to the byte loop.
	 */
80:
	andcc		%o3, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %o1, %o3

1:
	subcc		%o2, 4, %o2
	EX_LD(LOAD(lduw, %o1, %g1))
	EX_ST(STORE(stw, %g1, %o1 + %o3))
	bgu,pt		%XCC, 1b
	 add		%o1, 4, %o1

	/* Common exit: return the original dst pointer. */
85:	retl
	 mov		EX_RETVAL(%o4), %o0

	.align		32
	/* Byte-at-a-time fallback loop; %o3 = dst - src, %o2 > 0. */
90:
	subcc		%o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %g1))
	EX_ST(STORE(stb, %g1, %o1 + %o3))
	bgu,pt		%XCC, 90b
	 add		%o1, 1, %o1
	retl
	 mov		EX_RETVAL(%o4), %o0

	.size		FUNC_NAME, .-FUNC_NAME