/* SPDX-License-Identifier: GPL-2.0 */
/* NGmemcpy.S: Niagara optimized memcpy.
 *
 * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net)
 */

#ifdef __KERNEL__
#include <linux/linkage.h>
#include <asm/asi.h>
#include <asm/thread_info.h>
/* Scratch register available to the unaligned (shift-and-mask) copy path. */
#define GLOBAL_SPARE	%g7
/* After a copy to/from userspace, put %asi back to the user-secondary ASI. */
#define RESTORE_ASI(TMP)	\
	wr	%g0, ASI_AIUS, %asi
#else
#define GLOBAL_SPARE	%g5
#define RESTORE_ASI(TMP)	\
	wr	%g0, ASI_PNF, %asi
#endif

/* Register-window save area: 16 extended regs on v9, 16 32-bit regs otherwise. */
#ifdef __sparc_v9__
#define SAVE_AMOUNT	128
#else
#define SAVE_AMOUNT	64
#endif

/* ASI used for the cache-line-initializing block stores in the main loops. */
#ifndef STORE_ASI
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#endif

/* EX_LD/EX_ST wrap a load/store with an exception-table entry whose fixup
 * handler is the second argument (one of the NG_ret_* stubs below).  When
 * not overridden by an including file, they degrade to the bare access.
 */
#ifndef EX_LD
#define EX_LD(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif

#ifndef LOAD
#ifndef MEMCPY_DEBUG
#define LOAD(type,addr,dest)	type [addr], dest
#else
/* Debug variant: force the access through ASI 0x80 (primary). */
#define LOAD(type,addr,dest)	type##a [addr] 0x80, dest
#endif
#endif

/* 16-byte "twin" load: fills dest0/dest1 (an even/odd register pair). */
#ifndef LOAD_TWIN
#define LOAD_TWIN(addr_reg,dest0,dest1)	\
	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

/* Store that initializes the whole L2 cache line (no read-for-ownership).
 * On non-Niagara simulation builds it falls back to a plain stx.
 */
#ifndef STORE_INIT
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_INIT(src,addr)	stxa src, [addr] %asi
#else
#define STORE_INIT(src,addr)	stx src, [addr + 0x00]
#endif
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	NGmemcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
/* Common tail for all exception-return stubs: restore the user ASI and
 * return to the faulting call site's caller; the stub has already placed
 * the not-copied byte count in %i0 (the memcpy return convention used by
 * the exception-reporting wrappers).
 */
__restore_asi:
	wr	%g0, ASI_AIUS, %asi
	ret
	 restore

/* Exception continuation stubs.  Each one is named for the expression it
 * computes into %i0 — the number of bytes still uncopied at the point the
 * faulting EX_LD/EX_ST it is attached to was executing.
 */
ENTRY(NG_ret_i2_plus_i4_plus_1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i5, %i0
ENDPROC(NG_ret_i2_plus_i4_plus_1)
ENTRY(NG_ret_i2_plus_g1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1)
ENTRY(NG_ret_i2_plus_g1_minus_8)
	sub	%g1, 8, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_8)
/* Remaining exception continuation stubs (see comment above __restore_asi):
 * each computes the uncopied-byte count named by its symbol into %i0 and
 * branches to the common ASI-restore/return tail.
 */
ENTRY(NG_ret_i2_plus_g1_minus_16)
	sub	%g1, 16, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_16)
ENTRY(NG_ret_i2_plus_g1_minus_24)
	sub	%g1, 24, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_24)
ENTRY(NG_ret_i2_plus_g1_minus_32)
	sub	%g1, 32, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_32)
ENTRY(NG_ret_i2_plus_g1_minus_40)
	sub	%g1, 40, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_40)
ENTRY(NG_ret_i2_plus_g1_minus_48)
	sub	%g1, 48, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_48)
ENTRY(NG_ret_i2_plus_g1_minus_56)
	sub	%g1, 56, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_56)
ENTRY(NG_ret_i2_plus_i4_plus_16)
	add	%i4, 16, %i4
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4_plus_16)
ENTRY(NG_ret_i2_plus_i4_plus_8)
	add	%i4, 8, %i4
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4_plus_8)
ENTRY(NG_ret_i2_plus_8)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 8, %i0
ENDPROC(NG_ret_i2_plus_8)
ENTRY(NG_ret_i2_plus_4)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 4, %i0
ENDPROC(NG_ret_i2_plus_4)
ENTRY(NG_ret_i2_plus_1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 1, %i0
ENDPROC(NG_ret_i2_plus_1)
ENTRY(NG_ret_i2_plus_g1_plus_1)
	add	%g1, 1, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_plus_1)
ENTRY(NG_ret_i2)
	ba,pt	%xcc, __restore_asi
	 mov	%i2, %i0
ENDPROC(NG_ret_i2)
ENTRY(NG_ret_i2_and_7_plus_i4)
	and	%i2, 7, %i2
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_and_7_plus_i4)
ENTRY(NG_ret_i2_and_7_plus_i4_plus_8)
	and	%i2, 7, %i2
	add	%i4, 8, %i4
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
/* BUGFIX: this previously read ENDPROC(NG_ret_i2_and_7_plus_i4), attaching
 * the .size/.type annotation to the wrong symbol (and re-sizing a symbol
 * that was already closed above).  Close the entry actually opened here.
 */
ENDPROC(NG_ret_i2_and_7_plus_i4_plus_8)
#endif

	.align		64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %i0=dst, %i1=src, %i2=len */
	PREAMBLE
	save		%sp, -SAVE_AMOUNT, %sp
	/* Trap (tne 5) if the length has bit 31 set — catches negative/huge
	 * lengths before any copying happens.
	 */
	srlx		%i2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
	mov		%i0, %o0
	cmp		%i2, 0
	be,pn		%XCC, 85f		! len == 0: nothing to do
	 or		%o0, %i1, %i3		! %i3 = dst|src (alignment probe)
	cmp		%i2, 16
	blu,a,pn	%XCC, 80f		! tiny copy, 0 < len <= 16
	 or		%i3, %i2, %i3

	/* 2 blocks (128 bytes) is the minimum we can do the block
	 * copy with.  We need to ensure that we'll iterate at least
	 * once in the block copy loop.  At worst we'll need to align
	 * the destination to a 64-byte boundary which can chew up
	 * to (64 - 1) bytes from the length before we perform the
	 * block copy loop.
	 */
	cmp		%i2, (2 * 64)
	blu,pt		%XCC, 70f		! medium copy, 16 < len < 128
	 andcc		%i3, 0x7, %g0

	/* %o0:	dst
	 * %i1:	src
	 * %i2:	len  (known to be >= 128)
	 *
	 * The block copy loops will use %i4/%i5,%g2/%g3 as
	 * temporaries while copying the data.
	 */

	LOAD(prefetch, %i1, #one_read)
	wr		%g0, STORE_ASI, %asi

	/* Align destination on 64-byte boundary.  */
	andcc		%o0, (64 - 1), %i4
	be,pt		%XCC, 2f
	 sub		%i4, 64, %i4
	sub		%g0, %i4, %i4	! bytes to align dst
	sub		%i2, %i4, %i2
1:	subcc		%i4, 1, %i4
	EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_i4_plus_1)
	EX_ST(STORE(stb, %g1, %o0), NG_ret_i2_plus_i4_plus_1)
	add		%i1, 1, %i1
	bne,pt		%XCC, 1b
	 add		%o0, 1, %o0

	/* If the source is on a 16-byte boundary we can do
	 * the direct block copy loop.  If it is 8-byte aligned
	 * we can do the 16-byte loads offset by -8 bytes and the
	 * init stores offset by one register.
	 *
	 * If the source is not even 8-byte aligned, we need to do
	 * shifting and masking (basically integer faligndata).
	 *
	 * The careful bit with init stores is that if we store
	 * to any part of the cache line we have to store the whole
	 * cacheline else we can end up with corrupt L2 cache line
	 * contents.  Since the loop works on 64-bytes of 64-byte
	 * aligned store data at a time, this is easy to ensure.
	 */
2:
	andcc		%i1, (16 - 1), %i4
	andn		%i2, (64 - 1), %g1	! block copy loop iterator
	be,pt		%XCC, 50f
	 sub		%i2, %g1, %i2		! final sub-block copy bytes

	cmp		%i4, 8
	be,pt		%XCC, 10f
	 sub		%i1, %i4, %i1

	/* Neither 8-byte nor 16-byte aligned, shift and mask.
	 * GLOBAL_SPARE = bit shift of the source misalignment,
	 * %i5 = 64 - shift (the complementary shift).
	 */
	and		%i4, 0x7, GLOBAL_SPARE
	sll		GLOBAL_SPARE, 3, GLOBAL_SPARE
	mov		64, %i5
	EX_LD(LOAD_TWIN(%i1, %g2, %g3), NG_ret_i2_plus_g1)
	sub		%i5, GLOBAL_SPARE, %i5
	mov		16, %o4
	mov		32, %o5
	mov		48, %o7
	mov		64, %i3

	bg,pn		%XCC, 9f		! misalignment >= 8: odd-phase loop
	 nop

/* Integer "faligndata": merge WORD1/WORD2/WORD3 across the misalignment
 * boundary.  After the macro, WORD1 and WORD2 hold two aligned output
 * doublewords; TMP is clobbered.
 */
#define MIX_THREE_WORDS(WORD1, WORD2, WORD3, PRE_SHIFT, POST_SHIFT, TMP)	\
	sllx		WORD1, POST_SHIFT, WORD1;	\
	srlx		WORD2, PRE_SHIFT, TMP;		\
	sllx		WORD2, POST_SHIFT, WORD2;	\
	or		WORD1, TMP, WORD1;		\
	srlx		WORD3, PRE_SHIFT, TMP;		\
	or		WORD2, TMP, WORD2;

8:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g2, %o0 + 0x00), NG_ret_i2_plus_g1)
	EX_ST(STORE_INIT(%g3, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g2, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%g3, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 8b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, %i4, %i1

	/* Same shift-and-mask loop, but for source misalignment > 8 the
	 * register phase is rotated by one doubleword.
	 */
9:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g3, %o0 + 0x00), NG_ret_i2_plus_g1)
	EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%g2, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g3, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%g2, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 9b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, %i4, %i1

10:	/* Destination is 64-byte aligned, source was only 8-byte
	 * aligned but it has been subtracted by 8 and we perform
	 * one twin load ahead, then add 8 back into source when
	 * we finish the loop.
	 */
	EX_LD(LOAD_TWIN(%i1, %o4, %o5), NG_ret_i2_plus_g1)
	mov	16, %o7
	mov	32, %g2
	mov	48, %g3
	mov	64, %o1
1:	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o5, %o0 + 0x00), NG_ret_i2_plus_g1)	! initializes cache line
	EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o4, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o5, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
	EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o4, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
	subcc		%g1, 64, %g1
	bne,pt		%XCC, 1b
	 add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	 add		%i1, 0x8, %i1	! undo the -8 source bias

50:	/* Destination is 64-byte aligned, and source is 16-byte
	 * aligned.
	 */
	mov	16, %o7
	mov	32, %g2
	mov	48, %g3
	mov	64, %o1
1:	EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5), NG_ret_i2_plus_g1)
	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o4, %o0 + 0x00), NG_ret_i2_plus_g1)	! initializes cache line
	EX_ST(STORE_INIT(%o5, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	add	%i1, 64, %i1
	EX_ST(STORE_INIT(%o4, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o5, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
	EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
	subcc	%g1, 64, %g1
	bne,pt	%XCC, 1b
	 add	%o0, 64, %o0
	/* fall through */

60:
	membar		#Sync	! order the init-stores before normal accesses

	/* %i2 contains any final bytes still needed to be copied
	 * over. If anything is left, we copy it one byte at a time.
	 */
	RESTORE_ASI(%i3)
	brz,pt		%i2, 85f
	 sub		%o0, %i1, %i3	! %i3 = dst - src (store via %i1 + %i3)
	ba,a,pt		%XCC, 90f
	 nop

	.align		64
70: /* 16 < len <= 64 */
	bne,pn		%XCC, 75f	! src|dst not 8-byte aligned
	 sub		%o0, %i1, %i3

72:	/* Both pointers 8-byte aligned: copy 16 bytes per iteration. */
	andn		%i2, 0xf, %i4
	and		%i2, 0xf, %i2
1:	subcc		%i4, 0x10, %i4
	EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_i4_plus_16)
	add		%i1, 0x08, %i1
	EX_LD(LOAD(ldx, %i1, %g1), NG_ret_i2_plus_i4_plus_16)
	sub		%i1, 0x08, %i1
	EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_i4_plus_16)
	add		%i1, 0x8, %i1
	EX_ST(STORE(stx, %g1, %i1 + %i3), NG_ret_i2_plus_i4_plus_8)
	bgu,pt		%XCC, 1b
	 add		%i1, 0x8, %i1
73:	andcc		%i2, 0x8, %g0	! remaining 8-byte chunk?
	be,pt		%XCC, 1f
	 nop
	sub		%i2, 0x8, %i2
	EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_8)
	EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_8)
	add		%i1, 0x8, %i1
1:	andcc		%i2, 0x4, %g0	! remaining 4-byte chunk?
	be,pt		%XCC, 1f
	 nop
	sub		%i2, 0x4, %i2
	EX_LD(LOAD(lduw, %i1, %i5), NG_ret_i2_plus_4)
	EX_ST(STORE(stw, %i5, %i1 + %i3), NG_ret_i2_plus_4)
	add		%i1, 0x4, %i1
1:	cmp		%i2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f	! trailing 1-3 bytes
	 nop

75:	/* Unaligned medium copy: first byte-copy until dst is 8-byte
	 * aligned, then either rejoin the aligned path or shift-and-mask.
	 */
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1	! %g1 = bytes to align dst
	sub		%i2, %g1, %i2

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %i1, %i5), NG_ret_i2_plus_g1_plus_1)
	EX_ST(STORE(stb, %i5, %i1 + %i3), NG_ret_i2_plus_g1_plus_1)
	bgu,pt		%icc, 1b
	 add		%i1, 1, %i1

2:	add		%i1, %i3, %o0
	andcc		%i1, 0x7, %g1
	bne,pt		%icc, 8f	! src still misaligned: shift/mask
	 sll		%g1, 3, %g1

	cmp		%i2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

	/* Doubleword loads from the aligned-down source, merging adjacent
	 * words with complementary shifts (%g1 = shift, %i3 = 64 - shift).
	 */
8:	mov		64, %i3
	andn		%i1, 0x7, %i1
	EX_LD(LOAD(ldx, %i1, %g2), NG_ret_i2)
	sub		%i3, %g1, %i3
	andn		%i2, 0x7, %i4
	sllx		%g2, %g1, %g2
1:	add		%i1, 0x8, %i1
	EX_LD(LOAD(ldx, %i1, %g3), NG_ret_i2_and_7_plus_i4)
	subcc		%i4, 0x8, %i4
	srlx		%g3, %i3, %i5
	or		%i5, %g2, %i5
	EX_ST(STORE(stx, %i5, %o0), NG_ret_i2_and_7_plus_i4_plus_8)
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1	! back from bit shift to byte offset
	andcc		%i2, 0x7, %i2
	be,pn		%icc, 85f
	 add		%i1, %g1, %i1
	ba,pt		%xcc, 90f
	 sub		%o0, %i1, %i3

	.align		64
80: /* 0 < len <= 16 */
	andcc		%i3, 0x3, %g0	! need 4-byte alignment of src|dst|len
	bne,pn		%XCC, 90f
	 sub		%o0, %i1, %i3

1:
	subcc		%i2, 4, %i2
	EX_LD(LOAD(lduw, %i1, %g1), NG_ret_i2_plus_4)
	EX_ST(STORE(stw, %g1, %i1 + %i3), NG_ret_i2_plus_4)
	bgu,pt		%XCC, 1b
	 add		%i1, 4, %i1

85:	ret			! success: return original dst
	 restore	EX_RETVAL(%i0), %g0, %o0

	.align		32
90:	/* Byte-at-a-time tail copy; %i3 = dst - src. */
	subcc		%i2, 1, %i2
	EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_1)
	EX_ST(STORE(stb, %g1, %i1 + %i3), NG_ret_i2_plus_1)
	bgu,pt		%XCC, 90b
	 add		%i1, 1, %i1
	ret
	 restore	EX_RETVAL(%i0), %g0, %o0

	.size		FUNC_NAME, .-FUNC_NAME