/*
 * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
 * xthal_memcpy and xthal_bcopy
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2012 Tensilica Inc.
 */

#include <variant/core.h>
#include <asm/asmmacro.h>

/*
 * void *memcpy(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If source is aligned,
 *     do 16 bytes with a loop, and then finish up with
 *     8, 4, 2, and 1 byte copies conditional on the length;
 *   else (if source is unaligned),
 *     do the same, but use SRC (funnel shift) to align the source data.
 *   This code tries to use fall-through branches for the common
 *     case of aligned source and destination and multiple
 *     of 4 (or 8) length.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 *
 * This uses the windowed ABI: functions enter with ENTRY and return
 * with RETW, so a0/a1 need never be saved explicitly here.
 */

	.text

/*
 * Byte by byte copy: fallback used for short copies (< 6 or 7 bytes,
 * see the _bltui checks below) where aligning dst is not worthwhile.
 * In:  a3 src, a4 len, a5 dst.  Returns to memcpy's caller via retw.
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbytecopydone	# zero-overhead loop, a4 iterations
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbytecopydone
	add	a7, a3, a4	# a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	s8i	a6, a5, 0
	addi	a5, a5, 1
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
	retw

/*
 * Destination is unaligned: copy 1 and/or 2 bytes so dst becomes
 * word-aligned, then rejoin the main algorithm at .Ldstaligned.
 */

	.align	4
.Ldst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte
					# (not worth aligning below 7 bytes)

	# copy 1 byte
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	addi	a4, a4, -1
	s8i	a6, a5, 0
	addi	a5, a5, 1
	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
					# return to main algorithm
.Ldst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
	l8ui	a6, a3, 0
	l8ui	a7, a3, 1
	addi	a3, a3, 2
	addi	a4, a4, -2
	s8i	a6, a5, 0
	s8i	a7, a5, 1
	addi	a5, a5, 2
	j	.Ldstaligned	# dst is now aligned, return to main algorithm

	.align	4
	.global	memcpy
	.type   memcpy,@function
memcpy:

	entry	sp, 16		# minimal stack frame
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lcommon:			# memmove joins here when regions don't overlap
	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
.Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop1done	# a7 only seeds LCOUNT; safe to reuse below
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop1done
	slli	a8, a7, 4
	add	a8, a8, a3	# a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
	# loads and stores are interleaved to hide load-use latency
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	s32i	a6, a5,  0
	l32i	a6, a3,  8
	s32i	a7, a5,  4
	l32i	a7, a3, 12
	s32i	a6, a5,  8
	addi	a3, a3, 16
	s32i	a7, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .Loop1  # continue loop if a3:src != a8:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1done:
	# tail: bits 3..0 of the original length select 8/4/2/1-byte copies
	bbci.l	a4, 3, .L2
	# copy 8 bytes
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a3, a3,  8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
	addi	a5, a5,  8
.L2:
	bbsi.l	a4, 2, .L3
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	retw
.L3:
	# copy 4 bytes
	l32i	a6, a3,  0
	addi	a3, a3,  4
	s32i	a6, a5,  0
	addi	a5, a5,  4
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	retw
.L4:
	# copy 2 bytes
	l16ui	a6, a3,  0
	addi	a3, a3,  2
	s16i	a6, a5,  0
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L5
	retw
.L5:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	retw

/*
 * Destination is aligned, Source is unaligned.
 * Strategy: align a3 downward, keep one pre-loaded source word in a6,
 * and funnel-shift (__src_b) each pair of words into an aligned store.
 */

	.align	4
.Lsrcunaligned:
	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	__ssa8	a3		# set shift amount from byte offset

/* set to 1 when running on ISS (simulator) with the
   lint or ferret client, or 0 to save a few cycles */
#define SIM_CHECKS_ALIGNMENT	1
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
				# (a8 still holds the mask 3 from above)
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
				# invariant: a6 = last source word consumed
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop2done
	slli	a10, a7, 4
	add	a10, a10, a3	# a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
	# each __src_b merges two adjacent source words into one aligned
	# destination word; a6 carries the trailing word across iterations
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	l32i	a9, a3, 12
	__src_b	a7, a7, a8
	s32i	a7, a5,  4
	l32i	a6, a3, 16
	__src_b	a8, a8, a9
	s32i	a8, a5,  8
	addi	a3, a3, 16
	__src_b	a9, a9, a6
	s32i	a9, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
	bbci.l	a4, 3, .L12
	# copy 8 bytes
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a3, a3,  8
	__src_b	a7, a7, a8
	s32i	a7, a5,  4
	addi	a5, a5,  8
	mov	a6, a8		# maintain invariant for the next tail step
.L12:
	bbci.l	a4, 2, .L13
	# copy 4 bytes
	l32i	a7, a3,  4
	addi	a3, a3,  4
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a5, a5,  4
	mov	a6, a7		# maintain invariant for the next tail step
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	# last 1-3 bytes are copied bytewise (a3 may be unaligned again)
	bbsi.l	a4, 1, .L14
	bbsi.l	a4, 0, .L15
.Ldone:	retw
.L14:
	# copy 2 bytes
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L15
	retw
.L15:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	retw


/*
 * void bcopy(const void *src, void *dest, size_t n);
 *
 * Same as memmove but with src/dst argument order swapped.
 */
	.align	4
	.global	bcopy
	.type   bcopy,@function
bcopy:
	entry	sp, 16		# minimal stack frame
	# a2=src, a3=dst, a4=len
	# swap a2/a3 so registers match the memmove convention
	mov	a5, a3
	mov	a3, a2
	mov	a2, a5
	j	.Lmovecommon	# go to common code for memmove+bcopy

/*
 * void *memmove(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memmove() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!!  XTFIXME:
 * !!!!!!!  Handling of IRAM/IROM has not yet
 * !!!!!!!  been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If end of source doesn't overlap destination then use memcpy.
 *   Otherwise do memcpy backwards.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */

/*
 * Byte by byte copy, backwards (a3/a5 enter pointing one past the end).
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbackbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbackbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbackbytecopydone
	sub	a7, a3, a4	# a7 = start address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lbacknextbyte:
	addi	a3, a3, -1
	l8ui	a6, a3, 0
	addi	a5, a5, -1
	s8i	a6, a5, 0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lbacknextbyte # continue loop if
				       # $a3:src != $a7:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.Lbackbytecopydone:
	retw

/*
 * Destination is unaligned: copy 1 and/or 2 trailing bytes so the
 * (end-of-)dst pointer becomes word-aligned, then rejoin at
 * .Lbackdstaligned.
 */

	.align	4
.Lbackdst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbackbytecopy	# do short copies byte by byte

	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	addi	a4, a4, -1
	_bbci.l	a5, 1, .Lbackdstaligned	# if dst is now aligned, then
					# return to main algorithm
.Lbackdst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbackbytecopy	# do short copies byte by byte
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a4, a4, -2
	j	.Lbackdstaligned	# dst is now aligned,
					# return to main algorithm

	.align	4
	.global	memmove
	.type   memmove,@function
memmove:

	entry	sp, 16		# minimal stack frame
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lmovecommon:
	# Overlap test: the unsigned compare (dst - src) >= len is true
	# both when dst < src (difference wraps to a huge value) and when
	# dst >= src + len, i.e. whenever a forward copy is safe.
	sub	a6, a5, a3
	bgeu	a6, a4, .Lcommon

	# Regions overlap with dst > src: copy backwards, starting with
	# both pointers one past the end of their regions.
	add	a5, a5, a4
	add	a3, a3, a4

	_bbsi.l	a5, 0, .Lbackdst1mod2	# if dst is 1 mod 2
	_bbsi.l	a5, 1, .Lbackdst2mod4	# if dst is 2 mod 4
.Lbackdstaligned:	# return here from .Lbackdst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lbacksrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .backLoop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .backLoop1done
	slli	a8, a7, 4
	sub	a8, a3, a8	# a8 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop1:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a6, a3,  8
	addi	a5, a5, -16
	s32i	a7, a5, 12
	l32i	a7, a3,  4
	s32i	a6, a5,  8
	l32i	a6, a3,  0
	s32i	a7, a5,  4
	s32i	a6, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .backLoop1 # continue loop if a3:src != a8:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop1done:
	bbci.l	a4, 3, .Lback2
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a5, a5, -8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
.Lback2:
	bbsi.l	a4, 2, .Lback3
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	retw
.Lback3:
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a6, a3,  0
	addi	a5, a5, -4
	s32i	a6, a5,  0
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	retw
.Lback4:
	# copy 2 bytes
	addi	a3, a3, -2
	l16ui	a6, a3,  0
	addi	a5, a5, -2
	s16i	a6, a5,  0
	bbsi.l	a4, 0, .Lback5
	retw
.Lback5:
	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	retw

/*
 * Destination is aligned, Source is unaligned.
 * Mirror image of .Lsrcunaligned: a6 carries the lowest source word
 * consumed so far, and __src_b operand order is reversed because we
 * walk downwards.
 */

	.align	4
.Lbacksrcunaligned:
	_beqz	a4, .Lbackdone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	__ssa8	a3		# set shift amount from byte offset
#define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS with
					 * the lint or ferret client, or 0
					 * to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
				# (a8 still holds the mask 3 from above)
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .backLoop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .backLoop2done
	slli	a10, a7, 4
	sub	a10, a3, a10	# a10 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop2:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a8, a3,  8
	addi	a5, a5, -16
	__src_b	a6, a7, a6
	s32i	a6, a5, 12
	l32i	a9, a3,  4
	__src_b	a7, a8, a7
	s32i	a7, a5,  8
	l32i	a6, a3,  0
	__src_b	a8, a9, a8
	s32i	a8, a5,  4
	__src_b	a9, a6, a9
	s32i	a9, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop2done:
	bbci.l	a4, 3, .Lback12
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a7, a3,  4
	l32i	a8, a3,  0
	addi	a5, a5, -8
	__src_b	a6, a7, a6
	s32i	a6, a5,  4
	__src_b	a7, a8, a7
	s32i	a7, a5,  0
	mov	a6, a8		# maintain invariant for the next tail step
.Lback12:
	bbci.l	a4, 2, .Lback13
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a7, a3,  0
	addi	a5, a5, -4
	__src_b	a6, a7, a6
	s32i	a6, a5,  0
	mov	a6, a7		# maintain invariant for the next tail step
.Lback13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	# last 1-3 bytes are copied bytewise (a3 may be unaligned again)
	bbsi.l	a4, 1, .Lback14
	bbsi.l	a4, 0, .Lback15
.Lbackdone:
	retw
.Lback14:
	# copy 2 bytes
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	bbsi.l	a4, 0, .Lback15
	retw
.Lback15:
	# copy 1 byte
	addi	a3, a3, -1
	addi	a5, a5, -1
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	retw


/*
 * Local Variables:
 * mode:fundamental
 * comment-start: "# "
 * comment-start-skip: "# *"
 * End:
 */