/*
 * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
 * xthal_memcpy and xthal_bcopy
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License. See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2012 Tensilica Inc.
 */

#include <linux/linkage.h>
#include <variant/core.h>
#include <asm/asmmacro.h>

/*
 * void *memcpy(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!! XTFIXME:
 * !!!!!!! Handling of IRAM/IROM has not yet
 * !!!!!!! been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If source is aligned,
 *     do 16 bytes with a loop, and then finish up with
 *     8, 4, 2, and 1 byte copies conditional on the length;
 *   else (if source is unaligned),
 *     do the same, but use SRC to align the source data.
 *   This code tries to use fall-through branches for the common
 *     case of aligned source and destination and multiple
 *     of 4 (or 8) length.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */
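/*
 * A rough C model of the strategy described above (illustrative only,
 * not part of the build; a plain byte loop stands in here for the real
 * 8/4/2/1-byte tail paths and for the funnel-shift path):
 *
 *	void *memcpy_sketch(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		while (len && ((uintptr_t)d & 3)) {	// align dst to 4
 *			*d++ = *s++;
 *			len--;
 *		}
 *		if (((uintptr_t)s & 3) == 0) {		// aligned source:
 *			uint32_t *dw = (uint32_t *)d;	// 16B per iteration
 *			const uint32_t *sw = (const uint32_t *)s;
 *			for (; len >= 16; len -= 16) {
 *				dw[0] = sw[0]; dw[1] = sw[1];
 *				dw[2] = sw[2]; dw[3] = sw[3];
 *				dw += 4; sw += 4;
 *			}
 *			d = (unsigned char *)dw;
 *			s = (const unsigned char *)sw;
 *		}
 *		while (len--)		// tails; also the unaligned-src
 *			*d++ = *s++;	// case, done with SRC below
 *		return dst;
 *	}
 */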
	.text

/*
 * Byte by byte copy
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbytecopydone
	add	a7, a3, a4	# a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	s8i	a6, a5, 0
	addi	a5, a5, 1
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
	retw

/*
 * Destination is unaligned
 */

	.align	4
.Ldst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbytecopy	# do short copies byte by byte

	# copy 1 byte
	l8ui	a6, a3, 0
	addi	a3, a3, 1
	addi	a4, a4, -1
	s8i	a6, a5, 0
	addi	a5, a5, 1
	_bbci.l	a5, 1, .Ldstaligned	# if dst is now aligned, then
					# return to main algorithm
.Ldst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbytecopy	# do short copies byte by byte
	l8ui	a6, a3, 0
	l8ui	a7, a3, 1
	addi	a3, a3, 2
	addi	a4, a4, -2
	s8i	a6, a5, 0
	s8i	a7, a5, 1
	addi	a5, a5, 2
	j	.Ldstaligned	# dst is now aligned, return to main algorithm

ENTRY(memcpy)

	entry	sp, 16		# minimal stack frame
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lcommon:
	_bbsi.l	a2, 0, .Ldst1mod2	# if dst is 1 mod 2
	_bbsi.l	a2, 1, .Ldst2mod4	# if dst is 2 mod 4
.Ldstaligned:	# return here from .Ldst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lsrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop1done
	slli	a8, a7, 4
	add	a8, a8, a3	# a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	s32i	a6, a5,  0
	l32i	a6, a3,  8
	s32i	a7, a5,  4
	l32i	a7, a3, 12
	s32i	a6, a5,  8
	addi	a3, a3, 16
	s32i	a7, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .Loop1	# continue loop if a3:src != a8:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1done:
	bbci.l	a4, 3, .L2
	# copy 8 bytes
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a3, a3,  8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
	addi	a5, a5,  8
.L2:
	bbsi.l	a4, 2, .L3
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	retw
.L3:
	# copy 4 bytes
	l32i	a6, a3,  0
	addi	a3, a3,  4
	s32i	a6, a5,  0
	addi	a5, a5,  4
	bbsi.l	a4, 1, .L4
	bbsi.l	a4, 0, .L5
	retw
.L4:
	# copy 2 bytes
	l16ui	a6, a3,  0
	addi	a3, a3,  2
	s16i	a6, a5,  0
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L5
	retw
.L5:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	retw

/*
 * Destination is aligned, Source is unaligned
 */

	.align	4
.Lsrcunaligned:
	_beqz	a4, .Ldone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	__ssa8	a3		# set shift amount from byte offset
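	/*
	 * What the funnel-shift loop below computes, as an illustrative
	 * C model (assuming a little-endian core; the __ssa8/__src_b
	 * macros select the matching big-endian forms otherwise).
	 * With off = src & 3 (1..3 on this path), __ssa8 sets SAR to
	 * 8*off, and each __src_b merges two adjacent aligned source
	 * words w0 (lower address) and w1 into one aligned output word:
	 *
	 *	out = (w0 >> (8 * off)) | (w1 << (32 - 8 * off));
	 *
	 * so every load and store stays 32-bit aligned even though the
	 * source pointer is not.
	 */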
/* set to 1 when running on ISS (simulator) with the
   lint or ferret client, or 0 to save a few cycles */
#define SIM_CHECKS_ALIGNMENT	1
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .Loop2done
	slli	a10, a7, 4
	add	a10, a10, a3	# a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	l32i	a9, a3, 12
	__src_b	a7, a7, a8
	s32i	a7, a5,  4
	l32i	a6, a3, 16
	__src_b	a8, a8, a9
	s32i	a8, a5,  8
	addi	a3, a3, 16
	__src_b	a9, a9, a6
	s32i	a9, a5, 12
	addi	a5, a5, 16
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .Loop2	# continue loop if a3:src != a10:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
	bbci.l	a4, 3, .L12
	# copy 8 bytes
	l32i	a7, a3,  4
	l32i	a8, a3,  8
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a3, a3,  8
	__src_b	a7, a7, a8
	s32i	a7, a5,  4
	addi	a5, a5,  8
	mov	a6, a8
.L12:
	bbci.l	a4, 2, .L13
	# copy 4 bytes
	l32i	a7, a3,  4
	addi	a3, a3,  4
	__src_b	a6, a6, a7
	s32i	a6, a5,  0
	addi	a5, a5,  4
	mov	a6, a7
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .L14
	bbsi.l	a4, 0, .L15
.Ldone:	retw
.L14:
	# copy 2 bytes
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a3, a3,  2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a5, a5,  2
	bbsi.l	a4, 0, .L15
	retw
.L15:
	# copy 1 byte
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	retw

ENDPROC(memcpy)

/*
 * void bcopy(const void *src, void *dest, size_t n);
 */

ENTRY(bcopy)

	entry	sp, 16		# minimal stack frame
	# a2=src, a3=dst, a4=len
	mov	a5, a3
	mov	a3, a2
	mov	a2, a5
	j	.Lmovecommon	# go to common code for memmove+bcopy

ENDPROC(bcopy)

/*
 * void *memmove(void *dst, const void *src, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memmove() for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!! XTFIXME:
 * !!!!!!! Handling of IRAM/IROM has not yet
 * !!!!!!! been implemented.
 *
 * The (general case) algorithm is as follows:
 *   If end of source doesn't overlap destination then use memcpy.
 *   Otherwise do memcpy backwards.
 *
 * Register use:
 *	a0/ return address
 *	a1/ stack pointer
 *	a2/ return value
 *	a3/ src
 *	a4/ length
 *	a5/ dst
 *	a6/ tmp
 *	a7/ tmp
 *	a8/ tmp
 *	a9/ tmp
 *	a10/ tmp
 *	a11/ tmp
 */
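/*
 * An illustrative C model of that choice (not part of the build; the
 * real code shares memcpy's forward path via .Lcommon and applies the
 * same alignment and 16-byte-block optimizations in both directions):
 *
 *	void *memmove_sketch(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		if ((uintptr_t)(d - s) >= len) {	// no harmful overlap
 *			while (len--)			// copy forwards
 *				*d++ = *s++;
 *		} else {				// dst inside source
 *			d += len;
 *			s += len;
 *			while (len--)			// copy backwards
 *				*--d = *--s;
 *		}
 *		return dst;
 *	}
 */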
/*
 * Byte by byte copy
 */
	.align	4
	.byte	0		# 1 mod 4 alignment for LOOPNEZ
				# (0 mod 4 alignment for LBEG)
.Lbackbytecopy:
#if XCHAL_HAVE_LOOPS
	loopnez	a4, .Lbackbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a4, .Lbackbytecopydone
	sub	a7, a3, a4	# a7 = start address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lbacknextbyte:
	addi	a3, a3, -1
	l8ui	a6, a3, 0
	addi	a5, a5, -1
	s8i	a6, a5, 0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a7, .Lbacknextbyte # continue loop if
				       # $a3:src != $a7:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.Lbackbytecopydone:
	retw

/*
 * Destination is unaligned
 */

	.align	4
.Lbackdst1mod2:	# dst is only byte aligned
	_bltui	a4, 7, .Lbackbytecopy	# do short copies byte by byte

	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	addi	a4, a4, -1
	_bbci.l	a5, 1, .Lbackdstaligned	# if dst is now aligned, then
					# return to main algorithm
.Lbackdst2mod4:	# dst 16-bit aligned
	# copy 2 bytes
	_bltui	a4, 6, .Lbackbytecopy	# do short copies byte by byte
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	addi	a4, a4, -2
	j	.Lbackdstaligned	# dst is now aligned,
					# return to main algorithm

ENTRY(memmove)

	entry	sp, 16		# minimal stack frame
	# a2/ dst, a3/ src, a4/ len
	mov	a5, a2		# copy dst so that a2 is return value
.Lmovecommon:
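	/*
	 * Overlap check: a6 = dst - src is computed modulo 2^32, so the
	 * single unsigned test a6 >= len succeeds exactly when dst does
	 * not point inside [src, src + len) -- including every dst < src
	 * case, where the subtraction wraps to a large value -- and the
	 * forward memcpy path is then safe.
	 */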
	sub	a6, a5, a3
	bgeu	a6, a4, .Lcommon

	add	a5, a5, a4
	add	a3, a3, a4

	_bbsi.l	a5, 0, .Lbackdst1mod2	# if dst is 1 mod 2
	_bbsi.l	a5, 1, .Lbackdst2mod4	# if dst is 2 mod 4
.Lbackdstaligned:	# return here from .Lbackdst?mod? once dst is aligned
	srli	a7, a4, 4	# number of loop iterations with 16B
				# per iteration
	movi	a8, 3		# if source is not aligned,
	_bany	a3, a8, .Lbacksrcunaligned	# then use shifting copy
	/*
	 * Destination and source are word-aligned, use word copy.
	 */
	# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .backLoop1done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .backLoop1done
	slli	a8, a7, 4
	sub	a8, a3, a8	# a8 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop1:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a6, a3,  8
	addi	a5, a5, -16
	s32i	a7, a5, 12
	l32i	a7, a3,  4
	s32i	a6, a5,  8
	l32i	a6, a3,  0
	s32i	a7, a5,  4
	s32i	a6, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a8, .backLoop1 # continue loop if a3:src != a8:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop1done:
	bbci.l	a4, 3, .Lback2
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a6, a3,  0
	l32i	a7, a3,  4
	addi	a5, a5, -8
	s32i	a6, a5,  0
	s32i	a7, a5,  4
.Lback2:
	bbsi.l	a4, 2, .Lback3
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	retw
.Lback3:
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a6, a3,  0
	addi	a5, a5, -4
	s32i	a6, a5,  0
	bbsi.l	a4, 1, .Lback4
	bbsi.l	a4, 0, .Lback5
	retw
.Lback4:
	# copy 2 bytes
	addi	a3, a3, -2
	l16ui	a6, a3,  0
	addi	a5, a5, -2
	s16i	a6, a5,  0
	bbsi.l	a4, 0, .Lback5
	retw
.Lback5:
	# copy 1 byte
	addi	a3, a3, -1
	l8ui	a6, a3,  0
	addi	a5, a5, -1
	s8i	a6, a5,  0
	retw

/*
 * Destination is aligned, Source is unaligned
 */

	.align	4
.Lbacksrcunaligned:
	_beqz	a4, .Lbackdone	# avoid loading anything for zero-length copies
	# copy 16 bytes per iteration for word-aligned dst and unaligned src
	__ssa8	a3		# set shift amount from byte offset
#define SIM_CHECKS_ALIGNMENT	1	/* set to 1 when running on ISS with
					 * the lint or ferret client, or 0
					 * to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	and	a11, a3, a8	# save unalignment offset for below
	sub	a3, a3, a11	# align a3
#endif
	l32i	a6, a3, 0	# load first word
#if XCHAL_HAVE_LOOPS
	loopnez	a7, .backLoop2done
#else /* !XCHAL_HAVE_LOOPS */
	beqz	a7, .backLoop2done
	slli	a10, a7, 4
	sub	a10, a3, a10	# a10 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop2:
	addi	a3, a3, -16
	l32i	a7, a3, 12
	l32i	a8, a3,  8
	addi	a5, a5, -16
	__src_b	a6, a7, a6
	s32i	a6, a5, 12
	l32i	a9, a3,  4
	__src_b	a7, a8, a7
	s32i	a7, a5,  8
	l32i	a6, a3,  0
	__src_b	a8, a9, a8
	s32i	a8, a5,  4
	__src_b	a9, a6, a9
	s32i	a9, a5,  0
#if !XCHAL_HAVE_LOOPS
	bne	a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop2done:
	bbci.l	a4, 3, .Lback12
	# copy 8 bytes
	addi	a3, a3, -8
	l32i	a7, a3,  4
	l32i	a8, a3,  0
	addi	a5, a5, -8
	__src_b	a6, a7, a6
	s32i	a6, a5,  4
	__src_b	a7, a8, a7
	s32i	a7, a5,  0
	mov	a6, a8
.Lback12:
	bbci.l	a4, 2, .Lback13
	# copy 4 bytes
	addi	a3, a3, -4
	l32i	a7, a3,  0
	addi	a5, a5, -4
	__src_b	a6, a7, a6
	s32i	a6, a5,  0
	mov	a6, a7
.Lback13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
	add	a3, a3, a11	# readjust a3 with correct misalignment
#endif
	bbsi.l	a4, 1, .Lback14
	bbsi.l	a4, 0, .Lback15
.Lbackdone:
	retw
.Lback14:
	# copy 2 bytes
	addi	a3, a3, -2
	l8ui	a6, a3,  0
	l8ui	a7, a3,  1
	addi	a5, a5, -2
	s8i	a6, a5,  0
	s8i	a7, a5,  1
	bbsi.l	a4, 0, .Lback15
	retw
.Lback15:
	# copy 1 byte
	addi	a3, a3, -1
	addi	a5, a5, -1
	l8ui	a6, a3,  0
	s8i	a6, a5,  0
	retw

ENDPROC(memmove)