/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/*
   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */

#include "asmdefs.h"

	.syntax unified
	/* This implementation requires ARM state.  */
	.arm

#ifdef __ARM_NEON__

	.fpu	neon
	.arch	armv7-a
# define FRAME_SIZE	4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

	.arch	armv6
	.fpu	vfpv2
# define FRAME_SIZE	32
# define USE_VFP

#else
	.arch	armv6
# define FRAME_SIZE	32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif

#define PC_OFFSET	8	/* PC pipeline compensation.  */
#define INSN_SIZE	4

/* Call parameters.  */
#define dstin	r0
#define src	r1
#define count	r2

/* Locals.  */
#define tmp1	r3
#define dst	ip
#define tmp2	r10

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define A_l	r2	/* Call-clobbered.  */
#define A_h	r3	/* Call-clobbered.  */
#define B_l	r4
#define B_h	r5
#define C_l	r6
#define C_h	r7
#define D_l	r8
#define D_h	r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines	5

#ifdef USE_VFP
	.macro	cpy_line_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm

	.macro	cpy_tail_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm
#endif

ENTRY (__memcpy_arm)

	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
	cmp	count, #64
	bhs	L(cpy_not_short)
	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

L(tail63unaligned):
#ifdef USE_NEON
	and	tmp1, count, #0x38
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	vld1.8	{d0}, [src]!	/* 14 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 12 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 10 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 8 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 6 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 4 words to go.  */
	vst1.8	{d0}, [dst]!
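	/* The computed branch above (add pc, pc, tmp1) entered this table so
	   that exactly (count & 0x38) / 8 of these load/store pairs execute;
	   PC_OFFSET compensates for the ARM-state PC reading two instructions
	   ahead.  */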
	vld1.8	{d0}, [src]!	/* 2 words to go.  */
	vst1.8	{d0}, [dst]!

	tst	count, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
#else
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	and	tmp1, count, #0x3c
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
	/* Jump directly into the sequence below at the correct offset.  */
	add	pc, pc, tmp1, lsl #1

	ldr	tmp1, [src, #-60]	/* 15 words to go.  */
	str	tmp1, [dst, #-60]

	ldr	tmp1, [src, #-56]	/* 14 words to go.  */
	str	tmp1, [dst, #-56]
	ldr	tmp1, [src, #-52]
	str	tmp1, [dst, #-52]

	ldr	tmp1, [src, #-48]	/* 12 words to go.  */
	str	tmp1, [dst, #-48]
	ldr	tmp1, [src, #-44]
	str	tmp1, [dst, #-44]

	ldr	tmp1, [src, #-40]	/* 10 words to go.  */
	str	tmp1, [dst, #-40]
	ldr	tmp1, [src, #-36]
	str	tmp1, [dst, #-36]

	ldr	tmp1, [src, #-32]	/* 8 words to go.  */
	str	tmp1, [dst, #-32]
	ldr	tmp1, [src, #-28]
	str	tmp1, [dst, #-28]

	ldr	tmp1, [src, #-24]	/* 6 words to go.  */
	str	tmp1, [dst, #-24]
	ldr	tmp1, [src, #-20]
	str	tmp1, [dst, #-20]

	ldr	tmp1, [src, #-16]	/* 4 words to go.  */
	str	tmp1, [dst, #-16]
	ldr	tmp1, [src, #-12]
	str	tmp1, [dst, #-12]

	ldr	tmp1, [src, #-8]	/* 2 words to go.  */
	str	tmp1, [dst, #-8]
	ldr	tmp1, [src, #-4]
	str	tmp1, [dst, #-4]
#endif

	lsls	count, count, #31
	ldrhcs	tmp1, [src], #2
	ldrbne	src, [src]	/* Src is dead, use as a scratch.  */
	strhcs	tmp1, [dst], #2
	strbne	src, [dst]
	bx	lr

L(cpy_not_short):
	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str	tmp2, [sp, #-FRAME_SIZE]!
	and	tmp2, src, #7
	and	tmp1, dst, #7
	cmp	tmp1, tmp2
	bne	L(cpy_notaligned)

#ifdef USE_VFP
	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */
	vmov.f32	s0, s0
#endif

	/* SRC and DST have the same mutual 64-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring SRC and DST into full 64-bit alignment.  */
	lsls	tmp2, dst, #29
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src], #1
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst], #1

1:
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	blo	L(tail63aligned)

	cmp	tmp2, #512
	bhs	L(cpy_body_long)

L(cpy_body_medium):			/* Count in tmp2.  */
#ifdef USE_VFP
1:
	vldr	d0, [src, #0]
	subs	tmp2, tmp2, #64
	vldr	d1, [src, #8]
	vstr	d0, [dst, #0]
	vldr	d0, [src, #16]
	vstr	d1, [dst, #8]
	vldr	d1, [src, #24]
	vstr	d0, [dst, #16]
	vldr	d0, [src, #32]
	vstr	d1, [dst, #24]
	vldr	d1, [src, #40]
	vstr	d0, [dst, #32]
	vldr	d0, [src, #48]
	vstr	d1, [dst, #40]
	vldr	d1, [src, #56]
	vstr	d0, [dst, #48]
	add	src, src, #64
	vstr	d1, [dst, #56]
	add	dst, dst, #64
	bhs	1b
	tst	tmp2, #0x3f
	beq	L(done)

L(tail63aligned):			/* Count in tmp2.  */
	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1

	vldr	d0, [src, #-56]	/* 14 words to go.  */
	vstr	d0, [dst, #-56]
	vldr	d0, [src, #-48]	/* 12 words to go.  */
	vstr	d0, [dst, #-48]
	vldr	d0, [src, #-40]	/* 10 words to go.  */
	vstr	d0, [dst, #-40]
	vldr	d0, [src, #-32]	/* 8 words to go.  */
	vstr	d0, [dst, #-32]
	vldr	d0, [src, #-24]	/* 6 words to go.  */
	vstr	d0, [dst, #-24]
	vldr	d0, [src, #-16]	/* 4 words to go.  */
	vstr	d0, [dst, #-16]
	vldr	d0, [src, #-8]	/* 2 words to go.  */
	vstr	d0, [dst, #-8]
#else
	sub	src, src, #8
	sub	dst, dst, #8
1:
	ldrd	A_l, A_h, [src, #8]
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #16]
	strd	A_l, A_h, [dst, #16]
	ldrd	A_l, A_h, [src, #24]
	strd	A_l, A_h, [dst, #24]
	ldrd	A_l, A_h, [src, #32]
	strd	A_l, A_h, [dst, #32]
	ldrd	A_l, A_h, [src, #40]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #48]
	strd	A_l, A_h, [dst, #48]
	ldrd	A_l, A_h, [src, #56]
	strd	A_l, A_h, [dst, #56]
	ldrd	A_l, A_h, [src, #64]!
	strd	A_l, A_h, [dst, #64]!
	subs	tmp2, tmp2, #64
	bhs	1b
	tst	tmp2, #0x3f
	bne	1f
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
1:
	add	src, src, #8
	add	dst, dst, #8

L(tail63aligned):			/* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 64-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */

	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	ldrd	A_l, A_h, [src, #-56]	/* 14 words to go.  */
	strd	A_l, A_h, [dst, #-56]
	ldrd	A_l, A_h, [src, #-48]	/* 12 words to go.  */
	strd	A_l, A_h, [dst, #-48]
	ldrd	A_l, A_h, [src, #-40]	/* 10 words to go.  */
	strd	A_l, A_h, [dst, #-40]
	ldrd	A_l, A_h, [src, #-32]	/* 8 words to go.  */
	strd	A_l, A_h, [dst, #-32]
	ldrd	A_l, A_h, [src, #-24]	/* 6 words to go.  */
	strd	A_l, A_h, [dst, #-24]
	ldrd	A_l, A_h, [src, #-16]	/* 4 words to go.  */
	strd	A_l, A_h, [dst, #-16]
	ldrd	A_l, A_h, [src, #-8]	/* 2 words to go.  */
	strd	A_l, A_h, [dst, #-8]

#endif
	tst	tmp2, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead.  */
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src]
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst]

L(done):
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr

L(cpy_body_long):			/* Count in tmp2.  */

	/* Long copy.  We know that there's at least (prefetch_lines * 64)
	   bytes to go.  */
#ifdef USE_VFP
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */
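	/* On entry to the loop below, d3-d7 hold the first doubleword of each
	   of the next prefetch_lines (5) 64-byte blocks, and d0-d2 hold the
	   next 24 bytes of the current block; each cpy_line_vfp then stores a
	   full block and reloads its registers from further ahead, so the
	   loads double as prefetches.  */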

	vldr	d3, [src, #0]
	vldr	d4, [src, #64]
	vldr	d5, [src, #128]
	vldr	d6, [src, #192]
	vldr	d7, [src, #256]

	vldr	d0, [src, #8]
	vldr	d1, [src, #16]
	vldr	d2, [src, #24]
	add	src, src, #32

	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
	blo	2f
1:
	cpy_line_vfp	d3, 0
	cpy_line_vfp	d4, 64
	cpy_line_vfp	d5, 128
	add	dst, dst, #3 * 64
	add	src, src, #3 * 64
	cpy_line_vfp	d6, 0
	cpy_line_vfp	d7, 64
	add	dst, dst, #2 * 64
	add	src, src, #2 * 64
	subs	tmp2, tmp2, #prefetch_lines * 64
	bhs	1b

2:
	cpy_tail_vfp	d3, 0
	cpy_tail_vfp	d4, 64
	cpy_tail_vfp	d5, 128
	add	src, src, #3 * 64
	add	dst, dst, #3 * 64
	cpy_tail_vfp	d6, 0
	vstr	d7, [dst, #64]
	vldr	d7, [src, #64]
	vstr	d0, [dst, #64 + 8]
	vldr	d0, [src, #64 + 8]
	vstr	d1, [dst, #64 + 16]
	vldr	d1, [src, #64 + 16]
	vstr	d2, [dst, #64 + 24]
	vldr	d2, [src, #64 + 24]
	vstr	d7, [dst, #64 + 32]
	add	src, src, #96
	vstr	d0, [dst, #64 + 40]
	vstr	d1, [dst, #64 + 48]
	vstr	d2, [dst, #64 + 56]
	add	dst, dst, #128
	add	tmp2, tmp2, #prefetch_lines * 64
	b	L(cpy_body_medium)
#else
	/* Long copy.  Use an SMS style loop to maximize the I/O
	   bandwidth of the core.  We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations.  */
	/* Pre-bias src and dst.  */
	sub	src, src, #8
	sub	dst, dst, #8
	pld	[src, #8]
	pld	[src, #72]
	subs	tmp2, tmp2, #64
	pld	[src, #136]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	pld	[src, #200]
	ldrd	D_l, D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #232]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldrd	D_l, D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldrd	D_l, D_h, [src, #32]
	bcs	2b
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #40
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	tst	tmp2, #0x3f
	bne	L(tail63aligned)
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
#endif

L(cpy_notaligned):
	pld	[src]
	pld	[src, #64]
	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
	lsls	tmp2, dst, #29
	pld	[src, #(2 * 64)]
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrbne	tmp1, [src], #1
	ldrhcs	tmp2, [src], #2
	strbne	tmp1, [dst], #1
	strhcs	tmp2, [dst], #2
1:
	pld	[src, #(3 * 64)]
	subs	count, count, #64
	ldrlo	tmp2, [sp], #FRAME_SIZE
	blo	L(tail63unaligned)
	pld	[src, #(4 * 64)]

#ifdef USE_NEON
	vld1.8	{d0-d3}, [src]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	blo	2f
1:
	pld	[src, #(4 * 64)]
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
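	/* dst was brought to 64-bit alignment above, so the :64 hint supplied
	   by the ALIGN macro is valid for these stores; src may still be
	   unaligned, so its loads take no alignment hint.  */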
	vld1.8	{d0-d3}, [src]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bhs	1b
2:
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	ands	count, count, #0x3f
#else
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
	sub	src, src, #4
	sub	dst, dst, #8
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #(5 * 64) - (32 - 4)]
	strd	A_l, A_h, [dst, #40]
	ldr	A_l, [src, #36]
	ldr	A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldr	B_l, [src, #44]
	ldr	B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldr	C_l, [src, #52]
	ldr	C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldr	D_l, [src, #60]
	ldr	D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]
	bcs	2b

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #36
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	ands	count, tmp2, #0x3f
#endif
	ldr	tmp2, [sp], #FRAME_SIZE
	bne	L(tail63unaligned)
	bx	lr

END (__memcpy_arm)
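/* A minimal C-side sketch of how this routine is invoked, assuming the
   standard AAPCS calling convention (how the symbol is exported or aliased
   to plain memcpy depends on the surrounding build, not on this file):

       void *__memcpy_arm (void *dst, const void *src, size_t count);

   On entry r0 = dst, r1 = src, r2 = count; r0 is never written, so the
   original destination pointer is returned, as memcpy requires.  */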