/* Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

      * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

      * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

      * Neither the name of Linaro Limited nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */

	.syntax unified
	/* This implementation requires ARM state.  */
	.arm

#ifdef __ARM_NEON__

	.fpu	neon
	.arch	armv7-a
# define FRAME_SIZE	4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

	.arch	armv6
	.fpu	vfpv2
# define FRAME_SIZE	32
# define USE_VFP

#else
	.arch	armv6
# define FRAME_SIZE	32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif

#define PC_OFFSET	8	/* PC pipeline compensation.  */
#define INSN_SIZE	4

/* Call parameters.  */
#define dstin	r0
#define src	r1
#define count	r2

/* Locals.  */
#define tmp1	r3
#define dst	ip
#define tmp2	r10

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define	A_l	r2		/* Call-clobbered.  */
#define	A_h	r3		/* Call-clobbered.  */
#define	B_l	r4
#define	B_h	r5
#define	C_l	r6
#define	C_h	r7
#define	D_l	r8
#define	D_h	r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines	5
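
/* A note on the two macros that follow, summarising what the code does:
   each invocation copies one 64-byte line, alternating stores to dst
   with loads from src through d0-d2 and the caller-supplied \vreg, so a
   store is always retiring while the next load issues.  In cpy_line_vfp
   the extra load at "\base + prefetch_lines * 64 - 32" refills \vreg
   from exactly prefetch_lines lines ahead of the line being copied
   (src has already been advanced by 32 when the macro runs); this is
   the register-based prefetch described at .Lcpy_body_long below.  */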
#ifdef USE_VFP
	.macro	cpy_line_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm

	.macro	cpy_tail_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm
#endif

	.macro def_fn f p2align=0
	.text
	.p2align \p2align
	.global \f
	.type \f, %function
\f:
	.endm

def_fn memcpy p2align=6

	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
	cmp	count, #64
	bge	.Lcpy_not_short
	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

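/* Both variants of the tail below dispatch with a computed branch.  In
   ARM state a read of PC returns the address of the current instruction
   plus 8 (PC_OFFSET), so "add pc, pc, tmp1" branches to PC_OFFSET + tmp1
   bytes beyond the ADD itself; the preceding RSB folds in INSN_SIZE
   (halved in the word variant, whose offset is later doubled by LSL #1)
   so that a zero remainder jumps past the whole load/store sequence and
   a maximal remainder falls through to its first instruction.  For
   example, in the NEON variant a remainder of 56 bytes gives
   tmp1 = 52 - 56 = -4, landing on the first vld1.8, while a remainder
   of 0 gives tmp1 = 52, landing on the tst after the sequence.  */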
.Ltail63unaligned:
#ifdef USE_NEON
	and	tmp1, count, #0x38
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	vld1.8	{d0}, [src]!	/* 14 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 12 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 10 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 8 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 6 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 4 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 2 words to go.  */
	vst1.8	{d0}, [dst]!

	tst	count, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
#else
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	and	tmp1, count, #0x3c
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
	/* Jump directly into the sequence below at the correct offset.  */
	add	pc, pc, tmp1, lsl #1

	ldr	tmp1, [src, #-60]	/* 15 words to go.  */
	str	tmp1, [dst, #-60]

	ldr	tmp1, [src, #-56]	/* 14 words to go.  */
	str	tmp1, [dst, #-56]
	ldr	tmp1, [src, #-52]
	str	tmp1, [dst, #-52]

	ldr	tmp1, [src, #-48]	/* 12 words to go.  */
	str	tmp1, [dst, #-48]
	ldr	tmp1, [src, #-44]
	str	tmp1, [dst, #-44]

	ldr	tmp1, [src, #-40]	/* 10 words to go.  */
	str	tmp1, [dst, #-40]
	ldr	tmp1, [src, #-36]
	str	tmp1, [dst, #-36]

	ldr	tmp1, [src, #-32]	/* 8 words to go.  */
	str	tmp1, [dst, #-32]
	ldr	tmp1, [src, #-28]
	str	tmp1, [dst, #-28]

	ldr	tmp1, [src, #-24]	/* 6 words to go.  */
	str	tmp1, [dst, #-24]
	ldr	tmp1, [src, #-20]
	str	tmp1, [dst, #-20]

	ldr	tmp1, [src, #-16]	/* 4 words to go.  */
	str	tmp1, [dst, #-16]
	ldr	tmp1, [src, #-12]
	str	tmp1, [dst, #-12]

	ldr	tmp1, [src, #-8]	/* 2 words to go.  */
	str	tmp1, [dst, #-8]
	ldr	tmp1, [src, #-4]
	str	tmp1, [dst, #-4]
#endif

	lsls	count, count, #31
	ldrhcs	tmp1, [src], #2
	ldrbne	src, [src]	/* Src is dead, use as a scratch.  */
	strhcs	tmp1, [dst], #2
	strbne	src, [dst]
	bx	lr

.Lcpy_not_short:
	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str	tmp2, [sp, #-FRAME_SIZE]!
	and	tmp2, src, #7
	and	tmp1, dst, #7
	cmp	tmp1, tmp2
	bne	.Lcpy_notaligned

#ifdef USE_VFP
	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */
	vmov.f32	s0, s0
#endif

	/* SRC and DST have the same mutual 64-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring SRC and DST into full 64-bit alignment.  */
	lsls	tmp2, dst, #29
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src], #1
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst], #1

1:
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	blt	.Ltail63aligned

	cmp	tmp2, #512
	bge	.Lcpy_body_long

.Lcpy_body_medium:			/* Count in tmp2.  */
#ifdef USE_VFP
1:
	vldr	d0, [src, #0]
	subs	tmp2, tmp2, #64
	vldr	d1, [src, #8]
	vstr	d0, [dst, #0]
	vldr	d0, [src, #16]
	vstr	d1, [dst, #8]
	vldr	d1, [src, #24]
	vstr	d0, [dst, #16]
	vldr	d0, [src, #32]
	vstr	d1, [dst, #24]
	vldr	d1, [src, #40]
	vstr	d0, [dst, #32]
	vldr	d0, [src, #48]
	vstr	d1, [dst, #40]
	vldr	d1, [src, #56]
	vstr	d0, [dst, #48]
	add	src, src, #64
	vstr	d1, [dst, #56]
	add	dst, dst, #64
	bge	1b
	tst	tmp2, #0x3f
	beq	.Ldone

.Ltail63aligned:			/* Count in tmp2.  */
	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1

	vldr	d0, [src, #-56]	/* 14 words to go.  */
	vstr	d0, [dst, #-56]
	vldr	d0, [src, #-48]	/* 12 words to go.  */
	vstr	d0, [dst, #-48]
	vldr	d0, [src, #-40]	/* 10 words to go.  */
	vstr	d0, [dst, #-40]
	vldr	d0, [src, #-32]	/* 8 words to go.  */
	vstr	d0, [dst, #-32]
	vldr	d0, [src, #-24]	/* 6 words to go.  */
	vstr	d0, [dst, #-24]
	vldr	d0, [src, #-16]	/* 4 words to go.  */
	vstr	d0, [dst, #-16]
	vldr	d0, [src, #-8]	/* 2 words to go.  */
	vstr	d0, [dst, #-8]
#else
	sub	src, src, #8
	sub	dst, dst, #8
1:
	ldrd	A_l, A_h, [src, #8]
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #16]
	strd	A_l, A_h, [dst, #16]
	ldrd	A_l, A_h, [src, #24]
	strd	A_l, A_h, [dst, #24]
	ldrd	A_l, A_h, [src, #32]
	strd	A_l, A_h, [dst, #32]
	ldrd	A_l, A_h, [src, #40]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #48]
	strd	A_l, A_h, [dst, #48]
	ldrd	A_l, A_h, [src, #56]
	strd	A_l, A_h, [dst, #56]
	ldrd	A_l, A_h, [src, #64]!
	strd	A_l, A_h, [dst, #64]!
	subs	tmp2, tmp2, #64
	bge	1b
	tst	tmp2, #0x3f
	bne	1f
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
1:
	add	src, src, #8
	add	dst, dst, #8

.Ltail63aligned:			/* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 64-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */
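	/* For example, if 27 bytes remain then tmp2 holds 27 - 64 = -37,
	   and -37 & 0x3f is 27 again: masking the low six bits of a
	   two's-complement value is arithmetic modulo 64.  */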

	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	ldrd	A_l, A_h, [src, #-56]	/* 14 words to go.  */
	strd	A_l, A_h, [dst, #-56]
	ldrd	A_l, A_h, [src, #-48]	/* 12 words to go.  */
	strd	A_l, A_h, [dst, #-48]
	ldrd	A_l, A_h, [src, #-40]	/* 10 words to go.  */
	strd	A_l, A_h, [dst, #-40]
	ldrd	A_l, A_h, [src, #-32]	/* 8 words to go.  */
	strd	A_l, A_h, [dst, #-32]
	ldrd	A_l, A_h, [src, #-24]	/* 6 words to go.  */
	strd	A_l, A_h, [dst, #-24]
	ldrd	A_l, A_h, [src, #-16]	/* 4 words to go.  */
	strd	A_l, A_h, [dst, #-16]
	ldrd	A_l, A_h, [src, #-8]	/* 2 words to go.  */
	strd	A_l, A_h, [dst, #-8]

#endif
	tst	tmp2, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
	lsls	tmp2, tmp2, #31	/* Count (tmp2) now dead.  */
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src]
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst]

.Ldone:
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr

.Lcpy_body_long:			/* Count in tmp2.  */

	/* Long copy.  We know that there are at least
	   (prefetch_lines * 64) bytes to go.  */
#ifdef USE_VFP
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */

	vldr	d3, [src, #0]
	vldr	d4, [src, #64]
	vldr	d5, [src, #128]
	vldr	d6, [src, #192]
	vldr	d7, [src, #256]

	vldr	d0, [src, #8]
	vldr	d1, [src, #16]
	vldr	d2, [src, #24]
	add	src, src, #32

	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
	blt	2f
1:
	cpy_line_vfp	d3, 0
	cpy_line_vfp	d4, 64
	cpy_line_vfp	d5, 128
	add	dst, dst, #3 * 64
	add	src, src, #3 * 64
	cpy_line_vfp	d6, 0
	cpy_line_vfp	d7, 64
	add	dst, dst, #2 * 64
	add	src, src, #2 * 64
	subs	tmp2, tmp2, #prefetch_lines * 64
	bge	1b

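	/* On exit from the loop above, d3-d7 still hold doublewords that
	   were loaded up to prefetch_lines * 64 bytes ahead of the stores
	   (tmp2 was pre-biased by twice prefetch_lines * 64 to keep those
	   reads in bounds).  The tail below drains the remaining five
	   lines using cpy_tail_vfp, which is cpy_line_vfp without the far
	   prefetch load, so no further reads run ahead of the data left
	   to copy.  */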
2:
	cpy_tail_vfp	d3, 0
	cpy_tail_vfp	d4, 64
	cpy_tail_vfp	d5, 128
	add	src, src, #3 * 64
	add	dst, dst, #3 * 64
	cpy_tail_vfp	d6, 0
	vstr	d7, [dst, #64]
	vldr	d7, [src, #64]
	vstr	d0, [dst, #64 + 8]
	vldr	d0, [src, #64 + 8]
	vstr	d1, [dst, #64 + 16]
	vldr	d1, [src, #64 + 16]
	vstr	d2, [dst, #64 + 24]
	vldr	d2, [src, #64 + 24]
	vstr	d7, [dst, #64 + 32]
	add	src, src, #96
	vstr	d0, [dst, #64 + 40]
	vstr	d1, [dst, #64 + 48]
	vstr	d2, [dst, #64 + 56]
	add	dst, dst, #128
	add	tmp2, tmp2, #prefetch_lines * 64
	b	.Lcpy_body_medium
#else
	/* Long copy.  Use an SMS style loop to maximize the I/O
	   bandwidth of the core.  We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations.  */
	/* Pre-bias src and dst.  */
	sub	src, src, #8
	sub	dst, dst, #8
	pld	[src, #8]
	pld	[src, #72]
	subs	tmp2, tmp2, #64
	pld	[src, #136]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	pld	[src, #200]
	ldrd	D_l, D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #232]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldrd	D_l, D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldrd	D_l, D_h, [src, #32]
	bcs	2b
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #40
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	tst	tmp2, #0x3f
	bne	.Ltail63aligned
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
#endif

.Lcpy_notaligned:
	pld	[src]
	pld	[src, #64]
	/* There are at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
	lsls	tmp2, dst, #29
	pld	[src, #(2 * 64)]
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrbne	tmp1, [src], #1
	ldrhcs	tmp2, [src], #2
	strbne	tmp1, [dst], #1
	strhcs	tmp2, [dst], #2
1:
	pld	[src, #(3 * 64)]
	subs	count, count, #64
	ldrmi	tmp2, [sp], #FRAME_SIZE
	bmi	.Ltail63unaligned
	pld	[src, #(4 * 64)]

#ifdef USE_NEON
	vld1.8	{d0-d3}, [src]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bmi	2f
1:
	pld	[src, #(4 * 64)]
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vld1.8	{d0-d3}, [src]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bpl	1b
2:
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	ands	count, count, #0x3f
#else
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
	sub	src, src, #4
	sub	dst, dst, #8
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #(5 * 64) - (32 - 4)]
	strd	A_l, A_h, [dst, #40]
	ldr	A_l, [src, #36]
	ldr	A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldr	B_l, [src, #44]
	ldr	B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldr	C_l, [src, #52]
	ldr	C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldr	D_l, [src, #60]
	ldr	D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]
	bcs	2b

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #36
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	ands	count, tmp2, #0x3f
#endif
	ldr	tmp2, [sp], #FRAME_SIZE
	bne	.Ltail63unaligned
	bx	lr

	.size	memcpy, . - memcpy
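
/* Illustrative build commands (assumed toolchain usage, not part of the
   original source): GCC predefines __ARM_NEON__ when NEON is available
   and __SOFTFP__ for -mfloat-abi=soft, so for example

       arm-none-eabi-gcc -c -mcpu=cortex-a15 -mfpu=neon -mfloat-abi=hard memcpy.S

   assembles the NEON path, while

       arm-none-eabi-gcc -c -march=armv6 -mfloat-abi=soft memcpy.S

   selects the GP-register-only path guarded by the #ifdefs above.  */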