/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include "asmdefs.h"

/* Register roles.  dstin/src/count are the three incoming arguments
   (x0, x1, x2).  Note the deliberate register sharing further down:
     - tmp1 shares x14 with E_l, so tmp1 must be dead before E_l is loaded.
     - G_l/G_h/H_l/H_h reuse count/dst/src/srcend, so once they are loaded
       those values are gone; only dstin and dstend remain usable as
       addresses afterwards.  */
#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small, simple and improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.

   Register interface, as used by the code below:
     In:       x0 = dstin (destination), x1 = src (source), x2 = count (bytes).
     Out:      x0 is never written, so it still holds dstin on return.
     Modified: x1-x17 and the condition flags may be overwritten, depending
               on the path taken.
     Stack:    not accessed.

   ENTRY_ALIAS, ENTRY, END and L() are macros supplied by asmdefs.h; their
   definitions are not visible in this file.  */

ENTRY_ALIAS (__memmove_aarch64)
ENTRY (__memcpy_aarch64)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
	/* 16..32 bytes: copy the first and the last 16 bytes; the two halves
	   overlap in the middle when count < 32.  Both loads are issued before
	   either store, so overlapping buffers are handled.  */
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  count < 16 here, so bit 3 distinguishes 8..15
	   from 0..7.  */
L(copy16):
	tbz	count, 3, L(copy8)
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes.  count < 8 here, so bit 2 distinguishes 4..7
	   from 0..3.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  For a non-zero count
	   the bytes at offsets 0, count/2 and count-1 are copied; these cover
	   every byte for count = 1, 2 or 3 (with duplicates for 1 and 2).  */
L(copy4):
	cbz	count, L(copy0)
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  The first 32 and the last 32 bytes
	   are loaded up front; for count <= 64 that already covers the whole
	   range.  */
L(copy32_128):
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  Every load is issued before the first store.
	   For count > 96 the G/H loads overwrite count, dst, src and srcend
	   (see the register aliases above), so all remaining stores address
	   memory through dstin and dstend only.  */
L(copy128):
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Use backwards copy if there is an overlap.  The unsigned test
	   (dstin - src) < count is true when dstin lies inside
	   [src, src + count).  If dstin == src there is nothing to do.  */
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment.  src is moved
	   back by the same amount as dst so both keep the same relative
	   offset.  */

	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)

	/* Each iteration stores the 64 bytes loaded by the previous one and
	   loads the next 64; src and dst advance through the writeback forms
	   on the last load/store of the group.  */
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dstend to 16-byte alignment.  srcend is
	   moved back by the same amount so both keep the same relative
	   offset.  */
L(copy_long_backwards):
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

	/* Mirror image of loop64: walks downwards from the end, storing the
	   previous 64 bytes while loading the next 64.  */
L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  The G
	   load overwrites count and dst, neither of which is read again.  */
L(copy64_from_start):
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret

END (__memcpy_aarch64)