/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2022 Michael T. Kloos <michael@michaelkloos.com>
 */

#include <linux/linkage.h>
#include <asm/asm.h>

SYM_FUNC_START(__memmove)
	/*
	 * Returns
	 *   a0 - dest
	 *
	 * Parameters
	 *   a0 - Inclusive first byte of dest
	 *   a1 - Inclusive first byte of src
	 *   a2 - Length of copy n
	 *
	 * Because the return value matches the parameter register a0,
	 * we will not clobber or modify that register.
	 *
	 * Note: This currently only works on little-endian.
	 * To port to big-endian, reverse the direction of shifts
	 * in the 2 misaligned fixup copy loops.
	 */

	/* Return if nothing to do */
	beq  a0, a1, .Lreturn_from_memmove
	beqz a2, .Lreturn_from_memmove

	/*
	 * Register Uses
	 *      Forward Copy: a1 - Index counter of src
	 *      Reverse Copy: a4 - Index counter of src
	 *      Forward Copy: t3 - Index counter of dest
	 *      Reverse Copy: t4 - Index counter of dest
	 *   Both Copy Modes: t5 - Inclusive first multibyte/aligned of dest
	 *   Both Copy Modes: t6 - Non-inclusive last multibyte/aligned of dest
	 *   Both Copy Modes: t0 - Link / Temporary for load-store
	 *   Both Copy Modes: t1 - Temporary for load-store
	 *   Both Copy Modes: t2 - Temporary for load-store
	 *   Both Copy Modes: a5 - dest to src alignment offset
	 *   Both Copy Modes: a6 - Shift amount
	 *   Both Copy Modes: a7 - Inverse shift amount
	 *   Both Copy Modes: a2 - Alternate breakpoint for unrolled loops
	 */

	/*
	 * Solve for some register values now.
	 * Byte copy does not need t5 or t6.
	 */
	mv   t3, a0
	add  t4, a0, a2
	add  a4, a1, a2

	/*
	 * Byte copy if copying less than (2 * SZREG) bytes. This can
	 * cause problems with the bulk copy implementation and is
	 * small enough not to bother.
	 */
	andi t0, a2, -(2 * SZREG)
	beqz t0, .Lbyte_copy

	/*
	 * Now solve for t5 and t6.
	 */
	andi t5, t3, -SZREG
	andi t6, t4, -SZREG
	/*
	 * If dest (register t3), rounded down to the nearest naturally
	 * aligned SZREG address, does not equal dest, then add SZREG
	 * to find the low bound of SZREG alignment in the dest memory
	 * region.  Note that this could overshoot the dest memory
	 * region if n is less than SZREG.  This is one reason why
	 * we always byte copy if n is less than SZREG.
	 * Otherwise, dest is already naturally aligned to SZREG.
	 */
	beq  t5, t3, 1f
		addi t5, t5, SZREG
	1:

	/*
	 * If the dest and src are co-aligned to SZREG, then there is
	 * no need for the full rigmarole of a misaligned fixup copy.
	 * Instead, do a simpler co-aligned copy.
	 */
	xor  t0, a0, a1
	andi t1, t0, (SZREG - 1)
	beqz t1, .Lcoaligned_copy
	/* Fall through to misaligned fixup copy */

.Lmisaligned_fixup_copy:
	bltu a1, a0, .Lmisaligned_fixup_copy_reverse

.Lmisaligned_fixup_copy_forward:
		jal  t0, .Lbyte_copy_until_aligned_forward

		andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */
		slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
		sub  a5, a1, t3 /* Find the difference between src and dest */
		andi a1, a1, -SZREG /* Align the src pointer */
		addi a2, t6, SZREG /* The other breakpoint for the unrolled loop */

		/*
		 * Compute the inverse shift
		 * a7 = XLEN - a6 = XLEN + -a6
		 * Two's complement negation to find the negative: -a6 = ~a6 + 1
		 * Add that to XLEN.  XLEN = SZREG * 8.
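		 *
		 * The "+ 1" from the two's complement negation is folded
		 * into the addi immediate below, so that
		 *	a7 = (~a6) + (SZREG * 8 + 1) = (SZREG * 8) - a6
		 * As a rough C sketch (names here are illustrative only):
		 *	shift     = (src & (SZREG - 1)) * 8;	// a6
		 *	inv_shift = (SZREG * 8) - shift;	// a7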
		 */
		not  a7, a6
		addi a7, a7, (SZREG * 8 + 1)

		/*
		 * Fix Misalignment Copy Loop - Forward
		 * load_val0 = load_ptr[0];
		 * do {
		 *	load_val1 = load_ptr[1];
		 *	store_ptr += 2;
		 *	store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7});
		 *
		 *	if (store_ptr == {a2})
		 *		break;
		 *
		 *	load_val0 = load_ptr[2];
		 *	load_ptr += 2;
		 *	store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7});
		 *
		 * } while (store_ptr != store_ptr_end);
		 * store_ptr = store_ptr_end;
		 */

		REG_L t0, (0 * SZREG)(a1)
	1:
		REG_L t1, (1 * SZREG)(a1)
		addi  t3, t3, (2 * SZREG)
		srl   t0, t0, a6
		sll   t2, t1, a7
		or    t2, t0, t2
		REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3)

		beq   t3, a2, 2f

		REG_L t0, (2 * SZREG)(a1)
		addi  a1, a1, (2 * SZREG)
		srl   t1, t1, a6
		sll   t2, t0, a7
		or    t2, t1, t2
		REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3)

		bne   t3, t6, 1b
	2:
		mv    t3, t6 /* Fix the dest pointer in case the loop was broken */

		add  a1, t3, a5 /* Restore the src pointer */
		j .Lbyte_copy_forward /* Copy any remaining bytes */

.Lmisaligned_fixup_copy_reverse:
		jal  t0, .Lbyte_copy_until_aligned_reverse

		andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */
		slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */
		sub  a5, a4, t4 /* Find the difference between src and dest */
		andi a4, a4, -SZREG /* Align the src pointer */
		addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop */

		/*
		 * Compute the inverse shift
		 * a7 = XLEN - a6 = XLEN + -a6
		 * Two's complement negation to find the negative: -a6 = ~a6 + 1
		 * Add that to XLEN.  XLEN = SZREG * 8.
		 */
		not  a7, a6
		addi a7, a7, (SZREG * 8 + 1)

		/*
		 * Fix Misalignment Copy Loop - Reverse
		 * load_val1 = load_ptr[0];
		 * do {
		 *	load_val0 = load_ptr[-1];
		 *	store_ptr -= 2;
		 *	store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7});
		 *
		 *	if (store_ptr == {a2})
		 *		break;
		 *
		 *	load_val1 = load_ptr[-2];
		 *	load_ptr -= 2;
		 *	store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7});
		 *
		 * } while (store_ptr != store_ptr_end);
		 * store_ptr = store_ptr_end;
		 */

		REG_L t1, ( 0 * SZREG)(a4)
	1:
		REG_L t0, (-1 * SZREG)(a4)
		addi  t4, t4, (-2 * SZREG)
		sll   t1, t1, a7
		srl   t2, t0, a6
		or    t2, t1, t2
		REG_S t2, ( 1 * SZREG)(t4)

		beq   t4, a2, 2f

		REG_L t1, (-2 * SZREG)(a4)
		addi  a4, a4, (-2 * SZREG)
		sll   t0, t0, a7
		srl   t2, t1, a6
		or    t2, t0, t2
		REG_S t2, ( 0 * SZREG)(t4)

		bne   t4, t5, 1b
	2:
		mv    t4, t5 /* Fix the dest pointer in case the loop was broken */

		add  a4, t4, a5 /* Restore the src pointer */
		j .Lbyte_copy_reverse /* Copy any remaining bytes */

/*
 * Simple copy loops for SZREG co-aligned memory locations.
 * These also make calls to do byte copies for any unaligned
 * data at their terminations.
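 *
 * As a rough C sketch of the forward co-aligned loop below (pointer
 * names are illustrative only; each element is SZREG bytes wide):
 *
 *	do {
 *		*dest_word++ = *src_word++;
 *	} while (dest_word != dest_aligned_end);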
 */
.Lcoaligned_copy:
	bltu a1, a0, .Lcoaligned_copy_reverse

.Lcoaligned_copy_forward:
		jal  t0, .Lbyte_copy_until_aligned_forward

	1:
		REG_L t1, ( 0 * SZREG)(a1)
		addi  a1, a1, SZREG
		addi  t3, t3, SZREG
		REG_S t1, (-1 * SZREG)(t3)
		bne   t3, t6, 1b

		j .Lbyte_copy_forward /* Copy any remaining bytes */

.Lcoaligned_copy_reverse:
		jal  t0, .Lbyte_copy_until_aligned_reverse

	1:
		REG_L t1, (-1 * SZREG)(a4)
		addi  a4, a4, -SZREG
		addi  t4, t4, -SZREG
		REG_S t1, ( 0 * SZREG)(t4)
		bne   t4, t5, 1b

		j .Lbyte_copy_reverse /* Copy any remaining bytes */

/*
 * These are basically sub-functions within the function.  They
 * are used to byte copy until the dest pointer is in alignment,
 * at which point a bulk copy method can be used by the
 * calling code.  These work on the same registers as the bulk
 * copy loops.  Therefore, the register values can be picked
 * up from where they were left, and we avoid code duplication
 * without any overhead except the call-in and return jumps.
 */
.Lbyte_copy_until_aligned_forward:
	beq  t3, t5, 2f
	1:
	lb   t1, 0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb   t1, -1(t3)
	bne  t3, t5, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

.Lbyte_copy_until_aligned_reverse:
	beq  t4, t6, 2f
	1:
	lb   t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb   t1, 0(t4)
	bne  t4, t6, 1b
	2:
	jalr zero, 0x0(t0) /* Return to multibyte copy loop */

/*
 * Simple byte copy loops.
 * These will byte copy until they reach the end of the data to copy.
 * At that point, they return from memmove.
 */
.Lbyte_copy:
	bltu a1, a0, .Lbyte_copy_reverse

.Lbyte_copy_forward:
	beq  t3, t4, 2f
	1:
	lb   t1, 0(a1)
	addi a1, a1, 1
	addi t3, t3, 1
	sb   t1, -1(t3)
	bne  t3, t4, 1b
	2:
	ret

.Lbyte_copy_reverse:
	beq  t4, t3, 2f
	1:
	lb   t1, -1(a4)
	addi a4, a4, -1
	addi t4, t4, -1
	sb   t1, 0(t4)
	bne  t4, t3, 1b
	2:

.Lreturn_from_memmove:
	ret

SYM_FUNC_END(__memmove)
SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
SYM_FUNC_ALIAS(__pi_memmove, __memmove)
SYM_FUNC_ALIAS(__pi___memmove, __memmove)
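
/*
 * For readers: an illustrative C model of the dispatch implemented
 * above (a sketch only, not part of the build; the helper names are
 * made up and do not exist in the kernel):
 *
 *	void *memmove_model(void *dest, const void *src, size_t n)
 *	{
 *		uintptr_t d = (uintptr_t)dest, s = (uintptr_t)src;
 *
 *		if (d == s || n == 0)
 *			return dest;
 *		if (n < 2 * SZREG)
 *			byte_copy(dest, src, n);
 *		else if (((d ^ s) & (SZREG - 1)) == 0)
 *			coaligned_copy(dest, src, n);
 *		else
 *			misaligned_fixup_copy(dest, src, n);
 *		return dest;
 *	}
 *
 * Each helper runs its reverse loop when src < dest and its forward
 * loop otherwise, so overlapping regions are copied safely.
 */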