/*-
 * Copyright (c) 2018 Instituto de Pesquisas Eldorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the author nor the names of its contributors may
 *    be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/asm.h>

#define	BLOCK_SIZE_BITS		6
#define	BLOCK_SIZE		(1 << BLOCK_SIZE_BITS)
#define	BLOCK_SIZE_MASK		(BLOCK_SIZE - 1)

/* Minimum 8 byte alignment, to avoid cache-inhibited alignment faults. */
#ifndef ALIGN_MASK
#define	ALIGN_MASK		0x7
#endif

#define	MULTI_PHASE_THRESHOLD	512

#ifndef FN_NAME
#ifdef MEMMOVE
#define	FN_NAME	__memmove
WEAK_REFERENCE(__memmove, memmove);
#else
#define	FN_NAME	__bcopy
WEAK_REFERENCE(__bcopy, bcopy);
#endif
#endif

/*
 * r3: dst
 * r4: src
 * r5: len
 */

ENTRY(FN_NAME)
	cmpld	%r3, %r4		/* src == dst? nothing to do */
	beqlr-
	cmpdi	%r5, 0			/* len == 0? nothing to do */
	beqlr-

#ifdef MEMMOVE
	std	%r3, -8(%r1)		/* save dst */
#else	/* bcopy: swap src/dst */
	mr	%r0, %r3
	mr	%r3, %r4
	mr	%r4, %r0
#endif

	/* First check for relative alignment; if misaligned, copy one byte at a time. */
	andi.	%r8, %r3, ALIGN_MASK
	andi.	%r7, %r4, ALIGN_MASK
	cmpd	%r7, %r8
	bne	.Lunaligned

	cmpldi	%r5, MULTI_PHASE_THRESHOLD
	bge	.Lmulti_phase
	b	.Lfast_copy

.Lunaligned:
	/* forward or backward copy? */
	cmpd	%r4, %r3
	blt	.Lbackward_unaligned

	/* Just need to set up the increment and jump to the copy. */
	li	%r0, 1
	mtctr	%r5
	b	.Lsingle_1_loop

.Lbackward_unaligned:
	/* Advance src and dst to the last byte, set the decrement and jump to the copy. */
	add	%r3, %r3, %r5
	addi	%r3, %r3, -1
	add	%r4, %r4, %r5
	addi	%r4, %r4, -1
	li	%r0, -1
	mtctr	%r5
	b	.Lsingle_1_loop
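
/*
 * Fast path, used when src and dst have the same relative alignment and
 * len < MULTI_PHASE_THRESHOLD: copy single bytes until src is 16-byte
 * aligned (.Lalign / .Lbackward_align_loop), then two doublewords (16 bytes)
 * per iteration (.Lsingle_16_loop), then the remaining len & 15 bytes one
 * at a time (.Lsingle_1_loop).  Backward copies (dst above src) run the same
 * steps from the end of the buffers so that overlapping regions are handled
 * correctly.  .Lsingle_phase is also reentered as phase 3 of the multi-phase
 * copy below.
 */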
.Lfast_copy:
	/* align src */
	cmpd	%r4, %r3		/* forward or backward copy? */
	blt	.Lbackward_align

	.align 5
.Lalign:
	andi.	%r0, %r4, 15
	beq	.Lsingle_copy
	lbz	%r0, 0(%r4)
	addi	%r4, %r4, 1
	stb	%r0, 0(%r3)
	addi	%r3, %r3, 1
	addi	%r5, %r5, -1
	cmpdi	%r5, 0
	beq-	.Ldone
	b	.Lalign

.Lbackward_align:
	/* advance src and dst to end (past last byte) */
	add	%r3, %r3, %r5
	add	%r4, %r4, %r5
	.align 5
.Lbackward_align_loop:
	andi.	%r0, %r4, 15
	beq	.Lbackward_single_copy
	lbzu	%r0, -1(%r4)
	addi	%r5, %r5, -1
	stbu	%r0, -1(%r3)
	cmpdi	%r5, 0
	beq-	.Ldone
	b	.Lbackward_align_loop

.Lsingle_copy:
	/* forward copy */
	li	%r0, 1
	li	%r8, 16
	li	%r9, 0
	b	.Lsingle_phase

.Lbackward_single_copy:
	/* backward copy */
	li	%r0, -1
	li	%r8, -16
	li	%r9, -15
	/* point src and dst to last byte */
	addi	%r3, %r3, -1
	addi	%r4, %r4, -1

.Lsingle_phase:
	srdi.	%r6, %r5, 4		/* number of 16-bytes */
	beq	.Lsingle_1

	/* pre-adjustment */
	add	%r3, %r3, %r9
	add	%r4, %r4, %r9

	mtctr	%r6
	.align 5
.Lsingle_16_loop:
	ld	%r6, 0(%r4)
	ld	%r7, 8(%r4)
	add	%r4, %r4, %r8
	std	%r6, 0(%r3)
	std	%r7, 8(%r3)
	add	%r3, %r3, %r8
	bdnz	.Lsingle_16_loop

	/* post-adjustment */
	sub	%r3, %r3, %r9
	sub	%r4, %r4, %r9

.Lsingle_1:
	andi.	%r6, %r5, 0x0f		/* number of 1-bytes */
	beq	.Ldone			/* 1-bytes == 0? done */

	mtctr	%r6
	.align 5
.Lsingle_1_loop:
	lbz	%r6, 0(%r4)
	add	%r4, %r4, %r0		/* increment */
	stb	%r6, 0(%r3)
	add	%r3, %r3, %r0		/* increment */
	bdnz	.Lsingle_1_loop

.Ldone:
#ifdef MEMMOVE
	ld	%r3, -8(%r1)		/* restore dst */
#endif
	blr
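
/*
 * Multi-phase copy, used when src and dst have the same relative alignment
 * and len >= MULTI_PHASE_THRESHOLD.  The copy is split into three phases:
 *   phase 1: the bytes before (forward copy) or after (backward copy) the
 *            16-byte-aligned middle section, one byte at a time;
 *   phase 2: the aligned middle section in BLOCK_SIZE (64-byte) blocks,
 *            eight doublewords per iteration via indexed loads/stores;
 *   phase 3: the bytes at the other end, by jumping back into the
 *            single-phase code above.
 * The per-phase counts, the phase 3 increment/adjustment values, and the
 * non-volatile registers r14-r21 used by the phase 2 loop are staged at
 * negative offsets from %r1; no stack frame is allocated, relying on the
 * scratch area the 64-bit PowerPC ABIs provide below the stack pointer.
 */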
.Lmulti_phase:
	/* set up multi-phase copy parameters */

	/* r7 = bytes before the aligned section of the buffer */
	andi.	%r6, %r4, 15
	subfic	%r7, %r6, 16
	/* r8 = bytes in and after the aligned section of the buffer */
	sub	%r8, %r5, %r7
	/* r9 = bytes after the aligned section of the buffer */
	andi.	%r9, %r8, BLOCK_SIZE_MASK
	/* r10 = BLOCKS in the aligned section of the buffer */
	srdi	%r10, %r8, BLOCK_SIZE_BITS

	/* forward or backward copy? */
	cmpd	%r4, %r3
	blt	.Lbackward_multi_copy

	/* set up forward copy parameters */
	std	%r7, -32(%r1)		/* bytes to copy in phase 1 */
	std	%r10, -40(%r1)		/* BLOCKS to copy in phase 2 */
	std	%r9, -48(%r1)		/* bytes to copy in phase 3 */

	li	%r0, 1			/* increment for phases 1 and 3 */
	li	%r5, BLOCK_SIZE		/* increment for phase 2 */

	/* op offsets for phase 2 */
	li	%r7, 0
	li	%r8, 16
	li	%r9, 32
	li	%r10, 48

	std	%r8, -16(%r1)		/* 16-byte increment (16) */
	std	%r7, -24(%r1)		/* 16-byte pre/post adjustment (0) */

	b	.Lphase1

.Lbackward_multi_copy:
	/* set up backward copy parameters */
	std	%r9, -32(%r1)		/* bytes to copy in phase 1 */
	std	%r10, -40(%r1)		/* BLOCKS to copy in phase 2 */
	std	%r7, -48(%r1)		/* bytes to copy in phase 3 */

	li	%r0, -1			/* increment for phases 1 and 3 */
	add	%r6, %r5, %r0		/* r6 = len - 1 */
	li	%r5, -BLOCK_SIZE	/* increment for phase 2 */
	/* advance src and dst to the last position */
	add	%r3, %r3, %r6
	add	%r4, %r4, %r6

	/* op offsets for phase 2 */
	li	%r7, -15
	li	%r8, -31
	li	%r9, -47
	li	%r10, -63

	add	%r6, %r7, %r0		/* r6 = -16 */
	std	%r6, -16(%r1)		/* 16-byte increment (-16) */
	std	%r7, -24(%r1)		/* 16-byte pre/post adjustment (-15) */

.Lphase1:
	ld	%r6, -32(%r1)		/* bytes to copy in phase 1 */
	cmpldi	%r6, 0			/* r6 == 0? skip phase 1 */
	beq+	.Lphase2

	mtctr	%r6
	.align 5
.Lphase1_loop:
	lbz	%r6, 0(%r4)
	add	%r4, %r4, %r0		/* phase 1 increment */
	stb	%r6, 0(%r3)
	add	%r3, %r3, %r0		/* phase 1 increment */
	bdnz	.Lphase1_loop

.Lphase2:
	ld	%r6, -40(%r1)		/* BLOCKS to copy in phase 2 */
	cmpldi	%r6, 0			/* %r6 == 0? skip phase 2 */
	beq	.Lphase3

#ifdef FN_PHASE2
FN_PHASE2
#else
	/* save registers */
	std	%r14, -56(%r1)
	std	%r15, -64(%r1)
	std	%r16, -72(%r1)
	std	%r17, -80(%r1)
	std	%r18, -88(%r1)
	std	%r19, -96(%r1)
	std	%r20, -104(%r1)
	std	%r21, -112(%r1)

	addi	%r18, %r7, 8
	addi	%r19, %r8, 8
	addi	%r20, %r9, 8
	addi	%r21, %r10, 8

	mtctr	%r6
	.align 5
.Lphase2_loop:
	ldx	%r14, %r7, %r4
	ldx	%r15, %r18, %r4
	ldx	%r16, %r8, %r4
	ldx	%r17, %r19, %r4
	stdx	%r14, %r7, %r3
	stdx	%r15, %r18, %r3
	stdx	%r16, %r8, %r3
	stdx	%r17, %r19, %r3

	ldx	%r14, %r9, %r4
	ldx	%r15, %r20, %r4
	ldx	%r16, %r10, %r4
	ldx	%r17, %r21, %r4
	stdx	%r14, %r9, %r3
	stdx	%r15, %r20, %r3
	stdx	%r16, %r10, %r3
	stdx	%r17, %r21, %r3

	add	%r4, %r4, %r5		/* phase 2 increment */
	add	%r3, %r3, %r5		/* phase 2 increment */

	bdnz	.Lphase2_loop

	/* restore registers */
	ld	%r14, -56(%r1)
	ld	%r15, -64(%r1)
	ld	%r16, -72(%r1)
	ld	%r17, -80(%r1)
	ld	%r18, -88(%r1)
	ld	%r19, -96(%r1)
	ld	%r20, -104(%r1)
	ld	%r21, -112(%r1)
#endif

.Lphase3:
	/* load registers for transitioning into the single-phase logic */
	ld	%r5, -48(%r1)		/* bytes to copy in phase 3 */
	ld	%r8, -16(%r1)		/* 16-byte increment */
	ld	%r9, -24(%r1)		/* 16-byte pre/post adjustment */
	b	.Lsingle_phase

END(FN_NAME)

	.section .note.GNU-stack,"",%progbits