/*-
 * Copyright (c) 2018 Instituto de Pesquisas Eldorado
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the author nor the names of its contributors may
 *    be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <machine/asm.h>
__FBSDID("$FreeBSD$");

#define	BLOCK_SIZE_BITS		6
#define	BLOCK_SIZE		(1 << BLOCK_SIZE_BITS)
#define	BLOCK_SIZE_MASK		(BLOCK_SIZE - 1)

/* Minimum 8-byte alignment, to avoid cache-inhibited alignment faults. */
#ifndef ALIGN_MASK
#define	ALIGN_MASK		0x7
#endif

#define	MULTI_PHASE_THRESHOLD	512

#ifndef FN_NAME
#ifdef MEMMOVE
#define	FN_NAME	__memmove
WEAK_REFERENCE(__memmove, memmove);
#else
#define	FN_NAME	__bcopy
WEAK_REFERENCE(__bcopy, bcopy);
#endif
#endif

/*
 * r3: dst
 * r4: src
 * r5: len
 */
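
/*
 * For reference, a C-level sketch of the entry points this file provides
 * (illustrative only, not part of the build; the standard prototypes are
 * assumed):
 *
 *	void	*memmove(void *dst, const void *src, size_t len);
 *	void	 bcopy(const void *src, void *dst, size_t len);
 *
 * memmove takes (dst, src) while bcopy takes (src, dst), which is why the
 * bcopy entry point below swaps r3 and r4 before sharing the copy logic.
 */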

ENTRY(FN_NAME)
	cmpld	%r3, %r4	/* src == dst? nothing to do */
	beqlr-
	cmpdi	%r5, 0		/* len == 0? nothing to do */
	beqlr-

#ifdef MEMMOVE
	std	%r3, -8(%r1)	/* save dst */
#else	/* bcopy: swap src/dst */
	mr	%r0, %r3
	mr	%r3, %r4
	mr	%r4, %r0
#endif

	/* Check relative alignment; if src and dst are not mutually aligned,
	 * copy one byte at a time. */
	andi.	%r8, %r3, ALIGN_MASK
	andi.	%r7, %r4, ALIGN_MASK
	cmpd	%r7, %r8
	bne	.Lunaligned


	cmpldi	%r5, MULTI_PHASE_THRESHOLD
	bge	.Lmulti_phase
	b	.Lfast_copy

.Lunaligned:
	/* forward or backward copy? */
	cmpd	%r4, %r3
	blt	.Lbackward_unaligned

	/* Just need to set up the increment and jump to the copy */
	li	%r0, 1
	mtctr	%r5
	b	.Lsingle_1_loop

.Lbackward_unaligned:
	/* advance src and dst to the last byte, set decrement and jump to the copy */
	add	%r3, %r3, %r5
	addi	%r3, %r3, -1
	add	%r4, %r4, %r5
	addi	%r4, %r4, -1
	li	%r0, -1
	mtctr	%r5
	b	.Lsingle_1_loop

.Lfast_copy:
	/* align src */
	cmpd	%r4, %r3	/* forward or backward copy? */
	blt	.Lbackward_align

	.align 5
.Lalign:
	andi.	%r0, %r4, 15
	beq	.Lsingle_copy
	lbz	%r0, 0(%r4)
	addi	%r4, %r4, 1
	stb	%r0, 0(%r3)
	addi	%r3, %r3, 1
	addi	%r5, %r5, -1
	cmpdi	%r5, 0
	beq-	.Ldone
	b	.Lalign

.Lbackward_align:
	/* advance src and dst to the end (past the last byte) */
	add	%r3, %r3, %r5
	add	%r4, %r4, %r5
	.align 5
.Lbackward_align_loop:
	andi.	%r0, %r4, 15
	beq	.Lbackward_single_copy
	lbzu	%r0, -1(%r4)
	addi	%r5, %r5, -1
	stbu	%r0, -1(%r3)
	cmpdi	%r5, 0
	beq-	.Ldone
	b	.Lbackward_align_loop

.Lsingle_copy:
	/* forward copy */
	li	%r0, 1
	li	%r8, 16
	li	%r9, 0
	b	.Lsingle_phase

.Lbackward_single_copy:
	/* backward copy */
	li	%r0, -1
	li	%r8, -16
	li	%r9, -15
	/* point src and dst to the last byte */
	addi	%r3, %r3, -1
	addi	%r4, %r4, -1

.Lsingle_phase:
	srdi.	%r6, %r5, 4	/* number of 16-byte chunks */
	beq	.Lsingle_1

	/* pre-adjustment */
	add	%r3, %r3, %r9
	add	%r4, %r4, %r9

	mtctr	%r6
	.align 5
.Lsingle_16_loop:
	ld	%r6, 0(%r4)
	ld	%r7, 8(%r4)
	add	%r4, %r4, %r8
	std	%r6, 0(%r3)
	std	%r7, 8(%r3)
	add	%r3, %r3, %r8
	bdnz	.Lsingle_16_loop

	/* post-adjustment */
	sub	%r3, %r3, %r9
	sub	%r4, %r4, %r9

.Lsingle_1:
	andi.	%r6, %r5, 0x0f	/* number of single bytes */
	beq	.Ldone		/* single bytes == 0? done */

	mtctr	%r6
	.align 5
.Lsingle_1_loop:
	lbz	%r6, 0(%r4)
	add	%r4, %r4, %r0	/* increment */
	stb	%r6, 0(%r3)
	add	%r3, %r3, %r0	/* increment */
	bdnz	.Lsingle_1_loop

.Ldone:
#ifdef MEMMOVE
	ld	%r3, -8(%r1)	/* restore dst */
#endif
	blr
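
/*
 * Copies of MULTI_PHASE_THRESHOLD bytes or more take the multi-phase path
 * below.  A rough C-level sketch of how the work is split (forward
 * direction; illustrative only, the variable names are not from the code):
 *
 *	head   = 16 - ((uintptr_t)src & 15);	  // phase 1: byte copy until
 *						  //  src is 16-byte aligned
 *	blocks = (len - head) >> BLOCK_SIZE_BITS; // phase 2: 64-byte blocks
 *	tail   = (len - head) & BLOCK_SIZE_MASK;  // phase 3: remainder
 *
 * Phase 2 moves each block with eight ldx/stdx pairs per iteration, and
 * phase 3 hands the leftover bytes back to the single-phase logic above.
 * Backward copies run the same three phases with negative increments and
 * with the phase 1/phase 3 byte counts swapped.
 */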

.Lmulti_phase:
	/* set up multi-phase copy parameters */

	/* r7 = bytes before the aligned section of the buffer */
	andi.	%r6, %r4, 15
	subfic	%r7, %r6, 16
	/* r8 = bytes in and after the aligned section of the buffer */
	sub	%r8, %r5, %r7
	/* r9 = bytes after the aligned section of the buffer */
	andi.	%r9, %r8, BLOCK_SIZE_MASK
	/* r10 = BLOCKS in the aligned section of the buffer */
	srdi	%r10, %r8, BLOCK_SIZE_BITS

	/* forward or backward copy? */
	cmpd	%r4, %r3
	blt	.Lbackward_multi_copy

	/* set up forward copy parameters */
	std	%r7,  -32(%r1)	/* bytes to copy in phase 1 */
	std	%r10, -40(%r1)	/* BLOCKS to copy in phase 2 */
	std	%r9,  -48(%r1)	/* bytes to copy in phase 3 */

	li	%r0, 1		/* increment for phases 1 and 3 */
	li	%r5, BLOCK_SIZE	/* increment for phase 2 */

	/* op offsets for phase 2 */
	li	%r7,  0
	li	%r8,  16
	li	%r9,  32
	li	%r10, 48

	std	%r8, -16(%r1)	/* 16-byte increment (16) */
	std	%r7, -24(%r1)	/* 16-byte pre/post adjustment (0) */

	b	.Lphase1

.Lbackward_multi_copy:
	/* set up backward copy parameters */
	std	%r9,  -32(%r1)	/* bytes to copy in phase 1 */
	std	%r10, -40(%r1)	/* BLOCKS to copy in phase 2 */
	std	%r7,  -48(%r1)	/* bytes to copy in phase 3 */

	li	%r0, -1		/* increment for phases 1 and 3 */
	add	%r6, %r5, %r0	/* r6 = len - 1 */
	li	%r5, -BLOCK_SIZE	/* increment for phase 2 */
	/* advance src and dst to the last position */
	add	%r3, %r3, %r6
	add	%r4, %r4, %r6

	/* op offsets for phase 2 */
	li	%r7,  -15
	li	%r8,  -31
	li	%r9,  -47
	li	%r10, -63

	add	%r6, %r7, %r0	/* r6 = -16 */
	std	%r6, -16(%r1)	/* 16-byte increment (-16) */
	std	%r7, -24(%r1)	/* 16-byte pre/post adjustment (-15) */

.Lphase1:
	ld	%r6, -32(%r1)	/* bytes to copy in phase 1 */
	cmpldi	%r6, 0		/* r6 == 0? skip phase 1 */
	beq+	.Lphase2

	mtctr	%r6
	.align 5
.Lphase1_loop:
	lbz	%r6, 0(%r4)
	add	%r4, %r4, %r0	/* phase 1 increment */
	stb	%r6, 0(%r3)
	add	%r3, %r3, %r0	/* phase 1 increment */
	bdnz	.Lphase1_loop

.Lphase2:
	ld	%r6, -40(%r1)	/* BLOCKS to copy in phase 2 */
	cmpldi	%r6, 0		/* r6 == 0? skip phase 2 */
	beq	.Lphase3

#ifdef FN_PHASE2
FN_PHASE2
#else
	/* save registers */
	std	%r14, -56(%r1)
	std	%r15, -64(%r1)
	std	%r16, -72(%r1)
	std	%r17, -80(%r1)
	std	%r18, -88(%r1)
	std	%r19, -96(%r1)
	std	%r20, -104(%r1)
	std	%r21, -112(%r1)

	addi	%r18, %r7, 8
	addi	%r19, %r8, 8
	addi	%r20, %r9, 8
	addi	%r21, %r10, 8

	mtctr	%r6
	.align 5
.Lphase2_loop:
	ldx	%r14, %r7, %r4
	ldx	%r15, %r18, %r4
	ldx	%r16, %r8, %r4
	ldx	%r17, %r19, %r4
	stdx	%r14, %r7, %r3
	stdx	%r15, %r18, %r3
	stdx	%r16, %r8, %r3
	stdx	%r17, %r19, %r3

	ldx	%r14, %r9, %r4
	ldx	%r15, %r20, %r4
	ldx	%r16, %r10, %r4
	ldx	%r17, %r21, %r4
	stdx	%r14, %r9, %r3
	stdx	%r15, %r20, %r3
	stdx	%r16, %r10, %r3
	stdx	%r17, %r21, %r3

	add	%r4, %r4, %r5	/* phase 2 increment */
	add	%r3, %r3, %r5	/* phase 2 increment */

	bdnz	.Lphase2_loop

	/* restore registers */
	ld	%r14, -56(%r1)
	ld	%r15, -64(%r1)
	ld	%r16, -72(%r1)
	ld	%r17, -80(%r1)
	ld	%r18, -88(%r1)
	ld	%r19, -96(%r1)
	ld	%r20, -104(%r1)
	ld	%r21, -112(%r1)
#endif

.Lphase3:
	/* load registers for transitioning into the single-phase logic */
	ld	%r5, -48(%r1)	/* bytes to copy in phase 3 */
	ld	%r8, -16(%r1)	/* 16-byte increment */
	ld	%r9, -24(%r1)	/* 16-byte pre/post adjustment */
	b	.Lsingle_phase

END(FN_NAME)

	.section .note.GNU-stack,"",%progbits