/*
 * Copyright (C) 2002 Paul Mackerras, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/ppc_asm.h>

	.align	7
_GLOBAL(memcpy)
	mtcrf	0x01,r5		# low 4 bits of length into cr7
	cmpldi	cr1,r5,16	# is this a short (< 16 byte) copy?
	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry
	andi.	r6,r6,7
	dcbt	0,r4		# prefetch the start of the source
	blt	cr1,.Lshort_copy
	bne	.Ldst_unaligned
.Ldst_aligned:			# destination is now 8-byte aligned
	andi.	r0,r4,7
	addi	r3,r3,-16
	bne	.Lsrc_unaligned
	srdi	r7,r5,4
	ld	r9,0(r4)
	addi	r4,r4,-8
	mtctr	r7
	andi.	r5,r5,7
	bf	cr7*4+0,2f
	addi	r3,r3,8
	addi	r4,r4,8
	mr	r8,r9
	blt	cr1,3f
1:	ld	r9,8(r4)
	std	r8,8(r3)
2:	ldu	r8,16(r4)
	stdu	r9,16(r3)
	bdnz	1b
3:	std	r8,8(r3)
	beqlr
	addi	r3,r3,16
	ld	r9,8(r4)
.Ldo_tail:			# store the remaining 1-7 bytes from r9
	bf	cr7*4+1,1f
	rotldi	r9,r9,32
	stw	r9,0(r3)
	addi	r3,r3,4
1:	bf	cr7*4+2,2f
	rotldi	r9,r9,16
	sth	r9,0(r3)
	addi	r3,r3,2
2:	bf	cr7*4+3,3f
	rotldi	r9,r9,8
	stb	r9,0(r3)
3:	blr

.Lsrc_unaligned:		# dest is 8-byte aligned, source is not
	srdi	r6,r5,3
	addi	r5,r5,-16
	subf	r4,r0,r4
	srdi	r7,r5,4
	sldi	r10,r0,3
	cmpdi	cr6,r6,3
	andi.	r5,r5,7
	mtctr	r7
	subfic	r11,r10,64
	add	r5,r5,r0

	bt	cr7*4+0,0f

	ld	r9,0(r4)	# 3+2n loads, 2+2n stores
	ld	r0,8(r4)
	sld	r6,r9,r10
	ldu	r9,16(r4)
	srd	r7,r0,r11
	sld	r8,r0,r10
	or	r7,r7,r6
	blt	cr6,4f
	ld	r0,8(r4)
	# s1<< in r8, d0=(s0<<|s1>>) in r7, s3 in r0, s2 in r9, nix in r6 & r12
	b	2f

0:	ld	r0,0(r4)	# 4+2n loads, 3+2n stores
	ldu	r9,8(r4)
	sld	r8,r0,r10
	addi	r3,r3,-8
	blt	cr6,5f
	ld	r0,8(r4)
	srd	r12,r9,r11
	sld	r6,r9,r10
	ldu	r9,16(r4)
	or	r12,r8,r12
	srd	r7,r0,r11
	sld	r8,r0,r10
	addi	r3,r3,16
	beq	cr6,3f

	# d0=(s0<<|s1>>) in r12, s1<< in r6, s2>> in r7, s2<< in r8, s3 in r9
1:	or	r7,r7,r6
	ld	r0,8(r4)
	std	r12,8(r3)
2:	srd	r12,r9,r11
	sld	r6,r9,r10
	ldu	r9,16(r4)
	or	r12,r8,r12
	stdu	r7,16(r3)
	srd	r7,r0,r11
	sld	r8,r0,r10
	bdnz	1b

3:	std	r12,8(r3)
	or	r7,r7,r6
4:	std	r7,16(r3)
5:	srd	r12,r9,r11
	or	r12,r8,r12
	std	r12,24(r3)
	beqlr
	cmpwi	cr1,r5,8
	addi	r3,r3,32
	sld	r9,r9,r10
	ble	cr1,.Ldo_tail
	ld	r0,8(r4)
	srd	r7,r0,r11
	or	r9,r7,r9
	b	.Ldo_tail

.Ldst_unaligned:
	mtcrf	0x01,r6		# put #bytes to 8B bdry into cr7
	subf	r5,r6,r5
	li	r7,0
	cmpldi	cr1,r5,16
	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	stb	r0,0(r3)
	addi	r7,r7,1
1:	bf	cr7*4+2,2f
	lhzx	r0,r7,r4
	sthx	r0,r7,r3
	addi	r7,r7,2
2:	bf	cr7*4+1,3f
	lwzx	r0,r7,r4
	stwx	r0,r7,r3
3:	mtcrf	0x01,r5
	add	r4,r6,r4
	add	r3,r6,r3
	b	.Ldst_aligned

.Lshort_copy:			# copy of less than 16 bytes
	bf	cr7*4+0,1f
	lwz	r0,0(r4)
	lwz	r9,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r9,4(r3)
	addi	r3,r3,8
1:	bf	cr7*4+1,2f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4
2:	bf	cr7*4+2,3f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2
3:	bf	cr7*4+3,4f
	lbz	r0,0(r4)
	stb	r0,0(r3)
4:	blr
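
/*
 * Worked example (illustration only) of the shift-and-merge scheme used in
 * .Lsrc_unaligned above, assuming a big-endian copy whose source starts
 * 3 bytes past an 8-byte boundary, i.e. r0 = 3:
 *
 *	r10 = r0 * 8  = 24	(sldi  r10,r0,3)
 *	r11 = 64 - 24 = 40	(subfic r11,r10,64)
 *
 * With s0 and s1 the first two aligned source doublewords, the first
 * destination doubleword is assembled as
 *
 *	d0 = (s0 << 24) | (s1 >> 40)
 *
 * which is the sld/srd/or pattern in the main loop: each aligned source
 * doubleword contributes its two shifted halves to adjacent destination
 * doublewords.
 */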