/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
 */

#include <machine/asm.h>

/*
 * void *memcpy(void *dst, const void *src, size_t len)
 *
 * a0 - void* dst
 * a1 - const void* src
 * a2 - size_t len
 *
 * Returns the original dst in a0.  Only a0-a6 and t0-t3 are written and
 * the stack is not used.
 *
 * Strategy:
 *   1. len < 8: copy everything with the byte table at .Lend.
 *   2. Otherwise copy (-dst) & 7 bytes with the byte table at
 *      .Lduff_start so that dst becomes 8-byte aligned.
 *   3. If src is then 8-byte aligned too, copy dwords (.Lmemcpy8);
 *      otherwise assemble each destination dword from two aligned source
 *      dwords with shifts (.Llmain).
 *   4. Copy the remaining len % 8 bytes with the byte table at .Lend.
 *
 * Both byte tables are entered through a computed jump, which requires
 * every lb/sb pair to occupy exactly 8 bytes.  They are therefore
 * assembled with compressed instructions disabled.
 */
ENTRY(memcpy)
	beqz a2, .Lreturn

	/* diff = (dstv - srcv) & 0b111 */
	sub t0, a0, a1
	andi t0, t0, 0b111

	sltiu t1, a2, 8

	/* we never change a0, because memcpy returns the original dst */
	mv a3, a0

	/* len < 8 */
	bnez t1, .Lend

	/* t1 = (-dst) & 0b111, the number of bytes needed to align dst */
	neg t1, a0
	andi t1, t1, 0b111

	/* len >= 8 here, so this cannot underflow */
	sub a2, a2, t1

	/*
	 * Jump t1 lb/sb pairs (8 bytes of code each) before .Lduff_start.
	 * lla is used because the label is local to this file, so a
	 * pc-relative address is always sufficient.
	 */
	lla t2, .Lduff_start
	slli t3, t1, 3
	sub t2, t2, t3
	jr t2
	.option push
	.option norvc
	lb t3, 6(a1)
	sb t3, 6(a3)
	lb t3, 5(a1)
	sb t3, 5(a3)
	lb t3, 4(a1)
	sb t3, 4(a3)
	lb t3, 3(a1)
	sb t3, 3(a3)
	lb t3, 2(a1)
	sb t3, 2(a3)
	lb t3, 1(a1)
	sb t3, 1(a3)
	lb t3, 0(a1)
	sb t3, 0(a3)
	.option pop
.Lduff_start:

	add a1, a1, t1
	add a3, a3, t1

	/* a3 (dst) is now 8-byte aligned */

	beqz a2, .Lreturn

	/* diff == 0 means src is now 8-byte aligned as well */
	beqz t0, .Lmemcpy8

	/*
	 * a4 - size_t right_shift
	 * a5 - size_t left_shift
	 * a6 - size_t whole (number of dword stores)
	 */

	/* right_shift = (src & 0b111) * 8 */
	andi a4, a1, 0b111
	slli a4, a4, 3

	/*
	 * left_shift = 64 - right_shift
	 * sll only uses the low 6 bits of the shift amount, so -right_shift
	 * is equivalent.
	 */
	neg a5, a4

	/* whole = len / 8 */
	srli a6, a2, 3

	/* len = len % 8 */
	andi a2, a2, 0b111

	/* t0 - uint64_t* ptr */

	/* ptr = src & ~0b111 */
	andi t0, a1, ~0b111

	/* src += whole * 8, so src is ready for the byte tail at .Lend */
	slli t1, a6, 3
	add a1, a1, t1

	/*
	 * t1 - uint64_t low
	 * t2 - uint64_t high
	 *
	 * Every ld below reads an aligned dword that contains at least one
	 * byte of the source buffer.
	 */

	/* low = *ptr++ */
	ld t1, (t0)
	addi t0, t0, 8

	/* low >>= right_shift */
	srl t1, t1, a4

	beqz a6, .Llmain_skip
.Llmain:
	/* high = *ptr++ */
	ld t2, (t0)
	addi t0, t0, 8

	/* whole-- */
	addi a6, a6, -1

	/* temp = (high << left_shift) | low */
	sll t3, t2, a5
	or t3, t3, t1

	/* low = high >> right_shift */
	srl t1, t2, a4

	/* *dst++ = temp */
	sd t3, (a3)
	addi a3, a3, 8

	bnez a6, .Llmain

.Llmain_skip:

	/*
	 * Copy the last a2 (0..7) bytes from a1 to a3 by jumping a2 lb/sb
	 * pairs (8 bytes of code each) before .Lduff_end.
	 */
.Lend:
	lla t1, .Lduff_end
	slli t2, a2, 3
	sub t1, t1, t2
	jr t1
	.option push
	.option norvc
	lb t2, 6(a1)
	sb t2, 6(a3)
	lb t2, 5(a1)
	sb t2, 5(a3)
	lb t2, 4(a1)
	sb t2, 4(a3)
	lb t2, 3(a1)
	sb t2, 3(a3)
	lb t2, 2(a1)
	sb t2, 2(a3)
	lb t2, 1(a1)
	sb t2, 1(a3)
	lb t2, 0(a1)
	sb t2, 0(a3)
	.option pop
.Lduff_end:

.Lreturn:
	ret

/*
 * executed when dst - src is a multiple of 8
 * a0 - void* dst (original, returned unchanged)
 * a1 - const void* src (8-byte aligned)
 * a2 - size_t len (non-zero, checked by the only branch to this label)
 * a3 - void* dst cursor (8-byte aligned)
 */
.Lmemcpy8:

	/* len is a size_t, so compare unsigned */
	sltiu t0, a2, 128
	bnez t0, .Llmain8_64_skip

	/* a4 - uint64_t* end_unroll */

	/* end_unroll = dst + len / 64 * 64 */
	andi t0, a2, ~0b111111
	add a4, a3, t0

	/* len = len % 64 */
	andi a2, a2, 0b111111

	/* len >= 128 here, so the loop body runs at least twice */
.Llmain8_64:
	ld t0, 0(a1)
	ld t1, 8(a1)
	ld t2, 16(a1)
	ld t3, 24(a1)
	sd t0, 0(a3)
	sd t1, 8(a3)
	sd t2, 16(a3)
	sd t3, 24(a3)
	ld t0, 32(a1)
	ld t1, 40(a1)
	ld t2, 48(a1)
	ld t3, 56(a1)
	sd t0, 32(a3)
	sd t1, 40(a3)
	sd t2, 48(a3)
	sd t3, 56(a3)
	addi a3, a3, 64
	addi a1, a1, 64
	bne a3, a4, .Llmain8_64
.Llmain8_64_skip:

	beqz a2, .Lreturn

	/* a4 - uint64_t* end_align */

	/* end_align = (dst + len) & ~0b111 */
	add a4, a3, a2
	andi a4, a4, ~0b111

	/* len = len % 8 */
	andi a2, a2, 0b111

	beq a3, a4, .Llmain8_skip
.Llmain8:
	ld t0, (a1)
	sd t0, (a3)
	addi a3, a3, 8
	addi a1, a1, 8
	bne a3, a4, .Llmain8
.Llmain8_skip:

	/* copy the last len (0..7) bytes through the table ending at .Lduff_end */
	lla t1, .Lduff_end
	slli t2, a2, 3
	sub t1, t1, t2
	jr t1
END(memcpy)