/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
 */

	.file	"memcpy.s"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *       Memmove() and bcopy() do.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *			char *s1 = s;
 *			const char *s2 = s0;
 *			do {
 *				*s1++ = *s2++;
 *			} while (--n != 0);
 *		}
 *		return (s);
 *	}
 */

#include <sys/asm_linkage.h>
#include <sys/sun4asi.h>
#include <sys/trap.h>

#define	ICACHE_LINE_SIZE	64
#define	BLOCK_SIZE		64
#define	FPRS_FEF		0x4

#define	SHORTCOPY	3
#define	SMALL_MAX	39
#define	MEDIUM_MAX	255
#define	MED_WMAX	256	/* max copy for medium word-aligned case */
#define	MED_MAX		256	/* max copy for medium longword-aligned case */

#ifndef BSTORE_SIZE
#define	BSTORE_SIZE	256	/* min copy size for block store */
#endif

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

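/*
 * memmove(s1, s2, len)
 *
 * Copy s2 to s1, coping with overlap.  The entry sequence below chooses
 * between the shared forward copy (.forcpy) and a backward copy (.ovbc).
 * Illustrative C sketch of that decision only (not a reference program
 * for the code below); it assumes flat addresses that may be compared
 * and subtracted as unsigned values:
 *
 *	void *
 *	memmove(void *s1, const void *s2, size_t len)
 *	{
 *		char *dst = s1;
 *		const char *src = s2;
 *
 *		if (src >= dst || (size_t)(dst - src) >= len) {
 *			while (len-- != 0)
 *				*dst++ = *src++;
 *		} else {
 *			dst += len;
 *			src += len;
 *			while (len-- != 0)
 *				*--dst = *--src;
 *		}
 *		return (s1);
 *	}
 */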
	ENTRY(memmove)
	cmp	%o1, %o0	! if src address >= dst, use forward copy
	bgeu	%ncc, .forcpy	! else check whether the regions overlap
	sub	%o0, %o1, %o4	! get difference of the two addresses
	cmp	%o2, %o4	! compare size with the difference
	bleu	%ncc, .forcpy	! if size <= difference, no overlap; copy forward
	nop

	!
	! an overlapped copy that must be done "backwards"
	!
.ovbc:
	mov	%o0, %g1		! save dest address for return val
	add	%o1, %o2, %o1		! get to end of source space
	add	%o0, %o2, %o0		! get to end of destination space

	cmp	%o2, 24
	bgeu,pn	%ncc, .dbalign
	nop
	cmp	%o2, 4
	blt,pn	%ncc, .byte
	sub	%o2, 3, %o2
.byte4loop:
	ldub	[%o1-1], %o3		! load last byte
	stb	%o3, [%o0-1]		! store last byte
	sub	%o1, 4, %o1
	ldub	[%o1+2], %o3		! load 2nd from last byte
	stb	%o3, [%o0-2]		! store 2nd from last byte
	sub	%o0, 4, %o0
	ldub	[%o1+1], %o3		! load 3rd from last byte
	stb	%o3, [%o0+1]		! store 3rd from last byte
	subcc	%o2, 4, %o2
	ldub	[%o1], %o3		! load 4th from last byte
	bgu,pt	%ncc, .byte4loop
	stb	%o3, [%o0]		! store 4th from last byte
.byte:
	addcc	%o2, 3, %o2
	bz,pt	%ncc, .exit
.byteloop:
	dec	%o1			! decrement src address
	ldub	[%o1], %o3		! read a byte
	dec	%o0			! decrement dst address
	deccc	%o2			! decrement count
	bgu,pt	%ncc, .byteloop		! loop until done
	stb	%o3, [%o0]		! write byte
.exit:
	retl
	mov	%g1, %o0

	.align	16
.dbalign:
	andcc	%o0, 7, %o5		! bytes till DST 8 byte aligned
	bz,pt	%ncc, .dbmed
	sub	%o2, %o5, %o2		! update count
.dbalign1:
	dec	%o1			! decrement src address
	ldub	[%o1], %o3		! read a byte
	dec	%o0			! decrement dst address
	deccc	%o5			! decrement count
	bgu,pt	%ncc, .dbalign1		! loop until done
	stb	%o3, [%o0]		! store a byte

! check for src long word alignment
.dbmed:
	andcc	%o1, 7, %g0		! chk src long word alignment
	bnz,pn	%ncc, .dbbck
	nop
!
! Following code is for overlapping copies where src and dest
! are long word aligned
!
	cmp	%o2, 4095
	blt,pn	%ncc, .dbmedl32enter	! go to no prefetch code
	nop
	prefetch [%o1 - (1 * BLOCK_SIZE)], 20	! into the prefetch cache
	sub	%o2, 63, %o2		! adjust length to allow cc test
					! for end of loop
	prefetch [%o1 - (2 * BLOCK_SIZE)], 20	! into the prefetch cache
	rd	%fprs, %o3		! o3 = fprs
	! if fprs.fef == 0, set it.  Checking it requires 2 instructions,
	! so set it anyway, without checking.
	prefetch [%o1 - (3 * BLOCK_SIZE)], 20	! into the prefetch cache
	wr	%g0, 0x4, %fprs		! fprs.fef = 1
	prefetch [%o1 - (4 * BLOCK_SIZE)], 20	! into the prefetch cache
.dbmedl64:
	prefetch [%o1 - (5 * BLOCK_SIZE)], 20	! into the prefetch cache
	ldd	[%o1-8], %d4		! load
	subcc	%o2, 64, %o2		! decrement length count
	std	%d4, [%o0-8]		! and store
	ldd	[%o1-16], %d2		! a block of 64 bytes
	sub	%o1, 64, %o1		! decrease src ptr by 64
	std	%d2, [%o0-16]
	sub	%o0, 64, %o0		! decrease dst ptr by 64
	ldd	[%o1+40], %d4
	std	%d4, [%o0+40]
	ldd	[%o1+32], %d2
	std	%d2, [%o0+32]
	ldd	[%o1+24], %d4
	std	%d4, [%o0+24]
	ldd	[%o1+16], %d2
	std	%d2, [%o0+16]
	ldd	[%o1+8], %d4
	std	%d4, [%o0+8]
	ldd	[%o1], %d2
	bgu,pt	%ncc, .dbmedl64		! repeat if at least 64 bytes left
	std	%d2, [%o0]
	add	%o2, 63, %o2		! restore offset adjustment
	and	%o3, 0x4, %o3		! fprs.du = fprs.dl = 0
	wr	%o3, %g0, %fprs		! fprs = o3 restore fprs
.dbmedl32enter:
	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%ncc, .dbmedl31		! skip big loop if less than 32
	nop
.dbmedl32:
	ldx	[%o1-8], %o4		! load
	subcc	%o2, 32, %o2		! decrement length count
	stx	%o4, [%o0-8]		! and store
	ldx	[%o1-16], %o3		! a block of 32 bytes
	sub	%o1, 32, %o1		! decrease src ptr by 32
	stx	%o3, [%o0-16]
	ldx	[%o1+8], %o4
	sub	%o0, 32, %o0		! decrease dst ptr by 32
	stx	%o4, [%o0+8]
	ldx	[%o1], %o3
	bgu,pt	%ncc, .dbmedl32		! repeat if at least 32 bytes left
	stx	%o3, [%o0]
.dbmedl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%ncc, .dbmedl15		! skip if 15 or fewer bytes left
	nop
	ldx	[%o1-8], %o4		! load and store 16 bytes
	sub	%o1, 16, %o1		! decrease src ptr by 16
	stx	%o4, [%o0-8]
	sub	%o2, 16, %o2		! decrease count by 16
	ldx	[%o1], %o3
	sub	%o0, 16, %o0		! decrease dst ptr by 16
	stx	%o3, [%o0]
.dbmedl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .dbexit		! exit if finished
	nop
	cmp	%o2, 8
	blt,pt	%ncc, .dbremain		! skip if 7 or fewer bytes left
	nop
	ldx	[%o1-8], %o4		! load 8 bytes
	sub	%o1, 8, %o1		! decrease src ptr by 8
	stx	%o4, [%o0-8]		! and store 8 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	bnz	%ncc, .dbremain		! if not finished, handle remaining bytes
	sub	%o0, 8, %o0		! decrease dst ptr by 8
	retl
	mov	%g1, %o0

!
! Following code is for overlapping copies where src and dest
! are not long word aligned
!
	.align	16
.dbbck:
	rd	%fprs, %o3		! o3 = fprs

	! if fprs.fef == 0, set it.  Checking it requires 2 instructions,
	! so set it anyway, without checking.
	wr	%g0, 0x4, %fprs		! fprs.fef = 1

	alignaddr %o1, %g0, %o5		! align src
	ldd	[%o5], %d0		! get first 8 byte block
	andn	%o2, 7, %o4		! prepare src ptr for finishup code
	cmp	%o2, 32
	blt,pn	%ncc, .dbmv8
	sub	%o1, %o4, %o1		!
	cmp	%o2, 4095		! check for short memmoves
	blt,pn	%ncc, .dbmv32enter	! go to no prefetch code
.dbmv64:
	ldd	[%o5-8], %d2		! load 8 bytes
	ldd	[%o5-16], %d4		! load 8 bytes
	sub	%o5, 64, %o5		!
	ldd	[%o5+40], %d6		! load 8 bytes
	sub	%o0, 64, %o0		!
	ldd	[%o5+32], %d8		! load 8 bytes
	sub	%o2, 64, %o2		! 64 less bytes to copy
	ldd	[%o5+24], %d18		! load 8 bytes
	cmp	%o2, 64			! do we have < 64 bytes remaining
	ldd	[%o5+16], %d28		! load 8 bytes
	ldd	[%o5+8], %d30		! load 8 bytes
	prefetch [%o5 - (5 * BLOCK_SIZE)], 20	! into the prefetch cache
	faligndata %d2, %d0, %d10	! extract 8 bytes out
	ldd	[%o5], %d0		! load 8 bytes
	std	%d10, [%o0+56]		! store the current 8 bytes
	faligndata %d4, %d2, %d12	! extract 8 bytes out
	std	%d12, [%o0+48]		! store the current 8 bytes
	faligndata %d6, %d4, %d14	! extract 8 bytes out
	std	%d14, [%o0+40]		! store the current 8 bytes
	faligndata %d8, %d6, %d16	! extract 8 bytes out
	std	%d16, [%o0+32]		! store the current 8 bytes
	faligndata %d18, %d8, %d20	! extract 8 bytes out
	std	%d20, [%o0+24]		! store the current 8 bytes
	faligndata %d28, %d18, %d22	! extract 8 bytes out
	std	%d22, [%o0+16]		! store the current 8 bytes
	faligndata %d30, %d28, %d24	! extract 8 bytes out
	std	%d24, [%o0+8]		! store the current 8 bytes
	faligndata %d0, %d30, %d26	! extract 8 bytes out
	bgeu,pt	%ncc, .dbmv64
	std	%d26, [%o0]		! store the current 8 bytes

	cmp	%o2, 32
	blt,pn	%ncc, .dbmvx
	nop
.dbmv32:
	ldd	[%o5-8], %d2		! load 8 bytes
.dbmv32enter:
	ldd	[%o5-16], %d4		! load 8 bytes
	sub	%o5, 32, %o5		!
	ldd	[%o5+8], %d6		! load 8 bytes
	sub	%o0, 32, %o0		!
	faligndata %d2, %d0, %d10	! extract 8 bytes out
	ldd	[%o5], %d0		! load 8 bytes
	sub	%o2, 32, %o2		! 32 less bytes to copy
	std	%d10, [%o0+24]		! store the current 8 bytes
	cmp	%o2, 32			! do we have < 32 bytes remaining
	faligndata %d4, %d2, %d12	! extract 8 bytes out
	std	%d12, [%o0+16]		! store the current 8 bytes
	faligndata %d6, %d4, %d14	! extract 8 bytes out
	std	%d14, [%o0+8]		! store the current 8 bytes
	faligndata %d0, %d6, %d16	! extract 8 bytes out
	bgeu,pt	%ncc, .dbmv32
	std	%d16, [%o0]		! store the current 8 bytes
.dbmvx:
	cmp	%o2, 8			! do we have < 8 bytes remaining
	blt,pt	%ncc, .dbmvfinish	! if yes, skip to finish up code
	nop
.dbmv8:
	ldd	[%o5-8], %d2
	sub	%o0, 8, %o0		! since we are at the end
					! when we first enter the loop
	sub	%o2, 8, %o2		! 8 less bytes to copy
	sub	%o5, 8, %o5
	cmp	%o2, 8			! do we have < 8 bytes remaining
	faligndata %d2, %d0, %d8	! extract 8 bytes out
	std	%d8, [%o0]		! store the current 8 bytes
	bgeu,pt	%ncc, .dbmv8
	fmovd	%d2, %d0
.dbmvfinish:
	and	%o3, 0x4, %o3		! fprs.du = fprs.dl = 0
	tst	%o2
	bz,pt	%ncc, .dbexit
	wr	%o3, %g0, %fprs		! fprs = o3 restore fprs

.dbremain:
	cmp	%o2, 4
	blt,pn	%ncc, .dbbyte
	nop
	ldub	[%o1-1], %o3		! load last byte
	stb	%o3, [%o0-1]		! store last byte
	sub	%o1, 4, %o1
	ldub	[%o1+2], %o3		! load 2nd from last byte
	stb	%o3, [%o0-2]		! store 2nd from last byte
	sub	%o0, 4, %o0
	ldub	[%o1+1], %o3		! load 3rd from last byte
	stb	%o3, [%o0+1]		! store 3rd from last byte
	subcc	%o2, 4, %o2
	ldub	[%o1], %o3		! load 4th from last byte
	stb	%o3, [%o0]		! store 4th from last byte
	bz,pt	%ncc, .dbexit
.dbbyte:
	dec	%o1			! decrement src address
	ldub	[%o1], %o3		! read a byte
	dec	%o0			! decrement dst address
	deccc	%o2			! decrement count
	bgu,pt	%ncc, .dbbyte		! loop until done
	stb	%o3, [%o0]		! write byte
.dbexit:
	retl
	mov	%g1, %o0
	SET_SIZE(memmove)


	.align ICACHE_LINE_SIZE
	ENTRY(memcpy)
					! adjust instruction alignment
	nop				! Do not remove, these nops affect
	nop				! icache alignment and performance
.forcpy:
	cmp	%o2, SMALL_MAX		! check for not small case
	bgu,pn	%ncc, .medium		! go to larger cases
	mov	%o0, %g1		! save %o0
	cmp	%o2, SHORTCOPY		! check for really short case
	ble,pt	%ncc, .smallleft	!
	or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .smallword	! branch to word aligned case
	sub	%o2, 3, %o2		! adjust count to allow cc zero test
.smallnotalign4:
	ldub	[%o1], %o3		! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	stb	%o3, [%o0]		! write byte
	ldub	[%o1+1], %o3		! repeat for a total of 4 bytes
	add	%o1, 4, %o1		! advance SRC by 4
	stb	%o3, [%o0+1]
	ldub	[%o1-2], %o3
	add	%o0, 4, %o0		! advance DST by 4
	stb	%o3, [%o0-2]
	ldub	[%o1-1], %o3
	bgu,pt	%ncc, .smallnotalign4	! loop until 3 or fewer bytes remain
	stb	%o3, [%o0-1]
	add	%o2, 3, %o2		! restore count
.smallleft:
	tst	%o2
	bz,pt	%ncc, .smallexit
	nop
.smallleft3:				! 1, 2, or 3 bytes remain
	ldub	[%o1], %o3		! load one byte
	deccc	%o2			! reduce count for cc test
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0]		! store one byte
	ldub	[%o1+1], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+1]		! store second byte
	ldub	[%o1+2], %o3		! load third byte
	stb	%o3, [%o0+2]		! store third byte
	retl
	mov	%g1, %o0		! restore %o0

	.align	16
	nop				! affects loop icache alignment
.smallwords:
	lduw	[%o1], %o3		! read word
.smallwordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o0]		! write word
	add	%o1, 8, %o1		! update SRC
	lduw	[%o1-4], %o3		! read word
	add	%o0, 8, %o0		! update DST
	bgu,pt	%ncc, .smallwords	! loop until done
	stw	%o3, [%o0-4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .smallexit	! check for completion
	nop
	cmp	%o2, 4			! check for 4 or more bytes left
	blt	.smallleft3		! if not, go to finish up
	nop
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	stw	%o3, [%o0]
	add	%o0, 4, %o0
	bnz,pt	%ncc, .smallleft3
	nop
	retl
	mov	%g1, %o0		! restore %o0

.smallword:
	subcc	%o2, 4, %o2		! update count
	bgu,pt	%ncc, .smallwordx
	lduw	[%o1], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .smallexit
	stw	%o3, [%o0]		! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o1+4], %o3		! load one byte
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+4]		! store one byte
	ldub	[%o1+5], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+5]		! store second byte
	ldub	[%o1+6], %o3		! load third byte
	stb	%o3, [%o0+6]		! store third byte
.smallexit:
	retl
	mov	%g1, %o0		! restore %o0
	.align 16
.medium:
	neg	%o0, %o5
	neg	%o1, %o3
	andcc	%o5, 7, %o5		! bytes till DST 8 byte aligned
	and	%o3, 7, %o3		! bytes till SRC 8 byte aligned

	bz	%ncc, 2f
	sub	%o5, %o3, %o3		! -(bytes till SRC aligned after DST aligned)
					! o3={-7, -6, ... 7}  o3>0 => SRC overaligned

	sub	%o2, %o5, %o2		! update count

1:
	ldub	[%o1], %o4
	deccc	%o5
	inc	%o1
	stb	%o4, [%o0]
	bgu,pt	%ncc, 1b
	inc	%o0

	! Now DST is 8-byte aligned.  o0, o1, o2 are current.

2:
	andcc	%o1, 0x3, %g0		! test alignment
	bnz,pt	%ncc, .mediumsetup	! branch to skip aligned cases
					! if src, dst not aligned
	prefetch [%o1 + (1 * BLOCK_SIZE)], 20

/*
 * Handle all cases where src and dest are aligned on word
 * or long word boundaries.  Use unrolled loops for better
 * performance.  This option wins over standard large data
 * move when source and destination are in cache for medium
 * to short data moves.
 */
	andcc	%o1, 0x7, %g0		! test word alignment
	bz,pt	%ncc, .medlword		! branch to long word aligned case
	prefetch [%o1 + (2 * BLOCK_SIZE)], 20
	cmp	%o2, MED_WMAX		! limit to store buffer size
	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
	nop
	subcc	%o2, 15, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%ncc, .medw15		! skip big loop if less than 16
	prefetch [%o1 + (3 * BLOCK_SIZE)], 20
/*
 * no need to put prefetch in loop as prefetches have
 * already been issued for maximum loop size
 */
.medw16:
	ld	[%o1], %o4		! load
	subcc	%o2, 16, %o2		! decrement length count
	stw	%o4, [%o0]		! and store
	ld	[%o1+4], %o3		! a block of 16 bytes
	add	%o1, 16, %o1		! increase src ptr by 16
	stw	%o3, [%o0+4]
	ld	[%o1-8], %o4
	add	%o0, 16, %o0		! increase dst ptr by 16
	stw	%o4, [%o0-8]
	ld	[%o1-4], %o3
	bgu,pt	%ncc, .medw16		! repeat if at least 16 bytes left
	stw	%o3, [%o0-4]
.medw15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .medwexit		! exit if finished
	nop
	cmp	%o2, 8
	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
	nop
	ld	[%o1], %o4		! load 4 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	stw	%o4, [%o0]		! and store 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	ld	[%o1-4], %o3		! load 4 bytes
	add	%o0, 8, %o0		! increase dst ptr by 8
	stw	%o3, [%o0-4]		! and store 4 bytes
	bz	%ncc, .medwexit		! exit if finished
	nop
.medw7:					! count is ge 1, less than 8
	cmp	%o2, 3			! check for at least 4 bytes left
	ble,pt	%ncc, .medw3		! skip if 3 or fewer bytes left
	nop
	ld	[%o1], %o4		! load 4 bytes
	sub	%o2, 4, %o2		! decrease count by 4
	add	%o1, 4, %o1		! increase src ptr by 4
	stw	%o4, [%o0]		! and store 4 bytes
	add	%o0, 4, %o0		! increase dst ptr by 4
	tst	%o2			! check for zero bytes left
	bz	%ncc, .medwexit		! exit if finished
	nop
.medw3:					! count is known to be 1, 2, or 3
	deccc	%o2			! reduce count by one
	ldub	[%o1], %o3		! load one byte
	bz,pt	%ncc, .medwexit		! exit if last byte
	stb	%o3, [%o0]		! store one byte
	ldub	[%o1+1], %o3		! load second byte
	deccc	%o2			! reduce count by one
	bz,pt	%ncc, .medwexit		! exit if last byte
	stb	%o3, [%o0+1]		! store second byte
	ldub	[%o1+2], %o3		! load third byte
	stb	%o3, [%o0+2]		! store third byte
.medwexit:
	retl
	mov	%g1, %o0		! restore %o0
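/*
 * Illustrative C sketch of the unrolled word-copy strategy used by
 * .medw16/.medw15 above.  This is a sketch, not the program the assembly
 * was generated from; it assumes both pointers are already 4-byte
 * aligned, that uint32_t (from <stdint.h>) matches the 32-bit word moved
 * by ld/stw, and the helper name copy_words_aligned is hypothetical.
 *
 *	static void
 *	copy_words_aligned(uint32_t *dst, const uint32_t *src, size_t nbytes)
 *	{
 *		while (nbytes >= 16) {
 *			dst[0] = src[0];
 *			dst[1] = src[1];
 *			dst[2] = src[2];
 *			dst[3] = src[3];
 *			dst += 4;
 *			src += 4;
 *			nbytes -= 16;
 *		}
 *		while (nbytes >= 4) {
 *			*dst++ = *src++;
 *			nbytes -= 4;
 *		}
 *		if (nbytes != 0) {
 *			char *d = (char *)dst;
 *			const char *s = (const char *)src;
 *			do {
 *				*d++ = *s++;
 *			} while (--nbytes != 0);
 *		}
 *	}
 */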

/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is between SMALL_MAX and MED_MAX bytes
 */

	.align 16
	nop
.medlword:				! long word aligned
					! length > SMALL_MAX
	cmp	%o2, MED_MAX		! limit to store buffer size
	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
	nop
	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%ncc, .medl31		! skip big loop if less than 32
	prefetch [%o1 + (3 * BLOCK_SIZE)], 20	! into the l2 cache
/*
 * no need to put prefetch in loop as prefetches have
 * already been issued for maximum loop size
 */
.medl32:
	ldx	[%o1], %o4		! load
	subcc	%o2, 32, %o2		! decrement length count
	stx	%o4, [%o0]		! and store
	ldx	[%o1+8], %o3		! a block of 32 bytes
	add	%o1, 32, %o1		! increase src ptr by 32
	stx	%o3, [%o0+8]
	ldx	[%o1-16], %o4
	add	%o0, 32, %o0		! increase dst ptr by 32
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	bgu,pt	%ncc, .medl32		! repeat if at least 32 bytes left
	stx	%o3, [%o0-8]
.medl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%ncc, .medl15		! skip if 15 or fewer bytes left
	nop
	ldx	[%o1], %o4		! load and store 16 bytes
	add	%o1, 16, %o1		! increase src ptr by 16
	stx	%o4, [%o0]
	sub	%o2, 16, %o2		! decrease count by 16
	ldx	[%o1-8], %o3
	add	%o0, 16, %o0		! increase dst ptr by 16
	stx	%o3, [%o0-8]
.medl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .medwexit		! exit if finished
	nop
	cmp	%o2, 8
	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
	nop
	ldx	[%o1], %o4		! load 8 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	stx	%o4, [%o0]		! and store 8 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	bz	%ncc, .medwexit		! exit if finished
	add	%o0, 8, %o0		! increase dst ptr by 8
	ba	.medw7
	nop

	.align 16
	nop
	nop
	nop
.mediumsetup:
	prefetch [%o1 + (2 * BLOCK_SIZE)], 21
.mediumrejoin:
	rd	%fprs, %o4		! check for unused FPU

	add	%o1, 8, %o1		! prepare to round SRC upward

	sethi	%hi(0x1234567f), %o5	! For GSR.MASK
	or	%o5, 0x67f, %o5

	andcc	%o4, FPRS_FEF, %o4	! test FEF, fprs.du = fprs.dl = 0
	bz,a	%ncc, 3f
	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
3:
	cmp	%o2, MEDIUM_MAX
	bmask	%o5, %g0, %g0

	! Compute o5 (number of bytes that need copying using the main loop).
	! First, compute for the medium case.
	! Then, if large case, o5 is replaced by count for block alignment.
	! Be careful not to read past end of SRC
	! Currently, o2 is the actual count remaining
	!	     o3 is how much sooner we'll cross the alignment boundary
	!		in SRC compared to in DST
	!
	! Examples:  Let # denote bytes that should not be accessed
	!	     Let x denote a byte already copied to align DST
	!	     Let . and - denote bytes not yet copied
	!	     Let | denote double alignment boundaries
	!
	!	     DST:  ######xx|........|--------|..######   o2 = 18
	!			   o0
	!
	!  o3 = -3:  SRC:  ###xx...|.....---|-----..#|########   o5 = 8
	!			   o1
	!
	!  o3 =  0:  SRC:  ######xx|........|--------|..######   o5 = 16-8 = 8
	!				    o1
	!
	!  o3 = +1:  SRC:  #######x|x.......|.-------|-..#####   o5 = 16-8 = 8
	!				   o1

	or	%g0, -8, %o5
	alignaddr %o1, %g0, %o1		! set GSR.ALIGN and align o1

	movrlz	%o3, %g0, %o5		! subtract 8 from o2+o3 only if o3>=0
	add	%o5, %o2, %o5
	add	%o5, %o3, %o5

	bleu	%ncc, 4f
	andn	%o5, 7, %o5		! 8 byte aligned count
	neg	%o0, %o5		! 'large' case
	and	%o5, BLOCK_SIZE-1, %o5	! bytes till DST block aligned
4:
	brgez,a	%o3, .beginmedloop
	ldd	[%o1-8], %d0

	add	%o1, %o3, %o1		! back up o1
5:
	ldda	[%o1]ASI_FL8_P, %d2
	inc	%o1
	andcc	%o1, 7, %g0
	bnz	%ncc, 5b
	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2

.beginmedloop:
	tst	%o5
	bz	%ncc, .endmedloop
	sub	%o2, %o5, %o2		! update count for later

	! Main loop to write out doubles.  Note: o5 & 7 == 0

	ldd	[%o1], %d2
	subcc	%o5, 8, %o5		! update local count
	bz,pn	%ncc, 1f
	add	%o1, 8, %o1		! update SRC

.medloop:
	faligndata %d0, %d2, %d4
	ldd	[%o1], %d0
	subcc	%o5, 8, %o5		! update local count
	add	%o1, 16, %o1		! update SRC
	std	%d4, [%o0]
	bz,pn	%ncc, 2f
	faligndata %d2, %d0, %d6
	ldd	[%o1 - 8], %d2
	subcc	%o5, 8, %o5		! update local count
	std	%d6, [%o0 + 8]
	bnz,pt	%ncc, .medloop
	add	%o0, 16, %o0		! update DST

1:
	faligndata %d0, %d2, %d4
	fmovd	%d2, %d0
	std	%d4, [%o0]
	ba	.endmedloop
	add	%o0, 8, %o0

2:
	std	%d6, [%o0 + 8]
	sub	%o1, 8, %o1
	add	%o0, 16, %o0


.endmedloop:
	! Currently, o1 is pointing to the next double-aligned byte in SRC
	! The 8 bytes starting at [o1-8] are available in d0
	! At least one, and possibly all, of these need to be written.

	cmp	%o2, BLOCK_SIZE
	bgu	%ncc, .large		! otherwise, less than 16 bytes left

#if 0

	/* This code will use partial stores.  */

	mov	%g0, %o5
	and	%o3, 7, %o3		! Number of bytes needed to completely
					! fill %d0 with good (unwritten) data.

	subcc	%o2, 8, %o2		! update count (maybe too much)
	movl	%ncc, %o2, %o5
	addcc	%o3, %o5, %o5		! extra bytes we can stuff into %d0
	sub	%o3, %o5, %o3		! update o3 (# bad bytes in %d0)

	bz	%ncc, 2f
	alignaddr %o3, %g0, %g0		! set GSR.ALIGN

1:
	deccc	%o5
	ldda	[%o1]ASI_FL8_P, %d2
	inc	%o1
	bgu	%ncc, 1b
	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2

2:
	not	%o3
	faligndata %d0, %d0, %d0	! shift bytes to the left
	and	%o3, 7, %o3		! last byte to be stored in [%o0+%o3]
	edge8n	%g0, %o3, %o5
	stda	%d0, [%o0]%o5, ASI_PST8_P
	brlez	%o2, .mediumexit
	add	%o0, %o3, %o0		! update DST to last stored byte
3:
	inc	%o0
	deccc	%o2
	ldub	[%o1], %o3
	stb	%o3, [%o0]
	bgu	%ncc, 3b
	inc	%o1

#else

	andcc	%o3, 7, %o5		! Number of bytes needed to completely
					! fill %d0 with good (unwritten) data.
	bz	%ncc, 2f
	sub	%o5, 8, %o3		! -(number of good bytes in %d0)
	cmp	%o2, 8
	bl,a	%ncc, 3f		! Not enough bytes to fill %d0
	add	%o1, %o3, %o1		! Back up %o1

1:
	deccc	%o5
	ldda	[%o1]ASI_FL8_P, %d2
	inc	%o1
	bgu	%ncc, 1b
	bshuffle %d0, %d2, %d0		! shifts d0 left 1 byte and or's in d2

2:
	subcc	%o2, 8, %o2
	std	%d0, [%o0]
	bz	%ncc, .mediumexit
	add	%o0, 8, %o0
3:
	ldub	[%o1], %o3
	deccc	%o2
	inc	%o1
	stb	%o3, [%o0]
	bgu	%ncc, 3b
	inc	%o0
#endif

.mediumexit:
	wr	%o4, %g0, %fprs		! fprs = o4 restore fprs
	retl
	mov	%g1, %o0


	.align ICACHE_LINE_SIZE
.large:
	! The following test for BSTORE_SIZE is used to decide whether
	! to store data with a block store or with individual stores.
	! The block store wins when the amount of data is so large
	! that it causes other application data to be moved out
	! of the L1 or L2 cache.
	! On a Panther, block store can lose more often because block
	! store forces the stored data to be removed from the L3 cache.
	!
	sethi	%hi(BSTORE_SIZE),%o5
	or	%o5,%lo(BSTORE_SIZE),%o5
	cmp	%o2, %o5
	bgu	%ncc, .xlarge

	! %o0 I/O DST is 64-byte aligned
	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
	! %d0 I/O already loaded with SRC data from [%o1-8]
	! %o2 I/O count (number of bytes that need to be written)
	! %o3 I   Not written.  If zero, then SRC is double aligned.
	! %o4 I   Not written.  Holds fprs.
	! %o5 O   The number of doubles that remain to be written.

	! Load the rest of the current block
	! Recall that %o1 is further into SRC than %o0 is into DST

	prefetch [%o0 + (0 * BLOCK_SIZE)], 22
	prefetch [%o0 + (1 * BLOCK_SIZE)], 22
	prefetch [%o0 + (2 * BLOCK_SIZE)], 22
	ldd	[%o1], %f2
	prefetch [%o1 + (3 * BLOCK_SIZE)], 21
	ldd	[%o1 + 0x8], %f4
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x10], %f6
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x18], %f8
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x20], %f10
	or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
	prefetch [%o1 + (4 * BLOCK_SIZE)], 21
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x28], %f12
	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0 (needed later)
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x30], %f14
	faligndata %f10, %f12, %f42
	ldd	[%o1 + 0x38], %f0
	sub	%o2, BLOCK_SIZE, %o2	! update count
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	add	%o1, BLOCK_SIZE, %o1	! update SRC

	! Main loop.  Write previous block.  Load rest of current block.
	! Some bytes will be loaded that won't yet be written.
1:
	ldd	[%o1], %f2
	faligndata %f12, %f14, %f44
	ldd	[%o1 + 0x8], %f4
	faligndata %f14, %f0, %f46
	std	%f32, [%o0]
	std	%f34, [%o0+8]
	std	%f36, [%o0+16]
	std	%f38, [%o0+24]
	std	%f40, [%o0+32]
	std	%f42, [%o0+40]
	std	%f44, [%o0+48]
	std	%f46, [%o0+56]
	sub	%o2, BLOCK_SIZE, %o2	! update count
	prefetch [%o0 + (6 * BLOCK_SIZE)], 22
	prefetch [%o0 + (3 * BLOCK_SIZE)], 22
	add	%o0, BLOCK_SIZE, %o0	! update DST
	ldd	[%o1 + 0x10], %f6
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x18], %f8
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x20], %f10
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x28], %f12
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x30], %f14
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x38], %f0
	faligndata %f10, %f12, %f42
	cmp	%o2, BLOCK_SIZE + 8
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	bgu,pt	%ncc, 1b
	add	%o1, BLOCK_SIZE, %o1	! update SRC
	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P	! store 64 bytes, bypass cache
	cmp	%o2, BLOCK_SIZE
	bne	%ncc, 2f		! exactly 1 block remaining?
	add	%o0, BLOCK_SIZE, %o0	! update DST
	brz,a	%o3, 3f			! is SRC double aligned?
	ldd	[%o1], %f2

2:
	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8
	add	%o5, %o3, %o5

	membar	#StoreLoad|#StoreStore

	ba	.beginmedloop
	andn	%o5, 7, %o5		! 8 byte aligned count

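	! Illustrative C model of what each faligndata in the loop above
	! (and in .xlarge below) computes.  Here "off" stands for GSR.ALIGN,
	! the low three bits of the original source address latched by
	! alignaddr; "hi" and "lo" are two consecutive 8-byte-aligned
	! doublewords with "hi" at the lower (more significant, big-endian)
	! address.  The names and the helper are hypothetical, not from
	! this file; this is a sketch of the technique, not its source.
	!
	!	uint64_t
	!	align_merge(uint64_t hi, uint64_t lo, unsigned off)
	!	{
	!		if (off == 0)
	!			return (hi);
	!		return ((hi << (8 * off)) | (lo >> (8 * (8 - off))));
	!	}
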
	! This is when there is exactly 1 block remaining and SRC is aligned
3:
	ldd	[%o1 + 0x8], %f4
	ldd	[%o1 + 0x10], %f6
	fsrc1	%f0, %f32
	ldd	[%o1 + 0x18], %f8
	fsrc1	%f2, %f34
	ldd	[%o1 + 0x20], %f10
	fsrc1	%f4, %f36
	ldd	[%o1 + 0x28], %f12
	fsrc1	%f6, %f38
	ldd	[%o1 + 0x30], %f14
	fsrc1	%f8, %f40
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [%o0]ASI_BLK_P
	membar	#StoreLoad|#StoreStore
	wr	%o4, 0, %fprs
	retl
	mov	%g1, %o0


	.align 16
	! two nops here cause the loop starting at 1f below to be
	! on a cache line boundary, improving performance
	nop
	nop
.xlarge:
	! %o0 I/O DST is 64-byte aligned
	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
	! %d0 I/O already loaded with SRC data from [%o1-8]
	! %o2 I/O count (number of bytes that need to be written)
	! %o3 I   Not written.  If zero, then SRC is double aligned.
	! %o4 I   Not written.  Holds fprs.
	! %o5 O   The number of doubles that remain to be written.

	! Load the rest of the current block
	! Recall that %o1 is further into SRC than %o0 is into DST

	! prefetch [%o1 + (3 * BLOCK_SIZE)], 21
	! executed in delay slot for branch to .xlarge
	prefetch [%o1 + (4 * BLOCK_SIZE)], 21
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	ldd	[%o1], %f2
	prefetch [%o1 + (6 * BLOCK_SIZE)], 21
	ldd	[%o1 + 0x8], %f4
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x10], %f6
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x18], %f8
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x20], %f10
	or	%g0, -8, %o5		! if %o3 >= 0, %o5 = -8
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x28], %f12
	movrlz	%o3, %g0, %o5		! if %o3 < 0, %o5 = 0 (needed later)
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x30], %f14
	faligndata %f10, %f12, %f42
	ldd	[%o1 + 0x38], %f0
	sub	%o2, BLOCK_SIZE, %o2	! update count
	prefetch [%o1 + (7 * BLOCK_SIZE)], 21
	add	%o1, BLOCK_SIZE, %o1	! update SRC

	! This point is 32-byte aligned, since 24 instructions have been
	! issued since the previous alignment directive.


	! Main loop.  Write previous block.  Load rest of current block.
	! Some bytes will be loaded that won't yet be written.
1:
	ldd	[%o1], %f2
	faligndata %f12, %f14, %f44
	ldd	[%o1 + 0x8], %f4
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P
	sub	%o2, BLOCK_SIZE, %o2	! update count
	ldd	[%o1 + 0x10], %f6
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x18], %f8
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x20], %f10
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x28], %f12
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x30], %f14
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x38], %f0
	faligndata %f10, %f12, %f42
	! offset of 8*BLK+8 bytes works best over range of (src-dst) mod 1K
	prefetch [%o1 + (8 * BLOCK_SIZE) + 8], 21
	add	%o0, BLOCK_SIZE, %o0	! update DST
	cmp	%o2, BLOCK_SIZE + 8
	! second prefetch important to correct for occasional dropped
	! initial prefetches; 5*BLK works best over range of (src-dst) mod 1K.
	! strong prefetch prevents drops on Panther, but Jaguar and earlier
	! US-III models treat strong prefetches as weak prefetches;
	! to avoid regressions on customer hardware, we retain the prefetch
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	bgu,pt	%ncc, 1b
	add	%o1, BLOCK_SIZE, %o1	! update SRC

	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P	! store 64 bytes, bypass cache
	cmp	%o2, BLOCK_SIZE
	bne	%ncc, 2f		! exactly 1 block remaining?
	add	%o0, BLOCK_SIZE, %o0	! update DST
	brz,a	%o3, 3f			! is SRC double aligned?
	ldd	[%o1], %f2

2:
	add	%o5, %o2, %o5		! %o5 was already set to 0 or -8
	add	%o5, %o3, %o5

	membar	#StoreLoad|#StoreStore

	ba	.beginmedloop
	andn	%o5, 7, %o5		! 8 byte aligned count

	! This is when there is exactly 1 block remaining and SRC is aligned
3:
	ldd	[%o1 + 0x8], %f4
	ldd	[%o1 + 0x10], %f6
	fsrc1	%f0, %f32
	ldd	[%o1 + 0x18], %f8
	fsrc1	%f2, %f34
	ldd	[%o1 + 0x20], %f10
	fsrc1	%f4, %f36
	ldd	[%o1 + 0x28], %f12
	fsrc1	%f6, %f38
	ldd	[%o1 + 0x30], %f14
	fsrc1	%f8, %f40
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [%o0]ASI_BLK_P
	membar	#StoreLoad|#StoreStore
	wr	%o4, 0, %fprs
	retl
	mov	%g1, %o0

	SET_SIZE(memcpy)