/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
 */

        .file   "memcpy.s"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *       Memmove() and bcopy() do.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 * void *
 * memcpy(void *s, const void *s0, size_t n)
 * {
 *	if (n != 0) {
 *	    char *s1 = s;
 *	    const char *s2 = s0;
 *	    do {
 *		*s1++ = *s2++;
 *	    } while (--n != 0);
 *	}
 *	return (s);
 * }
 */
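
/*
 * For contrast with the memcpy reference above, here is a minimal C sketch
 * of the overlap-safe behaviour provided by the memmove entry point below:
 * copy forward when the destination does not begin inside the source run,
 * otherwise copy backward.  The function name is illustrative only; this is
 * not the code the assembler version was derived from.
 *
 * void *
 * sketch_memmove(void *s, const void *s0, size_t n)
 * {
 *	char *dst = s;
 *	const char *src = s0;
 *
 *	if (src >= (const char *)dst || (size_t)(dst - src) >= n) {
 *		while (n-- != 0)	// forward copy is safe
 *			*dst++ = *src++;
 *	} else {
 *		dst += n;		// destructive overlap: copy backward
 *		src += n;
 *		while (n-- != 0)
 *			*--dst = *--src;
 *	}
 *	return (s);
 * }
 */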

#include <sys/asm_linkage.h>
#include <sys/sun4asi.h>
#include <sys/trap.h>

#define	ICACHE_LINE_SIZE	64
#define	BLOCK_SIZE		64
#define	FPRS_FEF		0x4

#define	ALIGNED8_FPCOPY_THRESHOLD	1024
#define	ALIGNED4_FPCOPY_THRESHOLD	1024
#define	BST_THRESHOLD			65536

#define	SHORTCOPY	3
#define	SMALL_MAX	64
#define	MEDIUM_MAX	255
#define	MED_WMAX	256	/* max copy for medium word-aligned case */

#define	N_READS_STRONG	20
#define	N_WRITES_STRONG	22


        ANSI_PRAGMA_WEAK(memmove,function)
        ANSI_PRAGMA_WEAK(memcpy,function)
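
/*
 * The thresholds above select between the copy strategies implemented
 * below.  As a rough, size-only C sketch (the real dispatch also depends on
 * source/destination alignment; the helper name and path labels are just
 * illustrative):
 *
 * #include <stddef.h>
 *
 * static const char *
 * copy_path(size_t n)
 * {
 *	if (n <= SHORTCOPY)
 *		return ("tiny: byte copy");
 *	if (n <= SMALL_MAX)
 *		return ("small: byte/word loops");
 *	if (n <= ALIGNED8_FPCOPY_THRESHOLD)
 *		return ("medium, aligned: unrolled integer loops");
 *	if (n <= BST_THRESHOLD)
 *		return ("large: faligndata + 64-byte blocks");
 *	return ("xlarge: block stores that bypass the cache");
 * }
 */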

        ENTRY(memmove)
        prefetch [%o1], N_READS_STRONG
        prefetch [%o0], N_WRITES_STRONG
        cmp     %o1, %o0                ! if from address is >= to use forward copy
        bgeu    %ncc, .forcpy           ! else use backward if ...
        sub     %o0, %o1, %o4           ! get difference of two addresses
        cmp     %o2, %o4                ! compare size and difference of addresses
        bleu    %ncc, .forcpy           ! if size is bigger, do overlapped copy
        nop

        !
        ! an overlapped copy that must be done "backwards"
        !
.ovbc:
        mov     %o0, %g1                ! save dest address for return val
        add     %o1, %o2, %o1           ! get to end of source space
        add     %o0, %o2, %o0           ! get to end of destination space

        cmp     %o2, 64
        bgeu,pn %ncc, .dbalign
        nop
        cmp     %o2, 4
        blt,pn  %ncc, .byte
        sub     %o2, 3, %o2
.byte4loop:
        ldub    [%o1-1], %o3            ! load last byte
        stb     %o3, [%o0-1]            ! store last byte
        sub     %o1, 4, %o1
        ldub    [%o1+2], %o3            ! load 2nd from last byte
        stb     %o3, [%o0-2]            ! store 2nd from last byte
        sub     %o0, 4, %o0
        ldub    [%o1+1], %o3            ! load 3rd from last byte
        stb     %o3, [%o0+1]            ! store 3rd from last byte
        subcc   %o2, 4, %o2
        ldub    [%o1], %o3              ! load 4th from last byte
        bgu,pt  %ncc, .byte4loop
        stb     %o3, [%o0]              ! store 4th from last byte
.byte:
        addcc   %o2, 3, %o2
        bz,pt   %ncc, .exit
.byteloop:
        dec     %o1                     ! decrement src address
        ldub    [%o1], %o3              ! read a byte
        dec     %o0                     ! decrement dst address
        deccc   %o2                     ! decrement count
        bgu,pt  %ncc, .byteloop         ! loop until done
        stb     %o3, [%o0]              ! write byte
.exit:
        retl
        mov     %g1, %o0

        .align  16
.dbalign:
        prefetch [%o1 - (4 * BLOCK_SIZE)], #one_read
        prefetch [%o0 - (4 * BLOCK_SIZE)], #one_write
        andcc   %o0, 7, %o5             ! bytes till DST 8 byte aligned
        bz,pt   %ncc, .dbmed
        sub     %o2, %o5, %o2           ! update count
.dbalign1:
        dec     %o1                     ! decrement src address
        ldub    [%o1], %o3              ! read a byte
        dec     %o0                     ! decrement dst address
        deccc   %o5                     ! decrement count
        bgu,pt  %ncc, .dbalign1         ! loop until done
        stb     %o3, [%o0]              ! store a byte

! check for src long word alignment
.dbmed:
        andcc   %o1, 7, %g0             ! chk src long word alignment
        bnz,pn  %ncc, .dbbck
        nop
!
! Following code is for overlapping copies where src and dest
! are long word aligned
!
!
! For SPARC64-VI, prefetch is effective for both integer and fp register
! operations. There are no benefits in using the fp registers for
! aligned data copying.
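
/*
 * The .dbmedl32 loop below is the backward equivalent of a simple unrolled
 * word copy.  A C sketch of the same idea (hypothetical helper; assumes the
 * stated precondition that both ends are 8-byte aligned and that any overlap
 * has dst above src, so copying from the high end down is safe):
 *
 * #include <stdint.h>
 * #include <stddef.h>
 *
 * static void
 * copy_backward_aligned8(uint64_t *dst_end, const uint64_t *src_end,
 *     size_t ndoubles)
 * {
 *	while (ndoubles >= 4) {		// 32 bytes per iteration
 *		dst_end -= 4;
 *		src_end -= 4;
 *		dst_end[3] = src_end[3];
 *		dst_end[2] = src_end[2];
 *		dst_end[1] = src_end[1];
 *		dst_end[0] = src_end[0];
 *		ndoubles -= 4;
 *	}
 *	while (ndoubles-- != 0)		// 0..3 remaining doubles
 *		*--dst_end = *--src_end;
 * }
 */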

.dbmedl32enter:
        subcc   %o2, 31, %o2            ! adjust length to allow cc test
                                        ! for end of loop
        ble,pt  %ncc, .dbmedl31         ! skip big loop if less than 32
        nop
.dbmedl32:
        ldx     [%o1-8], %o4            ! load
        prefetch [%o1 - (8 * BLOCK_SIZE)], #one_read
        subcc   %o2, 32, %o2            ! decrement length count
        stx     %o4, [%o0-8]            ! and store
        prefetch [%o0 - (8 * BLOCK_SIZE)], #one_write
        ldx     [%o1-16], %o3           ! a block of 32 bytes
        sub     %o1, 32, %o1            ! decrease src ptr by 32
        stx     %o3, [%o0-16]
        ldx     [%o1+8], %o4
        sub     %o0, 32, %o0            ! decrease dst ptr by 32
        stx     %o4, [%o0+8]
        ldx     [%o1], %o3
        bgu,pt  %ncc, .dbmedl32         ! repeat if at least 32 bytes left
        stx     %o3, [%o0]
.dbmedl31:
        addcc   %o2, 16, %o2            ! adjust remaining count
        ble,pt  %ncc, .dbmedl15         ! skip if 15 or fewer bytes left
        nop                             !
        ldx     [%o1-8], %o4            ! load and store 16 bytes
        sub     %o1, 16, %o1            ! decrease src ptr by 16
        stx     %o4, [%o0-8]            !
        sub     %o2, 16, %o2            ! decrease count by 16
        ldx     [%o1], %o3              !
        sub     %o0, 16, %o0            ! decrease dst ptr by 16
        stx     %o3, [%o0]
.dbmedl15:
        addcc   %o2, 15, %o2            ! restore count
        bz,pt   %ncc, .dbexit           ! exit if finished
        nop
        cmp     %o2, 8
        blt,pt  %ncc, .dbremain         ! skip if 7 or fewer bytes left
        nop
        ldx     [%o1-8], %o4            ! load 8 bytes
        sub     %o1, 8, %o1             ! decrease src ptr by 8
        stx     %o4, [%o0-8]            ! and store 8 bytes
        subcc   %o2, 8, %o2             ! decrease count by 8
        bnz     %ncc, .dbremain         ! exit if finished
        sub     %o0, 8, %o0             ! decrease dst ptr by 8
        retl
        mov     %g1, %o0

!
! Following code is for overlapping copies where src and dest
! are not long word aligned
!
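
/*
 * The .dbbck code below handles sources and destinations with different
 * alignment within an 8-byte word, using alignaddr/faligndata to assemble
 * each misaligned 8-byte source chunk from two aligned loads.  A scalar C
 * model of that merge step (hypothetical helper; SPARC is big-endian, so
 * the word at the lower address supplies the most significant bytes, and
 * off must be 1..7 here to avoid a shift by 64):
 *
 * #include <stdint.h>
 *
 * static uint64_t
 * falign_merge(uint64_t lo_addr_word, uint64_t hi_addr_word, unsigned off)
 * {
 *	return ((lo_addr_word << (8 * off)) |
 *	    (hi_addr_word >> (8 * (8 - off))));
 * }
 */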

        .align  16
.dbbck:
        rd      %fprs, %o3              ! o3 = fprs

        ! if fprs.fef == 0, set it. Checking it, requires 2 instructions.
        ! So set it anyway, without checking.
        wr      %g0, 0x4, %fprs         ! fprs.fef = 1

        alignaddr %o1, %g0, %o5         ! align src
        ldd     [%o5], %d0              ! get first 8 byte block
        andn    %o2, 7, %o4             ! prepare src ptr for finishup code
        cmp     %o2, 32
        blt,pn  %ncc, .dbmv8
        sub     %o1, %o4, %o1           !
        cmp     %o2, 4095               ! check for short memmoves
        blt,pn  %ncc, .dbmv32enter      ! go to no prefetch code
.dbmv64:
        ldd     [%o5-8], %d2            ! load 8 bytes
        ldd     [%o5-16], %d4           ! load 8 bytes
        sub     %o5, 64, %o5            !
        ldd     [%o5+40], %d6           ! load 8 bytes
        sub     %o0, 64, %o0            !
        ldd     [%o5+32], %d8           ! load 8 bytes
        sub     %o2, 64, %o2            ! 64 less bytes to copy
        ldd     [%o5+24], %d18          ! load 8 bytes
        cmp     %o2, 64                 ! do we have < 64 bytes remaining
        ldd     [%o5+16], %d28          ! load 8 bytes
        ldd     [%o5+8], %d30           ! load 8 bytes
        faligndata %d2, %d0, %d10       ! extract 8 bytes out
        prefetch [%o5 - (5 * BLOCK_SIZE)], #one_read
        ldd     [%o5], %d0              ! load 8 bytes
        std     %d10, [%o0+56]          ! store the current 8 bytes
        faligndata %d4, %d2, %d12       ! extract 8 bytes out
        prefetch [%o0 - (5 * BLOCK_SIZE)], #one_write
        std     %d12, [%o0+48]          ! store the current 8 bytes
        faligndata %d6, %d4, %d14       ! extract 8 bytes out
        std     %d14, [%o0+40]          ! store the current 8 bytes
        faligndata %d8, %d6, %d16       ! extract 8 bytes out
        std     %d16, [%o0+32]          ! store the current 8 bytes
        faligndata %d18, %d8, %d20      ! extract 8 bytes out
        std     %d20, [%o0+24]          ! store the current 8 bytes
        faligndata %d28, %d18, %d22     ! extract 8 bytes out
        std     %d22, [%o0+16]          ! store the current 8 bytes
        faligndata %d30, %d28, %d24     ! extract 8 bytes out
        std     %d24, [%o0+8]           ! store the current 8 bytes
        faligndata %d0, %d30, %d26      ! extract 8 bytes out
        bgeu,pt %ncc, .dbmv64
        std     %d26, [%o0]             ! store the current 8 bytes

        cmp     %o2, 32
        blt,pn  %ncc, .dbmvx
        nop
.dbmv32:
        ldd     [%o5-8], %d2            ! load 8 bytes
.dbmv32enter:
        ldd     [%o5-16], %d4           ! load 8 bytes
        sub     %o5, 32, %o5            !
        ldd     [%o5+8], %d6            ! load 8 bytes
        sub     %o0, 32, %o0            !
        faligndata %d2, %d0, %d10       ! extract 8 bytes out
        ldd     [%o5], %d0              ! load 8 bytes
        sub     %o2, 32, %o2            ! 32 less bytes to copy
        std     %d10, [%o0+24]          ! store the current 8 bytes
        cmp     %o2, 32                 ! do we have < 32 bytes remaining
        faligndata %d4, %d2, %d12       ! extract 8 bytes out
        std     %d12, [%o0+16]          ! store the current 8 bytes
        faligndata %d6, %d4, %d14       ! extract 8 bytes out
        std     %d14, [%o0+8]           ! store the current 8 bytes
        faligndata %d0, %d6, %d16       ! extract 8 bytes out
        bgeu,pt %ncc, .dbmv32
        std     %d16, [%o0]             ! store the current 8 bytes
.dbmvx:
        cmp     %o2, 8                  ! do we have < 8 bytes remaining
        blt,pt  %ncc, .dbmvfinish       ! if yes, skip to finish up code
        nop
.dbmv8:
        ldd     [%o5-8], %d2
        sub     %o0, 8, %o0             ! since we are at the end
                                        ! when we first enter the loop
        sub     %o2, 8, %o2             ! 8 less bytes to copy
        sub     %o5, 8, %o5
        cmp     %o2, 8                  ! do we have < 8 bytes remaining
        faligndata %d2, %d0, %d8        ! extract 8 bytes out
        std     %d8, [%o0]              ! store the current 8 bytes
        bgeu,pt %ncc, .dbmv8
        fmovd   %d2, %d0
.dbmvfinish:
        and     %o3, 0x4, %o3           ! fprs.du = fprs.dl = 0
        tst     %o2
        bz,pt   %ncc, .dbexit
        wr      %o3, %g0, %fprs         ! fprs = o3 restore fprs

.dbremain:
        cmp     %o2, 4
        blt,pn  %ncc, .dbbyte
        nop
        ldub    [%o1-1], %o3            ! load last byte
        stb     %o3, [%o0-1]            ! store last byte
        sub     %o1, 4, %o1
        ldub    [%o1+2], %o3            ! load 2nd from last byte
        stb     %o3, [%o0-2]            ! store 2nd from last byte
        sub     %o0, 4, %o0
        ldub    [%o1+1], %o3            ! load 3rd from last byte
        stb     %o3, [%o0+1]            ! store 3rd from last byte
        subcc   %o2, 4, %o2
        ldub    [%o1], %o3              ! load 4th from last byte
        stb     %o3, [%o0]              ! store 4th from last byte
        bz,pt   %ncc, .dbexit
.dbbyte:
        dec     %o1                     ! decrement src address
        ldub    [%o1], %o3              ! read a byte
        dec     %o0                     ! decrement dst address
        deccc   %o2                     ! decrement count
        bgu,pt  %ncc, .dbbyte           ! loop until done
        stb     %o3, [%o0]              ! write byte
.dbexit:
        retl
        mov     %g1, %o0
        SET_SIZE(memmove)


        .align  ICACHE_LINE_SIZE
        ENTRY(memcpy)
        ! adjust instruction alignment
        nop                             ! Do not remove, these nops affect
        nop                             ! icache alignment and performance
.forcpy:
        prefetch [%o1], N_READS_STRONG
        prefetch [%o0], N_WRITES_STRONG
        cmp     %o2, SMALL_MAX          ! check for not small case
        bgu,pn  %ncc, .medium           ! go to larger cases
        mov     %o0, %g1                ! save %o0
        cmp     %o2, SHORTCOPY          ! check for really short case
        ble,pt  %ncc, .smallleft        !
        or      %o0, %o1, %o3           ! prepare alignment check
        andcc   %o3, 0x3, %g0           ! test for alignment
        bz,pt   %ncc, .smallword        ! branch to word aligned case
        sub     %o2, 3, %o2             ! adjust count to allow cc zero test
.smallnotalign4:
        ldub    [%o1], %o3              ! read byte
        subcc   %o2, 4, %o2             ! reduce count by 4
        stb     %o3, [%o0]              ! write byte
        ldub    [%o1+1], %o3            ! repeat for a total of 4 bytes
        add     %o1, 4, %o1             ! advance SRC by 4
        stb     %o3, [%o0+1]
        ldub    [%o1-2], %o3
        add     %o0, 4, %o0             ! advance DST by 4
        stb     %o3, [%o0-2]
        ldub    [%o1-1], %o3
        bgu,pt  %ncc, .smallnotalign4   ! loop til 3 or fewer bytes remain
        stb     %o3, [%o0-1]
        add     %o2, 3, %o2             ! restore count
.smallleft:
        tst     %o2
        bz,pt   %ncc, .smallexit
        nop
.smallleft3:                            ! 1, 2, or 3 bytes remain
        ldub    [%o1], %o3              ! load one byte
        deccc   %o2                     ! reduce count for cc test
        bz,pt   %ncc, .smallexit
        stb     %o3, [%o0]              ! store one byte
        ldub    [%o1+1], %o3            ! load second byte
        deccc   %o2
        bz,pt   %ncc, .smallexit
        stb     %o3, [%o0+1]            ! store second byte
        ldub    [%o1+2], %o3            ! load third byte
        stb     %o3, [%o0+2]            ! store third byte
        retl
        mov     %g1, %o0                ! restore %o0

        .align  16
        nop                             ! affects loop icache alignment
.smallwords:
        lduw    [%o1], %o3              ! read word
.smallwordx:
        subcc   %o2, 8, %o2             ! update count
        stw     %o3, [%o0]              ! write word
        add     %o1, 8, %o1             ! update SRC
        lduw    [%o1-4], %o3            ! read word
        add     %o0, 8, %o0             ! update DST
        bgu,pt  %ncc, .smallwords       ! loop until done
        stw     %o3, [%o0-4]            ! write word
        addcc   %o2, 7, %o2             ! restore count
        bz,pt   %ncc, .smallexit        ! check for completion
        nop
        cmp     %o2, 4                  ! check for 4 or more bytes left
        blt     .smallleft3             ! if not, go to finish up
        nop
        lduw    [%o1], %o3
        add     %o1, 4, %o1
        subcc   %o2, 4, %o2
        stw     %o3, [%o0]
        add     %o0, 4, %o0
        bnz,pt  %ncc, .smallleft3
        nop
        retl
        mov     %g1, %o0                ! restore %o0

.smallword:
        subcc   %o2, 4, %o2             ! update count
        bgu,pt  %ncc, .smallwordx
        lduw    [%o1], %o3              ! read word
        addcc   %o2, 3, %o2             ! restore count
        bz,pt   %ncc, .smallexit
        stw     %o3, [%o0]              ! write word
        deccc   %o2                     ! reduce count for cc test
        ldub    [%o1+4], %o3            ! load one byte
        bz,pt   %ncc, .smallexit
        stb     %o3, [%o0+4]            ! store one byte
        ldub    [%o1+5], %o3            ! load second byte
        deccc   %o2
        bz,pt   %ncc, .smallexit
        stb     %o3, [%o0+5]            ! store second byte
        ldub    [%o1+6], %o3            ! load third byte
        stb     %o3, [%o0+6]            ! store third byte
.smallexit:
        retl
        mov     %g1, %o0                ! restore %o0
        .align  16
.medium:
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
        prefetch [%o0 + (4 * BLOCK_SIZE)], #one_write
        neg     %o0, %o5
        neg     %o1, %o3
        andcc   %o5, 7, %o5             ! bytes till DST 8 byte aligned
        and     %o3, 7, %o3             ! bytes till SRC 8 byte aligned

        bz      %ncc, 2f
        sub     %o5, %o3, %o3           ! -(bytes till SRC aligned after DST aligned)
                                        ! o3={-7, -6, ... 7} o3>0 => SRC overaligned

        sub     %o2, %o5, %o2           ! update count

1:
        ldub    [%o1], %o4
        deccc   %o5
        inc     %o1
        stb     %o4, [%o0]
        bgu,pt  %ncc, 1b
        inc     %o0

        ! Now DST is 8-byte aligned.  o0, o1, o2 are current.

2:
        andcc   %o1, 0x3, %g0           ! test alignment
        prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read
        bnz,pt  %ncc, .mediumsetup      ! branch to skip aligned cases
                                        ! if src, dst not aligned
        prefetch [%o0 + (1 * BLOCK_SIZE)], #one_write

/*
 * Handle all cases where src and dest are aligned on word
 * or long word boundaries.  Use unrolled loops for better
 * performance.  This option wins over standard large data
 * move when source and destination is in cache for medium
 * to short data moves.
 */
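
/*
 * The .medw16 loop below is a conventional unrolled copy of 4-byte words.
 * The same structure in C (hypothetical helper; assumes both pointers are
 * 4-byte aligned and the regions do not overlap):
 *
 * #include <stdint.h>
 * #include <stddef.h>
 *
 * static void
 * copy_forward_aligned4(uint32_t *dst, const uint32_t *src, size_t nwords)
 * {
 *	while (nwords >= 4) {		// 16 bytes per iteration
 *		dst[0] = src[0];
 *		dst[1] = src[1];
 *		dst[2] = src[2];
 *		dst[3] = src[3];
 *		dst += 4;
 *		src += 4;
 *		nwords -= 4;
 *	}
 *	while (nwords-- != 0)		// 0..3 remaining words
 *		*dst++ = *src++;
 * }
 */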

        andcc   %o1, 0x7, %g0           ! test word alignment
        prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
        bz,pt   %ncc, .medlword         ! branch to long word aligned case
        prefetch [%o0 + (2 * BLOCK_SIZE)], #one_write
        cmp     %o2, ALIGNED4_FPCOPY_THRESHOLD  ! limit to store buffer size
        bgu,pt  %ncc, .mediumrejoin     ! otherwise rejoin main loop
        prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
        subcc   %o2, 15, %o2            ! adjust length to allow cc test
        prefetch [%o0 + (3 * BLOCK_SIZE)], #one_write
                                        ! for end of loop
        ble,pt  %ncc, .medw15           ! skip big loop if less than 16
        .empty
.medw16:
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
        ld      [%o1], %o4              ! load
        subcc   %o2, 16, %o2            ! decrement length count
        prefetch [%o0 + (4 * BLOCK_SIZE)], #one_write
        stw     %o4, [%o0]              ! and store
        ld      [%o1+4], %o3            ! a block of 16 bytes
        add     %o1, 16, %o1            ! increase src ptr by 16
        stw     %o3, [%o0+4]
        ld      [%o1-8], %o4
        add     %o0, 16, %o0            ! increase dst ptr by 16
        stw     %o4, [%o0-8]
        ld      [%o1-4], %o3
        bgu,pt  %ncc, .medw16           ! repeat if at least 16 bytes left
        stw     %o3, [%o0-4]
.medw15:
        addcc   %o2, 15, %o2            ! restore count
        bz,pt   %ncc, .medwexit         ! exit if finished
        nop
        cmp     %o2, 8
        blt,pt  %ncc, .medw7            ! skip if 7 or fewer bytes left
        nop                             !
        ld      [%o1], %o4              ! load 4 bytes
        subcc   %o2, 8, %o2             ! decrease count by 8
        stw     %o4, [%o0]              ! and store 4 bytes
        add     %o1, 8, %o1             ! increase src ptr by 8
        ld      [%o1-4], %o3            ! load 4 bytes
        add     %o0, 8, %o0             ! increase dst ptr by 8
        stw     %o3, [%o0-4]            ! and store 4 bytes
        bz      %ncc, .medwexit         ! exit if finished
        nop
.medw7:                                 ! count is ge 1, less than 8
        cmp     %o2, 3                  ! check for 4 bytes left
        ble,pt  %ncc, .medw3            ! skip if 3 or fewer bytes left
        nop                             !
        ld      [%o1], %o4              ! load 4 bytes
        sub     %o2, 4, %o2             ! decrease count by 4
        add     %o1, 4, %o1             ! increase src ptr by 4
        stw     %o4, [%o0]              ! and store 4 bytes
        add     %o0, 4, %o0             ! increase dst ptr by 4
        tst     %o2                     ! check for zero bytes left
        bz      %ncc, .medwexit         ! exit if finished
        nop
.medw3:                                 ! count is known to be 1, 2, or 3
        deccc   %o2                     ! reduce count by one
        ldub    [%o1], %o3              ! load one byte
        bz,pt   %ncc, .medwexit         ! exit if last byte
        stb     %o3, [%o0]              ! store one byte
        ldub    [%o1+1], %o3            ! load second byte
        deccc   %o2                     ! reduce count by one
        bz,pt   %ncc, .medwexit         ! exit if last byte
        stb     %o3, [%o0+1]            ! store second byte
        ldub    [%o1+2], %o3            ! load third byte
        stb     %o3, [%o0+2]            ! store third byte
.medwexit:
        retl
        mov     %g1, %o0                ! restore %o0

/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is between SMALL_MAX and ALIGNED8_FPCOPY_THRESHOLD
 * bytes.
 */

        .align  16
        nop
.medlword:                              ! long word aligned
                                        ! length > ALIGNED8_FPCOPY_THRESHOLD
        cmp     %o2, ALIGNED8_FPCOPY_THRESHOLD
        bgu,pt  %ncc, .mediumrejoin     ! otherwise rejoin main loop
        prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
        prefetch [%o0 + (3 * BLOCK_SIZE)], #one_write
        subcc   %o2, 31, %o2            ! adjust length to allow cc test
                                        ! for end of loop
        ble,pt  %ncc, .medl31           ! skip big loop if less than 32
        .empty
.medl32:
        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
        ldx     [%o1], %o4              ! load
        subcc   %o2, 32, %o2            ! decrement length count
        prefetch [%o0 + (4 * BLOCK_SIZE)], #one_read
        stx     %o4, [%o0]              ! and store
        ldx     [%o1+8], %o3            ! a block of 32 bytes
        add     %o1, 32, %o1            ! increase src ptr by 32
        stx     %o3, [%o0+8]
        ldx     [%o1-16], %o4
        add     %o0, 32, %o0            ! increase dst ptr by 32
        stx     %o4, [%o0-16]
        ldx     [%o1-8], %o3
        bgu,pt  %ncc, .medl32           ! repeat if at least 32 bytes left
        stx     %o3, [%o0-8]
.medl31:
        addcc   %o2, 16, %o2            ! adjust remaining count
        ble,pt  %ncc, .medl15           ! skip if 15 or fewer bytes left
        nop                             !
        ldx     [%o1], %o4              ! load and store 16 bytes
        add     %o1, 16, %o1            ! increase src ptr by 16
        stx     %o4, [%o0]              !
        sub     %o2, 16, %o2            ! decrease count by 16
        ldx     [%o1-8], %o3            !
        add     %o0, 16, %o0            ! increase dst ptr by 16
        stx     %o3, [%o0-8]
.medl15:
        addcc   %o2, 15, %o2            ! restore count
        bz,pt   %ncc, .medwexit         ! exit if finished
        nop
        cmp     %o2, 8
        blt,pt  %ncc, .medw7            ! skip if 7 or fewer bytes left
        nop
        ldx     [%o1], %o4              ! load 8 bytes
        add     %o1, 8, %o1             ! increase src ptr by 8
        stx     %o4, [%o0]              ! and store 8 bytes
        subcc   %o2, 8, %o2             ! decrease count by 8
        bz      %ncc, .medwexit         ! exit if finished
        add     %o0, 8, %o0             ! increase dst ptr by 8
        ba      .medw7
        nop

        .align  16
        nop
        nop
        nop
.mediumsetup:
        prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
        prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
.mediumrejoin:
        rd      %fprs, %o4              ! check for unused FPU

        add     %o1, 8, %o1             ! prepare to round SRC upward

        sethi   %hi(0x1234567f), %o5    ! For GSR.MASK
        or      %o5, 0x67f, %o5

        andcc   %o4, FPRS_FEF, %o4      ! test FEF, fprs.du = fprs.dl = 0
        bz,a    %ncc, 3f
        wr      %g0, FPRS_FEF, %fprs    ! fprs.fef = 1
3:
        cmp     %o2, MEDIUM_MAX
        bmask   %o5, %g0, %g0

        ! Compute o5 (number of bytes that need copying using the main loop).
        ! First, compute for the medium case.
        ! Then, if large case, o5 is replaced by count for block alignment.
        ! Be careful not to read past end of SRC
        ! Currently, o2 is the actual count remaining
        !            o3 is how much sooner we'll cross the alignment boundary
        !               in SRC compared to in DST
        !
        ! Examples:  Let # denote bytes that should not be accessed
        !            Let x denote a byte already copied to align DST
        !            Let . and - denote bytes not yet copied
        !            Let | denote double alignment boundaries
        !
        !            DST:  ######xx|........|--------|..######   o2 = 18
        !                          o0
        !
        !  o3 = -3:  SRC:  ###xx...|.....---|-----..#|########   o5 = 8
        !                          o1
        !
        !  o3 =  0:  SRC:  ######xx|........|--------|..######   o5 = 16-8 = 8
        !                          o1
        !
        !  o3 = +1:  SRC:  #######x|x.......|.-------|-..#####   o5 = 16-8 = 8
        !                          o1
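
/*
 * A small C sketch of the medium-case computation of %o5 performed by the
 * or/movrlz/add/add/andn sequence just below (hypothetical helper; 'count'
 * is %o2 and 'delta' is %o3 from the diagrams above).  For the example
 * above, count = 18 yields 8 for delta = -3, 0, and +1:
 *
 * #include <stddef.h>
 *
 * static size_t
 * medium_loop_bytes(size_t count, int delta)
 * {
 *	long n = (long)count + delta;
 *
 *	if (delta >= 0)			// mirrors: or -8 ... movrlz keeps -8
 *		n -= 8;
 *	return ((size_t)n & ~(size_t)7);	// mirrors: andn %o5, 7, %o5
 * }
 */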

        or      %g0, -8, %o5
        alignaddr %o1, %g0, %o1         ! set GSR.ALIGN and align o1

        movrlz  %o3, %g0, %o5           ! subtract 8 from o2+o3 only if o3>=0
        add     %o5, %o2, %o5
        add     %o5, %o3, %o5

        bleu    %ncc, 4f
        andn    %o5, 7, %o5             ! 8 byte aligned count
        neg     %o0, %o5                ! 'large' case
        and     %o5, BLOCK_SIZE-1, %o5  ! bytes till DST block aligned
4:
        brgez,a %o3, .beginmedloop
        ldd     [%o1-8], %d0

        add     %o1, %o3, %o1           ! back up o1
5:
        ldda    [%o1]ASI_FL8_P, %d2
        inc     %o1
        andcc   %o1, 7, %g0
        bnz     %ncc, 5b
        bshuffle %d0, %d2, %d0          ! shifts d0 left 1 byte and or's in d2

.beginmedloop:
        tst     %o5
        bz      %ncc, .endmedloop
        sub     %o2, %o5, %o2           ! update count for later

        ! Main loop to write out doubles.  Note: o5 & 7 == 0

        ldd     [%o1], %d2
        subcc   %o5, 8, %o5             ! update local count
        bz,pn   %ncc, 1f
        add     %o1, 8, %o1             ! update SRC

.medloop:
        faligndata %d0, %d2, %d4
        ldd     [%o1], %d0
        subcc   %o5, 8, %o5             ! update local count
        add     %o1, 16, %o1            ! update SRC
        std     %d4, [%o0]
        bz,pn   %ncc, 2f
        faligndata %d2, %d0, %d6
        ldd     [%o1 - 8], %d2
        subcc   %o5, 8, %o5             ! update local count
        std     %d6, [%o0 + 8]
        bnz,pt  %ncc, .medloop
        add     %o0, 16, %o0            ! update DST

1:
        faligndata %d0, %d2, %d4
        fmovd   %d2, %d0
        std     %d4, [%o0]
        ba      .endmedloop
        add     %o0, 8, %o0

2:
        std     %d6, [%o0 + 8]
        sub     %o1, 8, %o1
        add     %o0, 16, %o0


.endmedloop:
        ! Currently, o1 is pointing to the next double-aligned byte in SRC
        ! The 8 bytes starting at [o1-8] are available in d0
        ! At least one, and possibly all, of these need to be written.

        cmp     %o2, BLOCK_SIZE
        bgu     %ncc, .large            ! otherwise, less than 16 bytes left

#if 0

        /* This code will use partial stores.  */

        mov     %g0, %o5
        and     %o3, 7, %o3             ! Number of bytes needed to completely
                                        ! fill %d0 with good (unwritten) data.

        subcc   %o2, 8, %o2             ! update count (maybe too much)
        movl    %ncc, %o2, %o5
        addcc   %o3, %o5, %o5           ! extra bytes we can stuff into %d0
        sub     %o3, %o5, %o3           ! update o3 (# bad bytes in %d0)

        bz      %ncc, 2f
        alignaddr %o3, %g0, %g0         ! set GSR.ALIGN

1:
        deccc   %o5
        ldda    [%o1]ASI_FL8_P, %d2
        inc     %o1
        bgu     %ncc, 1b
        bshuffle %d0, %d2, %d0          ! shifts d0 left 1 byte and or's in d2

2:
        not     %o3
        faligndata %d0, %d0, %d0        ! shift bytes to the left
        and     %o3, 7, %o3             ! last byte to be stored in [%o0+%o3]
        edge8n  %g0, %o3, %o5
        stda    %d0, [%o0]%o5, ASI_PST8_P
        brlez   %o2, .mediumexit
        add     %o0, %o3, %o0           ! update DST to last stored byte
3:
        inc     %o0
        deccc   %o2
        ldub    [%o1], %o3
        stb     %o3, [%o0]
        bgu     %ncc, 3b
        inc     %o1

#else

        andcc   %o3, 7, %o5             ! Number of bytes needed to completely
                                        ! fill %d0 with good (unwritten) data.
        bz      %ncc, 2f
        sub     %o5, 8, %o3             ! -(number of good bytes in %d0)
        cmp     %o2, 8
        bl,a    %ncc, 3f                ! Not enough bytes to fill %d0
        add     %o1, %o3, %o1           ! Back up %o1

1:
        deccc   %o5
        ldda    [%o1]ASI_FL8_P, %d2
        inc     %o1
        bgu     %ncc, 1b
        bshuffle %d0, %d2, %d0          ! shifts d0 left 1 byte and or's in d2

2:
        subcc   %o2, 8, %o2
        std     %d0, [%o0]
        bz      %ncc, .mediumexit
        add     %o0, 8, %o0
3:
        ldub    [%o1], %o3
        deccc   %o2
        inc     %o1
        stb     %o3, [%o0]
        bgu     %ncc, 3b
        inc     %o0
#endif

.mediumexit:
        wr      %o4, %g0, %fprs         ! fprs = o4 restore fprs
        retl
        mov     %g1, %o0


        .align  ICACHE_LINE_SIZE
.large:

        ! %o0 I/O DST is 64-byte aligned
        ! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
        ! %d0 I/O already loaded with SRC data from [%o1-8]
        ! %o2 I/O count (number of bytes that need to be written)
        ! %o3 I   Not written.  If zero, then SRC is double aligned.
        ! %o4 I   Not written.  Holds fprs.
        ! %o5 O   The number of doubles that remain to be written.

        ! Load the rest of the current block
        ! Recall that %o1 is further into SRC than %o0 is into DST
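
/*
 * Shape of the main loop that follows: each iteration stores the 64-byte
 * block assembled on the previous pass while loading and aligning the next
 * one, so loads and stores overlap.  A scalar C stand-in for that software
 * pipeline (hypothetical helper; the real code keeps the blocks in VIS
 * registers rather than in buffers):
 *
 * #include <stddef.h>
 * #include <string.h>
 *
 * static void
 * copy_blocks_pipelined(char *dst, const char *src, size_t nblocks)
 * {
 *	unsigned char prev[64], cur[64];
 *
 *	if (nblocks == 0)
 *		return;
 *	memcpy(prev, src, 64);		// prime the pipeline
 *	src += 64;
 *	while (--nblocks != 0) {
 *		memcpy(cur, src, 64);	// load the current block
 *		memcpy(dst, prev, 64);	// store the previous block
 *		memcpy(prev, cur, 64);
 *		src += 64;
 *		dst += 64;
 *	}
 *	memcpy(dst, prev, 64);		// drain the final block
 * }
 */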

        prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
        prefetch [%o1 + (8 * BLOCK_SIZE)], #one_read

        set     BST_THRESHOLD, %o5
        cmp     %o2, %o5
        bgu,pn  %icc, .xlarge
        prefetch [%o1 + (12 * BLOCK_SIZE)], #one_read

        ldd     [%o1], %f2
        ldd     [%o1 + 0x8], %f4
        faligndata %f0, %f2, %f32
        ldd     [%o1 + 0x10], %f6
        faligndata %f2, %f4, %f34
        ldd     [%o1 + 0x18], %f8
        faligndata %f4, %f6, %f36
        ldd     [%o1 + 0x20], %f10
        or      %g0, -8, %o5            ! if %o3 >= 0, %o5 = -8
        faligndata %f6, %f8, %f38
        prefetch [%o1 + (16 * BLOCK_SIZE)], #one_read
        ldd     [%o1 + 0x28], %f12
        movrlz  %o3, %g0, %o5           ! if %o3 < 0, %o5 = 0 (needed later)
        faligndata %f8, %f10, %f40
        ldd     [%o1 + 0x30], %f14
        faligndata %f10, %f12, %f42
        ldd     [%o1 + 0x38], %f0
        sub     %o2, BLOCK_SIZE, %o2    ! update count
        add     %o1, BLOCK_SIZE, %o1    ! update SRC

        ! Main loop.  Write previous block.  Load rest of current block.
        ! Some bytes will be loaded that won't yet be written.
1:
        ldd     [%o1], %f2
        faligndata %f12, %f14, %f44
        ldd     [%o1 + 0x8], %f4
        faligndata %f14, %f0, %f46
        std     %f32, [%o0]
        std     %f34, [%o0+8]
        std     %f36, [%o0+16]
        std     %f38, [%o0+24]
        std     %f40, [%o0+32]
        std     %f42, [%o0+40]
        std     %f44, [%o0+48]
        std     %f46, [%o0+56]
        sub     %o2, BLOCK_SIZE, %o2    ! update count
        prefetch [%o1 + (24 * BLOCK_SIZE) + BLOCK_SIZE], #one_read
        add     %o0, BLOCK_SIZE, %o0    ! update DST
        ldd     [%o1 + 0x10], %f6
        faligndata %f0, %f2, %f32
        ldd     [%o1 + 0x18], %f8
        faligndata %f2, %f4, %f34
        ldd     [%o1 + 0x20], %f10
        faligndata %f4, %f6, %f36
        ldd     [%o1 + 0x28], %f12
        faligndata %f6, %f8, %f38
        ldd     [%o1 + 0x30], %f14
        faligndata %f8, %f10, %f40
        ldd     [%o1 + 0x38], %f0
        faligndata %f10, %f12, %f42
        prefetch [%o1 + (18 * BLOCK_SIZE)], #one_read
        cmp     %o2, BLOCK_SIZE + 8
        prefetch [%o0 + (18 * BLOCK_SIZE)], #one_write
        bgu,pt  %ncc, 1b
        add     %o1, BLOCK_SIZE, %o1    ! update SRC
        faligndata %f12, %f14, %f44
        faligndata %f14, %f0, %f46
        stda    %f32, [%o0]ASI_BLK_P    ! store 64 bytes, bypass cache
        cmp     %o2, BLOCK_SIZE
        bne     %ncc, 2f                ! exactly 1 block remaining?
        add     %o0, BLOCK_SIZE, %o0    ! update DST
        brz,a   %o3, 3f                 ! is SRC double aligned?
        ldd     [%o1], %f2

2:
        add     %o5, %o2, %o5           ! %o5 was already set to 0 or -8
        add     %o5, %o3, %o5

        membar  #StoreLoad|#StoreStore

        ba      .beginmedloop
        andn    %o5, 7, %o5             ! 8 byte aligned count


        ! This is when there is exactly 1 block remaining and SRC is aligned
3:
        ldd     [%o1 + 0x8], %f4
        ldd     [%o1 + 0x10], %f6
        fsrc1   %f0, %f32
        ldd     [%o1 + 0x18], %f8
        fsrc1   %f2, %f34
        ldd     [%o1 + 0x20], %f10
        fsrc1   %f4, %f36
        ldd     [%o1 + 0x28], %f12
        fsrc1   %f6, %f38
        ldd     [%o1 + 0x30], %f14
        fsrc1   %f8, %f40
        fsrc1   %f10, %f42
        fsrc1   %f12, %f44
        fsrc1   %f14, %f46
        stda    %f32, [%o0]ASI_BLK_P
        membar  #StoreLoad|#StoreStore
        wr      %o4, 0, %fprs
        retl
        mov     %g1, %o0


        .align  16
        ! two nops here causes loop starting at 1f below to be
        ! on a cache line boundary, improving performance
        nop
        nop
.xlarge:
        ! %o0 I/O DST is 64-byte aligned
        ! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
        ! %d0 I/O already loaded with SRC data from [%o1-8]
        ! %o2 I/O count (number of bytes that need to be written)
        ! %o3 I   Not written.  If zero, then SRC is double aligned.
        ! %o4 I   Not written.  Holds fprs.
        ! %o5 O   The number of doubles that remain to be written.

        ! Load the rest of the current block
        ! Recall that %o1 is further into SRC than %o0 is into DST

        ldd     [%o1], %f2
        ldd     [%o1 + 0x8], %f4
        faligndata %f0, %f2, %f32
        ldd     [%o1 + 0x10], %f6
        faligndata %f2, %f4, %f34
        ldd     [%o1 + 0x18], %f8
        faligndata %f4, %f6, %f36
        ldd     [%o1 + 0x20], %f10
        or      %g0, -8, %o5            ! if %o3 >= 0, %o5 = -8
        faligndata %f6, %f8, %f38
        ldd     [%o1 + 0x28], %f12
        movrlz  %o3, %g0, %o5           ! if %o3 < 0, %o5 = 0 (needed later)
        prefetch [%o1 + (16 * BLOCK_SIZE)], #one_read
        faligndata %f8, %f10, %f40
        ldd     [%o1 + 0x30], %f14
        faligndata %f10, %f12, %f42
        ldd     [%o1 + 0x38], %f0
        prefetch [%o1 + (17 * BLOCK_SIZE)], #one_read
        sub     %o2, BLOCK_SIZE, %o2    ! update count
        add     %o1, BLOCK_SIZE, %o1    ! update SRC

        ! This point is 32-byte aligned since 24 instructions appear since
        ! the previous alignment directive.


        ! Main loop.  Write previous block.  Load rest of current block.
        ! Some bytes will be loaded that won't yet be written.
1:
        ldd     [%o1], %f2
        faligndata %f12, %f14, %f44
        ldd     [%o1 + 0x8], %f4
        faligndata %f14, %f0, %f46
        stda    %f32, [%o0]ASI_BLK_P
        sub     %o2, BLOCK_SIZE, %o2    ! update count
        ldd     [%o1 + 0x10], %f6
        faligndata %f0, %f2, %f32
        ldd     [%o1 + 0x18], %f8
        faligndata %f2, %f4, %f34
        ldd     [%o1 + 0x20], %f10
        faligndata %f4, %f6, %f36
        ldd     [%o1 + 0x28], %f12
        faligndata %f6, %f8, %f38
        ldd     [%o1 + 0x30], %f14
        prefetch [%o1 + (2 * BLOCK_SIZE)], #n_reads
        faligndata %f8, %f10, %f40
        ldd     [%o1 + 0x38], %f0
        faligndata %f10, %f12, %f42
        prefetch [%o1 + (25 * BLOCK_SIZE)], #one_read
        add     %o0, BLOCK_SIZE, %o0    ! update DST
        cmp     %o2, BLOCK_SIZE + 8
        ! second prefetch important to correct for occasional dropped
        prefetch [%o1 + (18 * BLOCK_SIZE)], #one_read
        bgu,pt  %ncc, 1b
        add     %o1, BLOCK_SIZE, %o1    ! update SRC

        faligndata %f12, %f14, %f44
        faligndata %f14, %f0, %f46
        stda    %f32, [%o0]ASI_BLK_P    ! store 64 bytes, bypass cache
        cmp     %o2, BLOCK_SIZE
        bne     %ncc, 2f                ! exactly 1 block remaining?
        add     %o0, BLOCK_SIZE, %o0    ! update DST
        brz,a   %o3, 3f                 ! is SRC double aligned?
        ldd     [%o1], %f2

2:
        add     %o5, %o2, %o5           ! %o5 was already set to 0 or -8
        add     %o5, %o3, %o5

        membar  #StoreLoad|#StoreStore

        ba      .beginmedloop
        andn    %o5, 7, %o5             ! 8 byte aligned count


        ! This is when there is exactly 1 block remaining and SRC is aligned
3:
        ldd     [%o1 + 0x8], %f4
        ldd     [%o1 + 0x10], %f6
        fsrc1   %f0, %f32
        ldd     [%o1 + 0x18], %f8
        fsrc1   %f2, %f34
        ldd     [%o1 + 0x20], %f10
        fsrc1   %f4, %f36
        ldd     [%o1 + 0x28], %f12
        fsrc1   %f6, %f38
        ldd     [%o1 + 0x30], %f14
        fsrc1   %f8, %f40
        fsrc1   %f10, %f42
        fsrc1   %f12, %f44
        fsrc1   %f14, %f46
        stda    %f32, [%o0]ASI_BLK_P
        membar  #StoreLoad|#StoreStore
        wr      %o4, 0, %fprs
        retl
        mov     %g1, %o0

        SET_SIZE(memcpy)