1*5d9d9091SRichard Lowe/* 2*5d9d9091SRichard Lowe * CDDL HEADER START 3*5d9d9091SRichard Lowe * 4*5d9d9091SRichard Lowe * The contents of this file are subject to the terms of the 5*5d9d9091SRichard Lowe * Common Development and Distribution License (the "License"). 6*5d9d9091SRichard Lowe * You may not use this file except in compliance with the License. 7*5d9d9091SRichard Lowe * 8*5d9d9091SRichard Lowe * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9*5d9d9091SRichard Lowe * or http://www.opensolaris.org/os/licensing. 10*5d9d9091SRichard Lowe * See the License for the specific language governing permissions 11*5d9d9091SRichard Lowe * and limitations under the License. 12*5d9d9091SRichard Lowe * 13*5d9d9091SRichard Lowe * When distributing Covered Code, include this CDDL HEADER in each 14*5d9d9091SRichard Lowe * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15*5d9d9091SRichard Lowe * If applicable, add the following below this CDDL HEADER, with the 16*5d9d9091SRichard Lowe * fields enclosed by brackets "[]" replaced with your own identifying 17*5d9d9091SRichard Lowe * information: Portions Copyright [yyyy] [name of copyright owner] 18*5d9d9091SRichard Lowe * 19*5d9d9091SRichard Lowe * CDDL HEADER END 20*5d9d9091SRichard Lowe */ 21*5d9d9091SRichard Lowe 22*5d9d9091SRichard Lowe/* 23*5d9d9091SRichard Lowe * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24*5d9d9091SRichard Lowe */ 25*5d9d9091SRichard Lowe 26*5d9d9091SRichard Lowe .file "memcpy.s" 27*5d9d9091SRichard Lowe 28*5d9d9091SRichard Lowe/* 29*5d9d9091SRichard Lowe * memcpy(s1, s2, len) 30*5d9d9091SRichard Lowe * 31*5d9d9091SRichard Lowe * Copy s2 to s1, always copy n bytes. 32*5d9d9091SRichard Lowe * Note: this C code does not work for overlapped copies. 33*5d9d9091SRichard Lowe * Memmove() and bcopy() do. 34*5d9d9091SRichard Lowe * 35*5d9d9091SRichard Lowe * Added entry __align_cpy_1 is generally for use of the compilers. 
36*5d9d9091SRichard Lowe * 37*5d9d9091SRichard Lowe * Fast assembler language version of the following C-program for memcpy 38*5d9d9091SRichard Lowe * which represents the `standard' for the C-library. 39*5d9d9091SRichard Lowe * 40*5d9d9091SRichard Lowe * void * 41*5d9d9091SRichard Lowe * memcpy(void *s, const void *s0, size_t n) 42*5d9d9091SRichard Lowe * { 43*5d9d9091SRichard Lowe * if (n != 0) { 44*5d9d9091SRichard Lowe * char *s1 = s; 45*5d9d9091SRichard Lowe * const char *s2 = s0; 46*5d9d9091SRichard Lowe * do { 47*5d9d9091SRichard Lowe * *s1++ = *s2++; 48*5d9d9091SRichard Lowe * } while (--n != 0); 49*5d9d9091SRichard Lowe * } 50*5d9d9091SRichard Lowe * return (s); 51*5d9d9091SRichard Lowe * } 52*5d9d9091SRichard Lowe * 53*5d9d9091SRichard Lowe * 54*5d9d9091SRichard Lowe * N1 Flow : 55*5d9d9091SRichard Lowe * 56*5d9d9091SRichard Lowe * if (count < 17) { 57*5d9d9091SRichard Lowe * Do the byte copy 58*5d9d9091SRichard Lowe * Return destination address 59*5d9d9091SRichard Lowe * } 60*5d9d9091SRichard Lowe * if (count < 128) { 61*5d9d9091SRichard Lowe * Is source aligned on word boundary 62*5d9d9091SRichard Lowe * If no then align source on word boundary then goto .ald 63*5d9d9091SRichard Lowe * If yes goto .ald 64*5d9d9091SRichard Lowe * .ald: 65*5d9d9091SRichard Lowe * Is destination aligned on word boundary 66*5d9d9091SRichard Lowe * Depending on destination offset (last 2 bits of destination) 67*5d9d9091SRichard Lowe * copy data by shifting and merging. 68*5d9d9091SRichard Lowe * Copy residue bytes as byte copy 69*5d9d9091SRichard Lowe * Return destination address 70*5d9d9091SRichard Lowe * } else { 71*5d9d9091SRichard Lowe * Align destination on block boundary 72*5d9d9091SRichard Lowe * Depending on the source offset (last 4 bits of source address) align 73*5d9d9091SRichard Lowe * the data and store to destination. Both the load and store are done 74*5d9d9091SRichard Lowe * using ASI_BLK_INIT_ST_QUAD_LDD_P. 
75*5d9d9091SRichard Lowe * For remaining count copy as much data in 8-byte chunk from source to 76*5d9d9091SRichard Lowe * destination. 77*5d9d9091SRichard Lowe * Followed by trailing copy using byte copy. 78*5d9d9091SRichard Lowe * Return saved destination address 79*5d9d9091SRichard Lowe * } 80*5d9d9091SRichard Lowe * 81*5d9d9091SRichard Lowe * 82*5d9d9091SRichard Lowe * N2 Flow : 83*5d9d9091SRichard Lowe * Flow : 84*5d9d9091SRichard Lowe * 85*5d9d9091SRichard Lowe * if (count < 128) { 86*5d9d9091SRichard Lowe * if count < 3 87*5d9d9091SRichard Lowe * copy bytes; exit with dst addr 88*5d9d9091SRichard Lowe * if src & dst aligned on word boundary but not long word boundary, 89*5d9d9091SRichard Lowe * copy with ldw/stw; branch to finish_up 90*5d9d9091SRichard Lowe * if src & dst aligned on long word boundary 91*5d9d9091SRichard Lowe * copy with ldx/stx; branch to finish_up 92*5d9d9091SRichard Lowe * if src & dst not aligned and length <= 14 93*5d9d9091SRichard Lowe * copy bytes; exit with dst addr 94*5d9d9091SRichard Lowe * move enough bytes to get src to word boundary 95*5d9d9091SRichard Lowe * if dst now on word boundary 96*5d9d9091SRichard Lowe * move_words: 97*5d9d9091SRichard Lowe * copy words; branch to finish_up 98*5d9d9091SRichard Lowe * if dst now on half word boundary 99*5d9d9091SRichard Lowe * load words, shift half words, store words; branch to finish_up 100*5d9d9091SRichard Lowe * if dst on byte 1 101*5d9d9091SRichard Lowe * load words, shift 3 bytes, store words; branch to finish_up 102*5d9d9091SRichard Lowe * if dst on byte 3 103*5d9d9091SRichard Lowe * load words, shift 1 byte, store words; branch to finish_up 104*5d9d9091SRichard Lowe * finish_up: 105*5d9d9091SRichard Lowe * copy bytes; exit with dst addr 106*5d9d9091SRichard Lowe * } else { More than 128 bytes 107*5d9d9091SRichard Lowe * move bytes until dst is on long word boundary 108*5d9d9091SRichard Lowe * if( src is on long word boundary ) { 109*5d9d9091SRichard Lowe * if (count < 512) { 
110*5d9d9091SRichard Lowe * finish_long: src/dst aligned on 8 bytes 111*5d9d9091SRichard Lowe * copy with ldx/stx in 8-way unrolled loop; 112*5d9d9091SRichard Lowe * copy final 0-63 bytes; exit with dst addr 113*5d9d9091SRichard Lowe * } else { src/dst aligned; count > 512 114*5d9d9091SRichard Lowe * align dst on 64 byte boundary; use 8-way test for each of 8 possible 115*5d9d9091SRichard Lowe * src alignments relative to a 64 byte boundary to select the 116*5d9d9091SRichard Lowe * 16-way unrolled loop to use for 117*5d9d9091SRichard Lowe * block load, fmovd, block-init-store, block-store, fmovd operations 118*5d9d9091SRichard Lowe * then go to finish_long. 119*5d9d9091SRichard Lowe * } 120*5d9d9091SRichard Lowe * } else { src/dst not aligned on 8 bytes 121*5d9d9091SRichard Lowe * if src is word aligned and count < 512 122*5d9d9091SRichard Lowe * move words in 8-way unrolled loop 123*5d9d9091SRichard Lowe * move final 0-31 bytes; exit with dst addr 124*5d9d9091SRichard Lowe * if count < 512 125*5d9d9091SRichard Lowe * use alignaddr/faligndata combined with ldd/std in 8-way 126*5d9d9091SRichard Lowe * unrolled loop to move data. 127*5d9d9091SRichard Lowe * go to unalign_done 128*5d9d9091SRichard Lowe * else 129*5d9d9091SRichard Lowe * setup alignaddr for faligndata instructions 130*5d9d9091SRichard Lowe * align dst on 64 byte boundary; use 8-way test for each of 8 possible 131*5d9d9091SRichard Lowe * src alignments to nearest long word relative to 64 byte boundary to 132*5d9d9091SRichard Lowe * select the 8-way unrolled loop to use for 133*5d9d9091SRichard Lowe * block load, falign, fmovd, block-init-store, block-store loop 134*5d9d9091SRichard Lowe * (only use block-init-store when src/dst on 8 byte boundaries.) 135*5d9d9091SRichard Lowe * unalign_done: 136*5d9d9091SRichard Lowe * move remaining bytes for unaligned cases. exit with dst addr. 
137*5d9d9091SRichard Lowe * } 138*5d9d9091SRichard Lowe * 139*5d9d9091SRichard Lowe * Comment on N2 memmove and memcpy common code and block-store-init: 140*5d9d9091SRichard Lowe * In the man page for memmove, it specifies that copying will take place 141*5d9d9091SRichard Lowe * correctly between objects that overlap. For memcpy, behavior is 142*5d9d9091SRichard Lowe * undefined for objects that overlap. 143*5d9d9091SRichard Lowe * 144*5d9d9091SRichard Lowe * In rare cases, some multi-threaded applications may attempt to examine 145*5d9d9091SRichard Lowe * the copy destination buffer during the copy. Using the block-store-init 146*5d9d9091SRichard Lowe * instruction allows those applications to observe zeros in some 147*5d9d9091SRichard Lowe * cache lines of the destination buffer for narrow windows. But 148*5d9d9091SRichard Lowe * the block-store-init provides memory throughput advantages for many 149*5d9d9091SRichard Lowe * common applications. To meet both needs, those applications which need 150*5d9d9091SRichard Lowe * the destination buffer to retain meaning during the copy should use 151*5d9d9091SRichard Lowe * memmove instead of memcpy. The memmove version duplicates the memcpy 152*5d9d9091SRichard Lowe * algorithms except the memmove version does not use block-store-init 153*5d9d9091SRichard Lowe * in those cases where memcpy does use block-store-init. Otherwise, when 154*5d9d9091SRichard Lowe * memmove can determine the source and destination do not overlap, 155*5d9d9091SRichard Lowe * memmove shares the memcpy code. 
156*5d9d9091SRichard Lowe */ 157*5d9d9091SRichard Lowe 158*5d9d9091SRichard Lowe#include <sys/asm_linkage.h> 159*5d9d9091SRichard Lowe#include <sys/niagaraasi.h> 160*5d9d9091SRichard Lowe#include <sys/asi.h> 161*5d9d9091SRichard Lowe#include <sys/trap.h> 162*5d9d9091SRichard Lowe 163*5d9d9091SRichard Lowe/* documented name for primary block initializing store */ 164*5d9d9091SRichard Lowe#define ASI_STBI_P ASI_BLK_INIT_ST_QUAD_LDD_P 165*5d9d9091SRichard Lowe 166*5d9d9091SRichard Lowe#define BLOCK_SIZE 64 167*5d9d9091SRichard Lowe#define FPRS_FEF 0x4 168*5d9d9091SRichard Lowe 169*5d9d9091SRichard Lowe#define SHORTCOPY 3 170*5d9d9091SRichard Lowe#define SHORTCHECK 14 171*5d9d9091SRichard Lowe#define SHORT_LONG 64 /* max copy for short longword-aligned case */ 172*5d9d9091SRichard Lowe /* must be at least 32 */ 173*5d9d9091SRichard Lowe#define SMALL_MAX 128 174*5d9d9091SRichard Lowe#define MED_UMAX 512 /* max copy for medium un-aligned case */ 175*5d9d9091SRichard Lowe#define MED_WMAX 512 /* max copy for medium word-aligned case */ 176*5d9d9091SRichard Lowe#define MED_MAX 512 /* max copy for medium longword-aligned case */ 177*5d9d9091SRichard Lowe 178*5d9d9091SRichard Lowe#ifdef NIAGARA2_IMPL 179*5d9d9091SRichard Lowe#include <sys/sun4asi.h> 180*5d9d9091SRichard Lowe 181*5d9d9091SRichard Lowe#else /* NIAGARA2_IMPL */ 182*5d9d9091SRichard Lowe/* 183*5d9d9091SRichard Lowe * This define is to align data for the unaligned source cases. 184*5d9d9091SRichard Lowe * The data1, data2 and data3 is merged into data1 and data2. 185*5d9d9091SRichard Lowe * The data3 is preserved for next merge. 
186*5d9d9091SRichard Lowe */ 187*5d9d9091SRichard Lowe#define ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp) \ 188*5d9d9091SRichard Lowe sllx data1, lshift, data1 ;\ 189*5d9d9091SRichard Lowe srlx data2, rshift, tmp ;\ 190*5d9d9091SRichard Lowe or data1, tmp, data1 ;\ 191*5d9d9091SRichard Lowe sllx data2, lshift, data2 ;\ 192*5d9d9091SRichard Lowe srlx data3, rshift, tmp ;\ 193*5d9d9091SRichard Lowe or data2, tmp, data2 194*5d9d9091SRichard Lowe/* 195*5d9d9091SRichard Lowe * Align the data. Merge the data1 and data2 into data1. 196*5d9d9091SRichard Lowe */ 197*5d9d9091SRichard Lowe#define ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp) \ 198*5d9d9091SRichard Lowe sllx data1, lshift, data1 ;\ 199*5d9d9091SRichard Lowe srlx data2, rshift, tmp ;\ 200*5d9d9091SRichard Lowe or data1, tmp, data1 201*5d9d9091SRichard Lowe#endif /* NIAGARA2_IMPL */ 202*5d9d9091SRichard Lowe 203*5d9d9091SRichard Lowe 204*5d9d9091SRichard Lowe ANSI_PRAGMA_WEAK(memmove,function) 205*5d9d9091SRichard Lowe ANSI_PRAGMA_WEAK(memcpy,function) 206*5d9d9091SRichard Lowe 207*5d9d9091SRichard Lowe ENTRY(memmove) 208*5d9d9091SRichard Lowe cmp %o1, %o0 ! if from address is >= to use forward copy 209*5d9d9091SRichard Lowe bgeu,pn %ncc, .forcpy ! else use backward if ... 210*5d9d9091SRichard Lowe sub %o0, %o1, %o4 ! get difference of two addresses 211*5d9d9091SRichard Lowe cmp %o2, %o4 ! compare size and difference of addresses 212*5d9d9091SRichard Lowe bleu,pn %ncc, .forcpy ! if size is bigger, do overlapped copy 213*5d9d9091SRichard Lowe add %o1, %o2, %o5 ! get to end of source space 214*5d9d9091SRichard Lowe 215*5d9d9091SRichard Lowe ! 216*5d9d9091SRichard Lowe ! an overlapped copy that must be done "backwards" 217*5d9d9091SRichard Lowe ! 218*5d9d9091SRichard Lowe.chksize: 219*5d9d9091SRichard Lowe cmp %o2, 8 ! less than 8 byte do byte copy 220*5d9d9091SRichard Lowe blu,pt %ncc, 2f ! else continue 221*5d9d9091SRichard Lowe 222*5d9d9091SRichard Lowe ! 
Now size is bigger than 8 223*5d9d9091SRichard Lowe.dbalign: 224*5d9d9091SRichard Lowe add %o0, %o2, %g1 ! get to end of dest space 225*5d9d9091SRichard Lowe andcc %g1, 7, %o3 ! %o3 has bytes till dst 8 bytes aligned 226*5d9d9091SRichard Lowe bz,a,pn %ncc, .dbbck ! if dst is not 8 byte aligned: align it 227*5d9d9091SRichard Lowe andn %o2, 7, %o3 ! %o3 count is multiple of 8 bytes size 228*5d9d9091SRichard Lowe sub %o2, %o3, %o2 ! update o2 with new count 229*5d9d9091SRichard Lowe 230*5d9d9091SRichard Lowe1: dec %o5 ! decrement source 231*5d9d9091SRichard Lowe ldub [%o5], %g1 ! load one byte 232*5d9d9091SRichard Lowe deccc %o3 ! decrement count 233*5d9d9091SRichard Lowe bgu,pt %ncc, 1b ! if not done keep copying 234*5d9d9091SRichard Lowe stb %g1, [%o5+%o4] ! store one byte into dest 235*5d9d9091SRichard Lowe andncc %o2, 7, %o3 ! %o3 count is multiple of 8 bytes size 236*5d9d9091SRichard Lowe bz,pn %ncc, 2f ! if size < 8, move to byte copy 237*5d9d9091SRichard Lowe 238*5d9d9091SRichard Lowe ! Now Destination is 8 byte aligned 239*5d9d9091SRichard Lowe.dbbck: 240*5d9d9091SRichard Lowe andcc %o5, 7, %o0 ! %o0 has src offset 241*5d9d9091SRichard Lowe bz,a,pn %ncc, .dbcopybc ! if src is aligned to fast mem move 242*5d9d9091SRichard Lowe sub %o2, %o3, %o2 ! Residue bytes in %o2 243*5d9d9091SRichard Lowe 244*5d9d9091SRichard Lowe.cpy_dbwdbc: ! alignment of src is needed 245*5d9d9091SRichard Lowe sub %o2, 8, %o2 ! set size one loop ahead 246*5d9d9091SRichard Lowe sll %o0, 3, %g1 ! %g1 is left shift 247*5d9d9091SRichard Lowe mov 64, %g5 ! init %g5 to be 64 248*5d9d9091SRichard Lowe sub %g5, %g1, %g5 ! %g5 right shift = (64 - left shift) 249*5d9d9091SRichard Lowe sub %o5, %o0, %o5 ! align the src at 8 bytes. 250*5d9d9091SRichard Lowe add %o4, %o0, %o4 ! increase difference between src & dst 251*5d9d9091SRichard Lowe ldx [%o5], %o1 ! load first 8 bytes 252*5d9d9091SRichard Lowe srlx %o1, %g5, %o1 253*5d9d9091SRichard Lowe1: sub %o5, 8, %o5 ! 
subtract 8 from src 254*5d9d9091SRichard Lowe ldx [%o5], %o0 ! load 8 byte 255*5d9d9091SRichard Lowe sllx %o0, %g1, %o3 ! shift loaded 8 bytes left into tmp reg 256*5d9d9091SRichard Lowe or %o1, %o3, %o3 ! align data 257*5d9d9091SRichard Lowe stx %o3, [%o5+%o4] ! store 8 byte 258*5d9d9091SRichard Lowe subcc %o2, 8, %o2 ! subtract 8 byte from size 259*5d9d9091SRichard Lowe bg,pt %ncc, 1b ! if size > 0 continue 260*5d9d9091SRichard Lowe srlx %o0, %g5, %o1 ! move extra byte for the next use 261*5d9d9091SRichard Lowe 262*5d9d9091SRichard Lowe srl %g1, 3, %o0 ! retsote %o0 value for alignment 263*5d9d9091SRichard Lowe add %o5, %o0, %o5 ! restore src alignment 264*5d9d9091SRichard Lowe sub %o4, %o0, %o4 ! restore difference between src & dest 265*5d9d9091SRichard Lowe 266*5d9d9091SRichard Lowe ba 2f ! branch to the trailing byte copy 267*5d9d9091SRichard Lowe add %o2, 8, %o2 ! restore size value 268*5d9d9091SRichard Lowe 269*5d9d9091SRichard Lowe.dbcopybc: ! alignment of src is not needed 270*5d9d9091SRichard Lowe1: sub %o5, 8, %o5 ! subtract from src 271*5d9d9091SRichard Lowe ldx [%o5], %g1 ! load 8 bytes 272*5d9d9091SRichard Lowe subcc %o3, 8, %o3 ! subtract from size 273*5d9d9091SRichard Lowe bgu,pt %ncc, 1b ! if size is bigger 0 continue 274*5d9d9091SRichard Lowe stx %g1, [%o5+%o4] ! store 8 bytes to destination 275*5d9d9091SRichard Lowe 276*5d9d9091SRichard Lowe ba 2f 277*5d9d9091SRichard Lowe nop 278*5d9d9091SRichard Lowe 279*5d9d9091SRichard Lowe.bcbyte: 280*5d9d9091SRichard Lowe1: ldub [%o5], %g1 ! load one byte 281*5d9d9091SRichard Lowe stb %g1, [%o5+%o4] ! store one byte 282*5d9d9091SRichard Lowe2: deccc %o2 ! decrement size 283*5d9d9091SRichard Lowe bgeu,a,pt %ncc, 1b ! if size is >= 0 continue 284*5d9d9091SRichard Lowe dec %o5 ! decrement from address 285*5d9d9091SRichard Lowe 286*5d9d9091SRichard Lowe.exitbc: ! exit from backward copy 287*5d9d9091SRichard Lowe retl 288*5d9d9091SRichard Lowe add %o5, %o4, %o0 ! 
restore dest addr 289*5d9d9091SRichard Lowe 290*5d9d9091SRichard Lowe#ifdef NIAGARA2_IMPL 291*5d9d9091SRichard Lowe ! 292*5d9d9091SRichard Lowe ! Check to see if memmove is large aligned copy 293*5d9d9091SRichard Lowe ! If so, use special version of copy that avoids 294*5d9d9091SRichard Lowe ! use of block store init 295*5d9d9091SRichard Lowe ! 296*5d9d9091SRichard Lowe.forcpy: 297*5d9d9091SRichard Lowe cmp %o2, SMALL_MAX ! check for not small case 298*5d9d9091SRichard Lowe blt,pn %ncc, .mv_short ! merge with memcpy 299*5d9d9091SRichard Lowe mov %o0, %g1 ! save %o0 300*5d9d9091SRichard Lowe neg %o0, %o5 301*5d9d9091SRichard Lowe andcc %o5, 7, %o5 ! bytes till DST 8 byte aligned 302*5d9d9091SRichard Lowe brz,pt %o5, .mv_dst_aligned_on_8 303*5d9d9091SRichard Lowe 304*5d9d9091SRichard Lowe ! %o5 has the bytes to be written in partial store. 305*5d9d9091SRichard Lowe sub %o2, %o5, %o2 306*5d9d9091SRichard Lowe sub %o1, %o0, %o1 ! %o1 gets the difference 307*5d9d9091SRichard Lowe7: ! dst aligning loop 308*5d9d9091SRichard Lowe ldub [%o1+%o0], %o4 ! load one byte 309*5d9d9091SRichard Lowe subcc %o5, 1, %o5 310*5d9d9091SRichard Lowe stb %o4, [%o0] 311*5d9d9091SRichard Lowe bgu,pt %ncc, 7b 312*5d9d9091SRichard Lowe add %o0, 1, %o0 ! advance dst 313*5d9d9091SRichard Lowe add %o1, %o0, %o1 ! restore %o1 314*5d9d9091SRichard Lowe.mv_dst_aligned_on_8: 315*5d9d9091SRichard Lowe andcc %o1, 7, %o5 316*5d9d9091SRichard Lowe brnz,pt %o5, .src_dst_unaligned_on_8 317*5d9d9091SRichard Lowe prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read 318*5d9d9091SRichard Lowe 319*5d9d9091SRichard Lowe.mv_src_dst_aligned_on_8: 320*5d9d9091SRichard Lowe ! check if we are copying MED_MAX or more bytes 321*5d9d9091SRichard Lowe cmp %o2, MED_MAX ! 
limit to store buffer size 322*5d9d9091SRichard Lowe bleu,pt %ncc, .medlong 323*5d9d9091SRichard Lowe prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read 324*5d9d9091SRichard Lowe 325*5d9d9091SRichard Lowe/* 326*5d9d9091SRichard Lowe * The following memmove code mimics the memcpy code for large aligned copies, 327*5d9d9091SRichard Lowe * but does not use the ASI_STBI_P (block initializing store) performance 328*5d9d9091SRichard Lowe * optimization. See memmove rationale section in documentation 329*5d9d9091SRichard Lowe */ 330*5d9d9091SRichard Lowe.mv_large_align8_copy: ! Src and dst share 8 byte alignment 331*5d9d9091SRichard Lowe rd %fprs, %g5 ! check for unused fp 332*5d9d9091SRichard Lowe ! if fprs.fef == 0, set it. 333*5d9d9091SRichard Lowe ! Setting it when already set costs more than checking 334*5d9d9091SRichard Lowe andcc %g5, FPRS_FEF, %g5 ! test FEF, fprs.du = fprs.dl = 0 335*5d9d9091SRichard Lowe bz,a %ncc, 1f 336*5d9d9091SRichard Lowe wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1 337*5d9d9091SRichard Lowe1: 338*5d9d9091SRichard Lowe ! align dst to 64 byte boundary 339*5d9d9091SRichard Lowe andcc %o0, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned 340*5d9d9091SRichard Lowe brz,pn %o3, .mv_aligned_on_64 341*5d9d9091SRichard Lowe sub %o3, 64, %o3 ! %o3 has negative bytes to move 342*5d9d9091SRichard Lowe add %o2, %o3, %o2 ! adjust remaining count 343*5d9d9091SRichard Lowe.mv_align_to_64: 344*5d9d9091SRichard Lowe ldx [%o1], %o4 345*5d9d9091SRichard Lowe add %o1, 8, %o1 ! increment src ptr 346*5d9d9091SRichard Lowe addcc %o3, 8, %o3 347*5d9d9091SRichard Lowe stx %o4, [%o0] 348*5d9d9091SRichard Lowe brnz,pt %o3, .mv_align_to_64 349*5d9d9091SRichard Lowe add %o0, 8, %o0 ! increment dst ptr 350*5d9d9091SRichard Lowe 351*5d9d9091SRichard Lowe.mv_aligned_on_64: 352*5d9d9091SRichard Lowe prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read 353*5d9d9091SRichard Lowe mov %asi,%o4 ! save %asi 354*5d9d9091SRichard Lowe ! Determine source alignment 355*5d9d9091SRichard Lowe ! 
to correct 8 byte offset 356*5d9d9091SRichard Lowe andcc %o1, 0x20, %o3 357*5d9d9091SRichard Lowe brnz,pn %o3, .mv_align_1 358*5d9d9091SRichard Lowe mov ASI_BLK_P, %asi ! setup %asi for block load/store 359*5d9d9091SRichard Lowe andcc %o1, 0x10, %o3 360*5d9d9091SRichard Lowe brnz,pn %o3, .mv_align_01 361*5d9d9091SRichard Lowe nop 362*5d9d9091SRichard Lowe andcc %o1, 0x08, %o3 363*5d9d9091SRichard Lowe brz,pn %o3, .mv_align_000 364*5d9d9091SRichard Lowe prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 365*5d9d9091SRichard Lowe ba .mv_align_001 366*5d9d9091SRichard Lowe nop 367*5d9d9091SRichard Lowe.mv_align_01: 368*5d9d9091SRichard Lowe andcc %o1, 0x08, %o3 369*5d9d9091SRichard Lowe brnz,pn %o3, .mv_align_011 370*5d9d9091SRichard Lowe prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 371*5d9d9091SRichard Lowe ba .mv_align_010 372*5d9d9091SRichard Lowe nop 373*5d9d9091SRichard Lowe.mv_align_1: 374*5d9d9091SRichard Lowe andcc %o1, 0x10, %o3 375*5d9d9091SRichard Lowe brnz,pn %o3, .mv_align_11 376*5d9d9091SRichard Lowe nop 377*5d9d9091SRichard Lowe andcc %o1, 0x08, %o3 378*5d9d9091SRichard Lowe brnz,pn %o3, .mv_align_101 379*5d9d9091SRichard Lowe prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 380*5d9d9091SRichard Lowe ba .mv_align_100 381*5d9d9091SRichard Lowe nop 382*5d9d9091SRichard Lowe.mv_align_11: 383*5d9d9091SRichard Lowe andcc %o1, 0x08, %o3 384*5d9d9091SRichard Lowe brz,pn %o3, .mv_align_110 385*5d9d9091SRichard Lowe prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 386*5d9d9091SRichard Lowe 387*5d9d9091SRichard Lowe.mv_align_111: 388*5d9d9091SRichard Lowe! Alignment off by 8 bytes 389*5d9d9091SRichard Lowe ldd [%o1], %d0 390*5d9d9091SRichard Lowe add %o1, 8, %o1 391*5d9d9091SRichard Lowe sub %o2, 8, %o2 392*5d9d9091SRichard Lowe andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 393*5d9d9091SRichard Lowe and %o2, 0x7f, %o2 ! 
residue bytes in %o2 394*5d9d9091SRichard Lowe.mv_align_111_loop: 395*5d9d9091SRichard Lowe subcc %o5, 128, %o5 396*5d9d9091SRichard Lowe /* ---- copy line 1 of 2. ---- */ 397*5d9d9091SRichard Lowe ldda [%o1]%asi,%d16 ! block load 398*5d9d9091SRichard Lowe fmovd %d16, %d2 399*5d9d9091SRichard Lowe fmovd %d18, %d4 400*5d9d9091SRichard Lowe fmovd %d20, %d6 401*5d9d9091SRichard Lowe fmovd %d22, %d8 402*5d9d9091SRichard Lowe fmovd %d24, %d10 403*5d9d9091SRichard Lowe fmovd %d26, %d12 404*5d9d9091SRichard Lowe fmovd %d28, %d14 405*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 406*5d9d9091SRichard Lowe add %o0, 64, %o0 ! advance dst 407*5d9d9091SRichard Lowe prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 408*5d9d9091SRichard Lowe fmovd %d30, %d0 409*5d9d9091SRichard Lowe 410*5d9d9091SRichard Lowe /* ---- copy line 2 of 2. ---- */ 411*5d9d9091SRichard Lowe ldda [%o1+64]%asi,%d16 412*5d9d9091SRichard Lowe fmovd %d16, %d2 413*5d9d9091SRichard Lowe fmovd %d18, %d4 414*5d9d9091SRichard Lowe fmovd %d20, %d6 415*5d9d9091SRichard Lowe fmovd %d22, %d8 416*5d9d9091SRichard Lowe fmovd %d24, %d10 417*5d9d9091SRichard Lowe fmovd %d26, %d12 418*5d9d9091SRichard Lowe fmovd %d28, %d14 419*5d9d9091SRichard Lowe add %o1, 128, %o1 ! increment src 420*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 421*5d9d9091SRichard Lowe add %o0, 64, %o0 ! advance dst 422*5d9d9091SRichard Lowe fmovd %d30, %d0 423*5d9d9091SRichard Lowe bgt,pt %ncc, .mv_align_111_loop 424*5d9d9091SRichard Lowe prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 425*5d9d9091SRichard Lowe 426*5d9d9091SRichard Lowe std %d0, [%o0] 427*5d9d9091SRichard Lowe ba .remain_stuff 428*5d9d9091SRichard Lowe add %o0, 8, %o0 429*5d9d9091SRichard Lowe ! END OF mv_align_111 430*5d9d9091SRichard Lowe 431*5d9d9091SRichard Lowe.mv_align_110: 432*5d9d9091SRichard Lowe! 
Alignment off by 16 bytes 433*5d9d9091SRichard Lowe ldd [%o1], %d0 434*5d9d9091SRichard Lowe ldd [%o1+8], %d2 435*5d9d9091SRichard Lowe add %o1, 16, %o1 436*5d9d9091SRichard Lowe sub %o2, 16, %o2 437*5d9d9091SRichard Lowe andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 438*5d9d9091SRichard Lowe and %o2, 0x7f, %o2 ! residue bytes in %o2 439*5d9d9091SRichard Lowe.mv_align_110_loop: 440*5d9d9091SRichard Lowe subcc %o5, 128, %o5 441*5d9d9091SRichard Lowe /* ---- copy line 1 of 2. ---- */ 442*5d9d9091SRichard Lowe 443*5d9d9091SRichard Lowe ldda [%o1]%asi,%d16 ! block load 444*5d9d9091SRichard Lowe fmovd %d16, %d4 445*5d9d9091SRichard Lowe fmovd %d18, %d6 446*5d9d9091SRichard Lowe fmovd %d20, %d8 447*5d9d9091SRichard Lowe fmovd %d22, %d10 448*5d9d9091SRichard Lowe fmovd %d24, %d12 449*5d9d9091SRichard Lowe fmovd %d26, %d14 450*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 451*5d9d9091SRichard Lowe add %o0, 64, %o0 ! advance dst 452*5d9d9091SRichard Lowe fmovd %d28, %d0 453*5d9d9091SRichard Lowe prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 454*5d9d9091SRichard Lowe fmovd %d30, %d2 455*5d9d9091SRichard Lowe 456*5d9d9091SRichard Lowe /* ---- copy line 2 of 2. ---- */ 457*5d9d9091SRichard Lowe ldda [%o1+64]%asi,%d16 458*5d9d9091SRichard Lowe fmovd %d16, %d4 459*5d9d9091SRichard Lowe fmovd %d18, %d6 460*5d9d9091SRichard Lowe fmovd %d20, %d8 461*5d9d9091SRichard Lowe fmovd %d22, %d10 462*5d9d9091SRichard Lowe fmovd %d24, %d12 463*5d9d9091SRichard Lowe fmovd %d26, %d14 464*5d9d9091SRichard Lowe add %o1, 128, %o1 ! increment src 465*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 466*5d9d9091SRichard Lowe add %o0, 64, %o0 ! 
advance dst 467*5d9d9091SRichard Lowe fmovd %d28, %d0 468*5d9d9091SRichard Lowe fmovd %d30, %d2 469*5d9d9091SRichard Lowe bgt,pt %ncc, .mv_align_110_loop 470*5d9d9091SRichard Lowe prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 471*5d9d9091SRichard Lowe 472*5d9d9091SRichard Lowe std %d0, [%o0] 473*5d9d9091SRichard Lowe std %d2, [%o0+8] 474*5d9d9091SRichard Lowe ba .remain_stuff 475*5d9d9091SRichard Lowe add %o0, 16, %o0 476*5d9d9091SRichard Lowe ! END OF mv_align_110 477*5d9d9091SRichard Lowe 478*5d9d9091SRichard Lowe.mv_align_101: 479*5d9d9091SRichard Lowe! Alignment off by 24 bytes 480*5d9d9091SRichard Lowe ldd [%o1], %d0 481*5d9d9091SRichard Lowe ldd [%o1+8], %d2 482*5d9d9091SRichard Lowe ldd [%o1+16], %d4 483*5d9d9091SRichard Lowe add %o1, 24, %o1 484*5d9d9091SRichard Lowe sub %o2, 24, %o2 485*5d9d9091SRichard Lowe andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 486*5d9d9091SRichard Lowe and %o2, 0x7f, %o2 ! residue bytes in %o2 487*5d9d9091SRichard Lowe.mv_align_101_loop: 488*5d9d9091SRichard Lowe subcc %o5, 128, %o5 489*5d9d9091SRichard Lowe /* ---- copy line 1 of 2. ---- */ 490*5d9d9091SRichard Lowe 491*5d9d9091SRichard Lowe ldda [%o1]%asi,%d16 ! block load 492*5d9d9091SRichard Lowe fmovd %d16, %d6 493*5d9d9091SRichard Lowe fmovd %d18, %d8 494*5d9d9091SRichard Lowe fmovd %d20, %d10 495*5d9d9091SRichard Lowe fmovd %d22, %d12 496*5d9d9091SRichard Lowe fmovd %d24, %d14 497*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 498*5d9d9091SRichard Lowe add %o0, 64, %o0 ! advance dst 499*5d9d9091SRichard Lowe fmovd %d26, %d0 500*5d9d9091SRichard Lowe fmovd %d28, %d2 501*5d9d9091SRichard Lowe prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 502*5d9d9091SRichard Lowe fmovd %d30, %d4 503*5d9d9091SRichard Lowe 504*5d9d9091SRichard Lowe /* ---- copy line 2 of 2. 
---- */ 505*5d9d9091SRichard Lowe ldda [%o1+64]%asi,%d16 506*5d9d9091SRichard Lowe fmovd %d16, %d6 507*5d9d9091SRichard Lowe fmovd %d18, %d8 508*5d9d9091SRichard Lowe fmovd %d20, %d10 509*5d9d9091SRichard Lowe fmovd %d22, %d12 510*5d9d9091SRichard Lowe fmovd %d24, %d14 511*5d9d9091SRichard Lowe add %o1, 128, %o1 ! increment src 512*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 513*5d9d9091SRichard Lowe add %o0, 64, %o0 ! advance dst 514*5d9d9091SRichard Lowe fmovd %d26, %d0 515*5d9d9091SRichard Lowe fmovd %d28, %d2 516*5d9d9091SRichard Lowe fmovd %d30, %d4 517*5d9d9091SRichard Lowe bgt,pt %ncc, .mv_align_101_loop 518*5d9d9091SRichard Lowe prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 519*5d9d9091SRichard Lowe 520*5d9d9091SRichard Lowe std %d0, [%o0] 521*5d9d9091SRichard Lowe std %d2, [%o0+8] 522*5d9d9091SRichard Lowe std %d4, [%o0+16] 523*5d9d9091SRichard Lowe ba .remain_stuff 524*5d9d9091SRichard Lowe add %o0, 24, %o0 525*5d9d9091SRichard Lowe ! END OF mv_align_101 526*5d9d9091SRichard Lowe 527*5d9d9091SRichard Lowe.mv_align_100: 528*5d9d9091SRichard Lowe! Alignment off by 32 bytes 529*5d9d9091SRichard Lowe ldd [%o1], %d0 530*5d9d9091SRichard Lowe ldd [%o1+8], %d2 531*5d9d9091SRichard Lowe ldd [%o1+16],%d4 532*5d9d9091SRichard Lowe ldd [%o1+24],%d6 533*5d9d9091SRichard Lowe add %o1, 32, %o1 534*5d9d9091SRichard Lowe sub %o2, 32, %o2 535*5d9d9091SRichard Lowe andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 536*5d9d9091SRichard Lowe and %o2, 0x7f, %o2 ! residue bytes in %o2 537*5d9d9091SRichard Lowe.mv_align_100_loop: 538*5d9d9091SRichard Lowe subcc %o5, 128, %o5 539*5d9d9091SRichard Lowe /* ---- copy line 1 of 2. ---- */ 540*5d9d9091SRichard Lowe ldda [%o1]%asi,%d16 ! block load 541*5d9d9091SRichard Lowe fmovd %d16, %d8 542*5d9d9091SRichard Lowe fmovd %d18, %d10 543*5d9d9091SRichard Lowe fmovd %d20, %d12 544*5d9d9091SRichard Lowe fmovd %d22, %d14 545*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 546*5d9d9091SRichard Lowe add %o0, 64, %o0 ! 
advance dst 547*5d9d9091SRichard Lowe fmovd %d24, %d0 548*5d9d9091SRichard Lowe fmovd %d26, %d2 549*5d9d9091SRichard Lowe fmovd %d28, %d4 550*5d9d9091SRichard Lowe prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 551*5d9d9091SRichard Lowe fmovd %d30, %d6 552*5d9d9091SRichard Lowe 553*5d9d9091SRichard Lowe /* ---- copy line 2 of 2. ---- */ 554*5d9d9091SRichard Lowe ldda [%o1+64]%asi,%d16 555*5d9d9091SRichard Lowe fmovd %d16, %d8 556*5d9d9091SRichard Lowe fmovd %d18, %d10 557*5d9d9091SRichard Lowe fmovd %d20, %d12 558*5d9d9091SRichard Lowe fmovd %d22, %d14 559*5d9d9091SRichard Lowe add %o1, 128, %o1 ! increment src 560*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 561*5d9d9091SRichard Lowe add %o0, 64, %o0 ! advance dst 562*5d9d9091SRichard Lowe fmovd %d24, %d0 563*5d9d9091SRichard Lowe fmovd %d26, %d2 564*5d9d9091SRichard Lowe fmovd %d28, %d4 565*5d9d9091SRichard Lowe fmovd %d30, %d6 566*5d9d9091SRichard Lowe bgt,pt %ncc, .mv_align_100_loop 567*5d9d9091SRichard Lowe prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 568*5d9d9091SRichard Lowe 569*5d9d9091SRichard Lowe std %d0, [%o0] 570*5d9d9091SRichard Lowe std %d2, [%o0+8] 571*5d9d9091SRichard Lowe std %d4, [%o0+16] 572*5d9d9091SRichard Lowe std %d6, [%o0+24] 573*5d9d9091SRichard Lowe ba .remain_stuff 574*5d9d9091SRichard Lowe add %o0, 32, %o0 575*5d9d9091SRichard Lowe ! END OF mv_align_100 576*5d9d9091SRichard Lowe 577*5d9d9091SRichard Lowe.mv_align_011: 578*5d9d9091SRichard Lowe! Alignment off by 40 bytes 579*5d9d9091SRichard Lowe ldd [%o1], %d0 580*5d9d9091SRichard Lowe ldd [%o1+8], %d2 581*5d9d9091SRichard Lowe ldd [%o1+16], %d4 582*5d9d9091SRichard Lowe ldd [%o1+24], %d6 583*5d9d9091SRichard Lowe ldd [%o1+32], %d8 584*5d9d9091SRichard Lowe add %o1, 40, %o1 585*5d9d9091SRichard Lowe sub %o2, 40, %o2 586*5d9d9091SRichard Lowe andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 587*5d9d9091SRichard Lowe and %o2, 0x7f, %o2 ! 
residue bytes in %o2 588*5d9d9091SRichard Lowe.mv_align_011_loop: 589*5d9d9091SRichard Lowe subcc %o5, 128, %o5 590*5d9d9091SRichard Lowe /* ---- copy line 1 of 2. ---- */ 591*5d9d9091SRichard Lowe 592*5d9d9091SRichard Lowe ldda [%o1]%asi,%d16 ! block load 593*5d9d9091SRichard Lowe fmovd %d16, %d10 594*5d9d9091SRichard Lowe fmovd %d18, %d12 595*5d9d9091SRichard Lowe fmovd %d20, %d14 596*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 597*5d9d9091SRichard Lowe add %o0, 64, %o0 ! advance dst 598*5d9d9091SRichard Lowe fmovd %d22, %d0 599*5d9d9091SRichard Lowe fmovd %d24, %d2 600*5d9d9091SRichard Lowe fmovd %d26, %d4 601*5d9d9091SRichard Lowe fmovd %d28, %d6 602*5d9d9091SRichard Lowe prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 603*5d9d9091SRichard Lowe fmovd %d30, %d8 604*5d9d9091SRichard Lowe 605*5d9d9091SRichard Lowe /* ---- copy line 2 of 2. ---- */ 606*5d9d9091SRichard Lowe ldda [%o1+64]%asi,%d16 607*5d9d9091SRichard Lowe fmovd %d16, %d10 608*5d9d9091SRichard Lowe fmovd %d18, %d12 609*5d9d9091SRichard Lowe fmovd %d20, %d14 610*5d9d9091SRichard Lowe add %o1, 128, %o1 ! increment src 611*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 612*5d9d9091SRichard Lowe add %o0, 64, %o0 ! advance dst 613*5d9d9091SRichard Lowe fmovd %d22, %d0 614*5d9d9091SRichard Lowe fmovd %d24, %d2 615*5d9d9091SRichard Lowe fmovd %d26, %d4 616*5d9d9091SRichard Lowe fmovd %d28, %d6 617*5d9d9091SRichard Lowe fmovd %d30, %d8 618*5d9d9091SRichard Lowe bgt,pt %ncc, .mv_align_011_loop 619*5d9d9091SRichard Lowe prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 620*5d9d9091SRichard Lowe 621*5d9d9091SRichard Lowe std %d0, [%o0] 622*5d9d9091SRichard Lowe std %d2, [%o0+8] 623*5d9d9091SRichard Lowe std %d4, [%o0+16] 624*5d9d9091SRichard Lowe std %d6, [%o0+24] 625*5d9d9091SRichard Lowe std %d8, [%o0+32] 626*5d9d9091SRichard Lowe ba .remain_stuff 627*5d9d9091SRichard Lowe add %o0, 40, %o0 628*5d9d9091SRichard Lowe ! 
END OF mv_align_011 629*5d9d9091SRichard Lowe 630*5d9d9091SRichard Lowe.mv_align_010: 631*5d9d9091SRichard Lowe! Alignment off by 48 bytes 632*5d9d9091SRichard Lowe ldd [%o1], %d0 633*5d9d9091SRichard Lowe ldd [%o1+8], %d2 634*5d9d9091SRichard Lowe ldd [%o1+16], %d4 635*5d9d9091SRichard Lowe ldd [%o1+24], %d6 636*5d9d9091SRichard Lowe ldd [%o1+32], %d8 637*5d9d9091SRichard Lowe ldd [%o1+40], %d10 638*5d9d9091SRichard Lowe add %o1, 48, %o1 639*5d9d9091SRichard Lowe sub %o2, 48, %o2 640*5d9d9091SRichard Lowe andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 641*5d9d9091SRichard Lowe and %o2, 0x7f, %o2 ! residue bytes in %o2 642*5d9d9091SRichard Lowe.mv_align_010_loop: 643*5d9d9091SRichard Lowe subcc %o5, 128, %o5 644*5d9d9091SRichard Lowe /* ---- copy line 1 of 2. ---- */ 645*5d9d9091SRichard Lowe 646*5d9d9091SRichard Lowe ldda [%o1]%asi,%d16 ! block load 647*5d9d9091SRichard Lowe fmovd %d16, %d12 648*5d9d9091SRichard Lowe fmovd %d18, %d14 649*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 650*5d9d9091SRichard Lowe add %o0, 64, %o0 ! advance dst 651*5d9d9091SRichard Lowe fmovd %d20, %d0 652*5d9d9091SRichard Lowe fmovd %d22, %d2 653*5d9d9091SRichard Lowe fmovd %d24, %d4 654*5d9d9091SRichard Lowe fmovd %d26, %d6 655*5d9d9091SRichard Lowe fmovd %d28, %d8 656*5d9d9091SRichard Lowe prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 657*5d9d9091SRichard Lowe fmovd %d30, %d10 658*5d9d9091SRichard Lowe 659*5d9d9091SRichard Lowe /* ---- copy line 2 of 2. ---- */ 660*5d9d9091SRichard Lowe ldda [%o1+64]%asi,%d16 661*5d9d9091SRichard Lowe fmovd %d16, %d12 662*5d9d9091SRichard Lowe fmovd %d18, %d14 663*5d9d9091SRichard Lowe add %o1, 128, %o1 ! increment src 664*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 665*5d9d9091SRichard Lowe add %o0, 64, %o0 ! 
advance dst 666*5d9d9091SRichard Lowe fmovd %d20, %d0 667*5d9d9091SRichard Lowe fmovd %d22, %d2 668*5d9d9091SRichard Lowe fmovd %d24, %d4 669*5d9d9091SRichard Lowe fmovd %d26, %d6 670*5d9d9091SRichard Lowe fmovd %d28, %d8 671*5d9d9091SRichard Lowe fmovd %d30, %d10 672*5d9d9091SRichard Lowe bgt,pt %ncc, .mv_align_010_loop 673*5d9d9091SRichard Lowe prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 674*5d9d9091SRichard Lowe 675*5d9d9091SRichard Lowe std %d0, [%o0] 676*5d9d9091SRichard Lowe std %d2, [%o0+8] 677*5d9d9091SRichard Lowe std %d4, [%o0+16] 678*5d9d9091SRichard Lowe std %d6, [%o0+24] 679*5d9d9091SRichard Lowe std %d8, [%o0+32] 680*5d9d9091SRichard Lowe std %d10, [%o0+40] 681*5d9d9091SRichard Lowe ba .remain_stuff 682*5d9d9091SRichard Lowe add %o0, 48, %o0 683*5d9d9091SRichard Lowe ! END OF mv_align_010 684*5d9d9091SRichard Lowe 685*5d9d9091SRichard Lowe.mv_align_001: 686*5d9d9091SRichard Lowe! Alignment off by 56 bytes 687*5d9d9091SRichard Lowe ldd [%o1], %d0 688*5d9d9091SRichard Lowe ldd [%o1+8], %d2 689*5d9d9091SRichard Lowe ldd [%o1+16], %d4 690*5d9d9091SRichard Lowe ldd [%o1+24], %d6 691*5d9d9091SRichard Lowe ldd [%o1+32], %d8 692*5d9d9091SRichard Lowe ldd [%o1+40], %d10 693*5d9d9091SRichard Lowe ldd [%o1+48], %d12 694*5d9d9091SRichard Lowe add %o1, 56, %o1 695*5d9d9091SRichard Lowe sub %o2, 56, %o2 696*5d9d9091SRichard Lowe andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 697*5d9d9091SRichard Lowe and %o2, 0x7f, %o2 ! residue bytes in %o2 698*5d9d9091SRichard Lowe.mv_align_001_loop: 699*5d9d9091SRichard Lowe subcc %o5, 128, %o5 700*5d9d9091SRichard Lowe /* ---- copy line 1 of 2. ---- */ 701*5d9d9091SRichard Lowe 702*5d9d9091SRichard Lowe ldda [%o1]%asi,%d16 ! block load 703*5d9d9091SRichard Lowe fmovd %d16, %d14 704*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 705*5d9d9091SRichard Lowe add %o0, 64, %o0 ! 
advance dst 706*5d9d9091SRichard Lowe fmovd %d18, %d0 707*5d9d9091SRichard Lowe fmovd %d20, %d2 708*5d9d9091SRichard Lowe fmovd %d22, %d4 709*5d9d9091SRichard Lowe fmovd %d24, %d6 710*5d9d9091SRichard Lowe fmovd %d26, %d8 711*5d9d9091SRichard Lowe fmovd %d28, %d10 712*5d9d9091SRichard Lowe prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 713*5d9d9091SRichard Lowe fmovd %d30, %d12 714*5d9d9091SRichard Lowe 715*5d9d9091SRichard Lowe /* ---- copy line 2 of 2. ---- */ 716*5d9d9091SRichard Lowe ldda [%o1+64]%asi,%d16 717*5d9d9091SRichard Lowe fmovd %d16, %d14 718*5d9d9091SRichard Lowe add %o1, 128, %o1 ! increment src 719*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 720*5d9d9091SRichard Lowe add %o0, 64, %o0 ! advance dst 721*5d9d9091SRichard Lowe fmovd %d18, %d0 722*5d9d9091SRichard Lowe fmovd %d20, %d2 723*5d9d9091SRichard Lowe fmovd %d22, %d4 724*5d9d9091SRichard Lowe fmovd %d24, %d6 725*5d9d9091SRichard Lowe fmovd %d26, %d8 726*5d9d9091SRichard Lowe fmovd %d28, %d10 727*5d9d9091SRichard Lowe fmovd %d30, %d12 728*5d9d9091SRichard Lowe bgt,pt %ncc, .mv_align_001_loop 729*5d9d9091SRichard Lowe prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 730*5d9d9091SRichard Lowe 731*5d9d9091SRichard Lowe std %d0, [%o0] 732*5d9d9091SRichard Lowe std %d2, [%o0+8] 733*5d9d9091SRichard Lowe std %d4, [%o0+16] 734*5d9d9091SRichard Lowe std %d6, [%o0+24] 735*5d9d9091SRichard Lowe std %d8, [%o0+32] 736*5d9d9091SRichard Lowe std %d10, [%o0+40] 737*5d9d9091SRichard Lowe std %d12, [%o0+48] 738*5d9d9091SRichard Lowe ba .remain_stuff 739*5d9d9091SRichard Lowe add %o0, 56, %o0 740*5d9d9091SRichard Lowe ! END OF mv_align_001 741*5d9d9091SRichard Lowe 742*5d9d9091SRichard Lowe.mv_align_000: 743*5d9d9091SRichard Lowe andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 744*5d9d9091SRichard Lowe and %o2, 0x7f, %o2 ! residue bytes in %o2 745*5d9d9091SRichard Lowe.mv_align_000_loop: 746*5d9d9091SRichard Lowe /* ---- copy line 1 of 2. 
---- */ 747*5d9d9091SRichard Lowe subcc %o5, 128, %o5 748*5d9d9091SRichard Lowe ldda [%o1]%asi,%d0 749*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 750*5d9d9091SRichard Lowe prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 751*5d9d9091SRichard Lowe 752*5d9d9091SRichard Lowe /* ---- copy line 2 of 2. ---- */ 753*5d9d9091SRichard Lowe add %o0, 64, %o0 754*5d9d9091SRichard Lowe ldda [%o1+64]%asi,%d0 755*5d9d9091SRichard Lowe add %o1, 128, %o1 ! increment src 756*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 757*5d9d9091SRichard Lowe add %o0, 64, %o0 ! increment dst 758*5d9d9091SRichard Lowe bgt,pt %ncc, .mv_align_000_loop 759*5d9d9091SRichard Lowe prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 760*5d9d9091SRichard Lowe ba .remain_stuff 761*5d9d9091SRichard Lowe nop 762*5d9d9091SRichard Lowe 763*5d9d9091SRichard Lowe ! END OF mv_align_000 764*5d9d9091SRichard Lowe#else /* NIAGARA2_IMPL */ 765*5d9d9091SRichard Lowe#endif /* NIAGARA2_IMPL */ 766*5d9d9091SRichard Lowe 767*5d9d9091SRichard Lowe SET_SIZE(memmove) 768*5d9d9091SRichard Lowe 769*5d9d9091SRichard Lowe ENTRY(memcpy) 770*5d9d9091SRichard Lowe ENTRY(__align_cpy_1) 771*5d9d9091SRichard Lowe#ifdef NIAGARA2_IMPL 772*5d9d9091SRichard Lowe cmp %o2, SMALL_MAX ! check for not small case 773*5d9d9091SRichard Lowe bgeu,pn %ncc, .medium ! go to larger cases 774*5d9d9091SRichard Lowe mov %o0, %g1 ! save %o0 775*5d9d9091SRichard Lowe.mv_short: 776*5d9d9091SRichard Lowe cmp %o2, SHORTCOPY ! check for really short case 777*5d9d9091SRichard Lowe ble,pt %ncc, .smallfin 778*5d9d9091SRichard Lowe or %o0, %o1, %o4 ! prepare alignment check 779*5d9d9091SRichard Lowe andcc %o4, 0x3, %o5 ! test for alignment 780*5d9d9091SRichard Lowe bz,pt %ncc, .smallword ! branch to word aligned case 781*5d9d9091SRichard Lowe cmp %o2, SHORTCHECK 782*5d9d9091SRichard Lowe ble,pt %ncc, .smallrest 783*5d9d9091SRichard Lowe andcc %o1, 0x3, %o5 ! is src word aligned 784*5d9d9091SRichard Lowe bz,pn %ncc, .aldst 785*5d9d9091SRichard Lowe cmp %o5, 2 ! 
is src half-word aligned 786*5d9d9091SRichard Lowe be,pt %ncc, .s2algn 787*5d9d9091SRichard Lowe cmp %o5, 3 ! src is byte aligned 788*5d9d9091SRichard Lowe.s1algn:ldub [%o1], %o3 ! move 1 or 3 bytes to align it 789*5d9d9091SRichard Lowe inc 1, %o1 790*5d9d9091SRichard Lowe stb %o3, [%o0] ! move a byte to align src 791*5d9d9091SRichard Lowe inc 1, %o0 792*5d9d9091SRichard Lowe bne,pt %ncc, .s2algn 793*5d9d9091SRichard Lowe dec %o2 794*5d9d9091SRichard Lowe b .ald ! now go align dest 795*5d9d9091SRichard Lowe andcc %o0, 0x3, %o5 796*5d9d9091SRichard Lowe 797*5d9d9091SRichard Lowe.s2algn:lduh [%o1], %o3 ! know src is 2 byte aligned 798*5d9d9091SRichard Lowe inc 2, %o1 799*5d9d9091SRichard Lowe srl %o3, 8, %o4 800*5d9d9091SRichard Lowe stb %o4, [%o0] ! have to do bytes, 801*5d9d9091SRichard Lowe stb %o3, [%o0 + 1] ! don't know dst alignment 802*5d9d9091SRichard Lowe inc 2, %o0 803*5d9d9091SRichard Lowe dec 2, %o2 804*5d9d9091SRichard Lowe 805*5d9d9091SRichard Lowe.aldst: andcc %o0, 0x3, %o5 ! align the destination address 806*5d9d9091SRichard Lowe.ald: bz,pn %ncc, .w4cp 807*5d9d9091SRichard Lowe cmp %o5, 2 808*5d9d9091SRichard Lowe be,pn %ncc, .w2cp 809*5d9d9091SRichard Lowe cmp %o5, 3 810*5d9d9091SRichard Lowe.w3cp: lduw [%o1], %o4 811*5d9d9091SRichard Lowe inc 4, %o1 812*5d9d9091SRichard Lowe srl %o4, 24, %o5 813*5d9d9091SRichard Lowe stb %o5, [%o0] 814*5d9d9091SRichard Lowe bne,pt %ncc, .w1cp 815*5d9d9091SRichard Lowe inc %o0 816*5d9d9091SRichard Lowe dec 1, %o2 817*5d9d9091SRichard Lowe andn %o2, 3, %o3 ! %o3 is aligned word count 818*5d9d9091SRichard Lowe dec 4, %o3 ! avoid reading beyond tail of src 819*5d9d9091SRichard Lowe sub %o1, %o0, %o1 ! %o1 gets the difference 820*5d9d9091SRichard Lowe 821*5d9d9091SRichard Lowe1: sll %o4, 8, %g5 ! save residual bytes 822*5d9d9091SRichard Lowe lduw [%o1+%o0], %o4 823*5d9d9091SRichard Lowe deccc 4, %o3 824*5d9d9091SRichard Lowe srl %o4, 24, %o5 ! 
merge with residual 825*5d9d9091SRichard Lowe or %o5, %g5, %g5 826*5d9d9091SRichard Lowe st %g5, [%o0] 827*5d9d9091SRichard Lowe bnz,pt %ncc, 1b 828*5d9d9091SRichard Lowe inc 4, %o0 829*5d9d9091SRichard Lowe sub %o1, 3, %o1 ! used one byte of last word read 830*5d9d9091SRichard Lowe and %o2, 3, %o2 831*5d9d9091SRichard Lowe b 7f 832*5d9d9091SRichard Lowe inc 4, %o2 833*5d9d9091SRichard Lowe 834*5d9d9091SRichard Lowe.w1cp: srl %o4, 8, %o5 835*5d9d9091SRichard Lowe sth %o5, [%o0] 836*5d9d9091SRichard Lowe inc 2, %o0 837*5d9d9091SRichard Lowe dec 3, %o2 838*5d9d9091SRichard Lowe andn %o2, 3, %o3 ! %o3 is aligned word count 839*5d9d9091SRichard Lowe dec 4, %o3 ! avoid reading beyond tail of src 840*5d9d9091SRichard Lowe sub %o1, %o0, %o1 ! %o1 gets the difference 841*5d9d9091SRichard Lowe 842*5d9d9091SRichard Lowe2: sll %o4, 24, %g5 ! save residual bytes 843*5d9d9091SRichard Lowe lduw [%o1+%o0], %o4 844*5d9d9091SRichard Lowe deccc 4, %o3 845*5d9d9091SRichard Lowe srl %o4, 8, %o5 ! merge with residual 846*5d9d9091SRichard Lowe or %o5, %g5, %g5 847*5d9d9091SRichard Lowe st %g5, [%o0] 848*5d9d9091SRichard Lowe bnz,pt %ncc, 2b 849*5d9d9091SRichard Lowe inc 4, %o0 850*5d9d9091SRichard Lowe sub %o1, 1, %o1 ! used three bytes of last word read 851*5d9d9091SRichard Lowe and %o2, 3, %o2 852*5d9d9091SRichard Lowe b 7f 853*5d9d9091SRichard Lowe inc 4, %o2 854*5d9d9091SRichard Lowe 855*5d9d9091SRichard Lowe.w2cp: lduw [%o1], %o4 856*5d9d9091SRichard Lowe inc 4, %o1 857*5d9d9091SRichard Lowe srl %o4, 16, %o5 858*5d9d9091SRichard Lowe sth %o5, [%o0] 859*5d9d9091SRichard Lowe inc 2, %o0 860*5d9d9091SRichard Lowe dec 2, %o2 861*5d9d9091SRichard Lowe andn %o2, 3, %o3 ! %o3 is aligned word count 862*5d9d9091SRichard Lowe dec 4, %o3 ! avoid reading beyond tail of src 863*5d9d9091SRichard Lowe sub %o1, %o0, %o1 ! %o1 gets the difference 864*5d9d9091SRichard Lowe 865*5d9d9091SRichard Lowe3: sll %o4, 16, %g5 ! 
save residual bytes 866*5d9d9091SRichard Lowe lduw [%o1+%o0], %o4 867*5d9d9091SRichard Lowe deccc 4, %o3 868*5d9d9091SRichard Lowe srl %o4, 16, %o5 ! merge with residual 869*5d9d9091SRichard Lowe or %o5, %g5, %g5 870*5d9d9091SRichard Lowe st %g5, [%o0] 871*5d9d9091SRichard Lowe bnz,pt %ncc, 3b 872*5d9d9091SRichard Lowe inc 4, %o0 873*5d9d9091SRichard Lowe sub %o1, 2, %o1 ! used two bytes of last word read 874*5d9d9091SRichard Lowe and %o2, 3, %o2 875*5d9d9091SRichard Lowe b 7f 876*5d9d9091SRichard Lowe inc 4, %o2 877*5d9d9091SRichard Lowe 878*5d9d9091SRichard Lowe.w4cp: andn %o2, 3, %o3 ! %o3 is aligned word count 879*5d9d9091SRichard Lowe sub %o1, %o0, %o1 ! %o1 gets the difference 880*5d9d9091SRichard Lowe 881*5d9d9091SRichard Lowe1: lduw [%o1+%o0], %o4 ! read from address 882*5d9d9091SRichard Lowe deccc 4, %o3 ! decrement count 883*5d9d9091SRichard Lowe st %o4, [%o0] ! write at destination address 884*5d9d9091SRichard Lowe bgu,pt %ncc, 1b 885*5d9d9091SRichard Lowe inc 4, %o0 ! increment to address 886*5d9d9091SRichard Lowe and %o2, 3, %o2 ! number of leftover bytes, if any 887*5d9d9091SRichard Lowe 888*5d9d9091SRichard Lowe ! simple finish up byte copy, works with any alignment 889*5d9d9091SRichard Lowe7: 890*5d9d9091SRichard Lowe add %o1, %o0, %o1 ! restore %o1 891*5d9d9091SRichard Lowe.smallrest: 892*5d9d9091SRichard Lowe tst %o2 893*5d9d9091SRichard Lowe bz,pt %ncc, .smallx 894*5d9d9091SRichard Lowe cmp %o2, 4 895*5d9d9091SRichard Lowe blt,pt %ncc, .smallleft3 896*5d9d9091SRichard Lowe nop 897*5d9d9091SRichard Lowe sub %o2, 3, %o2 898*5d9d9091SRichard Lowe.smallnotalign4: 899*5d9d9091SRichard Lowe ldub [%o1], %o3 ! read byte 900*5d9d9091SRichard Lowe subcc %o2, 4, %o2 ! reduce count by 4 901*5d9d9091SRichard Lowe stb %o3, [%o0] ! write byte 902*5d9d9091SRichard Lowe ldub [%o1+1], %o3 ! repeat for total of 4 bytes 903*5d9d9091SRichard Lowe add %o1, 4, %o1 ! 
advance SRC by 4 904*5d9d9091SRichard Lowe stb %o3, [%o0+1] 905*5d9d9091SRichard Lowe ldub [%o1-2], %o3 906*5d9d9091SRichard Lowe add %o0, 4, %o0 ! advance DST by 4 907*5d9d9091SRichard Lowe stb %o3, [%o0-2] 908*5d9d9091SRichard Lowe ldub [%o1-1], %o3 909*5d9d9091SRichard Lowe bgu,pt %ncc, .smallnotalign4 ! loop til 3 or fewer bytes remain 910*5d9d9091SRichard Lowe stb %o3, [%o0-1] 911*5d9d9091SRichard Lowe addcc %o2, 3, %o2 ! restore count 912*5d9d9091SRichard Lowe bz,pt %ncc, .smallx 913*5d9d9091SRichard Lowe.smallleft3: ! 1, 2, or 3 bytes remain 914*5d9d9091SRichard Lowe subcc %o2, 1, %o2 915*5d9d9091SRichard Lowe ldub [%o1], %o3 ! load one byte 916*5d9d9091SRichard Lowe bz,pt %ncc, .smallx 917*5d9d9091SRichard Lowe stb %o3, [%o0] ! store one byte 918*5d9d9091SRichard Lowe ldub [%o1+1], %o3 ! load second byte 919*5d9d9091SRichard Lowe subcc %o2, 1, %o2 920*5d9d9091SRichard Lowe bz,pt %ncc, .smallx 921*5d9d9091SRichard Lowe stb %o3, [%o0+1] ! store second byte 922*5d9d9091SRichard Lowe ldub [%o1+2], %o3 ! load third byte 923*5d9d9091SRichard Lowe stb %o3, [%o0+2] ! store third byte 924*5d9d9091SRichard Lowe.smallx: 925*5d9d9091SRichard Lowe retl 926*5d9d9091SRichard Lowe mov %g1, %o0 ! restore %o0 927*5d9d9091SRichard Lowe 928*5d9d9091SRichard Lowe.smallfin: 929*5d9d9091SRichard Lowe tst %o2 930*5d9d9091SRichard Lowe bnz,pt %ncc, .smallleft3 931*5d9d9091SRichard Lowe nop 932*5d9d9091SRichard Lowe retl 933*5d9d9091SRichard Lowe mov %g1, %o0 ! restore %o0 934*5d9d9091SRichard Lowe 935*5d9d9091SRichard Lowe .align 16 936*5d9d9091SRichard Lowe.smallwords: 937*5d9d9091SRichard Lowe lduw [%o1], %o3 ! read word 938*5d9d9091SRichard Lowe.smallwordx: 939*5d9d9091SRichard Lowe subcc %o2, 8, %o2 ! update count 940*5d9d9091SRichard Lowe stw %o3, [%o0] ! write word 941*5d9d9091SRichard Lowe add %o1, 8, %o1 ! update SRC 942*5d9d9091SRichard Lowe lduw [%o1-4], %o3 ! read word 943*5d9d9091SRichard Lowe add %o0, 8, %o0 ! 
update DST 944*5d9d9091SRichard Lowe	bgu,pt	%ncc, .smallwords	! loop until done 945*5d9d9091SRichard Lowe	  stw	%o3, [%o0-4]		! write word 946*5d9d9091SRichard Lowe	addcc	%o2, 7, %o2		! restore count 947*5d9d9091SRichard Lowe	bz,pt	%ncc, .smallexit	! check for completion 948*5d9d9091SRichard Lowe	cmp	%o2, 4			! check for 4 or more bytes left 949*5d9d9091SRichard Lowe	blt	%ncc, .smallleft3	! if not, go to finish up 950*5d9d9091SRichard Lowe	nop 951*5d9d9091SRichard Lowe	lduw	[%o1], %o3 952*5d9d9091SRichard Lowe	add	%o1, 4, %o1 953*5d9d9091SRichard Lowe	subcc	%o2, 4, %o2 954*5d9d9091SRichard Lowe	add	%o0, 4, %o0 955*5d9d9091SRichard Lowe	bnz,pt	%ncc, .smallleft3 956*5d9d9091SRichard Lowe	  stw	%o3, [%o0-4] 957*5d9d9091SRichard Lowe	retl 958*5d9d9091SRichard Lowe	mov	%g1, %o0		! restore %o0 959*5d9d9091SRichard Lowe 960*5d9d9091SRichard Lowe! 8 or more bytes, src and dest start on word boundary 961*5d9d9091SRichard Lowe! %o4 contains or %o0, %o1; %o3 contains first four bytes of src 962*5d9d9091SRichard Lowe.smalllong: 963*5d9d9091SRichard Lowe	andcc	%o4, 0x7, %o5		! test for long alignment 964*5d9d9091SRichard Lowe	bnz,pt	%ncc, .smallwordx	! branch to word aligned case 965*5d9d9091SRichard Lowe	cmp	%o2, SHORT_LONG-7 966*5d9d9091SRichard Lowe	bge,a	%ncc, .medl64		! if we branch 967*5d9d9091SRichard Lowe	sub	%o2,56,%o2		! adjust %o2 to -63 off count 968*5d9d9091SRichard Lowe	sub	%o1, %o0, %o1		! %o1 gets the difference 969*5d9d9091SRichard Lowe.small_long_l: 970*5d9d9091SRichard Lowe	ldx	[%o1+%o0], %o3 971*5d9d9091SRichard Lowe	subcc	%o2, 8, %o2 972*5d9d9091SRichard Lowe	add	%o0, 8, %o0 973*5d9d9091SRichard Lowe	bgu,pt	%ncc, .small_long_l	! loop until done 974*5d9d9091SRichard Lowe	  stx	%o3, [%o0-8]		! write word 975*5d9d9091SRichard Lowe	add	%o1, %o0, %o1		! restore %o1 976*5d9d9091SRichard Lowe	addcc	%o2, 7, %o2		! restore %o2 to correct count 977*5d9d9091SRichard Lowe	bz,pt	%ncc, .smallexit	! check for completion 978*5d9d9091SRichard Lowe	cmp	%o2, 4			! 
check for 4 or more bytes left 979*5d9d9091SRichard Lowe blt,pt %ncc, .smallleft3 ! if not, go to finish up 980*5d9d9091SRichard Lowe nop 981*5d9d9091SRichard Lowe lduw [%o1], %o3 982*5d9d9091SRichard Lowe add %o1, 4, %o1 983*5d9d9091SRichard Lowe subcc %o2, 4, %o2 984*5d9d9091SRichard Lowe stw %o3, [%o0] 985*5d9d9091SRichard Lowe add %o0, 4, %o0 986*5d9d9091SRichard Lowe bnz,pt %ncc, .smallleft3 987*5d9d9091SRichard Lowe nop 988*5d9d9091SRichard Lowe retl 989*5d9d9091SRichard Lowe mov %g1, %o0 ! restore %o0 990*5d9d9091SRichard Lowe 991*5d9d9091SRichard Lowe .align 16 992*5d9d9091SRichard Lowe! src and dest start on word boundary 993*5d9d9091SRichard Lowe.smallword: 994*5d9d9091SRichard Lowe subcc %o2, 7, %o2 ! adjust count 995*5d9d9091SRichard Lowe bgu,pt %ncc, .smalllong 996*5d9d9091SRichard Lowe lduw [%o1], %o3 ! read word 997*5d9d9091SRichard Lowe addcc %o2, 3, %o2 ! restore count 998*5d9d9091SRichard Lowe bz,pt %ncc, .smallexit 999*5d9d9091SRichard Lowe stw %o3, [%o0] ! write word 1000*5d9d9091SRichard Lowe deccc %o2 ! reduce count for cc test 1001*5d9d9091SRichard Lowe ldub [%o1+4], %o3 ! load one byte 1002*5d9d9091SRichard Lowe bz,pt %ncc, .smallexit 1003*5d9d9091SRichard Lowe stb %o3, [%o0+4] ! store one byte 1004*5d9d9091SRichard Lowe ldub [%o1+5], %o3 ! load second byte 1005*5d9d9091SRichard Lowe deccc %o2 1006*5d9d9091SRichard Lowe bz,pt %ncc, .smallexit 1007*5d9d9091SRichard Lowe stb %o3, [%o0+5] ! store second byte 1008*5d9d9091SRichard Lowe ldub [%o1+6], %o3 ! load third byte 1009*5d9d9091SRichard Lowe stb %o3, [%o0+6] ! store third byte 1010*5d9d9091SRichard Lowe.smallexit: 1011*5d9d9091SRichard Lowe retl 1012*5d9d9091SRichard Lowe mov %g1, %o0 ! restore %o0 1013*5d9d9091SRichard Lowe 1014*5d9d9091SRichard Lowe .align 16 1015*5d9d9091SRichard Lowe.medium: 1016*5d9d9091SRichard Lowe neg %o0, %o5 1017*5d9d9091SRichard Lowe andcc %o5, 7, %o5 ! 
bytes till DST 8 byte aligned 1018*5d9d9091SRichard Lowe brz,pt %o5, .dst_aligned_on_8 1019*5d9d9091SRichard Lowe 1020*5d9d9091SRichard Lowe ! %o5 has the bytes to be written in partial store. 1021*5d9d9091SRichard Lowe sub %o2, %o5, %o2 1022*5d9d9091SRichard Lowe sub %o1, %o0, %o1 ! %o1 gets the difference 1023*5d9d9091SRichard Lowe7: ! dst aligning loop 1024*5d9d9091SRichard Lowe ldub [%o1+%o0], %o4 ! load one byte 1025*5d9d9091SRichard Lowe subcc %o5, 1, %o5 1026*5d9d9091SRichard Lowe stb %o4, [%o0] 1027*5d9d9091SRichard Lowe bgu,pt %ncc, 7b 1028*5d9d9091SRichard Lowe add %o0, 1, %o0 ! advance dst 1029*5d9d9091SRichard Lowe add %o1, %o0, %o1 ! restore %o1 1030*5d9d9091SRichard Lowe.dst_aligned_on_8: 1031*5d9d9091SRichard Lowe andcc %o1, 7, %o5 1032*5d9d9091SRichard Lowe brnz,pt %o5, .src_dst_unaligned_on_8 1033*5d9d9091SRichard Lowe prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read 1034*5d9d9091SRichard Lowe 1035*5d9d9091SRichard Lowe.src_dst_aligned_on_8: 1036*5d9d9091SRichard Lowe ! check if we are copying MED_MAX or more bytes 1037*5d9d9091SRichard Lowe cmp %o2, MED_MAX ! limit to store buffer size 1038*5d9d9091SRichard Lowe bgu,pt %ncc, .large_align8_copy 1039*5d9d9091SRichard Lowe prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read 1040*5d9d9091SRichard Lowe/* 1041*5d9d9091SRichard Lowe * Special case for handling when src and dest are both long word aligned 1042*5d9d9091SRichard Lowe * and total data to move is less than MED_MAX bytes 1043*5d9d9091SRichard Lowe */ 1044*5d9d9091SRichard Lowe.medlong: 1045*5d9d9091SRichard Lowe subcc %o2, 63, %o2 ! adjust length to allow cc test 1046*5d9d9091SRichard Lowe ble,pt %ncc, .medl63 ! skip big loop if less than 64 bytes 1047*5d9d9091SRichard Lowe.medl64: 1048*5d9d9091SRichard Lowe prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read ! into the l2 cache 1049*5d9d9091SRichard Lowe ldx [%o1], %o4 ! load 1050*5d9d9091SRichard Lowe subcc %o2, 64, %o2 ! decrement length count 1051*5d9d9091SRichard Lowe stx %o4, [%o0] ! 
and store 1052*5d9d9091SRichard Lowe ldx [%o1+8], %o3 ! a block of 64 bytes 1053*5d9d9091SRichard Lowe stx %o3, [%o0+8] 1054*5d9d9091SRichard Lowe ldx [%o1+16], %o4 1055*5d9d9091SRichard Lowe stx %o4, [%o0+16] 1056*5d9d9091SRichard Lowe ldx [%o1+24], %o3 1057*5d9d9091SRichard Lowe stx %o3, [%o0+24] 1058*5d9d9091SRichard Lowe ldx [%o1+32], %o4 ! load 1059*5d9d9091SRichard Lowe stx %o4, [%o0+32] ! and store 1060*5d9d9091SRichard Lowe ldx [%o1+40], %o3 ! a block of 64 bytes 1061*5d9d9091SRichard Lowe add %o1, 64, %o1 ! increase src ptr by 64 1062*5d9d9091SRichard Lowe stx %o3, [%o0+40] 1063*5d9d9091SRichard Lowe ldx [%o1-16], %o4 1064*5d9d9091SRichard Lowe add %o0, 64, %o0 ! increase dst ptr by 64 1065*5d9d9091SRichard Lowe stx %o4, [%o0-16] 1066*5d9d9091SRichard Lowe ldx [%o1-8], %o3 1067*5d9d9091SRichard Lowe bgu,pt %ncc, .medl64 ! repeat if at least 64 bytes left 1068*5d9d9091SRichard Lowe stx %o3, [%o0-8] 1069*5d9d9091SRichard Lowe.medl63: 1070*5d9d9091SRichard Lowe addcc %o2, 32, %o2 ! adjust remaining count 1071*5d9d9091SRichard Lowe ble,pt %ncc, .medl31 ! to skip if 31 or fewer bytes left 1072*5d9d9091SRichard Lowe nop 1073*5d9d9091SRichard Lowe ldx [%o1], %o4 ! load 1074*5d9d9091SRichard Lowe sub %o2, 32, %o2 ! decrement length count 1075*5d9d9091SRichard Lowe stx %o4, [%o0] ! and store 1076*5d9d9091SRichard Lowe ldx [%o1+8], %o3 ! a block of 32 bytes 1077*5d9d9091SRichard Lowe add %o1, 32, %o1 ! increase src ptr by 32 1078*5d9d9091SRichard Lowe stx %o3, [%o0+8] 1079*5d9d9091SRichard Lowe ldx [%o1-16], %o4 1080*5d9d9091SRichard Lowe add %o0, 32, %o0 ! increase dst ptr by 32 1081*5d9d9091SRichard Lowe stx %o4, [%o0-16] 1082*5d9d9091SRichard Lowe ldx [%o1-8], %o3 1083*5d9d9091SRichard Lowe stx %o3, [%o0-8] 1084*5d9d9091SRichard Lowe.medl31: 1085*5d9d9091SRichard Lowe addcc %o2, 16, %o2 ! adjust remaining count 1086*5d9d9091SRichard Lowe ble,pt %ncc, .medl15 ! skip if 15 or fewer bytes left 1087*5d9d9091SRichard Lowe nop ! 
1088*5d9d9091SRichard Lowe ldx [%o1], %o4 ! load and store 16 bytes 1089*5d9d9091SRichard Lowe add %o1, 16, %o1 ! increase src ptr by 16 1090*5d9d9091SRichard Lowe stx %o4, [%o0] ! 1091*5d9d9091SRichard Lowe sub %o2, 16, %o2 ! decrease count by 16 1092*5d9d9091SRichard Lowe ldx [%o1-8], %o3 ! 1093*5d9d9091SRichard Lowe add %o0, 16, %o0 ! increase dst ptr by 16 1094*5d9d9091SRichard Lowe stx %o3, [%o0-8] 1095*5d9d9091SRichard Lowe.medl15: 1096*5d9d9091SRichard Lowe addcc %o2, 15, %o2 ! restore count 1097*5d9d9091SRichard Lowe bz,pt %ncc, .smallexit ! exit if finished 1098*5d9d9091SRichard Lowe cmp %o2, 8 1099*5d9d9091SRichard Lowe blt,pt %ncc, .medw7 ! skip if 7 or fewer bytes left 1100*5d9d9091SRichard Lowe tst %o2 1101*5d9d9091SRichard Lowe ldx [%o1], %o4 ! load 8 bytes 1102*5d9d9091SRichard Lowe add %o1, 8, %o1 ! increase src ptr by 8 1103*5d9d9091SRichard Lowe add %o0, 8, %o0 ! increase dst ptr by 8 1104*5d9d9091SRichard Lowe subcc %o2, 8, %o2 ! decrease count by 8 1105*5d9d9091SRichard Lowe bnz,pt %ncc, .medw7 1106*5d9d9091SRichard Lowe stx %o4, [%o0-8] ! and store 8 bytes 1107*5d9d9091SRichard Lowe retl 1108*5d9d9091SRichard Lowe mov %g1, %o0 ! restore %o0 1109*5d9d9091SRichard Lowe 1110*5d9d9091SRichard Lowe .align 16 1111*5d9d9091SRichard Lowe.src_dst_unaligned_on_8: 1112*5d9d9091SRichard Lowe ! DST is 8-byte aligned, src is not 1113*5d9d9091SRichard Lowe2: 1114*5d9d9091SRichard Lowe andcc %o1, 0x3, %o5 ! test word alignment 1115*5d9d9091SRichard Lowe bnz,pt %ncc, .unalignsetup ! branch to skip if not word aligned 1116*5d9d9091SRichard Lowe prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read 1117*5d9d9091SRichard Lowe 1118*5d9d9091SRichard Lowe/* 1119*5d9d9091SRichard Lowe * Handle all cases where src and dest are aligned on word 1120*5d9d9091SRichard Lowe * boundaries. Use unrolled loops for better performance. 
 1121*5d9d9091SRichard Lowe * This option wins over standard large data move when 1122*5d9d9091SRichard Lowe * source and destination is in cache for medium 1123*5d9d9091SRichard Lowe * to short data moves. 1124*5d9d9091SRichard Lowe */ 1125*5d9d9091SRichard Lowe	cmp	%o2, MED_WMAX		! limit to store buffer size 1126*5d9d9091SRichard Lowe	bge,pt	%ncc, .unalignrejoin	! otherwise rejoin main loop 1127*5d9d9091SRichard Lowe	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read 1128*5d9d9091SRichard Lowe 1129*5d9d9091SRichard Lowe	subcc	%o2, 31, %o2		! adjust length to allow cc test 1130*5d9d9091SRichard Lowe					! for end of loop 1131*5d9d9091SRichard Lowe	ble,pt	%ncc, .medw31		! skip big loop if less than 32 1132*5d9d9091SRichard Lowe	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1133*5d9d9091SRichard Lowe.medw32: 1134*5d9d9091SRichard Lowe	ld	[%o1], %o4		! move a block of 32 bytes 1135*5d9d9091SRichard Lowe	stw	%o4, [%o0] 1136*5d9d9091SRichard Lowe	ld	[%o1+4], %o3 1137*5d9d9091SRichard Lowe	stw	%o3, [%o0+4] 1138*5d9d9091SRichard Lowe	ld	[%o1+8], %o4 1139*5d9d9091SRichard Lowe	stw	%o4, [%o0+8] 1140*5d9d9091SRichard Lowe	ld	[%o1+12], %o3 1141*5d9d9091SRichard Lowe	stw	%o3, [%o0+12] 1142*5d9d9091SRichard Lowe	ld	[%o1+16], %o4 1143*5d9d9091SRichard Lowe	subcc	%o2, 32, %o2		! decrement length count 1144*5d9d9091SRichard Lowe	stw	%o4, [%o0+16] 1145*5d9d9091SRichard Lowe	ld	[%o1+20], %o3 1146*5d9d9091SRichard Lowe	add	%o1, 32, %o1		! increase src ptr by 32 1147*5d9d9091SRichard Lowe	stw	%o3, [%o0+20] 1148*5d9d9091SRichard Lowe	ld	[%o1-8], %o4 1149*5d9d9091SRichard Lowe	add	%o0, 32, %o0		! increase dst ptr by 32 1150*5d9d9091SRichard Lowe	stw	%o4, [%o0-8] 1151*5d9d9091SRichard Lowe	ld	[%o1-4], %o3 1152*5d9d9091SRichard Lowe	bgu,pt	%ncc, .medw32		! repeat if at least 32 bytes left 1153*5d9d9091SRichard Lowe	stw	%o3, [%o0-4] 1154*5d9d9091SRichard Lowe.medw31: 1155*5d9d9091SRichard Lowe	addcc	%o2, 31, %o2		! restore count 1156*5d9d9091SRichard Lowe 1157*5d9d9091SRichard Lowe	bz,pt	%ncc, .smallexit	! 
exit if finished 1158*5d9d9091SRichard Lowe nop 1159*5d9d9091SRichard Lowe cmp %o2, 16 1160*5d9d9091SRichard Lowe blt,pt %ncc, .medw15 1161*5d9d9091SRichard Lowe nop 1162*5d9d9091SRichard Lowe ld [%o1], %o4 ! move a block of 16 bytes 1163*5d9d9091SRichard Lowe subcc %o2, 16, %o2 ! decrement length count 1164*5d9d9091SRichard Lowe stw %o4, [%o0] 1165*5d9d9091SRichard Lowe ld [%o1+4], %o3 1166*5d9d9091SRichard Lowe add %o1, 16, %o1 ! increase src ptr by 16 1167*5d9d9091SRichard Lowe stw %o3, [%o0+4] 1168*5d9d9091SRichard Lowe ld [%o1-8], %o4 1169*5d9d9091SRichard Lowe add %o0, 16, %o0 ! increase dst ptr by 16 1170*5d9d9091SRichard Lowe stw %o4, [%o0-8] 1171*5d9d9091SRichard Lowe ld [%o1-4], %o3 1172*5d9d9091SRichard Lowe stw %o3, [%o0-4] 1173*5d9d9091SRichard Lowe.medw15: 1174*5d9d9091SRichard Lowe bz,pt %ncc, .smallexit ! exit if finished 1175*5d9d9091SRichard Lowe cmp %o2, 8 1176*5d9d9091SRichard Lowe blt,pt %ncc, .medw7 ! skip if 7 or fewer bytes left 1177*5d9d9091SRichard Lowe tst %o2 1178*5d9d9091SRichard Lowe ld [%o1], %o4 ! load 4 bytes 1179*5d9d9091SRichard Lowe subcc %o2, 8, %o2 ! decrease count by 8 1180*5d9d9091SRichard Lowe stw %o4, [%o0] ! and store 4 bytes 1181*5d9d9091SRichard Lowe add %o1, 8, %o1 ! increase src ptr by 8 1182*5d9d9091SRichard Lowe ld [%o1-4], %o3 ! load 4 bytes 1183*5d9d9091SRichard Lowe add %o0, 8, %o0 ! increase dst ptr by 8 1184*5d9d9091SRichard Lowe stw %o3, [%o0-4] ! and store 4 bytes 1185*5d9d9091SRichard Lowe bz,pt %ncc, .smallexit ! exit if finished 1186*5d9d9091SRichard Lowe.medw7: ! count is ge 1, less than 8 1187*5d9d9091SRichard Lowe cmp %o2, 4 ! check for 4 bytes left 1188*5d9d9091SRichard Lowe blt,pt %ncc, .smallleft3 ! skip if 3 or fewer bytes left 1189*5d9d9091SRichard Lowe nop ! 1190*5d9d9091SRichard Lowe ld [%o1], %o4 ! load 4 bytes 1191*5d9d9091SRichard Lowe add %o1, 4, %o1 ! increase src ptr by 4 1192*5d9d9091SRichard Lowe add %o0, 4, %o0 ! increase dst ptr by 4 1193*5d9d9091SRichard Lowe subcc %o2, 4, %o2 ! 
decrease count by 4 1194*5d9d9091SRichard Lowe bnz .smallleft3 1195*5d9d9091SRichard Lowe stw %o4, [%o0-4] ! and store 4 bytes 1196*5d9d9091SRichard Lowe retl 1197*5d9d9091SRichard Lowe mov %g1, %o0 ! restore %o0 1198*5d9d9091SRichard Lowe 1199*5d9d9091SRichard Lowe .align 16 1200*5d9d9091SRichard Lowe.large_align8_copy: ! Src and dst share 8 byte alignment 1201*5d9d9091SRichard Lowe rd %fprs, %g5 ! check for unused fp 1202*5d9d9091SRichard Lowe ! if fprs.fef == 0, set it. 1203*5d9d9091SRichard Lowe ! Setting it when already set costs more than checking 1204*5d9d9091SRichard Lowe andcc %g5, FPRS_FEF, %g5 ! test FEF, fprs.du = fprs.dl = 0 1205*5d9d9091SRichard Lowe bz,a %ncc, 1f 1206*5d9d9091SRichard Lowe wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1 1207*5d9d9091SRichard Lowe1: 1208*5d9d9091SRichard Lowe ! align dst to 64 byte boundary 1209*5d9d9091SRichard Lowe andcc %o0, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned 1210*5d9d9091SRichard Lowe brz,pn %o3, .aligned_to_64 1211*5d9d9091SRichard Lowe andcc %o0, 8, %o3 ! odd long words to move? 1212*5d9d9091SRichard Lowe brz,pt %o3, .aligned_to_16 1213*5d9d9091SRichard Lowe nop 1214*5d9d9091SRichard Lowe ldx [%o1], %o4 1215*5d9d9091SRichard Lowe sub %o2, 8, %o2 1216*5d9d9091SRichard Lowe add %o1, 8, %o1 ! increment src ptr 1217*5d9d9091SRichard Lowe add %o0, 8, %o0 ! increment dst ptr 1218*5d9d9091SRichard Lowe stx %o4, [%o0-8] 1219*5d9d9091SRichard Lowe.aligned_to_16: 1220*5d9d9091SRichard Lowe andcc %o0, 16, %o3 ! pair of long words to move? 1221*5d9d9091SRichard Lowe brz,pt %o3, .aligned_to_32 1222*5d9d9091SRichard Lowe nop 1223*5d9d9091SRichard Lowe ldx [%o1], %o4 1224*5d9d9091SRichard Lowe sub %o2, 16, %o2 1225*5d9d9091SRichard Lowe stx %o4, [%o0] 1226*5d9d9091SRichard Lowe add %o1, 16, %o1 ! increment src ptr 1227*5d9d9091SRichard Lowe ldx [%o1-8], %o4 1228*5d9d9091SRichard Lowe add %o0, 16, %o0 ! 
increment dst ptr 1229*5d9d9091SRichard Lowe stx %o4, [%o0-8] 1230*5d9d9091SRichard Lowe.aligned_to_32: 1231*5d9d9091SRichard Lowe andcc %o0, 32, %o3 ! four long words to move? 1232*5d9d9091SRichard Lowe brz,pt %o3, .aligned_to_64 1233*5d9d9091SRichard Lowe nop 1234*5d9d9091SRichard Lowe ldx [%o1], %o4 1235*5d9d9091SRichard Lowe sub %o2, 32, %o2 1236*5d9d9091SRichard Lowe stx %o4, [%o0] 1237*5d9d9091SRichard Lowe ldx [%o1+8], %o4 1238*5d9d9091SRichard Lowe stx %o4, [%o0+8] 1239*5d9d9091SRichard Lowe ldx [%o1+16], %o4 1240*5d9d9091SRichard Lowe stx %o4, [%o0+16] 1241*5d9d9091SRichard Lowe add %o1, 32, %o1 ! increment src ptr 1242*5d9d9091SRichard Lowe ldx [%o1-8], %o4 1243*5d9d9091SRichard Lowe add %o0, 32, %o0 ! increment dst ptr 1244*5d9d9091SRichard Lowe stx %o4, [%o0-8] 1245*5d9d9091SRichard Lowe.aligned_to_64: 1246*5d9d9091SRichard Lowe prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read 1247*5d9d9091SRichard Lowe mov %asi,%o4 ! save %asi 1248*5d9d9091SRichard Lowe ! Determine source alignment 1249*5d9d9091SRichard Lowe ! to correct 8 byte offset 1250*5d9d9091SRichard Lowe andcc %o1, 0x20, %o3 1251*5d9d9091SRichard Lowe brnz,pn %o3, .align_1 1252*5d9d9091SRichard Lowe mov ASI_BLK_P, %asi ! 
setup %asi for block load/store 1253*5d9d9091SRichard Lowe andcc %o1, 0x10, %o3 1254*5d9d9091SRichard Lowe brnz,pn %o3, .align_01 1255*5d9d9091SRichard Lowe nop 1256*5d9d9091SRichard Lowe andcc %o1, 0x08, %o3 1257*5d9d9091SRichard Lowe brz,pn %o3, .align_000 1258*5d9d9091SRichard Lowe prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1259*5d9d9091SRichard Lowe ba .align_001 1260*5d9d9091SRichard Lowe nop 1261*5d9d9091SRichard Lowe.align_01: 1262*5d9d9091SRichard Lowe andcc %o1, 0x08, %o3 1263*5d9d9091SRichard Lowe brnz,pn %o3, .align_011 1264*5d9d9091SRichard Lowe prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1265*5d9d9091SRichard Lowe ba .align_010 1266*5d9d9091SRichard Lowe nop 1267*5d9d9091SRichard Lowe.align_1: 1268*5d9d9091SRichard Lowe andcc %o1, 0x10, %o3 1269*5d9d9091SRichard Lowe brnz,pn %o3, .align_11 1270*5d9d9091SRichard Lowe nop 1271*5d9d9091SRichard Lowe andcc %o1, 0x08, %o3 1272*5d9d9091SRichard Lowe brnz,pn %o3, .align_101 1273*5d9d9091SRichard Lowe prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1274*5d9d9091SRichard Lowe ba .align_100 1275*5d9d9091SRichard Lowe nop 1276*5d9d9091SRichard Lowe.align_11: 1277*5d9d9091SRichard Lowe andcc %o1, 0x08, %o3 1278*5d9d9091SRichard Lowe brz,pn %o3, .align_110 1279*5d9d9091SRichard Lowe prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1280*5d9d9091SRichard Lowe 1281*5d9d9091SRichard Lowe.align_111: 1282*5d9d9091SRichard Lowe! Alignment off by 8 bytes 1283*5d9d9091SRichard Lowe ldd [%o1], %d0 1284*5d9d9091SRichard Lowe add %o1, 8, %o1 1285*5d9d9091SRichard Lowe sub %o2, 8, %o2 1286*5d9d9091SRichard Lowe andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 1287*5d9d9091SRichard Lowe and %o2, 0x7f, %o2 ! residue bytes in %o2 1288*5d9d9091SRichard Lowe.align_111_loop: 1289*5d9d9091SRichard Lowe subcc %o5, 128, %o5 1290*5d9d9091SRichard Lowe /* ---- copy line 1 of 2. ---- */ 1291*5d9d9091SRichard Lowe ldda [%o1]%asi,%d16 ! 
block load 1292*5d9d9091SRichard Lowe fmovd %d16, %d2 1293*5d9d9091SRichard Lowe fmovd %d18, %d4 1294*5d9d9091SRichard Lowe fmovd %d20, %d6 1295*5d9d9091SRichard Lowe fmovd %d22, %d8 1296*5d9d9091SRichard Lowe fmovd %d24, %d10 1297*5d9d9091SRichard Lowe fmovd %d26, %d12 1298*5d9d9091SRichard Lowe fmovd %d28, %d14 1299*5d9d9091SRichard Lowe stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1300*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 1301*5d9d9091SRichard Lowe add %o0, 64, %o0 ! advance dst 1302*5d9d9091SRichard Lowe prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 1303*5d9d9091SRichard Lowe fmovd %d30, %d0 1304*5d9d9091SRichard Lowe 1305*5d9d9091SRichard Lowe /* ---- copy line 2 of 2. ---- */ 1306*5d9d9091SRichard Lowe ldda [%o1+64]%asi,%d16 1307*5d9d9091SRichard Lowe fmovd %d16, %d2 1308*5d9d9091SRichard Lowe fmovd %d18, %d4 1309*5d9d9091SRichard Lowe fmovd %d20, %d6 1310*5d9d9091SRichard Lowe fmovd %d22, %d8 1311*5d9d9091SRichard Lowe fmovd %d24, %d10 1312*5d9d9091SRichard Lowe fmovd %d26, %d12 1313*5d9d9091SRichard Lowe fmovd %d28, %d14 1314*5d9d9091SRichard Lowe add %o1, 128, %o1 ! increment src 1315*5d9d9091SRichard Lowe stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1316*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 1317*5d9d9091SRichard Lowe add %o0, 64, %o0 ! advance dst 1318*5d9d9091SRichard Lowe fmovd %d30, %d0 1319*5d9d9091SRichard Lowe bgt,pt %ncc, .align_111_loop 1320*5d9d9091SRichard Lowe prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1321*5d9d9091SRichard Lowe 1322*5d9d9091SRichard Lowe std %d0, [%o0] 1323*5d9d9091SRichard Lowe ba .remain_stuff 1324*5d9d9091SRichard Lowe add %o0, 8, %o0 1325*5d9d9091SRichard Lowe ! END OF align_111 1326*5d9d9091SRichard Lowe 1327*5d9d9091SRichard Lowe.align_110: 1328*5d9d9091SRichard Lowe! 
Alignment off by 16 bytes 1329*5d9d9091SRichard Lowe ldd [%o1], %d0 1330*5d9d9091SRichard Lowe ldd [%o1+8], %d2 1331*5d9d9091SRichard Lowe add %o1, 16, %o1 1332*5d9d9091SRichard Lowe sub %o2, 16, %o2 1333*5d9d9091SRichard Lowe andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 1334*5d9d9091SRichard Lowe and %o2, 0x7f, %o2 ! residue bytes in %o2 1335*5d9d9091SRichard Lowe.align_110_loop: 1336*5d9d9091SRichard Lowe subcc %o5, 128, %o5 1337*5d9d9091SRichard Lowe /* ---- copy line 1 of 2. ---- */ 1338*5d9d9091SRichard Lowe 1339*5d9d9091SRichard Lowe ldda [%o1]%asi,%d16 ! block load 1340*5d9d9091SRichard Lowe fmovd %d16, %d4 1341*5d9d9091SRichard Lowe fmovd %d18, %d6 1342*5d9d9091SRichard Lowe fmovd %d20, %d8 1343*5d9d9091SRichard Lowe fmovd %d22, %d10 1344*5d9d9091SRichard Lowe fmovd %d24, %d12 1345*5d9d9091SRichard Lowe fmovd %d26, %d14 1346*5d9d9091SRichard Lowe stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1347*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 1348*5d9d9091SRichard Lowe add %o0, 64, %o0 ! advance dst 1349*5d9d9091SRichard Lowe fmovd %d28, %d0 1350*5d9d9091SRichard Lowe prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 1351*5d9d9091SRichard Lowe fmovd %d30, %d2 1352*5d9d9091SRichard Lowe 1353*5d9d9091SRichard Lowe /* ---- copy line 2 of 2. ---- */ 1354*5d9d9091SRichard Lowe ldda [%o1+64]%asi,%d16 1355*5d9d9091SRichard Lowe fmovd %d16, %d4 1356*5d9d9091SRichard Lowe fmovd %d18, %d6 1357*5d9d9091SRichard Lowe fmovd %d20, %d8 1358*5d9d9091SRichard Lowe fmovd %d22, %d10 1359*5d9d9091SRichard Lowe fmovd %d24, %d12 1360*5d9d9091SRichard Lowe fmovd %d26, %d14 1361*5d9d9091SRichard Lowe add %o1, 128, %o1 ! increment src 1362*5d9d9091SRichard Lowe stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1363*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 1364*5d9d9091SRichard Lowe add %o0, 64, %o0 ! 
advance dst 1365*5d9d9091SRichard Lowe fmovd %d28, %d0 1366*5d9d9091SRichard Lowe fmovd %d30, %d2 1367*5d9d9091SRichard Lowe bgt,pt %ncc, .align_110_loop 1368*5d9d9091SRichard Lowe prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1369*5d9d9091SRichard Lowe 1370*5d9d9091SRichard Lowe std %d0, [%o0] 1371*5d9d9091SRichard Lowe std %d2, [%o0+8] 1372*5d9d9091SRichard Lowe ba .remain_stuff 1373*5d9d9091SRichard Lowe add %o0, 16, %o0 1374*5d9d9091SRichard Lowe ! END OF align_110 1375*5d9d9091SRichard Lowe 1376*5d9d9091SRichard Lowe.align_101: 1377*5d9d9091SRichard Lowe! Alignment off by 24 bytes 1378*5d9d9091SRichard Lowe ldd [%o1], %d0 1379*5d9d9091SRichard Lowe ldd [%o1+8], %d2 1380*5d9d9091SRichard Lowe ldd [%o1+16], %d4 1381*5d9d9091SRichard Lowe add %o1, 24, %o1 1382*5d9d9091SRichard Lowe sub %o2, 24, %o2 1383*5d9d9091SRichard Lowe andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 1384*5d9d9091SRichard Lowe and %o2, 0x7f, %o2 ! residue bytes in %o2 1385*5d9d9091SRichard Lowe.align_101_loop: 1386*5d9d9091SRichard Lowe subcc %o5, 128, %o5 1387*5d9d9091SRichard Lowe /* ---- copy line 1 of 2. ---- */ 1388*5d9d9091SRichard Lowe 1389*5d9d9091SRichard Lowe ldda [%o1]%asi,%d16 ! block load 1390*5d9d9091SRichard Lowe fmovd %d16, %d6 1391*5d9d9091SRichard Lowe fmovd %d18, %d8 1392*5d9d9091SRichard Lowe fmovd %d20, %d10 1393*5d9d9091SRichard Lowe fmovd %d22, %d12 1394*5d9d9091SRichard Lowe fmovd %d24, %d14 1395*5d9d9091SRichard Lowe stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1396*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 1397*5d9d9091SRichard Lowe add %o0, 64, %o0 ! advance dst 1398*5d9d9091SRichard Lowe fmovd %d26, %d0 1399*5d9d9091SRichard Lowe fmovd %d28, %d2 1400*5d9d9091SRichard Lowe prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 1401*5d9d9091SRichard Lowe fmovd %d30, %d4 1402*5d9d9091SRichard Lowe 1403*5d9d9091SRichard Lowe /* ---- copy line 2 of 2. 
---- */ 1404*5d9d9091SRichard Lowe ldda [%o1+64]%asi,%d16 1405*5d9d9091SRichard Lowe fmovd %d16, %d6 1406*5d9d9091SRichard Lowe fmovd %d18, %d8 1407*5d9d9091SRichard Lowe fmovd %d20, %d10 1408*5d9d9091SRichard Lowe fmovd %d22, %d12 1409*5d9d9091SRichard Lowe fmovd %d24, %d14 1410*5d9d9091SRichard Lowe add %o1, 128, %o1 ! increment src 1411*5d9d9091SRichard Lowe stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1412*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 1413*5d9d9091SRichard Lowe add %o0, 64, %o0 ! advance dst 1414*5d9d9091SRichard Lowe fmovd %d26, %d0 1415*5d9d9091SRichard Lowe fmovd %d28, %d2 1416*5d9d9091SRichard Lowe fmovd %d30, %d4 1417*5d9d9091SRichard Lowe bgt,pt %ncc, .align_101_loop 1418*5d9d9091SRichard Lowe prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1419*5d9d9091SRichard Lowe 1420*5d9d9091SRichard Lowe std %d0, [%o0] 1421*5d9d9091SRichard Lowe std %d2, [%o0+8] 1422*5d9d9091SRichard Lowe std %d4, [%o0+16] 1423*5d9d9091SRichard Lowe ba .remain_stuff 1424*5d9d9091SRichard Lowe add %o0, 24, %o0 1425*5d9d9091SRichard Lowe ! END OF align_101 1426*5d9d9091SRichard Lowe 1427*5d9d9091SRichard Lowe.align_100: 1428*5d9d9091SRichard Lowe! Alignment off by 32 bytes 1429*5d9d9091SRichard Lowe ldd [%o1], %d0 1430*5d9d9091SRichard Lowe ldd [%o1+8], %d2 1431*5d9d9091SRichard Lowe ldd [%o1+16],%d4 1432*5d9d9091SRichard Lowe ldd [%o1+24],%d6 1433*5d9d9091SRichard Lowe add %o1, 32, %o1 1434*5d9d9091SRichard Lowe sub %o2, 32, %o2 1435*5d9d9091SRichard Lowe andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 1436*5d9d9091SRichard Lowe and %o2, 0x7f, %o2 ! residue bytes in %o2 1437*5d9d9091SRichard Lowe.align_100_loop: 1438*5d9d9091SRichard Lowe subcc %o5, 128, %o5 1439*5d9d9091SRichard Lowe /* ---- copy line 1 of 2. ---- */ 1440*5d9d9091SRichard Lowe ldda [%o1]%asi,%d16 ! 
block load 1441*5d9d9091SRichard Lowe fmovd %d16, %d8 1442*5d9d9091SRichard Lowe fmovd %d18, %d10 1443*5d9d9091SRichard Lowe fmovd %d20, %d12 1444*5d9d9091SRichard Lowe fmovd %d22, %d14 1445*5d9d9091SRichard Lowe stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1446*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 1447*5d9d9091SRichard Lowe add %o0, 64, %o0 ! advance dst 1448*5d9d9091SRichard Lowe fmovd %d24, %d0 1449*5d9d9091SRichard Lowe fmovd %d26, %d2 1450*5d9d9091SRichard Lowe fmovd %d28, %d4 1451*5d9d9091SRichard Lowe prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 1452*5d9d9091SRichard Lowe fmovd %d30, %d6 1453*5d9d9091SRichard Lowe 1454*5d9d9091SRichard Lowe /* ---- copy line 2 of 2. ---- */ 1455*5d9d9091SRichard Lowe ldda [%o1+64]%asi,%d16 1456*5d9d9091SRichard Lowe fmovd %d16, %d8 1457*5d9d9091SRichard Lowe fmovd %d18, %d10 1458*5d9d9091SRichard Lowe fmovd %d20, %d12 1459*5d9d9091SRichard Lowe fmovd %d22, %d14 1460*5d9d9091SRichard Lowe add %o1, 128, %o1 ! increment src 1461*5d9d9091SRichard Lowe stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1462*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 1463*5d9d9091SRichard Lowe add %o0, 64, %o0 ! advance dst 1464*5d9d9091SRichard Lowe fmovd %d24, %d0 1465*5d9d9091SRichard Lowe fmovd %d26, %d2 1466*5d9d9091SRichard Lowe fmovd %d28, %d4 1467*5d9d9091SRichard Lowe fmovd %d30, %d6 1468*5d9d9091SRichard Lowe bgt,pt %ncc, .align_100_loop 1469*5d9d9091SRichard Lowe prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1470*5d9d9091SRichard Lowe 1471*5d9d9091SRichard Lowe std %d0, [%o0] 1472*5d9d9091SRichard Lowe std %d2, [%o0+8] 1473*5d9d9091SRichard Lowe std %d4, [%o0+16] 1474*5d9d9091SRichard Lowe std %d6, [%o0+24] 1475*5d9d9091SRichard Lowe ba .remain_stuff 1476*5d9d9091SRichard Lowe add %o0, 32, %o0 1477*5d9d9091SRichard Lowe ! END OF align_100 1478*5d9d9091SRichard Lowe 1479*5d9d9091SRichard Lowe.align_011: 1480*5d9d9091SRichard Lowe! 
Alignment off by 40 bytes 1481*5d9d9091SRichard Lowe ldd [%o1], %d0 1482*5d9d9091SRichard Lowe ldd [%o1+8], %d2 1483*5d9d9091SRichard Lowe ldd [%o1+16], %d4 1484*5d9d9091SRichard Lowe ldd [%o1+24], %d6 1485*5d9d9091SRichard Lowe ldd [%o1+32], %d8 1486*5d9d9091SRichard Lowe add %o1, 40, %o1 1487*5d9d9091SRichard Lowe sub %o2, 40, %o2 1488*5d9d9091SRichard Lowe andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 1489*5d9d9091SRichard Lowe and %o2, 0x7f, %o2 ! residue bytes in %o2 1490*5d9d9091SRichard Lowe.align_011_loop: 1491*5d9d9091SRichard Lowe subcc %o5, 128, %o5 1492*5d9d9091SRichard Lowe /* ---- copy line 1 of 2. ---- */ 1493*5d9d9091SRichard Lowe 1494*5d9d9091SRichard Lowe ldda [%o1]%asi,%d16 ! block load 1495*5d9d9091SRichard Lowe fmovd %d16, %d10 1496*5d9d9091SRichard Lowe fmovd %d18, %d12 1497*5d9d9091SRichard Lowe fmovd %d20, %d14 1498*5d9d9091SRichard Lowe stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1499*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 1500*5d9d9091SRichard Lowe add %o0, 64, %o0 ! advance dst 1501*5d9d9091SRichard Lowe fmovd %d22, %d0 1502*5d9d9091SRichard Lowe fmovd %d24, %d2 1503*5d9d9091SRichard Lowe fmovd %d26, %d4 1504*5d9d9091SRichard Lowe fmovd %d28, %d6 1505*5d9d9091SRichard Lowe prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 1506*5d9d9091SRichard Lowe fmovd %d30, %d8 1507*5d9d9091SRichard Lowe 1508*5d9d9091SRichard Lowe /* ---- copy line 2 of 2. ---- */ 1509*5d9d9091SRichard Lowe ldda [%o1+64]%asi,%d16 1510*5d9d9091SRichard Lowe fmovd %d16, %d10 1511*5d9d9091SRichard Lowe fmovd %d18, %d12 1512*5d9d9091SRichard Lowe fmovd %d20, %d14 1513*5d9d9091SRichard Lowe add %o1, 128, %o1 ! increment src 1514*5d9d9091SRichard Lowe stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1515*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 1516*5d9d9091SRichard Lowe add %o0, 64, %o0 ! 
advance dst 1517*5d9d9091SRichard Lowe fmovd %d22, %d0 1518*5d9d9091SRichard Lowe fmovd %d24, %d2 1519*5d9d9091SRichard Lowe fmovd %d26, %d4 1520*5d9d9091SRichard Lowe fmovd %d28, %d6 1521*5d9d9091SRichard Lowe fmovd %d30, %d8 1522*5d9d9091SRichard Lowe bgt,pt %ncc, .align_011_loop 1523*5d9d9091SRichard Lowe prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1524*5d9d9091SRichard Lowe 1525*5d9d9091SRichard Lowe std %d0, [%o0] 1526*5d9d9091SRichard Lowe std %d2, [%o0+8] 1527*5d9d9091SRichard Lowe std %d4, [%o0+16] 1528*5d9d9091SRichard Lowe std %d6, [%o0+24] 1529*5d9d9091SRichard Lowe std %d8, [%o0+32] 1530*5d9d9091SRichard Lowe ba .remain_stuff 1531*5d9d9091SRichard Lowe add %o0, 40, %o0 1532*5d9d9091SRichard Lowe ! END OF align_011 1533*5d9d9091SRichard Lowe 1534*5d9d9091SRichard Lowe.align_010: 1535*5d9d9091SRichard Lowe! Alignment off by 48 bytes 1536*5d9d9091SRichard Lowe ldd [%o1], %d0 1537*5d9d9091SRichard Lowe ldd [%o1+8], %d2 1538*5d9d9091SRichard Lowe ldd [%o1+16], %d4 1539*5d9d9091SRichard Lowe ldd [%o1+24], %d6 1540*5d9d9091SRichard Lowe ldd [%o1+32], %d8 1541*5d9d9091SRichard Lowe ldd [%o1+40], %d10 1542*5d9d9091SRichard Lowe add %o1, 48, %o1 1543*5d9d9091SRichard Lowe sub %o2, 48, %o2 1544*5d9d9091SRichard Lowe andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 1545*5d9d9091SRichard Lowe and %o2, 0x7f, %o2 ! residue bytes in %o2 1546*5d9d9091SRichard Lowe.align_010_loop: 1547*5d9d9091SRichard Lowe subcc %o5, 128, %o5 1548*5d9d9091SRichard Lowe /* ---- copy line 1 of 2. ---- */ 1549*5d9d9091SRichard Lowe 1550*5d9d9091SRichard Lowe ldda [%o1]%asi,%d16 ! block load 1551*5d9d9091SRichard Lowe fmovd %d16, %d12 1552*5d9d9091SRichard Lowe fmovd %d18, %d14 1553*5d9d9091SRichard Lowe stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1554*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 1555*5d9d9091SRichard Lowe add %o0, 64, %o0 ! 
advance dst 1556*5d9d9091SRichard Lowe fmovd %d20, %d0 1557*5d9d9091SRichard Lowe fmovd %d22, %d2 1558*5d9d9091SRichard Lowe fmovd %d24, %d4 1559*5d9d9091SRichard Lowe fmovd %d26, %d6 1560*5d9d9091SRichard Lowe fmovd %d28, %d8 1561*5d9d9091SRichard Lowe prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 1562*5d9d9091SRichard Lowe fmovd %d30, %d10 1563*5d9d9091SRichard Lowe 1564*5d9d9091SRichard Lowe /* ---- copy line 2 of 2. ---- */ 1565*5d9d9091SRichard Lowe ldda [%o1+64]%asi,%d16 1566*5d9d9091SRichard Lowe fmovd %d16, %d12 1567*5d9d9091SRichard Lowe fmovd %d18, %d14 1568*5d9d9091SRichard Lowe add %o1, 128, %o1 ! increment src 1569*5d9d9091SRichard Lowe stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1570*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 1571*5d9d9091SRichard Lowe add %o0, 64, %o0 ! advance dst 1572*5d9d9091SRichard Lowe fmovd %d20, %d0 1573*5d9d9091SRichard Lowe fmovd %d22, %d2 1574*5d9d9091SRichard Lowe fmovd %d24, %d4 1575*5d9d9091SRichard Lowe fmovd %d26, %d6 1576*5d9d9091SRichard Lowe fmovd %d28, %d8 1577*5d9d9091SRichard Lowe fmovd %d30, %d10 1578*5d9d9091SRichard Lowe bgt,pt %ncc, .align_010_loop 1579*5d9d9091SRichard Lowe prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1580*5d9d9091SRichard Lowe 1581*5d9d9091SRichard Lowe std %d0, [%o0] 1582*5d9d9091SRichard Lowe std %d2, [%o0+8] 1583*5d9d9091SRichard Lowe std %d4, [%o0+16] 1584*5d9d9091SRichard Lowe std %d6, [%o0+24] 1585*5d9d9091SRichard Lowe std %d8, [%o0+32] 1586*5d9d9091SRichard Lowe std %d10, [%o0+40] 1587*5d9d9091SRichard Lowe ba .remain_stuff 1588*5d9d9091SRichard Lowe add %o0, 48, %o0 1589*5d9d9091SRichard Lowe ! END OF align_010 1590*5d9d9091SRichard Lowe 1591*5d9d9091SRichard Lowe.align_001: 1592*5d9d9091SRichard Lowe! 
Alignment off by 56 bytes 1593*5d9d9091SRichard Lowe ldd [%o1], %d0 1594*5d9d9091SRichard Lowe ldd [%o1+8], %d2 1595*5d9d9091SRichard Lowe ldd [%o1+16], %d4 1596*5d9d9091SRichard Lowe ldd [%o1+24], %d6 1597*5d9d9091SRichard Lowe ldd [%o1+32], %d8 1598*5d9d9091SRichard Lowe ldd [%o1+40], %d10 1599*5d9d9091SRichard Lowe ldd [%o1+48], %d12 1600*5d9d9091SRichard Lowe add %o1, 56, %o1 1601*5d9d9091SRichard Lowe sub %o2, 56, %o2 1602*5d9d9091SRichard Lowe andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 1603*5d9d9091SRichard Lowe and %o2, 0x7f, %o2 ! residue bytes in %o2 1604*5d9d9091SRichard Lowe.align_001_loop: 1605*5d9d9091SRichard Lowe subcc %o5, 128, %o5 1606*5d9d9091SRichard Lowe /* ---- copy line 1 of 2. ---- */ 1607*5d9d9091SRichard Lowe 1608*5d9d9091SRichard Lowe ldda [%o1]%asi,%d16 ! block load 1609*5d9d9091SRichard Lowe fmovd %d16, %d14 1610*5d9d9091SRichard Lowe stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1611*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 1612*5d9d9091SRichard Lowe add %o0, 64, %o0 ! advance dst 1613*5d9d9091SRichard Lowe fmovd %d18, %d0 1614*5d9d9091SRichard Lowe fmovd %d20, %d2 1615*5d9d9091SRichard Lowe fmovd %d22, %d4 1616*5d9d9091SRichard Lowe fmovd %d24, %d6 1617*5d9d9091SRichard Lowe fmovd %d26, %d8 1618*5d9d9091SRichard Lowe fmovd %d28, %d10 1619*5d9d9091SRichard Lowe prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 1620*5d9d9091SRichard Lowe fmovd %d30, %d12 1621*5d9d9091SRichard Lowe 1622*5d9d9091SRichard Lowe /* ---- copy line 2 of 2. ---- */ 1623*5d9d9091SRichard Lowe ldda [%o1+64]%asi,%d16 1624*5d9d9091SRichard Lowe fmovd %d16, %d14 1625*5d9d9091SRichard Lowe add %o1, 128, %o1 ! increment src 1626*5d9d9091SRichard Lowe stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1627*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 1628*5d9d9091SRichard Lowe add %o0, 64, %o0 ! 
advance dst 1629*5d9d9091SRichard Lowe fmovd %d18, %d0 1630*5d9d9091SRichard Lowe fmovd %d20, %d2 1631*5d9d9091SRichard Lowe fmovd %d22, %d4 1632*5d9d9091SRichard Lowe fmovd %d24, %d6 1633*5d9d9091SRichard Lowe fmovd %d26, %d8 1634*5d9d9091SRichard Lowe fmovd %d28, %d10 1635*5d9d9091SRichard Lowe fmovd %d30, %d12 1636*5d9d9091SRichard Lowe bgt,pt %ncc, .align_001_loop 1637*5d9d9091SRichard Lowe prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1638*5d9d9091SRichard Lowe 1639*5d9d9091SRichard Lowe std %d0, [%o0] 1640*5d9d9091SRichard Lowe std %d2, [%o0+8] 1641*5d9d9091SRichard Lowe std %d4, [%o0+16] 1642*5d9d9091SRichard Lowe std %d6, [%o0+24] 1643*5d9d9091SRichard Lowe std %d8, [%o0+32] 1644*5d9d9091SRichard Lowe std %d10, [%o0+40] 1645*5d9d9091SRichard Lowe std %d12, [%o0+48] 1646*5d9d9091SRichard Lowe ba .remain_stuff 1647*5d9d9091SRichard Lowe add %o0, 56, %o0 1648*5d9d9091SRichard Lowe ! END OF align_001 1649*5d9d9091SRichard Lowe 1650*5d9d9091SRichard Lowe.align_000: 1651*5d9d9091SRichard Lowe andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 1652*5d9d9091SRichard Lowe and %o2, 0x7f, %o2 ! residue bytes in %o2 1653*5d9d9091SRichard Lowe.align_000_loop: 1654*5d9d9091SRichard Lowe /* ---- copy line 1 of 2. ---- */ 1655*5d9d9091SRichard Lowe subcc %o5, 128, %o5 1656*5d9d9091SRichard Lowe ldda [%o1]%asi,%d0 1657*5d9d9091SRichard Lowe stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1658*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 1659*5d9d9091SRichard Lowe prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 1660*5d9d9091SRichard Lowe 1661*5d9d9091SRichard Lowe /* ---- copy line 2 of 2. ---- */ 1662*5d9d9091SRichard Lowe add %o0, 64, %o0 1663*5d9d9091SRichard Lowe ldda [%o1+64]%asi,%d0 1664*5d9d9091SRichard Lowe add %o1, 128, %o1 ! increment src 1665*5d9d9091SRichard Lowe stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1666*5d9d9091SRichard Lowe stda %d0,[%o0]%asi 1667*5d9d9091SRichard Lowe add %o0, 64, %o0 ! 
increment dst 1668*5d9d9091SRichard Lowe bgt,pt %ncc, .align_000_loop 1669*5d9d9091SRichard Lowe prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1670*5d9d9091SRichard Lowe 1671*5d9d9091SRichard Lowe ! END OF align_000 1672*5d9d9091SRichard Lowe 1673*5d9d9091SRichard Lowe.remain_stuff: 1674*5d9d9091SRichard Lowe mov %o4, %asi ! restore %asi 1675*5d9d9091SRichard Lowe brnz %g5, .medlong 1676*5d9d9091SRichard Lowe membar #Sync 1677*5d9d9091SRichard Lowe ba .medlong 1678*5d9d9091SRichard Lowe wr %g5, %g0, %fprs 1679*5d9d9091SRichard Lowe 1680*5d9d9091SRichard Lowe .align 16 1681*5d9d9091SRichard Lowe ! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX 1682*5d9d9091SRichard Lowe.unalignsetup: 1683*5d9d9091SRichard Lowe prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read 1684*5d9d9091SRichard Lowe.unalignrejoin: 1685*5d9d9091SRichard Lowe rd %fprs, %g5 ! check for unused fp 1686*5d9d9091SRichard Lowe ! if fprs.fef == 0, set it. 1687*5d9d9091SRichard Lowe ! Setting it when already set costs more than checking 1688*5d9d9091SRichard Lowe andcc %g5, FPRS_FEF, %g5 ! test FEF, fprs.du = fprs.dl = 0 1689*5d9d9091SRichard Lowe bz,a %ncc, 1f 1690*5d9d9091SRichard Lowe wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1 1691*5d9d9091SRichard Lowe1: 1692*5d9d9091SRichard Lowe cmp %o2, MED_UMAX ! check for medium unaligned limit 1693*5d9d9091SRichard Lowe bge,pt %ncc,.unalign_large 1694*5d9d9091SRichard Lowe nop 1695*5d9d9091SRichard Lowe andn %o2, 0x3f, %o5 ! %o5 is multiple of block size 1696*5d9d9091SRichard Lowe and %o2, 0x3f, %o2 ! residue bytes in %o2 1697*5d9d9091SRichard Lowe cmp %o2, 8 ! Insure we don't load beyond 1698*5d9d9091SRichard Lowe bgt .unalign_adjust ! end of source buffer 1699*5d9d9091SRichard Lowe andn %o1, 0x7, %o4 ! %o4 has long word aligned src address 1700*5d9d9091SRichard Lowe add %o2, 64, %o2 ! adjust to leave loop 1701*5d9d9091SRichard Lowe sub %o5, 64, %o5 ! 
early if necessary 1702*5d9d9091SRichard Lowe.unalign_adjust: 1703*5d9d9091SRichard Lowe alignaddr %o1, %g0, %g0 ! generate %gsr 1704*5d9d9091SRichard Lowe add %o1, %o5, %o1 ! advance %o1 to after blocks 1705*5d9d9091SRichard Lowe ldd [%o4], %d0 1706*5d9d9091SRichard Lowe.unalign_loop: 1707*5d9d9091SRichard Lowe ldd [%o4+8], %d2 1708*5d9d9091SRichard Lowe faligndata %d0, %d2, %d16 1709*5d9d9091SRichard Lowe ldd [%o4+16], %d4 1710*5d9d9091SRichard Lowe std %d16, [%o0] 1711*5d9d9091SRichard Lowe faligndata %d2, %d4, %d18 1712*5d9d9091SRichard Lowe ldd [%o4+24], %d6 1713*5d9d9091SRichard Lowe std %d18, [%o0+8] 1714*5d9d9091SRichard Lowe faligndata %d4, %d6, %d20 1715*5d9d9091SRichard Lowe ldd [%o4+32], %d8 1716*5d9d9091SRichard Lowe std %d20, [%o0+16] 1717*5d9d9091SRichard Lowe faligndata %d6, %d8, %d22 1718*5d9d9091SRichard Lowe ldd [%o4+40], %d10 1719*5d9d9091SRichard Lowe std %d22, [%o0+24] 1720*5d9d9091SRichard Lowe faligndata %d8, %d10, %d24 1721*5d9d9091SRichard Lowe ldd [%o4+48], %d12 1722*5d9d9091SRichard Lowe std %d24, [%o0+32] 1723*5d9d9091SRichard Lowe faligndata %d10, %d12, %d26 1724*5d9d9091SRichard Lowe ldd [%o4+56], %d14 1725*5d9d9091SRichard Lowe std %d26, [%o0+40] 1726*5d9d9091SRichard Lowe faligndata %d12, %d14, %d28 1727*5d9d9091SRichard Lowe ldd [%o4+64], %d0 1728*5d9d9091SRichard Lowe std %d28, [%o0+48] 1729*5d9d9091SRichard Lowe faligndata %d14, %d0, %d30 1730*5d9d9091SRichard Lowe add %o4, BLOCK_SIZE, %o4 1731*5d9d9091SRichard Lowe std %d30, [%o0+56] 1732*5d9d9091SRichard Lowe add %o0, BLOCK_SIZE, %o0 1733*5d9d9091SRichard Lowe subcc %o5, BLOCK_SIZE, %o5 1734*5d9d9091SRichard Lowe bgu,pt %ncc, .unalign_loop 1735*5d9d9091SRichard Lowe prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 1736*5d9d9091SRichard Lowe ba .unalign_done 1737*5d9d9091SRichard Lowe nop 1738*5d9d9091SRichard Lowe 1739*5d9d9091SRichard Lowe.unalign_large: 1740*5d9d9091SRichard Lowe andcc %o0, 0x3f, %o3 ! is dst 64-byte block aligned? 
1741*5d9d9091SRichard Lowe bz %ncc, .unalignsrc 1742*5d9d9091SRichard Lowe sub %o3, 64, %o3 ! %o3 will be multiple of 8 1743*5d9d9091SRichard Lowe neg %o3 ! bytes until dest is 64 byte aligned 1744*5d9d9091SRichard Lowe sub %o2, %o3, %o2 ! update cnt with bytes to be moved 1745*5d9d9091SRichard Lowe ! Move bytes according to source alignment 1746*5d9d9091SRichard Lowe andcc %o1, 0x1, %o5 1747*5d9d9091SRichard Lowe bnz %ncc, .unalignbyte ! check for byte alignment 1748*5d9d9091SRichard Lowe nop 1749*5d9d9091SRichard Lowe andcc %o1, 2, %o5 ! check for half word alignment 1750*5d9d9091SRichard Lowe bnz %ncc, .unalignhalf 1751*5d9d9091SRichard Lowe nop 1752*5d9d9091SRichard Lowe ! Src is word aligned 1753*5d9d9091SRichard Lowe.unalignword: 1754*5d9d9091SRichard Lowe ld [%o1], %o4 ! load 4 bytes 1755*5d9d9091SRichard Lowe stw %o4, [%o0] ! and store 4 bytes 1756*5d9d9091SRichard Lowe ld [%o1+4], %o4 ! load 4 bytes 1757*5d9d9091SRichard Lowe add %o1, 8, %o1 ! increase src ptr by 8 1758*5d9d9091SRichard Lowe stw %o4, [%o0+4] ! and store 4 bytes 1759*5d9d9091SRichard Lowe subcc %o3, 8, %o3 ! decrease count by 8 1760*5d9d9091SRichard Lowe bnz %ncc, .unalignword 1761*5d9d9091SRichard Lowe add %o0, 8, %o0 ! increase dst ptr by 8 1762*5d9d9091SRichard Lowe ba .unalignsrc 1763*5d9d9091SRichard Lowe nop 1764*5d9d9091SRichard Lowe 1765*5d9d9091SRichard Lowe ! Src is half-word aligned 1766*5d9d9091SRichard Lowe.unalignhalf: 1767*5d9d9091SRichard Lowe lduh [%o1], %o4 ! load 2 bytes 1768*5d9d9091SRichard Lowe sllx %o4, 32, %o5 ! 
shift left 1769*5d9d9091SRichard Lowe lduw [%o1+2], %o4 1770*5d9d9091SRichard Lowe or %o4, %o5, %o5 1771*5d9d9091SRichard Lowe sllx %o5, 16, %o5 1772*5d9d9091SRichard Lowe lduh [%o1+6], %o4 1773*5d9d9091SRichard Lowe or %o4, %o5, %o5 1774*5d9d9091SRichard Lowe stx %o5, [%o0] 1775*5d9d9091SRichard Lowe add %o1, 8, %o1 1776*5d9d9091SRichard Lowe subcc %o3, 8, %o3 1777*5d9d9091SRichard Lowe bnz %ncc, .unalignhalf 1778*5d9d9091SRichard Lowe add %o0, 8, %o0 1779*5d9d9091SRichard Lowe ba .unalignsrc 1780*5d9d9091SRichard Lowe nop 1781*5d9d9091SRichard Lowe 1782*5d9d9091SRichard Lowe ! Src is Byte aligned 1783*5d9d9091SRichard Lowe.unalignbyte: 1784*5d9d9091SRichard Lowe sub %o0, %o1, %o0 ! share pointer advance 1785*5d9d9091SRichard Lowe.unalignbyte_loop: 1786*5d9d9091SRichard Lowe ldub [%o1], %o4 1787*5d9d9091SRichard Lowe sllx %o4, 56, %o5 1788*5d9d9091SRichard Lowe lduh [%o1+1], %o4 1789*5d9d9091SRichard Lowe sllx %o4, 40, %o4 1790*5d9d9091SRichard Lowe or %o4, %o5, %o5 1791*5d9d9091SRichard Lowe lduh [%o1+3], %o4 1792*5d9d9091SRichard Lowe sllx %o4, 24, %o4 1793*5d9d9091SRichard Lowe or %o4, %o5, %o5 1794*5d9d9091SRichard Lowe lduh [%o1+5], %o4 1795*5d9d9091SRichard Lowe sllx %o4, 8, %o4 1796*5d9d9091SRichard Lowe or %o4, %o5, %o5 1797*5d9d9091SRichard Lowe ldub [%o1+7], %o4 1798*5d9d9091SRichard Lowe or %o4, %o5, %o5 1799*5d9d9091SRichard Lowe stx %o5, [%o0+%o1] 1800*5d9d9091SRichard Lowe subcc %o3, 8, %o3 1801*5d9d9091SRichard Lowe bnz %ncc, .unalignbyte_loop 1802*5d9d9091SRichard Lowe add %o1, 8, %o1 1803*5d9d9091SRichard Lowe add %o0,%o1, %o0 ! restore pointer 1804*5d9d9091SRichard Lowe 1805*5d9d9091SRichard Lowe ! Destination is now block (64 byte aligned) 1806*5d9d9091SRichard Lowe.unalignsrc: 1807*5d9d9091SRichard Lowe andn %o2, 0x3f, %o5 ! %o5 is multiple of block size 1808*5d9d9091SRichard Lowe and %o2, 0x3f, %o2 ! residue bytes in %o2 1809*5d9d9091SRichard Lowe add %o2, 64, %o2 ! Insure we don't load beyond 1810*5d9d9091SRichard Lowe sub %o5, 64, %o5 ! 
end of source buffer 1811*5d9d9091SRichard Lowe 1812*5d9d9091SRichard Lowe andn %o1, 0x3f, %o4 ! %o4 has block aligned src address 1813*5d9d9091SRichard Lowe prefetch [%o4 + (3 * BLOCK_SIZE)], #one_read 1814*5d9d9091SRichard Lowe alignaddr %o1, %g0, %g0 ! generate %gsr 1815*5d9d9091SRichard Lowe add %o1, %o5, %o1 ! advance %o1 to after blocks 1816*5d9d9091SRichard Lowe ! 1817*5d9d9091SRichard Lowe ! Determine source alignment to correct 8 byte offset 1818*5d9d9091SRichard Lowe andcc %o1, 0x20, %o3 1819*5d9d9091SRichard Lowe brnz,pn %o3, .unalign_1 1820*5d9d9091SRichard Lowe nop 1821*5d9d9091SRichard Lowe andcc %o1, 0x10, %o3 1822*5d9d9091SRichard Lowe brnz,pn %o3, .unalign_01 1823*5d9d9091SRichard Lowe nop 1824*5d9d9091SRichard Lowe andcc %o1, 0x08, %o3 1825*5d9d9091SRichard Lowe brz,a %o3, .unalign_000 1826*5d9d9091SRichard Lowe prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 1827*5d9d9091SRichard Lowe ba .unalign_001 1828*5d9d9091SRichard Lowe prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 1829*5d9d9091SRichard Lowe.unalign_01: 1830*5d9d9091SRichard Lowe andcc %o1, 0x08, %o3 1831*5d9d9091SRichard Lowe brnz,a %o3, .unalign_011 1832*5d9d9091SRichard Lowe prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 1833*5d9d9091SRichard Lowe ba .unalign_010 1834*5d9d9091SRichard Lowe prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 1835*5d9d9091SRichard Lowe.unalign_1: 1836*5d9d9091SRichard Lowe andcc %o1, 0x10, %o3 1837*5d9d9091SRichard Lowe brnz,pn %o3, .unalign_11 1838*5d9d9091SRichard Lowe nop 1839*5d9d9091SRichard Lowe andcc %o1, 0x08, %o3 1840*5d9d9091SRichard Lowe brnz,a %o3, .unalign_101 1841*5d9d9091SRichard Lowe prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 1842*5d9d9091SRichard Lowe ba .unalign_100 1843*5d9d9091SRichard Lowe prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 1844*5d9d9091SRichard Lowe.unalign_11: 1845*5d9d9091SRichard Lowe andcc %o1, 0x08, %o3 1846*5d9d9091SRichard Lowe brz,pn %o3, .unalign_110 1847*5d9d9091SRichard Lowe prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 
1848*5d9d9091SRichard Lowe 1849*5d9d9091SRichard Lowe.unalign_111: 1850*5d9d9091SRichard Lowe ldd [%o4+56], %d14 1851*5d9d9091SRichard Lowe.unalign_111_loop: 1852*5d9d9091SRichard Lowe add %o4, 64, %o4 1853*5d9d9091SRichard Lowe ldda [%o4]ASI_BLK_P, %d16 1854*5d9d9091SRichard Lowe faligndata %d14, %d16, %d48 1855*5d9d9091SRichard Lowe faligndata %d16, %d18, %d50 1856*5d9d9091SRichard Lowe faligndata %d18, %d20, %d52 1857*5d9d9091SRichard Lowe faligndata %d20, %d22, %d54 1858*5d9d9091SRichard Lowe faligndata %d22, %d24, %d56 1859*5d9d9091SRichard Lowe faligndata %d24, %d26, %d58 1860*5d9d9091SRichard Lowe faligndata %d26, %d28, %d60 1861*5d9d9091SRichard Lowe faligndata %d28, %d30, %d62 1862*5d9d9091SRichard Lowe fmovd %d30, %d14 1863*5d9d9091SRichard Lowe stda %d48, [%o0]ASI_BLK_P 1864*5d9d9091SRichard Lowe subcc %o5, 64, %o5 1865*5d9d9091SRichard Lowe add %o0, 64, %o0 1866*5d9d9091SRichard Lowe bgu,pt %ncc, .unalign_111_loop 1867*5d9d9091SRichard Lowe prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 1868*5d9d9091SRichard Lowe ba .unalign_done 1869*5d9d9091SRichard Lowe membar #Sync 1870*5d9d9091SRichard Lowe 1871*5d9d9091SRichard Lowe.unalign_110: 1872*5d9d9091SRichard Lowe ldd [%o4+48], %d12 1873*5d9d9091SRichard Lowe ldd [%o4+56], %d14 1874*5d9d9091SRichard Lowe.unalign_110_loop: 1875*5d9d9091SRichard Lowe add %o4, 64, %o4 1876*5d9d9091SRichard Lowe ldda [%o4]ASI_BLK_P, %d16 1877*5d9d9091SRichard Lowe faligndata %d12, %d14, %d48 1878*5d9d9091SRichard Lowe faligndata %d14, %d16, %d50 1879*5d9d9091SRichard Lowe faligndata %d16, %d18, %d52 1880*5d9d9091SRichard Lowe faligndata %d18, %d20, %d54 1881*5d9d9091SRichard Lowe faligndata %d20, %d22, %d56 1882*5d9d9091SRichard Lowe faligndata %d22, %d24, %d58 1883*5d9d9091SRichard Lowe faligndata %d24, %d26, %d60 1884*5d9d9091SRichard Lowe faligndata %d26, %d28, %d62 1885*5d9d9091SRichard Lowe fmovd %d28, %d12 1886*5d9d9091SRichard Lowe fmovd %d30, %d14 1887*5d9d9091SRichard Lowe stda %d48, [%o0]ASI_BLK_P 1888*5d9d9091SRichard 
Lowe subcc %o5, 64, %o5 1889*5d9d9091SRichard Lowe add %o0, 64, %o0 1890*5d9d9091SRichard Lowe bgu,pt %ncc, .unalign_110_loop 1891*5d9d9091SRichard Lowe prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 1892*5d9d9091SRichard Lowe ba .unalign_done 1893*5d9d9091SRichard Lowe membar #Sync 1894*5d9d9091SRichard Lowe 1895*5d9d9091SRichard Lowe.unalign_101: 1896*5d9d9091SRichard Lowe ldd [%o4+40], %d10 1897*5d9d9091SRichard Lowe ldd [%o4+48], %d12 1898*5d9d9091SRichard Lowe ldd [%o4+56], %d14 1899*5d9d9091SRichard Lowe.unalign_101_loop: 1900*5d9d9091SRichard Lowe add %o4, 64, %o4 1901*5d9d9091SRichard Lowe ldda [%o4]ASI_BLK_P, %d16 1902*5d9d9091SRichard Lowe faligndata %d10, %d12, %d48 1903*5d9d9091SRichard Lowe faligndata %d12, %d14, %d50 1904*5d9d9091SRichard Lowe faligndata %d14, %d16, %d52 1905*5d9d9091SRichard Lowe faligndata %d16, %d18, %d54 1906*5d9d9091SRichard Lowe faligndata %d18, %d20, %d56 1907*5d9d9091SRichard Lowe faligndata %d20, %d22, %d58 1908*5d9d9091SRichard Lowe faligndata %d22, %d24, %d60 1909*5d9d9091SRichard Lowe faligndata %d24, %d26, %d62 1910*5d9d9091SRichard Lowe fmovd %d26, %d10 1911*5d9d9091SRichard Lowe fmovd %d28, %d12 1912*5d9d9091SRichard Lowe fmovd %d30, %d14 1913*5d9d9091SRichard Lowe stda %d48, [%o0]ASI_BLK_P 1914*5d9d9091SRichard Lowe subcc %o5, 64, %o5 1915*5d9d9091SRichard Lowe add %o0, 64, %o0 1916*5d9d9091SRichard Lowe bgu,pt %ncc, .unalign_101_loop 1917*5d9d9091SRichard Lowe prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 1918*5d9d9091SRichard Lowe ba .unalign_done 1919*5d9d9091SRichard Lowe membar #Sync 1920*5d9d9091SRichard Lowe 1921*5d9d9091SRichard Lowe.unalign_100: 1922*5d9d9091SRichard Lowe ldd [%o4+32], %d8 1923*5d9d9091SRichard Lowe ldd [%o4+40], %d10 1924*5d9d9091SRichard Lowe ldd [%o4+48], %d12 1925*5d9d9091SRichard Lowe ldd [%o4+56], %d14 1926*5d9d9091SRichard Lowe.unalign_100_loop: 1927*5d9d9091SRichard Lowe add %o4, 64, %o4 1928*5d9d9091SRichard Lowe ldda [%o4]ASI_BLK_P, %d16 1929*5d9d9091SRichard Lowe faligndata %d8, %d10, 
%d48 1930*5d9d9091SRichard Lowe faligndata %d10, %d12, %d50 1931*5d9d9091SRichard Lowe faligndata %d12, %d14, %d52 1932*5d9d9091SRichard Lowe faligndata %d14, %d16, %d54 1933*5d9d9091SRichard Lowe faligndata %d16, %d18, %d56 1934*5d9d9091SRichard Lowe faligndata %d18, %d20, %d58 1935*5d9d9091SRichard Lowe faligndata %d20, %d22, %d60 1936*5d9d9091SRichard Lowe faligndata %d22, %d24, %d62 1937*5d9d9091SRichard Lowe fmovd %d24, %d8 1938*5d9d9091SRichard Lowe fmovd %d26, %d10 1939*5d9d9091SRichard Lowe fmovd %d28, %d12 1940*5d9d9091SRichard Lowe fmovd %d30, %d14 1941*5d9d9091SRichard Lowe stda %d48, [%o0]ASI_BLK_P 1942*5d9d9091SRichard Lowe subcc %o5, 64, %o5 1943*5d9d9091SRichard Lowe add %o0, 64, %o0 1944*5d9d9091SRichard Lowe bgu,pt %ncc, .unalign_100_loop 1945*5d9d9091SRichard Lowe prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 1946*5d9d9091SRichard Lowe ba .unalign_done 1947*5d9d9091SRichard Lowe membar #Sync 1948*5d9d9091SRichard Lowe 1949*5d9d9091SRichard Lowe.unalign_011: 1950*5d9d9091SRichard Lowe ldd [%o4+24], %d6 1951*5d9d9091SRichard Lowe ldd [%o4+32], %d8 1952*5d9d9091SRichard Lowe ldd [%o4+40], %d10 1953*5d9d9091SRichard Lowe ldd [%o4+48], %d12 1954*5d9d9091SRichard Lowe ldd [%o4+56], %d14 1955*5d9d9091SRichard Lowe.unalign_011_loop: 1956*5d9d9091SRichard Lowe add %o4, 64, %o4 1957*5d9d9091SRichard Lowe ldda [%o4]ASI_BLK_P, %d16 1958*5d9d9091SRichard Lowe faligndata %d6, %d8, %d48 1959*5d9d9091SRichard Lowe faligndata %d8, %d10, %d50 1960*5d9d9091SRichard Lowe faligndata %d10, %d12, %d52 1961*5d9d9091SRichard Lowe faligndata %d12, %d14, %d54 1962*5d9d9091SRichard Lowe faligndata %d14, %d16, %d56 1963*5d9d9091SRichard Lowe faligndata %d16, %d18, %d58 1964*5d9d9091SRichard Lowe faligndata %d18, %d20, %d60 1965*5d9d9091SRichard Lowe faligndata %d20, %d22, %d62 1966*5d9d9091SRichard Lowe fmovd %d22, %d6 1967*5d9d9091SRichard Lowe fmovd %d24, %d8 1968*5d9d9091SRichard Lowe fmovd %d26, %d10 1969*5d9d9091SRichard Lowe fmovd %d28, %d12 1970*5d9d9091SRichard Lowe 
fmovd %d30, %d14 1971*5d9d9091SRichard Lowe stda %d48, [%o0]ASI_BLK_P 1972*5d9d9091SRichard Lowe subcc %o5, 64, %o5 1973*5d9d9091SRichard Lowe add %o0, 64, %o0 1974*5d9d9091SRichard Lowe bgu,pt %ncc, .unalign_011_loop 1975*5d9d9091SRichard Lowe prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 1976*5d9d9091SRichard Lowe ba .unalign_done 1977*5d9d9091SRichard Lowe membar #Sync 1978*5d9d9091SRichard Lowe 1979*5d9d9091SRichard Lowe.unalign_010: 1980*5d9d9091SRichard Lowe ldd [%o4+16], %d4 1981*5d9d9091SRichard Lowe ldd [%o4+24], %d6 1982*5d9d9091SRichard Lowe ldd [%o4+32], %d8 1983*5d9d9091SRichard Lowe ldd [%o4+40], %d10 1984*5d9d9091SRichard Lowe ldd [%o4+48], %d12 1985*5d9d9091SRichard Lowe ldd [%o4+56], %d14 1986*5d9d9091SRichard Lowe.unalign_010_loop: 1987*5d9d9091SRichard Lowe add %o4, 64, %o4 1988*5d9d9091SRichard Lowe ldda [%o4]ASI_BLK_P, %d16 1989*5d9d9091SRichard Lowe faligndata %d4, %d6, %d48 1990*5d9d9091SRichard Lowe faligndata %d6, %d8, %d50 1991*5d9d9091SRichard Lowe faligndata %d8, %d10, %d52 1992*5d9d9091SRichard Lowe faligndata %d10, %d12, %d54 1993*5d9d9091SRichard Lowe faligndata %d12, %d14, %d56 1994*5d9d9091SRichard Lowe faligndata %d14, %d16, %d58 1995*5d9d9091SRichard Lowe faligndata %d16, %d18, %d60 1996*5d9d9091SRichard Lowe faligndata %d18, %d20, %d62 1997*5d9d9091SRichard Lowe fmovd %d20, %d4 1998*5d9d9091SRichard Lowe fmovd %d22, %d6 1999*5d9d9091SRichard Lowe fmovd %d24, %d8 2000*5d9d9091SRichard Lowe fmovd %d26, %d10 2001*5d9d9091SRichard Lowe fmovd %d28, %d12 2002*5d9d9091SRichard Lowe fmovd %d30, %d14 2003*5d9d9091SRichard Lowe stda %d48, [%o0]ASI_BLK_P 2004*5d9d9091SRichard Lowe subcc %o5, 64, %o5 2005*5d9d9091SRichard Lowe add %o0, 64, %o0 2006*5d9d9091SRichard Lowe bgu,pt %ncc, .unalign_010_loop 2007*5d9d9091SRichard Lowe prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 2008*5d9d9091SRichard Lowe ba .unalign_done 2009*5d9d9091SRichard Lowe membar #Sync 2010*5d9d9091SRichard Lowe 2011*5d9d9091SRichard Lowe.unalign_001: 2012*5d9d9091SRichard 
Lowe ldd [%o4+8], %d2 2013*5d9d9091SRichard Lowe ldd [%o4+16], %d4 2014*5d9d9091SRichard Lowe ldd [%o4+24], %d6 2015*5d9d9091SRichard Lowe ldd [%o4+32], %d8 2016*5d9d9091SRichard Lowe ldd [%o4+40], %d10 2017*5d9d9091SRichard Lowe ldd [%o4+48], %d12 2018*5d9d9091SRichard Lowe ldd [%o4+56], %d14 2019*5d9d9091SRichard Lowe.unalign_001_loop: 2020*5d9d9091SRichard Lowe add %o4, 64, %o4 2021*5d9d9091SRichard Lowe ldda [%o4]ASI_BLK_P, %d16 2022*5d9d9091SRichard Lowe faligndata %d2, %d4, %d48 2023*5d9d9091SRichard Lowe faligndata %d4, %d6, %d50 2024*5d9d9091SRichard Lowe faligndata %d6, %d8, %d52 2025*5d9d9091SRichard Lowe faligndata %d8, %d10, %d54 2026*5d9d9091SRichard Lowe faligndata %d10, %d12, %d56 2027*5d9d9091SRichard Lowe faligndata %d12, %d14, %d58 2028*5d9d9091SRichard Lowe faligndata %d14, %d16, %d60 2029*5d9d9091SRichard Lowe faligndata %d16, %d18, %d62 2030*5d9d9091SRichard Lowe fmovd %d18, %d2 2031*5d9d9091SRichard Lowe fmovd %d20, %d4 2032*5d9d9091SRichard Lowe fmovd %d22, %d6 2033*5d9d9091SRichard Lowe fmovd %d24, %d8 2034*5d9d9091SRichard Lowe fmovd %d26, %d10 2035*5d9d9091SRichard Lowe fmovd %d28, %d12 2036*5d9d9091SRichard Lowe fmovd %d30, %d14 2037*5d9d9091SRichard Lowe stda %d48, [%o0]ASI_BLK_P 2038*5d9d9091SRichard Lowe subcc %o5, 64, %o5 2039*5d9d9091SRichard Lowe add %o0, 64, %o0 2040*5d9d9091SRichard Lowe bgu,pt %ncc, .unalign_001_loop 2041*5d9d9091SRichard Lowe prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 2042*5d9d9091SRichard Lowe ba .unalign_done 2043*5d9d9091SRichard Lowe membar #Sync 2044*5d9d9091SRichard Lowe 2045*5d9d9091SRichard Lowe.unalign_000: 2046*5d9d9091SRichard Lowe ldda [%o4]ASI_BLK_P, %d0 2047*5d9d9091SRichard Lowe.unalign_000_loop: 2048*5d9d9091SRichard Lowe add %o4, 64, %o4 2049*5d9d9091SRichard Lowe ldda [%o4]ASI_BLK_P, %d16 2050*5d9d9091SRichard Lowe faligndata %d0, %d2, %d48 2051*5d9d9091SRichard Lowe faligndata %d2, %d4, %d50 2052*5d9d9091SRichard Lowe faligndata %d4, %d6, %d52 2053*5d9d9091SRichard Lowe faligndata %d6, %d8, 
%d54 2054*5d9d9091SRichard Lowe faligndata %d8, %d10, %d56 2055*5d9d9091SRichard Lowe faligndata %d10, %d12, %d58 2056*5d9d9091SRichard Lowe faligndata %d12, %d14, %d60 2057*5d9d9091SRichard Lowe faligndata %d14, %d16, %d62 2058*5d9d9091SRichard Lowe fmovd %d16, %d0 2059*5d9d9091SRichard Lowe fmovd %d18, %d2 2060*5d9d9091SRichard Lowe fmovd %d20, %d4 2061*5d9d9091SRichard Lowe fmovd %d22, %d6 2062*5d9d9091SRichard Lowe fmovd %d24, %d8 2063*5d9d9091SRichard Lowe fmovd %d26, %d10 2064*5d9d9091SRichard Lowe fmovd %d28, %d12 2065*5d9d9091SRichard Lowe fmovd %d30, %d14 2066*5d9d9091SRichard Lowe stda %d48, [%o0]ASI_BLK_P 2067*5d9d9091SRichard Lowe subcc %o5, 64, %o5 2068*5d9d9091SRichard Lowe add %o0, 64, %o0 2069*5d9d9091SRichard Lowe bgu,pt %ncc, .unalign_000_loop 2070*5d9d9091SRichard Lowe prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 2071*5d9d9091SRichard Lowe membar #Sync 2072*5d9d9091SRichard Lowe 2073*5d9d9091SRichard Lowe.unalign_done: 2074*5d9d9091SRichard Lowe ! Handle trailing bytes, 64 to 127 2075*5d9d9091SRichard Lowe ! Dest long word aligned, Src not long word aligned 2076*5d9d9091SRichard Lowe cmp %o2, 15 2077*5d9d9091SRichard Lowe bleu %ncc, .unalign_short 2078*5d9d9091SRichard Lowe 2079*5d9d9091SRichard Lowe andn %o2, 0x7, %o5 ! %o5 is multiple of 8 2080*5d9d9091SRichard Lowe and %o2, 0x7, %o2 ! residue bytes in %o2 2081*5d9d9091SRichard Lowe add %o2, 8, %o2 2082*5d9d9091SRichard Lowe sub %o5, 8, %o5 ! insure we don't load past end of src 2083*5d9d9091SRichard Lowe andn %o1, 0x7, %o4 ! %o4 has long word aligned src address 2084*5d9d9091SRichard Lowe add %o1, %o5, %o1 ! advance %o1 to after multiple of 8 2085*5d9d9091SRichard Lowe ldd [%o4], %d0 ! 
fetch partial word 2086*5d9d9091SRichard Lowe.unalign_by8: 2087*5d9d9091SRichard Lowe ldd [%o4+8], %d2 2088*5d9d9091SRichard Lowe add %o4, 8, %o4 2089*5d9d9091SRichard Lowe faligndata %d0, %d2, %d16 2090*5d9d9091SRichard Lowe subcc %o5, 8, %o5 2091*5d9d9091SRichard Lowe std %d16, [%o0] 2092*5d9d9091SRichard Lowe fmovd %d2, %d0 2093*5d9d9091SRichard Lowe bgu,pt %ncc, .unalign_by8 2094*5d9d9091SRichard Lowe add %o0, 8, %o0 2095*5d9d9091SRichard Lowe 2096*5d9d9091SRichard Lowe.unalign_short: 2097*5d9d9091SRichard Lowe brnz %g5, .smallrest 2098*5d9d9091SRichard Lowe nop 2099*5d9d9091SRichard Lowe ba .smallrest 2100*5d9d9091SRichard Lowe wr %g5, %g0, %fprs 2101*5d9d9091SRichard Lowe#else /* NIAGARA2_IMPL */ 2102*5d9d9091SRichard Lowe.forcpy: 2103*5d9d9091SRichard Lowe mov %o0, %g5 ! save des address for return val 2104*5d9d9091SRichard Lowe cmp %o2, 17 ! for small counts copy bytes 2105*5d9d9091SRichard Lowe bleu,pt %ncc, .dbytecp 2106*5d9d9091SRichard Lowe nop 2107*5d9d9091SRichard Lowe 2108*5d9d9091SRichard Lowe cmp %o2, 0x80 ! For lengths less than 128 bytes no 2109*5d9d9091SRichard Lowe bleu,pn %ncc, .no_blkcpy ! copy using ASI_BLK_INIT_ST_QUAD_LDD_P 2110*5d9d9091SRichard Lowe 2111*5d9d9091SRichard Lowe /* 2112*5d9d9091SRichard Lowe * Make sure that source and destination buffers are 64 bytes apart. 2113*5d9d9091SRichard Lowe * If they are not, do not use ASI_BLK_INIT_ST_QUAD_LDD_P asi to copy 2114*5d9d9091SRichard Lowe * the data. 2115*5d9d9091SRichard Lowe */ 2116*5d9d9091SRichard Lowe subcc %o1, %o0, %o3 2117*5d9d9091SRichard Lowe blu %ncc, .blkalgndst 2118*5d9d9091SRichard Lowe cmp %o3, 0x40 ! if src - dst >= 0x40 2119*5d9d9091SRichard Lowe bgeu,pt %ncc, .blkalgndst ! then use ASI_BLK_INIT_ST_QUAD_LDD_P 2120*5d9d9091SRichard Lowe.no_blkcpy: 2121*5d9d9091SRichard Lowe andcc %o1, 3, %o5 ! is src word aligned 2122*5d9d9091SRichard Lowe bz,pn %ncc, .aldst 2123*5d9d9091SRichard Lowe cmp %o5, 2 ! 
is src half-word aligned 2124*5d9d9091SRichard Lowe be,pt %ncc, .s2algn 2125*5d9d9091SRichard Lowe cmp %o5, 3 ! src is byte aligned 2126*5d9d9091SRichard Lowe.s1algn:ldub [%o1], %o3 ! move 1 or 3 bytes to align it 2127*5d9d9091SRichard Lowe inc 1, %o1 2128*5d9d9091SRichard Lowe stb %o3, [%g5] ! move a byte to align src 2129*5d9d9091SRichard Lowe inc 1, %g5 2130*5d9d9091SRichard Lowe bne,pt %ncc, .s2algn 2131*5d9d9091SRichard Lowe dec %o2 2132*5d9d9091SRichard Lowe b .ald ! now go align dest 2133*5d9d9091SRichard Lowe andcc %g5, 3, %o5 2134*5d9d9091SRichard Lowe 2135*5d9d9091SRichard Lowe.s2algn:lduh [%o1], %o3 ! know src is 2 byte alinged 2136*5d9d9091SRichard Lowe inc 2, %o1 2137*5d9d9091SRichard Lowe srl %o3, 8, %o4 2138*5d9d9091SRichard Lowe stb %o4, [%g5] ! have to do bytes, 2139*5d9d9091SRichard Lowe stb %o3, [%g5 + 1] ! don't know dst alingment 2140*5d9d9091SRichard Lowe inc 2, %g5 2141*5d9d9091SRichard Lowe dec 2, %o2 2142*5d9d9091SRichard Lowe 2143*5d9d9091SRichard Lowe.aldst: andcc %g5, 3, %o5 ! align the destination address 2144*5d9d9091SRichard Lowe.ald: bz,pn %ncc, .w4cp 2145*5d9d9091SRichard Lowe cmp %o5, 2 2146*5d9d9091SRichard Lowe bz,pn %ncc, .w2cp 2147*5d9d9091SRichard Lowe cmp %o5, 3 2148*5d9d9091SRichard Lowe.w3cp: lduw [%o1], %o4 2149*5d9d9091SRichard Lowe inc 4, %o1 2150*5d9d9091SRichard Lowe srl %o4, 24, %o5 2151*5d9d9091SRichard Lowe stb %o5, [%g5] 2152*5d9d9091SRichard Lowe bne,pt %ncc, .w1cp 2153*5d9d9091SRichard Lowe inc %g5 2154*5d9d9091SRichard Lowe dec 1, %o2 2155*5d9d9091SRichard Lowe andn %o2, 3, %o3 ! o3 is aligned word count 2156*5d9d9091SRichard Lowe dec 4, %o3 ! avoid reading beyond tail of src 2157*5d9d9091SRichard Lowe sub %o1, %g5, %o1 ! o1 gets the difference 2158*5d9d9091SRichard Lowe 2159*5d9d9091SRichard Lowe1: sll %o4, 8, %g1 ! save residual bytes 2160*5d9d9091SRichard Lowe lduw [%o1+%g5], %o4 2161*5d9d9091SRichard Lowe deccc 4, %o3 2162*5d9d9091SRichard Lowe srl %o4, 24, %o5 ! 
merge with residual 2163*5d9d9091SRichard Lowe or %o5, %g1, %g1 2164*5d9d9091SRichard Lowe st %g1, [%g5] 2165*5d9d9091SRichard Lowe bnz,pt %ncc, 1b 2166*5d9d9091SRichard Lowe inc 4, %g5 2167*5d9d9091SRichard Lowe sub %o1, 3, %o1 ! used one byte of last word read 2168*5d9d9091SRichard Lowe and %o2, 3, %o2 2169*5d9d9091SRichard Lowe b 7f 2170*5d9d9091SRichard Lowe inc 4, %o2 2171*5d9d9091SRichard Lowe 2172*5d9d9091SRichard Lowe.w1cp: srl %o4, 8, %o5 2173*5d9d9091SRichard Lowe sth %o5, [%g5] 2174*5d9d9091SRichard Lowe inc 2, %g5 2175*5d9d9091SRichard Lowe dec 3, %o2 2176*5d9d9091SRichard Lowe andn %o2, 3, %o3 ! o3 is aligned word count 2177*5d9d9091SRichard Lowe dec 4, %o3 ! avoid reading beyond tail of src 2178*5d9d9091SRichard Lowe sub %o1, %g5, %o1 ! o1 gets the difference 2179*5d9d9091SRichard Lowe 2180*5d9d9091SRichard Lowe2: sll %o4, 24, %g1 ! save residual bytes 2181*5d9d9091SRichard Lowe lduw [%o1+%g5], %o4 2182*5d9d9091SRichard Lowe deccc 4, %o3 2183*5d9d9091SRichard Lowe srl %o4, 8, %o5 ! merge with residual 2184*5d9d9091SRichard Lowe or %o5, %g1, %g1 2185*5d9d9091SRichard Lowe st %g1, [%g5] 2186*5d9d9091SRichard Lowe bnz,pt %ncc, 2b 2187*5d9d9091SRichard Lowe inc 4, %g5 2188*5d9d9091SRichard Lowe sub %o1, 1, %o1 ! used three bytes of last word read 2189*5d9d9091SRichard Lowe and %o2, 3, %o2 2190*5d9d9091SRichard Lowe b 7f 2191*5d9d9091SRichard Lowe inc 4, %o2 2192*5d9d9091SRichard Lowe 2193*5d9d9091SRichard Lowe.w2cp: lduw [%o1], %o4 2194*5d9d9091SRichard Lowe inc 4, %o1 2195*5d9d9091SRichard Lowe srl %o4, 16, %o5 2196*5d9d9091SRichard Lowe sth %o5, [%g5] 2197*5d9d9091SRichard Lowe inc 2, %g5 2198*5d9d9091SRichard Lowe dec 2, %o2 2199*5d9d9091SRichard Lowe andn %o2, 3, %o3 ! o3 is aligned word count 2200*5d9d9091SRichard Lowe dec 4, %o3 ! avoid reading beyond tail of src 2201*5d9d9091SRichard Lowe sub %o1, %g5, %o1 ! o1 gets the difference 2202*5d9d9091SRichard Lowe 2203*5d9d9091SRichard Lowe3: sll %o4, 16, %g1 ! 
save residual bytes 2204*5d9d9091SRichard Lowe lduw [%o1+%g5], %o4 2205*5d9d9091SRichard Lowe deccc 4, %o3 2206*5d9d9091SRichard Lowe srl %o4, 16, %o5 ! merge with residual 2207*5d9d9091SRichard Lowe or %o5, %g1, %g1 2208*5d9d9091SRichard Lowe st %g1, [%g5] 2209*5d9d9091SRichard Lowe bnz,pt %ncc, 3b 2210*5d9d9091SRichard Lowe inc 4, %g5 2211*5d9d9091SRichard Lowe sub %o1, 2, %o1 ! used two bytes of last word read 2212*5d9d9091SRichard Lowe and %o2, 3, %o2 2213*5d9d9091SRichard Lowe b 7f 2214*5d9d9091SRichard Lowe inc 4, %o2 2215*5d9d9091SRichard Lowe 2216*5d9d9091SRichard Lowe.w4cp: andn %o2, 3, %o3 ! o3 is aligned word count 2217*5d9d9091SRichard Lowe sub %o1, %g5, %o1 ! o1 gets the difference 2218*5d9d9091SRichard Lowe 2219*5d9d9091SRichard Lowe1: lduw [%o1+%g5], %o4 ! read from address 2220*5d9d9091SRichard Lowe deccc 4, %o3 ! decrement count 2221*5d9d9091SRichard Lowe st %o4, [%g5] ! write at destination address 2222*5d9d9091SRichard Lowe bgu,pt %ncc, 1b 2223*5d9d9091SRichard Lowe inc 4, %g5 ! increment to address 2224*5d9d9091SRichard Lowe b 7f 2225*5d9d9091SRichard Lowe and %o2, 3, %o2 ! number of leftover bytes, if any 2226*5d9d9091SRichard Lowe 2227*5d9d9091SRichard Lowe ! 2228*5d9d9091SRichard Lowe ! differenced byte copy, works with any alignment 2229*5d9d9091SRichard Lowe ! 2230*5d9d9091SRichard Lowe.dbytecp: 2231*5d9d9091SRichard Lowe b 7f 2232*5d9d9091SRichard Lowe sub %o1, %g5, %o1 ! o1 gets the difference 2233*5d9d9091SRichard Lowe 2234*5d9d9091SRichard Lowe4: stb %o4, [%g5] ! write to address 2235*5d9d9091SRichard Lowe inc %g5 ! inc to address 2236*5d9d9091SRichard Lowe7: deccc %o2 ! decrement count 2237*5d9d9091SRichard Lowe bgeu,a,pt %ncc,4b ! loop till done 2238*5d9d9091SRichard Lowe ldub [%o1+%g5], %o4 ! read from address 2239*5d9d9091SRichard Lowe retl ! 
%o0 was preserved 2240*5d9d9091SRichard Lowe nop 2241*5d9d9091SRichard Lowe 2242*5d9d9091SRichard Lowe.blkalgndst: 2243*5d9d9091SRichard Lowe save %sp, -SA(MINFRAME), %sp 2244*5d9d9091SRichard Lowe 2245*5d9d9091SRichard Lowe ! Block (64 bytes) align the destination. 2246*5d9d9091SRichard Lowe andcc %i0, 0x3f, %i3 ! is dst block aligned 2247*5d9d9091SRichard Lowe bz %ncc, .chksrc ! dst already block aligned 2248*5d9d9091SRichard Lowe sub %i3, 0x40, %i3 2249*5d9d9091SRichard Lowe neg %i3 ! bytes till dst 64 bytes aligned 2250*5d9d9091SRichard Lowe sub %i2, %i3, %i2 ! update i2 with new count 2251*5d9d9091SRichard Lowe 2252*5d9d9091SRichard Lowe ! Based on source and destination alignment do 2253*5d9d9091SRichard Lowe ! either 8 bytes, 4 bytes, 2 bytes or byte copy. 2254*5d9d9091SRichard Lowe 2255*5d9d9091SRichard Lowe ! Is dst & src 8B aligned 2256*5d9d9091SRichard Lowe or %i0, %i1, %o2 2257*5d9d9091SRichard Lowe andcc %o2, 0x7, %g0 2258*5d9d9091SRichard Lowe bz %ncc, .alewdcp 2259*5d9d9091SRichard Lowe nop 2260*5d9d9091SRichard Lowe 2261*5d9d9091SRichard Lowe ! Is dst & src 4B aligned 2262*5d9d9091SRichard Lowe andcc %o2, 0x3, %g0 2263*5d9d9091SRichard Lowe bz %ncc, .alwdcp 2264*5d9d9091SRichard Lowe nop 2265*5d9d9091SRichard Lowe 2266*5d9d9091SRichard Lowe ! Is dst & src 2B aligned 2267*5d9d9091SRichard Lowe andcc %o2, 0x1, %g0 2268*5d9d9091SRichard Lowe bz %ncc, .alhlfwdcp 2269*5d9d9091SRichard Lowe nop 2270*5d9d9091SRichard Lowe 2271*5d9d9091SRichard Lowe ! 1B aligned 2272*5d9d9091SRichard Lowe1: ldub [%i1], %o2 2273*5d9d9091SRichard Lowe stb %o2, [%i0] 2274*5d9d9091SRichard Lowe inc %i1 2275*5d9d9091SRichard Lowe deccc %i3 2276*5d9d9091SRichard Lowe bgu,pt %ncc, 1b 2277*5d9d9091SRichard Lowe inc %i0 2278*5d9d9091SRichard Lowe 2279*5d9d9091SRichard Lowe ba .chksrc 2280*5d9d9091SRichard Lowe nop 2281*5d9d9091SRichard Lowe 2282*5d9d9091SRichard Lowe ! 
dst & src 4B aligned 2283*5d9d9091SRichard Lowe.alwdcp: 2284*5d9d9091SRichard Lowe ld [%i1], %o2 2285*5d9d9091SRichard Lowe st %o2, [%i0] 2286*5d9d9091SRichard Lowe add %i1, 0x4, %i1 2287*5d9d9091SRichard Lowe subcc %i3, 0x4, %i3 2288*5d9d9091SRichard Lowe bgu,pt %ncc, .alwdcp 2289*5d9d9091SRichard Lowe add %i0, 0x4, %i0 2290*5d9d9091SRichard Lowe 2291*5d9d9091SRichard Lowe ba .chksrc 2292*5d9d9091SRichard Lowe nop 2293*5d9d9091SRichard Lowe 2294*5d9d9091SRichard Lowe ! dst & src 2B aligned 2295*5d9d9091SRichard Lowe.alhlfwdcp: 2296*5d9d9091SRichard Lowe lduh [%i1], %o2 2297*5d9d9091SRichard Lowe stuh %o2, [%i0] 2298*5d9d9091SRichard Lowe add %i1, 0x2, %i1 2299*5d9d9091SRichard Lowe subcc %i3, 0x2, %i3 2300*5d9d9091SRichard Lowe bgu,pt %ncc, .alhlfwdcp 2301*5d9d9091SRichard Lowe add %i0, 0x2, %i0 2302*5d9d9091SRichard Lowe 2303*5d9d9091SRichard Lowe ba .chksrc 2304*5d9d9091SRichard Lowe nop 2305*5d9d9091SRichard Lowe 2306*5d9d9091SRichard Lowe ! dst & src 8B aligned 2307*5d9d9091SRichard Lowe.alewdcp: 2308*5d9d9091SRichard Lowe ldx [%i1], %o2 2309*5d9d9091SRichard Lowe stx %o2, [%i0] 2310*5d9d9091SRichard Lowe add %i1, 0x8, %i1 2311*5d9d9091SRichard Lowe subcc %i3, 0x8, %i3 2312*5d9d9091SRichard Lowe bgu,pt %ncc, .alewdcp 2313*5d9d9091SRichard Lowe add %i0, 0x8, %i0 2314*5d9d9091SRichard Lowe 2315*5d9d9091SRichard Lowe ! Now Destination is block (64 bytes) aligned 2316*5d9d9091SRichard Lowe.chksrc: 2317*5d9d9091SRichard Lowe andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size 2318*5d9d9091SRichard Lowe sub %i2, %i3, %i2 ! Residue bytes in %i2 2319*5d9d9091SRichard Lowe mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi 2320*5d9d9091SRichard Lowe andcc %i1, 0xf, %l1 ! is src quadword aligned 2321*5d9d9091SRichard Lowe bz,pn %ncc, .blkcpy ! 
src offset in %l1 2322*5d9d9091SRichard Lowe nop 2323*5d9d9091SRichard Lowe cmp %l1, 0x8 2324*5d9d9091SRichard Lowe bgu %ncc, .cpy_upper_double 2325*5d9d9091SRichard Lowe nop 2326*5d9d9091SRichard Lowe blu %ncc, .cpy_lower_double 2327*5d9d9091SRichard Lowe nop 2328*5d9d9091SRichard Lowe 2329*5d9d9091SRichard Lowe ! Falls through when source offset is equal to 8 i.e. 2330*5d9d9091SRichard Lowe ! source is double word aligned. 2331*5d9d9091SRichard Lowe ! In this case no shift/merge of data is required 2332*5d9d9091SRichard Lowe sub %i1, %l1, %i1 ! align the src at 16 bytes. 2333*5d9d9091SRichard Lowe andn %i1, 0x3f, %o0 ! %o0 has block aligned source 2334*5d9d9091SRichard Lowe prefetch [%o0+0x0], #one_read 2335*5d9d9091SRichard Lowe ldda [%i1+0x0]%asi, %o2 2336*5d9d9091SRichard Loweloop0: 2337*5d9d9091SRichard Lowe ldda [%i1+0x10]%asi, %o4 2338*5d9d9091SRichard Lowe prefetch [%o0+0x40], #one_read 2339*5d9d9091SRichard Lowe 2340*5d9d9091SRichard Lowe stxa %o3, [%i0+0x0]%asi 2341*5d9d9091SRichard Lowe stxa %o4, [%i0+0x8]%asi 2342*5d9d9091SRichard Lowe 2343*5d9d9091SRichard Lowe ldda [%i1+0x20]%asi, %o2 2344*5d9d9091SRichard Lowe stxa %o5, [%i0+0x10]%asi 2345*5d9d9091SRichard Lowe stxa %o2, [%i0+0x18]%asi 2346*5d9d9091SRichard Lowe 2347*5d9d9091SRichard Lowe ldda [%i1+0x30]%asi, %o4 2348*5d9d9091SRichard Lowe stxa %o3, [%i0+0x20]%asi 2349*5d9d9091SRichard Lowe stxa %o4, [%i0+0x28]%asi 2350*5d9d9091SRichard Lowe 2351*5d9d9091SRichard Lowe ldda [%i1+0x40]%asi, %o2 2352*5d9d9091SRichard Lowe stxa %o5, [%i0+0x30]%asi 2353*5d9d9091SRichard Lowe stxa %o2, [%i0+0x38]%asi 2354*5d9d9091SRichard Lowe 2355*5d9d9091SRichard Lowe add %o0, 0x40, %o0 2356*5d9d9091SRichard Lowe add %i1, 0x40, %i1 2357*5d9d9091SRichard Lowe subcc %i3, 0x40, %i3 2358*5d9d9091SRichard Lowe bgu,pt %ncc, loop0 2359*5d9d9091SRichard Lowe add %i0, 0x40, %i0 2360*5d9d9091SRichard Lowe ba .blkdone 2361*5d9d9091SRichard Lowe add %i1, %l1, %i1 ! 
increment the source by src offset 2362*5d9d9091SRichard Lowe 2363*5d9d9091SRichard Lowe.cpy_lower_double: 2364*5d9d9091SRichard Lowe sub %i1, %l1, %i1 ! align the src at 16 bytes. 2365*5d9d9091SRichard Lowe sll %l1, 3, %l2 ! %l2 left shift 2366*5d9d9091SRichard Lowe mov 0x40, %l3 2367*5d9d9091SRichard Lowe sub %l3, %l2, %l3 ! %l3 right shift = (64 - left shift) 2368*5d9d9091SRichard Lowe andn %i1, 0x3f, %o0 ! %o0 has block aligned source 2369*5d9d9091SRichard Lowe prefetch [%o0+0x0], #one_read 2370*5d9d9091SRichard Lowe ldda [%i1+0x0]%asi, %o2 ! partial data in %o2 and %o3 has 2371*5d9d9091SRichard Lowe ! complete data 2372*5d9d9091SRichard Loweloop1: 2373*5d9d9091SRichard Lowe ldda [%i1+0x10]%asi, %o4 ! %o4 has partial data for this read. 2374*5d9d9091SRichard Lowe ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1) ! merge %o2, %o3 and %o4 2375*5d9d9091SRichard Lowe ! into %o2 and %o3 2376*5d9d9091SRichard Lowe prefetch [%o0+0x40], #one_read 2377*5d9d9091SRichard Lowe stxa %o2, [%i0+0x0]%asi 2378*5d9d9091SRichard Lowe stxa %o3, [%i0+0x8]%asi 2379*5d9d9091SRichard Lowe 2380*5d9d9091SRichard Lowe ldda [%i1+0x20]%asi, %o2 2381*5d9d9091SRichard Lowe ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1) ! merge %o2 with %o5 and 2382*5d9d9091SRichard Lowe stxa %o4, [%i0+0x10]%asi ! %o4 from previous read 2383*5d9d9091SRichard Lowe stxa %o5, [%i0+0x18]%asi ! into %o4 and %o5 2384*5d9d9091SRichard Lowe 2385*5d9d9091SRichard Lowe ! Repeat the same for next 32 bytes. 
2386*5d9d9091SRichard Lowe 2387*5d9d9091SRichard Lowe ldda [%i1+0x30]%asi, %o4 2388*5d9d9091SRichard Lowe ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1) 2389*5d9d9091SRichard Lowe stxa %o2, [%i0+0x20]%asi 2390*5d9d9091SRichard Lowe stxa %o3, [%i0+0x28]%asi 2391*5d9d9091SRichard Lowe 2392*5d9d9091SRichard Lowe ldda [%i1+0x40]%asi, %o2 2393*5d9d9091SRichard Lowe ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1) 2394*5d9d9091SRichard Lowe stxa %o4, [%i0+0x30]%asi 2395*5d9d9091SRichard Lowe stxa %o5, [%i0+0x38]%asi 2396*5d9d9091SRichard Lowe 2397*5d9d9091SRichard Lowe add %o0, 0x40, %o0 2398*5d9d9091SRichard Lowe add %i1, 0x40, %i1 2399*5d9d9091SRichard Lowe subcc %i3, 0x40, %i3 2400*5d9d9091SRichard Lowe bgu,pt %ncc, loop1 2401*5d9d9091SRichard Lowe add %i0, 0x40, %i0 2402*5d9d9091SRichard Lowe ba .blkdone 2403*5d9d9091SRichard Lowe add %i1, %l1, %i1 ! increment the source by src offset 2404*5d9d9091SRichard Lowe 2405*5d9d9091SRichard Lowe.cpy_upper_double: 2406*5d9d9091SRichard Lowe sub %i1, %l1, %i1 ! align the src at 16 bytes. 2407*5d9d9091SRichard Lowe mov 0x8, %l2 2408*5d9d9091SRichard Lowe sub %l1, %l2, %l2 2409*5d9d9091SRichard Lowe sll %l2, 3, %l2 ! %l2 left shift 2410*5d9d9091SRichard Lowe mov 0x40, %l3 2411*5d9d9091SRichard Lowe sub %l3, %l2, %l3 ! %l3 right shift = (64 - left shift) 2412*5d9d9091SRichard Lowe andn %i1, 0x3f, %o0 ! %o0 has block aligned source 2413*5d9d9091SRichard Lowe prefetch [%o0+0x0], #one_read 2414*5d9d9091SRichard Lowe ldda [%i1+0x0]%asi, %o2 ! partial data in %o3 for this read and 2415*5d9d9091SRichard Lowe ! no data in %o2 2416*5d9d9091SRichard Loweloop2: 2417*5d9d9091SRichard Lowe ldda [%i1+0x10]%asi, %o4 ! %o4 has complete data and %o5 has 2418*5d9d9091SRichard Lowe ! partial 2419*5d9d9091SRichard Lowe ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1) ! merge %o3, %o4 and %o5 2420*5d9d9091SRichard Lowe ! 
into %o3 and %o4 2421*5d9d9091SRichard Lowe prefetch [%o0+0x40], #one_read 2422*5d9d9091SRichard Lowe stxa %o3, [%i0+0x0]%asi 2423*5d9d9091SRichard Lowe stxa %o4, [%i0+0x8]%asi 2424*5d9d9091SRichard Lowe 2425*5d9d9091SRichard Lowe ldda [%i1+0x20]%asi, %o2 2426*5d9d9091SRichard Lowe ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1) ! merge %o2 and %o3 with 2427*5d9d9091SRichard Lowe stxa %o5, [%i0+0x10]%asi ! %o5 from previous read 2428*5d9d9091SRichard Lowe stxa %o2, [%i0+0x18]%asi ! into %o5 and %o2 2429*5d9d9091SRichard Lowe 2430*5d9d9091SRichard Lowe ! Repeat the same for next 32 bytes. 2431*5d9d9091SRichard Lowe 2432*5d9d9091SRichard Lowe ldda [%i1+0x30]%asi, %o4 2433*5d9d9091SRichard Lowe ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1) 2434*5d9d9091SRichard Lowe stxa %o3, [%i0+0x20]%asi 2435*5d9d9091SRichard Lowe stxa %o4, [%i0+0x28]%asi 2436*5d9d9091SRichard Lowe 2437*5d9d9091SRichard Lowe ldda [%i1+0x40]%asi, %o2 2438*5d9d9091SRichard Lowe ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1) 2439*5d9d9091SRichard Lowe stxa %o5, [%i0+0x30]%asi 2440*5d9d9091SRichard Lowe stxa %o2, [%i0+0x38]%asi 2441*5d9d9091SRichard Lowe 2442*5d9d9091SRichard Lowe add %o0, 0x40, %o0 2443*5d9d9091SRichard Lowe add %i1, 0x40, %i1 2444*5d9d9091SRichard Lowe subcc %i3, 0x40, %i3 2445*5d9d9091SRichard Lowe bgu,pt %ncc, loop2 2446*5d9d9091SRichard Lowe add %i0, 0x40, %i0 2447*5d9d9091SRichard Lowe ba .blkdone 2448*5d9d9091SRichard Lowe add %i1, %l1, %i1 ! increment the source by src offset 2449*5d9d9091SRichard Lowe 2450*5d9d9091SRichard Lowe ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P 2451*5d9d9091SRichard Lowe.blkcpy: 2452*5d9d9091SRichard Lowe andn %i1, 0x3f, %o0 ! 
%o0 has block aligned source 2453*5d9d9091SRichard Lowe prefetch [%o0+0x0], #one_read 2454*5d9d9091SRichard Lowe1: 2455*5d9d9091SRichard Lowe prefetch [%o0+0x40], #one_read 2456*5d9d9091SRichard Lowe 2457*5d9d9091SRichard Lowe ldda [%i1+0x0]%asi, %o2 2458*5d9d9091SRichard Lowe ldda [%i1+0x10]%asi, %o4 2459*5d9d9091SRichard Lowe 2460*5d9d9091SRichard Lowe stxa %o2, [%i0+0x0]%asi 2461*5d9d9091SRichard Lowe stxa %o3, [%i0+0x8]%asi 2462*5d9d9091SRichard Lowe stxa %o4, [%i0+0x10]%asi 2463*5d9d9091SRichard Lowe stxa %o5, [%i0+0x18]%asi 2464*5d9d9091SRichard Lowe 2465*5d9d9091SRichard Lowe ldda [%i1+0x20]%asi, %o2 2466*5d9d9091SRichard Lowe ldda [%i1+0x30]%asi, %o4 2467*5d9d9091SRichard Lowe 2468*5d9d9091SRichard Lowe stxa %o2, [%i0+0x20]%asi 2469*5d9d9091SRichard Lowe stxa %o3, [%i0+0x28]%asi 2470*5d9d9091SRichard Lowe stxa %o4, [%i0+0x30]%asi 2471*5d9d9091SRichard Lowe stxa %o5, [%i0+0x38]%asi 2472*5d9d9091SRichard Lowe 2473*5d9d9091SRichard Lowe add %o0, 0x40, %o0 2474*5d9d9091SRichard Lowe add %i1, 0x40, %i1 2475*5d9d9091SRichard Lowe subcc %i3, 0x40, %i3 2476*5d9d9091SRichard Lowe bgu,pt %ncc, 1b 2477*5d9d9091SRichard Lowe add %i0, 0x40, %i0 2478*5d9d9091SRichard Lowe 2479*5d9d9091SRichard Lowe.blkdone: 2480*5d9d9091SRichard Lowe membar #Sync 2481*5d9d9091SRichard Lowe 2482*5d9d9091SRichard Lowe mov ASI_PNF, %asi ! restore %asi to default 2483*5d9d9091SRichard Lowe ! ASI_PRIMARY_NOFAULT value 2484*5d9d9091SRichard Lowe tst %i2 2485*5d9d9091SRichard Lowe bz,pt %ncc, .blkexit 2486*5d9d9091SRichard Lowe nop 2487*5d9d9091SRichard Lowe 2488*5d9d9091SRichard Lowe ! Handle trailing bytes 2489*5d9d9091SRichard Lowe cmp %i2, 0x8 2490*5d9d9091SRichard Lowe blu,pt %ncc, .residue 2491*5d9d9091SRichard Lowe nop 2492*5d9d9091SRichard Lowe 2493*5d9d9091SRichard Lowe ! 
Can we do some 8B ops 2494*5d9d9091SRichard Lowe or %i1, %i0, %o2 2495*5d9d9091SRichard Lowe andcc %o2, 0x7, %g0 2496*5d9d9091SRichard Lowe bnz %ncc, .last4 2497*5d9d9091SRichard Lowe nop 2498*5d9d9091SRichard Lowe 2499*5d9d9091SRichard Lowe ! Do 8byte ops as long as possible 2500*5d9d9091SRichard Lowe.last8: 2501*5d9d9091SRichard Lowe ldx [%i1], %o2 2502*5d9d9091SRichard Lowe stx %o2, [%i0] 2503*5d9d9091SRichard Lowe add %i1, 0x8, %i1 2504*5d9d9091SRichard Lowe sub %i2, 0x8, %i2 2505*5d9d9091SRichard Lowe cmp %i2, 0x8 2506*5d9d9091SRichard Lowe bgu,pt %ncc, .last8 2507*5d9d9091SRichard Lowe add %i0, 0x8, %i0 2508*5d9d9091SRichard Lowe 2509*5d9d9091SRichard Lowe tst %i2 2510*5d9d9091SRichard Lowe bz,pt %ncc, .blkexit 2511*5d9d9091SRichard Lowe nop 2512*5d9d9091SRichard Lowe 2513*5d9d9091SRichard Lowe ba .residue 2514*5d9d9091SRichard Lowe nop 2515*5d9d9091SRichard Lowe 2516*5d9d9091SRichard Lowe.last4: 2517*5d9d9091SRichard Lowe ! Can we do 4B ops 2518*5d9d9091SRichard Lowe andcc %o2, 0x3, %g0 2519*5d9d9091SRichard Lowe bnz %ncc, .last2 2520*5d9d9091SRichard Lowe nop 2521*5d9d9091SRichard Lowe1: 2522*5d9d9091SRichard Lowe ld [%i1], %o2 2523*5d9d9091SRichard Lowe st %o2, [%i0] 2524*5d9d9091SRichard Lowe add %i1, 0x4, %i1 2525*5d9d9091SRichard Lowe sub %i2, 0x4, %i2 2526*5d9d9091SRichard Lowe cmp %i2, 0x4 2527*5d9d9091SRichard Lowe bgu,pt %ncc, 1b 2528*5d9d9091SRichard Lowe add %i0, 0x4, %i0 2529*5d9d9091SRichard Lowe 2530*5d9d9091SRichard Lowe cmp %i2, 0 2531*5d9d9091SRichard Lowe bz,pt %ncc, .blkexit 2532*5d9d9091SRichard Lowe nop 2533*5d9d9091SRichard Lowe 2534*5d9d9091SRichard Lowe ba .residue 2535*5d9d9091SRichard Lowe nop 2536*5d9d9091SRichard Lowe 2537*5d9d9091SRichard Lowe.last2: 2538*5d9d9091SRichard Lowe ! 
Can we do 2B ops 2539*5d9d9091SRichard Lowe andcc %o2, 0x1, %g0 2540*5d9d9091SRichard Lowe bnz %ncc, .residue 2541*5d9d9091SRichard Lowe nop 2542*5d9d9091SRichard Lowe 2543*5d9d9091SRichard Lowe1: 2544*5d9d9091SRichard Lowe lduh [%i1], %o2 2545*5d9d9091SRichard Lowe stuh %o2, [%i0] 2546*5d9d9091SRichard Lowe add %i1, 0x2, %i1 2547*5d9d9091SRichard Lowe sub %i2, 0x2, %i2 2548*5d9d9091SRichard Lowe cmp %i2, 0x2 2549*5d9d9091SRichard Lowe bgu,pt %ncc, 1b 2550*5d9d9091SRichard Lowe add %i0, 0x2, %i0 2551*5d9d9091SRichard Lowe 2552*5d9d9091SRichard Lowe cmp %i2, 0 2553*5d9d9091SRichard Lowe bz,pt %ncc, .blkexit 2554*5d9d9091SRichard Lowe nop 2555*5d9d9091SRichard Lowe 2556*5d9d9091SRichard Lowe.residue: 2557*5d9d9091SRichard Lowe ldub [%i1], %o2 2558*5d9d9091SRichard Lowe stb %o2, [%i0] 2559*5d9d9091SRichard Lowe inc %i1 2560*5d9d9091SRichard Lowe deccc %i2 2561*5d9d9091SRichard Lowe bgu,pt %ncc, .residue 2562*5d9d9091SRichard Lowe inc %i0 2563*5d9d9091SRichard Lowe 2564*5d9d9091SRichard Lowe.blkexit: 2565*5d9d9091SRichard Lowe 2566*5d9d9091SRichard Lowe ret 2567*5d9d9091SRichard Lowe restore %g5, %g0, %o0 2568*5d9d9091SRichard Lowe 2569*5d9d9091SRichard Lowe#endif /* NIAGARA2_IMPL */ 2570*5d9d9091SRichard Lowe SET_SIZE(memcpy) 2571*5d9d9091SRichard Lowe SET_SIZE(__align_cpy_1) 2572