1*1e49577aSRod Evans/* 2*1e49577aSRod Evans * CDDL HEADER START 3*1e49577aSRod Evans * 4*1e49577aSRod Evans * The contents of this file are subject to the terms of the 5*1e49577aSRod Evans * Common Development and Distribution License (the "License"). 6*1e49577aSRod Evans * You may not use this file except in compliance with the License. 7*1e49577aSRod Evans * 8*1e49577aSRod Evans * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9*1e49577aSRod Evans * or http://www.opensolaris.org/os/licensing. 10*1e49577aSRod Evans * See the License for the specific language governing permissions 11*1e49577aSRod Evans * and limitations under the License. 12*1e49577aSRod Evans * 13*1e49577aSRod Evans * When distributing Covered Code, include this CDDL HEADER in each 14*1e49577aSRod Evans * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15*1e49577aSRod Evans * If applicable, add the following below this CDDL HEADER, with the 16*1e49577aSRod Evans * fields enclosed by brackets "[]" replaced with your own identifying 17*1e49577aSRod Evans * information: Portions Copyright [yyyy] [name of copyright owner] 18*1e49577aSRod Evans * 19*1e49577aSRod Evans * CDDL HEADER END 20*1e49577aSRod Evans */ 21*1e49577aSRod Evans 22*1e49577aSRod Evans/* 23*1e49577aSRod Evans * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24*1e49577aSRod Evans */ 25*1e49577aSRod Evans 26*1e49577aSRod Evans .file "memcpy.s" 27*1e49577aSRod Evans 28*1e49577aSRod Evans/* 29*1e49577aSRod Evans * memcpy(s1, s2, len) 30*1e49577aSRod Evans * 31*1e49577aSRod Evans * Copy s2 to s1, always copy n bytes. 32*1e49577aSRod Evans * Note: this C code does not work for overlapped copies. 33*1e49577aSRod Evans * Memmove() and bcopy() do. 34*1e49577aSRod Evans * 35*1e49577aSRod Evans * Added entry __align_cpy_1 is generally for use of the compilers. 36*1e49577aSRod Evans * 37*1e49577aSRod Evans * Fast assembler language version of the following C-program for memcpy 38*1e49577aSRod Evans * which represents the `standard' for the C-library. 39*1e49577aSRod Evans * 40*1e49577aSRod Evans * void * 41*1e49577aSRod Evans * memcpy(void *s, const void *s0, size_t n) 42*1e49577aSRod Evans * { 43*1e49577aSRod Evans * if (n != 0) { 44*1e49577aSRod Evans * char *s1 = s; 45*1e49577aSRod Evans * const char *s2 = s0; 46*1e49577aSRod Evans * do { 47*1e49577aSRod Evans * *s1++ = *s2++; 48*1e49577aSRod Evans * } while (--n != 0); 49*1e49577aSRod Evans * } 50*1e49577aSRod Evans * return (s); 51*1e49577aSRod Evans * } 52*1e49577aSRod Evans * 53*1e49577aSRod Evans * 54*1e49577aSRod Evans * N1 Flow : 55*1e49577aSRod Evans * 56*1e49577aSRod Evans * if (count < 17) { 57*1e49577aSRod Evans * Do the byte copy 58*1e49577aSRod Evans * Return destination address 59*1e49577aSRod Evans * } 60*1e49577aSRod Evans * if (count < 128) { 61*1e49577aSRod Evans * Is source aligned on word boundary 62*1e49577aSRod Evans * If no then align source on word boundary then goto .ald 63*1e49577aSRod Evans * If yes goto .ald 64*1e49577aSRod Evans * .ald: 65*1e49577aSRod Evans * Is destination aligned on word boundary 66*1e49577aSRod Evans * Depending on destination offset (last 2 bits of destination) 67*1e49577aSRod Evans * copy data by shifting and merging. 
68*1e49577aSRod Evans * Copy residue bytes as byte copy 69*1e49577aSRod Evans * Return destination address 70*1e49577aSRod Evans * } else { 71*1e49577aSRod Evans * Align destination on block boundary 72*1e49577aSRod Evans * Depending on the source offset (last 4 bits of source address) align 73*1e49577aSRod Evans * the data and store to destination. Both the load and store are done 74*1e49577aSRod Evans * using ASI_BLK_INIT_ST_QUAD_LDD_P. 75*1e49577aSRod Evans * For remaining count copy as much data in 8-byte chunk from source to 76*1e49577aSRod Evans * destination. 77*1e49577aSRod Evans * Followed by trailing copy using byte copy. 78*1e49577aSRod Evans * Return saved destination address 79*1e49577aSRod Evans * } 80*1e49577aSRod Evans * 81*1e49577aSRod Evans * 82*1e49577aSRod Evans * N2 Flow : 83*1e49577aSRod Evans * Flow : 84*1e49577aSRod Evans * 85*1e49577aSRod Evans * if (count < 128) { 86*1e49577aSRod Evans * if count < 3 87*1e49577aSRod Evans * copy bytes; exit with dst addr 88*1e49577aSRod Evans * if src & dst aligned on word boundary but not long word boundary, 89*1e49577aSRod Evans * copy with ldw/stw; branch to finish_up 90*1e49577aSRod Evans * if src & dst aligned on long word boundary 91*1e49577aSRod Evans * copy with ldx/stx; branch to finish_up 92*1e49577aSRod Evans * if src & dst not aligned and length <= 14 93*1e49577aSRod Evans * copy bytes; exit with dst addr 94*1e49577aSRod Evans * move enough bytes to get src to word boundary 95*1e49577aSRod Evans * if dst now on word boundary 96*1e49577aSRod Evans * move_words: 97*1e49577aSRod Evans * copy words; branch to finish_up 98*1e49577aSRod Evans * if dst now on half word boundary 99*1e49577aSRod Evans * load words, shift half words, store words; branch to finish_up 100*1e49577aSRod Evans * if dst on byte 1 101*1e49577aSRod Evans * load words, shift 3 bytes, store words; branch to finish_up 102*1e49577aSRod Evans * if dst on byte 3 103*1e49577aSRod Evans * load words, shift 1 byte, store words; branch to finish_up 104*1e49577aSRod Evans * finish_up: 105*1e49577aSRod Evans * copy bytes; exit with dst addr 106*1e49577aSRod Evans * } else { More than 128 bytes 107*1e49577aSRod Evans * move bytes until dst is on long word boundary 108*1e49577aSRod Evans * if( src is on long word boundary ) { 109*1e49577aSRod Evans * if (count < 512) { 110*1e49577aSRod Evans * finish_long: src/dst aligned on 8 bytes 111*1e49577aSRod Evans * copy with ldx/stx in 8-way unrolled loop; 112*1e49577aSRod Evans * copy final 0-63 bytes; exit with dst addr 113*1e49577aSRod Evans * } else { src/dst aligned; count > 512 114*1e49577aSRod Evans * align dst on 64 byte boundary; use 8-way test for each of 8 possible 115*1e49577aSRod Evans * src alignments relative to a 64 byte boundary to select the 116*1e49577aSRod Evans * 16-way unrolled loop to use for 117*1e49577aSRod Evans * block load, fmovd, block-init-store, block-store, fmovd operations 118*1e49577aSRod Evans * then go to finish_long. 119*1e49577aSRod Evans * } 120*1e49577aSRod Evans * } else { src/dst not aligned on 8 bytes 121*1e49577aSRod Evans * if src is word aligned and count < 512 122*1e49577aSRod Evans * move words in 8-way unrolled loop 123*1e49577aSRod Evans * move final 0-31 bytes; exit with dst addr 124*1e49577aSRod Evans * if count < 512 125*1e49577aSRod Evans * use alignaddr/faligndata combined with ldd/std in 8-way 126*1e49577aSRod Evans * unrolled loop to move data. 
127*1e49577aSRod Evans * go to unalign_done 128*1e49577aSRod Evans * else 129*1e49577aSRod Evans * setup alignaddr for faligndata instructions 130*1e49577aSRod Evans * align dst on 64 byte boundary; use 8-way test for each of 8 possible 131*1e49577aSRod Evans * src alignments to nearest long word relative to 64 byte boundary to 132*1e49577aSRod Evans * select the 8-way unrolled loop to use for 133*1e49577aSRod Evans * block load, falign, fmovd, block-init-store, block-store loop 134*1e49577aSRod Evans * (only use block-init-store when src/dst on 8 byte boundaries.) 135*1e49577aSRod Evans * unalign_done: 136*1e49577aSRod Evans * move remaining bytes for unaligned cases. exit with dst addr. 137*1e49577aSRod Evans * } 138*1e49577aSRod Evans * 139*1e49577aSRod Evans * Comment on N2 memmove and memcpy common code and block-store-init: 140*1e49577aSRod Evans * In the man page for memmove, it specifies that copying will take place 141*1e49577aSRod Evans * correctly between objects that overlap. For memcpy, behavior is 142*1e49577aSRod Evans * undefined for objects that overlap. 143*1e49577aSRod Evans * 144*1e49577aSRod Evans * In rare cases, some multi-threaded applications may attempt to examine 145*1e49577aSRod Evans * the copy destination buffer during the copy. Using the block-store-init 146*1e49577aSRod Evans * instruction allows those applications to observe zeros in some 147*1e49577aSRod Evans * cache lines of the destination buffer for narrow windows. But the 148*1e49577aSRod Evans * the block-store-init provides memory throughput advantages for many 149*1e49577aSRod Evans * common applications. To meet both needs, those applications which need 150*1e49577aSRod Evans * the destination buffer to retain meaning during the copy should use 151*1e49577aSRod Evans * memmove instead of memcpy. The memmove version duplicates the memcpy 152*1e49577aSRod Evans * algorithms except the memmove version does not use block-store-init 153*1e49577aSRod Evans * in those cases where memcpy does use block-store-init. Otherwise, when 154*1e49577aSRod Evans * memmove can determine the source and destination do not overlap, 155*1e49577aSRod Evans * memmove shares the memcpy code. 
156*1e49577aSRod Evans */ 157*1e49577aSRod Evans 158*1e49577aSRod Evans#include <sys/asm_linkage.h> 159*1e49577aSRod Evans#include <sys/niagaraasi.h> 160*1e49577aSRod Evans#include <sys/asi.h> 161*1e49577aSRod Evans#include <sys/trap.h> 162*1e49577aSRod Evans 163*1e49577aSRod Evans/* documented name for primary block initializing store */ 164*1e49577aSRod Evans#define ASI_STBI_P ASI_BLK_INIT_ST_QUAD_LDD_P 165*1e49577aSRod Evans 166*1e49577aSRod Evans#define BLOCK_SIZE 64 167*1e49577aSRod Evans#define FPRS_FEF 0x4 168*1e49577aSRod Evans 169*1e49577aSRod Evans#define SHORTCOPY 3 170*1e49577aSRod Evans#define SHORTCHECK 14 171*1e49577aSRod Evans#define SHORT_LONG 64 /* max copy for short longword-aligned case */ 172*1e49577aSRod Evans /* must be at least 32 */ 173*1e49577aSRod Evans#define SMALL_MAX 128 174*1e49577aSRod Evans#define MED_UMAX 512 /* max copy for medium un-aligned case */ 175*1e49577aSRod Evans#define MED_WMAX 512 /* max copy for medium word-aligned case */ 176*1e49577aSRod Evans#define MED_MAX 512 /* max copy for medium longword-aligned case */ 177*1e49577aSRod Evans 178*1e49577aSRod Evans#ifdef NIAGARA2_IMPL 179*1e49577aSRod Evans#include <sys/sun4asi.h> 180*1e49577aSRod Evans 181*1e49577aSRod Evans#else /* NIAGARA2_IMPL */ 182*1e49577aSRod Evans/* 183*1e49577aSRod Evans * This define is to align data for the unaligned source cases. 184*1e49577aSRod Evans * The data1, data2 and data3 is merged into data1 and data2. 185*1e49577aSRod Evans * The data3 is preserved for next merge. 186*1e49577aSRod Evans */ 187*1e49577aSRod Evans#define ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp) \ 188*1e49577aSRod Evans sllx data1, lshift, data1 ;\ 189*1e49577aSRod Evans srlx data2, rshift, tmp ;\ 190*1e49577aSRod Evans or data1, tmp, data1 ;\ 191*1e49577aSRod Evans sllx data2, lshift, data2 ;\ 192*1e49577aSRod Evans srlx data3, rshift, tmp ;\ 193*1e49577aSRod Evans or data2, tmp, data2 194*1e49577aSRod Evans/* 195*1e49577aSRod Evans * Align the data. Merge the data1 and data2 into data1. 196*1e49577aSRod Evans */ 197*1e49577aSRod Evans#define ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp) \ 198*1e49577aSRod Evans sllx data1, lshift, data1 ;\ 199*1e49577aSRod Evans srlx data2, rshift, tmp ;\ 200*1e49577aSRod Evans or data1, tmp, data1 201*1e49577aSRod Evans#endif /* NIAGARA2_IMPL */ 202*1e49577aSRod Evans 203*1e49577aSRod Evans 204*1e49577aSRod Evans ANSI_PRAGMA_WEAK(memmove,function) 205*1e49577aSRod Evans ANSI_PRAGMA_WEAK(memcpy,function) 206*1e49577aSRod Evans 207*1e49577aSRod Evans ENTRY(memmove) 208*1e49577aSRod Evans cmp %o1, %o0 ! if from address is >= to use forward copy 209*1e49577aSRod Evans bgeu,pn %ncc, .forcpy ! else use backward if ... 210*1e49577aSRod Evans sub %o0, %o1, %o4 ! get difference of two addresses 211*1e49577aSRod Evans cmp %o2, %o4 ! compare size and difference of addresses 212*1e49577aSRod Evans bleu,pn %ncc, .forcpy ! if size is bigger, do overlapped copy 213*1e49577aSRod Evans add %o1, %o2, %o5 ! get to end of source space 214*1e49577aSRod Evans 215*1e49577aSRod Evans ! 216*1e49577aSRod Evans ! an overlapped copy that must be done "backwards" 217*1e49577aSRod Evans ! 218*1e49577aSRod Evans.chksize: 219*1e49577aSRod Evans cmp %o2, 8 ! less than 8 byte do byte copy 220*1e49577aSRod Evans blu,pt %ncc, 2f ! else continue 221*1e49577aSRod Evans 222*1e49577aSRod Evans ! Now size is bigger than 8 223*1e49577aSRod Evans.dbalign: 224*1e49577aSRod Evans add %o0, %o2, %g1 ! get to end of dest space 225*1e49577aSRod Evans andcc %g1, 7, %o3 ! 
%o3 has bytes till dst 8 bytes aligned 226*1e49577aSRod Evans bz,a,pn %ncc, .dbbck ! if dst is not 8 byte aligned: align it 227*1e49577aSRod Evans andn %o2, 7, %o3 ! %o3 count is multiple of 8 bytes size 228*1e49577aSRod Evans sub %o2, %o3, %o2 ! update o2 with new count 229*1e49577aSRod Evans 230*1e49577aSRod Evans1: dec %o5 ! decrement source 231*1e49577aSRod Evans ldub [%o5], %g1 ! load one byte 232*1e49577aSRod Evans deccc %o3 ! decrement count 233*1e49577aSRod Evans bgu,pt %ncc, 1b ! if not done keep copying 234*1e49577aSRod Evans stb %g1, [%o5+%o4] ! store one byte into dest 235*1e49577aSRod Evans andncc %o2, 7, %o3 ! %o3 count is multiple of 8 bytes size 236*1e49577aSRod Evans bz,pn %ncc, 2f ! if size < 8, move to byte copy 237*1e49577aSRod Evans 238*1e49577aSRod Evans ! Now Destination is 8 byte aligned 239*1e49577aSRod Evans.dbbck: 240*1e49577aSRod Evans andcc %o5, 7, %o0 ! %o0 has src offset 241*1e49577aSRod Evans bz,a,pn %ncc, .dbcopybc ! if src is aligned to fast mem move 242*1e49577aSRod Evans sub %o2, %o3, %o2 ! Residue bytes in %o2 243*1e49577aSRod Evans 244*1e49577aSRod Evans.cpy_dbwdbc: ! alignment of src is needed 245*1e49577aSRod Evans sub %o2, 8, %o2 ! set size one loop ahead 246*1e49577aSRod Evans sll %o0, 3, %g1 ! %g1 is left shift 247*1e49577aSRod Evans mov 64, %g5 ! init %g5 to be 64 248*1e49577aSRod Evans sub %g5, %g1, %g5 ! %g5 right shift = (64 - left shift) 249*1e49577aSRod Evans sub %o5, %o0, %o5 ! align the src at 8 bytes. 250*1e49577aSRod Evans add %o4, %o0, %o4 ! increase difference between src & dst 251*1e49577aSRod Evans ldx [%o5], %o1 ! load first 8 bytes 252*1e49577aSRod Evans srlx %o1, %g5, %o1 253*1e49577aSRod Evans1: sub %o5, 8, %o5 ! subtract 8 from src 254*1e49577aSRod Evans ldx [%o5], %o0 ! load 8 byte 255*1e49577aSRod Evans sllx %o0, %g1, %o3 ! shift loaded 8 bytes left into tmp reg 256*1e49577aSRod Evans or %o1, %o3, %o3 ! align data 257*1e49577aSRod Evans stx %o3, [%o5+%o4] ! store 8 byte 258*1e49577aSRod Evans subcc %o2, 8, %o2 ! subtract 8 byte from size 259*1e49577aSRod Evans bg,pt %ncc, 1b ! if size > 0 continue 260*1e49577aSRod Evans srlx %o0, %g5, %o1 ! move extra byte for the next use 261*1e49577aSRod Evans 262*1e49577aSRod Evans srl %g1, 3, %o0 ! retsote %o0 value for alignment 263*1e49577aSRod Evans add %o5, %o0, %o5 ! restore src alignment 264*1e49577aSRod Evans sub %o4, %o0, %o4 ! restore difference between src & dest 265*1e49577aSRod Evans 266*1e49577aSRod Evans ba 2f ! branch to the trailing byte copy 267*1e49577aSRod Evans add %o2, 8, %o2 ! restore size value 268*1e49577aSRod Evans 269*1e49577aSRod Evans.dbcopybc: ! alignment of src is not needed 270*1e49577aSRod Evans1: sub %o5, 8, %o5 ! subtract from src 271*1e49577aSRod Evans ldx [%o5], %g1 ! load 8 bytes 272*1e49577aSRod Evans subcc %o3, 8, %o3 ! subtract from size 273*1e49577aSRod Evans bgu,pt %ncc, 1b ! if size is bigger 0 continue 274*1e49577aSRod Evans stx %g1, [%o5+%o4] ! store 8 bytes to destination 275*1e49577aSRod Evans 276*1e49577aSRod Evans ba 2f 277*1e49577aSRod Evans nop 278*1e49577aSRod Evans 279*1e49577aSRod Evans.bcbyte: 280*1e49577aSRod Evans1: ldub [%o5], %g1 ! load one byte 281*1e49577aSRod Evans stb %g1, [%o5+%o4] ! store one byte 282*1e49577aSRod Evans2: deccc %o2 ! decrement size 283*1e49577aSRod Evans bgeu,a,pt %ncc, 1b ! if size is >= 0 continue 284*1e49577aSRod Evans dec %o5 ! decrement from address 285*1e49577aSRod Evans 286*1e49577aSRod Evans.exitbc: ! exit from backward copy 287*1e49577aSRod Evans retl 288*1e49577aSRod Evans add %o5, %o4, %o0 ! 
restore dest addr 289*1e49577aSRod Evans 290*1e49577aSRod Evans#ifdef NIAGARA2_IMPL 291*1e49577aSRod Evans ! 292*1e49577aSRod Evans ! Check to see if memmove is large aligned copy 293*1e49577aSRod Evans ! If so, use special version of copy that avoids 294*1e49577aSRod Evans ! use of block store init 295*1e49577aSRod Evans ! 296*1e49577aSRod Evans.forcpy: 297*1e49577aSRod Evans cmp %o2, SMALL_MAX ! check for not small case 298*1e49577aSRod Evans blt,pn %ncc, .mv_short ! merge with memcpy 299*1e49577aSRod Evans mov %o0, %g1 ! save %o0 300*1e49577aSRod Evans neg %o0, %o5 301*1e49577aSRod Evans andcc %o5, 7, %o5 ! bytes till DST 8 byte aligned 302*1e49577aSRod Evans brz,pt %o5, .mv_dst_aligned_on_8 303*1e49577aSRod Evans 304*1e49577aSRod Evans ! %o5 has the bytes to be written in partial store. 305*1e49577aSRod Evans sub %o2, %o5, %o2 306*1e49577aSRod Evans sub %o1, %o0, %o1 ! %o1 gets the difference 307*1e49577aSRod Evans7: ! dst aligning loop 308*1e49577aSRod Evans ldub [%o1+%o0], %o4 ! load one byte 309*1e49577aSRod Evans subcc %o5, 1, %o5 310*1e49577aSRod Evans stb %o4, [%o0] 311*1e49577aSRod Evans bgu,pt %ncc, 7b 312*1e49577aSRod Evans add %o0, 1, %o0 ! advance dst 313*1e49577aSRod Evans add %o1, %o0, %o1 ! restore %o1 314*1e49577aSRod Evans.mv_dst_aligned_on_8: 315*1e49577aSRod Evans andcc %o1, 7, %o5 316*1e49577aSRod Evans brnz,pt %o5, .src_dst_unaligned_on_8 317*1e49577aSRod Evans prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read 318*1e49577aSRod Evans 319*1e49577aSRod Evans.mv_src_dst_aligned_on_8: 320*1e49577aSRod Evans ! check if we are copying MED_MAX or more bytes 321*1e49577aSRod Evans cmp %o2, MED_MAX ! limit to store buffer size 322*1e49577aSRod Evans bleu,pt %ncc, .medlong 323*1e49577aSRod Evans prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read 324*1e49577aSRod Evans 325*1e49577aSRod Evans/* 326*1e49577aSRod Evans * The following memmove code mimics the memcpy code for large aligned copies, 327*1e49577aSRod Evans * but does not use the ASI_STBI_P (block initializing store) performance 328*1e49577aSRod Evans * optimization. See memmove rationale section in documentation 329*1e49577aSRod Evans */ 330*1e49577aSRod Evans.mv_large_align8_copy: ! Src and dst share 8 byte alignment 331*1e49577aSRod Evans rd %fprs, %g5 ! check for unused fp 332*1e49577aSRod Evans ! if fprs.fef == 0, set it. 333*1e49577aSRod Evans ! Setting it when already set costs more than checking 334*1e49577aSRod Evans andcc %g5, FPRS_FEF, %g5 ! test FEF, fprs.du = fprs.dl = 0 335*1e49577aSRod Evans bz,a %ncc, 1f 336*1e49577aSRod Evans wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1 337*1e49577aSRod Evans1: 338*1e49577aSRod Evans ! align dst to 64 byte boundary 339*1e49577aSRod Evans andcc %o0, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned 340*1e49577aSRod Evans brz,pn %o3, .mv_aligned_on_64 341*1e49577aSRod Evans sub %o3, 64, %o3 ! %o3 has negative bytes to move 342*1e49577aSRod Evans add %o2, %o3, %o2 ! adjust remaining count 343*1e49577aSRod Evans.mv_align_to_64: 344*1e49577aSRod Evans ldx [%o1], %o4 345*1e49577aSRod Evans add %o1, 8, %o1 ! increment src ptr 346*1e49577aSRod Evans addcc %o3, 8, %o3 347*1e49577aSRod Evans stx %o4, [%o0] 348*1e49577aSRod Evans brnz,pt %o3, .mv_align_to_64 349*1e49577aSRod Evans add %o0, 8, %o0 ! increment dst ptr 350*1e49577aSRod Evans 351*1e49577aSRod Evans.mv_aligned_on_64: 352*1e49577aSRod Evans prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read 353*1e49577aSRod Evans mov %asi,%o4 ! save %asi 354*1e49577aSRod Evans ! Determine source alignment 355*1e49577aSRod Evans ! 
to correct 8 byte offset 356*1e49577aSRod Evans andcc %o1, 0x20, %o3 357*1e49577aSRod Evans brnz,pn %o3, .mv_align_1 358*1e49577aSRod Evans mov ASI_BLK_P, %asi ! setup %asi for block load/store 359*1e49577aSRod Evans andcc %o1, 0x10, %o3 360*1e49577aSRod Evans brnz,pn %o3, .mv_align_01 361*1e49577aSRod Evans nop 362*1e49577aSRod Evans andcc %o1, 0x08, %o3 363*1e49577aSRod Evans brz,pn %o3, .mv_align_000 364*1e49577aSRod Evans prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 365*1e49577aSRod Evans ba .mv_align_001 366*1e49577aSRod Evans nop 367*1e49577aSRod Evans.mv_align_01: 368*1e49577aSRod Evans andcc %o1, 0x08, %o3 369*1e49577aSRod Evans brnz,pn %o3, .mv_align_011 370*1e49577aSRod Evans prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 371*1e49577aSRod Evans ba .mv_align_010 372*1e49577aSRod Evans nop 373*1e49577aSRod Evans.mv_align_1: 374*1e49577aSRod Evans andcc %o1, 0x10, %o3 375*1e49577aSRod Evans brnz,pn %o3, .mv_align_11 376*1e49577aSRod Evans nop 377*1e49577aSRod Evans andcc %o1, 0x08, %o3 378*1e49577aSRod Evans brnz,pn %o3, .mv_align_101 379*1e49577aSRod Evans prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 380*1e49577aSRod Evans ba .mv_align_100 381*1e49577aSRod Evans nop 382*1e49577aSRod Evans.mv_align_11: 383*1e49577aSRod Evans andcc %o1, 0x08, %o3 384*1e49577aSRod Evans brz,pn %o3, .mv_align_110 385*1e49577aSRod Evans prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 386*1e49577aSRod Evans 387*1e49577aSRod Evans.mv_align_111: 388*1e49577aSRod Evans! Alignment off by 8 bytes 389*1e49577aSRod Evans ldd [%o1], %d0 390*1e49577aSRod Evans add %o1, 8, %o1 391*1e49577aSRod Evans sub %o2, 8, %o2 392*1e49577aSRod Evans andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 393*1e49577aSRod Evans and %o2, 0x7f, %o2 ! residue bytes in %o2 394*1e49577aSRod Evans.mv_align_111_loop: 395*1e49577aSRod Evans subcc %o5, 128, %o5 396*1e49577aSRod Evans /* ---- copy line 1 of 2. ---- */ 397*1e49577aSRod Evans ldda [%o1]%asi,%d16 ! block load 398*1e49577aSRod Evans fmovd %d16, %d2 399*1e49577aSRod Evans fmovd %d18, %d4 400*1e49577aSRod Evans fmovd %d20, %d6 401*1e49577aSRod Evans fmovd %d22, %d8 402*1e49577aSRod Evans fmovd %d24, %d10 403*1e49577aSRod Evans fmovd %d26, %d12 404*1e49577aSRod Evans fmovd %d28, %d14 405*1e49577aSRod Evans stda %d0,[%o0]%asi 406*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 407*1e49577aSRod Evans prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 408*1e49577aSRod Evans fmovd %d30, %d0 409*1e49577aSRod Evans 410*1e49577aSRod Evans /* ---- copy line 2 of 2. ---- */ 411*1e49577aSRod Evans ldda [%o1+64]%asi,%d16 412*1e49577aSRod Evans fmovd %d16, %d2 413*1e49577aSRod Evans fmovd %d18, %d4 414*1e49577aSRod Evans fmovd %d20, %d6 415*1e49577aSRod Evans fmovd %d22, %d8 416*1e49577aSRod Evans fmovd %d24, %d10 417*1e49577aSRod Evans fmovd %d26, %d12 418*1e49577aSRod Evans fmovd %d28, %d14 419*1e49577aSRod Evans add %o1, 128, %o1 ! increment src 420*1e49577aSRod Evans stda %d0,[%o0]%asi 421*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 422*1e49577aSRod Evans fmovd %d30, %d0 423*1e49577aSRod Evans bgt,pt %ncc, .mv_align_111_loop 424*1e49577aSRod Evans prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 425*1e49577aSRod Evans 426*1e49577aSRod Evans std %d0, [%o0] 427*1e49577aSRod Evans ba .remain_stuff 428*1e49577aSRod Evans add %o0, 8, %o0 429*1e49577aSRod Evans ! END OF mv_align_111 430*1e49577aSRod Evans 431*1e49577aSRod Evans.mv_align_110: 432*1e49577aSRod Evans! 
Alignment off by 16 bytes 433*1e49577aSRod Evans ldd [%o1], %d0 434*1e49577aSRod Evans ldd [%o1+8], %d2 435*1e49577aSRod Evans add %o1, 16, %o1 436*1e49577aSRod Evans sub %o2, 16, %o2 437*1e49577aSRod Evans andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 438*1e49577aSRod Evans and %o2, 0x7f, %o2 ! residue bytes in %o2 439*1e49577aSRod Evans.mv_align_110_loop: 440*1e49577aSRod Evans subcc %o5, 128, %o5 441*1e49577aSRod Evans /* ---- copy line 1 of 2. ---- */ 442*1e49577aSRod Evans 443*1e49577aSRod Evans ldda [%o1]%asi,%d16 ! block load 444*1e49577aSRod Evans fmovd %d16, %d4 445*1e49577aSRod Evans fmovd %d18, %d6 446*1e49577aSRod Evans fmovd %d20, %d8 447*1e49577aSRod Evans fmovd %d22, %d10 448*1e49577aSRod Evans fmovd %d24, %d12 449*1e49577aSRod Evans fmovd %d26, %d14 450*1e49577aSRod Evans stda %d0,[%o0]%asi 451*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 452*1e49577aSRod Evans fmovd %d28, %d0 453*1e49577aSRod Evans prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 454*1e49577aSRod Evans fmovd %d30, %d2 455*1e49577aSRod Evans 456*1e49577aSRod Evans /* ---- copy line 2 of 2. ---- */ 457*1e49577aSRod Evans ldda [%o1+64]%asi,%d16 458*1e49577aSRod Evans fmovd %d16, %d4 459*1e49577aSRod Evans fmovd %d18, %d6 460*1e49577aSRod Evans fmovd %d20, %d8 461*1e49577aSRod Evans fmovd %d22, %d10 462*1e49577aSRod Evans fmovd %d24, %d12 463*1e49577aSRod Evans fmovd %d26, %d14 464*1e49577aSRod Evans add %o1, 128, %o1 ! increment src 465*1e49577aSRod Evans stda %d0,[%o0]%asi 466*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 467*1e49577aSRod Evans fmovd %d28, %d0 468*1e49577aSRod Evans fmovd %d30, %d2 469*1e49577aSRod Evans bgt,pt %ncc, .mv_align_110_loop 470*1e49577aSRod Evans prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 471*1e49577aSRod Evans 472*1e49577aSRod Evans std %d0, [%o0] 473*1e49577aSRod Evans std %d2, [%o0+8] 474*1e49577aSRod Evans ba .remain_stuff 475*1e49577aSRod Evans add %o0, 16, %o0 476*1e49577aSRod Evans ! END OF mv_align_110 477*1e49577aSRod Evans 478*1e49577aSRod Evans.mv_align_101: 479*1e49577aSRod Evans! Alignment off by 24 bytes 480*1e49577aSRod Evans ldd [%o1], %d0 481*1e49577aSRod Evans ldd [%o1+8], %d2 482*1e49577aSRod Evans ldd [%o1+16], %d4 483*1e49577aSRod Evans add %o1, 24, %o1 484*1e49577aSRod Evans sub %o2, 24, %o2 485*1e49577aSRod Evans andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 486*1e49577aSRod Evans and %o2, 0x7f, %o2 ! residue bytes in %o2 487*1e49577aSRod Evans.mv_align_101_loop: 488*1e49577aSRod Evans subcc %o5, 128, %o5 489*1e49577aSRod Evans /* ---- copy line 1 of 2. ---- */ 490*1e49577aSRod Evans 491*1e49577aSRod Evans ldda [%o1]%asi,%d16 ! block load 492*1e49577aSRod Evans fmovd %d16, %d6 493*1e49577aSRod Evans fmovd %d18, %d8 494*1e49577aSRod Evans fmovd %d20, %d10 495*1e49577aSRod Evans fmovd %d22, %d12 496*1e49577aSRod Evans fmovd %d24, %d14 497*1e49577aSRod Evans stda %d0,[%o0]%asi 498*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 499*1e49577aSRod Evans fmovd %d26, %d0 500*1e49577aSRod Evans fmovd %d28, %d2 501*1e49577aSRod Evans prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 502*1e49577aSRod Evans fmovd %d30, %d4 503*1e49577aSRod Evans 504*1e49577aSRod Evans /* ---- copy line 2 of 2. ---- */ 505*1e49577aSRod Evans ldda [%o1+64]%asi,%d16 506*1e49577aSRod Evans fmovd %d16, %d6 507*1e49577aSRod Evans fmovd %d18, %d8 508*1e49577aSRod Evans fmovd %d20, %d10 509*1e49577aSRod Evans fmovd %d22, %d12 510*1e49577aSRod Evans fmovd %d24, %d14 511*1e49577aSRod Evans add %o1, 128, %o1 ! 
increment src 512*1e49577aSRod Evans stda %d0,[%o0]%asi 513*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 514*1e49577aSRod Evans fmovd %d26, %d0 515*1e49577aSRod Evans fmovd %d28, %d2 516*1e49577aSRod Evans fmovd %d30, %d4 517*1e49577aSRod Evans bgt,pt %ncc, .mv_align_101_loop 518*1e49577aSRod Evans prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 519*1e49577aSRod Evans 520*1e49577aSRod Evans std %d0, [%o0] 521*1e49577aSRod Evans std %d2, [%o0+8] 522*1e49577aSRod Evans std %d4, [%o0+16] 523*1e49577aSRod Evans ba .remain_stuff 524*1e49577aSRod Evans add %o0, 24, %o0 525*1e49577aSRod Evans ! END OF mv_align_101 526*1e49577aSRod Evans 527*1e49577aSRod Evans.mv_align_100: 528*1e49577aSRod Evans! Alignment off by 32 bytes 529*1e49577aSRod Evans ldd [%o1], %d0 530*1e49577aSRod Evans ldd [%o1+8], %d2 531*1e49577aSRod Evans ldd [%o1+16],%d4 532*1e49577aSRod Evans ldd [%o1+24],%d6 533*1e49577aSRod Evans add %o1, 32, %o1 534*1e49577aSRod Evans sub %o2, 32, %o2 535*1e49577aSRod Evans andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 536*1e49577aSRod Evans and %o2, 0x7f, %o2 ! residue bytes in %o2 537*1e49577aSRod Evans.mv_align_100_loop: 538*1e49577aSRod Evans subcc %o5, 128, %o5 539*1e49577aSRod Evans /* ---- copy line 1 of 2. ---- */ 540*1e49577aSRod Evans ldda [%o1]%asi,%d16 ! block load 541*1e49577aSRod Evans fmovd %d16, %d8 542*1e49577aSRod Evans fmovd %d18, %d10 543*1e49577aSRod Evans fmovd %d20, %d12 544*1e49577aSRod Evans fmovd %d22, %d14 545*1e49577aSRod Evans stda %d0,[%o0]%asi 546*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 547*1e49577aSRod Evans fmovd %d24, %d0 548*1e49577aSRod Evans fmovd %d26, %d2 549*1e49577aSRod Evans fmovd %d28, %d4 550*1e49577aSRod Evans prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 551*1e49577aSRod Evans fmovd %d30, %d6 552*1e49577aSRod Evans 553*1e49577aSRod Evans /* ---- copy line 2 of 2. ---- */ 554*1e49577aSRod Evans ldda [%o1+64]%asi,%d16 555*1e49577aSRod Evans fmovd %d16, %d8 556*1e49577aSRod Evans fmovd %d18, %d10 557*1e49577aSRod Evans fmovd %d20, %d12 558*1e49577aSRod Evans fmovd %d22, %d14 559*1e49577aSRod Evans add %o1, 128, %o1 ! increment src 560*1e49577aSRod Evans stda %d0,[%o0]%asi 561*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 562*1e49577aSRod Evans fmovd %d24, %d0 563*1e49577aSRod Evans fmovd %d26, %d2 564*1e49577aSRod Evans fmovd %d28, %d4 565*1e49577aSRod Evans fmovd %d30, %d6 566*1e49577aSRod Evans bgt,pt %ncc, .mv_align_100_loop 567*1e49577aSRod Evans prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 568*1e49577aSRod Evans 569*1e49577aSRod Evans std %d0, [%o0] 570*1e49577aSRod Evans std %d2, [%o0+8] 571*1e49577aSRod Evans std %d4, [%o0+16] 572*1e49577aSRod Evans std %d6, [%o0+24] 573*1e49577aSRod Evans ba .remain_stuff 574*1e49577aSRod Evans add %o0, 32, %o0 575*1e49577aSRod Evans ! END OF mv_align_100 576*1e49577aSRod Evans 577*1e49577aSRod Evans.mv_align_011: 578*1e49577aSRod Evans! Alignment off by 40 bytes 579*1e49577aSRod Evans ldd [%o1], %d0 580*1e49577aSRod Evans ldd [%o1+8], %d2 581*1e49577aSRod Evans ldd [%o1+16], %d4 582*1e49577aSRod Evans ldd [%o1+24], %d6 583*1e49577aSRod Evans ldd [%o1+32], %d8 584*1e49577aSRod Evans add %o1, 40, %o1 585*1e49577aSRod Evans sub %o2, 40, %o2 586*1e49577aSRod Evans andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 587*1e49577aSRod Evans and %o2, 0x7f, %o2 ! residue bytes in %o2 588*1e49577aSRod Evans.mv_align_011_loop: 589*1e49577aSRod Evans subcc %o5, 128, %o5 590*1e49577aSRod Evans /* ---- copy line 1 of 2. ---- */ 591*1e49577aSRod Evans 592*1e49577aSRod Evans ldda [%o1]%asi,%d16 ! 
block load 593*1e49577aSRod Evans fmovd %d16, %d10 594*1e49577aSRod Evans fmovd %d18, %d12 595*1e49577aSRod Evans fmovd %d20, %d14 596*1e49577aSRod Evans stda %d0,[%o0]%asi 597*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 598*1e49577aSRod Evans fmovd %d22, %d0 599*1e49577aSRod Evans fmovd %d24, %d2 600*1e49577aSRod Evans fmovd %d26, %d4 601*1e49577aSRod Evans fmovd %d28, %d6 602*1e49577aSRod Evans prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 603*1e49577aSRod Evans fmovd %d30, %d8 604*1e49577aSRod Evans 605*1e49577aSRod Evans /* ---- copy line 2 of 2. ---- */ 606*1e49577aSRod Evans ldda [%o1+64]%asi,%d16 607*1e49577aSRod Evans fmovd %d16, %d10 608*1e49577aSRod Evans fmovd %d18, %d12 609*1e49577aSRod Evans fmovd %d20, %d14 610*1e49577aSRod Evans add %o1, 128, %o1 ! increment src 611*1e49577aSRod Evans stda %d0,[%o0]%asi 612*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 613*1e49577aSRod Evans fmovd %d22, %d0 614*1e49577aSRod Evans fmovd %d24, %d2 615*1e49577aSRod Evans fmovd %d26, %d4 616*1e49577aSRod Evans fmovd %d28, %d6 617*1e49577aSRod Evans fmovd %d30, %d8 618*1e49577aSRod Evans bgt,pt %ncc, .mv_align_011_loop 619*1e49577aSRod Evans prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 620*1e49577aSRod Evans 621*1e49577aSRod Evans std %d0, [%o0] 622*1e49577aSRod Evans std %d2, [%o0+8] 623*1e49577aSRod Evans std %d4, [%o0+16] 624*1e49577aSRod Evans std %d6, [%o0+24] 625*1e49577aSRod Evans std %d8, [%o0+32] 626*1e49577aSRod Evans ba .remain_stuff 627*1e49577aSRod Evans add %o0, 40, %o0 628*1e49577aSRod Evans ! END OF mv_align_011 629*1e49577aSRod Evans 630*1e49577aSRod Evans.mv_align_010: 631*1e49577aSRod Evans! Alignment off by 48 bytes 632*1e49577aSRod Evans ldd [%o1], %d0 633*1e49577aSRod Evans ldd [%o1+8], %d2 634*1e49577aSRod Evans ldd [%o1+16], %d4 635*1e49577aSRod Evans ldd [%o1+24], %d6 636*1e49577aSRod Evans ldd [%o1+32], %d8 637*1e49577aSRod Evans ldd [%o1+40], %d10 638*1e49577aSRod Evans add %o1, 48, %o1 639*1e49577aSRod Evans sub %o2, 48, %o2 640*1e49577aSRod Evans andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 641*1e49577aSRod Evans and %o2, 0x7f, %o2 ! residue bytes in %o2 642*1e49577aSRod Evans.mv_align_010_loop: 643*1e49577aSRod Evans subcc %o5, 128, %o5 644*1e49577aSRod Evans /* ---- copy line 1 of 2. ---- */ 645*1e49577aSRod Evans 646*1e49577aSRod Evans ldda [%o1]%asi,%d16 ! block load 647*1e49577aSRod Evans fmovd %d16, %d12 648*1e49577aSRod Evans fmovd %d18, %d14 649*1e49577aSRod Evans stda %d0,[%o0]%asi 650*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 651*1e49577aSRod Evans fmovd %d20, %d0 652*1e49577aSRod Evans fmovd %d22, %d2 653*1e49577aSRod Evans fmovd %d24, %d4 654*1e49577aSRod Evans fmovd %d26, %d6 655*1e49577aSRod Evans fmovd %d28, %d8 656*1e49577aSRod Evans prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 657*1e49577aSRod Evans fmovd %d30, %d10 658*1e49577aSRod Evans 659*1e49577aSRod Evans /* ---- copy line 2 of 2. ---- */ 660*1e49577aSRod Evans ldda [%o1+64]%asi,%d16 661*1e49577aSRod Evans fmovd %d16, %d12 662*1e49577aSRod Evans fmovd %d18, %d14 663*1e49577aSRod Evans add %o1, 128, %o1 ! increment src 664*1e49577aSRod Evans stda %d0,[%o0]%asi 665*1e49577aSRod Evans add %o0, 64, %o0 ! 
advance dst 666*1e49577aSRod Evans fmovd %d20, %d0 667*1e49577aSRod Evans fmovd %d22, %d2 668*1e49577aSRod Evans fmovd %d24, %d4 669*1e49577aSRod Evans fmovd %d26, %d6 670*1e49577aSRod Evans fmovd %d28, %d8 671*1e49577aSRod Evans fmovd %d30, %d10 672*1e49577aSRod Evans bgt,pt %ncc, .mv_align_010_loop 673*1e49577aSRod Evans prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 674*1e49577aSRod Evans 675*1e49577aSRod Evans std %d0, [%o0] 676*1e49577aSRod Evans std %d2, [%o0+8] 677*1e49577aSRod Evans std %d4, [%o0+16] 678*1e49577aSRod Evans std %d6, [%o0+24] 679*1e49577aSRod Evans std %d8, [%o0+32] 680*1e49577aSRod Evans std %d10, [%o0+40] 681*1e49577aSRod Evans ba .remain_stuff 682*1e49577aSRod Evans add %o0, 48, %o0 683*1e49577aSRod Evans ! END OF mv_align_010 684*1e49577aSRod Evans 685*1e49577aSRod Evans.mv_align_001: 686*1e49577aSRod Evans! Alignment off by 56 bytes 687*1e49577aSRod Evans ldd [%o1], %d0 688*1e49577aSRod Evans ldd [%o1+8], %d2 689*1e49577aSRod Evans ldd [%o1+16], %d4 690*1e49577aSRod Evans ldd [%o1+24], %d6 691*1e49577aSRod Evans ldd [%o1+32], %d8 692*1e49577aSRod Evans ldd [%o1+40], %d10 693*1e49577aSRod Evans ldd [%o1+48], %d12 694*1e49577aSRod Evans add %o1, 56, %o1 695*1e49577aSRod Evans sub %o2, 56, %o2 696*1e49577aSRod Evans andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 697*1e49577aSRod Evans and %o2, 0x7f, %o2 ! residue bytes in %o2 698*1e49577aSRod Evans.mv_align_001_loop: 699*1e49577aSRod Evans subcc %o5, 128, %o5 700*1e49577aSRod Evans /* ---- copy line 1 of 2. ---- */ 701*1e49577aSRod Evans 702*1e49577aSRod Evans ldda [%o1]%asi,%d16 ! block load 703*1e49577aSRod Evans fmovd %d16, %d14 704*1e49577aSRod Evans stda %d0,[%o0]%asi 705*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 706*1e49577aSRod Evans fmovd %d18, %d0 707*1e49577aSRod Evans fmovd %d20, %d2 708*1e49577aSRod Evans fmovd %d22, %d4 709*1e49577aSRod Evans fmovd %d24, %d6 710*1e49577aSRod Evans fmovd %d26, %d8 711*1e49577aSRod Evans fmovd %d28, %d10 712*1e49577aSRod Evans prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 713*1e49577aSRod Evans fmovd %d30, %d12 714*1e49577aSRod Evans 715*1e49577aSRod Evans /* ---- copy line 2 of 2. ---- */ 716*1e49577aSRod Evans ldda [%o1+64]%asi,%d16 717*1e49577aSRod Evans fmovd %d16, %d14 718*1e49577aSRod Evans add %o1, 128, %o1 ! increment src 719*1e49577aSRod Evans stda %d0,[%o0]%asi 720*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 721*1e49577aSRod Evans fmovd %d18, %d0 722*1e49577aSRod Evans fmovd %d20, %d2 723*1e49577aSRod Evans fmovd %d22, %d4 724*1e49577aSRod Evans fmovd %d24, %d6 725*1e49577aSRod Evans fmovd %d26, %d8 726*1e49577aSRod Evans fmovd %d28, %d10 727*1e49577aSRod Evans fmovd %d30, %d12 728*1e49577aSRod Evans bgt,pt %ncc, .mv_align_001_loop 729*1e49577aSRod Evans prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 730*1e49577aSRod Evans 731*1e49577aSRod Evans std %d0, [%o0] 732*1e49577aSRod Evans std %d2, [%o0+8] 733*1e49577aSRod Evans std %d4, [%o0+16] 734*1e49577aSRod Evans std %d6, [%o0+24] 735*1e49577aSRod Evans std %d8, [%o0+32] 736*1e49577aSRod Evans std %d10, [%o0+40] 737*1e49577aSRod Evans std %d12, [%o0+48] 738*1e49577aSRod Evans ba .remain_stuff 739*1e49577aSRod Evans add %o0, 56, %o0 740*1e49577aSRod Evans ! END OF mv_align_001 741*1e49577aSRod Evans 742*1e49577aSRod Evans.mv_align_000: 743*1e49577aSRod Evans andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 744*1e49577aSRod Evans and %o2, 0x7f, %o2 ! residue bytes in %o2 745*1e49577aSRod Evans.mv_align_000_loop: 746*1e49577aSRod Evans /* ---- copy line 1 of 2. 
---- */ 747*1e49577aSRod Evans subcc %o5, 128, %o5 748*1e49577aSRod Evans ldda [%o1]%asi,%d0 749*1e49577aSRod Evans stda %d0,[%o0]%asi 750*1e49577aSRod Evans prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 751*1e49577aSRod Evans 752*1e49577aSRod Evans /* ---- copy line 2 of 2. ---- */ 753*1e49577aSRod Evans add %o0, 64, %o0 754*1e49577aSRod Evans ldda [%o1+64]%asi,%d0 755*1e49577aSRod Evans add %o1, 128, %o1 ! increment src 756*1e49577aSRod Evans stda %d0,[%o0]%asi 757*1e49577aSRod Evans add %o0, 64, %o0 ! increment dst 758*1e49577aSRod Evans bgt,pt %ncc, .mv_align_000_loop 759*1e49577aSRod Evans prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 760*1e49577aSRod Evans ba .remain_stuff 761*1e49577aSRod Evans nop 762*1e49577aSRod Evans 763*1e49577aSRod Evans ! END OF mv_align_000 764*1e49577aSRod Evans#else /* NIAGARA2_IMPL */ 765*1e49577aSRod Evans#endif /* NIAGARA2_IMPL */ 766*1e49577aSRod Evans 767*1e49577aSRod Evans SET_SIZE(memmove) 768*1e49577aSRod Evans 769*1e49577aSRod Evans ENTRY(memcpy) 770*1e49577aSRod Evans ENTRY(__align_cpy_1) 771*1e49577aSRod Evans#ifdef NIAGARA2_IMPL 772*1e49577aSRod Evans cmp %o2, SMALL_MAX ! check for not small case 773*1e49577aSRod Evans bgeu,pn %ncc, .medium ! go to larger cases 774*1e49577aSRod Evans mov %o0, %g1 ! save %o0 775*1e49577aSRod Evans.mv_short: 776*1e49577aSRod Evans cmp %o2, SHORTCOPY ! check for really short case 777*1e49577aSRod Evans ble,pt %ncc, .smallfin 778*1e49577aSRod Evans or %o0, %o1, %o4 ! prepare alignment check 779*1e49577aSRod Evans andcc %o4, 0x3, %o5 ! test for alignment 780*1e49577aSRod Evans bz,pt %ncc, .smallword ! branch to word aligned case 781*1e49577aSRod Evans cmp %o2, SHORTCHECK 782*1e49577aSRod Evans ble,pt %ncc, .smallrest 783*1e49577aSRod Evans andcc %o1, 0x3, %o5 ! is src word aligned 784*1e49577aSRod Evans bz,pn %ncc, .aldst 785*1e49577aSRod Evans cmp %o5, 2 ! is src half-word aligned 786*1e49577aSRod Evans be,pt %ncc, .s2algn 787*1e49577aSRod Evans cmp %o5, 3 ! src is byte aligned 788*1e49577aSRod Evans.s1algn:ldub [%o1], %o3 ! move 1 or 3 bytes to align it 789*1e49577aSRod Evans inc 1, %o1 790*1e49577aSRod Evans stb %o3, [%o0] ! move a byte to align src 791*1e49577aSRod Evans inc 1, %o0 792*1e49577aSRod Evans bne,pt %ncc, .s2algn 793*1e49577aSRod Evans dec %o2 794*1e49577aSRod Evans b .ald ! now go align dest 795*1e49577aSRod Evans andcc %o0, 0x3, %o5 796*1e49577aSRod Evans 797*1e49577aSRod Evans.s2algn:lduh [%o1], %o3 ! know src is 2 byte aligned 798*1e49577aSRod Evans inc 2, %o1 799*1e49577aSRod Evans srl %o3, 8, %o4 800*1e49577aSRod Evans stb %o4, [%o0] ! have to do bytes, 801*1e49577aSRod Evans stb %o3, [%o0 + 1] ! don't know dst alignment 802*1e49577aSRod Evans inc 2, %o0 803*1e49577aSRod Evans dec 2, %o2 804*1e49577aSRod Evans 805*1e49577aSRod Evans.aldst: andcc %o0, 0x3, %o5 ! align the destination address 806*1e49577aSRod Evans.ald: bz,pn %ncc, .w4cp 807*1e49577aSRod Evans cmp %o5, 2 808*1e49577aSRod Evans be,pn %ncc, .w2cp 809*1e49577aSRod Evans cmp %o5, 3 810*1e49577aSRod Evans.w3cp: lduw [%o1], %o4 811*1e49577aSRod Evans inc 4, %o1 812*1e49577aSRod Evans srl %o4, 24, %o5 813*1e49577aSRod Evans stb %o5, [%o0] 814*1e49577aSRod Evans bne,pt %ncc, .w1cp 815*1e49577aSRod Evans inc %o0 816*1e49577aSRod Evans dec 1, %o2 817*1e49577aSRod Evans andn %o2, 3, %o3 ! %o3 is aligned word count 818*1e49577aSRod Evans dec 4, %o3 ! avoid reading beyond tail of src 819*1e49577aSRod Evans sub %o1, %o0, %o1 ! %o1 gets the difference 820*1e49577aSRod Evans 821*1e49577aSRod Evans1: sll %o4, 8, %g5 ! 
save residual bytes 822*1e49577aSRod Evans lduw [%o1+%o0], %o4 823*1e49577aSRod Evans deccc 4, %o3 824*1e49577aSRod Evans srl %o4, 24, %o5 ! merge with residual 825*1e49577aSRod Evans or %o5, %g5, %g5 826*1e49577aSRod Evans st %g5, [%o0] 827*1e49577aSRod Evans bnz,pt %ncc, 1b 828*1e49577aSRod Evans inc 4, %o0 829*1e49577aSRod Evans sub %o1, 3, %o1 ! used one byte of last word read 830*1e49577aSRod Evans and %o2, 3, %o2 831*1e49577aSRod Evans b 7f 832*1e49577aSRod Evans inc 4, %o2 833*1e49577aSRod Evans 834*1e49577aSRod Evans.w1cp: srl %o4, 8, %o5 835*1e49577aSRod Evans sth %o5, [%o0] 836*1e49577aSRod Evans inc 2, %o0 837*1e49577aSRod Evans dec 3, %o2 838*1e49577aSRod Evans andn %o2, 3, %o3 ! %o3 is aligned word count 839*1e49577aSRod Evans dec 4, %o3 ! avoid reading beyond tail of src 840*1e49577aSRod Evans sub %o1, %o0, %o1 ! %o1 gets the difference 841*1e49577aSRod Evans 842*1e49577aSRod Evans2: sll %o4, 24, %g5 ! save residual bytes 843*1e49577aSRod Evans lduw [%o1+%o0], %o4 844*1e49577aSRod Evans deccc 4, %o3 845*1e49577aSRod Evans srl %o4, 8, %o5 ! merge with residual 846*1e49577aSRod Evans or %o5, %g5, %g5 847*1e49577aSRod Evans st %g5, [%o0] 848*1e49577aSRod Evans bnz,pt %ncc, 2b 849*1e49577aSRod Evans inc 4, %o0 850*1e49577aSRod Evans sub %o1, 1, %o1 ! used three bytes of last word read 851*1e49577aSRod Evans and %o2, 3, %o2 852*1e49577aSRod Evans b 7f 853*1e49577aSRod Evans inc 4, %o2 854*1e49577aSRod Evans 855*1e49577aSRod Evans.w2cp: lduw [%o1], %o4 856*1e49577aSRod Evans inc 4, %o1 857*1e49577aSRod Evans srl %o4, 16, %o5 858*1e49577aSRod Evans sth %o5, [%o0] 859*1e49577aSRod Evans inc 2, %o0 860*1e49577aSRod Evans dec 2, %o2 861*1e49577aSRod Evans andn %o2, 3, %o3 ! %o3 is aligned word count 862*1e49577aSRod Evans dec 4, %o3 ! avoid reading beyond tail of src 863*1e49577aSRod Evans sub %o1, %o0, %o1 ! %o1 gets the difference 864*1e49577aSRod Evans 865*1e49577aSRod Evans3: sll %o4, 16, %g5 ! save residual bytes 866*1e49577aSRod Evans lduw [%o1+%o0], %o4 867*1e49577aSRod Evans deccc 4, %o3 868*1e49577aSRod Evans srl %o4, 16, %o5 ! merge with residual 869*1e49577aSRod Evans or %o5, %g5, %g5 870*1e49577aSRod Evans st %g5, [%o0] 871*1e49577aSRod Evans bnz,pt %ncc, 3b 872*1e49577aSRod Evans inc 4, %o0 873*1e49577aSRod Evans sub %o1, 2, %o1 ! used two bytes of last word read 874*1e49577aSRod Evans and %o2, 3, %o2 875*1e49577aSRod Evans b 7f 876*1e49577aSRod Evans inc 4, %o2 877*1e49577aSRod Evans 878*1e49577aSRod Evans.w4cp: andn %o2, 3, %o3 ! %o3 is aligned word count 879*1e49577aSRod Evans sub %o1, %o0, %o1 ! %o1 gets the difference 880*1e49577aSRod Evans 881*1e49577aSRod Evans1: lduw [%o1+%o0], %o4 ! read from address 882*1e49577aSRod Evans deccc 4, %o3 ! decrement count 883*1e49577aSRod Evans st %o4, [%o0] ! write at destination address 884*1e49577aSRod Evans bgu,pt %ncc, 1b 885*1e49577aSRod Evans inc 4, %o0 ! increment to address 886*1e49577aSRod Evans and %o2, 3, %o2 ! number of leftover bytes, if any 887*1e49577aSRod Evans 888*1e49577aSRod Evans ! simple finish up byte copy, works with any alignment 889*1e49577aSRod Evans7: 890*1e49577aSRod Evans add %o1, %o0, %o1 ! restore %o1 891*1e49577aSRod Evans.smallrest: 892*1e49577aSRod Evans tst %o2 893*1e49577aSRod Evans bz,pt %ncc, .smallx 894*1e49577aSRod Evans cmp %o2, 4 895*1e49577aSRod Evans blt,pt %ncc, .smallleft3 896*1e49577aSRod Evans nop 897*1e49577aSRod Evans sub %o2, 3, %o2 898*1e49577aSRod Evans.smallnotalign4: 899*1e49577aSRod Evans ldub [%o1], %o3 ! read byte 900*1e49577aSRod Evans subcc %o2, 4, %o2 ! 
reduce count by 4 901*1e49577aSRod Evans stb %o3, [%o0] ! write byte 902*1e49577aSRod Evans ldub [%o1+1], %o3 ! repeat for total of 4 bytes 903*1e49577aSRod Evans add %o1, 4, %o1 ! advance SRC by 4 904*1e49577aSRod Evans stb %o3, [%o0+1] 905*1e49577aSRod Evans ldub [%o1-2], %o3 906*1e49577aSRod Evans add %o0, 4, %o0 ! advance DST by 4 907*1e49577aSRod Evans stb %o3, [%o0-2] 908*1e49577aSRod Evans ldub [%o1-1], %o3 909*1e49577aSRod Evans bgu,pt %ncc, .smallnotalign4 ! loop til 3 or fewer bytes remain 910*1e49577aSRod Evans stb %o3, [%o0-1] 911*1e49577aSRod Evans addcc %o2, 3, %o2 ! restore count 912*1e49577aSRod Evans bz,pt %ncc, .smallx 913*1e49577aSRod Evans.smallleft3: ! 1, 2, or 3 bytes remain 914*1e49577aSRod Evans subcc %o2, 1, %o2 915*1e49577aSRod Evans ldub [%o1], %o3 ! load one byte 916*1e49577aSRod Evans bz,pt %ncc, .smallx 917*1e49577aSRod Evans stb %o3, [%o0] ! store one byte 918*1e49577aSRod Evans ldub [%o1+1], %o3 ! load second byte 919*1e49577aSRod Evans subcc %o2, 1, %o2 920*1e49577aSRod Evans bz,pt %ncc, .smallx 921*1e49577aSRod Evans stb %o3, [%o0+1] ! store second byte 922*1e49577aSRod Evans ldub [%o1+2], %o3 ! load third byte 923*1e49577aSRod Evans stb %o3, [%o0+2] ! store third byte 924*1e49577aSRod Evans.smallx: 925*1e49577aSRod Evans retl 926*1e49577aSRod Evans mov %g1, %o0 ! restore %o0 927*1e49577aSRod Evans 928*1e49577aSRod Evans.smallfin: 929*1e49577aSRod Evans tst %o2 930*1e49577aSRod Evans bnz,pt %ncc, .smallleft3 931*1e49577aSRod Evans nop 932*1e49577aSRod Evans retl 933*1e49577aSRod Evans mov %g1, %o0 ! restore %o0 934*1e49577aSRod Evans 935*1e49577aSRod Evans .align 16 936*1e49577aSRod Evans.smallwords: 937*1e49577aSRod Evans lduw [%o1], %o3 ! read word 938*1e49577aSRod Evans.smallwordx: 939*1e49577aSRod Evans subcc %o2, 8, %o2 ! update count 940*1e49577aSRod Evans stw %o3, [%o0] ! write word 941*1e49577aSRod Evans add %o1, 8, %o1 ! update SRC 942*1e49577aSRod Evans lduw [%o1-4], %o3 ! read word 943*1e49577aSRod Evans add %o0, 8, %o0 ! update DST 944*1e49577aSRod Evans bgu,pt %ncc, .smallwords ! loop until done 945*1e49577aSRod Evans stw %o3, [%o0-4] ! write word 946*1e49577aSRod Evans addcc %o2, 7, %o2 ! restore count 947*1e49577aSRod Evans bz,pt %ncc, .smallexit ! check for completion 948*1e49577aSRod Evans cmp %o2, 4 ! check for 4 or more bytes left 949*1e49577aSRod Evans blt %ncc, .smallleft3 ! if not, go to finish up 950*1e49577aSRod Evans nop 951*1e49577aSRod Evans lduw [%o1], %o3 952*1e49577aSRod Evans add %o1, 4, %o1 953*1e49577aSRod Evans subcc %o2, 4, %o2 954*1e49577aSRod Evans add %o0, 4, %o0 955*1e49577aSRod Evans bnz,pt %ncc, .smallleft3 956*1e49577aSRod Evans stw %o3, [%o0-4] 957*1e49577aSRod Evans retl 958*1e49577aSRod Evans mov %g1, %o0 ! restore %o0 959*1e49577aSRod Evans 960*1e49577aSRod Evans! 8 or more bytes, src and dest start on word boundary 961*1e49577aSRod Evans! %o4 contains or %o0, %o1; %o3 contains first four bytes of src 962*1e49577aSRod Evans.smalllong: 963*1e49577aSRod Evans andcc %o4, 0x7, %o5 ! test for long alignment 964*1e49577aSRod Evans bnz,pt %ncc, .smallwordx ! branch to word aligned case 965*1e49577aSRod Evans cmp %o2, SHORT_LONG-7 966*1e49577aSRod Evans bge,a %ncc, .medl64 ! if we branch 967*1e49577aSRod Evans sub %o2,56,%o2 ! adjust %o2 to -31 off count 968*1e49577aSRod Evans sub %o1, %o0, %o1 ! 
%o1 gets the difference 969*1e49577aSRod Evans.small_long_l: 970*1e49577aSRod Evans ldx [%o1+%o0], %o3 971*1e49577aSRod Evans subcc %o2, 8, %o2 972*1e49577aSRod Evans add %o0, 8, %o0 973*1e49577aSRod Evans bgu,pt %ncc, .small_long_l ! loop until done 974*1e49577aSRod Evans stx %o3, [%o0-8] ! write word 975*1e49577aSRod Evans add %o1, %o0, %o1 ! restore %o1 976*1e49577aSRod Evans addcc %o2, 7, %o2 ! restore %o2 to correct count 977*1e49577aSRod Evans bz,pt %ncc, .smallexit ! check for completion 978*1e49577aSRod Evans cmp %o2, 4 ! check for 4 or more bytes left 979*1e49577aSRod Evans blt,pt %ncc, .smallleft3 ! if not, go to finish up 980*1e49577aSRod Evans nop 981*1e49577aSRod Evans lduw [%o1], %o3 982*1e49577aSRod Evans add %o1, 4, %o1 983*1e49577aSRod Evans subcc %o2, 4, %o2 984*1e49577aSRod Evans stw %o3, [%o0] 985*1e49577aSRod Evans add %o0, 4, %o0 986*1e49577aSRod Evans bnz,pt %ncc, .smallleft3 987*1e49577aSRod Evans nop 988*1e49577aSRod Evans retl 989*1e49577aSRod Evans mov %g1, %o0 ! restore %o0 990*1e49577aSRod Evans 991*1e49577aSRod Evans .align 16 992*1e49577aSRod Evans! src and dest start on word boundary 993*1e49577aSRod Evans.smallword: 994*1e49577aSRod Evans subcc %o2, 7, %o2 ! adjust count 995*1e49577aSRod Evans bgu,pt %ncc, .smalllong 996*1e49577aSRod Evans lduw [%o1], %o3 ! read word 997*1e49577aSRod Evans addcc %o2, 3, %o2 ! restore count 998*1e49577aSRod Evans bz,pt %ncc, .smallexit 999*1e49577aSRod Evans stw %o3, [%o0] ! write word 1000*1e49577aSRod Evans deccc %o2 ! reduce count for cc test 1001*1e49577aSRod Evans ldub [%o1+4], %o3 ! load one byte 1002*1e49577aSRod Evans bz,pt %ncc, .smallexit 1003*1e49577aSRod Evans stb %o3, [%o0+4] ! store one byte 1004*1e49577aSRod Evans ldub [%o1+5], %o3 ! load second byte 1005*1e49577aSRod Evans deccc %o2 1006*1e49577aSRod Evans bz,pt %ncc, .smallexit 1007*1e49577aSRod Evans stb %o3, [%o0+5] ! store second byte 1008*1e49577aSRod Evans ldub [%o1+6], %o3 ! load third byte 1009*1e49577aSRod Evans stb %o3, [%o0+6] ! store third byte 1010*1e49577aSRod Evans.smallexit: 1011*1e49577aSRod Evans retl 1012*1e49577aSRod Evans mov %g1, %o0 ! restore %o0 1013*1e49577aSRod Evans 1014*1e49577aSRod Evans .align 16 1015*1e49577aSRod Evans.medium: 1016*1e49577aSRod Evans neg %o0, %o5 1017*1e49577aSRod Evans andcc %o5, 7, %o5 ! bytes till DST 8 byte aligned 1018*1e49577aSRod Evans brz,pt %o5, .dst_aligned_on_8 1019*1e49577aSRod Evans 1020*1e49577aSRod Evans ! %o5 has the bytes to be written in partial store. 1021*1e49577aSRod Evans sub %o2, %o5, %o2 1022*1e49577aSRod Evans sub %o1, %o0, %o1 ! %o1 gets the difference 1023*1e49577aSRod Evans7: ! dst aligning loop 1024*1e49577aSRod Evans ldub [%o1+%o0], %o4 ! load one byte 1025*1e49577aSRod Evans subcc %o5, 1, %o5 1026*1e49577aSRod Evans stb %o4, [%o0] 1027*1e49577aSRod Evans bgu,pt %ncc, 7b 1028*1e49577aSRod Evans add %o0, 1, %o0 ! advance dst 1029*1e49577aSRod Evans add %o1, %o0, %o1 ! restore %o1 1030*1e49577aSRod Evans.dst_aligned_on_8: 1031*1e49577aSRod Evans andcc %o1, 7, %o5 1032*1e49577aSRod Evans brnz,pt %o5, .src_dst_unaligned_on_8 1033*1e49577aSRod Evans prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read 1034*1e49577aSRod Evans 1035*1e49577aSRod Evans.src_dst_aligned_on_8: 1036*1e49577aSRod Evans ! check if we are copying MED_MAX or more bytes 1037*1e49577aSRod Evans cmp %o2, MED_MAX ! 
limit to store buffer size 1038*1e49577aSRod Evans bgu,pt %ncc, .large_align8_copy 1039*1e49577aSRod Evans prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read 1040*1e49577aSRod Evans/* 1041*1e49577aSRod Evans * Special case for handling when src and dest are both long word aligned 1042*1e49577aSRod Evans * and total data to move is less than MED_MAX bytes 1043*1e49577aSRod Evans */ 1044*1e49577aSRod Evans.medlong: 1045*1e49577aSRod Evans subcc %o2, 63, %o2 ! adjust length to allow cc test 1046*1e49577aSRod Evans ble,pt %ncc, .medl63 ! skip big loop if less than 64 bytes 1047*1e49577aSRod Evans.medl64: 1048*1e49577aSRod Evans prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read ! into the l2 cache 1049*1e49577aSRod Evans ldx [%o1], %o4 ! load 1050*1e49577aSRod Evans subcc %o2, 64, %o2 ! decrement length count 1051*1e49577aSRod Evans stx %o4, [%o0] ! and store 1052*1e49577aSRod Evans ldx [%o1+8], %o3 ! a block of 64 bytes 1053*1e49577aSRod Evans stx %o3, [%o0+8] 1054*1e49577aSRod Evans ldx [%o1+16], %o4 1055*1e49577aSRod Evans stx %o4, [%o0+16] 1056*1e49577aSRod Evans ldx [%o1+24], %o3 1057*1e49577aSRod Evans stx %o3, [%o0+24] 1058*1e49577aSRod Evans ldx [%o1+32], %o4 ! load 1059*1e49577aSRod Evans stx %o4, [%o0+32] ! and store 1060*1e49577aSRod Evans ldx [%o1+40], %o3 ! a block of 64 bytes 1061*1e49577aSRod Evans add %o1, 64, %o1 ! increase src ptr by 64 1062*1e49577aSRod Evans stx %o3, [%o0+40] 1063*1e49577aSRod Evans ldx [%o1-16], %o4 1064*1e49577aSRod Evans add %o0, 64, %o0 ! increase dst ptr by 64 1065*1e49577aSRod Evans stx %o4, [%o0-16] 1066*1e49577aSRod Evans ldx [%o1-8], %o3 1067*1e49577aSRod Evans bgu,pt %ncc, .medl64 ! repeat if at least 64 bytes left 1068*1e49577aSRod Evans stx %o3, [%o0-8] 1069*1e49577aSRod Evans.medl63: 1070*1e49577aSRod Evans addcc %o2, 32, %o2 ! adjust remaining count 1071*1e49577aSRod Evans ble,pt %ncc, .medl31 ! to skip if 31 or fewer bytes left 1072*1e49577aSRod Evans nop 1073*1e49577aSRod Evans ldx [%o1], %o4 ! load 1074*1e49577aSRod Evans sub %o2, 32, %o2 ! decrement length count 1075*1e49577aSRod Evans stx %o4, [%o0] ! and store 1076*1e49577aSRod Evans ldx [%o1+8], %o3 ! a block of 32 bytes 1077*1e49577aSRod Evans add %o1, 32, %o1 ! increase src ptr by 32 1078*1e49577aSRod Evans stx %o3, [%o0+8] 1079*1e49577aSRod Evans ldx [%o1-16], %o4 1080*1e49577aSRod Evans add %o0, 32, %o0 ! increase dst ptr by 32 1081*1e49577aSRod Evans stx %o4, [%o0-16] 1082*1e49577aSRod Evans ldx [%o1-8], %o3 1083*1e49577aSRod Evans stx %o3, [%o0-8] 1084*1e49577aSRod Evans.medl31: 1085*1e49577aSRod Evans addcc %o2, 16, %o2 ! adjust remaining count 1086*1e49577aSRod Evans ble,pt %ncc, .medl15 ! skip if 15 or fewer bytes left 1087*1e49577aSRod Evans nop ! 1088*1e49577aSRod Evans ldx [%o1], %o4 ! load and store 16 bytes 1089*1e49577aSRod Evans add %o1, 16, %o1 ! increase src ptr by 16 1090*1e49577aSRod Evans stx %o4, [%o0] ! 1091*1e49577aSRod Evans sub %o2, 16, %o2 ! decrease count by 16 1092*1e49577aSRod Evans ldx [%o1-8], %o3 ! 1093*1e49577aSRod Evans add %o0, 16, %o0 ! increase dst ptr by 16 1094*1e49577aSRod Evans stx %o3, [%o0-8] 1095*1e49577aSRod Evans.medl15: 1096*1e49577aSRod Evans addcc %o2, 15, %o2 ! restore count 1097*1e49577aSRod Evans bz,pt %ncc, .smallexit ! exit if finished 1098*1e49577aSRod Evans cmp %o2, 8 1099*1e49577aSRod Evans blt,pt %ncc, .medw7 ! skip if 7 or fewer bytes left 1100*1e49577aSRod Evans tst %o2 1101*1e49577aSRod Evans ldx [%o1], %o4 ! load 8 bytes 1102*1e49577aSRod Evans add %o1, 8, %o1 ! increase src ptr by 8 1103*1e49577aSRod Evans add %o0, 8, %o0 ! 
increase dst ptr by 8 1104*1e49577aSRod Evans subcc %o2, 8, %o2 ! decrease count by 8 1105*1e49577aSRod Evans bnz,pt %ncc, .medw7 1106*1e49577aSRod Evans stx %o4, [%o0-8] ! and store 8 bytes 1107*1e49577aSRod Evans retl 1108*1e49577aSRod Evans mov %g1, %o0 ! restore %o0 1109*1e49577aSRod Evans 1110*1e49577aSRod Evans .align 16 1111*1e49577aSRod Evans.src_dst_unaligned_on_8: 1112*1e49577aSRod Evans ! DST is 8-byte aligned, src is not 1113*1e49577aSRod Evans2: 1114*1e49577aSRod Evans andcc %o1, 0x3, %o5 ! test word alignment 1115*1e49577aSRod Evans bnz,pt %ncc, .unalignsetup ! branch to skip if not word aligned 1116*1e49577aSRod Evans prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read 1117*1e49577aSRod Evans 1118*1e49577aSRod Evans/* 1119*1e49577aSRod Evans * Handle all cases where src and dest are aligned on word 1120*1e49577aSRod Evans * boundaries. Use unrolled loops for better performance. 1121*1e49577aSRod Evans * This option wins over standard large data move when 1122*1e49577aSRod Evans * source and destination is in cache for medium 1123*1e49577aSRod Evans * to short data moves. 1124*1e49577aSRod Evans */ 1125*1e49577aSRod Evans cmp %o2, MED_WMAX ! limit to store buffer size 1126*1e49577aSRod Evans bge,pt %ncc, .unalignrejoin ! otherwise rejoin main loop 1127*1e49577aSRod Evans prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read 1128*1e49577aSRod Evans 1129*1e49577aSRod Evans subcc %o2, 31, %o2 ! adjust length to allow cc test 1130*1e49577aSRod Evans ! for end of loop 1131*1e49577aSRod Evans ble,pt %ncc, .medw31 ! skip big loop if less than 16 1132*1e49577aSRod Evans prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1133*1e49577aSRod Evans.medw32: 1134*1e49577aSRod Evans ld [%o1], %o4 ! move a block of 32 bytes 1135*1e49577aSRod Evans stw %o4, [%o0] 1136*1e49577aSRod Evans ld [%o1+4], %o3 1137*1e49577aSRod Evans stw %o3, [%o0+4] 1138*1e49577aSRod Evans ld [%o1+8], %o4 1139*1e49577aSRod Evans stw %o4, [%o0+8] 1140*1e49577aSRod Evans ld [%o1+12], %o3 1141*1e49577aSRod Evans stw %o3, [%o0+12] 1142*1e49577aSRod Evans ld [%o1+16], %o4 1143*1e49577aSRod Evans subcc %o2, 32, %o2 ! decrement length count 1144*1e49577aSRod Evans stw %o4, [%o0+16] 1145*1e49577aSRod Evans ld [%o1+20], %o3 1146*1e49577aSRod Evans add %o1, 32, %o1 ! increase src ptr by 32 1147*1e49577aSRod Evans stw %o3, [%o0+20] 1148*1e49577aSRod Evans ld [%o1-8], %o4 1149*1e49577aSRod Evans add %o0, 32, %o0 ! increase dst ptr by 32 1150*1e49577aSRod Evans stw %o4, [%o0-8] 1151*1e49577aSRod Evans ld [%o1-4], %o3 1152*1e49577aSRod Evans bgu,pt %ncc, .medw32 ! repeat if at least 32 bytes left 1153*1e49577aSRod Evans stw %o3, [%o0-4] 1154*1e49577aSRod Evans.medw31: 1155*1e49577aSRod Evans addcc %o2, 31, %o2 ! restore count 1156*1e49577aSRod Evans 1157*1e49577aSRod Evans bz,pt %ncc, .smallexit ! exit if finished 1158*1e49577aSRod Evans nop 1159*1e49577aSRod Evans cmp %o2, 16 1160*1e49577aSRod Evans blt,pt %ncc, .medw15 1161*1e49577aSRod Evans nop 1162*1e49577aSRod Evans ld [%o1], %o4 ! move a block of 16 bytes 1163*1e49577aSRod Evans subcc %o2, 16, %o2 ! decrement length count 1164*1e49577aSRod Evans stw %o4, [%o0] 1165*1e49577aSRod Evans ld [%o1+4], %o3 1166*1e49577aSRod Evans add %o1, 16, %o1 ! increase src ptr by 16 1167*1e49577aSRod Evans stw %o3, [%o0+4] 1168*1e49577aSRod Evans ld [%o1-8], %o4 1169*1e49577aSRod Evans add %o0, 16, %o0 ! 
increase dst ptr by 16 1170*1e49577aSRod Evans stw %o4, [%o0-8] 1171*1e49577aSRod Evans ld [%o1-4], %o3 1172*1e49577aSRod Evans stw %o3, [%o0-4] 1173*1e49577aSRod Evans.medw15: 1174*1e49577aSRod Evans bz,pt %ncc, .smallexit ! exit if finished 1175*1e49577aSRod Evans cmp %o2, 8 1176*1e49577aSRod Evans blt,pt %ncc, .medw7 ! skip if 7 or fewer bytes left 1177*1e49577aSRod Evans tst %o2 1178*1e49577aSRod Evans ld [%o1], %o4 ! load 4 bytes 1179*1e49577aSRod Evans subcc %o2, 8, %o2 ! decrease count by 8 1180*1e49577aSRod Evans stw %o4, [%o0] ! and store 4 bytes 1181*1e49577aSRod Evans add %o1, 8, %o1 ! increase src ptr by 8 1182*1e49577aSRod Evans ld [%o1-4], %o3 ! load 4 bytes 1183*1e49577aSRod Evans add %o0, 8, %o0 ! increase dst ptr by 8 1184*1e49577aSRod Evans stw %o3, [%o0-4] ! and store 4 bytes 1185*1e49577aSRod Evans bz,pt %ncc, .smallexit ! exit if finished 1186*1e49577aSRod Evans.medw7: ! count is ge 1, less than 8 1187*1e49577aSRod Evans cmp %o2, 4 ! check for 4 bytes left 1188*1e49577aSRod Evans blt,pt %ncc, .smallleft3 ! skip if 3 or fewer bytes left 1189*1e49577aSRod Evans nop ! 1190*1e49577aSRod Evans ld [%o1], %o4 ! load 4 bytes 1191*1e49577aSRod Evans add %o1, 4, %o1 ! increase src ptr by 4 1192*1e49577aSRod Evans add %o0, 4, %o0 ! increase dst ptr by 4 1193*1e49577aSRod Evans subcc %o2, 4, %o2 ! decrease count by 4 1194*1e49577aSRod Evans bnz .smallleft3 1195*1e49577aSRod Evans stw %o4, [%o0-4] ! and store 4 bytes 1196*1e49577aSRod Evans retl 1197*1e49577aSRod Evans mov %g1, %o0 ! restore %o0 1198*1e49577aSRod Evans 1199*1e49577aSRod Evans .align 16 1200*1e49577aSRod Evans.large_align8_copy: ! Src and dst share 8 byte alignment 1201*1e49577aSRod Evans rd %fprs, %g5 ! check for unused fp 1202*1e49577aSRod Evans ! if fprs.fef == 0, set it. 1203*1e49577aSRod Evans ! Setting it when already set costs more than checking 1204*1e49577aSRod Evans andcc %g5, FPRS_FEF, %g5 ! test FEF, fprs.du = fprs.dl = 0 1205*1e49577aSRod Evans bz,a %ncc, 1f 1206*1e49577aSRod Evans wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1 1207*1e49577aSRod Evans1: 1208*1e49577aSRod Evans ! align dst to 64 byte boundary 1209*1e49577aSRod Evans andcc %o0, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned 1210*1e49577aSRod Evans brz,pn %o3, .aligned_to_64 1211*1e49577aSRod Evans andcc %o0, 8, %o3 ! odd long words to move? 1212*1e49577aSRod Evans brz,pt %o3, .aligned_to_16 1213*1e49577aSRod Evans nop 1214*1e49577aSRod Evans ldx [%o1], %o4 1215*1e49577aSRod Evans sub %o2, 8, %o2 1216*1e49577aSRod Evans add %o1, 8, %o1 ! increment src ptr 1217*1e49577aSRod Evans add %o0, 8, %o0 ! increment dst ptr 1218*1e49577aSRod Evans stx %o4, [%o0-8] 1219*1e49577aSRod Evans.aligned_to_16: 1220*1e49577aSRod Evans andcc %o0, 16, %o3 ! pair of long words to move? 1221*1e49577aSRod Evans brz,pt %o3, .aligned_to_32 1222*1e49577aSRod Evans nop 1223*1e49577aSRod Evans ldx [%o1], %o4 1224*1e49577aSRod Evans sub %o2, 16, %o2 1225*1e49577aSRod Evans stx %o4, [%o0] 1226*1e49577aSRod Evans add %o1, 16, %o1 ! increment src ptr 1227*1e49577aSRod Evans ldx [%o1-8], %o4 1228*1e49577aSRod Evans add %o0, 16, %o0 ! increment dst ptr 1229*1e49577aSRod Evans stx %o4, [%o0-8] 1230*1e49577aSRod Evans.aligned_to_32: 1231*1e49577aSRod Evans andcc %o0, 32, %o3 ! four long words to move? 
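/*
 * The staged tests in this stretch (on destination address bits 0x8,
 * 0x10 and 0x20) peel off 8, then 16, then 32 bytes so that the
 * destination lands on a 64 byte block boundary before one of the
 * block-move loops below is selected.  A compact C model of the effect
 * (illustrative sketch only -- dst, src and cnt stand for the values
 * carried in %o0, %o1 and %o2, and dst is already 8 byte aligned when
 * this path is entered):
 *
 *	while (((unsigned long)dst & 0x3f) != 0) {
 *		*(unsigned long *)dst = *(const unsigned long *)src;
 *		dst += 8;
 *		src += 8;
 *		cnt -= 8;
 *	}
 *
 * The code performs the same work without a loop, as at most three
 * unrolled steps of 8, 16 and 32 bytes.
 */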
1232*1e49577aSRod Evans brz,pt %o3, .aligned_to_64 1233*1e49577aSRod Evans nop 1234*1e49577aSRod Evans ldx [%o1], %o4 1235*1e49577aSRod Evans sub %o2, 32, %o2 1236*1e49577aSRod Evans stx %o4, [%o0] 1237*1e49577aSRod Evans ldx [%o1+8], %o4 1238*1e49577aSRod Evans stx %o4, [%o0+8] 1239*1e49577aSRod Evans ldx [%o1+16], %o4 1240*1e49577aSRod Evans stx %o4, [%o0+16] 1241*1e49577aSRod Evans add %o1, 32, %o1 ! increment src ptr 1242*1e49577aSRod Evans ldx [%o1-8], %o4 1243*1e49577aSRod Evans add %o0, 32, %o0 ! increment dst ptr 1244*1e49577aSRod Evans stx %o4, [%o0-8] 1245*1e49577aSRod Evans.aligned_to_64: 1246*1e49577aSRod Evans prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read 1247*1e49577aSRod Evans mov %asi,%o4 ! save %asi 1248*1e49577aSRod Evans ! Determine source alignment 1249*1e49577aSRod Evans ! to correct 8 byte offset 1250*1e49577aSRod Evans andcc %o1, 0x20, %o3 1251*1e49577aSRod Evans brnz,pn %o3, .align_1 1252*1e49577aSRod Evans mov ASI_BLK_P, %asi ! setup %asi for block load/store 1253*1e49577aSRod Evans andcc %o1, 0x10, %o3 1254*1e49577aSRod Evans brnz,pn %o3, .align_01 1255*1e49577aSRod Evans nop 1256*1e49577aSRod Evans andcc %o1, 0x08, %o3 1257*1e49577aSRod Evans brz,pn %o3, .align_000 1258*1e49577aSRod Evans prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1259*1e49577aSRod Evans ba .align_001 1260*1e49577aSRod Evans nop 1261*1e49577aSRod Evans.align_01: 1262*1e49577aSRod Evans andcc %o1, 0x08, %o3 1263*1e49577aSRod Evans brnz,pn %o3, .align_011 1264*1e49577aSRod Evans prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1265*1e49577aSRod Evans ba .align_010 1266*1e49577aSRod Evans nop 1267*1e49577aSRod Evans.align_1: 1268*1e49577aSRod Evans andcc %o1, 0x10, %o3 1269*1e49577aSRod Evans brnz,pn %o3, .align_11 1270*1e49577aSRod Evans nop 1271*1e49577aSRod Evans andcc %o1, 0x08, %o3 1272*1e49577aSRod Evans brnz,pn %o3, .align_101 1273*1e49577aSRod Evans prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1274*1e49577aSRod Evans ba .align_100 1275*1e49577aSRod Evans nop 1276*1e49577aSRod Evans.align_11: 1277*1e49577aSRod Evans andcc %o1, 0x08, %o3 1278*1e49577aSRod Evans brz,pn %o3, .align_110 1279*1e49577aSRod Evans prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1280*1e49577aSRod Evans 1281*1e49577aSRod Evans.align_111: 1282*1e49577aSRod Evans! Alignment off by 8 bytes 1283*1e49577aSRod Evans ldd [%o1], %d0 1284*1e49577aSRod Evans add %o1, 8, %o1 1285*1e49577aSRod Evans sub %o2, 8, %o2 1286*1e49577aSRod Evans andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 1287*1e49577aSRod Evans and %o2, 0x7f, %o2 ! residue bytes in %o2 1288*1e49577aSRod Evans.align_111_loop: 1289*1e49577aSRod Evans subcc %o5, 128, %o5 1290*1e49577aSRod Evans /* ---- copy line 1 of 2. ---- */ 1291*1e49577aSRod Evans ldda [%o1]%asi,%d16 ! block load 1292*1e49577aSRod Evans fmovd %d16, %d2 1293*1e49577aSRod Evans fmovd %d18, %d4 1294*1e49577aSRod Evans fmovd %d20, %d6 1295*1e49577aSRod Evans fmovd %d22, %d8 1296*1e49577aSRod Evans fmovd %d24, %d10 1297*1e49577aSRod Evans fmovd %d26, %d12 1298*1e49577aSRod Evans fmovd %d28, %d14 1299*1e49577aSRod Evans stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1300*1e49577aSRod Evans stda %d0,[%o0]%asi 1301*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 1302*1e49577aSRod Evans prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 1303*1e49577aSRod Evans fmovd %d30, %d0 1304*1e49577aSRod Evans 1305*1e49577aSRod Evans /* ---- copy line 2 of 2. 
---- */ 1306*1e49577aSRod Evans ldda [%o1+64]%asi,%d16 1307*1e49577aSRod Evans fmovd %d16, %d2 1308*1e49577aSRod Evans fmovd %d18, %d4 1309*1e49577aSRod Evans fmovd %d20, %d6 1310*1e49577aSRod Evans fmovd %d22, %d8 1311*1e49577aSRod Evans fmovd %d24, %d10 1312*1e49577aSRod Evans fmovd %d26, %d12 1313*1e49577aSRod Evans fmovd %d28, %d14 1314*1e49577aSRod Evans add %o1, 128, %o1 ! increment src 1315*1e49577aSRod Evans stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1316*1e49577aSRod Evans stda %d0,[%o0]%asi 1317*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 1318*1e49577aSRod Evans fmovd %d30, %d0 1319*1e49577aSRod Evans bgt,pt %ncc, .align_111_loop 1320*1e49577aSRod Evans prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1321*1e49577aSRod Evans 1322*1e49577aSRod Evans std %d0, [%o0] 1323*1e49577aSRod Evans ba .remain_stuff 1324*1e49577aSRod Evans add %o0, 8, %o0 1325*1e49577aSRod Evans ! END OF align_111 1326*1e49577aSRod Evans 1327*1e49577aSRod Evans.align_110: 1328*1e49577aSRod Evans! Alignment off by 16 bytes 1329*1e49577aSRod Evans ldd [%o1], %d0 1330*1e49577aSRod Evans ldd [%o1+8], %d2 1331*1e49577aSRod Evans add %o1, 16, %o1 1332*1e49577aSRod Evans sub %o2, 16, %o2 1333*1e49577aSRod Evans andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 1334*1e49577aSRod Evans and %o2, 0x7f, %o2 ! residue bytes in %o2 1335*1e49577aSRod Evans.align_110_loop: 1336*1e49577aSRod Evans subcc %o5, 128, %o5 1337*1e49577aSRod Evans /* ---- copy line 1 of 2. ---- */ 1338*1e49577aSRod Evans 1339*1e49577aSRod Evans ldda [%o1]%asi,%d16 ! block load 1340*1e49577aSRod Evans fmovd %d16, %d4 1341*1e49577aSRod Evans fmovd %d18, %d6 1342*1e49577aSRod Evans fmovd %d20, %d8 1343*1e49577aSRod Evans fmovd %d22, %d10 1344*1e49577aSRod Evans fmovd %d24, %d12 1345*1e49577aSRod Evans fmovd %d26, %d14 1346*1e49577aSRod Evans stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1347*1e49577aSRod Evans stda %d0,[%o0]%asi 1348*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 1349*1e49577aSRod Evans fmovd %d28, %d0 1350*1e49577aSRod Evans prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 1351*1e49577aSRod Evans fmovd %d30, %d2 1352*1e49577aSRod Evans 1353*1e49577aSRod Evans /* ---- copy line 2 of 2. ---- */ 1354*1e49577aSRod Evans ldda [%o1+64]%asi,%d16 1355*1e49577aSRod Evans fmovd %d16, %d4 1356*1e49577aSRod Evans fmovd %d18, %d6 1357*1e49577aSRod Evans fmovd %d20, %d8 1358*1e49577aSRod Evans fmovd %d22, %d10 1359*1e49577aSRod Evans fmovd %d24, %d12 1360*1e49577aSRod Evans fmovd %d26, %d14 1361*1e49577aSRod Evans add %o1, 128, %o1 ! increment src 1362*1e49577aSRod Evans stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1363*1e49577aSRod Evans stda %d0,[%o0]%asi 1364*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 1365*1e49577aSRod Evans fmovd %d28, %d0 1366*1e49577aSRod Evans fmovd %d30, %d2 1367*1e49577aSRod Evans bgt,pt %ncc, .align_110_loop 1368*1e49577aSRod Evans prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1369*1e49577aSRod Evans 1370*1e49577aSRod Evans std %d0, [%o0] 1371*1e49577aSRod Evans std %d2, [%o0+8] 1372*1e49577aSRod Evans ba .remain_stuff 1373*1e49577aSRod Evans add %o0, 16, %o0 1374*1e49577aSRod Evans ! END OF align_110 1375*1e49577aSRod Evans 1376*1e49577aSRod Evans.align_101: 1377*1e49577aSRod Evans! Alignment off by 24 bytes 1378*1e49577aSRod Evans ldd [%o1], %d0 1379*1e49577aSRod Evans ldd [%o1+8], %d2 1380*1e49577aSRod Evans ldd [%o1+16], %d4 1381*1e49577aSRod Evans add %o1, 24, %o1 1382*1e49577aSRod Evans sub %o2, 24, %o2 1383*1e49577aSRod Evans andn %o2, 0x7f, %o5 ! 
%o5 is multiple of 2*block size 1384*1e49577aSRod Evans and %o2, 0x7f, %o2 ! residue bytes in %o2 1385*1e49577aSRod Evans.align_101_loop: 1386*1e49577aSRod Evans subcc %o5, 128, %o5 1387*1e49577aSRod Evans /* ---- copy line 1 of 2. ---- */ 1388*1e49577aSRod Evans 1389*1e49577aSRod Evans ldda [%o1]%asi,%d16 ! block load 1390*1e49577aSRod Evans fmovd %d16, %d6 1391*1e49577aSRod Evans fmovd %d18, %d8 1392*1e49577aSRod Evans fmovd %d20, %d10 1393*1e49577aSRod Evans fmovd %d22, %d12 1394*1e49577aSRod Evans fmovd %d24, %d14 1395*1e49577aSRod Evans stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1396*1e49577aSRod Evans stda %d0,[%o0]%asi 1397*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 1398*1e49577aSRod Evans fmovd %d26, %d0 1399*1e49577aSRod Evans fmovd %d28, %d2 1400*1e49577aSRod Evans prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 1401*1e49577aSRod Evans fmovd %d30, %d4 1402*1e49577aSRod Evans 1403*1e49577aSRod Evans /* ---- copy line 2 of 2. ---- */ 1404*1e49577aSRod Evans ldda [%o1+64]%asi,%d16 1405*1e49577aSRod Evans fmovd %d16, %d6 1406*1e49577aSRod Evans fmovd %d18, %d8 1407*1e49577aSRod Evans fmovd %d20, %d10 1408*1e49577aSRod Evans fmovd %d22, %d12 1409*1e49577aSRod Evans fmovd %d24, %d14 1410*1e49577aSRod Evans add %o1, 128, %o1 ! increment src 1411*1e49577aSRod Evans stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1412*1e49577aSRod Evans stda %d0,[%o0]%asi 1413*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 1414*1e49577aSRod Evans fmovd %d26, %d0 1415*1e49577aSRod Evans fmovd %d28, %d2 1416*1e49577aSRod Evans fmovd %d30, %d4 1417*1e49577aSRod Evans bgt,pt %ncc, .align_101_loop 1418*1e49577aSRod Evans prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1419*1e49577aSRod Evans 1420*1e49577aSRod Evans std %d0, [%o0] 1421*1e49577aSRod Evans std %d2, [%o0+8] 1422*1e49577aSRod Evans std %d4, [%o0+16] 1423*1e49577aSRod Evans ba .remain_stuff 1424*1e49577aSRod Evans add %o0, 24, %o0 1425*1e49577aSRod Evans ! END OF align_101 1426*1e49577aSRod Evans 1427*1e49577aSRod Evans.align_100: 1428*1e49577aSRod Evans! Alignment off by 32 bytes 1429*1e49577aSRod Evans ldd [%o1], %d0 1430*1e49577aSRod Evans ldd [%o1+8], %d2 1431*1e49577aSRod Evans ldd [%o1+16],%d4 1432*1e49577aSRod Evans ldd [%o1+24],%d6 1433*1e49577aSRod Evans add %o1, 32, %o1 1434*1e49577aSRod Evans sub %o2, 32, %o2 1435*1e49577aSRod Evans andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 1436*1e49577aSRod Evans and %o2, 0x7f, %o2 ! residue bytes in %o2 1437*1e49577aSRod Evans.align_100_loop: 1438*1e49577aSRod Evans subcc %o5, 128, %o5 1439*1e49577aSRod Evans /* ---- copy line 1 of 2. ---- */ 1440*1e49577aSRod Evans ldda [%o1]%asi,%d16 ! block load 1441*1e49577aSRod Evans fmovd %d16, %d8 1442*1e49577aSRod Evans fmovd %d18, %d10 1443*1e49577aSRod Evans fmovd %d20, %d12 1444*1e49577aSRod Evans fmovd %d22, %d14 1445*1e49577aSRod Evans stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1446*1e49577aSRod Evans stda %d0,[%o0]%asi 1447*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 1448*1e49577aSRod Evans fmovd %d24, %d0 1449*1e49577aSRod Evans fmovd %d26, %d2 1450*1e49577aSRod Evans fmovd %d28, %d4 1451*1e49577aSRod Evans prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 1452*1e49577aSRod Evans fmovd %d30, %d6 1453*1e49577aSRod Evans 1454*1e49577aSRod Evans /* ---- copy line 2 of 2. 
---- */ 1455*1e49577aSRod Evans ldda [%o1+64]%asi,%d16 1456*1e49577aSRod Evans fmovd %d16, %d8 1457*1e49577aSRod Evans fmovd %d18, %d10 1458*1e49577aSRod Evans fmovd %d20, %d12 1459*1e49577aSRod Evans fmovd %d22, %d14 1460*1e49577aSRod Evans add %o1, 128, %o1 ! increment src 1461*1e49577aSRod Evans stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1462*1e49577aSRod Evans stda %d0,[%o0]%asi 1463*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 1464*1e49577aSRod Evans fmovd %d24, %d0 1465*1e49577aSRod Evans fmovd %d26, %d2 1466*1e49577aSRod Evans fmovd %d28, %d4 1467*1e49577aSRod Evans fmovd %d30, %d6 1468*1e49577aSRod Evans bgt,pt %ncc, .align_100_loop 1469*1e49577aSRod Evans prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1470*1e49577aSRod Evans 1471*1e49577aSRod Evans std %d0, [%o0] 1472*1e49577aSRod Evans std %d2, [%o0+8] 1473*1e49577aSRod Evans std %d4, [%o0+16] 1474*1e49577aSRod Evans std %d6, [%o0+24] 1475*1e49577aSRod Evans ba .remain_stuff 1476*1e49577aSRod Evans add %o0, 32, %o0 1477*1e49577aSRod Evans ! END OF align_100 1478*1e49577aSRod Evans 1479*1e49577aSRod Evans.align_011: 1480*1e49577aSRod Evans! Alignment off by 40 bytes 1481*1e49577aSRod Evans ldd [%o1], %d0 1482*1e49577aSRod Evans ldd [%o1+8], %d2 1483*1e49577aSRod Evans ldd [%o1+16], %d4 1484*1e49577aSRod Evans ldd [%o1+24], %d6 1485*1e49577aSRod Evans ldd [%o1+32], %d8 1486*1e49577aSRod Evans add %o1, 40, %o1 1487*1e49577aSRod Evans sub %o2, 40, %o2 1488*1e49577aSRod Evans andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 1489*1e49577aSRod Evans and %o2, 0x7f, %o2 ! residue bytes in %o2 1490*1e49577aSRod Evans.align_011_loop: 1491*1e49577aSRod Evans subcc %o5, 128, %o5 1492*1e49577aSRod Evans /* ---- copy line 1 of 2. ---- */ 1493*1e49577aSRod Evans 1494*1e49577aSRod Evans ldda [%o1]%asi,%d16 ! block load 1495*1e49577aSRod Evans fmovd %d16, %d10 1496*1e49577aSRod Evans fmovd %d18, %d12 1497*1e49577aSRod Evans fmovd %d20, %d14 1498*1e49577aSRod Evans stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1499*1e49577aSRod Evans stda %d0,[%o0]%asi 1500*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 1501*1e49577aSRod Evans fmovd %d22, %d0 1502*1e49577aSRod Evans fmovd %d24, %d2 1503*1e49577aSRod Evans fmovd %d26, %d4 1504*1e49577aSRod Evans fmovd %d28, %d6 1505*1e49577aSRod Evans prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 1506*1e49577aSRod Evans fmovd %d30, %d8 1507*1e49577aSRod Evans 1508*1e49577aSRod Evans /* ---- copy line 2 of 2. ---- */ 1509*1e49577aSRod Evans ldda [%o1+64]%asi,%d16 1510*1e49577aSRod Evans fmovd %d16, %d10 1511*1e49577aSRod Evans fmovd %d18, %d12 1512*1e49577aSRod Evans fmovd %d20, %d14 1513*1e49577aSRod Evans add %o1, 128, %o1 ! increment src 1514*1e49577aSRod Evans stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1515*1e49577aSRod Evans stda %d0,[%o0]%asi 1516*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 1517*1e49577aSRod Evans fmovd %d22, %d0 1518*1e49577aSRod Evans fmovd %d24, %d2 1519*1e49577aSRod Evans fmovd %d26, %d4 1520*1e49577aSRod Evans fmovd %d28, %d6 1521*1e49577aSRod Evans fmovd %d30, %d8 1522*1e49577aSRod Evans bgt,pt %ncc, .align_011_loop 1523*1e49577aSRod Evans prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1524*1e49577aSRod Evans 1525*1e49577aSRod Evans std %d0, [%o0] 1526*1e49577aSRod Evans std %d2, [%o0+8] 1527*1e49577aSRod Evans std %d4, [%o0+16] 1528*1e49577aSRod Evans std %d6, [%o0+24] 1529*1e49577aSRod Evans std %d8, [%o0+32] 1530*1e49577aSRod Evans ba .remain_stuff 1531*1e49577aSRod Evans add %o0, 40, %o0 1532*1e49577aSRod Evans ! 
END OF align_011 1533*1e49577aSRod Evans 1534*1e49577aSRod Evans.align_010: 1535*1e49577aSRod Evans! Alignment off by 48 bytes 1536*1e49577aSRod Evans ldd [%o1], %d0 1537*1e49577aSRod Evans ldd [%o1+8], %d2 1538*1e49577aSRod Evans ldd [%o1+16], %d4 1539*1e49577aSRod Evans ldd [%o1+24], %d6 1540*1e49577aSRod Evans ldd [%o1+32], %d8 1541*1e49577aSRod Evans ldd [%o1+40], %d10 1542*1e49577aSRod Evans add %o1, 48, %o1 1543*1e49577aSRod Evans sub %o2, 48, %o2 1544*1e49577aSRod Evans andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 1545*1e49577aSRod Evans and %o2, 0x7f, %o2 ! residue bytes in %o2 1546*1e49577aSRod Evans.align_010_loop: 1547*1e49577aSRod Evans subcc %o5, 128, %o5 1548*1e49577aSRod Evans /* ---- copy line 1 of 2. ---- */ 1549*1e49577aSRod Evans 1550*1e49577aSRod Evans ldda [%o1]%asi,%d16 ! block load 1551*1e49577aSRod Evans fmovd %d16, %d12 1552*1e49577aSRod Evans fmovd %d18, %d14 1553*1e49577aSRod Evans stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1554*1e49577aSRod Evans stda %d0,[%o0]%asi 1555*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 1556*1e49577aSRod Evans fmovd %d20, %d0 1557*1e49577aSRod Evans fmovd %d22, %d2 1558*1e49577aSRod Evans fmovd %d24, %d4 1559*1e49577aSRod Evans fmovd %d26, %d6 1560*1e49577aSRod Evans fmovd %d28, %d8 1561*1e49577aSRod Evans prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 1562*1e49577aSRod Evans fmovd %d30, %d10 1563*1e49577aSRod Evans 1564*1e49577aSRod Evans /* ---- copy line 2 of 2. ---- */ 1565*1e49577aSRod Evans ldda [%o1+64]%asi,%d16 1566*1e49577aSRod Evans fmovd %d16, %d12 1567*1e49577aSRod Evans fmovd %d18, %d14 1568*1e49577aSRod Evans add %o1, 128, %o1 ! increment src 1569*1e49577aSRod Evans stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1570*1e49577aSRod Evans stda %d0,[%o0]%asi 1571*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 1572*1e49577aSRod Evans fmovd %d20, %d0 1573*1e49577aSRod Evans fmovd %d22, %d2 1574*1e49577aSRod Evans fmovd %d24, %d4 1575*1e49577aSRod Evans fmovd %d26, %d6 1576*1e49577aSRod Evans fmovd %d28, %d8 1577*1e49577aSRod Evans fmovd %d30, %d10 1578*1e49577aSRod Evans bgt,pt %ncc, .align_010_loop 1579*1e49577aSRod Evans prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1580*1e49577aSRod Evans 1581*1e49577aSRod Evans std %d0, [%o0] 1582*1e49577aSRod Evans std %d2, [%o0+8] 1583*1e49577aSRod Evans std %d4, [%o0+16] 1584*1e49577aSRod Evans std %d6, [%o0+24] 1585*1e49577aSRod Evans std %d8, [%o0+32] 1586*1e49577aSRod Evans std %d10, [%o0+40] 1587*1e49577aSRod Evans ba .remain_stuff 1588*1e49577aSRod Evans add %o0, 48, %o0 1589*1e49577aSRod Evans ! END OF align_010 1590*1e49577aSRod Evans 1591*1e49577aSRod Evans.align_001: 1592*1e49577aSRod Evans! Alignment off by 56 bytes 1593*1e49577aSRod Evans ldd [%o1], %d0 1594*1e49577aSRod Evans ldd [%o1+8], %d2 1595*1e49577aSRod Evans ldd [%o1+16], %d4 1596*1e49577aSRod Evans ldd [%o1+24], %d6 1597*1e49577aSRod Evans ldd [%o1+32], %d8 1598*1e49577aSRod Evans ldd [%o1+40], %d10 1599*1e49577aSRod Evans ldd [%o1+48], %d12 1600*1e49577aSRod Evans add %o1, 56, %o1 1601*1e49577aSRod Evans sub %o2, 56, %o2 1602*1e49577aSRod Evans andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 1603*1e49577aSRod Evans and %o2, 0x7f, %o2 ! residue bytes in %o2 1604*1e49577aSRod Evans.align_001_loop: 1605*1e49577aSRod Evans subcc %o5, 128, %o5 1606*1e49577aSRod Evans /* ---- copy line 1 of 2. ---- */ 1607*1e49577aSRod Evans 1608*1e49577aSRod Evans ldda [%o1]%asi,%d16 ! block load 1609*1e49577aSRod Evans fmovd %d16, %d14 1610*1e49577aSRod Evans stxa %g0,[%o0]ASI_STBI_P ! 
block initializing store 1611*1e49577aSRod Evans stda %d0,[%o0]%asi 1612*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 1613*1e49577aSRod Evans fmovd %d18, %d0 1614*1e49577aSRod Evans fmovd %d20, %d2 1615*1e49577aSRod Evans fmovd %d22, %d4 1616*1e49577aSRod Evans fmovd %d24, %d6 1617*1e49577aSRod Evans fmovd %d26, %d8 1618*1e49577aSRod Evans fmovd %d28, %d10 1619*1e49577aSRod Evans prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 1620*1e49577aSRod Evans fmovd %d30, %d12 1621*1e49577aSRod Evans 1622*1e49577aSRod Evans /* ---- copy line 2 of 2. ---- */ 1623*1e49577aSRod Evans ldda [%o1+64]%asi,%d16 1624*1e49577aSRod Evans fmovd %d16, %d14 1625*1e49577aSRod Evans add %o1, 128, %o1 ! increment src 1626*1e49577aSRod Evans stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1627*1e49577aSRod Evans stda %d0,[%o0]%asi 1628*1e49577aSRod Evans add %o0, 64, %o0 ! advance dst 1629*1e49577aSRod Evans fmovd %d18, %d0 1630*1e49577aSRod Evans fmovd %d20, %d2 1631*1e49577aSRod Evans fmovd %d22, %d4 1632*1e49577aSRod Evans fmovd %d24, %d6 1633*1e49577aSRod Evans fmovd %d26, %d8 1634*1e49577aSRod Evans fmovd %d28, %d10 1635*1e49577aSRod Evans fmovd %d30, %d12 1636*1e49577aSRod Evans bgt,pt %ncc, .align_001_loop 1637*1e49577aSRod Evans prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1638*1e49577aSRod Evans 1639*1e49577aSRod Evans std %d0, [%o0] 1640*1e49577aSRod Evans std %d2, [%o0+8] 1641*1e49577aSRod Evans std %d4, [%o0+16] 1642*1e49577aSRod Evans std %d6, [%o0+24] 1643*1e49577aSRod Evans std %d8, [%o0+32] 1644*1e49577aSRod Evans std %d10, [%o0+40] 1645*1e49577aSRod Evans std %d12, [%o0+48] 1646*1e49577aSRod Evans ba .remain_stuff 1647*1e49577aSRod Evans add %o0, 56, %o0 1648*1e49577aSRod Evans ! END OF align_001 1649*1e49577aSRod Evans 1650*1e49577aSRod Evans.align_000: 1651*1e49577aSRod Evans andn %o2, 0x7f, %o5 ! %o5 is multiple of 2*block size 1652*1e49577aSRod Evans and %o2, 0x7f, %o2 ! residue bytes in %o2 1653*1e49577aSRod Evans.align_000_loop: 1654*1e49577aSRod Evans /* ---- copy line 1 of 2. ---- */ 1655*1e49577aSRod Evans subcc %o5, 128, %o5 1656*1e49577aSRod Evans ldda [%o1]%asi,%d0 1657*1e49577aSRod Evans stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1658*1e49577aSRod Evans stda %d0,[%o0]%asi 1659*1e49577aSRod Evans prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read 1660*1e49577aSRod Evans 1661*1e49577aSRod Evans /* ---- copy line 2 of 2. ---- */ 1662*1e49577aSRod Evans add %o0, 64, %o0 1663*1e49577aSRod Evans ldda [%o1+64]%asi,%d0 1664*1e49577aSRod Evans add %o1, 128, %o1 ! increment src 1665*1e49577aSRod Evans stxa %g0,[%o0]ASI_STBI_P ! block initializing store 1666*1e49577aSRod Evans stda %d0,[%o0]%asi 1667*1e49577aSRod Evans add %o0, 64, %o0 ! increment dst 1668*1e49577aSRod Evans bgt,pt %ncc, .align_000_loop 1669*1e49577aSRod Evans prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read 1670*1e49577aSRod Evans 1671*1e49577aSRod Evans ! END OF align_000 1672*1e49577aSRod Evans 1673*1e49577aSRod Evans.remain_stuff: 1674*1e49577aSRod Evans mov %o4, %asi ! restore %asi 1675*1e49577aSRod Evans brnz %g5, .medlong 1676*1e49577aSRod Evans membar #Sync 1677*1e49577aSRod Evans ba .medlong 1678*1e49577aSRod Evans wr %g5, %g0, %fprs 1679*1e49577aSRod Evans 1680*1e49577aSRod Evans .align 16 1681*1e49577aSRod Evans ! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX 1682*1e49577aSRod Evans.unalignsetup: 1683*1e49577aSRod Evans prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read 1684*1e49577aSRod Evans.unalignrejoin: 1685*1e49577aSRod Evans rd %fprs, %g5 ! check for unused fp 1686*1e49577aSRod Evans ! 
if fprs.fef == 0, set it. 1687*1e49577aSRod Evans ! Setting it when already set costs more than checking 1688*1e49577aSRod Evans andcc %g5, FPRS_FEF, %g5 ! test FEF, fprs.du = fprs.dl = 0 1689*1e49577aSRod Evans bz,a %ncc, 1f 1690*1e49577aSRod Evans wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1 1691*1e49577aSRod Evans1: 1692*1e49577aSRod Evans cmp %o2, MED_UMAX ! check for medium unaligned limit 1693*1e49577aSRod Evans bge,pt %ncc,.unalign_large 1694*1e49577aSRod Evans nop 1695*1e49577aSRod Evans andn %o2, 0x3f, %o5 ! %o5 is multiple of block size 1696*1e49577aSRod Evans and %o2, 0x3f, %o2 ! residue bytes in %o2 1697*1e49577aSRod Evans cmp %o2, 8 ! Insure we don't load beyond 1698*1e49577aSRod Evans bgt .unalign_adjust ! end of source buffer 1699*1e49577aSRod Evans andn %o1, 0x7, %o4 ! %o4 has long word aligned src address 1700*1e49577aSRod Evans add %o2, 64, %o2 ! adjust to leave loop 1701*1e49577aSRod Evans sub %o5, 64, %o5 ! early if necessary 1702*1e49577aSRod Evans.unalign_adjust: 1703*1e49577aSRod Evans alignaddr %o1, %g0, %g0 ! generate %gsr 1704*1e49577aSRod Evans add %o1, %o5, %o1 ! advance %o1 to after blocks 1705*1e49577aSRod Evans ldd [%o4], %d0 1706*1e49577aSRod Evans.unalign_loop: 1707*1e49577aSRod Evans ldd [%o4+8], %d2 1708*1e49577aSRod Evans faligndata %d0, %d2, %d16 1709*1e49577aSRod Evans ldd [%o4+16], %d4 1710*1e49577aSRod Evans std %d16, [%o0] 1711*1e49577aSRod Evans faligndata %d2, %d4, %d18 1712*1e49577aSRod Evans ldd [%o4+24], %d6 1713*1e49577aSRod Evans std %d18, [%o0+8] 1714*1e49577aSRod Evans faligndata %d4, %d6, %d20 1715*1e49577aSRod Evans ldd [%o4+32], %d8 1716*1e49577aSRod Evans std %d20, [%o0+16] 1717*1e49577aSRod Evans faligndata %d6, %d8, %d22 1718*1e49577aSRod Evans ldd [%o4+40], %d10 1719*1e49577aSRod Evans std %d22, [%o0+24] 1720*1e49577aSRod Evans faligndata %d8, %d10, %d24 1721*1e49577aSRod Evans ldd [%o4+48], %d12 1722*1e49577aSRod Evans std %d24, [%o0+32] 1723*1e49577aSRod Evans faligndata %d10, %d12, %d26 1724*1e49577aSRod Evans ldd [%o4+56], %d14 1725*1e49577aSRod Evans std %d26, [%o0+40] 1726*1e49577aSRod Evans faligndata %d12, %d14, %d28 1727*1e49577aSRod Evans ldd [%o4+64], %d0 1728*1e49577aSRod Evans std %d28, [%o0+48] 1729*1e49577aSRod Evans faligndata %d14, %d0, %d30 1730*1e49577aSRod Evans add %o4, BLOCK_SIZE, %o4 1731*1e49577aSRod Evans std %d30, [%o0+56] 1732*1e49577aSRod Evans add %o0, BLOCK_SIZE, %o0 1733*1e49577aSRod Evans subcc %o5, BLOCK_SIZE, %o5 1734*1e49577aSRod Evans bgu,pt %ncc, .unalign_loop 1735*1e49577aSRod Evans prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 1736*1e49577aSRod Evans ba .unalign_done 1737*1e49577aSRod Evans nop 1738*1e49577aSRod Evans 1739*1e49577aSRod Evans.unalign_large: 1740*1e49577aSRod Evans andcc %o0, 0x3f, %o3 ! is dst 64-byte block aligned? 1741*1e49577aSRod Evans bz %ncc, .unalignsrc 1742*1e49577aSRod Evans sub %o3, 64, %o3 ! %o3 will be multiple of 8 1743*1e49577aSRod Evans neg %o3 ! bytes until dest is 64 byte aligned 1744*1e49577aSRod Evans sub %o2, %o3, %o2 ! update cnt with bytes to be moved 1745*1e49577aSRod Evans ! Move bytes according to source alignment 1746*1e49577aSRod Evans andcc %o1, 0x1, %o5 1747*1e49577aSRod Evans bnz %ncc, .unalignbyte ! check for byte alignment 1748*1e49577aSRod Evans nop 1749*1e49577aSRod Evans andcc %o1, 2, %o5 ! check for half word alignment 1750*1e49577aSRod Evans bnz %ncc, .unalignhalf 1751*1e49577aSRod Evans nop 1752*1e49577aSRod Evans ! Src is word aligned 1753*1e49577aSRod Evans.unalignword: 1754*1e49577aSRod Evans ld [%o1], %o4 ! 
load 4 bytes 1755*1e49577aSRod Evans stw %o4, [%o0] ! and store 4 bytes 1756*1e49577aSRod Evans ld [%o1+4], %o4 ! load 4 bytes 1757*1e49577aSRod Evans add %o1, 8, %o1 ! increase src ptr by 8 1758*1e49577aSRod Evans stw %o4, [%o0+4] ! and store 4 bytes 1759*1e49577aSRod Evans subcc %o3, 8, %o3 ! decrease count by 8 1760*1e49577aSRod Evans bnz %ncc, .unalignword 1761*1e49577aSRod Evans add %o0, 8, %o0 ! increase dst ptr by 8 1762*1e49577aSRod Evans ba .unalignsrc 1763*1e49577aSRod Evans nop 1764*1e49577aSRod Evans 1765*1e49577aSRod Evans ! Src is half-word aligned 1766*1e49577aSRod Evans.unalignhalf: 1767*1e49577aSRod Evans lduh [%o1], %o4 ! load 2 bytes 1768*1e49577aSRod Evans sllx %o4, 32, %o5 ! shift left 1769*1e49577aSRod Evans lduw [%o1+2], %o4 1770*1e49577aSRod Evans or %o4, %o5, %o5 1771*1e49577aSRod Evans sllx %o5, 16, %o5 1772*1e49577aSRod Evans lduh [%o1+6], %o4 1773*1e49577aSRod Evans or %o4, %o5, %o5 1774*1e49577aSRod Evans stx %o5, [%o0] 1775*1e49577aSRod Evans add %o1, 8, %o1 1776*1e49577aSRod Evans subcc %o3, 8, %o3 1777*1e49577aSRod Evans bnz %ncc, .unalignhalf 1778*1e49577aSRod Evans add %o0, 8, %o0 1779*1e49577aSRod Evans ba .unalignsrc 1780*1e49577aSRod Evans nop 1781*1e49577aSRod Evans 1782*1e49577aSRod Evans ! Src is Byte aligned 1783*1e49577aSRod Evans.unalignbyte: 1784*1e49577aSRod Evans sub %o0, %o1, %o0 ! share pointer advance 1785*1e49577aSRod Evans.unalignbyte_loop: 1786*1e49577aSRod Evans ldub [%o1], %o4 1787*1e49577aSRod Evans sllx %o4, 56, %o5 1788*1e49577aSRod Evans lduh [%o1+1], %o4 1789*1e49577aSRod Evans sllx %o4, 40, %o4 1790*1e49577aSRod Evans or %o4, %o5, %o5 1791*1e49577aSRod Evans lduh [%o1+3], %o4 1792*1e49577aSRod Evans sllx %o4, 24, %o4 1793*1e49577aSRod Evans or %o4, %o5, %o5 1794*1e49577aSRod Evans lduh [%o1+5], %o4 1795*1e49577aSRod Evans sllx %o4, 8, %o4 1796*1e49577aSRod Evans or %o4, %o5, %o5 1797*1e49577aSRod Evans ldub [%o1+7], %o4 1798*1e49577aSRod Evans or %o4, %o5, %o5 1799*1e49577aSRod Evans stx %o5, [%o0+%o1] 1800*1e49577aSRod Evans subcc %o3, 8, %o3 1801*1e49577aSRod Evans bnz %ncc, .unalignbyte_loop 1802*1e49577aSRod Evans add %o1, 8, %o1 1803*1e49577aSRod Evans add %o0,%o1, %o0 ! restore pointer 1804*1e49577aSRod Evans 1805*1e49577aSRod Evans ! Destination is now block (64 byte aligned) 1806*1e49577aSRod Evans.unalignsrc: 1807*1e49577aSRod Evans andn %o2, 0x3f, %o5 ! %o5 is multiple of block size 1808*1e49577aSRod Evans and %o2, 0x3f, %o2 ! residue bytes in %o2 1809*1e49577aSRod Evans add %o2, 64, %o2 ! Insure we don't load beyond 1810*1e49577aSRod Evans sub %o5, 64, %o5 ! end of source buffer 1811*1e49577aSRod Evans 1812*1e49577aSRod Evans andn %o1, 0x3f, %o4 ! %o4 has block aligned src address 1813*1e49577aSRod Evans prefetch [%o4 + (3 * BLOCK_SIZE)], #one_read 1814*1e49577aSRod Evans alignaddr %o1, %g0, %g0 ! generate %gsr 1815*1e49577aSRod Evans add %o1, %o5, %o1 ! advance %o1 to after blocks 1816*1e49577aSRod Evans ! 1817*1e49577aSRod Evans ! 
Determine source alignment to correct 8 byte offset 1818*1e49577aSRod Evans andcc %o1, 0x20, %o3 1819*1e49577aSRod Evans brnz,pn %o3, .unalign_1 1820*1e49577aSRod Evans nop 1821*1e49577aSRod Evans andcc %o1, 0x10, %o3 1822*1e49577aSRod Evans brnz,pn %o3, .unalign_01 1823*1e49577aSRod Evans nop 1824*1e49577aSRod Evans andcc %o1, 0x08, %o3 1825*1e49577aSRod Evans brz,a %o3, .unalign_000 1826*1e49577aSRod Evans prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 1827*1e49577aSRod Evans ba .unalign_001 1828*1e49577aSRod Evans prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 1829*1e49577aSRod Evans.unalign_01: 1830*1e49577aSRod Evans andcc %o1, 0x08, %o3 1831*1e49577aSRod Evans brnz,a %o3, .unalign_011 1832*1e49577aSRod Evans prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 1833*1e49577aSRod Evans ba .unalign_010 1834*1e49577aSRod Evans prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 1835*1e49577aSRod Evans.unalign_1: 1836*1e49577aSRod Evans andcc %o1, 0x10, %o3 1837*1e49577aSRod Evans brnz,pn %o3, .unalign_11 1838*1e49577aSRod Evans nop 1839*1e49577aSRod Evans andcc %o1, 0x08, %o3 1840*1e49577aSRod Evans brnz,a %o3, .unalign_101 1841*1e49577aSRod Evans prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 1842*1e49577aSRod Evans ba .unalign_100 1843*1e49577aSRod Evans prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 1844*1e49577aSRod Evans.unalign_11: 1845*1e49577aSRod Evans andcc %o1, 0x08, %o3 1846*1e49577aSRod Evans brz,pn %o3, .unalign_110 1847*1e49577aSRod Evans prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 1848*1e49577aSRod Evans 1849*1e49577aSRod Evans.unalign_111: 1850*1e49577aSRod Evans ldd [%o4+56], %d14 1851*1e49577aSRod Evans.unalign_111_loop: 1852*1e49577aSRod Evans add %o4, 64, %o4 1853*1e49577aSRod Evans ldda [%o4]ASI_BLK_P, %d16 1854*1e49577aSRod Evans faligndata %d14, %d16, %d48 1855*1e49577aSRod Evans faligndata %d16, %d18, %d50 1856*1e49577aSRod Evans faligndata %d18, %d20, %d52 1857*1e49577aSRod Evans faligndata %d20, %d22, %d54 1858*1e49577aSRod Evans faligndata %d22, %d24, %d56 1859*1e49577aSRod Evans faligndata %d24, %d26, %d58 1860*1e49577aSRod Evans faligndata %d26, %d28, %d60 1861*1e49577aSRod Evans faligndata %d28, %d30, %d62 1862*1e49577aSRod Evans fmovd %d30, %d14 1863*1e49577aSRod Evans stda %d48, [%o0]ASI_BLK_P 1864*1e49577aSRod Evans subcc %o5, 64, %o5 1865*1e49577aSRod Evans add %o0, 64, %o0 1866*1e49577aSRod Evans bgu,pt %ncc, .unalign_111_loop 1867*1e49577aSRod Evans prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 1868*1e49577aSRod Evans ba .unalign_done 1869*1e49577aSRod Evans membar #Sync 1870*1e49577aSRod Evans 1871*1e49577aSRod Evans.unalign_110: 1872*1e49577aSRod Evans ldd [%o4+48], %d12 1873*1e49577aSRod Evans ldd [%o4+56], %d14 1874*1e49577aSRod Evans.unalign_110_loop: 1875*1e49577aSRod Evans add %o4, 64, %o4 1876*1e49577aSRod Evans ldda [%o4]ASI_BLK_P, %d16 1877*1e49577aSRod Evans faligndata %d12, %d14, %d48 1878*1e49577aSRod Evans faligndata %d14, %d16, %d50 1879*1e49577aSRod Evans faligndata %d16, %d18, %d52 1880*1e49577aSRod Evans faligndata %d18, %d20, %d54 1881*1e49577aSRod Evans faligndata %d20, %d22, %d56 1882*1e49577aSRod Evans faligndata %d22, %d24, %d58 1883*1e49577aSRod Evans faligndata %d24, %d26, %d60 1884*1e49577aSRod Evans faligndata %d26, %d28, %d62 1885*1e49577aSRod Evans fmovd %d28, %d12 1886*1e49577aSRod Evans fmovd %d30, %d14 1887*1e49577aSRod Evans stda %d48, [%o0]ASI_BLK_P 1888*1e49577aSRod Evans subcc %o5, 64, %o5 1889*1e49577aSRod Evans add %o0, 64, %o0 1890*1e49577aSRod Evans bgu,pt %ncc, .unalign_110_loop 1891*1e49577aSRod Evans prefetch [%o4 + (4 * 
BLOCK_SIZE)], #one_read 1892*1e49577aSRod Evans ba .unalign_done 1893*1e49577aSRod Evans membar #Sync 1894*1e49577aSRod Evans 1895*1e49577aSRod Evans.unalign_101: 1896*1e49577aSRod Evans ldd [%o4+40], %d10 1897*1e49577aSRod Evans ldd [%o4+48], %d12 1898*1e49577aSRod Evans ldd [%o4+56], %d14 1899*1e49577aSRod Evans.unalign_101_loop: 1900*1e49577aSRod Evans add %o4, 64, %o4 1901*1e49577aSRod Evans ldda [%o4]ASI_BLK_P, %d16 1902*1e49577aSRod Evans faligndata %d10, %d12, %d48 1903*1e49577aSRod Evans faligndata %d12, %d14, %d50 1904*1e49577aSRod Evans faligndata %d14, %d16, %d52 1905*1e49577aSRod Evans faligndata %d16, %d18, %d54 1906*1e49577aSRod Evans faligndata %d18, %d20, %d56 1907*1e49577aSRod Evans faligndata %d20, %d22, %d58 1908*1e49577aSRod Evans faligndata %d22, %d24, %d60 1909*1e49577aSRod Evans faligndata %d24, %d26, %d62 1910*1e49577aSRod Evans fmovd %d26, %d10 1911*1e49577aSRod Evans fmovd %d28, %d12 1912*1e49577aSRod Evans fmovd %d30, %d14 1913*1e49577aSRod Evans stda %d48, [%o0]ASI_BLK_P 1914*1e49577aSRod Evans subcc %o5, 64, %o5 1915*1e49577aSRod Evans add %o0, 64, %o0 1916*1e49577aSRod Evans bgu,pt %ncc, .unalign_101_loop 1917*1e49577aSRod Evans prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 1918*1e49577aSRod Evans ba .unalign_done 1919*1e49577aSRod Evans membar #Sync 1920*1e49577aSRod Evans 1921*1e49577aSRod Evans.unalign_100: 1922*1e49577aSRod Evans ldd [%o4+32], %d8 1923*1e49577aSRod Evans ldd [%o4+40], %d10 1924*1e49577aSRod Evans ldd [%o4+48], %d12 1925*1e49577aSRod Evans ldd [%o4+56], %d14 1926*1e49577aSRod Evans.unalign_100_loop: 1927*1e49577aSRod Evans add %o4, 64, %o4 1928*1e49577aSRod Evans ldda [%o4]ASI_BLK_P, %d16 1929*1e49577aSRod Evans faligndata %d8, %d10, %d48 1930*1e49577aSRod Evans faligndata %d10, %d12, %d50 1931*1e49577aSRod Evans faligndata %d12, %d14, %d52 1932*1e49577aSRod Evans faligndata %d14, %d16, %d54 1933*1e49577aSRod Evans faligndata %d16, %d18, %d56 1934*1e49577aSRod Evans faligndata %d18, %d20, %d58 1935*1e49577aSRod Evans faligndata %d20, %d22, %d60 1936*1e49577aSRod Evans faligndata %d22, %d24, %d62 1937*1e49577aSRod Evans fmovd %d24, %d8 1938*1e49577aSRod Evans fmovd %d26, %d10 1939*1e49577aSRod Evans fmovd %d28, %d12 1940*1e49577aSRod Evans fmovd %d30, %d14 1941*1e49577aSRod Evans stda %d48, [%o0]ASI_BLK_P 1942*1e49577aSRod Evans subcc %o5, 64, %o5 1943*1e49577aSRod Evans add %o0, 64, %o0 1944*1e49577aSRod Evans bgu,pt %ncc, .unalign_100_loop 1945*1e49577aSRod Evans prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 1946*1e49577aSRod Evans ba .unalign_done 1947*1e49577aSRod Evans membar #Sync 1948*1e49577aSRod Evans 1949*1e49577aSRod Evans.unalign_011: 1950*1e49577aSRod Evans ldd [%o4+24], %d6 1951*1e49577aSRod Evans ldd [%o4+32], %d8 1952*1e49577aSRod Evans ldd [%o4+40], %d10 1953*1e49577aSRod Evans ldd [%o4+48], %d12 1954*1e49577aSRod Evans ldd [%o4+56], %d14 1955*1e49577aSRod Evans.unalign_011_loop: 1956*1e49577aSRod Evans add %o4, 64, %o4 1957*1e49577aSRod Evans ldda [%o4]ASI_BLK_P, %d16 1958*1e49577aSRod Evans faligndata %d6, %d8, %d48 1959*1e49577aSRod Evans faligndata %d8, %d10, %d50 1960*1e49577aSRod Evans faligndata %d10, %d12, %d52 1961*1e49577aSRod Evans faligndata %d12, %d14, %d54 1962*1e49577aSRod Evans faligndata %d14, %d16, %d56 1963*1e49577aSRod Evans faligndata %d16, %d18, %d58 1964*1e49577aSRod Evans faligndata %d18, %d20, %d60 1965*1e49577aSRod Evans faligndata %d20, %d22, %d62 1966*1e49577aSRod Evans fmovd %d22, %d6 1967*1e49577aSRod Evans fmovd %d24, %d8 1968*1e49577aSRod Evans fmovd %d26, %d10 1969*1e49577aSRod Evans fmovd 
%d28, %d12 1970*1e49577aSRod Evans fmovd %d30, %d14 1971*1e49577aSRod Evans stda %d48, [%o0]ASI_BLK_P 1972*1e49577aSRod Evans subcc %o5, 64, %o5 1973*1e49577aSRod Evans add %o0, 64, %o0 1974*1e49577aSRod Evans bgu,pt %ncc, .unalign_011_loop 1975*1e49577aSRod Evans prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 1976*1e49577aSRod Evans ba .unalign_done 1977*1e49577aSRod Evans membar #Sync 1978*1e49577aSRod Evans 1979*1e49577aSRod Evans.unalign_010: 1980*1e49577aSRod Evans ldd [%o4+16], %d4 1981*1e49577aSRod Evans ldd [%o4+24], %d6 1982*1e49577aSRod Evans ldd [%o4+32], %d8 1983*1e49577aSRod Evans ldd [%o4+40], %d10 1984*1e49577aSRod Evans ldd [%o4+48], %d12 1985*1e49577aSRod Evans ldd [%o4+56], %d14 1986*1e49577aSRod Evans.unalign_010_loop: 1987*1e49577aSRod Evans add %o4, 64, %o4 1988*1e49577aSRod Evans ldda [%o4]ASI_BLK_P, %d16 1989*1e49577aSRod Evans faligndata %d4, %d6, %d48 1990*1e49577aSRod Evans faligndata %d6, %d8, %d50 1991*1e49577aSRod Evans faligndata %d8, %d10, %d52 1992*1e49577aSRod Evans faligndata %d10, %d12, %d54 1993*1e49577aSRod Evans faligndata %d12, %d14, %d56 1994*1e49577aSRod Evans faligndata %d14, %d16, %d58 1995*1e49577aSRod Evans faligndata %d16, %d18, %d60 1996*1e49577aSRod Evans faligndata %d18, %d20, %d62 1997*1e49577aSRod Evans fmovd %d20, %d4 1998*1e49577aSRod Evans fmovd %d22, %d6 1999*1e49577aSRod Evans fmovd %d24, %d8 2000*1e49577aSRod Evans fmovd %d26, %d10 2001*1e49577aSRod Evans fmovd %d28, %d12 2002*1e49577aSRod Evans fmovd %d30, %d14 2003*1e49577aSRod Evans stda %d48, [%o0]ASI_BLK_P 2004*1e49577aSRod Evans subcc %o5, 64, %o5 2005*1e49577aSRod Evans add %o0, 64, %o0 2006*1e49577aSRod Evans bgu,pt %ncc, .unalign_010_loop 2007*1e49577aSRod Evans prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 2008*1e49577aSRod Evans ba .unalign_done 2009*1e49577aSRod Evans membar #Sync 2010*1e49577aSRod Evans 2011*1e49577aSRod Evans.unalign_001: 2012*1e49577aSRod Evans ldd [%o4+8], %d2 2013*1e49577aSRod Evans ldd [%o4+16], %d4 2014*1e49577aSRod Evans ldd [%o4+24], %d6 2015*1e49577aSRod Evans ldd [%o4+32], %d8 2016*1e49577aSRod Evans ldd [%o4+40], %d10 2017*1e49577aSRod Evans ldd [%o4+48], %d12 2018*1e49577aSRod Evans ldd [%o4+56], %d14 2019*1e49577aSRod Evans.unalign_001_loop: 2020*1e49577aSRod Evans add %o4, 64, %o4 2021*1e49577aSRod Evans ldda [%o4]ASI_BLK_P, %d16 2022*1e49577aSRod Evans faligndata %d2, %d4, %d48 2023*1e49577aSRod Evans faligndata %d4, %d6, %d50 2024*1e49577aSRod Evans faligndata %d6, %d8, %d52 2025*1e49577aSRod Evans faligndata %d8, %d10, %d54 2026*1e49577aSRod Evans faligndata %d10, %d12, %d56 2027*1e49577aSRod Evans faligndata %d12, %d14, %d58 2028*1e49577aSRod Evans faligndata %d14, %d16, %d60 2029*1e49577aSRod Evans faligndata %d16, %d18, %d62 2030*1e49577aSRod Evans fmovd %d18, %d2 2031*1e49577aSRod Evans fmovd %d20, %d4 2032*1e49577aSRod Evans fmovd %d22, %d6 2033*1e49577aSRod Evans fmovd %d24, %d8 2034*1e49577aSRod Evans fmovd %d26, %d10 2035*1e49577aSRod Evans fmovd %d28, %d12 2036*1e49577aSRod Evans fmovd %d30, %d14 2037*1e49577aSRod Evans stda %d48, [%o0]ASI_BLK_P 2038*1e49577aSRod Evans subcc %o5, 64, %o5 2039*1e49577aSRod Evans add %o0, 64, %o0 2040*1e49577aSRod Evans bgu,pt %ncc, .unalign_001_loop 2041*1e49577aSRod Evans prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 2042*1e49577aSRod Evans ba .unalign_done 2043*1e49577aSRod Evans membar #Sync 2044*1e49577aSRod Evans 2045*1e49577aSRod Evans.unalign_000: 2046*1e49577aSRod Evans ldda [%o4]ASI_BLK_P, %d0 2047*1e49577aSRod Evans.unalign_000_loop: 2048*1e49577aSRod Evans add %o4, 64, %o4 
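/*
 * This loop and the seven .unalign_* variants above all follow the same
 * pattern: block-load 64 bytes of source into %d16-%d30, merge each pair
 * of neighbouring doubles with faligndata (using the byte offset that
 * alignaddr latched into %gsr), and block-store the merged result.  A
 * rough C model of one 8 byte step (illustrative only -- off stands for
 * the low three bits of the original source address, asrc for the 8 byte
 * aligned source pointer, dst for the 8 byte aligned destination, and
 * the shifts reflect the big-endian byte order used here):
 *
 *	unsigned long prev = *asrc++;
 *	while (ndoubles-- > 0) {
 *		unsigned long next = *asrc++;
 *		*dst++ = (off == 0) ? prev :
 *		    (prev << (8 * off)) | (next >> (8 * (8 - off)));
 *		prev = next;
 *	}
 */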
2049*1e49577aSRod Evans ldda [%o4]ASI_BLK_P, %d16 2050*1e49577aSRod Evans faligndata %d0, %d2, %d48 2051*1e49577aSRod Evans faligndata %d2, %d4, %d50 2052*1e49577aSRod Evans faligndata %d4, %d6, %d52 2053*1e49577aSRod Evans faligndata %d6, %d8, %d54 2054*1e49577aSRod Evans faligndata %d8, %d10, %d56 2055*1e49577aSRod Evans faligndata %d10, %d12, %d58 2056*1e49577aSRod Evans faligndata %d12, %d14, %d60 2057*1e49577aSRod Evans faligndata %d14, %d16, %d62 2058*1e49577aSRod Evans fmovd %d16, %d0 2059*1e49577aSRod Evans fmovd %d18, %d2 2060*1e49577aSRod Evans fmovd %d20, %d4 2061*1e49577aSRod Evans fmovd %d22, %d6 2062*1e49577aSRod Evans fmovd %d24, %d8 2063*1e49577aSRod Evans fmovd %d26, %d10 2064*1e49577aSRod Evans fmovd %d28, %d12 2065*1e49577aSRod Evans fmovd %d30, %d14 2066*1e49577aSRod Evans stda %d48, [%o0]ASI_BLK_P 2067*1e49577aSRod Evans subcc %o5, 64, %o5 2068*1e49577aSRod Evans add %o0, 64, %o0 2069*1e49577aSRod Evans bgu,pt %ncc, .unalign_000_loop 2070*1e49577aSRod Evans prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read 2071*1e49577aSRod Evans membar #Sync 2072*1e49577aSRod Evans 2073*1e49577aSRod Evans.unalign_done: 2074*1e49577aSRod Evans ! Handle trailing bytes, 64 to 127 2075*1e49577aSRod Evans ! Dest long word aligned, Src not long word aligned 2076*1e49577aSRod Evans cmp %o2, 15 2077*1e49577aSRod Evans bleu %ncc, .unalign_short 2078*1e49577aSRod Evans 2079*1e49577aSRod Evans andn %o2, 0x7, %o5 ! %o5 is multiple of 8 2080*1e49577aSRod Evans and %o2, 0x7, %o2 ! residue bytes in %o2 2081*1e49577aSRod Evans add %o2, 8, %o2 2082*1e49577aSRod Evans sub %o5, 8, %o5 ! insure we don't load past end of src 2083*1e49577aSRod Evans andn %o1, 0x7, %o4 ! %o4 has long word aligned src address 2084*1e49577aSRod Evans add %o1, %o5, %o1 ! advance %o1 to after multiple of 8 2085*1e49577aSRod Evans ldd [%o4], %d0 ! fetch partial word 2086*1e49577aSRod Evans.unalign_by8: 2087*1e49577aSRod Evans ldd [%o4+8], %d2 2088*1e49577aSRod Evans add %o4, 8, %o4 2089*1e49577aSRod Evans faligndata %d0, %d2, %d16 2090*1e49577aSRod Evans subcc %o5, 8, %o5 2091*1e49577aSRod Evans std %d16, [%o0] 2092*1e49577aSRod Evans fmovd %d2, %d0 2093*1e49577aSRod Evans bgu,pt %ncc, .unalign_by8 2094*1e49577aSRod Evans add %o0, 8, %o0 2095*1e49577aSRod Evans 2096*1e49577aSRod Evans.unalign_short: 2097*1e49577aSRod Evans brnz %g5, .smallrest 2098*1e49577aSRod Evans nop 2099*1e49577aSRod Evans ba .smallrest 2100*1e49577aSRod Evans wr %g5, %g0, %fprs 2101*1e49577aSRod Evans#else /* NIAGARA2_IMPL */ 2102*1e49577aSRod Evans.forcpy: 2103*1e49577aSRod Evans mov %o0, %g5 ! save des address for return val 2104*1e49577aSRod Evans cmp %o2, 17 ! for small counts copy bytes 2105*1e49577aSRod Evans bleu,pt %ncc, .dbytecp 2106*1e49577aSRod Evans nop 2107*1e49577aSRod Evans 2108*1e49577aSRod Evans cmp %o2, 0x80 ! For lengths less than 128 bytes no 2109*1e49577aSRod Evans bleu,pn %ncc, .no_blkcpy ! copy using ASI_BLK_INIT_ST_QUAD_LDD_P 2110*1e49577aSRod Evans 2111*1e49577aSRod Evans /* 2112*1e49577aSRod Evans * Make sure that source and destination buffers are 64 bytes apart. 2113*1e49577aSRod Evans * If they are not, do not use ASI_BLK_INIT_ST_QUAD_LDD_P asi to copy 2114*1e49577aSRod Evans * the data. 2115*1e49577aSRod Evans */ 2116*1e49577aSRod Evans subcc %o1, %o0, %o3 2117*1e49577aSRod Evans blu %ncc, .blkalgndst 2118*1e49577aSRod Evans cmp %o3, 0x40 ! if src - dst >= 0x40 2119*1e49577aSRod Evans bgeu,pt %ncc, .blkalgndst ! then use ASI_BLK_INIT_ST_QUAD_LDD_P 2120*1e49577aSRod Evans.no_blkcpy: 2121*1e49577aSRod Evans andcc %o1, 3, %o5 ! 
is src word aligned 2122*1e49577aSRod Evans bz,pn %ncc, .aldst 2123*1e49577aSRod Evans cmp %o5, 2 ! is src half-word aligned 2124*1e49577aSRod Evans be,pt %ncc, .s2algn 2125*1e49577aSRod Evans cmp %o5, 3 ! src is byte aligned 2126*1e49577aSRod Evans.s1algn:ldub [%o1], %o3 ! move 1 or 3 bytes to align it 2127*1e49577aSRod Evans inc 1, %o1 2128*1e49577aSRod Evans stb %o3, [%g5] ! move a byte to align src 2129*1e49577aSRod Evans inc 1, %g5 2130*1e49577aSRod Evans bne,pt %ncc, .s2algn 2131*1e49577aSRod Evans dec %o2 2132*1e49577aSRod Evans b .ald ! now go align dest 2133*1e49577aSRod Evans andcc %g5, 3, %o5 2134*1e49577aSRod Evans 2135*1e49577aSRod Evans.s2algn:lduh [%o1], %o3 ! know src is 2 byte alinged 2136*1e49577aSRod Evans inc 2, %o1 2137*1e49577aSRod Evans srl %o3, 8, %o4 2138*1e49577aSRod Evans stb %o4, [%g5] ! have to do bytes, 2139*1e49577aSRod Evans stb %o3, [%g5 + 1] ! don't know dst alingment 2140*1e49577aSRod Evans inc 2, %g5 2141*1e49577aSRod Evans dec 2, %o2 2142*1e49577aSRod Evans 2143*1e49577aSRod Evans.aldst: andcc %g5, 3, %o5 ! align the destination address 2144*1e49577aSRod Evans.ald: bz,pn %ncc, .w4cp 2145*1e49577aSRod Evans cmp %o5, 2 2146*1e49577aSRod Evans bz,pn %ncc, .w2cp 2147*1e49577aSRod Evans cmp %o5, 3 2148*1e49577aSRod Evans.w3cp: lduw [%o1], %o4 2149*1e49577aSRod Evans inc 4, %o1 2150*1e49577aSRod Evans srl %o4, 24, %o5 2151*1e49577aSRod Evans stb %o5, [%g5] 2152*1e49577aSRod Evans bne,pt %ncc, .w1cp 2153*1e49577aSRod Evans inc %g5 2154*1e49577aSRod Evans dec 1, %o2 2155*1e49577aSRod Evans andn %o2, 3, %o3 ! o3 is aligned word count 2156*1e49577aSRod Evans dec 4, %o3 ! avoid reading beyond tail of src 2157*1e49577aSRod Evans sub %o1, %g5, %o1 ! o1 gets the difference 2158*1e49577aSRod Evans 2159*1e49577aSRod Evans1: sll %o4, 8, %g1 ! save residual bytes 2160*1e49577aSRod Evans lduw [%o1+%g5], %o4 2161*1e49577aSRod Evans deccc 4, %o3 2162*1e49577aSRod Evans srl %o4, 24, %o5 ! merge with residual 2163*1e49577aSRod Evans or %o5, %g1, %g1 2164*1e49577aSRod Evans st %g1, [%g5] 2165*1e49577aSRod Evans bnz,pt %ncc, 1b 2166*1e49577aSRod Evans inc 4, %g5 2167*1e49577aSRod Evans sub %o1, 3, %o1 ! used one byte of last word read 2168*1e49577aSRod Evans and %o2, 3, %o2 2169*1e49577aSRod Evans b 7f 2170*1e49577aSRod Evans inc 4, %o2 2171*1e49577aSRod Evans 2172*1e49577aSRod Evans.w1cp: srl %o4, 8, %o5 2173*1e49577aSRod Evans sth %o5, [%g5] 2174*1e49577aSRod Evans inc 2, %g5 2175*1e49577aSRod Evans dec 3, %o2 2176*1e49577aSRod Evans andn %o2, 3, %o3 ! o3 is aligned word count 2177*1e49577aSRod Evans dec 4, %o3 ! avoid reading beyond tail of src 2178*1e49577aSRod Evans sub %o1, %g5, %o1 ! o1 gets the difference 2179*1e49577aSRod Evans 2180*1e49577aSRod Evans2: sll %o4, 24, %g1 ! save residual bytes 2181*1e49577aSRod Evans lduw [%o1+%g5], %o4 2182*1e49577aSRod Evans deccc 4, %o3 2183*1e49577aSRod Evans srl %o4, 8, %o5 ! merge with residual 2184*1e49577aSRod Evans or %o5, %g1, %g1 2185*1e49577aSRod Evans st %g1, [%g5] 2186*1e49577aSRod Evans bnz,pt %ncc, 2b 2187*1e49577aSRod Evans inc 4, %g5 2188*1e49577aSRod Evans sub %o1, 1, %o1 ! used three bytes of last word read 2189*1e49577aSRod Evans and %o2, 3, %o2 2190*1e49577aSRod Evans b 7f 2191*1e49577aSRod Evans inc 4, %o2 2192*1e49577aSRod Evans 2193*1e49577aSRod Evans.w2cp: lduw [%o1], %o4 2194*1e49577aSRod Evans inc 4, %o1 2195*1e49577aSRod Evans srl %o4, 16, %o5 2196*1e49577aSRod Evans sth %o5, [%g5] 2197*1e49577aSRod Evans inc 2, %g5 2198*1e49577aSRod Evans dec 2, %o2 2199*1e49577aSRod Evans andn %o2, 3, %o3 ! 
o3 is aligned word count 2200*1e49577aSRod Evans dec 4, %o3 ! avoid reading beyond tail of src 2201*1e49577aSRod Evans sub %o1, %g5, %o1 ! o1 gets the difference 2202*1e49577aSRod Evans 2203*1e49577aSRod Evans3: sll %o4, 16, %g1 ! save residual bytes 2204*1e49577aSRod Evans lduw [%o1+%g5], %o4 2205*1e49577aSRod Evans deccc 4, %o3 2206*1e49577aSRod Evans srl %o4, 16, %o5 ! merge with residual 2207*1e49577aSRod Evans or %o5, %g1, %g1 2208*1e49577aSRod Evans st %g1, [%g5] 2209*1e49577aSRod Evans bnz,pt %ncc, 3b 2210*1e49577aSRod Evans inc 4, %g5 2211*1e49577aSRod Evans sub %o1, 2, %o1 ! used two bytes of last word read 2212*1e49577aSRod Evans and %o2, 3, %o2 2213*1e49577aSRod Evans b 7f 2214*1e49577aSRod Evans inc 4, %o2 2215*1e49577aSRod Evans 2216*1e49577aSRod Evans.w4cp: andn %o2, 3, %o3 ! o3 is aligned word count 2217*1e49577aSRod Evans sub %o1, %g5, %o1 ! o1 gets the difference 2218*1e49577aSRod Evans 2219*1e49577aSRod Evans1: lduw [%o1+%g5], %o4 ! read from address 2220*1e49577aSRod Evans deccc 4, %o3 ! decrement count 2221*1e49577aSRod Evans st %o4, [%g5] ! write at destination address 2222*1e49577aSRod Evans bgu,pt %ncc, 1b 2223*1e49577aSRod Evans inc 4, %g5 ! increment to address 2224*1e49577aSRod Evans b 7f 2225*1e49577aSRod Evans and %o2, 3, %o2 ! number of leftover bytes, if any 2226*1e49577aSRod Evans 2227*1e49577aSRod Evans ! 2228*1e49577aSRod Evans ! differenced byte copy, works with any alignment 2229*1e49577aSRod Evans ! 2230*1e49577aSRod Evans.dbytecp: 2231*1e49577aSRod Evans b 7f 2232*1e49577aSRod Evans sub %o1, %g5, %o1 ! o1 gets the difference 2233*1e49577aSRod Evans 2234*1e49577aSRod Evans4: stb %o4, [%g5] ! write to address 2235*1e49577aSRod Evans inc %g5 ! inc to address 2236*1e49577aSRod Evans7: deccc %o2 ! decrement count 2237*1e49577aSRod Evans bgeu,a,pt %ncc,4b ! loop till done 2238*1e49577aSRod Evans ldub [%o1+%g5], %o4 ! read from address 2239*1e49577aSRod Evans retl ! %o0 was preserved 2240*1e49577aSRod Evans nop 2241*1e49577aSRod Evans 2242*1e49577aSRod Evans.blkalgndst: 2243*1e49577aSRod Evans save %sp, -SA(MINFRAME), %sp 2244*1e49577aSRod Evans 2245*1e49577aSRod Evans ! Block (64 bytes) align the destination. 2246*1e49577aSRod Evans andcc %i0, 0x3f, %i3 ! is dst block aligned 2247*1e49577aSRod Evans bz %ncc, .chksrc ! dst already block aligned 2248*1e49577aSRod Evans sub %i3, 0x40, %i3 2249*1e49577aSRod Evans neg %i3 ! bytes till dst 64 bytes aligned 2250*1e49577aSRod Evans sub %i2, %i3, %i2 ! update i2 with new count 2251*1e49577aSRod Evans 2252*1e49577aSRod Evans ! Based on source and destination alignment do 2253*1e49577aSRod Evans ! either 8 bytes, 4 bytes, 2 bytes or byte copy. 2254*1e49577aSRod Evans 2255*1e49577aSRod Evans ! Is dst & src 8B aligned 2256*1e49577aSRod Evans or %i0, %i1, %o2 2257*1e49577aSRod Evans andcc %o2, 0x7, %g0 2258*1e49577aSRod Evans bz %ncc, .alewdcp 2259*1e49577aSRod Evans nop 2260*1e49577aSRod Evans 2261*1e49577aSRod Evans ! Is dst & src 4B aligned 2262*1e49577aSRod Evans andcc %o2, 0x3, %g0 2263*1e49577aSRod Evans bz %ncc, .alwdcp 2264*1e49577aSRod Evans nop 2265*1e49577aSRod Evans 2266*1e49577aSRod Evans ! Is dst & src 2B aligned 2267*1e49577aSRod Evans andcc %o2, 0x1, %g0 2268*1e49577aSRod Evans bz %ncc, .alhlfwdcp 2269*1e49577aSRod Evans nop 2270*1e49577aSRod Evans 2271*1e49577aSRod Evans ! 
1B aligned 2272*1e49577aSRod Evans1: ldub [%i1], %o2 2273*1e49577aSRod Evans stb %o2, [%i0] 2274*1e49577aSRod Evans inc %i1 2275*1e49577aSRod Evans deccc %i3 2276*1e49577aSRod Evans bgu,pt %ncc, 1b 2277*1e49577aSRod Evans inc %i0 2278*1e49577aSRod Evans 2279*1e49577aSRod Evans ba .chksrc 2280*1e49577aSRod Evans nop 2281*1e49577aSRod Evans 2282*1e49577aSRod Evans ! dst & src 4B aligned 2283*1e49577aSRod Evans.alwdcp: 2284*1e49577aSRod Evans ld [%i1], %o2 2285*1e49577aSRod Evans st %o2, [%i0] 2286*1e49577aSRod Evans add %i1, 0x4, %i1 2287*1e49577aSRod Evans subcc %i3, 0x4, %i3 2288*1e49577aSRod Evans bgu,pt %ncc, .alwdcp 2289*1e49577aSRod Evans add %i0, 0x4, %i0 2290*1e49577aSRod Evans 2291*1e49577aSRod Evans ba .chksrc 2292*1e49577aSRod Evans nop 2293*1e49577aSRod Evans 2294*1e49577aSRod Evans ! dst & src 2B aligned 2295*1e49577aSRod Evans.alhlfwdcp: 2296*1e49577aSRod Evans lduh [%i1], %o2 2297*1e49577aSRod Evans stuh %o2, [%i0] 2298*1e49577aSRod Evans add %i1, 0x2, %i1 2299*1e49577aSRod Evans subcc %i3, 0x2, %i3 2300*1e49577aSRod Evans bgu,pt %ncc, .alhlfwdcp 2301*1e49577aSRod Evans add %i0, 0x2, %i0 2302*1e49577aSRod Evans 2303*1e49577aSRod Evans ba .chksrc 2304*1e49577aSRod Evans nop 2305*1e49577aSRod Evans 2306*1e49577aSRod Evans ! dst & src 8B aligned 2307*1e49577aSRod Evans.alewdcp: 2308*1e49577aSRod Evans ldx [%i1], %o2 2309*1e49577aSRod Evans stx %o2, [%i0] 2310*1e49577aSRod Evans add %i1, 0x8, %i1 2311*1e49577aSRod Evans subcc %i3, 0x8, %i3 2312*1e49577aSRod Evans bgu,pt %ncc, .alewdcp 2313*1e49577aSRod Evans add %i0, 0x8, %i0 2314*1e49577aSRod Evans 2315*1e49577aSRod Evans ! Now Destination is block (64 bytes) aligned 2316*1e49577aSRod Evans.chksrc: 2317*1e49577aSRod Evans andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size 2318*1e49577aSRod Evans sub %i2, %i3, %i2 ! Residue bytes in %i2 2319*1e49577aSRod Evans mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi 2320*1e49577aSRod Evans andcc %i1, 0xf, %l1 ! is src quadword aligned 2321*1e49577aSRod Evans bz,pn %ncc, .blkcpy ! src offset in %l1 2322*1e49577aSRod Evans nop 2323*1e49577aSRod Evans cmp %l1, 0x8 2324*1e49577aSRod Evans bgu %ncc, .cpy_upper_double 2325*1e49577aSRod Evans nop 2326*1e49577aSRod Evans blu %ncc, .cpy_lower_double 2327*1e49577aSRod Evans nop 2328*1e49577aSRod Evans 2329*1e49577aSRod Evans ! Falls through when source offset is equal to 8 i.e. 2330*1e49577aSRod Evans ! source is double word aligned. 2331*1e49577aSRod Evans ! In this case no shift/merge of data is required 2332*1e49577aSRod Evans sub %i1, %l1, %i1 ! align the src at 16 bytes. 2333*1e49577aSRod Evans andn %i1, 0x3f, %o0 ! 
%o0 has block aligned source 2334*1e49577aSRod Evans prefetch [%o0+0x0], #one_read 2335*1e49577aSRod Evans ldda [%i1+0x0]%asi, %o2 2336*1e49577aSRod Evansloop0: 2337*1e49577aSRod Evans ldda [%i1+0x10]%asi, %o4 2338*1e49577aSRod Evans prefetch [%o0+0x40], #one_read 2339*1e49577aSRod Evans 2340*1e49577aSRod Evans stxa %o3, [%i0+0x0]%asi 2341*1e49577aSRod Evans stxa %o4, [%i0+0x8]%asi 2342*1e49577aSRod Evans 2343*1e49577aSRod Evans ldda [%i1+0x20]%asi, %o2 2344*1e49577aSRod Evans stxa %o5, [%i0+0x10]%asi 2345*1e49577aSRod Evans stxa %o2, [%i0+0x18]%asi 2346*1e49577aSRod Evans 2347*1e49577aSRod Evans ldda [%i1+0x30]%asi, %o4 2348*1e49577aSRod Evans stxa %o3, [%i0+0x20]%asi 2349*1e49577aSRod Evans stxa %o4, [%i0+0x28]%asi 2350*1e49577aSRod Evans 2351*1e49577aSRod Evans ldda [%i1+0x40]%asi, %o2 2352*1e49577aSRod Evans stxa %o5, [%i0+0x30]%asi 2353*1e49577aSRod Evans stxa %o2, [%i0+0x38]%asi 2354*1e49577aSRod Evans 2355*1e49577aSRod Evans add %o0, 0x40, %o0 2356*1e49577aSRod Evans add %i1, 0x40, %i1 2357*1e49577aSRod Evans subcc %i3, 0x40, %i3 2358*1e49577aSRod Evans bgu,pt %ncc, loop0 2359*1e49577aSRod Evans add %i0, 0x40, %i0 2360*1e49577aSRod Evans ba .blkdone 2361*1e49577aSRod Evans add %i1, %l1, %i1 ! increment the source by src offset 2362*1e49577aSRod Evans 2363*1e49577aSRod Evans.cpy_lower_double: 2364*1e49577aSRod Evans sub %i1, %l1, %i1 ! align the src at 16 bytes. 2365*1e49577aSRod Evans sll %l1, 3, %l2 ! %l2 left shift 2366*1e49577aSRod Evans mov 0x40, %l3 2367*1e49577aSRod Evans sub %l3, %l2, %l3 ! %l3 right shift = (64 - left shift) 2368*1e49577aSRod Evans andn %i1, 0x3f, %o0 ! %o0 has block aligned source 2369*1e49577aSRod Evans prefetch [%o0+0x0], #one_read 2370*1e49577aSRod Evans ldda [%i1+0x0]%asi, %o2 ! partial data in %o2 and %o3 has 2371*1e49577aSRod Evans ! complete data 2372*1e49577aSRod Evansloop1: 2373*1e49577aSRod Evans ldda [%i1+0x10]%asi, %o4 ! %o4 has partial data for this read. 2374*1e49577aSRod Evans ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1) ! merge %o2, %o3 and %o4 2375*1e49577aSRod Evans ! into %o2 and %o3 2376*1e49577aSRod Evans prefetch [%o0+0x40], #one_read 2377*1e49577aSRod Evans stxa %o2, [%i0+0x0]%asi 2378*1e49577aSRod Evans stxa %o3, [%i0+0x8]%asi 2379*1e49577aSRod Evans 2380*1e49577aSRod Evans ldda [%i1+0x20]%asi, %o2 2381*1e49577aSRod Evans ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1) ! merge %o2 with %o5 and 2382*1e49577aSRod Evans stxa %o4, [%i0+0x10]%asi ! %o4 from previous read 2383*1e49577aSRod Evans stxa %o5, [%i0+0x18]%asi ! into %o4 and %o5 2384*1e49577aSRod Evans 2385*1e49577aSRod Evans ! Repeat the same for next 32 bytes. 2386*1e49577aSRod Evans 2387*1e49577aSRod Evans ldda [%i1+0x30]%asi, %o4 2388*1e49577aSRod Evans ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1) 2389*1e49577aSRod Evans stxa %o2, [%i0+0x20]%asi 2390*1e49577aSRod Evans stxa %o3, [%i0+0x28]%asi 2391*1e49577aSRod Evans 2392*1e49577aSRod Evans ldda [%i1+0x40]%asi, %o2 2393*1e49577aSRod Evans ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1) 2394*1e49577aSRod Evans stxa %o4, [%i0+0x30]%asi 2395*1e49577aSRod Evans stxa %o5, [%i0+0x38]%asi 2396*1e49577aSRod Evans 2397*1e49577aSRod Evans add %o0, 0x40, %o0 2398*1e49577aSRod Evans add %i1, 0x40, %i1 2399*1e49577aSRod Evans subcc %i3, 0x40, %i3 2400*1e49577aSRod Evans bgu,pt %ncc, loop1 2401*1e49577aSRod Evans add %i0, 0x40, %i0 2402*1e49577aSRod Evans ba .blkdone 2403*1e49577aSRod Evans add %i1, %l1, %i1 ! increment the source by src offset 2404*1e49577aSRod Evans 2405*1e49577aSRod Evans.cpy_upper_double: 2406*1e49577aSRod Evans sub %i1, %l1, %i1 ! 
.cpy_lower_double:
	sub	%i1, %l1, %i1		! align the src at 16 bytes.
	sll	%l1, 3, %l2		! %l2 left shift
	mov	0x40, %l3
	sub	%l3, %l2, %l3		! %l3 right shift = (64 - left shift)
	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
	prefetch [%o0+0x0], #one_read
	ldda	[%i1+0x0]%asi, %o2	! partial data in %o2 and %o3 has
					! complete data
loop1:
	ldda	[%i1+0x10]%asi, %o4	! %o4 has partial data for this read.
	ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1)	! merge %o2, %o3 and %o4
							! into %o2 and %o3
	prefetch [%o0+0x40], #one_read
	stxa	%o2, [%i0+0x0]%asi
	stxa	%o3, [%i0+0x8]%asi

	ldda	[%i1+0x20]%asi, %o2
	ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1)	! merge %o2 with %o5 and
	stxa	%o4, [%i0+0x10]%asi			! %o4 from previous read
	stxa	%o5, [%i0+0x18]%asi			! into %o4 and %o5

	! Repeat the same for next 32 bytes.

	ldda	[%i1+0x30]%asi, %o4
	ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1)
	stxa	%o2, [%i0+0x20]%asi
	stxa	%o3, [%i0+0x28]%asi

	ldda	[%i1+0x40]%asi, %o2
	ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1)
	stxa	%o4, [%i0+0x30]%asi
	stxa	%o5, [%i0+0x38]%asi

	add	%o0, 0x40, %o0
	add	%i1, 0x40, %i1
	subcc	%i3, 0x40, %i3
	bgu,pt	%ncc, loop1
	add	%i0, 0x40, %i0
	ba	.blkdone
	add	%i1, %l1, %i1		! increment the source by src offset

.cpy_upper_double:
	sub	%i1, %l1, %i1		! align the src at 16 bytes.
	mov	0x8, %l2
	sub	%l1, %l2, %l2
	sll	%l2, 3, %l2		! %l2 left shift
	mov	0x40, %l3
	sub	%l3, %l2, %l3		! %l3 right shift = (64 - left shift)
	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
	prefetch [%o0+0x0], #one_read
	ldda	[%i1+0x0]%asi, %o2	! partial data in %o3 for this read and
					! no data in %o2
loop2:
	ldda	[%i1+0x10]%asi, %o4	! %o4 has complete data and %o5 has
					! partial
	ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1)	! merge %o3, %o4 and %o5
							! into %o3 and %o4
	prefetch [%o0+0x40], #one_read
	stxa	%o3, [%i0+0x0]%asi
	stxa	%o4, [%i0+0x8]%asi

	ldda	[%i1+0x20]%asi, %o2
	ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1)	! merge %o2 and %o3 with
	stxa	%o5, [%i0+0x10]%asi			! %o5 from previous read
	stxa	%o2, [%i0+0x18]%asi			! into %o5 and %o2

	! Repeat the same for next 32 bytes.

	ldda	[%i1+0x30]%asi, %o4
	ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1)
	stxa	%o3, [%i0+0x20]%asi
	stxa	%o4, [%i0+0x28]%asi

	ldda	[%i1+0x40]%asi, %o2
	ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1)
	stxa	%o5, [%i0+0x30]%asi
	stxa	%o2, [%i0+0x38]%asi

	add	%o0, 0x40, %o0
	add	%i1, 0x40, %i1
	subcc	%i3, 0x40, %i3
	bgu,pt	%ncc, loop2
	add	%i0, 0x40, %i0
	ba	.blkdone
	add	%i1, %l1, %i1		! increment the source by src offset

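/*
 * Illustrative note: when the source is quadword aligned, .blkcpy below
 * simply streams 64 bytes per iteration -- two 16-byte quad loads and
 * eight 8-byte stores -- prefetching the next block one iteration ahead.
 * The block-init store ASI is intended to let the destination lines be
 * allocated without first reading them.  A rough, portable C sketch
 * (names are hypothetical; dst, src and count assumed multiples of 64):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	void
 *	block_copy64(uint64_t *dst, const uint64_t *src, size_t count)
 *	{
 *		while (count != 0) {
 *			__builtin_prefetch(src + 8, 0);	// next 64B block
 *			for (int i = 0; i < 8; i++)
 *				dst[i] = src[i];
 *			dst += 8;
 *			src += 8;
 *			count -= 64;
 *		}
 *	}
 */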
	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
.blkcpy:
	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
	prefetch [%o0+0x0], #one_read
1:
	prefetch [%o0+0x40], #one_read

	ldda	[%i1+0x0]%asi, %o2
	ldda	[%i1+0x10]%asi, %o4

	stxa	%o2, [%i0+0x0]%asi
	stxa	%o3, [%i0+0x8]%asi
	stxa	%o4, [%i0+0x10]%asi
	stxa	%o5, [%i0+0x18]%asi

	ldda	[%i1+0x20]%asi, %o2
	ldda	[%i1+0x30]%asi, %o4

	stxa	%o2, [%i0+0x20]%asi
	stxa	%o3, [%i0+0x28]%asi
	stxa	%o4, [%i0+0x30]%asi
	stxa	%o5, [%i0+0x38]%asi

	add	%o0, 0x40, %o0
	add	%i1, 0x40, %i1
	subcc	%i3, 0x40, %i3
	bgu,pt	%ncc, 1b
	add	%i0, 0x40, %i0

.blkdone:
	membar	#Sync

	mov	ASI_PNF, %asi		! restore %asi to default
					! ASI_PRIMARY_NOFAULT value
	tst	%i2
	bz,pt	%ncc, .blkexit
	nop

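/*
 * Illustrative note: the residue left over after the block loop is
 * copied with the widest unit that both pointers happen to share
 * (8, 4 or 2 bytes), and anything still remaining is copied a byte at
 * a time.  A rough C sketch of the strategy (hypothetical helper, not
 * a line-for-line translation of the code below):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *	static void
 *	copy_residue(char *dst, const char *src, size_t n)
 *	{
 *		uintptr_t both = (uintptr_t)src | (uintptr_t)dst;
 *
 *		if (n >= 8 && (both & 0x7) == 0)
 *			for (; n >= 8; n -= 8, src += 8, dst += 8)
 *				*(uint64_t *)dst = *(const uint64_t *)src;
 *		else if (n >= 8 && (both & 0x3) == 0)
 *			for (; n >= 4; n -= 4, src += 4, dst += 4)
 *				*(uint32_t *)dst = *(const uint32_t *)src;
 *		else if (n >= 8 && (both & 0x1) == 0)
 *			for (; n >= 2; n -= 2, src += 2, dst += 2)
 *				*(uint16_t *)dst = *(const uint16_t *)src;
 *		while (n-- != 0)
 *			*dst++ = *src++;
 *	}
 */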
	! Handle trailing bytes
	cmp	%i2, 0x8
	blu,pt	%ncc, .residue
	nop

	! Can we do some 8B ops
	or	%i1, %i0, %o2
	andcc	%o2, 0x7, %g0
	bnz	%ncc, .last4
	nop

	! Do 8byte ops as long as possible
.last8:
	ldx	[%i1], %o2
	stx	%o2, [%i0]
	add	%i1, 0x8, %i1
	sub	%i2, 0x8, %i2
	cmp	%i2, 0x8
	bgu,pt	%ncc, .last8
	add	%i0, 0x8, %i0

	tst	%i2
	bz,pt	%ncc, .blkexit
	nop

	ba	.residue
	nop

.last4:
	! Can we do 4B ops
	andcc	%o2, 0x3, %g0
	bnz	%ncc, .last2
	nop
1:
	ld	[%i1], %o2
	st	%o2, [%i0]
	add	%i1, 0x4, %i1
	sub	%i2, 0x4, %i2
	cmp	%i2, 0x4
	bgu,pt	%ncc, 1b
	add	%i0, 0x4, %i0

	cmp	%i2, 0
	bz,pt	%ncc, .blkexit
	nop

	ba	.residue
	nop

.last2:
	! Can we do 2B ops
	andcc	%o2, 0x1, %g0
	bnz	%ncc, .residue
	nop

1:
	lduh	[%i1], %o2
	stuh	%o2, [%i0]
	add	%i1, 0x2, %i1
	sub	%i2, 0x2, %i2
	cmp	%i2, 0x2
	bgu,pt	%ncc, 1b
	add	%i0, 0x2, %i0

	cmp	%i2, 0
	bz,pt	%ncc, .blkexit
	nop

.residue:
	ldub	[%i1], %o2
	stb	%o2, [%i0]
	inc	%i1
	deccc	%i2
	bgu,pt	%ncc, .residue
	inc	%i0

.blkexit:

	ret
	restore	%g5, %g0, %o0

#endif	/* NIAGARA2_IMPL */
	SET_SIZE(memcpy)
	SET_SIZE(__align_cpy_1)