/* NG4memcpy.S: Niagara-4 optimized memcpy.
 *
 * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
 */

#ifdef __KERNEL__
#include <linux/linkage.h>
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE	%g7
#else
#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
#define FPRS_FEF  0x04

/* On T4 it is very expensive to access ASRs like %fprs and
 * %asi, avoiding a read or a write can save ~50 cycles.
 */
#define FPU_ENTER			\
	rd	%fprs, %o5;		\
	andcc	%o5, FPRS_FEF, %g0;	\
	be,a,pn	%icc, 999f;		\
	 wr	%g0, FPRS_FEF, %fprs;	\
	999:

#ifdef MEMCPY_DEBUG
#define VISEntryHalf FPU_ENTER; \
		     clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntryHalf FPU_ENTER
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif

#define GLOBAL_SPARE	%g5
#endif

#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80		/* ASI_P */
#endif
#endif

#if !defined(EX_LD) && !defined(EX_ST)
#define NON_USER_COPY
#endif

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif
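
/* When this file is included by the user-copy wrappers, EX_LD/EX_ST
 * (and the _FP variants) are pre-defined by the includer to attach an
 * exception-table entry to the access; the second argument then names
 * the fixup routine used to report how many bytes were left uncopied
 * (e.g. memcpy_retl_o2_plus_g1 stands for "%o2 + %g1 bytes remain").
 * For the plain in-kernel memcpy build (NON_USER_COPY) the defaults
 * above simply drop that argument.
 */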

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#ifndef MEMCPY_DEBUG
#define STORE(type,src,addr)	type src, [addr]
#else
#define STORE(type,src,addr)	type##a src, [addr] %asi
#endif
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	NG4memcpy
#endif
#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif
	.align		64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
#ifdef MEMCPY_DEBUG
	wr		%g0, 0x80, %asi
#endif
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%XCC, 5
	PREAMBLE
	mov		%o0, %o3
	brz,pn		%o2, .Lexit
	 cmp		%o2, 3
	ble,pn		%icc, .Ltiny
	 cmp		%o2, 19
	ble,pn		%icc, .Lsmall
	 or		%o0, %o1, %g2
	cmp		%o2, 128
	bl,pn		%icc, .Lmedium
	 nop

.Llarge:/* len >= 0x80 */
	/* First get dest 8 byte aligned.  */
	sub		%g0, %o0, %g1
	and		%g1, 0x7, %g1
	brz,pt		%g1, 51f
	 sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1)
	add		%o1, 1, %o1
	subcc		%g1, 1, %g1
	add		%o0, 1, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1)

51:	LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
	LOAD(prefetch, %o1 + 0x200, #n_reads_strong)

	/* Check if we can use the straight fully aligned
	 * loop, or we require the alignaddr/faligndata variant.
	 */
	andcc		%o1, 0x7, %o5
	bne,pn		%icc, .Llarge_src_unaligned
	 sub		%g0, %o0, %g1
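
	/* STORE_INIT uses block-init stores (ASI_BLK_INIT_QUAD_LDD_P):
	 * on Niagara-class chips a store with this ASI to the start of
	 * a 64-byte line can allocate the line in the cache without
	 * first fetching it from memory, which is why the destination
	 * is forced to 64-byte alignment below and why the copy loop is
	 * followed by a membar before normal accesses resume.
	 */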

	/* Legitimize the use of initializing stores by getting dest
	 * to be 64-byte aligned.
	 */
	and		%g1, 0x3f, %g1
	brz,pt		%g1, .Llarge_aligned
	 sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1)
	add		%o1, 8, %o1
	subcc		%g1, 8, %g1
	add		%o0, 8, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stx, %g2, %o0 - 0x08), memcpy_retl_o2_plus_g1_plus_8)

.Llarge_aligned:
	/* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
	andn		%o2, 0x3f, %o4
	sub		%o2, %o4, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1), memcpy_retl_o2_plus_o4)
	add		%o1, 0x40, %o1
	EX_LD(LOAD(ldx, %o1 - 0x38, %g2), memcpy_retl_o2_plus_o4)
	subcc		%o4, 0x40, %o4
	EX_LD(LOAD(ldx, %o1 - 0x30, %g3), memcpy_retl_o2_plus_o4_plus_64)
	EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE), memcpy_retl_o2_plus_o4_plus_64)
	EX_LD(LOAD(ldx, %o1 - 0x20, %o5), memcpy_retl_o2_plus_o4_plus_64)
	EX_ST(STORE_INIT(%g1, %o0), memcpy_retl_o2_plus_o4_plus_64)
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g2, %o0), memcpy_retl_o2_plus_o4_plus_56)
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x18, %g2), memcpy_retl_o2_plus_o4_plus_48)
	EX_ST(STORE_INIT(%g3, %o0), memcpy_retl_o2_plus_o4_plus_48)
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x10, %g3), memcpy_retl_o2_plus_o4_plus_40)
	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0), memcpy_retl_o2_plus_o4_plus_40)
	add		%o0, 0x08, %o0
	EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE), memcpy_retl_o2_plus_o4_plus_32)
	EX_ST(STORE_INIT(%o5, %o0), memcpy_retl_o2_plus_o4_plus_32)
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g2, %o0), memcpy_retl_o2_plus_o4_plus_24)
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(%g3, %o0), memcpy_retl_o2_plus_o4_plus_16)
	add		%o0, 0x08, %o0
	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0), memcpy_retl_o2_plus_o4_plus_8)
	add		%o0, 0x08, %o0
	bne,pt		%icc, 1b
	 LOAD(prefetch, %o1 + 0x200, #n_reads_strong)

	membar		#StoreLoad | #StoreStore

	brz,pn		%o2, .Lexit
	 cmp		%o2, 19
	ble,pn		%icc, .Lsmall_unaligned
	 nop
	ba,a,pt		%icc, .Lmedium_noprefetch

.Lexit:	retl
	 mov		EX_RETVAL(%o3), %o0

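	/* Source and dest disagree on 8-byte alignment.  Use the VIS
	 * unit: alignaddr rounds %o1 down to an 8-byte boundary and
	 * records the byte offset in %gsr, and each faligndata below
	 * extracts one aligned 8-byte result from a pair of adjacent
	 * 8-byte loads, shifted by that offset.
	 */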
.Llarge_src_unaligned:
#ifdef NON_USER_COPY
	VISEntryHalfFast(.Lmedium_vis_entry_fail)
#else
	VISEntryHalf
#endif
	andn		%o2, 0x3f, %o4
	sub		%o2, %o4, %o2
	alignaddr	%o1, %g0, %g1
	add		%o1, %o4, %o1
	EX_LD_FP(LOAD(ldd, %g1 + 0x00, %f0), memcpy_retl_o2_plus_o4)
1:	EX_LD_FP(LOAD(ldd, %g1 + 0x08, %f2), memcpy_retl_o2_plus_o4)
	subcc		%o4, 0x40, %o4
	EX_LD_FP(LOAD(ldd, %g1 + 0x10, %f4), memcpy_retl_o2_plus_o4_plus_64)
	EX_LD_FP(LOAD(ldd, %g1 + 0x18, %f6), memcpy_retl_o2_plus_o4_plus_64)
	EX_LD_FP(LOAD(ldd, %g1 + 0x20, %f8), memcpy_retl_o2_plus_o4_plus_64)
	EX_LD_FP(LOAD(ldd, %g1 + 0x28, %f10), memcpy_retl_o2_plus_o4_plus_64)
	EX_LD_FP(LOAD(ldd, %g1 + 0x30, %f12), memcpy_retl_o2_plus_o4_plus_64)
	EX_LD_FP(LOAD(ldd, %g1 + 0x38, %f14), memcpy_retl_o2_plus_o4_plus_64)
	faligndata	%f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %g1 + 0x40, %f0), memcpy_retl_o2_plus_o4_plus_64)
	faligndata	%f2, %f4, %f18
	add		%g1, 0x40, %g1
	faligndata	%f4, %f6, %f20
	faligndata	%f6, %f8, %f22
	faligndata	%f8, %f10, %f24
	faligndata	%f10, %f12, %f26
	faligndata	%f12, %f14, %f28
	faligndata	%f14, %f0, %f30
	EX_ST_FP(STORE(std, %f16, %o0 + 0x00), memcpy_retl_o2_plus_o4_plus_64)
	EX_ST_FP(STORE(std, %f18, %o0 + 0x08), memcpy_retl_o2_plus_o4_plus_56)
	EX_ST_FP(STORE(std, %f20, %o0 + 0x10), memcpy_retl_o2_plus_o4_plus_48)
	EX_ST_FP(STORE(std, %f22, %o0 + 0x18), memcpy_retl_o2_plus_o4_plus_40)
	EX_ST_FP(STORE(std, %f24, %o0 + 0x20), memcpy_retl_o2_plus_o4_plus_32)
	EX_ST_FP(STORE(std, %f26, %o0 + 0x28), memcpy_retl_o2_plus_o4_plus_24)
	EX_ST_FP(STORE(std, %f28, %o0 + 0x30), memcpy_retl_o2_plus_o4_plus_16)
	EX_ST_FP(STORE(std, %f30, %o0 + 0x38), memcpy_retl_o2_plus_o4_plus_8)
	add		%o0, 0x40, %o0
	bne,pt		%icc, 1b
	 LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
#ifdef NON_USER_COPY
	VISExitHalfFast
#else
	VISExitHalf
#endif
	brz,pn		%o2, .Lexit
	 cmp		%o2, 19
	ble,pn		%icc, .Lsmall_unaligned
	 nop
	ba,a,pt		%icc, .Lmedium_unaligned

#ifdef NON_USER_COPY
.Lmedium_vis_entry_fail:
	 or		%o0, %o1, %g2
#endif
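	/* Medium-sized copies (and the tails left over from the large
	 * loops).  %g2 holds %o0 | %o1, so the alignment test below
	 * covers both pointers at once: if they share 8-byte alignment
	 * we copy 32 bytes per iteration with ldx/stx, then mop up the
	 * 8-byte, 4-byte and sub-word remainders.
	 */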
.Lmedium:
	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
	andcc		%g2, 0x7, %g0
	bne,pn		%icc, .Lmedium_unaligned
	 nop
.Lmedium_noprefetch:
	andncc		%o2, 0x20 - 1, %o5
	be,pn		%icc, 2f
	 sub		%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x08, %g2), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x18, %o4), memcpy_retl_o2_plus_o5)
	add		%o1, 0x20, %o1
	subcc		%o5, 0x20, %o5
	EX_ST(STORE(stx, %g1, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_32)
	EX_ST(STORE(stx, %g2, %o0 + 0x08), memcpy_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10), memcpy_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, %o4, %o0 + 0x18), memcpy_retl_o2_plus_o5_plus_8)
	bne,pt		%icc, 1b
	 add		%o0, 0x20, %o0
2:	andcc		%o2, 0x18, %o5
	be,pt		%icc, 3f
	 sub		%o2, %o5, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1), memcpy_retl_o2_plus_o5)
	add		%o1, 0x08, %o1
	add		%o0, 0x08, %o0
	subcc		%o5, 0x08, %o5
	bne,pt		%icc, 1b
	 EX_ST(STORE(stx, %g1, %o0 - 0x08), memcpy_retl_o2_plus_o5_plus_8)
3:	brz,pt		%o2, .Lexit
	 cmp		%o2, 0x04
	bl,pn		%icc, .Ltiny
	 nop
	EX_LD(LOAD(lduw, %o1 + 0x00, %g1), memcpy_retl_o2)
	add		%o1, 0x04, %o1
	add		%o0, 0x04, %o0
	subcc		%o2, 0x04, %o2
	bne,pn		%icc, .Ltiny
	 EX_ST(STORE(stw, %g1, %o0 - 0x04), memcpy_retl_o2_plus_4)
	ba,a,pt		%icc, .Lexit
.Lmedium_unaligned:
	/* First get dest 8 byte aligned.  */
	sub		%g0, %o0, %g1
	and		%g1, 0x7, %g1
	brz,pt		%g1, 2f
	 sub		%o2, %g1, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1)
	add		%o1, 1, %o1
	subcc		%g1, 1, %g1
	add		%o0, 1, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1)
2:
	and		%o1, 0x7, %g1
	brz,pn		%g1, .Lmedium_noprefetch
	 sll		%g1, 3, %g1
	mov		64, %g2
	sub		%g2, %g1, %g2
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1 + 0x00, %o4), memcpy_retl_o2)
	sllx		%o4, %g1, %o4
	andn		%o2, 0x08 - 1, %o5
	sub		%o2, %o5, %o2
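	/* Dest is now 8-byte aligned but src is not.  %g1 is the source
	 * misalignment in bits and %g2 = 64 - %g1.  Each iteration loads
	 * the next aligned 8-byte word, merges (previous << %g1) with
	 * (next >> %g2), and stores one aligned 8-byte result.
	 */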
1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3), memcpy_retl_o2_plus_o5)
	add		%o1, 0x08, %o1
	subcc		%o5, 0x08, %o5
	srlx		%g3, %g2, GLOBAL_SPARE
	or		GLOBAL_SPARE, %o4, GLOBAL_SPARE
	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_8)
	add		%o0, 0x08, %o0
	bne,pt		%icc, 1b
	 sllx		%g3, %g1, %o4
	srl		%g1, 3, %g1
	add		%o1, %g1, %o1
	brz,pn		%o2, .Lexit
	 nop
	ba,pt		%icc, .Lsmall_unaligned

.Ltiny:
	EX_LD(LOAD(ldub, %o1 + 0x00, %g1), memcpy_retl_o2)
	subcc		%o2, 1, %o2
	be,pn		%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x00), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x01, %g1), memcpy_retl_o2)
	subcc		%o2, 1, %o2
	be,pn		%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x01), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x02, %g1), memcpy_retl_o2)
	ba,pt		%icc, .Lexit
	 EX_ST(STORE(stb, %g1, %o0 + 0x02), memcpy_retl_o2)

.Lsmall:
	andcc		%g2, 0x3, %g0
	bne,pn		%icc, .Lsmall_unaligned
	 andn		%o2, 0x4 - 1, %o5
	sub		%o2, %o5, %o2
1:
	EX_LD(LOAD(lduw, %o1 + 0x00, %g1), memcpy_retl_o2_plus_o5)
	add		%o1, 0x04, %o1
	subcc		%o5, 0x04, %o5
	add		%o0, 0x04, %o0
	bne,pt		%icc, 1b
	 EX_ST(STORE(stw, %g1, %o0 - 0x04), memcpy_retl_o2_plus_o5_plus_4)
	brz,pt		%o2, .Lexit
	 nop
	ba,a,pt		%icc, .Ltiny

.Lsmall_unaligned:
1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g1), memcpy_retl_o2)
	add		%o1, 1, %o1
	add		%o0, 1, %o0
	subcc		%o2, 1, %o2
	bne,pt		%icc, 1b
	 EX_ST(STORE(stb, %g1, %o0 - 0x01), memcpy_retl_o2_plus_1)
	ba,a,pt		%icc, .Lexit
	 nop
	.size		FUNC_NAME, .-FUNC_NAME