1*5d9d9091SRichard Lowe/* 2*5d9d9091SRichard Lowe * CDDL HEADER START 3*5d9d9091SRichard Lowe * 4*5d9d9091SRichard Lowe * The contents of this file are subject to the terms of the 5*5d9d9091SRichard Lowe * Common Development and Distribution License (the "License"). 6*5d9d9091SRichard Lowe * You may not use this file except in compliance with the License. 7*5d9d9091SRichard Lowe * 8*5d9d9091SRichard Lowe * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9*5d9d9091SRichard Lowe * or http://www.opensolaris.org/os/licensing. 10*5d9d9091SRichard Lowe * See the License for the specific language governing permissions 11*5d9d9091SRichard Lowe * and limitations under the License. 12*5d9d9091SRichard Lowe * 13*5d9d9091SRichard Lowe * When distributing Covered Code, include this CDDL HEADER in each 14*5d9d9091SRichard Lowe * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15*5d9d9091SRichard Lowe * If applicable, add the following below this CDDL HEADER, with the 16*5d9d9091SRichard Lowe * fields enclosed by brackets "[]" replaced with your own identifying 17*5d9d9091SRichard Lowe * information: Portions Copyright [yyyy] [name of copyright owner] 18*5d9d9091SRichard Lowe * 19*5d9d9091SRichard Lowe * CDDL HEADER END 20*5d9d9091SRichard Lowe */ 21*5d9d9091SRichard Lowe 22*5d9d9091SRichard Lowe/* 23*5d9d9091SRichard Lowe * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24*5d9d9091SRichard Lowe * Use is subject to license terms. 25*5d9d9091SRichard Lowe */ 26*5d9d9091SRichard Lowe 27*5d9d9091SRichard Lowe/* 28*5d9d9091SRichard Lowe * Copyright (c) 2008, Intel Corporation 29*5d9d9091SRichard Lowe * All rights reserved. 30*5d9d9091SRichard Lowe */ 31*5d9d9091SRichard Lowe 32*5d9d9091SRichard Lowe/* 33*5d9d9091SRichard Lowe * memcpy.s - copies two blocks of memory 34*5d9d9091SRichard Lowe * Implements memcpy() and memmove() libc primitives. 35*5d9d9091SRichard Lowe */ 36*5d9d9091SRichard Lowe 37*5d9d9091SRichard Lowe .file "memcpy.s" 38*5d9d9091SRichard Lowe 39*5d9d9091SRichard Lowe#include <sys/asm_linkage.h> 40*5d9d9091SRichard Lowe 41*5d9d9091SRichard Lowe ANSI_PRAGMA_WEAK(memmove,function) 42*5d9d9091SRichard Lowe ANSI_PRAGMA_WEAK(memcpy,function) 43*5d9d9091SRichard Lowe 44*5d9d9091SRichard Lowe#include "cache.h" 45*5d9d9091SRichard Lowe#include "proc64_id.h" 46*5d9d9091SRichard Lowe 47*5d9d9091SRichard Lowe#define L(s) .memcpy##s 48*5d9d9091SRichard Lowe 49*5d9d9091SRichard Lowe/* 50*5d9d9091SRichard Lowe * memcpy algorithm overview: 51*5d9d9091SRichard Lowe * 52*5d9d9091SRichard Lowe * Thresholds used below were determined experimentally. 53*5d9d9091SRichard Lowe * 54*5d9d9091SRichard Lowe * Pseudo code: 55*5d9d9091SRichard Lowe * 56*5d9d9091SRichard Lowe * NOTE: On AMD NO_SSE is always set. Performance on Opteron did not improve 57*5d9d9091SRichard Lowe * using 16-byte stores. Setting NO_SSE on AMD should be re-evaluated on 58*5d9d9091SRichard Lowe * future AMD processors. 59*5d9d9091SRichard Lowe * 60*5d9d9091SRichard Lowe * 61*5d9d9091SRichard Lowe * If (size <= 128 bytes) { 62*5d9d9091SRichard Lowe * do unrolled code (primarily 8-byte loads/stores) regardless of 63*5d9d9091SRichard Lowe * alignment. 64*5d9d9091SRichard Lowe * } else { 65*5d9d9091SRichard Lowe * Align destination to 16-byte boundary 66*5d9d9091SRichard Lowe * 67*5d9d9091SRichard Lowe * if (NO_SSE) { 68*5d9d9091SRichard Lowe * If (size > half of the largest level cache) { 69*5d9d9091SRichard Lowe * Use 8-byte non-temporal stores (64-bytes/loop) 70*5d9d9091SRichard Lowe * } else { 71*5d9d9091SRichard Lowe * if (size > 4K && size <= half l1 cache size) { 72*5d9d9091SRichard Lowe * Use rep movsq 73*5d9d9091SRichard Lowe * } else { 74*5d9d9091SRichard Lowe * Use 8-byte loads/stores (64 bytes per loop) 75*5d9d9091SRichard Lowe * } 76*5d9d9091SRichard Lowe * } 77*5d9d9091SRichard Lowe * 78*5d9d9091SRichard Lowe * } else { **USE SSE** 79*5d9d9091SRichard Lowe * If (size > half of the largest level cache) { 80*5d9d9091SRichard Lowe * Use 16-byte non-temporal stores (128-bytes per loop) 81*5d9d9091SRichard Lowe * } else { 82*5d9d9091SRichard Lowe * If (both source and destination are aligned) { 83*5d9d9091SRichard Lowe * Use 16-byte aligned loads and stores (128 bytes/loop) 84*5d9d9091SRichard Lowe * } else { 85*5d9d9091SRichard Lowe * use pairs of xmm registers with SSE2 or SSSE3 86*5d9d9091SRichard Lowe * instructions to concatenate and shift appropriately 87*5d9d9091SRichard Lowe * to account for source unalignment. This enables 88*5d9d9091SRichard Lowe * 16-byte aligned loads to be done. 89*5d9d9091SRichard Lowe * } 90*5d9d9091SRichard Lowe * } 91*5d9d9091SRichard Lowe } 92*5d9d9091SRichard Lowe * 93*5d9d9091SRichard Lowe * Finish any remaining bytes via unrolled code above. 94*5d9d9091SRichard Lowe * } 95*5d9d9091SRichard Lowe * 96*5d9d9091SRichard Lowe * memmove overview: 97*5d9d9091SRichard Lowe * memmove is the same as memcpy except one case where copy needs to be 98*5d9d9091SRichard Lowe * done backwards. The copy backwards code is done in a similar manner. 99*5d9d9091SRichard Lowe */ 100*5d9d9091SRichard Lowe 101*5d9d9091SRichard Lowe ENTRY(memmove) 102*5d9d9091SRichard Lowe cmp %rsi,%rdi # if dst <= src 103*5d9d9091SRichard Lowe jbe L(CopyForward) # then do copy forward 104*5d9d9091SRichard Lowe mov %rsi,%r9 # move src to r9 105*5d9d9091SRichard Lowe add %rdx,%r9 # add len to get addr of end of src 106*5d9d9091SRichard Lowe cmp %r9,%rdi # if dst < end of src 107*5d9d9091SRichard Lowe jb L(CopyBackwards) # then do copy backwards 108*5d9d9091SRichard Lowe jmp L(CopyForward) 109*5d9d9091SRichard Lowe 110*5d9d9091SRichard Lowe ENTRY (memcpy) 111*5d9d9091SRichard LoweL(CopyForward): 112*5d9d9091SRichard Lowe mov %rdx,%r8 113*5d9d9091SRichard Lowe mov %rdi,%rcx 114*5d9d9091SRichard Lowe mov %rsi,%rdx 115*5d9d9091SRichard Lowe mov %rdi,%rax 116*5d9d9091SRichard Lowe lea L(fwdPxQx)(%rip),%r11 117*5d9d9091SRichard Lowe cmp $0x80,%r8 # 128 118*5d9d9091SRichard Lowe jg L(ck_use_sse2) 119*5d9d9091SRichard Lowe add %r8,%rcx 120*5d9d9091SRichard Lowe add %r8,%rdx 121*5d9d9091SRichard Lowe 122*5d9d9091SRichard Lowe movslq (%r11,%r8,4),%r10 123*5d9d9091SRichard Lowe lea (%r10,%r11,1),%r11 124*5d9d9091SRichard Lowe jmpq *%r11 125*5d9d9091SRichard Lowe 126*5d9d9091SRichard Lowe .balign 16 127*5d9d9091SRichard LoweL(ShrtAlignNew): 128*5d9d9091SRichard Lowe lea L(AliPxQx)(%rip),%r11 129*5d9d9091SRichard Lowe mov %rcx,%r9 130*5d9d9091SRichard Lowe and $0xf,%r9 131*5d9d9091SRichard Lowe 132*5d9d9091SRichard Lowe movslq (%r11,%r9,4),%r10 133*5d9d9091SRichard Lowe lea (%r10,%r11,1),%r11 134*5d9d9091SRichard Lowe jmpq *%r11 135*5d9d9091SRichard Lowe 136*5d9d9091SRichard Lowe .balign 16 137*5d9d9091SRichard LoweL(fwdPxQx): .int L(P0Q0)-L(fwdPxQx) 138*5d9d9091SRichard Lowe .int L(P1Q0)-L(fwdPxQx) 139*5d9d9091SRichard Lowe .int L(P2Q0)-L(fwdPxQx) 140*5d9d9091SRichard Lowe .int L(P3Q0)-L(fwdPxQx) 141*5d9d9091SRichard Lowe .int L(P4Q0)-L(fwdPxQx) 142*5d9d9091SRichard Lowe .int L(P5Q0)-L(fwdPxQx) 143*5d9d9091SRichard Lowe .int L(P6Q0)-L(fwdPxQx) 144*5d9d9091SRichard Lowe .int L(P7Q0)-L(fwdPxQx) 145*5d9d9091SRichard Lowe 146*5d9d9091SRichard Lowe .int L(P0Q1)-L(fwdPxQx) 147*5d9d9091SRichard Lowe .int L(P1Q1)-L(fwdPxQx) 148*5d9d9091SRichard Lowe .int L(P2Q1)-L(fwdPxQx) 149*5d9d9091SRichard Lowe .int L(P3Q1)-L(fwdPxQx) 150*5d9d9091SRichard Lowe .int L(P4Q1)-L(fwdPxQx) 151*5d9d9091SRichard Lowe .int L(P5Q1)-L(fwdPxQx) 152*5d9d9091SRichard Lowe .int L(P6Q1)-L(fwdPxQx) 153*5d9d9091SRichard Lowe .int L(P7Q1)-L(fwdPxQx) 154*5d9d9091SRichard Lowe 155*5d9d9091SRichard Lowe .int L(P0Q2)-L(fwdPxQx) 156*5d9d9091SRichard Lowe .int L(P1Q2)-L(fwdPxQx) 157*5d9d9091SRichard Lowe .int L(P2Q2)-L(fwdPxQx) 158*5d9d9091SRichard Lowe .int L(P3Q2)-L(fwdPxQx) 159*5d9d9091SRichard Lowe .int L(P4Q2)-L(fwdPxQx) 160*5d9d9091SRichard Lowe .int L(P5Q2)-L(fwdPxQx) 161*5d9d9091SRichard Lowe .int L(P6Q2)-L(fwdPxQx) 162*5d9d9091SRichard Lowe .int L(P7Q2)-L(fwdPxQx) 163*5d9d9091SRichard Lowe 164*5d9d9091SRichard Lowe .int L(P0Q3)-L(fwdPxQx) 165*5d9d9091SRichard Lowe .int L(P1Q3)-L(fwdPxQx) 166*5d9d9091SRichard Lowe .int L(P2Q3)-L(fwdPxQx) 167*5d9d9091SRichard Lowe .int L(P3Q3)-L(fwdPxQx) 168*5d9d9091SRichard Lowe .int L(P4Q3)-L(fwdPxQx) 169*5d9d9091SRichard Lowe .int L(P5Q3)-L(fwdPxQx) 170*5d9d9091SRichard Lowe .int L(P6Q3)-L(fwdPxQx) 171*5d9d9091SRichard Lowe .int L(P7Q3)-L(fwdPxQx) 172*5d9d9091SRichard Lowe 173*5d9d9091SRichard Lowe .int L(P0Q4)-L(fwdPxQx) 174*5d9d9091SRichard Lowe .int L(P1Q4)-L(fwdPxQx) 175*5d9d9091SRichard Lowe .int L(P2Q4)-L(fwdPxQx) 176*5d9d9091SRichard Lowe .int L(P3Q4)-L(fwdPxQx) 177*5d9d9091SRichard Lowe .int L(P4Q4)-L(fwdPxQx) 178*5d9d9091SRichard Lowe .int L(P5Q4)-L(fwdPxQx) 179*5d9d9091SRichard Lowe .int L(P6Q4)-L(fwdPxQx) 180*5d9d9091SRichard Lowe .int L(P7Q4)-L(fwdPxQx) 181*5d9d9091SRichard Lowe 182*5d9d9091SRichard Lowe .int L(P0Q5)-L(fwdPxQx) 183*5d9d9091SRichard Lowe .int L(P1Q5)-L(fwdPxQx) 184*5d9d9091SRichard Lowe .int L(P2Q5)-L(fwdPxQx) 185*5d9d9091SRichard Lowe .int L(P3Q5)-L(fwdPxQx) 186*5d9d9091SRichard Lowe .int L(P4Q5)-L(fwdPxQx) 187*5d9d9091SRichard Lowe .int L(P5Q5)-L(fwdPxQx) 188*5d9d9091SRichard Lowe .int L(P6Q5)-L(fwdPxQx) 189*5d9d9091SRichard Lowe .int L(P7Q5)-L(fwdPxQx) 190*5d9d9091SRichard Lowe 191*5d9d9091SRichard Lowe .int L(P0Q6)-L(fwdPxQx) 192*5d9d9091SRichard Lowe .int L(P1Q6)-L(fwdPxQx) 193*5d9d9091SRichard Lowe .int L(P2Q6)-L(fwdPxQx) 194*5d9d9091SRichard Lowe .int L(P3Q6)-L(fwdPxQx) 195*5d9d9091SRichard Lowe .int L(P4Q6)-L(fwdPxQx) 196*5d9d9091SRichard Lowe .int L(P5Q6)-L(fwdPxQx) 197*5d9d9091SRichard Lowe .int L(P6Q6)-L(fwdPxQx) 198*5d9d9091SRichard Lowe .int L(P7Q6)-L(fwdPxQx) 199*5d9d9091SRichard Lowe 200*5d9d9091SRichard Lowe .int L(P0Q7)-L(fwdPxQx) 201*5d9d9091SRichard Lowe .int L(P1Q7)-L(fwdPxQx) 202*5d9d9091SRichard Lowe .int L(P2Q7)-L(fwdPxQx) 203*5d9d9091SRichard Lowe .int L(P3Q7)-L(fwdPxQx) 204*5d9d9091SRichard Lowe .int L(P4Q7)-L(fwdPxQx) 205*5d9d9091SRichard Lowe .int L(P5Q7)-L(fwdPxQx) 206*5d9d9091SRichard Lowe .int L(P6Q7)-L(fwdPxQx) 207*5d9d9091SRichard Lowe .int L(P7Q7)-L(fwdPxQx) 208*5d9d9091SRichard Lowe 209*5d9d9091SRichard Lowe .int L(P0Q8)-L(fwdPxQx) 210*5d9d9091SRichard Lowe .int L(P1Q8)-L(fwdPxQx) 211*5d9d9091SRichard Lowe .int L(P2Q8)-L(fwdPxQx) 212*5d9d9091SRichard Lowe .int L(P3Q8)-L(fwdPxQx) 213*5d9d9091SRichard Lowe .int L(P4Q8)-L(fwdPxQx) 214*5d9d9091SRichard Lowe .int L(P5Q8)-L(fwdPxQx) 215*5d9d9091SRichard Lowe .int L(P6Q8)-L(fwdPxQx) 216*5d9d9091SRichard Lowe .int L(P7Q8)-L(fwdPxQx) 217*5d9d9091SRichard Lowe 218*5d9d9091SRichard Lowe .int L(P0Q9)-L(fwdPxQx) 219*5d9d9091SRichard Lowe .int L(P1Q9)-L(fwdPxQx) 220*5d9d9091SRichard Lowe .int L(P2Q9)-L(fwdPxQx) 221*5d9d9091SRichard Lowe .int L(P3Q9)-L(fwdPxQx) 222*5d9d9091SRichard Lowe .int L(P4Q9)-L(fwdPxQx) 223*5d9d9091SRichard Lowe .int L(P5Q9)-L(fwdPxQx) 224*5d9d9091SRichard Lowe .int L(P6Q9)-L(fwdPxQx) 225*5d9d9091SRichard Lowe .int L(P7Q9)-L(fwdPxQx) 226*5d9d9091SRichard Lowe 227*5d9d9091SRichard Lowe .int L(P0QA)-L(fwdPxQx) 228*5d9d9091SRichard Lowe .int L(P1QA)-L(fwdPxQx) 229*5d9d9091SRichard Lowe .int L(P2QA)-L(fwdPxQx) 230*5d9d9091SRichard Lowe .int L(P3QA)-L(fwdPxQx) 231*5d9d9091SRichard Lowe .int L(P4QA)-L(fwdPxQx) 232*5d9d9091SRichard Lowe .int L(P5QA)-L(fwdPxQx) 233*5d9d9091SRichard Lowe .int L(P6QA)-L(fwdPxQx) 234*5d9d9091SRichard Lowe .int L(P7QA)-L(fwdPxQx) 235*5d9d9091SRichard Lowe 236*5d9d9091SRichard Lowe .int L(P0QB)-L(fwdPxQx) 237*5d9d9091SRichard Lowe .int L(P1QB)-L(fwdPxQx) 238*5d9d9091SRichard Lowe .int L(P2QB)-L(fwdPxQx) 239*5d9d9091SRichard Lowe .int L(P3QB)-L(fwdPxQx) 240*5d9d9091SRichard Lowe .int L(P4QB)-L(fwdPxQx) 241*5d9d9091SRichard Lowe .int L(P5QB)-L(fwdPxQx) 242*5d9d9091SRichard Lowe .int L(P6QB)-L(fwdPxQx) 243*5d9d9091SRichard Lowe .int L(P7QB)-L(fwdPxQx) 244*5d9d9091SRichard Lowe 245*5d9d9091SRichard Lowe .int L(P0QC)-L(fwdPxQx) 246*5d9d9091SRichard Lowe .int L(P1QC)-L(fwdPxQx) 247*5d9d9091SRichard Lowe .int L(P2QC)-L(fwdPxQx) 248*5d9d9091SRichard Lowe .int L(P3QC)-L(fwdPxQx) 249*5d9d9091SRichard Lowe .int L(P4QC)-L(fwdPxQx) 250*5d9d9091SRichard Lowe .int L(P5QC)-L(fwdPxQx) 251*5d9d9091SRichard Lowe .int L(P6QC)-L(fwdPxQx) 252*5d9d9091SRichard Lowe .int L(P7QC)-L(fwdPxQx) 253*5d9d9091SRichard Lowe 254*5d9d9091SRichard Lowe .int L(P0QD)-L(fwdPxQx) 255*5d9d9091SRichard Lowe .int L(P1QD)-L(fwdPxQx) 256*5d9d9091SRichard Lowe .int L(P2QD)-L(fwdPxQx) 257*5d9d9091SRichard Lowe .int L(P3QD)-L(fwdPxQx) 258*5d9d9091SRichard Lowe .int L(P4QD)-L(fwdPxQx) 259*5d9d9091SRichard Lowe .int L(P5QD)-L(fwdPxQx) 260*5d9d9091SRichard Lowe .int L(P6QD)-L(fwdPxQx) 261*5d9d9091SRichard Lowe .int L(P7QD)-L(fwdPxQx) 262*5d9d9091SRichard Lowe 263*5d9d9091SRichard Lowe .int L(P0QE)-L(fwdPxQx) 264*5d9d9091SRichard Lowe .int L(P1QE)-L(fwdPxQx) 265*5d9d9091SRichard Lowe .int L(P2QE)-L(fwdPxQx) 266*5d9d9091SRichard Lowe .int L(P3QE)-L(fwdPxQx) 267*5d9d9091SRichard Lowe .int L(P4QE)-L(fwdPxQx) 268*5d9d9091SRichard Lowe .int L(P5QE)-L(fwdPxQx) 269*5d9d9091SRichard Lowe .int L(P6QE)-L(fwdPxQx) 270*5d9d9091SRichard Lowe .int L(P7QE)-L(fwdPxQx) 271*5d9d9091SRichard Lowe 272*5d9d9091SRichard Lowe .int L(P0QF)-L(fwdPxQx) 273*5d9d9091SRichard Lowe .int L(P1QF)-L(fwdPxQx) 274*5d9d9091SRichard Lowe .int L(P2QF)-L(fwdPxQx) 275*5d9d9091SRichard Lowe .int L(P3QF)-L(fwdPxQx) 276*5d9d9091SRichard Lowe .int L(P4QF)-L(fwdPxQx) 277*5d9d9091SRichard Lowe .int L(P5QF)-L(fwdPxQx) 278*5d9d9091SRichard Lowe .int L(P6QF)-L(fwdPxQx) 279*5d9d9091SRichard Lowe .int L(P7QF)-L(fwdPxQx) 280*5d9d9091SRichard Lowe 281*5d9d9091SRichard Lowe .int L(P0QG)-L(fwdPxQx) # 0x80 282*5d9d9091SRichard Lowe 283*5d9d9091SRichard Lowe .balign 16 284*5d9d9091SRichard LoweL(AliPxQx): .int L(now_qw_aligned)-L(AliPxQx) 285*5d9d9091SRichard Lowe .int L(A1Q0)-L(AliPxQx) 286*5d9d9091SRichard Lowe .int L(A2Q0)-L(AliPxQx) 287*5d9d9091SRichard Lowe .int L(A3Q0)-L(AliPxQx) 288*5d9d9091SRichard Lowe .int L(A4Q0)-L(AliPxQx) 289*5d9d9091SRichard Lowe .int L(A5Q0)-L(AliPxQx) 290*5d9d9091SRichard Lowe .int L(A6Q0)-L(AliPxQx) 291*5d9d9091SRichard Lowe .int L(A7Q0)-L(AliPxQx) 292*5d9d9091SRichard Lowe .int L(A0Q1)-L(AliPxQx) 293*5d9d9091SRichard Lowe .int L(A1Q1)-L(AliPxQx) 294*5d9d9091SRichard Lowe .int L(A2Q1)-L(AliPxQx) 295*5d9d9091SRichard Lowe .int L(A3Q1)-L(AliPxQx) 296*5d9d9091SRichard Lowe .int L(A4Q1)-L(AliPxQx) 297*5d9d9091SRichard Lowe .int L(A5Q1)-L(AliPxQx) 298*5d9d9091SRichard Lowe .int L(A6Q1)-L(AliPxQx) 299*5d9d9091SRichard Lowe .int L(A7Q1)-L(AliPxQx) 300*5d9d9091SRichard Lowe 301*5d9d9091SRichard Lowe .balign 16 302*5d9d9091SRichard LoweL(A1Q0): # ; need to move 8+ 7=1+2+4 bytes 303*5d9d9091SRichard Lowe movzbq (%rdx),%r11 304*5d9d9091SRichard Lowe sub $0xf,%r8 305*5d9d9091SRichard Lowe mov %r11b,(%rcx) 306*5d9d9091SRichard Lowe 307*5d9d9091SRichard Lowe movzwq 0x1(%rdx),%r10 308*5d9d9091SRichard Lowe mov %r10w,0x1(%rcx) 309*5d9d9091SRichard Lowe 310*5d9d9091SRichard Lowe mov 0x3(%rdx),%r9d 311*5d9d9091SRichard Lowe mov %r9d,0x3(%rcx) 312*5d9d9091SRichard Lowe 313*5d9d9091SRichard Lowe mov 0x7(%rdx),%r11 314*5d9d9091SRichard Lowe add $0xf,%rdx 315*5d9d9091SRichard Lowe mov %r11,0x7(%rcx) 316*5d9d9091SRichard Lowe 317*5d9d9091SRichard Lowe add $0xf,%rcx 318*5d9d9091SRichard Lowe jmp L(now_qw_aligned) 319*5d9d9091SRichard Lowe 320*5d9d9091SRichard Lowe .balign 16 321*5d9d9091SRichard LoweL(A2Q0): # ; need to move 8+ 6=2+4 bytes 322*5d9d9091SRichard Lowe movzwq (%rdx),%r10 323*5d9d9091SRichard Lowe sub $0xe,%r8 324*5d9d9091SRichard Lowe mov %r10w,(%rcx) 325*5d9d9091SRichard Lowe 326*5d9d9091SRichard Lowe mov 0x2(%rdx),%r9d 327*5d9d9091SRichard Lowe mov %r9d,0x2(%rcx) 328*5d9d9091SRichard Lowe 329*5d9d9091SRichard Lowe mov 0x6(%rdx),%r11 330*5d9d9091SRichard Lowe add $0xe,%rdx 331*5d9d9091SRichard Lowe mov %r11,0x6(%rcx) 332*5d9d9091SRichard Lowe add $0xe,%rcx 333*5d9d9091SRichard Lowe jmp L(now_qw_aligned) 334*5d9d9091SRichard Lowe 335*5d9d9091SRichard Lowe .balign 16 336*5d9d9091SRichard LoweL(A3Q0): # ; need to move 8+ 5=1+4 bytes 337*5d9d9091SRichard Lowe movzbq (%rdx),%r11 338*5d9d9091SRichard Lowe sub $0xd,%r8 339*5d9d9091SRichard Lowe mov %r11b,(%rcx) 340*5d9d9091SRichard Lowe 341*5d9d9091SRichard Lowe mov 0x1(%rdx),%r9d 342*5d9d9091SRichard Lowe mov %r9d,0x1(%rcx) 343*5d9d9091SRichard Lowe 344*5d9d9091SRichard Lowe mov 0x5(%rdx),%r10 345*5d9d9091SRichard Lowe add $0xd,%rdx 346*5d9d9091SRichard Lowe mov %r10,0x5(%rcx) 347*5d9d9091SRichard Lowe 348*5d9d9091SRichard Lowe add $0xd,%rcx 349*5d9d9091SRichard Lowe jmp L(now_qw_aligned) 350*5d9d9091SRichard Lowe 351*5d9d9091SRichard Lowe .balign 16 352*5d9d9091SRichard LoweL(A4Q0): # ; need to move 8+4 bytes 353*5d9d9091SRichard Lowe mov (%rdx),%r9d 354*5d9d9091SRichard Lowe sub $0xc,%r8 355*5d9d9091SRichard Lowe mov %r9d,(%rcx) 356*5d9d9091SRichard Lowe 357*5d9d9091SRichard Lowe mov 0x4(%rdx),%r10 358*5d9d9091SRichard Lowe add $0xc,%rdx 359*5d9d9091SRichard Lowe mov %r10,0x4(%rcx) 360*5d9d9091SRichard Lowe 361*5d9d9091SRichard Lowe add $0xc,%rcx 362*5d9d9091SRichard Lowe jmp L(now_qw_aligned) 363*5d9d9091SRichard Lowe 364*5d9d9091SRichard Lowe .balign 16 365*5d9d9091SRichard LoweL(A5Q0): # ; need to move 8+ 3=1+2 bytes 366*5d9d9091SRichard Lowe movzbq (%rdx),%r11 367*5d9d9091SRichard Lowe sub $0xb,%r8 368*5d9d9091SRichard Lowe mov %r11b,(%rcx) 369*5d9d9091SRichard Lowe 370*5d9d9091SRichard Lowe movzwq 0x1(%rdx),%r10 371*5d9d9091SRichard Lowe mov %r10w,0x1(%rcx) 372*5d9d9091SRichard Lowe 373*5d9d9091SRichard Lowe mov 0x3(%rdx),%r9 374*5d9d9091SRichard Lowe add $0xb,%rdx 375*5d9d9091SRichard Lowe mov %r9,0x3(%rcx) 376*5d9d9091SRichard Lowe 377*5d9d9091SRichard Lowe add $0xb,%rcx 378*5d9d9091SRichard Lowe jmp L(now_qw_aligned) 379*5d9d9091SRichard Lowe 380*5d9d9091SRichard Lowe .balign 16 381*5d9d9091SRichard LoweL(A6Q0): # ; need to move 8+2 bytes 382*5d9d9091SRichard Lowe movzwq (%rdx),%r10 383*5d9d9091SRichard Lowe sub $0xa,%r8 384*5d9d9091SRichard Lowe mov %r10w,(%rcx) 385*5d9d9091SRichard Lowe 386*5d9d9091SRichard Lowe mov 0x2(%rdx),%r9 387*5d9d9091SRichard Lowe add $0xa,%rdx 388*5d9d9091SRichard Lowe mov %r9,0x2(%rcx) 389*5d9d9091SRichard Lowe 390*5d9d9091SRichard Lowe add $0xa,%rcx 391*5d9d9091SRichard Lowe jmp L(now_qw_aligned) 392*5d9d9091SRichard Lowe 393*5d9d9091SRichard Lowe .balign 16 394*5d9d9091SRichard LoweL(A7Q0): # ; need to move 8+1 byte 395*5d9d9091SRichard Lowe movzbq (%rdx),%r11 396*5d9d9091SRichard Lowe sub $0x9,%r8 397*5d9d9091SRichard Lowe mov %r11b,(%rcx) 398*5d9d9091SRichard Lowe 399*5d9d9091SRichard Lowe mov 0x1(%rdx),%r10 400*5d9d9091SRichard Lowe add $0x9,%rdx 401*5d9d9091SRichard Lowe mov %r10,0x1(%rcx) 402*5d9d9091SRichard Lowe 403*5d9d9091SRichard Lowe add $0x9,%rcx 404*5d9d9091SRichard Lowe jmp L(now_qw_aligned) 405*5d9d9091SRichard Lowe 406*5d9d9091SRichard Lowe .balign 16 407*5d9d9091SRichard LoweL(A0Q1): # ; need to move 8 bytes 408*5d9d9091SRichard Lowe 409*5d9d9091SRichard Lowe mov (%rdx),%r10 410*5d9d9091SRichard Lowe add $0x8,%rdx 411*5d9d9091SRichard Lowe sub $0x8,%r8 412*5d9d9091SRichard Lowe mov %r10,(%rcx) 413*5d9d9091SRichard Lowe 414*5d9d9091SRichard Lowe add $0x8,%rcx 415*5d9d9091SRichard Lowe jmp L(now_qw_aligned) 416*5d9d9091SRichard Lowe 417*5d9d9091SRichard Lowe .balign 16 418*5d9d9091SRichard LoweL(A1Q1): # ; need to move 7=1+2+4 bytes 419*5d9d9091SRichard Lowe movzbq (%rdx),%r11 420*5d9d9091SRichard Lowe sub $0x7,%r8 421*5d9d9091SRichard Lowe mov %r11b,(%rcx) 422*5d9d9091SRichard Lowe 423*5d9d9091SRichard Lowe movzwq 0x1(%rdx),%r10 424*5d9d9091SRichard Lowe mov %r10w,0x1(%rcx) 425*5d9d9091SRichard Lowe 426*5d9d9091SRichard Lowe mov 0x3(%rdx),%r9d 427*5d9d9091SRichard Lowe add $0x7,%rdx 428*5d9d9091SRichard Lowe mov %r9d,0x3(%rcx) 429*5d9d9091SRichard Lowe add $0x7,%rcx 430*5d9d9091SRichard Lowe jmp L(now_qw_aligned) 431*5d9d9091SRichard Lowe 432*5d9d9091SRichard Lowe .balign 16 433*5d9d9091SRichard LoweL(A2Q1): # ; need to move 6=2+4 bytes 434*5d9d9091SRichard Lowe movzwq (%rdx),%r10 435*5d9d9091SRichard Lowe sub $0x6,%r8 436*5d9d9091SRichard Lowe mov %r10w,(%rcx) 437*5d9d9091SRichard Lowe mov 0x2(%rdx),%r9d 438*5d9d9091SRichard Lowe add $0x6,%rdx 439*5d9d9091SRichard Lowe mov %r9d,0x2(%rcx) 440*5d9d9091SRichard Lowe add $0x6,%rcx 441*5d9d9091SRichard Lowe jmp L(now_qw_aligned) 442*5d9d9091SRichard Lowe 443*5d9d9091SRichard Lowe .balign 16 444*5d9d9091SRichard LoweL(A3Q1): # ; need to move 5=1+4 bytes 445*5d9d9091SRichard Lowe movzbq (%rdx),%r11 446*5d9d9091SRichard Lowe sub $0x5,%r8 447*5d9d9091SRichard Lowe mov %r11b,(%rcx) 448*5d9d9091SRichard Lowe mov 0x1(%rdx),%r9d 449*5d9d9091SRichard Lowe add $0x5,%rdx 450*5d9d9091SRichard Lowe mov %r9d,0x1(%rcx) 451*5d9d9091SRichard Lowe add $0x5,%rcx 452*5d9d9091SRichard Lowe jmp L(now_qw_aligned) 453*5d9d9091SRichard Lowe 454*5d9d9091SRichard Lowe .balign 16 455*5d9d9091SRichard LoweL(A4Q1): # ; need to move 4 bytes 456*5d9d9091SRichard Lowe mov (%rdx),%r9d 457*5d9d9091SRichard Lowe sub $0x4,%r8 458*5d9d9091SRichard Lowe add $0x4,%rdx 459*5d9d9091SRichard Lowe mov %r9d,(%rcx) 460*5d9d9091SRichard Lowe add $0x4,%rcx 461*5d9d9091SRichard Lowe jmp L(now_qw_aligned) 462*5d9d9091SRichard Lowe 463*5d9d9091SRichard Lowe .balign 16 464*5d9d9091SRichard LoweL(A5Q1): # ; need to move 3=1+2 bytes 465*5d9d9091SRichard Lowe movzbq (%rdx),%r11 466*5d9d9091SRichard Lowe sub $0x3,%r8 467*5d9d9091SRichard Lowe mov %r11b,(%rcx) 468*5d9d9091SRichard Lowe 469*5d9d9091SRichard Lowe movzwq 0x1(%rdx),%r10 470*5d9d9091SRichard Lowe add $0x3,%rdx 471*5d9d9091SRichard Lowe mov %r10w,0x1(%rcx) 472*5d9d9091SRichard Lowe 473*5d9d9091SRichard Lowe add $0x3,%rcx 474*5d9d9091SRichard Lowe jmp L(now_qw_aligned) 475*5d9d9091SRichard Lowe 476*5d9d9091SRichard Lowe .balign 16 477*5d9d9091SRichard LoweL(A6Q1): # ; need to move 2 bytes 478*5d9d9091SRichard Lowe movzwq (%rdx),%r10 479*5d9d9091SRichard Lowe sub $0x2,%r8 480*5d9d9091SRichard Lowe add $0x2,%rdx 481*5d9d9091SRichard Lowe mov %r10w,(%rcx) 482*5d9d9091SRichard Lowe add $0x2,%rcx 483*5d9d9091SRichard Lowe jmp L(now_qw_aligned) 484*5d9d9091SRichard Lowe 485*5d9d9091SRichard Lowe .balign 16 486*5d9d9091SRichard LoweL(A7Q1): # ; need to move 1 byte 487*5d9d9091SRichard Lowe movzbq (%rdx),%r11 488*5d9d9091SRichard Lowe dec %r8 489*5d9d9091SRichard Lowe inc %rdx 490*5d9d9091SRichard Lowe mov %r11b,(%rcx) 491*5d9d9091SRichard Lowe inc %rcx 492*5d9d9091SRichard Lowe jmp L(now_qw_aligned) 493*5d9d9091SRichard Lowe 494*5d9d9091SRichard Lowe 495*5d9d9091SRichard Lowe .balign 16 496*5d9d9091SRichard LoweL(P0QG): 497*5d9d9091SRichard Lowe mov -0x80(%rdx),%r9 498*5d9d9091SRichard Lowe mov %r9,-0x80(%rcx) 499*5d9d9091SRichard LoweL(P0QF): 500*5d9d9091SRichard Lowe mov -0x78(%rdx),%r10 501*5d9d9091SRichard Lowe mov %r10,-0x78(%rcx) 502*5d9d9091SRichard LoweL(P0QE): 503*5d9d9091SRichard Lowe mov -0x70(%rdx),%r9 504*5d9d9091SRichard Lowe mov %r9,-0x70(%rcx) 505*5d9d9091SRichard LoweL(P0QD): 506*5d9d9091SRichard Lowe mov -0x68(%rdx),%r10 507*5d9d9091SRichard Lowe mov %r10,-0x68(%rcx) 508*5d9d9091SRichard LoweL(P0QC): 509*5d9d9091SRichard Lowe mov -0x60(%rdx),%r9 510*5d9d9091SRichard Lowe mov %r9,-0x60(%rcx) 511*5d9d9091SRichard LoweL(P0QB): 512*5d9d9091SRichard Lowe mov -0x58(%rdx),%r10 513*5d9d9091SRichard Lowe mov %r10,-0x58(%rcx) 514*5d9d9091SRichard LoweL(P0QA): 515*5d9d9091SRichard Lowe mov -0x50(%rdx),%r9 516*5d9d9091SRichard Lowe mov %r9,-0x50(%rcx) 517*5d9d9091SRichard LoweL(P0Q9): 518*5d9d9091SRichard Lowe mov -0x48(%rdx),%r10 519*5d9d9091SRichard Lowe mov %r10,-0x48(%rcx) 520*5d9d9091SRichard LoweL(P0Q8): 521*5d9d9091SRichard Lowe mov -0x40(%rdx),%r9 522*5d9d9091SRichard Lowe mov %r9,-0x40(%rcx) 523*5d9d9091SRichard LoweL(P0Q7): 524*5d9d9091SRichard Lowe mov -0x38(%rdx),%r10 525*5d9d9091SRichard Lowe mov %r10,-0x38(%rcx) 526*5d9d9091SRichard LoweL(P0Q6): 527*5d9d9091SRichard Lowe mov -0x30(%rdx),%r9 528*5d9d9091SRichard Lowe mov %r9,-0x30(%rcx) 529*5d9d9091SRichard LoweL(P0Q5): 530*5d9d9091SRichard Lowe mov -0x28(%rdx),%r10 531*5d9d9091SRichard Lowe mov %r10,-0x28(%rcx) 532*5d9d9091SRichard LoweL(P0Q4): 533*5d9d9091SRichard Lowe mov -0x20(%rdx),%r9 534*5d9d9091SRichard Lowe mov %r9,-0x20(%rcx) 535*5d9d9091SRichard LoweL(P0Q3): 536*5d9d9091SRichard Lowe mov -0x18(%rdx),%r10 537*5d9d9091SRichard Lowe mov %r10,-0x18(%rcx) 538*5d9d9091SRichard LoweL(P0Q2): 539*5d9d9091SRichard Lowe mov -0x10(%rdx),%r9 540*5d9d9091SRichard Lowe mov %r9,-0x10(%rcx) 541*5d9d9091SRichard LoweL(P0Q1): 542*5d9d9091SRichard Lowe mov -0x8(%rdx),%r10 543*5d9d9091SRichard Lowe mov %r10,-0x8(%rcx) 544*5d9d9091SRichard LoweL(P0Q0): 545*5d9d9091SRichard Lowe ret 546*5d9d9091SRichard Lowe 547*5d9d9091SRichard Lowe .balign 16 548*5d9d9091SRichard LoweL(P1QF): 549*5d9d9091SRichard Lowe mov -0x79(%rdx),%r9 550*5d9d9091SRichard Lowe mov %r9,-0x79(%rcx) 551*5d9d9091SRichard LoweL(P1QE): 552*5d9d9091SRichard Lowe mov -0x71(%rdx),%r11 553*5d9d9091SRichard Lowe mov %r11,-0x71(%rcx) 554*5d9d9091SRichard LoweL(P1QD): 555*5d9d9091SRichard Lowe mov -0x69(%rdx),%r10 556*5d9d9091SRichard Lowe mov %r10,-0x69(%rcx) 557*5d9d9091SRichard LoweL(P1QC): 558*5d9d9091SRichard Lowe mov -0x61(%rdx),%r9 559*5d9d9091SRichard Lowe mov %r9,-0x61(%rcx) 560*5d9d9091SRichard LoweL(P1QB): 561*5d9d9091SRichard Lowe mov -0x59(%rdx),%r11 562*5d9d9091SRichard Lowe mov %r11,-0x59(%rcx) 563*5d9d9091SRichard LoweL(P1QA): 564*5d9d9091SRichard Lowe mov -0x51(%rdx),%r10 565*5d9d9091SRichard Lowe mov %r10,-0x51(%rcx) 566*5d9d9091SRichard LoweL(P1Q9): 567*5d9d9091SRichard Lowe mov -0x49(%rdx),%r9 568*5d9d9091SRichard Lowe mov %r9,-0x49(%rcx) 569*5d9d9091SRichard LoweL(P1Q8): 570*5d9d9091SRichard Lowe mov -0x41(%rdx),%r11 571*5d9d9091SRichard Lowe mov %r11,-0x41(%rcx) 572*5d9d9091SRichard LoweL(P1Q7): 573*5d9d9091SRichard Lowe mov -0x39(%rdx),%r10 574*5d9d9091SRichard Lowe mov %r10,-0x39(%rcx) 575*5d9d9091SRichard LoweL(P1Q6): 576*5d9d9091SRichard Lowe mov -0x31(%rdx),%r9 577*5d9d9091SRichard Lowe mov %r9,-0x31(%rcx) 578*5d9d9091SRichard LoweL(P1Q5): 579*5d9d9091SRichard Lowe mov -0x29(%rdx),%r11 580*5d9d9091SRichard Lowe mov %r11,-0x29(%rcx) 581*5d9d9091SRichard LoweL(P1Q4): 582*5d9d9091SRichard Lowe mov -0x21(%rdx),%r10 583*5d9d9091SRichard Lowe mov %r10,-0x21(%rcx) 584*5d9d9091SRichard LoweL(P1Q3): 585*5d9d9091SRichard Lowe mov -0x19(%rdx),%r9 586*5d9d9091SRichard Lowe mov %r9,-0x19(%rcx) 587*5d9d9091SRichard LoweL(P1Q2): 588*5d9d9091SRichard Lowe mov -0x11(%rdx),%r11 589*5d9d9091SRichard Lowe mov %r11,-0x11(%rcx) 590*5d9d9091SRichard LoweL(P1Q1): 591*5d9d9091SRichard Lowe mov -0x9(%rdx),%r10 592*5d9d9091SRichard Lowe mov %r10,-0x9(%rcx) 593*5d9d9091SRichard LoweL(P1Q0): 594*5d9d9091SRichard Lowe movzbq -0x1(%rdx),%r9 595*5d9d9091SRichard Lowe mov %r9b,-0x1(%rcx) 596*5d9d9091SRichard Lowe ret 597*5d9d9091SRichard Lowe 598*5d9d9091SRichard Lowe .balign 16 599*5d9d9091SRichard LoweL(P2QF): 600*5d9d9091SRichard Lowe mov -0x7a(%rdx),%r9 601*5d9d9091SRichard Lowe mov %r9,-0x7a(%rcx) 602*5d9d9091SRichard LoweL(P2QE): 603*5d9d9091SRichard Lowe mov -0x72(%rdx),%r11 604*5d9d9091SRichard Lowe mov %r11,-0x72(%rcx) 605*5d9d9091SRichard LoweL(P2QD): 606*5d9d9091SRichard Lowe mov -0x6a(%rdx),%r10 607*5d9d9091SRichard Lowe mov %r10,-0x6a(%rcx) 608*5d9d9091SRichard LoweL(P2QC): 609*5d9d9091SRichard Lowe mov -0x62(%rdx),%r9 610*5d9d9091SRichard Lowe mov %r9,-0x62(%rcx) 611*5d9d9091SRichard LoweL(P2QB): 612*5d9d9091SRichard Lowe mov -0x5a(%rdx),%r11 613*5d9d9091SRichard Lowe mov %r11,-0x5a(%rcx) 614*5d9d9091SRichard LoweL(P2QA): 615*5d9d9091SRichard Lowe mov -0x52(%rdx),%r10 616*5d9d9091SRichard Lowe mov %r10,-0x52(%rcx) 617*5d9d9091SRichard LoweL(P2Q9): 618*5d9d9091SRichard Lowe mov -0x4a(%rdx),%r9 619*5d9d9091SRichard Lowe mov %r9,-0x4a(%rcx) 620*5d9d9091SRichard LoweL(P2Q8): 621*5d9d9091SRichard Lowe mov -0x42(%rdx),%r11 622*5d9d9091SRichard Lowe mov %r11,-0x42(%rcx) 623*5d9d9091SRichard LoweL(P2Q7): 624*5d9d9091SRichard Lowe mov -0x3a(%rdx),%r10 625*5d9d9091SRichard Lowe mov %r10,-0x3a(%rcx) 626*5d9d9091SRichard LoweL(P2Q6): 627*5d9d9091SRichard Lowe mov -0x32(%rdx),%r9 628*5d9d9091SRichard Lowe mov %r9,-0x32(%rcx) 629*5d9d9091SRichard LoweL(P2Q5): 630*5d9d9091SRichard Lowe mov -0x2a(%rdx),%r11 631*5d9d9091SRichard Lowe mov %r11,-0x2a(%rcx) 632*5d9d9091SRichard LoweL(P2Q4): 633*5d9d9091SRichard Lowe mov -0x22(%rdx),%r10 634*5d9d9091SRichard Lowe mov %r10,-0x22(%rcx) 635*5d9d9091SRichard LoweL(P2Q3): 636*5d9d9091SRichard Lowe mov -0x1a(%rdx),%r9 637*5d9d9091SRichard Lowe mov %r9,-0x1a(%rcx) 638*5d9d9091SRichard LoweL(P2Q2): 639*5d9d9091SRichard Lowe mov -0x12(%rdx),%r11 640*5d9d9091SRichard Lowe mov %r11,-0x12(%rcx) 641*5d9d9091SRichard LoweL(P2Q1): 642*5d9d9091SRichard Lowe mov -0xa(%rdx),%r10 643*5d9d9091SRichard Lowe mov %r10,-0xa(%rcx) 644*5d9d9091SRichard LoweL(P2Q0): 645*5d9d9091SRichard Lowe movzwq -0x2(%rdx),%r9 646*5d9d9091SRichard Lowe mov %r9w,-0x2(%rcx) 647*5d9d9091SRichard Lowe ret 648*5d9d9091SRichard Lowe 649*5d9d9091SRichard Lowe .balign 16 650*5d9d9091SRichard LoweL(P3QF): 651*5d9d9091SRichard Lowe mov -0x7b(%rdx),%r9 652*5d9d9091SRichard Lowe mov %r9,-0x7b(%rcx) 653*5d9d9091SRichard LoweL(P3QE): 654*5d9d9091SRichard Lowe mov -0x73(%rdx),%r11 655*5d9d9091SRichard Lowe mov %r11,-0x73(%rcx) 656*5d9d9091SRichard LoweL(P3QD): 657*5d9d9091SRichard Lowe mov -0x6b(%rdx),%r10 658*5d9d9091SRichard Lowe mov %r10,-0x6b(%rcx) 659*5d9d9091SRichard LoweL(P3QC): 660*5d9d9091SRichard Lowe mov -0x63(%rdx),%r9 661*5d9d9091SRichard Lowe mov %r9,-0x63(%rcx) 662*5d9d9091SRichard LoweL(P3QB): 663*5d9d9091SRichard Lowe mov -0x5b(%rdx),%r11 664*5d9d9091SRichard Lowe mov %r11,-0x5b(%rcx) 665*5d9d9091SRichard LoweL(P3QA): 666*5d9d9091SRichard Lowe mov -0x53(%rdx),%r10 667*5d9d9091SRichard Lowe mov %r10,-0x53(%rcx) 668*5d9d9091SRichard LoweL(P3Q9): 669*5d9d9091SRichard Lowe mov -0x4b(%rdx),%r9 670*5d9d9091SRichard Lowe mov %r9,-0x4b(%rcx) 671*5d9d9091SRichard LoweL(P3Q8): 672*5d9d9091SRichard Lowe mov -0x43(%rdx),%r11 673*5d9d9091SRichard Lowe mov %r11,-0x43(%rcx) 674*5d9d9091SRichard LoweL(P3Q7): 675*5d9d9091SRichard Lowe mov -0x3b(%rdx),%r10 676*5d9d9091SRichard Lowe mov %r10,-0x3b(%rcx) 677*5d9d9091SRichard LoweL(P3Q6): 678*5d9d9091SRichard Lowe mov -0x33(%rdx),%r9 679*5d9d9091SRichard Lowe mov %r9,-0x33(%rcx) 680*5d9d9091SRichard LoweL(P3Q5): 681*5d9d9091SRichard Lowe mov -0x2b(%rdx),%r11 682*5d9d9091SRichard Lowe mov %r11,-0x2b(%rcx) 683*5d9d9091SRichard LoweL(P3Q4): 684*5d9d9091SRichard Lowe mov -0x23(%rdx),%r10 685*5d9d9091SRichard Lowe mov %r10,-0x23(%rcx) 686*5d9d9091SRichard LoweL(P3Q3): 687*5d9d9091SRichard Lowe mov -0x1b(%rdx),%r9 688*5d9d9091SRichard Lowe mov %r9,-0x1b(%rcx) 689*5d9d9091SRichard LoweL(P3Q2): 690*5d9d9091SRichard Lowe mov -0x13(%rdx),%r11 691*5d9d9091SRichard Lowe mov %r11,-0x13(%rcx) 692*5d9d9091SRichard LoweL(P3Q1): 693*5d9d9091SRichard Lowe mov -0xb(%rdx),%r10 694*5d9d9091SRichard Lowe mov %r10,-0xb(%rcx) 695*5d9d9091SRichard Lowe /* 696*5d9d9091SRichard Lowe * These trailing loads/stores have to do all their loads 1st, 697*5d9d9091SRichard Lowe * then do the stores. 698*5d9d9091SRichard Lowe */ 699*5d9d9091SRichard LoweL(P3Q0): 700*5d9d9091SRichard Lowe movzwq -0x3(%rdx),%r9 701*5d9d9091SRichard Lowe movzbq -0x1(%rdx),%r10 702*5d9d9091SRichard Lowe mov %r9w,-0x3(%rcx) 703*5d9d9091SRichard Lowe mov %r10b,-0x1(%rcx) 704*5d9d9091SRichard Lowe ret 705*5d9d9091SRichard Lowe 706*5d9d9091SRichard Lowe .balign 16 707*5d9d9091SRichard LoweL(P4QF): 708*5d9d9091SRichard Lowe mov -0x7c(%rdx),%r9 709*5d9d9091SRichard Lowe mov %r9,-0x7c(%rcx) 710*5d9d9091SRichard LoweL(P4QE): 711*5d9d9091SRichard Lowe mov -0x74(%rdx),%r11 712*5d9d9091SRichard Lowe mov %r11,-0x74(%rcx) 713*5d9d9091SRichard LoweL(P4QD): 714*5d9d9091SRichard Lowe mov -0x6c(%rdx),%r10 715*5d9d9091SRichard Lowe mov %r10,-0x6c(%rcx) 716*5d9d9091SRichard LoweL(P4QC): 717*5d9d9091SRichard Lowe mov -0x64(%rdx),%r9 718*5d9d9091SRichard Lowe mov %r9,-0x64(%rcx) 719*5d9d9091SRichard LoweL(P4QB): 720*5d9d9091SRichard Lowe mov -0x5c(%rdx),%r11 721*5d9d9091SRichard Lowe mov %r11,-0x5c(%rcx) 722*5d9d9091SRichard LoweL(P4QA): 723*5d9d9091SRichard Lowe mov -0x54(%rdx),%r10 724*5d9d9091SRichard Lowe mov %r10,-0x54(%rcx) 725*5d9d9091SRichard LoweL(P4Q9): 726*5d9d9091SRichard Lowe mov -0x4c(%rdx),%r9 727*5d9d9091SRichard Lowe mov %r9,-0x4c(%rcx) 728*5d9d9091SRichard LoweL(P4Q8): 729*5d9d9091SRichard Lowe mov -0x44(%rdx),%r11 730*5d9d9091SRichard Lowe mov %r11,-0x44(%rcx) 731*5d9d9091SRichard LoweL(P4Q7): 732*5d9d9091SRichard Lowe mov -0x3c(%rdx),%r10 733*5d9d9091SRichard Lowe mov %r10,-0x3c(%rcx) 734*5d9d9091SRichard LoweL(P4Q6): 735*5d9d9091SRichard Lowe mov -0x34(%rdx),%r9 736*5d9d9091SRichard Lowe mov %r9,-0x34(%rcx) 737*5d9d9091SRichard LoweL(P4Q5): 738*5d9d9091SRichard Lowe mov -0x2c(%rdx),%r11 739*5d9d9091SRichard Lowe mov %r11,-0x2c(%rcx) 740*5d9d9091SRichard LoweL(P4Q4): 741*5d9d9091SRichard Lowe mov -0x24(%rdx),%r10 742*5d9d9091SRichard Lowe mov %r10,-0x24(%rcx) 743*5d9d9091SRichard LoweL(P4Q3): 744*5d9d9091SRichard Lowe mov -0x1c(%rdx),%r9 745*5d9d9091SRichard Lowe mov %r9,-0x1c(%rcx) 746*5d9d9091SRichard LoweL(P4Q2): 747*5d9d9091SRichard Lowe mov -0x14(%rdx),%r11 748*5d9d9091SRichard Lowe mov %r11,-0x14(%rcx) 749*5d9d9091SRichard LoweL(P4Q1): 750*5d9d9091SRichard Lowe mov -0xc(%rdx),%r10 751*5d9d9091SRichard Lowe mov %r10,-0xc(%rcx) 752*5d9d9091SRichard LoweL(P4Q0): 753*5d9d9091SRichard Lowe mov -0x4(%rdx),%r9d 754*5d9d9091SRichard Lowe mov %r9d,-0x4(%rcx) 755*5d9d9091SRichard Lowe ret 756*5d9d9091SRichard Lowe 757*5d9d9091SRichard Lowe .balign 16 758*5d9d9091SRichard LoweL(P5QF): 759*5d9d9091SRichard Lowe mov -0x7d(%rdx),%r9 760*5d9d9091SRichard Lowe mov %r9,-0x7d(%rcx) 761*5d9d9091SRichard LoweL(P5QE): 762*5d9d9091SRichard Lowe mov -0x75(%rdx),%r11 763*5d9d9091SRichard Lowe mov %r11,-0x75(%rcx) 764*5d9d9091SRichard LoweL(P5QD): 765*5d9d9091SRichard Lowe mov -0x6d(%rdx),%r10 766*5d9d9091SRichard Lowe mov %r10,-0x6d(%rcx) 767*5d9d9091SRichard LoweL(P5QC): 768*5d9d9091SRichard Lowe mov -0x65(%rdx),%r9 769*5d9d9091SRichard Lowe mov %r9,-0x65(%rcx) 770*5d9d9091SRichard LoweL(P5QB): 771*5d9d9091SRichard Lowe mov -0x5d(%rdx),%r11 772*5d9d9091SRichard Lowe mov %r11,-0x5d(%rcx) 773*5d9d9091SRichard LoweL(P5QA): 774*5d9d9091SRichard Lowe mov -0x55(%rdx),%r10 775*5d9d9091SRichard Lowe mov %r10,-0x55(%rcx) 776*5d9d9091SRichard LoweL(P5Q9): 777*5d9d9091SRichard Lowe mov -0x4d(%rdx),%r9 778*5d9d9091SRichard Lowe mov %r9,-0x4d(%rcx) 779*5d9d9091SRichard LoweL(P5Q8): 780*5d9d9091SRichard Lowe mov -0x45(%rdx),%r11 781*5d9d9091SRichard Lowe mov %r11,-0x45(%rcx) 782*5d9d9091SRichard LoweL(P5Q7): 783*5d9d9091SRichard Lowe mov -0x3d(%rdx),%r10 784*5d9d9091SRichard Lowe mov %r10,-0x3d(%rcx) 785*5d9d9091SRichard LoweL(P5Q6): 786*5d9d9091SRichard Lowe mov -0x35(%rdx),%r9 787*5d9d9091SRichard Lowe mov %r9,-0x35(%rcx) 788*5d9d9091SRichard LoweL(P5Q5): 789*5d9d9091SRichard Lowe mov -0x2d(%rdx),%r11 790*5d9d9091SRichard Lowe mov %r11,-0x2d(%rcx) 791*5d9d9091SRichard LoweL(P5Q4): 792*5d9d9091SRichard Lowe mov -0x25(%rdx),%r10 793*5d9d9091SRichard Lowe mov %r10,-0x25(%rcx) 794*5d9d9091SRichard LoweL(P5Q3): 795*5d9d9091SRichard Lowe mov -0x1d(%rdx),%r9 796*5d9d9091SRichard Lowe mov %r9,-0x1d(%rcx) 797*5d9d9091SRichard LoweL(P5Q2): 798*5d9d9091SRichard Lowe mov -0x15(%rdx),%r11 799*5d9d9091SRichard Lowe mov %r11,-0x15(%rcx) 800*5d9d9091SRichard LoweL(P5Q1): 801*5d9d9091SRichard Lowe mov -0xd(%rdx),%r10 802*5d9d9091SRichard Lowe mov %r10,-0xd(%rcx) 803*5d9d9091SRichard Lowe /* 804*5d9d9091SRichard Lowe * These trailing loads/stores have to do all their loads 1st, 805*5d9d9091SRichard Lowe * then do the stores. 806*5d9d9091SRichard Lowe */ 807*5d9d9091SRichard LoweL(P5Q0): 808*5d9d9091SRichard Lowe mov -0x5(%rdx),%r9d 809*5d9d9091SRichard Lowe movzbq -0x1(%rdx),%r10 810*5d9d9091SRichard Lowe mov %r9d,-0x5(%rcx) 811*5d9d9091SRichard Lowe mov %r10b,-0x1(%rcx) 812*5d9d9091SRichard Lowe ret 813*5d9d9091SRichard Lowe 814*5d9d9091SRichard Lowe .balign 16 815*5d9d9091SRichard LoweL(P6QF): 816*5d9d9091SRichard Lowe mov -0x7e(%rdx),%r9 817*5d9d9091SRichard Lowe mov %r9,-0x7e(%rcx) 818*5d9d9091SRichard LoweL(P6QE): 819*5d9d9091SRichard Lowe mov -0x76(%rdx),%r11 820*5d9d9091SRichard Lowe mov %r11,-0x76(%rcx) 821*5d9d9091SRichard LoweL(P6QD): 822*5d9d9091SRichard Lowe mov -0x6e(%rdx),%r10 823*5d9d9091SRichard Lowe mov %r10,-0x6e(%rcx) 824*5d9d9091SRichard LoweL(P6QC): 825*5d9d9091SRichard Lowe mov -0x66(%rdx),%r9 826*5d9d9091SRichard Lowe mov %r9,-0x66(%rcx) 827*5d9d9091SRichard LoweL(P6QB): 828*5d9d9091SRichard Lowe mov -0x5e(%rdx),%r11 829*5d9d9091SRichard Lowe mov %r11,-0x5e(%rcx) 830*5d9d9091SRichard LoweL(P6QA): 831*5d9d9091SRichard Lowe mov -0x56(%rdx),%r10 832*5d9d9091SRichard Lowe mov %r10,-0x56(%rcx) 833*5d9d9091SRichard LoweL(P6Q9): 834*5d9d9091SRichard Lowe mov -0x4e(%rdx),%r9 835*5d9d9091SRichard Lowe mov %r9,-0x4e(%rcx) 836*5d9d9091SRichard LoweL(P6Q8): 837*5d9d9091SRichard Lowe mov -0x46(%rdx),%r11 838*5d9d9091SRichard Lowe mov %r11,-0x46(%rcx) 839*5d9d9091SRichard LoweL(P6Q7): 840*5d9d9091SRichard Lowe mov -0x3e(%rdx),%r10 841*5d9d9091SRichard Lowe mov %r10,-0x3e(%rcx) 842*5d9d9091SRichard LoweL(P6Q6): 843*5d9d9091SRichard Lowe mov -0x36(%rdx),%r9 844*5d9d9091SRichard Lowe mov %r9,-0x36(%rcx) 845*5d9d9091SRichard LoweL(P6Q5): 846*5d9d9091SRichard Lowe mov -0x2e(%rdx),%r11 847*5d9d9091SRichard Lowe mov %r11,-0x2e(%rcx) 848*5d9d9091SRichard LoweL(P6Q4): 849*5d9d9091SRichard Lowe mov -0x26(%rdx),%r10 850*5d9d9091SRichard Lowe mov %r10,-0x26(%rcx) 851*5d9d9091SRichard LoweL(P6Q3): 852*5d9d9091SRichard Lowe mov -0x1e(%rdx),%r9 853*5d9d9091SRichard Lowe mov %r9,-0x1e(%rcx) 854*5d9d9091SRichard LoweL(P6Q2): 855*5d9d9091SRichard Lowe mov -0x16(%rdx),%r11 856*5d9d9091SRichard Lowe mov %r11,-0x16(%rcx) 857*5d9d9091SRichard LoweL(P6Q1): 858*5d9d9091SRichard Lowe mov -0xe(%rdx),%r10 859*5d9d9091SRichard Lowe mov %r10,-0xe(%rcx) 860*5d9d9091SRichard Lowe /* 861*5d9d9091SRichard Lowe * These trailing loads/stores have to do all their loads 1st, 862*5d9d9091SRichard Lowe * then do the stores. 863*5d9d9091SRichard Lowe */ 864*5d9d9091SRichard LoweL(P6Q0): 865*5d9d9091SRichard Lowe mov -0x6(%rdx),%r9d 866*5d9d9091SRichard Lowe movzwq -0x2(%rdx),%r10 867*5d9d9091SRichard Lowe mov %r9d,-0x6(%rcx) 868*5d9d9091SRichard Lowe mov %r10w,-0x2(%rcx) 869*5d9d9091SRichard Lowe ret 870*5d9d9091SRichard Lowe 871*5d9d9091SRichard Lowe .balign 16 872*5d9d9091SRichard LoweL(P7QF): 873*5d9d9091SRichard Lowe mov -0x7f(%rdx),%r9 874*5d9d9091SRichard Lowe mov %r9,-0x7f(%rcx) 875*5d9d9091SRichard LoweL(P7QE): 876*5d9d9091SRichard Lowe mov -0x77(%rdx),%r11 877*5d9d9091SRichard Lowe mov %r11,-0x77(%rcx) 878*5d9d9091SRichard LoweL(P7QD): 879*5d9d9091SRichard Lowe mov -0x6f(%rdx),%r10 880*5d9d9091SRichard Lowe mov %r10,-0x6f(%rcx) 881*5d9d9091SRichard LoweL(P7QC): 882*5d9d9091SRichard Lowe mov -0x67(%rdx),%r9 883*5d9d9091SRichard Lowe mov %r9,-0x67(%rcx) 884*5d9d9091SRichard LoweL(P7QB): 885*5d9d9091SRichard Lowe mov -0x5f(%rdx),%r11 886*5d9d9091SRichard Lowe mov %r11,-0x5f(%rcx) 887*5d9d9091SRichard LoweL(P7QA): 888*5d9d9091SRichard Lowe mov -0x57(%rdx),%r10 889*5d9d9091SRichard Lowe mov %r10,-0x57(%rcx) 890*5d9d9091SRichard LoweL(P7Q9): 891*5d9d9091SRichard Lowe mov -0x4f(%rdx),%r9 892*5d9d9091SRichard Lowe mov %r9,-0x4f(%rcx) 893*5d9d9091SRichard LoweL(P7Q8): 894*5d9d9091SRichard Lowe mov -0x47(%rdx),%r11 895*5d9d9091SRichard Lowe mov %r11,-0x47(%rcx) 896*5d9d9091SRichard LoweL(P7Q7): 897*5d9d9091SRichard Lowe mov -0x3f(%rdx),%r10 898*5d9d9091SRichard Lowe mov %r10,-0x3f(%rcx) 899*5d9d9091SRichard LoweL(P7Q6): 900*5d9d9091SRichard Lowe mov -0x37(%rdx),%r9 901*5d9d9091SRichard Lowe mov %r9,-0x37(%rcx) 902*5d9d9091SRichard LoweL(P7Q5): 903*5d9d9091SRichard Lowe mov -0x2f(%rdx),%r11 904*5d9d9091SRichard Lowe mov %r11,-0x2f(%rcx) 905*5d9d9091SRichard LoweL(P7Q4): 906*5d9d9091SRichard Lowe mov -0x27(%rdx),%r10 907*5d9d9091SRichard Lowe mov %r10,-0x27(%rcx) 908*5d9d9091SRichard LoweL(P7Q3): 909*5d9d9091SRichard Lowe mov -0x1f(%rdx),%r9 910*5d9d9091SRichard Lowe mov %r9,-0x1f(%rcx) 911*5d9d9091SRichard LoweL(P7Q2): 912*5d9d9091SRichard Lowe mov -0x17(%rdx),%r11 913*5d9d9091SRichard Lowe mov %r11,-0x17(%rcx) 914*5d9d9091SRichard LoweL(P7Q1): 915*5d9d9091SRichard Lowe mov -0xf(%rdx),%r10 916*5d9d9091SRichard Lowe mov %r10,-0xf(%rcx) 917*5d9d9091SRichard Lowe /* 918*5d9d9091SRichard Lowe * These trailing loads/stores have to do all their loads 1st, 919*5d9d9091SRichard Lowe * then do the stores. 920*5d9d9091SRichard Lowe */ 921*5d9d9091SRichard LoweL(P7Q0): 922*5d9d9091SRichard Lowe mov -0x7(%rdx),%r9d 923*5d9d9091SRichard Lowe movzwq -0x3(%rdx),%r10 924*5d9d9091SRichard Lowe movzbq -0x1(%rdx),%r11 925*5d9d9091SRichard Lowe mov %r9d,-0x7(%rcx) 926*5d9d9091SRichard Lowe mov %r10w,-0x3(%rcx) 927*5d9d9091SRichard Lowe mov %r11b,-0x1(%rcx) 928*5d9d9091SRichard Lowe ret 929*5d9d9091SRichard Lowe 930*5d9d9091SRichard Lowe .balign 16 931*5d9d9091SRichard LoweL(ck_use_sse2): 932*5d9d9091SRichard Lowe /* 933*5d9d9091SRichard Lowe * Align dest to 16 byte boundary. 934*5d9d9091SRichard Lowe */ 935*5d9d9091SRichard Lowe test $0xf,%rcx 936*5d9d9091SRichard Lowe jnz L(ShrtAlignNew) 937*5d9d9091SRichard Lowe 938*5d9d9091SRichard LoweL(now_qw_aligned): 939*5d9d9091SRichard Lowe cmpl $NO_SSE,.memops_method(%rip) 940*5d9d9091SRichard Lowe je L(Loop8byte_pre) 941*5d9d9091SRichard Lowe 942*5d9d9091SRichard Lowe /* 943*5d9d9091SRichard Lowe * The fall-through path is to do SSE2 16-byte load/stores 944*5d9d9091SRichard Lowe */ 945*5d9d9091SRichard Lowe 946*5d9d9091SRichard Lowe /* 947*5d9d9091SRichard Lowe * If current move size is larger than half of the highest level cache 948*5d9d9091SRichard Lowe * size, then do non-temporal moves. 949*5d9d9091SRichard Lowe */ 950*5d9d9091SRichard Lowe mov .largest_level_cache_size(%rip),%r9d 951*5d9d9091SRichard Lowe shr %r9 # take half of it 952*5d9d9091SRichard Lowe cmp %r9,%r8 953*5d9d9091SRichard Lowe jg L(sse2_nt_move) 954*5d9d9091SRichard Lowe 955*5d9d9091SRichard Lowe /* 956*5d9d9091SRichard Lowe * If both the source and dest are aligned, then use the both aligned 957*5d9d9091SRichard Lowe * logic. Well aligned data should reap the rewards. 958*5d9d9091SRichard Lowe */ 959*5d9d9091SRichard Lowe test $0xf,%rdx 960*5d9d9091SRichard Lowe jz L(pre_both_aligned) 961*5d9d9091SRichard Lowe 962*5d9d9091SRichard Lowe lea L(SSE_src)(%rip),%r10 # SSE2 (default) 963*5d9d9091SRichard Lowe testl $USE_SSSE3,.memops_method(%rip) 964*5d9d9091SRichard Lowe jz 1f 965*5d9d9091SRichard Lowe lea L(SSSE3_src)(%rip),%r10 # SSSE3 966*5d9d9091SRichard Lowe 967*5d9d9091SRichard Lowe1: 968*5d9d9091SRichard Lowe /* 969*5d9d9091SRichard Lowe * if the src is not 16 byte aligned... 970*5d9d9091SRichard Lowe */ 971*5d9d9091SRichard Lowe mov %rdx,%r11 972*5d9d9091SRichard Lowe and $0xf,%r11 973*5d9d9091SRichard Lowe movdqu (%rdx),%xmm0 974*5d9d9091SRichard Lowe movdqa %xmm0,(%rcx) 975*5d9d9091SRichard Lowe add $0x10,%rdx 976*5d9d9091SRichard Lowe sub %r11,%rdx 977*5d9d9091SRichard Lowe add $0x10,%rcx 978*5d9d9091SRichard Lowe sub $0x10,%r8 979*5d9d9091SRichard Lowe movdqa (%rdx),%xmm1 980*5d9d9091SRichard Lowe 981*5d9d9091SRichard Lowe movslq (%r10,%r11,4),%r9 982*5d9d9091SRichard Lowe lea (%r9,%r10,1),%r10 983*5d9d9091SRichard Lowe jmpq *%r10 984*5d9d9091SRichard Lowe 985*5d9d9091SRichard Lowe .balign 16 986*5d9d9091SRichard LoweL(SSSE3_src): .int L(pre_both_aligned)-L(SSSE3_src) 987*5d9d9091SRichard Lowe .int L(mov3dqa1) -L(SSSE3_src) 988*5d9d9091SRichard Lowe .int L(mov3dqa2) -L(SSSE3_src) 989*5d9d9091SRichard Lowe .int L(mov3dqa3) -L(SSSE3_src) 990*5d9d9091SRichard Lowe .int L(mov3dqa4) -L(SSSE3_src) 991*5d9d9091SRichard Lowe .int L(mov3dqa5) -L(SSSE3_src) 992*5d9d9091SRichard Lowe .int L(mov3dqa6) -L(SSSE3_src) 993*5d9d9091SRichard Lowe .int L(mov3dqa7) -L(SSSE3_src) 994*5d9d9091SRichard Lowe .int L(movdqa8) -L(SSSE3_src) 995*5d9d9091SRichard Lowe .int L(mov3dqa9) -L(SSSE3_src) 996*5d9d9091SRichard Lowe .int L(mov3dqa10)-L(SSSE3_src) 997*5d9d9091SRichard Lowe .int L(mov3dqa11)-L(SSSE3_src) 998*5d9d9091SRichard Lowe .int L(mov3dqa12)-L(SSSE3_src) 999*5d9d9091SRichard Lowe .int L(mov3dqa13)-L(SSSE3_src) 1000*5d9d9091SRichard Lowe .int L(mov3dqa14)-L(SSSE3_src) 1001*5d9d9091SRichard Lowe .int L(mov3dqa15)-L(SSSE3_src) 1002*5d9d9091SRichard LoweL(SSE_src): .int L(pre_both_aligned)-L(SSE_src) 1003*5d9d9091SRichard Lowe .int L(movdqa1) -L(SSE_src) 1004*5d9d9091SRichard Lowe .int L(movdqa2) -L(SSE_src) 1005*5d9d9091SRichard Lowe .int L(movdqa3) -L(SSE_src) 1006*5d9d9091SRichard Lowe .int L(movdqa4) -L(SSE_src) 1007*5d9d9091SRichard Lowe .int L(movdqa5) -L(SSE_src) 1008*5d9d9091SRichard Lowe .int L(movdqa6) -L(SSE_src) 1009*5d9d9091SRichard Lowe .int L(movdqa7) -L(SSE_src) 1010*5d9d9091SRichard Lowe .int L(movdqa8) -L(SSE_src) 1011*5d9d9091SRichard Lowe .int L(movdqa9) -L(SSE_src) 1012*5d9d9091SRichard Lowe .int L(movdqa10)-L(SSE_src) 1013*5d9d9091SRichard Lowe .int L(movdqa11)-L(SSE_src) 1014*5d9d9091SRichard Lowe .int L(movdqa12)-L(SSE_src) 1015*5d9d9091SRichard Lowe .int L(movdqa13)-L(SSE_src) 1016*5d9d9091SRichard Lowe .int L(movdqa14)-L(SSE_src) 1017*5d9d9091SRichard Lowe .int L(movdqa15)-L(SSE_src) 1018*5d9d9091SRichard Lowe 1019*5d9d9091SRichard Lowe .balign 16 1020*5d9d9091SRichard LoweL(movdqa1): 1021*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1022*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 # load the upper source buffer 1023*5d9d9091SRichard Lowe lea 0x20(%rdx),%rdx 1024*5d9d9091SRichard Lowe lea -0x20(%r8),%r8 1025*5d9d9091SRichard Lowe 1026*5d9d9091SRichard Lowe psrldq $0x1,%xmm1 # shift right prev buffer (saved from last iteration) 1027*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration 1028*5d9d9091SRichard Lowe pslldq $0xf,%xmm3 # shift the current buffer left (shift in zeros) 1029*5d9d9091SRichard Lowe por %xmm1,%xmm3 # OR them together 1030*5d9d9091SRichard Lowe cmp $0x20,%r8 1031*5d9d9091SRichard Lowe 1032*5d9d9091SRichard Lowe psrldq $0x1,%xmm2 # shift right prev buffer (saved from last iteration) 1033*5d9d9091SRichard Lowe movdqa %xmm0,%xmm1 # store off xmm reg for use next iteration 1034*5d9d9091SRichard Lowe pslldq $0xf,%xmm0 # shift the current buffer left (shift in zeros) 1035*5d9d9091SRichard Lowe por %xmm2,%xmm0 # OR them together 1036*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) # store it 1037*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) # store it 1038*5d9d9091SRichard Lowe lea 0x20(%rcx),%rcx 1039*5d9d9091SRichard Lowe 1040*5d9d9091SRichard Lowe jge L(movdqa1) 1041*5d9d9091SRichard Lowe jmp L(movdqa_epi) 1042*5d9d9091SRichard Lowe 1043*5d9d9091SRichard Lowe .balign 16 1044*5d9d9091SRichard LoweL(movdqa2): 1045*5d9d9091SRichard Lowe sub $0x20,%r8 1046*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 1047*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 1048*5d9d9091SRichard Lowe add $0x20,%rdx 1049*5d9d9091SRichard Lowe 1050*5d9d9091SRichard Lowe psrldq $0x2,%xmm1 1051*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 1052*5d9d9091SRichard Lowe pslldq $0xe,%xmm3 1053*5d9d9091SRichard Lowe por %xmm1,%xmm3 1054*5d9d9091SRichard Lowe 1055*5d9d9091SRichard Lowe psrldq $0x2,%xmm2 1056*5d9d9091SRichard Lowe movdqa %xmm0,%xmm1 1057*5d9d9091SRichard Lowe pslldq $0xe,%xmm0 1058*5d9d9091SRichard Lowe por %xmm2,%xmm0 1059*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 1060*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 1061*5d9d9091SRichard Lowe 1062*5d9d9091SRichard Lowe add $0x20,%rcx 1063*5d9d9091SRichard Lowe cmp $0x20,%r8 1064*5d9d9091SRichard Lowe jge L(movdqa2) 1065*5d9d9091SRichard Lowe jmp L(movdqa_epi) 1066*5d9d9091SRichard Lowe 1067*5d9d9091SRichard Lowe .balign 16 1068*5d9d9091SRichard LoweL(movdqa3): 1069*5d9d9091SRichard Lowe sub $0x20,%r8 1070*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 1071*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 1072*5d9d9091SRichard Lowe add $0x20,%rdx 1073*5d9d9091SRichard Lowe 1074*5d9d9091SRichard Lowe psrldq $0x3,%xmm1 1075*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 1076*5d9d9091SRichard Lowe pslldq $0xd,%xmm3 1077*5d9d9091SRichard Lowe por %xmm1,%xmm3 1078*5d9d9091SRichard Lowe 1079*5d9d9091SRichard Lowe psrldq $0x3,%xmm2 1080*5d9d9091SRichard Lowe movdqa %xmm0,%xmm1 1081*5d9d9091SRichard Lowe pslldq $0xd,%xmm0 1082*5d9d9091SRichard Lowe por %xmm2,%xmm0 1083*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 1084*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 1085*5d9d9091SRichard Lowe 1086*5d9d9091SRichard Lowe add $0x20,%rcx 1087*5d9d9091SRichard Lowe cmp $0x20,%r8 1088*5d9d9091SRichard Lowe jge L(movdqa3) 1089*5d9d9091SRichard Lowe jmp L(movdqa_epi) 1090*5d9d9091SRichard Lowe 1091*5d9d9091SRichard Lowe .balign 16 1092*5d9d9091SRichard LoweL(movdqa4): 1093*5d9d9091SRichard Lowe sub $0x20,%r8 1094*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 1095*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 1096*5d9d9091SRichard Lowe add $0x20,%rdx 1097*5d9d9091SRichard Lowe 1098*5d9d9091SRichard Lowe psrldq $0x4,%xmm1 1099*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 1100*5d9d9091SRichard Lowe pslldq $0xc,%xmm3 1101*5d9d9091SRichard Lowe por %xmm1,%xmm3 1102*5d9d9091SRichard Lowe 1103*5d9d9091SRichard Lowe psrldq $0x4,%xmm2 1104*5d9d9091SRichard Lowe movdqa %xmm0,%xmm1 1105*5d9d9091SRichard Lowe pslldq $0xc,%xmm0 1106*5d9d9091SRichard Lowe por %xmm2,%xmm0 1107*5d9d9091SRichard Lowe 1108*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 1109*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 1110*5d9d9091SRichard Lowe 1111*5d9d9091SRichard Lowe add $0x20,%rcx 1112*5d9d9091SRichard Lowe cmp $0x20,%r8 1113*5d9d9091SRichard Lowe jge L(movdqa4) 1114*5d9d9091SRichard Lowe jmp L(movdqa_epi) 1115*5d9d9091SRichard Lowe 1116*5d9d9091SRichard Lowe .balign 16 1117*5d9d9091SRichard LoweL(movdqa5): 1118*5d9d9091SRichard Lowe sub $0x20,%r8 1119*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 1120*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 1121*5d9d9091SRichard Lowe add $0x20,%rdx 1122*5d9d9091SRichard Lowe 1123*5d9d9091SRichard Lowe psrldq $0x5,%xmm1 1124*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 1125*5d9d9091SRichard Lowe pslldq $0xb,%xmm3 1126*5d9d9091SRichard Lowe por %xmm1,%xmm3 1127*5d9d9091SRichard Lowe 1128*5d9d9091SRichard Lowe psrldq $0x5,%xmm2 1129*5d9d9091SRichard Lowe movdqa %xmm0,%xmm1 1130*5d9d9091SRichard Lowe pslldq $0xb,%xmm0 1131*5d9d9091SRichard Lowe por %xmm2,%xmm0 1132*5d9d9091SRichard Lowe 1133*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 1134*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 1135*5d9d9091SRichard Lowe 1136*5d9d9091SRichard Lowe add $0x20,%rcx 1137*5d9d9091SRichard Lowe cmp $0x20,%r8 1138*5d9d9091SRichard Lowe jge L(movdqa5) 1139*5d9d9091SRichard Lowe jmp L(movdqa_epi) 1140*5d9d9091SRichard Lowe 1141*5d9d9091SRichard Lowe .balign 16 1142*5d9d9091SRichard LoweL(movdqa6): 1143*5d9d9091SRichard Lowe sub $0x20,%r8 1144*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 1145*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 1146*5d9d9091SRichard Lowe add $0x20,%rdx 1147*5d9d9091SRichard Lowe 1148*5d9d9091SRichard Lowe psrldq $0x6,%xmm1 1149*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 1150*5d9d9091SRichard Lowe pslldq $0xa,%xmm3 1151*5d9d9091SRichard Lowe por %xmm1,%xmm3 1152*5d9d9091SRichard Lowe 1153*5d9d9091SRichard Lowe psrldq $0x6,%xmm2 1154*5d9d9091SRichard Lowe movdqa %xmm0,%xmm1 1155*5d9d9091SRichard Lowe pslldq $0xa,%xmm0 1156*5d9d9091SRichard Lowe por %xmm2,%xmm0 1157*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 1158*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 1159*5d9d9091SRichard Lowe 1160*5d9d9091SRichard Lowe add $0x20,%rcx 1161*5d9d9091SRichard Lowe cmp $0x20,%r8 1162*5d9d9091SRichard Lowe jge L(movdqa6) 1163*5d9d9091SRichard Lowe jmp L(movdqa_epi) 1164*5d9d9091SRichard Lowe 1165*5d9d9091SRichard Lowe .balign 16 1166*5d9d9091SRichard LoweL(movdqa7): 1167*5d9d9091SRichard Lowe sub $0x20,%r8 1168*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 1169*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 1170*5d9d9091SRichard Lowe add $0x20,%rdx 1171*5d9d9091SRichard Lowe 1172*5d9d9091SRichard Lowe psrldq $0x7,%xmm1 1173*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 1174*5d9d9091SRichard Lowe pslldq $0x9,%xmm3 1175*5d9d9091SRichard Lowe por %xmm1,%xmm3 1176*5d9d9091SRichard Lowe 1177*5d9d9091SRichard Lowe psrldq $0x7,%xmm2 1178*5d9d9091SRichard Lowe movdqa %xmm0,%xmm1 1179*5d9d9091SRichard Lowe pslldq $0x9,%xmm0 1180*5d9d9091SRichard Lowe por %xmm2,%xmm0 1181*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 1182*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 1183*5d9d9091SRichard Lowe 1184*5d9d9091SRichard Lowe add $0x20,%rcx 1185*5d9d9091SRichard Lowe cmp $0x20,%r8 1186*5d9d9091SRichard Lowe jge L(movdqa7) 1187*5d9d9091SRichard Lowe jmp L(movdqa_epi) 1188*5d9d9091SRichard Lowe 1189*5d9d9091SRichard Lowe .balign 16 1190*5d9d9091SRichard LoweL(movdqa8): 1191*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 1192*5d9d9091SRichard Lowe sub $0x30,%r8 1193*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 1194*5d9d9091SRichard Lowe movdqa 0x30(%rdx),%xmm5 1195*5d9d9091SRichard Lowe lea 0x30(%rdx),%rdx 1196*5d9d9091SRichard Lowe 1197*5d9d9091SRichard Lowe shufpd $0x1,%xmm3,%xmm1 1198*5d9d9091SRichard Lowe movdqa %xmm1,(%rcx) 1199*5d9d9091SRichard Lowe 1200*5d9d9091SRichard Lowe cmp $0x30,%r8 1201*5d9d9091SRichard Lowe 1202*5d9d9091SRichard Lowe shufpd $0x1,%xmm0,%xmm3 1203*5d9d9091SRichard Lowe movdqa %xmm3,0x10(%rcx) 1204*5d9d9091SRichard Lowe 1205*5d9d9091SRichard Lowe movdqa %xmm5,%xmm1 1206*5d9d9091SRichard Lowe shufpd $0x1,%xmm5,%xmm0 1207*5d9d9091SRichard Lowe movdqa %xmm0,0x20(%rcx) 1208*5d9d9091SRichard Lowe 1209*5d9d9091SRichard Lowe lea 0x30(%rcx),%rcx 1210*5d9d9091SRichard Lowe 1211*5d9d9091SRichard Lowe jge L(movdqa8) 1212*5d9d9091SRichard Lowe jmp L(movdqa_epi) 1213*5d9d9091SRichard Lowe 1214*5d9d9091SRichard Lowe .balign 16 1215*5d9d9091SRichard LoweL(movdqa9): 1216*5d9d9091SRichard Lowe sub $0x20,%r8 1217*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 1218*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 1219*5d9d9091SRichard Lowe add $0x20,%rdx 1220*5d9d9091SRichard Lowe 1221*5d9d9091SRichard Lowe psrldq $0x9,%xmm1 1222*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 1223*5d9d9091SRichard Lowe pslldq $0x7,%xmm3 1224*5d9d9091SRichard Lowe por %xmm1,%xmm3 1225*5d9d9091SRichard Lowe 1226*5d9d9091SRichard Lowe psrldq $0x9,%xmm2 1227*5d9d9091SRichard Lowe movdqa %xmm0,%xmm1 1228*5d9d9091SRichard Lowe pslldq $0x7,%xmm0 1229*5d9d9091SRichard Lowe por %xmm2,%xmm0 1230*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 1231*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 1232*5d9d9091SRichard Lowe 1233*5d9d9091SRichard Lowe add $0x20,%rcx 1234*5d9d9091SRichard Lowe cmp $0x20,%r8 1235*5d9d9091SRichard Lowe jge L(movdqa9) 1236*5d9d9091SRichard Lowe jmp L(movdqa_epi) 1237*5d9d9091SRichard Lowe 1238*5d9d9091SRichard Lowe .balign 16 1239*5d9d9091SRichard LoweL(movdqa10): 1240*5d9d9091SRichard Lowe sub $0x20,%r8 1241*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 1242*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 1243*5d9d9091SRichard Lowe add $0x20,%rdx 1244*5d9d9091SRichard Lowe 1245*5d9d9091SRichard Lowe psrldq $0xa,%xmm1 1246*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 1247*5d9d9091SRichard Lowe pslldq $0x6,%xmm3 1248*5d9d9091SRichard Lowe por %xmm1,%xmm3 1249*5d9d9091SRichard Lowe 1250*5d9d9091SRichard Lowe psrldq $0xa,%xmm2 1251*5d9d9091SRichard Lowe movdqa %xmm0,%xmm1 1252*5d9d9091SRichard Lowe pslldq $0x6,%xmm0 1253*5d9d9091SRichard Lowe por %xmm2,%xmm0 1254*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 1255*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 1256*5d9d9091SRichard Lowe 1257*5d9d9091SRichard Lowe add $0x20,%rcx 1258*5d9d9091SRichard Lowe cmp $0x20,%r8 1259*5d9d9091SRichard Lowe jge L(movdqa10) 1260*5d9d9091SRichard Lowe jmp L(movdqa_epi) 1261*5d9d9091SRichard Lowe 1262*5d9d9091SRichard Lowe .balign 16 1263*5d9d9091SRichard LoweL(movdqa11): 1264*5d9d9091SRichard Lowe sub $0x20,%r8 1265*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 1266*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 1267*5d9d9091SRichard Lowe add $0x20,%rdx 1268*5d9d9091SRichard Lowe 1269*5d9d9091SRichard Lowe psrldq $0xb,%xmm1 1270*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 1271*5d9d9091SRichard Lowe pslldq $0x5,%xmm3 1272*5d9d9091SRichard Lowe por %xmm1,%xmm3 1273*5d9d9091SRichard Lowe 1274*5d9d9091SRichard Lowe psrldq $0xb,%xmm2 1275*5d9d9091SRichard Lowe movdqa %xmm0,%xmm1 1276*5d9d9091SRichard Lowe pslldq $0x5,%xmm0 1277*5d9d9091SRichard Lowe por %xmm2,%xmm0 1278*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 1279*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 1280*5d9d9091SRichard Lowe 1281*5d9d9091SRichard Lowe add $0x20,%rcx 1282*5d9d9091SRichard Lowe cmp $0x20,%r8 1283*5d9d9091SRichard Lowe jge L(movdqa11) 1284*5d9d9091SRichard Lowe jmp L(movdqa_epi) 1285*5d9d9091SRichard Lowe 1286*5d9d9091SRichard Lowe .balign 16 1287*5d9d9091SRichard LoweL(movdqa12): 1288*5d9d9091SRichard Lowe sub $0x20,%r8 1289*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 1290*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 1291*5d9d9091SRichard Lowe add $0x20,%rdx 1292*5d9d9091SRichard Lowe 1293*5d9d9091SRichard Lowe psrldq $0xc,%xmm1 1294*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 1295*5d9d9091SRichard Lowe pslldq $0x4,%xmm3 1296*5d9d9091SRichard Lowe por %xmm1,%xmm3 1297*5d9d9091SRichard Lowe 1298*5d9d9091SRichard Lowe psrldq $0xc,%xmm2 1299*5d9d9091SRichard Lowe movdqa %xmm0,%xmm1 1300*5d9d9091SRichard Lowe pslldq $0x4,%xmm0 1301*5d9d9091SRichard Lowe por %xmm2,%xmm0 1302*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 1303*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 1304*5d9d9091SRichard Lowe 1305*5d9d9091SRichard Lowe add $0x20,%rcx 1306*5d9d9091SRichard Lowe cmp $0x20,%r8 1307*5d9d9091SRichard Lowe jge L(movdqa12) 1308*5d9d9091SRichard Lowe jmp L(movdqa_epi) 1309*5d9d9091SRichard Lowe 1310*5d9d9091SRichard Lowe .balign 16 1311*5d9d9091SRichard LoweL(movdqa13): 1312*5d9d9091SRichard Lowe sub $0x20,%r8 1313*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 1314*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 1315*5d9d9091SRichard Lowe add $0x20,%rdx 1316*5d9d9091SRichard Lowe 1317*5d9d9091SRichard Lowe psrldq $0xd,%xmm1 1318*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 1319*5d9d9091SRichard Lowe pslldq $0x3,%xmm3 1320*5d9d9091SRichard Lowe por %xmm1,%xmm3 1321*5d9d9091SRichard Lowe 1322*5d9d9091SRichard Lowe psrldq $0xd,%xmm2 1323*5d9d9091SRichard Lowe movdqa %xmm0,%xmm1 1324*5d9d9091SRichard Lowe pslldq $0x3,%xmm0 1325*5d9d9091SRichard Lowe por %xmm2,%xmm0 1326*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 1327*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 1328*5d9d9091SRichard Lowe 1329*5d9d9091SRichard Lowe add $0x20,%rcx 1330*5d9d9091SRichard Lowe cmp $0x20,%r8 1331*5d9d9091SRichard Lowe jge L(movdqa13) 1332*5d9d9091SRichard Lowe jmp L(movdqa_epi) 1333*5d9d9091SRichard Lowe 1334*5d9d9091SRichard Lowe .balign 16 1335*5d9d9091SRichard LoweL(movdqa14): 1336*5d9d9091SRichard Lowe sub $0x20,%r8 1337*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 1338*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 1339*5d9d9091SRichard Lowe add $0x20,%rdx 1340*5d9d9091SRichard Lowe 1341*5d9d9091SRichard Lowe psrldq $0xe,%xmm1 1342*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 1343*5d9d9091SRichard Lowe pslldq $0x2,%xmm3 1344*5d9d9091SRichard Lowe por %xmm1,%xmm3 1345*5d9d9091SRichard Lowe 1346*5d9d9091SRichard Lowe psrldq $0xe,%xmm2 1347*5d9d9091SRichard Lowe movdqa %xmm0,%xmm1 1348*5d9d9091SRichard Lowe pslldq $0x2,%xmm0 1349*5d9d9091SRichard Lowe por %xmm2,%xmm0 1350*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 1351*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 1352*5d9d9091SRichard Lowe 1353*5d9d9091SRichard Lowe add $0x20,%rcx 1354*5d9d9091SRichard Lowe cmp $0x20,%r8 1355*5d9d9091SRichard Lowe jge L(movdqa14) 1356*5d9d9091SRichard Lowe jmp L(movdqa_epi) 1357*5d9d9091SRichard Lowe 1358*5d9d9091SRichard Lowe .balign 16 1359*5d9d9091SRichard LoweL(movdqa15): 1360*5d9d9091SRichard Lowe sub $0x20,%r8 1361*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 1362*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 1363*5d9d9091SRichard Lowe add $0x20,%rdx 1364*5d9d9091SRichard Lowe 1365*5d9d9091SRichard Lowe psrldq $0xf,%xmm1 1366*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 1367*5d9d9091SRichard Lowe pslldq $0x1,%xmm3 1368*5d9d9091SRichard Lowe por %xmm1,%xmm3 1369*5d9d9091SRichard Lowe 1370*5d9d9091SRichard Lowe psrldq $0xf,%xmm2 1371*5d9d9091SRichard Lowe movdqa %xmm0,%xmm1 1372*5d9d9091SRichard Lowe pslldq $0x1,%xmm0 1373*5d9d9091SRichard Lowe por %xmm2,%xmm0 1374*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 1375*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 1376*5d9d9091SRichard Lowe 1377*5d9d9091SRichard Lowe add $0x20,%rcx 1378*5d9d9091SRichard Lowe cmp $0x20,%r8 1379*5d9d9091SRichard Lowe jge L(movdqa15) 1380*5d9d9091SRichard Lowe #jmp L(movdqa_epi) 1381*5d9d9091SRichard Lowe 1382*5d9d9091SRichard Lowe .balign 16 1383*5d9d9091SRichard LoweL(movdqa_epi): 1384*5d9d9091SRichard Lowe lea L(fwdPxQx)(%rip),%r10 1385*5d9d9091SRichard Lowe add %r11,%rdx # bump rdx to the right addr (it lagged behind in the above loop) 1386*5d9d9091SRichard Lowe add %r8,%rcx 1387*5d9d9091SRichard Lowe add %r8,%rdx 1388*5d9d9091SRichard Lowe 1389*5d9d9091SRichard Lowe movslq (%r10,%r8,4),%r9 1390*5d9d9091SRichard Lowe lea (%r9,%r10,1),%r10 1391*5d9d9091SRichard Lowe jmpq *%r10 1392*5d9d9091SRichard Lowe 1393*5d9d9091SRichard Lowe .balign 16 1394*5d9d9091SRichard LoweL(mov3dqa1): 1395*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1396*5d9d9091SRichard Lowe sub $0x30,%r8 1397*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 # load the upper source buffer 1398*5d9d9091SRichard Lowe movdqa 0x30(%rdx),%xmm5 # load the upper source buffer 1399*5d9d9091SRichard Lowe lea 0x30(%rdx),%rdx 1400*5d9d9091SRichard Lowe cmp $0x30,%r8 1401*5d9d9091SRichard Lowe 1402*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration 1403*5d9d9091SRichard Lowe #palignr $0x1,%xmm1,%xmm3 1404*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1405*5d9d9091SRichard Lowe .byte 0xd9,0x01 1406*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) # store it 1407*5d9d9091SRichard Lowe 1408*5d9d9091SRichard Lowe movdqa %xmm0,%xmm4 # store off xmm reg for use next iteration 1409*5d9d9091SRichard Lowe #palignr $0x1,%xmm2,%xmm0 1410*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1411*5d9d9091SRichard Lowe .byte 0xc2,0x01 1412*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) # store it 1413*5d9d9091SRichard Lowe 1414*5d9d9091SRichard Lowe movdqa %xmm5,%xmm1 # store off xmm reg for use next iteration 1415*5d9d9091SRichard Lowe #palignr $0x1,%xmm4,%xmm5 1416*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1417*5d9d9091SRichard Lowe .byte 0xec,0x01 1418*5d9d9091SRichard Lowe movdqa %xmm5,0x20(%rcx) # store it 1419*5d9d9091SRichard Lowe 1420*5d9d9091SRichard Lowe lea 0x30(%rcx),%rcx 1421*5d9d9091SRichard Lowe jge L(mov3dqa1) 1422*5d9d9091SRichard Lowe 1423*5d9d9091SRichard Lowe cmp $0x10,%r8 1424*5d9d9091SRichard Lowe jl L(movdqa_epi) 1425*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1426*5d9d9091SRichard Lowe sub $0x10,%r8 1427*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 1428*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 # save for use next concat 1429*5d9d9091SRichard Lowe #palignr $0x1,%xmm1,%xmm3 1430*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1431*5d9d9091SRichard Lowe .byte 0xd9,0x01 1432*5d9d9091SRichard Lowe 1433*5d9d9091SRichard Lowe cmp $0x10,%r8 1434*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) # store it 1435*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 1436*5d9d9091SRichard Lowe jl L(movdqa_epi) 1437*5d9d9091SRichard Lowe 1438*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1439*5d9d9091SRichard Lowe sub $0x10,%r8 1440*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 1441*5d9d9091SRichard Lowe #palignr $0x1,%xmm2,%xmm0 1442*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1443*5d9d9091SRichard Lowe .byte 0xc2,0x01 1444*5d9d9091SRichard Lowe movdqa %xmm0,(%rcx) # store it 1445*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 1446*5d9d9091SRichard Lowe jmp L(movdqa_epi) 1447*5d9d9091SRichard Lowe 1448*5d9d9091SRichard Lowe .balign 16 1449*5d9d9091SRichard LoweL(mov3dqa2): 1450*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 1451*5d9d9091SRichard Lowe sub $0x30,%r8 1452*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 1453*5d9d9091SRichard Lowe movdqa 0x30(%rdx),%xmm5 1454*5d9d9091SRichard Lowe lea 0x30(%rdx),%rdx 1455*5d9d9091SRichard Lowe cmp $0x30,%r8 1456*5d9d9091SRichard Lowe 1457*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 1458*5d9d9091SRichard Lowe #palignr $0x2,%xmm1,%xmm3 1459*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1460*5d9d9091SRichard Lowe .byte 0xd9,0x02 1461*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 1462*5d9d9091SRichard Lowe 1463*5d9d9091SRichard Lowe movdqa %xmm0,%xmm4 1464*5d9d9091SRichard Lowe #palignr $0x2,%xmm2,%xmm0 1465*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1466*5d9d9091SRichard Lowe .byte 0xc2,0x02 1467*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 1468*5d9d9091SRichard Lowe 1469*5d9d9091SRichard Lowe movdqa %xmm5,%xmm1 1470*5d9d9091SRichard Lowe #palignr $0x2,%xmm4,%xmm5 1471*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1472*5d9d9091SRichard Lowe .byte 0xec,0x02 1473*5d9d9091SRichard Lowe movdqa %xmm5,0x20(%rcx) 1474*5d9d9091SRichard Lowe 1475*5d9d9091SRichard Lowe lea 0x30(%rcx),%rcx 1476*5d9d9091SRichard Lowe jge L(mov3dqa2) 1477*5d9d9091SRichard Lowe 1478*5d9d9091SRichard Lowe cmp $0x10,%r8 1479*5d9d9091SRichard Lowe jl L(movdqa_epi) 1480*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1481*5d9d9091SRichard Lowe sub $0x10,%r8 1482*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 1483*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 # save for use next concat 1484*5d9d9091SRichard Lowe #palignr $0x2,%xmm1,%xmm3 1485*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1486*5d9d9091SRichard Lowe .byte 0xd9,0x02 1487*5d9d9091SRichard Lowe 1488*5d9d9091SRichard Lowe cmp $0x10,%r8 1489*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) # store it 1490*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 1491*5d9d9091SRichard Lowe jl L(movdqa_epi) 1492*5d9d9091SRichard Lowe 1493*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1494*5d9d9091SRichard Lowe sub $0x10,%r8 1495*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 1496*5d9d9091SRichard Lowe #palignr $0x2,%xmm2,%xmm0 1497*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1498*5d9d9091SRichard Lowe .byte 0xc2,0x02 1499*5d9d9091SRichard Lowe movdqa %xmm0,(%rcx) # store it 1500*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 1501*5d9d9091SRichard Lowe jmp L(movdqa_epi) 1502*5d9d9091SRichard Lowe 1503*5d9d9091SRichard Lowe .balign 16 1504*5d9d9091SRichard LoweL(mov3dqa3): 1505*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 1506*5d9d9091SRichard Lowe sub $0x30,%r8 1507*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 1508*5d9d9091SRichard Lowe movdqa 0x30(%rdx),%xmm5 1509*5d9d9091SRichard Lowe lea 0x30(%rdx),%rdx 1510*5d9d9091SRichard Lowe cmp $0x30,%r8 1511*5d9d9091SRichard Lowe 1512*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 1513*5d9d9091SRichard Lowe #palignr $0x3,%xmm1,%xmm3 1514*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1515*5d9d9091SRichard Lowe .byte 0xd9,0x03 1516*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 1517*5d9d9091SRichard Lowe 1518*5d9d9091SRichard Lowe movdqa %xmm0,%xmm4 1519*5d9d9091SRichard Lowe #palignr $0x3,%xmm2,%xmm0 1520*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1521*5d9d9091SRichard Lowe .byte 0xc2,0x03 1522*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 1523*5d9d9091SRichard Lowe 1524*5d9d9091SRichard Lowe movdqa %xmm5,%xmm1 1525*5d9d9091SRichard Lowe #palignr $0x3,%xmm4,%xmm5 1526*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1527*5d9d9091SRichard Lowe .byte 0xec,0x03 1528*5d9d9091SRichard Lowe movdqa %xmm5,0x20(%rcx) 1529*5d9d9091SRichard Lowe 1530*5d9d9091SRichard Lowe lea 0x30(%rcx),%rcx 1531*5d9d9091SRichard Lowe jge L(mov3dqa3) 1532*5d9d9091SRichard Lowe 1533*5d9d9091SRichard Lowe cmp $0x10,%r8 1534*5d9d9091SRichard Lowe jl L(movdqa_epi) 1535*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1536*5d9d9091SRichard Lowe sub $0x10,%r8 1537*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 1538*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 # save for use next concat 1539*5d9d9091SRichard Lowe #palignr $0x3,%xmm1,%xmm3 1540*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1541*5d9d9091SRichard Lowe .byte 0xd9,0x03 1542*5d9d9091SRichard Lowe 1543*5d9d9091SRichard Lowe cmp $0x10,%r8 1544*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) # store it 1545*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 1546*5d9d9091SRichard Lowe jl L(movdqa_epi) 1547*5d9d9091SRichard Lowe 1548*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1549*5d9d9091SRichard Lowe sub $0x10,%r8 1550*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 1551*5d9d9091SRichard Lowe #palignr $0x3,%xmm2,%xmm0 1552*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1553*5d9d9091SRichard Lowe .byte 0xc2,0x03 1554*5d9d9091SRichard Lowe movdqa %xmm0,(%rcx) # store it 1555*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 1556*5d9d9091SRichard Lowe jmp L(movdqa_epi) 1557*5d9d9091SRichard Lowe 1558*5d9d9091SRichard Lowe .balign 16 1559*5d9d9091SRichard LoweL(mov3dqa4): 1560*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 1561*5d9d9091SRichard Lowe sub $0x30,%r8 1562*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 1563*5d9d9091SRichard Lowe movdqa 0x30(%rdx),%xmm5 1564*5d9d9091SRichard Lowe lea 0x30(%rdx),%rdx 1565*5d9d9091SRichard Lowe cmp $0x30,%r8 1566*5d9d9091SRichard Lowe 1567*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 1568*5d9d9091SRichard Lowe #palignr $0x4,%xmm1,%xmm3 1569*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1570*5d9d9091SRichard Lowe .byte 0xd9,0x04 1571*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 1572*5d9d9091SRichard Lowe 1573*5d9d9091SRichard Lowe movdqa %xmm0,%xmm4 1574*5d9d9091SRichard Lowe #palignr $0x4,%xmm2,%xmm0 1575*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1576*5d9d9091SRichard Lowe .byte 0xc2,0x04 1577*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 1578*5d9d9091SRichard Lowe 1579*5d9d9091SRichard Lowe movdqa %xmm5,%xmm1 1580*5d9d9091SRichard Lowe #palignr $0x4,%xmm4,%xmm5 1581*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1582*5d9d9091SRichard Lowe .byte 0xec,0x04 1583*5d9d9091SRichard Lowe movdqa %xmm5,0x20(%rcx) 1584*5d9d9091SRichard Lowe 1585*5d9d9091SRichard Lowe lea 0x30(%rcx),%rcx 1586*5d9d9091SRichard Lowe jge L(mov3dqa4) 1587*5d9d9091SRichard Lowe 1588*5d9d9091SRichard Lowe cmp $0x10,%r8 1589*5d9d9091SRichard Lowe jl L(movdqa_epi) 1590*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1591*5d9d9091SRichard Lowe sub $0x10,%r8 1592*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 1593*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 # save for use next concat 1594*5d9d9091SRichard Lowe #palignr $0x4,%xmm1,%xmm3 1595*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1596*5d9d9091SRichard Lowe .byte 0xd9,0x04 1597*5d9d9091SRichard Lowe 1598*5d9d9091SRichard Lowe cmp $0x10,%r8 1599*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) # store it 1600*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 1601*5d9d9091SRichard Lowe jl L(movdqa_epi) 1602*5d9d9091SRichard Lowe 1603*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1604*5d9d9091SRichard Lowe sub $0x10,%r8 1605*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 1606*5d9d9091SRichard Lowe #palignr $0x4,%xmm2,%xmm0 1607*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1608*5d9d9091SRichard Lowe .byte 0xc2,0x04 1609*5d9d9091SRichard Lowe movdqa %xmm0,(%rcx) # store it 1610*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 1611*5d9d9091SRichard Lowe jmp L(movdqa_epi) 1612*5d9d9091SRichard Lowe 1613*5d9d9091SRichard Lowe .balign 16 1614*5d9d9091SRichard LoweL(mov3dqa5): 1615*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 1616*5d9d9091SRichard Lowe sub $0x30,%r8 1617*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 1618*5d9d9091SRichard Lowe movdqa 0x30(%rdx),%xmm5 1619*5d9d9091SRichard Lowe lea 0x30(%rdx),%rdx 1620*5d9d9091SRichard Lowe cmp $0x30,%r8 1621*5d9d9091SRichard Lowe 1622*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 1623*5d9d9091SRichard Lowe #palignr $0x5,%xmm1,%xmm3 1624*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1625*5d9d9091SRichard Lowe .byte 0xd9,0x05 1626*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 1627*5d9d9091SRichard Lowe 1628*5d9d9091SRichard Lowe movdqa %xmm0,%xmm4 1629*5d9d9091SRichard Lowe #palignr $0x5,%xmm2,%xmm0 1630*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1631*5d9d9091SRichard Lowe .byte 0xc2,0x05 1632*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 1633*5d9d9091SRichard Lowe 1634*5d9d9091SRichard Lowe movdqa %xmm5,%xmm1 1635*5d9d9091SRichard Lowe #palignr $0x5,%xmm4,%xmm5 1636*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1637*5d9d9091SRichard Lowe .byte 0xec,0x05 1638*5d9d9091SRichard Lowe movdqa %xmm5,0x20(%rcx) 1639*5d9d9091SRichard Lowe 1640*5d9d9091SRichard Lowe lea 0x30(%rcx),%rcx 1641*5d9d9091SRichard Lowe jge L(mov3dqa5) 1642*5d9d9091SRichard Lowe 1643*5d9d9091SRichard Lowe cmp $0x10,%r8 1644*5d9d9091SRichard Lowe jl L(movdqa_epi) 1645*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1646*5d9d9091SRichard Lowe sub $0x10,%r8 1647*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 1648*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 # save for use next concat 1649*5d9d9091SRichard Lowe #palignr $0x5,%xmm1,%xmm3 1650*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1651*5d9d9091SRichard Lowe .byte 0xd9,0x05 1652*5d9d9091SRichard Lowe 1653*5d9d9091SRichard Lowe cmp $0x10,%r8 1654*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) # store it 1655*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 1656*5d9d9091SRichard Lowe jl L(movdqa_epi) 1657*5d9d9091SRichard Lowe 1658*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1659*5d9d9091SRichard Lowe sub $0x10,%r8 1660*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 1661*5d9d9091SRichard Lowe #palignr $0x5,%xmm2,%xmm0 1662*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1663*5d9d9091SRichard Lowe .byte 0xc2,0x05 1664*5d9d9091SRichard Lowe movdqa %xmm0,(%rcx) # store it 1665*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 1666*5d9d9091SRichard Lowe jmp L(movdqa_epi) 1667*5d9d9091SRichard Lowe 1668*5d9d9091SRichard Lowe .balign 16 1669*5d9d9091SRichard LoweL(mov3dqa6): 1670*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 1671*5d9d9091SRichard Lowe sub $0x30,%r8 1672*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 1673*5d9d9091SRichard Lowe movdqa 0x30(%rdx),%xmm5 1674*5d9d9091SRichard Lowe lea 0x30(%rdx),%rdx 1675*5d9d9091SRichard Lowe cmp $0x30,%r8 1676*5d9d9091SRichard Lowe 1677*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 1678*5d9d9091SRichard Lowe #palignr $0x6,%xmm1,%xmm3 1679*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1680*5d9d9091SRichard Lowe .byte 0xd9,0x06 1681*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 1682*5d9d9091SRichard Lowe 1683*5d9d9091SRichard Lowe movdqa %xmm0,%xmm4 1684*5d9d9091SRichard Lowe #palignr $0x6,%xmm2,%xmm0 1685*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1686*5d9d9091SRichard Lowe .byte 0xc2,0x06 1687*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 1688*5d9d9091SRichard Lowe 1689*5d9d9091SRichard Lowe movdqa %xmm5,%xmm1 1690*5d9d9091SRichard Lowe #palignr $0x6,%xmm4,%xmm5 1691*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1692*5d9d9091SRichard Lowe .byte 0xec,0x06 1693*5d9d9091SRichard Lowe movdqa %xmm5,0x20(%rcx) 1694*5d9d9091SRichard Lowe 1695*5d9d9091SRichard Lowe lea 0x30(%rcx),%rcx 1696*5d9d9091SRichard Lowe jge L(mov3dqa6) 1697*5d9d9091SRichard Lowe 1698*5d9d9091SRichard Lowe cmp $0x10,%r8 1699*5d9d9091SRichard Lowe jl L(movdqa_epi) 1700*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1701*5d9d9091SRichard Lowe sub $0x10,%r8 1702*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 1703*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 # save for use next concat 1704*5d9d9091SRichard Lowe #palignr $0x6,%xmm1,%xmm3 1705*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1706*5d9d9091SRichard Lowe .byte 0xd9,0x06 1707*5d9d9091SRichard Lowe 1708*5d9d9091SRichard Lowe cmp $0x10,%r8 1709*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) # store it 1710*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 1711*5d9d9091SRichard Lowe jl L(movdqa_epi) 1712*5d9d9091SRichard Lowe 1713*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1714*5d9d9091SRichard Lowe sub $0x10,%r8 1715*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 1716*5d9d9091SRichard Lowe #palignr $0x6,%xmm2,%xmm0 1717*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1718*5d9d9091SRichard Lowe .byte 0xc2,0x06 1719*5d9d9091SRichard Lowe movdqa %xmm0,(%rcx) # store it 1720*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 1721*5d9d9091SRichard Lowe jmp L(movdqa_epi) 1722*5d9d9091SRichard Lowe 1723*5d9d9091SRichard Lowe .balign 16 1724*5d9d9091SRichard LoweL(mov3dqa7): 1725*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 1726*5d9d9091SRichard Lowe sub $0x30,%r8 1727*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 1728*5d9d9091SRichard Lowe movdqa 0x30(%rdx),%xmm5 1729*5d9d9091SRichard Lowe lea 0x30(%rdx),%rdx 1730*5d9d9091SRichard Lowe cmp $0x30,%r8 1731*5d9d9091SRichard Lowe 1732*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 1733*5d9d9091SRichard Lowe #palignr $0x7,%xmm1,%xmm3 1734*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1735*5d9d9091SRichard Lowe .byte 0xd9,0x07 1736*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 1737*5d9d9091SRichard Lowe 1738*5d9d9091SRichard Lowe movdqa %xmm0,%xmm4 1739*5d9d9091SRichard Lowe #palignr $0x7,%xmm2,%xmm0 1740*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1741*5d9d9091SRichard Lowe .byte 0xc2,0x07 1742*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 1743*5d9d9091SRichard Lowe 1744*5d9d9091SRichard Lowe movdqa %xmm5,%xmm1 1745*5d9d9091SRichard Lowe #palignr $0x7,%xmm4,%xmm5 1746*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1747*5d9d9091SRichard Lowe .byte 0xec,0x07 1748*5d9d9091SRichard Lowe movdqa %xmm5,0x20(%rcx) 1749*5d9d9091SRichard Lowe 1750*5d9d9091SRichard Lowe lea 0x30(%rcx),%rcx 1751*5d9d9091SRichard Lowe jge L(mov3dqa7) 1752*5d9d9091SRichard Lowe 1753*5d9d9091SRichard Lowe cmp $0x10,%r8 1754*5d9d9091SRichard Lowe jl L(movdqa_epi) 1755*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1756*5d9d9091SRichard Lowe sub $0x10,%r8 1757*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 1758*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 # save for use next concat 1759*5d9d9091SRichard Lowe #palignr $0x7,%xmm1,%xmm3 1760*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1761*5d9d9091SRichard Lowe .byte 0xd9,0x07 1762*5d9d9091SRichard Lowe 1763*5d9d9091SRichard Lowe cmp $0x10,%r8 1764*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) # store it 1765*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 1766*5d9d9091SRichard Lowe jl L(movdqa_epi) 1767*5d9d9091SRichard Lowe 1768*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1769*5d9d9091SRichard Lowe sub $0x10,%r8 1770*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 1771*5d9d9091SRichard Lowe #palignr $0x7,%xmm2,%xmm0 1772*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1773*5d9d9091SRichard Lowe .byte 0xc2,0x07 1774*5d9d9091SRichard Lowe movdqa %xmm0,(%rcx) # store it 1775*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 1776*5d9d9091SRichard Lowe jmp L(movdqa_epi) 1777*5d9d9091SRichard Lowe 1778*5d9d9091SRichard Lowe .balign 16 1779*5d9d9091SRichard LoweL(mov3dqa9): 1780*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 1781*5d9d9091SRichard Lowe sub $0x30,%r8 1782*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 1783*5d9d9091SRichard Lowe movdqa 0x30(%rdx),%xmm5 1784*5d9d9091SRichard Lowe lea 0x30(%rdx),%rdx 1785*5d9d9091SRichard Lowe cmp $0x30,%r8 1786*5d9d9091SRichard Lowe 1787*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 1788*5d9d9091SRichard Lowe #palignr $0x9,%xmm1,%xmm3 1789*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1790*5d9d9091SRichard Lowe .byte 0xd9,0x09 1791*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 1792*5d9d9091SRichard Lowe 1793*5d9d9091SRichard Lowe movdqa %xmm0,%xmm4 1794*5d9d9091SRichard Lowe #palignr $0x9,%xmm2,%xmm0 1795*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1796*5d9d9091SRichard Lowe .byte 0xc2,0x09 1797*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 1798*5d9d9091SRichard Lowe 1799*5d9d9091SRichard Lowe movdqa %xmm5,%xmm1 1800*5d9d9091SRichard Lowe #palignr $0x9,%xmm4,%xmm5 1801*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1802*5d9d9091SRichard Lowe .byte 0xec,0x09 1803*5d9d9091SRichard Lowe movdqa %xmm5,0x20(%rcx) 1804*5d9d9091SRichard Lowe 1805*5d9d9091SRichard Lowe lea 0x30(%rcx),%rcx 1806*5d9d9091SRichard Lowe jge L(mov3dqa9) 1807*5d9d9091SRichard Lowe 1808*5d9d9091SRichard Lowe cmp $0x10,%r8 1809*5d9d9091SRichard Lowe jl L(movdqa_epi) 1810*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1811*5d9d9091SRichard Lowe sub $0x10,%r8 1812*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 1813*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 # save for use next concat 1814*5d9d9091SRichard Lowe #palignr $0x9,%xmm1,%xmm3 1815*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1816*5d9d9091SRichard Lowe .byte 0xd9,0x09 1817*5d9d9091SRichard Lowe 1818*5d9d9091SRichard Lowe cmp $0x10,%r8 1819*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) # store it 1820*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 1821*5d9d9091SRichard Lowe jl L(movdqa_epi) 1822*5d9d9091SRichard Lowe 1823*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1824*5d9d9091SRichard Lowe sub $0x10,%r8 1825*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 1826*5d9d9091SRichard Lowe #palignr $0x9,%xmm2,%xmm0 1827*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1828*5d9d9091SRichard Lowe .byte 0xc2,0x09 1829*5d9d9091SRichard Lowe movdqa %xmm0,(%rcx) # store it 1830*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 1831*5d9d9091SRichard Lowe jmp L(movdqa_epi) 1832*5d9d9091SRichard Lowe 1833*5d9d9091SRichard Lowe .balign 16 1834*5d9d9091SRichard LoweL(mov3dqa10): 1835*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 1836*5d9d9091SRichard Lowe sub $0x30,%r8 1837*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 1838*5d9d9091SRichard Lowe movdqa 0x30(%rdx),%xmm5 1839*5d9d9091SRichard Lowe lea 0x30(%rdx),%rdx 1840*5d9d9091SRichard Lowe cmp $0x30,%r8 1841*5d9d9091SRichard Lowe 1842*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 1843*5d9d9091SRichard Lowe #palignr $0xa,%xmm1,%xmm3 1844*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1845*5d9d9091SRichard Lowe .byte 0xd9,0x0a 1846*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 1847*5d9d9091SRichard Lowe 1848*5d9d9091SRichard Lowe movdqa %xmm0,%xmm4 1849*5d9d9091SRichard Lowe #palignr $0xa,%xmm2,%xmm0 1850*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1851*5d9d9091SRichard Lowe .byte 0xc2,0x0a 1852*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 1853*5d9d9091SRichard Lowe 1854*5d9d9091SRichard Lowe movdqa %xmm5,%xmm1 1855*5d9d9091SRichard Lowe #palignr $0xa,%xmm4,%xmm5 1856*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1857*5d9d9091SRichard Lowe .byte 0xec,0x0a 1858*5d9d9091SRichard Lowe movdqa %xmm5,0x20(%rcx) 1859*5d9d9091SRichard Lowe 1860*5d9d9091SRichard Lowe lea 0x30(%rcx),%rcx 1861*5d9d9091SRichard Lowe jge L(mov3dqa10) 1862*5d9d9091SRichard Lowe 1863*5d9d9091SRichard Lowe cmp $0x10,%r8 1864*5d9d9091SRichard Lowe jl L(movdqa_epi) 1865*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1866*5d9d9091SRichard Lowe sub $0x10,%r8 1867*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 1868*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 # save for use next concat 1869*5d9d9091SRichard Lowe #palignr $0xa,%xmm1,%xmm3 1870*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1871*5d9d9091SRichard Lowe .byte 0xd9,0x0a 1872*5d9d9091SRichard Lowe 1873*5d9d9091SRichard Lowe cmp $0x10,%r8 1874*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) # store it 1875*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 1876*5d9d9091SRichard Lowe jl L(movdqa_epi) 1877*5d9d9091SRichard Lowe 1878*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1879*5d9d9091SRichard Lowe sub $0x10,%r8 1880*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 1881*5d9d9091SRichard Lowe #palignr $0xa,%xmm2,%xmm0 1882*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1883*5d9d9091SRichard Lowe .byte 0xc2,0x0a 1884*5d9d9091SRichard Lowe movdqa %xmm0,(%rcx) # store it 1885*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 1886*5d9d9091SRichard Lowe jmp L(movdqa_epi) 1887*5d9d9091SRichard Lowe 1888*5d9d9091SRichard Lowe .balign 16 1889*5d9d9091SRichard LoweL(mov3dqa11): 1890*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 1891*5d9d9091SRichard Lowe sub $0x30,%r8 1892*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 1893*5d9d9091SRichard Lowe movdqa 0x30(%rdx),%xmm5 1894*5d9d9091SRichard Lowe lea 0x30(%rdx),%rdx 1895*5d9d9091SRichard Lowe cmp $0x30,%r8 1896*5d9d9091SRichard Lowe 1897*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 1898*5d9d9091SRichard Lowe #palignr $0xb,%xmm1,%xmm3 1899*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1900*5d9d9091SRichard Lowe .byte 0xd9,0x0b 1901*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 1902*5d9d9091SRichard Lowe 1903*5d9d9091SRichard Lowe movdqa %xmm0,%xmm4 1904*5d9d9091SRichard Lowe #palignr $0xb,%xmm2,%xmm0 1905*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1906*5d9d9091SRichard Lowe .byte 0xc2,0x0b 1907*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 1908*5d9d9091SRichard Lowe 1909*5d9d9091SRichard Lowe movdqa %xmm5,%xmm1 1910*5d9d9091SRichard Lowe #palignr $0xb,%xmm4,%xmm5 1911*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1912*5d9d9091SRichard Lowe .byte 0xec,0x0b 1913*5d9d9091SRichard Lowe movdqa %xmm5,0x20(%rcx) 1914*5d9d9091SRichard Lowe 1915*5d9d9091SRichard Lowe lea 0x30(%rcx),%rcx 1916*5d9d9091SRichard Lowe jge L(mov3dqa11) 1917*5d9d9091SRichard Lowe 1918*5d9d9091SRichard Lowe cmp $0x10,%r8 1919*5d9d9091SRichard Lowe jl L(movdqa_epi) 1920*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1921*5d9d9091SRichard Lowe sub $0x10,%r8 1922*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 1923*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 # save for use next concat 1924*5d9d9091SRichard Lowe #palignr $0xb,%xmm1,%xmm3 1925*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1926*5d9d9091SRichard Lowe .byte 0xd9,0x0b 1927*5d9d9091SRichard Lowe 1928*5d9d9091SRichard Lowe cmp $0x10,%r8 1929*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) # store it 1930*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 1931*5d9d9091SRichard Lowe jl L(movdqa_epi) 1932*5d9d9091SRichard Lowe 1933*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1934*5d9d9091SRichard Lowe sub $0x10,%r8 1935*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 1936*5d9d9091SRichard Lowe #palignr $0xb,%xmm2,%xmm0 1937*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1938*5d9d9091SRichard Lowe .byte 0xc2,0x0b 1939*5d9d9091SRichard Lowe movdqa %xmm0,(%rcx) # store it 1940*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 1941*5d9d9091SRichard Lowe jmp L(movdqa_epi) 1942*5d9d9091SRichard Lowe 1943*5d9d9091SRichard Lowe .balign 16 1944*5d9d9091SRichard LoweL(mov3dqa12): 1945*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 1946*5d9d9091SRichard Lowe sub $0x30,%r8 1947*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 1948*5d9d9091SRichard Lowe movdqa 0x30(%rdx),%xmm5 1949*5d9d9091SRichard Lowe lea 0x30(%rdx),%rdx 1950*5d9d9091SRichard Lowe cmp $0x30,%r8 1951*5d9d9091SRichard Lowe 1952*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 1953*5d9d9091SRichard Lowe #palignr $0xc,%xmm1,%xmm3 1954*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1955*5d9d9091SRichard Lowe .byte 0xd9,0x0c 1956*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 1957*5d9d9091SRichard Lowe 1958*5d9d9091SRichard Lowe movdqa %xmm0,%xmm4 1959*5d9d9091SRichard Lowe #palignr $0xc,%xmm2,%xmm0 1960*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1961*5d9d9091SRichard Lowe .byte 0xc2,0x0c 1962*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 1963*5d9d9091SRichard Lowe 1964*5d9d9091SRichard Lowe movdqa %xmm5,%xmm1 1965*5d9d9091SRichard Lowe #palignr $0xc,%xmm4,%xmm5 1966*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1967*5d9d9091SRichard Lowe .byte 0xec,0x0c 1968*5d9d9091SRichard Lowe movdqa %xmm5,0x20(%rcx) 1969*5d9d9091SRichard Lowe 1970*5d9d9091SRichard Lowe lea 0x30(%rcx),%rcx 1971*5d9d9091SRichard Lowe jge L(mov3dqa12) 1972*5d9d9091SRichard Lowe 1973*5d9d9091SRichard Lowe cmp $0x10,%r8 1974*5d9d9091SRichard Lowe jl L(movdqa_epi) 1975*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1976*5d9d9091SRichard Lowe sub $0x10,%r8 1977*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 1978*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 # save for use next concat 1979*5d9d9091SRichard Lowe #palignr $0xc,%xmm1,%xmm3 1980*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1981*5d9d9091SRichard Lowe .byte 0xd9,0x0c 1982*5d9d9091SRichard Lowe 1983*5d9d9091SRichard Lowe cmp $0x10,%r8 1984*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) # store it 1985*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 1986*5d9d9091SRichard Lowe jl L(movdqa_epi) 1987*5d9d9091SRichard Lowe 1988*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1989*5d9d9091SRichard Lowe sub $0x10,%r8 1990*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 1991*5d9d9091SRichard Lowe #palignr $0xc,%xmm2,%xmm0 1992*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 1993*5d9d9091SRichard Lowe .byte 0xc2,0x0c 1994*5d9d9091SRichard Lowe movdqa %xmm0,(%rcx) # store it 1995*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 1996*5d9d9091SRichard Lowe jmp L(movdqa_epi) 1997*5d9d9091SRichard Lowe 1998*5d9d9091SRichard Lowe .balign 16 1999*5d9d9091SRichard LoweL(mov3dqa13): 2000*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 2001*5d9d9091SRichard Lowe sub $0x30,%r8 2002*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 2003*5d9d9091SRichard Lowe movdqa 0x30(%rdx),%xmm5 2004*5d9d9091SRichard Lowe lea 0x30(%rdx),%rdx 2005*5d9d9091SRichard Lowe cmp $0x30,%r8 2006*5d9d9091SRichard Lowe 2007*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 2008*5d9d9091SRichard Lowe #palignr $0xd,%xmm1,%xmm3 2009*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 2010*5d9d9091SRichard Lowe .byte 0xd9,0x0d 2011*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 2012*5d9d9091SRichard Lowe 2013*5d9d9091SRichard Lowe movdqa %xmm0,%xmm4 2014*5d9d9091SRichard Lowe #palignr $0xd,%xmm2,%xmm0 2015*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 2016*5d9d9091SRichard Lowe .byte 0xc2,0x0d 2017*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 2018*5d9d9091SRichard Lowe 2019*5d9d9091SRichard Lowe movdqa %xmm5,%xmm1 2020*5d9d9091SRichard Lowe #palignr $0xd,%xmm4,%xmm5 2021*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 2022*5d9d9091SRichard Lowe .byte 0xec,0x0d 2023*5d9d9091SRichard Lowe movdqa %xmm5,0x20(%rcx) 2024*5d9d9091SRichard Lowe 2025*5d9d9091SRichard Lowe lea 0x30(%rcx),%rcx 2026*5d9d9091SRichard Lowe jge L(mov3dqa13) 2027*5d9d9091SRichard Lowe 2028*5d9d9091SRichard Lowe cmp $0x10,%r8 2029*5d9d9091SRichard Lowe jl L(movdqa_epi) 2030*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 2031*5d9d9091SRichard Lowe sub $0x10,%r8 2032*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 2033*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 # save for use next concat 2034*5d9d9091SRichard Lowe #palignr $0xd,%xmm1,%xmm3 2035*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 2036*5d9d9091SRichard Lowe .byte 0xd9,0x0d 2037*5d9d9091SRichard Lowe 2038*5d9d9091SRichard Lowe cmp $0x10,%r8 2039*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) # store it 2040*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 2041*5d9d9091SRichard Lowe jl L(movdqa_epi) 2042*5d9d9091SRichard Lowe 2043*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 2044*5d9d9091SRichard Lowe sub $0x10,%r8 2045*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 2046*5d9d9091SRichard Lowe #palignr $0xd,%xmm2,%xmm0 2047*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 2048*5d9d9091SRichard Lowe .byte 0xc2,0x0d 2049*5d9d9091SRichard Lowe movdqa %xmm0,(%rcx) # store it 2050*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 2051*5d9d9091SRichard Lowe jmp L(movdqa_epi) 2052*5d9d9091SRichard Lowe 2053*5d9d9091SRichard Lowe .balign 16 2054*5d9d9091SRichard LoweL(mov3dqa14): 2055*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 2056*5d9d9091SRichard Lowe sub $0x30,%r8 2057*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 2058*5d9d9091SRichard Lowe movdqa 0x30(%rdx),%xmm5 2059*5d9d9091SRichard Lowe lea 0x30(%rdx),%rdx 2060*5d9d9091SRichard Lowe cmp $0x30,%r8 2061*5d9d9091SRichard Lowe 2062*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 2063*5d9d9091SRichard Lowe #palignr $0xe,%xmm1,%xmm3 2064*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 2065*5d9d9091SRichard Lowe .byte 0xd9,0x0e 2066*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 2067*5d9d9091SRichard Lowe 2068*5d9d9091SRichard Lowe movdqa %xmm0,%xmm4 2069*5d9d9091SRichard Lowe #palignr $0xe,%xmm2,%xmm0 2070*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 2071*5d9d9091SRichard Lowe .byte 0xc2,0x0e 2072*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 2073*5d9d9091SRichard Lowe 2074*5d9d9091SRichard Lowe movdqa %xmm5,%xmm1 2075*5d9d9091SRichard Lowe #palignr $0xe,%xmm4,%xmm5 2076*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 2077*5d9d9091SRichard Lowe .byte 0xec,0x0e 2078*5d9d9091SRichard Lowe movdqa %xmm5,0x20(%rcx) 2079*5d9d9091SRichard Lowe 2080*5d9d9091SRichard Lowe lea 0x30(%rcx),%rcx 2081*5d9d9091SRichard Lowe jge L(mov3dqa14) 2082*5d9d9091SRichard Lowe 2083*5d9d9091SRichard Lowe cmp $0x10,%r8 2084*5d9d9091SRichard Lowe jl L(movdqa_epi) 2085*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 2086*5d9d9091SRichard Lowe sub $0x10,%r8 2087*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 2088*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 # save for use next concat 2089*5d9d9091SRichard Lowe #palignr $0xe,%xmm1,%xmm3 2090*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 2091*5d9d9091SRichard Lowe .byte 0xd9,0x0e 2092*5d9d9091SRichard Lowe 2093*5d9d9091SRichard Lowe cmp $0x10,%r8 2094*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) # store it 2095*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 2096*5d9d9091SRichard Lowe jl L(movdqa_epi) 2097*5d9d9091SRichard Lowe 2098*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 2099*5d9d9091SRichard Lowe sub $0x10,%r8 2100*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 2101*5d9d9091SRichard Lowe #palignr $0xe,%xmm2,%xmm0 2102*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 2103*5d9d9091SRichard Lowe .byte 0xc2,0x0e 2104*5d9d9091SRichard Lowe movdqa %xmm0,(%rcx) # store it 2105*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 2106*5d9d9091SRichard Lowe jmp L(movdqa_epi) 2107*5d9d9091SRichard Lowe 2108*5d9d9091SRichard Lowe .balign 16 2109*5d9d9091SRichard LoweL(mov3dqa15): 2110*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 2111*5d9d9091SRichard Lowe sub $0x30,%r8 2112*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm0 2113*5d9d9091SRichard Lowe movdqa 0x30(%rdx),%xmm5 2114*5d9d9091SRichard Lowe lea 0x30(%rdx),%rdx 2115*5d9d9091SRichard Lowe cmp $0x30,%r8 2116*5d9d9091SRichard Lowe 2117*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 2118*5d9d9091SRichard Lowe #palignr $0xf,%xmm1,%xmm3 2119*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 2120*5d9d9091SRichard Lowe .byte 0xd9,0x0f 2121*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) 2122*5d9d9091SRichard Lowe 2123*5d9d9091SRichard Lowe movdqa %xmm0,%xmm4 2124*5d9d9091SRichard Lowe #palignr $0xf,%xmm2,%xmm0 2125*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 2126*5d9d9091SRichard Lowe .byte 0xc2,0x0f 2127*5d9d9091SRichard Lowe movdqa %xmm0,0x10(%rcx) 2128*5d9d9091SRichard Lowe 2129*5d9d9091SRichard Lowe movdqa %xmm5,%xmm1 2130*5d9d9091SRichard Lowe #palignr $0xf,%xmm4,%xmm5 2131*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 2132*5d9d9091SRichard Lowe .byte 0xec,0x0f 2133*5d9d9091SRichard Lowe movdqa %xmm5,0x20(%rcx) 2134*5d9d9091SRichard Lowe 2135*5d9d9091SRichard Lowe lea 0x30(%rcx),%rcx 2136*5d9d9091SRichard Lowe jge L(mov3dqa15) 2137*5d9d9091SRichard Lowe 2138*5d9d9091SRichard Lowe cmp $0x10,%r8 2139*5d9d9091SRichard Lowe jl L(movdqa_epi) 2140*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 2141*5d9d9091SRichard Lowe sub $0x10,%r8 2142*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 2143*5d9d9091SRichard Lowe movdqa %xmm3,%xmm2 # save for use next concat 2144*5d9d9091SRichard Lowe #palignr $0xf,%xmm1,%xmm3 2145*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 2146*5d9d9091SRichard Lowe .byte 0xd9,0x0f 2147*5d9d9091SRichard Lowe 2148*5d9d9091SRichard Lowe cmp $0x10,%r8 2149*5d9d9091SRichard Lowe movdqa %xmm3,(%rcx) # store it 2150*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 2151*5d9d9091SRichard Lowe jl L(movdqa_epi) 2152*5d9d9091SRichard Lowe 2153*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 2154*5d9d9091SRichard Lowe sub $0x10,%r8 2155*5d9d9091SRichard Lowe lea 0x10(%rdx),%rdx 2156*5d9d9091SRichard Lowe #palignr $0xf,%xmm2,%xmm0 2157*5d9d9091SRichard Lowe .byte 0x66,0x0f,0x3a,0x0f 2158*5d9d9091SRichard Lowe .byte 0xc2,0x0f 2159*5d9d9091SRichard Lowe movdqa %xmm0,(%rcx) # store it 2160*5d9d9091SRichard Lowe lea 0x10(%rcx),%rcx 2161*5d9d9091SRichard Lowe jmp L(movdqa_epi) 2162*5d9d9091SRichard Lowe 2163*5d9d9091SRichard Lowe .balign 16 2164*5d9d9091SRichard LoweL(sse2_nt_move): 2165*5d9d9091SRichard Lowe lea 0x40(%rcx),%rcx 2166*5d9d9091SRichard Lowe lea 0x40(%rdx),%rdx 2167*5d9d9091SRichard Lowe lea -0x40(%r8),%r8 2168*5d9d9091SRichard Lowe 2169*5d9d9091SRichard Lowe /* 2170*5d9d9091SRichard Lowe * doesn't matter if source is aligned for stuff out of cache. 2171*5d9d9091SRichard Lowe * the mis-aligned penalty is masked by the slowness of main memory. 2172*5d9d9091SRichard Lowe */ 2173*5d9d9091SRichard Lowe prefetchnta 0x180(%rdx) 2174*5d9d9091SRichard Lowe movdqu -0x40(%rdx),%xmm0 2175*5d9d9091SRichard Lowe movdqu -0x30(%rdx),%xmm1 2176*5d9d9091SRichard Lowe 2177*5d9d9091SRichard Lowe cmp $0x40,%r8 2178*5d9d9091SRichard Lowe movntdq %xmm0,-0x40(%rcx) 2179*5d9d9091SRichard Lowe movntdq %xmm1,-0x30(%rcx) 2180*5d9d9091SRichard Lowe 2181*5d9d9091SRichard Lowe movdqu -0x20(%rdx),%xmm2 2182*5d9d9091SRichard Lowe movdqu -0x10(%rdx),%xmm3 2183*5d9d9091SRichard Lowe 2184*5d9d9091SRichard Lowe movntdq %xmm2,-0x20(%rcx) 2185*5d9d9091SRichard Lowe movntdq %xmm3,-0x10(%rcx) 2186*5d9d9091SRichard Lowe 2187*5d9d9091SRichard Lowe jge L(sse2_nt_move) 2188*5d9d9091SRichard Lowe 2189*5d9d9091SRichard Lowe lea L(Fix16EndTable)(%rip),%r10 2190*5d9d9091SRichard Lowe mov %r8,%r9 2191*5d9d9091SRichard Lowe and $0xFFFFFFFFFFFFFFF0,%r9 2192*5d9d9091SRichard Lowe add %r9,%rcx 2193*5d9d9091SRichard Lowe add %r9,%rdx 2194*5d9d9091SRichard Lowe sub %r9,%r8 2195*5d9d9091SRichard Lowe shr $0x4,%r9 2196*5d9d9091SRichard Lowe sfence 2197*5d9d9091SRichard Lowe 2198*5d9d9091SRichard Lowe movslq (%r10,%r9,4),%r11 2199*5d9d9091SRichard Lowe lea (%r11,%r10,1),%r10 2200*5d9d9091SRichard Lowe jmpq *%r10 2201*5d9d9091SRichard Lowe 2202*5d9d9091SRichard Lowe .balign 16 2203*5d9d9091SRichard LoweL(Fix16EndTable): 2204*5d9d9091SRichard Lowe .int L(fix16_0)-L(Fix16EndTable) 2205*5d9d9091SRichard Lowe .int L(fix16_1)-L(Fix16EndTable) 2206*5d9d9091SRichard Lowe .int L(fix16_2)-L(Fix16EndTable) 2207*5d9d9091SRichard Lowe .int L(fix16_3)-L(Fix16EndTable) 2208*5d9d9091SRichard Lowe 2209*5d9d9091SRichard Lowe .balign 16 2210*5d9d9091SRichard LoweL(fix16_3): 2211*5d9d9091SRichard Lowe movdqu -0x30(%rdx),%xmm1 2212*5d9d9091SRichard Lowe movdqa %xmm1,-0x30(%rcx) 2213*5d9d9091SRichard LoweL(fix16_2): 2214*5d9d9091SRichard Lowe movdqu -0x20(%rdx),%xmm2 2215*5d9d9091SRichard Lowe movdqa %xmm2,-0x20(%rcx) 2216*5d9d9091SRichard LoweL(fix16_1): 2217*5d9d9091SRichard Lowe movdqu -0x10(%rdx),%xmm3 2218*5d9d9091SRichard Lowe movdqa %xmm3,-0x10(%rcx) 2219*5d9d9091SRichard LoweL(fix16_0): 2220*5d9d9091SRichard Lowe lea L(fwdPxQx)(%rip),%r10 2221*5d9d9091SRichard Lowe add %r8,%rdx 2222*5d9d9091SRichard Lowe add %r8,%rcx 2223*5d9d9091SRichard Lowe 2224*5d9d9091SRichard Lowe movslq (%r10,%r8,4),%r9 2225*5d9d9091SRichard Lowe lea (%r9,%r10,1),%r10 2226*5d9d9091SRichard Lowe jmpq *%r10 2227*5d9d9091SRichard Lowe 2228*5d9d9091SRichard Lowe .balign 16 2229*5d9d9091SRichard LoweL(pre_both_aligned): 2230*5d9d9091SRichard Lowe cmp $0x80,%r8 2231*5d9d9091SRichard Lowe jl L(fix_16b) 2232*5d9d9091SRichard Lowe 2233*5d9d9091SRichard Lowe .balign 16 2234*5d9d9091SRichard LoweL(both_aligned): 2235*5d9d9091SRichard Lowe 2236*5d9d9091SRichard Lowe /* 2237*5d9d9091SRichard Lowe * this 'paired' load/load/store/store seems to do best. 2238*5d9d9091SRichard Lowe */ 2239*5d9d9091SRichard Lowe movdqa (%rdx),%xmm0 2240*5d9d9091SRichard Lowe movdqa 0x10(%rdx),%xmm1 2241*5d9d9091SRichard Lowe 2242*5d9d9091SRichard Lowe movdqa %xmm0,(%rcx) 2243*5d9d9091SRichard Lowe movdqa %xmm1,0x10(%rcx) 2244*5d9d9091SRichard Lowe lea -0x80(%r8),%r8 2245*5d9d9091SRichard Lowe 2246*5d9d9091SRichard Lowe movdqa 0x20(%rdx),%xmm2 2247*5d9d9091SRichard Lowe movdqa 0x30(%rdx),%xmm3 2248*5d9d9091SRichard Lowe 2249*5d9d9091SRichard Lowe movdqa %xmm2,0x20(%rcx) 2250*5d9d9091SRichard Lowe movdqa %xmm3,0x30(%rcx) 2251*5d9d9091SRichard Lowe 2252*5d9d9091SRichard Lowe movdqa 0x40(%rdx),%xmm0 2253*5d9d9091SRichard Lowe movdqa 0x50(%rdx),%xmm1 2254*5d9d9091SRichard Lowe cmp $0x80,%r8 2255*5d9d9091SRichard Lowe 2256*5d9d9091SRichard Lowe movdqa %xmm0,0x40(%rcx) 2257*5d9d9091SRichard Lowe movdqa %xmm1,0x50(%rcx) 2258*5d9d9091SRichard Lowe 2259*5d9d9091SRichard Lowe movdqa 0x60(%rdx),%xmm2 2260*5d9d9091SRichard Lowe movdqa 0x70(%rdx),%xmm3 2261*5d9d9091SRichard Lowe lea 0x80(%rdx),%rdx 2262*5d9d9091SRichard Lowe movdqa %xmm2,0x60(%rcx) 2263*5d9d9091SRichard Lowe movdqa %xmm3,0x70(%rcx) 2264*5d9d9091SRichard Lowe lea 0x80(%rcx),%rcx 2265*5d9d9091SRichard Lowe jge L(both_aligned) 2266*5d9d9091SRichard Lowe 2267*5d9d9091SRichard LoweL(fix_16b): 2268*5d9d9091SRichard Lowe add %r8,%rcx 2269*5d9d9091SRichard Lowe lea L(fwdPxQx)(%rip),%r10 2270*5d9d9091SRichard Lowe add %r8,%rdx 2271*5d9d9091SRichard Lowe 2272*5d9d9091SRichard Lowe movslq (%r10,%r8,4),%r9 2273*5d9d9091SRichard Lowe lea (%r9,%r10,1),%r10 2274*5d9d9091SRichard Lowe jmpq *%r10 2275*5d9d9091SRichard Lowe 2276*5d9d9091SRichard Lowe .balign 16 2277*5d9d9091SRichard LoweL(Loop8byte_pre): 2278*5d9d9091SRichard Lowe # Use 8-byte moves 2279*5d9d9091SRichard Lowe mov .largest_level_cache_size(%rip),%r9d 2280*5d9d9091SRichard Lowe shr %r9 # take half of it 2281*5d9d9091SRichard Lowe cmp %r9,%r8 2282*5d9d9091SRichard Lowe jge L(byte8_nt_top) 2283*5d9d9091SRichard Lowe # Find out whether to use rep movsq 2284*5d9d9091SRichard Lowe cmp $4096,%r8 2285*5d9d9091SRichard Lowe jle L(byte8_top) 2286*5d9d9091SRichard Lowe mov .amd64cache1half(%rip),%r9d # half of l1 cache 2287*5d9d9091SRichard Lowe cmp %r9,%r8 2288*5d9d9091SRichard Lowe jle L(use_rep) 2289*5d9d9091SRichard Lowe 2290*5d9d9091SRichard Lowe .balign 16 2291*5d9d9091SRichard LoweL(byte8_top): 2292*5d9d9091SRichard Lowe mov (%rdx),%r9 2293*5d9d9091SRichard Lowe mov 0x8(%rdx),%r10 2294*5d9d9091SRichard Lowe lea -0x40(%r8),%r8 2295*5d9d9091SRichard Lowe mov %r9,(%rcx) 2296*5d9d9091SRichard Lowe mov %r10,0x8(%rcx) 2297*5d9d9091SRichard Lowe mov 0x10(%rdx),%r11 2298*5d9d9091SRichard Lowe mov 0x18(%rdx),%r9 2299*5d9d9091SRichard Lowe mov %r11,0x10(%rcx) 2300*5d9d9091SRichard Lowe mov %r9,0x18(%rcx) 2301*5d9d9091SRichard Lowe 2302*5d9d9091SRichard Lowe cmp $0x40,%r8 2303*5d9d9091SRichard Lowe mov 0x20(%rdx),%r10 2304*5d9d9091SRichard Lowe mov 0x28(%rdx),%r11 2305*5d9d9091SRichard Lowe mov %r10,0x20(%rcx) 2306*5d9d9091SRichard Lowe mov %r11,0x28(%rcx) 2307*5d9d9091SRichard Lowe mov 0x30(%rdx),%r9 2308*5d9d9091SRichard Lowe mov 0x38(%rdx),%r10 2309*5d9d9091SRichard Lowe lea 0x40(%rdx),%rdx 2310*5d9d9091SRichard Lowe mov %r9,0x30(%rcx) 2311*5d9d9091SRichard Lowe mov %r10,0x38(%rcx) 2312*5d9d9091SRichard Lowe lea 0x40(%rcx),%rcx 2313*5d9d9091SRichard Lowe jg L(byte8_top) 2314*5d9d9091SRichard Lowe 2315*5d9d9091SRichard LoweL(byte8_end): 2316*5d9d9091SRichard Lowe lea L(fwdPxQx)(%rip),%r10 2317*5d9d9091SRichard Lowe lea (%rdx,%r8,1),%rdx 2318*5d9d9091SRichard Lowe lea (%rcx,%r8,1),%rcx 2319*5d9d9091SRichard Lowe 2320*5d9d9091SRichard Lowe movslq (%r10,%r8,4),%r9 2321*5d9d9091SRichard Lowe lea (%r9,%r10,1),%r10 2322*5d9d9091SRichard Lowe jmpq *%r10 2323*5d9d9091SRichard Lowe 2324*5d9d9091SRichard Lowe .balign 16 2325*5d9d9091SRichard LoweL(use_rep): 2326*5d9d9091SRichard Lowe mov %rdx,%rsi # %rsi = source 2327*5d9d9091SRichard Lowe mov %rcx,%rdi # %rdi = destination 2328*5d9d9091SRichard Lowe mov %r8,%rcx # %rcx = count 2329*5d9d9091SRichard Lowe shrq $3,%rcx # 8-byte word count 2330*5d9d9091SRichard Lowe rep 2331*5d9d9091SRichard Lowe movsq 2332*5d9d9091SRichard Lowe mov %rsi,%rdx # source 2333*5d9d9091SRichard Lowe mov %rdi,%rcx # destination 2334*5d9d9091SRichard Lowe andq $7,%r8 # remainder 2335*5d9d9091SRichard Lowe jnz L(byte8_end) 2336*5d9d9091SRichard Lowe ret 2337*5d9d9091SRichard Lowe 2338*5d9d9091SRichard Lowe .balign 16 2339*5d9d9091SRichard LoweL(byte8_nt_top): 2340*5d9d9091SRichard Lowe sub $0x40,%r8 2341*5d9d9091SRichard Lowe prefetchnta 0x180(%rdx) 2342*5d9d9091SRichard Lowe mov (%rdx),%r9 2343*5d9d9091SRichard Lowe movnti %r9,(%rcx) 2344*5d9d9091SRichard Lowe mov 0x8(%rdx),%r10 2345*5d9d9091SRichard Lowe movnti %r10,0x8(%rcx) 2346*5d9d9091SRichard Lowe mov 0x10(%rdx),%r11 2347*5d9d9091SRichard Lowe movnti %r11,0x10(%rcx) 2348*5d9d9091SRichard Lowe mov 0x18(%rdx),%r9 2349*5d9d9091SRichard Lowe movnti %r9,0x18(%rcx) 2350*5d9d9091SRichard Lowe mov 0x20(%rdx),%r10 2351*5d9d9091SRichard Lowe movnti %r10,0x20(%rcx) 2352*5d9d9091SRichard Lowe mov 0x28(%rdx),%r11 2353*5d9d9091SRichard Lowe movnti %r11,0x28(%rcx) 2354*5d9d9091SRichard Lowe mov 0x30(%rdx),%r9 2355*5d9d9091SRichard Lowe movnti %r9,0x30(%rcx) 2356*5d9d9091SRichard Lowe mov 0x38(%rdx),%r10 2357*5d9d9091SRichard Lowe movnti %r10,0x38(%rcx) 2358*5d9d9091SRichard Lowe 2359*5d9d9091SRichard Lowe lea 0x40(%rdx),%rdx 2360*5d9d9091SRichard Lowe lea 0x40(%rcx),%rcx 2361*5d9d9091SRichard Lowe cmp $0x40,%r8 2362*5d9d9091SRichard Lowe jge L(byte8_nt_top) 2363*5d9d9091SRichard Lowe sfence 2364*5d9d9091SRichard Lowe jmp L(byte8_end) 2365*5d9d9091SRichard Lowe 2366*5d9d9091SRichard Lowe SET_SIZE(memcpy) 2367*5d9d9091SRichard Lowe 2368*5d9d9091SRichard Lowe .balign 16 2369*5d9d9091SRichard LoweL(CopyBackwards): 2370*5d9d9091SRichard Lowe mov %rdx,%r8 2371*5d9d9091SRichard Lowe mov %rdi,%rcx 2372*5d9d9091SRichard Lowe mov %rsi,%rdx 2373*5d9d9091SRichard Lowe mov %rdi,%rax # return value 2374*5d9d9091SRichard Lowe 2375*5d9d9091SRichard Lowe # ck alignment of last byte 2376*5d9d9091SRichard Lowe lea (%rcx,%r8,1),%rcx 2377*5d9d9091SRichard Lowe test $0x7,%rcx 2378*5d9d9091SRichard Lowe lea (%rdx,%r8,1),%rdx 2379*5d9d9091SRichard Lowe jne L(bk_align) 2380*5d9d9091SRichard Lowe 2381*5d9d9091SRichard LoweL(bk_qw_aligned): 2382*5d9d9091SRichard Lowe lea L(bkPxQx)(%rip),%r10 2383*5d9d9091SRichard Lowe 2384*5d9d9091SRichard Lowe cmp $0x90,%r8 # 144 2385*5d9d9091SRichard Lowe jg L(bk_ck_sse2_alignment) 2386*5d9d9091SRichard Lowe 2387*5d9d9091SRichard Lowe sub %r8,%rcx 2388*5d9d9091SRichard Lowe sub %r8,%rdx 2389*5d9d9091SRichard Lowe 2390*5d9d9091SRichard Lowe movslq (%r10,%r8,4),%r9 2391*5d9d9091SRichard Lowe lea (%r9,%r10,1),%r10 2392*5d9d9091SRichard Lowe jmpq *%r10 2393*5d9d9091SRichard Lowe 2394*5d9d9091SRichard Lowe .balign 16 2395*5d9d9091SRichard LoweL(bk_align): 2396*5d9d9091SRichard Lowe # only align if len > 8 2397*5d9d9091SRichard Lowe cmp $8,%r8 2398*5d9d9091SRichard Lowe jle L(bk_qw_aligned) 2399*5d9d9091SRichard Lowe test $0x1,%rcx 2400*5d9d9091SRichard Lowe je L(bk_tst2) 2401*5d9d9091SRichard Lowe dec %rcx 2402*5d9d9091SRichard Lowe dec %rdx 2403*5d9d9091SRichard Lowe dec %r8 2404*5d9d9091SRichard Lowe mov (%rdx),%r9b 2405*5d9d9091SRichard Lowe mov %r9b,(%rcx) 2406*5d9d9091SRichard Lowe 2407*5d9d9091SRichard LoweL(bk_tst2): 2408*5d9d9091SRichard Lowe test $0x2,%rcx 2409*5d9d9091SRichard Lowe je L(bk_tst3) 2410*5d9d9091SRichard Lowe 2411*5d9d9091SRichard LoweL(bk_got2): 2412*5d9d9091SRichard Lowe sub $0x2,%rcx 2413*5d9d9091SRichard Lowe sub $0x2,%rdx 2414*5d9d9091SRichard Lowe sub $0x2,%r8 2415*5d9d9091SRichard Lowe movzwq (%rdx),%r9 2416*5d9d9091SRichard Lowe mov %r9w,(%rcx) 2417*5d9d9091SRichard Lowe 2418*5d9d9091SRichard LoweL(bk_tst3): 2419*5d9d9091SRichard Lowe test $0x4,%rcx 2420*5d9d9091SRichard Lowe je L(bk_qw_aligned) 2421*5d9d9091SRichard Lowe 2422*5d9d9091SRichard LoweL(bk_got3): 2423*5d9d9091SRichard Lowe sub $0x4,%rcx 2424*5d9d9091SRichard Lowe sub $0x4,%rdx 2425*5d9d9091SRichard Lowe sub $0x4,%r8 2426*5d9d9091SRichard Lowe mov (%rdx),%r9d 2427*5d9d9091SRichard Lowe mov %r9d,(%rcx) 2428*5d9d9091SRichard Lowe jmp L(bk_qw_aligned) 2429*5d9d9091SRichard Lowe 2430*5d9d9091SRichard Lowe .balign 16 2431*5d9d9091SRichard LoweL(bk_ck_sse2_alignment): 2432*5d9d9091SRichard Lowe cmpl $NO_SSE,.memops_method(%rip) 2433*5d9d9091SRichard Lowe je L(bk_use_rep) 2434*5d9d9091SRichard Lowe # check alignment of last byte 2435*5d9d9091SRichard Lowe test $0xf,%rcx 2436*5d9d9091SRichard Lowe jz L(bk_sse2_cpy) 2437*5d9d9091SRichard Lowe 2438*5d9d9091SRichard LoweL(bk_sse2_align): 2439*5d9d9091SRichard Lowe # only here if already aligned on at least a qword bndry 2440*5d9d9091SRichard Lowe sub $0x8,%rcx 2441*5d9d9091SRichard Lowe sub $0x8,%rdx 2442*5d9d9091SRichard Lowe sub $0x8,%r8 2443*5d9d9091SRichard Lowe mov (%rdx),%r9 2444*5d9d9091SRichard Lowe mov %r9,(%rcx) 2445*5d9d9091SRichard Lowe #jmp L(bk_sse2_cpy) 2446*5d9d9091SRichard Lowe 2447*5d9d9091SRichard Lowe .balign 16 2448*5d9d9091SRichard LoweL(bk_sse2_cpy): 2449*5d9d9091SRichard Lowe sub $0x80,%rcx # 128 2450*5d9d9091SRichard Lowe sub $0x80,%rdx 2451*5d9d9091SRichard Lowe movdqu 0x70(%rdx),%xmm3 2452*5d9d9091SRichard Lowe movdqu 0x60(%rdx),%xmm2 2453*5d9d9091SRichard Lowe movdqa %xmm3,0x70(%rcx) 2454*5d9d9091SRichard Lowe movdqa %xmm2,0x60(%rcx) 2455*5d9d9091SRichard Lowe sub $0x80,%r8 2456*5d9d9091SRichard Lowe movdqu 0x50(%rdx),%xmm1 2457*5d9d9091SRichard Lowe movdqu 0x40(%rdx),%xmm0 2458*5d9d9091SRichard Lowe movdqa %xmm1,0x50(%rcx) 2459*5d9d9091SRichard Lowe movdqa %xmm0,0x40(%rcx) 2460*5d9d9091SRichard Lowe 2461*5d9d9091SRichard Lowe cmp $0x80,%r8 2462*5d9d9091SRichard Lowe movdqu 0x30(%rdx),%xmm3 2463*5d9d9091SRichard Lowe movdqu 0x20(%rdx),%xmm2 2464*5d9d9091SRichard Lowe movdqa %xmm3,0x30(%rcx) 2465*5d9d9091SRichard Lowe movdqa %xmm2,0x20(%rcx) 2466*5d9d9091SRichard Lowe movdqu 0x10(%rdx),%xmm1 2467*5d9d9091SRichard Lowe movdqu (%rdx),%xmm0 2468*5d9d9091SRichard Lowe movdqa %xmm1,0x10(%rcx) 2469*5d9d9091SRichard Lowe movdqa %xmm0,(%rcx) 2470*5d9d9091SRichard Lowe jge L(bk_sse2_cpy) 2471*5d9d9091SRichard Lowe 2472*5d9d9091SRichard LoweL(bk_sse2_cpy_end): 2473*5d9d9091SRichard Lowe lea L(bkPxQx)(%rip),%r10 2474*5d9d9091SRichard Lowe sub %r8,%rdx 2475*5d9d9091SRichard Lowe sub %r8,%rcx 2476*5d9d9091SRichard Lowe movslq (%r10,%r8,4),%r9 2477*5d9d9091SRichard Lowe lea (%r9,%r10,1),%r10 2478*5d9d9091SRichard Lowe jmpq *%r10 2479*5d9d9091SRichard Lowe 2480*5d9d9091SRichard Lowe .balign 16 2481*5d9d9091SRichard LoweL(bk_use_rep): 2482*5d9d9091SRichard Lowe xchg %rcx,%r9 2483*5d9d9091SRichard Lowe mov %rdx,%rsi # source 2484*5d9d9091SRichard Lowe mov %r9,%rdi # destination 2485*5d9d9091SRichard Lowe mov %r8,%rcx # count 2486*5d9d9091SRichard Lowe sub $8,%rsi 2487*5d9d9091SRichard Lowe sub $8,%rdi 2488*5d9d9091SRichard Lowe shr $3,%rcx 2489*5d9d9091SRichard Lowe std # reverse direction 2490*5d9d9091SRichard Lowe rep 2491*5d9d9091SRichard Lowe movsq 2492*5d9d9091SRichard Lowe cld # reset direction flag 2493*5d9d9091SRichard Lowe 2494*5d9d9091SRichard Lowe xchg %rcx,%r9 2495*5d9d9091SRichard Lowe lea L(bkPxQx)(%rip),%r10 2496*5d9d9091SRichard Lowe sub %r8,%rdx 2497*5d9d9091SRichard Lowe sub %r8,%rcx 2498*5d9d9091SRichard Lowe andq $7,%r8 # remainder 2499*5d9d9091SRichard Lowe jz 2f 2500*5d9d9091SRichard Lowe movslq (%r10,%r8,4),%r9 2501*5d9d9091SRichard Lowe lea (%r9,%r10,1),%r10 2502*5d9d9091SRichard Lowe jmpq *%r10 2503*5d9d9091SRichard Lowe2: 2504*5d9d9091SRichard Lowe ret 2505*5d9d9091SRichard Lowe 2506*5d9d9091SRichard Lowe .balign 16 2507*5d9d9091SRichard LoweL(bkP0QI): 2508*5d9d9091SRichard Lowe mov 0x88(%rdx),%r10 2509*5d9d9091SRichard Lowe mov %r10,0x88(%rcx) 2510*5d9d9091SRichard LoweL(bkP0QH): 2511*5d9d9091SRichard Lowe mov 0x80(%rdx),%r10 2512*5d9d9091SRichard Lowe mov %r10,0x80(%rcx) 2513*5d9d9091SRichard LoweL(bkP0QG): 2514*5d9d9091SRichard Lowe mov 0x78(%rdx),%r9 2515*5d9d9091SRichard Lowe mov %r9,0x78(%rcx) 2516*5d9d9091SRichard LoweL(bkP0QF): 2517*5d9d9091SRichard Lowe mov 0x70(%rdx),%r11 2518*5d9d9091SRichard Lowe mov %r11,0x70(%rcx) 2519*5d9d9091SRichard LoweL(bkP0QE): 2520*5d9d9091SRichard Lowe mov 0x68(%rdx),%r10 2521*5d9d9091SRichard Lowe mov %r10,0x68(%rcx) 2522*5d9d9091SRichard LoweL(bkP0QD): 2523*5d9d9091SRichard Lowe mov 0x60(%rdx),%r9 2524*5d9d9091SRichard Lowe mov %r9,0x60(%rcx) 2525*5d9d9091SRichard LoweL(bkP0QC): 2526*5d9d9091SRichard Lowe mov 0x58(%rdx),%r11 2527*5d9d9091SRichard Lowe mov %r11,0x58(%rcx) 2528*5d9d9091SRichard LoweL(bkP0QB): 2529*5d9d9091SRichard Lowe mov 0x50(%rdx),%r10 2530*5d9d9091SRichard Lowe mov %r10,0x50(%rcx) 2531*5d9d9091SRichard LoweL(bkP0QA): 2532*5d9d9091SRichard Lowe mov 0x48(%rdx),%r9 2533*5d9d9091SRichard Lowe mov %r9,0x48(%rcx) 2534*5d9d9091SRichard LoweL(bkP0Q9): 2535*5d9d9091SRichard Lowe mov 0x40(%rdx),%r11 2536*5d9d9091SRichard Lowe mov %r11,0x40(%rcx) 2537*5d9d9091SRichard LoweL(bkP0Q8): 2538*5d9d9091SRichard Lowe mov 0x38(%rdx),%r10 2539*5d9d9091SRichard Lowe mov %r10,0x38(%rcx) 2540*5d9d9091SRichard LoweL(bkP0Q7): 2541*5d9d9091SRichard Lowe mov 0x30(%rdx),%r9 2542*5d9d9091SRichard Lowe mov %r9,0x30(%rcx) 2543*5d9d9091SRichard LoweL(bkP0Q6): 2544*5d9d9091SRichard Lowe mov 0x28(%rdx),%r11 2545*5d9d9091SRichard Lowe mov %r11,0x28(%rcx) 2546*5d9d9091SRichard LoweL(bkP0Q5): 2547*5d9d9091SRichard Lowe mov 0x20(%rdx),%r10 2548*5d9d9091SRichard Lowe mov %r10,0x20(%rcx) 2549*5d9d9091SRichard LoweL(bkP0Q4): 2550*5d9d9091SRichard Lowe mov 0x18(%rdx),%r9 2551*5d9d9091SRichard Lowe mov %r9,0x18(%rcx) 2552*5d9d9091SRichard LoweL(bkP0Q3): 2553*5d9d9091SRichard Lowe mov 0x10(%rdx),%r11 2554*5d9d9091SRichard Lowe mov %r11,0x10(%rcx) 2555*5d9d9091SRichard LoweL(bkP0Q2): 2556*5d9d9091SRichard Lowe mov 0x8(%rdx),%r10 2557*5d9d9091SRichard Lowe mov %r10,0x8(%rcx) 2558*5d9d9091SRichard LoweL(bkP0Q1): 2559*5d9d9091SRichard Lowe mov (%rdx),%r9 2560*5d9d9091SRichard Lowe mov %r9,(%rcx) 2561*5d9d9091SRichard LoweL(bkP0Q0): 2562*5d9d9091SRichard Lowe ret 2563*5d9d9091SRichard Lowe 2564*5d9d9091SRichard Lowe .balign 16 2565*5d9d9091SRichard LoweL(bkP1QI): 2566*5d9d9091SRichard Lowe mov 0x89(%rdx),%r10 2567*5d9d9091SRichard Lowe mov %r10,0x89(%rcx) 2568*5d9d9091SRichard LoweL(bkP1QH): 2569*5d9d9091SRichard Lowe mov 0x81(%rdx),%r11 2570*5d9d9091SRichard Lowe mov %r11,0x81(%rcx) 2571*5d9d9091SRichard LoweL(bkP1QG): 2572*5d9d9091SRichard Lowe mov 0x79(%rdx),%r10 2573*5d9d9091SRichard Lowe mov %r10,0x79(%rcx) 2574*5d9d9091SRichard LoweL(bkP1QF): 2575*5d9d9091SRichard Lowe mov 0x71(%rdx),%r9 2576*5d9d9091SRichard Lowe mov %r9,0x71(%rcx) 2577*5d9d9091SRichard LoweL(bkP1QE): 2578*5d9d9091SRichard Lowe mov 0x69(%rdx),%r11 2579*5d9d9091SRichard Lowe mov %r11,0x69(%rcx) 2580*5d9d9091SRichard LoweL(bkP1QD): 2581*5d9d9091SRichard Lowe mov 0x61(%rdx),%r10 2582*5d9d9091SRichard Lowe mov %r10,0x61(%rcx) 2583*5d9d9091SRichard LoweL(bkP1QC): 2584*5d9d9091SRichard Lowe mov 0x59(%rdx),%r9 2585*5d9d9091SRichard Lowe mov %r9,0x59(%rcx) 2586*5d9d9091SRichard LoweL(bkP1QB): 2587*5d9d9091SRichard Lowe mov 0x51(%rdx),%r11 2588*5d9d9091SRichard Lowe mov %r11,0x51(%rcx) 2589*5d9d9091SRichard LoweL(bkP1QA): 2590*5d9d9091SRichard Lowe mov 0x49(%rdx),%r10 2591*5d9d9091SRichard Lowe mov %r10,0x49(%rcx) 2592*5d9d9091SRichard LoweL(bkP1Q9): 2593*5d9d9091SRichard Lowe mov 0x41(%rdx),%r9 2594*5d9d9091SRichard Lowe mov %r9,0x41(%rcx) 2595*5d9d9091SRichard LoweL(bkP1Q8): 2596*5d9d9091SRichard Lowe mov 0x39(%rdx),%r11 2597*5d9d9091SRichard Lowe mov %r11,0x39(%rcx) 2598*5d9d9091SRichard LoweL(bkP1Q7): 2599*5d9d9091SRichard Lowe mov 0x31(%rdx),%r10 2600*5d9d9091SRichard Lowe mov %r10,0x31(%rcx) 2601*5d9d9091SRichard LoweL(bkP1Q6): 2602*5d9d9091SRichard Lowe mov 0x29(%rdx),%r9 2603*5d9d9091SRichard Lowe mov %r9,0x29(%rcx) 2604*5d9d9091SRichard LoweL(bkP1Q5): 2605*5d9d9091SRichard Lowe mov 0x21(%rdx),%r11 2606*5d9d9091SRichard Lowe mov %r11,0x21(%rcx) 2607*5d9d9091SRichard LoweL(bkP1Q4): 2608*5d9d9091SRichard Lowe mov 0x19(%rdx),%r10 2609*5d9d9091SRichard Lowe mov %r10,0x19(%rcx) 2610*5d9d9091SRichard LoweL(bkP1Q3): 2611*5d9d9091SRichard Lowe mov 0x11(%rdx),%r9 2612*5d9d9091SRichard Lowe mov %r9,0x11(%rcx) 2613*5d9d9091SRichard LoweL(bkP1Q2): 2614*5d9d9091SRichard Lowe mov 0x9(%rdx),%r11 2615*5d9d9091SRichard Lowe mov %r11,0x9(%rcx) 2616*5d9d9091SRichard LoweL(bkP1Q1): 2617*5d9d9091SRichard Lowe mov 0x1(%rdx),%r10 2618*5d9d9091SRichard Lowe mov %r10,0x1(%rcx) 2619*5d9d9091SRichard LoweL(bkP1Q0): 2620*5d9d9091SRichard Lowe mov (%rdx),%r9b 2621*5d9d9091SRichard Lowe mov %r9b,(%rcx) 2622*5d9d9091SRichard Lowe ret 2623*5d9d9091SRichard Lowe 2624*5d9d9091SRichard Lowe .balign 16 2625*5d9d9091SRichard LoweL(bkP2QI): 2626*5d9d9091SRichard Lowe mov 0x8a(%rdx),%r10 2627*5d9d9091SRichard Lowe mov %r10,0x8a(%rcx) 2628*5d9d9091SRichard LoweL(bkP2QH): 2629*5d9d9091SRichard Lowe mov 0x82(%rdx),%r11 2630*5d9d9091SRichard Lowe mov %r11,0x82(%rcx) 2631*5d9d9091SRichard LoweL(bkP2QG): 2632*5d9d9091SRichard Lowe mov 0x7a(%rdx),%r10 2633*5d9d9091SRichard Lowe mov %r10,0x7a(%rcx) 2634*5d9d9091SRichard LoweL(bkP2QF): 2635*5d9d9091SRichard Lowe mov 0x72(%rdx),%r9 2636*5d9d9091SRichard Lowe mov %r9,0x72(%rcx) 2637*5d9d9091SRichard LoweL(bkP2QE): 2638*5d9d9091SRichard Lowe mov 0x6a(%rdx),%r11 2639*5d9d9091SRichard Lowe mov %r11,0x6a(%rcx) 2640*5d9d9091SRichard LoweL(bkP2QD): 2641*5d9d9091SRichard Lowe mov 0x62(%rdx),%r10 2642*5d9d9091SRichard Lowe mov %r10,0x62(%rcx) 2643*5d9d9091SRichard LoweL(bkP2QC): 2644*5d9d9091SRichard Lowe mov 0x5a(%rdx),%r9 2645*5d9d9091SRichard Lowe mov %r9,0x5a(%rcx) 2646*5d9d9091SRichard LoweL(bkP2QB): 2647*5d9d9091SRichard Lowe mov 0x52(%rdx),%r11 2648*5d9d9091SRichard Lowe mov %r11,0x52(%rcx) 2649*5d9d9091SRichard LoweL(bkP2QA): 2650*5d9d9091SRichard Lowe mov 0x4a(%rdx),%r10 2651*5d9d9091SRichard Lowe mov %r10,0x4a(%rcx) 2652*5d9d9091SRichard LoweL(bkP2Q9): 2653*5d9d9091SRichard Lowe mov 0x42(%rdx),%r9 2654*5d9d9091SRichard Lowe mov %r9,0x42(%rcx) 2655*5d9d9091SRichard LoweL(bkP2Q8): 2656*5d9d9091SRichard Lowe mov 0x3a(%rdx),%r11 2657*5d9d9091SRichard Lowe mov %r11,0x3a(%rcx) 2658*5d9d9091SRichard LoweL(bkP2Q7): 2659*5d9d9091SRichard Lowe mov 0x32(%rdx),%r10 2660*5d9d9091SRichard Lowe mov %r10,0x32(%rcx) 2661*5d9d9091SRichard LoweL(bkP2Q6): 2662*5d9d9091SRichard Lowe mov 0x2a(%rdx),%r9 2663*5d9d9091SRichard Lowe mov %r9,0x2a(%rcx) 2664*5d9d9091SRichard LoweL(bkP2Q5): 2665*5d9d9091SRichard Lowe mov 0x22(%rdx),%r11 2666*5d9d9091SRichard Lowe mov %r11,0x22(%rcx) 2667*5d9d9091SRichard LoweL(bkP2Q4): 2668*5d9d9091SRichard Lowe mov 0x1a(%rdx),%r10 2669*5d9d9091SRichard Lowe mov %r10,0x1a(%rcx) 2670*5d9d9091SRichard LoweL(bkP2Q3): 2671*5d9d9091SRichard Lowe mov 0x12(%rdx),%r9 2672*5d9d9091SRichard Lowe mov %r9,0x12(%rcx) 2673*5d9d9091SRichard LoweL(bkP2Q2): 2674*5d9d9091SRichard Lowe mov 0xa(%rdx),%r11 2675*5d9d9091SRichard Lowe mov %r11,0xa(%rcx) 2676*5d9d9091SRichard LoweL(bkP2Q1): 2677*5d9d9091SRichard Lowe mov 0x2(%rdx),%r10 2678*5d9d9091SRichard Lowe mov %r10,0x2(%rcx) 2679*5d9d9091SRichard LoweL(bkP2Q0): 2680*5d9d9091SRichard Lowe mov (%rdx),%r9w 2681*5d9d9091SRichard Lowe mov %r9w,(%rcx) 2682*5d9d9091SRichard Lowe ret 2683*5d9d9091SRichard Lowe 2684*5d9d9091SRichard Lowe .balign 16 2685*5d9d9091SRichard LoweL(bkP3QI): 2686*5d9d9091SRichard Lowe mov 0x8b(%rdx),%r10 2687*5d9d9091SRichard Lowe mov %r10,0x8b(%rcx) 2688*5d9d9091SRichard LoweL(bkP3QH): 2689*5d9d9091SRichard Lowe mov 0x83(%rdx),%r11 2690*5d9d9091SRichard Lowe mov %r11,0x83(%rcx) 2691*5d9d9091SRichard LoweL(bkP3QG): 2692*5d9d9091SRichard Lowe mov 0x7b(%rdx),%r10 2693*5d9d9091SRichard Lowe mov %r10,0x7b(%rcx) 2694*5d9d9091SRichard LoweL(bkP3QF): 2695*5d9d9091SRichard Lowe mov 0x73(%rdx),%r9 2696*5d9d9091SRichard Lowe mov %r9,0x73(%rcx) 2697*5d9d9091SRichard LoweL(bkP3QE): 2698*5d9d9091SRichard Lowe mov 0x6b(%rdx),%r11 2699*5d9d9091SRichard Lowe mov %r11,0x6b(%rcx) 2700*5d9d9091SRichard LoweL(bkP3QD): 2701*5d9d9091SRichard Lowe mov 0x63(%rdx),%r10 2702*5d9d9091SRichard Lowe mov %r10,0x63(%rcx) 2703*5d9d9091SRichard LoweL(bkP3QC): 2704*5d9d9091SRichard Lowe mov 0x5b(%rdx),%r9 2705*5d9d9091SRichard Lowe mov %r9,0x5b(%rcx) 2706*5d9d9091SRichard LoweL(bkP3QB): 2707*5d9d9091SRichard Lowe mov 0x53(%rdx),%r11 2708*5d9d9091SRichard Lowe mov %r11,0x53(%rcx) 2709*5d9d9091SRichard LoweL(bkP3QA): 2710*5d9d9091SRichard Lowe mov 0x4b(%rdx),%r10 2711*5d9d9091SRichard Lowe mov %r10,0x4b(%rcx) 2712*5d9d9091SRichard LoweL(bkP3Q9): 2713*5d9d9091SRichard Lowe mov 0x43(%rdx),%r9 2714*5d9d9091SRichard Lowe mov %r9,0x43(%rcx) 2715*5d9d9091SRichard LoweL(bkP3Q8): 2716*5d9d9091SRichard Lowe mov 0x3b(%rdx),%r11 2717*5d9d9091SRichard Lowe mov %r11,0x3b(%rcx) 2718*5d9d9091SRichard LoweL(bkP3Q7): 2719*5d9d9091SRichard Lowe mov 0x33(%rdx),%r10 2720*5d9d9091SRichard Lowe mov %r10,0x33(%rcx) 2721*5d9d9091SRichard LoweL(bkP3Q6): 2722*5d9d9091SRichard Lowe mov 0x2b(%rdx),%r9 2723*5d9d9091SRichard Lowe mov %r9,0x2b(%rcx) 2724*5d9d9091SRichard LoweL(bkP3Q5): 2725*5d9d9091SRichard Lowe mov 0x23(%rdx),%r11 2726*5d9d9091SRichard Lowe mov %r11,0x23(%rcx) 2727*5d9d9091SRichard LoweL(bkP3Q4): 2728*5d9d9091SRichard Lowe mov 0x1b(%rdx),%r10 2729*5d9d9091SRichard Lowe mov %r10,0x1b(%rcx) 2730*5d9d9091SRichard LoweL(bkP3Q3): 2731*5d9d9091SRichard Lowe mov 0x13(%rdx),%r9 2732*5d9d9091SRichard Lowe mov %r9,0x13(%rcx) 2733*5d9d9091SRichard LoweL(bkP3Q2): 2734*5d9d9091SRichard Lowe mov 0xb(%rdx),%r11 2735*5d9d9091SRichard Lowe mov %r11,0xb(%rcx) 2736*5d9d9091SRichard LoweL(bkP3Q1): 2737*5d9d9091SRichard Lowe mov 0x3(%rdx),%r10 2738*5d9d9091SRichard Lowe mov %r10,0x3(%rcx) 2739*5d9d9091SRichard LoweL(bkP3Q0): # trailing loads/stores do all their loads 1st, then do the stores 2740*5d9d9091SRichard Lowe mov 0x1(%rdx),%r9w 2741*5d9d9091SRichard Lowe mov %r9w,0x1(%rcx) 2742*5d9d9091SRichard Lowe mov (%rdx),%r10b 2743*5d9d9091SRichard Lowe mov %r10b,(%rcx) 2744*5d9d9091SRichard Lowe ret 2745*5d9d9091SRichard Lowe 2746*5d9d9091SRichard Lowe .balign 16 2747*5d9d9091SRichard LoweL(bkP4QI): 2748*5d9d9091SRichard Lowe mov 0x8c(%rdx),%r10 2749*5d9d9091SRichard Lowe mov %r10,0x8c(%rcx) 2750*5d9d9091SRichard LoweL(bkP4QH): 2751*5d9d9091SRichard Lowe mov 0x84(%rdx),%r11 2752*5d9d9091SRichard Lowe mov %r11,0x84(%rcx) 2753*5d9d9091SRichard LoweL(bkP4QG): 2754*5d9d9091SRichard Lowe mov 0x7c(%rdx),%r10 2755*5d9d9091SRichard Lowe mov %r10,0x7c(%rcx) 2756*5d9d9091SRichard LoweL(bkP4QF): 2757*5d9d9091SRichard Lowe mov 0x74(%rdx),%r9 2758*5d9d9091SRichard Lowe mov %r9,0x74(%rcx) 2759*5d9d9091SRichard LoweL(bkP4QE): 2760*5d9d9091SRichard Lowe mov 0x6c(%rdx),%r11 2761*5d9d9091SRichard Lowe mov %r11,0x6c(%rcx) 2762*5d9d9091SRichard LoweL(bkP4QD): 2763*5d9d9091SRichard Lowe mov 0x64(%rdx),%r10 2764*5d9d9091SRichard Lowe mov %r10,0x64(%rcx) 2765*5d9d9091SRichard LoweL(bkP4QC): 2766*5d9d9091SRichard Lowe mov 0x5c(%rdx),%r9 2767*5d9d9091SRichard Lowe mov %r9,0x5c(%rcx) 2768*5d9d9091SRichard LoweL(bkP4QB): 2769*5d9d9091SRichard Lowe mov 0x54(%rdx),%r11 2770*5d9d9091SRichard Lowe mov %r11,0x54(%rcx) 2771*5d9d9091SRichard LoweL(bkP4QA): 2772*5d9d9091SRichard Lowe mov 0x4c(%rdx),%r10 2773*5d9d9091SRichard Lowe mov %r10,0x4c(%rcx) 2774*5d9d9091SRichard LoweL(bkP4Q9): 2775*5d9d9091SRichard Lowe mov 0x44(%rdx),%r9 2776*5d9d9091SRichard Lowe mov %r9,0x44(%rcx) 2777*5d9d9091SRichard LoweL(bkP4Q8): 2778*5d9d9091SRichard Lowe mov 0x3c(%rdx),%r11 2779*5d9d9091SRichard Lowe mov %r11,0x3c(%rcx) 2780*5d9d9091SRichard LoweL(bkP4Q7): 2781*5d9d9091SRichard Lowe mov 0x34(%rdx),%r10 2782*5d9d9091SRichard Lowe mov %r10,0x34(%rcx) 2783*5d9d9091SRichard LoweL(bkP4Q6): 2784*5d9d9091SRichard Lowe mov 0x2c(%rdx),%r9 2785*5d9d9091SRichard Lowe mov %r9,0x2c(%rcx) 2786*5d9d9091SRichard LoweL(bkP4Q5): 2787*5d9d9091SRichard Lowe mov 0x24(%rdx),%r11 2788*5d9d9091SRichard Lowe mov %r11,0x24(%rcx) 2789*5d9d9091SRichard LoweL(bkP4Q4): 2790*5d9d9091SRichard Lowe mov 0x1c(%rdx),%r10 2791*5d9d9091SRichard Lowe mov %r10,0x1c(%rcx) 2792*5d9d9091SRichard LoweL(bkP4Q3): 2793*5d9d9091SRichard Lowe mov 0x14(%rdx),%r9 2794*5d9d9091SRichard Lowe mov %r9,0x14(%rcx) 2795*5d9d9091SRichard LoweL(bkP4Q2): 2796*5d9d9091SRichard Lowe mov 0xc(%rdx),%r11 2797*5d9d9091SRichard Lowe mov %r11,0xc(%rcx) 2798*5d9d9091SRichard LoweL(bkP4Q1): 2799*5d9d9091SRichard Lowe mov 0x4(%rdx),%r10 2800*5d9d9091SRichard Lowe mov %r10,0x4(%rcx) 2801*5d9d9091SRichard LoweL(bkP4Q0): 2802*5d9d9091SRichard Lowe mov (%rdx),%r9d 2803*5d9d9091SRichard Lowe mov %r9d,(%rcx) 2804*5d9d9091SRichard Lowe ret 2805*5d9d9091SRichard Lowe 2806*5d9d9091SRichard Lowe .balign 16 2807*5d9d9091SRichard LoweL(bkP5QI): 2808*5d9d9091SRichard Lowe mov 0x8d(%rdx),%r10 2809*5d9d9091SRichard Lowe mov %r10,0x8d(%rcx) 2810*5d9d9091SRichard LoweL(bkP5QH): 2811*5d9d9091SRichard Lowe mov 0x85(%rdx),%r9 2812*5d9d9091SRichard Lowe mov %r9,0x85(%rcx) 2813*5d9d9091SRichard LoweL(bkP5QG): 2814*5d9d9091SRichard Lowe mov 0x7d(%rdx),%r11 2815*5d9d9091SRichard Lowe mov %r11,0x7d(%rcx) 2816*5d9d9091SRichard LoweL(bkP5QF): 2817*5d9d9091SRichard Lowe mov 0x75(%rdx),%r10 2818*5d9d9091SRichard Lowe mov %r10,0x75(%rcx) 2819*5d9d9091SRichard LoweL(bkP5QE): 2820*5d9d9091SRichard Lowe mov 0x6d(%rdx),%r9 2821*5d9d9091SRichard Lowe mov %r9,0x6d(%rcx) 2822*5d9d9091SRichard LoweL(bkP5QD): 2823*5d9d9091SRichard Lowe mov 0x65(%rdx),%r11 2824*5d9d9091SRichard Lowe mov %r11,0x65(%rcx) 2825*5d9d9091SRichard LoweL(bkP5QC): 2826*5d9d9091SRichard Lowe mov 0x5d(%rdx),%r10 2827*5d9d9091SRichard Lowe mov %r10,0x5d(%rcx) 2828*5d9d9091SRichard LoweL(bkP5QB): 2829*5d9d9091SRichard Lowe mov 0x55(%rdx),%r9 2830*5d9d9091SRichard Lowe mov %r9,0x55(%rcx) 2831*5d9d9091SRichard LoweL(bkP5QA): 2832*5d9d9091SRichard Lowe mov 0x4d(%rdx),%r11 2833*5d9d9091SRichard Lowe mov %r11,0x4d(%rcx) 2834*5d9d9091SRichard LoweL(bkP5Q9): 2835*5d9d9091SRichard Lowe mov 0x45(%rdx),%r10 2836*5d9d9091SRichard Lowe mov %r10,0x45(%rcx) 2837*5d9d9091SRichard LoweL(bkP5Q8): 2838*5d9d9091SRichard Lowe mov 0x3d(%rdx),%r9 2839*5d9d9091SRichard Lowe mov %r9,0x3d(%rcx) 2840*5d9d9091SRichard LoweL(bkP5Q7): 2841*5d9d9091SRichard Lowe mov 0x35(%rdx),%r11 2842*5d9d9091SRichard Lowe mov %r11,0x35(%rcx) 2843*5d9d9091SRichard LoweL(bkP5Q6): 2844*5d9d9091SRichard Lowe mov 0x2d(%rdx),%r10 2845*5d9d9091SRichard Lowe mov %r10,0x2d(%rcx) 2846*5d9d9091SRichard LoweL(bkP5Q5): 2847*5d9d9091SRichard Lowe mov 0x25(%rdx),%r9 2848*5d9d9091SRichard Lowe mov %r9,0x25(%rcx) 2849*5d9d9091SRichard LoweL(bkP5Q4): 2850*5d9d9091SRichard Lowe mov 0x1d(%rdx),%r11 2851*5d9d9091SRichard Lowe mov %r11,0x1d(%rcx) 2852*5d9d9091SRichard LoweL(bkP5Q3): 2853*5d9d9091SRichard Lowe mov 0x15(%rdx),%r10 2854*5d9d9091SRichard Lowe mov %r10,0x15(%rcx) 2855*5d9d9091SRichard LoweL(bkP5Q2): 2856*5d9d9091SRichard Lowe mov 0xd(%rdx),%r9 2857*5d9d9091SRichard Lowe mov %r9,0xd(%rcx) 2858*5d9d9091SRichard LoweL(bkP5Q1): 2859*5d9d9091SRichard Lowe mov 0x5(%rdx),%r11 2860*5d9d9091SRichard Lowe mov %r11,0x5(%rcx) 2861*5d9d9091SRichard LoweL(bkP5Q0): # trailing loads/stores do all their loads 1st, then do the stores 2862*5d9d9091SRichard Lowe mov 0x1(%rdx),%r9d 2863*5d9d9091SRichard Lowe mov %r9d,0x1(%rcx) 2864*5d9d9091SRichard Lowe mov (%rdx),%r10b 2865*5d9d9091SRichard Lowe mov %r10b,(%rcx) 2866*5d9d9091SRichard Lowe ret 2867*5d9d9091SRichard Lowe 2868*5d9d9091SRichard Lowe .balign 16 2869*5d9d9091SRichard LoweL(bkP6QI): 2870*5d9d9091SRichard Lowe mov 0x8e(%rdx),%r10 2871*5d9d9091SRichard Lowe mov %r10,0x8e(%rcx) 2872*5d9d9091SRichard LoweL(bkP6QH): 2873*5d9d9091SRichard Lowe mov 0x86(%rdx),%r11 2874*5d9d9091SRichard Lowe mov %r11,0x86(%rcx) 2875*5d9d9091SRichard LoweL(bkP6QG): 2876*5d9d9091SRichard Lowe mov 0x7e(%rdx),%r10 2877*5d9d9091SRichard Lowe mov %r10,0x7e(%rcx) 2878*5d9d9091SRichard LoweL(bkP6QF): 2879*5d9d9091SRichard Lowe mov 0x76(%rdx),%r9 2880*5d9d9091SRichard Lowe mov %r9,0x76(%rcx) 2881*5d9d9091SRichard LoweL(bkP6QE): 2882*5d9d9091SRichard Lowe mov 0x6e(%rdx),%r11 2883*5d9d9091SRichard Lowe mov %r11,0x6e(%rcx) 2884*5d9d9091SRichard LoweL(bkP6QD): 2885*5d9d9091SRichard Lowe mov 0x66(%rdx),%r10 2886*5d9d9091SRichard Lowe mov %r10,0x66(%rcx) 2887*5d9d9091SRichard LoweL(bkP6QC): 2888*5d9d9091SRichard Lowe mov 0x5e(%rdx),%r9 2889*5d9d9091SRichard Lowe mov %r9,0x5e(%rcx) 2890*5d9d9091SRichard LoweL(bkP6QB): 2891*5d9d9091SRichard Lowe mov 0x56(%rdx),%r11 2892*5d9d9091SRichard Lowe mov %r11,0x56(%rcx) 2893*5d9d9091SRichard LoweL(bkP6QA): 2894*5d9d9091SRichard Lowe mov 0x4e(%rdx),%r10 2895*5d9d9091SRichard Lowe mov %r10,0x4e(%rcx) 2896*5d9d9091SRichard LoweL(bkP6Q9): 2897*5d9d9091SRichard Lowe mov 0x46(%rdx),%r9 2898*5d9d9091SRichard Lowe mov %r9,0x46(%rcx) 2899*5d9d9091SRichard LoweL(bkP6Q8): 2900*5d9d9091SRichard Lowe mov 0x3e(%rdx),%r11 2901*5d9d9091SRichard Lowe mov %r11,0x3e(%rcx) 2902*5d9d9091SRichard LoweL(bkP6Q7): 2903*5d9d9091SRichard Lowe mov 0x36(%rdx),%r10 2904*5d9d9091SRichard Lowe mov %r10,0x36(%rcx) 2905*5d9d9091SRichard LoweL(bkP6Q6): 2906*5d9d9091SRichard Lowe mov 0x2e(%rdx),%r9 2907*5d9d9091SRichard Lowe mov %r9,0x2e(%rcx) 2908*5d9d9091SRichard LoweL(bkP6Q5): 2909*5d9d9091SRichard Lowe mov 0x26(%rdx),%r11 2910*5d9d9091SRichard Lowe mov %r11,0x26(%rcx) 2911*5d9d9091SRichard LoweL(bkP6Q4): 2912*5d9d9091SRichard Lowe mov 0x1e(%rdx),%r10 2913*5d9d9091SRichard Lowe mov %r10,0x1e(%rcx) 2914*5d9d9091SRichard LoweL(bkP6Q3): 2915*5d9d9091SRichard Lowe mov 0x16(%rdx),%r9 2916*5d9d9091SRichard Lowe mov %r9,0x16(%rcx) 2917*5d9d9091SRichard LoweL(bkP6Q2): 2918*5d9d9091SRichard Lowe mov 0xe(%rdx),%r11 2919*5d9d9091SRichard Lowe mov %r11,0xe(%rcx) 2920*5d9d9091SRichard LoweL(bkP6Q1): 2921*5d9d9091SRichard Lowe mov 0x6(%rdx),%r10 2922*5d9d9091SRichard Lowe mov %r10,0x6(%rcx) 2923*5d9d9091SRichard LoweL(bkP6Q0): # trailing loads/stores do all their loads 1st, then do the stores 2924*5d9d9091SRichard Lowe mov 0x2(%rdx),%r9d 2925*5d9d9091SRichard Lowe mov %r9d,0x2(%rcx) 2926*5d9d9091SRichard Lowe mov (%rdx),%r10w 2927*5d9d9091SRichard Lowe mov %r10w,(%rcx) 2928*5d9d9091SRichard Lowe ret 2929*5d9d9091SRichard Lowe 2930*5d9d9091SRichard Lowe .balign 16 2931*5d9d9091SRichard LoweL(bkP7QI): 2932*5d9d9091SRichard Lowe mov 0x8f(%rdx),%r10 2933*5d9d9091SRichard Lowe mov %r10,0x8f(%rcx) 2934*5d9d9091SRichard LoweL(bkP7QH): 2935*5d9d9091SRichard Lowe mov 0x87(%rdx),%r11 2936*5d9d9091SRichard Lowe mov %r11,0x87(%rcx) 2937*5d9d9091SRichard LoweL(bkP7QG): 2938*5d9d9091SRichard Lowe mov 0x7f(%rdx),%r10 2939*5d9d9091SRichard Lowe mov %r10,0x7f(%rcx) 2940*5d9d9091SRichard LoweL(bkP7QF): 2941*5d9d9091SRichard Lowe mov 0x77(%rdx),%r9 2942*5d9d9091SRichard Lowe mov %r9,0x77(%rcx) 2943*5d9d9091SRichard LoweL(bkP7QE): 2944*5d9d9091SRichard Lowe mov 0x6f(%rdx),%r11 2945*5d9d9091SRichard Lowe mov %r11,0x6f(%rcx) 2946*5d9d9091SRichard LoweL(bkP7QD): 2947*5d9d9091SRichard Lowe mov 0x67(%rdx),%r10 2948*5d9d9091SRichard Lowe mov %r10,0x67(%rcx) 2949*5d9d9091SRichard LoweL(bkP7QC): 2950*5d9d9091SRichard Lowe mov 0x5f(%rdx),%r9 2951*5d9d9091SRichard Lowe mov %r9,0x5f(%rcx) 2952*5d9d9091SRichard LoweL(bkP7QB): 2953*5d9d9091SRichard Lowe mov 0x57(%rdx),%r11 2954*5d9d9091SRichard Lowe mov %r11,0x57(%rcx) 2955*5d9d9091SRichard LoweL(bkP7QA): 2956*5d9d9091SRichard Lowe mov 0x4f(%rdx),%r10 2957*5d9d9091SRichard Lowe mov %r10,0x4f(%rcx) 2958*5d9d9091SRichard LoweL(bkP7Q9): 2959*5d9d9091SRichard Lowe mov 0x47(%rdx),%r9 2960*5d9d9091SRichard Lowe mov %r9,0x47(%rcx) 2961*5d9d9091SRichard LoweL(bkP7Q8): 2962*5d9d9091SRichard Lowe mov 0x3f(%rdx),%r11 2963*5d9d9091SRichard Lowe mov %r11,0x3f(%rcx) 2964*5d9d9091SRichard LoweL(bkP7Q7): 2965*5d9d9091SRichard Lowe mov 0x37(%rdx),%r10 2966*5d9d9091SRichard Lowe mov %r10,0x37(%rcx) 2967*5d9d9091SRichard LoweL(bkP7Q6): 2968*5d9d9091SRichard Lowe mov 0x2f(%rdx),%r9 2969*5d9d9091SRichard Lowe mov %r9,0x2f(%rcx) 2970*5d9d9091SRichard LoweL(bkP7Q5): 2971*5d9d9091SRichard Lowe mov 0x27(%rdx),%r11 2972*5d9d9091SRichard Lowe mov %r11,0x27(%rcx) 2973*5d9d9091SRichard LoweL(bkP7Q4): 2974*5d9d9091SRichard Lowe mov 0x1f(%rdx),%r10 2975*5d9d9091SRichard Lowe mov %r10,0x1f(%rcx) 2976*5d9d9091SRichard LoweL(bkP7Q3): 2977*5d9d9091SRichard Lowe mov 0x17(%rdx),%r9 2978*5d9d9091SRichard Lowe mov %r9,0x17(%rcx) 2979*5d9d9091SRichard LoweL(bkP7Q2): 2980*5d9d9091SRichard Lowe mov 0xf(%rdx),%r11 2981*5d9d9091SRichard Lowe mov %r11,0xf(%rcx) 2982*5d9d9091SRichard LoweL(bkP7Q1): 2983*5d9d9091SRichard Lowe mov 0x7(%rdx),%r10 2984*5d9d9091SRichard Lowe mov %r10,0x7(%rcx) 2985*5d9d9091SRichard LoweL(bkP7Q0): # trailing loads/stores do all their loads 1st, then do the stores 2986*5d9d9091SRichard Lowe mov 0x3(%rdx),%r9d 2987*5d9d9091SRichard Lowe mov %r9d,0x3(%rcx) 2988*5d9d9091SRichard Lowe mov 0x1(%rdx),%r10w 2989*5d9d9091SRichard Lowe mov %r10w,0x1(%rcx) 2990*5d9d9091SRichard Lowe mov (%rdx),%r11b 2991*5d9d9091SRichard Lowe mov %r11b,(%rcx) 2992*5d9d9091SRichard Lowe ret 2993*5d9d9091SRichard Lowe 2994*5d9d9091SRichard Lowe .balign 16 2995*5d9d9091SRichard LoweL(bkPxQx): .int L(bkP0Q0)-L(bkPxQx) 2996*5d9d9091SRichard Lowe .int L(bkP1Q0)-L(bkPxQx) 2997*5d9d9091SRichard Lowe .int L(bkP2Q0)-L(bkPxQx) 2998*5d9d9091SRichard Lowe .int L(bkP3Q0)-L(bkPxQx) 2999*5d9d9091SRichard Lowe .int L(bkP4Q0)-L(bkPxQx) 3000*5d9d9091SRichard Lowe .int L(bkP5Q0)-L(bkPxQx) 3001*5d9d9091SRichard Lowe .int L(bkP6Q0)-L(bkPxQx) 3002*5d9d9091SRichard Lowe .int L(bkP7Q0)-L(bkPxQx) 3003*5d9d9091SRichard Lowe 3004*5d9d9091SRichard Lowe .int L(bkP0Q1)-L(bkPxQx) 3005*5d9d9091SRichard Lowe .int L(bkP1Q1)-L(bkPxQx) 3006*5d9d9091SRichard Lowe .int L(bkP2Q1)-L(bkPxQx) 3007*5d9d9091SRichard Lowe .int L(bkP3Q1)-L(bkPxQx) 3008*5d9d9091SRichard Lowe .int L(bkP4Q1)-L(bkPxQx) 3009*5d9d9091SRichard Lowe .int L(bkP5Q1)-L(bkPxQx) 3010*5d9d9091SRichard Lowe .int L(bkP6Q1)-L(bkPxQx) 3011*5d9d9091SRichard Lowe .int L(bkP7Q1)-L(bkPxQx) 3012*5d9d9091SRichard Lowe 3013*5d9d9091SRichard Lowe .int L(bkP0Q2)-L(bkPxQx) 3014*5d9d9091SRichard Lowe .int L(bkP1Q2)-L(bkPxQx) 3015*5d9d9091SRichard Lowe .int L(bkP2Q2)-L(bkPxQx) 3016*5d9d9091SRichard Lowe .int L(bkP3Q2)-L(bkPxQx) 3017*5d9d9091SRichard Lowe .int L(bkP4Q2)-L(bkPxQx) 3018*5d9d9091SRichard Lowe .int L(bkP5Q2)-L(bkPxQx) 3019*5d9d9091SRichard Lowe .int L(bkP6Q2)-L(bkPxQx) 3020*5d9d9091SRichard Lowe .int L(bkP7Q2)-L(bkPxQx) 3021*5d9d9091SRichard Lowe 3022*5d9d9091SRichard Lowe .int L(bkP0Q3)-L(bkPxQx) 3023*5d9d9091SRichard Lowe .int L(bkP1Q3)-L(bkPxQx) 3024*5d9d9091SRichard Lowe .int L(bkP2Q3)-L(bkPxQx) 3025*5d9d9091SRichard Lowe .int L(bkP3Q3)-L(bkPxQx) 3026*5d9d9091SRichard Lowe .int L(bkP4Q3)-L(bkPxQx) 3027*5d9d9091SRichard Lowe .int L(bkP5Q3)-L(bkPxQx) 3028*5d9d9091SRichard Lowe .int L(bkP6Q3)-L(bkPxQx) 3029*5d9d9091SRichard Lowe .int L(bkP7Q3)-L(bkPxQx) 3030*5d9d9091SRichard Lowe 3031*5d9d9091SRichard Lowe .int L(bkP0Q4)-L(bkPxQx) 3032*5d9d9091SRichard Lowe .int L(bkP1Q4)-L(bkPxQx) 3033*5d9d9091SRichard Lowe .int L(bkP2Q4)-L(bkPxQx) 3034*5d9d9091SRichard Lowe .int L(bkP3Q4)-L(bkPxQx) 3035*5d9d9091SRichard Lowe .int L(bkP4Q4)-L(bkPxQx) 3036*5d9d9091SRichard Lowe .int L(bkP5Q4)-L(bkPxQx) 3037*5d9d9091SRichard Lowe .int L(bkP6Q4)-L(bkPxQx) 3038*5d9d9091SRichard Lowe .int L(bkP7Q4)-L(bkPxQx) 3039*5d9d9091SRichard Lowe 3040*5d9d9091SRichard Lowe .int L(bkP0Q5)-L(bkPxQx) 3041*5d9d9091SRichard Lowe .int L(bkP1Q5)-L(bkPxQx) 3042*5d9d9091SRichard Lowe .int L(bkP2Q5)-L(bkPxQx) 3043*5d9d9091SRichard Lowe .int L(bkP3Q5)-L(bkPxQx) 3044*5d9d9091SRichard Lowe .int L(bkP4Q5)-L(bkPxQx) 3045*5d9d9091SRichard Lowe .int L(bkP5Q5)-L(bkPxQx) 3046*5d9d9091SRichard Lowe .int L(bkP6Q5)-L(bkPxQx) 3047*5d9d9091SRichard Lowe .int L(bkP7Q5)-L(bkPxQx) 3048*5d9d9091SRichard Lowe 3049*5d9d9091SRichard Lowe .int L(bkP0Q6)-L(bkPxQx) 3050*5d9d9091SRichard Lowe .int L(bkP1Q6)-L(bkPxQx) 3051*5d9d9091SRichard Lowe .int L(bkP2Q6)-L(bkPxQx) 3052*5d9d9091SRichard Lowe .int L(bkP3Q6)-L(bkPxQx) 3053*5d9d9091SRichard Lowe .int L(bkP4Q6)-L(bkPxQx) 3054*5d9d9091SRichard Lowe .int L(bkP5Q6)-L(bkPxQx) 3055*5d9d9091SRichard Lowe .int L(bkP6Q6)-L(bkPxQx) 3056*5d9d9091SRichard Lowe .int L(bkP7Q6)-L(bkPxQx) 3057*5d9d9091SRichard Lowe 3058*5d9d9091SRichard Lowe .int L(bkP0Q7)-L(bkPxQx) 3059*5d9d9091SRichard Lowe .int L(bkP1Q7)-L(bkPxQx) 3060*5d9d9091SRichard Lowe .int L(bkP2Q7)-L(bkPxQx) 3061*5d9d9091SRichard Lowe .int L(bkP3Q7)-L(bkPxQx) 3062*5d9d9091SRichard Lowe .int L(bkP4Q7)-L(bkPxQx) 3063*5d9d9091SRichard Lowe .int L(bkP5Q7)-L(bkPxQx) 3064*5d9d9091SRichard Lowe .int L(bkP6Q7)-L(bkPxQx) 3065*5d9d9091SRichard Lowe .int L(bkP7Q7)-L(bkPxQx) 3066*5d9d9091SRichard Lowe 3067*5d9d9091SRichard Lowe .int L(bkP0Q8)-L(bkPxQx) 3068*5d9d9091SRichard Lowe .int L(bkP1Q8)-L(bkPxQx) 3069*5d9d9091SRichard Lowe .int L(bkP2Q8)-L(bkPxQx) 3070*5d9d9091SRichard Lowe .int L(bkP3Q8)-L(bkPxQx) 3071*5d9d9091SRichard Lowe .int L(bkP4Q8)-L(bkPxQx) 3072*5d9d9091SRichard Lowe .int L(bkP5Q8)-L(bkPxQx) 3073*5d9d9091SRichard Lowe .int L(bkP6Q8)-L(bkPxQx) 3074*5d9d9091SRichard Lowe .int L(bkP7Q8)-L(bkPxQx) 3075*5d9d9091SRichard Lowe 3076*5d9d9091SRichard Lowe .int L(bkP0Q9)-L(bkPxQx) 3077*5d9d9091SRichard Lowe .int L(bkP1Q9)-L(bkPxQx) 3078*5d9d9091SRichard Lowe .int L(bkP2Q9)-L(bkPxQx) 3079*5d9d9091SRichard Lowe .int L(bkP3Q9)-L(bkPxQx) 3080*5d9d9091SRichard Lowe .int L(bkP4Q9)-L(bkPxQx) 3081*5d9d9091SRichard Lowe .int L(bkP5Q9)-L(bkPxQx) 3082*5d9d9091SRichard Lowe .int L(bkP6Q9)-L(bkPxQx) 3083*5d9d9091SRichard Lowe .int L(bkP7Q9)-L(bkPxQx) 3084*5d9d9091SRichard Lowe 3085*5d9d9091SRichard Lowe .int L(bkP0QA)-L(bkPxQx) 3086*5d9d9091SRichard Lowe .int L(bkP1QA)-L(bkPxQx) 3087*5d9d9091SRichard Lowe .int L(bkP2QA)-L(bkPxQx) 3088*5d9d9091SRichard Lowe .int L(bkP3QA)-L(bkPxQx) 3089*5d9d9091SRichard Lowe .int L(bkP4QA)-L(bkPxQx) 3090*5d9d9091SRichard Lowe .int L(bkP5QA)-L(bkPxQx) 3091*5d9d9091SRichard Lowe .int L(bkP6QA)-L(bkPxQx) 3092*5d9d9091SRichard Lowe .int L(bkP7QA)-L(bkPxQx) 3093*5d9d9091SRichard Lowe 3094*5d9d9091SRichard Lowe .int L(bkP0QB)-L(bkPxQx) 3095*5d9d9091SRichard Lowe .int L(bkP1QB)-L(bkPxQx) 3096*5d9d9091SRichard Lowe .int L(bkP2QB)-L(bkPxQx) 3097*5d9d9091SRichard Lowe .int L(bkP3QB)-L(bkPxQx) 3098*5d9d9091SRichard Lowe .int L(bkP4QB)-L(bkPxQx) 3099*5d9d9091SRichard Lowe .int L(bkP5QB)-L(bkPxQx) 3100*5d9d9091SRichard Lowe .int L(bkP6QB)-L(bkPxQx) 3101*5d9d9091SRichard Lowe .int L(bkP7QB)-L(bkPxQx) 3102*5d9d9091SRichard Lowe 3103*5d9d9091SRichard Lowe .int L(bkP0QC)-L(bkPxQx) 3104*5d9d9091SRichard Lowe .int L(bkP1QC)-L(bkPxQx) 3105*5d9d9091SRichard Lowe .int L(bkP2QC)-L(bkPxQx) 3106*5d9d9091SRichard Lowe .int L(bkP3QC)-L(bkPxQx) 3107*5d9d9091SRichard Lowe .int L(bkP4QC)-L(bkPxQx) 3108*5d9d9091SRichard Lowe .int L(bkP5QC)-L(bkPxQx) 3109*5d9d9091SRichard Lowe .int L(bkP6QC)-L(bkPxQx) 3110*5d9d9091SRichard Lowe .int L(bkP7QC)-L(bkPxQx) 3111*5d9d9091SRichard Lowe 3112*5d9d9091SRichard Lowe .int L(bkP0QD)-L(bkPxQx) 3113*5d9d9091SRichard Lowe .int L(bkP1QD)-L(bkPxQx) 3114*5d9d9091SRichard Lowe .int L(bkP2QD)-L(bkPxQx) 3115*5d9d9091SRichard Lowe .int L(bkP3QD)-L(bkPxQx) 3116*5d9d9091SRichard Lowe .int L(bkP4QD)-L(bkPxQx) 3117*5d9d9091SRichard Lowe .int L(bkP5QD)-L(bkPxQx) 3118*5d9d9091SRichard Lowe .int L(bkP6QD)-L(bkPxQx) 3119*5d9d9091SRichard Lowe .int L(bkP7QD)-L(bkPxQx) 3120*5d9d9091SRichard Lowe 3121*5d9d9091SRichard Lowe .int L(bkP0QE)-L(bkPxQx) 3122*5d9d9091SRichard Lowe .int L(bkP1QE)-L(bkPxQx) 3123*5d9d9091SRichard Lowe .int L(bkP2QE)-L(bkPxQx) 3124*5d9d9091SRichard Lowe .int L(bkP3QE)-L(bkPxQx) 3125*5d9d9091SRichard Lowe .int L(bkP4QE)-L(bkPxQx) 3126*5d9d9091SRichard Lowe .int L(bkP5QE)-L(bkPxQx) 3127*5d9d9091SRichard Lowe .int L(bkP6QE)-L(bkPxQx) 3128*5d9d9091SRichard Lowe .int L(bkP7QE)-L(bkPxQx) 3129*5d9d9091SRichard Lowe 3130*5d9d9091SRichard Lowe .int L(bkP0QF)-L(bkPxQx) 3131*5d9d9091SRichard Lowe .int L(bkP1QF)-L(bkPxQx) 3132*5d9d9091SRichard Lowe .int L(bkP2QF)-L(bkPxQx) 3133*5d9d9091SRichard Lowe .int L(bkP3QF)-L(bkPxQx) 3134*5d9d9091SRichard Lowe .int L(bkP4QF)-L(bkPxQx) 3135*5d9d9091SRichard Lowe .int L(bkP5QF)-L(bkPxQx) 3136*5d9d9091SRichard Lowe .int L(bkP6QF)-L(bkPxQx) 3137*5d9d9091SRichard Lowe .int L(bkP7QF)-L(bkPxQx) 3138*5d9d9091SRichard Lowe 3139*5d9d9091SRichard Lowe .int L(bkP0QG)-L(bkPxQx) 3140*5d9d9091SRichard Lowe .int L(bkP1QG)-L(bkPxQx) 3141*5d9d9091SRichard Lowe .int L(bkP2QG)-L(bkPxQx) 3142*5d9d9091SRichard Lowe .int L(bkP3QG)-L(bkPxQx) 3143*5d9d9091SRichard Lowe .int L(bkP4QG)-L(bkPxQx) 3144*5d9d9091SRichard Lowe .int L(bkP5QG)-L(bkPxQx) 3145*5d9d9091SRichard Lowe .int L(bkP6QG)-L(bkPxQx) 3146*5d9d9091SRichard Lowe .int L(bkP7QG)-L(bkPxQx) 3147*5d9d9091SRichard Lowe 3148*5d9d9091SRichard Lowe .int L(bkP0QH)-L(bkPxQx) 3149*5d9d9091SRichard Lowe .int L(bkP1QH)-L(bkPxQx) 3150*5d9d9091SRichard Lowe .int L(bkP2QH)-L(bkPxQx) 3151*5d9d9091SRichard Lowe .int L(bkP3QH)-L(bkPxQx) 3152*5d9d9091SRichard Lowe .int L(bkP4QH)-L(bkPxQx) 3153*5d9d9091SRichard Lowe .int L(bkP5QH)-L(bkPxQx) 3154*5d9d9091SRichard Lowe .int L(bkP6QH)-L(bkPxQx) 3155*5d9d9091SRichard Lowe .int L(bkP7QH)-L(bkPxQx) 3156*5d9d9091SRichard Lowe 3157*5d9d9091SRichard Lowe .int L(bkP0QI)-L(bkPxQx) 3158*5d9d9091SRichard Lowe .int L(bkP1QI)-L(bkPxQx) 3159*5d9d9091SRichard Lowe .int L(bkP2QI)-L(bkPxQx) 3160*5d9d9091SRichard Lowe .int L(bkP3QI)-L(bkPxQx) 3161*5d9d9091SRichard Lowe .int L(bkP4QI)-L(bkPxQx) 3162*5d9d9091SRichard Lowe .int L(bkP5QI)-L(bkPxQx) 3163*5d9d9091SRichard Lowe .int L(bkP6QI)-L(bkPxQx) 3164*5d9d9091SRichard Lowe .int L(bkP7QI)-L(bkPxQx) 3165*5d9d9091SRichard Lowe 3166*5d9d9091SRichard Lowe SET_SIZE(memmove) 3167