17c478bd9Sstevel@tonic-gate/* 2d0b3732eSbholler * CDDL HEADER START 3d0b3732eSbholler * 4d0b3732eSbholler * The contents of this file are subject to the terms of the 5d0b3732eSbholler * Common Development and Distribution License (the "License"). 6d0b3732eSbholler * You may not use this file except in compliance with the License. 7d0b3732eSbholler * 8d0b3732eSbholler * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9d0b3732eSbholler * or http://www.opensolaris.org/os/licensing. 10d0b3732eSbholler * See the License for the specific language governing permissions 11d0b3732eSbholler * and limitations under the License. 12d0b3732eSbholler * 13d0b3732eSbholler * When distributing Covered Code, include this CDDL HEADER in each 14d0b3732eSbholler * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15d0b3732eSbholler * If applicable, add the following below this CDDL HEADER, with the 16d0b3732eSbholler * fields enclosed by brackets "[]" replaced with your own identifying 17d0b3732eSbholler * information: Portions Copyright [yyyy] [name of copyright owner] 18d0b3732eSbholler * 19d0b3732eSbholler * CDDL HEADER END 207c478bd9Sstevel@tonic-gate */ 217c478bd9Sstevel@tonic-gate 227c478bd9Sstevel@tonic-gate/* 23*fad5204eSbostrovs * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 247257d1b4Sraf * Use is subject to license terms. 257257d1b4Sraf */ 267257d1b4Sraf 277257d1b4Sraf/* 28d0b3732eSbholler * Copyright (c) 2008, Intel Corporation 297c478bd9Sstevel@tonic-gate * All rights reserved. 307c478bd9Sstevel@tonic-gate */ 317c478bd9Sstevel@tonic-gate 32d0b3732eSbholler/* 33d0b3732eSbholler * memcpy.s - copies two blocks of memory 34d0b3732eSbholler * Implements memcpy() and memmove() libc primitives. 35d0b3732eSbholler */ 367257d1b4Sraf 379a70fc3bSMark J. 
Nelson .file "memcpy.s" 387c478bd9Sstevel@tonic-gate 397c478bd9Sstevel@tonic-gate#include <sys/asm_linkage.h> 407257d1b4Sraf 417c478bd9Sstevel@tonic-gate ANSI_PRAGMA_WEAK(memmove,function) 427c478bd9Sstevel@tonic-gate ANSI_PRAGMA_WEAK(memcpy,function) 437c478bd9Sstevel@tonic-gate 447c478bd9Sstevel@tonic-gate#include "cache.h" 45d0b3732eSbholler#include "proc64_id.h" 467c478bd9Sstevel@tonic-gate 47d0b3732eSbholler#define L(s) .memcpy/**/s 487c478bd9Sstevel@tonic-gate 49d0b3732eSbholler/* 50d0b3732eSbholler * memcpy algorithm overview: 51d0b3732eSbholler * 52d0b3732eSbholler * Thresholds used below were determined experimentally. 53d0b3732eSbholler * 54d0b3732eSbholler * Pseudo code: 55d0b3732eSbholler * 56*fad5204eSbostrovs * NOTE: On AMD NO_SSE is always set. Performance on Opteron did not improve 57*fad5204eSbostrovs * using 16-byte stores. Setting NO_SSE on AMD should be re-evaluated on 58*fad5204eSbostrovs * future AMD processors. 59*fad5204eSbostrovs * 60*fad5204eSbostrovs * 61d0b3732eSbholler * If (size <= 128 bytes) { 62d0b3732eSbholler * do unrolled code (primarily 8-byte loads/stores) regardless of 63d0b3732eSbholler * alignment. 
64d0b3732eSbholler * } else { 65d0b3732eSbholler * Align destination to 16-byte boundary 66d0b3732eSbholler * 67d0b3732eSbholler * if (NO_SSE) { 68d0b3732eSbholler * If (size > half of the largest level cache) { 69d0b3732eSbholler * Use 8-byte non-temporal stores (64-bytes/loop) 70d0b3732eSbholler * } else { 71d0b3732eSbholler * if (size > 4K && size <= half of the L1 cache size) { 72d0b3732eSbholler * Use rep movsq 73d0b3732eSbholler * } else { 74d0b3732eSbholler * Use 8-byte loads/stores (64 bytes per loop) 75d0b3732eSbholler * } 76d0b3732eSbholler * } 77d0b3732eSbholler * 78d0b3732eSbholler * } else { **USE SSE** 79d0b3732eSbholler * If (size > half of the largest level cache) { 80d0b3732eSbholler * Use 16-byte non-temporal stores (128-bytes per loop) 81d0b3732eSbholler * } else { 82d0b3732eSbholler * If (both source and destination are aligned) { 83d0b3732eSbholler * Use 16-byte aligned loads and stores (128 bytes/loop) 84d0b3732eSbholler * } else { 85d0b3732eSbholler * use pairs of xmm registers with SSE2 or SSSE3 86d0b3732eSbholler * instructions to concatenate and shift appropriately 87d0b3732eSbholler * to account for source misalignment. This enables 88d0b3732eSbholler * 16-byte aligned loads to be done. 89d0b3732eSbholler * } 90d0b3732eSbholler * } 91d0b3732eSbholler * } 92d0b3732eSbholler * 93d0b3732eSbholler * Finish any remaining bytes via unrolled code above. 94d0b3732eSbholler * } 95d0b3732eSbholler * 96d0b3732eSbholler * memmove overview: 97d0b3732eSbholler * memmove is the same as memcpy except for the one case where the copy needs to be 98d0b3732eSbholler * done backwards. The copy backwards code is done in a similar manner. 
99d0b3732eSbholler */ 1007c478bd9Sstevel@tonic-gate 101d0b3732eSbholler ENTRY(memmove) 102d0b3732eSbholler cmp %rsi,%rdi # if dst <= src 103d0b3732eSbholler jbe L(CopyForward) # then do copy forward 104d0b3732eSbholler mov %rsi,%r9 # move src to r9 105d0b3732eSbholler add %rdx,%r9 # add len to get addr of end of src 106d0b3732eSbholler cmp %r9,%rdi # if dst < end of src 107d0b3732eSbholler jb L(CopyBackwards) # then do copy backwards 108d0b3732eSbholler jmp L(CopyForward) 1097c478bd9Sstevel@tonic-gate 110d0b3732eSbholler ENTRY (memcpy) 111d0b3732eSbhollerL(CopyForward): 112d0b3732eSbholler mov %rdx,%r8 113d0b3732eSbholler mov %rdi,%rcx 114d0b3732eSbholler mov %rsi,%rdx 1157c478bd9Sstevel@tonic-gate mov %rdi,%rax 116d0b3732eSbholler lea L(fwdPxQx)(%rip),%r11 117d0b3732eSbholler cmp $0x80,%r8 # 128 118d0b3732eSbholler jg L(ck_use_sse2) 119d0b3732eSbholler add %r8,%rcx 120d0b3732eSbholler add %r8,%rdx 1217c478bd9Sstevel@tonic-gate 122d0b3732eSbholler movslq (%r11,%r8,4),%r10 123d0b3732eSbholler lea (%r10,%r11,1),%r11 124d0b3732eSbholler jmpq *%r11 1257c478bd9Sstevel@tonic-gate 126d0b3732eSbholler .balign 16 127d0b3732eSbhollerL(ShrtAlignNew): 128d0b3732eSbholler lea L(AliPxQx)(%rip),%r11 129d0b3732eSbholler mov %rcx,%r9 130d0b3732eSbholler and $0xf,%r9 1317c478bd9Sstevel@tonic-gate 132d0b3732eSbholler movslq (%r11,%r9,4),%r10 133d0b3732eSbholler lea (%r10,%r11,1),%r11 134d0b3732eSbholler jmpq *%r11 1357c478bd9Sstevel@tonic-gate 136d0b3732eSbholler .balign 16 137d0b3732eSbhollerL(fwdPxQx): .int L(P0Q0)-L(fwdPxQx) 138d0b3732eSbholler .int L(P1Q0)-L(fwdPxQx) 139d0b3732eSbholler .int L(P2Q0)-L(fwdPxQx) 140d0b3732eSbholler .int L(P3Q0)-L(fwdPxQx) 141d0b3732eSbholler .int L(P4Q0)-L(fwdPxQx) 142d0b3732eSbholler .int L(P5Q0)-L(fwdPxQx) 143d0b3732eSbholler .int L(P6Q0)-L(fwdPxQx) 144d0b3732eSbholler .int L(P7Q0)-L(fwdPxQx) 1457c478bd9Sstevel@tonic-gate 146d0b3732eSbholler .int L(P0Q1)-L(fwdPxQx) 147d0b3732eSbholler .int L(P1Q1)-L(fwdPxQx) 148d0b3732eSbholler .int 
L(P2Q1)-L(fwdPxQx) 149d0b3732eSbholler .int L(P3Q1)-L(fwdPxQx) 150d0b3732eSbholler .int L(P4Q1)-L(fwdPxQx) 151d0b3732eSbholler .int L(P5Q1)-L(fwdPxQx) 152d0b3732eSbholler .int L(P6Q1)-L(fwdPxQx) 153d0b3732eSbholler .int L(P7Q1)-L(fwdPxQx) 1547c478bd9Sstevel@tonic-gate 155d0b3732eSbholler .int L(P0Q2)-L(fwdPxQx) 156d0b3732eSbholler .int L(P1Q2)-L(fwdPxQx) 157d0b3732eSbholler .int L(P2Q2)-L(fwdPxQx) 158d0b3732eSbholler .int L(P3Q2)-L(fwdPxQx) 159d0b3732eSbholler .int L(P4Q2)-L(fwdPxQx) 160d0b3732eSbholler .int L(P5Q2)-L(fwdPxQx) 161d0b3732eSbholler .int L(P6Q2)-L(fwdPxQx) 162d0b3732eSbholler .int L(P7Q2)-L(fwdPxQx) 1637c478bd9Sstevel@tonic-gate 164d0b3732eSbholler .int L(P0Q3)-L(fwdPxQx) 165d0b3732eSbholler .int L(P1Q3)-L(fwdPxQx) 166d0b3732eSbholler .int L(P2Q3)-L(fwdPxQx) 167d0b3732eSbholler .int L(P3Q3)-L(fwdPxQx) 168d0b3732eSbholler .int L(P4Q3)-L(fwdPxQx) 169d0b3732eSbholler .int L(P5Q3)-L(fwdPxQx) 170d0b3732eSbholler .int L(P6Q3)-L(fwdPxQx) 171d0b3732eSbholler .int L(P7Q3)-L(fwdPxQx) 1727c478bd9Sstevel@tonic-gate 173d0b3732eSbholler .int L(P0Q4)-L(fwdPxQx) 174d0b3732eSbholler .int L(P1Q4)-L(fwdPxQx) 175d0b3732eSbholler .int L(P2Q4)-L(fwdPxQx) 176d0b3732eSbholler .int L(P3Q4)-L(fwdPxQx) 177d0b3732eSbholler .int L(P4Q4)-L(fwdPxQx) 178d0b3732eSbholler .int L(P5Q4)-L(fwdPxQx) 179d0b3732eSbholler .int L(P6Q4)-L(fwdPxQx) 180d0b3732eSbholler .int L(P7Q4)-L(fwdPxQx) 1817c478bd9Sstevel@tonic-gate 182d0b3732eSbholler .int L(P0Q5)-L(fwdPxQx) 183d0b3732eSbholler .int L(P1Q5)-L(fwdPxQx) 184d0b3732eSbholler .int L(P2Q5)-L(fwdPxQx) 185d0b3732eSbholler .int L(P3Q5)-L(fwdPxQx) 186d0b3732eSbholler .int L(P4Q5)-L(fwdPxQx) 187d0b3732eSbholler .int L(P5Q5)-L(fwdPxQx) 188d0b3732eSbholler .int L(P6Q5)-L(fwdPxQx) 189d0b3732eSbholler .int L(P7Q5)-L(fwdPxQx) 1907c478bd9Sstevel@tonic-gate 191d0b3732eSbholler .int L(P0Q6)-L(fwdPxQx) 192d0b3732eSbholler .int L(P1Q6)-L(fwdPxQx) 193d0b3732eSbholler .int L(P2Q6)-L(fwdPxQx) 194d0b3732eSbholler .int L(P3Q6)-L(fwdPxQx) 195d0b3732eSbholler .int 
L(P4Q6)-L(fwdPxQx) 196d0b3732eSbholler .int L(P5Q6)-L(fwdPxQx) 197d0b3732eSbholler .int L(P6Q6)-L(fwdPxQx) 198d0b3732eSbholler .int L(P7Q6)-L(fwdPxQx) 1997c478bd9Sstevel@tonic-gate 200d0b3732eSbholler .int L(P0Q7)-L(fwdPxQx) 201d0b3732eSbholler .int L(P1Q7)-L(fwdPxQx) 202d0b3732eSbholler .int L(P2Q7)-L(fwdPxQx) 203d0b3732eSbholler .int L(P3Q7)-L(fwdPxQx) 204d0b3732eSbholler .int L(P4Q7)-L(fwdPxQx) 205d0b3732eSbholler .int L(P5Q7)-L(fwdPxQx) 206d0b3732eSbholler .int L(P6Q7)-L(fwdPxQx) 207d0b3732eSbholler .int L(P7Q7)-L(fwdPxQx) 2087c478bd9Sstevel@tonic-gate 209d0b3732eSbholler .int L(P0Q8)-L(fwdPxQx) 210d0b3732eSbholler .int L(P1Q8)-L(fwdPxQx) 211d0b3732eSbholler .int L(P2Q8)-L(fwdPxQx) 212d0b3732eSbholler .int L(P3Q8)-L(fwdPxQx) 213d0b3732eSbholler .int L(P4Q8)-L(fwdPxQx) 214d0b3732eSbholler .int L(P5Q8)-L(fwdPxQx) 215d0b3732eSbholler .int L(P6Q8)-L(fwdPxQx) 216d0b3732eSbholler .int L(P7Q8)-L(fwdPxQx) 2177c478bd9Sstevel@tonic-gate 218d0b3732eSbholler .int L(P0Q9)-L(fwdPxQx) 219d0b3732eSbholler .int L(P1Q9)-L(fwdPxQx) 220d0b3732eSbholler .int L(P2Q9)-L(fwdPxQx) 221d0b3732eSbholler .int L(P3Q9)-L(fwdPxQx) 222d0b3732eSbholler .int L(P4Q9)-L(fwdPxQx) 223d0b3732eSbholler .int L(P5Q9)-L(fwdPxQx) 224d0b3732eSbholler .int L(P6Q9)-L(fwdPxQx) 225d0b3732eSbholler .int L(P7Q9)-L(fwdPxQx) 2267c478bd9Sstevel@tonic-gate 227d0b3732eSbholler .int L(P0QA)-L(fwdPxQx) 228d0b3732eSbholler .int L(P1QA)-L(fwdPxQx) 229d0b3732eSbholler .int L(P2QA)-L(fwdPxQx) 230d0b3732eSbholler .int L(P3QA)-L(fwdPxQx) 231d0b3732eSbholler .int L(P4QA)-L(fwdPxQx) 232d0b3732eSbholler .int L(P5QA)-L(fwdPxQx) 233d0b3732eSbholler .int L(P6QA)-L(fwdPxQx) 234d0b3732eSbholler .int L(P7QA)-L(fwdPxQx) 2357c478bd9Sstevel@tonic-gate 236d0b3732eSbholler .int L(P0QB)-L(fwdPxQx) 237d0b3732eSbholler .int L(P1QB)-L(fwdPxQx) 238d0b3732eSbholler .int L(P2QB)-L(fwdPxQx) 239d0b3732eSbholler .int L(P3QB)-L(fwdPxQx) 240d0b3732eSbholler .int L(P4QB)-L(fwdPxQx) 241d0b3732eSbholler .int L(P5QB)-L(fwdPxQx) 242d0b3732eSbholler .int 
L(P6QB)-L(fwdPxQx) 243d0b3732eSbholler .int L(P7QB)-L(fwdPxQx) 2447c478bd9Sstevel@tonic-gate 245d0b3732eSbholler .int L(P0QC)-L(fwdPxQx) 246d0b3732eSbholler .int L(P1QC)-L(fwdPxQx) 247d0b3732eSbholler .int L(P2QC)-L(fwdPxQx) 248d0b3732eSbholler .int L(P3QC)-L(fwdPxQx) 249d0b3732eSbholler .int L(P4QC)-L(fwdPxQx) 250d0b3732eSbholler .int L(P5QC)-L(fwdPxQx) 251d0b3732eSbholler .int L(P6QC)-L(fwdPxQx) 252d0b3732eSbholler .int L(P7QC)-L(fwdPxQx) 2537c478bd9Sstevel@tonic-gate 254d0b3732eSbholler .int L(P0QD)-L(fwdPxQx) 255d0b3732eSbholler .int L(P1QD)-L(fwdPxQx) 256d0b3732eSbholler .int L(P2QD)-L(fwdPxQx) 257d0b3732eSbholler .int L(P3QD)-L(fwdPxQx) 258d0b3732eSbholler .int L(P4QD)-L(fwdPxQx) 259d0b3732eSbholler .int L(P5QD)-L(fwdPxQx) 260d0b3732eSbholler .int L(P6QD)-L(fwdPxQx) 261d0b3732eSbholler .int L(P7QD)-L(fwdPxQx) 2627c478bd9Sstevel@tonic-gate 263d0b3732eSbholler .int L(P0QE)-L(fwdPxQx) 264d0b3732eSbholler .int L(P1QE)-L(fwdPxQx) 265d0b3732eSbholler .int L(P2QE)-L(fwdPxQx) 266d0b3732eSbholler .int L(P3QE)-L(fwdPxQx) 267d0b3732eSbholler .int L(P4QE)-L(fwdPxQx) 268d0b3732eSbholler .int L(P5QE)-L(fwdPxQx) 269d0b3732eSbholler .int L(P6QE)-L(fwdPxQx) 270d0b3732eSbholler .int L(P7QE)-L(fwdPxQx) 2717c478bd9Sstevel@tonic-gate 272d0b3732eSbholler .int L(P0QF)-L(fwdPxQx) 273d0b3732eSbholler .int L(P1QF)-L(fwdPxQx) 274d0b3732eSbholler .int L(P2QF)-L(fwdPxQx) 275d0b3732eSbholler .int L(P3QF)-L(fwdPxQx) 276d0b3732eSbholler .int L(P4QF)-L(fwdPxQx) 277d0b3732eSbholler .int L(P5QF)-L(fwdPxQx) 278d0b3732eSbholler .int L(P6QF)-L(fwdPxQx) 279d0b3732eSbholler .int L(P7QF)-L(fwdPxQx) 280d0b3732eSbholler 281d0b3732eSbholler .int L(P0QG)-L(fwdPxQx) # 0x80 282d0b3732eSbholler 283d0b3732eSbholler .balign 16 284d0b3732eSbhollerL(AliPxQx): .int L(now_qw_aligned)-L(AliPxQx) 285d0b3732eSbholler .int L(A1Q0)-L(AliPxQx) 286d0b3732eSbholler .int L(A2Q0)-L(AliPxQx) 287d0b3732eSbholler .int L(A3Q0)-L(AliPxQx) 288d0b3732eSbholler .int L(A4Q0)-L(AliPxQx) 289d0b3732eSbholler .int L(A5Q0)-L(AliPxQx) 
290d0b3732eSbholler .int L(A6Q0)-L(AliPxQx) 291d0b3732eSbholler .int L(A7Q0)-L(AliPxQx) 292d0b3732eSbholler .int L(A0Q1)-L(AliPxQx) 293d0b3732eSbholler .int L(A1Q1)-L(AliPxQx) 294d0b3732eSbholler .int L(A2Q1)-L(AliPxQx) 295d0b3732eSbholler .int L(A3Q1)-L(AliPxQx) 296d0b3732eSbholler .int L(A4Q1)-L(AliPxQx) 297d0b3732eSbholler .int L(A5Q1)-L(AliPxQx) 298d0b3732eSbholler .int L(A6Q1)-L(AliPxQx) 299d0b3732eSbholler .int L(A7Q1)-L(AliPxQx) 300d0b3732eSbholler 301d0b3732eSbholler .balign 16 302d0b3732eSbhollerL(A1Q0): # ; need to move 8+ 7=1+2+4 bytes 303d0b3732eSbholler movzbq (%rdx),%r11 304d0b3732eSbholler sub $0xf,%r8 305d0b3732eSbholler mov %r11b,(%rcx) 306d0b3732eSbholler 307d0b3732eSbholler movzwq 0x1(%rdx),%r10 308d0b3732eSbholler mov %r10w,0x1(%rcx) 309d0b3732eSbholler 310d0b3732eSbholler mov 0x3(%rdx),%r9d 311d0b3732eSbholler mov %r9d,0x3(%rcx) 312d0b3732eSbholler 313d0b3732eSbholler mov 0x7(%rdx),%r11 314d0b3732eSbholler add $0xf,%rdx 315d0b3732eSbholler mov %r11,0x7(%rcx) 316d0b3732eSbholler 317d0b3732eSbholler add $0xf,%rcx 318d0b3732eSbholler jmp L(now_qw_aligned) 319d0b3732eSbholler 320d0b3732eSbholler .balign 16 321d0b3732eSbhollerL(A2Q0): # ; need to move 8+ 6=2+4 bytes 322d0b3732eSbholler movzwq (%rdx),%r10 323d0b3732eSbholler sub $0xe,%r8 324d0b3732eSbholler mov %r10w,(%rcx) 325d0b3732eSbholler 326d0b3732eSbholler mov 0x2(%rdx),%r9d 327d0b3732eSbholler mov %r9d,0x2(%rcx) 328d0b3732eSbholler 329d0b3732eSbholler mov 0x6(%rdx),%r11 330d0b3732eSbholler add $0xe,%rdx 331d0b3732eSbholler mov %r11,0x6(%rcx) 332d0b3732eSbholler add $0xe,%rcx 333d0b3732eSbholler jmp L(now_qw_aligned) 334d0b3732eSbholler 335d0b3732eSbholler .balign 16 336d0b3732eSbhollerL(A3Q0): # ; need to move 8+ 5=1+4 bytes 337d0b3732eSbholler movzbq (%rdx),%r11 338d0b3732eSbholler sub $0xd,%r8 339d0b3732eSbholler mov %r11b,(%rcx) 340d0b3732eSbholler 341d0b3732eSbholler mov 0x1(%rdx),%r9d 342d0b3732eSbholler mov %r9d,0x1(%rcx) 343d0b3732eSbholler 344d0b3732eSbholler mov 0x5(%rdx),%r10 
345d0b3732eSbholler add $0xd,%rdx 346d0b3732eSbholler mov %r10,0x5(%rcx) 347d0b3732eSbholler 348d0b3732eSbholler add $0xd,%rcx 349d0b3732eSbholler jmp L(now_qw_aligned) 350d0b3732eSbholler 351d0b3732eSbholler .balign 16 352d0b3732eSbhollerL(A4Q0): # ; need to move 8+4 bytes 353d0b3732eSbholler mov (%rdx),%r9d 354d0b3732eSbholler sub $0xc,%r8 355d0b3732eSbholler mov %r9d,(%rcx) 356d0b3732eSbholler 357d0b3732eSbholler mov 0x4(%rdx),%r10 358d0b3732eSbholler add $0xc,%rdx 359d0b3732eSbholler mov %r10,0x4(%rcx) 360d0b3732eSbholler 361d0b3732eSbholler add $0xc,%rcx 362d0b3732eSbholler jmp L(now_qw_aligned) 363d0b3732eSbholler 364d0b3732eSbholler .balign 16 365d0b3732eSbhollerL(A5Q0): # ; need to move 8+ 3=1+2 bytes 366d0b3732eSbholler movzbq (%rdx),%r11 367d0b3732eSbholler sub $0xb,%r8 368d0b3732eSbholler mov %r11b,(%rcx) 369d0b3732eSbholler 370d0b3732eSbholler movzwq 0x1(%rdx),%r10 371d0b3732eSbholler mov %r10w,0x1(%rcx) 372d0b3732eSbholler 373d0b3732eSbholler mov 0x3(%rdx),%r9 374d0b3732eSbholler add $0xb,%rdx 375d0b3732eSbholler mov %r9,0x3(%rcx) 376d0b3732eSbholler 377d0b3732eSbholler add $0xb,%rcx 378d0b3732eSbholler jmp L(now_qw_aligned) 379d0b3732eSbholler 380d0b3732eSbholler .balign 16 381d0b3732eSbhollerL(A6Q0): # ; need to move 8+2 bytes 382d0b3732eSbholler movzwq (%rdx),%r10 383d0b3732eSbholler sub $0xa,%r8 384d0b3732eSbholler mov %r10w,(%rcx) 385d0b3732eSbholler 386d0b3732eSbholler mov 0x2(%rdx),%r9 387d0b3732eSbholler add $0xa,%rdx 388d0b3732eSbholler mov %r9,0x2(%rcx) 389d0b3732eSbholler 390d0b3732eSbholler add $0xa,%rcx 391d0b3732eSbholler jmp L(now_qw_aligned) 392d0b3732eSbholler 393d0b3732eSbholler .balign 16 394d0b3732eSbhollerL(A7Q0): # ; need to move 8+1 byte 395d0b3732eSbholler movzbq (%rdx),%r11 396d0b3732eSbholler sub $0x9,%r8 397d0b3732eSbholler mov %r11b,(%rcx) 398d0b3732eSbholler 399d0b3732eSbholler mov 0x1(%rdx),%r10 400d0b3732eSbholler add $0x9,%rdx 401d0b3732eSbholler mov %r10,0x1(%rcx) 402d0b3732eSbholler 403d0b3732eSbholler add $0x9,%rcx 
404d0b3732eSbholler jmp L(now_qw_aligned) 405d0b3732eSbholler 406d0b3732eSbholler .balign 16 407d0b3732eSbhollerL(A0Q1): # ; need to move 8 bytes 408d0b3732eSbholler 409d0b3732eSbholler mov (%rdx),%r10 410d0b3732eSbholler add $0x8,%rdx 411d0b3732eSbholler sub $0x8,%r8 412d0b3732eSbholler mov %r10,(%rcx) 413d0b3732eSbholler 414d0b3732eSbholler add $0x8,%rcx 415d0b3732eSbholler jmp L(now_qw_aligned) 416d0b3732eSbholler 417d0b3732eSbholler .balign 16 418d0b3732eSbhollerL(A1Q1): # ; need to move 7=1+2+4 bytes 419d0b3732eSbholler movzbq (%rdx),%r11 420d0b3732eSbholler sub $0x7,%r8 421d0b3732eSbholler mov %r11b,(%rcx) 422d0b3732eSbholler 423d0b3732eSbholler movzwq 0x1(%rdx),%r10 424d0b3732eSbholler mov %r10w,0x1(%rcx) 425d0b3732eSbholler 426d0b3732eSbholler mov 0x3(%rdx),%r9d 427d0b3732eSbholler add $0x7,%rdx 428d0b3732eSbholler mov %r9d,0x3(%rcx) 429d0b3732eSbholler add $0x7,%rcx 430d0b3732eSbholler jmp L(now_qw_aligned) 431d0b3732eSbholler 432d0b3732eSbholler .balign 16 433d0b3732eSbhollerL(A2Q1): # ; need to move 6=2+4 bytes 434d0b3732eSbholler movzwq (%rdx),%r10 435d0b3732eSbholler sub $0x6,%r8 436d0b3732eSbholler mov %r10w,(%rcx) 437d0b3732eSbholler mov 0x2(%rdx),%r9d 438d0b3732eSbholler add $0x6,%rdx 439d0b3732eSbholler mov %r9d,0x2(%rcx) 440d0b3732eSbholler add $0x6,%rcx 441d0b3732eSbholler jmp L(now_qw_aligned) 442d0b3732eSbholler 443d0b3732eSbholler .balign 16 444d0b3732eSbhollerL(A3Q1): # ; need to move 5=1+4 bytes 445d0b3732eSbholler movzbq (%rdx),%r11 446d0b3732eSbholler sub $0x5,%r8 447d0b3732eSbholler mov %r11b,(%rcx) 448d0b3732eSbholler mov 0x1(%rdx),%r9d 449d0b3732eSbholler add $0x5,%rdx 450d0b3732eSbholler mov %r9d,0x1(%rcx) 451d0b3732eSbholler add $0x5,%rcx 452d0b3732eSbholler jmp L(now_qw_aligned) 453d0b3732eSbholler 454d0b3732eSbholler .balign 16 455d0b3732eSbhollerL(A4Q1): # ; need to move 4 bytes 456d0b3732eSbholler mov (%rdx),%r9d 457d0b3732eSbholler sub $0x4,%r8 458d0b3732eSbholler add $0x4,%rdx 459d0b3732eSbholler mov %r9d,(%rcx) 
460d0b3732eSbholler add $0x4,%rcx 461d0b3732eSbholler jmp L(now_qw_aligned) 462d0b3732eSbholler 463d0b3732eSbholler .balign 16 464d0b3732eSbhollerL(A5Q1): # ; need to move 3=1+2 bytes 465d0b3732eSbholler movzbq (%rdx),%r11 466d0b3732eSbholler sub $0x3,%r8 467d0b3732eSbholler mov %r11b,(%rcx) 468d0b3732eSbholler 469d0b3732eSbholler movzwq 0x1(%rdx),%r10 470d0b3732eSbholler add $0x3,%rdx 471d0b3732eSbholler mov %r10w,0x1(%rcx) 472d0b3732eSbholler 473d0b3732eSbholler add $0x3,%rcx 474d0b3732eSbholler jmp L(now_qw_aligned) 475d0b3732eSbholler 476d0b3732eSbholler .balign 16 477d0b3732eSbhollerL(A6Q1): # ; need to move 2 bytes 478d0b3732eSbholler movzwq (%rdx),%r10 479d0b3732eSbholler sub $0x2,%r8 480d0b3732eSbholler add $0x2,%rdx 481d0b3732eSbholler mov %r10w,(%rcx) 482d0b3732eSbholler add $0x2,%rcx 483d0b3732eSbholler jmp L(now_qw_aligned) 484d0b3732eSbholler 485d0b3732eSbholler .balign 16 486d0b3732eSbhollerL(A7Q1): # ; need to move 1 byte 487d0b3732eSbholler movzbq (%rdx),%r11 488d0b3732eSbholler dec %r8 489d0b3732eSbholler inc %rdx 490d0b3732eSbholler mov %r11b,(%rcx) 491d0b3732eSbholler inc %rcx 492d0b3732eSbholler jmp L(now_qw_aligned) 493d0b3732eSbholler 494d0b3732eSbholler 495d0b3732eSbholler .balign 16 496d0b3732eSbhollerL(P0QG): 497d0b3732eSbholler mov -0x80(%rdx),%r9 498d0b3732eSbholler mov %r9,-0x80(%rcx) 499d0b3732eSbhollerL(P0QF): 500d0b3732eSbholler mov -0x78(%rdx),%r10 501d0b3732eSbholler mov %r10,-0x78(%rcx) 502d0b3732eSbhollerL(P0QE): 503d0b3732eSbholler mov -0x70(%rdx),%r9 504d0b3732eSbholler mov %r9,-0x70(%rcx) 505d0b3732eSbhollerL(P0QD): 506d0b3732eSbholler mov -0x68(%rdx),%r10 507d0b3732eSbholler mov %r10,-0x68(%rcx) 508d0b3732eSbhollerL(P0QC): 509d0b3732eSbholler mov -0x60(%rdx),%r9 510d0b3732eSbholler mov %r9,-0x60(%rcx) 511d0b3732eSbhollerL(P0QB): 512d0b3732eSbholler mov -0x58(%rdx),%r10 513d0b3732eSbholler mov %r10,-0x58(%rcx) 514d0b3732eSbhollerL(P0QA): 515d0b3732eSbholler mov -0x50(%rdx),%r9 516d0b3732eSbholler mov %r9,-0x50(%rcx) 
517d0b3732eSbhollerL(P0Q9): 518d0b3732eSbholler mov -0x48(%rdx),%r10 519d0b3732eSbholler mov %r10,-0x48(%rcx) 520d0b3732eSbhollerL(P0Q8): 521d0b3732eSbholler mov -0x40(%rdx),%r9 522d0b3732eSbholler mov %r9,-0x40(%rcx) 523d0b3732eSbhollerL(P0Q7): 524d0b3732eSbholler mov -0x38(%rdx),%r10 525d0b3732eSbholler mov %r10,-0x38(%rcx) 526d0b3732eSbhollerL(P0Q6): 527d0b3732eSbholler mov -0x30(%rdx),%r9 528d0b3732eSbholler mov %r9,-0x30(%rcx) 529d0b3732eSbhollerL(P0Q5): 530d0b3732eSbholler mov -0x28(%rdx),%r10 531d0b3732eSbholler mov %r10,-0x28(%rcx) 532d0b3732eSbhollerL(P0Q4): 533d0b3732eSbholler mov -0x20(%rdx),%r9 534d0b3732eSbholler mov %r9,-0x20(%rcx) 535d0b3732eSbhollerL(P0Q3): 536d0b3732eSbholler mov -0x18(%rdx),%r10 537d0b3732eSbholler mov %r10,-0x18(%rcx) 538d0b3732eSbhollerL(P0Q2): 539d0b3732eSbholler mov -0x10(%rdx),%r9 540d0b3732eSbholler mov %r9,-0x10(%rcx) 541d0b3732eSbhollerL(P0Q1): 542d0b3732eSbholler mov -0x8(%rdx),%r10 543d0b3732eSbholler mov %r10,-0x8(%rcx) 544d0b3732eSbhollerL(P0Q0): 545d0b3732eSbholler ret 546d0b3732eSbholler 547d0b3732eSbholler .balign 16 548d0b3732eSbhollerL(P1QF): 549d0b3732eSbholler mov -0x79(%rdx),%r9 550d0b3732eSbholler mov %r9,-0x79(%rcx) 551d0b3732eSbhollerL(P1QE): 552d0b3732eSbholler mov -0x71(%rdx),%r11 553d0b3732eSbholler mov %r11,-0x71(%rcx) 554d0b3732eSbhollerL(P1QD): 555d0b3732eSbholler mov -0x69(%rdx),%r10 556d0b3732eSbholler mov %r10,-0x69(%rcx) 557d0b3732eSbhollerL(P1QC): 558d0b3732eSbholler mov -0x61(%rdx),%r9 559d0b3732eSbholler mov %r9,-0x61(%rcx) 560d0b3732eSbhollerL(P1QB): 561d0b3732eSbholler mov -0x59(%rdx),%r11 562d0b3732eSbholler mov %r11,-0x59(%rcx) 563d0b3732eSbhollerL(P1QA): 564d0b3732eSbholler mov -0x51(%rdx),%r10 565d0b3732eSbholler mov %r10,-0x51(%rcx) 566d0b3732eSbhollerL(P1Q9): 567d0b3732eSbholler mov -0x49(%rdx),%r9 568d0b3732eSbholler mov %r9,-0x49(%rcx) 569d0b3732eSbhollerL(P1Q8): 570d0b3732eSbholler mov -0x41(%rdx),%r11 571d0b3732eSbholler mov %r11,-0x41(%rcx) 572d0b3732eSbhollerL(P1Q7): 
573d0b3732eSbholler mov -0x39(%rdx),%r10 574d0b3732eSbholler mov %r10,-0x39(%rcx) 575d0b3732eSbhollerL(P1Q6): 576d0b3732eSbholler mov -0x31(%rdx),%r9 577d0b3732eSbholler mov %r9,-0x31(%rcx) 578d0b3732eSbhollerL(P1Q5): 579d0b3732eSbholler mov -0x29(%rdx),%r11 580d0b3732eSbholler mov %r11,-0x29(%rcx) 581d0b3732eSbhollerL(P1Q4): 582d0b3732eSbholler mov -0x21(%rdx),%r10 583d0b3732eSbholler mov %r10,-0x21(%rcx) 584d0b3732eSbhollerL(P1Q3): 585d0b3732eSbholler mov -0x19(%rdx),%r9 586d0b3732eSbholler mov %r9,-0x19(%rcx) 587d0b3732eSbhollerL(P1Q2): 588d0b3732eSbholler mov -0x11(%rdx),%r11 589d0b3732eSbholler mov %r11,-0x11(%rcx) 590d0b3732eSbhollerL(P1Q1): 591d0b3732eSbholler mov -0x9(%rdx),%r10 592d0b3732eSbholler mov %r10,-0x9(%rcx) 593d0b3732eSbhollerL(P1Q0): 594d0b3732eSbholler movzbq -0x1(%rdx),%r9 595d0b3732eSbholler mov %r9b,-0x1(%rcx) 596d0b3732eSbholler ret 597d0b3732eSbholler 598d0b3732eSbholler .balign 16 599d0b3732eSbhollerL(P2QF): 600d0b3732eSbholler mov -0x7a(%rdx),%r9 601d0b3732eSbholler mov %r9,-0x7a(%rcx) 602d0b3732eSbhollerL(P2QE): 603d0b3732eSbholler mov -0x72(%rdx),%r11 604d0b3732eSbholler mov %r11,-0x72(%rcx) 605d0b3732eSbhollerL(P2QD): 606d0b3732eSbholler mov -0x6a(%rdx),%r10 607d0b3732eSbholler mov %r10,-0x6a(%rcx) 608d0b3732eSbhollerL(P2QC): 609d0b3732eSbholler mov -0x62(%rdx),%r9 610d0b3732eSbholler mov %r9,-0x62(%rcx) 611d0b3732eSbhollerL(P2QB): 612d0b3732eSbholler mov -0x5a(%rdx),%r11 613d0b3732eSbholler mov %r11,-0x5a(%rcx) 614d0b3732eSbhollerL(P2QA): 615d0b3732eSbholler mov -0x52(%rdx),%r10 616d0b3732eSbholler mov %r10,-0x52(%rcx) 617d0b3732eSbhollerL(P2Q9): 618d0b3732eSbholler mov -0x4a(%rdx),%r9 619d0b3732eSbholler mov %r9,-0x4a(%rcx) 620d0b3732eSbhollerL(P2Q8): 621d0b3732eSbholler mov -0x42(%rdx),%r11 622d0b3732eSbholler mov %r11,-0x42(%rcx) 623d0b3732eSbhollerL(P2Q7): 624d0b3732eSbholler mov -0x3a(%rdx),%r10 625d0b3732eSbholler mov %r10,-0x3a(%rcx) 626d0b3732eSbhollerL(P2Q6): 627d0b3732eSbholler mov -0x32(%rdx),%r9 628d0b3732eSbholler mov 
%r9,-0x32(%rcx) 629d0b3732eSbhollerL(P2Q5): 630d0b3732eSbholler mov -0x2a(%rdx),%r11 631d0b3732eSbholler mov %r11,-0x2a(%rcx) 632d0b3732eSbhollerL(P2Q4): 633d0b3732eSbholler mov -0x22(%rdx),%r10 634d0b3732eSbholler mov %r10,-0x22(%rcx) 635d0b3732eSbhollerL(P2Q3): 636d0b3732eSbholler mov -0x1a(%rdx),%r9 637d0b3732eSbholler mov %r9,-0x1a(%rcx) 638d0b3732eSbhollerL(P2Q2): 639d0b3732eSbholler mov -0x12(%rdx),%r11 640d0b3732eSbholler mov %r11,-0x12(%rcx) 641d0b3732eSbhollerL(P2Q1): 642d0b3732eSbholler mov -0xa(%rdx),%r10 643d0b3732eSbholler mov %r10,-0xa(%rcx) 644d0b3732eSbhollerL(P2Q0): 645d0b3732eSbholler movzwq -0x2(%rdx),%r9 646d0b3732eSbholler mov %r9w,-0x2(%rcx) 647d0b3732eSbholler ret 648d0b3732eSbholler 649d0b3732eSbholler .balign 16 650d0b3732eSbhollerL(P3QF): 651d0b3732eSbholler mov -0x7b(%rdx),%r9 652d0b3732eSbholler mov %r9,-0x7b(%rcx) 653d0b3732eSbhollerL(P3QE): 654d0b3732eSbholler mov -0x73(%rdx),%r11 655d0b3732eSbholler mov %r11,-0x73(%rcx) 656d0b3732eSbhollerL(P3QD): 657d0b3732eSbholler mov -0x6b(%rdx),%r10 658d0b3732eSbholler mov %r10,-0x6b(%rcx) 659d0b3732eSbhollerL(P3QC): 660d0b3732eSbholler mov -0x63(%rdx),%r9 661d0b3732eSbholler mov %r9,-0x63(%rcx) 662d0b3732eSbhollerL(P3QB): 663d0b3732eSbholler mov -0x5b(%rdx),%r11 664d0b3732eSbholler mov %r11,-0x5b(%rcx) 665d0b3732eSbhollerL(P3QA): 666d0b3732eSbholler mov -0x53(%rdx),%r10 667d0b3732eSbholler mov %r10,-0x53(%rcx) 668d0b3732eSbhollerL(P3Q9): 669d0b3732eSbholler mov -0x4b(%rdx),%r9 670d0b3732eSbholler mov %r9,-0x4b(%rcx) 671d0b3732eSbhollerL(P3Q8): 672d0b3732eSbholler mov -0x43(%rdx),%r11 673d0b3732eSbholler mov %r11,-0x43(%rcx) 674d0b3732eSbhollerL(P3Q7): 675d0b3732eSbholler mov -0x3b(%rdx),%r10 676d0b3732eSbholler mov %r10,-0x3b(%rcx) 677d0b3732eSbhollerL(P3Q6): 678d0b3732eSbholler mov -0x33(%rdx),%r9 679d0b3732eSbholler mov %r9,-0x33(%rcx) 680d0b3732eSbhollerL(P3Q5): 681d0b3732eSbholler mov -0x2b(%rdx),%r11 682d0b3732eSbholler mov %r11,-0x2b(%rcx) 683d0b3732eSbhollerL(P3Q4): 684d0b3732eSbholler 
mov -0x23(%rdx),%r10 685d0b3732eSbholler mov %r10,-0x23(%rcx) 686d0b3732eSbhollerL(P3Q3): 687d0b3732eSbholler mov -0x1b(%rdx),%r9 688d0b3732eSbholler mov %r9,-0x1b(%rcx) 689d0b3732eSbhollerL(P3Q2): 690d0b3732eSbholler mov -0x13(%rdx),%r11 691d0b3732eSbholler mov %r11,-0x13(%rcx) 692d0b3732eSbhollerL(P3Q1): 693d0b3732eSbholler mov -0xb(%rdx),%r10 694d0b3732eSbholler mov %r10,-0xb(%rcx) 695d0b3732eSbholler /* 696d0b3732eSbholler * These trailing loads/stores have to do all their loads 1st, 697d0b3732eSbholler * then do the stores. 698d0b3732eSbholler */ 699d0b3732eSbhollerL(P3Q0): 700d0b3732eSbholler movzwq -0x3(%rdx),%r9 701d0b3732eSbholler movzbq -0x1(%rdx),%r10 702d0b3732eSbholler mov %r9w,-0x3(%rcx) 703d0b3732eSbholler mov %r10b,-0x1(%rcx) 704d0b3732eSbholler ret 705d0b3732eSbholler 706d0b3732eSbholler .balign 16 707d0b3732eSbhollerL(P4QF): 708d0b3732eSbholler mov -0x7c(%rdx),%r9 709d0b3732eSbholler mov %r9,-0x7c(%rcx) 710d0b3732eSbhollerL(P4QE): 711d0b3732eSbholler mov -0x74(%rdx),%r11 712d0b3732eSbholler mov %r11,-0x74(%rcx) 713d0b3732eSbhollerL(P4QD): 714d0b3732eSbholler mov -0x6c(%rdx),%r10 715d0b3732eSbholler mov %r10,-0x6c(%rcx) 716d0b3732eSbhollerL(P4QC): 717d0b3732eSbholler mov -0x64(%rdx),%r9 718d0b3732eSbholler mov %r9,-0x64(%rcx) 719d0b3732eSbhollerL(P4QB): 720d0b3732eSbholler mov -0x5c(%rdx),%r11 721d0b3732eSbholler mov %r11,-0x5c(%rcx) 722d0b3732eSbhollerL(P4QA): 723d0b3732eSbholler mov -0x54(%rdx),%r10 724d0b3732eSbholler mov %r10,-0x54(%rcx) 725d0b3732eSbhollerL(P4Q9): 726d0b3732eSbholler mov -0x4c(%rdx),%r9 727d0b3732eSbholler mov %r9,-0x4c(%rcx) 728d0b3732eSbhollerL(P4Q8): 729d0b3732eSbholler mov -0x44(%rdx),%r11 730d0b3732eSbholler mov %r11,-0x44(%rcx) 731d0b3732eSbhollerL(P4Q7): 732d0b3732eSbholler mov -0x3c(%rdx),%r10 733d0b3732eSbholler mov %r10,-0x3c(%rcx) 734d0b3732eSbhollerL(P4Q6): 735d0b3732eSbholler mov -0x34(%rdx),%r9 736d0b3732eSbholler mov %r9,-0x34(%rcx) 737d0b3732eSbhollerL(P4Q5): 738d0b3732eSbholler mov -0x2c(%rdx),%r11 
739d0b3732eSbholler mov %r11,-0x2c(%rcx) 740d0b3732eSbhollerL(P4Q4): 741d0b3732eSbholler mov -0x24(%rdx),%r10 742d0b3732eSbholler mov %r10,-0x24(%rcx) 743d0b3732eSbhollerL(P4Q3): 744d0b3732eSbholler mov -0x1c(%rdx),%r9 745d0b3732eSbholler mov %r9,-0x1c(%rcx) 746d0b3732eSbhollerL(P4Q2): 747d0b3732eSbholler mov -0x14(%rdx),%r11 748d0b3732eSbholler mov %r11,-0x14(%rcx) 749d0b3732eSbhollerL(P4Q1): 750d0b3732eSbholler mov -0xc(%rdx),%r10 751d0b3732eSbholler mov %r10,-0xc(%rcx) 752d0b3732eSbhollerL(P4Q0): 753d0b3732eSbholler mov -0x4(%rdx),%r9d 754d0b3732eSbholler mov %r9d,-0x4(%rcx) 755d0b3732eSbholler ret 756d0b3732eSbholler 757d0b3732eSbholler .balign 16 758d0b3732eSbhollerL(P5QF): 759d0b3732eSbholler mov -0x7d(%rdx),%r9 760d0b3732eSbholler mov %r9,-0x7d(%rcx) 761d0b3732eSbhollerL(P5QE): 762d0b3732eSbholler mov -0x75(%rdx),%r11 763d0b3732eSbholler mov %r11,-0x75(%rcx) 764d0b3732eSbhollerL(P5QD): 765d0b3732eSbholler mov -0x6d(%rdx),%r10 766d0b3732eSbholler mov %r10,-0x6d(%rcx) 767d0b3732eSbhollerL(P5QC): 768d0b3732eSbholler mov -0x65(%rdx),%r9 769d0b3732eSbholler mov %r9,-0x65(%rcx) 770d0b3732eSbhollerL(P5QB): 771d0b3732eSbholler mov -0x5d(%rdx),%r11 772d0b3732eSbholler mov %r11,-0x5d(%rcx) 773d0b3732eSbhollerL(P5QA): 774d0b3732eSbholler mov -0x55(%rdx),%r10 775d0b3732eSbholler mov %r10,-0x55(%rcx) 776d0b3732eSbhollerL(P5Q9): 777d0b3732eSbholler mov -0x4d(%rdx),%r9 778d0b3732eSbholler mov %r9,-0x4d(%rcx) 779d0b3732eSbhollerL(P5Q8): 780d0b3732eSbholler mov -0x45(%rdx),%r11 781d0b3732eSbholler mov %r11,-0x45(%rcx) 782d0b3732eSbhollerL(P5Q7): 783d0b3732eSbholler mov -0x3d(%rdx),%r10 784d0b3732eSbholler mov %r10,-0x3d(%rcx) 785d0b3732eSbhollerL(P5Q6): 786d0b3732eSbholler mov -0x35(%rdx),%r9 787d0b3732eSbholler mov %r9,-0x35(%rcx) 788d0b3732eSbhollerL(P5Q5): 789d0b3732eSbholler mov -0x2d(%rdx),%r11 790d0b3732eSbholler mov %r11,-0x2d(%rcx) 791d0b3732eSbhollerL(P5Q4): 792d0b3732eSbholler mov -0x25(%rdx),%r10 793d0b3732eSbholler mov %r10,-0x25(%rcx) 794d0b3732eSbhollerL(P5Q3): 
795d0b3732eSbholler mov -0x1d(%rdx),%r9 796d0b3732eSbholler mov %r9,-0x1d(%rcx) 797d0b3732eSbhollerL(P5Q2): 798d0b3732eSbholler mov -0x15(%rdx),%r11 799d0b3732eSbholler mov %r11,-0x15(%rcx) 800d0b3732eSbhollerL(P5Q1): 801d0b3732eSbholler mov -0xd(%rdx),%r10 802d0b3732eSbholler mov %r10,-0xd(%rcx) 803d0b3732eSbholler /* 804d0b3732eSbholler * These trailing loads/stores have to do all their loads 1st, 805d0b3732eSbholler * then do the stores. 806d0b3732eSbholler */ 807d0b3732eSbhollerL(P5Q0): 808d0b3732eSbholler mov -0x5(%rdx),%r9d 809d0b3732eSbholler movzbq -0x1(%rdx),%r10 810d0b3732eSbholler mov %r9d,-0x5(%rcx) 811d0b3732eSbholler mov %r10b,-0x1(%rcx) 812d0b3732eSbholler ret 813d0b3732eSbholler 814d0b3732eSbholler .balign 16 815d0b3732eSbhollerL(P6QF): 816d0b3732eSbholler mov -0x7e(%rdx),%r9 817d0b3732eSbholler mov %r9,-0x7e(%rcx) 818d0b3732eSbhollerL(P6QE): 819d0b3732eSbholler mov -0x76(%rdx),%r11 820d0b3732eSbholler mov %r11,-0x76(%rcx) 821d0b3732eSbhollerL(P6QD): 822d0b3732eSbholler mov -0x6e(%rdx),%r10 823d0b3732eSbholler mov %r10,-0x6e(%rcx) 824d0b3732eSbhollerL(P6QC): 825d0b3732eSbholler mov -0x66(%rdx),%r9 826d0b3732eSbholler mov %r9,-0x66(%rcx) 827d0b3732eSbhollerL(P6QB): 828d0b3732eSbholler mov -0x5e(%rdx),%r11 829d0b3732eSbholler mov %r11,-0x5e(%rcx) 830d0b3732eSbhollerL(P6QA): 831d0b3732eSbholler mov -0x56(%rdx),%r10 832d0b3732eSbholler mov %r10,-0x56(%rcx) 833d0b3732eSbhollerL(P6Q9): 834d0b3732eSbholler mov -0x4e(%rdx),%r9 835d0b3732eSbholler mov %r9,-0x4e(%rcx) 836d0b3732eSbhollerL(P6Q8): 837d0b3732eSbholler mov -0x46(%rdx),%r11 838d0b3732eSbholler mov %r11,-0x46(%rcx) 839d0b3732eSbhollerL(P6Q7): 840d0b3732eSbholler mov -0x3e(%rdx),%r10 841d0b3732eSbholler mov %r10,-0x3e(%rcx) 842d0b3732eSbhollerL(P6Q6): 843d0b3732eSbholler mov -0x36(%rdx),%r9 844d0b3732eSbholler mov %r9,-0x36(%rcx) 845d0b3732eSbhollerL(P6Q5): 846d0b3732eSbholler mov -0x2e(%rdx),%r11 847d0b3732eSbholler mov %r11,-0x2e(%rcx) 848d0b3732eSbhollerL(P6Q4): 849d0b3732eSbholler mov 
-0x26(%rdx),%r10 850d0b3732eSbholler mov %r10,-0x26(%rcx) 851d0b3732eSbhollerL(P6Q3): 852d0b3732eSbholler mov -0x1e(%rdx),%r9 853d0b3732eSbholler mov %r9,-0x1e(%rcx) 854d0b3732eSbhollerL(P6Q2): 855d0b3732eSbholler mov -0x16(%rdx),%r11 856d0b3732eSbholler mov %r11,-0x16(%rcx) 857d0b3732eSbhollerL(P6Q1): 858d0b3732eSbholler mov -0xe(%rdx),%r10 859d0b3732eSbholler mov %r10,-0xe(%rcx) 860d0b3732eSbholler /* 861d0b3732eSbholler * These trailing loads/stores have to do all their loads 1st, 862d0b3732eSbholler * then do the stores. 863d0b3732eSbholler */ 864d0b3732eSbhollerL(P6Q0): /* 6-byte remainder: 4 bytes at -6 plus 2 bytes at -2 (zero-extended load, word store) */ 865d0b3732eSbholler mov -0x6(%rdx),%r9d 866d0b3732eSbholler movzwq -0x2(%rdx),%r10 867d0b3732eSbholler mov %r9d,-0x6(%rcx) 868d0b3732eSbholler mov %r10w,-0x2(%rcx) 869d0b3732eSbholler ret 870d0b3732eSbholler 871d0b3732eSbholler .balign 16 872d0b3732eSbhollerL(P7QF): /* P7 ladder: entering at L(P7Q<n>) copies n 8-byte words, one per rung with fall-through, then the 7-byte remainder at L(P7Q0). Loads are relative to %rdx, stores to %rcx, all at negative displacements. */ 873d0b3732eSbholler mov -0x7f(%rdx),%r9 874d0b3732eSbholler mov %r9,-0x7f(%rcx) 875d0b3732eSbhollerL(P7QE): 876d0b3732eSbholler mov -0x77(%rdx),%r11 877d0b3732eSbholler mov %r11,-0x77(%rcx) 878d0b3732eSbhollerL(P7QD): 879d0b3732eSbholler mov -0x6f(%rdx),%r10 880d0b3732eSbholler mov %r10,-0x6f(%rcx) 881d0b3732eSbhollerL(P7QC): 882d0b3732eSbholler mov -0x67(%rdx),%r9 883d0b3732eSbholler mov %r9,-0x67(%rcx) 884d0b3732eSbhollerL(P7QB): 885d0b3732eSbholler mov -0x5f(%rdx),%r11 886d0b3732eSbholler mov %r11,-0x5f(%rcx) 887d0b3732eSbhollerL(P7QA): 888d0b3732eSbholler mov -0x57(%rdx),%r10 889d0b3732eSbholler mov %r10,-0x57(%rcx) 890d0b3732eSbhollerL(P7Q9): 891d0b3732eSbholler mov -0x4f(%rdx),%r9 892d0b3732eSbholler mov %r9,-0x4f(%rcx) 893d0b3732eSbhollerL(P7Q8): 894d0b3732eSbholler mov -0x47(%rdx),%r11 895d0b3732eSbholler mov %r11,-0x47(%rcx) 896d0b3732eSbhollerL(P7Q7): 897d0b3732eSbholler mov -0x3f(%rdx),%r10 898d0b3732eSbholler mov %r10,-0x3f(%rcx) 899d0b3732eSbhollerL(P7Q6): 900d0b3732eSbholler mov -0x37(%rdx),%r9 901d0b3732eSbholler mov %r9,-0x37(%rcx) 902d0b3732eSbhollerL(P7Q5): 903d0b3732eSbholler mov -0x2f(%rdx),%r11 
904d0b3732eSbholler mov %r11,-0x2f(%rcx) 905d0b3732eSbhollerL(P7Q4): 906d0b3732eSbholler mov -0x27(%rdx),%r10 907d0b3732eSbholler mov %r10,-0x27(%rcx) 908d0b3732eSbhollerL(P7Q3): 909d0b3732eSbholler mov -0x1f(%rdx),%r9 910d0b3732eSbholler mov %r9,-0x1f(%rcx) 911d0b3732eSbhollerL(P7Q2): 912d0b3732eSbholler mov -0x17(%rdx),%r11 913d0b3732eSbholler mov %r11,-0x17(%rcx) 914d0b3732eSbhollerL(P7Q1): 915d0b3732eSbholler mov -0xf(%rdx),%r10 916d0b3732eSbholler mov %r10,-0xf(%rcx) 917d0b3732eSbholler /* 918d0b3732eSbholler * These trailing loads/stores have to do all their loads 1st, 919d0b3732eSbholler * then do the stores. 920d0b3732eSbholler */ 921d0b3732eSbhollerL(P7Q0): /* 7-byte remainder: 4 bytes at -7, 2 bytes at -3, 1 byte at -1 */ 922d0b3732eSbholler mov -0x7(%rdx),%r9d 923d0b3732eSbholler movzwq -0x3(%rdx),%r10 924d0b3732eSbholler movzbq -0x1(%rdx),%r11 925d0b3732eSbholler mov %r9d,-0x7(%rcx) 926d0b3732eSbholler mov %r10w,-0x3(%rcx) 927d0b3732eSbholler mov %r11b,-0x1(%rcx) 928d0b3732eSbholler ret 929d0b3732eSbholler 930d0b3732eSbholler .balign 16 931d0b3732eSbhollerL(ck_use_sse2): /* entry for the 16-byte-store path: %rcx is the store address, %rdx the load address, %r8 the byte count */ 932d0b3732eSbholler /* 933d0b3732eSbholler * Align dest to 16 byte boundary. 934d0b3732eSbholler */ 935d0b3732eSbholler test $0xf,%rcx 936d0b3732eSbholler jnz L(ShrtAlignNew) /* taken when any of the low four bits of %rcx is set; L(ShrtAlignNew) is defined outside this section */ 937d0b3732eSbholler 938d0b3732eSbhollerL(now_qw_aligned): 939d0b3732eSbholler cmpl $NO_SSE,.memops_method(%rip) 940d0b3732eSbholler je L(Loop8byte_pre) /* .memops_method equals NO_SSE: leave the SSE path (target defined outside this section) */ 941d0b3732eSbholler 942d0b3732eSbholler /* 943d0b3732eSbholler * The fall-through path is to do SSE2 16-byte load/stores 944d0b3732eSbholler */ 945d0b3732eSbholler 946d0b3732eSbholler /* 947d0b3732eSbholler * If current move size is larger than half of the highest level cache 948d0b3732eSbholler * size, then do non-temporal moves. 
949d0b3732eSbholler */ 950d0b3732eSbholler mov .largest_level_cache_size(%rip),%r9d 951d0b3732eSbholler shr %r9 # take half of it 952d0b3732eSbholler cmp %r9,%r8 953d0b3732eSbholler jg L(sse2_nt_move) /* jg is a signed comparison of the move size in %r8 against half the cache size in %r9 */ 954d0b3732eSbholler 955d0b3732eSbholler /* 956d0b3732eSbholler * If both the source and dest are aligned, then use the both aligned 957d0b3732eSbholler * logic. Well aligned data should reap the rewards. 958d0b3732eSbholler */ 959d0b3732eSbholler test $0xf,%rdx 960d0b3732eSbholler jz L(pre_both_aligned) 961d0b3732eSbholler 962d0b3732eSbholler lea L(SSE_src)(%rip),%r10 # SSE2 (default) 963d0b3732eSbholler testl $USE_SSSE3,.memops_method(%rip) 964d0b3732eSbholler jz 1f 965d0b3732eSbholler lea L(SSSE3_src)(%rip),%r10 # SSSE3 966d0b3732eSbholler 967d0b3732eSbholler1: /* %r10 now holds the base of the jump table selected above */ 968d0b3732eSbholler /* 969d0b3732eSbholler * if the src is not 16 byte aligned... 970d0b3732eSbholler */ 971d0b3732eSbholler mov %rdx,%r11 972d0b3732eSbholler and $0xf,%r11 /* %r11 = low four bits of the load address; non-zero here because the jz above was not taken */ 973d0b3732eSbholler movdqu (%rdx),%xmm0 974d0b3732eSbholler movdqa %xmm0,(%rcx) /* first 16 bytes: unaligned load, aligned store */ 975d0b3732eSbholler add $0x10,%rdx 976d0b3732eSbholler sub %r11,%rdx /* %rdx is now 16-byte aligned and trails the true read position by %r11 bytes */ 977d0b3732eSbholler add $0x10,%rcx 978d0b3732eSbholler sub $0x10,%r8 979d0b3732eSbholler movdqa (%rdx),%xmm1 /* prime %xmm1 with the aligned chunk the shift/merge routines treat as the previous buffer */ 980d0b3732eSbholler 981d0b3732eSbholler movslq (%r10,%r11,4),%r9 982d0b3732eSbholler lea (%r9,%r10,1),%r10 983d0b3732eSbholler jmpq *%r10 /* table slots are signed 32-bit offsets from the table base; slot index is %r11 */ 984d0b3732eSbholler 985d0b3732eSbholler .balign 16 986d0b3732eSbhollerL(SSSE3_src): /* SSSE3 jump table: slot N is L(mov3dqaN) - L(SSSE3_src), except slot 0 (L(pre_both_aligned)) and slot 8 (L(movdqa8)) */ .int L(pre_both_aligned)-L(SSSE3_src) 987d0b3732eSbholler .int L(mov3dqa1) -L(SSSE3_src) 988d0b3732eSbholler .int L(mov3dqa2) -L(SSSE3_src) 989d0b3732eSbholler .int L(mov3dqa3) -L(SSSE3_src) 990d0b3732eSbholler .int L(mov3dqa4) -L(SSSE3_src) 991d0b3732eSbholler .int L(mov3dqa5) -L(SSSE3_src) 992d0b3732eSbholler .int L(mov3dqa6) -L(SSSE3_src) 993d0b3732eSbholler .int L(mov3dqa7) -L(SSSE3_src) 994d0b3732eSbholler .int L(movdqa8) -L(SSSE3_src) 995d0b3732eSbholler .int L(mov3dqa9) -L(SSSE3_src) 996d0b3732eSbholler .int L(mov3dqa10)-L(SSSE3_src) 997d0b3732eSbholler .int 
L(mov3dqa11)-L(SSSE3_src) 998d0b3732eSbholler .int L(mov3dqa12)-L(SSSE3_src) 999d0b3732eSbholler .int L(mov3dqa13)-L(SSSE3_src) 1000d0b3732eSbholler .int L(mov3dqa14)-L(SSSE3_src) 1001d0b3732eSbholler .int L(mov3dqa15)-L(SSSE3_src) 1002d0b3732eSbhollerL(SSE_src): /* SSE2 jump table: slot N is L(movdqaN) - L(SSE_src); slot 0 is L(pre_both_aligned) */ .int L(pre_both_aligned)-L(SSE_src) 1003d0b3732eSbholler .int L(movdqa1) -L(SSE_src) 1004d0b3732eSbholler .int L(movdqa2) -L(SSE_src) 1005d0b3732eSbholler .int L(movdqa3) -L(SSE_src) 1006d0b3732eSbholler .int L(movdqa4) -L(SSE_src) 1007d0b3732eSbholler .int L(movdqa5) -L(SSE_src) 1008d0b3732eSbholler .int L(movdqa6) -L(SSE_src) 1009d0b3732eSbholler .int L(movdqa7) -L(SSE_src) 1010d0b3732eSbholler .int L(movdqa8) -L(SSE_src) 1011d0b3732eSbholler .int L(movdqa9) -L(SSE_src) 1012d0b3732eSbholler .int L(movdqa10)-L(SSE_src) 1013d0b3732eSbholler .int L(movdqa11)-L(SSE_src) 1014d0b3732eSbholler .int L(movdqa12)-L(SSE_src) 1015d0b3732eSbholler .int L(movdqa13)-L(SSE_src) 1016d0b3732eSbholler .int L(movdqa14)-L(SSE_src) 1017d0b3732eSbholler .int L(movdqa15)-L(SSE_src) 1018d0b3732eSbholler 1019d0b3732eSbholler .balign 16 1020d0b3732eSbhollerL(movdqa1): /* load address misaligned by 1: each pass loads two aligned 16-byte chunks and builds two aligned stores, each as (previous chunk >> 1 byte) | (next chunk << 15 bytes); %xmm1 carries the last loaded chunk into the next pass */ 1021d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1022d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 # load the upper source buffer 1023d0b3732eSbholler lea 0x20(%rdx),%rdx 1024d0b3732eSbholler lea -0x20(%r8),%r8 1025d0b3732eSbholler 1026d0b3732eSbholler psrldq $0x1,%xmm1 # shift right prev buffer (saved from last iteration) 1027d0b3732eSbholler movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration 1028d0b3732eSbholler pslldq $0xf,%xmm3 # shift the current buffer left (shift in zeros) 1029d0b3732eSbholler por %xmm1,%xmm3 # OR them together 1030d0b3732eSbholler cmp $0x20,%r8 /* result is consumed by the jge at the end of the pass */ 1031d0b3732eSbholler 1032d0b3732eSbholler psrldq $0x1,%xmm2 # shift right prev buffer (saved from last iteration) 1033d0b3732eSbholler movdqa %xmm0,%xmm1 # store off xmm reg for use next iteration 1034d0b3732eSbholler pslldq $0xf,%xmm0 # shift the current buffer left 
(shift in zeros) 1035d0b3732eSbholler por %xmm2,%xmm0 # OR them together 1036d0b3732eSbholler movdqa %xmm3,(%rcx) # store it 1037d0b3732eSbholler movdqa %xmm0,0x10(%rcx) # store it 1038d0b3732eSbholler lea 0x20(%rcx),%rcx 1039d0b3732eSbholler 1040d0b3732eSbholler jge L(movdqa1) /* tests the earlier cmp $0x20,%r8; the SSE instructions and lea in between do not write flags */ 1041d0b3732eSbholler jmp L(movdqa_epi) 1042d0b3732eSbholler 1043d0b3732eSbholler .balign 16 1044d0b3732eSbhollerL(movdqa2): /* load address misaligned by 2: 32 bytes per pass, each aligned store = (previous chunk >> 2 bytes) | (next chunk << 14 bytes); %xmm1 carries the last loaded chunk */ 1045d0b3732eSbholler sub $0x20,%r8 1046d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 1047d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 1048d0b3732eSbholler add $0x20,%rdx 1049d0b3732eSbholler 1050d0b3732eSbholler psrldq $0x2,%xmm1 1051d0b3732eSbholler movdqa %xmm3,%xmm2 1052d0b3732eSbholler pslldq $0xe,%xmm3 1053d0b3732eSbholler por %xmm1,%xmm3 1054d0b3732eSbholler 1055d0b3732eSbholler psrldq $0x2,%xmm2 1056d0b3732eSbholler movdqa %xmm0,%xmm1 1057d0b3732eSbholler pslldq $0xe,%xmm0 1058d0b3732eSbholler por %xmm2,%xmm0 1059d0b3732eSbholler movdqa %xmm3,(%rcx) 1060d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 1061d0b3732eSbholler 1062d0b3732eSbholler add $0x20,%rcx 1063d0b3732eSbholler cmp $0x20,%r8 1064d0b3732eSbholler jge L(movdqa2) /* signed test: loop while at least 32 bytes remain */ 1065d0b3732eSbholler jmp L(movdqa_epi) 1066d0b3732eSbholler 1067d0b3732eSbholler .balign 16 1068d0b3732eSbhollerL(movdqa3): /* load address misaligned by 3: 32 bytes per pass, each aligned store = (previous chunk >> 3 bytes) | (next chunk << 13 bytes) */ 1069d0b3732eSbholler sub $0x20,%r8 1070d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 1071d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 1072d0b3732eSbholler add $0x20,%rdx 1073d0b3732eSbholler 1074d0b3732eSbholler psrldq $0x3,%xmm1 1075d0b3732eSbholler movdqa %xmm3,%xmm2 1076d0b3732eSbholler pslldq $0xd,%xmm3 1077d0b3732eSbholler por %xmm1,%xmm3 1078d0b3732eSbholler 1079d0b3732eSbholler psrldq $0x3,%xmm2 1080d0b3732eSbholler movdqa %xmm0,%xmm1 1081d0b3732eSbholler pslldq $0xd,%xmm0 1082d0b3732eSbholler por %xmm2,%xmm0 1083d0b3732eSbholler movdqa %xmm3,(%rcx) 1084d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 1085d0b3732eSbholler 1086d0b3732eSbholler add $0x20,%rcx 1087d0b3732eSbholler cmp $0x20,%r8 1088d0b3732eSbholler jge L(movdqa3) 
1089d0b3732eSbholler jmp L(movdqa_epi) 1090d0b3732eSbholler 1091d0b3732eSbholler .balign 16 1092d0b3732eSbhollerL(movdqa4): /* load address misaligned by 4: 32 bytes per pass, each aligned store = (previous chunk >> 4 bytes) | (next chunk << 12 bytes); %xmm1 carries the last loaded chunk */ 1093d0b3732eSbholler sub $0x20,%r8 1094d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 1095d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 1096d0b3732eSbholler add $0x20,%rdx 1097d0b3732eSbholler 1098d0b3732eSbholler psrldq $0x4,%xmm1 1099d0b3732eSbholler movdqa %xmm3,%xmm2 1100d0b3732eSbholler pslldq $0xc,%xmm3 1101d0b3732eSbholler por %xmm1,%xmm3 1102d0b3732eSbholler 1103d0b3732eSbholler psrldq $0x4,%xmm2 1104d0b3732eSbholler movdqa %xmm0,%xmm1 1105d0b3732eSbholler pslldq $0xc,%xmm0 1106d0b3732eSbholler por %xmm2,%xmm0 1107d0b3732eSbholler 1108d0b3732eSbholler movdqa %xmm3,(%rcx) 1109d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 1110d0b3732eSbholler 1111d0b3732eSbholler add $0x20,%rcx 1112d0b3732eSbholler cmp $0x20,%r8 1113d0b3732eSbholler jge L(movdqa4) /* signed test: loop while at least 32 bytes remain */ 1114d0b3732eSbholler jmp L(movdqa_epi) 1115d0b3732eSbholler 1116d0b3732eSbholler .balign 16 1117d0b3732eSbhollerL(movdqa5): /* load address misaligned by 5: 32 bytes per pass, each aligned store = (previous chunk >> 5 bytes) | (next chunk << 11 bytes) */ 1118d0b3732eSbholler sub $0x20,%r8 1119d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 1120d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 1121d0b3732eSbholler add $0x20,%rdx 1122d0b3732eSbholler 1123d0b3732eSbholler psrldq $0x5,%xmm1 1124d0b3732eSbholler movdqa %xmm3,%xmm2 1125d0b3732eSbholler pslldq $0xb,%xmm3 1126d0b3732eSbholler por %xmm1,%xmm3 1127d0b3732eSbholler 1128d0b3732eSbholler psrldq $0x5,%xmm2 1129d0b3732eSbholler movdqa %xmm0,%xmm1 1130d0b3732eSbholler pslldq $0xb,%xmm0 1131d0b3732eSbholler por %xmm2,%xmm0 1132d0b3732eSbholler 1133d0b3732eSbholler movdqa %xmm3,(%rcx) 1134d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 1135d0b3732eSbholler 1136d0b3732eSbholler add $0x20,%rcx 1137d0b3732eSbholler cmp $0x20,%r8 1138d0b3732eSbholler jge L(movdqa5) 1139d0b3732eSbholler jmp L(movdqa_epi) 1140d0b3732eSbholler 1141d0b3732eSbholler .balign 16 1142d0b3732eSbhollerL(movdqa6): /* load address misaligned by 6: 32 bytes per pass using 6-byte right / 10-byte left shifts */ 1143d0b3732eSbholler sub $0x20,%r8 1144d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 1145d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 
1146d0b3732eSbholler add $0x20,%rdx 1147d0b3732eSbholler 1148d0b3732eSbholler psrldq $0x6,%xmm1 1149d0b3732eSbholler movdqa %xmm3,%xmm2 1150d0b3732eSbholler pslldq $0xa,%xmm3 1151d0b3732eSbholler por %xmm1,%xmm3 1152d0b3732eSbholler 1153d0b3732eSbholler psrldq $0x6,%xmm2 1154d0b3732eSbholler movdqa %xmm0,%xmm1 1155d0b3732eSbholler pslldq $0xa,%xmm0 1156d0b3732eSbholler por %xmm2,%xmm0 1157d0b3732eSbholler movdqa %xmm3,(%rcx) 1158d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 1159d0b3732eSbholler 1160d0b3732eSbholler add $0x20,%rcx 1161d0b3732eSbholler cmp $0x20,%r8 1162d0b3732eSbholler jge L(movdqa6) /* signed test: loop while at least 32 bytes remain */ 1163d0b3732eSbholler jmp L(movdqa_epi) 1164d0b3732eSbholler 1165d0b3732eSbholler .balign 16 1166d0b3732eSbhollerL(movdqa7): /* load address misaligned by 7: 32 bytes per pass, each aligned store = (previous chunk >> 7 bytes) | (next chunk << 9 bytes); %xmm1 carries the last loaded chunk */ 1167d0b3732eSbholler sub $0x20,%r8 1168d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 1169d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 1170d0b3732eSbholler add $0x20,%rdx 1171d0b3732eSbholler 1172d0b3732eSbholler psrldq $0x7,%xmm1 1173d0b3732eSbholler movdqa %xmm3,%xmm2 1174d0b3732eSbholler pslldq $0x9,%xmm3 1175d0b3732eSbholler por %xmm1,%xmm3 1176d0b3732eSbholler 1177d0b3732eSbholler psrldq $0x7,%xmm2 1178d0b3732eSbholler movdqa %xmm0,%xmm1 1179d0b3732eSbholler pslldq $0x9,%xmm0 1180d0b3732eSbholler por %xmm2,%xmm0 1181d0b3732eSbholler movdqa %xmm3,(%rcx) 1182d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 1183d0b3732eSbholler 1184d0b3732eSbholler add $0x20,%rcx 1185d0b3732eSbholler cmp $0x20,%r8 1186d0b3732eSbholler jge L(movdqa7) 1187d0b3732eSbholler jmp L(movdqa_epi) 1188d0b3732eSbholler 1189d0b3732eSbholler .balign 16 1190d0b3732eSbhollerL(movdqa8): /* load address misaligned by 8: 48 bytes (three aligned chunks) per pass; shufpd $0x1 places the high quadword of the earlier chunk below the low quadword of the later chunk. Both jump tables route slot 8 here. */ 1191d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 1192d0b3732eSbholler sub $0x30,%r8 1193d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 1194d0b3732eSbholler movdqa 0x30(%rdx),%xmm5 1195d0b3732eSbholler lea 0x30(%rdx),%rdx 1196d0b3732eSbholler 1197d0b3732eSbholler shufpd $0x1,%xmm3,%xmm1 1198d0b3732eSbholler movdqa %xmm1,(%rcx) 1199d0b3732eSbholler 1200d0b3732eSbholler cmp $0x30,%r8 /* result is consumed by the jge at the end of the pass */ 1201d0b3732eSbholler 1202d0b3732eSbholler shufpd 
$0x1,%xmm0,%xmm3 1203d0b3732eSbholler movdqa %xmm3,0x10(%rcx) 1204d0b3732eSbholler 1205d0b3732eSbholler movdqa %xmm5,%xmm1 /* keep the last loaded chunk in %xmm1 for the next pass */ 1206d0b3732eSbholler shufpd $0x1,%xmm5,%xmm0 1207d0b3732eSbholler movdqa %xmm0,0x20(%rcx) 1208d0b3732eSbholler 1209d0b3732eSbholler lea 0x30(%rcx),%rcx 1210d0b3732eSbholler 1211d0b3732eSbholler jge L(movdqa8) /* tests the earlier cmp $0x30,%r8; shufpd, movdqa and lea do not write flags */ 1212d0b3732eSbholler jmp L(movdqa_epi) 1213d0b3732eSbholler 1214d0b3732eSbholler .balign 16 1215d0b3732eSbhollerL(movdqa9): /* load address misaligned by 9: 32 bytes per pass, each aligned store = (previous chunk >> 9 bytes) | (next chunk << 7 bytes); %xmm1 carries the last loaded chunk */ 1216d0b3732eSbholler sub $0x20,%r8 1217d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 1218d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 1219d0b3732eSbholler add $0x20,%rdx 1220d0b3732eSbholler 1221d0b3732eSbholler psrldq $0x9,%xmm1 1222d0b3732eSbholler movdqa %xmm3,%xmm2 1223d0b3732eSbholler pslldq $0x7,%xmm3 1224d0b3732eSbholler por %xmm1,%xmm3 1225d0b3732eSbholler 1226d0b3732eSbholler psrldq $0x9,%xmm2 1227d0b3732eSbholler movdqa %xmm0,%xmm1 1228d0b3732eSbholler pslldq $0x7,%xmm0 1229d0b3732eSbholler por %xmm2,%xmm0 1230d0b3732eSbholler movdqa %xmm3,(%rcx) 1231d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 1232d0b3732eSbholler 1233d0b3732eSbholler add $0x20,%rcx 1234d0b3732eSbholler cmp $0x20,%r8 1235d0b3732eSbholler jge L(movdqa9) /* signed test: loop while at least 32 bytes remain */ 1236d0b3732eSbholler jmp L(movdqa_epi) 1237d0b3732eSbholler 1238d0b3732eSbholler .balign 16 1239d0b3732eSbhollerL(movdqa10): /* load address misaligned by 10: 32 bytes per pass using 10-byte right / 6-byte left shifts */ 1240d0b3732eSbholler sub $0x20,%r8 1241d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 1242d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 1243d0b3732eSbholler add $0x20,%rdx 1244d0b3732eSbholler 1245d0b3732eSbholler psrldq $0xa,%xmm1 1246d0b3732eSbholler movdqa %xmm3,%xmm2 1247d0b3732eSbholler pslldq $0x6,%xmm3 1248d0b3732eSbholler por %xmm1,%xmm3 1249d0b3732eSbholler 1250d0b3732eSbholler psrldq $0xa,%xmm2 1251d0b3732eSbholler movdqa %xmm0,%xmm1 1252d0b3732eSbholler pslldq $0x6,%xmm0 1253d0b3732eSbholler por %xmm2,%xmm0 1254d0b3732eSbholler movdqa %xmm3,(%rcx) 1255d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 1256d0b3732eSbholler 1257d0b3732eSbholler add $0x20,%rcx 1258d0b3732eSbholler cmp $0x20,%r8 
1259d0b3732eSbholler jge L(movdqa10) 1260d0b3732eSbholler jmp L(movdqa_epi) 1261d0b3732eSbholler 1262d0b3732eSbholler .balign 16 1263d0b3732eSbhollerL(movdqa11): /* load address misaligned by 11: 32 bytes per pass, each aligned store = (previous chunk >> 11 bytes) | (next chunk << 5 bytes); %xmm1 carries the last loaded chunk */ 1264d0b3732eSbholler sub $0x20,%r8 1265d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 1266d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 1267d0b3732eSbholler add $0x20,%rdx 1268d0b3732eSbholler 1269d0b3732eSbholler psrldq $0xb,%xmm1 1270d0b3732eSbholler movdqa %xmm3,%xmm2 1271d0b3732eSbholler pslldq $0x5,%xmm3 1272d0b3732eSbholler por %xmm1,%xmm3 1273d0b3732eSbholler 1274d0b3732eSbholler psrldq $0xb,%xmm2 1275d0b3732eSbholler movdqa %xmm0,%xmm1 1276d0b3732eSbholler pslldq $0x5,%xmm0 1277d0b3732eSbholler por %xmm2,%xmm0 1278d0b3732eSbholler movdqa %xmm3,(%rcx) 1279d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 1280d0b3732eSbholler 1281d0b3732eSbholler add $0x20,%rcx 1282d0b3732eSbholler cmp $0x20,%r8 1283d0b3732eSbholler jge L(movdqa11) /* signed test: loop while at least 32 bytes remain */ 1284d0b3732eSbholler jmp L(movdqa_epi) 1285d0b3732eSbholler 1286d0b3732eSbholler .balign 16 1287d0b3732eSbhollerL(movdqa12): /* load address misaligned by 12: 32 bytes per pass, each aligned store = (previous chunk >> 12 bytes) | (next chunk << 4 bytes) */ 1288d0b3732eSbholler sub $0x20,%r8 1289d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 1290d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 1291d0b3732eSbholler add $0x20,%rdx 1292d0b3732eSbholler 1293d0b3732eSbholler psrldq $0xc,%xmm1 1294d0b3732eSbholler movdqa %xmm3,%xmm2 1295d0b3732eSbholler pslldq $0x4,%xmm3 1296d0b3732eSbholler por %xmm1,%xmm3 1297d0b3732eSbholler 1298d0b3732eSbholler psrldq $0xc,%xmm2 1299d0b3732eSbholler movdqa %xmm0,%xmm1 1300d0b3732eSbholler pslldq $0x4,%xmm0 1301d0b3732eSbholler por %xmm2,%xmm0 1302d0b3732eSbholler movdqa %xmm3,(%rcx) 1303d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 1304d0b3732eSbholler 1305d0b3732eSbholler add $0x20,%rcx 1306d0b3732eSbholler cmp $0x20,%r8 1307d0b3732eSbholler jge L(movdqa12) 1308d0b3732eSbholler jmp L(movdqa_epi) 1309d0b3732eSbholler 1310d0b3732eSbholler .balign 16 1311d0b3732eSbhollerL(movdqa13): /* load address misaligned by 13: 32 bytes per pass using 13-byte right / 3-byte left shifts */ 1312d0b3732eSbholler sub $0x20,%r8 1313d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 1314d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 
1315d0b3732eSbholler add $0x20,%rdx 1316d0b3732eSbholler 1317d0b3732eSbholler psrldq $0xd,%xmm1 1318d0b3732eSbholler movdqa %xmm3,%xmm2 1319d0b3732eSbholler pslldq $0x3,%xmm3 1320d0b3732eSbholler por %xmm1,%xmm3 1321d0b3732eSbholler 1322d0b3732eSbholler psrldq $0xd,%xmm2 1323d0b3732eSbholler movdqa %xmm0,%xmm1 1324d0b3732eSbholler pslldq $0x3,%xmm0 1325d0b3732eSbholler por %xmm2,%xmm0 1326d0b3732eSbholler movdqa %xmm3,(%rcx) 1327d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 1328d0b3732eSbholler 1329d0b3732eSbholler add $0x20,%rcx 1330d0b3732eSbholler cmp $0x20,%r8 1331d0b3732eSbholler jge L(movdqa13) /* signed test: loop while at least 32 bytes remain */ 1332d0b3732eSbholler jmp L(movdqa_epi) 1333d0b3732eSbholler 1334d0b3732eSbholler .balign 16 1335d0b3732eSbhollerL(movdqa14): /* load address misaligned by 14: 32 bytes per pass, each aligned store = (previous chunk >> 14 bytes) | (next chunk << 2 bytes); %xmm1 carries the last loaded chunk */ 1336d0b3732eSbholler sub $0x20,%r8 1337d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 1338d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 1339d0b3732eSbholler add $0x20,%rdx 1340d0b3732eSbholler 1341d0b3732eSbholler psrldq $0xe,%xmm1 1342d0b3732eSbholler movdqa %xmm3,%xmm2 1343d0b3732eSbholler pslldq $0x2,%xmm3 1344d0b3732eSbholler por %xmm1,%xmm3 1345d0b3732eSbholler 1346d0b3732eSbholler psrldq $0xe,%xmm2 1347d0b3732eSbholler movdqa %xmm0,%xmm1 1348d0b3732eSbholler pslldq $0x2,%xmm0 1349d0b3732eSbholler por %xmm2,%xmm0 1350d0b3732eSbholler movdqa %xmm3,(%rcx) 1351d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 1352d0b3732eSbholler 1353d0b3732eSbholler add $0x20,%rcx 1354d0b3732eSbholler cmp $0x20,%r8 1355d0b3732eSbholler jge L(movdqa14) 1356d0b3732eSbholler jmp L(movdqa_epi) 1357d0b3732eSbholler 1358d0b3732eSbholler .balign 16 1359d0b3732eSbhollerL(movdqa15): /* load address misaligned by 15: 32 bytes per pass using 15-byte right / 1-byte left shifts */ 1360d0b3732eSbholler sub $0x20,%r8 1361d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 1362d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 1363d0b3732eSbholler add $0x20,%rdx 1364d0b3732eSbholler 1365d0b3732eSbholler psrldq $0xf,%xmm1 1366d0b3732eSbholler movdqa %xmm3,%xmm2 1367d0b3732eSbholler pslldq $0x1,%xmm3 1368d0b3732eSbholler por %xmm1,%xmm3 1369d0b3732eSbholler 1370d0b3732eSbholler psrldq $0xf,%xmm2 1371d0b3732eSbholler 
movdqa %xmm0,%xmm1 1372d0b3732eSbholler pslldq $0x1,%xmm0 1373d0b3732eSbholler por %xmm2,%xmm0 1374d0b3732eSbholler movdqa %xmm3,(%rcx) 1375d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 1376d0b3732eSbholler 1377d0b3732eSbholler add $0x20,%rcx 1378d0b3732eSbholler cmp $0x20,%r8 1379d0b3732eSbholler jge L(movdqa15) /* on loop exit execution falls through into L(movdqa_epi); the jmp that follows is commented out */ 1380d0b3732eSbholler #jmp L(movdqa_epi) 1381d0b3732eSbholler 1382d0b3732eSbholler .balign 16 1383d0b3732eSbhollerL(movdqa_epi): /* shared epilogue of the shift/merge loops: add %r11 back to %rdx, advance %rcx and %rdx by the remaining count in %r8, then jump through the L(fwdPxQx) offset table (defined outside this section) indexed by %r8 */ 1384d0b3732eSbholler lea L(fwdPxQx)(%rip),%r10 1385d0b3732eSbholler add %r11,%rdx # bump rdx to the right addr (it lagged behind in the above loop) 1386d0b3732eSbholler add %r8,%rcx 1387d0b3732eSbholler add %r8,%rdx 1388d0b3732eSbholler 1389d0b3732eSbholler movslq (%r10,%r8,4),%r9 1390d0b3732eSbholler lea (%r9,%r10,1),%r10 1391d0b3732eSbholler jmpq *%r10 1392d0b3732eSbholler 1393d0b3732eSbholler .balign 16 1394d0b3732eSbhollerL(mov3dqa1): /* SSSE3 variant for a load address misaligned by 1: 48 bytes (three aligned chunks) per pass; palignr is emitted as raw bytes (66 0f 3a 0f, ModRM, imm8) with the intended mnemonic in the adjacent comment */ 1395d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1396d0b3732eSbholler sub $0x30,%r8 1397d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 # load the upper source buffer 1398d0b3732eSbholler movdqa 0x30(%rdx),%xmm5 # load the upper source buffer 1399d0b3732eSbholler lea 0x30(%rdx),%rdx 1400d0b3732eSbholler cmp $0x30,%r8 /* result is consumed by the jge at the end of the pass */ 1401d0b3732eSbholler 1402d0b3732eSbholler movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration 1403d0b3732eSbholler #palignr $0x1,%xmm1,%xmm3 1404d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1405d0b3732eSbholler .byte 0xd9,0x01 1406d0b3732eSbholler movdqa %xmm3,(%rcx) # store it 1407d0b3732eSbholler 1408d0b3732eSbholler movdqa %xmm0,%xmm4 # store off xmm reg for use next iteration 1409d0b3732eSbholler #palignr $0x1,%xmm2,%xmm0 1410d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1411d0b3732eSbholler .byte 0xc2,0x01 1412d0b3732eSbholler movdqa %xmm0,0x10(%rcx) # store it 1413d0b3732eSbholler 1414d0b3732eSbholler movdqa %xmm5,%xmm1 # store off xmm reg for use next iteration 1415d0b3732eSbholler #palignr $0x1,%xmm4,%xmm5 1416d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 
1417d0b3732eSbholler .byte 0xec,0x01 1418d0b3732eSbholler movdqa %xmm5,0x20(%rcx) # store it 1419d0b3732eSbholler 1420d0b3732eSbholler lea 0x30(%rcx),%rcx 1421d0b3732eSbholler jge L(mov3dqa1) /* tests the cmp $0x30,%r8 issued earlier in the pass; the moves, byte-encoded palignr and lea in between do not write flags */ 1422d0b3732eSbholler 1423d0b3732eSbholler cmp $0x10,%r8 /* fewer than 48 bytes left: peel off up to two more 16-byte chunks before the epilogue */ 1424d0b3732eSbholler jl L(movdqa_epi) 1425d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1426d0b3732eSbholler sub $0x10,%r8 1427d0b3732eSbholler lea 0x10(%rdx),%rdx 1428d0b3732eSbholler movdqa %xmm3,%xmm2 # save for use next concat 1429d0b3732eSbholler #palignr $0x1,%xmm1,%xmm3 1430d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1431d0b3732eSbholler .byte 0xd9,0x01 1432d0b3732eSbholler 1433d0b3732eSbholler cmp $0x10,%r8 1434d0b3732eSbholler movdqa %xmm3,(%rcx) # store it 1435d0b3732eSbholler lea 0x10(%rcx),%rcx 1436d0b3732eSbholler jl L(movdqa_epi) 1437d0b3732eSbholler 1438d0b3732eSbholler movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1439d0b3732eSbholler sub $0x10,%r8 1440d0b3732eSbholler lea 0x10(%rdx),%rdx 1441d0b3732eSbholler #palignr $0x1,%xmm2,%xmm0 1442d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1443d0b3732eSbholler .byte 0xc2,0x01 1444d0b3732eSbholler movdqa %xmm0,(%rcx) # store it 1445d0b3732eSbholler lea 0x10(%rcx),%rcx 1446d0b3732eSbholler jmp L(movdqa_epi) 1447d0b3732eSbholler 1448d0b3732eSbholler .balign 16 1449d0b3732eSbhollerL(mov3dqa2): /* SSSE3 variant for a load address misaligned by 2: 48 bytes per pass, byte-encoded palignr with immediate 2; %xmm1 carries the last loaded chunk into the next pass */ 1450d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 1451d0b3732eSbholler sub $0x30,%r8 1452d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 1453d0b3732eSbholler movdqa 0x30(%rdx),%xmm5 1454d0b3732eSbholler lea 0x30(%rdx),%rdx 1455d0b3732eSbholler cmp $0x30,%r8 /* result is consumed by the jge at the end of the pass */ 1456d0b3732eSbholler 1457d0b3732eSbholler movdqa %xmm3,%xmm2 1458d0b3732eSbholler #palignr $0x2,%xmm1,%xmm3 1459d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1460d0b3732eSbholler .byte 0xd9,0x02 1461d0b3732eSbholler movdqa %xmm3,(%rcx) 1462d0b3732eSbholler 1463d0b3732eSbholler movdqa %xmm0,%xmm4 1464d0b3732eSbholler #palignr $0x2,%xmm2,%xmm0 1465d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1466d0b3732eSbholler 
.byte 0xc2,0x02 1467d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 1468d0b3732eSbholler 1469d0b3732eSbholler movdqa %xmm5,%xmm1 1470d0b3732eSbholler #palignr $0x2,%xmm4,%xmm5 1471d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1472d0b3732eSbholler .byte 0xec,0x02 1473d0b3732eSbholler movdqa %xmm5,0x20(%rcx) 1474d0b3732eSbholler 1475d0b3732eSbholler lea 0x30(%rcx),%rcx 1476d0b3732eSbholler jge L(mov3dqa2) /* tests the cmp $0x30,%r8 issued earlier in the pass; nothing in between writes flags */ 1477d0b3732eSbholler 1478d0b3732eSbholler cmp $0x10,%r8 /* fewer than 48 bytes left: peel off up to two more 16-byte chunks before the epilogue */ 1479d0b3732eSbholler jl L(movdqa_epi) 1480d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1481d0b3732eSbholler sub $0x10,%r8 1482d0b3732eSbholler lea 0x10(%rdx),%rdx 1483d0b3732eSbholler movdqa %xmm3,%xmm2 # save for use next concat 1484d0b3732eSbholler #palignr $0x2,%xmm1,%xmm3 1485d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1486d0b3732eSbholler .byte 0xd9,0x02 1487d0b3732eSbholler 1488d0b3732eSbholler cmp $0x10,%r8 1489d0b3732eSbholler movdqa %xmm3,(%rcx) # store it 1490d0b3732eSbholler lea 0x10(%rcx),%rcx 1491d0b3732eSbholler jl L(movdqa_epi) 1492d0b3732eSbholler 1493d0b3732eSbholler movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1494d0b3732eSbholler sub $0x10,%r8 1495d0b3732eSbholler lea 0x10(%rdx),%rdx 1496d0b3732eSbholler #palignr $0x2,%xmm2,%xmm0 1497d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1498d0b3732eSbholler .byte 0xc2,0x02 1499d0b3732eSbholler movdqa %xmm0,(%rcx) # store it 1500d0b3732eSbholler lea 0x10(%rcx),%rcx 1501d0b3732eSbholler jmp L(movdqa_epi) 1502d0b3732eSbholler 1503d0b3732eSbholler .balign 16 1504d0b3732eSbhollerL(mov3dqa3): /* SSSE3 variant for a load address misaligned by 3: 48 bytes per pass, byte-encoded palignr with immediate 3; %xmm1 carries the last loaded chunk into the next pass */ 1505d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 1506d0b3732eSbholler sub $0x30,%r8 1507d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 1508d0b3732eSbholler movdqa 0x30(%rdx),%xmm5 1509d0b3732eSbholler lea 0x30(%rdx),%rdx 1510d0b3732eSbholler cmp $0x30,%r8 /* result is consumed by the jge at the end of the pass */ 1511d0b3732eSbholler 1512d0b3732eSbholler movdqa %xmm3,%xmm2 1513d0b3732eSbholler #palignr $0x3,%xmm1,%xmm3 1514d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1515d0b3732eSbholler .byte 0xd9,0x03 
1516d0b3732eSbholler movdqa %xmm3,(%rcx) 1517d0b3732eSbholler 1518d0b3732eSbholler movdqa %xmm0,%xmm4 1519d0b3732eSbholler #palignr $0x3,%xmm2,%xmm0 1520d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1521d0b3732eSbholler .byte 0xc2,0x03 1522d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 1523d0b3732eSbholler 1524d0b3732eSbholler movdqa %xmm5,%xmm1 1525d0b3732eSbholler #palignr $0x3,%xmm4,%xmm5 1526d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1527d0b3732eSbholler .byte 0xec,0x03 1528d0b3732eSbholler movdqa %xmm5,0x20(%rcx) 1529d0b3732eSbholler 1530d0b3732eSbholler lea 0x30(%rcx),%rcx 1531d0b3732eSbholler jge L(mov3dqa3) /* tests the cmp $0x30,%r8 issued earlier in the pass; nothing in between writes flags */ 1532d0b3732eSbholler 1533d0b3732eSbholler cmp $0x10,%r8 /* fewer than 48 bytes left: peel off up to two more 16-byte chunks before the epilogue */ 1534d0b3732eSbholler jl L(movdqa_epi) 1535d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1536d0b3732eSbholler sub $0x10,%r8 1537d0b3732eSbholler lea 0x10(%rdx),%rdx 1538d0b3732eSbholler movdqa %xmm3,%xmm2 # save for use next concat 1539d0b3732eSbholler #palignr $0x3,%xmm1,%xmm3 1540d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1541d0b3732eSbholler .byte 0xd9,0x03 1542d0b3732eSbholler 1543d0b3732eSbholler cmp $0x10,%r8 1544d0b3732eSbholler movdqa %xmm3,(%rcx) # store it 1545d0b3732eSbholler lea 0x10(%rcx),%rcx 1546d0b3732eSbholler jl L(movdqa_epi) 1547d0b3732eSbholler 1548d0b3732eSbholler movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1549d0b3732eSbholler sub $0x10,%r8 1550d0b3732eSbholler lea 0x10(%rdx),%rdx 1551d0b3732eSbholler #palignr $0x3,%xmm2,%xmm0 1552d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1553d0b3732eSbholler .byte 0xc2,0x03 1554d0b3732eSbholler movdqa %xmm0,(%rcx) # store it 1555d0b3732eSbholler lea 0x10(%rcx),%rcx 1556d0b3732eSbholler jmp L(movdqa_epi) 1557d0b3732eSbholler 1558d0b3732eSbholler .balign 16 1559d0b3732eSbhollerL(mov3dqa4): /* SSSE3 variant for a load address misaligned by 4: 48 bytes per pass, byte-encoded palignr with immediate 4 */ 1560d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 1561d0b3732eSbholler sub $0x30,%r8 1562d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 1563d0b3732eSbholler movdqa 0x30(%rdx),%xmm5 1564d0b3732eSbholler lea 0x30(%rdx),%rdx 1565d0b3732eSbholler 
cmp $0x30,%r8 1566d0b3732eSbholler 1567d0b3732eSbholler movdqa %xmm3,%xmm2 1568d0b3732eSbholler #palignr $0x4,%xmm1,%xmm3 1569d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1570d0b3732eSbholler .byte 0xd9,0x04 1571d0b3732eSbholler movdqa %xmm3,(%rcx) 1572d0b3732eSbholler 1573d0b3732eSbholler movdqa %xmm0,%xmm4 1574d0b3732eSbholler #palignr $0x4,%xmm2,%xmm0 1575d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1576d0b3732eSbholler .byte 0xc2,0x04 1577d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 1578d0b3732eSbholler 1579d0b3732eSbholler movdqa %xmm5,%xmm1 1580d0b3732eSbholler #palignr $0x4,%xmm4,%xmm5 1581d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1582d0b3732eSbholler .byte 0xec,0x04 1583d0b3732eSbholler movdqa %xmm5,0x20(%rcx) 1584d0b3732eSbholler 1585d0b3732eSbholler lea 0x30(%rcx),%rcx 1586d0b3732eSbholler jge L(mov3dqa4) /* tests the cmp $0x30,%r8 issued earlier in the pass; nothing in between writes flags */ 1587d0b3732eSbholler 1588d0b3732eSbholler cmp $0x10,%r8 /* fewer than 48 bytes left: peel off up to two more 16-byte chunks before the epilogue */ 1589d0b3732eSbholler jl L(movdqa_epi) 1590d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1591d0b3732eSbholler sub $0x10,%r8 1592d0b3732eSbholler lea 0x10(%rdx),%rdx 1593d0b3732eSbholler movdqa %xmm3,%xmm2 # save for use next concat 1594d0b3732eSbholler #palignr $0x4,%xmm1,%xmm3 1595d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1596d0b3732eSbholler .byte 0xd9,0x04 1597d0b3732eSbholler 1598d0b3732eSbholler cmp $0x10,%r8 1599d0b3732eSbholler movdqa %xmm3,(%rcx) # store it 1600d0b3732eSbholler lea 0x10(%rcx),%rcx 1601d0b3732eSbholler jl L(movdqa_epi) 1602d0b3732eSbholler 1603d0b3732eSbholler movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1604d0b3732eSbholler sub $0x10,%r8 1605d0b3732eSbholler lea 0x10(%rdx),%rdx 1606d0b3732eSbholler #palignr $0x4,%xmm2,%xmm0 1607d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1608d0b3732eSbholler .byte 0xc2,0x04 1609d0b3732eSbholler movdqa %xmm0,(%rcx) # store it 1610d0b3732eSbholler lea 0x10(%rcx),%rcx 1611d0b3732eSbholler jmp L(movdqa_epi) 1612d0b3732eSbholler 1613d0b3732eSbholler .balign 16 1614d0b3732eSbhollerL(mov3dqa5): /* SSSE3 variant for a load address misaligned by 5: 48 bytes per pass, byte-encoded palignr with immediate 5; %xmm1 carries the last loaded chunk into the next pass */ 1615d0b3732eSbholler 
movdqa 0x10(%rdx),%xmm3 1616d0b3732eSbholler sub $0x30,%r8 1617d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 1618d0b3732eSbholler movdqa 0x30(%rdx),%xmm5 1619d0b3732eSbholler lea 0x30(%rdx),%rdx 1620d0b3732eSbholler cmp $0x30,%r8 /* result is consumed by the jge at the end of the pass */ 1621d0b3732eSbholler 1622d0b3732eSbholler movdqa %xmm3,%xmm2 1623d0b3732eSbholler #palignr $0x5,%xmm1,%xmm3 1624d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1625d0b3732eSbholler .byte 0xd9,0x05 1626d0b3732eSbholler movdqa %xmm3,(%rcx) 1627d0b3732eSbholler 1628d0b3732eSbholler movdqa %xmm0,%xmm4 1629d0b3732eSbholler #palignr $0x5,%xmm2,%xmm0 1630d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1631d0b3732eSbholler .byte 0xc2,0x05 1632d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 1633d0b3732eSbholler 1634d0b3732eSbholler movdqa %xmm5,%xmm1 1635d0b3732eSbholler #palignr $0x5,%xmm4,%xmm5 1636d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1637d0b3732eSbholler .byte 0xec,0x05 1638d0b3732eSbholler movdqa %xmm5,0x20(%rcx) 1639d0b3732eSbholler 1640d0b3732eSbholler lea 0x30(%rcx),%rcx 1641d0b3732eSbholler jge L(mov3dqa5) /* tests the cmp $0x30,%r8 issued earlier in the pass; nothing in between writes flags */ 1642d0b3732eSbholler 1643d0b3732eSbholler cmp $0x10,%r8 /* fewer than 48 bytes left: peel off up to two more 16-byte chunks before the epilogue */ 1644d0b3732eSbholler jl L(movdqa_epi) 1645d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1646d0b3732eSbholler sub $0x10,%r8 1647d0b3732eSbholler lea 0x10(%rdx),%rdx 1648d0b3732eSbholler movdqa %xmm3,%xmm2 # save for use next concat 1649d0b3732eSbholler #palignr $0x5,%xmm1,%xmm3 1650d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1651d0b3732eSbholler .byte 0xd9,0x05 1652d0b3732eSbholler 1653d0b3732eSbholler cmp $0x10,%r8 1654d0b3732eSbholler movdqa %xmm3,(%rcx) # store it 1655d0b3732eSbholler lea 0x10(%rcx),%rcx 1656d0b3732eSbholler jl L(movdqa_epi) 1657d0b3732eSbholler 1658d0b3732eSbholler movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1659d0b3732eSbholler sub $0x10,%r8 1660d0b3732eSbholler lea 0x10(%rdx),%rdx 1661d0b3732eSbholler #palignr $0x5,%xmm2,%xmm0 1662d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1663d0b3732eSbholler .byte 0xc2,0x05 1664d0b3732eSbholler movdqa 
%xmm0,(%rcx) # store it 1665d0b3732eSbholler lea 0x10(%rcx),%rcx 1666d0b3732eSbholler jmp L(movdqa_epi) 1667d0b3732eSbholler 1668d0b3732eSbholler .balign 16 1669d0b3732eSbhollerL(mov3dqa6): /* SSSE3 variant for a load address misaligned by 6: 48 bytes per pass, byte-encoded palignr with immediate 6; %xmm1 carries the last loaded chunk into the next pass */ 1670d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 1671d0b3732eSbholler sub $0x30,%r8 1672d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 1673d0b3732eSbholler movdqa 0x30(%rdx),%xmm5 1674d0b3732eSbholler lea 0x30(%rdx),%rdx 1675d0b3732eSbholler cmp $0x30,%r8 /* result is consumed by the jge at the end of the pass */ 1676d0b3732eSbholler 1677d0b3732eSbholler movdqa %xmm3,%xmm2 1678d0b3732eSbholler #palignr $0x6,%xmm1,%xmm3 1679d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1680d0b3732eSbholler .byte 0xd9,0x06 1681d0b3732eSbholler movdqa %xmm3,(%rcx) 1682d0b3732eSbholler 1683d0b3732eSbholler movdqa %xmm0,%xmm4 1684d0b3732eSbholler #palignr $0x6,%xmm2,%xmm0 1685d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1686d0b3732eSbholler .byte 0xc2,0x06 1687d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 1688d0b3732eSbholler 1689d0b3732eSbholler movdqa %xmm5,%xmm1 1690d0b3732eSbholler #palignr $0x6,%xmm4,%xmm5 1691d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1692d0b3732eSbholler .byte 0xec,0x06 1693d0b3732eSbholler movdqa %xmm5,0x20(%rcx) 1694d0b3732eSbholler 1695d0b3732eSbholler lea 0x30(%rcx),%rcx 1696d0b3732eSbholler jge L(mov3dqa6) /* tests the cmp $0x30,%r8 issued earlier in the pass; nothing in between writes flags */ 1697d0b3732eSbholler 1698d0b3732eSbholler cmp $0x10,%r8 /* fewer than 48 bytes left: peel off up to two more 16-byte chunks before the epilogue */ 1699d0b3732eSbholler jl L(movdqa_epi) 1700d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1701d0b3732eSbholler sub $0x10,%r8 1702d0b3732eSbholler lea 0x10(%rdx),%rdx 1703d0b3732eSbholler movdqa %xmm3,%xmm2 # save for use next concat 1704d0b3732eSbholler #palignr $0x6,%xmm1,%xmm3 1705d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1706d0b3732eSbholler .byte 0xd9,0x06 1707d0b3732eSbholler 1708d0b3732eSbholler cmp $0x10,%r8 1709d0b3732eSbholler movdqa %xmm3,(%rcx) # store it 1710d0b3732eSbholler lea 0x10(%rcx),%rcx 1711d0b3732eSbholler jl L(movdqa_epi) 1712d0b3732eSbholler 1713d0b3732eSbholler movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1714d0b3732eSbholler sub 
$0x10,%r8 1715d0b3732eSbholler lea 0x10(%rdx),%rdx 1716d0b3732eSbholler #palignr $0x6,%xmm2,%xmm0 1717d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1718d0b3732eSbholler .byte 0xc2,0x06 1719d0b3732eSbholler movdqa %xmm0,(%rcx) # store it 1720d0b3732eSbholler lea 0x10(%rcx),%rcx 1721d0b3732eSbholler jmp L(movdqa_epi) 1722d0b3732eSbholler 1723d0b3732eSbholler .balign 16 1724d0b3732eSbhollerL(mov3dqa7): /* Copies 48 bytes per pass: three aligned 16-byte loads from 0x10(%rdx)..0x30(%rdx), each combined with the preceding block by palignr $0x7 (hand-encoded as .byte) and stored aligned at (%rcx)..0x20(%rcx); %xmm1 carries the last loaded block into the next pass. The flags set by cmp $0x30,%r8 are consumed by the jge at the bottom of the loop. After the loop, up to two more 16-byte blocks are handled the same way before control goes to L(movdqa_epi). NOTE(review): presumably entered when the source sits 7 bytes past a 16-byte boundary; the dispatch is outside this view - confirm. */ 1725d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 1726d0b3732eSbholler sub $0x30,%r8 1727d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 1728d0b3732eSbholler movdqa 0x30(%rdx),%xmm5 1729d0b3732eSbholler lea 0x30(%rdx),%rdx 1730d0b3732eSbholler cmp $0x30,%r8 1731d0b3732eSbholler 1732d0b3732eSbholler movdqa %xmm3,%xmm2 1733d0b3732eSbholler #palignr $0x7,%xmm1,%xmm3 1734d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1735d0b3732eSbholler .byte 0xd9,0x07 1736d0b3732eSbholler movdqa %xmm3,(%rcx) 1737d0b3732eSbholler 1738d0b3732eSbholler movdqa %xmm0,%xmm4 1739d0b3732eSbholler #palignr $0x7,%xmm2,%xmm0 1740d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1741d0b3732eSbholler .byte 0xc2,0x07 1742d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 1743d0b3732eSbholler 1744d0b3732eSbholler movdqa %xmm5,%xmm1 1745d0b3732eSbholler #palignr $0x7,%xmm4,%xmm5 1746d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1747d0b3732eSbholler .byte 0xec,0x07 1748d0b3732eSbholler movdqa %xmm5,0x20(%rcx) 1749d0b3732eSbholler 1750d0b3732eSbholler lea 0x30(%rcx),%rcx 1751d0b3732eSbholler jge L(mov3dqa7) 1752d0b3732eSbholler 1753d0b3732eSbholler cmp $0x10,%r8 1754d0b3732eSbholler jl L(movdqa_epi) 1755d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1756d0b3732eSbholler sub $0x10,%r8 1757d0b3732eSbholler lea 0x10(%rdx),%rdx 1758d0b3732eSbholler movdqa %xmm3,%xmm2 # save for use next concat 1759d0b3732eSbholler #palignr $0x7,%xmm1,%xmm3 1760d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1761d0b3732eSbholler .byte 0xd9,0x07 1762d0b3732eSbholler 1763d0b3732eSbholler cmp $0x10,%r8 1764d0b3732eSbholler movdqa %xmm3,(%rcx) # 
store it 1765d0b3732eSbholler lea 0x10(%rcx),%rcx 1766d0b3732eSbholler jl L(movdqa_epi) 1767d0b3732eSbholler 1768d0b3732eSbholler movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1769d0b3732eSbholler sub $0x10,%r8 1770d0b3732eSbholler lea 0x10(%rdx),%rdx 1771d0b3732eSbholler #palignr $0x7,%xmm2,%xmm0 1772d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1773d0b3732eSbholler .byte 0xc2,0x07 1774d0b3732eSbholler movdqa %xmm0,(%rcx) # store it 1775d0b3732eSbholler lea 0x10(%rcx),%rcx 1776d0b3732eSbholler jmp L(movdqa_epi) 1777d0b3732eSbholler 1778d0b3732eSbholler .balign 16 1779d0b3732eSbhollerL(mov3dqa9): /* Copies 48 bytes per pass: three aligned 16-byte loads from 0x10(%rdx)..0x30(%rdx), each combined with the preceding block by palignr $0x9 (hand-encoded as .byte) and stored aligned at (%rcx)..0x20(%rcx); %xmm1 carries the last loaded block into the next pass. The flags set by cmp $0x30,%r8 are consumed by the jge at the bottom of the loop. After the loop, up to two more 16-byte blocks are handled the same way before control goes to L(movdqa_epi). NOTE(review): presumably entered when the source sits 9 bytes past a 16-byte boundary; the dispatch is outside this view - confirm. */ 1780d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 1781d0b3732eSbholler sub $0x30,%r8 1782d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 1783d0b3732eSbholler movdqa 0x30(%rdx),%xmm5 1784d0b3732eSbholler lea 0x30(%rdx),%rdx 1785d0b3732eSbholler cmp $0x30,%r8 1786d0b3732eSbholler 1787d0b3732eSbholler movdqa %xmm3,%xmm2 1788d0b3732eSbholler #palignr $0x9,%xmm1,%xmm3 1789d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1790d0b3732eSbholler .byte 0xd9,0x09 1791d0b3732eSbholler movdqa %xmm3,(%rcx) 1792d0b3732eSbholler 1793d0b3732eSbholler movdqa %xmm0,%xmm4 1794d0b3732eSbholler #palignr $0x9,%xmm2,%xmm0 1795d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1796d0b3732eSbholler .byte 0xc2,0x09 1797d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 1798d0b3732eSbholler 1799d0b3732eSbholler movdqa %xmm5,%xmm1 1800d0b3732eSbholler #palignr $0x9,%xmm4,%xmm5 1801d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1802d0b3732eSbholler .byte 0xec,0x09 1803d0b3732eSbholler movdqa %xmm5,0x20(%rcx) 1804d0b3732eSbholler 1805d0b3732eSbholler lea 0x30(%rcx),%rcx 1806d0b3732eSbholler jge L(mov3dqa9) 1807d0b3732eSbholler 1808d0b3732eSbholler cmp $0x10,%r8 1809d0b3732eSbholler jl L(movdqa_epi) 1810d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1811d0b3732eSbholler sub $0x10,%r8 1812d0b3732eSbholler lea 0x10(%rdx),%rdx 1813d0b3732eSbholler movdqa %xmm3,%xmm2 # save for use next concat 1814d0b3732eSbholler 
#palignr $0x9,%xmm1,%xmm3 1815d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1816d0b3732eSbholler .byte 0xd9,0x09 1817d0b3732eSbholler 1818d0b3732eSbholler cmp $0x10,%r8 1819d0b3732eSbholler movdqa %xmm3,(%rcx) # store it 1820d0b3732eSbholler lea 0x10(%rcx),%rcx 1821d0b3732eSbholler jl L(movdqa_epi) 1822d0b3732eSbholler 1823d0b3732eSbholler movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1824d0b3732eSbholler sub $0x10,%r8 1825d0b3732eSbholler lea 0x10(%rdx),%rdx 1826d0b3732eSbholler #palignr $0x9,%xmm2,%xmm0 1827d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1828d0b3732eSbholler .byte 0xc2,0x09 1829d0b3732eSbholler movdqa %xmm0,(%rcx) # store it 1830d0b3732eSbholler lea 0x10(%rcx),%rcx 1831d0b3732eSbholler jmp L(movdqa_epi) 1832d0b3732eSbholler 1833d0b3732eSbholler .balign 16 1834d0b3732eSbhollerL(mov3dqa10): /* Copies 48 bytes per pass: three aligned 16-byte loads from 0x10(%rdx)..0x30(%rdx), each combined with the preceding block by palignr $0xa (hand-encoded as .byte) and stored aligned at (%rcx)..0x20(%rcx); %xmm1 carries the last loaded block into the next pass. The flags set by cmp $0x30,%r8 are consumed by the jge at the bottom of the loop. After the loop, up to two more 16-byte blocks are handled the same way before control goes to L(movdqa_epi). NOTE(review): presumably entered when the source sits 10 bytes past a 16-byte boundary; the dispatch is outside this view - confirm. */ 1835d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 1836d0b3732eSbholler sub $0x30,%r8 1837d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 1838d0b3732eSbholler movdqa 0x30(%rdx),%xmm5 1839d0b3732eSbholler lea 0x30(%rdx),%rdx 1840d0b3732eSbholler cmp $0x30,%r8 1841d0b3732eSbholler 1842d0b3732eSbholler movdqa %xmm3,%xmm2 1843d0b3732eSbholler #palignr $0xa,%xmm1,%xmm3 1844d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1845d0b3732eSbholler .byte 0xd9,0x0a 1846d0b3732eSbholler movdqa %xmm3,(%rcx) 1847d0b3732eSbholler 1848d0b3732eSbholler movdqa %xmm0,%xmm4 1849d0b3732eSbholler #palignr $0xa,%xmm2,%xmm0 1850d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1851d0b3732eSbholler .byte 0xc2,0x0a 1852d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 1853d0b3732eSbholler 1854d0b3732eSbholler movdqa %xmm5,%xmm1 1855d0b3732eSbholler #palignr $0xa,%xmm4,%xmm5 1856d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1857d0b3732eSbholler .byte 0xec,0x0a 1858d0b3732eSbholler movdqa %xmm5,0x20(%rcx) 1859d0b3732eSbholler 1860d0b3732eSbholler lea 0x30(%rcx),%rcx 1861d0b3732eSbholler jge L(mov3dqa10) 1862d0b3732eSbholler 1863d0b3732eSbholler cmp $0x10,%r8 1864d0b3732eSbholler jl L(movdqa_epi) 1865d0b3732eSbholler movdqa 
0x10(%rdx),%xmm3 # load the upper source buffer 1866d0b3732eSbholler sub $0x10,%r8 1867d0b3732eSbholler lea 0x10(%rdx),%rdx 1868d0b3732eSbholler movdqa %xmm3,%xmm2 # save for use next concat 1869d0b3732eSbholler #palignr $0xa,%xmm1,%xmm3 1870d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1871d0b3732eSbholler .byte 0xd9,0x0a 1872d0b3732eSbholler 1873d0b3732eSbholler cmp $0x10,%r8 1874d0b3732eSbholler movdqa %xmm3,(%rcx) # store it 1875d0b3732eSbholler lea 0x10(%rcx),%rcx 1876d0b3732eSbholler jl L(movdqa_epi) 1877d0b3732eSbholler 1878d0b3732eSbholler movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1879d0b3732eSbholler sub $0x10,%r8 1880d0b3732eSbholler lea 0x10(%rdx),%rdx 1881d0b3732eSbholler #palignr $0xa,%xmm2,%xmm0 1882d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1883d0b3732eSbholler .byte 0xc2,0x0a 1884d0b3732eSbholler movdqa %xmm0,(%rcx) # store it 1885d0b3732eSbholler lea 0x10(%rcx),%rcx 1886d0b3732eSbholler jmp L(movdqa_epi) 1887d0b3732eSbholler 1888d0b3732eSbholler .balign 16 1889d0b3732eSbhollerL(mov3dqa11): /* Copies 48 bytes per pass: three aligned 16-byte loads from 0x10(%rdx)..0x30(%rdx), each combined with the preceding block by palignr $0xb (hand-encoded as .byte) and stored aligned at (%rcx)..0x20(%rcx); %xmm1 carries the last loaded block into the next pass. The flags set by cmp $0x30,%r8 are consumed by the jge at the bottom of the loop. After the loop, up to two more 16-byte blocks are handled the same way before control goes to L(movdqa_epi). NOTE(review): presumably entered when the source sits 11 bytes past a 16-byte boundary; the dispatch is outside this view - confirm. */ 1890d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 1891d0b3732eSbholler sub $0x30,%r8 1892d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 1893d0b3732eSbholler movdqa 0x30(%rdx),%xmm5 1894d0b3732eSbholler lea 0x30(%rdx),%rdx 1895d0b3732eSbholler cmp $0x30,%r8 1896d0b3732eSbholler 1897d0b3732eSbholler movdqa %xmm3,%xmm2 1898d0b3732eSbholler #palignr $0xb,%xmm1,%xmm3 1899d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1900d0b3732eSbholler .byte 0xd9,0x0b 1901d0b3732eSbholler movdqa %xmm3,(%rcx) 1902d0b3732eSbholler 1903d0b3732eSbholler movdqa %xmm0,%xmm4 1904d0b3732eSbholler #palignr $0xb,%xmm2,%xmm0 1905d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1906d0b3732eSbholler .byte 0xc2,0x0b 1907d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 1908d0b3732eSbholler 1909d0b3732eSbholler movdqa %xmm5,%xmm1 1910d0b3732eSbholler #palignr $0xb,%xmm4,%xmm5 1911d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1912d0b3732eSbholler .byte 0xec,0x0b 1913d0b3732eSbholler movdqa %xmm5,0x20(%rcx) 
1914d0b3732eSbholler 1915d0b3732eSbholler lea 0x30(%rcx),%rcx 1916d0b3732eSbholler jge L(mov3dqa11) 1917d0b3732eSbholler 1918d0b3732eSbholler cmp $0x10,%r8 1919d0b3732eSbholler jl L(movdqa_epi) 1920d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1921d0b3732eSbholler sub $0x10,%r8 1922d0b3732eSbholler lea 0x10(%rdx),%rdx 1923d0b3732eSbholler movdqa %xmm3,%xmm2 # save for use next concat 1924d0b3732eSbholler #palignr $0xb,%xmm1,%xmm3 1925d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1926d0b3732eSbholler .byte 0xd9,0x0b 1927d0b3732eSbholler 1928d0b3732eSbholler cmp $0x10,%r8 1929d0b3732eSbholler movdqa %xmm3,(%rcx) # store it 1930d0b3732eSbholler lea 0x10(%rcx),%rcx 1931d0b3732eSbholler jl L(movdqa_epi) 1932d0b3732eSbholler 1933d0b3732eSbholler movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1934d0b3732eSbholler sub $0x10,%r8 1935d0b3732eSbholler lea 0x10(%rdx),%rdx 1936d0b3732eSbholler #palignr $0xb,%xmm2,%xmm0 1937d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1938d0b3732eSbholler .byte 0xc2,0x0b 1939d0b3732eSbholler movdqa %xmm0,(%rcx) # store it 1940d0b3732eSbholler lea 0x10(%rcx),%rcx 1941d0b3732eSbholler jmp L(movdqa_epi) 1942d0b3732eSbholler 1943d0b3732eSbholler .balign 16 1944d0b3732eSbhollerL(mov3dqa12): /* Copies 48 bytes per pass: three aligned 16-byte loads from 0x10(%rdx)..0x30(%rdx), each combined with the preceding block by palignr $0xc (hand-encoded as .byte) and stored aligned at (%rcx)..0x20(%rcx); %xmm1 carries the last loaded block into the next pass. The flags set by cmp $0x30,%r8 are consumed by the jge at the bottom of the loop. After the loop, up to two more 16-byte blocks are handled the same way before control goes to L(movdqa_epi). NOTE(review): presumably entered when the source sits 12 bytes past a 16-byte boundary; the dispatch is outside this view - confirm. */ 1945d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 1946d0b3732eSbholler sub $0x30,%r8 1947d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 1948d0b3732eSbholler movdqa 0x30(%rdx),%xmm5 1949d0b3732eSbholler lea 0x30(%rdx),%rdx 1950d0b3732eSbholler cmp $0x30,%r8 1951d0b3732eSbholler 1952d0b3732eSbholler movdqa %xmm3,%xmm2 1953d0b3732eSbholler #palignr $0xc,%xmm1,%xmm3 1954d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1955d0b3732eSbholler .byte 0xd9,0x0c 1956d0b3732eSbholler movdqa %xmm3,(%rcx) 1957d0b3732eSbholler 1958d0b3732eSbholler movdqa %xmm0,%xmm4 1959d0b3732eSbholler #palignr $0xc,%xmm2,%xmm0 1960d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1961d0b3732eSbholler .byte 0xc2,0x0c 1962d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 1963d0b3732eSbholler 
1964d0b3732eSbholler movdqa %xmm5,%xmm1 1965d0b3732eSbholler #palignr $0xc,%xmm4,%xmm5 1966d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1967d0b3732eSbholler .byte 0xec,0x0c 1968d0b3732eSbholler movdqa %xmm5,0x20(%rcx) 1969d0b3732eSbholler 1970d0b3732eSbholler lea 0x30(%rcx),%rcx 1971d0b3732eSbholler jge L(mov3dqa12) 1972d0b3732eSbholler 1973d0b3732eSbholler cmp $0x10,%r8 1974d0b3732eSbholler jl L(movdqa_epi) 1975d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 1976d0b3732eSbholler sub $0x10,%r8 1977d0b3732eSbholler lea 0x10(%rdx),%rdx 1978d0b3732eSbholler movdqa %xmm3,%xmm2 # save for use next concat 1979d0b3732eSbholler #palignr $0xc,%xmm1,%xmm3 1980d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1981d0b3732eSbholler .byte 0xd9,0x0c 1982d0b3732eSbholler 1983d0b3732eSbholler cmp $0x10,%r8 1984d0b3732eSbholler movdqa %xmm3,(%rcx) # store it 1985d0b3732eSbholler lea 0x10(%rcx),%rcx 1986d0b3732eSbholler jl L(movdqa_epi) 1987d0b3732eSbholler 1988d0b3732eSbholler movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 1989d0b3732eSbholler sub $0x10,%r8 1990d0b3732eSbholler lea 0x10(%rdx),%rdx 1991d0b3732eSbholler #palignr $0xc,%xmm2,%xmm0 1992d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 1993d0b3732eSbholler .byte 0xc2,0x0c 1994d0b3732eSbholler movdqa %xmm0,(%rcx) # store it 1995d0b3732eSbholler lea 0x10(%rcx),%rcx 1996d0b3732eSbholler jmp L(movdqa_epi) 1997d0b3732eSbholler 1998d0b3732eSbholler .balign 16 1999d0b3732eSbhollerL(mov3dqa13): /* Copies 48 bytes per pass: three aligned 16-byte loads from 0x10(%rdx)..0x30(%rdx), each combined with the preceding block by palignr $0xd (hand-encoded as .byte) and stored aligned at (%rcx)..0x20(%rcx); %xmm1 carries the last loaded block into the next pass. The flags set by cmp $0x30,%r8 are consumed by the jge at the bottom of the loop. After the loop, up to two more 16-byte blocks are handled the same way before control goes to L(movdqa_epi). NOTE(review): presumably entered when the source sits 13 bytes past a 16-byte boundary; the dispatch is outside this view - confirm. */ 2000d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 2001d0b3732eSbholler sub $0x30,%r8 2002d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 2003d0b3732eSbholler movdqa 0x30(%rdx),%xmm5 2004d0b3732eSbholler lea 0x30(%rdx),%rdx 2005d0b3732eSbholler cmp $0x30,%r8 2006d0b3732eSbholler 2007d0b3732eSbholler movdqa %xmm3,%xmm2 2008d0b3732eSbholler #palignr $0xd,%xmm1,%xmm3 2009d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 2010d0b3732eSbholler .byte 0xd9,0x0d 2011d0b3732eSbholler movdqa %xmm3,(%rcx) 2012d0b3732eSbholler 2013d0b3732eSbholler movdqa 
%xmm0,%xmm4 2014d0b3732eSbholler #palignr $0xd,%xmm2,%xmm0 2015d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 2016d0b3732eSbholler .byte 0xc2,0x0d 2017d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 2018d0b3732eSbholler 2019d0b3732eSbholler movdqa %xmm5,%xmm1 2020d0b3732eSbholler #palignr $0xd,%xmm4,%xmm5 2021d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 2022d0b3732eSbholler .byte 0xec,0x0d 2023d0b3732eSbholler movdqa %xmm5,0x20(%rcx) 2024d0b3732eSbholler 2025d0b3732eSbholler lea 0x30(%rcx),%rcx 2026d0b3732eSbholler jge L(mov3dqa13) 2027d0b3732eSbholler 2028d0b3732eSbholler cmp $0x10,%r8 2029d0b3732eSbholler jl L(movdqa_epi) 2030d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 2031d0b3732eSbholler sub $0x10,%r8 2032d0b3732eSbholler lea 0x10(%rdx),%rdx 2033d0b3732eSbholler movdqa %xmm3,%xmm2 # save for use next concat 2034d0b3732eSbholler #palignr $0xd,%xmm1,%xmm3 2035d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 2036d0b3732eSbholler .byte 0xd9,0x0d 2037d0b3732eSbholler 2038d0b3732eSbholler cmp $0x10,%r8 2039d0b3732eSbholler movdqa %xmm3,(%rcx) # store it 2040d0b3732eSbholler lea 0x10(%rcx),%rcx 2041d0b3732eSbholler jl L(movdqa_epi) 2042d0b3732eSbholler 2043d0b3732eSbholler movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 2044d0b3732eSbholler sub $0x10,%r8 2045d0b3732eSbholler lea 0x10(%rdx),%rdx 2046d0b3732eSbholler #palignr $0xd,%xmm2,%xmm0 2047d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 2048d0b3732eSbholler .byte 0xc2,0x0d 2049d0b3732eSbholler movdqa %xmm0,(%rcx) # store it 2050d0b3732eSbholler lea 0x10(%rcx),%rcx 2051d0b3732eSbholler jmp L(movdqa_epi) 2052d0b3732eSbholler 2053d0b3732eSbholler .balign 16 2054d0b3732eSbhollerL(mov3dqa14): /* Copies 48 bytes per pass: three aligned 16-byte loads from 0x10(%rdx)..0x30(%rdx), each combined with the preceding block by palignr $0xe (hand-encoded as .byte) and stored aligned at (%rcx)..0x20(%rcx); %xmm1 carries the last loaded block into the next pass. The flags set by cmp $0x30,%r8 are consumed by the jge at the bottom of the loop. After the loop, up to two more 16-byte blocks are handled the same way before control goes to L(movdqa_epi). NOTE(review): presumably entered when the source sits 14 bytes past a 16-byte boundary; the dispatch is outside this view - confirm. */ 2055d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 2056d0b3732eSbholler sub $0x30,%r8 2057d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 2058d0b3732eSbholler movdqa 0x30(%rdx),%xmm5 2059d0b3732eSbholler lea 0x30(%rdx),%rdx 2060d0b3732eSbholler cmp $0x30,%r8 2061d0b3732eSbholler 2062d0b3732eSbholler movdqa %xmm3,%xmm2 
2063d0b3732eSbholler #palignr $0xe,%xmm1,%xmm3 2064d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 2065d0b3732eSbholler .byte 0xd9,0x0e 2066d0b3732eSbholler movdqa %xmm3,(%rcx) 2067d0b3732eSbholler 2068d0b3732eSbholler movdqa %xmm0,%xmm4 2069d0b3732eSbholler #palignr $0xe,%xmm2,%xmm0 2070d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 2071d0b3732eSbholler .byte 0xc2,0x0e 2072d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 2073d0b3732eSbholler 2074d0b3732eSbholler movdqa %xmm5,%xmm1 2075d0b3732eSbholler #palignr $0xe,%xmm4,%xmm5 2076d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 2077d0b3732eSbholler .byte 0xec,0x0e 2078d0b3732eSbholler movdqa %xmm5,0x20(%rcx) 2079d0b3732eSbholler 2080d0b3732eSbholler lea 0x30(%rcx),%rcx 2081d0b3732eSbholler jge L(mov3dqa14) 2082d0b3732eSbholler 2083d0b3732eSbholler cmp $0x10,%r8 2084d0b3732eSbholler jl L(movdqa_epi) 2085d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 2086d0b3732eSbholler sub $0x10,%r8 2087d0b3732eSbholler lea 0x10(%rdx),%rdx 2088d0b3732eSbholler movdqa %xmm3,%xmm2 # save for use next concat 2089d0b3732eSbholler #palignr $0xe,%xmm1,%xmm3 2090d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 2091d0b3732eSbholler .byte 0xd9,0x0e 2092d0b3732eSbholler 2093d0b3732eSbholler cmp $0x10,%r8 2094d0b3732eSbholler movdqa %xmm3,(%rcx) # store it 2095d0b3732eSbholler lea 0x10(%rcx),%rcx 2096d0b3732eSbholler jl L(movdqa_epi) 2097d0b3732eSbholler 2098d0b3732eSbholler movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 2099d0b3732eSbholler sub $0x10,%r8 2100d0b3732eSbholler lea 0x10(%rdx),%rdx 2101d0b3732eSbholler #palignr $0xe,%xmm2,%xmm0 2102d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 2103d0b3732eSbholler .byte 0xc2,0x0e 2104d0b3732eSbholler movdqa %xmm0,(%rcx) # store it 2105d0b3732eSbholler lea 0x10(%rcx),%rcx 2106d0b3732eSbholler jmp L(movdqa_epi) 2107d0b3732eSbholler 2108d0b3732eSbholler .balign 16 2109d0b3732eSbhollerL(mov3dqa15): /* Copies 48 bytes per pass: three aligned 16-byte loads from 0x10(%rdx)..0x30(%rdx), each combined with the preceding block by palignr $0xf (hand-encoded as .byte) and stored aligned at (%rcx)..0x20(%rcx); %xmm1 carries the last loaded block into the next pass. The flags set by cmp $0x30,%r8 are consumed by the jge at the bottom of the loop. After the loop, up to two more 16-byte blocks are handled the same way before control goes to L(movdqa_epi). NOTE(review): presumably entered when the source sits 15 bytes past a 16-byte boundary; the dispatch is outside this view - confirm. */ 2110d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 2111d0b3732eSbholler sub $0x30,%r8 
2112d0b3732eSbholler movdqa 0x20(%rdx),%xmm0 2113d0b3732eSbholler movdqa 0x30(%rdx),%xmm5 2114d0b3732eSbholler lea 0x30(%rdx),%rdx 2115d0b3732eSbholler cmp $0x30,%r8 2116d0b3732eSbholler 2117d0b3732eSbholler movdqa %xmm3,%xmm2 2118d0b3732eSbholler #palignr $0xf,%xmm1,%xmm3 2119d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 2120d0b3732eSbholler .byte 0xd9,0x0f 2121d0b3732eSbholler movdqa %xmm3,(%rcx) 2122d0b3732eSbholler 2123d0b3732eSbholler movdqa %xmm0,%xmm4 2124d0b3732eSbholler #palignr $0xf,%xmm2,%xmm0 2125d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 2126d0b3732eSbholler .byte 0xc2,0x0f 2127d0b3732eSbholler movdqa %xmm0,0x10(%rcx) 2128d0b3732eSbholler 2129d0b3732eSbholler movdqa %xmm5,%xmm1 2130d0b3732eSbholler #palignr $0xf,%xmm4,%xmm5 2131d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 2132d0b3732eSbholler .byte 0xec,0x0f 2133d0b3732eSbholler movdqa %xmm5,0x20(%rcx) 2134d0b3732eSbholler 2135d0b3732eSbholler lea 0x30(%rcx),%rcx 2136d0b3732eSbholler jge L(mov3dqa15) 2137d0b3732eSbholler 2138d0b3732eSbholler cmp $0x10,%r8 2139d0b3732eSbholler jl L(movdqa_epi) 2140d0b3732eSbholler movdqa 0x10(%rdx),%xmm3 # load the upper source buffer 2141d0b3732eSbholler sub $0x10,%r8 2142d0b3732eSbholler lea 0x10(%rdx),%rdx 2143d0b3732eSbholler movdqa %xmm3,%xmm2 # save for use next concat 2144d0b3732eSbholler #palignr $0xf,%xmm1,%xmm3 2145d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 2146d0b3732eSbholler .byte 0xd9,0x0f 2147d0b3732eSbholler 2148d0b3732eSbholler cmp $0x10,%r8 2149d0b3732eSbholler movdqa %xmm3,(%rcx) # store it 2150d0b3732eSbholler lea 0x10(%rcx),%rcx 2151d0b3732eSbholler jl L(movdqa_epi) 2152d0b3732eSbholler 2153d0b3732eSbholler movdqa 0x10(%rdx),%xmm0 # load the upper source buffer 2154d0b3732eSbholler sub $0x10,%r8 2155d0b3732eSbholler lea 0x10(%rdx),%rdx 2156d0b3732eSbholler #palignr $0xf,%xmm2,%xmm0 2157d0b3732eSbholler .byte 0x66,0x0f,0x3a,0x0f 2158d0b3732eSbholler .byte 0xc2,0x0f 2159d0b3732eSbholler movdqa %xmm0,(%rcx) # store it 2160d0b3732eSbholler lea 
0x10(%rcx),%rcx 2161d0b3732eSbholler jmp L(movdqa_epi) 2162d0b3732eSbholler 2163d0b3732eSbholler .balign 16 2164d0b3732eSbhollerL(sse2_nt_move): /* 64 bytes per pass using unaligned loads (movdqu) and non-temporal stores (movntdq). Both pointers are advanced first, so the data accesses use negative offsets; the prefetch reaches 0x180 bytes beyond the already-advanced source pointer. The flags from cmp $0x40,%r8 are consumed by the jge at the bottom of the loop. */ 2165d0b3732eSbholler lea 0x40(%rcx),%rcx 2166d0b3732eSbholler lea 0x40(%rdx),%rdx 2167d0b3732eSbholler lea -0x40(%r8),%r8 2168d0b3732eSbholler 2169d0b3732eSbholler /* 2170d0b3732eSbholler * doesn't matter if source is aligned for stuff out of cache. 2171d0b3732eSbholler * the mis-aligned penalty is masked by the slowness of main memory. 2172d0b3732eSbholler */ 2173d0b3732eSbholler prefetchnta 0x180(%rdx) 2174d0b3732eSbholler movdqu -0x40(%rdx),%xmm0 2175d0b3732eSbholler movdqu -0x30(%rdx),%xmm1 2176d0b3732eSbholler 2177d0b3732eSbholler cmp $0x40,%r8 2178d0b3732eSbholler movntdq %xmm0,-0x40(%rcx) 2179d0b3732eSbholler movntdq %xmm1,-0x30(%rcx) 2180d0b3732eSbholler 2181d0b3732eSbholler movdqu -0x20(%rdx),%xmm2 2182d0b3732eSbholler movdqu -0x10(%rdx),%xmm3 2183d0b3732eSbholler 2184d0b3732eSbholler movntdq %xmm2,-0x20(%rcx) 2185d0b3732eSbholler movntdq %xmm3,-0x10(%rcx) 2186d0b3732eSbholler 2187d0b3732eSbholler jge L(sse2_nt_move) 2188d0b3732eSbholler 2189d0b3732eSbholler /* Fewer than 0x40 bytes remain: advance both pointers past the whole 16-byte blocks, keep the sub-16 remainder in %r8, and dispatch on the block count (0-3) through Fix16EndTable. The sfence orders the preceding non-temporal stores. */ lea L(Fix16EndTable)(%rip),%r10 2190d0b3732eSbholler mov %r8,%r9 2191d0b3732eSbholler and $0xFFFFFFFFFFFFFFF0,%r9 2192d0b3732eSbholler add %r9,%rcx 2193d0b3732eSbholler add %r9,%rdx 2194d0b3732eSbholler sub %r9,%r8 2195d0b3732eSbholler shr $0x4,%r9 2196d0b3732eSbholler sfence 2197d0b3732eSbholler 2198d0b3732eSbholler movslq (%r10,%r9,4),%r11 2199d0b3732eSbholler lea (%r11,%r10,1),%r10 2200d0b3732eSbholler jmpq *%r10 2201d0b3732eSbholler 2202d0b3732eSbholler .balign 16 2203d0b3732eSbhollerL(Fix16EndTable): /* 32-bit offsets relative to this table, indexed by the number of whole 16-byte blocks still to copy. */ 2204d0b3732eSbholler .int L(fix16_0)-L(Fix16EndTable) 2205d0b3732eSbholler .int L(fix16_1)-L(Fix16EndTable) 2206d0b3732eSbholler .int L(fix16_2)-L(Fix16EndTable) 2207d0b3732eSbholler .int L(fix16_3)-L(Fix16EndTable) 2208d0b3732eSbholler 2209d0b3732eSbholler .balign 16 2210d0b3732eSbhollerL(fix16_3): 2211d0b3732eSbholler movdqu -0x30(%rdx),%xmm1 
2212d0b3732eSbholler movdqa %xmm1,-0x30(%rcx) 2213d0b3732eSbhollerL(fix16_2): 2214d0b3732eSbholler movdqu -0x20(%rdx),%xmm2 2215d0b3732eSbholler movdqa %xmm2,-0x20(%rcx) 2216d0b3732eSbhollerL(fix16_1): 2217d0b3732eSbholler movdqu -0x10(%rdx),%xmm3 2218d0b3732eSbholler movdqa %xmm3,-0x10(%rcx) 2219d0b3732eSbhollerL(fix16_0): /* Whole 16-byte blocks are done: add the remainder in %r8 to both pointers and jump through the fwdPxQx offset table, indexed by %r8 (the table is defined outside this view). */ 2220d0b3732eSbholler lea L(fwdPxQx)(%rip),%r10 2221d0b3732eSbholler add %r8,%rdx 2222d0b3732eSbholler add %r8,%rcx 2223d0b3732eSbholler 2224d0b3732eSbholler movslq (%r10,%r8,4),%r9 2225d0b3732eSbholler lea (%r9,%r10,1),%r10 2226d0b3732eSbholler jmpq *%r10 2227d0b3732eSbholler 2228d0b3732eSbholler .balign 16 2229d0b3732eSbhollerL(pre_both_aligned): /* Skip the 128-byte loop when fewer than 0x80 bytes remain. */ 2230d0b3732eSbholler cmp $0x80,%r8 2231d0b3732eSbholler jl L(fix_16b) 2232d0b3732eSbholler 2233d0b3732eSbholler .balign 16 2234d0b3732eSbhollerL(both_aligned): /* 128 bytes per pass using aligned 16-byte loads and stores (movdqa) on both source and destination; the flags from cmp $0x80,%r8 are consumed by the jge at the bottom of the loop. */ 2235d0b3732eSbholler 2236d0b3732eSbholler /* 2237d0b3732eSbholler * this 'paired' load/load/store/store seems to do best. 2238d0b3732eSbholler */ 2239d0b3732eSbholler movdqa (%rdx),%xmm0 2240d0b3732eSbholler movdqa 0x10(%rdx),%xmm1 2241d0b3732eSbholler 2242d0b3732eSbholler movdqa %xmm0,(%rcx) 2243d0b3732eSbholler movdqa %xmm1,0x10(%rcx) 2244d0b3732eSbholler lea -0x80(%r8),%r8 2245d0b3732eSbholler 2246d0b3732eSbholler movdqa 0x20(%rdx),%xmm2 2247d0b3732eSbholler movdqa 0x30(%rdx),%xmm3 2248d0b3732eSbholler 2249d0b3732eSbholler movdqa %xmm2,0x20(%rcx) 2250d0b3732eSbholler movdqa %xmm3,0x30(%rcx) 2251d0b3732eSbholler 2252d0b3732eSbholler movdqa 0x40(%rdx),%xmm0 2253d0b3732eSbholler movdqa 0x50(%rdx),%xmm1 2254d0b3732eSbholler cmp $0x80,%r8 2255d0b3732eSbholler 2256d0b3732eSbholler movdqa %xmm0,0x40(%rcx) 2257d0b3732eSbholler movdqa %xmm1,0x50(%rcx) 2258d0b3732eSbholler 2259d0b3732eSbholler movdqa 0x60(%rdx),%xmm2 2260d0b3732eSbholler movdqa 0x70(%rdx),%xmm3 2261d0b3732eSbholler lea 0x80(%rdx),%rdx 2262d0b3732eSbholler movdqa %xmm2,0x60(%rcx) 2263d0b3732eSbholler movdqa %xmm3,0x70(%rcx) 2264d0b3732eSbholler lea 0x80(%rcx),%rcx 2265d0b3732eSbholler 
jge L(both_aligned) 2266d0b3732eSbholler 2267d0b3732eSbhollerL(fix_16b): /* Tail of the aligned loop: point both registers past the remaining %r8 bytes and dispatch through the fwdPxQx offset table. */ 2268d0b3732eSbholler add %r8,%rcx 2269d0b3732eSbholler lea L(fwdPxQx)(%rip),%r10 2270d0b3732eSbholler add %r8,%rdx 2271d0b3732eSbholler 2272d0b3732eSbholler movslq (%r10,%r8,4),%r9 2273d0b3732eSbholler lea (%r9,%r10,1),%r10 2274d0b3732eSbholler jmpq *%r10 2275d0b3732eSbholler 2276d0b3732eSbholler .balign 16 2277d0b3732eSbhollerL(Loop8byte_pre): /* Pick the 8-byte copy strategy from the length in %r8: at least half of .largest_level_cache_size goes to the non-temporal loop; more than 4096 bytes but no more than .amd64cache1half goes to rep movsq; everything else falls into the unrolled loop at byte8_top. */ 2278d0b3732eSbholler # Use 8-byte moves 2279d0b3732eSbholler mov .largest_level_cache_size(%rip),%r9d 2280d0b3732eSbholler shr %r9 # take half of it 2281d0b3732eSbholler cmp %r9,%r8 2282*fad5204eSbostrovs jge L(byte8_nt_top) 2283d0b3732eSbholler # Find out whether to use rep movsq 2284d0b3732eSbholler cmp $4096,%r8 2285d0b3732eSbholler jle L(byte8_top) 2286d0b3732eSbholler mov .amd64cache1half(%rip),%r9d # half of l1 cache 2287d0b3732eSbholler cmp %r9,%r8 2288d0b3732eSbholler jle L(use_rep) 2289d0b3732eSbholler 2290d0b3732eSbholler .balign 16 2291d0b3732eSbhollerL(byte8_top): /* 64 bytes per pass as eight 8-byte load/store pairs. The loop repeats only while more than 0x40 bytes remain (jg), so a remaining count of exactly 0x40 is left for the byte8_end dispatch. */ 2292d0b3732eSbholler mov (%rdx),%r9 2293d0b3732eSbholler mov 0x8(%rdx),%r10 2294d0b3732eSbholler lea -0x40(%r8),%r8 2295d0b3732eSbholler mov %r9,(%rcx) 2296d0b3732eSbholler mov %r10,0x8(%rcx) 2297d0b3732eSbholler mov 0x10(%rdx),%r11 2298d0b3732eSbholler mov 0x18(%rdx),%r9 2299d0b3732eSbholler mov %r11,0x10(%rcx) 2300d0b3732eSbholler mov %r9,0x18(%rcx) 2301d0b3732eSbholler 2302d0b3732eSbholler cmp $0x40,%r8 2303d0b3732eSbholler mov 0x20(%rdx),%r10 2304d0b3732eSbholler mov 0x28(%rdx),%r11 2305d0b3732eSbholler mov %r10,0x20(%rcx) 2306d0b3732eSbholler mov %r11,0x28(%rcx) 2307d0b3732eSbholler mov 0x30(%rdx),%r9 2308d0b3732eSbholler mov 0x38(%rdx),%r10 2309d0b3732eSbholler lea 0x40(%rdx),%rdx 2310d0b3732eSbholler mov %r9,0x30(%rcx) 2311d0b3732eSbholler mov %r10,0x38(%rcx) 2312d0b3732eSbholler lea 0x40(%rcx),%rcx 2313d0b3732eSbholler jg L(byte8_top) 2314d0b3732eSbholler 2315d0b3732eSbhollerL(byte8_end): /* Advance both pointers by the remaining count in %r8 and dispatch through the fwdPxQx offset table to finish the copy. */ 2316d0b3732eSbholler lea L(fwdPxQx)(%rip),%r10 2317d0b3732eSbholler lea 
(%rdx,%r8,1),%rdx 2318d0b3732eSbholler lea (%rcx,%r8,1),%rcx 2319d0b3732eSbholler 2320d0b3732eSbholler movslq (%r10,%r8,4),%r9 2321d0b3732eSbholler lea (%r9,%r10,1),%r10 2322d0b3732eSbholler jmpq *%r10 2323d0b3732eSbholler 2324d0b3732eSbholler .balign 16 2325d0b3732eSbhollerL(use_rep): /* Copy %r8/8 quadwords with rep movsq, which works on %rsi/%rdi/%rcx, then move the advanced pointers back into %rdx/%rcx. If 1-7 bytes are left over they are finished through byte8_end; otherwise return directly. */ 2326d0b3732eSbholler mov %rdx,%rsi # %rsi = source 2327d0b3732eSbholler mov %rcx,%rdi # %rdi = destination 2328d0b3732eSbholler mov %r8,%rcx # %rcx = count 2329d0b3732eSbholler shrq $3,%rcx # 8-byte word count 23307c478bd9Sstevel@tonic-gate rep 23317c478bd9Sstevel@tonic-gate movsq 2332d0b3732eSbholler mov %rsi,%rdx # source 2333d0b3732eSbholler mov %rdi,%rcx # destination 2334d0b3732eSbholler andq $7,%r8 # remainder 2335d0b3732eSbholler jnz L(byte8_end) 23367c478bd9Sstevel@tonic-gate ret 23377c478bd9Sstevel@tonic-gate 2338d0b3732eSbholler .balign 16 2339d0b3732eSbhollerL(byte8_nt_top): /* Non-temporal 64-byte loop: eight 8-byte loads stored with movnti, with a prefetchnta 0x180 bytes ahead of the source. The sfence after the loop orders the non-temporal stores before the remaining bytes are copied via byte8_end. */ 2340d0b3732eSbholler sub $0x40,%r8 2341d0b3732eSbholler prefetchnta 0x180(%rdx) 2342d0b3732eSbholler mov (%rdx),%r9 2343d0b3732eSbholler movnti %r9,(%rcx) 2344d0b3732eSbholler mov 0x8(%rdx),%r10 2345d0b3732eSbholler movnti %r10,0x8(%rcx) 2346d0b3732eSbholler mov 0x10(%rdx),%r11 2347d0b3732eSbholler movnti %r11,0x10(%rcx) 2348d0b3732eSbholler mov 0x18(%rdx),%r9 2349d0b3732eSbholler movnti %r9,0x18(%rcx) 2350d0b3732eSbholler mov 0x20(%rdx),%r10 2351d0b3732eSbholler movnti %r10,0x20(%rcx) 2352d0b3732eSbholler mov 0x28(%rdx),%r11 2353d0b3732eSbholler movnti %r11,0x28(%rcx) 2354d0b3732eSbholler mov 0x30(%rdx),%r9 2355d0b3732eSbholler movnti %r9,0x30(%rcx) 2356d0b3732eSbholler mov 0x38(%rdx),%r10 2357d0b3732eSbholler movnti %r10,0x38(%rcx) 23587c478bd9Sstevel@tonic-gate 2359d0b3732eSbholler lea 0x40(%rdx),%rdx 2360d0b3732eSbholler lea 0x40(%rcx),%rcx 2361d0b3732eSbholler cmp $0x40,%r8 2362d0b3732eSbholler jge L(byte8_nt_top) 2363d0b3732eSbholler sfence 2364d0b3732eSbholler jmp L(byte8_end) 23657c478bd9Sstevel@tonic-gate 2366d0b3732eSbholler SET_SIZE(memcpy) 23677c478bd9Sstevel@tonic-gate 2368d0b3732eSbholler 
.balign 16 2369d0b3732eSbhollerL(CopyBackwards): /* Backward copy entry. Moves the incoming %rdx (length) to %r8, %rdi (destination) to %rcx and %rsi (source) to %rdx, and sets %rax to the destination as the return value. Both pointers are then moved one past the end of their buffers so the copy runs from high to low addresses. */ 2370d0b3732eSbholler mov %rdx,%r8 2371d0b3732eSbholler mov %rdi,%rcx 2372d0b3732eSbholler mov %rsi,%rdx 2373d0b3732eSbholler mov %rdi,%rax # return value 23747c478bd9Sstevel@tonic-gate 2375d0b3732eSbholler # ck alignment of last byte 2376d0b3732eSbholler lea (%rcx,%r8,1),%rcx 2377d0b3732eSbholler test $0x7,%rcx 2378d0b3732eSbholler lea (%rdx,%r8,1),%rdx 2379d0b3732eSbholler jne L(bk_align) 23807c478bd9Sstevel@tonic-gate 2381d0b3732eSbhollerL(bk_qw_aligned): /* Up to 0x90 bytes: rewind both pointers to the buffer starts and dispatch through the bkPxQx offset table indexed by the length. Longer copies continue at bk_ck_sse2_alignment. */ 2382d0b3732eSbholler lea L(bkPxQx)(%rip),%r10 23837c478bd9Sstevel@tonic-gate 2384d0b3732eSbholler cmp $0x90,%r8 # 144 2385d0b3732eSbholler jg L(bk_ck_sse2_alignment) 23867c478bd9Sstevel@tonic-gate 2387d0b3732eSbholler sub %r8,%rcx 23887c478bd9Sstevel@tonic-gate sub %r8,%rdx 23897c478bd9Sstevel@tonic-gate 2390d0b3732eSbholler movslq (%r10,%r8,4),%r9 2391d0b3732eSbholler lea (%r9,%r10,1),%r10 2392d0b3732eSbholler jmpq *%r10 23937c478bd9Sstevel@tonic-gate 2394d0b3732eSbholler .balign 16 2395d0b3732eSbhollerL(bk_align): /* Peel 1, 2 and then 4 bytes off the end, as indicated by the low bits of %rcx, until the destination end is 8-byte aligned. */ 2396d0b3732eSbholler # only align if len > 8 2397d0b3732eSbholler cmp $8,%r8 2398d0b3732eSbholler jle L(bk_qw_aligned) 2399d0b3732eSbholler test $0x1,%rcx 2400d0b3732eSbholler je L(bk_tst2) 24017c478bd9Sstevel@tonic-gate dec %rcx 2402d0b3732eSbholler dec %rdx 2403d0b3732eSbholler dec %r8 2404d0b3732eSbholler mov (%rdx),%r9b 2405d0b3732eSbholler mov %r9b,(%rcx) 24067c478bd9Sstevel@tonic-gate 2407d0b3732eSbhollerL(bk_tst2): 2408d0b3732eSbholler test $0x2,%rcx 2409d0b3732eSbholler je L(bk_tst3) 24107c478bd9Sstevel@tonic-gate 2411d0b3732eSbhollerL(bk_got2): 2412d0b3732eSbholler sub $0x2,%rcx 2413d0b3732eSbholler sub $0x2,%rdx 2414d0b3732eSbholler sub $0x2,%r8 2415d0b3732eSbholler movzwq (%rdx),%r9 2416d0b3732eSbholler mov %r9w,(%rcx) 24177c478bd9Sstevel@tonic-gate 2418d0b3732eSbhollerL(bk_tst3): 2419d0b3732eSbholler test $0x4,%rcx 2420d0b3732eSbholler je L(bk_qw_aligned) 24217c478bd9Sstevel@tonic-gate 2422d0b3732eSbhollerL(bk_got3): 2423d0b3732eSbholler 
sub $0x4,%rcx 2424d0b3732eSbholler sub $0x4,%rdx 2425d0b3732eSbholler sub $0x4,%r8 2426d0b3732eSbholler mov (%rdx),%r9d 2427d0b3732eSbholler mov %r9d,(%rcx) 2428d0b3732eSbholler jmp L(bk_qw_aligned) 24297c478bd9Sstevel@tonic-gate 2430d0b3732eSbholler .balign 16 2431d0b3732eSbhollerL(bk_ck_sse2_alignment): /* Take the rep movsq path when .memops_method equals NO_SSE. Otherwise, if the destination end is not 16-byte aligned, copy one more quadword (bk_sse2_align) before entering the SSE2 loop. */ 2432d0b3732eSbholler cmpl $NO_SSE,.memops_method(%rip) 2433d0b3732eSbholler je L(bk_use_rep) 2434d0b3732eSbholler # check alignment of last byte 2435d0b3732eSbholler test $0xf,%rcx 2436d0b3732eSbholler jz L(bk_sse2_cpy) 24377c478bd9Sstevel@tonic-gate 2438d0b3732eSbhollerL(bk_sse2_align): 2439d0b3732eSbholler # only here if already aligned on at least a qword bndry 2440d0b3732eSbholler sub $0x8,%rcx 2441d0b3732eSbholler sub $0x8,%rdx 2442d0b3732eSbholler sub $0x8,%r8 2443d0b3732eSbholler mov (%rdx),%r9 2444d0b3732eSbholler mov %r9,(%rcx) 2445d0b3732eSbholler #jmp L(bk_sse2_cpy) 24467c478bd9Sstevel@tonic-gate 2447d0b3732eSbholler .balign 16 2448d0b3732eSbhollerL(bk_sse2_cpy): /* 128 bytes per pass from high to low addresses: unaligned 16-byte loads (movdqu) and aligned 16-byte stores (movdqa). The flags from cmp $0x80,%r8 are consumed by the jge at the bottom of the loop. */ 2449d0b3732eSbholler sub $0x80,%rcx # 128 2450d0b3732eSbholler sub $0x80,%rdx 2451d0b3732eSbholler movdqu 0x70(%rdx),%xmm3 2452d0b3732eSbholler movdqu 0x60(%rdx),%xmm2 2453d0b3732eSbholler movdqa %xmm3,0x70(%rcx) 2454d0b3732eSbholler movdqa %xmm2,0x60(%rcx) 2455d0b3732eSbholler sub $0x80,%r8 2456d0b3732eSbholler movdqu 0x50(%rdx),%xmm1 2457d0b3732eSbholler movdqu 0x40(%rdx),%xmm0 2458d0b3732eSbholler movdqa %xmm1,0x50(%rcx) 2459d0b3732eSbholler movdqa %xmm0,0x40(%rcx) 24607c478bd9Sstevel@tonic-gate 2461d0b3732eSbholler cmp $0x80,%r8 2462d0b3732eSbholler movdqu 0x30(%rdx),%xmm3 2463d0b3732eSbholler movdqu 0x20(%rdx),%xmm2 2464d0b3732eSbholler movdqa %xmm3,0x30(%rcx) 2465d0b3732eSbholler movdqa %xmm2,0x20(%rcx) 2466d0b3732eSbholler movdqu 0x10(%rdx),%xmm1 2467d0b3732eSbholler movdqu (%rdx),%xmm0 2468d0b3732eSbholler movdqa %xmm1,0x10(%rcx) 2469d0b3732eSbholler movdqa %xmm0,(%rcx) 2470d0b3732eSbholler jge L(bk_sse2_cpy) 24717c478bd9Sstevel@tonic-gate 2472d0b3732eSbhollerL(bk_sse2_cpy_end): 
2473d0b3732eSbholler lea L(bkPxQx)(%rip),%r10 2474d0b3732eSbholler sub %r8,%rdx 2475d0b3732eSbholler sub %r8,%rcx 2476d0b3732eSbholler movslq (%r10,%r8,4),%r9 2477d0b3732eSbholler lea (%r9,%r10,1),%r10 2478d0b3732eSbholler jmpq *%r10 24797c478bd9Sstevel@tonic-gate 2480d0b3732eSbholler .balign 16 2481d0b3732eSbhollerL(bk_use_rep): /* Backward rep movsq. The destination end is parked in %r9 because rep movsq needs %rcx for the count. %rsi/%rdi start at the last quadword of each buffer, and the direction flag is set for the string move and cleared again right after it. Both pointers are then rewound to the buffer starts and any 1-7 leftover bytes are copied through the bkPxQx table; with no remainder the code returns directly. */ 2482d0b3732eSbholler xchg %rcx,%r9 2483d0b3732eSbholler mov %rdx,%rsi # source 2484d0b3732eSbholler mov %r9,%rdi # destination 2485d0b3732eSbholler mov %r8,%rcx # count 2486d0b3732eSbholler sub $8,%rsi 2487d0b3732eSbholler sub $8,%rdi 2488d0b3732eSbholler shr $3,%rcx 2489d0b3732eSbholler std # reverse direction 2490d0b3732eSbholler rep 2491d0b3732eSbholler movsq 2492d0b3732eSbholler cld # reset direction flag 2493d0b3732eSbholler 2494d0b3732eSbholler xchg %rcx,%r9 2495d0b3732eSbholler lea L(bkPxQx)(%rip),%r10 2496d0b3732eSbholler sub %r8,%rdx 2497d0b3732eSbholler sub %r8,%rcx 2498d0b3732eSbholler andq $7,%r8 # remainder 2499d0b3732eSbholler jz 2f 2500d0b3732eSbholler movslq (%r10,%r8,4),%r9 2501d0b3732eSbholler lea (%r9,%r10,1),%r10 2502d0b3732eSbholler jmpq *%r10 2503d0b3732eSbholler2: 25047c478bd9Sstevel@tonic-gate ret 25057c478bd9Sstevel@tonic-gate 2506d0b3732eSbholler .balign 16 2507d0b3732eSbhollerL(bkP0QI): /* Fall-through chain of quadword copies: each label copies the 8 bytes at the offset shown and falls into the next lower one, ending with the ret at bkP0Q0. Entering at bkP0QI copies 0x90 bytes in total. */ 2508d0b3732eSbholler mov 0x88(%rdx),%r10 2509d0b3732eSbholler mov %r10,0x88(%rcx) 2510d0b3732eSbhollerL(bkP0QH): 2511d0b3732eSbholler mov 0x80(%rdx),%r10 2512d0b3732eSbholler mov %r10,0x80(%rcx) 2513d0b3732eSbhollerL(bkP0QG): 2514d0b3732eSbholler mov 0x78(%rdx),%r9 2515d0b3732eSbholler mov %r9,0x78(%rcx) 2516d0b3732eSbhollerL(bkP0QF): 2517d0b3732eSbholler mov 0x70(%rdx),%r11 2518d0b3732eSbholler mov %r11,0x70(%rcx) 2519d0b3732eSbhollerL(bkP0QE): 2520d0b3732eSbholler mov 0x68(%rdx),%r10 2521d0b3732eSbholler mov %r10,0x68(%rcx) 2522d0b3732eSbhollerL(bkP0QD): 2523d0b3732eSbholler mov 0x60(%rdx),%r9 2524d0b3732eSbholler mov %r9,0x60(%rcx) 2525d0b3732eSbhollerL(bkP0QC): 2526d0b3732eSbholler mov 0x58(%rdx),%r11 2527d0b3732eSbholler mov 
%r11,0x58(%rcx) 2528d0b3732eSbhollerL(bkP0QB): 2529d0b3732eSbholler mov 0x50(%rdx),%r10 2530d0b3732eSbholler mov %r10,0x50(%rcx) 2531d0b3732eSbhollerL(bkP0QA): 2532d0b3732eSbholler mov 0x48(%rdx),%r9 2533d0b3732eSbholler mov %r9,0x48(%rcx) 2534d0b3732eSbhollerL(bkP0Q9): 2535d0b3732eSbholler mov 0x40(%rdx),%r11 2536d0b3732eSbholler mov %r11,0x40(%rcx) 2537d0b3732eSbhollerL(bkP0Q8): 2538d0b3732eSbholler mov 0x38(%rdx),%r10 2539d0b3732eSbholler mov %r10,0x38(%rcx) 2540d0b3732eSbhollerL(bkP0Q7): 2541d0b3732eSbholler mov 0x30(%rdx),%r9 2542d0b3732eSbholler mov %r9,0x30(%rcx) 2543d0b3732eSbhollerL(bkP0Q6): 2544d0b3732eSbholler mov 0x28(%rdx),%r11 2545d0b3732eSbholler mov %r11,0x28(%rcx) 2546d0b3732eSbhollerL(bkP0Q5): 2547d0b3732eSbholler mov 0x20(%rdx),%r10 2548d0b3732eSbholler mov %r10,0x20(%rcx) 2549d0b3732eSbhollerL(bkP0Q4): 2550d0b3732eSbholler mov 0x18(%rdx),%r9 2551d0b3732eSbholler mov %r9,0x18(%rcx) 2552d0b3732eSbhollerL(bkP0Q3): 2553d0b3732eSbholler mov 0x10(%rdx),%r11 2554d0b3732eSbholler mov %r11,0x10(%rcx) 2555d0b3732eSbhollerL(bkP0Q2): 2556d0b3732eSbholler mov 0x8(%rdx),%r10 2557d0b3732eSbholler mov %r10,0x8(%rcx) 2558d0b3732eSbhollerL(bkP0Q1): 2559d0b3732eSbholler mov (%rdx),%r9 2560d0b3732eSbholler mov %r9,(%rcx) 2561d0b3732eSbhollerL(bkP0Q0): 2562d0b3732eSbholler ret 25637c478bd9Sstevel@tonic-gate 2564d0b3732eSbholler .balign 16 2565d0b3732eSbhollerL(bkP1QI): /* Fall-through chain for lengths of the form 8*k+1: each label copies the quadword at the offset shown (all offsets are 1 mod 8) and falls into the next lower one; bkP1Q0 finishes with the single byte at offset 0 and returns. */ 2566d0b3732eSbholler mov 0x89(%rdx),%r10 2567d0b3732eSbholler mov %r10,0x89(%rcx) 2568d0b3732eSbhollerL(bkP1QH): 2569d0b3732eSbholler mov 0x81(%rdx),%r11 2570d0b3732eSbholler mov %r11,0x81(%rcx) 2571d0b3732eSbhollerL(bkP1QG): 2572d0b3732eSbholler mov 0x79(%rdx),%r10 2573d0b3732eSbholler mov %r10,0x79(%rcx) 2574d0b3732eSbhollerL(bkP1QF): 2575d0b3732eSbholler mov 0x71(%rdx),%r9 2576d0b3732eSbholler mov %r9,0x71(%rcx) 2577d0b3732eSbhollerL(bkP1QE): 2578d0b3732eSbholler mov 0x69(%rdx),%r11 2579d0b3732eSbholler mov %r11,0x69(%rcx) 2580d0b3732eSbhollerL(bkP1QD): 2581d0b3732eSbholler mov 0x61(%rdx),%r10 
2582d0b3732eSbholler mov %r10,0x61(%rcx) 2583d0b3732eSbhollerL(bkP1QC): 2584d0b3732eSbholler mov 0x59(%rdx),%r9 2585d0b3732eSbholler mov %r9,0x59(%rcx) 2586d0b3732eSbhollerL(bkP1QB): 2587d0b3732eSbholler mov 0x51(%rdx),%r11 2588d0b3732eSbholler mov %r11,0x51(%rcx) 2589d0b3732eSbhollerL(bkP1QA): 2590d0b3732eSbholler mov 0x49(%rdx),%r10 2591d0b3732eSbholler mov %r10,0x49(%rcx) 2592d0b3732eSbhollerL(bkP1Q9): 2593d0b3732eSbholler mov 0x41(%rdx),%r9 2594d0b3732eSbholler mov %r9,0x41(%rcx) 2595d0b3732eSbhollerL(bkP1Q8): 2596d0b3732eSbholler mov 0x39(%rdx),%r11 2597d0b3732eSbholler mov %r11,0x39(%rcx) 2598d0b3732eSbhollerL(bkP1Q7): 2599d0b3732eSbholler mov 0x31(%rdx),%r10 2600d0b3732eSbholler mov %r10,0x31(%rcx) 2601d0b3732eSbhollerL(bkP1Q6): 2602d0b3732eSbholler mov 0x29(%rdx),%r9 2603d0b3732eSbholler mov %r9,0x29(%rcx) 2604d0b3732eSbhollerL(bkP1Q5): 2605d0b3732eSbholler mov 0x21(%rdx),%r11 2606d0b3732eSbholler mov %r11,0x21(%rcx) 2607d0b3732eSbhollerL(bkP1Q4): 2608d0b3732eSbholler mov 0x19(%rdx),%r10 2609d0b3732eSbholler mov %r10,0x19(%rcx) 2610d0b3732eSbhollerL(bkP1Q3): 2611d0b3732eSbholler mov 0x11(%rdx),%r9 2612d0b3732eSbholler mov %r9,0x11(%rcx) 2613d0b3732eSbhollerL(bkP1Q2): 2614d0b3732eSbholler mov 0x9(%rdx),%r11 2615d0b3732eSbholler mov %r11,0x9(%rcx) 2616d0b3732eSbhollerL(bkP1Q1): 2617d0b3732eSbholler mov 0x1(%rdx),%r10 2618d0b3732eSbholler mov %r10,0x1(%rcx) 2619d0b3732eSbhollerL(bkP1Q0): 2620d0b3732eSbholler mov (%rdx),%r9b 2621d0b3732eSbholler mov %r9b,(%rcx) 2622d0b3732eSbholler ret 2623d0b3732eSbholler 2624d0b3732eSbholler .balign 16 2625d0b3732eSbhollerL(bkP2QI): 2626d0b3732eSbholler mov 0x8a(%rdx),%r10 2627d0b3732eSbholler mov %r10,0x8a(%rcx) 2628d0b3732eSbhollerL(bkP2QH): 2629d0b3732eSbholler mov 0x82(%rdx),%r11 2630d0b3732eSbholler mov %r11,0x82(%rcx) 2631d0b3732eSbhollerL(bkP2QG): 2632d0b3732eSbholler mov 0x7a(%rdx),%r10 2633d0b3732eSbholler mov %r10,0x7a(%rcx) 2634d0b3732eSbhollerL(bkP2QF): 2635d0b3732eSbholler mov 0x72(%rdx),%r9 2636d0b3732eSbholler 
mov %r9,0x72(%rcx) 2637d0b3732eSbhollerL(bkP2QE): 2638d0b3732eSbholler mov 0x6a(%rdx),%r11 2639d0b3732eSbholler mov %r11,0x6a(%rcx) 2640d0b3732eSbhollerL(bkP2QD): 2641d0b3732eSbholler mov 0x62(%rdx),%r10 2642d0b3732eSbholler mov %r10,0x62(%rcx) 2643d0b3732eSbhollerL(bkP2QC): 2644d0b3732eSbholler mov 0x5a(%rdx),%r9 2645d0b3732eSbholler mov %r9,0x5a(%rcx) 2646d0b3732eSbhollerL(bkP2QB): 2647d0b3732eSbholler mov 0x52(%rdx),%r11 2648d0b3732eSbholler mov %r11,0x52(%rcx) 2649d0b3732eSbhollerL(bkP2QA): 2650d0b3732eSbholler mov 0x4a(%rdx),%r10 2651d0b3732eSbholler mov %r10,0x4a(%rcx) 2652d0b3732eSbhollerL(bkP2Q9): 2653d0b3732eSbholler mov 0x42(%rdx),%r9 2654d0b3732eSbholler mov %r9,0x42(%rcx) 2655d0b3732eSbhollerL(bkP2Q8): 2656d0b3732eSbholler mov 0x3a(%rdx),%r11 2657d0b3732eSbholler mov %r11,0x3a(%rcx) 2658d0b3732eSbhollerL(bkP2Q7): 2659d0b3732eSbholler mov 0x32(%rdx),%r10 2660d0b3732eSbholler mov %r10,0x32(%rcx) 2661d0b3732eSbhollerL(bkP2Q6): 2662d0b3732eSbholler mov 0x2a(%rdx),%r9 2663d0b3732eSbholler mov %r9,0x2a(%rcx) 2664d0b3732eSbhollerL(bkP2Q5): 2665d0b3732eSbholler mov 0x22(%rdx),%r11 2666d0b3732eSbholler mov %r11,0x22(%rcx) 2667d0b3732eSbhollerL(bkP2Q4): 2668d0b3732eSbholler mov 0x1a(%rdx),%r10 2669d0b3732eSbholler mov %r10,0x1a(%rcx) 2670d0b3732eSbhollerL(bkP2Q3): 2671d0b3732eSbholler mov 0x12(%rdx),%r9 2672d0b3732eSbholler mov %r9,0x12(%rcx) 2673d0b3732eSbhollerL(bkP2Q2): 2674d0b3732eSbholler mov 0xa(%rdx),%r11 2675d0b3732eSbholler mov %r11,0xa(%rcx) 2676d0b3732eSbhollerL(bkP2Q1): 2677d0b3732eSbholler mov 0x2(%rdx),%r10 2678d0b3732eSbholler mov %r10,0x2(%rcx) 2679d0b3732eSbhollerL(bkP2Q0): 2680d0b3732eSbholler mov (%rdx),%r9w 2681d0b3732eSbholler mov %r9w,(%rcx) 2682d0b3732eSbholler ret 2683d0b3732eSbholler 2684d0b3732eSbholler .balign 16 2685d0b3732eSbhollerL(bkP3QI): 2686d0b3732eSbholler mov 0x8b(%rdx),%r10 2687d0b3732eSbholler mov %r10,0x8b(%rcx) 2688d0b3732eSbhollerL(bkP3QH): 2689d0b3732eSbholler mov 0x83(%rdx),%r11 2690d0b3732eSbholler mov %r11,0x83(%rcx) 
2691d0b3732eSbhollerL(bkP3QG): 2692d0b3732eSbholler mov 0x7b(%rdx),%r10 2693d0b3732eSbholler mov %r10,0x7b(%rcx) 2694d0b3732eSbhollerL(bkP3QF): 2695d0b3732eSbholler mov 0x73(%rdx),%r9 2696d0b3732eSbholler mov %r9,0x73(%rcx) 2697d0b3732eSbhollerL(bkP3QE): 2698d0b3732eSbholler mov 0x6b(%rdx),%r11 2699d0b3732eSbholler mov %r11,0x6b(%rcx) 2700d0b3732eSbhollerL(bkP3QD): 2701d0b3732eSbholler mov 0x63(%rdx),%r10 2702d0b3732eSbholler mov %r10,0x63(%rcx) 2703d0b3732eSbhollerL(bkP3QC): 2704d0b3732eSbholler mov 0x5b(%rdx),%r9 2705d0b3732eSbholler mov %r9,0x5b(%rcx) 2706d0b3732eSbhollerL(bkP3QB): 2707d0b3732eSbholler mov 0x53(%rdx),%r11 2708d0b3732eSbholler mov %r11,0x53(%rcx) 2709d0b3732eSbhollerL(bkP3QA): 2710d0b3732eSbholler mov 0x4b(%rdx),%r10 2711d0b3732eSbholler mov %r10,0x4b(%rcx) 2712d0b3732eSbhollerL(bkP3Q9): 2713d0b3732eSbholler mov 0x43(%rdx),%r9 2714d0b3732eSbholler mov %r9,0x43(%rcx) 2715d0b3732eSbhollerL(bkP3Q8): 2716d0b3732eSbholler mov 0x3b(%rdx),%r11 2717d0b3732eSbholler mov %r11,0x3b(%rcx) 2718d0b3732eSbhollerL(bkP3Q7): 2719d0b3732eSbholler mov 0x33(%rdx),%r10 2720d0b3732eSbholler mov %r10,0x33(%rcx) 2721d0b3732eSbhollerL(bkP3Q6): 2722d0b3732eSbholler mov 0x2b(%rdx),%r9 2723d0b3732eSbholler mov %r9,0x2b(%rcx) 2724d0b3732eSbhollerL(bkP3Q5): 2725d0b3732eSbholler mov 0x23(%rdx),%r11 2726d0b3732eSbholler mov %r11,0x23(%rcx) 2727d0b3732eSbhollerL(bkP3Q4): 2728d0b3732eSbholler mov 0x1b(%rdx),%r10 2729d0b3732eSbholler mov %r10,0x1b(%rcx) 2730d0b3732eSbhollerL(bkP3Q3): 2731d0b3732eSbholler mov 0x13(%rdx),%r9 2732d0b3732eSbholler mov %r9,0x13(%rcx) 2733d0b3732eSbhollerL(bkP3Q2): 2734d0b3732eSbholler mov 0xb(%rdx),%r11 2735d0b3732eSbholler mov %r11,0xb(%rcx) 2736d0b3732eSbhollerL(bkP3Q1): 2737d0b3732eSbholler mov 0x3(%rdx),%r10 2738d0b3732eSbholler mov %r10,0x3(%rcx) 2739d0b3732eSbhollerL(bkP3Q0): # trailing loads/stores do all their loads 1st, then do the stores 2740d0b3732eSbholler mov 0x1(%rdx),%r9w 2741d0b3732eSbholler mov %r9w,0x1(%rcx) 2742d0b3732eSbholler mov 
(%rdx),%r10b 2743d0b3732eSbholler mov %r10b,(%rcx) 2744d0b3732eSbholler ret 2745d0b3732eSbholler 2746d0b3732eSbholler .balign 16 2747d0b3732eSbhollerL(bkP4QI): 2748d0b3732eSbholler mov 0x8c(%rdx),%r10 2749d0b3732eSbholler mov %r10,0x8c(%rcx) 2750d0b3732eSbhollerL(bkP4QH): 2751d0b3732eSbholler mov 0x84(%rdx),%r11 2752d0b3732eSbholler mov %r11,0x84(%rcx) 2753d0b3732eSbhollerL(bkP4QG): 2754d0b3732eSbholler mov 0x7c(%rdx),%r10 2755d0b3732eSbholler mov %r10,0x7c(%rcx) 2756d0b3732eSbhollerL(bkP4QF): 2757d0b3732eSbholler mov 0x74(%rdx),%r9 2758d0b3732eSbholler mov %r9,0x74(%rcx) 2759d0b3732eSbhollerL(bkP4QE): 2760d0b3732eSbholler mov 0x6c(%rdx),%r11 2761d0b3732eSbholler mov %r11,0x6c(%rcx) 2762d0b3732eSbhollerL(bkP4QD): 2763d0b3732eSbholler mov 0x64(%rdx),%r10 2764d0b3732eSbholler mov %r10,0x64(%rcx) 2765d0b3732eSbhollerL(bkP4QC): 2766d0b3732eSbholler mov 0x5c(%rdx),%r9 2767d0b3732eSbholler mov %r9,0x5c(%rcx) 2768d0b3732eSbhollerL(bkP4QB): 2769d0b3732eSbholler mov 0x54(%rdx),%r11 2770d0b3732eSbholler mov %r11,0x54(%rcx) 2771d0b3732eSbhollerL(bkP4QA): 2772d0b3732eSbholler mov 0x4c(%rdx),%r10 2773d0b3732eSbholler mov %r10,0x4c(%rcx) 2774d0b3732eSbhollerL(bkP4Q9): 2775d0b3732eSbholler mov 0x44(%rdx),%r9 2776d0b3732eSbholler mov %r9,0x44(%rcx) 2777d0b3732eSbhollerL(bkP4Q8): 2778d0b3732eSbholler mov 0x3c(%rdx),%r11 2779d0b3732eSbholler mov %r11,0x3c(%rcx) 2780d0b3732eSbhollerL(bkP4Q7): 2781d0b3732eSbholler mov 0x34(%rdx),%r10 2782d0b3732eSbholler mov %r10,0x34(%rcx) 2783d0b3732eSbhollerL(bkP4Q6): 2784d0b3732eSbholler mov 0x2c(%rdx),%r9 2785d0b3732eSbholler mov %r9,0x2c(%rcx) 2786d0b3732eSbhollerL(bkP4Q5): 2787d0b3732eSbholler mov 0x24(%rdx),%r11 2788d0b3732eSbholler mov %r11,0x24(%rcx) 2789d0b3732eSbhollerL(bkP4Q4): 2790d0b3732eSbholler mov 0x1c(%rdx),%r10 2791d0b3732eSbholler mov %r10,0x1c(%rcx) 2792d0b3732eSbhollerL(bkP4Q3): 2793d0b3732eSbholler mov 0x14(%rdx),%r9 2794d0b3732eSbholler mov %r9,0x14(%rcx) 2795d0b3732eSbhollerL(bkP4Q2): 2796d0b3732eSbholler mov 0xc(%rdx),%r11 
2797d0b3732eSbholler mov %r11,0xc(%rcx) 2798d0b3732eSbhollerL(bkP4Q1): 2799d0b3732eSbholler mov 0x4(%rdx),%r10 2800d0b3732eSbholler mov %r10,0x4(%rcx) 2801d0b3732eSbhollerL(bkP4Q0): 2802d0b3732eSbholler mov (%rdx),%r9d 2803d0b3732eSbholler mov %r9d,(%rcx) 2804d0b3732eSbholler ret 2805d0b3732eSbholler 2806d0b3732eSbholler .balign 16 2807d0b3732eSbhollerL(bkP5QI): 2808d0b3732eSbholler mov 0x8d(%rdx),%r10 2809d0b3732eSbholler mov %r10,0x8d(%rcx) 2810d0b3732eSbhollerL(bkP5QH): 2811d0b3732eSbholler mov 0x85(%rdx),%r9 2812d0b3732eSbholler mov %r9,0x85(%rcx) 2813d0b3732eSbhollerL(bkP5QG): 2814d0b3732eSbholler mov 0x7d(%rdx),%r11 2815d0b3732eSbholler mov %r11,0x7d(%rcx) 2816d0b3732eSbhollerL(bkP5QF): 2817d0b3732eSbholler mov 0x75(%rdx),%r10 2818d0b3732eSbholler mov %r10,0x75(%rcx) 2819d0b3732eSbhollerL(bkP5QE): 2820d0b3732eSbholler mov 0x6d(%rdx),%r9 2821d0b3732eSbholler mov %r9,0x6d(%rcx) 2822d0b3732eSbhollerL(bkP5QD): 2823d0b3732eSbholler mov 0x65(%rdx),%r11 2824d0b3732eSbholler mov %r11,0x65(%rcx) 2825d0b3732eSbhollerL(bkP5QC): 2826d0b3732eSbholler mov 0x5d(%rdx),%r10 2827d0b3732eSbholler mov %r10,0x5d(%rcx) 2828d0b3732eSbhollerL(bkP5QB): 2829d0b3732eSbholler mov 0x55(%rdx),%r9 2830d0b3732eSbholler mov %r9,0x55(%rcx) 2831d0b3732eSbhollerL(bkP5QA): 2832d0b3732eSbholler mov 0x4d(%rdx),%r11 2833d0b3732eSbholler mov %r11,0x4d(%rcx) 2834d0b3732eSbhollerL(bkP5Q9): 2835d0b3732eSbholler mov 0x45(%rdx),%r10 2836d0b3732eSbholler mov %r10,0x45(%rcx) 2837d0b3732eSbhollerL(bkP5Q8): 2838d0b3732eSbholler mov 0x3d(%rdx),%r9 2839d0b3732eSbholler mov %r9,0x3d(%rcx) 2840d0b3732eSbhollerL(bkP5Q7): 2841d0b3732eSbholler mov 0x35(%rdx),%r11 2842d0b3732eSbholler mov %r11,0x35(%rcx) 2843d0b3732eSbhollerL(bkP5Q6): 2844d0b3732eSbholler mov 0x2d(%rdx),%r10 2845d0b3732eSbholler mov %r10,0x2d(%rcx) 2846d0b3732eSbhollerL(bkP5Q5): 2847d0b3732eSbholler mov 0x25(%rdx),%r9 2848d0b3732eSbholler mov %r9,0x25(%rcx) 2849d0b3732eSbhollerL(bkP5Q4): 2850d0b3732eSbholler mov 0x1d(%rdx),%r11 2851d0b3732eSbholler 
mov %r11,0x1d(%rcx) 2852d0b3732eSbhollerL(bkP5Q3): 2853d0b3732eSbholler mov 0x15(%rdx),%r10 2854d0b3732eSbholler mov %r10,0x15(%rcx) 2855d0b3732eSbhollerL(bkP5Q2): 2856d0b3732eSbholler mov 0xd(%rdx),%r9 2857d0b3732eSbholler mov %r9,0xd(%rcx) 2858d0b3732eSbhollerL(bkP5Q1): 2859d0b3732eSbholler mov 0x5(%rdx),%r11 2860d0b3732eSbholler mov %r11,0x5(%rcx) 2861d0b3732eSbhollerL(bkP5Q0): # trailing loads/stores do all their loads 1st, then do the stores 2862d0b3732eSbholler mov 0x1(%rdx),%r9d 2863d0b3732eSbholler mov %r9d,0x1(%rcx) 2864d0b3732eSbholler mov (%rdx),%r10b 2865d0b3732eSbholler mov %r10b,(%rcx) 2866d0b3732eSbholler ret 2867d0b3732eSbholler 2868d0b3732eSbholler .balign 16 2869d0b3732eSbhollerL(bkP6QI): 2870d0b3732eSbholler mov 0x8e(%rdx),%r10 2871d0b3732eSbholler mov %r10,0x8e(%rcx) 2872d0b3732eSbhollerL(bkP6QH): 2873d0b3732eSbholler mov 0x86(%rdx),%r11 2874d0b3732eSbholler mov %r11,0x86(%rcx) 2875d0b3732eSbhollerL(bkP6QG): 2876d0b3732eSbholler mov 0x7e(%rdx),%r10 2877d0b3732eSbholler mov %r10,0x7e(%rcx) 2878d0b3732eSbhollerL(bkP6QF): 2879d0b3732eSbholler mov 0x76(%rdx),%r9 2880d0b3732eSbholler mov %r9,0x76(%rcx) 2881d0b3732eSbhollerL(bkP6QE): 2882d0b3732eSbholler mov 0x6e(%rdx),%r11 2883d0b3732eSbholler mov %r11,0x6e(%rcx) 2884d0b3732eSbhollerL(bkP6QD): 2885d0b3732eSbholler mov 0x66(%rdx),%r10 2886d0b3732eSbholler mov %r10,0x66(%rcx) 2887d0b3732eSbhollerL(bkP6QC): 2888d0b3732eSbholler mov 0x5e(%rdx),%r9 2889d0b3732eSbholler mov %r9,0x5e(%rcx) 2890d0b3732eSbhollerL(bkP6QB): 2891d0b3732eSbholler mov 0x56(%rdx),%r11 2892d0b3732eSbholler mov %r11,0x56(%rcx) 2893d0b3732eSbhollerL(bkP6QA): 2894d0b3732eSbholler mov 0x4e(%rdx),%r10 2895d0b3732eSbholler mov %r10,0x4e(%rcx) 2896d0b3732eSbhollerL(bkP6Q9): 2897d0b3732eSbholler mov 0x46(%rdx),%r9 2898d0b3732eSbholler mov %r9,0x46(%rcx) 2899d0b3732eSbhollerL(bkP6Q8): 2900d0b3732eSbholler mov 0x3e(%rdx),%r11 2901d0b3732eSbholler mov %r11,0x3e(%rcx) 2902d0b3732eSbhollerL(bkP6Q7): 2903d0b3732eSbholler mov 0x36(%rdx),%r10 
2904d0b3732eSbholler mov %r10,0x36(%rcx) 2905d0b3732eSbhollerL(bkP6Q6): 2906d0b3732eSbholler mov 0x2e(%rdx),%r9 2907d0b3732eSbholler mov %r9,0x2e(%rcx) 2908d0b3732eSbhollerL(bkP6Q5): 2909d0b3732eSbholler mov 0x26(%rdx),%r11 2910d0b3732eSbholler mov %r11,0x26(%rcx) 2911d0b3732eSbhollerL(bkP6Q4): 2912d0b3732eSbholler mov 0x1e(%rdx),%r10 2913d0b3732eSbholler mov %r10,0x1e(%rcx) 2914d0b3732eSbhollerL(bkP6Q3): 2915d0b3732eSbholler mov 0x16(%rdx),%r9 2916d0b3732eSbholler mov %r9,0x16(%rcx) 2917d0b3732eSbhollerL(bkP6Q2): 2918d0b3732eSbholler mov 0xe(%rdx),%r11 2919d0b3732eSbholler mov %r11,0xe(%rcx) 2920d0b3732eSbhollerL(bkP6Q1): 2921d0b3732eSbholler mov 0x6(%rdx),%r10 2922d0b3732eSbholler mov %r10,0x6(%rcx) 2923d0b3732eSbhollerL(bkP6Q0): # trailing loads/stores do all their loads 1st, then do the stores 2924d0b3732eSbholler mov 0x2(%rdx),%r9d 2925d0b3732eSbholler mov %r9d,0x2(%rcx) 2926d0b3732eSbholler mov (%rdx),%r10w 2927d0b3732eSbholler mov %r10w,(%rcx) 2928d0b3732eSbholler ret 2929d0b3732eSbholler 2930d0b3732eSbholler .balign 16 2931d0b3732eSbhollerL(bkP7QI): 2932d0b3732eSbholler mov 0x8f(%rdx),%r10 2933d0b3732eSbholler mov %r10,0x8f(%rcx) 2934d0b3732eSbhollerL(bkP7QH): 2935d0b3732eSbholler mov 0x87(%rdx),%r11 2936d0b3732eSbholler mov %r11,0x87(%rcx) 2937d0b3732eSbhollerL(bkP7QG): 2938d0b3732eSbholler mov 0x7f(%rdx),%r10 2939d0b3732eSbholler mov %r10,0x7f(%rcx) 2940d0b3732eSbhollerL(bkP7QF): 2941d0b3732eSbholler mov 0x77(%rdx),%r9 2942d0b3732eSbholler mov %r9,0x77(%rcx) 2943d0b3732eSbhollerL(bkP7QE): 2944d0b3732eSbholler mov 0x6f(%rdx),%r11 2945d0b3732eSbholler mov %r11,0x6f(%rcx) 2946d0b3732eSbhollerL(bkP7QD): 2947d0b3732eSbholler mov 0x67(%rdx),%r10 2948d0b3732eSbholler mov %r10,0x67(%rcx) 2949d0b3732eSbhollerL(bkP7QC): 2950d0b3732eSbholler mov 0x5f(%rdx),%r9 2951d0b3732eSbholler mov %r9,0x5f(%rcx) 2952d0b3732eSbhollerL(bkP7QB): 2953d0b3732eSbholler mov 0x57(%rdx),%r11 2954d0b3732eSbholler mov %r11,0x57(%rcx) 2955d0b3732eSbhollerL(bkP7QA): 2956d0b3732eSbholler mov 
0x4f(%rdx),%r10 2957d0b3732eSbholler mov %r10,0x4f(%rcx) 2958d0b3732eSbhollerL(bkP7Q9): 2959d0b3732eSbholler mov 0x47(%rdx),%r9 2960d0b3732eSbholler mov %r9,0x47(%rcx) 2961d0b3732eSbhollerL(bkP7Q8): 2962d0b3732eSbholler mov 0x3f(%rdx),%r11 2963d0b3732eSbholler mov %r11,0x3f(%rcx) 2964d0b3732eSbhollerL(bkP7Q7): 2965d0b3732eSbholler mov 0x37(%rdx),%r10 2966d0b3732eSbholler mov %r10,0x37(%rcx) 2967d0b3732eSbhollerL(bkP7Q6): 2968d0b3732eSbholler mov 0x2f(%rdx),%r9 2969d0b3732eSbholler mov %r9,0x2f(%rcx) 2970d0b3732eSbhollerL(bkP7Q5): 2971d0b3732eSbholler mov 0x27(%rdx),%r11 2972d0b3732eSbholler mov %r11,0x27(%rcx) 2973d0b3732eSbhollerL(bkP7Q4): 2974d0b3732eSbholler mov 0x1f(%rdx),%r10 2975d0b3732eSbholler mov %r10,0x1f(%rcx) 2976d0b3732eSbhollerL(bkP7Q3): 2977d0b3732eSbholler mov 0x17(%rdx),%r9 2978d0b3732eSbholler mov %r9,0x17(%rcx) 2979d0b3732eSbhollerL(bkP7Q2): 2980d0b3732eSbholler mov 0xf(%rdx),%r11 2981d0b3732eSbholler mov %r11,0xf(%rcx) 2982d0b3732eSbhollerL(bkP7Q1): 2983d0b3732eSbholler mov 0x7(%rdx),%r10 2984d0b3732eSbholler mov %r10,0x7(%rcx) 2985d0b3732eSbhollerL(bkP7Q0): # trailing loads/stores do all their loads 1st, then do the stores 2986d0b3732eSbholler mov 0x3(%rdx),%r9d 2987d0b3732eSbholler mov %r9d,0x3(%rcx) 2988d0b3732eSbholler mov 0x1(%rdx),%r10w 2989d0b3732eSbholler mov %r10w,0x1(%rcx) 2990d0b3732eSbholler mov (%rdx),%r11b 2991d0b3732eSbholler mov %r11b,(%rcx) 2992d0b3732eSbholler ret 2993d0b3732eSbholler 2994d0b3732eSbholler .balign 16 2995d0b3732eSbhollerL(bkPxQx): .int L(bkP0Q0)-L(bkPxQx) 2996d0b3732eSbholler .int L(bkP1Q0)-L(bkPxQx) 2997d0b3732eSbholler .int L(bkP2Q0)-L(bkPxQx) 2998d0b3732eSbholler .int L(bkP3Q0)-L(bkPxQx) 2999d0b3732eSbholler .int L(bkP4Q0)-L(bkPxQx) 3000d0b3732eSbholler .int L(bkP5Q0)-L(bkPxQx) 3001d0b3732eSbholler .int L(bkP6Q0)-L(bkPxQx) 3002d0b3732eSbholler .int L(bkP7Q0)-L(bkPxQx) 3003d0b3732eSbholler 3004d0b3732eSbholler .int L(bkP0Q1)-L(bkPxQx) 3005d0b3732eSbholler .int L(bkP1Q1)-L(bkPxQx) 3006d0b3732eSbholler .int 
L(bkP2Q1)-L(bkPxQx) 3007d0b3732eSbholler .int L(bkP3Q1)-L(bkPxQx) 3008d0b3732eSbholler .int L(bkP4Q1)-L(bkPxQx) 3009d0b3732eSbholler .int L(bkP5Q1)-L(bkPxQx) 3010d0b3732eSbholler .int L(bkP6Q1)-L(bkPxQx) 3011d0b3732eSbholler .int L(bkP7Q1)-L(bkPxQx) 3012d0b3732eSbholler 3013d0b3732eSbholler .int L(bkP0Q2)-L(bkPxQx) 3014d0b3732eSbholler .int L(bkP1Q2)-L(bkPxQx) 3015d0b3732eSbholler .int L(bkP2Q2)-L(bkPxQx) 3016d0b3732eSbholler .int L(bkP3Q2)-L(bkPxQx) 3017d0b3732eSbholler .int L(bkP4Q2)-L(bkPxQx) 3018d0b3732eSbholler .int L(bkP5Q2)-L(bkPxQx) 3019d0b3732eSbholler .int L(bkP6Q2)-L(bkPxQx) 3020d0b3732eSbholler .int L(bkP7Q2)-L(bkPxQx) 3021d0b3732eSbholler 3022d0b3732eSbholler .int L(bkP0Q3)-L(bkPxQx) 3023d0b3732eSbholler .int L(bkP1Q3)-L(bkPxQx) 3024d0b3732eSbholler .int L(bkP2Q3)-L(bkPxQx) 3025d0b3732eSbholler .int L(bkP3Q3)-L(bkPxQx) 3026d0b3732eSbholler .int L(bkP4Q3)-L(bkPxQx) 3027d0b3732eSbholler .int L(bkP5Q3)-L(bkPxQx) 3028d0b3732eSbholler .int L(bkP6Q3)-L(bkPxQx) 3029d0b3732eSbholler .int L(bkP7Q3)-L(bkPxQx) 3030d0b3732eSbholler 3031d0b3732eSbholler .int L(bkP0Q4)-L(bkPxQx) 3032d0b3732eSbholler .int L(bkP1Q4)-L(bkPxQx) 3033d0b3732eSbholler .int L(bkP2Q4)-L(bkPxQx) 3034d0b3732eSbholler .int L(bkP3Q4)-L(bkPxQx) 3035d0b3732eSbholler .int L(bkP4Q4)-L(bkPxQx) 3036d0b3732eSbholler .int L(bkP5Q4)-L(bkPxQx) 3037d0b3732eSbholler .int L(bkP6Q4)-L(bkPxQx) 3038d0b3732eSbholler .int L(bkP7Q4)-L(bkPxQx) 3039d0b3732eSbholler 3040d0b3732eSbholler .int L(bkP0Q5)-L(bkPxQx) 3041d0b3732eSbholler .int L(bkP1Q5)-L(bkPxQx) 3042d0b3732eSbholler .int L(bkP2Q5)-L(bkPxQx) 3043d0b3732eSbholler .int L(bkP3Q5)-L(bkPxQx) 3044d0b3732eSbholler .int L(bkP4Q5)-L(bkPxQx) 3045d0b3732eSbholler .int L(bkP5Q5)-L(bkPxQx) 3046d0b3732eSbholler .int L(bkP6Q5)-L(bkPxQx) 3047d0b3732eSbholler .int L(bkP7Q5)-L(bkPxQx) 3048d0b3732eSbholler 3049d0b3732eSbholler .int L(bkP0Q6)-L(bkPxQx) 3050d0b3732eSbholler .int L(bkP1Q6)-L(bkPxQx) 3051d0b3732eSbholler .int L(bkP2Q6)-L(bkPxQx) 3052d0b3732eSbholler .int 
L(bkP3Q6)-L(bkPxQx) 3053d0b3732eSbholler .int L(bkP4Q6)-L(bkPxQx) 3054d0b3732eSbholler .int L(bkP5Q6)-L(bkPxQx) 3055d0b3732eSbholler .int L(bkP6Q6)-L(bkPxQx) 3056d0b3732eSbholler .int L(bkP7Q6)-L(bkPxQx) 3057d0b3732eSbholler 3058d0b3732eSbholler .int L(bkP0Q7)-L(bkPxQx) 3059d0b3732eSbholler .int L(bkP1Q7)-L(bkPxQx) 3060d0b3732eSbholler .int L(bkP2Q7)-L(bkPxQx) 3061d0b3732eSbholler .int L(bkP3Q7)-L(bkPxQx) 3062d0b3732eSbholler .int L(bkP4Q7)-L(bkPxQx) 3063d0b3732eSbholler .int L(bkP5Q7)-L(bkPxQx) 3064d0b3732eSbholler .int L(bkP6Q7)-L(bkPxQx) 3065d0b3732eSbholler .int L(bkP7Q7)-L(bkPxQx) 3066d0b3732eSbholler 3067d0b3732eSbholler .int L(bkP0Q8)-L(bkPxQx) 3068d0b3732eSbholler .int L(bkP1Q8)-L(bkPxQx) 3069d0b3732eSbholler .int L(bkP2Q8)-L(bkPxQx) 3070d0b3732eSbholler .int L(bkP3Q8)-L(bkPxQx) 3071d0b3732eSbholler .int L(bkP4Q8)-L(bkPxQx) 3072d0b3732eSbholler .int L(bkP5Q8)-L(bkPxQx) 3073d0b3732eSbholler .int L(bkP6Q8)-L(bkPxQx) 3074d0b3732eSbholler .int L(bkP7Q8)-L(bkPxQx) 3075d0b3732eSbholler 3076d0b3732eSbholler .int L(bkP0Q9)-L(bkPxQx) 3077d0b3732eSbholler .int L(bkP1Q9)-L(bkPxQx) 3078d0b3732eSbholler .int L(bkP2Q9)-L(bkPxQx) 3079d0b3732eSbholler .int L(bkP3Q9)-L(bkPxQx) 3080d0b3732eSbholler .int L(bkP4Q9)-L(bkPxQx) 3081d0b3732eSbholler .int L(bkP5Q9)-L(bkPxQx) 3082d0b3732eSbholler .int L(bkP6Q9)-L(bkPxQx) 3083d0b3732eSbholler .int L(bkP7Q9)-L(bkPxQx) 3084d0b3732eSbholler 3085d0b3732eSbholler .int L(bkP0QA)-L(bkPxQx) 3086d0b3732eSbholler .int L(bkP1QA)-L(bkPxQx) 3087d0b3732eSbholler .int L(bkP2QA)-L(bkPxQx) 3088d0b3732eSbholler .int L(bkP3QA)-L(bkPxQx) 3089d0b3732eSbholler .int L(bkP4QA)-L(bkPxQx) 3090d0b3732eSbholler .int L(bkP5QA)-L(bkPxQx) 3091d0b3732eSbholler .int L(bkP6QA)-L(bkPxQx) 3092d0b3732eSbholler .int L(bkP7QA)-L(bkPxQx) 3093d0b3732eSbholler 3094d0b3732eSbholler .int L(bkP0QB)-L(bkPxQx) 3095d0b3732eSbholler .int L(bkP1QB)-L(bkPxQx) 3096d0b3732eSbholler .int L(bkP2QB)-L(bkPxQx) 3097d0b3732eSbholler .int L(bkP3QB)-L(bkPxQx) 3098d0b3732eSbholler .int 
L(bkP4QB)-L(bkPxQx) 3099d0b3732eSbholler .int L(bkP5QB)-L(bkPxQx) 3100d0b3732eSbholler .int L(bkP6QB)-L(bkPxQx) 3101d0b3732eSbholler .int L(bkP7QB)-L(bkPxQx) 3102d0b3732eSbholler 3103d0b3732eSbholler .int L(bkP0QC)-L(bkPxQx) 3104d0b3732eSbholler .int L(bkP1QC)-L(bkPxQx) 3105d0b3732eSbholler .int L(bkP2QC)-L(bkPxQx) 3106d0b3732eSbholler .int L(bkP3QC)-L(bkPxQx) 3107d0b3732eSbholler .int L(bkP4QC)-L(bkPxQx) 3108d0b3732eSbholler .int L(bkP5QC)-L(bkPxQx) 3109d0b3732eSbholler .int L(bkP6QC)-L(bkPxQx) 3110d0b3732eSbholler .int L(bkP7QC)-L(bkPxQx) 3111d0b3732eSbholler 3112d0b3732eSbholler .int L(bkP0QD)-L(bkPxQx) 3113d0b3732eSbholler .int L(bkP1QD)-L(bkPxQx) 3114d0b3732eSbholler .int L(bkP2QD)-L(bkPxQx) 3115d0b3732eSbholler .int L(bkP3QD)-L(bkPxQx) 3116d0b3732eSbholler .int L(bkP4QD)-L(bkPxQx) 3117d0b3732eSbholler .int L(bkP5QD)-L(bkPxQx) 3118d0b3732eSbholler .int L(bkP6QD)-L(bkPxQx) 3119d0b3732eSbholler .int L(bkP7QD)-L(bkPxQx) 3120d0b3732eSbholler 3121d0b3732eSbholler .int L(bkP0QE)-L(bkPxQx) 3122d0b3732eSbholler .int L(bkP1QE)-L(bkPxQx) 3123d0b3732eSbholler .int L(bkP2QE)-L(bkPxQx) 3124d0b3732eSbholler .int L(bkP3QE)-L(bkPxQx) 3125d0b3732eSbholler .int L(bkP4QE)-L(bkPxQx) 3126d0b3732eSbholler .int L(bkP5QE)-L(bkPxQx) 3127d0b3732eSbholler .int L(bkP6QE)-L(bkPxQx) 3128d0b3732eSbholler .int L(bkP7QE)-L(bkPxQx) 3129d0b3732eSbholler 3130d0b3732eSbholler .int L(bkP0QF)-L(bkPxQx) 3131d0b3732eSbholler .int L(bkP1QF)-L(bkPxQx) 3132d0b3732eSbholler .int L(bkP2QF)-L(bkPxQx) 3133d0b3732eSbholler .int L(bkP3QF)-L(bkPxQx) 3134d0b3732eSbholler .int L(bkP4QF)-L(bkPxQx) 3135d0b3732eSbholler .int L(bkP5QF)-L(bkPxQx) 3136d0b3732eSbholler .int L(bkP6QF)-L(bkPxQx) 3137d0b3732eSbholler .int L(bkP7QF)-L(bkPxQx) 3138d0b3732eSbholler 3139d0b3732eSbholler .int L(bkP0QG)-L(bkPxQx) 3140d0b3732eSbholler .int L(bkP1QG)-L(bkPxQx) 3141d0b3732eSbholler .int L(bkP2QG)-L(bkPxQx) 3142d0b3732eSbholler .int L(bkP3QG)-L(bkPxQx) 3143d0b3732eSbholler .int L(bkP4QG)-L(bkPxQx) 3144d0b3732eSbholler .int 
L(bkP5QG)-L(bkPxQx) 3145d0b3732eSbholler .int L(bkP6QG)-L(bkPxQx) 3146d0b3732eSbholler .int L(bkP7QG)-L(bkPxQx) 3147d0b3732eSbholler 3148d0b3732eSbholler .int L(bkP0QH)-L(bkPxQx) 3149d0b3732eSbholler .int L(bkP1QH)-L(bkPxQx) 3150d0b3732eSbholler .int L(bkP2QH)-L(bkPxQx) 3151d0b3732eSbholler .int L(bkP3QH)-L(bkPxQx) 3152d0b3732eSbholler .int L(bkP4QH)-L(bkPxQx) 3153d0b3732eSbholler .int L(bkP5QH)-L(bkPxQx) 3154d0b3732eSbholler .int L(bkP6QH)-L(bkPxQx) 3155d0b3732eSbholler .int L(bkP7QH)-L(bkPxQx) 3156d0b3732eSbholler 3157d0b3732eSbholler .int L(bkP0QI)-L(bkPxQx) 3158d0b3732eSbholler .int L(bkP1QI)-L(bkPxQx) 3159d0b3732eSbholler .int L(bkP2QI)-L(bkPxQx) 3160d0b3732eSbholler .int L(bkP3QI)-L(bkPxQx) 3161d0b3732eSbholler .int L(bkP4QI)-L(bkPxQx) 3162d0b3732eSbholler .int L(bkP5QI)-L(bkPxQx) 3163d0b3732eSbholler .int L(bkP6QI)-L(bkPxQx) 3164d0b3732eSbholler .int L(bkP7QI)-L(bkPxQx) 3165d0b3732eSbholler 31667c478bd9Sstevel@tonic-gate SET_SIZE(memmove) 3167