17c478bd9Sstevel@tonic-gate/* 2*533d3a49SEdward Gillett * CDDL HEADER START 3*533d3a49SEdward Gillett * 4*533d3a49SEdward Gillett * The contents of this file are subject to the terms of the 5*533d3a49SEdward Gillett * Common Development and Distribution License (the "License"). 6*533d3a49SEdward Gillett * You may not use this file except in compliance with the License. 7*533d3a49SEdward Gillett * 8*533d3a49SEdward Gillett * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9*533d3a49SEdward Gillett * or http://www.opensolaris.org/os/licensing. 10*533d3a49SEdward Gillett * See the License for the specific language governing permissions 11*533d3a49SEdward Gillett * and limitations under the License. 12*533d3a49SEdward Gillett * 13*533d3a49SEdward Gillett * When distributing Covered Code, include this CDDL HEADER in each 14*533d3a49SEdward Gillett * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15*533d3a49SEdward Gillett * If applicable, add the following below this CDDL HEADER, with the 16*533d3a49SEdward Gillett * fields enclosed by brackets "[]" replaced with your own identifying 17*533d3a49SEdward Gillett * information: Portions Copyright [yyyy] [name of copyright owner] 18*533d3a49SEdward Gillett * 19*533d3a49SEdward Gillett * CDDL HEADER END 207c478bd9Sstevel@tonic-gate */ 217c478bd9Sstevel@tonic-gate 227c478bd9Sstevel@tonic-gate/* 23*533d3a49SEdward Gillett * Copyright (c) 2009, Intel Corporation 247c478bd9Sstevel@tonic-gate * All rights reserved. 
257c478bd9Sstevel@tonic-gate */ 267c478bd9Sstevel@tonic-gate 27*533d3a49SEdward Gillett/* 28*533d3a49SEdward Gillett * str[n]cpy - copy [n] chars from second operand into first operand 29*533d3a49SEdward Gillett */ 307c478bd9Sstevel@tonic-gate#include "SYS.h" 31*533d3a49SEdward Gillett#include "proc64_id.h" 327c478bd9Sstevel@tonic-gate 337c478bd9Sstevel@tonic-gate#define LABEL(s) .strcpy/**/s 347c478bd9Sstevel@tonic-gate 357c478bd9Sstevel@tonic-gate#ifdef USE_AS_STRNCPY 367c478bd9Sstevel@tonic-gate ENTRY(strncpy) 37*533d3a49SEdward Gillett test %edx, %edx 38*533d3a49SEdward Gillett jz LABEL(strncpy_exitz) 39*533d3a49SEdward Gillett mov %rdx, %r8 407c478bd9Sstevel@tonic-gate#else 417c478bd9Sstevel@tonic-gate ENTRY(strcpy) /* (char *, const char *) */ 42*533d3a49SEdward Gillett xor %rdx, %rdx 437c478bd9Sstevel@tonic-gate#endif 44*533d3a49SEdward Gillett mov %esi, %ecx 45*533d3a49SEdward Gillett and $0xfffffffffffffff0, %rsi /* force rsi 16 byte align */ 46*533d3a49SEdward Gillett and $0xf, %rcx 47*533d3a49SEdward Gillett mov %rdi, %rax /* save destination address for return value */ 48*533d3a49SEdward Gillett 49*533d3a49SEdward Gillett 50*533d3a49SEdward Gillett pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ 51*533d3a49SEdward Gillett pcmpeqb (%rsi), %xmm0 /* check 16 bytes in src for null */ 52*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 53*533d3a49SEdward Gillett shr %cl, %edx /* adjust for offset from 16byte boundary */ 54*533d3a49SEdward Gillett test %edx, %edx /* edx will be 0 if chars are non-null */ 55*533d3a49SEdward Gillett jnz LABEL(less16bytes) /* null char found in first 16 bytes examined */ 56*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 57*533d3a49SEdward Gillett /* 58*533d3a49SEdward Gillett * Check if the count is satisfied in first 16 bytes examined. 
59*533d3a49SEdward Gillett */ 60*533d3a49SEdward Gillett lea -16(%r8, %rcx), %r11 61*533d3a49SEdward Gillett cmp $0, %r11 62*533d3a49SEdward Gillett jle LABEL(less16bytes) 63*533d3a49SEdward Gillett#endif 64*533d3a49SEdward Gillett mov %rcx, %r9 /* rsi alignment offset */ 65*533d3a49SEdward Gillett or %edi, %ecx 66*533d3a49SEdward Gillett and $0xf, %ecx 67*533d3a49SEdward Gillett lea -16(%r9), %r10 68*533d3a49SEdward Gillett jz LABEL(ashr_0) /* src and dest are both 16 byte aligned */ 69*533d3a49SEdward Gillett 70*533d3a49SEdward Gillett neg %r10 /* max src bytes remaining in current dqword */ 71*533d3a49SEdward Gillett 72*533d3a49SEdward Gillett pxor %xmm0, %xmm0 /* clear %xmm0, may be polluted by unaligned operation */ 73*533d3a49SEdward Gillett pcmpeqb 16(%rsi), %xmm0 /* check next 16 bytes in src for a null */ 74*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 75*533d3a49SEdward Gillett test %edx, %edx 76*533d3a49SEdward Gillett jnz LABEL(less32bytes) /* null char found in first 32 bytes examined */ 777c478bd9Sstevel@tonic-gate 787c478bd9Sstevel@tonic-gate#ifdef USE_AS_STRNCPY 79*533d3a49SEdward Gillett /* 80*533d3a49SEdward Gillett * If strncpy count <= 16 go to exit case 81*533d3a49SEdward Gillett */ 82*533d3a49SEdward Gillett sub $16, %r8 83*533d3a49SEdward Gillett jbe LABEL(less32bytes_strncpy_truncation) 84*533d3a49SEdward Gillett#endif 85*533d3a49SEdward Gillett /* 86*533d3a49SEdward Gillett * At least 16 bytes to copy to destination string. Move them now. 87*533d3a49SEdward Gillett * Don't worry about alignment. 88*533d3a49SEdward Gillett */ 89*533d3a49SEdward Gillett mov (%rsi, %r9), %rdx 90*533d3a49SEdward Gillett mov %rdx, (%rdi) 91*533d3a49SEdward Gillett mov 8(%rsi, %r9), %rdx 92*533d3a49SEdward Gillett mov %rdx, 8(%rdi) 93*533d3a49SEdward Gillett 94*533d3a49SEdward Gillett /* 95*533d3a49SEdward Gillett * so far destination rdi may be aligned by 16, re-calculate rsi and 96*533d3a49SEdward Gillett * jump to corresponding src/dest relative offset case. 
97*533d3a49SEdward Gillett * rcx is offset of rsi 98*533d3a49SEdward Gillett * rdx is offset of rdi 99*533d3a49SEdward Gillett */ 100*533d3a49SEdward Gillett and $0xfffffffffffffff0, %rdi /* force rdi 16 byte align */ 101*533d3a49SEdward Gillett mov %rax, %rdx /* rax contains original rdi */ 102*533d3a49SEdward Gillett xor %rdi, %rdx /* same effect as "and $0xf, %rdx" */ 103*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 104*533d3a49SEdward Gillett /* 105*533d3a49SEdward Gillett * Will now do 16 byte aligned stores. Stores may overlap some bytes 106*533d3a49SEdward Gillett * (ie store twice) if destination was unaligned. Compensate here. 107*533d3a49SEdward Gillett */ 108*533d3a49SEdward Gillett add %rdx, %r8 /* compensate for overlap */ 1097c478bd9Sstevel@tonic-gate#endif 1107c478bd9Sstevel@tonic-gate 111*533d3a49SEdward Gillett add $16, %rdi /* next 16 bytes for dest */ 1127c478bd9Sstevel@tonic-gate 113*533d3a49SEdward Gillett /* 114*533d3a49SEdward Gillett * align src to 16-byte boundary. Could be up or down depending on 115*533d3a49SEdward Gillett * whether src offset - dest offset > 0 (up) or 116*533d3a49SEdward Gillett * src offset - dest offset < 0 (down). 
117*533d3a49SEdward Gillett */ 118*533d3a49SEdward Gillett sub %rdx, %r9 /* src offset - dest offset */ 1197c478bd9Sstevel@tonic-gate 120*533d3a49SEdward Gillett lea 16(%r9, %rsi), %rsi 121*533d3a49SEdward Gillett mov %esi, %ecx /* for new src offset */ 122*533d3a49SEdward Gillett and $0xfffffffffffffff0, %rsi /* force rsi 16 byte align */ 123*533d3a49SEdward Gillett 124*533d3a49SEdward Gillett and $0xf, %ecx /* new src offset is 0 if rsi/rdi have same alignment */ 125*533d3a49SEdward Gillett jz LABEL(ashr_0) 126*533d3a49SEdward Gillett 127*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 128*533d3a49SEdward Gillett xor %edx, %edx /* In case unaligned_exit is taken */ 129*533d3a49SEdward Gillett#endif 130*533d3a49SEdward Gillett /* 131*533d3a49SEdward Gillett * Jump to case corresponding to source/dest string relative offsets 132*533d3a49SEdward Gillett * Index = (16 + (src offset - dest offset)) % 16 133*533d3a49SEdward Gillett */ 134*533d3a49SEdward Gillett lea -16(%rcx), %r10 135*533d3a49SEdward Gillett mov %rcx, %r9 136*533d3a49SEdward Gillett neg %r10 /* max src bytes remaining in current dqword */ 137*533d3a49SEdward Gillett lea LABEL(unaligned_table)(%rip), %r11 138*533d3a49SEdward Gillett movslq (%r11, %rcx, 4), %rcx 139*533d3a49SEdward Gillett lea (%r11, %rcx), %rcx 140*533d3a49SEdward Gillett jmp *%rcx 141*533d3a49SEdward Gillett 142*533d3a49SEdward Gillett/* 143*533d3a49SEdward Gillett * ashr_0 handles the following cases: 144*533d3a49SEdward Gillett * src alignment offset = dest alignment offset 145*533d3a49SEdward Gillett */ 146*533d3a49SEdward Gillett .p2align 5 147*533d3a49SEdward GillettLABEL(ashr_0): 148*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 149*533d3a49SEdward Gillett sub $16, %r8 150*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_aligned) 151*533d3a49SEdward Gillett#endif 152*533d3a49SEdward Gillett movdqa (%rsi), %xmm1 /* fetch 16 bytes from src string */ 153*533d3a49SEdward Gillett movdqa %xmm1, (%rdi) /* store 16 bytes into dest 
string */ 154*533d3a49SEdward Gillett add $16, %rsi 155*533d3a49SEdward Gillett add $16, %rdi 156*533d3a49SEdward Gillett pcmpeqb (%rsi), %xmm0 /* check 16 bytes in src for a null */ 157*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 158*533d3a49SEdward Gillett 159*533d3a49SEdward Gillett test %edx, %edx /* edx will be 0 if chars are non-null */ 160*533d3a49SEdward Gillett jnz LABEL(aligned_16bytes) /* exit tail */ 161*533d3a49SEdward Gillett 162*533d3a49SEdward GillettLABEL(ashr_0_loop): 163*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 164*533d3a49SEdward Gillett sub $16, %r8 165*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_aligned) 166*533d3a49SEdward Gillett#endif 167*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm1 168*533d3a49SEdward Gillett movdqa %xmm1, (%rdi, %rcx) 169*533d3a49SEdward Gillett add $16, %rcx 170*533d3a49SEdward Gillett pcmpeqb (%rsi, %rcx), %xmm0 171*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 172*533d3a49SEdward Gillett test %edx, %edx 173*533d3a49SEdward Gillett jnz LABEL(aligned_exit) 174*533d3a49SEdward Gillett 175*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 176*533d3a49SEdward Gillett sub $16, %r8 177*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_aligned) 178*533d3a49SEdward Gillett#endif 179*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm1 180*533d3a49SEdward Gillett movdqa %xmm1, (%rdi, %rcx) 181*533d3a49SEdward Gillett add $16, %rcx 182*533d3a49SEdward Gillett pcmpeqb (%rsi, %rcx), %xmm0 183*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 184*533d3a49SEdward Gillett test %edx, %edx 185*533d3a49SEdward Gillett jnz LABEL(aligned_exit) 186*533d3a49SEdward Gillett 187*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 188*533d3a49SEdward Gillett sub $16, %r8 189*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_aligned) 190*533d3a49SEdward Gillett#endif 191*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm1 192*533d3a49SEdward Gillett movdqa %xmm1, (%rdi, %rcx) 193*533d3a49SEdward Gillett 194*533d3a49SEdward Gillett add $16, 
%rcx 195*533d3a49SEdward Gillett pcmpeqb (%rsi, %rcx), %xmm0 196*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 197*533d3a49SEdward Gillett test %edx, %edx 198*533d3a49SEdward Gillett jnz LABEL(aligned_exit) 199*533d3a49SEdward Gillett 200*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 201*533d3a49SEdward Gillett sub $16, %r8 202*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_aligned) 203*533d3a49SEdward Gillett#endif 204*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm1 205*533d3a49SEdward Gillett movdqa %xmm1, (%rdi, %rcx) 206*533d3a49SEdward Gillett add $16, %rcx 207*533d3a49SEdward Gillett pcmpeqb (%rsi, %rcx), %xmm0 208*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 209*533d3a49SEdward Gillett test %edx, %edx 210*533d3a49SEdward Gillett jz LABEL(ashr_0_loop) 211*533d3a49SEdward Gillett jmp LABEL(aligned_exit) 212*533d3a49SEdward Gillett 213*533d3a49SEdward Gillett 214*533d3a49SEdward Gillett/* 215*533d3a49SEdward Gillett * ashr_15 handles the following cases: 216*533d3a49SEdward Gillett * (16 + (src offset - dest offset)) % 16 = 15 217*533d3a49SEdward Gillett * 218*533d3a49SEdward Gillett * Based on above operation, start from (%r9 + rsi) to the left of this cache 219*533d3a49SEdward Gillett * bank, there is no null byte. 220*533d3a49SEdward Gillett */ 221*533d3a49SEdward Gillett .p2align 4 222*533d3a49SEdward GillettLABEL(ashr_15): 223*533d3a49SEdward Gillett xor %ecx, %ecx /* clear index */ 224*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 225*533d3a49SEdward Gillett cmp %r10, %r8 226*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 227*533d3a49SEdward Gillett#endif 228*533d3a49SEdward Gillett testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ 229*533d3a49SEdward Gillett jz LABEL(ashr_15_use_sse2) 230*533d3a49SEdward Gillett 231*533d3a49SEdward Gillett .p2align 4 232*533d3a49SEdward GillettLABEL(ashr_15_use_ssse3): 233*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 234*533d3a49SEdward Gillett pcmpeqb %xmm3, %xmm0 235*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 236*533d3a49SEdward Gillett test %edx, %edx 237*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 238*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 239*533d3a49SEdward Gillett sub $16, %r8 240*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 241*533d3a49SEdward Gillett#endif 242*533d3a49SEdward Gillett 243*533d3a49SEdward Gillett #palignr $15, (%rsi, %rcx), %xmm3 244*533d3a49SEdward Gillett .byte 0x66, 0x0F, 0x3A ,0x0F 245*533d3a49SEdward Gillett .byte 0x1c, 0x0e, 0x0f 246*533d3a49SEdward Gillett 247*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 248*533d3a49SEdward Gillett add $16, %rcx 249*533d3a49SEdward Gillett 250*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 251*533d3a49SEdward Gillett cmp %r10, %r8 252*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 253*533d3a49SEdward Gillett#endif 254*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 255*533d3a49SEdward Gillett pcmpeqb %xmm3, %xmm0 256*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 257*533d3a49SEdward Gillett test %edx, %edx 258*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 259*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 260*533d3a49SEdward Gillett sub $16, %r8 261*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 262*533d3a49SEdward Gillett#endif 263*533d3a49SEdward Gillett 264*533d3a49SEdward Gillett #palignr $15, (%rsi, %rcx), %xmm3 265*533d3a49SEdward Gillett .byte 0x66, 0x0F, 0x3A ,0x0F 266*533d3a49SEdward Gillett .byte 0x1c, 0x0e, 0x0f 267*533d3a49SEdward Gillett 268*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 269*533d3a49SEdward Gillett add $16, %rcx 270*533d3a49SEdward Gillett 271*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 
272*533d3a49SEdward Gillett cmp %r10, %r8 273*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 274*533d3a49SEdward Gillett#endif 275*533d3a49SEdward Gillett jmp LABEL(ashr_15_use_ssse3) 276*533d3a49SEdward Gillett 277*533d3a49SEdward Gillett .p2align 4 278*533d3a49SEdward GillettLABEL(ashr_15_use_sse2): 279*533d3a49SEdward Gillett pcmpeqb 16(%rsi, %rcx), %xmm0 280*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 281*533d3a49SEdward Gillett test %edx, %edx 282*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 283*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 284*533d3a49SEdward Gillett sub $16, %r8 285*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 286*533d3a49SEdward Gillett#endif 287*533d3a49SEdward Gillett 288*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 289*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm2 290*533d3a49SEdward Gillett 291*533d3a49SEdward Gillett psrldq $15, %xmm2 292*533d3a49SEdward Gillett pslldq $1, %xmm3 293*533d3a49SEdward Gillett por %xmm2, %xmm3 294*533d3a49SEdward Gillett 295*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 296*533d3a49SEdward Gillett add $16, %rcx 297*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 298*533d3a49SEdward Gillett cmp %r10, %r8 299*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 300*533d3a49SEdward Gillett#endif 301*533d3a49SEdward Gillett pcmpeqb 16(%rsi, %rcx), %xmm0 302*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 303*533d3a49SEdward Gillett test %edx, %edx 304*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 305*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 306*533d3a49SEdward Gillett sub $16, %r8 307*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 308*533d3a49SEdward Gillett#endif 309*533d3a49SEdward Gillett 310*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 311*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm2 312*533d3a49SEdward Gillett 313*533d3a49SEdward Gillett psrldq $15, %xmm2 314*533d3a49SEdward Gillett pslldq $1, %xmm3 315*533d3a49SEdward Gillett por 
%xmm2, %xmm3 316*533d3a49SEdward Gillett 317*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 318*533d3a49SEdward Gillett add $16, %rcx 319*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 320*533d3a49SEdward Gillett cmp %r10, %r8 321*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 322*533d3a49SEdward Gillett#endif 323*533d3a49SEdward Gillett jmp LABEL(ashr_15_use_sse2) 324*533d3a49SEdward Gillett 325*533d3a49SEdward Gillett 326*533d3a49SEdward Gillett/* 327*533d3a49SEdward Gillett * ashr_14 handles the following cases: 328*533d3a49SEdward Gillett * (16 + (src offset - dest offset)) % 16 = 14 329*533d3a49SEdward Gillett * 330*533d3a49SEdward Gillett * Based on above operation, start from (%r9 + rsi) to the left of this cache 331*533d3a49SEdward Gillett * bank, there is no null byte. 332*533d3a49SEdward Gillett */ 333*533d3a49SEdward Gillett .p2align 4 334*533d3a49SEdward GillettLABEL(ashr_14): 335*533d3a49SEdward Gillett xor %ecx, %ecx /* clear index */ 336*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 337*533d3a49SEdward Gillett cmp %r10, %r8 338*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 339*533d3a49SEdward Gillett#endif 340*533d3a49SEdward Gillett testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ 341*533d3a49SEdward Gillett jz LABEL(ashr_14_use_sse2) 342*533d3a49SEdward Gillett 343*533d3a49SEdward Gillett .p2align 4 344*533d3a49SEdward GillettLABEL(ashr_14_use_ssse3): 345*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 346*533d3a49SEdward Gillett pcmpeqb %xmm3, %xmm0 347*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 348*533d3a49SEdward Gillett test %edx, %edx 349*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 350*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 351*533d3a49SEdward Gillett sub $16, %r8 352*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 353*533d3a49SEdward Gillett#endif 354*533d3a49SEdward Gillett 355*533d3a49SEdward Gillett #palignr $14, (%rsi, %rcx), %xmm3 356*533d3a49SEdward Gillett .byte 0x66, 0x0F, 0x3A ,0x0F 357*533d3a49SEdward Gillett .byte 0x1c, 0x0e, 0x0e 358*533d3a49SEdward Gillett 359*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 360*533d3a49SEdward Gillett add $16, %rcx 361*533d3a49SEdward Gillett 362*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 363*533d3a49SEdward Gillett cmp %r10, %r8 364*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 365*533d3a49SEdward Gillett#endif 366*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 367*533d3a49SEdward Gillett pcmpeqb %xmm3, %xmm0 368*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 369*533d3a49SEdward Gillett test %edx, %edx 370*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 371*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 372*533d3a49SEdward Gillett sub $16, %r8 373*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 374*533d3a49SEdward Gillett#endif 375*533d3a49SEdward Gillett 376*533d3a49SEdward Gillett #palignr $14, (%rsi, %rcx), %xmm3 377*533d3a49SEdward Gillett .byte 0x66, 0x0F, 0x3A ,0x0F 378*533d3a49SEdward Gillett .byte 0x1c, 0x0e, 0x0e 379*533d3a49SEdward Gillett 380*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 381*533d3a49SEdward Gillett add $16, %rcx 382*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 383*533d3a49SEdward Gillett 
cmp %r10, %r8 384*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 385*533d3a49SEdward Gillett#endif 386*533d3a49SEdward Gillett jmp LABEL(ashr_14_use_ssse3) 387*533d3a49SEdward Gillett 388*533d3a49SEdward Gillett .p2align 4 389*533d3a49SEdward GillettLABEL(ashr_14_use_sse2): 390*533d3a49SEdward Gillett pcmpeqb 16(%rsi, %rcx), %xmm0 391*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 392*533d3a49SEdward Gillett test %edx, %edx 393*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 394*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 395*533d3a49SEdward Gillett sub $16, %r8 396*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 397*533d3a49SEdward Gillett#endif 398*533d3a49SEdward Gillett 399*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 400*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm2 401*533d3a49SEdward Gillett 402*533d3a49SEdward Gillett psrldq $14, %xmm2 403*533d3a49SEdward Gillett pslldq $2, %xmm3 404*533d3a49SEdward Gillett por %xmm2, %xmm3 405*533d3a49SEdward Gillett 406*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 407*533d3a49SEdward Gillett add $16, %rcx 408*533d3a49SEdward Gillett 409*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 410*533d3a49SEdward Gillett cmp %r10, %r8 411*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 412*533d3a49SEdward Gillett#endif 413*533d3a49SEdward Gillett pcmpeqb 16(%rsi, %rcx), %xmm0 414*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 415*533d3a49SEdward Gillett test %edx, %edx 416*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 417*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 418*533d3a49SEdward Gillett sub $16, %r8 419*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 420*533d3a49SEdward Gillett#endif 421*533d3a49SEdward Gillett 422*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 423*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm2 424*533d3a49SEdward Gillett 425*533d3a49SEdward Gillett psrldq $14, %xmm2 426*533d3a49SEdward Gillett pslldq $2, %xmm3 427*533d3a49SEdward Gillett por 
%xmm2, %xmm3 428*533d3a49SEdward Gillett 429*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 430*533d3a49SEdward Gillett add $16, %rcx 431*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 432*533d3a49SEdward Gillett cmp %r10, %r8 433*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 434*533d3a49SEdward Gillett#endif 435*533d3a49SEdward Gillett jmp LABEL(ashr_14_use_sse2) 436*533d3a49SEdward Gillett 437*533d3a49SEdward Gillett 438*533d3a49SEdward Gillett/* 439*533d3a49SEdward Gillett * ashr_13 handles the following cases: 440*533d3a49SEdward Gillett * (16 + (src offset - dest offset)) % 16 = 13 441*533d3a49SEdward Gillett * 442*533d3a49SEdward Gillett * Based on above operation, start from (%r9 + rsi) to the left of this cache 443*533d3a49SEdward Gillett * bank, there is no null byte. 444*533d3a49SEdward Gillett */ 445*533d3a49SEdward Gillett .p2align 4 446*533d3a49SEdward GillettLABEL(ashr_13): 447*533d3a49SEdward Gillett xor %ecx, %ecx /* clear index */ 448*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 449*533d3a49SEdward Gillett cmp %r10, %r8 450*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 451*533d3a49SEdward Gillett#endif 452*533d3a49SEdward Gillett testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ 453*533d3a49SEdward Gillett jz LABEL(ashr_13_use_sse2) 454*533d3a49SEdward Gillett 455*533d3a49SEdward Gillett .p2align 4 456*533d3a49SEdward GillettLABEL(ashr_13_use_ssse3): 457*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 458*533d3a49SEdward Gillett pcmpeqb %xmm3, %xmm0 459*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 460*533d3a49SEdward Gillett test %edx, %edx 461*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 462*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 463*533d3a49SEdward Gillett sub $16, %r8 464*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 465*533d3a49SEdward Gillett#endif 466*533d3a49SEdward Gillett 467*533d3a49SEdward Gillett #palignr $13, (%rsi, %rcx), %xmm3 468*533d3a49SEdward Gillett .byte 0x66, 0x0F, 0x3A ,0x0F 469*533d3a49SEdward Gillett .byte 0x1c, 0x0e, 0x0d 470*533d3a49SEdward Gillett 471*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 472*533d3a49SEdward Gillett add $16, %rcx 473*533d3a49SEdward Gillett 474*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 475*533d3a49SEdward Gillett cmp %r10, %r8 476*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 477*533d3a49SEdward Gillett#endif 478*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 479*533d3a49SEdward Gillett pcmpeqb %xmm3, %xmm0 480*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 481*533d3a49SEdward Gillett test %edx, %edx 482*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 483*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 484*533d3a49SEdward Gillett sub $16, %r8 485*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 486*533d3a49SEdward Gillett#endif 487*533d3a49SEdward Gillett 488*533d3a49SEdward Gillett #palignr $13, (%rsi, %rcx), %xmm3 489*533d3a49SEdward Gillett .byte 0x66, 0x0F, 0x3A ,0x0F 490*533d3a49SEdward Gillett .byte 0x1c, 0x0e, 0x0d 491*533d3a49SEdward Gillett 492*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 493*533d3a49SEdward Gillett add $16, %rcx 494*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 495*533d3a49SEdward Gillett 
cmp %r10, %r8 496*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 497*533d3a49SEdward Gillett#endif 498*533d3a49SEdward Gillett jmp LABEL(ashr_13_use_ssse3) 499*533d3a49SEdward Gillett 500*533d3a49SEdward Gillett .p2align 4 501*533d3a49SEdward GillettLABEL(ashr_13_use_sse2): 502*533d3a49SEdward Gillett pcmpeqb 16(%rsi, %rcx), %xmm0 503*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 504*533d3a49SEdward Gillett test %edx, %edx 505*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 506*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 507*533d3a49SEdward Gillett sub $16, %r8 508*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 509*533d3a49SEdward Gillett#endif 510*533d3a49SEdward Gillett 511*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 512*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm2 513*533d3a49SEdward Gillett 514*533d3a49SEdward Gillett psrldq $13, %xmm2 515*533d3a49SEdward Gillett pslldq $3, %xmm3 516*533d3a49SEdward Gillett por %xmm2, %xmm3 517*533d3a49SEdward Gillett 518*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 519*533d3a49SEdward Gillett add $16, %rcx 520*533d3a49SEdward Gillett 521*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 522*533d3a49SEdward Gillett cmp %r10, %r8 523*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 524*533d3a49SEdward Gillett#endif 525*533d3a49SEdward Gillett pcmpeqb 16(%rsi, %rcx), %xmm0 526*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 527*533d3a49SEdward Gillett test %edx, %edx 528*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 529*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 530*533d3a49SEdward Gillett sub $16, %r8 531*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 532*533d3a49SEdward Gillett#endif 533*533d3a49SEdward Gillett 534*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 535*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm2 536*533d3a49SEdward Gillett 537*533d3a49SEdward Gillett psrldq $13, %xmm2 538*533d3a49SEdward Gillett pslldq $3, %xmm3 539*533d3a49SEdward Gillett por 
%xmm2, %xmm3 540*533d3a49SEdward Gillett 541*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 542*533d3a49SEdward Gillett add $16, %rcx 543*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 544*533d3a49SEdward Gillett cmp %r10, %r8 545*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 546*533d3a49SEdward Gillett#endif 547*533d3a49SEdward Gillett jmp LABEL(ashr_13_use_sse2) 548*533d3a49SEdward Gillett 549*533d3a49SEdward Gillett 550*533d3a49SEdward Gillett/* 551*533d3a49SEdward Gillett * ashr_12 handles the following cases: 552*533d3a49SEdward Gillett * (16 + (src offset - dest offset)) % 16 = 12 553*533d3a49SEdward Gillett * 554*533d3a49SEdward Gillett * Based on above operation, start from (%r9 + rsi) to the left of this cache 555*533d3a49SEdward Gillett * bank, there is no null byte. 556*533d3a49SEdward Gillett */ 557*533d3a49SEdward Gillett .p2align 4 558*533d3a49SEdward GillettLABEL(ashr_12): 559*533d3a49SEdward Gillett xor %ecx, %ecx /* clear index */ 560*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 561*533d3a49SEdward Gillett cmp %r10, %r8 562*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 563*533d3a49SEdward Gillett#endif 564*533d3a49SEdward Gillett testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/ 565*533d3a49SEdward Gillett jz LABEL(ashr_12_use_sse2) 566*533d3a49SEdward Gillett 567*533d3a49SEdward Gillett .p2align 4 568*533d3a49SEdward GillettLABEL(ashr_12_use_ssse3): 569*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 570*533d3a49SEdward Gillett pcmpeqb %xmm3, %xmm0 571*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 572*533d3a49SEdward Gillett test %edx, %edx 573*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 574*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 575*533d3a49SEdward Gillett sub $16, %r8 576*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 577*533d3a49SEdward Gillett#endif 578*533d3a49SEdward Gillett 579*533d3a49SEdward Gillett #palignr $12, (%rsi, %rcx), %xmm3 580*533d3a49SEdward Gillett .byte 0x66, 0x0F, 0x3A ,0x0F 581*533d3a49SEdward Gillett .byte 0x1c, 0x0e, 0x0c 582*533d3a49SEdward Gillett 583*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 584*533d3a49SEdward Gillett add $16, %rcx 585*533d3a49SEdward Gillett 586*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 587*533d3a49SEdward Gillett cmp %r10, %r8 588*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 589*533d3a49SEdward Gillett#endif 590*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 591*533d3a49SEdward Gillett pcmpeqb %xmm3, %xmm0 592*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 593*533d3a49SEdward Gillett test %edx, %edx 594*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 595*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 596*533d3a49SEdward Gillett sub $16, %r8 597*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 598*533d3a49SEdward Gillett#endif 599*533d3a49SEdward Gillett 600*533d3a49SEdward Gillett #palignr $12, (%rsi, %rcx), %xmm3 601*533d3a49SEdward Gillett .byte 0x66, 0x0F, 0x3A ,0x0F 602*533d3a49SEdward Gillett .byte 0x1c, 0x0e, 0x0c 603*533d3a49SEdward Gillett 604*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 605*533d3a49SEdward Gillett add $16, %rcx 606*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 607*533d3a49SEdward Gillett 
cmp %r10, %r8 608*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 609*533d3a49SEdward Gillett#endif 610*533d3a49SEdward Gillett jmp LABEL(ashr_12_use_ssse3) 611*533d3a49SEdward Gillett 612*533d3a49SEdward Gillett .p2align 4 613*533d3a49SEdward GillettLABEL(ashr_12_use_sse2): 614*533d3a49SEdward Gillett pcmpeqb 16(%rsi, %rcx), %xmm0 615*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 616*533d3a49SEdward Gillett test %edx, %edx 617*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 618*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 619*533d3a49SEdward Gillett sub $16, %r8 620*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 621*533d3a49SEdward Gillett#endif 622*533d3a49SEdward Gillett 623*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 624*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm2 625*533d3a49SEdward Gillett 626*533d3a49SEdward Gillett psrldq $12, %xmm2 627*533d3a49SEdward Gillett pslldq $4, %xmm3 628*533d3a49SEdward Gillett por %xmm2, %xmm3 629*533d3a49SEdward Gillett 630*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 631*533d3a49SEdward Gillett add $16, %rcx 632*533d3a49SEdward Gillett 633*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 634*533d3a49SEdward Gillett cmp %r10, %r8 635*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 636*533d3a49SEdward Gillett#endif 637*533d3a49SEdward Gillett pcmpeqb 16(%rsi, %rcx), %xmm0 638*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 639*533d3a49SEdward Gillett test %edx, %edx 640*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 641*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 642*533d3a49SEdward Gillett sub $16, %r8 643*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 644*533d3a49SEdward Gillett#endif 645*533d3a49SEdward Gillett 646*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 647*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm2 648*533d3a49SEdward Gillett 649*533d3a49SEdward Gillett psrldq $12, %xmm2 650*533d3a49SEdward Gillett pslldq $4, %xmm3 651*533d3a49SEdward Gillett por 
%xmm2, %xmm3 652*533d3a49SEdward Gillett 653*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 654*533d3a49SEdward Gillett add $16, %rcx 655*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 656*533d3a49SEdward Gillett cmp %r10, %r8 657*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 658*533d3a49SEdward Gillett#endif 659*533d3a49SEdward Gillett jmp LABEL(ashr_12_use_sse2) 660*533d3a49SEdward Gillett 661*533d3a49SEdward Gillett 662*533d3a49SEdward Gillett/* 663*533d3a49SEdward Gillett * ashr_11 handles the following cases: 664*533d3a49SEdward Gillett * (16 + (src offset - dest offset)) % 16 = 11 665*533d3a49SEdward Gillett * 666*533d3a49SEdward Gillett * Based on the above operation, there is no null byte from (%r9 + rsi) to the 667*533d3a49SEdward Gillett * left of this cache bank. Each pass below stores bytes 11..26 of two adjacent 16-byte source blocks; an SSSE3 (palignr) and an SSE2 (psrldq/pslldq/por) version of the loop follow. 668*533d3a49SEdward Gillett */ 669*533d3a49SEdward Gillett .p2align 4 670*533d3a49SEdward GillettLABEL(ashr_11): 671*533d3a49SEdward Gillett xor %ecx, %ecx /* rcx = 0: common byte index for the (%rsi,%rcx) loads and (%rdi,%rcx) stores */ 672*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 673*533d3a49SEdward Gillett cmp %r10, %r8 674*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) /* strncpy only: leave at once when r8 <= r10 (unsigned) */ 675*533d3a49SEdward Gillett#endif 676*533d3a49SEdward Gillett testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3?
*/ 677*533d3a49SEdward Gillett jz LABEL(ashr_11_use_sse2) /* USE_SSSE3 bit clear in .memops_method: take the SSE2 loop */ 678*533d3a49SEdward Gillett 679*533d3a49SEdward Gillett .p2align 4 680*533d3a49SEdward GillettLABEL(ashr_11_use_ssse3): /* SSSE3 variant of the shift-11 loop; the body is written out twice */ 681*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 /* xmm3 = upper source block */ 682*533d3a49SEdward Gillett pcmpeqb %xmm3, %xmm0 /* byte-compare with xmm0 (presumably all-zero here, so matches are null bytes -- confirm) */ 683*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 684*533d3a49SEdward Gillett test %edx, %edx 685*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 686*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 687*533d3a49SEdward Gillett sub $16, %r8 688*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) /* strncpy only: r8 was <= 16 before the subtraction */ 689*533d3a49SEdward Gillett#endif 690*533d3a49SEdward Gillett 691*533d3a49SEdward Gillett #palignr $11, (%rsi, %rcx), %xmm3 692*533d3a49SEdward Gillett .byte 0x66, 0x0F, 0x3A ,0x0F 693*533d3a49SEdward Gillett .byte 0x1c, 0x0e, 0x0b /* hand-encoded palignr: ModRM 0x1c and SIB 0x0e select (%rsi,%rcx) and %xmm3; the last byte is the shift count 11 */ 694*533d3a49SEdward Gillett 695*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) /* 16-byte store, same index as the load */ 696*533d3a49SEdward Gillett add $16, %rcx 697*533d3a49SEdward Gillett 698*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 699*533d3a49SEdward Gillett cmp %r10, %r8 700*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 701*533d3a49SEdward Gillett#endif 702*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 703*533d3a49SEdward Gillett pcmpeqb %xmm3, %xmm0 704*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 705*533d3a49SEdward Gillett test %edx, %edx 706*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 707*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 708*533d3a49SEdward Gillett sub $16, %r8 709*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 710*533d3a49SEdward Gillett#endif 711*533d3a49SEdward Gillett 712*533d3a49SEdward Gillett #palignr $11, (%rsi, %rcx), %xmm3 713*533d3a49SEdward Gillett .byte 0x66, 0x0F, 0x3A ,0x0F 714*533d3a49SEdward Gillett .byte 0x1c, 0x0e, 0x0b 715*533d3a49SEdward Gillett 716*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 717*533d3a49SEdward Gillett add $16, %rcx 718*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 719*533d3a49SEdward Gillett
cmp %r10, %r8 720*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 721*533d3a49SEdward Gillett#endif 722*533d3a49SEdward Gillett jmp LABEL(ashr_11_use_ssse3) 723*533d3a49SEdward Gillett 724*533d3a49SEdward Gillett .p2align 4 725*533d3a49SEdward GillettLABEL(ashr_11_use_sse2): /* SSE2 variant of the shift-11 loop: psrldq/pslldq/por stand in for palignr; the body is written out twice */ 726*533d3a49SEdward Gillett pcmpeqb 16(%rsi, %rcx), %xmm0 /* byte-compare the upper source block with xmm0 (presumably all-zero here, so matches are null bytes -- confirm) */ 727*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 728*533d3a49SEdward Gillett test %edx, %edx 729*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 730*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 731*533d3a49SEdward Gillett sub $16, %r8 732*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) /* strncpy only: r8 was <= 16 before the subtraction */ 733*533d3a49SEdward Gillett#endif 734*533d3a49SEdward Gillett 735*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 /* xmm3 = upper block */ 736*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm2 /* xmm2 = lower block */ 737*533d3a49SEdward Gillett 738*533d3a49SEdward Gillett psrldq $11, %xmm2 739*533d3a49SEdward Gillett pslldq $5, %xmm3 740*533d3a49SEdward Gillett por %xmm2, %xmm3 /* xmm3 = (upper << 5 bytes) | (lower >> 11 bytes) */ 741*533d3a49SEdward Gillett 742*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) /* 16-byte store, same index as the loads */ 743*533d3a49SEdward Gillett add $16, %rcx 744*533d3a49SEdward Gillett 745*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 746*533d3a49SEdward Gillett cmp %r10, %r8 747*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 748*533d3a49SEdward Gillett#endif 749*533d3a49SEdward Gillett pcmpeqb 16(%rsi, %rcx), %xmm0 750*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 751*533d3a49SEdward Gillett test %edx, %edx 752*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 753*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 754*533d3a49SEdward Gillett sub $16, %r8 755*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 756*533d3a49SEdward Gillett#endif 757*533d3a49SEdward Gillett 758*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 759*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm2 760*533d3a49SEdward Gillett 761*533d3a49SEdward Gillett psrldq $11, %xmm2 762*533d3a49SEdward Gillett pslldq $5, %xmm3 763*533d3a49SEdward Gillett por
%xmm2, %xmm3 764*533d3a49SEdward Gillett 765*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 766*533d3a49SEdward Gillett add $16, %rcx 767*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 768*533d3a49SEdward Gillett cmp %r10, %r8 769*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 770*533d3a49SEdward Gillett#endif 771*533d3a49SEdward Gillett jmp LABEL(ashr_11_use_sse2) 772*533d3a49SEdward Gillett 773*533d3a49SEdward Gillett 774*533d3a49SEdward Gillett/* 775*533d3a49SEdward Gillett * ashr_10 handles the following cases: 776*533d3a49SEdward Gillett * (16 + (src offset - dest offset)) % 16 = 10 777*533d3a49SEdward Gillett * 778*533d3a49SEdward Gillett * Based on the above operation, there is no null byte from (%r9 + rsi) to the 779*533d3a49SEdward Gillett * left of this cache bank. Each pass below stores bytes 10..25 of two adjacent 16-byte source blocks; an SSSE3 (palignr) and an SSE2 (psrldq/pslldq/por) version of the loop follow. 780*533d3a49SEdward Gillett */ 781*533d3a49SEdward Gillett .p2align 4 782*533d3a49SEdward GillettLABEL(ashr_10): 783*533d3a49SEdward Gillett xor %ecx, %ecx /* rcx = 0: common byte index for the (%rsi,%rcx) loads and (%rdi,%rcx) stores */ 784*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 785*533d3a49SEdward Gillett cmp %r10, %r8 786*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) /* strncpy only: leave at once when r8 <= r10 (unsigned) */ 787*533d3a49SEdward Gillett#endif 788*533d3a49SEdward Gillett testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3?
*/ 789*533d3a49SEdward Gillett jz LABEL(ashr_10_use_sse2) /* USE_SSSE3 bit clear in .memops_method: take the SSE2 loop */ 790*533d3a49SEdward Gillett 791*533d3a49SEdward Gillett .p2align 4 792*533d3a49SEdward GillettLABEL(ashr_10_use_ssse3): /* SSSE3 variant of the shift-10 loop; the body is written out twice */ 793*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 /* xmm3 = upper source block */ 794*533d3a49SEdward Gillett pcmpeqb %xmm3, %xmm0 /* byte-compare with xmm0 (presumably all-zero here, so matches are null bytes -- confirm) */ 795*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 796*533d3a49SEdward Gillett test %edx, %edx 797*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 798*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 799*533d3a49SEdward Gillett sub $16, %r8 800*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) /* strncpy only: r8 was <= 16 before the subtraction */ 801*533d3a49SEdward Gillett#endif 802*533d3a49SEdward Gillett 803*533d3a49SEdward Gillett #palignr $10, (%rsi, %rcx), %xmm3 804*533d3a49SEdward Gillett .byte 0x66, 0x0F, 0x3A ,0x0F 805*533d3a49SEdward Gillett .byte 0x1c, 0x0e, 0x0a /* hand-encoded palignr: ModRM 0x1c and SIB 0x0e select (%rsi,%rcx) and %xmm3; the last byte is the shift count 10 */ 806*533d3a49SEdward Gillett 807*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) /* 16-byte store, same index as the load */ 808*533d3a49SEdward Gillett add $16, %rcx 809*533d3a49SEdward Gillett 810*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 811*533d3a49SEdward Gillett cmp %r10, %r8 812*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 813*533d3a49SEdward Gillett#endif 814*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 815*533d3a49SEdward Gillett pcmpeqb %xmm3, %xmm0 816*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 817*533d3a49SEdward Gillett test %edx, %edx 818*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 819*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 820*533d3a49SEdward Gillett sub $16, %r8 821*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 822*533d3a49SEdward Gillett#endif 823*533d3a49SEdward Gillett 824*533d3a49SEdward Gillett #palignr $10, (%rsi, %rcx), %xmm3 825*533d3a49SEdward Gillett .byte 0x66, 0x0F, 0x3A ,0x0F 826*533d3a49SEdward Gillett .byte 0x1c, 0x0e, 0x0a 827*533d3a49SEdward Gillett 828*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 829*533d3a49SEdward Gillett add $16, %rcx 830*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 831*533d3a49SEdward Gillett
cmp %r10, %r8 832*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 833*533d3a49SEdward Gillett#endif 834*533d3a49SEdward Gillett jmp LABEL(ashr_10_use_ssse3) 835*533d3a49SEdward Gillett 836*533d3a49SEdward Gillett .p2align 4 837*533d3a49SEdward GillettLABEL(ashr_10_use_sse2): /* SSE2 variant of the shift-10 loop: psrldq/pslldq/por stand in for palignr; the body is written out twice */ 838*533d3a49SEdward Gillett pcmpeqb 16(%rsi, %rcx), %xmm0 /* byte-compare the upper source block with xmm0 (presumably all-zero here, so matches are null bytes -- confirm) */ 839*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 840*533d3a49SEdward Gillett test %edx, %edx 841*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 842*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 843*533d3a49SEdward Gillett sub $16, %r8 844*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) /* strncpy only: r8 was <= 16 before the subtraction */ 845*533d3a49SEdward Gillett#endif 846*533d3a49SEdward Gillett 847*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 /* xmm3 = upper block */ 848*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm2 /* xmm2 = lower block */ 849*533d3a49SEdward Gillett 850*533d3a49SEdward Gillett psrldq $10, %xmm2 851*533d3a49SEdward Gillett pslldq $6, %xmm3 852*533d3a49SEdward Gillett por %xmm2, %xmm3 /* xmm3 = (upper << 6 bytes) | (lower >> 10 bytes) */ 853*533d3a49SEdward Gillett 854*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) /* 16-byte store, same index as the loads */ 855*533d3a49SEdward Gillett add $16, %rcx 856*533d3a49SEdward Gillett 857*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 858*533d3a49SEdward Gillett cmp %r10, %r8 859*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 860*533d3a49SEdward Gillett#endif 861*533d3a49SEdward Gillett pcmpeqb 16(%rsi, %rcx), %xmm0 862*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 863*533d3a49SEdward Gillett test %edx, %edx 864*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 865*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 866*533d3a49SEdward Gillett sub $16, %r8 867*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 868*533d3a49SEdward Gillett#endif 869*533d3a49SEdward Gillett 870*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 871*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm2 872*533d3a49SEdward Gillett 873*533d3a49SEdward Gillett psrldq $10, %xmm2 874*533d3a49SEdward Gillett pslldq $6, %xmm3 875*533d3a49SEdward Gillett por
%xmm2, %xmm3 876*533d3a49SEdward Gillett 877*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 878*533d3a49SEdward Gillett add $16, %rcx 879*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 880*533d3a49SEdward Gillett cmp %r10, %r8 881*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 882*533d3a49SEdward Gillett#endif 883*533d3a49SEdward Gillett jmp LABEL(ashr_10_use_sse2) 884*533d3a49SEdward Gillett 885*533d3a49SEdward Gillett 886*533d3a49SEdward Gillett/* 887*533d3a49SEdward Gillett * ashr_9 handles the following cases: 888*533d3a49SEdward Gillett * (16 + (src offset - dest offset)) % 16 = 9 889*533d3a49SEdward Gillett * 890*533d3a49SEdward Gillett * Based on the above operation, there is no null byte from (%r9 + rsi) to the 891*533d3a49SEdward Gillett * left of this cache bank. Each pass below stores bytes 9..24 of two adjacent 16-byte source blocks; an SSSE3 (palignr) and an SSE2 (psrldq/pslldq/por) version of the loop follow. 892*533d3a49SEdward Gillett */ 893*533d3a49SEdward Gillett .p2align 4 894*533d3a49SEdward GillettLABEL(ashr_9): 895*533d3a49SEdward Gillett xor %ecx, %ecx /* rcx = 0: common byte index for the (%rsi,%rcx) loads and (%rdi,%rcx) stores */ 896*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 897*533d3a49SEdward Gillett cmp %r10, %r8 898*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) /* strncpy only: leave at once when r8 <= r10 (unsigned) */ 899*533d3a49SEdward Gillett#endif 900*533d3a49SEdward Gillett testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3?
*/ 901*533d3a49SEdward Gillett jz LABEL(ashr_9_use_sse2) /* USE_SSSE3 bit clear in .memops_method: take the SSE2 loop */ 902*533d3a49SEdward Gillett 903*533d3a49SEdward Gillett .p2align 4 904*533d3a49SEdward GillettLABEL(ashr_9_use_ssse3): /* SSSE3 variant of the shift-9 loop; the body is written out twice */ 905*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 /* xmm3 = upper source block */ 906*533d3a49SEdward Gillett pcmpeqb %xmm3, %xmm0 /* byte-compare with xmm0 (presumably all-zero here, so matches are null bytes -- confirm) */ 907*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 908*533d3a49SEdward Gillett test %edx, %edx 909*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 910*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 911*533d3a49SEdward Gillett sub $16, %r8 912*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) /* strncpy only: r8 was <= 16 before the subtraction */ 913*533d3a49SEdward Gillett#endif 914*533d3a49SEdward Gillett 915*533d3a49SEdward Gillett #palignr $9, (%rsi, %rcx), %xmm3 916*533d3a49SEdward Gillett .byte 0x66, 0x0F, 0x3A ,0x0F 917*533d3a49SEdward Gillett .byte 0x1c, 0x0e, 0x09 /* hand-encoded palignr: ModRM 0x1c and SIB 0x0e select (%rsi,%rcx) and %xmm3; the last byte is the shift count 9 */ 918*533d3a49SEdward Gillett 919*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) /* 16-byte store, same index as the load */ 920*533d3a49SEdward Gillett add $16, %rcx 921*533d3a49SEdward Gillett 922*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 923*533d3a49SEdward Gillett cmp %r10, %r8 924*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 925*533d3a49SEdward Gillett#endif 926*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 927*533d3a49SEdward Gillett pcmpeqb %xmm3, %xmm0 928*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 929*533d3a49SEdward Gillett test %edx, %edx 930*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 931*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 932*533d3a49SEdward Gillett sub $16, %r8 933*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 934*533d3a49SEdward Gillett#endif 935*533d3a49SEdward Gillett 936*533d3a49SEdward Gillett #palignr $9, (%rsi, %rcx), %xmm3 937*533d3a49SEdward Gillett .byte 0x66, 0x0F, 0x3A ,0x0F 938*533d3a49SEdward Gillett .byte 0x1c, 0x0e, 0x09 939*533d3a49SEdward Gillett 940*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 941*533d3a49SEdward Gillett add $16, %rcx 942*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 943*533d3a49SEdward Gillett cmp
%r10, %r8 944*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 945*533d3a49SEdward Gillett#endif 946*533d3a49SEdward Gillett jmp LABEL(ashr_9_use_ssse3) 947*533d3a49SEdward Gillett 948*533d3a49SEdward Gillett .p2align 4 949*533d3a49SEdward GillettLABEL(ashr_9_use_sse2): /* SSE2 variant of the shift-9 loop: psrldq/pslldq/por stand in for palignr; the body is written out twice */ 950*533d3a49SEdward Gillett pcmpeqb 16(%rsi, %rcx), %xmm0 /* byte-compare the upper source block with xmm0 (presumably all-zero here, so matches are null bytes -- confirm) */ 951*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 952*533d3a49SEdward Gillett test %edx, %edx 953*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 954*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 955*533d3a49SEdward Gillett sub $16, %r8 956*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) /* strncpy only: r8 was <= 16 before the subtraction */ 957*533d3a49SEdward Gillett#endif 958*533d3a49SEdward Gillett 959*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 /* xmm3 = upper block */ 960*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm2 /* xmm2 = lower block */ 961*533d3a49SEdward Gillett 962*533d3a49SEdward Gillett psrldq $9, %xmm2 963*533d3a49SEdward Gillett pslldq $7, %xmm3 964*533d3a49SEdward Gillett por %xmm2, %xmm3 /* xmm3 = (upper << 7 bytes) | (lower >> 9 bytes) */ 965*533d3a49SEdward Gillett 966*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) /* 16-byte store, same index as the loads */ 967*533d3a49SEdward Gillett add $16, %rcx 968*533d3a49SEdward Gillett 969*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 970*533d3a49SEdward Gillett cmp %r10, %r8 971*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 972*533d3a49SEdward Gillett#endif 973*533d3a49SEdward Gillett pcmpeqb 16(%rsi, %rcx), %xmm0 974*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 975*533d3a49SEdward Gillett test %edx, %edx 976*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 977*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 978*533d3a49SEdward Gillett sub $16, %r8 979*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 980*533d3a49SEdward Gillett#endif 981*533d3a49SEdward Gillett 982*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 983*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm2 984*533d3a49SEdward Gillett 985*533d3a49SEdward Gillett psrldq $9, %xmm2 986*533d3a49SEdward Gillett pslldq $7, %xmm3 987*533d3a49SEdward Gillett por %xmm2,
%xmm3 988*533d3a49SEdward Gillett 989*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 990*533d3a49SEdward Gillett add $16, %rcx 991*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 992*533d3a49SEdward Gillett cmp %r10, %r8 993*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 994*533d3a49SEdward Gillett#endif 995*533d3a49SEdward Gillett jmp LABEL(ashr_9_use_sse2) 996*533d3a49SEdward Gillett 997*533d3a49SEdward Gillett 998*533d3a49SEdward Gillett/* 999*533d3a49SEdward Gillett * ashr_8 handles the following cases: 1000*533d3a49SEdward Gillett * (16 + (src offset - dest offset)) % 16 = 8 1001*533d3a49SEdward Gillett * 1002*533d3a49SEdward Gillett * Based on the above operation, there is no null byte from (%r9 + rsi) to the 1003*533d3a49SEdward Gillett * left of this cache bank. Each pass below stores bytes 8..23 of two adjacent 16-byte source blocks; an SSSE3 (palignr) and an SSE2 (psrldq/pslldq/por) version of the loop follow. 1004*533d3a49SEdward Gillett */ 1005*533d3a49SEdward Gillett .p2align 4 1006*533d3a49SEdward GillettLABEL(ashr_8): 1007*533d3a49SEdward Gillett xor %ecx, %ecx /* rcx = 0: common byte index for the (%rsi,%rcx) loads and (%rdi,%rcx) stores */ 1008*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1009*533d3a49SEdward Gillett cmp %r10, %r8 1010*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) /* strncpy only: leave at once when r8 <= r10 (unsigned) */ 1011*533d3a49SEdward Gillett#endif 1012*533d3a49SEdward Gillett testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3?
*/ 1013*533d3a49SEdward Gillett jz LABEL(ashr_8_use_sse2) /* USE_SSSE3 bit clear in .memops_method: take the SSE2 loop */ 1014*533d3a49SEdward Gillett 1015*533d3a49SEdward Gillett .p2align 4 1016*533d3a49SEdward GillettLABEL(ashr_8_use_ssse3): /* SSSE3 variant of the shift-8 loop; the body is written out twice */ 1017*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 /* xmm3 = upper source block */ 1018*533d3a49SEdward Gillett pcmpeqb %xmm3, %xmm0 /* byte-compare with xmm0 (presumably all-zero here, so matches are null bytes -- confirm) */ 1019*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 1020*533d3a49SEdward Gillett test %edx, %edx 1021*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 1022*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1023*533d3a49SEdward Gillett sub $16, %r8 1024*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) /* strncpy only: r8 was <= 16 before the subtraction */ 1025*533d3a49SEdward Gillett#endif 1026*533d3a49SEdward Gillett 1027*533d3a49SEdward Gillett #palignr $8, (%rsi, %rcx), %xmm3 1028*533d3a49SEdward Gillett .byte 0x66, 0x0F, 0x3A ,0x0F 1029*533d3a49SEdward Gillett .byte 0x1c, 0x0e, 0x08 /* hand-encoded palignr: ModRM 0x1c and SIB 0x0e select (%rsi,%rcx) and %xmm3; the last byte is the shift count 8 */ 1030*533d3a49SEdward Gillett 1031*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) /* 16-byte store, same index as the load */ 1032*533d3a49SEdward Gillett add $16, %rcx 1033*533d3a49SEdward Gillett 1034*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1035*533d3a49SEdward Gillett cmp %r10, %r8 1036*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 1037*533d3a49SEdward Gillett#endif 1038*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 1039*533d3a49SEdward Gillett pcmpeqb %xmm3, %xmm0 1040*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 1041*533d3a49SEdward Gillett test %edx, %edx 1042*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 1043*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1044*533d3a49SEdward Gillett sub $16, %r8 1045*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 1046*533d3a49SEdward Gillett#endif 1047*533d3a49SEdward Gillett 1048*533d3a49SEdward Gillett #palignr $8, (%rsi, %rcx), %xmm3 1049*533d3a49SEdward Gillett .byte 0x66, 0x0F, 0x3A ,0x0F 1050*533d3a49SEdward Gillett .byte 0x1c, 0x0e, 0x08 1051*533d3a49SEdward Gillett 1052*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 1053*533d3a49SEdward Gillett add $16, %rcx 1054*533d3a49SEdward Gillett#ifdef
USE_AS_STRNCPY 1055*533d3a49SEdward Gillett cmp %r10, %r8 1056*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 1057*533d3a49SEdward Gillett#endif 1058*533d3a49SEdward Gillett jmp LABEL(ashr_8_use_ssse3) 1059*533d3a49SEdward Gillett 1060*533d3a49SEdward Gillett .p2align 4 1061*533d3a49SEdward GillettLABEL(ashr_8_use_sse2): /* SSE2 variant of the shift-8 loop: psrldq/pslldq/por stand in for palignr; the body is written out twice */ 1062*533d3a49SEdward Gillett pcmpeqb 16(%rsi, %rcx), %xmm0 /* byte-compare the upper source block with xmm0 (presumably all-zero here, so matches are null bytes -- confirm) */ 1063*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 1064*533d3a49SEdward Gillett test %edx, %edx 1065*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 1066*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1067*533d3a49SEdward Gillett sub $16, %r8 1068*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) /* strncpy only: r8 was <= 16 before the subtraction */ 1069*533d3a49SEdward Gillett#endif 1070*533d3a49SEdward Gillett 1071*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 /* xmm3 = upper block */ 1072*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm2 /* xmm2 = lower block */ 1073*533d3a49SEdward Gillett 1074*533d3a49SEdward Gillett psrldq $8, %xmm2 1075*533d3a49SEdward Gillett pslldq $8, %xmm3 1076*533d3a49SEdward Gillett por %xmm2, %xmm3 /* xmm3 = (upper << 8 bytes) | (lower >> 8 bytes) */ 1077*533d3a49SEdward Gillett 1078*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) /* 16-byte store, same index as the loads */ 1079*533d3a49SEdward Gillett add $16, %rcx 1080*533d3a49SEdward Gillett 1081*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1082*533d3a49SEdward Gillett cmp %r10, %r8 1083*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 1084*533d3a49SEdward Gillett#endif 1085*533d3a49SEdward Gillett pcmpeqb 16(%rsi, %rcx), %xmm0 1086*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 1087*533d3a49SEdward Gillett test %edx, %edx 1088*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 1089*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1090*533d3a49SEdward Gillett sub $16, %r8 1091*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 1092*533d3a49SEdward Gillett#endif 1093*533d3a49SEdward Gillett 1094*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 1095*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm2 1096*533d3a49SEdward Gillett 1097*533d3a49SEdward Gillett psrldq $8,
%xmm2 1098*533d3a49SEdward Gillett pslldq $8, %xmm3 1099*533d3a49SEdward Gillett por %xmm2, %xmm3 1100*533d3a49SEdward Gillett 1101*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 1102*533d3a49SEdward Gillett add $16, %rcx 1103*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1104*533d3a49SEdward Gillett cmp %r10, %r8 1105*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 1106*533d3a49SEdward Gillett#endif 1107*533d3a49SEdward Gillett jmp LABEL(ashr_8_use_sse2) 1108*533d3a49SEdward Gillett 1109*533d3a49SEdward Gillett 1110*533d3a49SEdward Gillett/* 1111*533d3a49SEdward Gillett * ashr_7 handles the following cases: 1112*533d3a49SEdward Gillett * (16 + (src offset - dest offset)) % 16 = 7 1113*533d3a49SEdward Gillett * 1114*533d3a49SEdward Gillett * Based on the above operation, there is no null byte from (%r9 + rsi) to the 1115*533d3a49SEdward Gillett * left of this cache bank. Each pass below stores bytes 7..22 of two adjacent 16-byte source blocks; an SSSE3 (palignr) and an SSE2 (psrldq/pslldq/por) version of the loop follow. 1116*533d3a49SEdward Gillett */ 1117*533d3a49SEdward Gillett .p2align 4 1118*533d3a49SEdward GillettLABEL(ashr_7): 1119*533d3a49SEdward Gillett xor %ecx, %ecx /* rcx = 0: common byte index for the (%rsi,%rcx) loads and (%rdi,%rcx) stores */ 1120*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1121*533d3a49SEdward Gillett cmp %r10, %r8 1122*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) /* strncpy only: leave at once when r8 <= r10 (unsigned) */ 1123*533d3a49SEdward Gillett#endif 1124*533d3a49SEdward Gillett testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3?
*/ 1125*533d3a49SEdward Gillett jz LABEL(ashr_7_use_sse2) /* USE_SSSE3 bit clear in .memops_method: take the SSE2 loop */ 1126*533d3a49SEdward Gillett 1127*533d3a49SEdward Gillett .p2align 4 1128*533d3a49SEdward GillettLABEL(ashr_7_use_ssse3): /* SSSE3 variant of the shift-7 loop; the body is written out twice */ 1129*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 /* xmm3 = upper source block */ 1130*533d3a49SEdward Gillett pcmpeqb %xmm3, %xmm0 /* byte-compare with xmm0 (presumably all-zero here, so matches are null bytes -- confirm) */ 1131*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 1132*533d3a49SEdward Gillett test %edx, %edx 1133*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 1134*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1135*533d3a49SEdward Gillett sub $16, %r8 1136*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) /* strncpy only: r8 was <= 16 before the subtraction */ 1137*533d3a49SEdward Gillett#endif 1138*533d3a49SEdward Gillett 1139*533d3a49SEdward Gillett #palignr $7, (%rsi, %rcx), %xmm3 1140*533d3a49SEdward Gillett .byte 0x66, 0x0F, 0x3A ,0x0F 1141*533d3a49SEdward Gillett .byte 0x1c, 0x0e, 0x07 /* hand-encoded palignr: ModRM 0x1c and SIB 0x0e select (%rsi,%rcx) and %xmm3; the last byte is the shift count 7 */ 1142*533d3a49SEdward Gillett 1143*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) /* 16-byte store, same index as the load */ 1144*533d3a49SEdward Gillett add $16, %rcx 1145*533d3a49SEdward Gillett 1146*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1147*533d3a49SEdward Gillett cmp %r10, %r8 1148*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 1149*533d3a49SEdward Gillett#endif 1150*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 1151*533d3a49SEdward Gillett pcmpeqb %xmm3, %xmm0 1152*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 1153*533d3a49SEdward Gillett test %edx, %edx 1154*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 1155*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1156*533d3a49SEdward Gillett sub $16, %r8 1157*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 1158*533d3a49SEdward Gillett#endif 1159*533d3a49SEdward Gillett 1160*533d3a49SEdward Gillett #palignr $7, (%rsi, %rcx), %xmm3 1161*533d3a49SEdward Gillett .byte 0x66, 0x0F, 0x3A ,0x0F 1162*533d3a49SEdward Gillett .byte 0x1c, 0x0e, 0x07 1163*533d3a49SEdward Gillett 1164*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 1165*533d3a49SEdward Gillett add $16, %rcx 1166*533d3a49SEdward Gillett#ifdef
USE_AS_STRNCPY 1167*533d3a49SEdward Gillett cmp %r10, %r8 1168*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 1169*533d3a49SEdward Gillett#endif 1170*533d3a49SEdward Gillett jmp LABEL(ashr_7_use_ssse3) 1171*533d3a49SEdward Gillett 1172*533d3a49SEdward Gillett .p2align 4 1173*533d3a49SEdward GillettLABEL(ashr_7_use_sse2): /* SSE2 variant of the shift-7 loop: psrldq/pslldq/por stand in for palignr; the body is written out twice */ 1174*533d3a49SEdward Gillett pcmpeqb 16(%rsi, %rcx), %xmm0 /* byte-compare the upper source block with xmm0 (presumably all-zero here, so matches are null bytes -- confirm) */ 1175*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 1176*533d3a49SEdward Gillett test %edx, %edx 1177*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 1178*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1179*533d3a49SEdward Gillett sub $16, %r8 1180*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) /* strncpy only: r8 was <= 16 before the subtraction */ 1181*533d3a49SEdward Gillett#endif 1182*533d3a49SEdward Gillett 1183*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 /* xmm3 = upper block */ 1184*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm2 /* xmm2 = lower block */ 1185*533d3a49SEdward Gillett 1186*533d3a49SEdward Gillett psrldq $7, %xmm2 1187*533d3a49SEdward Gillett pslldq $9, %xmm3 1188*533d3a49SEdward Gillett por %xmm2, %xmm3 /* xmm3 = (upper << 9 bytes) | (lower >> 7 bytes) */ 1189*533d3a49SEdward Gillett 1190*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) /* 16-byte store, same index as the loads */ 1191*533d3a49SEdward Gillett add $16, %rcx 1192*533d3a49SEdward Gillett 1193*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1194*533d3a49SEdward Gillett cmp %r10, %r8 1195*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 1196*533d3a49SEdward Gillett#endif 1197*533d3a49SEdward Gillett pcmpeqb 16(%rsi, %rcx), %xmm0 1198*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 1199*533d3a49SEdward Gillett test %edx, %edx 1200*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 1201*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1202*533d3a49SEdward Gillett sub $16, %r8 1203*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 1204*533d3a49SEdward Gillett#endif 1205*533d3a49SEdward Gillett 1206*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 1207*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm2 1208*533d3a49SEdward Gillett 1209*533d3a49SEdward Gillett psrldq $7,
%xmm2 1210*533d3a49SEdward Gillett pslldq $9, %xmm3 1211*533d3a49SEdward Gillett por %xmm2, %xmm3 1212*533d3a49SEdward Gillett 1213*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 1214*533d3a49SEdward Gillett add $16, %rcx 1215*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1216*533d3a49SEdward Gillett cmp %r10, %r8 1217*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 1218*533d3a49SEdward Gillett#endif 1219*533d3a49SEdward Gillett jmp LABEL(ashr_7_use_sse2) 1220*533d3a49SEdward Gillett 1221*533d3a49SEdward Gillett 1222*533d3a49SEdward Gillett/* 1223*533d3a49SEdward Gillett * ashr_6 handles the following cases: 1224*533d3a49SEdward Gillett * (16 + (src offset - dest offset)) % 16 = 6 1225*533d3a49SEdward Gillett * 1226*533d3a49SEdward Gillett * Based on the above operation, there is no null byte from (%r9 + rsi) to the 1227*533d3a49SEdward Gillett * left of this cache bank. Each pass below stores bytes 6..21 of two adjacent 16-byte source blocks; an SSSE3 (palignr) and an SSE2 (psrldq/pslldq/por) version of the loop follow. 1228*533d3a49SEdward Gillett */ 1229*533d3a49SEdward Gillett .p2align 4 1230*533d3a49SEdward GillettLABEL(ashr_6): 1231*533d3a49SEdward Gillett xor %ecx, %ecx /* rcx = 0: common byte index for the (%rsi,%rcx) loads and (%rdi,%rcx) stores */ 1232*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1233*533d3a49SEdward Gillett cmp %r10, %r8 1234*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) /* strncpy only: leave at once when r8 <= r10 (unsigned) */ 1235*533d3a49SEdward Gillett#endif 1236*533d3a49SEdward Gillett testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3?
*/ 1237*533d3a49SEdward Gillett jz LABEL(ashr_6_use_sse2) /* USE_SSSE3 bit clear in .memops_method: take the SSE2 loop */ 1238*533d3a49SEdward Gillett 1239*533d3a49SEdward Gillett .p2align 4 1240*533d3a49SEdward GillettLABEL(ashr_6_use_ssse3): /* SSSE3 variant of the shift-6 loop; the body is written out twice */ 1241*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 /* xmm3 = upper source block */ 1242*533d3a49SEdward Gillett pcmpeqb %xmm3, %xmm0 /* byte-compare with xmm0 (presumably all-zero here, so matches are null bytes -- confirm) */ 1243*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 1244*533d3a49SEdward Gillett test %edx, %edx 1245*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 1246*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1247*533d3a49SEdward Gillett sub $16, %r8 1248*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) /* strncpy only: r8 was <= 16 before the subtraction */ 1249*533d3a49SEdward Gillett#endif 1250*533d3a49SEdward Gillett 1251*533d3a49SEdward Gillett #palignr $6, (%rsi, %rcx), %xmm3 1252*533d3a49SEdward Gillett .byte 0x66, 0x0F, 0x3A ,0x0F 1253*533d3a49SEdward Gillett .byte 0x1c, 0x0e, 0x06 /* hand-encoded palignr: ModRM 0x1c and SIB 0x0e select (%rsi,%rcx) and %xmm3; the last byte is the shift count 6 */ 1254*533d3a49SEdward Gillett 1255*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) /* 16-byte store, same index as the load */ 1256*533d3a49SEdward Gillett add $16, %rcx 1257*533d3a49SEdward Gillett 1258*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1259*533d3a49SEdward Gillett cmp %r10, %r8 1260*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 1261*533d3a49SEdward Gillett#endif 1262*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 1263*533d3a49SEdward Gillett pcmpeqb %xmm3, %xmm0 1264*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 1265*533d3a49SEdward Gillett test %edx, %edx 1266*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 1267*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1268*533d3a49SEdward Gillett sub $16, %r8 1269*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 1270*533d3a49SEdward Gillett#endif 1271*533d3a49SEdward Gillett 1272*533d3a49SEdward Gillett #palignr $6, (%rsi, %rcx), %xmm3 1273*533d3a49SEdward Gillett .byte 0x66, 0x0F, 0x3A ,0x0F 1274*533d3a49SEdward Gillett .byte 0x1c, 0x0e, 0x06 1275*533d3a49SEdward Gillett 1276*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 1277*533d3a49SEdward Gillett add $16, %rcx 1278*533d3a49SEdward Gillett#ifdef
USE_AS_STRNCPY 1279*533d3a49SEdward Gillett cmp %r10, %r8 1280*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 1281*533d3a49SEdward Gillett#endif 1282*533d3a49SEdward Gillett jmp LABEL(ashr_6_use_ssse3) 1283*533d3a49SEdward Gillett 1284*533d3a49SEdward Gillett .p2align 4 1285*533d3a49SEdward GillettLABEL(ashr_6_use_sse2): /* SSE2 variant of the shift-6 loop: psrldq/pslldq/por stand in for palignr; the body is written out twice */ 1286*533d3a49SEdward Gillett pcmpeqb 16(%rsi, %rcx), %xmm0 /* byte-compare the upper source block with xmm0 (presumably all-zero here, so matches are null bytes -- confirm) */ 1287*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 1288*533d3a49SEdward Gillett test %edx, %edx 1289*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 1290*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1291*533d3a49SEdward Gillett sub $16, %r8 1292*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) /* strncpy only: r8 was <= 16 before the subtraction */ 1293*533d3a49SEdward Gillett#endif 1294*533d3a49SEdward Gillett 1295*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 /* xmm3 = upper block */ 1296*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm2 /* xmm2 = lower block */ 1297*533d3a49SEdward Gillett 1298*533d3a49SEdward Gillett psrldq $6, %xmm2 1299*533d3a49SEdward Gillett pslldq $10, %xmm3 1300*533d3a49SEdward Gillett por %xmm2, %xmm3 /* xmm3 = (upper << 10 bytes) | (lower >> 6 bytes) */ 1301*533d3a49SEdward Gillett 1302*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) /* 16-byte store, same index as the loads */ 1303*533d3a49SEdward Gillett add $16, %rcx 1304*533d3a49SEdward Gillett 1305*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1306*533d3a49SEdward Gillett cmp %r10, %r8 1307*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 1308*533d3a49SEdward Gillett#endif 1309*533d3a49SEdward Gillett pcmpeqb 16(%rsi, %rcx), %xmm0 1310*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 1311*533d3a49SEdward Gillett test %edx, %edx 1312*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 1313*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1314*533d3a49SEdward Gillett sub $16, %r8 1315*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 1316*533d3a49SEdward Gillett#endif 1317*533d3a49SEdward Gillett 1318*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 1319*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm2 1320*533d3a49SEdward Gillett 1321*533d3a49SEdward Gillett psrldq $6,
%xmm2 1322*533d3a49SEdward Gillett pslldq $10, %xmm3 1323*533d3a49SEdward Gillett por %xmm2, %xmm3 1324*533d3a49SEdward Gillett 1325*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 1326*533d3a49SEdward Gillett add $16, %rcx 1327*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1328*533d3a49SEdward Gillett cmp %r10, %r8 1329*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 1330*533d3a49SEdward Gillett#endif 1331*533d3a49SEdward Gillett jmp LABEL(ashr_6_use_sse2) 1332*533d3a49SEdward Gillett 1333*533d3a49SEdward Gillett 1334*533d3a49SEdward Gillett/* 1335*533d3a49SEdward Gillett * ashr_5 handles the following cases: 1336*533d3a49SEdward Gillett * (16 + (src offset - dest offset)) % 16 = 5 1337*533d3a49SEdward Gillett * 1338*533d3a49SEdward Gillett * Based on the above operation, there is no null byte from (%r9 + rsi) to the 1339*533d3a49SEdward Gillett * left of this cache bank. Each pass below stores bytes 5..20 of two adjacent 16-byte source blocks; an SSSE3 (palignr) and an SSE2 (psrldq/pslldq/por) version of the loop follow. 1340*533d3a49SEdward Gillett */ 1341*533d3a49SEdward Gillett .p2align 4 1342*533d3a49SEdward GillettLABEL(ashr_5): 1343*533d3a49SEdward Gillett xor %ecx, %ecx /* rcx = 0: common byte index for the (%rsi,%rcx) loads and (%rdi,%rcx) stores */ 1344*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1345*533d3a49SEdward Gillett cmp %r10, %r8 1346*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) /* strncpy only: leave at once when r8 <= r10 (unsigned) */ 1347*533d3a49SEdward Gillett#endif 1348*533d3a49SEdward Gillett testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3?
*/ 1349*533d3a49SEdward Gillett jz LABEL(ashr_5_use_sse2) /* USE_SSSE3 bit clear in .memops_method: take the SSE2 loop */ 1350*533d3a49SEdward Gillett 1351*533d3a49SEdward Gillett .p2align 4 1352*533d3a49SEdward GillettLABEL(ashr_5_use_ssse3): /* SSSE3 variant of the shift-5 loop; the body is written out twice */ 1353*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 /* xmm3 = upper source block */ 1354*533d3a49SEdward Gillett pcmpeqb %xmm3, %xmm0 /* byte-compare with xmm0 (presumably all-zero here, so matches are null bytes -- confirm) */ 1355*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 1356*533d3a49SEdward Gillett test %edx, %edx 1357*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 1358*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1359*533d3a49SEdward Gillett sub $16, %r8 1360*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) /* strncpy only: r8 was <= 16 before the subtraction */ 1361*533d3a49SEdward Gillett#endif 1362*533d3a49SEdward Gillett 1363*533d3a49SEdward Gillett #palignr $5, (%rsi, %rcx), %xmm3 1364*533d3a49SEdward Gillett .byte 0x66, 0x0F, 0x3A ,0x0F 1365*533d3a49SEdward Gillett .byte 0x1c, 0x0e, 0x05 /* hand-encoded palignr: ModRM 0x1c and SIB 0x0e select (%rsi,%rcx) and %xmm3; the last byte is the shift count 5 */ 1366*533d3a49SEdward Gillett 1367*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) /* 16-byte store, same index as the load */ 1368*533d3a49SEdward Gillett add $16, %rcx 1369*533d3a49SEdward Gillett 1370*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1371*533d3a49SEdward Gillett cmp %r10, %r8 1372*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 1373*533d3a49SEdward Gillett#endif 1374*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 1375*533d3a49SEdward Gillett pcmpeqb %xmm3, %xmm0 1376*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 1377*533d3a49SEdward Gillett test %edx, %edx 1378*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 1379*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1380*533d3a49SEdward Gillett sub $16, %r8 1381*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 1382*533d3a49SEdward Gillett#endif 1383*533d3a49SEdward Gillett 1384*533d3a49SEdward Gillett #palignr $5, (%rsi, %rcx), %xmm3 1385*533d3a49SEdward Gillett .byte 0x66, 0x0F, 0x3A ,0x0F 1386*533d3a49SEdward Gillett .byte 0x1c, 0x0e, 0x05 1387*533d3a49SEdward Gillett 1388*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 1389*533d3a49SEdward Gillett add $16, %rcx 1390*533d3a49SEdward Gillett#ifdef
USE_AS_STRNCPY 1391*533d3a49SEdward Gillett cmp %r10, %r8 1392*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 1393*533d3a49SEdward Gillett#endif 1394*533d3a49SEdward Gillett jmp LABEL(ashr_5_use_ssse3) 1395*533d3a49SEdward Gillett 1396*533d3a49SEdward Gillett .p2align 4 1397*533d3a49SEdward GillettLABEL(ashr_5_use_sse2): 1398*533d3a49SEdward Gillett pcmpeqb 16(%rsi, %rcx), %xmm0 1399*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 1400*533d3a49SEdward Gillett test %edx, %edx 1401*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 1402*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1403*533d3a49SEdward Gillett sub $16, %r8 1404*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 1405*533d3a49SEdward Gillett#endif 1406*533d3a49SEdward Gillett 1407*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 1408*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm2 1409*533d3a49SEdward Gillett 1410*533d3a49SEdward Gillett psrldq $5, %xmm2 1411*533d3a49SEdward Gillett pslldq $11, %xmm3 1412*533d3a49SEdward Gillett por %xmm2, %xmm3 1413*533d3a49SEdward Gillett 1414*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 1415*533d3a49SEdward Gillett add $16, %rcx 1416*533d3a49SEdward Gillett 1417*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1418*533d3a49SEdward Gillett cmp %r10, %r8 1419*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 1420*533d3a49SEdward Gillett#endif 1421*533d3a49SEdward Gillett pcmpeqb 16(%rsi, %rcx), %xmm0 1422*533d3a49SEdward Gillett pmovmskb %xmm0, %edx 1423*533d3a49SEdward Gillett test %edx, %edx 1424*533d3a49SEdward Gillett jnz LABEL(unaligned_exit) 1425*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1426*533d3a49SEdward Gillett sub $16, %r8 1427*533d3a49SEdward Gillett jbe LABEL(strncpy_truncation_unaligned) 1428*533d3a49SEdward Gillett#endif 1429*533d3a49SEdward Gillett 1430*533d3a49SEdward Gillett movdqa 16(%rsi, %rcx), %xmm3 1431*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm2 1432*533d3a49SEdward Gillett 1433*533d3a49SEdward Gillett psrldq $5,
%xmm2 1434*533d3a49SEdward Gillett pslldq $11, %xmm3 1435*533d3a49SEdward Gillett por %xmm2, %xmm3 1436*533d3a49SEdward Gillett 1437*533d3a49SEdward Gillett movdqa %xmm3, (%rdi, %rcx) 1438*533d3a49SEdward Gillett add $16, %rcx 1439*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1440*533d3a49SEdward Gillett cmp %r10, %r8 1441*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 1442*533d3a49SEdward Gillett#endif 1443*533d3a49SEdward Gillett jmp LABEL(ashr_5_use_sse2) 1444*533d3a49SEdward Gillett 1445*533d3a49SEdward Gillett 1446*533d3a49SEdward Gillett/* 1447*533d3a49SEdward Gillett * ashr_4 handles the following cases: 1448*533d3a49SEdward Gillett * (16 + (src offset - dest offset)) % 16 = 4 1449*533d3a49SEdward Gillett * 1450*533d3a49SEdward Gillett * Based on above operation, start from (%r9 + rsi) to the left of this cache 1451*533d3a49SEdward Gillett * bank, there is no null byte. 1452*533d3a49SEdward Gillett */ 1453*533d3a49SEdward Gillett .p2align 4 1454*533d3a49SEdward GillettLABEL(ashr_4): 1455*533d3a49SEdward Gillett xor %ecx, %ecx /* clear index */ 1456*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 1457*533d3a49SEdward Gillett cmp %r10, %r8 1458*533d3a49SEdward Gillett jbe LABEL(unaligned_exit) 1459*533d3a49SEdward Gillett#endif 1460*533d3a49SEdward Gillett testl $USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? 
*/
	jz	LABEL(ashr_4_use_sse2)

	/*
	 * SSSE3 loop, unrolled twice.  Each pass joins bytes 4..15 of the
	 * block at (%rsi, %rcx) with bytes 0..3 of the block at
	 * 16(%rsi, %rcx), stores the result to (%rdi, %rcx) and adds 16
	 * to %rcx.
	 */
	.p2align 4
LABEL(ashr_4_use_ssse3):
	/* pass 1 */
	movdqa	16(%rsi, %rcx), %xmm3	/* upper source block */
	pcmpeqb	%xmm3, %xmm0		/* null check; assumes %xmm0 is all zero here */
	pmovmskb %xmm0, %edx		/* one mask bit per matching byte */
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* hand-encoded instruction; modrm 0x1c / sib 0x0e = (%rsi, %rcx), %xmm3 */
	#palignr $4, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x04

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* pass 2: same steps as pass 1 */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $4, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x04

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_4_use_ssse3)

	/*
	 * SSE2 loop: same result as the SSSE3 loop, built from two byte
	 * shifts and an OR.
	 */
	.p2align 4
LABEL(ashr_4_use_sse2):
	/* pass 1 */
	pcmpeqb	16(%rsi, %rcx), %xmm0	/* null check on the upper block */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3	/* upper block */
	movdqa	(%rsi, %rcx), %xmm2	/* lower block */

	psrldq	$4, %xmm2		/* drop the 4 low bytes of the lower block */
	pslldq	$12, %xmm3		/* move upper bytes 0..3 to the top */
	por	%xmm2, %xmm3		/* merged 16 bytes */

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* pass 2: same steps as pass 1 */
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$4, %xmm2
	pslldq	$12, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_4_use_sse2)


/*
 * ashr_3 handles the following cases:
 *	(16 + (src offset - dest offset)) % 16 = 3
 *
 * Per the original authors: starting from (%r9 + rsi) to the left of this
 * cache bank there is no null byte.
 *
 * Entry stub: clear the byte index, leave at once for strncpy when
 * %r8 <= %r10 (unsigned), then choose the SSSE3 or the SSE2 copy loop.
 */
	.p2align 4
LABEL(ashr_3):
	xor	%ecx, %ecx		/* byte index starts at 0 */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3?
*/
	jz	LABEL(ashr_3_use_sse2)

	/*
	 * SSSE3 loop, unrolled twice.  Each pass joins bytes 3..15 of the
	 * block at (%rsi, %rcx) with bytes 0..2 of the block at
	 * 16(%rsi, %rcx), stores the result to (%rdi, %rcx) and adds 16
	 * to %rcx.
	 */
	.p2align 4
LABEL(ashr_3_use_ssse3):
	/* pass 1 */
	movdqa	16(%rsi, %rcx), %xmm3	/* upper source block */
	pcmpeqb	%xmm3, %xmm0		/* null check; assumes %xmm0 is all zero here */
	pmovmskb %xmm0, %edx		/* one mask bit per matching byte */
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* hand-encoded instruction; modrm 0x1c / sib 0x0e = (%rsi, %rcx), %xmm3 */
	#palignr $3, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x03

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* pass 2: same steps as pass 1 */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $3, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x03

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_3_use_ssse3)

	/*
	 * SSE2 loop: same result as the SSSE3 loop, built from two byte
	 * shifts and an OR.
	 */
	.p2align 4
LABEL(ashr_3_use_sse2):
	/* pass 1 */
	pcmpeqb	16(%rsi, %rcx), %xmm0	/* null check on the upper block */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3	/* upper block */
	movdqa	(%rsi, %rcx), %xmm2	/* lower block */

	psrldq	$3, %xmm2		/* drop the 3 low bytes of the lower block */
	pslldq	$13, %xmm3		/* move upper bytes 0..2 to the top */
	por	%xmm2, %xmm3		/* merged 16 bytes */

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* pass 2: same steps as pass 1 */
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$3, %xmm2
	pslldq	$13, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_3_use_sse2)


/*
 * ashr_2 handles the following cases:
 *	(16 + (src offset - dest offset)) % 16 = 2
 *
 * Per the original authors: starting from (%r9 + rsi) to the left of this
 * cache bank there is no null byte.
 *
 * Entry stub: clear the byte index, leave at once for strncpy when
 * %r8 <= %r10 (unsigned), then choose the SSSE3 or the SSE2 copy loop.
 */
	.p2align 4
LABEL(ashr_2):
	xor	%ecx, %ecx		/* byte index starts at 0 */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3?
*/
	jz	LABEL(ashr_2_use_sse2)

	/*
	 * SSSE3 loop, unrolled twice.  Each pass joins bytes 2..15 of the
	 * block at (%rsi, %rcx) with bytes 0..1 of the block at
	 * 16(%rsi, %rcx), stores the result to (%rdi, %rcx) and adds 16
	 * to %rcx.
	 */
	.p2align 4
LABEL(ashr_2_use_ssse3):
	/* pass 1 */
	movdqa	16(%rsi, %rcx), %xmm3	/* upper source block */
	pcmpeqb	%xmm3, %xmm0		/* null check; assumes %xmm0 is all zero here */
	pmovmskb %xmm0, %edx		/* one mask bit per matching byte */
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* hand-encoded instruction; modrm 0x1c / sib 0x0e = (%rsi, %rcx), %xmm3 */
	#palignr $2, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x02

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* pass 2: same steps as pass 1 */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $2, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x02

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_2_use_ssse3)

	/*
	 * SSE2 loop: same result as the SSSE3 loop, built from two byte
	 * shifts and an OR.
	 */
	.p2align 4
LABEL(ashr_2_use_sse2):
	/* pass 1 */
	pcmpeqb	16(%rsi, %rcx), %xmm0	/* null check on the upper block */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3	/* upper block */
	movdqa	(%rsi, %rcx), %xmm2	/* lower block */

	psrldq	$2, %xmm2		/* drop the 2 low bytes of the lower block */
	pslldq	$14, %xmm3		/* move upper bytes 0..1 to the top */
	por	%xmm2, %xmm3		/* merged 16 bytes */

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* pass 2: same steps as pass 1 */
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$2, %xmm2
	pslldq	$14, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_2_use_sse2)


/*
 * ashr_1 handles the following cases:
 *	(16 + (src offset - dest offset)) % 16 = 1
 *
 * Per the original authors: starting from (%r9 + rsi) to the left of this
 * cache bank there is no null byte.
 *
 * Entry stub: clear the byte index, leave at once for strncpy when
 * %r8 <= %r10 (unsigned), then choose the SSSE3 or the SSE2 copy loop.
 */
	.p2align 4
LABEL(ashr_1):
	xor	%ecx, %ecx		/* byte index starts at 0 */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3?
*/
	jz	LABEL(ashr_1_use_sse2)

	/*
	 * SSSE3 loop, unrolled twice.  Each pass joins bytes 1..15 of the
	 * block at (%rsi, %rcx) with byte 0 of the block at
	 * 16(%rsi, %rcx), stores the result to (%rdi, %rcx) and adds 16
	 * to %rcx.
	 */
	.p2align 4
LABEL(ashr_1_use_ssse3):
	/* pass 1 */
	movdqa	16(%rsi, %rcx), %xmm3	/* upper source block */
	pcmpeqb	%xmm3, %xmm0		/* null check; assumes %xmm0 is all zero here */
	pmovmskb %xmm0, %edx		/* one mask bit per matching byte */
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	/* hand-encoded instruction; modrm 0x1c / sib 0x0e = (%rsi, %rcx), %xmm3 */
	#palignr $1, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x01

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* pass 2: same steps as pass 1 */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $1, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x01

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_1_use_ssse3)

	/*
	 * SSE2 loop: same result as the SSSE3 loop, built from two byte
	 * shifts and an OR.
	 */
	.p2align 4
LABEL(ashr_1_use_sse2):
	/* pass 1 */
	pcmpeqb	16(%rsi, %rcx), %xmm0	/* null check on the upper block */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3	/* upper block */
	movdqa	(%rsi, %rcx), %xmm2	/* lower block */

	psrldq	$1, %xmm2		/* drop the low byte of the lower block */
	pslldq	$15, %xmm3		/* move upper byte 0 to the top */
	por	%xmm2, %xmm3		/* merged 16 bytes */

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* pass 2: same steps as pass 1 */
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$1, %xmm2
	pslldq	$15, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_1_use_sse2)


	/*
	 * Exit tail code:
	 * Up to 32 bytes are copied in the case of strcpy.
	 *
	 * The labels below are stacked entry points; each one falls
	 * through into the next, so an earlier entry performs every
	 * fix-up that follows it.
	 */
	.p2align 4
LABEL(less32bytes):
	xor	%ecx, %ecx		/* enter with a zero byte index */
LABEL(unaligned_exit):
	add	%r9, %rsi		/* r9 holds offset of rsi */
	mov	%rcx, %r9		/* park the index: %cl is needed as shift count */
	mov	%r10, %rcx
	shl	%cl, %edx		/* shift the mask left by the count held in %r10 */
	mov	%r9, %rcx		/* index back in %rcx */
	.p2align 4
LABEL(aligned_exit):
	add	%rcx, %rdi		/* locate exact address for rdi */
LABEL(less16bytes):
	add	%rcx, %rsi		/* locate exact address for rsi */
LABEL(aligned_16bytes):
#ifdef USE_AS_STRNCPY
	/*
	 * Null found in 16bytes checked. Set bit in bitmask corresponding to
	 * the strncpy count argument. We will copy to the null (inclusive)
	 * or count whichever comes first.
*/
	mov	$1, %r9d
	lea	-1(%r8), %rcx		/* bit position = count - 1 */
	shl	%cl, %r9d		/* 32-bit shift: only the low 5 bits of %cl count */
	cmp	$32, %r8
	ja	LABEL(strncpy_tail)	/* count above 32: mask is left untouched */
	or	%r9d, %edx		/* mark the byte where the count runs out */
LABEL(strncpy_tail):
#endif
	/*
	 * Check to see if BSF is fast on this processor. If not, use a
	 * different exit tail.
	 */
	testb	$USE_BSF, .memops_method(%rip)
	jz	LABEL(AMD_exit)
	bsf	%rdx, %rcx		/* index of the lowest set mask bit */
	/*
	 * tail_table entries are 4 bytes wide and sign-extended; each one is
	 * added to the table address to form the jump target.
	 */
	lea	LABEL(tail_table)(%rip), %r11
	movslq	(%r11, %rcx, 4), %rcx
	lea	(%r11, %rcx), %rcx
	jmp	*%rcx

#ifdef USE_AS_STRNCPY
	/*
	 * Count reached before null found.
*/
	.p2align 4
LABEL(less32bytes_strncpy_truncation):
	xor	%ecx, %ecx		/* enter with a zero byte index */
LABEL(strncpy_truncation_unaligned):
	add	%r9, %rsi		/* next src char to copy */
LABEL(strncpy_truncation_aligned):
	add	%rcx, %rdi		/* advance both pointers by the byte index */
	add	%rcx, %rsi
	add	$16, %r8		/* compensation: undo the preceding sub $16 */
	lea	-1(%r8), %rcx		/* table slot = remaining count - 1 */
	lea	LABEL(tail_table)(%rip), %r11
	movslq	(%r11, %rcx, 4), %rcx	/* 4-byte signed entry ... */
	lea	(%r11, %rcx), %rcx	/* ... added to the table address */
	jmp	*%rcx

	/* Reached from the strncpy entry when the count tests as zero. */
	.p2align 4
LABEL(strncpy_exitz):
	mov	%rdi, %rax		/* return the destination pointer */
	ret
#endif

	/*
	 * Exit tail used when the USE_BSF flag is clear: walk the low
	 * mask byte one bit at a time, lowest bit first, and jump to the
	 * matching tail.  A zero low byte defers to AMD_exit_more_8; if
	 * none of bits 0..6 is set, control falls through to tail_7.
	 */
	.p2align 4
LABEL(AMD_exit):
	test	%dl, %dl
	jz	LABEL(AMD_exit_more_8)
	test	$0x01, %dl
	jnz	LABEL(tail_0)
	test	$0x02, %dl
	jnz	LABEL(tail_1)
	test	$0x04, %dl
	jnz	LABEL(tail_2)
	test	$0x08, %dl
	jnz	LABEL(tail_3)
	test	$0x10, %dl
	jnz	LABEL(tail_4)
	test	$0x20, %dl
	jnz	LABEL(tail_5)
	test	$0x40, %dl
	jnz	LABEL(tail_6)

	.p2align 4
LABEL(tail_7):	/* 8 bytes
*/
	mov	(%rsi), %rcx		/* copy all 8 bytes with one load/store */
	mov	%rcx, (%rdi)
#ifdef USE_AS_STRNCPY
	mov	$8, %cl			/* bytes just stored, for strncpy_fill_tail */
	sub	$8, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

#ifdef USE_AS_STRNCPY
	/*
	 * strncpy_fill_tail: null terminated src string shorter than count.
	 * Fill the rest of the destination with null chars.
	 *
	 * In:	%cl  = number of bytes the tail copy just stored at (%rdi)
	 *	%r8  = number of pad bytes still to write (non-zero: the
	 *	       tails visible here enter via jnz after updating %r8)
	 *	%rax = value to return
	 * Out:	%rax unchanged (parked in %rdx while %rax supplies zeros)
	 *
	 * The quadword count is formed with a 64-bit shift so that a pad
	 * length of 4 GiB or more is not truncated before rep stosq.
	 */
	.p2align 4
LABEL(strncpy_fill_tail):
	mov	%rax, %rdx		/* save the return value */
	movzx	%cl, %rax
	mov	%r8, %rcx
	add	%rax, %rdi		/* step past the bytes already stored */
	xor	%eax, %eax		/* %rax = 0: the fill pattern */
	shr	$3, %rcx		/* whole quadwords to clear (full 64-bit count) */
	jz	LABEL(strncpy_fill_less_8)

	rep	stosq
LABEL(strncpy_fill_less_8):
	mov	%r8, %rcx
	and	$7, %rcx		/* 0..7 bytes left after the quadwords */
	jz	LABEL(strncpy_fill_return)
LABEL(strncpy_fill_less_7):
	sub	$1, %ecx		/* sets ZF for the jnz below ... */
	mov	%al, (%rdi, %rcx)	/* ... mov leaves the flags alone */
	jnz	LABEL(strncpy_fill_less_7)
LABEL(strncpy_fill_return):
	mov	%rdx, %rax		/* restore the return value */
	ret
#endif

	.p2align 4
LABEL(tail_0):	/* 1 byte */
	mov	(%rsi), %cl
	mov	%cl, (%rdi)
#ifdef USE_AS_STRNCPY
mov	$1, %cl
	sub	$1, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	/*
	 * Fixed-size exit tails.  tail_K copies K + 1 bytes from (%rsi) to
	 * (%rdi) using at most two loads and stores; where two are used
	 * they overlap.  For strncpy each tail then puts the byte count in
	 * %cl, subtracts it from %r8 and, if anything remains, jumps to
	 * strncpy_fill_tail.
	 */
	.p2align 4
LABEL(tail_1):	/* 2 bytes */
	mov	(%rsi), %cx
	mov	%cx, (%rdi)
#ifdef USE_AS_STRNCPY
	mov	$2, %cl
	sub	$2, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_2):	/* 3 bytes */
	mov	(%rsi), %cx		/* bytes 0..1 */
	mov	%cx, (%rdi)
	mov	1(%rsi), %cx		/* bytes 1..2 */
	mov	%cx, 1(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$3, %cl
	sub	$3, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_3):	/* 4 bytes */
	mov	(%rsi), %ecx
	mov	%ecx, (%rdi)
#ifdef USE_AS_STRNCPY
	mov	$4, %cl
	sub	$4, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_4):	/* 5 bytes */
	mov	(%rsi), %ecx		/* bytes 0..3 */
	mov	%ecx, (%rdi)
	mov	1(%rsi), %edx		/* bytes 1..4 */
	mov	%edx, 1(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$5, %cl
	sub	$5, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_5):	/* 6 bytes */
	mov	(%rsi), %ecx		/* bytes 0..3 */
	mov	%ecx, (%rdi)
	mov	2(%rsi), %edx		/* bytes 2..5 */
	mov	%edx, 2(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$6, %cl
	sub	$6, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_6):	/* 7 bytes */
	mov	(%rsi), %ecx		/* bytes 0..3 */
	mov	%ecx, (%rdi)
	mov	3(%rsi), %edx		/* bytes 3..6 */
	mov	%edx,3(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$7, %cl
	sub	$7, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_8):	/* 9 bytes */
	mov	(%rsi), %rcx		/* bytes 0..7 */
	mov	%rcx, (%rdi)
	mov	5(%rsi), %edx		/* bytes 5..8 */
	mov	%edx, 5(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$9, %cl
	sub	$9, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	/*
	 * Second stage of the bit-by-bit exit: examine mask bits 8..15
	 * (held in %dh), lowest first.  A zero %dh defers to
	 * AMD_exit_more_16; if none of bits 8..14 is set, control falls
	 * through to tail_15.
	 */
	.p2align 4
LABEL(AMD_exit_more_8):
	test	%dh, %dh
	jz	LABEL(AMD_exit_more_16)
	test	$0x01, %dh
	jnz	LABEL(tail_8)
	test	$0x02, %dh
	jnz	LABEL(tail_9)
	test	$0x04, %dh
	jnz	LABEL(tail_10)
	test	$0x08, %dh
	jnz	LABEL(tail_11)
	test	$0x10, %dh
	jnz	LABEL(tail_12)
	test	$0x20, %dh
	jnz	LABEL(tail_13)
	test	$0x40, %dh
	jnz	LABEL(tail_14)

	.p2align 4
LABEL(tail_15):	/* 16 bytes */
	mov	(%rsi), %rcx		/* bytes 0..7 */
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx		/* bytes 8..15 */
	mov	%rdx, 8(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$16, %cl
	sub	$16, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_9):	/* 10 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	6(%rsi), %edx
	mov	%edx, 6(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$10, %cl
	sub	$10, %r8
	jnz	LABEL(strncpy_fill_tail)
2161*533d3a49SEdward Gillett#endif 2162*533d3a49SEdward Gillett ret 2163*533d3a49SEdward Gillett 2164*533d3a49SEdward Gillett .p2align 4 2165*533d3a49SEdward GillettLABEL(tail_10): /* 11 bytes */ 2166*533d3a49SEdward Gillett mov (%rsi), %rcx 2167*533d3a49SEdward Gillett mov %rcx, (%rdi) 2168*533d3a49SEdward Gillett mov 7(%rsi), %edx 2169*533d3a49SEdward Gillett mov %edx, 7(%rdi) 2170*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 2171*533d3a49SEdward Gillett mov $11, %cl 2172*533d3a49SEdward Gillett sub $11, %r8 2173*533d3a49SEdward Gillett jnz LABEL(strncpy_fill_tail) 2174*533d3a49SEdward Gillett#endif 2175*533d3a49SEdward Gillett ret 2176*533d3a49SEdward Gillett 2177*533d3a49SEdward Gillett .p2align 4 2178*533d3a49SEdward GillettLABEL(tail_11): /* 12 bytes */ 2179*533d3a49SEdward Gillett mov (%rsi), %rcx 2180*533d3a49SEdward Gillett mov %rcx, (%rdi) 2181*533d3a49SEdward Gillett mov 8(%rsi), %edx 2182*533d3a49SEdward Gillett mov %edx, 8(%rdi) 2183*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 2184*533d3a49SEdward Gillett mov $12, %cl 2185*533d3a49SEdward Gillett sub $12, %r8 2186*533d3a49SEdward Gillett jnz LABEL(strncpy_fill_tail) 2187*533d3a49SEdward Gillett#endif 2188*533d3a49SEdward Gillett ret 2189*533d3a49SEdward Gillett 2190*533d3a49SEdward Gillett .p2align 4 2191*533d3a49SEdward GillettLABEL(tail_12): /* 13 bytes */ 2192*533d3a49SEdward Gillett mov (%rsi), %rcx 2193*533d3a49SEdward Gillett mov %rcx, (%rdi) 2194*533d3a49SEdward Gillett mov 5(%rsi), %rcx 2195*533d3a49SEdward Gillett mov %rcx, 5(%rdi) 2196*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 2197*533d3a49SEdward Gillett mov $13, %cl 2198*533d3a49SEdward Gillett sub $13, %r8 2199*533d3a49SEdward Gillett jnz LABEL(strncpy_fill_tail) 2200*533d3a49SEdward Gillett#endif 2201*533d3a49SEdward Gillett ret 2202*533d3a49SEdward Gillett 2203*533d3a49SEdward Gillett .p2align 4 2204*533d3a49SEdward GillettLABEL(tail_13): /* 14 bytes */ 2205*533d3a49SEdward Gillett mov (%rsi), %rcx 2206*533d3a49SEdward Gillett mov 
%rcx, (%rdi) 2207*533d3a49SEdward Gillett mov 6(%rsi), %rcx 2208*533d3a49SEdward Gillett mov %rcx, 6(%rdi) 2209*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 2210*533d3a49SEdward Gillett mov $14, %cl 2211*533d3a49SEdward Gillett sub $14, %r8 2212*533d3a49SEdward Gillett jnz LABEL(strncpy_fill_tail) 2213*533d3a49SEdward Gillett#endif 2214*533d3a49SEdward Gillett ret 2215*533d3a49SEdward Gillett 2216*533d3a49SEdward Gillett .p2align 4 2217*533d3a49SEdward GillettLABEL(tail_14): /* 15 bytes */ 2218*533d3a49SEdward Gillett mov (%rsi), %rcx 2219*533d3a49SEdward Gillett mov %rcx, (%rdi) 2220*533d3a49SEdward Gillett mov 7(%rsi), %rcx 2221*533d3a49SEdward Gillett mov %rcx, 7(%rdi) 2222*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 2223*533d3a49SEdward Gillett mov $15, %cl 2224*533d3a49SEdward Gillett sub $15, %r8 2225*533d3a49SEdward Gillett jnz LABEL(strncpy_fill_tail) 2226*533d3a49SEdward Gillett#endif 2227*533d3a49SEdward Gillett ret 2228*533d3a49SEdward Gillett 2229*533d3a49SEdward Gillett .p2align 4 2230*533d3a49SEdward GillettLABEL(AMD_exit_more_16): 2231*533d3a49SEdward Gillett shr $16, %edx 2232*533d3a49SEdward Gillett test %dl, %dl 2233*533d3a49SEdward Gillett jz LABEL(AMD_exit_more_24) 2234*533d3a49SEdward Gillett test $0x01, %dl 2235*533d3a49SEdward Gillett jnz LABEL(tail_16) 2236*533d3a49SEdward Gillett test $0x02, %dl 2237*533d3a49SEdward Gillett jnz LABEL(tail_17) 2238*533d3a49SEdward Gillett test $0x04, %dl 2239*533d3a49SEdward Gillett jnz LABEL(tail_18) 2240*533d3a49SEdward Gillett test $0x08, %dl 2241*533d3a49SEdward Gillett jnz LABEL(tail_19) 2242*533d3a49SEdward Gillett test $0x10, %dl 2243*533d3a49SEdward Gillett jnz LABEL(tail_20) 2244*533d3a49SEdward Gillett test $0x20, %dl 2245*533d3a49SEdward Gillett jnz LABEL(tail_21) 2246*533d3a49SEdward Gillett test $0x40, %dl 2247*533d3a49SEdward Gillett jnz LABEL(tail_22) 2248*533d3a49SEdward Gillett 2249*533d3a49SEdward Gillett .p2align 4 2250*533d3a49SEdward GillettLABEL(tail_23): /* 24 bytes */ 
2251*533d3a49SEdward Gillett mov (%rsi), %rcx 2252*533d3a49SEdward Gillett mov %rcx, (%rdi) 2253*533d3a49SEdward Gillett mov 8(%rsi), %rdx 2254*533d3a49SEdward Gillett mov %rdx, 8(%rdi) 2255*533d3a49SEdward Gillett mov 16(%rsi), %rcx 2256*533d3a49SEdward Gillett mov %rcx, 16(%rdi) 2257*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 2258*533d3a49SEdward Gillett mov $24, %cl 2259*533d3a49SEdward Gillett sub $24, %r8 2260*533d3a49SEdward Gillett jnz LABEL(strncpy_fill_tail) 2261*533d3a49SEdward Gillett#endif 2262*533d3a49SEdward Gillett ret 2263*533d3a49SEdward Gillett 2264*533d3a49SEdward Gillett .p2align 4 2265*533d3a49SEdward GillettLABEL(tail_16): /* 17 bytes */ 2266*533d3a49SEdward Gillett mov (%rsi), %rcx 2267*533d3a49SEdward Gillett mov %rcx, (%rdi) 2268*533d3a49SEdward Gillett mov 8(%rsi), %rdx 2269*533d3a49SEdward Gillett mov %rdx, 8(%rdi) 2270*533d3a49SEdward Gillett mov 16(%rsi), %cl 2271*533d3a49SEdward Gillett mov %cl, 16(%rdi) 2272*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 2273*533d3a49SEdward Gillett mov $17, %cl 2274*533d3a49SEdward Gillett sub $17, %r8 2275*533d3a49SEdward Gillett jnz LABEL(strncpy_fill_tail) 2276*533d3a49SEdward Gillett#endif 2277*533d3a49SEdward Gillett ret 2278*533d3a49SEdward Gillett 2279*533d3a49SEdward Gillett .p2align 4 2280*533d3a49SEdward GillettLABEL(tail_17): /* 18 bytes */ 2281*533d3a49SEdward Gillett mov (%rsi), %rcx 2282*533d3a49SEdward Gillett mov %rcx, (%rdi) 2283*533d3a49SEdward Gillett mov 8(%rsi), %rdx 2284*533d3a49SEdward Gillett mov %rdx, 8(%rdi) 2285*533d3a49SEdward Gillett mov 16(%rsi), %cx 2286*533d3a49SEdward Gillett mov %cx, 16(%rdi) 2287*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 2288*533d3a49SEdward Gillett mov $18, %cl 2289*533d3a49SEdward Gillett sub $18, %r8 2290*533d3a49SEdward Gillett jnz LABEL(strncpy_fill_tail) 2291*533d3a49SEdward Gillett#endif 2292*533d3a49SEdward Gillett ret 2293*533d3a49SEdward Gillett 2294*533d3a49SEdward Gillett .p2align 4 2295*533d3a49SEdward GillettLABEL(tail_18): /* 19 
bytes */ 2296*533d3a49SEdward Gillett mov (%rsi), %rcx 2297*533d3a49SEdward Gillett mov %rcx, (%rdi) 2298*533d3a49SEdward Gillett mov 8(%rsi), %rdx 2299*533d3a49SEdward Gillett mov %rdx, 8(%rdi) 2300*533d3a49SEdward Gillett mov 15(%rsi), %ecx 2301*533d3a49SEdward Gillett mov %ecx,15(%rdi) 2302*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 2303*533d3a49SEdward Gillett mov $19, %cl 2304*533d3a49SEdward Gillett sub $19, %r8 2305*533d3a49SEdward Gillett jnz LABEL(strncpy_fill_tail) 2306*533d3a49SEdward Gillett#endif 2307*533d3a49SEdward Gillett ret 2308*533d3a49SEdward Gillett 2309*533d3a49SEdward Gillett .p2align 4 2310*533d3a49SEdward GillettLABEL(tail_19): /* 20 bytes */ 2311*533d3a49SEdward Gillett mov (%rsi), %rcx 2312*533d3a49SEdward Gillett mov %rcx, (%rdi) 2313*533d3a49SEdward Gillett mov 8(%rsi), %rdx 2314*533d3a49SEdward Gillett mov %rdx, 8(%rdi) 2315*533d3a49SEdward Gillett mov 16(%rsi), %ecx 2316*533d3a49SEdward Gillett mov %ecx, 16(%rdi) 2317*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 2318*533d3a49SEdward Gillett mov $20, %cl 2319*533d3a49SEdward Gillett sub $20, %r8 2320*533d3a49SEdward Gillett jnz LABEL(strncpy_fill_tail) 2321*533d3a49SEdward Gillett#endif 2322*533d3a49SEdward Gillett ret 2323*533d3a49SEdward Gillett 2324*533d3a49SEdward Gillett .p2align 4 2325*533d3a49SEdward GillettLABEL(tail_20): /* 21 bytes */ 2326*533d3a49SEdward Gillett mov (%rsi), %rcx 2327*533d3a49SEdward Gillett mov %rcx, (%rdi) 2328*533d3a49SEdward Gillett mov 8(%rsi), %rdx 2329*533d3a49SEdward Gillett mov %rdx, 8(%rdi) 2330*533d3a49SEdward Gillett mov 13(%rsi), %rcx 2331*533d3a49SEdward Gillett mov %rcx, 13(%rdi) 2332*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 2333*533d3a49SEdward Gillett mov $21, %cl 2334*533d3a49SEdward Gillett sub $21, %r8 2335*533d3a49SEdward Gillett jnz LABEL(strncpy_fill_tail) 2336*533d3a49SEdward Gillett#endif 2337*533d3a49SEdward Gillett ret 2338*533d3a49SEdward Gillett 2339*533d3a49SEdward Gillett .p2align 4 2340*533d3a49SEdward 
GillettLABEL(tail_21): /* 22 bytes */ 2341*533d3a49SEdward Gillett mov (%rsi), %rcx 2342*533d3a49SEdward Gillett mov %rcx, (%rdi) 2343*533d3a49SEdward Gillett mov 8(%rsi), %rdx 2344*533d3a49SEdward Gillett mov %rdx, 8(%rdi) 2345*533d3a49SEdward Gillett mov 14(%rsi), %rcx 2346*533d3a49SEdward Gillett mov %rcx, 14(%rdi) 2347*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 2348*533d3a49SEdward Gillett mov $22, %cl 2349*533d3a49SEdward Gillett sub $22, %r8 2350*533d3a49SEdward Gillett jnz LABEL(strncpy_fill_tail) 2351*533d3a49SEdward Gillett#endif 2352*533d3a49SEdward Gillett ret 2353*533d3a49SEdward Gillett 2354*533d3a49SEdward Gillett .p2align 4 2355*533d3a49SEdward GillettLABEL(tail_22): /* 23 bytes */ 2356*533d3a49SEdward Gillett mov (%rsi), %rcx 2357*533d3a49SEdward Gillett mov %rcx, (%rdi) 2358*533d3a49SEdward Gillett mov 8(%rsi), %rdx 2359*533d3a49SEdward Gillett mov %rdx, 8(%rdi) 2360*533d3a49SEdward Gillett mov 15(%rsi), %rcx 2361*533d3a49SEdward Gillett mov %rcx, 15(%rdi) 2362*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 2363*533d3a49SEdward Gillett mov $23, %cl 2364*533d3a49SEdward Gillett sub $23, %r8 2365*533d3a49SEdward Gillett jnz LABEL(strncpy_fill_tail) 2366*533d3a49SEdward Gillett#endif 2367*533d3a49SEdward Gillett ret 2368*533d3a49SEdward Gillett 2369*533d3a49SEdward Gillett .p2align 4 2370*533d3a49SEdward GillettLABEL(AMD_exit_more_24): 2371*533d3a49SEdward Gillett test $0x01, %dh 2372*533d3a49SEdward Gillett jnz LABEL(tail_24) 2373*533d3a49SEdward Gillett test $0x02, %dh 2374*533d3a49SEdward Gillett jnz LABEL(tail_25) 2375*533d3a49SEdward Gillett test $0x04, %dh 2376*533d3a49SEdward Gillett jnz LABEL(tail_26) 2377*533d3a49SEdward Gillett test $0x08, %dh 2378*533d3a49SEdward Gillett jnz LABEL(tail_27) 2379*533d3a49SEdward Gillett test $0x10, %dh 2380*533d3a49SEdward Gillett jnz LABEL(tail_28) 2381*533d3a49SEdward Gillett test $0x20, %dh 2382*533d3a49SEdward Gillett jnz LABEL(tail_29) 2383*533d3a49SEdward Gillett test $0x40, %dh 2384*533d3a49SEdward 
Gillett jnz LABEL(tail_30) 2385*533d3a49SEdward Gillett 2386*533d3a49SEdward Gillett .p2align 4 2387*533d3a49SEdward GillettLABEL(tail_31): /* 32 bytes */ 2388*533d3a49SEdward Gillett mov (%rsi), %rcx 2389*533d3a49SEdward Gillett mov %rcx, (%rdi) 2390*533d3a49SEdward Gillett mov 8(%rsi), %rdx 2391*533d3a49SEdward Gillett mov %rdx, 8(%rdi) 2392*533d3a49SEdward Gillett mov 16(%rsi), %rcx 2393*533d3a49SEdward Gillett mov %rcx, 16(%rdi) 2394*533d3a49SEdward Gillett mov 24(%rsi), %rdx 2395*533d3a49SEdward Gillett mov %rdx, 24(%rdi) 2396*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 2397*533d3a49SEdward Gillett mov $32, %cl 2398*533d3a49SEdward Gillett sub $32, %r8 2399*533d3a49SEdward Gillett jnz LABEL(strncpy_fill_tail) 2400*533d3a49SEdward Gillett#endif 2401*533d3a49SEdward Gillett ret 2402*533d3a49SEdward Gillett 2403*533d3a49SEdward Gillett .p2align 4 2404*533d3a49SEdward GillettLABEL(tail_24): /* 25 bytes */ 2405*533d3a49SEdward Gillett mov (%rsi), %rcx 2406*533d3a49SEdward Gillett mov %rcx, (%rdi) 2407*533d3a49SEdward Gillett mov 8(%rsi), %rdx 2408*533d3a49SEdward Gillett mov %rdx, 8(%rdi) 2409*533d3a49SEdward Gillett mov 16(%rsi), %rcx 2410*533d3a49SEdward Gillett mov %rcx, 16(%rdi) 2411*533d3a49SEdward Gillett mov 21(%rsi), %edx 2412*533d3a49SEdward Gillett mov %edx, 21(%rdi) 2413*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 2414*533d3a49SEdward Gillett mov $25, %cl 2415*533d3a49SEdward Gillett sub $25, %r8 2416*533d3a49SEdward Gillett jnz LABEL(strncpy_fill_tail) 2417*533d3a49SEdward Gillett#endif 2418*533d3a49SEdward Gillett ret 2419*533d3a49SEdward Gillett 2420*533d3a49SEdward Gillett .p2align 4 2421*533d3a49SEdward GillettLABEL(tail_25): /* 26 bytes */ 2422*533d3a49SEdward Gillett mov (%rsi), %rcx 2423*533d3a49SEdward Gillett mov %rcx, (%rdi) 2424*533d3a49SEdward Gillett mov 8(%rsi), %rdx 2425*533d3a49SEdward Gillett mov %rdx, 8(%rdi) 2426*533d3a49SEdward Gillett mov 16(%rsi), %rcx 2427*533d3a49SEdward Gillett mov %rcx, 16(%rdi) 2428*533d3a49SEdward Gillett 
mov 22(%rsi), %edx 2429*533d3a49SEdward Gillett mov %edx, 22(%rdi) 2430*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 2431*533d3a49SEdward Gillett mov $26, %cl 2432*533d3a49SEdward Gillett sub $26, %r8 2433*533d3a49SEdward Gillett jnz LABEL(strncpy_fill_tail) 2434*533d3a49SEdward Gillett#endif 2435*533d3a49SEdward Gillett ret 2436*533d3a49SEdward Gillett 2437*533d3a49SEdward Gillett .p2align 4 2438*533d3a49SEdward GillettLABEL(tail_26): /* 27 bytes */ 2439*533d3a49SEdward Gillett mov (%rsi), %rcx 2440*533d3a49SEdward Gillett mov %rcx, (%rdi) 2441*533d3a49SEdward Gillett mov 8(%rsi), %rdx 2442*533d3a49SEdward Gillett mov %rdx, 8(%rdi) 2443*533d3a49SEdward Gillett mov 16(%rsi), %rcx 2444*533d3a49SEdward Gillett mov %rcx, 16(%rdi) 2445*533d3a49SEdward Gillett mov 23(%rsi), %edx 2446*533d3a49SEdward Gillett mov %edx, 23(%rdi) 2447*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 2448*533d3a49SEdward Gillett mov $27, %cl 2449*533d3a49SEdward Gillett sub $27, %r8 2450*533d3a49SEdward Gillett jnz LABEL(strncpy_fill_tail) 2451*533d3a49SEdward Gillett#endif 2452*533d3a49SEdward Gillett ret 2453*533d3a49SEdward Gillett 2454*533d3a49SEdward Gillett .p2align 4 2455*533d3a49SEdward GillettLABEL(tail_27): /* 28 bytes */ 2456*533d3a49SEdward Gillett mov (%rsi), %rcx 2457*533d3a49SEdward Gillett mov %rcx, (%rdi) 2458*533d3a49SEdward Gillett mov 8(%rsi), %rdx 2459*533d3a49SEdward Gillett mov %rdx, 8(%rdi) 2460*533d3a49SEdward Gillett mov 16(%rsi), %rcx 2461*533d3a49SEdward Gillett mov %rcx, 16(%rdi) 2462*533d3a49SEdward Gillett mov 24(%rsi), %edx 2463*533d3a49SEdward Gillett mov %edx, 24(%rdi) 2464*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 2465*533d3a49SEdward Gillett mov $28, %cl 2466*533d3a49SEdward Gillett sub $28, %r8 2467*533d3a49SEdward Gillett jnz LABEL(strncpy_fill_tail) 2468*533d3a49SEdward Gillett#endif 2469*533d3a49SEdward Gillett ret 2470*533d3a49SEdward Gillett 2471*533d3a49SEdward Gillett .p2align 4 2472*533d3a49SEdward GillettLABEL(tail_28): /* 29 bytes */ 
2473*533d3a49SEdward Gillett mov (%rsi), %rcx 2474*533d3a49SEdward Gillett mov %rcx, (%rdi) 2475*533d3a49SEdward Gillett mov 8(%rsi), %rdx 2476*533d3a49SEdward Gillett mov %rdx, 8(%rdi) 2477*533d3a49SEdward Gillett mov 16(%rsi), %rcx 2478*533d3a49SEdward Gillett mov %rcx, 16(%rdi) 2479*533d3a49SEdward Gillett mov 21(%rsi), %rdx 2480*533d3a49SEdward Gillett mov %rdx, 21(%rdi) 2481*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 2482*533d3a49SEdward Gillett mov $29, %cl 2483*533d3a49SEdward Gillett sub $29, %r8 2484*533d3a49SEdward Gillett jnz LABEL(strncpy_fill_tail) 2485*533d3a49SEdward Gillett#endif 2486*533d3a49SEdward Gillett ret 2487*533d3a49SEdward Gillett 2488*533d3a49SEdward Gillett .p2align 4 2489*533d3a49SEdward GillettLABEL(tail_29): /* 30 bytes */ 2490*533d3a49SEdward Gillett mov (%rsi), %rcx 2491*533d3a49SEdward Gillett mov %rcx, (%rdi) 2492*533d3a49SEdward Gillett mov 8(%rsi), %rdx 2493*533d3a49SEdward Gillett mov %rdx, 8(%rdi) 2494*533d3a49SEdward Gillett mov 16(%rsi), %rcx 2495*533d3a49SEdward Gillett mov %rcx, 16(%rdi) 2496*533d3a49SEdward Gillett mov 22(%rsi), %rdx 2497*533d3a49SEdward Gillett mov %rdx, 22(%rdi) 2498*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 2499*533d3a49SEdward Gillett mov $30, %cl 2500*533d3a49SEdward Gillett sub $30, %r8 2501*533d3a49SEdward Gillett jnz LABEL(strncpy_fill_tail) 2502*533d3a49SEdward Gillett#endif 2503*533d3a49SEdward Gillett ret 2504*533d3a49SEdward Gillett 2505*533d3a49SEdward Gillett .p2align 4 2506*533d3a49SEdward GillettLABEL(tail_30): /* 31 bytes */ 2507*533d3a49SEdward Gillett mov (%rsi), %rcx 2508*533d3a49SEdward Gillett mov %rcx, (%rdi) 2509*533d3a49SEdward Gillett mov 8(%rsi), %rdx 2510*533d3a49SEdward Gillett mov %rdx, 8(%rdi) 2511*533d3a49SEdward Gillett mov 16(%rsi), %rcx 2512*533d3a49SEdward Gillett mov %rcx, 16(%rdi) 2513*533d3a49SEdward Gillett mov 23(%rsi), %rdx 2514*533d3a49SEdward Gillett mov %rdx, 23(%rdi) 2515*533d3a49SEdward Gillett#ifdef USE_AS_STRNCPY 2516*533d3a49SEdward Gillett mov $31, 
%cl 2517*533d3a49SEdward Gillett sub $31, %r8 2518*533d3a49SEdward Gillett jnz LABEL(strncpy_fill_tail) 2519*533d3a49SEdward Gillett#endif 2520*533d3a49SEdward Gillett ret 2521*533d3a49SEdward Gillett 2522*533d3a49SEdward Gillett .pushsection .rodata 2523*533d3a49SEdward Gillett .p2align 4 2524*533d3a49SEdward GillettLABEL(tail_table): 2525*533d3a49SEdward Gillett .int LABEL(tail_0) - LABEL(tail_table) /* 1 byte */ 2526*533d3a49SEdward Gillett .int LABEL(tail_1) - LABEL(tail_table) 2527*533d3a49SEdward Gillett .int LABEL(tail_2) - LABEL(tail_table) 2528*533d3a49SEdward Gillett .int LABEL(tail_3) - LABEL(tail_table) 2529*533d3a49SEdward Gillett .int LABEL(tail_4) - LABEL(tail_table) 2530*533d3a49SEdward Gillett .int LABEL(tail_5) - LABEL(tail_table) 2531*533d3a49SEdward Gillett .int LABEL(tail_6) - LABEL(tail_table) 2532*533d3a49SEdward Gillett .int LABEL(tail_7) - LABEL(tail_table) 2533*533d3a49SEdward Gillett .int LABEL(tail_8) - LABEL(tail_table) 2534*533d3a49SEdward Gillett .int LABEL(tail_9) - LABEL(tail_table) 2535*533d3a49SEdward Gillett .int LABEL(tail_10) - LABEL(tail_table) 2536*533d3a49SEdward Gillett .int LABEL(tail_11) - LABEL(tail_table) 2537*533d3a49SEdward Gillett .int LABEL(tail_12) - LABEL(tail_table) 2538*533d3a49SEdward Gillett .int LABEL(tail_13) - LABEL(tail_table) 2539*533d3a49SEdward Gillett .int LABEL(tail_14) - LABEL(tail_table) 2540*533d3a49SEdward Gillett .int LABEL(tail_15) - LABEL(tail_table) 2541*533d3a49SEdward Gillett .int LABEL(tail_16) - LABEL(tail_table) 2542*533d3a49SEdward Gillett .int LABEL(tail_17) - LABEL(tail_table) 2543*533d3a49SEdward Gillett .int LABEL(tail_18) - LABEL(tail_table) 2544*533d3a49SEdward Gillett .int LABEL(tail_19) - LABEL(tail_table) 2545*533d3a49SEdward Gillett .int LABEL(tail_20) - LABEL(tail_table) 2546*533d3a49SEdward Gillett .int LABEL(tail_21) - LABEL(tail_table) 2547*533d3a49SEdward Gillett .int LABEL(tail_22) - LABEL(tail_table) 2548*533d3a49SEdward Gillett .int LABEL(tail_23) - LABEL(tail_table) 
2549*533d3a49SEdward Gillett .int LABEL(tail_24) - LABEL(tail_table) 2550*533d3a49SEdward Gillett .int LABEL(tail_25) - LABEL(tail_table) 2551*533d3a49SEdward Gillett .int LABEL(tail_26) - LABEL(tail_table) 2552*533d3a49SEdward Gillett .int LABEL(tail_27) - LABEL(tail_table) 2553*533d3a49SEdward Gillett .int LABEL(tail_28) - LABEL(tail_table) 2554*533d3a49SEdward Gillett .int LABEL(tail_29) - LABEL(tail_table) 2555*533d3a49SEdward Gillett .int LABEL(tail_30) - LABEL(tail_table) 2556*533d3a49SEdward Gillett .int LABEL(tail_31) - LABEL(tail_table) /* 32 bytes */ 2557*533d3a49SEdward Gillett 2558*533d3a49SEdward Gillett .p2align 4 2559*533d3a49SEdward GillettLABEL(unaligned_table): 2560*533d3a49SEdward Gillett .int LABEL(ashr_0) - LABEL(unaligned_table) 2561*533d3a49SEdward Gillett .int LABEL(ashr_1) - LABEL(unaligned_table) 2562*533d3a49SEdward Gillett .int LABEL(ashr_2) - LABEL(unaligned_table) 2563*533d3a49SEdward Gillett .int LABEL(ashr_3) - LABEL(unaligned_table) 2564*533d3a49SEdward Gillett .int LABEL(ashr_4) - LABEL(unaligned_table) 2565*533d3a49SEdward Gillett .int LABEL(ashr_5) - LABEL(unaligned_table) 2566*533d3a49SEdward Gillett .int LABEL(ashr_6) - LABEL(unaligned_table) 2567*533d3a49SEdward Gillett .int LABEL(ashr_7) - LABEL(unaligned_table) 2568*533d3a49SEdward Gillett .int LABEL(ashr_8) - LABEL(unaligned_table) 2569*533d3a49SEdward Gillett .int LABEL(ashr_9) - LABEL(unaligned_table) 2570*533d3a49SEdward Gillett .int LABEL(ashr_10) - LABEL(unaligned_table) 2571*533d3a49SEdward Gillett .int LABEL(ashr_11) - LABEL(unaligned_table) 2572*533d3a49SEdward Gillett .int LABEL(ashr_12) - LABEL(unaligned_table) 2573*533d3a49SEdward Gillett .int LABEL(ashr_13) - LABEL(unaligned_table) 2574*533d3a49SEdward Gillett .int LABEL(ashr_14) - LABEL(unaligned_table) 2575*533d3a49SEdward Gillett .int LABEL(ashr_15) - LABEL(unaligned_table) 2576*533d3a49SEdward Gillett .popsection 2577*533d3a49SEdward Gillett 25787c478bd9Sstevel@tonic-gate#ifdef USE_AS_STRNCPY 
25797c478bd9Sstevel@tonic-gate SET_SIZE(strncpy) 25807c478bd9Sstevel@tonic-gate#else 25817c478bd9Sstevel@tonic-gate SET_SIZE(strcpy) /* (char *, const char *) */ 25827c478bd9Sstevel@tonic-gate#endif 2583