17c478bd9Sstevel@tonic-gate/* 2*533d3a49SEdward Gillett * CDDL HEADER START 3*533d3a49SEdward Gillett * 4*533d3a49SEdward Gillett * The contents of this file are subject to the terms of the 5*533d3a49SEdward Gillett * Common Development and Distribution License (the "License"). 6*533d3a49SEdward Gillett * You may not use this file except in compliance with the License. 7*533d3a49SEdward Gillett * 8*533d3a49SEdward Gillett * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9*533d3a49SEdward Gillett * or http://www.opensolaris.org/os/licensing. 10*533d3a49SEdward Gillett * See the License for the specific language governing permissions 11*533d3a49SEdward Gillett * and limitations under the License. 12*533d3a49SEdward Gillett * 13*533d3a49SEdward Gillett * When distributing Covered Code, include this CDDL HEADER in each 14*533d3a49SEdward Gillett * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15*533d3a49SEdward Gillett * If applicable, add the following below this CDDL HEADER, with the 16*533d3a49SEdward Gillett * fields enclosed by brackets "[]" replaced with your own identifying 17*533d3a49SEdward Gillett * information: Portions Copyright [yyyy] [name of copyright owner] 18*533d3a49SEdward Gillett * 19*533d3a49SEdward Gillett * CDDL HEADER END 207c478bd9Sstevel@tonic-gate */ 217c478bd9Sstevel@tonic-gate 227c478bd9Sstevel@tonic-gate/* 23*533d3a49SEdward Gillett * Copyright (c) 2009, Intel Corporation 247c478bd9Sstevel@tonic-gate * All rights reserved. 
257c478bd9Sstevel@tonic-gate */ 267c478bd9Sstevel@tonic-gate 27*533d3a49SEdward Gillett/* 28*533d3a49SEdward Gillett * str[n]cmp - compare chars between two string 29*533d3a49SEdward Gillett */ 307c478bd9Sstevel@tonic-gate 317c478bd9Sstevel@tonic-gate#include "SYS.h" 32*533d3a49SEdward Gillett#include "proc64_id.h" 337c478bd9Sstevel@tonic-gate 347c478bd9Sstevel@tonic-gate#define LABEL(s) .strcmp/**/s 357c478bd9Sstevel@tonic-gate 367c478bd9Sstevel@tonic-gate#ifdef USE_AS_STRNCMP 37*533d3a49SEdward Gillett /* 38*533d3a49SEdward Gillett * Since the counter, %r11, is unsigned, we branch to strcmp_exitz 39*533d3a49SEdward Gillett * if the new counter > the old one or is 0. 40*533d3a49SEdward Gillett */ 41*533d3a49SEdward Gillett#define UPDATE_STRNCMP_COUNTER \ 42*533d3a49SEdward Gillett /* calculate left number to compare */ \ 43*533d3a49SEdward Gillett lea -16(%rcx, %r11), %r9; \ 44*533d3a49SEdward Gillett cmp %r9, %r11; \ 45*533d3a49SEdward Gillett jb LABEL(strcmp_exitz); \ 46*533d3a49SEdward Gillett test %r9, %r9; \ 47*533d3a49SEdward Gillett je LABEL(strcmp_exitz); \ 48*533d3a49SEdward Gillett mov %r9, %r11 49*533d3a49SEdward Gillett#else 50*533d3a49SEdward Gillett#define UPDATE_STRNCMP_COUNTER 51*533d3a49SEdward Gillett#endif 52*533d3a49SEdward Gillett 53*533d3a49SEdward Gillett /* 54*533d3a49SEdward Gillett * This implementation uses SSE to compare up to 16 bytes at a time. 
55*533d3a49SEdward Gillett */ 56*533d3a49SEdward Gillett#ifdef USE_AS_STRNCMP 577c478bd9Sstevel@tonic-gate ENTRY(strncmp) 58*533d3a49SEdward Gillett test %rdx, %rdx 59*533d3a49SEdward Gillett je LABEL(strcmp_exitz) 60*533d3a49SEdward Gillett mov %rdx, %r11 617c478bd9Sstevel@tonic-gate#else 627c478bd9Sstevel@tonic-gate ENTRY(strcmp) /* (const char *, const char *) */ 637c478bd9Sstevel@tonic-gate#endif 64*533d3a49SEdward Gillett mov %esi, %ecx 65*533d3a49SEdward Gillett mov %edi, %eax 66*533d3a49SEdward Gillett and $0x3f, %rcx /* rsi alignment in cache line */ 67*533d3a49SEdward Gillett and $0x3f, %rax /* rdi alignment in cache line */ 68*533d3a49SEdward Gillett cmp $0x30, %ecx 69*533d3a49SEdward Gillett ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ 70*533d3a49SEdward Gillett cmp $0x30, %eax 71*533d3a49SEdward Gillett ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */ 72*533d3a49SEdward Gillett movlpd (%rdi), %xmm1 73*533d3a49SEdward Gillett movlpd (%rsi), %xmm2 74*533d3a49SEdward Gillett movhpd 8(%rdi), %xmm1 75*533d3a49SEdward Gillett movhpd 8(%rsi), %xmm2 76*533d3a49SEdward Gillett pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ 77*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ 78*533d3a49SEdward Gillett pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ 79*533d3a49SEdward Gillett psubb %xmm0, %xmm1 /* packed sub of comparison results*/ 80*533d3a49SEdward Gillett pmovmskb %xmm1, %edx 81*533d3a49SEdward Gillett sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ 82*533d3a49SEdward Gillett jnz LABEL(less16bytes) /* If not, found mismatch or null char */ 837c478bd9Sstevel@tonic-gate#ifdef USE_AS_STRNCMP 84*533d3a49SEdward Gillett sub $16, %r11 85*533d3a49SEdward Gillett jbe LABEL(strcmp_exitz) /* finish comparision */ 867c478bd9Sstevel@tonic-gate#endif 87*533d3a49SEdward Gillett add $16, %rsi /* prepare to search next 16 bytes */ 88*533d3a49SEdward Gillett add $16, %rdi /* prepare to search next 16 bytes */ 897c478bd9Sstevel@tonic-gate 904fdb7a01SNobutomo Nakano /* 91*533d3a49SEdward Gillett * Determine rdi and rsi string offsets from 16-byte alignment. 92*533d3a49SEdward Gillett * Use relative offset difference between the two to determine which case 93*533d3a49SEdward Gillett * below to use. 
944fdb7a01SNobutomo Nakano */ 95*533d3a49SEdward Gillett .p2align 4 96*533d3a49SEdward GillettLABEL(crosscache): 97*533d3a49SEdward Gillett and $0xfffffffffffffff0, %rsi /* force %rsi to be 16 byte aligned */ 98*533d3a49SEdward Gillett and $0xfffffffffffffff0, %rdi /* force %rdi to be 16 byte aligned */ 99*533d3a49SEdward Gillett mov $0xffff, %edx /* for equivalent offset */ 100*533d3a49SEdward Gillett xor %r8d, %r8d 101*533d3a49SEdward Gillett and $0xf, %ecx /* offset of rsi */ 102*533d3a49SEdward Gillett and $0xf, %eax /* offset of rdi */ 103*533d3a49SEdward Gillett cmp %eax, %ecx 104*533d3a49SEdward Gillett je LABEL(ashr_0) /* both strings have the same alignment */ 105*533d3a49SEdward Gillett ja LABEL(bigger) 106*533d3a49SEdward Gillett mov %edx, %r8d /* r8d is offset flag for exit tail */ 107*533d3a49SEdward Gillett xchg %ecx, %eax 108*533d3a49SEdward Gillett xchg %rsi, %rdi 109*533d3a49SEdward GillettLABEL(bigger): 110*533d3a49SEdward Gillett mov %rcx, %r9 111*533d3a49SEdward Gillett sub %rax, %r9 112*533d3a49SEdward Gillett lea LABEL(unaligned_table)(%rip), %r10 113*533d3a49SEdward Gillett movslq (%r10, %r9, 4), %r9 114*533d3a49SEdward Gillett lea (%r10, %r9), %r10 115*533d3a49SEdward Gillett jmp *%r10 /* jump to corresponding case */ 1167c478bd9Sstevel@tonic-gate 117*533d3a49SEdward Gillett/* 118*533d3a49SEdward Gillett * ashr_0 handles the following cases: 119*533d3a49SEdward Gillett * str1 offset = str2 offset 120*533d3a49SEdward Gillett */ 121*533d3a49SEdward Gillett .p2align 4 122*533d3a49SEdward GillettLABEL(ashr_0): 123*533d3a49SEdward Gillett movdqa (%rsi), %xmm1 124*533d3a49SEdward Gillett pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ 125*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ 126*533d3a49SEdward Gillett pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ 127*533d3a49SEdward Gillett psubb %xmm0, %xmm1 /* packed sub of comparison results*/ 128*533d3a49SEdward Gillett pmovmskb %xmm1, %r9d 129*533d3a49SEdward Gillett shr %cl, %edx /* adjust 0xffff for offset */ 130*533d3a49SEdward Gillett shr %cl, %r9d /* adjust for 16-byte offset */ 131*533d3a49SEdward Gillett sub %r9d, %edx 132*533d3a49SEdward Gillett /* 133*533d3a49SEdward Gillett * edx must be the same with r9d if in left byte (16-rcx) is equal to 134*533d3a49SEdward Gillett * the start from (16-rax) and no null char was seen. 135*533d3a49SEdward Gillett */ 136*533d3a49SEdward Gillett jne LABEL(less32bytes) /* mismatch or null char */ 137*533d3a49SEdward Gillett UPDATE_STRNCMP_COUNTER 138*533d3a49SEdward Gillett mov $16, %rcx 139*533d3a49SEdward Gillett mov $16, %r9 140*533d3a49SEdward Gillett pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */ 141*533d3a49SEdward Gillett 142*533d3a49SEdward Gillett /* 143*533d3a49SEdward Gillett * Now both strings are aligned at 16-byte boundary. Loop over strings 144*533d3a49SEdward Gillett * checking 32-bytes per iteration. 
145*533d3a49SEdward Gillett */ 146*533d3a49SEdward Gillett .p2align 4 147*533d3a49SEdward GillettLABEL(loop_ashr_0): 148*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm1 149*533d3a49SEdward Gillett movdqa (%rdi, %rcx), %xmm2 150*533d3a49SEdward Gillett 151*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm0 152*533d3a49SEdward Gillett pcmpeqb %xmm2, %xmm1 153*533d3a49SEdward Gillett psubb %xmm0, %xmm1 154*533d3a49SEdward Gillett pmovmskb %xmm1, %edx 155*533d3a49SEdward Gillett sub $0xffff, %edx 156*533d3a49SEdward Gillett jnz LABEL(exit) /* mismatch or null char seen */ 157*533d3a49SEdward Gillett 158*533d3a49SEdward Gillett#ifdef USE_AS_STRNCMP 159*533d3a49SEdward Gillett sub $16, %r11 160*533d3a49SEdward Gillett jbe LABEL(strcmp_exitz) 161*533d3a49SEdward Gillett#endif 162*533d3a49SEdward Gillett add $16, %rcx 163*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm1 164*533d3a49SEdward Gillett movdqa (%rdi, %rcx), %xmm2 165*533d3a49SEdward Gillett 166*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm0 167*533d3a49SEdward Gillett pcmpeqb %xmm2, %xmm1 168*533d3a49SEdward Gillett psubb %xmm0, %xmm1 169*533d3a49SEdward Gillett pmovmskb %xmm1, %edx 170*533d3a49SEdward Gillett sub $0xffff, %edx 171*533d3a49SEdward Gillett jnz LABEL(exit) 172*533d3a49SEdward Gillett#ifdef USE_AS_STRNCMP 173*533d3a49SEdward Gillett sub $16, %r11 174*533d3a49SEdward Gillett jbe LABEL(strcmp_exitz) 175*533d3a49SEdward Gillett#endif 176*533d3a49SEdward Gillett add $16, %rcx 177*533d3a49SEdward Gillett jmp LABEL(loop_ashr_0) 178*533d3a49SEdward Gillett 179*533d3a49SEdward Gillett/* 180*533d3a49SEdward Gillett * ashr_1 handles the following cases: 181*533d3a49SEdward Gillett * abs(str1 offset - str2 offset) = 15 182*533d3a49SEdward Gillett */ 183*533d3a49SEdward Gillett .p2align 4 184*533d3a49SEdward GillettLABEL(ashr_1): 185*533d3a49SEdward Gillett pxor %xmm0, %xmm0 186*533d3a49SEdward Gillett movdqa (%rdi), %xmm2 187*533d3a49SEdward Gillett movdqa (%rsi), %xmm1 188*533d3a49SEdward Gillett pcmpeqb %xmm1, 
%xmm0 /* Any null chars? */ 189*533d3a49SEdward Gillett pslldq $15, %xmm2 /* shift first string to align with second */ 190*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ 191*533d3a49SEdward Gillett psubb %xmm0, %xmm2 /* packed sub of comparison results*/ 192*533d3a49SEdward Gillett pmovmskb %xmm2, %r9d 193*533d3a49SEdward Gillett shr %cl, %edx /* adjust 0xffff for offset */ 194*533d3a49SEdward Gillett shr %cl, %r9d /* adjust for 16-byte offset */ 195*533d3a49SEdward Gillett sub %r9d, %edx 196*533d3a49SEdward Gillett jnz LABEL(less32bytes) /* mismatch or null char seen */ 197*533d3a49SEdward Gillett movdqa (%rdi), %xmm3 198*533d3a49SEdward Gillett UPDATE_STRNCMP_COUNTER 199*533d3a49SEdward Gillett 200*533d3a49SEdward Gillett pxor %xmm0, %xmm0 201*533d3a49SEdward Gillett mov $16, %rcx /* index for loads */ 202*533d3a49SEdward Gillett mov $1, %r9d /* rdi bytes already examined. Used in exit code */ 203*533d3a49SEdward Gillett /* 204*533d3a49SEdward Gillett * Setup %r10 value allows us to detect crossing a page boundary. 205*533d3a49SEdward Gillett * When %r10 goes positive we are crossing a page boundary and 206*533d3a49SEdward Gillett * need to do a nibble. 
207*533d3a49SEdward Gillett */ 208*533d3a49SEdward Gillett lea 1(%rdi), %r10 209*533d3a49SEdward Gillett and $0xfff, %r10 /* offset into 4K page */ 210*533d3a49SEdward Gillett sub $0x1000, %r10 /* subtract 4K pagesize */ 211*533d3a49SEdward Gillett movdqa %xmm3, %xmm4 2127c478bd9Sstevel@tonic-gate 2137c478bd9Sstevel@tonic-gate .p2align 4 214*533d3a49SEdward GillettLABEL(loop_ashr_1): 215*533d3a49SEdward Gillett add $16, %r10 216*533d3a49SEdward Gillett jg LABEL(nibble_ashr_1) /* cross page boundary */ 2177c478bd9Sstevel@tonic-gate 218*533d3a49SEdward GillettLABEL(gobble_ashr_1): 219*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm1 220*533d3a49SEdward Gillett movdqa (%rdi, %rcx), %xmm2 221*533d3a49SEdward Gillett movdqa %xmm2, %xmm4 /* store for next cycle */ 2227c478bd9Sstevel@tonic-gate 223*533d3a49SEdward Gillett psrldq $1, %xmm3 224*533d3a49SEdward Gillett pslldq $15, %xmm2 225*533d3a49SEdward Gillett por %xmm3, %xmm2 /* merge into one 16byte value */ 226*533d3a49SEdward Gillett 227*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm0 228*533d3a49SEdward Gillett pcmpeqb %xmm2, %xmm1 229*533d3a49SEdward Gillett psubb %xmm0, %xmm1 230*533d3a49SEdward Gillett pmovmskb %xmm1, %edx 231*533d3a49SEdward Gillett sub $0xffff, %edx 232*533d3a49SEdward Gillett jnz LABEL(exit) 233*533d3a49SEdward Gillett 234*533d3a49SEdward Gillett#ifdef USE_AS_STRNCMP 235*533d3a49SEdward Gillett sub $16, %r11 236*533d3a49SEdward Gillett jbe LABEL(strcmp_exitz) 237*533d3a49SEdward Gillett#endif 238*533d3a49SEdward Gillett add $16, %rcx 239*533d3a49SEdward Gillett movdqa %xmm4, %xmm3 240*533d3a49SEdward Gillett 241*533d3a49SEdward Gillett add $16, %r10 242*533d3a49SEdward Gillett jg LABEL(nibble_ashr_1) /* cross page boundary */ 243*533d3a49SEdward Gillett 244*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm1 245*533d3a49SEdward Gillett movdqa (%rdi, %rcx), %xmm2 246*533d3a49SEdward Gillett movdqa %xmm2, %xmm4 /* store for next cycle */ 247*533d3a49SEdward Gillett 248*533d3a49SEdward Gillett 
psrldq $1, %xmm3 249*533d3a49SEdward Gillett pslldq $15, %xmm2 250*533d3a49SEdward Gillett por %xmm3, %xmm2 /* merge into one 16byte value */ 251*533d3a49SEdward Gillett 252*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm0 253*533d3a49SEdward Gillett pcmpeqb %xmm2, %xmm1 254*533d3a49SEdward Gillett psubb %xmm0, %xmm1 255*533d3a49SEdward Gillett pmovmskb %xmm1, %edx 256*533d3a49SEdward Gillett sub $0xffff, %edx 257*533d3a49SEdward Gillett jnz LABEL(exit) 258*533d3a49SEdward Gillett 259*533d3a49SEdward Gillett#ifdef USE_AS_STRNCMP 260*533d3a49SEdward Gillett sub $16, %r11 261*533d3a49SEdward Gillett jbe LABEL(strcmp_exitz) 262*533d3a49SEdward Gillett#endif 263*533d3a49SEdward Gillett add $16, %rcx 264*533d3a49SEdward Gillett movdqa %xmm4, %xmm3 265*533d3a49SEdward Gillett jmp LABEL(loop_ashr_1) 266*533d3a49SEdward Gillett 267*533d3a49SEdward Gillett /* 268*533d3a49SEdward Gillett * Nibble avoids loads across page boundary. This is to avoid a potential 269*533d3a49SEdward Gillett * access into unmapped memory. 
270*533d3a49SEdward Gillett */ 271*533d3a49SEdward Gillett .p2align 4 272*533d3a49SEdward GillettLABEL(nibble_ashr_1): 273*533d3a49SEdward Gillett psrldq $1, %xmm4 274*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm1 275*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm0 276*533d3a49SEdward Gillett pcmpeqb %xmm4, %xmm1 277*533d3a49SEdward Gillett psubb %xmm0, %xmm1 278*533d3a49SEdward Gillett pmovmskb %xmm1, %edx 279*533d3a49SEdward Gillett sub $0x7fff, %edx 280*533d3a49SEdward Gillett jnz LABEL(exit) 281*533d3a49SEdward Gillett#ifdef USE_AS_STRNCMP 282*533d3a49SEdward Gillett cmp $15, %r11 283*533d3a49SEdward Gillett jbe LABEL(strcmp_exitz) 284*533d3a49SEdward Gillett#endif 285*533d3a49SEdward Gillett pxor %xmm0, %xmm0 286*533d3a49SEdward Gillett sub $0x1000, %r10 /* subtract 4K from %r10 */ 287*533d3a49SEdward Gillett jmp LABEL(gobble_ashr_1) 288*533d3a49SEdward Gillett 289*533d3a49SEdward Gillett/* 290*533d3a49SEdward Gillett * ashr_2 handles the following cases: 291*533d3a49SEdward Gillett * abs(str1 offset - str2 offset) = 14 292*533d3a49SEdward Gillett */ 293*533d3a49SEdward Gillett .p2align 4 294*533d3a49SEdward GillettLABEL(ashr_2): 295*533d3a49SEdward Gillett pxor %xmm0, %xmm0 296*533d3a49SEdward Gillett movdqa (%rdi), %xmm2 297*533d3a49SEdward Gillett movdqa (%rsi), %xmm1 298*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm0 299*533d3a49SEdward Gillett pslldq $14, %xmm2 300*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm2 301*533d3a49SEdward Gillett psubb %xmm0, %xmm2 302*533d3a49SEdward Gillett pmovmskb %xmm2, %r9d 303*533d3a49SEdward Gillett shr %cl, %edx 304*533d3a49SEdward Gillett shr %cl, %r9d 305*533d3a49SEdward Gillett sub %r9d, %edx 306*533d3a49SEdward Gillett jnz LABEL(less32bytes) 307*533d3a49SEdward Gillett movdqa (%rdi), %xmm3 308*533d3a49SEdward Gillett UPDATE_STRNCMP_COUNTER 309*533d3a49SEdward Gillett 310*533d3a49SEdward Gillett pxor %xmm0, %xmm0 311*533d3a49SEdward Gillett mov $16, %rcx /* index for loads */ 312*533d3a49SEdward Gillett mov $2, %r9d /* 
rdi bytes already examined. Used in exit code */ 313*533d3a49SEdward Gillett /* 314*533d3a49SEdward Gillett * Setup %r10 value allows us to detect crossing a page boundary. 315*533d3a49SEdward Gillett * When %r10 goes positive we are crossing a page boundary and 316*533d3a49SEdward Gillett * need to do a nibble. 317*533d3a49SEdward Gillett */ 318*533d3a49SEdward Gillett lea 2(%rdi), %r10 319*533d3a49SEdward Gillett and $0xfff, %r10 /* offset into 4K page */ 320*533d3a49SEdward Gillett sub $0x1000, %r10 /* subtract 4K pagesize */ 321*533d3a49SEdward Gillett movdqa %xmm3, %xmm4 3227c478bd9Sstevel@tonic-gate 3237c478bd9Sstevel@tonic-gate .p2align 4 324*533d3a49SEdward GillettLABEL(loop_ashr_2): 325*533d3a49SEdward Gillett add $16, %r10 326*533d3a49SEdward Gillett jg LABEL(nibble_ashr_2) 3277c478bd9Sstevel@tonic-gate 328*533d3a49SEdward GillettLABEL(gobble_ashr_2): 329*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm1 330*533d3a49SEdward Gillett movdqa (%rdi, %rcx), %xmm2 331*533d3a49SEdward Gillett movdqa %xmm2, %xmm4 332*533d3a49SEdward Gillett 333*533d3a49SEdward Gillett psrldq $2, %xmm3 334*533d3a49SEdward Gillett pslldq $14, %xmm2 335*533d3a49SEdward Gillett por %xmm3, %xmm2 336*533d3a49SEdward Gillett 337*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm0 338*533d3a49SEdward Gillett pcmpeqb %xmm2, %xmm1 339*533d3a49SEdward Gillett psubb %xmm0, %xmm1 340*533d3a49SEdward Gillett pmovmskb %xmm1, %edx 341*533d3a49SEdward Gillett sub $0xffff, %edx 342*533d3a49SEdward Gillett jnz LABEL(exit) 3437c478bd9Sstevel@tonic-gate 3447c478bd9Sstevel@tonic-gate#ifdef USE_AS_STRNCMP 345*533d3a49SEdward Gillett sub $16, %r11 346*533d3a49SEdward Gillett jbe LABEL(strcmp_exitz) 3477c478bd9Sstevel@tonic-gate#endif 3487c478bd9Sstevel@tonic-gate 349*533d3a49SEdward Gillett add $16, %rcx 350*533d3a49SEdward Gillett movdqa %xmm4, %xmm3 3517c478bd9Sstevel@tonic-gate 352*533d3a49SEdward Gillett add $16, %r10 353*533d3a49SEdward Gillett jg LABEL(nibble_ashr_2) /* cross page boundary */ 
3547c478bd9Sstevel@tonic-gate 355*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm1 356*533d3a49SEdward Gillett movdqa (%rdi, %rcx), %xmm2 357*533d3a49SEdward Gillett movdqa %xmm2, %xmm4 358*533d3a49SEdward Gillett 359*533d3a49SEdward Gillett psrldq $2, %xmm3 360*533d3a49SEdward Gillett pslldq $14, %xmm2 361*533d3a49SEdward Gillett por %xmm3, %xmm2 362*533d3a49SEdward Gillett 363*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm0 364*533d3a49SEdward Gillett pcmpeqb %xmm2, %xmm1 365*533d3a49SEdward Gillett psubb %xmm0, %xmm1 366*533d3a49SEdward Gillett pmovmskb %xmm1, %edx 367*533d3a49SEdward Gillett sub $0xffff, %edx 368*533d3a49SEdward Gillett jnz LABEL(exit) 369*533d3a49SEdward Gillett 370*533d3a49SEdward Gillett#ifdef USE_AS_STRNCMP 371*533d3a49SEdward Gillett sub $16, %r11 372*533d3a49SEdward Gillett jbe LABEL(strcmp_exitz) 373*533d3a49SEdward Gillett#endif 374*533d3a49SEdward Gillett 375*533d3a49SEdward Gillett add $16, %rcx 376*533d3a49SEdward Gillett movdqa %xmm4, %xmm3 377*533d3a49SEdward Gillett jmp LABEL(loop_ashr_2) 3787c478bd9Sstevel@tonic-gate 3797c478bd9Sstevel@tonic-gate .p2align 4 380*533d3a49SEdward GillettLABEL(nibble_ashr_2): 381*533d3a49SEdward Gillett psrldq $2, %xmm4 382*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm1 383*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm0 384*533d3a49SEdward Gillett pcmpeqb %xmm4, %xmm1 385*533d3a49SEdward Gillett psubb %xmm0, %xmm1 386*533d3a49SEdward Gillett pmovmskb %xmm1, %edx 387*533d3a49SEdward Gillett sub $0x3fff, %edx 388*533d3a49SEdward Gillett jnz LABEL(exit) 389*533d3a49SEdward Gillett#ifdef USE_AS_STRNCMP 390*533d3a49SEdward Gillett cmp $14, %r11 391*533d3a49SEdward Gillett jbe LABEL(strcmp_exitz) 392*533d3a49SEdward Gillett#endif 393*533d3a49SEdward Gillett pxor %xmm0, %xmm0 394*533d3a49SEdward Gillett sub $0x1000, %r10 /* subtract 4K from %r10 */ 395*533d3a49SEdward Gillett jmp LABEL(gobble_ashr_2) 3967c478bd9Sstevel@tonic-gate 397*533d3a49SEdward Gillett/* 398*533d3a49SEdward Gillett * ashr_3 handles 
the following cases: 399*533d3a49SEdward Gillett * abs(str1 offset - str2 offset) = 13 400*533d3a49SEdward Gillett */ 401*533d3a49SEdward Gillett .p2align 4 402*533d3a49SEdward GillettLABEL(ashr_3): 403*533d3a49SEdward Gillett pxor %xmm0, %xmm0 404*533d3a49SEdward Gillett movdqa (%rdi), %xmm2 405*533d3a49SEdward Gillett movdqa (%rsi), %xmm1 406*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm0 407*533d3a49SEdward Gillett pslldq $13, %xmm2 408*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm2 409*533d3a49SEdward Gillett psubb %xmm0, %xmm2 410*533d3a49SEdward Gillett pmovmskb %xmm2, %r9d 411*533d3a49SEdward Gillett shr %cl, %edx 412*533d3a49SEdward Gillett shr %cl, %r9d 413*533d3a49SEdward Gillett sub %r9d, %edx 414*533d3a49SEdward Gillett jnz LABEL(less32bytes) 415*533d3a49SEdward Gillett movdqa (%rdi), %xmm3 416*533d3a49SEdward Gillett 417*533d3a49SEdward Gillett UPDATE_STRNCMP_COUNTER 418*533d3a49SEdward Gillett 419*533d3a49SEdward Gillett pxor %xmm0, %xmm0 420*533d3a49SEdward Gillett mov $16, %rcx /* index for loads */ 421*533d3a49SEdward Gillett mov $3, %r9d /* rdi bytes already examined. Used in exit code */ 422*533d3a49SEdward Gillett /* 423*533d3a49SEdward Gillett * Setup %r10 value allows us to detect crossing a page boundary. 424*533d3a49SEdward Gillett * When %r10 goes positive we are crossing a page boundary and 425*533d3a49SEdward Gillett * need to do a nibble. 
426*533d3a49SEdward Gillett */ 427*533d3a49SEdward Gillett lea 3(%rdi), %r10 428*533d3a49SEdward Gillett and $0xfff, %r10 /* offset into 4K page */ 429*533d3a49SEdward Gillett sub $0x1000, %r10 /* subtract 4K pagesize */ 430*533d3a49SEdward Gillett movdqa %xmm3, %xmm4 431*533d3a49SEdward Gillett 432*533d3a49SEdward Gillett .p2align 4 433*533d3a49SEdward GillettLABEL(loop_ashr_3): 434*533d3a49SEdward Gillett add $16, %r10 435*533d3a49SEdward Gillett jg LABEL(nibble_ashr_3) 436*533d3a49SEdward Gillett 437*533d3a49SEdward GillettLABEL(gobble_ashr_3): 438*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm1 439*533d3a49SEdward Gillett movdqa (%rdi, %rcx), %xmm2 440*533d3a49SEdward Gillett movdqa %xmm2, %xmm4 441*533d3a49SEdward Gillett 442*533d3a49SEdward Gillett psrldq $3, %xmm3 443*533d3a49SEdward Gillett pslldq $13, %xmm2 444*533d3a49SEdward Gillett por %xmm3, %xmm2 445*533d3a49SEdward Gillett 446*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm0 447*533d3a49SEdward Gillett pcmpeqb %xmm2, %xmm1 448*533d3a49SEdward Gillett psubb %xmm0, %xmm1 449*533d3a49SEdward Gillett pmovmskb %xmm1, %edx 450*533d3a49SEdward Gillett sub $0xffff, %edx 451*533d3a49SEdward Gillett jnz LABEL(exit) 4527c478bd9Sstevel@tonic-gate 4537c478bd9Sstevel@tonic-gate#ifdef USE_AS_STRNCMP 454*533d3a49SEdward Gillett sub $16, %r11 455*533d3a49SEdward Gillett jbe LABEL(strcmp_exitz) 4567c478bd9Sstevel@tonic-gate#endif 4577c478bd9Sstevel@tonic-gate 458*533d3a49SEdward Gillett add $16, %rcx 459*533d3a49SEdward Gillett movdqa %xmm4, %xmm3 4607c478bd9Sstevel@tonic-gate 461*533d3a49SEdward Gillett add $16, %r10 462*533d3a49SEdward Gillett jg LABEL(nibble_ashr_3) /* cross page boundary */ 4637c478bd9Sstevel@tonic-gate 464*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm1 465*533d3a49SEdward Gillett movdqa (%rdi, %rcx), %xmm2 466*533d3a49SEdward Gillett movdqa %xmm2, %xmm4 4677c478bd9Sstevel@tonic-gate 468*533d3a49SEdward Gillett psrldq $3, %xmm3 469*533d3a49SEdward Gillett pslldq $13, %xmm2 470*533d3a49SEdward 
Gillett por %xmm3, %xmm2 4717c478bd9Sstevel@tonic-gate 472*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm0 473*533d3a49SEdward Gillett pcmpeqb %xmm2, %xmm1 474*533d3a49SEdward Gillett psubb %xmm0, %xmm1 475*533d3a49SEdward Gillett pmovmskb %xmm1, %edx 476*533d3a49SEdward Gillett sub $0xffff, %edx 477*533d3a49SEdward Gillett jnz LABEL(exit) 4787c478bd9Sstevel@tonic-gate 4797c478bd9Sstevel@tonic-gate#ifdef USE_AS_STRNCMP 480*533d3a49SEdward Gillett sub $16, %r11 481*533d3a49SEdward Gillett jbe LABEL(strcmp_exitz) 4827c478bd9Sstevel@tonic-gate#endif 4837c478bd9Sstevel@tonic-gate 484*533d3a49SEdward Gillett add $16, %rcx 485*533d3a49SEdward Gillett movdqa %xmm4, %xmm3 486*533d3a49SEdward Gillett jmp LABEL(loop_ashr_3) 4877c478bd9Sstevel@tonic-gate 488*533d3a49SEdward Gillett .p2align 4 489*533d3a49SEdward GillettLABEL(nibble_ashr_3): 490*533d3a49SEdward Gillett psrldq $3, %xmm4 491*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm1 492*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm0 493*533d3a49SEdward Gillett pcmpeqb %xmm4, %xmm1 494*533d3a49SEdward Gillett psubb %xmm0, %xmm1 495*533d3a49SEdward Gillett pmovmskb %xmm1, %edx 496*533d3a49SEdward Gillett sub $0x1fff, %edx 497*533d3a49SEdward Gillett jnz LABEL(exit) 498*533d3a49SEdward Gillett#ifdef USE_AS_STRNCMP 499*533d3a49SEdward Gillett cmp $13, %r11 500*533d3a49SEdward Gillett jbe LABEL(strcmp_exitz) 501*533d3a49SEdward Gillett#endif 502*533d3a49SEdward Gillett pxor %xmm0, %xmm0 503*533d3a49SEdward Gillett sub $0x1000, %r10 /* subtract 4K from %r10 */ 504*533d3a49SEdward Gillett jmp LABEL(gobble_ashr_3) 5057c478bd9Sstevel@tonic-gate 506*533d3a49SEdward Gillett/* 507*533d3a49SEdward Gillett * ashr_4 handles the following cases: 508*533d3a49SEdward Gillett * abs(str1 offset - str2 offset) = 12 509*533d3a49SEdward Gillett */ 510*533d3a49SEdward Gillett .p2align 4 511*533d3a49SEdward GillettLABEL(ashr_4): 512*533d3a49SEdward Gillett pxor %xmm0, %xmm0 513*533d3a49SEdward Gillett movdqa (%rdi), %xmm2 514*533d3a49SEdward Gillett 
movdqa (%rsi), %xmm1 515*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm0 516*533d3a49SEdward Gillett pslldq $12, %xmm2 517*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm2 518*533d3a49SEdward Gillett psubb %xmm0, %xmm2 519*533d3a49SEdward Gillett pmovmskb %xmm2, %r9d 520*533d3a49SEdward Gillett shr %cl, %edx 521*533d3a49SEdward Gillett shr %cl, %r9d 522*533d3a49SEdward Gillett sub %r9d, %edx 523*533d3a49SEdward Gillett jnz LABEL(less32bytes) 524*533d3a49SEdward Gillett movdqa (%rdi), %xmm3 5257c478bd9Sstevel@tonic-gate 526*533d3a49SEdward Gillett UPDATE_STRNCMP_COUNTER 5277c478bd9Sstevel@tonic-gate 528*533d3a49SEdward Gillett pxor %xmm0, %xmm0 529*533d3a49SEdward Gillett mov $16, %rcx /* index for loads */ 530*533d3a49SEdward Gillett mov $4, %r9d /* rdi bytes already examined. Used in exit code */ 531*533d3a49SEdward Gillett /* 532*533d3a49SEdward Gillett * Setup %r10 value allows us to detect crossing a page boundary. 533*533d3a49SEdward Gillett * When %r10 goes positive we are crossing a page boundary and 534*533d3a49SEdward Gillett * need to do a nibble. 
535*533d3a49SEdward Gillett */ 536*533d3a49SEdward Gillett lea 4(%rdi), %r10 537*533d3a49SEdward Gillett and $0xfff, %r10 /* offset into 4K page */ 538*533d3a49SEdward Gillett sub $0x1000, %r10 /* subtract 4K pagesize */ 539*533d3a49SEdward Gillett movdqa %xmm3, %xmm4 5407c478bd9Sstevel@tonic-gate 541*533d3a49SEdward Gillett .p2align 4 542*533d3a49SEdward GillettLABEL(loop_ashr_4): 543*533d3a49SEdward Gillett add $16, %r10 544*533d3a49SEdward Gillett jg LABEL(nibble_ashr_4) 545*533d3a49SEdward Gillett 546*533d3a49SEdward GillettLABEL(gobble_ashr_4): 547*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm1 548*533d3a49SEdward Gillett movdqa (%rdi, %rcx), %xmm2 549*533d3a49SEdward Gillett movdqa %xmm2, %xmm4 550*533d3a49SEdward Gillett 551*533d3a49SEdward Gillett psrldq $4, %xmm3 552*533d3a49SEdward Gillett pslldq $12, %xmm2 553*533d3a49SEdward Gillett por %xmm3, %xmm2 554*533d3a49SEdward Gillett 555*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm0 556*533d3a49SEdward Gillett pcmpeqb %xmm2, %xmm1 557*533d3a49SEdward Gillett psubb %xmm0, %xmm1 558*533d3a49SEdward Gillett pmovmskb %xmm1, %edx 559*533d3a49SEdward Gillett sub $0xffff, %edx 560*533d3a49SEdward Gillett jnz LABEL(exit) 5617c478bd9Sstevel@tonic-gate 5627c478bd9Sstevel@tonic-gate#ifdef USE_AS_STRNCMP 563*533d3a49SEdward Gillett sub $16, %r11 564*533d3a49SEdward Gillett jbe LABEL(strcmp_exitz) 5657c478bd9Sstevel@tonic-gate#endif 5667c478bd9Sstevel@tonic-gate 567*533d3a49SEdward Gillett add $16, %rcx 568*533d3a49SEdward Gillett movdqa %xmm4, %xmm3 5697c478bd9Sstevel@tonic-gate 570*533d3a49SEdward Gillett add $16, %r10 571*533d3a49SEdward Gillett jg LABEL(nibble_ashr_4) /* cross page boundary */ 5727c478bd9Sstevel@tonic-gate 573*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm1 574*533d3a49SEdward Gillett movdqa (%rdi, %rcx), %xmm2 575*533d3a49SEdward Gillett movdqa %xmm2, %xmm4 5767c478bd9Sstevel@tonic-gate 577*533d3a49SEdward Gillett psrldq $4, %xmm3 578*533d3a49SEdward Gillett pslldq $12, %xmm2 579*533d3a49SEdward 
Gillett por %xmm3, %xmm2 5807c478bd9Sstevel@tonic-gate 581*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm0 582*533d3a49SEdward Gillett pcmpeqb %xmm2, %xmm1 583*533d3a49SEdward Gillett psubb %xmm0, %xmm1 584*533d3a49SEdward Gillett pmovmskb %xmm1, %edx 585*533d3a49SEdward Gillett sub $0xffff, %edx 586*533d3a49SEdward Gillett jnz LABEL(exit) 5877c478bd9Sstevel@tonic-gate 5887c478bd9Sstevel@tonic-gate#ifdef USE_AS_STRNCMP 589*533d3a49SEdward Gillett sub $16, %r11 590*533d3a49SEdward Gillett jbe LABEL(strcmp_exitz) 5917c478bd9Sstevel@tonic-gate#endif 5927c478bd9Sstevel@tonic-gate 593*533d3a49SEdward Gillett add $16, %rcx 594*533d3a49SEdward Gillett movdqa %xmm4, %xmm3 595*533d3a49SEdward Gillett jmp LABEL(loop_ashr_4) 5967c478bd9Sstevel@tonic-gate 597*533d3a49SEdward Gillett .p2align 4 598*533d3a49SEdward GillettLABEL(nibble_ashr_4): 599*533d3a49SEdward Gillett psrldq $4, %xmm4 600*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm1 601*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm0 602*533d3a49SEdward Gillett pcmpeqb %xmm4, %xmm1 603*533d3a49SEdward Gillett psubb %xmm0, %xmm1 604*533d3a49SEdward Gillett pmovmskb %xmm1, %edx 605*533d3a49SEdward Gillett sub $0x0fff, %edx 606*533d3a49SEdward Gillett jnz LABEL(exit) 607*533d3a49SEdward Gillett#ifdef USE_AS_STRNCMP 608*533d3a49SEdward Gillett cmp $12, %r11 609*533d3a49SEdward Gillett jbe LABEL(strcmp_exitz) 610*533d3a49SEdward Gillett#endif 611*533d3a49SEdward Gillett pxor %xmm0, %xmm0 612*533d3a49SEdward Gillett sub $0x1000, %r10 /* subtract 4K from %r10 */ 613*533d3a49SEdward Gillett jmp LABEL(gobble_ashr_4) 6147c478bd9Sstevel@tonic-gate 615*533d3a49SEdward Gillett/* 616*533d3a49SEdward Gillett * ashr_5 handles the following cases: 617*533d3a49SEdward Gillett * abs(str1 offset - str2 offset) = 11 618*533d3a49SEdward Gillett */ 619*533d3a49SEdward Gillett .p2align 4 620*533d3a49SEdward GillettLABEL(ashr_5): 621*533d3a49SEdward Gillett pxor %xmm0, %xmm0 622*533d3a49SEdward Gillett movdqa (%rdi), %xmm2 623*533d3a49SEdward Gillett 
movdqa (%rsi), %xmm1 624*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm0 625*533d3a49SEdward Gillett pslldq $11, %xmm2 626*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm2 627*533d3a49SEdward Gillett psubb %xmm0, %xmm2 628*533d3a49SEdward Gillett pmovmskb %xmm2, %r9d 629*533d3a49SEdward Gillett shr %cl, %edx 630*533d3a49SEdward Gillett shr %cl, %r9d 631*533d3a49SEdward Gillett sub %r9d, %edx 632*533d3a49SEdward Gillett jnz LABEL(less32bytes) 633*533d3a49SEdward Gillett movdqa (%rdi), %xmm3 6347c478bd9Sstevel@tonic-gate 635*533d3a49SEdward Gillett UPDATE_STRNCMP_COUNTER 6367c478bd9Sstevel@tonic-gate 637*533d3a49SEdward Gillett pxor %xmm0, %xmm0 638*533d3a49SEdward Gillett mov $16, %rcx /* index for loads */ 639*533d3a49SEdward Gillett mov $5, %r9d /* rdi bytes already examined. Used in exit code */ 640*533d3a49SEdward Gillett /* 641*533d3a49SEdward Gillett * Setup %r10 value allows us to detect crossing a page boundary. 642*533d3a49SEdward Gillett * When %r10 goes positive we are crossing a page boundary and 643*533d3a49SEdward Gillett * need to do a nibble. 
644*533d3a49SEdward Gillett */ 645*533d3a49SEdward Gillett lea 5(%rdi), %r10 646*533d3a49SEdward Gillett and $0xfff, %r10 /* offset into 4K page */ 647*533d3a49SEdward Gillett sub $0x1000, %r10 /* subtract 4K pagesize */ 648*533d3a49SEdward Gillett movdqa %xmm3, %xmm4 6497c478bd9Sstevel@tonic-gate 650*533d3a49SEdward Gillett .p2align 4 651*533d3a49SEdward GillettLABEL(loop_ashr_5): 652*533d3a49SEdward Gillett add $16, %r10 653*533d3a49SEdward Gillett jg LABEL(nibble_ashr_5) 654*533d3a49SEdward Gillett 655*533d3a49SEdward GillettLABEL(gobble_ashr_5): 656*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm1 657*533d3a49SEdward Gillett movdqa (%rdi, %rcx), %xmm2 658*533d3a49SEdward Gillett movdqa %xmm2, %xmm4 659*533d3a49SEdward Gillett 660*533d3a49SEdward Gillett psrldq $5, %xmm3 661*533d3a49SEdward Gillett pslldq $11, %xmm2 662*533d3a49SEdward Gillett por %xmm3, %xmm2 663*533d3a49SEdward Gillett 664*533d3a49SEdward Gillett pcmpeqb %xmm1, %xmm0 665*533d3a49SEdward Gillett pcmpeqb %xmm2, %xmm1 666*533d3a49SEdward Gillett psubb %xmm0, %xmm1 667*533d3a49SEdward Gillett pmovmskb %xmm1, %edx 668*533d3a49SEdward Gillett sub $0xffff, %edx 669*533d3a49SEdward Gillett jnz LABEL(exit) 6707c478bd9Sstevel@tonic-gate 6717c478bd9Sstevel@tonic-gate#ifdef USE_AS_STRNCMP 672*533d3a49SEdward Gillett sub $16, %r11 673*533d3a49SEdward Gillett jbe LABEL(strcmp_exitz) 6747c478bd9Sstevel@tonic-gate#endif 6757c478bd9Sstevel@tonic-gate 676*533d3a49SEdward Gillett add $16, %rcx 677*533d3a49SEdward Gillett movdqa %xmm4, %xmm3 6787c478bd9Sstevel@tonic-gate 679*533d3a49SEdward Gillett add $16, %r10 680*533d3a49SEdward Gillett jg LABEL(nibble_ashr_5) /* cross page boundary */ 6817c478bd9Sstevel@tonic-gate 682*533d3a49SEdward Gillett movdqa (%rsi, %rcx), %xmm1 683*533d3a49SEdward Gillett movdqa (%rdi, %rcx), %xmm2 684*533d3a49SEdward Gillett movdqa %xmm2, %xmm4 6857c478bd9Sstevel@tonic-gate 686*533d3a49SEdward Gillett psrldq $5, %xmm3 687*533d3a49SEdward Gillett pslldq $11, %xmm2 688*533d3a49SEdward 
/*
 * Tail of the second unrolled compare in loop_ashr_5: the two shifted
 * halves are merged into %xmm2 and checked against the %rsi chunk.
 */
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi byte is NUL */
	pcmpeqb	%xmm2, %xmm1		/* 0xff where the two bytes match */
	psubb	%xmm0, %xmm1		/* sign bit survives only for match && !NUL */
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero iff all 16 bytes match, none NUL */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11		/* charge 16 bytes to the strncmp count */
	jbe	LABEL(strcmp_exitz)	/* count used up */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3		/* current %rdi chunk becomes the previous one */
	jmp	LABEL(loop_ashr_5)

/*
 * nibble_ashr_5: reached when %r10 went positive, i.e. the next 16-byte
 * %rdi load would cross a page boundary.  Only the 11 bytes still held
 * in %xmm4 are compared before that load is performed.
 */
	.p2align 4
LABEL(nibble_ashr_5):
	psrldq	$5, %xmm4		/* keep 11 unexamined bytes, zero-fill the top */
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x07ff, %edx		/* zero iff the low 11 bytes match, none NUL */
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$11, %r11
	jbe	LABEL(strcmp_exitz)	/* count ends within these 11 bytes */
#endif
	pxor	%xmm0, %xmm0		/* NUL mask back to zero for the loop */
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_5)

/*
 * ashr_6 handles the following cases:
 *	abs(str1 offset - str2 offset) = 10
 *
 * The first aligned 16-byte chunks are compared here; the loop below
 * then rebuilds every 16-byte %rdi window from two aligned loads
 * (previous chunk shifted right by 6, current chunk shifted left by 10).
 */
	.p2align 4
LABEL(ashr_6):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* NUL bytes in the %rsi chunk */
	pslldq	$10, %xmm2		/* move the %rdi chunk up by 10 bytes */
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	/*
	 * NOTE(review): %cl and the incoming %edx are not written in this
	 * routine; presumably the dispatch code that branches here sets
	 * them -- confirm there.
	 */
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3		/* first %rdi chunk acts as "previous" chunk */

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$6, %r9d		/* rdi bytes already examined. Used in exit code */
	/*
	 * %r10 = ((%rdi + 6) & 0xfff) - 4K, so that a page crossing can be
	 * detected.  When %r10 goes positive we are crossing a page
	 * boundary and need to do a nibble.
	 */
	lea	6(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */
	movdqa	%xmm3, %xmm4		/* the nibble code reads the previous chunk from %xmm4 */

	.p2align 4
LABEL(loop_ashr_6):
	add	$16, %r10
	jg	LABEL(nibble_ashr_6)	/* cross page boundary */

LABEL(gobble_ashr_6):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep an unshifted copy for the next round */

	psrldq	$6, %xmm3		/* drop the 6 bytes already examined */
	pslldq	$10, %xmm2		/* low 6 bytes of the new chunk go on top */
	por	%xmm3, %xmm2		/* merged 16-byte %rdi window */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero iff all 16 bytes match, none NUL */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	/* second, unrolled copy of the same step */
	add	$16, %r10
	jg	LABEL(nibble_ashr_6)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$6, %xmm3
	pslldq	$10, %xmm2
/*
 * Tail of the second unrolled compare in loop_ashr_6: the two shifted
 * halves are merged into %xmm2 and checked against the %rsi chunk.
 */
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi byte is NUL */
	pcmpeqb	%xmm2, %xmm1		/* 0xff where the two bytes match */
	psubb	%xmm0, %xmm1		/* sign bit survives only for match && !NUL */
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero iff all 16 bytes match, none NUL */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11		/* charge 16 bytes to the strncmp count */
	jbe	LABEL(strcmp_exitz)	/* count used up */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3		/* current %rdi chunk becomes the previous one */
	jmp	LABEL(loop_ashr_6)

/*
 * nibble_ashr_6: reached when %r10 went positive, i.e. the next 16-byte
 * %rdi load would cross a page boundary.  Only the 10 bytes still held
 * in %xmm4 are compared before that load is performed.
 */
	.p2align 4
LABEL(nibble_ashr_6):
	psrldq	$6, %xmm4		/* keep 10 unexamined bytes, zero-fill the top */
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x03ff, %edx		/* zero iff the low 10 bytes match, none NUL */
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$10, %r11
	jbe	LABEL(strcmp_exitz)	/* count ends within these 10 bytes */
#endif
	pxor	%xmm0, %xmm0		/* NUL mask back to zero for the loop */
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_6)

/*
 * ashr_7 handles the following cases:
 *	abs(str1 offset - str2 offset) = 9
 *
 * The first aligned 16-byte chunks are compared here; the loop below
 * then rebuilds every 16-byte %rdi window from two aligned loads
 * (previous chunk shifted right by 7, current chunk shifted left by 9).
 */
	.p2align 4
LABEL(ashr_7):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* NUL bytes in the %rsi chunk */
	pslldq	$9, %xmm2		/* move the %rdi chunk up by 9 bytes */
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	/*
	 * NOTE(review): %cl and the incoming %edx are not written in this
	 * routine; presumably the dispatch code that branches here sets
	 * them -- confirm there.
	 */
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3		/* first %rdi chunk acts as "previous" chunk */

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$7, %r9d		/* rdi bytes already examined. Used in exit code */
	/*
	 * %r10 = ((%rdi + 7) & 0xfff) - 4K, so that a page crossing can be
	 * detected.  When %r10 goes positive we are crossing a page
	 * boundary and need to do a nibble.
	 */
	lea	7(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */
	movdqa	%xmm3, %xmm4		/* the nibble code reads the previous chunk from %xmm4 */

	.p2align 4
LABEL(loop_ashr_7):
	add	$16, %r10
	jg	LABEL(nibble_ashr_7)	/* cross page boundary */

LABEL(gobble_ashr_7):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep an unshifted copy for the next round */

	psrldq	$7, %xmm3		/* drop the 7 bytes already examined */
	pslldq	$9, %xmm2		/* low 7 bytes of the new chunk go on top */
	por	%xmm3, %xmm2		/* merged 16-byte %rdi window */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero iff all 16 bytes match, none NUL */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	/* second, unrolled copy of the same step */
	add	$16, %r10
	jg	LABEL(nibble_ashr_7)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$7, %xmm3
	pslldq	$9, %xmm2
/*
 * Tail of the second unrolled compare in loop_ashr_7: the two shifted
 * halves are merged into %xmm2 and checked against the %rsi chunk.
 */
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi byte is NUL */
	pcmpeqb	%xmm2, %xmm1		/* 0xff where the two bytes match */
	psubb	%xmm0, %xmm1		/* sign bit survives only for match && !NUL */
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero iff all 16 bytes match, none NUL */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11		/* charge 16 bytes to the strncmp count */
	jbe	LABEL(strcmp_exitz)	/* count used up */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3		/* current %rdi chunk becomes the previous one */
	jmp	LABEL(loop_ashr_7)

/*
 * nibble_ashr_7: reached when %r10 went positive, i.e. the next 16-byte
 * %rdi load would cross a page boundary.  Only the 9 bytes still held
 * in %xmm4 are compared before that load is performed.
 */
	.p2align 4
LABEL(nibble_ashr_7):
	psrldq	$7, %xmm4		/* keep 9 unexamined bytes, zero-fill the top */
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x01ff, %edx		/* zero iff the low 9 bytes match, none NUL */
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$9, %r11
	jbe	LABEL(strcmp_exitz)	/* count ends within these 9 bytes */
#endif
	pxor	%xmm0, %xmm0		/* NUL mask back to zero for the loop */
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_7)

/*
 * ashr_8 handles the following cases:
 *	abs(str1 offset - str2 offset) = 8
 *
 * The first aligned 16-byte chunks are compared here; the loop below
 * then rebuilds every 16-byte %rdi window from two aligned loads
 * (previous chunk shifted right by 8, current chunk shifted left by 8).
 */
	.p2align 4
LABEL(ashr_8):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* NUL bytes in the %rsi chunk */
	pslldq	$8, %xmm2		/* move the %rdi chunk up by 8 bytes */
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	/*
	 * NOTE(review): %cl and the incoming %edx are not written in this
	 * routine; presumably the dispatch code that branches here sets
	 * them -- confirm there.
	 */
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3		/* first %rdi chunk acts as "previous" chunk */

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$8, %r9d		/* rdi bytes already examined. Used in exit code */
	/*
	 * %r10 = ((%rdi + 8) & 0xfff) - 4K, so that a page crossing can be
	 * detected.  When %r10 goes positive we are crossing a page
	 * boundary and need to do a nibble.
	 */
	lea	8(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */
	movdqa	%xmm3, %xmm4		/* the nibble code reads the previous chunk from %xmm4 */

	.p2align 4
LABEL(loop_ashr_8):
	add	$16, %r10
	jg	LABEL(nibble_ashr_8)	/* cross page boundary */

LABEL(gobble_ashr_8):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep an unshifted copy for the next round */

	psrldq	$8, %xmm3		/* drop the 8 bytes already examined */
	pslldq	$8, %xmm2		/* low 8 bytes of the new chunk go on top */
	por	%xmm3, %xmm2		/* merged 16-byte %rdi window */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero iff all 16 bytes match, none NUL */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	/* second, unrolled copy of the same step */
	add	$16, %r10
	jg	LABEL(nibble_ashr_8)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$8, %xmm3
	pslldq	$8, %xmm2
/*
 * Tail of the second unrolled compare in loop_ashr_8: the two shifted
 * halves are merged into %xmm2 and checked against the %rsi chunk.
 */
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi byte is NUL */
	pcmpeqb	%xmm2, %xmm1		/* 0xff where the two bytes match */
	psubb	%xmm0, %xmm1		/* sign bit survives only for match && !NUL */
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero iff all 16 bytes match, none NUL */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11		/* charge 16 bytes to the strncmp count */
	jbe	LABEL(strcmp_exitz)	/* count used up */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3		/* current %rdi chunk becomes the previous one */
	jmp	LABEL(loop_ashr_8)

/*
 * nibble_ashr_8: reached when %r10 went positive, i.e. the next 16-byte
 * %rdi load would cross a page boundary.  Only the 8 bytes still held
 * in %xmm4 are compared before that load is performed.
 */
	.p2align 4
LABEL(nibble_ashr_8):
	psrldq	$8, %xmm4		/* keep 8 unexamined bytes, zero-fill the top */
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x00ff, %edx		/* zero iff the low 8 bytes match, none NUL */
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$8, %r11
	jbe	LABEL(strcmp_exitz)	/* count ends within these 8 bytes */
#endif
	pxor	%xmm0, %xmm0		/* NUL mask back to zero for the loop */
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_8)

/*
 * ashr_9 handles the following cases:
 *	abs(str1 offset - str2 offset) = 7
 *
 * The first aligned 16-byte chunks are compared here; the loop below
 * then rebuilds every 16-byte %rdi window from two aligned loads
 * (previous chunk shifted right by 9, current chunk shifted left by 7).
 */
	.p2align 4
LABEL(ashr_9):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* NUL bytes in the %rsi chunk */
	pslldq	$7, %xmm2		/* move the %rdi chunk up by 7 bytes */
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	/*
	 * NOTE(review): %cl and the incoming %edx are not written in this
	 * routine; presumably the dispatch code that branches here sets
	 * them -- confirm there.
	 */
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3		/* first %rdi chunk acts as "previous" chunk */

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$9, %r9d		/* rdi bytes already examined. Used in exit code */
	/*
	 * %r10 = ((%rdi + 9) & 0xfff) - 4K, so that a page crossing can be
	 * detected.  When %r10 goes positive we are crossing a page
	 * boundary and need to do a nibble.
	 */
	lea	9(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */
	movdqa	%xmm3, %xmm4		/* the nibble code reads the previous chunk from %xmm4 */

	.p2align 4
LABEL(loop_ashr_9):
	add	$16, %r10
	jg	LABEL(nibble_ashr_9)	/* cross page boundary */

LABEL(gobble_ashr_9):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep an unshifted copy for the next round */

	psrldq	$9, %xmm3		/* drop the 9 bytes already examined */
	pslldq	$7, %xmm2		/* low 9 bytes of the new chunk go on top */
	por	%xmm3, %xmm2		/* merged 16-byte %rdi window */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero iff all 16 bytes match, none NUL */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	/* second, unrolled copy of the same step */
	add	$16, %r10
	jg	LABEL(nibble_ashr_9)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$9, %xmm3
/*
 * Tail of the second unrolled compare in loop_ashr_9: finish building
 * the merged %rdi window in %xmm2 and check it against the %rsi chunk.
 */
	pslldq	$7, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi byte is NUL */
	pcmpeqb	%xmm2, %xmm1		/* 0xff where the two bytes match */
	psubb	%xmm0, %xmm1		/* sign bit survives only for match && !NUL */
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero iff all 16 bytes match, none NUL */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11		/* charge 16 bytes to the strncmp count */
	jbe	LABEL(strcmp_exitz)	/* count used up */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3		/* store for next cycle */
	jmp	LABEL(loop_ashr_9)

/*
 * nibble_ashr_9: reached when %r10 went positive, i.e. the next 16-byte
 * %rdi load would cross a page boundary.  Only the 7 bytes still held
 * in %xmm4 are compared before that load is performed.
 */
	.p2align 4
LABEL(nibble_ashr_9):
	psrldq	$9, %xmm4		/* keep 7 unexamined bytes, zero-fill the top */
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x007f, %edx		/* zero iff the low 7 bytes match, none NUL */
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$7, %r11
	jbe	LABEL(strcmp_exitz)	/* count ends within these 7 bytes */
#endif
	pxor	%xmm0, %xmm0		/* NUL mask back to zero for the loop */
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_9)

/*
 * ashr_10 handles the following cases:
 *	abs(str1 offset - str2 offset) = 6
 *
 * The first aligned 16-byte chunks are compared here; the loop below
 * then rebuilds every 16-byte %rdi window from two aligned loads
 * (previous chunk shifted right by 10, current chunk shifted left by 6).
 */
	.p2align 4
LABEL(ashr_10):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* NUL bytes in the %rsi chunk */
	pslldq	$6, %xmm2		/* move the %rdi chunk up by 6 bytes */
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	/*
	 * NOTE(review): %cl and the incoming %edx are not written in this
	 * routine; presumably the dispatch code that branches here sets
	 * them -- confirm there.
	 */
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3		/* first %rdi chunk acts as "previous" chunk */

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$10, %r9d		/* rdi bytes already examined. Used in exit code */
	/*
	 * %r10 = ((%rdi + 10) & 0xfff) - 4K, so that a page crossing can
	 * be detected.  When %r10 goes positive we are crossing a page
	 * boundary and need to do a nibble.
	 */
	lea	10(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */
	movdqa	%xmm3, %xmm4		/* the nibble code reads the previous chunk from %xmm4 */

	.p2align 4
LABEL(loop_ashr_10):
	add	$16, %r10
	jg	LABEL(nibble_ashr_10)	/* cross page boundary */

LABEL(gobble_ashr_10):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep an unshifted copy for the next round */

	psrldq	$10, %xmm3		/* drop the 10 bytes already examined */
	pslldq	$6, %xmm2		/* low 10 bytes of the new chunk go on top */
	por	%xmm3, %xmm2		/* merged 16-byte %rdi window */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero iff all 16 bytes match, none NUL */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	/* second, unrolled copy of the same step */
	add	$16, %r10
	jg	LABEL(nibble_ashr_10)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$10, %xmm3
/*
 * Tail of the second unrolled compare in loop_ashr_10: finish building
 * the merged %rdi window in %xmm2 and check it against the %rsi chunk.
 */
	pslldq	$6, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi byte is NUL */
	pcmpeqb	%xmm2, %xmm1		/* 0xff where the two bytes match */
	psubb	%xmm0, %xmm1		/* sign bit survives only for match && !NUL */
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero iff all 16 bytes match, none NUL */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11		/* charge 16 bytes to the strncmp count */
	jbe	LABEL(strcmp_exitz)	/* count used up */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3		/* current %rdi chunk becomes the previous one */
	jmp	LABEL(loop_ashr_10)

/*
 * nibble_ashr_10: reached when %r10 went positive, i.e. the next
 * 16-byte %rdi load would cross a page boundary.  Only the 6 bytes
 * still held in %xmm4 are compared before that load is performed.
 */
	.p2align 4
LABEL(nibble_ashr_10):
	psrldq	$10, %xmm4		/* keep 6 unexamined bytes, zero-fill the top */
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x003f, %edx		/* zero iff the low 6 bytes match, none NUL */
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$6, %r11
	jbe	LABEL(strcmp_exitz)	/* count ends within these 6 bytes */
#endif
	pxor	%xmm0, %xmm0		/* NUL mask back to zero for the loop */
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_10)

/*
 * ashr_11 handles the following cases:
 *	abs(str1 offset - str2 offset) = 5
 *
 * The first aligned 16-byte chunks are compared here; the loop below
 * then rebuilds every 16-byte %rdi window from two aligned loads
 * (previous chunk shifted right by 11, current chunk shifted left by 5).
 */
	.p2align 4
LABEL(ashr_11):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* NUL bytes in the %rsi chunk */
	pslldq	$5, %xmm2		/* move the %rdi chunk up by 5 bytes */
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	/*
	 * NOTE(review): %cl and the incoming %edx are not written in this
	 * routine; presumably the dispatch code that branches here sets
	 * them -- confirm there.
	 */
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3		/* first %rdi chunk acts as "previous" chunk */

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$11, %r9d		/* rdi bytes already examined. Used in exit code */
	/*
	 * %r10 = ((%rdi + 11) & 0xfff) - 4K, so that a page crossing can
	 * be detected.  When %r10 goes positive we are crossing a page
	 * boundary and need to do a nibble.
	 */
	lea	11(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */
	movdqa	%xmm3, %xmm4		/* the nibble code reads the previous chunk from %xmm4 */

	.p2align 4
LABEL(loop_ashr_11):
	add	$16, %r10
	jg	LABEL(nibble_ashr_11)	/* cross page boundary */

LABEL(gobble_ashr_11):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep an unshifted copy for the next round */

	psrldq	$11, %xmm3		/* drop the 11 bytes already examined */
	pslldq	$5, %xmm2		/* low 11 bytes of the new chunk go on top */
	por	%xmm3, %xmm2		/* merged 16-byte %rdi window */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero iff all 16 bytes match, none NUL */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	/* second, unrolled copy of the same step */
	add	$16, %r10
	jg	LABEL(nibble_ashr_11)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$11, %xmm3
/*
 * Tail of the second unrolled compare in loop_ashr_11: finish building
 * the merged %rdi window in %xmm2 and check it against the %rsi chunk.
 */
	pslldq	$5, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi byte is NUL */
	pcmpeqb	%xmm2, %xmm1		/* 0xff where the two bytes match */
	psubb	%xmm0, %xmm1		/* sign bit survives only for match && !NUL */
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero iff all 16 bytes match, none NUL */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11		/* charge 16 bytes to the strncmp count */
	jbe	LABEL(strcmp_exitz)	/* count used up */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3		/* current %rdi chunk becomes the previous one */
	jmp	LABEL(loop_ashr_11)

/*
 * nibble_ashr_11: reached when %r10 went positive, i.e. the next
 * 16-byte %rdi load would cross a page boundary.  Only the 5 bytes
 * still held in %xmm4 are compared before that load is performed.
 */
	.p2align 4
LABEL(nibble_ashr_11):
	psrldq	$11, %xmm4		/* keep 5 unexamined bytes, zero-fill the top */
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x001f, %edx		/* zero iff the low 5 bytes match, none NUL */
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$5, %r11
	jbe	LABEL(strcmp_exitz)	/* count ends within these 5 bytes */
#endif
	pxor	%xmm0, %xmm0		/* NUL mask back to zero for the loop */
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_11)

/*
 * ashr_12 handles the following cases:
 *	abs(str1 offset - str2 offset) = 4
 */
	.p2align 4
LABEL(ashr_12):
	pxor	%xmm0, %xmm0
/*
 * ashr_12, first-block check.  LABEL(ashr_12) and the pxor that zeroes
 * %xmm0 come immediately before this point.  %cl and %edx are inputs
 * prepared by the code that dispatches here (outside this section).
 */
	movdqa	(%rdi), %xmm2		/* 16-byte block at %rdi */
	movdqa	(%rsi), %xmm1		/* 16-byte block at %rsi */
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi byte is NUL */
	pslldq	$4, %xmm2		/* slide %rdi bytes up by 16 - 12 */
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
	psubb	%xmm0, %xmm2		/* top bit kept: equal, not NUL */
	pmovmskb %xmm2, %r9d		/* one mask bit per byte */
	shr	%cl, %edx		/* drop the low %cl bits of both */
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)	/* masks differ: go to exit code */
	movdqa	(%rdi), %xmm3		/* unshifted block = "previous" */

	/* strncmp only: recompute the %r11 counter; empty for strcmp */
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0		/* all-zero reference for NUL tests */
	mov	$16, %rcx		/* byte index used by the loads */
	mov	$12, %r9d		/* %rdi bytes examined; see exit */
	/*
	 * %r10 = (page offset of %rdi + 12) - 4K, i.e. a negative value
	 * that is advanced by 16 for every block.  When it goes positive
	 * we are crossing a page boundary, and the loop detours through
	 * the nibble code first.
	 */
	lea	12(%rdi), %r10
	and	$0xfff, %r10		/* offset within the 4K page */
	sub	$0x1000, %r10		/* bias by one page */
	movdqa	%xmm3, %xmm4

	.p2align 4
LABEL(loop_ashr_12):
	add	$16, %r10
	jg	LABEL(nibble_ashr_12)	/* page boundary ahead */

LABEL(gobble_ashr_12):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* raw block, kept for next pass */

	/* %xmm2 = top 4 bytes of previous block | low 12 of this one */
	psrldq	$12, %xmm3
	pslldq	$4, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0		/* NUL bytes in the %rsi block */
	pcmpeqb	%xmm2, %xmm1		/* equal bytes */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* 0 iff all 16 equal, none NUL */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11		/* 16 more bytes matched */
	jbe	LABEL(strcmp_exitz)	/* count used up: equal */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3		/* this block becomes "previous" */

	/* Second, unrolled copy of the same step. */
	add	$16, %r10
	jg	LABEL(nibble_ashr_12)	/* page boundary ahead */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$12, %xmm3
	pslldq	$4, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_12)

	/*
	 * Page-boundary path.  Only the top 4 bytes of the last %rdi block
	 * (%xmm4) are still uncompared; test them against the low 4 bytes
	 * of the current %rsi block before gobble loads the next %rdi
	 * block.
	 */
	.p2align 4
LABEL(nibble_ashr_12):
	psrldq	$12, %xmm4
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x000f, %edx		/* 0 iff exactly the low 4 bits set */
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$4, %r11		/* were those 4 bytes the last? */
	jbe	LABEL(strcmp_exitz)
#endif
	pxor	%xmm0, %xmm0		/* drop NUL marks of the partial test */
	sub	$0x1000, %r10		/* re-arm the page counter */
	jmp	LABEL(gobble_ashr_12)

/*
 * ashr_13 is entered for:
 *	abs(str1 offset - str2 offset) = 3
 */
	.p2align 4
LABEL(ashr_13):
	pxor	%xmm0, %xmm0
/*
 * ashr_13, first-block check.  LABEL(ashr_13) and the pxor that zeroes
 * %xmm0 come immediately before this point.  %cl and %edx are inputs
 * prepared by the code that dispatches here (outside this section).
 */
	movdqa	(%rdi), %xmm2		/* 16-byte block at %rdi */
	movdqa	(%rsi), %xmm1		/* 16-byte block at %rsi */
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi byte is NUL */
	pslldq	$3, %xmm2		/* slide %rdi bytes up by 16 - 13 */
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
	psubb	%xmm0, %xmm2		/* top bit kept: equal, not NUL */
	pmovmskb %xmm2, %r9d		/* one mask bit per byte */
	shr	%cl, %edx		/* drop the low %cl bits of both */
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)	/* masks differ: go to exit code */
	movdqa	(%rdi), %xmm3		/* unshifted block = "previous" */

	/* strncmp only: recompute the %r11 counter; empty for strcmp */
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0		/* all-zero reference for NUL tests */
	mov	$16, %rcx		/* byte index used by the loads */
	mov	$13, %r9d		/* %rdi bytes examined; see exit */
	/*
	 * %r10 = (page offset of %rdi + 13) - 4K, i.e. a negative value
	 * that is advanced by 16 for every block.  When it goes positive
	 * we are crossing a page boundary, and the loop detours through
	 * the nibble code first.
	 */
	lea	13(%rdi), %r10
	and	$0xfff, %r10		/* offset within the 4K page */
	sub	$0x1000, %r10		/* bias by one page */
	movdqa	%xmm3, %xmm4

	.p2align 4
LABEL(loop_ashr_13):
	add	$16, %r10
	jg	LABEL(nibble_ashr_13)	/* page boundary ahead */

LABEL(gobble_ashr_13):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* raw block, kept for next pass */

	/* %xmm2 = top 3 bytes of previous block | low 13 of this one */
	psrldq	$13, %xmm3
	pslldq	$3, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0		/* NUL bytes in the %rsi block */
	pcmpeqb	%xmm2, %xmm1		/* equal bytes */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* 0 iff all 16 equal, none NUL */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11		/* 16 more bytes matched */
	jbe	LABEL(strcmp_exitz)	/* count used up: equal */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3		/* this block becomes "previous" */

	/* Second, unrolled copy of the same step. */
	add	$16, %r10
	jg	LABEL(nibble_ashr_13)	/* page boundary ahead */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$13, %xmm3
	pslldq	$3, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_13)

	/*
	 * Page-boundary path.  Only the top 3 bytes of the last %rdi block
	 * (%xmm4) are still uncompared; test them against the low 3 bytes
	 * of the current %rsi block before gobble loads the next %rdi
	 * block.
	 */
	.p2align 4
LABEL(nibble_ashr_13):
	psrldq	$13, %xmm4
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x0007, %edx		/* 0 iff exactly the low 3 bits set */
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$3, %r11		/* were those 3 bytes the last? */
	jbe	LABEL(strcmp_exitz)
#endif
	pxor	%xmm0, %xmm0		/* drop NUL marks of the partial test */
	sub	$0x1000, %r10		/* re-arm the page counter */
	jmp	LABEL(gobble_ashr_13)

/*
 * ashr_14 is entered for:
 *	abs(str1 offset - str2 offset) = 2
 */
	.p2align 4
LABEL(ashr_14):
	pxor	%xmm0, %xmm0
/*
 * ashr_14, first-block check.  LABEL(ashr_14) and the pxor that zeroes
 * %xmm0 come immediately before this point.  %cl and %edx are inputs
 * prepared by the code that dispatches here (outside this section).
 */
	movdqa	(%rdi), %xmm2		/* 16-byte block at %rdi */
	movdqa	(%rsi), %xmm1		/* 16-byte block at %rsi */
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi byte is NUL */
	pslldq	$2, %xmm2		/* slide %rdi bytes up by 16 - 14 */
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
	psubb	%xmm0, %xmm2		/* top bit kept: equal, not NUL */
	pmovmskb %xmm2, %r9d		/* one mask bit per byte */
	shr	%cl, %edx		/* drop the low %cl bits of both */
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)	/* masks differ: go to exit code */
	movdqa	(%rdi), %xmm3		/* unshifted block = "previous" */

	/* strncmp only: recompute the %r11 counter; empty for strcmp */
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0		/* all-zero reference for NUL tests */
	mov	$16, %rcx		/* byte index used by the loads */
	mov	$14, %r9d		/* %rdi bytes examined; see exit */
	/*
	 * %r10 = (page offset of %rdi + 14) - 4K, i.e. a negative value
	 * that is advanced by 16 for every block.  When it goes positive
	 * we are crossing a page boundary, and the loop detours through
	 * the nibble code first.
	 */
	lea	14(%rdi), %r10
	and	$0xfff, %r10		/* offset within the 4K page */
	sub	$0x1000, %r10		/* bias by one page */
	movdqa	%xmm3, %xmm4

	.p2align 4
LABEL(loop_ashr_14):
	add	$16, %r10
	jg	LABEL(nibble_ashr_14)	/* page boundary ahead */

LABEL(gobble_ashr_14):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* raw block, kept for next pass */

	/* %xmm2 = top 2 bytes of previous block | low 14 of this one */
	psrldq	$14, %xmm3
	pslldq	$2, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0		/* NUL bytes in the %rsi block */
	pcmpeqb	%xmm2, %xmm1		/* equal bytes */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* 0 iff all 16 equal, none NUL */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11		/* 16 more bytes matched */
	jbe	LABEL(strcmp_exitz)	/* count used up: equal */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3		/* this block becomes "previous" */

	/* Second, unrolled copy of the same step. */
	add	$16, %r10
	jg	LABEL(nibble_ashr_14)	/* page boundary ahead */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$14, %xmm3
	pslldq	$2, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_14)

	/*
	 * Page-boundary path.  Only the top 2 bytes of the last %rdi block
	 * (%xmm4) are still uncompared; test them against the low 2 bytes
	 * of the current %rsi block before gobble loads the next %rdi
	 * block.
	 */
	.p2align 4
LABEL(nibble_ashr_14):
	psrldq	$14, %xmm4
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x0003, %edx		/* 0 iff exactly the low 2 bits set */
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$2, %r11		/* were those 2 bytes the last? */
	jbe	LABEL(strcmp_exitz)
#endif
	pxor	%xmm0, %xmm0		/* drop NUL marks of the partial test */
	sub	$0x1000, %r10		/* re-arm the page counter */
	jmp	LABEL(gobble_ashr_14)

/*
 * ashr_15 is entered for:
 *	abs(str1 offset - str2 offset) = 1
 */
	.p2align 4
LABEL(ashr_15):
	pxor	%xmm0, %xmm0
/*
 * ashr_15, first-block check.  LABEL(ashr_15) and the pxor that zeroes
 * %xmm0 come immediately before this point.  %cl and %edx are inputs
 * prepared by the code that dispatches here (outside this section).
 */
	movdqa	(%rdi), %xmm2		/* 16-byte block at %rdi */
	movdqa	(%rsi), %xmm1		/* 16-byte block at %rsi */
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the %rsi byte is NUL */
	pslldq	$1, %xmm2		/* slide %rdi bytes up by 16 - 15 */
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
	psubb	%xmm0, %xmm2		/* top bit kept: equal, not NUL */
	pmovmskb %xmm2, %r9d		/* one mask bit per byte */
	shr	%cl, %edx		/* drop the low %cl bits of both */
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)	/* masks differ: go to exit code */

	movdqa	(%rdi), %xmm3		/* unshifted block = "previous" */

	/* strncmp only: recompute the %r11 counter; empty for strcmp */
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0		/* all-zero reference for NUL tests */
	mov	$16, %rcx		/* byte index used by the loads */
	mov	$15, %r9d		/* %rdi bytes examined; see exit */
	/*
	 * %r10 = (page offset of %rdi + 15) - 4K, i.e. a negative value
	 * that is advanced by 16 for every block.  When it goes positive
	 * we are crossing a page boundary, and the loop detours through
	 * the nibble code first.
	 */
	lea	15(%rdi), %r10
	and	$0xfff, %r10		/* offset within the 4K page */
	sub	$0x1000, %r10		/* bias by one page */
	movdqa	%xmm3, %xmm4

	.p2align 4
LABEL(loop_ashr_15):
	add	$16, %r10
	jg	LABEL(nibble_ashr_15)	/* page boundary ahead */

LABEL(gobble_ashr_15):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* raw block, kept for next pass */

	/* %xmm2 = last byte of previous block | low 15 of this one */
	psrldq	$15, %xmm3
	pslldq	$1, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0		/* NUL bytes in the %rsi block */
	pcmpeqb	%xmm2, %xmm1		/* equal bytes */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* 0 iff all 16 equal, none NUL */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11		/* 16 more bytes matched */
	jbe	LABEL(strcmp_exitz)	/* count used up: equal */
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3		/* this block becomes "previous" */

	/* Second, unrolled copy of the same step. */
	add	$16, %r10
	jg	LABEL(nibble_ashr_15)	/* page boundary ahead */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$15, %xmm3
	pslldq	$1, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_15)

	/*
	 * Page-boundary path.  Only the last byte of the last %rdi block
	 * (%xmm4) is still uncompared; test it against the first byte of
	 * the current %rsi block before gobble loads the next %rdi block.
	 */
	.p2align 4
LABEL(nibble_ashr_15):
	psrldq	$15, %xmm4
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x0001, %edx		/* 0 iff exactly bit 0 is set */
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$1, %r11		/* was that byte the last one? */
	jbe	LABEL(strcmp_exitz)
#endif
	pxor	%xmm0, %xmm0		/* drop NUL marks of the partial test */
	sub	$0x1000, %r10		/* re-arm the page counter */
	jmp	LABEL(gobble_ashr_15)

	/*
	 * Common exit of the ashr loops: turn the loop state (%r9 bytes
	 * already examined, %rcx load index) into an offset, then fall
	 * into less32bytes to apply it.
	 */
	.p2align 4
LABEL(exit):
	lea	-16(%r9, %rcx), %rax	/* exact offset for %rdi */
LABEL(less32bytes):
	lea	(%rdi, %rax), %rdi	/* exact address, first operand */
/*
 * Rest of LABEL(less32bytes).  The instruction just before this point has
 * already advanced %rdi by %rax; do the same for %rsi using %rcx, then
 * undo the operand swap if the %r8d flag is set.
 */
	lea	(%rsi, %rcx), %rsi	/* exact address, second operand */
	test	%r8d, %r8d
	jz	LABEL(ret)
	xchg	%rsi, %rdi		/* %r8d != 0: recover original order */

	.p2align 4
LABEL(ret):
LABEL(less16bytes):
	/*
	 * The lowest set bit of %rdx indexes the byte that decides the
	 * result.  If the USE_BSF bit is set in .memops_method (bsf is
	 * fast on this processor) use the bsf tail below; otherwise take
	 * the tail at LABEL(AMD_exit).
	 */
	testl	$USE_BSF,.memops_method(%rip)
	jz	LABEL(AMD_exit)
	bsf	%rdx, %rdx		/* %rdx = index of lowest set bit */

#ifdef USE_AS_STRNCMP
	sub	%rdx, %r11		/* is that byte beyond the count? */
	jbe	LABEL(strcmp_exitz)
#endif
	xor	%ecx, %ecx		/* upper bits of both results = 0 */
	xor	%eax, %eax

	movb	(%rsi, %rdx), %cl
	movb	(%rdi, %rdx), %al

	sub	%ecx, %eax		/* %rdi byte minus %rsi byte */
	ret

#ifdef USE_AS_STRNCMP
	/* strncmp: the count ran out before any difference was found. */
LABEL(strcmp_exitz):
	xor	%eax, %eax
	ret
#endif

	/*
	 * Exit tail that avoids the bsf instruction: test the bits of %dl
	 * one at a time, lowest first.  If %dl is zero the deciding byte is
	 * one of bytes 8-15 and next_8_bytes repeats the scan on %dh.
	 */
	.p2align 4
LABEL(AMD_exit):
	test	%dl, %dl
	jz	LABEL(next_8_bytes)

	test	$0x01, %dl
	jnz	LABEL(Byte0)

	test	$0x02, %dl
	jnz	LABEL(Byte1)

	test	$0x04, %dl
	jnz	LABEL(Byte2)

	test	$0x08, %dl
	jnz	LABEL(Byte3)

	test	$0x10, %dl
	jnz	LABEL(Byte4)

	test	$0x20, %dl
	jnz	LABEL(Byte5)

	test	$0x40, %dl
	jnz	LABEL(Byte6)

	/* None of bits 0-6, and %dl is non-zero: it is byte 7. */
#ifdef USE_AS_STRNCMP
	sub	$7, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	movzx	7(%rsi), %ecx
	movzx	7(%rdi), %eax

	sub	%ecx, %eax
	ret

	/*
	 * Byte0 .. Byte6: return the %rdi byte minus the %rsi byte at the
	 * given offset.  For strncmp each tail first subtracts its offset
	 * from %r11 and returns 0 if the count does not reach that byte.
	 *
	 * Byte0 has no such check: the original authors note that byte 0
	 * never needs to be handled for strncmp (the check would subtract
	 * 0 from %r11).
	 */
	.p2align 4
LABEL(Byte0):
	movzx	(%rsi), %ecx
	movzx	(%rdi), %eax

	sub	%ecx, %eax
	ret

	.p2align 4
LABEL(Byte1):
#ifdef USE_AS_STRNCMP
	sub	$1, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	movzx	1(%rsi), %ecx
	movzx	1(%rdi), %eax

	sub	%ecx, %eax
	ret

	.p2align 4
LABEL(Byte2):
#ifdef USE_AS_STRNCMP
	sub	$2, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	movzx	2(%rsi), %ecx
	movzx	2(%rdi), %eax

	sub	%ecx, %eax
	ret

	.p2align 4
LABEL(Byte3):
#ifdef USE_AS_STRNCMP
	sub	$3, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	movzx	3(%rsi), %ecx
	movzx	3(%rdi), %eax

	sub	%ecx, %eax
	ret

	.p2align 4
LABEL(Byte4):
#ifdef USE_AS_STRNCMP
	sub	$4, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	movzx	4(%rsi), %ecx
	movzx	4(%rdi), %eax

	sub	%ecx, %eax
	ret

	.p2align 4
LABEL(Byte5):
#ifdef USE_AS_STRNCMP
	sub	$5, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	movzx	5(%rsi), %ecx
	movzx	5(%rdi), %eax

	sub	%ecx, %eax
	ret

	.p2align 4
LABEL(Byte6):
#ifdef USE_AS_STRNCMP
	sub	$6, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	movzx	6(%rsi), %ecx
	movzx	6(%rdi), %eax

	sub	%ecx, %eax
	ret

	/*
	 * The deciding byte is in the upper half of the block: step both
	 * pointers (and, for strncmp, the count) past the first 8 bytes,
	 * then scan %dh with the same ByteN tails.
	 */
	.p2align 4
LABEL(next_8_bytes):
	add	$8, %rdi
	add	$8, %rsi
#ifdef USE_AS_STRNCMP
	sub	$8, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	test	$0x01, %dh
	jnz	LABEL(Byte0)

	test	$0x02, %dh
	jnz	LABEL(Byte1)

	test	$0x04, %dh
	jnz	LABEL(Byte2)

	test	$0x08, %dh
	jnz	LABEL(Byte3)

	test	$0x10, %dh
	jnz	LABEL(Byte4)

	test	$0x20, %dh
	jnz	LABEL(Byte5)

	test	$0x40, %dh
	jnz	LABEL(Byte6)

	/* No bit among 0-6 of %dh: fall through to byte 7 of this half. */
#ifdef USE_AS_STRNCMP
	sub	$7, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	movzx	7(%rsi), %ecx
	movzx	7(%rdi), %eax

	sub	%ecx, %eax
	ret

	/*
	 * Table of the ashr_N entry points, stored in .rodata as 32-bit
	 * offsets relative to the table itself.  Slot 0 is ashr_0; slots
	 * 1-15 hold ashr_15 down to ashr_1.
	 * NOTE(review): the code that indexes this table is outside this
	 * section - confirm the index it uses against this ordering.
	 */
	.pushsection .rodata
	.p2align 4
LABEL(unaligned_table):
	.int	LABEL(ashr_0) - LABEL(unaligned_table)
	.int	LABEL(ashr_15) - LABEL(unaligned_table)
	.int	LABEL(ashr_14) - LABEL(unaligned_table)
	.int	LABEL(ashr_13) - LABEL(unaligned_table)
	.int	LABEL(ashr_12) - LABEL(unaligned_table)
	.int	LABEL(ashr_11) - LABEL(unaligned_table)
	.int	LABEL(ashr_10) - LABEL(unaligned_table)
	.int	LABEL(ashr_9) - LABEL(unaligned_table)
	.int	LABEL(ashr_8) - LABEL(unaligned_table)
	.int	LABEL(ashr_7) - LABEL(unaligned_table)
	.int	LABEL(ashr_6) - LABEL(unaligned_table)
	.int	LABEL(ashr_5) - LABEL(unaligned_table)
	.int	LABEL(ashr_4) - LABEL(unaligned_table)
	.int	LABEL(ashr_3) - LABEL(unaligned_table)
	.int	LABEL(ashr_2) - LABEL(unaligned_table)
	.int	LABEL(ashr_1) - LABEL(unaligned_table)
	.popsection

	/* Close the function under whichever name this build gives it. */
#ifdef USE_AS_STRNCMP
	SET_SIZE(strncmp)
#else
	SET_SIZE(strcmp)	/* (const char *, const char *) */
#endif