/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, Intel Corporation
 * All rights reserved.
25*5d9d9091SRichard Lowe */ 26*5d9d9091SRichard Lowe 27*5d9d9091SRichard Lowe/* 28*5d9d9091SRichard Lowe * str[n]cmp - compare chars between two string 29*5d9d9091SRichard Lowe */ 30*5d9d9091SRichard Lowe 31*5d9d9091SRichard Lowe#include "SYS.h" 32*5d9d9091SRichard Lowe#include "proc64_id.h" 33*5d9d9091SRichard Lowe 34*5d9d9091SRichard Lowe#define LABEL(s) .strcmp##s 35*5d9d9091SRichard Lowe 36*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP 37*5d9d9091SRichard Lowe /* 38*5d9d9091SRichard Lowe * Since the counter, %r11, is unsigned, we branch to strcmp_exitz 39*5d9d9091SRichard Lowe * if the new counter > the old one or is 0. 40*5d9d9091SRichard Lowe */ 41*5d9d9091SRichard Lowe#define UPDATE_STRNCMP_COUNTER \ 42*5d9d9091SRichard Lowe /* calculate left number to compare */ \ 43*5d9d9091SRichard Lowe lea -16(%rcx, %r11), %r9; \ 44*5d9d9091SRichard Lowe cmp %r9, %r11; \ 45*5d9d9091SRichard Lowe jb LABEL(strcmp_exitz); \ 46*5d9d9091SRichard Lowe test %r9, %r9; \ 47*5d9d9091SRichard Lowe je LABEL(strcmp_exitz); \ 48*5d9d9091SRichard Lowe mov %r9, %r11 49*5d9d9091SRichard Lowe#else 50*5d9d9091SRichard Lowe#define UPDATE_STRNCMP_COUNTER 51*5d9d9091SRichard Lowe#endif 52*5d9d9091SRichard Lowe 53*5d9d9091SRichard Lowe /* 54*5d9d9091SRichard Lowe * This implementation uses SSE to compare up to 16 bytes at a time. 
55*5d9d9091SRichard Lowe */ 56*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP 57*5d9d9091SRichard Lowe ENTRY(strncmp) 58*5d9d9091SRichard Lowe test %rdx, %rdx 59*5d9d9091SRichard Lowe je LABEL(strcmp_exitz) 60*5d9d9091SRichard Lowe mov %rdx, %r11 61*5d9d9091SRichard Lowe#else 62*5d9d9091SRichard Lowe ENTRY(strcmp) /* (const char *, const char *) */ 63*5d9d9091SRichard Lowe#endif 64*5d9d9091SRichard Lowe mov %esi, %ecx 65*5d9d9091SRichard Lowe mov %edi, %eax 66*5d9d9091SRichard Lowe and $0x3f, %rcx /* rsi alignment in cache line */ 67*5d9d9091SRichard Lowe and $0x3f, %rax /* rdi alignment in cache line */ 68*5d9d9091SRichard Lowe cmp $0x30, %ecx 69*5d9d9091SRichard Lowe ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ 70*5d9d9091SRichard Lowe cmp $0x30, %eax 71*5d9d9091SRichard Lowe ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */ 72*5d9d9091SRichard Lowe movlpd (%rdi), %xmm1 73*5d9d9091SRichard Lowe movlpd (%rsi), %xmm2 74*5d9d9091SRichard Lowe movhpd 8(%rdi), %xmm1 75*5d9d9091SRichard Lowe movhpd 8(%rsi), %xmm2 76*5d9d9091SRichard Lowe pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ 77*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ 78*5d9d9091SRichard Lowe pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ 79*5d9d9091SRichard Lowe psubb %xmm0, %xmm1 /* packed sub of comparison results*/ 80*5d9d9091SRichard Lowe pmovmskb %xmm1, %edx 81*5d9d9091SRichard Lowe sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ 82*5d9d9091SRichard Lowe jnz LABEL(less16bytes) /* If not, found mismatch or null char */ 83*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP 84*5d9d9091SRichard Lowe sub $16, %r11 85*5d9d9091SRichard Lowe jbe LABEL(strcmp_exitz) /* finish comparision */ 86*5d9d9091SRichard Lowe#endif 87*5d9d9091SRichard Lowe add $16, %rsi /* prepare to search next 16 bytes */ 88*5d9d9091SRichard Lowe add $16, %rdi /* prepare to search next 16 bytes */ 89*5d9d9091SRichard Lowe 90*5d9d9091SRichard Lowe /* 91*5d9d9091SRichard Lowe * Determine rdi and rsi string offsets from 16-byte alignment. 92*5d9d9091SRichard Lowe * Use relative offset difference between the two to determine which case 93*5d9d9091SRichard Lowe * below to use. 
94*5d9d9091SRichard Lowe */ 95*5d9d9091SRichard Lowe .p2align 4 96*5d9d9091SRichard LoweLABEL(crosscache): 97*5d9d9091SRichard Lowe and $0xfffffffffffffff0, %rsi /* force %rsi to be 16 byte aligned */ 98*5d9d9091SRichard Lowe and $0xfffffffffffffff0, %rdi /* force %rdi to be 16 byte aligned */ 99*5d9d9091SRichard Lowe mov $0xffff, %edx /* for equivalent offset */ 100*5d9d9091SRichard Lowe xor %r8d, %r8d 101*5d9d9091SRichard Lowe and $0xf, %ecx /* offset of rsi */ 102*5d9d9091SRichard Lowe and $0xf, %eax /* offset of rdi */ 103*5d9d9091SRichard Lowe cmp %eax, %ecx 104*5d9d9091SRichard Lowe je LABEL(ashr_0) /* both strings have the same alignment */ 105*5d9d9091SRichard Lowe ja LABEL(bigger) 106*5d9d9091SRichard Lowe mov %edx, %r8d /* r8d is offset flag for exit tail */ 107*5d9d9091SRichard Lowe xchg %ecx, %eax 108*5d9d9091SRichard Lowe xchg %rsi, %rdi 109*5d9d9091SRichard LoweLABEL(bigger): 110*5d9d9091SRichard Lowe mov %rcx, %r9 111*5d9d9091SRichard Lowe sub %rax, %r9 112*5d9d9091SRichard Lowe lea LABEL(unaligned_table)(%rip), %r10 113*5d9d9091SRichard Lowe movslq (%r10, %r9, 4), %r9 114*5d9d9091SRichard Lowe lea (%r10, %r9), %r10 115*5d9d9091SRichard Lowe jmp *%r10 /* jump to corresponding case */ 116*5d9d9091SRichard Lowe 117*5d9d9091SRichard Lowe/* 118*5d9d9091SRichard Lowe * ashr_0 handles the following cases: 119*5d9d9091SRichard Lowe * str1 offset = str2 offset 120*5d9d9091SRichard Lowe */ 121*5d9d9091SRichard Lowe .p2align 4 122*5d9d9091SRichard LoweLABEL(ashr_0): 123*5d9d9091SRichard Lowe movdqa (%rsi), %xmm1 124*5d9d9091SRichard Lowe pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ 125*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ 126*5d9d9091SRichard Lowe pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ 127*5d9d9091SRichard Lowe psubb %xmm0, %xmm1 /* packed sub of comparison results*/ 128*5d9d9091SRichard Lowe pmovmskb %xmm1, %r9d 129*5d9d9091SRichard Lowe shr %cl, %edx /* adjust 0xffff for offset */ 130*5d9d9091SRichard Lowe shr %cl, %r9d /* adjust for 16-byte offset */ 131*5d9d9091SRichard Lowe sub %r9d, %edx 132*5d9d9091SRichard Lowe /* 133*5d9d9091SRichard Lowe * edx must be the same with r9d if in left byte (16-rcx) is equal to 134*5d9d9091SRichard Lowe * the start from (16-rax) and no null char was seen. 135*5d9d9091SRichard Lowe */ 136*5d9d9091SRichard Lowe jne LABEL(less32bytes) /* mismatch or null char */ 137*5d9d9091SRichard Lowe UPDATE_STRNCMP_COUNTER 138*5d9d9091SRichard Lowe mov $16, %rcx 139*5d9d9091SRichard Lowe mov $16, %r9 140*5d9d9091SRichard Lowe pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */ 141*5d9d9091SRichard Lowe 142*5d9d9091SRichard Lowe /* 143*5d9d9091SRichard Lowe * Now both strings are aligned at 16-byte boundary. Loop over strings 144*5d9d9091SRichard Lowe * checking 32-bytes per iteration. 
145*5d9d9091SRichard Lowe */ 146*5d9d9091SRichard Lowe .p2align 4 147*5d9d9091SRichard LoweLABEL(loop_ashr_0): 148*5d9d9091SRichard Lowe movdqa (%rsi, %rcx), %xmm1 149*5d9d9091SRichard Lowe movdqa (%rdi, %rcx), %xmm2 150*5d9d9091SRichard Lowe 151*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm0 152*5d9d9091SRichard Lowe pcmpeqb %xmm2, %xmm1 153*5d9d9091SRichard Lowe psubb %xmm0, %xmm1 154*5d9d9091SRichard Lowe pmovmskb %xmm1, %edx 155*5d9d9091SRichard Lowe sub $0xffff, %edx 156*5d9d9091SRichard Lowe jnz LABEL(exit) /* mismatch or null char seen */ 157*5d9d9091SRichard Lowe 158*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP 159*5d9d9091SRichard Lowe sub $16, %r11 160*5d9d9091SRichard Lowe jbe LABEL(strcmp_exitz) 161*5d9d9091SRichard Lowe#endif 162*5d9d9091SRichard Lowe add $16, %rcx 163*5d9d9091SRichard Lowe movdqa (%rsi, %rcx), %xmm1 164*5d9d9091SRichard Lowe movdqa (%rdi, %rcx), %xmm2 165*5d9d9091SRichard Lowe 166*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm0 167*5d9d9091SRichard Lowe pcmpeqb %xmm2, %xmm1 168*5d9d9091SRichard Lowe psubb %xmm0, %xmm1 169*5d9d9091SRichard Lowe pmovmskb %xmm1, %edx 170*5d9d9091SRichard Lowe sub $0xffff, %edx 171*5d9d9091SRichard Lowe jnz LABEL(exit) 172*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP 173*5d9d9091SRichard Lowe sub $16, %r11 174*5d9d9091SRichard Lowe jbe LABEL(strcmp_exitz) 175*5d9d9091SRichard Lowe#endif 176*5d9d9091SRichard Lowe add $16, %rcx 177*5d9d9091SRichard Lowe jmp LABEL(loop_ashr_0) 178*5d9d9091SRichard Lowe 179*5d9d9091SRichard Lowe/* 180*5d9d9091SRichard Lowe * ashr_1 handles the following cases: 181*5d9d9091SRichard Lowe * abs(str1 offset - str2 offset) = 15 182*5d9d9091SRichard Lowe */ 183*5d9d9091SRichard Lowe .p2align 4 184*5d9d9091SRichard LoweLABEL(ashr_1): 185*5d9d9091SRichard Lowe pxor %xmm0, %xmm0 186*5d9d9091SRichard Lowe movdqa (%rdi), %xmm2 187*5d9d9091SRichard Lowe movdqa (%rsi), %xmm1 188*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ 189*5d9d9091SRichard Lowe pslldq $15, %xmm2 /* shift first string to align with second */ 190*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ 191*5d9d9091SRichard Lowe psubb %xmm0, %xmm2 /* packed sub of comparison results*/ 192*5d9d9091SRichard Lowe pmovmskb %xmm2, %r9d 193*5d9d9091SRichard Lowe shr %cl, %edx /* adjust 0xffff for offset */ 194*5d9d9091SRichard Lowe shr %cl, %r9d /* adjust for 16-byte offset */ 195*5d9d9091SRichard Lowe sub %r9d, %edx 196*5d9d9091SRichard Lowe jnz LABEL(less32bytes) /* mismatch or null char seen */ 197*5d9d9091SRichard Lowe movdqa (%rdi), %xmm3 198*5d9d9091SRichard Lowe UPDATE_STRNCMP_COUNTER 199*5d9d9091SRichard Lowe 200*5d9d9091SRichard Lowe pxor %xmm0, %xmm0 201*5d9d9091SRichard Lowe mov $16, %rcx /* index for loads */ 202*5d9d9091SRichard Lowe mov $1, %r9d /* rdi bytes already examined. Used in exit code */ 203*5d9d9091SRichard Lowe /* 204*5d9d9091SRichard Lowe * Setup %r10 value allows us to detect crossing a page boundary. 205*5d9d9091SRichard Lowe * When %r10 goes positive we are crossing a page boundary and 206*5d9d9091SRichard Lowe * need to do a nibble. 
207*5d9d9091SRichard Lowe */ 208*5d9d9091SRichard Lowe lea 1(%rdi), %r10 209*5d9d9091SRichard Lowe and $0xfff, %r10 /* offset into 4K page */ 210*5d9d9091SRichard Lowe sub $0x1000, %r10 /* subtract 4K pagesize */ 211*5d9d9091SRichard Lowe movdqa %xmm3, %xmm4 212*5d9d9091SRichard Lowe 213*5d9d9091SRichard Lowe .p2align 4 214*5d9d9091SRichard LoweLABEL(loop_ashr_1): 215*5d9d9091SRichard Lowe add $16, %r10 216*5d9d9091SRichard Lowe jg LABEL(nibble_ashr_1) /* cross page boundary */ 217*5d9d9091SRichard Lowe 218*5d9d9091SRichard LoweLABEL(gobble_ashr_1): 219*5d9d9091SRichard Lowe movdqa (%rsi, %rcx), %xmm1 220*5d9d9091SRichard Lowe movdqa (%rdi, %rcx), %xmm2 221*5d9d9091SRichard Lowe movdqa %xmm2, %xmm4 /* store for next cycle */ 222*5d9d9091SRichard Lowe 223*5d9d9091SRichard Lowe psrldq $1, %xmm3 224*5d9d9091SRichard Lowe pslldq $15, %xmm2 225*5d9d9091SRichard Lowe por %xmm3, %xmm2 /* merge into one 16byte value */ 226*5d9d9091SRichard Lowe 227*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm0 228*5d9d9091SRichard Lowe pcmpeqb %xmm2, %xmm1 229*5d9d9091SRichard Lowe psubb %xmm0, %xmm1 230*5d9d9091SRichard Lowe pmovmskb %xmm1, %edx 231*5d9d9091SRichard Lowe sub $0xffff, %edx 232*5d9d9091SRichard Lowe jnz LABEL(exit) 233*5d9d9091SRichard Lowe 234*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP 235*5d9d9091SRichard Lowe sub $16, %r11 236*5d9d9091SRichard Lowe jbe LABEL(strcmp_exitz) 237*5d9d9091SRichard Lowe#endif 238*5d9d9091SRichard Lowe add $16, %rcx 239*5d9d9091SRichard Lowe movdqa %xmm4, %xmm3 240*5d9d9091SRichard Lowe 241*5d9d9091SRichard Lowe add $16, %r10 242*5d9d9091SRichard Lowe jg LABEL(nibble_ashr_1) /* cross page boundary */ 243*5d9d9091SRichard Lowe 244*5d9d9091SRichard Lowe movdqa (%rsi, %rcx), %xmm1 245*5d9d9091SRichard Lowe movdqa (%rdi, %rcx), %xmm2 246*5d9d9091SRichard Lowe movdqa %xmm2, %xmm4 /* store for next cycle */ 247*5d9d9091SRichard Lowe 248*5d9d9091SRichard Lowe psrldq $1, %xmm3 249*5d9d9091SRichard Lowe pslldq $15, %xmm2 250*5d9d9091SRichard Lowe por 
%xmm3, %xmm2 /* merge into one 16byte value */ 251*5d9d9091SRichard Lowe 252*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm0 253*5d9d9091SRichard Lowe pcmpeqb %xmm2, %xmm1 254*5d9d9091SRichard Lowe psubb %xmm0, %xmm1 255*5d9d9091SRichard Lowe pmovmskb %xmm1, %edx 256*5d9d9091SRichard Lowe sub $0xffff, %edx 257*5d9d9091SRichard Lowe jnz LABEL(exit) 258*5d9d9091SRichard Lowe 259*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP 260*5d9d9091SRichard Lowe sub $16, %r11 261*5d9d9091SRichard Lowe jbe LABEL(strcmp_exitz) 262*5d9d9091SRichard Lowe#endif 263*5d9d9091SRichard Lowe add $16, %rcx 264*5d9d9091SRichard Lowe movdqa %xmm4, %xmm3 265*5d9d9091SRichard Lowe jmp LABEL(loop_ashr_1) 266*5d9d9091SRichard Lowe 267*5d9d9091SRichard Lowe /* 268*5d9d9091SRichard Lowe * Nibble avoids loads across page boundary. This is to avoid a potential 269*5d9d9091SRichard Lowe * access into unmapped memory. 270*5d9d9091SRichard Lowe */ 271*5d9d9091SRichard Lowe .p2align 4 272*5d9d9091SRichard LoweLABEL(nibble_ashr_1): 273*5d9d9091SRichard Lowe psrldq $1, %xmm4 274*5d9d9091SRichard Lowe movdqa (%rsi, %rcx), %xmm1 275*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm0 276*5d9d9091SRichard Lowe pcmpeqb %xmm4, %xmm1 277*5d9d9091SRichard Lowe psubb %xmm0, %xmm1 278*5d9d9091SRichard Lowe pmovmskb %xmm1, %edx 279*5d9d9091SRichard Lowe sub $0x7fff, %edx 280*5d9d9091SRichard Lowe jnz LABEL(exit) 281*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP 282*5d9d9091SRichard Lowe cmp $15, %r11 283*5d9d9091SRichard Lowe jbe LABEL(strcmp_exitz) 284*5d9d9091SRichard Lowe#endif 285*5d9d9091SRichard Lowe pxor %xmm0, %xmm0 286*5d9d9091SRichard Lowe sub $0x1000, %r10 /* subtract 4K from %r10 */ 287*5d9d9091SRichard Lowe jmp LABEL(gobble_ashr_1) 288*5d9d9091SRichard Lowe 289*5d9d9091SRichard Lowe/* 290*5d9d9091SRichard Lowe * ashr_2 handles the following cases: 291*5d9d9091SRichard Lowe * abs(str1 offset - str2 offset) = 14 292*5d9d9091SRichard Lowe */ 293*5d9d9091SRichard Lowe .p2align 4 294*5d9d9091SRichard LoweLABEL(ashr_2): 
295*5d9d9091SRichard Lowe pxor %xmm0, %xmm0 296*5d9d9091SRichard Lowe movdqa (%rdi), %xmm2 297*5d9d9091SRichard Lowe movdqa (%rsi), %xmm1 298*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm0 299*5d9d9091SRichard Lowe pslldq $14, %xmm2 300*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm2 301*5d9d9091SRichard Lowe psubb %xmm0, %xmm2 302*5d9d9091SRichard Lowe pmovmskb %xmm2, %r9d 303*5d9d9091SRichard Lowe shr %cl, %edx 304*5d9d9091SRichard Lowe shr %cl, %r9d 305*5d9d9091SRichard Lowe sub %r9d, %edx 306*5d9d9091SRichard Lowe jnz LABEL(less32bytes) 307*5d9d9091SRichard Lowe movdqa (%rdi), %xmm3 308*5d9d9091SRichard Lowe UPDATE_STRNCMP_COUNTER 309*5d9d9091SRichard Lowe 310*5d9d9091SRichard Lowe pxor %xmm0, %xmm0 311*5d9d9091SRichard Lowe mov $16, %rcx /* index for loads */ 312*5d9d9091SRichard Lowe mov $2, %r9d /* rdi bytes already examined. Used in exit code */ 313*5d9d9091SRichard Lowe /* 314*5d9d9091SRichard Lowe * Setup %r10 value allows us to detect crossing a page boundary. 315*5d9d9091SRichard Lowe * When %r10 goes positive we are crossing a page boundary and 316*5d9d9091SRichard Lowe * need to do a nibble. 
317*5d9d9091SRichard Lowe */ 318*5d9d9091SRichard Lowe lea 2(%rdi), %r10 319*5d9d9091SRichard Lowe and $0xfff, %r10 /* offset into 4K page */ 320*5d9d9091SRichard Lowe sub $0x1000, %r10 /* subtract 4K pagesize */ 321*5d9d9091SRichard Lowe movdqa %xmm3, %xmm4 322*5d9d9091SRichard Lowe 323*5d9d9091SRichard Lowe .p2align 4 324*5d9d9091SRichard LoweLABEL(loop_ashr_2): 325*5d9d9091SRichard Lowe add $16, %r10 326*5d9d9091SRichard Lowe jg LABEL(nibble_ashr_2) 327*5d9d9091SRichard Lowe 328*5d9d9091SRichard LoweLABEL(gobble_ashr_2): 329*5d9d9091SRichard Lowe movdqa (%rsi, %rcx), %xmm1 330*5d9d9091SRichard Lowe movdqa (%rdi, %rcx), %xmm2 331*5d9d9091SRichard Lowe movdqa %xmm2, %xmm4 332*5d9d9091SRichard Lowe 333*5d9d9091SRichard Lowe psrldq $2, %xmm3 334*5d9d9091SRichard Lowe pslldq $14, %xmm2 335*5d9d9091SRichard Lowe por %xmm3, %xmm2 336*5d9d9091SRichard Lowe 337*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm0 338*5d9d9091SRichard Lowe pcmpeqb %xmm2, %xmm1 339*5d9d9091SRichard Lowe psubb %xmm0, %xmm1 340*5d9d9091SRichard Lowe pmovmskb %xmm1, %edx 341*5d9d9091SRichard Lowe sub $0xffff, %edx 342*5d9d9091SRichard Lowe jnz LABEL(exit) 343*5d9d9091SRichard Lowe 344*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP 345*5d9d9091SRichard Lowe sub $16, %r11 346*5d9d9091SRichard Lowe jbe LABEL(strcmp_exitz) 347*5d9d9091SRichard Lowe#endif 348*5d9d9091SRichard Lowe 349*5d9d9091SRichard Lowe add $16, %rcx 350*5d9d9091SRichard Lowe movdqa %xmm4, %xmm3 351*5d9d9091SRichard Lowe 352*5d9d9091SRichard Lowe add $16, %r10 353*5d9d9091SRichard Lowe jg LABEL(nibble_ashr_2) /* cross page boundary */ 354*5d9d9091SRichard Lowe 355*5d9d9091SRichard Lowe movdqa (%rsi, %rcx), %xmm1 356*5d9d9091SRichard Lowe movdqa (%rdi, %rcx), %xmm2 357*5d9d9091SRichard Lowe movdqa %xmm2, %xmm4 358*5d9d9091SRichard Lowe 359*5d9d9091SRichard Lowe psrldq $2, %xmm3 360*5d9d9091SRichard Lowe pslldq $14, %xmm2 361*5d9d9091SRichard Lowe por %xmm3, %xmm2 362*5d9d9091SRichard Lowe 363*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm0 
364*5d9d9091SRichard Lowe pcmpeqb %xmm2, %xmm1 365*5d9d9091SRichard Lowe psubb %xmm0, %xmm1 366*5d9d9091SRichard Lowe pmovmskb %xmm1, %edx 367*5d9d9091SRichard Lowe sub $0xffff, %edx 368*5d9d9091SRichard Lowe jnz LABEL(exit) 369*5d9d9091SRichard Lowe 370*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP 371*5d9d9091SRichard Lowe sub $16, %r11 372*5d9d9091SRichard Lowe jbe LABEL(strcmp_exitz) 373*5d9d9091SRichard Lowe#endif 374*5d9d9091SRichard Lowe 375*5d9d9091SRichard Lowe add $16, %rcx 376*5d9d9091SRichard Lowe movdqa %xmm4, %xmm3 377*5d9d9091SRichard Lowe jmp LABEL(loop_ashr_2) 378*5d9d9091SRichard Lowe 379*5d9d9091SRichard Lowe .p2align 4 380*5d9d9091SRichard LoweLABEL(nibble_ashr_2): 381*5d9d9091SRichard Lowe psrldq $2, %xmm4 382*5d9d9091SRichard Lowe movdqa (%rsi, %rcx), %xmm1 383*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm0 384*5d9d9091SRichard Lowe pcmpeqb %xmm4, %xmm1 385*5d9d9091SRichard Lowe psubb %xmm0, %xmm1 386*5d9d9091SRichard Lowe pmovmskb %xmm1, %edx 387*5d9d9091SRichard Lowe sub $0x3fff, %edx 388*5d9d9091SRichard Lowe jnz LABEL(exit) 389*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP 390*5d9d9091SRichard Lowe cmp $14, %r11 391*5d9d9091SRichard Lowe jbe LABEL(strcmp_exitz) 392*5d9d9091SRichard Lowe#endif 393*5d9d9091SRichard Lowe pxor %xmm0, %xmm0 394*5d9d9091SRichard Lowe sub $0x1000, %r10 /* subtract 4K from %r10 */ 395*5d9d9091SRichard Lowe jmp LABEL(gobble_ashr_2) 396*5d9d9091SRichard Lowe 397*5d9d9091SRichard Lowe/* 398*5d9d9091SRichard Lowe * ashr_3 handles the following cases: 399*5d9d9091SRichard Lowe * abs(str1 offset - str2 offset) = 13 400*5d9d9091SRichard Lowe */ 401*5d9d9091SRichard Lowe .p2align 4 402*5d9d9091SRichard LoweLABEL(ashr_3): 403*5d9d9091SRichard Lowe pxor %xmm0, %xmm0 404*5d9d9091SRichard Lowe movdqa (%rdi), %xmm2 405*5d9d9091SRichard Lowe movdqa (%rsi), %xmm1 406*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm0 407*5d9d9091SRichard Lowe pslldq $13, %xmm2 408*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm2 409*5d9d9091SRichard Lowe psubb %xmm0, 
%xmm2 410*5d9d9091SRichard Lowe pmovmskb %xmm2, %r9d 411*5d9d9091SRichard Lowe shr %cl, %edx 412*5d9d9091SRichard Lowe shr %cl, %r9d 413*5d9d9091SRichard Lowe sub %r9d, %edx 414*5d9d9091SRichard Lowe jnz LABEL(less32bytes) 415*5d9d9091SRichard Lowe movdqa (%rdi), %xmm3 416*5d9d9091SRichard Lowe 417*5d9d9091SRichard Lowe UPDATE_STRNCMP_COUNTER 418*5d9d9091SRichard Lowe 419*5d9d9091SRichard Lowe pxor %xmm0, %xmm0 420*5d9d9091SRichard Lowe mov $16, %rcx /* index for loads */ 421*5d9d9091SRichard Lowe mov $3, %r9d /* rdi bytes already examined. Used in exit code */ 422*5d9d9091SRichard Lowe /* 423*5d9d9091SRichard Lowe * Setup %r10 value allows us to detect crossing a page boundary. 424*5d9d9091SRichard Lowe * When %r10 goes positive we are crossing a page boundary and 425*5d9d9091SRichard Lowe * need to do a nibble. 426*5d9d9091SRichard Lowe */ 427*5d9d9091SRichard Lowe lea 3(%rdi), %r10 428*5d9d9091SRichard Lowe and $0xfff, %r10 /* offset into 4K page */ 429*5d9d9091SRichard Lowe sub $0x1000, %r10 /* subtract 4K pagesize */ 430*5d9d9091SRichard Lowe movdqa %xmm3, %xmm4 431*5d9d9091SRichard Lowe 432*5d9d9091SRichard Lowe .p2align 4 433*5d9d9091SRichard LoweLABEL(loop_ashr_3): 434*5d9d9091SRichard Lowe add $16, %r10 435*5d9d9091SRichard Lowe jg LABEL(nibble_ashr_3) 436*5d9d9091SRichard Lowe 437*5d9d9091SRichard LoweLABEL(gobble_ashr_3): 438*5d9d9091SRichard Lowe movdqa (%rsi, %rcx), %xmm1 439*5d9d9091SRichard Lowe movdqa (%rdi, %rcx), %xmm2 440*5d9d9091SRichard Lowe movdqa %xmm2, %xmm4 441*5d9d9091SRichard Lowe 442*5d9d9091SRichard Lowe psrldq $3, %xmm3 443*5d9d9091SRichard Lowe pslldq $13, %xmm2 444*5d9d9091SRichard Lowe por %xmm3, %xmm2 445*5d9d9091SRichard Lowe 446*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm0 447*5d9d9091SRichard Lowe pcmpeqb %xmm2, %xmm1 448*5d9d9091SRichard Lowe psubb %xmm0, %xmm1 449*5d9d9091SRichard Lowe pmovmskb %xmm1, %edx 450*5d9d9091SRichard Lowe sub $0xffff, %edx 451*5d9d9091SRichard Lowe jnz LABEL(exit) 452*5d9d9091SRichard Lowe 
453*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP 454*5d9d9091SRichard Lowe sub $16, %r11 455*5d9d9091SRichard Lowe jbe LABEL(strcmp_exitz) 456*5d9d9091SRichard Lowe#endif 457*5d9d9091SRichard Lowe 458*5d9d9091SRichard Lowe add $16, %rcx 459*5d9d9091SRichard Lowe movdqa %xmm4, %xmm3 460*5d9d9091SRichard Lowe 461*5d9d9091SRichard Lowe add $16, %r10 462*5d9d9091SRichard Lowe jg LABEL(nibble_ashr_3) /* cross page boundary */ 463*5d9d9091SRichard Lowe 464*5d9d9091SRichard Lowe movdqa (%rsi, %rcx), %xmm1 465*5d9d9091SRichard Lowe movdqa (%rdi, %rcx), %xmm2 466*5d9d9091SRichard Lowe movdqa %xmm2, %xmm4 467*5d9d9091SRichard Lowe 468*5d9d9091SRichard Lowe psrldq $3, %xmm3 469*5d9d9091SRichard Lowe pslldq $13, %xmm2 470*5d9d9091SRichard Lowe por %xmm3, %xmm2 471*5d9d9091SRichard Lowe 472*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm0 473*5d9d9091SRichard Lowe pcmpeqb %xmm2, %xmm1 474*5d9d9091SRichard Lowe psubb %xmm0, %xmm1 475*5d9d9091SRichard Lowe pmovmskb %xmm1, %edx 476*5d9d9091SRichard Lowe sub $0xffff, %edx 477*5d9d9091SRichard Lowe jnz LABEL(exit) 478*5d9d9091SRichard Lowe 479*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP 480*5d9d9091SRichard Lowe sub $16, %r11 481*5d9d9091SRichard Lowe jbe LABEL(strcmp_exitz) 482*5d9d9091SRichard Lowe#endif 483*5d9d9091SRichard Lowe 484*5d9d9091SRichard Lowe add $16, %rcx 485*5d9d9091SRichard Lowe movdqa %xmm4, %xmm3 486*5d9d9091SRichard Lowe jmp LABEL(loop_ashr_3) 487*5d9d9091SRichard Lowe 488*5d9d9091SRichard Lowe .p2align 4 489*5d9d9091SRichard LoweLABEL(nibble_ashr_3): 490*5d9d9091SRichard Lowe psrldq $3, %xmm4 491*5d9d9091SRichard Lowe movdqa (%rsi, %rcx), %xmm1 492*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm0 493*5d9d9091SRichard Lowe pcmpeqb %xmm4, %xmm1 494*5d9d9091SRichard Lowe psubb %xmm0, %xmm1 495*5d9d9091SRichard Lowe pmovmskb %xmm1, %edx 496*5d9d9091SRichard Lowe sub $0x1fff, %edx 497*5d9d9091SRichard Lowe jnz LABEL(exit) 498*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP 499*5d9d9091SRichard Lowe cmp $13, %r11 500*5d9d9091SRichard 
Lowe jbe LABEL(strcmp_exitz) 501*5d9d9091SRichard Lowe#endif 502*5d9d9091SRichard Lowe pxor %xmm0, %xmm0 503*5d9d9091SRichard Lowe sub $0x1000, %r10 /* subtract 4K from %r10 */ 504*5d9d9091SRichard Lowe jmp LABEL(gobble_ashr_3) 505*5d9d9091SRichard Lowe 506*5d9d9091SRichard Lowe/* 507*5d9d9091SRichard Lowe * ashr_4 handles the following cases: 508*5d9d9091SRichard Lowe * abs(str1 offset - str2 offset) = 12 509*5d9d9091SRichard Lowe */ 510*5d9d9091SRichard Lowe .p2align 4 511*5d9d9091SRichard LoweLABEL(ashr_4): 512*5d9d9091SRichard Lowe pxor %xmm0, %xmm0 513*5d9d9091SRichard Lowe movdqa (%rdi), %xmm2 514*5d9d9091SRichard Lowe movdqa (%rsi), %xmm1 515*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm0 516*5d9d9091SRichard Lowe pslldq $12, %xmm2 517*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm2 518*5d9d9091SRichard Lowe psubb %xmm0, %xmm2 519*5d9d9091SRichard Lowe pmovmskb %xmm2, %r9d 520*5d9d9091SRichard Lowe shr %cl, %edx 521*5d9d9091SRichard Lowe shr %cl, %r9d 522*5d9d9091SRichard Lowe sub %r9d, %edx 523*5d9d9091SRichard Lowe jnz LABEL(less32bytes) 524*5d9d9091SRichard Lowe movdqa (%rdi), %xmm3 525*5d9d9091SRichard Lowe 526*5d9d9091SRichard Lowe UPDATE_STRNCMP_COUNTER 527*5d9d9091SRichard Lowe 528*5d9d9091SRichard Lowe pxor %xmm0, %xmm0 529*5d9d9091SRichard Lowe mov $16, %rcx /* index for loads */ 530*5d9d9091SRichard Lowe mov $4, %r9d /* rdi bytes already examined. Used in exit code */ 531*5d9d9091SRichard Lowe /* 532*5d9d9091SRichard Lowe * Setup %r10 value allows us to detect crossing a page boundary. 533*5d9d9091SRichard Lowe * When %r10 goes positive we are crossing a page boundary and 534*5d9d9091SRichard Lowe * need to do a nibble. 
535*5d9d9091SRichard Lowe */ 536*5d9d9091SRichard Lowe lea 4(%rdi), %r10 537*5d9d9091SRichard Lowe and $0xfff, %r10 /* offset into 4K page */ 538*5d9d9091SRichard Lowe sub $0x1000, %r10 /* subtract 4K pagesize */ 539*5d9d9091SRichard Lowe movdqa %xmm3, %xmm4 540*5d9d9091SRichard Lowe 541*5d9d9091SRichard Lowe .p2align 4 542*5d9d9091SRichard LoweLABEL(loop_ashr_4): 543*5d9d9091SRichard Lowe add $16, %r10 544*5d9d9091SRichard Lowe jg LABEL(nibble_ashr_4) 545*5d9d9091SRichard Lowe 546*5d9d9091SRichard LoweLABEL(gobble_ashr_4): 547*5d9d9091SRichard Lowe movdqa (%rsi, %rcx), %xmm1 548*5d9d9091SRichard Lowe movdqa (%rdi, %rcx), %xmm2 549*5d9d9091SRichard Lowe movdqa %xmm2, %xmm4 550*5d9d9091SRichard Lowe 551*5d9d9091SRichard Lowe psrldq $4, %xmm3 552*5d9d9091SRichard Lowe pslldq $12, %xmm2 553*5d9d9091SRichard Lowe por %xmm3, %xmm2 554*5d9d9091SRichard Lowe 555*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm0 556*5d9d9091SRichard Lowe pcmpeqb %xmm2, %xmm1 557*5d9d9091SRichard Lowe psubb %xmm0, %xmm1 558*5d9d9091SRichard Lowe pmovmskb %xmm1, %edx 559*5d9d9091SRichard Lowe sub $0xffff, %edx 560*5d9d9091SRichard Lowe jnz LABEL(exit) 561*5d9d9091SRichard Lowe 562*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP 563*5d9d9091SRichard Lowe sub $16, %r11 564*5d9d9091SRichard Lowe jbe LABEL(strcmp_exitz) 565*5d9d9091SRichard Lowe#endif 566*5d9d9091SRichard Lowe 567*5d9d9091SRichard Lowe add $16, %rcx 568*5d9d9091SRichard Lowe movdqa %xmm4, %xmm3 569*5d9d9091SRichard Lowe 570*5d9d9091SRichard Lowe add $16, %r10 571*5d9d9091SRichard Lowe jg LABEL(nibble_ashr_4) /* cross page boundary */ 572*5d9d9091SRichard Lowe 573*5d9d9091SRichard Lowe movdqa (%rsi, %rcx), %xmm1 574*5d9d9091SRichard Lowe movdqa (%rdi, %rcx), %xmm2 575*5d9d9091SRichard Lowe movdqa %xmm2, %xmm4 576*5d9d9091SRichard Lowe 577*5d9d9091SRichard Lowe psrldq $4, %xmm3 578*5d9d9091SRichard Lowe pslldq $12, %xmm2 579*5d9d9091SRichard Lowe por %xmm3, %xmm2 580*5d9d9091SRichard Lowe 581*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm0 
582*5d9d9091SRichard Lowe pcmpeqb %xmm2, %xmm1 583*5d9d9091SRichard Lowe psubb %xmm0, %xmm1 584*5d9d9091SRichard Lowe pmovmskb %xmm1, %edx 585*5d9d9091SRichard Lowe sub $0xffff, %edx 586*5d9d9091SRichard Lowe jnz LABEL(exit) 587*5d9d9091SRichard Lowe 588*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP 589*5d9d9091SRichard Lowe sub $16, %r11 590*5d9d9091SRichard Lowe jbe LABEL(strcmp_exitz) 591*5d9d9091SRichard Lowe#endif 592*5d9d9091SRichard Lowe 593*5d9d9091SRichard Lowe add $16, %rcx 594*5d9d9091SRichard Lowe movdqa %xmm4, %xmm3 595*5d9d9091SRichard Lowe jmp LABEL(loop_ashr_4) 596*5d9d9091SRichard Lowe 597*5d9d9091SRichard Lowe .p2align 4 598*5d9d9091SRichard LoweLABEL(nibble_ashr_4): 599*5d9d9091SRichard Lowe psrldq $4, %xmm4 600*5d9d9091SRichard Lowe movdqa (%rsi, %rcx), %xmm1 601*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm0 602*5d9d9091SRichard Lowe pcmpeqb %xmm4, %xmm1 603*5d9d9091SRichard Lowe psubb %xmm0, %xmm1 604*5d9d9091SRichard Lowe pmovmskb %xmm1, %edx 605*5d9d9091SRichard Lowe sub $0x0fff, %edx 606*5d9d9091SRichard Lowe jnz LABEL(exit) 607*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP 608*5d9d9091SRichard Lowe cmp $12, %r11 609*5d9d9091SRichard Lowe jbe LABEL(strcmp_exitz) 610*5d9d9091SRichard Lowe#endif 611*5d9d9091SRichard Lowe pxor %xmm0, %xmm0 612*5d9d9091SRichard Lowe sub $0x1000, %r10 /* subtract 4K from %r10 */ 613*5d9d9091SRichard Lowe jmp LABEL(gobble_ashr_4) 614*5d9d9091SRichard Lowe 615*5d9d9091SRichard Lowe/* 616*5d9d9091SRichard Lowe * ashr_5 handles the following cases: 617*5d9d9091SRichard Lowe * abs(str1 offset - str2 offset) = 11 618*5d9d9091SRichard Lowe */ 619*5d9d9091SRichard Lowe .p2align 4 620*5d9d9091SRichard LoweLABEL(ashr_5): 621*5d9d9091SRichard Lowe pxor %xmm0, %xmm0 622*5d9d9091SRichard Lowe movdqa (%rdi), %xmm2 623*5d9d9091SRichard Lowe movdqa (%rsi), %xmm1 624*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm0 625*5d9d9091SRichard Lowe pslldq $11, %xmm2 626*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm2 627*5d9d9091SRichard Lowe psubb %xmm0, 
%xmm2 628*5d9d9091SRichard Lowe pmovmskb %xmm2, %r9d 629*5d9d9091SRichard Lowe shr %cl, %edx 630*5d9d9091SRichard Lowe shr %cl, %r9d 631*5d9d9091SRichard Lowe sub %r9d, %edx 632*5d9d9091SRichard Lowe jnz LABEL(less32bytes) 633*5d9d9091SRichard Lowe movdqa (%rdi), %xmm3 634*5d9d9091SRichard Lowe 635*5d9d9091SRichard Lowe UPDATE_STRNCMP_COUNTER 636*5d9d9091SRichard Lowe 637*5d9d9091SRichard Lowe pxor %xmm0, %xmm0 638*5d9d9091SRichard Lowe mov $16, %rcx /* index for loads */ 639*5d9d9091SRichard Lowe mov $5, %r9d /* rdi bytes already examined. Used in exit code */ 640*5d9d9091SRichard Lowe /* 641*5d9d9091SRichard Lowe * Setup %r10 value allows us to detect crossing a page boundary. 642*5d9d9091SRichard Lowe * When %r10 goes positive we are crossing a page boundary and 643*5d9d9091SRichard Lowe * need to do a nibble. 644*5d9d9091SRichard Lowe */ 645*5d9d9091SRichard Lowe lea 5(%rdi), %r10 646*5d9d9091SRichard Lowe and $0xfff, %r10 /* offset into 4K page */ 647*5d9d9091SRichard Lowe sub $0x1000, %r10 /* subtract 4K pagesize */ 648*5d9d9091SRichard Lowe movdqa %xmm3, %xmm4 649*5d9d9091SRichard Lowe 650*5d9d9091SRichard Lowe .p2align 4 651*5d9d9091SRichard LoweLABEL(loop_ashr_5): 652*5d9d9091SRichard Lowe add $16, %r10 653*5d9d9091SRichard Lowe jg LABEL(nibble_ashr_5) 654*5d9d9091SRichard Lowe 655*5d9d9091SRichard LoweLABEL(gobble_ashr_5): 656*5d9d9091SRichard Lowe movdqa (%rsi, %rcx), %xmm1 657*5d9d9091SRichard Lowe movdqa (%rdi, %rcx), %xmm2 658*5d9d9091SRichard Lowe movdqa %xmm2, %xmm4 659*5d9d9091SRichard Lowe 660*5d9d9091SRichard Lowe psrldq $5, %xmm3 661*5d9d9091SRichard Lowe pslldq $11, %xmm2 662*5d9d9091SRichard Lowe por %xmm3, %xmm2 663*5d9d9091SRichard Lowe 664*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm0 665*5d9d9091SRichard Lowe pcmpeqb %xmm2, %xmm1 666*5d9d9091SRichard Lowe psubb %xmm0, %xmm1 667*5d9d9091SRichard Lowe pmovmskb %xmm1, %edx 668*5d9d9091SRichard Lowe sub $0xffff, %edx 669*5d9d9091SRichard Lowe jnz LABEL(exit) 670*5d9d9091SRichard Lowe 
671*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP 672*5d9d9091SRichard Lowe sub $16, %r11 673*5d9d9091SRichard Lowe jbe LABEL(strcmp_exitz) 674*5d9d9091SRichard Lowe#endif 675*5d9d9091SRichard Lowe 676*5d9d9091SRichard Lowe add $16, %rcx 677*5d9d9091SRichard Lowe movdqa %xmm4, %xmm3 678*5d9d9091SRichard Lowe 679*5d9d9091SRichard Lowe add $16, %r10 680*5d9d9091SRichard Lowe jg LABEL(nibble_ashr_5) /* cross page boundary */ 681*5d9d9091SRichard Lowe 682*5d9d9091SRichard Lowe movdqa (%rsi, %rcx), %xmm1 683*5d9d9091SRichard Lowe movdqa (%rdi, %rcx), %xmm2 684*5d9d9091SRichard Lowe movdqa %xmm2, %xmm4 685*5d9d9091SRichard Lowe 686*5d9d9091SRichard Lowe psrldq $5, %xmm3 687*5d9d9091SRichard Lowe pslldq $11, %xmm2 688*5d9d9091SRichard Lowe por %xmm3, %xmm2 689*5d9d9091SRichard Lowe 690*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm0 691*5d9d9091SRichard Lowe pcmpeqb %xmm2, %xmm1 692*5d9d9091SRichard Lowe psubb %xmm0, %xmm1 693*5d9d9091SRichard Lowe pmovmskb %xmm1, %edx 694*5d9d9091SRichard Lowe sub $0xffff, %edx 695*5d9d9091SRichard Lowe jnz LABEL(exit) 696*5d9d9091SRichard Lowe 697*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP 698*5d9d9091SRichard Lowe sub $16, %r11 699*5d9d9091SRichard Lowe jbe LABEL(strcmp_exitz) 700*5d9d9091SRichard Lowe#endif 701*5d9d9091SRichard Lowe 702*5d9d9091SRichard Lowe add $16, %rcx 703*5d9d9091SRichard Lowe movdqa %xmm4, %xmm3 704*5d9d9091SRichard Lowe jmp LABEL(loop_ashr_5) 705*5d9d9091SRichard Lowe 706*5d9d9091SRichard Lowe .p2align 4 707*5d9d9091SRichard LoweLABEL(nibble_ashr_5): 708*5d9d9091SRichard Lowe psrldq $5, %xmm4 709*5d9d9091SRichard Lowe movdqa (%rsi, %rcx), %xmm1 710*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm0 711*5d9d9091SRichard Lowe pcmpeqb %xmm4, %xmm1 712*5d9d9091SRichard Lowe psubb %xmm0, %xmm1 713*5d9d9091SRichard Lowe pmovmskb %xmm1, %edx 714*5d9d9091SRichard Lowe sub $0x07ff, %edx 715*5d9d9091SRichard Lowe jnz LABEL(exit) 716*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP 717*5d9d9091SRichard Lowe cmp $11, %r11 718*5d9d9091SRichard 
	jbe	LABEL(strcmp_exitz)
#endif
	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10	/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_5)

/*
 * ashr_6 handles the following cases:
 * abs(str1 offset - str2 offset) = 10
 */
	.p2align 4
LABEL(ashr_6):
	pxor	%xmm0, %xmm0		/* %xmm0 = 0, used to spot NUL bytes */
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$10, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* NOTE(review): %edx mask set by common entry code before this view */
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)	/* mismatch/NUL within the first 16 bytes */
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$6, %r9d	/* rdi bytes already examined. Used in exit code */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we are crossing a page boundary and
	 * need to do a nibble.
	 */
	lea	6(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	movdqa	%xmm3, %xmm4

	.p2align 4
LABEL(loop_ashr_6):
	add	$16, %r10
	jg	LABEL(nibble_ashr_6)

LABEL(gobble_ashr_6):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* save raw chunk for next iteration */

	psrldq	$6, %xmm3
	pslldq	$10, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16-byte value */

	pcmpeqb	%xmm1, %xmm0		/* mark NUL bytes in rsi data */
	pcmpeqb	%xmm2, %xmm1		/* mark equal bytes */
	psubb	%xmm0, %xmm1		/* clear "equal" marks where byte is NUL */
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* nonzero: mismatch or NUL found */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_6)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1	/* second unrolled copy of the loop body */
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$6, %xmm3
	pslldq	$10, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_6)

	.p2align 4
LABEL(nibble_ashr_6):
	psrldq	$6, %xmm4		/* compare only the bytes already loaded */
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x03ff, %edx		/* only low 10 bytes are valid here */
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$10, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10	/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_6)

/*
 * ashr_7 handles the following cases:
 * abs(str1 offset - str2 offset) = 9
 */
	.p2align 4
LABEL(ashr_7):
	pxor	%xmm0, %xmm0		/* %xmm0 = 0, used to spot NUL bytes */
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$9, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* NOTE(review): %edx mask set by common entry code before this view */
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)	/* mismatch/NUL within the first 16 bytes */
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$7, %r9d	/* rdi bytes already examined. Used in exit code */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we are crossing a page boundary and
	 * need to do a nibble.
	 */
	lea	7(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	movdqa	%xmm3, %xmm4

	.p2align 4
LABEL(loop_ashr_7):
	add	$16, %r10
	jg	LABEL(nibble_ashr_7)

LABEL(gobble_ashr_7):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* save raw chunk for next iteration */

	psrldq	$7, %xmm3
	pslldq	$9, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16-byte value */

	pcmpeqb	%xmm1, %xmm0		/* mark NUL bytes in rsi data */
	pcmpeqb	%xmm2, %xmm1		/* mark equal bytes */
	psubb	%xmm0, %xmm1		/* clear "equal" marks where byte is NUL */
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* nonzero: mismatch or NUL found */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_7)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1	/* second unrolled copy of the loop body */
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$7, %xmm3
	pslldq	$9, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_7)

	.p2align 4
LABEL(nibble_ashr_7):
	psrldq	$7, %xmm4		/* compare only the bytes already loaded */
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x01ff, %edx		/* only low 9 bytes are valid here */
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$9, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10	/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_7)

/*
 * ashr_8 handles the following cases:
 * abs(str1 offset - str2 offset) = 8
 */
	.p2align 4
LABEL(ashr_8):
	pxor	%xmm0, %xmm0		/* %xmm0 = 0, used to spot NUL bytes */
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$8, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* NOTE(review): %edx mask set by common entry code before this view */
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)	/* mismatch/NUL within the first 16 bytes */
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$8, %r9d	/* rdi bytes already examined. Used in exit code */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we are crossing a page boundary and
	 * need to do a nibble.
	 */
	lea	8(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	movdqa	%xmm3, %xmm4

	.p2align 4
LABEL(loop_ashr_8):
	add	$16, %r10
	jg	LABEL(nibble_ashr_8)

LABEL(gobble_ashr_8):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* save raw chunk for next iteration */

	psrldq	$8, %xmm3
	pslldq	$8, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16-byte value */

	pcmpeqb	%xmm1, %xmm0		/* mark NUL bytes in rsi data */
	pcmpeqb	%xmm2, %xmm1		/* mark equal bytes */
	psubb	%xmm0, %xmm1		/* clear "equal" marks where byte is NUL */
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* nonzero: mismatch or NUL found */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_8)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1	/* second unrolled copy of the loop body */
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$8, %xmm3
	pslldq	$8, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_8)

	.p2align 4
LABEL(nibble_ashr_8):
	psrldq	$8, %xmm4		/* compare only the bytes already loaded */
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x00ff, %edx		/* only low 8 bytes are valid here */
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$8, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10	/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_8)

/*
 * ashr_9 handles the following cases:
 * abs(str1 offset - str2 offset) = 7
 */
	.p2align 4
LABEL(ashr_9):
	pxor	%xmm0, %xmm0		/* %xmm0 = 0, used to spot NUL bytes */
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$7, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* NOTE(review): %edx mask set by common entry code before this view */
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)	/* mismatch/NUL within the first 16 bytes */
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$9, %r9d	/* rdi bytes already examined. Used in exit code */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we are crossing a page boundary and
	 * need to do a nibble.
	 */
	lea	9(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	movdqa	%xmm3, %xmm4

	.p2align 4
LABEL(loop_ashr_9):
	add	$16, %r10
	jg	LABEL(nibble_ashr_9)

LABEL(gobble_ashr_9):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* save raw chunk for next iteration */

	psrldq	$9, %xmm3
	pslldq	$7, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16-byte value */

	pcmpeqb	%xmm1, %xmm0		/* mark NUL bytes in rsi data */
	pcmpeqb	%xmm2, %xmm1		/* mark equal bytes */
	psubb	%xmm0, %xmm1		/* clear "equal" marks where byte is NUL */
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* nonzero: mismatch or NUL found */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_9)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1	/* second unrolled copy of the loop body */
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$9, %xmm3
	pslldq	$7, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3	/* store for next cycle */
	jmp	LABEL(loop_ashr_9)

	.p2align 4
LABEL(nibble_ashr_9):
	psrldq	$9, %xmm4		/* compare only the bytes already loaded */
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x007f, %edx		/* only low 7 bytes are valid here */
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$7, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10	/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_9)

/*
 * ashr_10 handles the following cases:
 * abs(str1 offset - str2 offset) = 6
 */
	.p2align 4
LABEL(ashr_10):
	pxor	%xmm0, %xmm0		/* %xmm0 = 0, used to spot NUL bytes */
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$6, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* NOTE(review): %edx mask set by common entry code before this view */
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)	/* mismatch/NUL within the first 16 bytes */
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$10, %r9d	/* rdi bytes already examined. Used in exit code */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we are crossing a page boundary and
	 * need to do a nibble.
	 */
	lea	10(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	movdqa	%xmm3, %xmm4

	.p2align 4
LABEL(loop_ashr_10):
	add	$16, %r10
	jg	LABEL(nibble_ashr_10)

LABEL(gobble_ashr_10):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* save raw chunk for next iteration */

	psrldq	$10, %xmm3
	pslldq	$6, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16-byte value */

	pcmpeqb	%xmm1, %xmm0		/* mark NUL bytes in rsi data */
	pcmpeqb	%xmm2, %xmm1		/* mark equal bytes */
	psubb	%xmm0, %xmm1		/* clear "equal" marks where byte is NUL */
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* nonzero: mismatch or NUL found */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_10)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1	/* second unrolled copy of the loop body */
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$10, %xmm3
	pslldq	$6, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_10)

	.p2align 4
LABEL(nibble_ashr_10):
	psrldq	$10, %xmm4		/* compare only the bytes already loaded */
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x003f, %edx		/* only low 6 bytes are valid here */
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$6, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10	/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_10)

/*
 * ashr_11 handles the following cases:
 * abs(str1 offset - str2 offset) = 5
 */
	.p2align 4
LABEL(ashr_11):
	pxor	%xmm0, %xmm0		/* %xmm0 = 0, used to spot NUL bytes */
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$5, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* NOTE(review): %edx mask set by common entry code before this view */
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)	/* mismatch/NUL within the first 16 bytes */
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$11, %r9d	/* rdi bytes already examined. Used in exit code */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we are crossing a page boundary and
	 * need to do a nibble.
	 */
	lea	11(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	movdqa	%xmm3, %xmm4

	.p2align 4
LABEL(loop_ashr_11):
	add	$16, %r10
	jg	LABEL(nibble_ashr_11)

LABEL(gobble_ashr_11):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* save raw chunk for next iteration */

	psrldq	$11, %xmm3
	pslldq	$5, %xmm2
	por	%xmm3, %xmm2		/* merge into one 16-byte value */

	pcmpeqb	%xmm1, %xmm0		/* mark NUL bytes in rsi data */
	pcmpeqb	%xmm2, %xmm1		/* mark equal bytes */
	psubb	%xmm0, %xmm1		/* clear "equal" marks where byte is NUL */
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* nonzero: mismatch or NUL found */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_11)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1	/* second unrolled copy of the loop body */
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$11, %xmm3
	pslldq	$5, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_11)

	.p2align 4
LABEL(nibble_ashr_11):
	psrldq	$11, %xmm4		/* compare only the bytes already loaded */
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm4, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x001f, %edx		/* only low 5 bytes are valid here */
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$5, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10	/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_11)

/*
 * ashr_12 handles the following cases:
 * abs(str1 offset - str2 offset) = 4
 */
	.p2align 4
LABEL(ashr_12):
	pxor	%xmm0, %xmm0		/* %xmm0 = 0, used to spot NUL bytes */
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pslldq	$4, %xmm2
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* NOTE(review): %edx mask set by common entry code before this view */
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)	/* mismatch/NUL within the first 16 bytes */
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$12, %r9d	/* rdi bytes already examined. Used in exit code */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we are crossing a page boundary and
	 * need to do a nibble.
	 */
1407*5d9d9091SRichard Lowe */ 1408*5d9d9091SRichard Lowe lea 12(%rdi), %r10 1409*5d9d9091SRichard Lowe and $0xfff, %r10 /* offset into 4K page */ 1410*5d9d9091SRichard Lowe sub $0x1000, %r10 /* subtract 4K pagesize */ 1411*5d9d9091SRichard Lowe movdqa %xmm3, %xmm4 1412*5d9d9091SRichard Lowe 1413*5d9d9091SRichard Lowe .p2align 4 1414*5d9d9091SRichard LoweLABEL(loop_ashr_12): 1415*5d9d9091SRichard Lowe add $16, %r10 1416*5d9d9091SRichard Lowe jg LABEL(nibble_ashr_12) 1417*5d9d9091SRichard Lowe 1418*5d9d9091SRichard LoweLABEL(gobble_ashr_12): 1419*5d9d9091SRichard Lowe movdqa (%rsi, %rcx), %xmm1 1420*5d9d9091SRichard Lowe movdqa (%rdi, %rcx), %xmm2 1421*5d9d9091SRichard Lowe movdqa %xmm2, %xmm4 1422*5d9d9091SRichard Lowe 1423*5d9d9091SRichard Lowe psrldq $12, %xmm3 1424*5d9d9091SRichard Lowe pslldq $4, %xmm2 1425*5d9d9091SRichard Lowe por %xmm3, %xmm2 1426*5d9d9091SRichard Lowe 1427*5d9d9091SRichard Lowe pcmpeqb %xmm1, %xmm0 1428*5d9d9091SRichard Lowe pcmpeqb %xmm2, %xmm1 1429*5d9d9091SRichard Lowe psubb %xmm0, %xmm1 1430*5d9d9091SRichard Lowe pmovmskb %xmm1, %edx 1431*5d9d9091SRichard Lowe sub $0xffff, %edx 1432*5d9d9091SRichard Lowe jnz LABEL(exit) 1433*5d9d9091SRichard Lowe 1434*5d9d9091SRichard Lowe#ifdef USE_AS_STRNCMP 1435*5d9d9091SRichard Lowe sub $16, %r11 1436*5d9d9091SRichard Lowe jbe LABEL(strcmp_exitz) 1437*5d9d9091SRichard Lowe#endif 1438*5d9d9091SRichard Lowe 1439*5d9d9091SRichard Lowe add $16, %rcx 1440*5d9d9091SRichard Lowe movdqa %xmm4, %xmm3 1441*5d9d9091SRichard Lowe 1442*5d9d9091SRichard Lowe add $16, %r10 1443*5d9d9091SRichard Lowe jg LABEL(nibble_ashr_12) /* cross page boundary */ 1444*5d9d9091SRichard Lowe 1445*5d9d9091SRichard Lowe movdqa (%rsi, %rcx), %xmm1 1446*5d9d9091SRichard Lowe movdqa (%rdi, %rcx), %xmm2 1447*5d9d9091SRichard Lowe movdqa %xmm2, %xmm4 1448*5d9d9091SRichard Lowe 1449*5d9d9091SRichard Lowe psrldq $12, %xmm3 1450*5d9d9091SRichard Lowe pslldq $4, %xmm2 1451*5d9d9091SRichard Lowe por %xmm3, %xmm2 1452*5d9d9091SRichard Lowe 
	pcmpeqb	%xmm1, %xmm0		/* NUL bytes in str2 chunk */
	pcmpeqb	%xmm2, %xmm1		/* equal bytes */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* 0 iff all 16 equal and no NUL */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_12)

	/*
	 * Page-boundary fixup for ashr_12: only the 4 bytes of %xmm4
	 * (mask 0x000f) before the page end are valid; compare just
	 * those, then resume the gobble loop on the next page.
	 */
	.p2align 4
LABEL(nibble_ashr_12):
	psrldq	$12, %xmm4		/* keep the 4 leftover bytes */
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* NUL bytes */
	pcmpeqb	%xmm4, %xmm1		/* equal bytes */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x000f, %edx		/* only the low 4 lanes are valid */
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$4, %r11		/* 4 or fewer bytes left => equal */
	jbe	LABEL(strcmp_exitz)
#endif
	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_12)

/*
 * ashr_13 handles the following cases:
 * 	abs(str1 offset - str2 offset) = 3
 */
	.p2align 4
LABEL(ashr_13):
	pxor	%xmm0, %xmm0		/* zero register, for NUL detection */
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* mask of NUL bytes in str2 chunk */
	pslldq	$3, %xmm2		/* re-align str1 chunk with str2 */
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* drop lanes below the start offset (%cl set before dispatch) */
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$13, %r9d	/* rdi bytes already examined. Used in exit code */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we are crossing a page boundary and
	 * need to do a nibble.
	 */
	lea	13(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	movdqa	%xmm3, %xmm4

	.p2align 4
LABEL(loop_ashr_13):
	add	$16, %r10
	jg	LABEL(nibble_ashr_13)	/* next 16 bytes would cross a page */

LABEL(gobble_ashr_13):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* save str1 chunk for next round */

	psrldq	$13, %xmm3		/* carry 3 bytes from previous chunk */
	pslldq	$3, %xmm2
	por	%xmm3, %xmm2		/* re-aligned 16 bytes of str1 */

	pcmpeqb	%xmm1, %xmm0		/* NUL bytes in str2 chunk */
	pcmpeqb	%xmm2, %xmm1		/* equal bytes */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* 0 iff all 16 equal and no NUL */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_13)	/* cross page boundary */

	/* second, unrolled iteration of the same comparison */
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$13, %xmm3
	pslldq	$3, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_13)

	/*
	 * Page-boundary fixup for ashr_13: only the 3 bytes of %xmm4
	 * (mask 0x0007) before the page end are valid.
	 */
	.p2align 4
LABEL(nibble_ashr_13):
	psrldq	$13, %xmm4		/* keep the 3 leftover bytes */
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* NUL bytes */
	pcmpeqb	%xmm4, %xmm1		/* equal bytes */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x0007, %edx		/* only the low 3 lanes are valid */
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$3, %r11		/* 3 or fewer bytes left => equal */
	jbe	LABEL(strcmp_exitz)
#endif
	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_13)

/*
 * ashr_14 handles the following cases:
 * 	abs(str1 offset - str2 offset) = 2
 */
	.p2align 4
LABEL(ashr_14):
	pxor	%xmm0, %xmm0		/* zero register, for NUL detection */
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* mask of NUL bytes in str2 chunk */
	pslldq	$2, %xmm2		/* re-align str1 chunk with str2 */
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* drop lanes below the start offset (%cl set before dispatch) */
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$14, %r9d	/* rdi bytes already examined. Used in exit code */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we are crossing a page boundary and
	 * need to do a nibble.
	 */
	lea	14(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	movdqa	%xmm3, %xmm4

	.p2align 4
LABEL(loop_ashr_14):
	add	$16, %r10
	jg	LABEL(nibble_ashr_14)	/* next 16 bytes would cross a page */

LABEL(gobble_ashr_14):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* save str1 chunk for next round */

	psrldq	$14, %xmm3		/* carry 2 bytes from previous chunk */
	pslldq	$2, %xmm2
	por	%xmm3, %xmm2		/* re-aligned 16 bytes of str1 */

	pcmpeqb	%xmm1, %xmm0		/* NUL bytes in str2 chunk */
	pcmpeqb	%xmm2, %xmm1		/* equal bytes */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* 0 iff all 16 equal and no NUL */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_14)	/* cross page boundary */

	/* second, unrolled iteration of the same comparison */
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$14, %xmm3
	pslldq	$2, %xmm2
	por	%xmm3, %xmm2
	pcmpeqb	%xmm1, %xmm0		/* NUL bytes in str2 chunk */
	pcmpeqb	%xmm2, %xmm1		/* equal bytes */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* 0 iff all 16 equal and no NUL */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_14)

	/*
	 * Page-boundary fixup for ashr_14: only the 2 bytes of %xmm4
	 * (mask 0x0003) before the page end are valid.
	 */
	.p2align 4
LABEL(nibble_ashr_14):
	psrldq	$14, %xmm4		/* keep the 2 leftover bytes */
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* NUL bytes */
	pcmpeqb	%xmm4, %xmm1		/* equal bytes */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x0003, %edx		/* only the low 2 lanes are valid */
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$2, %r11		/* 2 or fewer bytes left => equal */
	jbe	LABEL(strcmp_exitz)
#endif
	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_14)

/*
 * ashr_15 handles the following cases:
 * 	abs(str1 offset - str2 offset) = 1
 */
	.p2align 4
LABEL(ashr_15):
	pxor	%xmm0, %xmm0		/* zero register, for NUL detection */
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* mask of NUL bytes in str2 chunk */
	pslldq	$1, %xmm2		/* re-align str1 chunk with str2 */
	pcmpeqb	%xmm1, %xmm2
	psubb	%xmm0, %xmm2
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* drop lanes below the start offset (%cl set before dispatch) */
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)

	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$15, %r9d	/* rdi bytes already examined. Used in exit code */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we are crossing a page boundary and
	 * need to do a nibble.
	 */
	lea	15(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	movdqa	%xmm3, %xmm4

	.p2align 4
LABEL(loop_ashr_15):
	add	$16, %r10
	jg	LABEL(nibble_ashr_15)	/* next 16 bytes would cross a page */

LABEL(gobble_ashr_15):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* save str1 chunk for next round */

	psrldq	$15, %xmm3		/* carry 1 byte from previous chunk */
	pslldq	$1, %xmm2
	por	%xmm3, %xmm2		/* re-aligned 16 bytes of str1 */

	pcmpeqb	%xmm1, %xmm0		/* NUL bytes in str2 chunk */
	pcmpeqb	%xmm2, %xmm1		/* equal bytes */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* 0 iff all 16 equal and no NUL */
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	LABEL(nibble_ashr_15)	/* cross page boundary */

	/* second, unrolled iteration of the same comparison */
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	psrldq	$15, %xmm3
	pslldq	$1, %xmm2
	por	%xmm3, %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	LABEL(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	LABEL(loop_ashr_15)

	/*
	 * Page-boundary fixup for ashr_15: only 1 byte of %xmm4
	 * (mask 0x0001) before the page end is valid.
	 */
	.p2align 4
LABEL(nibble_ashr_15):
	psrldq	$15, %xmm4		/* keep the 1 leftover byte */
	movdqa	(%rsi, %rcx), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* NUL bytes */
	pcmpeqb	%xmm4, %xmm1		/* equal bytes */
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0x0001, %edx		/* only the low lane is valid */
	jnz	LABEL(exit)
#ifdef USE_AS_STRNCMP
	cmp	$1, %r11		/* 1 or fewer bytes left => equal */
	jbe	LABEL(strcmp_exitz)
#endif
	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_15)

	/*
	 * Common exit paths.  %edx carries the difference/NUL bit mask
	 * for the mismatching 16-byte chunk.
	 */
	.p2align 4
LABEL(exit):
	lea	-16(%r9, %rcx), %rax	/* locate the exact offset for rdi */
LABEL(less32bytes):
	lea	(%rdi, %rax), %rdi	/* locate the exact address for first operand(rdi) */
	lea	(%rsi, %rcx), %rsi	/* locate the exact address for second operand(rsi) */
	test	%r8d, %r8d
	jz	LABEL(ret)
	xchg	%rsi, %rdi		/* recover original order according to flag(%r8d) */

	.p2align 4
LABEL(ret):
LABEL(less16bytes):
	/*
	 * Check to see if BSF is fast on this processor. If not, use a
	 * different exit tail.
	 */
	testl	$USE_BSF,.memops_method(%rip)
	jz	LABEL(AMD_exit)
	bsf	%rdx, %rdx		/* find and store bit index in %rdx */

#ifdef USE_AS_STRNCMP
	sub	%rdx, %r11		/* mismatch at or past the n limit? */
	jbe	LABEL(strcmp_exitz)
#endif
	xor	%ecx, %ecx		/* clear %ecx */
	xor	%eax, %eax		/* clear %eax */

	movb	(%rsi, %rdx), %cl	/* first differing byte of str2 */
	movb	(%rdi, %rdx), %al	/* first differing byte of str1 */

	sub	%ecx, %eax		/* return the byte difference */
	ret

#ifdef USE_AS_STRNCMP
LABEL(strcmp_exitz):
	xor	%eax, %eax		/* n bytes exhausted: strings equal */
	ret
#endif

	/*
	 * This exit tail does not use the bsf instruction.
	 * Scan the mask in %edx bit by bit; %dl covers bytes 0-7.
	 */
	.p2align 4
LABEL(AMD_exit):
	test	%dl, %dl		/* mismatch within the low 8 bytes? */
	jz	LABEL(next_8_bytes)

	test	$0x01, %dl
	jnz	LABEL(Byte0)

	test	$0x02, %dl
	jnz	LABEL(Byte1)

	test	$0x04, %dl
	jnz	LABEL(Byte2)

	test	$0x08, %dl
	jnz	LABEL(Byte3)

	test	$0x10, %dl
	jnz	LABEL(Byte4)

	test	$0x20, %dl
	jnz	LABEL(Byte5)

	test	$0x40, %dl
	jnz	LABEL(Byte6)

	/* bit 7 set: the mismatch is at byte 7 */
#ifdef USE_AS_STRNCMP
	sub	$7, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	movzx	7(%rsi), %ecx
	movzx	7(%rdi), %eax

	sub	%ecx, %eax
	ret

	.p2align 4
LABEL(Byte0):
	/*
	 * never need to handle byte 0 for strncmp
#ifdef USE_AS_STRNCMP
	sub	$0, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	 */
	movzx	(%rsi), %ecx
	movzx	(%rdi), %eax

	sub	%ecx, %eax		/* return the byte difference */
	ret
	/*
	 * ByteN exits: the first differing byte is at offset N; check it
	 * against the strncmp limit, then return str1[N] - str2[N].
	 */
	.p2align 4
LABEL(Byte1):

#ifdef USE_AS_STRNCMP
	sub	$1, %r11		/* byte 1 within the n limit? */
	jbe	LABEL(strcmp_exitz)
#endif
	movzx	1(%rsi), %ecx
	movzx	1(%rdi), %eax

	sub	%ecx, %eax
	ret

	.p2align 4
LABEL(Byte2):

#ifdef USE_AS_STRNCMP
	sub	$2, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	movzx	2(%rsi), %ecx
	movzx	2(%rdi), %eax

	sub	%ecx, %eax
	ret

	.p2align 4
LABEL(Byte3):

#ifdef USE_AS_STRNCMP
	sub	$3, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	movzx	3(%rsi), %ecx
	movzx	3(%rdi), %eax

	sub	%ecx, %eax
	ret

	.p2align 4
LABEL(Byte4):

#ifdef USE_AS_STRNCMP
	sub	$4, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	movzx	4(%rsi), %ecx
	movzx	4(%rdi), %eax

	sub	%ecx, %eax
	ret
	.p2align 4
LABEL(Byte5):

#ifdef USE_AS_STRNCMP
	sub	$5, %r11		/* byte within the n limit? */
	jbe	LABEL(strcmp_exitz)
#endif
	movzx	5(%rsi), %ecx
	movzx	5(%rdi), %eax

	sub	%ecx, %eax		/* return the byte difference */
	ret

	.p2align 4
LABEL(Byte6):

#ifdef USE_AS_STRNCMP
	sub	$6, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	movzx	6(%rsi), %ecx
	movzx	6(%rdi), %eax

	sub	%ecx, %eax
	ret

	/*
	 * Mismatch lies in the high 8 bytes (mask bits in %dh); advance
	 * both pointers by 8 and reuse the ByteN exits.
	 */
	.p2align 4
LABEL(next_8_bytes):
	add	$8, %rdi
	add	$8, %rsi
#ifdef USE_AS_STRNCMP
	sub	$8, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	test	$0x01, %dh
	jnz	LABEL(Byte0)

	test	$0x02, %dh
	jnz	LABEL(Byte1)

	test	$0x04, %dh
	jnz	LABEL(Byte2)

	test	$0x08, %dh
	jnz	LABEL(Byte3)

	test	$0x10, %dh
	jnz	LABEL(Byte4)

	test	$0x20, %dh
	jnz	LABEL(Byte5)

	test	$0x40, %dh
	jnz	LABEL(Byte6)

	/* bit 15 set: mismatch at byte 7 past the adjusted pointers */
#ifdef USE_AS_STRNCMP
	sub	$7, %r11
	jbe	LABEL(strcmp_exitz)
#endif
	movzx	7(%rsi), %ecx
	movzx	7(%rdi), %eax

	sub	%ecx, %eax
	ret

	/*
	 * Table of 32-bit offsets, relative to the table itself, to the
	 * ashr_N entry points; presumably indexed by the alignment
	 * difference computed by the dispatch code earlier in the file
	 * (TODO confirm against that code).
	 */
	.pushsection .rodata
	.p2align 4
LABEL(unaligned_table):
	.int	LABEL(ashr_0) - LABEL(unaligned_table)
	.int	LABEL(ashr_15) - LABEL(unaligned_table)
	.int	LABEL(ashr_14) - LABEL(unaligned_table)
	.int	LABEL(ashr_13) - LABEL(unaligned_table)
	.int	LABEL(ashr_12) - LABEL(unaligned_table)
	.int	LABEL(ashr_11) - LABEL(unaligned_table)
	.int	LABEL(ashr_10) - LABEL(unaligned_table)
	.int	LABEL(ashr_9) - LABEL(unaligned_table)
	.int	LABEL(ashr_8) - LABEL(unaligned_table)
	.int	LABEL(ashr_7) - LABEL(unaligned_table)
	.int	LABEL(ashr_6) - LABEL(unaligned_table)
	.int	LABEL(ashr_5) - LABEL(unaligned_table)
	.int	LABEL(ashr_4) - LABEL(unaligned_table)
	.int	LABEL(ashr_3) - LABEL(unaligned_table)
	.int	LABEL(ashr_2) - LABEL(unaligned_table)
	.int	LABEL(ashr_1) - LABEL(unaligned_table)
	.popsection
#ifdef USE_AS_STRNCMP
	SET_SIZE(strncmp)
#else
	SET_SIZE(strcmp)	/* (const char *, const char *) */
#endif