/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
 * SSE2 string-length routine.  Syntax: GNU as, AT&T operand order, run
 * through the C preprocessor.
 *
 * In:      %rdi = address of a NUL-terminated byte string
 * Out:     %rax = number of bytes before the first NUL
 * Clobber: %rcx, %rdx, %r10, %xmm0-%xmm5, flags
 * Stack:   not touched
 *
 * Phases:
 *   1. Probe the first 16 bytes (unaligned load when that load stays inside
 *      the current 64-byte block, otherwise an aligned load plus a mask).
 *   2. Probe 16-byte aligned blocks one at a time (unrolled) until %rax can
 *      be brought to a 64-byte boundary.
 *   3. Scan 64 bytes per iteration, folding four blocks with pminub.
 *   4. Locate the exact NUL inside the 64-byte block that stopped the loop.
 *
 * Invariant relied upon throughout: after a pcmpeqb whose mask came back
 * zero, the destination register is all-zero again, so %xmm0-%xmm3 can be
 * reused as the zero comparand without being cleared.
 *
 * When USE_AS_STRCAT is defined, the macro definitions, the ENTRY/END
 * wrapper and the final return are omitted: control falls out of the bottom
 * of L(exit64) with the length in %rax.  RETURN must then be supplied by
 * whatever includes this text (presumably a strcat implementation; it is
 * not visible from this file).
 */

#ifndef USE_AS_STRCAT

#ifndef STRLEN
# define STRLEN		strlen
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

#ifndef ENTRY
# define ENTRY(name)			\
	.type name,  @function; 	\
	.globl name;			\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif

#ifndef END
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif
#define RETURN ret
	.section .text.sse2,"ax",@progbits
ENTRY (STRLEN)
/* end ifndef USE_AS_STRCAT */
#endif
	/* ---- Phase 1: first (possibly unaligned) 16 bytes ---- */
	xor	%eax, %eax		/* rax = 0; L(exit_less16) relies on this */
	mov	%edi, %ecx
	and	$0x3f, %ecx		/* ecx = offset of s inside its 64-byte block */
	pxor	%xmm0, %xmm0
	cmp	$0x30, %ecx
	ja	L(next)			/* offset > 48: a 16-byte load at s would
					   cross the 64-byte boundary */
	movdqu	(%rdi), %xmm1
	pcmpeqb	%xmm1, %xmm0
	pmovmskb	%xmm0, %edx	/* bit i set <=> s[i] == 0 */
	test	%edx, %edx
	jnz	L(exit_less16)
	mov	%rdi, %rax
	and	$-16, %rax		/* rax = s rounded down to 16 bytes */
	jmp	L(align16_start)
L(next):
	mov	%rdi, %rax
	and	$-16, %rax		/* rax = s rounded down to 16 bytes */
	pcmpeqb	(%rax), %xmm0		/* aligned load; may include bytes before s */
	mov	$-1, %r10d
	/* rcx = (s & 0x3f) - (s & -16).  A 32-bit shl only consumes the low
	   five bits of %cl, and those equal s & 15.  */
	sub	%rax, %rcx
	shl	%cl, %r10d		/* r10d = mask with the bits below s cleared */
	pmovmskb	%xmm0, %edx
	and	%r10d, %edx		/* ignore NULs located before s */
	jnz	L(exit)

	/* ---- Phase 2: 16-byte aligned probes, unrolled ----
	   Here rax is 16-byte aligned and bytes [s, rax+16) hold no NUL.  */
L(align16_start):
	pxor	%xmm0, %xmm0
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3
	pcmpeqb	16(%rax), %xmm0
	pmovmskb	%xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb	%xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb	%xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	pcmpeqb	64(%rax), %xmm3
	pmovmskb	%xmm3, %edx
	test	%edx, %edx
	jnz	L(exit64)

	/* Second group: the block at old rax+80 becomes new rax+16.  */
	pcmpeqb	80(%rax), %xmm0
	add	$64, %rax
	pmovmskb	%xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb	%xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb	%xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	pcmpeqb	64(%rax), %xmm3
	pmovmskb	%xmm3, %edx
	test	%edx, %edx
	jnz	L(exit64)

	/* Third group.  */
	pcmpeqb	80(%rax), %xmm0
	add	$64, %rax
	pmovmskb	%xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb	%xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb	%xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	pcmpeqb	64(%rax), %xmm3
	pmovmskb	%xmm3, %edx
	test	%edx, %edx
	jnz	L(exit64)

	/* Fourth group.  */
	pcmpeqb	80(%rax), %xmm0
	add	$64, %rax
	pmovmskb	%xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb	%xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb	%xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	pcmpeqb	64(%rax), %xmm3
	pmovmskb	%xmm3, %edx
	test	%edx, %edx
	jnz	L(exit64)


	/* Bring rax up to a 64-byte boundary, still probing 16 bytes at a
	   time.  From here on rax points AT the block just probed, so a hit
	   leaves through L(exit), which adds no block offset.  Entering the
	   loop may re-read blocks already known to be NUL-free; that is
	   harmless.  */
	test	$0x3f, %rax
	jz	L(align64_loop)

	pcmpeqb	80(%rax), %xmm0
	add	$80, %rax
	pmovmskb	%xmm0, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$0x3f, %rax
	jz	L(align64_loop)

	pcmpeqb	16(%rax), %xmm1
	add	$16, %rax
	pmovmskb	%xmm1, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$0x3f, %rax
	jz	L(align64_loop)

	pcmpeqb	16(%rax), %xmm2
	add	$16, %rax
	pmovmskb	%xmm2, %edx
	test	%edx, %edx
	jnz	L(exit)

	/* NOTE(review): rax is 16-byte aligned and was not 64-byte aligned
	   at the first test above, so it appears always to be 64-byte
	   aligned by this test, leaving the probe below as a never-taken
	   fallthrough.  Kept as in the original.  */
	test	$0x3f, %rax
	jz	L(align64_loop)

	pcmpeqb	16(%rax), %xmm3
	add	$16, %rax
	pmovmskb	%xmm3, %edx
	test	%edx, %edx
	jnz	L(exit)

	add	$16, %rax

	/* ---- Phase 3: 64 bytes per iteration ----
	   The unsigned byte-wise minimum of four blocks contains a zero byte
	   iff at least one of the blocks does.  xmm0 is all-zero here.  */
	.p2align 4
L(align64_loop):
	movaps	(%rax),	%xmm4
	pminub	16(%rax), 	%xmm4
	movaps	32(%rax), 	%xmm5
	pminub	48(%rax), 	%xmm5
	add	$64, 	%rax
	pminub	%xmm4,	%xmm5
	pcmpeqb	%xmm0,	%xmm5
	pmovmskb	%xmm5,	%edx
	test	%edx,	%edx
	jz	L(align64_loop)


	/* ---- Phase 4: find the NUL inside the 64 bytes just scanned ----
	   rax already points past that block.  Rebase rax to 16 bytes before
	   it so the shared L(exitNN) stubs apply.  */
	pcmpeqb	-64(%rax), %xmm0
	sub	$80, 	%rax
	pmovmskb	%xmm0, %edx
	test	%edx, %edx
	jnz	L(exit16)

	pcmpeqb	32(%rax), %xmm1
	pmovmskb	%xmm1, %edx
	test	%edx, %edx
	jnz	L(exit32)

	pcmpeqb	48(%rax), %xmm2
	pmovmskb	%xmm2, %edx
	test	%edx, %edx
	jnz	L(exit48)

	/* The NUL must be in the last of the four blocks, so the mask is
	   used without testing it.  */
	pcmpeqb	64(%rax), %xmm3
	pmovmskb	%xmm3, %edx
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$64, %rax
	RETURN

	/* ---- Exit stubs ----
	   Each computes (block base - s) + index of the lowest set mask bit
	   + the block's offset from rax.  */
	.p2align 4
L(exit):
	sub	%rdi, %rax
L(exit_less16):				/* reached with rax == 0 from phase 1 */
	bsf	%rdx, %rdx
	add	%rdx, %rax
	RETURN
	.p2align 4
L(exit16):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$16, %rax
	RETURN
	.p2align 4
L(exit32):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$32, %rax
	RETURN
	.p2align 4
L(exit48):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$48, %rax
	RETURN
	.p2align 4
L(exit64):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$64, %rax
#ifndef USE_AS_STRCAT
	RETURN

END (STRLEN)
#endif