/*-
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

#include <machine/asm.h>
#include <machine/param.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4, 0x90

ARCHFUNCS(strncmp)
	ARCHFUNC(strncmp, scalar)
	ARCHFUNC(strncmp, baseline)
ENDARCHFUNCS(strncmp)

/*
 * strncmp, scalar variant.
 *
 * In:	RDI = first string, RSI = second string, RDX = maximum number of
 *	bytes to compare.
 * Out:	EAX = 0 if no difference was found within the first RDX bytes
 *	or before a NUL byte in the first string, otherwise the difference
 *	of the first pair of differing bytes, each zero-extended (first
 *	string minus second string).
 * Uses:	RAX, RCX, RDX, RSI, RDI, flags.
 *
 * This is just the scalar loop unrolled a bunch of times.  EAX stays 0
 * until the exit ladder .L3/.L2/.L1/.L0 turns it into the index (0..3)
 * of the byte pair that decides the result.
 */
ARCHENTRY(strncmp, scalar)
	xor	%eax, %eax
	sub	$4, %rdx		# more than 4 chars left to compare?
	jbe	1f			# if not, finish in the tail code

	ALIGN_TEXT
0:	movzbl	(%rdi), %ecx
	test	%ecx, %ecx		# NUL char in first string?
	jz	.L0
	cmpb	(%rsi), %cl		# mismatch between strings?
	jnz	.L0

	movzbl	1(%rdi), %ecx
	test	%ecx, %ecx
	jz	.L1
	cmpb	1(%rsi), %cl
	jnz	.L1

	movzbl	2(%rdi), %ecx
	test	%ecx, %ecx
	jz	.L2
	cmpb	2(%rsi), %cl
	jnz	.L2

	movzbl	3(%rdi), %ecx
	test	%ecx, %ecx
	jz	.L3
	cmpb	3(%rsi), %cl
	jnz	.L3

	add	$4, %rdi		# advance to next iteration
	add	$4, %rsi
	sub	$4, %rdx
	ja	0b			# loop while more than 4 chars remain

	/*
	 * At most 4 characters remain.  RDX holds (remaining - 4), so
	 * -4 means nothing is left and 0 means exactly 4 are left.
	 */
1:	cmp	$-4, %edx		# end of buffer reached immediately?
	jz	.Leq
	movzbl	(%rdi), %ecx
	test	%ecx, %ecx
	jz	.L0
	cmpb	(%rsi), %cl
	jnz	.L0

	cmp	$-3, %edx		# end of buffer reached after 1 char?
	jz	.Leq
	movzbl	1(%rdi), %ecx
	test	%ecx, %ecx
	jz	.L1
	cmpb	1(%rsi), %cl
	jnz	.L1

	cmp	$-2, %edx		# end of buffer reached after 2 chars?
	jz	.Leq
	movzbl	2(%rdi), %ecx
	test	%ecx, %ecx
	jz	.L2
	cmpb	2(%rsi), %cl
	jnz	.L2

	cmp	$-1, %edx		# either end of buffer after 3 chars,
	jz	.Leq			# or it boils down to the last char

	/* exit ladder: entering at .Lk leaves EAX = k */
.L3:	inc	%eax
.L2:	inc	%eax
.L1:	inc	%eax
.L0:	movzbl	(%rsi, %rax, 1), %ecx	# deciding byte of second string
	movzbl	(%rdi, %rax, 1), %eax	# deciding byte of first string
	sub	%ecx, %eax
.Leq:	ret				# on the equal paths EAX is still 0
ARCHEND(strncmp, scalar)

/*
 * strncmp, baseline (SSE2) variant.
 *
 * In:	RDI = first string, RSI = second string, RDX = maximum number of
 *	bytes to compare.
 * Out:	EAX = 0 if the strings compare equal, otherwise the difference of
 *	the first pair of differing bytes, each zero-extended (first string
 *	minus second string).
 * Uses:	RAX, RCX, RDX, RSI, RDI, R8-R11, XMM0-XMM4, flags.  RBX is saved
 *	on entry and restored on every return path.  Up to 32 bytes below
 *	the stack pointer are used as scratch space without adjusting RSP.
 *
 * Outline: both pointers are rounded down to 16 byte boundaries and the
 * first 16 bytes of each string (the "heads") are compared with unaligned
 * loads.  If the heads are equal, the string with the larger misalignment
 * is walked with aligned loads while the other one is read unaligned;
 * .Lnormal handles (RDI & 0xf) >= (RSI & 0xf) and .Lswapped the opposite
 * case with the roles of RDI and RSI exchanged.
 */
ARCHENTRY(strncmp, baseline)
	push		%rbx
	sub		$1, %rdx	# RDX--, so RDX points to the last byte to compare
	jb		.Lempty		# were there any bytes to compare at all?

	lea		15(%rdi), %r8d	# end of head
	lea		15(%rsi), %r9d
	mov		%edi, %eax
	mov		%esi, %ebx
	xor		%edi, %r8d	# bits that changed between first and last byte
	xor		%esi, %r9d
	and		$~0xf, %rdi	# align heads to 16 bytes
	and		$~0xf, %rsi
	or		%r8d, %r9d	# changed bits of either string
	and		$0xf, %eax	# offset from alignment
	and		$0xf, %ebx
	movdqa		(%rdi), %xmm0	# load aligned heads
	movdqa		(%rsi), %xmm2
	pxor		%xmm1, %xmm1	# XMM1 = 0, kept for the NUL checks below
	cmp		$16, %rdx	# does the buffer end within the first 16 bytes?
	jb		.Llt16

	test		$PAGE_SIZE, %r9d # did the page change?
	jz		0f		# if not, take fast path

	/*
	 * Heads may cross a page boundary, avoid unmapped loads: if the
	 * aligned chunk of a string already holds its NUL byte, read the
	 * head from a copy of that chunk on the stack instead of from the
	 * string itself.
	 */
	movdqa		%xmm0, -32(%rsp) # stash copies of the heads on the stack
	movdqa		%xmm2, -16(%rsp)
	mov		$-1, %r8d
	mov		$-1, %r9d
	mov		%eax, %ecx
	shl		%cl, %r8d	# mask of the bytes of XMM0 belonging to the string
	mov		%ebx, %ecx
	shl		%cl, %r9d	# mask of the bytes of XMM2 belonging to the string
	pcmpeqb		%xmm1, %xmm0
	pcmpeqb		%xmm1, %xmm2
	pmovmskb	%xmm0, %r10d
	pmovmskb	%xmm2, %r11d
	test		%r8d, %r10d	# NUL byte present in first string?
	lea		-32(%rsp), %r8
	cmovz		%rdi, %r8	# if not, read the true head
	test		%r9d, %r11d	# NUL byte present in second string?
	lea		-16(%rsp), %r9
	cmovz		%rsi, %r9
	movdqu		(%r8, %rax, 1), %xmm0 # load true (or fake) heads
	movdqu		(%r9, %rbx, 1), %xmm4
	jmp		1f

	/* rdx == 0 */
.Lempty:
	xor		%eax, %eax	# zero-length buffers compare equal
	pop		%rbx
	ret

0:	movdqu		(%rdi, %rax, 1), %xmm0 # load true heads
	movdqu		(%rsi, %rbx, 1), %xmm4
1:	pxor		%xmm2, %xmm2
	pcmpeqb		%xmm0, %xmm2	# NUL byte present?
	pcmpeqb		%xmm0, %xmm4	# which bytes match?
	pandn		%xmm4, %xmm2	# match and not NUL byte?
	pmovmskb	%xmm2, %r9d
	xor		$0xffff, %r9d	# mismatch or NUL byte?
	jnz		.Lhead_mismatch

	/* heads are equal: load second chunks and pick the main loop */
	movdqa		16(%rdi), %xmm2	# load second chunks
	movdqa		16(%rsi), %xmm3
	lea		-16(%rdx, %rbx, 1), %rdx # account for length of RSI chunk
	sub		%rbx, %rax	# is a&0xf >= b&0xf?
	jb		.Lswapped	# if not, proceed with swapped operands
	jmp		.Lnormal

	/* buffer ends within the first 16 bytes */
.Llt16:	test		$PAGE_SIZE, %r9d # did the page change?
	jz		0f		# if not, take fast path

	/*
	 * Heads may cross a page boundary.  Same approach as above, except
	 * that the end of the buffer is treated like a NUL byte as well.
	 */
	movdqa		%xmm0, -32(%rsp) # stash copies of the heads on the stack
	movdqa		%xmm2, -16(%rsp)
	mov		$-1, %r8d
	mov		$-1, %r9d
	mov		%eax, %ecx
	shl		%cl, %r8d	# mask of the bytes of XMM0 belonging to the string
	mov		%ebx, %ecx
	shl		%cl, %r9d	# mask of the bytes of XMM2 belonging to the string
	pcmpeqb		%xmm1, %xmm0
	pcmpeqb		%xmm1, %xmm2
	pmovmskb	%xmm0, %r10d
	pmovmskb	%xmm2, %r11d
	lea		(%rdx, %rax, 1), %ecx # location of last buffer byte in xmm0
	bts		%ecx, %r10d	# treat as if NUL byte present
	lea		(%rdx, %rbx, 1), %ecx # location of last buffer byte in xmm2
	bts		%ecx, %r11d
	test		%r8w, %r10w	# NUL byte present in first string head?
	lea		-32(%rsp), %r8
	cmovz		%rdi, %r8	# if not, read the true head
	test		%r9w, %r11w	# NUL byte present in second string head?
	lea		-16(%rsp), %r9
	cmovz		%rsi, %r9
	movdqu		(%r8, %rax, 1), %xmm0 # load true (or fake) heads
	movdqu		(%r9, %rbx, 1), %xmm4
	jmp		1f

0:	movdqu		(%rdi, %rax, 1), %xmm0 # load true heads
	movdqu		(%rsi, %rbx, 1), %xmm4
1:	pxor		%xmm2, %xmm2
	pcmpeqb		%xmm0, %xmm2	# NUL byte present?
	pcmpeqb		%xmm0, %xmm4	# which bytes match?
	pandn		%xmm4, %xmm2	# match and not NUL byte?
	pmovmskb	%xmm2, %r9d
	btr		%edx, %r9d	# induce mismatch in last byte of buffer
	not		%r9d		# mismatch or NUL byte?
	/* fall through: R9D is never zero here due to the induced mismatch */

	/* mismatch in true heads */
	ALIGN_TEXT
.Lhead_mismatch:
	tzcnt		%r9d, %r9d	# where is the mismatch?
	add		%rax, %rdi	# return to true heads
	add		%rbx, %rsi
	movzbl		(%rdi, %r9, 1), %eax # mismatching characters
	movzbl		(%rsi, %r9, 1), %ecx
	sub		%ecx, %eax
	pop		%rbx
	ret

	/*
	 * rax >= 0, i.e. (RDI & 0xf) >= (RSI & 0xf).
	 *
	 * From here on:
	 *   RDI = aligned pointer into the first string
	 *   RSI = distance from RDI to the aligned chunk of the second string
	 *   RBX = distance from RDI to the bytes of the second string that
	 *         correspond to (RDI)
	 *   RAX = difference of the two misalignments
	 *   RDX = location of the last buffer byte relative to the chunk of
	 *         the second string that is checked for NUL bytes next
	 */
	ALIGN_TEXT
.Lnormal:
	neg		%rax
	movdqu		16(%rsi, %rax, 1), %xmm0 # bytes of 2nd string matching 16(%rdi)
	sub		%rdi, %rsi	# express RSI as distance from RDI
	lea		(%rsi, %rax, 1), %rbx # point RBX to offset in second string
	neg		%rax		# ... corresponding to RDI
	pcmpeqb		%xmm3, %xmm1	# NUL present in second chunk of 2nd string?
	pcmpeqb		%xmm2, %xmm0	# where do the chunks match?
	pmovmskb	%xmm1, %r8d
	pmovmskb	%xmm0, %r9d
	mov		$16, %ecx
	cmp		%rcx, %rdx	# does the buffer end within this chunk?
	cmovb		%edx, %ecx	# ECX = min(16, RDX)
	add		$32, %rdi	# advance to next iteration
	bts		%ecx, %r8d	# mark end-of-buffer as if there was a NUL byte
	test		%r8w, %r8w	# NUL or end of buffer found?
	jnz		.Lnul_found2
	xor		$0xffff, %r9d	# any mismatches?
	jnz		.Lmismatch2
	sub		$48, %rdx	# end of buffer within first main loop iteration?
	jb		.Ltail		# if yes, process tail

	/*
	 * During the main loop, the layout of the two strings is something like:
	 *
	 *          v ------1------ v ------2------ v
	 *     RDI:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
	 *     RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
	 *
	 * where v indicates the alignment boundaries and corresponding chunks
	 * of the strings have the same letters.  Chunk A has been checked in
	 * the previous iteration.  This iteration, we first check that string
	 * RSI doesn't end within region 2, then we compare chunk B between the
	 * two strings.  As RSI is known not to hold a NUL byte in regions 1
	 * and 2 at this point, this also ensures that RDI has not ended yet.
	 */
	ALIGN_TEXT
0:	movdqu		(%rdi, %rbx, 1), %xmm0 # chunk of 2nd string corresponding to RDI
	pxor		%xmm1, %xmm1
	pcmpeqb		(%rdi, %rsi, 1), %xmm1 # end of string in RSI?
	pcmpeqb		(%rdi), %xmm0	# where do the chunks match?
	pmovmskb	%xmm1, %r8d
	pmovmskb	%xmm0, %r9d
	test		%r8d, %r8d
	jnz		.Lnul_found
	xor		$0xffff, %r9d	# any mismatches?
	jnz		.Lmismatch

	/* main loop unrolled twice */
	movdqu		16(%rdi, %rbx, 1), %xmm0
	pxor		%xmm1, %xmm1
	pcmpeqb		16(%rdi, %rsi, 1), %xmm1
	pcmpeqb		16(%rdi), %xmm0
	pmovmskb	%xmm1, %r8d
	pmovmskb	%xmm0, %r9d
	add		$32, %rdi
	test		%r8d, %r8d
	jnz		.Lnul_found2
	xor		$0xffff, %r9d
	jnz		.Lmismatch2
	sub		$32, %rdx	# end of buffer within next iteration?
	jae		0b

	/*
	 * End of buffer will occur in the next 32 bytes.  RDX is negative
	 * here; bts takes its bit index modulo 32, which yields the
	 * location of the last buffer byte within these 32 bytes.
	 */
.Ltail:	movdqu		(%rdi, %rbx, 1), %xmm0 # chunk of 2nd string corresponding to RDI
	pxor		%xmm1, %xmm1
	pcmpeqb		(%rdi, %rsi, 1), %xmm1 # end of string in RSI?
	pcmpeqb		(%rdi), %xmm0	# where do the chunks match?
	pmovmskb	%xmm1, %r8d
	pmovmskb	%xmm0, %r9d
	bts		%edx, %r8d	# indicate NUL byte at last byte in buffer
	test		%r8w, %r8w	# NUL byte in first chunk?
	jnz		.Lnul_found
	xor		$0xffff, %r9d	# any mismatches?
	jnz		.Lmismatch

	/* second half of the tail */
	movdqu		16(%rdi, %rbx, 1), %xmm0
	pxor		%xmm1, %xmm1
	pcmpeqb		16(%rdi, %rsi, 1), %xmm1
	pcmpeqb		16(%rdi), %xmm0
	pmovmskb	%xmm1, %r8d
	pmovmskb	%xmm0, %r9d
	sub		$16, %edx	# take first half into account
	bts		%edx, %r8d	# indicate NUL byte at last byte in buffer
	add		$32, %rdi

	/* as .Lnul_found, but RDI has already been advanced past the chunk */
.Lnul_found2:
	sub		$16, %rdi

	/*
	 * R8D = mask of NUL bytes (or end of buffer) in the chunk of the
	 * second string, R9D = mask of matching bytes between (RDI) and the
	 * corresponding bytes of the second string.
	 */
.Lnul_found:
	mov		%eax, %ecx
	mov		%r8d, %r10d	# keep the unshifted mask for later
	shl		%cl, %r8d	# adjust NUL mask to positions in RDI/RBX
	not		%r9d		# mask of mismatches
	or		%r8w, %r9w	# NUL bytes also count as mismatches
	jnz		.Lmismatch

	/*
	 * (RDI) matches the corresponding part of the second string and
	 * the NUL byte lies past the end of chunk (RDI).  Compare the
	 * chunk of the second string holding the NUL byte with the
	 * corresponding part of the first string up to that NUL byte.
	 */
	movdqu		(%rdi, %rax, 1), %xmm0
	pcmpeqb		(%rdi, %rsi, 1), %xmm0
	add		%rdi, %rsi	# restore RSI pointer
	add		%rax, %rdi	# point RDI to chunk corresponding to (RSI)
	pmovmskb	%xmm0, %ecx	# mask of matches
	not		%ecx		# mask of mismatches
	or		%r10d, %ecx	# mask of mismatches or NUL bytes
	tzcnt		%ecx, %ecx	# location of first mismatch
	movzbl		(%rdi, %rcx, 1), %eax
	movzbl		(%rsi, %rcx, 1), %ecx
	sub		%ecx, %eax
	pop		%rbx
	ret

	/* as .Lmismatch, but RDI has already been advanced past the chunk */
.Lmismatch2:
	sub		$16, %rdi

	/* a mismatch has been found between RBX and RDI */
.Lmismatch:
	tzcnt		%r9d, %r9d	# where is the mismatch?
	add		%rdi, %rbx	# turn RBX from offset into pointer
	movzbl		(%rbx, %r9, 1), %ecx # byte of second string
	movzbl		(%rdi, %r9, 1), %eax # byte of first string
	sub		%ecx, %eax
	pop		%rbx
	ret

	/*
	 * rax < 0, i.e. (RDI & 0xf) < (RSI & 0xf).
	 *
	 * Same as .Lnormal with the roles of the strings exchanged:
	 *   RSI = aligned pointer into the second string
	 *   RDI = distance from RSI to the aligned chunk of the first string
	 *   RBX = distance from RSI to the bytes of the first string that
	 *         correspond to (RSI)
	 */
	ALIGN_TEXT
.Lswapped:
	movdqu		16(%rdi, %rax, 1), %xmm0 # bytes of 1st string matching 16(%rsi)
	sub		%rsi, %rdi	# express RDI as distance from RSI
	lea		(%rdi, %rax, 1), %rbx # point RBX to offset in first string
	pcmpeqb		%xmm2, %xmm1	# NUL present in second chunk of 1st string?
	pcmpeqb		%xmm3, %xmm0	# where do the chunks match?
	pmovmskb	%xmm1, %r8d
	pmovmskb	%xmm0, %r9d
	add		%rax, %rdx	# make RDX relative to the chunks of the 1st string
	neg		%rax		# RAX = difference of misalignments, now positive
	mov		$16, %ecx
	cmp		%rcx, %rdx	# does the buffer end within this chunk?
	cmovb		%edx, %ecx	# ECX = min(16, RDX)
	add		$32, %rsi	# advance to next iteration
	bts		%ecx, %r8d	# mark end-of-buffer as if there was a NUL byte
	test		%r8w, %r8w	# NUL or end of buffer found?
	jnz		.Lnul_found2s
	xor		$0xffff, %r9d	# any mismatches?
	jnz		.Lmismatch2s
	sub		$48, %rdx	# end of buffer within first main loop iteration?
	jb		.Ltails		# if yes, process tail

	ALIGN_TEXT
0:	movdqu		(%rsi, %rbx, 1), %xmm0 # chunk of 1st string corresponding to RSI
	pxor		%xmm1, %xmm1
	pcmpeqb		(%rsi, %rdi, 1), %xmm1 # end of string in RDI?
	pcmpeqb		(%rsi), %xmm0	# where do the chunks match?
	pmovmskb	%xmm1, %r8d
	pmovmskb	%xmm0, %r9d
	test		%r8d, %r8d
	jnz		.Lnul_founds
	xor		$0xffff, %r9d	# any mismatches?
	jnz		.Lmismatchs

	/* main loop unrolled twice */
	movdqu		16(%rsi, %rbx, 1), %xmm0
	pxor		%xmm1, %xmm1
	pcmpeqb		16(%rsi, %rdi, 1), %xmm1
	pcmpeqb		16(%rsi), %xmm0
	pmovmskb	%xmm1, %r8d
	pmovmskb	%xmm0, %r9d
	add		$32, %rsi
	test		%r8d, %r8d
	jnz		.Lnul_found2s
	xor		$0xffff, %r9d
	jnz		.Lmismatch2s
	sub		$32, %rdx	# end of buffer within next iteration?
	jae		0b

	/* end of buffer will occur in next 32 bytes */
.Ltails:
	movdqu		(%rsi, %rbx, 1), %xmm0 # chunk of 1st string corresponding to RSI
	pxor		%xmm1, %xmm1
	pcmpeqb		(%rsi, %rdi, 1), %xmm1 # end of string in RDI?
	pcmpeqb		(%rsi), %xmm0	# where do the chunks match?
	pmovmskb	%xmm1, %r8d
	pmovmskb	%xmm0, %r9d
	bts		%edx, %r8d	# indicate NUL byte at last byte in buffer
	test		%r8w, %r8w	# NUL byte in first chunk?
	jnz		.Lnul_founds
	xor		$0xffff, %r9d	# any mismatches?
	jnz		.Lmismatchs

	/* second half of the tail */
	movdqu		16(%rsi, %rbx, 1), %xmm0
	pxor		%xmm1, %xmm1
	pcmpeqb		16(%rsi, %rdi, 1), %xmm1
	pcmpeqb		16(%rsi), %xmm0
	pmovmskb	%xmm1, %r8d
	pmovmskb	%xmm0, %r9d
	sub		$16, %edx	# take first half into account
	bts		%edx, %r8d	# indicate NUL byte at last byte in buffer
	add		$32, %rsi

	/* as .Lnul_founds, but RSI has already been advanced past the chunk */
.Lnul_found2s:
	sub		$16, %rsi

.Lnul_founds:
	mov		%eax, %ecx
	mov		%r8d, %r10d	# keep the unshifted mask for later
	shl		%cl, %r8d	# adjust NUL mask to positions in RSI/RBX
	not		%r9d		# mask of mismatches
	or		%r8w, %r9w	# NUL bytes also count as mismatches
	jnz		.Lmismatchs

	/*
	 * (RSI) matches the corresponding part of the first string and
	 * the NUL byte lies past the end of chunk (RSI).  Compare the
	 * chunk of the first string holding the NUL byte with the
	 * corresponding part of the second string up to that NUL byte.
	 */
	movdqu		(%rsi, %rax, 1), %xmm0
	pcmpeqb		(%rsi, %rdi, 1), %xmm0
	add		%rsi, %rdi	# restore RDI pointer
	add		%rax, %rsi	# point RSI to chunk corresponding to (RDI)
	pmovmskb	%xmm0, %ecx	# mask of matches
	not		%ecx		# mask of mismatches
	or		%r10d, %ecx	# mask of mismatches or NUL bytes
	tzcnt		%ecx, %ecx	# location of first mismatch
	movzbl		(%rdi, %rcx, 1), %eax
	movzbl		(%rsi, %rcx, 1), %ecx
	sub		%ecx, %eax
	pop		%rbx
	ret

	/* as .Lmismatchs, but RSI has already been advanced past the chunk */
.Lmismatch2s:
	sub		$16, %rsi

	/* a mismatch has been found between RBX and RSI */
.Lmismatchs:
	tzcnt		%r9d, %r9d	# where is the mismatch?
	add		%rsi, %rbx	# turn RBX from offset into pointer
	movzbl		(%rbx, %r9, 1), %eax # byte of first string
	movzbl		(%rsi, %r9, 1), %ecx # byte of second string
	sub		%ecx, %eax
	pop		%rbx
	ret
ARCHEND(strncmp, baseline)

	.section .note.GNU-stack,"",%progbits