/*-
 * Copyright (c) 2023, The FreeBSD Foundation
 *
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Portions of this software were developed by Robert Clausecker
 * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
 *
 * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcmp.S
 * written by J.T. Conklin <jtc@acorntoolworks.com> that was originally
 * dedicated to the public domain.
 */

#include <machine/asm.h>
#include <machine/param.h>

#if 0
	RCSID("$NetBSD: strcmp.S,v 1.3 2004/07/19 20:04:41 drochner Exp $")
#endif

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4, 0x90

ARCHFUNCS(strcmp)
	ARCHFUNC(strcmp, scalar)
	ARCHFUNC(strcmp, baseline)
ENDARCHFUNCS(strcmp)

ARCHENTRY(strcmp, scalar)
	/*
	 * Align s1 to word boundary.
	 * Consider unrolling loop?
	 */
.Ls1align:
	testb	$7,%dil
	je	.Ls1aligned
	movb	(%rdi),%al
	incq	%rdi
	movb	(%rsi),%dl
	incq	%rsi
	testb	%al,%al
	je	.Ldone
	cmpb	%al,%dl
	je	.Ls1align
	jmp	.Ldone

	/*
	 * Check whether s2 is aligned to a word boundary.  If it is, we
	 * can compare by words.  Otherwise we have to compare by bytes.
	 */
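	/*
	 * The word loop below spots a NUL byte in a word x with the
	 * classic carry trick; roughly, in C terms (illustrative only):
	 *
	 *	(x - 0x0101010101010101) & ~x & 0x8080808080808080
	 *
	 * is nonzero iff x contains a zero byte.  R8 and R9 hold the
	 * two constants so they need not be rematerialised every
	 * iteration; the trick is only applied once the two words are
	 * known to be equal, so either operand may be used as x.
	 */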
.Ls1aligned:
	testb	$7,%sil
	jne	.Lbyte_loop

	movabsq	$0x0101010101010101,%r8
	subq	$8,%rdi
	movabsq	$0x8080808080808080,%r9
	subq	$8,%rsi

	ALIGN_TEXT
.Lword_loop:
	movq	8(%rdi),%rax
	addq	$8,%rdi
	movq	8(%rsi),%rdx
	addq	$8,%rsi
	cmpq	%rax,%rdx
	jne	.Lbyte_loop
	subq	%r8,%rdx
	notq	%rax
	andq	%rax,%rdx
	testq	%r9,%rdx
	je	.Lword_loop

	ALIGN_TEXT
.Lbyte_loop:
	movb	(%rdi),%al
	incq	%rdi
	movb	(%rsi),%dl
	incq	%rsi
	testb	%al,%al
	je	.Ldone
	cmpb	%al,%dl
	je	.Lbyte_loop

.Ldone:
	movzbq	%al,%rax
	movzbq	%dl,%rdx
	subq	%rdx,%rax
	ret
ARCHEND(strcmp, scalar)

ARCHENTRY(strcmp, baseline)
	/* check if either string crosses a page in the head */
	lea	15(%rdi), %r8d		# end of head
	lea	15(%rsi), %r9d
	mov	%edi, %eax
	mov	%esi, %edx
	xor	%edi, %r8d		# bits that changed between first and last byte
	xor	%esi, %r9d
	and	$~0xf, %rdi		# align heads to 16 bytes
	and	$~0xf, %rsi
	or	%r8d, %r9d		# in either RSI or RDI
	and	$0xf, %eax		# offset from alignment
	and	$0xf, %edx
	pxor	%xmm1, %xmm1
	test	$PAGE_SIZE, %r9d	# did the page change?
	jz	0f			# if not, take fast path

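	/*
	 * Slow path, roughly: one of the unaligned 16-byte head loads
	 * could cross into an unmapped page.  Each head is therefore
	 * loaded with an aligned load (which cannot fault) and a copy
	 * is stashed on the stack.  If the bytes belonging to the
	 * string already contain a NUL, the string ends before the
	 * page boundary and the head is read back from the stack copy;
	 * the junk read past that copy lies beyond the NUL and cannot
	 * affect the result.  If there is no NUL, the string extends
	 * onto the next page, which must then be mapped, and loading
	 * the true head unaligned is safe.
	 */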
	/* heads may cross page boundary, avoid unmapped loads */
	movdqa	(%rdi), %xmm0		# load aligned heads
	movdqa	(%rsi), %xmm2
	mov	$-1, %r8d
	mov	$-1, %r9d
	mov	%eax, %ecx
	shl	%cl, %r8d		# string head in XMM0
	mov	%edx, %ecx
	shl	%cl, %r9d		# string head in XMM2
	movdqa	%xmm0, -40(%rsp)	# stash copies of the heads on the stack
	movdqa	%xmm2, -24(%rsp)
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm0, %r10d
	pmovmskb %xmm2, %r11d
	test	%r8d, %r10d		# NUL byte present in first string?
	lea	-40(%rsp), %r8
	cmovz	%rdi, %r8
	test	%r9d, %r11d		# NUL byte present in second string?
	lea	-24(%rsp), %r9
	cmovz	%rsi, %r9
	movdqu	(%r8, %rax, 1), %xmm0	# load true (or fake) heads
	movdqu	(%r9, %rdx, 1), %xmm4
	jmp	1f

0:	movdqu	(%rdi, %rax, 1), %xmm0	# load true heads
	movdqu	(%rsi, %rdx, 1), %xmm4
1:	pxor	%xmm2, %xmm2
	pcmpeqb	%xmm0, %xmm2		# NUL byte present?
	pcmpeqb	%xmm0, %xmm4		# which bytes match?
	pandn	%xmm4, %xmm2		# match and not NUL byte?
	pmovmskb %xmm2, %r9d
	xor	$0xffff, %r9d		# mismatch or NUL byte?
	jnz	.Lhead_mismatch

	/* load head and second chunk */
	movdqa	16(%rdi), %xmm2		# load second chunks
	movdqa	16(%rsi), %xmm3
	sub	%rdx, %rax		# is a&0xf >= b&0xf?
	jb	.Lswapped		# if not, proceed with swapped operands

	neg	%rax
	movdqu	16(%rsi, %rax, 1), %xmm0
	sub	%rdi, %rsi		# express RSI as distance from RDI
	lea	(%rsi, %rax, 1), %rdx	# point RDX to offset in second string
	neg	%rax
	pcmpeqb	%xmm3, %xmm1		# ... corresponding to RDI
	pcmpeqb	%xmm2, %xmm0
	pmovmskb %xmm1, %r8d
	pmovmskb %xmm0, %r9d
	add	$16, %rdi
	test	%r8d, %r8d
	jnz	.Lnul_found
	xor	$0xffff, %r9d
	jnz	.Lmismatch
	add	$16, %rdi		# advance aligned pointers

	/*
	 * During the main loop, the layout of the two strings is something like:
	 *
	 *        v ------1------ v ------2------ v
	 *   RDI:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
	 *   RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
	 *
	 * where v indicates the alignment boundaries and corresponding chunks
	 * of the strings have the same letters.  Chunk A has been checked in
	 * the previous iteration.  This iteration, we first check that string
	 * RSI doesn't end within region 2, then we compare chunk B between the
	 * two strings.  As RSI is known not to hold a NUL byte in regions 1
	 * and 2 at this point, this also ensures that RDI has not ended yet.
	 */
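	/*
	 * Rough summary of the addressing in the loop below (cf. the
	 * layout sketched above):
	 *
	 *	(%rdi)			aligned chunk of the first string
	 *	(%rdi, %rsi, 1)		aligned chunk of the second string,
	 *				used only to look for NUL bytes
	 *	(%rdi, %rdx, 1)		unaligned chunk of the second string
	 *				matching (%rdi) character for character
	 */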
	ALIGN_TEXT
0:	movdqu	(%rdi, %rdx, 1), %xmm0	# chunk of 2nd string corresponding to RDI?
	pxor	%xmm1, %xmm1
	pcmpeqb	(%rdi, %rsi, 1), %xmm1	# end of string in RSI?
	pcmpeqb	(%rdi), %xmm0		# where do the chunks match?
	pmovmskb %xmm1, %r8d
	pmovmskb %xmm0, %r9d
	test	%r8d, %r8d
	jnz	.Lnul_found
	xor	$0xffff, %r9d		# any mismatches?
	jnz	.Lmismatch

	/* main loop unrolled twice */
	movdqu	16(%rdi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?
	pxor	%xmm1, %xmm1
	pcmpeqb	16(%rdi, %rsi, 1), %xmm1 # end of string in RSI?
	pcmpeqb	16(%rdi), %xmm0		# where do the chunks match?
	pmovmskb %xmm1, %r8d
	pmovmskb %xmm0, %r9d
	add	$32, %rdi
	test	%r8d, %r8d
	jnz	.Lnul_found2
	xor	$0xffff, %r9d		# any mismatches?
	jz	0b

	sub	$16, %rdi		# roll back second increment

	/* a mismatch has been found between RDX and RDI */
.Lmismatch:
	tzcnt	%r9d, %r9d		# where is the mismatch?
	add	%rdi, %rdx		# turn RDX from offset to pointer
	movzbl	(%rdx, %r9, 1), %ecx
	movzbl	(%rdi, %r9, 1), %eax
	sub	%ecx, %eax		# difference of the mismatching chars
	ret

	/* mismatch in true heads */
.Lhead_mismatch:
	tzcnt	%r9d, %r9d		# where is the mismatch?
	add	%rax, %rdi		# return to true heads
	add	%rdx, %rsi
	movzbl	(%rdi, %r9, 1), %eax	# mismatching characters
	movzbl	(%rsi, %r9, 1), %ecx
	sub	%ecx, %eax
	ret

.Lnul_found2:
	sub	$16, %rdi		# roll back second increment

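	/*
	 * NUL handling, roughly: R8D holds the NUL positions within the
	 * aligned chunk of the second string, but that chunk starts EAX
	 * bytes further into the string than the 16 bytes just compared
	 * against (%rdi).  Shifting the 16-bit mask left by EAX lines it
	 * up with the mismatch mask in R9D; NUL bytes shifted out of the
	 * mask lie beyond the current window and are handled by the
	 * final comparison below.
	 */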
	/* a NUL has been found in RSI */
.Lnul_found:
	mov	%eax, %ecx
	mov	%r8d, %r10d
	shl	%cl, %r8w		# adjust NUL mask to positions in RDI/RDX
	xor	$0xffff, %r9d		# mask of mismatches
	or	%r8d, %r9d		# NUL bytes also count as mismatches
	jnz	.Lmismatch

	/*
	 * (RDI) == (RSI) and NUL is past the string.
	 * Compare (RSI) with the corresponding part
	 * of the other string until the NUL byte.
	 */
	movdqu	(%rdi, %rax, 1), %xmm0
	pcmpeqb	(%rdi, %rsi, 1), %xmm0
	add	%rdi, %rsi		# restore RSI pointer
	add	%rax, %rdi		# point RDI to chunk corresponding to (RSI)
	pmovmskb %xmm0, %ecx		# mask of matches
	not	%ecx			# mask of mismatches
	or	%r10d, %ecx		# mask of mismatches or NUL bytes
	tzcnt	%ecx, %ecx		# location of first mismatch
	movzbl	(%rdi, %rcx, 1), %eax
	movzbl	(%rsi, %rcx, 1), %ecx
	sub	%ecx, %eax
	ret

	/*
	 * If (a&0xf) < (b&0xf), we do the same thing but with swapped
	 * operands.  I found that this performs slightly better than
	 * using conditional moves to do the swap branchless.
	 */
.Lswapped:
	movdqu	16(%rdi, %rax, 1), %xmm0
	sub	%rsi, %rdi		# express RDI as distance from RSI
	lea	(%rdi, %rax, 1), %rdx	# point RDX to offset in RDI corresponding to RSI
	neg	%rax			# make difference positive
	pcmpeqb	%xmm2, %xmm1
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm1, %r8d
	pmovmskb %xmm0, %r9d
	add	$16, %rsi		# advance aligned pointers
	test	%r8d, %r8d
	jnz	.Lnul_founds
	xor	$0xffff, %r9d
	jnz	.Lmismatchs
	add	$16, %rsi

	/*
	 * During the main loop, the layout of the two strings is something like:
	 *
	 *        v ------1------ v ------2------ v
	 *   RDI:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
	 *   RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
	 *
	 * where v indicates the alignment boundaries and corresponding chunks
	 * of the strings have the same letters.  Chunk A has been checked in
	 * the previous iteration.  This iteration, we first check that string
	 * RSI doesn't end within region 2, then we compare chunk B between the
	 * two strings.  As RSI is known not to hold a NUL byte in regions 1
	 * and 2 at this point, this also ensures that RDI has not ended yet.
	 */
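	/*
	 * The loop below mirrors the one above with the operands
	 * swapped: RSI is now the aligned pointer that advances, RDI
	 * holds the distance to the aligned chunks of the other string,
	 * and RDX the distance to the chunk matching (%rsi) character
	 * for character.
	 */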
	ALIGN_TEXT
0:	movdqu	(%rsi, %rdx, 1), %xmm0	# chunk of 2nd string corresponding to RDI?
	pxor	%xmm1, %xmm1
	pcmpeqb	(%rsi, %rdi, 1), %xmm1	# end of string in RSI?
	pcmpeqb	(%rsi), %xmm0		# where do the chunks match?
	pmovmskb %xmm1, %r8d
	pmovmskb %xmm0, %r9d
	test	%r8d, %r8d
	jnz	.Lnul_founds
	xor	$0xffff, %r9d		# any mismatches?
	jnz	.Lmismatchs

	/* main loop unrolled twice */
	movdqu	16(%rsi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?
	pxor	%xmm1, %xmm1
	pcmpeqb	16(%rsi, %rdi, 1), %xmm1 # end of string in RSI?
	pcmpeqb	16(%rsi), %xmm0		# where do the chunks match?
	pmovmskb %xmm1, %r8d
	pmovmskb %xmm0, %r9d
	add	$32, %rsi
	test	%r8d, %r8d
	jnz	.Lnul_found2s
	xor	$0xffff, %r9d		# any mismatches?
	jz	0b

	sub	$16, %rsi		# roll back second increment

	/* a mismatch has been found between RDX and RSI */
.Lmismatchs:
	tzcnt	%r9d, %r9d		# where is the mismatch?
	add	%rsi, %rdx		# turn RDX from offset to pointer
	movzbl	(%rdx, %r9, 1), %eax
	movzbl	(%rsi, %r9, 1), %ecx
	sub	%ecx, %eax		# difference of the mismatching chars
	ret

.Lnul_found2s:
	sub	$16, %rsi		# roll back second increment

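	/*
	 * NUL handling below mirrors .Lnul_found above, with RSI as the
	 * base pointer: the NUL mask is shifted by the offset in EAX so
	 * it lines up with the 16 bytes just compared, and any NUL or
	 * mismatch within that window is reported through .Lmismatchs.
	 */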
	/* a NUL has been found in RSI */
.Lnul_founds:
	mov	%eax, %ecx
	mov	%r8d, %r10d
	shl	%cl, %r8w		# adjust NUL mask to positions in RDI/RDX
	xor	$0xffff, %r9d		# mask of mismatches
	or	%r8d, %r9d		# NUL bytes also count as mismatches
	jnz	.Lmismatchs

	/*
	 * (RDI) == (RSI) and NUL is past the string.
	 * Compare (RSI) with the corresponding part
	 * of the other string until the NUL byte.
	 */
	movdqu	(%rsi, %rax, 1), %xmm0
	pcmpeqb	(%rsi, %rdi, 1), %xmm0
	add	%rsi, %rdi		# restore RDI pointer
	add	%rax, %rsi		# point RSI to chunk corresponding to (RDI)
	pmovmskb %xmm0, %ecx		# mask of matches
	not	%ecx			# mask of mismatches
	or	%r10d, %ecx		# mask of mismatches or NUL bytes
	tzcnt	%ecx, %ecx		# location of first mismatch
	movzbl	(%rdi, %rcx, 1), %eax
	movzbl	(%rsi, %rcx, 1), %ecx
	sub	%ecx, %eax
	ret
ARCHEND(strcmp, baseline)

	.section .note.GNU-stack,"",%progbits