/*-
 * Copyright (c) 2023, The FreeBSD Foundation
 *
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Portions of this software were developed by Robert Clausecker
 * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
 *
 * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcmp.S
 * written by J.T. Conklin <jtc@acorntoolworks.com> that was originally
 * dedicated to the public domain.
 */

#include <machine/asm.h>
#include <machine/param.h>

#if 0
	RCSID("$NetBSD: strcmp.S,v 1.3 2004/07/19 20:04:41 drochner Exp $")
#endif

#include "amd64_archlevel.h"

/* pad hot branch targets to a 16 byte boundary with NOPs (0x90) */
#define ALIGN_TEXT	.p2align 4, 0x90

/*
 * Dispatch table: resolve strcmp to the best implementation the CPU
 * supports (machinery provided by amd64_archlevel.h).
 */
ARCHFUNCS(strcmp)
	ARCHFUNC(strcmp, scalar)
	ARCHFUNC(strcmp, baseline)
ENDARCHFUNCS(strcmp)

/*
 * int strcmp(const char *s1, const char *s2) -- scalar fallback
 *
 * SysV AMD64 ABI: s1 in RDI, s2 in RSI, result in RAX.
 * Returns <0, 0, or >0 as the byte difference s1[i] - s2[i] at the
 * first position where the strings differ (or where s1 ends).
 * Leaf function; clobbers RAX, RDX, R8, R9 and flags only.
 */
ARCHENTRY(strcmp, scalar)
	/*
	 * Align s1 to word boundary.
	 * Consider unrolling loop?
	 */
.Ls1align:
	testb	$7,%dil
	je	.Ls1aligned
	movb	(%rdi),%al
	incq	%rdi
	movb	(%rsi),%dl
	incq	%rsi
	testb	%al,%al			# end of s1 inside the head?
	je	.Ldone
	cmpb	%al,%dl			# bytes differ inside the head?
	je	.Ls1align
	jmp	.Ldone

	/*
	 * Check whether s2 is aligned to a word boundary.  If it is, we
	 * can compare by words.  Otherwise we have to compare by bytes.
	 */
.Ls1aligned:
	testb	$7,%sil
	jne	.Lbyte_loop

	/* constants for the "word has a NUL byte" test below */
	movabsq	$0x0101010101010101,%r8
	subq	$8,%rdi			# bias pointers so the loop can
	movabsq	$0x8080808080808080,%r9
	subq	$8,%rsi			# ... load from offset 8 up front

	ALIGN_TEXT
.Lword_loop:
	movq	8(%rdi),%rax
	addq	$8,%rdi
	movq	8(%rsi),%rdx
	addq	$8,%rsi
	cmpq	%rax,%rdx		# words differ? settle it bytewise
	jne	.Lbyte_loop
	/* NUL detection idiom: (x - 0x01..01) & ~x & 0x80..80 != 0 */
	subq	%r8,%rdx
	notq	%rax
	andq	%rax,%rdx
	testq	%r9,%rdx
	je	.Lword_loop

	ALIGN_TEXT
.Lbyte_loop:
	movb	(%rdi),%al
	incq	%rdi
	movb	(%rsi),%dl
	incq	%rsi
	testb	%al,%al			# end of s1?
	je	.Ldone
	cmpb	%al,%dl			# mismatch?
	je	.Lbyte_loop

.Ldone:
	/* result is the unsigned byte difference, sign-extended via 64 bit sub */
	movzbq	%al,%rax
	movzbq	%dl,%rdx
	subq	%rdx,%rax
	ret
ARCHEND(strcmp, scalar)

/*
 * int strcmp(const char *s1, const char *s2) -- SSE2 baseline
 *
 * SysV AMD64 ABI: s1 in RDI, s2 in RSI, result in RAX.
 * Compares 16 bytes at a time with PCMPEQB/PMOVMSKB.  The unaligned
 * head loads are only done when they cannot cross into an unmapped
 * page; otherwise the heads are re-read from aligned copies stashed
 * in the red zone.  Leaf function; clobbers RAX, RCX, RDX, R8-R11,
 * XMM0-XMM4 and flags.
 */
ARCHENTRY(strcmp, baseline)
	/* check if either string crosses a page in the head */
	lea	15(%rdi), %r8d		# end of head
	lea	15(%rsi), %r9d
	mov	%edi, %eax
	mov	%esi, %edx
	xor	%edi, %r8d		# bits that changed between first and last byte
	xor	%esi, %r9d
	and	$~0xf, %rdi		# align heads to 16 bytes
	and	$~0xf, %rsi
	or	%r8d, %r9d		# in either RSI or RDI
	and	$0xf, %eax		# offset from alignment
	and	$0xf, %edx
	pxor	%xmm1, %xmm1		# XMM1 = 0, for NUL detection
	test	$PAGE_SIZE, %r9d	# did the page change?
	jz	0f			# if not, take fast path

	/* heads may cross page boundary, avoid unmapped loads */
	movdqa	(%rdi), %xmm0		# load aligned heads
	movdqa	(%rsi), %xmm2
	mov	$-1, %r8d
	mov	$-1, %r9d
	mov	%eax, %ecx
	shl	%cl, %r8d		# mask of bytes belonging to the head in XMM0
	mov	%edx, %ecx
	shl	%cl, %r9d		# mask of bytes belonging to the head in XMM2
	movdqa	%xmm0, -40(%rsp)	# stash copies of the heads in the red zone
	movdqa	%xmm2, -24(%rsp)
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm0, %r10d
	pmovmskb %xmm2, %r11d
	test	%r8d, %r10d		# NUL byte present in first string?
	lea	-40(%rsp), %r8
	cmovz	%rdi, %r8		# if not, a plain unaligned load is safe
	test	%r9d, %r11d		# NUL byte present in second string?
	lea	-24(%rsp), %r9
	cmovz	%rsi, %r9
	movdqu	(%r8, %rax, 1), %xmm0	# load true (or fake) heads
	movdqu	(%r9, %rdx, 1), %xmm4
	jmp	1f

0:	movdqu	(%rdi, %rax, 1), %xmm0	# load true heads
	movdqu	(%rsi, %rdx, 1), %xmm4
1:	pxor	%xmm2, %xmm2
	pcmpeqb	%xmm0, %xmm2		# NUL byte present?
	pcmpeqb	%xmm0, %xmm4		# which bytes match?
	pandn	%xmm4, %xmm2		# match and not NUL byte?
	pmovmskb %xmm2, %r9d
	xor	$0xffff, %r9d		# mismatch or NUL byte?
	jnz	.Lhead_mismatch

	/* load head and second chunk */
	movdqa	16(%rdi), %xmm2		# load second chunks
	movdqa	16(%rsi), %xmm3
	sub	%rdx, %rax		# is a&0xf >= b&0xf?
	jb	.Lswapped		# if not, proceed with swapped operands

	neg	%rax
	movdqu	16(%rsi, %rax, 1), %xmm0
	sub	%rdi, %rsi		# express RSI as distance from RDI
	lea	(%rsi, %rax, 1), %rdx	# point RDX to offset in second string
	neg	%rax
	pcmpeqb	%xmm3, %xmm1		# NUL in RSI chunk ... corresponding to RDI
	pcmpeqb	%xmm2, %xmm0
	pmovmskb %xmm1, %r8d
	pmovmskb %xmm0, %r9d
	add	$16, %rdi		# point RDI to the chunk just checked
	test	%r8d, %r8d
	jnz	.Lnul_found
	xor	$0xffff, %r9d
	jnz	.Lmismatch
	add	$16, %rdi		# advance aligned pointers

	/*
	 * During the main loop, the layout of the two strings is something like:
	 *
	 *          v ------1------ v ------2------ v
	 *     RDI:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
	 *     RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
	 *
	 * where v indicates the alignment boundaries and corresponding chunks
	 * of the strings have the same letters.  Chunk A has been checked in
	 * the previous iteration.  This iteration, we first check that string
	 * RSI doesn't end within region 2, then we compare chunk B between the
	 * two strings.  As RSI is known not to hold a NUL byte in regions 1
	 * and 2 at this point, this also ensures that RDI has not ended yet.
	 */
	ALIGN_TEXT
0:	movdqu	(%rdi, %rdx, 1), %xmm0	# chunk of 2nd string corresponding to RDI?
	pxor	%xmm1, %xmm1
	pcmpeqb	(%rdi, %rsi, 1), %xmm1	# end of string in RSI?
	pcmpeqb	(%rdi), %xmm0		# where do the chunks match?
	pmovmskb %xmm1, %r8d
	pmovmskb %xmm0, %r9d
	test	%r8d, %r8d
	jnz	.Lnul_found
	xor	$0xffff, %r9d		# any mismatches?
	jnz	.Lmismatch

	/* main loop unrolled twice */
	movdqu	16(%rdi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?
	pxor	%xmm1, %xmm1
	pcmpeqb	16(%rdi, %rsi, 1), %xmm1 # end of string in RSI?
	pcmpeqb	16(%rdi), %xmm0		# where do the chunks match?
	pmovmskb %xmm1, %r8d
	pmovmskb %xmm0, %r9d
	add	$32, %rdi
	test	%r8d, %r8d
	jnz	.Lnul_found2
	xor	$0xffff, %r9d		# any mismatches?
	jz	0b

	sub	$16, %rdi		# roll back second increment

	/* a mismatch was found between the chunks at RDI and RDX */
.Lmismatch:
	tzcnt	%r9d, %r9d		# where is the mismatch?
	add	%rdi, %rdx		# turn RDX from offset to pointer
	movzbl	(%rdx, %r9, 1), %ecx
	movzbl	(%rdi, %r9, 1), %eax
	sub	%ecx, %eax		# difference of the mismatching chars
	ret

	/* mismatch in true heads */
.Lhead_mismatch:
	tzcnt	%r9d, %r9d		# where is the mismatch?
	add	%rax, %rdi		# return to true heads
	add	%rdx, %rsi
	movzbl	(%rdi, %r9, 1), %eax	# mismatching characters
	movzbl	(%rsi, %r9, 1), %ecx
	sub	%ecx, %eax
	ret

.Lnul_found2:
	sub	$16, %rdi		# roll back second increment

	/* a NUL has been found in RSI */
.Lnul_found:
	mov	%eax, %ecx
	mov	%r8d, %r10d		# keep the raw NUL mask for later
	shl	%cl, %r8w		# adjust NUL mask to positions in RDI/RDX
	xor	$0xffff, %r9d		# mask of mismatches
	or	%r8d, %r9d		# NUL bytes also count as mismatches
	jnz	.Lmismatch

	/*
	 * (RDI) == (RSI) and NUL is past the string.
	 * Compare (RSI) with the corresponding part
	 * of the other string until the NUL byte.
	 */
	movdqu	(%rdi, %rax, 1), %xmm0
	pcmpeqb	(%rdi, %rsi, 1), %xmm0
	add	%rdi, %rsi		# restore RSI pointer
	add	%rax, %rdi		# point RDI to chunk corresponding to (RSI)
	pmovmskb %xmm0, %ecx		# mask of matches
	not	%ecx			# mask of mismatches
	or	%r10d, %ecx		# mask of mismatches or NUL bytes
	tzcnt	%ecx, %ecx		# location of first mismatch
	movzbl	(%rdi, %rcx, 1), %eax
	movzbl	(%rsi, %rcx, 1), %ecx
	sub	%ecx, %eax
	ret

	/*
	 * If (a&0xf) < (b&0xf), we do the same thing but with swapped
	 * operands.  I found that this performs slightly better than
	 * using conditional moves to do the swap branchless.
	 */
.Lswapped:
	movdqu	16(%rdi, %rax, 1), %xmm0
	sub	%rsi, %rdi		# express RDI as distance from RSI
	lea	(%rdi, %rax, 1), %rdx	# point RDX to offset in RDI corresponding to RSI
	neg	%rax			# make difference positive
	pcmpeqb	%xmm2, %xmm1
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm1, %r8d
	pmovmskb %xmm0, %r9d
	add	$16, %rsi		# point RSI to the chunk just checked
	test	%r8d, %r8d
	jnz	.Lnul_founds
	xor	$0xffff, %r9d
	jnz	.Lmismatchs
	add	$16, %rsi		# advance aligned pointers

	/*
	 * During the main loop, the layout of the two strings is something like:
	 *
	 *          v ------1------ v ------2------ v
	 *     RDI:    AAAAAAAAAAAAABBBBBBBBBBBBBBBB...
	 *     RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC...
	 *
	 * where v indicates the alignment boundaries and corresponding chunks
	 * of the strings have the same letters.  Chunk A has been checked in
	 * the previous iteration.  This iteration, we first check that string
	 * RSI doesn't end within region 2, then we compare chunk B between the
	 * two strings.  As RSI is known not to hold a NUL byte in regions 1
	 * and 2 at this point, this also ensures that RDI has not ended yet.
	 */
	ALIGN_TEXT
0:	movdqu	(%rsi, %rdx, 1), %xmm0	# chunk of 2nd string corresponding to RDI?
	pxor	%xmm1, %xmm1
	pcmpeqb	(%rsi, %rdi, 1), %xmm1	# end of string in RSI?
	pcmpeqb	(%rsi), %xmm0		# where do the chunks match?
	pmovmskb %xmm1, %r8d
	pmovmskb %xmm0, %r9d
	test	%r8d, %r8d
	jnz	.Lnul_founds
	xor	$0xffff, %r9d		# any mismatches?
	jnz	.Lmismatchs

	/* main loop unrolled twice */
	movdqu	16(%rsi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI?
	pxor	%xmm1, %xmm1
	pcmpeqb	16(%rsi, %rdi, 1), %xmm1 # end of string in RSI?
	pcmpeqb	16(%rsi), %xmm0		# where do the chunks match?
	pmovmskb %xmm1, %r8d
	pmovmskb %xmm0, %r9d
	add	$32, %rsi
	test	%r8d, %r8d
	jnz	.Lnul_found2s
	xor	$0xffff, %r9d		# any mismatches?
	jz	0b

	sub	$16, %rsi		# roll back second increment

	/* a mismatch was found between the chunks at RSI and RDX */
.Lmismatchs:
	tzcnt	%r9d, %r9d		# where is the mismatch?
	add	%rsi, %rdx		# turn RDX from offset to pointer
	movzbl	(%rdx, %r9, 1), %eax	# operands swapped: RDX side is s1
	movzbl	(%rsi, %r9, 1), %ecx
	sub	%ecx, %eax		# difference of the mismatching chars
	ret

.Lnul_found2s:
	sub	$16, %rsi		# roll back second increment

	/* a NUL has been found in RSI */
.Lnul_founds:
	mov	%eax, %ecx
	mov	%r8d, %r10d		# keep the raw NUL mask for later
	shl	%cl, %r8w		# adjust NUL mask to positions in RDI/RDX
	xor	$0xffff, %r9d		# mask of mismatches
	or	%r8d, %r9d		# NUL bytes also count as mismatches
	jnz	.Lmismatchs

	/*
	 * (RDI) == (RSI) and NUL is past the string.
	 * Compare (RSI) with the corresponding part
	 * of the other string until the NUL byte.
	 */
	movdqu	(%rsi, %rax, 1), %xmm0
	pcmpeqb	(%rsi, %rdi, 1), %xmm0
	add	%rsi, %rdi		# restore RDI pointer
	add	%rax, %rsi		# point RSI to chunk corresponding to (RDI)
	pmovmskb %xmm0, %ecx		# mask of matches
	not	%ecx			# mask of mismatches
	or	%r10d, %ecx		# mask of mismatches or NUL bytes
	tzcnt	%ecx, %ecx		# location of first mismatch
	movzbl	(%rdi, %rcx, 1), %eax
	movzbl	(%rsi, %rcx, 1), %ecx
	sub	%ecx, %eax
	ret
ARCHEND(strcmp, baseline)

	.section .note.GNU-stack,"",%progbits