/*-
 * Copyright (c) 2018, 2023 The FreeBSD Foundation
 *
 * This software was developed by Mateusz Guzik <mjg@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Portions of this software were developed by Robert Clausecker
 * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/asm.h>
#include <machine/param.h>

#include "amd64_archlevel.h"

/*
 * Syntax: AT&T (GAS), run through the C preprocessor.
 * ABI:    System V AMD64 (arguments in %rdi, %rsi, %rdx; result in %eax).
 *
 * Note: the scalar routine was written with kernel use in mind (read: no
 * simd), it is only present in userspace as a temporary measure until
 * something better gets imported.  The baseline routine further down
 * uses the %xmm registers instead.
 */

#define ALIGN_TEXT	.p2align 4,0x90	/* 16-byte alignment, nop filled */

/* When built with BCMP defined, every symbol below is emitted as bcmp. */
#ifdef BCMP
#define memcmp bcmp
#endif

/*
 * Table of available implementations.  The ARCHFUNCS machinery comes
 * from amd64_archlevel.h; presumably it picks one of the variants at
 * run time -- see that header for the details.
 */
ARCHFUNCS(memcmp)
	ARCHFUNC(memcmp, scalar)
	ARCHFUNC(memcmp, baseline)
ENDARCHFUNCS(memcmp)

/*
 * memcmp(a, b, len) / bcmp(a, b, len), scalar variant.
 *
 * In:    %rdi = a, %rsi = b, %rdx = len
 * Out:   %eax = 0 if the first len bytes are equal.  Otherwise memcmp
 *        yields the difference between the first pair of differing
 *        bytes (both zero-extended); bcmp yields some nonzero value.
 * Clobb: %rdi, %rsi, %rdx, %r8, %r9, flags
 *
 * The numeric labels encode the size class they serve:
 *   100204   2..4 bytes      100408   5..8 bytes
 *   100816   9..16 bytes     101632   17..32 bytes
 *   103200   loop for more than 32 bytes, 32 bytes per iteration
 * Labels with two extra digits (e.g. 10163208) are the mismatch exits
 * of the corresponding size class.  Each size class compares the head
 * and the (possibly overlapping) tail of the buffer, so no byte outside
 * of the buffers is ever read.
 */
ARCHENTRY(memcmp, scalar)
	xorl	%eax,%eax		/* result for "buffers are equal" */
10:
	/* dispatch on the number of bytes left to compare */
	cmpq	$16,%rdx
	ja	101632f

	/* %rdx <= 16 from here on, so byte-sized compares suffice */
	cmpb	$8,%dl
	jg	100816f

	cmpb	$4,%dl
	jg	100408f

	cmpb	$2,%dl
	jge	100204f

	/* zero or one byte */
	cmpb	$1,%dl
	jl	100000f
	movzbl	(%rdi),%eax
	movzbl	(%rsi),%r8d
	subl	%r8d,%eax
100000:
	ret

	/* 9..16 bytes: first and last 8 bytes */
	ALIGN_TEXT
100816:
	movq	(%rdi),%r8
	movq	(%rsi),%r9
	cmpq	%r8,%r9
	jne	80f
	movq	-8(%rdi,%rdx),%r8
	movq	-8(%rsi,%rdx),%r9
	cmpq	%r8,%r9
	jne	10081608f
	ret

	/* 5..8 bytes: first and last 4 bytes */
	ALIGN_TEXT
100408:
	movl	(%rdi),%r8d
	movl	(%rsi),%r9d
	cmpl	%r8d,%r9d
	jne	80f
	movl	-4(%rdi,%rdx),%r8d
	movl	-4(%rsi,%rdx),%r9d
	cmpl	%r8d,%r9d
	jne	10040804f
	ret

	/* 2..4 bytes: first and last 2 bytes */
	ALIGN_TEXT
100204:
	movzwl	(%rdi),%r8d
	movzwl	(%rsi),%r9d
	cmpl	%r8d,%r9d
	jne	1f
	movzwl	-2(%rdi,%rdx),%r8d
	movzwl	-2(%rsi,%rdx),%r9d
	cmpl	%r8d,%r9d
	jne	1f
	ret

	/* 17..32 bytes: first 16 and last 16 bytes, 8 at a time */
	ALIGN_TEXT
101632:
	cmpq	$32,%rdx
	ja	103200f
	movq	(%rdi),%r8
	movq	(%rsi),%r9
	cmpq	%r8,%r9
	jne	80f
	movq	8(%rdi),%r8
	movq	8(%rsi),%r9
	cmpq	%r8,%r9
	jne	10163208f
	movq	-16(%rdi,%rdx),%r8
	movq	-16(%rsi,%rdx),%r9
	cmpq	%r8,%r9
	jne	10163216f
	movq	-8(%rdi,%rdx),%r8
	movq	-8(%rsi,%rdx),%r9
	cmpq	%r8,%r9
	jne	10163224f
	ret

	/*
	 * More than 32 bytes: compare 32 bytes per iteration as two
	 * 16-byte halves.  A half differs iff the OR of its two 8-byte
	 * differences is nonzero.
	 */
	ALIGN_TEXT
103200:
	movq	(%rdi),%r8
	movq	8(%rdi),%r9
	subq	(%rsi),%r8
	subq	8(%rsi),%r9
	orq	%r8,%r9
	jnz	10320000f

	movq	16(%rdi),%r8
	movq	24(%rdi),%r9
	subq	16(%rsi),%r8
	subq	24(%rsi),%r9
	orq	%r8,%r9
	jnz	10320016f

	leaq	32(%rdi),%rdi
	leaq	32(%rsi),%rsi
	subq	$32,%rdx
	cmpq	$32,%rdx
	jae	103200b
	/* fewer than 32 bytes left: done if none, else dispatch again */
	cmpb	$0,%dl
	jne	10b
	ret

/*
 * Mismatch was found.
 */
#ifdef BCMP
	/*
	 * bcmp only reports that the buffers differ.  %eax still holds
	 * the zero from the function entry here, so this returns 1.
	 */
	ALIGN_TEXT
10320016:
10320000:
10081608:
10163224:
10163216:
10163208:
10040804:
80:
1:
	leal	1(%eax),%eax
	ret
#else
/*
 * We need to compute the difference between strings.
 * Start with narrowing the range down (16 -> 8 -> 4 bytes).
 * Each entry point below first moves %rdi/%rsi to the start of the
 * range that is known to contain the mismatch.
 */
	ALIGN_TEXT
10320016:
	leaq	16(%rdi),%rdi
	leaq	16(%rsi),%rsi
10320000:
	/* mismatch within 16 bytes: is it in the first 8 or the last 8? */
	movq	(%rdi),%r8
	movq	(%rsi),%r9
	cmpq	%r8,%r9
	jne	80f
	leaq	8(%rdi),%rdi
	leaq	8(%rsi),%rsi
	jmp	80f
	ALIGN_TEXT
10081608:
10163224:
	leaq	-8(%rdi,%rdx),%rdi
	leaq	-8(%rsi,%rdx),%rsi
	jmp	80f
	ALIGN_TEXT
10163216:
	leaq	-16(%rdi,%rdx),%rdi
	leaq	-16(%rsi,%rdx),%rsi
	jmp	80f
	ALIGN_TEXT
10163208:
	leaq	8(%rdi),%rdi
	leaq	8(%rsi),%rsi
	jmp	80f
	ALIGN_TEXT
10040804:
	leaq	-4(%rdi,%rdx),%rdi
	leaq	-4(%rsi,%rdx),%rsi
	jmp	1f

	/* mismatch within 8 bytes: is it in the first 4 or the last 4? */
	ALIGN_TEXT
80:
	movl	(%rdi),%r8d
	movl	(%rsi),%r9d
	cmpl	%r8d,%r9d
	jne	1f
	leaq	4(%rdi),%rdi
	leaq	4(%rsi),%rsi

/*
 * We have up to 4 bytes to inspect.
 * The mismatch is known to be among them, so the fourth pair of bytes
 * is not compared before subtracting.
 */
1:
	movzbl	(%rdi),%eax
	movzbl	(%rsi),%r8d
	cmpb	%r8b,%al
	jne	2f

	movzbl	1(%rdi),%eax
	movzbl	1(%rsi),%r8d
	cmpb	%r8b,%al
	jne	2f

	movzbl	2(%rdi),%eax
	movzbl	2(%rsi),%r8d
	cmpb	%r8b,%al
	jne	2f

	movzbl	3(%rdi),%eax
	movzbl	3(%rsi),%r8d
2:
	subl	%r8d,%eax
	ret
#endif
ARCHEND(memcmp, scalar)

/*
 * memcmp(a, b, len) / bcmp(a, b, len), baseline variant using 16-byte
 * compares on the %xmm registers.
 *
 * In:    %rdi = a, %rsi = b, %rdx = len
 * Out:   %eax = 0 if the first len bytes are equal.  Otherwise memcmp
 *        yields the difference between the first pair of differing
 *        bytes (both zero-extended); bcmp yields some nonzero value.
 * Clobb: %rcx, %rdx, %rsi, %rdi, %r8, %r9, %xmm0-%xmm3, flags
 * Stack: the short path may store up to 64 bytes below %rsp without
 *        adjusting it (red zone of the System V ABI).
 */
ARCHENTRY(memcmp, baseline)
	cmp		$32, %rdx		# enough to permit use of the long kernel?
	ja		.Llong

	test		%rdx, %rdx		# zero bytes buffer?
	je		.L0

	/*
	 * Compare strings of 1--32 bytes.  We want to do this by
	 * loading 32 bytes of each buffer into two xmm registers and
	 * then comparing, ignoring the bytes past the end.  To avoid
	 * crossing into unmapped pages, we normally load the 32 bytes
	 * from the start of the buffer.  If that overread would cross
	 * a page boundary beyond the last byte of the buffer, the 32
	 * bytes ending at the end of the buffer are copied to the
	 * stack instead and the pointer is redirected into that copy.
	 */

	/* check for page boundaries overreads */
	lea		31(%rdi), %eax		# end of overread
	lea		31(%rsi), %r8d
	lea		-1(%rdi, %rdx, 1), %ecx	# last character in buffer
	lea		-1(%rsi, %rdx, 1), %r9d
	xor		%ecx, %eax
	xor		%r9d, %r8d
	test		$PAGE_SIZE, %eax	# are they on different pages?
	jz		0f

	/*
	 * fix up rdi: the copy ends at -8(%rsp), so that the buffer
	 * contents start at -8(%rsp) - len.  %rsp is 8 modulo 16 on
	 * function entry, which makes the movdqa targets aligned.
	 */
	movdqu		-32(%rdi, %rdx, 1), %xmm0
	movdqu		-16(%rdi, %rdx, 1), %xmm1
	lea		-8(%rsp), %rdi		# end of replacement buffer
	sub		%rdx, %rdi		# start of replacement buffer
	movdqa		%xmm0, -40(%rsp)	# copy to replacement buffer
	movdqa		%xmm1, -24(%rsp)

0:	test		$PAGE_SIZE, %r8d
	jz		0f

	/* fix up rsi: same as above, with the copy ending at -40(%rsp) */
	movdqu		-32(%rsi, %rdx, 1), %xmm0
	movdqu		-16(%rsi, %rdx, 1), %xmm1
	lea		-40(%rsp), %rsi		# end of replacement buffer
	sub		%rdx, %rsi		# start of replacement buffer
	movdqa		%xmm0, -72(%rsp)	# copy to replacement buffer
	movdqa		%xmm1, -56(%rsp)

	/* load data and compare properly */
0:	movdqu		16(%rdi), %xmm1
	movdqu		16(%rsi), %xmm3
	movdqu		(%rdi), %xmm0
	movdqu		(%rsi), %xmm2
	mov		%edx, %ecx
	mov		$-1, %edx
	shl		%cl, %rdx		# ones where the buffer is not
	pcmpeqb		%xmm3, %xmm1
	pcmpeqb		%xmm2, %xmm0
	pmovmskb	%xmm1, %ecx
	pmovmskb	%xmm0, %eax
	shl		$16, %ecx
	or		%ecx, %eax		# ones where the buffers match
	or		%edx, %eax		# including where the buffer is not
	not		%eax			# ones where there is a mismatch
#ifndef BCMP
	bsf		%eax, %edx		# location of the first mismatch
	cmovz		%eax, %edx		# index 0 if there is no mismatch
	movzbl		(%rdi, %rdx, 1), %eax	# mismatching bytes
	movzbl		(%rsi, %rdx, 1), %edx
	sub		%edx, %eax
#endif
	ret

	/* empty input */
.L0:	xor		%eax, %eax
	ret

	/*
	 * compare 33+ bytes
	 *
	 * Register roles from here on:
	 *   %rcx  original a (only needed for a mismatch in the head)
	 *   %rdi  a rounded down to 16 bytes, then the loop cursor
	 *   %rsi  b - a, so that x(%rsi, %rdi) addresses b alongside a
	 *   %rdx  a + len, i.e. one past the last byte of a
	 */
	ALIGN_TEXT
.Llong:	movdqu		(%rdi), %xmm0		# load head
	movdqu		(%rsi), %xmm2
	mov		%rdi, %rcx
	sub		%rdi, %rsi		# express rsi as distance from rdi
	and		$~0xf, %rdi		# align rdi to 16 bytes
	movdqu		16(%rsi, %rdi, 1), %xmm1
	pcmpeqb		16(%rdi), %xmm1		# compare second half of this iteration
	add		%rcx, %rdx		# pointer to end of buffer (a + len)
	jc		.Loverflow		# did a + len wrap around?
.Lhead:	pcmpeqb		%xmm2, %xmm0
	pmovmskb	%xmm0, %eax
	xor		$0xffff, %eax		# any mismatch?
	jne		.Lmismatch_head
	add		$64, %rdi		# advance to next iteration
	jmp		1f			# and get going with the loop

	/*
	 * We get here if a + len does not fit into 64 bits.  No real
	 * buffer wraps around the end of the address space, so such a
	 * length is bogus, but a caller who knows that the buffers
	 * differ may well pass something like SIZE_MAX and expect the
	 * comparison to stop at the first mismatch.  With a wrapped
	 * end pointer the loop below would terminate immediately and
	 * the tail code would load from unrelated addresses.  Clamp
	 * the end pointer to the end of the address space instead so
	 * the loop keeps going until it finds the mismatch.
	 */
.Loverflow:
	mov		$-1, %rdx		# compare until the end of memory
	jmp		.Lhead

	/*
	 * process buffer 32 bytes at a time
	 *
	 * The chunk compared in an iteration ends 32 bytes before the
	 * value %rdi has when the result is checked at 1: below.
	 */
	ALIGN_TEXT
0:	movdqu		-32(%rsi, %rdi, 1), %xmm0
	movdqu		-16(%rsi, %rdi, 1), %xmm1
	pcmpeqb		-32(%rdi), %xmm0
	pcmpeqb		-16(%rdi), %xmm1
	add		$32, %rdi		# advance to next iteration
1:	pand		%xmm0, %xmm1		# 0xff where both halves matched
	pmovmskb	%xmm1, %eax
	cmp		$0xffff, %eax		# all bytes matched?
	jne		.Lmismatch
	cmp		%rdx, %rdi		# end of buffer reached?
	jb		0b

	/*
	 * less than 32 bytes left to compare: check the last 32 bytes
	 * of the buffers, which may overlap with what was already
	 * compared.
	 */
	movdqu		-16(%rdx), %xmm1	# load 32 byte tail through end pointer
	movdqu		-16(%rdx, %rsi, 1), %xmm3
	movdqu		-32(%rdx), %xmm0
	movdqu		-32(%rdx, %rsi, 1), %xmm2
	pcmpeqb		%xmm3, %xmm1
	pcmpeqb		%xmm2, %xmm0
	pmovmskb	%xmm1, %ecx
	pmovmskb	%xmm0, %eax
	shl		$16, %ecx
	or		%ecx, %eax		# ones where the buffers match
	not		%eax			# ones where there is a mismatch
#ifndef BCMP
	bsf		%eax, %ecx		# location of the first mismatch
	cmovz		%eax, %ecx		# index 0 if there is no mismatch
	add		%rcx, %rdx		# pointer to potential mismatch
	movzbl		-32(%rdx), %eax		# mismatching bytes
	movzbl		-32(%rdx, %rsi, 1), %edx
	sub		%edx, %eax
#endif
	ret

#ifdef BCMP
	/* bcmp: any nonzero result will do */
.Lmismatch:
	mov		$1, %eax
.Lmismatch_head:				# eax holds a nonzero mismatch mask here
	ret
#else /* memcmp */
	/* mismatch in the unaligned head; eax has ones at mismatches */
.Lmismatch_head:
	tzcnt		%eax, %eax		# location of mismatch
	add		%rax, %rcx		# pointer to mismatch
	movzbl		(%rcx), %eax		# mismatching bytes
	movzbl		(%rcx, %rsi, 1), %ecx
	sub		%ecx, %eax
	ret

	/*
	 * mismatch in the 32 byte chunk starting at -64(%rdi); xmm0
	 * still holds the result for its first half.
	 */
.Lmismatch:
	movdqu		-48(%rsi, %rdi, 1), %xmm1
	pcmpeqb		-48(%rdi), %xmm1	# reconstruct xmm1 before PAND
	pmovmskb	%xmm0, %eax		# matches in first 16 bytes
	pmovmskb	%xmm1, %edx		# matches in second 16 bytes
	shl		$16, %edx
	or		%edx, %eax		# matches in both
	not		%eax			# mismatches in both
	tzcnt		%eax, %eax		# location of mismatch
	add		%rax, %rdi		# pointer to mismatch
	movzbl		-64(%rdi), %eax		# mismatching bytes
	movzbl		-64(%rdi, %rsi, 1), %ecx
	sub		%ecx, %eax
	ret
#endif
ARCHEND(memcmp, baseline)

	.section .note.GNU-stack,"",%progbits