/*- * Written by Mateusz Guzik * Copyright (c) 2023 The FreeBSD Foundation * * Portions of this software were developed by Robert Clausecker * under sponsorship from the FreeBSD Foundation. * * Public domain. */ #include __FBSDID("$FreeBSD$"); #include "amd64_archlevel.h" /* * Note: this routine was written with kernel use in mind (read: no simd), * it is only present in userspace as a temporary measure until something * better gets imported. */ #define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */ ARCHFUNCS(strlen) ARCHFUNC(strlen, scalar) ARCHFUNC(strlen, baseline) ENDARCHFUNCS(strlen) /* * strlen(string) * %rdi * * Uses the ((x - 0x01....01) & ~x & 0x80....80) trick. * * 0x01....01 is replaced with 0x0 - 0x01....01 so that it can be added * with leaq. * * For a description see either: * - "Hacker's Delight" by Henry S. Warren, Jr. * - "Optimizing subroutines in assembly language: An optimization guide for x86 platforms" * by Agner Fog * * The latter contains a 32-bit variant of the same algorithm coded in assembly for i386. */ ARCHENTRY(strlen, scalar) movabsq $0xfefefefefefefeff,%r8 movabsq $0x8080808080808080,%r9 movq %rdi,%r10 movq %rdi,%rcx testb $7,%dil jz 2f /* * Handle misaligned reads: align to 8 and fill * the spurious bytes. */ andq $~7,%rdi movq (%rdi),%r11 shlq $3,%rcx movq $-1,%rdx shlq %cl,%rdx notq %rdx orq %rdx,%r11 leaq (%r11,%r8),%rcx notq %r11 andq %r11,%rcx andq %r9,%rcx jnz 3f /* * Main loop. */ ALIGN_TEXT 1: leaq 8(%rdi),%rdi 2: movq (%rdi),%r11 leaq (%r11,%r8),%rcx notq %r11 andq %r11,%rcx andq %r9,%rcx jz 1b 3: bsfq %rcx,%rcx shrq $3,%rcx leaq (%rcx,%rdi),%rax subq %r10,%rax ret ARCHEND(strlen, scalar) ARCHENTRY(strlen, baseline) mov %rdi, %rcx pxor %xmm1, %xmm1 and $~0xf, %rdi # align string pcmpeqb (%rdi), %xmm1 # compare head (with junk before string) mov %rcx, %rsi # string pointer copy for later and $0xf, %ecx # amount of bytes rdi is past 16 byte alignment pmovmskb %xmm1, %eax add $32, %rdi # advance to next iteration shr %cl, %eax # clear out matches in junk bytes test %eax, %eax # any match? (can't use ZF from SHR as CL=0 is possible) jnz 2f ALIGN_TEXT 1: pxor %xmm1, %xmm1 pcmpeqb -16(%rdi), %xmm1 # find NUL bytes pmovmskb %xmm1, %eax test %eax, %eax # were any NUL bytes present? jnz 3f /* the same unrolled once more */ pxor %xmm1, %xmm1 pcmpeqb (%rdi), %xmm1 pmovmskb %xmm1, %eax add $32, %rdi # advance to next iteration test %eax, %eax jz 1b /* match found in loop body */ sub $16, %rdi # undo half the advancement 3: tzcnt %eax, %eax # find the first NUL byte sub %rsi, %rdi # string length until beginning of (%rdi) lea -16(%rdi, %rax, 1), %rax # that plus loc. of NUL byte: full string length ret /* match found in head */ 2: tzcnt %eax, %eax # compute string length ret ARCHEND(strlen, baseline) .section .note.GNU-stack,"",%progbits