xref: /freebsd/lib/libc/amd64/string/strlen.S (revision 5ca8e32633c4ffbbcd6762e5888b6a4ba0708c6c)
/*-
 * Written by Mateusz Guzik <mjg@freebsd.org>
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * Portions of this software were developed by Robert Clausecker
 * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
 *
 * Public domain.
 */
10
#include <machine/asm.h>
#include "amd64_archlevel.h"
13
/*
 * Note: the scalar routine was written with kernel use in mind (read: no
 * SIMD); it is only present in userspace as a temporary measure until
 * something better gets imported.
 */
19
#define ALIGN_TEXT      .p2align 4,0x90 /* 16-byte alignment, nop filled */
21
/*
 * List of the strlen implementations provided by this file.  Each
 * ARCHFUNC entry names one ARCHENTRY(strlen, ...) routine defined below.
 * NOTE(review): the ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS macros are presumably
 * supplied by amd64_archlevel.h and used to pick a variant at run time;
 * confirm against that header.
 */
ARCHFUNCS(strlen)
	ARCHFUNC(strlen, scalar)
	ARCHFUNC(strlen, baseline)
ENDARCHFUNCS(strlen)
26
/*
 * strlen(string)
 *	  %rdi
 *
 * Uses the ((x - 0x01....01) & ~x & 0x80....80) trick.
 *
 * 0x01....01 is replaced with 0x0 - 0x01....01 so that it can be added
 * with leaq.
 *
 * For a description see either:
 * - "Hacker's Delight" by Henry S. Warren, Jr.
 * - "Optimizing subroutines in assembly language: An optimization guide for x86 platforms"
 *   by Agner Fog
 *
 * The latter contains a 32-bit variant of the same algorithm coded in assembly for i386.
 */
ARCHENTRY(strlen, scalar)
	/*
	 * In:   %rdi = pointer to the NUL-terminated string
	 * Out:  %rax = number of bytes before the first NUL
	 *
	 * Register use (all of them caller-saved):
	 *   %rdi  8-byte aligned read position
	 *   %r10  original string pointer, subtracted at the end
	 *   %r8   0 - 0x01....01, so that leaq performs the subtraction
	 *   %r9   0x80....80, selects the top bit of every byte
	 *   %r11  quadword under examination (complemented in place)
	 *   %rcx  shift count first, afterwards the NUL indicator bits
	 *   %rdx  fill mask for the bytes in front of a misaligned string
	 */
	movabsq	$0xfefefefefefefeff,%r8
	movabsq	$0x8080808080808080,%r9

	movq	%rdi,%r10
	movq	%rdi,%rcx
	testb	$7,%dil
	jz	2f			/* already aligned: go straight to the loop */

	/*
	 * Handle misaligned reads: align to 8 and fill
	 * the spurious bytes.
	 *
	 * A 64-bit shift only uses the low six bits of %cl, so after
	 * "shlq $3" the count is 8 * (string & 7), i.e. the number of bits
	 * that precede the string inside its quadword.  These bits are set
	 * to one so that the bytes in front of the string never test as NUL.
	 */
	andq	$~7,%rdi
	movq	(%rdi),%r11
	shlq	$3,%rcx
	movq	$-1,%rdx
	shlq	%cl,%rdx
	notq	%rdx
	orq	%rdx,%r11

	/* %rcx = (x - 0x01....01) & ~x & 0x80....80; non-zero iff x has a NUL */
	leaq	(%r11,%r8),%rcx
	notq	%r11
	andq	%r11,%rcx
	andq	%r9,%rcx
	jnz	3f

	/*
	 * Main loop.
	 *
	 * Entered at 2 when the string starts aligned, or by falling
	 * through to 1 once the first partial quadword held no NUL.
	 */
	ALIGN_TEXT
1:
	leaq	8(%rdi),%rdi
2:
	movq	(%rdi),%r11
	leaq	(%r11,%r8),%rcx
	notq	%r11
	andq	%r11,%rcx
	andq	%r9,%rcx
	jz	1b
3:
	/*
	 * The lowest set indicator bit belongs to the first NUL byte.
	 * Turn its bit index into a byte index, add the address of the
	 * quadword and subtract the start of the string.
	 */
	bsfq	%rcx,%rcx
	shrq	$3,%rcx
	leaq	(%rcx,%rdi),%rax
	subq	%r10,%rax
	ret
ARCHEND(strlen, scalar)
90
ARCHENTRY(strlen, baseline)
	/*
	 * In:   %rdi = pointer to the NUL-terminated string
	 * Out:  %rax = number of bytes before the first NUL
	 *
	 * Register use (all of them caller-saved):
	 *   %rdi   16-byte aligned read cursor, kept ahead of the data so
	 *          that the loop addresses -16(%rdi) and (%rdi)
	 *   %rsi   original string pointer
	 *   %ecx   offset of the string inside its first 16-byte chunk
	 *   %xmm1  zeroed before every compare; pcmpeqb then marks NUL bytes
	 *   %eax   pmovmskb result: bit i is set when byte i is NUL
	 *
	 * Every load is 16-byte aligned, as pcmpeqb with a memory operand
	 * requires.  %eax is non-zero on every path reaching a tzcnt.
	 */
	mov	%rdi, %rcx
	pxor	%xmm1, %xmm1
	and	$~0xf, %rdi			# align string
	pcmpeqb	(%rdi), %xmm1			# compare head (with junk before string)
	mov	%rcx, %rsi			# string pointer copy for later
	and	$0xf, %ecx			# amount of bytes rdi is past 16 byte alignment
	pmovmskb %xmm1, %eax
	add	$32, %rdi			# advance to next iteration
	shr	%cl, %eax			# clear out matches in junk bytes
	test	%eax, %eax			# any match? (can't use ZF from SHR as CL=0 is possible)
	jnz	2f

	/* two chunks per iteration: -16(%rdi) first, then (%rdi) */
	ALIGN_TEXT
1:	pxor	%xmm1, %xmm1
	pcmpeqb	-16(%rdi), %xmm1		# find NUL bytes
	pmovmskb %xmm1, %eax
	test	%eax, %eax			# were any NUL bytes present?
	jnz	3f

	/* the same unrolled once more */
	pxor	%xmm1, %xmm1
	pcmpeqb	(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	add	$32, %rdi			# advance to next iteration
	test	%eax, %eax
	jz	1b

	/* match found in loop body */
	sub	$16, %rdi			# undo half the advancement

	/* here %rdi points 16 bytes past the chunk that holds the NUL */
3:	tzcnt	%eax, %eax			# find the first NUL byte
	sub	%rsi, %rdi			# string length until beginning of (%rdi)
	lea	-16(%rdi, %rax, 1), %rax	# that plus loc. of NUL byte: full string length
	ret

	/* match found in head */
2:	tzcnt	%eax, %eax			# compute string length
	ret
ARCHEND(strlen, baseline)
130
/* Empty .note.GNU-stack section: this object does not need an executable stack. */
	.section .note.GNU-stack,"",%progbits
132