/* xref: /freebsd/lib/libc/amd64/string/strlen.S (revision 7ef62cebc2f965b0f640263e179276928885e33d) */
/*-
 * Written by Mateusz Guzik <mjg@freebsd.org>
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * Portions of this software were developed by Robert Clausecker
 * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
 *
 * Public domain.
 */

#include <machine/asm.h>
__FBSDID("$FreeBSD$");

#include "amd64_archlevel.h"

/*
 * Note: this routine was written with kernel use in mind (read: no simd),
 * it is only present in userspace as a temporary measure until something
 * better gets imported.
 *
 * NOTE(review): only the scalar variant in this file matches the "no simd"
 * description; the baseline variant operates on %xmm registers.
 */

#define ALIGN_TEXT      .p2align 4,0x90 /* 16-byte alignment, nop filled */

/*
 * The strlen variants implemented in this file: scalar and baseline.
 * ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS are not defined in this file; presumably
 * they come from amd64_archlevel.h and register the variants for run-time
 * selection -- confirm against that header.
 */
ARCHFUNCS(strlen)
	ARCHFUNC(strlen, scalar)
	ARCHFUNC(strlen, baseline)
ENDARCHFUNCS(strlen)

/*
 * strlen(string)
 *	  %rdi
 *
 * Uses the ((x - 0x01....01) & ~x & 0x80....80) trick.
 *
 * 0x01....01 is replaced with 0x0 - 0x01....01 so that it can be added
 * with leaq.
 *
 * For a description see either:
 * - "Hacker's Delight" by Henry S. Warren, Jr.
 * - "Optimizing subroutines in assembly language: An optimization guide for x86 platforms"
 *   by Agner Fog
 *
 * The latter contains a 32-bit variant of the same algorithm coded in assembly for i386.
 *
 * Register use:
 *	%rdi	8-byte aligned read pointer
 *	%r10	original string pointer, subtracted at the end
 *	%r8	0x0 - 0x0101010101010101
 *	%r9	0x8080808080808080
 *	%r11	word under test (complemented in place)
 *	%rcx	head shift count, then the zero-byte detection mask
 *	%rdx	fill mask for the bytes in front of a misaligned string
 *	%rax	result: length of the string
 *
 * Only general purpose registers are used and nothing is stored to memory.
 * The string is read in whole aligned 8-byte words, so bytes in front of
 * the string and behind the terminator that share a word with it are
 * loaded as well.
 */
ARCHENTRY(strlen, scalar)
	movabsq	$0xfefefefefefefeff,%r8		/* 0x0 - 0x0101010101010101 */
	movabsq	$0x8080808080808080,%r9		/* top bit of every byte */

	movq	%rdi,%r10			/* keep the original pointer */
	movq	%rdi,%rcx
	testb	$7,%dil
	jz	2f				/* aligned: enter the loop directly */

	/*
	 * Handle misaligned reads: align to 8 and fill
	 * the spurious bytes.
	 */
	andq	$~7,%rdi
	movq	(%rdi),%r11
	shlq	$3,%rcx				/* pointer * 8; the shift below only uses */
						/* the low 6 bits: 8 * (pointer & 7) */
	movq	$-1,%rdx
	shlq	%cl,%rdx
	notq	%rdx				/* ones in the bytes in front of the string */
	orq	%rdx,%r11			/* make those bytes 0xff, i.e. non-zero */

	leaq	(%r11,%r8),%rcx			/* x - 0x01....01 */
	notq	%r11				/* ~x */
	andq	%r11,%rcx
	andq	%r9,%rcx			/* non-zero iff the word has a zero byte */
	jnz	3f

	/*
	 * Main loop.
	 */
	ALIGN_TEXT
1:
	leaq	8(%rdi),%rdi			/* advance to the next word */
2:
	movq	(%rdi),%r11
	leaq	(%r11,%r8),%rcx			/* same zero-byte test as for the head */
	notq	%r11
	andq	%r11,%rcx
	andq	%r9,%rcx
	jz	1b
3:
	/*
	 * %rdi points at the word holding the terminator. The lowest set
	 * bit of %rcx is the top bit of the first zero byte; bits above it
	 * are not used.
	 */
	bsfq	%rcx,%rcx
	shrq	$3,%rcx				/* bit index -> byte index */
	leaq	(%rcx,%rdi),%rax		/* address of the terminator */
	subq	%r10,%rax			/* minus start of string: the length */
	ret
ARCHEND(strlen, scalar)

/*
 * strlen(string), baseline variant
 *	  %rdi
 *
 * Looks for the terminator 16 bytes at a time: pcmpeqb against an all-zero
 * %xmm1 marks every NUL byte, pmovmskb turns that into one mask bit per
 * byte.  All loads are from 16-byte aligned addresses.  The first load is
 * the aligned chunk containing the start of the string, so it can include
 * bytes in front of the string; their mask bits are shifted out.
 *
 * Register use:
 *	%rsi	original string pointer
 *	%rdi	aligned scan pointer; from the first add onwards it is kept
 *		biased so that the loop tests -16(%rdi) and then (%rdi)
 *	%ecx	string pointer & 0xf, shift count for the head mask
 *	%xmm1	zeroed before each compare, compare result afterwards
 *	%eax	NUL byte mask, then the result
 *
 * %eax is non-zero whenever a tzcnt below is reached, so the result is the
 * same on CPUs that execute the tzcnt encoding as bsf.
 */
ARCHENTRY(strlen, baseline)
	mov	%rdi, %rcx
	pxor	%xmm1, %xmm1
	and	$~0xf, %rdi			# align string
	pcmpeqb	(%rdi), %xmm1			# compare head (with junk before string)
	mov	%rcx, %rsi			# string pointer copy for later
	and	$0xf, %ecx			# amount of bytes rdi is past 16 byte alignment
	pmovmskb %xmm1, %eax
	add	$32, %rdi			# advance to next iteration
	shr	%cl, %eax			# clear out matches in junk bytes
	test	%eax, %eax			# any match? (can't use ZF from SHR as CL=0 is possible)
	jnz	2f

	/* at 1: the next chunk to examine is the one at -16(%rdi) */
	ALIGN_TEXT
1:	pxor	%xmm1, %xmm1
	pcmpeqb	-16(%rdi), %xmm1		# find NUL bytes
	pmovmskb %xmm1, %eax
	test	%eax, %eax			# were any NUL bytes present?
	jnz	3f

	/* the same unrolled once more */
	pxor	%xmm1, %xmm1
	pcmpeqb	(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	add	$32, %rdi			# advance to next iteration
	test	%eax, %eax
	jz	1b

	/* match found in loop body */
	sub	$16, %rdi			# undo half the advancement

	/* at 3: %rdi is 16 past the start of the chunk holding the NUL byte */
3:	tzcnt	%eax, %eax			# find the first NUL byte
	sub	%rsi, %rdi			# string length until beginning of (%rdi)
	lea	-16(%rdi, %rax, 1), %rax	# that plus loc. of NUL byte: full string length
	ret

	/* match found in head */
	/* after the shift, mask bit 0 belongs to the first byte of the string */
2:	tzcnt	%eax, %eax			# compute string length
	ret
ARCHEND(strlen, baseline)

/* By GNU toolchain convention an empty .note.GNU-stack section marks this object as not needing an executable stack. */
	.section .note.GNU-stack,"",%progbits
