xref: /illumos-gate/usr/src/lib/libc/amd64/gen/strlen.S (revision ddb365bfc9e868ad24ccdcb0dc91af18b10df082)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2009, Intel Corporation
24 * All rights reserved.
25 */
26
27/*
28 *	strlen - calculate the length of string
29 */
30
31#include "SYS.h"
32#include "proc64_id.h"
33
34#define LABEL(s) .strlen##s
35
36	/*
37	 * This implementation uses SSE instructions to compare up to 16 bytes
38	 * at a time looking for the end of string (null char).
39	 */
40	ENTRY(strlen)			/* (const char *s) */
41	mov	%rdi, %rsi		/* keep original %rdi value */
42	mov	%rsi, %rcx
43	pxor	%xmm0, %xmm0		/* 16 null chars */
44	and	$15, %rcx
45	jz	LABEL(align16_loop)	/* string is 16 byte aligned */
46
47	/*
48	 * Unaligned case. Round down to 16-byte boundary before comparing
49	 * 16 bytes for a null char. The code then compensates for any extra chars
50	 * preceding the start of the string.
51	 */
52LABEL(unalign16):
53	and	$0xfffffffffffffff0, %rsi
54
55	pcmpeqb	(%rsi), %xmm0
56	lea	16(%rdi), %rsi
57	pmovmskb %xmm0, %edx
58
59	shr	%cl, %edx		/* Compensate for bytes preceding the string */
60	test	%edx, %edx
61	jnz	LABEL(exit)
62	sub	%rcx, %rsi		/* no null, adjust to next 16-byte boundary */
63	pxor	%xmm0, %xmm0		/* clear xmm0, may have been changed... */
64
65	.p2align 4
66LABEL(align16_loop):			/* 16 byte aligned */
67	pcmpeqb	(%rsi), %xmm0		/* look for null bytes */
68	pmovmskb %xmm0, %edx		/* move each byte mask of %xmm0 to edx */
69
70	add	$16, %rsi		/* prepare to search next 16 bytes */
71	test	%edx, %edx		/* if no null byte, %edx must be 0 */
72	jnz	LABEL(exit)		/* found a null */
73
74	pcmpeqb	(%rsi), %xmm0
75	pmovmskb %xmm0, %edx
76	add	$16, %rsi
77	test	%edx, %edx
78	jnz	LABEL(exit)
79
80	pcmpeqb	(%rsi), %xmm0
81	pmovmskb %xmm0, %edx
82	add	$16, %rsi
83	test	%edx, %edx
84	jnz	LABEL(exit)
85
86	pcmpeqb	(%rsi), %xmm0
87	pmovmskb %xmm0, %edx
88	add	$16, %rsi
89	test	%edx, %edx
90	jz	LABEL(align16_loop)
91
92	.p2align 4
93LABEL(exit):
94	neg	%rdi
95	/*
96	 * Check to see if BSF is fast on this processor. If not, use a different
97	 * exit tail to find first bit set indicating null byte match.
98	 */
99	testl	$USE_BSF, .memops_method(%rip)
100	jz	LABEL(AMD_exit)
101
102	lea	-16(%rdi, %rsi), %rax	/* calculate exact offset */
103	bsf	%edx, %ecx		/* Least significant 1 bit is index of null */
104	lea	(%rax, %rcx),%rax
105	ret
106
107	/*
108	 * This exit tail does not use the bsf instruction.
109	 */
110	.p2align 4
111LABEL(AMD_exit):
112	lea	-16(%rdi, %rsi), %rax
113	test	%dl, %dl
114	jz	LABEL(exit_high)
115	test	$0x01, %dl
116	jnz	LABEL(exit_tail0)
117
118	test	$0x02, %dl
119	jnz	LABEL(exit_tail1)
120
121	.p2align 4
122	test	$0x04, %dl
123	jnz	LABEL(exit_tail2)
124
125	test	$0x08, %dl
126	jnz	LABEL(exit_tail3)
127
128	test	$0x10, %dl
129	jnz	LABEL(exit_tail4)
130
131	test	$0x20, %dl
132	jnz	LABEL(exit_tail5)
133
134	test	$0x40, %dl
135	jnz	LABEL(exit_tail6)
136	add	$7, %rax
137	ret
138
139	.p2align 4
140LABEL(exit_high):
141	add	$8, %rax
142	test	$0x01, %dh
143	jnz	LABEL(exit_tail0)
144
145	test	$0x02, %dh
146	jnz	LABEL(exit_tail1)
147
148	test	$0x04, %dh
149	jnz	LABEL(exit_tail2)
150
151	test	$0x08, %dh
152	jnz	LABEL(exit_tail3)
153
154	test	$0x10, %dh
155	jnz	LABEL(exit_tail4)
156
157	test	$0x20, %dh
158	jnz	LABEL(exit_tail5)
159
160	test	$0x40, %dh
161	jnz	LABEL(exit_tail6)
162	add	$7, %rax
163	ret
164
165	.p2align 4
166LABEL(exit_tail0):
167	xor	%ecx, %ecx
168	ret
169
170	.p2align 4
171LABEL(exit_tail1):
172	add	$1, %rax
173	ret
174
175	.p2align 4
176LABEL(exit_tail2):
177	add	$2, %rax
178	ret
179
180	.p2align 4
181LABEL(exit_tail3):
182	add	$3, %rax
183	ret
184
185	.p2align 4
186LABEL(exit_tail4):
187	add	$4, %rax
188	ret
189
190	.p2align 4
191LABEL(exit_tail5):
192	add	$5, %rax
193	ret
194
195	.p2align 4
196LABEL(exit_tail6):
197	add	$6, %rax
198	ret
199	SET_SIZE(strlen)
200