/* xref: /titanic_41/usr/src/lib/libc/amd64/gen/strlen.s (revision 533d3a4910febc9985154b885dbe971e3c21ca04) */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, Intel Corporation
 * All rights reserved.
 */

/*
 *	strlen - calculate the length of string
 */

#include "SYS.h"
#include "proc64_id.h"

/*
 * LABEL(s) builds a label name of the form .strlen<s>.
 *
 * The empty comment between the prefix and the argument is the pre-ANSI
 * (traditional) preprocessor way of pasting two tokens together; under an
 * ISO C preprocessor the comment would turn into whitespace instead, so
 * this file presumably has to be preprocessed in traditional mode.
 */
#define LABEL(s) .strlen/**/s

36*533d3a49SEdward Gillett	/*
37*533d3a49SEdward Gillett	 * This implementation uses SSE instructions to compare up to 16 bytes
38*533d3a49SEdward Gillett	 * at a time looking for the end of string (null char).
39*533d3a49SEdward Gillett	 */
407c478bd9Sstevel@tonic-gate	ENTRY(strlen)			/* (const char *s) */
41*533d3a49SEdward Gillett	mov	%rdi, %rsi		/* keep original %rdi value */
42*533d3a49SEdward Gillett	mov	%rsi, %rcx
43*533d3a49SEdward Gillett	pxor	%xmm0, %xmm0		/* 16 null chars */
44*533d3a49SEdward Gillett	and	$15, %rcx
45*533d3a49SEdward Gillett	jz	LABEL(align16_loop)	/* string is 16 byte aligned */
467c478bd9Sstevel@tonic-gate
47*533d3a49SEdward Gillett	/*
48*533d3a49SEdward Gillett	 * Unaligned case. Round down to 16-byte boundary before comparing
49*533d3a49SEdward Gillett	 * 16 bytes for a null char. The code then compensates for any extra chars
50*533d3a49SEdward Gillett	 * preceding the start of the string.
51*533d3a49SEdward Gillett	 */
52*533d3a49SEdward GillettLABEL(unalign16):
53*533d3a49SEdward Gillett	and	$0xfffffffffffffff0, %rsi
547c478bd9Sstevel@tonic-gate
55*533d3a49SEdward Gillett	pcmpeqb	(%rsi), %xmm0
56*533d3a49SEdward Gillett	lea	16(%rdi), %rsi
57*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
587c478bd9Sstevel@tonic-gate
59*533d3a49SEdward Gillett	shr	%cl, %edx		/* Compensate for bytes preceding the string */
60*533d3a49SEdward Gillett	test	%edx, %edx
61*533d3a49SEdward Gillett	jnz	LABEL(exit)
62*533d3a49SEdward Gillett	sub	%rcx, %rsi		/* no null, adjust to next 16-byte boundary */
63*533d3a49SEdward Gillett	pxor	%xmm0, %xmm0		/* clear xmm0, may have been changed... */
647c478bd9Sstevel@tonic-gate
657c478bd9Sstevel@tonic-gate	.p2align 4
66*533d3a49SEdward GillettLABEL(align16_loop):			/* 16 byte aligned */
67*533d3a49SEdward Gillett	pcmpeqb	(%rsi), %xmm0		/* look for null bytes */
68*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx		/* move each byte mask of %xmm0 to edx */
697c478bd9Sstevel@tonic-gate
70*533d3a49SEdward Gillett	add	$16, %rsi		/* prepare to search next 16 bytes */
71*533d3a49SEdward Gillett	test	%edx, %edx		/* if no null byte, %edx must be 0 */
72*533d3a49SEdward Gillett	jnz	LABEL(exit)		/* found a null */
737c478bd9Sstevel@tonic-gate
74*533d3a49SEdward Gillett	pcmpeqb	(%rsi), %xmm0
75*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
76*533d3a49SEdward Gillett	add	$16, %rsi
77*533d3a49SEdward Gillett	test	%edx, %edx
78*533d3a49SEdward Gillett	jnz	LABEL(exit)
79*533d3a49SEdward Gillett
80*533d3a49SEdward Gillett	pcmpeqb	(%rsi), %xmm0
81*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
82*533d3a49SEdward Gillett	add	$16, %rsi
83*533d3a49SEdward Gillett	test	%edx, %edx
84*533d3a49SEdward Gillett	jnz	LABEL(exit)
85*533d3a49SEdward Gillett
86*533d3a49SEdward Gillett	pcmpeqb	(%rsi), %xmm0
87*533d3a49SEdward Gillett	pmovmskb %xmm0, %edx
88*533d3a49SEdward Gillett	add	$16, %rsi
89*533d3a49SEdward Gillett	test	%edx, %edx
90*533d3a49SEdward Gillett	jz	LABEL(align16_loop)
917c478bd9Sstevel@tonic-gate
927c478bd9Sstevel@tonic-gate	.p2align 4
937c478bd9Sstevel@tonic-gateLABEL(exit):
94*533d3a49SEdward Gillett	neg	%rdi
95*533d3a49SEdward Gillett	/*
96*533d3a49SEdward Gillett	 * Check to see if BSF is fast on this processor. If not, use a different
97*533d3a49SEdward Gillett	 * exit tail to find first bit set indicating null byte match.
98*533d3a49SEdward Gillett	 */
99*533d3a49SEdward Gillett	testl	$USE_BSF, .memops_method(%rip)
100*533d3a49SEdward Gillett	jz	LABEL(AMD_exit)
101*533d3a49SEdward Gillett
102*533d3a49SEdward Gillett	lea	-16(%rdi, %rsi), %rax	/* calculate exact offset */
103*533d3a49SEdward Gillett	bsf	%edx, %ecx		/* Least significant 1 bit is index of null */
104*533d3a49SEdward Gillett	lea	(%rax, %rcx),%rax
1057c478bd9Sstevel@tonic-gate	ret
1067c478bd9Sstevel@tonic-gate
107*533d3a49SEdward Gillett	/*
108*533d3a49SEdward Gillett	 * This exit tail does not use the bsf instruction.
109*533d3a49SEdward Gillett	 */
110*533d3a49SEdward Gillett	.p2align 4
111*533d3a49SEdward GillettLABEL(AMD_exit):
112*533d3a49SEdward Gillett	lea	-16(%rdi, %rsi), %rax
113*533d3a49SEdward Gillett	test	%dl, %dl
114*533d3a49SEdward Gillett	jz	LABEL(exit_high)
115*533d3a49SEdward Gillett	test	$0x01, %dl
116*533d3a49SEdward Gillett	jnz	LABEL(exit_tail0)
117*533d3a49SEdward Gillett
118*533d3a49SEdward Gillett	test	$0x02, %dl
119*533d3a49SEdward Gillett	jnz	LABEL(exit_tail1)
120*533d3a49SEdward Gillett
121*533d3a49SEdward Gillett	.p2align 4
122*533d3a49SEdward Gillett	test	$0x04, %dl
123*533d3a49SEdward Gillett	jnz	LABEL(exit_tail2)
124*533d3a49SEdward Gillett
125*533d3a49SEdward Gillett	test	$0x08, %dl
126*533d3a49SEdward Gillett	jnz	LABEL(exit_tail3)
127*533d3a49SEdward Gillett
128*533d3a49SEdward Gillett	test	$0x10, %dl
129*533d3a49SEdward Gillett	jnz	LABEL(exit_tail4)
130*533d3a49SEdward Gillett
131*533d3a49SEdward Gillett	test	$0x20, %dl
132*533d3a49SEdward Gillett	jnz	LABEL(exit_tail5)
133*533d3a49SEdward Gillett
134*533d3a49SEdward Gillett	test	$0x40, %dl
135*533d3a49SEdward Gillett	jnz	LABEL(exit_tail6)
136*533d3a49SEdward Gillett	add	$7, %rax
137*533d3a49SEdward Gillett	ret
138*533d3a49SEdward Gillett
139*533d3a49SEdward Gillett	.p2align 4
140*533d3a49SEdward GillettLABEL(exit_high):
141*533d3a49SEdward Gillett	add	$8, %rax
142*533d3a49SEdward Gillett	test	$0x01, %dh
143*533d3a49SEdward Gillett	jnz	LABEL(exit_tail0)
144*533d3a49SEdward Gillett
145*533d3a49SEdward Gillett	test	$0x02, %dh
146*533d3a49SEdward Gillett	jnz	LABEL(exit_tail1)
147*533d3a49SEdward Gillett
148*533d3a49SEdward Gillett	test	$0x04, %dh
149*533d3a49SEdward Gillett	jnz	LABEL(exit_tail2)
150*533d3a49SEdward Gillett
151*533d3a49SEdward Gillett	test	$0x08, %dh
152*533d3a49SEdward Gillett	jnz	LABEL(exit_tail3)
153*533d3a49SEdward Gillett
154*533d3a49SEdward Gillett	test	$0x10, %dh
155*533d3a49SEdward Gillett	jnz	LABEL(exit_tail4)
156*533d3a49SEdward Gillett
157*533d3a49SEdward Gillett	test	$0x20, %dh
158*533d3a49SEdward Gillett	jnz	LABEL(exit_tail5)
159*533d3a49SEdward Gillett
160*533d3a49SEdward Gillett	test	$0x40, %dh
161*533d3a49SEdward Gillett	jnz	LABEL(exit_tail6)
162*533d3a49SEdward Gillett	add	$7, %rax
163*533d3a49SEdward Gillett	ret
164*533d3a49SEdward Gillett
165*533d3a49SEdward Gillett	.p2align 4
166*533d3a49SEdward GillettLABEL(exit_tail0):
167*533d3a49SEdward Gillett	xor	%ecx, %ecx
168*533d3a49SEdward Gillett	ret
169*533d3a49SEdward Gillett
170*533d3a49SEdward Gillett	.p2align 4
171*533d3a49SEdward GillettLABEL(exit_tail1):
172*533d3a49SEdward Gillett	add	$1, %rax
173*533d3a49SEdward Gillett	ret
174*533d3a49SEdward Gillett
175*533d3a49SEdward Gillett	.p2align 4
176*533d3a49SEdward GillettLABEL(exit_tail2):
177*533d3a49SEdward Gillett	add	$2, %rax
178*533d3a49SEdward Gillett	ret
179*533d3a49SEdward Gillett
180*533d3a49SEdward Gillett	.p2align 4
181*533d3a49SEdward GillettLABEL(exit_tail3):
182*533d3a49SEdward Gillett	add	$3, %rax
183*533d3a49SEdward Gillett	ret
184*533d3a49SEdward Gillett
185*533d3a49SEdward Gillett	.p2align 4
186*533d3a49SEdward GillettLABEL(exit_tail4):
187*533d3a49SEdward Gillett	add	$4, %rax
188*533d3a49SEdward Gillett	ret
189*533d3a49SEdward Gillett
190*533d3a49SEdward Gillett	.p2align 4
191*533d3a49SEdward GillettLABEL(exit_tail5):
192*533d3a49SEdward Gillett	add	$5, %rax
193*533d3a49SEdward Gillett	ret
194*533d3a49SEdward Gillett
195*533d3a49SEdward Gillett	.p2align 4
196*533d3a49SEdward GillettLABEL(exit_tail6):
197*533d3a49SEdward Gillett	add	$6, %rax
198*533d3a49SEdward Gillett	ret
1997c478bd9Sstevel@tonic-gate	SET_SIZE(strlen)
200