xref: /titanic_50/usr/src/lib/libc/sparcv9/gen/ascii_strcasecmp.s (revision 6a634c9dca3093f3922e4b7ab826d7bdf17bf78e)
1*23a1cceaSRoger A. Faulkner/*
2*23a1cceaSRoger A. Faulkner * CDDL HEADER START
3*23a1cceaSRoger A. Faulkner *
4*23a1cceaSRoger A. Faulkner * The contents of this file are subject to the terms of the
5*23a1cceaSRoger A. Faulkner * Common Development and Distribution License (the "License").
6*23a1cceaSRoger A. Faulkner * You may not use this file except in compliance with the License.
7*23a1cceaSRoger A. Faulkner *
8*23a1cceaSRoger A. Faulkner * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*23a1cceaSRoger A. Faulkner * or http://www.opensolaris.org/os/licensing.
10*23a1cceaSRoger A. Faulkner * See the License for the specific language governing permissions
11*23a1cceaSRoger A. Faulkner * and limitations under the License.
12*23a1cceaSRoger A. Faulkner *
13*23a1cceaSRoger A. Faulkner * When distributing Covered Code, include this CDDL HEADER in each
14*23a1cceaSRoger A. Faulkner * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*23a1cceaSRoger A. Faulkner * If applicable, add the following below this CDDL HEADER, with the
16*23a1cceaSRoger A. Faulkner * fields enclosed by brackets "[]" replaced with your own identifying
17*23a1cceaSRoger A. Faulkner * information: Portions Copyright [yyyy] [name of copyright owner]
18*23a1cceaSRoger A. Faulkner *
19*23a1cceaSRoger A. Faulkner * CDDL HEADER END
20*23a1cceaSRoger A. Faulkner */
21*23a1cceaSRoger A. Faulkner
22*23a1cceaSRoger A. Faulkner/*
23*23a1cceaSRoger A. Faulkner * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24*23a1cceaSRoger A. Faulkner */
25*23a1cceaSRoger A. Faulkner
26*23a1cceaSRoger A. Faulkner/*
27*23a1cceaSRoger A. Faulkner * The ascii_strcasecmp() function is a case-insensitive version of strcmp().
28*23a1cceaSRoger A. Faulkner * It assumes the ASCII character set and ignores differences in case
29*23a1cceaSRoger A. Faulkner * when comparing lower and upper case characters. In other words, it
30*23a1cceaSRoger A. Faulkner * behaves as if both strings had been converted to lower case using
31*23a1cceaSRoger A. Faulkner * tolower() in the "C" locale on each byte, and the results had then
32*23a1cceaSRoger A. Faulkner * been compared using strcmp().
33*23a1cceaSRoger A. Faulkner *
34*23a1cceaSRoger A. Faulkner * The assembly code below is an optimized version of the following C
35*23a1cceaSRoger A. Faulkner * reference:
36*23a1cceaSRoger A. Faulkner *
37*23a1cceaSRoger A. Faulkner * static const char charmap[] = {
38*23a1cceaSRoger A. Faulkner *	'\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
39*23a1cceaSRoger A. Faulkner *	'\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
40*23a1cceaSRoger A. Faulkner *	'\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
41*23a1cceaSRoger A. Faulkner *	'\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
42*23a1cceaSRoger A. Faulkner *	'\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
43*23a1cceaSRoger A. Faulkner *	'\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
44*23a1cceaSRoger A. Faulkner *	'\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
45*23a1cceaSRoger A. Faulkner *	'\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
46*23a1cceaSRoger A. Faulkner *	'\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
47*23a1cceaSRoger A. Faulkner *	'\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
48*23a1cceaSRoger A. Faulkner *	'\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
49*23a1cceaSRoger A. Faulkner *	'\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
50*23a1cceaSRoger A. Faulkner *	'\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
51*23a1cceaSRoger A. Faulkner *	'\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
52*23a1cceaSRoger A. Faulkner *	'\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
53*23a1cceaSRoger A. Faulkner *	'\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
54*23a1cceaSRoger A. Faulkner *	'\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
55*23a1cceaSRoger A. Faulkner *	'\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
56*23a1cceaSRoger A. Faulkner *	'\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
57*23a1cceaSRoger A. Faulkner *	'\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
58*23a1cceaSRoger A. Faulkner *	'\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
59*23a1cceaSRoger A. Faulkner *	'\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
60*23a1cceaSRoger A. Faulkner *	'\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
61*23a1cceaSRoger A. Faulkner *	'\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
62*23a1cceaSRoger A. Faulkner *	'\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
63*23a1cceaSRoger A. Faulkner *	'\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
64*23a1cceaSRoger A. Faulkner *	'\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
65*23a1cceaSRoger A. Faulkner *	'\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
66*23a1cceaSRoger A. Faulkner *	'\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
67*23a1cceaSRoger A. Faulkner *	'\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
68*23a1cceaSRoger A. Faulkner *	'\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
69*23a1cceaSRoger A. Faulkner *	'\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
70*23a1cceaSRoger A. Faulkner * };
71*23a1cceaSRoger A. Faulkner *
72*23a1cceaSRoger A. Faulkner * int
73*23a1cceaSRoger A. Faulkner * ascii_strcasecmp(const char *s1, const char *s2)
74*23a1cceaSRoger A. Faulkner * {
75*23a1cceaSRoger A. Faulkner *	const unsigned char	*cm = (const unsigned char *)charmap;
76*23a1cceaSRoger A. Faulkner *	const unsigned char	*us1 = (const unsigned char *)s1;
77*23a1cceaSRoger A. Faulkner *	const unsigned char	*us2 = (const unsigned char *)s2;
78*23a1cceaSRoger A. Faulkner *
79*23a1cceaSRoger A. Faulkner *	while (cm[*us1] == cm[*us2++])
80*23a1cceaSRoger A. Faulkner *		if (*us1++ == '\0')
81*23a1cceaSRoger A. Faulkner *			return (0);
82*23a1cceaSRoger A. Faulkner *	return (cm[*us1] - cm[*(us2 - 1)]);
83*23a1cceaSRoger A. Faulkner * }
84*23a1cceaSRoger A. Faulkner *
85*23a1cceaSRoger A. Faulkner * The following algorithm, from a 1987 news posting by Alan Mycroft, is
86*23a1cceaSRoger A. Faulkner * used for finding null bytes in a word:
87*23a1cceaSRoger A. Faulkner *
88*23a1cceaSRoger A. Faulkner * #define has_null(word) ((word - 0x01010101) & (~word & 0x80808080))
89*23a1cceaSRoger A. Faulkner *
90*23a1cceaSRoger A. Faulkner * The following algorithm is used for a wordwise tolower() operation:
91*23a1cceaSRoger A. Faulkner *
92*23a1cceaSRoger A. Faulkner * unsigned int
93*23a1cceaSRoger A. Faulkner * parallel_tolower (unsigned int x)
94*23a1cceaSRoger A. Faulkner * {
95*23a1cceaSRoger A. Faulkner *	unsigned int p;
96*23a1cceaSRoger A. Faulkner *	unsigned int q;
97*23a1cceaSRoger A. Faulkner *
98*23a1cceaSRoger A. Faulkner *	unsigned int m1 = 0x80808080;
99*23a1cceaSRoger A. Faulkner *	unsigned int m2 = 0x3f3f3f3f;
100*23a1cceaSRoger A. Faulkner *	unsigned int m3 = 0x25252525;
101*23a1cceaSRoger A. Faulkner *
102*23a1cceaSRoger A. Faulkner *	q = x & ~m1;// newb = byte & 0x7F
103*23a1cceaSRoger A. Faulkner *	p = q + m2; // newb > 0x5A --> MSB set
104*23a1cceaSRoger A. Faulkner *	q = q + m3; // newb < 0x41 --> MSB clear
105*23a1cceaSRoger A. Faulkner *	p = p & ~q; // newb > 0x40 && newb < 0x5B --> MSB set
106*23a1cceaSRoger A. Faulkner *	q = m1 & ~x;//  byte < 0x80 --> 0x80
107*23a1cceaSRoger A. Faulkner *	q = p & q;  // newb > 0x40 && newb < 0x5B && byte < 0x80 -> 0x80,else 0
108*23a1cceaSRoger A. Faulkner *	q = q >> 2; // newb > 0x40 && newb < 0x5B && byte < 0x80 -> 0x20,else 0
109*23a1cceaSRoger A. Faulkner *	return (x + q); // translate uppercase characters to lowercase
110*23a1cceaSRoger A. Faulkner * }
111*23a1cceaSRoger A. Faulkner *
112*23a1cceaSRoger A. Faulkner * Both algorithms have been tested exhaustively for all possible 2^32 inputs.
113*23a1cceaSRoger A. Faulkner */
114*23a1cceaSRoger A. Faulkner
115*23a1cceaSRoger A. Faulkner#include <sys/asm_linkage.h>
116*23a1cceaSRoger A. Faulkner
117*23a1cceaSRoger A. Faulkner	! The first part of this algorithm walks through the beginning of
118*23a1cceaSRoger A. Faulkner	! both strings a byte at a time until the s1 pointer is aligned to
119*23a1cceaSRoger A. Faulkner	! a word boundary. During these steps, the bytes are translated to
120*23a1cceaSRoger A. Faulkner	! lower-case if they are upper-case, and the translated bytes of
121*23a1cceaSRoger A. Faulkner	! s1 and s2 are compared with each other.
	!
	! Register usage, as established by the code below:
	!   %i0      s1 on entry; afterwards scratch (byte of s1, aligned
	!            word of s2 in the .cmp loop) and the return value
	!   %i1      s2 pointer (rounded down to a word boundary for .cmp)
	!   %i2      offset s1 - s2, so that [%i1 + %i2] addresses s1
	!   %i3      scratch: alignment bits, fix-up mask, word from s1
	!   %i4      0x80808080 (Mycroft magic1, m1 of parallel tolower)
	!   %i5      0x01010101 (Mycroft magic2)
	!   %l2/%l3  0x3f3f3f3f (m2) / 0x25252525 (m3)
	!   %l6/%l7  left / right shift factors for an unaligned s2
	!   %g1      byte or word from s2
	!   %l0, %l1, %l4, %l5  temporaries
122*23a1cceaSRoger A. Faulkner
123*23a1cceaSRoger A. Faulkner	ENTRY(ascii_strcasecmp)
124*23a1cceaSRoger A. Faulkner
125*23a1cceaSRoger A. Faulkner	.align 32
126*23a1cceaSRoger A. Faulkner
127*23a1cceaSRoger A. Faulkner	save	%sp, -SA(WINDOWSIZE), %sp
128*23a1cceaSRoger A. Faulkner	subcc	%i0, %i1, %i2		! %i2 = s1 - s2; s1 == s2 ?
129*23a1cceaSRoger A. Faulkner	bz,pn	%ncc, .stringsequal	! yup, done, strings equal
130*23a1cceaSRoger A. Faulkner	andcc	%i0, 3, %i3		! s1 word-aligned ?
131*23a1cceaSRoger A. Faulkner	bz,pn	%ncc, .s1aligned1	! yup
132*23a1cceaSRoger A. Faulkner	sethi	%hi(0x80808080), %i4	! start loading Mycroft's magic1
133*23a1cceaSRoger A. Faulkner
134*23a1cceaSRoger A. Faulkner	ldub	[%i1 + %i2], %i0	! s1[0]
135*23a1cceaSRoger A. Faulkner	ldub	[%i1], %g1		! s2[0]
136*23a1cceaSRoger A. Faulkner	sub	%i0, 'A', %l0		! transform for faster uppercase check
137*23a1cceaSRoger A. Faulkner	sub	%g1, 'A', %l1		! transform for faster uppercase check
138*23a1cceaSRoger A. Faulkner	cmp	%l0, ('Z' - 'A')	! s1[0] uppercase?
139*23a1cceaSRoger A. Faulkner	bleu,a	.noxlate11		! yes; annulled slot runs only if taken
140*23a1cceaSRoger A. Faulkner	add	%i0, ('a' - 'A'), %i0	! s1[0] = tolower(s1[0])
141*23a1cceaSRoger A. Faulkner.noxlate11:
142*23a1cceaSRoger A. Faulkner	cmp	%l1, ('Z' - 'A')	! s2[0] uppercase?
143*23a1cceaSRoger A. Faulkner	bleu,a	.noxlate12		! yes; annulled slot runs only if taken
144*23a1cceaSRoger A. Faulkner	add	%g1, ('a' - 'A'), %g1	! s2[0] = tolower(s2[0])
145*23a1cceaSRoger A. Faulkner.noxlate12:
146*23a1cceaSRoger A. Faulkner	subcc	%i0, %g1, %i0		! tolower(s1[0]) != tolower(s2[0]) ?
147*23a1cceaSRoger A. Faulkner	bne,pn	%ncc, .done		! yup, done, return the difference
148*23a1cceaSRoger A. Faulkner	inc	%i1			! s1++, s2++ (s1 is %i1 + %i2)
149*23a1cceaSRoger A. Faulkner	addcc	%i0, %g1, %i0		! %i0 was 0, now tolower(s1[0]); == 0 ?
150*23a1cceaSRoger A. Faulkner	bz,pn	%ncc, .done		! yup, done, strings equal
151*23a1cceaSRoger A. Faulkner	cmp	%i3, 3			! s1 aligned now?
152*23a1cceaSRoger A. Faulkner	bz	%ncc, .s1aligned2	! yup
153*23a1cceaSRoger A. Faulkner	sethi	%hi(0x01010101), %i5	! start loading Mycroft's magic2
154*23a1cceaSRoger A. Faulkner
155*23a1cceaSRoger A. Faulkner	ldub	[%i1 + %i2], %i0	! s1[1]
156*23a1cceaSRoger A. Faulkner	ldub	[%i1], %g1		! s2[1]
157*23a1cceaSRoger A. Faulkner	sub	%i0, 'A', %l0		! transform for faster uppercase check
158*23a1cceaSRoger A. Faulkner	sub	%g1, 'A', %l1		! transform for faster uppercase check
159*23a1cceaSRoger A. Faulkner	cmp	%l0, ('Z' - 'A')	! s1[1] uppercase?
160*23a1cceaSRoger A. Faulkner	bleu,a	.noxlate21		! yes; annulled slot runs only if taken
161*23a1cceaSRoger A. Faulkner	add	%i0, ('a' - 'A'), %i0	! s1[1] = tolower(s1[1])
162*23a1cceaSRoger A. Faulkner.noxlate21:
163*23a1cceaSRoger A. Faulkner	cmp	%l1, ('Z' - 'A')	! s2[1] uppercase?
164*23a1cceaSRoger A. Faulkner	bleu,a	.noxlate22		! yes; annulled slot runs only if taken
165*23a1cceaSRoger A. Faulkner	add	%g1, ('a' - 'A'), %g1	! s2[1] = tolower(s2[1])
166*23a1cceaSRoger A. Faulkner.noxlate22:
167*23a1cceaSRoger A. Faulkner	subcc	%i0, %g1, %i0		! tolower(s1[1]) != tolower(s2[1]) ?
168*23a1cceaSRoger A. Faulkner	bne,pn	%ncc, .done		! yup, done, return the difference
169*23a1cceaSRoger A. Faulkner	inc	%i1			! s1++, s2++ (s1 is %i1 + %i2)
170*23a1cceaSRoger A. Faulkner	addcc	%i0, %g1, %i0		! %i0 was 0, now tolower(s1[1]); == 0 ?
171*23a1cceaSRoger A. Faulkner	bz,pn	%ncc, .done		! yup, done, strings equal
172*23a1cceaSRoger A. Faulkner	cmp	%i3, 2			! s1 aligned now?
173*23a1cceaSRoger A. Faulkner	bz	%ncc, .s1aligned3	! yup
174*23a1cceaSRoger A. Faulkner	or	%i4, %lo(0x80808080),%i4! finish loading Mycroft's magic1
175*23a1cceaSRoger A. Faulkner
176*23a1cceaSRoger A. Faulkner	ldub	[%i1 + %i2], %i0	! s1[2]
177*23a1cceaSRoger A. Faulkner	ldub	[%i1], %g1		! s2[2]
178*23a1cceaSRoger A. Faulkner	sub	%i0, 'A', %l0		! transform for faster uppercase check
179*23a1cceaSRoger A. Faulkner	sub	%g1, 'A', %l1		! transform for faster uppercase check
180*23a1cceaSRoger A. Faulkner	cmp	%l0, ('Z' - 'A')	! s1[2] uppercase?
181*23a1cceaSRoger A. Faulkner	bleu,a	.noxlate31		! yes; annulled slot runs only if taken
182*23a1cceaSRoger A. Faulkner	add	%i0, ('a' - 'A'), %i0	! s1[2] = tolower(s1[2])
183*23a1cceaSRoger A. Faulkner.noxlate31:
184*23a1cceaSRoger A. Faulkner	cmp	%l1, ('Z' - 'A')	! s2[2] uppercase?
185*23a1cceaSRoger A. Faulkner	bleu,a	.noxlate32		! yes; annulled slot runs only if taken
186*23a1cceaSRoger A. Faulkner	add	%g1, ('a' - 'A'), %g1	! s2[2] = tolower(s2[2])
187*23a1cceaSRoger A. Faulkner.noxlate32:
188*23a1cceaSRoger A. Faulkner	subcc	%i0, %g1, %i0		! tolower(s1[2]) != tolower(s2[2]) ?
189*23a1cceaSRoger A. Faulkner	bne,pn	%ncc, .done		! yup, done, return the difference
190*23a1cceaSRoger A. Faulkner	inc	%i1			! s1++, s2++ (s1 is %i1 + %i2)
191*23a1cceaSRoger A. Faulkner	addcc	%i0, %g1, %i0		! %i0 was 0, now tolower(s1[2]); == 0 ?
192*23a1cceaSRoger A. Faulkner	bz,pn	%ncc, .done		! yup, done, strings equal
193*23a1cceaSRoger A. Faulkner	or	%i5, %lo(0x01010101),%i5! finish loading Mycroft's magic2
194*23a1cceaSRoger A. Faulkner	ba	.s1aligned4		! s1 aligned now
195*23a1cceaSRoger A. Faulkner	andcc	%i1, 3, %i3		! s2 word-aligned ?
196*23a1cceaSRoger A. Faulkner
197*23a1cceaSRoger A. Faulkner	! Here, we finish loading the constants for the zero-byte check
198*23a1cceaSRoger A. Faulkner	! and decide whether or not we can optimize further, which is the
199*23a1cceaSRoger A. Faulkner	! case if s2 turns out to be word aligned as well.
	!
	! The labels .s1aligned1 to .s1aligned4 are entered with
	! progressively more of this setup already done in the delay
	! slots of the branches above, so each entry point only executes
	! the instructions that are still missing.
200*23a1cceaSRoger A. Faulkner
201*23a1cceaSRoger A. Faulkner.s1aligned1:
202*23a1cceaSRoger A. Faulkner	sethi	%hi(0x01010101), %i5	! start loading Mycroft's magic2
203*23a1cceaSRoger A. Faulkner.s1aligned2:
204*23a1cceaSRoger A. Faulkner	or	%i4, %lo(0x80808080),%i4! finish loading Mycroft's magic1
205*23a1cceaSRoger A. Faulkner.s1aligned3:
206*23a1cceaSRoger A. Faulkner	or	%i5, %lo(0x01010101),%i5! finish loading Mycroft's magic2
207*23a1cceaSRoger A. Faulkner	andcc	%i1, 3, %i3		! s2 word aligned ?
208*23a1cceaSRoger A. Faulkner.s1aligned4:
209*23a1cceaSRoger A. Faulkner	sethi	%hi(0x3f3f3f3f), %l2	! load m2 for parallel tolower()
210*23a1cceaSRoger A. Faulkner	sethi	%hi(0x25252525), %l3	! load m3 for parallel tolower()
211*23a1cceaSRoger A. Faulkner	or 	%l2, %lo(0x3f3f3f3f),%l2! finish loading m2
212*23a1cceaSRoger A. Faulkner	bz	.word4			! yup, s2 word-aligned
213*23a1cceaSRoger A. Faulkner	or 	%l3, %lo(0x25252525),%l3! finish loading m3
214*23a1cceaSRoger A. Faulkner
	! s2 is not word aligned: %i3 holds s2 & 3.  Round %i1 down to
	! the enclosing aligned word and assemble each unaligned word of
	! s2 from two consecutive aligned words by shifting and merging.
215*23a1cceaSRoger A. Faulkner	add	%i2, %i3, %i2		! keep [%i1 + %i2] == s1 after rounding
216*23a1cceaSRoger A. Faulkner	sll     %i3, 3, %l6    		! shift factor for left shifts
217*23a1cceaSRoger A. Faulkner	andn	%i1, 3, %i1		! round s2 pointer down to word boundary
218*23a1cceaSRoger A. Faulkner	sub	%g0, %l6, %l7		! shift factor for right shifts (negated)
219*23a1cceaSRoger A. Faulkner	orn	%i3, %g0, %i3		! generate all ones
220*23a1cceaSRoger A. Faulkner	lduw	[%i1], %i0		! first aligned word from s2
221*23a1cceaSRoger A. Faulkner	srl	%i3, %l6, %i3		! mask for fixing up bytes
222*23a1cceaSRoger A. Faulkner	sll	%i0, %l6, %g1		! partial unaligned word from s2
223*23a1cceaSRoger A. Faulkner	orn	%i0, %i3, %i0		! force start bytes to non-zero
224*23a1cceaSRoger A. Faulkner	nop				! pad to align loop to 16-byte boundary
225*23a1cceaSRoger A. Faulkner	nop				! pad to align loop to 16-byte boundary
226*23a1cceaSRoger A. Faulkner
227*23a1cceaSRoger A. Faulkner	! This is the comparison procedure used if s2 is not word
228*23a1cceaSRoger A. Faulkner	! aligned; if it is, we use word4 & cmp4
	!
	! On entry to each iteration %i0 holds the previously loaded
	! aligned word of s2 and %g1 its left-shifted part.  The next
	! aligned word is loaded only when %i0 contains no null byte.
229*23a1cceaSRoger A. Faulkner
230*23a1cceaSRoger A. Faulkner.cmp:
231*23a1cceaSRoger A. Faulkner	andn	%i4, %i0, %l4		! ~word & 0x80808080
232*23a1cceaSRoger A. Faulkner	sub	%i0, %i5, %l5		! word - 0x01010101
233*23a1cceaSRoger A. Faulkner	andcc	%l5, %l4, %g0		! (word - 0x01010101) & ~word & 0x80808080
234*23a1cceaSRoger A. Faulkner	bz,a,pt	%ncc, .doload		! no null byte in previous aligned s2 word
235*23a1cceaSRoger A. Faulkner	lduw	[%i1 + 4], %i0		! load next aligned word (only if taken)
236*23a1cceaSRoger A. Faulkner.doload:
237*23a1cceaSRoger A. Faulkner	srl	%i0, %l7, %i3		! byte 1 from new aligned word from s2
238*23a1cceaSRoger A. Faulkner	or	%g1, %i3, %g1		! merge to get unaligned word from s2
239*23a1cceaSRoger A. Faulkner	lduw	[%i1 + %i2], %i3	! x1 = word from s1
240*23a1cceaSRoger A. Faulkner	andn	%i3, %i4, %l0		! q1 = x1 & ~m1
241*23a1cceaSRoger A. Faulkner	andn	%g1, %i4, %l4		! q2 = x2 & ~m1
242*23a1cceaSRoger A. Faulkner	add	%l0, %l2, %l1		! p1 = q1 + m2
243*23a1cceaSRoger A. Faulkner	add	%l4, %l2, %l5		! p2 = q2 + m2
244*23a1cceaSRoger A. Faulkner	add	%l0, %l3, %l0		! q1 = q1 + m3
245*23a1cceaSRoger A. Faulkner	add	%l4, %l3, %l4		! q2 = q2 + m3
246*23a1cceaSRoger A. Faulkner	andn	%l1, %l0, %l1		! p1 = p1 & ~q1
247*23a1cceaSRoger A. Faulkner	andn	%l5, %l4, %l5		! p2 = p2 & ~q2
248*23a1cceaSRoger A. Faulkner	andn	%i4, %i3, %l0		! q1 = m1 & ~x1
249*23a1cceaSRoger A. Faulkner	andn	%i4, %g1, %l4		! q2 = m1 & ~x2
250*23a1cceaSRoger A. Faulkner	and	%l0, %l1, %l0		! q1 = p1 & q1
251*23a1cceaSRoger A. Faulkner	and	%l4, %l5, %l4		! q2 = p2 & q2
252*23a1cceaSRoger A. Faulkner	srl	%l0, 2, %l0		! q1 = q1 >> 2
253*23a1cceaSRoger A. Faulkner	srl	%l4, 2, %l4		! q2 = q2 >> 2
254*23a1cceaSRoger A. Faulkner	add	%l0, %i3, %i3		! lowercase word from s1
255*23a1cceaSRoger A. Faulkner	add	%l4, %g1, %g1		! lowercase word from s2
256*23a1cceaSRoger A. Faulkner	cmp	%i3, %g1		! tolower(*s1) != tolower(*s2) ?
257*23a1cceaSRoger A. Faulkner	bne	%icc, .wordsdiffer	! yup, now find byte that is different
258*23a1cceaSRoger A. Faulkner	add	%i1, 4, %i1		! s1+=4, s2+=4 (delay slot)
259*23a1cceaSRoger A. Faulkner	andn	%i4, %i3, %l4		! ~word & 0x80808080
260*23a1cceaSRoger A. Faulkner	sub	%i3, %i5, %l5		! word - 0x01010101
261*23a1cceaSRoger A. Faulkner	andcc	%l5, %l4, %g0		! (word - 0x01010101) & ~word & 0x80808080
262*23a1cceaSRoger A. Faulkner	bz,pt	%ncc, .cmp		! no null-byte in s1 yet
263*23a1cceaSRoger A. Faulkner	sll	%i0, %l6, %g1		! partial unaligned word from s2
264*23a1cceaSRoger A. Faulkner
265*23a1cceaSRoger A. Faulkner	! words are equal but the end of s1 has been reached
266*23a1cceaSRoger A. Faulkner	! this means the strings must be equal
267*23a1cceaSRoger A. Faulkner.stringsequal:
268*23a1cceaSRoger A. Faulkner	ret				! return
269*23a1cceaSRoger A. Faulkner	restore	%g0, %g0, %o0		! return 0, i.e. strings are equal
270*23a1cceaSRoger A. Faulkner	nop				! pad
271*23a1cceaSRoger A. Faulkner
272*23a1cceaSRoger A. Faulkner
273*23a1cceaSRoger A. Faulkner	! s1 and s2 are both word aligned!  This means
274*23a1cceaSRoger A. Faulkner	! things get to go fast!
275*23a1cceaSRoger A. Faulkner
276*23a1cceaSRoger A. Faulkner.word4:
277*23a1cceaSRoger A. Faulkner	lduw	[%i1 + %i2], %i3	! x1 = word from s1
278*23a1cceaSRoger A. Faulkner
279*23a1cceaSRoger A. Faulkner.cmp4:
280*23a1cceaSRoger A. Faulkner	andn	%i3, %i4, %l0		! q1 = x1 & ~m1
281*23a1cceaSRoger A. Faulkner	lduw	[%i1], %g1		! x2 = word from s2
282*23a1cceaSRoger A. Faulkner	andn	%g1, %i4, %l4		! q2 = x2 & ~m1
283*23a1cceaSRoger A. Faulkner	add	%l0, %l2, %l1		! p1 = q1 + m2
284*23a1cceaSRoger A. Faulkner	add	%l4, %l2, %l5		! p2 = q2 + m2
285*23a1cceaSRoger A. Faulkner	add	%l0, %l3, %l0		! q1 = q1 + m3
286*23a1cceaSRoger A. Faulkner	add	%l4, %l3, %l4		! q2 = q2 + m3
287*23a1cceaSRoger A. Faulkner	andn	%l1, %l0, %l1		! p1 = p1 & ~q1
288*23a1cceaSRoger A. Faulkner	andn	%l5, %l4, %l5		! p2 = p2 & ~q2
289*23a1cceaSRoger A. Faulkner	andn	%i4, %i3, %l0		! q1 = m1 & ~x1
290*23a1cceaSRoger A. Faulkner	andn	%i4, %g1, %l4		! q2 = m1 & ~x2
291*23a1cceaSRoger A. Faulkner	and	%l0, %l1, %l0		! q1 = p1 & q1
292*23a1cceaSRoger A. Faulkner	and	%l4, %l5, %l4		! q2 = p2 & q2
293*23a1cceaSRoger A. Faulkner	srl	%l0, 2, %l0		! q1 = q1 >> 2
294*23a1cceaSRoger A. Faulkner	srl	%l4, 2, %l4		! q2 = q2 >> 2
295*23a1cceaSRoger A. Faulkner	add	%l0, %i3, %i3		! lowercase word from s1
296*23a1cceaSRoger A. Faulkner	add	%l4, %g1, %g1		! lowercase word from s2
297*23a1cceaSRoger A. Faulkner	cmp	%i3, %g1		! tolower(*s1) != tolower(*s2) ?
298*23a1cceaSRoger A. Faulkner	bne,pn	%icc, .wordsdiffer	! yup, now find mismatching character
299*23a1cceaSRoger A. Faulkner	add	%i1, 4, %i1		! s1+=4, s2+=4 (delay slot)
300*23a1cceaSRoger A. Faulkner	andn	%i4, %i3, %l4		! ~word & 0x80808080
301*23a1cceaSRoger A. Faulkner	sub	%i3, %i5, %l5		! word - 0x01010101
302*23a1cceaSRoger A. Faulkner	andcc	%l5, %l4, %g0		! (word - 0x01010101) & ~word & 0x80808080
303*23a1cceaSRoger A. Faulkner	bz,a,pt	%icc, .cmp4		! no null-byte in s1 yet
304*23a1cceaSRoger A. Faulkner	lduw	[%i1 + %i2], %i3	! load word from s1 (only if taken)
305*23a1cceaSRoger A. Faulkner
306*23a1cceaSRoger A. Faulkner	! words are equal but the end of s1 has been reached
307*23a1cceaSRoger A. Faulkner	! this means the strings must be equal
308*23a1cceaSRoger A. Faulkner.stringsequal4:
309*23a1cceaSRoger A. Faulkner	ret				! return
310*23a1cceaSRoger A. Faulkner	restore	%g0, %g0, %o0		! return 0, i.e. strings are equal
311*23a1cceaSRoger A. Faulkner
	! The lower-cased words in %i3 (s1) and %g1 (s2) differ.  Compare
	! them starting with the most significant byte, which the code
	! below treats as the first byte of the word, and stop at the
	! first difference or at a null byte in s1.  Several instructions
	! below sit in the delay slot of the branch ahead of them and are
	! executed whether or not that branch is taken.
312*23a1cceaSRoger A. Faulkner.wordsdiffer:
313*23a1cceaSRoger A. Faulkner	srl	%g1, 24, %i2		! first byte of mismatching word in s2
314*23a1cceaSRoger A. Faulkner	srl	%i3, 24, %i1		! first byte of mismatching word in s1
315*23a1cceaSRoger A. Faulkner	subcc	%i1, %i2, %i0		! *s1-*s2
316*23a1cceaSRoger A. Faulkner	bnz,pn	%ncc, .done		! bytes differ, return difference
317*23a1cceaSRoger A. Faulkner	srl	%g1, 16, %i2		! bytes 1,2 of mismatching word in s2
318*23a1cceaSRoger A. Faulkner	andcc	%i1, 0xff, %i0		! *s1 == 0 ? (then %i0 = 0 is returned)
319*23a1cceaSRoger A. Faulkner	bz,pn	%ncc, .done		! yup
320*23a1cceaSRoger A. Faulkner
321*23a1cceaSRoger A. Faulkner	! we know byte 1 is equal, so can compare bytes 1,2 as a group
322*23a1cceaSRoger A. Faulkner
323*23a1cceaSRoger A. Faulkner	srl	%i3, 16, %i1		! bytes 1,2 of mismatching word in s1
324*23a1cceaSRoger A. Faulkner	subcc	%i1, %i2, %i0		! *s1-*s2
325*23a1cceaSRoger A. Faulkner	bnz,pn	%ncc, .done		! bytes differ, return difference
326*23a1cceaSRoger A. Faulkner	srl	%g1, 8, %i2		! bytes 1,2,3 of mismatching word in s2
327*23a1cceaSRoger A. Faulkner	andcc	%i1, 0xff, %i0		! *s1 == 0 ? (then %i0 = 0 is returned)
328*23a1cceaSRoger A. Faulkner	bz,pn	%ncc, .done		! yup
329*23a1cceaSRoger A. Faulkner
330*23a1cceaSRoger A. Faulkner	! we know bytes 1, 2 are equal, so can compare bytes 1,2,3 as a group
331*23a1cceaSRoger A. Faulkner
332*23a1cceaSRoger A. Faulkner	srl	%i3, 8, %i1		! bytes 1,2,3 of mismatching word in s1
333*23a1cceaSRoger A. Faulkner	subcc	%i1, %i2, %i0		! *s1-*s2
334*23a1cceaSRoger A. Faulkner	bnz,pn	%ncc, .done		! bytes differ, return difference
335*23a1cceaSRoger A. Faulkner	andcc	%i1, 0xff, %g0		! *s1 == 0 ?
336*23a1cceaSRoger A. Faulkner	bz,pn	%ncc, .stringsequal	! yup, return 0
337*23a1cceaSRoger A. Faulkner
338*23a1cceaSRoger A. Faulkner	! we know bytes 1,2,3 are equal, so can compare bytes 1,2,3,4 as group
339*23a1cceaSRoger A. Faulkner
340*23a1cceaSRoger A. Faulkner	subcc	%i3, %g1, %i0		! whole words; equals byte 4 of *s1-*s2
	! NOTE(review): .wordsdiffer is only entered when the two words
	! differ, so the branch below looks never-taken; in that case its
	! annulled delay slot is skipped and execution falls through into
	! .done with the difference in %i0 -- confirm before relying on it.
341*23a1cceaSRoger A. Faulkner	bz,a	.done			! words equal (not expected here)
342*23a1cceaSRoger A. Faulkner	andcc	%i3, 0xff, %i0		! *s1 == 0 ? (runs only if taken)
343*23a1cceaSRoger A. Faulkner
344*23a1cceaSRoger A. Faulkner.done:
345*23a1cceaSRoger A. Faulkner	ret				! return
346*23a1cceaSRoger A. Faulkner	restore	%i0, %g0, %o0		! return tolower(*s1) - tolower(*s2)
347*23a1cceaSRoger A. Faulkner
348*23a1cceaSRoger A. Faulkner	SET_SIZE(ascii_strcasecmp)
349