xref: /titanic_44/usr/src/lib/libc/amd64/gen/memcmp.s (revision bdfc6d18da790deeec2e0eb09c625902defe2498)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*
28 * Copyright (c) 2002 Advanced Micro Devices, Inc.
29 *
30 * All rights reserved.
31 *
32 * Redistribution and  use in source and binary  forms, with or
33 * without  modification,  are   permitted  provided  that  the
34 * following conditions are met:
35 *
36 * + Redistributions  of source  code  must  retain  the  above
37 *   copyright  notice,   this  list  of   conditions  and  the
38 *   following disclaimer.
39 *
40 * + Redistributions  in binary  form must reproduce  the above
41 *   copyright  notice,   this  list  of   conditions  and  the
42 *   following  disclaimer in  the  documentation and/or  other
43 *   materials provided with the distribution.
44 *
45 * + Neither the  name of Advanced Micro Devices,  Inc. nor the
46 *   names  of  its contributors  may  be  used  to endorse  or
47 *   promote  products  derived   from  this  software  without
48 *   specific prior written permission.
49 *
50 * THIS  SOFTWARE  IS PROVIDED  BY  THE  COPYRIGHT HOLDERS  AND
51 * CONTRIBUTORS AS IS AND  ANY EXPRESS OR IMPLIED WARRANTIES,
52 * INCLUDING,  BUT NOT  LIMITED TO,  THE IMPLIED  WARRANTIES OF
53 * MERCHANTABILITY  AND FITNESS  FOR A  PARTICULAR  PURPOSE ARE
54 * DISCLAIMED.  IN  NO  EVENT  SHALL  ADVANCED  MICRO  DEVICES,
55 * INC.  OR CONTRIBUTORS  BE LIABLE  FOR ANY  DIRECT, INDIRECT,
56 * INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR CONSEQUENTIAL  DAMAGES
57 * (INCLUDING,  BUT NOT LIMITED  TO, PROCUREMENT  OF SUBSTITUTE
58 * GOODS  OR  SERVICES;  LOSS  OF  USE, DATA,  OR  PROFITS;  OR
59 * BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON  ANY THEORY OF
60 * LIABILITY,  WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
61 * (INCLUDING NEGLIGENCE  OR OTHERWISE) ARISING IN  ANY WAY OUT
62 * OF THE  USE  OF  THIS  SOFTWARE, EVEN  IF  ADVISED  OF  THE
63 * POSSIBILITY OF SUCH DAMAGE.
64 *
65 * It is  licensee's responsibility  to comply with  any export
66 * regulations applicable in licensee's jurisdiction.
67 */
68
69	.ident	"%Z%%M%	%I%	%E% SMI"
70
71	.file	"%M%"
72
73#include <sys/asm_linkage.h>
74
75	ANSI_PRAGMA_WEAK(memcmp,function)
76
77#include "SYS.h"
78#include "cache.h"
79
/*
 * LABEL(s) forms the function-local label name ".memcmp<s>".  The empty
 * comment in the macro body acts as a token paster: it relies on a
 * traditional (pre-ISO) preprocessor deleting the comment without
 * leaving whitespace behind.
 * NOTE(review): an ISO C preprocessor would need the paste operator here.
 */
#define LABEL(s) .memcmp/**/s

/*
 * int memcmp(const void *s1, const void *s2, size_t n)
 *
 * In:    %rdi = s1, %rsi = s2, %rdx = n (full 64-bit byte count)
 * Out:   %eax = 0 when the first n bytes are equal, otherwise
 *        (first differing byte of s1) - (same byte of s2), both bytes
 *        zero-extended before the subtraction
 * Uses:  %rcx, %r8, %r9, %r10, %r11 and the flags as scratch; no stack
 *
 * The routine is a cascade of compare loops of increasing width:
 *
 *      n < 8           byte loop
 *      8 <= n < 32     8-byte loop
 *      32 <= n <= 2048 32-byte loop
 *      n > 2048        align %rsi to 8 bytes with a byte loop, then a
 *                      64-byte loop limited by .amd64cache1half, then a
 *                      prefetching 64-byte loop limited by
 *                      .amd64cache2half, then a prefetching 128-byte loop
 *
 * Wide loops only detect that a block differs somewhere (the word
 * differences are OR-ed together).  On a difference, or when fewer bytes
 * remain than the loop width, control drops to the next narrower stage
 * with %rdi/%rsi still pointing at the start of the unverified data and
 * %rdx holding the bytes left.  The byte loop finally produces the
 * return value.
 *
 * Because narrower stages can be entered from the large-buffer loops,
 * %rdx may exceed 32 bits there; every stage therefore derives its
 * iteration count from the full 64-bit %rdx.
 */
	ENTRY(memcmp)                 /* (const void *, const void*, size_t) */

LABEL(try1):
        cmp     $8, %rdx
        jae     LABEL(1after)           /* n >= 8: use a wider stage */

LABEL(1):                                /* 1-byte */
        test    %rdx, %rdx
        mov     $0, %eax                /* mov leaves ZF from test intact */
        jz      LABEL(exit)             /* nothing left: buffers equal */

LABEL(1loop):
        movzbl  (%rdi), %eax
        movzbl  (%rsi), %ecx
        sub     %ecx, %eax              /* %eax = *s1 - *s2 (unsigned bytes) */
        jnz     LABEL(exit)             /* difference found: it is the result */

        dec     %rdx

        lea     1 (%rdi), %rdi          /* lea: advance without touching ZF */
        lea     1 (%rsi), %rsi

        jnz     LABEL(1loop)            /* ZF from dec; %eax is 0 on exit */

LABEL(exit):
        /*
         * Two-byte return.  NOTE(review): presumably the AMD-recommended
         * form for a ret that is itself a branch target.
         */
        rep
        ret

        .p2align 4

LABEL(1after):

LABEL(8try):
        cmp     $32, %rdx
        jae     LABEL(8after)           /* n >= 32: use a wider stage */

LABEL(8):                        /* 8-byte */
        /*
         * Also entered from the 32-byte stage, where %rdx can be wider
         * than 32 bits, so the count is taken from the whole register.
         */
        mov     %rdx, %rcx
        shr     $3, %rcx                /* %rcx = number of whole 8-byte words */
        jz      LABEL(1)                /* fewer than 8 bytes left */

        .p2align 4

LABEL(8loop):
        mov     (%rsi), %rax
        cmp     (%rdi), %rax
        jne     LABEL(1)                /* locate the byte in the byte loop */

        sub     $8, %rdx
        dec     %rcx

        lea     8 (%rsi), %rsi          /* lea keeps ZF from dec */
        lea     8 (%rdi), %rdi

        jnz     LABEL(8loop)

LABEL(8skip):
        and     $7, %edx                /* here %rdx < 8; sets ZF on the tail */
        jnz     LABEL(1)

        xor     %eax, %eax              /* everything compared equal */
        ret

        .p2align 4

LABEL(8after):

LABEL(32try):
        cmp     $2048, %rdx
        ja      LABEL(32after)          /* n > 2048: large-buffer path */

LABEL(32):                               /* 32-byte */
        /*
         * Also entered from the 64/128-byte loops on a difference, with
         * %rdx possibly 4GB or more; use the full 64-bit count so the
         * remaining length is never truncated.
         */
        mov     %rdx, %rcx
        shr     $5, %rcx                /* %rcx = number of whole 32-byte blocks */
        jz      LABEL(8)

        .p2align 4

LABEL(32loop):
        mov        (%rsi), %rax
        mov      8 (%rsi),  %r8
        mov     16 (%rsi),  %r9
        mov     24 (%rsi), %r10
        sub        (%rdi), %rax
        sub      8 (%rdi),  %r8
        sub     16 (%rdi),  %r9
        sub     24 (%rdi), %r10

        or      %rax,  %r8              /* non-zero iff any of the 4 words differ */
        or       %r9, %r10
        or       %r8, %r10
        jnz     LABEL(8)                /* narrow down within this block */

        sub     $32, %rdx
        dec     %rcx

        lea     32 (%rsi), %rsi         /* lea keeps ZF from dec */
        lea     32 (%rdi), %rdi

        jnz     LABEL(32loop)

LABEL(32skip):
        and     $31, %edx               /* here %rdx < 32; sets ZF on the tail */
        jnz     LABEL(8)

        xor     %eax, %eax
        ret

        .p2align 4

LABEL(32after):

        /*
         * Prefetch the cache-size variable that is loaded at 64try.
         * NOTE(review): .amd64cache1half is defined outside this file;
         * presumably half the L1 data cache size in bytes - confirm.
         */
	prefetchnta _sref_(.amd64cache1half)	/* 3DNow: use prefetch */

LABEL(srctry):
        mov     %esi, %r8d      /* align by source */

        and     $7, %r8d                /* %r8 = s2 & 7, upper bits zero */
        jz      LABEL(srcafter)  /* not unaligned */

LABEL(src):                      /* align */
        lea     -8 (%r8, %rdx), %rdx    /* n -= 8 - (s2 & 7), charged up front */
        sub     $8, %r8d                /* %r8d = -(bytes to compare), counts up to 0 */


LABEL(srcloop):
        movzbl  (%rdi), %eax
        movzbl  (%rsi), %ecx
        sub     %ecx, %eax
        jnz     LABEL(exit)             /* difference: %eax already is the result */

        inc     %r8d

        lea     1 (%rdi), %rdi          /* lea keeps ZF from inc */
        lea     1 (%rsi), %rsi

        jnz     LABEL(srcloop)

        .p2align 4

LABEL(srcafter):

LABEL(64try):
        mov     _sref_(.amd64cache1half), %rcx
        cmp	%rdx, %rcx
        cmova   %rdx, %rcx              /* %rcx = min(cache1half, n), unsigned */

LABEL(64):                               /* 64-byte */
        shr     $6, %rcx                /* %rcx = 64-byte blocks for this stage */
        jz      LABEL(32)

        .p2align 4

LABEL(64loop):
        mov        (%rsi), %rax
        mov      8 (%rsi),  %r8
        sub        (%rdi), %rax
        sub      8 (%rdi),  %r8
        or      %r8,  %rax

        mov     16 (%rsi),  %r9
        mov     24 (%rsi), %r10
        sub     16 (%rdi),  %r9
        sub     24 (%rdi), %r10
        or      %r10, %r9

        or      %r9,  %rax
        jnz     LABEL(32)               /* difference in bytes 0..31 */

        mov     32 (%rsi), %rax
        mov     40 (%rsi),  %r8
        sub     32 (%rdi), %rax
        sub     40 (%rdi),  %r8
        or      %r8,  %rax

        mov     48 (%rsi),  %r9
        mov     56 (%rsi), %r10
        sub     48 (%rdi),  %r9
        sub     56 (%rdi), %r10
        or      %r10, %r9

        or      %r9,  %rax
        jnz    	LABEL(32)               /* difference in bytes 32..63 */

        lea     64 (%rsi), %rsi
        lea     64 (%rdi), %rdi

        sub     $64, %rdx
        dec     %rcx
        jnz     LABEL(64loop)

LABEL(64skip):
        cmp     $2048, %rdx
        ja     LABEL(64after)           /* still large: go to prefetching loop */

        test    %edx, %edx              /* %rdx <= 2048 here, so 32 bits suffice */
        jnz     LABEL(32)

        xor     %eax, %eax
        ret

        .p2align 4

LABEL(64after):

LABEL(pretry):

LABEL(pre):                              /* 64-byte prefetching */
        /*
         * NOTE(review): .amd64cache2half is defined outside this file;
         * presumably half the L2 cache size in bytes - confirm.
         */
        mov     _sref_(.amd64cache2half), %rcx
        cmp	%rdx, %rcx
        cmova   %rdx, %rcx              /* %rcx = min(cache2half, n), unsigned */

        shr     $6, %rcx                /* %rcx = 64-byte blocks for this stage */
        jz      LABEL(preskip)

        /*
         * The loop is entered directly.  A separately peeled first
         * iteration would have to re-test %rcx before falling into the
         * loop; without that test a count of exactly 1 wraps the counter
         * and runs the loop past the end of the buffers.
         */

        .p2align 4

LABEL(preloop):
        prefetchnta 512 (%rsi)	/* 3DNow: use prefetch */
        prefetchnta 512 (%rdi)	/* 3DNow: use prefetch */

        mov        (%rsi), %rax
        mov      8 (%rsi), %r9
        mov     16 (%rsi), %r10
        mov     24 (%rsi), %r11
        sub        (%rdi), %rax
        sub      8 (%rdi), %r9
        sub     16 (%rdi), %r10
        sub     24 (%rdi), %r11

        or       %r9, %rax
        or      %r11, %r10
        or      %r10, %rax
        jnz     LABEL(32)               /* difference in bytes 0..31 */

        mov     32 (%rsi), %rax
        mov     40 (%rsi), %r9
        mov     48 (%rsi), %r10
        mov     56 (%rsi), %r11
        sub     32 (%rdi), %rax
        sub     40 (%rdi), %r9
        sub     48 (%rdi), %r10
        sub     56 (%rdi), %r11

        or       %r9, %rax
        or      %r11, %r10
        or      %r10, %rax
        jnz     LABEL(32)               /* difference in bytes 32..63 */

        lea     64 (%rsi), %rsi
        lea     64 (%rdi), %rdi

        sub     $64, %rdx
        dec     %rcx
        jnz     LABEL(preloop)


LABEL(preskip):
        cmp     $2048, %rdx
        ja      LABEL(preafter)         /* still large: go to 128-byte loop */

        test    %edx, %edx              /* %rdx <= 2048 here, so 32 bits suffice */
        jnz     LABEL(32)

        xor     %eax, %eax
        ret

        .p2align 4

LABEL(preafter):

LABEL(128try):

LABEL(128):                              /* 128-byte */
        mov     %rdx, %rcx
        shr     $7, %rcx                /* %rcx = number of whole 128-byte blocks */
        jz      LABEL(128skip)

        .p2align 4

LABEL(128loop):
        prefetchnta 512 (%rsi)	/* 3DNow: use prefetch */
        prefetchnta 512 (%rdi)	/* 3DNow: use prefetch */

        mov        (%rsi), %rax
        mov      8 (%rsi), %r8
        sub        (%rdi), %rax
        sub      8 (%rdi), %r8
        mov     16 (%rsi), %r9
        mov     24 (%rsi), %r10
        sub     16 (%rdi), %r9
        sub     24 (%rdi), %r10

        or       %r8, %rax
        or       %r9, %r10
        or      %r10, %rax

        mov     32 (%rsi), %r8
        mov     40 (%rsi), %r9
        sub     32 (%rdi), %r8
        sub     40 (%rdi), %r9
        mov     48 (%rsi), %r10
        mov     56 (%rsi), %r11
        sub     48 (%rdi), %r10
        sub     56 (%rdi), %r11

        or       %r9, %r8
        or      %r11, %r10
        or      %r10, %r8

        or      %r8, %rax
        jnz     LABEL(32)               /* difference in bytes 0..63 */

        prefetchnta 576 (%rsi)	/* 3DNow: use prefetch */
        prefetchnta 576 (%rdi)	/* 3DNow: use prefetch */

        mov      64 (%rsi), %rax
        mov      72 (%rsi), %r8
        sub      64 (%rdi), %rax
        sub      72 (%rdi), %r8
        mov      80 (%rsi), %r9
        mov      88 (%rsi), %r10
        sub      80 (%rdi), %r9
        sub      88 (%rdi), %r10

        or       %r8, %rax
        or       %r9, %r10
        or      %r10, %rax

        mov      96 (%rsi), %r8
        mov     104 (%rsi), %r9
        sub      96 (%rdi), %r8
        sub     104 (%rdi), %r9
        mov     112 (%rsi), %r10
        mov     120 (%rsi), %r11
        sub     112 (%rdi), %r10
        sub     120 (%rdi), %r11

        or       %r9, %r8
        or      %r11, %r10
        or      %r10, %r8

        or      %r8, %rax
        jnz     LABEL(32)               /* difference in bytes 64..127; the */
                                        /* 32-byte stage rescans from byte 0 */

        sub     $128, %rdx
        dec     %rcx

        lea     128 (%rsi), %rsi        /* lea keeps ZF from dec */
        lea     128 (%rdi), %rdi

        jnz     LABEL(128loop)

LABEL(128skip):
        and     $127, %edx              /* here %rdx < 128; sets ZF on the tail */
        jnz     LABEL(32)

        xor     %eax, %eax
        ret

	SET_SIZE(memcmp)
480