xref: /titanic_44/usr/src/lib/libc/amd64/gen/memcmp.s (revision c28749e97052f09388969427adf7df641cdcdc22)
1/*
2 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
3 * Use is subject to license terms.
4 */
5
6/*
7 * Copyright (c) 2002 Advanced Micro Devices, Inc.
8 *
9 * All rights reserved.
10 *
11 * Redistribution and  use in source and binary  forms, with or
12 * without  modification,  are   permitted  provided  that  the
13 * following conditions are met:
14 *
15 * + Redistributions  of source  code  must  retain  the  above
16 *   copyright  notice,   this  list  of   conditions  and  the
17 *   following disclaimer.
18 *
19 * + Redistributions  in binary  form must reproduce  the above
20 *   copyright  notice,   this  list  of   conditions  and  the
21 *   following  disclaimer in  the  documentation and/or  other
22 *   materials provided with the distribution.
23 *
24 * + Neither the  name of Advanced Micro Devices,  Inc. nor the
25 *   names  of  its contributors  may  be  used  to endorse  or
26 *   promote  products  derived   from  this  software  without
27 *   specific prior written permission.
28 *
29 * THIS  SOFTWARE  IS PROVIDED  BY  THE  COPYRIGHT HOLDERS  AND
30 * CONTRIBUTORS AS IS AND  ANY EXPRESS OR IMPLIED WARRANTIES,
31 * INCLUDING,  BUT NOT  LIMITED TO,  THE IMPLIED  WARRANTIES OF
32 * MERCHANTABILITY  AND FITNESS  FOR A  PARTICULAR  PURPOSE ARE
33 * DISCLAIMED.  IN  NO  EVENT  SHALL  ADVANCED  MICRO  DEVICES,
34 * INC.  OR CONTRIBUTORS  BE LIABLE  FOR ANY  DIRECT, INDIRECT,
35 * INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR CONSEQUENTIAL  DAMAGES
36 * (INCLUDING,  BUT NOT LIMITED  TO, PROCUREMENT  OF SUBSTITUTE
37 * GOODS  OR  SERVICES;  LOSS  OF  USE, DATA,  OR  PROFITS;  OR
38 * BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON  ANY THEORY OF
39 * LIABILITY,  WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
40 * (INCLUDING NEGLIGENCE  OR OTHERWISE) ARISING IN  ANY WAY OUT
41 * OF THE  USE  OF  THIS  SOFTWARE, EVEN  IF  ADVISED  OF  THE
42 * POSSIBILITY OF SUCH DAMAGE.
43 *
44 * It is  licensee's responsibility  to comply with  any export
45 * regulations applicable in licensee's jurisdiction.
46 */
47
48	.ident	"%Z%%M%	%I%	%E% SMI"
49
50	.file	"%M%"
51
52#include <sys/asm_linkage.h>
53
54	ANSI_PRAGMA_WEAK(memcmp,function)
55
56#include "SYS.h"
57#include "cache.h"
58
/*
 * Local-label helper: LABEL(x) yields ".memcmpx".  The empty comment
 * is the pre-ANSI cpp token-pasting idiom, so this assumes the file is
 * run through a traditional-mode preprocessor.
 */
#define LABEL(s) .memcmp/**/s

/*
 * int memcmp(const void *s1, const void *s2, size_t n)
 *
 * In:		%rdi = s1, %rsi = s2, %rdx = n (byte count)
 * Out:		%eax = 0 when the first n bytes are equal; otherwise the
 *		difference (s1 byte - s2 byte, both zero-extended) at the
 *		first position where the buffers differ.
 * Clobbers:	%rax, %rcx, %rdx, %rsi, %rdi, %r8-%r11, flags.
 *		No stack is used and no callee-saved register is written.
 *
 * Strategy, selected by n:
 *	n < 8		byte loop
 *	8 <= n < 32	8-byte words, then byte tail
 *	32 <= n <= 2048	32-byte blocks, then 8-byte / byte tail
 *	n > 2048	byte-compare until s2 is 8-byte aligned, then
 *			64-byte blocks for up to .amd64cache1half bytes,
 *			64-byte blocks with prefetch for up to
 *			.amd64cache2half bytes, and 128-byte blocks with
 *			prefetch for whatever is left.
 *
 * .amd64cache1half / .amd64cache2half are defined outside this file
 * (presumably half the L1 / L2 cache size -- confirm in cache.h).
 *
 * The wide loops only detect that a block contains a difference.  They
 * then branch to the next narrower loop with %rsi/%rdi still pointing at
 * the start of that block and %rdx holding the full remaining length;
 * the byte loop finally locates the differing byte and forms the result.
 * Because %rdx may exceed 32 bits on those paths, every tail loop keeps
 * its iteration count in a 64-bit register.
 */
	ENTRY(memcmp)                 /* (const void *, const void*, size_t) */

LABEL(try1):
        cmp     $8, %rdx
        jae     LABEL(1after)

LABEL(1):                                /* 1-byte */
        test    %rdx, %rdx
        mov     $0, %eax                /* mov keeps the flags set by test */
        jz      LABEL(exit)             /* nothing left: equal */

LABEL(1loop):
        movzbl  (%rdi), %eax
        movzbl  (%rsi), %ecx
        sub     %ecx, %eax              /* eax = *s1 - *s2 */
        jnz     LABEL(exit)             /* non-zero difference is the result */

        dec     %rdx

        lea     1 (%rdi), %rdi          /* lea: advance without touching ZF */
        lea     1 (%rsi), %rsi

        jnz     LABEL(1loop)            /* ZF from dec; eax is 0 on fall-through */

LABEL(exit):
        rep                             /* two-byte "rep ret" form of return */
        ret

        .p2align 4

LABEL(1after):

LABEL(8try):
        cmp     $32, %rdx
        jae     LABEL(8after)

LABEL(8):                        /* 8-byte */
        mov     %rdx, %rcx              /* 64-bit count: rdx may be >= 4 GiB */
        shr     $3, %rcx                /* rcx = number of whole 8-byte words */
        jz      LABEL(1)

        .p2align 4

LABEL(8loop):
        mov     (%rsi), %rax
        cmp     (%rdi), %rax
        jne     LABEL(1)                /* let the byte loop find the byte */

        sub     $8, %rdx
        dec     %rcx

        lea     8 (%rsi), %rsi
        lea     8 (%rdi), %rdi

        jnz     LABEL(8loop)            /* ZF from dec */

LABEL(8skip):
        and     $7, %edx                /* rdx < 8 here; tests for a byte tail */
        jnz     LABEL(1)

        xor     %eax, %eax
        ret

        .p2align 4

LABEL(8after):

LABEL(32try):
        cmp     $2048, %rdx
        ja      LABEL(32after)

LABEL(32):                               /* 32-byte */
        mov     %rdx, %rcx              /* 64-bit count: rdx may be >= 4 GiB */
        shr     $5, %rcx                /* rcx = number of whole 32-byte blocks */
        jz      LABEL(8)

        .p2align 4

LABEL(32loop):
        mov        (%rsi), %rax
        mov      8 (%rsi),  %r8
        mov     16 (%rsi),  %r9
        mov     24 (%rsi), %r10
        sub        (%rdi), %rax
        sub      8 (%rdi),  %r8
        sub     16 (%rdi),  %r9
        sub     24 (%rdi), %r10

        or      %rax,  %r8              /* fold the four differences together */
        or       %r9, %r10
        or       %r8, %r10
        jnz     LABEL(8)                /* some word differs: narrow down */

        sub     $32, %rdx
        dec     %rcx

        lea     32 (%rsi), %rsi
        lea     32 (%rdi), %rdi

        jnz     LABEL(32loop)           /* ZF from dec */

LABEL(32skip):
        and     $31, %edx               /* rdx < 32 here; tests for a tail */
        jnz     LABEL(8)

        xor     %eax, %eax
        ret

        .p2align 4

LABEL(32after):

	/* start fetching the size variable that 64try loads below */
	prefetchnta _sref_(.amd64cache1half)	/* 3DNow: use prefetch */

LABEL(srctry):
        mov     %esi, %r8d      /* align by source */

        and     $7, %r8d
        jz      LABEL(srcafter)  /* not unaligned */

LABEL(src):                      /* align */
        lea     -8 (%r8, %rdx), %rdx    /* rdx -= (8 - misalignment) */
        sub     $8, %r8d                /* r8d = -(bytes to compare), counts up */


LABEL(srcloop):
        movzbl  (%rdi), %eax
        movzbl  (%rsi), %ecx
        sub     %ecx, %eax
        jnz     LABEL(exit)

        inc     %r8d

        lea     1 (%rdi), %rdi
        lea     1 (%rsi), %rsi

        jnz     LABEL(srcloop)          /* ZF from inc: stop when r8d hits 0 */

        .p2align 4

LABEL(srcafter):

LABEL(64try):
        mov     _sref_(.amd64cache1half), %rcx
        cmp	%rdx, %rcx
        cmova   %rdx, %rcx              /* rcx = min(.amd64cache1half, rdx) */

LABEL(64):                               /* 64-byte */
        shr     $6, %rcx                /* rcx = 64-byte blocks for this stage */
        jz      LABEL(32)

        .p2align 4

LABEL(64loop):
        mov        (%rsi), %rax
        mov      8 (%rsi),  %r8
        sub        (%rdi), %rax
        sub      8 (%rdi),  %r8
        or      %r8,  %rax

        mov     16 (%rsi),  %r9
        mov     24 (%rsi), %r10
        sub     16 (%rdi),  %r9
        sub     24 (%rdi), %r10
        or      %r10, %r9

        or      %r9,  %rax
        jnz     LABEL(32)               /* difference in bytes 0..31 */

        mov     32 (%rsi), %rax
        mov     40 (%rsi),  %r8
        sub     32 (%rdi), %rax
        sub     40 (%rdi),  %r8
        or      %r8,  %rax

        mov     48 (%rsi),  %r9
        mov     56 (%rsi), %r10
        sub     48 (%rdi),  %r9
        sub     56 (%rdi), %r10
        or      %r10, %r9

        or      %r9,  %rax
        jnz    	LABEL(32)               /* difference in bytes 32..63 */

        lea     64 (%rsi), %rsi
        lea     64 (%rdi), %rdi

        sub     $64, %rdx
        dec     %rcx
        jnz     LABEL(64loop)

LABEL(64skip):
        cmp     $2048, %rdx
        ja     LABEL(64after)           /* still large: go to prefetch stage */

        test    %edx, %edx              /* rdx <= 2048 here, edx is enough */
        jnz     LABEL(32)

        xor     %eax, %eax
        ret

        .p2align 4

LABEL(64after):

LABEL(pretry):

LABEL(pre):                              /* 64-byte prefetching */
        mov     _sref_(.amd64cache2half), %rcx
        cmp	%rdx, %rcx
        cmova   %rdx, %rcx              /* rcx = min(.amd64cache2half, rdx) */

        shr     $6, %rcx                /* rcx = 64-byte blocks for this stage */
        jz      LABEL(preskip)

        /* first block of the stage, peeled out of the loop */
        prefetchnta 512 (%rsi)	/* 3DNow: use prefetch */
        prefetchnta 512 (%rdi)	/* 3DNow: use prefetch */

        mov        (%rsi), %rax
        mov      8 (%rsi), %r9
        mov     16 (%rsi), %r10
        mov     24 (%rsi), %r11
        sub        (%rdi), %rax
        sub      8 (%rdi), %r9
        sub     16 (%rdi), %r10
        sub     24 (%rdi), %r11

        or       %r9, %rax
        or      %r11, %r10
        or      %r10, %rax
        jnz     LABEL(32)

        mov     32 (%rsi), %rax
        mov     40 (%rsi), %r9
        mov     48 (%rsi), %r10
        mov     56 (%rsi), %r11
        sub     32 (%rdi), %rax
        sub     40 (%rdi), %r9
        sub     48 (%rdi), %r10
        sub     56 (%rdi), %r11

        or       %r9, %rax
        or      %r11, %r10
        or      %r10, %rax
        jnz     LABEL(32)

        lea     64 (%rsi), %rsi
        lea     64 (%rdi), %rdi

        sub     $64, %rdx
        dec     %rcx
        /*
         * Without this check a count of exactly one would enter the loop
         * with rcx == 0, wrap on the next dec and run past the buffers.
         */
        jz      LABEL(preskip)

        .p2align 4

LABEL(preloop):
        prefetchnta 512 (%rsi)	/* 3DNow: use prefetch */
        prefetchnta 512 (%rdi)	/* 3DNow: use prefetch */

        mov        (%rsi), %rax
        mov      8 (%rsi), %r9
        mov     16 (%rsi), %r10
        mov     24 (%rsi), %r11
        sub        (%rdi), %rax
        sub      8 (%rdi), %r9
        sub     16 (%rdi), %r10
        sub     24 (%rdi), %r11

        or       %r9, %rax
        or      %r11, %r10
        or      %r10, %rax
        jnz     LABEL(32)

        mov     32 (%rsi), %rax
        mov     40 (%rsi), %r9
        mov     48 (%rsi), %r10
        mov     56 (%rsi), %r11
        sub     32 (%rdi), %rax
        sub     40 (%rdi), %r9
        sub     48 (%rdi), %r10
        sub     56 (%rdi), %r11

        or       %r9, %rax
        or      %r11, %r10
        or      %r10, %rax
        jnz     LABEL(32)

        lea     64 (%rsi), %rsi
        lea     64 (%rdi), %rdi

        sub     $64, %rdx
        dec     %rcx
        jnz     LABEL(preloop)


LABEL(preskip):
        cmp     $2048, %rdx
        ja      LABEL(preafter)         /* still large: go to 128-byte stage */

        test    %edx, %edx              /* rdx <= 2048 here, edx is enough */
        jnz     LABEL(32)

        xor     %eax, %eax
        ret

        .p2align 4

LABEL(preafter):

LABEL(128try):

LABEL(128):                              /* 128-byte */
        mov     %rdx, %rcx
        shr     $7, %rcx                /* rcx = number of whole 128-byte blocks */
        jz      LABEL(128skip)

        .p2align 4

LABEL(128loop):
        prefetchnta 512 (%rsi)	/* 3DNow: use prefetch */
        prefetchnta 512 (%rdi)	/* 3DNow: use prefetch */

        /* bytes 0..63: accumulate all eight differences into rax */
        mov        (%rsi), %rax
        mov      8 (%rsi), %r8
        sub        (%rdi), %rax
        sub      8 (%rdi), %r8
        mov     16 (%rsi), %r9
        mov     24 (%rsi), %r10
        sub     16 (%rdi), %r9
        sub     24 (%rdi), %r10

        or       %r8, %rax
        or       %r9, %r10
        or      %r10, %rax

        mov     32 (%rsi), %r8
        mov     40 (%rsi), %r9
        sub     32 (%rdi), %r8
        sub     40 (%rdi), %r9
        mov     48 (%rsi), %r10
        mov     56 (%rsi), %r11
        sub     48 (%rdi), %r10
        sub     56 (%rdi), %r11

        or       %r9, %r8
        or      %r11, %r10
        or      %r10, %r8

        or      %r8, %rax
        jnz     LABEL(32)

        prefetchnta 576 (%rsi)	/* 3DNow: use prefetch */
        prefetchnta 576 (%rdi)	/* 3DNow: use prefetch */

        /* bytes 64..127, same scheme */
        mov      64 (%rsi), %rax
        mov      72 (%rsi), %r8
        sub      64 (%rdi), %rax
        sub      72 (%rdi), %r8
        mov      80 (%rsi), %r9
        mov      88 (%rsi), %r10
        sub      80 (%rdi), %r9
        sub      88 (%rdi), %r10

        or       %r8, %rax
        or       %r9, %r10
        or      %r10, %rax

        mov      96 (%rsi), %r8
        mov     104 (%rsi), %r9
        sub      96 (%rdi), %r8
        sub     104 (%rdi), %r9
        mov     112 (%rsi), %r10
        mov     120 (%rsi), %r11
        sub     112 (%rdi), %r10
        sub     120 (%rdi), %r11

        or       %r9, %r8
        or      %r11, %r10
        or      %r10, %r8

        or      %r8, %rax
        jnz     LABEL(32)               /* rsi/rdi still at block start */

        sub     $128, %rdx
        dec     %rcx

        lea     128 (%rsi), %rsi
        lea     128 (%rdi), %rdi

        jnz     LABEL(128loop)          /* ZF from dec */

LABEL(128skip):
        and     $127, %edx              /* rdx < 128 here; tests for a tail */
        jnz     LABEL(32)

        xor     %eax, %eax
        ret

	SET_SIZE(memcmp)
459