/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2002 Advanced Micro Devices, Inc.
 * 
 * All rights reserved.
 * 
 * Redistribution and  use in source and binary  forms, with or
 * without  modification,  are   permitted  provided  that  the
 * following conditions are met:
 * 
 * + Redistributions  of source  code  must  retain  the  above
 *   copyright  notice,   this  list  of   conditions  and  the
 *   following disclaimer.
 * 
 * + Redistributions  in binary  form must reproduce  the above
 *   copyright  notice,   this  list  of   conditions  and  the
 *   following  disclaimer in  the  documentation and/or  other
 *   materials provided with the distribution.
 * 
 * + Neither the  name of Advanced Micro Devices,  Inc. nor the
 *   names  of  its contributors  may  be  used  to endorse  or
 *   promote  products  derived   from  this  software  without
 *   specific prior written permission.
 * 
 * THIS  SOFTWARE  IS PROVIDED  BY  THE  COPYRIGHT HOLDERS  AND
 * CONTRIBUTORS AS IS AND  ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING,  BUT NOT  LIMITED TO,  THE IMPLIED  WARRANTIES OF
 * MERCHANTABILITY  AND FITNESS  FOR A  PARTICULAR  PURPOSE ARE
 * DISCLAIMED.  IN  NO  EVENT  SHALL  ADVANCED  MICRO  DEVICES,
 * INC.  OR CONTRIBUTORS  BE LIABLE  FOR ANY  DIRECT, INDIRECT,
 * INCIDENTAL,  SPECIAL,  EXEMPLARY,  OR CONSEQUENTIAL  DAMAGES
 * (INCLUDING,  BUT NOT LIMITED  TO, PROCUREMENT  OF SUBSTITUTE
 * GOODS  OR  SERVICES;  LOSS  OF  USE, DATA,  OR  PROFITS;  OR
 * BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON  ANY THEORY OF
 * LIABILITY,  WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
 * (INCLUDING NEGLIGENCE  OR OTHERWISE) ARISING IN  ANY WAY OUT
 * OF THE  USE  OF  THIS  SOFTWARE, EVEN  IF  ADVISED  OF  THE
 * POSSIBILITY OF SUCH DAMAGE.
 * 
 * It is  licensee's responsibility  to comply with  any export
 * regulations applicable in licensee's jurisdiction.
 */

	/* Record the source file name in the assembler output. */
	.file	"memcmp.s"

#include <sys/asm_linkage.h>

	/*
	 * NOTE(review): presumably gives the memcmp function symbol a weak
	 * binding; ANSI_PRAGMA_WEAK is defined outside this file -- confirm
	 * against its definition.
	 */
	ANSI_PRAGMA_WEAK(memcmp,function)

#include "SYS.h"
#include "cache.h"

/*
 * LABEL(s) builds the names of the labels used inside memcmp by prefixing
 * `s` with `.memcmp`, e.g. LABEL(8loop) is meant to yield .memcmp8loop.
 *
 * The empty comment between `.memcmp` and `s` acts as a token-pasting
 * separator.  NOTE(review): this only pastes under a traditional (pre-ANSI)
 * preprocessor, which drops comments without leaving whitespace; an ISO C
 * preprocessor replaces the comment with a space and the label would fall
 * apart.  Confirm the build keeps using the traditional preprocessor for
 * this file before changing the macro.
 */
#define LABEL(s) .memcmp/**/s

	/*
	 * int memcmp(const void *s1, const void *s2, size_t n)
	 *
	 * ABI:  System V AMD64 register arguments.
	 * In:   %rdi = s1, %rsi = s2, %rdx = n (byte count)
	 * Out:  %eax = 0 when the first n bytes are equal, otherwise
	 *       (first differing byte of s1) - (matching byte of s2), with
	 *       both bytes zero-extended before the subtraction.
	 * Uses: %rax, %rcx, %rdx, %rsi, %rdi, %r8-%r11 and the flags.
	 *       No stack is used and nothing is called.
	 *
	 * Strategy: the size selects the widest loop to start with
	 *
	 *     n <  8        byte loop
	 *     n <  32       8-byte loop
	 *     n <= 2048     32-byte loop
	 *     n >  2048     byte-wise alignment of s2 to 8 bytes, then a
	 *                   64-byte loop for up to .amd64cache1half bytes,
	 *                   a prefetching 64-byte loop for up to
	 *                   .amd64cache2half bytes, and a 128-byte loop for
	 *                   whatever is left.
	 *
	 * NOTE(review): .amd64cache1half/.amd64cache2half are defined outside
	 * this file; presumably half of the L1/L2 data cache size -- confirm.
	 *
	 * The wide loops only detect *that* a chunk differs.  On a mismatch
	 * (or when fewer bytes than one chunk remain) control drops to the
	 * next narrower tier with %rdi/%rsi still pointing at the start of
	 * the unprocessed data and %rdx holding the remaining length, until
	 * the byte loop computes the actual return value.
	 *
	 * Invariant: %rdx is the remaining length as a full 64-bit value in
	 * every tier.  The narrower tiers can be entered from the large-size
	 * loops with %rdx >= 4 GiB, so their counters must be 64 bits wide;
	 * deriving them from %edx could end the comparison early and report
	 * "equal" for buffers that differ.
	 */
	ENTRY(memcmp)                 /* (const void *, const void*, size_t) */

LABEL(try1):
        cmp     $8, %rdx
        jae     LABEL(1after)

LABEL(1):                                /* 1-byte */
        test    %rdx, %rdx
        mov     $0, %eax                 /* mov keeps the flags from test */
        jz      LABEL(exit)              /* nothing left: equal */

LABEL(1loop):
        movzbl  (%rdi), %eax
        movzbl  (%rsi), %ecx
        sub     %ecx, %eax               /* eax = *s1 - *s2 */
        jnz     LABEL(exit)              /* difference is the result */

        dec     %rdx

        lea     1 (%rdi), %rdi           /* lea: advance, flags untouched */
        lea     1 (%rsi), %rsi

        jnz     LABEL(1loop)             /* flags from dec %rdx */

        /* falls through with eax == 0: all bytes matched */
LABEL(exit):
        rep
        ret

        .p2align 4

LABEL(1after):

LABEL(8try):
        cmp     $32, %rdx
        jae     LABEL(8after)

LABEL(8):                        /* 8-byte */
        mov     %rdx, %rcx               /* full width: rdx may be >= 4 GiB */
        shr     $3, %rcx                 /* rcx = number of whole 8-byte words */
        jz      LABEL(1)

        .p2align 4

LABEL(8loop):
        mov     (%rsi), %rax
        cmp     (%rdi), %rax
        jne     LABEL(1)                 /* byte loop finds the differing byte */

        sub     $8, %rdx
        dec     %rcx

        lea     8 (%rsi), %rsi
        lea     8 (%rdi), %rdi

        jnz     LABEL(8loop)             /* flags from dec %rcx */

LABEL(8skip):
        and     $7, %rdx                 /* rdx < 8 here; tail bytes left? */
        jnz     LABEL(1)

        xor     %eax, %eax
        ret

        .p2align 4

LABEL(8after):

LABEL(32try):
        cmp     $2048, %rdx
        ja      LABEL(32after)

LABEL(32):                               /* 32-byte */
        mov     %rdx, %rcx               /* full width: rdx may be >= 4 GiB */
        shr     $5, %rcx                 /* rcx = number of whole 32-byte chunks */
        jz      LABEL(8)

        .p2align 4

LABEL(32loop):
        mov        (%rsi), %rax
        mov      8 (%rsi),  %r8
        mov     16 (%rsi),  %r9
        mov     24 (%rsi), %r10
        sub        (%rdi), %rax
        sub      8 (%rdi),  %r8
        sub     16 (%rdi),  %r9
        sub     24 (%rdi), %r10

        /* OR the four differences together: non-zero means a mismatch */
        or      %rax,  %r8
        or       %r9, %r10
        or       %r8, %r10
        jnz     LABEL(8)

        sub     $32, %rdx
        dec     %rcx

        lea     32 (%rsi), %rsi
        lea     32 (%rdi), %rdi

        jnz     LABEL(32loop)            /* flags from dec %rcx */

LABEL(32skip):
        and     $31, %rdx                /* rdx < 32 here; tail bytes left? */
        jnz     LABEL(8)

        xor     %eax, %eax
        ret

        .p2align 4

LABEL(32after):

	/* fetch the cache-size variable that LABEL(64try) loads below */
	prefetchnta _sref_(.amd64cache1half)	/* 3DNow: use prefetch */

LABEL(srctry):
        mov     %esi, %r8d      /* align by source */

        and     $7, %r8d
        jz      LABEL(srcafter)  /* not unaligned */

LABEL(src):                      /* align */
        lea     -8 (%r8, %rdx), %rdx     /* rdx -= 8 - (s2 & 7) */
        sub     $8, %r8d                 /* r8d = -(bytes to compare here) */


LABEL(srcloop):
        movzbl  (%rdi), %eax
        movzbl  (%rsi), %ecx
        sub     %ecx, %eax
        jnz     LABEL(exit)

        inc     %r8d                     /* counts up towards zero */

        lea     1 (%rdi), %rdi
        lea     1 (%rsi), %rsi

        jnz     LABEL(srcloop)           /* flags from inc %r8d */

        .p2align 4

LABEL(srcafter):

LABEL(64try):
        mov     _sref_(.amd64cache1half), %rcx
        cmp	%rdx, %rcx
        cmova   %rdx, %rcx               /* rcx = min(cache1half, rdx) */

LABEL(64):                               /* 64-byte */
        shr     $6, %rcx                 /* rcx = number of 64-byte blocks */
        jz      LABEL(32)

        .p2align 4

LABEL(64loop):
        mov        (%rsi), %rax
        mov      8 (%rsi),  %r8
        sub        (%rdi), %rax
        sub      8 (%rdi),  %r8
        or      %r8,  %rax

        mov     16 (%rsi),  %r9
        mov     24 (%rsi), %r10
        sub     16 (%rdi),  %r9
        sub     24 (%rdi), %r10
        or      %r10, %r9

        or      %r9,  %rax
        jnz     LABEL(32)                /* mismatch in bytes 0..31 */

        mov     32 (%rsi), %rax
        mov     40 (%rsi),  %r8
        sub     32 (%rdi), %rax
        sub     40 (%rdi),  %r8
        or      %r8,  %rax

        mov     48 (%rsi),  %r9
        mov     56 (%rsi), %r10
        sub     48 (%rdi),  %r9
        sub     56 (%rdi), %r10
        or      %r10, %r9

        or      %r9,  %rax
        jnz    	LABEL(32)                /* mismatch in bytes 32..63 */

        lea     64 (%rsi), %rsi
        lea     64 (%rdi), %rdi

        sub     $64, %rdx
        dec     %rcx
        jnz     LABEL(64loop)

LABEL(64skip):
        cmp     $2048, %rdx
        ja     LABEL(64after)

        test    %edx, %edx               /* rdx <= 2048: low half suffices */
        jnz     LABEL(32)

        xor     %eax, %eax
        ret

        .p2align 4

LABEL(64after):

LABEL(pretry):

LABEL(pre):                              /* 64-byte prefetching */
        mov     _sref_(.amd64cache2half), %rcx
        cmp	%rdx, %rcx
        cmova   %rdx, %rcx               /* rcx = min(cache2half, rdx) */

        shr     $6, %rcx                 /* rcx = number of 64-byte blocks */
        jz      LABEL(preskip)

        /* first block, peeled out of the loop below */
        prefetchnta 512 (%rsi)	/* 3DNow: use prefetch */
        prefetchnta 512 (%rdi)	/* 3DNow: use prefetch */

        mov        (%rsi), %rax
        mov      8 (%rsi), %r9
        mov     16 (%rsi), %r10
        mov     24 (%rsi), %r11
        sub        (%rdi), %rax
        sub      8 (%rdi), %r9
        sub     16 (%rdi), %r10
        sub     24 (%rdi), %r11

        or       %r9, %rax
        or      %r11, %r10
        or      %r10, %rax
        jnz     LABEL(32)

        mov     32 (%rsi), %rax
        mov     40 (%rsi), %r9
        mov     48 (%rsi), %r10
        mov     56 (%rsi), %r11
        sub     32 (%rdi), %rax
        sub     40 (%rdi), %r9
        sub     48 (%rdi), %r10
        sub     56 (%rdi), %r11

        or       %r9, %rax
        or      %r11, %r10
        or      %r10, %rax
        jnz     LABEL(32)

        lea     64 (%rsi), %rsi
        lea     64 (%rdi), %rdi

        sub     $64, %rdx
        dec     %rcx
        /*
         * The peeled block may have been the only one.  Without this
         * check the loop below would be entered with rcx == 0 and its
         * own dec would wrap the counter around.
         */
        jz      LABEL(preskip)

        .p2align 4

LABEL(preloop):
        prefetchnta 512 (%rsi)	/* 3DNow: use prefetch */
        prefetchnta 512 (%rdi)	/* 3DNow: use prefetch */

        mov        (%rsi), %rax
        mov      8 (%rsi), %r9
        mov     16 (%rsi), %r10
        mov     24 (%rsi), %r11
        sub        (%rdi), %rax
        sub      8 (%rdi), %r9
        sub     16 (%rdi), %r10
        sub     24 (%rdi), %r11

        or       %r9, %rax
        or      %r11, %r10
        or      %r10, %rax
        jnz     LABEL(32)

        mov     32 (%rsi), %rax
        mov     40 (%rsi), %r9
        mov     48 (%rsi), %r10
        mov     56 (%rsi), %r11
        sub     32 (%rdi), %rax
        sub     40 (%rdi), %r9
        sub     48 (%rdi), %r10
        sub     56 (%rdi), %r11

        or       %r9, %rax
        or      %r11, %r10
        or      %r10, %rax
        jnz     LABEL(32)

        lea     64 (%rsi), %rsi
        lea     64 (%rdi), %rdi

        sub     $64, %rdx
        dec     %rcx
        jnz     LABEL(preloop)


LABEL(preskip):
        cmp     $2048, %rdx
        ja      LABEL(preafter)

        test    %edx, %edx               /* rdx <= 2048: low half suffices */
        jnz     LABEL(32)

        xor     %eax, %eax
        ret

        .p2align 4

LABEL(preafter):

LABEL(128try):

LABEL(128):                              /* 128-byte */
        mov     %rdx, %rcx
        shr     $7, %rcx                 /* rcx = number of 128-byte blocks */
        jz      LABEL(128skip)

        .p2align 4

LABEL(128loop):
        prefetchnta 512 (%rsi)	/* 3DNow: use prefetch */
        prefetchnta 512 (%rdi)	/* 3DNow: use prefetch */

        /* first 64 bytes of the block */
        mov        (%rsi), %rax
        mov      8 (%rsi), %r8
        sub        (%rdi), %rax
        sub      8 (%rdi), %r8
        mov     16 (%rsi), %r9
        mov     24 (%rsi), %r10
        sub     16 (%rdi), %r9
        sub     24 (%rdi), %r10

        or       %r8, %rax
        or       %r9, %r10
        or      %r10, %rax

        mov     32 (%rsi), %r8
        mov     40 (%rsi), %r9
        sub     32 (%rdi), %r8
        sub     40 (%rdi), %r9
        mov     48 (%rsi), %r10
        mov     56 (%rsi), %r11
        sub     48 (%rdi), %r10
        sub     56 (%rdi), %r11

        or       %r9, %r8
        or      %r11, %r10
        or      %r10, %r8

        or      %r8, %rax
        jnz     LABEL(32)

        prefetchnta 576 (%rsi)	/* 3DNow: use prefetch */
        prefetchnta 576 (%rdi)	/* 3DNow: use prefetch */

        /* second 64 bytes of the block */
        mov      64 (%rsi), %rax
        mov      72 (%rsi), %r8
        sub      64 (%rdi), %rax
        sub      72 (%rdi), %r8
        mov      80 (%rsi), %r9
        mov      88 (%rsi), %r10
        sub      80 (%rdi), %r9
        sub      88 (%rdi), %r10

        or       %r8, %rax
        or       %r9, %r10
        or      %r10, %rax

        mov      96 (%rsi), %r8
        mov     104 (%rsi), %r9
        sub      96 (%rdi), %r8
        sub     104 (%rdi), %r9
        mov     112 (%rsi), %r10
        mov     120 (%rsi), %r11
        sub     112 (%rdi), %r10
        sub     120 (%rdi), %r11

        or       %r9, %r8
        or      %r11, %r10
        or      %r10, %r8

        or      %r8, %rax
        jnz     LABEL(32)                /* rsi/rdi still at block start */

        sub     $128, %rdx
        dec     %rcx

        lea     128 (%rsi), %rsi
        lea     128 (%rdi), %rdi

        jnz     LABEL(128loop)           /* flags from dec %rcx */

LABEL(128skip):
        and     $127, %edx               /* rdx < 128 here (64-bit count above) */
        jnz     LABEL(32)

        xor     %eax, %eax
        ret

	SET_SIZE(memcmp)