/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <private/bionic_asm.h>

#include "cache.h"

#ifndef L
# define L(label)       .L##label
#endif

#ifndef ALIGN
# define ALIGN(n)       .p2align n
#endif

        .section .text.avx2,"ax",@progbits

/*
 * __memset_chk_avx2 -- length-checked entry point (AT&T syntax).
 *
 * In:  %rdi = dst, %rsi = byte, %rdx = n, %rcx = dst_len
 *
 * When n > dst_len (unsigned compare) control is transferred to
 * __memset_chk_fail, which is defined outside this file.  Otherwise
 * execution falls through into memset_avx2, which must therefore stay
 * directly below this entry point.
 */
ENTRY(__memset_chk_avx2)
        cmp     %rcx, %rdx              // flags = n - dst_len
        ja      __memset_chk_fail       // n > dst_len
        // Fall through to memset...
END(__memset_chk_avx2)

/*
 * memset_avx2 -- fill n bytes at dst with the low 8 bits of %rsi.
 *
 * In:   %rdi = dst, %rsi = byte, %rdx = n
 * Out:  %rax = dst
 * Uses: %rcx, %rdx, %rsi, %r8, %xmm0/%ymm0, flags
 *
 * Strategy:
 *   n < 16       : scalar stores.  The highest of bits 8/4/2 set in n
 *                  selects a store width; one store is anchored at the
 *                  start and one at the end of the buffer (they may
 *                  overlap).  A single byte store handles n == 1.
 *   16..256      : the same head/tail overlap trick with 16-byte stores
 *                  (and 32-byte stores once n > 128).
 *   n > 256      : the first and last 128 bytes are written with
 *                  unaligned stores, then a loop fills the 128-byte
 *                  aligned range in between.  Non-temporal stores are
 *                  used when n is above the shared cache size.
 *
 * Every path that has written the upper half of %ymm0 executes
 * vzeroupper before returning or before running legacy SSE encoded
 * instructions, to avoid AVX/SSE transition penalties.  Paths for
 * n <= 128 never touch the upper halves and skip it.
 */
ENTRY(memset_avx2)
        movq    %rdi, %rax              // return value = dst
        and     $0xff, %rsi             // keep only the fill byte
        mov     $0x0101010101010101, %rcx
        imul    %rsi, %rcx              // %rcx = byte replicated 8 times
        cmpq    $16, %rdx
        jae     L(16bytesormore)
        testb   $8, %dl
        jnz     L(8_15bytes)
        testb   $4, %dl
        jnz     L(4_7bytes)
        testb   $2, %dl
        jnz     L(2_3bytes)
        testb   $1, %dl
        jz      L(return)               // n == 0
        movb    %cl, (%rdi)             // n == 1
L(return):
        ret

L(8_15bytes):
        movq    %rcx, (%rdi)
        movq    %rcx, -8(%rdi, %rdx)    // last 8 bytes, may overlap the first
        ret

L(4_7bytes):
        movl    %ecx, (%rdi)
        movl    %ecx, -4(%rdi, %rdx)
        ret

L(2_3bytes):
        movw    %cx, (%rdi)
        movw    %cx, -2(%rdi, %rdx)
        ret

        ALIGN (4)
L(16bytesormore):
        movd    %rcx, %xmm0
        pshufd  $0, %xmm0, %xmm0        // pattern in all 16 bytes of %xmm0
        movdqu  %xmm0, (%rdi)
        movdqu  %xmm0, -16(%rdi, %rdx)
        cmpq    $32, %rdx
        jbe     L(32bytesless)
        movdqu  %xmm0, 16(%rdi)
        movdqu  %xmm0, -32(%rdi, %rdx)
        cmpq    $64, %rdx
        jbe     L(64bytesless)
        movdqu  %xmm0, 32(%rdi)
        movdqu  %xmm0, 48(%rdi)
        movdqu  %xmm0, -64(%rdi, %rdx)
        movdqu  %xmm0, -48(%rdi, %rdx)
        cmpq    $128, %rdx
        jbe     L(128bytesless)
        vpbroadcastb %xmm0, %ymm0       // from here on upper YMM state is dirty
        vmovdqu %ymm0, 64(%rdi)
        vmovdqu %ymm0, 96(%rdi)
        vmovdqu %ymm0, -128(%rdi, %rdx)
        vmovdqu %ymm0, -96(%rdi, %rdx)
        cmpq    $256, %rdx
        ja      L(256bytesmore)
        /* 129..256 bytes: only this fall-through path has used %ymm0, so
           the vzeroupper sits above the labels shared with smaller sizes. */
        vzeroupper
L(32bytesless):
L(64bytesless):
L(128bytesless):
        ret

        ALIGN (4)
L(256bytesmore):
        leaq    128(%rdi), %rcx
        andq    $-128, %rcx             // %rcx = first 128-aligned address above dst
        movq    %rdx, %r8               // %r8  = n, kept for the cache-size test
        addq    %rdi, %rdx
        andq    $-128, %rdx             // %rdx = (dst + n) rounded down to 128
        cmpq    %rcx, %rdx
        je      L(return_vzeroupper)    // nothing left between head and tail

#ifdef SHARED_CACHE_SIZE
        cmp     $SHARED_CACHE_SIZE, %r8
#else
        cmp     __x86_64_shared_cache_size(%rip), %r8
#endif
        ja      L(256bytesmore_nt)

        ALIGN (4)
L(256bytesmore_normal):
        vmovdqa %ymm0, (%rcx)
        vmovdqa %ymm0, 32(%rcx)
        vmovdqa %ymm0, 64(%rcx)
        vmovdqa %ymm0, 96(%rcx)
        addq    $128, %rcx
        cmpq    %rcx, %rdx
        jne     L(256bytesmore_normal)
L(return_vzeroupper):
        vzeroupper                      // clear dirty upper YMM state before returning
        ret

L(256bytesmore_nt):
        /* The loop below uses legacy SSE encoded stores.  The upper half of
           %ymm0 is no longer needed; vzeroupper leaves the low 128 bits
           (the fill pattern in %xmm0) intact. */
        vzeroupper
        ALIGN (4)
L(256bytesmore_nt_loop):
        movntdq %xmm0, (%rcx)
        movntdq %xmm0, 16(%rcx)
        movntdq %xmm0, 32(%rcx)
        movntdq %xmm0, 48(%rcx)
        movntdq %xmm0, 64(%rcx)
        movntdq %xmm0, 80(%rcx)
        movntdq %xmm0, 96(%rcx)
        movntdq %xmm0, 112(%rcx)
        leaq    128(%rcx), %rcx
        cmpq    %rcx, %rdx
        jne     L(256bytesmore_nt_loop)
        sfence                          // order the non-temporal stores before returning
        ret

END(memset_avx2)