/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, Intel Corporation
 * All rights reserved.
 */

/*
 *	strlen - calculate the length of string
 */

#include "SYS.h"
#include "proc64_id.h"

/* Every file-local label is spelled .strlen<name>. */
#define	LABEL(s) .strlen##s

	/*
	 * strlen(const char *s)
	 *
	 * The string is examined 16 bytes at a time with SSE instructions
	 * (pcmpeqb / pmovmskb on an %xmm register) until a block containing
	 * a null byte is found; the position of the first null inside that
	 * block is then resolved by one of two exit tails.
	 *
	 * In:	%rdi = s
	 * Out:	%rax = number of bytes preceding the first null byte
	 *
	 * Register roles:
	 *	%rdi	s; negated once a null has been located
	 *	%rsi	scan cursor, already advanced 16 bytes past the block
	 *		that produced the mask when the exit code is reached
	 *	%rcx	s & 15 (misalignment); reused for the bit index
	 *	%edx	byte mask from pmovmskb, one bit per byte compared
	 *	%xmm0	16 zero bytes to compare against / compare result
	 */
	ENTRY(strlen)			/* (const char *s) */
	movq	%rdi, %rsi		/* %rsi = cursor, %rdi keeps s */
	movq	%rdi, %rcx
	pxor	%xmm0, %xmm0		/* comparand: 16 null bytes */
	andq	$15, %rcx		/* %rcx = s & 15 */
	jz	LABEL(scan16)		/* s is already 16-byte aligned */

	/*
	 * Unaligned start.  The first compare is done on the aligned
	 * 16-byte block that contains s, so it also covers the %rcx bytes
	 * in front of the string.  Shifting the mask right by %cl drops the
	 * bits belonging to those leading bytes, leaving bit 0 == s[0].
	 * NOTE(review): this relies on an aligned 16-byte load being safe
	 * even though it reads bytes outside the string - confirm.
	 */
LABEL(head_unaligned):
	andq	$-16, %rsi		/* round cursor down to 16 */

	pcmpeqb	(%rsi), %xmm0
	leaq	16(%rdi), %rsi		/* makes the exit code yield 0 + index */
	pmovmskb %xmm0, %edx

	shrl	%cl, %edx		/* discard bytes in front of s */
	testl	%edx, %edx
	jnz	LABEL(found)

	subq	%rcx, %rsi		/* s + 16 - (s & 15): next boundary */
	pxor	%xmm0, %xmm0		/* a null in front of s may have set bytes */

	/*
	 * Aligned scan, unrolled four times.  When a compare finds no null
	 * the result in %xmm0 is all zero again, so the register can be
	 * used as the comparand for the next block without being reset.
	 */
	.p2align 4
LABEL(scan16):				/* %rsi is 16-byte aligned here */
	pcmpeqb	(%rsi), %xmm0		/* 0xff in each byte that is null */
	pmovmskb %xmm0, %edx		/* gather the byte results into %edx */

	addq	$16, %rsi		/* step past the block just tested */
	testl	%edx, %edx		/* zero mask: no null in the block */
	jnz	LABEL(found)

	pcmpeqb	(%rsi), %xmm0
	pmovmskb %xmm0, %edx
	addq	$16, %rsi
	testl	%edx, %edx
	jnz	LABEL(found)

	pcmpeqb	(%rsi), %xmm0
	pmovmskb %xmm0, %edx
	addq	$16, %rsi
	testl	%edx, %edx
	jnz	LABEL(found)

	pcmpeqb	(%rsi), %xmm0
	pmovmskb %xmm0, %edx
	addq	$16, %rsi
	testl	%edx, %edx
	jz	LABEL(scan16)		/* nothing in 64 bytes, go round again */

	/*
	 * A null was found.  %edx is non-zero and its lowest set bit is the
	 * index of the null within the block; %rsi - 16 - s is the offset
	 * of that block from the start of the string.
	 */
	.p2align 4
LABEL(found):
	negq	%rdi			/* %rdi = -s */
	/*
	 * Check to see if BSF is fast on this processor.  If the USE_BSF
	 * bit is clear in .memops_method, take the exit tail that finds the
	 * first set bit with a sequence of bit tests instead.
	 * (USE_BSF and .memops_method are not defined in this file;
	 * presumably they come from proc64_id.h - verify.)
	 */
	testl	$USE_BSF, .memops_method(%rip)
	jz	LABEL(found_nobsf)

	leaq	-16(%rsi, %rdi), %rax	/* offset of the matching block */
	bsfl	%edx, %ecx		/* index of the lowest set bit */
	leaq	(%rax, %rcx), %rax	/* block offset + index in block */
	ret

	/*
	 * Exit tail that does not use the bsf instruction.  The low mask
	 * byte (%dl) covers bytes 0-7 of the block and the high mask byte
	 * (%dh) covers bytes 8-15.  Bits are tested from least significant
	 * upwards; when bits 0-6 of the byte being tested are all clear the
	 * match has to be bit 7.
	 */
	.p2align 4
LABEL(found_nobsf):
	leaq	-16(%rsi, %rdi), %rax	/* offset of the matching block */
	testb	%dl, %dl
	jz	LABEL(found_hi)		/* no null in bytes 0-7 */
	testb	$0x01, %dl
	jnz	LABEL(ret_plus0)

	testb	$0x02, %dl
	jnz	LABEL(ret_plus1)

	.p2align 4			/* layout padding only */
	testb	$0x04, %dl
	jnz	LABEL(ret_plus2)

	testb	$0x08, %dl
	jnz	LABEL(ret_plus3)

	testb	$0x10, %dl
	jnz	LABEL(ret_plus4)

	testb	$0x20, %dl
	jnz	LABEL(ret_plus5)

	testb	$0x40, %dl
	jnz	LABEL(ret_plus6)
	addq	$7, %rax		/* only bit 7 is left */
	ret

	.p2align 4
LABEL(found_hi):			/* null is in bytes 8-15 */
	addq	$8, %rax
	testb	$0x01, %dh
	jnz	LABEL(ret_plus0)

	testb	$0x02, %dh
	jnz	LABEL(ret_plus1)

	testb	$0x04, %dh
	jnz	LABEL(ret_plus2)

	testb	$0x08, %dh
	jnz	LABEL(ret_plus3)

	testb	$0x10, %dh
	jnz	LABEL(ret_plus4)

	testb	$0x20, %dh
	jnz	LABEL(ret_plus5)

	testb	$0x40, %dh
	jnz	LABEL(ret_plus6)
	addq	$7, %rax		/* only bit 7 is left */
	ret

	/*
	 * Shared return stubs: %rax already holds the offset of the mask
	 * byte that matched, each stub adds the bit index within it.
	 */
	.p2align 4
LABEL(ret_plus0):
	/*
	 * The xor has no effect on the result in %rax.
	 * NOTE(review): presumably kept so that the ret is not itself the
	 * target of the branch - confirm before removing.
	 */
	xorl	%ecx, %ecx
	ret

	.p2align 4
LABEL(ret_plus1):
	addq	$1, %rax
	ret

	.p2align 4
LABEL(ret_plus2):
	addq	$2, %rax
	ret

	.p2align 4
LABEL(ret_plus3):
	addq	$3, %rax
	ret

	.p2align 4
LABEL(ret_plus4):
	addq	$4, %rax
	ret

	.p2align 4
LABEL(ret_plus5):
	addq	$5, %rax
	ret

	.p2align 4
LABEL(ret_plus6):
	addq	$6, %rax
	ret
	SET_SIZE(strlen)