/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, Intel Corporation
 * All rights reserved.
 */

/*
 * strlen - calculate the length of string
 */

#include "SYS.h"
#include "proc64_id.h"

/*
 * Builds the file-local label ".strlen<s>".  The empty comment between the
 * two parts is the pre-ANSI token pasting idiom: it only glues the tokens
 * together when the file is run through a traditional-mode preprocessor.
 */
#define LABEL(s) .strlen/**/s

	/*
	 * strlen(const char *s)
	 *
	 * This implementation uses SSE instructions to compare up to 16 bytes
	 * at a time looking for the end of string (null char).
	 *
	 * In:		%rdi = s
	 * Out:		%rax = number of bytes preceding the first null byte
	 * Modifies:	%rcx, %rdx, %rsi, %rdi, %xmm0, flags
	 *
	 * Register roles:
	 *	%rdi	original s; negated at exit to form (%rsi - s)
	 *	%rsi	scan pointer; on arrival at the exit label it is 16
	 *		bytes past the position that bit 0 of %edx refers to
	 *	%rcx	s & 15 (misalignment), later the index of the null
	 *	%edx	pmovmskb result; bit i set means byte i is null
	 *	%xmm0	all zero before every pcmpeqb
	 *
	 * Every load is from a 16-byte aligned address, so each one stays
	 * inside a single aligned 16-byte chunk.  Bytes in that chunk that
	 * lie before s or after the terminator may be read, but they are
	 * never counted.
	 */
	ENTRY(strlen)			/* (const char *s) */
	mov	%rdi, %rsi		/* keep original %rdi value */
	mov	%rsi, %rcx
	pxor	%xmm0, %xmm0		/* 16 null chars */
	and	$15, %rcx		/* %rcx = s & 15 */
	jz	LABEL(align16_loop)	/* string is 16 byte aligned */

	/*
	 * Unaligned case. Round down to 16-byte boundary before comparing
	 * 16 bytes for a null char. The code then compensates for any extra
	 * chars preceding the start of the string.
	 */
LABEL(unalign16):
	and	$0xfffffffffffffff0, %rsi

	pcmpeqb	(%rsi), %xmm0
	lea	16(%rdi), %rsi		/* exit expects %rsi = position + 16 */
	pmovmskb %xmm0, %edx

	shr	%cl, %edx	/* Compensate for bytes preceding the string */
	test	%edx, %edx
	jnz	LABEL(exit)		/* bit 0 of %edx now refers to s[0] */
	sub	%rcx, %rsi	/* no null, adjust to next 16-byte boundary */
	pxor	%xmm0, %xmm0	/* clear xmm0, may have been changed... */

	/*
	 * Main loop, unrolled four times.  A compare that finds no null
	 * byte leaves %xmm0 all zero, so it can be reused as the comparand
	 * of the next step without being cleared again.
	 */
	.p2align 4
LABEL(align16_loop):			/* 16 byte aligned */
	pcmpeqb	(%rsi), %xmm0		/* look for null bytes */
	pmovmskb %xmm0, %edx		/* move each byte mask of %xmm0 to edx */

	add	$16, %rsi		/* prepare to search next 16 bytes */
	test	%edx, %edx		/* if no null byte, %edx must be 0 */
	jnz	LABEL(exit)		/* found a null */

	pcmpeqb	(%rsi), %xmm0		/* second 16-byte chunk */
	pmovmskb %xmm0, %edx
	add	$16, %rsi
	test	%edx, %edx
	jnz	LABEL(exit)

	pcmpeqb	(%rsi), %xmm0		/* third 16-byte chunk */
	pmovmskb %xmm0, %edx
	add	$16, %rsi
	test	%edx, %edx
	jnz	LABEL(exit)

	pcmpeqb	(%rsi), %xmm0		/* fourth 16-byte chunk */
	pmovmskb %xmm0, %edx
	add	$16, %rsi
	test	%edx, %edx
	jz	LABEL(align16_loop)	/* else fall through to exit */

	/*
	 * A null byte was found.  %edx is non-zero and its lowest set bit is
	 * the index of the null relative to (%rsi - 16).
	 */
	.p2align 4
LABEL(exit):
	neg	%rdi			/* %rdi + %rsi is now %rsi - s */
	/*
	 * Check to see if BSF is fast on this processor. If not, use a
	 * different exit tail to find first bit set indicating null byte
	 * match.  USE_BSF and .memops_method are not defined in this file
	 * (presumably they come from proc64_id.h and its companion code).
	 */
	testl	$USE_BSF, .memops_method(%rip)
	jz	LABEL(AMD_exit)

	lea	-16(%rdi, %rsi), %rax	/* calculate exact offset */
	bsf	%edx, %ecx	/* Least significant 1 bit is index of null */
	lea	(%rax, %rcx),%rax
	ret

	/*
	 * This exit tail does not use the bsf instruction.  It tests the
	 * mask one bit at a time, lowest bit first, and adds the index of
	 * the first set bit to the base offset in %rax.
	 */
	.p2align 4
LABEL(AMD_exit):
	lea	-16(%rdi, %rsi), %rax	/* base offset, as in the bsf tail */
	test	%dl, %dl
	jz	LABEL(exit_high)	/* no null in bytes 0-7 */
	test	$0x01, %dl
	jnz	LABEL(exit_tail0)

	test	$0x02, %dl
	jnz	LABEL(exit_tail1)

	/*
	 * NOTE(review): this alignment directive sits in the middle of the
	 * fall-through path; it is kept exactly where the original has it.
	 */
	.p2align 4
	test	$0x04, %dl
	jnz	LABEL(exit_tail2)

	test	$0x08, %dl
	jnz	LABEL(exit_tail3)

	test	$0x10, %dl
	jnz	LABEL(exit_tail4)

	test	$0x20, %dl
	jnz	LABEL(exit_tail5)

	test	$0x40, %dl
	jnz	LABEL(exit_tail6)
	add	$7, %rax	/* %dl non-zero, bits 0-6 clear: bit 7 */
	ret

	/*
	 * %dl is zero and %edx is not, so the null is in bytes 8-15.  Bias
	 * the offset by 8 and reuse the same tails on %dh.
	 */
	.p2align 4
LABEL(exit_high):
	add	$8, %rax
	test	$0x01, %dh
	jnz	LABEL(exit_tail0)

	test	$0x02, %dh
	jnz	LABEL(exit_tail1)

	test	$0x04, %dh
	jnz	LABEL(exit_tail2)

	test	$0x08, %dh
	jnz	LABEL(exit_tail3)

	test	$0x10, %dh
	jnz	LABEL(exit_tail4)

	test	$0x20, %dh
	jnz	LABEL(exit_tail5)

	test	$0x40, %dh
	jnz	LABEL(exit_tail6)
	add	$7, %rax	/* only bit 7 of %dh can be set here */
	ret

	/*
	 * Exit tails: add the bit index to %rax and return.  For index 0
	 * %rax already holds the length, so there is nothing to add.  (The
	 * original cleared %ecx here; that had no effect on the result and
	 * has been dropped.)
	 */
	.p2align 4
LABEL(exit_tail0):
	ret

	.p2align 4
LABEL(exit_tail1):
	add	$1, %rax
	ret

	.p2align 4
LABEL(exit_tail2):
	add	$2, %rax
	ret

	.p2align 4
LABEL(exit_tail3):
	add	$3, %rax
	ret

	.p2align 4
LABEL(exit_tail4):
	add	$4, %rax
	ret

	.p2align 4
LABEL(exit_tail5):
	add	$5, %rax
	ret

	.p2align 4
LABEL(exit_tail6):
	add	$6, %rax
	ret
	SET_SIZE(strlen)