/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2003 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */


.ident	"%Z%%M%	%I%	%E% SMI"

	.file	"%M%"

/*
 * strncpy(s1, s2, n)
 *
 * Copy string s2 to s1, truncating or null-padding so that exactly n bytes
 * are always written. Return s1.
 *
 * Fast assembler language version of the following C-program for strncpy
 * which represents the `standard' for the C-library.
 *
 *	char *
 *	strncpy(char *s1, const char *s2, size_t n)
 *	{
 *		char *os1 = s1;
 *	
 *		n++;				
 *		while ((--n != 0) &&  ((*s1++ = *s2++) != '\0'))
 *			;
 *		if (n != 0)
 *			while (--n != 0)
 *				*s1++ = '\0';
 *		return (os1);
 *	}
 */

#include <sys/asm_linkage.h>
#include "synonyms.h"

	! strncpy works similarly to strcpy, except that exactly n bytes are
	! written to s1. If a null character is reached in s2 while more
	! bytes remain to be written, strncpy fills the rest of the
	! destination with null bytes.
	!
	! Short requests (n < 7) are handled by a simple byte loop. Otherwise
	! the implementation first copies single bytes until the src pointer
	! is word aligned. Then the string is copied one source word at a
	! time, with the stores chosen by destination alignment (byte,
	! half-word or word).

	ENTRY(strncpy)

	! On entry: %o0 = s1 (dst), %o1 = s2 (src), %o2 = n.
	! Returns the original dst in %o0.
	!
	! Register usage (names as seen after the save; the short path uses
	! the matching %o registers and never creates a new window):
	!   %i0  original dst, handed back by the final restore
	!   %i1  most recently loaded source byte or word
	!   %i2  dst + n (biased by -4 inside the word copy loops)
	!   %i3  src + n
	!   %i4  -(bytes still to write); counts up and reaches 0 when
	!        done, and doubles as the index off %i2 / %i3
	!   %i5  source alignment counter, later magic2 (0x80808080)
	!   %l0  scratch, and the running dst pointer while padding
	!   %l1  magic1 (0x01010101), later the word fill counter
	!   %g1  scratch
	!
	! Many branches below rely on their delay slot and on condition
	! codes set several instructions earlier, so instruction order
	! matters throughout.

	.align 32
	subcc	%g0, %o2, %o4		! %o4 = -n, Z set if n == 0
	bz	.doneshort		! n == 0: nothing to write, return dst
	cmp	%o2, 7			! (delay slot) n < 7 ?
	add	%o1, %o2, %o3		! %o3 = src + n
	blu	.shortcpy		! n < 7 (unsigned), use byte-wise copy
	add	%o0, %o2, %o2		! (delay slot) %o2 = dst + n
	andcc	%o1, 3, %o5		! %o5 = src & 3, Z if src word aligned
	bz	.wordaligned		! yup, skip the alignment loop
	save	%sp, -0x40, %sp		! (delay slot) create new register window
	sub	%i5, 4, %i5		! %i5 = -(bytes until src is aligned)
	nop				! align loop on 16-byte boundary
	nop				! align loop on 16-byte boundary

	! Copy single bytes until src sits on a word boundary.
.alignsrc:
	ldub	[%i3 + %i4], %i1	! %i1 = next source byte
	stb	%i1, [%i2 + %i4]	! store it to dst
	inccc	%i4			! one byte fewer to go, Z when count is 0
	bz	.done			! count reached 0, done
	tst     %i1			! (delay slot) was that the null byte ?
	bz,a	.bytepad		! yes, at least one byte to pad here
	add 	%i2, %i4, %l0		! (annulled delay slot) %l0 = pad pointer
	inccc	%i5			! src aligned now?
	bnz	.alignsrc		! no, copy another byte
	! The delay slot of the branch above is the first instruction of
	! .wordaligned; it only writes %l0, which .alignsrc does not read.
	.empty

.wordaligned:	
	add	%i2, %i4, %l0		! %l0 = current dst pointer
	sethi	%hi(0x01010101), %l1	! Alan Mycroft's magic1
	sub	%i2, 4, %i2		! adjust for dest pre-incr in cpy loops
	or	%l1, %lo(0x01010101),%l1!  finish loading magic1
	andcc	%l0, 3, %g1		! %g1 = dst & 3, Z if dst word aligned
	bnz	.dstnotaligned		! nope
	sll	%l1, 7, %i5		! (delay slot) magic2 = 0x80808080
	
	! src and dst are both word aligned: copy one word per iteration.
.storeword:
	lduw	[%i3 + %i4], %i1	! next 4 source bytes
	addcc	%i4, 4, %i4		! advance index/count by 4
	bcs	.lastword		! carry: 4 or fewer bytes were left
	andn	%i5, %i1, %g1		! (delay slot) ~word & 0x80808080
	sub	%i1, %l1, %l0		! word - 0x01010101
	andcc	%l0, %g1, %g0		! ((word - 0x01010101) & ~word & 0x80808080)
	bz,a	.storeword		! no zero byte if magic expression == 0
	stw	%i1, [%i2 + %i4]	! (annulled delay slot) store word to dst

	! n has not expired, but the source string ends inside this word.
	! Push out its four bytes, forcing everything after the first null
	! byte to zero, and then start padding with null bytes.
	
.zerobyte:
	add	%i2, %i4, %l0		! %l0 = dst address for this word
	srl	%i1, 24, %g1		! first byte
	stb	%g1, [%l0]		! store it
	sub	%g1, 1, %g1		! byte == 0 ? -1 : byte - 1
	sra	%g1, 31, %g1		! byte == 0 ? -1 : 0
	andn	%i1, %g1, %i1		! if byte == 0, zero the rest of the word
	srl	%i1, 16, %g1		! second byte
	stb	%g1, [%l0 + 1]		! store it
	and	%g1, 0xff, %g1		! isolate byte
	sub	%g1, 1, %g1		! byte == 0 ? -1 : byte - 1
	sra	%g1, 31, %g1		! byte == 0 ? -1 : 0
	andn	%i1, %g1, %i1		! if byte == 0, zero the rest of the word
	srl	%i1, 8, %g1		! third byte
	stb	%g1, [%l0 + 2]		! store it
	and	%g1, 0xff, %g1		! isolate byte
	sub	%g1, 1, %g1		! byte == 0 ? -1 : byte - 1
	sra	%g1, 31, %g1		! byte == 0 ? -1 : 0
	andn	%i1, %g1, %i1		! if byte == 0, zero the rest of the word
	stb	%i1, [%l0 + 3]		! store fourth byte
	addcc	%i4, 8, %g0		! carry if 8 or fewer pad bytes remain
	bcs	.bytepad		! yes, do simple byte wise fill
	add	%l0, 4, %l0		! (delay slot) dst += 4
	andcc	%l0, 3, %l1		! dst offset relative to word boundary
	bz	.fillaligned		! dst already word aligned

	! The sub below is the delay slot of the branch above; when the
	! branch is taken its result is overwritten in .fillaligned.
	! More than 8 bytes remain to be zeroed here: otherwise we would
	! have left through .bytepad above.

	sub	%l1, 4, %l1		! %l1 = -(bytes to align dst to word boundary)
.makealigned:
	stb	%g0, [%l0]		! dst[] = 0
	addcc	%i4, 1, %i4		! one byte fewer to go
	bz	.done			! count reached 0, we are done
	addcc	%l1, 1, %l1		! (delay slot) any more bytes needed to align ?
	bnz	.makealigned		! yup, pad another byte
	add	%l0, 1, %l0		! (delay slot) dst++
	nop				! pad to align copy loop below

	! here we know that there are at least another 4 bytes to pad, since
	! we don't get here unless there were > 8 bytes to pad to begin
	! with, and we have padded at most 3 bytes during dst aligning

.fillaligned:
	add	%i4, 3, %i2		! bias the count so the masks below see whole words
	and	%i2, -4, %l1		! %l1 = -4 * (whole words left to fill)
	and	%i2, 4, %i2		! word count odd ? 4 : 0
	stw	%g0, [%l0]		! store first word
	addcc	%l1, %i2, %l1		! drop the odd word; Z if word count == 1
	add	%i4, %i2, %i4		! if word count odd, 4 fewer bytes left
	bz	.bytepad		! if word count == 1, only pad bytes left
	add	%l0, %i2, %l0		! (delay slot) bump dst if word count odd
		
	! Zero two words per iteration. %l1 counts up to 0 in steps of 8.
.fillword:	
	addcc	%l1, 8, %l1		! one word pair done, carry when %l1 hits 0
	stw	%g0, [%l0]		! dst[0..3] = 0
	stw	%g0, [%l0 + 4]		! dst[4..7] = 0
	add	%l0, 8, %l0		! dst += 8
	bcc	.fillword		! no carry yet: more word pairs to fill
	addcc	%i4, 8, %i4		! (delay slot, every pass) 8 fewer bytes left
	bz	.done			! count reached 0, we are done
	! The delay slot of the branch above is the first instruction of
	! .bytepad; it only writes %i2, which .done does not use.
	.empty

	! Zero the remaining bytes, two per iteration after an optional
	! odd first byte. Entered with at least one byte left to pad.
.bytepad:
	and	%i4, 1, %i2		! byte count odd ? 1 : 0
	stb	%g0, [%l0]		! store first byte
	addcc	%i4, %i2, %i4		! consume the odd byte; Z if it was the last
	bz	.done			! yup, we are done
	add	%l0, %i2, %l0		! (delay slot) bump pointer if odd

.fillbyte:
	addcc	%i4, 2, %i4		! two bytes fewer to go
	stb	%g0, [%l0]		! dst[0] = 0
	stb	%g0, [%l0 + 1]		! dst[1] = 0
	bnz	.fillbyte		! fill until count == 0
	add	%l0, 2, %l0		! (delay slot) dst += 2

.done:
	ret				! done
	restore	%i0, %g0, %o0		! (delay slot) restore reg window, return dst

	! This is the last word: n expires inside it. It may contain null
	! bytes. Store byte by byte until the count reaches 0; once a null
	! byte is seen, the following bytes are forced to zero.

.lastword:
	sub	%i4, 4, %i4		! undo counter pre-increment
	add	%i2, 4, %i2		! undo the -4 bias so %i2 + %i4 is dst again
	
	srl	%i1, 24, %g1		! first byte
	stb	%g1, [%i2 + %i4]	! store it
	inccc	%i4			! one byte fewer to go
	bz	.done			! if count == 0, we're done
	sub	%g1, 1, %g1		! (delay slot) byte == 0 ? -1 : byte - 1
	sra	%g1, 31, %g1		! byte == 0 ? -1 : 0
	andn	%i1, %g1, %i1		! if byte == 0, start padding with null
	srl	%i1, 16, %g1		! second byte
	stb	%g1, [%i2 + %i4]	! store it
	inccc	%i4			! one byte fewer to go
	bz	.done			! if count == 0, we're done
	and	%g1, 0xff, %g1		! (delay slot) isolate byte
	sub	%g1, 1, %g1		! byte == 0 ? -1 : byte - 1
	sra	%g1, 31, %g1		! byte == 0 ? -1 : 0
	andn	%i1, %g1, %i1		! if byte == 0, start padding with null
	srl	%i1, 8, %g1		! third byte
	stb	%g1, [%i2 + %i4]	! store it
	inccc	%i4			! one byte fewer to go
	bz	.done			! if count == 0, we're done
	and	%g1, 0xff, %g1		! (delay slot) isolate byte
	sub	%g1, 1, %g1		! byte == 0 ? -1 : byte - 1
	sra	%g1, 31, %g1		! byte == 0 ? -1 : 0
	andn	%i1, %g1, %i1		! if byte == 0, start padding with null
	ba	.done			! this store is the final byte, we are done
	stb	%i1, [%i2 + %i4]	! (delay slot) store fourth byte

	! dst is not word aligned; %g1 = dst & 3 (1, 2 or 3).
.dstnotaligned:
	cmp	%g1, 2			! dst half word aligned?
	be	.storehalfword2		! yup, store half word at a time
	! The delay slot of the branch above is the lduw at .storebyte, so
	! the first source word is already in %i1 at .storehalfword2.
	.empty
	! dst is at an odd address: store byte, half-word, byte.
.storebyte:
	lduw	[%i3 + %i4], %i1	! x = next 4 source bytes
	addcc	%i4, 4, %i4		! advance index/count by 4
	bcs	.lastword		! carry: 4 or fewer bytes were left
	andn	%i5, %i1, %g1		! (delay slot) ~x & 0x80808080
	sub	%i1, %l1, %l0		! x - 0x01010101
	andcc	%l0, %g1, %g0		! ((x - 0x01010101) & ~x & 0x80808080)
	bnz	.zerobyte		! end of src found, may need to pad
	add	%i2, %i4, %l0		! (delay slot) dst (in pointer form)
	srl	%i1, 24, %g1		! %g1<7:0> = 1st byte
	stb	%g1, [%l0]		! store first byte; dst + 1 is half-word aligned
	srl	%i1, 8, %g1		! %g1<15:0> = bytes 2, 3
	sth	%g1, [%l0 + 1]		! store bytes 2, 3
	ba	.storebyte		! next word
	stb	%i1, [%l0 + 3]		! (delay slot) store fourth byte
	! The two nops below are never executed; presumably they pad the
	! following loop to an aligned address.
	nop
	nop

	! dst is half-word aligned: store two half-words per source word.
.storehalfword:
	lduw	[%i3 + %i4], %i1	! x = next 4 source bytes
.storehalfword2:
	addcc	%i4, 4, %i4		! advance index/count by 4
	bcs	.lastword		! carry: 4 or fewer bytes were left
	andn	%i5, %i1, %g1		! (delay slot) ~x & 0x80808080
	sub	%i1, %l1, %l0		! x - 0x01010101
	andcc	%l0, %g1, %g0		! ((x - 0x01010101) & ~x & 0x80808080)
	bnz	.zerobyte		! x has zero byte, handle end cases
	add	%i2, %i4, %l0		! (delay slot) dst (in pointer form)
	srl	%i1, 16, %g1		! %g1<15:0> = bytes 1, 2
	sth	%g1, [%l0]		! store bytes 1, 2
	ba	.storehalfword		! next word
	sth	%i1, [%l0 + 2]		! (delay slot) store bytes 3, 4
	
	! n < 7: plain byte loop in the caller's window (no save was done).
	! %o2 = dst + n, %o3 = src + n, %o4 = -(bytes still to write).
.shortcpy:
	ldub	[%o3 + %o4], %o5	! %o5 = next source byte
	stb	%o5, [%o2 + %o4]	! store it to dst
	inccc	%o4			! one byte fewer to go
	bz	.doneshort		! if count == 0, done
	tst	%o5			! (delay slot) was that the null byte ?
	bnz,a	.shortcpy		! nope, next byte
	nop				! empty delay slot
	
	! Source ended early: zero the remaining bytes one at a time.
.padbyte:
	stb	%g0, [%o2 + %o4]	! dst[] = 0
.padbyte2:
	addcc	%o4, 1, %o4		! one byte fewer to go
	bnz,a	.padbyte2		! if count != 0, next byte
	stb	%g0, [%o2 + %o4]	! (annulled delay slot) dst[] = 0
	nop				! align label below to 16-byte boundary

.doneshort:
	retl				! return from leaf, %o0 still holds dst
	nop				! empty delay slot
	SET_SIZE(strncpy)