xref: /illumos-gate/usr/src/lib/libc/sparcv9/crt/__align_cpy_8.S (revision 724733535c8d5346d1f18efab32f7a75789f721b)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27	.file	"__align_cpy_8.s"
28
29/* __align_cpy_8(s1, s2, n)
30 *
31 * Copy 8-byte aligned source to 8-byte aligned target in multiples of 8 bytes.
32 *
33 * Input:
34 *	o0	address of target
35 *	o1	address of source
36 *	o2	number of bytes to copy (must be a multiple of 8)
37 * Output:
38 *	o0	address of target
39 * Caller's registers that have been changed by this function:
40 *	o1-o5
41 *
42 * Note:
43 *	This helper routine will not be used by any 32-bit compilations. To do
44 *	so would break binary compatibility with previous versions of Solaris.
45 *
46 * Assumptions:
47 *	Source and target addresses are 8-byte aligned.
48 *	Bytes to be copied are non-overlapping or _exactly_ overlapping.
49 *	The number of bytes to be copied is a multiple of 8.
50 *	Call will _usually_ be made with a byte count of more than 4*8 and
51 *	less than a few hundred bytes.  Legal values are 0 to MAX_SIZE_T.
52 *
53 * Optimization attempt:
54 *	Reasonable speed for a generic v9.  Going for 32 bytes at a time
55 *	rather than 16 bytes at a time did not result in a time saving for
56 *	the number of bytes expected to be copied.  No timing runs using other
57 *	levels of optimization have been tried yet.
58 *
59 * Even when multiples of 16 bytes were used, the savings by going for 32 bytes
60 * at a time were about 2%.  Thus, __align_cpy_16 is a second entry point to
61 * the same code as __align_cpy_8.
62 *
63 * Register usage:
64 *	o1	source address (updated for each read)
65 *	o2	byte count remaining
66 *	o3	contents being copied
67 *	o4	more contents being copied
68 *	o5	target address
69 */
70
71#include <sys/asm_linkage.h>
72
/*
 * Copy %o2 bytes from [%o1] to [%o0], 8 bytes per load/store, and return
 * with %o0 untouched (no instruction below writes %o0).
 *
 * The two ENTRY() lines are adjacent with no instruction between them, so
 * both names refer to the same code.  ENTRY and SET_SIZE are macros from
 * <sys/asm_linkage.h>; their expansion is not visible in this file.
 *
 * Reading notes:
 *   - Every branch has a delay slot: the instruction on the line after a
 *     branch belongs to that branch.  With ",a" (annul) the delay-slot
 *     instruction executes only when the branch is taken.
 *   - ",pt" / ",pn" are static prediction hints (taken / not taken).
 *   - After the first subcc, %o2 holds (bytes remaining - 8), not the
 *     plain remaining count.
 *
 * NOTE(review): the count is tested with signed conditions (bpos, bg).
 * A count whose value minus 8 has bit 63 set therefore takes the same
 * path as a count of 0 and copies nothing.
 */
73	ENTRY(__align_cpy_8)
74	ENTRY(__align_cpy_16)
75	cmp	%o0, %o1		! Identical--do nothing.
76	be,pn	%xcc, .done		! target == source: return at once
77	subcc	%o2, 8, %o2		! (delay slot, always runs) o2 = count - 8
78	bz,pn	%xcc, .wrdbl2		! Only 8 bytes need to be copied.
79	mov	%o0, %o5		! (delay slot) o5 = working target; o0 is kept for the return value
80	bpos,a,pt %xcc, .wrdbl1		! Have at least 16 bytes to copy.
81	ldx	[%o1], %o3		! (annulled delay slot) preload first doubleword if branching
/*
 * Reached by the be above (target == source) or by falling through when
 * count - 8 is negative.
 */
82.done:
83	retl				! No bytes to copy.
84	nop				! (delay slot)
85
/*
 * Main loop.  On every entry:
 *	o3 = doubleword already loaded from [o1] by the delay slot of the
 *	     branch that came here
 *	o2 = (bytes remaining) - 8, and is greater than zero
 * Each pass copies 16 bytes.  The add/ldx/stx instructions do not set
 * the condition codes, so the flags from the subcc at the top are still
 * valid at the bg and bz at the bottom.
 */
86	.align	32
87.wrdbl1:				! Copy 16 bytes at a time.
88	subcc	%o2, 16, %o2		! o2 = (remaining after this pass) - 8
89	ldx	[%o1+8], %o4		! second doubleword of this chunk
90	add	%o1, 16, %o1		! advance source
91	stx	%o3, [%o5]		! store first doubleword
92	stx	%o4, [%o5+8]		! store second doubleword
93	add	%o5, 16, %o5		! advance target
94	bg,a,pt	%xcc, .wrdbl1		! Have at least 16 more bytes.
95	ldx	[%o1], %o3		! (annulled delay slot) preload next first doubleword
96
97	bz,a,pt	%xcc, .wrdbl3		! Have 8 bytes remaining to copy.
98	ldx	[%o1], %o3		! (annulled delay slot) load the final doubleword
99
100	retl				! o2 went negative: nothing left to copy
101	nop				! (delay slot)
102
/*
 * Tail.  .wrdbl2 is entered when the count was exactly 8 (nothing loaded
 * yet, o5 == o0).  .wrdbl3 is entered from the loop with the last
 * doubleword already in o3 and o5 pointing at the last 8 target bytes.
 */
103.wrdbl2:
104	ldx	[%o1], %o3		! Copy last 8 bytes.
105.wrdbl3:
106	stx	%o3, [%o5]		! store the final doubleword
107	retl
108	nop				! (delay slot)
109
110	SET_SIZE(__align_cpy_8)
111	SET_SIZE(__align_cpy_16)
112