xref: /freebsd/contrib/bionic-x86_64-string/sse2-memmove-slm.S (revision 63f537551380d2dab29fa402ad1269feae17e594)
/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

/* Name under which the routine is assembled.  A wrapper may predefine
   MEMMOVE to build the same code under a different symbol.  */
#ifndef MEMMOVE
# define MEMMOVE		memmove
#endif

/* Assembler-local label: the .L prefix marks it as non-exported.  */
#ifndef L
# define L(label)	.L##label
#endif

/* Thin wrappers around the assembler's call-frame-information directives.
   Each one is only defined here when the including environment has not
   already supplied its own version.  */
#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

/* ENTRY (name): open a global function symbol.  Marks NAME as a function,
   exports it, aligns it to 16 bytes, emits the label and starts a CFI
   region.  Must be paired with END (name).  */
#ifndef ENTRY
# define ENTRY(name)		\
	.type name,  @function;		\
	.globl name;		\
	.p2align 4;		\
name:		\
	cfi_startproc
#endif

/* ALIAS_SYMBOL (alias, original): export ALIAS as a second global name
   equated to ORIGINAL.  */
#ifndef ALIAS_SYMBOL
# define ALIAS_SYMBOL(alias, original) \
	.globl alias; \
	.equ alias, original
#endif

/* END (name): close the CFI region opened by ENTRY and record the size of
   NAME as the distance from its label to this point.  */
#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

/* Unwind bookkeeping for a 64-bit push: the CFA moves by one 8-byte stack
   slot and REG is recorded as saved at the new top of stack.  */
#define CFI_PUSH(REG)		\
	cfi_adjust_cfa_offset (8);		\
	cfi_rel_offset (REG, 0)

/* Unwind bookkeeping for the matching 64-bit pop.  */
#define CFI_POP(REG)		\
	cfi_adjust_cfa_offset (-8);		\
	cfi_restore (REG)

/* Push/pop a register and keep the call-frame information in step, so the
   CFA stays correct between ENTRANCE and RETURN.  */
#define PUSH(REG)	push REG; CFI_PUSH (REG)
#define POP(REG)	pop REG; CFI_POP (REG)

/* Prologue: %rbx is the only callee-saved register the routine uses.  */
#define ENTRANCE	PUSH (%rbx);
/* Epilogue for use as the last instructions of the function.  */
#define RETURN_END	POP (%rbx); ret
/* Epilogue for use in the middle of the function: after the ret, restore
   the "%rbx is pushed" unwind state for the code that follows it.  */
#define RETURN		RETURN_END; CFI_PUSH (%rbx)

/*
 * MEMMOVE (dst, src, len) -- copy len bytes from src to dst.  The two
 * regions may overlap: the copy direction is chosen from the relative
 * position of dst and src, and every short path loads all of its data
 * before storing any of it.
 *
 * In (System V AMD64 argument order, as consumed below):
 *	%rdi = dst, %rsi = src, %rdx = len
 * Out:
 *	%rax = dst (copied from %rdi on entry)
 * Scratch:
 *	%rcx, %rdx, %rsi, %r8, %r9, %xmm0-%xmm7, flags.  %rbx is used as
 *	well, but is saved by ENTRANCE and restored by RETURN.
 *
 * Lengths up to 128 bytes are handled with straight-line code.  Longer
 * copies first copy an unaligned 64-byte edge plus one 64-byte block at a
 * 64-byte-aligned destination address, then run a 64-bytes-per-iteration
 * loop with aligned stores (non-temporal stores when len is at least
 * SHARED_CACHE_SIZE_HALF), and finally copy the remaining bytes.
 */
	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
	mov	%rdi, %rax

/* Check whether we should copy backward or forward.  Addresses are
	unsigned quantities, so use an unsigned comparison.  */
	cmp	%rsi, %rdi
	je	L(mm_return)
	ja	L(mm_len_0_or_more_backward)

/* Now do checks for lengths. We do [0..16], (16..32], (32..64], (64..128]
	separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_forward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_forward)

/* Copy (16..32] bytes and return: first and last 16 bytes, which may
	overlap each other.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_forward)

/* Copy (32..64] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_forward)

/* Copy (64..128] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
/* Aligning the address of destination.  */
/*  save first unaligned 64 bytes */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3

	lea	64(%rdi), %r8
	and	$-64, %r8  /* r8 now aligned to next 64 byte boundary */
	sub	%rdi, %rsi /* rsi = src - dst = diff */

/* Load the 64 source bytes that correspond to the aligned block at %r8
	before anything is stored.  */
	movdqu	(%r8, %rsi), %xmm4
	movdqu	16(%r8, %rsi), %xmm5
	movdqu	32(%r8, %rsi), %xmm6
	movdqu	48(%r8, %rsi), %xmm7

	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqa	%xmm4, (%r8)
	movaps	%xmm5, 16(%r8)
	movaps	%xmm6, 32(%r8)
	movaps	%xmm7, 48(%r8)
	add	$64, %r8

/* %rbx = end of the destination rounded down to 64 bytes; the aligned
	loops run while %r8 is below it.  */
	lea	(%rdi, %rdx), %rbx
	and	$-64, %rbx
	cmp	%r8, %rbx
	jbe	L(mm_copy_remaining_forward)

	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
	jae	L(mm_large_page_loop_forward)

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%r8, %rsi)

	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqa	%xmm0, (%r8)
	movaps	%xmm1, 16(%r8)
	movaps	%xmm2, 32(%r8)
	movaps	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	add	%rdi, %rdx
	sub	%r8, %rdx
/* Everything below %r8 in the destination has been copied.
	%rdx now holds the number of bytes left to copy (dst + len - %r8).
	Form the matching source address in %r9 (%rsi is still src - dst).  */
	lea	(%r8, %rsi), %r9

/* Dispatch on the remaining length; %r9 = source, %r8 = destination.  */
L(mm_remaining_0_64_bytes_forward):
	cmp	$32, %rdx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %rdx
	ja	L(mm_remaining_17_32_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)

	cmpb	$8, %dl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %dl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %dl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
/* 1 or 2 bytes left: copy the last and the first byte.  */
	movzbl	-1(%r9,%rdx), %esi
	movzbl	(%r9), %ebx
	movb	%sil, -1(%r8,%rdx)
	movb	%bl, (%r8)
	jmp	L(mm_return)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	16(%r9), %xmm1
	movdqu	-32(%r9, %rdx), %xmm2
	movdqu	-16(%r9, %rdx), %xmm3
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, 16(%r8)
	movdqu	%xmm2, -32(%r8, %rdx)
	movdqu	%xmm3, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	-16(%r9, %rdx), %xmm1
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_5_8_bytes_forward):
	movl	(%r9), %esi
	movl	-4(%r9,%rdx), %ebx
	movl	%esi, (%r8)
	movl	%ebx, -4(%r8,%rdx)
	jmp	L(mm_return)

L(mm_remaining_9_16_bytes_forward):
	mov	(%r9), %rsi
	mov	-8(%r9, %rdx), %rbx
	mov	%rsi, (%r8)
	mov	%rbx, -8(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%r9,%rdx), %esi
	movzwl	(%r9), %ebx
	movw	%si, -2(%r8,%rdx)
	movw	%bx, (%r8)
	jmp	L(mm_return)

/* Forward copy of [0..16] bytes, dispatched on the bits of the length:
	bit 3 or 4 set -> 8..16, bit 2 set -> 4..7, bit 1 set -> 2..3,
	otherwise 0 or 1.  */
L(mm_len_0_16_bytes_forward):
	testb	$24, %dl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %dl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
/* Exactly one byte.  */
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %esi
	movb	%bl, -1(%rdi,%rdx)
	movb	%sil, (%rdi)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %esi
	movw	%bx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %esi
	movl	%ebx, (%rdi)
	movl	%esi, -4(%rdi,%rdx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	mov	(%rsi), %rbx
	mov	-8(%rsi, %rdx), %rsi
	mov	%rbx, (%rdi)
	mov	%rsi, -8(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_recalc_len):
/* Compute in %rdx how many bytes are left to copy after
	the backward main loop stops (%rbx - dst), then fall through to the
	backward length dispatch to copy them.  */
	mov	%rbx, %rdx
	sub	%rdi, %rdx
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths. We do [0..16], (16..32], (32..64], (64..128]
	separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_backward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_backward)

/* Copy (16..32] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_backward)

/* Copy (32..64] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_backward)

/* Copy (64..128] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
/* Aligning the address of destination. We need to save the last
	64 bytes of the source in order not to overwrite them.  */
	movdqu	-16(%rsi, %rdx), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	movdqu	-48(%rsi, %rdx), %xmm2
	movdqu	-64(%rsi, %rdx), %xmm3

	lea	(%rdi, %rdx), %r9
	and	$-64, %r9 /* r9 = aligned dst */

	mov	%rsi, %r8
	sub	%rdi, %r8 /* r8 = src - dst, diff */

/* Load the 64 source bytes that correspond to the aligned block ending
	at %r9 before anything is stored.  */
	movdqu	-16(%r9, %r8), %xmm4
	movdqu	-32(%r9, %r8), %xmm5
	movdqu	-48(%r9, %r8), %xmm6
	movdqu	-64(%r9, %r8), %xmm7

	movdqu	%xmm0, -16(%rdi, %rdx)
	movdqu	%xmm1, -32(%rdi, %rdx)
	movdqu	%xmm2, -48(%rdi, %rdx)
	movdqu	%xmm3, -64(%rdi, %rdx)
	movdqa	%xmm4, -16(%r9)
	movaps	%xmm5, -32(%r9)
	movaps	%xmm6, -48(%r9)
	movaps	%xmm7, -64(%r9)
	lea	-64(%r9), %r9

/* %rbx = first 64-byte boundary above dst; the aligned loops run while
	%r9 is above it.  */
	lea	64(%rdi), %rbx
	and	$-64, %rbx

	cmp	%r9, %rbx
	jae	L(mm_recalc_len)

	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
	jae	L(mm_large_page_loop_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%r9, %r8)

	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movdqa	%xmm0, -64(%r9)
	movaps	%xmm1, -48(%r9)
	movaps	%xmm2, -32(%r9)
	movaps	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_main_loop_backward)
	jmp	L(mm_recalc_len)

/* Copy [0..16] and return.  Dispatch on the bits of the length, as in
	the forward case.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %dl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %dl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
/* Exactly one byte.  */
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %ecx
	movb	%bl, -1(%rdi,%rdx)
	movb	%cl, (%rdi)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %ecx
	movw	%bx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	jmp	L(mm_return)

/* 8..16 bytes: copy the last 8 bytes as two 4-byte words, drop them from
	the length and dispatch again on what is left (0..8 bytes).  */
L(mm_len_9_16_bytes_backward):
	movl	-4(%rsi,%rdx), %ebx
	movl	-8(%rsi,%rdx), %ecx
	movl	%ebx, -4(%rdi,%rdx)
	movl	%ecx, -8(%rdi,%rdx)
	sub	$8, %rdx
	jmp	L(mm_len_0_16_bytes_backward)

/* 4..7 bytes; falls through into the common return.  */
L(mm_len_5_8_bytes_backward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %ecx
	movl	%ebx, (%rdi)
	movl	%ecx, -4(%rdi,%rdx)

/* Common exit: %rax already holds dst.  */
L(mm_return):
	RETURN

/* Big length copy forward part: same as the main forward loop, but with
	non-temporal stores, followed by a store fence.  */

	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movntdq	%xmm0, (%r8)
	movntdq	%xmm1, 16(%r8)
	movntdq	%xmm2, 32(%r8)
	movntdq	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_large_page_loop_forward)
	sfence
	jmp	L(mm_copy_remaining_forward)

/* Big length copy backward part: same as the main backward loop, but
	with non-temporal stores, followed by a store fence.  */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movntdq	%xmm0, -64(%r9)
	movntdq	%xmm1, -48(%r9)
	movntdq	%xmm2, -32(%r9)
	movntdq	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_large_page_loop_backward)
	sfence
	jmp	L(mm_recalc_len)

END (MEMMOVE)

/* Also provide memcpy as an alias for the same entry point.  */
ALIAS_SYMBOL(memcpy, MEMMOVE)
519