/*
 * This file is in public domain.
 * Written by Dmitry Chagin <dchagin@FreeBSD.org>
 */

#if defined(__FreeBSD__)
#include <machine/specialreg.h>
#else
#define	CPUID2_OSXSAVE			0x08000000
#define	CPUID2_AVX			0x10000000
#define	XFEATURE_ENABLED_X87		0x00000001
#define	XFEATURE_ENABLED_SSE		0x00000002
#define	XFEATURE_ENABLED_AVX		0x00000004
#define	XFEATURE_AVX					\
    (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE | XFEATURE_ENABLED_AVX)
#endif

	.text

	.globl xregs_banks_max
	.type xregs_banks_max, @function
/*
 * xregs_banks_max: report whether AVX state is usable.
 *
 * Takes no register arguments.
 * Returns in %eax:
 *   1  CPUID leaf 1 reports both OSXSAVE and AVX in %ecx, and XCR0 has
 *      the x87, SSE and AVX state bits all set;
 *   0  otherwise (only the SSE register bank can be relied upon).
 *
 * Clobbers %ecx, %edx and flags.  %rbx is overwritten by cpuid and is
 * therefore saved and restored around the probe.
 */
xregs_banks_max:
	pushq	%rbx			/* cpuid writes %ebx */

	/* Step 1: the CPU must advertise both AVX and OSXSAVE. */
	movl	$1, %eax
	cpuid
	andl	$(CPUID2_AVX|CPUID2_OSXSAVE), %ecx
	cmpl	$(CPUID2_AVX|CPUID2_OSXSAVE), %ecx
	jne	.Lsse_only

	/* Step 2: XCR0 must have x87, SSE and AVX state enabled. */
	xorl	%ecx, %ecx		/* %ecx = 0 selects XCR0 */
	xgetbv
	andl	$XFEATURE_AVX, %eax
	cmpl	$XFEATURE_AVX, %eax
	jne	.Lsse_only

	/* Both checks passed. */
	movl	$1, %eax
	popq	%rbx
	retq

.Lsse_only:
	xorl	%eax, %eax
	popq	%rbx
	retq

	.size xregs_banks_max, . - xregs_banks_max


	.globl cpu_to_xmm
	.type cpu_to_xmm, @function
/*
 * cpu_to_xmm: store the current contents of %xmm0-%xmm15 to memory.
 *
 * In:  %rdi = destination buffer; 16 * 16 = 256 bytes are written,
 *      register N landing at byte offset N * 16.
 *
 * Unaligned stores (movdqu) are used, so this routine does not itself
 * require the buffer to be 16-byte aligned.  No registers are modified.
 */
cpu_to_xmm:
	.irp	reg, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	movdqu	%xmm\reg, \reg * 16(%rdi)
	.endr
	retq

	.size cpu_to_xmm, . - cpu_to_xmm


	.globl xmm_to_cpu
	.type xmm_to_cpu, @function
/*
 * xmm_to_cpu: load %xmm0-%xmm15 from memory.
 *
 * In:  %rdi = source buffer; 16 * 16 = 256 bytes are read, register N
 *      being loaded from byte offset N * 16.
 *
 * Unaligned loads (movdqu) are used, so this routine does not itself
 * require the buffer to be 16-byte aligned.  Every one of
 * %xmm0-%xmm15 is overwritten on return.
 */
xmm_to_cpu:
	.irp	reg, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	movdqu	\reg * 16(%rdi), %xmm\reg
	.endr
	retq

	.size xmm_to_cpu, . - xmm_to_cpu


	.globl cpu_to_avx
	.type cpu_to_avx, @function
/*
 * cpu_to_avx: store the current contents of %ymm0-%ymm15 to memory.
 *
 * In:  %rdi = destination buffer; 16 * 32 = 512 bytes are written,
 *      register N landing at byte offset N * 32.
 *
 * Unaligned stores (vmovdqu) are used, so this routine does not itself
 * require the buffer to be 32-byte aligned.  No registers are modified.
 * The instructions are AVX-encoded; nothing here checks that AVX is
 * available before executing them.
 */
cpu_to_avx:
	.irp	reg, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	vmovdqu	%ymm\reg, \reg * 32(%rdi)
	.endr
	retq

	.size cpu_to_avx, . - cpu_to_avx


	.globl avx_to_cpu
	.type avx_to_cpu, @function
/*
 * avx_to_cpu: load %ymm0-%ymm15 from memory.
 *
 * In:  %rdi = source buffer; 16 * 32 = 512 bytes are read, register N
 *      being loaded from byte offset N * 32.
 *
 * Unaligned loads (vmovdqu) are used, so this routine does not itself
 * require the buffer to be 32-byte aligned.  Every one of
 * %ymm0-%ymm15 is overwritten on return.  The instructions are
 * AVX-encoded; nothing here checks that AVX is available before
 * executing them.
 */
avx_to_cpu:
	.irp	reg, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	vmovdqu	\reg * 32(%rdi), %ymm\reg
	.endr
	retq

	.size avx_to_cpu, . - avx_to_cpu

	.section        .note.GNU-stack,"",@progbits