1104d3bdeSDan OpenSolaris Anderson/* 2104d3bdeSDan OpenSolaris Anderson * CDDL HEADER START 3104d3bdeSDan OpenSolaris Anderson * 4104d3bdeSDan OpenSolaris Anderson * The contents of this file are subject to the terms of the 5104d3bdeSDan OpenSolaris Anderson * Common Development and Distribution License (the "License"). 6104d3bdeSDan OpenSolaris Anderson * You may not use this file except in compliance with the License. 7104d3bdeSDan OpenSolaris Anderson * 8104d3bdeSDan OpenSolaris Anderson * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9104d3bdeSDan OpenSolaris Anderson * or http://www.opensolaris.org/os/licensing. 10104d3bdeSDan OpenSolaris Anderson * See the License for the specific language governing permissions 11104d3bdeSDan OpenSolaris Anderson * and limitations under the License. 12104d3bdeSDan OpenSolaris Anderson * 13104d3bdeSDan OpenSolaris Anderson * When distributing Covered Code, include this CDDL HEADER in each 14104d3bdeSDan OpenSolaris Anderson * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15104d3bdeSDan OpenSolaris Anderson * If applicable, add the following below this CDDL HEADER, with the 16104d3bdeSDan OpenSolaris Anderson * fields enclosed by brackets "[]" replaced with your own identifying 17104d3bdeSDan OpenSolaris Anderson * information: Portions Copyright [yyyy] [name of copyright owner] 18104d3bdeSDan OpenSolaris Anderson * 19104d3bdeSDan OpenSolaris Anderson * CDDL HEADER END 20104d3bdeSDan OpenSolaris Anderson */ 21104d3bdeSDan OpenSolaris Anderson 22104d3bdeSDan OpenSolaris Anderson/* 23104d3bdeSDan OpenSolaris Anderson * Copyright (c) 2009 Intel Corporation 24104d3bdeSDan OpenSolaris Anderson * All Rights Reserved. 25104d3bdeSDan OpenSolaris Anderson */ 26104d3bdeSDan OpenSolaris Anderson/* 27104d3bdeSDan OpenSolaris Anderson * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 28104d3bdeSDan OpenSolaris Anderson * Use is subject to license terms. 
29104d3bdeSDan OpenSolaris Anderson */ 30104d3bdeSDan OpenSolaris Anderson 31104d3bdeSDan OpenSolaris Anderson/* 32104d3bdeSDan OpenSolaris Anderson * Accelerated GHASH implementation with Intel PCLMULQDQ-NI 33104d3bdeSDan OpenSolaris Anderson * instructions. This file contains an accelerated 34104d3bdeSDan OpenSolaris Anderson * Galois Field Multiplication implementation. 35104d3bdeSDan OpenSolaris Anderson * 36104d3bdeSDan OpenSolaris Anderson * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH, 37104d3bdeSDan OpenSolaris Anderson * carry-less multiplication. More information about PCLMULQDQ can be 38104d3bdeSDan OpenSolaris Anderson * found at: 39104d3bdeSDan OpenSolaris Anderson * http://software.intel.com/en-us/articles/ 40104d3bdeSDan OpenSolaris Anderson * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/ 41104d3bdeSDan OpenSolaris Anderson * 42104d3bdeSDan OpenSolaris Anderson */ 43104d3bdeSDan OpenSolaris Anderson 44104d3bdeSDan OpenSolaris Anderson/* 45104d3bdeSDan OpenSolaris Anderson * ==================================================================== 46104d3bdeSDan OpenSolaris Anderson * OpenSolaris OS modifications 47104d3bdeSDan OpenSolaris Anderson * 48104d3bdeSDan OpenSolaris Anderson * This source originates as file galois_hash_asm.c from 49104d3bdeSDan OpenSolaris Anderson * Intel Corporation dated September 21, 2009. 50104d3bdeSDan OpenSolaris Anderson * 51104d3bdeSDan OpenSolaris Anderson * This OpenSolaris version has these major changes from the original source: 52104d3bdeSDan OpenSolaris Anderson * 53104d3bdeSDan OpenSolaris Anderson * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from 54104d3bdeSDan OpenSolaris Anderson * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function 55104d3bdeSDan OpenSolaris Anderson * definition for lint. 56104d3bdeSDan OpenSolaris Anderson * 57104d3bdeSDan OpenSolaris Anderson * 2. Formatted code, added comments, and added #includes and #defines. 
58104d3bdeSDan OpenSolaris Anderson * 59*8de5c4f4SDan OpenSolaris Anderson * 3. If bit CR0.TS is set, clear and set the TS bit, after and before 60104d3bdeSDan OpenSolaris Anderson * calling kpreempt_disable() and kpreempt_enable(). 61104d3bdeSDan OpenSolaris Anderson * If the TS bit is not set, save and restore %xmm registers at the beginning 62104d3bdeSDan OpenSolaris Anderson * and end of function calls (%xmm* registers are not saved and restored 63104d3bdeSDan OpenSolaris Anderson * during kernel thread preemption). 64104d3bdeSDan OpenSolaris Anderson * 65*8de5c4f4SDan OpenSolaris Anderson * 4. Removed code to perform hashing. This is already done with C macro 66104d3bdeSDan OpenSolaris Anderson * GHASH in gcm.c. For better performance, this removed code should be 67104d3bdeSDan OpenSolaris Anderson * reintegrated in the future to replace the C GHASH macro. 68104d3bdeSDan OpenSolaris Anderson * 69*8de5c4f4SDan OpenSolaris Anderson * 5. Added code to byte swap 16-byte input and output. 70104d3bdeSDan OpenSolaris Anderson * 71*8de5c4f4SDan OpenSolaris Anderson * 6. Folded in comments from the original C source with embedded assembly 72104d3bdeSDan OpenSolaris Anderson * (SB_w_shift_xor.c) 73104d3bdeSDan OpenSolaris Anderson * 74*8de5c4f4SDan OpenSolaris Anderson * 7. 
Renamed function and reordered parameters to match OpenSolaris:
 * Intel interface:
 *	void galois_hash_asm(unsigned char *hk, unsigned char *s,
 *		unsigned char *d, int length)
 * OpenSolaris OS interface:
 *	void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
 * ====================================================================
 */


#if defined(lint) || defined(__lint)

#include <sys/types.h>

/*
 * Dummy C definition of gcm_mul_pclmulqdq() for lint only; the real
 * implementation is the assembly in the non-lint branch of this file.
 */
/* ARGSUSED */
void
gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) {
}

#else	/* lint */

#include <sys/asm_linkage.h>
#include <sys/controlregs.h>
#ifdef _KERNEL
#include <sys/machprivregs.h>
#endif

#ifdef _KERNEL
	/*
	 * PROTECTED_CLTS
	 *
	 * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv.  That is,
	 * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it
	 * uses it to pass P2 to syscall.  PROTECTED_CLTS therefore wraps
	 * CLTS in a push/pop of %rsi when built for __xpv, and is plain CLTS
	 * otherwise.
	 * This also occurs with the STTS macro, but we don't care if
	 * P2 (%rsi) is modified just before function exit.
	 * The CLTS and STTS macros push and pop P1 (%rdi) already.
	 */
#ifdef __xpv
#define	PROTECTED_CLTS \
	push	%rsi; \
	CLTS; \
	pop	%rsi
#else
#define	PROTECTED_CLTS \
	CLTS
#endif	/* __xpv */

	/*
	 * CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg)
	 *
	 * Set up a frame (push %rbp, then %rbp = %rsp) and read %cr0 into
	 * tmpreg.
	 *
	 * If CR0_TS is not set, align %rsp down to XMM_ALIGN, reserve
	 * XMM_SIZE * 11 bytes and save %xmm0 - %xmm10 there, with %xmm0 at
	 * 160(%rsp) down to %xmm10 at (%rsp).
	 * Otherwise clear CR0_TS with PROTECTED_CLTS and save nothing.
	 *
	 * tmpreg keeps the %cr0 value read here.  SET_TS_OR_POP_XMM_REGISTERS
	 * tests that same value, so the register passed as tmpreg must not
	 * be modified between the two macros.  Local labels 1 and 2 are used.
	 *
	 * NOTE(review): the fixed 16-byte offset steps assume XMM_SIZE is 16;
	 * XMM_SIZE and XMM_ALIGN are defined outside this file -- confirm.
	 */
#define	CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg) \
	push	%rbp; \
	mov	%rsp, %rbp; \
	movq	%cr0, tmpreg; \
	testq	$CR0_TS, tmpreg; \
	jnz	1f; \
	and	$-XMM_ALIGN, %rsp; \
	sub	$[XMM_SIZE * 11], %rsp; \
	movaps	%xmm0, 160(%rsp); \
	movaps	%xmm1, 144(%rsp); \
	movaps	%xmm2, 128(%rsp); \
	movaps	%xmm3, 112(%rsp); \
	movaps	%xmm4, 96(%rsp); \
	movaps	%xmm5, 80(%rsp); \
	movaps	%xmm6, 64(%rsp); \
	movaps	%xmm7, 48(%rsp); \
	movaps	%xmm8, 32(%rsp); \
	movaps	%xmm9, 16(%rsp); \
	movaps	%xmm10, (%rsp); \
	jmp	2f; \
1: \
	PROTECTED_CLTS; \
2:


	/*
	 * SET_TS_OR_POP_XMM_REGISTERS(tmpreg)
	 *
	 * Counterpart of CLEAR_TS_OR_PUSH_XMM_REGISTERS; tmpreg must still
	 * hold the %cr0 value that macro read.
	 *
	 * If CR0_TS was not set in that value, reload %xmm0 - %xmm10 from
	 * the save area at (%rsp) .. 160(%rsp); otherwise set CR0_TS again
	 * with STTS(tmpreg).
	 * In both cases finish by restoring %rsp from %rbp and popping %rbp,
	 * which discards the save area and the frame.
	 */
#define	SET_TS_OR_POP_XMM_REGISTERS(tmpreg) \
	testq	$CR0_TS, tmpreg; \
	jnz	1f; \
	movaps	(%rsp), %xmm10; \
	movaps	16(%rsp), %xmm9; \
	movaps	32(%rsp), %xmm8; \
	movaps	48(%rsp), %xmm7; \
	movaps	64(%rsp), %xmm6; \
	movaps	80(%rsp), %xmm5; \
	movaps	96(%rsp), %xmm4; \
	movaps	112(%rsp), %xmm3; \
	movaps	128(%rsp), %xmm2; \
	movaps	144(%rsp), %xmm1; \
	movaps	160(%rsp), %xmm0; \
	jmp	2f; \
1: \
	STTS(tmpreg); \
2: \
	mov	%rbp, %rsp; \
	pop	%rbp

#else
	/* Not _KERNEL: the TS / %xmm save-restore macros expand to nothing. */
#define	PROTECTED_CLTS
#define	CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg)
#define	SET_TS_OR_POP_XMM_REGISTERS(tmpreg)
#endif	/* _KERNEL */

/*
 * Use this mask to byte-swap a 16-byte integer with the pshufb instruction.
 * It is placed in .text, aligned to XMM_ALIGN, and is loaded with movaps
 * by gcm_mul_pclmulqdq().
 */

// C equivalent of the table below:
// static uint8_t byte_swap16_mask[] = {
//	 15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 };
.text
.align XMM_ALIGN
.Lbyte_swap16_mask:
	.byte	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0



/*
 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
 *
 * Perform a carry-less multiplication (that is, use XOR instead of the
 * multiply operator) on P1 and P2 and place the result in P3.
 *
 * Byte swap the input and the output.
 *
 * Note: x_in, y, and res all point to a block of 16-byte numbers
 * (an array of two 64-bit integers).  They are read and written with
 * unaligned 128-bit moves (movdqu).
 *
 * Note2: For kernel code, caller is responsible for ensuring
 * kpreempt_disable() has been called.  This is because %xmm registers are
 * not saved/restored.  Clear and set the CR0.TS bit on entry and exit,
 * respectively, if TS is set on entry.  Otherwise, if TS is not set,
 * save and restore %xmm registers on the stack.
 *
 * Note3: Original Intel definition:
 * void galois_hash_asm(unsigned char *hk, unsigned char *s,
 *	unsigned char *d, int length)
 *
 * Note4: Register/parameter mapping:
 * Intel:
 *	Parameter 1: %rcx (copied to %xmm0)	hk or x_in
 *	Parameter 2: %rdx (copied to %xmm1)	s or y
 *	Parameter 3: %rdi (result)		d or res
 * OpenSolaris:
 *	Parameter 1: %rdi (copied to %xmm0)	x_in
 *	Parameter 2: %rsi (copied to %xmm1)	y
 *	Parameter 3: %rdx (result)		res
 *
 * Note5: Registers written by the body: %rax, %xmm0 - %xmm10, and %r10
 * (the tmpreg handed to the TS / %xmm save-restore macros).
 */

ENTRY_NP(gcm_mul_pclmulqdq)
	// %r10 is the macro's tmpreg; the matching SET_TS_OR_POP_XMM_REGISTERS
	// at the end is passed the same register, and nothing in between
	// writes to it.
	CLEAR_TS_OR_PUSH_XMM_REGISTERS(%r10)

	//
	// Copy Parameters
	//
	movdqu	(%rdi), %xmm0	// P1
	movdqu	(%rsi), %xmm1	// P2

	//
	// Byte swap 16-byte input
	//
	lea	.Lbyte_swap16_mask(%rip), %rax
	movaps	(%rax), %xmm10	// %xmm10 keeps the swap mask until the end
	pshufb	%xmm10, %xmm0
	pshufb	%xmm10, %xmm1


	//
	// Multiply with the hash key
	//
	movdqu	%xmm0, %xmm3
	pclmulqdq $0, %xmm1, %xmm3	// xmm3 holds a0*b0

	movdqu	%xmm0, %xmm4
	pclmulqdq $16, %xmm1, %xmm4	// xmm4 holds a0*b1

	movdqu	%xmm0, %xmm5
	pclmulqdq $1, %xmm1, %xmm5	// xmm5 holds a1*b0
	movdqu	%xmm0, %xmm6
	pclmulqdq $17, %xmm1, %xmm6	// xmm6 holds a1*b1

	pxor	%xmm5, %xmm4	// xmm4 holds a0*b1 + a1*b0

	movdqu	%xmm4, %xmm5	// copy xmm4 into xmm5
	psrldq	$8, %xmm4	// byte-shift xmm4 right by 64 bits
	pslldq	$8, %xmm5	// byte-shift xmm5 left by 64 bits
	pxor	%xmm5, %xmm3
	pxor	%xmm4, %xmm6	// Register pair <xmm6:xmm3> holds the result
				// of the carry-less multiplication of
				// xmm0 by xmm1.

	// We shift the result of the multiplication by one bit position
	// to the left to account for the fact that the bits are reversed.
	// pslld only shifts within each 32-bit lane, so the bit shifted out
	// of every lane is extracted separately (psrld $31), moved up one
	// lane (pslldq $4) and OR-ed back in.  xmm9 carries the top bit of
	// xmm3 into the lowest lane of xmm6.
	movdqu	%xmm3, %xmm7
	movdqu	%xmm6, %xmm8
	pslld	$1, %xmm3
	pslld	$1, %xmm6
	psrld	$31, %xmm7
	psrld	$31, %xmm8
	movdqu	%xmm7, %xmm9
	pslldq	$4, %xmm8
	pslldq	$4, %xmm7
	psrldq	$12, %xmm9
	por	%xmm7, %xmm3
	por	%xmm8, %xmm6
	por	%xmm9, %xmm6

	//
	// First phase of the reduction
	//
	// Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
	// independently.
	movdqu	%xmm3, %xmm7
	movdqu	%xmm3, %xmm8
	movdqu	%xmm3, %xmm9
	pslld	$31, %xmm7	// packed left shift, << 31
	pslld	$30, %xmm8	// packed left shift, << 30
	pslld	$25, %xmm9	// packed left shift, << 25
	pxor	%xmm8, %xmm7	// xor the shifted versions
	pxor	%xmm9, %xmm7
	movdqu	%xmm7, %xmm8
	pslldq	$12, %xmm7
	psrldq	$4, %xmm8	// xmm8 is consumed in the second phase
	pxor	%xmm7, %xmm3	// first phase of the reduction complete

	//
	// Second phase of the reduction
	//
	// Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
	// shift operations.
	movdqu	%xmm3, %xmm2
	movdqu	%xmm3, %xmm4
	movdqu	%xmm3, %xmm5
	psrld	$1, %xmm2	// packed right shift, >> 1
	psrld	$2, %xmm4	// packed right shift, >> 2
	psrld	$7, %xmm5	// packed right shift, >> 7
	pxor	%xmm4, %xmm2	// xor the shifted versions
	pxor	%xmm5, %xmm2
	pxor	%xmm8, %xmm2	// fold in the remainder from the first phase
	pxor	%xmm2, %xmm3
	pxor	%xmm3, %xmm6	// the result is in xmm6

	//
	// Byte swap 16-byte result
	//
	pshufb	%xmm10, %xmm6	// %xmm10 has the swap mask

	//
	// Store the result
	//
	movdqu	%xmm6, (%rdx)	// P3


	//
	// Cleanup and Return
	//
	SET_TS_OR_POP_XMM_REGISTERS(%r10)
	ret
	SET_SIZE(gcm_mul_pclmulqdq)

#endif	/* lint || __lint */