/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009 Intel Corporation
 * All Rights Reserved.
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
 * instructions.  This file contains an accelerated
 * Galois Field Multiplication implementation.
 *
 * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
 * carry-less multiplication. More information about PCLMULQDQ can be
 * found at:
 * http://software.intel.com/en-us/articles/
 * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
 *
 */

/*
 * ====================================================================
 * OpenSolaris OS modifications
 *
 * This source originates as file galois_hash_asm.c from
 * Intel Corporation dated September 21, 2009.
 *
 * This OpenSolaris version has these major changes from the original source:
 *
 * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
 * definition for lint.
 *
 * 2. Formatted code, added comments, and added #includes and #defines.
 *
 * 3. If bit CR0.TS is set, clear and set the TS bit, after and before
 * calling kpreempt_disable() and kpreempt_enable().
 * If the TS bit is not set, save and restore %xmm registers at the beginning
 * and end of function calls (%xmm* registers are not saved and restored
 * during kernel thread preemption).
 * NOTE(review): the code in this file neither touches CR0 nor saves or
 * restores any %xmm register; this item looks stale -- confirm that the
 * callers now take care of the FPU/SIMD state.
 *
 * 4. Removed code to perform hashing.  This is already done with C macro
 * GHASH in gcm.c.  For better performance, this removed code should be
 * reintegrated in the future to replace the C GHASH macro.
 *
 * 5. Added code to byte swap 16-byte input and output.
 *
 * 6. Folded in comments from the original C source with embedded assembly
 * (SB_w_shift_xor.c)
 *
 * 7. Renamed function and reordered parameters to match OpenSolaris:
 * Intel interface:
 *	void galois_hash_asm(unsigned char *hk, unsigned char *s,
 *		unsigned char *d, int length)
 * OpenSolaris OS interface:
 *	void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
 * ====================================================================
 */


#if defined(lint) || defined(__lint)	/* lint */

#include <sys/types.h>

/*
 * Empty C stand-in with the same prototype as the assembly routine.
 * It is only compiled when lint or __lint is defined.
 */
void
gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) {
	(void) x_in, (void) y, (void) res;
}

#elif defined(HAVE_PCLMULQDQ)	/* guard by instruction set */

#define _ASM
#include <sys/asm_linkage.h>

/*
 * Use this mask to byte-swap a 16-byte integer with the pshufb instruction.
 * Entry i of the table is 15 - i, so output byte i is taken from input
 * byte 15 - i.
 */

// C equivalent of the table below:
// static uint8_t byte_swap16_mask[] = {
//	 15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 };
SECTION_STATIC
.balign XMM_ALIGN
.Lbyte_swap16_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0


/*
 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
 *
 * Perform a carry-less multiplication (that is, use XOR instead of the
 * multiply operator) on P1 and P2 and place the result in P3.
 *
 * Byte swap the input and the output.
 *
 * Note: x_in, y, and res each point to a 16-byte number
 * (an array of two 64-bit integers).  All three are accessed with
 * unaligned 16-byte moves (movdqu).
 *
 * Note2: For kernel code, caller is responsible for ensuring
 * kpreempt_disable() has been called.  This is because %xmm registers are
 * not saved/restored.  Clear and set the CR0.TS bit on entry and exit,
 * respectively, if TS is set on entry.  Otherwise, if TS is not set,
 * save and restore %xmm registers on the stack.
 * NOTE(review): the routine below does not read or write CR0 and does not
 * save anything on the stack, so the CR0.TS and save/restore sentences
 * appear to describe an earlier revision -- confirm against the callers.
 * As written, the routine overwrites %rax and %xmm0 through %xmm10.
 *
 * Note3: Original Intel definition:
 * void galois_hash_asm(unsigned char *hk, unsigned char *s,
 *	unsigned char *d, int length)
 *
 * Note4: Register/parameter mapping:
 * Intel:
 *	Parameter 1: %rcx (copied to %xmm0)	hk or x_in
 *	Parameter 2: %rdx (copied to %xmm1)	s or y
 *	Parameter 3: %rdi (result)		d or res
 * OpenSolaris:
 *	Parameter 1: %rdi (copied to %xmm0)	x_in
 *	Parameter 2: %rsi (copied to %xmm1)	y
 *	Parameter 3: %rdx (result)		res
 */

ENTRY_NP(gcm_mul_pclmulqdq)
	//
	// Copy Parameters
	//
	movdqu	(%rdi), %xmm0	// P1: 16 bytes at x_in
	movdqu	(%rsi), %xmm1	// P2: 16 bytes at y

	//
	// Byte swap 16-byte input
	//
	lea	.Lbyte_swap16_mask(%rip), %rax
	movups	(%rax), %xmm10	// xmm10 keeps the mask until the final swap
	pshufb	%xmm10, %xmm0
	pshufb	%xmm10, %xmm1


	//
	// Multiply with the hash key
	//
	// Four 64x64-bit carry-less products; a is xmm0, b is xmm1, and the
	// index 0/1 names the low/high 64-bit half.
	movdqu	%xmm0, %xmm3
	pclmulqdq $0, %xmm1, %xmm3	// xmm3 holds a0*b0

	movdqu	%xmm0, %xmm4
	pclmulqdq $16, %xmm1, %xmm4	// xmm4 holds a0*b1

	movdqu	%xmm0, %xmm5
	pclmulqdq $1, %xmm1, %xmm5	// xmm5 holds a1*b0
	movdqu	%xmm0, %xmm6
	pclmulqdq $17, %xmm1, %xmm6	// xmm6 holds a1*b1

	pxor	%xmm5, %xmm4	// xmm4 holds a0*b1 + a1*b0

	movdqu	%xmm4, %xmm5	// move the contents of xmm4 to xmm5
	psrldq	$8, %xmm4	// shift xmm4 right by 8 bytes (64 bits)
	pslldq	$8, %xmm5	// shift xmm5 left by 8 bytes (64 bits)
	pxor	%xmm5, %xmm3
	pxor	%xmm4, %xmm6	// Register pair <xmm6:xmm3> holds the result
				// of the carry-less multiplication of
				// xmm0 by xmm1.

	// We shift the result of the multiplication by one bit position
	// to the left to compensate for the fact that the bits are reversed.
	//
	// pslld/psrld work on each 32-bit lane separately, so the bit shifted
	// out of the top of every lane is captured with psrld $31 and moved
	// into the next higher lane with a 4-byte pslldq.  xmm9 carries the
	// top bit of xmm3 across into the lowest lane of xmm6.
	movdqu	%xmm3, %xmm7
	movdqu	%xmm6, %xmm8
	pslld	$1, %xmm3
	pslld	$1, %xmm6
	psrld	$31, %xmm7
	psrld	$31, %xmm8
	movdqu	%xmm7, %xmm9
	pslldq	$4, %xmm8
	pslldq	$4, %xmm7
	psrldq	$12, %xmm9
	por	%xmm7, %xmm3
	por	%xmm8, %xmm6
	por	%xmm9, %xmm6

	//
	// First phase of the reduction
	//
	// Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
	// independently.
	movdqu	%xmm3, %xmm7
	movdqu	%xmm3, %xmm8
	movdqu	%xmm3, %xmm9
	pslld	$31, %xmm7	// packed left shift << 31
	pslld	$30, %xmm8	// packed left shift << 30
	pslld	$25, %xmm9	// packed left shift << 25
	pxor	%xmm8, %xmm7	// xor the shifted versions
	pxor	%xmm9, %xmm7
	movdqu	%xmm7, %xmm8	// copy kept in xmm8 for the second phase
	pslldq	$12, %xmm7
	psrldq	$4, %xmm8
	pxor	%xmm7, %xmm3	// first phase of the reduction complete

	//
	// Second phase of the reduction
	//
	// Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
	// shift operations.
	movdqu	%xmm3, %xmm2
	movdqu	%xmm3, %xmm4
	movdqu	%xmm3, %xmm5
	psrld	$1, %xmm2	// packed right shift >> 1
	psrld	$2, %xmm4	// packed right shift >> 2
	psrld	$7, %xmm5	// packed right shift >> 7
	pxor	%xmm4, %xmm2	// xor the shifted versions
	pxor	%xmm5, %xmm2
	pxor	%xmm8, %xmm2	// fold in the value saved in xmm8 by phase one
	pxor	%xmm2, %xmm3
	pxor	%xmm3, %xmm6	// the result is in xmm6

	//
	// Byte swap 16-byte result
	//
	pshufb	%xmm10, %xmm6	// %xmm10 has the swap mask

	//
	// Store the result
	//
	movdqu	%xmm6, (%rdx)	// P3: 16 bytes written to res


	//
	// Return
	//
	RET
	SET_SIZE(gcm_mul_pclmulqdq)

#endif	/* lint || __lint */

#ifdef __ELF__
.section .note.GNU-stack,"",%progbits
#endif