1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22/* 23 * Copyright (c) 2009 Intel Corporation 24 * All Rights Reserved. 25 */ 26/* 27 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 28 * Use is subject to license terms. 29 */ 30 31/* 32 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI 33 * instructions. This file contains an accelerated 34 * Galois Field Multiplication implementation. 35 * 36 * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH, 37 * carry-less multiplication. More information about PCLMULQDQ can be 38 * found at: 39 * http://software.intel.com/en-us/articles/ 40 * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/ 41 * 42 */ 43 44/* 45 * ==================================================================== 46 * OpenSolaris OS modifications 47 * 48 * This source originates as file galois_hash_asm.c from 49 * Intel Corporation dated September 21, 2009. 50 * 51 * This OpenSolaris version has these major changes from the original source: 52 * 53 * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from 54 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function 55 * definition for lint. 
56 * 57 * 2. Formatted code, added comments, and added #includes and #defines. 58 * 59 * 3. Commented out pclmulqdq and pshufb instructions and replaced with 60 * .byte sequences (as pclmulqdq isn't supported yet by all of the gas, as, 61 * and aw assemblers). 62 * 63 * 4. If bit CR0.TS is set, clear and set the TS bit, after and before 64 * calling kpreempt_disable() and kpreempt_enable(). 65 * If the TS bit is not set, Save and restore %xmm registers at the beginning 66 * and end of function calls (%xmm* registers are not saved and restored by 67 * during kernel thread preemption). 68 * 69 * 5. Removed code to perform hashing. This is already done with C macro 70 * GHASH in gcm.c. For better performance, this removed code should be 71 * reintegrated in the future to replace the C GHASH macro. 72 * 73 * 6. Added code to byte swap 16-byte input and output. 74 * 75 * 7. Folded in comments from the original C source with embedded assembly 76 * (SB_w_shift_xor.c) 77 * 78 * 8. Renamed function and reordered parameters to match OpenSolaris: 79 * Intel interface: 80 * void galois_hash_asm(unsigned char *hk, unsigned char *s, 81 * unsigned char *d, int length) 82 * OpenSolaris OS interface: 83 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); 84 * ==================================================================== 85 */ 86 87 88#if defined(lint) || defined(__lint) 89 90#include <sys/types.h> 91 92/* ARGSUSED */ 93void 94gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) { 95} 96 97#else /* lint */ 98 99#include <sys/asm_linkage.h> 100#include <sys/controlregs.h> 101#ifdef _KERNEL 102#include <sys/machprivregs.h> 103#endif 104 105#ifdef _KERNEL 106 /* 107 * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv. That is, 108 * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it 109 * uses it to pass P2 to syscall. 
 * This also occurs with the STTS macro, but we don't care if
 * P2 (%rsi) is modified just before function exit.
 * The CLTS and STTS macros push and pop P1 (%rdi) already.
 */
#ifdef __xpv
#define	PROTECTED_CLTS \
	push	%rsi; \
	CLTS; \
	pop	%rsi
#else
#define	PROTECTED_CLTS \
	CLTS
#endif	/* __xpv */

	/*
	 * If CR0_TS is not set, align the stack (with push %rbp) and save
	 * %xmm0 - %xmm10 on the stack; otherwise clear CR0_TS so %xmm
	 * register access does not fault.
	 *
	 * Note: tmpreg is left holding the sampled %cr0 value; the caller
	 * must keep tmpreg unmodified until the matching
	 * SET_TS_OR_POP_XMM_REGISTERS(tmpreg) re-tests it on exit.
	 * Numeric local labels 1:/2: are referenced only as short forward
	 * jumps (1f/2f), so reuse across macros is safe.
	 */
#define	CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg) \
	push	%rbp; \
	mov	%rsp, %rbp; \
	movq	%cr0, tmpreg; \
	testq	$CR0_TS, tmpreg; \
	jnz	1f; \
	and	$-XMM_ALIGN, %rsp; \
	sub	$[XMM_SIZE * 11], %rsp; \
	movaps	%xmm0, 160(%rsp); \
	movaps	%xmm1, 144(%rsp); \
	movaps	%xmm2, 128(%rsp); \
	movaps	%xmm3, 112(%rsp); \
	movaps	%xmm4, 96(%rsp); \
	movaps	%xmm5, 80(%rsp); \
	movaps	%xmm6, 64(%rsp); \
	movaps	%xmm7, 48(%rsp); \
	movaps	%xmm8, 32(%rsp); \
	movaps	%xmm9, 16(%rsp); \
	movaps	%xmm10, (%rsp); \
	jmp	2f; \
1: \
	PROTECTED_CLTS; \
2:


	/*
	 * If CR0_TS was not set above, pop %xmm0 - %xmm10 off stack,
	 * otherwise set CR0_TS.
 */
#define	SET_TS_OR_POP_XMM_REGISTERS(tmpreg) \
	testq	$CR0_TS, tmpreg; \
	jnz	1f; \
	movaps	(%rsp), %xmm10; \
	movaps	16(%rsp), %xmm9; \
	movaps	32(%rsp), %xmm8; \
	movaps	48(%rsp), %xmm7; \
	movaps	64(%rsp), %xmm6; \
	movaps	80(%rsp), %xmm5; \
	movaps	96(%rsp), %xmm4; \
	movaps	112(%rsp), %xmm3; \
	movaps	128(%rsp), %xmm2; \
	movaps	144(%rsp), %xmm1; \
	movaps	160(%rsp), %xmm0; \
	jmp	2f; \
1: \
	STTS(tmpreg); \
2: \
	mov	%rbp, %rsp; \
	pop	%rbp


#else
	/* Non-kernel (userland) builds: no CR0.TS or %xmm save/restore */
#define	PROTECTED_CLTS
#define	CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg)
#define	SET_TS_OR_POP_XMM_REGISTERS(tmpreg)
#endif	/* _KERNEL */

/*
 * Use this mask to byte-swap a 16-byte integer with the pshufb instruction
 */

// static uint8_t byte_swap16_mask[] = {
//	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
.text
.align XMM_ALIGN
.Lbyte_swap16_mask:
	.byte	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0



/*
 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
 *
 * Perform a carry-less multiplication (that is, use XOR instead of the
 * multiply operator) on P1 and P2 and place the result in P3.
 *
 * Byte swap the input and the output.
 *
 * Note: x_in, y, and res all point to a block of 16-byte numbers
 * (an array of two 64-bit integers).
 *
 * Note2: For kernel code, caller is responsible for ensuring
 * kpreempt_disable() has been called.  This is because %xmm registers are
 * not saved/restored.  Clear and set the CR0.TS bit on entry and exit,
 * respectively, if TS is set on entry.  Otherwise, if TS is not set,
 * save and restore %xmm registers on the stack.
 *
 * Note3: Original Intel definition:
 *	void galois_hash_asm(unsigned char *hk, unsigned char *s,
 *		unsigned char *d, int length)
 *
 * Note4: Register/parameter mapping:
 *	Intel:
 *		Parameter 1: %rcx (copied to %xmm0)	hk or x_in
 *		Parameter 2: %rdx (copied to %xmm1)	s or y
 *		Parameter 3: %rdi (result)		d or res
 *	OpenSolaris:
 *		Parameter 1: %rdi (copied to %xmm0)	x_in
 *		Parameter 2: %rsi (copied to %xmm1)	y
 *		Parameter 3: %rdx (result)		res
 */

ENTRY_NP(gcm_mul_pclmulqdq)
	CLEAR_TS_OR_PUSH_XMM_REGISTERS(%r10)

	//
	// Copy Parameters
	//
	movdqu	(%rdi), %xmm0	// P1 (x_in)
	movdqu	(%rsi), %xmm1	// P2 (y)

	//
	// Byte swap 16-byte input
	// (%xmm10 keeps the swap mask; it is reused for the output below)
	//
	lea	.Lbyte_swap16_mask(%rip), %rax
	movaps	(%rax), %xmm10
	//pshufb %xmm10, %xmm0
	.byte	0x66, 0x41, 0x0f, 0x38, 0x00, 0xc2
	//pshufb %xmm10, %xmm1
	.byte	0x66, 0x41, 0x0f, 0x38, 0x00, 0xca


	//
	// Multiply with the hash key: schoolbook 64x64 carry-less
	// multiplication producing four 128-bit partial products.
	// (a1:a0 = %xmm0, b1:b0 = %xmm1)
	//
	movdqu	%xmm0, %xmm3
	//pclmulqdq $0, %xmm1, %xmm3	// xmm3 holds a0*b0
	.byte	0x66, 0x0f, 0x3a, 0x44, 0xd9, 0x00

	movdqu	%xmm0, %xmm4
	//pclmulqdq $16, %xmm1, %xmm4	// xmm4 holds a0*b1
	.byte	0x66, 0x0f, 0x3a, 0x44, 0xe1, 0x10

	movdqu	%xmm0, %xmm5
	//pclmulqdq $1, %xmm1, %xmm5	// xmm5 holds a1*b0
	.byte	0x66, 0x0f, 0x3a, 0x44, 0xe9, 0x01
	movdqu	%xmm0, %xmm6
	//pclmulqdq $17, %xmm1, %xmm6	// xmm6 holds a1*b1
	.byte	0x66, 0x0f, 0x3a, 0x44, 0xf1, 0x11

	pxor	%xmm5, %xmm4	// xmm4 holds a0*b1 + a1*b0

	// Split the middle term across the 128-bit halves of the result
	movdqu	%xmm4, %xmm5	// copy xmm4 into xmm5
	psrldq	$8, %xmm4	// shift xmm4 right by 64 bits
	pslldq	$8, %xmm5	// shift xmm5 left by 64 bits
	pxor	%xmm5, %xmm3
	pxor	%xmm4, %xmm6	// Register pair <xmm6:xmm3> holds the result
				// of the carry-less multiplication of
				// xmm0 by xmm1.
	// We shift the result of the multiplication by one bit position
	// to the left to cope for the fact that the bits are reversed.
	movdqu	%xmm3, %xmm7
	movdqu	%xmm6, %xmm8
	pslld	$1, %xmm3
	pslld	$1, %xmm6
	psrld	$31, %xmm7	// capture the carry bit of each dword
	psrld	$31, %xmm8
	movdqu	%xmm7, %xmm9
	pslldq	$4, %xmm8	// propagate carries across dword boundaries
	pslldq	$4, %xmm7
	psrldq	$12, %xmm9	// carry out of the low 128 bits ...
	por	%xmm7, %xmm3
	por	%xmm8, %xmm6
	por	%xmm9, %xmm6	// ... flows into the high 128 bits

	//
	// First phase of the reduction (modulo x^128 + x^7 + x^2 + x + 1)
	//
	// Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
	// independently.
	movdqu	%xmm3, %xmm7
	movdqu	%xmm3, %xmm8
	movdqu	%xmm3, %xmm9
	pslld	$31, %xmm7	// packed left shift, each dword << 31
	pslld	$30, %xmm8	// packed left shift, each dword << 30
	pslld	$25, %xmm9	// packed left shift, each dword << 25
	pxor	%xmm8, %xmm7	// xor the shifted versions
	pxor	%xmm9, %xmm7
	movdqu	%xmm7, %xmm8	// xmm8 is reused in the second phase below
	pslldq	$12, %xmm7
	psrldq	$4, %xmm8
	pxor	%xmm7, %xmm3	// first phase of the reduction complete

	//
	// Second phase of the reduction
	//
	// Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
	// shift operations.
	movdqu	%xmm3, %xmm2
	movdqu	%xmm3, %xmm4
	movdqu	%xmm3, %xmm5
	psrld	$1, %xmm2	// packed right shift, each dword >> 1
	psrld	$2, %xmm4	// packed right shift, each dword >> 2
	psrld	$7, %xmm5	// packed right shift, each dword >> 7
	pxor	%xmm4, %xmm2	// xor the shifted versions
	pxor	%xmm5, %xmm2
	pxor	%xmm8, %xmm2	// fold in the residue saved above
	pxor	%xmm2, %xmm3
	pxor	%xmm3, %xmm6	// the result is in xmm6

	//
	// Byte swap 16-byte result
	//
	//pshufb %xmm10, %xmm6	// %xmm10 still has the swap mask
	.byte	0x66, 0x41, 0x0f, 0x38, 0x00, 0xf2

	//
	// Store the result
	//
	movdqu	%xmm6, (%rdx)	// P3 (res)


	//
	// Cleanup and Return
	// (%r10 still holds the %cr0 value sampled on entry)
	//
	SET_TS_OR_POP_XMM_REGISTERS(%r10)
	ret
	SET_SIZE(gcm_mul_pclmulqdq)

#endif	/* lint || __lint */