/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009 Intel Corporation
 * All Rights Reserved.
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
 * instructions.  This file contains an accelerated
 * Galois Field Multiplication implementation.
 *
 * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
 * carry-less multiplication. More information about PCLMULQDQ can be
 * found at:
 * http://software.intel.com/en-us/articles/
 * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
 *
 */

/*
 * ====================================================================
 * OpenSolaris OS modifications
 *
 * This source originates as file galois_hash_asm.c from
 * Intel Corporation dated September 21, 2009.
 *
 * This OpenSolaris version has these major changes from the original source:
 *
 * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
 * definition for lint.
 *
 * 2. Formatted code, added comments, and added #includes and #defines.
 *
 * 3. If bit CR0.TS is set, clear and set the TS bit, after and before
 * calling kpreempt_disable() and kpreempt_enable().
 * If the TS bit is not set, save and restore %xmm registers at the beginning
 * and end of function calls (%xmm* registers are not saved and restored
 * during kernel thread preemption).
 *
 * 4. Removed code to perform hashing.  This is already done with C macro
 * GHASH in gcm.c.  For better performance, this removed code should be
 * reintegrated in the future to replace the C GHASH macro.
 *
 * 5. Added code to byte swap 16-byte input and output.
 *
 * 6. Folded in comments from the original C source with embedded assembly
 * (SB_w_shift_xor.c)
 *
 * 7. Renamed function and reordered parameters to match OpenSolaris:
 * Intel interface:
 *	void galois_hash_asm(unsigned char *hk, unsigned char *s,
 *		unsigned char *d, int length)
 * OpenSolaris OS interface:
 *	void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
 * ====================================================================
 */


#if defined(lint) || defined(__lint)

#include <sys/types.h>

/* ARGSUSED */
void
gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) {
}

#else	/* lint */

#include <sys/asm_linkage.h>
#include <sys/controlregs.h>
#ifdef _KERNEL
#include <sys/machprivregs.h>
#endif

#ifdef _KERNEL
	/*
	 * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv.  That is,
	 * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it
	 * uses it to pass P2 to syscall.
	 * This also occurs with the STTS macro, but we don't care if
	 * P2 (%rsi) is modified just before function exit.
	 * The CLTS and STTS macros push and pop P1 (%rdi) already.
	 */
#ifdef __xpv
#define	PROTECTED_CLTS \
	push	%rsi; \
	CLTS; \
	pop	%rsi
#else
#define	PROTECTED_CLTS \
	CLTS
#endif	/* __xpv */

	/*
	 * Function prologue for kernel builds.
	 *
	 * Sets up a frame (push %rbp; mov %rsp, %rbp) and copies %cr0 into
	 * tmpreg.  Then:
	 *   - If CR0_TS is not set, align the stack down to XMM_ALIGN,
	 *     reserve 11 slots of XMM_SIZE bytes and store %xmm0 - %xmm10
	 *     there (%xmm10 at the lowest address, %xmm0 at the highest).
	 *   - Otherwise clear CR0_TS (PROTECTED_CLTS) and save nothing.
	 *
	 * tmpreg must not be modified until the matching
	 * SET_TS_OR_POP_XMM_REGISTERS(tmpreg), which tests the same CR0 copy
	 * to decide which path to undo.
	 *
	 * NOTE(review): the fixed offsets 0 .. 160 step by 16, so they assume
	 * XMM_SIZE is 16; XMM_SIZE and XMM_ALIGN are defined outside this
	 * file -- confirm.
	 */
#define	CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg) \
	push	%rbp; \
	mov	%rsp, %rbp; \
	movq	%cr0, tmpreg; \
	testq	$CR0_TS, tmpreg; \
	jnz	1f; \
	and	$-XMM_ALIGN, %rsp; \
	sub	$[XMM_SIZE * 11], %rsp; \
	movaps	%xmm0, 160(%rsp); \
	movaps	%xmm1, 144(%rsp); \
	movaps	%xmm2, 128(%rsp); \
	movaps	%xmm3, 112(%rsp); \
	movaps	%xmm4, 96(%rsp); \
	movaps	%xmm5, 80(%rsp); \
	movaps	%xmm6, 64(%rsp); \
	movaps	%xmm7, 48(%rsp); \
	movaps	%xmm8, 32(%rsp); \
	movaps	%xmm9, 16(%rsp); \
	movaps	%xmm10, (%rsp); \
	jmp	2f; \
1: \
	PROTECTED_CLTS; \
2:


	/*
	 * Function epilogue for kernel builds; the inverse of
	 * CLEAR_TS_OR_PUSH_XMM_REGISTERS.
	 *
	 * tmpreg must still hold the CR0 value captured by the prologue.
	 * If CR0_TS was not set in that value, reload %xmm0 - %xmm10 from
	 * the stack; otherwise set CR0_TS again (STTS).  In both cases the
	 * frame is then torn down (mov %rbp, %rsp; pop %rbp), which also
	 * discards the aligned save area.
	 */
#define	SET_TS_OR_POP_XMM_REGISTERS(tmpreg) \
	testq	$CR0_TS, tmpreg; \
	jnz	1f; \
	movaps	(%rsp), %xmm10; \
	movaps	16(%rsp), %xmm9; \
	movaps	32(%rsp), %xmm8; \
	movaps	48(%rsp), %xmm7; \
	movaps	64(%rsp), %xmm6; \
	movaps	80(%rsp), %xmm5; \
	movaps	96(%rsp), %xmm4; \
	movaps	112(%rsp), %xmm3; \
	movaps	128(%rsp), %xmm2; \
	movaps	144(%rsp), %xmm1; \
	movaps	160(%rsp), %xmm0; \
	jmp	2f; \
1: \
	STTS(tmpreg); \
2: \
	mov	%rbp, %rsp; \
	pop	%rbp


#else
	/*
	 * Non-kernel builds: the TS/XMM save-restore macros expand to
	 * nothing, so the function has no prologue or epilogue.
	 */
#define	PROTECTED_CLTS
#define	CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg)
#define	SET_TS_OR_POP_XMM_REGISTERS(tmpreg)
#endif	/* _KERNEL */

/*
 * Use this mask to byte-swap a 16-byte integer with the pshufb instruction.
 * The mask is aligned to XMM_ALIGN because it is loaded with movaps.
 */

// static uint8_t byte_swap16_mask[] = {
//	 15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 };
.text
.align XMM_ALIGN
.Lbyte_swap16_mask:
	.byte	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0



/*
 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
 *
 * Perform a carry-less multiplication (that is, use XOR instead of the
 * multiply operator) on P1 and P2 and place the result in P3.
 *
 * Byte swap the input and the output.
 *
 * Note: x_in, y, and res all point to a block of 16-byte numbers
 * (an array of two 64-bit integers).  They are accessed with movdqu, so
 * no particular alignment is required.
 *
 * Note2: For kernel code, caller is responsible for ensuring
 * kpreempt_disable() has been called.  This is because %xmm registers are
 * not saved/restored.  Clear and set the CR0.TS bit on entry and exit,
 * respectively, if TS is set on entry.  Otherwise, if TS is not set,
 * save and restore %xmm registers on the stack.
 *
 * Note3: Original Intel definition:
 * void galois_hash_asm(unsigned char *hk, unsigned char *s,
 *	unsigned char *d, int length)
 *
 * Note4: Register/parameter mapping:
 * Intel:
 *	Parameter 1: %rcx (copied to %xmm0)	hk or x_in
 *	Parameter 2: %rdx (copied to %xmm1)	s or y
 *	Parameter 3: %rdi (result)		d or res
 * OpenSolaris:
 *	Parameter 1: %rdi (copied to %xmm0)	x_in
 *	Parameter 2: %rsi (copied to %xmm1)	y
 *	Parameter 3: %rdx (result)		res
 *
 * Note5: Registers written by this function:
 *	%rax		address of the byte-swap mask
 *	%r10		copy of %cr0 (kernel builds only; it is the tmpreg
 *			handed to the prologue/epilogue macros)
 *	%xmm0 - %xmm10	working registers; in kernel builds they are
 *			restored on exit only when CR0.TS was clear on entry
 */

ENTRY_NP(gcm_mul_pclmulqdq)
	CLEAR_TS_OR_PUSH_XMM_REGISTERS(%r10)

	//
	// Copy Parameters
	//
	movdqu	(%rdi), %xmm0	// P1
	movdqu	(%rsi), %xmm1	// P2

	//
	// Byte swap 16-byte input
	//
	lea	.Lbyte_swap16_mask(%rip), %rax
	movaps	(%rax), %xmm10	// %xmm10 keeps the mask until the final swap
	pshufb	%xmm10, %xmm0
	pshufb	%xmm10, %xmm1


	//
	// Multiply with the hash key
	//
	// a = %xmm0 (a1:a0), b = %xmm1 (b1:b0); four 64x64 carry-less
	// products are formed and then combined into a 256-bit result.
	//
	movdqu	%xmm0, %xmm3
	pclmulqdq $0, %xmm1, %xmm3	// xmm3 holds a0*b0

	movdqu	%xmm0, %xmm4
	pclmulqdq $16, %xmm1, %xmm4	// xmm4 holds a0*b1

	movdqu	%xmm0, %xmm5
	pclmulqdq $1, %xmm1, %xmm5	// xmm5 holds a1*b0
	movdqu	%xmm0, %xmm6
	pclmulqdq $17, %xmm1, %xmm6	// xmm6 holds a1*b1

	pxor	%xmm5, %xmm4		// xmm4 holds a0*b1 + a1*b0

	movdqu	%xmm4, %xmm5		// move the contents of xmm4 to xmm5
	psrldq	$8, %xmm4		// shift xmm4 by 64 bits to the right
	pslldq	$8, %xmm5		// shift xmm5 by 64 bits to the left
	pxor	%xmm5, %xmm3
	pxor	%xmm4, %xmm6		// Register pair <xmm6:xmm3> holds the result
					// of the carry-less multiplication of
					// xmm0 by xmm1.

	// We shift the result of the multiplication by one bit position
	// to the left to cope for the fact that the bits are reversed.
	//
	// pslld only shifts within each 32-bit lane, so the bit shifted out
	// of every lane is recovered with psrld $31 and moved up one lane
	// with pslldq $4.  xmm9 carries the top bit of xmm3 into the bottom
	// of xmm6.
	movdqu	%xmm3, %xmm7
	movdqu	%xmm6, %xmm8
	pslld	$1, %xmm3
	pslld	$1, %xmm6
	psrld	$31, %xmm7		// per-lane carry bits of xmm3
	psrld	$31, %xmm8		// per-lane carry bits of xmm6
	movdqu	%xmm7, %xmm9
	pslldq	$4, %xmm8
	pslldq	$4, %xmm7
	psrldq	$12, %xmm9		// carry out of xmm3's top lane
	por	%xmm7, %xmm3
	por	%xmm8, %xmm6
	por	%xmm9, %xmm6

	//
	// First phase of the reduction
	//
	// Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
	// independently.
	movdqu	%xmm3, %xmm7
	movdqu	%xmm3, %xmm8
	movdqu	%xmm3, %xmm9
	pslld	$31, %xmm7	// packed left shift << 31
	pslld	$30, %xmm8	// packed left shift << 30
	pslld	$25, %xmm9	// packed left shift << 25
	pxor	%xmm8, %xmm7	// xor the shifted versions
	pxor	%xmm9, %xmm7
	movdqu	%xmm7, %xmm8
	pslldq	$12, %xmm7
	psrldq	$4, %xmm8	// xmm8 is consumed in the second phase
	pxor	%xmm7, %xmm3	// first phase of the reduction complete

	//
	// Second phase of the reduction
	//
	// Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
	// shift operations.
	movdqu	%xmm3, %xmm2
	movdqu	%xmm3, %xmm4
	movdqu	%xmm3, %xmm5
	psrld	$1, %xmm2	// packed right shift >> 1
	psrld	$2, %xmm4	// packed right shift >> 2
	psrld	$7, %xmm5	// packed right shift >> 7
	pxor	%xmm4, %xmm2	// xor the shifted versions
	pxor	%xmm5, %xmm2
	pxor	%xmm8, %xmm2	// fold in the remainder from the first phase
	pxor	%xmm2, %xmm3
	pxor	%xmm3, %xmm6	// the result is in xmm6

	//
	// Byte swap 16-byte result
	//
	pshufb	%xmm10, %xmm6	// %xmm10 has the swap mask

	//
	// Store the result
	//
	movdqu	%xmm6, (%rdx)	// P3


	//
	// Cleanup and Return
	//
	SET_TS_OR_POP_XMM_REGISTERS(%r10)
	ret
	SET_SIZE(gcm_mul_pclmulqdq)

#endif	/* lint || __lint */