1 /*===--------------- sm4intrin.h - SM4 intrinsics -----------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 10 #ifndef __IMMINTRIN_H 11 #error "Never use <sm4intrin.h> directly; include <immintrin.h> instead." 12 #endif // __IMMINTRIN_H 13 14 #ifndef __SM4INTRIN_H 15 #define __SM4INTRIN_H 16 17 /// This intrinsic performs four rounds of SM4 key expansion. The intrinsic 18 /// operates on independent 128-bit lanes. The calculated results are 19 /// stored in \a dst. 20 /// \headerfile <immintrin.h> 21 /// 22 /// \code 23 /// __m128i _mm_sm4key4_epi32(__m128i __A, __m128i __B) 24 /// \endcode 25 /// 26 /// This intrinsic corresponds to the \c VSM4KEY4 instruction. 27 /// 28 /// \param __A 29 /// A 128-bit vector of [4 x int]. 30 /// \param __B 31 /// A 128-bit vector of [4 x int]. 32 /// \returns 33 /// A 128-bit vector of [4 x int]. 34 /// 35 /// \code{.operation} 36 /// DEFINE ROL32(dword, n) { 37 /// count := n % 32 38 /// dest := (dword << count) | (dword >> (32-count)) 39 /// RETURN dest 40 /// } 41 /// DEFINE SBOX_BYTE(dword, i) { 42 /// RETURN sbox[dword.byte[i]] 43 /// } 44 /// DEFINE lower_t(dword) { 45 /// tmp.byte[0] := SBOX_BYTE(dword, 0) 46 /// tmp.byte[1] := SBOX_BYTE(dword, 1) 47 /// tmp.byte[2] := SBOX_BYTE(dword, 2) 48 /// tmp.byte[3] := SBOX_BYTE(dword, 3) 49 /// RETURN tmp 50 /// } 51 /// DEFINE L_KEY(dword) { 52 /// RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23) 53 /// } 54 /// DEFINE T_KEY(dword) { 55 /// RETURN L_KEY(lower_t(dword)) 56 /// } 57 /// DEFINE F_KEY(X0, X1, X2, X3, round_key) { 58 /// RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key) 59 /// } 60 /// FOR i:= 0 to 0 61 /// P[0] := __B.xmm[i].dword[0] 62 /// P[1] := __B.xmm[i].dword[1] 63 /// P[2] := __B.xmm[i].dword[2] 64 /// P[3] := __B.xmm[i].dword[3] 65 /// C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0]) 66 /// C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1]) 67 /// C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2]) 68 /// C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3]) 69 /// DEST.xmm[i].dword[0] := C[0] 70 /// DEST.xmm[i].dword[1] := C[1] 71 /// DEST.xmm[i].dword[2] := C[2] 72 /// DEST.xmm[i].dword[3] := C[3] 73 /// ENDFOR 74 /// DEST[MAX:128] := 0 75 /// \endcode 76 #define _mm_sm4key4_epi32(A, B) \ 77 (__m128i) __builtin_ia32_vsm4key4128((__v4su)A, (__v4su)B) 78 79 /// This intrinsic performs four rounds of SM4 key expansion. The intrinsic 80 /// operates on independent 128-bit lanes. The calculated results are 81 /// stored in \a dst. 82 /// \headerfile <immintrin.h> 83 /// 84 /// \code 85 /// __m256i _mm256_sm4key4_epi32(__m256i __A, __m256i __B) 86 /// \endcode 87 /// 88 /// This intrinsic corresponds to the \c VSM4KEY4 instruction. 89 /// 90 /// \param __A 91 /// A 256-bit vector of [8 x int]. 92 /// \param __B 93 /// A 256-bit vector of [8 x int]. 94 /// \returns 95 /// A 256-bit vector of [8 x int]. 96 /// 97 /// \code{.operation} 98 /// DEFINE ROL32(dword, n) { 99 /// count := n % 32 100 /// dest := (dword << count) | (dword >> (32-count)) 101 /// RETURN dest 102 /// } 103 /// DEFINE SBOX_BYTE(dword, i) { 104 /// RETURN sbox[dword.byte[i]] 105 /// } 106 /// DEFINE lower_t(dword) { 107 /// tmp.byte[0] := SBOX_BYTE(dword, 0) 108 /// tmp.byte[1] := SBOX_BYTE(dword, 1) 109 /// tmp.byte[2] := SBOX_BYTE(dword, 2) 110 /// tmp.byte[3] := SBOX_BYTE(dword, 3) 111 /// RETURN tmp 112 /// } 113 /// DEFINE L_KEY(dword) { 114 /// RETURN dword ^ ROL32(dword, 13) ^ ROL32(dword, 23) 115 /// } 116 /// DEFINE T_KEY(dword) { 117 /// RETURN L_KEY(lower_t(dword)) 118 /// } 119 /// DEFINE F_KEY(X0, X1, X2, X3, round_key) { 120 /// RETURN X0 ^ T_KEY(X1 ^ X2 ^ X3 ^ round_key) 121 /// } 122 /// FOR i:= 0 to 1 123 /// P[0] := __B.xmm[i].dword[0] 124 /// P[1] := __B.xmm[i].dword[1] 125 /// P[2] := __B.xmm[i].dword[2] 126 /// P[3] := __B.xmm[i].dword[3] 127 /// C[0] := F_KEY(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0]) 128 /// C[1] := F_KEY(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1]) 129 /// C[2] := F_KEY(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2]) 130 /// C[3] := F_KEY(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3]) 131 /// DEST.xmm[i].dword[0] := C[0] 132 /// DEST.xmm[i].dword[1] := C[1] 133 /// DEST.xmm[i].dword[2] := C[2] 134 /// DEST.xmm[i].dword[3] := C[3] 135 /// ENDFOR 136 /// DEST[MAX:256] := 0 137 /// \endcode 138 #define _mm256_sm4key4_epi32(A, B) \ 139 (__m256i) __builtin_ia32_vsm4key4256((__v8su)A, (__v8su)B) 140 141 /// This intrinisc performs four rounds of SM4 encryption. The intrinisc 142 /// operates on independent 128-bit lanes. The calculated results are 143 /// stored in \a dst. 144 /// \headerfile <immintrin.h> 145 /// 146 /// \code 147 /// __m128i _mm_sm4rnds4_epi32(__m128i __A, __m128i __B) 148 /// \endcode 149 /// 150 /// This intrinsic corresponds to the \c VSM4RNDS4 instruction. 151 /// 152 /// \param __A 153 /// A 128-bit vector of [4 x int]. 154 /// \param __B 155 /// A 128-bit vector of [4 x int]. 156 /// \returns 157 /// A 128-bit vector of [4 x int]. 158 /// 159 /// \code{.operation} 160 /// DEFINE ROL32(dword, n) { 161 /// count := n % 32 162 /// dest := (dword << count) | (dword >> (32-count)) 163 /// RETURN dest 164 /// } 165 /// DEFINE lower_t(dword) { 166 /// tmp.byte[0] := SBOX_BYTE(dword, 0) 167 /// tmp.byte[1] := SBOX_BYTE(dword, 1) 168 /// tmp.byte[2] := SBOX_BYTE(dword, 2) 169 /// tmp.byte[3] := SBOX_BYTE(dword, 3) 170 /// RETURN tmp 171 /// } 172 /// DEFINE L_RND(dword) { 173 /// tmp := dword 174 /// tmp := tmp ^ ROL32(dword, 2) 175 /// tmp := tmp ^ ROL32(dword, 10) 176 /// tmp := tmp ^ ROL32(dword, 18) 177 /// tmp := tmp ^ ROL32(dword, 24) 178 /// RETURN tmp 179 /// } 180 /// DEFINE T_RND(dword) { 181 /// RETURN L_RND(lower_t(dword)) 182 /// } 183 /// DEFINE F_RND(X0, X1, X2, X3, round_key) { 184 /// RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key) 185 /// } 186 /// FOR i:= 0 to 0 187 /// P[0] := __B.xmm[i].dword[0] 188 /// P[1] := __B.xmm[i].dword[1] 189 /// P[2] := __B.xmm[i].dword[2] 190 /// P[3] := __B.xmm[i].dword[3] 191 /// C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0]) 192 /// C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1]) 193 /// C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2]) 194 /// C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3]) 195 /// DEST.xmm[i].dword[0] := C[0] 196 /// DEST.xmm[i].dword[1] := C[1] 197 /// DEST.xmm[i].dword[2] := C[2] 198 /// DEST.xmm[i].dword[3] := C[3] 199 /// ENDFOR 200 /// DEST[MAX:128] := 0 201 /// \endcode 202 #define _mm_sm4rnds4_epi32(A, B) \ 203 (__m128i) __builtin_ia32_vsm4rnds4128((__v4su)A, (__v4su)B) 204 205 /// This intrinisc performs four rounds of SM4 encryption. The intrinisc 206 /// operates on independent 128-bit lanes. The calculated results are 207 /// stored in \a dst. 208 /// \headerfile <immintrin.h> 209 /// 210 /// \code 211 /// __m256i _mm256_sm4rnds4_epi32(__m256i __A, __m256i __B) 212 /// \endcode 213 /// 214 /// This intrinsic corresponds to the \c VSM4RNDS4 instruction. 215 /// 216 /// \param __A 217 /// A 256-bit vector of [8 x int]. 218 /// \param __B 219 /// A 256-bit vector of [8 x int]. 220 /// \returns 221 /// A 256-bit vector of [8 x int]. 222 /// 223 /// \code{.operation} 224 /// DEFINE ROL32(dword, n) { 225 /// count := n % 32 226 /// dest := (dword << count) | (dword >> (32-count)) 227 /// RETURN dest 228 /// } 229 /// DEFINE lower_t(dword) { 230 /// tmp.byte[0] := SBOX_BYTE(dword, 0) 231 /// tmp.byte[1] := SBOX_BYTE(dword, 1) 232 /// tmp.byte[2] := SBOX_BYTE(dword, 2) 233 /// tmp.byte[3] := SBOX_BYTE(dword, 3) 234 /// RETURN tmp 235 /// } 236 /// DEFINE L_RND(dword) { 237 /// tmp := dword 238 /// tmp := tmp ^ ROL32(dword, 2) 239 /// tmp := tmp ^ ROL32(dword, 10) 240 /// tmp := tmp ^ ROL32(dword, 18) 241 /// tmp := tmp ^ ROL32(dword, 24) 242 /// RETURN tmp 243 /// } 244 /// DEFINE T_RND(dword) { 245 /// RETURN L_RND(lower_t(dword)) 246 /// } 247 /// DEFINE F_RND(X0, X1, X2, X3, round_key) { 248 /// RETURN X0 ^ T_RND(X1 ^ X2 ^ X3 ^ round_key) 249 /// } 250 /// FOR i:= 0 to 0 251 /// P[0] := __B.xmm[i].dword[0] 252 /// P[1] := __B.xmm[i].dword[1] 253 /// P[2] := __B.xmm[i].dword[2] 254 /// P[3] := __B.xmm[i].dword[3] 255 /// C[0] := F_RND(P[0], P[1], P[2], P[3], __A.xmm[i].dword[0]) 256 /// C[1] := F_RND(P[1], P[2], P[3], C[0], __A.xmm[i].dword[1]) 257 /// C[2] := F_RND(P[2], P[3], C[0], C[1], __A.xmm[i].dword[2]) 258 /// C[3] := F_RND(P[3], C[0], C[1], C[2], __A.xmm[i].dword[3]) 259 /// DEST.xmm[i].dword[0] := C[0] 260 /// DEST.xmm[i].dword[1] := C[1] 261 /// DEST.xmm[i].dword[2] := C[2] 262 /// DEST.xmm[i].dword[3] := C[3] 263 /// ENDFOR 264 /// DEST[MAX:256] := 0 265 /// \endcode 266 #define _mm256_sm4rnds4_epi32(A, B) \ 267 (__m256i) __builtin_ia32_vsm4rnds4256((__v8su)A, (__v8su)B) 268 269 #endif // __SM4INTRIN_H 270