1 /*
2 * Copyright 2010-2021 The OpenSSL Project Authors. All Rights Reserved.
3 *
4 * Licensed under the Apache License 2.0 (the "License"). You may not use
5 * this file except in compliance with the License. You can obtain a copy
6 * in the file LICENSE in the source distribution or at
7 * https://www.openssl.org/source/license.html
8 */
9
10 #include <string.h>
11 #include <openssl/crypto.h>
12 #include "internal/cryptlib.h"
13 #include "internal/endian.h"
14 #include "crypto/modes.h"
15
16 #if defined(__GNUC__) && !defined(STRICT_ALIGNMENT)
17 typedef size_t size_t_aX __attribute((__aligned__(1)));
18 #else
19 typedef size_t size_t_aX;
20 #endif
21
22 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
23 /* redefine, because alignment is ensured */
24 # undef GETU32
25 # define GETU32(p) BSWAP4(*(const u32 *)(p))
26 # undef PUTU32
27 # define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
28 #endif
29
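/*
 * PACK() places a 16-bit constant in the topmost 16 bits of a size_t so
 * that the reduction tables below can be XOR-ed straight into the high
 * end of the hash accumulator. REDUCE1BIT(V) multiplies the 128-bit
 * element V by x in GF(2^128): a one-bit right shift with a conditional
 * XOR of the GHASH reduction constant R = 0xE1 || 0^120, which encodes
 * x^128 + x^7 + x^2 + x + 1 in GCM's bit-reflected representation.
 */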
30 #define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
31 #define REDUCE1BIT(V) do { \
32 if (sizeof(size_t)==8) { \
33 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
34 V.lo = (V.hi<<63)|(V.lo>>1); \
35 V.hi = (V.hi>>1 )^T; \
36 } \
37 else { \
38 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
39 V.lo = (V.hi<<63)|(V.lo>>1); \
40 V.hi = (V.hi>>1 )^((u64)T<<32); \
41 } \
42 } while(0)
43
44 /*-
45 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
46 * never be set to 8. 8 is effectively reserved for testing purposes.
47 * TABLE_BITS>1 are lookup-table-driven implementations referred to as
48 * "Shoup's" in GCM specification. In other words OpenSSL does not cover
49  * the whole spectrum of possible table-driven implementations. Why? In
50  * the non-"Shoup's" case the memory access pattern is segmented in such
51  * a manner that cache-timing information can reveal a fair portion of
52  * the intermediate hash value. Given that the ciphertext is always
53  * available to an attacker, they can attempt to deduce the secret
54  * parameter H and, if successful, tamper with messages [which is
55  * nothing but trivial in CTR mode]. In "Shoup's" case it's not as
56  * trivial, but there is no reason to believe that it's resistant to
57  * cache-timing attacks. The catch with the "8-bit" implementation is
58  * that it consumes 16 (sixteen) times more memory: 4KB per individual
59  * key + 1KB shared. On the plus side it should be about twice as fast as the
60 * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
61 * was observed to run ~75% faster, closer to 100% for commercial
62 * compilers... Yet "4-bit" procedure is preferred, because it's
63 * believed to provide better security-performance balance and adequate
64 * all-round performance. "All-round" refers to things like:
65 *
66 * - shorter setup time effectively improves overall timing for
67 * handling short messages;
68 * - larger table allocation can become unbearable because of VM
69  *   subsystem penalties (for example, on Windows a large enough free()
70  *   results in VM working-set trimming, meaning that a subsequent
71  *   malloc() would immediately incur working-set expansion);
72  * - a larger table has a larger cache footprint, which can affect
73  *   performance of other code paths (not necessarily even from the same
74  *   thread in a Hyper-Threading world);
75 *
76 * Value of 1 is not appropriate for performance reasons.
77 */
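/*
 * For scale: an "8-bit" Htable is 256 entries * sizeof(u128) = 256 * 16 =
 * 4KB per key, versus 16 * 16 = 256 bytes for the "4-bit" table; the
 * shared rem_8bit table is 256 * sizeof(size_t), i.e. 1KB with a 32-bit
 * size_t and 2KB with a 64-bit one.
 */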
78 #if TABLE_BITS==8
79
80 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
81 {
82 int i, j;
83 u128 V;
84
85 Htable[0].hi = 0;
86 Htable[0].lo = 0;
87 V.hi = H[0];
88 V.lo = H[1];
89
90 for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
91 REDUCE1BIT(V);
92 Htable[i] = V;
93 }
94
95 for (i = 2; i < 256; i <<= 1) {
96 u128 *Hi = Htable + i, H0 = *Hi;
97 for (j = 1; j < i; ++j) {
98 Hi[j].hi = H0.hi ^ Htable[j].hi;
99 Hi[j].lo = H0.lo ^ Htable[j].lo;
100 }
101 }
102 }
103
104 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
105 {
106 u128 Z = { 0, 0 };
107 const u8 *xi = (const u8 *)Xi + 15;
108 size_t rem, n = *xi;
109 DECLARE_IS_ENDIAN;
110 static const size_t rem_8bit[256] = {
111 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
112 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
113 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
114 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
115 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
116 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
117 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
118 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
119 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
120 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
121 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
122 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
123 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
124 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
125 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
126 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
127 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
128 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
129 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
130 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
131 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
132 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
133 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
134 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
135 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
136 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
137 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
138 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
139 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
140 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
141 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
142 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
143 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
144 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
145 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
146 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
147 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
148 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
149 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
150 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
151 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
152 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
153 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
154 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
155 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
156 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
157 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
158 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
159 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
160 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
161 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
162 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
163 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
164 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
165 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
166 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
167 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
168 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
169 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
170 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
171 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
172 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
173 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
174 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE)
175 };
176
177 while (1) {
178 Z.hi ^= Htable[n].hi;
179 Z.lo ^= Htable[n].lo;
180
181 if ((u8 *)Xi == xi)
182 break;
183
184 n = *(--xi);
185
186 rem = (size_t)Z.lo & 0xff;
187 Z.lo = (Z.hi << 56) | (Z.lo >> 8);
188 Z.hi = (Z.hi >> 8);
189 if (sizeof(size_t) == 8)
190 Z.hi ^= rem_8bit[rem];
191 else
192 Z.hi ^= (u64)rem_8bit[rem] << 32;
193 }
194
195 if (IS_LITTLE_ENDIAN) {
196 # ifdef BSWAP8
197 Xi[0] = BSWAP8(Z.hi);
198 Xi[1] = BSWAP8(Z.lo);
199 # else
200 u8 *p = (u8 *)Xi;
201 u32 v;
202 v = (u32)(Z.hi >> 32);
203 PUTU32(p, v);
204 v = (u32)(Z.hi);
205 PUTU32(p + 4, v);
206 v = (u32)(Z.lo >> 32);
207 PUTU32(p + 8, v);
208 v = (u32)(Z.lo);
209 PUTU32(p + 12, v);
210 # endif
211 } else {
212 Xi[0] = Z.hi;
213 Xi[1] = Z.lo;
214 }
215 }
216
217 # define GCM_MUL(ctx) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
218
219 #elif TABLE_BITS==4
220
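/*
 * Populate Htable with all 16 GF(2^128) multiples of H that a 4-bit
 * nibble can select: Htable[8] = H, and each REDUCE1BIT step yields
 * Htable[4] = H*x, Htable[2] = H*x^2, Htable[1] = H*x^3. The remaining
 * entries follow by linearity, i.e. Htable[i ^ j] = Htable[i] ^ Htable[j]
 * whenever i and j have no bits in common.
 */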
221 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
222 {
223 u128 V;
224 # if defined(OPENSSL_SMALL_FOOTPRINT)
225 int i;
226 # endif
227
228 Htable[0].hi = 0;
229 Htable[0].lo = 0;
230 V.hi = H[0];
231 V.lo = H[1];
232
233 # if defined(OPENSSL_SMALL_FOOTPRINT)
234 for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
235 REDUCE1BIT(V);
236 Htable[i] = V;
237 }
238
239 for (i = 2; i < 16; i <<= 1) {
240 u128 *Hi = Htable + i;
241 int j;
242 for (V = *Hi, j = 1; j < i; ++j) {
243 Hi[j].hi = V.hi ^ Htable[j].hi;
244 Hi[j].lo = V.lo ^ Htable[j].lo;
245 }
246 }
247 # else
248 Htable[8] = V;
249 REDUCE1BIT(V);
250 Htable[4] = V;
251 REDUCE1BIT(V);
252 Htable[2] = V;
253 REDUCE1BIT(V);
254 Htable[1] = V;
255 Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
256 V = Htable[4];
257 Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
258 Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
259 Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
260 V = Htable[8];
261 Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
262 Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
263 Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
264 Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
265 Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
266 Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
267 Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
268 # endif
269 # if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
270 /*
271 * ARM assembler expects specific dword order in Htable.
272 */
273 {
274 int j;
275 DECLARE_IS_ENDIAN;
276
277 if (IS_LITTLE_ENDIAN)
278 for (j = 0; j < 16; ++j) {
279 V = Htable[j];
280 Htable[j].hi = V.lo;
281 Htable[j].lo = V.hi;
282 } else
283 for (j = 0; j < 16; ++j) {
284 V = Htable[j];
285 Htable[j].hi = V.lo << 32 | V.lo >> 32;
286 Htable[j].lo = V.hi << 32 | V.hi >> 32;
287 }
288 }
289 # endif
290 }
291
292 # ifndef GHASH_ASM
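/*
 * rem_4bit[r] is the correction folded into the top of Z after four bits
 * with pattern r have been shifted off its low end: rem_4bit[8] is 0xE100
 * (the top 16 bits of the reduction constant), the other single-bit
 * entries are its right shifts, and composite indices are XORs of those,
 * all PACK()-ed into the uppermost bits of a size_t.
 */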
293 static const size_t rem_4bit[16] = {
294 PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
295 PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
296 PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
297 PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
298 };
299
300 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
301 {
302 u128 Z;
303 int cnt = 15;
304 size_t rem, nlo, nhi;
305 DECLARE_IS_ENDIAN;
306
307 nlo = ((const u8 *)Xi)[15];
308 nhi = nlo >> 4;
309 nlo &= 0xf;
310
311 Z.hi = Htable[nlo].hi;
312 Z.lo = Htable[nlo].lo;
313
314 while (1) {
315 rem = (size_t)Z.lo & 0xf;
316 Z.lo = (Z.hi << 60) | (Z.lo >> 4);
317 Z.hi = (Z.hi >> 4);
318 if (sizeof(size_t) == 8)
319 Z.hi ^= rem_4bit[rem];
320 else
321 Z.hi ^= (u64)rem_4bit[rem] << 32;
322
323 Z.hi ^= Htable[nhi].hi;
324 Z.lo ^= Htable[nhi].lo;
325
326 if (--cnt < 0)
327 break;
328
329 nlo = ((const u8 *)Xi)[cnt];
330 nhi = nlo >> 4;
331 nlo &= 0xf;
332
333 rem = (size_t)Z.lo & 0xf;
334 Z.lo = (Z.hi << 60) | (Z.lo >> 4);
335 Z.hi = (Z.hi >> 4);
336 if (sizeof(size_t) == 8)
337 Z.hi ^= rem_4bit[rem];
338 else
339 Z.hi ^= (u64)rem_4bit[rem] << 32;
340
341 Z.hi ^= Htable[nlo].hi;
342 Z.lo ^= Htable[nlo].lo;
343 }
344
345 if (IS_LITTLE_ENDIAN) {
346 # ifdef BSWAP8
347 Xi[0] = BSWAP8(Z.hi);
348 Xi[1] = BSWAP8(Z.lo);
349 # else
350 u8 *p = (u8 *)Xi;
351 u32 v;
352 v = (u32)(Z.hi >> 32);
353 PUTU32(p, v);
354 v = (u32)(Z.hi);
355 PUTU32(p + 4, v);
356 v = (u32)(Z.lo >> 32);
357 PUTU32(p + 8, v);
358 v = (u32)(Z.lo);
359 PUTU32(p + 12, v);
360 # endif
361 } else {
362 Xi[0] = Z.hi;
363 Xi[1] = Z.lo;
364 }
365 }
366
367 # if !defined(OPENSSL_SMALL_FOOTPRINT)
368 /*
369  * Streamed gcm_ghash_4bit, see CRYPTO_gcm128_[en|de]crypt for
370 * details... Compiler-generated code doesn't seem to give any
371 * performance improvement, at least not on x86[_64]. It's here
372 * mostly as reference and a placeholder for possible future
373 * non-trivial optimization[s]...
374 */
375 static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
376 const u8 *inp, size_t len)
377 {
378 u128 Z;
379 int cnt;
380 size_t rem, nlo, nhi;
381 DECLARE_IS_ENDIAN;
382
383 # if 1
384 do {
385 cnt = 15;
386 nlo = ((const u8 *)Xi)[15];
387 nlo ^= inp[15];
388 nhi = nlo >> 4;
389 nlo &= 0xf;
390
391 Z.hi = Htable[nlo].hi;
392 Z.lo = Htable[nlo].lo;
393
394 while (1) {
395 rem = (size_t)Z.lo & 0xf;
396 Z.lo = (Z.hi << 60) | (Z.lo >> 4);
397 Z.hi = (Z.hi >> 4);
398 if (sizeof(size_t) == 8)
399 Z.hi ^= rem_4bit[rem];
400 else
401 Z.hi ^= (u64)rem_4bit[rem] << 32;
402
403 Z.hi ^= Htable[nhi].hi;
404 Z.lo ^= Htable[nhi].lo;
405
406 if (--cnt < 0)
407 break;
408
409 nlo = ((const u8 *)Xi)[cnt];
410 nlo ^= inp[cnt];
411 nhi = nlo >> 4;
412 nlo &= 0xf;
413
414 rem = (size_t)Z.lo & 0xf;
415 Z.lo = (Z.hi << 60) | (Z.lo >> 4);
416 Z.hi = (Z.hi >> 4);
417 if (sizeof(size_t) == 8)
418 Z.hi ^= rem_4bit[rem];
419 else
420 Z.hi ^= (u64)rem_4bit[rem] << 32;
421
422 Z.hi ^= Htable[nlo].hi;
423 Z.lo ^= Htable[nlo].lo;
424 }
425 # else
426 /*
427 * Extra 256+16 bytes per-key plus 512 bytes shared tables
428 * [should] give ~50% improvement... One could have PACK()-ed
429 * the rem_8bit even here, but the priority is to minimize
430 * cache footprint...
431 */
432 u128 Hshr4[16]; /* Htable shifted right by 4 bits */
433 u8 Hshl4[16]; /* Htable shifted left by 4 bits */
434 static const unsigned short rem_8bit[256] = {
435 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
436 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
437 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
438 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
439 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
440 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
441 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
442 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
443 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
444 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
445 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
446 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
447 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
448 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
449 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
450 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
451 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
452 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
453 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
454 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
455 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
456 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
457 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
458 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
459 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
460 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
461 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
462 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
463 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
464 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
465 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
466 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE
467 };
468 /*
469  * This pre-processing phase slows down the procedure by approximately
470  * the same amount of time as it makes each loop spin faster. In other
471  * words, single-block performance is approximately the same as for the
472  * straightforward "4-bit" implementation, and from there it only gets faster...
473 */
474 for (cnt = 0; cnt < 16; ++cnt) {
475 Z.hi = Htable[cnt].hi;
476 Z.lo = Htable[cnt].lo;
477 Hshr4[cnt].lo = (Z.hi << 60) | (Z.lo >> 4);
478 Hshr4[cnt].hi = (Z.hi >> 4);
479 Hshl4[cnt] = (u8)(Z.lo << 4);
480 }
481
482 do {
483 for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
484 nlo = ((const u8 *)Xi)[cnt];
485 nlo ^= inp[cnt];
486 nhi = nlo >> 4;
487 nlo &= 0xf;
488
489 Z.hi ^= Htable[nlo].hi;
490 Z.lo ^= Htable[nlo].lo;
491
492 rem = (size_t)Z.lo & 0xff;
493
494 Z.lo = (Z.hi << 56) | (Z.lo >> 8);
495 Z.hi = (Z.hi >> 8);
496
497 Z.hi ^= Hshr4[nhi].hi;
498 Z.lo ^= Hshr4[nhi].lo;
499 Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
500 }
501
502 nlo = ((const u8 *)Xi)[0];
503 nlo ^= inp[0];
504 nhi = nlo >> 4;
505 nlo &= 0xf;
506
507 Z.hi ^= Htable[nlo].hi;
508 Z.lo ^= Htable[nlo].lo;
509
510 rem = (size_t)Z.lo & 0xf;
511
512 Z.lo = (Z.hi << 60) | (Z.lo >> 4);
513 Z.hi = (Z.hi >> 4);
514
515 Z.hi ^= Htable[nhi].hi;
516 Z.lo ^= Htable[nhi].lo;
517 Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
518 # endif
519
520 if (IS_LITTLE_ENDIAN) {
521 # ifdef BSWAP8
522 Xi[0] = BSWAP8(Z.hi);
523 Xi[1] = BSWAP8(Z.lo);
524 # else
525 u8 *p = (u8 *)Xi;
526 u32 v;
527 v = (u32)(Z.hi >> 32);
528 PUTU32(p, v);
529 v = (u32)(Z.hi);
530 PUTU32(p + 4, v);
531 v = (u32)(Z.lo >> 32);
532 PUTU32(p + 8, v);
533 v = (u32)(Z.lo);
534 PUTU32(p + 12, v);
535 # endif
536 } else {
537 Xi[0] = Z.hi;
538 Xi[1] = Z.lo;
539 }
540 } while (inp += 16, len -= 16);
541 }
542 # endif
543 # else
544 void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
545 void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
546 size_t len);
547 # endif
548
549 # define GCM_MUL(ctx) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
550 # if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
551 # define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
552 /*
553  * GHASH_CHUNK is a "stride parameter" meant to mitigate the cache-thrashing
554  * effect. In other words, the idea is to hash data while it's still in the
555  * L1 cache after the encryption pass...
556 */
557 # define GHASH_CHUNK (3*1024)
558 # endif
559
560 #else /* TABLE_BITS */
561
562 static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
563 {
564 u128 V, Z = { 0, 0 };
565 long X;
566 int i, j;
567 const long *xi = (const long *)Xi;
568 DECLARE_IS_ENDIAN;
569
570 V.hi = H[0]; /* H is in host byte order, no byte swapping */
571 V.lo = H[1];
572
573 for (j = 0; j < 16 / sizeof(long); ++j) {
574 if (IS_LITTLE_ENDIAN) {
575 if (sizeof(long) == 8) {
576 # ifdef BSWAP8
577 X = (long)(BSWAP8(xi[j]));
578 # else
579 const u8 *p = (const u8 *)(xi + j);
580 X = (long)((u64)GETU32(p) << 32 | GETU32(p + 4));
581 # endif
582 } else {
583 const u8 *p = (const u8 *)(xi + j);
584 X = (long)GETU32(p);
585 }
586 } else
587 X = xi[j];
588
589 for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) {
590 u64 M = (u64)(X >> (8 * sizeof(long) - 1));
591 Z.hi ^= V.hi & M;
592 Z.lo ^= V.lo & M;
593
594 REDUCE1BIT(V);
595 }
596 }
597
598 if (IS_LITTLE_ENDIAN) {
599 # ifdef BSWAP8
600 Xi[0] = BSWAP8(Z.hi);
601 Xi[1] = BSWAP8(Z.lo);
602 # else
603 u8 *p = (u8 *)Xi;
604 u32 v;
605 v = (u32)(Z.hi >> 32);
606 PUTU32(p, v);
607 v = (u32)(Z.hi);
608 PUTU32(p + 4, v);
609 v = (u32)(Z.lo >> 32);
610 PUTU32(p + 8, v);
611 v = (u32)(Z.lo);
612 PUTU32(p + 12, v);
613 # endif
614 } else {
615 Xi[0] = Z.hi;
616 Xi[1] = Z.lo;
617 }
618 }
619
620 # define GCM_MUL(ctx) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
621
622 #endif
623
624 #if TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
625 # if !defined(I386_ONLY) && \
626 (defined(__i386) || defined(__i386__) || \
627 defined(__x86_64) || defined(__x86_64__) || \
628 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
629 # define GHASH_ASM_X86_OR_64
630 # define GCM_FUNCREF_4BIT
631
632 void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
633 void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
634 void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
635 size_t len);
636
637 # if defined(__i386) || defined(__i386__) || defined(_M_IX86)
638 # define gcm_init_avx gcm_init_clmul
639 # define gcm_gmult_avx gcm_gmult_clmul
640 # define gcm_ghash_avx gcm_ghash_clmul
641 # else
642 void gcm_init_avx(u128 Htable[16], const u64 Xi[2]);
643 void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]);
644 void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
645 size_t len);
646 # endif
647
648 # if defined(__i386) || defined(__i386__) || defined(_M_IX86)
649 # define GHASH_ASM_X86
650 void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
651 void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
652 size_t len);
653
654 void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
655 void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
656 size_t len);
657 # endif
658 # elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
659 # include "arm_arch.h"
660 # if __ARM_MAX_ARCH__>=7
661 # define GHASH_ASM_ARM
662 # define GCM_FUNCREF_4BIT
663 # define PMULL_CAPABLE (OPENSSL_armcap_P & ARMV8_PMULL)
664 # if defined(__arm__) || defined(__arm)
665 # define NEON_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
666 # endif
667 void gcm_init_neon(u128 Htable[16], const u64 Xi[2]);
668 void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
669 void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
670 size_t len);
671 void gcm_init_v8(u128 Htable[16], const u64 Xi[2]);
672 void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
673 void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
674 size_t len);
675 # endif
676 # elif defined(__sparc__) || defined(__sparc)
677 # include "crypto/sparc_arch.h"
678 # define GHASH_ASM_SPARC
679 # define GCM_FUNCREF_4BIT
680 void gcm_init_vis3(u128 Htable[16], const u64 Xi[2]);
681 void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]);
682 void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp,
683 size_t len);
684 # elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
685 # include "crypto/ppc_arch.h"
686 # define GHASH_ASM_PPC
687 # define GCM_FUNCREF_4BIT
688 void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
689 void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
690 void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
691 size_t len);
692 # endif
693 #endif
694
695 #ifdef GCM_FUNCREF_4BIT
696 # undef GCM_MUL
697 # define GCM_MUL(ctx) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
698 # ifdef GHASH
699 # undef GHASH
700 # define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
701 # endif
702 #endif
703
704 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
705 {
706 DECLARE_IS_ENDIAN;
707
708 memset(ctx, 0, sizeof(*ctx));
709 ctx->block = block;
710 ctx->key = key;
711
712 (*block) (ctx->H.c, ctx->H.c, key);
713
714 if (IS_LITTLE_ENDIAN) {
715 /* H is stored in host byte order */
716 #ifdef BSWAP8
717 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
718 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
719 #else
720 u8 *p = ctx->H.c;
721 u64 hi, lo;
722 hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
723 lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
724 ctx->H.u[0] = hi;
725 ctx->H.u[1] = lo;
726 #endif
727 }
728 #if TABLE_BITS==8
729 gcm_init_8bit(ctx->Htable, ctx->H.u);
730 #elif TABLE_BITS==4
731 # if defined(GHASH)
732 # define CTX__GHASH(f) (ctx->ghash = (f))
733 # else
734 # define CTX__GHASH(f) (ctx->ghash = NULL)
735 # endif
736 # if defined(GHASH_ASM_X86_OR_64)
737 # if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
738 if (OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */
739 if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
740 gcm_init_avx(ctx->Htable, ctx->H.u);
741 ctx->gmult = gcm_gmult_avx;
742 CTX__GHASH(gcm_ghash_avx);
743 } else {
744 gcm_init_clmul(ctx->Htable, ctx->H.u);
745 ctx->gmult = gcm_gmult_clmul;
746 CTX__GHASH(gcm_ghash_clmul);
747 }
748 return;
749 }
750 # endif
751 gcm_init_4bit(ctx->Htable, ctx->H.u);
752 # if defined(GHASH_ASM_X86) /* x86 only */
753 # if defined(OPENSSL_IA32_SSE2)
754 if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
755 # else
756 if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */
757 # endif
758 ctx->gmult = gcm_gmult_4bit_mmx;
759 CTX__GHASH(gcm_ghash_4bit_mmx);
760 } else {
761 ctx->gmult = gcm_gmult_4bit_x86;
762 CTX__GHASH(gcm_ghash_4bit_x86);
763 }
764 # else
765 ctx->gmult = gcm_gmult_4bit;
766 CTX__GHASH(gcm_ghash_4bit);
767 # endif
768 # elif defined(GHASH_ASM_ARM)
769 # ifdef PMULL_CAPABLE
770 if (PMULL_CAPABLE) {
771 gcm_init_v8(ctx->Htable, ctx->H.u);
772 ctx->gmult = gcm_gmult_v8;
773 CTX__GHASH(gcm_ghash_v8);
774 } else
775 # endif
776 # ifdef NEON_CAPABLE
777 if (NEON_CAPABLE) {
778 gcm_init_neon(ctx->Htable, ctx->H.u);
779 ctx->gmult = gcm_gmult_neon;
780 CTX__GHASH(gcm_ghash_neon);
781 } else
782 # endif
783 {
784 gcm_init_4bit(ctx->Htable, ctx->H.u);
785 ctx->gmult = gcm_gmult_4bit;
786 CTX__GHASH(gcm_ghash_4bit);
787 }
788 # elif defined(GHASH_ASM_SPARC)
789 if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
790 gcm_init_vis3(ctx->Htable, ctx->H.u);
791 ctx->gmult = gcm_gmult_vis3;
792 CTX__GHASH(gcm_ghash_vis3);
793 } else {
794 gcm_init_4bit(ctx->Htable, ctx->H.u);
795 ctx->gmult = gcm_gmult_4bit;
796 CTX__GHASH(gcm_ghash_4bit);
797 }
798 # elif defined(GHASH_ASM_PPC)
799 if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
800 gcm_init_p8(ctx->Htable, ctx->H.u);
801 ctx->gmult = gcm_gmult_p8;
802 CTX__GHASH(gcm_ghash_p8);
803 } else {
804 gcm_init_4bit(ctx->Htable, ctx->H.u);
805 ctx->gmult = gcm_gmult_4bit;
806 CTX__GHASH(gcm_ghash_4bit);
807 }
808 # else
809 gcm_init_4bit(ctx->Htable, ctx->H.u);
810 # endif
811 # undef CTX__GHASH
812 #endif
813 }
814
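/*
 * Per the GCM specification, a 96-bit IV is used directly as
 * Y0 = IV || 0^31 || 1, while an IV of any other length is first
 * compressed with GHASH: Y0 = GHASH(IV zero-padded to a 128-bit boundary
 * || 0^64 || [bit-length of IV]_64).
 */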
815 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
816 size_t len)
817 {
818 DECLARE_IS_ENDIAN;
819 unsigned int ctr;
820 #ifdef GCM_FUNCREF_4BIT
821 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
822 #endif
823
824 ctx->len.u[0] = 0; /* AAD length */
825 ctx->len.u[1] = 0; /* message length */
826 ctx->ares = 0;
827 ctx->mres = 0;
828
829 if (len == 12) {
830 memcpy(ctx->Yi.c, iv, 12);
831 ctx->Yi.c[12] = 0;
832 ctx->Yi.c[13] = 0;
833 ctx->Yi.c[14] = 0;
834 ctx->Yi.c[15] = 1;
835 ctr = 1;
836 } else {
837 size_t i;
838 u64 len0 = len;
839
840 /* Borrow ctx->Xi to calculate initial Yi */
841 ctx->Xi.u[0] = 0;
842 ctx->Xi.u[1] = 0;
843
844 while (len >= 16) {
845 for (i = 0; i < 16; ++i)
846 ctx->Xi.c[i] ^= iv[i];
847 GCM_MUL(ctx);
848 iv += 16;
849 len -= 16;
850 }
851 if (len) {
852 for (i = 0; i < len; ++i)
853 ctx->Xi.c[i] ^= iv[i];
854 GCM_MUL(ctx);
855 }
856 len0 <<= 3;
857 if (IS_LITTLE_ENDIAN) {
858 #ifdef BSWAP8
859 ctx->Xi.u[1] ^= BSWAP8(len0);
860 #else
861 ctx->Xi.c[8] ^= (u8)(len0 >> 56);
862 ctx->Xi.c[9] ^= (u8)(len0 >> 48);
863 ctx->Xi.c[10] ^= (u8)(len0 >> 40);
864 ctx->Xi.c[11] ^= (u8)(len0 >> 32);
865 ctx->Xi.c[12] ^= (u8)(len0 >> 24);
866 ctx->Xi.c[13] ^= (u8)(len0 >> 16);
867 ctx->Xi.c[14] ^= (u8)(len0 >> 8);
868 ctx->Xi.c[15] ^= (u8)(len0);
869 #endif
870 } else {
871 ctx->Xi.u[1] ^= len0;
872 }
873
874 GCM_MUL(ctx);
875
876 if (IS_LITTLE_ENDIAN)
877 #ifdef BSWAP4
878 ctr = BSWAP4(ctx->Xi.d[3]);
879 #else
880 ctr = GETU32(ctx->Xi.c + 12);
881 #endif
882 else
883 ctr = ctx->Xi.d[3];
884
885 /* Copy borrowed Xi to Yi */
886 ctx->Yi.u[0] = ctx->Xi.u[0];
887 ctx->Yi.u[1] = ctx->Xi.u[1];
888 }
889
890 ctx->Xi.u[0] = 0;
891 ctx->Xi.u[1] = 0;
892
893 (*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key);
894 ++ctr;
895 if (IS_LITTLE_ENDIAN)
896 #ifdef BSWAP4
897 ctx->Yi.d[3] = BSWAP4(ctr);
898 #else
899 PUTU32(ctx->Yi.c + 12, ctr);
900 #endif
901 else
902 ctx->Yi.d[3] = ctr;
903 }
904
905 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
906 size_t len)
907 {
908 size_t i;
909 unsigned int n;
910 u64 alen = ctx->len.u[0];
911 #ifdef GCM_FUNCREF_4BIT
912 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
913 # ifdef GHASH
914 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
915 const u8 *inp, size_t len) = ctx->ghash;
916 # endif
917 #endif
918
919 if (ctx->len.u[1])
920 return -2;
921
922 alen += len;
923 if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
924 return -1;
925 ctx->len.u[0] = alen;
926
927 n = ctx->ares;
928 if (n) {
929 while (n && len) {
930 ctx->Xi.c[n] ^= *(aad++);
931 --len;
932 n = (n + 1) % 16;
933 }
934 if (n == 0)
935 GCM_MUL(ctx);
936 else {
937 ctx->ares = n;
938 return 0;
939 }
940 }
941 #ifdef GHASH
942 if ((i = (len & (size_t)-16))) {
943 GHASH(ctx, aad, i);
944 aad += i;
945 len -= i;
946 }
947 #else
948 while (len >= 16) {
949 for (i = 0; i < 16; ++i)
950 ctx->Xi.c[i] ^= aad[i];
951 GCM_MUL(ctx);
952 aad += 16;
953 len -= 16;
954 }
955 #endif
956 if (len) {
957 n = (unsigned int)len;
958 for (i = 0; i < len; ++i)
959 ctx->Xi.c[i] ^= aad[i];
960 }
961
962 ctx->ares = n;
963 return 0;
964 }
965
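/*
 * CTR-mode encryption interleaved with GHASH: each counter block Yi is
 * encrypted into EKi and XOR-ed with the input, and the resulting
 * ciphertext is folded into the hash state (directly via GCM_MUL, or in
 * GHASH_CHUNK-sized strides when a batched GHASH routine is available).
 * Partial-block state is carried across calls in ctx->mres.
 */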
966 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
967 const unsigned char *in, unsigned char *out,
968 size_t len)
969 {
970 DECLARE_IS_ENDIAN;
971 unsigned int n, ctr, mres;
972 size_t i;
973 u64 mlen = ctx->len.u[1];
974 block128_f block = ctx->block;
975 void *key = ctx->key;
976 #ifdef GCM_FUNCREF_4BIT
977 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
978 # if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
979 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
980 const u8 *inp, size_t len) = ctx->ghash;
981 # endif
982 #endif
983
984 mlen += len;
985 if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
986 return -1;
987 ctx->len.u[1] = mlen;
988
989 mres = ctx->mres;
990
991 if (ctx->ares) {
992 /* First call to encrypt finalizes GHASH(AAD) */
993 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
994 if (len == 0) {
995 GCM_MUL(ctx);
996 ctx->ares = 0;
997 return 0;
998 }
999 memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
1000 ctx->Xi.u[0] = 0;
1001 ctx->Xi.u[1] = 0;
1002 mres = sizeof(ctx->Xi);
1003 #else
1004 GCM_MUL(ctx);
1005 #endif
1006 ctx->ares = 0;
1007 }
1008
1009 if (IS_LITTLE_ENDIAN)
1010 #ifdef BSWAP4
1011 ctr = BSWAP4(ctx->Yi.d[3]);
1012 #else
1013 ctr = GETU32(ctx->Yi.c + 12);
1014 #endif
1015 else
1016 ctr = ctx->Yi.d[3];
1017
1018 n = mres % 16;
1019 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1020 if (16 % sizeof(size_t) == 0) { /* always true actually */
1021 do {
1022 if (n) {
1023 # if defined(GHASH)
1024 while (n && len) {
1025 ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
1026 --len;
1027 n = (n + 1) % 16;
1028 }
1029 if (n == 0) {
1030 GHASH(ctx, ctx->Xn, mres);
1031 mres = 0;
1032 } else {
1033 ctx->mres = mres;
1034 return 0;
1035 }
1036 # else
1037 while (n && len) {
1038 ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
1039 --len;
1040 n = (n + 1) % 16;
1041 }
1042 if (n == 0) {
1043 GCM_MUL(ctx);
1044 mres = 0;
1045 } else {
1046 ctx->mres = n;
1047 return 0;
1048 }
1049 # endif
1050 }
1051 # if defined(STRICT_ALIGNMENT)
1052 if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
1053 break;
1054 # endif
1055 # if defined(GHASH)
1056 if (len >= 16 && mres) {
1057 GHASH(ctx, ctx->Xn, mres);
1058 mres = 0;
1059 }
1060 # if defined(GHASH_CHUNK)
1061 while (len >= GHASH_CHUNK) {
1062 size_t j = GHASH_CHUNK;
1063
1064 while (j) {
1065 size_t_aX *out_t = (size_t_aX *)out;
1066 const size_t_aX *in_t = (const size_t_aX *)in;
1067
1068 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1069 ++ctr;
1070 if (IS_LITTLE_ENDIAN)
1071 # ifdef BSWAP4
1072 ctx->Yi.d[3] = BSWAP4(ctr);
1073 # else
1074 PUTU32(ctx->Yi.c + 12, ctr);
1075 # endif
1076 else
1077 ctx->Yi.d[3] = ctr;
1078 for (i = 0; i < 16 / sizeof(size_t); ++i)
1079 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1080 out += 16;
1081 in += 16;
1082 j -= 16;
1083 }
1084 GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
1085 len -= GHASH_CHUNK;
1086 }
1087 # endif
1088 if ((i = (len & (size_t)-16))) {
1089 size_t j = i;
1090
1091 while (len >= 16) {
1092 size_t_aX *out_t = (size_t_aX *)out;
1093 const size_t_aX *in_t = (const size_t_aX *)in;
1094
1095 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1096 ++ctr;
1097 if (IS_LITTLE_ENDIAN)
1098 # ifdef BSWAP4
1099 ctx->Yi.d[3] = BSWAP4(ctr);
1100 # else
1101 PUTU32(ctx->Yi.c + 12, ctr);
1102 # endif
1103 else
1104 ctx->Yi.d[3] = ctr;
1105 for (i = 0; i < 16 / sizeof(size_t); ++i)
1106 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1107 out += 16;
1108 in += 16;
1109 len -= 16;
1110 }
1111 GHASH(ctx, out - j, j);
1112 }
1113 # else
1114 while (len >= 16) {
1115 size_t *out_t = (size_t *)out;
1116 const size_t *in_t = (const size_t *)in;
1117
1118 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1119 ++ctr;
1120 if (IS_LITTLE_ENDIAN)
1121 # ifdef BSWAP4
1122 ctx->Yi.d[3] = BSWAP4(ctr);
1123 # else
1124 PUTU32(ctx->Yi.c + 12, ctr);
1125 # endif
1126 else
1127 ctx->Yi.d[3] = ctr;
1128 for (i = 0; i < 16 / sizeof(size_t); ++i)
1129 ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1130 GCM_MUL(ctx);
1131 out += 16;
1132 in += 16;
1133 len -= 16;
1134 }
1135 # endif
1136 if (len) {
1137 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1138 ++ctr;
1139 if (IS_LITTLE_ENDIAN)
1140 # ifdef BSWAP4
1141 ctx->Yi.d[3] = BSWAP4(ctr);
1142 # else
1143 PUTU32(ctx->Yi.c + 12, ctr);
1144 # endif
1145 else
1146 ctx->Yi.d[3] = ctr;
1147 # if defined(GHASH)
1148 while (len--) {
1149 ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
1150 ++n;
1151 }
1152 # else
1153 while (len--) {
1154 ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
1155 ++n;
1156 }
1157 mres = n;
1158 # endif
1159 }
1160
1161 ctx->mres = mres;
1162 return 0;
1163 } while (0);
1164 }
1165 #endif
1166 for (i = 0; i < len; ++i) {
1167 if (n == 0) {
1168 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1169 ++ctr;
1170 if (IS_LITTLE_ENDIAN)
1171 #ifdef BSWAP4
1172 ctx->Yi.d[3] = BSWAP4(ctr);
1173 #else
1174 PUTU32(ctx->Yi.c + 12, ctr);
1175 #endif
1176 else
1177 ctx->Yi.d[3] = ctr;
1178 }
1179 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1180 ctx->Xn[mres++] = out[i] = in[i] ^ ctx->EKi.c[n];
1181 n = (n + 1) % 16;
1182 if (mres == sizeof(ctx->Xn)) {
1183 GHASH(ctx,ctx->Xn,sizeof(ctx->Xn));
1184 mres = 0;
1185 }
1186 #else
1187 ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
1188 mres = n = (n + 1) % 16;
1189 if (n == 0)
1190 GCM_MUL(ctx);
1191 #endif
1192 }
1193
1194 ctx->mres = mres;
1195 return 0;
1196 }
1197
1198 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1199 const unsigned char *in, unsigned char *out,
1200 size_t len)
1201 {
1202 DECLARE_IS_ENDIAN;
1203 unsigned int n, ctr, mres;
1204 size_t i;
1205 u64 mlen = ctx->len.u[1];
1206 block128_f block = ctx->block;
1207 void *key = ctx->key;
1208 #ifdef GCM_FUNCREF_4BIT
1209 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1210 # if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1211 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1212 const u8 *inp, size_t len) = ctx->ghash;
1213 # endif
1214 #endif
1215
1216 mlen += len;
1217 if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1218 return -1;
1219 ctx->len.u[1] = mlen;
1220
1221 mres = ctx->mres;
1222
1223 if (ctx->ares) {
1224 /* First call to decrypt finalizes GHASH(AAD) */
1225 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1226 if (len == 0) {
1227 GCM_MUL(ctx);
1228 ctx->ares = 0;
1229 return 0;
1230 }
1231 memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
1232 ctx->Xi.u[0] = 0;
1233 ctx->Xi.u[1] = 0;
1234 mres = sizeof(ctx->Xi);
1235 #else
1236 GCM_MUL(ctx);
1237 #endif
1238 ctx->ares = 0;
1239 }
1240
1241 if (IS_LITTLE_ENDIAN)
1242 #ifdef BSWAP4
1243 ctr = BSWAP4(ctx->Yi.d[3]);
1244 #else
1245 ctr = GETU32(ctx->Yi.c + 12);
1246 #endif
1247 else
1248 ctr = ctx->Yi.d[3];
1249
1250 n = mres % 16;
1251 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1252 if (16 % sizeof(size_t) == 0) { /* always true actually */
1253 do {
1254 if (n) {
1255 # if defined(GHASH)
1256 while (n && len) {
1257 *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
1258 --len;
1259 n = (n + 1) % 16;
1260 }
1261 if (n == 0) {
1262 GHASH(ctx, ctx->Xn, mres);
1263 mres = 0;
1264 } else {
1265 ctx->mres = mres;
1266 return 0;
1267 }
1268 # else
1269 while (n && len) {
1270 u8 c = *(in++);
1271 *(out++) = c ^ ctx->EKi.c[n];
1272 ctx->Xi.c[n] ^= c;
1273 --len;
1274 n = (n + 1) % 16;
1275 }
1276 if (n == 0) {
1277 GCM_MUL(ctx);
1278 mres = 0;
1279 } else {
1280 ctx->mres = n;
1281 return 0;
1282 }
1283 # endif
1284 }
1285 # if defined(STRICT_ALIGNMENT)
1286 if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
1287 break;
1288 # endif
1289 # if defined(GHASH)
1290 if (len >= 16 && mres) {
1291 GHASH(ctx, ctx->Xn, mres);
1292 mres = 0;
1293 }
1294 # if defined(GHASH_CHUNK)
1295 while (len >= GHASH_CHUNK) {
1296 size_t j = GHASH_CHUNK;
1297
1298 GHASH(ctx, in, GHASH_CHUNK);
1299 while (j) {
1300 size_t_aX *out_t = (size_t_aX *)out;
1301 const size_t_aX *in_t = (const size_t_aX *)in;
1302
1303 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1304 ++ctr;
1305 if (IS_LITTLE_ENDIAN)
1306 # ifdef BSWAP4
1307 ctx->Yi.d[3] = BSWAP4(ctr);
1308 # else
1309 PUTU32(ctx->Yi.c + 12, ctr);
1310 # endif
1311 else
1312 ctx->Yi.d[3] = ctr;
1313 for (i = 0; i < 16 / sizeof(size_t); ++i)
1314 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1315 out += 16;
1316 in += 16;
1317 j -= 16;
1318 }
1319 len -= GHASH_CHUNK;
1320 }
1321 # endif
1322 if ((i = (len & (size_t)-16))) {
1323 GHASH(ctx, in, i);
1324 while (len >= 16) {
1325 size_t_aX *out_t = (size_t_aX *)out;
1326 const size_t_aX *in_t = (const size_t_aX *)in;
1327
1328 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1329 ++ctr;
1330 if (IS_LITTLE_ENDIAN)
1331 # ifdef BSWAP4
1332 ctx->Yi.d[3] = BSWAP4(ctr);
1333 # else
1334 PUTU32(ctx->Yi.c + 12, ctr);
1335 # endif
1336 else
1337 ctx->Yi.d[3] = ctr;
1338 for (i = 0; i < 16 / sizeof(size_t); ++i)
1339 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1340 out += 16;
1341 in += 16;
1342 len -= 16;
1343 }
1344 }
1345 # else
1346 while (len >= 16) {
1347 size_t *out_t = (size_t *)out;
1348 const size_t *in_t = (const size_t *)in;
1349
1350 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1351 ++ctr;
1352 if (IS_LITTLE_ENDIAN)
1353 # ifdef BSWAP4
1354 ctx->Yi.d[3] = BSWAP4(ctr);
1355 # else
1356 PUTU32(ctx->Yi.c + 12, ctr);
1357 # endif
1358 else
1359 ctx->Yi.d[3] = ctr;
1360 for (i = 0; i < 16 / sizeof(size_t); ++i) {
1361 size_t c = in_t[i];
1362 out_t[i] = c ^ ctx->EKi.t[i];
1363 ctx->Xi.t[i] ^= c;
1364 }
1365 GCM_MUL(ctx);
1366 out += 16;
1367 in += 16;
1368 len -= 16;
1369 }
1370 # endif
1371 if (len) {
1372 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1373 ++ctr;
1374 if (IS_LITTLE_ENDIAN)
1375 # ifdef BSWAP4
1376 ctx->Yi.d[3] = BSWAP4(ctr);
1377 # else
1378 PUTU32(ctx->Yi.c + 12, ctr);
1379 # endif
1380 else
1381 ctx->Yi.d[3] = ctr;
1382 # if defined(GHASH)
1383 while (len--) {
1384 out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
1385 ++n;
1386 }
1387 # else
1388 while (len--) {
1389 u8 c = in[n];
1390 ctx->Xi.c[n] ^= c;
1391 out[n] = c ^ ctx->EKi.c[n];
1392 ++n;
1393 }
1394 mres = n;
1395 # endif
1396 }
1397
1398 ctx->mres = mres;
1399 return 0;
1400 } while (0);
1401 }
1402 #endif
1403 for (i = 0; i < len; ++i) {
1404 u8 c;
1405 if (n == 0) {
1406 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1407 ++ctr;
1408 if (IS_LITTLE_ENDIAN)
1409 #ifdef BSWAP4
1410 ctx->Yi.d[3] = BSWAP4(ctr);
1411 #else
1412 PUTU32(ctx->Yi.c + 12, ctr);
1413 #endif
1414 else
1415 ctx->Yi.d[3] = ctr;
1416 }
1417 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1418 out[i] = (ctx->Xn[mres++] = c = in[i]) ^ ctx->EKi.c[n];
1419 n = (n + 1) % 16;
1420 if (mres == sizeof(ctx->Xn)) {
1421 GHASH(ctx,ctx->Xn,sizeof(ctx->Xn));
1422 mres = 0;
1423 }
1424 #else
1425 c = in[i];
1426 out[i] = c ^ ctx->EKi.c[n];
1427 ctx->Xi.c[n] ^= c;
1428 mres = n = (n + 1) % 16;
1429 if (n == 0)
1430 GCM_MUL(ctx);
1431 #endif
1432 }
1433
1434 ctx->mres = mres;
1435 return 0;
1436 }
1437
1438 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1439 const unsigned char *in, unsigned char *out,
1440 size_t len, ctr128_f stream)
1441 {
1442 #if defined(OPENSSL_SMALL_FOOTPRINT)
1443 return CRYPTO_gcm128_encrypt(ctx, in, out, len);
1444 #else
1445 DECLARE_IS_ENDIAN;
1446 unsigned int n, ctr, mres;
1447 size_t i;
1448 u64 mlen = ctx->len.u[1];
1449 void *key = ctx->key;
1450 # ifdef GCM_FUNCREF_4BIT
1451 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1452 # ifdef GHASH
1453 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1454 const u8 *inp, size_t len) = ctx->ghash;
1455 # endif
1456 # endif
1457
1458 mlen += len;
1459 if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1460 return -1;
1461 ctx->len.u[1] = mlen;
1462
1463 mres = ctx->mres;
1464
1465 if (ctx->ares) {
1466 /* First call to encrypt finalizes GHASH(AAD) */
1467 #if defined(GHASH)
1468 if (len == 0) {
1469 GCM_MUL(ctx);
1470 ctx->ares = 0;
1471 return 0;
1472 }
1473 memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
1474 ctx->Xi.u[0] = 0;
1475 ctx->Xi.u[1] = 0;
1476 mres = sizeof(ctx->Xi);
1477 #else
1478 GCM_MUL(ctx);
1479 #endif
1480 ctx->ares = 0;
1481 }
1482
1483 if (IS_LITTLE_ENDIAN)
1484 # ifdef BSWAP4
1485 ctr = BSWAP4(ctx->Yi.d[3]);
1486 # else
1487 ctr = GETU32(ctx->Yi.c + 12);
1488 # endif
1489 else
1490 ctr = ctx->Yi.d[3];
1491
1492 n = mres % 16;
1493 if (n) {
1494 # if defined(GHASH)
1495 while (n && len) {
1496 ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
1497 --len;
1498 n = (n + 1) % 16;
1499 }
1500 if (n == 0) {
1501 GHASH(ctx, ctx->Xn, mres);
1502 mres = 0;
1503 } else {
1504 ctx->mres = mres;
1505 return 0;
1506 }
1507 # else
1508 while (n && len) {
1509 ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
1510 --len;
1511 n = (n + 1) % 16;
1512 }
1513 if (n == 0) {
1514 GCM_MUL(ctx);
1515 mres = 0;
1516 } else {
1517 ctx->mres = n;
1518 return 0;
1519 }
1520 # endif
1521 }
1522 # if defined(GHASH)
1523 if (len >= 16 && mres) {
1524 GHASH(ctx, ctx->Xn, mres);
1525 mres = 0;
1526 }
1527 # if defined(GHASH_CHUNK)
1528 while (len >= GHASH_CHUNK) {
1529 (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
1530 ctr += GHASH_CHUNK / 16;
1531 if (IS_LITTLE_ENDIAN)
1532 # ifdef BSWAP4
1533 ctx->Yi.d[3] = BSWAP4(ctr);
1534 # else
1535 PUTU32(ctx->Yi.c + 12, ctr);
1536 # endif
1537 else
1538 ctx->Yi.d[3] = ctr;
1539 GHASH(ctx, out, GHASH_CHUNK);
1540 out += GHASH_CHUNK;
1541 in += GHASH_CHUNK;
1542 len -= GHASH_CHUNK;
1543 }
1544 # endif
1545 # endif
1546 if ((i = (len & (size_t)-16))) {
1547 size_t j = i / 16;
1548
1549 (*stream) (in, out, j, key, ctx->Yi.c);
1550 ctr += (unsigned int)j;
1551 if (IS_LITTLE_ENDIAN)
1552 # ifdef BSWAP4
1553 ctx->Yi.d[3] = BSWAP4(ctr);
1554 # else
1555 PUTU32(ctx->Yi.c + 12, ctr);
1556 # endif
1557 else
1558 ctx->Yi.d[3] = ctr;
1559 in += i;
1560 len -= i;
1561 # if defined(GHASH)
1562 GHASH(ctx, out, i);
1563 out += i;
1564 # else
1565 while (j--) {
1566 for (i = 0; i < 16; ++i)
1567 ctx->Xi.c[i] ^= out[i];
1568 GCM_MUL(ctx);
1569 out += 16;
1570 }
1571 # endif
1572 }
1573 if (len) {
1574 (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
1575 ++ctr;
1576 if (IS_LITTLE_ENDIAN)
1577 # ifdef BSWAP4
1578 ctx->Yi.d[3] = BSWAP4(ctr);
1579 # else
1580 PUTU32(ctx->Yi.c + 12, ctr);
1581 # endif
1582 else
1583 ctx->Yi.d[3] = ctr;
1584 while (len--) {
1585 # if defined(GHASH)
1586 ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
1587 # else
1588 ctx->Xi.c[mres++] ^= out[n] = in[n] ^ ctx->EKi.c[n];
1589 # endif
1590 ++n;
1591 }
1592 }
1593
1594 ctx->mres = mres;
1595 return 0;
1596 #endif
1597 }
1598
1599 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1600 const unsigned char *in, unsigned char *out,
1601 size_t len, ctr128_f stream)
1602 {
1603 #if defined(OPENSSL_SMALL_FOOTPRINT)
1604 return CRYPTO_gcm128_decrypt(ctx, in, out, len);
1605 #else
1606 DECLARE_IS_ENDIAN;
1607 unsigned int n, ctr, mres;
1608 size_t i;
1609 u64 mlen = ctx->len.u[1];
1610 void *key = ctx->key;
1611 # ifdef GCM_FUNCREF_4BIT
1612 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1613 # ifdef GHASH
1614 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1615 const u8 *inp, size_t len) = ctx->ghash;
1616 # endif
1617 # endif
1618
1619 mlen += len;
1620 if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1621 return -1;
1622 ctx->len.u[1] = mlen;
1623
1624 mres = ctx->mres;
1625
1626 if (ctx->ares) {
1627 /* First call to decrypt finalizes GHASH(AAD) */
1628 # if defined(GHASH)
1629 if (len == 0) {
1630 GCM_MUL(ctx);
1631 ctx->ares = 0;
1632 return 0;
1633 }
1634 memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
1635 ctx->Xi.u[0] = 0;
1636 ctx->Xi.u[1] = 0;
1637 mres = sizeof(ctx->Xi);
1638 # else
1639 GCM_MUL(ctx);
1640 # endif
1641 ctx->ares = 0;
1642 }
1643
1644 if (IS_LITTLE_ENDIAN)
1645 # ifdef BSWAP4
1646 ctr = BSWAP4(ctx->Yi.d[3]);
1647 # else
1648 ctr = GETU32(ctx->Yi.c + 12);
1649 # endif
1650 else
1651 ctr = ctx->Yi.d[3];
1652
1653 n = mres % 16;
1654 if (n) {
1655 # if defined(GHASH)
1656 while (n && len) {
1657 *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
1658 --len;
1659 n = (n + 1) % 16;
1660 }
1661 if (n == 0) {
1662 GHASH(ctx, ctx->Xn, mres);
1663 mres = 0;
1664 } else {
1665 ctx->mres = mres;
1666 return 0;
1667 }
1668 # else
1669 while (n && len) {
1670 u8 c = *(in++);
1671 *(out++) = c ^ ctx->EKi.c[n];
1672 ctx->Xi.c[n] ^= c;
1673 --len;
1674 n = (n + 1) % 16;
1675 }
1676 if (n == 0) {
1677 GCM_MUL(ctx);
1678 mres = 0;
1679 } else {
1680 ctx->mres = n;
1681 return 0;
1682 }
1683 # endif
1684 }
1685 # if defined(GHASH)
1686 if (len >= 16 && mres) {
1687 GHASH(ctx, ctx->Xn, mres);
1688 mres = 0;
1689 }
1690 # if defined(GHASH_CHUNK)
1691 while (len >= GHASH_CHUNK) {
1692 GHASH(ctx, in, GHASH_CHUNK);
1693 (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
1694 ctr += GHASH_CHUNK / 16;
1695 if (IS_LITTLE_ENDIAN)
1696 # ifdef BSWAP4
1697 ctx->Yi.d[3] = BSWAP4(ctr);
1698 # else
1699 PUTU32(ctx->Yi.c + 12, ctr);
1700 # endif
1701 else
1702 ctx->Yi.d[3] = ctr;
1703 out += GHASH_CHUNK;
1704 in += GHASH_CHUNK;
1705 len -= GHASH_CHUNK;
1706 }
1707 # endif
1708 # endif
1709 if ((i = (len & (size_t)-16))) {
1710 size_t j = i / 16;
1711
1712 # if defined(GHASH)
1713 GHASH(ctx, in, i);
1714 # else
1715 while (j--) {
1716 size_t k;
1717 for (k = 0; k < 16; ++k)
1718 ctx->Xi.c[k] ^= in[k];
1719 GCM_MUL(ctx);
1720 in += 16;
1721 }
1722 j = i / 16;
1723 in -= i;
1724 # endif
1725 (*stream) (in, out, j, key, ctx->Yi.c);
1726 ctr += (unsigned int)j;
1727 if (IS_LITTLE_ENDIAN)
1728 # ifdef BSWAP4
1729 ctx->Yi.d[3] = BSWAP4(ctr);
1730 # else
1731 PUTU32(ctx->Yi.c + 12, ctr);
1732 # endif
1733 else
1734 ctx->Yi.d[3] = ctr;
1735 out += i;
1736 in += i;
1737 len -= i;
1738 }
1739 if (len) {
1740 (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
1741 ++ctr;
1742 if (IS_LITTLE_ENDIAN)
1743 # ifdef BSWAP4
1744 ctx->Yi.d[3] = BSWAP4(ctr);
1745 # else
1746 PUTU32(ctx->Yi.c + 12, ctr);
1747 # endif
1748 else
1749 ctx->Yi.d[3] = ctr;
1750 while (len--) {
1751 # if defined(GHASH)
1752 out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
1753 # else
1754 u8 c = in[n];
1755 ctx->Xi.c[mres++] ^= c;
1756 out[n] = c ^ ctx->EKi.c[n];
1757 # endif
1758 ++n;
1759 }
1760 }
1761
1762 ctx->mres = mres;
1763 return 0;
1764 #endif
1765 }
1766
1767 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
1768 size_t len)
1769 {
1770 DECLARE_IS_ENDIAN;
1771 u64 alen = ctx->len.u[0] << 3;
1772 u64 clen = ctx->len.u[1] << 3;
1773 #ifdef GCM_FUNCREF_4BIT
1774 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1775 # if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1776 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1777 const u8 *inp, size_t len) = ctx->ghash;
1778 # endif
1779 #endif
1780
1781 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1782 u128 bitlen;
1783 unsigned int mres = ctx->mres;
1784
1785 if (mres) {
1786 unsigned blocks = (mres + 15) & -16;
1787
1788 memset(ctx->Xn + mres, 0, blocks - mres);
1789 mres = blocks;
1790 if (mres == sizeof(ctx->Xn)) {
1791 GHASH(ctx, ctx->Xn, mres);
1792 mres = 0;
1793 }
1794 } else if (ctx->ares) {
1795 GCM_MUL(ctx);
1796 }
1797 #else
1798 if (ctx->mres || ctx->ares)
1799 GCM_MUL(ctx);
1800 #endif
1801
1802 if (IS_LITTLE_ENDIAN) {
1803 #ifdef BSWAP8
1804 alen = BSWAP8(alen);
1805 clen = BSWAP8(clen);
1806 #else
1807 u8 *p = ctx->len.c;
1808
1809 ctx->len.u[0] = alen;
1810 ctx->len.u[1] = clen;
1811
1812 alen = (u64)GETU32(p) << 32 | GETU32(p + 4);
1813 clen = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
1814 #endif
1815 }
1816
1817 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1818 bitlen.hi = alen;
1819 bitlen.lo = clen;
1820 memcpy(ctx->Xn + mres, &bitlen, sizeof(bitlen));
1821 mres += sizeof(bitlen);
1822 GHASH(ctx, ctx->Xn, mres);
1823 #else
1824 ctx->Xi.u[0] ^= alen;
1825 ctx->Xi.u[1] ^= clen;
1826 GCM_MUL(ctx);
1827 #endif
1828
1829 ctx->Xi.u[0] ^= ctx->EK0.u[0];
1830 ctx->Xi.u[1] ^= ctx->EK0.u[1];
1831
1832 if (tag && len <= sizeof(ctx->Xi))
1833 return CRYPTO_memcmp(ctx->Xi.c, tag, len);
1834 else
1835 return -1;
1836 }
1837
1838 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1839 {
1840 CRYPTO_gcm128_finish(ctx, NULL, 0);
1841 memcpy(tag, ctx->Xi.c,
1842 len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
1843 }
1844
1845 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1846 {
1847 GCM128_CONTEXT *ret;
1848
1849 if ((ret = OPENSSL_malloc(sizeof(*ret))) != NULL)
1850 CRYPTO_gcm128_init(ret, key, block);
1851
1852 return ret;
1853 }
1854
1855 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1856 {
1857 OPENSSL_clear_free(ctx, sizeof(*ctx));
1858 }
1859
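#if 0
/*
 * Illustrative usage sketch (not compiled): one way to drive the one-shot
 * GCM128 API above with AES as the underlying block cipher. AES_encrypt
 * and AES_set_encrypt_key come from <openssl/aes.h>; the function name and
 * parameter layout here are hypothetical, and error handling is trimmed to
 * the calls whose return values matter for GCM itself.
 */
# include <openssl/aes.h>

static int gcm128_seal_example(const unsigned char key[16],
                               const unsigned char iv[12],
                               const unsigned char *aad, size_t aad_len,
                               const unsigned char *pt, size_t pt_len,
                               unsigned char *ct, unsigned char tag[16])
{
    AES_KEY aes;
    GCM128_CONTEXT gcm;

    if (AES_set_encrypt_key(key, 128, &aes) != 0)
        return 0;
    /* H = E_K(0^128) is computed here and Htable is populated */
    CRYPTO_gcm128_init(&gcm, &aes, (block128_f)AES_encrypt);
    /* 96-bit IV, so Y0 = IV || 0^31 || 1 */
    CRYPTO_gcm128_setiv(&gcm, iv, 12);
    /* all AAD must be supplied before any plaintext */
    if (CRYPTO_gcm128_aad(&gcm, aad, aad_len) != 0)
        return 0;
    if (CRYPTO_gcm128_encrypt(&gcm, pt, ct, pt_len) != 0)
        return 0;
    /* finalizes the lengths block and emits EK0 ^ GHASH(...) as the tag */
    CRYPTO_gcm128_tag(&gcm, tag, 16);
    OPENSSL_cleanse(&gcm, sizeof(gcm));
    return 1;
}
#endif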