Lines Matching +full:128 +full:b
77 // fairly easily be changed to support 128-bit too. However, this would *not*
94 // x^7 + x^2 + x + 1, represented using the backwards mapping
99 // "reversed" GHASH reducing polynomial without its x^128 term.
108 // ctr_pattern points to the four 128-bit values [0, 1, 2, 3].
109 // inc_2blocks and inc_4blocks point to the single 128-bit values 2 and
157 // 128-bit lanes of \a by the corresponding 128-bit lanes of \b and storing the
159 // same size as \a and \b. To complete all steps, this must be invoked with \i=0
165 // GF(2^128). Elements of GF(2^128) are represented as binary polynomials
167 // G. The GCM specification uses G = x^128 + x^7 + x^2 + x + 1. Addition is
169 // carryless multiplication of two 128-bit input polynomials to get a 256-bit
170 // intermediate product polynomial, and (b) reduce the intermediate product to
171 // 128 bits by adding multiples of G that cancel out terms in it. (Adding
185 // value interpreted using the G = x^128 + x^7 + x^2 + x + 1 convention, bits 0
192 // using the G = x^128 + x^7 + x^2 + x + 1 convention) to be introduced by each
195 // M+N bits. In the G = x^128 + x^7 + x^2 + x + 1 convention, which maps bits
205 // in the natural order, and the multiplication is actually \a * \b * x^-128 mod
206 // x^128 + x^127 + x^126 + x^121 + 1. This doesn't change the inputs, outputs,
212 // 128-bit carryless multiplication, so we break the 128 x 128 multiplication
219 // The 256-bit product is x^128*HI + x^64*MI + LO. LO, MI, and HI are 128-bit.
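As a rough plain-C model of this split (the helper names clmul64, clmul128 and struct unreduced are made up for illustration; the file itself does all of this with vpclmulqdq), the four 64 x 64 products and the LO/MI/HI grouping might look like:

    #include <stdint.h>

    /* 64 x 64 -> 128-bit carryless multiplication, bit by bit: a slow software
     * stand-in for one vpclmulqdq lane.  *hi:*lo is the 128-bit product. */
    static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
    {
            uint64_t h = 0, l = 0;

            for (int i = 0; i < 64; i++) {
                    if ((b >> i) & 1) {
                            l ^= a << i;
                            h ^= i ? a >> (64 - i) : 0;
                    }
            }
            *hi = h;
            *lo = l;
    }

    /* Unreduced 256-bit product x^128*HI + x^64*MI + LO.  Index 0 of each
     * pair is the low qword. */
    struct unreduced { uint64_t lo[2], mi[2], hi[2]; };

    static struct unreduced clmul128(const uint64_t a[2], const uint64_t b[2])
    {
            struct unreduced p;
            uint64_t h, l;

            clmul64(a[0], b[0], &p.lo[1], &p.lo[0]);  /* LO = a_L * b_L */
            clmul64(a[0], b[1], &p.mi[1], &p.mi[0]);  /* MI_0 = a_L * b_H */
            clmul64(a[1], b[0], &h, &l);              /* MI_1 = a_H * b_L */
            p.mi[0] ^= l;                             /* MI = MI_0 + MI_1 */
            p.mi[1] ^= h;
            clmul64(a[1], b[1], &p.hi[1], &p.hi[0]);  /* HI = a_H * b_H */
            return p;
    }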
223 // For the reduction, we cancel out the low 128 bits by adding multiples of G =
224 // x^128 + x^127 + x^126 + x^121 + 1. This is done by two iterations, each of
225 // which cancels out the next lowest 64 bits. Consider a value x^64*A + B,
226 // where A and B are 128-bit. Adding B_L*G to that value gives:
228 // x^64*A + B + B_L*G
229 // = x^64*A + x^64*B_H + B_L + B_L*(x^128 + x^127 + x^126 + x^121 + 1)
230 // = x^64*A + x^64*B_H + B_L + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L
231 // = x^64*A + x^64*B_H + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L + B_L
234 // So: if we sum A, B with its halves swapped, and the low half of B times x^63
235 // + x^62 + x^57, we get a 128-bit value C where x^64*C is congruent to the
236 // original value x^64*A + B. I.e., the low 64 bits got canceled out.
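Continuing the sketch above (and reusing its hypothetical clmul64 helper and struct unreduced), the two folding iterations described here could be written as follows; since the folds leave the whole value in the x^128 position, taking HI' as the result also supplies the x^-128 factor mentioned earlier:

    /* Reduce x^128*HI + x^64*MI + LO modulo x^128 + x^127 + x^126 + x^121 + 1
     * by the two folds described above, returning the 128-bit result. */
    static void ghash_reduce(const struct unreduced *p, uint64_t out[2])
    {
            const uint64_t K = 0xc200000000000000;  /* x^63 + x^62 + x^57 */
            uint64_t mi[2] = { p->mi[0], p->mi[1] };
            uint64_t hi, lo;

            /* Fold LO into MI: cancels the lowest 64 bits. */
            clmul64(p->lo[0], K, &hi, &lo);
            mi[0] ^= p->lo[1] ^ lo;         /* + B_H + low half of B_L*K */
            mi[1] ^= p->lo[0] ^ hi;         /* + x^64*B_L + high half of B_L*K */

            /* Fold MI into HI: cancels the next 64 bits. */
            clmul64(mi[0], K, &hi, &lo);
            out[0] = p->hi[0] ^ mi[1] ^ lo;
            out[1] = p->hi[1] ^ mi[0] ^ hi;
    }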
244 // A potential optimization, assuming that b is fixed per-key (if a is fixed
246 // reduction described above to precompute a value c such that x^64*c = b mod G,
247 // and then multiply a_L by c (and implicitly by x^64) instead of by b:
259 .macro _ghash_mul_step i, a, b, dst, gfpoly, t0, t1, t2
261 vpclmulqdq $0x00, \a, \b, \t0 // LO = a_L * b_L
262 vpclmulqdq $0x01, \a, \b, \t1 // MI_0 = a_L * b_H
264 vpclmulqdq $0x10, \a, \b, \t2 // MI_1 = a_H * b_L
274 vpclmulqdq $0x11, \a, \b, \dst // HI = a_H * b_H
284 // GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store
286 .macro _ghash_mul a, b, dst, gfpoly, t0, t1, t2
288 _ghash_mul_step \i, \a, \b, \dst, \gfpoly, \t0, \t1, \t2
292 // GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the
294 .macro _ghash_mul_noreduce a, b, lo, mi, hi, t0, t1, t2, t3
295 vpclmulqdq $0x00, \a, \b, \t0 // a_L * b_L
296 vpclmulqdq $0x01, \a, \b, \t1 // a_L * b_H
297 vpclmulqdq $0x10, \a, \b, \t2 // a_H * b_L
298 vpclmulqdq $0x11, \a, \b, \t3 // a_H * b_H
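The point of leaving these products unreduced is that several of them (e.g. data blocks multiplied by different key powers) can simply be XORed into shared LO/MI/HI accumulators and reduced only once at the end. A minimal sketch, again reusing the hypothetical clmul128 helper and struct unreduced from above:

    /* XOR the unreduced product a*b into running lo/mi/hi accumulators. */
    static void ghash_mul_noreduce(const uint64_t a[2], const uint64_t b[2],
                                   struct unreduced *acc)
    {
            struct unreduced p = clmul128(a, b);

            for (int i = 0; i < 2; i++) {
                    acc->lo[i] ^= p.lo[i];
                    acc->mi[i] ^= p.mi[i];
                    acc->hi[i] ^= p.hi[i];
            }
    }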
304 // Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit
358 jne 1b
373 // by x^-1 mod x^128 + x^7 + x^2 + x + 1 (if using the backwards
375 // interpreted as multiplication by x mod x^128 + x^127 + x^126 + x^121
380 // is a left shift of the 128-bit value by 1 bit, then an XOR with (0xc2
381 // << 120) | 1 if a 1 bit was carried out. However, there's no 128-bit
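A scalar C sketch of just the operation being described (the assembly has to do this differently, precisely because there is no 128-bit shift instruction): shift the value left by one bit and, if a bit falls off the top, XOR in the 128-bit constant (0xc2 << 120) | 1:

    #include <stdint.h>

    /* Multiply a 128-bit value (h[0] = low qword, h[1] = high qword) by x
     * modulo x^128 + x^127 + x^126 + x^121 + 1. */
    static void gf128_mul_x(uint64_t h[2])
    {
            uint64_t carry = h[1] >> 63;    /* bit shifted out of the top */

            h[1] = (h[1] << 1) | (h[0] >> 63);
            h[0] <<= 1;
            if (carry) {
                    h[1] ^= 0xc200000000000000;  /* 0xc2 in the top byte */
                    h[0] ^= 1;
            }
    }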
434 // XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store
479 // 128 bits each. This leaves VL/16 128-bit intermediate values.
480 // - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM.
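A trivial C model of this lane-summing step, assuming VL = 64 (so four 128-bit lanes) and the same qword layout as the sketches above:

    /* XOR the four 128-bit lanes of a 512-bit accumulator down to a single
     * 128-bit value. */
    static void xor_lanes(const uint64_t lanes[4][2], uint64_t acc[2])
    {
            acc[0] = acc[1] = 0;
            for (int i = 0; i < 4; i++) {
                    acc[0] ^= lanes[i][0];
                    acc[1] ^= lanes[i][1];
            }
    }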
536 // the round key that has been broadcast to all 128-bit lanes of \round_key.
571 // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
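This identity holds because AESENCLAST applies ShiftRows and SubBytes to the state and only then XORs in the round key, so an extra XOR can be moved from the output into the key. A small intrinsics demo (compile with -maes; the values are arbitrary):

    #include <stdio.h>
    #include <string.h>
    #include <immintrin.h>

    int main(void)
    {
            __m128i a   = _mm_set_epi64x(0x0123456789abcdef, 0x13579bdf2468ace0);
            __m128i key = _mm_set_epi64x(0x1111222233334444, 0x5555666677778888);
            __m128i b   = _mm_set_epi64x(0x0102030405060708, 0x090a0b0c0d0e0f00);

            /* vaesenclast(key, a) ^ b */
            __m128i lhs = _mm_xor_si128(_mm_aesenclast_si128(a, key), b);
            /* vaesenclast(key ^ b, a) */
            __m128i rhs = _mm_aesenclast_si128(a, _mm_xor_si128(key, b));

            printf("%s\n", memcmp(&lhs, &rhs, sizeof(lhs)) == 0 ?
                   "equal" : "different");
            return 0;
    }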
649 // BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
650 // using vpshufb, copied to all 128-bit lanes.
657 // only the lowest 128-bit lane can be nonzero. When not fully reduced,
670 // copied to all 128-bit lanes. RNDKEY0 is the zero-th round key,
698 // GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes.
713 // round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256
722 // Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes.
771 jne 1b
798 jl 128f // AES-128?
810 128:
900 jne 1b
1033 jl 128f // AES-128?
1041 128:
1057 // a) ^ b == vaesenclast(key ^ b, a). I.e., XOR GHASH_ACC into the last