Lines Matching +full:128 +full:b
77 // fairly easily be changed to support 128-bit too. However, this would *not*
94 // x^7 + x^2 + x + 1, represented using the backwards mapping
99 // "reversed" GHASH reducing polynomial without its x^128 term.
108 // ctr_pattern points to the four 128-bit values [0, 1, 2, 3].
109 // inc_2blocks and inc_4blocks point to the single 128-bit values 2 and
157 // 128-bit lanes of \a by the corresponding 128-bit lanes of \b and storing the
159 // same size as \a and \b. To complete all steps, this must be invoked with \i=0
165 // GF(2^128). Elements of GF(2^128) are represented as binary polynomials
167 // G. The GCM specification uses G = x^128 + x^7 + x^2 + x + 1. Addition is
169 // carryless multiplication of two 128-bit input polynomials to get a 256-bit
170 // intermediate product polynomial, and (b) reduce the intermediate product to
171 // 128 bits by adding multiples of G that cancel out terms in it. (Adding
185 // value interpreted using the G = x^128 + x^7 + x^2 + x + 1 convention, bits 0
192 // using the G = x^128 + x^7 + x^2 + x + 1 convention) to be introduced by each
195 // M+N bits. In the G = x^128 + x^7 + x^2 + x + 1 convention, which maps bits
205 // in the natural order, and the multiplication is actually \a * \b * x^-128 mod
206 // x^128 + x^127 + x^126 + x^121 + 1. This doesn't change the inputs, outputs,
212 // 128-bit carryless multiplication, so we break the 128 x 128 multiplication
219 // The 256-bit product is x^128*HI + x^64*MI + LO. LO, MI, and HI are 128-bit.
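As a rough plain-C model of this split (the helper names clmul64, clmul128 and struct unreduced are made up for illustration; the file itself does all of this with vpclmulqdq), the four 64 x 64 products and the LO/MI/HI grouping might look like:

    #include <stdint.h>

    /* 64 x 64 -> 128-bit carryless multiplication, bit by bit: a slow software
     * stand-in for one vpclmulqdq lane.  *hi:*lo is the 128-bit product. */
    static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
    {
            uint64_t h = 0, l = 0;

            for (int i = 0; i < 64; i++) {
                    if ((b >> i) & 1) {
                            l ^= a << i;
                            h ^= i ? a >> (64 - i) : 0;
                    }
            }
            *hi = h;
            *lo = l;
    }

    /* Unreduced 256-bit product x^128*HI + x^64*MI + LO.  Index 0 of each
     * pair is the low qword. */
    struct unreduced { uint64_t lo[2], mi[2], hi[2]; };

    static struct unreduced clmul128(const uint64_t a[2], const uint64_t b[2])
    {
            struct unreduced p;
            uint64_t h, l;

            clmul64(a[0], b[0], &p.lo[1], &p.lo[0]);  /* LO = a_L * b_L */
            clmul64(a[0], b[1], &p.mi[1], &p.mi[0]);  /* MI_0 = a_L * b_H */
            clmul64(a[1], b[0], &h, &l);              /* MI_1 = a_H * b_L */
            p.mi[0] ^= l;                             /* MI = MI_0 + MI_1 */
            p.mi[1] ^= h;
            clmul64(a[1], b[1], &p.hi[1], &p.hi[0]);  /* HI = a_H * b_H */
            return p;
    }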
223 // For the reduction, we cancel out the low 128 bits by adding multiples of G =
224 // x^128 + x^127 + x^126 + x^121 + 1. This is done by two iterations, each of
225 // which cancels out the next lowest 64 bits. Consider a value x^64*A + B,
226 // where A and B are 128-bit. Adding B_L*G to that value gives:
228 // x^64*A + B + B_L*G
229 // = x^64*A + x^64*B_H + B_L + B_L*(x^128 + x^127 + x^126 + x^121 + 1)
230 // = x^64*A + x^64*B_H + B_L + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L
231 // = x^64*A + x^64*B_H + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L + B_L
234 // So: if we sum A, B with its halves swapped, and the low half of B times x^63
235 // + x^62 + x^57, we get a 128-bit value C where x^64*C is congruent to the
236 // original value x^64*A + B. I.e., the low 64 bits got canceled out.
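Continuing the sketch above (and reusing its hypothetical clmul64 helper and struct unreduced), the two folding iterations described here could be written as follows; since the folds leave the whole value in the x^128 position, taking HI' as the result also supplies the x^-128 factor mentioned earlier:

    /* Reduce x^128*HI + x^64*MI + LO modulo x^128 + x^127 + x^126 + x^121 + 1
     * by the two folds described above, returning the 128-bit result. */
    static void ghash_reduce(const struct unreduced *p, uint64_t out[2])
    {
            const uint64_t K = 0xc200000000000000;  /* x^63 + x^62 + x^57 */
            uint64_t mi[2] = { p->mi[0], p->mi[1] };
            uint64_t hi, lo;

            /* Fold LO into MI: cancels the lowest 64 bits. */
            clmul64(p->lo[0], K, &hi, &lo);
            mi[0] ^= p->lo[1] ^ lo;         /* + B_H + low half of B_L*K */
            mi[1] ^= p->lo[0] ^ hi;         /* + x^64*B_L + high half of B_L*K */

            /* Fold MI into HI: cancels the next 64 bits. */
            clmul64(mi[0], K, &hi, &lo);
            out[0] = p->hi[0] ^ mi[1] ^ lo;
            out[1] = p->hi[1] ^ mi[0] ^ hi;
    }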
244 // A potential optimization, assuming that b is fixed per-key (if a is fixed
246 // reduction described above to precompute a value c such that x^64*c = b mod G,
247 // and then multiply a_L by c (and implicitly by x^64) instead of by b:
259 .macro _ghash_mul_step i, a, b, dst, gfpoly, t0, t1, t2
261 vpclmulqdq $0x00, \a, \b, \t0 // LO = a_L * b_L
262 vpclmulqdq $0x01, \a, \b, \t1 // MI_0 = a_L * b_H
264 vpclmulqdq $0x10, \a, \b, \t2 // MI_1 = a_H * b_L
274 vpclmulqdq $0x11, \a, \b, \dst // HI = a_H * b_H
284 // GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store
286 .macro _ghash_mul a, b, dst, gfpoly, t0, t1, t2
288 _ghash_mul_step \i, \a, \b, \dst, \gfpoly, \t0, \t1, \t2
292 // GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the
294 .macro _ghash_mul_noreduce a, b, lo, mi, hi, t0, t1, t2, t3
295 vpclmulqdq $0x00, \a, \b, \t0 // a_L * b_L
296 vpclmulqdq $0x01, \a, \b, \t1 // a_L * b_H
297 vpclmulqdq $0x10, \a, \b, \t2 // a_H * b_L
298 vpclmulqdq $0x11, \a, \b, \t3 // a_H * b_H
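The point of leaving these products unreduced is that several of them (e.g. data blocks multiplied by different key powers) can simply be XORed into shared LO/MI/HI accumulators and reduced only once at the end. A minimal sketch, again reusing the hypothetical clmul128 helper and struct unreduced from above:

    /* XOR the unreduced product a*b into running lo/mi/hi accumulators. */
    static void ghash_mul_noreduce(const uint64_t a[2], const uint64_t b[2],
                                   struct unreduced *acc)
    {
            struct unreduced p = clmul128(a, b);

            for (int i = 0; i < 2; i++) {
                    acc->lo[i] ^= p.lo[i];
                    acc->mi[i] ^= p.mi[i];
                    acc->hi[i] ^= p.hi[i];
            }
    }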
304 // Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit
358 jne 1b
373 // by x^-1 mod x^128 + x^7 + x^2 + x + 1 (if using the backwards
375 // interpreted as multiplication by x mod x^128 + x^127 + x^126 + x^121
380 // is a left shift of the 128-bit value by 1 bit, then an XOR with (0xc2
381 // << 120) | 1 if a 1 bit was carried out. However, there's no 128-bit
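A scalar C sketch of just the operation being described (the assembly has to do this differently, precisely because there is no 128-bit shift instruction): shift the value left by one bit and, if a bit falls off the top, XOR in the 128-bit constant (0xc2 << 120) | 1:

    #include <stdint.h>

    /* Multiply a 128-bit value (h[0] = low qword, h[1] = high qword) by x
     * modulo x^128 + x^127 + x^126 + x^121 + 1. */
    static void gf128_mul_x(uint64_t h[2])
    {
            uint64_t carry = h[1] >> 63;    /* bit shifted out of the top */

            h[1] = (h[1] << 1) | (h[0] >> 63);
            h[0] <<= 1;
            if (carry) {
                    h[1] ^= 0xc200000000000000;  /* 0xc2 in the top byte */
                    h[0] ^= 1;
            }
    }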
434 // XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store
479 // 128 bits each. This leaves VL/16 128-bit intermediate values.
480 // - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM.
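A trivial C model of this lane-summing step, assuming VL = 64 (so four 128-bit lanes) and the same qword layout as the sketches above:

    /* XOR the four 128-bit lanes of a 512-bit accumulator down to a single
     * 128-bit value. */
    static void xor_lanes(const uint64_t lanes[4][2], uint64_t acc[2])
    {
            acc[0] = acc[1] = 0;
            for (int i = 0; i < 4; i++) {
                    acc[0] ^= lanes[i][0];
                    acc[1] ^= lanes[i][1];
            }
    }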
536 // the round key that has been broadcast to all 128-bit lanes of \round_key.
571 // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a).
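This identity holds because AESENCLAST applies ShiftRows and SubBytes to the state and only then XORs in the round key, so an extra XOR can be moved from the output into the key. A small intrinsics demo (compile with -maes; the values are arbitrary):

    #include <stdio.h>
    #include <string.h>
    #include <immintrin.h>

    int main(void)
    {
            __m128i a   = _mm_set_epi64x(0x0123456789abcdef, 0x13579bdf2468ace0);
            __m128i key = _mm_set_epi64x(0x1111222233334444, 0x5555666677778888);
            __m128i b   = _mm_set_epi64x(0x0102030405060708, 0x090a0b0c0d0e0f00);

            /* vaesenclast(key, a) ^ b */
            __m128i lhs = _mm_xor_si128(_mm_aesenclast_si128(a, key), b);
            /* vaesenclast(key ^ b, a) */
            __m128i rhs = _mm_aesenclast_si128(a, _mm_xor_si128(key, b));

            printf("%s\n", memcmp(&lhs, &rhs, sizeof(lhs)) == 0 ?
                   "equal" : "different");
            return 0;
    }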
649 // BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values
650 // using vpshufb, copied to all 128-bit lanes.
657 // only the lowest 128-bit lane can be nonzero. When not fully reduced,
670 // copied to all 128-bit lanes. RNDKEY0 is the zero-th round key,
698 // GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes.
713 // round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256
722 // Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes.
771 jne 1b
798 jl 128f // AES-128?
810 128:
900 jne 1b
1033 jl 128f // AES-128?
1041 128:
1057 // a) ^ b == vaesenclast(key ^ b, a). I.e., XOR GHASH_ACC into the last