/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

/*
 * NEON doesn't have a rotate instruction.  The alternatives are, more or less:
 *
 * (a)  vshl.u32 + vsri.u32		(needs temporary register)
 * (b)  vshl.u32 + vshr.u32 + vorr	(needs temporary register)
 * (c)  vrev32.16			(16-bit rotations only)
 * (d)  vtbl.8 + vtbl.8			(multiple of 8 bits rotations only,
 *					 needs index vector)
 *
 * ChaCha has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit rotations,
 * the only choices are (a) and (b).  We use (a) since it takes two-thirds the
 * cycles of (b) on both Cortex-A7 and Cortex-A53.
 *
 * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
 * and doesn't need a temporary register.
 *
 * For the 8-bit rotation, we use vtbl.8 + vtbl.8.  On Cortex-A7, this sequence
 * is twice as fast as (a), even when doing (a) on multiple registers
 * simultaneously to eliminate the stall between vshl and vsri.  Also, it
 * parallelizes better when temporary registers are scarce.
 *
 * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
 * (a), so the need to load the rotation table actually makes the vtbl method
 * slightly slower overall on that CPU (~1.3% slower ChaCha20).  Still, it
 * seems to be a good compromise to get a more significant speed boost on some
 * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
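 *
 * As a quick reference, the rotation idioms used below look roughly like
 * this (dst and x stand for q registers, x_lo/x_hi for their d halves; the
 * rol8 index vector is loaded from .Lrol8_table into a d register first):
 *
 *	rotl32(x, 7) / rotl32(x, 12):	vshl.u32  dst, x, #n
 *					vsri.u32  dst, x, #(32 - n)
 *	rotl32(x, 16):			vrev32.16 x, x
 *	rotl32(x, 8):			vtbl.8    x_lo, {x_lo}, rol8_index
 *					vtbl.8    x_hi, {x_hi}, rol8_index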
 */

#include <linux/linkage.h>
#include <asm/cache.h>

	.text
	.fpu		neon
	.align		5

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers q0-q3.  It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in r3.
 *
 * Clobbers: r3, ip, q4-q5
 */
chacha_permute:

	adr		ip, .Lrol8_table
	vld1.8		{d10}, [ip, :64]

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vrev32.16	q3, q3

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #12
	vsri.u32	q1, q4, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vtbl.8		d6, {d6}, d10
	vtbl.8		d7, {d7}, d10

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #7
	vsri.u32	q1, q4, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vext.8		q1, q1, q1, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vext.8		q2, q2, q2, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vext.8		q3, q3, q3, #12

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vrev32.16	q3, q3

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #12
	vsri.u32	q1, q4, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vtbl.8		d6, {d6}, d10
	vtbl.8		d7, {d7}, d10

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #7
	vsri.u32	q1, q4, #25

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vext.8		q1, q1, q1, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vext.8		q2, q2, q2, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vext.8		q3, q3, q3, #4

	subs		r3, r3, #2
	bne		.Ldoubleround

	bx		lr
ENDPROC(chacha_permute)

ENTRY(chacha_block_xor_neon)
	// r0: Input state matrix, s
	// r1: 1 data block output, o
	// r2: 1 data block input, i
	// r3: nrounds
	push		{lr}

	// x0..3 = s0..3
	add		ip, r0, #0x20
	vld1.32		{q0-q1}, [r0]
	vld1.32		{q2-q3}, [ip]

	vmov		q8, q0
	vmov		q9, q1
	vmov		q10, q2
	vmov		q11, q3

	bl		chacha_permute

	add		ip, r2, #0x20
	vld1.8		{q4-q5}, [r2]
	vld1.8		{q6-q7}, [ip]

	// o0 = i0 ^ (x0 + s0)
	vadd.i32	q0, q0, q8
	veor		q0, q0, q4

	// o1 = i1 ^ (x1 + s1)
	vadd.i32	q1, q1, q9
	veor		q1, q1, q5

	// o2 = i2 ^ (x2 + s2)
	vadd.i32	q2, q2, q10
	veor		q2, q2, q6

	// o3 = i3 ^ (x3 + s3)
	vadd.i32	q3, q3, q11
	veor		q3, q3, q7

	add		ip, r1, #0x20
	vst1.8		{q0-q1}, [r1]
	vst1.8		{q2-q3}, [ip]

	pop		{pc}
ENDPROC(chacha_block_xor_neon)

ENTRY(hchacha_block_neon)
	// r0: Input state matrix, s
	// r1: output (8 32-bit words)
	// r2: nrounds
	push		{lr}

	vld1.32		{q0-q1}, [r0]!
	vld1.32		{q2-q3}, [r0]

	mov		r3, r2
	bl		chacha_permute

	vst1.32		{q0}, [r1]!
	vst1.32		{q3}, [r1]

	pop		{pc}
ENDPROC(hchacha_block_neon)

	.align		4
.Lctrinc:	.word	0, 1, 2, 3
.Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6

	.align		5
ENTRY(chacha_4block_xor_neon)
	push		{r4, lr}
	mov		r4, sp			// preserve the stack pointer
	sub		ip, sp, #0x20		// allocate a 32 byte buffer
	bic		ip, ip, #0x1f		// aligned to 32 bytes
	mov		sp, ip

	// r0: Input state matrix, s
	// r1: 4 data blocks output, o
	// r2: 4 data blocks input, i
	// r3: nrounds

	//
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix in NEON registers four times. The algorithm performs
	// each operation on the corresponding word of each state matrix, hence
	// requires no word shuffling. The words are re-interleaved before the
	// final addition of the original state and the XORing step.
	//

	// x0..15[0-3] = s0..15[0-3]
	add		ip, r0, #0x20
	vld1.32		{q0-q1}, [r0]
	vld1.32		{q2-q3}, [ip]

	adr		lr, .Lctrinc
	vdup.32		q15, d7[1]
	vdup.32		q14, d7[0]
	vld1.32		{q4}, [lr, :128]
	vdup.32		q13, d6[1]
	vdup.32		q12, d6[0]
	vdup.32		q11, d5[1]
	vdup.32		q10, d5[0]
	vadd.u32	q12, q12, q4		// x12 += counter values 0-3
	vdup.32		q9, d4[1]
	vdup.32		q8, d4[0]
	vdup.32		q7, d3[1]
	vdup.32		q6, d3[0]
	vdup.32		q5, d2[1]
	vdup.32		q4, d2[0]
	vdup.32		q3, d1[1]
	vdup.32		q2, d1[0]
	vdup.32		q1, d0[1]
	vdup.32		q0, d0[0]

	adr		ip, .Lrol8_table
	b		1f

.Ldoubleround4:
	vld1.32		{q8-q9}, [sp, :256]
1:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vadd.i32	q0, q0, q4
	vadd.i32	q1, q1, q5
	vadd.i32	q2, q2, q6
	vadd.i32	q3, q3, q7

	veor		q12, q12, q0
	veor		q13, q13, q1
	veor		q14, q14, q2
	veor		q15, q15, q3

	vrev32.16	q12, q12
	vrev32.16	q13, q13
	vrev32.16	q14, q14
	vrev32.16	q15, q15

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vadd.i32	q8, q8, q12
	vadd.i32	q9, q9, q13
	vadd.i32	q10, q10, q14
	vadd.i32	q11, q11, q15

	vst1.32		{q8-q9}, [sp, :256]

	veor		q8, q4, q8
	veor		q9, q5, q9
	vshl.u32	q4, q8, #12
	vshl.u32	q5, q9, #12
	vsri.u32	q4, q8, #20
	vsri.u32	q5, q9, #20

	veor		q8, q6, q10
	veor		q9, q7, q11
	vshl.u32	q6, q8, #12
	vshl.u32	q7, q9, #12
	vsri.u32	q6, q8, #20
	vsri.u32	q7, q9, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vld1.8		{d16}, [ip, :64]
	vadd.i32	q0, q0, q4
	vadd.i32	q1, q1, q5
	vadd.i32	q2, q2, q6
	vadd.i32	q3, q3, q7

	veor		q12, q12, q0
	veor		q13, q13, q1
	veor		q14, q14, q2
	veor		q15, q15, q3

	vtbl.8		d24, {d24}, d16
	vtbl.8		d25, {d25}, d16
	vtbl.8		d26, {d26}, d16
	vtbl.8		d27, {d27}, d16
	vtbl.8		d28, {d28}, d16
	vtbl.8		d29, {d29}, d16
	vtbl.8		d30, {d30}, d16
	vtbl.8		d31, {d31}, d16

	vld1.32		{q8-q9}, [sp, :256]

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vadd.i32	q8, q8, q12
	vadd.i32	q9, q9, q13
	vadd.i32	q10, q10, q14
	vadd.i32	q11, q11, q15

	vst1.32		{q8-q9}, [sp, :256]

	veor		q8, q4, q8
	veor		q9, q5, q9
	vshl.u32	q4, q8, #7
	vshl.u32	q5, q9, #7
	vsri.u32	q4, q8, #25
	vsri.u32	q5, q9, #25

	veor		q8, q6, q10
	veor		q9, q7, q11
	vshl.u32	q6, q8, #7
	vshl.u32	q7, q9, #7
	vsri.u32	q6, q8, #25
	vsri.u32	q7, q9, #25

	vld1.32		{q8-q9}, [sp, :256]

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vadd.i32	q0, q0, q5
	vadd.i32	q1, q1, q6
	vadd.i32	q2, q2, q7
	vadd.i32	q3, q3, q4

	veor		q15, q15, q0
	veor		q12, q12, q1
	veor		q13, q13, q2
	veor		q14, q14, q3

	vrev32.16	q15, q15
	vrev32.16	q12, q12
	vrev32.16	q13, q13
	vrev32.16	q14, q14

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vadd.i32	q10, q10, q15
	vadd.i32	q11, q11, q12
	vadd.i32	q8, q8, q13
	vadd.i32	q9, q9, q14

	vst1.32		{q8-q9}, [sp, :256]

	veor		q8, q7, q8
	veor		q9, q4, q9
	vshl.u32	q7, q8, #12
	vshl.u32	q4, q9, #12
	vsri.u32	q7, q8, #20
	vsri.u32	q4, q9, #20

	veor		q8, q5, q10
	veor		q9, q6, q11
	vshl.u32	q5, q8, #12
	vshl.u32	q6, q9, #12
	vsri.u32	q5, q8, #20
	vsri.u32	q6, q9, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vld1.8		{d16}, [ip, :64]
	vadd.i32	q0, q0, q5
	vadd.i32	q1, q1, q6
	vadd.i32	q2, q2, q7
	vadd.i32	q3, q3, q4

	veor		q15, q15, q0
	veor		q12, q12, q1
	veor		q13, q13, q2
	veor		q14, q14, q3

	vtbl.8		d30, {d30}, d16
	vtbl.8		d31, {d31}, d16
	vtbl.8		d24, {d24}, d16
	vtbl.8		d25, {d25}, d16
	vtbl.8		d26, {d26}, d16
	vtbl.8		d27, {d27}, d16
	vtbl.8		d28, {d28}, d16
	vtbl.8		d29, {d29}, d16

	vld1.32		{q8-q9}, [sp, :256]

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vadd.i32	q10, q10, q15
	vadd.i32	q11, q11, q12
	vadd.i32	q8, q8, q13
	vadd.i32	q9, q9, q14

	vst1.32		{q8-q9}, [sp, :256]

	veor		q8, q7, q8
	veor		q9, q4, q9
	vshl.u32	q7, q8, #7
	vshl.u32	q4, q9, #7
	vsri.u32	q7, q8, #25
	vsri.u32	q4, q9, #25

	veor		q8, q5, q10
	veor		q9, q6, q11
	vshl.u32	q5, q8, #7
	vshl.u32	q6, q9, #7
	vsri.u32	q5, q8, #25
	vsri.u32	q6, q9, #25

	subs		r3, r3, #2
	bne		.Ldoubleround4

	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
	// x8..9[0-3] are on the stack.

	// Re-interleave the words in the first two rows of each block (x0..7).
	// Also add the counter values 0-3 to x12[0-3].
	vld1.32		{q8}, [lr, :128]	// load counter values 0-3
	vzip.32		q0, q1			// => (0 1 0 1) (0 1 0 1)
	vzip.32		q2, q3			// => (2 3 2 3) (2 3 2 3)
	vzip.32		q4, q5			// => (4 5 4 5) (4 5 4 5)
	vzip.32		q6, q7			// => (6 7 6 7) (6 7 6 7)
	vadd.u32	q12, q8			// x12 += counter values 0-3
	vswp		d1, d4
	vswp		d3, d6
	vld1.32		{q8-q9}, [r0]!		// load s0..7
	vswp		d9, d12
	vswp		d11, d14

	// Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
	// after XORing the first 32 bytes.
	vswp		q1, q4

	// First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)

	// x0..3[0-3] += s0..3[0-3]	(add orig state to 1st row of each block)
	vadd.u32	q0, q0, q8
	vadd.u32	q2, q2, q8
	vadd.u32	q4, q4, q8
	vadd.u32	q3, q3, q8

	// x4..7[0-3] += s4..7[0-3]	(add orig state to 2nd row of each block)
	vadd.u32	q1, q1, q9
	vadd.u32	q6, q6, q9
	vadd.u32	q5, q5, q9
	vadd.u32	q7, q7, q9

	// XOR first 32 bytes using keystream from first two rows of first block
	vld1.8		{q8-q9}, [r2]!
	veor		q8, q8, q0
	veor		q9, q9, q1
	vst1.8		{q8-q9}, [r1]!

	// Re-interleave the words in the last two rows of each block (x8..15).
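	// x8..9[0-3] are reloaded from the scratch buffer on the stack first;
	// sp is then restored and the byte count (the fifth argument, passed
	// on the stack by the caller) is loaded into r4.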
	vld1.32		{q8-q9}, [sp, :256]
	mov		sp, r4		// restore original stack pointer
	ldr		r4, [r4, #8]	// load number of bytes
	vzip.32		q12, q13	// => (12 13 12 13) (12 13 12 13)
	vzip.32		q14, q15	// => (14 15 14 15) (14 15 14 15)
	vzip.32		q8, q9		// => (8 9 8 9) (8 9 8 9)
	vzip.32		q10, q11	// => (10 11 10 11) (10 11 10 11)
	vld1.32		{q0-q1}, [r0]	// load s8..15
	vswp		d25, d28
	vswp		d27, d30
	vswp		d17, d20
	vswp		d19, d22

	// Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)

	// x8..11[0-3] += s8..11[0-3]	(add orig state to 3rd row of each block)
	vadd.u32	q8, q8, q0
	vadd.u32	q10, q10, q0
	vadd.u32	q9, q9, q0
	vadd.u32	q11, q11, q0

	// x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
	vadd.u32	q12, q12, q1
	vadd.u32	q14, q14, q1
	vadd.u32	q13, q13, q1
	vadd.u32	q15, q15, q1

	// XOR the rest of the data with the keystream

	vld1.8		{q0-q1}, [r2]!
	subs		r4, r4, #96
	veor		q0, q0, q8
	veor		q1, q1, q12
	ble		.Lle96
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	subs		r4, r4, #32
	veor		q0, q0, q2
	veor		q1, q1, q6
	ble		.Lle128
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	subs		r4, r4, #32
	veor		q0, q0, q10
	veor		q1, q1, q14
	ble		.Lle160
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	subs		r4, r4, #32
	veor		q0, q0, q4
	veor		q1, q1, q5
	ble		.Lle192
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	subs		r4, r4, #32
	veor		q0, q0, q9
	veor		q1, q1, q13
	ble		.Lle224
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	subs		r4, r4, #32
	veor		q0, q0, q3
	veor		q1, q1, q7
	blt		.Llt256
.Lout:
	vst1.8		{q0-q1}, [r1]!
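
	// XOR and store the last 32 bytes of output below; q11/q15 hold the
	// keystream for this final 32-byte chunk.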
	vld1.8		{q0-q1}, [r2]
	veor		q0, q0, q11
	veor		q1, q1, q15
	vst1.8		{q0-q1}, [r1]

	pop		{r4, pc}

.Lle192:
	vmov		q4, q9
	vmov		q5, q13

.Lle160:
	// nothing to do

.Lfinalblock:
	// Process the final block if processing less than 4 full blocks.
	// Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
	// previous 32 byte output block that still needs to be written at
	// [r1] in q0-q1.
	beq		.Lfullblock

.Lpartialblock:
	adr		lr, .Lpermute + 32
	add		r2, r2, r4
	add		lr, lr, r4
	add		r4, r4, r1

	vld1.8		{q2-q3}, [lr]
	vld1.8		{q6-q7}, [r2]

	add		r4, r4, #32

	vtbl.8		d4, {q4-q5}, d4
	vtbl.8		d5, {q4-q5}, d5
	vtbl.8		d6, {q4-q5}, d6
	vtbl.8		d7, {q4-q5}, d7

	veor		q6, q6, q2
	veor		q7, q7, q3

	vst1.8		{q6-q7}, [r4]	// overlapping stores
	vst1.8		{q0-q1}, [r1]
	pop		{r4, pc}

.Lfullblock:
	vmov		q11, q4
	vmov		q15, q5
	b		.Lout
.Lle96:
	vmov		q4, q2
	vmov		q5, q6
	b		.Lfinalblock
.Lle128:
	vmov		q4, q10
	vmov		q5, q14
	b		.Lfinalblock
.Lle224:
	vmov		q4, q3
	vmov		q5, q7
	b		.Lfinalblock
.Llt256:
	vmov		q4, q11
	vmov		q5, q15
	b		.Lpartialblock
ENDPROC(chacha_4block_xor_neon)

	.align		L1_CACHE_SHIFT
.Lpermute:
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
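
/*
 * The .Lpermute table above is simply the byte indices 0..31 stored twice in
 * a row.  .Lpartialblock reads 32 bytes of it starting at .Lpermute plus the
 * length of the final partial chunk, and uses the result as a vtbl.8 index
 * vector to rotate the 32 bytes of keystream in q4-q5 so that the bytes
 * needed for the tail of the buffer line up with the overlapping 32-byte
 * store that ends exactly at the end of the output.
 */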