/*-
 * Copyright (c) 2023 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4,0x90	/* 16-byte alignment, nop filled */

/*
 * timingsafe_bcmp: compare two equally long buffers.
 *
 * Register interface shared by both variants in this file (AT&T syntax):
 *
 *	rdi	first buffer
 *	rsi	second buffer
 *	rdx	number of bytes to compare
 *	eax	result: zero if every byte matches (or rdx is zero),
 *		nonzero otherwise
 *
 * NOTE(review): this presumably backs the C prototype
 * int timingsafe_bcmp(const void *, const void *, size_t); the prototype
 * itself is not visible in this file.
 *
 * Every conditional branch below tests only the length in rdx or the
 * loop offset in rcx, never the buffer contents.  Differences are
 * accumulated with XOR/OR (scalar) or PCMPEQB/PAND (baseline) and looked
 * at once, right before returning.
 *
 * Buffers that do not fill a whole number of chunks are handled by
 * loading a chunk from the start and a chunk ending at the last byte.
 * The two may overlap; checking a byte twice does not change the result.
 *
 * The table below lists the two variants defined in this file.  The
 * macros come from amd64_archlevel.h; how a variant is chosen is not
 * visible here.
 */
ARCHFUNCS(timingsafe_bcmp)
	ARCHFUNC(timingsafe_bcmp, scalar)
	ARCHFUNC(timingsafe_bcmp, baseline)
ENDARCHFUNCS(timingsafe_bcmp)

/*
 * Scalar variant: general purpose registers only.
 * Clobbers rax, rcx, r8, r9 and the flags.
 */
ARCHENTRY(timingsafe_bcmp, scalar)
	/* pick a size class, largest first */
	cmpq	$16, %rdx		# more than 16 bytes?
	ja	.Lscalar_long

	cmpl	$8, %edx		# 9--16 bytes?
	ja	.Lscalar_9to16

	cmpl	$4, %edx		# 5--8 bytes?
	ja	.Lscalar_5to8

	cmpl	$2, %edx		# 3--4 bytes?
	ja	.Lscalar_3to4

	testl	%edx, %edx		# anything to compare at all?
	jnz	.Lscalar_1to2

	xorl	%eax, %eax		# zero length counts as a match
	ret

	/* 1--2 bytes: first and last byte (the same byte if rdx is 1) */
.Lscalar_1to2:
	movzbl	(%rdi), %eax		# first byte of buffer one
	movzbl	-1(%rdi, %rdx, 1), %ecx	# last byte of buffer one
	xorb	(%rsi), %al		# difference to buffer two
	xorb	-1(%rsi, %rdx, 1), %cl
	orl	%ecx, %eax		# nonzero if either byte differs
	ret

	/* 3--4 bytes: first and last 16-bit word */
.Lscalar_3to4:
	movzwl	(%rdi), %eax
	movzwl	-2(%rdi, %rdx, 1), %ecx
	xorw	(%rsi), %ax
	xorw	-2(%rsi, %rdx, 1), %cx
	orl	%ecx, %eax
	ret

	/* 5--8 bytes: first and last 32-bit word */
.Lscalar_5to8:
	movl	(%rdi), %eax
	movl	-4(%rdi, %rdx, 1), %ecx
	xorl	(%rsi), %eax
	xorl	-4(%rsi, %rdx, 1), %ecx
	orl	%ecx, %eax
	ret

	/* 9--16 bytes: first and last 64-bit word */
.Lscalar_9to16:
	movq	(%rdi), %rax
	movq	-8(%rdi, %rdx, 1), %rcx
	xorq	(%rsi), %rax
	xorq	-8(%rsi, %rdx, 1), %rcx
	orq	%rcx, %rax
	setnz	%al			# a difference confined to the high
	ret				# half of RAX must still show in EAX

	/*
	 * More than 16 bytes.  RAX collects the differences, RCX is the
	 * offset just past the chunk the next loop iteration would read.
	 */
.Lscalar_long:
	movq	(%rdi), %rax		# bytes 0--15 up front
	movq	8(%rdi), %r9
	movl	$32, %ecx
	xorq	(%rsi), %rax
	xorq	8(%rsi), %r9
	orq	%r9, %rax

	cmpq	%rdx, %rcx		# 32 bytes or fewer in total?
	jae	.Lscalar_tail		# then the tail covers the rest

	/* 16 bytes per iteration, as long as RCX stays below the length */
	ALIGN_TEXT
.Lscalar_loop:
	movq	-16(%rdi, %rcx, 1), %r8
	movq	-8(%rdi, %rcx, 1), %r9
	xorq	-16(%rsi, %rcx, 1), %r8
	xorq	-8(%rsi, %rcx, 1), %r9
	addq	$16, %rcx
	orq	%r9, %r8
	orq	%r8, %rax

	cmpq	%rdx, %rcx
	jb	.Lscalar_loop

	/* final 16 bytes, addressed from the end of the buffers */
.Lscalar_tail:
	movq	-16(%rdi, %rdx, 1), %r8
	movq	-8(%rdi, %rdx, 1), %r9
	xorq	-16(%rsi, %rdx, 1), %r8
	xorq	-8(%rsi, %rdx, 1), %r9
	orq	%r9, %r8
	orq	%r8, %rax
	setnz	%al			# fold all 64 bits of RAX into EAX
	ret
ARCHEND(timingsafe_bcmp, scalar)

/*
 * Baseline variant: same as the scalar one up to 16 bytes, 16-byte XMM
 * registers beyond that.
 * Clobbers rax, rcx, xmm0--xmm4 and the flags.
 */
ARCHENTRY(timingsafe_bcmp, baseline)
	/* pick a size class, largest first */
	cmpq	$32, %rdx		# more than 32 bytes?
	ja	.Lbase_long

	cmpl	$16, %edx		# 17--32 bytes?
	ja	.Lbase_17to32

	cmpl	$8, %edx		# 9--16 bytes?
	ja	.Lbase_9to16

	cmpl	$4, %edx		# 5--8 bytes?
	ja	.Lbase_5to8

	cmpl	$2, %edx		# 3--4 bytes?
	ja	.Lbase_3to4

	testl	%edx, %edx		# anything to compare at all?
	jnz	.Lbase_1to2

	xorl	%eax, %eax		# zero length counts as a match
	ret

	/* 1--2 bytes: first and last byte (the same byte if rdx is 1) */
.Lbase_1to2:
	movzbl	(%rdi), %eax		# first byte of buffer one
	movzbl	-1(%rdi, %rdx, 1), %ecx	# last byte of buffer one
	xorb	(%rsi), %al		# difference to buffer two
	xorb	-1(%rsi, %rdx, 1), %cl
	orl	%ecx, %eax		# nonzero if either byte differs
	ret

	/* 3--4 bytes: first and last 16-bit word */
.Lbase_3to4:
	movzwl	(%rdi), %eax
	movzwl	-2(%rdi, %rdx, 1), %ecx
	xorw	(%rsi), %ax
	xorw	-2(%rsi, %rdx, 1), %cx
	orl	%ecx, %eax
	ret

	/* 5--8 bytes: first and last 32-bit word */
.Lbase_5to8:
	movl	(%rdi), %eax
	movl	-4(%rdi, %rdx, 1), %ecx
	xorl	(%rsi), %eax
	xorl	-4(%rsi, %rdx, 1), %ecx
	orl	%ecx, %eax
	ret

	/* 9--16 bytes: first and last 64-bit word */
.Lbase_9to16:
	movq	(%rdi), %rax
	movq	-8(%rdi, %rdx, 1), %rcx
	xorq	(%rsi), %rax
	xorq	-8(%rsi, %rdx, 1), %rcx
	orq	%rcx, %rax
	setnz	%al			# a difference confined to the high
	ret				# half of RAX must still show in EAX

	/* 17--32 bytes: first and last 16-byte chunk */
.Lbase_17to32:
	movdqu	(%rdi), %xmm0
	movdqu	(%rsi), %xmm2
	movdqu	-16(%rdi, %rdx, 1), %xmm1
	movdqu	-16(%rsi, %rdx, 1), %xmm3
	pcmpeqb	%xmm2, %xmm0		# per-byte equality, first chunk
	pcmpeqb	%xmm3, %xmm1		# per-byte equality, last chunk
	pand	%xmm1, %xmm0		# equal in both chunks
	pmovmskb %xmm0, %eax		# one bit per byte, set where equal
	xorl	$0xffff, %eax		# flip: set where a byte differs
	ret

	/*
	 * More than 32 bytes.  XMM4 holds the running per-byte equality
	 * mask, RCX is the offset just past the chunk pair the next loop
	 * iteration would read.
	 */
.Lbase_long:
	movdqu	(%rdi), %xmm4		# bytes 0--31 up front
	movdqu	(%rsi), %xmm2
	movdqu	16(%rdi), %xmm1
	movdqu	16(%rsi), %xmm3
	movl	$64, %ecx
	pcmpeqb	%xmm2, %xmm4
	pcmpeqb	%xmm3, %xmm1
	pand	%xmm1, %xmm4
	cmpq	%rdx, %rcx		# 64 bytes or fewer in total?
	jae	.Lbase_tail		# then the tail covers the rest

	/* 32 bytes per iteration, as long as RCX stays below the length */
	ALIGN_TEXT
.Lbase_loop:
	movdqu	-32(%rdi, %rcx, 1), %xmm0
	movdqu	-32(%rsi, %rcx, 1), %xmm2
	movdqu	-16(%rdi, %rcx, 1), %xmm1
	movdqu	-16(%rsi, %rcx, 1), %xmm3
	addq	$32, %rcx
	pcmpeqb	%xmm2, %xmm0
	pcmpeqb	%xmm3, %xmm1
	pand	%xmm1, %xmm0
	pand	%xmm0, %xmm4		# merge into the running mask
	cmpq	%rdx, %rcx
	jb	.Lbase_loop

	/* final 32 bytes, addressed from the end of the buffers */
.Lbase_tail:
	movdqu	-32(%rdi, %rdx, 1), %xmm0
	movdqu	-32(%rsi, %rdx, 1), %xmm2
	movdqu	-16(%rdi, %rdx, 1), %xmm1
	movdqu	-16(%rsi, %rdx, 1), %xmm3
	pcmpeqb	%xmm2, %xmm0
	pcmpeqb	%xmm3, %xmm1
	pand	%xmm1, %xmm0
	pand	%xmm4, %xmm0		# combine with everything seen so far
	pmovmskb %xmm0, %eax		# one bit per byte, set where equal
	xorl	$0xffff, %eax		# flip: set where a byte differs
	ret
ARCHEND(timingsafe_bcmp, baseline)

	.section .note.GNU-stack,"",%progbits