1 // SPDX-License-Identifier: LGPL-2.1 2 /* 3 * rseq.c 4 * 5 * Copyright (C) 2016 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> 6 * 7 * This library is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; only 10 * version 2.1 of the License. 11 * 12 * This library is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 */ 17 18 #define _GNU_SOURCE 19 #include <errno.h> 20 #include <sched.h> 21 #include <stdio.h> 22 #include <stdlib.h> 23 #include <string.h> 24 #include <unistd.h> 25 #include <syscall.h> 26 #include <assert.h> 27 #include <signal.h> 28 #include <limits.h> 29 #include <dlfcn.h> 30 #include <stddef.h> 31 #include <sys/auxv.h> 32 #include <linux/auxvec.h> 33 34 #include <linux/compiler.h> 35 36 #include "kselftest.h" 37 #include "rseq.h" 38 39 /* 40 * Define weak versions to play nice with binaries that are statically linked 41 * against a libc that doesn't support registering its own rseq. 42 */ 43 extern __weak ptrdiff_t __rseq_offset; 44 extern __weak unsigned int __rseq_size; 45 extern __weak unsigned int __rseq_flags; 46 47 static const ptrdiff_t *libc_rseq_offset_p = &__rseq_offset; 48 static const unsigned int *libc_rseq_size_p = &__rseq_size; 49 static const unsigned int *libc_rseq_flags_p = &__rseq_flags; 50 51 /* Offset from the thread pointer to the rseq area. */ 52 ptrdiff_t rseq_offset; 53 54 /* 55 * Size of the registered rseq area. 0 if the registration was 56 * unsuccessful. 57 */ 58 unsigned int rseq_size = -1U; 59 static unsigned int rseq_alloc_size; 60 61 /* Flags used during rseq registration. */ 62 unsigned int rseq_flags; 63 64 static int rseq_ownership; 65 66 /* Allocate a large area for the TLS. */ 67 #define RSEQ_THREAD_AREA_ALLOC_SIZE 1024 68 69 /* Original struct rseq feature size is 20 bytes. */ 70 #define ORIG_RSEQ_FEATURE_SIZE 20 71 72 /* Original struct rseq allocation size is 32 bytes. */ 73 #define ORIG_RSEQ_ALLOC_SIZE 32 74 75 /* 76 * Use a union to ensure we allocate a TLS area of 1024 bytes to accomodate an 77 * rseq registration that is larger than the current rseq ABI. 78 */ 79 union rseq_tls { 80 struct rseq_abi abi; 81 char dummy[RSEQ_THREAD_AREA_ALLOC_SIZE]; 82 }; 83 84 static 85 __thread union rseq_tls __rseq __attribute__((tls_model("initial-exec"))) = { 86 .abi = { 87 .cpu_id = RSEQ_ABI_CPU_ID_UNINITIALIZED, 88 }, 89 }; 90 91 static int sys_rseq(struct rseq_abi *rseq_abi, uint32_t rseq_len, 92 int flags, uint32_t sig) 93 { 94 return syscall(__NR_rseq, rseq_abi, rseq_len, flags, sig); 95 } 96 97 static int sys_getcpu(unsigned *cpu, unsigned *node) 98 { 99 return syscall(__NR_getcpu, cpu, node, NULL); 100 } 101 102 bool rseq_available(void) 103 { 104 int rc; 105 106 rc = sys_rseq(NULL, 0, 0, 0); 107 if (rc != -1) 108 abort(); 109 switch (errno) { 110 case ENOSYS: 111 return false; 112 case EINVAL: 113 return true; 114 default: 115 abort(); 116 } 117 } 118 119 /* 120 * Return the feature size supported by the kernel. 121 * 122 * Depending on the value returned by getauxval(AT_RSEQ_FEATURE_SIZE): 123 * 124 * 0: Return ORIG_RSEQ_FEATURE_SIZE (20) 125 * > 0: Return the value from getauxval(AT_RSEQ_FEATURE_SIZE). 126 * 127 * It should never return a value below ORIG_RSEQ_FEATURE_SIZE. 128 */ 129 static unsigned int get_rseq_kernel_feature_size(void) 130 { 131 unsigned long auxv_rseq_feature_size, auxv_rseq_align; 132 133 auxv_rseq_align = getauxval(AT_RSEQ_ALIGN); 134 assert(!auxv_rseq_align || auxv_rseq_align <= RSEQ_THREAD_AREA_ALLOC_SIZE); 135 136 auxv_rseq_feature_size = getauxval(AT_RSEQ_FEATURE_SIZE); 137 assert(!auxv_rseq_feature_size || auxv_rseq_feature_size <= RSEQ_THREAD_AREA_ALLOC_SIZE); 138 if (auxv_rseq_feature_size) 139 return auxv_rseq_feature_size; 140 else 141 return ORIG_RSEQ_FEATURE_SIZE; 142 } 143 144 int __rseq_register_current_thread(bool nolibc, bool legacy) 145 { 146 unsigned int size; 147 int rc; 148 149 if (!rseq_ownership) { 150 /* Treat libc's ownership as a successful registration. */ 151 return nolibc ? -EBUSY : 0; 152 } 153 154 /* The minimal allocation size is 32, which is the legacy allocation size */ 155 size = get_rseq_kernel_feature_size(); 156 if (legacy || size < ORIG_RSEQ_ALLOC_SIZE) 157 rseq_alloc_size = ORIG_RSEQ_ALLOC_SIZE; 158 else 159 rseq_alloc_size = size; 160 161 rc = sys_rseq(&__rseq.abi, rseq_alloc_size, 0, RSEQ_SIG); 162 if (rc) { 163 /* 164 * After at least one thread has registered successfully 165 * (rseq_size > 0), the registration of other threads should 166 * never fail. 167 */ 168 if (RSEQ_READ_ONCE(rseq_size) > 0) { 169 /* Incoherent success/failure within process. */ 170 abort(); 171 } 172 return -1; 173 } 174 assert(rseq_current_cpu_raw() >= 0); 175 176 /* 177 * The first thread to register sets the rseq_size to mimic the libc 178 * behavior. 179 */ 180 if (RSEQ_READ_ONCE(rseq_size) == 0) 181 RSEQ_WRITE_ONCE(rseq_size, size); 182 183 return 0; 184 } 185 186 int rseq_unregister_current_thread(void) 187 { 188 int rc; 189 190 if (!rseq_ownership) { 191 /* Treat libc's ownership as a successful unregistration. */ 192 return 0; 193 } 194 rc = sys_rseq(&__rseq.abi, rseq_alloc_size, RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG); 195 if (rc) 196 return -1; 197 return 0; 198 } 199 200 static __attribute__((constructor)) 201 void rseq_init(void) 202 { 203 /* 204 * If the libc's registered rseq size isn't already valid, it may be 205 * because the binary is dynamically linked and not necessarily due to 206 * libc not having registered a restartable sequence. Try to find the 207 * symbols if that's the case. 208 */ 209 if (!libc_rseq_size_p || !*libc_rseq_size_p) { 210 libc_rseq_offset_p = dlsym(RTLD_NEXT, "__rseq_offset"); 211 libc_rseq_size_p = dlsym(RTLD_NEXT, "__rseq_size"); 212 libc_rseq_flags_p = dlsym(RTLD_NEXT, "__rseq_flags"); 213 } 214 if (libc_rseq_size_p && libc_rseq_offset_p && libc_rseq_flags_p && 215 *libc_rseq_size_p != 0) { 216 unsigned int libc_rseq_size; 217 218 /* rseq registration owned by glibc */ 219 rseq_offset = *libc_rseq_offset_p; 220 libc_rseq_size = *libc_rseq_size_p; 221 rseq_flags = *libc_rseq_flags_p; 222 223 /* 224 * Previous versions of glibc expose the value 225 * 32 even though the kernel only supported 20 226 * bytes initially. Therefore treat 32 as a 227 * special-case. glibc 2.40 exposes a 20 bytes 228 * __rseq_size without using getauxval(3) to 229 * query the supported size, while still allocating a 32 230 * bytes area. Also treat 20 as a special-case. 231 * 232 * Special-cases are handled by using the following 233 * value as active feature set size: 234 * 235 * rseq_size = min(32, get_rseq_kernel_feature_size()) 236 */ 237 switch (libc_rseq_size) { 238 case ORIG_RSEQ_FEATURE_SIZE: 239 fallthrough; 240 case ORIG_RSEQ_ALLOC_SIZE: 241 { 242 unsigned int rseq_kernel_feature_size = get_rseq_kernel_feature_size(); 243 244 if (rseq_kernel_feature_size < ORIG_RSEQ_ALLOC_SIZE) 245 rseq_size = rseq_kernel_feature_size; 246 else 247 rseq_size = ORIG_RSEQ_ALLOC_SIZE; 248 break; 249 } 250 default: 251 /* Otherwise just use the __rseq_size from libc as rseq_size. */ 252 rseq_size = libc_rseq_size; 253 break; 254 } 255 return; 256 } 257 rseq_ownership = 1; 258 259 /* Calculate the offset of the rseq area from the thread pointer. */ 260 rseq_offset = (void *)&__rseq.abi - rseq_thread_pointer(); 261 262 /* rseq flags are deprecated, always set to 0. */ 263 rseq_flags = 0; 264 265 /* 266 * Set the size to 0 until at least one thread registers to mimic the 267 * libc behavior. 268 */ 269 rseq_size = 0; 270 } 271 272 static __attribute__((destructor)) 273 void rseq_exit(void) 274 { 275 if (!rseq_ownership) 276 return; 277 rseq_offset = 0; 278 rseq_size = -1U; 279 rseq_ownership = 0; 280 } 281 282 int32_t rseq_fallback_current_cpu(void) 283 { 284 int32_t cpu; 285 286 cpu = sched_getcpu(); 287 if (cpu < 0) { 288 perror("sched_getcpu()"); 289 abort(); 290 } 291 return cpu; 292 } 293 294 int32_t rseq_fallback_current_node(void) 295 { 296 uint32_t cpu_id, node_id; 297 int ret; 298 299 ret = sys_getcpu(&cpu_id, &node_id); 300 if (ret) { 301 perror("sys_getcpu()"); 302 return ret; 303 } 304 return (int32_t) node_id; 305 } 306