1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 32-bit system call dispatch */ 3 4 #include <linux/linkage.h> 5 #include <linux/sys.h> 6 #include <linux/cache.h> 7 #include <linux/syscalls.h> 8 #include <linux/entry-common.h> 9 #include <linux/nospec.h> 10 #include <linux/uaccess.h> 11 #include <asm/apic.h> 12 #include <asm/traps.h> 13 #include <asm/cpufeature.h> 14 #include <asm/syscall.h> 15 16 #ifdef CONFIG_IA32_EMULATION 17 #define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, compat) 18 #else 19 #define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native) 20 #endif 21 22 #define __SYSCALL(nr, sym) extern long __ia32_##sym(const struct pt_regs *); 23 #define __SYSCALL_NORETURN(nr, sym) extern long __noreturn __ia32_##sym(const struct pt_regs *); 24 #include <asm/syscalls_32.h> 25 #undef __SYSCALL 26 27 #undef __SYSCALL_NORETURN 28 #define __SYSCALL_NORETURN __SYSCALL 29 30 /* 31 * The sys_call_table[] is no longer used for system calls, but 32 * kernel/trace/trace_syscalls.c still wants to know the system 33 * call address. 34 */ 35 #ifdef CONFIG_X86_32 36 #define __SYSCALL(nr, sym) __ia32_##sym, 37 const sys_call_ptr_t sys_call_table[] = { 38 #include <asm/syscalls_32.h> 39 }; 40 #undef __SYSCALL 41 #endif 42 43 #define __SYSCALL(nr, sym) case nr: return __ia32_##sym(regs); 44 long ia32_sys_call(const struct pt_regs *regs, unsigned int nr) 45 { 46 switch (nr) { 47 #include <asm/syscalls_32.h> 48 default: return __ia32_sys_ni_syscall(regs); 49 } 50 } 51 52 static __always_inline int syscall_32_enter(struct pt_regs *regs) 53 { 54 if (IS_ENABLED(CONFIG_IA32_EMULATION)) 55 current_thread_info()->status |= TS_COMPAT; 56 57 return (int)regs->orig_ax; 58 } 59 60 #ifdef CONFIG_IA32_EMULATION 61 bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED); 62 63 static int __init ia32_emulation_override_cmdline(char *arg) 64 { 65 return kstrtobool(arg, &__ia32_enabled); 66 } 67 early_param("ia32_emulation", ia32_emulation_override_cmdline); 68 #endif 69 70 /* 71 * Invoke a 32-bit syscall. Called with IRQs on in CT_STATE_KERNEL. 72 */ 73 static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr) 74 { 75 /* 76 * Convert negative numbers to very high and thus out of range 77 * numbers for comparisons. 78 */ 79 unsigned int unr = nr; 80 81 if (likely(unr < IA32_NR_syscalls)) { 82 unr = array_index_nospec(unr, IA32_NR_syscalls); 83 regs->ax = ia32_sys_call(regs, unr); 84 } else if (nr != -1) { 85 regs->ax = __ia32_sys_ni_syscall(regs); 86 } 87 } 88 89 #ifdef CONFIG_IA32_EMULATION 90 static __always_inline bool int80_is_external(void) 91 { 92 const unsigned int offs = (0x80 / 32) * 0x10; 93 const u32 bit = BIT(0x80 % 32); 94 95 /* The local APIC on XENPV guests is fake */ 96 if (cpu_feature_enabled(X86_FEATURE_XENPV)) 97 return false; 98 99 /* 100 * If vector 0x80 is set in the APIC ISR then this is an external 101 * interrupt. Either from broken hardware or injected by a VMM. 102 * 103 * Note: In guest mode this is only valid for secure guests where 104 * the secure module fully controls the vAPIC exposed to the guest. 105 */ 106 return apic_read(APIC_ISR + offs) & bit; 107 } 108 109 /** 110 * do_int80_emulation - 32-bit legacy syscall C entry from asm 111 * @regs: syscall arguments in struct pt_args on the stack. 112 * 113 * This entry point can be used by 32-bit and 64-bit programs to perform 114 * 32-bit system calls. Instances of INT $0x80 can be found inline in 115 * various programs and libraries. It is also used by the vDSO's 116 * __kernel_vsyscall fallback for hardware that doesn't support a faster 117 * entry method. Restarted 32-bit system calls also fall back to INT 118 * $0x80 regardless of what instruction was originally used to do the 119 * system call. 120 * 121 * This is considered a slow path. It is not used by most libc 122 * implementations on modern hardware except during process startup. 123 * 124 * The arguments for the INT $0x80 based syscall are on stack in the 125 * pt_regs structure: 126 * eax: system call number 127 * ebx, ecx, edx, esi, edi, ebp: arg1 - arg 6 128 */ 129 __visible noinstr void do_int80_emulation(struct pt_regs *regs) 130 { 131 int nr; 132 133 /* Kernel does not use INT $0x80! */ 134 if (unlikely(!user_mode(regs))) { 135 irqentry_enter(regs); 136 instrumentation_begin(); 137 panic("Unexpected external interrupt 0x80\n"); 138 } 139 140 /* 141 * Establish kernel context for instrumentation, including for 142 * int80_is_external() below which calls into the APIC driver. 143 * Identical for soft and external interrupts. 144 */ 145 enter_from_user_mode(regs); 146 147 instrumentation_begin(); 148 add_random_kstack_offset(); 149 150 /* Validate that this is a soft interrupt to the extent possible */ 151 if (unlikely(int80_is_external())) 152 panic("Unexpected external interrupt 0x80\n"); 153 154 /* 155 * The low level idtentry code pushed -1 into regs::orig_ax 156 * and regs::ax contains the syscall number. 157 * 158 * User tracing code (ptrace or signal handlers) might assume 159 * that the regs::orig_ax contains a 32-bit number on invoking 160 * a 32-bit syscall. 161 * 162 * Establish the syscall convention by saving the 32bit truncated 163 * syscall number in regs::orig_ax and by invalidating regs::ax. 164 */ 165 regs->orig_ax = regs->ax & GENMASK(31, 0); 166 regs->ax = -ENOSYS; 167 168 nr = syscall_32_enter(regs); 169 170 local_irq_enable(); 171 nr = syscall_enter_from_user_mode_work(regs, nr); 172 do_syscall_32_irqs_on(regs, nr); 173 174 instrumentation_end(); 175 syscall_exit_to_user_mode(regs); 176 } 177 178 #ifdef CONFIG_X86_FRED 179 /* 180 * A FRED-specific INT80 handler is warranted for the follwing reasons: 181 * 182 * 1) As INT instructions and hardware interrupts are separate event 183 * types, FRED does not preclude the use of vector 0x80 for external 184 * interrupts. As a result, the FRED setup code does not reserve 185 * vector 0x80 and calling int80_is_external() is not merely 186 * suboptimal but actively incorrect: it could cause a system call 187 * to be incorrectly ignored. 188 * 189 * 2) It is called only for handling vector 0x80 of event type 190 * EVENT_TYPE_SWINT and will never be called to handle any external 191 * interrupt (event type EVENT_TYPE_EXTINT). 192 * 193 * 3) FRED has separate entry flows depending on if the event came from 194 * user space or kernel space, and because the kernel does not use 195 * INT insns, the FRED kernel entry handler fred_entry_from_kernel() 196 * falls through to fred_bad_type() if the event type is 197 * EVENT_TYPE_SWINT, i.e., INT insns. So if the kernel is handling 198 * an INT insn, it can only be from a user level. 199 * 200 * 4) int80_emulation() does a CLEAR_BRANCH_HISTORY. While FRED will 201 * likely take a different approach if it is ever needed: it 202 * probably belongs in either fred_intx()/ fred_other() or 203 * asm_fred_entrypoint_user(), depending on if this ought to be done 204 * for all entries from userspace or only system 205 * calls. 206 * 207 * 5) INT $0x80 is the fast path for 32-bit system calls under FRED. 208 */ 209 DEFINE_FREDENTRY_RAW(int80_emulation) 210 { 211 int nr; 212 213 enter_from_user_mode(regs); 214 215 instrumentation_begin(); 216 add_random_kstack_offset(); 217 218 /* 219 * FRED pushed 0 into regs::orig_ax and regs::ax contains the 220 * syscall number. 221 * 222 * User tracing code (ptrace or signal handlers) might assume 223 * that the regs::orig_ax contains a 32-bit number on invoking 224 * a 32-bit syscall. 225 * 226 * Establish the syscall convention by saving the 32bit truncated 227 * syscall number in regs::orig_ax and by invalidating regs::ax. 228 */ 229 regs->orig_ax = regs->ax & GENMASK(31, 0); 230 regs->ax = -ENOSYS; 231 232 nr = syscall_32_enter(regs); 233 234 local_irq_enable(); 235 nr = syscall_enter_from_user_mode_work(regs, nr); 236 do_syscall_32_irqs_on(regs, nr); 237 238 instrumentation_end(); 239 syscall_exit_to_user_mode(regs); 240 } 241 #endif /* CONFIG_X86_FRED */ 242 243 #else /* CONFIG_IA32_EMULATION */ 244 245 /* Handles int $0x80 on a 32bit kernel */ 246 __visible noinstr void do_int80_syscall_32(struct pt_regs *regs) 247 { 248 int nr = syscall_32_enter(regs); 249 250 add_random_kstack_offset(); 251 /* 252 * Subtlety here: if ptrace pokes something larger than 2^31-1 into 253 * orig_ax, the int return value truncates it. This matches 254 * the semantics of syscall_get_nr(). 255 */ 256 nr = syscall_enter_from_user_mode(regs, nr); 257 instrumentation_begin(); 258 259 do_syscall_32_irqs_on(regs, nr); 260 261 instrumentation_end(); 262 syscall_exit_to_user_mode(regs); 263 } 264 #endif /* !CONFIG_IA32_EMULATION */ 265 266 static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) 267 { 268 int nr = syscall_32_enter(regs); 269 int res; 270 271 add_random_kstack_offset(); 272 /* 273 * This cannot use syscall_enter_from_user_mode() as it has to 274 * fetch EBP before invoking any of the syscall entry work 275 * functions. 276 */ 277 syscall_enter_from_user_mode_prepare(regs); 278 279 instrumentation_begin(); 280 /* Fetch EBP from where the vDSO stashed it. */ 281 if (IS_ENABLED(CONFIG_X86_64)) { 282 /* 283 * Micro-optimization: the pointer we're following is 284 * explicitly 32 bits, so it can't be out of range. 285 */ 286 res = __get_user(*(u32 *)®s->bp, 287 (u32 __user __force *)(unsigned long)(u32)regs->sp); 288 } else { 289 res = get_user(*(u32 *)®s->bp, 290 (u32 __user __force *)(unsigned long)(u32)regs->sp); 291 } 292 293 if (res) { 294 /* User code screwed up. */ 295 regs->ax = -EFAULT; 296 297 local_irq_disable(); 298 instrumentation_end(); 299 irqentry_exit_to_user_mode(regs); 300 return false; 301 } 302 303 nr = syscall_enter_from_user_mode_work(regs, nr); 304 305 /* Now this is just like a normal syscall. */ 306 do_syscall_32_irqs_on(regs, nr); 307 308 instrumentation_end(); 309 syscall_exit_to_user_mode(regs); 310 return true; 311 } 312 313 /* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */ 314 __visible noinstr bool do_fast_syscall_32(struct pt_regs *regs) 315 { 316 /* 317 * Called using the internal vDSO SYSENTER/SYSCALL32 calling 318 * convention. Adjust regs so it looks like we entered using int80. 319 */ 320 unsigned long landing_pad = (unsigned long)current->mm->context.vdso + 321 vdso_image_32.sym_int80_landing_pad; 322 323 /* 324 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward 325 * so that 'regs->ip -= 2' lands back on an int $0x80 instruction. 326 * Fix it up. 327 */ 328 regs->ip = landing_pad; 329 330 /* Invoke the syscall. If it failed, keep it simple: use IRET. */ 331 if (!__do_fast_syscall_32(regs)) 332 return false; 333 334 /* 335 * Check that the register state is valid for using SYSRETL/SYSEXIT 336 * to exit to userspace. Otherwise use the slower but fully capable 337 * IRET exit path. 338 */ 339 340 /* XEN PV guests always use the IRET path */ 341 if (cpu_feature_enabled(X86_FEATURE_XENPV)) 342 return false; 343 344 /* EIP must point to the VDSO landing pad */ 345 if (unlikely(regs->ip != landing_pad)) 346 return false; 347 348 /* CS and SS must match the values set in MSR_STAR */ 349 if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS)) 350 return false; 351 352 /* If the TF, RF, or VM flags are set, use IRET */ 353 if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM))) 354 return false; 355 356 /* Use SYSRETL/SYSEXIT to exit to userspace */ 357 return true; 358 } 359 360 /* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */ 361 __visible noinstr bool do_SYSENTER_32(struct pt_regs *regs) 362 { 363 /* SYSENTER loses RSP, but the vDSO saved it in RBP. */ 364 regs->sp = regs->bp; 365 366 /* SYSENTER clobbers EFLAGS.IF. Assume it was set in usermode. */ 367 regs->flags |= X86_EFLAGS_IF; 368 369 return do_fast_syscall_32(regs); 370 } 371