Lines Matching +full:cpu +full:- +full:ns

1 // SPDX-License-Identifier: GPL-2.0-only
9 #include <linux/cpu.h>
18 #include <asm/nospec-branch.h>
39 * TLB flushing, formerly SMP-only
70 * Instead we have a small per-cpu array of ASIDs and cache the last few mm's
71 * that came by on this CPU, allowing cheaper switch_mm between processes on
72 * this CPU.
77 * ASID - [0, TLB_NR_DYN_ASIDS-1]
78 * the canonical identifier for an mm, dynamically allocated on each CPU
79 * [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1]
82 * kPCID - [1, MAX_ASID_AVAILABLE]
86 * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE]
103 #define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS)
106 * ASIDs are zero-based: 0->MAX_ASID_AVAILABLE are valid. -1 below to account
107 * for them being zero-based. Another -1 is because PCID 0 is reserved for
108 * use by non-PCID-aware users.
110 #define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2)
133 * The dynamically-assigned ASIDs that get passed in are small in kern_pcid()
137 * If PCID is on, ASID-aware code paths put the ASID+1 into the in kern_pcid()
139 * situation in which PCID-unaware code saves CR3, loads some other in kern_pcid()
142 * that any bugs involving loading a PCID-enabled CR3 with in kern_pcid()
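
The ASID-to-PCID mapping described above is plain arithmetic: kPCID = ASID + 1 (so PCID 0 stays reserved for non-PCID-aware code), and with page-table isolation the user copy is the same value with the high switch bit set, which is where the 2048 offset in the uPCID range comes from. Below is a minimal standalone sketch of that arithmetic, assuming bit 11 (value 2048) is the user/kernel switch bit; the sketch_* helpers only mirror kern_pcid()/user_pcid() and are illustrative, not the kernel code.

#include <stdio.h>

#define PTI_USER_PCID_BIT	11				/* assumption: user/kernel switch bit */
#define PTI_USER_PCID_MASK	(1u << PTI_USER_PCID_BIT)	/* == 2048 */

/* kPCID: the zero-based ASID shifted up by one so PCID 0 is never used. */
static unsigned int sketch_kern_pcid(unsigned int asid)
{
	return asid + 1;
}

/* uPCID: the same value with the switch bit set, giving the 2048+N range above. */
static unsigned int sketch_user_pcid(unsigned int asid)
{
	return sketch_kern_pcid(asid) | PTI_USER_PCID_MASK;
}

int main(void)
{
	for (unsigned int asid = 0; asid < 6; asid++)
		printf("ASID %u -> kPCID %u, uPCID %u\n", asid,
		       sketch_kern_pcid(asid), sketch_user_pcid(asid));
	return 0;
}
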
179 * boot because all CPUs have the same capabilities: in build_cr3_noflush()
226 struct new_asid ns; in choose_new_asid() local
230 ns.asid = 0; in choose_new_asid()
231 ns.need_flush = 1; in choose_new_asid()
232 return ns; in choose_new_asid()
243 ns.asid = global_asid; in choose_new_asid()
244 ns.need_flush = 0; in choose_new_asid()
245 return ns; in choose_new_asid()
254 next->context.ctx_id) in choose_new_asid()
257 ns.asid = asid; in choose_new_asid()
258 ns.need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) < next_tlb_gen); in choose_new_asid()
259 return ns; in choose_new_asid()
263 * We don't currently own an ASID slot on this CPU. in choose_new_asid()
266 ns.asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1; in choose_new_asid()
267 if (ns.asid >= TLB_NR_DYN_ASIDS) { in choose_new_asid()
268 ns.asid = 0; in choose_new_asid()
271 ns.need_flush = true; in choose_new_asid()
273 return ns; in choose_new_asid()
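
Pieced together, choose_new_asid() resolves to one of four outcomes: no PCID support (slot 0, flush), a process-wide global ASID (no flush), a per-CPU slot already caching this mm (flush only if its recorded generation is behind next_tlb_gen), or eviction of the round-robin next slot (flush). The following is a simplified userspace sketch of that decision order, with the per-CPU state modelled as plain arrays; TLB_NR_DYN_ASIDS is set to an arbitrary small value and the sketch_* naming marks everything here as illustrative rather than the kernel implementation.

#include <stdbool.h>
#include <stdint.h>

#define TLB_NR_DYN_ASIDS 6	/* assumption: a small per-CPU slot count */

struct new_asid {
	unsigned int asid : 15;
	unsigned int need_flush : 1;
};

/* Illustrative stand-ins for this CPU's cpu_tlbstate fields. */
static struct { uint64_t ctx_id; uint64_t tlb_gen; } ctxs[TLB_NR_DYN_ASIDS];
static uint16_t next_asid;

struct new_asid sketch_choose_new_asid(bool have_pcid, uint16_t global_asid,
				       uint64_t next_ctx_id, uint64_t next_tlb_gen)
{
	struct new_asid ns;
	uint16_t asid;

	if (!have_pcid) {		/* no PCID: always slot 0, always flush */
		ns.asid = 0;
		ns.need_flush = 1;
		return ns;
	}

	if (global_asid) {		/* broadcast-capable mm: reuse its global ASID */
		ns.asid = global_asid;
		ns.need_flush = 0;
		return ns;
	}

	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
		if (ctxs[asid].ctx_id != next_ctx_id)
			continue;
		ns.asid = asid;		/* cached slot: flush only if its generation is stale */
		ns.need_flush = (ctxs[asid].tlb_gen < next_tlb_gen);
		return ns;
	}

	/* No slot owned by this mm: evict the round-robin next slot and flush. */
	asid = next_asid++;
	if (asid >= TLB_NR_DYN_ASIDS) {
		asid = 0;
		next_asid = 1;
	}
	ns.asid = asid;
	ns.need_flush = 1;
	return ns;
}
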
277 * Global ASIDs are allocated for multi-threaded processes that are
279 * processes the same PCID on every CPU, for use with hardware-assisted
288 static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1;
293 * freed global ASIDs are safe to re-use.
305 * The TLB flush above makes it safe to re-use the previously in reset_global_asid_space()
323 if (last_global_asid >= MAX_ASID_AVAILABLE - 1) in allocate_global_asid()
338 global_asid_available--; in allocate_global_asid()
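
The global ASID pool above behaves like a bump allocator over a bitmap: allocation scans forward from the last handed-out ASID, freed ASIDs are only recycled after the wrap-around flush, and global_asid_available counts what is left. Here is a self-contained model of that scheme, using bool arrays and a plain loop in place of the kernel's bitmap helpers, and assuming 11 usable PCID bits; the sketch_* functions are stand-ins for reset_global_asid_space() and allocate_global_asid(), not their actual bodies.

#include <stdbool.h>
#include <stdint.h>

#define CR3_AVAIL_PCID_BITS	11	/* assumption: 12 PCID bits minus one consumed by PTI */
#define MAX_ASID_AVAILABLE	((1 << CR3_AVAIL_PCID_BITS) - 2)
#define TLB_NR_DYN_ASIDS	6	/* the dynamic per-CPU ASIDs are excluded from the pool */

static bool global_asid_used[MAX_ASID_AVAILABLE];
static bool global_asid_freed[MAX_ASID_AVAILABLE];
static uint16_t last_global_asid = TLB_NR_DYN_ASIDS;
static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1;

/* Stand-in for reset_global_asid_space(): after the wrap-around flush, the
 * previously freed ASIDs become safe to hand out again. */
void sketch_reset_global_asid_space(void)
{
	for (int i = TLB_NR_DYN_ASIDS; i < MAX_ASID_AVAILABLE; i++) {
		if (global_asid_freed[i]) {
			global_asid_used[i] = false;
			global_asid_freed[i] = false;
		}
	}
	last_global_asid = TLB_NR_DYN_ASIDS;
}

/* Stand-in for allocate_global_asid(): bump forward from the last allocation,
 * wrapping via the reset above when the top of the space is reached. */
uint16_t sketch_allocate_global_asid(void)
{
	if (last_global_asid >= MAX_ASID_AVAILABLE - 1)
		sketch_reset_global_asid_space();

	for (uint16_t asid = last_global_asid; asid < MAX_ASID_AVAILABLE; asid++) {
		if (global_asid_used[asid])
			continue;
		global_asid_used[asid] = true;
		last_global_asid = asid;
		global_asid_available--;
		return asid;
	}
	return 0;	/* pool exhausted; the kernel warns here instead */
}
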
350 int cpu; in mm_active_cpus_exceeds() local
357 for_each_cpu(cpu, mm_cpumask(mm)) { in mm_active_cpus_exceeds()
359 if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm) in mm_active_cpus_exceeds()
362 if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) in mm_active_cpus_exceeds()
413 /* The global ASID can be re-used only after flush at wrap-around. */ in mm_free_global_asid()
415 __set_bit(mm->context.global_asid, global_asid_freed); in mm_free_global_asid()
417 mm->context.global_asid = 0; in mm_free_global_asid()
423 * Is the mm transitioning from a CPU-local ASID to a global ASID?
451 if ((current->pid & 0x1f) != (jiffies & 0x1f)) in consider_global_asid()
464 struct mm_struct *mm = info->mm; in finish_asid_transition()
466 int cpu; in finish_asid_transition() local
471 for_each_cpu(cpu, mm_cpumask(mm)) { in finish_asid_transition()
473 * The remote CPU is context switching. Wait for that to in finish_asid_transition()
477 while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING) in finish_asid_transition()
480 if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm) in finish_asid_transition()
484 * If at least one CPU is not using the global ASID yet, in finish_asid_transition()
488 * This can race with the CPU switching to another task; in finish_asid_transition()
491 if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) { in finish_asid_transition()
492 flush_tlb_multi(mm_cpumask(info->mm), info); in finish_asid_transition()
503 bool pmd = info->stride_shift == PMD_SHIFT; in broadcast_tlb_flush()
504 unsigned long asid = mm_global_asid(info->mm); in broadcast_tlb_flush()
505 unsigned long addr = info->start; in broadcast_tlb_flush()
512 if (info->end == TLB_FLUSH_ALL) { in broadcast_tlb_flush()
520 if (info->stride_shift <= PMD_SHIFT) { in broadcast_tlb_flush()
521 nr = (info->end - addr) >> info->stride_shift; in broadcast_tlb_flush()
529 addr += nr << info->stride_shift; in broadcast_tlb_flush()
530 } while (addr < info->end); in broadcast_tlb_flush()
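
The broadcast flush loop above walks the range in hardware-sized chunks: each invalidation covers up to some maximum number of pages (nr), and the cursor advances by nr << stride_shift until it passes info->end. Below is a stripped-down model of just that chunking arithmetic, with a made-up COUNT_MAX standing in for the CPU's advertised invlpgb_count_max and the flush itself reduced to a printf.

#include <stdio.h>

#define STRIDE_SHIFT	12	/* 4 KiB pages for this example */
#define COUNT_MAX	8	/* assumption: max pages one broadcast invalidation may cover */

/* Walk [start, end) in chunks of at most COUNT_MAX pages, the way the
 * broadcast_tlb_flush() loop does with its per-instruction page limit. */
static void sketch_broadcast_flush_range(unsigned long start, unsigned long end)
{
	unsigned long addr = start;

	do {
		unsigned long nr = (end - addr) >> STRIDE_SHIFT;

		if (nr < 1)
			nr = 1;
		if (nr > COUNT_MAX)
			nr = COUNT_MAX;

		printf("flush %lu page(s) at %#lx\n", nr, addr);
		addr += nr << STRIDE_SHIFT;
	} while (addr < end);
}

int main(void)
{
	/* 19 pages: expect chunks of 8, 8 and 3. */
	sketch_broadcast_flush_range(0x100000, 0x100000 + 19 * 4096UL);
	return 0;
}
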
617 * Invoked from return to user/guest by a task that opted-in to L1D
619 * affinity settings or CPU hotplug. This is part of the paranoid L1D flush
645 clear_ti_thread_flag(&next->thread_info, TIF_SPEC_L1D_FLUSH); in l1d_flush_evaluate()
646 next->l1d_flush_kill.func = l1d_flush_force_sigbus; in l1d_flush_evaluate()
647 task_work_add(next, &next->l1d_flush_kill, TWA_RESUME); in l1d_flush_evaluate()
662 return (unsigned long)next->mm | spec_bits; in mm_mangle_tif_spec_bits()
669 if (!next || !next->mm) in cond_mitigation()
676 * Avoid user->user BTB/RSB poisoning by flushing them when switching in cond_mitigation()
677 * between processes. This stops one process from doing Spectre-v2 in cond_mitigation()
682 * same process. Using the mm pointer instead of mm->context.ctx_id in cond_mitigation()
705 * - the same user space task is scheduled out and later in cond_mitigation()
709 * - a user space task belonging to the same process is in cond_mitigation()
712 * - a user space task belonging to the same process is in cond_mitigation()
732 * last on this CPU. in cond_mitigation()
734 if ((prev_mm & ~LAST_USER_MM_SPEC_MASK) != (unsigned long)next->mm) in cond_mitigation()
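
The last-user-mm tracking in cond_mitigation() relies on pointer alignment: an mm_struct pointer has zeroed low bits, so the previous task's speculation-control flags can be OR-ed into them (mm_mangle_tif_spec_bits()) and masked back off when comparing against the next mm. The following is a small standalone illustration of that packing trick; the SPEC_* flag values and mask are invented for the example and are not the kernel's LAST_USER_MM_* definitions.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative flag bits living in the (always zero) low bits of an
 * aligned pointer; stand-ins for the real spec-control bits. */
#define SPEC_IBPB	0x1UL
#define SPEC_L1D_FLUSH	0x2UL
#define SPEC_MASK	(SPEC_IBPB | SPEC_L1D_FLUSH)

struct mm { long dummy; } __attribute__((aligned(8)));

static unsigned long mangle(struct mm *mm, unsigned long spec_bits)
{
	return (unsigned long)mm | spec_bits;	/* pointer identity + flags in one word */
}

static bool same_mm(unsigned long prev_mangled, struct mm *next)
{
	return (prev_mangled & ~SPEC_MASK) == (unsigned long)next;
}

int main(void)
{
	static struct mm a, b;
	unsigned long prev = mangle(&a, SPEC_IBPB);

	printf("prev had IBPB set: %d\n", !!(prev & SPEC_IBPB));
	printf("switching back to a, same mm: %d\n", same_mm(prev, &a));
	printf("switching to b, same mm: %d\n", same_mm(prev, &b));
	return 0;
}
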
756 atomic_read(&mm->context.perf_rdpmc_allowed))) { in cr4_update_pce_mm()
780 * 'current->active_mm' up to date.
788 unsigned cpu = smp_processor_id(); in switch_mm_irqs_off() local
790 struct new_asid ns; in switch_mm_irqs_off() local
808 if (WARN_ON_ONCE(__read_cr3() != build_cr3(prev->pgd, prev_asid, in switch_mm_irqs_off()
817 * Architecturally, the CPU could prefetch something in switch_mm_irqs_off()
830 * core serialization before returning to user-space, after in switch_mm_irqs_off()
831 * storing to rq->curr, when changing mm. This is because in switch_mm_irqs_off()
833 * to make them issue memory barriers. However, if another CPU in switch_mm_irqs_off()
835 * membarrier(), it can cause that CPU not to receive an IPI in switch_mm_irqs_off()
844 next->context.ctx_id); in switch_mm_irqs_off()
852 * Even in lazy TLB mode, the CPU should stay set in the in switch_mm_irqs_off()
858 !cpumask_test_cpu(cpu, mm_cpumask(next)))) in switch_mm_irqs_off()
859 cpumask_set_cpu(cpu, mm_cpumask(next)); in switch_mm_irqs_off()
863 next_tlb_gen = atomic64_read(&next->context.tlb_gen); in switch_mm_irqs_off()
864 ns = choose_new_asid(next, next_tlb_gen); in switch_mm_irqs_off()
876 * If the CPU is not in lazy TLB mode, we are just switching in switch_mm_irqs_off()
890 next_tlb_gen = atomic64_read(&next->context.tlb_gen); in switch_mm_irqs_off()
899 ns.asid = prev_asid; in switch_mm_irqs_off()
900 ns.need_flush = true; in switch_mm_irqs_off()
916 * Make sure this CPU is set in mm_cpumask() such that we'll in switch_mm_irqs_off()
928 * loaded_mm load can happen in native_flush_tlb_multi() -> in switch_mm_irqs_off()
934 if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next))) in switch_mm_irqs_off()
935 cpumask_set_cpu(cpu, mm_cpumask(next)); in switch_mm_irqs_off()
939 next_tlb_gen = atomic64_read(&next->context.tlb_gen); in switch_mm_irqs_off()
941 ns = choose_new_asid(next, next_tlb_gen); in switch_mm_irqs_off()
946 if (ns.need_flush) { in switch_mm_irqs_off()
947 VM_WARN_ON_ONCE(is_global_asid(ns.asid)); in switch_mm_irqs_off()
948 this_cpu_write(cpu_tlbstate.ctxs[ns.asid].ctx_id, next->context.ctx_id); in switch_mm_irqs_off()
949 this_cpu_write(cpu_tlbstate.ctxs[ns.asid].tlb_gen, next_tlb_gen); in switch_mm_irqs_off()
950 load_new_mm_cr3(next->pgd, ns.asid, new_lam, true); in switch_mm_irqs_off()
955 load_new_mm_cr3(next->pgd, ns.asid, new_lam, false); in switch_mm_irqs_off()
964 this_cpu_write(cpu_tlbstate.loaded_mm_asid, ns.asid); in switch_mm_irqs_off()
998 * temporary page-table mappings that are required for these write operations to
1003 * It is illegal to schedule while using a temporary mm -- the context switch
1005 * Use a real (non-temporary) mm in a kernel thread if you need to sleep.
1066 * Call this when reinitializing a CPU. It fixes the following potential
1069 * - The ASID changed from what cpu_tlbstate thinks it is (most likely
1070 * because the CPU was taken down and came back up with CR3's PCID
1071 * bits clear. CPU hotplug can do this.
1073 * - The TLB contains junk in slots corresponding to inactive ASIDs.
1075 * - The CPU went so far out to lunch that it may have missed a TLB
1087 WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd)); in initialize_tlbstate_and_flush()
1102 write_cr3(build_cr3(mm->pgd, 0, 0)); in initialize_tlbstate_and_flush()
1108 this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id); in initialize_tlbstate_and_flush()
1128 * - mm_tlb_gen: the latest generation. in flush_tlb_func()
1129 * - local_tlb_gen: the generation that this CPU has already caught in flush_tlb_func()
1131 * - f->new_tlb_gen: the generation that the requester of the flush in flush_tlb_func()
1138 bool local = smp_processor_id() == f->initiating_cpu; in flush_tlb_func()
1150 /* The CPU was left in the mm_cpumask of the target mm. Clear it. */ in flush_tlb_func()
1151 if (f->mm && f->mm != loaded_mm) { in flush_tlb_func()
1152 cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(f->mm)); in flush_tlb_func()
1171 loaded_mm->context.ctx_id); in flush_tlb_func()
1176 * paging-structure cache to avoid speculatively reading in flush_tlb_func()
1189 if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID && in flush_tlb_func()
1190 f->new_tlb_gen <= local_tlb_gen)) { in flush_tlb_func()
1192 * The TLB is already up to date with respect to f->new_tlb_gen. in flush_tlb_func()
1204 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); in flush_tlb_func()
1209 * happen if two concurrent flushes happen -- the first flush to in flush_tlb_func()
1217 WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen); in flush_tlb_func()
1222 * possible that f->new_tlb_gen <= local_tlb_gen), but we're in flush_tlb_func()
1231 * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that in flush_tlb_func()
1234 * f->new_tlb_gen == 3, then we know that the flush needed to bring in flush_tlb_func()
1242 * processed on this CPU in reverse order, we'll see in flush_tlb_func()
1248 * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization. in flush_tlb_func()
1256 if (f->end != TLB_FLUSH_ALL && in flush_tlb_func()
1257 f->new_tlb_gen == local_tlb_gen + 1 && in flush_tlb_func()
1258 f->new_tlb_gen == mm_tlb_gen) { in flush_tlb_func()
1260 unsigned long addr = f->start; in flush_tlb_func()
1263 VM_WARN_ON(f->new_tlb_gen == TLB_GENERATION_INVALID); in flush_tlb_func()
1266 VM_WARN_ON(f->mm == NULL); in flush_tlb_func()
1268 nr_invalidate = (f->end - f->start) >> f->stride_shift; in flush_tlb_func()
1270 while (addr < f->end) { in flush_tlb_func()
1272 addr += 1UL << f->stride_shift; in flush_tlb_func()
1291 (f->mm == NULL) ? TLB_LOCAL_SHOOTDOWN : in flush_tlb_func()
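
The generation bookkeeping in flush_tlb_func() comes down to three numbers: mm_tlb_gen (the newest generation for the mm), local_tlb_gen (what this CPU has already caught up to), and f->new_tlb_gen (what the requester needs). Below is a compact sketch of the resulting decision: skip the flush when already caught up, do the cheap ranged flush only when this CPU is exactly one generation behind and that generation is also the newest, and otherwise fall back to a full flush; the enum and helper are illustrative, not kernel API.

#include <stdint.h>
#include <stdio.h>

enum flush_action { FLUSH_NOTHING, FLUSH_RANGE, FLUSH_ALL_FALLBACK };

/* Decide what flush_tlb_func()-style logic would do, given the three
 * generations described above and whether the request is ranged. */
static enum flush_action decide_flush(uint64_t local_tlb_gen, uint64_t mm_tlb_gen,
				      uint64_t new_tlb_gen, int ranged_request)
{
	if (new_tlb_gen <= local_tlb_gen)
		return FLUSH_NOTHING;		/* this CPU already caught up */

	if (ranged_request &&
	    new_tlb_gen == local_tlb_gen + 1 &&	/* exactly one generation behind */
	    new_tlb_gen == mm_tlb_gen)		/* and nothing newer is pending */
		return FLUSH_RANGE;

	return FLUSH_ALL_FALLBACK;		/* a partial flush cannot be proven safe */
}

int main(void)
{
	/* Caught up: local=3 already covers a request for gen 3. */
	printf("%d\n", decide_flush(3, 3, 3, 1) == FLUSH_NOTHING);
	/* One behind and newest: the ranged flush is enough. */
	printf("%d\n", decide_flush(2, 3, 3, 1) == FLUSH_RANGE);
	/* Two behind: intermediate requests were coalesced, flush everything. */
	printf("%d\n", decide_flush(1, 3, 3, 1) == FLUSH_ALL_FALLBACK);
	return 0;
}

After either kind of flush the local generation is advanced to mm_tlb_gen, which is why jumping straight over intermediate generations is safe.
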
1296 static bool should_flush_tlb(int cpu, void *data) in should_flush_tlb() argument
1298 struct mm_struct *loaded_mm = per_cpu(cpu_tlbstate.loaded_mm, cpu); in should_flush_tlb()
1309 if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) in should_flush_tlb()
1313 if (!info->mm) in should_flush_tlb()
1317 * While switching, the remote CPU could have state from in should_flush_tlb()
1323 /* The target mm is loaded, and the CPU is not lazy. */ in should_flush_tlb()
1324 if (loaded_mm == info->mm) in should_flush_tlb()
1328 if (info->trim_cpumask) in should_flush_tlb()
1336 if (time_after(jiffies, READ_ONCE(mm->context.next_trim_cpumask))) { in should_trim_cpumask()
1337 WRITE_ONCE(mm->context.next_trim_cpumask, jiffies + HZ); in should_trim_cpumask()
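
should_trim_cpumask() rate-limits cpumask trimming to roughly once per second: the mm records the next permitted trim time in jiffies, and a flush only requests trimming once that deadline has passed, pushing it another HZ ahead. Here is a tiny model of the pattern with an integer counter standing in for jiffies; the wrap-safe time_after() comparison is replaced by a plain '>' for brevity.

#include <stdbool.h>
#include <stdio.h>

#define HZ 100	/* assumption: 100 ticks per second for the example */

static unsigned long jiffies;		/* stand-in tick counter */
static unsigned long next_trim;		/* the per-mm deadline in the kernel */

/* Allow one trim per second; callers in between get "false". */
static bool sketch_should_trim(void)
{
	if (jiffies > next_trim) {	/* the kernel uses wrap-safe time_after() */
		next_trim = jiffies + HZ;
		return true;
	}
	return false;
}

int main(void)
{
	for (jiffies = 0; jiffies <= 250; jiffies += 50)
		printf("tick %lu: trim=%d\n", jiffies, sketch_should_trim());
	return 0;
}
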
1355 if (info->end == TLB_FLUSH_ALL) in native_flush_tlb_multi()
1359 (info->end - info->start) >> PAGE_SHIFT); in native_flush_tlb_multi()
1363 * CPUs in lazy TLB mode. They will flush the CPU themselves in native_flush_tlb_multi()
1371 if (info->freed_tables || mm_in_asid_transition(info->mm)) in native_flush_tlb_multi()
1389 * flush is about 100 ns, so this caps the maximum overhead at
1390 * _about_ 3,000 ns.
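
For reference, the figure follows directly from the per-page flush ceiling: assuming the upstream default tlb_single_page_flush_ceiling of 33 pages, 33 flushes x ~100 ns is roughly 3,300 ns, which is the quoted "_about_ 3,000 ns" worst case; anything larger is handled with a single full flush instead.
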
1411 * Ensure that the following code is non-reentrant and flush_tlb_info in get_flush_tlb_info()
1413 * interrupt handlers and machine-check exception handlers. in get_flush_tlb_info()
1422 if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) { in get_flush_tlb_info()
1427 info->start = start; in get_flush_tlb_info()
1428 info->end = end; in get_flush_tlb_info()
1429 info->mm = mm; in get_flush_tlb_info()
1430 info->stride_shift = stride_shift; in get_flush_tlb_info()
1431 info->freed_tables = freed_tables; in get_flush_tlb_info()
1432 info->new_tlb_gen = new_tlb_gen; in get_flush_tlb_info()
1433 info->initiating_cpu = smp_processor_id(); in get_flush_tlb_info()
1434 info->trim_cpumask = 0; in get_flush_tlb_info()
1453 int cpu = get_cpu(); in flush_tlb_mm_range() local
1464 * a local TLB flush is needed. Optimize this use-case by calling in flush_tlb_mm_range()
1469 } else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) { in flush_tlb_mm_range()
1470 info->trim_cpumask = should_trim_cpumask(mm); in flush_tlb_mm_range()
1495 /* First try (faster) hardware-assisted TLB invalidation. */ in flush_tlb_all()
1499 /* Fall back to the IPI-based invalidation. */ in flush_tlb_all()
1508 for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) { in invlpgb_kernel_range_flush()
1509 nr = (info->end - addr) >> PAGE_SHIFT; in invlpgb_kernel_range_flush()
1528 for (addr = f->start; addr < f->end; addr += PAGE_SIZE) in do_kernel_range_flush()
1557 if (info->end == TLB_FLUSH_ALL) in flush_tlb_kernel_range()
1575 build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd, in __get_current_cr3_fast()
1600 * If PTI is on, then the kernel is mapped with non-global PTEs, and in flush_tlb_one_kernel()
1639 * 'cpu_pcide' to ensure that *this* CPU will not trigger those in native_flush_tlb_one_user()
1672 * Read-modify-write to CR4 - protect it from preemption and in native_flush_tlb_global()
1690 * to the per CPU variable and to prevent being preempted between in native_flush_tlb_local()
1697 /* If current->mm == NULL then the read_cr3() "borrows" an mm */ in native_flush_tlb_local()
1721 * !PGE -> !PCID (setup_pcid()), thus every flush is total. in __flush_tlb_all()
1732 int cpu = get_cpu(); in arch_tlbbatch_flush() local
1738 * a local TLB flush is needed. Optimize this use-case by calling in arch_tlbbatch_flush()
1741 if (cpu_feature_enabled(X86_FEATURE_INVLPGB) && batch->unmapped_pages) { in arch_tlbbatch_flush()
1743 batch->unmapped_pages = false; in arch_tlbbatch_flush()
1744 } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) { in arch_tlbbatch_flush()
1745 flush_tlb_multi(&batch->cpumask, info); in arch_tlbbatch_flush()
1746 } else if (cpumask_test_cpu(cpu, &batch->cpumask)) { in arch_tlbbatch_flush()
1753 cpumask_clear(&batch->cpumask); in arch_tlbbatch_flush()
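
arch_tlbbatch_flush() picks the cheapest mechanism that still covers every CPU in the batch: an INVLPGB-based broadcast when the feature is available and pages were unmapped, an IPI-driven flush_tlb_multi() when remote CPUs are in the batch cpumask, or a purely local flush when only this CPU is; the cpumask is cleared afterwards either way. A condensed sketch of that three-way dispatch follows, with the mechanisms reduced to labels; sketch_tlbbatch_flush() is a placeholder, not a kernel function.

#include <stdbool.h>
#include <stdio.h>

/* Condensed decision order of the batched-unmap flush described above;
 * the string labels stand in for the real mechanisms. */
static const char *sketch_tlbbatch_flush(bool have_invlpgb, bool unmapped_pages,
					 bool remote_cpu_in_batch, bool this_cpu_in_batch)
{
	if (have_invlpgb && unmapped_pages)
		return "broadcast: hardware-assisted flush, no IPIs";
	if (remote_cpu_in_batch)
		return "IPI: flush_tlb_multi() over the batch cpumask";
	if (this_cpu_in_batch)
		return "local: flush only this CPU";
	return "nothing to flush";
}

int main(void)
{
	printf("%s\n", sketch_tlbbatch_flush(true, true, true, true));
	printf("%s\n", sketch_tlbbatch_flush(false, false, true, true));
	printf("%s\n", sketch_tlbbatch_flush(false, false, false, true));
	return 0;
}
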
1769 struct mm_struct *current_mm = current->mm; in nmi_uaccess_okay()
1775 * current_mm->pgd == __va(read_cr3_pa()). This may be slow, though, in nmi_uaccess_okay()
1786 VM_WARN_ON_ONCE(__pa(current_mm->pgd) != read_cr3_pa()); in nmi_uaccess_okay()
1808 len = min(count, sizeof(buf) - 1); in tlbflush_write_file()
1810 return -EFAULT; in tlbflush_write_file()
1814 return -EINVAL; in tlbflush_write_file()
1817 return -EINVAL; in tlbflush_write_file()