xref: /linux/arch/x86/mm/tlb.c (revision 995231c820e3bd3633cb38bf4ea6f2541e1da331)
1 #include <linux/init.h>
2 
3 #include <linux/mm.h>
4 #include <linux/spinlock.h>
5 #include <linux/smp.h>
6 #include <linux/interrupt.h>
7 #include <linux/export.h>
8 #include <linux/cpu.h>
9 
10 #include <asm/tlbflush.h>
11 #include <asm/mmu_context.h>
12 #include <asm/cache.h>
13 #include <asm/apic.h>
14 #include <asm/uv/uv.h>
15 #include <linux/debugfs.h>
16 
17 /*
18  *	TLB flushing, formerly SMP-only
19  *		c/o Linus Torvalds.
20  *
21  *	These mean you can really definitely utterly forget about
22  *	writing to user space from interrupts. (It's not allowed anyway).
23  *
24  *	Optimizations Manfred Spraul <manfred@colorfullife.com>
25  *
26  *	More scalable flush, from Andi Kleen
27  *
28  *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
29  */
30 
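/* Counter used to hand out a unique context.ctx_id to every mm. */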
31 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
32 
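/*
 * When true, a CPU that stops running user code may stay lazily in its old
 * mm instead of switching to init_mm; see enter_lazy_tlb() and the PCID
 * heuristic in init_tlb_use_lazy_mode() below.
 */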
33 DEFINE_STATIC_KEY_TRUE(tlb_use_lazy_mode);
34 
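/*
 * Pick an ASID for @next: reuse this CPU's slot for the mm if one exists
 * (flushing only if its tlb_gen is stale), otherwise take the next slot
 * round-robin and force a flush.
 */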
35 static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
36 			    u16 *new_asid, bool *need_flush)
37 {
38 	u16 asid;
39 
40 	if (!static_cpu_has(X86_FEATURE_PCID)) {
41 		*new_asid = 0;
42 		*need_flush = true;
43 		return;
44 	}
45 
46 	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
47 		if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
48 		    next->context.ctx_id)
49 			continue;
50 
51 		*new_asid = asid;
52 		*need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
53 			       next_tlb_gen);
54 		return;
55 	}
56 
57 	/*
58 	 * We don't currently own an ASID slot on this CPU.
59 	 * Allocate a slot.
60 	 */
61 	*new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
62 	if (*new_asid >= TLB_NR_DYN_ASIDS) {
63 		*new_asid = 0;
64 		this_cpu_write(cpu_tlbstate.next_asid, 1);
65 	}
66 	*need_flush = true;
67 }
68 
69 void leave_mm(int cpu)
70 {
71 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
72 
73 	/*
74 	 * It's plausible that we're in lazy TLB mode while our mm is init_mm.
75 	 * If so, our callers still expect us to flush the TLB, but there
76 	 * aren't any user TLB entries in init_mm to worry about.
77 	 *
78 	 * This needs to happen before any other sanity checks due to
79 	 * intel_idle's shenanigans.
80 	 */
81 	if (loaded_mm == &init_mm)
82 		return;
83 
84 	/* Warn if we're not lazy. */
85 	WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));
86 
87 	switch_mm(NULL, &init_mm, NULL);
88 }
89 
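/* Interrupt-safe wrapper around switch_mm_irqs_off(). */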
90 void switch_mm(struct mm_struct *prev, struct mm_struct *next,
91 	       struct task_struct *tsk)
92 {
93 	unsigned long flags;
94 
95 	local_irq_save(flags);
96 	switch_mm_irqs_off(prev, next, tsk);
97 	local_irq_restore(flags);
98 }
99 
100 void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
101 			struct task_struct *tsk)
102 {
103 	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
104 	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
105 	unsigned cpu = smp_processor_id();
106 	u64 next_tlb_gen;
107 
108 	/*
109 	 * NB: The scheduler will call us with prev == next when switching
110 	 * from lazy TLB mode to normal mode if active_mm isn't changing.
111 	 * When this happens, we don't assume that CR3 (and hence
112 	 * cpu_tlbstate.loaded_mm) matches next.
113 	 *
114 	 * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
115 	 */
116 
117 	/* We don't want flush_tlb_func_* to run concurrently with us. */
118 	if (IS_ENABLED(CONFIG_PROVE_LOCKING))
119 		WARN_ON_ONCE(!irqs_disabled());
120 
121 	/*
122 	 * Verify that CR3 is what we think it is.  This will catch
123 	 * hypothetical buggy code that directly switches to swapper_pg_dir
124 	 * without going through leave_mm() / switch_mm_irqs_off() or that
125 	 * does something like write_cr3(read_cr3_pa()).
126 	 *
127 	 * Only do this check if CONFIG_DEBUG_VM=y because __read_cr3()
128 	 * isn't free.
129 	 */
130 #ifdef CONFIG_DEBUG_VM
131 	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) {
132 		/*
133 		 * If we were to BUG here, we'd be very likely to kill
134 		 * the system so hard that we don't see the call trace.
135 		 * Try to recover instead by ignoring the error and doing
136 		 * a global flush to minimize the chance of corruption.
137 		 *
138 		 * (This is far from being a fully correct recovery.
139 		 *  Architecturally, the CPU could prefetch something
140 		 *  back into an incorrect ASID slot and leave it there
141 		 *  to cause trouble down the road.  It's better than
142 		 *  nothing, though.)
143 		 */
144 		__flush_tlb_all();
145 	}
146 #endif
147 	this_cpu_write(cpu_tlbstate.is_lazy, false);
148 
149 	if (real_prev == next) {
150 		VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
151 			  next->context.ctx_id);
152 
153 		/*
154 		 * We don't currently support having a real mm loaded without
155 		 * our cpu set in mm_cpumask().  We have all the bookkeeping
156 		 * in place to figure out whether we would need to flush
157 		 * if our cpu were cleared in mm_cpumask(), but we don't
158 		 * currently use it.
159 		 */
160 		if (WARN_ON_ONCE(real_prev != &init_mm &&
161 				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
162 			cpumask_set_cpu(cpu, mm_cpumask(next));
163 
164 		return;
165 	} else {
166 		u16 new_asid;
167 		bool need_flush;
168 
169 		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
170 			/*
171 			 * If our current stack is in vmalloc space and isn't
172 			 * mapped in the new pgd, we'll double-fault.  Forcibly
173 			 * map it.
174 			 */
175 			unsigned int index = pgd_index(current_stack_pointer);
176 			pgd_t *pgd = next->pgd + index;
177 
178 			if (unlikely(pgd_none(*pgd)))
179 				set_pgd(pgd, init_mm.pgd[index]);
180 		}
181 
182 		/* Stop remote flushes for the previous mm */
183 		VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
184 				real_prev != &init_mm);
185 		cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
186 
187 		/*
188 		 * Start remote flushes and then read tlb_gen.
189 		 */
190 		cpumask_set_cpu(cpu, mm_cpumask(next));
191 		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
192 
193 		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
194 
195 		if (need_flush) {
196 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
197 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
198 			write_cr3(build_cr3(next, new_asid));
199 			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
200 					TLB_FLUSH_ALL);
201 		} else {
202 			/* The new ASID is already up to date. */
203 			write_cr3(build_cr3_noflush(next, new_asid));
204 			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
205 		}
206 
207 		this_cpu_write(cpu_tlbstate.loaded_mm, next);
208 		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
209 	}
210 
211 	load_mm_cr4(next);
212 	switch_ldt(real_prev, next);
213 }
214 
215 /*
216  * enter_lazy_tlb() is a hint from the scheduler that we are entering a
217  * kernel thread or other context without an mm.  Acceptable implementations
218  * include doing nothing whatsoever, switching to init_mm, or various clever
219  * lazy tricks to try to minimize TLB flushes.
220  *
221  * The scheduler reserves the right to call enter_lazy_tlb() several times
222  * in a row.  It will notify us that we're going back to a real mm by
223  * calling switch_mm_irqs_off().
224  */
225 void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
226 {
227 	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
228 		return;
229 
230 	if (static_branch_unlikely(&tlb_use_lazy_mode)) {
231 		/*
232 		 * There's a significant optimization that may be possible
233 		 * here.  We have accurate enough TLB flush tracking that we
234 		 * don't need to maintain coherence of TLB per se when we're
235 		 * lazy.  We do, however, need to maintain coherence of
236 		 * paging-structure caches.  We could, in principle, leave our
237 		 * old mm loaded and only switch to init_mm when
238 		 * tlb_remove_page() happens.
239 		 */
240 		this_cpu_write(cpu_tlbstate.is_lazy, true);
241 	} else {
242 		switch_mm(NULL, &init_mm, NULL);
243 	}
244 }
245 
246 /*
247  * Call this when reinitializing a CPU.  It fixes the following potential
248  * problems:
249  *
250  * - The ASID changed from what cpu_tlbstate thinks it is (most likely
251  *   because the CPU was taken down and came back up with CR3's PCID
252  *   bits clear).  CPU hotplug can do this.
253  *
254  * - The TLB contains junk in slots corresponding to inactive ASIDs.
255  *
256  * - The CPU went so far out to lunch that it may have missed a TLB
257  *   flush.
258  */
259 void initialize_tlbstate_and_flush(void)
260 {
261 	int i;
262 	struct mm_struct *mm = this_cpu_read(cpu_tlbstate.loaded_mm);
263 	u64 tlb_gen = atomic64_read(&init_mm.context.tlb_gen);
264 	unsigned long cr3 = __read_cr3();
265 
266 	/* Assert that CR3 already references the right mm. */
267 	WARN_ON((cr3 & CR3_ADDR_MASK) != __pa(mm->pgd));
268 
269 	/*
270 	 * Assert that CR4.PCIDE is set if needed.  (CR4.PCIDE initialization
271 	 * doesn't work like other CR4 bits because it can only be set from
272 	 * long mode.)
273 	 */
274 	WARN_ON(boot_cpu_has(X86_FEATURE_PCID) &&
275 		!(cr4_read_shadow() & X86_CR4_PCIDE));
276 
277 	/* Force ASID 0 and force a TLB flush. */
278 	write_cr3(build_cr3(mm, 0));
279 
280 	/* Reinitialize tlbstate. */
281 	this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
282 	this_cpu_write(cpu_tlbstate.next_asid, 1);
283 	this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, mm->context.ctx_id);
284 	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, tlb_gen);
285 
286 	for (i = 1; i < TLB_NR_DYN_ASIDS; i++)
287 		this_cpu_write(cpu_tlbstate.ctxs[i].ctx_id, 0);
288 }
289 
290 /*
291  * flush_tlb_func_common()'s memory ordering requirement is that any
292  * TLB fills that happen after we flush the TLB are ordered after we
293  * read active_mm's tlb_gen.  We don't need any explicit barriers
294  * because all x86 flush operations are serializing and the
295  * atomic64_read operation won't be reordered by the compiler.
296  */
297 static void flush_tlb_func_common(const struct flush_tlb_info *f,
298 				  bool local, enum tlb_flush_reason reason)
299 {
300 	/*
301 	 * We have three different tlb_gen values in here.  They are:
302 	 *
303 	 * - mm_tlb_gen:     the latest generation.
304 	 * - local_tlb_gen:  the generation that this CPU has already caught
305 	 *                   up to.
306 	 * - f->new_tlb_gen: the generation that the requester of the flush
307 	 *                   wants us to catch up to.
308 	 */
309 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
310 	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
311 	u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
312 	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
313 
314 	/* This code cannot presently handle being reentered. */
315 	VM_WARN_ON(!irqs_disabled());
316 
317 	if (unlikely(loaded_mm == &init_mm))
318 		return;
319 
320 	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
321 		   loaded_mm->context.ctx_id);
322 
323 	if (this_cpu_read(cpu_tlbstate.is_lazy)) {
324 		/*
325 		 * We're in lazy mode.  We need to at least flush our
326 		 * paging-structure cache to avoid speculatively reading
327 		 * garbage into our TLB.  Since switching to init_mm is barely
328 		 * slower than a minimal flush, just switch to init_mm.
329 		 */
330 		switch_mm_irqs_off(NULL, &init_mm, NULL);
331 		return;
332 	}
333 
334 	if (unlikely(local_tlb_gen == mm_tlb_gen)) {
335 		/*
336 		 * There's nothing to do: we're already up to date.  This can
337 		 * happen if two concurrent flushes happen -- the first flush to
338 		 * be handled can catch us all the way up, leaving no work for
339 		 * the second flush.
340 		 */
341 		trace_tlb_flush(reason, 0);
342 		return;
343 	}
344 
345 	WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
346 	WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen);
347 
348 	/*
349 	 * If we get to this point, we know that our TLB is out of date.
350 	 * This does not strictly imply that we need to flush (it's
351 	 * possible that f->new_tlb_gen <= local_tlb_gen), but we're
352 	 * going to need to flush in the very near future, so we might
353 	 * as well get it over with.
354 	 *
355 	 * The only question is whether to do a full or partial flush.
356 	 *
357 	 * We do a partial flush if requested and two extra conditions
358 	 * are met:
359 	 *
360 	 * 1. f->new_tlb_gen == local_tlb_gen + 1.  We have an invariant that
361 	 *    we've always done all needed flushes to catch up to
362 	 *    local_tlb_gen.  If, for example, local_tlb_gen == 2 and
363 	 *    f->new_tlb_gen == 3, then we know that the flush needed to bring
364 	 *    us up to date for tlb_gen 3 is the partial flush we're
365 	 *    processing.
366 	 *
367 	 *    As an example of why this check is needed, suppose that there
368 	 *    are two concurrent flushes.  The first is a full flush that
369 	 *    changes context.tlb_gen from 1 to 2.  The second is a partial
370 	 *    flush that changes context.tlb_gen from 2 to 3.  If they get
371 	 *    processed on this CPU in reverse order, we'll see
372 	 *     local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL.
373 	 *    If we were to use __flush_tlb_single() and set local_tlb_gen to
374  *    3, we'd break the invariant: we'd update local_tlb_gen above
375 	 *    1 without the full flush that's needed for tlb_gen 2.
376 	 *
377  * 2. f->new_tlb_gen == mm_tlb_gen.  This is purely an optimization.
378 	 *    Partial TLB flushes are not all that much cheaper than full TLB
379 	 *    flushes, so it seems unlikely that it would be a performance win
380 	 *    to do a partial flush if that won't bring our TLB fully up to
381 	 *    date.  By doing a full flush instead, we can increase
382 	 *    local_tlb_gen all the way to mm_tlb_gen and we can probably
383 	 *    avoid another flush in the very near future.
384 	 */
385 	if (f->end != TLB_FLUSH_ALL &&
386 	    f->new_tlb_gen == local_tlb_gen + 1 &&
387 	    f->new_tlb_gen == mm_tlb_gen) {
388 		/* Partial flush */
389 		unsigned long addr;
390 		unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
391 
392 		addr = f->start;
393 		while (addr < f->end) {
394 			__flush_tlb_single(addr);
395 			addr += PAGE_SIZE;
396 		}
397 		if (local)
398 			count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
399 		trace_tlb_flush(reason, nr_pages);
400 	} else {
401 		/* Full flush. */
402 		local_flush_tlb();
403 		if (local)
404 			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
405 		trace_tlb_flush(reason, TLB_FLUSH_ALL);
406 	}
407 
408 	/* Both paths above update our state to mm_tlb_gen. */
409 	this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
410 }
411 
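/* Handle a flush requested on this CPU; the caller must disable interrupts. */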
412 static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
413 {
414 	const struct flush_tlb_info *f = info;
415 
416 	flush_tlb_func_common(f, true, reason);
417 }
418 
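/*
 * IPI handler for flushes requested by another CPU.  Bail out if the
 * targeted mm is no longer loaded here.
 */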
419 static void flush_tlb_func_remote(void *info)
420 {
421 	const struct flush_tlb_info *f = info;
422 
423 	inc_irq_stat(irq_tlb_count);
424 
425 	if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
426 		return;
427 
428 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
429 	flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
430 }
431 
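/*
 * Ask the CPUs in @cpumask to flush, either via smp_call_function_many()
 * or, on UV systems, the Broadcast Assist Unit.
 */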
432 void native_flush_tlb_others(const struct cpumask *cpumask,
433 			     const struct flush_tlb_info *info)
434 {
435 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
436 	if (info->end == TLB_FLUSH_ALL)
437 		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
438 	else
439 		trace_tlb_flush(TLB_REMOTE_SEND_IPI,
440 				(info->end - info->start) >> PAGE_SHIFT);
441 
442 	if (is_uv_system()) {
443 		/*
444 		 * This whole special case is confused.  UV has a "Broadcast
445 		 * Assist Unit", which seems to be a fancy way to send IPIs.
446 		 * Back when x86 used an explicit TLB flush IPI, UV was
447 		 * optimized to use its own mechanism.  These days, x86 uses
448 		 * smp_call_function_many(), but UV still uses a manual IPI,
449 		 * and that IPI's action is out of date -- it does a manual
450 		 * flush instead of calling flush_tlb_func_remote().  This
451 		 * means that the percpu tlb_gen variables won't be updated
452 		 * and we'll do pointless flushes on future context switches.
453 		 *
454 		 * Rather than hooking native_flush_tlb_others() here, I think
455 		 * that UV should be updated so that smp_call_function_many(),
456 		 * etc, are optimal on UV.
457 		 */
458 		unsigned int cpu;
459 
460 		cpu = smp_processor_id();
461 		cpumask = uv_flush_tlb_others(cpumask, info);
462 		if (cpumask)
463 			smp_call_function_many(cpumask, flush_tlb_func_remote,
464 					       (void *)info, 1);
465 		return;
466 	}
467 	smp_call_function_many(cpumask, flush_tlb_func_remote,
468 			       (void *)info, 1);
469 }
470 
471 /*
472  * See Documentation/x86/tlb.txt for details.  We choose 33
473  * because it is large enough to cover the vast majority (at
474  * least 95%) of allocations, and is small enough that we are
475  * confident it will not cause too much overhead.  Each single
476  * flush is about 100 ns, so this caps the maximum overhead at
477  * _about_ 3,300 ns.
478  *
479  * This is in units of pages.
480  */
481 static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
482 
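/*
 * Flush a user address range for @mm on every CPU that may have it loaded.
 * Falls back to a full flush for hugetlb ranges or ranges larger than
 * tlb_single_page_flush_ceiling pages.
 */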
483 void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
484 				unsigned long end, unsigned long vmflag)
485 {
486 	int cpu;
487 
488 	struct flush_tlb_info info = {
489 		.mm = mm,
490 	};
491 
492 	cpu = get_cpu();
493 
494 	/* This is also a barrier that synchronizes with switch_mm(). */
495 	info.new_tlb_gen = inc_mm_tlb_gen(mm);
496 
497 	/* Should we flush just the requested range? */
498 	if ((end != TLB_FLUSH_ALL) &&
499 	    !(vmflag & VM_HUGETLB) &&
500 	    ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
501 		info.start = start;
502 		info.end = end;
503 	} else {
504 		info.start = 0UL;
505 		info.end = TLB_FLUSH_ALL;
506 	}
507 
508 	if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
509 		VM_WARN_ON(irqs_disabled());
510 		local_irq_disable();
511 		flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
512 		local_irq_enable();
513 	}
514 
515 	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
516 		flush_tlb_others(mm_cpumask(mm), &info);
517 
518 	put_cpu();
519 }
520 
521 
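/* IPI handler: flush this CPU's entire TLB, including global entries. */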
522 static void do_flush_tlb_all(void *info)
523 {
524 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
525 	__flush_tlb_all();
526 }
527 
528 void flush_tlb_all(void)
529 {
530 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
531 	on_each_cpu(do_flush_tlb_all, NULL, 1);
532 }
533 
534 static void do_kernel_range_flush(void *info)
535 {
536 	struct flush_tlb_info *f = info;
537 	unsigned long addr;
538 
539 	/* Flush the range with one 'invlpg' per page. */
540 	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
541 		__flush_tlb_single(addr);
542 }
543 
544 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
545 {
546 
547 	/* Balance this like a user space task's flush; a bit conservative */
548 	if (end == TLB_FLUSH_ALL ||
549 	    (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
550 		on_each_cpu(do_flush_tlb_all, NULL, 1);
551 	} else {
552 		struct flush_tlb_info info;
553 		info.start = start;
554 		info.end = end;
555 		on_each_cpu(do_kernel_range_flush, &info, 1);
556 	}
557 }
558 
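/*
 * Flush everything covered by an unmap batch: flush locally if this CPU is
 * in the batch's cpumask, then send IPIs to the remaining CPUs.
 */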
559 void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
560 {
561 	struct flush_tlb_info info = {
562 		.mm = NULL,
563 		.start = 0UL,
564 		.end = TLB_FLUSH_ALL,
565 	};
566 
567 	int cpu = get_cpu();
568 
569 	if (cpumask_test_cpu(cpu, &batch->cpumask)) {
570 		VM_WARN_ON(irqs_disabled());
571 		local_irq_disable();
572 		flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
573 		local_irq_enable();
574 	}
575 
576 	if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
577 		flush_tlb_others(&batch->cpumask, &info);
578 
579 	cpumask_clear(&batch->cpumask);
580 
581 	put_cpu();
582 }
583 
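/*
 * debugfs interface ("tlb_single_page_flush_ceiling" under arch_debugfs_dir)
 * for reading and tuning the partial-flush ceiling at runtime.
 */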
584 static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
585 			     size_t count, loff_t *ppos)
586 {
587 	char buf[32];
588 	unsigned int len;
589 
590 	len = sprintf(buf, "%lu\n", tlb_single_page_flush_ceiling);
591 	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
592 }
593 
594 static ssize_t tlbflush_write_file(struct file *file,
595 		 const char __user *user_buf, size_t count, loff_t *ppos)
596 {
597 	char buf[32];
598 	ssize_t len;
599 	int ceiling;
600 
601 	len = min(count, sizeof(buf) - 1);
602 	if (copy_from_user(buf, user_buf, len))
603 		return -EFAULT;
604 
605 	buf[len] = '\0';
606 	if (kstrtoint(buf, 0, &ceiling))
607 		return -EINVAL;
608 
609 	if (ceiling < 0)
610 		return -EINVAL;
611 
612 	tlb_single_page_flush_ceiling = ceiling;
613 	return count;
614 }
615 
616 static const struct file_operations fops_tlbflush = {
617 	.read = tlbflush_read_file,
618 	.write = tlbflush_write_file,
619 	.llseek = default_llseek,
620 };
621 
622 static int __init create_tlb_single_page_flush_ceiling(void)
623 {
624 	debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
625 			    arch_debugfs_dir, NULL, &fops_tlbflush);
626 	return 0;
627 }
628 late_initcall(create_tlb_single_page_flush_ceiling);
629 
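/*
 * debugfs interface ("tlb_use_lazy_mode" under arch_debugfs_dir) for
 * toggling lazy TLB mode at runtime.
 */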
630 static ssize_t tlblazy_read_file(struct file *file, char __user *user_buf,
631 				 size_t count, loff_t *ppos)
632 {
633 	char buf[2];
634 
635 	buf[0] = static_branch_likely(&tlb_use_lazy_mode) ? '1' : '0';
636 	buf[1] = '\n';
637 
638 	return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
639 }
640 
641 static ssize_t tlblazy_write_file(struct file *file,
642 		 const char __user *user_buf, size_t count, loff_t *ppos)
643 {
644 	bool val;
645 
646 	if (kstrtobool_from_user(user_buf, count, &val))
647 		return -EINVAL;
648 
649 	if (val)
650 		static_branch_enable(&tlb_use_lazy_mode);
651 	else
652 		static_branch_disable(&tlb_use_lazy_mode);
653 
654 	return count;
655 }
656 
657 static const struct file_operations fops_tlblazy = {
658 	.read = tlblazy_read_file,
659 	.write = tlblazy_write_file,
660 	.llseek = default_llseek,
661 };
662 
663 static int __init init_tlb_use_lazy_mode(void)
664 {
665 	if (boot_cpu_has(X86_FEATURE_PCID)) {
666 		/*
667 		 * Heuristic: with PCID on, switching to and from
668 		 * init_mm is reasonably fast, but remote flush IPIs
669 		 * are as expensive as ever, so turn off lazy TLB mode.
670 		 *
671 		 * We can't do this in setup_pcid() because static keys
672 		 * haven't been initialized yet, and it would blow up
673 		 * badly.
674 		 */
675 		static_branch_disable(&tlb_use_lazy_mode);
676 	}
677 
678 	debugfs_create_file("tlb_use_lazy_mode", S_IRUSR | S_IWUSR,
679 			    arch_debugfs_dir, NULL, &fops_tlblazy);
680 	return 0;
681 }
682 late_initcall(init_tlb_use_lazy_mode);
683