xref: /linux/arch/x86/mm/tlb.c (revision 5e8c0fb6a95728b852d56c0a9244425d474670c0)
1 #include <linux/init.h>
2 
3 #include <linux/mm.h>
4 #include <linux/spinlock.h>
5 #include <linux/smp.h>
6 #include <linux/interrupt.h>
7 #include <linux/module.h>
8 #include <linux/cpu.h>
9 
10 #include <asm/tlbflush.h>
11 #include <asm/mmu_context.h>
12 #include <asm/cache.h>
13 #include <asm/apic.h>
14 #include <asm/uv/uv.h>
15 #include <linux/debugfs.h>
16 
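/*
 * Per-CPU TLB state: every CPU starts out with init_mm as its active mm
 * and its lazy-flush state cleared.
 */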
17 DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
18 			= { &init_mm, 0, };
19 
20 /*
21  *	Smarter SMP flushing macros.
22  *		c/o Linus Torvalds.
23  *
24  *	These mean you can really definitely utterly forget about
25  *	writing to user space from interrupts. (It's not allowed anyway).
26  *
27  *	Optimizations Manfred Spraul <manfred@colorfullife.com>
28  *
29  *	More scalable flush, from Andi Kleen
30  *
31  *	Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
32  */
33 
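/*
 * Argument block handed to the flush callbacks: a flush_end of
 * TLB_FLUSH_ALL requests a full flush, a flush_end of zero means a single
 * page starting at flush_start (see flush_tlb_func() below).
 */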
34 struct flush_tlb_info {
35 	struct mm_struct *flush_mm;
36 	unsigned long flush_start;
37 	unsigned long flush_end;
38 };
39 
40 /*
41  * We cannot call mmdrop() because we are in interrupt context;
42  * instead, update mm->cpu_vm_mask.
43  */
44 void leave_mm(int cpu)
45 {
46 	struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm);
47 	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
48 		BUG();
49 	if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
50 		cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
51 		load_cr3(swapper_pg_dir);
52 		/*
53 		 * This gets called in the idle path where RCU
54 		 * functions differently.  Tracing normally
55 		 * uses RCU, so we have to call the tracepoint
56 		 * specially here.
57 		 */
58 		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
59 	}
60 }
61 EXPORT_SYMBOL_GPL(leave_mm);
62 
63 /*
64  * The flush IPI assumes that a thread switch happens in this order:
65  * [cpu0: the cpu that switches]
66  * 1) switch_mm() either 1a) or 1b)
67  * 1a) thread switch to a different mm
68  * 1a1) set cpu_tlbstate to TLBSTATE_OK
69  *	Now the tlb flush IPI handler flush_tlb_func won't call leave_mm
70  *	if cpu0 was in lazy tlb mode.
71  * 1a2) update cpu active_mm
72  *	Now cpu0 accepts tlb flushes for the new mm.
73  * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask);
74  *	Now the other cpus will send tlb flush ipis.
75  * 1a4) change cr3.
76  * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask);
77  *	Stop ipi delivery for the old mm. This is not synchronized with
78  *	the other cpus, but flush_tlb_func ignores flush ipis for the wrong
79  *	mm, and in the worst case we perform a superfluous tlb flush.
80  * 1b) thread switch without mm change
81  *	cpu active_mm is correct, cpu0 already handles flush ipis.
82  * 1b1) set cpu_tlbstate to TLBSTATE_OK
83  * 1b2) test_and_set the cpu bit in cpu_vm_mask.
84  *	Atomically set the bit [other cpus will start sending flush ipis],
85  *	and test the bit.
86  * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
87  * 2) switch %esp, i.e. current
88  *
89  * The interrupt must handle 2 special cases:
90  * - cr3 is changed before %esp, i.e. it cannot use current->{active_,}mm.
91  * - the cpu performs speculative tlb reads, i.e. even if the cpu only
92  *   runs in kernel space, the cpu could load tlb entries for user space
93  *   pages.
94  *
95  * The good news is that cpu_tlbstate is local to each cpu, no
96  * write/read ordering problems.
97  */
98 
99 /*
100  * TLB flush function:
101  * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
102  * 2) Leave the mm if we are in lazy tlb mode.
103  */
104 static void flush_tlb_func(void *info)
105 {
106 	struct flush_tlb_info *f = info;
107 
108 	inc_irq_stat(irq_tlb_count);
109 
110 	if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
111 		return;
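	/* A flush_end of zero denotes a single-page flush at flush_start. */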
112 	if (!f->flush_end)
113 		f->flush_end = f->flush_start + PAGE_SIZE;
114 
115 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
116 	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
117 		if (f->flush_end == TLB_FLUSH_ALL) {
118 			local_flush_tlb();
119 			trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
120 		} else {
121 			unsigned long addr;
122 			unsigned long nr_pages =
123 				(f->flush_end - f->flush_start) / PAGE_SIZE;
124 			addr = f->flush_start;
125 			while (addr < f->flush_end) {
126 				__flush_tlb_single(addr);
127 				addr += PAGE_SIZE;
128 			}
129 			trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages);
130 		}
131 	} else
132 		leave_mm(smp_processor_id());
133 
134 }
135 
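/*
 * Ask the CPUs in cpumask to run flush_tlb_func() for the given range.
 * On UV systems, uv_flush_tlb_others() may perform the shootdown itself
 * and returns the CPUs that still need a conventional IPI (or NULL if
 * none do).
 */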
136 void native_flush_tlb_others(const struct cpumask *cpumask,
137 				 struct mm_struct *mm, unsigned long start,
138 				 unsigned long end)
139 {
140 	struct flush_tlb_info info;
141 	info.flush_mm = mm;
142 	info.flush_start = start;
143 	info.flush_end = end;
144 
145 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
146 	if (is_uv_system()) {
147 		unsigned int cpu;
148 
149 		cpu = smp_processor_id();
150 		cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
151 		if (cpumask)
152 			smp_call_function_many(cpumask, flush_tlb_func,
153 								&info, 1);
154 		return;
155 	}
156 	smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
157 }
158 
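/*
 * Flush the whole TLB for the current task's mm: do a full local flush,
 * then notify any other CPUs running this mm.
 */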
159 void flush_tlb_current_task(void)
160 {
161 	struct mm_struct *mm = current->mm;
162 
163 	preempt_disable();
164 
165 	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
166 	local_flush_tlb();
167 	trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
168 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
169 		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
170 	preempt_enable();
171 }
172 
173 /*
174  * See Documentation/x86/tlb.txt for details.  We choose 33
175  * because it is large enough to cover the vast majority (at
176  * least 95%) of allocations, and is small enough that we are
177  * confident it will not cause too much overhead.  Each single
178  * flush is about 100 ns, so this caps the maximum overhead at
179  * _about_ 3,000 ns.
180  *
181  * This is in units of pages.
182  */
183 static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
184 
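/*
 * Flush a range of user addresses for mm.  Ranges of up to
 * tlb_single_page_flush_ceiling pages that are not backed by huge pages
 * are flushed one page at a time with invlpg; anything larger falls back
 * to a full flush.  Other CPUs running this mm are notified either way.
 */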
185 void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
186 				unsigned long end, unsigned long vmflag)
187 {
188 	unsigned long addr;
189 	/* do a global flush by default */
190 	unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
191 
192 	preempt_disable();
193 	if (current->active_mm != mm)
194 		goto out;
195 
196 	if (!current->mm) {
197 		leave_mm(smp_processor_id());
198 		goto out;
199 	}
200 
201 	if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
202 		base_pages_to_flush = (end - start) >> PAGE_SHIFT;
203 
204 	if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
205 		base_pages_to_flush = TLB_FLUSH_ALL;
206 		count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
207 		local_flush_tlb();
208 	} else {
209 		/* flush the range one page at a time with 'invlpg' */
210 		for (addr = start; addr < end; addr += PAGE_SIZE) {
211 			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
212 			__flush_tlb_single(addr);
213 		}
214 	}
215 	trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush);
216 out:
217 	if (base_pages_to_flush == TLB_FLUSH_ALL) {
218 		start = 0UL;
219 		end = TLB_FLUSH_ALL;
220 	}
221 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
222 		flush_tlb_others(mm_cpumask(mm), mm, start, end);
223 	preempt_enable();
224 }
225 
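/*
 * Flush a single user page: invalidate it locally if this CPU runs the
 * vma's mm (or drop a lazy mm entirely), then tell the other CPUs.
 */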
226 void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
227 {
228 	struct mm_struct *mm = vma->vm_mm;
229 
230 	preempt_disable();
231 
232 	if (current->active_mm == mm) {
233 		if (current->mm)
234 			__flush_tlb_one(start);
235 		else
236 			leave_mm(smp_processor_id());
237 	}
238 
239 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
240 		flush_tlb_others(mm_cpumask(mm), mm, start, 0UL);
241 
242 	preempt_enable();
243 }
244 
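/*
 * Flush everything on this CPU, including global pages, and drop out of
 * lazy TLB mode if we were in it.
 */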
245 static void do_flush_tlb_all(void *info)
246 {
247 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
248 	__flush_tlb_all();
249 	if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
250 		leave_mm(smp_processor_id());
251 }
252 
253 void flush_tlb_all(void)
254 {
255 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
256 	on_each_cpu(do_flush_tlb_all, NULL, 1);
257 }
258 
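/*
 * Kernel-address flushes reuse struct flush_tlb_info; the range is
 * invalidated one page at a time on each CPU.
 */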
259 static void do_kernel_range_flush(void *info)
260 {
261 	struct flush_tlb_info *f = info;
262 	unsigned long addr;
263 
264 	/* flush the range one page at a time with 'invlpg' */
265 	for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE)
266 		__flush_tlb_single(addr);
267 }
268 
269 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
270 {
271 
272 	/* Balance with a user space task's flush; a bit conservative */
273 	if (end == TLB_FLUSH_ALL ||
274 	    (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) {
275 		on_each_cpu(do_flush_tlb_all, NULL, 1);
276 	} else {
277 		struct flush_tlb_info info;
278 		info.flush_start = start;
279 		info.flush_end = end;
280 		on_each_cpu(do_kernel_range_flush, &info, 1);
281 	}
282 }
283 
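/*
 * debugfs knob (under arch_debugfs_dir, normally /sys/kernel/debug/x86/)
 * that lets tlb_single_page_flush_ceiling be read and tuned at runtime.
 */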
284 static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
285 			     size_t count, loff_t *ppos)
286 {
287 	char buf[32];
288 	unsigned int len;
289 
290 	len = sprintf(buf, "%lu\n", tlb_single_page_flush_ceiling);
291 	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
292 }
293 
294 static ssize_t tlbflush_write_file(struct file *file,
295 		 const char __user *user_buf, size_t count, loff_t *ppos)
296 {
297 	char buf[32];
298 	ssize_t len;
299 	int ceiling;
300 
301 	len = min(count, sizeof(buf) - 1);
302 	if (copy_from_user(buf, user_buf, len))
303 		return -EFAULT;
304 
305 	buf[len] = '\0';
306 	if (kstrtoint(buf, 0, &ceiling))
307 		return -EINVAL;
308 
309 	if (ceiling < 0)
310 		return -EINVAL;
311 
312 	tlb_single_page_flush_ceiling = ceiling;
313 	return count;
314 }
315 
316 static const struct file_operations fops_tlbflush = {
317 	.read = tlbflush_read_file,
318 	.write = tlbflush_write_file,
319 	.llseek = default_llseek,
320 };
321 
322 static int __init create_tlb_single_page_flush_ceiling(void)
323 {
324 	debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
325 			    arch_debugfs_dir, NULL, &fops_tlbflush);
326 	return 0;
327 }
328 late_initcall(create_tlb_single_page_flush_ceiling);
329