xref: /linux/arch/x86/mm/kmmio.c (revision 005438a8eef063495ac059d128eea71b58de50e5)
/* Support for MMIO probes.
 * Benefits from much code borrowed from kprobes
 * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
 *     2007 Alexander Eichner
 *     2008 Pekka Paalanen <pq@iki.fi>
 */
7 
/* Prefix every pr_*() message emitted from this file with the module name. */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9 
10 #include <linux/list.h>
11 #include <linux/rculist.h>
12 #include <linux/spinlock.h>
13 #include <linux/hash.h>
14 #include <linux/module.h>
15 #include <linux/kernel.h>
16 #include <linux/uaccess.h>
17 #include <linux/ptrace.h>
18 #include <linux/preempt.h>
19 #include <linux/percpu.h>
20 #include <linux/kdebug.h>
21 #include <linux/mutex.h>
22 #include <linux/io.h>
23 #include <linux/slab.h>
24 #include <asm/cacheflush.h>
25 #include <asm/tlbflush.h>
26 #include <linux/errno.h>
27 #include <asm/debugreg.h>
28 #include <linux/mmiotrace.h>
29 
/*
 * kmmio_page_table is a small hash table of fault pages:
 * KMMIO_PAGE_HASH_BITS is the width of the hash, and
 * KMMIO_PAGE_TABLE_SIZE the resulting number of buckets.
 */
#define KMMIO_PAGE_HASH_BITS 4
#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
32 
33 struct kmmio_fault_page {
34 	struct list_head list;
35 	struct kmmio_fault_page *release_next;
36 	unsigned long page; /* location of the fault page */
37 	pteval_t old_presence; /* page presence prior to arming */
38 	bool armed;
39 
40 	/*
41 	 * Number of times this page has been registered as a part
42 	 * of a probe. If zero, page is disarmed and this may be freed.
43 	 * Used only by writers (RCU) and post_kmmio_handler().
44 	 * Protected by kmmio_lock, when linked into kmmio_page_table.
45 	 */
46 	int count;
47 
48 	bool scheduled_for_release;
49 };
50 
51 struct kmmio_delayed_release {
52 	struct rcu_head rcu;
53 	struct kmmio_fault_page *release_list;
54 };
55 
/*
 * Per-cpu state carried from kmmio_handler() (the page fault) over the
 * single-stepped instruction to post_kmmio_handler() (the debug trap).
 */
struct kmmio_context {
	struct kmmio_fault_page *fpage;	/* page that faulted */
	struct kmmio_probe *probe;	/* probe covering addr, may be NULL */
	unsigned long saved_flags;	/* TF and IF bits of the faulting context */
	unsigned long addr;		/* faulting address */
	int active;			/* non-zero while a hit is in flight */
};
63 
64 static DEFINE_SPINLOCK(kmmio_lock);
65 
66 /* Protected by kmmio_lock */
67 unsigned int kmmio_count;
68 
69 /* Read-protected by RCU, write-protected by kmmio_lock. */
70 static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
71 static LIST_HEAD(kmmio_probes);
72 
73 static struct list_head *kmmio_page_list(unsigned long page)
74 {
75 	return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
76 }
77 
78 /* Accessed per-cpu */
79 static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);
80 
81 /*
82  * this is basically a dynamic stabbing problem:
83  * Could use the existing prio tree code or
84  * Possible better implementations:
85  * The Interval Skip List: A Data Structure for Finding All Intervals That
86  * Overlap a Point (might be simple)
87  * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
88  */
89 /* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
90 static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
91 {
92 	struct kmmio_probe *p;
93 	list_for_each_entry_rcu(p, &kmmio_probes, list) {
94 		if (addr >= p->addr && addr < (p->addr + p->len))
95 			return p;
96 	}
97 	return NULL;
98 }
99 
100 /* You must be holding RCU read lock. */
101 static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
102 {
103 	struct list_head *head;
104 	struct kmmio_fault_page *f;
105 
106 	page &= PAGE_MASK;
107 	head = kmmio_page_list(page);
108 	list_for_each_entry_rcu(f, head, list) {
109 		if (f->page == page)
110 			return f;
111 	}
112 	return NULL;
113 }
114 
115 static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
116 {
117 	pmdval_t v = pmd_val(*pmd);
118 	if (clear) {
119 		*old = v & _PAGE_PRESENT;
120 		v &= ~_PAGE_PRESENT;
121 	} else	/* presume this has been called with clear==true previously */
122 		v |= *old;
123 	set_pmd(pmd, __pmd(v));
124 }
125 
126 static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
127 {
128 	pteval_t v = pte_val(*pte);
129 	if (clear) {
130 		*old = v & _PAGE_PRESENT;
131 		v &= ~_PAGE_PRESENT;
132 	} else	/* presume this has been called with clear==true previously */
133 		v |= *old;
134 	set_pte_atomic(pte, __pte(v));
135 }
136 
137 static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
138 {
139 	unsigned int level;
140 	pte_t *pte = lookup_address(f->page, &level);
141 
142 	if (!pte) {
143 		pr_err("no pte for page 0x%08lx\n", f->page);
144 		return -1;
145 	}
146 
147 	switch (level) {
148 	case PG_LEVEL_2M:
149 		clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence);
150 		break;
151 	case PG_LEVEL_4K:
152 		clear_pte_presence(pte, clear, &f->old_presence);
153 		break;
154 	default:
155 		pr_err("unexpected page level 0x%x.\n", level);
156 		return -1;
157 	}
158 
159 	__flush_tlb_one(f->page);
160 	return 0;
161 }
162 
163 /*
164  * Mark the given page as not present. Access to it will trigger a fault.
165  *
166  * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the
167  * protection is ignored here. RCU read lock is assumed held, so the struct
168  * will not disappear unexpectedly. Furthermore, the caller must guarantee,
169  * that double arming the same virtual address (page) cannot occur.
170  *
171  * Double disarming on the other hand is allowed, and may occur when a fault
172  * and mmiotrace shutdown happen simultaneously.
173  */
174 static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
175 {
176 	int ret;
177 	WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n"));
178 	if (f->armed) {
179 		pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n",
180 			   f->page, f->count, !!f->old_presence);
181 	}
182 	ret = clear_page_presence(f, true);
183 	WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"),
184 		  f->page);
185 	f->armed = true;
186 	return ret;
187 }
188 
189 /** Restore the given page to saved presence state. */
190 static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
191 {
192 	int ret = clear_page_presence(f, false);
193 	WARN_ONCE(ret < 0,
194 			KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
195 	f->armed = false;
196 }
197 
198 /*
199  * This is being called from do_page_fault().
200  *
201  * We may be in an interrupt or a critical section. Also prefecthing may
202  * trigger a page fault. We may be in the middle of process switch.
203  * We cannot take any locks, because we could be executing especially
204  * within a kmmio critical section.
205  *
206  * Local interrupts are disabled, so preemption cannot happen.
207  * Do not enable interrupts, do not sleep, and watch out for other CPUs.
208  */
209 /*
210  * Interrupts are disabled on entry as trap3 is an interrupt gate
211  * and they remain disabled throughout this function.
212  */
213 int kmmio_handler(struct pt_regs *regs, unsigned long addr)
214 {
215 	struct kmmio_context *ctx;
216 	struct kmmio_fault_page *faultpage;
217 	int ret = 0; /* default to fault not handled */
218 
219 	/*
220 	 * Preemption is now disabled to prevent process switch during
221 	 * single stepping. We can only handle one active kmmio trace
222 	 * per cpu, so ensure that we finish it before something else
223 	 * gets to run. We also hold the RCU read lock over single
224 	 * stepping to avoid looking up the probe and kmmio_fault_page
225 	 * again.
226 	 */
227 	preempt_disable();
228 	rcu_read_lock();
229 
230 	faultpage = get_kmmio_fault_page(addr);
231 	if (!faultpage) {
232 		/*
233 		 * Either this page fault is not caused by kmmio, or
234 		 * another CPU just pulled the kmmio probe from under
235 		 * our feet. The latter case should not be possible.
236 		 */
237 		goto no_kmmio;
238 	}
239 
240 	ctx = &get_cpu_var(kmmio_ctx);
241 	if (ctx->active) {
242 		if (addr == ctx->addr) {
243 			/*
244 			 * A second fault on the same page means some other
245 			 * condition needs handling by do_page_fault(), the
246 			 * page really not being present is the most common.
247 			 */
248 			pr_debug("secondary hit for 0x%08lx CPU %d.\n",
249 				 addr, smp_processor_id());
250 
251 			if (!faultpage->old_presence)
252 				pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n",
253 					addr, smp_processor_id());
254 		} else {
255 			/*
256 			 * Prevent overwriting already in-flight context.
257 			 * This should not happen, let's hope disarming at
258 			 * least prevents a panic.
259 			 */
260 			pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n",
261 				 smp_processor_id(), addr);
262 			pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr);
263 			disarm_kmmio_fault_page(faultpage);
264 		}
265 		goto no_kmmio_ctx;
266 	}
267 	ctx->active++;
268 
269 	ctx->fpage = faultpage;
270 	ctx->probe = get_kmmio_probe(addr);
271 	ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
272 	ctx->addr = addr;
273 
274 	if (ctx->probe && ctx->probe->pre_handler)
275 		ctx->probe->pre_handler(ctx->probe, regs, addr);
276 
277 	/*
278 	 * Enable single-stepping and disable interrupts for the faulting
279 	 * context. Local interrupts must not get enabled during stepping.
280 	 */
281 	regs->flags |= X86_EFLAGS_TF;
282 	regs->flags &= ~X86_EFLAGS_IF;
283 
284 	/* Now we set present bit in PTE and single step. */
285 	disarm_kmmio_fault_page(ctx->fpage);
286 
287 	/*
288 	 * If another cpu accesses the same page while we are stepping,
289 	 * the access will not be caught. It will simply succeed and the
290 	 * only downside is we lose the event. If this becomes a problem,
291 	 * the user should drop to single cpu before tracing.
292 	 */
293 
294 	put_cpu_var(kmmio_ctx);
295 	return 1; /* fault handled */
296 
297 no_kmmio_ctx:
298 	put_cpu_var(kmmio_ctx);
299 no_kmmio:
300 	rcu_read_unlock();
301 	preempt_enable_no_resched();
302 	return ret;
303 }
304 
305 /*
306  * Interrupts are disabled on entry as trap1 is an interrupt gate
307  * and they remain disabled throughout this function.
308  * This must always get called as the pair to kmmio_handler().
309  */
310 static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
311 {
312 	int ret = 0;
313 	struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
314 
315 	if (!ctx->active) {
316 		/*
317 		 * debug traps without an active context are due to either
318 		 * something external causing them (f.e. using a debugger while
319 		 * mmio tracing enabled), or erroneous behaviour
320 		 */
321 		pr_warning("unexpected debug trap on CPU %d.\n",
322 			   smp_processor_id());
323 		goto out;
324 	}
325 
326 	if (ctx->probe && ctx->probe->post_handler)
327 		ctx->probe->post_handler(ctx->probe, condition, regs);
328 
329 	/* Prevent racing against release_kmmio_fault_page(). */
330 	spin_lock(&kmmio_lock);
331 	if (ctx->fpage->count)
332 		arm_kmmio_fault_page(ctx->fpage);
333 	spin_unlock(&kmmio_lock);
334 
335 	regs->flags &= ~X86_EFLAGS_TF;
336 	regs->flags |= ctx->saved_flags;
337 
338 	/* These were acquired in kmmio_handler(). */
339 	ctx->active--;
340 	BUG_ON(ctx->active);
341 	rcu_read_unlock();
342 	preempt_enable_no_resched();
343 
344 	/*
345 	 * if somebody else is singlestepping across a probe point, flags
346 	 * will have TF set, in which case, continue the remaining processing
347 	 * of do_debug, as if this is not a probe hit.
348 	 */
349 	if (!(regs->flags & X86_EFLAGS_TF))
350 		ret = 1;
351 out:
352 	put_cpu_var(kmmio_ctx);
353 	return ret;
354 }
355 
356 /* You must be holding kmmio_lock. */
357 static int add_kmmio_fault_page(unsigned long page)
358 {
359 	struct kmmio_fault_page *f;
360 
361 	page &= PAGE_MASK;
362 	f = get_kmmio_fault_page(page);
363 	if (f) {
364 		if (!f->count)
365 			arm_kmmio_fault_page(f);
366 		f->count++;
367 		return 0;
368 	}
369 
370 	f = kzalloc(sizeof(*f), GFP_ATOMIC);
371 	if (!f)
372 		return -1;
373 
374 	f->count = 1;
375 	f->page = page;
376 
377 	if (arm_kmmio_fault_page(f)) {
378 		kfree(f);
379 		return -1;
380 	}
381 
382 	list_add_rcu(&f->list, kmmio_page_list(f->page));
383 
384 	return 0;
385 }
386 
387 /* You must be holding kmmio_lock. */
388 static void release_kmmio_fault_page(unsigned long page,
389 				struct kmmio_fault_page **release_list)
390 {
391 	struct kmmio_fault_page *f;
392 
393 	page &= PAGE_MASK;
394 	f = get_kmmio_fault_page(page);
395 	if (!f)
396 		return;
397 
398 	f->count--;
399 	BUG_ON(f->count < 0);
400 	if (!f->count) {
401 		disarm_kmmio_fault_page(f);
402 		if (!f->scheduled_for_release) {
403 			f->release_next = *release_list;
404 			*release_list = f;
405 			f->scheduled_for_release = true;
406 		}
407 	}
408 }
409 
410 /*
411  * With page-unaligned ioremaps, one or two armed pages may contain
412  * addresses from outside the intended mapping. Events for these addresses
413  * are currently silently dropped. The events may result only from programming
414  * mistakes by accessing addresses before the beginning or past the end of a
415  * mapping.
416  */
417 int register_kmmio_probe(struct kmmio_probe *p)
418 {
419 	unsigned long flags;
420 	int ret = 0;
421 	unsigned long size = 0;
422 	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
423 
424 	spin_lock_irqsave(&kmmio_lock, flags);
425 	if (get_kmmio_probe(p->addr)) {
426 		ret = -EEXIST;
427 		goto out;
428 	}
429 	kmmio_count++;
430 	list_add_rcu(&p->list, &kmmio_probes);
431 	while (size < size_lim) {
432 		if (add_kmmio_fault_page(p->addr + size))
433 			pr_err("Unable to set page fault.\n");
434 		size += PAGE_SIZE;
435 	}
436 out:
437 	spin_unlock_irqrestore(&kmmio_lock, flags);
438 	/*
439 	 * XXX: What should I do here?
440 	 * Here was a call to global_flush_tlb(), but it does not exist
441 	 * anymore. It seems it's not needed after all.
442 	 */
443 	return ret;
444 }
445 EXPORT_SYMBOL(register_kmmio_probe);
446 
447 static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
448 {
449 	struct kmmio_delayed_release *dr = container_of(
450 						head,
451 						struct kmmio_delayed_release,
452 						rcu);
453 	struct kmmio_fault_page *f = dr->release_list;
454 	while (f) {
455 		struct kmmio_fault_page *next = f->release_next;
456 		BUG_ON(f->count);
457 		kfree(f);
458 		f = next;
459 	}
460 	kfree(dr);
461 }
462 
463 static void remove_kmmio_fault_pages(struct rcu_head *head)
464 {
465 	struct kmmio_delayed_release *dr =
466 		container_of(head, struct kmmio_delayed_release, rcu);
467 	struct kmmio_fault_page *f = dr->release_list;
468 	struct kmmio_fault_page **prevp = &dr->release_list;
469 	unsigned long flags;
470 
471 	spin_lock_irqsave(&kmmio_lock, flags);
472 	while (f) {
473 		if (!f->count) {
474 			list_del_rcu(&f->list);
475 			prevp = &f->release_next;
476 		} else {
477 			*prevp = f->release_next;
478 			f->release_next = NULL;
479 			f->scheduled_for_release = false;
480 		}
481 		f = *prevp;
482 	}
483 	spin_unlock_irqrestore(&kmmio_lock, flags);
484 
485 	/* This is the real RCU destroy call. */
486 	call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
487 }
488 
489 /*
490  * Remove a kmmio probe. You have to synchronize_rcu() before you can be
491  * sure that the callbacks will not be called anymore. Only after that
492  * you may actually release your struct kmmio_probe.
493  *
494  * Unregistering a kmmio fault page has three steps:
495  * 1. release_kmmio_fault_page()
496  *    Disarm the page, wait a grace period to let all faults finish.
497  * 2. remove_kmmio_fault_pages()
498  *    Remove the pages from kmmio_page_table.
499  * 3. rcu_free_kmmio_fault_pages()
500  *    Actually free the kmmio_fault_page structs as with RCU.
501  */
502 void unregister_kmmio_probe(struct kmmio_probe *p)
503 {
504 	unsigned long flags;
505 	unsigned long size = 0;
506 	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
507 	struct kmmio_fault_page *release_list = NULL;
508 	struct kmmio_delayed_release *drelease;
509 
510 	spin_lock_irqsave(&kmmio_lock, flags);
511 	while (size < size_lim) {
512 		release_kmmio_fault_page(p->addr + size, &release_list);
513 		size += PAGE_SIZE;
514 	}
515 	list_del_rcu(&p->list);
516 	kmmio_count--;
517 	spin_unlock_irqrestore(&kmmio_lock, flags);
518 
519 	if (!release_list)
520 		return;
521 
522 	drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
523 	if (!drelease) {
524 		pr_crit("leaking kmmio_fault_page objects.\n");
525 		return;
526 	}
527 	drelease->release_list = release_list;
528 
529 	/*
530 	 * This is not really RCU here. We have just disarmed a set of
531 	 * pages so that they cannot trigger page faults anymore. However,
532 	 * we cannot remove the pages from kmmio_page_table,
533 	 * because a probe hit might be in flight on another CPU. The
534 	 * pages are collected into a list, and they will be removed from
535 	 * kmmio_page_table when it is certain that no probe hit related to
536 	 * these pages can be in flight. RCU grace period sounds like a
537 	 * good choice.
538 	 *
539 	 * If we removed the pages too early, kmmio page fault handler might
540 	 * not find the respective kmmio_fault_page and determine it's not
541 	 * a kmmio fault, when it actually is. This would lead to madness.
542 	 */
543 	call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
544 }
545 EXPORT_SYMBOL(unregister_kmmio_probe);
546 
547 static int
548 kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
549 {
550 	struct die_args *arg = args;
551 	unsigned long* dr6_p = (unsigned long *)ERR_PTR(arg->err);
552 
553 	if (val == DIE_DEBUG && (*dr6_p & DR_STEP))
554 		if (post_kmmio_handler(*dr6_p, arg->regs) == 1) {
555 			/*
556 			 * Reset the BS bit in dr6 (pointed by args->err) to
557 			 * denote completion of processing
558 			 */
559 			*dr6_p &= ~DR_STEP;
560 			return NOTIFY_STOP;
561 		}
562 
563 	return NOTIFY_DONE;
564 }
565 
566 static struct notifier_block nb_die = {
567 	.notifier_call = kmmio_die_notifier
568 };
569 
570 int kmmio_init(void)
571 {
572 	int i;
573 
574 	for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
575 		INIT_LIST_HEAD(&kmmio_page_table[i]);
576 
577 	return register_die_notifier(&nb_die);
578 }
579 
580 void kmmio_cleanup(void)
581 {
582 	int i;
583 
584 	unregister_die_notifier(&nb_die);
585 	for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) {
586 		WARN_ONCE(!list_empty(&kmmio_page_table[i]),
587 			KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n");
588 	}
589 }
590