1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3 * User-space Probes (UProbes)
4 *
5 * Copyright (C) IBM Corporation, 2008-2012
6 * Authors:
7 * Srikar Dronamraju
8 * Jim Keniston
9 * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra
10 */
11
12 #include <linux/kernel.h>
13 #include <linux/highmem.h>
14 #include <linux/pagemap.h> /* read_mapping_page */
15 #include <linux/slab.h>
16 #include <linux/sched.h>
17 #include <linux/sched/mm.h>
18 #include <linux/export.h>
19 #include <linux/rmap.h> /* anon_vma_prepare */
20 #include <linux/mmu_notifier.h>
21 #include <linux/swap.h> /* folio_free_swap */
22 #include <linux/ptrace.h> /* user_enable_single_step */
23 #include <linux/kdebug.h> /* notifier mechanism */
24 #include <linux/percpu-rwsem.h>
25 #include <linux/task_work.h>
26 #include <linux/shmem_fs.h>
27 #include <linux/khugepaged.h>
28 #include <linux/rcupdate_trace.h>
29 #include <linux/workqueue.h>
30 #include <linux/srcu.h>
31 #include <linux/oom.h> /* check_stable_address_space */
32
33 #include <linux/uprobes.h>
34
35 #define UINSNS_PER_PAGE (PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
36 #define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
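/*
 * Worked example (a sketch, assuming a typical configuration such as x86
 * with 4 KiB pages and 128-byte XOL slots): UINSNS_PER_PAGE evaluates to
 * 4096 / 128 = 32, i.e. at most 32 out-of-line instruction slots per
 * xol_area page. The real slot size is arch-specific; see the arch's
 * definition of UPROBE_XOL_SLOT_BYTES.
 */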
37
38 static struct rb_root uprobes_tree = RB_ROOT;
39 /*
40 * Allows us to skip uprobe_mmap() if there are no uprobe events active
41 * at this time. Probably a fine-grained per-inode count would be better?
42 */
43 #define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
44
45 static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */
46 static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock);
47
48 #define UPROBES_HASH_SZ 13
49 /* serialize uprobe->pending_list */
50 static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
51 #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
52
53 DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
54
55 /* Covers return_instance's uprobe lifetime. */
56 DEFINE_STATIC_SRCU(uretprobes_srcu);
57
58 /* Have a copy of original instruction */
59 #define UPROBE_COPY_INSN 0
60
61 struct uprobe {
62 struct rb_node rb_node; /* node in the rb tree */
63 refcount_t ref;
64 struct rw_semaphore register_rwsem;
65 struct rw_semaphore consumer_rwsem;
66 struct list_head pending_list;
67 struct list_head consumers;
68 struct inode *inode; /* Also hold a ref to inode */
69 union {
70 struct rcu_head rcu;
71 struct work_struct work;
72 };
73 loff_t offset;
74 loff_t ref_ctr_offset;
75 unsigned long flags; /* "unsigned long" so bitops work */
76
77 /*
78 * The generic code assumes that it has two members of unknown type
79 * owned by the arch-specific code:
80 *
81 * insn - copy_insn() saves the original instruction here for
82 * arch_uprobe_analyze_insn().
83 *
84 * ixol - potentially modified instruction to execute out of
85 * line, copied to xol_area by xol_get_insn_slot().
86 */
87 struct arch_uprobe arch;
88 };
89
90 struct delayed_uprobe {
91 struct list_head list;
92 struct uprobe *uprobe;
93 struct mm_struct *mm;
94 };
95
96 static DEFINE_MUTEX(delayed_uprobe_lock);
97 static LIST_HEAD(delayed_uprobe_list);
98
99 /*
100 * Execute out of line area: anonymous executable mapping installed
101 * by the probed task to execute the copy of the original instruction
102 * mangled by set_swbp().
103 *
104 * On a breakpoint hit, the thread contends for a slot. It frees the
105 * slot after single-stepping. Currently a fixed number of slots is
106 * allocated.
107 */
108 struct xol_area {
109 wait_queue_head_t wq; /* if all slots are busy */
110 unsigned long *bitmap; /* 0 = free slot */
111
112 struct page *page;
113 /*
114 * We keep the vma's vm_start rather than a pointer to the vma
115 * itself. The probed process or a naughty kernel module could make
116 * the vma go away, and we must handle that reasonably gracefully.
117 */
118 unsigned long vaddr; /* Page(s) of instruction slots */
119 };
120
121 static void uprobe_warn(struct task_struct *t, const char *msg)
122 {
123 pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg);
124 }
125
126 /*
127 * valid_vma: Verify if the specified vma is an executable vma.
128 * Relax restrictions while unregistering: vm_flags might have
129 * changed after the breakpoint was inserted.
130 * - is_register: indicates if we are in register context.
131 * - Return true if the specified virtual address is in an
132 * executable vma.
133 */
134 static bool valid_vma(struct vm_area_struct *vma, bool is_register)
135 {
136 vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
137
138 if (is_register)
139 flags |= VM_WRITE;
140
141 return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC;
142 }
143
144 static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
145 {
146 return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
147 }
148
149 static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
150 {
151 return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
152 }
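/*
 * Illustrative sketch of the conversion above (hypothetical numbers): for
 * a vma with vm_start == 0x400000 and vm_pgoff == 2 (file offset 0x2000
 * with 4 KiB pages), a probe at file offset 0x2f00 maps to
 * offset_to_vaddr() == 0x400000 + 0x2f00 - 0x2000 == 0x400f00, and
 * vaddr_to_offset(vma, 0x400f00) gives back 0x2f00.
 */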
153
154 /**
155 * __replace_page - replace page in vma by new page.
156 * based on replace_page in mm/ksm.c
157 *
158 * @vma: vma that holds the pte pointing to page
159 * @addr: address the old @page is mapped at
160 * @old_page: the page we are replacing with @new_page
161 * @new_page: the modified page with which we replace @old_page
162 *
163 * If @new_page is NULL, only unmap @old_page.
164 *
165 * Returns 0 on success, negative error code otherwise.
166 */
167 static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
168 struct page *old_page, struct page *new_page)
169 {
170 struct folio *old_folio = page_folio(old_page);
171 struct folio *new_folio;
172 struct mm_struct *mm = vma->vm_mm;
173 DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0);
174 int err;
175 struct mmu_notifier_range range;
176
177 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
178 addr + PAGE_SIZE);
179
180 if (new_page) {
181 new_folio = page_folio(new_page);
182 err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL);
183 if (err)
184 return err;
185 }
186
187 /* For folio_free_swap() below */
188 folio_lock(old_folio);
189
190 mmu_notifier_invalidate_range_start(&range);
191 err = -EAGAIN;
192 if (!page_vma_mapped_walk(&pvmw))
193 goto unlock;
194 VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
195
196 if (new_page) {
197 folio_get(new_folio);
198 folio_add_new_anon_rmap(new_folio, vma, addr, RMAP_EXCLUSIVE);
199 folio_add_lru_vma(new_folio, vma);
200 } else
201 /* no new page, just dec_mm_counter for old_page */
202 dec_mm_counter(mm, MM_ANONPAGES);
203
204 if (!folio_test_anon(old_folio)) {
205 dec_mm_counter(mm, mm_counter_file(old_folio));
206 inc_mm_counter(mm, MM_ANONPAGES);
207 }
208
209 flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte)));
210 ptep_clear_flush(vma, addr, pvmw.pte);
211 if (new_page)
212 set_pte_at(mm, addr, pvmw.pte,
213 mk_pte(new_page, vma->vm_page_prot));
214
215 folio_remove_rmap_pte(old_folio, old_page, vma);
216 if (!folio_mapped(old_folio))
217 folio_free_swap(old_folio);
218 page_vma_mapped_walk_done(&pvmw);
219 folio_put(old_folio);
220
221 err = 0;
222 unlock:
223 mmu_notifier_invalidate_range_end(&range);
224 folio_unlock(old_folio);
225 return err;
226 }
227
228 /**
229 * is_swbp_insn - check if instruction is breakpoint instruction.
230 * @insn: instruction to be checked.
231 * Default implementation of is_swbp_insn
232 * Returns true if @insn is a breakpoint instruction.
233 */
234 bool __weak is_swbp_insn(uprobe_opcode_t *insn)
235 {
236 return *insn == UPROBE_SWBP_INSN;
237 }
238
239 /**
240 * is_trap_insn - check if instruction is breakpoint instruction.
241 * @insn: instruction to be checked.
242 * Default implementation of is_trap_insn
243 * Returns true if @insn is a breakpoint instruction.
244 *
245 * This function is needed for the case where an architecture has multiple
246 * trap instructions (like powerpc).
247 */
248 bool __weak is_trap_insn(uprobe_opcode_t *insn)
249 {
250 return is_swbp_insn(insn);
251 }
252
253 static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
254 {
255 void *kaddr = kmap_atomic(page);
256 memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
257 kunmap_atomic(kaddr);
258 }
259
260 static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
261 {
262 void *kaddr = kmap_atomic(page);
263 memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
264 kunmap_atomic(kaddr);
265 }
266
267 static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
268 {
269 uprobe_opcode_t old_opcode;
270 bool is_swbp;
271
272 /*
273 * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here.
274 * We do not check if it is any other 'trap variant' which could
275 * be conditional trap instruction such as the one powerpc supports.
276 *
277 * The logic is that we do not care if the underlying instruction
278 * is a trap variant; uprobes always wins over any other (gdb)
279 * breakpoint.
280 */
281 copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
282 is_swbp = is_swbp_insn(&old_opcode);
283
284 if (is_swbp_insn(new_opcode)) {
285 if (is_swbp) /* register: already installed? */
286 return 0;
287 } else {
288 if (!is_swbp) /* unregister: was it changed by us? */
289 return 0;
290 }
291
292 return 1;
293 }
294
295 static struct delayed_uprobe *
296 delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm)
297 {
298 struct delayed_uprobe *du;
299
300 list_for_each_entry(du, &delayed_uprobe_list, list)
301 if (du->uprobe == uprobe && du->mm == mm)
302 return du;
303 return NULL;
304 }
305
306 static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm)
307 {
308 struct delayed_uprobe *du;
309
310 if (delayed_uprobe_check(uprobe, mm))
311 return 0;
312
313 du = kzalloc(sizeof(*du), GFP_KERNEL);
314 if (!du)
315 return -ENOMEM;
316
317 du->uprobe = uprobe;
318 du->mm = mm;
319 list_add(&du->list, &delayed_uprobe_list);
320 return 0;
321 }
322
323 static void delayed_uprobe_delete(struct delayed_uprobe *du)
324 {
325 if (WARN_ON(!du))
326 return;
327 list_del(&du->list);
328 kfree(du);
329 }
330
331 static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm)
332 {
333 struct list_head *pos, *q;
334 struct delayed_uprobe *du;
335
336 if (!uprobe && !mm)
337 return;
338
339 list_for_each_safe(pos, q, &delayed_uprobe_list) {
340 du = list_entry(pos, struct delayed_uprobe, list);
341
342 if (uprobe && du->uprobe != uprobe)
343 continue;
344 if (mm && du->mm != mm)
345 continue;
346
347 delayed_uprobe_delete(du);
348 }
349 }
350
351 static bool valid_ref_ctr_vma(struct uprobe *uprobe,
352 struct vm_area_struct *vma)
353 {
354 unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset);
355
356 return uprobe->ref_ctr_offset &&
357 vma->vm_file &&
358 file_inode(vma->vm_file) == uprobe->inode &&
359 (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
360 vma->vm_start <= vaddr &&
361 vma->vm_end > vaddr;
362 }
363
364 static struct vm_area_struct *
365 find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm)
366 {
367 VMA_ITERATOR(vmi, mm, 0);
368 struct vm_area_struct *tmp;
369
370 for_each_vma(vmi, tmp)
371 if (valid_ref_ctr_vma(uprobe, tmp))
372 return tmp;
373
374 return NULL;
375 }
376
377 static int
378 __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
379 {
380 void *kaddr;
381 struct page *page;
382 int ret;
383 short *ptr;
384
385 if (!vaddr || !d)
386 return -EINVAL;
387
388 ret = get_user_pages_remote(mm, vaddr, 1,
389 FOLL_WRITE, &page, NULL);
390 if (unlikely(ret <= 0)) {
391 /*
392 * We are asking for 1 page. If get_user_pages_remote() fails,
393 * it may return 0; in that case we have to return an error.
394 */
395 return ret == 0 ? -EBUSY : ret;
396 }
397
398 kaddr = kmap_atomic(page);
399 ptr = kaddr + (vaddr & ~PAGE_MASK);
400
401 if (unlikely(*ptr + d < 0)) {
402 pr_warn("ref_ctr going negative. vaddr: 0x%lx, "
403 "curr val: %d, delta: %d\n", vaddr, *ptr, d);
404 ret = -EINVAL;
405 goto out;
406 }
407
408 *ptr += d;
409 ret = 0;
410 out:
411 kunmap_atomic(kaddr);
412 put_page(page);
413 return ret;
414 }
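/*
 * Usage sketch: the ref_ctr is the USDT-style semaphore living in the
 * probed process' writable data. Registering a probe bumps it via
 * __update_ref_ctr(mm, vaddr, 1) and unregistering drops it with
 * __update_ref_ctr(mm, vaddr, -1), so userspace can cheaply guard
 * argument setup with an "if (semaphore)" check. The vaddr/delta pairs
 * come from update_ref_ctr() and delayed_ref_ctr_inc() below.
 */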
415
416 static void update_ref_ctr_warn(struct uprobe *uprobe,
417 struct mm_struct *mm, short d)
418 {
419 pr_warn("ref_ctr %s failed for inode: 0x%lx offset: "
420 "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n",
421 d > 0 ? "increment" : "decrement", uprobe->inode->i_ino,
422 (unsigned long long) uprobe->offset,
423 (unsigned long long) uprobe->ref_ctr_offset, mm);
424 }
425
426 static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
427 short d)
428 {
429 struct vm_area_struct *rc_vma;
430 unsigned long rc_vaddr;
431 int ret = 0;
432
433 rc_vma = find_ref_ctr_vma(uprobe, mm);
434
435 if (rc_vma) {
436 rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset);
437 ret = __update_ref_ctr(mm, rc_vaddr, d);
438 if (ret)
439 update_ref_ctr_warn(uprobe, mm, d);
440
441 if (d > 0)
442 return ret;
443 }
444
445 mutex_lock(&delayed_uprobe_lock);
446 if (d > 0)
447 ret = delayed_uprobe_add(uprobe, mm);
448 else
449 delayed_uprobe_remove(uprobe, mm);
450 mutex_unlock(&delayed_uprobe_lock);
451
452 return ret;
453 }
454
455 /*
456 * NOTE:
457 * Expect the breakpoint instruction to be the smallest-size instruction for
458 * the architecture. If an arch has variable-length instructions and the
459 * breakpoint instruction is not the smallest-length instruction
460 * supported by that architecture, then we need to modify is_trap_at_addr() and
461 * uprobe_write_opcode() accordingly. This would never be a problem for archs
462 * that have fixed-length instructions.
463 *
464 * uprobe_write_opcode - write the opcode at a given virtual address.
465 * @auprobe: arch specific probepoint information.
466 * @mm: the probed process address space.
467 * @vaddr: the virtual address to store the opcode.
468 * @opcode: opcode to be written at @vaddr.
469 *
470 * Called with mm->mmap_lock held for read or write.
471 * Return 0 (success) or a negative errno.
472 */
473 int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
474 unsigned long vaddr, uprobe_opcode_t opcode)
475 {
476 struct uprobe *uprobe;
477 struct page *old_page, *new_page;
478 struct vm_area_struct *vma;
479 int ret, is_register, ref_ctr_updated = 0;
480 bool orig_page_huge = false;
481 unsigned int gup_flags = FOLL_FORCE;
482
483 is_register = is_swbp_insn(&opcode);
484 uprobe = container_of(auprobe, struct uprobe, arch);
485
486 retry:
487 if (is_register)
488 gup_flags |= FOLL_SPLIT_PMD;
489 /* Read the page with vaddr into memory */
490 old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
491 if (IS_ERR(old_page))
492 return PTR_ERR(old_page);
493
494 ret = verify_opcode(old_page, vaddr, &opcode);
495 if (ret <= 0)
496 goto put_old;
497
498 if (is_zero_page(old_page)) {
499 ret = -EINVAL;
500 goto put_old;
501 }
502
503 if (WARN(!is_register && PageCompound(old_page),
504 "uprobe unregister should never work on compound page\n")) {
505 ret = -EINVAL;
506 goto put_old;
507 }
508
509 /* We are going to replace instruction, update ref_ctr. */
510 if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
511 ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
512 if (ret)
513 goto put_old;
514
515 ref_ctr_updated = 1;
516 }
517
518 ret = 0;
519 if (!is_register && !PageAnon(old_page))
520 goto put_old;
521
522 ret = anon_vma_prepare(vma);
523 if (ret)
524 goto put_old;
525
526 ret = -ENOMEM;
527 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
528 if (!new_page)
529 goto put_old;
530
531 __SetPageUptodate(new_page);
532 copy_highpage(new_page, old_page);
533 copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
534
535 if (!is_register) {
536 struct page *orig_page;
537 pgoff_t index;
538
539 VM_BUG_ON_PAGE(!PageAnon(old_page), old_page);
540
541 index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
542 orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
543 index);
544
545 if (orig_page) {
546 if (PageUptodate(orig_page) &&
547 pages_identical(new_page, orig_page)) {
548 /* let go new_page */
549 put_page(new_page);
550 new_page = NULL;
551
552 if (PageCompound(orig_page))
553 orig_page_huge = true;
554 }
555 put_page(orig_page);
556 }
557 }
558
559 ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page);
560 if (new_page)
561 put_page(new_page);
562 put_old:
563 put_page(old_page);
564
565 if (unlikely(ret == -EAGAIN))
566 goto retry;
567
568 /* Revert back reference counter if instruction update failed. */
569 if (ret && is_register && ref_ctr_updated)
570 update_ref_ctr(uprobe, mm, -1);
571
572 /* try collapse pmd for compound page */
573 if (!ret && orig_page_huge)
574 collapse_pte_mapped_thp(mm, vaddr, false);
575
576 return ret;
577 }
578
579 /**
580 * set_swbp - store breakpoint at a given address.
581 * @auprobe: arch specific probepoint information.
582 * @mm: the probed process address space.
583 * @vaddr: the virtual address to insert the opcode.
584 *
585 * For mm @mm, store the breakpoint instruction at @vaddr.
586 * Return 0 (success) or a negative errno.
587 */
588 int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
589 {
590 return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN);
591 }
592
593 /**
594 * set_orig_insn - Restore the original instruction.
595 * @mm: the probed process address space.
596 * @auprobe: arch specific probepoint information.
597 * @vaddr: the virtual address to insert the opcode.
598 *
599 * For mm @mm, restore the original opcode at @vaddr.
600 * Return 0 (success) or a negative errno.
601 */
602 int __weak
603 set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
604 {
605 return uprobe_write_opcode(auprobe, mm, vaddr,
606 *(uprobe_opcode_t *)&auprobe->insn);
607 }
608
609 /* uprobe should have guaranteed positive refcount */
610 static struct uprobe *get_uprobe(struct uprobe *uprobe)
611 {
612 refcount_inc(&uprobe->ref);
613 return uprobe;
614 }
615
616 /*
617 * uprobe should have guaranteed lifetime, which can be either of:
618 * - caller already has refcount taken (and wants an extra one);
619 * - uprobe is RCU protected and won't be freed until after grace period;
620 * - we are holding uprobes_treelock (for read or write, doesn't matter).
621 */
622 static struct uprobe *try_get_uprobe(struct uprobe *uprobe)
623 {
624 if (refcount_inc_not_zero(&uprobe->ref))
625 return uprobe;
626 return NULL;
627 }
628
629 static inline bool uprobe_is_active(struct uprobe *uprobe)
630 {
631 return !RB_EMPTY_NODE(&uprobe->rb_node);
632 }
633
634 static void uprobe_free_rcu_tasks_trace(struct rcu_head *rcu)
635 {
636 struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
637
638 kfree(uprobe);
639 }
640
641 static void uprobe_free_srcu(struct rcu_head *rcu)
642 {
643 struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
644
645 call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu_tasks_trace);
646 }
647
648 static void uprobe_free_deferred(struct work_struct *work)
649 {
650 struct uprobe *uprobe = container_of(work, struct uprobe, work);
651
652 write_lock(&uprobes_treelock);
653
654 if (uprobe_is_active(uprobe)) {
655 write_seqcount_begin(&uprobes_seqcount);
656 rb_erase(&uprobe->rb_node, &uprobes_tree);
657 write_seqcount_end(&uprobes_seqcount);
658 }
659
660 write_unlock(&uprobes_treelock);
661
662 /*
663 * If the application munmap()s the exec vma before uprobe_unregister()
664 * gets called, we don't get a chance to remove the uprobe from
665 * delayed_uprobe_list in remove_breakpoint(). Do it here.
666 */
667 mutex_lock(&delayed_uprobe_lock);
668 delayed_uprobe_remove(uprobe, NULL);
669 mutex_unlock(&delayed_uprobe_lock);
670
671 /* start srcu -> rcu_tasks_trace -> kfree chain */
672 call_srcu(&uretprobes_srcu, &uprobe->rcu, uprobe_free_srcu);
673 }
674
675 static void put_uprobe(struct uprobe *uprobe)
676 {
677 if (!refcount_dec_and_test(&uprobe->ref))
678 return;
679
680 INIT_WORK(&uprobe->work, uprobe_free_deferred);
681 schedule_work(&uprobe->work);
682 }
683
684 /* Initialize hprobe as SRCU-protected "leased" uprobe */
685 static void hprobe_init_leased(struct hprobe *hprobe, struct uprobe *uprobe, int srcu_idx)
686 {
687 WARN_ON(!uprobe);
688 hprobe->state = HPROBE_LEASED;
689 hprobe->uprobe = uprobe;
690 hprobe->srcu_idx = srcu_idx;
691 }
692
693 /* Initialize hprobe as refcounted ("stable") uprobe (uprobe can be NULL). */
694 static void hprobe_init_stable(struct hprobe *hprobe, struct uprobe *uprobe)
695 {
696 hprobe->state = uprobe ? HPROBE_STABLE : HPROBE_GONE;
697 hprobe->uprobe = uprobe;
698 hprobe->srcu_idx = -1;
699 }
700
701 /*
702 * hprobe_consume() fetches hprobe's underlying uprobe and detects whether
703 * uprobe is SRCU protected or is refcounted. hprobe_consume() can be
704 * used only once for a given hprobe.
705 *
706 * Caller has to call hprobe_finalize() and pass previous hprobe_state, so
707 * that hprobe_finalize() can perform SRCU unlock or put uprobe, whichever
708 * is appropriate.
709 */
710 static inline struct uprobe *hprobe_consume(struct hprobe *hprobe, enum hprobe_state *hstate)
711 {
712 *hstate = xchg(&hprobe->state, HPROBE_CONSUMED);
713 switch (*hstate) {
714 case HPROBE_LEASED:
715 case HPROBE_STABLE:
716 return hprobe->uprobe;
717 case HPROBE_GONE: /* uprobe is NULL, no SRCU */
718 case HPROBE_CONSUMED: /* uprobe was finalized already, do nothing */
719 return NULL;
720 default:
721 WARN(1, "hprobe invalid state %d", *hstate);
722 return NULL;
723 }
724 }
725
726 /*
727 * Reset hprobe state and, if hprobe was LEASED, release SRCU lock.
728 * hprobe_finalize() can only be used from current context after
729 * hprobe_consume() call (which determines uprobe and hstate value).
730 */
731 static void hprobe_finalize(struct hprobe *hprobe, enum hprobe_state hstate)
732 {
733 switch (hstate) {
734 case HPROBE_LEASED:
735 __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx);
736 break;
737 case HPROBE_STABLE:
738 put_uprobe(hprobe->uprobe);
739 break;
740 case HPROBE_GONE:
741 case HPROBE_CONSUMED:
742 break;
743 default:
744 WARN(1, "hprobe invalid state %d", hstate);
745 break;
746 }
747 }
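/*
 * Typical pairing, sketched (assuming a struct return_instance *ri that
 * embeds a struct hprobe, as the uretprobe handling code later in this
 * file does):
 *
 *	enum hprobe_state hstate;
 *	struct uprobe *uprobe;
 *
 *	uprobe = hprobe_consume(&ri->hprobe, &hstate);
 *	if (uprobe)
 *		... handle the uretprobe using uprobe ...
 *	hprobe_finalize(&ri->hprobe, hstate);
 */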
748
749 /*
750 * Attempt to switch (atomically) uprobe from being SRCU protected (LEASED)
751 * to refcounted (STABLE) state. Competes with hprobe_consume(); only one of
752 * them can win the race to perform SRCU unlocking. Whoever wins must perform
753 * SRCU unlock.
754 *
755 * Returns underlying valid uprobe or NULL, if there was no underlying uprobe
756 * to begin with or we failed to bump its refcount and it's going away.
757 *
758 * A returned non-NULL uprobe can still be safely used within an ongoing
759 * SRCU-locked region. If `get` is true, it's guaranteed that a non-NULL
760 * uprobe has an extra refcount for the caller to assume and use. Otherwise,
761 * it's not guaranteed that the returned uprobe has a positive refcount, so
762 * the caller has to attempt try_get_uprobe(), if it needs to preserve the
763 * uprobe beyond the current SRCU lock region. See dup_utask().
764 */
765 static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get)
766 {
767 enum hprobe_state hstate;
768
769 /*
770 * Caller should guarantee that return_instance is not going to be
771 * freed from under us. This can be achieved either through holding
772 * rcu_read_lock() or by owning return_instance in the first place.
773 *
774 * Underlying uprobe is itself protected from reuse by SRCU, so ensure
775 * SRCU lock is held properly.
776 */
777 lockdep_assert(srcu_read_lock_held(&uretprobes_srcu));
778
779 hstate = READ_ONCE(hprobe->state);
780 switch (hstate) {
781 case HPROBE_STABLE:
782 /* uprobe has positive refcount, bump refcount, if necessary */
783 return get ? get_uprobe(hprobe->uprobe) : hprobe->uprobe;
784 case HPROBE_GONE:
785 /*
786 * SRCU was unlocked earlier and we didn't manage to take
787 * uprobe refcnt, so it's effectively NULL
788 */
789 return NULL;
790 case HPROBE_CONSUMED:
791 /*
792 * uprobe was consumed, so it's effectively NULL as far as
793 * uretprobe processing logic is concerned
794 */
795 return NULL;
796 case HPROBE_LEASED: {
797 struct uprobe *uprobe = try_get_uprobe(hprobe->uprobe);
798 /*
799 * Try to switch hprobe state, guarding against
800 * hprobe_consume() or another hprobe_expire() racing with us.
801 * Note, if we failed to get uprobe refcount, we use special
802 * HPROBE_GONE state to signal that hprobe->uprobe shouldn't
803 * be used as it will be freed after SRCU is unlocked.
804 */
805 if (try_cmpxchg(&hprobe->state, &hstate, uprobe ? HPROBE_STABLE : HPROBE_GONE)) {
806 /* We won the race, we are the ones to unlock SRCU */
807 __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx);
808 return get ? get_uprobe(uprobe) : uprobe;
809 }
810
811 /*
812 * We lost the race, undo refcount bump (if it ever happened),
813 * unless caller would like an extra refcount anyways.
814 */
815 if (uprobe && !get)
816 put_uprobe(uprobe);
817 /*
818 * Even if hprobe_consume() or another hprobe_expire() wins
819 * the state update race and unlocks SRCU from under us, we
820 * still have a guarantee that the underlying uprobe won't be
821 * freed due to ongoing caller's SRCU lock region, so we can
822 * return it regardless. Also, if `get` was true, we also have
823 * an extra ref for the caller to own. This is used in dup_utask().
824 */
825 return uprobe;
826 }
827 default:
828 WARN(1, "unknown hprobe state %d", hstate);
829 return NULL;
830 }
831 }
832
833 static __always_inline
834 int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset,
835 const struct uprobe *r)
836 {
837 if (l_inode < r->inode)
838 return -1;
839
840 if (l_inode > r->inode)
841 return 1;
842
843 if (l_offset < r->offset)
844 return -1;
845
846 if (l_offset > r->offset)
847 return 1;
848
849 return 0;
850 }
851
852 #define __node_2_uprobe(node) \
853 rb_entry((node), struct uprobe, rb_node)
854
855 struct __uprobe_key {
856 struct inode *inode;
857 loff_t offset;
858 };
859
860 static inline int __uprobe_cmp_key(const void *key, const struct rb_node *b)
861 {
862 const struct __uprobe_key *a = key;
863 return uprobe_cmp(a->inode, a->offset, __node_2_uprobe(b));
864 }
865
866 static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b)
867 {
868 struct uprobe *u = __node_2_uprobe(a);
869 return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b));
870 }
871
872 /*
873 * Assumes being inside RCU protected region.
874 * No refcount is taken on returned uprobe.
875 */
876 static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset)
877 {
878 struct __uprobe_key key = {
879 .inode = inode,
880 .offset = offset,
881 };
882 struct rb_node *node;
883 unsigned int seq;
884
885 lockdep_assert(rcu_read_lock_trace_held());
886
887 do {
888 seq = read_seqcount_begin(&uprobes_seqcount);
889 node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key);
890 /*
891 * Lockless RB-tree lookups can result only in false negatives.
892 * If the element is found, it is correct and can be returned
893 * under RCU protection. If we find nothing, we need to
894 * validate that seqcount didn't change. If it did, we have to
895 * try again as we might have missed the element (false
896 * negative). If seqcount is unchanged, search truly failed.
897 */
898 if (node)
899 return __node_2_uprobe(node);
900 } while (read_seqcount_retry(&uprobes_seqcount, seq));
901
902 return NULL;
903 }
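/*
 * Lookup sketch under the protection asserted above (no refcount is
 * taken, so the result is only valid inside the RCU section unless
 * try_get_uprobe() succeeds):
 *
 *	rcu_read_lock_trace();
 *	uprobe = find_uprobe_rcu(inode, offset);
 *	if (uprobe && try_get_uprobe(uprobe))
 *		... uprobe usable past the RCU section, put_uprobe() later ...
 *	rcu_read_unlock_trace();
 */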
904
905 /*
906 * Attempt to insert a new uprobe into uprobes_tree.
907 *
908 * If uprobe already exists (for given inode+offset), we just increment
909 * refcount of previously existing uprobe.
910 *
911 * If not, a provided new instance of uprobe is inserted into the tree (with
912 * assumed initial refcount == 1).
913 *
914 * In any case, we return a uprobe instance that ends up being in uprobes_tree.
915 * Caller has to clean up new uprobe instance, if it ended up not being
916 * inserted into the tree.
917 *
918 * We assume that uprobes_treelock is held for writing.
919 */
920 static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
921 {
922 struct rb_node *node;
923 again:
924 node = rb_find_add_rcu(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp);
925 if (node) {
926 struct uprobe *u = __node_2_uprobe(node);
927
928 if (!try_get_uprobe(u)) {
929 rb_erase(node, &uprobes_tree);
930 RB_CLEAR_NODE(&u->rb_node);
931 goto again;
932 }
933
934 return u;
935 }
936
937 return uprobe;
938 }
939
940 /*
941 * Acquire uprobes_treelock and insert uprobe into uprobes_tree
942 * (or reuse existing one, see __insert_uprobe() comments above).
943 */
944 static struct uprobe *insert_uprobe(struct uprobe *uprobe)
945 {
946 struct uprobe *u;
947
948 write_lock(&uprobes_treelock);
949 write_seqcount_begin(&uprobes_seqcount);
950 u = __insert_uprobe(uprobe);
951 write_seqcount_end(&uprobes_seqcount);
952 write_unlock(&uprobes_treelock);
953
954 return u;
955 }
956
957 static void
958 ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe)
959 {
960 pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx "
961 "ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n",
962 uprobe->inode->i_ino, (unsigned long long) uprobe->offset,
963 (unsigned long long) cur_uprobe->ref_ctr_offset,
964 (unsigned long long) uprobe->ref_ctr_offset);
965 }
966
967 static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
968 loff_t ref_ctr_offset)
969 {
970 struct uprobe *uprobe, *cur_uprobe;
971
972 uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
973 if (!uprobe)
974 return ERR_PTR(-ENOMEM);
975
976 uprobe->inode = inode;
977 uprobe->offset = offset;
978 uprobe->ref_ctr_offset = ref_ctr_offset;
979 INIT_LIST_HEAD(&uprobe->consumers);
980 init_rwsem(&uprobe->register_rwsem);
981 init_rwsem(&uprobe->consumer_rwsem);
982 RB_CLEAR_NODE(&uprobe->rb_node);
983 refcount_set(&uprobe->ref, 1);
984
985 /* add to uprobes_tree, sorted on inode:offset */
986 cur_uprobe = insert_uprobe(uprobe);
987 /* a uprobe exists for this inode:offset combination */
988 if (cur_uprobe != uprobe) {
989 if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) {
990 ref_ctr_mismatch_warn(cur_uprobe, uprobe);
991 put_uprobe(cur_uprobe);
992 kfree(uprobe);
993 return ERR_PTR(-EINVAL);
994 }
995 kfree(uprobe);
996 uprobe = cur_uprobe;
997 }
998
999 return uprobe;
1000 }
1001
1002 static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
1003 {
1004 static atomic64_t id;
1005
1006 down_write(&uprobe->consumer_rwsem);
1007 list_add_rcu(&uc->cons_node, &uprobe->consumers);
1008 uc->id = (__u64) atomic64_inc_return(&id);
1009 up_write(&uprobe->consumer_rwsem);
1010 }
1011
1012 /*
1013 * For uprobe @uprobe, delete the consumer @uc.
1014 * Should never be called with consumer that's not part of @uprobe->consumers.
1015 */
1016 static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
1017 {
1018 down_write(&uprobe->consumer_rwsem);
1019 list_del_rcu(&uc->cons_node);
1020 up_write(&uprobe->consumer_rwsem);
1021 }
1022
1023 static int __copy_insn(struct address_space *mapping, struct file *filp,
1024 void *insn, int nbytes, loff_t offset)
1025 {
1026 struct page *page;
1027 /*
1028 * Ensure that the page that has the original instruction is populated
1029 * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(),
1030 * see uprobe_register().
1031 */
1032 if (mapping->a_ops->read_folio)
1033 page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
1034 else
1035 page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
1036 if (IS_ERR(page))
1037 return PTR_ERR(page);
1038
1039 copy_from_page(page, offset, insn, nbytes);
1040 put_page(page);
1041
1042 return 0;
1043 }
1044
1045 static int copy_insn(struct uprobe *uprobe, struct file *filp)
1046 {
1047 struct address_space *mapping = uprobe->inode->i_mapping;
1048 loff_t offs = uprobe->offset;
1049 void *insn = &uprobe->arch.insn;
1050 int size = sizeof(uprobe->arch.insn);
1051 int len, err = -EIO;
1052
1053 /* Copy only available bytes, -EIO if nothing was read */
1054 do {
1055 if (offs >= i_size_read(uprobe->inode))
1056 break;
1057
1058 len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
1059 err = __copy_insn(mapping, filp, insn, len, offs);
1060 if (err)
1061 break;
1062
1063 insn += len;
1064 offs += len;
1065 size -= len;
1066 } while (size);
1067
1068 return err;
1069 }
1070
1071 static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
1072 struct mm_struct *mm, unsigned long vaddr)
1073 {
1074 int ret = 0;
1075
1076 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
1077 return ret;
1078
1079 /* TODO: move this into _register, until then we abuse this sem. */
1080 down_write(&uprobe->consumer_rwsem);
1081 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
1082 goto out;
1083
1084 ret = copy_insn(uprobe, file);
1085 if (ret)
1086 goto out;
1087
1088 ret = -ENOTSUPP;
1089 if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn))
1090 goto out;
1091
1092 ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
1093 if (ret)
1094 goto out;
1095
1096 smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */
1097 set_bit(UPROBE_COPY_INSN, &uprobe->flags);
1098
1099 out:
1100 up_write(&uprobe->consumer_rwsem);
1101
1102 return ret;
1103 }
1104
1105 static inline bool consumer_filter(struct uprobe_consumer *uc, struct mm_struct *mm)
1106 {
1107 return !uc->filter || uc->filter(uc, mm);
1108 }
1109
1110 static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
1111 {
1112 struct uprobe_consumer *uc;
1113 bool ret = false;
1114
1115 down_read(&uprobe->consumer_rwsem);
1116 list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
1117 ret = consumer_filter(uc, mm);
1118 if (ret)
1119 break;
1120 }
1121 up_read(&uprobe->consumer_rwsem);
1122
1123 return ret;
1124 }
1125
1126 static int
1127 install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
1128 struct vm_area_struct *vma, unsigned long vaddr)
1129 {
1130 bool first_uprobe;
1131 int ret;
1132
1133 ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
1134 if (ret)
1135 return ret;
1136
1137 /*
1138 * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier():
1139 * the task can hit this breakpoint right after __replace_page().
1140 */
1141 first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
1142 if (first_uprobe)
1143 set_bit(MMF_HAS_UPROBES, &mm->flags);
1144
1145 ret = set_swbp(&uprobe->arch, mm, vaddr);
1146 if (!ret)
1147 clear_bit(MMF_RECALC_UPROBES, &mm->flags);
1148 else if (first_uprobe)
1149 clear_bit(MMF_HAS_UPROBES, &mm->flags);
1150
1151 return ret;
1152 }
1153
1154 static int
1155 remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
1156 {
1157 set_bit(MMF_RECALC_UPROBES, &mm->flags);
1158 return set_orig_insn(&uprobe->arch, mm, vaddr);
1159 }
1160
1161 struct map_info {
1162 struct map_info *next;
1163 struct mm_struct *mm;
1164 unsigned long vaddr;
1165 };
1166
1167 static inline struct map_info *free_map_info(struct map_info *info)
1168 {
1169 struct map_info *next = info->next;
1170 kfree(info);
1171 return next;
1172 }
1173
1174 static struct map_info *
1175 build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
1176 {
1177 unsigned long pgoff = offset >> PAGE_SHIFT;
1178 struct vm_area_struct *vma;
1179 struct map_info *curr = NULL;
1180 struct map_info *prev = NULL;
1181 struct map_info *info;
1182 int more = 0;
1183
1184 again:
1185 i_mmap_lock_read(mapping);
1186 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1187 if (!valid_vma(vma, is_register))
1188 continue;
1189
1190 if (!prev && !more) {
1191 /*
1192 * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
1193 * reclaim. This is optimistic, no harm done if it fails.
1194 */
1195 prev = kmalloc(sizeof(struct map_info),
1196 GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
1197 if (prev)
1198 prev->next = NULL;
1199 }
1200 if (!prev) {
1201 more++;
1202 continue;
1203 }
1204
1205 if (!mmget_not_zero(vma->vm_mm))
1206 continue;
1207
1208 info = prev;
1209 prev = prev->next;
1210 info->next = curr;
1211 curr = info;
1212
1213 info->mm = vma->vm_mm;
1214 info->vaddr = offset_to_vaddr(vma, offset);
1215 }
1216 i_mmap_unlock_read(mapping);
1217
1218 if (!more)
1219 goto out;
1220
1221 prev = curr;
1222 while (curr) {
1223 mmput(curr->mm);
1224 curr = curr->next;
1225 }
1226
1227 do {
1228 info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
1229 if (!info) {
1230 curr = ERR_PTR(-ENOMEM);
1231 goto out;
1232 }
1233 info->next = prev;
1234 prev = info;
1235 } while (--more);
1236
1237 goto again;
1238 out:
1239 while (prev)
1240 prev = free_map_info(prev);
1241 return curr;
1242 }
1243
1244 static int
1245 register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
1246 {
1247 bool is_register = !!new;
1248 struct map_info *info;
1249 int err = 0;
1250
1251 percpu_down_write(&dup_mmap_sem);
1252 info = build_map_info(uprobe->inode->i_mapping,
1253 uprobe->offset, is_register);
1254 if (IS_ERR(info)) {
1255 err = PTR_ERR(info);
1256 goto out;
1257 }
1258
1259 while (info) {
1260 struct mm_struct *mm = info->mm;
1261 struct vm_area_struct *vma;
1262
1263 if (err && is_register)
1264 goto free;
1265 /*
1266 * We take mmap_lock for writing to avoid the race with
1267 * find_active_uprobe_rcu() which takes mmap_lock for reading.
1268 * Thus this install_breakpoint() can not make
1269 * is_trap_at_addr() true right after find_uprobe_rcu()
1270 * returns NULL in find_active_uprobe_rcu().
1271 */
1272 mmap_write_lock(mm);
1273 if (check_stable_address_space(mm))
1274 goto unlock;
1275
1276 vma = find_vma(mm, info->vaddr);
1277 if (!vma || !valid_vma(vma, is_register) ||
1278 file_inode(vma->vm_file) != uprobe->inode)
1279 goto unlock;
1280
1281 if (vma->vm_start > info->vaddr ||
1282 vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
1283 goto unlock;
1284
1285 if (is_register) {
1286 /* consult only the "caller", new consumer. */
1287 if (consumer_filter(new, mm))
1288 err = install_breakpoint(uprobe, mm, vma, info->vaddr);
1289 } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
1290 if (!filter_chain(uprobe, mm))
1291 err |= remove_breakpoint(uprobe, mm, info->vaddr);
1292 }
1293
1294 unlock:
1295 mmap_write_unlock(mm);
1296 free:
1297 mmput(mm);
1298 info = free_map_info(info);
1299 }
1300 out:
1301 percpu_up_write(&dup_mmap_sem);
1302 return err;
1303 }
1304
1305 /**
1306 * uprobe_unregister_nosync - unregister an already registered probe.
1307 * @uprobe: uprobe to remove
1308 * @uc: identify which probe if multiple probes are colocated.
1309 */
1310 void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc)
1311 {
1312 int err;
1313
1314 down_write(&uprobe->register_rwsem);
1315 consumer_del(uprobe, uc);
1316 err = register_for_each_vma(uprobe, NULL);
1317 up_write(&uprobe->register_rwsem);
1318
1319 /* TODO: can't unregister? schedule a worker thread */
1320 if (unlikely(err)) {
1321 uprobe_warn(current, "unregister, leaking uprobe");
1322 return;
1323 }
1324
1325 put_uprobe(uprobe);
1326 }
1327 EXPORT_SYMBOL_GPL(uprobe_unregister_nosync);
1328
1329 void uprobe_unregister_sync(void)
1330 {
1331 /*
1332 * Now that handler_chain() and handle_uretprobe_chain() iterate over
1333 * uprobe->consumers list under RCU protection without holding
1334 * uprobe->register_rwsem, we need to wait for RCU grace period to
1335 * make sure that we can't call into just unregistered
1336 * uprobe_consumer's callbacks anymore. If we don't do that, a fast and
1337 * unlucky enough caller can free the consumer's memory and cause
1338 * handler_chain() or handle_uretprobe_chain() to do a use-after-free.
1339 */
1340 synchronize_rcu_tasks_trace();
1341 synchronize_srcu(&uretprobes_srcu);
1342 }
1343 EXPORT_SYMBOL_GPL(uprobe_unregister_sync);
1344
1345 /**
1346 * uprobe_register - register a probe
1347 * @inode: the file in which the probe has to be placed.
1348 * @offset: offset from the start of the file.
1349 * @ref_ctr_offset: offset of SDT marker / reference counter
1350 * @uc: information on how to handle the probe.
1351 *
1352 * Apart from the access refcount, uprobe_register() takes a creation
1353 * refcount (through alloc_uprobe) if and only if this @uprobe is getting
1354 * inserted into the rbtree (i.e. first consumer for an @inode:@offset
1355 * tuple). The creation refcount stops uprobe_unregister from freeing the
1356 * @uprobe even before the register operation is complete. Creation
1357 * refcount is released when the last @uc for the @uprobe
1358 * unregisters. Caller of uprobe_register() is required to keep @inode
1359 * (and the containing mount) referenced.
1360 *
1361 * Return: pointer to the new uprobe on success or an ERR_PTR on failure.
1362 */
1363 struct uprobe *uprobe_register(struct inode *inode,
1364 loff_t offset, loff_t ref_ctr_offset,
1365 struct uprobe_consumer *uc)
1366 {
1367 struct uprobe *uprobe;
1368 int ret;
1369
1370 /* Uprobe must have at least one set consumer */
1371 if (!uc->handler && !uc->ret_handler)
1372 return ERR_PTR(-EINVAL);
1373
1374 /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
1375 if (!inode->i_mapping->a_ops->read_folio &&
1376 !shmem_mapping(inode->i_mapping))
1377 return ERR_PTR(-EIO);
1378 /* Racy, just to catch the obvious mistakes */
1379 if (offset > i_size_read(inode))
1380 return ERR_PTR(-EINVAL);
1381
1382 /*
1383 * This ensures that copy_from_page(), copy_to_page() and
1384 * __update_ref_ctr() can't cross page boundary.
1385 */
1386 if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
1387 return ERR_PTR(-EINVAL);
1388 if (!IS_ALIGNED(ref_ctr_offset, sizeof(short)))
1389 return ERR_PTR(-EINVAL);
1390
1391 uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
1392 if (IS_ERR(uprobe))
1393 return uprobe;
1394
1395 down_write(&uprobe->register_rwsem);
1396 consumer_add(uprobe, uc);
1397 ret = register_for_each_vma(uprobe, uc);
1398 up_write(&uprobe->register_rwsem);
1399
1400 if (ret) {
1401 uprobe_unregister_nosync(uprobe, uc);
1402 /*
1403 * Registration might have partially succeeded, so this
1404 * consumer could be getting called right at this time. We need to
1405 * sync here. That's OK; this is an unlikely, slow path.
1406 */
1407 uprobe_unregister_sync();
1408 return ERR_PTR(ret);
1409 }
1410
1411 return uprobe;
1412 }
1413 EXPORT_SYMBOL_GPL(uprobe_register);
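/*
 * Registration sketch for an in-kernel consumer (a minimal outline; the
 * consumer's handler/ret_handler fields follow struct uprobe_consumer in
 * include/linux/uprobes.h). uprobe_unregister_sync() waits out in-flight
 * handlers before the consumer's memory may be freed:
 *
 *	struct uprobe *u = uprobe_register(inode, offset, 0, &uc);
 *
 *	if (IS_ERR(u))
 *		return PTR_ERR(u);
 *	...
 *	uprobe_unregister_nosync(u, &uc);
 *	uprobe_unregister_sync();
 */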
1414
1415 /**
1416 * uprobe_apply - add or remove the breakpoints according to @uc->filter
1417 * @uprobe: uprobe which "owns" the breakpoint
1418 * @uc: consumer which wants to add more or remove some breakpoints
1419 * @add: add or remove the breakpoints
1420 * Return: 0 on success or negative error code.
1421 */
1422 int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
1423 {
1424 struct uprobe_consumer *con;
1425 int ret = -ENOENT;
1426
1427 down_write(&uprobe->register_rwsem);
1428
1429 rcu_read_lock_trace();
1430 list_for_each_entry_rcu(con, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
1431 if (con == uc) {
1432 ret = register_for_each_vma(uprobe, add ? uc : NULL);
1433 break;
1434 }
1435 }
1436 rcu_read_unlock_trace();
1437
1438 up_write(&uprobe->register_rwsem);
1439
1440 return ret;
1441 }
1442
1443 static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
1444 {
1445 VMA_ITERATOR(vmi, mm, 0);
1446 struct vm_area_struct *vma;
1447 int err = 0;
1448
1449 mmap_read_lock(mm);
1450 for_each_vma(vmi, vma) {
1451 unsigned long vaddr;
1452 loff_t offset;
1453
1454 if (!valid_vma(vma, false) ||
1455 file_inode(vma->vm_file) != uprobe->inode)
1456 continue;
1457
1458 offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
1459 if (uprobe->offset < offset ||
1460 uprobe->offset >= offset + vma->vm_end - vma->vm_start)
1461 continue;
1462
1463 vaddr = offset_to_vaddr(vma, uprobe->offset);
1464 err |= remove_breakpoint(uprobe, mm, vaddr);
1465 }
1466 mmap_read_unlock(mm);
1467
1468 return err;
1469 }
1470
1471 static struct rb_node *
1472 find_node_in_range(struct inode *inode, loff_t min, loff_t max)
1473 {
1474 struct rb_node *n = uprobes_tree.rb_node;
1475
1476 while (n) {
1477 struct uprobe *u = rb_entry(n, struct uprobe, rb_node);
1478
1479 if (inode < u->inode) {
1480 n = n->rb_left;
1481 } else if (inode > u->inode) {
1482 n = n->rb_right;
1483 } else {
1484 if (max < u->offset)
1485 n = n->rb_left;
1486 else if (min > u->offset)
1487 n = n->rb_right;
1488 else
1489 break;
1490 }
1491 }
1492
1493 return n;
1494 }
1495
1496 /*
1497 * For a given range in vma, build a list of probes that need to be inserted.
1498 */
1499 static void build_probe_list(struct inode *inode,
1500 struct vm_area_struct *vma,
1501 unsigned long start, unsigned long end,
1502 struct list_head *head)
1503 {
1504 loff_t min, max;
1505 struct rb_node *n, *t;
1506 struct uprobe *u;
1507
1508 INIT_LIST_HEAD(head);
1509 min = vaddr_to_offset(vma, start);
1510 max = min + (end - start) - 1;
1511
1512 read_lock(&uprobes_treelock);
1513 n = find_node_in_range(inode, min, max);
1514 if (n) {
1515 for (t = n; t; t = rb_prev(t)) {
1516 u = rb_entry(t, struct uprobe, rb_node);
1517 if (u->inode != inode || u->offset < min)
1518 break;
1519 /* if uprobe went away, it's safe to ignore it */
1520 if (try_get_uprobe(u))
1521 list_add(&u->pending_list, head);
1522 }
1523 for (t = n; (t = rb_next(t)); ) {
1524 u = rb_entry(t, struct uprobe, rb_node);
1525 if (u->inode != inode || u->offset > max)
1526 break;
1527 /* if uprobe went away, it's safe to ignore it */
1528 if (try_get_uprobe(u))
1529 list_add(&u->pending_list, head);
1530 }
1531 }
1532 read_unlock(&uprobes_treelock);
1533 }
1534
1535 /* @vma contains reference counter, not the probed instruction. */
1536 static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
1537 {
1538 struct list_head *pos, *q;
1539 struct delayed_uprobe *du;
1540 unsigned long vaddr;
1541 int ret = 0, err = 0;
1542
1543 mutex_lock(&delayed_uprobe_lock);
1544 list_for_each_safe(pos, q, &delayed_uprobe_list) {
1545 du = list_entry(pos, struct delayed_uprobe, list);
1546
1547 if (du->mm != vma->vm_mm ||
1548 !valid_ref_ctr_vma(du->uprobe, vma))
1549 continue;
1550
1551 vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset);
1552 ret = __update_ref_ctr(vma->vm_mm, vaddr, 1);
1553 if (ret) {
1554 update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1);
1555 if (!err)
1556 err = ret;
1557 }
1558 delayed_uprobe_delete(du);
1559 }
1560 mutex_unlock(&delayed_uprobe_lock);
1561 return err;
1562 }
1563
1564 /*
1565 * Called from mmap_region/vma_merge with mm->mmap_lock acquired.
1566 *
1567 * Currently we ignore all errors and always return 0; the callers
1568 * can't handle the failure anyway.
1569 */
1570 int uprobe_mmap(struct vm_area_struct *vma)
1571 {
1572 struct list_head tmp_list;
1573 struct uprobe *uprobe, *u;
1574 struct inode *inode;
1575
1576 if (no_uprobe_events())
1577 return 0;
1578
1579 if (vma->vm_file &&
1580 (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
1581 test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags))
1582 delayed_ref_ctr_inc(vma);
1583
1584 if (!valid_vma(vma, true))
1585 return 0;
1586
1587 inode = file_inode(vma->vm_file);
1588 if (!inode)
1589 return 0;
1590
1591 mutex_lock(uprobes_mmap_hash(inode));
1592 build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
1593 /*
1594 * We can race with uprobe_unregister(); this uprobe can already have
1595 * been removed. But in this case filter_chain() must return false, as all
1596 * consumers have gone away.
1597 */
1598 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
1599 if (!fatal_signal_pending(current) &&
1600 filter_chain(uprobe, vma->vm_mm)) {
1601 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
1602 install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
1603 }
1604 put_uprobe(uprobe);
1605 }
1606 mutex_unlock(uprobes_mmap_hash(inode));
1607
1608 return 0;
1609 }
1610
1611 static bool
1612 vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1613 {
1614 loff_t min, max;
1615 struct inode *inode;
1616 struct rb_node *n;
1617
1618 inode = file_inode(vma->vm_file);
1619
1620 min = vaddr_to_offset(vma, start);
1621 max = min + (end - start) - 1;
1622
1623 read_lock(&uprobes_treelock);
1624 n = find_node_in_range(inode, min, max);
1625 read_unlock(&uprobes_treelock);
1626
1627 return !!n;
1628 }
1629
1630 /*
1631 * Called in context of a munmap of a vma.
1632 */
1633 void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1634 {
1635 if (no_uprobe_events() || !valid_vma(vma, false))
1636 return;
1637
1638 if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
1639 return;
1640
1641 if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) ||
1642 test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags))
1643 return;
1644
1645 if (vma_has_uprobes(vma, start, end))
1646 set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
1647 }
1648
1649 static vm_fault_t xol_fault(const struct vm_special_mapping *sm,
1650 struct vm_area_struct *vma, struct vm_fault *vmf)
1651 {
1652 struct xol_area *area = vma->vm_mm->uprobes_state.xol_area;
1653
1654 vmf->page = area->page;
1655 get_page(vmf->page);
1656 return 0;
1657 }
1658
1659 static int xol_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
1660 {
1661 return -EPERM;
1662 }
1663
1664 static const struct vm_special_mapping xol_mapping = {
1665 .name = "[uprobes]",
1666 .fault = xol_fault,
1667 .mremap = xol_mremap,
1668 };
1669
1670 /* Slot allocation for XOL */
1671 static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
1672 {
1673 struct vm_area_struct *vma;
1674 int ret;
1675
1676 if (mmap_write_lock_killable(mm))
1677 return -EINTR;
1678
1679 if (mm->uprobes_state.xol_area) {
1680 ret = -EALREADY;
1681 goto fail;
1682 }
1683
1684 if (!area->vaddr) {
1685 /* Try to map as high as possible, this is only a hint. */
1686 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
1687 PAGE_SIZE, 0, 0);
1688 if (IS_ERR_VALUE(area->vaddr)) {
1689 ret = area->vaddr;
1690 goto fail;
1691 }
1692 }
1693
1694 vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
1695 VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO,
1696 &xol_mapping);
1697 if (IS_ERR(vma)) {
1698 ret = PTR_ERR(vma);
1699 goto fail;
1700 }
1701
1702 ret = 0;
1703 /* pairs with get_xol_area() */
1704 smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */
1705 fail:
1706 mmap_write_unlock(mm);
1707
1708 return ret;
1709 }
1710
1711 void * __weak arch_uprobe_trampoline(unsigned long *psize)
1712 {
1713 static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
1714
1715 *psize = UPROBE_SWBP_INSN_SIZE;
1716 return &insn;
1717 }
1718
1719 static struct xol_area *__create_xol_area(unsigned long vaddr)
1720 {
1721 struct mm_struct *mm = current->mm;
1722 unsigned long insns_size;
1723 struct xol_area *area;
1724 void *insns;
1725
1726 area = kzalloc(sizeof(*area), GFP_KERNEL);
1727 if (unlikely(!area))
1728 goto out;
1729
1730 area->bitmap = kcalloc(BITS_TO_LONGS(UINSNS_PER_PAGE), sizeof(long),
1731 GFP_KERNEL);
1732 if (!area->bitmap)
1733 goto free_area;
1734
1735 area->page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
1736 if (!area->page)
1737 goto free_bitmap;
1738
1739 area->vaddr = vaddr;
1740 init_waitqueue_head(&area->wq);
1741 /* Reserve the 1st slot for get_trampoline_vaddr() */
1742 set_bit(0, area->bitmap);
1743 insns = arch_uprobe_trampoline(&insns_size);
1744 arch_uprobe_copy_ixol(area->page, 0, insns, insns_size);
1745
1746 if (!xol_add_vma(mm, area))
1747 return area;
1748
1749 __free_page(area->page);
1750 free_bitmap:
1751 kfree(area->bitmap);
1752 free_area:
1753 kfree(area);
1754 out:
1755 return NULL;
1756 }
1757
1758 /*
1759 * get_xol_area - Allocate process's xol_area if necessary.
1760 * This area will be used for storing instructions for execution out of line.
1761 *
1762 * Returns the allocated area or NULL.
1763 */
1764 static struct xol_area *get_xol_area(void)
1765 {
1766 struct mm_struct *mm = current->mm;
1767 struct xol_area *area;
1768
1769 if (!mm->uprobes_state.xol_area)
1770 __create_xol_area(0);
1771
1772 /* Pairs with xol_add_vma() smp_store_release() */
1773 area = READ_ONCE(mm->uprobes_state.xol_area); /* ^^^ */
1774 return area;
1775 }
1776
1777 /*
1778 * uprobe_clear_state - Free the area allocated for slots.
1779 */
uprobe_clear_state(struct mm_struct * mm)1780 void uprobe_clear_state(struct mm_struct *mm)
1781 {
1782 struct xol_area *area = mm->uprobes_state.xol_area;
1783
1784 mutex_lock(&delayed_uprobe_lock);
1785 delayed_uprobe_remove(NULL, mm);
1786 mutex_unlock(&delayed_uprobe_lock);
1787
1788 if (!area)
1789 return;
1790
1791 put_page(area->page);
1792 kfree(area->bitmap);
1793 kfree(area);
1794 }
1795
uprobe_start_dup_mmap(void)1796 void uprobe_start_dup_mmap(void)
1797 {
1798 percpu_down_read(&dup_mmap_sem);
1799 }
1800
uprobe_end_dup_mmap(void)1801 void uprobe_end_dup_mmap(void)
1802 {
1803 percpu_up_read(&dup_mmap_sem);
1804 }
1805
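/*
 * Called from dup_mmap() on fork: propagate MMF_HAS_UPROBES to the child
 * and force a recalculation, since VM_DONTCOPY vmas are skipped and the
 * child's set of uprobed mappings may therefore differ from the parent's.
 */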
void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
{
	if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
		set_bit(MMF_HAS_UPROBES, &newmm->flags);
		/* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
		set_bit(MMF_RECALC_UPROBES, &newmm->flags);
	}
}

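/*
 * Find and claim a free slot in the XOL area. Returns the slot number,
 * or UINSNS_PER_PAGE if no slot is currently free.
 */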
static unsigned long xol_get_slot_nr(struct xol_area *area)
{
	unsigned long slot_nr;

	slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
	if (slot_nr < UINSNS_PER_PAGE) {
		if (!test_and_set_bit(slot_nr, area->bitmap))
			return slot_nr;
	}

	return UINSNS_PER_PAGE;
}

/*
 * xol_get_insn_slot - allocate a slot for xol.
 */
static bool xol_get_insn_slot(struct uprobe *uprobe, struct uprobe_task *utask)
{
	struct xol_area *area = get_xol_area();
	unsigned long slot_nr;

	if (!area)
		return false;

	wait_event(area->wq, (slot_nr = xol_get_slot_nr(area)) < UINSNS_PER_PAGE);

	utask->xol_vaddr = area->vaddr + slot_nr * UPROBE_XOL_SLOT_BYTES;
	arch_uprobe_copy_ixol(area->page, utask->xol_vaddr,
			      &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
	return true;
}

/*
 * xol_free_insn_slot - free the slot allocated by xol_get_insn_slot()
 */
static void xol_free_insn_slot(struct uprobe_task *utask)
{
	struct xol_area *area = current->mm->uprobes_state.xol_area;
	unsigned long offset = utask->xol_vaddr - area->vaddr;
	unsigned int slot_nr;

	utask->xol_vaddr = 0;
	/* xol_vaddr must fit into [area->vaddr, area->vaddr + PAGE_SIZE) */
	if (WARN_ON_ONCE(offset >= PAGE_SIZE))
		return;

	slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
	clear_bit(slot_nr, area->bitmap);
	smp_mb__after_atomic(); /* pairs with prepare_to_wait() */
	if (waitqueue_active(&area->wq))
		wake_up(&area->wq);
}

void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
				  void *src, unsigned long len)
{
	/* Initialize the slot */
	copy_to_page(page, vaddr, src, len);

	/*
	 * We probably need flush_icache_user_page() but it needs vma.
	 * This should work on most architectures by default. If an
	 * architecture needs to do something different it can define
	 * its own version of the function.
	 */
	flush_dcache_page(page);
}

/**
 * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
 * @regs: Reflects the saved state of the task after it has hit a breakpoint
 * instruction.
 * Return the address of the breakpoint instruction.
 */
unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
{
	return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
}

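/*
 * Return the address of the instruction that trapped: the probed vaddr if
 * the task is single-stepping a copied instruction out of line, otherwise
 * the current instruction pointer.
 */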
unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
{
	struct uprobe_task *utask = current->utask;

	if (unlikely(utask && utask->active_uprobe))
		return utask->vaddr;

	return instruction_pointer(regs);
}

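/*
 * Per-task cache of return_instance objects, so that uretprobe hits can
 * reuse instances without a fresh allocation or an RCU grace period; see
 * the comment in free_ret_instance() below.
 */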
static void ri_pool_push(struct uprobe_task *utask, struct return_instance *ri)
{
	ri->cons_cnt = 0;
	ri->next = utask->ri_pool;
	utask->ri_pool = ri;
}

static struct return_instance *ri_pool_pop(struct uprobe_task *utask)
{
	struct return_instance *ri = utask->ri_pool;

	if (likely(ri))
		utask->ri_pool = ri->next;

	return ri;
}

static void ri_free(struct return_instance *ri)
{
	kfree(ri->extra_consumers);
	kfree_rcu(ri, rcu);
}

static void free_ret_instance(struct uprobe_task *utask,
			      struct return_instance *ri, bool cleanup_hprobe)
{
	unsigned seq;

	if (cleanup_hprobe) {
		enum hprobe_state hstate;

		(void)hprobe_consume(&ri->hprobe, &hstate);
		hprobe_finalize(&ri->hprobe, hstate);
	}

	/*
	 * At this point return_instance is unlinked from utask's
	 * return_instances list and this has become visible to ri_timer().
	 * If seqcount now indicates that ri_timer's return instance
	 * processing loop isn't active, we can return ri into the pool of
	 * to-be-reused return instances for future uretprobes. If ri_timer()
	 * happens to be running right now, though, we fall back to safety and
	 * just perform RCU-delayed freeing of ri.
	 */
	if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) {
		/* immediate reuse of ri without RCU GP is OK */
		ri_pool_push(utask, ri);
	} else {
		/* we might be racing with ri_timer(), so play it safe */
		ri_free(ri);
	}
}

/*
 * Called with no locks held.
 * Called in context of an exiting or an exec-ing thread.
 */
void uprobe_free_utask(struct task_struct *t)
{
	struct uprobe_task *utask = t->utask;
	struct return_instance *ri, *ri_next;

	if (!utask)
		return;

	t->utask = NULL;
	WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr);

	timer_delete_sync(&utask->ri_timer);

	ri = utask->return_instances;
	while (ri) {
		ri_next = ri->next;
		free_ret_instance(utask, ri, true /* cleanup_hprobe */);
		ri = ri_next;
	}

	/* free_ret_instance() above might add to ri_pool, so this loop should come last */
	ri = utask->ri_pool;
	while (ri) {
		ri_next = ri->next;
		ri_free(ri);
		ri = ri_next;
	}

	kfree(utask);
}

#define RI_TIMER_PERIOD (HZ / 10) /* 100 ms */

#define for_each_ret_instance_rcu(pos, head) \
	for (pos = rcu_dereference_raw(head); pos; pos = rcu_dereference_raw(pos->next))

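/*
 * Periodic timer armed by prepare_uretprobe(): it walks the task's pending
 * return_instances under SRCU and RCU protection and expires their leased
 * hprobes. The ri_seqcount write section lets free_ret_instance() detect
 * that this loop may be running and fall back to RCU-delayed freeing.
 */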
static void ri_timer(struct timer_list *timer)
{
	struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer);
	struct return_instance *ri;

	/* SRCU protects uprobe from reuse for the cmpxchg() inside hprobe_expire(). */
	guard(srcu)(&uretprobes_srcu);
	/* RCU protects return_instance from freeing. */
	guard(rcu)();

	write_seqcount_begin(&utask->ri_seqcount);

	for_each_ret_instance_rcu(ri, utask->return_instances)
		hprobe_expire(&ri->hprobe, false);

	write_seqcount_end(&utask->ri_seqcount);
}

static struct uprobe_task *alloc_utask(void)
{
	struct uprobe_task *utask;

	utask = kzalloc(sizeof(*utask), GFP_KERNEL);
	if (!utask)
		return NULL;

	timer_setup(&utask->ri_timer, ri_timer, 0);
	seqcount_init(&utask->ri_seqcount);

	return utask;
}

/*
 * Allocate a uprobe_task object for the task if necessary.
 * Called when the thread hits a breakpoint.
 *
 * Returns:
 * - pointer to new uprobe_task on success
 * - NULL otherwise
 */
static struct uprobe_task *get_utask(void)
{
	if (!current->utask)
		current->utask = alloc_utask();
	return current->utask;
}

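/*
 * Get a return_instance from the per-task pool or allocate a fresh one.
 * Returns ZERO_SIZE_PTR on allocation failure, which push_consumer() and
 * handler_chain() treat as "no usable instance".
 */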
static struct return_instance *alloc_return_instance(struct uprobe_task *utask)
{
	struct return_instance *ri;

	ri = ri_pool_pop(utask);
	if (ri)
		return ri;

	ri = kzalloc(sizeof(*ri), GFP_KERNEL);
	if (!ri)
		return ZERO_SIZE_PTR;

	return ri;
}

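/*
 * Duplicate a return_instance, including any extra_consumers array, for
 * the forked task's utask.
 */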
static struct return_instance *dup_return_instance(struct return_instance *old)
{
	struct return_instance *ri;

	ri = kmemdup(old, sizeof(*ri), GFP_KERNEL);
	if (!ri)
		return NULL;

	if (unlikely(old->cons_cnt > 1)) {
		ri->extra_consumers = kmemdup(old->extra_consumers,
					      sizeof(ri->extra_consumers[0]) * (old->cons_cnt - 1),
					      GFP_KERNEL);
		if (!ri->extra_consumers) {
			kfree(ri);
			return NULL;
		}
	}

	return ri;
}

static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
{
	struct uprobe_task *n_utask;
	struct return_instance **p, *o, *n;
	struct uprobe *uprobe;

	n_utask = alloc_utask();
	if (!n_utask)
		return -ENOMEM;
	t->utask = n_utask;

	/* protect uprobes from freeing, we'll need to try_get_uprobe() them */
	guard(srcu)(&uretprobes_srcu);

	p = &n_utask->return_instances;
	for (o = o_utask->return_instances; o; o = o->next) {
		n = dup_return_instance(o);
		if (!n)
			return -ENOMEM;

		/* if uprobe is non-NULL, we'll have an extra refcount for uprobe */
		uprobe = hprobe_expire(&o->hprobe, true);

		/*
		 * New utask will have a stable, properly refcounted uprobe or
		 * NULL. Even if we failed to get a refcounted uprobe, we still
		 * need to preserve the full set of return_instances for proper
		 * uretprobe handling and nesting in the forked task.
		 */
		hprobe_init_stable(&n->hprobe, uprobe);

		n->next = NULL;
		rcu_assign_pointer(*p, n);
		p = &n->next;

		n_utask->depth++;
	}

	return 0;
}

static void dup_xol_work(struct callback_head *work)
{
	if (current->flags & PF_EXITING)
		return;

	if (!__create_xol_area(current->utask->dup_xol_addr) &&
	    !fatal_signal_pending(current))
		uprobe_warn(current, "dup xol area");
}

/*
 * Called in context of a new clone/fork from copy_process.
 */
void uprobe_copy_process(struct task_struct *t, unsigned long flags)
{
	struct uprobe_task *utask = current->utask;
	struct mm_struct *mm = current->mm;
	struct xol_area *area;

	t->utask = NULL;

	if (!utask || !utask->return_instances)
		return;

	if (mm == t->mm && !(flags & CLONE_VFORK))
		return;

	if (dup_utask(t, utask))
		return uprobe_warn(t, "dup ret instances");

	/* The task can fork() after dup_xol_work() fails */
	area = mm->uprobes_state.xol_area;
	if (!area)
		return uprobe_warn(t, "dup xol area");

	if (mm == t->mm)
		return;

	t->utask->dup_xol_addr = area->vaddr;
	init_task_work(&t->utask->dup_xol_work, dup_xol_work);
	task_work_add(t, &t->utask->dup_xol_work, TWA_RESUME);
}

/*
 * The current area->vaddr notion assumes the trampoline address is always
 * equal to area->vaddr.
 *
 * Returns -1 in case the xol_area is not allocated.
 */
unsigned long uprobe_get_trampoline_vaddr(void)
{
	struct xol_area *area;
	unsigned long trampoline_vaddr = -1;

	/* Pairs with xol_add_vma() smp_store_release() */
	area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */
	if (area)
		trampoline_vaddr = area->vaddr;

	return trampoline_vaddr;
}

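/*
 * Drop pending return instances whose stack frames are no longer alive,
 * e.g. because the task longjmp()'ed past them, so that stale entries do
 * not accumulate on the return_instances stack.
 */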
static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
				     struct pt_regs *regs)
{
	struct return_instance *ri = utask->return_instances, *ri_next;
	enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL;

	while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) {
		ri_next = ri->next;
		rcu_assign_pointer(utask->return_instances, ri_next);
		utask->depth--;

		free_ret_instance(utask, ri, true /* cleanup_hprobe */);
		ri = ri_next;
	}
}

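/*
 * Arrange for a uretprobe: hijack the probed function's return address so
 * that it returns through the trampoline, and push a return_instance
 * recording the original return address onto the task's stack of pending
 * instances.
 */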
static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs,
			      struct return_instance *ri)
{
	struct uprobe_task *utask = current->utask;
	unsigned long orig_ret_vaddr, trampoline_vaddr;
	bool chained;
	int srcu_idx;

	if (!get_xol_area())
		goto free;

	if (utask->depth >= MAX_URETPROBE_DEPTH) {
		printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
				" nestedness limit pid/tgid=%d/%d\n",
				current->pid, current->tgid);
		goto free;
	}

	trampoline_vaddr = uprobe_get_trampoline_vaddr();
	orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
	if (orig_ret_vaddr == -1)
		goto free;

	/* drop the entries invalidated by longjmp() */
	chained = (orig_ret_vaddr == trampoline_vaddr);
	cleanup_return_instances(utask, chained, regs);

	/*
	 * We don't want to keep the trampoline address on the stack, rather
	 * keep the original return address of the first caller through all
	 * the subsequent instances. This also makes breakpoint unwrapping
	 * easier.
	 */
	if (chained) {
		if (!utask->return_instances) {
			/*
			 * This situation is not possible. Likely we have an
			 * attack from user-space.
			 */
			uprobe_warn(current, "handle tail call");
			goto free;
		}
		orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
	}

	/* __srcu_read_lock() because SRCU lock survives switch to user space */
	srcu_idx = __srcu_read_lock(&uretprobes_srcu);

	ri->func = instruction_pointer(regs);
	ri->stack = user_stack_pointer(regs);
	ri->orig_ret_vaddr = orig_ret_vaddr;
	ri->chained = chained;

	utask->depth++;

	hprobe_init_leased(&ri->hprobe, uprobe, srcu_idx);
	ri->next = utask->return_instances;
	rcu_assign_pointer(utask->return_instances, ri);

	mod_timer(&utask->ri_timer, jiffies + RI_TIMER_PERIOD);

	return;
free:
	ri_free(ri);
}

/* Prepare to single-step probed instruction out of line. */
static int
pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
{
	struct uprobe_task *utask = current->utask;
	int err;

	if (!try_get_uprobe(uprobe))
		return -EINVAL;

	if (!xol_get_insn_slot(uprobe, utask)) {
		err = -ENOMEM;
		goto err_out;
	}

	utask->vaddr = bp_vaddr;
	err = arch_uprobe_pre_xol(&uprobe->arch, regs);
	if (unlikely(err)) {
		xol_free_insn_slot(utask);
		goto err_out;
	}

	utask->active_uprobe = uprobe;
	utask->state = UTASK_SSTEP;
	return 0;
err_out:
	put_uprobe(uprobe);
	return err;
}

/*
 * If we are singlestepping, then ensure this thread is not connected to
 * non-fatal signals until completion of singlestep. When xol insn itself
 * triggers the signal, restart the original insn even if the task is
 * already SIGKILL'ed (since coredump should report the correct ip). This
 * is even more important if the task has a handler for SIGSEGV/etc: the
 * _same_ instruction should be repeated again after return from the signal
 * handler, and SSTEP can never finish in this case.
 */
bool uprobe_deny_signal(void)
{
	struct task_struct *t = current;
	struct uprobe_task *utask = t->utask;

	if (likely(!utask || !utask->active_uprobe))
		return false;

	WARN_ON_ONCE(utask->state != UTASK_SSTEP);

	if (task_sigpending(t)) {
		spin_lock_irq(&t->sighand->siglock);
		clear_tsk_thread_flag(t, TIF_SIGPENDING);
		spin_unlock_irq(&t->sighand->siglock);

		if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
			utask->state = UTASK_SSTEP_TRAPPED;
			set_tsk_thread_flag(t, TIF_UPROBE);
		}
	}

	return true;
}

static void mmf_recalc_uprobes(struct mm_struct *mm)
{
	VMA_ITERATOR(vmi, mm, 0);
	struct vm_area_struct *vma;

	for_each_vma(vmi, vma) {
		if (!valid_vma(vma, false))
			continue;
		/*
		 * This is not strictly accurate; we can race with
		 * uprobe_unregister() and see the already removed
		 * uprobe if delete_uprobe() was not yet called.
		 * Or this uprobe can be filtered out.
		 */
		if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
			return;
	}

	clear_bit(MMF_HAS_UPROBES, &mm->flags);
}

static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
{
	struct page *page;
	uprobe_opcode_t opcode;
	int result;

	if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE)))
		return -EINVAL;

	pagefault_disable();
	result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
	pagefault_enable();

	if (likely(result == 0))
		goto out;

	result = get_user_pages(vaddr, 1, FOLL_FORCE, &page);
	if (result < 0)
		return result;

	copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
	put_page(page);
out:
	/* This needs to return true for any variant of the trap insn */
	return is_trap_insn(&opcode);
}

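/*
 * Try to resolve the uprobe for @bp_vaddr without taking mmap_lock, using
 * the speculative mmap_lock sequence count. Returns NULL if the lookup
 * could not be performed or the mm changed while we were looking.
 */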
static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
{
	struct mm_struct *mm = current->mm;
	struct uprobe *uprobe = NULL;
	struct vm_area_struct *vma;
	struct file *vm_file;
	loff_t offset;
	unsigned int seq;

	guard(rcu)();

	if (!mmap_lock_speculate_try_begin(mm, &seq))
		return NULL;

	vma = vma_lookup(mm, bp_vaddr);
	if (!vma)
		return NULL;

	/*
	 * vm_file memory can be reused for another instance of struct file,
	 * but can't be freed from under us, so it's safe to read fields from
	 * it, even if the values are garbage; ultimately the
	 * find_uprobe_rcu() + mmap_lock_speculate_retry() check will ensure
	 * that whatever we speculatively found is correct.
	 */
	vm_file = READ_ONCE(vma->vm_file);
	if (!vm_file)
		return NULL;

	offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start);
	uprobe = find_uprobe_rcu(vm_file->f_inode, offset);
	if (!uprobe)
		return NULL;

	/* now double check that nothing about MM changed */
	if (mmap_lock_speculate_retry(mm, seq))
		return NULL;

	return uprobe;
}

/* assumes being inside RCU protected region */
static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp)
{
	struct mm_struct *mm = current->mm;
	struct uprobe *uprobe = NULL;
	struct vm_area_struct *vma;

	uprobe = find_active_uprobe_speculative(bp_vaddr);
	if (uprobe)
		return uprobe;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, bp_vaddr);
	if (vma) {
		if (vma->vm_file) {
			struct inode *inode = file_inode(vma->vm_file);
			loff_t offset = vaddr_to_offset(vma, bp_vaddr);

			uprobe = find_uprobe_rcu(inode, offset);
		}

		if (!uprobe)
			*is_swbp = is_trap_at_addr(mm, bp_vaddr);
	} else {
		*is_swbp = -EFAULT;
	}

	if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
		mmf_recalc_uprobes(mm);
	mmap_read_unlock(mm);

	return uprobe;
}

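/*
 * Record a (consumer id, cookie) pair on the return instance. The first
 * consumer is stored inline in ri->consumer; additional ones go into the
 * krealloc'ed extra_consumers array. On allocation failure the instance is
 * freed and ZERO_SIZE_PTR is returned.
 */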
static struct return_instance *push_consumer(struct return_instance *ri, __u64 id, __u64 cookie)
{
	struct return_consumer *ric;

	if (unlikely(ri == ZERO_SIZE_PTR))
		return ri;

	if (unlikely(ri->cons_cnt > 0)) {
		ric = krealloc(ri->extra_consumers, sizeof(*ric) * ri->cons_cnt, GFP_KERNEL);
		if (!ric) {
			ri_free(ri);
			return ZERO_SIZE_PTR;
		}
		ri->extra_consumers = ric;
	}

	ric = likely(ri->cons_cnt == 0) ? &ri->consumer : &ri->extra_consumers[ri->cons_cnt - 1];
	ric->id = id;
	ric->cookie = cookie;

	ri->cons_cnt++;
	return ri;
}

static struct return_consumer *
return_consumer_find(struct return_instance *ri, int *iter, int id)
{
	struct return_consumer *ric;
	int idx;

	for (idx = *iter; idx < ri->cons_cnt; idx++) {
		ric = likely(idx == 0) ? &ri->consumer : &ri->extra_consumers[idx - 1];
		if (ric->id == id) {
			*iter = idx + 1;
			return ric;
		}
	}

	return NULL;
}

static bool ignore_ret_handler(int rc)
{
	return rc == UPROBE_HANDLER_REMOVE || rc == UPROBE_HANDLER_IGNORE;
}

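/*
 * Run every registered consumer's handler for this uprobe hit. If any
 * consumer has a ret_handler (and did not ask to be ignored), set up a
 * uretprobe via prepare_uretprobe(). If all handlers requested removal and
 * the filter chain agrees, unapply the uprobe from this mm.
 */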
static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
{
	struct uprobe_consumer *uc;
	bool has_consumers = false, remove = true;
	struct return_instance *ri = NULL;
	struct uprobe_task *utask = current->utask;

	utask->auprobe = &uprobe->arch;

	list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
		bool session = uc->handler && uc->ret_handler;
		__u64 cookie = 0;
		int rc = 0;

		if (uc->handler) {
			rc = uc->handler(uc, regs, &cookie);
			WARN(rc < 0 || rc > 2,
			     "bad rc=0x%x from %ps()\n", rc, uc->handler);
		}

		remove &= rc == UPROBE_HANDLER_REMOVE;
		has_consumers = true;

		if (!uc->ret_handler || ignore_ret_handler(rc))
			continue;

		if (!ri)
			ri = alloc_return_instance(utask);

		if (session)
			ri = push_consumer(ri, uc->id, cookie);
	}
	utask->auprobe = NULL;

	if (!ZERO_OR_NULL_PTR(ri))
		prepare_uretprobe(uprobe, regs, ri);

	if (remove && has_consumers) {
		down_read(&uprobe->register_rwsem);

		/* re-check that removal is still required, this time under lock */
		if (!filter_chain(uprobe, current->mm)) {
			WARN_ON(!uprobe_is_active(uprobe));
			unapply_uprobe(uprobe, current->mm);
		}

		up_read(&uprobe->register_rwsem);
	}
}

static void
handle_uretprobe_chain(struct return_instance *ri, struct uprobe *uprobe, struct pt_regs *regs)
{
	struct return_consumer *ric;
	struct uprobe_consumer *uc;
	int ric_idx = 0;

	/* all consumers unsubscribed meanwhile */
	if (unlikely(!uprobe))
		return;

	rcu_read_lock_trace();
	list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
		bool session = uc->handler && uc->ret_handler;

		if (uc->ret_handler) {
			ric = return_consumer_find(ri, &ric_idx, uc->id);
			if (!session || ric)
				uc->ret_handler(uc, ri->func, regs, ric ? &ric->cookie : NULL);
		}
	}
	rcu_read_unlock_trace();
}

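/*
 * Skip over the current chain of "chained" return instances and return the
 * first instance of the next (outer) chain, or NULL if this was the last one.
 */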
static struct return_instance *find_next_ret_chain(struct return_instance *ri)
{
	bool chained;

	do {
		chained = ri->chained;
		ri = ri->next;	/* can't be NULL if chained */
	} while (chained);

	return ri;
}

void uprobe_handle_trampoline(struct pt_regs *regs)
{
	struct uprobe_task *utask;
	struct return_instance *ri, *ri_next, *next_chain;
	struct uprobe *uprobe;
	enum hprobe_state hstate;
	bool valid;

	utask = current->utask;
	if (!utask)
		goto sigill;

	ri = utask->return_instances;
	if (!ri)
		goto sigill;

	do {
		/*
		 * We should throw out the frames invalidated by longjmp().
		 * If this chain is valid, then the next one should be alive
		 * or NULL; the latter case means that nobody but ri->func
		 * could hit this trampoline on return. TODO: sigaltstack().
		 */
		next_chain = find_next_ret_chain(ri);
		valid = !next_chain || arch_uretprobe_is_alive(next_chain, RP_CHECK_RET, regs);

		instruction_pointer_set(regs, ri->orig_ret_vaddr);
		do {
			/* pop current instance from the stack of pending return instances,
			 * as it's not pending anymore: we just fixed up original
			 * instruction pointer in regs and are about to call handlers;
			 * this allows fixup_uretprobe_trampoline_entries() to properly fix up
			 * captured stack traces from uretprobe handlers, in which pending
			 * trampoline addresses on the stack are replaced with correct
			 * original return addresses
			 */
			ri_next = ri->next;
			rcu_assign_pointer(utask->return_instances, ri_next);
			utask->depth--;

			uprobe = hprobe_consume(&ri->hprobe, &hstate);
			if (valid)
				handle_uretprobe_chain(ri, uprobe, regs);
			hprobe_finalize(&ri->hprobe, hstate);

			/* We already took care of hprobe, no need to waste more time on that. */
			free_ret_instance(utask, ri, false /* !cleanup_hprobe */);
			ri = ri_next;
		} while (ri != next_chain);
	} while (!valid);

	return;

sigill:
	uprobe_warn(current, "handle uretprobe, sending SIGILL.");
	force_sig(SIGILL);
}

bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
{
	return false;
}

bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
				    struct pt_regs *regs)
{
	return true;
}

/*
 * Run handler and ask thread to singlestep.
 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
 */
static void handle_swbp(struct pt_regs *regs)
{
	struct uprobe *uprobe;
	unsigned long bp_vaddr;
	int is_swbp;

	bp_vaddr = uprobe_get_swbp_addr(regs);
	if (bp_vaddr == uprobe_get_trampoline_vaddr())
		return uprobe_handle_trampoline(regs);

	rcu_read_lock_trace();

	uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
	if (!uprobe) {
		if (is_swbp > 0) {
			/* No matching uprobe; signal SIGTRAP. */
			force_sig(SIGTRAP);
		} else {
			/*
			 * Either we raced with uprobe_unregister() or we can't
			 * access this memory. The latter is only possible if
			 * another thread plays with our ->mm. In both cases
			 * we can simply restart. If this vma was unmapped we
			 * can pretend this insn was not executed yet and get
			 * the (correct) SIGSEGV after restart.
			 */
			instruction_pointer_set(regs, bp_vaddr);
		}
		goto out;
	}

	/* change it in advance for ->handler() and restart */
	instruction_pointer_set(regs, bp_vaddr);

	/*
	 * TODO: move copy_insn/etc into _register and remove this hack.
	 * After we hit the bp, _unregister + _register can install the
	 * new and not-yet-analyzed uprobe at the same address, restart.
	 */
	if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
		goto out;

	/*
	 * Pairs with the smp_wmb() in prepare_uprobe().
	 *
	 * Guarantees that if we see the UPROBE_COPY_INSN bit set, then
	 * we must also see the stores to &uprobe->arch performed by the
	 * prepare_uprobe() call.
	 */
	smp_rmb();

	/* Tracing handlers use ->utask to communicate with fetch methods */
	if (!get_utask())
		goto out;

	if (arch_uprobe_ignore(&uprobe->arch, regs))
		goto out;

	handler_chain(uprobe, regs);

	if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
		goto out;

	if (pre_ssout(uprobe, regs, bp_vaddr))
		goto out;

out:
	/* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
	rcu_read_unlock_trace();
}

/*
 * Perform required fix-ups and disable singlestep.
 * Allow pending signals to take effect.
 */
static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
{
	struct uprobe *uprobe;
	int err = 0;

	uprobe = utask->active_uprobe;
	if (utask->state == UTASK_SSTEP_ACK)
		err = arch_uprobe_post_xol(&uprobe->arch, regs);
	else if (utask->state == UTASK_SSTEP_TRAPPED)
		arch_uprobe_abort_xol(&uprobe->arch, regs);
	else
		WARN_ON_ONCE(1);

	put_uprobe(uprobe);
	utask->active_uprobe = NULL;
	utask->state = UTASK_RUNNING;
	xol_free_insn_slot(utask);

	spin_lock_irq(&current->sighand->siglock);
	recalc_sigpending(); /* see uprobe_deny_signal() */
	spin_unlock_irq(&current->sighand->siglock);

	if (unlikely(err)) {
		uprobe_warn(current, "execute the probed insn, sending SIGILL.");
		force_sig(SIGILL);
	}
}

/*
 * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and
 * allows the thread to return from interrupt. After that handle_swbp()
 * sets utask->active_uprobe.
 *
 * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag
 * and allows the thread to return from interrupt.
 *
 * While returning to userspace, thread notices the TIF_UPROBE flag and calls
 * uprobe_notify_resume().
 */
void uprobe_notify_resume(struct pt_regs *regs)
{
	struct uprobe_task *utask;

	clear_thread_flag(TIF_UPROBE);

	utask = current->utask;
	if (utask && utask->active_uprobe)
		handle_singlestep(utask, regs);
	else
		handle_swbp(regs);
}

/*
 * uprobe_pre_sstep_notifier gets called from interrupt context as part of
 * the notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit.
 */
int uprobe_pre_sstep_notifier(struct pt_regs *regs)
{
	if (!current->mm)
		return 0;

	if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
	    (!current->utask || !current->utask->return_instances))
		return 0;

	set_thread_flag(TIF_UPROBE);
	return 1;
}

/*
 * uprobe_post_sstep_notifier gets called in interrupt context as part of the
 * notifier mechanism. Set TIF_UPROBE flag and indicate completion of
 * singlestep.
 */
int uprobe_post_sstep_notifier(struct pt_regs *regs)
{
	struct uprobe_task *utask = current->utask;

	if (!current->mm || !utask || !utask->active_uprobe)
		/* task is currently not uprobed */
		return 0;

	utask->state = UTASK_SSTEP_ACK;
	set_thread_flag(TIF_UPROBE);
	return 1;
}

static struct notifier_block uprobe_exception_nb = {
	.notifier_call = arch_uprobe_exception_notify,
	.priority = INT_MAX-1,	/* notified after kprobes, kgdb */
};

void __init uprobes_init(void)
{
	int i;

	for (i = 0; i < UPROBES_HASH_SZ; i++)
		mutex_init(&uprobes_mmap_mutex[i]);

	BUG_ON(register_die_notifier(&uprobe_exception_nb));
}