xref: /linux/kernel/events/uprobes.c (revision c532de5a67a70f8533d495f8f2aaa9a0491c3ad0)
1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3  * User-space Probes (UProbes)
4  *
5  * Copyright (C) IBM Corporation, 2008-2012
6  * Authors:
7  *	Srikar Dronamraju
8  *	Jim Keniston
9  * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra
10  */
11 
12 #include <linux/kernel.h>
13 #include <linux/highmem.h>
14 #include <linux/pagemap.h>	/* read_mapping_page */
15 #include <linux/slab.h>
16 #include <linux/sched.h>
17 #include <linux/sched/mm.h>
18 #include <linux/sched/coredump.h>
19 #include <linux/export.h>
20 #include <linux/rmap.h>		/* anon_vma_prepare */
21 #include <linux/mmu_notifier.h>
22 #include <linux/swap.h>		/* folio_free_swap */
23 #include <linux/ptrace.h>	/* user_enable_single_step */
24 #include <linux/kdebug.h>	/* notifier mechanism */
25 #include <linux/percpu-rwsem.h>
26 #include <linux/task_work.h>
27 #include <linux/shmem_fs.h>
28 #include <linux/khugepaged.h>
29 
30 #include <linux/uprobes.h>
31 
32 #define UINSNS_PER_PAGE			(PAGE_SIZE/UPROBE_XOL_SLOT_BYTES)
33 #define MAX_UPROBE_XOL_SLOTS		UINSNS_PER_PAGE
34 
35 static struct rb_root uprobes_tree = RB_ROOT;
36 /*
37  * allows us to skip the uprobe_mmap if there are no uprobe events active
38  * at this time.  Probably a fine grained per inode count is better?
39  */
40 #define no_uprobe_events()	RB_EMPTY_ROOT(&uprobes_tree)
41 
42 static DEFINE_RWLOCK(uprobes_treelock);	/* serialize rbtree access */
43 static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock);
44 
45 DEFINE_STATIC_SRCU(uprobes_srcu);
46 
47 #define UPROBES_HASH_SZ	13
48 /* serialize uprobe->pending_list */
49 static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
50 #define uprobes_mmap_hash(v)	(&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
51 
52 DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
53 
54 /* Have a copy of original instruction */
55 #define UPROBE_COPY_INSN	0
56 
57 struct uprobe {
58 	struct rb_node		rb_node;	/* node in the rb tree */
59 	refcount_t		ref;
60 	struct rw_semaphore	register_rwsem;
61 	struct rw_semaphore	consumer_rwsem;
62 	struct list_head	pending_list;
63 	struct list_head	consumers;
64 	struct inode		*inode;		/* Also hold a ref to inode */
65 	struct rcu_head		rcu;
66 	loff_t			offset;
67 	loff_t			ref_ctr_offset;
68 	unsigned long		flags;
69 
70 	/*
71 	 * The generic code assumes that it has two members of unknown type
72 	 * owned by the arch-specific code:
73 	 *
74 	 * 	insn -	copy_insn() saves the original instruction here for
75 	 *		arch_uprobe_analyze_insn().
76 	 *
77 	 *	ixol -	potentially modified instruction to execute out of
78 	 *		line, copied to xol_area by xol_get_insn_slot().
79 	 */
80 	struct arch_uprobe	arch;
81 };
82 
83 struct delayed_uprobe {
84 	struct list_head list;
85 	struct uprobe *uprobe;
86 	struct mm_struct *mm;
87 };
88 
89 static DEFINE_MUTEX(delayed_uprobe_lock);
90 static LIST_HEAD(delayed_uprobe_list);
91 
92 /*
93  * Execute out of line area: anonymous executable mapping installed
94  * by the probed task to execute the copy of the original instruction
95  * mangled by set_swbp().
96  *
97  * On a breakpoint hit, the thread contends for a slot.  It frees the
98  * slot after singlestep. Currently a fixed number of slots are
99  * allocated.
100  */
101 struct xol_area {
102 	wait_queue_head_t 		wq;		/* if all slots are busy */
103 	atomic_t 			slot_count;	/* number of in-use slots */
104 	unsigned long 			*bitmap;	/* 0 = free slot */
105 
106 	struct page			*page;
107 	/*
108 	 * We keep the vma's vm_start rather than a pointer to the vma
109 	 * itself.  The probed process or a naughty kernel module could make
110 	 * the vma go away, and we must handle that reasonably gracefully.
111 	 */
112 	unsigned long 			vaddr;		/* Page(s) of instruction slots */
113 };
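
/*
 * Illustrative sketch (not part of this file): assuming PAGE_SIZE == 4096
 * and UPROBE_XOL_SLOT_BYTES == 128 (both values are arch-dependent),
 * UINSNS_PER_PAGE is 4096 / 128 == 32, so an xol_area provides 32 slots and
 * slot N sits at:
 *
 *	slot_vaddr = area->vaddr + N * UPROBE_XOL_SLOT_BYTES
 *
 * Slot 0 is reserved for the uretprobe trampoline, see __create_xol_area().
 */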
114 
115 static void uprobe_warn(struct task_struct *t, const char *msg)
116 {
117 	pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg);
118 }
119 
120 /*
121  * valid_vma: Verify if the specified vma is an executable vma.
122  * Relax restrictions while unregistering: vm_flags might have
123  * changed after the breakpoint was inserted.
124  *	- is_register: indicates if we are in register context.
125  *	- Return true if @vma is an executable vma that we
126  *	  can place probes in.
127  */
128 static bool valid_vma(struct vm_area_struct *vma, bool is_register)
129 {
130 	vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
131 
132 	if (is_register)
133 		flags |= VM_WRITE;
134 
135 	return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC;
136 }
137 
138 static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
139 {
140 	return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
141 }
142 
143 static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
144 {
145 	return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
146 }
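
/*
 * Worked example (illustrative, with assumed values): for a vma with
 * vm_start == 0x400000, vm_pgoff == 0 and PAGE_SHIFT == 12, a probe at file
 * offset 0x1234 maps to the virtual address
 *
 *	0x400000 + 0x1234 - (0 << 12) == 0x401234
 *
 * and vaddr_to_offset(vma, 0x401234) maps it back to 0x1234.
 */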
147 
148 /**
149  * __replace_page - replace a page in the vma with a new page.
150  * Based on replace_page() in mm/ksm.c.
151  *
152  * @vma:      vma that holds the pte pointing to page
153  * @addr:     address the old @page is mapped at
154  * @old_page: the page we are replacing
155  * @new_page: the modified page that replaces @old_page
156  *
157  * If @new_page is NULL, only unmap @old_page.
158  *
159  * Returns 0 on success, negative error code otherwise.
160  */
161 static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
162 				struct page *old_page, struct page *new_page)
163 {
164 	struct folio *old_folio = page_folio(old_page);
165 	struct folio *new_folio;
166 	struct mm_struct *mm = vma->vm_mm;
167 	DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0);
168 	int err;
169 	struct mmu_notifier_range range;
170 
171 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
172 				addr + PAGE_SIZE);
173 
174 	if (new_page) {
175 		new_folio = page_folio(new_page);
176 		err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL);
177 		if (err)
178 			return err;
179 	}
180 
181 	/* For folio_free_swap() below */
182 	folio_lock(old_folio);
183 
184 	mmu_notifier_invalidate_range_start(&range);
185 	err = -EAGAIN;
186 	if (!page_vma_mapped_walk(&pvmw))
187 		goto unlock;
188 	VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
189 
190 	if (new_page) {
191 		folio_get(new_folio);
192 		folio_add_new_anon_rmap(new_folio, vma, addr, RMAP_EXCLUSIVE);
193 		folio_add_lru_vma(new_folio, vma);
194 	} else
195 		/* no new page, just dec_mm_counter for old_page */
196 		dec_mm_counter(mm, MM_ANONPAGES);
197 
198 	if (!folio_test_anon(old_folio)) {
199 		dec_mm_counter(mm, mm_counter_file(old_folio));
200 		inc_mm_counter(mm, MM_ANONPAGES);
201 	}
202 
203 	flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte)));
204 	ptep_clear_flush(vma, addr, pvmw.pte);
205 	if (new_page)
206 		set_pte_at(mm, addr, pvmw.pte,
207 			   mk_pte(new_page, vma->vm_page_prot));
208 
209 	folio_remove_rmap_pte(old_folio, old_page, vma);
210 	if (!folio_mapped(old_folio))
211 		folio_free_swap(old_folio);
212 	page_vma_mapped_walk_done(&pvmw);
213 	folio_put(old_folio);
214 
215 	err = 0;
216  unlock:
217 	mmu_notifier_invalidate_range_end(&range);
218 	folio_unlock(old_folio);
219 	return err;
220 }
221 
222 /**
223  * is_swbp_insn - check if instruction is breakpoint instruction.
224  * @insn: instruction to be checked.
225  * Default implementation of is_swbp_insn
226  * Returns true if @insn is a breakpoint instruction.
227  */
228 bool __weak is_swbp_insn(uprobe_opcode_t *insn)
229 {
230 	return *insn == UPROBE_SWBP_INSN;
231 }
232 
233 /**
234  * is_trap_insn - check if instruction is a trap instruction.
235  * @insn: instruction to be checked.
236  * Default implementation of is_trap_insn.
237  * Returns true if @insn is a trap instruction.
238  *
239  * This function is needed for the case where an architecture has multiple
240  * trap instructions (like powerpc).
241  */
242 bool __weak is_trap_insn(uprobe_opcode_t *insn)
243 {
244 	return is_swbp_insn(insn);
245 }
246 
247 static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
248 {
249 	void *kaddr = kmap_atomic(page);
250 	memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
251 	kunmap_atomic(kaddr);
252 }
253 
254 static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
255 {
256 	void *kaddr = kmap_atomic(page);
257 	memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
258 	kunmap_atomic(kaddr);
259 }
260 
261 static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
262 {
263 	uprobe_opcode_t old_opcode;
264 	bool is_swbp;
265 
266 	/*
267 	 * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here.
268 	 * We do not check if it is any other 'trap variant' which could
269 	 * be conditional trap instruction such as the one powerpc supports.
270 	 *
271 	 * The logic is that we do not care if the underlying instruction
272 	 * is a trap variant; uprobes always wins over any other (gdb)
273 	 * breakpoint.
274 	 */
275 	copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
276 	is_swbp = is_swbp_insn(&old_opcode);
277 
278 	if (is_swbp_insn(new_opcode)) {
279 		if (is_swbp)		/* register: already installed? */
280 			return 0;
281 	} else {
282 		if (!is_swbp)		/* unregister: was it changed by us? */
283 			return 0;
284 	}
285 
286 	return 1;
287 }
288 
289 static struct delayed_uprobe *
290 delayed_uprobe_check(struct uprobe *uprobe, struct mm_struct *mm)
291 {
292 	struct delayed_uprobe *du;
293 
294 	list_for_each_entry(du, &delayed_uprobe_list, list)
295 		if (du->uprobe == uprobe && du->mm == mm)
296 			return du;
297 	return NULL;
298 }
299 
300 static int delayed_uprobe_add(struct uprobe *uprobe, struct mm_struct *mm)
301 {
302 	struct delayed_uprobe *du;
303 
304 	if (delayed_uprobe_check(uprobe, mm))
305 		return 0;
306 
307 	du  = kzalloc(sizeof(*du), GFP_KERNEL);
308 	if (!du)
309 		return -ENOMEM;
310 
311 	du->uprobe = uprobe;
312 	du->mm = mm;
313 	list_add(&du->list, &delayed_uprobe_list);
314 	return 0;
315 }
316 
317 static void delayed_uprobe_delete(struct delayed_uprobe *du)
318 {
319 	if (WARN_ON(!du))
320 		return;
321 	list_del(&du->list);
322 	kfree(du);
323 }
324 
325 static void delayed_uprobe_remove(struct uprobe *uprobe, struct mm_struct *mm)
326 {
327 	struct list_head *pos, *q;
328 	struct delayed_uprobe *du;
329 
330 	if (!uprobe && !mm)
331 		return;
332 
333 	list_for_each_safe(pos, q, &delayed_uprobe_list) {
334 		du = list_entry(pos, struct delayed_uprobe, list);
335 
336 		if (uprobe && du->uprobe != uprobe)
337 			continue;
338 		if (mm && du->mm != mm)
339 			continue;
340 
341 		delayed_uprobe_delete(du);
342 	}
343 }
344 
345 static bool valid_ref_ctr_vma(struct uprobe *uprobe,
346 			      struct vm_area_struct *vma)
347 {
348 	unsigned long vaddr = offset_to_vaddr(vma, uprobe->ref_ctr_offset);
349 
350 	return uprobe->ref_ctr_offset &&
351 		vma->vm_file &&
352 		file_inode(vma->vm_file) == uprobe->inode &&
353 		(vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
354 		vma->vm_start <= vaddr &&
355 		vma->vm_end > vaddr;
356 }
357 
358 static struct vm_area_struct *
359 find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm)
360 {
361 	VMA_ITERATOR(vmi, mm, 0);
362 	struct vm_area_struct *tmp;
363 
364 	for_each_vma(vmi, tmp)
365 		if (valid_ref_ctr_vma(uprobe, tmp))
366 			return tmp;
367 
368 	return NULL;
369 }
370 
371 static int
372 __update_ref_ctr(struct mm_struct *mm, unsigned long vaddr, short d)
373 {
374 	void *kaddr;
375 	struct page *page;
376 	int ret;
377 	short *ptr;
378 
379 	if (!vaddr || !d)
380 		return -EINVAL;
381 
382 	ret = get_user_pages_remote(mm, vaddr, 1,
383 				    FOLL_WRITE, &page, NULL);
384 	if (unlikely(ret <= 0)) {
385 		/*
386 		 * We are asking for 1 page. If get_user_pages_remote() fails,
387 		 * it may return 0; in that case we have to return an error.
388 		 */
389 		return ret == 0 ? -EBUSY : ret;
390 	}
391 
392 	kaddr = kmap_atomic(page);
393 	ptr = kaddr + (vaddr & ~PAGE_MASK);
394 
395 	if (unlikely(*ptr + d < 0)) {
396 		pr_warn("ref_ctr going negative. vaddr: 0x%lx, "
397 			"curr val: %d, delta: %d\n", vaddr, *ptr, d);
398 		ret = -EINVAL;
399 		goto out;
400 	}
401 
402 	*ptr += d;
403 	ret = 0;
404 out:
405 	kunmap_atomic(kaddr);
406 	put_page(page);
407 	return ret;
408 }
409 
410 static void update_ref_ctr_warn(struct uprobe *uprobe,
411 				struct mm_struct *mm, short d)
412 {
413 	pr_warn("ref_ctr %s failed for inode: 0x%lx offset: "
414 		"0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n",
415 		d > 0 ? "increment" : "decrement", uprobe->inode->i_ino,
416 		(unsigned long long) uprobe->offset,
417 		(unsigned long long) uprobe->ref_ctr_offset, mm);
418 }
419 
420 static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
421 			  short d)
422 {
423 	struct vm_area_struct *rc_vma;
424 	unsigned long rc_vaddr;
425 	int ret = 0;
426 
427 	rc_vma = find_ref_ctr_vma(uprobe, mm);
428 
429 	if (rc_vma) {
430 		rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset);
431 		ret = __update_ref_ctr(mm, rc_vaddr, d);
432 		if (ret)
433 			update_ref_ctr_warn(uprobe, mm, d);
434 
435 		if (d > 0)
436 			return ret;
437 	}
438 
439 	mutex_lock(&delayed_uprobe_lock);
440 	if (d > 0)
441 		ret = delayed_uprobe_add(uprobe, mm);
442 	else
443 		delayed_uprobe_remove(uprobe, mm);
444 	mutex_unlock(&delayed_uprobe_lock);
445 
446 	return ret;
447 }
448 
449 /*
450  * NOTE:
451  * Expect the breakpoint instruction to be the smallest instruction for
452  * the architecture. If an arch has variable-length instructions and the
453  * breakpoint instruction is not the smallest instruction
454  * supported by that architecture, then we need to modify is_trap_at_addr and
455  * uprobe_write_opcode accordingly. This is never a problem for archs
456  * that have fixed-length instructions.
457  *
458  * uprobe_write_opcode - write the opcode at a given virtual address.
459  * @auprobe: arch specific probepoint information.
460  * @mm: the probed process address space.
461  * @vaddr: the virtual address to store the opcode.
462  * @opcode: opcode to be written at @vaddr.
463  *
464  * Called with mm->mmap_lock held for read or write.
465  * Return 0 (success) or a negative errno.
466  */
467 int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
468 			unsigned long vaddr, uprobe_opcode_t opcode)
469 {
470 	struct uprobe *uprobe;
471 	struct page *old_page, *new_page;
472 	struct vm_area_struct *vma;
473 	int ret, is_register, ref_ctr_updated = 0;
474 	bool orig_page_huge = false;
475 	unsigned int gup_flags = FOLL_FORCE;
476 
477 	is_register = is_swbp_insn(&opcode);
478 	uprobe = container_of(auprobe, struct uprobe, arch);
479 
480 retry:
481 	if (is_register)
482 		gup_flags |= FOLL_SPLIT_PMD;
483 	/* Read the page with vaddr into memory */
484 	old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
485 	if (IS_ERR(old_page))
486 		return PTR_ERR(old_page);
487 
488 	ret = verify_opcode(old_page, vaddr, &opcode);
489 	if (ret <= 0)
490 		goto put_old;
491 
492 	if (WARN(!is_register && PageCompound(old_page),
493 		 "uprobe unregister should never work on compound page\n")) {
494 		ret = -EINVAL;
495 		goto put_old;
496 	}
497 
498 	/* We are going to replace instruction, update ref_ctr. */
499 	if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
500 		ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
501 		if (ret)
502 			goto put_old;
503 
504 		ref_ctr_updated = 1;
505 	}
506 
507 	ret = 0;
508 	if (!is_register && !PageAnon(old_page))
509 		goto put_old;
510 
511 	ret = anon_vma_prepare(vma);
512 	if (ret)
513 		goto put_old;
514 
515 	ret = -ENOMEM;
516 	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
517 	if (!new_page)
518 		goto put_old;
519 
520 	__SetPageUptodate(new_page);
521 	copy_highpage(new_page, old_page);
522 	copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
523 
524 	if (!is_register) {
525 		struct page *orig_page;
526 		pgoff_t index;
527 
528 		VM_BUG_ON_PAGE(!PageAnon(old_page), old_page);
529 
530 		index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
531 		orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
532 					  index);
533 
534 		if (orig_page) {
535 			if (PageUptodate(orig_page) &&
536 			    pages_identical(new_page, orig_page)) {
537 				/* let go new_page */
538 				put_page(new_page);
539 				new_page = NULL;
540 
541 				if (PageCompound(orig_page))
542 					orig_page_huge = true;
543 			}
544 			put_page(orig_page);
545 		}
546 	}
547 
548 	ret = __replace_page(vma, vaddr & PAGE_MASK, old_page, new_page);
549 	if (new_page)
550 		put_page(new_page);
551 put_old:
552 	put_page(old_page);
553 
554 	if (unlikely(ret == -EAGAIN))
555 		goto retry;
556 
557 	/* Revert back reference counter if instruction update failed. */
558 	if (ret && is_register && ref_ctr_updated)
559 		update_ref_ctr(uprobe, mm, -1);
560 
561 	/* try collapse pmd for compound page */
562 	if (!ret && orig_page_huge)
563 		collapse_pte_mapped_thp(mm, vaddr, false);
564 
565 	return ret;
566 }
567 
568 /**
569  * set_swbp - store breakpoint at a given address.
570  * @auprobe: arch specific probepoint information.
571  * @mm: the probed process address space.
572  * @vaddr: the virtual address to insert the opcode.
573  *
574  * For mm @mm, store the breakpoint instruction at @vaddr.
575  * Return 0 (success) or a negative errno.
576  */
577 int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
578 {
579 	return uprobe_write_opcode(auprobe, mm, vaddr, UPROBE_SWBP_INSN);
580 }
581 
582 /**
583  * set_orig_insn - Restore the original instruction.
584  * @auprobe: arch specific probepoint information.
585  * @mm: the probed process address space.
586  * @vaddr: the virtual address at which to restore the original instruction.
587  *
588  * For mm @mm, restore the original instruction opcode at @vaddr.
589  * Return 0 (success) or a negative errno.
590  */
591 int __weak
592 set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
593 {
594 	return uprobe_write_opcode(auprobe, mm, vaddr,
595 			*(uprobe_opcode_t *)&auprobe->insn);
596 }
597 
598 /* uprobe should have guaranteed positive refcount */
599 static struct uprobe *get_uprobe(struct uprobe *uprobe)
600 {
601 	refcount_inc(&uprobe->ref);
602 	return uprobe;
603 }
604 
605 /*
606  * uprobe should have guaranteed lifetime, which can be either of:
607  *   - caller already has refcount taken (and wants an extra one);
608  *   - uprobe is RCU protected and won't be freed until after grace period;
609  *   - we are holding uprobes_treelock (for read or write, doesn't matter).
610  */
611 static struct uprobe *try_get_uprobe(struct uprobe *uprobe)
612 {
613 	if (refcount_inc_not_zero(&uprobe->ref))
614 		return uprobe;
615 	return NULL;
616 }
617 
618 static inline bool uprobe_is_active(struct uprobe *uprobe)
619 {
620 	return !RB_EMPTY_NODE(&uprobe->rb_node);
621 }
622 
623 static void uprobe_free_rcu(struct rcu_head *rcu)
624 {
625 	struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
626 
627 	kfree(uprobe);
628 }
629 
630 static void put_uprobe(struct uprobe *uprobe)
631 {
632 	if (!refcount_dec_and_test(&uprobe->ref))
633 		return;
634 
635 	write_lock(&uprobes_treelock);
636 
637 	if (uprobe_is_active(uprobe)) {
638 		write_seqcount_begin(&uprobes_seqcount);
639 		rb_erase(&uprobe->rb_node, &uprobes_tree);
640 		write_seqcount_end(&uprobes_seqcount);
641 	}
642 
643 	write_unlock(&uprobes_treelock);
644 
645 	/*
646 	 * If the application munmaps the exec vma before uprobe_unregister()
647 	 * gets called, we don't get a chance to remove the uprobe from
648 	 * delayed_uprobe_list in remove_breakpoint(). Do it here.
649 	 */
650 	mutex_lock(&delayed_uprobe_lock);
651 	delayed_uprobe_remove(uprobe, NULL);
652 	mutex_unlock(&delayed_uprobe_lock);
653 
654 	call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
655 }
656 
657 static __always_inline
658 int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset,
659 	       const struct uprobe *r)
660 {
661 	if (l_inode < r->inode)
662 		return -1;
663 
664 	if (l_inode > r->inode)
665 		return 1;
666 
667 	if (l_offset < r->offset)
668 		return -1;
669 
670 	if (l_offset > r->offset)
671 		return 1;
672 
673 	return 0;
674 }
675 
676 #define __node_2_uprobe(node) \
677 	rb_entry((node), struct uprobe, rb_node)
678 
679 struct __uprobe_key {
680 	struct inode *inode;
681 	loff_t offset;
682 };
683 
684 static inline int __uprobe_cmp_key(const void *key, const struct rb_node *b)
685 {
686 	const struct __uprobe_key *a = key;
687 	return uprobe_cmp(a->inode, a->offset, __node_2_uprobe(b));
688 }
689 
690 static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b)
691 {
692 	struct uprobe *u = __node_2_uprobe(a);
693 	return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b));
694 }
695 
696 /*
697  * Assumes we are inside an RCU-protected region.
698  * No refcount is taken on returned uprobe.
699  */
700 static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset)
701 {
702 	struct __uprobe_key key = {
703 		.inode = inode,
704 		.offset = offset,
705 	};
706 	struct rb_node *node;
707 	unsigned int seq;
708 
709 	lockdep_assert(srcu_read_lock_held(&uprobes_srcu));
710 
711 	do {
712 		seq = read_seqcount_begin(&uprobes_seqcount);
713 		node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key);
714 		/*
715 		 * Lockless RB-tree lookups can result only in false negatives.
716 		 * If the element is found, it is correct and can be returned
717 		 * under RCU protection. If we find nothing, we need to
718 		 * validate that seqcount didn't change. If it did, we have to
719 		 * try again as we might have missed the element (false
720 		 * negative). If seqcount is unchanged, search truly failed.
721 		 */
722 		if (node)
723 			return __node_2_uprobe(node);
724 	} while (read_seqcount_retry(&uprobes_seqcount, seq));
725 
726 	return NULL;
727 }
728 
729 /*
730  * Attempt to insert a new uprobe into uprobes_tree.
731  *
732  * If uprobe already exists (for given inode+offset), we just increment
733  * refcount of previously existing uprobe.
734  *
735  * If not, a provided new instance of uprobe is inserted into the tree (with
736  * assumed initial refcount == 1).
737  *
738  * In any case, we return a uprobe instance that ends up being in uprobes_tree.
739  * The caller has to clean up the new uprobe instance if it ended up not
740  * being inserted into the tree.
741  *
742  * We assume that uprobes_treelock is held for writing.
743  */
744 static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
745 {
746 	struct rb_node *node;
747 again:
748 	node = rb_find_add_rcu(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp);
749 	if (node) {
750 		struct uprobe *u = __node_2_uprobe(node);
751 
752 		if (!try_get_uprobe(u)) {
753 			rb_erase(node, &uprobes_tree);
754 			RB_CLEAR_NODE(&u->rb_node);
755 			goto again;
756 		}
757 
758 		return u;
759 	}
760 
761 	return uprobe;
762 }
763 
764 /*
765  * Acquire uprobes_treelock and insert uprobe into uprobes_tree
766  * (or reuse existing one, see __insert_uprobe() comments above).
767  */
768 static struct uprobe *insert_uprobe(struct uprobe *uprobe)
769 {
770 	struct uprobe *u;
771 
772 	write_lock(&uprobes_treelock);
773 	write_seqcount_begin(&uprobes_seqcount);
774 	u = __insert_uprobe(uprobe);
775 	write_seqcount_end(&uprobes_seqcount);
776 	write_unlock(&uprobes_treelock);
777 
778 	return u;
779 }
780 
781 static void
782 ref_ctr_mismatch_warn(struct uprobe *cur_uprobe, struct uprobe *uprobe)
783 {
784 	pr_warn("ref_ctr_offset mismatch. inode: 0x%lx offset: 0x%llx "
785 		"ref_ctr_offset(old): 0x%llx ref_ctr_offset(new): 0x%llx\n",
786 		uprobe->inode->i_ino, (unsigned long long) uprobe->offset,
787 		(unsigned long long) cur_uprobe->ref_ctr_offset,
788 		(unsigned long long) uprobe->ref_ctr_offset);
789 }
790 
791 static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
792 				   loff_t ref_ctr_offset)
793 {
794 	struct uprobe *uprobe, *cur_uprobe;
795 
796 	uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
797 	if (!uprobe)
798 		return ERR_PTR(-ENOMEM);
799 
800 	uprobe->inode = inode;
801 	uprobe->offset = offset;
802 	uprobe->ref_ctr_offset = ref_ctr_offset;
803 	INIT_LIST_HEAD(&uprobe->consumers);
804 	init_rwsem(&uprobe->register_rwsem);
805 	init_rwsem(&uprobe->consumer_rwsem);
806 	RB_CLEAR_NODE(&uprobe->rb_node);
807 	refcount_set(&uprobe->ref, 1);
808 
809 	/* add to uprobes_tree, sorted on inode:offset */
810 	cur_uprobe = insert_uprobe(uprobe);
811 	/* a uprobe exists for this inode:offset combination */
812 	if (cur_uprobe != uprobe) {
813 		if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) {
814 			ref_ctr_mismatch_warn(cur_uprobe, uprobe);
815 			put_uprobe(cur_uprobe);
816 			kfree(uprobe);
817 			return ERR_PTR(-EINVAL);
818 		}
819 		kfree(uprobe);
820 		uprobe = cur_uprobe;
821 	}
822 
823 	return uprobe;
824 }
825 
826 static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
827 {
828 	down_write(&uprobe->consumer_rwsem);
829 	list_add_rcu(&uc->cons_node, &uprobe->consumers);
830 	up_write(&uprobe->consumer_rwsem);
831 }
832 
833 /*
834  * For uprobe @uprobe, delete the consumer @uc.
835  * Should never be called with a consumer that's not part of @uprobe->consumers.
836  */
837 static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
838 {
839 	down_write(&uprobe->consumer_rwsem);
840 	list_del_rcu(&uc->cons_node);
841 	up_write(&uprobe->consumer_rwsem);
842 }
843 
844 static int __copy_insn(struct address_space *mapping, struct file *filp,
845 			void *insn, int nbytes, loff_t offset)
846 {
847 	struct page *page;
848 	/*
849 	 * Ensure that the page that has the original instruction is populated
850 	 * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(),
851 	 * see uprobe_register().
852 	 */
853 	if (mapping->a_ops->read_folio)
854 		page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
855 	else
856 		page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
857 	if (IS_ERR(page))
858 		return PTR_ERR(page);
859 
860 	copy_from_page(page, offset, insn, nbytes);
861 	put_page(page);
862 
863 	return 0;
864 }
865 
866 static int copy_insn(struct uprobe *uprobe, struct file *filp)
867 {
868 	struct address_space *mapping = uprobe->inode->i_mapping;
869 	loff_t offs = uprobe->offset;
870 	void *insn = &uprobe->arch.insn;
871 	int size = sizeof(uprobe->arch.insn);
872 	int len, err = -EIO;
873 
874 	/* Copy only available bytes, -EIO if nothing was read */
875 	do {
876 		if (offs >= i_size_read(uprobe->inode))
877 			break;
878 
879 		len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
880 		err = __copy_insn(mapping, filp, insn, len, offs);
881 		if (err)
882 			break;
883 
884 		insn += len;
885 		offs += len;
886 		size -= len;
887 	} while (size);
888 
889 	return err;
890 }
891 
892 static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
893 				struct mm_struct *mm, unsigned long vaddr)
894 {
895 	int ret = 0;
896 
897 	if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
898 		return ret;
899 
900 	/* TODO: move this into _register, until then we abuse this sem. */
901 	down_write(&uprobe->consumer_rwsem);
902 	if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
903 		goto out;
904 
905 	ret = copy_insn(uprobe, file);
906 	if (ret)
907 		goto out;
908 
909 	ret = -ENOTSUPP;
910 	if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn))
911 		goto out;
912 
913 	ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
914 	if (ret)
915 		goto out;
916 
917 	smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */
918 	set_bit(UPROBE_COPY_INSN, &uprobe->flags);
919 
920  out:
921 	up_write(&uprobe->consumer_rwsem);
922 
923 	return ret;
924 }
925 
926 static inline bool consumer_filter(struct uprobe_consumer *uc, struct mm_struct *mm)
927 {
928 	return !uc->filter || uc->filter(uc, mm);
929 }
930 
931 static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
932 {
933 	struct uprobe_consumer *uc;
934 	bool ret = false;
935 
936 	down_read(&uprobe->consumer_rwsem);
937 	list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
938 				 srcu_read_lock_held(&uprobes_srcu)) {
939 		ret = consumer_filter(uc, mm);
940 		if (ret)
941 			break;
942 	}
943 	up_read(&uprobe->consumer_rwsem);
944 
945 	return ret;
946 }
947 
948 static int
949 install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
950 			struct vm_area_struct *vma, unsigned long vaddr)
951 {
952 	bool first_uprobe;
953 	int ret;
954 
955 	ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
956 	if (ret)
957 		return ret;
958 
959 	/*
960 	 * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier();
961 	 * the task can hit this breakpoint right after __replace_page().
962 	 */
963 	first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
964 	if (first_uprobe)
965 		set_bit(MMF_HAS_UPROBES, &mm->flags);
966 
967 	ret = set_swbp(&uprobe->arch, mm, vaddr);
968 	if (!ret)
969 		clear_bit(MMF_RECALC_UPROBES, &mm->flags);
970 	else if (first_uprobe)
971 		clear_bit(MMF_HAS_UPROBES, &mm->flags);
972 
973 	return ret;
974 }
975 
976 static int
977 remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
978 {
979 	set_bit(MMF_RECALC_UPROBES, &mm->flags);
980 	return set_orig_insn(&uprobe->arch, mm, vaddr);
981 }
982 
983 struct map_info {
984 	struct map_info *next;
985 	struct mm_struct *mm;
986 	unsigned long vaddr;
987 };
988 
989 static inline struct map_info *free_map_info(struct map_info *info)
990 {
991 	struct map_info *next = info->next;
992 	kfree(info);
993 	return next;
994 }
995 
996 static struct map_info *
997 build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
998 {
999 	unsigned long pgoff = offset >> PAGE_SHIFT;
1000 	struct vm_area_struct *vma;
1001 	struct map_info *curr = NULL;
1002 	struct map_info *prev = NULL;
1003 	struct map_info *info;
1004 	int more = 0;
1005 
1006  again:
1007 	i_mmap_lock_read(mapping);
1008 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1009 		if (!valid_vma(vma, is_register))
1010 			continue;
1011 
1012 		if (!prev && !more) {
1013 			/*
1014 			 * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through
1015 			 * reclaim. This is optimistic; no harm done if it fails.
1016 			 */
1017 			prev = kmalloc(sizeof(struct map_info),
1018 					GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
1019 			if (prev)
1020 				prev->next = NULL;
1021 		}
1022 		if (!prev) {
1023 			more++;
1024 			continue;
1025 		}
1026 
1027 		if (!mmget_not_zero(vma->vm_mm))
1028 			continue;
1029 
1030 		info = prev;
1031 		prev = prev->next;
1032 		info->next = curr;
1033 		curr = info;
1034 
1035 		info->mm = vma->vm_mm;
1036 		info->vaddr = offset_to_vaddr(vma, offset);
1037 	}
1038 	i_mmap_unlock_read(mapping);
1039 
1040 	if (!more)
1041 		goto out;
1042 
1043 	prev = curr;
1044 	while (curr) {
1045 		mmput(curr->mm);
1046 		curr = curr->next;
1047 	}
1048 
1049 	do {
1050 		info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
1051 		if (!info) {
1052 			curr = ERR_PTR(-ENOMEM);
1053 			goto out;
1054 		}
1055 		info->next = prev;
1056 		prev = info;
1057 	} while (--more);
1058 
1059 	goto again;
1060  out:
1061 	while (prev)
1062 		prev = free_map_info(prev);
1063 	return curr;
1064 }
1065 
1066 static int
1067 register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
1068 {
1069 	bool is_register = !!new;
1070 	struct map_info *info;
1071 	int err = 0;
1072 
1073 	percpu_down_write(&dup_mmap_sem);
1074 	info = build_map_info(uprobe->inode->i_mapping,
1075 					uprobe->offset, is_register);
1076 	if (IS_ERR(info)) {
1077 		err = PTR_ERR(info);
1078 		goto out;
1079 	}
1080 
1081 	while (info) {
1082 		struct mm_struct *mm = info->mm;
1083 		struct vm_area_struct *vma;
1084 
1085 		if (err && is_register)
1086 			goto free;
1087 		/*
1088 		 * We take mmap_lock for writing to avoid the race with
1089 		 * find_active_uprobe_rcu() which takes mmap_lock for reading.
1090 		 * Thus this install_breakpoint() can not make
1091 		 * is_trap_at_addr() true right after find_uprobe_rcu()
1092 		 * returns NULL in find_active_uprobe_rcu().
1093 		 */
1094 		mmap_write_lock(mm);
1095 		vma = find_vma(mm, info->vaddr);
1096 		if (!vma || !valid_vma(vma, is_register) ||
1097 		    file_inode(vma->vm_file) != uprobe->inode)
1098 			goto unlock;
1099 
1100 		if (vma->vm_start > info->vaddr ||
1101 		    vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
1102 			goto unlock;
1103 
1104 		if (is_register) {
1105 			/* consult only the "caller", new consumer. */
1106 			if (consumer_filter(new, mm))
1107 				err = install_breakpoint(uprobe, mm, vma, info->vaddr);
1108 		} else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
1109 			if (!filter_chain(uprobe, mm))
1110 				err |= remove_breakpoint(uprobe, mm, info->vaddr);
1111 		}
1112 
1113  unlock:
1114 		mmap_write_unlock(mm);
1115  free:
1116 		mmput(mm);
1117 		info = free_map_info(info);
1118 	}
1119  out:
1120 	percpu_up_write(&dup_mmap_sem);
1121 	return err;
1122 }
1123 
1124 /**
1125  * uprobe_unregister_nosync - unregister an already registered probe.
1126  * @uprobe: uprobe to remove
1127  * @uc: identify which probe if multiple probes are colocated.
1128  */
1129 void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc)
1130 {
1131 	int err;
1132 
1133 	down_write(&uprobe->register_rwsem);
1134 	consumer_del(uprobe, uc);
1135 	err = register_for_each_vma(uprobe, NULL);
1136 	up_write(&uprobe->register_rwsem);
1137 
1138 	/* TODO: can't unregister? schedule a worker thread */
1139 	if (unlikely(err)) {
1140 		uprobe_warn(current, "unregister, leaking uprobe");
1141 		return;
1142 	}
1143 
1144 	put_uprobe(uprobe);
1145 }
1146 EXPORT_SYMBOL_GPL(uprobe_unregister_nosync);
1147 
1148 void uprobe_unregister_sync(void)
1149 {
1150 	/*
1151 	 * Now that handler_chain() and handle_uretprobe_chain() iterate over
1152 	 * uprobe->consumers list under RCU protection without holding
1153 	 * uprobe->register_rwsem, we need to wait for RCU grace period to
1154 	 * make sure that we can't call into just unregistered
1155 	 * uprobe_consumer's callbacks anymore. If we don't do that, a fast and
1156 	 * unlucky enough caller can free the consumer's memory and cause
1157 	 * handler_chain() or handle_uretprobe_chain() to do a use-after-free.
1158 	 */
1159 	synchronize_srcu(&uprobes_srcu);
1160 }
1161 EXPORT_SYMBOL_GPL(uprobe_unregister_sync);
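
/*
 * Illustrative sketch (not part of this file): a typical caller pairs the
 * two functions above, possibly batching several _nosync calls before one
 * sync. "my_consumer" and "my_consumer_data" are hypothetical names:
 *
 *	uprobe_unregister_nosync(uprobe, &my_consumer);
 *	uprobe_unregister_sync();	// wait out in-flight handlers
 *	kfree(my_consumer_data);	// now safe to free consumer state
 */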
1162 
1163 /**
1164  * uprobe_register - register a probe
1165  * @inode: the file in which the probe has to be placed.
1166  * @offset: offset from the start of the file.
1167  * @ref_ctr_offset: offset of SDT marker / reference counter
1168  * @uc: information on how to handle the probe.
1169  *
1170  * Apart from the access refcount, uprobe_register() takes a creation
1171  * refcount (through alloc_uprobe) if and only if this @uprobe is getting
1172  * inserted into the rbtree (i.e. the first consumer for a @inode:@offset
1173  * tuple).  Creation refcount stops uprobe_unregister from freeing the
1174  * @uprobe even before the register operation is complete. Creation
1175  * refcount is released when the last @uc for the @uprobe
1176  * unregisters. Caller of uprobe_register() is required to keep @inode
1177  * (and the containing mount) referenced.
1178  *
1179  * Return: pointer to the new uprobe on success or an ERR_PTR on failure.
1180  */
1181 struct uprobe *uprobe_register(struct inode *inode,
1182 				loff_t offset, loff_t ref_ctr_offset,
1183 				struct uprobe_consumer *uc)
1184 {
1185 	struct uprobe *uprobe;
1186 	int ret;
1187 
1188 	/* Uprobe must have at least one set consumer */
1189 	if (!uc->handler && !uc->ret_handler)
1190 		return ERR_PTR(-EINVAL);
1191 
1192 	/* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
1193 	if (!inode->i_mapping->a_ops->read_folio &&
1194 	    !shmem_mapping(inode->i_mapping))
1195 		return ERR_PTR(-EIO);
1196 	/* Racy, just to catch the obvious mistakes */
1197 	if (offset > i_size_read(inode))
1198 		return ERR_PTR(-EINVAL);
1199 
1200 	/*
1201 	 * This ensures that copy_from_page(), copy_to_page() and
1202 	 * __update_ref_ctr() can't cross page boundary.
1203 	 */
1204 	if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
1205 		return ERR_PTR(-EINVAL);
1206 	if (!IS_ALIGNED(ref_ctr_offset, sizeof(short)))
1207 		return ERR_PTR(-EINVAL);
1208 
1209 	uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
1210 	if (IS_ERR(uprobe))
1211 		return uprobe;
1212 
1213 	down_write(&uprobe->register_rwsem);
1214 	consumer_add(uprobe, uc);
1215 	ret = register_for_each_vma(uprobe, uc);
1216 	up_write(&uprobe->register_rwsem);
1217 
1218 	if (ret) {
1219 		uprobe_unregister_nosync(uprobe, uc);
1220 		/*
1221 		 * Registration might have partially succeeded, so we can have
1222 		 * this consumer being called right at this time. We need to
1223 		 * sync here. That's OK; this is an unlikely slow path.
1224 		 */
1225 		uprobe_unregister_sync();
1226 		return ERR_PTR(ret);
1227 	}
1228 
1229 	return uprobe;
1230 }
1231 EXPORT_SYMBOL_GPL(uprobe_register);
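
/*
 * Illustrative sketch (not part of this file): registering a probe at
 * @offset in an already opened file. "filp", "offset", "my_handler" and
 * "my_consumer" are hypothetical; my_handler() is assumed to have the
 * uprobe_consumer handler signature of this kernel version, and
 * ref_ctr_offset is passed as 0 because there is no SDT reference counter:
 *
 *	static struct uprobe_consumer my_consumer = {
 *		.handler = my_handler,
 *	};
 *
 *	struct uprobe *u;
 *
 *	u = uprobe_register(file_inode(filp), offset, 0, &my_consumer);
 *	if (IS_ERR(u))
 *		return PTR_ERR(u);
 *
 * Tearing down later uses uprobe_unregister_nosync(u, &my_consumer) followed
 * by uprobe_unregister_sync().
 */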
1232 
1233 /**
1234  * uprobe_apply - add or remove the breakpoints according to @uc->filter
1235  * @uprobe: uprobe which "owns" the breakpoint
1236  * @uc: consumer which wants to add more or remove some breakpoints
1237  * @add: add or remove the breakpoints
1238  * Return: 0 on success or negative error code.
1239  */
1240 int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
1241 {
1242 	struct uprobe_consumer *con;
1243 	int ret = -ENOENT, srcu_idx;
1244 
1245 	down_write(&uprobe->register_rwsem);
1246 
1247 	srcu_idx = srcu_read_lock(&uprobes_srcu);
1248 	list_for_each_entry_srcu(con, &uprobe->consumers, cons_node,
1249 				 srcu_read_lock_held(&uprobes_srcu)) {
1250 		if (con == uc) {
1251 			ret = register_for_each_vma(uprobe, add ? uc : NULL);
1252 			break;
1253 		}
1254 	}
1255 	srcu_read_unlock(&uprobes_srcu, srcu_idx);
1256 
1257 	up_write(&uprobe->register_rwsem);
1258 
1259 	return ret;
1260 }
1261 
1262 static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
1263 {
1264 	VMA_ITERATOR(vmi, mm, 0);
1265 	struct vm_area_struct *vma;
1266 	int err = 0;
1267 
1268 	mmap_read_lock(mm);
1269 	for_each_vma(vmi, vma) {
1270 		unsigned long vaddr;
1271 		loff_t offset;
1272 
1273 		if (!valid_vma(vma, false) ||
1274 		    file_inode(vma->vm_file) != uprobe->inode)
1275 			continue;
1276 
1277 		offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
1278 		if (uprobe->offset <  offset ||
1279 		    uprobe->offset >= offset + vma->vm_end - vma->vm_start)
1280 			continue;
1281 
1282 		vaddr = offset_to_vaddr(vma, uprobe->offset);
1283 		err |= remove_breakpoint(uprobe, mm, vaddr);
1284 	}
1285 	mmap_read_unlock(mm);
1286 
1287 	return err;
1288 }
1289 
1290 static struct rb_node *
1291 find_node_in_range(struct inode *inode, loff_t min, loff_t max)
1292 {
1293 	struct rb_node *n = uprobes_tree.rb_node;
1294 
1295 	while (n) {
1296 		struct uprobe *u = rb_entry(n, struct uprobe, rb_node);
1297 
1298 		if (inode < u->inode) {
1299 			n = n->rb_left;
1300 		} else if (inode > u->inode) {
1301 			n = n->rb_right;
1302 		} else {
1303 			if (max < u->offset)
1304 				n = n->rb_left;
1305 			else if (min > u->offset)
1306 				n = n->rb_right;
1307 			else
1308 				break;
1309 		}
1310 	}
1311 
1312 	return n;
1313 }
1314 
1315 /*
1316  * For a given range in vma, build a list of probes that need to be inserted.
1317  */
1318 static void build_probe_list(struct inode *inode,
1319 				struct vm_area_struct *vma,
1320 				unsigned long start, unsigned long end,
1321 				struct list_head *head)
1322 {
1323 	loff_t min, max;
1324 	struct rb_node *n, *t;
1325 	struct uprobe *u;
1326 
1327 	INIT_LIST_HEAD(head);
1328 	min = vaddr_to_offset(vma, start);
1329 	max = min + (end - start) - 1;
1330 
1331 	read_lock(&uprobes_treelock);
1332 	n = find_node_in_range(inode, min, max);
1333 	if (n) {
1334 		for (t = n; t; t = rb_prev(t)) {
1335 			u = rb_entry(t, struct uprobe, rb_node);
1336 			if (u->inode != inode || u->offset < min)
1337 				break;
1338 			/* if uprobe went away, it's safe to ignore it */
1339 			if (try_get_uprobe(u))
1340 				list_add(&u->pending_list, head);
1341 		}
1342 		for (t = n; (t = rb_next(t)); ) {
1343 			u = rb_entry(t, struct uprobe, rb_node);
1344 			if (u->inode != inode || u->offset > max)
1345 				break;
1346 			/* if uprobe went away, it's safe to ignore it */
1347 			if (try_get_uprobe(u))
1348 				list_add(&u->pending_list, head);
1349 		}
1350 	}
1351 	read_unlock(&uprobes_treelock);
1352 }
1353 
1354 /* @vma contains the reference counter, not the probed instruction. */
1355 static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
1356 {
1357 	struct list_head *pos, *q;
1358 	struct delayed_uprobe *du;
1359 	unsigned long vaddr;
1360 	int ret = 0, err = 0;
1361 
1362 	mutex_lock(&delayed_uprobe_lock);
1363 	list_for_each_safe(pos, q, &delayed_uprobe_list) {
1364 		du = list_entry(pos, struct delayed_uprobe, list);
1365 
1366 		if (du->mm != vma->vm_mm ||
1367 		    !valid_ref_ctr_vma(du->uprobe, vma))
1368 			continue;
1369 
1370 		vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset);
1371 		ret = __update_ref_ctr(vma->vm_mm, vaddr, 1);
1372 		if (ret) {
1373 			update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1);
1374 			if (!err)
1375 				err = ret;
1376 		}
1377 		delayed_uprobe_delete(du);
1378 	}
1379 	mutex_unlock(&delayed_uprobe_lock);
1380 	return err;
1381 }
1382 
1383 /*
1384  * Called from mmap_region/vma_merge with mm->mmap_lock acquired.
1385  *
1386  * Currently we ignore all errors and always return 0, the callers
1387  * can't handle the failure anyway.
1388  */
1389 int uprobe_mmap(struct vm_area_struct *vma)
1390 {
1391 	struct list_head tmp_list;
1392 	struct uprobe *uprobe, *u;
1393 	struct inode *inode;
1394 
1395 	if (no_uprobe_events())
1396 		return 0;
1397 
1398 	if (vma->vm_file &&
1399 	    (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE &&
1400 	    test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags))
1401 		delayed_ref_ctr_inc(vma);
1402 
1403 	if (!valid_vma(vma, true))
1404 		return 0;
1405 
1406 	inode = file_inode(vma->vm_file);
1407 	if (!inode)
1408 		return 0;
1409 
1410 	mutex_lock(uprobes_mmap_hash(inode));
1411 	build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
1412 	/*
1413 	 * We can race with uprobe_unregister(); this uprobe can already be
1414 	 * removed. But in this case filter_chain() must return false, as all
1415 	 * consumers have gone away.
1416 	 */
1417 	list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
1418 		if (!fatal_signal_pending(current) &&
1419 		    filter_chain(uprobe, vma->vm_mm)) {
1420 			unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
1421 			install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
1422 		}
1423 		put_uprobe(uprobe);
1424 	}
1425 	mutex_unlock(uprobes_mmap_hash(inode));
1426 
1427 	return 0;
1428 }
1429 
1430 static bool
1431 vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1432 {
1433 	loff_t min, max;
1434 	struct inode *inode;
1435 	struct rb_node *n;
1436 
1437 	inode = file_inode(vma->vm_file);
1438 
1439 	min = vaddr_to_offset(vma, start);
1440 	max = min + (end - start) - 1;
1441 
1442 	read_lock(&uprobes_treelock);
1443 	n = find_node_in_range(inode, min, max);
1444 	read_unlock(&uprobes_treelock);
1445 
1446 	return !!n;
1447 }
1448 
1449 /*
1450  * Called in context of a munmap of a vma.
1451  */
1452 void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1453 {
1454 	if (no_uprobe_events() || !valid_vma(vma, false))
1455 		return;
1456 
1457 	if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
1458 		return;
1459 
1460 	if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) ||
1461 	     test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags))
1462 		return;
1463 
1464 	if (vma_has_uprobes(vma, start, end))
1465 		set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
1466 }
1467 
1468 static vm_fault_t xol_fault(const struct vm_special_mapping *sm,
1469 			    struct vm_area_struct *vma, struct vm_fault *vmf)
1470 {
1471 	struct xol_area *area = vma->vm_mm->uprobes_state.xol_area;
1472 
1473 	vmf->page = area->page;
1474 	get_page(vmf->page);
1475 	return 0;
1476 }
1477 
1478 static const struct vm_special_mapping xol_mapping = {
1479 	.name = "[uprobes]",
1480 	.fault = xol_fault,
1481 };
1482 
1483 /* Slot allocation for XOL */
1484 static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
1485 {
1486 	struct vm_area_struct *vma;
1487 	int ret;
1488 
1489 	if (mmap_write_lock_killable(mm))
1490 		return -EINTR;
1491 
1492 	if (mm->uprobes_state.xol_area) {
1493 		ret = -EALREADY;
1494 		goto fail;
1495 	}
1496 
1497 	if (!area->vaddr) {
1498 		/* Try to map as high as possible; this is only a hint. */
1499 		area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
1500 						PAGE_SIZE, 0, 0);
1501 		if (IS_ERR_VALUE(area->vaddr)) {
1502 			ret = area->vaddr;
1503 			goto fail;
1504 		}
1505 	}
1506 
1507 	vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
1508 				VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO,
1509 				&xol_mapping);
1510 	if (IS_ERR(vma)) {
1511 		ret = PTR_ERR(vma);
1512 		goto fail;
1513 	}
1514 
1515 	ret = 0;
1516 	/* pairs with get_xol_area() */
1517 	smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */
1518  fail:
1519 	mmap_write_unlock(mm);
1520 
1521 	return ret;
1522 }
1523 
1524 void * __weak arch_uprobe_trampoline(unsigned long *psize)
1525 {
1526 	static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
1527 
1528 	*psize = UPROBE_SWBP_INSN_SIZE;
1529 	return &insn;
1530 }
1531 
1532 static struct xol_area *__create_xol_area(unsigned long vaddr)
1533 {
1534 	struct mm_struct *mm = current->mm;
1535 	unsigned long insns_size;
1536 	struct xol_area *area;
1537 	void *insns;
1538 
1539 	area = kzalloc(sizeof(*area), GFP_KERNEL);
1540 	if (unlikely(!area))
1541 		goto out;
1542 
1543 	area->bitmap = kcalloc(BITS_TO_LONGS(UINSNS_PER_PAGE), sizeof(long),
1544 			       GFP_KERNEL);
1545 	if (!area->bitmap)
1546 		goto free_area;
1547 
1548 	area->page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
1549 	if (!area->page)
1550 		goto free_bitmap;
1551 
1552 	area->vaddr = vaddr;
1553 	init_waitqueue_head(&area->wq);
1554 	/* Reserve the 1st slot for get_trampoline_vaddr() */
1555 	/* Reserve the 1st slot for uprobe_get_trampoline_vaddr() */
1556 	atomic_set(&area->slot_count, 1);
1557 	insns = arch_uprobe_trampoline(&insns_size);
1558 	arch_uprobe_copy_ixol(area->page, 0, insns, insns_size);
1559 
1560 	if (!xol_add_vma(mm, area))
1561 		return area;
1562 
1563 	__free_page(area->page);
1564  free_bitmap:
1565 	kfree(area->bitmap);
1566  free_area:
1567 	kfree(area);
1568  out:
1569 	return NULL;
1570 }
1571 
1572 /*
1573  * get_xol_area - Allocate process's xol_area if necessary.
1574  * This area will be used for storing instructions for execution out of line.
1575  *
1576  * Returns the allocated area or NULL.
1577  */
1578 static struct xol_area *get_xol_area(void)
1579 {
1580 	struct mm_struct *mm = current->mm;
1581 	struct xol_area *area;
1582 
1583 	if (!mm->uprobes_state.xol_area)
1584 		__create_xol_area(0);
1585 
1586 	/* Pairs with xol_add_vma() smp_store_release() */
1587 	area = READ_ONCE(mm->uprobes_state.xol_area); /* ^^^ */
1588 	return area;
1589 }
1590 
1591 /*
1592  * uprobe_clear_state - Free the area allocated for slots.
1593  */
1594 void uprobe_clear_state(struct mm_struct *mm)
1595 {
1596 	struct xol_area *area = mm->uprobes_state.xol_area;
1597 
1598 	mutex_lock(&delayed_uprobe_lock);
1599 	delayed_uprobe_remove(NULL, mm);
1600 	mutex_unlock(&delayed_uprobe_lock);
1601 
1602 	if (!area)
1603 		return;
1604 
1605 	put_page(area->page);
1606 	kfree(area->bitmap);
1607 	kfree(area);
1608 }
1609 
1610 void uprobe_start_dup_mmap(void)
1611 {
1612 	percpu_down_read(&dup_mmap_sem);
1613 }
1614 
1615 void uprobe_end_dup_mmap(void)
1616 {
1617 	percpu_up_read(&dup_mmap_sem);
1618 }
1619 
1620 void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
1621 {
1622 	if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
1623 		set_bit(MMF_HAS_UPROBES, &newmm->flags);
1624 		/* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
1625 		set_bit(MMF_RECALC_UPROBES, &newmm->flags);
1626 	}
1627 }
1628 
1629 /*
1630  * xol_take_insn_slot - search for a free slot and claim it.
1631  */
1632 static unsigned long xol_take_insn_slot(struct xol_area *area)
1633 {
1634 	unsigned long slot_addr;
1635 	int slot_nr;
1636 
1637 	do {
1638 		slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE);
1639 		if (slot_nr < UINSNS_PER_PAGE) {
1640 			if (!test_and_set_bit(slot_nr, area->bitmap))
1641 				break;
1642 
1643 			slot_nr = UINSNS_PER_PAGE;
1644 			continue;
1645 		}
1646 		wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE));
1647 	} while (slot_nr >= UINSNS_PER_PAGE);
1648 
1649 	slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES);
1650 	atomic_inc(&area->slot_count);
1651 
1652 	return slot_addr;
1653 }
1654 
1655 /*
1656  * xol_get_insn_slot - allocate a slot for xol.
1657  * Returns the allocated slot address or 0.
1658  */
1659 static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1660 {
1661 	struct xol_area *area;
1662 	unsigned long xol_vaddr;
1663 
1664 	area = get_xol_area();
1665 	if (!area)
1666 		return 0;
1667 
1668 	xol_vaddr = xol_take_insn_slot(area);
1669 	if (unlikely(!xol_vaddr))
1670 		return 0;
1671 
1672 	arch_uprobe_copy_ixol(area->page, xol_vaddr,
1673 			      &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
1674 
1675 	return xol_vaddr;
1676 }
1677 
1678 /*
1679  * xol_free_insn_slot - If a slot was earlier allocated by
1680  * @xol_get_insn_slot(), make the slot available for
1681  * subsequent requests.
1682  */
1683 static void xol_free_insn_slot(struct task_struct *tsk)
1684 {
1685 	struct xol_area *area;
1686 	unsigned long vma_end;
1687 	unsigned long slot_addr;
1688 
1689 	if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask)
1690 		return;
1691 
1692 	slot_addr = tsk->utask->xol_vaddr;
1693 	if (unlikely(!slot_addr))
1694 		return;
1695 
1696 	area = tsk->mm->uprobes_state.xol_area;
1697 	vma_end = area->vaddr + PAGE_SIZE;
1698 	if (area->vaddr <= slot_addr && slot_addr < vma_end) {
1699 		unsigned long offset;
1700 		int slot_nr;
1701 
1702 		offset = slot_addr - area->vaddr;
1703 		slot_nr = offset / UPROBE_XOL_SLOT_BYTES;
1704 		if (slot_nr >= UINSNS_PER_PAGE)
1705 			return;
1706 
1707 		clear_bit(slot_nr, area->bitmap);
1708 		atomic_dec(&area->slot_count);
1709 		smp_mb__after_atomic(); /* pairs with prepare_to_wait() */
1710 		if (waitqueue_active(&area->wq))
1711 			wake_up(&area->wq);
1712 
1713 		tsk->utask->xol_vaddr = 0;
1714 	}
1715 }
1716 
1717 void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
1718 				  void *src, unsigned long len)
1719 {
1720 	/* Initialize the slot */
1721 	copy_to_page(page, vaddr, src, len);
1722 
1723 	/*
1724 	 * We probably need flush_icache_user_page() but it needs a vma.
1725 	 * This should work on most architectures by default. If an
1726 	 * architecture needs to do something different, it can define
1727 	 * its own version of the function.
1728 	 */
1729 	flush_dcache_page(page);
1730 }
1731 
1732 /**
1733  * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
1734  * @regs: Reflects the saved state of the task after it has hit a breakpoint
1735  * instruction.
1736  * Return the address of the breakpoint instruction.
1737  */
1738 unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
1739 {
1740 	return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
1741 }
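
/*
 * For example (illustrative, assuming x86 where UPROBE_SWBP_INSN is the
 * one-byte int3 opcode): if the trap is reported with regs->ip == 0x401235,
 * the breakpoint instruction itself lives at 0x401234.
 */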
1742 
1743 unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
1744 {
1745 	struct uprobe_task *utask = current->utask;
1746 
1747 	if (unlikely(utask && utask->active_uprobe))
1748 		return utask->vaddr;
1749 
1750 	return instruction_pointer(regs);
1751 }
1752 
1753 static struct return_instance *free_ret_instance(struct return_instance *ri)
1754 {
1755 	struct return_instance *next = ri->next;
1756 	put_uprobe(ri->uprobe);
1757 	kfree(ri);
1758 	return next;
1759 }
1760 
1761 /*
1762  * Called with no locks held.
1763  * Called in context of an exiting or an exec-ing thread.
1764  */
1765 void uprobe_free_utask(struct task_struct *t)
1766 {
1767 	struct uprobe_task *utask = t->utask;
1768 	struct return_instance *ri;
1769 
1770 	if (!utask)
1771 		return;
1772 
1773 	if (utask->active_uprobe)
1774 		put_uprobe(utask->active_uprobe);
1775 
1776 	ri = utask->return_instances;
1777 	while (ri)
1778 		ri = free_ret_instance(ri);
1779 
1780 	xol_free_insn_slot(t);
1781 	kfree(utask);
1782 	t->utask = NULL;
1783 }
1784 
1785 /*
1786  * Allocate a uprobe_task object for the task if necessary.
1787  * Called when the thread hits a breakpoint.
1788  *
1789  * Returns:
1790  * - pointer to new uprobe_task on success
1791  * - NULL otherwise
1792  */
1793 static struct uprobe_task *get_utask(void)
1794 {
1795 	if (!current->utask)
1796 		current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
1797 	return current->utask;
1798 }
1799 
1800 static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
1801 {
1802 	struct uprobe_task *n_utask;
1803 	struct return_instance **p, *o, *n;
1804 
1805 	n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
1806 	if (!n_utask)
1807 		return -ENOMEM;
1808 	t->utask = n_utask;
1809 
1810 	p = &n_utask->return_instances;
1811 	for (o = o_utask->return_instances; o; o = o->next) {
1812 		n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
1813 		if (!n)
1814 			return -ENOMEM;
1815 
1816 		*n = *o;
1817 		/*
1818 		 * uprobe's refcnt has to be positive at this point, kept by
1819 		 * utask->return_instances items; return_instances can't be
1820 		 * removed right now, as task is blocked due to duping; so
1821 		 * get_uprobe() is safe to use here.
1822 		 */
1823 		get_uprobe(n->uprobe);
1824 		n->next = NULL;
1825 
1826 		*p = n;
1827 		p = &n->next;
1828 		n_utask->depth++;
1829 	}
1830 
1831 	return 0;
1832 }
1833 
1834 static void dup_xol_work(struct callback_head *work)
1835 {
1836 	if (current->flags & PF_EXITING)
1837 		return;
1838 
1839 	if (!__create_xol_area(current->utask->dup_xol_addr) &&
1840 			!fatal_signal_pending(current))
1841 		uprobe_warn(current, "dup xol area");
1842 }
1843 
1844 /*
1845  * Called in context of a new clone/fork from copy_process.
1846  */
1847 void uprobe_copy_process(struct task_struct *t, unsigned long flags)
1848 {
1849 	struct uprobe_task *utask = current->utask;
1850 	struct mm_struct *mm = current->mm;
1851 	struct xol_area *area;
1852 
1853 	t->utask = NULL;
1854 
1855 	if (!utask || !utask->return_instances)
1856 		return;
1857 
1858 	if (mm == t->mm && !(flags & CLONE_VFORK))
1859 		return;
1860 
1861 	if (dup_utask(t, utask))
1862 		return uprobe_warn(t, "dup ret instances");
1863 
1864 	/* The task can fork() after dup_xol_work() fails */
1865 	area = mm->uprobes_state.xol_area;
1866 	if (!area)
1867 		return uprobe_warn(t, "dup xol area");
1868 
1869 	if (mm == t->mm)
1870 		return;
1871 
1872 	t->utask->dup_xol_addr = area->vaddr;
1873 	init_task_work(&t->utask->dup_xol_work, dup_xol_work);
1874 	task_work_add(t, &t->utask->dup_xol_work, TWA_RESUME);
1875 }
1876 
1877 /*
1878  * The current area->vaddr notion assumes the trampoline address is always
1879  * equal to area->vaddr.
1880  *
1881  * Returns -1 in case the xol_area is not allocated.
1882  */
1883 unsigned long uprobe_get_trampoline_vaddr(void)
1884 {
1885 	struct xol_area *area;
1886 	unsigned long trampoline_vaddr = -1;
1887 
1888 	/* Pairs with xol_add_vma() smp_store_release() */
1889 	area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */
1890 	if (area)
1891 		trampoline_vaddr = area->vaddr;
1892 
1893 	return trampoline_vaddr;
1894 }
1895 
1896 static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
1897 					struct pt_regs *regs)
1898 {
1899 	struct return_instance *ri = utask->return_instances;
1900 	enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL;
1901 
1902 	while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) {
1903 		ri = free_ret_instance(ri);
1904 		utask->depth--;
1905 	}
1906 	utask->return_instances = ri;
1907 }
1908 
1909 static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
1910 {
1911 	struct return_instance *ri;
1912 	struct uprobe_task *utask;
1913 	unsigned long orig_ret_vaddr, trampoline_vaddr;
1914 	bool chained;
1915 
1916 	if (!get_xol_area())
1917 		return;
1918 
1919 	utask = get_utask();
1920 	if (!utask)
1921 		return;
1922 
1923 	if (utask->depth >= MAX_URETPROBE_DEPTH) {
1924 		printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
1925 				" nestedness limit pid/tgid=%d/%d\n",
1926 				current->pid, current->tgid);
1927 		return;
1928 	}
1929 
1930 	/* we need to bump refcount to store uprobe in utask */
1931 	if (!try_get_uprobe(uprobe))
1932 		return;
1933 
1934 	ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
1935 	if (!ri)
1936 		goto fail;
1937 
1938 	trampoline_vaddr = uprobe_get_trampoline_vaddr();
1939 	orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
1940 	if (orig_ret_vaddr == -1)
1941 		goto fail;
1942 
1943 	/* drop the entries invalidated by longjmp() */
1944 	chained = (orig_ret_vaddr == trampoline_vaddr);
1945 	cleanup_return_instances(utask, chained, regs);
1946 
1947 	/*
1948 	 * We don't want to keep the trampoline address on the stack; rather,
1949 	 * keep the original return address of the first caller through all
1950 	 * the chained instances. This also makes breakpoint unwinding easier.
1951 	 */
1952 	if (chained) {
1953 		if (!utask->return_instances) {
1954 			/*
1955 			 * This should not be possible; most likely we are
1956 			 * under attack from user space.
1957 			 */
1958 			uprobe_warn(current, "handle tail call");
1959 			goto fail;
1960 		}
1961 		orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
1962 	}
1963 	ri->uprobe = uprobe;
1964 	ri->func = instruction_pointer(regs);
1965 	ri->stack = user_stack_pointer(regs);
1966 	ri->orig_ret_vaddr = orig_ret_vaddr;
1967 	ri->chained = chained;
1968 
1969 	utask->depth++;
1970 	ri->next = utask->return_instances;
1971 	utask->return_instances = ri;
1972 
1973 	return;
1974 fail:
1975 	kfree(ri);
1976 	put_uprobe(uprobe);
1977 }
1978 
1979 /* Prepare to single-step probed instruction out of line. */
1980 static int
1981 pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
1982 {
1983 	struct uprobe_task *utask;
1984 	unsigned long xol_vaddr;
1985 	int err;
1986 
1987 	utask = get_utask();
1988 	if (!utask)
1989 		return -ENOMEM;
1990 
1991 	if (!try_get_uprobe(uprobe))
1992 		return -EINVAL;
1993 
1994 	xol_vaddr = xol_get_insn_slot(uprobe);
1995 	if (!xol_vaddr) {
1996 		err = -ENOMEM;
1997 		goto err_out;
1998 	}
1999 
2000 	utask->xol_vaddr = xol_vaddr;
2001 	utask->vaddr = bp_vaddr;
2002 
2003 	err = arch_uprobe_pre_xol(&uprobe->arch, regs);
2004 	if (unlikely(err)) {
2005 		xol_free_insn_slot(current);
2006 		goto err_out;
2007 	}
2008 
2009 	utask->active_uprobe = uprobe;
2010 	utask->state = UTASK_SSTEP;
2011 	return 0;
2012 err_out:
2013 	put_uprobe(uprobe);
2014 	return err;
2015 }
2016 
2017 /*
2018  * If we are single-stepping, ensure this thread does not receive any
2019  * non-fatal signal until the single-step completes.  When the xol insn
2020  * itself triggers a signal, restart the original insn even if the task is
2021  * already SIGKILL'ed (since the coredump should report the correct ip).
2022  * This matters even more if the task has a handler for SIGSEGV/etc.: the
2023  * _same_ instruction would have to be repeated after return from the
2024  * signal handler, and the single-step could never finish in that case.
2025  */
2026 bool uprobe_deny_signal(void)
2027 {
2028 	struct task_struct *t = current;
2029 	struct uprobe_task *utask = t->utask;
2030 
2031 	if (likely(!utask || !utask->active_uprobe))
2032 		return false;
2033 
2034 	WARN_ON_ONCE(utask->state != UTASK_SSTEP);
2035 
2036 	if (task_sigpending(t)) {
2037 		spin_lock_irq(&t->sighand->siglock);
2038 		clear_tsk_thread_flag(t, TIF_SIGPENDING);
2039 		spin_unlock_irq(&t->sighand->siglock);
2040 
2041 		if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
2042 			utask->state = UTASK_SSTEP_TRAPPED;
2043 			set_tsk_thread_flag(t, TIF_UPROBE);
2044 		}
2045 	}
2046 
2047 	return true;
2048 }
2049 
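/*
 * Re-check whether any VMA in this mm still contains uprobes and clear
 * MMF_HAS_UPROBES if none does.
 */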
2050 static void mmf_recalc_uprobes(struct mm_struct *mm)
2051 {
2052 	VMA_ITERATOR(vmi, mm, 0);
2053 	struct vm_area_struct *vma;
2054 
2055 	for_each_vma(vmi, vma) {
2056 		if (!valid_vma(vma, false))
2057 			continue;
2058 		/*
2059 		 * This is not strictly accurate: we can race with
2060 		 * uprobe_unregister() and see an already removed
2061 		 * uprobe if delete_uprobe() was not yet called.
2062 		 * Or this uprobe can be filtered out.
2063 		 */
2064 		if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
2065 			return;
2066 	}
2067 
2068 	clear_bit(MMF_HAS_UPROBES, &mm->flags);
2069 }
2070 
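/*
 * Check whether the instruction at @vaddr is (a variant of) the breakpoint
 * instruction: try a fast __get_user() read first and fall back to
 * get_user_pages() if that faults.  Returns a negative errno on failure.
 */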
2071 static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
2072 {
2073 	struct page *page;
2074 	uprobe_opcode_t opcode;
2075 	int result;
2076 
2077 	if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE)))
2078 		return -EINVAL;
2079 
2080 	pagefault_disable();
2081 	result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
2082 	pagefault_enable();
2083 
2084 	if (likely(result == 0))
2085 		goto out;
2086 
2087 	result = get_user_pages(vaddr, 1, FOLL_FORCE, &page);
2088 	if (result < 0)
2089 		return result;
2090 
2091 	copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
2092 	put_page(page);
2093  out:
2094 	/* This needs to return true for any variant of the trap insn */
2095 	return is_trap_insn(&opcode);
2096 }
2097 
2098 /* assumes being inside RCU protected region */
2099 static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp)
2100 {
2101 	struct mm_struct *mm = current->mm;
2102 	struct uprobe *uprobe = NULL;
2103 	struct vm_area_struct *vma;
2104 
2105 	mmap_read_lock(mm);
2106 	vma = vma_lookup(mm, bp_vaddr);
2107 	if (vma) {
2108 		if (valid_vma(vma, false)) {
2109 			struct inode *inode = file_inode(vma->vm_file);
2110 			loff_t offset = vaddr_to_offset(vma, bp_vaddr);
2111 
2112 			uprobe = find_uprobe_rcu(inode, offset);
2113 		}
2114 
2115 		if (!uprobe)
2116 			*is_swbp = is_trap_at_addr(mm, bp_vaddr);
2117 	} else {
2118 		*is_swbp = -EFAULT;
2119 	}
2120 
2121 	if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
2122 		mmf_recalc_uprobes(mm);
2123 	mmap_read_unlock(mm);
2124 
2125 	return uprobe;
2126 }
2127 
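/*
 * Run every consumer's ->handler() under SRCU protection, note whether any
 * of them wants a uretprobe (->ret_handler) and whether all of them asked
 * for removal (UPROBE_HANDLER_REMOVE); removal is re-validated against the
 * consumer filters under register_rwsem before unapplying the uprobe.
 */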
2128 static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
2129 {
2130 	struct uprobe_consumer *uc;
2131 	int remove = UPROBE_HANDLER_REMOVE;
2132 	bool need_prep = false; /* prepare return uprobe, when needed */
2133 	bool has_consumers = false;
2134 
2135 	current->utask->auprobe = &uprobe->arch;
2136 
2137 	list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
2138 				 srcu_read_lock_held(&uprobes_srcu)) {
2139 		int rc = 0;
2140 
2141 		if (uc->handler) {
2142 			rc = uc->handler(uc, regs);
2143 			WARN(rc & ~UPROBE_HANDLER_MASK,
2144 				"bad rc=0x%x from %ps()\n", rc, uc->handler);
2145 		}
2146 
2147 		if (uc->ret_handler)
2148 			need_prep = true;
2149 
2150 		remove &= rc;
2151 		has_consumers = true;
2152 	}
2153 	current->utask->auprobe = NULL;
2154 
2155 	if (need_prep && !remove)
2156 		prepare_uretprobe(uprobe, regs); /* put bp at return */
2157 
2158 	if (remove && has_consumers) {
2159 		down_read(&uprobe->register_rwsem);
2160 
2161 		/* re-check that removal is still required, this time under lock */
2162 		if (!filter_chain(uprobe, current->mm)) {
2163 			WARN_ON(!uprobe_is_active(uprobe));
2164 			unapply_uprobe(uprobe, current->mm);
2165 		}
2166 
2167 		up_read(&uprobe->register_rwsem);
2168 	}
2169 }
2170 
2171 static void
2172 handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
2173 {
2174 	struct uprobe *uprobe = ri->uprobe;
2175 	struct uprobe_consumer *uc;
2176 	int srcu_idx;
2177 
2178 	srcu_idx = srcu_read_lock(&uprobes_srcu);
2179 	list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
2180 				 srcu_read_lock_held(&uprobes_srcu)) {
2181 		if (uc->ret_handler)
2182 			uc->ret_handler(uc, ri->func, regs);
2183 	}
2184 	srcu_read_unlock(&uprobes_srcu, srcu_idx);
2185 }
2186 
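/*
 * Skip past all return_instances that belong to the same chain as @ri
 * (consecutive instances whose return address already pointed at the
 * trampoline) and return the first instance of the next chain, or NULL.
 */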
2187 static struct return_instance *find_next_ret_chain(struct return_instance *ri)
2188 {
2189 	bool chained;
2190 
2191 	do {
2192 		chained = ri->chained;
2193 		ri = ri->next;	/* can't be NULL if chained */
2194 	} while (chained);
2195 
2196 	return ri;
2197 }
2198 
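/*
 * Handle a hit on the uretprobe trampoline: restore the original return
 * address in regs, invoke the ret_handlers of every pending (and still
 * alive) return_instance and pop those instances off the per-task stack.
 * Sends SIGILL if there is no pending return_instance to consume.
 */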
2199 void uprobe_handle_trampoline(struct pt_regs *regs)
2200 {
2201 	struct uprobe_task *utask;
2202 	struct return_instance *ri, *next;
2203 	bool valid;
2204 
2205 	utask = current->utask;
2206 	if (!utask)
2207 		goto sigill;
2208 
2209 	ri = utask->return_instances;
2210 	if (!ri)
2211 		goto sigill;
2212 
2213 	do {
2214 		/*
2215 		 * We should throw out the frames invalidated by longjmp().
2216 		 * If this chain is valid, then the next one should be alive
2217 		 * or NULL; the latter case means that nobody but ri->func
2218 		 * could hit this trampoline on return. TODO: sigaltstack().
2219 		 */
2220 		next = find_next_ret_chain(ri);
2221 		valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs);
2222 
2223 		instruction_pointer_set(regs, ri->orig_ret_vaddr);
2224 		do {
2225 			/* Pop the current instance from the stack of pending return
2226 			 * instances, as it's not pending anymore: we just fixed up the
2227 			 * original instruction pointer in regs and are about to call the
2228 			 * handlers.  This allows fixup_uretprobe_trampoline_entries() to
2229 			 * properly fix up stack traces captured by uretprobe handlers,
2230 			 * replacing pending trampoline addresses on the stack with the
2231 			 * correct original return addresses.
2232 			 */
2233 			utask->return_instances = ri->next;
2234 			if (valid)
2235 				handle_uretprobe_chain(ri, regs);
2236 			ri = free_ret_instance(ri);
2237 			utask->depth--;
2238 		} while (ri != next);
2239 	} while (!valid);
2240 
2241 	utask->return_instances = ri;
2242 	return;
2243 
2244  sigill:
2245 	uprobe_warn(current, "handle uretprobe, sending SIGILL.");
2246 	force_sig(SIGILL);
2247 
2248 }
2249 
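/*
 * Default weak implementations, used when the architecture does not
 * provide its own: no probe hit is ignored and every return_instance is
 * considered alive.
 */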
2250 bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
2251 {
2252 	return false;
2253 }
2254 
2255 bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
2256 					struct pt_regs *regs)
2257 {
2258 	return true;
2259 }
2260 
2261 /*
2262  * Run handler and ask thread to singlestep.
2263  * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
2264  */
2265 static void handle_swbp(struct pt_regs *regs)
2266 {
2267 	struct uprobe *uprobe;
2268 	unsigned long bp_vaddr;
2269 	int is_swbp, srcu_idx;
2270 
2271 	bp_vaddr = uprobe_get_swbp_addr(regs);
2272 	if (bp_vaddr == uprobe_get_trampoline_vaddr())
2273 		return uprobe_handle_trampoline(regs);
2274 
2275 	srcu_idx = srcu_read_lock(&uprobes_srcu);
2276 
2277 	uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
2278 	if (!uprobe) {
2279 		if (is_swbp > 0) {
2280 			/* No matching uprobe; signal SIGTRAP. */
2281 			force_sig(SIGTRAP);
2282 		} else {
2283 			/*
2284 			 * Either we raced with uprobe_unregister() or we can't
2285 			 * access this memory. The latter is only possible if
2286 			 * another thread plays with our ->mm. In both cases
2287 			 * we can simply restart. If this vma was unmapped we
2288 			 * can pretend this insn was not executed yet and get
2289 			 * the (correct) SIGSEGV after restart.
2290 			 */
2291 			instruction_pointer_set(regs, bp_vaddr);
2292 		}
2293 		goto out;
2294 	}
2295 
2296 	/* change it in advance for ->handler() and restart */
2297 	instruction_pointer_set(regs, bp_vaddr);
2298 
2299 	/*
2300 	 * TODO: move copy_insn/etc into _register and remove this hack.
2301 	 * After we hit the bp, _unregister + _register can install the
2302 	 * new and not-yet-analyzed uprobe at the same address, restart.
2303 	 */
2304 	if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
2305 		goto out;
2306 
2307 	/*
2308 	 * Pairs with the smp_wmb() in prepare_uprobe().
2309 	 *
2310 	 * Guarantees that if we see the UPROBE_COPY_INSN bit set, then
2311 	 * we must also see the stores to &uprobe->arch performed by the
2312 	 * prepare_uprobe() call.
2313 	 */
2314 	smp_rmb();
2315 
2316 	/* Tracing handlers use ->utask to communicate with fetch methods */
2317 	if (!get_utask())
2318 		goto out;
2319 
2320 	if (arch_uprobe_ignore(&uprobe->arch, regs))
2321 		goto out;
2322 
2323 	handler_chain(uprobe, regs);
2324 
2325 	if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
2326 		goto out;
2327 
2328 	if (pre_ssout(uprobe, regs, bp_vaddr))
2329 		goto out;
2330 
2331 out:
2332 	/* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
2333 	srcu_read_unlock(&uprobes_srcu, srcu_idx);
2334 }
2335 
2336 /*
2337  * Perform required fix-ups and disable singlestep.
2338  * Allow pending signals to take effect.
2339  */
2340 static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
2341 {
2342 	struct uprobe *uprobe;
2343 	int err = 0;
2344 
2345 	uprobe = utask->active_uprobe;
2346 	if (utask->state == UTASK_SSTEP_ACK)
2347 		err = arch_uprobe_post_xol(&uprobe->arch, regs);
2348 	else if (utask->state == UTASK_SSTEP_TRAPPED)
2349 		arch_uprobe_abort_xol(&uprobe->arch, regs);
2350 	else
2351 		WARN_ON_ONCE(1);
2352 
2353 	put_uprobe(uprobe);
2354 	utask->active_uprobe = NULL;
2355 	utask->state = UTASK_RUNNING;
2356 	xol_free_insn_slot(current);
2357 
2358 	spin_lock_irq(&current->sighand->siglock);
2359 	recalc_sigpending(); /* see uprobe_deny_signal() */
2360 	spin_unlock_irq(&current->sighand->siglock);
2361 
2362 	if (unlikely(err)) {
2363 		uprobe_warn(current, "execute the probed insn, sending SIGILL.");
2364 		force_sig(SIGILL);
2365 	}
2366 }
2367 
2368 /*
2369  * On breakpoint hit, breakpoint notifier sets the TIF_UPROBE flag and
2370  * allows the thread to return from interrupt. After that handle_swbp()
2371  * sets utask->active_uprobe.
2372  *
2373  * On singlestep exception, singlestep notifier sets the TIF_UPROBE flag
2374  * and allows the thread to return from interrupt.
2375  *
2376  * While returning to userspace, thread notices the TIF_UPROBE flag and calls
2377  * uprobe_notify_resume().
2378  */
2379 void uprobe_notify_resume(struct pt_regs *regs)
2380 {
2381 	struct uprobe_task *utask;
2382 
2383 	clear_thread_flag(TIF_UPROBE);
2384 
2385 	utask = current->utask;
2386 	if (utask && utask->active_uprobe)
2387 		handle_singlestep(utask, regs);
2388 	else
2389 		handle_swbp(regs);
2390 }
2391 
2392 /*
2393  * uprobe_pre_sstep_notifier gets called from interrupt context as part of
2394  * notifier mechanism. Set TIF_UPROBE flag and indicate breakpoint hit.
2395  */
2396 int uprobe_pre_sstep_notifier(struct pt_regs *regs)
2397 {
2398 	if (!current->mm)
2399 		return 0;
2400 
2401 	if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
2402 	    (!current->utask || !current->utask->return_instances))
2403 		return 0;
2404 
2405 	set_thread_flag(TIF_UPROBE);
2406 	return 1;
2407 }
2408 
2409 /*
2410  * uprobe_post_sstep_notifier gets called in interrupt context as part of notifier
2411  * mechanism. Set TIF_UPROBE flag and indicate completion of singlestep.
2412  */
2413 int uprobe_post_sstep_notifier(struct pt_regs *regs)
2414 {
2415 	struct uprobe_task *utask = current->utask;
2416 
2417 	if (!current->mm || !utask || !utask->active_uprobe)
2418 		/* task is currently not uprobed */
2419 		return 0;
2420 
2421 	utask->state = UTASK_SSTEP_ACK;
2422 	set_thread_flag(TIF_UPROBE);
2423 	return 1;
2424 }
2425 
2426 static struct notifier_block uprobe_exception_nb = {
2427 	.notifier_call		= arch_uprobe_exception_notify,
2428 	.priority		= INT_MAX-1,	/* notified after kprobes, kgdb */
2429 };
2430 
2431 void __init uprobes_init(void)
2432 {
2433 	int i;
2434 
2435 	for (i = 0; i < UPROBES_HASH_SZ; i++)
2436 		mutex_init(&uprobes_mmap_mutex[i]);
2437 
2438 	BUG_ON(register_die_notifier(&uprobe_exception_nb));
2439 }
2440