1 // SPDX-License-Identifier: GPL-2.0-only OR MIT
2 /*
3  * Copyright © 2024 Intel Corporation
4  *
5  * Authors:
6  *     Matthew Brost <matthew.brost@intel.com>
7  */
8 
9 #include <linux/dma-mapping.h>
10 #include <linux/export.h>
11 #include <linux/hmm.h>
12 #include <linux/hugetlb_inline.h>
13 #include <linux/memremap.h>
14 #include <linux/mm_types.h>
15 #include <linux/slab.h>
16 
17 #include <drm/drm_device.h>
18 #include <drm/drm_gpusvm.h>
19 #include <drm/drm_pagemap.h>
20 #include <drm/drm_print.h>
21 
22 /**
23  * DOC: Overview
24  *
25  * GPU Shared Virtual Memory (GPU SVM) layer for the Direct Rendering Manager (DRM)
26  * is a component of the DRM framework designed to manage shared virtual memory
27  * between the CPU and GPU. It enables efficient data exchange and processing
28  * for GPU-accelerated applications by allowing memory sharing and
29  * synchronization between the CPU's and GPU's virtual address spaces.
30  *
31  * Key GPU SVM Components:
32  *
33  * - Notifiers:
34  *	Used for tracking memory intervals and notifying the GPU of changes,
35  *	notifiers are sized based on a GPU SVM initialization parameter, with a
36  *	recommendation of 512M or larger. They maintain a Red-Black tree and a
37  *	list of ranges that fall within the notifier interval.  Notifiers are
38  *	tracked within a GPU SVM Red-Black tree and list and are dynamically
39  *	inserted or removed as ranges within the interval are created or
40  *	destroyed.
41  * - Ranges:
42  *	Represent memory ranges mapped in a DRM device and managed by GPU SVM.
43  *	They are sized based on an array of chunk sizes, which is a GPU SVM
44  *	initialization parameter, and the CPU address space.  Upon GPU fault,
45  *	the largest aligned chunk that fits within the faulting CPU address
46  *	space is chosen for the range size. Ranges are expected to be
47  *	dynamically allocated on GPU fault and removed on an MMU notifier UNMAP
48  *	event. As mentioned above, ranges are tracked in a notifier's Red-Black
49  *	tree.
50  *
51  * - Operations:
52  *	Define the interface for driver-specific GPU SVM operations such as
53  *	range allocation, notifier allocation, and invalidations.
54  *
55  * - Device Memory Allocations:
56  *	Embedded structure containing enough information for GPU SVM to migrate
57  *	to / from device memory.
58  *
59  * - Device Memory Operations:
60  *	Define the interface for driver-specific device memory operations
61  *	such as releasing memory, populating pfns, and copying to / from
 *	device memory.
62  *
63  * This layer provides interfaces for allocating, mapping, migrating, and
64  * releasing memory ranges between the CPU and GPU. It handles all core memory
65  * management interactions (DMA mapping, HMM, and migration) and provides
66  * driver-specific virtual functions (vfuncs). This infrastructure is sufficient
67  * to build the expected driver components for an SVM implementation as detailed
68  * below.
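 *
 * As an illustration, a minimal driver-provided operations table might look
 * like the sketch below (only the invalidate() vfunc is mandatory for full
 * SVM mode; driver_invalidation() refers to the notifier callback shown in
 * the Examples section):
 *
 * .. code-block:: c
 *
 *	static const struct drm_gpusvm_ops driver_gpusvm_ops = {
 *		.invalidate = driver_invalidation,
 *	};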
69  *
70  * Expected Driver Components:
71  *
72  * - GPU page fault handler:
73  *	Used to create ranges and notifiers based on the fault address,
74  *	optionally migrate the range to device memory, and create GPU bindings.
75  *
76  * - Garbage collector:
77  *	Used to unmap and destroy GPU bindings for ranges.  Ranges are expected
78  * to be added to the garbage collector upon an MMU_NOTIFY_UNMAP event in the
79  *	notifier callback.
80  *
81  * - Notifier callback:
82  *	Used to invalidate and DMA unmap GPU bindings for ranges.
83  */
84 
85 /**
86  * DOC: Locking
87  *
88  * GPU SVM handles locking for core MM interactions, i.e., it locks/unlocks the
89  * mmap lock as needed.
90  *
91  * GPU SVM introduces a global notifier lock, which safeguards the notifier's
92  * range RB tree and list, as well as the range's DMA mappings and sequence
93  * number. GPU SVM manages all necessary locking and unlocking operations,
94  * except for rechecking that a range's pages are still valid
95  * (drm_gpusvm_range_pages_valid) when the driver is committing GPU bindings.
96  * This lock corresponds to the ``driver->update`` lock mentioned in
97  * Documentation/mm/hmm.rst. Future revisions may transition from a GPU SVM
98  * global lock to a per-notifier lock if finer-grained locking is deemed
99  * necessary.
100  *
101  * In addition to the locking mentioned above, the driver should implement a
102  * lock to safeguard core GPU SVM function calls that modify state, such as
103  * drm_gpusvm_range_find_or_insert and drm_gpusvm_range_remove. This lock is
104  * denoted as 'driver_svm_lock' in code examples. Finer-grained driver-side
105  * locking should also be possible for concurrent GPU fault processing within a
106  * single GPU SVM. The 'driver_svm_lock' can be passed to
107  * drm_gpusvm_driver_set_lock() to add lockdep annotations to GPU SVM.
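 *
 * A minimal sketch of the expected driver-side setup, assuming a hypothetical
 * driver structure which embeds the GPU SVM and its lock:
 *
 * .. code-block:: c
 *
 *	struct driver_svm {
 *		struct drm_gpusvm gpusvm;
 *		struct mutex driver_svm_lock;
 *	};
 *
 *	static void driver_svm_lock_init(struct driver_svm *svm)
 *	{
 *		mutex_init(&svm->driver_svm_lock);
 *		// Register the lock so GPU SVM can add lockdep annotations.
 *		drm_gpusvm_driver_set_lock(&svm->gpusvm, &svm->driver_svm_lock);
 *	}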
108  */
109 
110 /**
111  * DOC: Partial Unmapping of Ranges
112  *
113  * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped by CPU resulting
114  * in MMU_NOTIFY_UNMAP event) presents several challenges, with the main one
115  * being that a subset of the range still has CPU and GPU mappings. If the
116  * backing store for the range is in device memory, a subset of the backing
117  * store has references. One option would be to split the range and device
118  * memory backing store, but the implementation for this would be quite
119  * complicated. Given that partial unmappings are rare and driver-defined range
120  * sizes are relatively small, GPU SVM does not support splitting of ranges.
121  *
122  * With no support for range splitting, upon partial unmapping of a range, the
123  * driver is expected to invalidate and destroy the entire range. If the range
124  * has device memory as its backing, the driver is also expected to migrate any
125  * remaining pages back to RAM.
126  */
127 
128 /**
129  * DOC: Examples
130  *
131  * This section provides three examples of how to build the expected driver
132  * components: the GPU page fault handler, the garbage collector, and the
133  * notifier callback.
134  *
135  * The generic code provided does not include logic for complex migration
136  * policies, optimized invalidations, fine-grained driver locking, or other
137  * potentially required driver locking (e.g., DMA-resv locks).
138  *
139  * 1) GPU page fault handler
140  *
141  * .. code-block:: c
142  *
143  *	int driver_bind_range(struct drm_gpusvm *gpusvm, struct drm_gpusvm_range *range)
144  *	{
145  *		int err = 0;
146  *
147  *		driver_alloc_and_setup_memory_for_bind(gpusvm, range);
148  *
149  *		drm_gpusvm_notifier_lock(gpusvm);
150  *		if (drm_gpusvm_range_pages_valid(range))
151  *			driver_commit_bind(gpusvm, range);
152  *		else
153  *			err = -EAGAIN;
154  *		drm_gpusvm_notifier_unlock(gpusvm);
155  *
156  *		return err;
157  *	}
158  *
159  *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, unsigned long fault_addr,
160  *			     unsigned long gpuva_start, unsigned long gpuva_end)
161  *	{
162  *		struct drm_gpusvm_ctx ctx = {};
 *		struct drm_gpusvm_range *range;
163  *		int err;
164  *
165  *		driver_svm_lock();
166  *	retry:
167  *		// Always process UNMAPs first so view of GPU SVM ranges is current
168  *		driver_garbage_collector(gpusvm);
169  *
170  *		range = drm_gpusvm_range_find_or_insert(gpusvm, fault_addr,
171  *							gpuva_start, gpuva_end,
172  *						        &ctx);
173  *		if (IS_ERR(range)) {
174  *			err = PTR_ERR(range);
175  *			goto unlock;
176  *		}
177  *
178  *		if (driver_migration_policy(range)) {
179  *			err = drm_pagemap_populate_mm(driver_choose_drm_pagemap(),
180  *						      gpuva_start, gpuva_end, gpusvm->mm,
181  *						      ctx.timeslice_ms);
182  *			if (err)	// CPU mappings may have changed
183  *				goto retry;
184  *		}
185  *
186  *		err = drm_gpusvm_range_get_pages(gpusvm, range, &ctx);
187  *		if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM) {	// CPU mappings changed
188  *			if (err == -EOPNOTSUPP)
189  *				drm_gpusvm_range_evict(gpusvm, range);
190  *			goto retry;
191  *		} else if (err) {
192  *			goto unlock;
193  *		}
194  *
195  *		err = driver_bind_range(gpusvm, range);
196  *		if (err == -EAGAIN)	// CPU mappings changed
197  *			goto retry;
198  *
199  *	unlock:
200  *		driver_svm_unlock();
201  *		return err;
202  *	}
203  *
204  * 2) Garbage Collector
205  *
206  * .. code-block:: c
207  *
208  *	void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
209  *					struct drm_gpusvm_range *range)
210  *	{
211  *		assert_driver_svm_locked(gpusvm);
212  *
213  *		// Partial unmap, migrate any remaining device memory pages back to RAM
214  *		if (range->pages.flags.partial_unmap)
215  *			drm_gpusvm_range_evict(gpusvm, range);
216  *
217  *		driver_unbind_range(range);
218  *		drm_gpusvm_range_remove(gpusvm, range);
219  *	}
220  *
221  *	void driver_garbage_collector(struct drm_gpusvm *gpusvm)
222  *	{
223  *		assert_driver_svm_locked(gpusvm);
224  *
225  *		for_each_range_in_garbage_collector(gpusvm, range)
226  *			__driver_garbage_collector(gpusvm, range);
227  *	}
228  *
229  * 3) Notifier callback
230  *
231  * .. code-block:: c
232  *
233  *	void driver_invalidation(struct drm_gpusvm *gpusvm,
234  *				 struct drm_gpusvm_notifier *notifier,
235  *				 const struct mmu_notifier_range *mmu_range)
236  *	{
237  *		struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
238  *		struct drm_gpusvm_range *range = NULL;
239  *
240  *		driver_invalidate_device_pages(gpusvm, mmu_range->start, mmu_range->end);
241  *
242  *		drm_gpusvm_for_each_range(range, notifier, mmu_range->start,
243  *					  mmu_range->end) {
244  *			drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);
245  *
246  *			if (mmu_range->event != MMU_NOTIFY_UNMAP)
247  *				continue;
248  *
249  *			drm_gpusvm_range_set_unmapped(range, mmu_range);
250  *			driver_garbage_collector_add(gpusvm, range);
251  *		}
252  *	}
253  */
254 
255 /**
256  * npages_in_range() - Calculate the number of pages in a given range
257  * @start: The start address of the range
258  * @end: The end address of the range
259  *
260  * This function calculates the number of pages in a given memory range,
261  * specified by the start and end addresses. It divides the difference
262  * between the end and start addresses by the page size (PAGE_SIZE) to
263  * determine the number of pages in the range.
264  *
265  * Return: The number of pages in the specified range.
266  */
267 static unsigned long
268 npages_in_range(unsigned long start, unsigned long end)
269 {
270 	return (end - start) >> PAGE_SHIFT;
271 }
272 
273 /**
274  * drm_gpusvm_notifier_find() - Find GPU SVM notifier from GPU SVM
275  * @gpusvm: Pointer to the GPU SVM structure.
276  * @start: Start address of the notifier
277  * @end: End address of the notifier
278  *
279  * Return: A pointer to the drm_gpusvm_notifier if found or NULL
280  */
281 struct drm_gpusvm_notifier *
282 drm_gpusvm_notifier_find(struct drm_gpusvm *gpusvm, unsigned long start,
283 			 unsigned long end)
284 {
285 	struct interval_tree_node *itree;
286 
287 	itree = interval_tree_iter_first(&gpusvm->root, start, end - 1);
288 
289 	if (itree)
290 		return container_of(itree, struct drm_gpusvm_notifier, itree);
291 	else
292 		return NULL;
293 }
294 EXPORT_SYMBOL_GPL(drm_gpusvm_notifier_find);
295 
296 /**
297  * drm_gpusvm_range_find() - Find GPU SVM range from GPU SVM notifier
298  * @notifier: Pointer to the GPU SVM notifier structure.
299  * @start: Start address of the range
300  * @end: End address of the range
301  *
302  * Return: A pointer to the drm_gpusvm_range if found or NULL
303  */
304 struct drm_gpusvm_range *
305 drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, unsigned long start,
306 		      unsigned long end)
307 {
308 	struct interval_tree_node *itree;
309 
310 	itree = interval_tree_iter_first(&notifier->root, start, end - 1);
311 
312 	if (itree)
313 		return container_of(itree, struct drm_gpusvm_range, itree);
314 	else
315 		return NULL;
316 }
317 EXPORT_SYMBOL_GPL(drm_gpusvm_range_find);
318 
319 /**
320  * drm_gpusvm_notifier_invalidate() - Invalidate a GPU SVM notifier.
321  * @mni: Pointer to the mmu_interval_notifier structure.
322  * @mmu_range: Pointer to the mmu_notifier_range structure.
323  * @cur_seq: Current sequence number.
324  *
325  * This function serves as a generic MMU notifier for GPU SVM. It sets the MMU
326  * notifier sequence number and calls the driver invalidate vfunc under
327  * gpusvm->notifier_lock.
328  *
329  * Return: true if the operation succeeds, false otherwise.
330  */
331 static bool
332 drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier *mni,
333 			       const struct mmu_notifier_range *mmu_range,
334 			       unsigned long cur_seq)
335 {
336 	struct drm_gpusvm_notifier *notifier =
337 		container_of(mni, typeof(*notifier), notifier);
338 	struct drm_gpusvm *gpusvm = notifier->gpusvm;
339 
340 	if (!mmu_notifier_range_blockable(mmu_range))
341 		return false;
342 
343 	down_write(&gpusvm->notifier_lock);
344 	mmu_interval_set_seq(mni, cur_seq);
345 	gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
346 	up_write(&gpusvm->notifier_lock);
347 
348 	return true;
349 }
350 
351 /*
352  * drm_gpusvm_notifier_ops - MMU interval notifier operations for GPU SVM
353  */
354 static const struct mmu_interval_notifier_ops drm_gpusvm_notifier_ops = {
355 	.invalidate = drm_gpusvm_notifier_invalidate,
356 };
357 
358 /**
359  * drm_gpusvm_init() - Initialize the GPU SVM.
360  * @gpusvm: Pointer to the GPU SVM structure.
361  * @name: Name of the GPU SVM.
362  * @drm: Pointer to the DRM device structure.
363  * @mm: Pointer to the mm_struct for the address space.
364  * @mm_start: Start address of GPU SVM.
365  * @mm_range: Range of the GPU SVM.
366  * @notifier_size: Size of individual notifiers.
367  * @ops: Pointer to the operations structure for GPU SVM.
368  * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
369  *               Entries should be powers of 2 in descending order with last
370  *               entry being SZ_4K.
371  * @num_chunks: Number of chunks.
372  *
373  * This function initializes the GPU SVM.
374  *
375  * Note: If only using the simple drm_gpusvm_pages API (get/unmap/free),
376  * then only @gpusvm, @name, and @drm are expected. However, the same base
377  * @gpusvm can also be used with both modes together, in which case the full
378  * setup is needed, where the core drm_gpusvm_pages API will simply never use
379  * the other fields.
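 *
 * A minimal sketch of the pages-only setup described above (error handling
 * and the surrounding driver variables are assumptions):
 *
 * .. code-block:: c
 *
 *	// Only @gpusvm, @name and @drm are used in this mode; all other
 *	// arguments must be NULL / zero.
 *	err = drm_gpusvm_init(gpusvm, "driver-pages-only", drm, NULL,
 *			      0, 0, 0, NULL, NULL, 0);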
380  *
381  * Return: 0 on success, a negative error code on failure.
382  */
383 int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
384 		    const char *name, struct drm_device *drm,
385 		    struct mm_struct *mm,
386 		    unsigned long mm_start, unsigned long mm_range,
387 		    unsigned long notifier_size,
388 		    const struct drm_gpusvm_ops *ops,
389 		    const unsigned long *chunk_sizes, int num_chunks)
390 {
391 	if (mm) {
392 		if (!ops->invalidate || !num_chunks)
393 			return -EINVAL;
394 		mmgrab(mm);
395 	} else {
396 		/* No full SVM mode, only core drm_gpusvm_pages API. */
397 		if (ops || num_chunks || mm_range || notifier_size)
398 			return -EINVAL;
399 	}
400 
401 	gpusvm->name = name;
402 	gpusvm->drm = drm;
403 	gpusvm->mm = mm;
404 	gpusvm->mm_start = mm_start;
405 	gpusvm->mm_range = mm_range;
406 	gpusvm->notifier_size = notifier_size;
407 	gpusvm->ops = ops;
408 	gpusvm->chunk_sizes = chunk_sizes;
409 	gpusvm->num_chunks = num_chunks;
410 
411 	gpusvm->root = RB_ROOT_CACHED;
412 	INIT_LIST_HEAD(&gpusvm->notifier_list);
413 
414 	init_rwsem(&gpusvm->notifier_lock);
415 
416 	fs_reclaim_acquire(GFP_KERNEL);
417 	might_lock(&gpusvm->notifier_lock);
418 	fs_reclaim_release(GFP_KERNEL);
419 
420 #ifdef CONFIG_LOCKDEP
421 	gpusvm->lock_dep_map = NULL;
422 #endif
423 
424 	return 0;
425 }
426 EXPORT_SYMBOL_GPL(drm_gpusvm_init);
427 
428 /**
429  * to_drm_gpusvm_notifier() - retrieve the container struct for a given rbtree node
430  * @node: a pointer to the rbtree node embedded within a drm_gpusvm_notifier struct
431  *
432  * Return: A pointer to the containing drm_gpusvm_notifier structure.
433  */
434 static struct drm_gpusvm_notifier *to_drm_gpusvm_notifier(struct rb_node *node)
435 {
436 	return container_of(node, struct drm_gpusvm_notifier, itree.rb);
437 }
438 
439 /**
440  * drm_gpusvm_notifier_insert() - Insert GPU SVM notifier
441  * @gpusvm: Pointer to the GPU SVM structure
442  * @notifier: Pointer to the GPU SVM notifier structure
443  *
444  * This function inserts the GPU SVM notifier into the GPU SVM RB tree and list.
445  */
446 static void drm_gpusvm_notifier_insert(struct drm_gpusvm *gpusvm,
447 				       struct drm_gpusvm_notifier *notifier)
448 {
449 	struct rb_node *node;
450 	struct list_head *head;
451 
452 	interval_tree_insert(&notifier->itree, &gpusvm->root);
453 
454 	node = rb_prev(&notifier->itree.rb);
455 	if (node)
456 		head = &(to_drm_gpusvm_notifier(node))->entry;
457 	else
458 		head = &gpusvm->notifier_list;
459 
460 	list_add(&notifier->entry, head);
461 }
462 
463 /**
464  * drm_gpusvm_notifier_remove() - Remove GPU SVM notifier
465  * @gpusvm: Pointer to the GPU SVM structure
466  * @notifier: Pointer to the GPU SVM notifier structure
467  *
468  * This function removes the GPU SVM notifier from the GPU SVM RB tree and list.
469  */
470 static void drm_gpusvm_notifier_remove(struct drm_gpusvm *gpusvm,
471 				       struct drm_gpusvm_notifier *notifier)
472 {
473 	interval_tree_remove(&notifier->itree, &gpusvm->root);
474 	list_del(&notifier->entry);
475 }
476 
477 /**
478  * drm_gpusvm_fini() - Finalize the GPU SVM.
479  * @gpusvm: Pointer to the GPU SVM structure.
480  *
481  * This function finalizes the GPU SVM by cleaning up any remaining ranges and
482  * notifiers, and dropping a reference to the mm_struct.
483  */
484 void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
485 {
486 	struct drm_gpusvm_notifier *notifier, *next;
487 
488 	drm_gpusvm_for_each_notifier_safe(notifier, next, gpusvm, 0, LONG_MAX) {
489 		struct drm_gpusvm_range *range, *__next;
490 
491 		/*
492 		 * Remove notifier first to avoid racing with any invalidation
493 		 */
494 		mmu_interval_notifier_remove(&notifier->notifier);
495 		notifier->flags.removed = true;
496 
497 		drm_gpusvm_for_each_range_safe(range, __next, notifier, 0,
498 					       LONG_MAX)
499 			drm_gpusvm_range_remove(gpusvm, range);
500 	}
501 
502 	if (gpusvm->mm)
503 		mmdrop(gpusvm->mm);
504 	WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
505 }
506 EXPORT_SYMBOL_GPL(drm_gpusvm_fini);
507 
508 /**
509  * drm_gpusvm_notifier_alloc() - Allocate GPU SVM notifier
510  * @gpusvm: Pointer to the GPU SVM structure
511  * @fault_addr: Fault address
512  *
513  * This function allocates and initializes the GPU SVM notifier structure.
514  *
515  * Return: Pointer to the allocated GPU SVM notifier on success, ERR_PTR() on failure.
516  */
517 static struct drm_gpusvm_notifier *
518 drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, unsigned long fault_addr)
519 {
520 	struct drm_gpusvm_notifier *notifier;
521 
522 	if (gpusvm->ops->notifier_alloc)
523 		notifier = gpusvm->ops->notifier_alloc();
524 	else
525 		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
526 
527 	if (!notifier)
528 		return ERR_PTR(-ENOMEM);
529 
530 	notifier->gpusvm = gpusvm;
531 	notifier->itree.start = ALIGN_DOWN(fault_addr, gpusvm->notifier_size);
532 	notifier->itree.last = ALIGN(fault_addr + 1, gpusvm->notifier_size) - 1;
533 	INIT_LIST_HEAD(&notifier->entry);
534 	notifier->root = RB_ROOT_CACHED;
535 	INIT_LIST_HEAD(&notifier->range_list);
536 
537 	return notifier;
538 }
539 
540 /**
541  * drm_gpusvm_notifier_free() - Free GPU SVM notifier
542  * @gpusvm: Pointer to the GPU SVM structure
543  * @notifier: Pointer to the GPU SVM notifier structure
544  *
545  * This function frees the GPU SVM notifier structure.
546  */
547 static void drm_gpusvm_notifier_free(struct drm_gpusvm *gpusvm,
548 				     struct drm_gpusvm_notifier *notifier)
549 {
550 	WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
551 
552 	if (gpusvm->ops->notifier_free)
553 		gpusvm->ops->notifier_free(notifier);
554 	else
555 		kfree(notifier);
556 }
557 
558 /**
559  * to_drm_gpusvm_range() - retrieve the container struct for a given rbtree node
560  * @node: a pointer to the rbtree node embedded within a drm_gpusvm_range struct
561  *
562  * Return: A pointer to the containing drm_gpusvm_range structure.
563  */
564 static struct drm_gpusvm_range *to_drm_gpusvm_range(struct rb_node *node)
565 {
566 	return container_of(node, struct drm_gpusvm_range, itree.rb);
567 }
568 
569 /**
570  * drm_gpusvm_range_insert() - Insert GPU SVM range
571  * @notifier: Pointer to the GPU SVM notifier structure
572  * @range: Pointer to the GPU SVM range structure
573  *
574  * This function inserts the GPU SVM range into the notifier RB tree and list.
575  */
576 static void drm_gpusvm_range_insert(struct drm_gpusvm_notifier *notifier,
577 				    struct drm_gpusvm_range *range)
578 {
579 	struct rb_node *node;
580 	struct list_head *head;
581 
582 	drm_gpusvm_notifier_lock(notifier->gpusvm);
583 	interval_tree_insert(&range->itree, &notifier->root);
584 
585 	node = rb_prev(&range->itree.rb);
586 	if (node)
587 		head = &(to_drm_gpusvm_range(node))->entry;
588 	else
589 		head = &notifier->range_list;
590 
591 	list_add(&range->entry, head);
592 	drm_gpusvm_notifier_unlock(notifier->gpusvm);
593 }
594 
595 /**
596  * __drm_gpusvm_range_remove() - Remove GPU SVM range
597  * @notifier: Pointer to the GPU SVM notifier structure
598  * @range: Pointer to the GPU SVM range structure
599  *
600  * This function removes the GPU SVM range from the notifier RB tree and list.
601  */
602 static void __drm_gpusvm_range_remove(struct drm_gpusvm_notifier *notifier,
603 				      struct drm_gpusvm_range *range)
604 {
605 	interval_tree_remove(&range->itree, &notifier->root);
606 	list_del(&range->entry);
607 }
608 
609 /**
610  * drm_gpusvm_range_alloc() - Allocate GPU SVM range
611  * @gpusvm: Pointer to the GPU SVM structure
612  * @notifier: Pointer to the GPU SVM notifier structure
613  * @fault_addr: Fault address
614  * @chunk_size: Chunk size
615  * @migrate_devmem: Flag indicating whether to migrate device memory
616  *
617  * This function allocates and initializes the GPU SVM range structure.
618  *
619  * Return: Pointer to the allocated GPU SVM range on success, ERR_PTR() on failure.
620  */
621 static struct drm_gpusvm_range *
622 drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
623 		       struct drm_gpusvm_notifier *notifier,
624 		       unsigned long fault_addr, unsigned long chunk_size,
625 		       bool migrate_devmem)
626 {
627 	struct drm_gpusvm_range *range;
628 
629 	if (gpusvm->ops->range_alloc)
630 		range = gpusvm->ops->range_alloc(gpusvm);
631 	else
632 		range = kzalloc(sizeof(*range), GFP_KERNEL);
633 
634 	if (!range)
635 		return ERR_PTR(-ENOMEM);
636 
637 	kref_init(&range->refcount);
638 	range->gpusvm = gpusvm;
639 	range->notifier = notifier;
640 	range->itree.start = ALIGN_DOWN(fault_addr, chunk_size);
641 	range->itree.last = ALIGN(fault_addr + 1, chunk_size) - 1;
642 	INIT_LIST_HEAD(&range->entry);
643 	range->pages.notifier_seq = LONG_MAX;
644 	range->pages.flags.migrate_devmem = migrate_devmem ? 1 : 0;
645 
646 	return range;
647 }
648 
649 /**
650  * drm_gpusvm_hmm_pfn_to_order() - Get the largest CPU mapping order.
651  * @hmm_pfn: The current hmm_pfn.
652  * @hmm_pfn_index: Index of the @hmm_pfn within the pfn array.
653  * @npages: Number of pages within the pfn array, i.e., the hmm range size.
654  *
655  * To allow skipping PFNs with the same flags (like when they belong to
656  * the same huge PTE) when looping over the pfn array, take a given hmm_pfn,
657  * and return the largest order that will fit inside the CPU PTE, but also
658  * crucially accounting for the original hmm range boundaries.
659  *
660  * Return: The largest order that will safely fit within the size of the hmm_pfn
661  * CPU PTE.
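 *
 * For example (illustrative numbers only): a 2M huge PTE on a 4K-page system
 * gives hmm_pfn_to_map_order() == 9, i.e. 512 pages. If @hmm_pfn sits 3 pages
 * into that PTE, the usable size drops to 509, and if only 100 entries remain
 * before @npages is reached, it is clamped further to 100, so ilog2(100) == 6
 * is returned.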
662  */
663 static unsigned int drm_gpusvm_hmm_pfn_to_order(unsigned long hmm_pfn,
664 						unsigned long hmm_pfn_index,
665 						unsigned long npages)
666 {
667 	unsigned long size;
668 
669 	size = 1UL << hmm_pfn_to_map_order(hmm_pfn);
670 	size -= (hmm_pfn & ~HMM_PFN_FLAGS) & (size - 1);
671 	hmm_pfn_index += size;
672 	if (hmm_pfn_index > npages)
673 		size -= (hmm_pfn_index - npages);
674 
675 	return ilog2(size);
676 }
677 
678 /**
679  * drm_gpusvm_check_pages() - Check pages
680  * @gpusvm: Pointer to the GPU SVM structure
681  * @notifier: Pointer to the GPU SVM notifier structure
682  * @start: Start address
683  * @end: End address
684  * @dev_private_owner: The device private page owner
685  *
686  * Check if pages between start and end have been faulted in on the CPU. Used
687  * to prevent migration of pages without a CPU backing store.
688  *
689  * Return: True if pages have been faulted into CPU, False otherwise
690  */
691 static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
692 				   struct drm_gpusvm_notifier *notifier,
693 				   unsigned long start, unsigned long end,
694 				   void *dev_private_owner)
695 {
696 	struct hmm_range hmm_range = {
697 		.default_flags = 0,
698 		.notifier = &notifier->notifier,
699 		.start = start,
700 		.end = end,
701 		.dev_private_owner = dev_private_owner,
702 	};
703 	unsigned long timeout =
704 		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
705 	unsigned long *pfns;
706 	unsigned long npages = npages_in_range(start, end);
707 	int err, i;
708 
709 	mmap_assert_locked(gpusvm->mm);
710 
711 	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
712 	if (!pfns)
713 		return false;
714 
715 	hmm_range.notifier_seq = mmu_interval_read_begin(&notifier->notifier);
716 	hmm_range.hmm_pfns = pfns;
717 
718 	while (true) {
719 		err = hmm_range_fault(&hmm_range);
720 		if (err == -EBUSY) {
721 			if (time_after(jiffies, timeout))
722 				break;
723 
724 			hmm_range.notifier_seq =
725 				mmu_interval_read_begin(&notifier->notifier);
726 			continue;
727 		}
728 		break;
729 	}
730 	if (err)
731 		goto err_free;
732 
733 	for (i = 0; i < npages;) {
734 		if (!(pfns[i] & HMM_PFN_VALID)) {
735 			err = -EFAULT;
736 			goto err_free;
737 		}
738 		i += 0x1 << drm_gpusvm_hmm_pfn_to_order(pfns[i], i, npages);
739 	}
740 
741 err_free:
742 	kvfree(pfns);
743 	return err ? false : true;
744 }
745 
746 /**
747  * drm_gpusvm_scan_mm() - Check the migration state of a drm_gpusvm_range
748  * @range: Pointer to the struct drm_gpusvm_range to check.
749  * @dev_private_owner: The device-private page owner to use to determine
750  * compatible device-private pages.
751  * @pagemap: The struct dev_pagemap pointer to use for pagemap-specific
752  * checks.
753  *
754  * Scan the CPU address space corresponding to @range and return the
755  * current migration state. Note that the result may be invalid as
756  * soon as the function returns. It's an advisory check.
757  *
758  * TODO: Bail early and call hmm_range_fault() for subranges.
759  *
760  * Return: See &enum drm_gpusvm_scan_result.
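 *
 * A hedged usage sketch for a driver-side migration policy (the owner and
 * pagemap names are illustrative):
 *
 * .. code-block:: c
 *
 *	// Skip migration if the range is already fully backed by the
 *	// preferred pagemap.
 *	if (drm_gpusvm_scan_mm(range, driver_owner, preferred_pagemap) ==
 *	    DRM_GPUSVM_SCAN_EQUAL)
 *		return 0;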
761  */
762 enum drm_gpusvm_scan_result drm_gpusvm_scan_mm(struct drm_gpusvm_range *range,
763 					       void *dev_private_owner,
764 					       const struct dev_pagemap *pagemap)
765 {
766 	struct mmu_interval_notifier *notifier = &range->notifier->notifier;
767 	unsigned long start = drm_gpusvm_range_start(range);
768 	unsigned long end = drm_gpusvm_range_end(range);
769 	struct hmm_range hmm_range = {
770 		.default_flags = 0,
771 		.notifier = notifier,
772 		.start = start,
773 		.end = end,
774 		.dev_private_owner = dev_private_owner,
775 	};
776 	unsigned long timeout =
777 		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
778 	enum drm_gpusvm_scan_result state = DRM_GPUSVM_SCAN_UNPOPULATED, new_state;
779 	unsigned long *pfns;
780 	unsigned long npages = npages_in_range(start, end);
781 	const struct dev_pagemap *other = NULL;
782 	int err, i;
783 
784 	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
785 	if (!pfns)
786 		return DRM_GPUSVM_SCAN_UNPOPULATED;
787 
788 	hmm_range.hmm_pfns = pfns;
789 
790 retry:
791 	hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
792 	mmap_read_lock(range->gpusvm->mm);
793 
794 	while (true) {
795 		err = hmm_range_fault(&hmm_range);
796 		if (err == -EBUSY) {
797 			if (time_after(jiffies, timeout))
798 				break;
799 
800 			hmm_range.notifier_seq =
801 				mmu_interval_read_begin(notifier);
802 			continue;
803 		}
804 		break;
805 	}
806 	mmap_read_unlock(range->gpusvm->mm);
807 	if (err)
808 		goto err_free;
809 
810 	drm_gpusvm_notifier_lock(range->gpusvm);
811 	if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
812 		drm_gpusvm_notifier_unlock(range->gpusvm);
813 		goto retry;
814 	}
815 
816 	for (i = 0; i < npages;) {
817 		struct page *page;
818 		const struct dev_pagemap *cur = NULL;
819 
820 		if (!(pfns[i] & HMM_PFN_VALID)) {
821 			state = DRM_GPUSVM_SCAN_UNPOPULATED;
822 			goto err_free;
823 		}
824 
825 		page = hmm_pfn_to_page(pfns[i]);
826 		if (is_device_private_page(page) ||
827 		    is_device_coherent_page(page))
828 			cur = page_pgmap(page);
829 
830 		if (cur == pagemap) {
831 			new_state = DRM_GPUSVM_SCAN_EQUAL;
832 		} else if (cur && (cur == other || !other)) {
833 			new_state = DRM_GPUSVM_SCAN_OTHER;
834 			other = cur;
835 		} else if (cur) {
836 			new_state = DRM_GPUSVM_SCAN_MIXED_DEVICE;
837 		} else {
838 			new_state = DRM_GPUSVM_SCAN_SYSTEM;
839 		}
840 
841 		/*
842 		 * TODO: Could use an array for state
843 		 * transitions, and caller might want it
844 		 * to bail early for some results.
845 		 */
846 		if (state == DRM_GPUSVM_SCAN_UNPOPULATED) {
847 			state = new_state;
848 		} else if (state != new_state) {
849 			if (new_state == DRM_GPUSVM_SCAN_SYSTEM ||
850 			    state == DRM_GPUSVM_SCAN_SYSTEM)
851 				state = DRM_GPUSVM_SCAN_MIXED;
852 			else if (state != DRM_GPUSVM_SCAN_MIXED)
853 				state = DRM_GPUSVM_SCAN_MIXED_DEVICE;
854 		}
855 
856 		i += 1ul << drm_gpusvm_hmm_pfn_to_order(pfns[i], i, npages);
857 	}
858 
859 err_free:
860 	drm_gpusvm_notifier_unlock(range->gpusvm);
861 
862 	kvfree(pfns);
863 	return state;
864 }
865 EXPORT_SYMBOL(drm_gpusvm_scan_mm);
866 
867 /**
868  * drm_gpusvm_range_chunk_size() - Determine chunk size for GPU SVM range
869  * @gpusvm: Pointer to the GPU SVM structure
870  * @notifier: Pointer to the GPU SVM notifier structure
871  * @vas: Pointer to the virtual memory area structure
872  * @fault_addr: Fault address
873  * @gpuva_start: Start address of GPUVA which mirrors CPU
874  * @gpuva_end: End address of GPUVA which mirrors CPU
875  * @check_pages_threshold: Size threshold at or below which CPU pages are
 *                          checked for presence
876  * @dev_private_owner: The device private page owner
877  *
878  * This function determines the chunk size for the GPU SVM range based on the
879  * fault address, GPU SVM chunk sizes, existing GPU SVM ranges, and the virtual
880  * memory area boundaries.
881  *
882  * Return: Chunk size on success, LONG_MAX on failure.
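 *
 * For example (illustrative): with chunk sizes {SZ_2M, SZ_64K, SZ_4K} and a
 * fault address whose surrounding VMA, notifier and GPUVA mirror only cover a
 * 64K-aligned window around it, SZ_2M is rejected and SZ_64K is selected,
 * unless an existing overlapping range or the check_pages_threshold test
 * forces a further fall back to SZ_4K.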
883  */
884 static unsigned long
885 drm_gpusvm_range_chunk_size(struct drm_gpusvm *gpusvm,
886 			    struct drm_gpusvm_notifier *notifier,
887 			    struct vm_area_struct *vas,
888 			    unsigned long fault_addr,
889 			    unsigned long gpuva_start,
890 			    unsigned long gpuva_end,
891 			    unsigned long check_pages_threshold,
892 			    void *dev_private_owner)
893 {
894 	unsigned long start, end;
895 	int i = 0;
896 
897 retry:
898 	for (; i < gpusvm->num_chunks; ++i) {
899 		start = ALIGN_DOWN(fault_addr, gpusvm->chunk_sizes[i]);
900 		end = ALIGN(fault_addr + 1, gpusvm->chunk_sizes[i]);
901 
902 		if (start >= vas->vm_start && end <= vas->vm_end &&
903 		    start >= drm_gpusvm_notifier_start(notifier) &&
904 		    end <= drm_gpusvm_notifier_end(notifier) &&
905 		    start >= gpuva_start && end <= gpuva_end)
906 			break;
907 	}
908 
909 	if (i == gpusvm->num_chunks)
910 		return LONG_MAX;
911 
912 	/*
913 	 * If the allocation is more than a page, ensure it does not overlap
914 	 * with existing ranges.
915 	 */
916 	if (end - start != SZ_4K) {
917 		struct drm_gpusvm_range *range;
918 
919 		range = drm_gpusvm_range_find(notifier, start, end);
920 		if (range) {
921 			++i;
922 			goto retry;
923 		}
924 
925 		/*
926 		 * XXX: Only create range on pages CPU has faulted in. Without
927 		 * this check, or prefault, on BMG 'xe_exec_system_allocator --r
928 		 * process-many-malloc' fails. In the failure case, each process
929 		 * mallocs 16k but the CPU VMA is ~128k which results in 64k SVM
930 		 * ranges. When migrating the SVM ranges, some processes fail in
931 		 * drm_pagemap_migrate_to_devmem with 'migrate.cpages != npages'
932 		 * and then upon drm_gpusvm_range_get_pages device pages from
933 		 * other processes are collected + faulted in which creates all
934 		 * sorts of problems. Unsure exactly how this is happening; the
935 		 * problem goes away if 'xe_exec_system_allocator --r
936 		 * process-many-malloc' mallocs at least 64k at a time.
937 		 */
938 		if (end - start <= check_pages_threshold &&
939 		    !drm_gpusvm_check_pages(gpusvm, notifier, start, end, dev_private_owner)) {
940 			++i;
941 			goto retry;
942 		}
943 	}
944 
945 	return end - start;
946 }
947 
948 #ifdef CONFIG_LOCKDEP
949 /**
950  * drm_gpusvm_driver_lock_held() - Assert GPU SVM driver lock is held
951  * @gpusvm: Pointer to the GPU SVM structure.
952  *
953  * Ensure driver lock is held.
954  */
955 static void drm_gpusvm_driver_lock_held(struct drm_gpusvm *gpusvm)
956 {
957 	if ((gpusvm)->lock_dep_map)
958 		lockdep_assert(lock_is_held_type((gpusvm)->lock_dep_map, 0));
959 }
960 #else
961 static void drm_gpusvm_driver_lock_held(struct drm_gpusvm *gpusvm)
962 {
963 }
964 #endif
965 
966 /**
967  * drm_gpusvm_find_vma_start() - Find start address for first VMA in range
968  * @gpusvm: Pointer to the GPU SVM structure
969  * @start: The inclusive start user address.
970  * @end: The exclusive end user address.
971  *
972  * Return: The start address of the first VMA within the provided range,
973  * ULONG_MAX otherwise. Assumes @start < @end.
974  */
975 unsigned long
976 drm_gpusvm_find_vma_start(struct drm_gpusvm *gpusvm,
977 			  unsigned long start,
978 			  unsigned long end)
979 {
980 	struct mm_struct *mm = gpusvm->mm;
981 	struct vm_area_struct *vma;
982 	unsigned long addr = ULONG_MAX;
983 
984 	if (!mmget_not_zero(mm))
985 		return addr;
986 
987 	mmap_read_lock(mm);
988 
989 	vma = find_vma_intersection(mm, start, end);
990 	if (vma)
991 		addr =  vma->vm_start;
992 
993 	mmap_read_unlock(mm);
994 	mmput(mm);
995 
996 	return addr;
997 }
998 EXPORT_SYMBOL_GPL(drm_gpusvm_find_vma_start);
999 
1000 /**
1001  * drm_gpusvm_range_find_or_insert() - Find or insert GPU SVM range
1002  * @gpusvm: Pointer to the GPU SVM structure
1003  * @fault_addr: Fault address
1004  * @gpuva_start: Start address of GPUVA which mirrors CPU
1005  * @gpuva_end: End address of GPUVA which mirrors CPU
1006  * @ctx: GPU SVM context
1007  *
1008  * This function finds or inserts a newly allocated GPU SVM range based on the
1009  * fault address. Caller must hold a lock to protect range lookup and insertion.
1010  *
1011  * Return: Pointer to the GPU SVM range on success, ERR_PTR() on failure.
1012  */
1013 struct drm_gpusvm_range *
1014 drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm,
1015 				unsigned long fault_addr,
1016 				unsigned long gpuva_start,
1017 				unsigned long gpuva_end,
1018 				const struct drm_gpusvm_ctx *ctx)
1019 {
1020 	struct drm_gpusvm_notifier *notifier;
1021 	struct drm_gpusvm_range *range;
1022 	struct mm_struct *mm = gpusvm->mm;
1023 	struct vm_area_struct *vas;
1024 	bool notifier_alloc = false;
1025 	unsigned long chunk_size;
1026 	int err;
1027 	bool migrate_devmem;
1028 
1029 	drm_gpusvm_driver_lock_held(gpusvm);
1030 
1031 	if (fault_addr < gpusvm->mm_start ||
1032 	    fault_addr > gpusvm->mm_start + gpusvm->mm_range)
1033 		return ERR_PTR(-EINVAL);
1034 
1035 	if (!mmget_not_zero(mm))
1036 		return ERR_PTR(-EFAULT);
1037 
1038 	notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr, fault_addr + 1);
1039 	if (!notifier) {
1040 		notifier = drm_gpusvm_notifier_alloc(gpusvm, fault_addr);
1041 		if (IS_ERR(notifier)) {
1042 			err = PTR_ERR(notifier);
1043 			goto err_mmunlock;
1044 		}
1045 		notifier_alloc = true;
1046 		err = mmu_interval_notifier_insert(&notifier->notifier,
1047 						   mm,
1048 						   drm_gpusvm_notifier_start(notifier),
1049 						   drm_gpusvm_notifier_size(notifier),
1050 						   &drm_gpusvm_notifier_ops);
1051 		if (err)
1052 			goto err_notifier;
1053 	}
1054 
1055 	mmap_read_lock(mm);
1056 
1057 	vas = vma_lookup(mm, fault_addr);
1058 	if (!vas) {
1059 		err = -ENOENT;
1060 		goto err_notifier_remove;
1061 	}
1062 
1063 	if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
1064 		err = -EPERM;
1065 		goto err_notifier_remove;
1066 	}
1067 
1068 	range = drm_gpusvm_range_find(notifier, fault_addr, fault_addr + 1);
1069 	if (range)
1070 		goto out_mmunlock;
1071 	/*
1072 	 * XXX: Short-circuiting migration based on migrate_vma_* current
1073 	 * limitations. If/when migrate_vma_* add more support, this logic will
1074 	 * have to change.
1075 	 */
1076 	migrate_devmem = ctx->devmem_possible &&
1077 		vma_is_anonymous(vas) && !is_vm_hugetlb_page(vas);
1078 
1079 	chunk_size = drm_gpusvm_range_chunk_size(gpusvm, notifier, vas,
1080 						 fault_addr, gpuva_start,
1081 						 gpuva_end,
1082 						 ctx->check_pages_threshold,
1083 						 ctx->device_private_page_owner);
1084 	if (chunk_size == LONG_MAX) {
1085 		err = -EINVAL;
1086 		goto err_notifier_remove;
1087 	}
1088 
1089 	range = drm_gpusvm_range_alloc(gpusvm, notifier, fault_addr, chunk_size,
1090 				       migrate_devmem);
1091 	if (IS_ERR(range)) {
1092 		err = PTR_ERR(range);
1093 		goto err_notifier_remove;
1094 	}
1095 
1096 	drm_gpusvm_range_insert(notifier, range);
1097 	if (notifier_alloc)
1098 		drm_gpusvm_notifier_insert(gpusvm, notifier);
1099 
1100 out_mmunlock:
1101 	mmap_read_unlock(mm);
1102 	mmput(mm);
1103 
1104 	return range;
1105 
1106 err_notifier_remove:
1107 	mmap_read_unlock(mm);
1108 	if (notifier_alloc)
1109 		mmu_interval_notifier_remove(&notifier->notifier);
1110 err_notifier:
1111 	if (notifier_alloc)
1112 		drm_gpusvm_notifier_free(gpusvm, notifier);
1113 err_mmunlock:
1114 	mmput(mm);
1115 	return ERR_PTR(err);
1116 }
1117 EXPORT_SYMBOL_GPL(drm_gpusvm_range_find_or_insert);
1118 
1119 /**
1120  * __drm_gpusvm_unmap_pages() - Unmap pages associated with GPU SVM pages (internal)
1121  * @gpusvm: Pointer to the GPU SVM structure
1122  * @svm_pages: Pointer to the GPU SVM pages structure
1123  * @npages: Number of pages to unmap
1124  *
1125  * This function unmaps pages associated with a GPU SVM pages struct. Assumes
1126  * and asserts that correct locking is in place when called.
1127  */
1128 static void __drm_gpusvm_unmap_pages(struct drm_gpusvm *gpusvm,
1129 				     struct drm_gpusvm_pages *svm_pages,
1130 				     unsigned long npages)
1131 {
1132 	struct drm_pagemap *dpagemap = svm_pages->dpagemap;
1133 	struct device *dev = gpusvm->drm->dev;
1134 	unsigned long i, j;
1135 
1136 	lockdep_assert_held(&gpusvm->notifier_lock);
1137 
1138 	if (svm_pages->flags.has_dma_mapping) {
1139 		struct drm_gpusvm_pages_flags flags = {
1140 			.__flags = svm_pages->flags.__flags,
1141 		};
1142 
1143 		for (i = 0, j = 0; i < npages; j++) {
1144 			struct drm_pagemap_addr *addr = &svm_pages->dma_addr[j];
1145 
1146 			if (addr->proto == DRM_INTERCONNECT_SYSTEM)
1147 				dma_unmap_page(dev,
1148 					       addr->addr,
1149 					       PAGE_SIZE << addr->order,
1150 					       addr->dir);
1151 			else if (dpagemap && dpagemap->ops->device_unmap)
1152 				dpagemap->ops->device_unmap(dpagemap,
1153 							    dev, *addr);
1154 			i += 1 << addr->order;
1155 		}
1156 
1157 		/* WRITE_ONCE pairs with READ_ONCE for opportunistic checks */
1158 		flags.has_devmem_pages = false;
1159 		flags.has_dma_mapping = false;
1160 		WRITE_ONCE(svm_pages->flags.__flags, flags.__flags);
1161 
1162 		drm_pagemap_put(svm_pages->dpagemap);
1163 		svm_pages->dpagemap = NULL;
1164 	}
1165 }
1166 
1167 /**
1168  * __drm_gpusvm_free_pages() - Free dma array associated with GPU SVM pages
1169  * @gpusvm: Pointer to the GPU SVM structure
1170  * @svm_pages: Pointer to the GPU SVM pages structure
1171  *
1172  * This function frees the dma address array associated with a GPU SVM range.
1173  */
1174 static void __drm_gpusvm_free_pages(struct drm_gpusvm *gpusvm,
1175 				    struct drm_gpusvm_pages *svm_pages)
1176 {
1177 	lockdep_assert_held(&gpusvm->notifier_lock);
1178 
1179 	if (svm_pages->dma_addr) {
1180 		kvfree(svm_pages->dma_addr);
1181 		svm_pages->dma_addr = NULL;
1182 	}
1183 }
1184 
1185 /**
1186  * drm_gpusvm_free_pages() - Free dma-mapping associated with GPU SVM pages
1187  * struct
1188  * @gpusvm: Pointer to the GPU SVM structure
1189  * @svm_pages: Pointer to the GPU SVM pages structure
1190  * @npages: Number of mapped pages
1191  *
1192  * This function unmaps and frees the dma address array associated with a GPU
1193  * SVM pages struct.
1194  */
1195 void drm_gpusvm_free_pages(struct drm_gpusvm *gpusvm,
1196 			   struct drm_gpusvm_pages *svm_pages,
1197 			   unsigned long npages)
1198 {
1199 	drm_gpusvm_notifier_lock(gpusvm);
1200 	__drm_gpusvm_unmap_pages(gpusvm, svm_pages, npages);
1201 	__drm_gpusvm_free_pages(gpusvm, svm_pages);
1202 	drm_gpusvm_notifier_unlock(gpusvm);
1203 }
1204 EXPORT_SYMBOL_GPL(drm_gpusvm_free_pages);
1205 
1206 /**
1207  * drm_gpusvm_range_remove() - Remove GPU SVM range
1208  * @gpusvm: Pointer to the GPU SVM structure
1209  * @range: Pointer to the GPU SVM range to be removed
1210  *
1211  * This function removes the specified GPU SVM range and also removes the parent
1212  * GPU SVM notifier if no more ranges remain in the notifier. The caller must
1213  * hold a lock to protect range and notifier removal.
1214  */
1215 void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
1216 			     struct drm_gpusvm_range *range)
1217 {
1218 	unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
1219 					       drm_gpusvm_range_end(range));
1220 	struct drm_gpusvm_notifier *notifier;
1221 
1222 	drm_gpusvm_driver_lock_held(gpusvm);
1223 
1224 	notifier = drm_gpusvm_notifier_find(gpusvm,
1225 					    drm_gpusvm_range_start(range),
1226 					    drm_gpusvm_range_start(range) + 1);
1227 	if (WARN_ON_ONCE(!notifier))
1228 		return;
1229 
1230 	drm_gpusvm_notifier_lock(gpusvm);
1231 	__drm_gpusvm_unmap_pages(gpusvm, &range->pages, npages);
1232 	__drm_gpusvm_free_pages(gpusvm, &range->pages);
1233 	__drm_gpusvm_range_remove(notifier, range);
1234 	drm_gpusvm_notifier_unlock(gpusvm);
1235 
1236 	drm_gpusvm_range_put(range);
1237 
1238 	if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
1239 		if (!notifier->flags.removed)
1240 			mmu_interval_notifier_remove(&notifier->notifier);
1241 		drm_gpusvm_notifier_remove(gpusvm, notifier);
1242 		drm_gpusvm_notifier_free(gpusvm, notifier);
1243 	}
1244 }
1245 EXPORT_SYMBOL_GPL(drm_gpusvm_range_remove);
1246 
1247 /**
1248  * drm_gpusvm_range_get() - Get a reference to GPU SVM range
1249  * @range: Pointer to the GPU SVM range
1250  *
1251  * This function increments the reference count of the specified GPU SVM range.
1252  *
1253  * Return: Pointer to the GPU SVM range.
1254  */
1255 struct drm_gpusvm_range *
1256 drm_gpusvm_range_get(struct drm_gpusvm_range *range)
1257 {
1258 	kref_get(&range->refcount);
1259 
1260 	return range;
1261 }
1262 EXPORT_SYMBOL_GPL(drm_gpusvm_range_get);
1263 
1264 /**
1265  * drm_gpusvm_range_destroy() - Destroy GPU SVM range
1266  * @refcount: Pointer to the reference counter embedded in the GPU SVM range
1267  *
1268  * This function destroys the specified GPU SVM range when its reference count
1269  * reaches zero. If a custom range-free function is provided, it is invoked to
1270  * free the range; otherwise, the range is deallocated using kfree().
1271  */
1272 static void drm_gpusvm_range_destroy(struct kref *refcount)
1273 {
1274 	struct drm_gpusvm_range *range =
1275 		container_of(refcount, struct drm_gpusvm_range, refcount);
1276 	struct drm_gpusvm *gpusvm = range->gpusvm;
1277 
1278 	if (gpusvm->ops->range_free)
1279 		gpusvm->ops->range_free(range);
1280 	else
1281 		kfree(range);
1282 }
1283 
1284 /**
1285  * drm_gpusvm_range_put() - Put a reference to GPU SVM range
1286  * @range: Pointer to the GPU SVM range
1287  *
1288  * This function decrements the reference count of the specified GPU SVM range
1289  * and frees it when the count reaches zero.
1290  */
1291 void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
1292 {
1293 	kref_put(&range->refcount, drm_gpusvm_range_destroy);
1294 }
1295 EXPORT_SYMBOL_GPL(drm_gpusvm_range_put);
1296 
1297 /**
1298  * drm_gpusvm_pages_valid() - GPU SVM range pages valid
1299  * @gpusvm: Pointer to the GPU SVM structure
1300  * @svm_pages: Pointer to the GPU SVM pages structure
1301  *
1302  * This function determines if a GPU SVM range's pages are valid. Expected to be
1303  * called holding gpusvm->notifier_lock and as the last step before committing a
1304  * GPU binding. This is akin to a notifier seqno check in the HMM documentation
1305  * but due to wider notifiers (i.e., notifiers which span multiple ranges) this
1306  * function is required for finer grained checking (i.e., per range) if pages
1307  * are valid.
1308  *
1309  * Return: True if GPU SVM range has valid pages, False otherwise
1310  */
1311 static bool drm_gpusvm_pages_valid(struct drm_gpusvm *gpusvm,
1312 				   struct drm_gpusvm_pages *svm_pages)
1313 {
1314 	lockdep_assert_held(&gpusvm->notifier_lock);
1315 
1316 	return svm_pages->flags.has_devmem_pages || svm_pages->flags.has_dma_mapping;
1317 }
1318 
1319 /**
1320  * drm_gpusvm_range_pages_valid() - GPU SVM range pages valid
1321  * @gpusvm: Pointer to the GPU SVM structure
1322  * @range: Pointer to the GPU SVM range structure
1323  *
1324  * This function determines if a GPU SVM range's pages are valid. Expected to be
1325  * called holding gpusvm->notifier_lock and as the last step before committing a
1326  * GPU binding. This is akin to a notifier seqno check in the HMM documentation
1327  * but due to wider notifiers (i.e., notifiers which span multiple ranges) this
1328  * function is required for finer grained checking (i.e., per range) if pages
1329  * are valid.
1330  *
1331  * Return: True if GPU SVM range has valid pages, False otherwise
1332  */
1333 bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
1334 				  struct drm_gpusvm_range *range)
1335 {
1336 	return drm_gpusvm_pages_valid(gpusvm, &range->pages);
1337 }
1338 EXPORT_SYMBOL_GPL(drm_gpusvm_range_pages_valid);
1339 
1340 /**
1341  * drm_gpusvm_pages_valid_unlocked() - GPU SVM pages valid unlocked
1342  * @gpusvm: Pointer to the GPU SVM structure
1343  * @svm_pages: Pointer to the GPU SVM pages structure
1344  *
1345  * This function determines if GPU SVM pages are valid. Expected to be
1346  * called without holding gpusvm->notifier_lock.
1347  *
1348  * Return: True if the GPU SVM pages are valid, False otherwise
1349  */
1350 static bool drm_gpusvm_pages_valid_unlocked(struct drm_gpusvm *gpusvm,
1351 					    struct drm_gpusvm_pages *svm_pages)
1352 {
1353 	bool pages_valid;
1354 
1355 	if (!svm_pages->dma_addr)
1356 		return false;
1357 
1358 	drm_gpusvm_notifier_lock(gpusvm);
1359 	pages_valid = drm_gpusvm_pages_valid(gpusvm, svm_pages);
1360 	if (!pages_valid)
1361 		__drm_gpusvm_free_pages(gpusvm, svm_pages);
1362 	drm_gpusvm_notifier_unlock(gpusvm);
1363 
1364 	return pages_valid;
1365 }
1366 
1367 /**
1368  * drm_gpusvm_get_pages() - Get pages and populate GPU SVM pages struct
1369  * @gpusvm: Pointer to the GPU SVM structure
1370  * @svm_pages: The SVM pages to populate. This will contain the dma-addresses
1371  * @mm: The mm corresponding to the CPU range
1372  * @notifier: The corresponding notifier for the given CPU range
1373  * @pages_start: Start CPU address for the pages
1374  * @pages_end: End CPU address for the pages (exclusive)
1375  * @ctx: GPU SVM context
1376  *
1377  * This function gets and maps pages for CPU range and ensures they are
1378  * mapped for DMA access.
1379  *
1380  * Return: 0 on success, negative error code on failure.
1381  */
1382 int drm_gpusvm_get_pages(struct drm_gpusvm *gpusvm,
1383 			 struct drm_gpusvm_pages *svm_pages,
1384 			 struct mm_struct *mm,
1385 			 struct mmu_interval_notifier *notifier,
1386 			 unsigned long pages_start, unsigned long pages_end,
1387 			 const struct drm_gpusvm_ctx *ctx)
1388 {
1389 	struct hmm_range hmm_range = {
1390 		.default_flags = HMM_PFN_REQ_FAULT | (ctx->read_only ? 0 :
1391 			HMM_PFN_REQ_WRITE),
1392 		.notifier = notifier,
1393 		.start = pages_start,
1394 		.end = pages_end,
1395 		.dev_private_owner = ctx->device_private_page_owner,
1396 	};
1397 	void *zdd;
1398 	unsigned long timeout =
1399 		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
1400 	unsigned long i, j;
1401 	unsigned long npages = npages_in_range(pages_start, pages_end);
1402 	unsigned long num_dma_mapped;
1403 	unsigned int order = 0;
1404 	unsigned long *pfns;
1405 	int err = 0;
1406 	struct dev_pagemap *pagemap;
1407 	struct drm_pagemap *dpagemap;
1408 	struct drm_gpusvm_pages_flags flags;
1409 	enum dma_data_direction dma_dir = ctx->read_only ? DMA_TO_DEVICE :
1410 							   DMA_BIDIRECTIONAL;
1411 
1412 retry:
1413 	if (time_after(jiffies, timeout))
1414 		return -EBUSY;
1415 
1416 	hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
1417 	if (drm_gpusvm_pages_valid_unlocked(gpusvm, svm_pages))
1418 		goto set_seqno;
1419 
1420 	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
1421 	if (!pfns)
1422 		return -ENOMEM;
1423 
1424 	if (!mmget_not_zero(mm)) {
1425 		err = -EFAULT;
1426 		goto err_free;
1427 	}
1428 
1429 	hmm_range.hmm_pfns = pfns;
1430 	while (true) {
1431 		mmap_read_lock(mm);
1432 		err = hmm_range_fault(&hmm_range);
1433 		mmap_read_unlock(mm);
1434 
1435 		if (err == -EBUSY) {
1436 			if (time_after(jiffies, timeout))
1437 				break;
1438 
1439 			hmm_range.notifier_seq =
1440 				mmu_interval_read_begin(notifier);
1441 			continue;
1442 		}
1443 		break;
1444 	}
1445 	mmput(mm);
1446 	if (err)
1447 		goto err_free;
1448 
1449 map_pages:
1450 	/*
1451 	 * Perform all dma mappings under the notifier lock to not
1452 	 * access freed pages. A notifier will either block on
1453 	 * the notifier lock or unmap dma.
1454 	 */
1455 	drm_gpusvm_notifier_lock(gpusvm);
1456 
1457 	flags.__flags = svm_pages->flags.__flags;
1458 	if (flags.unmapped) {
1459 		drm_gpusvm_notifier_unlock(gpusvm);
1460 		err = -EFAULT;
1461 		goto err_free;
1462 	}
1463 
1464 	if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
1465 		drm_gpusvm_notifier_unlock(gpusvm);
1466 		kvfree(pfns);
1467 		goto retry;
1468 	}
1469 
1470 	if (!svm_pages->dma_addr) {
1471 		/* Unlock and restart mapping to allocate memory. */
1472 		drm_gpusvm_notifier_unlock(gpusvm);
1473 		svm_pages->dma_addr =
1474 			kvmalloc_array(npages, sizeof(*svm_pages->dma_addr), GFP_KERNEL);
1475 		if (!svm_pages->dma_addr) {
1476 			err = -ENOMEM;
1477 			goto err_free;
1478 		}
1479 		goto map_pages;
1480 	}
1481 
1482 	zdd = NULL;
1483 	pagemap = NULL;
1484 	num_dma_mapped = 0;
1485 	for (i = 0, j = 0; i < npages; ++j) {
1486 		struct page *page = hmm_pfn_to_page(pfns[i]);
1487 
1488 		order = drm_gpusvm_hmm_pfn_to_order(pfns[i], i, npages);
1489 		if (is_device_private_page(page) ||
1490 		    is_device_coherent_page(page)) {
1491 			if (!ctx->allow_mixed &&
1492 			    zdd != page->zone_device_data && i > 0) {
1493 				err = -EOPNOTSUPP;
1494 				goto err_unmap;
1495 			}
1496 			zdd = page->zone_device_data;
1497 			if (pagemap != page_pgmap(page)) {
1498 				if (i > 0) {
1499 					err = -EOPNOTSUPP;
1500 					goto err_unmap;
1501 				}
1502 
1503 				pagemap = page_pgmap(page);
1504 				dpagemap = drm_pagemap_page_to_dpagemap(page);
1505 				if (drm_WARN_ON(gpusvm->drm, !dpagemap)) {
1506 					/*
1507 					 * Raced. This is not supposed to happen
1508 					 * since hmm_range_fault() should've migrated
1509 					 * this page to system.
1510 					 */
1511 					err = -EAGAIN;
1512 					goto err_unmap;
1513 				}
1514 			}
1515 			svm_pages->dma_addr[j] =
1516 				dpagemap->ops->device_map(dpagemap,
1517 							  gpusvm->drm->dev,
1518 							  page, order,
1519 							  dma_dir);
1520 			if (dma_mapping_error(gpusvm->drm->dev,
1521 					      svm_pages->dma_addr[j].addr)) {
1522 				err = -EFAULT;
1523 				goto err_unmap;
1524 			}
1525 		} else {
1526 			dma_addr_t addr;
1527 
1528 			if (is_zone_device_page(page) ||
1529 			    (pagemap && !ctx->allow_mixed)) {
1530 				err = -EOPNOTSUPP;
1531 				goto err_unmap;
1532 			}
1533 
1534 			if (ctx->devmem_only) {
1535 				err = -EFAULT;
1536 				goto err_unmap;
1537 			}
1538 
1539 			addr = dma_map_page(gpusvm->drm->dev,
1540 					    page, 0,
1541 					    PAGE_SIZE << order,
1542 					    dma_dir);
1543 			if (dma_mapping_error(gpusvm->drm->dev, addr)) {
1544 				err = -EFAULT;
1545 				goto err_unmap;
1546 			}
1547 
1548 			svm_pages->dma_addr[j] = drm_pagemap_addr_encode
1549 				(addr, DRM_INTERCONNECT_SYSTEM, order,
1550 				 dma_dir);
1551 		}
1552 		i += 1 << order;
1553 		num_dma_mapped = i;
1554 		flags.has_dma_mapping = true;
1555 	}
1556 
1557 	if (pagemap) {
1558 		flags.has_devmem_pages = true;
1559 		drm_pagemap_get(dpagemap);
1560 		drm_pagemap_put(svm_pages->dpagemap);
1561 		svm_pages->dpagemap = dpagemap;
1562 	}
1563 
1564 	/* WRITE_ONCE pairs with READ_ONCE for opportunistic checks */
1565 	WRITE_ONCE(svm_pages->flags.__flags, flags.__flags);
1566 
1567 	drm_gpusvm_notifier_unlock(gpusvm);
1568 	kvfree(pfns);
1569 set_seqno:
1570 	svm_pages->notifier_seq = hmm_range.notifier_seq;
1571 
1572 	return 0;
1573 
1574 err_unmap:
1575 	__drm_gpusvm_unmap_pages(gpusvm, svm_pages, num_dma_mapped);
1576 	drm_gpusvm_notifier_unlock(gpusvm);
1577 err_free:
1578 	kvfree(pfns);
1579 	if (err == -EAGAIN)
1580 		goto retry;
1581 	return err;
1582 }
1583 EXPORT_SYMBOL_GPL(drm_gpusvm_get_pages);
1584 
1585 /**
1586  * drm_gpusvm_range_get_pages() - Get pages for a GPU SVM range
1587  * @gpusvm: Pointer to the GPU SVM structure
1588  * @range: Pointer to the GPU SVM range structure
1589  * @ctx: GPU SVM context
1590  *
1591  * This function gets pages for a GPU SVM range and ensures they are mapped for
1592  * DMA access.
1593  *
1594  * Return: 0 on success, negative error code on failure.
1595  */
1596 int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
1597 			       struct drm_gpusvm_range *range,
1598 			       const struct drm_gpusvm_ctx *ctx)
1599 {
1600 	return drm_gpusvm_get_pages(gpusvm, &range->pages, gpusvm->mm,
1601 				    &range->notifier->notifier,
1602 				    drm_gpusvm_range_start(range),
1603 				    drm_gpusvm_range_end(range), ctx);
1604 }
1605 EXPORT_SYMBOL_GPL(drm_gpusvm_range_get_pages);
1606 
1607 /**
1608  * drm_gpusvm_unmap_pages() - Unmap GPU svm pages
1609  * @gpusvm: Pointer to the GPU SVM structure
1610  * @svm_pages: Pointer to the GPU SVM pages structure
1611  * @npages: Number of pages in @svm_pages.
1612  * @ctx: GPU SVM context
1613  *
1614  * This function unmaps pages associated with a GPU SVM pages struct. If
1615  * ctx->in_notifier is set, it is assumed that gpusvm->notifier_lock is held in
1616  * write mode; if it is clear, it acquires gpusvm->notifier_lock in read mode.
1617  * Must be called in the invalidate() callback of the corresponding notifier for
1618  * IOMMU security model.
1619  */
1620 void drm_gpusvm_unmap_pages(struct drm_gpusvm *gpusvm,
1621 			    struct drm_gpusvm_pages *svm_pages,
1622 			    unsigned long npages,
1623 			    const struct drm_gpusvm_ctx *ctx)
1624 {
1625 	if (ctx->in_notifier)
1626 		lockdep_assert_held_write(&gpusvm->notifier_lock);
1627 	else
1628 		drm_gpusvm_notifier_lock(gpusvm);
1629 
1630 	__drm_gpusvm_unmap_pages(gpusvm, svm_pages, npages);
1631 
1632 	if (!ctx->in_notifier)
1633 		drm_gpusvm_notifier_unlock(gpusvm);
1634 }
1635 EXPORT_SYMBOL_GPL(drm_gpusvm_unmap_pages);
1636 
1637 /**
1638  * drm_gpusvm_range_unmap_pages() - Unmap pages associated with a GPU SVM range
1639  * @gpusvm: Pointer to the GPU SVM structure
1640  * @range: Pointer to the GPU SVM range structure
1641  * @ctx: GPU SVM context
1642  *
1643  * This function unmaps pages associated with a GPU SVM range. If ctx->in_notifier
1644  * is set, it is assumed that gpusvm->notifier_lock is held in write mode; if it
1645  * is clear, it acquires gpusvm->notifier_lock in read mode. Must be called on
1646  * each GPU SVM range attached to notifier in gpusvm->ops->invalidate for IOMMU
1647  * security model.
1648  */
1649 void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
1650 				  struct drm_gpusvm_range *range,
1651 				  const struct drm_gpusvm_ctx *ctx)
1652 {
1653 	unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
1654 					       drm_gpusvm_range_end(range));
1655 
1656 	return drm_gpusvm_unmap_pages(gpusvm, &range->pages, npages, ctx);
1657 }
1658 EXPORT_SYMBOL_GPL(drm_gpusvm_range_unmap_pages);
1659 
1660 /**
1661  * drm_gpusvm_range_evict() - Evict GPU SVM range
1662  * @gpusvm: Pointer to the GPU SVM structure
1663  * @range: Pointer to the GPU SVM range to be removed
1664  *
1665  * This function evicts the specified GPU SVM range.
1666  *
1667  * Return: 0 on success, a negative error code on failure.
1668  */
1669 int drm_gpusvm_range_evict(struct drm_gpusvm *gpusvm,
1670 			   struct drm_gpusvm_range *range)
1671 {
1672 	struct mmu_interval_notifier *notifier = &range->notifier->notifier;
1673 	struct hmm_range hmm_range = {
1674 		.default_flags = HMM_PFN_REQ_FAULT,
1675 		.notifier = notifier,
1676 		.start = drm_gpusvm_range_start(range),
1677 		.end = drm_gpusvm_range_end(range),
1678 		.dev_private_owner = NULL,
1679 	};
1680 	unsigned long timeout =
1681 		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
1682 	unsigned long *pfns;
1683 	unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
1684 					       drm_gpusvm_range_end(range));
1685 	int err = 0;
1686 	struct mm_struct *mm = gpusvm->mm;
1687 
1688 	if (!mmget_not_zero(mm))
1689 		return -EFAULT;
1690 
1691 	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
1692 	if (!pfns)
1693 		return -ENOMEM;
1694 
1695 	hmm_range.hmm_pfns = pfns;
1696 	while (!time_after(jiffies, timeout)) {
1697 		hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
1698 		if (time_after(jiffies, timeout)) {
1699 			err = -ETIME;
1700 			break;
1701 		}
1702 
1703 		mmap_read_lock(mm);
1704 		err = hmm_range_fault(&hmm_range);
1705 		mmap_read_unlock(mm);
1706 		if (err != -EBUSY)
1707 			break;
1708 	}
1709 
1710 	kvfree(pfns);
1711 	mmput(mm);
1712 
1713 	return err;
1714 }
1715 EXPORT_SYMBOL_GPL(drm_gpusvm_range_evict);
1716 
1717 /**
1718  * drm_gpusvm_has_mapping() - Check if GPU SVM has mapping for the given address range
1719  * @gpusvm: Pointer to the GPU SVM structure.
1720  * @start: Start address
1721  * @end: End address
1722  *
1723  * Return: True if GPU SVM has mapping, False otherwise
1724  */
1725 bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, unsigned long start,
1726 			    unsigned long end)
1727 {
1728 	struct drm_gpusvm_notifier *notifier;
1729 
1730 	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
1731 		struct drm_gpusvm_range *range = NULL;
1732 
1733 		drm_gpusvm_for_each_range(range, notifier, start, end)
1734 			return true;
1735 	}
1736 
1737 	return false;
1738 }
1739 EXPORT_SYMBOL_GPL(drm_gpusvm_has_mapping);
1740 
1741 /**
1742  * drm_gpusvm_range_set_unmapped() - Mark a GPU SVM range as unmapped
1743  * @range: Pointer to the GPU SVM range structure.
1744  * @mmu_range: Pointer to the MMU notifier range structure.
1745  *
1746  * This function marks a GPU SVM range as unmapped and sets the partial_unmap flag
1747  * if the range partially falls within the provided MMU notifier range.
1748  */
1749 void drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
1750 				   const struct mmu_notifier_range *mmu_range)
1751 {
1752 	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
1753 
1754 	range->pages.flags.unmapped = true;
1755 	if (drm_gpusvm_range_start(range) < mmu_range->start ||
1756 	    drm_gpusvm_range_end(range) > mmu_range->end)
1757 		range->pages.flags.partial_unmap = true;
1758 }
1759 EXPORT_SYMBOL_GPL(drm_gpusvm_range_set_unmapped);
1760 
1761 MODULE_DESCRIPTION("DRM GPUSVM");
1762 MODULE_LICENSE("GPL");
1763