xref: /linux/drivers/gpu/drm/drm_gpusvm.c (revision af53f0fd99c3bbb3afd29f1612c9e88c5a92cc01)
1 // SPDX-License-Identifier: GPL-2.0-only OR MIT
2 /*
3  * Copyright © 2024 Intel Corporation
4  *
5  * Authors:
6  *     Matthew Brost <matthew.brost@intel.com>
7  */
8 
9 #include <linux/dma-mapping.h>
10 #include <linux/hmm.h>
11 #include <linux/memremap.h>
12 #include <linux/migrate.h>
13 #include <linux/mm_types.h>
14 #include <linux/pagemap.h>
15 #include <linux/slab.h>
16 
17 #include <drm/drm_device.h>
18 #include <drm/drm_gpusvm.h>
19 #include <drm/drm_pagemap.h>
20 #include <drm/drm_print.h>
21 
22 /**
23  * DOC: Overview
24  *
25  * GPU Shared Virtual Memory (GPU SVM) layer for the Direct Rendering Manager (DRM)
26  * is a component of the DRM framework designed to manage shared virtual memory
27  * between the CPU and GPU. It enables efficient data exchange and processing
28  * for GPU-accelerated applications by allowing memory sharing and
29  * synchronization between the CPU's and GPU's virtual address spaces.
30  *
31  * Key GPU SVM Components:
32  *
33  * - Notifiers:
34  *	Used for tracking memory intervals and notifying the GPU of changes,
35  *	notifiers are sized based on a GPU SVM initialization parameter, with a
 *	recommendation of 512M or larger. They maintain a Red-Black tree and a
 *	list of ranges that fall within the notifier interval.  Notifiers are
 *	tracked within a GPU SVM Red-Black tree and list and are dynamically
39  *	inserted or removed as ranges within the interval are created or
40  *	destroyed.
41  * - Ranges:
42  *	Represent memory ranges mapped in a DRM device and managed by GPU SVM.
43  *	They are sized based on an array of chunk sizes, which is a GPU SVM
44  *	initialization parameter, and the CPU address space.  Upon GPU fault,
45  *	the largest aligned chunk that fits within the faulting CPU address
46  *	space is chosen for the range size. Ranges are expected to be
47  *	dynamically allocated on GPU fault and removed on an MMU notifier UNMAP
48  *	event. As mentioned above, ranges are tracked in a notifier's Red-Black
49  *	tree.
50  *
51  * - Operations:
52  *	Define the interface for driver-specific GPU SVM operations such as
53  *	range allocation, notifier allocation, and invalidations.
54  *
55  * - Device Memory Allocations:
56  *	Embedded structure containing enough information for GPU SVM to migrate
57  *	to / from device memory.
58  *
59  * - Device Memory Operations:
 *	Define the interface for driver-specific device memory operations such
 *	as releasing memory, populating pfns, and copying to / from device memory.
62  *
63  * This layer provides interfaces for allocating, mapping, migrating, and
64  * releasing memory ranges between the CPU and GPU. It handles all core memory
65  * management interactions (DMA mapping, HMM, and migration) and provides
66  * driver-specific virtual functions (vfuncs). This infrastructure is sufficient
67  * to build the expected driver components for an SVM implementation as detailed
68  * below.
69  *
70  * Expected Driver Components:
71  *
72  * - GPU page fault handler:
73  *	Used to create ranges and notifiers based on the fault address,
74  *	optionally migrate the range to device memory, and create GPU bindings.
75  *
76  * - Garbage collector:
77  *	Used to unmap and destroy GPU bindings for ranges.  Ranges are expected
78  *	to be added to the garbage collector upon a MMU_NOTIFY_UNMAP event in
79  *	notifier callback.
80  *
81  * - Notifier callback:
82  *	Used to invalidate and DMA unmap GPU bindings for ranges.
83  */
84 
85 /**
86  * DOC: Locking
87  *
88  * GPU SVM handles locking for core MM interactions, i.e., it locks/unlocks the
89  * mmap lock as needed.
90  *
91  * GPU SVM introduces a global notifier lock, which safeguards the notifier's
92  * range RB tree and list, as well as the range's DMA mappings and sequence
 * number. GPU SVM manages all necessary locking and unlocking operations,
 * except when the driver rechecks that a range's pages are still valid
 * (drm_gpusvm_range_pages_valid) while committing GPU bindings; there the
 * driver takes the notifier lock itself.
96  * This lock corresponds to the ``driver->update`` lock mentioned in
97  * Documentation/mm/hmm.rst. Future revisions may transition from a GPU SVM
98  * global lock to a per-notifier lock if finer-grained locking is deemed
99  * necessary.
100  *
101  * In addition to the locking mentioned above, the driver should implement a
102  * lock to safeguard core GPU SVM function calls that modify state, such as
103  * drm_gpusvm_range_find_or_insert and drm_gpusvm_range_remove. This lock is
104  * denoted as 'driver_svm_lock' in code examples. Finer grained driver side
105  * locking should also be possible for concurrent GPU fault processing within a
 * single GPU SVM. The 'driver_svm_lock' can be registered via
 * drm_gpusvm_driver_set_lock to add annotations to GPU SVM.
108  */
109 
110 /**
111  * DOC: Migration
112  *
113  * The migration support is quite simple, allowing migration between RAM and
114  * device memory at the range granularity. For example, GPU SVM currently does
115  * not support mixing RAM and device memory pages within a range. This means
116  * that upon GPU fault, the entire range can be migrated to device memory, and
117  * upon CPU fault, the entire range is migrated to RAM. Mixed RAM and device
118  * memory storage within a range could be added in the future if required.
119  *
120  * The reasoning for only supporting range granularity is as follows: it
121  * simplifies the implementation, and range sizes are driver-defined and should
122  * be relatively small.
123  */
124 
125 /**
126  * DOC: Partial Unmapping of Ranges
127  *
128  * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped by CPU resulting
129  * in MMU_NOTIFY_UNMAP event) presents several challenges, with the main one
130  * being that a subset of the range still has CPU and GPU mappings. If the
131  * backing store for the range is in device memory, a subset of the backing
132  * store has references. One option would be to split the range and device
133  * memory backing store, but the implementation for this would be quite
134  * complicated. Given that partial unmappings are rare and driver-defined range
135  * sizes are relatively small, GPU SVM does not support splitting of ranges.
136  *
137  * With no support for range splitting, upon partial unmapping of a range, the
138  * driver is expected to invalidate and destroy the entire range. If the range
139  * has device memory as its backing, the driver is also expected to migrate any
140  * remaining pages back to RAM.
141  */
142 
143 /**
144  * DOC: Examples
145  *
146  * This section provides three examples of how to build the expected driver
147  * components: the GPU page fault handler, the garbage collector, and the
148  * notifier callback.
149  *
150  * The generic code provided does not include logic for complex migration
 * policies, optimized invalidations, fine-grained driver locking, or other
152  * potentially required driver locking (e.g., DMA-resv locks).
153  *
154  * 1) GPU page fault handler
155  *
156  * .. code-block:: c
157  *
158  *	int driver_bind_range(struct drm_gpusvm *gpusvm, struct drm_gpusvm_range *range)
159  *	{
160  *		int err = 0;
161  *
162  *		driver_alloc_and_setup_memory_for_bind(gpusvm, range);
163  *
164  *		drm_gpusvm_notifier_lock(gpusvm);
165  *		if (drm_gpusvm_range_pages_valid(range))
166  *			driver_commit_bind(gpusvm, range);
167  *		else
168  *			err = -EAGAIN;
169  *		drm_gpusvm_notifier_unlock(gpusvm);
170  *
171  *		return err;
172  *	}
173  *
174  *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, unsigned long fault_addr,
175  *			     unsigned long gpuva_start, unsigned long gpuva_end)
176  *	{
177  *		struct drm_gpusvm_ctx ctx = {};
178  *		int err;
179  *
180  *		driver_svm_lock();
181  *	retry:
182  *		// Always process UNMAPs first so view of GPU SVM ranges is current
183  *		driver_garbage_collector(gpusvm);
184  *
185  *		range = drm_gpusvm_range_find_or_insert(gpusvm, fault_addr,
186  *							gpuva_start, gpuva_end,
187  *						        &ctx);
188  *		if (IS_ERR(range)) {
189  *			err = PTR_ERR(range);
190  *			goto unlock;
191  *		}
192  *
193  *		if (driver_migration_policy(range)) {
194  *			mmap_read_lock(mm);
 *			devmem_allocation = driver_alloc_devmem();
196  *			err = drm_gpusvm_migrate_to_devmem(gpusvm, range,
197  *							   devmem_allocation,
198  *							   &ctx);
199  *			mmap_read_unlock(mm);
200  *			if (err)	// CPU mappings may have changed
201  *				goto retry;
202  *		}
203  *
204  *		err = drm_gpusvm_range_get_pages(gpusvm, range, &ctx);
205  *		if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM) {	// CPU mappings changed
206  *			if (err == -EOPNOTSUPP)
207  *				drm_gpusvm_range_evict(gpusvm, range);
208  *			goto retry;
209  *		} else if (err) {
210  *			goto unlock;
211  *		}
212  *
213  *		err = driver_bind_range(gpusvm, range);
214  *		if (err == -EAGAIN)	// CPU mappings changed
 *			goto retry;
216  *
217  *	unlock:
218  *		driver_svm_unlock();
219  *		return err;
220  *	}
221  *
222  * 2) Garbage Collector
223  *
224  * .. code-block:: c
225  *
226  *	void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
227  *					struct drm_gpusvm_range *range)
228  *	{
229  *		assert_driver_svm_locked(gpusvm);
230  *
231  *		// Partial unmap, migrate any remaining device memory pages back to RAM
232  *		if (range->flags.partial_unmap)
233  *			drm_gpusvm_range_evict(gpusvm, range);
234  *
235  *		driver_unbind_range(range);
236  *		drm_gpusvm_range_remove(gpusvm, range);
237  *	}
238  *
239  *	void driver_garbage_collector(struct drm_gpusvm *gpusvm)
240  *	{
241  *		assert_driver_svm_locked(gpusvm);
242  *
243  *		for_each_range_in_garbage_collector(gpusvm, range)
244  *			__driver_garbage_collector(gpusvm, range);
245  *	}
246  *
247  * 3) Notifier callback
248  *
249  * .. code-block:: c
250  *
251  *	void driver_invalidation(struct drm_gpusvm *gpusvm,
252  *				 struct drm_gpusvm_notifier *notifier,
253  *				 const struct mmu_notifier_range *mmu_range)
254  *	{
255  *		struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
256  *		struct drm_gpusvm_range *range = NULL;
257  *
258  *		driver_invalidate_device_pages(gpusvm, mmu_range->start, mmu_range->end);
259  *
260  *		drm_gpusvm_for_each_range(range, notifier, mmu_range->start,
261  *					  mmu_range->end) {
262  *			drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);
263  *
264  *			if (mmu_range->event != MMU_NOTIFY_UNMAP)
265  *				continue;
266  *
267  *			drm_gpusvm_range_set_unmapped(range, mmu_range);
268  *			driver_garbage_collector_add(gpusvm, range);
269  *		}
270  *	}
271  */
272 
273 /**
274  * npages_in_range() - Calculate the number of pages in a given range
275  * @start: The start address of the range
276  * @end: The end address of the range
277  *
278  * This macro calculates the number of pages in a given memory range,
279  * specified by the start and end addresses. It divides the difference
280  * between the end and start addresses by the page size (PAGE_SIZE) to
281  * determine the number of pages in the range.
282  *
283  * Return: The number of pages in the specified range.
284  */
285 static unsigned long
286 npages_in_range(unsigned long start, unsigned long end)
287 {
288 	return (end - start) >> PAGE_SHIFT;
289 }
290 
/**
 * struct drm_gpusvm_zdd - GPU SVM zone device data
 *
 * @refcount: Reference count for the zdd
 * @devmem_allocation: device memory allocation (NULL until one is attached)
 * @device_private_page_owner: Device private pages owner
 *
 * This structure serves as a generic wrapper installed in
 * page->zone_device_data. It provides infrastructure for looking up a device
 * memory allocation upon CPU page fault and for releasing that allocation once
 * the last zdd reference is dropped: drm_gpusvm_zdd_destroy() completes the
 * allocation's 'detached' completion and calls the driver's devmem_release()
 * op, if provided.
 *
 * NOTE(review): CPU page references can be dropped in IRQ contexts, while
 * releasing device memory likely requires sleeping locks; presumably the
 * driver's devmem_release() is expected to defer such work itself - confirm.
 */
struct drm_gpusvm_zdd {
	struct kref refcount;
	struct drm_gpusvm_devmem *devmem_allocation;
	void *device_private_page_owner;
};
310 
311 /**
312  * drm_gpusvm_zdd_alloc() - Allocate a zdd structure.
313  * @device_private_page_owner: Device private pages owner
314  *
315  * This function allocates and initializes a new zdd structure. It sets up the
316  * reference count and initializes the destroy work.
317  *
318  * Return: Pointer to the allocated zdd on success, ERR_PTR() on failure.
319  */
320 static struct drm_gpusvm_zdd *
321 drm_gpusvm_zdd_alloc(void *device_private_page_owner)
322 {
323 	struct drm_gpusvm_zdd *zdd;
324 
325 	zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
326 	if (!zdd)
327 		return NULL;
328 
329 	kref_init(&zdd->refcount);
330 	zdd->devmem_allocation = NULL;
331 	zdd->device_private_page_owner = device_private_page_owner;
332 
333 	return zdd;
334 }
335 
336 /**
337  * drm_gpusvm_zdd_get() - Get a reference to a zdd structure.
338  * @zdd: Pointer to the zdd structure.
339  *
340  * This function increments the reference count of the provided zdd structure.
341  *
342  * Return: Pointer to the zdd structure.
343  */
344 static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct drm_gpusvm_zdd *zdd)
345 {
346 	kref_get(&zdd->refcount);
347 	return zdd;
348 }
349 
350 /**
351  * drm_gpusvm_zdd_destroy() - Destroy a zdd structure.
352  * @ref: Pointer to the reference count structure.
353  *
354  * This function queues the destroy_work of the zdd for asynchronous destruction.
355  */
356 static void drm_gpusvm_zdd_destroy(struct kref *ref)
357 {
358 	struct drm_gpusvm_zdd *zdd =
359 		container_of(ref, struct drm_gpusvm_zdd, refcount);
360 	struct drm_gpusvm_devmem *devmem = zdd->devmem_allocation;
361 
362 	if (devmem) {
363 		complete_all(&devmem->detached);
364 		if (devmem->ops->devmem_release)
365 			devmem->ops->devmem_release(devmem);
366 	}
367 	kfree(zdd);
368 }
369 
370 /**
371  * drm_gpusvm_zdd_put() - Put a zdd reference.
372  * @zdd: Pointer to the zdd structure.
373  *
374  * This function decrements the reference count of the provided zdd structure
375  * and schedules its destruction if the count drops to zero.
376  */
377 static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
378 {
379 	kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
380 }
381 
382 /**
383  * drm_gpusvm_range_find() - Find GPU SVM range from GPU SVM notifier
384  * @notifier: Pointer to the GPU SVM notifier structure.
385  * @start: Start address of the range
386  * @end: End address of the range
387  *
388  * Return: A pointer to the drm_gpusvm_range if found or NULL
389  */
390 struct drm_gpusvm_range *
391 drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, unsigned long start,
392 		      unsigned long end)
393 {
394 	struct interval_tree_node *itree;
395 
396 	itree = interval_tree_iter_first(&notifier->root, start, end - 1);
397 
398 	if (itree)
399 		return container_of(itree, struct drm_gpusvm_range, itree);
400 	else
401 		return NULL;
402 }
403 EXPORT_SYMBOL_GPL(drm_gpusvm_range_find);
404 
/**
 * drm_gpusvm_for_each_range_safe() - Safely iterate over GPU SVM ranges in a notifier
 * @range__: Iterator variable for the ranges
 * @next__: Iterator variable for the ranges temporary storage
 * @notifier__: Pointer to the GPU SVM notifier
 * @start__: Start address of the range
 * @end__: End address of the range (exclusive)
 *
 * This macro is used to iterate over GPU SVM ranges in a notifier while
 * removing ranges from it. The successor is fetched into @next__ before the
 * loop body runs, so the body may remove @range__. Note that the macro
 * arguments are evaluated more than once.
 */
#define drm_gpusvm_for_each_range_safe(range__, next__, notifier__, start__, end__)	\
	for ((range__) = drm_gpusvm_range_find((notifier__), (start__), (end__)),	\
	     (next__) = __drm_gpusvm_range_next(range__);				\
	     (range__) && (drm_gpusvm_range_start(range__) < (end__));			\
	     (range__) = (next__), (next__) = __drm_gpusvm_range_next(range__))
421 
422 /**
423  * __drm_gpusvm_notifier_next() - get the next drm_gpusvm_notifier in the list
424  * @notifier: a pointer to the current drm_gpusvm_notifier
425  *
426  * Return: A pointer to the next drm_gpusvm_notifier if available, or NULL if
427  *         the current notifier is the last one or if the input notifier is
428  *         NULL.
429  */
430 static struct drm_gpusvm_notifier *
431 __drm_gpusvm_notifier_next(struct drm_gpusvm_notifier *notifier)
432 {
433 	if (notifier && !list_is_last(&notifier->entry,
434 				      &notifier->gpusvm->notifier_list))
435 		return list_next_entry(notifier, entry);
436 
437 	return NULL;
438 }
439 
440 static struct drm_gpusvm_notifier *
441 notifier_iter_first(struct rb_root_cached *root, unsigned long start,
442 		    unsigned long last)
443 {
444 	struct interval_tree_node *itree;
445 
446 	itree = interval_tree_iter_first(root, start, last);
447 
448 	if (itree)
449 		return container_of(itree, struct drm_gpusvm_notifier, itree);
450 	else
451 		return NULL;
452 }
453 
/**
 * drm_gpusvm_for_each_notifier() - Iterate over GPU SVM notifiers in a gpusvm
 * @notifier__: Iterator variable for the notifiers
 * @gpusvm__: Pointer to the GPU SVM structure
 * @start__: Start address of the interval to iterate over
 * @end__: End address of the interval to iterate over (exclusive)
 *
 * This macro is used to iterate over GPU SVM notifiers in a gpusvm. The first
 * notifier is found through the interval tree (queried with @end__ - 1 as the
 * inclusive last address); subsequent ones are taken from the notifier list
 * until a notifier starts at or beyond @end__. Note that the macro arguments
 * are evaluated more than once.
 */
#define drm_gpusvm_for_each_notifier(notifier__, gpusvm__, start__, end__)		\
	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root, (start__), (end__) - 1);	\
	     (notifier__) && (drm_gpusvm_notifier_start(notifier__) < (end__));		\
	     (notifier__) = __drm_gpusvm_notifier_next(notifier__))
467 
/**
 * drm_gpusvm_for_each_notifier_safe() - Safely iterate over GPU SVM notifiers in a gpusvm
 * @notifier__: Iterator variable for the notifiers
 * @next__: Iterator variable for the notifiers temporary storage
 * @gpusvm__: Pointer to the GPU SVM structure
 * @start__: Start address of the interval to iterate over
 * @end__: End address of the interval to iterate over (exclusive)
 *
 * This macro is used to iterate over GPU SVM notifiers in a gpusvm while
 * removing notifiers from it. The successor is fetched into @next__ before
 * the loop body runs, so the body may remove @notifier__. Note that the macro
 * arguments are evaluated more than once.
 */
#define drm_gpusvm_for_each_notifier_safe(notifier__, next__, gpusvm__, start__, end__)	\
	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root, (start__), (end__) - 1),	\
	     (next__) = __drm_gpusvm_notifier_next(notifier__);				\
	     (notifier__) && (drm_gpusvm_notifier_start(notifier__) < (end__));		\
	     (notifier__) = (next__), (next__) = __drm_gpusvm_notifier_next(notifier__))
484 
485 /**
486  * drm_gpusvm_notifier_invalidate() - Invalidate a GPU SVM notifier.
487  * @mni: Pointer to the mmu_interval_notifier structure.
488  * @mmu_range: Pointer to the mmu_notifier_range structure.
489  * @cur_seq: Current sequence number.
490  *
491  * This function serves as a generic MMU notifier for GPU SVM. It sets the MMU
492  * notifier sequence number and calls the driver invalidate vfunc under
493  * gpusvm->notifier_lock.
494  *
495  * Return: true if the operation succeeds, false otherwise.
496  */
497 static bool
498 drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier *mni,
499 			       const struct mmu_notifier_range *mmu_range,
500 			       unsigned long cur_seq)
501 {
502 	struct drm_gpusvm_notifier *notifier =
503 		container_of(mni, typeof(*notifier), notifier);
504 	struct drm_gpusvm *gpusvm = notifier->gpusvm;
505 
506 	if (!mmu_notifier_range_blockable(mmu_range))
507 		return false;
508 
509 	down_write(&gpusvm->notifier_lock);
510 	mmu_interval_set_seq(mni, cur_seq);
511 	gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
512 	up_write(&gpusvm->notifier_lock);
513 
514 	return true;
515 }
516 
/*
 * drm_gpusvm_notifier_ops - MMU interval notifier operations for GPU SVM
 *
 * Routes MMU interval notifier invalidations to
 * drm_gpusvm_notifier_invalidate().
 */
static const struct mmu_interval_notifier_ops drm_gpusvm_notifier_ops = {
	.invalidate = drm_gpusvm_notifier_invalidate,
};
523 
524 /**
525  * drm_gpusvm_init() - Initialize the GPU SVM.
526  * @gpusvm: Pointer to the GPU SVM structure.
527  * @name: Name of the GPU SVM.
528  * @drm: Pointer to the DRM device structure.
529  * @mm: Pointer to the mm_struct for the address space.
530  * @device_private_page_owner: Device private pages owner.
531  * @mm_start: Start address of GPU SVM.
532  * @mm_range: Range of the GPU SVM.
533  * @notifier_size: Size of individual notifiers.
534  * @ops: Pointer to the operations structure for GPU SVM.
535  * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
536  *               Entries should be powers of 2 in descending order with last
537  *               entry being SZ_4K.
538  * @num_chunks: Number of chunks.
539  *
540  * This function initializes the GPU SVM.
541  *
542  * Return: 0 on success, a negative error code on failure.
543  */
544 int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
545 		    const char *name, struct drm_device *drm,
546 		    struct mm_struct *mm, void *device_private_page_owner,
547 		    unsigned long mm_start, unsigned long mm_range,
548 		    unsigned long notifier_size,
549 		    const struct drm_gpusvm_ops *ops,
550 		    const unsigned long *chunk_sizes, int num_chunks)
551 {
552 	if (!ops->invalidate || !num_chunks)
553 		return -EINVAL;
554 
555 	gpusvm->name = name;
556 	gpusvm->drm = drm;
557 	gpusvm->mm = mm;
558 	gpusvm->device_private_page_owner = device_private_page_owner;
559 	gpusvm->mm_start = mm_start;
560 	gpusvm->mm_range = mm_range;
561 	gpusvm->notifier_size = notifier_size;
562 	gpusvm->ops = ops;
563 	gpusvm->chunk_sizes = chunk_sizes;
564 	gpusvm->num_chunks = num_chunks;
565 
566 	mmgrab(mm);
567 	gpusvm->root = RB_ROOT_CACHED;
568 	INIT_LIST_HEAD(&gpusvm->notifier_list);
569 
570 	init_rwsem(&gpusvm->notifier_lock);
571 
572 	fs_reclaim_acquire(GFP_KERNEL);
573 	might_lock(&gpusvm->notifier_lock);
574 	fs_reclaim_release(GFP_KERNEL);
575 
576 #ifdef CONFIG_LOCKDEP
577 	gpusvm->lock_dep_map = NULL;
578 #endif
579 
580 	return 0;
581 }
582 EXPORT_SYMBOL_GPL(drm_gpusvm_init);
583 
584 /**
585  * drm_gpusvm_notifier_find() - Find GPU SVM notifier
586  * @gpusvm: Pointer to the GPU SVM structure
587  * @fault_addr: Fault address
588  *
589  * This function finds the GPU SVM notifier associated with the fault address.
590  *
591  * Return: Pointer to the GPU SVM notifier on success, NULL otherwise.
592  */
593 static struct drm_gpusvm_notifier *
594 drm_gpusvm_notifier_find(struct drm_gpusvm *gpusvm,
595 			 unsigned long fault_addr)
596 {
597 	return notifier_iter_first(&gpusvm->root, fault_addr, fault_addr + 1);
598 }
599 
600 /**
601  * to_drm_gpusvm_notifier() - retrieve the container struct for a given rbtree node
602  * @node: a pointer to the rbtree node embedded within a drm_gpusvm_notifier struct
603  *
604  * Return: A pointer to the containing drm_gpusvm_notifier structure.
605  */
606 static struct drm_gpusvm_notifier *to_drm_gpusvm_notifier(struct rb_node *node)
607 {
608 	return container_of(node, struct drm_gpusvm_notifier, itree.rb);
609 }
610 
611 /**
612  * drm_gpusvm_notifier_insert() - Insert GPU SVM notifier
613  * @gpusvm: Pointer to the GPU SVM structure
614  * @notifier: Pointer to the GPU SVM notifier structure
615  *
616  * This function inserts the GPU SVM notifier into the GPU SVM RB tree and list.
617  */
618 static void drm_gpusvm_notifier_insert(struct drm_gpusvm *gpusvm,
619 				       struct drm_gpusvm_notifier *notifier)
620 {
621 	struct rb_node *node;
622 	struct list_head *head;
623 
624 	interval_tree_insert(&notifier->itree, &gpusvm->root);
625 
626 	node = rb_prev(&notifier->itree.rb);
627 	if (node)
628 		head = &(to_drm_gpusvm_notifier(node))->entry;
629 	else
630 		head = &gpusvm->notifier_list;
631 
632 	list_add(&notifier->entry, head);
633 }
634 
635 /**
636  * drm_gpusvm_notifier_remove() - Remove GPU SVM notifier
637  * @gpusvm: Pointer to the GPU SVM tructure
638  * @notifier: Pointer to the GPU SVM notifier structure
639  *
640  * This function removes the GPU SVM notifier from the GPU SVM RB tree and list.
641  */
642 static void drm_gpusvm_notifier_remove(struct drm_gpusvm *gpusvm,
643 				       struct drm_gpusvm_notifier *notifier)
644 {
645 	interval_tree_remove(&notifier->itree, &gpusvm->root);
646 	list_del(&notifier->entry);
647 }
648 
649 /**
650  * drm_gpusvm_fini() - Finalize the GPU SVM.
651  * @gpusvm: Pointer to the GPU SVM structure.
652  *
653  * This function finalizes the GPU SVM by cleaning up any remaining ranges and
654  * notifiers, and dropping a reference to struct MM.
655  */
656 void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
657 {
658 	struct drm_gpusvm_notifier *notifier, *next;
659 
660 	drm_gpusvm_for_each_notifier_safe(notifier, next, gpusvm, 0, LONG_MAX) {
661 		struct drm_gpusvm_range *range, *__next;
662 
663 		/*
664 		 * Remove notifier first to avoid racing with any invalidation
665 		 */
666 		mmu_interval_notifier_remove(&notifier->notifier);
667 		notifier->flags.removed = true;
668 
669 		drm_gpusvm_for_each_range_safe(range, __next, notifier, 0,
670 					       LONG_MAX)
671 			drm_gpusvm_range_remove(gpusvm, range);
672 	}
673 
674 	mmdrop(gpusvm->mm);
675 	WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
676 }
677 EXPORT_SYMBOL_GPL(drm_gpusvm_fini);
678 
679 /**
680  * drm_gpusvm_notifier_alloc() - Allocate GPU SVM notifier
681  * @gpusvm: Pointer to the GPU SVM structure
682  * @fault_addr: Fault address
683  *
684  * This function allocates and initializes the GPU SVM notifier structure.
685  *
686  * Return: Pointer to the allocated GPU SVM notifier on success, ERR_PTR() on failure.
687  */
688 static struct drm_gpusvm_notifier *
689 drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, unsigned long fault_addr)
690 {
691 	struct drm_gpusvm_notifier *notifier;
692 
693 	if (gpusvm->ops->notifier_alloc)
694 		notifier = gpusvm->ops->notifier_alloc();
695 	else
696 		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
697 
698 	if (!notifier)
699 		return ERR_PTR(-ENOMEM);
700 
701 	notifier->gpusvm = gpusvm;
702 	notifier->itree.start = ALIGN_DOWN(fault_addr, gpusvm->notifier_size);
703 	notifier->itree.last = ALIGN(fault_addr + 1, gpusvm->notifier_size) - 1;
704 	INIT_LIST_HEAD(&notifier->entry);
705 	notifier->root = RB_ROOT_CACHED;
706 	INIT_LIST_HEAD(&notifier->range_list);
707 
708 	return notifier;
709 }
710 
711 /**
712  * drm_gpusvm_notifier_free() - Free GPU SVM notifier
713  * @gpusvm: Pointer to the GPU SVM structure
714  * @notifier: Pointer to the GPU SVM notifier structure
715  *
716  * This function frees the GPU SVM notifier structure.
717  */
718 static void drm_gpusvm_notifier_free(struct drm_gpusvm *gpusvm,
719 				     struct drm_gpusvm_notifier *notifier)
720 {
721 	WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
722 
723 	if (gpusvm->ops->notifier_free)
724 		gpusvm->ops->notifier_free(notifier);
725 	else
726 		kfree(notifier);
727 }
728 
729 /**
730  * to_drm_gpusvm_range() - retrieve the container struct for a given rbtree node
731  * @node: a pointer to the rbtree node embedded within a drm_gpusvm_range struct
732  *
733  * Return: A pointer to the containing drm_gpusvm_range structure.
734  */
735 static struct drm_gpusvm_range *to_drm_gpusvm_range(struct rb_node *node)
736 {
737 	return container_of(node, struct drm_gpusvm_range, itree.rb);
738 }
739 
740 /**
741  * drm_gpusvm_range_insert() - Insert GPU SVM range
742  * @notifier: Pointer to the GPU SVM notifier structure
743  * @range: Pointer to the GPU SVM range structure
744  *
745  * This function inserts the GPU SVM range into the notifier RB tree and list.
746  */
747 static void drm_gpusvm_range_insert(struct drm_gpusvm_notifier *notifier,
748 				    struct drm_gpusvm_range *range)
749 {
750 	struct rb_node *node;
751 	struct list_head *head;
752 
753 	drm_gpusvm_notifier_lock(notifier->gpusvm);
754 	interval_tree_insert(&range->itree, &notifier->root);
755 
756 	node = rb_prev(&range->itree.rb);
757 	if (node)
758 		head = &(to_drm_gpusvm_range(node))->entry;
759 	else
760 		head = &notifier->range_list;
761 
762 	list_add(&range->entry, head);
763 	drm_gpusvm_notifier_unlock(notifier->gpusvm);
764 }
765 
766 /**
767  * __drm_gpusvm_range_remove() - Remove GPU SVM range
768  * @notifier: Pointer to the GPU SVM notifier structure
769  * @range: Pointer to the GPU SVM range structure
770  *
771  * This macro removes the GPU SVM range from the notifier RB tree and list.
772  */
773 static void __drm_gpusvm_range_remove(struct drm_gpusvm_notifier *notifier,
774 				      struct drm_gpusvm_range *range)
775 {
776 	interval_tree_remove(&range->itree, &notifier->root);
777 	list_del(&range->entry);
778 }
779 
780 /**
781  * drm_gpusvm_range_alloc() - Allocate GPU SVM range
782  * @gpusvm: Pointer to the GPU SVM structure
783  * @notifier: Pointer to the GPU SVM notifier structure
784  * @fault_addr: Fault address
785  * @chunk_size: Chunk size
786  * @migrate_devmem: Flag indicating whether to migrate device memory
787  *
788  * This function allocates and initializes the GPU SVM range structure.
789  *
790  * Return: Pointer to the allocated GPU SVM range on success, ERR_PTR() on failure.
791  */
792 static struct drm_gpusvm_range *
793 drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
794 		       struct drm_gpusvm_notifier *notifier,
795 		       unsigned long fault_addr, unsigned long chunk_size,
796 		       bool migrate_devmem)
797 {
798 	struct drm_gpusvm_range *range;
799 
800 	if (gpusvm->ops->range_alloc)
801 		range = gpusvm->ops->range_alloc(gpusvm);
802 	else
803 		range = kzalloc(sizeof(*range), GFP_KERNEL);
804 
805 	if (!range)
806 		return ERR_PTR(-ENOMEM);
807 
808 	kref_init(&range->refcount);
809 	range->gpusvm = gpusvm;
810 	range->notifier = notifier;
811 	range->itree.start = ALIGN_DOWN(fault_addr, chunk_size);
812 	range->itree.last = ALIGN(fault_addr + 1, chunk_size) - 1;
813 	INIT_LIST_HEAD(&range->entry);
814 	range->notifier_seq = LONG_MAX;
815 	range->flags.migrate_devmem = migrate_devmem ? 1 : 0;
816 
817 	return range;
818 }
819 
820 /**
821  * drm_gpusvm_check_pages() - Check pages
822  * @gpusvm: Pointer to the GPU SVM structure
823  * @notifier: Pointer to the GPU SVM notifier structure
824  * @start: Start address
825  * @end: End address
826  *
827  * Check if pages between start and end have been faulted in on the CPU. Use to
828  * prevent migration of pages without CPU backing store.
829  *
830  * Return: True if pages have been faulted into CPU, False otherwise
831  */
832 static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
833 				   struct drm_gpusvm_notifier *notifier,
834 				   unsigned long start, unsigned long end)
835 {
836 	struct hmm_range hmm_range = {
837 		.default_flags = 0,
838 		.notifier = &notifier->notifier,
839 		.start = start,
840 		.end = end,
841 		.dev_private_owner = gpusvm->device_private_page_owner,
842 	};
843 	unsigned long timeout =
844 		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
845 	unsigned long *pfns;
846 	unsigned long npages = npages_in_range(start, end);
847 	int err, i;
848 
849 	mmap_assert_locked(gpusvm->mm);
850 
851 	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
852 	if (!pfns)
853 		return false;
854 
855 	hmm_range.notifier_seq = mmu_interval_read_begin(&notifier->notifier);
856 	hmm_range.hmm_pfns = pfns;
857 
858 	while (true) {
859 		err = hmm_range_fault(&hmm_range);
860 		if (err == -EBUSY) {
861 			if (time_after(jiffies, timeout))
862 				break;
863 
864 			hmm_range.notifier_seq =
865 				mmu_interval_read_begin(&notifier->notifier);
866 			continue;
867 		}
868 		break;
869 	}
870 	if (err)
871 		goto err_free;
872 
873 	for (i = 0; i < npages;) {
874 		if (!(pfns[i] & HMM_PFN_VALID)) {
875 			err = -EFAULT;
876 			goto err_free;
877 		}
878 		i += 0x1 << hmm_pfn_to_map_order(pfns[i]);
879 	}
880 
881 err_free:
882 	kvfree(pfns);
883 	return err ? false : true;
884 }
885 
886 /**
887  * drm_gpusvm_range_chunk_size() - Determine chunk size for GPU SVM range
888  * @gpusvm: Pointer to the GPU SVM structure
889  * @notifier: Pointer to the GPU SVM notifier structure
890  * @vas: Pointer to the virtual memory area structure
891  * @fault_addr: Fault address
892  * @gpuva_start: Start address of GPUVA which mirrors CPU
893  * @gpuva_end: End address of GPUVA which mirrors CPU
894  * @check_pages_threshold: Check CPU pages for present threshold
895  *
896  * This function determines the chunk size for the GPU SVM range based on the
897  * fault address, GPU SVM chunk sizes, existing GPU SVM ranges, and the virtual
898  * memory area boundaries.
899  *
900  * Return: Chunk size on success, LONG_MAX on failure.
901  */
902 static unsigned long
903 drm_gpusvm_range_chunk_size(struct drm_gpusvm *gpusvm,
904 			    struct drm_gpusvm_notifier *notifier,
905 			    struct vm_area_struct *vas,
906 			    unsigned long fault_addr,
907 			    unsigned long gpuva_start,
908 			    unsigned long gpuva_end,
909 			    unsigned long check_pages_threshold)
910 {
911 	unsigned long start, end;
912 	int i = 0;
913 
914 retry:
915 	for (; i < gpusvm->num_chunks; ++i) {
916 		start = ALIGN_DOWN(fault_addr, gpusvm->chunk_sizes[i]);
917 		end = ALIGN(fault_addr + 1, gpusvm->chunk_sizes[i]);
918 
919 		if (start >= vas->vm_start && end <= vas->vm_end &&
920 		    start >= drm_gpusvm_notifier_start(notifier) &&
921 		    end <= drm_gpusvm_notifier_end(notifier) &&
922 		    start >= gpuva_start && end <= gpuva_end)
923 			break;
924 	}
925 
926 	if (i == gpusvm->num_chunks)
927 		return LONG_MAX;
928 
929 	/*
930 	 * If allocation more than page, ensure not to overlap with existing
931 	 * ranges.
932 	 */
933 	if (end - start != SZ_4K) {
934 		struct drm_gpusvm_range *range;
935 
936 		range = drm_gpusvm_range_find(notifier, start, end);
937 		if (range) {
938 			++i;
939 			goto retry;
940 		}
941 
942 		/*
943 		 * XXX: Only create range on pages CPU has faulted in. Without
944 		 * this check, or prefault, on BMG 'xe_exec_system_allocator --r
945 		 * process-many-malloc' fails. In the failure case, each process
946 		 * mallocs 16k but the CPU VMA is ~128k which results in 64k SVM
947 		 * ranges. When migrating the SVM ranges, some processes fail in
948 		 * drm_gpusvm_migrate_to_devmem with 'migrate.cpages != npages'
949 		 * and then upon drm_gpusvm_range_get_pages device pages from
950 		 * other processes are collected + faulted in which creates all
951 		 * sorts of problems. Unsure exactly how this happening, also
952 		 * problem goes away if 'xe_exec_system_allocator --r
953 		 * process-many-malloc' mallocs at least 64k at a time.
954 		 */
955 		if (end - start <= check_pages_threshold &&
956 		    !drm_gpusvm_check_pages(gpusvm, notifier, start, end)) {
957 			++i;
958 			goto retry;
959 		}
960 	}
961 
962 	return end - start;
963 }
964 
#ifdef CONFIG_LOCKDEP
/**
 * drm_gpusvm_driver_lock_held() - Assert GPU SVM driver lock is held
 * @gpusvm: Pointer to the GPU SVM structure.
 *
 * Lockdep-only assertion that the driver lock registered in
 * @gpusvm->lock_dep_map is held. Does nothing when no lockdep map is set.
 */
static void drm_gpusvm_driver_lock_held(struct drm_gpusvm *gpusvm)
{
	if (!gpusvm->lock_dep_map)
		return;

	lockdep_assert(lock_is_held_type(gpusvm->lock_dep_map, 0));
}
#else
/* Without lockdep there is nothing to check. */
static void drm_gpusvm_driver_lock_held(struct drm_gpusvm *gpusvm)
{
}
#endif
982 
983 /**
984  * drm_gpusvm_find_vma_start() - Find start address for first VMA in range
985  * @gpusvm: Pointer to the GPU SVM structure
986  * @start: The inclusive start user address.
987  * @end: The exclusive end user address.
988  *
989  * Returns: The start address of first VMA within the provided range,
990  * ULONG_MAX otherwise. Assumes start_addr < end_addr.
991  */
992 unsigned long
993 drm_gpusvm_find_vma_start(struct drm_gpusvm *gpusvm,
994 			  unsigned long start,
995 			  unsigned long end)
996 {
997 	struct mm_struct *mm = gpusvm->mm;
998 	struct vm_area_struct *vma;
999 	unsigned long addr = ULONG_MAX;
1000 
1001 	if (!mmget_not_zero(mm))
1002 		return addr;
1003 
1004 	mmap_read_lock(mm);
1005 
1006 	vma = find_vma_intersection(mm, start, end);
1007 	if (vma)
1008 		addr =  vma->vm_start;
1009 
1010 	mmap_read_unlock(mm);
1011 	mmput(mm);
1012 
1013 	return addr;
1014 }
1015 EXPORT_SYMBOL_GPL(drm_gpusvm_find_vma_start);
1016 
/**
 * drm_gpusvm_range_find_or_insert() - Find or insert GPU SVM range
 * @gpusvm: Pointer to the GPU SVM structure
 * @fault_addr: Fault address
 * @gpuva_start: Start address of GPUVA which mirrors CPU
 * @gpuva_end: End address of GPUVA which mirrors CPU
 * @ctx: GPU SVM context
 *
 * This function finds an existing GPU SVM range covering the fault address or,
 * if there is none, allocates and inserts a new one. A notifier covering the
 * fault address is allocated and registered on demand; it is only added to the
 * GPU SVM tracking structures once a range has been inserted into it, and it
 * is torn down again on any failure. Caller must hold a lock to protect range
 * lookup and insertion.
 *
 * Return: Pointer to the GPU SVM range on success, ERR_PTR() on failure.
 */
struct drm_gpusvm_range *
drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm,
				unsigned long fault_addr,
				unsigned long gpuva_start,
				unsigned long gpuva_end,
				const struct drm_gpusvm_ctx *ctx)
{
	struct drm_gpusvm_notifier *notifier;
	struct drm_gpusvm_range *range;
	struct mm_struct *mm = gpusvm->mm;
	struct vm_area_struct *vas;
	/* True only when this call created the notifier (drives the unwind). */
	bool notifier_alloc = false;
	unsigned long chunk_size;
	int err;
	bool migrate_devmem;

	drm_gpusvm_driver_lock_held(gpusvm);

	/*
	 * NOTE(review): the upper bound is inclusive here; if
	 * mm_start + mm_range is an exclusive end this admits one address too
	 * many - confirm against the notifier lookup/alloc behaviour.
	 */
	if (fault_addr < gpusvm->mm_start ||
	    fault_addr > gpusvm->mm_start + gpusvm->mm_range)
		return ERR_PTR(-EINVAL);

	/* Hold an mm reference for the rest of the function. */
	if (!mmget_not_zero(mm))
		return ERR_PTR(-EFAULT);

	notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr);
	if (!notifier) {
		notifier = drm_gpusvm_notifier_alloc(gpusvm, fault_addr);
		if (IS_ERR(notifier)) {
			err = PTR_ERR(notifier);
			goto err_mmunlock;
		}
		notifier_alloc = true;
		/* Registered before the mmap lock is taken below. */
		err = mmu_interval_notifier_insert(&notifier->notifier,
						   mm,
						   drm_gpusvm_notifier_start(notifier),
						   drm_gpusvm_notifier_size(notifier),
						   &drm_gpusvm_notifier_ops);
		if (err)
			goto err_notifier;
	}

	mmap_read_lock(mm);

	vas = vma_lookup(mm, fault_addr);
	if (!vas) {
		err = -ENOENT;
		goto err_notifier_remove;
	}

	/* A writable mapping was requested but the VMA is not writable. */
	if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
		err = -EPERM;
		goto err_notifier_remove;
	}

	/* Existing range covering the fault address: return it as is. */
	range = drm_gpusvm_range_find(notifier, fault_addr, fault_addr + 1);
	if (range)
		goto out_mmunlock;
	/*
	 * XXX: Short-circuiting migration based on migrate_vma_* current
	 * limitations. If/when migrate_vma_* add more support, this logic will
	 * have to change.
	 */
	migrate_devmem = ctx->devmem_possible &&
		vma_is_anonymous(vas) && !is_vm_hugetlb_page(vas);

	chunk_size = drm_gpusvm_range_chunk_size(gpusvm, notifier, vas,
						 fault_addr, gpuva_start,
						 gpuva_end,
						 ctx->check_pages_threshold);
	/* LONG_MAX is the "no suitable chunk" sentinel. */
	if (chunk_size == LONG_MAX) {
		err = -EINVAL;
		goto err_notifier_remove;
	}

	range = drm_gpusvm_range_alloc(gpusvm, notifier, fault_addr, chunk_size,
				       migrate_devmem);
	if (IS_ERR(range)) {
		err = PTR_ERR(range);
		goto err_notifier_remove;
	}

	drm_gpusvm_range_insert(notifier, range);
	/* Only now does a freshly created notifier become visible in gpusvm. */
	if (notifier_alloc)
		drm_gpusvm_notifier_insert(gpusvm, notifier);

out_mmunlock:
	mmap_read_unlock(mm);
	mmput(mm);

	return range;

	/* Unwind in reverse order; pre-existing notifiers are left untouched. */
err_notifier_remove:
	mmap_read_unlock(mm);
	if (notifier_alloc)
		mmu_interval_notifier_remove(&notifier->notifier);
err_notifier:
	if (notifier_alloc)
		drm_gpusvm_notifier_free(gpusvm, notifier);
err_mmunlock:
	mmput(mm);
	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(drm_gpusvm_range_find_or_insert);
1134 
1135 /**
1136  * __drm_gpusvm_range_unmap_pages() - Unmap pages associated with a GPU SVM range (internal)
1137  * @gpusvm: Pointer to the GPU SVM structure
1138  * @range: Pointer to the GPU SVM range structure
1139  * @npages: Number of pages to unmap
1140  *
1141  * This function unmap pages associated with a GPU SVM range. Assumes and
1142  * asserts correct locking is in place when called.
1143  */
1144 static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
1145 					   struct drm_gpusvm_range *range,
1146 					   unsigned long npages)
1147 {
1148 	unsigned long i, j;
1149 	struct drm_pagemap *dpagemap = range->dpagemap;
1150 	struct device *dev = gpusvm->drm->dev;
1151 
1152 	lockdep_assert_held(&gpusvm->notifier_lock);
1153 
1154 	if (range->flags.has_dma_mapping) {
1155 		struct drm_gpusvm_range_flags flags = {
1156 			.__flags = range->flags.__flags,
1157 		};
1158 
1159 		for (i = 0, j = 0; i < npages; j++) {
1160 			struct drm_pagemap_device_addr *addr = &range->dma_addr[j];
1161 
1162 			if (addr->proto == DRM_INTERCONNECT_SYSTEM)
1163 				dma_unmap_page(dev,
1164 					       addr->addr,
1165 					       PAGE_SIZE << addr->order,
1166 					       addr->dir);
1167 			else if (dpagemap && dpagemap->ops->device_unmap)
1168 				dpagemap->ops->device_unmap(dpagemap,
1169 							    dev, *addr);
1170 			i += 1 << addr->order;
1171 		}
1172 
1173 		/* WRITE_ONCE pairs with READ_ONCE for opportunistic checks */
1174 		flags.has_devmem_pages = false;
1175 		flags.has_dma_mapping = false;
1176 		WRITE_ONCE(range->flags.__flags, flags.__flags);
1177 
1178 		range->dpagemap = NULL;
1179 	}
1180 }
1181 
1182 /**
1183  * drm_gpusvm_range_free_pages() - Free pages associated with a GPU SVM range
1184  * @gpusvm: Pointer to the GPU SVM structure
1185  * @range: Pointer to the GPU SVM range structure
1186  *
1187  * This function frees the dma address array associated with a GPU SVM range.
1188  */
1189 static void drm_gpusvm_range_free_pages(struct drm_gpusvm *gpusvm,
1190 					struct drm_gpusvm_range *range)
1191 {
1192 	lockdep_assert_held(&gpusvm->notifier_lock);
1193 
1194 	if (range->dma_addr) {
1195 		kvfree(range->dma_addr);
1196 		range->dma_addr = NULL;
1197 	}
1198 }
1199 
1200 /**
1201  * drm_gpusvm_range_remove() - Remove GPU SVM range
1202  * @gpusvm: Pointer to the GPU SVM structure
1203  * @range: Pointer to the GPU SVM range to be removed
1204  *
1205  * This function removes the specified GPU SVM range and also removes the parent
1206  * GPU SVM notifier if no more ranges remain in the notifier. The caller must
1207  * hold a lock to protect range and notifier removal.
1208  */
1209 void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
1210 			     struct drm_gpusvm_range *range)
1211 {
1212 	unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
1213 					       drm_gpusvm_range_end(range));
1214 	struct drm_gpusvm_notifier *notifier;
1215 
1216 	drm_gpusvm_driver_lock_held(gpusvm);
1217 
1218 	notifier = drm_gpusvm_notifier_find(gpusvm,
1219 					    drm_gpusvm_range_start(range));
1220 	if (WARN_ON_ONCE(!notifier))
1221 		return;
1222 
1223 	drm_gpusvm_notifier_lock(gpusvm);
1224 	__drm_gpusvm_range_unmap_pages(gpusvm, range, npages);
1225 	drm_gpusvm_range_free_pages(gpusvm, range);
1226 	__drm_gpusvm_range_remove(notifier, range);
1227 	drm_gpusvm_notifier_unlock(gpusvm);
1228 
1229 	drm_gpusvm_range_put(range);
1230 
1231 	if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
1232 		if (!notifier->flags.removed)
1233 			mmu_interval_notifier_remove(&notifier->notifier);
1234 		drm_gpusvm_notifier_remove(gpusvm, notifier);
1235 		drm_gpusvm_notifier_free(gpusvm, notifier);
1236 	}
1237 }
1238 EXPORT_SYMBOL_GPL(drm_gpusvm_range_remove);
1239 
1240 /**
1241  * drm_gpusvm_range_get() - Get a reference to GPU SVM range
1242  * @range: Pointer to the GPU SVM range
1243  *
1244  * This function increments the reference count of the specified GPU SVM range.
1245  *
1246  * Return: Pointer to the GPU SVM range.
1247  */
1248 struct drm_gpusvm_range *
1249 drm_gpusvm_range_get(struct drm_gpusvm_range *range)
1250 {
1251 	kref_get(&range->refcount);
1252 
1253 	return range;
1254 }
1255 EXPORT_SYMBOL_GPL(drm_gpusvm_range_get);
1256 
1257 /**
1258  * drm_gpusvm_range_destroy() - Destroy GPU SVM range
1259  * @refcount: Pointer to the reference counter embedded in the GPU SVM range
1260  *
1261  * This function destroys the specified GPU SVM range when its reference count
1262  * reaches zero. If a custom range-free function is provided, it is invoked to
1263  * free the range; otherwise, the range is deallocated using kfree().
1264  */
1265 static void drm_gpusvm_range_destroy(struct kref *refcount)
1266 {
1267 	struct drm_gpusvm_range *range =
1268 		container_of(refcount, struct drm_gpusvm_range, refcount);
1269 	struct drm_gpusvm *gpusvm = range->gpusvm;
1270 
1271 	if (gpusvm->ops->range_free)
1272 		gpusvm->ops->range_free(range);
1273 	else
1274 		kfree(range);
1275 }
1276 
1277 /**
1278  * drm_gpusvm_range_put() - Put a reference to GPU SVM range
1279  * @range: Pointer to the GPU SVM range
1280  *
1281  * This function decrements the reference count of the specified GPU SVM range
1282  * and frees it when the count reaches zero.
1283  */
1284 void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
1285 {
1286 	kref_put(&range->refcount, drm_gpusvm_range_destroy);
1287 }
1288 EXPORT_SYMBOL_GPL(drm_gpusvm_range_put);
1289 
1290 /**
1291  * drm_gpusvm_range_pages_valid() - GPU SVM range pages valid
1292  * @gpusvm: Pointer to the GPU SVM structure
1293  * @range: Pointer to the GPU SVM range structure
1294  *
1295  * This function determines if a GPU SVM range pages are valid. Expected be
1296  * called holding gpusvm->notifier_lock and as the last step before committing a
1297  * GPU binding. This is akin to a notifier seqno check in the HMM documentation
1298  * but due to wider notifiers (i.e., notifiers which span multiple ranges) this
1299  * function is required for finer grained checking (i.e., per range) if pages
1300  * are valid.
1301  *
1302  * Return: True if GPU SVM range has valid pages, False otherwise
1303  */
1304 bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
1305 				  struct drm_gpusvm_range *range)
1306 {
1307 	lockdep_assert_held(&gpusvm->notifier_lock);
1308 
1309 	return range->flags.has_devmem_pages || range->flags.has_dma_mapping;
1310 }
1311 EXPORT_SYMBOL_GPL(drm_gpusvm_range_pages_valid);
1312 
1313 /**
1314  * drm_gpusvm_range_pages_valid_unlocked() - GPU SVM range pages valid unlocked
1315  * @gpusvm: Pointer to the GPU SVM structure
1316  * @range: Pointer to the GPU SVM range structure
1317  *
1318  * This function determines if a GPU SVM range pages are valid. Expected be
1319  * called without holding gpusvm->notifier_lock.
1320  *
1321  * Return: True if GPU SVM range has valid pages, False otherwise
1322  */
1323 static bool
1324 drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm *gpusvm,
1325 				      struct drm_gpusvm_range *range)
1326 {
1327 	bool pages_valid;
1328 
1329 	if (!range->dma_addr)
1330 		return false;
1331 
1332 	drm_gpusvm_notifier_lock(gpusvm);
1333 	pages_valid = drm_gpusvm_range_pages_valid(gpusvm, range);
1334 	if (!pages_valid)
1335 		drm_gpusvm_range_free_pages(gpusvm, range);
1336 	drm_gpusvm_notifier_unlock(gpusvm);
1337 
1338 	return pages_valid;
1339 }
1340 
1341 /**
1342  * drm_gpusvm_range_get_pages() - Get pages for a GPU SVM range
1343  * @gpusvm: Pointer to the GPU SVM structure
1344  * @range: Pointer to the GPU SVM range structure
1345  * @ctx: GPU SVM context
1346  *
1347  * This function gets pages for a GPU SVM range and ensures they are mapped for
1348  * DMA access.
1349  *
1350  * Return: 0 on success, negative error code on failure.
1351  */
1352 int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
1353 			       struct drm_gpusvm_range *range,
1354 			       const struct drm_gpusvm_ctx *ctx)
1355 {
1356 	struct mmu_interval_notifier *notifier = &range->notifier->notifier;
1357 	struct hmm_range hmm_range = {
1358 		.default_flags = HMM_PFN_REQ_FAULT | (ctx->read_only ? 0 :
1359 			HMM_PFN_REQ_WRITE),
1360 		.notifier = notifier,
1361 		.start = drm_gpusvm_range_start(range),
1362 		.end = drm_gpusvm_range_end(range),
1363 		.dev_private_owner = gpusvm->device_private_page_owner,
1364 	};
1365 	struct mm_struct *mm = gpusvm->mm;
1366 	struct drm_gpusvm_zdd *zdd;
1367 	unsigned long timeout =
1368 		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
1369 	unsigned long i, j;
1370 	unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
1371 					       drm_gpusvm_range_end(range));
1372 	unsigned long num_dma_mapped;
1373 	unsigned int order = 0;
1374 	unsigned long *pfns;
1375 	struct page **pages;
1376 	int err = 0;
1377 	struct dev_pagemap *pagemap;
1378 	struct drm_pagemap *dpagemap;
1379 	struct drm_gpusvm_range_flags flags;
1380 
1381 retry:
1382 	hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
1383 	if (drm_gpusvm_range_pages_valid_unlocked(gpusvm, range))
1384 		goto set_seqno;
1385 
1386 	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
1387 	if (!pfns)
1388 		return -ENOMEM;
1389 
1390 	if (!mmget_not_zero(mm)) {
1391 		err = -EFAULT;
1392 		goto err_free;
1393 	}
1394 
1395 	hmm_range.hmm_pfns = pfns;
1396 	while (true) {
1397 		mmap_read_lock(mm);
1398 		err = hmm_range_fault(&hmm_range);
1399 		mmap_read_unlock(mm);
1400 
1401 		if (err == -EBUSY) {
1402 			if (time_after(jiffies, timeout))
1403 				break;
1404 
1405 			hmm_range.notifier_seq =
1406 				mmu_interval_read_begin(notifier);
1407 			continue;
1408 		}
1409 		break;
1410 	}
1411 	mmput(mm);
1412 	if (err)
1413 		goto err_free;
1414 
1415 	pages = (struct page **)pfns;
1416 map_pages:
1417 	/*
1418 	 * Perform all dma mappings under the notifier lock to not
1419 	 * access freed pages. A notifier will either block on
1420 	 * the notifier lock or unmap dma.
1421 	 */
1422 	drm_gpusvm_notifier_lock(gpusvm);
1423 
1424 	flags.__flags = range->flags.__flags;
1425 	if (flags.unmapped) {
1426 		drm_gpusvm_notifier_unlock(gpusvm);
1427 		err = -EFAULT;
1428 		goto err_free;
1429 	}
1430 
1431 	if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
1432 		drm_gpusvm_notifier_unlock(gpusvm);
1433 		kvfree(pfns);
1434 		goto retry;
1435 	}
1436 
1437 	if (!range->dma_addr) {
1438 		/* Unlock and restart mapping to allocate memory. */
1439 		drm_gpusvm_notifier_unlock(gpusvm);
1440 		range->dma_addr = kvmalloc_array(npages,
1441 						 sizeof(*range->dma_addr),
1442 						 GFP_KERNEL);
1443 		if (!range->dma_addr) {
1444 			err = -ENOMEM;
1445 			goto err_free;
1446 		}
1447 		goto map_pages;
1448 	}
1449 
1450 	zdd = NULL;
1451 	num_dma_mapped = 0;
1452 	for (i = 0, j = 0; i < npages; ++j) {
1453 		struct page *page = hmm_pfn_to_page(pfns[i]);
1454 
1455 		order = hmm_pfn_to_map_order(pfns[i]);
1456 		if (is_device_private_page(page) ||
1457 		    is_device_coherent_page(page)) {
1458 			if (zdd != page->zone_device_data && i > 0) {
1459 				err = -EOPNOTSUPP;
1460 				goto err_unmap;
1461 			}
1462 			zdd = page->zone_device_data;
1463 			if (pagemap != page_pgmap(page)) {
1464 				if (i > 0) {
1465 					err = -EOPNOTSUPP;
1466 					goto err_unmap;
1467 				}
1468 
1469 				pagemap = page_pgmap(page);
1470 				dpagemap = zdd->devmem_allocation->dpagemap;
1471 				if (drm_WARN_ON(gpusvm->drm, !dpagemap)) {
1472 					/*
1473 					 * Raced. This is not supposed to happen
1474 					 * since hmm_range_fault() should've migrated
1475 					 * this page to system.
1476 					 */
1477 					err = -EAGAIN;
1478 					goto err_unmap;
1479 				}
1480 			}
1481 			range->dma_addr[j] =
1482 				dpagemap->ops->device_map(dpagemap,
1483 							  gpusvm->drm->dev,
1484 							  page, order,
1485 							  DMA_BIDIRECTIONAL);
1486 			if (dma_mapping_error(gpusvm->drm->dev,
1487 					      range->dma_addr[j].addr)) {
1488 				err = -EFAULT;
1489 				goto err_unmap;
1490 			}
1491 
1492 			pages[i] = page;
1493 		} else {
1494 			dma_addr_t addr;
1495 
1496 			if (is_zone_device_page(page) || zdd) {
1497 				err = -EOPNOTSUPP;
1498 				goto err_unmap;
1499 			}
1500 
1501 			if (ctx->devmem_only) {
1502 				err = -EFAULT;
1503 				goto err_unmap;
1504 			}
1505 
1506 			addr = dma_map_page(gpusvm->drm->dev,
1507 					    page, 0,
1508 					    PAGE_SIZE << order,
1509 					    DMA_BIDIRECTIONAL);
1510 			if (dma_mapping_error(gpusvm->drm->dev, addr)) {
1511 				err = -EFAULT;
1512 				goto err_unmap;
1513 			}
1514 
1515 			range->dma_addr[j] = drm_pagemap_device_addr_encode
1516 				(addr, DRM_INTERCONNECT_SYSTEM, order,
1517 				 DMA_BIDIRECTIONAL);
1518 		}
1519 		i += 1 << order;
1520 		num_dma_mapped = i;
1521 		flags.has_dma_mapping = true;
1522 	}
1523 
1524 	if (zdd) {
1525 		flags.has_devmem_pages = true;
1526 		range->dpagemap = dpagemap;
1527 	}
1528 
1529 	/* WRITE_ONCE pairs with READ_ONCE for opportunistic checks */
1530 	WRITE_ONCE(range->flags.__flags, flags.__flags);
1531 
1532 	drm_gpusvm_notifier_unlock(gpusvm);
1533 	kvfree(pfns);
1534 set_seqno:
1535 	range->notifier_seq = hmm_range.notifier_seq;
1536 
1537 	return 0;
1538 
1539 err_unmap:
1540 	__drm_gpusvm_range_unmap_pages(gpusvm, range, num_dma_mapped);
1541 	drm_gpusvm_notifier_unlock(gpusvm);
1542 err_free:
1543 	kvfree(pfns);
1544 	if (err == -EAGAIN)
1545 		goto retry;
1546 	return err;
1547 }
1548 EXPORT_SYMBOL_GPL(drm_gpusvm_range_get_pages);
1549 
1550 /**
1551  * drm_gpusvm_range_unmap_pages() - Unmap pages associated with a GPU SVM range
1552  * @gpusvm: Pointer to the GPU SVM structure
1553  * @range: Pointer to the GPU SVM range structure
1554  * @ctx: GPU SVM context
1555  *
1556  * This function unmaps pages associated with a GPU SVM range. If @in_notifier
1557  * is set, it is assumed that gpusvm->notifier_lock is held in write mode; if it
1558  * is clear, it acquires gpusvm->notifier_lock in read mode. Must be called on
1559  * each GPU SVM range attached to notifier in gpusvm->ops->invalidate for IOMMU
1560  * security model.
1561  */
1562 void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
1563 				  struct drm_gpusvm_range *range,
1564 				  const struct drm_gpusvm_ctx *ctx)
1565 {
1566 	unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
1567 					       drm_gpusvm_range_end(range));
1568 
1569 	if (ctx->in_notifier)
1570 		lockdep_assert_held_write(&gpusvm->notifier_lock);
1571 	else
1572 		drm_gpusvm_notifier_lock(gpusvm);
1573 
1574 	__drm_gpusvm_range_unmap_pages(gpusvm, range, npages);
1575 
1576 	if (!ctx->in_notifier)
1577 		drm_gpusvm_notifier_unlock(gpusvm);
1578 }
1579 EXPORT_SYMBOL_GPL(drm_gpusvm_range_unmap_pages);
1580 
/**
 * drm_gpusvm_migration_unlock_put_page() - Put a migration page
 * @page: Pointer to the page to put
 *
 * Unlock @page and then drop a reference on it.
 */
static void drm_gpusvm_migration_unlock_put_page(struct page *page)
{
	/* Unlock first, then release the reference. */
	unlock_page(page);
	put_page(page);
}
1592 
/**
 * drm_gpusvm_migration_unlock_put_pages() - Put migration pages
 * @npages: Number of pages
 * @migrate_pfn: Array of migrate page frame numbers
 *
 * Unlock and put the page behind every non-zero entry of @migrate_pfn, and
 * zero each entry that was processed.
 */
static void drm_gpusvm_migration_unlock_put_pages(unsigned long npages,
						  unsigned long *migrate_pfn)
{
	unsigned long idx;

	for (idx = 0; idx < npages; ++idx) {
		unsigned long *entry = &migrate_pfn[idx];

		if (*entry) {
			drm_gpusvm_migration_unlock_put_page(migrate_pfn_to_page(*entry));
			*entry = 0;
		}
	}
}
1616 
1617 /**
1618  * drm_gpusvm_get_devmem_page() - Get a reference to a device memory page
1619  * @page: Pointer to the page
1620  * @zdd: Pointer to the GPU SVM zone device data
1621  *
1622  * This function associates the given page with the specified GPU SVM zone
1623  * device data and initializes it for zone device usage.
1624  */
1625 static void drm_gpusvm_get_devmem_page(struct page *page,
1626 				       struct drm_gpusvm_zdd *zdd)
1627 {
1628 	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
1629 	zone_device_page_init(page);
1630 }
1631 
1632 /**
1633  * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM migration
1634  * @dev: The device for which the pages are being mapped
1635  * @dma_addr: Array to store DMA addresses corresponding to mapped pages
1636  * @migrate_pfn: Array of migrate page frame numbers to map
1637  * @npages: Number of pages to map
1638  * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
1639  *
1640  * This function maps pages of memory for migration usage in GPU SVM. It
1641  * iterates over each page frame number provided in @migrate_pfn, maps the
1642  * corresponding page, and stores the DMA address in the provided @dma_addr
1643  * array.
1644  *
1645  * Return: 0 on success, -EFAULT if an error occurs during mapping.
1646  */
1647 static int drm_gpusvm_migrate_map_pages(struct device *dev,
1648 					dma_addr_t *dma_addr,
1649 					unsigned long *migrate_pfn,
1650 					unsigned long npages,
1651 					enum dma_data_direction dir)
1652 {
1653 	unsigned long i;
1654 
1655 	for (i = 0; i < npages; ++i) {
1656 		struct page *page = migrate_pfn_to_page(migrate_pfn[i]);
1657 
1658 		if (!page)
1659 			continue;
1660 
1661 		if (WARN_ON_ONCE(is_zone_device_page(page)))
1662 			return -EFAULT;
1663 
1664 		dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
1665 		if (dma_mapping_error(dev, dma_addr[i]))
1666 			return -EFAULT;
1667 	}
1668 
1669 	return 0;
1670 }
1671 
1672 /**
1673  * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped for GPU SVM migration
1674  * @dev: The device for which the pages were mapped
1675  * @dma_addr: Array of DMA addresses corresponding to mapped pages
1676  * @npages: Number of pages to unmap
1677  * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
1678  *
1679  * This function unmaps previously mapped pages of memory for GPU Shared Virtual
1680  * Memory (SVM). It iterates over each DMA address provided in @dma_addr, checks
1681  * if it's valid and not already unmapped, and unmaps the corresponding page.
1682  */
1683 static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
1684 					   dma_addr_t *dma_addr,
1685 					   unsigned long npages,
1686 					   enum dma_data_direction dir)
1687 {
1688 	unsigned long i;
1689 
1690 	for (i = 0; i < npages; ++i) {
1691 		if (!dma_addr[i] || dma_mapping_error(dev, dma_addr[i]))
1692 			continue;
1693 
1694 		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
1695 	}
1696 }
1697 
/**
 * drm_gpusvm_migrate_to_devmem() - Migrate GPU SVM range to device memory
 * @gpusvm: Pointer to the GPU SVM structure
 * @range: Pointer to the GPU SVM range structure
 * @devmem_allocation: Pointer to the device memory allocation. The caller
 *                     should hold a reference to the device memory allocation,
 *                     which should be dropped via ops->devmem_release or upon
 *                     the failure of this function.
 * @ctx: GPU SVM context
 *
 * This function migrates the specified GPU SVM range to device memory. It
 * performs the necessary setup and invokes the driver-specific operations for
 * migration to device memory. Upon successful return, @devmem_allocation can
 * safely reference @range until ops->devmem_release is called, which is only
 * called upon successful return. Expected to be called while holding the mmap
 * lock in read mode.
 *
 * The migration is all-or-nothing: unless every page of the range could be
 * collected for migration, the function fails with -EBUSY.
 *
 * Return: 0 on success, negative error code on failure.
 */
int drm_gpusvm_migrate_to_devmem(struct drm_gpusvm *gpusvm,
				 struct drm_gpusvm_range *range,
				 struct drm_gpusvm_devmem *devmem_allocation,
				 const struct drm_gpusvm_ctx *ctx)
{
	const struct drm_gpusvm_devmem_ops *ops = devmem_allocation->ops;
	unsigned long start = drm_gpusvm_range_start(range),
		      end = drm_gpusvm_range_end(range);
	struct migrate_vma migrate = {
		.start		= start,
		.end		= end,
		.pgmap_owner	= gpusvm->device_private_page_owner,
		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
	};
	struct mm_struct *mm = gpusvm->mm;
	unsigned long i, npages = npages_in_range(start, end);
	struct vm_area_struct *vas;
	struct drm_gpusvm_zdd *zdd = NULL;
	struct page **pages;
	dma_addr_t *dma_addr;
	void *buf;
	int err;

	mmap_assert_locked(gpusvm->mm);

	/* Range was created without device-memory migration enabled. */
	if (!range->flags.migrate_devmem)
		return -EINVAL;

	if (!ops->populate_devmem_pfn || !ops->copy_to_devmem ||
	    !ops->copy_to_ram)
		return -EOPNOTSUPP;

	vas = vma_lookup(mm, start);
	if (!vas) {
		err = -ENOENT;
		goto err_out;
	}

	/* The whole range must sit inside a single VMA. */
	if (end > vas->vm_end || start < vas->vm_start) {
		err = -EINVAL;
		goto err_out;
	}

	if (!vma_is_anonymous(vas)) {
		err = -EBUSY;
		goto err_out;
	}

	/*
	 * One zeroed allocation carved into four npages-sized arrays:
	 * [migrate.src][migrate.dst][dma_addr][pages]. Zeroing matters for the
	 * cleanup helpers, which skip zero entries.
	 */
	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
		       sizeof(*pages), GFP_KERNEL);
	if (!buf) {
		err = -ENOMEM;
		goto err_out;
	}
	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;

	zdd = drm_gpusvm_zdd_alloc(gpusvm->device_private_page_owner);
	if (!zdd) {
		err = -ENOMEM;
		goto err_free;
	}

	migrate.vma = vas;
	migrate.src = buf;
	migrate.dst = migrate.src + npages;

	err = migrate_vma_setup(&migrate);
	if (err)
		goto err_free;

	/*
	 * Nothing was collected.
	 * NOTE(review): this path skips migrate_vma_pages()/finalize();
	 * presumably fine with zero collected pages - confirm.
	 */
	if (!migrate.cpages) {
		err = -EFAULT;
		goto err_free;
	}

	/* Partial collection: give up but still finalize what was collected. */
	if (migrate.cpages != npages) {
		err = -EBUSY;
		goto err_finalize;
	}

	/* Driver fills migrate.dst with raw device PFNs. */
	err = ops->populate_devmem_pfn(devmem_allocation, npages, migrate.dst);
	if (err)
		goto err_finalize;

	err = drm_gpusvm_migrate_map_pages(devmem_allocation->dev, dma_addr,
					   migrate.src, npages, DMA_TO_DEVICE);
	if (err)
		goto err_finalize;

	/*
	 * Convert each raw destination PFN into a migrate PFN entry, remember
	 * the page for the copy, and attach the zdd to the device page.
	 */
	for (i = 0; i < npages; ++i) {
		struct page *page = pfn_to_page(migrate.dst[i]);

		pages[i] = page;
		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
		drm_gpusvm_get_devmem_page(page, zdd);
	}

	err = ops->copy_to_devmem(pages, dma_addr, npages);
	if (err)
		goto err_finalize;

	/* Upon success bind devmem allocation to range and zdd */
	devmem_allocation->timeslice_expiration = get_jiffies_64() +
		msecs_to_jiffies(ctx->timeslice_ms);
	zdd->devmem_allocation = devmem_allocation;	/* Owns ref */

	/* The success path deliberately falls through the labels below. */
err_finalize:
	/* On error, release and zero the destination entries first. */
	if (err)
		drm_gpusvm_migration_unlock_put_pages(npages, migrate.dst);
	migrate_vma_pages(&migrate);
	migrate_vma_finalize(&migrate);
	drm_gpusvm_migrate_unmap_pages(devmem_allocation->dev, dma_addr, npages,
				       DMA_TO_DEVICE);
err_free:
	/* Drop this function's own zdd reference. */
	if (zdd)
		drm_gpusvm_zdd_put(zdd);
	kvfree(buf);
err_out:
	return err;
}
EXPORT_SYMBOL_GPL(drm_gpusvm_migrate_to_devmem);
1839 
1840 /**
1841  * drm_gpusvm_migrate_populate_ram_pfn() - Populate RAM PFNs for a VM area
1842  * @vas: Pointer to the VM area structure, can be NULL
1843  * @fault_page: Fault page
1844  * @npages: Number of pages to populate
1845  * @mpages: Number of pages to migrate
1846  * @src_mpfn: Source array of migrate PFNs
1847  * @mpfn: Array of migrate PFNs to populate
1848  * @addr: Start address for PFN allocation
1849  *
1850  * This function populates the RAM migrate page frame numbers (PFNs) for the
1851  * specified VM area structure. It allocates and locks pages in the VM area for
1852  * RAM usage. If vas is non-NULL use alloc_page_vma for allocation, if NULL use
1853  * alloc_page for allocation.
1854  *
1855  * Return: 0 on success, negative error code on failure.
1856  */
1857 static int drm_gpusvm_migrate_populate_ram_pfn(struct vm_area_struct *vas,
1858 					       struct page *fault_page,
1859 					       unsigned long npages,
1860 					       unsigned long *mpages,
1861 					       unsigned long *src_mpfn,
1862 					       unsigned long *mpfn,
1863 					       unsigned long addr)
1864 {
1865 	unsigned long i;
1866 
1867 	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
1868 		struct page *page, *src_page;
1869 
1870 		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
1871 			continue;
1872 
1873 		src_page = migrate_pfn_to_page(src_mpfn[i]);
1874 		if (!src_page)
1875 			continue;
1876 
1877 		if (fault_page) {
1878 			if (src_page->zone_device_data !=
1879 			    fault_page->zone_device_data)
1880 				continue;
1881 		}
1882 
1883 		if (vas)
1884 			page = alloc_page_vma(GFP_HIGHUSER, vas, addr);
1885 		else
1886 			page = alloc_page(GFP_HIGHUSER);
1887 
1888 		if (!page)
1889 			goto free_pages;
1890 
1891 		mpfn[i] = migrate_pfn(page_to_pfn(page));
1892 	}
1893 
1894 	for (i = 0; i < npages; ++i) {
1895 		struct page *page = migrate_pfn_to_page(mpfn[i]);
1896 
1897 		if (!page)
1898 			continue;
1899 
1900 		WARN_ON_ONCE(!trylock_page(page));
1901 		++*mpages;
1902 	}
1903 
1904 	return 0;
1905 
1906 free_pages:
1907 	for (i = 0; i < npages; ++i) {
1908 		struct page *page = migrate_pfn_to_page(mpfn[i]);
1909 
1910 		if (!page)
1911 			continue;
1912 
1913 		put_page(page);
1914 		mpfn[i] = 0;
1915 	}
1916 	return -ENOMEM;
1917 }
1918 
1919 /**
1920  * drm_gpusvm_evict_to_ram() - Evict GPU SVM range to RAM
1921  * @devmem_allocation: Pointer to the device memory allocation
1922  *
1923  * Similar to __drm_gpusvm_migrate_to_ram but does not require mmap lock and
1924  * migration done via migrate_device_* functions.
1925  *
1926  * Return: 0 on success, negative error code on failure.
1927  */
1928 int drm_gpusvm_evict_to_ram(struct drm_gpusvm_devmem *devmem_allocation)
1929 {
1930 	const struct drm_gpusvm_devmem_ops *ops = devmem_allocation->ops;
1931 	unsigned long npages, mpages = 0;
1932 	struct page **pages;
1933 	unsigned long *src, *dst;
1934 	dma_addr_t *dma_addr;
1935 	void *buf;
1936 	int i, err = 0;
1937 	unsigned int retry_count = 2;
1938 
1939 	npages = devmem_allocation->size >> PAGE_SHIFT;
1940 
1941 retry:
1942 	if (!mmget_not_zero(devmem_allocation->mm))
1943 		return -EFAULT;
1944 
1945 	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
1946 		       sizeof(*pages), GFP_KERNEL);
1947 	if (!buf) {
1948 		err = -ENOMEM;
1949 		goto err_out;
1950 	}
1951 	src = buf;
1952 	dst = buf + (sizeof(*src) * npages);
1953 	dma_addr = buf + (2 * sizeof(*src) * npages);
1954 	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) * npages;
1955 
1956 	err = ops->populate_devmem_pfn(devmem_allocation, npages, src);
1957 	if (err)
1958 		goto err_free;
1959 
1960 	err = migrate_device_pfns(src, npages);
1961 	if (err)
1962 		goto err_free;
1963 
1964 	err = drm_gpusvm_migrate_populate_ram_pfn(NULL, NULL, npages, &mpages,
1965 						  src, dst, 0);
1966 	if (err || !mpages)
1967 		goto err_finalize;
1968 
1969 	err = drm_gpusvm_migrate_map_pages(devmem_allocation->dev, dma_addr,
1970 					   dst, npages, DMA_FROM_DEVICE);
1971 	if (err)
1972 		goto err_finalize;
1973 
1974 	for (i = 0; i < npages; ++i)
1975 		pages[i] = migrate_pfn_to_page(src[i]);
1976 
1977 	err = ops->copy_to_ram(pages, dma_addr, npages);
1978 	if (err)
1979 		goto err_finalize;
1980 
1981 err_finalize:
1982 	if (err)
1983 		drm_gpusvm_migration_unlock_put_pages(npages, dst);
1984 	migrate_device_pages(src, dst, npages);
1985 	migrate_device_finalize(src, dst, npages);
1986 	drm_gpusvm_migrate_unmap_pages(devmem_allocation->dev, dma_addr, npages,
1987 				       DMA_FROM_DEVICE);
1988 err_free:
1989 	kvfree(buf);
1990 err_out:
1991 	mmput_async(devmem_allocation->mm);
1992 
1993 	if (completion_done(&devmem_allocation->detached))
1994 		return 0;
1995 
1996 	if (retry_count--) {
1997 		cond_resched();
1998 		goto retry;
1999 	}
2000 
2001 	return err ?: -EBUSY;
2002 }
2003 EXPORT_SYMBOL_GPL(drm_gpusvm_evict_to_ram);
2004 
2005 /**
2006  * __drm_gpusvm_migrate_to_ram() - Migrate GPU SVM range to RAM (internal)
2007  * @vas: Pointer to the VM area structure
2008  * @device_private_page_owner: Device private pages owner
2009  * @page: Pointer to the page for fault handling (can be NULL)
2010  * @fault_addr: Fault address
2011  * @size: Size of migration
2012  *
2013  * This internal function performs the migration of the specified GPU SVM range
2014  * to RAM. It sets up the migration, populates + dma maps RAM PFNs, and
2015  * invokes the driver-specific operations for migration to RAM.
2016  *
2017  * Return: 0 on success, negative error code on failure.
2018  */
2019 static int __drm_gpusvm_migrate_to_ram(struct vm_area_struct *vas,
2020 				       void *device_private_page_owner,
2021 				       struct page *page,
2022 				       unsigned long fault_addr,
2023 				       unsigned long size)
2024 {
2025 	struct migrate_vma migrate = {
2026 		.vma		= vas,
2027 		.pgmap_owner	= device_private_page_owner,
2028 		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE |
2029 			MIGRATE_VMA_SELECT_DEVICE_COHERENT,
2030 		.fault_page	= page,
2031 	};
2032 	struct drm_gpusvm_zdd *zdd;
2033 	const struct drm_gpusvm_devmem_ops *ops;
2034 	struct device *dev = NULL;
2035 	unsigned long npages, mpages = 0;
2036 	struct page **pages;
2037 	dma_addr_t *dma_addr;
2038 	unsigned long start, end;
2039 	void *buf;
2040 	int i, err = 0;
2041 
2042 	if (page) {
2043 		zdd = page->zone_device_data;
2044 		if (time_before64(get_jiffies_64(),
2045 				  zdd->devmem_allocation->timeslice_expiration))
2046 			return 0;
2047 	}
2048 
2049 	start = ALIGN_DOWN(fault_addr, size);
2050 	end = ALIGN(fault_addr + 1, size);
2051 
2052 	/* Corner where VMA area struct has been partially unmapped */
2053 	if (start < vas->vm_start)
2054 		start = vas->vm_start;
2055 	if (end > vas->vm_end)
2056 		end = vas->vm_end;
2057 
2058 	migrate.start = start;
2059 	migrate.end = end;
2060 	npages = npages_in_range(start, end);
2061 
2062 	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
2063 		       sizeof(*pages), GFP_KERNEL);
2064 	if (!buf) {
2065 		err = -ENOMEM;
2066 		goto err_out;
2067 	}
2068 	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
2069 	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
2070 
2071 	migrate.vma = vas;
2072 	migrate.src = buf;
2073 	migrate.dst = migrate.src + npages;
2074 
2075 	err = migrate_vma_setup(&migrate);
2076 	if (err)
2077 		goto err_free;
2078 
2079 	/* Raced with another CPU fault, nothing to do */
2080 	if (!migrate.cpages)
2081 		goto err_free;
2082 
2083 	if (!page) {
2084 		for (i = 0; i < npages; ++i) {
2085 			if (!(migrate.src[i] & MIGRATE_PFN_MIGRATE))
2086 				continue;
2087 
2088 			page = migrate_pfn_to_page(migrate.src[i]);
2089 			break;
2090 		}
2091 
2092 		if (!page)
2093 			goto err_finalize;
2094 	}
2095 	zdd = page->zone_device_data;
2096 	ops = zdd->devmem_allocation->ops;
2097 	dev = zdd->devmem_allocation->dev;
2098 
2099 	err = drm_gpusvm_migrate_populate_ram_pfn(vas, page, npages, &mpages,
2100 						  migrate.src, migrate.dst,
2101 						  start);
2102 	if (err)
2103 		goto err_finalize;
2104 
2105 	err = drm_gpusvm_migrate_map_pages(dev, dma_addr, migrate.dst, npages,
2106 					   DMA_FROM_DEVICE);
2107 	if (err)
2108 		goto err_finalize;
2109 
2110 	for (i = 0; i < npages; ++i)
2111 		pages[i] = migrate_pfn_to_page(migrate.src[i]);
2112 
2113 	err = ops->copy_to_ram(pages, dma_addr, npages);
2114 	if (err)
2115 		goto err_finalize;
2116 
2117 err_finalize:
2118 	if (err)
2119 		drm_gpusvm_migration_unlock_put_pages(npages, migrate.dst);
2120 	migrate_vma_pages(&migrate);
2121 	migrate_vma_finalize(&migrate);
2122 	if (dev)
2123 		drm_gpusvm_migrate_unmap_pages(dev, dma_addr, npages,
2124 					       DMA_FROM_DEVICE);
2125 err_free:
2126 	kvfree(buf);
2127 err_out:
2128 
2129 	return err;
2130 }
2131 
2132 /**
2133  * drm_gpusvm_range_evict - Evict GPU SVM range
2134  * @range: Pointer to the GPU SVM range to be removed
2135  *
2136  * This function evicts the specified GPU SVM range. This function will not
2137  * evict coherent pages.
2138  *
2139  * Return: 0 on success, a negative error code on failure.
2140  */
2141 int drm_gpusvm_range_evict(struct drm_gpusvm *gpusvm,
2142 			   struct drm_gpusvm_range *range)
2143 {
2144 	struct mmu_interval_notifier *notifier = &range->notifier->notifier;
2145 	struct hmm_range hmm_range = {
2146 		.default_flags = HMM_PFN_REQ_FAULT,
2147 		.notifier = notifier,
2148 		.start = drm_gpusvm_range_start(range),
2149 		.end = drm_gpusvm_range_end(range),
2150 		.dev_private_owner = NULL,
2151 	};
2152 	unsigned long timeout =
2153 		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
2154 	unsigned long *pfns;
2155 	unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
2156 					       drm_gpusvm_range_end(range));
2157 	int err = 0;
2158 	struct mm_struct *mm = gpusvm->mm;
2159 
2160 	if (!mmget_not_zero(mm))
2161 		return -EFAULT;
2162 
2163 	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
2164 	if (!pfns)
2165 		return -ENOMEM;
2166 
2167 	hmm_range.hmm_pfns = pfns;
2168 	while (!time_after(jiffies, timeout)) {
2169 		hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
2170 		if (time_after(jiffies, timeout)) {
2171 			err = -ETIME;
2172 			break;
2173 		}
2174 
2175 		mmap_read_lock(mm);
2176 		err = hmm_range_fault(&hmm_range);
2177 		mmap_read_unlock(mm);
2178 		if (err != -EBUSY)
2179 			break;
2180 	}
2181 
2182 	kvfree(pfns);
2183 	mmput(mm);
2184 
2185 	return err;
2186 }
2187 EXPORT_SYMBOL_GPL(drm_gpusvm_range_evict);
2188 
2189 /**
2190  * drm_gpusvm_page_free() - Put GPU SVM zone device data associated with a page
2191  * @page: Pointer to the page
2192  *
2193  * This function is a callback used to put the GPU SVM zone device data
2194  * associated with a page when it is being released.
2195  */
2196 static void drm_gpusvm_page_free(struct page *page)
2197 {
2198 	drm_gpusvm_zdd_put(page->zone_device_data);
2199 }
2200 
2201 /**
2202  * drm_gpusvm_migrate_to_ram() - Migrate GPU SVM range to RAM (page fault handler)
2203  * @vmf: Pointer to the fault information structure
2204  *
2205  * This function is a page fault handler used to migrate a GPU SVM range to RAM.
2206  * It retrieves the GPU SVM range information from the faulting page and invokes
2207  * the internal migration function to migrate the range back to RAM.
2208  *
2209  * Return: VM_FAULT_SIGBUS on failure, 0 on success.
2210  */
2211 static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
2212 {
2213 	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
2214 	int err;
2215 
2216 	err = __drm_gpusvm_migrate_to_ram(vmf->vma,
2217 					  zdd->device_private_page_owner,
2218 					  vmf->page, vmf->address,
2219 					  zdd->devmem_allocation->size);
2220 
2221 	return err ? VM_FAULT_SIGBUS : 0;
2222 }
2223 
2224 /*
2225  * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
2226  */
2227 static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
2228 	.page_free = drm_gpusvm_page_free,
2229 	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
2230 };
2231 
2232 /**
2233  * drm_gpusvm_pagemap_ops_get() - Retrieve GPU SVM device page map operations
2234  *
2235  * Return: Pointer to the GPU SVM device page map operations structure.
2236  */
2237 const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
2238 {
2239 	return &drm_gpusvm_pagemap_ops;
2240 }
2241 EXPORT_SYMBOL_GPL(drm_gpusvm_pagemap_ops_get);
2242 
2243 /**
2244  * drm_gpusvm_has_mapping() - Check if GPU SVM has mapping for the given address range
2245  * @gpusvm: Pointer to the GPU SVM structure.
2246  * @start: Start address
2247  * @end: End address
2248  *
2249  * Return: True if GPU SVM has mapping, False otherwise
2250  */
2251 bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, unsigned long start,
2252 			    unsigned long end)
2253 {
2254 	struct drm_gpusvm_notifier *notifier;
2255 
2256 	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
2257 		struct drm_gpusvm_range *range = NULL;
2258 
2259 		drm_gpusvm_for_each_range(range, notifier, start, end)
2260 			return true;
2261 	}
2262 
2263 	return false;
2264 }
2265 EXPORT_SYMBOL_GPL(drm_gpusvm_has_mapping);
2266 
2267 /**
2268  * drm_gpusvm_range_set_unmapped() - Mark a GPU SVM range as unmapped
2269  * @range: Pointer to the GPU SVM range structure.
2270  * @mmu_range: Pointer to the MMU notifier range structure.
2271  *
2272  * This function marks a GPU SVM range as unmapped and sets the partial_unmap flag
2273  * if the range partially falls within the provided MMU notifier range.
2274  */
2275 void drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
2276 				   const struct mmu_notifier_range *mmu_range)
2277 {
2278 	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
2279 
2280 	range->flags.unmapped = true;
2281 	if (drm_gpusvm_range_start(range) < mmu_range->start ||
2282 	    drm_gpusvm_range_end(range) > mmu_range->end)
2283 		range->flags.partial_unmap = true;
2284 }
2285 EXPORT_SYMBOL_GPL(drm_gpusvm_range_set_unmapped);
2286 
2287 /**
2288  * drm_gpusvm_devmem_init() - Initialize a GPU SVM device memory allocation
2289  *
2290  * @dev: Pointer to the device structure which device memory allocation belongs to
2291  * @mm: Pointer to the mm_struct for the address space
2292  * @ops: Pointer to the operations structure for GPU SVM device memory
2293  * @dpagemap: The struct drm_pagemap we're allocating from.
2294  * @size: Size of device memory allocation
2295  */
2296 void drm_gpusvm_devmem_init(struct drm_gpusvm_devmem *devmem_allocation,
2297 			    struct device *dev, struct mm_struct *mm,
2298 			    const struct drm_gpusvm_devmem_ops *ops,
2299 			    struct drm_pagemap *dpagemap, size_t size)
2300 {
2301 	init_completion(&devmem_allocation->detached);
2302 	devmem_allocation->dev = dev;
2303 	devmem_allocation->mm = mm;
2304 	devmem_allocation->ops = ops;
2305 	devmem_allocation->dpagemap = dpagemap;
2306 	devmem_allocation->size = size;
2307 }
2308 EXPORT_SYMBOL_GPL(drm_gpusvm_devmem_init);
2309 
/*
 * Module metadata.
 * NOTE(review): the SPDX tag at the top of this file is
 * "GPL-2.0-only OR MIT" while the module license string below is "GPL";
 * confirm that this difference is intended.
 */
MODULE_DESCRIPTION("DRM GPUSVM");
MODULE_LICENSE("GPL");
2312