xref: /linux/drivers/gpu/drm/drm_gpusvm.c (revision 3972872e459d812ab5e481a231a6066cf4f4d0f4)
1 // SPDX-License-Identifier: GPL-2.0-only OR MIT
2 /*
3  * Copyright © 2024 Intel Corporation
4  *
5  * Authors:
6  *     Matthew Brost <matthew.brost@intel.com>
7  */
8 
9 #include <linux/dma-mapping.h>
10 #include <linux/hmm.h>
11 #include <linux/memremap.h>
12 #include <linux/migrate.h>
13 #include <linux/mm_types.h>
14 #include <linux/pagemap.h>
15 #include <linux/slab.h>
16 
17 #include <drm/drm_device.h>
18 #include <drm/drm_gpusvm.h>
19 #include <drm/drm_pagemap.h>
20 #include <drm/drm_print.h>
21 
22 /**
23  * DOC: Overview
24  *
25  * GPU Shared Virtual Memory (GPU SVM) layer for the Direct Rendering Manager (DRM)
26  * is a component of the DRM framework designed to manage shared virtual memory
27  * between the CPU and GPU. It enables efficient data exchange and processing
28  * for GPU-accelerated applications by allowing memory sharing and
29  * synchronization between the CPU's and GPU's virtual address spaces.
30  *
31  * Key GPU SVM Components:
32  *
33  * - Notifiers:
34  *	Used for tracking memory intervals and notifying the GPU of changes,
35  *	notifiers are sized based on a GPU SVM initialization parameter, with a
 *	recommendation of 512M or larger. They maintain a Red-Black tree and a
 *	list of ranges that fall within the notifier interval.  Notifiers are
 *	tracked within a GPU SVM Red-Black tree and list and are dynamically
39  *	inserted or removed as ranges within the interval are created or
40  *	destroyed.
41  * - Ranges:
42  *	Represent memory ranges mapped in a DRM device and managed by GPU SVM.
43  *	They are sized based on an array of chunk sizes, which is a GPU SVM
44  *	initialization parameter, and the CPU address space.  Upon GPU fault,
45  *	the largest aligned chunk that fits within the faulting CPU address
46  *	space is chosen for the range size. Ranges are expected to be
47  *	dynamically allocated on GPU fault and removed on an MMU notifier UNMAP
48  *	event. As mentioned above, ranges are tracked in a notifier's Red-Black
49  *	tree.
50  *
51  * - Operations:
52  *	Define the interface for driver-specific GPU SVM operations such as
53  *	range allocation, notifier allocation, and invalidations.
54  *
55  * - Device Memory Allocations:
56  *	Embedded structure containing enough information for GPU SVM to migrate
57  *	to / from device memory.
58  *
59  * - Device Memory Operations:
 *	Define the interface for driver-specific device memory operations such
 *	as release memory, populate pfns, and copy to / from device memory.
62  *
63  * This layer provides interfaces for allocating, mapping, migrating, and
64  * releasing memory ranges between the CPU and GPU. It handles all core memory
65  * management interactions (DMA mapping, HMM, and migration) and provides
66  * driver-specific virtual functions (vfuncs). This infrastructure is sufficient
67  * to build the expected driver components for an SVM implementation as detailed
68  * below.
69  *
70  * Expected Driver Components:
71  *
72  * - GPU page fault handler:
73  *	Used to create ranges and notifiers based on the fault address,
74  *	optionally migrate the range to device memory, and create GPU bindings.
75  *
76  * - Garbage collector:
77  *	Used to unmap and destroy GPU bindings for ranges.  Ranges are expected
78  *	to be added to the garbage collector upon a MMU_NOTIFY_UNMAP event in
79  *	notifier callback.
80  *
81  * - Notifier callback:
82  *	Used to invalidate and DMA unmap GPU bindings for ranges.
83  */
84 
85 /**
86  * DOC: Locking
87  *
88  * GPU SVM handles locking for core MM interactions, i.e., it locks/unlocks the
89  * mmap lock as needed.
90  *
91  * GPU SVM introduces a global notifier lock, which safeguards the notifier's
92  * range RB tree and list, as well as the range's DMA mappings and sequence
93  * number. GPU SVM manages all necessary locking and unlocking operations,
94  * except for the recheck range's pages being valid
95  * (drm_gpusvm_range_pages_valid) when the driver is committing GPU bindings.
96  * This lock corresponds to the ``driver->update`` lock mentioned in
97  * Documentation/mm/hmm.rst. Future revisions may transition from a GPU SVM
98  * global lock to a per-notifier lock if finer-grained locking is deemed
99  * necessary.
100  *
101  * In addition to the locking mentioned above, the driver should implement a
102  * lock to safeguard core GPU SVM function calls that modify state, such as
103  * drm_gpusvm_range_find_or_insert and drm_gpusvm_range_remove. This lock is
104  * denoted as 'driver_svm_lock' in code examples. Finer grained driver side
105  * locking should also be possible for concurrent GPU fault processing within a
 * single GPU SVM. The 'driver_svm_lock' can be set via
 * drm_gpusvm_driver_set_lock to add annotations to GPU SVM.
108  */
109 
110 /**
111  * DOC: Migration
112  *
113  * The migration support is quite simple, allowing migration between RAM and
114  * device memory at the range granularity. For example, GPU SVM currently does
115  * not support mixing RAM and device memory pages within a range. This means
116  * that upon GPU fault, the entire range can be migrated to device memory, and
117  * upon CPU fault, the entire range is migrated to RAM. Mixed RAM and device
118  * memory storage within a range could be added in the future if required.
119  *
120  * The reasoning for only supporting range granularity is as follows: it
121  * simplifies the implementation, and range sizes are driver-defined and should
122  * be relatively small.
123  */
124 
125 /**
126  * DOC: Partial Unmapping of Ranges
127  *
128  * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped by CPU resulting
129  * in MMU_NOTIFY_UNMAP event) presents several challenges, with the main one
130  * being that a subset of the range still has CPU and GPU mappings. If the
131  * backing store for the range is in device memory, a subset of the backing
132  * store has references. One option would be to split the range and device
133  * memory backing store, but the implementation for this would be quite
134  * complicated. Given that partial unmappings are rare and driver-defined range
135  * sizes are relatively small, GPU SVM does not support splitting of ranges.
136  *
137  * With no support for range splitting, upon partial unmapping of a range, the
138  * driver is expected to invalidate and destroy the entire range. If the range
139  * has device memory as its backing, the driver is also expected to migrate any
140  * remaining pages back to RAM.
141  */
142 
143 /**
144  * DOC: Examples
145  *
146  * This section provides three examples of how to build the expected driver
147  * components: the GPU page fault handler, the garbage collector, and the
148  * notifier callback.
149  *
150  * The generic code provided does not include logic for complex migration
 * policies, optimized invalidations, fine-grained driver locking, or other
152  * potentially required driver locking (e.g., DMA-resv locks).
153  *
154  * 1) GPU page fault handler
155  *
156  * .. code-block:: c
157  *
158  *	int driver_bind_range(struct drm_gpusvm *gpusvm, struct drm_gpusvm_range *range)
159  *	{
160  *		int err = 0;
161  *
162  *		driver_alloc_and_setup_memory_for_bind(gpusvm, range);
163  *
164  *		drm_gpusvm_notifier_lock(gpusvm);
165  *		if (drm_gpusvm_range_pages_valid(range))
166  *			driver_commit_bind(gpusvm, range);
167  *		else
168  *			err = -EAGAIN;
169  *		drm_gpusvm_notifier_unlock(gpusvm);
170  *
171  *		return err;
172  *	}
173  *
174  *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, unsigned long fault_addr,
175  *			     unsigned long gpuva_start, unsigned long gpuva_end)
176  *	{
177  *		struct drm_gpusvm_ctx ctx = {};
178  *		int err;
179  *
180  *		driver_svm_lock();
181  *	retry:
182  *		// Always process UNMAPs first so view of GPU SVM ranges is current
183  *		driver_garbage_collector(gpusvm);
184  *
185  *		range = drm_gpusvm_range_find_or_insert(gpusvm, fault_addr,
186  *							gpuva_start, gpuva_end,
187  *						        &ctx);
188  *		if (IS_ERR(range)) {
189  *			err = PTR_ERR(range);
190  *			goto unlock;
191  *		}
192  *
193  *		if (driver_migration_policy(range)) {
194  *			mmap_read_lock(mm);
 *			devmem_allocation = driver_alloc_devmem();
196  *			err = drm_gpusvm_migrate_to_devmem(gpusvm, range,
197  *							   devmem_allocation,
198  *							   &ctx);
199  *			mmap_read_unlock(mm);
200  *			if (err)	// CPU mappings may have changed
201  *				goto retry;
202  *		}
203  *
204  *		err = drm_gpusvm_range_get_pages(gpusvm, range, &ctx);
205  *		if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM) {	// CPU mappings changed
206  *			if (err == -EOPNOTSUPP)
207  *				drm_gpusvm_range_evict(gpusvm, range);
208  *			goto retry;
209  *		} else if (err) {
210  *			goto unlock;
211  *		}
212  *
213  *		err = driver_bind_range(gpusvm, range);
214  *		if (err == -EAGAIN)	// CPU mappings changed
 *			goto retry;
216  *
217  *	unlock:
218  *		driver_svm_unlock();
219  *		return err;
220  *	}
221  *
222  * 2) Garbage Collector
223  *
224  * .. code-block:: c
225  *
226  *	void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
227  *					struct drm_gpusvm_range *range)
228  *	{
229  *		assert_driver_svm_locked(gpusvm);
230  *
231  *		// Partial unmap, migrate any remaining device memory pages back to RAM
232  *		if (range->flags.partial_unmap)
233  *			drm_gpusvm_range_evict(gpusvm, range);
234  *
235  *		driver_unbind_range(range);
236  *		drm_gpusvm_range_remove(gpusvm, range);
237  *	}
238  *
239  *	void driver_garbage_collector(struct drm_gpusvm *gpusvm)
240  *	{
241  *		assert_driver_svm_locked(gpusvm);
242  *
243  *		for_each_range_in_garbage_collector(gpusvm, range)
244  *			__driver_garbage_collector(gpusvm, range);
245  *	}
246  *
247  * 3) Notifier callback
248  *
249  * .. code-block:: c
250  *
251  *	void driver_invalidation(struct drm_gpusvm *gpusvm,
252  *				 struct drm_gpusvm_notifier *notifier,
253  *				 const struct mmu_notifier_range *mmu_range)
254  *	{
255  *		struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
256  *		struct drm_gpusvm_range *range = NULL;
257  *
258  *		driver_invalidate_device_pages(gpusvm, mmu_range->start, mmu_range->end);
259  *
260  *		drm_gpusvm_for_each_range(range, notifier, mmu_range->start,
261  *					  mmu_range->end) {
262  *			drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);
263  *
264  *			if (mmu_range->event != MMU_NOTIFY_UNMAP)
265  *				continue;
266  *
267  *			drm_gpusvm_range_set_unmapped(range, mmu_range);
268  *			driver_garbage_collector_add(gpusvm, range);
269  *		}
270  *	}
271  */
272 
273 /**
274  * npages_in_range() - Calculate the number of pages in a given range
275  * @start: The start address of the range
276  * @end: The end address of the range
277  *
278  * This macro calculates the number of pages in a given memory range,
279  * specified by the start and end addresses. It divides the difference
280  * between the end and start addresses by the page size (PAGE_SIZE) to
281  * determine the number of pages in the range.
282  *
283  * Return: The number of pages in the specified range.
284  */
285 static unsigned long
286 npages_in_range(unsigned long start, unsigned long end)
287 {
288 	return (end - start) >> PAGE_SHIFT;
289 }
290 
/**
 * struct drm_gpusvm_zdd - GPU SVM zone device data
 *
 * @refcount: Reference count for the zdd; drm_gpusvm_zdd_destroy() runs when
 *            the last reference is put
 * @devmem_allocation: device memory allocation; set to NULL by
 *                     drm_gpusvm_zdd_alloc()
 * @device_private_page_owner: Device private pages owner
 *
 * This structure serves as a generic wrapper installed in
 * page->zone_device_data. It provides infrastructure for looking up a device
 * memory allocation upon CPU page fault and for releasing device memory once
 * the last reference to the zdd is dropped.
 *
 * NOTE(review): drm_gpusvm_zdd_destroy() calls the devmem_release op directly
 * from the final put. Earlier wording described the release as asynchronous
 * because page references can be dropped in IRQ context; presumably any
 * deferral now has to happen inside the driver's devmem_release op - confirm.
 */
struct drm_gpusvm_zdd {
	struct kref refcount;
	struct drm_gpusvm_devmem *devmem_allocation;
	void *device_private_page_owner;
};
310 
311 /**
312  * drm_gpusvm_zdd_alloc() - Allocate a zdd structure.
313  * @device_private_page_owner: Device private pages owner
314  *
315  * This function allocates and initializes a new zdd structure. It sets up the
316  * reference count and initializes the destroy work.
317  *
318  * Return: Pointer to the allocated zdd on success, ERR_PTR() on failure.
319  */
320 static struct drm_gpusvm_zdd *
321 drm_gpusvm_zdd_alloc(void *device_private_page_owner)
322 {
323 	struct drm_gpusvm_zdd *zdd;
324 
325 	zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
326 	if (!zdd)
327 		return NULL;
328 
329 	kref_init(&zdd->refcount);
330 	zdd->devmem_allocation = NULL;
331 	zdd->device_private_page_owner = device_private_page_owner;
332 
333 	return zdd;
334 }
335 
336 /**
337  * drm_gpusvm_zdd_get() - Get a reference to a zdd structure.
338  * @zdd: Pointer to the zdd structure.
339  *
340  * This function increments the reference count of the provided zdd structure.
341  *
342  * Return: Pointer to the zdd structure.
343  */
344 static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct drm_gpusvm_zdd *zdd)
345 {
346 	kref_get(&zdd->refcount);
347 	return zdd;
348 }
349 
350 /**
351  * drm_gpusvm_zdd_destroy() - Destroy a zdd structure.
352  * @ref: Pointer to the reference count structure.
353  *
354  * This function queues the destroy_work of the zdd for asynchronous destruction.
355  */
356 static void drm_gpusvm_zdd_destroy(struct kref *ref)
357 {
358 	struct drm_gpusvm_zdd *zdd =
359 		container_of(ref, struct drm_gpusvm_zdd, refcount);
360 	struct drm_gpusvm_devmem *devmem = zdd->devmem_allocation;
361 
362 	if (devmem) {
363 		complete_all(&devmem->detached);
364 		if (devmem->ops->devmem_release)
365 			devmem->ops->devmem_release(devmem);
366 	}
367 	kfree(zdd);
368 }
369 
370 /**
371  * drm_gpusvm_zdd_put() - Put a zdd reference.
372  * @zdd: Pointer to the zdd structure.
373  *
374  * This function decrements the reference count of the provided zdd structure
375  * and schedules its destruction if the count drops to zero.
376  */
377 static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
378 {
379 	kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
380 }
381 
382 /**
383  * drm_gpusvm_range_find() - Find GPU SVM range from GPU SVM notifier
384  * @notifier: Pointer to the GPU SVM notifier structure.
385  * @start: Start address of the range
386  * @end: End address of the range
387  *
388  * Return: A pointer to the drm_gpusvm_range if found or NULL
389  */
390 struct drm_gpusvm_range *
391 drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, unsigned long start,
392 		      unsigned long end)
393 {
394 	struct interval_tree_node *itree;
395 
396 	itree = interval_tree_iter_first(&notifier->root, start, end - 1);
397 
398 	if (itree)
399 		return container_of(itree, struct drm_gpusvm_range, itree);
400 	else
401 		return NULL;
402 }
403 EXPORT_SYMBOL_GPL(drm_gpusvm_range_find);
404 
/**
 * drm_gpusvm_for_each_range_safe() - Safely iterate over GPU SVM ranges in a notifier
 * @range__: Iterator variable for the ranges
 * @next__: Iterator variable for the ranges temporary storage
 * @notifier__: Pointer to the GPU SVM notifier
 * @start__: Start address of the range
 * @end__: End address of the range
 *
 * This macro is used to iterate over GPU SVM ranges in a notifier while
 * removing ranges from it. The successor of the current range is stored in
 * @next__ before the loop body runs, so the body may remove @range__.
 * Iteration begins at the range returned by drm_gpusvm_range_find() and stops
 * at the first range whose start is not below @end__.
 */
#define drm_gpusvm_for_each_range_safe(range__, next__, notifier__, start__, end__)	\
	for ((range__) = drm_gpusvm_range_find((notifier__), (start__), (end__)),	\
	     (next__) = __drm_gpusvm_range_next(range__);				\
	     (range__) && (drm_gpusvm_range_start(range__) < (end__));			\
	     (range__) = (next__), (next__) = __drm_gpusvm_range_next(range__))
421 
422 /**
423  * __drm_gpusvm_notifier_next() - get the next drm_gpusvm_notifier in the list
424  * @notifier: a pointer to the current drm_gpusvm_notifier
425  *
426  * Return: A pointer to the next drm_gpusvm_notifier if available, or NULL if
427  *         the current notifier is the last one or if the input notifier is
428  *         NULL.
429  */
430 static struct drm_gpusvm_notifier *
431 __drm_gpusvm_notifier_next(struct drm_gpusvm_notifier *notifier)
432 {
433 	if (notifier && !list_is_last(&notifier->entry,
434 				      &notifier->gpusvm->notifier_list))
435 		return list_next_entry(notifier, entry);
436 
437 	return NULL;
438 }
439 
440 static struct drm_gpusvm_notifier *
441 notifier_iter_first(struct rb_root_cached *root, unsigned long start,
442 		    unsigned long last)
443 {
444 	struct interval_tree_node *itree;
445 
446 	itree = interval_tree_iter_first(root, start, last);
447 
448 	if (itree)
449 		return container_of(itree, struct drm_gpusvm_notifier, itree);
450 	else
451 		return NULL;
452 }
453 
/**
 * drm_gpusvm_for_each_notifier() - Iterate over GPU SVM notifiers in a gpusvm
 * @notifier__: Iterator variable for the notifiers
 * @gpusvm__: Pointer to the GPU SVM structure
 * @start__: Start address of the notifier
 * @end__: End address of the notifier
 *
 * This macro is used to iterate over GPU SVM notifiers in a gpusvm. The first
 * notifier is looked up in the gpusvm's interval tree using (@end__ - 1) as
 * the last address; subsequent notifiers are taken from the notifier list
 * until one starts at or beyond @end__.
 */
#define drm_gpusvm_for_each_notifier(notifier__, gpusvm__, start__, end__)		\
	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root, (start__), (end__) - 1);	\
	     (notifier__) && (drm_gpusvm_notifier_start(notifier__) < (end__));		\
	     (notifier__) = __drm_gpusvm_notifier_next(notifier__))
467 
/**
 * drm_gpusvm_for_each_notifier_safe() - Safely iterate over GPU SVM notifiers in a gpusvm
 * @notifier__: Iterator variable for the notifiers
 * @next__: Iterator variable for the notifiers temporary storage
 * @gpusvm__: Pointer to the GPU SVM structure
 * @start__: Start address of the notifier
 * @end__: End address of the notifier
 *
 * This macro is used to iterate over GPU SVM notifiers in a gpusvm while
 * removing notifiers from it. The successor of the current notifier is stored
 * in @next__ before the loop body runs, so the body may remove @notifier__.
 */
#define drm_gpusvm_for_each_notifier_safe(notifier__, next__, gpusvm__, start__, end__)	\
	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root, (start__), (end__) - 1),	\
	     (next__) = __drm_gpusvm_notifier_next(notifier__);				\
	     (notifier__) && (drm_gpusvm_notifier_start(notifier__) < (end__));		\
	     (notifier__) = (next__), (next__) = __drm_gpusvm_notifier_next(notifier__))
484 
485 /**
486  * drm_gpusvm_notifier_invalidate() - Invalidate a GPU SVM notifier.
487  * @mni: Pointer to the mmu_interval_notifier structure.
488  * @mmu_range: Pointer to the mmu_notifier_range structure.
489  * @cur_seq: Current sequence number.
490  *
491  * This function serves as a generic MMU notifier for GPU SVM. It sets the MMU
492  * notifier sequence number and calls the driver invalidate vfunc under
493  * gpusvm->notifier_lock.
494  *
495  * Return: true if the operation succeeds, false otherwise.
496  */
497 static bool
498 drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier *mni,
499 			       const struct mmu_notifier_range *mmu_range,
500 			       unsigned long cur_seq)
501 {
502 	struct drm_gpusvm_notifier *notifier =
503 		container_of(mni, typeof(*notifier), notifier);
504 	struct drm_gpusvm *gpusvm = notifier->gpusvm;
505 
506 	if (!mmu_notifier_range_blockable(mmu_range))
507 		return false;
508 
509 	down_write(&gpusvm->notifier_lock);
510 	mmu_interval_set_seq(mni, cur_seq);
511 	gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
512 	up_write(&gpusvm->notifier_lock);
513 
514 	return true;
515 }
516 
/*
 * drm_gpusvm_notifier_ops - MMU interval notifier operations for GPU SVM
 *
 * Only the invalidate callback is provided; it is routed to
 * drm_gpusvm_notifier_invalidate().
 */
static const struct mmu_interval_notifier_ops drm_gpusvm_notifier_ops = {
	.invalidate = drm_gpusvm_notifier_invalidate,
};
523 
524 /**
525  * drm_gpusvm_init() - Initialize the GPU SVM.
526  * @gpusvm: Pointer to the GPU SVM structure.
527  * @name: Name of the GPU SVM.
528  * @drm: Pointer to the DRM device structure.
529  * @mm: Pointer to the mm_struct for the address space.
530  * @device_private_page_owner: Device private pages owner.
531  * @mm_start: Start address of GPU SVM.
532  * @mm_range: Range of the GPU SVM.
533  * @notifier_size: Size of individual notifiers.
534  * @ops: Pointer to the operations structure for GPU SVM.
535  * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
536  *               Entries should be powers of 2 in descending order with last
537  *               entry being SZ_4K.
538  * @num_chunks: Number of chunks.
539  *
540  * This function initializes the GPU SVM.
541  *
542  * Return: 0 on success, a negative error code on failure.
543  */
544 int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
545 		    const char *name, struct drm_device *drm,
546 		    struct mm_struct *mm, void *device_private_page_owner,
547 		    unsigned long mm_start, unsigned long mm_range,
548 		    unsigned long notifier_size,
549 		    const struct drm_gpusvm_ops *ops,
550 		    const unsigned long *chunk_sizes, int num_chunks)
551 {
552 	if (!ops->invalidate || !num_chunks)
553 		return -EINVAL;
554 
555 	gpusvm->name = name;
556 	gpusvm->drm = drm;
557 	gpusvm->mm = mm;
558 	gpusvm->device_private_page_owner = device_private_page_owner;
559 	gpusvm->mm_start = mm_start;
560 	gpusvm->mm_range = mm_range;
561 	gpusvm->notifier_size = notifier_size;
562 	gpusvm->ops = ops;
563 	gpusvm->chunk_sizes = chunk_sizes;
564 	gpusvm->num_chunks = num_chunks;
565 
566 	mmgrab(mm);
567 	gpusvm->root = RB_ROOT_CACHED;
568 	INIT_LIST_HEAD(&gpusvm->notifier_list);
569 
570 	init_rwsem(&gpusvm->notifier_lock);
571 
572 	fs_reclaim_acquire(GFP_KERNEL);
573 	might_lock(&gpusvm->notifier_lock);
574 	fs_reclaim_release(GFP_KERNEL);
575 
576 #ifdef CONFIG_LOCKDEP
577 	gpusvm->lock_dep_map = NULL;
578 #endif
579 
580 	return 0;
581 }
582 EXPORT_SYMBOL_GPL(drm_gpusvm_init);
583 
584 /**
585  * drm_gpusvm_notifier_find() - Find GPU SVM notifier
586  * @gpusvm: Pointer to the GPU SVM structure
587  * @fault_addr: Fault address
588  *
589  * This function finds the GPU SVM notifier associated with the fault address.
590  *
591  * Return: Pointer to the GPU SVM notifier on success, NULL otherwise.
592  */
593 static struct drm_gpusvm_notifier *
594 drm_gpusvm_notifier_find(struct drm_gpusvm *gpusvm,
595 			 unsigned long fault_addr)
596 {
597 	return notifier_iter_first(&gpusvm->root, fault_addr, fault_addr + 1);
598 }
599 
600 /**
601  * to_drm_gpusvm_notifier() - retrieve the container struct for a given rbtree node
602  * @node: a pointer to the rbtree node embedded within a drm_gpusvm_notifier struct
603  *
604  * Return: A pointer to the containing drm_gpusvm_notifier structure.
605  */
606 static struct drm_gpusvm_notifier *to_drm_gpusvm_notifier(struct rb_node *node)
607 {
608 	return container_of(node, struct drm_gpusvm_notifier, itree.rb);
609 }
610 
611 /**
612  * drm_gpusvm_notifier_insert() - Insert GPU SVM notifier
613  * @gpusvm: Pointer to the GPU SVM structure
614  * @notifier: Pointer to the GPU SVM notifier structure
615  *
616  * This function inserts the GPU SVM notifier into the GPU SVM RB tree and list.
617  */
618 static void drm_gpusvm_notifier_insert(struct drm_gpusvm *gpusvm,
619 				       struct drm_gpusvm_notifier *notifier)
620 {
621 	struct rb_node *node;
622 	struct list_head *head;
623 
624 	interval_tree_insert(&notifier->itree, &gpusvm->root);
625 
626 	node = rb_prev(&notifier->itree.rb);
627 	if (node)
628 		head = &(to_drm_gpusvm_notifier(node))->entry;
629 	else
630 		head = &gpusvm->notifier_list;
631 
632 	list_add(&notifier->entry, head);
633 }
634 
635 /**
636  * drm_gpusvm_notifier_remove() - Remove GPU SVM notifier
637  * @gpusvm: Pointer to the GPU SVM tructure
638  * @notifier: Pointer to the GPU SVM notifier structure
639  *
640  * This function removes the GPU SVM notifier from the GPU SVM RB tree and list.
641  */
642 static void drm_gpusvm_notifier_remove(struct drm_gpusvm *gpusvm,
643 				       struct drm_gpusvm_notifier *notifier)
644 {
645 	interval_tree_remove(&notifier->itree, &gpusvm->root);
646 	list_del(&notifier->entry);
647 }
648 
649 /**
650  * drm_gpusvm_fini() - Finalize the GPU SVM.
651  * @gpusvm: Pointer to the GPU SVM structure.
652  *
653  * This function finalizes the GPU SVM by cleaning up any remaining ranges and
654  * notifiers, and dropping a reference to struct MM.
655  */
656 void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
657 {
658 	struct drm_gpusvm_notifier *notifier, *next;
659 
660 	drm_gpusvm_for_each_notifier_safe(notifier, next, gpusvm, 0, LONG_MAX) {
661 		struct drm_gpusvm_range *range, *__next;
662 
663 		/*
664 		 * Remove notifier first to avoid racing with any invalidation
665 		 */
666 		mmu_interval_notifier_remove(&notifier->notifier);
667 		notifier->flags.removed = true;
668 
669 		drm_gpusvm_for_each_range_safe(range, __next, notifier, 0,
670 					       LONG_MAX)
671 			drm_gpusvm_range_remove(gpusvm, range);
672 	}
673 
674 	mmdrop(gpusvm->mm);
675 	WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
676 }
677 EXPORT_SYMBOL_GPL(drm_gpusvm_fini);
678 
679 /**
680  * drm_gpusvm_notifier_alloc() - Allocate GPU SVM notifier
681  * @gpusvm: Pointer to the GPU SVM structure
682  * @fault_addr: Fault address
683  *
684  * This function allocates and initializes the GPU SVM notifier structure.
685  *
686  * Return: Pointer to the allocated GPU SVM notifier on success, ERR_PTR() on failure.
687  */
688 static struct drm_gpusvm_notifier *
689 drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, unsigned long fault_addr)
690 {
691 	struct drm_gpusvm_notifier *notifier;
692 
693 	if (gpusvm->ops->notifier_alloc)
694 		notifier = gpusvm->ops->notifier_alloc();
695 	else
696 		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
697 
698 	if (!notifier)
699 		return ERR_PTR(-ENOMEM);
700 
701 	notifier->gpusvm = gpusvm;
702 	notifier->itree.start = ALIGN_DOWN(fault_addr, gpusvm->notifier_size);
703 	notifier->itree.last = ALIGN(fault_addr + 1, gpusvm->notifier_size) - 1;
704 	INIT_LIST_HEAD(&notifier->entry);
705 	notifier->root = RB_ROOT_CACHED;
706 	INIT_LIST_HEAD(&notifier->range_list);
707 
708 	return notifier;
709 }
710 
711 /**
712  * drm_gpusvm_notifier_free() - Free GPU SVM notifier
713  * @gpusvm: Pointer to the GPU SVM structure
714  * @notifier: Pointer to the GPU SVM notifier structure
715  *
716  * This function frees the GPU SVM notifier structure.
717  */
718 static void drm_gpusvm_notifier_free(struct drm_gpusvm *gpusvm,
719 				     struct drm_gpusvm_notifier *notifier)
720 {
721 	WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
722 
723 	if (gpusvm->ops->notifier_free)
724 		gpusvm->ops->notifier_free(notifier);
725 	else
726 		kfree(notifier);
727 }
728 
729 /**
730  * to_drm_gpusvm_range() - retrieve the container struct for a given rbtree node
731  * @node: a pointer to the rbtree node embedded within a drm_gpusvm_range struct
732  *
733  * Return: A pointer to the containing drm_gpusvm_range structure.
734  */
735 static struct drm_gpusvm_range *to_drm_gpusvm_range(struct rb_node *node)
736 {
737 	return container_of(node, struct drm_gpusvm_range, itree.rb);
738 }
739 
740 /**
741  * drm_gpusvm_range_insert() - Insert GPU SVM range
742  * @notifier: Pointer to the GPU SVM notifier structure
743  * @range: Pointer to the GPU SVM range structure
744  *
745  * This function inserts the GPU SVM range into the notifier RB tree and list.
746  */
747 static void drm_gpusvm_range_insert(struct drm_gpusvm_notifier *notifier,
748 				    struct drm_gpusvm_range *range)
749 {
750 	struct rb_node *node;
751 	struct list_head *head;
752 
753 	drm_gpusvm_notifier_lock(notifier->gpusvm);
754 	interval_tree_insert(&range->itree, &notifier->root);
755 
756 	node = rb_prev(&range->itree.rb);
757 	if (node)
758 		head = &(to_drm_gpusvm_range(node))->entry;
759 	else
760 		head = &notifier->range_list;
761 
762 	list_add(&range->entry, head);
763 	drm_gpusvm_notifier_unlock(notifier->gpusvm);
764 }
765 
766 /**
767  * __drm_gpusvm_range_remove() - Remove GPU SVM range
768  * @notifier: Pointer to the GPU SVM notifier structure
769  * @range: Pointer to the GPU SVM range structure
770  *
771  * This macro removes the GPU SVM range from the notifier RB tree and list.
772  */
773 static void __drm_gpusvm_range_remove(struct drm_gpusvm_notifier *notifier,
774 				      struct drm_gpusvm_range *range)
775 {
776 	interval_tree_remove(&range->itree, &notifier->root);
777 	list_del(&range->entry);
778 }
779 
780 /**
781  * drm_gpusvm_range_alloc() - Allocate GPU SVM range
782  * @gpusvm: Pointer to the GPU SVM structure
783  * @notifier: Pointer to the GPU SVM notifier structure
784  * @fault_addr: Fault address
785  * @chunk_size: Chunk size
786  * @migrate_devmem: Flag indicating whether to migrate device memory
787  *
788  * This function allocates and initializes the GPU SVM range structure.
789  *
790  * Return: Pointer to the allocated GPU SVM range on success, ERR_PTR() on failure.
791  */
792 static struct drm_gpusvm_range *
793 drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
794 		       struct drm_gpusvm_notifier *notifier,
795 		       unsigned long fault_addr, unsigned long chunk_size,
796 		       bool migrate_devmem)
797 {
798 	struct drm_gpusvm_range *range;
799 
800 	if (gpusvm->ops->range_alloc)
801 		range = gpusvm->ops->range_alloc(gpusvm);
802 	else
803 		range = kzalloc(sizeof(*range), GFP_KERNEL);
804 
805 	if (!range)
806 		return ERR_PTR(-ENOMEM);
807 
808 	kref_init(&range->refcount);
809 	range->gpusvm = gpusvm;
810 	range->notifier = notifier;
811 	range->itree.start = ALIGN_DOWN(fault_addr, chunk_size);
812 	range->itree.last = ALIGN(fault_addr + 1, chunk_size) - 1;
813 	INIT_LIST_HEAD(&range->entry);
814 	range->notifier_seq = LONG_MAX;
815 	range->flags.migrate_devmem = migrate_devmem ? 1 : 0;
816 
817 	return range;
818 }
819 
820 /**
821  * drm_gpusvm_check_pages() - Check pages
822  * @gpusvm: Pointer to the GPU SVM structure
823  * @notifier: Pointer to the GPU SVM notifier structure
824  * @start: Start address
825  * @end: End address
826  *
827  * Check if pages between start and end have been faulted in on the CPU. Use to
828  * prevent migration of pages without CPU backing store.
829  *
830  * Return: True if pages have been faulted into CPU, False otherwise
831  */
832 static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
833 				   struct drm_gpusvm_notifier *notifier,
834 				   unsigned long start, unsigned long end)
835 {
836 	struct hmm_range hmm_range = {
837 		.default_flags = 0,
838 		.notifier = &notifier->notifier,
839 		.start = start,
840 		.end = end,
841 		.dev_private_owner = gpusvm->device_private_page_owner,
842 	};
843 	unsigned long timeout =
844 		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
845 	unsigned long *pfns;
846 	unsigned long npages = npages_in_range(start, end);
847 	int err, i;
848 
849 	mmap_assert_locked(gpusvm->mm);
850 
851 	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
852 	if (!pfns)
853 		return false;
854 
855 	hmm_range.notifier_seq = mmu_interval_read_begin(&notifier->notifier);
856 	hmm_range.hmm_pfns = pfns;
857 
858 	while (true) {
859 		err = hmm_range_fault(&hmm_range);
860 		if (err == -EBUSY) {
861 			if (time_after(jiffies, timeout))
862 				break;
863 
864 			hmm_range.notifier_seq =
865 				mmu_interval_read_begin(&notifier->notifier);
866 			continue;
867 		}
868 		break;
869 	}
870 	if (err)
871 		goto err_free;
872 
873 	for (i = 0; i < npages;) {
874 		if (!(pfns[i] & HMM_PFN_VALID)) {
875 			err = -EFAULT;
876 			goto err_free;
877 		}
878 		i += 0x1 << hmm_pfn_to_map_order(pfns[i]);
879 	}
880 
881 err_free:
882 	kvfree(pfns);
883 	return err ? false : true;
884 }
885 
886 /**
887  * drm_gpusvm_range_chunk_size() - Determine chunk size for GPU SVM range
888  * @gpusvm: Pointer to the GPU SVM structure
889  * @notifier: Pointer to the GPU SVM notifier structure
890  * @vas: Pointer to the virtual memory area structure
891  * @fault_addr: Fault address
892  * @gpuva_start: Start address of GPUVA which mirrors CPU
893  * @gpuva_end: End address of GPUVA which mirrors CPU
894  * @check_pages_threshold: Check CPU pages for present threshold
895  *
896  * This function determines the chunk size for the GPU SVM range based on the
897  * fault address, GPU SVM chunk sizes, existing GPU SVM ranges, and the virtual
898  * memory area boundaries.
899  *
900  * Return: Chunk size on success, LONG_MAX on failure.
901  */
902 static unsigned long
903 drm_gpusvm_range_chunk_size(struct drm_gpusvm *gpusvm,
904 			    struct drm_gpusvm_notifier *notifier,
905 			    struct vm_area_struct *vas,
906 			    unsigned long fault_addr,
907 			    unsigned long gpuva_start,
908 			    unsigned long gpuva_end,
909 			    unsigned long check_pages_threshold)
910 {
911 	unsigned long start, end;
912 	int i = 0;
913 
914 retry:
915 	for (; i < gpusvm->num_chunks; ++i) {
916 		start = ALIGN_DOWN(fault_addr, gpusvm->chunk_sizes[i]);
917 		end = ALIGN(fault_addr + 1, gpusvm->chunk_sizes[i]);
918 
919 		if (start >= vas->vm_start && end <= vas->vm_end &&
920 		    start >= drm_gpusvm_notifier_start(notifier) &&
921 		    end <= drm_gpusvm_notifier_end(notifier) &&
922 		    start >= gpuva_start && end <= gpuva_end)
923 			break;
924 	}
925 
926 	if (i == gpusvm->num_chunks)
927 		return LONG_MAX;
928 
929 	/*
930 	 * If allocation more than page, ensure not to overlap with existing
931 	 * ranges.
932 	 */
933 	if (end - start != SZ_4K) {
934 		struct drm_gpusvm_range *range;
935 
936 		range = drm_gpusvm_range_find(notifier, start, end);
937 		if (range) {
938 			++i;
939 			goto retry;
940 		}
941 
942 		/*
943 		 * XXX: Only create range on pages CPU has faulted in. Without
944 		 * this check, or prefault, on BMG 'xe_exec_system_allocator --r
945 		 * process-many-malloc' fails. In the failure case, each process
946 		 * mallocs 16k but the CPU VMA is ~128k which results in 64k SVM
947 		 * ranges. When migrating the SVM ranges, some processes fail in
948 		 * drm_gpusvm_migrate_to_devmem with 'migrate.cpages != npages'
949 		 * and then upon drm_gpusvm_range_get_pages device pages from
950 		 * other processes are collected + faulted in which creates all
951 		 * sorts of problems. Unsure exactly how this happening, also
952 		 * problem goes away if 'xe_exec_system_allocator --r
953 		 * process-many-malloc' mallocs at least 64k at a time.
954 		 */
955 		if (end - start <= check_pages_threshold &&
956 		    !drm_gpusvm_check_pages(gpusvm, notifier, start, end)) {
957 			++i;
958 			goto retry;
959 		}
960 	}
961 
962 	return end - start;
963 }
964 
#ifdef CONFIG_LOCKDEP
/**
 * drm_gpusvm_driver_lock_held() - Assert GPU SVM driver lock is held
 * @gpusvm: Pointer to the GPU SVM structure.
 *
 * Lockdep assertion that the lock described by gpusvm->lock_dep_map is held.
 * Nothing is checked when no lockdep map has been set, and the function is
 * empty when CONFIG_LOCKDEP is disabled.
 */
static void drm_gpusvm_driver_lock_held(struct drm_gpusvm *gpusvm)
{
	if (!gpusvm->lock_dep_map)
		return;

	lockdep_assert(lock_is_held_type(gpusvm->lock_dep_map, 0));
}
#else
static void drm_gpusvm_driver_lock_held(struct drm_gpusvm *gpusvm)
{
}
#endif
982 
983 /**
984  * drm_gpusvm_find_vma_start() - Find start address for first VMA in range
985  * @gpusvm: Pointer to the GPU SVM structure
986  * @start: The inclusive start user address.
987  * @end: The exclusive end user address.
988  *
989  * Returns: The start address of first VMA within the provided range,
990  * ULONG_MAX otherwise. Assumes start_addr < end_addr.
991  */
992 unsigned long
993 drm_gpusvm_find_vma_start(struct drm_gpusvm *gpusvm,
994 			  unsigned long start,
995 			  unsigned long end)
996 {
997 	struct mm_struct *mm = gpusvm->mm;
998 	struct vm_area_struct *vma;
999 	unsigned long addr = ULONG_MAX;
1000 
1001 	if (!mmget_not_zero(mm))
1002 		return addr;
1003 
1004 	mmap_read_lock(mm);
1005 
1006 	vma = find_vma_intersection(mm, start, end);
1007 	if (vma)
1008 		addr =  vma->vm_start;
1009 
1010 	mmap_read_unlock(mm);
1011 	mmput(mm);
1012 
1013 	return addr;
1014 }
1015 EXPORT_SYMBOL_GPL(drm_gpusvm_find_vma_start);
1016 
/**
 * drm_gpusvm_range_find_or_insert() - Find or insert GPU SVM range
 * @gpusvm: Pointer to the GPU SVM structure
 * @fault_addr: Fault address
 * @gpuva_start: Start address of GPUVA which mirrors CPU
 * @gpuva_end: End address of GPUVA which mirrors CPU
 * @ctx: GPU SVM context
 *
 * This function finds an existing GPU SVM range containing the fault address
 * or allocates and inserts a new one. A notifier covering the fault address is
 * allocated and registered on demand; if anything fails afterwards a notifier
 * allocated by this call is unregistered and freed again. Caller must hold a
 * lock to protect range lookup and insertion.
 *
 * Return: Pointer to the GPU SVM range on success, ERR_PTR() on failure
 * (-EINVAL, -EFAULT, -ENOENT, -EPERM or an error propagated from notifier or
 * range allocation).
 */
struct drm_gpusvm_range *
drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm,
				unsigned long fault_addr,
				unsigned long gpuva_start,
				unsigned long gpuva_end,
				const struct drm_gpusvm_ctx *ctx)
{
	struct drm_gpusvm_notifier *notifier;
	struct drm_gpusvm_range *range;
	struct mm_struct *mm = gpusvm->mm;
	struct vm_area_struct *vas;
	bool notifier_alloc = false;
	unsigned long chunk_size;
	int err;
	bool migrate_devmem;

	drm_gpusvm_driver_lock_held(gpusvm);

	/*
	 * NOTE(review): the upper bound uses '>', so a fault at exactly
	 * mm_start + mm_range is let through here; confirm whether the end of
	 * the GPU SVM address space is meant to be exclusive.
	 */
	if (fault_addr < gpusvm->mm_start ||
	    fault_addr > gpusvm->mm_start + gpusvm->mm_range)
		return ERR_PTR(-EINVAL);

	/* Hold a reference on the mm for the duration of the lookup. */
	if (!mmget_not_zero(mm))
		return ERR_PTR(-EFAULT);

	/* Reuse the notifier covering fault_addr or set up a new one. */
	notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr);
	if (!notifier) {
		notifier = drm_gpusvm_notifier_alloc(gpusvm, fault_addr);
		if (IS_ERR(notifier)) {
			err = PTR_ERR(notifier);
			goto err_mmunlock;
		}
		/* Remember that the error paths must tear this notifier down. */
		notifier_alloc = true;
		err = mmu_interval_notifier_insert(&notifier->notifier,
						   mm,
						   drm_gpusvm_notifier_start(notifier),
						   drm_gpusvm_notifier_size(notifier),
						   &drm_gpusvm_notifier_ops);
		if (err)
			goto err_notifier;
	}

	mmap_read_lock(mm);

	vas = vma_lookup(mm, fault_addr);
	if (!vas) {
		err = -ENOENT;
		goto err_notifier_remove;
	}

	/* A writable mapping was requested but the VMA is not writable. */
	if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
		err = -EPERM;
		goto err_notifier_remove;
	}

	/* Fast path: a range containing the fault address already exists. */
	range = drm_gpusvm_range_find(notifier, fault_addr, fault_addr + 1);
	if (range)
		goto out_mmunlock;
	/*
	 * XXX: Short-circuiting migration based on migrate_vma_* current
	 * limitations. If/when migrate_vma_* add more support, this logic will
	 * have to change.
	 */
	migrate_devmem = ctx->devmem_possible &&
		vma_is_anonymous(vas) && !is_vm_hugetlb_page(vas);

	chunk_size = drm_gpusvm_range_chunk_size(gpusvm, notifier, vas,
						 fault_addr, gpuva_start,
						 gpuva_end,
						 ctx->check_pages_threshold);
	/* LONG_MAX is the "no usable chunk size" sentinel. */
	if (chunk_size == LONG_MAX) {
		err = -EINVAL;
		goto err_notifier_remove;
	}

	range = drm_gpusvm_range_alloc(gpusvm, notifier, fault_addr, chunk_size,
				       migrate_devmem);
	if (IS_ERR(range)) {
		err = PTR_ERR(range);
		goto err_notifier_remove;
	}

	/* Publish the range, and the notifier too if it was created here. */
	drm_gpusvm_range_insert(notifier, range);
	if (notifier_alloc)
		drm_gpusvm_notifier_insert(gpusvm, notifier);

out_mmunlock:
	mmap_read_unlock(mm);
	mmput(mm);

	return range;

	/* Unwind in reverse order; only undo a notifier this call created. */
err_notifier_remove:
	mmap_read_unlock(mm);
	if (notifier_alloc)
		mmu_interval_notifier_remove(&notifier->notifier);
err_notifier:
	if (notifier_alloc)
		drm_gpusvm_notifier_free(gpusvm, notifier);
err_mmunlock:
	mmput(mm);
	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(drm_gpusvm_range_find_or_insert);
1134 
1135 /**
1136  * __drm_gpusvm_range_unmap_pages() - Unmap pages associated with a GPU SVM range (internal)
1137  * @gpusvm: Pointer to the GPU SVM structure
1138  * @range: Pointer to the GPU SVM range structure
1139  * @npages: Number of pages to unmap
1140  *
1141  * This function unmap pages associated with a GPU SVM range. Assumes and
1142  * asserts correct locking is in place when called.
1143  */
1144 static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
1145 					   struct drm_gpusvm_range *range,
1146 					   unsigned long npages)
1147 {
1148 	unsigned long i, j;
1149 	struct drm_pagemap *dpagemap = range->dpagemap;
1150 	struct device *dev = gpusvm->drm->dev;
1151 
1152 	lockdep_assert_held(&gpusvm->notifier_lock);
1153 
1154 	if (range->flags.has_dma_mapping) {
1155 		struct drm_gpusvm_range_flags flags = {
1156 			.__flags = range->flags.__flags,
1157 		};
1158 
1159 		for (i = 0, j = 0; i < npages; j++) {
1160 			struct drm_pagemap_device_addr *addr = &range->dma_addr[j];
1161 
1162 			if (addr->proto == DRM_INTERCONNECT_SYSTEM)
1163 				dma_unmap_page(dev,
1164 					       addr->addr,
1165 					       PAGE_SIZE << addr->order,
1166 					       addr->dir);
1167 			else if (dpagemap && dpagemap->ops->device_unmap)
1168 				dpagemap->ops->device_unmap(dpagemap,
1169 							    dev, *addr);
1170 			i += 1 << addr->order;
1171 		}
1172 
1173 		/* WRITE_ONCE pairs with READ_ONCE for opportunistic checks */
1174 		flags.has_devmem_pages = false;
1175 		flags.has_dma_mapping = false;
1176 		WRITE_ONCE(range->flags.__flags, flags.__flags);
1177 
1178 		range->dpagemap = NULL;
1179 	}
1180 }
1181 
1182 /**
1183  * drm_gpusvm_range_free_pages() - Free pages associated with a GPU SVM range
1184  * @gpusvm: Pointer to the GPU SVM structure
1185  * @range: Pointer to the GPU SVM range structure
1186  *
1187  * This function frees the dma address array associated with a GPU SVM range.
1188  */
1189 static void drm_gpusvm_range_free_pages(struct drm_gpusvm *gpusvm,
1190 					struct drm_gpusvm_range *range)
1191 {
1192 	lockdep_assert_held(&gpusvm->notifier_lock);
1193 
1194 	if (range->dma_addr) {
1195 		kvfree(range->dma_addr);
1196 		range->dma_addr = NULL;
1197 	}
1198 }
1199 
1200 /**
1201  * drm_gpusvm_range_remove() - Remove GPU SVM range
1202  * @gpusvm: Pointer to the GPU SVM structure
1203  * @range: Pointer to the GPU SVM range to be removed
1204  *
1205  * This function removes the specified GPU SVM range and also removes the parent
1206  * GPU SVM notifier if no more ranges remain in the notifier. The caller must
1207  * hold a lock to protect range and notifier removal.
1208  */
1209 void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
1210 			     struct drm_gpusvm_range *range)
1211 {
1212 	unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
1213 					       drm_gpusvm_range_end(range));
1214 	struct drm_gpusvm_notifier *notifier;
1215 
1216 	drm_gpusvm_driver_lock_held(gpusvm);
1217 
1218 	notifier = drm_gpusvm_notifier_find(gpusvm,
1219 					    drm_gpusvm_range_start(range));
1220 	if (WARN_ON_ONCE(!notifier))
1221 		return;
1222 
1223 	drm_gpusvm_notifier_lock(gpusvm);
1224 	__drm_gpusvm_range_unmap_pages(gpusvm, range, npages);
1225 	drm_gpusvm_range_free_pages(gpusvm, range);
1226 	__drm_gpusvm_range_remove(notifier, range);
1227 	drm_gpusvm_notifier_unlock(gpusvm);
1228 
1229 	drm_gpusvm_range_put(range);
1230 
1231 	if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
1232 		if (!notifier->flags.removed)
1233 			mmu_interval_notifier_remove(&notifier->notifier);
1234 		drm_gpusvm_notifier_remove(gpusvm, notifier);
1235 		drm_gpusvm_notifier_free(gpusvm, notifier);
1236 	}
1237 }
1238 EXPORT_SYMBOL_GPL(drm_gpusvm_range_remove);
1239 
1240 /**
1241  * drm_gpusvm_range_get() - Get a reference to GPU SVM range
1242  * @range: Pointer to the GPU SVM range
1243  *
1244  * This function increments the reference count of the specified GPU SVM range.
1245  *
1246  * Return: Pointer to the GPU SVM range.
1247  */
1248 struct drm_gpusvm_range *
1249 drm_gpusvm_range_get(struct drm_gpusvm_range *range)
1250 {
1251 	kref_get(&range->refcount);
1252 
1253 	return range;
1254 }
1255 EXPORT_SYMBOL_GPL(drm_gpusvm_range_get);
1256 
1257 /**
1258  * drm_gpusvm_range_destroy() - Destroy GPU SVM range
1259  * @refcount: Pointer to the reference counter embedded in the GPU SVM range
1260  *
1261  * This function destroys the specified GPU SVM range when its reference count
1262  * reaches zero. If a custom range-free function is provided, it is invoked to
1263  * free the range; otherwise, the range is deallocated using kfree().
1264  */
1265 static void drm_gpusvm_range_destroy(struct kref *refcount)
1266 {
1267 	struct drm_gpusvm_range *range =
1268 		container_of(refcount, struct drm_gpusvm_range, refcount);
1269 	struct drm_gpusvm *gpusvm = range->gpusvm;
1270 
1271 	if (gpusvm->ops->range_free)
1272 		gpusvm->ops->range_free(range);
1273 	else
1274 		kfree(range);
1275 }
1276 
/**
 * drm_gpusvm_range_put() - Put a reference to GPU SVM range
 * @range: Pointer to the GPU SVM range
 *
 * This function decrements the reference count of the specified GPU SVM range.
 * When the count reaches zero, drm_gpusvm_range_destroy() is invoked as the
 * kref release callback to free the range.
 */
void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
{
	kref_put(&range->refcount, drm_gpusvm_range_destroy);
}
EXPORT_SYMBOL_GPL(drm_gpusvm_range_put);
1289 
1290 /**
1291  * drm_gpusvm_range_pages_valid() - GPU SVM range pages valid
1292  * @gpusvm: Pointer to the GPU SVM structure
1293  * @range: Pointer to the GPU SVM range structure
1294  *
1295  * This function determines if a GPU SVM range pages are valid. Expected be
1296  * called holding gpusvm->notifier_lock and as the last step before committing a
1297  * GPU binding. This is akin to a notifier seqno check in the HMM documentation
1298  * but due to wider notifiers (i.e., notifiers which span multiple ranges) this
1299  * function is required for finer grained checking (i.e., per range) if pages
1300  * are valid.
1301  *
1302  * Return: True if GPU SVM range has valid pages, False otherwise
1303  */
1304 bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
1305 				  struct drm_gpusvm_range *range)
1306 {
1307 	lockdep_assert_held(&gpusvm->notifier_lock);
1308 
1309 	return range->flags.has_devmem_pages || range->flags.has_dma_mapping;
1310 }
1311 EXPORT_SYMBOL_GPL(drm_gpusvm_range_pages_valid);
1312 
1313 /**
1314  * drm_gpusvm_range_pages_valid_unlocked() - GPU SVM range pages valid unlocked
1315  * @gpusvm: Pointer to the GPU SVM structure
1316  * @range: Pointer to the GPU SVM range structure
1317  *
1318  * This function determines if a GPU SVM range pages are valid. Expected be
1319  * called without holding gpusvm->notifier_lock.
1320  *
1321  * Return: True if GPU SVM range has valid pages, False otherwise
1322  */
1323 static bool
1324 drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm *gpusvm,
1325 				      struct drm_gpusvm_range *range)
1326 {
1327 	bool pages_valid;
1328 
1329 	if (!range->dma_addr)
1330 		return false;
1331 
1332 	drm_gpusvm_notifier_lock(gpusvm);
1333 	pages_valid = drm_gpusvm_range_pages_valid(gpusvm, range);
1334 	if (!pages_valid)
1335 		drm_gpusvm_range_free_pages(gpusvm, range);
1336 	drm_gpusvm_notifier_unlock(gpusvm);
1337 
1338 	return pages_valid;
1339 }
1340 
1341 /**
1342  * drm_gpusvm_range_get_pages() - Get pages for a GPU SVM range
1343  * @gpusvm: Pointer to the GPU SVM structure
1344  * @range: Pointer to the GPU SVM range structure
1345  * @ctx: GPU SVM context
1346  *
1347  * This function gets pages for a GPU SVM range and ensures they are mapped for
1348  * DMA access.
1349  *
1350  * Return: 0 on success, negative error code on failure.
1351  */
1352 int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
1353 			       struct drm_gpusvm_range *range,
1354 			       const struct drm_gpusvm_ctx *ctx)
1355 {
1356 	struct mmu_interval_notifier *notifier = &range->notifier->notifier;
1357 	struct hmm_range hmm_range = {
1358 		.default_flags = HMM_PFN_REQ_FAULT | (ctx->read_only ? 0 :
1359 			HMM_PFN_REQ_WRITE),
1360 		.notifier = notifier,
1361 		.start = drm_gpusvm_range_start(range),
1362 		.end = drm_gpusvm_range_end(range),
1363 		.dev_private_owner = gpusvm->device_private_page_owner,
1364 	};
1365 	struct mm_struct *mm = gpusvm->mm;
1366 	struct drm_gpusvm_zdd *zdd;
1367 	unsigned long timeout =
1368 		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
1369 	unsigned long i, j;
1370 	unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
1371 					       drm_gpusvm_range_end(range));
1372 	unsigned long num_dma_mapped;
1373 	unsigned int order = 0;
1374 	unsigned long *pfns;
1375 	int err = 0;
1376 	struct dev_pagemap *pagemap;
1377 	struct drm_pagemap *dpagemap;
1378 	struct drm_gpusvm_range_flags flags;
1379 
1380 retry:
1381 	hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
1382 	if (drm_gpusvm_range_pages_valid_unlocked(gpusvm, range))
1383 		goto set_seqno;
1384 
1385 	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
1386 	if (!pfns)
1387 		return -ENOMEM;
1388 
1389 	if (!mmget_not_zero(mm)) {
1390 		err = -EFAULT;
1391 		goto err_free;
1392 	}
1393 
1394 	hmm_range.hmm_pfns = pfns;
1395 	while (true) {
1396 		mmap_read_lock(mm);
1397 		err = hmm_range_fault(&hmm_range);
1398 		mmap_read_unlock(mm);
1399 
1400 		if (err == -EBUSY) {
1401 			if (time_after(jiffies, timeout))
1402 				break;
1403 
1404 			hmm_range.notifier_seq =
1405 				mmu_interval_read_begin(notifier);
1406 			continue;
1407 		}
1408 		break;
1409 	}
1410 	mmput(mm);
1411 	if (err)
1412 		goto err_free;
1413 
1414 map_pages:
1415 	/*
1416 	 * Perform all dma mappings under the notifier lock to not
1417 	 * access freed pages. A notifier will either block on
1418 	 * the notifier lock or unmap dma.
1419 	 */
1420 	drm_gpusvm_notifier_lock(gpusvm);
1421 
1422 	flags.__flags = range->flags.__flags;
1423 	if (flags.unmapped) {
1424 		drm_gpusvm_notifier_unlock(gpusvm);
1425 		err = -EFAULT;
1426 		goto err_free;
1427 	}
1428 
1429 	if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
1430 		drm_gpusvm_notifier_unlock(gpusvm);
1431 		kvfree(pfns);
1432 		goto retry;
1433 	}
1434 
1435 	if (!range->dma_addr) {
1436 		/* Unlock and restart mapping to allocate memory. */
1437 		drm_gpusvm_notifier_unlock(gpusvm);
1438 		range->dma_addr = kvmalloc_array(npages,
1439 						 sizeof(*range->dma_addr),
1440 						 GFP_KERNEL);
1441 		if (!range->dma_addr) {
1442 			err = -ENOMEM;
1443 			goto err_free;
1444 		}
1445 		goto map_pages;
1446 	}
1447 
1448 	zdd = NULL;
1449 	num_dma_mapped = 0;
1450 	for (i = 0, j = 0; i < npages; ++j) {
1451 		struct page *page = hmm_pfn_to_page(pfns[i]);
1452 
1453 		order = hmm_pfn_to_map_order(pfns[i]);
1454 		if (is_device_private_page(page) ||
1455 		    is_device_coherent_page(page)) {
1456 			if (zdd != page->zone_device_data && i > 0) {
1457 				err = -EOPNOTSUPP;
1458 				goto err_unmap;
1459 			}
1460 			zdd = page->zone_device_data;
1461 			if (pagemap != page_pgmap(page)) {
1462 				if (i > 0) {
1463 					err = -EOPNOTSUPP;
1464 					goto err_unmap;
1465 				}
1466 
1467 				pagemap = page_pgmap(page);
1468 				dpagemap = zdd->devmem_allocation->dpagemap;
1469 				if (drm_WARN_ON(gpusvm->drm, !dpagemap)) {
1470 					/*
1471 					 * Raced. This is not supposed to happen
1472 					 * since hmm_range_fault() should've migrated
1473 					 * this page to system.
1474 					 */
1475 					err = -EAGAIN;
1476 					goto err_unmap;
1477 				}
1478 			}
1479 			range->dma_addr[j] =
1480 				dpagemap->ops->device_map(dpagemap,
1481 							  gpusvm->drm->dev,
1482 							  page, order,
1483 							  DMA_BIDIRECTIONAL);
1484 			if (dma_mapping_error(gpusvm->drm->dev,
1485 					      range->dma_addr[j].addr)) {
1486 				err = -EFAULT;
1487 				goto err_unmap;
1488 			}
1489 		} else {
1490 			dma_addr_t addr;
1491 
1492 			if (is_zone_device_page(page) || zdd) {
1493 				err = -EOPNOTSUPP;
1494 				goto err_unmap;
1495 			}
1496 
1497 			if (ctx->devmem_only) {
1498 				err = -EFAULT;
1499 				goto err_unmap;
1500 			}
1501 
1502 			addr = dma_map_page(gpusvm->drm->dev,
1503 					    page, 0,
1504 					    PAGE_SIZE << order,
1505 					    DMA_BIDIRECTIONAL);
1506 			if (dma_mapping_error(gpusvm->drm->dev, addr)) {
1507 				err = -EFAULT;
1508 				goto err_unmap;
1509 			}
1510 
1511 			range->dma_addr[j] = drm_pagemap_device_addr_encode
1512 				(addr, DRM_INTERCONNECT_SYSTEM, order,
1513 				 DMA_BIDIRECTIONAL);
1514 		}
1515 		i += 1 << order;
1516 		num_dma_mapped = i;
1517 		flags.has_dma_mapping = true;
1518 	}
1519 
1520 	if (zdd) {
1521 		flags.has_devmem_pages = true;
1522 		range->dpagemap = dpagemap;
1523 	}
1524 
1525 	/* WRITE_ONCE pairs with READ_ONCE for opportunistic checks */
1526 	WRITE_ONCE(range->flags.__flags, flags.__flags);
1527 
1528 	drm_gpusvm_notifier_unlock(gpusvm);
1529 	kvfree(pfns);
1530 set_seqno:
1531 	range->notifier_seq = hmm_range.notifier_seq;
1532 
1533 	return 0;
1534 
1535 err_unmap:
1536 	__drm_gpusvm_range_unmap_pages(gpusvm, range, num_dma_mapped);
1537 	drm_gpusvm_notifier_unlock(gpusvm);
1538 err_free:
1539 	kvfree(pfns);
1540 	if (err == -EAGAIN)
1541 		goto retry;
1542 	return err;
1543 }
1544 EXPORT_SYMBOL_GPL(drm_gpusvm_range_get_pages);
1545 
1546 /**
1547  * drm_gpusvm_range_unmap_pages() - Unmap pages associated with a GPU SVM range
1548  * @gpusvm: Pointer to the GPU SVM structure
1549  * @range: Pointer to the GPU SVM range structure
1550  * @ctx: GPU SVM context
1551  *
1552  * This function unmaps pages associated with a GPU SVM range. If @in_notifier
1553  * is set, it is assumed that gpusvm->notifier_lock is held in write mode; if it
1554  * is clear, it acquires gpusvm->notifier_lock in read mode. Must be called on
1555  * each GPU SVM range attached to notifier in gpusvm->ops->invalidate for IOMMU
1556  * security model.
1557  */
1558 void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
1559 				  struct drm_gpusvm_range *range,
1560 				  const struct drm_gpusvm_ctx *ctx)
1561 {
1562 	unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
1563 					       drm_gpusvm_range_end(range));
1564 
1565 	if (ctx->in_notifier)
1566 		lockdep_assert_held_write(&gpusvm->notifier_lock);
1567 	else
1568 		drm_gpusvm_notifier_lock(gpusvm);
1569 
1570 	__drm_gpusvm_range_unmap_pages(gpusvm, range, npages);
1571 
1572 	if (!ctx->in_notifier)
1573 		drm_gpusvm_notifier_unlock(gpusvm);
1574 }
1575 EXPORT_SYMBOL_GPL(drm_gpusvm_range_unmap_pages);
1576 
/**
 * drm_gpusvm_migration_unlock_put_page() - Put a migration page
 * @page: Pointer to the page to put
 *
 * This function unlocks @page and then drops a reference on it, in that
 * order.
 */
static void drm_gpusvm_migration_unlock_put_page(struct page *page)
{
	unlock_page(page);
	put_page(page);
}
1588 
/**
 * drm_gpusvm_migration_unlock_put_pages() - Put migration pages
 * @npages: Number of pages
 * @migrate_pfn: Array of migrate page frame numbers
 *
 * This function unlocks and puts the page behind every non-zero entry of
 * @migrate_pfn and zeroes each processed entry.
 */
static void drm_gpusvm_migration_unlock_put_pages(unsigned long npages,
						  unsigned long *migrate_pfn)
{
	unsigned long *mpfn;
	unsigned long *last = migrate_pfn + npages;

	for (mpfn = migrate_pfn; mpfn != last; ++mpfn) {
		if (!*mpfn)
			continue;

		drm_gpusvm_migration_unlock_put_page(migrate_pfn_to_page(*mpfn));
		*mpfn = 0;
	}
}
1612 
1613 /**
1614  * drm_gpusvm_get_devmem_page() - Get a reference to a device memory page
1615  * @page: Pointer to the page
1616  * @zdd: Pointer to the GPU SVM zone device data
1617  *
1618  * This function associates the given page with the specified GPU SVM zone
1619  * device data and initializes it for zone device usage.
1620  */
1621 static void drm_gpusvm_get_devmem_page(struct page *page,
1622 				       struct drm_gpusvm_zdd *zdd)
1623 {
1624 	page->zone_device_data = drm_gpusvm_zdd_get(zdd);
1625 	zone_device_page_init(page);
1626 }
1627 
1628 /**
1629  * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM migration
1630  * @dev: The device for which the pages are being mapped
1631  * @dma_addr: Array to store DMA addresses corresponding to mapped pages
1632  * @migrate_pfn: Array of migrate page frame numbers to map
1633  * @npages: Number of pages to map
1634  * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
1635  *
1636  * This function maps pages of memory for migration usage in GPU SVM. It
1637  * iterates over each page frame number provided in @migrate_pfn, maps the
1638  * corresponding page, and stores the DMA address in the provided @dma_addr
1639  * array.
1640  *
1641  * Return: 0 on success, -EFAULT if an error occurs during mapping.
1642  */
1643 static int drm_gpusvm_migrate_map_pages(struct device *dev,
1644 					dma_addr_t *dma_addr,
1645 					unsigned long *migrate_pfn,
1646 					unsigned long npages,
1647 					enum dma_data_direction dir)
1648 {
1649 	unsigned long i;
1650 
1651 	for (i = 0; i < npages; ++i) {
1652 		struct page *page = migrate_pfn_to_page(migrate_pfn[i]);
1653 
1654 		if (!page)
1655 			continue;
1656 
1657 		if (WARN_ON_ONCE(is_zone_device_page(page)))
1658 			return -EFAULT;
1659 
1660 		dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
1661 		if (dma_mapping_error(dev, dma_addr[i]))
1662 			return -EFAULT;
1663 	}
1664 
1665 	return 0;
1666 }
1667 
1668 /**
1669  * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped for GPU SVM migration
1670  * @dev: The device for which the pages were mapped
1671  * @dma_addr: Array of DMA addresses corresponding to mapped pages
1672  * @npages: Number of pages to unmap
1673  * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
1674  *
1675  * This function unmaps previously mapped pages of memory for GPU Shared Virtual
1676  * Memory (SVM). It iterates over each DMA address provided in @dma_addr, checks
1677  * if it's valid and not already unmapped, and unmaps the corresponding page.
1678  */
1679 static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
1680 					   dma_addr_t *dma_addr,
1681 					   unsigned long npages,
1682 					   enum dma_data_direction dir)
1683 {
1684 	unsigned long i;
1685 
1686 	for (i = 0; i < npages; ++i) {
1687 		if (!dma_addr[i] || dma_mapping_error(dev, dma_addr[i]))
1688 			continue;
1689 
1690 		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
1691 	}
1692 }
1693 
/**
 * drm_gpusvm_migrate_to_devmem() - Migrate GPU SVM range to device memory
 * @gpusvm: Pointer to the GPU SVM structure
 * @range: Pointer to the GPU SVM range structure
 * @devmem_allocation: Pointer to the device memory allocation. The caller
 *                     should hold a reference to the device memory allocation,
 *                     which should be dropped via ops->devmem_release or upon
 *                     the failure of this function.
 * @ctx: GPU SVM context
 *
 * This function migrates the specified GPU SVM range to device memory. It
 * performs the necessary setup and invokes the driver-specific operations for
 * migration to device memory. Upon successful return, @devmem_allocation can
 * safely reference @range until ops->devmem_release is called, which only
 * happens upon successful return. Expected to be called while holding the mmap
 * lock in read mode.
 *
 * Return: 0 on success, negative error code on failure.
 */
int drm_gpusvm_migrate_to_devmem(struct drm_gpusvm *gpusvm,
				 struct drm_gpusvm_range *range,
				 struct drm_gpusvm_devmem *devmem_allocation,
				 const struct drm_gpusvm_ctx *ctx)
{
	const struct drm_gpusvm_devmem_ops *ops = devmem_allocation->ops;
	unsigned long start = drm_gpusvm_range_start(range),
		      end = drm_gpusvm_range_end(range);
	struct migrate_vma migrate = {
		.start		= start,
		.end		= end,
		.pgmap_owner	= gpusvm->device_private_page_owner,
		.flags		= MIGRATE_VMA_SELECT_SYSTEM,
	};
	struct mm_struct *mm = gpusvm->mm;
	unsigned long i, npages = npages_in_range(start, end);
	struct vm_area_struct *vas;
	struct drm_gpusvm_zdd *zdd = NULL;
	struct page **pages;
	dma_addr_t *dma_addr;
	void *buf;
	int err;

	mmap_assert_locked(gpusvm->mm);

	/* The range was created with device memory migration disabled. */
	if (!range->flags.migrate_devmem)
		return -EINVAL;

	/* All three driver hooks are required for a migration. */
	if (!ops->populate_devmem_pfn || !ops->copy_to_devmem ||
	    !ops->copy_to_ram)
		return -EOPNOTSUPP;

	vas = vma_lookup(mm, start);
	if (!vas) {
		err = -ENOENT;
		goto err_out;
	}

	/* The whole range must be covered by this single VMA. */
	if (end > vas->vm_end || start < vas->vm_start) {
		err = -EINVAL;
		goto err_out;
	}

	if (!vma_is_anonymous(vas)) {
		err = -EBUSY;
		goto err_out;
	}

	/*
	 * One zeroed allocation carved into four arrays of npages entries:
	 * migrate.src, migrate.dst, dma_addr and pages, in that order.
	 */
	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
		       sizeof(*pages), GFP_KERNEL);
	if (!buf) {
		err = -ENOMEM;
		goto err_out;
	}
	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;

	zdd = drm_gpusvm_zdd_alloc(gpusvm->device_private_page_owner);
	if (!zdd) {
		err = -ENOMEM;
		goto err_free;
	}

	migrate.vma = vas;
	migrate.src = buf;
	migrate.dst = migrate.src + npages;

	err = migrate_vma_setup(&migrate);
	if (err)
		goto err_free;

	/* Nothing was collected for migration. */
	if (!migrate.cpages) {
		err = -EFAULT;
		goto err_free;
	}

	/* Partial migration of a range is not attempted. */
	if (migrate.cpages != npages) {
		err = -EBUSY;
		goto err_finalize;
	}

	err = ops->populate_devmem_pfn(devmem_allocation, npages, migrate.dst);
	if (err)
		goto err_finalize;

	err = drm_gpusvm_migrate_map_pages(devmem_allocation->dev, dma_addr,
					   migrate.src, npages, DMA_TO_DEVICE);
	if (err)
		goto err_finalize;

	/*
	 * Convert the PFNs filled in by populate_devmem_pfn() into migrate
	 * PFN entries and attach a zdd reference to every destination page.
	 */
	for (i = 0; i < npages; ++i) {
		struct page *page = pfn_to_page(migrate.dst[i]);

		pages[i] = page;
		migrate.dst[i] = migrate_pfn(migrate.dst[i]);
		drm_gpusvm_get_devmem_page(page, zdd);
	}

	err = ops->copy_to_devmem(pages, dma_addr, npages);
	if (err)
		goto err_finalize;

	/* Upon success bind devmem allocation to range and zdd */
	devmem_allocation->timeslice_expiration = get_jiffies_64() +
		msecs_to_jiffies(ctx->timeslice_ms);
	zdd->devmem_allocation = devmem_allocation;	/* Owns ref */

	/* Reached on both success (err == 0) and failure. */
err_finalize:
	/*
	 * NOTE(review): when this is reached before the conversion loop above
	 * has run, migrate.dst entries have not been passed through
	 * migrate_pfn() yet; confirm drm_gpusvm_migration_unlock_put_pages()
	 * is safe on such entries.
	 */
	if (err)
		drm_gpusvm_migration_unlock_put_pages(npages, migrate.dst);
	migrate_vma_pages(&migrate);
	migrate_vma_finalize(&migrate);
	drm_gpusvm_migrate_unmap_pages(devmem_allocation->dev, dma_addr, npages,
				       DMA_TO_DEVICE);
err_free:
	/* Drop the local zdd reference; pages hold their own references. */
	if (zdd)
		drm_gpusvm_zdd_put(zdd);
	kvfree(buf);
err_out:
	return err;
}
EXPORT_SYMBOL_GPL(drm_gpusvm_migrate_to_devmem);
1835 
1836 /**
1837  * drm_gpusvm_migrate_populate_ram_pfn() - Populate RAM PFNs for a VM area
1838  * @vas: Pointer to the VM area structure, can be NULL
1839  * @fault_page: Fault page
1840  * @npages: Number of pages to populate
1841  * @mpages: Number of pages to migrate
1842  * @src_mpfn: Source array of migrate PFNs
1843  * @mpfn: Array of migrate PFNs to populate
1844  * @addr: Start address for PFN allocation
1845  *
1846  * This function populates the RAM migrate page frame numbers (PFNs) for the
1847  * specified VM area structure. It allocates and locks pages in the VM area for
1848  * RAM usage. If vas is non-NULL use alloc_page_vma for allocation, if NULL use
1849  * alloc_page for allocation.
1850  *
1851  * Return: 0 on success, negative error code on failure.
1852  */
1853 static int drm_gpusvm_migrate_populate_ram_pfn(struct vm_area_struct *vas,
1854 					       struct page *fault_page,
1855 					       unsigned long npages,
1856 					       unsigned long *mpages,
1857 					       unsigned long *src_mpfn,
1858 					       unsigned long *mpfn,
1859 					       unsigned long addr)
1860 {
1861 	unsigned long i;
1862 
1863 	for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
1864 		struct page *page, *src_page;
1865 
1866 		if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
1867 			continue;
1868 
1869 		src_page = migrate_pfn_to_page(src_mpfn[i]);
1870 		if (!src_page)
1871 			continue;
1872 
1873 		if (fault_page) {
1874 			if (src_page->zone_device_data !=
1875 			    fault_page->zone_device_data)
1876 				continue;
1877 		}
1878 
1879 		if (vas)
1880 			page = alloc_page_vma(GFP_HIGHUSER, vas, addr);
1881 		else
1882 			page = alloc_page(GFP_HIGHUSER);
1883 
1884 		if (!page)
1885 			goto free_pages;
1886 
1887 		mpfn[i] = migrate_pfn(page_to_pfn(page));
1888 	}
1889 
1890 	for (i = 0; i < npages; ++i) {
1891 		struct page *page = migrate_pfn_to_page(mpfn[i]);
1892 
1893 		if (!page)
1894 			continue;
1895 
1896 		WARN_ON_ONCE(!trylock_page(page));
1897 		++*mpages;
1898 	}
1899 
1900 	return 0;
1901 
1902 free_pages:
1903 	for (i = 0; i < npages; ++i) {
1904 		struct page *page = migrate_pfn_to_page(mpfn[i]);
1905 
1906 		if (!page)
1907 			continue;
1908 
1909 		put_page(page);
1910 		mpfn[i] = 0;
1911 	}
1912 	return -ENOMEM;
1913 }
1914 
1915 /**
1916  * drm_gpusvm_evict_to_ram() - Evict GPU SVM range to RAM
1917  * @devmem_allocation: Pointer to the device memory allocation
1918  *
1919  * Similar to __drm_gpusvm_migrate_to_ram but does not require mmap lock and
1920  * migration done via migrate_device_* functions.
1921  *
1922  * Return: 0 on success, negative error code on failure.
1923  */
1924 int drm_gpusvm_evict_to_ram(struct drm_gpusvm_devmem *devmem_allocation)
1925 {
1926 	const struct drm_gpusvm_devmem_ops *ops = devmem_allocation->ops;
1927 	unsigned long npages, mpages = 0;
1928 	struct page **pages;
1929 	unsigned long *src, *dst;
1930 	dma_addr_t *dma_addr;
1931 	void *buf;
1932 	int i, err = 0;
1933 	unsigned int retry_count = 2;
1934 
1935 	npages = devmem_allocation->size >> PAGE_SHIFT;
1936 
1937 retry:
1938 	if (!mmget_not_zero(devmem_allocation->mm))
1939 		return -EFAULT;
1940 
1941 	buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
1942 		       sizeof(*pages), GFP_KERNEL);
1943 	if (!buf) {
1944 		err = -ENOMEM;
1945 		goto err_out;
1946 	}
1947 	src = buf;
1948 	dst = buf + (sizeof(*src) * npages);
1949 	dma_addr = buf + (2 * sizeof(*src) * npages);
1950 	pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) * npages;
1951 
1952 	err = ops->populate_devmem_pfn(devmem_allocation, npages, src);
1953 	if (err)
1954 		goto err_free;
1955 
1956 	err = migrate_device_pfns(src, npages);
1957 	if (err)
1958 		goto err_free;
1959 
1960 	err = drm_gpusvm_migrate_populate_ram_pfn(NULL, NULL, npages, &mpages,
1961 						  src, dst, 0);
1962 	if (err || !mpages)
1963 		goto err_finalize;
1964 
1965 	err = drm_gpusvm_migrate_map_pages(devmem_allocation->dev, dma_addr,
1966 					   dst, npages, DMA_FROM_DEVICE);
1967 	if (err)
1968 		goto err_finalize;
1969 
1970 	for (i = 0; i < npages; ++i)
1971 		pages[i] = migrate_pfn_to_page(src[i]);
1972 
1973 	err = ops->copy_to_ram(pages, dma_addr, npages);
1974 	if (err)
1975 		goto err_finalize;
1976 
1977 err_finalize:
1978 	if (err)
1979 		drm_gpusvm_migration_unlock_put_pages(npages, dst);
1980 	migrate_device_pages(src, dst, npages);
1981 	migrate_device_finalize(src, dst, npages);
1982 	drm_gpusvm_migrate_unmap_pages(devmem_allocation->dev, dma_addr, npages,
1983 				       DMA_FROM_DEVICE);
1984 err_free:
1985 	kvfree(buf);
1986 err_out:
1987 	mmput_async(devmem_allocation->mm);
1988 
1989 	if (completion_done(&devmem_allocation->detached))
1990 		return 0;
1991 
1992 	if (retry_count--) {
1993 		cond_resched();
1994 		goto retry;
1995 	}
1996 
1997 	return err ?: -EBUSY;
1998 }
1999 EXPORT_SYMBOL_GPL(drm_gpusvm_evict_to_ram);
2000 
2001 /**
2002  * __drm_gpusvm_migrate_to_ram() - Migrate GPU SVM range to RAM (internal)
2003  * @vas: Pointer to the VM area structure
2004  * @device_private_page_owner: Device private pages owner
2005  * @page: Pointer to the page for fault handling (can be NULL)
2006  * @fault_addr: Fault address
2007  * @size: Size of migration
2008  *
2009  * This internal function performs the migration of the specified GPU SVM range
2010  * to RAM. It sets up the migration, populates + dma maps RAM PFNs, and
2011  * invokes the driver-specific operations for migration to RAM.
2012  *
2013  * Return: 0 on success, negative error code on failure.
2014  */
2015 static int __drm_gpusvm_migrate_to_ram(struct vm_area_struct *vas,
2016 				       void *device_private_page_owner,
2017 				       struct page *page,
2018 				       unsigned long fault_addr,
2019 				       unsigned long size)
2020 {
2021 	struct migrate_vma migrate = {
2022 		.vma		= vas,
2023 		.pgmap_owner	= device_private_page_owner,
2024 		.flags		= MIGRATE_VMA_SELECT_DEVICE_PRIVATE |
2025 			MIGRATE_VMA_SELECT_DEVICE_COHERENT,
2026 		.fault_page	= page,
2027 	};
2028 	struct drm_gpusvm_zdd *zdd;
2029 	const struct drm_gpusvm_devmem_ops *ops;
2030 	struct device *dev = NULL;
2031 	unsigned long npages, mpages = 0;
2032 	struct page **pages;
2033 	dma_addr_t *dma_addr;
2034 	unsigned long start, end;
2035 	void *buf;
2036 	int i, err = 0;
2037 
2038 	if (page) {
2039 		zdd = page->zone_device_data;
2040 		if (time_before64(get_jiffies_64(),
2041 				  zdd->devmem_allocation->timeslice_expiration))
2042 			return 0;
2043 	}
2044 
2045 	start = ALIGN_DOWN(fault_addr, size);
2046 	end = ALIGN(fault_addr + 1, size);
2047 
2048 	/* Corner where VMA area struct has been partially unmapped */
2049 	if (start < vas->vm_start)
2050 		start = vas->vm_start;
2051 	if (end > vas->vm_end)
2052 		end = vas->vm_end;
2053 
2054 	migrate.start = start;
2055 	migrate.end = end;
2056 	npages = npages_in_range(start, end);
2057 
2058 	buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
2059 		       sizeof(*pages), GFP_KERNEL);
2060 	if (!buf) {
2061 		err = -ENOMEM;
2062 		goto err_out;
2063 	}
2064 	dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
2065 	pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
2066 
2067 	migrate.vma = vas;
2068 	migrate.src = buf;
2069 	migrate.dst = migrate.src + npages;
2070 
2071 	err = migrate_vma_setup(&migrate);
2072 	if (err)
2073 		goto err_free;
2074 
2075 	/* Raced with another CPU fault, nothing to do */
2076 	if (!migrate.cpages)
2077 		goto err_free;
2078 
2079 	if (!page) {
2080 		for (i = 0; i < npages; ++i) {
2081 			if (!(migrate.src[i] & MIGRATE_PFN_MIGRATE))
2082 				continue;
2083 
2084 			page = migrate_pfn_to_page(migrate.src[i]);
2085 			break;
2086 		}
2087 
2088 		if (!page)
2089 			goto err_finalize;
2090 	}
2091 	zdd = page->zone_device_data;
2092 	ops = zdd->devmem_allocation->ops;
2093 	dev = zdd->devmem_allocation->dev;
2094 
2095 	err = drm_gpusvm_migrate_populate_ram_pfn(vas, page, npages, &mpages,
2096 						  migrate.src, migrate.dst,
2097 						  start);
2098 	if (err)
2099 		goto err_finalize;
2100 
2101 	err = drm_gpusvm_migrate_map_pages(dev, dma_addr, migrate.dst, npages,
2102 					   DMA_FROM_DEVICE);
2103 	if (err)
2104 		goto err_finalize;
2105 
2106 	for (i = 0; i < npages; ++i)
2107 		pages[i] = migrate_pfn_to_page(migrate.src[i]);
2108 
2109 	err = ops->copy_to_ram(pages, dma_addr, npages);
2110 	if (err)
2111 		goto err_finalize;
2112 
2113 err_finalize:
2114 	if (err)
2115 		drm_gpusvm_migration_unlock_put_pages(npages, migrate.dst);
2116 	migrate_vma_pages(&migrate);
2117 	migrate_vma_finalize(&migrate);
2118 	if (dev)
2119 		drm_gpusvm_migrate_unmap_pages(dev, dma_addr, npages,
2120 					       DMA_FROM_DEVICE);
2121 err_free:
2122 	kvfree(buf);
2123 err_out:
2124 
2125 	return err;
2126 }
2127 
2128 /**
2129  * drm_gpusvm_range_evict - Evict GPU SVM range
2130  * @range: Pointer to the GPU SVM range to be removed
2131  *
2132  * This function evicts the specified GPU SVM range. This function will not
2133  * evict coherent pages.
2134  *
2135  * Return: 0 on success, a negative error code on failure.
2136  */
2137 int drm_gpusvm_range_evict(struct drm_gpusvm *gpusvm,
2138 			   struct drm_gpusvm_range *range)
2139 {
2140 	struct mmu_interval_notifier *notifier = &range->notifier->notifier;
2141 	struct hmm_range hmm_range = {
2142 		.default_flags = HMM_PFN_REQ_FAULT,
2143 		.notifier = notifier,
2144 		.start = drm_gpusvm_range_start(range),
2145 		.end = drm_gpusvm_range_end(range),
2146 		.dev_private_owner = NULL,
2147 	};
2148 	unsigned long timeout =
2149 		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
2150 	unsigned long *pfns;
2151 	unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
2152 					       drm_gpusvm_range_end(range));
2153 	int err = 0;
2154 	struct mm_struct *mm = gpusvm->mm;
2155 
2156 	if (!mmget_not_zero(mm))
2157 		return -EFAULT;
2158 
2159 	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
2160 	if (!pfns)
2161 		return -ENOMEM;
2162 
2163 	hmm_range.hmm_pfns = pfns;
2164 	while (!time_after(jiffies, timeout)) {
2165 		hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
2166 		if (time_after(jiffies, timeout)) {
2167 			err = -ETIME;
2168 			break;
2169 		}
2170 
2171 		mmap_read_lock(mm);
2172 		err = hmm_range_fault(&hmm_range);
2173 		mmap_read_unlock(mm);
2174 		if (err != -EBUSY)
2175 			break;
2176 	}
2177 
2178 	kvfree(pfns);
2179 	mmput(mm);
2180 
2181 	return err;
2182 }
2183 EXPORT_SYMBOL_GPL(drm_gpusvm_range_evict);
2184 
2185 /**
2186  * drm_gpusvm_page_free() - Put GPU SVM zone device data associated with a page
2187  * @page: Pointer to the page
2188  *
2189  * This function is a callback used to put the GPU SVM zone device data
2190  * associated with a page when it is being released.
2191  */
2192 static void drm_gpusvm_page_free(struct page *page)
2193 {
2194 	drm_gpusvm_zdd_put(page->zone_device_data);
2195 }
2196 
2197 /**
2198  * drm_gpusvm_migrate_to_ram() - Migrate GPU SVM range to RAM (page fault handler)
2199  * @vmf: Pointer to the fault information structure
2200  *
2201  * This function is a page fault handler used to migrate a GPU SVM range to RAM.
2202  * It retrieves the GPU SVM range information from the faulting page and invokes
2203  * the internal migration function to migrate the range back to RAM.
2204  *
2205  * Return: VM_FAULT_SIGBUS on failure, 0 on success.
2206  */
2207 static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
2208 {
2209 	struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
2210 	int err;
2211 
2212 	err = __drm_gpusvm_migrate_to_ram(vmf->vma,
2213 					  zdd->device_private_page_owner,
2214 					  vmf->page, vmf->address,
2215 					  zdd->devmem_allocation->size);
2216 
2217 	return err ? VM_FAULT_SIGBUS : 0;
2218 }
2219 
2220 /*
2221  * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
2222  */
2223 static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
2224 	.page_free = drm_gpusvm_page_free,
2225 	.migrate_to_ram = drm_gpusvm_migrate_to_ram,
2226 };
2227 
/**
 * drm_gpusvm_pagemap_ops_get() - Retrieve GPU SVM device page map operations
 *
 * Accessor for the file-local, constant drm_gpusvm_pagemap_ops table, which
 * provides the page_free and migrate_to_ram callbacks.
 *
 * Return: Pointer to the GPU SVM device page map operations structure.
 */
const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
{
	return &drm_gpusvm_pagemap_ops;
}
EXPORT_SYMBOL_GPL(drm_gpusvm_pagemap_ops_get);
2238 
2239 /**
2240  * drm_gpusvm_has_mapping() - Check if GPU SVM has mapping for the given address range
2241  * @gpusvm: Pointer to the GPU SVM structure.
2242  * @start: Start address
2243  * @end: End address
2244  *
2245  * Return: True if GPU SVM has mapping, False otherwise
2246  */
2247 bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, unsigned long start,
2248 			    unsigned long end)
2249 {
2250 	struct drm_gpusvm_notifier *notifier;
2251 
2252 	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
2253 		struct drm_gpusvm_range *range = NULL;
2254 
2255 		drm_gpusvm_for_each_range(range, notifier, start, end)
2256 			return true;
2257 	}
2258 
2259 	return false;
2260 }
2261 EXPORT_SYMBOL_GPL(drm_gpusvm_has_mapping);
2262 
2263 /**
2264  * drm_gpusvm_range_set_unmapped() - Mark a GPU SVM range as unmapped
2265  * @range: Pointer to the GPU SVM range structure.
2266  * @mmu_range: Pointer to the MMU notifier range structure.
2267  *
2268  * This function marks a GPU SVM range as unmapped and sets the partial_unmap flag
2269  * if the range partially falls within the provided MMU notifier range.
2270  */
2271 void drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
2272 				   const struct mmu_notifier_range *mmu_range)
2273 {
2274 	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
2275 
2276 	range->flags.unmapped = true;
2277 	if (drm_gpusvm_range_start(range) < mmu_range->start ||
2278 	    drm_gpusvm_range_end(range) > mmu_range->end)
2279 		range->flags.partial_unmap = true;
2280 }
2281 EXPORT_SYMBOL_GPL(drm_gpusvm_range_set_unmapped);
2282 
2283 /**
2284  * drm_gpusvm_devmem_init() - Initialize a GPU SVM device memory allocation
2285  *
2286  * @dev: Pointer to the device structure which device memory allocation belongs to
2287  * @mm: Pointer to the mm_struct for the address space
2288  * @ops: Pointer to the operations structure for GPU SVM device memory
2289  * @dpagemap: The struct drm_pagemap we're allocating from.
2290  * @size: Size of device memory allocation
2291  */
2292 void drm_gpusvm_devmem_init(struct drm_gpusvm_devmem *devmem_allocation,
2293 			    struct device *dev, struct mm_struct *mm,
2294 			    const struct drm_gpusvm_devmem_ops *ops,
2295 			    struct drm_pagemap *dpagemap, size_t size)
2296 {
2297 	init_completion(&devmem_allocation->detached);
2298 	devmem_allocation->dev = dev;
2299 	devmem_allocation->mm = mm;
2300 	devmem_allocation->ops = ops;
2301 	devmem_allocation->dpagemap = dpagemap;
2302 	devmem_allocation->size = size;
2303 }
2304 EXPORT_SYMBOL_GPL(drm_gpusvm_devmem_init);
2305 
/* Module metadata: human-readable description and license string */
MODULE_DESCRIPTION("DRM GPUSVM");
MODULE_LICENSE("GPL");
2308