xref: /linux/drivers/gpu/drm/drm_gpusvm.c (revision 92d6295a29dba56148406a8452c69ab49787741b)
1 // SPDX-License-Identifier: GPL-2.0-only OR MIT
2 /*
3  * Copyright © 2024 Intel Corporation
4  *
5  * Authors:
6  *     Matthew Brost <matthew.brost@intel.com>
7  */
8 
9 #include <linux/dma-mapping.h>
10 #include <linux/export.h>
11 #include <linux/hmm.h>
12 #include <linux/hugetlb_inline.h>
13 #include <linux/memremap.h>
14 #include <linux/mm_types.h>
15 #include <linux/slab.h>
16 
17 #include <drm/drm_device.h>
18 #include <drm/drm_gpusvm.h>
19 #include <drm/drm_pagemap.h>
20 #include <drm/drm_print.h>
21 
22 /**
23  * DOC: Overview
24  *
25  * GPU Shared Virtual Memory (GPU SVM) layer for the Direct Rendering Manager (DRM)
26  * is a component of the DRM framework designed to manage shared virtual memory
27  * between the CPU and GPU. It enables efficient data exchange and processing
28  * for GPU-accelerated applications by allowing memory sharing and
29  * synchronization between the CPU's and GPU's virtual address spaces.
30  *
31  * Key GPU SVM Components:
32  *
33  * - Notifiers:
34  *	Used for tracking memory intervals and notifying the GPU of changes,
35  *	notifiers are sized based on a GPU SVM initialization parameter, with a
36  *	recommendation of 512M or larger. They maintain a Red-Black tree and a
37  *	list of ranges that fall within the notifier interval.  Notifiers are
38  *	tracked within a GPU SVM Red-Black tree and list and are dynamically
39  *	inserted or removed as ranges within the interval are created or
40  *	destroyed.
41  * - Ranges:
42  *	Represent memory ranges mapped in a DRM device and managed by GPU SVM.
43  *	They are sized based on an array of chunk sizes, which is a GPU SVM
44  *	initialization parameter, and the CPU address space.  Upon GPU fault,
45  *	the largest aligned chunk that fits within the faulting CPU address
46  *	space is chosen for the range size. Ranges are expected to be
47  *	dynamically allocated on GPU fault and removed on an MMU notifier UNMAP
48  *	event. As mentioned above, ranges are tracked in a notifier's Red-Black
49  *	tree.
50  *
51  * - Operations:
52  *	Define the interface for driver-specific GPU SVM operations such as
53  *	range allocation, notifier allocation, and invalidations.
54  *
55  * - Device Memory Allocations:
56  *	Embedded structure containing enough information for GPU SVM to migrate
57  *	to / from device memory.
58  *
59  * - Device Memory Operations:
60  *	Define the interface for driver-specific device memory operations, such
61  *	as releasing memory, populating pfns, and copying to / from device memory.
62  *
63  * This layer provides interfaces for allocating, mapping, migrating, and
64  * releasing memory ranges between the CPU and GPU. It handles all core memory
65  * management interactions (DMA mapping, HMM, and migration) and provides
66  * driver-specific virtual functions (vfuncs). This infrastructure is sufficient
67  * to build the expected driver components for an SVM implementation as detailed
68  * below.
69  *
70  * Expected Driver Components:
71  *
72  * - GPU page fault handler:
73  *	Used to create ranges and notifiers based on the fault address,
74  *	optionally migrate the range to device memory, and create GPU bindings.
75  *
76  * - Garbage collector:
77  *	Used to unmap and destroy GPU bindings for ranges.  Ranges are expected
78  *	to be added to the garbage collector upon a MMU_NOTIFY_UNMAP event in
79  *	notifier callback.
80  *
81  * - Notifier callback:
82  *	Used to invalidate and DMA unmap GPU bindings for ranges.
83  */
84 
85 /**
86  * DOC: Locking
87  *
88  * GPU SVM handles locking for core MM interactions, i.e., it locks/unlocks the
89  * mmap lock as needed.
90  *
91  * GPU SVM introduces a global notifier lock, which safeguards the notifier's
92  * range RB tree and list, as well as the range's DMA mappings and sequence
93  * number. GPU SVM manages all necessary locking and unlocking operations,
94  * except for rechecking that a range's pages are still valid
95  * (drm_gpusvm_range_pages_valid()) when the driver is committing GPU bindings.
96  * This lock corresponds to the ``driver->update`` lock mentioned in
97  * Documentation/mm/hmm.rst. Future revisions may transition from a GPU SVM
98  * global lock to a per-notifier lock if finer-grained locking is deemed
99  * necessary.
100  *
101  * In addition to the locking mentioned above, the driver should implement a
102  * lock to safeguard core GPU SVM function calls that modify state, such as
103  * drm_gpusvm_range_find_or_insert and drm_gpusvm_range_remove. This lock is
104  * denoted as 'driver_svm_lock' in code examples. Finer-grained driver-side
105  * locking should also be possible for concurrent GPU fault processing within a
106  * single GPU SVM. The 'driver_svm_lock' can be passed to
107  * drm_gpusvm_driver_set_lock() to add lockdep annotations to GPU SVM.
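 *
 * Below is a minimal sketch of how a driver might set up such a lock. The
 * driver structure and helper names are illustrative only and not part of GPU
 * SVM; the examples later in this file call these helpers without arguments
 * for brevity.
 *
 * .. code-block:: c
 *
 *	struct driver_svm {
 *		struct drm_gpusvm gpusvm;
 *		struct mutex lock;	// 'driver_svm_lock' in the examples below
 *	};
 *
 *	void driver_svm_lock_init(struct driver_svm *dsvm)
 *	{
 *		mutex_init(&dsvm->lock);
 *		// Teach lockdep which lock protects GPU SVM state-modifying calls
 *		drm_gpusvm_driver_set_lock(&dsvm->gpusvm, &dsvm->lock);
 *	}
 *
 *	void driver_svm_lock(struct driver_svm *dsvm)
 *	{
 *		mutex_lock(&dsvm->lock);
 *	}
 *
 *	void driver_svm_unlock(struct driver_svm *dsvm)
 *	{
 *		mutex_unlock(&dsvm->lock);
 *	}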
108  */
109 
110 /**
111  * DOC: Partial Unmapping of Ranges
112  *
113  * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped by CPU resulting
114  * in MMU_NOTIFY_UNMAP event) presents several challenges, with the main one
115  * being that a subset of the range still has CPU and GPU mappings. If the
116  * backing store for the range is in device memory, a subset of the backing
117  * store has references. One option would be to split the range and device
118  * memory backing store, but the implementation for this would be quite
119  * complicated. Given that partial unmappings are rare and driver-defined range
120  * sizes are relatively small, GPU SVM does not support splitting of ranges.
121  *
122  * With no support for range splitting, upon partial unmapping of a range, the
123  * driver is expected to invalidate and destroy the entire range. If the range
124  * has device memory as its backing, the driver is also expected to migrate any
125  * remaining pages back to RAM.
126  */
127 
128 /**
129  * DOC: Examples
130  *
131  * This section provides three examples of how to build the expected driver
132  * components: the GPU page fault handler, the garbage collector, and the
133  * notifier callback.
134  *
135  * The generic code provided does not include logic for complex migration
136  * policies, optimized invalidations, fine-grained driver locking, or other
137  * potentially required driver locking (e.g., DMA-resv locks).
138  *
139  * 1) GPU page fault handler
140  *
141  * .. code-block:: c
142  *
143  *	int driver_bind_range(struct drm_gpusvm *gpusvm, struct drm_gpusvm_range *range)
144  *	{
145  *		int err = 0;
146  *
147  *		driver_alloc_and_setup_memory_for_bind(gpusvm, range);
148  *
149  *		drm_gpusvm_notifier_lock(gpusvm);
150  *		if (drm_gpusvm_range_pages_valid(range))
151  *			driver_commit_bind(gpusvm, range);
152  *		else
153  *			err = -EAGAIN;
154  *		drm_gpusvm_notifier_unlock(gpusvm);
155  *
156  *		return err;
157  *	}
158  *
159  *	int driver_gpu_fault(struct drm_gpusvm *gpusvm, unsigned long fault_addr,
160  *			     unsigned long gpuva_start, unsigned long gpuva_end)
161  *	{
162  *		struct drm_gpusvm_ctx ctx = {};
 *		struct drm_gpusvm_range *range;
163  *		int err;
164  *
165  *		driver_svm_lock();
166  *	retry:
167  *		// Always process UNMAPs first so view of GPU SVM ranges is current
168  *		driver_garbage_collector(gpusvm);
169  *
170  *		range = drm_gpusvm_range_find_or_insert(gpusvm, fault_addr,
171  *							gpuva_start, gpuva_end,
172  *						        &ctx);
173  *		if (IS_ERR(range)) {
174  *			err = PTR_ERR(range);
175  *			goto unlock;
176  *		}
177  *
178  *		if (driver_migration_policy(range)) {
179  *			err = drm_pagemap_populate_mm(driver_choose_drm_pagemap(),
180  *						      gpuva_start, gpuva_end, gpusvm->mm,
181  *						      ctx.timeslice_ms);
182  *			if (err)	// CPU mappings may have changed
183  *				goto retry;
184  *		}
185  *
186  *		err = drm_gpusvm_range_get_pages(gpusvm, range, &ctx);
187  *		if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM) {	// CPU mappings changed
188  *			if (err == -EOPNOTSUPP)
189  *				drm_gpusvm_range_evict(gpusvm, range);
190  *			goto retry;
191  *		} else if (err) {
192  *			goto unlock;
193  *		}
194  *
195  *		err = driver_bind_range(gpusvm, range);
196  *		if (err == -EAGAIN)	// CPU mappings changed
197  *			goto retry;
198  *
199  *	unlock:
200  *		driver_svm_unlock();
201  *		return err;
202  *	}
203  *
204  * 2) Garbage Collector
205  *
206  * .. code-block:: c
207  *
208  *	void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
209  *					struct drm_gpusvm_range *range)
210  *	{
211  *		assert_driver_svm_locked(gpusvm);
212  *
213  *		// Partial unmap, migrate any remaining device memory pages back to RAM
214  *		if (range->flags.partial_unmap)
215  *			drm_gpusvm_range_evict(gpusvm, range);
216  *
217  *		driver_unbind_range(range);
218  *		drm_gpusvm_range_remove(gpusvm, range);
219  *	}
220  *
221  *	void driver_garbage_collector(struct drm_gpusvm *gpusvm)
222  *	{
223  *		assert_driver_svm_locked(gpusvm);
224  *
225  *		for_each_range_in_garbage_collector(gpusvm, range)
226  *			__driver_garbage_collector(gpusvm, range);
227  *	}
228  *
229  * 3) Notifier callback
230  *
231  * .. code-block:: c
232  *
233  *	void driver_invalidation(struct drm_gpusvm *gpusvm,
234  *				 struct drm_gpusvm_notifier *notifier,
235  *				 const struct mmu_notifier_range *mmu_range)
236  *	{
237  *		struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
238  *		struct drm_gpusvm_range *range = NULL;
239  *
240  *		driver_invalidate_device_pages(gpusvm, mmu_range->start, mmu_range->end);
241  *
242  *		drm_gpusvm_for_each_range(range, notifier, mmu_range->start,
243  *					  mmu_range->end) {
244  *			drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);
245  *
246  *			if (mmu_range->event != MMU_NOTIFY_UNMAP)
247  *				continue;
248  *
249  *			drm_gpusvm_range_set_unmapped(range, mmu_range);
250  *			driver_garbage_collector_add(gpusvm, range);
251  *		}
252  *	}
253  */
254 
255 /**
256  * npages_in_range() - Calculate the number of pages in a given range
257  * @start: The start address of the range
258  * @end: The end address of the range
259  *
260  * This function calculates the number of pages in a given memory range,
261  * specified by the start and end addresses. It divides the difference
262  * between the end and start addresses by the page size (PAGE_SIZE) to
263  * determine the number of pages in the range.
264  *
265  * Return: The number of pages in the specified range.
266  */
267 static unsigned long
268 npages_in_range(unsigned long start, unsigned long end)
269 {
270 	return (end - start) >> PAGE_SHIFT;
271 }
272 
273 /**
274  * drm_gpusvm_notifier_find() - Find GPU SVM notifier from GPU SVM
275  * @gpusvm: Pointer to the GPU SVM structure.
276  * @start: Start address of the notifier
277  * @end: End address of the notifier
278  *
279  * Return: A pointer to the drm_gpusvm_notifier if found or NULL
280  */
281 struct drm_gpusvm_notifier *
282 drm_gpusvm_notifier_find(struct drm_gpusvm *gpusvm, unsigned long start,
283 			 unsigned long end)
284 {
285 	struct interval_tree_node *itree;
286 
287 	itree = interval_tree_iter_first(&gpusvm->root, start, end - 1);
288 
289 	if (itree)
290 		return container_of(itree, struct drm_gpusvm_notifier, itree);
291 	else
292 		return NULL;
293 }
294 EXPORT_SYMBOL_GPL(drm_gpusvm_notifier_find);
295 
296 /**
297  * drm_gpusvm_range_find() - Find GPU SVM range from GPU SVM notifier
298  * @notifier: Pointer to the GPU SVM notifier structure.
299  * @start: Start address of the range
300  * @end: End address of the range
301  *
302  * Return: A pointer to the drm_gpusvm_range if found or NULL
303  */
304 struct drm_gpusvm_range *
305 drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, unsigned long start,
306 		      unsigned long end)
307 {
308 	struct interval_tree_node *itree;
309 
310 	itree = interval_tree_iter_first(&notifier->root, start, end - 1);
311 
312 	if (itree)
313 		return container_of(itree, struct drm_gpusvm_range, itree);
314 	else
315 		return NULL;
316 }
317 EXPORT_SYMBOL_GPL(drm_gpusvm_range_find);
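
/*
 * Example usage sketch (not part of the API): looking up an existing range for
 * a single address under the driver's SVM lock; 'gpusvm', 'addr', and the
 * local variables are illustrative.
 *
 * .. code-block:: c
 *
 *	struct drm_gpusvm_notifier *notifier;
 *	struct drm_gpusvm_range *range = NULL;
 *
 *	notifier = drm_gpusvm_notifier_find(gpusvm, addr, addr + 1);
 *	if (notifier)
 *		range = drm_gpusvm_range_find(notifier, addr, addr + 1);
 */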
318 
319 /**
320  * drm_gpusvm_notifier_invalidate() - Invalidate a GPU SVM notifier.
321  * @mni: Pointer to the mmu_interval_notifier structure.
322  * @mmu_range: Pointer to the mmu_notifier_range structure.
323  * @cur_seq: Current sequence number.
324  *
325  * This function serves as a generic MMU notifier for GPU SVM. It sets the MMU
326  * notifier sequence number and calls the driver invalidate vfunc under
327  * gpusvm->notifier_lock.
328  *
329  * Return: true if the operation succeeds, false otherwise.
330  */
331 static bool
332 drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier *mni,
333 			       const struct mmu_notifier_range *mmu_range,
334 			       unsigned long cur_seq)
335 {
336 	struct drm_gpusvm_notifier *notifier =
337 		container_of(mni, typeof(*notifier), notifier);
338 	struct drm_gpusvm *gpusvm = notifier->gpusvm;
339 
340 	if (!mmu_notifier_range_blockable(mmu_range))
341 		return false;
342 
343 	down_write(&gpusvm->notifier_lock);
344 	mmu_interval_set_seq(mni, cur_seq);
345 	gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
346 	up_write(&gpusvm->notifier_lock);
347 
348 	return true;
349 }
350 
351 /*
352  * drm_gpusvm_notifier_ops - MMU interval notifier operations for GPU SVM
353  */
354 static const struct mmu_interval_notifier_ops drm_gpusvm_notifier_ops = {
355 	.invalidate = drm_gpusvm_notifier_invalidate,
356 };
357 
358 /**
359  * drm_gpusvm_init() - Initialize the GPU SVM.
360  * @gpusvm: Pointer to the GPU SVM structure.
361  * @name: Name of the GPU SVM.
362  * @drm: Pointer to the DRM device structure.
363  * @mm: Pointer to the mm_struct for the address space.
364  * @device_private_page_owner: Device private pages owner.
365  * @mm_start: Start address of GPU SVM.
366  * @mm_range: Range of the GPU SVM.
367  * @notifier_size: Size of individual notifiers.
368  * @ops: Pointer to the operations structure for GPU SVM.
369  * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
370  *               Entries should be powers of 2 in descending order with last
371  *               entry being SZ_4K.
372  * @num_chunks: Number of chunks.
373  *
374  * This function initializes the GPU SVM.
375  *
376  * Return: 0 on success, a negative error code on failure.
377  */
378 int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
379 		    const char *name, struct drm_device *drm,
380 		    struct mm_struct *mm, void *device_private_page_owner,
381 		    unsigned long mm_start, unsigned long mm_range,
382 		    unsigned long notifier_size,
383 		    const struct drm_gpusvm_ops *ops,
384 		    const unsigned long *chunk_sizes, int num_chunks)
385 {
386 	if (!ops->invalidate || !num_chunks)
387 		return -EINVAL;
388 
389 	gpusvm->name = name;
390 	gpusvm->drm = drm;
391 	gpusvm->mm = mm;
392 	gpusvm->device_private_page_owner = device_private_page_owner;
393 	gpusvm->mm_start = mm_start;
394 	gpusvm->mm_range = mm_range;
395 	gpusvm->notifier_size = notifier_size;
396 	gpusvm->ops = ops;
397 	gpusvm->chunk_sizes = chunk_sizes;
398 	gpusvm->num_chunks = num_chunks;
399 
400 	mmgrab(mm);
401 	gpusvm->root = RB_ROOT_CACHED;
402 	INIT_LIST_HEAD(&gpusvm->notifier_list);
403 
404 	init_rwsem(&gpusvm->notifier_lock);
405 
406 	fs_reclaim_acquire(GFP_KERNEL);
407 	might_lock(&gpusvm->notifier_lock);
408 	fs_reclaim_release(GFP_KERNEL);
409 
410 #ifdef CONFIG_LOCKDEP
411 	gpusvm->lock_dep_map = NULL;
412 #endif
413 
414 	return 0;
415 }
416 EXPORT_SYMBOL_GPL(drm_gpusvm_init);
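
/*
 * Example usage sketch (not part of the API): initializing GPU SVM for an
 * entire user address space with 512M notifiers and 2M/64K/4K chunk sizes.
 * The driver_svm structure, driver_gpusvm_ops, and driver_devmem_owner()
 * helper are driver-defined and purely illustrative here.
 *
 * .. code-block:: c
 *
 *	static const unsigned long driver_chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };
 *
 *	int driver_svm_init(struct driver_svm *dsvm, struct drm_device *drm)
 *	{
 *		return drm_gpusvm_init(&dsvm->gpusvm, "driver-svm", drm,
 *				       current->mm, driver_devmem_owner(dsvm),
 *				       0, TASK_SIZE, SZ_512M, &driver_gpusvm_ops,
 *				       driver_chunk_sizes,
 *				       ARRAY_SIZE(driver_chunk_sizes));
 *	}
 */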
417 
418 /**
419  * to_drm_gpusvm_notifier() - retrieve the container struct for a given rbtree node
420  * @node: a pointer to the rbtree node embedded within a drm_gpusvm_notifier struct
421  *
422  * Return: A pointer to the containing drm_gpusvm_notifier structure.
423  */
424 static struct drm_gpusvm_notifier *to_drm_gpusvm_notifier(struct rb_node *node)
425 {
426 	return container_of(node, struct drm_gpusvm_notifier, itree.rb);
427 }
428 
429 /**
430  * drm_gpusvm_notifier_insert() - Insert GPU SVM notifier
431  * @gpusvm: Pointer to the GPU SVM structure
432  * @notifier: Pointer to the GPU SVM notifier structure
433  *
434  * This function inserts the GPU SVM notifier into the GPU SVM RB tree and list.
435  */
436 static void drm_gpusvm_notifier_insert(struct drm_gpusvm *gpusvm,
437 				       struct drm_gpusvm_notifier *notifier)
438 {
439 	struct rb_node *node;
440 	struct list_head *head;
441 
442 	interval_tree_insert(&notifier->itree, &gpusvm->root);
443 
444 	node = rb_prev(&notifier->itree.rb);
445 	if (node)
446 		head = &(to_drm_gpusvm_notifier(node))->entry;
447 	else
448 		head = &gpusvm->notifier_list;
449 
450 	list_add(&notifier->entry, head);
451 }
452 
453 /**
454  * drm_gpusvm_notifier_remove() - Remove GPU SVM notifier
455  * @gpusvm: Pointer to the GPU SVM structure
456  * @notifier: Pointer to the GPU SVM notifier structure
457  *
458  * This function removes the GPU SVM notifier from the GPU SVM RB tree and list.
459  */
460 static void drm_gpusvm_notifier_remove(struct drm_gpusvm *gpusvm,
461 				       struct drm_gpusvm_notifier *notifier)
462 {
463 	interval_tree_remove(&notifier->itree, &gpusvm->root);
464 	list_del(&notifier->entry);
465 }
466 
467 /**
468  * drm_gpusvm_fini() - Finalize the GPU SVM.
469  * @gpusvm: Pointer to the GPU SVM structure.
470  *
471  * This function finalizes the GPU SVM by cleaning up any remaining ranges and
472  * notifiers, and dropping a reference to struct MM.
473  */
474 void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
475 {
476 	struct drm_gpusvm_notifier *notifier, *next;
477 
478 	drm_gpusvm_for_each_notifier_safe(notifier, next, gpusvm, 0, LONG_MAX) {
479 		struct drm_gpusvm_range *range, *__next;
480 
481 		/*
482 		 * Remove notifier first to avoid racing with any invalidation
483 		 */
484 		mmu_interval_notifier_remove(&notifier->notifier);
485 		notifier->flags.removed = true;
486 
487 		drm_gpusvm_for_each_range_safe(range, __next, notifier, 0,
488 					       LONG_MAX)
489 			drm_gpusvm_range_remove(gpusvm, range);
490 	}
491 
492 	mmdrop(gpusvm->mm);
493 	WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
494 }
495 EXPORT_SYMBOL_GPL(drm_gpusvm_fini);
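
/*
 * Example usage sketch (not part of the API): tearing down GPU SVM after the
 * driver has unbound its GPU mappings; helper names are illustrative.
 *
 * .. code-block:: c
 *
 *	driver_svm_lock(dsvm);
 *	driver_garbage_collector(&dsvm->gpusvm);	// Flush pending UNMAPs
 *	driver_svm_unlock(dsvm);
 *	drm_gpusvm_fini(&dsvm->gpusvm);	// Drops remaining ranges, notifiers, and the mm reference
 */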
496 
497 /**
498  * drm_gpusvm_notifier_alloc() - Allocate GPU SVM notifier
499  * @gpusvm: Pointer to the GPU SVM structure
500  * @fault_addr: Fault address
501  *
502  * This function allocates and initializes the GPU SVM notifier structure.
503  *
504  * Return: Pointer to the allocated GPU SVM notifier on success, ERR_PTR() on failure.
505  */
506 static struct drm_gpusvm_notifier *
507 drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, unsigned long fault_addr)
508 {
509 	struct drm_gpusvm_notifier *notifier;
510 
511 	if (gpusvm->ops->notifier_alloc)
512 		notifier = gpusvm->ops->notifier_alloc();
513 	else
514 		notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
515 
516 	if (!notifier)
517 		return ERR_PTR(-ENOMEM);
518 
519 	notifier->gpusvm = gpusvm;
520 	notifier->itree.start = ALIGN_DOWN(fault_addr, gpusvm->notifier_size);
521 	notifier->itree.last = ALIGN(fault_addr + 1, gpusvm->notifier_size) - 1;
522 	INIT_LIST_HEAD(&notifier->entry);
523 	notifier->root = RB_ROOT_CACHED;
524 	INIT_LIST_HEAD(&notifier->range_list);
525 
526 	return notifier;
527 }
528 
529 /**
530  * drm_gpusvm_notifier_free() - Free GPU SVM notifier
531  * @gpusvm: Pointer to the GPU SVM structure
532  * @notifier: Pointer to the GPU SVM notifier structure
533  *
534  * This function frees the GPU SVM notifier structure.
535  */
536 static void drm_gpusvm_notifier_free(struct drm_gpusvm *gpusvm,
537 				     struct drm_gpusvm_notifier *notifier)
538 {
539 	WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
540 
541 	if (gpusvm->ops->notifier_free)
542 		gpusvm->ops->notifier_free(notifier);
543 	else
544 		kfree(notifier);
545 }
546 
547 /**
548  * to_drm_gpusvm_range() - retrieve the container struct for a given rbtree node
549  * @node: a pointer to the rbtree node embedded within a drm_gpusvm_range struct
550  *
551  * Return: A pointer to the containing drm_gpusvm_range structure.
552  */
553 static struct drm_gpusvm_range *to_drm_gpusvm_range(struct rb_node *node)
554 {
555 	return container_of(node, struct drm_gpusvm_range, itree.rb);
556 }
557 
558 /**
559  * drm_gpusvm_range_insert() - Insert GPU SVM range
560  * @notifier: Pointer to the GPU SVM notifier structure
561  * @range: Pointer to the GPU SVM range structure
562  *
563  * This function inserts the GPU SVM range into the notifier RB tree and list.
564  */
565 static void drm_gpusvm_range_insert(struct drm_gpusvm_notifier *notifier,
566 				    struct drm_gpusvm_range *range)
567 {
568 	struct rb_node *node;
569 	struct list_head *head;
570 
571 	drm_gpusvm_notifier_lock(notifier->gpusvm);
572 	interval_tree_insert(&range->itree, &notifier->root);
573 
574 	node = rb_prev(&range->itree.rb);
575 	if (node)
576 		head = &(to_drm_gpusvm_range(node))->entry;
577 	else
578 		head = &notifier->range_list;
579 
580 	list_add(&range->entry, head);
581 	drm_gpusvm_notifier_unlock(notifier->gpusvm);
582 }
583 
584 /**
585  * __drm_gpusvm_range_remove() - Remove GPU SVM range
586  * @notifier: Pointer to the GPU SVM notifier structure
587  * @range: Pointer to the GPU SVM range structure
588  *
589  * This function removes the GPU SVM range from the notifier RB tree and list.
590  */
591 static void __drm_gpusvm_range_remove(struct drm_gpusvm_notifier *notifier,
592 				      struct drm_gpusvm_range *range)
593 {
594 	interval_tree_remove(&range->itree, &notifier->root);
595 	list_del(&range->entry);
596 }
597 
598 /**
599  * drm_gpusvm_range_alloc() - Allocate GPU SVM range
600  * @gpusvm: Pointer to the GPU SVM structure
601  * @notifier: Pointer to the GPU SVM notifier structure
602  * @fault_addr: Fault address
603  * @chunk_size: Chunk size
604  * @migrate_devmem: Flag indicating whether to migrate device memory
605  *
606  * This function allocates and initializes the GPU SVM range structure.
607  *
608  * Return: Pointer to the allocated GPU SVM range on success, ERR_PTR() on failure.
609  */
610 static struct drm_gpusvm_range *
611 drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
612 		       struct drm_gpusvm_notifier *notifier,
613 		       unsigned long fault_addr, unsigned long chunk_size,
614 		       bool migrate_devmem)
615 {
616 	struct drm_gpusvm_range *range;
617 
618 	if (gpusvm->ops->range_alloc)
619 		range = gpusvm->ops->range_alloc(gpusvm);
620 	else
621 		range = kzalloc(sizeof(*range), GFP_KERNEL);
622 
623 	if (!range)
624 		return ERR_PTR(-ENOMEM);
625 
626 	kref_init(&range->refcount);
627 	range->gpusvm = gpusvm;
628 	range->notifier = notifier;
629 	range->itree.start = ALIGN_DOWN(fault_addr, chunk_size);
630 	range->itree.last = ALIGN(fault_addr + 1, chunk_size) - 1;
631 	INIT_LIST_HEAD(&range->entry);
632 	range->notifier_seq = LONG_MAX;
633 	range->flags.migrate_devmem = migrate_devmem ? 1 : 0;
634 
635 	return range;
636 }
637 
638 /**
639  * drm_gpusvm_check_pages() - Check pages
640  * @gpusvm: Pointer to the GPU SVM structure
641  * @notifier: Pointer to the GPU SVM notifier structure
642  * @start: Start address
643  * @end: End address
644  *
645  * Check if pages between start and end have been faulted in on the CPU. Used
646  * to prevent migration of pages without a CPU backing store.
647  *
648  * Return: True if pages have been faulted in on the CPU, False otherwise
649  */
650 static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
651 				   struct drm_gpusvm_notifier *notifier,
652 				   unsigned long start, unsigned long end)
653 {
654 	struct hmm_range hmm_range = {
655 		.default_flags = 0,
656 		.notifier = &notifier->notifier,
657 		.start = start,
658 		.end = end,
659 		.dev_private_owner = gpusvm->device_private_page_owner,
660 	};
661 	unsigned long timeout =
662 		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
663 	unsigned long *pfns;
664 	unsigned long npages = npages_in_range(start, end);
665 	int err, i;
666 
667 	mmap_assert_locked(gpusvm->mm);
668 
669 	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
670 	if (!pfns)
671 		return false;
672 
673 	hmm_range.notifier_seq = mmu_interval_read_begin(&notifier->notifier);
674 	hmm_range.hmm_pfns = pfns;
675 
676 	while (true) {
677 		err = hmm_range_fault(&hmm_range);
678 		if (err == -EBUSY) {
679 			if (time_after(jiffies, timeout))
680 				break;
681 
682 			hmm_range.notifier_seq =
683 				mmu_interval_read_begin(&notifier->notifier);
684 			continue;
685 		}
686 		break;
687 	}
688 	if (err)
689 		goto err_free;
690 
691 	for (i = 0; i < npages;) {
692 		if (!(pfns[i] & HMM_PFN_VALID)) {
693 			err = -EFAULT;
694 			goto err_free;
695 		}
696 		i += 0x1 << hmm_pfn_to_map_order(pfns[i]);
697 	}
698 
699 err_free:
700 	kvfree(pfns);
701 	return err ? false : true;
702 }
703 
704 /**
705  * drm_gpusvm_range_chunk_size() - Determine chunk size for GPU SVM range
706  * @gpusvm: Pointer to the GPU SVM structure
707  * @notifier: Pointer to the GPU SVM notifier structure
708  * @vas: Pointer to the virtual memory area structure
709  * @fault_addr: Fault address
710  * @gpuva_start: Start address of GPUVA which mirrors CPU
711  * @gpuva_end: End address of GPUVA which mirrors CPU
712  * @check_pages_threshold: Check CPU pages for present threshold
713  *
714  * This function determines the chunk size for the GPU SVM range based on the
715  * fault address, GPU SVM chunk sizes, existing GPU SVM ranges, and the virtual
716  * memory area boundaries.
717  *
718  * Return: Chunk size on success, LONG_MAX on failure.
719  */
720 static unsigned long
721 drm_gpusvm_range_chunk_size(struct drm_gpusvm *gpusvm,
722 			    struct drm_gpusvm_notifier *notifier,
723 			    struct vm_area_struct *vas,
724 			    unsigned long fault_addr,
725 			    unsigned long gpuva_start,
726 			    unsigned long gpuva_end,
727 			    unsigned long check_pages_threshold)
728 {
729 	unsigned long start, end;
730 	int i = 0;
731 
732 retry:
733 	for (; i < gpusvm->num_chunks; ++i) {
734 		start = ALIGN_DOWN(fault_addr, gpusvm->chunk_sizes[i]);
735 		end = ALIGN(fault_addr + 1, gpusvm->chunk_sizes[i]);
736 
737 		if (start >= vas->vm_start && end <= vas->vm_end &&
738 		    start >= drm_gpusvm_notifier_start(notifier) &&
739 		    end <= drm_gpusvm_notifier_end(notifier) &&
740 		    start >= gpuva_start && end <= gpuva_end)
741 			break;
742 	}
743 
744 	if (i == gpusvm->num_chunks)
745 		return LONG_MAX;
746 
747 	/*
748 	 * If the allocation is more than a page, ensure it does not overlap
749 	 * with existing ranges.
750 	 */
751 	if (end - start != SZ_4K) {
752 		struct drm_gpusvm_range *range;
753 
754 		range = drm_gpusvm_range_find(notifier, start, end);
755 		if (range) {
756 			++i;
757 			goto retry;
758 		}
759 
760 		/*
761 		 * XXX: Only create range on pages CPU has faulted in. Without
762 		 * this check, or prefault, on BMG 'xe_exec_system_allocator --r
763 		 * process-many-malloc' fails. In the failure case, each process
764 		 * mallocs 16k but the CPU VMA is ~128k which results in 64k SVM
765 		 * ranges. When migrating the SVM ranges, some processes fail in
766 		 * drm_pagemap_migrate_to_devmem with 'migrate.cpages != npages'
767 		 * and then upon drm_gpusvm_range_get_pages device pages from
768 		 * other processes are collected + faulted in which creates all
769 		 * sorts of problems. Unsure exactly how this is happening; the
770 		 * problem also goes away if 'xe_exec_system_allocator --r
771 		 * process-many-malloc' mallocs at least 64k at a time.
772 		 */
773 		if (end - start <= check_pages_threshold &&
774 		    !drm_gpusvm_check_pages(gpusvm, notifier, start, end)) {
775 			++i;
776 			goto retry;
777 		}
778 	}
779 
780 	return end - start;
781 }
782 
783 #ifdef CONFIG_LOCKDEP
784 /**
785  * drm_gpusvm_driver_lock_held() - Assert GPU SVM driver lock is held
786  * @gpusvm: Pointer to the GPU SVM structure.
787  *
788  * Ensure driver lock is held.
789  */
790 static void drm_gpusvm_driver_lock_held(struct drm_gpusvm *gpusvm)
791 {
792 	if ((gpusvm)->lock_dep_map)
793 		lockdep_assert(lock_is_held_type((gpusvm)->lock_dep_map, 0));
794 }
795 #else
796 static void drm_gpusvm_driver_lock_held(struct drm_gpusvm *gpusvm)
797 {
798 }
799 #endif
800 
801 /**
802  * drm_gpusvm_find_vma_start() - Find start address for first VMA in range
803  * @gpusvm: Pointer to the GPU SVM structure
804  * @start: The inclusive start user address.
805  * @end: The exclusive end user address.
806  *
807  * Return: The start address of the first VMA within the provided range,
808  * ULONG_MAX otherwise. Assumes @start < @end.
809  */
810 unsigned long
811 drm_gpusvm_find_vma_start(struct drm_gpusvm *gpusvm,
812 			  unsigned long start,
813 			  unsigned long end)
814 {
815 	struct mm_struct *mm = gpusvm->mm;
816 	struct vm_area_struct *vma;
817 	unsigned long addr = ULONG_MAX;
818 
819 	if (!mmget_not_zero(mm))
820 		return addr;
821 
822 	mmap_read_lock(mm);
823 
824 	vma = find_vma_intersection(mm, start, end);
825 	if (vma)
826 		addr = vma->vm_start;
827 
828 	mmap_read_unlock(mm);
829 	mmput(mm);
830 
831 	return addr;
832 }
833 EXPORT_SYMBOL_GPL(drm_gpusvm_find_vma_start);
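
/*
 * Example usage sketch (not part of the API): clamping the start of a
 * user-requested prefetch region to the first address actually backed by a
 * VMA; 'gpusvm', 'start', and 'end' are illustrative.
 *
 * .. code-block:: c
 *
 *	unsigned long addr = drm_gpusvm_find_vma_start(gpusvm, start, end);
 *
 *	if (addr == ULONG_MAX)
 *		return 0;	// No VMA intersects [start, end), nothing to prefetch
 *	start = addr;		// Clamp the prefetch region to backed memory
 */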
834 
835 /**
836  * drm_gpusvm_range_find_or_insert() - Find or insert GPU SVM range
837  * @gpusvm: Pointer to the GPU SVM structure
838  * @fault_addr: Fault address
839  * @gpuva_start: Start address of GPUVA which mirrors CPU
840  * @gpuva_end: End address of GPUVA which mirrors CPU
841  * @ctx: GPU SVM context
842  *
843  * This function finds an existing GPU SVM range or inserts a newly allocated one
844  * based on the fault address. The caller must hold a lock to protect range lookup and insertion.
845  *
846  * Return: Pointer to the GPU SVM range on success, ERR_PTR() on failure.
847  */
848 struct drm_gpusvm_range *
849 drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm,
850 				unsigned long fault_addr,
851 				unsigned long gpuva_start,
852 				unsigned long gpuva_end,
853 				const struct drm_gpusvm_ctx *ctx)
854 {
855 	struct drm_gpusvm_notifier *notifier;
856 	struct drm_gpusvm_range *range;
857 	struct mm_struct *mm = gpusvm->mm;
858 	struct vm_area_struct *vas;
859 	bool notifier_alloc = false;
860 	unsigned long chunk_size;
861 	int err;
862 	bool migrate_devmem;
863 
864 	drm_gpusvm_driver_lock_held(gpusvm);
865 
866 	if (fault_addr < gpusvm->mm_start ||
867 	    fault_addr > gpusvm->mm_start + gpusvm->mm_range)
868 		return ERR_PTR(-EINVAL);
869 
870 	if (!mmget_not_zero(mm))
871 		return ERR_PTR(-EFAULT);
872 
873 	notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr, fault_addr + 1);
874 	if (!notifier) {
875 		notifier = drm_gpusvm_notifier_alloc(gpusvm, fault_addr);
876 		if (IS_ERR(notifier)) {
877 			err = PTR_ERR(notifier);
878 			goto err_mmunlock;
879 		}
880 		notifier_alloc = true;
881 		err = mmu_interval_notifier_insert(&notifier->notifier,
882 						   mm,
883 						   drm_gpusvm_notifier_start(notifier),
884 						   drm_gpusvm_notifier_size(notifier),
885 						   &drm_gpusvm_notifier_ops);
886 		if (err)
887 			goto err_notifier;
888 	}
889 
890 	mmap_read_lock(mm);
891 
892 	vas = vma_lookup(mm, fault_addr);
893 	if (!vas) {
894 		err = -ENOENT;
895 		goto err_notifier_remove;
896 	}
897 
898 	if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
899 		err = -EPERM;
900 		goto err_notifier_remove;
901 	}
902 
903 	range = drm_gpusvm_range_find(notifier, fault_addr, fault_addr + 1);
904 	if (range)
905 		goto out_mmunlock;
906 	/*
907 	 * XXX: Short-circuiting migration based on migrate_vma_* current
908 	 * limitations. If/when migrate_vma_* add more support, this logic will
909 	 * have to change.
910 	 */
911 	migrate_devmem = ctx->devmem_possible &&
912 		vma_is_anonymous(vas) && !is_vm_hugetlb_page(vas);
913 
914 	chunk_size = drm_gpusvm_range_chunk_size(gpusvm, notifier, vas,
915 						 fault_addr, gpuva_start,
916 						 gpuva_end,
917 						 ctx->check_pages_threshold);
918 	if (chunk_size == LONG_MAX) {
919 		err = -EINVAL;
920 		goto err_notifier_remove;
921 	}
922 
923 	range = drm_gpusvm_range_alloc(gpusvm, notifier, fault_addr, chunk_size,
924 				       migrate_devmem);
925 	if (IS_ERR(range)) {
926 		err = PTR_ERR(range);
927 		goto err_notifier_remove;
928 	}
929 
930 	drm_gpusvm_range_insert(notifier, range);
931 	if (notifier_alloc)
932 		drm_gpusvm_notifier_insert(gpusvm, notifier);
933 
934 out_mmunlock:
935 	mmap_read_unlock(mm);
936 	mmput(mm);
937 
938 	return range;
939 
940 err_notifier_remove:
941 	mmap_read_unlock(mm);
942 	if (notifier_alloc)
943 		mmu_interval_notifier_remove(&notifier->notifier);
944 err_notifier:
945 	if (notifier_alloc)
946 		drm_gpusvm_notifier_free(gpusvm, notifier);
947 err_mmunlock:
948 	mmput(mm);
949 	return ERR_PTR(err);
950 }
951 EXPORT_SYMBOL_GPL(drm_gpusvm_range_find_or_insert);
952 
953 /**
954  * __drm_gpusvm_range_unmap_pages() - Unmap pages associated with a GPU SVM range (internal)
955  * @gpusvm: Pointer to the GPU SVM structure
956  * @range: Pointer to the GPU SVM range structure
957  * @npages: Number of pages to unmap
958  *
959  * This function unmaps pages associated with a GPU SVM range. Assumes and
960  * asserts correct locking is in place when called.
961  */
962 static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
963 					   struct drm_gpusvm_range *range,
964 					   unsigned long npages)
965 {
966 	unsigned long i, j;
967 	struct drm_pagemap *dpagemap = range->dpagemap;
968 	struct device *dev = gpusvm->drm->dev;
969 
970 	lockdep_assert_held(&gpusvm->notifier_lock);
971 
972 	if (range->flags.has_dma_mapping) {
973 		struct drm_gpusvm_range_flags flags = {
974 			.__flags = range->flags.__flags,
975 		};
976 
977 		for (i = 0, j = 0; i < npages; j++) {
978 			struct drm_pagemap_addr *addr = &range->dma_addr[j];
979 
980 			if (addr->proto == DRM_INTERCONNECT_SYSTEM)
981 				dma_unmap_page(dev,
982 					       addr->addr,
983 					       PAGE_SIZE << addr->order,
984 					       addr->dir);
985 			else if (dpagemap && dpagemap->ops->device_unmap)
986 				dpagemap->ops->device_unmap(dpagemap,
987 							    dev, *addr);
988 			i += 1 << addr->order;
989 		}
990 
991 		/* WRITE_ONCE pairs with READ_ONCE for opportunistic checks */
992 		flags.has_devmem_pages = false;
993 		flags.has_dma_mapping = false;
994 		WRITE_ONCE(range->flags.__flags, flags.__flags);
995 
996 		range->dpagemap = NULL;
997 	}
998 }
999 
1000 /**
1001  * drm_gpusvm_range_free_pages() - Free pages associated with a GPU SVM range
1002  * @gpusvm: Pointer to the GPU SVM structure
1003  * @range: Pointer to the GPU SVM range structure
1004  *
1005  * This function frees the dma address array associated with a GPU SVM range.
1006  */
1007 static void drm_gpusvm_range_free_pages(struct drm_gpusvm *gpusvm,
1008 					struct drm_gpusvm_range *range)
1009 {
1010 	lockdep_assert_held(&gpusvm->notifier_lock);
1011 
1012 	if (range->dma_addr) {
1013 		kvfree(range->dma_addr);
1014 		range->dma_addr = NULL;
1015 	}
1016 }
1017 
1018 /**
1019  * drm_gpusvm_range_remove() - Remove GPU SVM range
1020  * @gpusvm: Pointer to the GPU SVM structure
1021  * @range: Pointer to the GPU SVM range to be removed
1022  *
1023  * This function removes the specified GPU SVM range and also removes the parent
1024  * GPU SVM notifier if no more ranges remain in the notifier. The caller must
1025  * hold a lock to protect range and notifier removal.
1026  */
1027 void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
1028 			     struct drm_gpusvm_range *range)
1029 {
1030 	unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
1031 					       drm_gpusvm_range_end(range));
1032 	struct drm_gpusvm_notifier *notifier;
1033 
1034 	drm_gpusvm_driver_lock_held(gpusvm);
1035 
1036 	notifier = drm_gpusvm_notifier_find(gpusvm,
1037 					    drm_gpusvm_range_start(range),
1038 					    drm_gpusvm_range_start(range) + 1);
1039 	if (WARN_ON_ONCE(!notifier))
1040 		return;
1041 
1042 	drm_gpusvm_notifier_lock(gpusvm);
1043 	__drm_gpusvm_range_unmap_pages(gpusvm, range, npages);
1044 	drm_gpusvm_range_free_pages(gpusvm, range);
1045 	__drm_gpusvm_range_remove(notifier, range);
1046 	drm_gpusvm_notifier_unlock(gpusvm);
1047 
1048 	drm_gpusvm_range_put(range);
1049 
1050 	if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
1051 		if (!notifier->flags.removed)
1052 			mmu_interval_notifier_remove(&notifier->notifier);
1053 		drm_gpusvm_notifier_remove(gpusvm, notifier);
1054 		drm_gpusvm_notifier_free(gpusvm, notifier);
1055 	}
1056 }
1057 EXPORT_SYMBOL_GPL(drm_gpusvm_range_remove);
1058 
1059 /**
1060  * drm_gpusvm_range_get() - Get a reference to GPU SVM range
1061  * @range: Pointer to the GPU SVM range
1062  *
1063  * This function increments the reference count of the specified GPU SVM range.
1064  *
1065  * Return: Pointer to the GPU SVM range.
1066  */
1067 struct drm_gpusvm_range *
1068 drm_gpusvm_range_get(struct drm_gpusvm_range *range)
1069 {
1070 	kref_get(&range->refcount);
1071 
1072 	return range;
1073 }
1074 EXPORT_SYMBOL_GPL(drm_gpusvm_range_get);
1075 
1076 /**
1077  * drm_gpusvm_range_destroy() - Destroy GPU SVM range
1078  * @refcount: Pointer to the reference counter embedded in the GPU SVM range
1079  *
1080  * This function destroys the specified GPU SVM range when its reference count
1081  * reaches zero. If a custom range-free function is provided, it is invoked to
1082  * free the range; otherwise, the range is deallocated using kfree().
1083  */
1084 static void drm_gpusvm_range_destroy(struct kref *refcount)
1085 {
1086 	struct drm_gpusvm_range *range =
1087 		container_of(refcount, struct drm_gpusvm_range, refcount);
1088 	struct drm_gpusvm *gpusvm = range->gpusvm;
1089 
1090 	if (gpusvm->ops->range_free)
1091 		gpusvm->ops->range_free(range);
1092 	else
1093 		kfree(range);
1094 }
1095 
1096 /**
1097  * drm_gpusvm_range_put() - Put a reference to GPU SVM range
1098  * @range: Pointer to the GPU SVM range
1099  *
1100  * This function decrements the reference count of the specified GPU SVM range
1101  * and frees it when the count reaches zero.
1102  */
1103 void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
1104 {
1105 	kref_put(&range->refcount, drm_gpusvm_range_destroy);
1106 }
1107 EXPORT_SYMBOL_GPL(drm_gpusvm_range_put);
1108 
1109 /**
1110  * drm_gpusvm_range_pages_valid() - GPU SVM range pages valid
1111  * @gpusvm: Pointer to the GPU SVM structure
1112  * @range: Pointer to the GPU SVM range structure
1113  *
1114  * This function determines if a GPU SVM range's pages are valid. It is
1115  * expected to be called holding gpusvm->notifier_lock and as the last step
1116  * before committing a GPU binding. This is akin to a notifier seqno check in
1117  * the HMM documentation, but due to wider notifiers (i.e., notifiers which
1118  * span multiple ranges) this function is required for finer-grained checking
1119  * (i.e., per range) of whether pages are valid.
1120  *
1121  * Return: True if GPU SVM range has valid pages, False otherwise
1122  */
1123 bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
1124 				  struct drm_gpusvm_range *range)
1125 {
1126 	lockdep_assert_held(&gpusvm->notifier_lock);
1127 
1128 	return range->flags.has_devmem_pages || range->flags.has_dma_mapping;
1129 }
1130 EXPORT_SYMBOL_GPL(drm_gpusvm_range_pages_valid);
1131 
1132 /**
1133  * drm_gpusvm_range_pages_valid_unlocked() - GPU SVM range pages valid unlocked
1134  * @gpusvm: Pointer to the GPU SVM structure
1135  * @range: Pointer to the GPU SVM range structure
1136  *
1137  * This function determines if a GPU SVM range's pages are valid. It is
1138  * expected to be called without holding gpusvm->notifier_lock.
1139  *
1140  * Return: True if GPU SVM range has valid pages, False otherwise
1141  */
1142 static bool
1143 drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm *gpusvm,
1144 				      struct drm_gpusvm_range *range)
1145 {
1146 	bool pages_valid;
1147 
1148 	if (!range->dma_addr)
1149 		return false;
1150 
1151 	drm_gpusvm_notifier_lock(gpusvm);
1152 	pages_valid = drm_gpusvm_range_pages_valid(gpusvm, range);
1153 	if (!pages_valid)
1154 		drm_gpusvm_range_free_pages(gpusvm, range);
1155 	drm_gpusvm_notifier_unlock(gpusvm);
1156 
1157 	return pages_valid;
1158 }
1159 
1160 /**
1161  * drm_gpusvm_range_get_pages() - Get pages for a GPU SVM range
1162  * @gpusvm: Pointer to the GPU SVM structure
1163  * @range: Pointer to the GPU SVM range structure
1164  * @ctx: GPU SVM context
1165  *
1166  * This function gets pages for a GPU SVM range and ensures they are mapped for
1167  * DMA access.
1168  *
1169  * Return: 0 on success, negative error code on failure.
1170  */
1171 int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
1172 			       struct drm_gpusvm_range *range,
1173 			       const struct drm_gpusvm_ctx *ctx)
1174 {
1175 	struct mmu_interval_notifier *notifier = &range->notifier->notifier;
1176 	struct hmm_range hmm_range = {
1177 		.default_flags = HMM_PFN_REQ_FAULT | (ctx->read_only ? 0 :
1178 			HMM_PFN_REQ_WRITE),
1179 		.notifier = notifier,
1180 		.start = drm_gpusvm_range_start(range),
1181 		.end = drm_gpusvm_range_end(range),
1182 		.dev_private_owner = gpusvm->device_private_page_owner,
1183 	};
1184 	struct mm_struct *mm = gpusvm->mm;
1185 	void *zdd;
1186 	unsigned long timeout =
1187 		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
1188 	unsigned long i, j;
1189 	unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
1190 					       drm_gpusvm_range_end(range));
1191 	unsigned long num_dma_mapped;
1192 	unsigned int order = 0;
1193 	unsigned long *pfns;
1194 	int err = 0;
1195 	struct dev_pagemap *pagemap;
1196 	struct drm_pagemap *dpagemap;
1197 	struct drm_gpusvm_range_flags flags;
1198 
1199 retry:
1200 	hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
1201 	if (drm_gpusvm_range_pages_valid_unlocked(gpusvm, range))
1202 		goto set_seqno;
1203 
1204 	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
1205 	if (!pfns)
1206 		return -ENOMEM;
1207 
1208 	if (!mmget_not_zero(mm)) {
1209 		err = -EFAULT;
1210 		goto err_free;
1211 	}
1212 
1213 	hmm_range.hmm_pfns = pfns;
1214 	while (true) {
1215 		mmap_read_lock(mm);
1216 		err = hmm_range_fault(&hmm_range);
1217 		mmap_read_unlock(mm);
1218 
1219 		if (err == -EBUSY) {
1220 			if (time_after(jiffies, timeout))
1221 				break;
1222 
1223 			hmm_range.notifier_seq =
1224 				mmu_interval_read_begin(notifier);
1225 			continue;
1226 		}
1227 		break;
1228 	}
1229 	mmput(mm);
1230 	if (err)
1231 		goto err_free;
1232 
1233 map_pages:
1234 	/*
1235 	 * Perform all DMA mappings under the notifier lock so we do not
1236 	 * access freed pages. A concurrent notifier invalidation will either
1237 	 * block on the notifier lock or unmap the DMA mappings.
1238 	 */
1239 	drm_gpusvm_notifier_lock(gpusvm);
1240 
1241 	flags.__flags = range->flags.__flags;
1242 	if (flags.unmapped) {
1243 		drm_gpusvm_notifier_unlock(gpusvm);
1244 		err = -EFAULT;
1245 		goto err_free;
1246 	}
1247 
1248 	if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
1249 		drm_gpusvm_notifier_unlock(gpusvm);
1250 		kvfree(pfns);
1251 		goto retry;
1252 	}
1253 
1254 	if (!range->dma_addr) {
1255 		/* Unlock and restart mapping to allocate memory. */
1256 		drm_gpusvm_notifier_unlock(gpusvm);
1257 		range->dma_addr = kvmalloc_array(npages,
1258 						 sizeof(*range->dma_addr),
1259 						 GFP_KERNEL);
1260 		if (!range->dma_addr) {
1261 			err = -ENOMEM;
1262 			goto err_free;
1263 		}
1264 		goto map_pages;
1265 	}
1266 
1267 	zdd = NULL;
1268 	pagemap = NULL;
1269 	num_dma_mapped = 0;
1270 	for (i = 0, j = 0; i < npages; ++j) {
1271 		struct page *page = hmm_pfn_to_page(pfns[i]);
1272 
1273 		order = hmm_pfn_to_map_order(pfns[i]);
1274 		if (is_device_private_page(page) ||
1275 		    is_device_coherent_page(page)) {
1276 			if (zdd != page->zone_device_data && i > 0) {
1277 				err = -EOPNOTSUPP;
1278 				goto err_unmap;
1279 			}
1280 			zdd = page->zone_device_data;
1281 			if (pagemap != page_pgmap(page)) {
1282 				if (i > 0) {
1283 					err = -EOPNOTSUPP;
1284 					goto err_unmap;
1285 				}
1286 
1287 				pagemap = page_pgmap(page);
1288 				dpagemap = drm_pagemap_page_to_dpagemap(page);
1289 				if (drm_WARN_ON(gpusvm->drm, !dpagemap)) {
1290 					/*
1291 					 * Raced. This is not supposed to happen
1292 					 * since hmm_range_fault() should've migrated
1293 					 * this page to system.
1294 					 */
1295 					err = -EAGAIN;
1296 					goto err_unmap;
1297 				}
1298 			}
1299 			range->dma_addr[j] =
1300 				dpagemap->ops->device_map(dpagemap,
1301 							  gpusvm->drm->dev,
1302 							  page, order,
1303 							  DMA_BIDIRECTIONAL);
1304 			if (dma_mapping_error(gpusvm->drm->dev,
1305 					      range->dma_addr[j].addr)) {
1306 				err = -EFAULT;
1307 				goto err_unmap;
1308 			}
1309 		} else {
1310 			dma_addr_t addr;
1311 
1312 			if (is_zone_device_page(page) || pagemap) {
1313 				err = -EOPNOTSUPP;
1314 				goto err_unmap;
1315 			}
1316 
1317 			if (ctx->devmem_only) {
1318 				err = -EFAULT;
1319 				goto err_unmap;
1320 			}
1321 
1322 			addr = dma_map_page(gpusvm->drm->dev,
1323 					    page, 0,
1324 					    PAGE_SIZE << order,
1325 					    DMA_BIDIRECTIONAL);
1326 			if (dma_mapping_error(gpusvm->drm->dev, addr)) {
1327 				err = -EFAULT;
1328 				goto err_unmap;
1329 			}
1330 
1331 			range->dma_addr[j] = drm_pagemap_addr_encode
1332 				(addr, DRM_INTERCONNECT_SYSTEM, order,
1333 				 DMA_BIDIRECTIONAL);
1334 		}
1335 		i += 1 << order;
1336 		num_dma_mapped = i;
1337 		flags.has_dma_mapping = true;
1338 	}
1339 
1340 	if (pagemap) {
1341 		flags.has_devmem_pages = true;
1342 		range->dpagemap = dpagemap;
1343 	}
1344 
1345 	/* WRITE_ONCE pairs with READ_ONCE for opportunistic checks */
1346 	WRITE_ONCE(range->flags.__flags, flags.__flags);
1347 
1348 	drm_gpusvm_notifier_unlock(gpusvm);
1349 	kvfree(pfns);
1350 set_seqno:
1351 	range->notifier_seq = hmm_range.notifier_seq;
1352 
1353 	return 0;
1354 
1355 err_unmap:
1356 	__drm_gpusvm_range_unmap_pages(gpusvm, range, num_dma_mapped);
1357 	drm_gpusvm_notifier_unlock(gpusvm);
1358 err_free:
1359 	kvfree(pfns);
1360 	if (err == -EAGAIN)
1361 		goto retry;
1362 	return err;
1363 }
1364 EXPORT_SYMBOL_GPL(drm_gpusvm_range_get_pages);
1365 
1366 /**
1367  * drm_gpusvm_range_unmap_pages() - Unmap pages associated with a GPU SVM range
1369  * @gpusvm: Pointer to the GPU SVM structure
1370  * @range: Pointer to the GPU SVM range structure
1371  * @ctx: GPU SVM context
1372  *
1373  * This function unmaps pages associated with a GPU SVM range. If @in_notifier
1374  * is set, it is assumed that gpusvm->notifier_lock is held in write mode; if it
1375  * is clear, it acquires gpusvm->notifier_lock in read mode. Must be called on
1376  * each GPU SVM range attached to the notifier in gpusvm->ops->invalidate() to
1377  * uphold the IOMMU security model.
1378  */
1379 void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
1380 				  struct drm_gpusvm_range *range,
1381 				  const struct drm_gpusvm_ctx *ctx)
1382 {
1383 	unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
1384 					       drm_gpusvm_range_end(range));
1385 
1386 	if (ctx->in_notifier)
1387 		lockdep_assert_held_write(&gpusvm->notifier_lock);
1388 	else
1389 		drm_gpusvm_notifier_lock(gpusvm);
1390 
1391 	__drm_gpusvm_range_unmap_pages(gpusvm, range, npages);
1392 
1393 	if (!ctx->in_notifier)
1394 		drm_gpusvm_notifier_unlock(gpusvm);
1395 }
1396 EXPORT_SYMBOL_GPL(drm_gpusvm_range_unmap_pages);
1397 
1398 /**
1399  * drm_gpusvm_range_evict() - Evict GPU SVM range
1400  * @gpusvm: Pointer to the GPU SVM structure
1401  * @range: Pointer to the GPU SVM range to be removed
1402  *
1403  * This function evicts the specified GPU SVM range.
1404  *
1405  * Return: 0 on success, a negative error code on failure.
1406  */
1407 int drm_gpusvm_range_evict(struct drm_gpusvm *gpusvm,
1408 			   struct drm_gpusvm_range *range)
1409 {
1410 	struct mmu_interval_notifier *notifier = &range->notifier->notifier;
1411 	struct hmm_range hmm_range = {
1412 		.default_flags = HMM_PFN_REQ_FAULT,
1413 		.notifier = notifier,
1414 		.start = drm_gpusvm_range_start(range),
1415 		.end = drm_gpusvm_range_end(range),
1416 		.dev_private_owner = NULL,
1417 	};
1418 	unsigned long timeout =
1419 		jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
1420 	unsigned long *pfns;
1421 	unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
1422 					       drm_gpusvm_range_end(range));
1423 	int err = 0;
1424 	struct mm_struct *mm = gpusvm->mm;
1425 
1426 	if (!mmget_not_zero(mm))
1427 		return -EFAULT;
1428 
1429 	pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
1430 	if (!pfns)
1431 		return -ENOMEM;
1432 
1433 	hmm_range.hmm_pfns = pfns;
1434 	while (!time_after(jiffies, timeout)) {
1435 		hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
1436 		if (time_after(jiffies, timeout)) {
1437 			err = -ETIME;
1438 			break;
1439 		}
1440 
1441 		mmap_read_lock(mm);
1442 		err = hmm_range_fault(&hmm_range);
1443 		mmap_read_unlock(mm);
1444 		if (err != -EBUSY)
1445 			break;
1446 	}
1447 
1448 	kvfree(pfns);
1449 	mmput(mm);
1450 
1451 	return err;
1452 }
1453 EXPORT_SYMBOL_GPL(drm_gpusvm_range_evict);
1454 
1455 /**
1456  * drm_gpusvm_has_mapping() - Check if GPU SVM has mapping for the given address range
1457  * @gpusvm: Pointer to the GPU SVM structure.
1458  * @start: Start address
1459  * @end: End address
1460  *
1461  * Return: True if GPU SVM has mapping, False otherwise
1462  */
1463 bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, unsigned long start,
1464 			    unsigned long end)
1465 {
1466 	struct drm_gpusvm_notifier *notifier;
1467 
1468 	drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
1469 		struct drm_gpusvm_range *range = NULL;
1470 
1471 		drm_gpusvm_for_each_range(range, notifier, start, end)
1472 			return true;
1473 	}
1474 
1475 	return false;
1476 }
1477 EXPORT_SYMBOL_GPL(drm_gpusvm_has_mapping);
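
/*
 * Example usage sketch (not part of the API): refusing to tear down a GPU VA
 * interval while GPU SVM still has ranges mapped within it; the -EBUSY policy
 * is purely illustrative.
 *
 * .. code-block:: c
 *
 *	if (drm_gpusvm_has_mapping(gpusvm, gpuva_start, gpuva_end))
 *		return -EBUSY;
 */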
1478 
1479 /**
1480  * drm_gpusvm_range_set_unmapped() - Mark a GPU SVM range as unmapped
1481  * @range: Pointer to the GPU SVM range structure.
1482  * @mmu_range: Pointer to the MMU notifier range structure.
1483  *
1484  * This function marks a GPU SVM range as unmapped and sets the partial_unmap flag
1485  * if the range partially falls within the provided MMU notifier range.
1486  */
1487 void drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
1488 				   const struct mmu_notifier_range *mmu_range)
1489 {
1490 	lockdep_assert_held_write(&range->gpusvm->notifier_lock);
1491 
1492 	range->flags.unmapped = true;
1493 	if (drm_gpusvm_range_start(range) < mmu_range->start ||
1494 	    drm_gpusvm_range_end(range) > mmu_range->end)
1495 		range->flags.partial_unmap = true;
1496 }
1497 EXPORT_SYMBOL_GPL(drm_gpusvm_range_set_unmapped);
1498 
1499 MODULE_DESCRIPTION("DRM GPUSVM");
1500 MODULE_LICENSE("GPL");
1501