1 // SPDX-License-Identifier: GPL-2.0-only OR MIT
2 /*
3 * Copyright © 2024 Intel Corporation
4 *
5 * Authors:
6 * Matthew Brost <matthew.brost@intel.com>
7 */
8
9 #include <linux/dma-mapping.h>
10 #include <linux/hmm.h>
11 #include <linux/memremap.h>
12 #include <linux/migrate.h>
13 #include <linux/mm_types.h>
14 #include <linux/pagemap.h>
15 #include <linux/slab.h>
16
17 #include <drm/drm_device.h>
18 #include <drm/drm_gpusvm.h>
19 #include <drm/drm_pagemap.h>
20 #include <drm/drm_print.h>
21
22 /**
23 * DOC: Overview
24 *
25 * GPU Shared Virtual Memory (GPU SVM) layer for the Direct Rendering Manager (DRM)
26 * is a component of the DRM framework designed to manage shared virtual memory
27 * between the CPU and GPU. It enables efficient data exchange and processing
28 * for GPU-accelerated applications by allowing memory sharing and
29 * synchronization between the CPU's and GPU's virtual address spaces.
30 *
31 * Key GPU SVM Components:
32 *
33 * - Notifiers:
34 * Used for tracking memory intervals and notifying the GPU of changes,
35 * notifiers are sized based on a GPU SVM initialization parameter, with a
 *	recommendation of 512M or larger. They maintain a Red-Black tree and a
37 * list of ranges that fall within the notifier interval. Notifiers are
 *	tracked within a GPU SVM Red-Black tree and list and are dynamically
39 * inserted or removed as ranges within the interval are created or
40 * destroyed.
41 * - Ranges:
42 * Represent memory ranges mapped in a DRM device and managed by GPU SVM.
43 * They are sized based on an array of chunk sizes, which is a GPU SVM
44 * initialization parameter, and the CPU address space. Upon GPU fault,
45 * the largest aligned chunk that fits within the faulting CPU address
46 * space is chosen for the range size. Ranges are expected to be
47 * dynamically allocated on GPU fault and removed on an MMU notifier UNMAP
48 * event. As mentioned above, ranges are tracked in a notifier's Red-Black
49 * tree.
50 *
51 * - Operations:
52 * Define the interface for driver-specific GPU SVM operations such as
53 * range allocation, notifier allocation, and invalidations.
54 *
55 * - Device Memory Allocations:
56 * Embedded structure containing enough information for GPU SVM to migrate
57 * to / from device memory.
58 *
59 * - Device Memory Operations:
 *	Define the interface for driver-specific device memory operations,
 *	such as releasing memory, populating pfns, and copying to / from
 *	device memory.
62 *
63 * This layer provides interfaces for allocating, mapping, migrating, and
64 * releasing memory ranges between the CPU and GPU. It handles all core memory
65 * management interactions (DMA mapping, HMM, and migration) and provides
66 * driver-specific virtual functions (vfuncs). This infrastructure is sufficient
67 * to build the expected driver components for an SVM implementation as detailed
68 * below.
69 *
70 * Expected Driver Components:
71 *
72 * - GPU page fault handler:
73 * Used to create ranges and notifiers based on the fault address,
74 * optionally migrate the range to device memory, and create GPU bindings.
75 *
76 * - Garbage collector:
77 * Used to unmap and destroy GPU bindings for ranges. Ranges are expected
78 * to be added to the garbage collector upon a MMU_NOTIFY_UNMAP event in
79 * notifier callback.
80 *
81 * - Notifier callback:
82 * Used to invalidate and DMA unmap GPU bindings for ranges.
83 */
84
85 /**
86 * DOC: Locking
87 *
88 * GPU SVM handles locking for core MM interactions, i.e., it locks/unlocks the
89 * mmap lock as needed.
90 *
91 * GPU SVM introduces a global notifier lock, which safeguards the notifier's
92 * range RB tree and list, as well as the range's DMA mappings and sequence
93 * number. GPU SVM manages all necessary locking and unlocking operations,
 * except for rechecking that a range's pages are still valid
 * (drm_gpusvm_range_pages_valid) when the driver is committing GPU bindings.
96 * This lock corresponds to the ``driver->update`` lock mentioned in
97 * Documentation/mm/hmm.rst. Future revisions may transition from a GPU SVM
98 * global lock to a per-notifier lock if finer-grained locking is deemed
99 * necessary.
100 *
101 * In addition to the locking mentioned above, the driver should implement a
102 * lock to safeguard core GPU SVM function calls that modify state, such as
103 * drm_gpusvm_range_find_or_insert and drm_gpusvm_range_remove. This lock is
104 * denoted as 'driver_svm_lock' in code examples. Finer grained driver side
105 * locking should also be possible for concurrent GPU fault processing within a
 * single GPU SVM. The 'driver_svm_lock' can be set via
 * drm_gpusvm_driver_set_lock to add annotations to GPU SVM.
108 */
109
110 /**
111 * DOC: Migration
112 *
113 * The migration support is quite simple, allowing migration between RAM and
114 * device memory at the range granularity. For example, GPU SVM currently does
115 * not support mixing RAM and device memory pages within a range. This means
116 * that upon GPU fault, the entire range can be migrated to device memory, and
117 * upon CPU fault, the entire range is migrated to RAM. Mixed RAM and device
118 * memory storage within a range could be added in the future if required.
119 *
120 * The reasoning for only supporting range granularity is as follows: it
121 * simplifies the implementation, and range sizes are driver-defined and should
122 * be relatively small.
123 */
124
125 /**
126 * DOC: Partial Unmapping of Ranges
127 *
128 * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped by CPU resulting
129 * in MMU_NOTIFY_UNMAP event) presents several challenges, with the main one
130 * being that a subset of the range still has CPU and GPU mappings. If the
131 * backing store for the range is in device memory, a subset of the backing
132 * store has references. One option would be to split the range and device
133 * memory backing store, but the implementation for this would be quite
134 * complicated. Given that partial unmappings are rare and driver-defined range
135 * sizes are relatively small, GPU SVM does not support splitting of ranges.
136 *
137 * With no support for range splitting, upon partial unmapping of a range, the
138 * driver is expected to invalidate and destroy the entire range. If the range
139 * has device memory as its backing, the driver is also expected to migrate any
140 * remaining pages back to RAM.
141 */
142
143 /**
144 * DOC: Examples
145 *
146 * This section provides three examples of how to build the expected driver
147 * components: the GPU page fault handler, the garbage collector, and the
148 * notifier callback.
149 *
150 * The generic code provided does not include logic for complex migration
 * policies, optimized invalidations, fine-grained driver locking, or other
152 * potentially required driver locking (e.g., DMA-resv locks).
153 *
154 * 1) GPU page fault handler
155 *
156 * .. code-block:: c
157 *
158 * int driver_bind_range(struct drm_gpusvm *gpusvm, struct drm_gpusvm_range *range)
159 * {
160 * int err = 0;
161 *
162 * driver_alloc_and_setup_memory_for_bind(gpusvm, range);
163 *
164 * drm_gpusvm_notifier_lock(gpusvm);
165 * if (drm_gpusvm_range_pages_valid(range))
166 * driver_commit_bind(gpusvm, range);
167 * else
168 * err = -EAGAIN;
169 * drm_gpusvm_notifier_unlock(gpusvm);
170 *
171 * return err;
172 * }
173 *
174 * int driver_gpu_fault(struct drm_gpusvm *gpusvm, unsigned long fault_addr,
175 * unsigned long gpuva_start, unsigned long gpuva_end)
176 * {
177 * struct drm_gpusvm_ctx ctx = {};
178 * int err;
179 *
180 * driver_svm_lock();
181 * retry:
182 * // Always process UNMAPs first so view of GPU SVM ranges is current
183 * driver_garbage_collector(gpusvm);
184 *
185 * range = drm_gpusvm_range_find_or_insert(gpusvm, fault_addr,
186 * gpuva_start, gpuva_end,
187 * &ctx);
188 * if (IS_ERR(range)) {
189 * err = PTR_ERR(range);
190 * goto unlock;
191 * }
192 *
193 * if (driver_migration_policy(range)) {
194 * mmap_read_lock(mm);
195 * devmem = driver_alloc_devmem();
196 * err = drm_gpusvm_migrate_to_devmem(gpusvm, range,
 *							   devmem,
198 * &ctx);
199 * mmap_read_unlock(mm);
200 * if (err) // CPU mappings may have changed
201 * goto retry;
202 * }
203 *
204 * err = drm_gpusvm_range_get_pages(gpusvm, range, &ctx);
205 * if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM) { // CPU mappings changed
206 * if (err == -EOPNOTSUPP)
207 * drm_gpusvm_range_evict(gpusvm, range);
208 * goto retry;
209 * } else if (err) {
210 * goto unlock;
211 * }
212 *
213 * err = driver_bind_range(gpusvm, range);
214 * if (err == -EAGAIN) // CPU mappings changed
 *			goto retry;
216 *
217 * unlock:
218 * driver_svm_unlock();
219 * return err;
220 * }
221 *
222 * 2) Garbage Collector
223 *
224 * .. code-block:: c
225 *
226 * void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
227 * struct drm_gpusvm_range *range)
228 * {
229 * assert_driver_svm_locked(gpusvm);
230 *
231 * // Partial unmap, migrate any remaining device memory pages back to RAM
232 * if (range->flags.partial_unmap)
233 * drm_gpusvm_range_evict(gpusvm, range);
234 *
235 * driver_unbind_range(range);
236 * drm_gpusvm_range_remove(gpusvm, range);
237 * }
238 *
239 * void driver_garbage_collector(struct drm_gpusvm *gpusvm)
240 * {
241 * assert_driver_svm_locked(gpusvm);
242 *
243 * for_each_range_in_garbage_collector(gpusvm, range)
244 * __driver_garbage_collector(gpusvm, range);
245 * }
246 *
247 * 3) Notifier callback
248 *
249 * .. code-block:: c
250 *
251 * void driver_invalidation(struct drm_gpusvm *gpusvm,
252 * struct drm_gpusvm_notifier *notifier,
253 * const struct mmu_notifier_range *mmu_range)
254 * {
255 * struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
256 * struct drm_gpusvm_range *range = NULL;
257 *
258 * driver_invalidate_device_pages(gpusvm, mmu_range->start, mmu_range->end);
259 *
260 * drm_gpusvm_for_each_range(range, notifier, mmu_range->start,
261 * mmu_range->end) {
262 * drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);
263 *
264 * if (mmu_range->event != MMU_NOTIFY_UNMAP)
265 * continue;
266 *
267 * drm_gpusvm_range_set_unmapped(range, mmu_range);
268 * driver_garbage_collector_add(gpusvm, range);
269 * }
270 * }
271 */
272
273 /**
274 * npages_in_range() - Calculate the number of pages in a given range
275 * @start: The start address of the range
276 * @end: The end address of the range
277 *
278 * This macro calculates the number of pages in a given memory range,
279 * specified by the start and end addresses. It divides the difference
280 * between the end and start addresses by the page size (PAGE_SIZE) to
281 * determine the number of pages in the range.
282 *
283 * Return: The number of pages in the specified range.
284 */
285 static unsigned long
npages_in_range(unsigned long start,unsigned long end)286 npages_in_range(unsigned long start, unsigned long end)
287 {
288 return (end - start) >> PAGE_SHIFT;
289 }
290
/**
 * struct drm_gpusvm_zdd - GPU SVM zone device data
 *
 * @refcount: Reference count for the zdd; the zdd is destroyed when it drops
 *            to zero
 * @devmem_allocation: device memory allocation, NULL until one is attached
 * @device_private_page_owner: Device private pages owner
 *
 * This structure serves as a generic wrapper installed in
 * page->zone_device_data. It provides infrastructure for looking up a device
 * memory allocation upon CPU page fault and for releasing the device memory
 * once the last reference to the zdd has been dropped.
 */
struct drm_gpusvm_zdd {
	struct kref refcount;
	struct drm_gpusvm_devmem *devmem_allocation;
	void *device_private_page_owner;
};
310
311 /**
312 * drm_gpusvm_zdd_alloc() - Allocate a zdd structure.
313 * @device_private_page_owner: Device private pages owner
314 *
315 * This function allocates and initializes a new zdd structure. It sets up the
316 * reference count and initializes the destroy work.
317 *
318 * Return: Pointer to the allocated zdd on success, ERR_PTR() on failure.
319 */
320 static struct drm_gpusvm_zdd *
drm_gpusvm_zdd_alloc(void * device_private_page_owner)321 drm_gpusvm_zdd_alloc(void *device_private_page_owner)
322 {
323 struct drm_gpusvm_zdd *zdd;
324
325 zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
326 if (!zdd)
327 return NULL;
328
329 kref_init(&zdd->refcount);
330 zdd->devmem_allocation = NULL;
331 zdd->device_private_page_owner = device_private_page_owner;
332
333 return zdd;
334 }
335
336 /**
337 * drm_gpusvm_zdd_get() - Get a reference to a zdd structure.
338 * @zdd: Pointer to the zdd structure.
339 *
340 * This function increments the reference count of the provided zdd structure.
341 *
342 * Return: Pointer to the zdd structure.
343 */
drm_gpusvm_zdd_get(struct drm_gpusvm_zdd * zdd)344 static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct drm_gpusvm_zdd *zdd)
345 {
346 kref_get(&zdd->refcount);
347 return zdd;
348 }
349
350 /**
351 * drm_gpusvm_zdd_destroy() - Destroy a zdd structure.
352 * @ref: Pointer to the reference count structure.
353 *
354 * This function queues the destroy_work of the zdd for asynchronous destruction.
355 */
drm_gpusvm_zdd_destroy(struct kref * ref)356 static void drm_gpusvm_zdd_destroy(struct kref *ref)
357 {
358 struct drm_gpusvm_zdd *zdd =
359 container_of(ref, struct drm_gpusvm_zdd, refcount);
360 struct drm_gpusvm_devmem *devmem = zdd->devmem_allocation;
361
362 if (devmem) {
363 complete_all(&devmem->detached);
364 if (devmem->ops->devmem_release)
365 devmem->ops->devmem_release(devmem);
366 }
367 kfree(zdd);
368 }
369
370 /**
371 * drm_gpusvm_zdd_put() - Put a zdd reference.
372 * @zdd: Pointer to the zdd structure.
373 *
374 * This function decrements the reference count of the provided zdd structure
375 * and schedules its destruction if the count drops to zero.
376 */
drm_gpusvm_zdd_put(struct drm_gpusvm_zdd * zdd)377 static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
378 {
379 kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
380 }
381
382 /**
383 * drm_gpusvm_range_find() - Find GPU SVM range from GPU SVM notifier
384 * @notifier: Pointer to the GPU SVM notifier structure.
385 * @start: Start address of the range
386 * @end: End address of the range
387 *
388 * Return: A pointer to the drm_gpusvm_range if found or NULL
389 */
390 struct drm_gpusvm_range *
drm_gpusvm_range_find(struct drm_gpusvm_notifier * notifier,unsigned long start,unsigned long end)391 drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, unsigned long start,
392 unsigned long end)
393 {
394 struct interval_tree_node *itree;
395
396 itree = interval_tree_iter_first(¬ifier->root, start, end - 1);
397
398 if (itree)
399 return container_of(itree, struct drm_gpusvm_range, itree);
400 else
401 return NULL;
402 }
403 EXPORT_SYMBOL_GPL(drm_gpusvm_range_find);
404
/**
 * drm_gpusvm_for_each_range_safe() - Safely iterate over GPU SVM ranges in a notifier
 * @range__: Iterator variable for the ranges
 * @next__: Iterator variable for the ranges temporary storage
 * @notifier__: Pointer to the GPU SVM notifier
 * @start__: Start address of the range
 * @end__: End address of the range
 *
 * This macro is used to iterate over GPU SVM ranges in a notifier while
 * removing ranges from it. The successor is fetched into @next__ before the
 * loop body runs, so the body may remove @range__. Note that @end__ is
 * evaluated on every iteration.
 */
#define drm_gpusvm_for_each_range_safe(range__, next__, notifier__, start__, end__)	\
	for ((range__) = drm_gpusvm_range_find((notifier__), (start__), (end__)),	\
	     (next__) = __drm_gpusvm_range_next(range__);				\
	     (range__) && (drm_gpusvm_range_start(range__) < (end__));			\
	     (range__) = (next__), (next__) = __drm_gpusvm_range_next(range__))
421
422 /**
423 * __drm_gpusvm_notifier_next() - get the next drm_gpusvm_notifier in the list
424 * @notifier: a pointer to the current drm_gpusvm_notifier
425 *
426 * Return: A pointer to the next drm_gpusvm_notifier if available, or NULL if
427 * the current notifier is the last one or if the input notifier is
428 * NULL.
429 */
430 static struct drm_gpusvm_notifier *
__drm_gpusvm_notifier_next(struct drm_gpusvm_notifier * notifier)431 __drm_gpusvm_notifier_next(struct drm_gpusvm_notifier *notifier)
432 {
433 if (notifier && !list_is_last(¬ifier->entry,
434 ¬ifier->gpusvm->notifier_list))
435 return list_next_entry(notifier, entry);
436
437 return NULL;
438 }
439
440 static struct drm_gpusvm_notifier *
notifier_iter_first(struct rb_root_cached * root,unsigned long start,unsigned long last)441 notifier_iter_first(struct rb_root_cached *root, unsigned long start,
442 unsigned long last)
443 {
444 struct interval_tree_node *itree;
445
446 itree = interval_tree_iter_first(root, start, last);
447
448 if (itree)
449 return container_of(itree, struct drm_gpusvm_notifier, itree);
450 else
451 return NULL;
452 }
453
/**
 * drm_gpusvm_for_each_notifier() - Iterate over GPU SVM notifiers in a gpusvm
 * @notifier__: Iterator variable for the notifiers
 * @gpusvm__: Pointer to the GPU SVM structure
 * @start__: Start address of the notifier
 * @end__: End address of the notifier
 *
 * This macro is used to iterate over GPU SVM notifiers in a gpusvm. The
 * lookup of the first notifier uses @end__ - 1 as the last address; @end__
 * is evaluated on every iteration.
 */
#define drm_gpusvm_for_each_notifier(notifier__, gpusvm__, start__, end__)		\
	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root, (start__), (end__) - 1);	\
	     (notifier__) && (drm_gpusvm_notifier_start(notifier__) < (end__));		\
	     (notifier__) = __drm_gpusvm_notifier_next(notifier__))
467
/**
 * drm_gpusvm_for_each_notifier_safe() - Safely iterate over GPU SVM notifiers in a gpusvm
 * @notifier__: Iterator variable for the notifiers
 * @next__: Iterator variable for the notifiers temporary storage
 * @gpusvm__: Pointer to the GPU SVM structure
 * @start__: Start address of the notifier
 * @end__: End address of the notifier
 *
 * This macro is used to iterate over GPU SVM notifiers in a gpusvm while
 * removing notifiers from it. The successor is fetched into @next__ before
 * the loop body runs. @end__ is evaluated on every iteration.
 */
#define drm_gpusvm_for_each_notifier_safe(notifier__, next__, gpusvm__, start__, end__)	\
	for ((notifier__) = notifier_iter_first(&(gpusvm__)->root, (start__), (end__) - 1),	\
	     (next__) = __drm_gpusvm_notifier_next(notifier__);				\
	     (notifier__) && (drm_gpusvm_notifier_start(notifier__) < (end__));		\
	     (notifier__) = (next__), (next__) = __drm_gpusvm_notifier_next(notifier__))
484
485 /**
486 * drm_gpusvm_notifier_invalidate() - Invalidate a GPU SVM notifier.
487 * @mni: Pointer to the mmu_interval_notifier structure.
488 * @mmu_range: Pointer to the mmu_notifier_range structure.
489 * @cur_seq: Current sequence number.
490 *
491 * This function serves as a generic MMU notifier for GPU SVM. It sets the MMU
492 * notifier sequence number and calls the driver invalidate vfunc under
493 * gpusvm->notifier_lock.
494 *
495 * Return: true if the operation succeeds, false otherwise.
496 */
497 static bool
drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier * mni,const struct mmu_notifier_range * mmu_range,unsigned long cur_seq)498 drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier *mni,
499 const struct mmu_notifier_range *mmu_range,
500 unsigned long cur_seq)
501 {
502 struct drm_gpusvm_notifier *notifier =
503 container_of(mni, typeof(*notifier), notifier);
504 struct drm_gpusvm *gpusvm = notifier->gpusvm;
505
506 if (!mmu_notifier_range_blockable(mmu_range))
507 return false;
508
509 down_write(&gpusvm->notifier_lock);
510 mmu_interval_set_seq(mni, cur_seq);
511 gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
512 up_write(&gpusvm->notifier_lock);
513
514 return true;
515 }
516
/*
 * drm_gpusvm_notifier_ops - MMU interval notifier operations for GPU SVM
 *
 * Only the invalidate callback is provided; it routes to
 * drm_gpusvm_notifier_invalidate().
 */
static const struct mmu_interval_notifier_ops drm_gpusvm_notifier_ops = {
	.invalidate = drm_gpusvm_notifier_invalidate,
};
523
524 /**
525 * drm_gpusvm_init() - Initialize the GPU SVM.
526 * @gpusvm: Pointer to the GPU SVM structure.
527 * @name: Name of the GPU SVM.
528 * @drm: Pointer to the DRM device structure.
529 * @mm: Pointer to the mm_struct for the address space.
530 * @device_private_page_owner: Device private pages owner.
531 * @mm_start: Start address of GPU SVM.
532 * @mm_range: Range of the GPU SVM.
533 * @notifier_size: Size of individual notifiers.
534 * @ops: Pointer to the operations structure for GPU SVM.
535 * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
536 * Entries should be powers of 2 in descending order with last
537 * entry being SZ_4K.
538 * @num_chunks: Number of chunks.
539 *
540 * This function initializes the GPU SVM.
541 *
542 * Return: 0 on success, a negative error code on failure.
543 */
drm_gpusvm_init(struct drm_gpusvm * gpusvm,const char * name,struct drm_device * drm,struct mm_struct * mm,void * device_private_page_owner,unsigned long mm_start,unsigned long mm_range,unsigned long notifier_size,const struct drm_gpusvm_ops * ops,const unsigned long * chunk_sizes,int num_chunks)544 int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
545 const char *name, struct drm_device *drm,
546 struct mm_struct *mm, void *device_private_page_owner,
547 unsigned long mm_start, unsigned long mm_range,
548 unsigned long notifier_size,
549 const struct drm_gpusvm_ops *ops,
550 const unsigned long *chunk_sizes, int num_chunks)
551 {
552 if (!ops->invalidate || !num_chunks)
553 return -EINVAL;
554
555 gpusvm->name = name;
556 gpusvm->drm = drm;
557 gpusvm->mm = mm;
558 gpusvm->device_private_page_owner = device_private_page_owner;
559 gpusvm->mm_start = mm_start;
560 gpusvm->mm_range = mm_range;
561 gpusvm->notifier_size = notifier_size;
562 gpusvm->ops = ops;
563 gpusvm->chunk_sizes = chunk_sizes;
564 gpusvm->num_chunks = num_chunks;
565
566 mmgrab(mm);
567 gpusvm->root = RB_ROOT_CACHED;
568 INIT_LIST_HEAD(&gpusvm->notifier_list);
569
570 init_rwsem(&gpusvm->notifier_lock);
571
572 fs_reclaim_acquire(GFP_KERNEL);
573 might_lock(&gpusvm->notifier_lock);
574 fs_reclaim_release(GFP_KERNEL);
575
576 #ifdef CONFIG_LOCKDEP
577 gpusvm->lock_dep_map = NULL;
578 #endif
579
580 return 0;
581 }
582 EXPORT_SYMBOL_GPL(drm_gpusvm_init);
583
584 /**
585 * drm_gpusvm_notifier_find() - Find GPU SVM notifier
586 * @gpusvm: Pointer to the GPU SVM structure
587 * @fault_addr: Fault address
588 *
589 * This function finds the GPU SVM notifier associated with the fault address.
590 *
591 * Return: Pointer to the GPU SVM notifier on success, NULL otherwise.
592 */
593 static struct drm_gpusvm_notifier *
drm_gpusvm_notifier_find(struct drm_gpusvm * gpusvm,unsigned long fault_addr)594 drm_gpusvm_notifier_find(struct drm_gpusvm *gpusvm,
595 unsigned long fault_addr)
596 {
597 return notifier_iter_first(&gpusvm->root, fault_addr, fault_addr + 1);
598 }
599
600 /**
601 * to_drm_gpusvm_notifier() - retrieve the container struct for a given rbtree node
602 * @node: a pointer to the rbtree node embedded within a drm_gpusvm_notifier struct
603 *
604 * Return: A pointer to the containing drm_gpusvm_notifier structure.
605 */
to_drm_gpusvm_notifier(struct rb_node * node)606 static struct drm_gpusvm_notifier *to_drm_gpusvm_notifier(struct rb_node *node)
607 {
608 return container_of(node, struct drm_gpusvm_notifier, itree.rb);
609 }
610
611 /**
612 * drm_gpusvm_notifier_insert() - Insert GPU SVM notifier
613 * @gpusvm: Pointer to the GPU SVM structure
614 * @notifier: Pointer to the GPU SVM notifier structure
615 *
616 * This function inserts the GPU SVM notifier into the GPU SVM RB tree and list.
617 */
drm_gpusvm_notifier_insert(struct drm_gpusvm * gpusvm,struct drm_gpusvm_notifier * notifier)618 static void drm_gpusvm_notifier_insert(struct drm_gpusvm *gpusvm,
619 struct drm_gpusvm_notifier *notifier)
620 {
621 struct rb_node *node;
622 struct list_head *head;
623
624 interval_tree_insert(¬ifier->itree, &gpusvm->root);
625
626 node = rb_prev(¬ifier->itree.rb);
627 if (node)
628 head = &(to_drm_gpusvm_notifier(node))->entry;
629 else
630 head = &gpusvm->notifier_list;
631
632 list_add(¬ifier->entry, head);
633 }
634
635 /**
636 * drm_gpusvm_notifier_remove() - Remove GPU SVM notifier
637 * @gpusvm: Pointer to the GPU SVM tructure
638 * @notifier: Pointer to the GPU SVM notifier structure
639 *
640 * This function removes the GPU SVM notifier from the GPU SVM RB tree and list.
641 */
drm_gpusvm_notifier_remove(struct drm_gpusvm * gpusvm,struct drm_gpusvm_notifier * notifier)642 static void drm_gpusvm_notifier_remove(struct drm_gpusvm *gpusvm,
643 struct drm_gpusvm_notifier *notifier)
644 {
645 interval_tree_remove(¬ifier->itree, &gpusvm->root);
646 list_del(¬ifier->entry);
647 }
648
649 /**
650 * drm_gpusvm_fini() - Finalize the GPU SVM.
651 * @gpusvm: Pointer to the GPU SVM structure.
652 *
653 * This function finalizes the GPU SVM by cleaning up any remaining ranges and
654 * notifiers, and dropping a reference to struct MM.
655 */
drm_gpusvm_fini(struct drm_gpusvm * gpusvm)656 void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
657 {
658 struct drm_gpusvm_notifier *notifier, *next;
659
660 drm_gpusvm_for_each_notifier_safe(notifier, next, gpusvm, 0, LONG_MAX) {
661 struct drm_gpusvm_range *range, *__next;
662
663 /*
664 * Remove notifier first to avoid racing with any invalidation
665 */
666 mmu_interval_notifier_remove(¬ifier->notifier);
667 notifier->flags.removed = true;
668
669 drm_gpusvm_for_each_range_safe(range, __next, notifier, 0,
670 LONG_MAX)
671 drm_gpusvm_range_remove(gpusvm, range);
672 }
673
674 mmdrop(gpusvm->mm);
675 WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
676 }
677 EXPORT_SYMBOL_GPL(drm_gpusvm_fini);
678
679 /**
680 * drm_gpusvm_notifier_alloc() - Allocate GPU SVM notifier
681 * @gpusvm: Pointer to the GPU SVM structure
682 * @fault_addr: Fault address
683 *
684 * This function allocates and initializes the GPU SVM notifier structure.
685 *
686 * Return: Pointer to the allocated GPU SVM notifier on success, ERR_PTR() on failure.
687 */
688 static struct drm_gpusvm_notifier *
drm_gpusvm_notifier_alloc(struct drm_gpusvm * gpusvm,unsigned long fault_addr)689 drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, unsigned long fault_addr)
690 {
691 struct drm_gpusvm_notifier *notifier;
692
693 if (gpusvm->ops->notifier_alloc)
694 notifier = gpusvm->ops->notifier_alloc();
695 else
696 notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
697
698 if (!notifier)
699 return ERR_PTR(-ENOMEM);
700
701 notifier->gpusvm = gpusvm;
702 notifier->itree.start = ALIGN_DOWN(fault_addr, gpusvm->notifier_size);
703 notifier->itree.last = ALIGN(fault_addr + 1, gpusvm->notifier_size) - 1;
704 INIT_LIST_HEAD(¬ifier->entry);
705 notifier->root = RB_ROOT_CACHED;
706 INIT_LIST_HEAD(¬ifier->range_list);
707
708 return notifier;
709 }
710
711 /**
712 * drm_gpusvm_notifier_free() - Free GPU SVM notifier
713 * @gpusvm: Pointer to the GPU SVM structure
714 * @notifier: Pointer to the GPU SVM notifier structure
715 *
716 * This function frees the GPU SVM notifier structure.
717 */
drm_gpusvm_notifier_free(struct drm_gpusvm * gpusvm,struct drm_gpusvm_notifier * notifier)718 static void drm_gpusvm_notifier_free(struct drm_gpusvm *gpusvm,
719 struct drm_gpusvm_notifier *notifier)
720 {
721 WARN_ON(!RB_EMPTY_ROOT(¬ifier->root.rb_root));
722
723 if (gpusvm->ops->notifier_free)
724 gpusvm->ops->notifier_free(notifier);
725 else
726 kfree(notifier);
727 }
728
729 /**
730 * to_drm_gpusvm_range() - retrieve the container struct for a given rbtree node
731 * @node: a pointer to the rbtree node embedded within a drm_gpusvm_range struct
732 *
733 * Return: A pointer to the containing drm_gpusvm_range structure.
734 */
to_drm_gpusvm_range(struct rb_node * node)735 static struct drm_gpusvm_range *to_drm_gpusvm_range(struct rb_node *node)
736 {
737 return container_of(node, struct drm_gpusvm_range, itree.rb);
738 }
739
740 /**
741 * drm_gpusvm_range_insert() - Insert GPU SVM range
742 * @notifier: Pointer to the GPU SVM notifier structure
743 * @range: Pointer to the GPU SVM range structure
744 *
745 * This function inserts the GPU SVM range into the notifier RB tree and list.
746 */
drm_gpusvm_range_insert(struct drm_gpusvm_notifier * notifier,struct drm_gpusvm_range * range)747 static void drm_gpusvm_range_insert(struct drm_gpusvm_notifier *notifier,
748 struct drm_gpusvm_range *range)
749 {
750 struct rb_node *node;
751 struct list_head *head;
752
753 drm_gpusvm_notifier_lock(notifier->gpusvm);
754 interval_tree_insert(&range->itree, ¬ifier->root);
755
756 node = rb_prev(&range->itree.rb);
757 if (node)
758 head = &(to_drm_gpusvm_range(node))->entry;
759 else
760 head = ¬ifier->range_list;
761
762 list_add(&range->entry, head);
763 drm_gpusvm_notifier_unlock(notifier->gpusvm);
764 }
765
766 /**
767 * __drm_gpusvm_range_remove() - Remove GPU SVM range
768 * @notifier: Pointer to the GPU SVM notifier structure
769 * @range: Pointer to the GPU SVM range structure
770 *
771 * This macro removes the GPU SVM range from the notifier RB tree and list.
772 */
__drm_gpusvm_range_remove(struct drm_gpusvm_notifier * notifier,struct drm_gpusvm_range * range)773 static void __drm_gpusvm_range_remove(struct drm_gpusvm_notifier *notifier,
774 struct drm_gpusvm_range *range)
775 {
776 interval_tree_remove(&range->itree, ¬ifier->root);
777 list_del(&range->entry);
778 }
779
780 /**
781 * drm_gpusvm_range_alloc() - Allocate GPU SVM range
782 * @gpusvm: Pointer to the GPU SVM structure
783 * @notifier: Pointer to the GPU SVM notifier structure
784 * @fault_addr: Fault address
785 * @chunk_size: Chunk size
786 * @migrate_devmem: Flag indicating whether to migrate device memory
787 *
788 * This function allocates and initializes the GPU SVM range structure.
789 *
790 * Return: Pointer to the allocated GPU SVM range on success, ERR_PTR() on failure.
791 */
792 static struct drm_gpusvm_range *
drm_gpusvm_range_alloc(struct drm_gpusvm * gpusvm,struct drm_gpusvm_notifier * notifier,unsigned long fault_addr,unsigned long chunk_size,bool migrate_devmem)793 drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
794 struct drm_gpusvm_notifier *notifier,
795 unsigned long fault_addr, unsigned long chunk_size,
796 bool migrate_devmem)
797 {
798 struct drm_gpusvm_range *range;
799
800 if (gpusvm->ops->range_alloc)
801 range = gpusvm->ops->range_alloc(gpusvm);
802 else
803 range = kzalloc(sizeof(*range), GFP_KERNEL);
804
805 if (!range)
806 return ERR_PTR(-ENOMEM);
807
808 kref_init(&range->refcount);
809 range->gpusvm = gpusvm;
810 range->notifier = notifier;
811 range->itree.start = ALIGN_DOWN(fault_addr, chunk_size);
812 range->itree.last = ALIGN(fault_addr + 1, chunk_size) - 1;
813 INIT_LIST_HEAD(&range->entry);
814 range->notifier_seq = LONG_MAX;
815 range->flags.migrate_devmem = migrate_devmem ? 1 : 0;
816
817 return range;
818 }
819
820 /**
821 * drm_gpusvm_check_pages() - Check pages
822 * @gpusvm: Pointer to the GPU SVM structure
823 * @notifier: Pointer to the GPU SVM notifier structure
824 * @start: Start address
825 * @end: End address
826 *
827 * Check if pages between start and end have been faulted in on the CPU. Use to
828 * prevent migration of pages without CPU backing store.
829 *
830 * Return: True if pages have been faulted into CPU, False otherwise
831 */
drm_gpusvm_check_pages(struct drm_gpusvm * gpusvm,struct drm_gpusvm_notifier * notifier,unsigned long start,unsigned long end)832 static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
833 struct drm_gpusvm_notifier *notifier,
834 unsigned long start, unsigned long end)
835 {
836 struct hmm_range hmm_range = {
837 .default_flags = 0,
838 .notifier = ¬ifier->notifier,
839 .start = start,
840 .end = end,
841 .dev_private_owner = gpusvm->device_private_page_owner,
842 };
843 unsigned long timeout =
844 jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
845 unsigned long *pfns;
846 unsigned long npages = npages_in_range(start, end);
847 int err, i;
848
849 mmap_assert_locked(gpusvm->mm);
850
851 pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
852 if (!pfns)
853 return false;
854
855 hmm_range.notifier_seq = mmu_interval_read_begin(¬ifier->notifier);
856 hmm_range.hmm_pfns = pfns;
857
858 while (true) {
859 err = hmm_range_fault(&hmm_range);
860 if (err == -EBUSY) {
861 if (time_after(jiffies, timeout))
862 break;
863
864 hmm_range.notifier_seq =
865 mmu_interval_read_begin(¬ifier->notifier);
866 continue;
867 }
868 break;
869 }
870 if (err)
871 goto err_free;
872
873 for (i = 0; i < npages;) {
874 if (!(pfns[i] & HMM_PFN_VALID)) {
875 err = -EFAULT;
876 goto err_free;
877 }
878 i += 0x1 << hmm_pfn_to_map_order(pfns[i]);
879 }
880
881 err_free:
882 kvfree(pfns);
883 return err ? false : true;
884 }
885
886 /**
887 * drm_gpusvm_range_chunk_size() - Determine chunk size for GPU SVM range
888 * @gpusvm: Pointer to the GPU SVM structure
889 * @notifier: Pointer to the GPU SVM notifier structure
890 * @vas: Pointer to the virtual memory area structure
891 * @fault_addr: Fault address
892 * @gpuva_start: Start address of GPUVA which mirrors CPU
893 * @gpuva_end: End address of GPUVA which mirrors CPU
894 * @check_pages_threshold: Check CPU pages for present threshold
895 *
896 * This function determines the chunk size for the GPU SVM range based on the
897 * fault address, GPU SVM chunk sizes, existing GPU SVM ranges, and the virtual
898 * memory area boundaries.
899 *
900 * Return: Chunk size on success, LONG_MAX on failure.
901 */
902 static unsigned long
drm_gpusvm_range_chunk_size(struct drm_gpusvm * gpusvm,struct drm_gpusvm_notifier * notifier,struct vm_area_struct * vas,unsigned long fault_addr,unsigned long gpuva_start,unsigned long gpuva_end,unsigned long check_pages_threshold)903 drm_gpusvm_range_chunk_size(struct drm_gpusvm *gpusvm,
904 struct drm_gpusvm_notifier *notifier,
905 struct vm_area_struct *vas,
906 unsigned long fault_addr,
907 unsigned long gpuva_start,
908 unsigned long gpuva_end,
909 unsigned long check_pages_threshold)
910 {
911 unsigned long start, end;
912 int i = 0;
913
914 retry:
915 for (; i < gpusvm->num_chunks; ++i) {
916 start = ALIGN_DOWN(fault_addr, gpusvm->chunk_sizes[i]);
917 end = ALIGN(fault_addr + 1, gpusvm->chunk_sizes[i]);
918
919 if (start >= vas->vm_start && end <= vas->vm_end &&
920 start >= drm_gpusvm_notifier_start(notifier) &&
921 end <= drm_gpusvm_notifier_end(notifier) &&
922 start >= gpuva_start && end <= gpuva_end)
923 break;
924 }
925
926 if (i == gpusvm->num_chunks)
927 return LONG_MAX;
928
929 /*
930 * If allocation more than page, ensure not to overlap with existing
931 * ranges.
932 */
933 if (end - start != SZ_4K) {
934 struct drm_gpusvm_range *range;
935
936 range = drm_gpusvm_range_find(notifier, start, end);
937 if (range) {
938 ++i;
939 goto retry;
940 }
941
942 /*
943 * XXX: Only create range on pages CPU has faulted in. Without
944 * this check, or prefault, on BMG 'xe_exec_system_allocator --r
945 * process-many-malloc' fails. In the failure case, each process
946 * mallocs 16k but the CPU VMA is ~128k which results in 64k SVM
947 * ranges. When migrating the SVM ranges, some processes fail in
948 * drm_gpusvm_migrate_to_devmem with 'migrate.cpages != npages'
949 * and then upon drm_gpusvm_range_get_pages device pages from
950 * other processes are collected + faulted in which creates all
951 * sorts of problems. Unsure exactly how this happening, also
952 * problem goes away if 'xe_exec_system_allocator --r
953 * process-many-malloc' mallocs at least 64k at a time.
954 */
955 if (end - start <= check_pages_threshold &&
956 !drm_gpusvm_check_pages(gpusvm, notifier, start, end)) {
957 ++i;
958 goto retry;
959 }
960 }
961
962 return end - start;
963 }
964
#ifdef CONFIG_LOCKDEP
/**
 * drm_gpusvm_driver_lock_held() - Assert GPU SVM driver lock is held
 * @gpusvm: Pointer to the GPU SVM structure.
 *
 * Ensure driver lock is held. The assertion is only made when a lockdep map
 * has been registered in @gpusvm->lock_dep_map; otherwise this is a no-op.
 */
static void drm_gpusvm_driver_lock_held(struct drm_gpusvm *gpusvm)
{
	if (gpusvm->lock_dep_map)
		lockdep_assert(lock_is_held_type(gpusvm->lock_dep_map, 0));
}
#else
/* Without CONFIG_LOCKDEP there is nothing to check. */
static void drm_gpusvm_driver_lock_held(struct drm_gpusvm *gpusvm)
{
}
#endif
982
983 /**
984 * drm_gpusvm_range_find_or_insert() - Find or insert GPU SVM range
985 * @gpusvm: Pointer to the GPU SVM structure
986 * @fault_addr: Fault address
987 * @gpuva_start: Start address of GPUVA which mirrors CPU
988 * @gpuva_end: End address of GPUVA which mirrors CPU
989 * @ctx: GPU SVM context
990 *
991 * This function finds or inserts a newly allocated a GPU SVM range based on the
992 * fault address. Caller must hold a lock to protect range lookup and insertion.
993 *
994 * Return: Pointer to the GPU SVM range on success, ERR_PTR() on failure.
995 */
996 struct drm_gpusvm_range *
drm_gpusvm_range_find_or_insert(struct drm_gpusvm * gpusvm,unsigned long fault_addr,unsigned long gpuva_start,unsigned long gpuva_end,const struct drm_gpusvm_ctx * ctx)997 drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm,
998 unsigned long fault_addr,
999 unsigned long gpuva_start,
1000 unsigned long gpuva_end,
1001 const struct drm_gpusvm_ctx *ctx)
1002 {
1003 struct drm_gpusvm_notifier *notifier;
1004 struct drm_gpusvm_range *range;
1005 struct mm_struct *mm = gpusvm->mm;
1006 struct vm_area_struct *vas;
1007 bool notifier_alloc = false;
1008 unsigned long chunk_size;
1009 int err;
1010 bool migrate_devmem;
1011
1012 drm_gpusvm_driver_lock_held(gpusvm);
1013
1014 if (fault_addr < gpusvm->mm_start ||
1015 fault_addr > gpusvm->mm_start + gpusvm->mm_range)
1016 return ERR_PTR(-EINVAL);
1017
1018 if (!mmget_not_zero(mm))
1019 return ERR_PTR(-EFAULT);
1020
1021 notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr);
1022 if (!notifier) {
1023 notifier = drm_gpusvm_notifier_alloc(gpusvm, fault_addr);
1024 if (IS_ERR(notifier)) {
1025 err = PTR_ERR(notifier);
1026 goto err_mmunlock;
1027 }
1028 notifier_alloc = true;
1029 err = mmu_interval_notifier_insert(¬ifier->notifier,
1030 mm,
1031 drm_gpusvm_notifier_start(notifier),
1032 drm_gpusvm_notifier_size(notifier),
1033 &drm_gpusvm_notifier_ops);
1034 if (err)
1035 goto err_notifier;
1036 }
1037
1038 mmap_read_lock(mm);
1039
1040 vas = vma_lookup(mm, fault_addr);
1041 if (!vas) {
1042 err = -ENOENT;
1043 goto err_notifier_remove;
1044 }
1045
1046 if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
1047 err = -EPERM;
1048 goto err_notifier_remove;
1049 }
1050
1051 range = drm_gpusvm_range_find(notifier, fault_addr, fault_addr + 1);
1052 if (range)
1053 goto out_mmunlock;
1054 /*
1055 * XXX: Short-circuiting migration based on migrate_vma_* current
1056 * limitations. If/when migrate_vma_* add more support, this logic will
1057 * have to change.
1058 */
1059 migrate_devmem = ctx->devmem_possible &&
1060 vma_is_anonymous(vas) && !is_vm_hugetlb_page(vas);
1061
1062 chunk_size = drm_gpusvm_range_chunk_size(gpusvm, notifier, vas,
1063 fault_addr, gpuva_start,
1064 gpuva_end,
1065 ctx->check_pages_threshold);
1066 if (chunk_size == LONG_MAX) {
1067 err = -EINVAL;
1068 goto err_notifier_remove;
1069 }
1070
1071 range = drm_gpusvm_range_alloc(gpusvm, notifier, fault_addr, chunk_size,
1072 migrate_devmem);
1073 if (IS_ERR(range)) {
1074 err = PTR_ERR(range);
1075 goto err_notifier_remove;
1076 }
1077
1078 drm_gpusvm_range_insert(notifier, range);
1079 if (notifier_alloc)
1080 drm_gpusvm_notifier_insert(gpusvm, notifier);
1081
1082 out_mmunlock:
1083 mmap_read_unlock(mm);
1084 mmput(mm);
1085
1086 return range;
1087
1088 err_notifier_remove:
1089 mmap_read_unlock(mm);
1090 if (notifier_alloc)
1091 mmu_interval_notifier_remove(¬ifier->notifier);
1092 err_notifier:
1093 if (notifier_alloc)
1094 drm_gpusvm_notifier_free(gpusvm, notifier);
1095 err_mmunlock:
1096 mmput(mm);
1097 return ERR_PTR(err);
1098 }
1099 EXPORT_SYMBOL_GPL(drm_gpusvm_range_find_or_insert);
1100
1101 /**
1102 * __drm_gpusvm_range_unmap_pages() - Unmap pages associated with a GPU SVM range (internal)
1103 * @gpusvm: Pointer to the GPU SVM structure
1104 * @range: Pointer to the GPU SVM range structure
1105 * @npages: Number of pages to unmap
1106 *
1107 * This function unmap pages associated with a GPU SVM range. Assumes and
1108 * asserts correct locking is in place when called.
1109 */
__drm_gpusvm_range_unmap_pages(struct drm_gpusvm * gpusvm,struct drm_gpusvm_range * range,unsigned long npages)1110 static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
1111 struct drm_gpusvm_range *range,
1112 unsigned long npages)
1113 {
1114 unsigned long i, j;
1115 struct drm_pagemap *dpagemap = range->dpagemap;
1116 struct device *dev = gpusvm->drm->dev;
1117
1118 lockdep_assert_held(&gpusvm->notifier_lock);
1119
1120 if (range->flags.has_dma_mapping) {
1121 struct drm_gpusvm_range_flags flags = {
1122 .__flags = range->flags.__flags,
1123 };
1124
1125 for (i = 0, j = 0; i < npages; j++) {
1126 struct drm_pagemap_device_addr *addr = &range->dma_addr[j];
1127
1128 if (addr->proto == DRM_INTERCONNECT_SYSTEM)
1129 dma_unmap_page(dev,
1130 addr->addr,
1131 PAGE_SIZE << addr->order,
1132 addr->dir);
1133 else if (dpagemap && dpagemap->ops->device_unmap)
1134 dpagemap->ops->device_unmap(dpagemap,
1135 dev, *addr);
1136 i += 1 << addr->order;
1137 }
1138
1139 /* WRITE_ONCE pairs with READ_ONCE for opportunistic checks */
1140 flags.has_devmem_pages = false;
1141 flags.has_dma_mapping = false;
1142 WRITE_ONCE(range->flags.__flags, flags.__flags);
1143
1144 range->dpagemap = NULL;
1145 }
1146 }
1147
1148 /**
1149 * drm_gpusvm_range_free_pages() - Free pages associated with a GPU SVM range
1150 * @gpusvm: Pointer to the GPU SVM structure
1151 * @range: Pointer to the GPU SVM range structure
1152 *
1153 * This function frees the dma address array associated with a GPU SVM range.
1154 */
drm_gpusvm_range_free_pages(struct drm_gpusvm * gpusvm,struct drm_gpusvm_range * range)1155 static void drm_gpusvm_range_free_pages(struct drm_gpusvm *gpusvm,
1156 struct drm_gpusvm_range *range)
1157 {
1158 lockdep_assert_held(&gpusvm->notifier_lock);
1159
1160 if (range->dma_addr) {
1161 kvfree(range->dma_addr);
1162 range->dma_addr = NULL;
1163 }
1164 }
1165
1166 /**
1167 * drm_gpusvm_range_remove() - Remove GPU SVM range
1168 * @gpusvm: Pointer to the GPU SVM structure
1169 * @range: Pointer to the GPU SVM range to be removed
1170 *
1171 * This function removes the specified GPU SVM range and also removes the parent
1172 * GPU SVM notifier if no more ranges remain in the notifier. The caller must
1173 * hold a lock to protect range and notifier removal.
1174 */
drm_gpusvm_range_remove(struct drm_gpusvm * gpusvm,struct drm_gpusvm_range * range)1175 void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
1176 struct drm_gpusvm_range *range)
1177 {
1178 unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
1179 drm_gpusvm_range_end(range));
1180 struct drm_gpusvm_notifier *notifier;
1181
1182 drm_gpusvm_driver_lock_held(gpusvm);
1183
1184 notifier = drm_gpusvm_notifier_find(gpusvm,
1185 drm_gpusvm_range_start(range));
1186 if (WARN_ON_ONCE(!notifier))
1187 return;
1188
1189 drm_gpusvm_notifier_lock(gpusvm);
1190 __drm_gpusvm_range_unmap_pages(gpusvm, range, npages);
1191 drm_gpusvm_range_free_pages(gpusvm, range);
1192 __drm_gpusvm_range_remove(notifier, range);
1193 drm_gpusvm_notifier_unlock(gpusvm);
1194
1195 drm_gpusvm_range_put(range);
1196
1197 if (RB_EMPTY_ROOT(¬ifier->root.rb_root)) {
1198 if (!notifier->flags.removed)
1199 mmu_interval_notifier_remove(¬ifier->notifier);
1200 drm_gpusvm_notifier_remove(gpusvm, notifier);
1201 drm_gpusvm_notifier_free(gpusvm, notifier);
1202 }
1203 }
1204 EXPORT_SYMBOL_GPL(drm_gpusvm_range_remove);
1205
1206 /**
1207 * drm_gpusvm_range_get() - Get a reference to GPU SVM range
1208 * @range: Pointer to the GPU SVM range
1209 *
1210 * This function increments the reference count of the specified GPU SVM range.
1211 *
1212 * Return: Pointer to the GPU SVM range.
1213 */
1214 struct drm_gpusvm_range *
drm_gpusvm_range_get(struct drm_gpusvm_range * range)1215 drm_gpusvm_range_get(struct drm_gpusvm_range *range)
1216 {
1217 kref_get(&range->refcount);
1218
1219 return range;
1220 }
1221 EXPORT_SYMBOL_GPL(drm_gpusvm_range_get);
1222
1223 /**
1224 * drm_gpusvm_range_destroy() - Destroy GPU SVM range
1225 * @refcount: Pointer to the reference counter embedded in the GPU SVM range
1226 *
1227 * This function destroys the specified GPU SVM range when its reference count
1228 * reaches zero. If a custom range-free function is provided, it is invoked to
1229 * free the range; otherwise, the range is deallocated using kfree().
1230 */
drm_gpusvm_range_destroy(struct kref * refcount)1231 static void drm_gpusvm_range_destroy(struct kref *refcount)
1232 {
1233 struct drm_gpusvm_range *range =
1234 container_of(refcount, struct drm_gpusvm_range, refcount);
1235 struct drm_gpusvm *gpusvm = range->gpusvm;
1236
1237 if (gpusvm->ops->range_free)
1238 gpusvm->ops->range_free(range);
1239 else
1240 kfree(range);
1241 }
1242
1243 /**
1244 * drm_gpusvm_range_put() - Put a reference to GPU SVM range
1245 * @range: Pointer to the GPU SVM range
1246 *
1247 * This function decrements the reference count of the specified GPU SVM range
1248 * and frees it when the count reaches zero.
1249 */
drm_gpusvm_range_put(struct drm_gpusvm_range * range)1250 void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
1251 {
1252 kref_put(&range->refcount, drm_gpusvm_range_destroy);
1253 }
1254 EXPORT_SYMBOL_GPL(drm_gpusvm_range_put);
1255
1256 /**
1257 * drm_gpusvm_range_pages_valid() - GPU SVM range pages valid
1258 * @gpusvm: Pointer to the GPU SVM structure
1259 * @range: Pointer to the GPU SVM range structure
1260 *
1261 * This function determines if a GPU SVM range pages are valid. Expected be
1262 * called holding gpusvm->notifier_lock and as the last step before committing a
1263 * GPU binding. This is akin to a notifier seqno check in the HMM documentation
1264 * but due to wider notifiers (i.e., notifiers which span multiple ranges) this
1265 * function is required for finer grained checking (i.e., per range) if pages
1266 * are valid.
1267 *
1268 * Return: True if GPU SVM range has valid pages, False otherwise
1269 */
drm_gpusvm_range_pages_valid(struct drm_gpusvm * gpusvm,struct drm_gpusvm_range * range)1270 bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
1271 struct drm_gpusvm_range *range)
1272 {
1273 lockdep_assert_held(&gpusvm->notifier_lock);
1274
1275 return range->flags.has_devmem_pages || range->flags.has_dma_mapping;
1276 }
1277 EXPORT_SYMBOL_GPL(drm_gpusvm_range_pages_valid);
1278
1279 /**
1280 * drm_gpusvm_range_pages_valid_unlocked() - GPU SVM range pages valid unlocked
1281 * @gpusvm: Pointer to the GPU SVM structure
1282 * @range: Pointer to the GPU SVM range structure
1283 *
1284 * This function determines if a GPU SVM range pages are valid. Expected be
1285 * called without holding gpusvm->notifier_lock.
1286 *
1287 * Return: True if GPU SVM range has valid pages, False otherwise
1288 */
1289 static bool
drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm * gpusvm,struct drm_gpusvm_range * range)1290 drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm *gpusvm,
1291 struct drm_gpusvm_range *range)
1292 {
1293 bool pages_valid;
1294
1295 if (!range->dma_addr)
1296 return false;
1297
1298 drm_gpusvm_notifier_lock(gpusvm);
1299 pages_valid = drm_gpusvm_range_pages_valid(gpusvm, range);
1300 if (!pages_valid)
1301 drm_gpusvm_range_free_pages(gpusvm, range);
1302 drm_gpusvm_notifier_unlock(gpusvm);
1303
1304 return pages_valid;
1305 }
1306
1307 /**
1308 * drm_gpusvm_range_get_pages() - Get pages for a GPU SVM range
1309 * @gpusvm: Pointer to the GPU SVM structure
1310 * @range: Pointer to the GPU SVM range structure
1311 * @ctx: GPU SVM context
1312 *
1313 * This function gets pages for a GPU SVM range and ensures they are mapped for
1314 * DMA access.
1315 *
1316 * Return: 0 on success, negative error code on failure.
1317 */
drm_gpusvm_range_get_pages(struct drm_gpusvm * gpusvm,struct drm_gpusvm_range * range,const struct drm_gpusvm_ctx * ctx)1318 int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
1319 struct drm_gpusvm_range *range,
1320 const struct drm_gpusvm_ctx *ctx)
1321 {
1322 struct mmu_interval_notifier *notifier = &range->notifier->notifier;
1323 struct hmm_range hmm_range = {
1324 .default_flags = HMM_PFN_REQ_FAULT | (ctx->read_only ? 0 :
1325 HMM_PFN_REQ_WRITE),
1326 .notifier = notifier,
1327 .start = drm_gpusvm_range_start(range),
1328 .end = drm_gpusvm_range_end(range),
1329 .dev_private_owner = gpusvm->device_private_page_owner,
1330 };
1331 struct mm_struct *mm = gpusvm->mm;
1332 struct drm_gpusvm_zdd *zdd;
1333 unsigned long timeout =
1334 jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
1335 unsigned long i, j;
1336 unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
1337 drm_gpusvm_range_end(range));
1338 unsigned long num_dma_mapped;
1339 unsigned int order = 0;
1340 unsigned long *pfns;
1341 int err = 0;
1342 struct dev_pagemap *pagemap;
1343 struct drm_pagemap *dpagemap;
1344 struct drm_gpusvm_range_flags flags;
1345
1346 retry:
1347 hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
1348 if (drm_gpusvm_range_pages_valid_unlocked(gpusvm, range))
1349 goto set_seqno;
1350
1351 pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
1352 if (!pfns)
1353 return -ENOMEM;
1354
1355 if (!mmget_not_zero(mm)) {
1356 err = -EFAULT;
1357 goto err_free;
1358 }
1359
1360 hmm_range.hmm_pfns = pfns;
1361 while (true) {
1362 mmap_read_lock(mm);
1363 err = hmm_range_fault(&hmm_range);
1364 mmap_read_unlock(mm);
1365
1366 if (err == -EBUSY) {
1367 if (time_after(jiffies, timeout))
1368 break;
1369
1370 hmm_range.notifier_seq =
1371 mmu_interval_read_begin(notifier);
1372 continue;
1373 }
1374 break;
1375 }
1376 mmput(mm);
1377 if (err)
1378 goto err_free;
1379
1380 map_pages:
1381 /*
1382 * Perform all dma mappings under the notifier lock to not
1383 * access freed pages. A notifier will either block on
1384 * the notifier lock or unmap dma.
1385 */
1386 drm_gpusvm_notifier_lock(gpusvm);
1387
1388 flags.__flags = range->flags.__flags;
1389 if (flags.unmapped) {
1390 drm_gpusvm_notifier_unlock(gpusvm);
1391 err = -EFAULT;
1392 goto err_free;
1393 }
1394
1395 if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
1396 drm_gpusvm_notifier_unlock(gpusvm);
1397 kvfree(pfns);
1398 goto retry;
1399 }
1400
1401 if (!range->dma_addr) {
1402 /* Unlock and restart mapping to allocate memory. */
1403 drm_gpusvm_notifier_unlock(gpusvm);
1404 range->dma_addr = kvmalloc_array(npages,
1405 sizeof(*range->dma_addr),
1406 GFP_KERNEL);
1407 if (!range->dma_addr) {
1408 err = -ENOMEM;
1409 goto err_free;
1410 }
1411 goto map_pages;
1412 }
1413
1414 zdd = NULL;
1415 num_dma_mapped = 0;
1416 for (i = 0, j = 0; i < npages; ++j) {
1417 struct page *page = hmm_pfn_to_page(pfns[i]);
1418
1419 order = hmm_pfn_to_map_order(pfns[i]);
1420 if (is_device_private_page(page) ||
1421 is_device_coherent_page(page)) {
1422 if (zdd != page->zone_device_data && i > 0) {
1423 err = -EOPNOTSUPP;
1424 goto err_unmap;
1425 }
1426 zdd = page->zone_device_data;
1427 if (pagemap != page_pgmap(page)) {
1428 if (i > 0) {
1429 err = -EOPNOTSUPP;
1430 goto err_unmap;
1431 }
1432
1433 pagemap = page_pgmap(page);
1434 dpagemap = zdd->devmem_allocation->dpagemap;
1435 if (drm_WARN_ON(gpusvm->drm, !dpagemap)) {
1436 /*
1437 * Raced. This is not supposed to happen
1438 * since hmm_range_fault() should've migrated
1439 * this page to system.
1440 */
1441 err = -EAGAIN;
1442 goto err_unmap;
1443 }
1444 }
1445 range->dma_addr[j] =
1446 dpagemap->ops->device_map(dpagemap,
1447 gpusvm->drm->dev,
1448 page, order,
1449 DMA_BIDIRECTIONAL);
1450 if (dma_mapping_error(gpusvm->drm->dev,
1451 range->dma_addr[j].addr)) {
1452 err = -EFAULT;
1453 goto err_unmap;
1454 }
1455 } else {
1456 dma_addr_t addr;
1457
1458 if (is_zone_device_page(page) || zdd) {
1459 err = -EOPNOTSUPP;
1460 goto err_unmap;
1461 }
1462
1463 if (ctx->devmem_only) {
1464 err = -EFAULT;
1465 goto err_unmap;
1466 }
1467
1468 addr = dma_map_page(gpusvm->drm->dev,
1469 page, 0,
1470 PAGE_SIZE << order,
1471 DMA_BIDIRECTIONAL);
1472 if (dma_mapping_error(gpusvm->drm->dev, addr)) {
1473 err = -EFAULT;
1474 goto err_unmap;
1475 }
1476
1477 range->dma_addr[j] = drm_pagemap_device_addr_encode
1478 (addr, DRM_INTERCONNECT_SYSTEM, order,
1479 DMA_BIDIRECTIONAL);
1480 }
1481 i += 1 << order;
1482 num_dma_mapped = i;
1483 flags.has_dma_mapping = true;
1484 }
1485
1486 if (zdd) {
1487 flags.has_devmem_pages = true;
1488 range->dpagemap = dpagemap;
1489 }
1490
1491 /* WRITE_ONCE pairs with READ_ONCE for opportunistic checks */
1492 WRITE_ONCE(range->flags.__flags, flags.__flags);
1493
1494 drm_gpusvm_notifier_unlock(gpusvm);
1495 kvfree(pfns);
1496 set_seqno:
1497 range->notifier_seq = hmm_range.notifier_seq;
1498
1499 return 0;
1500
1501 err_unmap:
1502 __drm_gpusvm_range_unmap_pages(gpusvm, range, num_dma_mapped);
1503 drm_gpusvm_notifier_unlock(gpusvm);
1504 err_free:
1505 kvfree(pfns);
1506 if (err == -EAGAIN)
1507 goto retry;
1508 return err;
1509 }
1510 EXPORT_SYMBOL_GPL(drm_gpusvm_range_get_pages);
1511
1512 /**
1513 * drm_gpusvm_range_unmap_pages() - Unmap pages associated with a GPU SVM range
1514 * @gpusvm: Pointer to the GPU SVM structure
1515 * @range: Pointer to the GPU SVM range structure
1516 * @ctx: GPU SVM context
1517 *
1518 * This function unmaps pages associated with a GPU SVM range. If @in_notifier
1519 * is set, it is assumed that gpusvm->notifier_lock is held in write mode; if it
1520 * is clear, it acquires gpusvm->notifier_lock in read mode. Must be called on
1521 * each GPU SVM range attached to notifier in gpusvm->ops->invalidate for IOMMU
1522 * security model.
1523 */
drm_gpusvm_range_unmap_pages(struct drm_gpusvm * gpusvm,struct drm_gpusvm_range * range,const struct drm_gpusvm_ctx * ctx)1524 void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
1525 struct drm_gpusvm_range *range,
1526 const struct drm_gpusvm_ctx *ctx)
1527 {
1528 unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
1529 drm_gpusvm_range_end(range));
1530
1531 if (ctx->in_notifier)
1532 lockdep_assert_held_write(&gpusvm->notifier_lock);
1533 else
1534 drm_gpusvm_notifier_lock(gpusvm);
1535
1536 __drm_gpusvm_range_unmap_pages(gpusvm, range, npages);
1537
1538 if (!ctx->in_notifier)
1539 drm_gpusvm_notifier_unlock(gpusvm);
1540 }
1541 EXPORT_SYMBOL_GPL(drm_gpusvm_range_unmap_pages);
1542
/**
 * drm_gpusvm_migration_unlock_put_page() - Put a migration page
 * @page: Pointer to the page to put
 *
 * This function unlocks and puts a page, in that order.
 */
static void drm_gpusvm_migration_unlock_put_page(struct page *page)
{
	unlock_page(page);
	put_page(page);
}
1554
/**
 * drm_gpusvm_migration_unlock_put_pages() - Put migration pages
 * @npages: Number of pages
 * @migrate_pfn: Array of migrate page frame numbers
 *
 * This function unlocks and puts an array of pages. Zero entries are skipped
 * and every processed entry is reset to zero, so the array no longer refers to
 * the released pages afterwards.
 */
static void drm_gpusvm_migration_unlock_put_pages(unsigned long npages,
						  unsigned long *migrate_pfn)
{
	unsigned long i;

	for (i = 0; i < npages; ++i) {
		struct page *page;

		if (!migrate_pfn[i])
			continue;

		page = migrate_pfn_to_page(migrate_pfn[i]);
		drm_gpusvm_migration_unlock_put_page(page);
		migrate_pfn[i] = 0;
	}
}
1578
1579 /**
1580 * drm_gpusvm_get_devmem_page() - Get a reference to a device memory page
1581 * @page: Pointer to the page
1582 * @zdd: Pointer to the GPU SVM zone device data
1583 *
1584 * This function associates the given page with the specified GPU SVM zone
1585 * device data and initializes it for zone device usage.
1586 */
drm_gpusvm_get_devmem_page(struct page * page,struct drm_gpusvm_zdd * zdd)1587 static void drm_gpusvm_get_devmem_page(struct page *page,
1588 struct drm_gpusvm_zdd *zdd)
1589 {
1590 page->zone_device_data = drm_gpusvm_zdd_get(zdd);
1591 zone_device_page_init(page);
1592 }
1593
1594 /**
1595 * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM migration
1596 * @dev: The device for which the pages are being mapped
1597 * @dma_addr: Array to store DMA addresses corresponding to mapped pages
1598 * @migrate_pfn: Array of migrate page frame numbers to map
1599 * @npages: Number of pages to map
1600 * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
1601 *
1602 * This function maps pages of memory for migration usage in GPU SVM. It
1603 * iterates over each page frame number provided in @migrate_pfn, maps the
1604 * corresponding page, and stores the DMA address in the provided @dma_addr
1605 * array.
1606 *
1607 * Return: 0 on success, -EFAULT if an error occurs during mapping.
1608 */
drm_gpusvm_migrate_map_pages(struct device * dev,dma_addr_t * dma_addr,unsigned long * migrate_pfn,unsigned long npages,enum dma_data_direction dir)1609 static int drm_gpusvm_migrate_map_pages(struct device *dev,
1610 dma_addr_t *dma_addr,
1611 unsigned long *migrate_pfn,
1612 unsigned long npages,
1613 enum dma_data_direction dir)
1614 {
1615 unsigned long i;
1616
1617 for (i = 0; i < npages; ++i) {
1618 struct page *page = migrate_pfn_to_page(migrate_pfn[i]);
1619
1620 if (!page)
1621 continue;
1622
1623 if (WARN_ON_ONCE(is_zone_device_page(page)))
1624 return -EFAULT;
1625
1626 dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
1627 if (dma_mapping_error(dev, dma_addr[i]))
1628 return -EFAULT;
1629 }
1630
1631 return 0;
1632 }
1633
1634 /**
1635 * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped for GPU SVM migration
1636 * @dev: The device for which the pages were mapped
1637 * @dma_addr: Array of DMA addresses corresponding to mapped pages
1638 * @npages: Number of pages to unmap
1639 * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
1640 *
1641 * This function unmaps previously mapped pages of memory for GPU Shared Virtual
1642 * Memory (SVM). It iterates over each DMA address provided in @dma_addr, checks
1643 * if it's valid and not already unmapped, and unmaps the corresponding page.
1644 */
drm_gpusvm_migrate_unmap_pages(struct device * dev,dma_addr_t * dma_addr,unsigned long npages,enum dma_data_direction dir)1645 static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
1646 dma_addr_t *dma_addr,
1647 unsigned long npages,
1648 enum dma_data_direction dir)
1649 {
1650 unsigned long i;
1651
1652 for (i = 0; i < npages; ++i) {
1653 if (!dma_addr[i] || dma_mapping_error(dev, dma_addr[i]))
1654 continue;
1655
1656 dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
1657 }
1658 }
1659
1660 /**
1661 * drm_gpusvm_migrate_to_devmem() - Migrate GPU SVM range to device memory
1662 * @gpusvm: Pointer to the GPU SVM structure
1663 * @range: Pointer to the GPU SVM range structure
1664 * @devmem_allocation: Pointer to the device memory allocation. The caller
1665 * should hold a reference to the device memory allocation,
1666 * which should be dropped via ops->devmem_release or upon
1667 * the failure of this function.
1668 * @ctx: GPU SVM context
1669 *
1670 * This function migrates the specified GPU SVM range to device memory. It
1671 * performs the necessary setup and invokes the driver-specific operations for
1672 * migration to device memory. Upon successful return, @devmem_allocation can
1673 * safely reference @range until ops->devmem_release is called which only upon
1674 * successful return. Expected to be called while holding the mmap lock in read
1675 * mode.
1676 *
1677 * Return: 0 on success, negative error code on failure.
1678 */
drm_gpusvm_migrate_to_devmem(struct drm_gpusvm * gpusvm,struct drm_gpusvm_range * range,struct drm_gpusvm_devmem * devmem_allocation,const struct drm_gpusvm_ctx * ctx)1679 int drm_gpusvm_migrate_to_devmem(struct drm_gpusvm *gpusvm,
1680 struct drm_gpusvm_range *range,
1681 struct drm_gpusvm_devmem *devmem_allocation,
1682 const struct drm_gpusvm_ctx *ctx)
1683 {
1684 const struct drm_gpusvm_devmem_ops *ops = devmem_allocation->ops;
1685 unsigned long start = drm_gpusvm_range_start(range),
1686 end = drm_gpusvm_range_end(range);
1687 struct migrate_vma migrate = {
1688 .start = start,
1689 .end = end,
1690 .pgmap_owner = gpusvm->device_private_page_owner,
1691 .flags = MIGRATE_VMA_SELECT_SYSTEM,
1692 };
1693 struct mm_struct *mm = gpusvm->mm;
1694 unsigned long i, npages = npages_in_range(start, end);
1695 struct vm_area_struct *vas;
1696 struct drm_gpusvm_zdd *zdd = NULL;
1697 struct page **pages;
1698 dma_addr_t *dma_addr;
1699 void *buf;
1700 int err;
1701
1702 mmap_assert_locked(gpusvm->mm);
1703
1704 if (!range->flags.migrate_devmem)
1705 return -EINVAL;
1706
1707 if (!ops->populate_devmem_pfn || !ops->copy_to_devmem ||
1708 !ops->copy_to_ram)
1709 return -EOPNOTSUPP;
1710
1711 vas = vma_lookup(mm, start);
1712 if (!vas) {
1713 err = -ENOENT;
1714 goto err_out;
1715 }
1716
1717 if (end > vas->vm_end || start < vas->vm_start) {
1718 err = -EINVAL;
1719 goto err_out;
1720 }
1721
1722 if (!vma_is_anonymous(vas)) {
1723 err = -EBUSY;
1724 goto err_out;
1725 }
1726
1727 buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
1728 sizeof(*pages), GFP_KERNEL);
1729 if (!buf) {
1730 err = -ENOMEM;
1731 goto err_out;
1732 }
1733 dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
1734 pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
1735
1736 zdd = drm_gpusvm_zdd_alloc(gpusvm->device_private_page_owner);
1737 if (!zdd) {
1738 err = -ENOMEM;
1739 goto err_free;
1740 }
1741
1742 migrate.vma = vas;
1743 migrate.src = buf;
1744 migrate.dst = migrate.src + npages;
1745
1746 err = migrate_vma_setup(&migrate);
1747 if (err)
1748 goto err_free;
1749
1750 if (!migrate.cpages) {
1751 err = -EFAULT;
1752 goto err_free;
1753 }
1754
1755 if (migrate.cpages != npages) {
1756 err = -EBUSY;
1757 goto err_finalize;
1758 }
1759
1760 err = ops->populate_devmem_pfn(devmem_allocation, npages, migrate.dst);
1761 if (err)
1762 goto err_finalize;
1763
1764 err = drm_gpusvm_migrate_map_pages(devmem_allocation->dev, dma_addr,
1765 migrate.src, npages, DMA_TO_DEVICE);
1766 if (err)
1767 goto err_finalize;
1768
1769 for (i = 0; i < npages; ++i) {
1770 struct page *page = pfn_to_page(migrate.dst[i]);
1771
1772 pages[i] = page;
1773 migrate.dst[i] = migrate_pfn(migrate.dst[i]);
1774 drm_gpusvm_get_devmem_page(page, zdd);
1775 }
1776
1777 err = ops->copy_to_devmem(pages, dma_addr, npages);
1778 if (err)
1779 goto err_finalize;
1780
1781 /* Upon success bind devmem allocation to range and zdd */
1782 devmem_allocation->timeslice_expiration = get_jiffies_64() +
1783 msecs_to_jiffies(ctx->timeslice_ms);
1784 zdd->devmem_allocation = devmem_allocation; /* Owns ref */
1785
1786 err_finalize:
1787 if (err)
1788 drm_gpusvm_migration_unlock_put_pages(npages, migrate.dst);
1789 migrate_vma_pages(&migrate);
1790 migrate_vma_finalize(&migrate);
1791 drm_gpusvm_migrate_unmap_pages(devmem_allocation->dev, dma_addr, npages,
1792 DMA_TO_DEVICE);
1793 err_free:
1794 if (zdd)
1795 drm_gpusvm_zdd_put(zdd);
1796 kvfree(buf);
1797 err_out:
1798 return err;
1799 }
1800 EXPORT_SYMBOL_GPL(drm_gpusvm_migrate_to_devmem);
1801
1802 /**
1803 * drm_gpusvm_migrate_populate_ram_pfn() - Populate RAM PFNs for a VM area
1804 * @vas: Pointer to the VM area structure, can be NULL
1805 * @fault_page: Fault page
1806 * @npages: Number of pages to populate
1807 * @mpages: Number of pages to migrate
1808 * @src_mpfn: Source array of migrate PFNs
1809 * @mpfn: Array of migrate PFNs to populate
1810 * @addr: Start address for PFN allocation
1811 *
1812 * This function populates the RAM migrate page frame numbers (PFNs) for the
1813 * specified VM area structure. It allocates and locks pages in the VM area for
1814 * RAM usage. If vas is non-NULL use alloc_page_vma for allocation, if NULL use
1815 * alloc_page for allocation.
1816 *
1817 * Return: 0 on success, negative error code on failure.
1818 */
drm_gpusvm_migrate_populate_ram_pfn(struct vm_area_struct * vas,struct page * fault_page,unsigned long npages,unsigned long * mpages,unsigned long * src_mpfn,unsigned long * mpfn,unsigned long addr)1819 static int drm_gpusvm_migrate_populate_ram_pfn(struct vm_area_struct *vas,
1820 struct page *fault_page,
1821 unsigned long npages,
1822 unsigned long *mpages,
1823 unsigned long *src_mpfn,
1824 unsigned long *mpfn,
1825 unsigned long addr)
1826 {
1827 unsigned long i;
1828
1829 for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
1830 struct page *page, *src_page;
1831
1832 if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
1833 continue;
1834
1835 src_page = migrate_pfn_to_page(src_mpfn[i]);
1836 if (!src_page)
1837 continue;
1838
1839 if (fault_page) {
1840 if (src_page->zone_device_data !=
1841 fault_page->zone_device_data)
1842 continue;
1843 }
1844
1845 if (vas)
1846 page = alloc_page_vma(GFP_HIGHUSER, vas, addr);
1847 else
1848 page = alloc_page(GFP_HIGHUSER);
1849
1850 if (!page)
1851 goto free_pages;
1852
1853 mpfn[i] = migrate_pfn(page_to_pfn(page));
1854 }
1855
1856 for (i = 0; i < npages; ++i) {
1857 struct page *page = migrate_pfn_to_page(mpfn[i]);
1858
1859 if (!page)
1860 continue;
1861
1862 WARN_ON_ONCE(!trylock_page(page));
1863 ++*mpages;
1864 }
1865
1866 return 0;
1867
1868 free_pages:
1869 for (i = 0; i < npages; ++i) {
1870 struct page *page = migrate_pfn_to_page(mpfn[i]);
1871
1872 if (!page)
1873 continue;
1874
1875 put_page(page);
1876 mpfn[i] = 0;
1877 }
1878 return -ENOMEM;
1879 }
1880
1881 /**
1882 * drm_gpusvm_evict_to_ram() - Evict GPU SVM range to RAM
1883 * @devmem_allocation: Pointer to the device memory allocation
1884 *
1885 * Similar to __drm_gpusvm_migrate_to_ram but does not require mmap lock and
1886 * migration done via migrate_device_* functions.
1887 *
1888 * Return: 0 on success, negative error code on failure.
1889 */
drm_gpusvm_evict_to_ram(struct drm_gpusvm_devmem * devmem_allocation)1890 int drm_gpusvm_evict_to_ram(struct drm_gpusvm_devmem *devmem_allocation)
1891 {
1892 const struct drm_gpusvm_devmem_ops *ops = devmem_allocation->ops;
1893 unsigned long npages, mpages = 0;
1894 struct page **pages;
1895 unsigned long *src, *dst;
1896 dma_addr_t *dma_addr;
1897 void *buf;
1898 int i, err = 0;
1899 unsigned int retry_count = 2;
1900
1901 npages = devmem_allocation->size >> PAGE_SHIFT;
1902
1903 retry:
1904 if (!mmget_not_zero(devmem_allocation->mm))
1905 return -EFAULT;
1906
1907 buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
1908 sizeof(*pages), GFP_KERNEL);
1909 if (!buf) {
1910 err = -ENOMEM;
1911 goto err_out;
1912 }
1913 src = buf;
1914 dst = buf + (sizeof(*src) * npages);
1915 dma_addr = buf + (2 * sizeof(*src) * npages);
1916 pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) * npages;
1917
1918 err = ops->populate_devmem_pfn(devmem_allocation, npages, src);
1919 if (err)
1920 goto err_free;
1921
1922 err = migrate_device_pfns(src, npages);
1923 if (err)
1924 goto err_free;
1925
1926 err = drm_gpusvm_migrate_populate_ram_pfn(NULL, NULL, npages, &mpages,
1927 src, dst, 0);
1928 if (err || !mpages)
1929 goto err_finalize;
1930
1931 err = drm_gpusvm_migrate_map_pages(devmem_allocation->dev, dma_addr,
1932 dst, npages, DMA_FROM_DEVICE);
1933 if (err)
1934 goto err_finalize;
1935
1936 for (i = 0; i < npages; ++i)
1937 pages[i] = migrate_pfn_to_page(src[i]);
1938
1939 err = ops->copy_to_ram(pages, dma_addr, npages);
1940 if (err)
1941 goto err_finalize;
1942
1943 err_finalize:
1944 if (err)
1945 drm_gpusvm_migration_unlock_put_pages(npages, dst);
1946 migrate_device_pages(src, dst, npages);
1947 migrate_device_finalize(src, dst, npages);
1948 drm_gpusvm_migrate_unmap_pages(devmem_allocation->dev, dma_addr, npages,
1949 DMA_FROM_DEVICE);
1950 err_free:
1951 kvfree(buf);
1952 err_out:
1953 mmput_async(devmem_allocation->mm);
1954
1955 if (completion_done(&devmem_allocation->detached))
1956 return 0;
1957
1958 if (retry_count--) {
1959 cond_resched();
1960 goto retry;
1961 }
1962
1963 return err ?: -EBUSY;
1964 }
1965 EXPORT_SYMBOL_GPL(drm_gpusvm_evict_to_ram);
1966
1967 /**
1968 * __drm_gpusvm_migrate_to_ram() - Migrate GPU SVM range to RAM (internal)
1969 * @vas: Pointer to the VM area structure
1970 * @device_private_page_owner: Device private pages owner
1971 * @page: Pointer to the page for fault handling (can be NULL)
1972 * @fault_addr: Fault address
1973 * @size: Size of migration
1974 *
1975 * This internal function performs the migration of the specified GPU SVM range
1976 * to RAM. It sets up the migration, populates + dma maps RAM PFNs, and
1977 * invokes the driver-specific operations for migration to RAM.
1978 *
1979 * Return: 0 on success, negative error code on failure.
1980 */
__drm_gpusvm_migrate_to_ram(struct vm_area_struct * vas,void * device_private_page_owner,struct page * page,unsigned long fault_addr,unsigned long size)1981 static int __drm_gpusvm_migrate_to_ram(struct vm_area_struct *vas,
1982 void *device_private_page_owner,
1983 struct page *page,
1984 unsigned long fault_addr,
1985 unsigned long size)
1986 {
1987 struct migrate_vma migrate = {
1988 .vma = vas,
1989 .pgmap_owner = device_private_page_owner,
1990 .flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE |
1991 MIGRATE_VMA_SELECT_DEVICE_COHERENT,
1992 .fault_page = page,
1993 };
1994 struct drm_gpusvm_zdd *zdd;
1995 const struct drm_gpusvm_devmem_ops *ops;
1996 struct device *dev = NULL;
1997 unsigned long npages, mpages = 0;
1998 struct page **pages;
1999 dma_addr_t *dma_addr;
2000 unsigned long start, end;
2001 void *buf;
2002 int i, err = 0;
2003
2004 if (page) {
2005 zdd = page->zone_device_data;
2006 if (time_before64(get_jiffies_64(),
2007 zdd->devmem_allocation->timeslice_expiration))
2008 return 0;
2009 }
2010
2011 start = ALIGN_DOWN(fault_addr, size);
2012 end = ALIGN(fault_addr + 1, size);
2013
2014 /* Corner where VMA area struct has been partially unmapped */
2015 if (start < vas->vm_start)
2016 start = vas->vm_start;
2017 if (end > vas->vm_end)
2018 end = vas->vm_end;
2019
2020 migrate.start = start;
2021 migrate.end = end;
2022 npages = npages_in_range(start, end);
2023
2024 buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
2025 sizeof(*pages), GFP_KERNEL);
2026 if (!buf) {
2027 err = -ENOMEM;
2028 goto err_out;
2029 }
2030 dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
2031 pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
2032
2033 migrate.vma = vas;
2034 migrate.src = buf;
2035 migrate.dst = migrate.src + npages;
2036
2037 err = migrate_vma_setup(&migrate);
2038 if (err)
2039 goto err_free;
2040
2041 /* Raced with another CPU fault, nothing to do */
2042 if (!migrate.cpages)
2043 goto err_free;
2044
2045 if (!page) {
2046 for (i = 0; i < npages; ++i) {
2047 if (!(migrate.src[i] & MIGRATE_PFN_MIGRATE))
2048 continue;
2049
2050 page = migrate_pfn_to_page(migrate.src[i]);
2051 break;
2052 }
2053
2054 if (!page)
2055 goto err_finalize;
2056 }
2057 zdd = page->zone_device_data;
2058 ops = zdd->devmem_allocation->ops;
2059 dev = zdd->devmem_allocation->dev;
2060
2061 err = drm_gpusvm_migrate_populate_ram_pfn(vas, page, npages, &mpages,
2062 migrate.src, migrate.dst,
2063 start);
2064 if (err)
2065 goto err_finalize;
2066
2067 err = drm_gpusvm_migrate_map_pages(dev, dma_addr, migrate.dst, npages,
2068 DMA_FROM_DEVICE);
2069 if (err)
2070 goto err_finalize;
2071
2072 for (i = 0; i < npages; ++i)
2073 pages[i] = migrate_pfn_to_page(migrate.src[i]);
2074
2075 err = ops->copy_to_ram(pages, dma_addr, npages);
2076 if (err)
2077 goto err_finalize;
2078
2079 err_finalize:
2080 if (err)
2081 drm_gpusvm_migration_unlock_put_pages(npages, migrate.dst);
2082 migrate_vma_pages(&migrate);
2083 migrate_vma_finalize(&migrate);
2084 if (dev)
2085 drm_gpusvm_migrate_unmap_pages(dev, dma_addr, npages,
2086 DMA_FROM_DEVICE);
2087 err_free:
2088 kvfree(buf);
2089 err_out:
2090
2091 return err;
2092 }
2093
2094 /**
2095 * drm_gpusvm_range_evict - Evict GPU SVM range
2096 * @range: Pointer to the GPU SVM range to be removed
2097 *
2098 * This function evicts the specified GPU SVM range. This function will not
2099 * evict coherent pages.
2100 *
2101 * Return: 0 on success, a negative error code on failure.
2102 */
drm_gpusvm_range_evict(struct drm_gpusvm * gpusvm,struct drm_gpusvm_range * range)2103 int drm_gpusvm_range_evict(struct drm_gpusvm *gpusvm,
2104 struct drm_gpusvm_range *range)
2105 {
2106 struct mmu_interval_notifier *notifier = &range->notifier->notifier;
2107 struct hmm_range hmm_range = {
2108 .default_flags = HMM_PFN_REQ_FAULT,
2109 .notifier = notifier,
2110 .start = drm_gpusvm_range_start(range),
2111 .end = drm_gpusvm_range_end(range),
2112 .dev_private_owner = NULL,
2113 };
2114 unsigned long timeout =
2115 jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
2116 unsigned long *pfns;
2117 unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
2118 drm_gpusvm_range_end(range));
2119 int err = 0;
2120 struct mm_struct *mm = gpusvm->mm;
2121
2122 if (!mmget_not_zero(mm))
2123 return -EFAULT;
2124
2125 pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
2126 if (!pfns)
2127 return -ENOMEM;
2128
2129 hmm_range.hmm_pfns = pfns;
2130 while (!time_after(jiffies, timeout)) {
2131 hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
2132 if (time_after(jiffies, timeout)) {
2133 err = -ETIME;
2134 break;
2135 }
2136
2137 mmap_read_lock(mm);
2138 err = hmm_range_fault(&hmm_range);
2139 mmap_read_unlock(mm);
2140 if (err != -EBUSY)
2141 break;
2142 }
2143
2144 kvfree(pfns);
2145 mmput(mm);
2146
2147 return err;
2148 }
2149 EXPORT_SYMBOL_GPL(drm_gpusvm_range_evict);
2150
2151 /**
2152 * drm_gpusvm_page_free() - Put GPU SVM zone device data associated with a page
2153 * @page: Pointer to the page
2154 *
2155 * This function is a callback used to put the GPU SVM zone device data
2156 * associated with a page when it is being released.
2157 */
drm_gpusvm_page_free(struct page * page)2158 static void drm_gpusvm_page_free(struct page *page)
2159 {
2160 drm_gpusvm_zdd_put(page->zone_device_data);
2161 }
2162
2163 /**
2164 * drm_gpusvm_migrate_to_ram() - Migrate GPU SVM range to RAM (page fault handler)
2165 * @vmf: Pointer to the fault information structure
2166 *
2167 * This function is a page fault handler used to migrate a GPU SVM range to RAM.
2168 * It retrieves the GPU SVM range information from the faulting page and invokes
2169 * the internal migration function to migrate the range back to RAM.
2170 *
2171 * Return: VM_FAULT_SIGBUS on failure, 0 on success.
2172 */
drm_gpusvm_migrate_to_ram(struct vm_fault * vmf)2173 static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
2174 {
2175 struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
2176 int err;
2177
2178 err = __drm_gpusvm_migrate_to_ram(vmf->vma,
2179 zdd->device_private_page_owner,
2180 vmf->page, vmf->address,
2181 zdd->devmem_allocation->size);
2182
2183 return err ? VM_FAULT_SIGBUS : 0;
2184 }
2185
2186 /*
2187 * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
2188 */
2189 static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
2190 .page_free = drm_gpusvm_page_free,
2191 .migrate_to_ram = drm_gpusvm_migrate_to_ram,
2192 };
2193
2194 /**
2195 * drm_gpusvm_pagemap_ops_get() - Retrieve GPU SVM device page map operations
2196 *
2197 * Return: Pointer to the GPU SVM device page map operations structure.
2198 */
drm_gpusvm_pagemap_ops_get(void)2199 const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
2200 {
2201 return &drm_gpusvm_pagemap_ops;
2202 }
2203 EXPORT_SYMBOL_GPL(drm_gpusvm_pagemap_ops_get);
2204
2205 /**
2206 * drm_gpusvm_has_mapping() - Check if GPU SVM has mapping for the given address range
2207 * @gpusvm: Pointer to the GPU SVM structure.
2208 * @start: Start address
2209 * @end: End address
2210 *
2211 * Return: True if GPU SVM has mapping, False otherwise
2212 */
drm_gpusvm_has_mapping(struct drm_gpusvm * gpusvm,unsigned long start,unsigned long end)2213 bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, unsigned long start,
2214 unsigned long end)
2215 {
2216 struct drm_gpusvm_notifier *notifier;
2217
2218 drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
2219 struct drm_gpusvm_range *range = NULL;
2220
2221 drm_gpusvm_for_each_range(range, notifier, start, end)
2222 return true;
2223 }
2224
2225 return false;
2226 }
2227 EXPORT_SYMBOL_GPL(drm_gpusvm_has_mapping);
2228
2229 /**
2230 * drm_gpusvm_range_set_unmapped() - Mark a GPU SVM range as unmapped
2231 * @range: Pointer to the GPU SVM range structure.
2232 * @mmu_range: Pointer to the MMU notifier range structure.
2233 *
2234 * This function marks a GPU SVM range as unmapped and sets the partial_unmap flag
2235 * if the range partially falls within the provided MMU notifier range.
2236 */
drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range * range,const struct mmu_notifier_range * mmu_range)2237 void drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
2238 const struct mmu_notifier_range *mmu_range)
2239 {
2240 lockdep_assert_held_write(&range->gpusvm->notifier_lock);
2241
2242 range->flags.unmapped = true;
2243 if (drm_gpusvm_range_start(range) < mmu_range->start ||
2244 drm_gpusvm_range_end(range) > mmu_range->end)
2245 range->flags.partial_unmap = true;
2246 }
2247 EXPORT_SYMBOL_GPL(drm_gpusvm_range_set_unmapped);
2248
2249 /**
2250 * drm_gpusvm_devmem_init() - Initialize a GPU SVM device memory allocation
2251 *
2252 * @dev: Pointer to the device structure which device memory allocation belongs to
2253 * @mm: Pointer to the mm_struct for the address space
2254 * @ops: Pointer to the operations structure for GPU SVM device memory
2255 * @dpagemap: The struct drm_pagemap we're allocating from.
2256 * @size: Size of device memory allocation
2257 */
drm_gpusvm_devmem_init(struct drm_gpusvm_devmem * devmem_allocation,struct device * dev,struct mm_struct * mm,const struct drm_gpusvm_devmem_ops * ops,struct drm_pagemap * dpagemap,size_t size)2258 void drm_gpusvm_devmem_init(struct drm_gpusvm_devmem *devmem_allocation,
2259 struct device *dev, struct mm_struct *mm,
2260 const struct drm_gpusvm_devmem_ops *ops,
2261 struct drm_pagemap *dpagemap, size_t size)
2262 {
2263 init_completion(&devmem_allocation->detached);
2264 devmem_allocation->dev = dev;
2265 devmem_allocation->mm = mm;
2266 devmem_allocation->ops = ops;
2267 devmem_allocation->dpagemap = dpagemap;
2268 devmem_allocation->size = size;
2269 }
2270 EXPORT_SYMBOL_GPL(drm_gpusvm_devmem_init);
2271
2272 MODULE_DESCRIPTION("DRM GPUSVM");
2273 MODULE_LICENSE("GPL");
2274