1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
3 */
4 #include <linux/iommu.h>
5 #include <linux/iommufd.h>
6 #include <linux/pci-ats.h>
7 #include <linux/slab.h>
8 #include <uapi/linux/iommufd.h>
9
10 #include "../iommu-priv.h"
11 #include "io_pagetable.h"
12 #include "iommufd_private.h"
13
14 static bool allow_unsafe_interrupts;
15 module_param(allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
16 MODULE_PARM_DESC(
17 allow_unsafe_interrupts,
18 "Allow IOMMUFD to bind to devices even if the platform cannot isolate "
19 "the MSI interrupt window. Enabling this is a security weakness.");
20
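/*
 * Per-pasid attachment state for an iommufd_group: the hw_pagetable attached
 * for that pasid and the attached devices, indexed by iommufd object ID.
 */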
21 struct iommufd_attach {
22 struct iommufd_hw_pagetable *hwpt;
23 struct xarray device_array;
24 };
25
26 static void iommufd_group_release(struct kref *kref)
27 {
28 struct iommufd_group *igroup =
29 container_of(kref, struct iommufd_group, ref);
30
31 WARN_ON(!xa_empty(&igroup->pasid_attach));
32
33 xa_cmpxchg(&igroup->ictx->groups, iommu_group_id(igroup->group), igroup,
34 NULL, GFP_KERNEL);
35 iommu_group_put(igroup->group);
36 mutex_destroy(&igroup->lock);
37 kfree(igroup);
38 }
39
40 static void iommufd_put_group(struct iommufd_group *group)
41 {
42 kref_put(&group->ref, iommufd_group_release);
43 }
44
45 static bool iommufd_group_try_get(struct iommufd_group *igroup,
46 struct iommu_group *group)
47 {
48 if (!igroup)
49 return false;
50 /*
51 * Group IDs cannot be reused until the group is put back, which does
52 * not happen while we can still get an igroup pointer under the xa_lock.
53 */
54 if (WARN_ON(igroup->group != group))
55 return false;
56 return kref_get_unless_zero(&igroup->ref);
57 }
58
59 /*
60 * iommufd needs to store some more data for each iommu_group, we keep a
61 * parallel xarray indexed by iommu_group id to hold this instead of putting it
62 * in the core structure. To keep things simple the iommufd_group memory is
63 * unique within the iommufd_ctx. This makes it easy to check there are no
64 * memory leaks.
65 */
66 static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx,
67 struct device *dev)
68 {
69 struct iommufd_group *new_igroup;
70 struct iommufd_group *cur_igroup;
71 struct iommufd_group *igroup;
72 struct iommu_group *group;
73 unsigned int id;
74
75 group = iommu_group_get(dev);
76 if (!group)
77 return ERR_PTR(-ENODEV);
78
79 id = iommu_group_id(group);
80
81 xa_lock(&ictx->groups);
82 igroup = xa_load(&ictx->groups, id);
83 if (iommufd_group_try_get(igroup, group)) {
84 xa_unlock(&ictx->groups);
85 iommu_group_put(group);
86 return igroup;
87 }
88 xa_unlock(&ictx->groups);
89
90 new_igroup = kzalloc(sizeof(*new_igroup), GFP_KERNEL);
91 if (!new_igroup) {
92 iommu_group_put(group);
93 return ERR_PTR(-ENOMEM);
94 }
95
96 kref_init(&new_igroup->ref);
97 mutex_init(&new_igroup->lock);
98 xa_init(&new_igroup->pasid_attach);
99 new_igroup->sw_msi_start = PHYS_ADDR_MAX;
100 /* group reference moves into new_igroup */
101 new_igroup->group = group;
102
103 /*
104 * The ictx is not additionally refcounted here because all objects using
105 * an igroup must put it before their destroy completes.
106 */
107 new_igroup->ictx = ictx;
108
109 /*
110 * We dropped the lock so igroup is invalid. NULL is a safe and likely
111 * value to assume for the xa_cmpxchg algorithm.
112 */
113 cur_igroup = NULL;
114 xa_lock(&ictx->groups);
115 while (true) {
116 igroup = __xa_cmpxchg(&ictx->groups, id, cur_igroup, new_igroup,
117 GFP_KERNEL);
118 if (xa_is_err(igroup)) {
119 xa_unlock(&ictx->groups);
120 iommufd_put_group(new_igroup);
121 return ERR_PTR(xa_err(igroup));
122 }
123
124 /* new_igroup was successfully installed */
125 if (cur_igroup == igroup) {
126 xa_unlock(&ictx->groups);
127 return new_igroup;
128 }
129
130 /* Check again if the current group is any good */
131 if (iommufd_group_try_get(igroup, group)) {
132 xa_unlock(&ictx->groups);
133 iommufd_put_group(new_igroup);
134 return igroup;
135 }
136 cur_igroup = igroup;
137 }
138 }
139
140 static void iommufd_device_remove_vdev(struct iommufd_device *idev)
141 {
142 struct iommufd_vdevice *vdev;
143
144 mutex_lock(&idev->igroup->lock);
145 /* prevent new references from vdev */
146 idev->destroying = true;
147 /* vdev has been completely destroyed by userspace */
148 if (!idev->vdev)
149 goto out_unlock;
150
151 vdev = iommufd_get_vdevice(idev->ictx, idev->vdev->obj.id);
152 /*
153 * An ongoing vdev destroy ioctl has removed the vdev from the object
154 * xarray, but has not finished iommufd_vdevice_destroy() yet as it
155 * needs the same mutex. We drop the lock and then wait on the wait_cnt
156 * reference for the vdev destruction.
157 */
158 if (IS_ERR(vdev))
159 goto out_unlock;
160
161 /* Should never happen */
162 if (WARN_ON(vdev != idev->vdev)) {
163 iommufd_put_object(idev->ictx, &vdev->obj);
164 goto out_unlock;
165 }
166
167 /*
168 * vdev is still alive. Hold a users refcount to prevent racing with
169 * userspace destruction, then use iommufd_object_tombstone_user() to
170 * destroy it and leave a tombstone.
171 */
172 refcount_inc(&vdev->obj.users);
173 iommufd_put_object(idev->ictx, &vdev->obj);
174 mutex_unlock(&idev->igroup->lock);
175 iommufd_object_tombstone_user(idev->ictx, &vdev->obj);
176 return;
177
178 out_unlock:
179 mutex_unlock(&idev->igroup->lock);
180 }
181
182 void iommufd_device_pre_destroy(struct iommufd_object *obj)
183 {
184 struct iommufd_device *idev =
185 container_of(obj, struct iommufd_device, obj);
186
187 /* Release the wait_cnt reference on this */
188 iommufd_device_remove_vdev(idev);
189 }
190
191 void iommufd_device_destroy(struct iommufd_object *obj)
192 {
193 struct iommufd_device *idev =
194 container_of(obj, struct iommufd_device, obj);
195
196 iommu_device_release_dma_owner(idev->dev);
197 iommufd_put_group(idev->igroup);
198 if (!iommufd_selftest_is_mock_dev(idev->dev))
199 iommufd_ctx_put(idev->ictx);
200 }
201
202 /**
203 * iommufd_device_bind - Bind a physical device to an iommu fd
204 * @ictx: iommufd file descriptor
205 * @dev: Pointer to a physical device struct
206 * @id: Output ID number to return to userspace for this device
207 *
208 * A successful bind establishes ownership over the device and returns a
209 * struct iommufd_device pointer, otherwise an error pointer is returned.
210 *
211 * A driver using this API must set driver_managed_dma and must not touch
212 * the device until this routine succeeds and establishes ownership.
213 *
214 * Binding a PCI device places the entire RID under iommufd control.
215 *
216 * The caller must undo this with iommufd_device_unbind()
217 */
218 struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
219 struct device *dev, u32 *id)
220 {
221 struct iommufd_device *idev;
222 struct iommufd_group *igroup;
223 int rc;
224
225 /*
226 * iommufd always sets IOMMU_CACHE because we offer no way for userspace
227 * to restore cache coherency.
228 */
229 if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY))
230 return ERR_PTR(-EINVAL);
231
232 igroup = iommufd_get_group(ictx, dev);
233 if (IS_ERR(igroup))
234 return ERR_CAST(igroup);
235
236 /*
237 * For historical compat with VFIO the insecure interrupt path is
238 * allowed if the module parameter is set. Secure/Isolated means that a
239 * MemWr operation from the device (eg a simple DMA) cannot trigger an
240 * interrupt outside this iommufd context.
241 */
242 if (!iommufd_selftest_is_mock_dev(dev) &&
243 !iommu_group_has_isolated_msi(igroup->group)) {
244 if (!allow_unsafe_interrupts) {
245 rc = -EPERM;
246 goto out_group_put;
247 }
248
249 dev_warn(
250 dev,
251 "MSI interrupts are not secure, they cannot be isolated by the platform. "
252 "Check that platform features like interrupt remapping are enabled. "
253 "Use the \"allow_unsafe_interrupts\" module parameter to override\n");
254 }
255
256 rc = iommu_device_claim_dma_owner(dev, ictx);
257 if (rc)
258 goto out_group_put;
259
260 idev = iommufd_object_alloc(ictx, idev, IOMMUFD_OBJ_DEVICE);
261 if (IS_ERR(idev)) {
262 rc = PTR_ERR(idev);
263 goto out_release_owner;
264 }
265 idev->ictx = ictx;
266 if (!iommufd_selftest_is_mock_dev(dev))
267 iommufd_ctx_get(ictx);
268 idev->dev = dev;
269 idev->enforce_cache_coherency =
270 device_iommu_capable(dev, IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
271 /* The calling driver is a user until iommufd_device_unbind() */
272 refcount_inc(&idev->obj.users);
273 /* igroup refcount moves into iommufd_device */
274 idev->igroup = igroup;
275
276 /*
277 * If the caller fails after this success it must call
278 * iommufd_device_unbind() which is safe since we hold this refcount.
279 * This also means the device is a leaf in the graph and no other object
280 * can take a reference on it.
281 */
282 iommufd_object_finalize(ictx, &idev->obj);
283 *id = idev->obj.id;
284 return idev;
285
286 out_release_owner:
287 iommu_device_release_dma_owner(dev);
288 out_group_put:
289 iommufd_put_group(igroup);
290 return ERR_PTR(rc);
291 }
292 EXPORT_SYMBOL_NS_GPL(iommufd_device_bind, "IOMMUFD");
293
294 /**
295 * iommufd_ctx_has_group - True if any device within the group is bound
296 * to the ictx
297 * @ictx: iommufd file descriptor
298 * @group: Pointer to a physical iommu_group struct
299 *
300 * True if any device within the group has been bound to this ictx, ex. via
301 * iommufd_device_bind(), therefore implying ictx ownership of the group.
302 */
303 bool iommufd_ctx_has_group(struct iommufd_ctx *ictx, struct iommu_group *group)
304 {
305 struct iommufd_object *obj;
306 unsigned long index;
307
308 if (!ictx || !group)
309 return false;
310
311 xa_lock(&ictx->objects);
312 xa_for_each(&ictx->objects, index, obj) {
313 if (obj->type == IOMMUFD_OBJ_DEVICE &&
314 container_of(obj, struct iommufd_device, obj)
315 ->igroup->group == group) {
316 xa_unlock(&ictx->objects);
317 return true;
318 }
319 }
320 xa_unlock(&ictx->objects);
321 return false;
322 }
323 EXPORT_SYMBOL_NS_GPL(iommufd_ctx_has_group, "IOMMUFD");
324
325 /**
326 * iommufd_device_unbind - Undo iommufd_device_bind()
327 * @idev: Device returned by iommufd_device_bind()
328 *
329 * Release the device from iommufd control. DMA ownership returns to unowned,
330 * with DMA controlled by the DMA API. This invalidates the
331 * iommufd_device pointer; other APIs that consume it must not be called
332 * concurrently.
333 */
334 void iommufd_device_unbind(struct iommufd_device *idev)
335 {
336 iommufd_object_destroy_user(idev->ictx, &idev->obj);
337 }
338 EXPORT_SYMBOL_NS_GPL(iommufd_device_unbind, "IOMMUFD");
339
340 struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev)
341 {
342 return idev->ictx;
343 }
344 EXPORT_SYMBOL_NS_GPL(iommufd_device_to_ictx, "IOMMUFD");
345
346 u32 iommufd_device_to_id(struct iommufd_device *idev)
347 {
348 return idev->obj.id;
349 }
350 EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, "IOMMUFD");
351
352 static unsigned int iommufd_group_device_num(struct iommufd_group *igroup,
353 ioasid_t pasid)
354 {
355 struct iommufd_attach *attach;
356 struct iommufd_device *idev;
357 unsigned int count = 0;
358 unsigned long index;
359
360 lockdep_assert_held(&igroup->lock);
361
362 attach = xa_load(&igroup->pasid_attach, pasid);
363 if (attach)
364 xa_for_each(&attach->device_array, index, idev)
365 count++;
366 return count;
367 }
368
369 #ifdef CONFIG_IRQ_MSI_IOMMU
370 static int iommufd_group_setup_msi(struct iommufd_group *igroup,
371 struct iommufd_hwpt_paging *hwpt_paging)
372 {
373 struct iommufd_ctx *ictx = igroup->ictx;
374 struct iommufd_sw_msi_map *cur;
375
376 if (igroup->sw_msi_start == PHYS_ADDR_MAX)
377 return 0;
378
379 /*
380 * Install all the MSI pages the device has been using into the domain
381 */
382 guard(mutex)(&ictx->sw_msi_lock);
383 list_for_each_entry(cur, &ictx->sw_msi_list, sw_msi_item) {
384 int rc;
385
386 if (cur->sw_msi_start != igroup->sw_msi_start ||
387 !test_bit(cur->id, igroup->required_sw_msi.bitmap))
388 continue;
389
390 rc = iommufd_sw_msi_install(ictx, hwpt_paging, cur);
391 if (rc)
392 return rc;
393 }
394 return 0;
395 }
396 #else
397 static inline int
398 iommufd_group_setup_msi(struct iommufd_group *igroup,
399 struct iommufd_hwpt_paging *hwpt_paging)
400 {
401 return 0;
402 }
403 #endif
404
405 static bool
406 iommufd_group_first_attach(struct iommufd_group *igroup, ioasid_t pasid)
407 {
408 lockdep_assert_held(&igroup->lock);
409 return !xa_load(&igroup->pasid_attach, pasid);
410 }
411
412 static int
413 iommufd_device_attach_reserved_iova(struct iommufd_device *idev,
414 struct iommufd_hwpt_paging *hwpt_paging)
415 {
416 struct iommufd_group *igroup = idev->igroup;
417 int rc;
418
419 lockdep_assert_held(&igroup->lock);
420
421 rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt,
422 idev->dev,
423 &igroup->sw_msi_start);
424 if (rc)
425 return rc;
426
427 if (iommufd_group_first_attach(igroup, IOMMU_NO_PASID)) {
428 rc = iommufd_group_setup_msi(igroup, hwpt_paging);
429 if (rc) {
430 iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt,
431 idev->dev);
432 return rc;
433 }
434 }
435 return 0;
436 }
437
438 /* The device attach/detach/replace helpers for attach_handle */
439
440 static bool iommufd_device_is_attached(struct iommufd_device *idev,
441 ioasid_t pasid)
442 {
443 struct iommufd_attach *attach;
444
445 attach = xa_load(&idev->igroup->pasid_attach, pasid);
446 return xa_load(&attach->device_array, idev->obj.id);
447 }
448
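/*
 * Enforce the pasid compatibility rules: a hwpt that is not pasid compatible
 * may only be attached to the RID (IOMMU_NO_PASID) while no PASID attachments
 * exist, and a PASID attachment requires both the new hwpt and any existing
 * RID hwpt to be pasid compatible.
 */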
449 static int iommufd_hwpt_pasid_compat(struct iommufd_hw_pagetable *hwpt,
450 struct iommufd_device *idev,
451 ioasid_t pasid)
452 {
453 struct iommufd_group *igroup = idev->igroup;
454
455 lockdep_assert_held(&igroup->lock);
456
457 if (pasid == IOMMU_NO_PASID) {
458 unsigned long start = IOMMU_NO_PASID;
459
460 if (!hwpt->pasid_compat &&
461 xa_find_after(&igroup->pasid_attach,
462 &start, UINT_MAX, XA_PRESENT))
463 return -EINVAL;
464 } else {
465 struct iommufd_attach *attach;
466
467 if (!hwpt->pasid_compat)
468 return -EINVAL;
469
470 attach = xa_load(&igroup->pasid_attach, IOMMU_NO_PASID);
471 if (attach && attach->hwpt && !attach->hwpt->pasid_compat)
472 return -EINVAL;
473 }
474
475 return 0;
476 }
477
478 static bool iommufd_hwpt_compatible_device(struct iommufd_hw_pagetable *hwpt,
479 struct iommufd_device *idev)
480 {
481 struct pci_dev *pdev;
482
483 if (!hwpt->fault || !dev_is_pci(idev->dev))
484 return true;
485
486 /*
487 * Once PCI/PRI support is turned on for a VF, the response failure code
488 * should not be forwarded to the hardware because PRI is a shared
489 * resource between the PF and its VFs. There is no coordination for this
490 * shared capability, so recovery would have to wait for a vPRI reset.
491 */
492 pdev = to_pci_dev(idev->dev);
493
494 return (!pdev->is_virtfn || !pci_pri_supported(pdev));
495 }
496
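/*
 * Do the iommu layer attach for a single device, either on the RID or on one
 * PASID, and register an attach_handle so faults can be routed back to idev.
 */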
497 static int iommufd_hwpt_attach_device(struct iommufd_hw_pagetable *hwpt,
498 struct iommufd_device *idev,
499 ioasid_t pasid)
500 {
501 struct iommufd_attach_handle *handle;
502 int rc;
503
504 if (!iommufd_hwpt_compatible_device(hwpt, idev))
505 return -EINVAL;
506
507 rc = iommufd_hwpt_pasid_compat(hwpt, idev, pasid);
508 if (rc)
509 return rc;
510
511 handle = kzalloc(sizeof(*handle), GFP_KERNEL);
512 if (!handle)
513 return -ENOMEM;
514
515 handle->idev = idev;
516 if (pasid == IOMMU_NO_PASID)
517 rc = iommu_attach_group_handle(hwpt->domain, idev->igroup->group,
518 &handle->handle);
519 else
520 rc = iommu_attach_device_pasid(hwpt->domain, idev->dev, pasid,
521 &handle->handle);
522 if (rc)
523 goto out_free_handle;
524
525 return 0;
526
527 out_free_handle:
528 kfree(handle);
529 return rc;
530 }
531
532 static struct iommufd_attach_handle *
533 iommufd_device_get_attach_handle(struct iommufd_device *idev, ioasid_t pasid)
534 {
535 struct iommu_attach_handle *handle;
536
537 lockdep_assert_held(&idev->igroup->lock);
538
539 handle = iommu_attach_handle_get(idev->igroup->group, pasid, 0);
540 if (IS_ERR(handle))
541 return NULL;
542 return to_iommufd_handle(handle);
543 }
544
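/*
 * Undo iommufd_hwpt_attach_device(): detach at the iommu layer, auto-respond
 * to any faults still pending on the attach_handle, then free the handle.
 */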
545 static void iommufd_hwpt_detach_device(struct iommufd_hw_pagetable *hwpt,
546 struct iommufd_device *idev,
547 ioasid_t pasid)
548 {
549 struct iommufd_attach_handle *handle;
550
551 handle = iommufd_device_get_attach_handle(idev, pasid);
552 if (pasid == IOMMU_NO_PASID)
553 iommu_detach_group_handle(hwpt->domain, idev->igroup->group);
554 else
555 iommu_detach_device_pasid(hwpt->domain, idev->dev, pasid);
556
557 iommufd_auto_response_faults(hwpt, handle);
558 kfree(handle);
559 }
560
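/*
 * Switch the device/pasid from old to hwpt using the iommu layer replace
 * helpers, so the driver can avoid disrupting translation, then retire the
 * old attach_handle.
 */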
561 static int iommufd_hwpt_replace_device(struct iommufd_device *idev,
562 ioasid_t pasid,
563 struct iommufd_hw_pagetable *hwpt,
564 struct iommufd_hw_pagetable *old)
565 {
566 struct iommufd_attach_handle *handle, *old_handle;
567 int rc;
568
569 if (!iommufd_hwpt_compatible_device(hwpt, idev))
570 return -EINVAL;
571
572 rc = iommufd_hwpt_pasid_compat(hwpt, idev, pasid);
573 if (rc)
574 return rc;
575
576 old_handle = iommufd_device_get_attach_handle(idev, pasid);
577
578 handle = kzalloc(sizeof(*handle), GFP_KERNEL);
579 if (!handle)
580 return -ENOMEM;
581
582 handle->idev = idev;
583 if (pasid == IOMMU_NO_PASID)
584 rc = iommu_replace_group_handle(idev->igroup->group,
585 hwpt->domain, &handle->handle);
586 else
587 rc = iommu_replace_device_pasid(hwpt->domain, idev->dev,
588 pasid, &handle->handle);
589 if (rc)
590 goto out_free_handle;
591
592 iommufd_auto_response_faults(hwpt, old_handle);
593 kfree(old_handle);
594
595 return 0;
596
597 out_free_handle:
598 kfree(handle);
599 return rc;
600 }
601
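/*
 * Attach a device, or one PASID of it, to hwpt. Only the first attach of a
 * given pasid within the group does the iommu layer attach; later devices
 * simply join the existing attachment. Each success holds a reference on
 * hwpt->obj.users.
 */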
602 int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
603 struct iommufd_device *idev, ioasid_t pasid)
604 {
605 struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt);
606 bool attach_resv = hwpt_paging && pasid == IOMMU_NO_PASID;
607 struct iommufd_group *igroup = idev->igroup;
608 struct iommufd_hw_pagetable *old_hwpt;
609 struct iommufd_attach *attach;
610 int rc;
611
612 mutex_lock(&igroup->lock);
613
614 attach = xa_cmpxchg(&igroup->pasid_attach, pasid, NULL,
615 XA_ZERO_ENTRY, GFP_KERNEL);
616 if (xa_is_err(attach)) {
617 rc = xa_err(attach);
618 goto err_unlock;
619 }
620
621 if (!attach) {
622 attach = kzalloc(sizeof(*attach), GFP_KERNEL);
623 if (!attach) {
624 rc = -ENOMEM;
625 goto err_release_pasid;
626 }
627 xa_init(&attach->device_array);
628 }
629
630 old_hwpt = attach->hwpt;
631
632 rc = xa_insert(&attach->device_array, idev->obj.id, XA_ZERO_ENTRY,
633 GFP_KERNEL);
634 if (rc) {
635 WARN_ON(rc == -EBUSY && !old_hwpt);
636 goto err_free_attach;
637 }
638
639 if (old_hwpt && old_hwpt != hwpt) {
640 rc = -EINVAL;
641 goto err_release_devid;
642 }
643
644 if (attach_resv) {
645 rc = iommufd_device_attach_reserved_iova(idev, hwpt_paging);
646 if (rc)
647 goto err_release_devid;
648 }
649
650 /*
651 * Only attach to the group once for the first device that is in the
652 * group. All the other devices will follow this attachment. The user
653 * should attach every device individually to the hwpt as the per-device
654 * reserved regions are only updated during individual device
655 * attachment.
656 */
657 if (iommufd_group_first_attach(igroup, pasid)) {
658 rc = iommufd_hwpt_attach_device(hwpt, idev, pasid);
659 if (rc)
660 goto err_unresv;
661 attach->hwpt = hwpt;
662 WARN_ON(xa_is_err(xa_store(&igroup->pasid_attach, pasid, attach,
663 GFP_KERNEL)));
664 }
665 refcount_inc(&hwpt->obj.users);
666 WARN_ON(xa_is_err(xa_store(&attach->device_array, idev->obj.id,
667 idev, GFP_KERNEL)));
668 mutex_unlock(&igroup->lock);
669 return 0;
670 err_unresv:
671 if (attach_resv)
672 iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev);
673 err_release_devid:
674 xa_release(&attach->device_array, idev->obj.id);
675 err_free_attach:
676 if (iommufd_group_first_attach(igroup, pasid))
677 kfree(attach);
678 err_release_pasid:
679 if (iommufd_group_first_attach(igroup, pasid))
680 xa_release(&igroup->pasid_attach, pasid);
681 err_unlock:
682 mutex_unlock(&igroup->lock);
683 return rc;
684 }
685
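/*
 * Remove idev from its pasid attachment. The last device in the group for
 * this pasid does the iommu layer detach. Returns the hwpt that was attached,
 * or NULL if nothing was attached.
 */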
686 struct iommufd_hw_pagetable *
687 iommufd_hw_pagetable_detach(struct iommufd_device *idev, ioasid_t pasid)
688 {
689 struct iommufd_group *igroup = idev->igroup;
690 struct iommufd_hwpt_paging *hwpt_paging;
691 struct iommufd_hw_pagetable *hwpt;
692 struct iommufd_attach *attach;
693
694 mutex_lock(&igroup->lock);
695 attach = xa_load(&igroup->pasid_attach, pasid);
696 if (!attach) {
697 mutex_unlock(&igroup->lock);
698 return NULL;
699 }
700
701 hwpt = attach->hwpt;
702 hwpt_paging = find_hwpt_paging(hwpt);
703
704 xa_erase(&attach->device_array, idev->obj.id);
705 if (xa_empty(&attach->device_array)) {
706 iommufd_hwpt_detach_device(hwpt, idev, pasid);
707 xa_erase(&igroup->pasid_attach, pasid);
708 kfree(attach);
709 }
710 if (hwpt_paging && pasid == IOMMU_NO_PASID)
711 iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev);
712 mutex_unlock(&igroup->lock);
713
714 iommufd_hw_pagetable_put(idev->ictx, hwpt);
715
716 /* Caller must destroy hwpt */
717 return hwpt;
718 }
719
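/*
 * attach_fn used by iommufd_device_attach(); a fresh attach never displaces
 * an old hwpt, so NULL is returned on success.
 */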
720 static struct iommufd_hw_pagetable *
721 iommufd_device_do_attach(struct iommufd_device *idev, ioasid_t pasid,
722 struct iommufd_hw_pagetable *hwpt)
723 {
724 int rc;
725
726 rc = iommufd_hw_pagetable_attach(hwpt, idev, pasid);
727 if (rc)
728 return ERR_PTR(rc);
729 return NULL;
730 }
731
732 static void
733 iommufd_group_remove_reserved_iova(struct iommufd_group *igroup,
734 struct iommufd_hwpt_paging *hwpt_paging)
735 {
736 struct iommufd_attach *attach;
737 struct iommufd_device *cur;
738 unsigned long index;
739
740 lockdep_assert_held(&igroup->lock);
741
742 attach = xa_load(&igroup->pasid_attach, IOMMU_NO_PASID);
743 xa_for_each(&attach->device_array, index, cur)
744 iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, cur->dev);
745 }
746
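/*
 * When replacing the RID attachment, enforce the reserved regions of every
 * attached device against the new IOAS (unless the IOAS is unchanged) and
 * re-install the software MSI pages.
 */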
747 static int
748 iommufd_group_do_replace_reserved_iova(struct iommufd_group *igroup,
749 struct iommufd_hwpt_paging *hwpt_paging)
750 {
751 struct iommufd_hwpt_paging *old_hwpt_paging;
752 struct iommufd_attach *attach;
753 struct iommufd_device *cur;
754 unsigned long index;
755 int rc;
756
757 lockdep_assert_held(&igroup->lock);
758
759 attach = xa_load(&igroup->pasid_attach, IOMMU_NO_PASID);
760 old_hwpt_paging = find_hwpt_paging(attach->hwpt);
761 if (!old_hwpt_paging || hwpt_paging->ioas != old_hwpt_paging->ioas) {
762 xa_for_each(&attach->device_array, index, cur) {
763 rc = iopt_table_enforce_dev_resv_regions(
764 &hwpt_paging->ioas->iopt, cur->dev, NULL);
765 if (rc)
766 goto err_unresv;
767 }
768 }
769
770 rc = iommufd_group_setup_msi(igroup, hwpt_paging);
771 if (rc)
772 goto err_unresv;
773 return 0;
774
775 err_unresv:
776 iommufd_group_remove_reserved_iova(igroup, hwpt_paging);
777 return rc;
778 }
779
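/*
 * attach_fn used by iommufd_device_replace(): moves the whole group's
 * attachment for this pasid over to hwpt and returns the displaced hwpt for
 * the caller to release.
 */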
780 static struct iommufd_hw_pagetable *
781 iommufd_device_do_replace(struct iommufd_device *idev, ioasid_t pasid,
782 struct iommufd_hw_pagetable *hwpt)
783 {
784 struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt);
785 bool attach_resv = hwpt_paging && pasid == IOMMU_NO_PASID;
786 struct iommufd_hwpt_paging *old_hwpt_paging;
787 struct iommufd_group *igroup = idev->igroup;
788 struct iommufd_hw_pagetable *old_hwpt;
789 struct iommufd_attach *attach;
790 unsigned int num_devices;
791 int rc;
792
793 mutex_lock(&igroup->lock);
794
795 attach = xa_load(&igroup->pasid_attach, pasid);
796 if (!attach) {
797 rc = -EINVAL;
798 goto err_unlock;
799 }
800
801 old_hwpt = attach->hwpt;
802
803 WARN_ON(!old_hwpt || xa_empty(&attach->device_array));
804
805 if (!iommufd_device_is_attached(idev, pasid)) {
806 rc = -EINVAL;
807 goto err_unlock;
808 }
809
810 if (hwpt == old_hwpt) {
811 mutex_unlock(&igroup->lock);
812 return NULL;
813 }
814
815 if (attach_resv) {
816 rc = iommufd_group_do_replace_reserved_iova(igroup, hwpt_paging);
817 if (rc)
818 goto err_unlock;
819 }
820
821 rc = iommufd_hwpt_replace_device(idev, pasid, hwpt, old_hwpt);
822 if (rc)
823 goto err_unresv;
824
825 old_hwpt_paging = find_hwpt_paging(old_hwpt);
826 if (old_hwpt_paging && pasid == IOMMU_NO_PASID &&
827 (!hwpt_paging || hwpt_paging->ioas != old_hwpt_paging->ioas))
828 iommufd_group_remove_reserved_iova(igroup, old_hwpt_paging);
829
830 attach->hwpt = hwpt;
831
832 num_devices = iommufd_group_device_num(igroup, pasid);
833 /*
834 * Move the refcounts held by the device_array to the new hwpt. Retain a
835 * refcount for this thread as the caller will free it.
836 */
837 refcount_add(num_devices, &hwpt->obj.users);
838 if (num_devices > 1)
839 WARN_ON(refcount_sub_and_test(num_devices - 1,
840 &old_hwpt->obj.users));
841 mutex_unlock(&igroup->lock);
842
843 /* Caller must destroy old_hwpt */
844 return old_hwpt;
845 err_unresv:
846 if (attach_resv)
847 iommufd_group_remove_reserved_iova(igroup, hwpt_paging);
848 err_unlock:
849 mutex_unlock(&igroup->lock);
850 return ERR_PTR(rc);
851 }
852
853 typedef struct iommufd_hw_pagetable *(*attach_fn)(
854 struct iommufd_device *idev, ioasid_t pasid,
855 struct iommufd_hw_pagetable *hwpt);
856
857 /*
858 * When automatically managing the domains we search for a compatible domain in
859 * the iopt and if one is found use it, otherwise create a new domain.
860 * Automatic domain selection will never pick a manually created domain.
861 */
862 static struct iommufd_hw_pagetable *
863 iommufd_device_auto_get_domain(struct iommufd_device *idev, ioasid_t pasid,
864 struct iommufd_ioas *ioas, u32 *pt_id,
865 attach_fn do_attach)
866 {
867 /*
868 * iommufd_hw_pagetable_attach() is called by
869 * iommufd_hw_pagetable_alloc() in immediate attachment mode, same as
870 * iommufd_device_do_attach(). So if we are in this mode then we prefer
871 * to use the immediate_attach path as it supports drivers that can't
872 * directly allocate a domain.
873 */
874 bool immediate_attach = do_attach == iommufd_device_do_attach;
875 struct iommufd_hw_pagetable *destroy_hwpt;
876 struct iommufd_hwpt_paging *hwpt_paging;
877 struct iommufd_hw_pagetable *hwpt;
878
879 /*
880 * There is no differentiation when domains are allocated, so any domain
881 * that is willing to attach to the device is interchangeable with any
882 * other.
883 */
884 mutex_lock(&ioas->mutex);
885 list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) {
886 if (!hwpt_paging->auto_domain)
887 continue;
888
889 hwpt = &hwpt_paging->common;
890 if (!iommufd_lock_obj(&hwpt->obj))
891 continue;
892 destroy_hwpt = (*do_attach)(idev, pasid, hwpt);
893 if (IS_ERR(destroy_hwpt)) {
894 iommufd_put_object(idev->ictx, &hwpt->obj);
895 /*
896 * -EINVAL means the domain is incompatible with the
897 * device. Other error codes should propagate to
898 * userspace as failure. Success means the domain is
899 * attached.
900 */
901 if (PTR_ERR(destroy_hwpt) == -EINVAL)
902 continue;
903 goto out_unlock;
904 }
905 *pt_id = hwpt->obj.id;
906 iommufd_put_object(idev->ictx, &hwpt->obj);
907 goto out_unlock;
908 }
909
910 hwpt_paging = iommufd_hwpt_paging_alloc(idev->ictx, ioas, idev, pasid,
911 0, immediate_attach, NULL);
912 if (IS_ERR(hwpt_paging)) {
913 destroy_hwpt = ERR_CAST(hwpt_paging);
914 goto out_unlock;
915 }
916 hwpt = &hwpt_paging->common;
917
918 if (!immediate_attach) {
919 destroy_hwpt = (*do_attach)(idev, pasid, hwpt);
920 if (IS_ERR(destroy_hwpt))
921 goto out_abort;
922 } else {
923 destroy_hwpt = NULL;
924 }
925
926 hwpt_paging->auto_domain = true;
927 *pt_id = hwpt->obj.id;
928
929 iommufd_object_finalize(idev->ictx, &hwpt->obj);
930 mutex_unlock(&ioas->mutex);
931 return destroy_hwpt;
932
933 out_abort:
934 iommufd_object_abort_and_destroy(idev->ictx, &hwpt->obj);
935 out_unlock:
936 mutex_unlock(&ioas->mutex);
937 return destroy_hwpt;
938 }
939
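/*
 * Look up *pt_id, which may name an IOAS or a HWPT, and invoke do_attach with
 * the resulting hw_pagetable. On success *pt_id holds the HWPT that was used
 * and any hwpt displaced by a replace is released once all locks are dropped.
 */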
940 static int iommufd_device_change_pt(struct iommufd_device *idev,
941 ioasid_t pasid,
942 u32 *pt_id, attach_fn do_attach)
943 {
944 struct iommufd_hw_pagetable *destroy_hwpt;
945 struct iommufd_object *pt_obj;
946
947 pt_obj = iommufd_get_object(idev->ictx, *pt_id, IOMMUFD_OBJ_ANY);
948 if (IS_ERR(pt_obj))
949 return PTR_ERR(pt_obj);
950
951 switch (pt_obj->type) {
952 case IOMMUFD_OBJ_HWPT_NESTED:
953 case IOMMUFD_OBJ_HWPT_PAGING: {
954 struct iommufd_hw_pagetable *hwpt =
955 container_of(pt_obj, struct iommufd_hw_pagetable, obj);
956
957 destroy_hwpt = (*do_attach)(idev, pasid, hwpt);
958 if (IS_ERR(destroy_hwpt))
959 goto out_put_pt_obj;
960 break;
961 }
962 case IOMMUFD_OBJ_IOAS: {
963 struct iommufd_ioas *ioas =
964 container_of(pt_obj, struct iommufd_ioas, obj);
965
966 destroy_hwpt = iommufd_device_auto_get_domain(idev, pasid, ioas,
967 pt_id, do_attach);
968 if (IS_ERR(destroy_hwpt))
969 goto out_put_pt_obj;
970 break;
971 }
972 default:
973 destroy_hwpt = ERR_PTR(-EINVAL);
974 goto out_put_pt_obj;
975 }
976 iommufd_put_object(idev->ictx, pt_obj);
977
978 /* This destruction has to be after we unlock everything */
979 if (destroy_hwpt)
980 iommufd_hw_pagetable_put(idev->ictx, destroy_hwpt);
981 return 0;
982
983 out_put_pt_obj:
984 iommufd_put_object(idev->ictx, pt_obj);
985 return PTR_ERR(destroy_hwpt);
986 }
987
988 /**
989 * iommufd_device_attach - Connect a device/pasid to an iommu_domain
990 * @idev: device to attach
991 * @pasid: pasid to attach
992 * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING
993 * Output the IOMMUFD_OBJ_HWPT_PAGING ID
994 *
995 * This connects the device/pasid to an iommu_domain, either automatically
996 * or manually selected. Once this completes the device could do DMA with
997 * @pasid. @pasid is IOMMU_NO_PASID if this attach is for no pasid usage.
998 *
999 * The caller should return the resulting pt_id back to userspace.
1000 * This function is undone by calling iommufd_device_detach().
1001 */
1002 int iommufd_device_attach(struct iommufd_device *idev, ioasid_t pasid,
1003 u32 *pt_id)
1004 {
1005 int rc;
1006
1007 rc = iommufd_device_change_pt(idev, pasid, pt_id,
1008 &iommufd_device_do_attach);
1009 if (rc)
1010 return rc;
1011
1012 /*
1013 * Pairs with iommufd_device_detach() - catches caller bugs attempting
1014 * to destroy a device with an attachment.
1015 */
1016 refcount_inc(&idev->obj.users);
1017 return 0;
1018 }
1019 EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, "IOMMUFD");
1020
1021 /**
1022 * iommufd_device_replace - Change the device/pasid's iommu_domain
1023 * @idev: device to change
1024 * @pasid: pasid to change
1025 * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING
1026 * Output the IOMMUFD_OBJ_HWPT_PAGING ID
1027 *
1028 * This is the same as::
1029 *
1030 * iommufd_device_detach();
1031 * iommufd_device_attach();
1032 *
1033 * If it fails then no change is made to the attachment. The iommu driver may
1034 * implement this so there is no disruption in translation. This can only be
1035 * called if iommufd_device_attach() has already succeeded. @pasid is
1036 * IOMMU_NO_PASID for no pasid usage.
1037 */
1038 int iommufd_device_replace(struct iommufd_device *idev, ioasid_t pasid,
1039 u32 *pt_id)
1040 {
1041 return iommufd_device_change_pt(idev, pasid, pt_id,
1042 &iommufd_device_do_replace);
1043 }
1044 EXPORT_SYMBOL_NS_GPL(iommufd_device_replace, "IOMMUFD");
1045
1046 /**
1047 * iommufd_device_detach - Disconnect a device/pasid from an iommu_domain
1048 * @idev: device to detach
1049 * @pasid: pasid to detach
1050 *
1051 * Undo iommufd_device_attach(). This disconnects the idev from the previously
1052 * attached pt_id. The device returns back to a blocked DMA translation.
1053 * @pasid is IOMMU_NO_PASID for no pasid usage.
1054 */
1055 void iommufd_device_detach(struct iommufd_device *idev, ioasid_t pasid)
1056 {
1057 struct iommufd_hw_pagetable *hwpt;
1058
1059 hwpt = iommufd_hw_pagetable_detach(idev, pasid);
1060 if (!hwpt)
1061 return;
1062 refcount_dec(&idev->obj.users);
1063 }
1064 EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, "IOMMUFD");
1065
1066 /*
1067 * On success, it will refcount_inc() at a valid new_ioas and refcount_dec() at
1068 * a valid cur_ioas (access->ioas). A caller passing in a valid new_ioas should
1069 * call iommufd_put_object() if it does an iommufd_get_object() for a new_ioas.
1070 */
1071 static int iommufd_access_change_ioas(struct iommufd_access *access,
1072 struct iommufd_ioas *new_ioas)
1073 {
1074 u32 iopt_access_list_id = access->iopt_access_list_id;
1075 struct iommufd_ioas *cur_ioas = access->ioas;
1076 int rc;
1077
1078 lockdep_assert_held(&access->ioas_lock);
1079
1080 /* We are racing with a concurrent detach, bail */
1081 if (cur_ioas != access->ioas_unpin)
1082 return -EBUSY;
1083
1084 if (cur_ioas == new_ioas)
1085 return 0;
1086
1087 /*
1088 * Set ioas to NULL to block any further iommufd_access_pin_pages().
1089 * iommufd_access_unpin_pages() can continue using access->ioas_unpin.
1090 */
1091 access->ioas = NULL;
1092
1093 if (new_ioas) {
1094 rc = iopt_add_access(&new_ioas->iopt, access);
1095 if (rc) {
1096 access->ioas = cur_ioas;
1097 return rc;
1098 }
1099 refcount_inc(&new_ioas->obj.users);
1100 }
1101
1102 if (cur_ioas) {
1103 if (!iommufd_access_is_internal(access) && access->ops->unmap) {
1104 mutex_unlock(&access->ioas_lock);
1105 access->ops->unmap(access->data, 0, ULONG_MAX);
1106 mutex_lock(&access->ioas_lock);
1107 }
1108 iopt_remove_access(&cur_ioas->iopt, access, iopt_access_list_id);
1109 refcount_dec(&cur_ioas->obj.users);
1110 }
1111
1112 access->ioas = new_ioas;
1113 access->ioas_unpin = new_ioas;
1114
1115 return 0;
1116 }
1117
1118 static int iommufd_access_change_ioas_id(struct iommufd_access *access, u32 id)
1119 {
1120 struct iommufd_ioas *ioas = iommufd_get_ioas(access->ictx, id);
1121 int rc;
1122
1123 if (IS_ERR(ioas))
1124 return PTR_ERR(ioas);
1125 rc = iommufd_access_change_ioas(access, ioas);
1126 iommufd_put_object(access->ictx, &ioas->obj);
1127 return rc;
1128 }
1129
1130 void iommufd_access_destroy_object(struct iommufd_object *obj)
1131 {
1132 struct iommufd_access *access =
1133 container_of(obj, struct iommufd_access, obj);
1134
1135 mutex_lock(&access->ioas_lock);
1136 if (access->ioas)
1137 WARN_ON(iommufd_access_change_ioas(access, NULL));
1138 mutex_unlock(&access->ioas_lock);
1139 if (!iommufd_access_is_internal(access))
1140 iommufd_ctx_put(access->ictx);
1141 }
1142
1143 static struct iommufd_access *__iommufd_access_create(struct iommufd_ctx *ictx)
1144 {
1145 struct iommufd_access *access;
1146
1147 /*
1148 * There is no uAPI for the access object, but to keep things symmetric
1149 * use the object infrastructure anyhow.
1150 */
1151 access = iommufd_object_alloc(ictx, access, IOMMUFD_OBJ_ACCESS);
1152 if (IS_ERR(access))
1153 return access;
1154
1155 /* The calling driver is a user until iommufd_access_destroy() */
1156 refcount_inc(&access->obj.users);
1157 mutex_init(&access->ioas_lock);
1158 return access;
1159 }
1160
1161 struct iommufd_access *iommufd_access_create_internal(struct iommufd_ctx *ictx)
1162 {
1163 struct iommufd_access *access;
1164
1165 access = __iommufd_access_create(ictx);
1166 if (IS_ERR(access))
1167 return access;
1168 access->iova_alignment = PAGE_SIZE;
1169
1170 iommufd_object_finalize(ictx, &access->obj);
1171 return access;
1172 }
1173
1174 /**
1175 * iommufd_access_create - Create an iommufd_access
1176 * @ictx: iommufd file descriptor
1177 * @ops: Driver's ops to associate with the access
1178 * @data: Opaque data to pass into ops functions
1179 * @id: Output ID number to return to userspace for this access
1180 *
1181 * An iommufd_access allows a driver to read/write to the IOAS without using
1182 * DMA. The underlying CPU memory can be accessed using the
1183 * iommufd_access_pin_pages() or iommufd_access_rw() functions.
1184 *
1185 * The provided ops are required to use iommufd_access_pin_pages().
1186 */
1187 struct iommufd_access *
1188 iommufd_access_create(struct iommufd_ctx *ictx,
1189 const struct iommufd_access_ops *ops, void *data, u32 *id)
1190 {
1191 struct iommufd_access *access;
1192
1193 access = __iommufd_access_create(ictx);
1194 if (IS_ERR(access))
1195 return access;
1196
1197 access->data = data;
1198 access->ops = ops;
1199
1200 if (ops->needs_pin_pages)
1201 access->iova_alignment = PAGE_SIZE;
1202 else
1203 access->iova_alignment = 1;
1204
1205 access->ictx = ictx;
1206 iommufd_ctx_get(ictx);
1207 iommufd_object_finalize(ictx, &access->obj);
1208 *id = access->obj.id;
1209 return access;
1210 }
1211 EXPORT_SYMBOL_NS_GPL(iommufd_access_create, "IOMMUFD");
1212
1213 /**
1214 * iommufd_access_destroy - Destroy an iommufd_access
1215 * @access: The access to destroy
1216 *
1217 * The caller must stop using the access before destroying it.
1218 */
1219 void iommufd_access_destroy(struct iommufd_access *access)
1220 {
1221 iommufd_object_destroy_user(access->ictx, &access->obj);
1222 }
1223 EXPORT_SYMBOL_NS_GPL(iommufd_access_destroy, "IOMMUFD");
1224
1225 void iommufd_access_detach(struct iommufd_access *access)
1226 {
1227 mutex_lock(&access->ioas_lock);
1228 if (WARN_ON(!access->ioas)) {
1229 mutex_unlock(&access->ioas_lock);
1230 return;
1231 }
1232 WARN_ON(iommufd_access_change_ioas(access, NULL));
1233 mutex_unlock(&access->ioas_lock);
1234 }
1235 EXPORT_SYMBOL_NS_GPL(iommufd_access_detach, "IOMMUFD");
1236
1237 int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id)
1238 {
1239 int rc;
1240
1241 mutex_lock(&access->ioas_lock);
1242 if (WARN_ON(access->ioas)) {
1243 mutex_unlock(&access->ioas_lock);
1244 return -EINVAL;
1245 }
1246
1247 rc = iommufd_access_change_ioas_id(access, ioas_id);
1248 mutex_unlock(&access->ioas_lock);
1249 return rc;
1250 }
1251 EXPORT_SYMBOL_NS_GPL(iommufd_access_attach, "IOMMUFD");
1252
1253 int iommufd_access_attach_internal(struct iommufd_access *access,
1254 struct iommufd_ioas *ioas)
1255 {
1256 int rc;
1257
1258 mutex_lock(&access->ioas_lock);
1259 if (WARN_ON(access->ioas)) {
1260 mutex_unlock(&access->ioas_lock);
1261 return -EINVAL;
1262 }
1263
1264 rc = iommufd_access_change_ioas(access, ioas);
1265 mutex_unlock(&access->ioas_lock);
1266 return rc;
1267 }
1268
1269 int iommufd_access_replace(struct iommufd_access *access, u32 ioas_id)
1270 {
1271 int rc;
1272
1273 mutex_lock(&access->ioas_lock);
1274 if (!access->ioas) {
1275 mutex_unlock(&access->ioas_lock);
1276 return -ENOENT;
1277 }
1278 rc = iommufd_access_change_ioas_id(access, ioas_id);
1279 mutex_unlock(&access->ioas_lock);
1280 return rc;
1281 }
1282 EXPORT_SYMBOL_NS_GPL(iommufd_access_replace, "IOMMUFD");
1283
1284 /**
1285 * iommufd_access_notify_unmap - Notify users of an iopt to stop using it
1286 * @iopt: iopt to work on
1287 * @iova: Starting iova in the iopt
1288 * @length: Number of bytes
1289 *
1290 * After this function returns there should be no users attached to the pages
1291 * linked to this iopt that intersect with iova,length. Anyone that has attached
1292 * a user through iopt_access_pages() needs to detach it through
1293 * iommufd_access_unpin_pages() before this function returns.
1294 *
1295 * iommufd_access_destroy() will wait for any outstanding unmap callback to
1296 * complete. Once iommufd_access_destroy() returns, no unmap ops are running or
1297 * will run in the future. Due to this a driver must not create locking that
1298 * prevents unmap from completing while iommufd_access_destroy() is running.
1299 */
1300 void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
1301 unsigned long length)
1302 {
1303 struct iommufd_ioas *ioas =
1304 container_of(iopt, struct iommufd_ioas, iopt);
1305 struct iommufd_access *access;
1306 unsigned long index;
1307
1308 xa_lock(&ioas->iopt.access_list);
1309 xa_for_each(&ioas->iopt.access_list, index, access) {
1310 if (!iommufd_lock_obj(&access->obj) ||
1311 iommufd_access_is_internal(access))
1312 continue;
1313 xa_unlock(&ioas->iopt.access_list);
1314
1315 access->ops->unmap(access->data, iova, length);
1316
1317 iommufd_put_object(access->ictx, &access->obj);
1318 xa_lock(&ioas->iopt.access_list);
1319 }
1320 xa_unlock(&ioas->iopt.access_list);
1321 }
1322
1323 /**
1324 * iommufd_access_unpin_pages() - Undo iommufd_access_pin_pages
1325 * @access: IOAS access to act on
1326 * @iova: Starting IOVA
1327 * @length: Number of bytes to access
1328 *
1329 * Give back the pinned pages. The caller must stop accessing them before calling
1330 * this. The iova/length must exactly match the one provided to iommufd_access_pin_pages().
1331 */
1332 void iommufd_access_unpin_pages(struct iommufd_access *access,
1333 unsigned long iova, unsigned long length)
1334 {
1335 bool internal = iommufd_access_is_internal(access);
1336 struct iopt_area_contig_iter iter;
1337 struct io_pagetable *iopt;
1338 unsigned long last_iova;
1339 struct iopt_area *area;
1340
1341 if (WARN_ON(!length) ||
1342 WARN_ON(check_add_overflow(iova, length - 1, &last_iova)))
1343 return;
1344
1345 mutex_lock(&access->ioas_lock);
1346 /*
1347 * The driver must be doing something wrong if it calls this before an
1348 * iommufd_access_attach() or after an iommufd_access_detach().
1349 */
1350 if (WARN_ON(!access->ioas_unpin)) {
1351 mutex_unlock(&access->ioas_lock);
1352 return;
1353 }
1354 iopt = &access->ioas_unpin->iopt;
1355
1356 down_read(&iopt->iova_rwsem);
1357 iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova)
1358 iopt_area_remove_access(
1359 area, iopt_area_iova_to_index(area, iter.cur_iova),
1360 iopt_area_iova_to_index(
1361 area,
1362 min(last_iova, iopt_area_last_iova(area))),
1363 internal);
1364 WARN_ON(!iopt_area_contig_done(&iter));
1365 up_read(&iopt->iova_rwsem);
1366 mutex_unlock(&access->ioas_lock);
1367 }
1368 EXPORT_SYMBOL_NS_GPL(iommufd_access_unpin_pages, "IOMMUFD");
1369
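/*
 * pin_pages can only operate on whole pages: the start of the range must be
 * page aligned and every area except the last must end on a page boundary.
 */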
1370 static bool iopt_area_contig_is_aligned(struct iopt_area_contig_iter *iter)
1371 {
1372 if (iopt_area_start_byte(iter->area, iter->cur_iova) % PAGE_SIZE)
1373 return false;
1374
1375 if (!iopt_area_contig_done(iter) &&
1376 (iopt_area_start_byte(iter->area, iopt_area_last_iova(iter->area)) %
1377 PAGE_SIZE) != (PAGE_SIZE - 1))
1378 return false;
1379 return true;
1380 }
1381
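/* Check that the area's mapping permissions allow the requested access */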
1382 static bool check_area_prot(struct iopt_area *area, unsigned int flags)
1383 {
1384 if (flags & IOMMUFD_ACCESS_RW_WRITE)
1385 return area->iommu_prot & IOMMU_WRITE;
1386 return area->iommu_prot & IOMMU_READ;
1387 }
1388
1389 /**
1390 * iommufd_access_pin_pages() - Return a list of pages under the iova
1391 * @access: IOAS access to act on
1392 * @iova: Starting IOVA
1393 * @length: Number of bytes to access
1394 * @out_pages: Output page list
1395 * @flags: IOMMUFD_ACCESS_RW_* flags
1396 *
1397 * Reads @length bytes starting at iova and returns the struct page * pointers.
1398 * These can be kmap'd by the caller for CPU access.
1399 *
1400 * The caller must perform iommufd_access_unpin_pages() when done to balance
1401 * this.
1402 *
1403 * This API always requires a page aligned iova. This happens naturally if the
1404 * ioas alignment is >= PAGE_SIZE and the iova is PAGE_SIZE aligned. However
1405 * smaller alignments have corner cases where this API can fail on otherwise
1406 * aligned iova.
1407 */
1408 int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova,
1409 unsigned long length, struct page **out_pages,
1410 unsigned int flags)
1411 {
1412 bool internal = iommufd_access_is_internal(access);
1413 struct iopt_area_contig_iter iter;
1414 struct io_pagetable *iopt;
1415 unsigned long last_iova;
1416 struct iopt_area *area;
1417 int rc;
1418
1419 /* Driver's ops don't support pin_pages */
1420 if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
1421 WARN_ON(access->iova_alignment != PAGE_SIZE ||
1422 (!internal && !access->ops->unmap)))
1423 return -EINVAL;
1424
1425 if (!length)
1426 return -EINVAL;
1427 if (check_add_overflow(iova, length - 1, &last_iova))
1428 return -EOVERFLOW;
1429
1430 mutex_lock(&access->ioas_lock);
1431 if (!access->ioas) {
1432 mutex_unlock(&access->ioas_lock);
1433 return -ENOENT;
1434 }
1435 iopt = &access->ioas->iopt;
1436
1437 down_read(&iopt->iova_rwsem);
1438 iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
1439 unsigned long last = min(last_iova, iopt_area_last_iova(area));
1440 unsigned long last_index = iopt_area_iova_to_index(area, last);
1441 unsigned long index =
1442 iopt_area_iova_to_index(area, iter.cur_iova);
1443
1444 if (area->prevent_access ||
1445 !iopt_area_contig_is_aligned(&iter)) {
1446 rc = -EINVAL;
1447 goto err_remove;
1448 }
1449
1450 if (!check_area_prot(area, flags)) {
1451 rc = -EPERM;
1452 goto err_remove;
1453 }
1454
1455 rc = iopt_area_add_access(area, index, last_index, out_pages,
1456 flags, internal);
1457 if (rc)
1458 goto err_remove;
1459 out_pages += last_index - index + 1;
1460 }
1461 if (!iopt_area_contig_done(&iter)) {
1462 rc = -ENOENT;
1463 goto err_remove;
1464 }
1465
1466 up_read(&iopt->iova_rwsem);
1467 mutex_unlock(&access->ioas_lock);
1468 return 0;
1469
1470 err_remove:
1471 if (iova < iter.cur_iova) {
1472 last_iova = iter.cur_iova - 1;
1473 iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova)
1474 iopt_area_remove_access(
1475 area,
1476 iopt_area_iova_to_index(area, iter.cur_iova),
1477 iopt_area_iova_to_index(
1478 area, min(last_iova,
1479 iopt_area_last_iova(area))),
1480 internal);
1481 }
1482 up_read(&iopt->iova_rwsem);
1483 mutex_unlock(&access->ioas_lock);
1484 return rc;
1485 }
1486 EXPORT_SYMBOL_NS_GPL(iommufd_access_pin_pages, "IOMMUFD");
1487
1488 /**
1489 * iommufd_access_rw - Read or write data under the iova
1490 * @access: IOAS access to act on
1491 * @iova: Starting IOVA
1492 * @data: Kernel buffer to copy to/from
1493 * @length: Number of bytes to access
1494 * @flags: IOMMUFD_ACCESS_RW_* flags
1495 *
1496 * Copy kernel data to/from the range given by IOVA/length. If flags
1497 * indicates IOMMUFD_ACCESS_RW_KTHREAD then a large copy can be optimized
1498 * by changing it into copy_to/from_user().
1499 */
1500 int iommufd_access_rw(struct iommufd_access *access, unsigned long iova,
1501 void *data, size_t length, unsigned int flags)
1502 {
1503 struct iopt_area_contig_iter iter;
1504 struct io_pagetable *iopt;
1505 struct iopt_area *area;
1506 unsigned long last_iova;
1507 int rc = -EINVAL;
1508
1509 if (!length)
1510 return -EINVAL;
1511 if (check_add_overflow(iova, length - 1, &last_iova))
1512 return -EOVERFLOW;
1513
1514 mutex_lock(&access->ioas_lock);
1515 if (!access->ioas) {
1516 mutex_unlock(&access->ioas_lock);
1517 return -ENOENT;
1518 }
1519 iopt = &access->ioas->iopt;
1520
1521 down_read(&iopt->iova_rwsem);
1522 iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
1523 unsigned long last = min(last_iova, iopt_area_last_iova(area));
1524 unsigned long bytes = (last - iter.cur_iova) + 1;
1525
1526 if (area->prevent_access) {
1527 rc = -EINVAL;
1528 goto err_out;
1529 }
1530
1531 if (!check_area_prot(area, flags)) {
1532 rc = -EPERM;
1533 goto err_out;
1534 }
1535
1536 rc = iopt_pages_rw_access(
1537 area->pages, iopt_area_start_byte(area, iter.cur_iova),
1538 data, bytes, flags);
1539 if (rc)
1540 goto err_out;
1541 data += bytes;
1542 }
1543 if (!iopt_area_contig_done(&iter))
1544 rc = -ENOENT;
1545 err_out:
1546 up_read(&iopt->iova_rwsem);
1547 mutex_unlock(&access->ioas_lock);
1548 return rc;
1549 }
1550 EXPORT_SYMBOL_NS_GPL(iommufd_access_rw, "IOMMUFD");
1551
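/*
 * Handle the IOMMU_GET_HW_INFO ioctl: report the driver specific hw_info blob
 * along with the dirty tracking and PASID capabilities of the bound device.
 */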
1552 int iommufd_get_hw_info(struct iommufd_ucmd *ucmd)
1553 {
1554 const u32 SUPPORTED_FLAGS = IOMMU_HW_INFO_FLAG_INPUT_TYPE;
1555 struct iommu_hw_info *cmd = ucmd->cmd;
1556 void __user *user_ptr = u64_to_user_ptr(cmd->data_uptr);
1557 const struct iommu_ops *ops;
1558 struct iommufd_device *idev;
1559 unsigned int data_len;
1560 unsigned int copy_len;
1561 void *data;
1562 int rc;
1563
1564 if (cmd->flags & ~SUPPORTED_FLAGS)
1565 return -EOPNOTSUPP;
1566 if (cmd->__reserved[0] || cmd->__reserved[1] || cmd->__reserved[2])
1567 return -EOPNOTSUPP;
1568
1569 /* Clear the type field since drivers don't support a random input */
1570 if (!(cmd->flags & IOMMU_HW_INFO_FLAG_INPUT_TYPE))
1571 cmd->in_data_type = IOMMU_HW_INFO_TYPE_DEFAULT;
1572
1573 idev = iommufd_get_device(ucmd, cmd->dev_id);
1574 if (IS_ERR(idev))
1575 return PTR_ERR(idev);
1576
1577 ops = dev_iommu_ops(idev->dev);
1578 if (ops->hw_info) {
1579 data = ops->hw_info(idev->dev, &data_len, &cmd->out_data_type);
1580 if (IS_ERR(data)) {
1581 rc = PTR_ERR(data);
1582 goto out_put;
1583 }
1584
1585 /*
1586 * Drivers that have a hw_info callback should report a unique
1587 * iommu_hw_info_type.
1588 */
1589 if (WARN_ON_ONCE(cmd->out_data_type ==
1590 IOMMU_HW_INFO_TYPE_NONE)) {
1591 rc = -EOPNOTSUPP;
1592 goto out_free;
1593 }
1594 } else {
1595 cmd->out_data_type = IOMMU_HW_INFO_TYPE_NONE;
1596 data_len = 0;
1597 data = NULL;
1598 }
1599
1600 copy_len = min(cmd->data_len, data_len);
1601 if (copy_to_user(user_ptr, data, copy_len)) {
1602 rc = -EFAULT;
1603 goto out_free;
1604 }
1605
1606 /*
1607 * Zero the trailing bytes if the user buffer is bigger than the
1608 * data size kernel actually has.
1609 */
1610 if (copy_len < cmd->data_len) {
1611 if (clear_user(user_ptr + copy_len, cmd->data_len - copy_len)) {
1612 rc = -EFAULT;
1613 goto out_free;
1614 }
1615 }
1616
1617 /*
1618 * We return the length the kernel supports so userspace may know what
1619 * the kernel capability is. It could be larger than the input buffer.
1620 */
1621 cmd->data_len = data_len;
1622
1623 cmd->out_capabilities = 0;
1624 if (device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING))
1625 cmd->out_capabilities |= IOMMU_HW_CAP_DIRTY_TRACKING;
1626
1627 cmd->out_max_pasid_log2 = 0;
1628 /*
1629 * Currently, all iommu drivers enable PASID in the probe_device()
1630 * op if iommu and device supports it. So the max_pasids stored in
1631 * dev->iommu indicates both PASID support and enable status. A
1632 * non-zero dev->iommu->max_pasids means PASID is supported and
1633 * enabled. The iommufd only reports PASID capability to userspace
1634 * if it's enabled.
1635 */
1636 if (idev->dev->iommu->max_pasids) {
1637 cmd->out_max_pasid_log2 = ilog2(idev->dev->iommu->max_pasids);
1638
1639 if (dev_is_pci(idev->dev)) {
1640 struct pci_dev *pdev = to_pci_dev(idev->dev);
1641 int ctrl;
1642
1643 ctrl = pci_pasid_status(pdev);
1644
1645 WARN_ON_ONCE(ctrl < 0 ||
1646 !(ctrl & PCI_PASID_CTRL_ENABLE));
1647
1648 if (ctrl & PCI_PASID_CTRL_EXEC)
1649 cmd->out_capabilities |=
1650 IOMMU_HW_CAP_PCI_PASID_EXEC;
1651 if (ctrl & PCI_PASID_CTRL_PRIV)
1652 cmd->out_capabilities |=
1653 IOMMU_HW_CAP_PCI_PASID_PRIV;
1654 }
1655 }
1656
1657 rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
1658 out_free:
1659 kfree(data);
1660 out_put:
1661 iommufd_put_object(ucmd->ictx, &idev->obj);
1662 return rc;
1663 }
1664