xref: /linux/drivers/iommu/iommufd/device.c (revision e0c0ab04f6785abaa71b9b8dc252cb1a2072c225)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
3  */
4 #include <linux/iommu.h>
5 #include <linux/iommufd.h>
6 #include <linux/pci-ats.h>
7 #include <linux/slab.h>
8 #include <uapi/linux/iommufd.h>
9 
10 #include "../iommu-priv.h"
11 #include "io_pagetable.h"
12 #include "iommufd_private.h"
13 
14 static bool allow_unsafe_interrupts;
15 module_param(allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
16 MODULE_PARM_DESC(
17 	allow_unsafe_interrupts,
18 	"Allow IOMMUFD to bind to devices even if the platform cannot isolate "
19 	"the MSI interrupt window. Enabling this is a security weakness.");
20 
/*
 * Per-PASID attachment state of an iommufd_group. Instances are stored in the
 * group's pasid_attach xarray, indexed by PASID.
 */
struct iommufd_attach {
	/* hw pagetable the group is attached to for this PASID */
	struct iommufd_hw_pagetable *hwpt;
	/* attached iommufd_device pointers, indexed by the device's object ID */
	struct xarray device_array;
};
25 
26 static void iommufd_group_release(struct kref *kref)
27 {
28 	struct iommufd_group *igroup =
29 		container_of(kref, struct iommufd_group, ref);
30 
31 	WARN_ON(!xa_empty(&igroup->pasid_attach));
32 
33 	xa_cmpxchg(&igroup->ictx->groups, iommu_group_id(igroup->group), igroup,
34 		   NULL, GFP_KERNEL);
35 	iommu_group_put(igroup->group);
36 	mutex_destroy(&igroup->lock);
37 	kfree(igroup);
38 }
39 
40 static void iommufd_put_group(struct iommufd_group *group)
41 {
42 	kref_put(&group->ref, iommufd_group_release);
43 }
44 
45 static bool iommufd_group_try_get(struct iommufd_group *igroup,
46 				  struct iommu_group *group)
47 {
48 	if (!igroup)
49 		return false;
50 	/*
51 	 * group ID's cannot be re-used until the group is put back which does
52 	 * not happen if we could get an igroup pointer under the xa_lock.
53 	 */
54 	if (WARN_ON(igroup->group != group))
55 		return false;
56 	return kref_get_unless_zero(&igroup->ref);
57 }
58 
/*
 * iommufd needs to store some more data for each iommu_group, we keep a
 * parallel xarray indexed by iommu_group id to hold this instead of putting it
 * in the core structure. To keep things simple the iommufd_group memory is
 * unique within the iommufd_ctx. This makes it easy to check there are no
 * memory leaks.
 *
 * Returns a referenced iommufd_group for @dev's iommu_group, creating and
 * publishing a new one if none is usable, or an ERR_PTR: -ENODEV when @dev has
 * no iommu_group, -ENOMEM on allocation failure, or the xarray store error.
 * The caller releases the result with iommufd_put_group().
 */
static struct iommufd_group *iommufd_get_group(struct iommufd_ctx *ictx,
					       struct device *dev)
{
	struct iommufd_group *new_igroup;
	struct iommufd_group *cur_igroup;
	struct iommufd_group *igroup;
	struct iommu_group *group;
	unsigned int id;

	group = iommu_group_get(dev);
	if (!group)
		return ERR_PTR(-ENODEV);

	id = iommu_group_id(group);

	/* Fast path: an igroup already exists and can still be referenced */
	xa_lock(&ictx->groups);
	igroup = xa_load(&ictx->groups, id);
	if (iommufd_group_try_get(igroup, group)) {
		xa_unlock(&ictx->groups);
		/* The existing igroup holds its own iommu_group reference */
		iommu_group_put(group);
		return igroup;
	}
	xa_unlock(&ictx->groups);

	new_igroup = kzalloc(sizeof(*new_igroup), GFP_KERNEL);
	if (!new_igroup) {
		iommu_group_put(group);
		return ERR_PTR(-ENOMEM);
	}

	kref_init(&new_igroup->ref);
	mutex_init(&new_igroup->lock);
	xa_init(&new_igroup->pasid_attach);
	/* PHYS_ADDR_MAX is the value iommufd_group_setup_msi() treats as unset */
	new_igroup->sw_msi_start = PHYS_ADDR_MAX;
	/* group reference moves into new_igroup */
	new_igroup->group = group;

	/*
	 * The ictx is not additionally refcounted here because all objects
	 * using an igroup must put it before their destroy completes.
	 */
	new_igroup->ictx = ictx;

	/*
	 * We dropped the lock so igroup is invalid. NULL is a safe and likely
	 * value to assume for the xa_cmpxchg algorithm.
	 */
	cur_igroup = NULL;
	xa_lock(&ictx->groups);
	while (true) {
		igroup = __xa_cmpxchg(&ictx->groups, id, cur_igroup, new_igroup,
				      GFP_KERNEL);
		if (xa_is_err(igroup)) {
			xa_unlock(&ictx->groups);
			/* Releasing new_igroup also puts the iommu_group */
			iommufd_put_group(new_igroup);
			return ERR_PTR(xa_err(igroup));
		}

		/* new_igroup was successfully installed */
		if (cur_igroup == igroup) {
			xa_unlock(&ictx->groups);
			return new_igroup;
		}

		/* Check again if the current group is any good */
		if (iommufd_group_try_get(igroup, group)) {
			xa_unlock(&ictx->groups);
			iommufd_put_group(new_igroup);
			return igroup;
		}
		/* Entry is unusable; retry replacing exactly that entry */
		cur_igroup = igroup;
	}
}
139 
140 void iommufd_device_destroy(struct iommufd_object *obj)
141 {
142 	struct iommufd_device *idev =
143 		container_of(obj, struct iommufd_device, obj);
144 
145 	iommu_device_release_dma_owner(idev->dev);
146 	iommufd_put_group(idev->igroup);
147 	if (!iommufd_selftest_is_mock_dev(idev->dev))
148 		iommufd_ctx_put(idev->ictx);
149 }
150 
151 /**
152  * iommufd_device_bind - Bind a physical device to an iommu fd
153  * @ictx: iommufd file descriptor
154  * @dev: Pointer to a physical device struct
155  * @id: Output ID number to return to userspace for this device
156  *
157  * A successful bind establishes an ownership over the device and returns
158  * struct iommufd_device pointer, otherwise returns error pointer.
159  *
160  * A driver using this API must set driver_managed_dma and must not touch
161  * the device until this routine succeeds and establishes ownership.
162  *
163  * Binding a PCI device places the entire RID under iommufd control.
164  *
165  * The caller must undo this with iommufd_device_unbind()
166  */
167 struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx,
168 					   struct device *dev, u32 *id)
169 {
170 	struct iommufd_device *idev;
171 	struct iommufd_group *igroup;
172 	int rc;
173 
174 	/*
175 	 * iommufd always sets IOMMU_CACHE because we offer no way for userspace
176 	 * to restore cache coherency.
177 	 */
178 	if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY))
179 		return ERR_PTR(-EINVAL);
180 
181 	igroup = iommufd_get_group(ictx, dev);
182 	if (IS_ERR(igroup))
183 		return ERR_CAST(igroup);
184 
185 	/*
186 	 * For historical compat with VFIO the insecure interrupt path is
187 	 * allowed if the module parameter is set. Secure/Isolated means that a
188 	 * MemWr operation from the device (eg a simple DMA) cannot trigger an
189 	 * interrupt outside this iommufd context.
190 	 */
191 	if (!iommufd_selftest_is_mock_dev(dev) &&
192 	    !iommu_group_has_isolated_msi(igroup->group)) {
193 		if (!allow_unsafe_interrupts) {
194 			rc = -EPERM;
195 			goto out_group_put;
196 		}
197 
198 		dev_warn(
199 			dev,
200 			"MSI interrupts are not secure, they cannot be isolated by the platform. "
201 			"Check that platform features like interrupt remapping are enabled. "
202 			"Use the \"allow_unsafe_interrupts\" module parameter to override\n");
203 	}
204 
205 	rc = iommu_device_claim_dma_owner(dev, ictx);
206 	if (rc)
207 		goto out_group_put;
208 
209 	idev = iommufd_object_alloc(ictx, idev, IOMMUFD_OBJ_DEVICE);
210 	if (IS_ERR(idev)) {
211 		rc = PTR_ERR(idev);
212 		goto out_release_owner;
213 	}
214 	idev->ictx = ictx;
215 	if (!iommufd_selftest_is_mock_dev(dev))
216 		iommufd_ctx_get(ictx);
217 	idev->dev = dev;
218 	idev->enforce_cache_coherency =
219 		device_iommu_capable(dev, IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
220 	/* The calling driver is a user until iommufd_device_unbind() */
221 	refcount_inc(&idev->obj.users);
222 	/* igroup refcount moves into iommufd_device */
223 	idev->igroup = igroup;
224 
225 	/*
226 	 * If the caller fails after this success it must call
227 	 * iommufd_unbind_device() which is safe since we hold this refcount.
228 	 * This also means the device is a leaf in the graph and no other object
229 	 * can take a reference on it.
230 	 */
231 	iommufd_object_finalize(ictx, &idev->obj);
232 	*id = idev->obj.id;
233 	return idev;
234 
235 out_release_owner:
236 	iommu_device_release_dma_owner(dev);
237 out_group_put:
238 	iommufd_put_group(igroup);
239 	return ERR_PTR(rc);
240 }
241 EXPORT_SYMBOL_NS_GPL(iommufd_device_bind, "IOMMUFD");
242 
243 /**
244  * iommufd_ctx_has_group - True if any device within the group is bound
245  *                         to the ictx
246  * @ictx: iommufd file descriptor
247  * @group: Pointer to a physical iommu_group struct
248  *
249  * True if any device within the group has been bound to this ictx, ex. via
250  * iommufd_device_bind(), therefore implying ictx ownership of the group.
251  */
252 bool iommufd_ctx_has_group(struct iommufd_ctx *ictx, struct iommu_group *group)
253 {
254 	struct iommufd_object *obj;
255 	unsigned long index;
256 
257 	if (!ictx || !group)
258 		return false;
259 
260 	xa_lock(&ictx->objects);
261 	xa_for_each(&ictx->objects, index, obj) {
262 		if (obj->type == IOMMUFD_OBJ_DEVICE &&
263 		    container_of(obj, struct iommufd_device, obj)
264 				    ->igroup->group == group) {
265 			xa_unlock(&ictx->objects);
266 			return true;
267 		}
268 	}
269 	xa_unlock(&ictx->objects);
270 	return false;
271 }
272 EXPORT_SYMBOL_NS_GPL(iommufd_ctx_has_group, "IOMMUFD");
273 
274 /**
275  * iommufd_device_unbind - Undo iommufd_device_bind()
276  * @idev: Device returned by iommufd_device_bind()
277  *
278  * Release the device from iommufd control. The DMA ownership will return back
279  * to unowned with DMA controlled by the DMA API. This invalidates the
280  * iommufd_device pointer, other APIs that consume it must not be called
281  * concurrently.
282  */
283 void iommufd_device_unbind(struct iommufd_device *idev)
284 {
285 	iommufd_object_destroy_user(idev->ictx, &idev->obj);
286 }
287 EXPORT_SYMBOL_NS_GPL(iommufd_device_unbind, "IOMMUFD");
288 
289 struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev)
290 {
291 	return idev->ictx;
292 }
293 EXPORT_SYMBOL_NS_GPL(iommufd_device_to_ictx, "IOMMUFD");
294 
295 u32 iommufd_device_to_id(struct iommufd_device *idev)
296 {
297 	return idev->obj.id;
298 }
299 EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, "IOMMUFD");
300 
301 static unsigned int iommufd_group_device_num(struct iommufd_group *igroup,
302 					     ioasid_t pasid)
303 {
304 	struct iommufd_attach *attach;
305 	struct iommufd_device *idev;
306 	unsigned int count = 0;
307 	unsigned long index;
308 
309 	lockdep_assert_held(&igroup->lock);
310 
311 	attach = xa_load(&igroup->pasid_attach, pasid);
312 	if (attach)
313 		xa_for_each(&attach->device_array, index, idev)
314 			count++;
315 	return count;
316 }
317 
#ifdef CONFIG_IRQ_MSI_IOMMU
/*
 * Install into @hwpt_paging every MSI map of the context that matches this
 * group's sw_msi_start and is flagged in its required_sw_msi bitmap. Nothing
 * is done while sw_msi_start still has its initial PHYS_ADDR_MAX value.
 * Returns 0, or the first error from iommufd_sw_msi_install().
 */
static int iommufd_group_setup_msi(struct iommufd_group *igroup,
				   struct iommufd_hwpt_paging *hwpt_paging)
{
	struct iommufd_ctx *ictx = igroup->ictx;
	struct iommufd_sw_msi_map *map;
	int rc;

	if (igroup->sw_msi_start == PHYS_ADDR_MAX)
		return 0;

	/*
	 * Install all the MSI pages the device has been using into the domain.
	 * The guard releases sw_msi_lock on every return path below.
	 */
	guard(mutex)(&ictx->sw_msi_lock);
	list_for_each_entry(map, &ictx->sw_msi_list, sw_msi_item) {
		if (map->sw_msi_start == igroup->sw_msi_start &&
		    test_bit(map->id, igroup->required_sw_msi.bitmap)) {
			rc = iommufd_sw_msi_install(ictx, hwpt_paging, map);
			if (rc)
				return rc;
		}
	}
	return 0;
}
#else
/* Stub used when CONFIG_IRQ_MSI_IOMMU is disabled: nothing to install. */
static inline int
iommufd_group_setup_msi(struct iommufd_group *igroup,
			struct iommufd_hwpt_paging *hwpt_paging)
{
	return 0;
}
#endif
353 
354 static bool
355 iommufd_group_first_attach(struct iommufd_group *igroup, ioasid_t pasid)
356 {
357 	lockdep_assert_held(&igroup->lock);
358 	return !xa_load(&igroup->pasid_attach, pasid);
359 }
360 
361 static int
362 iommufd_device_attach_reserved_iova(struct iommufd_device *idev,
363 				    struct iommufd_hwpt_paging *hwpt_paging)
364 {
365 	struct iommufd_group *igroup = idev->igroup;
366 	int rc;
367 
368 	lockdep_assert_held(&igroup->lock);
369 
370 	rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt,
371 						 idev->dev,
372 						 &igroup->sw_msi_start);
373 	if (rc)
374 		return rc;
375 
376 	if (iommufd_group_first_attach(igroup, IOMMU_NO_PASID)) {
377 		rc = iommufd_group_setup_msi(igroup, hwpt_paging);
378 		if (rc) {
379 			iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt,
380 						  idev->dev);
381 			return rc;
382 		}
383 	}
384 	return 0;
385 }
386 
387 /* The device attach/detach/replace helpers for attach_handle */
388 
389 static bool iommufd_device_is_attached(struct iommufd_device *idev,
390 				       ioasid_t pasid)
391 {
392 	struct iommufd_attach *attach;
393 
394 	attach = xa_load(&idev->igroup->pasid_attach, pasid);
395 	return xa_load(&attach->device_array, idev->obj.id);
396 }
397 
398 static int iommufd_hwpt_pasid_compat(struct iommufd_hw_pagetable *hwpt,
399 				     struct iommufd_device *idev,
400 				     ioasid_t pasid)
401 {
402 	struct iommufd_group *igroup = idev->igroup;
403 
404 	lockdep_assert_held(&igroup->lock);
405 
406 	if (pasid == IOMMU_NO_PASID) {
407 		unsigned long start = IOMMU_NO_PASID;
408 
409 		if (!hwpt->pasid_compat &&
410 		    xa_find_after(&igroup->pasid_attach,
411 				  &start, UINT_MAX, XA_PRESENT))
412 			return -EINVAL;
413 	} else {
414 		struct iommufd_attach *attach;
415 
416 		if (!hwpt->pasid_compat)
417 			return -EINVAL;
418 
419 		attach = xa_load(&igroup->pasid_attach, IOMMU_NO_PASID);
420 		if (attach && attach->hwpt && !attach->hwpt->pasid_compat)
421 			return -EINVAL;
422 	}
423 
424 	return 0;
425 }
426 
427 static bool iommufd_hwpt_compatible_device(struct iommufd_hw_pagetable *hwpt,
428 					   struct iommufd_device *idev)
429 {
430 	struct pci_dev *pdev;
431 
432 	if (!hwpt->fault || !dev_is_pci(idev->dev))
433 		return true;
434 
435 	/*
436 	 * Once we turn on PCI/PRI support for VF, the response failure code
437 	 * should not be forwarded to the hardware due to PRI being a shared
438 	 * resource between PF and VFs. There is no coordination for this
439 	 * shared capability. This waits for a vPRI reset to recover.
440 	 */
441 	pdev = to_pci_dev(idev->dev);
442 
443 	return (!pdev->is_virtfn || !pci_pri_supported(pdev));
444 }
445 
446 static int iommufd_hwpt_attach_device(struct iommufd_hw_pagetable *hwpt,
447 				      struct iommufd_device *idev,
448 				      ioasid_t pasid)
449 {
450 	struct iommufd_attach_handle *handle;
451 	int rc;
452 
453 	if (!iommufd_hwpt_compatible_device(hwpt, idev))
454 		return -EINVAL;
455 
456 	rc = iommufd_hwpt_pasid_compat(hwpt, idev, pasid);
457 	if (rc)
458 		return rc;
459 
460 	handle = kzalloc(sizeof(*handle), GFP_KERNEL);
461 	if (!handle)
462 		return -ENOMEM;
463 
464 	handle->idev = idev;
465 	if (pasid == IOMMU_NO_PASID)
466 		rc = iommu_attach_group_handle(hwpt->domain, idev->igroup->group,
467 					       &handle->handle);
468 	else
469 		rc = iommu_attach_device_pasid(hwpt->domain, idev->dev, pasid,
470 					       &handle->handle);
471 	if (rc)
472 		goto out_free_handle;
473 
474 	return 0;
475 
476 out_free_handle:
477 	kfree(handle);
478 	return rc;
479 }
480 
481 static struct iommufd_attach_handle *
482 iommufd_device_get_attach_handle(struct iommufd_device *idev, ioasid_t pasid)
483 {
484 	struct iommu_attach_handle *handle;
485 
486 	lockdep_assert_held(&idev->igroup->lock);
487 
488 	handle =
489 		iommu_attach_handle_get(idev->igroup->group, pasid, 0);
490 	if (IS_ERR(handle))
491 		return NULL;
492 	return to_iommufd_handle(handle);
493 }
494 
495 static void iommufd_hwpt_detach_device(struct iommufd_hw_pagetable *hwpt,
496 				       struct iommufd_device *idev,
497 				       ioasid_t pasid)
498 {
499 	struct iommufd_attach_handle *handle;
500 
501 	handle = iommufd_device_get_attach_handle(idev, pasid);
502 	if (pasid == IOMMU_NO_PASID)
503 		iommu_detach_group_handle(hwpt->domain, idev->igroup->group);
504 	else
505 		iommu_detach_device_pasid(hwpt->domain, idev->dev, pasid);
506 
507 	iommufd_auto_response_faults(hwpt, handle);
508 	kfree(handle);
509 }
510 
511 static int iommufd_hwpt_replace_device(struct iommufd_device *idev,
512 				       ioasid_t pasid,
513 				       struct iommufd_hw_pagetable *hwpt,
514 				       struct iommufd_hw_pagetable *old)
515 {
516 	struct iommufd_attach_handle *handle, *old_handle;
517 	int rc;
518 
519 	if (!iommufd_hwpt_compatible_device(hwpt, idev))
520 		return -EINVAL;
521 
522 	rc = iommufd_hwpt_pasid_compat(hwpt, idev, pasid);
523 	if (rc)
524 		return rc;
525 
526 	old_handle = iommufd_device_get_attach_handle(idev, pasid);
527 
528 	handle = kzalloc(sizeof(*handle), GFP_KERNEL);
529 	if (!handle)
530 		return -ENOMEM;
531 
532 	handle->idev = idev;
533 	if (pasid == IOMMU_NO_PASID)
534 		rc = iommu_replace_group_handle(idev->igroup->group,
535 						hwpt->domain, &handle->handle);
536 	else
537 		rc = iommu_replace_device_pasid(hwpt->domain, idev->dev,
538 						pasid, &handle->handle);
539 	if (rc)
540 		goto out_free_handle;
541 
542 	iommufd_auto_response_faults(hwpt, old_handle);
543 	kfree(old_handle);
544 
545 	return 0;
546 
547 out_free_handle:
548 	kfree(handle);
549 	return rc;
550 }
551 
/*
 * Attach @idev to @hwpt for @pasid (IOMMU_NO_PASID for the RID).
 *
 * The first device of a group to attach for a PASID performs the actual
 * domain attach and publishes the iommufd_attach entry; later devices of the
 * same group only add themselves to its device_array and must name the same
 * hwpt. Each attached device holds one reference on hwpt->obj.users.
 *
 * Returns 0 on success or a negative errno; on failure every reservation made
 * here is undone. -EINVAL is returned when the group is already attached to a
 * different hwpt for this PASID.
 */
int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
				struct iommufd_device *idev, ioasid_t pasid)
{
	struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt);
	/* Reserved IOVA handling only applies to paging hwpts on the RID */
	bool attach_resv = hwpt_paging && pasid == IOMMU_NO_PASID;
	struct iommufd_group *igroup = idev->igroup;
	struct iommufd_hw_pagetable *old_hwpt;
	struct iommufd_attach *attach;
	int rc;

	mutex_lock(&igroup->lock);

	/*
	 * Reserve the PASID slot if it is empty, otherwise fetch the existing
	 * attach entry. While only reserved, the slot still reads back as
	 * NULL, so iommufd_group_first_attach() keeps returning true.
	 */
	attach = xa_cmpxchg(&igroup->pasid_attach, pasid, NULL,
			    XA_ZERO_ENTRY, GFP_KERNEL);
	if (xa_is_err(attach)) {
		rc = xa_err(attach);
		goto err_unlock;
	}

	if (!attach) {
		/* First attach for this PASID: not published until success */
		attach = kzalloc(sizeof(*attach), GFP_KERNEL);
		if (!attach) {
			rc = -ENOMEM;
			goto err_release_pasid;
		}
		xa_init(&attach->device_array);
	}

	old_hwpt = attach->hwpt;

	/* Reserve this device's slot; fails if the device is already there */
	rc = xa_insert(&attach->device_array, idev->obj.id, XA_ZERO_ENTRY,
		       GFP_KERNEL);
	if (rc) {
		WARN_ON(rc == -EBUSY && !old_hwpt);
		goto err_free_attach;
	}

	/* All devices of the group must share one hwpt per PASID */
	if (old_hwpt && old_hwpt != hwpt) {
		rc = -EINVAL;
		goto err_release_devid;
	}

	if (attach_resv) {
		rc = iommufd_device_attach_reserved_iova(idev, hwpt_paging);
		if (rc)
			goto err_release_devid;
	}

	/*
	 * Only attach to the group once for the first device that is in the
	 * group. All the other devices will follow this attachment. The user
	 * should attach every device individually to the hwpt as the per-device
	 * reserved regions are only updated during individual device
	 * attachment.
	 */
	if (iommufd_group_first_attach(igroup, pasid)) {
		rc = iommufd_hwpt_attach_device(hwpt, idev, pasid);
		if (rc)
			goto err_unresv;
		attach->hwpt = hwpt;
		/* Publish the entry into the slot reserved above */
		WARN_ON(xa_is_err(xa_store(&igroup->pasid_attach, pasid, attach,
					   GFP_KERNEL)));
	}
	/* One hwpt reference per attached device */
	refcount_inc(&hwpt->obj.users);
	/* Turn the reserved device slot into the real entry */
	WARN_ON(xa_is_err(xa_store(&attach->device_array, idev->obj.id,
				   idev, GFP_KERNEL)));
	mutex_unlock(&igroup->lock);
	return 0;
err_unresv:
	if (attach_resv)
		iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev);
err_release_devid:
	xa_release(&attach->device_array, idev->obj.id);
err_free_attach:
	/* Only free an entry allocated by this call, never a published one */
	if (iommufd_group_first_attach(igroup, pasid))
		kfree(attach);
err_release_pasid:
	/* Likewise only drop the PASID reservation made by this call */
	if (iommufd_group_first_attach(igroup, pasid))
		xa_release(&igroup->pasid_attach, pasid);
err_unlock:
	mutex_unlock(&igroup->lock);
	return rc;
}
635 
636 struct iommufd_hw_pagetable *
637 iommufd_hw_pagetable_detach(struct iommufd_device *idev, ioasid_t pasid)
638 {
639 	struct iommufd_group *igroup = idev->igroup;
640 	struct iommufd_hwpt_paging *hwpt_paging;
641 	struct iommufd_hw_pagetable *hwpt;
642 	struct iommufd_attach *attach;
643 
644 	mutex_lock(&igroup->lock);
645 	attach = xa_load(&igroup->pasid_attach, pasid);
646 	if (!attach) {
647 		mutex_unlock(&igroup->lock);
648 		return NULL;
649 	}
650 
651 	hwpt = attach->hwpt;
652 	hwpt_paging = find_hwpt_paging(hwpt);
653 
654 	xa_erase(&attach->device_array, idev->obj.id);
655 	if (xa_empty(&attach->device_array)) {
656 		iommufd_hwpt_detach_device(hwpt, idev, pasid);
657 		xa_erase(&igroup->pasid_attach, pasid);
658 		kfree(attach);
659 	}
660 	if (hwpt_paging && pasid == IOMMU_NO_PASID)
661 		iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev);
662 	mutex_unlock(&igroup->lock);
663 
664 	/* Caller must destroy hwpt */
665 	return hwpt;
666 }
667 
668 static struct iommufd_hw_pagetable *
669 iommufd_device_do_attach(struct iommufd_device *idev, ioasid_t pasid,
670 			 struct iommufd_hw_pagetable *hwpt)
671 {
672 	int rc;
673 
674 	rc = iommufd_hw_pagetable_attach(hwpt, idev, pasid);
675 	if (rc)
676 		return ERR_PTR(rc);
677 	return NULL;
678 }
679 
680 static void
681 iommufd_group_remove_reserved_iova(struct iommufd_group *igroup,
682 				   struct iommufd_hwpt_paging *hwpt_paging)
683 {
684 	struct iommufd_attach *attach;
685 	struct iommufd_device *cur;
686 	unsigned long index;
687 
688 	lockdep_assert_held(&igroup->lock);
689 
690 	attach = xa_load(&igroup->pasid_attach, IOMMU_NO_PASID);
691 	xa_for_each(&attach->device_array, index, cur)
692 		iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, cur->dev);
693 }
694 
695 static int
696 iommufd_group_do_replace_reserved_iova(struct iommufd_group *igroup,
697 				       struct iommufd_hwpt_paging *hwpt_paging)
698 {
699 	struct iommufd_hwpt_paging *old_hwpt_paging;
700 	struct iommufd_attach *attach;
701 	struct iommufd_device *cur;
702 	unsigned long index;
703 	int rc;
704 
705 	lockdep_assert_held(&igroup->lock);
706 
707 	attach = xa_load(&igroup->pasid_attach, IOMMU_NO_PASID);
708 	old_hwpt_paging = find_hwpt_paging(attach->hwpt);
709 	if (!old_hwpt_paging || hwpt_paging->ioas != old_hwpt_paging->ioas) {
710 		xa_for_each(&attach->device_array, index, cur) {
711 			rc = iopt_table_enforce_dev_resv_regions(
712 				&hwpt_paging->ioas->iopt, cur->dev, NULL);
713 			if (rc)
714 				goto err_unresv;
715 		}
716 	}
717 
718 	rc = iommufd_group_setup_msi(igroup, hwpt_paging);
719 	if (rc)
720 		goto err_unresv;
721 	return 0;
722 
723 err_unresv:
724 	iommufd_group_remove_reserved_iova(igroup, hwpt_paging);
725 	return rc;
726 }
727 
/*
 * attach_fn implementation for replace: switch the attachment of @idev's
 * group for @pasid from its current hwpt to @hwpt.
 *
 * Returns NULL if @hwpt is already the attached hwpt, the previous hwpt on
 * success (the caller must put the one reference left on it), or an ERR_PTR.
 * -EINVAL is returned when there is no attachment for @pasid or @idev is not
 * part of it.
 */
static struct iommufd_hw_pagetable *
iommufd_device_do_replace(struct iommufd_device *idev, ioasid_t pasid,
			  struct iommufd_hw_pagetable *hwpt)
{
	struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt);
	/* Reserved IOVA handling only applies to paging hwpts on the RID */
	bool attach_resv = hwpt_paging && pasid == IOMMU_NO_PASID;
	struct iommufd_hwpt_paging *old_hwpt_paging;
	struct iommufd_group *igroup = idev->igroup;
	struct iommufd_hw_pagetable *old_hwpt;
	struct iommufd_attach *attach;
	unsigned int num_devices;
	int rc;

	mutex_lock(&igroup->lock);

	/* Replace requires an existing attachment for this PASID */
	attach = xa_load(&igroup->pasid_attach, pasid);
	if (!attach) {
		rc = -EINVAL;
		goto err_unlock;
	}

	old_hwpt = attach->hwpt;

	WARN_ON(!old_hwpt || xa_empty(&attach->device_array));

	if (!iommufd_device_is_attached(idev, pasid)) {
		rc = -EINVAL;
		goto err_unlock;
	}

	/* Nothing to do, and no old hwpt for the caller to release */
	if (hwpt == old_hwpt) {
		mutex_unlock(&igroup->lock);
		return NULL;
	}

	/* Set up the new IOAS before switching the domain */
	if (attach_resv) {
		rc = iommufd_group_do_replace_reserved_iova(igroup, hwpt_paging);
		if (rc)
			goto err_unlock;
	}

	rc = iommufd_hwpt_replace_device(idev, pasid, hwpt, old_hwpt);
	if (rc)
		goto err_unresv;

	/*
	 * Drop the group's reserved IOVA from the old IOAS, unless the new
	 * hwpt is a paging one on that same IOAS.
	 */
	old_hwpt_paging = find_hwpt_paging(old_hwpt);
	if (old_hwpt_paging && pasid == IOMMU_NO_PASID &&
	    (!hwpt_paging || hwpt_paging->ioas != old_hwpt_paging->ioas))
		iommufd_group_remove_reserved_iova(igroup, old_hwpt_paging);

	attach->hwpt = hwpt;

	num_devices = iommufd_group_device_num(igroup, pasid);
	/*
	 * Move the refcounts held by the device_array to the new hwpt. Retain a
	 * refcount for this thread as the caller will free it.
	 */
	refcount_add(num_devices, &hwpt->obj.users);
	if (num_devices > 1)
		WARN_ON(refcount_sub_and_test(num_devices - 1,
					      &old_hwpt->obj.users));
	mutex_unlock(&igroup->lock);

	/* Caller must destroy old_hwpt */
	return old_hwpt;
err_unresv:
	if (attach_resv)
		iommufd_group_remove_reserved_iova(igroup, hwpt_paging);
err_unlock:
	mutex_unlock(&igroup->lock);
	return ERR_PTR(rc);
}
800 
/*
 * Signature shared by iommufd_device_do_attach() and
 * iommufd_device_do_replace(). The return value is an ERR_PTR on failure,
 * otherwise either NULL or a hw pagetable that the caller must put once all
 * locks are dropped.
 */
typedef struct iommufd_hw_pagetable *(*attach_fn)(
	struct iommufd_device *idev, ioasid_t pasid,
	struct iommufd_hw_pagetable *hwpt);
804 
/*
 * When automatically managing the domains we search for a compatible domain in
 * the iopt and if one is found use it, otherwise create a new domain.
 * Automatic domain selection will never pick a manually created domain.
 *
 * On success *@pt_id is set to the ID of the hwpt that was attached and the
 * return value is whatever @do_attach returned (NULL, or an old hwpt for the
 * caller to put). On failure an ERR_PTR is returned.
 */
static struct iommufd_hw_pagetable *
iommufd_device_auto_get_domain(struct iommufd_device *idev, ioasid_t pasid,
			       struct iommufd_ioas *ioas, u32 *pt_id,
			       attach_fn do_attach)
{
	/*
	 * iommufd_hw_pagetable_attach() is called by
	 * iommufd_hw_pagetable_alloc() in immediate attachment mode, same as
	 * iommufd_device_do_attach(). So if we are in this mode then we prefer
	 * to use the immediate_attach path as it supports drivers that can't
	 * directly allocate a domain.
	 */
	bool immediate_attach = do_attach == iommufd_device_do_attach;
	struct iommufd_hw_pagetable *destroy_hwpt;
	struct iommufd_hwpt_paging *hwpt_paging;
	struct iommufd_hw_pagetable *hwpt;

	/*
	 * There is no differentiation when domains are allocated, so any domain
	 * that is willing to attach to the device is interchangeable with any
	 * other.
	 */
	mutex_lock(&ioas->mutex);
	list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) {
		/* Skip manually created domains */
		if (!hwpt_paging->auto_domain)
			continue;

		hwpt = &hwpt_paging->common;
		/* Skip objects that cannot be locked */
		if (!iommufd_lock_obj(&hwpt->obj))
			continue;
		destroy_hwpt = (*do_attach)(idev, pasid, hwpt);
		if (IS_ERR(destroy_hwpt)) {
			iommufd_put_object(idev->ictx, &hwpt->obj);
			/*
			 * -EINVAL means the domain is incompatible with the
			 * device. Other error codes should propagate to
			 * userspace as failure. Success means the domain is
			 * attached.
			 */
			if (PTR_ERR(destroy_hwpt) == -EINVAL)
				continue;
			goto out_unlock;
		}
		*pt_id = hwpt->obj.id;
		iommufd_put_object(idev->ictx, &hwpt->obj);
		goto out_unlock;
	}

	/* No existing auto domain was usable: allocate a new one */
	hwpt_paging = iommufd_hwpt_paging_alloc(idev->ictx, ioas, idev, pasid,
						0, immediate_attach, NULL);
	if (IS_ERR(hwpt_paging)) {
		destroy_hwpt = ERR_CAST(hwpt_paging);
		goto out_unlock;
	}
	hwpt = &hwpt_paging->common;

	if (!immediate_attach) {
		destroy_hwpt = (*do_attach)(idev, pasid, hwpt);
		if (IS_ERR(destroy_hwpt))
			goto out_abort;
	} else {
		/* The allocation already attached; nothing old to release */
		destroy_hwpt = NULL;
	}

	hwpt_paging->auto_domain = true;
	*pt_id = hwpt->obj.id;

	iommufd_object_finalize(idev->ictx, &hwpt->obj);
	mutex_unlock(&ioas->mutex);
	return destroy_hwpt;

out_abort:
	iommufd_object_abort_and_destroy(idev->ictx, &hwpt->obj);
out_unlock:
	mutex_unlock(&ioas->mutex);
	return destroy_hwpt;
}
887 
888 static int iommufd_device_change_pt(struct iommufd_device *idev,
889 				    ioasid_t pasid,
890 				    u32 *pt_id, attach_fn do_attach)
891 {
892 	struct iommufd_hw_pagetable *destroy_hwpt;
893 	struct iommufd_object *pt_obj;
894 
895 	pt_obj = iommufd_get_object(idev->ictx, *pt_id, IOMMUFD_OBJ_ANY);
896 	if (IS_ERR(pt_obj))
897 		return PTR_ERR(pt_obj);
898 
899 	switch (pt_obj->type) {
900 	case IOMMUFD_OBJ_HWPT_NESTED:
901 	case IOMMUFD_OBJ_HWPT_PAGING: {
902 		struct iommufd_hw_pagetable *hwpt =
903 			container_of(pt_obj, struct iommufd_hw_pagetable, obj);
904 
905 		destroy_hwpt = (*do_attach)(idev, pasid, hwpt);
906 		if (IS_ERR(destroy_hwpt))
907 			goto out_put_pt_obj;
908 		break;
909 	}
910 	case IOMMUFD_OBJ_IOAS: {
911 		struct iommufd_ioas *ioas =
912 			container_of(pt_obj, struct iommufd_ioas, obj);
913 
914 		destroy_hwpt = iommufd_device_auto_get_domain(idev, pasid, ioas,
915 							      pt_id, do_attach);
916 		if (IS_ERR(destroy_hwpt))
917 			goto out_put_pt_obj;
918 		break;
919 	}
920 	default:
921 		destroy_hwpt = ERR_PTR(-EINVAL);
922 		goto out_put_pt_obj;
923 	}
924 	iommufd_put_object(idev->ictx, pt_obj);
925 
926 	/* This destruction has to be after we unlock everything */
927 	if (destroy_hwpt)
928 		iommufd_hw_pagetable_put(idev->ictx, destroy_hwpt);
929 	return 0;
930 
931 out_put_pt_obj:
932 	iommufd_put_object(idev->ictx, pt_obj);
933 	return PTR_ERR(destroy_hwpt);
934 }
935 
936 /**
937  * iommufd_device_attach - Connect a device/pasid to an iommu_domain
938  * @idev: device to attach
939  * @pasid: pasid to attach
940  * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING
941  *         Output the IOMMUFD_OBJ_HWPT_PAGING ID
942  *
943  * This connects the device/pasid to an iommu_domain, either automatically
944  * or manually selected. Once this completes the device could do DMA with
945  * @pasid. @pasid is IOMMU_NO_PASID if this attach is for no pasid usage.
946  *
947  * The caller should return the resulting pt_id back to userspace.
948  * This function is undone by calling iommufd_device_detach().
949  */
950 int iommufd_device_attach(struct iommufd_device *idev, ioasid_t pasid,
951 			  u32 *pt_id)
952 {
953 	int rc;
954 
955 	rc = iommufd_device_change_pt(idev, pasid, pt_id,
956 				      &iommufd_device_do_attach);
957 	if (rc)
958 		return rc;
959 
960 	/*
961 	 * Pairs with iommufd_device_detach() - catches caller bugs attempting
962 	 * to destroy a device with an attachment.
963 	 */
964 	refcount_inc(&idev->obj.users);
965 	return 0;
966 }
967 EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, "IOMMUFD");
968 
969 /**
970  * iommufd_device_replace - Change the device/pasid's iommu_domain
971  * @idev: device to change
972  * @pasid: pasid to change
973  * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING
974  *         Output the IOMMUFD_OBJ_HWPT_PAGING ID
975  *
976  * This is the same as::
977  *
978  *   iommufd_device_detach();
979  *   iommufd_device_attach();
980  *
981  * If it fails then no change is made to the attachment. The iommu driver may
982  * implement this so there is no disruption in translation. This can only be
983  * called if iommufd_device_attach() has already succeeded. @pasid is
984  * IOMMU_NO_PASID for no pasid usage.
985  */
986 int iommufd_device_replace(struct iommufd_device *idev, ioasid_t pasid,
987 			   u32 *pt_id)
988 {
989 	return iommufd_device_change_pt(idev, pasid, pt_id,
990 					&iommufd_device_do_replace);
991 }
992 EXPORT_SYMBOL_NS_GPL(iommufd_device_replace, "IOMMUFD");
993 
994 /**
995  * iommufd_device_detach - Disconnect a device/device to an iommu_domain
996  * @idev: device to detach
997  * @pasid: pasid to detach
998  *
999  * Undo iommufd_device_attach(). This disconnects the idev from the previously
1000  * attached pt_id. The device returns back to a blocked DMA translation.
1001  * @pasid is IOMMU_NO_PASID for no pasid usage.
1002  */
1003 void iommufd_device_detach(struct iommufd_device *idev, ioasid_t pasid)
1004 {
1005 	struct iommufd_hw_pagetable *hwpt;
1006 
1007 	hwpt = iommufd_hw_pagetable_detach(idev, pasid);
1008 	if (!hwpt)
1009 		return;
1010 	iommufd_hw_pagetable_put(idev->ictx, hwpt);
1011 	refcount_dec(&idev->obj.users);
1012 }
1013 EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, "IOMMUFD");
1014 
/*
 * Switch @access from its current IOAS to @new_ioas; either may be NULL.
 *
 * On success, it will refcount_inc() at a valid new_ioas and refcount_dec() at
 * a valid cur_ioas (access->ioas). A caller passing in a valid new_ioas should
 * call iommufd_put_object() if it does an iommufd_get_object() for a new_ioas.
 *
 * Must be called with access->ioas_lock held; the lock is temporarily dropped
 * around the ops->unmap() callback. Returns 0 on success (including when
 * nothing changes), -EBUSY when racing with a concurrent detach, or the error
 * from iopt_add_access(), in which case access->ioas is restored.
 */
static int iommufd_access_change_ioas(struct iommufd_access *access,
				      struct iommufd_ioas *new_ioas)
{
	/* Sampled up front and later passed to iopt_remove_access() */
	u32 iopt_access_list_id = access->iopt_access_list_id;
	struct iommufd_ioas *cur_ioas = access->ioas;
	int rc;

	lockdep_assert_held(&access->ioas_lock);

	/* We are racing with a concurrent detach, bail */
	if (cur_ioas != access->ioas_unpin)
		return -EBUSY;

	if (cur_ioas == new_ioas)
		return 0;

	/*
	 * Set ioas to NULL to block any further iommufd_access_pin_pages().
	 * iommufd_access_unpin_pages() can continue using access->ioas_unpin.
	 */
	access->ioas = NULL;

	if (new_ioas) {
		rc = iopt_add_access(&new_ioas->iopt, access);
		if (rc) {
			/* Roll back so the access keeps using cur_ioas */
			access->ioas = cur_ioas;
			return rc;
		}
		refcount_inc(&new_ioas->obj.users);
	}

	if (cur_ioas) {
		if (access->ops->unmap) {
			/* The callback is invoked without ioas_lock held */
			mutex_unlock(&access->ioas_lock);
			access->ops->unmap(access->data, 0, ULONG_MAX);
			mutex_lock(&access->ioas_lock);
		}
		iopt_remove_access(&cur_ioas->iopt, access, iopt_access_list_id);
		refcount_dec(&cur_ioas->obj.users);
	}

	access->ioas = new_ioas;
	access->ioas_unpin = new_ioas;

	return 0;
}
1066 
1067 static int iommufd_access_change_ioas_id(struct iommufd_access *access, u32 id)
1068 {
1069 	struct iommufd_ioas *ioas = iommufd_get_ioas(access->ictx, id);
1070 	int rc;
1071 
1072 	if (IS_ERR(ioas))
1073 		return PTR_ERR(ioas);
1074 	rc = iommufd_access_change_ioas(access, ioas);
1075 	iommufd_put_object(access->ictx, &ioas->obj);
1076 	return rc;
1077 }
1078 
1079 void iommufd_access_destroy_object(struct iommufd_object *obj)
1080 {
1081 	struct iommufd_access *access =
1082 		container_of(obj, struct iommufd_access, obj);
1083 
1084 	mutex_lock(&access->ioas_lock);
1085 	if (access->ioas)
1086 		WARN_ON(iommufd_access_change_ioas(access, NULL));
1087 	mutex_unlock(&access->ioas_lock);
1088 	iommufd_ctx_put(access->ictx);
1089 }
1090 
1091 /**
1092  * iommufd_access_create - Create an iommufd_access
1093  * @ictx: iommufd file descriptor
1094  * @ops: Driver's ops to associate with the access
1095  * @data: Opaque data to pass into ops functions
1096  * @id: Output ID number to return to userspace for this access
1097  *
1098  * An iommufd_access allows a driver to read/write to the IOAS without using
1099  * DMA. The underlying CPU memory can be accessed using the
1100  * iommufd_access_pin_pages() or iommufd_access_rw() functions.
1101  *
1102  * The provided ops are required to use iommufd_access_pin_pages().
1103  */
1104 struct iommufd_access *
1105 iommufd_access_create(struct iommufd_ctx *ictx,
1106 		      const struct iommufd_access_ops *ops, void *data, u32 *id)
1107 {
1108 	struct iommufd_access *access;
1109 
1110 	/*
1111 	 * There is no uAPI for the access object, but to keep things symmetric
1112 	 * use the object infrastructure anyhow.
1113 	 */
1114 	access = iommufd_object_alloc(ictx, access, IOMMUFD_OBJ_ACCESS);
1115 	if (IS_ERR(access))
1116 		return access;
1117 
1118 	access->data = data;
1119 	access->ops = ops;
1120 
1121 	if (ops->needs_pin_pages)
1122 		access->iova_alignment = PAGE_SIZE;
1123 	else
1124 		access->iova_alignment = 1;
1125 
1126 	/* The calling driver is a user until iommufd_access_destroy() */
1127 	refcount_inc(&access->obj.users);
1128 	access->ictx = ictx;
1129 	iommufd_ctx_get(ictx);
1130 	iommufd_object_finalize(ictx, &access->obj);
1131 	*id = access->obj.id;
1132 	mutex_init(&access->ioas_lock);
1133 	return access;
1134 }
1135 EXPORT_SYMBOL_NS_GPL(iommufd_access_create, "IOMMUFD");
1136 
1137 /**
1138  * iommufd_access_destroy - Destroy an iommufd_access
1139  * @access: The access to destroy
1140  *
1141  * The caller must stop using the access before destroying it.
1142  */
1143 void iommufd_access_destroy(struct iommufd_access *access)
1144 {
1145 	iommufd_object_destroy_user(access->ictx, &access->obj);
1146 }
1147 EXPORT_SYMBOL_NS_GPL(iommufd_access_destroy, "IOMMUFD");
1148 
1149 void iommufd_access_detach(struct iommufd_access *access)
1150 {
1151 	mutex_lock(&access->ioas_lock);
1152 	if (WARN_ON(!access->ioas)) {
1153 		mutex_unlock(&access->ioas_lock);
1154 		return;
1155 	}
1156 	WARN_ON(iommufd_access_change_ioas(access, NULL));
1157 	mutex_unlock(&access->ioas_lock);
1158 }
1159 EXPORT_SYMBOL_NS_GPL(iommufd_access_detach, "IOMMUFD");
1160 
1161 int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id)
1162 {
1163 	int rc;
1164 
1165 	mutex_lock(&access->ioas_lock);
1166 	if (WARN_ON(access->ioas)) {
1167 		mutex_unlock(&access->ioas_lock);
1168 		return -EINVAL;
1169 	}
1170 
1171 	rc = iommufd_access_change_ioas_id(access, ioas_id);
1172 	mutex_unlock(&access->ioas_lock);
1173 	return rc;
1174 }
1175 EXPORT_SYMBOL_NS_GPL(iommufd_access_attach, "IOMMUFD");
1176 
1177 int iommufd_access_replace(struct iommufd_access *access, u32 ioas_id)
1178 {
1179 	int rc;
1180 
1181 	mutex_lock(&access->ioas_lock);
1182 	if (!access->ioas) {
1183 		mutex_unlock(&access->ioas_lock);
1184 		return -ENOENT;
1185 	}
1186 	rc = iommufd_access_change_ioas_id(access, ioas_id);
1187 	mutex_unlock(&access->ioas_lock);
1188 	return rc;
1189 }
1190 EXPORT_SYMBOL_NS_GPL(iommufd_access_replace, "IOMMUFD");
1191 
1192 /**
1193  * iommufd_access_notify_unmap - Notify users of an iopt to stop using it
1194  * @iopt: iopt to work on
1195  * @iova: Starting iova in the iopt
1196  * @length: Number of bytes
1197  *
1198  * After this function returns there should be no users attached to the pages
1199  * linked to this iopt that intersect with iova,length. Anyone that has attached
1200  * a user through iopt_access_pages() needs to detach it through
1201  * iommufd_access_unpin_pages() before this function returns.
1202  *
1203  * iommufd_access_destroy() will wait for any outstanding unmap callback to
1204  * complete. Once iommufd_access_destroy() no unmap ops are running or will
1205  * run in the future. Due to this a driver must not create locking that prevents
1206  * unmap to complete while iommufd_access_destroy() is running.
1207  */
1208 void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
1209 				 unsigned long length)
1210 {
1211 	struct iommufd_ioas *ioas =
1212 		container_of(iopt, struct iommufd_ioas, iopt);
1213 	struct iommufd_access *access;
1214 	unsigned long index;
1215 
1216 	xa_lock(&ioas->iopt.access_list);
1217 	xa_for_each(&ioas->iopt.access_list, index, access) {
1218 		if (!iommufd_lock_obj(&access->obj))
1219 			continue;
1220 		xa_unlock(&ioas->iopt.access_list);
1221 
1222 		access->ops->unmap(access->data, iova, length);
1223 
1224 		iommufd_put_object(access->ictx, &access->obj);
1225 		xa_lock(&ioas->iopt.access_list);
1226 	}
1227 	xa_unlock(&ioas->iopt.access_list);
1228 }
1229 
1230 /**
1231  * iommufd_access_unpin_pages() - Undo iommufd_access_pin_pages
1232  * @access: IOAS access to act on
1233  * @iova: Starting IOVA
1234  * @length: Number of bytes to access
1235  *
1236  * Return the struct page's. The caller must stop accessing them before calling
1237  * this. The iova/length must exactly match the one provided to access_pages.
1238  */
1239 void iommufd_access_unpin_pages(struct iommufd_access *access,
1240 				unsigned long iova, unsigned long length)
1241 {
1242 	struct iopt_area_contig_iter iter;
1243 	struct io_pagetable *iopt;
1244 	unsigned long last_iova;
1245 	struct iopt_area *area;
1246 
1247 	if (WARN_ON(!length) ||
1248 	    WARN_ON(check_add_overflow(iova, length - 1, &last_iova)))
1249 		return;
1250 
1251 	mutex_lock(&access->ioas_lock);
1252 	/*
1253 	 * The driver must be doing something wrong if it calls this before an
1254 	 * iommufd_access_attach() or after an iommufd_access_detach().
1255 	 */
1256 	if (WARN_ON(!access->ioas_unpin)) {
1257 		mutex_unlock(&access->ioas_lock);
1258 		return;
1259 	}
1260 	iopt = &access->ioas_unpin->iopt;
1261 
1262 	down_read(&iopt->iova_rwsem);
1263 	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova)
1264 		iopt_area_remove_access(
1265 			area, iopt_area_iova_to_index(area, iter.cur_iova),
1266 			iopt_area_iova_to_index(
1267 				area,
1268 				min(last_iova, iopt_area_last_iova(area))));
1269 	WARN_ON(!iopt_area_contig_done(&iter));
1270 	up_read(&iopt->iova_rwsem);
1271 	mutex_unlock(&access->ioas_lock);
1272 }
1273 EXPORT_SYMBOL_NS_GPL(iommufd_access_unpin_pages, "IOMMUFD");
1274 
1275 static bool iopt_area_contig_is_aligned(struct iopt_area_contig_iter *iter)
1276 {
1277 	if (iopt_area_start_byte(iter->area, iter->cur_iova) % PAGE_SIZE)
1278 		return false;
1279 
1280 	if (!iopt_area_contig_done(iter) &&
1281 	    (iopt_area_start_byte(iter->area, iopt_area_last_iova(iter->area)) %
1282 	     PAGE_SIZE) != (PAGE_SIZE - 1))
1283 		return false;
1284 	return true;
1285 }
1286 
1287 static bool check_area_prot(struct iopt_area *area, unsigned int flags)
1288 {
1289 	if (flags & IOMMUFD_ACCESS_RW_WRITE)
1290 		return area->iommu_prot & IOMMU_WRITE;
1291 	return area->iommu_prot & IOMMU_READ;
1292 }
1293 
1294 /**
1295  * iommufd_access_pin_pages() - Return a list of pages under the iova
1296  * @access: IOAS access to act on
1297  * @iova: Starting IOVA
1298  * @length: Number of bytes to access
1299  * @out_pages: Output page list
1300  * @flags: IOPMMUFD_ACCESS_RW_* flags
1301  *
1302  * Reads @length bytes starting at iova and returns the struct page * pointers.
1303  * These can be kmap'd by the caller for CPU access.
1304  *
1305  * The caller must perform iommufd_access_unpin_pages() when done to balance
1306  * this.
1307  *
1308  * This API always requires a page aligned iova. This happens naturally if the
1309  * ioas alignment is >= PAGE_SIZE and the iova is PAGE_SIZE aligned. However
1310  * smaller alignments have corner cases where this API can fail on otherwise
1311  * aligned iova.
1312  */
1313 int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova,
1314 			     unsigned long length, struct page **out_pages,
1315 			     unsigned int flags)
1316 {
1317 	struct iopt_area_contig_iter iter;
1318 	struct io_pagetable *iopt;
1319 	unsigned long last_iova;
1320 	struct iopt_area *area;
1321 	int rc;
1322 
1323 	/* Driver's ops don't support pin_pages */
1324 	if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
1325 	    WARN_ON(access->iova_alignment != PAGE_SIZE || !access->ops->unmap))
1326 		return -EINVAL;
1327 
1328 	if (!length)
1329 		return -EINVAL;
1330 	if (check_add_overflow(iova, length - 1, &last_iova))
1331 		return -EOVERFLOW;
1332 
1333 	mutex_lock(&access->ioas_lock);
1334 	if (!access->ioas) {
1335 		mutex_unlock(&access->ioas_lock);
1336 		return -ENOENT;
1337 	}
1338 	iopt = &access->ioas->iopt;
1339 
1340 	down_read(&iopt->iova_rwsem);
1341 	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
1342 		unsigned long last = min(last_iova, iopt_area_last_iova(area));
1343 		unsigned long last_index = iopt_area_iova_to_index(area, last);
1344 		unsigned long index =
1345 			iopt_area_iova_to_index(area, iter.cur_iova);
1346 
1347 		if (area->prevent_access ||
1348 		    !iopt_area_contig_is_aligned(&iter)) {
1349 			rc = -EINVAL;
1350 			goto err_remove;
1351 		}
1352 
1353 		if (!check_area_prot(area, flags)) {
1354 			rc = -EPERM;
1355 			goto err_remove;
1356 		}
1357 
1358 		rc = iopt_area_add_access(area, index, last_index, out_pages,
1359 					  flags);
1360 		if (rc)
1361 			goto err_remove;
1362 		out_pages += last_index - index + 1;
1363 	}
1364 	if (!iopt_area_contig_done(&iter)) {
1365 		rc = -ENOENT;
1366 		goto err_remove;
1367 	}
1368 
1369 	up_read(&iopt->iova_rwsem);
1370 	mutex_unlock(&access->ioas_lock);
1371 	return 0;
1372 
1373 err_remove:
1374 	if (iova < iter.cur_iova) {
1375 		last_iova = iter.cur_iova - 1;
1376 		iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova)
1377 			iopt_area_remove_access(
1378 				area,
1379 				iopt_area_iova_to_index(area, iter.cur_iova),
1380 				iopt_area_iova_to_index(
1381 					area, min(last_iova,
1382 						  iopt_area_last_iova(area))));
1383 	}
1384 	up_read(&iopt->iova_rwsem);
1385 	mutex_unlock(&access->ioas_lock);
1386 	return rc;
1387 }
1388 EXPORT_SYMBOL_NS_GPL(iommufd_access_pin_pages, "IOMMUFD");
1389 
1390 /**
1391  * iommufd_access_rw - Read or write data under the iova
1392  * @access: IOAS access to act on
1393  * @iova: Starting IOVA
1394  * @data: Kernel buffer to copy to/from
1395  * @length: Number of bytes to access
1396  * @flags: IOMMUFD_ACCESS_RW_* flags
1397  *
1398  * Copy kernel to/from data into the range given by IOVA/length. If flags
1399  * indicates IOMMUFD_ACCESS_RW_KTHREAD then a large copy can be optimized
1400  * by changing it into copy_to/from_user().
1401  */
1402 int iommufd_access_rw(struct iommufd_access *access, unsigned long iova,
1403 		      void *data, size_t length, unsigned int flags)
1404 {
1405 	struct iopt_area_contig_iter iter;
1406 	struct io_pagetable *iopt;
1407 	struct iopt_area *area;
1408 	unsigned long last_iova;
1409 	int rc = -EINVAL;
1410 
1411 	if (!length)
1412 		return -EINVAL;
1413 	if (check_add_overflow(iova, length - 1, &last_iova))
1414 		return -EOVERFLOW;
1415 
1416 	mutex_lock(&access->ioas_lock);
1417 	if (!access->ioas) {
1418 		mutex_unlock(&access->ioas_lock);
1419 		return -ENOENT;
1420 	}
1421 	iopt = &access->ioas->iopt;
1422 
1423 	down_read(&iopt->iova_rwsem);
1424 	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
1425 		unsigned long last = min(last_iova, iopt_area_last_iova(area));
1426 		unsigned long bytes = (last - iter.cur_iova) + 1;
1427 
1428 		if (area->prevent_access) {
1429 			rc = -EINVAL;
1430 			goto err_out;
1431 		}
1432 
1433 		if (!check_area_prot(area, flags)) {
1434 			rc = -EPERM;
1435 			goto err_out;
1436 		}
1437 
1438 		rc = iopt_pages_rw_access(
1439 			area->pages, iopt_area_start_byte(area, iter.cur_iova),
1440 			data, bytes, flags);
1441 		if (rc)
1442 			goto err_out;
1443 		data += bytes;
1444 	}
1445 	if (!iopt_area_contig_done(&iter))
1446 		rc = -ENOENT;
1447 err_out:
1448 	up_read(&iopt->iova_rwsem);
1449 	mutex_unlock(&access->ioas_lock);
1450 	return rc;
1451 }
1452 EXPORT_SYMBOL_NS_GPL(iommufd_access_rw, "IOMMUFD");
1453 
1454 int iommufd_get_hw_info(struct iommufd_ucmd *ucmd)
1455 {
1456 	struct iommu_hw_info *cmd = ucmd->cmd;
1457 	void __user *user_ptr = u64_to_user_ptr(cmd->data_uptr);
1458 	const struct iommu_ops *ops;
1459 	struct iommufd_device *idev;
1460 	unsigned int data_len;
1461 	unsigned int copy_len;
1462 	void *data;
1463 	int rc;
1464 
1465 	if (cmd->flags || cmd->__reserved[0] || cmd->__reserved[1] ||
1466 	    cmd->__reserved[2])
1467 		return -EOPNOTSUPP;
1468 
1469 	idev = iommufd_get_device(ucmd, cmd->dev_id);
1470 	if (IS_ERR(idev))
1471 		return PTR_ERR(idev);
1472 
1473 	ops = dev_iommu_ops(idev->dev);
1474 	if (ops->hw_info) {
1475 		data = ops->hw_info(idev->dev, &data_len, &cmd->out_data_type);
1476 		if (IS_ERR(data)) {
1477 			rc = PTR_ERR(data);
1478 			goto out_put;
1479 		}
1480 
1481 		/*
1482 		 * drivers that have hw_info callback should have a unique
1483 		 * iommu_hw_info_type.
1484 		 */
1485 		if (WARN_ON_ONCE(cmd->out_data_type ==
1486 				 IOMMU_HW_INFO_TYPE_NONE)) {
1487 			rc = -ENODEV;
1488 			goto out_free;
1489 		}
1490 	} else {
1491 		cmd->out_data_type = IOMMU_HW_INFO_TYPE_NONE;
1492 		data_len = 0;
1493 		data = NULL;
1494 	}
1495 
1496 	copy_len = min(cmd->data_len, data_len);
1497 	if (copy_to_user(user_ptr, data, copy_len)) {
1498 		rc = -EFAULT;
1499 		goto out_free;
1500 	}
1501 
1502 	/*
1503 	 * Zero the trailing bytes if the user buffer is bigger than the
1504 	 * data size kernel actually has.
1505 	 */
1506 	if (copy_len < cmd->data_len) {
1507 		if (clear_user(user_ptr + copy_len, cmd->data_len - copy_len)) {
1508 			rc = -EFAULT;
1509 			goto out_free;
1510 		}
1511 	}
1512 
1513 	/*
1514 	 * We return the length the kernel supports so userspace may know what
1515 	 * the kernel capability is. It could be larger than the input buffer.
1516 	 */
1517 	cmd->data_len = data_len;
1518 
1519 	cmd->out_capabilities = 0;
1520 	if (device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING))
1521 		cmd->out_capabilities |= IOMMU_HW_CAP_DIRTY_TRACKING;
1522 
1523 	cmd->out_max_pasid_log2 = 0;
1524 	/*
1525 	 * Currently, all iommu drivers enable PASID in the probe_device()
1526 	 * op if iommu and device supports it. So the max_pasids stored in
1527 	 * dev->iommu indicates both PASID support and enable status. A
1528 	 * non-zero dev->iommu->max_pasids means PASID is supported and
1529 	 * enabled. The iommufd only reports PASID capability to userspace
1530 	 * if it's enabled.
1531 	 */
1532 	if (idev->dev->iommu->max_pasids) {
1533 		cmd->out_max_pasid_log2 = ilog2(idev->dev->iommu->max_pasids);
1534 
1535 		if (dev_is_pci(idev->dev)) {
1536 			struct pci_dev *pdev = to_pci_dev(idev->dev);
1537 			int ctrl;
1538 
1539 			ctrl = pci_pasid_status(pdev);
1540 
1541 			WARN_ON_ONCE(ctrl < 0 ||
1542 				     !(ctrl & PCI_PASID_CTRL_ENABLE));
1543 
1544 			if (ctrl & PCI_PASID_CTRL_EXEC)
1545 				cmd->out_capabilities |=
1546 						IOMMU_HW_CAP_PCI_PASID_EXEC;
1547 			if (ctrl & PCI_PASID_CTRL_PRIV)
1548 				cmd->out_capabilities |=
1549 						IOMMU_HW_CAP_PCI_PASID_PRIV;
1550 		}
1551 	}
1552 
1553 	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
1554 out_free:
1555 	kfree(data);
1556 out_put:
1557 	iommufd_put_object(ucmd->ictx, &idev->obj);
1558 	return rc;
1559 }
1560