xref: /linux/drivers/iommu/iommufd/vfio_compat.c (revision cdd30ebb1b9f36159d66f088b61aee264e649d7a)
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
 */
#include <linux/file.h>
#include <linux/interval_tree.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <linux/slab.h>
#include <linux/vfio.h>
#include <uapi/linux/vfio.h>
#include <uapi/linux/iommufd.h>

#include "iommufd_private.h"

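/*
 * Look up the compatibility IOAS and take a reference on its object, or return
 * ERR_PTR(-ENODEV) if no compat IOAS has been set. Callers release the
 * reference with iommufd_put_object().
 */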
static struct iommufd_ioas *get_compat_ioas(struct iommufd_ctx *ictx)
{
	struct iommufd_ioas *ioas = ERR_PTR(-ENODEV);

	xa_lock(&ictx->objects);
	if (!ictx->vfio_ioas || !iommufd_lock_obj(&ictx->vfio_ioas->obj))
		goto out_unlock;
	ioas = ictx->vfio_ioas;
out_unlock:
	xa_unlock(&ictx->objects);
	return ioas;
}

/**
 * iommufd_vfio_compat_ioas_get_id - Return the ID of the compat IOAS
 * @ictx: Context to operate on
 * @out_ioas_id: The IOAS ID of the compatibility IOAS
 *
 * Return the ID of the current compatibility IOAS. The ID can be passed into
 * other functions that take an ioas_id.
 */
int iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx *ictx, u32 *out_ioas_id)
{
	struct iommufd_ioas *ioas;

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);
	*out_ioas_id = ioas->obj.id;
	iommufd_put_object(ictx, &ioas->obj);
	return 0;
}
EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_get_id, "IOMMUFD_VFIO");
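
/*
 * Usage sketch (illustrative only, not part of the original file): a caller on
 * the VFIO compatibility path that needs the compat IOAS ID would do roughly:
 *
 *	u32 ioas_id;
 *	int rc;
 *
 *	rc = iommufd_vfio_compat_ioas_get_id(ictx, &ioas_id);
 *	if (rc)
 *		return rc;
 *	... ioas_id can now be used wherever an ioas_id is expected ...
 */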

/**
 * iommufd_vfio_compat_set_no_iommu - Called when a no-iommu device is attached
 * @ictx: Context to operate on
 *
 * This allows selecting VFIO_NOIOMMU_IOMMU and blocks the normal IOMMU types.
 */
int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx)
{
	int ret;

	xa_lock(&ictx->objects);
	if (!ictx->vfio_ioas) {
		ictx->no_iommu_mode = 1;
		ret = 0;
	} else {
		ret = -EINVAL;
	}
	xa_unlock(&ictx->objects);
	return ret;
}
EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_set_no_iommu, "IOMMUFD_VFIO");

/**
 * iommufd_vfio_compat_ioas_create - Ensure the compat IOAS is created
 * @ictx: Context to operate on
 *
 * The compatibility IOAS is the IOAS that the vfio compatibility ioctls operate
 * on since they do not have an IOAS ID input in their ABI. Only attaching a
 * group should cause a default creation of the internal ioas; this does nothing
 * if an existing ioas has already been assigned somehow.
 */
int iommufd_vfio_compat_ioas_create(struct iommufd_ctx *ictx)
{
	struct iommufd_ioas *ioas = NULL;
	int ret;

	ioas = iommufd_ioas_alloc(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	xa_lock(&ictx->objects);
	/*
	 * VFIO won't allow attaching a container to both iommu and no iommu
	 * operation
	 */
	if (ictx->no_iommu_mode) {
		ret = -EINVAL;
		goto out_abort;
	}

	if (ictx->vfio_ioas && iommufd_lock_obj(&ictx->vfio_ioas->obj)) {
		ret = 0;
		iommufd_put_object(ictx, &ictx->vfio_ioas->obj);
		goto out_abort;
	}
	ictx->vfio_ioas = ioas;
	xa_unlock(&ictx->objects);

	/*
	 * An automatically created compat IOAS is treated as a userspace
	 * created object. Userspace can learn the ID via IOMMU_VFIO_IOAS_GET,
	 * and if not manually destroyed it will be destroyed automatically
	 * at iommufd release.
	 */
	iommufd_object_finalize(ictx, &ioas->obj);
	return 0;

out_abort:
	xa_unlock(&ictx->objects);
	iommufd_object_abort(ictx, &ioas->obj);
	return ret;
}
EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_create, "IOMMUFD_VFIO");

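/*
 * Handler for the IOMMU_VFIO_IOAS ioctl: get the ID of the current compat
 * IOAS, set a specific IOAS as the compat IOAS, or clear the association.
 */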
int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd)
{
	struct iommu_vfio_ioas *cmd = ucmd->cmd;
	struct iommufd_ioas *ioas;

	if (cmd->__reserved)
		return -EOPNOTSUPP;
	switch (cmd->op) {
	case IOMMU_VFIO_IOAS_GET:
		ioas = get_compat_ioas(ucmd->ictx);
		if (IS_ERR(ioas))
			return PTR_ERR(ioas);
		cmd->ioas_id = ioas->obj.id;
		iommufd_put_object(ucmd->ictx, &ioas->obj);
		return iommufd_ucmd_respond(ucmd, sizeof(*cmd));

	case IOMMU_VFIO_IOAS_SET:
		ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
		if (IS_ERR(ioas))
			return PTR_ERR(ioas);
		xa_lock(&ucmd->ictx->objects);
		ucmd->ictx->vfio_ioas = ioas;
		xa_unlock(&ucmd->ictx->objects);
		iommufd_put_object(ucmd->ictx, &ioas->obj);
		return 0;

	case IOMMU_VFIO_IOAS_CLEAR:
		xa_lock(&ucmd->ictx->objects);
		ucmd->ictx->vfio_ioas = NULL;
		xa_unlock(&ucmd->ictx->objects);
		return 0;
	default:
		return -EOPNOTSUPP;
	}
}

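/*
 * Emulate VFIO_IOMMU_MAP_DMA on the compat IOAS. Only the READ/WRITE flags are
 * supported and the mapping always uses VFIO compatible rlimit accounting.
 */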
static int iommufd_vfio_map_dma(struct iommufd_ctx *ictx, unsigned int cmd,
				void __user *arg)
{
	u32 supported_flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	size_t minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
	struct vfio_iommu_type1_dma_map map;
	int iommu_prot = IOMMU_CACHE;
	struct iommufd_ioas *ioas;
	unsigned long iova;
	int rc;

	if (copy_from_user(&map, arg, minsz))
		return -EFAULT;

	if (map.argsz < minsz || map.flags & ~supported_flags)
		return -EINVAL;

	if (map.flags & VFIO_DMA_MAP_FLAG_READ)
		iommu_prot |= IOMMU_READ;
	if (map.flags & VFIO_DMA_MAP_FLAG_WRITE)
		iommu_prot |= IOMMU_WRITE;

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	/*
	 * Maps created through the legacy interface always use VFIO compatible
	 * rlimit accounting. If the user wishes to use the faster user based
	 * rlimit accounting then they must use the new interface.
	 */
	iova = map.iova;
	rc = iopt_map_user_pages(ictx, &ioas->iopt, &iova, u64_to_user_ptr(map.vaddr),
				 map.size, iommu_prot, 0);
	iommufd_put_object(ictx, &ioas->obj);
	return rc;
}

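/*
 * Emulate VFIO_IOMMU_UNMAP_DMA on the compat IOAS, supporting both a ranged
 * unmap and VFIO_DMA_UNMAP_FLAG_ALL. The number of bytes unmapped is written
 * back to userspace in the size field.
 */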
static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd,
				  void __user *arg)
{
	size_t minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
	/*
	 * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is obsoleted by the new
	 * dirty tracking direction:
	 *  https://lore.kernel.org/kvm/20220731125503.142683-1-yishaih@nvidia.com/
	 *  https://lore.kernel.org/kvm/20220428210933.3583-1-joao.m.martins@oracle.com/
	 */
	u32 supported_flags = VFIO_DMA_UNMAP_FLAG_ALL;
	struct vfio_iommu_type1_dma_unmap unmap;
	unsigned long unmapped = 0;
	struct iommufd_ioas *ioas;
	int rc;

	if (copy_from_user(&unmap, arg, minsz))
		return -EFAULT;

	if (unmap.argsz < minsz || unmap.flags & ~supported_flags)
		return -EINVAL;

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	if (unmap.flags & VFIO_DMA_UNMAP_FLAG_ALL) {
		if (unmap.iova != 0 || unmap.size != 0) {
			rc = -EINVAL;
			goto err_put;
		}
		rc = iopt_unmap_all(&ioas->iopt, &unmapped);
	} else {
		if (READ_ONCE(ioas->iopt.disable_large_pages)) {
			/*
			 * Create cuts at the start and last of the requested
			 * range. If the start IOVA is 0 then it doesn't need to
			 * be cut.
			 */
			unsigned long iovas[] = { unmap.iova + unmap.size - 1,
						  unmap.iova - 1 };

			rc = iopt_cut_iova(&ioas->iopt, iovas,
					   unmap.iova ? 2 : 1);
			if (rc)
				goto err_put;
		}
		rc = iopt_unmap_iova(&ioas->iopt, unmap.iova, unmap.size,
				     &unmapped);
	}
	unmap.size = unmapped;
	if (copy_to_user(arg, &unmap, minsz))
		rc = -EFAULT;

err_put:
	iommufd_put_object(ictx, &ioas->obj);
	return rc;
}

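/*
 * Backs VFIO_DMA_CC_IOMMU: returns 1 only if every hw_pagetable attached to
 * the compat IOAS enforces cache coherency, 0 if any does not, or a negative
 * errno if there is no compat IOAS.
 */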
static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx)
{
	struct iommufd_hwpt_paging *hwpt_paging;
	struct iommufd_ioas *ioas;
	int rc = 1;

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	mutex_lock(&ioas->mutex);
	list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) {
		if (!hwpt_paging->enforce_cache_coherency) {
			rc = 0;
			break;
		}
	}
	mutex_unlock(&ioas->mutex);

	iommufd_put_object(ictx, &ioas->obj);
	return rc;
}

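/*
 * Emulate VFIO_CHECK_EXTENSION for the compat container; any extension that
 * iommufd does not emulate reports 0 (unsupported).
 */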
static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx,
					unsigned long type)
{
	switch (type) {
	case VFIO_TYPE1_IOMMU:
	case VFIO_TYPE1v2_IOMMU:
	case VFIO_UNMAP_ALL:
		return 1;

	case VFIO_NOIOMMU_IOMMU:
		return IS_ENABLED(CONFIG_VFIO_NOIOMMU);

	case VFIO_DMA_CC_IOMMU:
		return iommufd_vfio_cc_iommu(ictx);

	case __VFIO_RESERVED_TYPE1_NESTING_IOMMU:
		return 0;

	/*
	 * VFIO_DMA_MAP_FLAG_VADDR
	 * https://lore.kernel.org/kvm/1611939252-7240-1-git-send-email-steven.sistare@oracle.com/
	 * https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/
	 *
	 * It is hard to see how this could be implemented safely.
	 */
	case VFIO_UPDATE_VADDR:
	default:
		return 0;
	}
}

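/*
 * Emulate VFIO_SET_IOMMU. Only VFIO_TYPE1_IOMMU, VFIO_TYPE1v2_IOMMU and, when
 * the context is in no-iommu mode, VFIO_NOIOMMU_IOMMU are accepted.
 */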
static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type)
{
	bool no_iommu_mode = READ_ONCE(ictx->no_iommu_mode);
	struct iommufd_ioas *ioas = NULL;
	int rc = 0;

	/*
	 * Emulation for NOIOMMU is imperfect in that VFIO blocks almost all
	 * other ioctls. We let them keep working but they mostly fail since no
	 * IOAS should exist.
	 */
	if (IS_ENABLED(CONFIG_VFIO_NOIOMMU) && type == VFIO_NOIOMMU_IOMMU &&
	    no_iommu_mode) {
		if (!capable(CAP_SYS_RAWIO))
			return -EPERM;
		return 0;
	}

	if ((type != VFIO_TYPE1_IOMMU && type != VFIO_TYPE1v2_IOMMU) ||
	    no_iommu_mode)
		return -EINVAL;

	/* VFIO fails the set_iommu if there is no group */
	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	/*
	 * The difference between TYPE1 and TYPE1v2 is the ability to unmap in
	 * the middle of mapped ranges. This is complicated by huge page support
	 * which creates single large IOPTEs that cannot be split by the iommu
	 * driver. TYPE1 is very old at this point and likely nothing uses it,
	 * however it is simple enough to emulate by simply disabling the
	 * problematic large IOPTEs. Then we can safely unmap within any range.
	 */
	if (type == VFIO_TYPE1_IOMMU)
		rc = iopt_disable_large_pages(&ioas->iopt);
	iommufd_put_object(ictx, &ioas->obj);
	return rc;
}

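/*
 * Compute the page size bitmap the same way VFIO type1 does: intersect the
 * pgsize_bitmap of every attached domain, replace any sub-PAGE_SIZE support
 * with plain PAGE_SIZE, and take the maximum with the io_pagetable's
 * iova_alignment.
 */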
static unsigned long iommufd_get_pagesizes(struct iommufd_ioas *ioas)
{
	struct io_pagetable *iopt = &ioas->iopt;
	unsigned long pgsize_bitmap = ULONG_MAX;
	struct iommu_domain *domain;
	unsigned long index;

	down_read(&iopt->domains_rwsem);
	xa_for_each(&iopt->domains, index, domain)
		pgsize_bitmap &= domain->pgsize_bitmap;

	/* See vfio_update_pgsize_bitmap() */
	if (pgsize_bitmap & ~PAGE_MASK) {
		pgsize_bitmap &= PAGE_MASK;
		pgsize_bitmap |= PAGE_SIZE;
	}
	pgsize_bitmap = max(pgsize_bitmap, ioas->iopt.iova_alignment);
	up_read(&iopt->domains_rwsem);
	return pgsize_bitmap;
}

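/*
 * Emit VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE describing every usable IOVA hole
 * in the reserved interval tree. Data is only copied when it fits in @avail;
 * the full capability size is always returned so the caller can size argsz.
 */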
static int iommufd_fill_cap_iova(struct iommufd_ioas *ioas,
				 struct vfio_info_cap_header __user *cur,
				 size_t avail)
{
	struct vfio_iommu_type1_info_cap_iova_range __user *ucap_iovas =
		container_of(cur,
			     struct vfio_iommu_type1_info_cap_iova_range __user,
			     header);
	struct vfio_iommu_type1_info_cap_iova_range cap_iovas = {
		.header = {
			.id = VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE,
			.version = 1,
		},
	};
	struct interval_tree_span_iter span;

	interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0,
				    ULONG_MAX) {
		struct vfio_iova_range range;

		if (!span.is_hole)
			continue;
		range.start = span.start_hole;
		range.end = span.last_hole;
		if (avail >= struct_size(&cap_iovas, iova_ranges,
					 cap_iovas.nr_iovas + 1) &&
		    copy_to_user(&ucap_iovas->iova_ranges[cap_iovas.nr_iovas],
				 &range, sizeof(range)))
			return -EFAULT;
		cap_iovas.nr_iovas++;
	}
	if (avail >= struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas) &&
	    copy_to_user(ucap_iovas, &cap_iovas, sizeof(cap_iovas)))
		return -EFAULT;
	return struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas);
}

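/*
 * Emit VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL. Like the other cap fillers it only
 * copies to userspace when @avail is large enough and always returns the
 * capability size.
 */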
static int iommufd_fill_cap_dma_avail(struct iommufd_ioas *ioas,
				      struct vfio_info_cap_header __user *cur,
				      size_t avail)
{
	struct vfio_iommu_type1_info_dma_avail cap_dma = {
		.header = {
			.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL,
			.version = 1,
		},
		/*
		 * iommufd's limit is based on the cgroup's memory limit.
		 * Normally vfio would return U16_MAX here, and provide a module
		 * parameter to adjust it. Since S390 qemu userspace actually
		 * pays attention and needs a value bigger than U16_MAX return
		 * U32_MAX.
		 */
		.avail = U32_MAX,
	};

	if (avail >= sizeof(cap_dma) &&
	    copy_to_user(cur, &cap_dma, sizeof(cap_dma)))
		return -EFAULT;
	return sizeof(cap_dma);
}

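/*
 * Emulate VFIO_IOMMU_GET_INFO: fill the fixed header, then chain each
 * capability after it via header->next. If the user buffer is too small only
 * the caps that fit are written and argsz is updated to the full size needed.
 */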
static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx,
				       void __user *arg)
{
	typedef int (*fill_cap_fn)(struct iommufd_ioas *ioas,
				   struct vfio_info_cap_header __user *cur,
				   size_t avail);
	static const fill_cap_fn fill_fns[] = {
		iommufd_fill_cap_dma_avail,
		iommufd_fill_cap_iova,
	};
	size_t minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
	struct vfio_info_cap_header __user *last_cap = NULL;
	struct vfio_iommu_type1_info info = {};
	struct iommufd_ioas *ioas;
	size_t total_cap_size;
	int rc;
	int i;

	if (copy_from_user(&info, arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;
	minsz = min_t(size_t, info.argsz, sizeof(info));

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	info.flags = VFIO_IOMMU_INFO_PGSIZES;
	info.iova_pgsizes = iommufd_get_pagesizes(ioas);
	info.cap_offset = 0;

	down_read(&ioas->iopt.iova_rwsem);
	total_cap_size = sizeof(info);
	for (i = 0; i != ARRAY_SIZE(fill_fns); i++) {
		int cap_size;

		if (info.argsz > total_cap_size)
			cap_size = fill_fns[i](ioas, arg + total_cap_size,
					       info.argsz - total_cap_size);
		else
			cap_size = fill_fns[i](ioas, NULL, 0);
		if (cap_size < 0) {
			rc = cap_size;
			goto out_put;
		}
		cap_size = ALIGN(cap_size, sizeof(u64));

		if (last_cap && info.argsz >= total_cap_size &&
		    put_user(total_cap_size, &last_cap->next)) {
			rc = -EFAULT;
			goto out_put;
		}
		last_cap = arg + total_cap_size;
		total_cap_size += cap_size;
	}

	/*
	 * If the user did not provide enough space then only some caps are
	 * returned and the argsz will be updated to the correct amount to get
	 * all caps.
	 */
	if (info.argsz >= total_cap_size)
		info.cap_offset = sizeof(info);
	info.argsz = total_cap_size;
	info.flags |= VFIO_IOMMU_INFO_CAPS;
	if (copy_to_user(arg, &info, minsz)) {
		rc = -EFAULT;
		goto out_put;
	}
	rc = 0;

out_put:
	up_read(&ioas->iopt.iova_rwsem);
	iommufd_put_object(ictx, &ioas->obj);
	return rc;
}

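/*
 * Entry point for the emulated VFIO container ioctls; anything not handled
 * here returns -ENOIOCTLCMD.
 */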
int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd,
		       unsigned long arg)
{
	void __user *uarg = (void __user *)arg;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		return VFIO_API_VERSION;
	case VFIO_SET_IOMMU:
		return iommufd_vfio_set_iommu(ictx, arg);
	case VFIO_CHECK_EXTENSION:
		return iommufd_vfio_check_extension(ictx, arg);
	case VFIO_IOMMU_GET_INFO:
		return iommufd_vfio_iommu_get_info(ictx, uarg);
	case VFIO_IOMMU_MAP_DMA:
		return iommufd_vfio_map_dma(ictx, cmd, uarg);
	case VFIO_IOMMU_UNMAP_DMA:
		return iommufd_vfio_unmap_dma(ictx, cmd, uarg);
	case VFIO_IOMMU_DIRTY_PAGES:
	default:
		return -ENOIOCTLCMD;
	}
	return -ENOIOCTLCMD;
}