// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
 */
#include <linux/dma-buf-mapping.h>
#include <linux/pci-p2pdma.h>
#include <linux/dma-resv.h>

#include "vfio_pci_priv.h"

MODULE_IMPORT_NS("DMA_BUF");

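/*
 * Exporter private data for a dmabuf created over a VFIO PCI BAR. It keeps a
 * back reference to the exporting device, the linkage on vdev->dmabufs, the
 * physical MMIO ranges backing the buffer, the p2pdma provider used to map
 * them, and whether access has been revoked (for instance because PCI memory
 * decode was disabled).
 */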
struct vfio_pci_dma_buf {
	struct dma_buf *dmabuf;
	struct vfio_pci_core_device *vdev;
	struct list_head dmabufs_elm;
	size_t size;
	struct dma_buf_phys_vec *phys_vec;
	struct p2pdma_provider *provider;
	u32 nr_ranges;
	u8 revoked : 1;
};

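/*
 * Only importers capable of peer to peer DMA may attach, since the buffer is
 * backed by PCI BAR MMIO rather than system memory. Attaching is also refused
 * once access to the BAR has been revoked.
 */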
static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
				   struct dma_buf_attachment *attachment)
{
	struct vfio_pci_dma_buf *priv = dmabuf->priv;

	if (!attachment->peer2peer)
		return -EOPNOTSUPP;

	if (priv->revoked)
		return -ENODEV;

	return 0;
}

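/*
 * Produce an sg_table describing the exported MMIO ranges for this
 * attachment. The caller holds the dmabuf reservation lock; mapping fails
 * while the buffer is revoked.
 */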
static struct sg_table *
vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
		     enum dma_data_direction dir)
{
	struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;

	dma_resv_assert_held(priv->dmabuf->resv);

	if (priv->revoked)
		return ERR_PTR(-ENODEV);

	return dma_buf_phys_vec_to_sgt(attachment, priv->provider,
				       priv->phys_vec, priv->nr_ranges,
				       priv->size, dir);
}

static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
				   struct sg_table *sgt,
				   enum dma_data_direction dir)
{
	dma_buf_free_sgt(attachment, sgt, dir);
}

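/*
 * Final teardown of the exporter private data once the dmabuf file is gone.
 */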
static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
{
	struct vfio_pci_dma_buf *priv = dmabuf->priv;

	/*
	 * Either this or vfio_pci_dma_buf_cleanup() will remove from the list.
	 * The refcount prevents both.
	 */
	if (priv->vdev) {
		down_write(&priv->vdev->memory_lock);
		list_del_init(&priv->dmabufs_elm);
		up_write(&priv->vdev->memory_lock);
		vfio_device_put_registration(&priv->vdev->vdev);
	}
	kfree(priv->phys_vec);
	kfree(priv);
}

static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
	.attach = vfio_pci_dma_buf_attach,
	.map_dma_buf = vfio_pci_dma_buf_map,
	.unmap_dma_buf = vfio_pci_dma_buf_unmap,
	.release = vfio_pci_dma_buf_release,
};

/*
 * This is a temporary "private interconnect" between VFIO DMABUF and iommufd.
 * It allows the two co-operating drivers to exchange the physical address of
 * the BAR. This is to be replaced with a formal DMABUF system for negotiated
 * interconnect types.
 *
 * If this function succeeds the following are true:
 *  - There is one physical range and it is pointing to MMIO
 *  - When move_notify is called it means revoke, not move;
 *    vfio_pci_dma_buf_map() will fail while the buffer is revoked
 */
int vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
				 struct dma_buf_phys_vec *phys)
{
	struct vfio_pci_dma_buf *priv;

	dma_resv_assert_held(attachment->dmabuf->resv);

	if (attachment->dmabuf->ops != &vfio_pci_dmabuf_ops)
		return -EOPNOTSUPP;

	priv = attachment->dmabuf->priv;
	if (priv->revoked)
		return -ENODEV;

	/* More than one range to iommufd will require proper DMABUF support */
	if (priv->nr_ranges != 1)
		return -EOPNOTSUPP;

	*phys = priv->phys_vec[0];
	return 0;
}
EXPORT_SYMBOL_FOR_MODULES(vfio_pci_dma_buf_iommufd_map, "iommufd");

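/*
 * Illustrative only: a minimal sketch of how an importer such as iommufd
 * might consume the private interconnect above. It assumes the importer
 * already holds a peer2peer capable dma_buf_attachment in @attach; the
 * map_bar_into_iommu() helper is hypothetical and stands in for whatever the
 * importer does with the returned physical range.
 *
 *	struct dma_buf_phys_vec phys;
 *	int rc;
 *
 *	dma_resv_lock(attach->dmabuf->resv, NULL);
 *	rc = vfio_pci_dma_buf_iommufd_map(attach, &phys);
 *	dma_resv_unlock(attach->dmabuf->resv);
 *	if (!rc)
 *		rc = map_bar_into_iommu(phys.paddr, phys.len);
 */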
int vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec,
				struct vfio_region_dma_range *dma_ranges,
				size_t nr_ranges, phys_addr_t start,
				phys_addr_t len)
{
	phys_addr_t max_addr;
	unsigned int i;

	max_addr = start + len;
	for (i = 0; i < nr_ranges; i++) {
		phys_addr_t end;

		if (!dma_ranges[i].length)
			return -EINVAL;

		if (check_add_overflow(start, dma_ranges[i].offset,
				       &phys_vec[i].paddr) ||
		    check_add_overflow(phys_vec[i].paddr,
				       dma_ranges[i].length, &end))
			return -EOVERFLOW;
		if (end > max_addr)
			return -EINVAL;

		phys_vec[i].len = dma_ranges[i].length;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(vfio_pci_core_fill_phys_vec);

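/*
 * Resolve dmabuf ranges for a plain PCI BAR: @region_index selects the BAR,
 * the p2pdma provider is obtained from the PCI core, and every user range is
 * translated into a physical address inside that BAR's resource.
 */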
int vfio_pci_core_get_dmabuf_phys(struct vfio_pci_core_device *vdev,
				  struct p2pdma_provider **provider,
				  unsigned int region_index,
				  struct dma_buf_phys_vec *phys_vec,
				  struct vfio_region_dma_range *dma_ranges,
				  size_t nr_ranges)
{
	struct pci_dev *pdev = vdev->pdev;

	*provider = pcim_p2pdma_provider(pdev, region_index);
	if (!*provider)
		return -EINVAL;

	return vfio_pci_core_fill_phys_vec(
		phys_vec, dma_ranges, nr_ranges,
		pci_resource_start(pdev, region_index),
		pci_resource_len(pdev, region_index));
}
EXPORT_SYMBOL_GPL(vfio_pci_core_get_dmabuf_phys);

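/*
 * Check the user supplied ranges: each must have a page aligned offset and a
 * non-zero, page aligned length, and the total size must not overflow. Sizes
 * that collide with the DMA_IOVA_USE_SWIOTLB marker bit are rejected up front
 * so a later dma_iova_try_alloc() cannot be made to WARN.
 */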
static int validate_dmabuf_input(struct vfio_device_feature_dma_buf *dma_buf,
				 struct vfio_region_dma_range *dma_ranges,
				 size_t *lengthp)
{
	size_t length = 0;
	u32 i;

	for (i = 0; i < dma_buf->nr_ranges; i++) {
		u64 offset = dma_ranges[i].offset;
		u64 len = dma_ranges[i].length;

		if (!len || !PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
			return -EINVAL;

		if (check_add_overflow(length, len, &length))
			return -EINVAL;
	}

	/*
	 * dma_iova_try_alloc() will WARN if userspace proposes a size that
	 * is too big, e.g. with lots of ranges.
	 */
	if ((u64)(length) & DMA_IOVA_USE_SWIOTLB)
		return -EINVAL;

	*lengthp = length;
	return 0;
}

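/*
 * Handle VFIO_DEVICE_FEATURE_GET for the dmabuf feature: validate the user
 * supplied ranges, translate them into BAR physical addresses through the
 * driver's get_dmabuf_phys op, export a dmabuf covering them and return its
 * file descriptor. The new dmabuf starts out revoked if PCI memory decode is
 * currently disabled.
 */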
int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
				  struct vfio_device_feature_dma_buf __user *arg,
				  size_t argsz)
{
	struct vfio_device_feature_dma_buf get_dma_buf = {};
	struct vfio_region_dma_range *dma_ranges;
	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
	struct vfio_pci_dma_buf *priv;
	size_t length;
	int ret;

	if (!vdev->pci_ops || !vdev->pci_ops->get_dmabuf_phys)
		return -EOPNOTSUPP;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(get_dma_buf));
	if (ret != 1)
		return ret;

	if (copy_from_user(&get_dma_buf, arg, sizeof(get_dma_buf)))
		return -EFAULT;

	if (!get_dma_buf.nr_ranges || get_dma_buf.flags)
		return -EINVAL;

	/*
	 * For PCI the region_index is the BAR number, the same as for every
	 * other region.
	 */
	if (get_dma_buf.region_index >= VFIO_PCI_ROM_REGION_INDEX)
		return -ENODEV;

	dma_ranges = memdup_array_user(&arg->dma_ranges, get_dma_buf.nr_ranges,
				       sizeof(*dma_ranges));
	if (IS_ERR(dma_ranges))
		return PTR_ERR(dma_ranges);

	ret = validate_dmabuf_input(&get_dma_buf, dma_ranges, &length);
	if (ret)
		goto err_free_ranges;

	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
	if (!priv) {
		ret = -ENOMEM;
		goto err_free_ranges;
	}
	priv->phys_vec = kcalloc(get_dma_buf.nr_ranges, sizeof(*priv->phys_vec),
				 GFP_KERNEL);
	if (!priv->phys_vec) {
		ret = -ENOMEM;
		goto err_free_priv;
	}

	priv->vdev = vdev;
	priv->nr_ranges = get_dma_buf.nr_ranges;
	priv->size = length;
	ret = vdev->pci_ops->get_dmabuf_phys(vdev, &priv->provider,
					     get_dma_buf.region_index,
					     priv->phys_vec, dma_ranges,
					     priv->nr_ranges);
	if (ret)
		goto err_free_phys;

	kfree(dma_ranges);
	dma_ranges = NULL;

	if (!vfio_device_try_get_registration(&vdev->vdev)) {
		ret = -ENODEV;
		goto err_free_phys;
	}

	exp_info.ops = &vfio_pci_dmabuf_ops;
	exp_info.size = priv->size;
	exp_info.flags = get_dma_buf.open_flags;
	exp_info.priv = priv;

	priv->dmabuf = dma_buf_export(&exp_info);
	if (IS_ERR(priv->dmabuf)) {
		ret = PTR_ERR(priv->dmabuf);
		goto err_dev_put;
	}

	/* dma_buf_put() now frees priv */
	INIT_LIST_HEAD(&priv->dmabufs_elm);
	down_write(&vdev->memory_lock);
	dma_resv_lock(priv->dmabuf->resv, NULL);
	priv->revoked = !__vfio_pci_memory_enabled(vdev);
	list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs);
	dma_resv_unlock(priv->dmabuf->resv);
	up_write(&vdev->memory_lock);

	/*
	 * dma_buf_fd() consumes the reference; when the file closes, the
	 * dmabuf will be released.
	 */
	ret = dma_buf_fd(priv->dmabuf, get_dma_buf.open_flags);
	if (ret < 0)
		goto err_dma_buf;
	return ret;

err_dma_buf:
	dma_buf_put(priv->dmabuf);
err_dev_put:
	vfio_device_put_registration(&vdev->vdev);
err_free_phys:
	kfree(priv->phys_vec);
err_free_priv:
	kfree(priv);
err_free_ranges:
	kfree(dma_ranges);
	return ret;
}

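/*
 * Illustrative only: a userspace sketch of requesting a dmabuf over the first
 * 64KiB of BAR 0, assuming the uAPI exposes the feature as
 * VFIO_DEVICE_FEATURE_DMA_BUF and lays out struct vfio_device_feature_dma_buf
 * followed by its vfio_region_dma_range array as shown (see
 * include/uapi/linux/vfio.h for the authoritative definitions). Error
 * handling is omitted.
 *
 *	struct {
 *		struct vfio_device_feature feat;
 *		struct vfio_device_feature_dma_buf get;
 *		struct vfio_region_dma_range range;
 *	} args = {
 *		.feat = {
 *			.argsz = sizeof(args),
 *			.flags = VFIO_DEVICE_FEATURE_GET |
 *				 VFIO_DEVICE_FEATURE_DMA_BUF,
 *		},
 *		.get = {
 *			.region_index = 0,
 *			.open_flags = O_CLOEXEC,
 *			.nr_ranges = 1,
 *		},
 *		.range = { .offset = 0, .length = 0x10000 },
 *	};
 *	int dmabuf_fd = ioctl(device_fd, VFIO_DEVICE_FEATURE, &args);
 */

/*
 * vfio_pci_dma_buf_move - revoke or restore access to a device's dmabufs
 * @vdev: device whose exported dmabufs are updated
 * @revoked: true to revoke access (eg memory decode disabled), false to
 *           restore it
 *
 * The caller holds vdev->memory_lock for write. Importers learn about the
 * change through dma_buf_move_notify() under the reservation lock.
 */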
void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
{
	struct vfio_pci_dma_buf *priv;
	struct vfio_pci_dma_buf *tmp;

	lockdep_assert_held_write(&vdev->memory_lock);

	list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
		if (!get_file_active(&priv->dmabuf->file))
			continue;

		if (priv->revoked != revoked) {
			dma_resv_lock(priv->dmabuf->resv, NULL);
			priv->revoked = revoked;
			dma_buf_move_notify(priv->dmabuf);
			dma_resv_unlock(priv->dmabuf->resv);
		}
		fput(priv->dmabuf->file);
	}
}

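/*
 * vfio_pci_dma_buf_cleanup - break the link between a device and its dmabufs
 * @vdev: device being cleaned up
 *
 * Revokes every exported dmabuf, severs its link back to @vdev and drops the
 * registration reference taken at export time. The dmabuf itself stays alive
 * until its file descriptor is closed, but further mappings will fail.
 */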
void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
{
	struct vfio_pci_dma_buf *priv;
	struct vfio_pci_dma_buf *tmp;

	down_write(&vdev->memory_lock);
	list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
		if (!get_file_active(&priv->dmabuf->file))
			continue;

		dma_resv_lock(priv->dmabuf->resv, NULL);
		list_del_init(&priv->dmabufs_elm);
		priv->vdev = NULL;
		priv->revoked = true;
		dma_buf_move_notify(priv->dmabuf);
		dma_resv_unlock(priv->dmabuf->resv);
		vfio_device_put_registration(&vdev->vdev);
		fput(priv->dmabuf->file);
	}
	up_write(&vdev->memory_lock);
}
351