// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
 */
#include <linux/dma-buf-mapping.h>
#include <linux/pci-p2pdma.h>
#include <linux/dma-resv.h>

#include "vfio_pci_priv.h"

MODULE_IMPORT_NS("DMA_BUF");

struct vfio_pci_dma_buf {
	struct dma_buf *dmabuf;
	struct vfio_pci_core_device *vdev;
	struct list_head dmabufs_elm;
	size_t size;
	struct dma_buf_phys_vec *phys_vec;
	struct p2pdma_provider *provider;
	u32 nr_ranges;
	u8 revoked : 1;
};

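/*
 * DMA-BUF attach callback. The exported BAR MMIO is only reachable by peer
 * devices over PCI P2P, so importers that cannot do peer2peer are rejected,
 * as are new attachments while access to the BAR is revoked.
 */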
static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf,
				   struct dma_buf_attachment *attachment)
{
	struct vfio_pci_dma_buf *priv = dmabuf->priv;

	if (!attachment->peer2peer)
		return -EOPNOTSUPP;

	if (priv->revoked)
		return -ENODEV;

	return 0;
}

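/*
 * DMA-BUF map callback, called with the dmabuf reservation lock held. Builds
 * an sg_table covering the exported MMIO ranges unless the BAR is currently
 * revoked.
 */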
static struct sg_table *
vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment,
		     enum dma_data_direction dir)
{
	struct vfio_pci_dma_buf *priv = attachment->dmabuf->priv;

	dma_resv_assert_held(priv->dmabuf->resv);

	if (priv->revoked)
		return ERR_PTR(-ENODEV);

	return dma_buf_phys_vec_to_sgt(attachment, priv->provider,
				       priv->phys_vec, priv->nr_ranges,
				       priv->size, dir);
}

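/* DMA-BUF unmap callback: release the sg_table built by vfio_pci_dma_buf_map() */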
static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment,
				   struct sg_table *sgt,
				   enum dma_data_direction dir)
{
	dma_buf_free_sgt(attachment, sgt, dir);
}

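/*
 * DMA-BUF release callback, invoked once the last reference to the dmabuf
 * file is gone.
 */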
static void vfio_pci_dma_buf_release(struct dma_buf *dmabuf)
{
	struct vfio_pci_dma_buf *priv = dmabuf->priv;

	/*
	 * Either this or vfio_pci_dma_buf_cleanup() will remove from the list.
	 * The refcount prevents both.
	 */
	if (priv->vdev) {
		down_write(&priv->vdev->memory_lock);
		list_del_init(&priv->dmabufs_elm);
		up_write(&priv->vdev->memory_lock);
		vfio_device_put_registration(&priv->vdev->vdev);
	}
	kfree(priv->phys_vec);
	kfree(priv);
}

static const struct dma_buf_ops vfio_pci_dmabuf_ops = {
	.attach = vfio_pci_dma_buf_attach,
	.map_dma_buf = vfio_pci_dma_buf_map,
	.unmap_dma_buf = vfio_pci_dma_buf_unmap,
	.release = vfio_pci_dma_buf_release,
};

/*
 * This is a temporary "private interconnect" between VFIO DMABUF and iommufd.
 * It allows the two co-operating drivers to exchange the physical address of
 * the BAR. This is to be replaced with a formal DMABUF system for negotiated
 * interconnect types.
 *
 * If this function succeeds the following are true:
 * - There is one physical range and it is pointing to MMIO
 * - When move_notify is called it means revoke, not move; vfio_pci_dma_buf_map()
 *   will fail if it is currently revoked
 */
int vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment,
				 struct dma_buf_phys_vec *phys)
{
	struct vfio_pci_dma_buf *priv;

	dma_resv_assert_held(attachment->dmabuf->resv);

	if (attachment->dmabuf->ops != &vfio_pci_dmabuf_ops)
		return -EOPNOTSUPP;

	priv = attachment->dmabuf->priv;
	if (priv->revoked)
		return -ENODEV;

	/* More than one range to iommufd will require proper DMABUF support */
	if (priv->nr_ranges != 1)
		return -EOPNOTSUPP;

	*phys = priv->phys_vec[0];
	return 0;
}
EXPORT_SYMBOL_FOR_MODULES(vfio_pci_dma_buf_iommufd_map, "iommufd");

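/*
 * Convert the user supplied (offset, length) ranges into absolute physical
 * addresses inside the region that starts at @start and spans @len bytes.
 * Empty ranges, overflowing ranges and ranges beyond the region are rejected.
 */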
int vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec,
				struct vfio_region_dma_range *dma_ranges,
				size_t nr_ranges, phys_addr_t start,
				phys_addr_t len)
{
	phys_addr_t max_addr;
	unsigned int i;

	max_addr = start + len;
	for (i = 0; i < nr_ranges; i++) {
		phys_addr_t end;

		if (!dma_ranges[i].length)
			return -EINVAL;

		if (check_add_overflow(start, dma_ranges[i].offset,
				       &phys_vec[i].paddr) ||
		    check_add_overflow(phys_vec[i].paddr,
				       dma_ranges[i].length, &end))
			return -EOVERFLOW;
		if (end > max_addr)
			return -EINVAL;

		phys_vec[i].len = dma_ranges[i].length;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(vfio_pci_core_fill_phys_vec);

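/*
 * Implementation of the get_dmabuf_phys op for plain PCI BARs: look up the
 * P2PDMA provider for the BAR and fill the physical vector from its resource.
 */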
int vfio_pci_core_get_dmabuf_phys(struct vfio_pci_core_device *vdev,
				  struct p2pdma_provider **provider,
				  unsigned int region_index,
				  struct dma_buf_phys_vec *phys_vec,
				  struct vfio_region_dma_range *dma_ranges,
				  size_t nr_ranges)
{
	struct pci_dev *pdev = vdev->pdev;

	*provider = pcim_p2pdma_provider(pdev, region_index);
	if (!*provider)
		return -EINVAL;

	return vfio_pci_core_fill_phys_vec(
		phys_vec, dma_ranges, nr_ranges,
		pci_resource_start(pdev, region_index),
		pci_resource_len(pdev, region_index));
}
EXPORT_SYMBOL_GPL(vfio_pci_core_get_dmabuf_phys);

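/*
 * Validate the user supplied ranges: each must be non-empty and page aligned,
 * and the summed length must neither overflow nor reach the DMA IOVA flag
 * bits. The total is returned in @lengthp.
 */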
static int validate_dmabuf_input(struct vfio_device_feature_dma_buf *dma_buf,
				 struct vfio_region_dma_range *dma_ranges,
				 size_t *lengthp)
{
	size_t length = 0;
	u32 i;

	for (i = 0; i < dma_buf->nr_ranges; i++) {
		u64 offset = dma_ranges[i].offset;
		u64 len = dma_ranges[i].length;

		if (!len || !PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
			return -EINVAL;

		if (check_add_overflow(length, len, &length))
			return -EINVAL;
	}

	/*
	 * dma_iova_try_alloc() will WARN if userspace proposes a size that is
	 * too big, e.g. with lots of ranges.
	 */
	if ((u64)(length) & DMA_IOVA_USE_SWIOTLB)
		return -EINVAL;

	*lengthp = length;
	return 0;
}

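/*
 * Handler for VFIO_DEVICE_FEATURE_DMA_BUF: export the requested ranges of a
 * PCI BAR as a dma-buf and return the new file descriptor to userspace.
 */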
int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags,
				  struct vfio_device_feature_dma_buf __user *arg,
				  size_t argsz)
{
	struct vfio_device_feature_dma_buf get_dma_buf = {};
	struct vfio_region_dma_range *dma_ranges;
	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
	struct vfio_pci_dma_buf *priv;
	size_t length;
	int ret;

	if (!vdev->pci_ops || !vdev->pci_ops->get_dmabuf_phys)
		return -EOPNOTSUPP;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(get_dma_buf));
	if (ret != 1)
		return ret;

	if (copy_from_user(&get_dma_buf, arg, sizeof(get_dma_buf)))
		return -EFAULT;

	if (!get_dma_buf.nr_ranges || get_dma_buf.flags)
		return -EINVAL;

	/*
	 * For PCI the region_index is the BAR number like everything else.
	 */
	if (get_dma_buf.region_index >= VFIO_PCI_ROM_REGION_INDEX)
		return -ENODEV;

	dma_ranges = memdup_array_user(&arg->dma_ranges, get_dma_buf.nr_ranges,
				       sizeof(*dma_ranges));
	if (IS_ERR(dma_ranges))
		return PTR_ERR(dma_ranges);

	ret = validate_dmabuf_input(&get_dma_buf, dma_ranges, &length);
	if (ret)
		goto err_free_ranges;

	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
	if (!priv) {
		ret = -ENOMEM;
		goto err_free_ranges;
	}
	priv->phys_vec = kcalloc(get_dma_buf.nr_ranges, sizeof(*priv->phys_vec),
				 GFP_KERNEL);
	if (!priv->phys_vec) {
		ret = -ENOMEM;
		goto err_free_priv;
	}

	priv->vdev = vdev;
	priv->nr_ranges = get_dma_buf.nr_ranges;
	priv->size = length;
	ret = vdev->pci_ops->get_dmabuf_phys(vdev, &priv->provider,
					     get_dma_buf.region_index,
					     priv->phys_vec, dma_ranges,
					     priv->nr_ranges);
	if (ret)
		goto err_free_phys;

	kfree(dma_ranges);
	dma_ranges = NULL;

	if (!vfio_device_try_get_registration(&vdev->vdev)) {
		ret = -ENODEV;
		goto err_free_phys;
	}

	exp_info.ops = &vfio_pci_dmabuf_ops;
	exp_info.size = priv->size;
	exp_info.flags = get_dma_buf.open_flags;
	exp_info.priv = priv;

	priv->dmabuf = dma_buf_export(&exp_info);
	if (IS_ERR(priv->dmabuf)) {
		ret = PTR_ERR(priv->dmabuf);
		goto err_dev_put;
	}

	/* dma_buf_put() now frees priv */
	INIT_LIST_HEAD(&priv->dmabufs_elm);
	down_write(&vdev->memory_lock);
	dma_resv_lock(priv->dmabuf->resv, NULL);
	priv->revoked = !__vfio_pci_memory_enabled(vdev);
	list_add_tail(&priv->dmabufs_elm, &vdev->dmabufs);
	dma_resv_unlock(priv->dmabuf->resv);
	up_write(&vdev->memory_lock);

	/*
	 * dma_buf_fd() consumes the reference, when the file closes the dmabuf
	 * will be released.
	 */
	ret = dma_buf_fd(priv->dmabuf, get_dma_buf.open_flags);
	if (ret < 0)
		goto err_dma_buf;
	return ret;

err_dma_buf:
	/*
	 * The dmabuf has already been exported, so dropping the reference runs
	 * vfio_pci_dma_buf_release(), which puts the device registration and
	 * frees priv; do not fall through to the pre-export error labels.
	 */
	dma_buf_put(priv->dmabuf);
	return ret;
err_dev_put:
	vfio_device_put_registration(&vdev->vdev);
err_free_phys:
	kfree(priv->phys_vec);
err_free_priv:
	kfree(priv);
err_free_ranges:
	kfree(dma_ranges);
	return ret;
}

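/*
 * Revoke or restore access to every dmabuf exported from this device. Called
 * with memory_lock held for write, typically when PCI memory decode changes;
 * importers are notified through dma_buf_move_notify().
 */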
void vfio_pci_dma_buf_move(struct vfio_pci_core_device *vdev, bool revoked)
{
	struct vfio_pci_dma_buf *priv;
	struct vfio_pci_dma_buf *tmp;

	lockdep_assert_held_write(&vdev->memory_lock);

	list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
		if (!get_file_active(&priv->dmabuf->file))
			continue;

		if (priv->revoked != revoked) {
			dma_resv_lock(priv->dmabuf->resv, NULL);
			priv->revoked = revoked;
			dma_buf_move_notify(priv->dmabuf);
			dma_resv_unlock(priv->dmabuf->resv);
		}
		fput(priv->dmabuf->file);
	}
}

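/*
 * Detach all exported dmabufs from the device when it is going away:
 * permanently revoke them, notify importers and drop the registration
 * references taken at export time.
 */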
void vfio_pci_dma_buf_cleanup(struct vfio_pci_core_device *vdev)
{
	struct vfio_pci_dma_buf *priv;
	struct vfio_pci_dma_buf *tmp;

	down_write(&vdev->memory_lock);
	list_for_each_entry_safe(priv, tmp, &vdev->dmabufs, dmabufs_elm) {
		if (!get_file_active(&priv->dmabuf->file))
			continue;

		dma_resv_lock(priv->dmabuf->resv, NULL);
		list_del_init(&priv->dmabufs_elm);
		priv->vdev = NULL;
		priv->revoked = true;
		dma_buf_move_notify(priv->dmabuf);
		dma_resv_unlock(priv->dmabuf->resv);
		vfio_device_put_registration(&vdev->vdev);
		fput(priv->dmabuf->file);
	}
	up_write(&vdev->memory_lock);
}