xref: /linux/drivers/vfio/pci/vfio_pci_core.c (revision ed5c2f5fd10dda07263f79f338a512c0f49f76f5)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
4  *     Author: Alex Williamson <alex.williamson@redhat.com>
5  *
6  * Derived from original vfio:
7  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
8  * Author: Tom Lyon, pugs@cisco.com
9  */
10 
11 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12 
13 #include <linux/aperture.h>
14 #include <linux/device.h>
15 #include <linux/eventfd.h>
16 #include <linux/file.h>
17 #include <linux/interrupt.h>
18 #include <linux/iommu.h>
19 #include <linux/module.h>
20 #include <linux/mutex.h>
21 #include <linux/notifier.h>
22 #include <linux/pci.h>
23 #include <linux/pm_runtime.h>
24 #include <linux/slab.h>
25 #include <linux/types.h>
26 #include <linux/uaccess.h>
27 #include <linux/vgaarb.h>
28 #include <linux/nospec.h>
29 #include <linux/sched/mm.h>
30 
31 #include <linux/vfio_pci_core.h>
32 
33 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
34 #define DRIVER_DESC "core driver for VFIO based PCI devices"
35 
36 static bool nointxmask;
37 static bool disable_vga;
38 static bool disable_idle_d3;
39 
40 /* List of PFs that vfio_pci_core_sriov_configure() has been called on */
41 static DEFINE_MUTEX(vfio_pci_sriov_pfs_mutex);
42 static LIST_HEAD(vfio_pci_sriov_pfs);
43 
44 static inline bool vfio_vga_disabled(void)
45 {
46 #ifdef CONFIG_VFIO_PCI_VGA
47 	return disable_vga;
48 #else
49 	return true;
50 #endif
51 }
52 
53 /*
54  * Our VGA arbiter participation is limited since we don't know anything
55  * about the device itself.  However, if the device is the only VGA device
56  * downstream of a bridge and VFIO VGA support is disabled, then we can
57  * safely return legacy VGA IO and memory as not decoded since the user
58  * has no way to get to it and routing can be disabled externally at the
59  * bridge.
60  */
61 static unsigned int vfio_pci_set_decode(struct pci_dev *pdev, bool single_vga)
62 {
63 	struct pci_dev *tmp = NULL;
64 	unsigned char max_busnr;
65 	unsigned int decodes;
66 
67 	if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus))
68 		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
69 		       VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
70 
71 	max_busnr = pci_bus_max_busnr(pdev->bus);
72 	decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
73 
74 	while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) {
75 		if (tmp == pdev ||
76 		    pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) ||
77 		    pci_is_root_bus(tmp->bus))
78 			continue;
79 
80 		if (tmp->bus->number >= pdev->bus->number &&
81 		    tmp->bus->number <= max_busnr) {
82 			pci_dev_put(tmp);
83 			decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
84 			break;
85 		}
86 	}
87 
88 	return decodes;
89 }
90 
91 static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev)
92 {
93 	struct resource *res;
94 	int i;
95 	struct vfio_pci_dummy_resource *dummy_res;
96 
97 	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
98 		int bar = i + PCI_STD_RESOURCES;
99 
100 		res = &vdev->pdev->resource[bar];
101 
102 		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
103 			goto no_mmap;
104 
105 		if (!(res->flags & IORESOURCE_MEM))
106 			goto no_mmap;
107 
108 		/*
109 		 * The PCI core shouldn't set up a resource with a
110 		 * type but zero size. But there may be bugs that
111 		 * cause us to do that.
112 		 */
113 		if (!resource_size(res))
114 			goto no_mmap;
115 
116 		if (resource_size(res) >= PAGE_SIZE) {
117 			vdev->bar_mmap_supported[bar] = true;
118 			continue;
119 		}
120 
121 		if (!(res->start & ~PAGE_MASK)) {
122 			/*
123 			 * Add a dummy resource to reserve the remainder
124 			 * of the exclusive page so that a hot-added
125 			 * device's BAR cannot be assigned into it.
126 			 */
127 			dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL);
128 			if (dummy_res == NULL)
129 				goto no_mmap;
130 
131 			dummy_res->resource.name = "vfio sub-page reserved";
132 			dummy_res->resource.start = res->end + 1;
133 			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
134 			dummy_res->resource.flags = res->flags;
135 			if (request_resource(res->parent,
136 						&dummy_res->resource)) {
137 				kfree(dummy_res);
138 				goto no_mmap;
139 			}
140 			dummy_res->index = bar;
141 			list_add(&dummy_res->res_next,
142 					&vdev->dummy_resources_list);
143 			vdev->bar_mmap_supported[bar] = true;
144 			continue;
145 		}
146 		/*
147 		 * We don't handle the case where the BAR is not page
148 		 * aligned because we can't expect the BAR to be
149 		 * assigned to the same location within a page in the
150 		 * guest when we pass through the BAR.  It's also hard to
151 		 * access such a BAR from userspace because we have no
152 		 * way to get the BAR's offset within a page.
153 		 */
154 no_mmap:
155 		vdev->bar_mmap_supported[bar] = false;
156 	}
157 }
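
/*
 * Editorial note (not in the original source): the effect of the loop above
 * is that a BAR smaller than PAGE_SIZE is only marked mmap-capable when it
 * is page aligned, and the remainder of its page is then claimed by a dummy
 * resource so nothing else can later be assigned into it.  For example, with
 * 4 KiB pages, a 256-byte BAR at 0xfe000000 leaves 0xfe000100-0xfe000fff
 * covered by the "vfio sub-page reserved" dummy resource.
 */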
158 
159 struct vfio_pci_group_info;
160 static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
161 static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
162 				      struct vfio_pci_group_info *groups);
163 
164 /*
165  * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
166  * _and_ the ability to detect when the device is asserting INTx via PCI_STATUS.
167  * If a device implements the former but not the latter we would typically
168  * expect broken_intx_masking to be set and require an exclusive interrupt.
169  * However since we do have control of the device's ability to assert INTx,
170  * we can instead pretend that the device does not implement INTx, virtualizing
171  * the pin register to report zero and maintaining DisINTx set on the host.
172  */
173 static bool vfio_pci_nointx(struct pci_dev *pdev)
174 {
175 	switch (pdev->vendor) {
176 	case PCI_VENDOR_ID_INTEL:
177 		switch (pdev->device) {
178 		/* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */
179 		case 0x1572:
180 		case 0x1574:
181 		case 0x1580 ... 0x1581:
182 		case 0x1583 ... 0x158b:
183 		case 0x37d0 ... 0x37d2:
184 		/* X550 */
185 		case 0x1563:
186 			return true;
187 		default:
188 			return false;
189 		}
190 	}
191 
192 	return false;
193 }
194 
195 static void vfio_pci_probe_power_state(struct vfio_pci_core_device *vdev)
196 {
197 	struct pci_dev *pdev = vdev->pdev;
198 	u16 pmcsr;
199 
200 	if (!pdev->pm_cap)
201 		return;
202 
203 	pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr);
204 
205 	vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET);
206 }
207 
208 /*
209  * pci_set_power_state() wrapper handling devices which perform a soft reset on
210  * D3->D0 transition.  Save state prior to D0/1/2->D3, stash it on the vdev,
211  * restore when returned to D0.  Saved separately from pci_saved_state for use
212  * by PM capability emulation and separately from pci_dev internal saved state
213  * to avoid it being overwritten and consumed around other resets.
214  */
215 int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state)
216 {
217 	struct pci_dev *pdev = vdev->pdev;
218 	bool needs_restore = false, needs_save = false;
219 	int ret;
220 
221 	/* Prevent changing power state for PFs with VFs enabled */
222 	if (pci_num_vf(pdev) && state > PCI_D0)
223 		return -EBUSY;
224 
225 	if (vdev->needs_pm_restore) {
226 		if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) {
227 			pci_save_state(pdev);
228 			needs_save = true;
229 		}
230 
231 		if (pdev->current_state >= PCI_D3hot && state <= PCI_D0)
232 			needs_restore = true;
233 	}
234 
235 	ret = pci_set_power_state(pdev, state);
236 
237 	if (!ret) {
238 		/* D3 might be unsupported via quirk, skip unless in D3 */
239 		if (needs_save && pdev->current_state >= PCI_D3hot) {
240 			/*
241 			 * The current PCI state will be saved locally in
242 			 * 'pm_save' during the D3hot transition. When the
243 			 * device state is changed to D0 again with the current
244 			 * function, then pci_load_and_free_saved_state() will
245 			 * restore the state and free the memory pointed to by
246 			 * 'pm_save'. There are a few cases where the PCI power
247 			 * state can be changed to D0 without the involvement
248 			 * of the driver. For these cases, free the earlier
249 			 * allocated memory first before overwriting 'pm_save'
250 			 * to prevent a memory leak.
251 			 */
252 			kfree(vdev->pm_save);
253 			vdev->pm_save = pci_store_saved_state(pdev);
254 		} else if (needs_restore) {
255 			pci_load_and_free_saved_state(pdev, &vdev->pm_save);
256 			pci_restore_state(pdev);
257 		}
258 	}
259 
260 	return ret;
261 }
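
/*
 * Editorial note (not in the original source): within this file the wrapper
 * is used to force D0 before operations that require it; e.g. both
 * vfio_pci_core_disable() and the VFIO_DEVICE_RESET handler below call
 * vfio_pci_set_power_state(vdev, PCI_D0) before attempting a function reset,
 * since pci_pm_reset() fails outside D0 and NoSoftRst- devices need their
 * saved state reloaded afterwards.
 */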
262 
263 /*
264  * The dev_pm_ops needs to be provided to make pci-driver runtime PM work,
265  * so use a structure without any callbacks.
266  *
267  * The pci-driver core runtime PM routines always save the device state
268  * before going into a suspended state. If the device is going into a low
269  * power state with only runtime PM ops, then no explicit handling is needed
270  * for devices which have NoSoftRst-.
271  */
272 static const struct dev_pm_ops vfio_pci_core_pm_ops = { };
273 
274 int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
275 {
276 	struct pci_dev *pdev = vdev->pdev;
277 	int ret;
278 	u16 cmd;
279 	u8 msix_pos;
280 
281 	if (!disable_idle_d3) {
282 		ret = pm_runtime_resume_and_get(&pdev->dev);
283 		if (ret < 0)
284 			return ret;
285 	}
286 
287 	/* Don't allow our initial saved state to include busmaster */
288 	pci_clear_master(pdev);
289 
290 	ret = pci_enable_device(pdev);
291 	if (ret)
292 		goto out_power;
293 
294 	/* If reset fails because of the device lock, fail this path entirely */
295 	ret = pci_try_reset_function(pdev);
296 	if (ret == -EAGAIN)
297 		goto out_disable_device;
298 
299 	vdev->reset_works = !ret;
300 	pci_save_state(pdev);
301 	vdev->pci_saved_state = pci_store_saved_state(pdev);
302 	if (!vdev->pci_saved_state)
303 		pci_dbg(pdev, "%s: Couldn't store saved state\n", __func__);
304 
305 	if (likely(!nointxmask)) {
306 		if (vfio_pci_nointx(pdev)) {
307 			pci_info(pdev, "Masking broken INTx support\n");
308 			vdev->nointx = true;
309 			pci_intx(pdev, 0);
310 		} else
311 			vdev->pci_2_3 = pci_intx_mask_supported(pdev);
312 	}
313 
314 	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
315 	if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
316 		cmd &= ~PCI_COMMAND_INTX_DISABLE;
317 		pci_write_config_word(pdev, PCI_COMMAND, cmd);
318 	}
319 
320 	ret = vfio_pci_zdev_open_device(vdev);
321 	if (ret)
322 		goto out_free_state;
323 
324 	ret = vfio_config_init(vdev);
325 	if (ret)
326 		goto out_free_zdev;
327 
328 	msix_pos = pdev->msix_cap;
329 	if (msix_pos) {
330 		u16 flags;
331 		u32 table;
332 
333 		pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
334 		pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);
335 
336 		vdev->msix_bar = table & PCI_MSIX_TABLE_BIR;
337 		vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET;
338 		vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
339 	} else
340 		vdev->msix_bar = 0xFF;
341 
342 	if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev))
343 		vdev->has_vga = true;
344 
345 
346 	return 0;
347 
348 out_free_zdev:
349 	vfio_pci_zdev_close_device(vdev);
350 out_free_state:
351 	kfree(vdev->pci_saved_state);
352 	vdev->pci_saved_state = NULL;
353 out_disable_device:
354 	pci_disable_device(pdev);
355 out_power:
356 	if (!disable_idle_d3)
357 		pm_runtime_put(&pdev->dev);
358 	return ret;
359 }
360 EXPORT_SYMBOL_GPL(vfio_pci_core_enable);
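
/*
 * Illustrative sketch (editorial, not part of this file): a vfio-pci variant
 * driver's .open_device callback typically brackets its own setup between
 * vfio_pci_core_enable() and vfio_pci_core_finish_enable().  Names such as
 * my_vfio_pci_open_device() are hypothetical:
 *
 *	static int my_vfio_pci_open_device(struct vfio_device *core_vdev)
 *	{
 *		struct vfio_pci_core_device *vdev = container_of(core_vdev,
 *				struct vfio_pci_core_device, vdev);
 *		int ret;
 *
 *		ret = vfio_pci_core_enable(vdev);
 *		if (ret)
 *			return ret;
 *
 *		(device-specific setup, e.g. extra region registration)
 *
 *		vfio_pci_core_finish_enable(vdev);
 *		return 0;
 *	}
 */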
361 
362 void vfio_pci_core_disable(struct vfio_pci_core_device *vdev)
363 {
364 	struct pci_dev *pdev = vdev->pdev;
365 	struct vfio_pci_dummy_resource *dummy_res, *tmp;
366 	struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
367 	int i, bar;
368 
369 	/* For needs_reset */
370 	lockdep_assert_held(&vdev->vdev.dev_set->lock);
371 
372 	/*
373 	 * This function can be invoked while the power state is non-D0.
374 	 * This function calls __pci_reset_function_locked() which internally
375 	 * can use pci_pm_reset() for the function reset. pci_pm_reset() will
376 	 * fail if the power state is non-D0. Also, for the devices which
377 	 * have NoSoftRst-, the reset function can cause the PCI config space
378 	 * reset without restoring the original state (saved locally in
379 	 * 'vdev->pm_save').
380 	 */
381 	vfio_pci_set_power_state(vdev, PCI_D0);
382 
383 	/* Stop the device from further DMA */
384 	pci_clear_master(pdev);
385 
386 	vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
387 				VFIO_IRQ_SET_ACTION_TRIGGER,
388 				vdev->irq_type, 0, 0, NULL);
389 
390 	/* Device closed, don't need mutex here */
391 	list_for_each_entry_safe(ioeventfd, ioeventfd_tmp,
392 				 &vdev->ioeventfds_list, next) {
393 		vfio_virqfd_disable(&ioeventfd->virqfd);
394 		list_del(&ioeventfd->next);
395 		kfree(ioeventfd);
396 	}
397 	vdev->ioeventfds_nr = 0;
398 
399 	vdev->virq_disabled = false;
400 
401 	for (i = 0; i < vdev->num_regions; i++)
402 		vdev->region[i].ops->release(vdev, &vdev->region[i]);
403 
404 	vdev->num_regions = 0;
405 	kfree(vdev->region);
406 	vdev->region = NULL; /* don't krealloc a freed pointer */
407 
408 	vfio_config_free(vdev);
409 
410 	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
411 		bar = i + PCI_STD_RESOURCES;
412 		if (!vdev->barmap[bar])
413 			continue;
414 		pci_iounmap(pdev, vdev->barmap[bar]);
415 		pci_release_selected_regions(pdev, 1 << bar);
416 		vdev->barmap[bar] = NULL;
417 	}
418 
419 	list_for_each_entry_safe(dummy_res, tmp,
420 				 &vdev->dummy_resources_list, res_next) {
421 		list_del(&dummy_res->res_next);
422 		release_resource(&dummy_res->resource);
423 		kfree(dummy_res);
424 	}
425 
426 	vdev->needs_reset = true;
427 
428 	vfio_pci_zdev_close_device(vdev);
429 
430 	/*
431 	 * If we have saved state, restore it.  If we can reset the device,
432 	 * even better.  Resetting with current state seems better than
433 	 * nothing, but saving and restoring current state without reset
434 	 * is just busy work.
435 	 */
436 	if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
437 		pci_info(pdev, "%s: Couldn't reload saved state\n", __func__);
438 
439 		if (!vdev->reset_works)
440 			goto out;
441 
442 		pci_save_state(pdev);
443 	}
444 
445 	/*
446 	 * Disable INTx and MSI, presumably to avoid spurious interrupts
447 	 * during reset.  Stolen from pci_reset_function()
448 	 */
449 	pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
450 
451 	/*
452 	 * Try to get the locks ourselves to prevent a deadlock. The
453 	 * success of this is dependent on being able to lock the device,
454 	 * which is not always possible.
455 	 * We cannot use the "try" reset interface here, as it would
456 	 * overwrite the previously restored configuration information.
457 	 */
458 	if (vdev->reset_works && pci_dev_trylock(pdev)) {
459 		if (!__pci_reset_function_locked(pdev))
460 			vdev->needs_reset = false;
461 		pci_dev_unlock(pdev);
462 	}
463 
464 	pci_restore_state(pdev);
465 out:
466 	pci_disable_device(pdev);
467 
468 	vfio_pci_dev_set_try_reset(vdev->vdev.dev_set);
469 
470 	/* Put the pm-runtime usage counter acquired during enable */
471 	if (!disable_idle_d3)
472 		pm_runtime_put(&pdev->dev);
473 }
474 EXPORT_SYMBOL_GPL(vfio_pci_core_disable);
475 
476 void vfio_pci_core_close_device(struct vfio_device *core_vdev)
477 {
478 	struct vfio_pci_core_device *vdev =
479 		container_of(core_vdev, struct vfio_pci_core_device, vdev);
480 
481 	if (vdev->sriov_pf_core_dev) {
482 		mutex_lock(&vdev->sriov_pf_core_dev->vf_token->lock);
483 		WARN_ON(!vdev->sriov_pf_core_dev->vf_token->users);
484 		vdev->sriov_pf_core_dev->vf_token->users--;
485 		mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock);
486 	}
487 	vfio_spapr_pci_eeh_release(vdev->pdev);
488 	vfio_pci_core_disable(vdev);
489 
490 	mutex_lock(&vdev->igate);
491 	if (vdev->err_trigger) {
492 		eventfd_ctx_put(vdev->err_trigger);
493 		vdev->err_trigger = NULL;
494 	}
495 	if (vdev->req_trigger) {
496 		eventfd_ctx_put(vdev->req_trigger);
497 		vdev->req_trigger = NULL;
498 	}
499 	mutex_unlock(&vdev->igate);
500 }
501 EXPORT_SYMBOL_GPL(vfio_pci_core_close_device);
502 
503 void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev)
504 {
505 	vfio_pci_probe_mmaps(vdev);
506 	vfio_spapr_pci_eeh_open(vdev->pdev);
507 
508 	if (vdev->sriov_pf_core_dev) {
509 		mutex_lock(&vdev->sriov_pf_core_dev->vf_token->lock);
510 		vdev->sriov_pf_core_dev->vf_token->users++;
511 		mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock);
512 	}
513 }
514 EXPORT_SYMBOL_GPL(vfio_pci_core_finish_enable);
515 
516 static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_type)
517 {
518 	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
519 		u8 pin;
520 
521 		if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) ||
522 		    vdev->nointx || vdev->pdev->is_virtfn)
523 			return 0;
524 
525 		pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
526 
527 		return pin ? 1 : 0;
528 	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
529 		u8 pos;
530 		u16 flags;
531 
532 		pos = vdev->pdev->msi_cap;
533 		if (pos) {
534 			pci_read_config_word(vdev->pdev,
535 					     pos + PCI_MSI_FLAGS, &flags);
536 			return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
537 		}
538 	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
539 		u8 pos;
540 		u16 flags;
541 
542 		pos = vdev->pdev->msix_cap;
543 		if (pos) {
544 			pci_read_config_word(vdev->pdev,
545 					     pos + PCI_MSIX_FLAGS, &flags);
546 
547 			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
548 		}
549 	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
550 		if (pci_is_pcie(vdev->pdev))
551 			return 1;
552 	} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
553 		return 1;
554 	}
555 
556 	return 0;
557 }
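
/*
 * Editorial note (not in the original source): for MSI, the Multiple Message
 * Capable field (PCI_MSI_FLAGS_QMASK, bits 3:1 of Message Control) encodes
 * log2 of the number of requestable vectors, so a field value of 3 yields
 * 1 << 3 = 8 above.  For MSI-X, the Table Size field (PCI_MSIX_FLAGS_QSIZE)
 * holds N - 1, hence the "+ 1".
 */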
558 
559 static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
560 {
561 	(*(int *)data)++;
562 	return 0;
563 }
564 
565 struct vfio_pci_fill_info {
566 	int max;
567 	int cur;
568 	struct vfio_pci_dependent_device *devices;
569 };
570 
571 static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
572 {
573 	struct vfio_pci_fill_info *fill = data;
574 	struct iommu_group *iommu_group;
575 
576 	if (fill->cur == fill->max)
577 		return -EAGAIN; /* Something changed, try again */
578 
579 	iommu_group = iommu_group_get(&pdev->dev);
580 	if (!iommu_group)
581 		return -EPERM; /* Cannot reset non-isolated devices */
582 
583 	fill->devices[fill->cur].group_id = iommu_group_id(iommu_group);
584 	fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus);
585 	fill->devices[fill->cur].bus = pdev->bus->number;
586 	fill->devices[fill->cur].devfn = pdev->devfn;
587 	fill->cur++;
588 	iommu_group_put(iommu_group);
589 	return 0;
590 }
591 
592 struct vfio_pci_group_info {
593 	int count;
594 	struct file **files;
595 };
596 
597 static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot)
598 {
599 	for (; pdev; pdev = pdev->bus->self)
600 		if (pdev->bus == slot->bus)
601 			return (pdev->slot == slot);
602 	return false;
603 }
604 
605 struct vfio_pci_walk_info {
606 	int (*fn)(struct pci_dev *pdev, void *data);
607 	void *data;
608 	struct pci_dev *pdev;
609 	bool slot;
610 	int ret;
611 };
612 
613 static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data)
614 {
615 	struct vfio_pci_walk_info *walk = data;
616 
617 	if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot))
618 		walk->ret = walk->fn(pdev, walk->data);
619 
620 	return walk->ret;
621 }
622 
623 static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
624 					 int (*fn)(struct pci_dev *,
625 						   void *data), void *data,
626 					 bool slot)
627 {
628 	struct vfio_pci_walk_info walk = {
629 		.fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0,
630 	};
631 
632 	pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk);
633 
634 	return walk.ret;
635 }
636 
637 static int msix_mmappable_cap(struct vfio_pci_core_device *vdev,
638 			      struct vfio_info_cap *caps)
639 {
640 	struct vfio_info_cap_header header = {
641 		.id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE,
642 		.version = 1
643 	};
644 
645 	return vfio_info_add_capability(caps, &header, sizeof(header));
646 }
647 
648 int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev,
649 				 unsigned int type, unsigned int subtype,
650 				 const struct vfio_pci_regops *ops,
651 				 size_t size, u32 flags, void *data)
652 {
653 	struct vfio_pci_region *region;
654 
655 	region = krealloc(vdev->region,
656 			  (vdev->num_regions + 1) * sizeof(*region),
657 			  GFP_KERNEL);
658 	if (!region)
659 		return -ENOMEM;
660 
661 	vdev->region = region;
662 	vdev->region[vdev->num_regions].type = type;
663 	vdev->region[vdev->num_regions].subtype = subtype;
664 	vdev->region[vdev->num_regions].ops = ops;
665 	vdev->region[vdev->num_regions].size = size;
666 	vdev->region[vdev->num_regions].flags = flags;
667 	vdev->region[vdev->num_regions].data = data;
668 
669 	vdev->num_regions++;
670 
671 	return 0;
672 }
673 EXPORT_SYMBOL_GPL(vfio_pci_register_dev_region);
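
/*
 * Illustrative sketch (editorial, not part of this file): a variant driver
 * typically registers a vendor-specific region from its .open_device path.
 * The FOO names below are hypothetical placeholders:
 *
 *	ret = vfio_pci_register_dev_region(vdev,
 *		VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_FOO,
 *		VFIO_REGION_SUBTYPE_FOO_BAR, &foo_regops,
 *		foo_size, VFIO_REGION_INFO_FLAG_READ, foo_data);
 *
 * where foo_regops is a struct vfio_pci_regops providing at least .rw and
 * .release, which vfio_pci_rw() and vfio_pci_core_disable() call back into.
 */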
674 
675 long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
676 		unsigned long arg)
677 {
678 	struct vfio_pci_core_device *vdev =
679 		container_of(core_vdev, struct vfio_pci_core_device, vdev);
680 	unsigned long minsz;
681 
682 	if (cmd == VFIO_DEVICE_GET_INFO) {
683 		struct vfio_device_info info;
684 		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
685 		unsigned long capsz;
686 		int ret;
687 
688 		minsz = offsetofend(struct vfio_device_info, num_irqs);
689 
690 		/* For backward compatibility, cannot require this */
691 		capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
692 
693 		if (copy_from_user(&info, (void __user *)arg, minsz))
694 			return -EFAULT;
695 
696 		if (info.argsz < minsz)
697 			return -EINVAL;
698 
699 		if (info.argsz >= capsz) {
700 			minsz = capsz;
701 			info.cap_offset = 0;
702 		}
703 
704 		info.flags = VFIO_DEVICE_FLAGS_PCI;
705 
706 		if (vdev->reset_works)
707 			info.flags |= VFIO_DEVICE_FLAGS_RESET;
708 
709 		info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
710 		info.num_irqs = VFIO_PCI_NUM_IRQS;
711 
712 		ret = vfio_pci_info_zdev_add_caps(vdev, &caps);
713 		if (ret && ret != -ENODEV) {
714 			pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n");
715 			return ret;
716 		}
717 
718 		if (caps.size) {
719 			info.flags |= VFIO_DEVICE_FLAGS_CAPS;
720 			if (info.argsz < sizeof(info) + caps.size) {
721 				info.argsz = sizeof(info) + caps.size;
722 			} else {
723 				vfio_info_cap_shift(&caps, sizeof(info));
724 				if (copy_to_user((void __user *)arg +
725 						  sizeof(info), caps.buf,
726 						  caps.size)) {
727 					kfree(caps.buf);
728 					return -EFAULT;
729 				}
730 				info.cap_offset = sizeof(info);
731 			}
732 
733 			kfree(caps.buf);
734 		}
735 
736 		return copy_to_user((void __user *)arg, &info, minsz) ?
737 			-EFAULT : 0;
738 
739 	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
740 		struct pci_dev *pdev = vdev->pdev;
741 		struct vfio_region_info info;
742 		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
743 		int i, ret;
744 
745 		minsz = offsetofend(struct vfio_region_info, offset);
746 
747 		if (copy_from_user(&info, (void __user *)arg, minsz))
748 			return -EFAULT;
749 
750 		if (info.argsz < minsz)
751 			return -EINVAL;
752 
753 		switch (info.index) {
754 		case VFIO_PCI_CONFIG_REGION_INDEX:
755 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
756 			info.size = pdev->cfg_size;
757 			info.flags = VFIO_REGION_INFO_FLAG_READ |
758 				     VFIO_REGION_INFO_FLAG_WRITE;
759 			break;
760 		case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
761 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
762 			info.size = pci_resource_len(pdev, info.index);
763 			if (!info.size) {
764 				info.flags = 0;
765 				break;
766 			}
767 
768 			info.flags = VFIO_REGION_INFO_FLAG_READ |
769 				     VFIO_REGION_INFO_FLAG_WRITE;
770 			if (vdev->bar_mmap_supported[info.index]) {
771 				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
772 				if (info.index == vdev->msix_bar) {
773 					ret = msix_mmappable_cap(vdev, &caps);
774 					if (ret)
775 						return ret;
776 				}
777 			}
778 
779 			break;
780 		case VFIO_PCI_ROM_REGION_INDEX:
781 		{
782 			void __iomem *io;
783 			size_t size;
784 			u16 cmd;
785 
786 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
787 			info.flags = 0;
788 
789 			/* Report the BAR size, not the ROM size */
790 			info.size = pci_resource_len(pdev, info.index);
791 			if (!info.size) {
792 				/* Shadow ROMs appear as PCI option ROMs */
793 				if (pdev->resource[PCI_ROM_RESOURCE].flags &
794 							IORESOURCE_ROM_SHADOW)
795 					info.size = 0x20000;
796 				else
797 					break;
798 			}
799 
800 			/*
801 			 * Is it really there?  Enable memory decode for
802 			 * implicit access in pci_map_rom().
803 			 */
804 			cmd = vfio_pci_memory_lock_and_enable(vdev);
805 			io = pci_map_rom(pdev, &size);
806 			if (io) {
807 				info.flags = VFIO_REGION_INFO_FLAG_READ;
808 				pci_unmap_rom(pdev, io);
809 			} else {
810 				info.size = 0;
811 			}
812 			vfio_pci_memory_unlock_and_restore(vdev, cmd);
813 
814 			break;
815 		}
816 		case VFIO_PCI_VGA_REGION_INDEX:
817 			if (!vdev->has_vga)
818 				return -EINVAL;
819 
820 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
821 			info.size = 0xc0000;
822 			info.flags = VFIO_REGION_INFO_FLAG_READ |
823 				     VFIO_REGION_INFO_FLAG_WRITE;
824 
825 			break;
826 		default:
827 		{
828 			struct vfio_region_info_cap_type cap_type = {
829 					.header.id = VFIO_REGION_INFO_CAP_TYPE,
830 					.header.version = 1 };
831 
832 			if (info.index >=
833 			    VFIO_PCI_NUM_REGIONS + vdev->num_regions)
834 				return -EINVAL;
835 			info.index = array_index_nospec(info.index,
836 							VFIO_PCI_NUM_REGIONS +
837 							vdev->num_regions);
838 
839 			i = info.index - VFIO_PCI_NUM_REGIONS;
840 
841 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
842 			info.size = vdev->region[i].size;
843 			info.flags = vdev->region[i].flags;
844 
845 			cap_type.type = vdev->region[i].type;
846 			cap_type.subtype = vdev->region[i].subtype;
847 
848 			ret = vfio_info_add_capability(&caps, &cap_type.header,
849 						       sizeof(cap_type));
850 			if (ret)
851 				return ret;
852 
853 			if (vdev->region[i].ops->add_capability) {
854 				ret = vdev->region[i].ops->add_capability(vdev,
855 						&vdev->region[i], &caps);
856 				if (ret)
857 					return ret;
858 			}
859 		}
860 		}
861 
862 		if (caps.size) {
863 			info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
864 			if (info.argsz < sizeof(info) + caps.size) {
865 				info.argsz = sizeof(info) + caps.size;
866 				info.cap_offset = 0;
867 			} else {
868 				vfio_info_cap_shift(&caps, sizeof(info));
869 				if (copy_to_user((void __user *)arg +
870 						  sizeof(info), caps.buf,
871 						  caps.size)) {
872 					kfree(caps.buf);
873 					return -EFAULT;
874 				}
875 				info.cap_offset = sizeof(info);
876 			}
877 
878 			kfree(caps.buf);
879 		}
880 
881 		return copy_to_user((void __user *)arg, &info, minsz) ?
882 			-EFAULT : 0;
883 
884 	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
885 		struct vfio_irq_info info;
886 
887 		minsz = offsetofend(struct vfio_irq_info, count);
888 
889 		if (copy_from_user(&info, (void __user *)arg, minsz))
890 			return -EFAULT;
891 
892 		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
893 			return -EINVAL;
894 
895 		switch (info.index) {
896 		case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
897 		case VFIO_PCI_REQ_IRQ_INDEX:
898 			break;
899 		case VFIO_PCI_ERR_IRQ_INDEX:
900 			if (pci_is_pcie(vdev->pdev))
901 				break;
902 			fallthrough;
903 		default:
904 			return -EINVAL;
905 		}
906 
907 		info.flags = VFIO_IRQ_INFO_EVENTFD;
908 
909 		info.count = vfio_pci_get_irq_count(vdev, info.index);
910 
911 		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
912 			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
913 				       VFIO_IRQ_INFO_AUTOMASKED);
914 		else
915 			info.flags |= VFIO_IRQ_INFO_NORESIZE;
916 
917 		return copy_to_user((void __user *)arg, &info, minsz) ?
918 			-EFAULT : 0;
919 
920 	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
921 		struct vfio_irq_set hdr;
922 		u8 *data = NULL;
923 		int max, ret = 0;
924 		size_t data_size = 0;
925 
926 		minsz = offsetofend(struct vfio_irq_set, count);
927 
928 		if (copy_from_user(&hdr, (void __user *)arg, minsz))
929 			return -EFAULT;
930 
931 		max = vfio_pci_get_irq_count(vdev, hdr.index);
932 
933 		ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
934 						 VFIO_PCI_NUM_IRQS, &data_size);
935 		if (ret)
936 			return ret;
937 
938 		if (data_size) {
939 			data = memdup_user((void __user *)(arg + minsz),
940 					    data_size);
941 			if (IS_ERR(data))
942 				return PTR_ERR(data);
943 		}
944 
945 		mutex_lock(&vdev->igate);
946 
947 		ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
948 					      hdr.start, hdr.count, data);
949 
950 		mutex_unlock(&vdev->igate);
951 		kfree(data);
952 
953 		return ret;
954 
955 	} else if (cmd == VFIO_DEVICE_RESET) {
956 		int ret;
957 
958 		if (!vdev->reset_works)
959 			return -EINVAL;
960 
961 		vfio_pci_zap_and_down_write_memory_lock(vdev);
962 
963 		/*
964 		 * This function can be invoked while the power state is non-D0.
965 		 * If pci_try_reset_function() has been called while the power
966 		 * state is non-D0, then pci_try_reset_function() will
967 		 * internally set the power state to D0 without vfio driver
968 		 * involvement. For the devices which have NoSoftRst-, the
969 		 * reset function can cause the PCI config space reset without
970 		 * restoring the original state (saved locally in
971 		 * 'vdev->pm_save').
972 		 */
973 		vfio_pci_set_power_state(vdev, PCI_D0);
974 
975 		ret = pci_try_reset_function(vdev->pdev);
976 		up_write(&vdev->memory_lock);
977 
978 		return ret;
979 
980 	} else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) {
981 		struct vfio_pci_hot_reset_info hdr;
982 		struct vfio_pci_fill_info fill = { 0 };
983 		struct vfio_pci_dependent_device *devices = NULL;
984 		bool slot = false;
985 		int ret = 0;
986 
987 		minsz = offsetofend(struct vfio_pci_hot_reset_info, count);
988 
989 		if (copy_from_user(&hdr, (void __user *)arg, minsz))
990 			return -EFAULT;
991 
992 		if (hdr.argsz < minsz)
993 			return -EINVAL;
994 
995 		hdr.flags = 0;
996 
997 		/* Can we do a slot or bus reset or neither? */
998 		if (!pci_probe_reset_slot(vdev->pdev->slot))
999 			slot = true;
1000 		else if (pci_probe_reset_bus(vdev->pdev->bus))
1001 			return -ENODEV;
1002 
1003 		/* How many devices are affected? */
1004 		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
1005 						    vfio_pci_count_devs,
1006 						    &fill.max, slot);
1007 		if (ret)
1008 			return ret;
1009 
1010 		WARN_ON(!fill.max); /* Should always be at least one */
1011 
1012 		/*
1013 		 * If there's enough space, fill it now, otherwise return
1014 		 * -ENOSPC and the number of devices affected.
1015 		 */
1016 		if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) {
1017 			ret = -ENOSPC;
1018 			hdr.count = fill.max;
1019 			goto reset_info_exit;
1020 		}
1021 
1022 		devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL);
1023 		if (!devices)
1024 			return -ENOMEM;
1025 
1026 		fill.devices = devices;
1027 
1028 		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
1029 						    vfio_pci_fill_devs,
1030 						    &fill, slot);
1031 
1032 		/*
1033 		 * If a device was removed between counting and filling,
1034 		 * we may come up short of fill.max.  If a device was
1035 		 * added, we'll have a return of -EAGAIN above.
1036 		 */
1037 		if (!ret)
1038 			hdr.count = fill.cur;
1039 
1040 reset_info_exit:
1041 		if (copy_to_user((void __user *)arg, &hdr, minsz))
1042 			ret = -EFAULT;
1043 
1044 		if (!ret) {
1045 			if (copy_to_user((void __user *)(arg + minsz), devices,
1046 					 hdr.count * sizeof(*devices)))
1047 				ret = -EFAULT;
1048 		}
1049 
1050 		kfree(devices);
1051 		return ret;
1052 
1053 	} else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) {
1054 		struct vfio_pci_hot_reset hdr;
1055 		int32_t *group_fds;
1056 		struct file **files;
1057 		struct vfio_pci_group_info info;
1058 		bool slot = false;
1059 		int file_idx, count = 0, ret = 0;
1060 
1061 		minsz = offsetofend(struct vfio_pci_hot_reset, count);
1062 
1063 		if (copy_from_user(&hdr, (void __user *)arg, minsz))
1064 			return -EFAULT;
1065 
1066 		if (hdr.argsz < minsz || hdr.flags)
1067 			return -EINVAL;
1068 
1069 		/* Can we do a slot or bus reset or neither? */
1070 		if (!pci_probe_reset_slot(vdev->pdev->slot))
1071 			slot = true;
1072 		else if (pci_probe_reset_bus(vdev->pdev->bus))
1073 			return -ENODEV;
1074 
1075 		/*
1076 		 * We can't let userspace give us an arbitrarily large
1077 		 * buffer to copy, so verify how many we think there
1078 		 * could be.  Note groups can have multiple devices so
1079 		 * one group per device is the max.
1080 		 */
1081 		ret = vfio_pci_for_each_slot_or_bus(vdev->pdev,
1082 						    vfio_pci_count_devs,
1083 						    &count, slot);
1084 		if (ret)
1085 			return ret;
1086 
1087 		/* Somewhere between 1 and count is OK */
1088 		if (!hdr.count || hdr.count > count)
1089 			return -EINVAL;
1090 
1091 		group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL);
1092 		files = kcalloc(hdr.count, sizeof(*files), GFP_KERNEL);
1093 		if (!group_fds || !files) {
1094 			kfree(group_fds);
1095 			kfree(files);
1096 			return -ENOMEM;
1097 		}
1098 
1099 		if (copy_from_user(group_fds, (void __user *)(arg + minsz),
1100 				   hdr.count * sizeof(*group_fds))) {
1101 			kfree(group_fds);
1102 			kfree(files);
1103 			return -EFAULT;
1104 		}
1105 
1106 		/*
1107 		 * For each group_fd, take a reference to the group's file and
1108 		 * verify it is a vfio group.  Holding the file reference
1109 		 * ensures the group is held across the reset.
1110 		 */
1111 		for (file_idx = 0; file_idx < hdr.count; file_idx++) {
1112 			struct file *file = fget(group_fds[file_idx]);
1113 
1114 			if (!file) {
1115 				ret = -EBADF;
1116 				break;
1117 			}
1118 
1119 			/* Ensure the FD is a vfio group FD. */
1120 			if (!vfio_file_iommu_group(file)) {
1121 				fput(file);
1122 				ret = -EINVAL;
1123 				break;
1124 			}
1125 
1126 			files[file_idx] = file;
1127 		}
1128 
1129 		kfree(group_fds);
1130 
1131 		/* release reference to groups on error */
1132 		if (ret)
1133 			goto hot_reset_release;
1134 
1135 		info.count = hdr.count;
1136 		info.files = files;
1137 
1138 		ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info);
1139 
1140 hot_reset_release:
1141 		for (file_idx--; file_idx >= 0; file_idx--)
1142 			fput(files[file_idx]);
1143 
1144 		kfree(files);
1145 		return ret;
1146 	} else if (cmd == VFIO_DEVICE_IOEVENTFD) {
1147 		struct vfio_device_ioeventfd ioeventfd;
1148 		int count;
1149 
1150 		minsz = offsetofend(struct vfio_device_ioeventfd, fd);
1151 
1152 		if (copy_from_user(&ioeventfd, (void __user *)arg, minsz))
1153 			return -EFAULT;
1154 
1155 		if (ioeventfd.argsz < minsz)
1156 			return -EINVAL;
1157 
1158 		if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK)
1159 			return -EINVAL;
1160 
1161 		count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK;
1162 
1163 		if (hweight8(count) != 1 || ioeventfd.fd < -1)
1164 			return -EINVAL;
1165 
1166 		return vfio_pci_ioeventfd(vdev, ioeventfd.offset,
1167 					  ioeventfd.data, count, ioeventfd.fd);
1168 	}
1169 	return -ENOTTY;
1170 }
1171 EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl);
1172 
1173 static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags,
1174 				       void __user *arg, size_t argsz)
1175 {
1176 	struct vfio_pci_core_device *vdev =
1177 		container_of(device, struct vfio_pci_core_device, vdev);
1178 	uuid_t uuid;
1179 	int ret;
1180 
1181 	if (!vdev->vf_token)
1182 		return -ENOTTY;
1183 	/*
1184 	 * We do not support GET of the VF Token UUID as this could
1185 	 * expose the token of the previous device user.
1186 	 */
1187 	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
1188 				 sizeof(uuid));
1189 	if (ret != 1)
1190 		return ret;
1191 
1192 	if (copy_from_user(&uuid, arg, sizeof(uuid)))
1193 		return -EFAULT;
1194 
1195 	mutex_lock(&vdev->vf_token->lock);
1196 	uuid_copy(&vdev->vf_token->uuid, &uuid);
1197 	mutex_unlock(&vdev->vf_token->lock);
1198 	return 0;
1199 }
1200 
1201 int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
1202 				void __user *arg, size_t argsz)
1203 {
1204 	switch (flags & VFIO_DEVICE_FEATURE_MASK) {
1205 	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
1206 		return vfio_pci_core_feature_token(device, flags, arg, argsz);
1207 	default:
1208 		return -ENOTTY;
1209 	}
1210 }
1211 EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl_feature);
1212 
1213 static ssize_t vfio_pci_rw(struct vfio_pci_core_device *vdev, char __user *buf,
1214 			   size_t count, loff_t *ppos, bool iswrite)
1215 {
1216 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
1217 
1218 	if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
1219 		return -EINVAL;
1220 
1221 	switch (index) {
1222 	case VFIO_PCI_CONFIG_REGION_INDEX:
1223 		return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);
1224 
1225 	case VFIO_PCI_ROM_REGION_INDEX:
1226 		if (iswrite)
1227 			return -EINVAL;
1228 		return vfio_pci_bar_rw(vdev, buf, count, ppos, false);
1229 
1230 	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
1231 		return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);
1232 
1233 	case VFIO_PCI_VGA_REGION_INDEX:
1234 		return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);
1235 	default:
1236 		index -= VFIO_PCI_NUM_REGIONS;
1237 		return vdev->region[index].ops->rw(vdev, buf,
1238 						   count, ppos, iswrite);
1239 	}
1240 
1241 	return -EINVAL;
1242 }
1243 
1244 ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
1245 		size_t count, loff_t *ppos)
1246 {
1247 	struct vfio_pci_core_device *vdev =
1248 		container_of(core_vdev, struct vfio_pci_core_device, vdev);
1249 
1250 	if (!count)
1251 		return 0;
1252 
1253 	return vfio_pci_rw(vdev, buf, count, ppos, false);
1254 }
1255 EXPORT_SYMBOL_GPL(vfio_pci_core_read);
1256 
1257 ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
1258 		size_t count, loff_t *ppos)
1259 {
1260 	struct vfio_pci_core_device *vdev =
1261 		container_of(core_vdev, struct vfio_pci_core_device, vdev);
1262 
1263 	if (!count)
1264 		return 0;
1265 
1266 	return vfio_pci_rw(vdev, (char __user *)buf, count, ppos, true);
1267 }
1268 EXPORT_SYMBOL_GPL(vfio_pci_core_write);
1269 
1270 /* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */
1271 static int vfio_pci_zap_and_vma_lock(struct vfio_pci_core_device *vdev, bool try)
1272 {
1273 	struct vfio_pci_mmap_vma *mmap_vma, *tmp;
1274 
1275 	/*
1276 	 * Lock ordering:
1277 	 * vma_lock is nested under mmap_lock for vm_ops callback paths.
1278 	 * The memory_lock semaphore is used by both code paths calling
1279 	 * into this function to zap vmas and the vm_ops.fault callback
1280 	 * to protect the memory enable state of the device.
1281 	 *
1282 	 * When zapping vmas we need to maintain the mmap_lock => vma_lock
1283 	 * ordering, which requires using vma_lock to walk vma_list to
1284 	 * acquire an mm, then dropping vma_lock to get the mmap_lock and
1285 	 * reacquiring vma_lock.  This logic is derived from similar
1286 	 * requirements in uverbs_user_mmap_disassociate().
1287 	 *
1288 	 * mmap_lock must always be the top-level lock when it is taken.
1289 	 * Therefore we can only hold the memory_lock write lock when
1290 	 * vma_list is empty, as we'd need to take mmap_lock to clear
1291 	 * entries.  vma_list can only be guaranteed empty when holding
1292 	 * vma_lock, thus memory_lock is nested under vma_lock.
1293 	 *
1294 	 * This enables the vm_ops.fault callback to acquire vma_lock,
1295 	 * followed by memory_lock read lock, while already holding
1296 	 * mmap_lock without risk of deadlock.
1297 	 */
1298 	while (1) {
1299 		struct mm_struct *mm = NULL;
1300 
1301 		if (try) {
1302 			if (!mutex_trylock(&vdev->vma_lock))
1303 				return 0;
1304 		} else {
1305 			mutex_lock(&vdev->vma_lock);
1306 		}
1307 		while (!list_empty(&vdev->vma_list)) {
1308 			mmap_vma = list_first_entry(&vdev->vma_list,
1309 						    struct vfio_pci_mmap_vma,
1310 						    vma_next);
1311 			mm = mmap_vma->vma->vm_mm;
1312 			if (mmget_not_zero(mm))
1313 				break;
1314 
1315 			list_del(&mmap_vma->vma_next);
1316 			kfree(mmap_vma);
1317 			mm = NULL;
1318 		}
1319 		if (!mm)
1320 			return 1;
1321 		mutex_unlock(&vdev->vma_lock);
1322 
1323 		if (try) {
1324 			if (!mmap_read_trylock(mm)) {
1325 				mmput(mm);
1326 				return 0;
1327 			}
1328 		} else {
1329 			mmap_read_lock(mm);
1330 		}
1331 		if (try) {
1332 			if (!mutex_trylock(&vdev->vma_lock)) {
1333 				mmap_read_unlock(mm);
1334 				mmput(mm);
1335 				return 0;
1336 			}
1337 		} else {
1338 			mutex_lock(&vdev->vma_lock);
1339 		}
1340 		list_for_each_entry_safe(mmap_vma, tmp,
1341 					 &vdev->vma_list, vma_next) {
1342 			struct vm_area_struct *vma = mmap_vma->vma;
1343 
1344 			if (vma->vm_mm != mm)
1345 				continue;
1346 
1347 			list_del(&mmap_vma->vma_next);
1348 			kfree(mmap_vma);
1349 
1350 			zap_vma_ptes(vma, vma->vm_start,
1351 				     vma->vm_end - vma->vm_start);
1352 		}
1353 		mutex_unlock(&vdev->vma_lock);
1354 		mmap_read_unlock(mm);
1355 		mmput(mm);
1356 	}
1357 }
1358 
1359 void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev)
1360 {
1361 	vfio_pci_zap_and_vma_lock(vdev, false);
1362 	down_write(&vdev->memory_lock);
1363 	mutex_unlock(&vdev->vma_lock);
1364 }
1365 
1366 u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev)
1367 {
1368 	u16 cmd;
1369 
1370 	down_write(&vdev->memory_lock);
1371 	pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd);
1372 	if (!(cmd & PCI_COMMAND_MEMORY))
1373 		pci_write_config_word(vdev->pdev, PCI_COMMAND,
1374 				      cmd | PCI_COMMAND_MEMORY);
1375 
1376 	return cmd;
1377 }
1378 
1379 void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 cmd)
1380 {
1381 	pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd);
1382 	up_write(&vdev->memory_lock);
1383 }
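
/*
 * Editorial note (not in the original source): these two helpers are used as
 * a pair around accesses that require memory decode, as in the ROM probe in
 * vfio_pci_core_ioctl() above:
 *
 *	cmd = vfio_pci_memory_lock_and_enable(vdev);
 *	io = pci_map_rom(pdev, &size);
 *	...
 *	vfio_pci_memory_unlock_and_restore(vdev, cmd);
 */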
1384 
1385 /* Caller holds vma_lock */
1386 static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev,
1387 			      struct vm_area_struct *vma)
1388 {
1389 	struct vfio_pci_mmap_vma *mmap_vma;
1390 
1391 	mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL);
1392 	if (!mmap_vma)
1393 		return -ENOMEM;
1394 
1395 	mmap_vma->vma = vma;
1396 	list_add(&mmap_vma->vma_next, &vdev->vma_list);
1397 
1398 	return 0;
1399 }
1400 
1401 /*
1402  * Zap mmaps on open so that we can fault them in on access and therefore
1403  * our vma_list only tracks mappings accessed since last zap.
1404  */
1405 static void vfio_pci_mmap_open(struct vm_area_struct *vma)
1406 {
1407 	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
1408 }
1409 
1410 static void vfio_pci_mmap_close(struct vm_area_struct *vma)
1411 {
1412 	struct vfio_pci_core_device *vdev = vma->vm_private_data;
1413 	struct vfio_pci_mmap_vma *mmap_vma;
1414 
1415 	mutex_lock(&vdev->vma_lock);
1416 	list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
1417 		if (mmap_vma->vma == vma) {
1418 			list_del(&mmap_vma->vma_next);
1419 			kfree(mmap_vma);
1420 			break;
1421 		}
1422 	}
1423 	mutex_unlock(&vdev->vma_lock);
1424 }
1425 
1426 static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
1427 {
1428 	struct vm_area_struct *vma = vmf->vma;
1429 	struct vfio_pci_core_device *vdev = vma->vm_private_data;
1430 	struct vfio_pci_mmap_vma *mmap_vma;
1431 	vm_fault_t ret = VM_FAULT_NOPAGE;
1432 
1433 	mutex_lock(&vdev->vma_lock);
1434 	down_read(&vdev->memory_lock);
1435 
1436 	if (!__vfio_pci_memory_enabled(vdev)) {
1437 		ret = VM_FAULT_SIGBUS;
1438 		goto up_out;
1439 	}
1440 
1441 	/*
1442 	 * We populate the whole vma on fault, so we need to test whether
1443 	 * the vma has already been mapped, such as for concurrent faults
1444 	 * to the same vma.  io_remap_pfn_range() will trigger a BUG_ON if
1445 	 * we ask it to fill the same range again.
1446 	 */
1447 	list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
1448 		if (mmap_vma->vma == vma)
1449 			goto up_out;
1450 	}
1451 
1452 	if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
1453 			       vma->vm_end - vma->vm_start,
1454 			       vma->vm_page_prot)) {
1455 		ret = VM_FAULT_SIGBUS;
1456 		zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
1457 		goto up_out;
1458 	}
1459 
1460 	if (__vfio_pci_add_vma(vdev, vma)) {
1461 		ret = VM_FAULT_OOM;
1462 		zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
1463 	}
1464 
1465 up_out:
1466 	up_read(&vdev->memory_lock);
1467 	mutex_unlock(&vdev->vma_lock);
1468 	return ret;
1469 }
1470 
1471 static const struct vm_operations_struct vfio_pci_mmap_ops = {
1472 	.open = vfio_pci_mmap_open,
1473 	.close = vfio_pci_mmap_close,
1474 	.fault = vfio_pci_mmap_fault,
1475 };
1476 
1477 int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma)
1478 {
1479 	struct vfio_pci_core_device *vdev =
1480 		container_of(core_vdev, struct vfio_pci_core_device, vdev);
1481 	struct pci_dev *pdev = vdev->pdev;
1482 	unsigned int index;
1483 	u64 phys_len, req_len, pgoff, req_start;
1484 	int ret;
1485 
1486 	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
1487 
1488 	if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
1489 		return -EINVAL;
1490 	if (vma->vm_end < vma->vm_start)
1491 		return -EINVAL;
1492 	if ((vma->vm_flags & VM_SHARED) == 0)
1493 		return -EINVAL;
1494 	if (index >= VFIO_PCI_NUM_REGIONS) {
1495 		int regnum = index - VFIO_PCI_NUM_REGIONS;
1496 		struct vfio_pci_region *region = vdev->region + regnum;
1497 
1498 		if (region->ops && region->ops->mmap &&
1499 		    (region->flags & VFIO_REGION_INFO_FLAG_MMAP))
1500 			return region->ops->mmap(vdev, region, vma);
1501 		return -EINVAL;
1502 	}
1503 	if (index >= VFIO_PCI_ROM_REGION_INDEX)
1504 		return -EINVAL;
1505 	if (!vdev->bar_mmap_supported[index])
1506 		return -EINVAL;
1507 
1508 	phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
1509 	req_len = vma->vm_end - vma->vm_start;
1510 	pgoff = vma->vm_pgoff &
1511 		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
1512 	req_start = pgoff << PAGE_SHIFT;
1513 
1514 	if (req_start + req_len > phys_len)
1515 		return -EINVAL;
1516 
1517 	/*
1518 	 * Even though we don't make use of the barmap for the mmap,
1519 	 * we need to request the region and the barmap tracks that.
1520 	 */
1521 	if (!vdev->barmap[index]) {
1522 		ret = pci_request_selected_regions(pdev,
1523 						   1 << index, "vfio-pci");
1524 		if (ret)
1525 			return ret;
1526 
1527 		vdev->barmap[index] = pci_iomap(pdev, index, 0);
1528 		if (!vdev->barmap[index]) {
1529 			pci_release_selected_regions(pdev, 1 << index);
1530 			return -ENOMEM;
1531 		}
1532 	}
1533 
1534 	vma->vm_private_data = vdev;
1535 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
1536 	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
1537 
1538 	/*
1539 	 * See remap_pfn_range(), called from vfio_pci_mmap_fault(), but we can't
1540 	 * change vm_flags within the fault handler.  Set them now.
1541 	 */
1542 	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
1543 	vma->vm_ops = &vfio_pci_mmap_ops;
1544 
1545 	return 0;
1546 }
1547 EXPORT_SYMBOL_GPL(vfio_pci_core_mmap);
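
/*
 * Illustrative sketch (editorial, not part of this file): from userspace, a
 * BAR that reports VFIO_REGION_INFO_FLAG_MMAP is mapped through the device
 * fd at the offset returned by VFIO_DEVICE_GET_REGION_INFO, which encodes
 * the region index via VFIO_PCI_INDEX_TO_OFFSET():
 *
 *	struct vfio_region_info reg = {
 *		.argsz = sizeof(reg),
 *		.index = VFIO_PCI_BAR0_REGION_INDEX,
 *	};
 *
 *	ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
 *	if (reg.flags & VFIO_REGION_INFO_FLAG_MMAP)
 *		bar0 = mmap(NULL, reg.size, PROT_READ | PROT_WRITE,
 *			    MAP_SHARED, device_fd, reg.offset);
 */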
1548 
1549 void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count)
1550 {
1551 	struct vfio_pci_core_device *vdev =
1552 		container_of(core_vdev, struct vfio_pci_core_device, vdev);
1553 	struct pci_dev *pdev = vdev->pdev;
1554 
1555 	mutex_lock(&vdev->igate);
1556 
1557 	if (vdev->req_trigger) {
1558 		if (!(count % 10))
1559 			pci_notice_ratelimited(pdev,
1560 				"Relaying device request to user (#%u)\n",
1561 				count);
1562 		eventfd_signal(vdev->req_trigger, 1);
1563 	} else if (count == 0) {
1564 		pci_warn(pdev,
1565 			"No device request channel registered, blocked until released by user\n");
1566 	}
1567 
1568 	mutex_unlock(&vdev->igate);
1569 }
1570 EXPORT_SYMBOL_GPL(vfio_pci_core_request);
1571 
1572 static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev,
1573 				      bool vf_token, uuid_t *uuid)
1574 {
1575 	/*
1576 	 * There's always some degree of trust or collaboration between SR-IOV
1577 	 * PF and VFs, even if just that the PF hosts the SR-IOV capability and
1578 	 * can disrupt VFs with a reset, but often the PF has more explicit
1579 	 * access to deny service to the VF or access data passed through the
1580 	 * VF.  We therefore require an opt-in via a shared VF token (UUID) to
1581  * represent this trust.  This both prevents a VF driver from assuming
1582  * the PF driver is a trusted, in-kernel driver, and prevents a PF
1583  * driver from being replaced with a rogue driver unknown to in-use
1584  * VF drivers.
1585 	 *
1586 	 * Therefore when presented with a VF, if the PF is a vfio device and
1587 	 * it is bound to the vfio-pci driver, the user needs to provide a VF
1588 	 * token to access the device, in the form of appending a vf_token to
1589 	 * the device name, for example:
1590 	 *
1591 	 * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3"
1592 	 *
1593 	 * When presented with a PF which has VFs in use, the user must also
1594 	 * provide the current VF token to prove collaboration with existing
1595 	 * VF users.  If VFs are not in use, the VF token provided for the PF
1596 	 * device will act to set the VF token.
1597 	 *
1598 	 * If the VF token is provided but unused, an error is generated.
1599 	 */
1600 	if (vdev->pdev->is_virtfn) {
1601 		struct vfio_pci_core_device *pf_vdev = vdev->sriov_pf_core_dev;
1602 		bool match;
1603 
1604 		if (!pf_vdev) {
1605 			if (!vf_token)
1606 				return 0; /* PF is not vfio-pci, no VF token */
1607 
1608 			pci_info_ratelimited(vdev->pdev,
1609 				"VF token incorrectly provided, PF not bound to vfio-pci\n");
1610 			return -EINVAL;
1611 		}
1612 
1613 		if (!vf_token) {
1614 			pci_info_ratelimited(vdev->pdev,
1615 				"VF token required to access device\n");
1616 			return -EACCES;
1617 		}
1618 
1619 		mutex_lock(&pf_vdev->vf_token->lock);
1620 		match = uuid_equal(uuid, &pf_vdev->vf_token->uuid);
1621 		mutex_unlock(&pf_vdev->vf_token->lock);
1622 
1623 		if (!match) {
1624 			pci_info_ratelimited(vdev->pdev,
1625 				"Incorrect VF token provided for device\n");
1626 			return -EACCES;
1627 		}
1628 	} else if (vdev->vf_token) {
1629 		mutex_lock(&vdev->vf_token->lock);
1630 		if (vdev->vf_token->users) {
1631 			if (!vf_token) {
1632 				mutex_unlock(&vdev->vf_token->lock);
1633 				pci_info_ratelimited(vdev->pdev,
1634 					"VF token required to access device\n");
1635 				return -EACCES;
1636 			}
1637 
1638 			if (!uuid_equal(uuid, &vdev->vf_token->uuid)) {
1639 				mutex_unlock(&vdev->vf_token->lock);
1640 				pci_info_ratelimited(vdev->pdev,
1641 					"Incorrect VF token provided for device\n");
1642 				return -EACCES;
1643 			}
1644 		} else if (vf_token) {
1645 			uuid_copy(&vdev->vf_token->uuid, uuid);
1646 		}
1647 
1648 		mutex_unlock(&vdev->vf_token->lock);
1649 	} else if (vf_token) {
1650 		pci_info_ratelimited(vdev->pdev,
1651 			"VF token incorrectly provided, not a PF or VF\n");
1652 		return -EINVAL;
1653 	}
1654 
1655 	return 0;
1656 }
1657 
1658 #define VF_TOKEN_ARG "vf_token="
1659 
1660 int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf)
1661 {
1662 	struct vfio_pci_core_device *vdev =
1663 		container_of(core_vdev, struct vfio_pci_core_device, vdev);
1664 	bool vf_token = false;
1665 	uuid_t uuid;
1666 	int ret;
1667 
1668 	if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev))))
1669 		return 0; /* No match */
1670 
1671 	if (strlen(buf) > strlen(pci_name(vdev->pdev))) {
1672 		buf += strlen(pci_name(vdev->pdev));
1673 
1674 		if (*buf != ' ')
1675 			return 0; /* No match: non-whitespace after name */
1676 
1677 		while (*buf) {
1678 			if (*buf == ' ') {
1679 				buf++;
1680 				continue;
1681 			}
1682 
1683 			if (!vf_token && !strncmp(buf, VF_TOKEN_ARG,
1684 						  strlen(VF_TOKEN_ARG))) {
1685 				buf += strlen(VF_TOKEN_ARG);
1686 
1687 				if (strlen(buf) < UUID_STRING_LEN)
1688 					return -EINVAL;
1689 
1690 				ret = uuid_parse(buf, &uuid);
1691 				if (ret)
1692 					return ret;
1693 
1694 				vf_token = true;
1695 				buf += UUID_STRING_LEN;
1696 			} else {
1697 				/* Unknown/duplicate option */
1698 				return -EINVAL;
1699 			}
1700 		}
1701 	}
1702 
1703 	ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid);
1704 	if (ret)
1705 		return ret;
1706 
1707 	return 1; /* Match */
1708 }
1709 EXPORT_SYMBOL_GPL(vfio_pci_core_match);
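
/*
 * Illustrative sketch (editorial, not part of this file): a variant driver
 * wires the exported helpers above into its struct vfio_device_ops, much as
 * the generic vfio-pci driver does.  The my_* names are hypothetical:
 *
 *	static const struct vfio_device_ops my_vfio_pci_ops = {
 *		.name		= "my-vfio-pci",
 *		.open_device	= my_vfio_pci_open_device,
 *		.close_device	= vfio_pci_core_close_device,
 *		.ioctl		= vfio_pci_core_ioctl,
 *		.device_feature	= vfio_pci_core_ioctl_feature,
 *		.read		= vfio_pci_core_read,
 *		.write		= vfio_pci_core_write,
 *		.mmap		= vfio_pci_core_mmap,
 *		.request	= vfio_pci_core_request,
 *		.match		= vfio_pci_core_match,
 *	};
 */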
1710 
1711 static int vfio_pci_bus_notifier(struct notifier_block *nb,
1712 				 unsigned long action, void *data)
1713 {
1714 	struct vfio_pci_core_device *vdev = container_of(nb,
1715 						    struct vfio_pci_core_device, nb);
1716 	struct device *dev = data;
1717 	struct pci_dev *pdev = to_pci_dev(dev);
1718 	struct pci_dev *physfn = pci_physfn(pdev);
1719 
1720 	if (action == BUS_NOTIFY_ADD_DEVICE &&
1721 	    pdev->is_virtfn && physfn == vdev->pdev) {
1722 		pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n",
1723 			 pci_name(pdev));
1724 		pdev->driver_override = kasprintf(GFP_KERNEL, "%s",
1725 						  vdev->vdev.ops->name);
1726 	} else if (action == BUS_NOTIFY_BOUND_DRIVER &&
1727 		   pdev->is_virtfn && physfn == vdev->pdev) {
1728 		struct pci_driver *drv = pci_dev_driver(pdev);
1729 
1730 		if (drv && drv != pci_dev_driver(vdev->pdev))
1731 			pci_warn(vdev->pdev,
1732 				 "VF %s bound to driver %s while PF bound to driver %s\n",
1733 				 pci_name(pdev), drv->name,
1734 				 pci_dev_driver(vdev->pdev)->name);
1735 	}
1736 
1737 	return 0;
1738 }
1739 
1740 static int vfio_pci_vf_init(struct vfio_pci_core_device *vdev)
1741 {
1742 	struct pci_dev *pdev = vdev->pdev;
1743 	struct vfio_pci_core_device *cur;
1744 	struct pci_dev *physfn;
1745 	int ret;
1746 
1747 	if (pdev->is_virtfn) {
1748 		/*
1749 		 * If this VF was created by our vfio_pci_core_sriov_configure()
1750 		 * then we can find the PF vfio_pci_core_device now, and due to
1751 		 * the locking in pci_disable_sriov() it cannot change until
1752 		 * this VF device driver is removed.
1753 		 */
1754 		physfn = pci_physfn(vdev->pdev);
1755 		mutex_lock(&vfio_pci_sriov_pfs_mutex);
1756 		list_for_each_entry(cur, &vfio_pci_sriov_pfs, sriov_pfs_item) {
1757 			if (cur->pdev == physfn) {
1758 				vdev->sriov_pf_core_dev = cur;
1759 				break;
1760 			}
1761 		}
1762 		mutex_unlock(&vfio_pci_sriov_pfs_mutex);
1763 		return 0;
1764 	}
1765 
1766 	/* Not an SR-IOV PF */
1767 	if (!pdev->is_physfn)
1768 		return 0;
1769 
1770 	vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL);
1771 	if (!vdev->vf_token)
1772 		return -ENOMEM;
1773 
1774 	mutex_init(&vdev->vf_token->lock);
1775 	uuid_gen(&vdev->vf_token->uuid);
1776 
1777 	vdev->nb.notifier_call = vfio_pci_bus_notifier;
1778 	ret = bus_register_notifier(&pci_bus_type, &vdev->nb);
1779 	if (ret) {
1780 		kfree(vdev->vf_token);
1781 		return ret;
1782 	}
1783 	return 0;
1784 }
1785 
1786 static void vfio_pci_vf_uninit(struct vfio_pci_core_device *vdev)
1787 {
1788 	if (!vdev->vf_token)
1789 		return;
1790 
1791 	bus_unregister_notifier(&pci_bus_type, &vdev->nb);
1792 	WARN_ON(vdev->vf_token->users);
1793 	mutex_destroy(&vdev->vf_token->lock);
1794 	kfree(vdev->vf_token);
1795 }
1796 
1797 static int vfio_pci_vga_init(struct vfio_pci_core_device *vdev)
1798 {
1799 	struct pci_dev *pdev = vdev->pdev;
1800 	int ret;
1801 
1802 	if (!vfio_pci_is_vga(pdev))
1803 		return 0;
1804 
1805 	ret = aperture_remove_conflicting_pci_devices(pdev, vdev->vdev.ops->name);
1806 	if (ret)
1807 		return ret;
1808 
1809 	ret = vga_client_register(pdev, vfio_pci_set_decode);
1810 	if (ret)
1811 		return ret;
1812 	vga_set_legacy_decoding(pdev, vfio_pci_set_decode(pdev, false));
1813 	return 0;
1814 }
1815 
1816 static void vfio_pci_vga_uninit(struct vfio_pci_core_device *vdev)
1817 {
1818 	struct pci_dev *pdev = vdev->pdev;
1819 
1820 	if (!vfio_pci_is_vga(pdev))
1821 		return;
1822 	vga_client_unregister(pdev);
1823 	vga_set_legacy_decoding(pdev, VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
1824 					      VGA_RSRC_LEGACY_IO |
1825 					      VGA_RSRC_LEGACY_MEM);
1826 }
1827 
1828 void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev,
1829 			       struct pci_dev *pdev,
1830 			       const struct vfio_device_ops *vfio_pci_ops)
1831 {
1832 	vfio_init_group_dev(&vdev->vdev, &pdev->dev, vfio_pci_ops);
1833 	vdev->pdev = pdev;
1834 	vdev->irq_type = VFIO_PCI_NUM_IRQS;
1835 	mutex_init(&vdev->igate);
1836 	spin_lock_init(&vdev->irqlock);
1837 	mutex_init(&vdev->ioeventfds_lock);
1838 	INIT_LIST_HEAD(&vdev->dummy_resources_list);
1839 	INIT_LIST_HEAD(&vdev->ioeventfds_list);
1840 	mutex_init(&vdev->vma_lock);
1841 	INIT_LIST_HEAD(&vdev->vma_list);
1842 	INIT_LIST_HEAD(&vdev->sriov_pfs_item);
1843 	init_rwsem(&vdev->memory_lock);
1844 }
1845 EXPORT_SYMBOL_GPL(vfio_pci_core_init_device);
1846 
1847 void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev)
1848 {
1849 	mutex_destroy(&vdev->igate);
1850 	mutex_destroy(&vdev->ioeventfds_lock);
1851 	mutex_destroy(&vdev->vma_lock);
1852 	vfio_uninit_group_dev(&vdev->vdev);
1853 	kfree(vdev->region);
1854 	kfree(vdev->pm_save);
1855 }
1856 EXPORT_SYMBOL_GPL(vfio_pci_core_uninit_device);
1857 
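/*
 * Register the device with vfio-core and enable runtime PM.  As a minimal
 * sketch of the expected call order and the drvdata requirement checked
 * below, a variant driver's probe might look like this (my_vfio_pci_probe
 * and my_ops are hypothetical names used only for illustration):
 *
 *	static int my_vfio_pci_probe(struct pci_dev *pdev,
 *				     const struct pci_device_id *id)
 *	{
 *		struct vfio_pci_core_device *vdev;
 *		int ret;
 *
 *		vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
 *		if (!vdev)
 *			return -ENOMEM;
 *
 *		vfio_pci_core_init_device(vdev, pdev, &my_ops);
 *		dev_set_drvdata(&pdev->dev, vdev);
 *
 *		ret = vfio_pci_core_register_device(vdev);
 *		if (ret) {
 *			vfio_pci_core_uninit_device(vdev);
 *			kfree(vdev);
 *			return ret;
 *		}
 *		return 0;
 *	}
 */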
1858 int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev)
1859 {
1860 	struct pci_dev *pdev = vdev->pdev;
1861 	struct device *dev = &pdev->dev;
1862 	int ret;
1863 
1864 	/* Drivers must have set the vfio_pci_core_device as their drvdata */
1865 	if (WARN_ON(vdev != dev_get_drvdata(dev)))
1866 		return -EINVAL;
1867 
1868 	if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
1869 		return -EINVAL;
1870 
1871 	if (vdev->vdev.mig_ops) {
1872 		if (!(vdev->vdev.mig_ops->migration_get_state &&
1873 		      vdev->vdev.mig_ops->migration_set_state) ||
1874 		    !(vdev->vdev.migration_flags & VFIO_MIGRATION_STOP_COPY))
1875 			return -EINVAL;
1876 	}
1877 
1878 	/*
1879 	 * Prevent binding to PFs with VFs enabled; the VFs might be in use
1880 	 * by the host or other users.  We cannot capture the VFs if they
1881 	 * already exist, nor can we track VF users.  Disabling SR-IOV here
1882 	 * would initiate removal of the VFs, which unbinds their drivers
1883 	 * and is prone to blocking if a VF is also in use by vfio-pci.
1884 	 * Just reject such PFs and let the user sort it out.
1885 	 */
1886 	if (pci_num_vf(pdev)) {
1887 		pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n");
1888 		return -EBUSY;
1889 	}
1890 
1891 	if (pci_is_root_bus(pdev->bus)) {
1892 		ret = vfio_assign_device_set(&vdev->vdev, vdev);
1893 	} else if (!pci_probe_reset_slot(pdev->slot)) {
1894 		ret = vfio_assign_device_set(&vdev->vdev, pdev->slot);
1895 	} else {
1896 		/*
1897 		 * If there is no slot reset support for this device, the whole
1898 		 * bus needs to be grouped together to support bus-wide resets.
1899 		 */
1900 		ret = vfio_assign_device_set(&vdev->vdev, pdev->bus);
1901 	}
1902 
1903 	if (ret)
1904 		return ret;
1905 	ret = vfio_pci_vf_init(vdev);
1906 	if (ret)
1907 		return ret;
1908 	ret = vfio_pci_vga_init(vdev);
1909 	if (ret)
1910 		goto out_vf;
1911 
1912 	vfio_pci_probe_power_state(vdev);
1913 
1914 	/*
1915 	 * pci-core sets the device power state to an unknown value at
1916 	 * bootup and after being removed from a driver.  The only
1917 	 * transition it allows from this unknown state is to D0, which
1918 	 * typically happens when a driver calls pci_enable_device().
1919 	 * We're not ready to enable the device yet, but we do want to
1920 	 * be able to get to D3.  Therefore first do a D0 transition
1921 	 * before enabling runtime PM.
1922 	 */
1923 	vfio_pci_set_power_state(vdev, PCI_D0);
1924 
1925 	dev->driver->pm = &vfio_pci_core_pm_ops;
1926 	pm_runtime_allow(dev);
1927 	if (!disable_idle_d3)
1928 		pm_runtime_put(dev);
1929 
1930 	ret = vfio_register_group_dev(&vdev->vdev);
1931 	if (ret)
1932 		goto out_power;
1933 	return 0;
1934 
1935 out_power:
1936 	if (!disable_idle_d3)
1937 		pm_runtime_get_noresume(dev);
1938 
1939 	pm_runtime_forbid(dev);
1940 out_vf:
1941 	vfio_pci_vf_uninit(vdev);
1942 	return ret;
1943 }
1944 EXPORT_SYMBOL_GPL(vfio_pci_core_register_device);
1945 
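/*
 * Undo vfio_pci_core_register_device(): disable any VFs this PF created,
 * remove the device from vfio-core, and release the SR-IOV, VGA and runtime
 * PM state acquired at registration time.
 */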
1946 void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev)
1947 {
1948 	vfio_pci_core_sriov_configure(vdev, 0);
1949 
1950 	vfio_unregister_group_dev(&vdev->vdev);
1951 
1952 	vfio_pci_vf_uninit(vdev);
1953 	vfio_pci_vga_uninit(vdev);
1954 
1955 	if (!disable_idle_d3)
1956 		pm_runtime_get_noresume(&vdev->pdev->dev);
1957 
1958 	pm_runtime_forbid(&vdev->pdev->dev);
1959 }
1960 EXPORT_SYMBOL_GPL(vfio_pci_core_unregister_device);
1961 
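/*
 * AER error_detected handler: forward the error to userspace by signalling
 * the err_trigger eventfd (configured through the VFIO error IRQ index) and
 * report that recovery can proceed.
 */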
1962 pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
1963 						pci_channel_state_t state)
1964 {
1965 	struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
1966 
1967 	mutex_lock(&vdev->igate);
1968 
1969 	if (vdev->err_trigger)
1970 		eventfd_signal(vdev->err_trigger, 1);
1971 
1972 	mutex_unlock(&vdev->igate);
1973 
1974 	return PCI_ERS_RESULT_CAN_RECOVER;
1975 }
1976 EXPORT_SYMBOL_GPL(vfio_pci_core_aer_err_detected);
1977 
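/*
 * Backend for a variant driver's pci_driver.sriov_configure callback, called
 * with the device lock held.  A thin wrapper is typically sufficient
 * (my_sriov_configure is a hypothetical name, shown only as a sketch):
 *
 *	static int my_sriov_configure(struct pci_dev *pdev, int nr_virtfn)
 *	{
 *		struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);
 *
 *		return vfio_pci_core_sriov_configure(vdev, nr_virtfn);
 *	}
 */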
1978 int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev,
1979 				  int nr_virtfn)
1980 {
1981 	struct pci_dev *pdev = vdev->pdev;
1982 	int ret = 0;
1983 
1984 	device_lock_assert(&pdev->dev);
1985 
1986 	if (nr_virtfn) {
1987 		mutex_lock(&vfio_pci_sriov_pfs_mutex);
1988 		/*
1989 		 * The thread that adds the vdev to the list is the only thread
1990 		 * that gets to call pci_enable_sriov(), and we only allow it to
1991 		 * be called once without an intervening call to
1992 		 * pci_disable_sriov().
1993 		 */
1994 		if (!list_empty(&vdev->sriov_pfs_item)) {
1995 			ret = -EINVAL;
1996 			goto out_unlock;
1997 		}
1998 		list_add_tail(&vdev->sriov_pfs_item, &vfio_pci_sriov_pfs);
1999 		mutex_unlock(&vfio_pci_sriov_pfs_mutex);
2000 
2001 		/*
2002 		 * The PF must not be in a lower power state (deeper D-state)
2003 		 * than its VFs.  The PF can be in a low power state either
2004 		 * through runtime power management (when there is no user) or
2005 		 * a PCI_PM_CTRL register write by the user.  If the PF is in a
2006 		 * low power state, move it to D0 before enabling SR-IOV.
2007 		 * Also, this function can be called at any time, and a
2008 		 * userspace PCI_PM_CTRL write can race against this code path,
2009 		 * so protect it with 'memory_lock'.
2010 		 */
2011 		ret = pm_runtime_resume_and_get(&pdev->dev);
2012 		if (ret)
2013 			goto out_del;
2014 
2015 		down_write(&vdev->memory_lock);
2016 		vfio_pci_set_power_state(vdev, PCI_D0);
2017 		ret = pci_enable_sriov(pdev, nr_virtfn);
2018 		up_write(&vdev->memory_lock);
2019 		if (ret) {
2020 			pm_runtime_put(&pdev->dev);
2021 			goto out_del;
2022 		}
2023 		return nr_virtfn;
2024 	}
2025 
2026 	if (pci_num_vf(pdev)) {
2027 		pci_disable_sriov(pdev);
2028 		pm_runtime_put(&pdev->dev);
2029 	}
2030 
2031 out_del:
2032 	mutex_lock(&vfio_pci_sriov_pfs_mutex);
2033 	list_del_init(&vdev->sriov_pfs_item);
2034 out_unlock:
2035 	mutex_unlock(&vfio_pci_sriov_pfs_mutex);
2036 	return ret;
2037 }
2038 EXPORT_SYMBOL_GPL(vfio_pci_core_sriov_configure);
2039 
2040 const struct pci_error_handlers vfio_pci_core_err_handlers = {
2041 	.error_detected = vfio_pci_core_aer_err_detected,
2042 };
2043 EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers);
2044 
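/* Return true if vdev is owned by one of the group files provided by the user */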
2045 static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev,
2046 			       struct vfio_pci_group_info *groups)
2047 {
2048 	unsigned int i;
2049 
2050 	for (i = 0; i < groups->count; i++)
2051 		if (vfio_file_has_dev(groups->files[i], &vdev->vdev))
2052 			return true;
2053 	return false;
2054 }
2055 
2056 static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data)
2057 {
2058 	struct vfio_device_set *dev_set = data;
2059 	struct vfio_device *cur;
2060 
2061 	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
2062 		if (cur->dev == &pdev->dev)
2063 			return 0;
2064 	return -EBUSY;
2065 }
2066 
2067 /*
2068  * vfio-core considers a group to be viable and will create a vfio_device even
2069  * if some devices are bound to drivers like pci-stub or pcieport. Here we
2070  * require all PCI devices to be inside our dev_set since that ensures they stay
2071  * put and that every driver controlling the device can co-ordinate with the
2072  * device reset.
2073  *
2074  * Returns the pci_dev to pass to pci_reset_bus() if every PCI device to be
2075  * reset is inside the dev_set, and pci_reset_bus() can succeed. NULL otherwise.
2076  */
2077 static struct pci_dev *
2078 vfio_pci_dev_set_resettable(struct vfio_device_set *dev_set)
2079 {
2080 	struct pci_dev *pdev;
2081 
2082 	lockdep_assert_held(&dev_set->lock);
2083 
2084 	/*
2085 	 * By definition all PCI devices in the dev_set share the same PCI
2086 	 * reset, so any pci_dev will have the same outcomes for
2087 	 * pci_probe_reset_*() and pci_reset_bus().
2088 	 */
2089 	pdev = list_first_entry(&dev_set->device_list,
2090 				struct vfio_pci_core_device,
2091 				vdev.dev_set_list)->pdev;
2092 
2093 	/* Return NULL if neither a slot nor a bus reset is supported */
2094 	if (pci_probe_reset_slot(pdev->slot) && pci_probe_reset_bus(pdev->bus))
2095 		return NULL;
2096 
2097 	if (vfio_pci_for_each_slot_or_bus(pdev, vfio_pci_is_device_in_set,
2098 					  dev_set,
2099 					  !pci_probe_reset_slot(pdev->slot)))
2100 		return NULL;
2101 	return pdev;
2102 }
2103 
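/*
 * Runtime resume every device in the dev_set, unwinding the references
 * already taken if any device fails to resume.
 */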
2104 static int vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set)
2105 {
2106 	struct vfio_pci_core_device *cur;
2107 	int ret;
2108 
2109 	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
2110 		ret = pm_runtime_resume_and_get(&cur->pdev->dev);
2111 		if (ret)
2112 			goto unwind;
2113 	}
2114 
2115 	return 0;
2116 
2117 unwind:
2118 	list_for_each_entry_continue_reverse(cur, &dev_set->device_list,
2119 					     vdev.dev_set_list)
2120 		pm_runtime_put(&cur->pdev->dev);
2121 
2122 	return ret;
2123 }
2124 
2125 /*
2126  * We need to take memory_lock for each device, but devices can share an
2127  * mmap_lock; therefore we first zap and hold the vma_lock for each device,
2128  * and only then take each memory_lock.
2129  */
2130 static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
2131 				      struct vfio_pci_group_info *groups)
2132 {
2133 	struct vfio_pci_core_device *cur_mem;
2134 	struct vfio_pci_core_device *cur_vma;
2135 	struct vfio_pci_core_device *cur;
2136 	struct pci_dev *pdev;
2137 	bool is_mem = true;
2138 	int ret;
2139 
2140 	mutex_lock(&dev_set->lock);
2141 	cur_mem = list_first_entry(&dev_set->device_list,
2142 				   struct vfio_pci_core_device,
2143 				   vdev.dev_set_list);
2144 
2145 	pdev = vfio_pci_dev_set_resettable(dev_set);
2146 	if (!pdev) {
2147 		ret = -EINVAL;
2148 		goto err_unlock;
2149 	}
2150 
2151 	list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) {
2152 		/*
2153 		 * Test whether all the affected devices are contained by the
2154 		 * set of groups provided by the user.
2155 		 */
2156 		if (!vfio_dev_in_groups(cur_vma, groups)) {
2157 			ret = -EINVAL;
2158 			goto err_undo;
2159 		}
2160 
2161 		/*
2162 		 * Locking multiple devices is prone to deadlock, so only
2163 		 * trylock each lock and unwind if we hit contention.
2164 		 */
2165 		if (!vfio_pci_zap_and_vma_lock(cur_vma, true)) {
2166 			ret = -EBUSY;
2167 			goto err_undo;
2168 		}
2169 	}
2170 	cur_vma = NULL;
2171 
2172 	list_for_each_entry(cur_mem, &dev_set->device_list, vdev.dev_set_list) {
2173 		if (!down_write_trylock(&cur_mem->memory_lock)) {
2174 			ret = -EBUSY;
2175 			goto err_undo;
2176 		}
2177 		mutex_unlock(&cur_mem->vma_lock);
2178 	}
2179 	cur_mem = NULL;
2180 
2181 	/*
2182 	 * pci_reset_bus() resets every device on the bus; devices not in D0
2183 	 * would have their power state set to D0 internally by pci_reset_bus()
2184 	 * without vfio driver involvement.  For devices advertising
2185 	 * NoSoftRst-, the reset can clear the PCI config space without
2186 	 * restoring the original state (saved locally in 'vdev->pm_save'),
2187 	 * so move every device to D0 through vfio_pci_set_power_state()
2188 	 * before the reset.
2189 	 */
2190 	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
2191 		vfio_pci_set_power_state(cur, PCI_D0);
2192 
2193 	ret = pci_reset_bus(pdev);
2194 
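	/*
	 * This unwind also runs on the success path: with cur_mem and cur_vma
	 * both NULL, every device holds memory_lock and is released here.  On
	 * failure, devices before cur_mem hold memory_lock, devices from
	 * cur_mem up to (but excluding) cur_vma still hold only vma_lock, and
	 * devices from cur_vma onwards were never locked.
	 */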
2195 err_undo:
2196 	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
2197 		if (cur == cur_mem)
2198 			is_mem = false;
2199 		if (cur == cur_vma)
2200 			break;
2201 		if (is_mem)
2202 			up_write(&cur->memory_lock);
2203 		else
2204 			mutex_unlock(&cur->vma_lock);
2205 	}
2206 err_unlock:
2207 	mutex_unlock(&dev_set->lock);
2208 	return ret;
2209 }
2210 
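/*
 * A dev_set needs a deferred reset only if no device in the set is open and
 * at least one device still has needs_reset set.
 */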
2211 static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set)
2212 {
2213 	struct vfio_pci_core_device *cur;
2214 	bool needs_reset = false;
2215 
2216 	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
2217 		/* No VFIO device in the set can have an open device FD */
2218 		if (cur->vdev.open_count)
2219 			return false;
2220 		needs_reset |= cur->needs_reset;
2221 	}
2222 	return needs_reset;
2223 }
2224 
2225 /*
2226  * If a bus or slot reset is available for the provided dev_set and:
2227  *  - All of the devices affected by that bus or slot reset are unused
2228  *  - At least one of the affected devices is marked dirty via
2229  *    needs_reset (such as by lack of FLR support)
2230  * Then attempt to perform that bus or slot reset.
2231  */
2232 static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set)
2233 {
2234 	struct vfio_pci_core_device *cur;
2235 	struct pci_dev *pdev;
2236 	bool reset_done = false;
2237 
2238 	if (!vfio_pci_dev_set_needs_reset(dev_set))
2239 		return;
2240 
2241 	pdev = vfio_pci_dev_set_resettable(dev_set);
2242 	if (!pdev)
2243 		return;
2244 
2245 	/*
2246 	 * Some of the devices on the bus may be runtime suspended.  Take a
2247 	 * runtime PM reference on every device in the dev_set before the
2248 	 * reset and drop it again afterwards.
2249 	 */
2250 	if (!disable_idle_d3 && vfio_pci_dev_set_pm_runtime_get(dev_set))
2251 		return;
2252 
2253 	if (!pci_reset_bus(pdev))
2254 		reset_done = true;
2255 
2256 	list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
2257 		if (reset_done)
2258 			cur->needs_reset = false;
2259 
2260 		if (!disable_idle_d3)
2261 			pm_runtime_put(&cur->pdev->dev);
2262 	}
2263 }
2264 
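/*
 * Let the front-end module that owns the user-visible module parameters
 * (such as vfio-pci) forward their values into this shared core module.  A
 * caller would typically pass its own parameters straight through, e.g.
 * (parameter names here are illustrative):
 *
 *	vfio_pci_core_set_params(nointxmask, disable_vga, disable_idle_d3);
 */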
2265 void vfio_pci_core_set_params(bool is_nointxmask, bool is_disable_vga,
2266 			      bool is_disable_idle_d3)
2267 {
2268 	nointxmask = is_nointxmask;
2269 	disable_vga = is_disable_vga;
2270 	disable_idle_d3 = is_disable_idle_d3;
2271 }
2272 EXPORT_SYMBOL_GPL(vfio_pci_core_set_params);
2273 
2274 static void vfio_pci_core_cleanup(void)
2275 {
2276 	vfio_pci_uninit_perm_bits();
2277 }
2278 
2279 static int __init vfio_pci_core_init(void)
2280 {
2281 	/* Allocate shared config space permission data used by all devices */
2282 	return vfio_pci_init_perm_bits();
2283 }
2284 
2285 module_init(vfio_pci_core_init);
2286 module_exit(vfio_pci_core_cleanup);
2287 
2288 MODULE_LICENSE("GPL v2");
2289 MODULE_AUTHOR(DRIVER_AUTHOR);
2290 MODULE_DESCRIPTION(DRIVER_DESC);
2291