xref: /freebsd/sys/amd64/vmm/io/ppt.c (revision cddbc3b40812213ff00041f79174cac0be360a2a)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
37 #include <sys/malloc.h>
38 #include <sys/module.h>
39 #include <sys/bus.h>
40 #include <sys/pciio.h>
41 #include <sys/rman.h>
42 #include <sys/smp.h>
43 #include <sys/sysctl.h>
44 
45 #include <dev/pci/pcivar.h>
46 #include <dev/pci/pcireg.h>
47 
48 #include <machine/resource.h>
49 
50 #include <machine/vmm.h>
51 #include <machine/vmm_dev.h>
52 
53 #include "vmm_lapic.h"
54 #include "vmm_ktr.h"
55 
56 #include "iommu.h"
57 #include "ppt.h"
58 
59 /* XXX locking */
60 
61 #define	MAX_MSIMSGS	32
62 
63 /*
64  * If the MSI-X table is located in the middle of a BAR then that MMIO
65  * region gets split into two segments - one segment above the MSI-X table
66  * and the other segment below the MSI-X table - with a hole in place of
67  * the MSI-X table so accesses to it can be trapped and emulated.
68  *
69  * So, allocate a MMIO segment for each BAR register + 1 additional segment.
70  */
71 #define	MAX_MMIOSEGS	((PCIR_MAX_BAR_0 + 1) + 1)
72 
73 MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources");
74 
/*
 * Per-vector argument handed to pptintr(): the owning passthru device and
 * the MSI address/data pair that pptintr() forwards to lapic_intr_msi().
 */
struct pptintr_arg {
	struct pptdev	*pptdev;	/* device this vector belongs to */
	uint64_t	addr;		/* MSI address given to lapic_intr_msi() */
	uint64_t	msg_data;	/* MSI data given to lapic_intr_msi() */
};
80 
81 struct pptseg {
82 	vm_paddr_t	gpa;
83 	size_t		len;
84 	int		wired;
85 };
86 
87 struct pptdev {
88 	device_t	dev;
89 	struct vm	*vm;			/* owner of this device */
90 	TAILQ_ENTRY(pptdev)	next;
91 	struct pptseg mmio[MAX_MMIOSEGS];
92 	struct {
93 		int	num_msgs;		/* guest state */
94 
95 		int	startrid;		/* host state */
96 		struct resource *res[MAX_MSIMSGS];
97 		void	*cookie[MAX_MSIMSGS];
98 		struct pptintr_arg arg[MAX_MSIMSGS];
99 	} msi;
100 
101 	struct {
102 		int num_msgs;
103 		int startrid;
104 		int msix_table_rid;
105 		struct resource *msix_table_res;
106 		struct resource **res;
107 		void **cookie;
108 		struct pptintr_arg *arg;
109 	} msix;
110 };
111 
112 SYSCTL_DECL(_hw_vmm);
113 SYSCTL_NODE(_hw_vmm, OID_AUTO, ppt, CTLFLAG_RW, 0, "bhyve passthru devices");
114 
115 static int num_pptdevs;
116 SYSCTL_INT(_hw_vmm_ppt, OID_AUTO, devices, CTLFLAG_RD, &num_pptdevs, 0,
117     "number of pci passthru devices");
118 
119 static TAILQ_HEAD(, pptdev) pptdev_list = TAILQ_HEAD_INITIALIZER(pptdev_list);
120 
121 static int
122 ppt_probe(device_t dev)
123 {
124 	int bus, slot, func;
125 	struct pci_devinfo *dinfo;
126 
127 	dinfo = (struct pci_devinfo *)device_get_ivars(dev);
128 
129 	bus = pci_get_bus(dev);
130 	slot = pci_get_slot(dev);
131 	func = pci_get_function(dev);
132 
133 	/*
134 	 * To qualify as a pci passthrough device a device must:
135 	 * - be allowed by administrator to be used in this role
136 	 * - be an endpoint device
137 	 */
138 	if ((dinfo->cfg.hdrtype & PCIM_HDRTYPE) != PCIM_HDRTYPE_NORMAL)
139 		return (ENXIO);
140 	else if (vmm_is_pptdev(bus, slot, func))
141 		return (0);
142 	else
143 		/*
144 		 * Returning BUS_PROBE_NOWILDCARD here matches devices that the
145 		 * SR-IOV infrastructure specified as "ppt" passthrough devices.
146 		 * All normal devices that did not have "ppt" specified as their
147 		 * driver will not be matched by this.
148 		 */
149 		return (BUS_PROBE_NOWILDCARD);
150 }
151 
152 static int
153 ppt_attach(device_t dev)
154 {
155 	struct pptdev *ppt;
156 
157 	ppt = device_get_softc(dev);
158 
159 	iommu_remove_device(iommu_host_domain(), pci_get_rid(dev));
160 	num_pptdevs++;
161 	TAILQ_INSERT_TAIL(&pptdev_list, ppt, next);
162 	ppt->dev = dev;
163 
164 	if (bootverbose)
165 		device_printf(dev, "attached\n");
166 
167 	return (0);
168 }
169 
170 static int
171 ppt_detach(device_t dev)
172 {
173 	struct pptdev *ppt;
174 
175 	ppt = device_get_softc(dev);
176 
177 	if (ppt->vm != NULL)
178 		return (EBUSY);
179 	num_pptdevs--;
180 	TAILQ_REMOVE(&pptdev_list, ppt, next);
181 	pci_disable_busmaster(dev);
182 	iommu_add_device(iommu_host_domain(), pci_get_rid(dev));
183 
184 	return (0);
185 }
186 
187 static device_method_t ppt_methods[] = {
188 	/* Device interface */
189 	DEVMETHOD(device_probe,		ppt_probe),
190 	DEVMETHOD(device_attach,	ppt_attach),
191 	DEVMETHOD(device_detach,	ppt_detach),
192 	{0, 0}
193 };
194 
195 static devclass_t ppt_devclass;
196 DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, sizeof(struct pptdev));
197 DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL);
198 
199 static struct pptdev *
200 ppt_find(int bus, int slot, int func)
201 {
202 	device_t dev;
203 	struct pptdev *ppt;
204 	int b, s, f;
205 
206 	TAILQ_FOREACH(ppt, &pptdev_list, next) {
207 		dev = ppt->dev;
208 		b = pci_get_bus(dev);
209 		s = pci_get_slot(dev);
210 		f = pci_get_function(dev);
211 		if (bus == b && slot == s && func == f)
212 			return (ppt);
213 	}
214 	return (NULL);
215 }
216 
217 static void
218 ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt)
219 {
220 	int i;
221 	struct pptseg *seg;
222 
223 	for (i = 0; i < MAX_MMIOSEGS; i++) {
224 		seg = &ppt->mmio[i];
225 		if (seg->len == 0)
226 			continue;
227 		(void)vm_unmap_mmio(vm, seg->gpa, seg->len);
228 		bzero(seg, sizeof(struct pptseg));
229 	}
230 }
231 
232 static void
233 ppt_teardown_msi(struct pptdev *ppt)
234 {
235 	int i, rid;
236 	void *cookie;
237 	struct resource *res;
238 
239 	if (ppt->msi.num_msgs == 0)
240 		return;
241 
242 	for (i = 0; i < ppt->msi.num_msgs; i++) {
243 		rid = ppt->msi.startrid + i;
244 		res = ppt->msi.res[i];
245 		cookie = ppt->msi.cookie[i];
246 
247 		if (cookie != NULL)
248 			bus_teardown_intr(ppt->dev, res, cookie);
249 
250 		if (res != NULL)
251 			bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
252 
253 		ppt->msi.res[i] = NULL;
254 		ppt->msi.cookie[i] = NULL;
255 	}
256 
257 	if (ppt->msi.startrid == 1)
258 		pci_release_msi(ppt->dev);
259 
260 	ppt->msi.num_msgs = 0;
261 }
262 
263 static void
264 ppt_teardown_msix_intr(struct pptdev *ppt, int idx)
265 {
266 	int rid;
267 	struct resource *res;
268 	void *cookie;
269 
270 	rid = ppt->msix.startrid + idx;
271 	res = ppt->msix.res[idx];
272 	cookie = ppt->msix.cookie[idx];
273 
274 	if (cookie != NULL)
275 		bus_teardown_intr(ppt->dev, res, cookie);
276 
277 	if (res != NULL)
278 		bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
279 
280 	ppt->msix.res[idx] = NULL;
281 	ppt->msix.cookie[idx] = NULL;
282 }
283 
284 static void
285 ppt_teardown_msix(struct pptdev *ppt)
286 {
287 	int i;
288 
289 	if (ppt->msix.num_msgs == 0)
290 		return;
291 
292 	for (i = 0; i < ppt->msix.num_msgs; i++)
293 		ppt_teardown_msix_intr(ppt, i);
294 
295 	if (ppt->msix.msix_table_res) {
296 		bus_release_resource(ppt->dev, SYS_RES_MEMORY,
297 				     ppt->msix.msix_table_rid,
298 				     ppt->msix.msix_table_res);
299 		ppt->msix.msix_table_res = NULL;
300 		ppt->msix.msix_table_rid = 0;
301 	}
302 
303 	free(ppt->msix.res, M_PPTMSIX);
304 	free(ppt->msix.cookie, M_PPTMSIX);
305 	free(ppt->msix.arg, M_PPTMSIX);
306 
307 	pci_release_msi(ppt->dev);
308 
309 	ppt->msix.num_msgs = 0;
310 }
311 
312 int
313 ppt_avail_devices(void)
314 {
315 
316 	return (num_pptdevs);
317 }
318 
319 int
320 ppt_assigned_devices(struct vm *vm)
321 {
322 	struct pptdev *ppt;
323 	int num;
324 
325 	num = 0;
326 	TAILQ_FOREACH(ppt, &pptdev_list, next) {
327 		if (ppt->vm == vm)
328 			num++;
329 	}
330 	return (num);
331 }
332 
333 boolean_t
334 ppt_is_mmio(struct vm *vm, vm_paddr_t gpa)
335 {
336 	int i;
337 	struct pptdev *ppt;
338 	struct pptseg *seg;
339 
340 	TAILQ_FOREACH(ppt, &pptdev_list, next) {
341 		if (ppt->vm != vm)
342 			continue;
343 
344 		for (i = 0; i < MAX_MMIOSEGS; i++) {
345 			seg = &ppt->mmio[i];
346 			if (seg->len == 0)
347 				continue;
348 			if (gpa >= seg->gpa && gpa < seg->gpa + seg->len)
349 				return (TRUE);
350 		}
351 	}
352 
353 	return (FALSE);
354 }
355 
356 static void
357 ppt_pci_reset(device_t dev)
358 {
359 
360 	if (pcie_flr(dev,
361 	     max(pcie_get_max_completion_timeout(dev) / 1000, 10), true))
362 		return;
363 
364 	pci_power_reset(dev);
365 }
366 
367 int
368 ppt_assign_device(struct vm *vm, int bus, int slot, int func)
369 {
370 	struct pptdev *ppt;
371 
372 	ppt = ppt_find(bus, slot, func);
373 	if (ppt != NULL) {
374 		/*
375 		 * If this device is owned by a different VM then we
376 		 * cannot change its owner.
377 		 */
378 		if (ppt->vm != NULL && ppt->vm != vm)
379 			return (EBUSY);
380 
381 		pci_save_state(ppt->dev);
382 		ppt_pci_reset(ppt->dev);
383 		pci_restore_state(ppt->dev);
384 		ppt->vm = vm;
385 		iommu_add_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev));
386 		return (0);
387 	}
388 	return (ENOENT);
389 }
390 
391 int
392 ppt_unassign_device(struct vm *vm, int bus, int slot, int func)
393 {
394 	struct pptdev *ppt;
395 
396 	ppt = ppt_find(bus, slot, func);
397 	if (ppt != NULL) {
398 		/*
399 		 * If this device is not owned by this 'vm' then bail out.
400 		 */
401 		if (ppt->vm != vm)
402 			return (EBUSY);
403 
404 		pci_save_state(ppt->dev);
405 		ppt_pci_reset(ppt->dev);
406 		pci_restore_state(ppt->dev);
407 		ppt_unmap_mmio(vm, ppt);
408 		ppt_teardown_msi(ppt);
409 		ppt_teardown_msix(ppt);
410 		iommu_remove_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev));
411 		ppt->vm = NULL;
412 		return (0);
413 	}
414 	return (ENOENT);
415 }
416 
417 int
418 ppt_unassign_all(struct vm *vm)
419 {
420 	struct pptdev *ppt;
421 	int bus, slot, func;
422 	device_t dev;
423 
424 	TAILQ_FOREACH(ppt, &pptdev_list, next) {
425 		if (ppt->vm == vm) {
426 			dev = ppt->dev;
427 			bus = pci_get_bus(dev);
428 			slot = pci_get_slot(dev);
429 			func = pci_get_function(dev);
430 			vm_unassign_pptdev(vm, bus, slot, func);
431 		}
432 	}
433 
434 	return (0);
435 }
436 
437 int
438 ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
439 	     vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
440 {
441 	int i, error;
442 	struct pptseg *seg;
443 	struct pptdev *ppt;
444 
445 	ppt = ppt_find(bus, slot, func);
446 	if (ppt != NULL) {
447 		if (ppt->vm != vm)
448 			return (EBUSY);
449 
450 		for (i = 0; i < MAX_MMIOSEGS; i++) {
451 			seg = &ppt->mmio[i];
452 			if (seg->len == 0) {
453 				error = vm_map_mmio(vm, gpa, len, hpa);
454 				if (error == 0) {
455 					seg->gpa = gpa;
456 					seg->len = len;
457 				}
458 				return (error);
459 			}
460 		}
461 		return (ENOSPC);
462 	}
463 	return (ENOENT);
464 }
465 
466 static int
467 pptintr(void *arg)
468 {
469 	struct pptdev *ppt;
470 	struct pptintr_arg *pptarg;
471 
472 	pptarg = arg;
473 	ppt = pptarg->pptdev;
474 
475 	if (ppt->vm != NULL)
476 		lapic_intr_msi(ppt->vm, pptarg->addr, pptarg->msg_data);
477 	else {
478 		/*
479 		 * XXX
480 		 * This is not expected to happen - panic?
481 		 */
482 	}
483 
484 	/*
485 	 * For legacy interrupts give other filters a chance in case
486 	 * the interrupt was not generated by the passthrough device.
487 	 */
488 	if (ppt->msi.startrid == 0)
489 		return (FILTER_STRAY);
490 	else
491 		return (FILTER_HANDLED);
492 }
493 
494 int
495 ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
496 	      uint64_t addr, uint64_t msg, int numvec)
497 {
498 	int i, rid, flags;
499 	int msi_count, startrid, error, tmp;
500 	struct pptdev *ppt;
501 
502 	if (numvec < 0 || numvec > MAX_MSIMSGS)
503 		return (EINVAL);
504 
505 	ppt = ppt_find(bus, slot, func);
506 	if (ppt == NULL)
507 		return (ENOENT);
508 	if (ppt->vm != vm)		/* Make sure we own this device */
509 		return (EBUSY);
510 
511 	/* Free any allocated resources */
512 	ppt_teardown_msi(ppt);
513 
514 	if (numvec == 0)		/* nothing more to do */
515 		return (0);
516 
517 	flags = RF_ACTIVE;
518 	msi_count = pci_msi_count(ppt->dev);
519 	if (msi_count == 0) {
520 		startrid = 0;		/* legacy interrupt */
521 		msi_count = 1;
522 		flags |= RF_SHAREABLE;
523 	} else
524 		startrid = 1;		/* MSI */
525 
526 	/*
527 	 * The device must be capable of supporting the number of vectors
528 	 * the guest wants to allocate.
529 	 */
530 	if (numvec > msi_count)
531 		return (EINVAL);
532 
533 	/*
534 	 * Make sure that we can allocate all the MSI vectors that are needed
535 	 * by the guest.
536 	 */
537 	if (startrid == 1) {
538 		tmp = numvec;
539 		error = pci_alloc_msi(ppt->dev, &tmp);
540 		if (error)
541 			return (error);
542 		else if (tmp != numvec) {
543 			pci_release_msi(ppt->dev);
544 			return (ENOSPC);
545 		} else {
546 			/* success */
547 		}
548 	}
549 
550 	ppt->msi.startrid = startrid;
551 
552 	/*
553 	 * Allocate the irq resource and attach it to the interrupt handler.
554 	 */
555 	for (i = 0; i < numvec; i++) {
556 		ppt->msi.num_msgs = i + 1;
557 		ppt->msi.cookie[i] = NULL;
558 
559 		rid = startrid + i;
560 		ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
561 							 &rid, flags);
562 		if (ppt->msi.res[i] == NULL)
563 			break;
564 
565 		ppt->msi.arg[i].pptdev = ppt;
566 		ppt->msi.arg[i].addr = addr;
567 		ppt->msi.arg[i].msg_data = msg + i;
568 
569 		error = bus_setup_intr(ppt->dev, ppt->msi.res[i],
570 				       INTR_TYPE_NET | INTR_MPSAFE,
571 				       pptintr, NULL, &ppt->msi.arg[i],
572 				       &ppt->msi.cookie[i]);
573 		if (error != 0)
574 			break;
575 	}
576 
577 	if (i < numvec) {
578 		ppt_teardown_msi(ppt);
579 		return (ENXIO);
580 	}
581 
582 	return (0);
583 }
584 
585 int
586 ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
587 	       int idx, uint64_t addr, uint64_t msg, uint32_t vector_control)
588 {
589 	struct pptdev *ppt;
590 	struct pci_devinfo *dinfo;
591 	int numvec, alloced, rid, error;
592 	size_t res_size, cookie_size, arg_size;
593 
594 	ppt = ppt_find(bus, slot, func);
595 	if (ppt == NULL)
596 		return (ENOENT);
597 	if (ppt->vm != vm)		/* Make sure we own this device */
598 		return (EBUSY);
599 
600 	dinfo = device_get_ivars(ppt->dev);
601 	if (!dinfo)
602 		return (ENXIO);
603 
604 	/*
605 	 * First-time configuration:
606 	 * 	Allocate the MSI-X table
607 	 *	Allocate the IRQ resources
608 	 *	Set up some variables in ppt->msix
609 	 */
610 	if (ppt->msix.num_msgs == 0) {
611 		numvec = pci_msix_count(ppt->dev);
612 		if (numvec <= 0)
613 			return (EINVAL);
614 
615 		ppt->msix.startrid = 1;
616 		ppt->msix.num_msgs = numvec;
617 
618 		res_size = numvec * sizeof(ppt->msix.res[0]);
619 		cookie_size = numvec * sizeof(ppt->msix.cookie[0]);
620 		arg_size = numvec * sizeof(ppt->msix.arg[0]);
621 
622 		ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK | M_ZERO);
623 		ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX,
624 					  M_WAITOK | M_ZERO);
625 		ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK | M_ZERO);
626 
627 		rid = dinfo->cfg.msix.msix_table_bar;
628 		ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev,
629 					       SYS_RES_MEMORY, &rid, RF_ACTIVE);
630 
631 		if (ppt->msix.msix_table_res == NULL) {
632 			ppt_teardown_msix(ppt);
633 			return (ENOSPC);
634 		}
635 		ppt->msix.msix_table_rid = rid;
636 
637 		alloced = numvec;
638 		error = pci_alloc_msix(ppt->dev, &alloced);
639 		if (error || alloced != numvec) {
640 			ppt_teardown_msix(ppt);
641 			return (error == 0 ? ENOSPC: error);
642 		}
643 	}
644 
645 	if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
646 		/* Tear down the IRQ if it's already set up */
647 		ppt_teardown_msix_intr(ppt, idx);
648 
649 		/* Allocate the IRQ resource */
650 		ppt->msix.cookie[idx] = NULL;
651 		rid = ppt->msix.startrid + idx;
652 		ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
653 							    &rid, RF_ACTIVE);
654 		if (ppt->msix.res[idx] == NULL)
655 			return (ENXIO);
656 
657 		ppt->msix.arg[idx].pptdev = ppt;
658 		ppt->msix.arg[idx].addr = addr;
659 		ppt->msix.arg[idx].msg_data = msg;
660 
661 		/* Setup the MSI-X interrupt */
662 		error = bus_setup_intr(ppt->dev, ppt->msix.res[idx],
663 				       INTR_TYPE_NET | INTR_MPSAFE,
664 				       pptintr, NULL, &ppt->msix.arg[idx],
665 				       &ppt->msix.cookie[idx]);
666 
667 		if (error != 0) {
668 			bus_teardown_intr(ppt->dev, ppt->msix.res[idx], ppt->msix.cookie[idx]);
669 			bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, ppt->msix.res[idx]);
670 			ppt->msix.cookie[idx] = NULL;
671 			ppt->msix.res[idx] = NULL;
672 			return (ENXIO);
673 		}
674 	} else {
675 		/* Masked, tear it down if it's already been set up */
676 		ppt_teardown_msix_intr(ppt, idx);
677 	}
678 
679 	return (0);
680 }
681