xref: /freebsd/sys/amd64/vmm/io/ppt.c (revision f5147e312f43a9050468de539aeafa072caa1a60)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  * $FreeBSD$
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
37 #include <sys/malloc.h>
38 #include <sys/module.h>
39 #include <sys/bus.h>
40 #include <sys/pciio.h>
41 #include <sys/rman.h>
42 #include <sys/smp.h>
43 #include <sys/sysctl.h>
44 
45 #include <dev/pci/pcivar.h>
46 #include <dev/pci/pcireg.h>
47 
48 #include <machine/resource.h>
49 
50 #include <machine/vmm.h>
51 #include <machine/vmm_dev.h>
52 
53 #include "vmm_lapic.h"
54 #include "vmm_ktr.h"
55 
56 #include "iommu.h"
57 #include "ppt.h"
58 
/*
 * XXX locking
 *
 * None of the code visible in this file takes a lock around pptdev_list,
 * num_pptdevs or the per-device state.
 */

/*
 * Upper bound on the number of MSI messages handled per device: it sizes the
 * res/cookie/arg arrays in pptdev.msi and caps 'numvec' in ppt_setup_msi().
 */
#define	MAX_MSIMSGS	32

/*
 * If the MSI-X table is located in the middle of a BAR then that MMIO
 * region gets split into two segments - one segment above the MSI-X table
 * and the other segment below the MSI-X table - with a hole in place of
 * the MSI-X table so accesses to it can be trapped and emulated.
 *
 * So, allocate a MMIO segment for each BAR register + 1 additional segment.
 */
#define	MAX_MMIOSEGS	((PCIR_MAX_BAR_0 + 1) + 1)

/*
 * malloc(9) type used for the per-vector MSI-X arrays (res, cookie, arg)
 * allocated in ppt_setup_msix() and freed in ppt_teardown_msix().
 */
MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources");
74 
/*
 * Argument registered with bus_setup_intr() for one interrupt source and
 * handed back to pptintr() when that source fires.
 */
struct pptintr_arg {				/* pptintr(pptintr_arg) */
	struct pptdev	*pptdev;	/* device the interrupt belongs to */
	uint64_t	addr;		/* 'addr' forwarded to lapic_intr_msi() */
	uint64_t	msg_data;	/* 'msg' forwarded to lapic_intr_msi() */
};
80 
/*
 * One MMIO range mapped into the owning VM by ppt_map_mmio().
 * A slot with len == 0 is unused.
 */
struct pptseg {
	vm_paddr_t	gpa;	/* start address, as passed to vm_map_mmio() */
	size_t		len;	/* length of the range; 0 marks a free slot */
	int		wired;	/* not referenced by the code visible in this file */
};
86 
/*
 * Softc of a passthru device (see sizeof(struct pptdev) in DEFINE_CLASS_0).
 * One instance per attached device, linked on pptdev_list.
 */
struct pptdev {
	device_t	dev;			/* set in ppt_attach() */
	struct vm	*vm;			/* owner of this device */
	TAILQ_ENTRY(pptdev)	next;		/* linkage on pptdev_list */
	struct pptseg mmio[MAX_MMIOSEGS];	/* ranges mapped by ppt_map_mmio() */
	struct {
		int	num_msgs;		/* guest state */

		/* 0 = legacy interrupt, 1 = MSI; chosen in ppt_setup_msi() */
		int	startrid;		/* host state */
		struct resource *res[MAX_MSIMSGS];	/* SYS_RES_IRQ resources */
		void	*cookie[MAX_MSIMSGS];	/* handler cookies from bus_setup_intr() */
		struct pptintr_arg arg[MAX_MSIMSGS];	/* per-message pptintr() argument */
	} msi;

	struct {
		int num_msgs;			/* entries in res/cookie/arg; 0 = not configured */
		int startrid;			/* set to 1 by ppt_setup_msix() */
		int msix_table_rid;		/* rid of the BAR holding the MSI-X table */
		struct resource *msix_table_res;	/* that BAR's SYS_RES_MEMORY resource */
		struct resource **res;		/* M_PPTMSIX array of IRQ resources */
		void **cookie;			/* M_PPTMSIX array of handler cookies */
		struct pptintr_arg *arg;	/* M_PPTMSIX array of pptintr() arguments */
	} msix;
};
111 
/* hw.vmm.ppt sysctl node. */
SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, ppt, CTLFLAG_RW, 0, "bhyve passthru devices");

/*
 * Number of attached passthru devices: incremented in ppt_attach(),
 * decremented in ppt_detach(), exported read-only as hw.vmm.ppt.devices.
 */
static int num_pptdevs;
SYSCTL_INT(_hw_vmm_ppt, OID_AUTO, devices, CTLFLAG_RD, &num_pptdevs, 0,
    "number of pci passthru devices");

/* All attached passthru devices, linked through pptdev.next. */
static TAILQ_HEAD(, pptdev) pptdev_list = TAILQ_HEAD_INITIALIZER(pptdev_list);
120 
121 static int
122 ppt_probe(device_t dev)
123 {
124 	int bus, slot, func;
125 	struct pci_devinfo *dinfo;
126 
127 	dinfo = (struct pci_devinfo *)device_get_ivars(dev);
128 
129 	bus = pci_get_bus(dev);
130 	slot = pci_get_slot(dev);
131 	func = pci_get_function(dev);
132 
133 	/*
134 	 * To qualify as a pci passthrough device a device must:
135 	 * - be allowed by administrator to be used in this role
136 	 * - be an endpoint device
137 	 */
138 	if ((dinfo->cfg.hdrtype & PCIM_HDRTYPE) != PCIM_HDRTYPE_NORMAL)
139 		return (ENXIO);
140 	else if (vmm_is_pptdev(bus, slot, func))
141 		return (0);
142 	else
143 		/*
144 		 * Returning BUS_PROBE_NOWILDCARD here matches devices that the
145 		 * SR-IOV infrastructure specified as "ppt" passthrough devices.
146 		 * All normal devices that did not have "ppt" specified as their
147 		 * driver will not be matched by this.
148 		 */
149 		return (BUS_PROBE_NOWILDCARD);
150 }
151 
152 static int
153 ppt_attach(device_t dev)
154 {
155 	struct pptdev *ppt;
156 
157 	ppt = device_get_softc(dev);
158 
159 	iommu_remove_device(iommu_host_domain(), pci_get_rid(dev));
160 	num_pptdevs++;
161 	TAILQ_INSERT_TAIL(&pptdev_list, ppt, next);
162 	ppt->dev = dev;
163 
164 	if (bootverbose)
165 		device_printf(dev, "attached\n");
166 
167 	return (0);
168 }
169 
170 static int
171 ppt_detach(device_t dev)
172 {
173 	struct pptdev *ppt;
174 
175 	ppt = device_get_softc(dev);
176 
177 	if (ppt->vm != NULL)
178 		return (EBUSY);
179 	num_pptdevs--;
180 	TAILQ_REMOVE(&pptdev_list, ppt, next);
181 	pci_disable_busmaster(dev);
182 	iommu_add_device(iommu_host_domain(), pci_get_rid(dev));
183 
184 	return (0);
185 }
186 
/* newbus method table: only probe, attach and detach are provided. */
static device_method_t ppt_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		ppt_probe),
	DEVMETHOD(device_attach,	ppt_attach),
	DEVMETHOD(device_detach,	ppt_detach),
	{0, 0}
};

/*
 * Register the "ppt" driver on the pci bus; the softc of every instance is
 * a struct pptdev.
 */
static devclass_t ppt_devclass;
DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, sizeof(struct pptdev));
DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL);
198 
199 static struct pptdev *
200 ppt_find(int bus, int slot, int func)
201 {
202 	device_t dev;
203 	struct pptdev *ppt;
204 	int b, s, f;
205 
206 	TAILQ_FOREACH(ppt, &pptdev_list, next) {
207 		dev = ppt->dev;
208 		b = pci_get_bus(dev);
209 		s = pci_get_slot(dev);
210 		f = pci_get_function(dev);
211 		if (bus == b && slot == s && func == f)
212 			return (ppt);
213 	}
214 	return (NULL);
215 }
216 
217 static void
218 ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt)
219 {
220 	int i;
221 	struct pptseg *seg;
222 
223 	for (i = 0; i < MAX_MMIOSEGS; i++) {
224 		seg = &ppt->mmio[i];
225 		if (seg->len == 0)
226 			continue;
227 		(void)vm_unmap_mmio(vm, seg->gpa, seg->len);
228 		bzero(seg, sizeof(struct pptseg));
229 	}
230 }
231 
232 static void
233 ppt_teardown_msi(struct pptdev *ppt)
234 {
235 	int i, rid;
236 	void *cookie;
237 	struct resource *res;
238 
239 	if (ppt->msi.num_msgs == 0)
240 		return;
241 
242 	for (i = 0; i < ppt->msi.num_msgs; i++) {
243 		rid = ppt->msi.startrid + i;
244 		res = ppt->msi.res[i];
245 		cookie = ppt->msi.cookie[i];
246 
247 		if (cookie != NULL)
248 			bus_teardown_intr(ppt->dev, res, cookie);
249 
250 		if (res != NULL)
251 			bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
252 
253 		ppt->msi.res[i] = NULL;
254 		ppt->msi.cookie[i] = NULL;
255 	}
256 
257 	if (ppt->msi.startrid == 1)
258 		pci_release_msi(ppt->dev);
259 
260 	ppt->msi.num_msgs = 0;
261 }
262 
263 static void
264 ppt_teardown_msix_intr(struct pptdev *ppt, int idx)
265 {
266 	int rid;
267 	struct resource *res;
268 	void *cookie;
269 
270 	rid = ppt->msix.startrid + idx;
271 	res = ppt->msix.res[idx];
272 	cookie = ppt->msix.cookie[idx];
273 
274 	if (cookie != NULL)
275 		bus_teardown_intr(ppt->dev, res, cookie);
276 
277 	if (res != NULL)
278 		bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
279 
280 	ppt->msix.res[idx] = NULL;
281 	ppt->msix.cookie[idx] = NULL;
282 }
283 
284 static void
285 ppt_teardown_msix(struct pptdev *ppt)
286 {
287 	int i;
288 
289 	if (ppt->msix.num_msgs == 0)
290 		return;
291 
292 	for (i = 0; i < ppt->msix.num_msgs; i++)
293 		ppt_teardown_msix_intr(ppt, i);
294 
295 	if (ppt->msix.msix_table_res) {
296 		bus_release_resource(ppt->dev, SYS_RES_MEMORY,
297 				     ppt->msix.msix_table_rid,
298 				     ppt->msix.msix_table_res);
299 		ppt->msix.msix_table_res = NULL;
300 		ppt->msix.msix_table_rid = 0;
301 	}
302 
303 	free(ppt->msix.res, M_PPTMSIX);
304 	free(ppt->msix.cookie, M_PPTMSIX);
305 	free(ppt->msix.arg, M_PPTMSIX);
306 
307 	pci_release_msi(ppt->dev);
308 
309 	ppt->msix.num_msgs = 0;
310 }
311 
312 int
313 ppt_avail_devices(void)
314 {
315 
316 	return (num_pptdevs);
317 }
318 
319 int
320 ppt_assigned_devices(struct vm *vm)
321 {
322 	struct pptdev *ppt;
323 	int num;
324 
325 	num = 0;
326 	TAILQ_FOREACH(ppt, &pptdev_list, next) {
327 		if (ppt->vm == vm)
328 			num++;
329 	}
330 	return (num);
331 }
332 
333 boolean_t
334 ppt_is_mmio(struct vm *vm, vm_paddr_t gpa)
335 {
336 	int i;
337 	struct pptdev *ppt;
338 	struct pptseg *seg;
339 
340 	TAILQ_FOREACH(ppt, &pptdev_list, next) {
341 		if (ppt->vm != vm)
342 			continue;
343 
344 		for (i = 0; i < MAX_MMIOSEGS; i++) {
345 			seg = &ppt->mmio[i];
346 			if (seg->len == 0)
347 				continue;
348 			if (gpa >= seg->gpa && gpa < seg->gpa + seg->len)
349 				return (TRUE);
350 		}
351 	}
352 
353 	return (FALSE);
354 }
355 
356 int
357 ppt_assign_device(struct vm *vm, int bus, int slot, int func)
358 {
359 	struct pptdev *ppt;
360 
361 	ppt = ppt_find(bus, slot, func);
362 	if (ppt != NULL) {
363 		/*
364 		 * If this device is owned by a different VM then we
365 		 * cannot change its owner.
366 		 */
367 		if (ppt->vm != NULL && ppt->vm != vm)
368 			return (EBUSY);
369 
370 		pci_save_state(ppt->dev);
371 		pcie_flr(ppt->dev,
372 		    max(pcie_get_max_completion_timeout(ppt->dev) / 1000, 10),
373 		    true);
374 		pci_restore_state(ppt->dev);
375 		ppt->vm = vm;
376 		iommu_add_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev));
377 		return (0);
378 	}
379 	return (ENOENT);
380 }
381 
382 int
383 ppt_unassign_device(struct vm *vm, int bus, int slot, int func)
384 {
385 	struct pptdev *ppt;
386 
387 	ppt = ppt_find(bus, slot, func);
388 	if (ppt != NULL) {
389 		/*
390 		 * If this device is not owned by this 'vm' then bail out.
391 		 */
392 		if (ppt->vm != vm)
393 			return (EBUSY);
394 
395 		pci_save_state(ppt->dev);
396 		pcie_flr(ppt->dev,
397 		    max(pcie_get_max_completion_timeout(ppt->dev) / 1000, 10),
398 		    true);
399 		pci_restore_state(ppt->dev);
400 		ppt_unmap_mmio(vm, ppt);
401 		ppt_teardown_msi(ppt);
402 		ppt_teardown_msix(ppt);
403 		iommu_remove_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev));
404 		ppt->vm = NULL;
405 		return (0);
406 	}
407 	return (ENOENT);
408 }
409 
410 int
411 ppt_unassign_all(struct vm *vm)
412 {
413 	struct pptdev *ppt;
414 	int bus, slot, func;
415 	device_t dev;
416 
417 	TAILQ_FOREACH(ppt, &pptdev_list, next) {
418 		if (ppt->vm == vm) {
419 			dev = ppt->dev;
420 			bus = pci_get_bus(dev);
421 			slot = pci_get_slot(dev);
422 			func = pci_get_function(dev);
423 			vm_unassign_pptdev(vm, bus, slot, func);
424 		}
425 	}
426 
427 	return (0);
428 }
429 
430 int
431 ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
432 	     vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
433 {
434 	int i, error;
435 	struct pptseg *seg;
436 	struct pptdev *ppt;
437 
438 	ppt = ppt_find(bus, slot, func);
439 	if (ppt != NULL) {
440 		if (ppt->vm != vm)
441 			return (EBUSY);
442 
443 		for (i = 0; i < MAX_MMIOSEGS; i++) {
444 			seg = &ppt->mmio[i];
445 			if (seg->len == 0) {
446 				error = vm_map_mmio(vm, gpa, len, hpa);
447 				if (error == 0) {
448 					seg->gpa = gpa;
449 					seg->len = len;
450 				}
451 				return (error);
452 			}
453 		}
454 		return (ENOSPC);
455 	}
456 	return (ENOENT);
457 }
458 
459 static int
460 pptintr(void *arg)
461 {
462 	struct pptdev *ppt;
463 	struct pptintr_arg *pptarg;
464 
465 	pptarg = arg;
466 	ppt = pptarg->pptdev;
467 
468 	if (ppt->vm != NULL)
469 		lapic_intr_msi(ppt->vm, pptarg->addr, pptarg->msg_data);
470 	else {
471 		/*
472 		 * XXX
473 		 * This is not expected to happen - panic?
474 		 */
475 	}
476 
477 	/*
478 	 * For legacy interrupts give other filters a chance in case
479 	 * the interrupt was not generated by the passthrough device.
480 	 */
481 	if (ppt->msi.startrid == 0)
482 		return (FILTER_STRAY);
483 	else
484 		return (FILTER_HANDLED);
485 }
486 
487 int
488 ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
489 	      uint64_t addr, uint64_t msg, int numvec)
490 {
491 	int i, rid, flags;
492 	int msi_count, startrid, error, tmp;
493 	struct pptdev *ppt;
494 
495 	if (numvec < 0 || numvec > MAX_MSIMSGS)
496 		return (EINVAL);
497 
498 	ppt = ppt_find(bus, slot, func);
499 	if (ppt == NULL)
500 		return (ENOENT);
501 	if (ppt->vm != vm)		/* Make sure we own this device */
502 		return (EBUSY);
503 
504 	/* Free any allocated resources */
505 	ppt_teardown_msi(ppt);
506 
507 	if (numvec == 0)		/* nothing more to do */
508 		return (0);
509 
510 	flags = RF_ACTIVE;
511 	msi_count = pci_msi_count(ppt->dev);
512 	if (msi_count == 0) {
513 		startrid = 0;		/* legacy interrupt */
514 		msi_count = 1;
515 		flags |= RF_SHAREABLE;
516 	} else
517 		startrid = 1;		/* MSI */
518 
519 	/*
520 	 * The device must be capable of supporting the number of vectors
521 	 * the guest wants to allocate.
522 	 */
523 	if (numvec > msi_count)
524 		return (EINVAL);
525 
526 	/*
527 	 * Make sure that we can allocate all the MSI vectors that are needed
528 	 * by the guest.
529 	 */
530 	if (startrid == 1) {
531 		tmp = numvec;
532 		error = pci_alloc_msi(ppt->dev, &tmp);
533 		if (error)
534 			return (error);
535 		else if (tmp != numvec) {
536 			pci_release_msi(ppt->dev);
537 			return (ENOSPC);
538 		} else {
539 			/* success */
540 		}
541 	}
542 
543 	ppt->msi.startrid = startrid;
544 
545 	/*
546 	 * Allocate the irq resource and attach it to the interrupt handler.
547 	 */
548 	for (i = 0; i < numvec; i++) {
549 		ppt->msi.num_msgs = i + 1;
550 		ppt->msi.cookie[i] = NULL;
551 
552 		rid = startrid + i;
553 		ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
554 							 &rid, flags);
555 		if (ppt->msi.res[i] == NULL)
556 			break;
557 
558 		ppt->msi.arg[i].pptdev = ppt;
559 		ppt->msi.arg[i].addr = addr;
560 		ppt->msi.arg[i].msg_data = msg + i;
561 
562 		error = bus_setup_intr(ppt->dev, ppt->msi.res[i],
563 				       INTR_TYPE_NET | INTR_MPSAFE,
564 				       pptintr, NULL, &ppt->msi.arg[i],
565 				       &ppt->msi.cookie[i]);
566 		if (error != 0)
567 			break;
568 	}
569 
570 	if (i < numvec) {
571 		ppt_teardown_msi(ppt);
572 		return (ENXIO);
573 	}
574 
575 	return (0);
576 }
577 
578 int
579 ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
580 	       int idx, uint64_t addr, uint64_t msg, uint32_t vector_control)
581 {
582 	struct pptdev *ppt;
583 	struct pci_devinfo *dinfo;
584 	int numvec, alloced, rid, error;
585 	size_t res_size, cookie_size, arg_size;
586 
587 	ppt = ppt_find(bus, slot, func);
588 	if (ppt == NULL)
589 		return (ENOENT);
590 	if (ppt->vm != vm)		/* Make sure we own this device */
591 		return (EBUSY);
592 
593 	dinfo = device_get_ivars(ppt->dev);
594 	if (!dinfo)
595 		return (ENXIO);
596 
597 	/*
598 	 * First-time configuration:
599 	 * 	Allocate the MSI-X table
600 	 *	Allocate the IRQ resources
601 	 *	Set up some variables in ppt->msix
602 	 */
603 	if (ppt->msix.num_msgs == 0) {
604 		numvec = pci_msix_count(ppt->dev);
605 		if (numvec <= 0)
606 			return (EINVAL);
607 
608 		ppt->msix.startrid = 1;
609 		ppt->msix.num_msgs = numvec;
610 
611 		res_size = numvec * sizeof(ppt->msix.res[0]);
612 		cookie_size = numvec * sizeof(ppt->msix.cookie[0]);
613 		arg_size = numvec * sizeof(ppt->msix.arg[0]);
614 
615 		ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK | M_ZERO);
616 		ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX,
617 					  M_WAITOK | M_ZERO);
618 		ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK | M_ZERO);
619 
620 		rid = dinfo->cfg.msix.msix_table_bar;
621 		ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev,
622 					       SYS_RES_MEMORY, &rid, RF_ACTIVE);
623 
624 		if (ppt->msix.msix_table_res == NULL) {
625 			ppt_teardown_msix(ppt);
626 			return (ENOSPC);
627 		}
628 		ppt->msix.msix_table_rid = rid;
629 
630 		alloced = numvec;
631 		error = pci_alloc_msix(ppt->dev, &alloced);
632 		if (error || alloced != numvec) {
633 			ppt_teardown_msix(ppt);
634 			return (error == 0 ? ENOSPC: error);
635 		}
636 	}
637 
638 	if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
639 		/* Tear down the IRQ if it's already set up */
640 		ppt_teardown_msix_intr(ppt, idx);
641 
642 		/* Allocate the IRQ resource */
643 		ppt->msix.cookie[idx] = NULL;
644 		rid = ppt->msix.startrid + idx;
645 		ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
646 							    &rid, RF_ACTIVE);
647 		if (ppt->msix.res[idx] == NULL)
648 			return (ENXIO);
649 
650 		ppt->msix.arg[idx].pptdev = ppt;
651 		ppt->msix.arg[idx].addr = addr;
652 		ppt->msix.arg[idx].msg_data = msg;
653 
654 		/* Setup the MSI-X interrupt */
655 		error = bus_setup_intr(ppt->dev, ppt->msix.res[idx],
656 				       INTR_TYPE_NET | INTR_MPSAFE,
657 				       pptintr, NULL, &ppt->msix.arg[idx],
658 				       &ppt->msix.cookie[idx]);
659 
660 		if (error != 0) {
661 			bus_teardown_intr(ppt->dev, ppt->msix.res[idx], ppt->msix.cookie[idx]);
662 			bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, ppt->msix.res[idx]);
663 			ppt->msix.cookie[idx] = NULL;
664 			ppt->msix.res[idx] = NULL;
665 			return (ENXIO);
666 		}
667 	} else {
668 		/* Masked, tear it down if it's already been set up */
669 		ppt_teardown_msix_intr(ppt, idx);
670 	}
671 
672 	return (0);
673 }
674