xref: /freebsd/sys/amd64/vmm/io/ppt.c (revision 718cf2ccb9956613756ab15d7a0e28f2c8e91cab)
1 /*-
2  * Copyright (c) 2011 NetApp, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/kernel.h>
35 #include <sys/malloc.h>
36 #include <sys/module.h>
37 #include <sys/bus.h>
38 #include <sys/pciio.h>
39 #include <sys/rman.h>
40 #include <sys/smp.h>
41 #include <sys/sysctl.h>
42 
43 #include <dev/pci/pcivar.h>
44 #include <dev/pci/pcireg.h>
45 
46 #include <machine/resource.h>
47 
48 #include <machine/vmm.h>
49 #include <machine/vmm_dev.h>
50 
51 #include "vmm_lapic.h"
52 #include "vmm_ktr.h"
53 
54 #include "iommu.h"
55 #include "ppt.h"
56 
/* XXX locking */

/* Maximum number of MSI messages supported per passthrough device. */
#define	MAX_MSIMSGS	32

/*
 * If the MSI-X table is located in the middle of a BAR then that MMIO
 * region gets split into two segments - one segment above the MSI-X table
 * and the other segment below the MSI-X table - with a hole in place of
 * the MSI-X table so accesses to it can be trapped and emulated.
 *
 * So, allocate a MMIO segment for each BAR register + 1 additional segment.
 */
#define	MAX_MMIOSEGS	((PCIR_MAX_BAR_0 + 1) + 1)

/* Allocation tag for the dynamically sized MSI-X arrays in struct pptdev. */
MALLOC_DEFINE(M_PPTMSIX, "pptmsix", "Passthru MSI-X resources");
72 
/* Per-vector argument handed to the pptintr() interrupt filter. */
struct pptintr_arg {				/* pptintr(pptintr_arg) */
	struct pptdev	*pptdev;	/* device this vector belongs to */
	uint64_t	addr;		/* guest-programmed MSI address */
	uint64_t	msg_data;	/* guest-programmed MSI data */
};
78 
/* One guest-physical MMIO segment mapped through to the device. */
struct pptseg {
	vm_paddr_t	gpa;		/* guest physical base address */
	size_t		len;		/* segment length; 0 marks a free slot */
	int		wired;
};
84 
/* Per-device softc for a PCI passthrough device. */
struct pptdev {
	device_t	dev;
	struct vm	*vm;			/* owner of this device */
	TAILQ_ENTRY(pptdev)	next;		/* linkage on pptdev_list */
	struct pptseg mmio[MAX_MMIOSEGS];	/* guest MMIO mappings */
	struct {
		int	num_msgs;		/* guest state */

		int	startrid;		/* host state */
		struct resource *res[MAX_MSIMSGS];
		void	*cookie[MAX_MSIMSGS];
		struct pptintr_arg arg[MAX_MSIMSGS];
	} msi;

	struct {
		int num_msgs;		/* vectors allocated from the host */
		int startrid;		/* first IRQ rid (always 1 for MSI-X) */
		int msix_table_rid;	/* BAR rid holding the MSI-X table */
		struct resource *msix_table_res;
		/* The three arrays below are malloc'ed num_msgs deep. */
		struct resource **res;
		void **cookie;
		struct pptintr_arg *arg;
	} msix;
};
109 
SYSCTL_DECL(_hw_vmm);
SYSCTL_NODE(_hw_vmm, OID_AUTO, ppt, CTLFLAG_RW, 0, "bhyve passthru devices");

/* Number of PCI devices currently attached to the ppt driver. */
static int num_pptdevs;
SYSCTL_INT(_hw_vmm_ppt, OID_AUTO, devices, CTLFLAG_RD, &num_pptdevs, 0,
    "number of pci passthru devices");

/* All attached passthrough devices, linked through pptdev->next. */
static TAILQ_HEAD(, pptdev) pptdev_list = TAILQ_HEAD_INITIALIZER(pptdev_list);
118 
119 static int
120 ppt_probe(device_t dev)
121 {
122 	int bus, slot, func;
123 	struct pci_devinfo *dinfo;
124 
125 	dinfo = (struct pci_devinfo *)device_get_ivars(dev);
126 
127 	bus = pci_get_bus(dev);
128 	slot = pci_get_slot(dev);
129 	func = pci_get_function(dev);
130 
131 	/*
132 	 * To qualify as a pci passthrough device a device must:
133 	 * - be allowed by administrator to be used in this role
134 	 * - be an endpoint device
135 	 */
136 	if ((dinfo->cfg.hdrtype & PCIM_HDRTYPE) != PCIM_HDRTYPE_NORMAL)
137 		return (ENXIO);
138 	else if (vmm_is_pptdev(bus, slot, func))
139 		return (0);
140 	else
141 		/*
142 		 * Returning BUS_PROBE_NOWILDCARD here matches devices that the
143 		 * SR-IOV infrastructure specified as "ppt" passthrough devices.
144 		 * All normal devices that did not have "ppt" specified as their
145 		 * driver will not be matched by this.
146 		 */
147 		return (BUS_PROBE_NOWILDCARD);
148 }
149 
150 static int
151 ppt_attach(device_t dev)
152 {
153 	struct pptdev *ppt;
154 
155 	ppt = device_get_softc(dev);
156 
157 	iommu_remove_device(iommu_host_domain(), pci_get_rid(dev));
158 	num_pptdevs++;
159 	TAILQ_INSERT_TAIL(&pptdev_list, ppt, next);
160 	ppt->dev = dev;
161 
162 	if (bootverbose)
163 		device_printf(dev, "attached\n");
164 
165 	return (0);
166 }
167 
168 static int
169 ppt_detach(device_t dev)
170 {
171 	struct pptdev *ppt;
172 
173 	ppt = device_get_softc(dev);
174 
175 	if (ppt->vm != NULL)
176 		return (EBUSY);
177 	num_pptdevs--;
178 	TAILQ_REMOVE(&pptdev_list, ppt, next);
179 	pci_disable_busmaster(dev);
180 	iommu_add_device(iommu_host_domain(), pci_get_rid(dev));
181 
182 	return (0);
183 }
184 
/* newbus glue: probe/attach/detach only; no bus or child methods. */
static device_method_t ppt_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,		ppt_probe),
	DEVMETHOD(device_attach,	ppt_attach),
	DEVMETHOD(device_detach,	ppt_detach),
	{0, 0}
};

static devclass_t ppt_devclass;
DEFINE_CLASS_0(ppt, ppt_driver, ppt_methods, sizeof(struct pptdev));
DRIVER_MODULE(ppt, pci, ppt_driver, ppt_devclass, NULL, NULL);
196 
197 static struct pptdev *
198 ppt_find(int bus, int slot, int func)
199 {
200 	device_t dev;
201 	struct pptdev *ppt;
202 	int b, s, f;
203 
204 	TAILQ_FOREACH(ppt, &pptdev_list, next) {
205 		dev = ppt->dev;
206 		b = pci_get_bus(dev);
207 		s = pci_get_slot(dev);
208 		f = pci_get_function(dev);
209 		if (bus == b && slot == s && func == f)
210 			return (ppt);
211 	}
212 	return (NULL);
213 }
214 
215 static void
216 ppt_unmap_mmio(struct vm *vm, struct pptdev *ppt)
217 {
218 	int i;
219 	struct pptseg *seg;
220 
221 	for (i = 0; i < MAX_MMIOSEGS; i++) {
222 		seg = &ppt->mmio[i];
223 		if (seg->len == 0)
224 			continue;
225 		(void)vm_unmap_mmio(vm, seg->gpa, seg->len);
226 		bzero(seg, sizeof(struct pptseg));
227 	}
228 }
229 
230 static void
231 ppt_teardown_msi(struct pptdev *ppt)
232 {
233 	int i, rid;
234 	void *cookie;
235 	struct resource *res;
236 
237 	if (ppt->msi.num_msgs == 0)
238 		return;
239 
240 	for (i = 0; i < ppt->msi.num_msgs; i++) {
241 		rid = ppt->msi.startrid + i;
242 		res = ppt->msi.res[i];
243 		cookie = ppt->msi.cookie[i];
244 
245 		if (cookie != NULL)
246 			bus_teardown_intr(ppt->dev, res, cookie);
247 
248 		if (res != NULL)
249 			bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
250 
251 		ppt->msi.res[i] = NULL;
252 		ppt->msi.cookie[i] = NULL;
253 	}
254 
255 	if (ppt->msi.startrid == 1)
256 		pci_release_msi(ppt->dev);
257 
258 	ppt->msi.num_msgs = 0;
259 }
260 
261 static void
262 ppt_teardown_msix_intr(struct pptdev *ppt, int idx)
263 {
264 	int rid;
265 	struct resource *res;
266 	void *cookie;
267 
268 	rid = ppt->msix.startrid + idx;
269 	res = ppt->msix.res[idx];
270 	cookie = ppt->msix.cookie[idx];
271 
272 	if (cookie != NULL)
273 		bus_teardown_intr(ppt->dev, res, cookie);
274 
275 	if (res != NULL)
276 		bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, res);
277 
278 	ppt->msix.res[idx] = NULL;
279 	ppt->msix.cookie[idx] = NULL;
280 }
281 
282 static void
283 ppt_teardown_msix(struct pptdev *ppt)
284 {
285 	int i;
286 
287 	if (ppt->msix.num_msgs == 0)
288 		return;
289 
290 	for (i = 0; i < ppt->msix.num_msgs; i++)
291 		ppt_teardown_msix_intr(ppt, i);
292 
293 	if (ppt->msix.msix_table_res) {
294 		bus_release_resource(ppt->dev, SYS_RES_MEMORY,
295 				     ppt->msix.msix_table_rid,
296 				     ppt->msix.msix_table_res);
297 		ppt->msix.msix_table_res = NULL;
298 		ppt->msix.msix_table_rid = 0;
299 	}
300 
301 	free(ppt->msix.res, M_PPTMSIX);
302 	free(ppt->msix.cookie, M_PPTMSIX);
303 	free(ppt->msix.arg, M_PPTMSIX);
304 
305 	pci_release_msi(ppt->dev);
306 
307 	ppt->msix.num_msgs = 0;
308 }
309 
310 int
311 ppt_avail_devices(void)
312 {
313 
314 	return (num_pptdevs);
315 }
316 
317 int
318 ppt_assigned_devices(struct vm *vm)
319 {
320 	struct pptdev *ppt;
321 	int num;
322 
323 	num = 0;
324 	TAILQ_FOREACH(ppt, &pptdev_list, next) {
325 		if (ppt->vm == vm)
326 			num++;
327 	}
328 	return (num);
329 }
330 
331 boolean_t
332 ppt_is_mmio(struct vm *vm, vm_paddr_t gpa)
333 {
334 	int i;
335 	struct pptdev *ppt;
336 	struct pptseg *seg;
337 
338 	TAILQ_FOREACH(ppt, &pptdev_list, next) {
339 		if (ppt->vm != vm)
340 			continue;
341 
342 		for (i = 0; i < MAX_MMIOSEGS; i++) {
343 			seg = &ppt->mmio[i];
344 			if (seg->len == 0)
345 				continue;
346 			if (gpa >= seg->gpa && gpa < seg->gpa + seg->len)
347 				return (TRUE);
348 		}
349 	}
350 
351 	return (FALSE);
352 }
353 
354 int
355 ppt_assign_device(struct vm *vm, int bus, int slot, int func)
356 {
357 	struct pptdev *ppt;
358 
359 	ppt = ppt_find(bus, slot, func);
360 	if (ppt != NULL) {
361 		/*
362 		 * If this device is owned by a different VM then we
363 		 * cannot change its owner.
364 		 */
365 		if (ppt->vm != NULL && ppt->vm != vm)
366 			return (EBUSY);
367 
368 		pci_save_state(ppt->dev);
369 		pcie_flr(ppt->dev,
370 		    max(pcie_get_max_completion_timeout(ppt->dev) / 1000, 10),
371 		    true);
372 		pci_restore_state(ppt->dev);
373 		ppt->vm = vm;
374 		iommu_add_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev));
375 		return (0);
376 	}
377 	return (ENOENT);
378 }
379 
380 int
381 ppt_unassign_device(struct vm *vm, int bus, int slot, int func)
382 {
383 	struct pptdev *ppt;
384 
385 	ppt = ppt_find(bus, slot, func);
386 	if (ppt != NULL) {
387 		/*
388 		 * If this device is not owned by this 'vm' then bail out.
389 		 */
390 		if (ppt->vm != vm)
391 			return (EBUSY);
392 
393 		pci_save_state(ppt->dev);
394 		pcie_flr(ppt->dev,
395 		    max(pcie_get_max_completion_timeout(ppt->dev) / 1000, 10),
396 		    true);
397 		pci_restore_state(ppt->dev);
398 		ppt_unmap_mmio(vm, ppt);
399 		ppt_teardown_msi(ppt);
400 		ppt_teardown_msix(ppt);
401 		iommu_remove_device(vm_iommu_domain(vm), pci_get_rid(ppt->dev));
402 		ppt->vm = NULL;
403 		return (0);
404 	}
405 	return (ENOENT);
406 }
407 
408 int
409 ppt_unassign_all(struct vm *vm)
410 {
411 	struct pptdev *ppt;
412 	int bus, slot, func;
413 	device_t dev;
414 
415 	TAILQ_FOREACH(ppt, &pptdev_list, next) {
416 		if (ppt->vm == vm) {
417 			dev = ppt->dev;
418 			bus = pci_get_bus(dev);
419 			slot = pci_get_slot(dev);
420 			func = pci_get_function(dev);
421 			vm_unassign_pptdev(vm, bus, slot, func);
422 		}
423 	}
424 
425 	return (0);
426 }
427 
428 int
429 ppt_map_mmio(struct vm *vm, int bus, int slot, int func,
430 	     vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
431 {
432 	int i, error;
433 	struct pptseg *seg;
434 	struct pptdev *ppt;
435 
436 	ppt = ppt_find(bus, slot, func);
437 	if (ppt != NULL) {
438 		if (ppt->vm != vm)
439 			return (EBUSY);
440 
441 		for (i = 0; i < MAX_MMIOSEGS; i++) {
442 			seg = &ppt->mmio[i];
443 			if (seg->len == 0) {
444 				error = vm_map_mmio(vm, gpa, len, hpa);
445 				if (error == 0) {
446 					seg->gpa = gpa;
447 					seg->len = len;
448 				}
449 				return (error);
450 			}
451 		}
452 		return (ENOSPC);
453 	}
454 	return (ENOENT);
455 }
456 
457 static int
458 pptintr(void *arg)
459 {
460 	struct pptdev *ppt;
461 	struct pptintr_arg *pptarg;
462 
463 	pptarg = arg;
464 	ppt = pptarg->pptdev;
465 
466 	if (ppt->vm != NULL)
467 		lapic_intr_msi(ppt->vm, pptarg->addr, pptarg->msg_data);
468 	else {
469 		/*
470 		 * XXX
471 		 * This is not expected to happen - panic?
472 		 */
473 	}
474 
475 	/*
476 	 * For legacy interrupts give other filters a chance in case
477 	 * the interrupt was not generated by the passthrough device.
478 	 */
479 	if (ppt->msi.startrid == 0)
480 		return (FILTER_STRAY);
481 	else
482 		return (FILTER_HANDLED);
483 }
484 
/*
 * Configure 'numvec' MSI vectors (or a single shared legacy interrupt
 * if the device is not MSI-capable) for the passthrough device at
 * bus/slot/func, delivering each vector to the guest with the given
 * address/data pair.  Any previous MSI configuration is torn down
 * first; numvec == 0 just tears down.  Returns 0 or an errno value.
 */
int
ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func,
	      uint64_t addr, uint64_t msg, int numvec)
{
	int i, rid, flags;
	int msi_count, startrid, error, tmp;
	struct pptdev *ppt;

	if (numvec < 0 || numvec > MAX_MSIMSGS)
		return (EINVAL);

	ppt = ppt_find(bus, slot, func);
	if (ppt == NULL)
		return (ENOENT);
	if (ppt->vm != vm)		/* Make sure we own this device */
		return (EBUSY);

	/* Free any allocated resources */
	ppt_teardown_msi(ppt);

	if (numvec == 0)		/* nothing more to do */
		return (0);

	flags = RF_ACTIVE;
	msi_count = pci_msi_count(ppt->dev);
	if (msi_count == 0) {
		startrid = 0;		/* legacy interrupt */
		msi_count = 1;
		flags |= RF_SHAREABLE;
	} else
		startrid = 1;		/* MSI */

	/*
	 * The device must be capable of supporting the number of vectors
	 * the guest wants to allocate.
	 */
	if (numvec > msi_count)
		return (EINVAL);

	/*
	 * Make sure that we can allocate all the MSI vectors that are needed
	 * by the guest.
	 */
	if (startrid == 1) {
		tmp = numvec;
		error = pci_alloc_msi(ppt->dev, &tmp);
		if (error)
			return (error);
		else if (tmp != numvec) {
			/* Partial allocation is useless; give it back. */
			pci_release_msi(ppt->dev);
			return (ENOSPC);
		} else {
			/* success */
		}
	}

	ppt->msi.startrid = startrid;

	/*
	 * Allocate the irq resource and attach it to the interrupt handler.
	 */
	for (i = 0; i < numvec; i++) {
		/* Track progress so ppt_teardown_msi() can unwind on error. */
		ppt->msi.num_msgs = i + 1;
		ppt->msi.cookie[i] = NULL;

		rid = startrid + i;
		ppt->msi.res[i] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
							 &rid, flags);
		if (ppt->msi.res[i] == NULL)
			break;

		ppt->msi.arg[i].pptdev = ppt;
		ppt->msi.arg[i].addr = addr;
		/* Consecutive vectors use consecutive MSI data values. */
		ppt->msi.arg[i].msg_data = msg + i;

		error = bus_setup_intr(ppt->dev, ppt->msi.res[i],
				       INTR_TYPE_NET | INTR_MPSAFE,
				       pptintr, NULL, &ppt->msi.arg[i],
				       &ppt->msi.cookie[i]);
		if (error != 0)
			break;
	}

	if (i < numvec) {
		/* Partial failure: release everything allocated so far. */
		ppt_teardown_msi(ppt);
		return (ENXIO);
	}

	return (0);
}
575 
576 int
577 ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func,
578 	       int idx, uint64_t addr, uint64_t msg, uint32_t vector_control)
579 {
580 	struct pptdev *ppt;
581 	struct pci_devinfo *dinfo;
582 	int numvec, alloced, rid, error;
583 	size_t res_size, cookie_size, arg_size;
584 
585 	ppt = ppt_find(bus, slot, func);
586 	if (ppt == NULL)
587 		return (ENOENT);
588 	if (ppt->vm != vm)		/* Make sure we own this device */
589 		return (EBUSY);
590 
591 	dinfo = device_get_ivars(ppt->dev);
592 	if (!dinfo)
593 		return (ENXIO);
594 
595 	/*
596 	 * First-time configuration:
597 	 * 	Allocate the MSI-X table
598 	 *	Allocate the IRQ resources
599 	 *	Set up some variables in ppt->msix
600 	 */
601 	if (ppt->msix.num_msgs == 0) {
602 		numvec = pci_msix_count(ppt->dev);
603 		if (numvec <= 0)
604 			return (EINVAL);
605 
606 		ppt->msix.startrid = 1;
607 		ppt->msix.num_msgs = numvec;
608 
609 		res_size = numvec * sizeof(ppt->msix.res[0]);
610 		cookie_size = numvec * sizeof(ppt->msix.cookie[0]);
611 		arg_size = numvec * sizeof(ppt->msix.arg[0]);
612 
613 		ppt->msix.res = malloc(res_size, M_PPTMSIX, M_WAITOK | M_ZERO);
614 		ppt->msix.cookie = malloc(cookie_size, M_PPTMSIX,
615 					  M_WAITOK | M_ZERO);
616 		ppt->msix.arg = malloc(arg_size, M_PPTMSIX, M_WAITOK | M_ZERO);
617 
618 		rid = dinfo->cfg.msix.msix_table_bar;
619 		ppt->msix.msix_table_res = bus_alloc_resource_any(ppt->dev,
620 					       SYS_RES_MEMORY, &rid, RF_ACTIVE);
621 
622 		if (ppt->msix.msix_table_res == NULL) {
623 			ppt_teardown_msix(ppt);
624 			return (ENOSPC);
625 		}
626 		ppt->msix.msix_table_rid = rid;
627 
628 		alloced = numvec;
629 		error = pci_alloc_msix(ppt->dev, &alloced);
630 		if (error || alloced != numvec) {
631 			ppt_teardown_msix(ppt);
632 			return (error == 0 ? ENOSPC: error);
633 		}
634 	}
635 
636 	if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
637 		/* Tear down the IRQ if it's already set up */
638 		ppt_teardown_msix_intr(ppt, idx);
639 
640 		/* Allocate the IRQ resource */
641 		ppt->msix.cookie[idx] = NULL;
642 		rid = ppt->msix.startrid + idx;
643 		ppt->msix.res[idx] = bus_alloc_resource_any(ppt->dev, SYS_RES_IRQ,
644 							    &rid, RF_ACTIVE);
645 		if (ppt->msix.res[idx] == NULL)
646 			return (ENXIO);
647 
648 		ppt->msix.arg[idx].pptdev = ppt;
649 		ppt->msix.arg[idx].addr = addr;
650 		ppt->msix.arg[idx].msg_data = msg;
651 
652 		/* Setup the MSI-X interrupt */
653 		error = bus_setup_intr(ppt->dev, ppt->msix.res[idx],
654 				       INTR_TYPE_NET | INTR_MPSAFE,
655 				       pptintr, NULL, &ppt->msix.arg[idx],
656 				       &ppt->msix.cookie[idx]);
657 
658 		if (error != 0) {
659 			bus_teardown_intr(ppt->dev, ppt->msix.res[idx], ppt->msix.cookie[idx]);
660 			bus_release_resource(ppt->dev, SYS_RES_IRQ, rid, ppt->msix.res[idx]);
661 			ppt->msix.cookie[idx] = NULL;
662 			ppt->msix.res[idx] = NULL;
663 			return (ENXIO);
664 		}
665 	} else {
666 		/* Masked, tear it down if it's already been set up */
667 		ppt_teardown_msix_intr(ppt, idx);
668 	}
669 
670 	return (0);
671 }
672