xref: /freebsd/sys/dev/pci/pci_iov.c (revision 6829dae12bb055451fa467da4589c43bd03b1e64)
/*-
 * Copyright (c) 2013-2015 Sandvine Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_bus.h"

#include <sys/param.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/fcntl.h>
#include <sys/ioccom.h>
#include <sys/iov.h>
#include <sys/linker.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/pciio.h>
#include <sys/queue.h>
#include <sys/rman.h>
#include <sys/sysctl.h>

#include <machine/bus.h>
#include <machine/stdarg.h>

#include <sys/nv.h>
#include <sys/iov_schema.h>

#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pci_iov.h>
#include <dev/pci/pci_private.h>
#include <dev/pci/pci_iov_private.h>
#include <dev/pci/schema_private.h>

#include "pcib_if.h"

static MALLOC_DEFINE(M_SRIOV, "sr_iov", "PCI SR-IOV allocations");

static d_ioctl_t pci_iov_ioctl;

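/*
 * Character device switch for the /dev/iov/<name> node created for each
 * SR-IOV-capable PF in pci_iov_attach_method(); all configuration is
 * driven through its ioctl handler.
 */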
static struct cdevsw iov_cdevsw = {
	.d_version = D_VERSION,
	.d_name = "iov",
	.d_ioctl = pci_iov_ioctl
};

SYSCTL_DECL(_hw_pci);

/*
 * The maximum amount of memory we will allocate for user configuration of an
 * SR-IOV device.  1MB ought to be enough for anyone, but leave this
 * configurable just in case.
 */
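/*
 * Because the OID below is CTLFLAG_RWTUN, the limit may be set either as a
 * loader tunable or at runtime, e.g. (illustrative value):
 *	sysctl hw.pci.iov_max_config=2097152
 */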
static u_long pci_iov_max_config = 1024 * 1024;
SYSCTL_ULONG(_hw_pci, OID_AUTO, iov_max_config, CTLFLAG_RWTUN,
    &pci_iov_max_config, 0, "Maximum allowed size of SR-IOV configuration.");

#define IOV_READ(d, r, w) \
	pci_read_config((d)->cfg.dev, (d)->cfg.iov->iov_pos + (r), (w))

#define IOV_WRITE(d, r, v, w) \
	pci_write_config((d)->cfg.dev, (d)->cfg.iov->iov_pos + (r), (v), (w))

static nvlist_t	*pci_iov_build_schema(nvlist_t **pf_schema,
		    nvlist_t **vf_schema);
static void	pci_iov_build_pf_schema(nvlist_t *schema,
		    nvlist_t **driver_schema);
static void	pci_iov_build_vf_schema(nvlist_t *schema,
		    nvlist_t **driver_schema);
static nvlist_t	*pci_iov_get_pf_subsystem_schema(void);
static nvlist_t	*pci_iov_get_vf_subsystem_schema(void);

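/*
 * A minimal usage sketch (illustrative, not normative; the "mac-addr"
 * schema entry is an example, not a requirement): a PF driver builds its
 * PF and VF schemas and registers with the IOV layer from its attach
 * routine:
 *
 *	pf_schema = pci_iov_schema_alloc_node();
 *	vf_schema = pci_iov_schema_alloc_node();
 *	pci_iov_schema_add_unicast_mac(vf_schema, "mac-addr", 0, NULL);
 *	error = pci_iov_attach_name(dev, pf_schema, vf_schema, "%s",
 *	    device_get_nameunit(dev));
 *
 * The IOV layer always takes ownership of both schemas, even on failure.
 */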
int
pci_iov_attach_name(device_t dev, struct nvlist *pf_schema,
    struct nvlist *vf_schema, const char *fmt, ...)
{
	char buf[NAME_MAX + 1];
	va_list ap;

	va_start(ap, fmt);
	vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);
	return (PCI_IOV_ATTACH(device_get_parent(dev), dev, pf_schema,
	    vf_schema, buf));
}

int
pci_iov_attach_method(device_t bus, device_t dev, nvlist_t *pf_schema,
    nvlist_t *vf_schema, const char *name)
{
	device_t pcib;
	struct pci_devinfo *dinfo;
	struct pcicfg_iov *iov;
	nvlist_t *schema;
	uint32_t version;
	int error;
	int iov_pos;

	dinfo = device_get_ivars(dev);
	pcib = device_get_parent(bus);
	schema = NULL;

	error = pci_find_extcap(dev, PCIZ_SRIOV, &iov_pos);

	if (error != 0)
		return (error);

	version = pci_read_config(dev, iov_pos, 4);
	if (PCI_EXTCAP_VER(version) != 1) {
		if (bootverbose)
			device_printf(dev,
			    "Unsupported version of SR-IOV (%d) detected\n",
			    PCI_EXTCAP_VER(version));

		return (ENXIO);
	}

	iov = malloc(sizeof(*dinfo->cfg.iov), M_SRIOV, M_WAITOK | M_ZERO);

	mtx_lock(&Giant);
	if (dinfo->cfg.iov != NULL) {
		error = EBUSY;
		goto cleanup;
	}
	iov->iov_pos = iov_pos;

	schema = pci_iov_build_schema(&pf_schema, &vf_schema);
	if (schema == NULL) {
		error = ENOMEM;
		goto cleanup;
	}

	error = pci_iov_validate_schema(schema);
	if (error != 0)
		goto cleanup;
	iov->iov_schema = schema;

	iov->iov_cdev = make_dev(&iov_cdevsw, device_get_unit(dev),
	    UID_ROOT, GID_WHEEL, 0600, "iov/%s", name);

	if (iov->iov_cdev == NULL) {
		error = ENOMEM;
		goto cleanup;
	}

	dinfo->cfg.iov = iov;
	iov->iov_cdev->si_drv1 = dinfo;
	mtx_unlock(&Giant);

	return (0);

cleanup:
	nvlist_destroy(schema);
	nvlist_destroy(pf_schema);
	nvlist_destroy(vf_schema);
	free(iov, M_SRIOV);
	mtx_unlock(&Giant);
	return (error);
}

int
pci_iov_detach_method(device_t bus, device_t dev)
{
	struct pci_devinfo *dinfo;
	struct pcicfg_iov *iov;

	mtx_lock(&Giant);
	dinfo = device_get_ivars(dev);
	iov = dinfo->cfg.iov;

	if (iov == NULL) {
		mtx_unlock(&Giant);
		return (0);
	}

	if (iov->iov_num_vfs != 0 || iov->iov_flags & IOV_BUSY) {
		mtx_unlock(&Giant);
		return (EBUSY);
	}

	dinfo->cfg.iov = NULL;

	if (iov->iov_cdev) {
		destroy_dev(iov->iov_cdev);
		iov->iov_cdev = NULL;
	}
	nvlist_destroy(iov->iov_schema);

	free(iov, M_SRIOV);
	mtx_unlock(&Giant);

	return (0);
}

static nvlist_t *
pci_iov_build_schema(nvlist_t **pf, nvlist_t **vf)
{
	nvlist_t *schema, *pf_driver, *vf_driver;

	/* We always take ownership of the schemas. */
	pf_driver = *pf;
	*pf = NULL;
	vf_driver = *vf;
	*vf = NULL;

	schema = pci_iov_schema_alloc_node();
	if (schema == NULL)
		goto cleanup;

	pci_iov_build_pf_schema(schema, &pf_driver);
	pci_iov_build_vf_schema(schema, &vf_driver);

	if (nvlist_error(schema) != 0)
		goto cleanup;

	return (schema);

cleanup:
	nvlist_destroy(schema);
	nvlist_destroy(pf_driver);
	nvlist_destroy(vf_driver);
	return (NULL);
}

static void
pci_iov_build_pf_schema(nvlist_t *schema, nvlist_t **driver_schema)
{
	nvlist_t *pf_schema, *iov_schema;

	pf_schema = pci_iov_schema_alloc_node();
	if (pf_schema == NULL) {
		nvlist_set_error(schema, ENOMEM);
		return;
	}

	iov_schema = pci_iov_get_pf_subsystem_schema();

	/*
	 * Note that if either *driver_schema or iov_schema is NULL, then
	 * nvlist_move_nvlist will put the schema in the error state and
	 * SR-IOV will fail to initialize later, so we don't have to explicitly
	 * handle that case.
	 */
	nvlist_move_nvlist(pf_schema, DRIVER_CONFIG_NAME, *driver_schema);
	nvlist_move_nvlist(pf_schema, IOV_CONFIG_NAME, iov_schema);
	nvlist_move_nvlist(schema, PF_CONFIG_NAME, pf_schema);
	*driver_schema = NULL;
}

static void
pci_iov_build_vf_schema(nvlist_t *schema, nvlist_t **driver_schema)
{
	nvlist_t *vf_schema, *iov_schema;

	vf_schema = pci_iov_schema_alloc_node();
	if (vf_schema == NULL) {
		nvlist_set_error(schema, ENOMEM);
		return;
	}

	iov_schema = pci_iov_get_vf_subsystem_schema();

	/*
	 * Note that if either *driver_schema or iov_schema is NULL, then
	 * nvlist_move_nvlist will put the schema in the error state and
	 * SR-IOV will fail to initialize later, so we don't have to explicitly
	 * handle that case.
	 */
	nvlist_move_nvlist(vf_schema, DRIVER_CONFIG_NAME, *driver_schema);
	nvlist_move_nvlist(vf_schema, IOV_CONFIG_NAME, iov_schema);
	nvlist_move_nvlist(schema, VF_SCHEMA_NAME, vf_schema);
	*driver_schema = NULL;
}

static nvlist_t *
pci_iov_get_pf_subsystem_schema(void)
{
	nvlist_t *pf;

	pf = pci_iov_schema_alloc_node();
	if (pf == NULL)
		return (NULL);

	pci_iov_schema_add_uint16(pf, "num_vfs", IOV_SCHEMA_REQUIRED, -1);
	pci_iov_schema_add_string(pf, "device", IOV_SCHEMA_REQUIRED, NULL);

	return (pf);
}

static nvlist_t *
pci_iov_get_vf_subsystem_schema(void)
{
	nvlist_t *vf;

	vf = pci_iov_schema_alloc_node();
	if (vf == NULL)
		return (NULL);

	pci_iov_schema_add_bool(vf, "passthrough", IOV_SCHEMA_HASDEFAULT, 0);

	return (vf);
}

static int
pci_iov_alloc_bar(struct pci_devinfo *dinfo, int bar, pci_addr_t bar_shift)
{
	struct resource *res;
	struct pcicfg_iov *iov;
	device_t dev, bus;
	rman_res_t start, end;
	pci_addr_t bar_size;
	int rid;

	iov = dinfo->cfg.iov;
	dev = dinfo->cfg.dev;
	bus = device_get_parent(dev);
	rid = iov->iov_pos + PCIR_SRIOV_BAR(bar);
	bar_size = 1 << bar_shift;

	res = pci_alloc_multi_resource(bus, dev, SYS_RES_MEMORY, &rid, 0,
	    ~0, 1, iov->iov_num_vfs, RF_ACTIVE);

	if (res == NULL)
		return (ENXIO);

	iov->iov_bar[bar].res = res;
	iov->iov_bar[bar].bar_size = bar_size;
	iov->iov_bar[bar].bar_shift = bar_shift;

	start = rman_get_start(res);
	end = rman_get_end(res);
	return (rman_manage_region(&iov->rman, start, end));
}

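/*
 * Program each VF's standard BARs from the PF's VF BAR windows.  Per the
 * SR-IOV model, VF i's mapping for BAR n begins at
 *	rman_get_start(iov_bar[n].res) + i * iov_bar[n].bar_size
 * For example (illustrative addresses): with a 16KB VF BAR0 window starting
 * at 0xd0000000, VF 3's BAR0 is 0xd0000000 + 3 * 0x4000 = 0xd000c000.
 */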
static void
pci_iov_add_bars(struct pcicfg_iov *iov, struct pci_devinfo *dinfo)
{
	struct pci_iov_bar *bar;
	uint64_t bar_start;
	int i;

	for (i = 0; i <= PCIR_MAX_BAR_0; i++) {
		bar = &iov->iov_bar[i];
		if (bar->res != NULL) {
			bar_start = rman_get_start(bar->res) +
			    dinfo->cfg.vf.index * bar->bar_size;

			pci_add_bar(dinfo->cfg.dev, PCIR_BAR(i), bar_start,
			    bar->bar_shift);
		}
	}
}

static int
pci_iov_parse_config(struct pcicfg_iov *iov, struct pci_iov_arg *arg,
    nvlist_t **ret)
{
	void *packed_config;
	nvlist_t *config;
	int error;

	config = NULL;
	packed_config = NULL;

	if (arg->len > pci_iov_max_config) {
		error = EMSGSIZE;
		goto out;
	}

	packed_config = malloc(arg->len, M_SRIOV, M_WAITOK);

	error = copyin(arg->config, packed_config, arg->len);
	if (error != 0)
		goto out;

	config = nvlist_unpack(packed_config, arg->len, NV_FLAG_IGNORE_CASE);
	if (config == NULL) {
		error = EINVAL;
		goto out;
	}

	error = pci_iov_schema_validate_config(iov->iov_schema, config);
	if (error != 0)
		goto out;

	error = nvlist_error(config);
	if (error != 0)
		goto out;

	*ret = config;
	config = NULL;

out:
	nvlist_destroy(config);
	free(packed_config, M_SRIOV);
	return (error);
}

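/*
 * Shape of the validated configuration nvlist, as consumed below (a sketch
 * using the schema macro names rather than their literal string values):
 *
 *	config
 *	+- PF_CONFIG_NAME
 *	|   +- IOV_CONFIG_NAME     ("num_vfs", "device")
 *	|   +- DRIVER_CONFIG_NAME  (PF-driver-specific parameters)
 *	+- VF_PREFIX"0" .. VF_PREFIX"<num_vfs - 1>"
 *	    +- IOV_CONFIG_NAME     ("passthrough")
 *	    +- DRIVER_CONFIG_NAME  (VF-driver-specific parameters)
 */
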
/*
 * Set the ARI_EN bit in the lowest-numbered PCI function with the SR-IOV
 * capability.  This bit is only writeable on the lowest-numbered PF but
 * affects all PFs on the device.
 */
static int
pci_iov_set_ari(device_t bus)
{
	device_t lowest;
	device_t *devlist;
	int i, error, devcount, lowest_func, lowest_pos, iov_pos, dev_func;
	uint16_t iov_ctl;

	/* If ARI is disabled on the downstream port there is nothing to do. */
	if (!PCIB_ARI_ENABLED(device_get_parent(bus)))
		return (0);

	error = device_get_children(bus, &devlist, &devcount);

	if (error != 0)
		return (error);

	lowest = NULL;
	for (i = 0; i < devcount; i++) {
		if (pci_find_extcap(devlist[i], PCIZ_SRIOV, &iov_pos) == 0) {
			dev_func = pci_get_function(devlist[i]);
			if (lowest == NULL || dev_func < lowest_func) {
				lowest = devlist[i];
				lowest_func = dev_func;
				lowest_pos = iov_pos;
			}
		}
	}
	free(devlist, M_TEMP);

	/*
	 * If this function was called, at least one child device must have
	 * the SR-IOV capability.
	 */
	KASSERT(lowest != NULL,
	    ("Could not find child of %s with SR-IOV capability",
	    device_get_nameunit(bus)));

	iov_ctl = pci_read_config(lowest, lowest_pos + PCIR_SRIOV_CTL, 2);
	iov_ctl |= PCIM_SRIOV_ARI_EN;
	pci_write_config(lowest, lowest_pos + PCIR_SRIOV_CTL, iov_ctl, 2);
	if ((pci_read_config(lowest, lowest_pos + PCIR_SRIOV_CTL, 2) &
	    PCIM_SRIOV_ARI_EN) == 0) {
		device_printf(lowest, "failed to enable ARI\n");
		return (ENXIO);
	}
	return (0);
}

static int
pci_iov_config_page_size(struct pci_devinfo *dinfo)
{
	uint32_t page_cap, page_size;

	page_cap = IOV_READ(dinfo, PCIR_SRIOV_PAGE_CAP, 4);

	/*
	 * If the system page size is less than the smallest SR-IOV page size
	 * then round up to the smallest SR-IOV page size.
	 */
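	/*
	 * For example, with 4KB system pages (PAGE_SHIFT == 12) and the
	 * SR-IOV base page shift of 12, page_size is (1 << 0): bit 0 of the
	 * Supported Page Sizes register, which encodes 4KB pages.
	 */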
	if (PAGE_SHIFT < PCI_SRIOV_BASE_PAGE_SHIFT)
		page_size = (1 << 0);
	else
		page_size = (1 << (PAGE_SHIFT - PCI_SRIOV_BASE_PAGE_SHIFT));

	/* Check that the device supports the system page size. */
	if (!(page_size & page_cap))
		return (ENXIO);

	IOV_WRITE(dinfo, PCIR_SRIOV_PAGE_SIZE, page_size, 4);
	return (0);
}

static int
pci_iov_init(device_t dev, uint16_t num_vfs, const nvlist_t *config)
{
	const nvlist_t *device, *driver_config;

	device = nvlist_get_nvlist(config, PF_CONFIG_NAME);
	driver_config = nvlist_get_nvlist(device, DRIVER_CONFIG_NAME);
	return (PCI_IOV_INIT(dev, num_vfs, driver_config));
}

static int
pci_iov_init_rman(device_t pf, struct pcicfg_iov *iov)
{
	int error;

	iov->rman.rm_start = 0;
	iov->rman.rm_end = ~0;
	iov->rman.rm_type = RMAN_ARRAY;
	snprintf(iov->rman_name, sizeof(iov->rman_name), "%s VF I/O memory",
	    device_get_nameunit(pf));
	iov->rman.rm_descr = iov->rman_name;

	error = rman_init(&iov->rman);
	if (error != 0)
		return (error);

	iov->iov_flags |= IOV_RMAN_INITED;
	return (0);
}

static int
pci_iov_alloc_bar_ea(struct pci_devinfo *dinfo, int bar)
{
	struct pcicfg_iov *iov;
	rman_res_t start, end;
	struct resource *res;
	struct resource_list *rl;
	struct resource_list_entry *rle;

	rl = &dinfo->resources;
	iov = dinfo->cfg.iov;

	rle = resource_list_find(rl, SYS_RES_MEMORY,
	    iov->iov_pos + PCIR_SRIOV_BAR(bar));
	if (rle == NULL)
		rle = resource_list_find(rl, SYS_RES_IOPORT,
		    iov->iov_pos + PCIR_SRIOV_BAR(bar));
	if (rle == NULL)
		return (ENXIO);
	res = rle->res;

	iov->iov_bar[bar].res = res;
	iov->iov_bar[bar].bar_size = rman_get_size(res) / iov->iov_num_vfs;
	iov->iov_bar[bar].bar_shift = pci_mapsize(iov->iov_bar[bar].bar_size);

	start = rman_get_start(res);
	end = rman_get_end(res);

	return (rman_manage_region(&iov->rman, start, end));
}

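/*
 * Size and allocate the PF's VF BAR windows: BARs provided via Enhanced
 * Allocation (EA) are taken as-is; otherwise each SR-IOV BAR is sized the
 * legacy way by probing it with pci_read_bar() and then allocated with
 * pci_iov_alloc_bar().
 */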
static int
pci_iov_setup_bars(struct pci_devinfo *dinfo)
{
	device_t dev;
	struct pcicfg_iov *iov;
	pci_addr_t bar_value, testval;
	int i, last_64, error;

	iov = dinfo->cfg.iov;
	dev = dinfo->cfg.dev;
	last_64 = 0;

	pci_add_resources_ea(device_get_parent(dev), dev, 1);

	for (i = 0; i <= PCIR_MAX_BAR_0; i++) {
		/* First, try to use BARs allocated with EA */
		error = pci_iov_alloc_bar_ea(dinfo, i);
		if (error == 0)
			continue;

		/* Fall back to legacy BAR allocation only if EA is not enabled for this BAR. */
		if (pci_ea_is_enabled(dev, iov->iov_pos + PCIR_SRIOV_BAR(i)))
			continue;

		/*
		 * If a PCI BAR is a 64-bit wide BAR, then it spans two
		 * consecutive registers.  Therefore if the last BAR that
		 * we looked at was a 64-bit BAR, we need to skip this
		 * register as it's the second half of the last BAR.
		 */
		if (!last_64) {
			pci_read_bar(dev,
			    iov->iov_pos + PCIR_SRIOV_BAR(i),
			    &bar_value, &testval, &last_64);

			if (testval != 0) {
				error = pci_iov_alloc_bar(dinfo, i,
				   pci_mapsize(testval));
				if (error != 0)
					return (error);
			}
		} else
			last_64 = 0;
	}

	return (0);
}

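/*
 * Create a device_t child for each VF.  VF i's routing ID (RID) is
 * first_rid + i * rid_stride, where first_rid and rid_stride derive from
 * the SR-IOV capability's VF Offset and VF Stride registers.
 */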
static void
pci_iov_enumerate_vfs(struct pci_devinfo *dinfo, const nvlist_t *config,
    uint16_t first_rid, uint16_t rid_stride)
{
	char device_name[VF_MAX_NAME];
	const nvlist_t *device, *driver_config, *iov_config;
	device_t bus, dev, vf;
	struct pcicfg_iov *iov;
	struct pci_devinfo *vfinfo;
	int i, error;
	uint16_t vid, did, next_rid;

	iov = dinfo->cfg.iov;
	dev = dinfo->cfg.dev;
	bus = device_get_parent(dev);
	next_rid = first_rid;
	vid = pci_get_vendor(dev);
	did = IOV_READ(dinfo, PCIR_SRIOV_VF_DID, 2);

	for (i = 0; i < iov->iov_num_vfs; i++, next_rid += rid_stride) {
		snprintf(device_name, sizeof(device_name), VF_PREFIX"%d", i);
		device = nvlist_get_nvlist(config, device_name);
		iov_config = nvlist_get_nvlist(device, IOV_CONFIG_NAME);
		driver_config = nvlist_get_nvlist(device, DRIVER_CONFIG_NAME);

		vf = PCI_CREATE_IOV_CHILD(bus, dev, next_rid, vid, did);
		if (vf == NULL)
			break;

		/*
		 * If we are creating passthrough devices then force the ppt
		 * driver to attach to prevent a VF driver from claiming the
		 * VFs.
		 */
		if (nvlist_get_bool(iov_config, "passthrough"))
			device_set_devclass_fixed(vf, "ppt");

		vfinfo = device_get_ivars(vf);

		vfinfo->cfg.iov = iov;
		vfinfo->cfg.vf.index = i;

		pci_iov_add_bars(iov, vfinfo);

		error = PCI_IOV_ADD_VF(dev, i, driver_config);
		if (error != 0) {
			device_printf(dev, "Failed to add VF %d\n", i);
			device_delete_child(bus, vf);
		}
	}

	bus_generic_attach(bus);
}

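/*
 * Handler for the IOV_CONFIG ioctl: validate the user-supplied
 * configuration, then bring SR-IOV up in the order the capability
 * requires: program NumVFs, allocate the VF BAR windows, set VF Enable
 * and VF MSE, wait 100ms, and finally enumerate the VFs.
 */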
static int
pci_iov_config(struct cdev *cdev, struct pci_iov_arg *arg)
{
	device_t bus, dev;
	struct pci_devinfo *dinfo;
	struct pcicfg_iov *iov;
	nvlist_t *config;
	int i, error;
	uint16_t rid_off, rid_stride;
	uint16_t first_rid, last_rid;
	uint16_t iov_ctl;
	uint16_t num_vfs, total_vfs;
	int iov_inited;

	mtx_lock(&Giant);
	dinfo = cdev->si_drv1;
	iov = dinfo->cfg.iov;
	dev = dinfo->cfg.dev;
	bus = device_get_parent(dev);
	iov_inited = 0;
	config = NULL;

	if ((iov->iov_flags & IOV_BUSY) || iov->iov_num_vfs != 0) {
		mtx_unlock(&Giant);
		return (EBUSY);
	}
	iov->iov_flags |= IOV_BUSY;

	error = pci_iov_parse_config(iov, arg, &config);
	if (error != 0)
		goto out;

	num_vfs = pci_iov_config_get_num_vfs(config);
	total_vfs = IOV_READ(dinfo, PCIR_SRIOV_TOTAL_VFS, 2);
	if (num_vfs > total_vfs) {
		error = EINVAL;
		goto out;
	}

	error = pci_iov_config_page_size(dinfo);
	if (error != 0)
		goto out;

	error = pci_iov_set_ari(bus);
	if (error != 0)
		goto out;

	error = pci_iov_init(dev, num_vfs, config);
	if (error != 0)
		goto out;
	iov_inited = 1;

	IOV_WRITE(dinfo, PCIR_SRIOV_NUM_VFS, num_vfs, 2);

	rid_off = IOV_READ(dinfo, PCIR_SRIOV_VF_OFF, 2);
	rid_stride = IOV_READ(dinfo, PCIR_SRIOV_VF_STRIDE, 2);

	first_rid = pci_get_rid(dev) + rid_off;
	last_rid = first_rid + (num_vfs - 1) * rid_stride;

	/* We don't yet support allocating extra bus numbers for VFs. */
	if (pci_get_bus(dev) != PCI_RID2BUS(last_rid)) {
		error = ENOSPC;
		goto out;
	}

	iov_ctl = IOV_READ(dinfo, PCIR_SRIOV_CTL, 2);
	iov_ctl &= ~(PCIM_SRIOV_VF_EN | PCIM_SRIOV_VF_MSE);
	IOV_WRITE(dinfo, PCIR_SRIOV_CTL, iov_ctl, 2);

	error = pci_iov_init_rman(dev, iov);
	if (error != 0)
		goto out;

	iov->iov_num_vfs = num_vfs;

	error = pci_iov_setup_bars(dinfo);
	if (error != 0)
		goto out;

	iov_ctl = IOV_READ(dinfo, PCIR_SRIOV_CTL, 2);
	iov_ctl |= PCIM_SRIOV_VF_EN | PCIM_SRIOV_VF_MSE;
	IOV_WRITE(dinfo, PCIR_SRIOV_CTL, iov_ctl, 2);

	/* Per specification, we must wait 100ms before accessing VFs. */
	pause("iov", roundup(hz, 10));
	pci_iov_enumerate_vfs(dinfo, config, first_rid, rid_stride);

	nvlist_destroy(config);
	iov->iov_flags &= ~IOV_BUSY;
	mtx_unlock(&Giant);

	return (0);
out:
	if (iov_inited)
		PCI_IOV_UNINIT(dev);

	for (i = 0; i <= PCIR_MAX_BAR_0; i++) {
		if (iov->iov_bar[i].res != NULL) {
			pci_release_resource(bus, dev, SYS_RES_MEMORY,
			    iov->iov_pos + PCIR_SRIOV_BAR(i),
			    iov->iov_bar[i].res);
			pci_delete_resource(bus, dev, SYS_RES_MEMORY,
			    iov->iov_pos + PCIR_SRIOV_BAR(i));
			iov->iov_bar[i].res = NULL;
		}
	}

	if (iov->iov_flags & IOV_RMAN_INITED) {
		rman_fini(&iov->rman);
		iov->iov_flags &= ~IOV_RMAN_INITED;
	}

	nvlist_destroy(config);
	iov->iov_num_vfs = 0;
	iov->iov_flags &= ~IOV_BUSY;
	mtx_unlock(&Giant);
	return (error);
}

void
pci_iov_cfg_restore(device_t dev, struct pci_devinfo *dinfo)
{
	struct pcicfg_iov *iov;

	iov = dinfo->cfg.iov;

	IOV_WRITE(dinfo, PCIR_SRIOV_PAGE_SIZE, iov->iov_page_size, 4);
	IOV_WRITE(dinfo, PCIR_SRIOV_NUM_VFS, iov->iov_num_vfs, 2);
	IOV_WRITE(dinfo, PCIR_SRIOV_CTL, iov->iov_ctl, 2);
}

void
pci_iov_cfg_save(device_t dev, struct pci_devinfo *dinfo)
{
	struct pcicfg_iov *iov;

	iov = dinfo->cfg.iov;

	iov->iov_page_size = IOV_READ(dinfo, PCIR_SRIOV_PAGE_SIZE, 4);
	iov->iov_ctl = IOV_READ(dinfo, PCIR_SRIOV_CTL, 2);
}

/* Return true if child is a VF of the given PF. */
static int
pci_iov_is_child_vf(struct pcicfg_iov *pf, device_t child)
{
	struct pci_devinfo *vfinfo;

	vfinfo = device_get_ivars(child);

	if (!(vfinfo->cfg.flags & PCICFG_VF))
		return (0);

	return (pf == vfinfo->cfg.iov);
}

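/*
 * Handler for the IOV_DELETE ioctl: detach and delete every VF child of
 * this PF, clear VF Enable/VF MSE, and release the VF BAR windows.
 */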
static int
pci_iov_delete(struct cdev *cdev)
{
	device_t bus, dev, vf, *devlist;
	struct pci_devinfo *dinfo;
	struct pcicfg_iov *iov;
	int i, error, devcount;
	uint32_t iov_ctl;

	mtx_lock(&Giant);
	dinfo = cdev->si_drv1;
	iov = dinfo->cfg.iov;
	dev = dinfo->cfg.dev;
	bus = device_get_parent(dev);
	devlist = NULL;

	if (iov->iov_flags & IOV_BUSY) {
		mtx_unlock(&Giant);
		return (EBUSY);
	}

	if (iov->iov_num_vfs == 0) {
		mtx_unlock(&Giant);
		return (ECHILD);
	}

	iov->iov_flags |= IOV_BUSY;

	error = device_get_children(bus, &devlist, &devcount);

	if (error != 0)
		goto out;

	for (i = 0; i < devcount; i++) {
		vf = devlist[i];

		if (!pci_iov_is_child_vf(iov, vf))
			continue;

		error = device_detach(vf);
		if (error != 0) {
			device_printf(dev,
			   "Could not disable SR-IOV: failed to detach VF %s\n",
			    device_get_nameunit(vf));
			goto out;
		}
	}

	for (i = 0; i < devcount; i++) {
		vf = devlist[i];

		if (pci_iov_is_child_vf(iov, vf))
			device_delete_child(bus, vf);
	}
	PCI_IOV_UNINIT(dev);

	iov_ctl = IOV_READ(dinfo, PCIR_SRIOV_CTL, 2);
	iov_ctl &= ~(PCIM_SRIOV_VF_EN | PCIM_SRIOV_VF_MSE);
	IOV_WRITE(dinfo, PCIR_SRIOV_CTL, iov_ctl, 2);
	IOV_WRITE(dinfo, PCIR_SRIOV_NUM_VFS, 0, 2);

	iov->iov_num_vfs = 0;

	for (i = 0; i <= PCIR_MAX_BAR_0; i++) {
		if (iov->iov_bar[i].res != NULL) {
			pci_release_resource(bus, dev, SYS_RES_MEMORY,
			    iov->iov_pos + PCIR_SRIOV_BAR(i),
			    iov->iov_bar[i].res);
			pci_delete_resource(bus, dev, SYS_RES_MEMORY,
			    iov->iov_pos + PCIR_SRIOV_BAR(i));
			iov->iov_bar[i].res = NULL;
		}
	}

	if (iov->iov_flags & IOV_RMAN_INITED) {
		rman_fini(&iov->rman);
		iov->iov_flags &= ~IOV_RMAN_INITED;
	}

	error = 0;
out:
	free(devlist, M_TEMP);
	iov->iov_flags &= ~IOV_BUSY;
	mtx_unlock(&Giant);
	return (error);
}

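/*
 * Handler for the IOV_GET_SCHEMA ioctl: pack the device's schema nvlist
 * and copy it out.  If the caller's buffer is too small, the required size
 * is returned in output->len and output->error is set to EMSGSIZE so the
 * caller can retry with a larger buffer.
 */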
static int
pci_iov_get_schema_ioctl(struct cdev *cdev, struct pci_iov_schema *output)
{
	struct pci_devinfo *dinfo;
	void *packed;
	size_t output_len, size;
	int error;

	packed = NULL;

	mtx_lock(&Giant);
	dinfo = cdev->si_drv1;
	packed = nvlist_pack(dinfo->cfg.iov->iov_schema, &size);
	mtx_unlock(&Giant);

	if (packed == NULL) {
		error = ENOMEM;
		goto fail;
	}

	output_len = output->len;
	output->len = size;
	if (size <= output_len) {
		error = copyout(packed, output->schema, size);

		if (error != 0)
			goto fail;

		output->error = 0;
	} else
		/*
		 * If we return an error then the ioctl code won't copyout
		 * output back to userland, so we flag the error in the struct
		 * instead.
		 */
		output->error = EMSGSIZE;

	error = 0;

fail:
	free(packed, M_NVLIST);

	return (error);
}

static int
pci_iov_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
    struct thread *td)
{

	switch (cmd) {
	case IOV_CONFIG:
		return (pci_iov_config(dev, (struct pci_iov_arg *)data));
	case IOV_DELETE:
		return (pci_iov_delete(dev));
	case IOV_GET_SCHEMA:
		return (pci_iov_get_schema_ioctl(dev,
		    (struct pci_iov_schema *)data));
	default:
		return (EINVAL);
	}
}

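/*
 * Allocate a memory resource for a VF BAR.  VF BARs are not backed by
 * registers in the VF's own config space; their addresses are derived from
 * the PF's VF BAR windows, so requests are satisfied from the rman that
 * pci_iov_init_rman() and pci_iov_setup_bars() populated.
 */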
struct resource *
pci_vf_alloc_mem_resource(device_t dev, device_t child, int *rid,
    rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
{
	struct pci_devinfo *dinfo;
	struct pcicfg_iov *iov;
	struct pci_map *map;
	struct resource *res;
	struct resource_list_entry *rle;
	rman_res_t bar_start, bar_end;
	pci_addr_t bar_length;
	int error;

	dinfo = device_get_ivars(child);
	iov = dinfo->cfg.iov;

	map = pci_find_bar(child, *rid);
	if (map == NULL)
		return (NULL);

	bar_length = 1 << map->pm_size;
	bar_start = map->pm_value;
	bar_end = bar_start + bar_length - 1;

	/* Make sure that the resource fits the constraints. */
	if (bar_start >= end || bar_end <= bar_start || count != 1)
		return (NULL);

	/* Clamp the resource to the constraints if necessary. */
	if (bar_start < start)
		bar_start = start;
	if (bar_end > end)
		bar_end = end;
	bar_length = bar_end - bar_start + 1;

	res = rman_reserve_resource(&iov->rman, bar_start, bar_end,
	    bar_length, flags, child);
	if (res == NULL)
		return (NULL);

	rle = resource_list_add(&dinfo->resources, SYS_RES_MEMORY, *rid,
	    bar_start, bar_end, 1);
	if (rle == NULL) {
		rman_release_resource(res);
		return (NULL);
	}

	rman_set_rid(res, *rid);

	if (flags & RF_ACTIVE) {
		error = bus_activate_resource(child, SYS_RES_MEMORY, *rid, res);
		if (error != 0) {
			resource_list_delete(&dinfo->resources, SYS_RES_MEMORY,
			    *rid);
			rman_release_resource(res);
			return (NULL);
		}
	}
	rle->res = res;

	return (res);
}

int
pci_vf_release_mem_resource(device_t dev, device_t child, int rid,
    struct resource *r)
{
	struct pci_devinfo *dinfo;
	struct resource_list_entry *rle;
	int error;

	dinfo = device_get_ivars(child);

	if (rman_get_flags(r) & RF_ACTIVE) {
		error = bus_deactivate_resource(child, SYS_RES_MEMORY, rid, r);
		if (error != 0)
			return (error);
	}

	rle = resource_list_find(&dinfo->resources, SYS_RES_MEMORY, rid);
	if (rle != NULL) {
		rle->res = NULL;
		resource_list_delete(&dinfo->resources, SYS_RES_MEMORY,
		    rid);
	}

	return (rman_release_resource(r));
}
1062 