xref: /freebsd/sys/dev/hyperv/pcib/vmbus_pcib.c (revision dd41de95a84d979615a2ef11df6850622bf6184e)
1 /*-
2  * Copyright (c) 2016-2017 Microsoft Corp.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #ifdef NEW_PCIB
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/types.h>
35 #include <sys/malloc.h>
36 #include <sys/module.h>
37 #include <sys/kernel.h>
38 #include <sys/queue.h>
39 #include <sys/lock.h>
40 #include <sys/sx.h>
41 #include <sys/smp.h>
42 #include <sys/sysctl.h>
43 #include <sys/bus.h>
44 #include <sys/rman.h>
45 #include <sys/mutex.h>
46 #include <sys/errno.h>
47 
48 #include <vm/vm.h>
49 #include <vm/vm_param.h>
50 #include <vm/vm_kern.h>
51 #include <vm/pmap.h>
52 
53 #include <machine/atomic.h>
54 #include <machine/bus.h>
55 #include <machine/frame.h>
56 #include <machine/pci_cfgreg.h>
57 #include <machine/resource.h>
58 
59 #include <sys/pciio.h>
60 #include <dev/pci/pcireg.h>
61 #include <dev/pci/pcivar.h>
62 #include <dev/pci/pci_private.h>
63 #include <dev/pci/pcib_private.h>
64 #include "pcib_if.h"
65 
66 #include <machine/intr_machdep.h>
67 #include <x86/apicreg.h>
68 
69 #include <dev/hyperv/include/hyperv.h>
70 #include <dev/hyperv/include/hyperv_busdma.h>
71 #include <dev/hyperv/include/vmbus_xact.h>
72 #include <dev/hyperv/vmbus/vmbus_reg.h>
73 #include <dev/hyperv/vmbus/vmbus_chanvar.h>
74 
75 #include "vmbus_if.h"
76 
77 #if __FreeBSD_version < 1100000
78 typedef u_long rman_res_t;
79 #define RM_MAX_END	(~(rman_res_t)0)
80 #endif
81 
/*
 * Minimal counting completion.  complete() increments `done' and wakes
 * sleepers on the structure's address; the wait_for_completion*() helpers
 * consume one count each.
 */
struct completion {
	unsigned int done;	/* completions posted but not yet consumed */
	struct mtx lock;	/* protects `done'; interlock for mtx_sleep() */
};
86 
87 static void
88 init_completion(struct completion *c)
89 {
90 	memset(c, 0, sizeof(*c));
91 	mtx_init(&c->lock, "hvcmpl", NULL, MTX_DEF);
92 	c->done = 0;
93 }
94 
95 static void
96 free_completion(struct completion *c)
97 {
98 	mtx_destroy(&c->lock);
99 }
100 
101 static void
102 complete(struct completion *c)
103 {
104 	mtx_lock(&c->lock);
105 	c->done++;
106 	mtx_unlock(&c->lock);
107 	wakeup(c);
108 }
109 
110 static void
111 wait_for_completion(struct completion *c)
112 {
113 	mtx_lock(&c->lock);
114 	while (c->done == 0)
115 		mtx_sleep(c, &c->lock, 0, "hvwfc", 0);
116 	c->done--;
117 	mtx_unlock(&c->lock);
118 }
119 
120 /*
121  * Return: 0 if completed, a non-zero value if timed out.
122  */
123 static int
124 wait_for_completion_timeout(struct completion *c, int timeout)
125 {
126 	int ret;
127 
128 	mtx_lock(&c->lock);
129 
130 	if (c->done == 0)
131 		mtx_sleep(c, &c->lock, 0, "hvwfc", timeout);
132 
133 	if (c->done > 0) {
134 		c->done--;
135 		ret = 0;
136 	} else {
137 		ret = 1;
138 	}
139 
140 	mtx_unlock(&c->lock);
141 
142 	return (ret);
143 }
144 
/*
 * Pack a protocol version into 32 bits: `major' in bits 31:16 and `minor'
 * in bits 15:0.  (The previous definition OR-ed `major' into the low half
 * too, which was only correct when major == minor.)
 */
#define PCI_MAKE_VERSION(major, minor) ((uint32_t)(((major) << 16) | (minor)))
146 
/*
 * Protocol versions known to this driver.  PCI_PROTOCOL_VERSION_CURRENT is
 * the value placed in the version request sent to the host.
 */
enum {
	PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1),
	PCI_PROTOCOL_VERSION_CURRENT = PCI_PROTOCOL_VERSION_1_1
};
151 
/*
 * Sizes describing the config-space MMIO window.  CFG_PAGE_SIZE is the part
 * of the window that lies at and beyond CFG_PAGE_OFFSET.
 * NOTE(review): the names suggest the first 0x1000 bytes are a control/index
 * page and the rest is the config page proper -- confirm against the users
 * of these macros.
 */
#define PCI_CONFIG_MMIO_LENGTH	0x2000
#define CFG_PAGE_OFFSET 0x1000
#define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET)
155 
/*
 * Message Types
 *
 * Values carried in struct pci_message.type.  Every code is a small offset
 * from PCI_MESSAGE_BASE; offsets 2 and 3 are not defined here.
 * PCI_MESSAGE_MAXIMUM is one past the last defined code.
 */

enum pci_message_type {
	/*
	 * Version 1.1
	 */
	PCI_MESSAGE_BASE                = 0x42490000,
	PCI_BUS_RELATIONS               = PCI_MESSAGE_BASE + 0,
	PCI_QUERY_BUS_RELATIONS         = PCI_MESSAGE_BASE + 1,
	PCI_POWER_STATE_CHANGE          = PCI_MESSAGE_BASE + 4,
	PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5,
	PCI_QUERY_RESOURCE_RESOURCES    = PCI_MESSAGE_BASE + 6,
	PCI_BUS_D0ENTRY                 = PCI_MESSAGE_BASE + 7,
	PCI_BUS_D0EXIT                  = PCI_MESSAGE_BASE + 8,
	PCI_READ_BLOCK                  = PCI_MESSAGE_BASE + 9,
	PCI_WRITE_BLOCK                 = PCI_MESSAGE_BASE + 0xA,
	PCI_EJECT                       = PCI_MESSAGE_BASE + 0xB,
	PCI_QUERY_STOP                  = PCI_MESSAGE_BASE + 0xC,
	PCI_REENABLE                    = PCI_MESSAGE_BASE + 0xD,
	PCI_QUERY_STOP_FAILED           = PCI_MESSAGE_BASE + 0xE,
	PCI_EJECTION_COMPLETE           = PCI_MESSAGE_BASE + 0xF,
	PCI_RESOURCES_ASSIGNED          = PCI_MESSAGE_BASE + 0x10,
	PCI_RESOURCES_RELEASED          = PCI_MESSAGE_BASE + 0x11,
	PCI_INVALIDATE_BLOCK            = PCI_MESSAGE_BASE + 0x12,
	PCI_QUERY_PROTOCOL_VERSION      = PCI_MESSAGE_BASE + 0x13,
	PCI_CREATE_INTERRUPT_MESSAGE    = PCI_MESSAGE_BASE + 0x14,
	PCI_DELETE_INTERRUPT_MESSAGE    = PCI_MESSAGE_BASE + 0x15,
	PCI_MESSAGE_MAXIMUM
};
187 
188 /*
189  * Structures defining the virtual PCI Express protocol.
190  */
191 
/*
 * A 32-bit protocol version, viewable either as a whole word or as separate
 * 16-bit minor/major halves (minor is the first member of `parts').
 */
union pci_version {
	struct {
		uint16_t minor_version;
		uint16_t major_version;
	} parts;
	uint32_t version;
} __packed;
199 
/*
 * This representation is the one used in Windows, which is
 * what is expected when sending this back and forth with
 * the Hyper-V parent partition.
 *
 * `bits' splits the 32-bit value into a 5-bit slot, a 3-bit function and
 * 24 reserved bits; `val' is the same word as a plain integer, which is the
 * form compared and copied elsewhere in this file.
 */
union win_slot_encoding {
	struct {
		uint32_t	slot:5;
		uint32_t	func:3;
		uint32_t	reserved:24;
	} bits;
	uint32_t val;
} __packed;
213 
/*
 * Description of one PCI function as reported by the host in a bus
 * relations message.  The driver matches known children on wslot, v_id,
 * d_id and ser, and derives the PCI domain from the low 16 bits of `ser'.
 */
struct pci_func_desc {
	uint16_t	v_id;	/* vendor ID */
	uint16_t	d_id;	/* device ID */
	uint8_t		rev;
	uint8_t		prog_intf;
	uint8_t		subclass;
	uint8_t		base_class;
	uint32_t	subsystem_id;
	union win_slot_encoding wslot;
	uint32_t	ser;	/* serial number */
} __packed;
225 
/*
 * Interrupt description embedded in struct pci_create_interrupt (the
 * request sent with PCI_CREATE_INTERRUPT_MESSAGE).
 * NOTE(review): field semantics (vector numbering, delivery mode encoding,
 * cpu_mask bit meaning) are not established by the code visible here.
 */
struct hv_msi_desc {
	uint8_t		vector;
	uint8_t		delivery_mode;
	uint16_t	vector_count;
	uint32_t	reserved;
	uint64_t	cpu_mask;
} __packed;
233 
/*
 * Interrupt descriptor returned by the host in struct
 * pci_create_int_response and echoed back in struct pci_delete_interrupt.
 * NOTE(review): presumably `address'/`data' are the MSI address/data pair
 * to program into the device -- confirm against the users of this struct.
 */
struct tran_int_desc {
	uint16_t	reserved;
	uint16_t	vector_count;
	uint32_t	data;
	uint64_t	address;
} __packed;
240 
/* Common header of every protocol message: just the message type code. */
struct pci_message {
	uint32_t type;	/* one of enum pci_message_type */
} __packed;
244 
/*
 * Outgoing message that addresses a single child function by its slot
 * encoding (used here for PCI_QUERY_RESOURCE_REQUIREMENTS).
 */
struct pci_child_message {
	struct pci_message message_type;
	union win_slot_encoding wslot;
} __packed;
249 
/*
 * Layout of an in-band message as received from the channel: the VMBus
 * channel packet header followed by the protocol message header.
 */
struct pci_incoming_message {
	struct vmbus_chanpkt_hdr hdr;
	struct pci_message message_type;
} __packed;
254 
/*
 * Layout of a completion packet as received from the channel: the VMBus
 * channel packet header followed by the host's status for the request.
 */
struct pci_response {
	struct vmbus_chanpkt_hdr hdr;
	int32_t status;	/* negative values are failures */
} __packed;
259 
/*
 * Driver-side wrapper around an outgoing request.  Its address is passed to
 * vmbus_chan_send() as the transaction id; when the matching completion
 * arrives, the channel callback casts the id back to a pci_packet and calls
 * completion_func(compl_ctxt, response, size).
 */
struct pci_packet {
	void (*completion_func)(void *context, struct pci_response *resp,
	    int resp_packet_size);
	void *compl_ctxt;	/* opaque argument for completion_func */

	struct pci_message message[0];	/* request payload follows */
};
267 
268 /*
269  * Specific message types supporting the PCI protocol.
270  */
271 
/* Payload of PCI_QUERY_PROTOCOL_VERSION: the version the guest proposes. */
struct pci_version_request {
	struct pci_message message_type;
	uint32_t protocol_version;
	uint32_t is_last_attempt:1;
	uint32_t reservedz:31;
} __packed;
278 
/*
 * Payload of PCI_BUS_D0ENTRY.  mmio_base is filled with the start address
 * of the config-space resource chosen by the guest.
 */
struct pci_bus_d0_entry {
	struct pci_message message_type;
	uint32_t reserved;
	uint64_t mmio_base;
} __packed;
284 
/*
 * Incoming PCI_BUS_RELATIONS message: the list of functions currently
 * present on the bus.  `func' holds device_count trailing descriptors.
 */
struct pci_bus_relations {
	struct pci_incoming_message incoming;
	uint32_t device_count;
	struct pci_func_desc func[0];
} __packed;
290 
/* Number of BAR slots tracked per function. */
#define MAX_NUM_BARS	(PCIR_MAX_BAR_0 + 1)
/*
 * Completion for PCI_QUERY_RESOURCE_REQUIREMENTS.  It starts with the same
 * fields as struct pci_response, followed by one probed value per BAR.
 */
struct pci_q_res_req_response {
	struct vmbus_chanpkt_hdr hdr;
	int32_t status; /* negative values are failures */
	uint32_t probed_bar[MAX_NUM_BARS];
} __packed;
297 
/*
 * Payload of PCI_RESOURCES_ASSIGNED for one child function.  This driver
 * only fills in the message type and wslot; everything else is sent zeroed.
 */
struct pci_resources_assigned {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	uint8_t memory_range[0x14][MAX_NUM_BARS]; /* unused here */
	uint32_t msi_descriptors;
	uint32_t reserved[4];
} __packed;
305 
/* Payload of PCI_CREATE_INTERRUPT_MESSAGE for the function at `wslot'. */
struct pci_create_interrupt {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	struct hv_msi_desc int_desc;
} __packed;
311 
/*
 * Completion for a create-interrupt request: the generic response followed
 * by the interrupt descriptor produced by the host.
 */
struct pci_create_int_response {
	struct pci_response response;
	uint32_t reserved;
	struct tran_int_desc int_desc;
} __packed;
317 
/*
 * Payload of PCI_DELETE_INTERRUPT_MESSAGE: identifies the function and the
 * interrupt descriptor being released.
 */
struct pci_delete_interrupt {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	struct tran_int_desc int_desc;
} __packed;
323 
/*
 * Incoming in-band message that refers to one child function (used here to
 * decode PCI_EJECT).
 */
struct pci_dev_incoming {
	struct pci_incoming_message incoming;
	union win_slot_encoding wslot;
} __packed;
328 
/*
 * Payload of PCI_EJECTION_COMPLETE, sent once the ejected child has been
 * deleted.  This driver leaves `status' zero.
 */
struct pci_eject_response {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	uint32_t status;
} __packed;
334 
335 /*
336  * Driver specific state.
337  */
338 
/*
 * Bus life-cycle state.  A bus rescan after a relations update is only
 * performed once the state is hv_pcibus_installed.
 */
enum hv_pcibus_state {
	hv_pcibus_init = 0,
	hv_pcibus_installed,
};
343 
/* State kept for one virtual PCI bus. */
struct hv_pcibus {
	device_t pcib;		/* bridge device; used for device_printf() */
	device_t pci_bus;	/* bus device whose children are added/removed */
	struct vmbus_pcib_softc *sc;	/* back pointer; gives access to the channel */

	/* Taken from the low 16 bits of the first child's serial number. */
	uint16_t pci_domain;

	enum hv_pcibus_state state;

	/* Its start address is reported to the host as the D0 mmio_base. */
	struct resource *cfg_res;

	/*
	 * If query_comp is non-NULL when a relations update has been
	 * processed, it is cleared and completed.
	 */
	struct completion query_completion, *query_comp;

	struct mtx config_lock; /* Avoid two threads writing index page */
	struct mtx device_list_lock;    /* Protect lists below */
	TAILQ_HEAD(, hv_pci_dev) children;
	TAILQ_HEAD(, hv_dr_state) dr_list;

	/* When set, eject requests and non-empty relations are ignored. */
	volatile int detaching;
};
364 
/* Driver state for one child PCI function reported by the host. */
struct hv_pci_dev {
	TAILQ_ENTRY(hv_pci_dev) link;	/* entry in hv_pcibus.children */

	struct pci_func_desc desc;	/* copy of the host's descriptor */

	/*
	 * Set on every child before a relations update is applied and
	 * cleared for those the update still lists; children left marked
	 * are deleted.
	 */
	bool reported_missing;

	struct hv_pcibus *hbus;		/* owning bus */
	struct task eject_task;		/* runs hv_eject_device_work() */

	TAILQ_HEAD(, hv_irq_desc) irq_desc_list;

	/*
	 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then
	 * read it back, for each of the BAR offsets within config space.
	 */
	uint32_t probed_bar[MAX_NUM_BARS];
};
383 
/*
 * Tracks "Device Relations" messages from the host, which must be
 * processed in order.
 *
 * One hv_dr_work is allocated per queued task; the task handler reads `bus'
 * and frees the work item.
 */
struct hv_dr_work {
	struct task task;
	struct hv_pcibus *bus;
};
392 
/*
 * Private copy of one bus relations message, queued on hv_pcibus.dr_list
 * until the task handler consumes it.  `func' holds device_count trailing
 * descriptors.
 */
struct hv_dr_state {
	TAILQ_ENTRY(hv_dr_state) link;
	uint32_t device_count;
	struct pci_func_desc func[0];
};
398 
/*
 * One interrupt descriptor owned by a child function, linked on
 * hv_pci_dev.irq_desc_list.  `desc' is what gets sent back to the host when
 * the descriptor is freed.
 */
struct hv_irq_desc {
	TAILQ_ENTRY(hv_irq_desc) link;
	struct tran_int_desc desc;
	int irq;
};
404 
/*
 * devfn packing helpers: a devfn keeps the 5-bit slot number in bits 7:3
 * and the 3-bit function number in bits 2:0.
 */
#define PCI_DEVFN(slot, func)   ((((slot) & 0x1f) << 3) | ((func) & 0x07))
#define PCI_SLOT(devfn)         (((devfn) >> 3) & 0x1f)
#define PCI_FUNC(devfn)         ((devfn) & 0x07)
408 
409 static uint32_t
410 devfn_to_wslot(unsigned int devfn)
411 {
412 	union win_slot_encoding wslot;
413 
414 	wslot.val = 0;
415 	wslot.bits.slot = PCI_SLOT(devfn);
416 	wslot.bits.func = PCI_FUNC(devfn);
417 
418 	return (wslot.val);
419 }
420 
421 static unsigned int
422 wslot_to_devfn(uint32_t wslot)
423 {
424 	union win_slot_encoding encoding;
425 	unsigned int slot;
426 	unsigned int func;
427 
428 	encoding.val = wslot;
429 
430 	slot = encoding.bits.slot;
431 	func = encoding.bits.func;
432 
433 	return (PCI_DEVFN(slot, func));
434 }
435 
/* Per-device softc of the vmbus_pcib driver. */
struct vmbus_pcib_softc {
	struct vmbus_channel	*chan;	/* channel used for all host messages */
	void *rx_buf;			/* default receive buffer of the channel callback */

	struct taskqueue	*taskq;	/* runs the relations-update tasks */

	struct hv_pcibus	*hbus;	/* bus state handed to the channel callback */
};
444 
/*
 * {44C4F61D-4444-4400-9D52-802E27EDE19F}
 *
 * The GUID is stored as raw bytes with its first three groups byte-swapped
 * relative to the textual form above.
 */
static const struct hyperv_guid g_pass_through_dev_type = {
	.hv_guid = {0x1D, 0xF6, 0xC4, 0x44, 0x44, 0x44, 0x00, 0x44,
	    0x9D, 0x52, 0x80, 0x2E, 0x27, 0xED, 0xE1, 0x9F}
};
450 
/*
 * Completion context for requests that only need the host's status
 * (see hv_pci_generic_compl()).
 */
struct hv_pci_compl {
	struct completion host_event;	/* signalled when the response arrives */
	int32_t completion_status;	/* host status, or -1 for a short response */
};
455 
/* Completion context for a PCI_QUERY_RESOURCE_REQUIREMENTS request. */
struct q_res_req_compl {
	struct completion host_event;	/* signalled when the response arrives */
	struct hv_pci_dev *hpdev;	/* child whose probed_bar[] gets filled in */
};
460 
/*
 * Completion context for a create-interrupt request: the generic status
 * plus the interrupt descriptor returned by the host.
 */
struct compose_comp_ctxt {
	struct hv_pci_compl comp_pkt;
	struct tran_int_desc int_desc;
};
465 
466 /*
467  * It is possible the device is revoked during initialization.
468  * Check if this happens during wait.
469  * Return: 0 if response arrived, ENODEV if device revoked.
470  */
471 static int
472 wait_for_response(struct hv_pcibus *hbus, struct completion *c)
473 {
474 	do {
475 		if (vmbus_chan_is_revoked(hbus->sc->chan)) {
476 			device_printf(hbus->pcib,
477 			    "The device is revoked.\n");
478 			return (ENODEV);
479 		}
480 	} while (wait_for_completion_timeout(c, hz /10) != 0);
481 
482 	return 0;
483 }
484 
485 static void
486 hv_pci_generic_compl(void *context, struct pci_response *resp,
487     int resp_packet_size)
488 {
489 	struct hv_pci_compl *comp_pkt = context;
490 
491 	if (resp_packet_size >= sizeof(struct pci_response))
492 		comp_pkt->completion_status = resp->status;
493 	else
494 		comp_pkt->completion_status = -1;
495 
496 	complete(&comp_pkt->host_event);
497 }
498 
499 static void
500 q_resource_requirements(void *context, struct pci_response *resp,
501     int resp_packet_size)
502 {
503 	struct q_res_req_compl *completion = context;
504 	struct pci_q_res_req_response *q_res_req =
505 	    (struct pci_q_res_req_response *)resp;
506 	int i;
507 
508 	if (resp->status < 0) {
509 		printf("vmbus_pcib: failed to query resource requirements\n");
510 	} else {
511 		for (i = 0; i < MAX_NUM_BARS; i++)
512 			completion->hpdev->probed_bar[i] =
513 			    q_res_req->probed_bar[i];
514 	}
515 
516 	complete(&completion->host_event);
517 }
518 
519 static void
520 hv_pci_compose_compl(void *context, struct pci_response *resp,
521     int resp_packet_size)
522 {
523 	struct compose_comp_ctxt *comp_pkt = context;
524 	struct pci_create_int_response *int_resp =
525 	    (struct pci_create_int_response *)resp;
526 
527 	comp_pkt->comp_pkt.completion_status = resp->status;
528 	comp_pkt->int_desc = int_resp->int_desc;
529 	complete(&comp_pkt->comp_pkt.host_event);
530 }
531 
532 static void
533 hv_int_desc_free(struct hv_pci_dev *hpdev, struct hv_irq_desc *hid)
534 {
535 	struct pci_delete_interrupt *int_pkt;
536 	struct {
537 		struct pci_packet pkt;
538 		uint8_t buffer[sizeof(struct pci_delete_interrupt)];
539 	} ctxt;
540 
541 	memset(&ctxt, 0, sizeof(ctxt));
542 	int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
543 	int_pkt->message_type.type = PCI_DELETE_INTERRUPT_MESSAGE;
544 	int_pkt->wslot.val = hpdev->desc.wslot.val;
545 	int_pkt->int_desc = hid->desc;
546 
547 	vmbus_chan_send(hpdev->hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
548 	    int_pkt, sizeof(*int_pkt), 0);
549 
550 	free(hid, M_DEVBUF);
551 }
552 
553 static void
554 hv_pci_delete_device(struct hv_pci_dev *hpdev)
555 {
556 	struct hv_pcibus *hbus = hpdev->hbus;
557 	struct hv_irq_desc *hid, *tmp_hid;
558 	device_t pci_dev;
559 	int devfn;
560 
561 	devfn = wslot_to_devfn(hpdev->desc.wslot.val);
562 
563 	mtx_lock(&Giant);
564 
565 	pci_dev = pci_find_dbsf(hbus->pci_domain,
566 	    0, PCI_SLOT(devfn), PCI_FUNC(devfn));
567 	if (pci_dev)
568 		device_delete_child(hbus->pci_bus, pci_dev);
569 
570 	mtx_unlock(&Giant);
571 
572 	mtx_lock(&hbus->device_list_lock);
573 	TAILQ_REMOVE(&hbus->children, hpdev, link);
574 	mtx_unlock(&hbus->device_list_lock);
575 
576 	TAILQ_FOREACH_SAFE(hid, &hpdev->irq_desc_list, link, tmp_hid)
577 		hv_int_desc_free(hpdev, hid);
578 
579 	free(hpdev, M_DEVBUF);
580 }
581 
582 static struct hv_pci_dev *
583 new_pcichild_device(struct hv_pcibus *hbus, struct pci_func_desc *desc)
584 {
585 	struct hv_pci_dev *hpdev;
586 	struct pci_child_message *res_req;
587 	struct q_res_req_compl comp_pkt;
588 	struct {
589 		struct pci_packet pkt;
590 		uint8_t buffer[sizeof(struct pci_child_message)];
591 	} ctxt;
592 	int ret;
593 
594 	hpdev = malloc(sizeof(*hpdev), M_DEVBUF, M_WAITOK | M_ZERO);
595 	hpdev->hbus = hbus;
596 
597 	TAILQ_INIT(&hpdev->irq_desc_list);
598 
599 	init_completion(&comp_pkt.host_event);
600 	comp_pkt.hpdev = hpdev;
601 
602 	ctxt.pkt.compl_ctxt = &comp_pkt;
603 	ctxt.pkt.completion_func = q_resource_requirements;
604 
605 	res_req = (struct pci_child_message *)&ctxt.pkt.message;
606 	res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS;
607 	res_req->wslot.val = desc->wslot.val;
608 
609 	ret = vmbus_chan_send(hbus->sc->chan,
610 	    VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
611 	    res_req, sizeof(*res_req), (uint64_t)(uintptr_t)&ctxt.pkt);
612 	if (ret)
613 		goto err;
614 
615 	if (wait_for_response(hbus, &comp_pkt.host_event))
616 		goto err;
617 
618 	free_completion(&comp_pkt.host_event);
619 
620 	hpdev->desc = *desc;
621 
622 	mtx_lock(&hbus->device_list_lock);
623 	if (TAILQ_EMPTY(&hbus->children))
624 		hbus->pci_domain = desc->ser & 0xFFFF;
625 	TAILQ_INSERT_TAIL(&hbus->children, hpdev, link);
626 	mtx_unlock(&hbus->device_list_lock);
627 	return (hpdev);
628 err:
629 	free_completion(&comp_pkt.host_event);
630 	free(hpdev, M_DEVBUF);
631 	return (NULL);
632 }
633 
634 #if __FreeBSD_version < 1100000
635 
636 /* Old versions don't have BUS_RESCAN(). Let's copy it from FreeBSD 11. */
637 
638 static struct pci_devinfo *
639 pci_identify_function(device_t pcib, device_t dev, int domain, int busno,
640     int slot, int func, size_t dinfo_size)
641 {
642 	struct pci_devinfo *dinfo;
643 
644 	dinfo = pci_read_device(pcib, domain, busno, slot, func, dinfo_size);
645 	if (dinfo != NULL)
646 		pci_add_child(dev, dinfo);
647 
648 	return (dinfo);
649 }
650 
/*
 * Rescan PCI bus `dev': add a child for every function that now answers in
 * config space but has no device_t yet, delete children whose function was
 * not seen during the scan, and finally probe/attach the newly added
 * children.
 *
 * Return: 0 on success, the device_get_children() error, or ENOMEM if the
 * bookkeeping array cannot be allocated.
 */
static int
pci_rescan(device_t dev)
{
#define	REG(n, w)	PCIB_READ_CONFIG(pcib, busno, s, f, n, w)
	device_t pcib = device_get_parent(dev);
	struct pci_softc *sc;
	device_t child, *devlist, *unchanged;
	int devcount, error, i, j, maxslots, oldcount;
	int busno, domain, s, f, pcifunchigh;
	uint8_t hdrtype;

	/* No need to check for ARI on a rescan. */
	error = device_get_children(dev, &devlist, &devcount);
	if (error)
		return (error);
	/*
	 * unchanged[i] is set to devlist[i] when that child is seen again
	 * during the scan below; entries left NULL mark vanished devices.
	 */
	if (devcount != 0) {
		unchanged = malloc(devcount * sizeof(device_t), M_TEMP,
		    M_NOWAIT | M_ZERO);
		if (unchanged == NULL) {
			free(devlist, M_TEMP);
			return (ENOMEM);
		}
	} else
		unchanged = NULL;

	sc = device_get_softc(dev);
	domain = pcib_get_domain(dev);
	busno = pcib_get_bus(dev);
	maxslots = PCIB_MAXSLOTS(pcib);
	for (s = 0; s <= maxslots; s++) {
		/* If function 0 is not present, skip to the next slot. */
		f = 0;
		if (REG(PCIR_VENDOR, 2) == 0xffff)
			continue;
		pcifunchigh = 0;
		hdrtype = REG(PCIR_HDRTYPE, 1);
		if ((hdrtype & PCIM_HDRTYPE) > PCI_MAXHDRTYPE)
			continue;
		/* Only multi-function devices have functions above 0. */
		if (hdrtype & PCIM_MFDEV)
			pcifunchigh = PCIB_MAXFUNCS(pcib);
		for (f = 0; f <= pcifunchigh; f++) {
			if (REG(PCIR_VENDOR, 2) == 0xffff)
				continue;

			/*
			 * Found a valid function.  Check if a
			 * device_t for this device already exists.
			 */
			for (i = 0; i < devcount; i++) {
				child = devlist[i];
				if (child == NULL)
					continue;
				if (pci_get_slot(child) == s &&
				    pci_get_function(child) == f) {
					unchanged[i] = child;
					goto next_func;
				}
			}

			pci_identify_function(pcib, dev, domain, busno, s, f,
			    sizeof(struct pci_devinfo));
		next_func:;
		}
	}

	/* Remove devices that are no longer present. */
	for (i = 0; i < devcount; i++) {
		if (unchanged[i] != NULL)
			continue;
		device_delete_child(dev, devlist[i]);
	}

	free(devlist, M_TEMP);
	oldcount = devcount;

	/* Try to attach the devices just added. */
	error = device_get_children(dev, &devlist, &devcount);
	if (error) {
		free(unchanged, M_TEMP);
		return (error);
	}

	/* Children found in unchanged[] predate the scan; skip them. */
	for (i = 0; i < devcount; i++) {
		for (j = 0; j < oldcount; j++) {
			if (devlist[i] == unchanged[j])
				goto next_device;
		}

		device_probe_and_attach(devlist[i]);
	next_device:;
	}

	free(unchanged, M_TEMP);
	free(devlist, M_TEMP);
	return (0);
#undef REG
}
748 
749 #else
750 
751 static int
752 pci_rescan(device_t dev)
753 {
754 	return (BUS_RESCAN(dev));
755 }
756 
757 #endif
758 
759 static void
760 pci_devices_present_work(void *arg, int pending __unused)
761 {
762 	struct hv_dr_work *dr_wrk = arg;
763 	struct hv_dr_state *dr = NULL;
764 	struct hv_pcibus *hbus;
765 	uint32_t child_no;
766 	bool found;
767 	struct pci_func_desc *new_desc;
768 	struct hv_pci_dev *hpdev, *tmp_hpdev;
769 	struct completion *query_comp;
770 	bool need_rescan = false;
771 
772 	hbus = dr_wrk->bus;
773 	free(dr_wrk, M_DEVBUF);
774 
775 	/* Pull this off the queue and process it if it was the last one. */
776 	mtx_lock(&hbus->device_list_lock);
777 	while (!TAILQ_EMPTY(&hbus->dr_list)) {
778 		dr = TAILQ_FIRST(&hbus->dr_list);
779 		TAILQ_REMOVE(&hbus->dr_list, dr, link);
780 
781 		/* Throw this away if the list still has stuff in it. */
782 		if (!TAILQ_EMPTY(&hbus->dr_list)) {
783 			free(dr, M_DEVBUF);
784 			continue;
785 		}
786 	}
787 	mtx_unlock(&hbus->device_list_lock);
788 
789 	if (!dr)
790 		return;
791 
792 	/* First, mark all existing children as reported missing. */
793 	mtx_lock(&hbus->device_list_lock);
794 	TAILQ_FOREACH(hpdev, &hbus->children, link)
795 		hpdev->reported_missing = true;
796 	mtx_unlock(&hbus->device_list_lock);
797 
798 	/* Next, add back any reported devices. */
799 	for (child_no = 0; child_no < dr->device_count; child_no++) {
800 		found = false;
801 		new_desc = &dr->func[child_no];
802 
803 		mtx_lock(&hbus->device_list_lock);
804 		TAILQ_FOREACH(hpdev, &hbus->children, link) {
805 			if ((hpdev->desc.wslot.val ==
806 			    new_desc->wslot.val) &&
807 			    (hpdev->desc.v_id == new_desc->v_id) &&
808 			    (hpdev->desc.d_id == new_desc->d_id) &&
809 			    (hpdev->desc.ser == new_desc->ser)) {
810 				hpdev->reported_missing = false;
811 				found = true;
812 				break;
813 			}
814 		}
815 		mtx_unlock(&hbus->device_list_lock);
816 
817 		if (!found) {
818 			if (!need_rescan)
819 				need_rescan = true;
820 
821 			hpdev = new_pcichild_device(hbus, new_desc);
822 			if (!hpdev)
823 				printf("vmbus_pcib: failed to add a child\n");
824 		}
825 	}
826 
827 	/* Remove missing device(s), if any */
828 	TAILQ_FOREACH_SAFE(hpdev, &hbus->children, link, tmp_hpdev) {
829 		if (hpdev->reported_missing)
830 			hv_pci_delete_device(hpdev);
831 	}
832 
833 	/* Rescan the bus to find any new device, if necessary. */
834 	if (hbus->state == hv_pcibus_installed && need_rescan)
835 		pci_rescan(hbus->pci_bus);
836 
837 	/* Wake up hv_pci_query_relations(), if it's waiting. */
838 	query_comp = hbus->query_comp;
839 	if (query_comp) {
840 		hbus->query_comp = NULL;
841 		complete(query_comp);
842 	}
843 
844 	free(dr, M_DEVBUF);
845 }
846 
847 static struct hv_pci_dev *
848 get_pcichild_wslot(struct hv_pcibus *hbus, uint32_t wslot)
849 {
850 	struct hv_pci_dev *hpdev, *ret = NULL;
851 
852 	mtx_lock(&hbus->device_list_lock);
853 	TAILQ_FOREACH(hpdev, &hbus->children, link) {
854 		if (hpdev->desc.wslot.val == wslot) {
855 			ret = hpdev;
856 			break;
857 		}
858 	}
859 	mtx_unlock(&hbus->device_list_lock);
860 
861 	return (ret);
862 }
863 
864 static void
865 hv_pci_devices_present(struct hv_pcibus *hbus,
866     struct pci_bus_relations *relations)
867 {
868 	struct hv_dr_state *dr;
869 	struct hv_dr_work *dr_wrk;
870 	unsigned long dr_size;
871 
872 	if (hbus->detaching && relations->device_count > 0)
873 		return;
874 
875 	dr_size = offsetof(struct hv_dr_state, func) +
876 	    (sizeof(struct pci_func_desc) * relations->device_count);
877 	dr = malloc(dr_size, M_DEVBUF, M_WAITOK | M_ZERO);
878 
879 	dr->device_count = relations->device_count;
880 	if (dr->device_count != 0)
881 		memcpy(dr->func, relations->func,
882 		    sizeof(struct pci_func_desc) * dr->device_count);
883 
884 	mtx_lock(&hbus->device_list_lock);
885 	TAILQ_INSERT_TAIL(&hbus->dr_list, dr, link);
886 	mtx_unlock(&hbus->device_list_lock);
887 
888 	dr_wrk = malloc(sizeof(*dr_wrk), M_DEVBUF, M_WAITOK | M_ZERO);
889 	dr_wrk->bus = hbus;
890 	TASK_INIT(&dr_wrk->task, 0, pci_devices_present_work, dr_wrk);
891 	taskqueue_enqueue(hbus->sc->taskq, &dr_wrk->task);
892 }
893 
894 static void
895 hv_eject_device_work(void *arg, int pending __unused)
896 {
897 	struct hv_pci_dev *hpdev = arg;
898 	union win_slot_encoding wslot = hpdev->desc.wslot;
899 	struct hv_pcibus *hbus = hpdev->hbus;
900 	struct pci_eject_response *eject_pkt;
901 	struct {
902 		struct pci_packet pkt;
903 		uint8_t buffer[sizeof(struct pci_eject_response)];
904 	} ctxt;
905 
906 	hv_pci_delete_device(hpdev);
907 
908 	memset(&ctxt, 0, sizeof(ctxt));
909 	eject_pkt = (struct pci_eject_response *)&ctxt.pkt.message;
910 	eject_pkt->message_type.type = PCI_EJECTION_COMPLETE;
911 	eject_pkt->wslot.val = wslot.val;
912 	vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
913 	    eject_pkt, sizeof(*eject_pkt), 0);
914 }
915 
916 static void
917 hv_pci_eject_device(struct hv_pci_dev *hpdev)
918 {
919 	struct hv_pcibus *hbus = hpdev->hbus;
920 	struct taskqueue *taskq;
921 
922 	if (hbus->detaching)
923 		return;
924 
925 	/*
926 	 * Push this task into the same taskqueue on which
927 	 * vmbus_pcib_attach() runs, so we're sure this task can't run
928 	 * concurrently with vmbus_pcib_attach().
929 	 */
930 	TASK_INIT(&hpdev->eject_task, 0, hv_eject_device_work, hpdev);
931 	taskq = vmbus_chan_mgmt_tq(hbus->sc->chan);
932 	taskqueue_enqueue(taskq, &hpdev->eject_task);
933 }
934 
/*
 * Initial receive size used by the channel callback; a larger temporary
 * buffer is allocated there when a packet does not fit.
 */
#define PCIB_PACKET_SIZE	0x100
936 
/*
 * Channel callback: drain every packet currently available on the channel
 * and dispatch it.
 *
 * Completion packets are routed to the completion_func of the pci_packet
 * whose address was used as the transaction id when the request was sent.
 * In-band messages are decoded by type: bus relations updates are queued
 * via hv_pci_devices_present() and eject requests via hv_pci_eject_device().
 *
 * chan: the channel to receive from.
 * arg:  the driver's struct vmbus_pcib_softc.
 */
static void
vmbus_pcib_on_channel_callback(struct vmbus_channel *chan, void *arg)
{
	struct vmbus_pcib_softc *sc = arg;
	struct hv_pcibus *hbus = sc->hbus;

	void *buffer;
	int bufferlen = PCIB_PACKET_SIZE;

	struct pci_packet *comp_packet;
	struct pci_response *response;
	struct pci_incoming_message *new_msg;
	struct pci_bus_relations *bus_rel;
	struct pci_dev_incoming *dev_msg;
	struct hv_pci_dev *hpdev;

	/*
	 * Start with the softc's receive buffer.  bufferlen greater than
	 * PCIB_PACKET_SIZE means `buffer' is a temporary allocation made
	 * below, which this function owns and must free.
	 */
	buffer = sc->rx_buf;
	do {
		/* Re-evaluated every iteration, since `buffer' may change. */
		struct vmbus_chanpkt_hdr *pkt = buffer;
		uint32_t bytes_rxed;
		int ret;

		bytes_rxed = bufferlen;
		ret = vmbus_chan_recv_pkt(chan, pkt, &bytes_rxed);

		if (ret == ENOBUFS) {
			/* Handle large packet */
			if (bufferlen > PCIB_PACKET_SIZE) {
				free(buffer, M_DEVBUF);
				buffer = NULL;
			}

			/*
			 * alloc new buffer; bytes_rxed is used as the size
			 * needed for the pending packet, and the receive is
			 * retried on the next iteration.
			 */
			buffer = malloc(bytes_rxed, M_DEVBUF, M_WAITOK | M_ZERO);
			bufferlen = bytes_rxed;

			continue;
		}

		if (ret != 0) {
			/* ignore EIO or EAGAIN */
			break;
		}

		/* Skip packets no larger than a bare response. */
		if (bytes_rxed <= sizeof(struct pci_response))
			continue;

		switch (pkt->cph_type) {
		case VMBUS_CHANPKT_TYPE_COMP:
			/* The transaction id is the sender's pci_packet. */
			comp_packet =
			    (struct pci_packet *)(uintptr_t)pkt->cph_xactid;
			response = (struct pci_response *)pkt;
			comp_packet->completion_func(comp_packet->compl_ctxt,
			    response, bytes_rxed);
			break;
		case VMBUS_CHANPKT_TYPE_INBAND:
			new_msg = (struct pci_incoming_message *)buffer;

			switch (new_msg->message_type.type) {
			case PCI_BUS_RELATIONS:
				bus_rel = (struct pci_bus_relations *)buffer;

				/* Empty updates from the host are ignored. */
				if (bus_rel->device_count == 0)
					break;

				/* Drop the message if it is truncated. */
				if (bytes_rxed <
				    offsetof(struct pci_bus_relations, func) +
				        (sizeof(struct pci_func_desc) *
				            (bus_rel->device_count)))
					break;

				hv_pci_devices_present(hbus, bus_rel);
				break;

			case PCI_EJECT:
				dev_msg = (struct pci_dev_incoming *)buffer;
				hpdev = get_pcichild_wslot(hbus,
				    dev_msg->wslot.val);

				if (hpdev)
					hv_pci_eject_device(hpdev);

				break;
			default:
				printf("vmbus_pcib: Unknown msg type 0x%x\n",
				    new_msg->message_type.type);
				break;
			}
			break;
		default:
			printf("vmbus_pcib: Unknown VMBus msg type %hd\n",
			    pkt->cph_type);
			break;
		}
	} while (1);

	/* Release the temporary large-packet buffer, if one was allocated. */
	if (bufferlen > PCIB_PACKET_SIZE)
		free(buffer, M_DEVBUF);
}
1036 
1037 static int
1038 hv_pci_protocol_negotiation(struct hv_pcibus *hbus)
1039 {
1040 	struct pci_version_request *version_req;
1041 	struct hv_pci_compl comp_pkt;
1042 	struct {
1043 		struct pci_packet pkt;
1044 		uint8_t buffer[sizeof(struct pci_version_request)];
1045 	} ctxt;
1046 	int ret;
1047 
1048 	init_completion(&comp_pkt.host_event);
1049 
1050 	ctxt.pkt.completion_func = hv_pci_generic_compl;
1051 	ctxt.pkt.compl_ctxt = &comp_pkt;
1052 	version_req = (struct pci_version_request *)&ctxt.pkt.message;
1053 	version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION;
1054 	version_req->protocol_version = PCI_PROTOCOL_VERSION_CURRENT;
1055 	version_req->is_last_attempt = 1;
1056 
1057 	ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND,
1058 	    VMBUS_CHANPKT_FLAG_RC, version_req, sizeof(*version_req),
1059 	    (uint64_t)(uintptr_t)&ctxt.pkt);
1060 	if (!ret)
1061 		ret = wait_for_response(hbus, &comp_pkt.host_event);
1062 
1063 	if (ret) {
1064 		device_printf(hbus->pcib,
1065 		    "vmbus_pcib failed to request version: %d\n",
1066 		    ret);
1067 		goto out;
1068 	}
1069 
1070 	if (comp_pkt.completion_status < 0) {
1071 		device_printf(hbus->pcib,
1072 		    "vmbus_pcib version negotiation failed: %x\n",
1073 		    comp_pkt.completion_status);
1074 		ret = EPROTO;
1075 	} else {
1076 		ret = 0;
1077 	}
1078 out:
1079 	free_completion(&comp_pkt.host_event);
1080 	return (ret);
1081 }
1082 
1083 /* Ask the host to send along the list of child devices */
1084 static int
1085 hv_pci_query_relations(struct hv_pcibus *hbus)
1086 {
1087 	struct pci_message message;
1088 	int ret;
1089 
1090 	message.type = PCI_QUERY_BUS_RELATIONS;
1091 	ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
1092 	    &message, sizeof(message), 0);
1093 	return (ret);
1094 }
1095 
1096 static int
1097 hv_pci_enter_d0(struct hv_pcibus *hbus)
1098 {
1099 	struct pci_bus_d0_entry *d0_entry;
1100 	struct hv_pci_compl comp_pkt;
1101 	struct {
1102 		struct pci_packet pkt;
1103 		uint8_t buffer[sizeof(struct pci_bus_d0_entry)];
1104 	} ctxt;
1105 	int ret;
1106 
1107 	/*
1108 	 * Tell the host that the bus is ready to use, and moved into the
1109 	 * powered-on state.  This includes telling the host which region
1110 	 * of memory-mapped I/O space has been chosen for configuration space
1111 	 * access.
1112 	 */
1113 	init_completion(&comp_pkt.host_event);
1114 
1115 	ctxt.pkt.completion_func = hv_pci_generic_compl;
1116 	ctxt.pkt.compl_ctxt = &comp_pkt;
1117 
1118 	d0_entry = (struct pci_bus_d0_entry *)&ctxt.pkt.message;
1119 	memset(d0_entry, 0, sizeof(*d0_entry));
1120 	d0_entry->message_type.type = PCI_BUS_D0ENTRY;
1121 	d0_entry->mmio_base = rman_get_start(hbus->cfg_res);
1122 
1123 	ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND,
1124 	    VMBUS_CHANPKT_FLAG_RC, d0_entry, sizeof(*d0_entry),
1125 	    (uint64_t)(uintptr_t)&ctxt.pkt);
1126 	if (!ret)
1127 		ret = wait_for_response(hbus, &comp_pkt.host_event);
1128 
1129 	if (ret)
1130 		goto out;
1131 
1132 	if (comp_pkt.completion_status < 0) {
1133 		device_printf(hbus->pcib, "vmbus_pcib failed to enable D0\n");
1134 		ret = EPROTO;
1135 	} else {
1136 		ret = 0;
1137 	}
1138 
1139 out:
1140 	free_completion(&comp_pkt.host_event);
1141 	return (ret);
1142 }
1143 
1144 /*
1145  * It looks this is only needed by Windows VM, but let's send the message too
1146  * just to make the host happy.
1147  */
1148 static int
1149 hv_send_resources_allocated(struct hv_pcibus *hbus)
1150 {
1151 	struct pci_resources_assigned *res_assigned;
1152 	struct hv_pci_compl comp_pkt;
1153 	struct hv_pci_dev *hpdev;
1154 	struct pci_packet *pkt;
1155 	uint32_t wslot;
1156 	int ret = 0;
1157 
1158 	pkt = malloc(sizeof(*pkt) + sizeof(*res_assigned),
1159 	    M_DEVBUF, M_WAITOK | M_ZERO);
1160 
1161 	for (wslot = 0; wslot < 256; wslot++) {
1162 		hpdev = get_pcichild_wslot(hbus, wslot);
1163 		if (!hpdev)
1164 			continue;
1165 
1166 		init_completion(&comp_pkt.host_event);
1167 
1168 		memset(pkt, 0, sizeof(*pkt) + sizeof(*res_assigned));
1169 		pkt->completion_func = hv_pci_generic_compl;
1170 		pkt->compl_ctxt = &comp_pkt;
1171 
1172 		res_assigned = (struct pci_resources_assigned *)&pkt->message;
1173 		res_assigned->message_type.type = PCI_RESOURCES_ASSIGNED;
1174 		res_assigned->wslot.val = hpdev->desc.wslot.val;
1175 
1176 		ret = vmbus_chan_send(hbus->sc->chan,
1177 		    VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
1178 		    &pkt->message, sizeof(*res_assigned),
1179 		    (uint64_t)(uintptr_t)pkt);
1180 		if (!ret)
1181 			ret = wait_for_response(hbus, &comp_pkt.host_event);
1182 
1183 		free_completion(&comp_pkt.host_event);
1184 
1185 		if (ret)
1186 			break;
1187 
1188 		if (comp_pkt.completion_status < 0) {
1189 			ret = EPROTO;
1190 			device_printf(hbus->pcib,
1191 			    "failed to send PCI_RESOURCES_ASSIGNED\n");
1192 			break;
1193 		}
1194 	}
1195 
1196 	free(pkt, M_DEVBUF);
1197 	return (ret);
1198 }
1199 
1200 static int
1201 hv_send_resources_released(struct hv_pcibus *hbus)
1202 {
1203 	struct pci_child_message pkt;
1204 	struct hv_pci_dev *hpdev;
1205 	uint32_t wslot;
1206 	int ret;
1207 
1208 	for (wslot = 0; wslot < 256; wslot++) {
1209 		hpdev = get_pcichild_wslot(hbus, wslot);
1210 		if (!hpdev)
1211 			continue;
1212 
1213 		pkt.message_type.type = PCI_RESOURCES_RELEASED;
1214 		pkt.wslot.val = hpdev->desc.wslot.val;
1215 
1216 		ret = vmbus_chan_send(hbus->sc->chan,
1217 		    VMBUS_CHANPKT_TYPE_INBAND, 0, &pkt, sizeof(pkt), 0);
1218 		if (ret)
1219 			return (ret);
1220 	}
1221 
1222 	return (0);
1223 }
1224 
1225 #define hv_cfg_read(x, s)						\
1226 static inline uint##x##_t hv_cfg_read_##s(struct hv_pcibus *bus,	\
1227     bus_size_t offset)							\
1228 {									\
1229 	return (bus_read_##s(bus->cfg_res, offset));			\
1230 }
1231 
1232 #define hv_cfg_write(x, s)						\
1233 static inline void hv_cfg_write_##s(struct hv_pcibus *bus,		\
1234     bus_size_t offset, uint##x##_t val)					\
1235 {									\
1236 	return (bus_write_##s(bus->cfg_res, offset, val));		\
1237 }
1238 
1239 hv_cfg_read(8, 1)
1240 hv_cfg_read(16, 2)
1241 hv_cfg_read(32, 4)
1242 
1243 hv_cfg_write(8, 1)
1244 hv_cfg_write(16, 2)
1245 hv_cfg_write(32, 4)
1246 
1247 static void
1248 _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where, int size,
1249     uint32_t *val)
1250 {
1251 	struct hv_pcibus *hbus = hpdev->hbus;
1252 	bus_size_t addr = CFG_PAGE_OFFSET + where;
1253 
1254 	/*
1255 	 * If the attempt is to read the IDs or the ROM BAR, simulate that.
1256 	 */
1257 	if (where + size <= PCIR_COMMAND) {
1258 		memcpy(val, ((uint8_t *)&hpdev->desc.v_id) + where, size);
1259 	} else if (where >= PCIR_REVID && where + size <=
1260 		   PCIR_CACHELNSZ) {
1261 		memcpy(val, ((uint8_t *)&hpdev->desc.rev) + where -
1262 		       PCIR_REVID, size);
1263 	} else if (where >= PCIR_SUBVEND_0 && where + size <=
1264 		   PCIR_BIOS) {
1265 		memcpy(val, (uint8_t *)&hpdev->desc.subsystem_id + where -
1266 		       PCIR_SUBVEND_0, size);
1267 	} else if (where >= PCIR_BIOS && where + size <=
1268 		   PCIR_CAP_PTR) {
1269 		/* ROM BARs are unimplemented */
1270 		*val = 0;
1271 	} else if ((where >= PCIR_INTLINE && where + size <=
1272 		   PCIR_INTPIN) ||(where == PCIR_INTPIN && size == 1)) {
1273 		/*
1274 		 * Interrupt Line and Interrupt PIN are hard-wired to zero
1275 		 * because this front-end only supports message-signaled
1276 		 * interrupts.
1277 		 */
1278 		*val = 0;
1279 	} else if (where + size <= CFG_PAGE_SIZE) {
1280 		mtx_lock(&hbus->config_lock);
1281 
1282 		/* Choose the function to be read. */
1283 		hv_cfg_write_4(hbus, 0, hpdev->desc.wslot.val);
1284 
1285 		/* Make sure the function was chosen before we start reading.*/
1286 		mb();
1287 
1288 		/* Read from that function's config space. */
1289 		switch (size) {
1290 		case 1:
1291 			*((uint8_t *)val) = hv_cfg_read_1(hbus, addr);
1292 			break;
1293 		case 2:
1294 			*((uint16_t *)val) = hv_cfg_read_2(hbus, addr);
1295 			break;
1296 		default:
1297 			*((uint32_t *)val) = hv_cfg_read_4(hbus, addr);
1298 			break;
1299 		}
1300 		/*
1301 		 * Make sure the write was done before we release the lock,
1302 		 * allowing consecutive reads/writes.
1303 		 */
1304 		mb();
1305 
1306 		mtx_unlock(&hbus->config_lock);
1307 	} else {
1308 		/* Invalid config read: it's unlikely to reach here. */
1309 		memset(val, 0, size);
1310 	}
1311 }
1312 
1313 static void
1314 _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where, int size,
1315     uint32_t val)
1316 {
1317 	struct hv_pcibus *hbus = hpdev->hbus;
1318 	bus_size_t addr = CFG_PAGE_OFFSET + where;
1319 
1320 	/* SSIDs and ROM BARs are read-only */
1321 	if (where >= PCIR_SUBVEND_0 && where + size <= PCIR_CAP_PTR)
1322 		return;
1323 
1324 	if (where >= PCIR_COMMAND && where + size <= CFG_PAGE_SIZE) {
1325 		mtx_lock(&hbus->config_lock);
1326 
1327 		/* Choose the function to be written. */
1328 		hv_cfg_write_4(hbus, 0, hpdev->desc.wslot.val);
1329 
1330 		/* Make sure the function was chosen before we start writing.*/
1331 		wmb();
1332 
1333 		/* Write to that function's config space. */
1334 		switch (size) {
1335 		case 1:
1336 			hv_cfg_write_1(hbus, addr, (uint8_t)val);
1337 			break;
1338 		case 2:
1339 			hv_cfg_write_2(hbus, addr, (uint16_t)val);
1340 			break;
1341 		default:
1342 			hv_cfg_write_4(hbus, addr, (uint32_t)val);
1343 			break;
1344 		}
1345 
1346 		/*
1347 		 * Make sure the write was done before we release the lock,
1348 		 * allowing consecutive reads/writes.
1349 		 */
1350 		mb();
1351 
1352 		mtx_unlock(&hbus->config_lock);
1353 	} else {
1354 		/* Invalid config write: it's unlikely to reach here. */
1355 		return;
1356 	}
1357 }
1358 
1359 static void
1360 vmbus_pcib_set_detaching(void *arg, int pending __unused)
1361 {
1362 	struct hv_pcibus *hbus = arg;
1363 
1364 	atomic_set_int(&hbus->detaching, 1);
1365 }
1366 
1367 static void
1368 vmbus_pcib_pre_detach(struct hv_pcibus *hbus)
1369 {
1370 	struct task task;
1371 
1372 	TASK_INIT(&task, 0, vmbus_pcib_set_detaching, hbus);
1373 
1374 	/*
1375 	 * Make sure the channel callback won't push any possible new
1376 	 * PCI_BUS_RELATIONS and PCI_EJECT tasks to sc->taskq.
1377 	 */
1378 	vmbus_chan_run_task(hbus->sc->chan, &task);
1379 
1380 	taskqueue_drain_all(hbus->sc->taskq);
1381 }
1382 
1383 
1384 /*
1385  * Standard probe entry point.
1386  *
1387  */
1388 static int
1389 vmbus_pcib_probe(device_t dev)
1390 {
1391 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
1392 	    &g_pass_through_dev_type) == 0) {
1393 		device_set_desc(dev, "Hyper-V PCI Express Pass Through");
1394 		return (BUS_PROBE_DEFAULT);
1395 	}
1396 	return (ENXIO);
1397 }
1398 
1399 /*
1400  * Standard attach entry point.
1401  *
1402  */
1403 static int
1404 vmbus_pcib_attach(device_t dev)
1405 {
1406 	const int pci_ring_size = (4 * PAGE_SIZE);
1407 	const struct hyperv_guid *inst_guid;
1408 	struct vmbus_channel *channel;
1409 	struct vmbus_pcib_softc *sc;
1410 	struct hv_pcibus *hbus;
1411 	int rid = 0;
1412 	int ret;
1413 
1414 	hbus = malloc(sizeof(*hbus), M_DEVBUF, M_WAITOK | M_ZERO);
1415 	hbus->pcib = dev;
1416 
1417 	channel = vmbus_get_channel(dev);
1418 	inst_guid = vmbus_chan_guid_inst(channel);
1419 	hbus->pci_domain = inst_guid->hv_guid[9] |
1420 			  (inst_guid->hv_guid[8] << 8);
1421 
1422 	mtx_init(&hbus->config_lock, "hbcfg", NULL, MTX_DEF);
1423 	mtx_init(&hbus->device_list_lock, "hbdl", NULL, MTX_DEF);
1424 	TAILQ_INIT(&hbus->children);
1425 	TAILQ_INIT(&hbus->dr_list);
1426 
1427 	hbus->cfg_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid,
1428 	    0, RM_MAX_END, PCI_CONFIG_MMIO_LENGTH,
1429 	    RF_ACTIVE | rman_make_alignment_flags(PAGE_SIZE));
1430 
1431 	if (!hbus->cfg_res) {
1432 		device_printf(dev, "failed to get resource for cfg window\n");
1433 		ret = ENXIO;
1434 		goto free_bus;
1435 	}
1436 
1437 	sc = device_get_softc(dev);
1438 	sc->chan = channel;
1439 	sc->rx_buf = malloc(PCIB_PACKET_SIZE, M_DEVBUF, M_WAITOK | M_ZERO);
1440 	sc->hbus = hbus;
1441 
1442 	/*
1443 	 * The taskq is used to handle PCI_BUS_RELATIONS and PCI_EJECT
1444 	 * messages. NB: we can't handle the messages in the channel callback
1445 	 * directly, because the message handlers need to send new messages
1446 	 * to the host and waits for the host's completion messages, which
1447 	 * must also be handled by the channel callback.
1448 	 */
1449 	sc->taskq = taskqueue_create("vmbus_pcib_tq", M_WAITOK,
1450 	    taskqueue_thread_enqueue, &sc->taskq);
1451 	taskqueue_start_threads(&sc->taskq, 1, PI_NET, "vmbus_pcib_tq");
1452 
1453 	hbus->sc = sc;
1454 
1455 	init_completion(&hbus->query_completion);
1456 	hbus->query_comp = &hbus->query_completion;
1457 
1458 	ret = vmbus_chan_open(sc->chan, pci_ring_size, pci_ring_size,
1459 		NULL, 0, vmbus_pcib_on_channel_callback, sc);
1460 	if (ret)
1461 		goto free_res;
1462 
1463 	ret = hv_pci_protocol_negotiation(hbus);
1464 	if (ret)
1465 		goto vmbus_close;
1466 
1467 	ret = hv_pci_query_relations(hbus);
1468 	if (!ret)
1469 		ret = wait_for_response(hbus, hbus->query_comp);
1470 
1471 	if (ret)
1472 		goto vmbus_close;
1473 
1474 	ret = hv_pci_enter_d0(hbus);
1475 	if (ret)
1476 		goto vmbus_close;
1477 
1478 	ret = hv_send_resources_allocated(hbus);
1479 	if (ret)
1480 		goto vmbus_close;
1481 
1482 	hbus->pci_bus = device_add_child(dev, "pci", -1);
1483 	if (!hbus->pci_bus) {
1484 		device_printf(dev, "failed to create pci bus\n");
1485 		ret = ENXIO;
1486 		goto vmbus_close;
1487 	}
1488 
1489 	bus_generic_attach(dev);
1490 
1491 	hbus->state = hv_pcibus_installed;
1492 
1493 	return (0);
1494 
1495 vmbus_close:
1496 	vmbus_pcib_pre_detach(hbus);
1497 	vmbus_chan_close(sc->chan);
1498 free_res:
1499 	taskqueue_free(sc->taskq);
1500 	free_completion(&hbus->query_completion);
1501 	free(sc->rx_buf, M_DEVBUF);
1502 	bus_release_resource(dev, SYS_RES_MEMORY, 0, hbus->cfg_res);
1503 free_bus:
1504 	mtx_destroy(&hbus->device_list_lock);
1505 	mtx_destroy(&hbus->config_lock);
1506 	free(hbus, M_DEVBUF);
1507 	return (ret);
1508 }
1509 
1510 /*
1511  * Standard detach entry point
1512  */
1513 static int
1514 vmbus_pcib_detach(device_t dev)
1515 {
1516 	struct vmbus_pcib_softc *sc = device_get_softc(dev);
1517 	struct hv_pcibus *hbus = sc->hbus;
1518 	struct pci_message teardown_packet;
1519 	struct pci_bus_relations relations;
1520 	int ret;
1521 
1522 	vmbus_pcib_pre_detach(hbus);
1523 
1524 	if (hbus->state == hv_pcibus_installed)
1525 		bus_generic_detach(dev);
1526 
1527 	/* Delete any children which might still exist. */
1528 	memset(&relations, 0, sizeof(relations));
1529 	hv_pci_devices_present(hbus, &relations);
1530 
1531 	ret = hv_send_resources_released(hbus);
1532 	if (ret)
1533 		device_printf(dev, "failed to send PCI_RESOURCES_RELEASED\n");
1534 
1535 	teardown_packet.type = PCI_BUS_D0EXIT;
1536 	ret = vmbus_chan_send(sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
1537 	    &teardown_packet, sizeof(struct pci_message), 0);
1538 	if (ret)
1539 		device_printf(dev, "failed to send PCI_BUS_D0EXIT\n");
1540 
1541 	taskqueue_drain_all(hbus->sc->taskq);
1542 	vmbus_chan_close(sc->chan);
1543 	taskqueue_free(sc->taskq);
1544 
1545 	free_completion(&hbus->query_completion);
1546 	free(sc->rx_buf, M_DEVBUF);
1547 	bus_release_resource(dev, SYS_RES_MEMORY, 0, hbus->cfg_res);
1548 
1549 	mtx_destroy(&hbus->device_list_lock);
1550 	mtx_destroy(&hbus->config_lock);
1551 	free(hbus, M_DEVBUF);
1552 
1553 	return (0);
1554 }
1555 
1556 static int
1557 vmbus_pcib_read_ivar(device_t dev, device_t child, int which, uintptr_t *val)
1558 {
1559 	struct vmbus_pcib_softc *sc = device_get_softc(dev);
1560 
1561 	switch (which) {
1562 	case PCIB_IVAR_DOMAIN:
1563 		*val = sc->hbus->pci_domain;
1564 		return (0);
1565 
1566 	case PCIB_IVAR_BUS:
1567 		/* There is only bus 0. */
1568 		*val = 0;
1569 		return (0);
1570 	}
1571 	return (ENOENT);
1572 }
1573 
1574 static int
1575 vmbus_pcib_write_ivar(device_t dev, device_t child, int which, uintptr_t val)
1576 {
1577 	return (ENOENT);
1578 }
1579 
1580 static struct resource *
1581 vmbus_pcib_alloc_resource(device_t dev, device_t child, int type, int *rid,
1582 	rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
1583 {
1584 	unsigned int bar_no;
1585 	struct hv_pci_dev *hpdev;
1586 	struct vmbus_pcib_softc *sc = device_get_softc(dev);
1587 	struct resource *res;
1588 	unsigned int devfn;
1589 
1590 	if (type == PCI_RES_BUS)
1591 		return (pci_domain_alloc_bus(sc->hbus->pci_domain, child, rid,
1592 		    start, end, count, flags));
1593 
1594 	/* Devices with port I/O BAR are not supported. */
1595 	if (type == SYS_RES_IOPORT)
1596 		return (NULL);
1597 
1598 	if (type == SYS_RES_MEMORY) {
1599 		devfn = PCI_DEVFN(pci_get_slot(child),
1600 		    pci_get_function(child));
1601 		hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn));
1602 		if (!hpdev)
1603 			return (NULL);
1604 
1605 		bar_no = PCI_RID2BAR(*rid);
1606 		if (bar_no >= MAX_NUM_BARS)
1607 			return (NULL);
1608 
1609 		/* Make sure a 32-bit BAR gets a 32-bit address */
1610 		if (!(hpdev->probed_bar[bar_no] & PCIM_BAR_MEM_64))
1611 			end = ulmin(end, 0xFFFFFFFF);
1612 	}
1613 
1614 	res = bus_generic_alloc_resource(dev, child, type, rid,
1615 		start, end, count, flags);
1616 	/*
1617 	 * If this is a request for a specific range, assume it is
1618 	 * correct and pass it up to the parent.
1619 	 */
1620 	if (res == NULL && start + count - 1 == end)
1621 		res = bus_generic_alloc_resource(dev, child, type, rid,
1622 		    start, end, count, flags);
1623 	return (res);
1624 }
1625 
1626 static int
1627 vmbus_pcib_release_resource(device_t dev, device_t child, int type, int rid,
1628     struct resource *r)
1629 {
1630 	struct vmbus_pcib_softc *sc = device_get_softc(dev);
1631 
1632 	if (type == PCI_RES_BUS)
1633 		return (pci_domain_release_bus(sc->hbus->pci_domain, child,
1634 		    rid, r));
1635 
1636 	if (type == SYS_RES_IOPORT)
1637 		return (EINVAL);
1638 
1639 	return (bus_generic_release_resource(dev, child, type, rid, r));
1640 }
1641 
#if __FreeBSD_version >= 1100000
/*
 * Bus get_cpus method: answer on behalf of the bridge itself by calling
 * bus_get_cpus() with "pcib" rather than the requesting device.
 */
static int
vmbus_pcib_get_cpus(device_t pcib, device_t dev __unused, enum cpu_sets op,
    size_t setsize, cpuset_t *cpuset)
{
	int error;

	error = bus_get_cpus(pcib, op, setsize, cpuset);
	return (error);
}
#endif
1650 
1651 static uint32_t
1652 vmbus_pcib_read_config(device_t dev, u_int bus, u_int slot, u_int func,
1653     u_int reg, int bytes)
1654 {
1655 	struct vmbus_pcib_softc *sc = device_get_softc(dev);
1656 	struct hv_pci_dev *hpdev;
1657 	unsigned int devfn = PCI_DEVFN(slot, func);
1658 	uint32_t data = 0;
1659 
1660 	KASSERT(bus == 0, ("bus should be 0, but is %u", bus));
1661 
1662 	hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn));
1663 	if (!hpdev)
1664 		return (~0);
1665 
1666 	_hv_pcifront_read_config(hpdev, reg, bytes, &data);
1667 
1668 	return (data);
1669 }
1670 
1671 static void
1672 vmbus_pcib_write_config(device_t dev, u_int bus, u_int slot, u_int func,
1673     u_int reg, uint32_t data, int bytes)
1674 {
1675 	struct vmbus_pcib_softc *sc = device_get_softc(dev);
1676 	struct hv_pci_dev *hpdev;
1677 	unsigned int devfn = PCI_DEVFN(slot, func);
1678 
1679 	KASSERT(bus == 0, ("bus should be 0, but is %u", bus));
1680 
1681 	hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn));
1682 	if (!hpdev)
1683 		return;
1684 
1685 	_hv_pcifront_write_config(hpdev, reg, bytes, data);
1686 }
1687 
1688 static int
1689 vmbus_pcib_route_intr(device_t pcib, device_t dev, int pin)
1690 {
1691 	/* We only support MSI/MSI-X and don't support INTx interrupt. */
1692 	return (PCI_INVALID_IRQ);
1693 }
1694 
1695 static int
1696 vmbus_pcib_alloc_msi(device_t pcib, device_t dev, int count,
1697     int maxcount, int *irqs)
1698 {
1699 	return (PCIB_ALLOC_MSI(device_get_parent(pcib), dev, count, maxcount,
1700 	    irqs));
1701 }
1702 
1703 static int
1704 vmbus_pcib_release_msi(device_t pcib, device_t dev, int count, int *irqs)
1705 {
1706 	return (PCIB_RELEASE_MSI(device_get_parent(pcib), dev, count, irqs));
1707 }
1708 
1709 static int
1710 vmbus_pcib_alloc_msix(device_t pcib, device_t dev, int *irq)
1711 {
1712 	return (PCIB_ALLOC_MSIX(device_get_parent(pcib), dev, irq));
1713 }
1714 
1715 static int
1716 vmbus_pcib_release_msix(device_t pcib, device_t dev, int irq)
1717 {
1718 	return (PCIB_RELEASE_MSIX(device_get_parent(pcib), dev, irq));
1719 }
1720 
1721 #define	MSI_INTEL_ADDR_DEST	0x000ff000
1722 #define	MSI_INTEL_DATA_INTVEC	IOART_INTVEC	/* Interrupt vector. */
1723 #define	MSI_INTEL_DATA_DELFIXED	IOART_DELFIXED
1724 
1725 static int
1726 vmbus_pcib_map_msi(device_t pcib, device_t child, int irq,
1727     uint64_t *addr, uint32_t *data)
1728 {
1729 	unsigned int devfn;
1730 	struct hv_pci_dev *hpdev;
1731 
1732 	uint64_t v_addr;
1733 	uint32_t v_data;
1734 	struct hv_irq_desc *hid, *tmp_hid;
1735 	unsigned int cpu, vcpu_id;
1736 	unsigned int vector;
1737 
1738 	struct vmbus_pcib_softc *sc = device_get_softc(pcib);
1739 	struct pci_create_interrupt *int_pkt;
1740 	struct compose_comp_ctxt comp;
1741 	struct {
1742 		struct pci_packet pkt;
1743 		uint8_t buffer[sizeof(struct pci_create_interrupt)];
1744 	} ctxt;
1745 
1746 	int ret;
1747 
1748 	devfn = PCI_DEVFN(pci_get_slot(child), pci_get_function(child));
1749 	hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn));
1750 	if (!hpdev)
1751 		return (ENOENT);
1752 
1753 	ret = PCIB_MAP_MSI(device_get_parent(pcib), child, irq,
1754 	    &v_addr, &v_data);
1755 	if (ret)
1756 		return (ret);
1757 
1758 	TAILQ_FOREACH_SAFE(hid, &hpdev->irq_desc_list, link, tmp_hid) {
1759 		if (hid->irq == irq) {
1760 			TAILQ_REMOVE(&hpdev->irq_desc_list, hid, link);
1761 			hv_int_desc_free(hpdev, hid);
1762 			break;
1763 		}
1764 	}
1765 
1766 	cpu = (v_addr & MSI_INTEL_ADDR_DEST) >> 12;
1767 	vcpu_id = VMBUS_GET_VCPU_ID(device_get_parent(pcib), pcib, cpu);
1768 	vector = v_data & MSI_INTEL_DATA_INTVEC;
1769 
1770 	init_completion(&comp.comp_pkt.host_event);
1771 
1772 	memset(&ctxt, 0, sizeof(ctxt));
1773 	ctxt.pkt.completion_func = hv_pci_compose_compl;
1774 	ctxt.pkt.compl_ctxt = &comp;
1775 
1776 	int_pkt = (struct pci_create_interrupt *)&ctxt.pkt.message;
1777 	int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
1778 	int_pkt->wslot.val = hpdev->desc.wslot.val;
1779 	int_pkt->int_desc.vector = vector;
1780 	int_pkt->int_desc.vector_count = 1;
1781 	int_pkt->int_desc.delivery_mode = MSI_INTEL_DATA_DELFIXED;
1782 	int_pkt->int_desc.cpu_mask = 1ULL << vcpu_id;
1783 
1784 	ret = vmbus_chan_send(sc->chan,	VMBUS_CHANPKT_TYPE_INBAND,
1785 	    VMBUS_CHANPKT_FLAG_RC, int_pkt, sizeof(*int_pkt),
1786 	    (uint64_t)(uintptr_t)&ctxt.pkt);
1787 	if (ret) {
1788 		free_completion(&comp.comp_pkt.host_event);
1789 		return (ret);
1790 	}
1791 
1792 	wait_for_completion(&comp.comp_pkt.host_event);
1793 	free_completion(&comp.comp_pkt.host_event);
1794 
1795 	if (comp.comp_pkt.completion_status < 0)
1796 		return (EPROTO);
1797 
1798 	*addr = comp.int_desc.address;
1799 	*data = comp.int_desc.data;
1800 
1801 	hid = malloc(sizeof(struct hv_irq_desc), M_DEVBUF, M_WAITOK | M_ZERO);
1802 	hid->irq = irq;
1803 	hid->desc = comp.int_desc;
1804 	TAILQ_INSERT_TAIL(&hpdev->irq_desc_list, hid, link);
1805 
1806 	return (0);
1807 }
1808 
1809 static device_method_t vmbus_pcib_methods[] = {
1810 	/* Device interface */
1811 	DEVMETHOD(device_probe,         vmbus_pcib_probe),
1812 	DEVMETHOD(device_attach,        vmbus_pcib_attach),
1813 	DEVMETHOD(device_detach,        vmbus_pcib_detach),
1814 	DEVMETHOD(device_shutdown,	bus_generic_shutdown),
1815 	DEVMETHOD(device_suspend,	bus_generic_suspend),
1816 	DEVMETHOD(device_resume,	bus_generic_resume),
1817 
1818 	/* Bus interface */
1819 	DEVMETHOD(bus_read_ivar,		vmbus_pcib_read_ivar),
1820 	DEVMETHOD(bus_write_ivar,		vmbus_pcib_write_ivar),
1821 	DEVMETHOD(bus_alloc_resource,		vmbus_pcib_alloc_resource),
1822 	DEVMETHOD(bus_release_resource,		vmbus_pcib_release_resource),
1823 	DEVMETHOD(bus_activate_resource,   bus_generic_activate_resource),
1824 	DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
1825 	DEVMETHOD(bus_setup_intr,	   bus_generic_setup_intr),
1826 	DEVMETHOD(bus_teardown_intr,	   bus_generic_teardown_intr),
1827 #if __FreeBSD_version >= 1100000
1828 	DEVMETHOD(bus_get_cpus,			vmbus_pcib_get_cpus),
1829 #endif
1830 
1831 	/* pcib interface */
1832 	DEVMETHOD(pcib_maxslots,		pcib_maxslots),
1833 	DEVMETHOD(pcib_read_config,		vmbus_pcib_read_config),
1834 	DEVMETHOD(pcib_write_config,		vmbus_pcib_write_config),
1835 	DEVMETHOD(pcib_route_interrupt,		vmbus_pcib_route_intr),
1836 	DEVMETHOD(pcib_alloc_msi,		vmbus_pcib_alloc_msi),
1837 	DEVMETHOD(pcib_release_msi,		vmbus_pcib_release_msi),
1838 	DEVMETHOD(pcib_alloc_msix,		vmbus_pcib_alloc_msix),
1839 	DEVMETHOD(pcib_release_msix,		vmbus_pcib_release_msix),
1840 	DEVMETHOD(pcib_map_msi,			vmbus_pcib_map_msi),
1841 	DEVMETHOD(pcib_request_feature,		pcib_request_feature_allow),
1842 
1843 	DEVMETHOD_END
1844 };
1845 
1846 static devclass_t pcib_devclass;
1847 
1848 DEFINE_CLASS_0(pcib, vmbus_pcib_driver, vmbus_pcib_methods,
1849 		sizeof(struct vmbus_pcib_softc));
1850 DRIVER_MODULE(vmbus_pcib, vmbus, vmbus_pcib_driver, pcib_devclass, 0, 0);
1851 MODULE_DEPEND(vmbus_pcib, vmbus, 1, 1, 1);
1852 MODULE_DEPEND(vmbus_pcib, pci, 1, 1, 1);
1853 
1854 #endif /* NEW_PCIB */
1855