xref: /freebsd/sys/dev/hyperv/pcib/vmbus_pcib.c (revision f37852c17391fdf0e8309bcf684384dd0d854e43)
1 /*-
2  * Copyright (c) 2016 Microsoft Corp.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #ifdef NEW_PCIB
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/types.h>
35 #include <sys/malloc.h>
36 #include <sys/module.h>
37 #include <sys/kernel.h>
38 #include <sys/queue.h>
39 #include <sys/lock.h>
40 #include <sys/sx.h>
41 #include <sys/smp.h>
42 #include <sys/sysctl.h>
43 #include <sys/bus.h>
44 #include <sys/rman.h>
45 #include <sys/mutex.h>
46 #include <sys/errno.h>
47 
48 #include <vm/vm.h>
49 #include <vm/vm_param.h>
50 #include <vm/vm_kern.h>
51 #include <vm/pmap.h>
52 
53 #include <machine/atomic.h>
54 #include <machine/bus.h>
55 #include <machine/frame.h>
56 #include <machine/pci_cfgreg.h>
57 #include <machine/resource.h>
58 
59 #include <sys/pciio.h>
60 #include <dev/pci/pcireg.h>
61 #include <dev/pci/pcivar.h>
62 #include <dev/pci/pci_private.h>
63 #include <dev/pci/pcib_private.h>
64 #include "pcib_if.h"
65 
66 #include <machine/intr_machdep.h>
67 #include <x86/apicreg.h>
68 
69 #include <dev/hyperv/include/hyperv.h>
70 #include <dev/hyperv/include/hyperv_busdma.h>
71 #include <dev/hyperv/include/vmbus_xact.h>
72 #include <dev/hyperv/vmbus/vmbus_reg.h>
73 #include <dev/hyperv/vmbus/vmbus_chanvar.h>
74 
75 #include "vmbus_if.h"
76 
/*
 * Compatibility shim, compiled only when __FreeBSD_version is below 1100000:
 * define rman_res_t as u_long and RM_MAX_END as its all-ones (maximum) value.
 * NOTE(review): presumably those kernels do not provide these names
 * themselves -- confirm.
 */
#if __FreeBSD_version < 1100000
typedef u_long rman_res_t;
#define RM_MAX_END	(~(rman_res_t)0)
#endif
81 
/*
 * Minimal counting completion.  complete() increments 'done' and wakes
 * sleepers; wait_for_completion() sleeps until 'done' is non-zero and then
 * consumes one count.
 */
struct completion {
	unsigned int done;	/* posted-but-not-yet-consumed completions */
	struct mtx lock;	/* protects 'done'; interlock for mtx_sleep() */
};
86 
87 static void
88 init_completion(struct completion *c)
89 {
90 	memset(c, 0, sizeof(*c));
91 	mtx_init(&c->lock, "hvcmpl", NULL, MTX_DEF);
92 	c->done = 0;
93 }
94 
/*
 * Tear down a completion set up by init_completion() by destroying its
 * mutex.  The structure itself is not freed.
 */
static void
free_completion(struct completion *c)
{
	mtx_destroy(&c->lock);
}
100 
101 static void
102 complete(struct completion *c)
103 {
104 	mtx_lock(&c->lock);
105 	c->done++;
106 	mtx_unlock(&c->lock);
107 	wakeup(c);
108 }
109 
110 static void
111 wait_for_completion(struct completion *c)
112 {
113 	mtx_lock(&c->lock);
114 	while (c->done == 0)
115 		mtx_sleep(c, &c->lock, 0, "hvwfc", 0);
116 	c->done--;
117 	mtx_unlock(&c->lock);
118 }
119 
/*
 * Pack a protocol version into its 32-bit form: major version in the upper
 * 16 bits, minor version in the lower 16 bits.
 */
#define PCI_MAKE_VERSION(major, minor) \
	((uint32_t)(((major) << 16) | (minor)))
121 
/*
 * Protocol versions, encoded with PCI_MAKE_VERSION().  Version 1.1 is the
 * only one defined here and is also the "current" one.
 */
enum {
	PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1),
	PCI_PROTOCOL_VERSION_CURRENT = PCI_PROTOCOL_VERSION_1_1
};
126 
/*
 * Sizes for the config-space MMIO window: 0x2000 bytes in total, of which
 * CFG_PAGE_SIZE (0x1000) bytes lie at and after CFG_PAGE_OFFSET.
 * NOTE(review): presumably the first 0x1000 bytes hold control/index
 * registers and the remainder is the config-space data page -- confirm
 * against the accessors that use these constants.
 */
#define PCI_CONFIG_MMIO_LENGTH	0x2000
#define CFG_PAGE_OFFSET 0x1000
#define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET)
130 
131 /*
132  * Message Types
133  */
134 
/*
 * Values carried in struct pci_message.type.  All are offsets from
 * PCI_MESSAGE_BASE; offsets 2 and 3 are not assigned here.
 */
enum pci_message_type {
	/*
	 * Version 1.1
	 */
	PCI_MESSAGE_BASE                = 0x42490000,
	PCI_BUS_RELATIONS               = PCI_MESSAGE_BASE + 0,
	PCI_QUERY_BUS_RELATIONS         = PCI_MESSAGE_BASE + 1,
	PCI_POWER_STATE_CHANGE          = PCI_MESSAGE_BASE + 4,
	PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5,
	PCI_QUERY_RESOURCE_RESOURCES    = PCI_MESSAGE_BASE + 6,
	PCI_BUS_D0ENTRY                 = PCI_MESSAGE_BASE + 7,
	PCI_BUS_D0EXIT                  = PCI_MESSAGE_BASE + 8,
	PCI_READ_BLOCK                  = PCI_MESSAGE_BASE + 9,
	PCI_WRITE_BLOCK                 = PCI_MESSAGE_BASE + 0xA,
	PCI_EJECT                       = PCI_MESSAGE_BASE + 0xB,
	PCI_QUERY_STOP                  = PCI_MESSAGE_BASE + 0xC,
	PCI_REENABLE                    = PCI_MESSAGE_BASE + 0xD,
	PCI_QUERY_STOP_FAILED           = PCI_MESSAGE_BASE + 0xE,
	PCI_EJECTION_COMPLETE           = PCI_MESSAGE_BASE + 0xF,
	PCI_RESOURCES_ASSIGNED          = PCI_MESSAGE_BASE + 0x10,
	PCI_RESOURCES_RELEASED          = PCI_MESSAGE_BASE + 0x11,
	PCI_INVALIDATE_BLOCK            = PCI_MESSAGE_BASE + 0x12,
	PCI_QUERY_PROTOCOL_VERSION      = PCI_MESSAGE_BASE + 0x13,
	PCI_CREATE_INTERRUPT_MESSAGE    = PCI_MESSAGE_BASE + 0x14,
	PCI_DELETE_INTERRUPT_MESSAGE    = PCI_MESSAGE_BASE + 0x15,
	/* One past the highest message type defined above. */
	PCI_MESSAGE_MAXIMUM
};
162 
163 /*
164  * Structures defining the virtual PCI Express protocol.
165  */
166 
/*
 * A protocol version viewed either as a single 32-bit value or as two
 * 16-bit halves; minor_version occupies the first (lowest-addressed) half.
 */
union pci_version {
	struct {
		uint16_t minor_version;
		uint16_t major_version;
	} parts;
	uint32_t version;
} __packed;
174 
175 /*
176  * This representation is the one used in Windows, which is
177  * what is expected when sending this back and forth with
178  * the Hyper-V parent partition.
179  */
/*
 * Slot/function identifier as exchanged with the host: a 5-bit slot and a
 * 3-bit function packed into a 32-bit word, accessible either by field
 * (bits) or as the raw word (val).
 */
union win_slot_encoding {
	struct {
		uint32_t	slot:5;
		uint32_t	func:3;
		uint32_t	reserved:24;
	} bits;
	uint32_t val;
} __packed;
188 
/*
 * Description of one PCI function as reported by the host in a
 * PCI_BUS_RELATIONS message (see struct pci_bus_relations).
 */
struct pci_func_desc {
	uint16_t	v_id;	/* vendor ID */
	uint16_t	d_id;	/* device ID */
	uint8_t		rev;
	uint8_t		prog_intf;
	uint8_t		subclass;
	uint8_t		base_class;
	uint32_t	subsystem_id;
	union win_slot_encoding wslot;	/* slot/function, host encoding */
	uint32_t	ser;	/* serial number */
} __packed;
200 
/*
 * Interrupt request descriptor embedded in struct pci_create_interrupt.
 * NOTE(review): field meanings are inferred from their names only; this
 * chunk of the file does not show them being filled in.
 */
struct hv_msi_desc {
	uint8_t		vector;
	uint8_t		delivery_mode;
	uint16_t	vector_count;
	uint32_t	reserved;
	uint64_t	cpu_mask;
} __packed;
208 
/*
 * Interrupt descriptor returned by the host in struct
 * pci_create_int_response and sent back in struct pci_delete_interrupt.
 * NOTE(review): presumably 'address'/'data' are the MSI address/data pair
 * -- confirm against the code that programs them.
 */
struct tran_int_desc {
	uint16_t	reserved;
	uint16_t	vector_count;
	uint32_t	data;
	uint64_t	address;
} __packed;
215 
/* Common header of every protocol message: the message type. */
struct pci_message {
	uint32_t type;	/* one of enum pci_message_type */
} __packed;
219 
/*
 * Message addressed to one child function: the header plus the child's
 * slot encoding.  Used in this file for PCI_QUERY_RESOURCE_REQUIREMENTS and
 * PCI_RESOURCES_RELEASED.
 */
struct pci_child_message {
	struct pci_message message_type;
	union win_slot_encoding wslot;
} __packed;
224 
/*
 * Message as received from the channel: the vmbus packet header followed by
 * the protocol message header.
 */
struct pci_incoming_message {
	struct vmbus_chanpkt_hdr hdr;
	struct pci_message message_type;
} __packed;
229 
/*
 * Generic completion packet: the vmbus packet header followed by a status
 * word.  Longer, request-specific responses begin with the same layout.
 */
struct pci_response {
	struct vmbus_chanpkt_hdr hdr;
	int32_t status;	/* negative values are failures */
} __packed;
234 
/*
 * Bookkeeping for an outgoing request.  Its address is passed to
 * vmbus_chan_send() as the transaction id; when the completion packet
 * arrives, the channel callback casts that id back to a struct pci_packet
 * and calls completion_func(compl_ctxt, response, size).
 *
 * 'message' marks where the message payload starts; callers place storage
 * for the payload directly after this structure.
 */
struct pci_packet {
	void (*completion_func)(void *context, struct pci_response *resp,
	    int resp_packet_size);
	void *compl_ctxt;

	struct pci_message message[0];
};
242 
243 /*
244  * Specific message types supporting the PCI protocol.
245  */
246 
/* PCI_QUERY_PROTOCOL_VERSION request. */
struct pci_version_request {
	struct pci_message message_type;
	uint32_t protocol_version;	/* a PCI_MAKE_VERSION() value */
	uint32_t is_last_attempt:1;
	uint32_t reservedz:31;
} __packed;
253 
/*
 * PCI_BUS_D0ENTRY request; mmio_base carries the start address of the
 * config-space MMIO resource (see hv_pci_enter_d0()).
 */
struct pci_bus_d0_entry {
	struct pci_message message_type;
	uint32_t reserved;
	uint64_t mmio_base;
} __packed;
259 
/*
 * PCI_BUS_RELATIONS message from the host: a count followed by that many
 * function descriptors.
 */
struct pci_bus_relations {
	struct pci_incoming_message incoming;
	uint32_t device_count;		/* number of entries in func[] */
	struct pci_func_desc func[0];
} __packed;
265 
/* Number of BAR slots tracked per function. */
#define MAX_NUM_BARS	(PCIR_MAX_BAR_0 + 1)
/*
 * Response to PCI_QUERY_RESOURCE_REQUIREMENTS.  It starts with the same
 * layout as struct pci_response and then carries one probed value per BAR.
 */
struct pci_q_res_req_response {
	struct vmbus_chanpkt_hdr hdr;
	int32_t status; /* negative values are failures */
	uint32_t probed_bar[MAX_NUM_BARS];
} __packed;
272 
/*
 * PCI_RESOURCES_ASSIGNED request.  hv_send_resources_allocated() fills in
 * only the message type and wslot; everything else is sent as zeros.
 */
struct pci_resources_assigned {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	uint8_t memory_range[0x14][MAX_NUM_BARS]; /* unused here */
	uint32_t msi_descriptors;
	uint32_t reserved[4];
} __packed;
280 
/* PCI_CREATE_INTERRUPT_MESSAGE request for the function at 'wslot'. */
struct pci_create_interrupt {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	struct hv_msi_desc int_desc;
} __packed;
286 
/*
 * Response to a create-interrupt request: the generic response followed by
 * the descriptor the host assigned (copied out by hv_pci_compose_compl()).
 */
struct pci_create_int_response {
	struct pci_response response;
	uint32_t reserved;
	struct tran_int_desc int_desc;
} __packed;
292 
/*
 * PCI_DELETE_INTERRUPT_MESSAGE request; int_desc is the descriptor being
 * released (see hv_int_desc_free()).
 */
struct pci_delete_interrupt {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	struct tran_int_desc int_desc;
} __packed;
298 
/*
 * Incoming message that refers to one child function by slot encoding; the
 * channel callback uses it for PCI_EJECT.
 */
struct pci_dev_incoming {
	struct pci_incoming_message incoming;
	union win_slot_encoding wslot;
} __packed;
303 
/*
 * PCI_EJECTION_COMPLETE message sent to the host after a child has been
 * removed; hv_eject_device_work() leaves 'status' at zero.
 */
struct pci_eject_response {
	struct pci_message message_type;
	union win_slot_encoding wslot;
	uint32_t status;
} __packed;
309 
310 /*
311  * Driver specific state.
312  */
313 
/*
 * Bus lifecycle state.  pci_devices_present_work() rescans the PCI bus only
 * once the state is hv_pcibus_installed.
 */
enum hv_pcibus_state {
	hv_pcibus_init = 0,
	hv_pcibus_installed,
};
318 
/* Per-bridge state shared by the attach path, the tasks and the callback. */
struct hv_pcibus {
	device_t pcib;			/* used as the device_printf() target */
	device_t pci_bus;		/* bus device holding the PCI children */
	struct vmbus_pcib_softc *sc;	/* back pointer; sc->chan is the channel */

	/* Low 16 bits of the first child's serial number. */
	uint16_t pci_domain;

	enum hv_pcibus_state state;

	/* Config-space MMIO resource; its start is reported in D0 entry. */
	struct resource *cfg_res;

	/*
	 * If query_comp is non-NULL, pci_devices_present_work() clears it
	 * and completes it after processing a relations update.
	 */
	struct completion query_completion, *query_comp;

	struct mtx config_lock; /* Avoid two threads writing index page */
	struct mtx device_list_lock;    /* Protect lists below */
	TAILQ_HEAD(, hv_pci_dev) children;
	TAILQ_HEAD(, hv_dr_state) dr_list;

	/* When set, new relations (with devices) and ejects are ignored. */
	volatile int detaching;
};
339 
/* One child PCI function known to the bridge. */
struct hv_pci_dev {
	TAILQ_ENTRY(hv_pci_dev) link;	/* entry in hv_pcibus.children */

	struct pci_func_desc desc;	/* copy of the host's descriptor */

	/*
	 * Scratch flag for pci_devices_present_work(): set on every child,
	 * cleared for those the host still reports; the rest are deleted.
	 */
	bool reported_missing;

	struct hv_pcibus *hbus;
	struct task eject_task;		/* runs hv_eject_device_work() */

	/* Interrupt descriptors, released in hv_pci_delete_device(). */
	TAILQ_HEAD(, hv_irq_desc) irq_desc_list;

	/*
	 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then
	 * read it back, for each of the BAR offsets within config space.
	 */
	uint32_t probed_bar[MAX_NUM_BARS];
};
358 
/*
 * Tracks "Device Relations" messages from the host, which must be
 * processed in order.
 */
362  */
/*
 * Work item queued by hv_pci_devices_present(); it carries only the bus
 * pointer and is freed by pci_devices_present_work() when the task runs.
 */
struct hv_dr_work {
	struct task task;
	struct hv_pcibus *bus;
};
367 
/*
 * Snapshot of one PCI_BUS_RELATIONS message, queued on hv_pcibus.dr_list
 * until pci_devices_present_work() consumes it.
 */
struct hv_dr_state {
	TAILQ_ENTRY(hv_dr_state) link;
	uint32_t device_count;		/* number of entries in func[] */
	struct pci_func_desc func[0];
};
373 
/*
 * One interrupt descriptor owned by a child device, linked on
 * hv_pci_dev.irq_desc_list; 'desc' is what hv_int_desc_free() sends back
 * to the host.
 */
struct hv_irq_desc {
	TAILQ_ENTRY(hv_irq_desc) link;
	struct tran_int_desc desc;
	int irq;	/* NOTE(review): presumably the local IRQ number -- confirm */
};
379 
/*
 * devfn helpers: a devfn is an 8-bit value with the 5-bit slot number in
 * bits 7..3 and the 3-bit function number in bits 2..0.
 */
#define PCI_DEVFN(slot, func)	(((func) & 0x07) | (((slot) & 0x1f) << 3))
#define PCI_SLOT(devfn)		(0x1f & ((devfn) >> 3))
#define PCI_FUNC(devfn)		(0x07 & (devfn))
383 
384 static uint32_t
385 devfn_to_wslot(unsigned int devfn)
386 {
387 	union win_slot_encoding wslot;
388 
389 	wslot.val = 0;
390 	wslot.bits.slot = PCI_SLOT(devfn);
391 	wslot.bits.func = PCI_FUNC(devfn);
392 
393 	return (wslot.val);
394 }
395 
396 static unsigned int
397 wslot_to_devfn(uint32_t wslot)
398 {
399 	union win_slot_encoding encoding;
400 	unsigned int slot;
401 	unsigned int func;
402 
403 	encoding.val = wslot;
404 
405 	slot = encoding.bits.slot;
406 	func = encoding.bits.func;
407 
408 	return (PCI_DEVFN(slot, func));
409 }
410 
/* Per-device driver state. */
struct vmbus_pcib_softc {
	struct vmbus_channel	*chan;	/* channel used for all host messages */
	void *rx_buf;			/* default receive buffer of the channel callback */

	struct taskqueue	*taskq;	/* runs pci_devices_present_work() */

	struct hv_pcibus	*hbus;
};
419 
/*
 * {44C4F61D-4444-4400-9D52-802E27EDE19F}
 *
 * The bytes below store the first three GUID fields in little-endian byte
 * order and the remaining eight bytes as written.
 */
static const struct hyperv_guid g_pass_through_dev_type = {
	.hv_guid = {0x1D, 0xF6, 0xC4, 0x44, 0x44, 0x44, 0x00, 0x44,
	    0x9D, 0x52, 0x80, 0x2E, 0x27, 0xED, 0xE1, 0x9F}
};
425 
/*
 * Completion context for hv_pci_generic_compl(): the event to wait on and
 * the status reported by the host (-1 if the response was too short).
 */
struct hv_pci_compl {
	struct completion host_event;
	int32_t completion_status;
};
430 
/*
 * Completion context for q_resource_requirements(): the event to wait on
 * and the device whose probed_bar[] receives the response.
 */
struct q_res_req_compl {
	struct completion host_event;
	struct hv_pci_dev *hpdev;
};
435 
/*
 * Completion context for hv_pci_compose_compl(): the generic completion
 * plus the interrupt descriptor copied out of the host's response.
 */
struct compose_comp_ctxt {
	struct hv_pci_compl comp_pkt;
	struct tran_int_desc int_desc;
};
440 
441 static void
442 hv_pci_generic_compl(void *context, struct pci_response *resp,
443     int resp_packet_size)
444 {
445 	struct hv_pci_compl *comp_pkt = context;
446 
447 	if (resp_packet_size >= sizeof(struct pci_response))
448 		comp_pkt->completion_status = resp->status;
449 	else
450 		comp_pkt->completion_status = -1;
451 
452 	complete(&comp_pkt->host_event);
453 }
454 
455 static void
456 q_resource_requirements(void *context, struct pci_response *resp,
457     int resp_packet_size)
458 {
459 	struct q_res_req_compl *completion = context;
460 	struct pci_q_res_req_response *q_res_req =
461 	    (struct pci_q_res_req_response *)resp;
462 	int i;
463 
464 	if (resp->status < 0) {
465 		printf("vmbus_pcib: failed to query resource requirements\n");
466 	} else {
467 		for (i = 0; i < MAX_NUM_BARS; i++)
468 			completion->hpdev->probed_bar[i] =
469 			    q_res_req->probed_bar[i];
470 	}
471 
472 	complete(&completion->host_event);
473 }
474 
475 static void
476 hv_pci_compose_compl(void *context, struct pci_response *resp,
477     int resp_packet_size)
478 {
479 	struct compose_comp_ctxt *comp_pkt = context;
480 	struct pci_create_int_response *int_resp =
481 	    (struct pci_create_int_response *)resp;
482 
483 	comp_pkt->comp_pkt.completion_status = resp->status;
484 	comp_pkt->int_desc = int_resp->int_desc;
485 	complete(&comp_pkt->comp_pkt.host_event);
486 }
487 
488 static void
489 hv_int_desc_free(struct hv_pci_dev *hpdev, struct hv_irq_desc *hid)
490 {
491 	struct pci_delete_interrupt *int_pkt;
492 	struct {
493 		struct pci_packet pkt;
494 		uint8_t buffer[sizeof(struct pci_delete_interrupt)];
495 	} ctxt;
496 
497 	memset(&ctxt, 0, sizeof(ctxt));
498 	int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
499 	int_pkt->message_type.type = PCI_DELETE_INTERRUPT_MESSAGE;
500 	int_pkt->wslot.val = hpdev->desc.wslot.val;
501 	int_pkt->int_desc = hid->desc;
502 
503 	vmbus_chan_send(hpdev->hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
504 	    int_pkt, sizeof(*int_pkt), 0);
505 
506 	free(hid, M_DEVBUF);
507 }
508 
509 static void
510 hv_pci_delete_device(struct hv_pci_dev *hpdev)
511 {
512 	struct hv_pcibus *hbus = hpdev->hbus;
513 	struct hv_irq_desc *hid, *tmp_hid;
514 	device_t pci_dev;
515 	int devfn;
516 
517 	devfn = wslot_to_devfn(hpdev->desc.wslot.val);
518 
519 	mtx_lock(&Giant);
520 
521 	pci_dev = pci_find_dbsf(hbus->pci_domain,
522 	    0, PCI_SLOT(devfn), PCI_FUNC(devfn));
523 	if (pci_dev)
524 		device_delete_child(hbus->pci_bus, pci_dev);
525 
526 	mtx_unlock(&Giant);
527 
528 	mtx_lock(&hbus->device_list_lock);
529 	TAILQ_REMOVE(&hbus->children, hpdev, link);
530 	mtx_unlock(&hbus->device_list_lock);
531 
532 	TAILQ_FOREACH_SAFE(hid, &hpdev->irq_desc_list, link, tmp_hid)
533 		hv_int_desc_free(hpdev, hid);
534 
535 	free(hpdev, M_DEVBUF);
536 }
537 
538 static struct hv_pci_dev *
539 new_pcichild_device(struct hv_pcibus *hbus, struct pci_func_desc *desc)
540 {
541 	struct hv_pci_dev *hpdev;
542 	struct pci_child_message *res_req;
543 	struct q_res_req_compl comp_pkt;
544 	struct {
545 		struct pci_packet pkt;
546 		uint8_t buffer[sizeof(struct pci_child_message)];
547 	} ctxt;
548 	int ret;
549 
550 	hpdev = malloc(sizeof(*hpdev), M_DEVBUF, M_WAITOK | M_ZERO);
551 	hpdev->hbus = hbus;
552 
553 	TAILQ_INIT(&hpdev->irq_desc_list);
554 
555 	init_completion(&comp_pkt.host_event);
556 	comp_pkt.hpdev = hpdev;
557 
558 	ctxt.pkt.compl_ctxt = &comp_pkt;
559 	ctxt.pkt.completion_func = q_resource_requirements;
560 
561 	res_req = (struct pci_child_message *)&ctxt.pkt.message;
562 	res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS;
563 	res_req->wslot.val = desc->wslot.val;
564 
565 	ret = vmbus_chan_send(hbus->sc->chan,
566 	    VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
567 	    res_req, sizeof(*res_req), (uint64_t)&ctxt.pkt);
568 	if (ret)
569 		goto err;
570 
571 	wait_for_completion(&comp_pkt.host_event);
572 	free_completion(&comp_pkt.host_event);
573 
574 	hpdev->desc = *desc;
575 
576 	mtx_lock(&hbus->device_list_lock);
577 	if (TAILQ_EMPTY(&hbus->children))
578 		hbus->pci_domain = desc->ser & 0xFFFF;
579 	TAILQ_INSERT_TAIL(&hbus->children, hpdev, link);
580 	mtx_unlock(&hbus->device_list_lock);
581 	return (hpdev);
582 err:
583 	free_completion(&comp_pkt.host_event);
584 	free(hpdev, M_DEVBUF);
585 	return (NULL);
586 }
587 
588 #if __FreeBSD_version < 1100000
589 
590 /* Old versions don't have BUS_RESCAN(). Let's copy it from FreeBSD 11. */
591 
592 static struct pci_devinfo *
593 pci_identify_function(device_t pcib, device_t dev, int domain, int busno,
594     int slot, int func, size_t dinfo_size)
595 {
596 	struct pci_devinfo *dinfo;
597 
598 	dinfo = pci_read_device(pcib, domain, busno, slot, func, dinfo_size);
599 	if (dinfo != NULL)
600 		pci_add_child(dev, dinfo);
601 
602 	return (dinfo);
603 }
604 
/*
 * Rescan PCI bus 'dev' (pre-FreeBSD-11 fallback).
 *
 * Every slot/function is probed through the parent bridge.  Functions that
 * already have a child device are kept, newly found functions are added,
 * children whose function no longer answers are deleted, and finally the
 * newly added children are probed and attached.
 *
 * Returns 0 on success, the device_get_children() error, or ENOMEM if the
 * bookkeeping array cannot be allocated.
 */
static int
pci_rescan(device_t dev)
{
	/* Config read of the function currently selected by (busno, s, f). */
#define	REG(n, w)	PCIB_READ_CONFIG(pcib, busno, s, f, n, w)
	device_t pcib = device_get_parent(dev);
	struct pci_softc *sc;
	device_t child, *devlist, *unchanged;
	int devcount, error, i, j, maxslots, oldcount;
	int busno, domain, s, f, pcifunchigh;
	uint8_t hdrtype;

	/* No need to check for ARI on a rescan. */
	error = device_get_children(dev, &devlist, &devcount);
	if (error)
		return (error);
	/*
	 * unchanged[i] is set to devlist[i] when that child is found again
	 * during the scan; entries left NULL mark children to delete.
	 */
	if (devcount != 0) {
		unchanged = malloc(devcount * sizeof(device_t), M_TEMP,
		    M_NOWAIT | M_ZERO);
		if (unchanged == NULL) {
			free(devlist, M_TEMP);
			return (ENOMEM);
		}
	} else
		unchanged = NULL;

	/* NOTE(review): 'sc' is assigned but not used in this function. */
	sc = device_get_softc(dev);
	domain = pcib_get_domain(dev);
	busno = pcib_get_bus(dev);
	maxslots = PCIB_MAXSLOTS(pcib);
	for (s = 0; s <= maxslots; s++) {
		/* If function 0 is not present, skip to the next slot. */
		f = 0;
		if (REG(PCIR_VENDOR, 2) == 0xffff)
			continue;
		pcifunchigh = 0;
		hdrtype = REG(PCIR_HDRTYPE, 1);
		if ((hdrtype & PCIM_HDRTYPE) > PCI_MAXHDRTYPE)
			continue;
		/* Only multi-function devices are scanned past function 0. */
		if (hdrtype & PCIM_MFDEV)
			pcifunchigh = PCIB_MAXFUNCS(pcib);
		for (f = 0; f <= pcifunchigh; f++) {
			if (REG(PCIR_VENDOR, 2) == 0xffff)
				continue;

			/*
			 * Found a valid function.  Check if a
			 * device_t for this device already exists.
			 */
			for (i = 0; i < devcount; i++) {
				child = devlist[i];
				if (child == NULL)
					continue;
				if (pci_get_slot(child) == s &&
				    pci_get_function(child) == f) {
					unchanged[i] = child;
					goto next_func;
				}
			}

			pci_identify_function(pcib, dev, domain, busno, s, f,
			    sizeof(struct pci_devinfo));
		next_func:;
		}
	}

	/* Remove devices that are no longer present. */
	for (i = 0; i < devcount; i++) {
		if (unchanged[i] != NULL)
			continue;
		device_delete_child(dev, devlist[i]);
	}

	free(devlist, M_TEMP);
	oldcount = devcount;

	/* Try to attach the devices just added. */
	error = device_get_children(dev, &devlist, &devcount);
	if (error) {
		free(unchanged, M_TEMP);
		return (error);
	}

	/* Anything not listed in unchanged[] was added by this scan. */
	for (i = 0; i < devcount; i++) {
		for (j = 0; j < oldcount; j++) {
			if (devlist[i] == unchanged[j])
				goto next_device;
		}

		device_probe_and_attach(devlist[i]);
	next_device:;
	}

	free(unchanged, M_TEMP);
	free(devlist, M_TEMP);
	return (0);
#undef REG
}
702 
703 #else
704 
/*
 * Rescan PCI bus 'dev' by delegating to the BUS_RESCAN() bus method and
 * returning its result.
 */
static int
pci_rescan(device_t dev)
{
	return (BUS_RESCAN(dev));
}
710 
711 #endif
712 
713 static void
714 pci_devices_present_work(void *arg, int pending __unused)
715 {
716 	struct hv_dr_work *dr_wrk = arg;
717 	struct hv_dr_state *dr = NULL;
718 	struct hv_pcibus *hbus;
719 	uint32_t child_no;
720 	bool found;
721 	struct pci_func_desc *new_desc;
722 	struct hv_pci_dev *hpdev, *tmp_hpdev;
723 	struct completion *query_comp;
724 	bool need_rescan = false;
725 
726 	hbus = dr_wrk->bus;
727 	free(dr_wrk, M_DEVBUF);
728 
729 	/* Pull this off the queue and process it if it was the last one. */
730 	mtx_lock(&hbus->device_list_lock);
731 	while (!TAILQ_EMPTY(&hbus->dr_list)) {
732 		dr = TAILQ_FIRST(&hbus->dr_list);
733 		TAILQ_REMOVE(&hbus->dr_list, dr, link);
734 
735 		/* Throw this away if the list still has stuff in it. */
736 		if (!TAILQ_EMPTY(&hbus->dr_list)) {
737 			free(dr, M_DEVBUF);
738 			continue;
739 		}
740 	}
741 	mtx_unlock(&hbus->device_list_lock);
742 
743 	if (!dr)
744 		return;
745 
746 	/* First, mark all existing children as reported missing. */
747 	mtx_lock(&hbus->device_list_lock);
748 	TAILQ_FOREACH(hpdev, &hbus->children, link)
749 		hpdev->reported_missing = true;
750 	mtx_unlock(&hbus->device_list_lock);
751 
752 	/* Next, add back any reported devices. */
753 	for (child_no = 0; child_no < dr->device_count; child_no++) {
754 		found = false;
755 		new_desc = &dr->func[child_no];
756 
757 		mtx_lock(&hbus->device_list_lock);
758 		TAILQ_FOREACH(hpdev, &hbus->children, link) {
759 			if ((hpdev->desc.wslot.val ==
760 			    new_desc->wslot.val) &&
761 			    (hpdev->desc.v_id == new_desc->v_id) &&
762 			    (hpdev->desc.d_id == new_desc->d_id) &&
763 			    (hpdev->desc.ser == new_desc->ser)) {
764 				hpdev->reported_missing = false;
765 				found = true;
766 				break;
767 			}
768 		}
769 		mtx_unlock(&hbus->device_list_lock);
770 
771 		if (!found) {
772 			if (!need_rescan)
773 				need_rescan = true;
774 
775 			hpdev = new_pcichild_device(hbus, new_desc);
776 			if (!hpdev)
777 				printf("vmbus_pcib: failed to add a child\n");
778 		}
779 	}
780 
781 	/* Remove missing device(s), if any */
782 	TAILQ_FOREACH_SAFE(hpdev, &hbus->children, link, tmp_hpdev) {
783 		if (hpdev->reported_missing)
784 			hv_pci_delete_device(hpdev);
785 	}
786 
787 	/* Rescan the bus to find any new device, if necessary. */
788 	if (hbus->state == hv_pcibus_installed && need_rescan)
789 		pci_rescan(hbus->pci_bus);
790 
791 	/* Wake up hv_pci_query_relations(), if it's waiting. */
792 	query_comp = hbus->query_comp;
793 	if (query_comp) {
794 		hbus->query_comp = NULL;
795 		complete(query_comp);
796 	}
797 
798 	free(dr, M_DEVBUF);
799 }
800 
801 static struct hv_pci_dev *
802 get_pcichild_wslot(struct hv_pcibus *hbus, uint32_t wslot)
803 {
804 	struct hv_pci_dev *hpdev, *ret = NULL;
805 
806 	mtx_lock(&hbus->device_list_lock);
807 	TAILQ_FOREACH(hpdev, &hbus->children, link) {
808 		if (hpdev->desc.wslot.val == wslot) {
809 			ret = hpdev;
810 			break;
811 		}
812 	}
813 	mtx_unlock(&hbus->device_list_lock);
814 
815 	return (ret);
816 }
817 
818 static void
819 hv_pci_devices_present(struct hv_pcibus *hbus,
820     struct pci_bus_relations *relations)
821 {
822 	struct hv_dr_state *dr;
823 	struct hv_dr_work *dr_wrk;
824 	unsigned long dr_size;
825 
826 	if (hbus->detaching && relations->device_count > 0)
827 		return;
828 
829 	dr_size = offsetof(struct hv_dr_state, func) +
830 	    (sizeof(struct pci_func_desc) * relations->device_count);
831 	dr = malloc(dr_size, M_DEVBUF, M_WAITOK | M_ZERO);
832 
833 	dr->device_count = relations->device_count;
834 	if (dr->device_count != 0)
835 		memcpy(dr->func, relations->func,
836 		    sizeof(struct pci_func_desc) * dr->device_count);
837 
838 	mtx_lock(&hbus->device_list_lock);
839 	TAILQ_INSERT_TAIL(&hbus->dr_list, dr, link);
840 	mtx_unlock(&hbus->device_list_lock);
841 
842 	dr_wrk = malloc(sizeof(*dr_wrk), M_DEVBUF, M_WAITOK | M_ZERO);
843 	dr_wrk->bus = hbus;
844 	TASK_INIT(&dr_wrk->task, 0, pci_devices_present_work, dr_wrk);
845 	taskqueue_enqueue(hbus->sc->taskq, &dr_wrk->task);
846 }
847 
848 static void
849 hv_eject_device_work(void *arg, int pending __unused)
850 {
851 	struct hv_pci_dev *hpdev = arg;
852 	union win_slot_encoding wslot = hpdev->desc.wslot;
853 	struct hv_pcibus *hbus = hpdev->hbus;
854 	struct pci_eject_response *eject_pkt;
855 	struct {
856 		struct pci_packet pkt;
857 		uint8_t buffer[sizeof(struct pci_eject_response)];
858 	} ctxt;
859 
860 	hv_pci_delete_device(hpdev);
861 
862 	memset(&ctxt, 0, sizeof(ctxt));
863 	eject_pkt = (struct pci_eject_response *)&ctxt.pkt.message;
864 	eject_pkt->message_type.type = PCI_EJECTION_COMPLETE;
865 	eject_pkt->wslot.val = wslot.val;
866 	vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
867 	    eject_pkt, sizeof(*eject_pkt), 0);
868 }
869 
870 static void
871 hv_pci_eject_device(struct hv_pci_dev *hpdev)
872 {
873 	struct hv_pcibus *hbus = hpdev->hbus;
874 	struct taskqueue *taskq;
875 
876 	if (hbus->detaching)
877 		return;
878 
879 	/*
880 	 * Push this task into the same taskqueue on which
881 	 * vmbus_pcib_attach() runs, so we're sure this task can't run
882 	 * concurrently with vmbus_pcib_attach().
883 	 */
884 	TASK_INIT(&hpdev->eject_task, 0, hv_eject_device_work, hpdev);
885 	taskq = vmbus_chan_mgmt_tq(hbus->sc->chan);
886 	taskqueue_enqueue(taskq, &hpdev->eject_task);
887 }
888 
/* Assumed size of sc->rx_buf, the default receive buffer. */
#define PCIB_PACKET_SIZE	0x100

/*
 * Channel receive callback: drain every pending packet from 'chan' and
 * dispatch it.
 *
 * chan: the channel to read from
 * arg:  the driver's struct vmbus_pcib_softc
 *
 * Completion packets are routed to the completion_func of the struct
 * pci_packet whose address was used as the transaction id.  In-band
 * PCI_BUS_RELATIONS and PCI_EJECT messages are handed to
 * hv_pci_devices_present() and hv_pci_eject_device() respectively.
 */
static void
vmbus_pcib_on_channel_callback(struct vmbus_channel *chan, void *arg)
{
	struct vmbus_pcib_softc *sc = arg;
	struct hv_pcibus *hbus = sc->hbus;

	void *buffer;
	int bufferlen = PCIB_PACKET_SIZE;

	struct pci_packet *comp_packet;
	struct pci_response *response;
	struct pci_incoming_message *new_msg;
	struct pci_bus_relations *bus_rel;
	struct pci_dev_incoming *dev_msg;
	struct hv_pci_dev *hpdev;

	/*
	 * Start with the preallocated buffer; it is replaced by a heap
	 * buffer only if a packet does not fit.
	 */
	buffer = sc->rx_buf;
	do {
		struct vmbus_chanpkt_hdr *pkt = buffer;
		uint32_t bytes_rxed;
		int ret;

		bytes_rxed = bufferlen;
		ret = vmbus_chan_recv_pkt(chan, pkt, &bytes_rxed);

		if (ret == ENOBUFS) {
			/* Handle large packet */
			/* Only a previously malloc'ed buffer may be freed. */
			if (bufferlen > PCIB_PACKET_SIZE) {
				free(buffer, M_DEVBUF);
				buffer = NULL;
			}

			/* alloc new buffer */
			/* bytes_rxed is used as the required size here. */
			buffer = malloc(bytes_rxed, M_DEVBUF, M_WAITOK | M_ZERO);
			bufferlen = bytes_rxed;

			/* Retry the receive with the larger buffer. */
			continue;
		}

		if (ret != 0) {
			/* ignore EIO or EAGAIN */
			break;
		}

		/* Skip packets that carry nothing beyond a bare response. */
		if (bytes_rxed <= sizeof(struct pci_response))
			continue;

		switch (pkt->cph_type) {
		case VMBUS_CHANPKT_TYPE_COMP:
			/* The transaction id is the sender's pci_packet. */
			comp_packet = (struct pci_packet *)pkt->cph_xactid;
			response = (struct pci_response *)pkt;
			comp_packet->completion_func(comp_packet->compl_ctxt,
			    response, bytes_rxed);
			break;
		case VMBUS_CHANPKT_TYPE_INBAND:
			new_msg = (struct pci_incoming_message *)buffer;

			switch (new_msg->message_type.type) {
			case PCI_BUS_RELATIONS:
				bus_rel = (struct pci_bus_relations *)buffer;

				/* An empty device list is ignored here. */
				if (bus_rel->device_count == 0)
					break;

				/* Drop messages shorter than their own count. */
				if (bytes_rxed <
				    offsetof(struct pci_bus_relations, func) +
				        (sizeof(struct pci_func_desc) *
				            (bus_rel->device_count)))
					break;

				hv_pci_devices_present(hbus, bus_rel);
				break;

			case PCI_EJECT:
				dev_msg = (struct pci_dev_incoming *)buffer;
				hpdev = get_pcichild_wslot(hbus,
				    dev_msg->wslot.val);

				/* Ejects for unknown slots are ignored. */
				if (hpdev)
					hv_pci_eject_device(hpdev);

				break;
			default:
				printf("vmbus_pcib: Unknown msg type 0x%x\n",
				    new_msg->message_type.type);
				break;
			}
			break;
		default:
			printf("vmbus_pcib: Unknown VMBus msg type %hd\n",
			    pkt->cph_type);
			break;
		}
	} while (1);

	/* Release the heap buffer if one replaced sc->rx_buf. */
	if (bufferlen > PCIB_PACKET_SIZE)
		free(buffer, M_DEVBUF);
}
989 
990 static int
991 hv_pci_protocol_negotiation(struct hv_pcibus *hbus)
992 {
993 	struct pci_version_request *version_req;
994 	struct hv_pci_compl comp_pkt;
995 	struct {
996 		struct pci_packet pkt;
997 		uint8_t buffer[sizeof(struct pci_version_request)];
998 	} ctxt;
999 	int ret;
1000 
1001 	init_completion(&comp_pkt.host_event);
1002 
1003 	ctxt.pkt.completion_func = hv_pci_generic_compl;
1004 	ctxt.pkt.compl_ctxt = &comp_pkt;
1005 	version_req = (struct pci_version_request *)&ctxt.pkt.message;
1006 	version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION;
1007 	version_req->protocol_version = PCI_PROTOCOL_VERSION_CURRENT;
1008 	version_req->is_last_attempt = 1;
1009 
1010 	ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND,
1011 	    VMBUS_CHANPKT_FLAG_RC, version_req, sizeof(*version_req),
1012 	    (uint64_t)&ctxt.pkt);
1013 	if (ret)
1014 		goto out;
1015 
1016 	wait_for_completion(&comp_pkt.host_event);
1017 
1018 	if (comp_pkt.completion_status < 0) {
1019 		device_printf(hbus->pcib,
1020 		    "vmbus_pcib version negotiation failed: %x\n",
1021 		    comp_pkt.completion_status);
1022 		ret = EPROTO;
1023 	} else {
1024 		ret = 0;
1025 	}
1026 out:
1027 	free_completion(&comp_pkt.host_event);
1028 	return (ret);
1029 }
1030 
1031 /* Ask the host to send along the list of child devices */
1032 static int
1033 hv_pci_query_relations(struct hv_pcibus *hbus)
1034 {
1035 	struct pci_message message;
1036 	int ret;
1037 
1038 	message.type = PCI_QUERY_BUS_RELATIONS;
1039 	ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
1040 	    &message, sizeof(message), 0);
1041 	return (ret);
1042 }
1043 
1044 static int
1045 hv_pci_enter_d0(struct hv_pcibus *hbus)
1046 {
1047 	struct pci_bus_d0_entry *d0_entry;
1048 	struct hv_pci_compl comp_pkt;
1049 	struct {
1050 		struct pci_packet pkt;
1051 		uint8_t buffer[sizeof(struct pci_bus_d0_entry)];
1052 	} ctxt;
1053 	int ret;
1054 
1055 	/*
1056 	 * Tell the host that the bus is ready to use, and moved into the
1057 	 * powered-on state.  This includes telling the host which region
1058 	 * of memory-mapped I/O space has been chosen for configuration space
1059 	 * access.
1060 	 */
1061 	init_completion(&comp_pkt.host_event);
1062 
1063 	ctxt.pkt.completion_func = hv_pci_generic_compl;
1064 	ctxt.pkt.compl_ctxt = &comp_pkt;
1065 
1066 	d0_entry = (struct pci_bus_d0_entry *)&ctxt.pkt.message;
1067 	memset(d0_entry, 0, sizeof(*d0_entry));
1068 	d0_entry->message_type.type = PCI_BUS_D0ENTRY;
1069 	d0_entry->mmio_base = rman_get_start(hbus->cfg_res);
1070 
1071 	ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND,
1072 	    VMBUS_CHANPKT_FLAG_RC, d0_entry, sizeof(*d0_entry),
1073 	    (uint64_t)&ctxt.pkt);
1074 	if (ret)
1075 		goto out;
1076 
1077 	wait_for_completion(&comp_pkt.host_event);
1078 
1079 	if (comp_pkt.completion_status < 0) {
1080 		device_printf(hbus->pcib, "vmbus_pcib failed to enable D0\n");
1081 		ret = EPROTO;
1082 	} else {
1083 		ret = 0;
1084 	}
1085 
1086 out:
1087 	free_completion(&comp_pkt.host_event);
1088 	return (ret);
1089 }
1090 
1091 /*
1092  * It looks this is only needed by Windows VM, but let's send the message too
1093  * just to make the host happy.
1094  */
1095 static int
1096 hv_send_resources_allocated(struct hv_pcibus *hbus)
1097 {
1098 	struct pci_resources_assigned *res_assigned;
1099 	struct hv_pci_compl comp_pkt;
1100 	struct hv_pci_dev *hpdev;
1101 	struct pci_packet *pkt;
1102 	uint32_t wslot;
1103 	int ret = 0;
1104 
1105 	pkt = malloc(sizeof(*pkt) + sizeof(*res_assigned),
1106 	    M_DEVBUF, M_WAITOK | M_ZERO);
1107 
1108 	for (wslot = 0; wslot < 256; wslot++) {
1109 		hpdev = get_pcichild_wslot(hbus, wslot);
1110 		if (!hpdev)
1111 			continue;
1112 
1113 		init_completion(&comp_pkt.host_event);
1114 
1115 		memset(pkt, 0, sizeof(*pkt) + sizeof(*res_assigned));
1116 		pkt->completion_func = hv_pci_generic_compl;
1117 		pkt->compl_ctxt = &comp_pkt;
1118 
1119 		res_assigned = (struct pci_resources_assigned *)&pkt->message;
1120 		res_assigned->message_type.type = PCI_RESOURCES_ASSIGNED;
1121 		res_assigned->wslot.val = hpdev->desc.wslot.val;
1122 
1123 		ret = vmbus_chan_send(hbus->sc->chan,
1124 		    VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC,
1125 		    &pkt->message, sizeof(*res_assigned), (uint64_t)pkt);
1126 		if (ret) {
1127 			free_completion(&comp_pkt.host_event);
1128 			break;
1129 		}
1130 
1131 		wait_for_completion(&comp_pkt.host_event);
1132 		free_completion(&comp_pkt.host_event);
1133 
1134 		if (comp_pkt.completion_status < 0) {
1135 			ret = EPROTO;
1136 			device_printf(hbus->pcib,
1137 			    "failed to send PCI_RESOURCES_ASSIGNED\n");
1138 			break;
1139 		}
1140 	}
1141 
1142 	free(pkt, M_DEVBUF);
1143 	return (ret);
1144 }
1145 
1146 static int
1147 hv_send_resources_released(struct hv_pcibus *hbus)
1148 {
1149 	struct pci_child_message pkt;
1150 	struct hv_pci_dev *hpdev;
1151 	uint32_t wslot;
1152 	int ret;
1153 
1154 	for (wslot = 0; wslot < 256; wslot++) {
1155 		hpdev = get_pcichild_wslot(hbus, wslot);
1156 		if (!hpdev)
1157 			continue;
1158 
1159 		pkt.message_type.type = PCI_RESOURCES_RELEASED;
1160 		pkt.wslot.val = hpdev->desc.wslot.val;
1161 
1162 		ret = vmbus_chan_send(hbus->sc->chan,
1163 		    VMBUS_CHANPKT_TYPE_INBAND, 0, &pkt, sizeof(pkt), 0);
1164 		if (ret)
1165 			return (ret);
1166 	}
1167 
1168 	return (0);
1169 }
1170 
1171 #define hv_cfg_read(x, s)						\
1172 static inline uint##x##_t hv_cfg_read_##s(struct hv_pcibus *bus,	\
1173     bus_size_t offset)							\
1174 {									\
1175 	return (bus_read_##s(bus->cfg_res, offset));			\
1176 }
1177 
1178 #define hv_cfg_write(x, s)						\
1179 static inline void hv_cfg_write_##s(struct hv_pcibus *bus,		\
1180     bus_size_t offset, uint##x##_t val)					\
1181 {									\
1182 	return (bus_write_##s(bus->cfg_res, offset, val));		\
1183 }
1184 
1185 hv_cfg_read(8, 1)
1186 hv_cfg_read(16, 2)
1187 hv_cfg_read(32, 4)
1188 
1189 hv_cfg_write(8, 1)
1190 hv_cfg_write(16, 2)
1191 hv_cfg_write(32, 4)
1192 
1193 static void
1194 _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where, int size,
1195     uint32_t *val)
1196 {
1197 	struct hv_pcibus *hbus = hpdev->hbus;
1198 	bus_size_t addr = CFG_PAGE_OFFSET + where;
1199 
1200 	/*
1201 	 * If the attempt is to read the IDs or the ROM BAR, simulate that.
1202 	 */
1203 	if (where + size <= PCIR_COMMAND) {
1204 		memcpy(val, ((uint8_t *)&hpdev->desc.v_id) + where, size);
1205 	} else if (where >= PCIR_REVID && where + size <=
1206 		   PCIR_CACHELNSZ) {
1207 		memcpy(val, ((uint8_t *)&hpdev->desc.rev) + where -
1208 		       PCIR_REVID, size);
1209 	} else if (where >= PCIR_SUBVEND_0 && where + size <=
1210 		   PCIR_BIOS) {
1211 		memcpy(val, (uint8_t *)&hpdev->desc.subsystem_id + where -
1212 		       PCIR_SUBVEND_0, size);
1213 	} else if (where >= PCIR_BIOS && where + size <=
1214 		   PCIR_CAP_PTR) {
1215 		/* ROM BARs are unimplemented */
1216 		*val = 0;
1217 	} else if ((where >= PCIR_INTLINE && where + size <=
1218 		   PCIR_INTPIN) ||(where == PCIR_INTPIN && size == 1)) {
1219 		/*
1220 		 * Interrupt Line and Interrupt PIN are hard-wired to zero
1221 		 * because this front-end only supports message-signaled
1222 		 * interrupts.
1223 		 */
1224 		*val = 0;
1225 	} else if (where + size <= CFG_PAGE_SIZE) {
1226 		mtx_lock(&hbus->config_lock);
1227 
1228 		/* Choose the function to be read. */
1229 		hv_cfg_write_4(hbus, 0, hpdev->desc.wslot.val);
1230 
1231 		/* Make sure the function was chosen before we start reading.*/
1232 		mb();
1233 
1234 		/* Read from that function's config space. */
1235 		switch (size) {
1236 		case 1:
1237 			*((uint8_t *)val) = hv_cfg_read_1(hbus, addr);
1238 			break;
1239 		case 2:
1240 			*((uint16_t *)val) = hv_cfg_read_2(hbus, addr);
1241 			break;
1242 		default:
1243 			*((uint32_t *)val) = hv_cfg_read_4(hbus, addr);
1244 			break;
1245 		}
1246 		/*
1247 		 * Make sure the write was done before we release the lock,
1248 		 * allowing consecutive reads/writes.
1249 		 */
1250 		mb();
1251 
1252 		mtx_unlock(&hbus->config_lock);
1253 	} else {
1254 		/* Invalid config read: it's unlikely to reach here. */
1255 		memset(val, 0, size);
1256 	}
1257 }
1258 
1259 static void
1260 _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where, int size,
1261     uint32_t val)
1262 {
1263 	struct hv_pcibus *hbus = hpdev->hbus;
1264 	bus_size_t addr = CFG_PAGE_OFFSET + where;
1265 
1266 	/* SSIDs and ROM BARs are read-only */
1267 	if (where >= PCIR_SUBVEND_0 && where + size <= PCIR_CAP_PTR)
1268 		return;
1269 
1270 	if (where >= PCIR_COMMAND && where + size <= CFG_PAGE_SIZE) {
1271 		mtx_lock(&hbus->config_lock);
1272 
1273 		/* Choose the function to be written. */
1274 		hv_cfg_write_4(hbus, 0, hpdev->desc.wslot.val);
1275 
1276 		/* Make sure the function was chosen before we start writing.*/
1277 		wmb();
1278 
1279 		/* Write to that function's config space. */
1280 		switch (size) {
1281 		case 1:
1282 			hv_cfg_write_1(hbus, addr, (uint8_t)val);
1283 			break;
1284 		case 2:
1285 			hv_cfg_write_2(hbus, addr, (uint16_t)val);
1286 			break;
1287 		default:
1288 			hv_cfg_write_4(hbus, addr, (uint32_t)val);
1289 			break;
1290 		}
1291 
1292 		/*
1293 		 * Make sure the write was done before we release the lock,
1294 		 * allowing consecutive reads/writes.
1295 		 */
1296 		mb();
1297 
1298 		mtx_unlock(&hbus->config_lock);
1299 	} else {
1300 		/* Invalid config write: it's unlikely to reach here. */
1301 		return;
1302 	}
1303 }
1304 
1305 static void
1306 vmbus_pcib_set_detaching(void *arg, int pending __unused)
1307 {
1308 	struct hv_pcibus *hbus = arg;
1309 
1310 	atomic_set_int(&hbus->detaching, 1);
1311 }
1312 
1313 static void
1314 vmbus_pcib_pre_detach(struct hv_pcibus *hbus)
1315 {
1316 	struct task task;
1317 
1318 	TASK_INIT(&task, 0, vmbus_pcib_set_detaching, hbus);
1319 
1320 	/*
1321 	 * Make sure the channel callback won't push any possible new
1322 	 * PCI_BUS_RELATIONS and PCI_EJECT tasks to sc->taskq.
1323 	 */
1324 	vmbus_chan_run_task(hbus->sc->chan, &task);
1325 
1326 	taskqueue_drain_all(hbus->sc->taskq);
1327 }
1328 
1329 
1330 /*
1331  * Standard probe entry point.
1332  *
1333  */
1334 static int
1335 vmbus_pcib_probe(device_t dev)
1336 {
1337 	if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
1338 	    &g_pass_through_dev_type) == 0) {
1339 		device_set_desc(dev, "Hyper-V PCI Express Pass Through");
1340 		return (BUS_PROBE_DEFAULT);
1341 	}
1342 	return (ENXIO);
1343 }
1344 
/*
 * Standard attach entry point.
 *
 * Allocates the per-bus state and the config-space MMIO window, opens the
 * VMBus channel, and then runs the start-up sequence with the host:
 * protocol negotiation, relations query, D0 entry and the
 * resources-allocated notification.  Finally a "pci" child bus is added and
 * attached.
 *
 * Returns 0 on success.  On failure everything acquired so far is released
 * through the goto ladder at the bottom and an errno value is returned.
 */
static int
vmbus_pcib_attach(device_t dev)
{
	const int pci_ring_size = (4 * PAGE_SIZE);
	const struct hyperv_guid *inst_guid;
	struct vmbus_channel *channel;
	struct vmbus_pcib_softc *sc;
	struct hv_pcibus *hbus;
	int rid = 0;
	int ret;

	hbus = malloc(sizeof(*hbus), M_DEVBUF, M_WAITOK | M_ZERO);
	hbus->pcib = dev;

	/* The PCI domain is built from bytes 8 and 9 of the instance GUID. */
	channel = vmbus_get_channel(dev);
	inst_guid = vmbus_chan_guid_inst(channel);
	hbus->pci_domain = inst_guid->hv_guid[9] |
			  (inst_guid->hv_guid[8] << 8);

	mtx_init(&hbus->config_lock, "hbcfg", NULL, MTX_DEF);
	mtx_init(&hbus->device_list_lock, "hbdl", NULL, MTX_DEF);
	TAILQ_INIT(&hbus->children);
	TAILQ_INIT(&hbus->dr_list);

	/* Page-aligned MMIO window used for config-space access. */
	hbus->cfg_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid,
	    0, RM_MAX_END, PCI_CONFIG_MMIO_LENGTH,
	    RF_ACTIVE | rman_make_alignment_flags(PAGE_SIZE));

	if (!hbus->cfg_res) {
		device_printf(dev, "failed to get resource for cfg window\n");
		ret = ENXIO;
		goto free_bus;
	}

	sc = device_get_softc(dev);
	sc->chan = channel;
	sc->rx_buf = malloc(PCIB_PACKET_SIZE, M_DEVBUF, M_WAITOK | M_ZERO);
	sc->hbus = hbus;

	/*
	 * The taskq is used to handle PCI_BUS_RELATIONS and PCI_EJECT
	 * messages. NB: we can't handle the messages in the channel callback
	 * directly, because the message handlers need to send new messages
	 * to the host and waits for the host's completion messages, which
	 * must also be handled by the channel callback.
	 */
	sc->taskq = taskqueue_create("vmbus_pcib_tq", M_WAITOK,
	    taskqueue_thread_enqueue, &sc->taskq);
	taskqueue_start_threads(&sc->taskq, 1, PI_NET, "vmbus_pcib_tq");

	hbus->sc = sc;

	/* Waited on below, after the relations query has been sent. */
	init_completion(&hbus->query_completion);
	hbus->query_comp = &hbus->query_completion;

	ret = vmbus_chan_open(sc->chan, pci_ring_size, pci_ring_size,
		NULL, 0, vmbus_pcib_on_channel_callback, sc);
	if (ret)
		goto free_res;

	ret = hv_pci_protocol_negotiation(hbus);
	if (ret)
		goto vmbus_close;

	ret = hv_pci_query_relations(hbus);
	if (ret)
		goto vmbus_close;
	wait_for_completion(hbus->query_comp);

	ret = hv_pci_enter_d0(hbus);
	if (ret)
		goto vmbus_close;

	ret = hv_send_resources_allocated(hbus);
	if (ret)
		goto vmbus_close;

	hbus->pci_bus = device_add_child(dev, "pci", -1);
	if (!hbus->pci_bus) {
		device_printf(dev, "failed to create pci bus\n");
		ret = ENXIO;
		goto vmbus_close;
	}

	/* NOTE(review): the return value of bus_generic_attach() is ignored. */
	bus_generic_attach(dev);

	hbus->state = hv_pcibus_installed;

	return (0);

	/* Error unwinding: each label releases what was set up before it. */
vmbus_close:
	vmbus_pcib_pre_detach(hbus);
	vmbus_chan_close(sc->chan);
free_res:
	taskqueue_free(sc->taskq);
	free_completion(&hbus->query_completion);
	free(sc->rx_buf, M_DEVBUF);
	bus_release_resource(dev, SYS_RES_MEMORY, 0, hbus->cfg_res);
free_bus:
	mtx_destroy(&hbus->device_list_lock);
	mtx_destroy(&hbus->config_lock);
	free(hbus, M_DEVBUF);
	return (ret);
}
1453 
/*
 * Standard detach entry point
 *
 * Quiesces asynchronous work, detaches the child bus if attach completed,
 * removes any remaining children, notifies the host (resources released,
 * D0 exit), closes the channel and frees everything attach allocated.
 *
 * Failures to send the two host notifications are only logged; the function
 * always returns 0.
 */
static int
vmbus_pcib_detach(device_t dev)
{
	struct vmbus_pcib_softc *sc = device_get_softc(dev);
	struct hv_pcibus *hbus = sc->hbus;
	struct pci_message teardown_packet;
	struct pci_bus_relations relations;
	int ret;

	vmbus_pcib_pre_detach(hbus);

	/* NOTE(review): the return value of bus_generic_detach() is ignored. */
	if (hbus->state == hv_pcibus_installed)
		bus_generic_detach(dev);

	/* Delete any children which might still exist. */
	memset(&relations, 0, sizeof(relations));
	hv_pci_devices_present(hbus, &relations);

	ret = hv_send_resources_released(hbus);
	if (ret)
		device_printf(dev, "failed to send PCI_RESOURCES_RELEASED\n");

	/* Sent with no flags and transaction id 0; no reply is awaited here. */
	teardown_packet.type = PCI_BUS_D0EXIT;
	ret = vmbus_chan_send(sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0,
	    &teardown_packet, sizeof(struct pci_message), 0);
	if (ret)
		device_printf(dev, "failed to send PCI_BUS_D0EXIT\n");

	/* Let queued tasks finish before the channel and taskq go away. */
	taskqueue_drain_all(hbus->sc->taskq);
	vmbus_chan_close(sc->chan);
	taskqueue_free(sc->taskq);

	free_completion(&hbus->query_completion);
	free(sc->rx_buf, M_DEVBUF);
	bus_release_resource(dev, SYS_RES_MEMORY, 0, hbus->cfg_res);

	mtx_destroy(&hbus->device_list_lock);
	mtx_destroy(&hbus->config_lock);
	free(hbus, M_DEVBUF);

	return (0);
}
1499 
1500 static int
1501 vmbus_pcib_read_ivar(device_t dev, device_t child, int which, uintptr_t *val)
1502 {
1503 	struct vmbus_pcib_softc *sc = device_get_softc(dev);
1504 
1505 	switch (which) {
1506 	case PCIB_IVAR_DOMAIN:
1507 		*val = sc->hbus->pci_domain;
1508 		return (0);
1509 
1510 	case PCIB_IVAR_BUS:
1511 		/* There is only bus 0. */
1512 		*val = 0;
1513 		return (0);
1514 	}
1515 	return (ENOENT);
1516 }
1517 
1518 static int
1519 vmbus_pcib_write_ivar(device_t dev, device_t child, int which, uintptr_t val)
1520 {
1521 	return (ENOENT);
1522 }
1523 
1524 static struct resource *
1525 vmbus_pcib_alloc_resource(device_t dev, device_t child, int type, int *rid,
1526 	rman_res_t start, rman_res_t end, rman_res_t count, u_int flags)
1527 {
1528 	unsigned int bar_no;
1529 	struct hv_pci_dev *hpdev;
1530 	struct vmbus_pcib_softc *sc = device_get_softc(dev);
1531 	struct resource *res;
1532 	unsigned int devfn;
1533 
1534 	if (type == PCI_RES_BUS)
1535 		return (pci_domain_alloc_bus(sc->hbus->pci_domain, child, rid,
1536 		    start, end, count, flags));
1537 
1538 	/* Devices with port I/O BAR are not supported. */
1539 	if (type == SYS_RES_IOPORT)
1540 		return (NULL);
1541 
1542 	if (type == SYS_RES_MEMORY) {
1543 		devfn = PCI_DEVFN(pci_get_slot(child),
1544 		    pci_get_function(child));
1545 		hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn));
1546 		if (!hpdev)
1547 			return (NULL);
1548 
1549 		bar_no = PCI_RID2BAR(*rid);
1550 		if (bar_no >= MAX_NUM_BARS)
1551 			return (NULL);
1552 
1553 		/* Make sure a 32-bit BAR gets a 32-bit address */
1554 		if (!(hpdev->probed_bar[bar_no] & PCIM_BAR_MEM_64))
1555 			end = ulmin(end, 0xFFFFFFFF);
1556 	}
1557 
1558 	res = bus_generic_alloc_resource(dev, child, type, rid,
1559 		start, end, count, flags);
1560 	/*
1561 	 * If this is a request for a specific range, assume it is
1562 	 * correct and pass it up to the parent.
1563 	 */
1564 	if (res == NULL && start + count - 1 == end)
1565 		res = bus_generic_alloc_resource(dev, child, type, rid,
1566 		    start, end, count, flags);
1567 	return (res);
1568 }
1569 
1570 static int
1571 vmbus_pcib_release_resource(device_t dev, device_t child, int type, int rid,
1572     struct resource *r)
1573 {
1574 	struct vmbus_pcib_softc *sc = device_get_softc(dev);
1575 
1576 	if (type == PCI_RES_BUS)
1577 		return (pci_domain_release_bus(sc->hbus->pci_domain, child,
1578 		    rid, r));
1579 
1580 	if (type == SYS_RES_IOPORT)
1581 		return (EINVAL);
1582 
1583 	return (bus_generic_release_resource(dev, child, type, rid, r));
1584 }
1585 
#if __FreeBSD_version >= 1100000
/*
 * bus_get_cpus method: answer on behalf of a child by asking the same
 * question about the bridge itself via bus_get_cpus().
 */
static int
vmbus_pcib_get_cpus(device_t pcib, device_t dev, enum cpu_sets op,
    size_t setsize, cpuset_t *cpuset)
{
	int error;

	error = bus_get_cpus(pcib, op, setsize, cpuset);
	return (error);
}
#endif
1594 
1595 static uint32_t
1596 vmbus_pcib_read_config(device_t dev, u_int bus, u_int slot, u_int func,
1597     u_int reg, int bytes)
1598 {
1599 	struct vmbus_pcib_softc *sc = device_get_softc(dev);
1600 	struct hv_pci_dev *hpdev;
1601 	unsigned int devfn = PCI_DEVFN(slot, func);
1602 	uint32_t data = 0;
1603 
1604 	KASSERT(bus == 0, ("bus should be 0, but is %u", bus));
1605 
1606 	hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn));
1607 	if (!hpdev)
1608 		return (~0);
1609 
1610 	_hv_pcifront_read_config(hpdev, reg, bytes, &data);
1611 
1612 	return (data);
1613 }
1614 
1615 static void
1616 vmbus_pcib_write_config(device_t dev, u_int bus, u_int slot, u_int func,
1617     u_int reg, uint32_t data, int bytes)
1618 {
1619 	struct vmbus_pcib_softc *sc = device_get_softc(dev);
1620 	struct hv_pci_dev *hpdev;
1621 	unsigned int devfn = PCI_DEVFN(slot, func);
1622 
1623 	KASSERT(bus == 0, ("bus should be 0, but is %u", bus));
1624 
1625 	hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn));
1626 	if (!hpdev)
1627 		return;
1628 
1629 	_hv_pcifront_write_config(hpdev, reg, bytes, data);
1630 }
1631 
1632 static int
1633 vmbus_pcib_route_intr(device_t pcib, device_t dev, int pin)
1634 {
1635 	/* We only support MSI/MSI-X and don't support INTx interrupt. */
1636 	return (PCI_INVALID_IRQ);
1637 }
1638 
1639 static int
1640 vmbus_pcib_alloc_msi(device_t pcib, device_t dev, int count,
1641     int maxcount, int *irqs)
1642 {
1643 	return (PCIB_ALLOC_MSI(device_get_parent(pcib), dev, count, maxcount,
1644 	    irqs));
1645 }
1646 
1647 static int
1648 vmbus_pcib_release_msi(device_t pcib, device_t dev, int count, int *irqs)
1649 {
1650 	return (PCIB_RELEASE_MSI(device_get_parent(pcib), dev, count, irqs));
1651 }
1652 
1653 static int
1654 vmbus_pcib_alloc_msix(device_t pcib, device_t dev, int *irq)
1655 {
1656 	return (PCIB_ALLOC_MSIX(device_get_parent(pcib), dev, irq));
1657 }
1658 
1659 static int
1660 vmbus_pcib_release_msix(device_t pcib, device_t dev, int irq)
1661 {
1662 	return (PCIB_RELEASE_MSIX(device_get_parent(pcib), dev, irq));
1663 }
1664 
1665 #define	MSI_INTEL_ADDR_DEST	0x000ff000
1666 #define	MSI_INTEL_DATA_INTVEC	IOART_INTVEC	/* Interrupt vector. */
1667 #define	MSI_INTEL_DATA_DELFIXED	IOART_DELFIXED
1668 
1669 static int
1670 vmbus_pcib_map_msi(device_t pcib, device_t child, int irq,
1671     uint64_t *addr, uint32_t *data)
1672 {
1673 	unsigned int devfn;
1674 	struct hv_pci_dev *hpdev;
1675 
1676 	uint64_t v_addr;
1677 	uint32_t v_data;
1678 	struct hv_irq_desc *hid, *tmp_hid;
1679 	unsigned int cpu, vcpu_id;
1680 	unsigned int vector;
1681 
1682 	struct vmbus_pcib_softc *sc = device_get_softc(pcib);
1683 	struct pci_create_interrupt *int_pkt;
1684 	struct compose_comp_ctxt comp;
1685 	struct {
1686 		struct pci_packet pkt;
1687 		uint8_t buffer[sizeof(struct pci_create_interrupt)];
1688 	} ctxt;
1689 
1690 	int ret;
1691 
1692 	devfn = PCI_DEVFN(pci_get_slot(child), pci_get_function(child));
1693 	hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn));
1694 	if (!hpdev)
1695 		return (ENOENT);
1696 
1697 	ret = PCIB_MAP_MSI(device_get_parent(pcib), child, irq,
1698 	    &v_addr, &v_data);
1699 	if (ret)
1700 		return (ret);
1701 
1702 	TAILQ_FOREACH_SAFE(hid, &hpdev->irq_desc_list, link, tmp_hid) {
1703 		if (hid->irq == irq) {
1704 			TAILQ_REMOVE(&hpdev->irq_desc_list, hid, link);
1705 			hv_int_desc_free(hpdev, hid);
1706 			break;
1707 		}
1708 	}
1709 
1710 	cpu = (v_addr & MSI_INTEL_ADDR_DEST) >> 12;
1711 	vcpu_id = VMBUS_GET_VCPU_ID(device_get_parent(pcib), pcib, cpu);
1712 	vector = v_data & MSI_INTEL_DATA_INTVEC;
1713 
1714 	init_completion(&comp.comp_pkt.host_event);
1715 
1716 	memset(&ctxt, 0, sizeof(ctxt));
1717 	ctxt.pkt.completion_func = hv_pci_compose_compl;
1718 	ctxt.pkt.compl_ctxt = &comp;
1719 
1720 	int_pkt = (struct pci_create_interrupt *)&ctxt.pkt.message;
1721 	int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
1722 	int_pkt->wslot.val = hpdev->desc.wslot.val;
1723 	int_pkt->int_desc.vector = vector;
1724 	int_pkt->int_desc.vector_count = 1;
1725 	int_pkt->int_desc.delivery_mode = MSI_INTEL_DATA_DELFIXED;
1726 	int_pkt->int_desc.cpu_mask = 1ULL << vcpu_id;
1727 
1728 	ret = vmbus_chan_send(sc->chan,	VMBUS_CHANPKT_TYPE_INBAND,
1729 	    VMBUS_CHANPKT_FLAG_RC, int_pkt, sizeof(*int_pkt),
1730 	    (uint64_t)&ctxt.pkt);
1731 	if (ret) {
1732 		free_completion(&comp.comp_pkt.host_event);
1733 		return (ret);
1734 	}
1735 
1736 	wait_for_completion(&comp.comp_pkt.host_event);
1737 	free_completion(&comp.comp_pkt.host_event);
1738 
1739 	if (comp.comp_pkt.completion_status < 0)
1740 		return (EPROTO);
1741 
1742 	*addr = comp.int_desc.address;
1743 	*data = comp.int_desc.data;
1744 
1745 	hid = malloc(sizeof(struct hv_irq_desc), M_DEVBUF, M_WAITOK | M_ZERO);
1746 	hid->irq = irq;
1747 	hid->desc = comp.int_desc;
1748 	TAILQ_INSERT_TAIL(&hpdev->irq_desc_list, hid, link);
1749 
1750 	return (0);
1751 }
1752 
/*
 * Method table of the vmbus_pcib driver.  Device, bus and pcib interface
 * methods are either implemented in this file (vmbus_pcib_*) or delegated
 * to the generic bus/pcib implementations.
 */
static device_method_t vmbus_pcib_methods[] = {
	/* Device interface */
	DEVMETHOD(device_probe,         vmbus_pcib_probe),
	DEVMETHOD(device_attach,        vmbus_pcib_attach),
	DEVMETHOD(device_detach,        vmbus_pcib_detach),
	DEVMETHOD(device_shutdown,	bus_generic_shutdown),
	DEVMETHOD(device_suspend,	bus_generic_suspend),
	DEVMETHOD(device_resume,	bus_generic_resume),

	/* Bus interface */
	DEVMETHOD(bus_read_ivar,		vmbus_pcib_read_ivar),
	DEVMETHOD(bus_write_ivar,		vmbus_pcib_write_ivar),
	DEVMETHOD(bus_alloc_resource,		vmbus_pcib_alloc_resource),
	DEVMETHOD(bus_release_resource,		vmbus_pcib_release_resource),
	DEVMETHOD(bus_activate_resource,   bus_generic_activate_resource),
	DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource),
	DEVMETHOD(bus_setup_intr,	   bus_generic_setup_intr),
	DEVMETHOD(bus_teardown_intr,	   bus_generic_teardown_intr),
	/* bus_get_cpus is only provided when the kernel version has it. */
#if __FreeBSD_version >= 1100000
	DEVMETHOD(bus_get_cpus,			vmbus_pcib_get_cpus),
#endif

	/* pcib interface */
	DEVMETHOD(pcib_maxslots,		pcib_maxslots),
	DEVMETHOD(pcib_read_config,		vmbus_pcib_read_config),
	DEVMETHOD(pcib_write_config,		vmbus_pcib_write_config),
	DEVMETHOD(pcib_route_interrupt,		vmbus_pcib_route_intr),
	DEVMETHOD(pcib_alloc_msi,		vmbus_pcib_alloc_msi),
	DEVMETHOD(pcib_release_msi,		vmbus_pcib_release_msi),
	DEVMETHOD(pcib_alloc_msix,		vmbus_pcib_alloc_msix),
	DEVMETHOD(pcib_release_msix,		vmbus_pcib_release_msix),
	DEVMETHOD(pcib_map_msi,			vmbus_pcib_map_msi),
	DEVMETHOD(pcib_request_feature,		pcib_request_feature_allow),

	DEVMETHOD_END
};
1789 
static devclass_t pcib_devclass;

/*
 * Declare the driver class (named "pcib", with vmbus_pcib_softc as its
 * softc), register it on the vmbus bus, and record the module's
 * dependencies on the vmbus and pci modules.
 */
DEFINE_CLASS_0(pcib, vmbus_pcib_driver, vmbus_pcib_methods,
		sizeof(struct vmbus_pcib_softc));
DRIVER_MODULE(vmbus_pcib, vmbus, vmbus_pcib_driver, pcib_devclass, 0, 0);
MODULE_DEPEND(vmbus_pcib, vmbus, 1, 1, 1);
MODULE_DEPEND(vmbus_pcib, pci, 1, 1, 1);
1797 
1798 #endif /* NEW_PCIB */
1799